feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing: - Two tables (edited into the original migrations): data.duplicate_pairs holds precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and data.duplicate_dismissals is a global "not a duplicate" overlay that survives rescans. New audit actions file_merge / duplicate_dismiss. - DuplicateService: - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over the perceptual hashes and replaces the pairs table. This is the only thing that populates pairs, so GET never compares all-vs-all (scales to 110k+). - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-dismissed), groups them into connected components via union-find, and paginates whole clusters. - Resolve merges a pair field-by-field: each scalar from keep or discard, metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes the discarded file. Enforces edit ACL on both. - Dismiss records a canonical pair (view ACL on both). - Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss, POST /files/duplicates/resolve (registered before /:id to avoid collision). Plain delete reuses /files/bulk/delete. - Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo. Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and field validation; an integration test covers rescan -> list -> merge -> dismiss (including that a dismissal survives a re-rescan). Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
@@ -92,6 +92,31 @@ CREATE TABLE data.file_pool (
    PRIMARY KEY (file_id, pool_id)
 );

+-- Precomputed near-duplicate candidates (phash Hamming distance <= threshold),
+-- (re)built in full by the dedup rescan. Stored once per unordered pair with a
+-- canonical file_a < file_b ordering so a pair is never duplicated as (a,b)/(b,a).
+CREATE TABLE data.duplicate_pairs (
+    file_a   uuid     NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, -- smaller id of the pair (see CHECK below); row is deleted when the referenced file row is deleted
+    file_b   uuid     NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, -- larger id of the pair; row is deleted when the referenced file row is deleted
+    distance smallint NOT NULL, -- phash Hamming distance between file_a and file_b
+
+    CONSTRAINT chk__duplicate_pairs__order CHECK (file_a < file_b), -- enforces the canonical ordering; also rejects a file paired with itself
+    PRIMARY KEY (file_a, file_b) -- one row per unordered pair. NOTE(review): this index leads with file_a; no index leading with file_b is visible in this hunk -- confirm cascaded deletes from data.files do not need one
+);
+
+-- "Not a duplicate" decisions: a global overlay that hides a candidate pair from
+-- the duplicates view. Survives rescans (the pair may be re-found but stays
+-- hidden). Same canonical file_a < file_b ordering as data.duplicate_pairs.
+CREATE TABLE data.duplicate_dismissals (
+    file_a       uuid        NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, -- smaller id of the dismissed pair (see CHECK below); dismissal is deleted with the file row
+    file_b       uuid        NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, -- larger id of the dismissed pair; dismissal is deleted with the file row
+    dismissed_by smallint    NOT NULL REFERENCES core.users(id) ON UPDATE CASCADE ON DELETE RESTRICT, -- user recorded for the dismissal; RESTRICT blocks deleting a user that still has dismissals
+    dismissed_at timestamptz NOT NULL DEFAULT clock_timestamp(), -- defaults to the actual wall-clock time of the insert (clock_timestamp() keeps advancing inside a transaction)
+
+    CONSTRAINT chk__duplicate_dismissals__order CHECK (file_a < file_b), -- same canonical ordering as data.duplicate_pairs; also rejects a file paired with itself
+    PRIMARY KEY (file_a, file_b) -- an unordered pair can be dismissed at most once
+);
+
 COMMENT ON TABLE  data.categories IS 'Logical grouping of tags';
 COMMENT ON TABLE  data.tags       IS 'File labels/tags';
 COMMENT ON TABLE  data.tag_rules  IS 'Auto-tagging rules: when when_tag is assigned, then_tag follows';
@@ -99,6 +124,8 @@ COMMENT ON TABLE  data.files      IS 'Managed files; actual content stored on di
 COMMENT ON TABLE  data.file_tag   IS 'Many-to-many: files <-> tags';
 COMMENT ON TABLE  data.pools      IS 'Ordered collections of files';
 COMMENT ON TABLE  data.file_pool  IS 'Many-to-many: files <-> pools, with ordering';
+COMMENT ON TABLE  data.duplicate_pairs       IS 'Precomputed near-duplicate candidate pairs (perceptual-hash distance)';
+COMMENT ON TABLE  data.duplicate_dismissals  IS 'Pairs marked "not a duplicate"; hidden from the duplicates view';

 COMMENT ON COLUMN data.files.original_name    IS 'Original filename at upload time';
 COMMENT ON COLUMN data.files.content_datetime IS 'Content datetime (e.g. when photo was taken); falls back to EXIF DateTimeOriginal';
@@ -110,6 +137,8 @@ COMMENT ON COLUMN data.file_pool.position     IS 'Manual ordering within pool; u

 -- +goose Down

+DROP TABLE IF EXISTS data.duplicate_dismissals; -- holds FKs to data.files and core.users; dropped ahead of the older tables in this Down section
+DROP TABLE IF EXISTS data.duplicate_pairs; -- holds FKs to data.files; dropped ahead of the older tables in this Down section
 DROP TABLE IF EXISTS data.file_pool;
 DROP TABLE IF EXISTS data.pools;
 DROP TABLE IF EXISTS data.file_tag;