feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing:

- Two tables (edited into the original migrations): data.duplicate_pairs holds
  precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
  data.duplicate_dismissals is a global "not a duplicate" overlay that survives
  rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
    the perceptual hashes and replaces the pairs table. This is the only thing
    that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
    dismissed), groups them into connected components via union-find, and
    paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard,
    metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
    the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
  POST /files/duplicates/resolve (registered before /:id to avoid collision).
  Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
  DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.

Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
+29
View File
@@ -92,6 +92,31 @@ CREATE TABLE data.file_pool (
PRIMARY KEY (file_id, pool_id)
);
-- Precomputed near-duplicate candidates (phash Hamming distance <= threshold),
-- (re)built in full by the dedup rescan. Stored once per unordered pair with a
-- canonical file_a < file_b ordering so a pair is never duplicated as (a,b)/(b,a).
-- A row is removed automatically when either referenced file row is deleted
-- (ON DELETE CASCADE on both foreign keys).
CREATE TABLE data.duplicate_pairs (
    file_a   uuid     NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
    file_b   uuid     NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
    -- Hamming distance between the two perceptual hashes.
    distance smallint NOT NULL,
    -- Canonical ordering; as a side effect a file can never be paired with itself.
    CONSTRAINT chk__duplicate_pairs__order CHECK (file_a < file_b),
    -- A Hamming distance is a count of differing bits, so it cannot be negative.
    CONSTRAINT chk__duplicate_pairs__distance CHECK (distance >= 0),
    PRIMARY KEY (file_a, file_b)
);
-- "Not a duplicate" decisions: a global overlay that hides a candidate pair from
-- the duplicates view. Survives rescans (the pair may be re-found but stays
-- hidden). Same canonical file_a < file_b ordering as data.duplicate_pairs.
CREATE TABLE data.duplicate_dismissals (
    -- The dismissed pair; both sides follow their file row on update/delete.
    file_a uuid NOT NULL
        REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
    file_b uuid NOT NULL
        REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
    -- User who made the decision; RESTRICT blocks deleting a user that still
    -- has dismissals on record.
    dismissed_by smallint NOT NULL
        REFERENCES core.users(id) ON UPDATE CASCADE ON DELETE RESTRICT,
    -- clock_timestamp() is the wall-clock time at insert, not the transaction start.
    dismissed_at timestamptz NOT NULL DEFAULT clock_timestamp(),
    PRIMARY KEY (file_a, file_b),
    -- Canonical ordering, so (a,b) and (b,a) cannot both be stored.
    CONSTRAINT chk__duplicate_dismissals__order CHECK (file_a < file_b)
);
-- Table descriptions stored in the catalog.
COMMENT ON TABLE data.categories IS 'Logical grouping of tags';
COMMENT ON TABLE data.tags       IS 'File labels/tags';
COMMENT ON TABLE data.tag_rules  IS 'Auto-tagging rules: when when_tag is assigned, then_tag follows';
@@ -99,6 +124,8 @@ COMMENT ON TABLE data.files IS 'Managed files; actual content stored on di
COMMENT ON TABLE data.file_tag             IS 'Many-to-many: files <-> tags';
COMMENT ON TABLE data.pools                IS 'Ordered collections of files';
COMMENT ON TABLE data.file_pool            IS 'Many-to-many: files <-> pools, with ordering';
-- Duplicate-detection tables.
COMMENT ON TABLE data.duplicate_pairs      IS 'Precomputed near-duplicate candidate pairs (perceptual-hash distance)';
COMMENT ON TABLE data.duplicate_dismissals IS 'Pairs marked "not a duplicate"; hidden from the duplicates view';
-- Column descriptions.
COMMENT ON COLUMN data.files.original_name    IS 'Original filename at upload time';
COMMENT ON COLUMN data.files.content_datetime IS 'Content datetime (e.g. when photo was taken); falls back to EXIF DateTimeOriginal';
@@ -110,6 +137,8 @@ COMMENT ON COLUMN data.file_pool.position IS 'Manual ordering within pool; u
-- +goose Down
-- Teardown of the data.* tables; statement order matters because of foreign keys.
-- The two duplicate-detection tables both reference data.files, so they are
-- dropped ahead of it (NOTE(review): the data.files drop itself is presumably
-- further down this section; not visible here).
DROP TABLE IF EXISTS data.duplicate_dismissals;
DROP TABLE IF EXISTS data.duplicate_pairs;
DROP TABLE IF EXISTS data.file_pool;
DROP TABLE IF EXISTS data.pools;
DROP TABLE IF EXISTS data.file_tag;
+8
View File
@@ -26,6 +26,12 @@ CREATE INDEX idx__files__needs_review ON data.files USING btree (id) WHERE
-- Hash indexes on each side of the file <-> tag link (equality lookups only).
CREATE INDEX idx__file_tag__tag_id
    ON data.file_tag USING hash (tag_id);
CREATE INDEX idx__file_tag__file_id
    ON data.file_tag USING hash (file_id);
-- data.duplicate_pairs / data.duplicate_dismissals
-- Both tables have the composite primary key (file_a, file_b), which already
-- serves lookups by file_a. file_b gets its own index because it is hit by the
-- ON DELETE CASCADE and by the visibility join on the second file.
CREATE INDEX idx__duplicate_pairs__file_b
    ON data.duplicate_pairs USING hash (file_b);
CREATE INDEX idx__duplicate_dismissals__file_b
    ON data.duplicate_dismissals USING hash (file_b);
-- data.pools
-- Equality lookup of pools by creator_id.
CREATE INDEX idx__pools__creator_id
    ON data.pools USING hash (creator_id);
@@ -70,6 +76,8 @@ DROP INDEX IF EXISTS activity.idx__sessions__token_hash;
-- Index teardown; IF EXISTS keeps each statement safe to re-run.
DROP INDEX IF EXISTS activity.idx__sessions__user_id;
DROP INDEX IF EXISTS acl.idx__acl__user;
DROP INDEX IF EXISTS acl.idx__acl__object;
-- file_b lookup indexes of the duplicate-detection tables.
DROP INDEX IF EXISTS data.idx__duplicate_dismissals__file_b;
DROP INDEX IF EXISTS data.idx__duplicate_pairs__file_b;
DROP INDEX IF EXISTS data.idx__file_pool__file_id;
DROP INDEX IF EXISTS data.idx__file_pool__pool_id;
DROP INDEX IF EXISTS data.idx__pools__creator_id;
+1
View File
@@ -21,6 +21,7 @@ INSERT INTO activity.action_types (name) VALUES
-- Files
('file_create'), ('file_edit'), ('file_delete'), ('file_restore'),
('file_permanent_delete'), ('file_replace'), ('file_review'),
('file_merge'), ('duplicate_dismiss'),
-- Tags
('tag_create'), ('tag_edit'), ('tag_delete'),
-- Categories