feat(backend): duplicate pairs, dismissals, and merge resolution
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
- Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
the perceptual hashes and replaces the pairs table. This is the only thing
that populates pairs, so GET never compares all-vs-all (scales to 110k+).
- Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
dismissed), groups them into connected components via union-find, and
paginates whole clusters.
- Resolve merges a pair field-by-field: each scalar from keep or discard,
metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
the discarded file. Enforces edit ACL on both.
- Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -446,6 +446,89 @@ func (r *FileRepo) SetPHash(ctx context.Context, id uuid.UUID, phash *int64) err
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Perceptual-hash / duplicate support
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// ListMissingPHash returns live image/video files that have no perceptual hash
|
||||
// yet — the work list for the dedup backfill. Tags are not loaded (the backfill
|
||||
// only needs the id and MIME type to choose image vs video hashing).
|
||||
func (r *FileRepo) ListMissingPHash(ctx context.Context) ([]domain.File, error) {
|
||||
const sqlStr = `
|
||||
SELECT f.id, f.original_name,
|
||||
mt.name AS mime_type, mt.extension AS mime_extension,
|
||||
f.content_datetime, f.notes, f.metadata, f.exif, f.phash,
|
||||
f.creator_id, u.name AS creator_name,
|
||||
f.is_public, f.is_deleted, f.needs_review
|
||||
FROM data.files f
|
||||
JOIN core.mime_types mt ON mt.id = f.mime_id
|
||||
JOIN core.users u ON u.id = f.creator_id
|
||||
WHERE f.phash IS NULL AND f.is_deleted = false
|
||||
AND (mt.name LIKE 'image/%' OR mt.name LIKE 'video/%')
|
||||
ORDER BY f.id`
|
||||
|
||||
q := connOrTx(ctx, r.pool)
|
||||
rows, err := q.Query(ctx, sqlStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("FileRepo.ListMissingPHash: %w", err)
|
||||
}
|
||||
collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[fileRow])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("FileRepo.ListMissingPHash scan: %w", err)
|
||||
}
|
||||
files := make([]domain.File, len(collected))
|
||||
for i, row := range collected {
|
||||
files[i] = toFile(row)
|
||||
}
|
||||
return files, nil
|
||||
}
|
||||
|
||||
// phashRow is the minimal projection used to build duplicate clusters.
|
||||
type phashRow struct {
|
||||
ID uuid.UUID `db:"id"`
|
||||
PHash int64 `db:"phash"`
|
||||
}
|
||||
|
||||
// ListAllPHashes returns the id and perceptual hash of every live, hashed file.
|
||||
// It is the global input to the dedup rescan, so it deliberately ignores ACL —
|
||||
// the rescan builds the shared pairs table; visibility is enforced on read.
|
||||
func (r *FileRepo) ListAllPHashes(ctx context.Context) ([]domain.PHashEntry, error) {
|
||||
const sqlStr = `SELECT id, phash FROM data.files WHERE is_deleted = false AND phash IS NOT NULL`
|
||||
q := connOrTx(ctx, r.pool)
|
||||
rows, err := q.Query(ctx, sqlStr)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("FileRepo.ListAllPHashes: %w", err)
|
||||
}
|
||||
collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[phashRow])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("FileRepo.ListAllPHashes scan: %w", err)
|
||||
}
|
||||
out := make([]domain.PHashEntry, len(collected))
|
||||
for i, row := range collected {
|
||||
out[i] = domain.PHashEntry{ID: row.ID, PHash: row.PHash}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// CopyPoolMemberships adds targetID to every pool sourceID belongs to (copying
|
||||
// the source's position), skipping pools the target is already in. Used by the
|
||||
// duplicate merge to preserve the discarded file's pool memberships on the
|
||||
// survivor. The merge is authorised at the file level, so pool ACL is not
|
||||
// re-checked here.
|
||||
func (r *FileRepo) CopyPoolMemberships(ctx context.Context, targetID, sourceID uuid.UUID) error {
|
||||
const sqlStr = `
|
||||
INSERT INTO data.file_pool (file_id, pool_id, position)
|
||||
SELECT $1, fp.pool_id, fp.position
|
||||
FROM data.file_pool fp
|
||||
WHERE fp.file_id = $2
|
||||
ON CONFLICT (file_id, pool_id) DO NOTHING`
|
||||
q := connOrTx(ctx, r.pool)
|
||||
if _, err := q.Exec(ctx, sqlStr, targetID, sourceID); err != nil {
|
||||
return fmt.Errorf("FileRepo.CopyPoolMemberships: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SoftDelete / Restore / DeletePermanent
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user