feat(backend): duplicate pairs, dismissals, and merge resolution
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. Also adds the new audit actions file_merge and duplicate_dismiss.
- DuplicateService:
- Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
the perceptual hashes and replaces the pairs table. This is the only thing
that populates pairs, so GET never compares all-vs-all (scales to 110k+).
- Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
dismissed), groups them into connected components via union-find, and
paginates whole clusters.
- Resolve merges a pair field-by-field: each scalar comes from keep or discard,
metadata is keep/discard/shallow-merge, and tags/pools are keep or union; the
discarded file is then trashed. Edit ACL is enforced on both files.
- Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -52,6 +52,15 @@ type FileRepo interface {
|
||||
SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bool) error
|
||||
// SetPHash sets (or clears, when nil) the perceptual hash of a file.
|
||||
SetPHash(ctx context.Context, id uuid.UUID, phash *int64) error
|
||||
// ListMissingPHash returns live image/video files that have no perceptual
|
||||
// hash yet (the dedup backfill work list).
|
||||
ListMissingPHash(ctx context.Context) ([]domain.File, error)
|
||||
// ListAllPHashes returns the id and perceptual hash of every live, hashed
|
||||
// file (the global input to the dedup rescan; not ACL-filtered).
|
||||
ListAllPHashes(ctx context.Context) ([]domain.PHashEntry, error)
|
||||
// CopyPoolMemberships adds targetID to every pool sourceID belongs to,
|
||||
// skipping pools target is already in (used by the duplicate merge).
|
||||
CopyPoolMemberships(ctx context.Context, targetID, sourceID uuid.UUID) error
|
||||
// SoftDelete moves a file to trash (sets is_deleted = true).
|
||||
SoftDelete(ctx context.Context, id uuid.UUID) error
|
||||
// Restore moves a file out of trash (sets is_deleted = false).
|
||||
@@ -72,6 +81,21 @@ type FileRepo interface {
|
||||
RecordTagUses(ctx context.Context, userID int16, filterDSL string) error
|
||||
}
|
||||
|
||||
// DuplicatePairRepo persists the precomputed near-duplicate candidate pairs.
// The pairs table is derived data: the rescan rebuilds it wholesale and is
// its only writer, so readers never have to compare files all-vs-all.
|
||||
type DuplicatePairRepo interface {
|
||||
// ReplaceAll atomically replaces the whole pairs table (used by the rescan).
// pairs is the complete new contents of the table, not a delta.
// NOTE(review): presumably an empty slice clears the table — confirm
// against the implementation.
|
||||
ReplaceAll(ctx context.Context, pairs []domain.DuplicatePair) error
|
||||
// ListVisible returns the pairs in which both files are live, not dismissed, and
|
||||
// (for non-admins) visible to the viewer.
// viewerID identifies the requesting user; isAdmin skips the per-viewer
// visibility filter.
// NOTE(review): presumably viewerID is ignored when isAdmin is true — confirm.
|
||||
ListVisible(ctx context.Context, viewerID int16, isAdmin bool) ([]domain.DuplicatePair, error)
|
||||
}
|
||||
|
||||
// DismissalRepo persists "not a duplicate" decisions.
// Dismissals are stored separately from the candidate pairs, so they survive
// a rescan that rebuilds the pairs table.
|
||||
type DismissalRepo interface {
|
||||
// Add records a pair as dismissed (canonical order, idempotent).
// a and b are the ids of the two files in the pair; userID is the user
// recording the decision.
// NOTE(review): presumably the implementation normalises (a, b) into
// canonical order itself, so callers may pass them either way — confirm.
|
||||
Add(ctx context.Context, a, b uuid.UUID, userID int16) error
|
||||
}
|
||||
|
||||
// TagRepo is the persistence interface for tags.
|
||||
type TagRepo interface {
|
||||
List(ctx context.Context, params OffsetParams) (*domain.TagOffsetPage, error)
|
||||
|
||||
Reference in New Issue
Block a user