feat(backend): duplicate pairs, dismissals, and merge resolution
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
- Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
the perceptual hashes and replaces the pairs table. This is the only thing
that populates pairs, so GET never compares all-vs-all (scales to 110k+).
- Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
dismissed), groups them into connected components via union-find, and
paginates whole clusters.
- Resolve merges a pair field-by-field: each scalar from keep or discard,
metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
the discarded file. Enforces edit ACL on both.
- Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,152 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"github.com/google/uuid"
|
||||
|
||||
"tanabata/backend/internal/domain"
|
||||
)
|
||||
|
||||
// id builds a deterministic UUID whose byte order matches n, so tests can reason
|
||||
// about the canonical (FileA < FileB) ordering buildPairs produces.
|
||||
func id(n int) uuid.UUID {
|
||||
return uuid.MustParse(fmt.Sprintf("00000000-0000-0000-0000-%012d", n))
|
||||
}
|
||||
|
||||
func entry(n int, hash uint64) domain.PHashEntry {
|
||||
return domain.PHashEntry{ID: id(n), PHash: int64(hash)}
|
||||
}
|
||||
|
||||
// pairKey canonicalises a pair for set comparison regardless of emission order.
|
||||
func pairKey(p domain.DuplicatePair) string {
|
||||
a, b := p.FileA, p.FileB
|
||||
if bytes.Compare(a[:], b[:]) > 0 {
|
||||
a, b = b, a
|
||||
}
|
||||
return fmt.Sprintf("%s|%s|%d", a, b, p.Distance)
|
||||
}
|
||||
|
||||
func TestBuildPairs_ThresholdAndCanonicalOrder(t *testing.T) {
|
||||
entries := []domain.PHashEntry{
|
||||
entry(1, 0x0000000000000000),
|
||||
entry(2, 0x0000000000000001), // distance 1 from #1
|
||||
entry(3, 0x00000000000000FF), // distance 8 from #1, 7 from #2
|
||||
entry(4, 0xFFFFFFFFFFFFFFFF), // distance 64 from #1
|
||||
}
|
||||
|
||||
// Tight threshold: only the distance-1 pair qualifies.
|
||||
got := buildPairs(entries, 2, nil)
|
||||
if len(got) != 1 {
|
||||
t.Fatalf("threshold 2: got %d pairs, want 1: %+v", len(got), got)
|
||||
}
|
||||
if got[0].FileA != id(1) || got[0].FileB != id(2) || got[0].Distance != 1 {
|
||||
t.Errorf("threshold 2: unexpected pair %+v", got[0])
|
||||
}
|
||||
// Canonical order always FileA < FileB.
|
||||
if bytes.Compare(got[0].FileA[:], got[0].FileB[:]) >= 0 {
|
||||
t.Error("pair not in canonical FileA < FileB order")
|
||||
}
|
||||
|
||||
// Looser threshold pulls in #3's pairs but never #4.
|
||||
got8 := buildPairs(entries, 8, nil)
|
||||
want := map[string]bool{
|
||||
pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(2), Distance: 1}): true,
|
||||
pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(3), Distance: 8}): true,
|
||||
pairKey(domain.DuplicatePair{FileA: id(2), FileB: id(3), Distance: 7}): true,
|
||||
}
|
||||
if len(got8) != len(want) {
|
||||
t.Fatalf("threshold 8: got %d pairs, want %d: %+v", len(got8), len(want), got8)
|
||||
}
|
||||
for _, p := range got8 {
|
||||
if !want[pairKey(p)] {
|
||||
t.Errorf("threshold 8: unexpected pair %+v", p)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildPairs_IdenticalHashesPairAtDistanceZero(t *testing.T) {
|
||||
entries := []domain.PHashEntry{
|
||||
entry(1, 0xABCDABCDABCDABCD),
|
||||
entry(2, 0xABCDABCDABCDABCD),
|
||||
}
|
||||
got := buildPairs(entries, 0, nil)
|
||||
if len(got) != 1 || got[0].Distance != 0 || got[0].FileA != id(1) || got[0].FileB != id(2) {
|
||||
t.Fatalf("identical hashes: got %+v, want one distance-0 pair (1,2)", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestClusterPairs_ConnectedComponents(t *testing.T) {
|
||||
pairs := []domain.DuplicatePair{
|
||||
{FileA: id(1), FileB: id(2)},
|
||||
{FileA: id(2), FileB: id(3)}, // transitively joins 1-2-3
|
||||
{FileA: id(5), FileB: id(6)},
|
||||
}
|
||||
clusters := clusterPairs(pairs)
|
||||
if len(clusters) != 2 {
|
||||
t.Fatalf("got %d clusters, want 2: %+v", len(clusters), clusters)
|
||||
}
|
||||
// Sorted by smallest id: {1,2,3} then {5,6}.
|
||||
if len(clusters[0]) != 3 || clusters[0][0] != id(1) || clusters[0][2] != id(3) {
|
||||
t.Errorf("cluster 0 = %v, want [1 2 3]", clusters[0])
|
||||
}
|
||||
if len(clusters[1]) != 2 || clusters[1][0] != id(5) {
|
||||
t.Errorf("cluster 1 = %v, want [5 6]", clusters[1])
|
||||
}
|
||||
// Each cluster's ids are sorted.
|
||||
for _, c := range clusters {
|
||||
if !sort.SliceIsSorted(c, func(i, j int) bool { return bytes.Compare(c[i][:], c[j][:]) < 0 }) {
|
||||
t.Errorf("cluster not sorted: %v", c)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestPickMetadata_Merge(t *testing.T) {
|
||||
keep := json.RawMessage(`{"a":1,"b":2}`)
|
||||
discard := json.RawMessage(`{"b":9,"c":3}`)
|
||||
|
||||
out := pickMetadata(mergeMerge, keep, discard)
|
||||
var m map[string]int
|
||||
if err := json.Unmarshal(out, &m); err != nil {
|
||||
t.Fatalf("merge result not valid JSON: %v (%s)", err, out)
|
||||
}
|
||||
want := map[string]int{"a": 1, "b": 2, "c": 3} // survivor wins on "b"
|
||||
if fmt.Sprint(m) != fmt.Sprint(want) {
|
||||
t.Errorf("merge = %v, want %v", m, want)
|
||||
}
|
||||
|
||||
if string(pickMetadata(mergeKeep, keep, discard)) != string(keep) {
|
||||
t.Error("keep choice should return survivor metadata unchanged")
|
||||
}
|
||||
if string(pickMetadata(mergeDiscard, keep, discard)) != string(discard) {
|
||||
t.Error("discard choice should return the other file's metadata")
|
||||
}
|
||||
}
|
||||
|
||||
func TestMergeSpec_Normalize(t *testing.T) {
|
||||
// Empty fields default to "keep".
|
||||
spec := MergeSpec{Keep: id(1), Discard: id(2)}
|
||||
if err := spec.normalize(); err != nil {
|
||||
t.Fatalf("normalize empty: %v", err)
|
||||
}
|
||||
if spec.Fields.OriginalName != mergeKeep || spec.Fields.Tags != mergeKeep || spec.Fields.Metadata != mergeKeep {
|
||||
t.Errorf("empty fields not defaulted to keep: %+v", spec.Fields)
|
||||
}
|
||||
|
||||
// "both" is invalid for a scalar field.
|
||||
bad := MergeSpec{Keep: id(1), Discard: id(2), Fields: MergeFields{Notes: mergeBoth}}
|
||||
if err := bad.normalize(); !errors.Is(err, domain.ErrValidation) {
|
||||
t.Errorf("scalar=both: got %v, want ErrValidation", err)
|
||||
}
|
||||
|
||||
// "discard" is invalid for a relation field.
|
||||
badRel := MergeSpec{Keep: id(1), Discard: id(2), Fields: MergeFields{Tags: mergeDiscard}}
|
||||
if err := badRel.normalize(); !errors.Is(err, domain.ErrValidation) {
|
||||
t.Errorf("relation=discard: got %v, want ErrValidation", err)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user