feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing:

- Two tables (edited into the original migrations): data.duplicate_pairs holds precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and data.duplicate_dismissals is a global "not a duplicate" overlay that survives rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over the perceptual hashes and replaces the pairs table. This is the only thing that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-dismissed), groups them into connected components via union-find, and paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard, metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss, POST /files/duplicates/resolve (registered before /:id to avoid collision). Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.

Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and field validation; an integration test covers rescan -> list -> merge -> dismiss (including that a dismissal survives a re-rescan).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
@@ -0,0 +1,148 @@
+package service
+
+import (
+	"bytes"
+	"math/bits"
+	"sort"
+
+	"github.com/google/uuid"
+
+	"tanabata/backend/internal/domain"
+)
+
+// hamming reports the Hamming distance between a and b: the number of bit
+// positions at which the two 64-bit perceptual hashes disagree.
+func hamming(a, b uint64) int {
+	diff := a ^ b // set bits mark the positions where a and b differ
+	return bits.OnesCount64(diff)
+}
+
+// bkNode is a node in a BK-tree over Hamming distance.
+//
+// hash is the perceptual hash the node stands for. ids holds every file that was
+// inserted with exactly that hash (a distance-0 collision), so identical images
+// share one node instead of degenerating the tree into a chain. children is
+// keyed by the Hamming distance between this node's hash and the child's hash;
+// insert keeps at most one child per distance and never uses key 0.
+type bkNode struct {
+	hash     uint64
+	ids      []uuid.UUID
+	children map[int]*bkNode
+}
+
+// bkTree indexes perceptual hashes for sublinear radius queries. Building one and
+// querying every element with a small radius is far cheaper than the O(N²)
+// all-pairs comparison at 100k+ files.
+//
+// The zero value is an empty tree that is ready to use: insert creates the root
+// on first use, and query on an empty tree visits nothing.
+type bkTree struct{ root *bkNode }
+
+// insert adds id to the tree under hash. Starting at the root it repeatedly
+// follows the child keyed by the Hamming distance between hash and the current
+// node: an exact match (distance 0) appends id to that node's ids, and a missing
+// child slot is filled with a fresh leaf holding only id.
+func (t *bkTree) insert(hash uint64, id uuid.UUID) {
+	// newLeaf builds the single-file node used for both the root and new children.
+	newLeaf := func() *bkNode {
+		return &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
+	}
+
+	if t.root == nil {
+		t.root = newLeaf()
+		return
+	}
+
+	cur := t.root
+	for {
+		dist := hamming(hash, cur.hash)
+		next, found := cur.children[dist]
+		switch {
+		case dist == 0:
+			// Same hash as an existing node: record the file there.
+			cur.ids = append(cur.ids, id)
+			return
+		case !found:
+			cur.children[dist] = newLeaf()
+			return
+		}
+		cur = next
+	}
+}
+
+// query calls visit for every node whose hash lies within radius of target,
+// passing the node and its Hamming distance to target. Children are explored
+// only when their edge distance falls in [dist-radius, dist+radius], the range
+// the triangle inequality allows for a subtree that may still contain a match.
+// Traversal is iterative (explicit stack); the order in which nodes are visited
+// is unspecified because children are stored in a map.
+func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) {
+	if t.root == nil {
+		return
+	}
+
+	pending := []*bkNode{t.root}
+	for len(pending) != 0 {
+		// Pop the most recently pushed node.
+		last := len(pending) - 1
+		cur := pending[last]
+		pending = pending[:last]
+
+		dist := hamming(target, cur.hash)
+		if dist <= radius {
+			visit(cur, dist)
+		}
+
+		for edge, sub := range cur.children {
+			if edge < dist-radius || edge > dist+radius {
+				continue // subtree cannot contain anything within radius
+			}
+			pending = append(pending, sub)
+		}
+	}
+}
+
+// buildPairs indexes entries in a BK-tree and returns every unordered pair of
+// files whose hashes are at most threshold bits apart. Each pair appears exactly
+// once, oriented so that FileA sorts before FileB in UUID byte order; Distance is
+// the Hamming distance between the two hashes.
+//
+// onProgress may be nil. When set it is called with (processed, total) after
+// every 1000 processed entries and once more with (total, total) at the end.
+func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair {
+	const progressEvery = 1000
+
+	index := &bkTree{}
+	for i := range entries {
+		index.insert(uint64(entries[i].PHash), entries[i].ID)
+	}
+
+	total := len(entries)
+	report := func(done int) {
+		if onProgress != nil {
+			onProgress(done, total)
+		}
+	}
+
+	var pairs []domain.DuplicatePair
+	for i, cur := range entries {
+		index.query(uint64(cur.PHash), threshold, func(node *bkNode, dist int) {
+			for _, other := range node.ids {
+				// Only the smaller id of a pair emits it; this yields each pair
+				// once and also drops the entry's match against itself.
+				if bytes.Compare(cur.ID[:], other[:]) >= 0 {
+					continue
+				}
+				pairs = append(pairs, domain.DuplicatePair{FileA: cur.ID, FileB: other, Distance: dist})
+			}
+		})
+		if done := i + 1; done%progressEvery == 0 {
+			report(done)
+		}
+	}
+	report(total)
+	return pairs
+}
+
+// clusterPairs groups pairs into connected components (transitive closure) via
+// union-find. Every returned cluster has at least two files; clusters and the ids
+// within them are sorted by UUID for stable pagination.
+func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID {
+	parent := map[uuid.UUID]uuid.UUID{}
+	var find func(uuid.UUID) uuid.UUID
+	find = func(x uuid.UUID) uuid.UUID {
+		p, ok := parent[x]
+		if !ok {
+			parent[x] = x
+			return x
+		}
+		if p != x {
+			parent[x] = find(p)
+		}
+		return parent[x]
+	}
+	union := func(a, b uuid.UUID) {
+		ra, rb := find(a), find(b)
+		if ra != rb {
+			parent[ra] = rb
+		}
+	}
+	for _, p := range pairs {
+		union(p.FileA, p.FileB)
+	}
+
+	groups := map[uuid.UUID][]uuid.UUID{}
+	for node := range parent {
+		root := find(node)
+		groups[root] = append(groups[root], node)
+	}
+
+	clusters := make([][]uuid.UUID, 0, len(groups))
+	for _, ids := range groups {
+		sort.Slice(ids, func(i, j int) bool { return bytes.Compare(ids[i][:], ids[j][:]) < 0 })
+		clusters = append(clusters, ids)
+	}
+	sort.Slice(clusters, func(i, j int) bool {
+		return bytes.Compare(clusters[i][0][:], clusters[j][0][:]) < 0
+	})
+	return clusters
+}