feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing:

- Two tables (edited into the original migrations): data.duplicate_pairs holds
  precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
  data.duplicate_dismissals is a global "not a duplicate" overlay that survives
  rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
    the perceptual hashes and replaces the pairs table. This is the only thing
    that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
    dismissed), groups them into connected components via union-find, and
    paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard,
    metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
    the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
  POST /files/duplicates/resolve (registered before /:id to avoid collision).
  Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
  DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.

Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
+148
View File
@@ -0,0 +1,148 @@
package service
import (
"bytes"
"math/bits"
"sort"
"github.com/google/uuid"
"tanabata/backend/internal/domain"
)
// hamming reports the Hamming distance between two 64-bit perceptual hashes:
// the count of bit positions in which a and b disagree (0 through 64).
func hamming(a, b uint64) int {
	diff := a ^ b // a set bit marks a position where the hashes differ
	return bits.OnesCount64(diff)
}
// bkNode is a node in a BK-tree over Hamming distance. Files that share the exact
// same hash are collected in ids (a distance-0 collision), so identical images
// don't degenerate the tree into a chain.
type bkNode struct {
	// hash is the perceptual hash this node represents; child edges are
	// measured against it.
	hash uint64
	// ids holds every file inserted with exactly this hash. It always has at
	// least one element, because a node is only created for an inserted file.
	ids []uuid.UUID
	// children maps a Hamming distance to the subtree of hashes lying at that
	// distance from hash. Key 0 is never used: distance-0 inserts go into ids.
	children map[int]*bkNode
}
// bkTree indexes perceptual hashes for sublinear radius queries. Building one and
// querying every element with a small radius is far cheaper than the O(N²) all-
// pairs comparison at 100k+ files.
//
// The zero value is an empty tree: root stays nil until the first insert, and
// query on a nil root visits nothing.
type bkTree struct{ root *bkNode }
// insert adds the file id under hash. The first insert becomes the root. After
// that the tree is descended along the edge labelled with the Hamming distance
// between hash and the current node: an exact match (distance 0) appends id to
// that node's ids, and a missing edge is where the new leaf is attached.
func (t *bkTree) insert(hash uint64, id uuid.UUID) {
	// newLeaf builds a childless node holding only this file.
	newLeaf := func() *bkNode {
		return &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
	}

	if t.root == nil {
		t.root = newLeaf()
		return
	}

	cur := t.root
	for {
		dist := hamming(hash, cur.hash)
		if dist == 0 {
			// Same hash as an existing node: record the file there.
			cur.ids = append(cur.ids, id)
			return
		}
		next, exists := cur.children[dist]
		if !exists {
			cur.children[dist] = newLeaf()
			return
		}
		cur = next
	}
}
// query calls visit once for every node whose hash is within radius (inclusive)
// of target, passing the node and its Hamming distance to target. A negative
// radius or an empty tree visits nothing.
//
// The triangle inequality bounds which children can hold a match: for a node at
// distance d from target, only edges labelled in [d-radius, d+radius] need to be
// followed. Child edges are always in 1..64 (hashes are 64-bit and distance-0
// inserts are folded into the node's ids), so the range is clamped to that
// window. Clamping also keeps d+radius from overflowing for very large radii.
//
// Candidate edges are probed in ascending order rather than by ranging over the
// children map, so traversal order is deterministic for a given tree and the
// per-node work is bounded by the radius instead of the node's fan-out.
func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) {
	if t.root == nil || radius < 0 {
		return
	}

	// maxDist is the largest possible Hamming distance between two uint64 hashes.
	const maxDist = 64

	// Explicit stack instead of recursion; the tree can be deep for large inputs.
	stack := []*bkNode{t.root}
	for len(stack) > 0 {
		node := stack[len(stack)-1]
		stack = stack[:len(stack)-1]

		d := hamming(target, node.hash)
		if d <= radius {
			visit(node, d)
		}

		// radius >= 0 and 0 <= d <= 64, so d-radius cannot overflow.
		lo := d - radius
		if lo < 1 {
			lo = 1
		}
		// Compare before adding so that d+radius is only computed when it fits.
		hi := maxDist
		if radius < maxDist-d {
			hi = d + radius
		}

		for cd := lo; cd <= hi; cd++ {
			if child, ok := node.children[cd]; ok {
				stack = append(stack, child)
			}
		}
	}
}
// buildPairs finds all near-duplicate pairs among entries: every unordered pair
// of files whose perceptual hashes differ by at most threshold bits. Each pair
// is reported exactly once, oriented so that FileA sorts before FileB in UUID
// byte order, together with the Hamming distance between the two hashes.
//
// All entries are first indexed in a BK-tree; each entry is then used as a
// radius query against that index. If onProgress is non-nil it is invoked after
// every 1000 queried entries with (processed, total), and once more with
// (total, total) when the scan finishes.
func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair {
	index := &bkTree{}
	for i := range entries {
		index.insert(uint64(entries[i].PHash), entries[i].ID)
	}

	total := len(entries)
	// report forwards progress to the optional callback.
	report := func(done int) {
		if onProgress != nil {
			onProgress(done, total)
		}
	}

	var pairs []domain.DuplicatePair
	for n, cur := range entries {
		collect := func(node *bkNode, dist int) {
			for _, cand := range node.ids {
				// Only the smaller id of a pair emits it; this drops the
				// mirrored pair and the entry's match against itself.
				if bytes.Compare(cur.ID[:], cand[:]) >= 0 {
					continue
				}
				pairs = append(pairs, domain.DuplicatePair{FileA: cur.ID, FileB: cand, Distance: dist})
			}
		}
		index.query(uint64(cur.PHash), threshold, collect)

		if done := n + 1; done%1000 == 0 {
			report(done)
		}
	}
	report(total)
	return pairs
}
// clusterPairs partitions the files mentioned in pairs into connected
// components: two files land in the same cluster when a chain of pairs links
// them (transitive closure), computed with a union-find over the file ids.
//
// The ids inside each cluster are sorted in UUID byte order, and the clusters
// themselves are ordered by their smallest id, so the result is stable across
// calls and safe to paginate. An empty input yields an empty, non-nil slice.
func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID {
	parent := map[uuid.UUID]uuid.UUID{}

	// find returns the representative of x's set, registering x as its own
	// singleton set on first sight and compressing the walked path to the root.
	find := func(x uuid.UUID) uuid.UUID {
		if _, known := parent[x]; !known {
			parent[x] = x
			return x
		}
		root := x
		for parent[root] != root {
			root = parent[root]
		}
		for x != root {
			next := parent[x]
			parent[x] = root
			x = next
		}
		return root
	}

	for i := range pairs {
		rootA, rootB := find(pairs[i].FileA), find(pairs[i].FileB)
		if rootA != rootB {
			parent[rootA] = rootB
		}
	}

	// Bucket every known file under its set representative.
	members := map[uuid.UUID][]uuid.UUID{}
	for id := range parent {
		rep := find(id)
		members[rep] = append(members[rep], id)
	}

	before := func(a, b uuid.UUID) bool { return bytes.Compare(a[:], b[:]) < 0 }

	clusters := make([][]uuid.UUID, 0, len(members))
	for _, ids := range members {
		sort.Slice(ids, func(i, j int) bool { return before(ids[i], ids[j]) })
		clusters = append(clusters, ids)
	}
	// Every bucket has at least one id, so indexing [0] is safe; after the
	// inner sort it is the cluster's smallest id.
	sort.Slice(clusters, func(i, j int) bool { return before(clusters[i][0], clusters[j][0]) })
	return clusters
}