9216a8687f
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
- Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
the perceptual hashes and replaces the pairs table. This is the only thing
that populates pairs, so GET never compares all-vs-all (scales to 110k+).
- Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
dismissed), groups them into connected components via union-find, and
paginates whole clusters.
- Resolve merges a pair field-by-field: each scalar from keep or discard,
metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
the discarded file. Enforces edit ACL on both.
- Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
149 lines
3.9 KiB
Go
149 lines
3.9 KiB
Go
package service
|
|
|
|
import (
|
|
"bytes"
|
|
"math/bits"
|
|
"sort"
|
|
|
|
"github.com/google/uuid"
|
|
|
|
"tanabata/backend/internal/domain"
|
|
)
|
|
|
|
// hamming returns the number of differing bits between two perceptual hashes.
|
|
func hamming(a, b uint64) int {
	// XOR leaves a 1 in exactly the bit positions where the two hashes
	// disagree; counting those set bits gives the distance.
	diff := a ^ b
	return bits.OnesCount64(diff)
}
|
|
|
|
// bkNode is a node in a BK-tree over Hamming distance. Files that share the exact
|
|
// same hash are collected in ids (a distance-0 collision), so identical images
|
|
// don't degenerate the tree into a chain.
|
|
type bkNode struct {
|
|
hash uint64
|
|
ids []uuid.UUID
|
|
children map[int]*bkNode
|
|
}
|
|
|
|
// bkTree indexes perceptual hashes for sublinear radius queries. Building one and
|
|
// querying every element with a small radius is far cheaper than the O(N²) all-
|
|
// pairs comparison at 100k+ files.
|
|
type bkTree struct{ root *bkNode }
|
|
|
|
func (t *bkTree) insert(hash uint64, id uuid.UUID) {
|
|
if t.root == nil {
|
|
t.root = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
|
|
return
|
|
}
|
|
node := t.root
|
|
for {
|
|
d := hamming(hash, node.hash)
|
|
if d == 0 {
|
|
node.ids = append(node.ids, id)
|
|
return
|
|
}
|
|
child, ok := node.children[d]
|
|
if !ok {
|
|
node.children[d] = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
|
|
return
|
|
}
|
|
node = child
|
|
}
|
|
}
|
|
|
|
// query visits every node whose hash is within radius of target. The triangle
|
|
// inequality bounds which children can hold a match to [d-radius, d+radius].
|
|
func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) {
|
|
if t.root == nil {
|
|
return
|
|
}
|
|
stack := []*bkNode{t.root}
|
|
for len(stack) > 0 {
|
|
node := stack[len(stack)-1]
|
|
stack = stack[:len(stack)-1]
|
|
|
|
d := hamming(target, node.hash)
|
|
if d <= radius {
|
|
visit(node, d)
|
|
}
|
|
lo, hi := d-radius, d+radius
|
|
for cd, child := range node.children {
|
|
if cd >= lo && cd <= hi {
|
|
stack = append(stack, child)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// buildPairs returns every unordered pair of files whose hashes are within
|
|
// threshold, each emitted exactly once with FileA < FileB (UUID byte order).
|
|
// onProgress, if set, is called periodically with (processed, total).
|
|
func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair {
|
|
tree := &bkTree{}
|
|
for _, e := range entries {
|
|
tree.insert(uint64(e.PHash), e.ID)
|
|
}
|
|
|
|
var pairs []domain.DuplicatePair
|
|
total := len(entries)
|
|
for i := range entries {
|
|
e := entries[i]
|
|
tree.query(uint64(e.PHash), threshold, func(node *bkNode, dist int) {
|
|
for _, other := range node.ids {
|
|
// Emit each pair once, from the smaller id, which also skips self.
|
|
if bytes.Compare(e.ID[:], other[:]) < 0 {
|
|
pairs = append(pairs, domain.DuplicatePair{FileA: e.ID, FileB: other, Distance: dist})
|
|
}
|
|
}
|
|
})
|
|
if onProgress != nil && (i+1)%1000 == 0 {
|
|
onProgress(i+1, total)
|
|
}
|
|
}
|
|
if onProgress != nil {
|
|
onProgress(total, total)
|
|
}
|
|
return pairs
|
|
}
|
|
|
|
// clusterPairs groups pairs into connected components (transitive closure) via
|
|
// union-find. Every returned cluster has at least two files; clusters and the ids
|
|
// within them are sorted by UUID for stable pagination.
|
|
func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID {
|
|
parent := map[uuid.UUID]uuid.UUID{}
|
|
var find func(uuid.UUID) uuid.UUID
|
|
find = func(x uuid.UUID) uuid.UUID {
|
|
p, ok := parent[x]
|
|
if !ok {
|
|
parent[x] = x
|
|
return x
|
|
}
|
|
if p != x {
|
|
parent[x] = find(p)
|
|
}
|
|
return parent[x]
|
|
}
|
|
union := func(a, b uuid.UUID) {
|
|
ra, rb := find(a), find(b)
|
|
if ra != rb {
|
|
parent[ra] = rb
|
|
}
|
|
}
|
|
for _, p := range pairs {
|
|
union(p.FileA, p.FileB)
|
|
}
|
|
|
|
groups := map[uuid.UUID][]uuid.UUID{}
|
|
for node := range parent {
|
|
root := find(node)
|
|
groups[root] = append(groups[root], node)
|
|
}
|
|
|
|
clusters := make([][]uuid.UUID, 0, len(groups))
|
|
for _, ids := range groups {
|
|
sort.Slice(ids, func(i, j int) bool { return bytes.Compare(ids[i][:], ids[j][:]) < 0 })
|
|
clusters = append(clusters, ids)
|
|
}
|
|
sort.Slice(clusters, func(i, j int) bool {
|
|
return bytes.Compare(clusters[i][0][:], clusters[j][0][:]) < 0
|
|
})
|
|
return clusters
|
|
}
|