tanabata/backend/internal/service/duplicate_index.go

package service

import (
	"bytes"
	"math/bits"
	"sort"

	"github.com/google/uuid"

	"tanabata/backend/internal/domain"
)

// hamming returns the number of differing bits between two perceptual hashes.
func hamming(a, b uint64) int { return bits.OnesCount64(a ^ b) }

// bkNode is a node in a BK-tree over Hamming distance. Files that share the exact
// same hash are collected in ids (a distance-0 collision), so identical images
// don't degenerate the tree into a chain.
type bkNode struct {
	hash     uint64
	ids      []uuid.UUID
	children map[int]*bkNode
}

// bkTree indexes perceptual hashes for sublinear radius queries. Building one and
// querying every element with a small radius is far cheaper than the O(N²) all-
// pairs comparison at 100k+ files.
type bkTree struct{ root *bkNode }

func (t *bkTree) insert(hash uint64, id uuid.UUID) {
	if t.root == nil {
		t.root = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
		return
	}
	node := t.root
	for {
		d := hamming(hash, node.hash)
		if d == 0 {
			node.ids = append(node.ids, id)
			return
		}
		child, ok := node.children[d]
		if !ok {
			node.children[d] = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
			return
		}
		node = child
	}
}

// query visits every node whose hash is within radius of target. The triangle
// inequality bounds which children can hold a match to [d-radius, d+radius].
func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) {
	if t.root == nil {
		return
	}
	stack := []*bkNode{t.root}
	for len(stack) > 0 {
		node := stack[len(stack)-1]
		stack = stack[:len(stack)-1]

		d := hamming(target, node.hash)
		if d <= radius {
			visit(node, d)
		}
		lo, hi := d-radius, d+radius
		for cd, child := range node.children {
			if cd >= lo && cd <= hi {
				stack = append(stack, child)
			}
		}
	}
}

// buildPairs returns every unordered pair of files whose hashes are within
// threshold, each emitted exactly once with FileA < FileB (UUID byte order).
// onProgress, if set, is called periodically with (processed, total).
func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair {
	tree := &bkTree{}
	for _, e := range entries {
		tree.insert(uint64(e.PHash), e.ID)
	}

	var pairs []domain.DuplicatePair
	total := len(entries)
	for i := range entries {
		e := entries[i]
		tree.query(uint64(e.PHash), threshold, func(node *bkNode, dist int) {
			for _, other := range node.ids {
				// Emit each pair once, from the smaller id, which also skips self.
				if bytes.Compare(e.ID[:], other[:]) < 0 {
					pairs = append(pairs, domain.DuplicatePair{FileA: e.ID, FileB: other, Distance: dist})
				}
			}
		})
		if onProgress != nil && (i+1)%1000 == 0 {
			onProgress(i+1, total)
		}
	}
	if onProgress != nil {
		onProgress(total, total)
	}
	return pairs
}

// clusterPairs groups pairs into connected components (transitive closure) via
// union-find. Every returned cluster has at least two files; clusters and the ids
// within them are sorted by UUID for stable pagination.
func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID {
	parent := map[uuid.UUID]uuid.UUID{}
	var find func(uuid.UUID) uuid.UUID
	find = func(x uuid.UUID) uuid.UUID {
		p, ok := parent[x]
		if !ok {
			parent[x] = x
			return x
		}
		if p != x {
			parent[x] = find(p)
		}
		return parent[x]
	}
	union := func(a, b uuid.UUID) {
		ra, rb := find(a), find(b)
		if ra != rb {
			parent[ra] = rb
		}
	}
	for _, p := range pairs {
		union(p.FileA, p.FileB)
	}

	groups := map[uuid.UUID][]uuid.UUID{}
	for node := range parent {
		root := find(node)
		groups[root] = append(groups[root], node)
	}

	clusters := make([][]uuid.UUID, 0, len(groups))
	for _, ids := range groups {
		sort.Slice(ids, func(i, j int) bool { return bytes.Compare(ids[i][:], ids[j][:]) < 0 })
		clusters = append(clusters, ids)
	}
	sort.Slice(clusters, func(i, j int) bool {
		return bytes.Compare(clusters[i][0][:], clusters[j][0][:]) < 0
	})
	return clusters
}