feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing:

- Two tables (edited into the original migrations): data.duplicate_pairs holds
  precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
  data.duplicate_dismissals is a global "not a duplicate" overlay that survives
  rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
    the perceptual hashes and replaces the pairs table. This is the only thing
    that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
    dismissed), groups them into connected components via union-find, and
    paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard,
    metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
    the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
  POST /files/duplicates/resolve (registered before /:id to avoid collision).
  Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
  DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.

Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
+109 -1
View File
@@ -56,6 +56,9 @@ type harness struct {
client *http.Client
importDir string
pool *pgxpool.Pool
// dupSvc lets duplicate tests trigger a pairs rescan directly: rebuilding the
// pairs table is a CLI/maintenance action with no HTTP endpoint.
dupSvc *service.DuplicateService
}
// setupSuite creates an ephemeral database, runs migrations, wires the full
@@ -125,6 +128,8 @@ func setupSuite(t *testing.T) *harness {
tagRuleRepo := postgres.NewTagRuleRepo(pool)
categoryRepo := postgres.NewCategoryRepo(pool)
poolRepo := postgres.NewPoolRepo(pool)
duplicatePairRepo := postgres.NewDuplicatePairRepo(pool)
dismissalRepo := postgres.NewDismissalRepo(pool)
transactor := postgres.NewTransactor(pool)
// --- Services ------------------------------------------------------------
@@ -134,6 +139,7 @@ func setupSuite(t *testing.T) *harness {
tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor)
categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc)
poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc)
duplicateSvc := service.NewDuplicateService(fileRepo, duplicatePairRepo, dismissalRepo, aclSvc, auditSvc, transactor, 10)
fileSvc := service.NewFileService(fileRepo, mimeRepo, diskStorage, aclSvc, auditSvc, tagSvc, transactor, importDir)
userSvc := service.NewUserService(userRepo, sessionRepo, auditSvc)
@@ -145,6 +151,7 @@ func setupSuite(t *testing.T) *harness {
authMiddleware := handler.NewAuthMiddleware(authSvc)
authHandler := handler.NewAuthHandler(authSvc)
fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, 500<<20)
duplicateHandler := handler.NewDuplicateHandler(duplicateSvc)
tagHandler := handler.NewTagHandler(tagSvc, fileSvc)
categoryHandler := handler.NewCategoryHandler(categorySvc)
poolHandler := handler.NewPoolHandler(poolSvc)
@@ -154,7 +161,7 @@ func setupSuite(t *testing.T) *harness {
r, err := handler.NewRouter(
authMiddleware, authHandler,
fileHandler, tagHandler, categoryHandler, poolHandler,
fileHandler, duplicateHandler, tagHandler, categoryHandler, poolHandler,
userHandler, aclHandler, auditHandler,
"",
nil,
@@ -170,6 +177,7 @@ func setupSuite(t *testing.T) *harness {
client: srv.Client(),
importDir: importDir,
pool: pool,
dupSvc: duplicateSvc,
}
}
@@ -1643,3 +1651,103 @@ var (
_ = freePort
_ = writeFile
)
// dupListResponse decodes GET /files/duplicates.
type dupListResponse struct {
Items []struct {
Files []struct {
ID string `json:"id"`
Tags []struct {
ID string `json:"id"`
} `json:"tags"`
} `json:"files"`
} `json:"items"`
Total int `json:"total"`
}
// TestDuplicateDetection exercises the full duplicate lifecycle: perceptual hashes
// are computed on upload, a rescan builds the pairs table, the cluster surfaces,
// a field-by-field merge unions tags and trashes the discarded file, and a
// dismissal hides a pair permanently (surviving a re-rescan).
//
// minimalJPEG() is a 1×1 image, so every upload hashes identically — in a fresh
// database any two uploads form one duplicate pair, which keeps this deterministic.
func TestDuplicateDetection(t *testing.T) {
if testing.Short() {
t.Skip("skipping integration test in short mode")
}
h := setupSuite(t)
ctx := context.Background()
admin := h.login("admin", "admin")
// --- two uploads => one duplicate pair after a rescan ---------------------
f1 := h.uploadJPEG(admin, "a.jpg")["id"].(string)
f2 := h.uploadJPEG(admin, "b.jpg")["id"].(string)
// Tag f2 so the merge has something to union onto the survivor.
resp := h.doJSON("POST", "/tags", map[string]any{"name": "kept", "is_public": true}, admin)
require.Equal(t, http.StatusCreated, resp.StatusCode, resp.String())
var tag map[string]any
resp.decode(t, &tag)
tagID := tag["id"].(string)
resp = h.doJSON("PUT", "/files/"+f2+"/tags", map[string]any{"tag_ids": []string{tagID}}, admin)
require.Equal(t, http.StatusOK, resp.StatusCode, resp.String())
require.NoError(t, h.dupSvc.Rescan(ctx, nil))
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
require.Equal(t, http.StatusOK, resp.StatusCode, resp.String())
var list dupListResponse
resp.decode(t, &list)
require.Equal(t, 1, list.Total, "expected one duplicate cluster: %s", resp)
require.Len(t, list.Items, 1)
require.Len(t, list.Items[0].Files, 2)
// --- resolve: keep f1, union tags from f2, trash f2 ----------------------
resp = h.doJSON("POST", "/files/duplicates/resolve", map[string]any{
"keep": f1,
"discard": f2,
"fields": map[string]any{"tags": "both"},
"delete_discarded": true,
}, admin)
require.Equal(t, http.StatusOK, resp.StatusCode, resp.String())
var survivor struct {
ID string `json:"id"`
Tags []struct {
ID string `json:"id"`
} `json:"tags"`
}
resp.decode(t, &survivor)
assert.Equal(t, f1, survivor.ID)
require.Len(t, survivor.Tags, 1, "survivor should have inherited the discarded file's tag")
assert.Equal(t, tagID, survivor.Tags[0].ID)
// f2 is now trashed, so the pair drops out of the duplicates view.
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
resp.decode(t, &list)
assert.Equal(t, 0, list.Total, "resolved pair should no longer surface: %s", resp)
// --- dismiss: a new pair, hidden and staying hidden across a rescan -------
f3 := h.uploadJPEG(admin, "c.jpg")["id"].(string)
require.NoError(t, h.dupSvc.Rescan(ctx, nil))
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
resp.decode(t, &list)
require.Equal(t, 1, list.Total, "f1 and f3 should now form a cluster: %s", resp)
resp = h.doJSON("POST", "/files/duplicates/dismiss", map[string]any{
"file_id_a": f1, "file_id_b": f3,
}, admin)
require.Equal(t, http.StatusNoContent, resp.StatusCode, resp.String())
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
resp.decode(t, &list)
assert.Equal(t, 0, list.Total, "dismissed pair should be hidden")
// A rescan re-finds the pair but the dismissal still hides it.
require.NoError(t, h.dupSvc.Rescan(ctx, nil))
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
resp.decode(t, &list)
assert.Equal(t, 0, list.Total, "dismissal must survive a rescan")
}