feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing:

- Two tables (edited into the original migrations): data.duplicate_pairs holds
  precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
  data.duplicate_dismissals is a global "not a duplicate" overlay that survives
  rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
    the perceptual hashes and replaces the pairs table. This is the only thing
    that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
    dismissed), groups them into connected components via union-find, and
    paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard,
    metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
    the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
  POST /files/duplicates/resolve (registered before /:id to avoid collision).
  Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
  DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.

Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
@@ -0,0 +1,122 @@
package handler
import (
"net/http"
"strconv"
"github.com/gin-gonic/gin"
"tanabata/backend/internal/domain"
"tanabata/backend/internal/service"
)
// DuplicateHandler handles the /files/duplicates endpoints: listing duplicate
// clusters, dismissing a pair, and resolving (merging) a pair.
type DuplicateHandler struct {
	// dupSvc is the service every handler method delegates to; the handler
	// itself only parses requests and shapes responses.
	dupSvc *service.DuplicateService
}
// NewDuplicateHandler returns a DuplicateHandler backed by dupSvc.
//
// dupSvc is stored as given, without a nil check.
func NewDuplicateHandler(dupSvc *service.DuplicateService) *DuplicateHandler {
	h := new(DuplicateHandler)
	h.dupSvc = dupSvc
	return h
}
// List handles GET /files/duplicates — an offset-paginated list of duplicate
// clusters, each a group of files within the perceptual-hash threshold.
//
// Query parameters:
//   - limit: page size, default 20, clamped to the range [1, 50].
//   - offset: number of clusters to skip, default 0; negative values become 0.
//
// A missing or non-integer value silently falls back to its default. The
// response is {"items": [{"files": [...]}, ...], "total", "limit", "offset"},
// where limit and offset echo the effective (clamped) values.
func (h *DuplicateHandler) List(c *gin.Context) {
	// queryInt reads an integer query parameter, returning fallback when the
	// parameter is absent or does not parse.
	queryInt := func(key string, fallback int) int {
		parsed, convErr := strconv.Atoi(c.Query(key))
		if convErr != nil {
			return fallback
		}
		return parsed
	}

	limit := queryInt("limit", 20)
	offset := queryInt("offset", 0)

	// Clamp rather than reject: out-of-range paging values are corrected.
	switch {
	case limit < 1:
		limit = 1
	case limit > 50:
		limit = 50
	}
	if offset < 0 {
		offset = 0
	}

	clusters, total, err := h.dupSvc.Clusters(c.Request.Context(), limit, offset)
	if err != nil {
		respondError(c, err)
		return
	}

	// Both slices are created with make, so an empty page still serializes
	// as [] rather than null.
	items := make([]gin.H, 0, len(clusters))
	for _, cluster := range clusters {
		members := make([]fileJSON, 0, len(cluster))
		for _, file := range cluster {
			members = append(members, toFileJSON(file))
		}
		items = append(items, gin.H{"files": members})
	}

	payload := gin.H{
		"items":  items,
		"total":  total,
		"limit":  limit,
		"offset": offset,
	}
	respondJSON(c, http.StatusOK, payload)
}
// Dismiss handles POST /files/duplicates/dismiss — mark a pair "not a duplicate".
//
// The JSON body must carry file_id_a and file_id_b, both required and both
// parseable by parseUUIDs. A malformed body or an unparseable ID is reported
// as domain.ErrValidation; an error from the service is passed to
// respondError unchanged. On success the response is 204 No Content.
func (h *DuplicateHandler) Dismiss(c *gin.Context) {
	type dismissRequest struct {
		FileIDA string `json:"file_id_a" binding:"required"`
		FileIDB string `json:"file_id_b" binding:"required"`
	}

	var req dismissRequest
	if bindErr := c.ShouldBindJSON(&req); bindErr != nil {
		// The binding detail is deliberately collapsed into a generic
		// validation error.
		respondError(c, domain.ErrValidation)
		return
	}

	pair, parseErr := parseUUIDs([]string{req.FileIDA, req.FileIDB})
	if parseErr != nil {
		respondError(c, domain.ErrValidation)
		return
	}

	err := h.dupSvc.Dismiss(c.Request.Context(), pair[0], pair[1])
	if err != nil {
		respondError(c, err)
		return
	}

	c.Status(http.StatusNoContent)
}
// Resolve handles POST /files/duplicates/resolve — merge a duplicate pair,
// keeping one file and folding the chosen fields in from the other. Returns the
// updated survivor.
//
// Request body (JSON):
//   - keep, discard: required file IDs; both must be parseable by parseUUIDs
//     and must refer to two different files.
//   - fields: per-field merge choices, forwarded to the service unchanged.
//   - delete_discarded: optional; defaults to true when omitted.
//
// A malformed body, an unparseable ID, or keep == discard is reported as
// domain.ErrValidation; an error from the service is passed to respondError
// unchanged. On success the response is 200 with the survivor as JSON.
func (h *DuplicateHandler) Resolve(c *gin.Context) {
	var body struct {
		Keep            string              `json:"keep" binding:"required"`
		Discard         string              `json:"discard" binding:"required"`
		Fields          service.MergeFields `json:"fields"`
		DeleteDiscarded *bool               `json:"delete_discarded"`
	}
	if err := c.ShouldBindJSON(&body); err != nil {
		respondError(c, domain.ErrValidation)
		return
	}
	ids, err := parseUUIDs([]string{body.Keep, body.Discard})
	if err != nil {
		respondError(c, domain.ErrValidation)
		return
	}
	// Merging a file into itself is never meaningful, and with
	// delete_discarded defaulting to true it could remove the only copy.
	// Compare the parsed IDs (not the raw strings) so differently formatted
	// spellings of the same ID are caught too.
	// NOTE(review): the service may already reject this; it is not visible
	// from here, so the handler refuses the request up front.
	if ids[0] == ids[1] {
		respondError(c, domain.ErrValidation)
		return
	}
	// A pointer distinguishes "omitted" (default true) from an explicit false.
	del := true
	if body.DeleteDiscarded != nil {
		del = *body.DeleteDiscarded
	}
	f, err := h.dupSvc.Resolve(c.Request.Context(), service.MergeSpec{
		Keep:            ids[0],
		Discard:         ids[1],
		Fields:          body.Fields,
		DeleteDiscarded: del,
	})
	if err != nil {
		respondError(c, err)
		return
	}
	respondJSON(c, http.StatusOK, toFileJSON(*f))
}
+6 -1
View File
@@ -26,6 +26,7 @@ func NewRouter(
auth *AuthMiddleware,
authHandler *AuthHandler,
fileHandler *FileHandler,
duplicateHandler *DuplicateHandler,
tagHandler *TagHandler,
categoryHandler *CategoryHandler,
poolHandler *PoolHandler,
@@ -80,7 +81,11 @@ func NewRouter(
files.GET("", fileHandler.List)
files.POST("", fileHandler.Upload)
// Bulk + import routes registered before /:id to prevent param collision.
// Bulk + import + duplicates routes registered before /:id to prevent
// param collision (e.g. "duplicates" being captured as :id).
files.GET("/duplicates", duplicateHandler.List)
files.POST("/duplicates/dismiss", duplicateHandler.Dismiss)
files.POST("/duplicates/resolve", duplicateHandler.Resolve)
files.POST("/bulk/tags", fileHandler.BulkSetTags)
files.POST("/bulk/delete", fileHandler.BulkDelete)
files.POST("/bulk/review", fileHandler.BulkReview)
+1 -1
View File
@@ -10,7 +10,7 @@ import "testing"
func TestNewRouterRegisters(t *testing.T) {
r, err := NewRouter(
(*AuthMiddleware)(nil), (*AuthHandler)(nil),
(*FileHandler)(nil), (*TagHandler)(nil), (*CategoryHandler)(nil), (*PoolHandler)(nil),
(*FileHandler)(nil), (*DuplicateHandler)(nil), (*TagHandler)(nil), (*CategoryHandler)(nil), (*PoolHandler)(nil),
(*UserHandler)(nil), (*ACLHandler)(nil), (*AuditHandler)(nil),
"", nil,
)