feat(backend): duplicate pairs, dismissals, and merge resolution
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
    the perceptual hashes and replaces the pairs table. This is the only thing
    that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed,
    non-dismissed), groups them into connected components via union-find, and
    paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard,
    metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
    the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,122 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
|
||||
"tanabata/backend/internal/domain"
|
||||
"tanabata/backend/internal/service"
|
||||
)
|
||||
|
||||
// DuplicateHandler handles the /files/duplicates endpoints.
|
||||
type DuplicateHandler struct {
|
||||
dupSvc *service.DuplicateService
|
||||
}
|
||||
|
||||
// NewDuplicateHandler creates a DuplicateHandler.
|
||||
func NewDuplicateHandler(dupSvc *service.DuplicateService) *DuplicateHandler {
|
||||
return &DuplicateHandler{dupSvc: dupSvc}
|
||||
}
|
||||
|
||||
// List handles GET /files/duplicates — an offset-paginated list of duplicate
|
||||
// clusters, each a group of files within the perceptual-hash threshold.
|
||||
func (h *DuplicateHandler) List(c *gin.Context) {
|
||||
limit, offset := 20, 0
|
||||
if n, err := strconv.Atoi(c.Query("limit")); err == nil {
|
||||
limit = n
|
||||
}
|
||||
if n, err := strconv.Atoi(c.Query("offset")); err == nil {
|
||||
offset = n
|
||||
}
|
||||
if limit < 1 {
|
||||
limit = 1
|
||||
}
|
||||
if limit > 50 {
|
||||
limit = 50
|
||||
}
|
||||
if offset < 0 {
|
||||
offset = 0
|
||||
}
|
||||
|
||||
clusters, total, err := h.dupSvc.Clusters(c.Request.Context(), limit, offset)
|
||||
if err != nil {
|
||||
respondError(c, err)
|
||||
return
|
||||
}
|
||||
|
||||
items := make([]gin.H, len(clusters))
|
||||
for i, files := range clusters {
|
||||
fs := make([]fileJSON, len(files))
|
||||
for j, f := range files {
|
||||
fs[j] = toFileJSON(f)
|
||||
}
|
||||
items[i] = gin.H{"files": fs}
|
||||
}
|
||||
respondJSON(c, http.StatusOK, gin.H{
|
||||
"items": items,
|
||||
"total": total,
|
||||
"limit": limit,
|
||||
"offset": offset,
|
||||
})
|
||||
}
|
||||
|
||||
// Dismiss handles POST /files/duplicates/dismiss — mark a pair "not a duplicate".
|
||||
func (h *DuplicateHandler) Dismiss(c *gin.Context) {
|
||||
var body struct {
|
||||
FileIDA string `json:"file_id_a" binding:"required"`
|
||||
FileIDB string `json:"file_id_b" binding:"required"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil {
|
||||
respondError(c, domain.ErrValidation)
|
||||
return
|
||||
}
|
||||
ids, err := parseUUIDs([]string{body.FileIDA, body.FileIDB})
|
||||
if err != nil {
|
||||
respondError(c, domain.ErrValidation)
|
||||
return
|
||||
}
|
||||
if err := h.dupSvc.Dismiss(c.Request.Context(), ids[0], ids[1]); err != nil {
|
||||
respondError(c, err)
|
||||
return
|
||||
}
|
||||
c.Status(http.StatusNoContent)
|
||||
}
|
||||
|
||||
// Resolve handles POST /files/duplicates/resolve — merge a duplicate pair,
|
||||
// keeping one file and folding the chosen fields in from the other. Returns the
|
||||
// updated survivor. delete_discarded defaults to true.
|
||||
func (h *DuplicateHandler) Resolve(c *gin.Context) {
|
||||
var body struct {
|
||||
Keep string `json:"keep" binding:"required"`
|
||||
Discard string `json:"discard" binding:"required"`
|
||||
Fields service.MergeFields `json:"fields"`
|
||||
DeleteDiscarded *bool `json:"delete_discarded"`
|
||||
}
|
||||
if err := c.ShouldBindJSON(&body); err != nil {
|
||||
respondError(c, domain.ErrValidation)
|
||||
return
|
||||
}
|
||||
ids, err := parseUUIDs([]string{body.Keep, body.Discard})
|
||||
if err != nil {
|
||||
respondError(c, domain.ErrValidation)
|
||||
return
|
||||
}
|
||||
|
||||
del := true
|
||||
if body.DeleteDiscarded != nil {
|
||||
del = *body.DeleteDiscarded
|
||||
}
|
||||
f, err := h.dupSvc.Resolve(c.Request.Context(), service.MergeSpec{
|
||||
Keep: ids[0],
|
||||
Discard: ids[1],
|
||||
Fields: body.Fields,
|
||||
DeleteDiscarded: del,
|
||||
})
|
||||
if err != nil {
|
||||
respondError(c, err)
|
||||
return
|
||||
}
|
||||
respondJSON(c, http.StatusOK, toFileJSON(*f))
|
||||
}
|
||||
Reference in New Issue
Block a user