diff --git a/backend/cmd/server/main.go b/backend/cmd/server/main.go index 1db842a..0ca0195 100644 --- a/backend/cmd/server/main.go +++ b/backend/cmd/server/main.go @@ -70,6 +70,8 @@ func main() { tagRuleRepo := postgres.NewTagRuleRepo(pool) categoryRepo := postgres.NewCategoryRepo(pool) poolRepo := postgres.NewPoolRepo(pool) + duplicatePairRepo := postgres.NewDuplicatePairRepo(pool) + dismissalRepo := postgres.NewDismissalRepo(pool) transactor := postgres.NewTransactor(pool) // Services @@ -86,6 +88,9 @@ func main() { tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor) categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc) poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc) + duplicateSvc := service.NewDuplicateService( + fileRepo, duplicatePairRepo, dismissalRepo, aclSvc, auditSvc, transactor, cfg.DuplicateHashThreshold, + ) fileSvc := service.NewFileService( fileRepo, mimeRepo, @@ -108,6 +113,7 @@ func main() { authMiddleware := handler.NewAuthMiddleware(authSvc) authHandler := handler.NewAuthHandler(authSvc) fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, cfg.MaxUploadBytes) + duplicateHandler := handler.NewDuplicateHandler(duplicateSvc) tagHandler := handler.NewTagHandler(tagSvc, fileSvc) categoryHandler := handler.NewCategoryHandler(categorySvc) poolHandler := handler.NewPoolHandler(poolSvc) @@ -117,7 +123,7 @@ func main() { r, err := handler.NewRouter( authMiddleware, authHandler, - fileHandler, tagHandler, categoryHandler, poolHandler, + fileHandler, duplicateHandler, tagHandler, categoryHandler, poolHandler, userHandler, aclHandler, auditHandler, cfg.StaticDir, cfg.TrustedProxies, diff --git a/backend/internal/db/postgres/duplicate_repo.go b/backend/internal/db/postgres/duplicate_repo.go new file mode 100644 index 0000000..650292b --- /dev/null +++ b/backend/internal/db/postgres/duplicate_repo.go @@ -0,0 +1,145 @@ +package postgres + +import ( + "bytes" + "context" + "fmt" + + "github.com/google/uuid" + "github.com/jackc/pgx/v5" + "github.com/jackc/pgx/v5/pgxpool" + + "tanabata/backend/internal/domain" + "tanabata/backend/internal/port" +) + +// --------------------------------------------------------------------------- +// DuplicatePairRepo +// --------------------------------------------------------------------------- + +// DuplicatePairRepo implements port.DuplicatePairRepo using PostgreSQL. +type DuplicatePairRepo struct { + pool *pgxpool.Pool +} + +// NewDuplicatePairRepo creates a DuplicatePairRepo backed by pool. +func NewDuplicatePairRepo(pool *pgxpool.Pool) *DuplicatePairRepo { + return &DuplicatePairRepo{pool: pool} +} + +var _ port.DuplicatePairRepo = (*DuplicatePairRepo)(nil) + +// ReplaceAll atomically replaces the entire pairs table with the given set. +// The rescan recomputes pairs from scratch, so a full DELETE + COPY is both +// correct and the simplest way to drop pairs that no longer qualify. +func (r *DuplicatePairRepo) ReplaceAll(ctx context.Context, pairs []domain.DuplicatePair) error { + tx, err := r.pool.Begin(ctx) + if err != nil { + return fmt.Errorf("DuplicatePairRepo.ReplaceAll begin: %w", err) + } + defer tx.Rollback(ctx) //nolint:errcheck // no-op after a successful commit + + if _, err := tx.Exec(ctx, `DELETE FROM data.duplicate_pairs`); err != nil { + return fmt.Errorf("DuplicatePairRepo.ReplaceAll delete: %w", err) + } + + if len(pairs) > 0 { + rows := make([][]any, len(pairs)) + for i, p := range pairs { + rows[i] = []any{p.FileA, p.FileB, int16(p.Distance)} + } + if _, err := tx.CopyFrom(ctx, + pgx.Identifier{"data", "duplicate_pairs"}, + []string{"file_a", "file_b", "distance"}, + pgx.CopyFromRows(rows), + ); err != nil { + return fmt.Errorf("DuplicatePairRepo.ReplaceAll copy: %w", err) + } + } + + if err := tx.Commit(ctx); err != nil { + return fmt.Errorf("DuplicatePairRepo.ReplaceAll commit: %w", err) + } + return nil +} + +type pairRow struct { + FileA uuid.UUID `db:"file_a"` + FileB uuid.UUID `db:"file_b"` + Distance int16 `db:"distance"` +} + +// ListVisible returns every stored pair where both files are live (not trashed), +// the pair is not dismissed, and — for non-admins — both files are visible to the +// viewer under the private-by-default model. This is the input to clustering. +func (r *DuplicatePairRepo) ListVisible(ctx context.Context, viewerID int16, isAdmin bool) ([]domain.DuplicatePair, error) { + args := make([]any, 0, 4) + n := 1 + aclWhere := "" + if !isAdmin { + var ca, cb string + ca, n, args = aclVisibilityCond("fa", objTypeFile, viewerID, n, args) + cb, n, args = aclVisibilityCond("fb", objTypeFile, viewerID, n, args) + aclWhere = "AND " + ca + " AND " + cb + } + + sqlStr := fmt.Sprintf(` + SELECT p.file_a, p.file_b, p.distance + FROM data.duplicate_pairs p + JOIN data.files fa ON fa.id = p.file_a AND fa.is_deleted = false + JOIN data.files fb ON fb.id = p.file_b AND fb.is_deleted = false + WHERE NOT EXISTS ( + SELECT 1 FROM data.duplicate_dismissals d + WHERE d.file_a = p.file_a AND d.file_b = p.file_b + ) + %s + ORDER BY p.file_a, p.file_b`, aclWhere) + + rows, err := r.pool.Query(ctx, sqlStr, args...) + if err != nil { + return nil, fmt.Errorf("DuplicatePairRepo.ListVisible: %w", err) + } + collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[pairRow]) + if err != nil { + return nil, fmt.Errorf("DuplicatePairRepo.ListVisible scan: %w", err) + } + out := make([]domain.DuplicatePair, len(collected)) + for i, row := range collected { + out[i] = domain.DuplicatePair{FileA: row.FileA, FileB: row.FileB, Distance: int(row.Distance)} + } + return out, nil +} + +// --------------------------------------------------------------------------- +// DismissalRepo +// --------------------------------------------------------------------------- + +// DismissalRepo implements port.DismissalRepo using PostgreSQL. +type DismissalRepo struct { + pool *pgxpool.Pool +} + +// NewDismissalRepo creates a DismissalRepo backed by pool. +func NewDismissalRepo(pool *pgxpool.Pool) *DismissalRepo { + return &DismissalRepo{pool: pool} +} + +var _ port.DismissalRepo = (*DismissalRepo)(nil) + +// Add records a pair as "not a duplicate". The two ids are stored in canonical +// (file_a < file_b) order to match the table's CHECK and avoid (a,b)/(b,a) +// duplicates; a repeated dismissal is a no-op. +func (r *DismissalRepo) Add(ctx context.Context, a, b uuid.UUID, userID int16) error { + if bytes.Compare(a[:], b[:]) > 0 { + a, b = b, a + } + const sqlStr = ` + INSERT INTO data.duplicate_dismissals (file_a, file_b, dismissed_by) + VALUES ($1, $2, $3) + ON CONFLICT (file_a, file_b) DO NOTHING` + q := connOrTx(ctx, r.pool) + if _, err := q.Exec(ctx, sqlStr, a, b, userID); err != nil { + return fmt.Errorf("DismissalRepo.Add: %w", err) + } + return nil +} diff --git a/backend/internal/db/postgres/file_repo.go b/backend/internal/db/postgres/file_repo.go index 22d9990..acff0d7 100644 --- a/backend/internal/db/postgres/file_repo.go +++ b/backend/internal/db/postgres/file_repo.go @@ -446,6 +446,89 @@ func (r *FileRepo) SetPHash(ctx context.Context, id uuid.UUID, phash *int64) err return nil } +// --------------------------------------------------------------------------- +// Perceptual-hash / duplicate support +// --------------------------------------------------------------------------- + +// ListMissingPHash returns live image/video files that have no perceptual hash +// yet — the work list for the dedup backfill. Tags are not loaded (the backfill +// only needs the id and MIME type to choose image vs video hashing). +func (r *FileRepo) ListMissingPHash(ctx context.Context) ([]domain.File, error) { + const sqlStr = ` + SELECT f.id, f.original_name, + mt.name AS mime_type, mt.extension AS mime_extension, + f.content_datetime, f.notes, f.metadata, f.exif, f.phash, + f.creator_id, u.name AS creator_name, + f.is_public, f.is_deleted, f.needs_review + FROM data.files f + JOIN core.mime_types mt ON mt.id = f.mime_id + JOIN core.users u ON u.id = f.creator_id + WHERE f.phash IS NULL AND f.is_deleted = false + AND (mt.name LIKE 'image/%' OR mt.name LIKE 'video/%') + ORDER BY f.id` + + q := connOrTx(ctx, r.pool) + rows, err := q.Query(ctx, sqlStr) + if err != nil { + return nil, fmt.Errorf("FileRepo.ListMissingPHash: %w", err) + } + collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[fileRow]) + if err != nil { + return nil, fmt.Errorf("FileRepo.ListMissingPHash scan: %w", err) + } + files := make([]domain.File, len(collected)) + for i, row := range collected { + files[i] = toFile(row) + } + return files, nil +} + +// phashRow is the minimal projection used to build duplicate clusters. +type phashRow struct { + ID uuid.UUID `db:"id"` + PHash int64 `db:"phash"` +} + +// ListAllPHashes returns the id and perceptual hash of every live, hashed file. +// It is the global input to the dedup rescan, so it deliberately ignores ACL — +// the rescan builds the shared pairs table; visibility is enforced on read. +func (r *FileRepo) ListAllPHashes(ctx context.Context) ([]domain.PHashEntry, error) { + const sqlStr = `SELECT id, phash FROM data.files WHERE is_deleted = false AND phash IS NOT NULL` + q := connOrTx(ctx, r.pool) + rows, err := q.Query(ctx, sqlStr) + if err != nil { + return nil, fmt.Errorf("FileRepo.ListAllPHashes: %w", err) + } + collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[phashRow]) + if err != nil { + return nil, fmt.Errorf("FileRepo.ListAllPHashes scan: %w", err) + } + out := make([]domain.PHashEntry, len(collected)) + for i, row := range collected { + out[i] = domain.PHashEntry{ID: row.ID, PHash: row.PHash} + } + return out, nil +} + +// CopyPoolMemberships adds targetID to every pool sourceID belongs to (copying +// the source's position), skipping pools the target is already in. Used by the +// duplicate merge to preserve the discarded file's pool memberships on the +// survivor. The merge is authorised at the file level, so pool ACL is not +// re-checked here. +func (r *FileRepo) CopyPoolMemberships(ctx context.Context, targetID, sourceID uuid.UUID) error { + const sqlStr = ` + INSERT INTO data.file_pool (file_id, pool_id, position) + SELECT $1, fp.pool_id, fp.position + FROM data.file_pool fp + WHERE fp.file_id = $2 + ON CONFLICT (file_id, pool_id) DO NOTHING` + q := connOrTx(ctx, r.pool) + if _, err := q.Exec(ctx, sqlStr, targetID, sourceID); err != nil { + return fmt.Errorf("FileRepo.CopyPoolMemberships: %w", err) + } + return nil +} + // --------------------------------------------------------------------------- // SoftDelete / Restore / DeletePermanent // --------------------------------------------------------------------------- diff --git a/backend/internal/domain/duplicate.go b/backend/internal/domain/duplicate.go new file mode 100644 index 0000000..8853096 --- /dev/null +++ b/backend/internal/domain/duplicate.go @@ -0,0 +1,18 @@ +package domain + +import "github.com/google/uuid" + +// PHashEntry is a file's perceptual hash, the input to duplicate clustering. +type PHashEntry struct { + ID uuid.UUID + PHash int64 +} + +// DuplicatePair is an unordered pair of files whose perceptual hashes are within +// the configured Hamming threshold. FileA < FileB by UUID byte order (canonical), +// so a pair is represented exactly once. +type DuplicatePair struct { + FileA uuid.UUID + FileB uuid.UUID + Distance int +} diff --git a/backend/internal/handler/duplicate_handler.go b/backend/internal/handler/duplicate_handler.go new file mode 100644 index 0000000..7018e59 --- /dev/null +++ b/backend/internal/handler/duplicate_handler.go @@ -0,0 +1,122 @@ +package handler + +import ( + "net/http" + "strconv" + + "github.com/gin-gonic/gin" + + "tanabata/backend/internal/domain" + "tanabata/backend/internal/service" +) + +// DuplicateHandler handles the /files/duplicates endpoints. +type DuplicateHandler struct { + dupSvc *service.DuplicateService +} + +// NewDuplicateHandler creates a DuplicateHandler. +func NewDuplicateHandler(dupSvc *service.DuplicateService) *DuplicateHandler { + return &DuplicateHandler{dupSvc: dupSvc} +} + +// List handles GET /files/duplicates — an offset-paginated list of duplicate +// clusters, each a group of files within the perceptual-hash threshold. +func (h *DuplicateHandler) List(c *gin.Context) { + limit, offset := 20, 0 + if n, err := strconv.Atoi(c.Query("limit")); err == nil { + limit = n + } + if n, err := strconv.Atoi(c.Query("offset")); err == nil { + offset = n + } + if limit < 1 { + limit = 1 + } + if limit > 50 { + limit = 50 + } + if offset < 0 { + offset = 0 + } + + clusters, total, err := h.dupSvc.Clusters(c.Request.Context(), limit, offset) + if err != nil { + respondError(c, err) + return + } + + items := make([]gin.H, len(clusters)) + for i, files := range clusters { + fs := make([]fileJSON, len(files)) + for j, f := range files { + fs[j] = toFileJSON(f) + } + items[i] = gin.H{"files": fs} + } + respondJSON(c, http.StatusOK, gin.H{ + "items": items, + "total": total, + "limit": limit, + "offset": offset, + }) +} + +// Dismiss handles POST /files/duplicates/dismiss — mark a pair "not a duplicate". +func (h *DuplicateHandler) Dismiss(c *gin.Context) { + var body struct { + FileIDA string `json:"file_id_a" binding:"required"` + FileIDB string `json:"file_id_b" binding:"required"` + } + if err := c.ShouldBindJSON(&body); err != nil { + respondError(c, domain.ErrValidation) + return + } + ids, err := parseUUIDs([]string{body.FileIDA, body.FileIDB}) + if err != nil { + respondError(c, domain.ErrValidation) + return + } + if err := h.dupSvc.Dismiss(c.Request.Context(), ids[0], ids[1]); err != nil { + respondError(c, err) + return + } + c.Status(http.StatusNoContent) +} + +// Resolve handles POST /files/duplicates/resolve — merge a duplicate pair, +// keeping one file and folding the chosen fields in from the other. Returns the +// updated survivor. delete_discarded defaults to true. +func (h *DuplicateHandler) Resolve(c *gin.Context) { + var body struct { + Keep string `json:"keep" binding:"required"` + Discard string `json:"discard" binding:"required"` + Fields service.MergeFields `json:"fields"` + DeleteDiscarded *bool `json:"delete_discarded"` + } + if err := c.ShouldBindJSON(&body); err != nil { + respondError(c, domain.ErrValidation) + return + } + ids, err := parseUUIDs([]string{body.Keep, body.Discard}) + if err != nil { + respondError(c, domain.ErrValidation) + return + } + + del := true + if body.DeleteDiscarded != nil { + del = *body.DeleteDiscarded + } + f, err := h.dupSvc.Resolve(c.Request.Context(), service.MergeSpec{ + Keep: ids[0], + Discard: ids[1], + Fields: body.Fields, + DeleteDiscarded: del, + }) + if err != nil { + respondError(c, err) + return + } + respondJSON(c, http.StatusOK, toFileJSON(*f)) +} diff --git a/backend/internal/handler/router.go b/backend/internal/handler/router.go index 85b5a5b..041cae7 100644 --- a/backend/internal/handler/router.go +++ b/backend/internal/handler/router.go @@ -26,6 +26,7 @@ func NewRouter( auth *AuthMiddleware, authHandler *AuthHandler, fileHandler *FileHandler, + duplicateHandler *DuplicateHandler, tagHandler *TagHandler, categoryHandler *CategoryHandler, poolHandler *PoolHandler, @@ -80,7 +81,11 @@ func NewRouter( files.GET("", fileHandler.List) files.POST("", fileHandler.Upload) - // Bulk + import routes registered before /:id to prevent param collision. + // Bulk + import + duplicates routes registered before /:id to prevent + // param collision (e.g. "duplicates" being captured as :id). + files.GET("/duplicates", duplicateHandler.List) + files.POST("/duplicates/dismiss", duplicateHandler.Dismiss) + files.POST("/duplicates/resolve", duplicateHandler.Resolve) files.POST("/bulk/tags", fileHandler.BulkSetTags) files.POST("/bulk/delete", fileHandler.BulkDelete) files.POST("/bulk/review", fileHandler.BulkReview) diff --git a/backend/internal/handler/router_test.go b/backend/internal/handler/router_test.go index aaba7e3..8a4e042 100644 --- a/backend/internal/handler/router_test.go +++ b/backend/internal/handler/router_test.go @@ -10,7 +10,7 @@ import "testing" func TestNewRouterRegisters(t *testing.T) { r, err := NewRouter( (*AuthMiddleware)(nil), (*AuthHandler)(nil), - (*FileHandler)(nil), (*TagHandler)(nil), (*CategoryHandler)(nil), (*PoolHandler)(nil), + (*FileHandler)(nil), (*DuplicateHandler)(nil), (*TagHandler)(nil), (*CategoryHandler)(nil), (*PoolHandler)(nil), (*UserHandler)(nil), (*ACLHandler)(nil), (*AuditHandler)(nil), "", nil, ) diff --git a/backend/internal/integration/server_test.go b/backend/internal/integration/server_test.go index 549b676..ae65a0c 100644 --- a/backend/internal/integration/server_test.go +++ b/backend/internal/integration/server_test.go @@ -56,6 +56,9 @@ type harness struct { client *http.Client importDir string pool *pgxpool.Pool + // dupSvc lets duplicate tests trigger a pairs rescan directly: rebuilding the + // pairs table is a CLI/maintenance action with no HTTP endpoint. + dupSvc *service.DuplicateService } // setupSuite creates an ephemeral database, runs migrations, wires the full @@ -125,6 +128,8 @@ func setupSuite(t *testing.T) *harness { tagRuleRepo := postgres.NewTagRuleRepo(pool) categoryRepo := postgres.NewCategoryRepo(pool) poolRepo := postgres.NewPoolRepo(pool) + duplicatePairRepo := postgres.NewDuplicatePairRepo(pool) + dismissalRepo := postgres.NewDismissalRepo(pool) transactor := postgres.NewTransactor(pool) // --- Services ------------------------------------------------------------ @@ -134,6 +139,7 @@ func setupSuite(t *testing.T) *harness { tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor) categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc) poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc) + duplicateSvc := service.NewDuplicateService(fileRepo, duplicatePairRepo, dismissalRepo, aclSvc, auditSvc, transactor, 10) fileSvc := service.NewFileService(fileRepo, mimeRepo, diskStorage, aclSvc, auditSvc, tagSvc, transactor, importDir) userSvc := service.NewUserService(userRepo, sessionRepo, auditSvc) @@ -145,6 +151,7 @@ func setupSuite(t *testing.T) *harness { authMiddleware := handler.NewAuthMiddleware(authSvc) authHandler := handler.NewAuthHandler(authSvc) fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, 500<<20) + duplicateHandler := handler.NewDuplicateHandler(duplicateSvc) tagHandler := handler.NewTagHandler(tagSvc, fileSvc) categoryHandler := handler.NewCategoryHandler(categorySvc) poolHandler := handler.NewPoolHandler(poolSvc) @@ -154,7 +161,7 @@ func setupSuite(t *testing.T) *harness { r, err := handler.NewRouter( authMiddleware, authHandler, - fileHandler, tagHandler, categoryHandler, poolHandler, + fileHandler, duplicateHandler, tagHandler, categoryHandler, poolHandler, userHandler, aclHandler, auditHandler, "", nil, @@ -170,6 +177,7 @@ func setupSuite(t *testing.T) *harness { client: srv.Client(), importDir: importDir, pool: pool, + dupSvc: duplicateSvc, } } @@ -1643,3 +1651,103 @@ var ( _ = freePort _ = writeFile ) + +// dupListResponse decodes GET /files/duplicates. +type dupListResponse struct { + Items []struct { + Files []struct { + ID string `json:"id"` + Tags []struct { + ID string `json:"id"` + } `json:"tags"` + } `json:"files"` + } `json:"items"` + Total int `json:"total"` +} + +// TestDuplicateDetection exercises the full duplicate lifecycle: perceptual hashes +// are computed on upload, a rescan builds the pairs table, the cluster surfaces, +// a field-by-field merge unions tags and trashes the discarded file, and a +// dismissal hides a pair permanently (surviving a re-rescan). +// +// minimalJPEG() is a 1×1 image, so every upload hashes identically — in a fresh +// database any two uploads form one duplicate pair, which keeps this deterministic. +func TestDuplicateDetection(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + h := setupSuite(t) + ctx := context.Background() + admin := h.login("admin", "admin") + + // --- two uploads => one duplicate pair after a rescan --------------------- + f1 := h.uploadJPEG(admin, "a.jpg")["id"].(string) + f2 := h.uploadJPEG(admin, "b.jpg")["id"].(string) + + // Tag f2 so the merge has something to union onto the survivor. + resp := h.doJSON("POST", "/tags", map[string]any{"name": "kept", "is_public": true}, admin) + require.Equal(t, http.StatusCreated, resp.StatusCode, resp.String()) + var tag map[string]any + resp.decode(t, &tag) + tagID := tag["id"].(string) + resp = h.doJSON("PUT", "/files/"+f2+"/tags", map[string]any{"tag_ids": []string{tagID}}, admin) + require.Equal(t, http.StatusOK, resp.StatusCode, resp.String()) + + require.NoError(t, h.dupSvc.Rescan(ctx, nil)) + + resp = h.doJSON("GET", "/files/duplicates", nil, admin) + require.Equal(t, http.StatusOK, resp.StatusCode, resp.String()) + var list dupListResponse + resp.decode(t, &list) + require.Equal(t, 1, list.Total, "expected one duplicate cluster: %s", resp) + require.Len(t, list.Items, 1) + require.Len(t, list.Items[0].Files, 2) + + // --- resolve: keep f1, union tags from f2, trash f2 ---------------------- + resp = h.doJSON("POST", "/files/duplicates/resolve", map[string]any{ + "keep": f1, + "discard": f2, + "fields": map[string]any{"tags": "both"}, + "delete_discarded": true, + }, admin) + require.Equal(t, http.StatusOK, resp.StatusCode, resp.String()) + var survivor struct { + ID string `json:"id"` + Tags []struct { + ID string `json:"id"` + } `json:"tags"` + } + resp.decode(t, &survivor) + assert.Equal(t, f1, survivor.ID) + require.Len(t, survivor.Tags, 1, "survivor should have inherited the discarded file's tag") + assert.Equal(t, tagID, survivor.Tags[0].ID) + + // f2 is now trashed, so the pair drops out of the duplicates view. + resp = h.doJSON("GET", "/files/duplicates", nil, admin) + resp.decode(t, &list) + assert.Equal(t, 0, list.Total, "resolved pair should no longer surface: %s", resp) + + // --- dismiss: a new pair, hidden and staying hidden across a rescan ------- + f3 := h.uploadJPEG(admin, "c.jpg")["id"].(string) + require.NoError(t, h.dupSvc.Rescan(ctx, nil)) + + resp = h.doJSON("GET", "/files/duplicates", nil, admin) + resp.decode(t, &list) + require.Equal(t, 1, list.Total, "f1 and f3 should now form a cluster: %s", resp) + + resp = h.doJSON("POST", "/files/duplicates/dismiss", map[string]any{ + "file_id_a": f1, "file_id_b": f3, + }, admin) + require.Equal(t, http.StatusNoContent, resp.StatusCode, resp.String()) + + resp = h.doJSON("GET", "/files/duplicates", nil, admin) + resp.decode(t, &list) + assert.Equal(t, 0, list.Total, "dismissed pair should be hidden") + + // A rescan re-finds the pair but the dismissal still hides it. + require.NoError(t, h.dupSvc.Rescan(ctx, nil)) + resp = h.doJSON("GET", "/files/duplicates", nil, admin) + resp.decode(t, &list) + assert.Equal(t, 0, list.Total, "dismissal must survive a rescan") +} diff --git a/backend/internal/port/repository.go b/backend/internal/port/repository.go index 48d7cc9..524b739 100644 --- a/backend/internal/port/repository.go +++ b/backend/internal/port/repository.go @@ -52,6 +52,15 @@ type FileRepo interface { SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bool) error // SetPHash sets (or clears, when nil) the perceptual hash of a file. SetPHash(ctx context.Context, id uuid.UUID, phash *int64) error + // ListMissingPHash returns live image/video files that have no perceptual + // hash yet (the dedup backfill work list). + ListMissingPHash(ctx context.Context) ([]domain.File, error) + // ListAllPHashes returns the id and perceptual hash of every live, hashed + // file (the global input to the dedup rescan; not ACL-filtered). + ListAllPHashes(ctx context.Context) ([]domain.PHashEntry, error) + // CopyPoolMemberships adds targetID to every pool sourceID belongs to, + // skipping pools target is already in (used by the duplicate merge). + CopyPoolMemberships(ctx context.Context, targetID, sourceID uuid.UUID) error // SoftDelete moves a file to trash (sets is_deleted = true). SoftDelete(ctx context.Context, id uuid.UUID) error // Restore moves a file out of trash (sets is_deleted = false). @@ -72,6 +81,21 @@ type FileRepo interface { RecordTagUses(ctx context.Context, userID int16, filterDSL string) error } +// DuplicatePairRepo persists the precomputed near-duplicate candidate pairs. +type DuplicatePairRepo interface { + // ReplaceAll atomically replaces the whole pairs table (used by the rescan). + ReplaceAll(ctx context.Context, pairs []domain.DuplicatePair) error + // ListVisible returns pairs whose both files are live, not dismissed, and + // (for non-admins) visible to the viewer. + ListVisible(ctx context.Context, viewerID int16, isAdmin bool) ([]domain.DuplicatePair, error) +} + +// DismissalRepo persists "not a duplicate" decisions. +type DismissalRepo interface { + // Add records a pair as dismissed (canonical order, idempotent). + Add(ctx context.Context, a, b uuid.UUID, userID int16) error +} + // TagRepo is the persistence interface for tags. type TagRepo interface { List(ctx context.Context, params OffsetParams) (*domain.TagOffsetPage, error) diff --git a/backend/internal/service/duplicate_index.go b/backend/internal/service/duplicate_index.go new file mode 100644 index 0000000..475e088 --- /dev/null +++ b/backend/internal/service/duplicate_index.go @@ -0,0 +1,148 @@ +package service + +import ( + "bytes" + "math/bits" + "sort" + + "github.com/google/uuid" + + "tanabata/backend/internal/domain" +) + +// hamming returns the number of differing bits between two perceptual hashes. +func hamming(a, b uint64) int { return bits.OnesCount64(a ^ b) } + +// bkNode is a node in a BK-tree over Hamming distance. Files that share the exact +// same hash are collected in ids (a distance-0 collision), so identical images +// don't degenerate the tree into a chain. +type bkNode struct { + hash uint64 + ids []uuid.UUID + children map[int]*bkNode +} + +// bkTree indexes perceptual hashes for sublinear radius queries. Building one and +// querying every element with a small radius is far cheaper than the O(N²) all- +// pairs comparison at 100k+ files. +type bkTree struct{ root *bkNode } + +func (t *bkTree) insert(hash uint64, id uuid.UUID) { + if t.root == nil { + t.root = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}} + return + } + node := t.root + for { + d := hamming(hash, node.hash) + if d == 0 { + node.ids = append(node.ids, id) + return + } + child, ok := node.children[d] + if !ok { + node.children[d] = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}} + return + } + node = child + } +} + +// query visits every node whose hash is within radius of target. The triangle +// inequality bounds which children can hold a match to [d-radius, d+radius]. +func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) { + if t.root == nil { + return + } + stack := []*bkNode{t.root} + for len(stack) > 0 { + node := stack[len(stack)-1] + stack = stack[:len(stack)-1] + + d := hamming(target, node.hash) + if d <= radius { + visit(node, d) + } + lo, hi := d-radius, d+radius + for cd, child := range node.children { + if cd >= lo && cd <= hi { + stack = append(stack, child) + } + } + } +} + +// buildPairs returns every unordered pair of files whose hashes are within +// threshold, each emitted exactly once with FileA < FileB (UUID byte order). +// onProgress, if set, is called periodically with (processed, total). +func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair { + tree := &bkTree{} + for _, e := range entries { + tree.insert(uint64(e.PHash), e.ID) + } + + var pairs []domain.DuplicatePair + total := len(entries) + for i := range entries { + e := entries[i] + tree.query(uint64(e.PHash), threshold, func(node *bkNode, dist int) { + for _, other := range node.ids { + // Emit each pair once, from the smaller id, which also skips self. + if bytes.Compare(e.ID[:], other[:]) < 0 { + pairs = append(pairs, domain.DuplicatePair{FileA: e.ID, FileB: other, Distance: dist}) + } + } + }) + if onProgress != nil && (i+1)%1000 == 0 { + onProgress(i+1, total) + } + } + if onProgress != nil { + onProgress(total, total) + } + return pairs +} + +// clusterPairs groups pairs into connected components (transitive closure) via +// union-find. Every returned cluster has at least two files; clusters and the ids +// within them are sorted by UUID for stable pagination. +func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID { + parent := map[uuid.UUID]uuid.UUID{} + var find func(uuid.UUID) uuid.UUID + find = func(x uuid.UUID) uuid.UUID { + p, ok := parent[x] + if !ok { + parent[x] = x + return x + } + if p != x { + parent[x] = find(p) + } + return parent[x] + } + union := func(a, b uuid.UUID) { + ra, rb := find(a), find(b) + if ra != rb { + parent[ra] = rb + } + } + for _, p := range pairs { + union(p.FileA, p.FileB) + } + + groups := map[uuid.UUID][]uuid.UUID{} + for node := range parent { + root := find(node) + groups[root] = append(groups[root], node) + } + + clusters := make([][]uuid.UUID, 0, len(groups)) + for _, ids := range groups { + sort.Slice(ids, func(i, j int) bool { return bytes.Compare(ids[i][:], ids[j][:]) < 0 }) + clusters = append(clusters, ids) + } + sort.Slice(clusters, func(i, j int) bool { + return bytes.Compare(clusters[i][0][:], clusters[j][0][:]) < 0 + }) + return clusters +} diff --git a/backend/internal/service/duplicate_service.go b/backend/internal/service/duplicate_service.go new file mode 100644 index 0000000..b076ef2 --- /dev/null +++ b/backend/internal/service/duplicate_service.go @@ -0,0 +1,361 @@ +package service + +import ( + "context" + "encoding/json" + "errors" + "time" + + "github.com/google/uuid" + + "tanabata/backend/internal/domain" + "tanabata/backend/internal/port" +) + +// Merge field source values. +const ( + mergeKeep = "keep" + mergeDiscard = "discard" + mergeBoth = "both" + mergeMerge = "merge" +) + +// MergeFields chooses, per field, which file supplies the survivor's value when +// resolving a duplicate. Scalars accept "keep"/"discard"; metadata also accepts +// "merge" (shallow object merge, survivor wins on key conflicts); relations +// (tags, pools) accept "keep"/"both" (union) — there is deliberately no option +// to drop the survivor's own tags/pools. An empty value defaults to "keep". +type MergeFields struct { + OriginalName string `json:"original_name"` + Notes string `json:"notes"` + ContentDatetime string `json:"content_datetime"` + IsPublic string `json:"is_public"` + Metadata string `json:"metadata"` + Tags string `json:"tags"` + Pools string `json:"pools"` +} + +// MergeSpec is the input to a duplicate resolution: keep one file, fold chosen +// fields in from the other, and (usually) trash the other. +type MergeSpec struct { + Keep uuid.UUID + Discard uuid.UUID + Fields MergeFields + DeleteDiscarded bool +} + +// normalize fills empty choices with "keep" and rejects unknown values. +func (m *MergeSpec) normalize() error { + scalar := func(v *string) error { + if *v == "" { + *v = mergeKeep + } + if *v != mergeKeep && *v != mergeDiscard { + return domain.ErrValidation + } + return nil + } + relation := func(v *string) error { + if *v == "" { + *v = mergeKeep + } + if *v != mergeKeep && *v != mergeBoth { + return domain.ErrValidation + } + return nil + } + f := &m.Fields + if err := scalar(&f.OriginalName); err != nil { + return err + } + if err := scalar(&f.Notes); err != nil { + return err + } + if err := scalar(&f.ContentDatetime); err != nil { + return err + } + if err := scalar(&f.IsPublic); err != nil { + return err + } + if f.Metadata == "" { + f.Metadata = mergeKeep + } + if f.Metadata != mergeKeep && f.Metadata != mergeDiscard && f.Metadata != mergeMerge { + return domain.ErrValidation + } + if err := relation(&f.Tags); err != nil { + return err + } + if err := relation(&f.Pools); err != nil { + return err + } + return nil +} + +// DuplicateService finds near-duplicate clusters and resolves them. +type DuplicateService struct { + files port.FileRepo + pairs port.DuplicatePairRepo + dismissals port.DismissalRepo + acl *ACLService + audit *AuditService + tx port.Transactor + threshold int +} + +// NewDuplicateService creates a DuplicateService. threshold is the maximum +// Hamming distance for two files to be treated as duplicate candidates. +func NewDuplicateService( + files port.FileRepo, + pairs port.DuplicatePairRepo, + dismissals port.DismissalRepo, + acl *ACLService, + audit *AuditService, + tx port.Transactor, + threshold int, +) *DuplicateService { + return &DuplicateService{ + files: files, + pairs: pairs, + dismissals: dismissals, + acl: acl, + audit: audit, + tx: tx, + threshold: threshold, + } +} + +// Clusters returns a page of duplicate clusters visible to the caller. Pairs are +// read from the precomputed table (no all-pairs scan here) and grouped into +// connected components; pagination is over whole clusters. +func (s *DuplicateService) Clusters(ctx context.Context, limit, offset int) (clusters [][]domain.File, total int, err error) { + userID, isAdmin, _ := domain.UserFromContext(ctx) + + pairs, err := s.pairs.ListVisible(ctx, userID, isAdmin) + if err != nil { + return nil, 0, err + } + groups := clusterPairs(pairs) + total = len(groups) + + if offset < 0 { + offset = 0 + } + if offset >= len(groups) { + return [][]domain.File{}, total, nil + } + end := offset + limit + if end > len(groups) || limit <= 0 { + end = len(groups) + } + + out := make([][]domain.File, 0, end-offset) + for _, ids := range groups[offset:end] { + files := make([]domain.File, 0, len(ids)) + for _, id := range ids { + f, err := s.files.GetByID(ctx, id) + if err != nil { + // A file deleted between the pair read and now just drops out. + if errors.Is(err, domain.ErrNotFound) { + continue + } + return nil, 0, err + } + files = append(files, *f) + } + if len(files) >= 2 { + out = append(out, files) + } + } + return out, total, nil +} + +// Rescan recomputes the entire duplicate_pairs table from the current set of +// perceptual hashes. It is the only thing that populates the table, so the +// duplicates view reflects state as of the last rescan. Called by the dedup CLI. +func (s *DuplicateService) Rescan(ctx context.Context, onProgress func(done, total int)) error { + entries, err := s.files.ListAllPHashes(ctx) + if err != nil { + return err + } + pairs := buildPairs(entries, s.threshold, onProgress) + return s.pairs.ReplaceAll(ctx, pairs) +} + +// Dismiss records two files as "not a duplicate" so the pair stops surfacing. +// The caller must be able to view both files. +func (s *DuplicateService) Dismiss(ctx context.Context, a, b uuid.UUID) error { + if a == b { + return domain.ErrValidation + } + userID, isAdmin, _ := domain.UserFromContext(ctx) + for _, id := range []uuid.UUID{a, b} { + f, err := s.files.GetByID(ctx, id) + if err != nil { + return err + } + ok, err := s.acl.CanView(ctx, userID, isAdmin, f.CreatorID, f.IsPublic, fileObjectTypeID, id) + if err != nil { + return err + } + if !ok { + return domain.ErrForbidden + } + } + if err := s.dismissals.Add(ctx, a, b, userID); err != nil { + return err + } + objType := fileObjectType + _ = s.audit.Log(ctx, "duplicate_dismiss", &objType, &a, map[string]any{"other": b.String()}) + return nil +} + +// Resolve merges a duplicate pair: the survivor (keep) takes the chosen fields +// from the other (discard), and the other is trashed when DeleteDiscarded is set. +// The caller must be able to edit both files. Returns the updated survivor. +func (s *DuplicateService) Resolve(ctx context.Context, spec MergeSpec) (*domain.File, error) { + if spec.Keep == spec.Discard { + return nil, domain.ErrValidation + } + if err := spec.normalize(); err != nil { + return nil, err + } + + keep, err := s.files.GetByID(ctx, spec.Keep) + if err != nil { + return nil, err + } + discard, err := s.files.GetByID(ctx, spec.Discard) + if err != nil { + return nil, err + } + + userID, isAdmin, _ := domain.UserFromContext(ctx) + for _, f := range []*domain.File{keep, discard} { + ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, f.ID) + if err != nil { + return nil, err + } + if !ok { + return nil, domain.ErrForbidden + } + } + + // FileRepo.Update rewrites all editable scalar columns, so build the complete + // resolved set (each field from keep or discard) rather than a sparse patch. + patch := &domain.File{ + OriginalName: pickPtr(spec.Fields.OriginalName, keep.OriginalName, discard.OriginalName), + Notes: pickPtr(spec.Fields.Notes, keep.Notes, discard.Notes), + ContentDatetime: pickTime(spec.Fields.ContentDatetime, keep.ContentDatetime, discard.ContentDatetime), + IsPublic: pickBool(spec.Fields.IsPublic, keep.IsPublic, discard.IsPublic), + Metadata: pickMetadata(spec.Fields.Metadata, keep.Metadata, discard.Metadata), + } + + var result *domain.File + txErr := s.tx.WithTx(ctx, func(ctx context.Context) error { + updated, err := s.files.Update(ctx, keep.ID, patch) + if err != nil { + return err + } + + if spec.Fields.Tags == mergeBoth { + if err := s.files.SetTags(ctx, keep.ID, unionTagIDs(keep.Tags, discard.Tags)); err != nil { + return err + } + tags, err := s.files.ListTags(ctx, keep.ID) + if err != nil { + return err + } + updated.Tags = tags + } + if spec.Fields.Pools == mergeBoth { + if err := s.files.CopyPoolMemberships(ctx, keep.ID, discard.ID); err != nil { + return err + } + } + if spec.DeleteDiscarded { + if err := s.files.SoftDelete(ctx, discard.ID); err != nil { + return err + } + } + result = updated + return nil + }) + if txErr != nil { + return nil, txErr + } + + objType := fileObjectType + _ = s.audit.Log(ctx, "file_merge", &objType, &keep.ID, map[string]any{ + "discard": spec.Discard.String(), + "fields": spec.Fields, + "deleted_discarded": spec.DeleteDiscarded, + }) + return result, nil +} + +// --- field pickers --------------------------------------------------------- + +func pickPtr(choice string, keep, discard *string) *string { + if choice == mergeDiscard { + return discard + } + return keep +} + +func pickBool(choice string, keep, discard bool) bool { + if choice == mergeDiscard { + return discard + } + return keep +} + +func pickTime(choice string, keep, discard time.Time) time.Time { + if choice == mergeDiscard { + return discard + } + return keep +} + +func unionTagIDs(a, b []domain.Tag) []uuid.UUID { + seen := make(map[uuid.UUID]bool, len(a)+len(b)) + ids := make([]uuid.UUID, 0, len(a)+len(b)) + for _, t := range append(append([]domain.Tag{}, a...), b...) { + if !seen[t.ID] { + seen[t.ID] = true + ids = append(ids, t.ID) + } + } + return ids +} + +// pickMetadata returns keep's metadata, discard's, or a shallow merge in which +// the survivor's keys win on conflict. +func pickMetadata(choice string, keep, discard json.RawMessage) json.RawMessage { + switch choice { + case mergeDiscard: + return discard + case mergeMerge: + km := map[string]json.RawMessage{} + dm := map[string]json.RawMessage{} + _ = json.Unmarshal(keep, &km) + _ = json.Unmarshal(discard, &dm) + out := make(map[string]json.RawMessage, len(km)+len(dm)) + for k, v := range dm { + out[k] = v + } + for k, v := range km { // survivor wins + out[k] = v + } + if len(out) == 0 { + return keep + } + b, err := json.Marshal(out) + if err != nil { + return keep + } + return b + default: + return keep + } +} diff --git a/backend/internal/service/duplicate_test.go b/backend/internal/service/duplicate_test.go new file mode 100644 index 0000000..af8b0e2 --- /dev/null +++ b/backend/internal/service/duplicate_test.go @@ -0,0 +1,152 @@ +package service + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "sort" + "testing" + + "github.com/google/uuid" + + "tanabata/backend/internal/domain" +) + +// id builds a deterministic UUID whose byte order matches n, so tests can reason +// about the canonical (FileA < FileB) ordering buildPairs produces. +func id(n int) uuid.UUID { + return uuid.MustParse(fmt.Sprintf("00000000-0000-0000-0000-%012d", n)) +} + +func entry(n int, hash uint64) domain.PHashEntry { + return domain.PHashEntry{ID: id(n), PHash: int64(hash)} +} + +// pairKey canonicalises a pair for set comparison regardless of emission order. +func pairKey(p domain.DuplicatePair) string { + a, b := p.FileA, p.FileB + if bytes.Compare(a[:], b[:]) > 0 { + a, b = b, a + } + return fmt.Sprintf("%s|%s|%d", a, b, p.Distance) +} + +func TestBuildPairs_ThresholdAndCanonicalOrder(t *testing.T) { + entries := []domain.PHashEntry{ + entry(1, 0x0000000000000000), + entry(2, 0x0000000000000001), // distance 1 from #1 + entry(3, 0x00000000000000FF), // distance 8 from #1, 7 from #2 + entry(4, 0xFFFFFFFFFFFFFFFF), // distance 64 from #1 + } + + // Tight threshold: only the distance-1 pair qualifies. + got := buildPairs(entries, 2, nil) + if len(got) != 1 { + t.Fatalf("threshold 2: got %d pairs, want 1: %+v", len(got), got) + } + if got[0].FileA != id(1) || got[0].FileB != id(2) || got[0].Distance != 1 { + t.Errorf("threshold 2: unexpected pair %+v", got[0]) + } + // Canonical order always FileA < FileB. + if bytes.Compare(got[0].FileA[:], got[0].FileB[:]) >= 0 { + t.Error("pair not in canonical FileA < FileB order") + } + + // Looser threshold pulls in #3's pairs but never #4. + got8 := buildPairs(entries, 8, nil) + want := map[string]bool{ + pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(2), Distance: 1}): true, + pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(3), Distance: 8}): true, + pairKey(domain.DuplicatePair{FileA: id(2), FileB: id(3), Distance: 7}): true, + } + if len(got8) != len(want) { + t.Fatalf("threshold 8: got %d pairs, want %d: %+v", len(got8), len(want), got8) + } + for _, p := range got8 { + if !want[pairKey(p)] { + t.Errorf("threshold 8: unexpected pair %+v", p) + } + } +} + +func TestBuildPairs_IdenticalHashesPairAtDistanceZero(t *testing.T) { + entries := []domain.PHashEntry{ + entry(1, 0xABCDABCDABCDABCD), + entry(2, 0xABCDABCDABCDABCD), + } + got := buildPairs(entries, 0, nil) + if len(got) != 1 || got[0].Distance != 0 || got[0].FileA != id(1) || got[0].FileB != id(2) { + t.Fatalf("identical hashes: got %+v, want one distance-0 pair (1,2)", got) + } +} + +func TestClusterPairs_ConnectedComponents(t *testing.T) { + pairs := []domain.DuplicatePair{ + {FileA: id(1), FileB: id(2)}, + {FileA: id(2), FileB: id(3)}, // transitively joins 1-2-3 + {FileA: id(5), FileB: id(6)}, + } + clusters := clusterPairs(pairs) + if len(clusters) != 2 { + t.Fatalf("got %d clusters, want 2: %+v", len(clusters), clusters) + } + // Sorted by smallest id: {1,2,3} then {5,6}. + if len(clusters[0]) != 3 || clusters[0][0] != id(1) || clusters[0][2] != id(3) { + t.Errorf("cluster 0 = %v, want [1 2 3]", clusters[0]) + } + if len(clusters[1]) != 2 || clusters[1][0] != id(5) { + t.Errorf("cluster 1 = %v, want [5 6]", clusters[1]) + } + // Each cluster's ids are sorted. + for _, c := range clusters { + if !sort.SliceIsSorted(c, func(i, j int) bool { return bytes.Compare(c[i][:], c[j][:]) < 0 }) { + t.Errorf("cluster not sorted: %v", c) + } + } +} + +func TestPickMetadata_Merge(t *testing.T) { + keep := json.RawMessage(`{"a":1,"b":2}`) + discard := json.RawMessage(`{"b":9,"c":3}`) + + out := pickMetadata(mergeMerge, keep, discard) + var m map[string]int + if err := json.Unmarshal(out, &m); err != nil { + t.Fatalf("merge result not valid JSON: %v (%s)", err, out) + } + want := map[string]int{"a": 1, "b": 2, "c": 3} // survivor wins on "b" + if fmt.Sprint(m) != fmt.Sprint(want) { + t.Errorf("merge = %v, want %v", m, want) + } + + if string(pickMetadata(mergeKeep, keep, discard)) != string(keep) { + t.Error("keep choice should return survivor metadata unchanged") + } + if string(pickMetadata(mergeDiscard, keep, discard)) != string(discard) { + t.Error("discard choice should return the other file's metadata") + } +} + +func TestMergeSpec_Normalize(t *testing.T) { + // Empty fields default to "keep". + spec := MergeSpec{Keep: id(1), Discard: id(2)} + if err := spec.normalize(); err != nil { + t.Fatalf("normalize empty: %v", err) + } + if spec.Fields.OriginalName != mergeKeep || spec.Fields.Tags != mergeKeep || spec.Fields.Metadata != mergeKeep { + t.Errorf("empty fields not defaulted to keep: %+v", spec.Fields) + } + + // "both" is invalid for a scalar field. + bad := MergeSpec{Keep: id(1), Discard: id(2), Fields: MergeFields{Notes: mergeBoth}} + if err := bad.normalize(); !errors.Is(err, domain.ErrValidation) { + t.Errorf("scalar=both: got %v, want ErrValidation", err) + } + + // "discard" is invalid for a relation field. + badRel := MergeSpec{Keep: id(1), Discard: id(2), Fields: MergeFields{Tags: mergeDiscard}} + if err := badRel.normalize(); !errors.Is(err, domain.ErrValidation) { + t.Errorf("relation=discard: got %v, want ErrValidation", err) + } +} diff --git a/backend/migrations/003_data_tables.sql b/backend/migrations/003_data_tables.sql index 0fc9f1b..288d509 100644 --- a/backend/migrations/003_data_tables.sql +++ b/backend/migrations/003_data_tables.sql @@ -92,6 +92,31 @@ CREATE TABLE data.file_pool ( PRIMARY KEY (file_id, pool_id) ); +-- Precomputed near-duplicate candidates (phash Hamming distance <= threshold), +-- (re)built in full by the dedup rescan. Stored once per unordered pair with a +-- canonical file_a < file_b ordering so a pair is never duplicated as (a,b)/(b,a). +CREATE TABLE data.duplicate_pairs ( + file_a uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, + file_b uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, + distance smallint NOT NULL, + + CONSTRAINT chk__duplicate_pairs__order CHECK (file_a < file_b), + PRIMARY KEY (file_a, file_b) +); + +-- "Not a duplicate" decisions: a global overlay that hides a candidate pair from +-- the duplicates view. Survives rescans (the pair may be re-found but stays +-- hidden). Same canonical file_a < file_b ordering as data.duplicate_pairs. +CREATE TABLE data.duplicate_dismissals ( + file_a uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, + file_b uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE, + dismissed_by smallint NOT NULL REFERENCES core.users(id) ON UPDATE CASCADE ON DELETE RESTRICT, + dismissed_at timestamptz NOT NULL DEFAULT clock_timestamp(), + + CONSTRAINT chk__duplicate_dismissals__order CHECK (file_a < file_b), + PRIMARY KEY (file_a, file_b) +); + COMMENT ON TABLE data.categories IS 'Logical grouping of tags'; COMMENT ON TABLE data.tags IS 'File labels/tags'; COMMENT ON TABLE data.tag_rules IS 'Auto-tagging rules: when when_tag is assigned, then_tag follows'; @@ -99,6 +124,8 @@ COMMENT ON TABLE data.files IS 'Managed files; actual content stored on di COMMENT ON TABLE data.file_tag IS 'Many-to-many: files <-> tags'; COMMENT ON TABLE data.pools IS 'Ordered collections of files'; COMMENT ON TABLE data.file_pool IS 'Many-to-many: files <-> pools, with ordering'; +COMMENT ON TABLE data.duplicate_pairs IS 'Precomputed near-duplicate candidate pairs (perceptual-hash distance)'; +COMMENT ON TABLE data.duplicate_dismissals IS 'Pairs marked "not a duplicate"; hidden from the duplicates view'; COMMENT ON COLUMN data.files.original_name IS 'Original filename at upload time'; COMMENT ON COLUMN data.files.content_datetime IS 'Content datetime (e.g. when photo was taken); falls back to EXIF DateTimeOriginal'; @@ -110,6 +137,8 @@ COMMENT ON COLUMN data.file_pool.position IS 'Manual ordering within pool; u -- +goose Down +DROP TABLE IF EXISTS data.duplicate_dismissals; +DROP TABLE IF EXISTS data.duplicate_pairs; DROP TABLE IF EXISTS data.file_pool; DROP TABLE IF EXISTS data.pools; DROP TABLE IF EXISTS data.file_tag; diff --git a/backend/migrations/006_indexes.sql b/backend/migrations/006_indexes.sql index 8f09758..638e98c 100644 --- a/backend/migrations/006_indexes.sql +++ b/backend/migrations/006_indexes.sql @@ -26,6 +26,12 @@ CREATE INDEX idx__files__needs_review ON data.files USING btree (id) WHERE CREATE INDEX idx__file_tag__tag_id ON data.file_tag USING hash (tag_id); CREATE INDEX idx__file_tag__file_id ON data.file_tag USING hash (file_id); +-- data.duplicate_pairs / data.duplicate_dismissals +-- The composite primary keys cover lookups on file_a; these add the file_b side +-- (used by the ON DELETE CASCADE and by the visibility join on the second file). +CREATE INDEX idx__duplicate_pairs__file_b ON data.duplicate_pairs USING hash (file_b); +CREATE INDEX idx__duplicate_dismissals__file_b ON data.duplicate_dismissals USING hash (file_b); + -- data.pools CREATE INDEX idx__pools__creator_id ON data.pools USING hash (creator_id); @@ -70,6 +76,8 @@ DROP INDEX IF EXISTS activity.idx__sessions__token_hash; DROP INDEX IF EXISTS activity.idx__sessions__user_id; DROP INDEX IF EXISTS acl.idx__acl__user; DROP INDEX IF EXISTS acl.idx__acl__object; +DROP INDEX IF EXISTS data.idx__duplicate_dismissals__file_b; +DROP INDEX IF EXISTS data.idx__duplicate_pairs__file_b; DROP INDEX IF EXISTS data.idx__file_pool__file_id; DROP INDEX IF EXISTS data.idx__file_pool__pool_id; DROP INDEX IF EXISTS data.idx__pools__creator_id; diff --git a/backend/migrations/007_seed_data.sql b/backend/migrations/007_seed_data.sql index 2de972d..2486029 100644 --- a/backend/migrations/007_seed_data.sql +++ b/backend/migrations/007_seed_data.sql @@ -21,6 +21,7 @@ INSERT INTO activity.action_types (name) VALUES -- Files ('file_create'), ('file_edit'), ('file_delete'), ('file_restore'), ('file_permanent_delete'), ('file_replace'), ('file_review'), + ('file_merge'), ('duplicate_dismiss'), -- Tags ('tag_create'), ('tag_edit'), ('tag_delete'), -- Categories