feat(backend): duplicate pairs, dismissals, and merge resolution
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
- Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
the perceptual hashes and replaces the pairs table. This is the only thing
that populates pairs, so GET never compares all-vs-all (scales to 110k+).
- Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
dismissed), groups them into connected components via union-find, and
paginates whole clusters.
- Resolve merges a pair field-by-field: each scalar from keep or discard,
metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
the discarded file. Enforces edit ACL on both.
- Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,8 @@ func main() {
|
|||||||
tagRuleRepo := postgres.NewTagRuleRepo(pool)
|
tagRuleRepo := postgres.NewTagRuleRepo(pool)
|
||||||
categoryRepo := postgres.NewCategoryRepo(pool)
|
categoryRepo := postgres.NewCategoryRepo(pool)
|
||||||
poolRepo := postgres.NewPoolRepo(pool)
|
poolRepo := postgres.NewPoolRepo(pool)
|
||||||
|
duplicatePairRepo := postgres.NewDuplicatePairRepo(pool)
|
||||||
|
dismissalRepo := postgres.NewDismissalRepo(pool)
|
||||||
transactor := postgres.NewTransactor(pool)
|
transactor := postgres.NewTransactor(pool)
|
||||||
|
|
||||||
// Services
|
// Services
|
||||||
@@ -86,6 +88,9 @@ func main() {
|
|||||||
tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor)
|
tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor)
|
||||||
categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc)
|
categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc)
|
||||||
poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc)
|
poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc)
|
||||||
|
duplicateSvc := service.NewDuplicateService(
|
||||||
|
fileRepo, duplicatePairRepo, dismissalRepo, aclSvc, auditSvc, transactor, cfg.DuplicateHashThreshold,
|
||||||
|
)
|
||||||
fileSvc := service.NewFileService(
|
fileSvc := service.NewFileService(
|
||||||
fileRepo,
|
fileRepo,
|
||||||
mimeRepo,
|
mimeRepo,
|
||||||
@@ -108,6 +113,7 @@ func main() {
|
|||||||
authMiddleware := handler.NewAuthMiddleware(authSvc)
|
authMiddleware := handler.NewAuthMiddleware(authSvc)
|
||||||
authHandler := handler.NewAuthHandler(authSvc)
|
authHandler := handler.NewAuthHandler(authSvc)
|
||||||
fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, cfg.MaxUploadBytes)
|
fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, cfg.MaxUploadBytes)
|
||||||
|
duplicateHandler := handler.NewDuplicateHandler(duplicateSvc)
|
||||||
tagHandler := handler.NewTagHandler(tagSvc, fileSvc)
|
tagHandler := handler.NewTagHandler(tagSvc, fileSvc)
|
||||||
categoryHandler := handler.NewCategoryHandler(categorySvc)
|
categoryHandler := handler.NewCategoryHandler(categorySvc)
|
||||||
poolHandler := handler.NewPoolHandler(poolSvc)
|
poolHandler := handler.NewPoolHandler(poolSvc)
|
||||||
@@ -117,7 +123,7 @@ func main() {
|
|||||||
|
|
||||||
r, err := handler.NewRouter(
|
r, err := handler.NewRouter(
|
||||||
authMiddleware, authHandler,
|
authMiddleware, authHandler,
|
||||||
fileHandler, tagHandler, categoryHandler, poolHandler,
|
fileHandler, duplicateHandler, tagHandler, categoryHandler, poolHandler,
|
||||||
userHandler, aclHandler, auditHandler,
|
userHandler, aclHandler, auditHandler,
|
||||||
cfg.StaticDir,
|
cfg.StaticDir,
|
||||||
cfg.TrustedProxies,
|
cfg.TrustedProxies,
|
||||||
|
|||||||
@@ -0,0 +1,145 @@
|
|||||||
|
package postgres
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
"github.com/jackc/pgx/v5"
|
||||||
|
"github.com/jackc/pgx/v5/pgxpool"
|
||||||
|
|
||||||
|
"tanabata/backend/internal/domain"
|
||||||
|
"tanabata/backend/internal/port"
|
||||||
|
)
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// DuplicatePairRepo
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// DuplicatePairRepo implements port.DuplicatePairRepo using PostgreSQL.
|
||||||
|
type DuplicatePairRepo struct {
|
||||||
|
pool *pgxpool.Pool
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDuplicatePairRepo creates a DuplicatePairRepo backed by pool.
|
||||||
|
func NewDuplicatePairRepo(pool *pgxpool.Pool) *DuplicatePairRepo {
|
||||||
|
return &DuplicatePairRepo{pool: pool}
|
||||||
|
}
|
||||||
|
|
||||||
|
var _ port.DuplicatePairRepo = (*DuplicatePairRepo)(nil)
|
||||||
|
|
||||||
|
// ReplaceAll atomically replaces the entire pairs table with the given set.
|
||||||
|
// The rescan recomputes pairs from scratch, so a full DELETE + COPY is both
|
||||||
|
// correct and the simplest way to drop pairs that no longer qualify.
|
||||||
|
func (r *DuplicatePairRepo) ReplaceAll(ctx context.Context, pairs []domain.DuplicatePair) error {
|
||||||
|
tx, err := r.pool.Begin(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("DuplicatePairRepo.ReplaceAll begin: %w", err)
|
||||||
|
}
|
||||||
|
defer tx.Rollback(ctx) //nolint:errcheck // no-op after a successful commit
|
||||||
|
|
||||||
|
if _, err := tx.Exec(ctx, `DELETE FROM data.duplicate_pairs`); err != nil {
|
||||||
|
return fmt.Errorf("DuplicatePairRepo.ReplaceAll delete: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(pairs) > 0 {
|
||||||
|
rows := make([][]any, len(pairs))
|
||||||
|
for i, p := range pairs {
|
||||||
|
rows[i] = []any{p.FileA, p.FileB, int16(p.Distance)}
|
||||||
|
}
|
||||||
|
if _, err := tx.CopyFrom(ctx,
|
||||||
|
pgx.Identifier{"data", "duplicate_pairs"},
|
||||||
|
[]string{"file_a", "file_b", "distance"},
|
||||||
|
pgx.CopyFromRows(rows),
|
||||||
|
); err != nil {
|
||||||
|
return fmt.Errorf("DuplicatePairRepo.ReplaceAll copy: %w", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := tx.Commit(ctx); err != nil {
|
||||||
|
return fmt.Errorf("DuplicatePairRepo.ReplaceAll commit: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type pairRow struct {
|
||||||
|
FileA uuid.UUID `db:"file_a"`
|
||||||
|
FileB uuid.UUID `db:"file_b"`
|
||||||
|
Distance int16 `db:"distance"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListVisible returns every stored pair where both files are live (not trashed),
|
||||||
|
// the pair is not dismissed, and — for non-admins — both files are visible to the
|
||||||
|
// viewer under the private-by-default model. This is the input to clustering.
|
||||||
|
func (r *DuplicatePairRepo) ListVisible(ctx context.Context, viewerID int16, isAdmin bool) ([]domain.DuplicatePair, error) {
|
||||||
|
args := make([]any, 0, 4)
|
||||||
|
n := 1
|
||||||
|
aclWhere := ""
|
||||||
|
if !isAdmin {
|
||||||
|
var ca, cb string
|
||||||
|
ca, n, args = aclVisibilityCond("fa", objTypeFile, viewerID, n, args)
|
||||||
|
cb, n, args = aclVisibilityCond("fb", objTypeFile, viewerID, n, args)
|
||||||
|
aclWhere = "AND " + ca + " AND " + cb
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlStr := fmt.Sprintf(`
|
||||||
|
SELECT p.file_a, p.file_b, p.distance
|
||||||
|
FROM data.duplicate_pairs p
|
||||||
|
JOIN data.files fa ON fa.id = p.file_a AND fa.is_deleted = false
|
||||||
|
JOIN data.files fb ON fb.id = p.file_b AND fb.is_deleted = false
|
||||||
|
WHERE NOT EXISTS (
|
||||||
|
SELECT 1 FROM data.duplicate_dismissals d
|
||||||
|
WHERE d.file_a = p.file_a AND d.file_b = p.file_b
|
||||||
|
)
|
||||||
|
%s
|
||||||
|
ORDER BY p.file_a, p.file_b`, aclWhere)
|
||||||
|
|
||||||
|
rows, err := r.pool.Query(ctx, sqlStr, args...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("DuplicatePairRepo.ListVisible: %w", err)
|
||||||
|
}
|
||||||
|
collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[pairRow])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("DuplicatePairRepo.ListVisible scan: %w", err)
|
||||||
|
}
|
||||||
|
out := make([]domain.DuplicatePair, len(collected))
|
||||||
|
for i, row := range collected {
|
||||||
|
out[i] = domain.DuplicatePair{FileA: row.FileA, FileB: row.FileB, Distance: int(row.Distance)}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// DismissalRepo
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// DismissalRepo implements port.DismissalRepo using PostgreSQL.
|
||||||
|
type DismissalRepo struct {
|
||||||
|
pool *pgxpool.Pool
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDismissalRepo creates a DismissalRepo backed by pool.
|
||||||
|
func NewDismissalRepo(pool *pgxpool.Pool) *DismissalRepo {
|
||||||
|
return &DismissalRepo{pool: pool}
|
||||||
|
}
|
||||||
|
|
||||||
|
var _ port.DismissalRepo = (*DismissalRepo)(nil)
|
||||||
|
|
||||||
|
// Add records a pair as "not a duplicate". The two ids are stored in canonical
|
||||||
|
// (file_a < file_b) order to match the table's CHECK and avoid (a,b)/(b,a)
|
||||||
|
// duplicates; a repeated dismissal is a no-op.
|
||||||
|
func (r *DismissalRepo) Add(ctx context.Context, a, b uuid.UUID, userID int16) error {
|
||||||
|
if bytes.Compare(a[:], b[:]) > 0 {
|
||||||
|
a, b = b, a
|
||||||
|
}
|
||||||
|
const sqlStr = `
|
||||||
|
INSERT INTO data.duplicate_dismissals (file_a, file_b, dismissed_by)
|
||||||
|
VALUES ($1, $2, $3)
|
||||||
|
ON CONFLICT (file_a, file_b) DO NOTHING`
|
||||||
|
q := connOrTx(ctx, r.pool)
|
||||||
|
if _, err := q.Exec(ctx, sqlStr, a, b, userID); err != nil {
|
||||||
|
return fmt.Errorf("DismissalRepo.Add: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
@@ -446,6 +446,89 @@ func (r *FileRepo) SetPHash(ctx context.Context, id uuid.UUID, phash *int64) err
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// Perceptual-hash / duplicate support
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// ListMissingPHash returns live image/video files that have no perceptual hash
|
||||||
|
// yet — the work list for the dedup backfill. Tags are not loaded (the backfill
|
||||||
|
// only needs the id and MIME type to choose image vs video hashing).
|
||||||
|
func (r *FileRepo) ListMissingPHash(ctx context.Context) ([]domain.File, error) {
|
||||||
|
const sqlStr = `
|
||||||
|
SELECT f.id, f.original_name,
|
||||||
|
mt.name AS mime_type, mt.extension AS mime_extension,
|
||||||
|
f.content_datetime, f.notes, f.metadata, f.exif, f.phash,
|
||||||
|
f.creator_id, u.name AS creator_name,
|
||||||
|
f.is_public, f.is_deleted, f.needs_review
|
||||||
|
FROM data.files f
|
||||||
|
JOIN core.mime_types mt ON mt.id = f.mime_id
|
||||||
|
JOIN core.users u ON u.id = f.creator_id
|
||||||
|
WHERE f.phash IS NULL AND f.is_deleted = false
|
||||||
|
AND (mt.name LIKE 'image/%' OR mt.name LIKE 'video/%')
|
||||||
|
ORDER BY f.id`
|
||||||
|
|
||||||
|
q := connOrTx(ctx, r.pool)
|
||||||
|
rows, err := q.Query(ctx, sqlStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("FileRepo.ListMissingPHash: %w", err)
|
||||||
|
}
|
||||||
|
collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[fileRow])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("FileRepo.ListMissingPHash scan: %w", err)
|
||||||
|
}
|
||||||
|
files := make([]domain.File, len(collected))
|
||||||
|
for i, row := range collected {
|
||||||
|
files[i] = toFile(row)
|
||||||
|
}
|
||||||
|
return files, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// phashRow is the minimal projection used to build duplicate clusters.
|
||||||
|
type phashRow struct {
|
||||||
|
ID uuid.UUID `db:"id"`
|
||||||
|
PHash int64 `db:"phash"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// ListAllPHashes returns the id and perceptual hash of every live, hashed file.
|
||||||
|
// It is the global input to the dedup rescan, so it deliberately ignores ACL —
|
||||||
|
// the rescan builds the shared pairs table; visibility is enforced on read.
|
||||||
|
func (r *FileRepo) ListAllPHashes(ctx context.Context) ([]domain.PHashEntry, error) {
|
||||||
|
const sqlStr = `SELECT id, phash FROM data.files WHERE is_deleted = false AND phash IS NOT NULL`
|
||||||
|
q := connOrTx(ctx, r.pool)
|
||||||
|
rows, err := q.Query(ctx, sqlStr)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("FileRepo.ListAllPHashes: %w", err)
|
||||||
|
}
|
||||||
|
collected, err := pgx.CollectRows(rows, pgx.RowToStructByName[phashRow])
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("FileRepo.ListAllPHashes scan: %w", err)
|
||||||
|
}
|
||||||
|
out := make([]domain.PHashEntry, len(collected))
|
||||||
|
for i, row := range collected {
|
||||||
|
out[i] = domain.PHashEntry{ID: row.ID, PHash: row.PHash}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CopyPoolMemberships adds targetID to every pool sourceID belongs to (copying
|
||||||
|
// the source's position), skipping pools the target is already in. Used by the
|
||||||
|
// duplicate merge to preserve the discarded file's pool memberships on the
|
||||||
|
// survivor. The merge is authorised at the file level, so pool ACL is not
|
||||||
|
// re-checked here.
|
||||||
|
func (r *FileRepo) CopyPoolMemberships(ctx context.Context, targetID, sourceID uuid.UUID) error {
|
||||||
|
const sqlStr = `
|
||||||
|
INSERT INTO data.file_pool (file_id, pool_id, position)
|
||||||
|
SELECT $1, fp.pool_id, fp.position
|
||||||
|
FROM data.file_pool fp
|
||||||
|
WHERE fp.file_id = $2
|
||||||
|
ON CONFLICT (file_id, pool_id) DO NOTHING`
|
||||||
|
q := connOrTx(ctx, r.pool)
|
||||||
|
if _, err := q.Exec(ctx, sqlStr, targetID, sourceID); err != nil {
|
||||||
|
return fmt.Errorf("FileRepo.CopyPoolMemberships: %w", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
// SoftDelete / Restore / DeletePermanent
|
// SoftDelete / Restore / DeletePermanent
|
||||||
// ---------------------------------------------------------------------------
|
// ---------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -0,0 +1,18 @@
|
|||||||
|
package domain
|
||||||
|
|
||||||
|
import "github.com/google/uuid"
|
||||||
|
|
||||||
|
// PHashEntry is a file's perceptual hash, the input to duplicate clustering.
|
||||||
|
type PHashEntry struct {
|
||||||
|
ID uuid.UUID
|
||||||
|
PHash int64
|
||||||
|
}
|
||||||
|
|
||||||
|
// DuplicatePair is an unordered pair of files whose perceptual hashes are within
|
||||||
|
// the configured Hamming threshold. FileA < FileB by UUID byte order (canonical),
|
||||||
|
// so a pair is represented exactly once.
|
||||||
|
type DuplicatePair struct {
|
||||||
|
FileA uuid.UUID
|
||||||
|
FileB uuid.UUID
|
||||||
|
Distance int
|
||||||
|
}
|
||||||
@@ -0,0 +1,122 @@
|
|||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"net/http"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/gin-gonic/gin"
|
||||||
|
|
||||||
|
"tanabata/backend/internal/domain"
|
||||||
|
"tanabata/backend/internal/service"
|
||||||
|
)
|
||||||
|
|
||||||
|
// DuplicateHandler handles the /files/duplicates endpoints.
|
||||||
|
type DuplicateHandler struct {
|
||||||
|
dupSvc *service.DuplicateService
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDuplicateHandler creates a DuplicateHandler.
|
||||||
|
func NewDuplicateHandler(dupSvc *service.DuplicateService) *DuplicateHandler {
|
||||||
|
return &DuplicateHandler{dupSvc: dupSvc}
|
||||||
|
}
|
||||||
|
|
||||||
|
// List handles GET /files/duplicates — an offset-paginated list of duplicate
|
||||||
|
// clusters, each a group of files within the perceptual-hash threshold.
|
||||||
|
func (h *DuplicateHandler) List(c *gin.Context) {
|
||||||
|
limit, offset := 20, 0
|
||||||
|
if n, err := strconv.Atoi(c.Query("limit")); err == nil {
|
||||||
|
limit = n
|
||||||
|
}
|
||||||
|
if n, err := strconv.Atoi(c.Query("offset")); err == nil {
|
||||||
|
offset = n
|
||||||
|
}
|
||||||
|
if limit < 1 {
|
||||||
|
limit = 1
|
||||||
|
}
|
||||||
|
if limit > 50 {
|
||||||
|
limit = 50
|
||||||
|
}
|
||||||
|
if offset < 0 {
|
||||||
|
offset = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
clusters, total, err := h.dupSvc.Clusters(c.Request.Context(), limit, offset)
|
||||||
|
if err != nil {
|
||||||
|
respondError(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
items := make([]gin.H, len(clusters))
|
||||||
|
for i, files := range clusters {
|
||||||
|
fs := make([]fileJSON, len(files))
|
||||||
|
for j, f := range files {
|
||||||
|
fs[j] = toFileJSON(f)
|
||||||
|
}
|
||||||
|
items[i] = gin.H{"files": fs}
|
||||||
|
}
|
||||||
|
respondJSON(c, http.StatusOK, gin.H{
|
||||||
|
"items": items,
|
||||||
|
"total": total,
|
||||||
|
"limit": limit,
|
||||||
|
"offset": offset,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dismiss handles POST /files/duplicates/dismiss — mark a pair "not a duplicate".
|
||||||
|
func (h *DuplicateHandler) Dismiss(c *gin.Context) {
|
||||||
|
var body struct {
|
||||||
|
FileIDA string `json:"file_id_a" binding:"required"`
|
||||||
|
FileIDB string `json:"file_id_b" binding:"required"`
|
||||||
|
}
|
||||||
|
if err := c.ShouldBindJSON(&body); err != nil {
|
||||||
|
respondError(c, domain.ErrValidation)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ids, err := parseUUIDs([]string{body.FileIDA, body.FileIDB})
|
||||||
|
if err != nil {
|
||||||
|
respondError(c, domain.ErrValidation)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := h.dupSvc.Dismiss(c.Request.Context(), ids[0], ids[1]); err != nil {
|
||||||
|
respondError(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
c.Status(http.StatusNoContent)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve handles POST /files/duplicates/resolve — merge a duplicate pair,
|
||||||
|
// keeping one file and folding the chosen fields in from the other. Returns the
|
||||||
|
// updated survivor. delete_discarded defaults to true.
|
||||||
|
func (h *DuplicateHandler) Resolve(c *gin.Context) {
|
||||||
|
var body struct {
|
||||||
|
Keep string `json:"keep" binding:"required"`
|
||||||
|
Discard string `json:"discard" binding:"required"`
|
||||||
|
Fields service.MergeFields `json:"fields"`
|
||||||
|
DeleteDiscarded *bool `json:"delete_discarded"`
|
||||||
|
}
|
||||||
|
if err := c.ShouldBindJSON(&body); err != nil {
|
||||||
|
respondError(c, domain.ErrValidation)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
ids, err := parseUUIDs([]string{body.Keep, body.Discard})
|
||||||
|
if err != nil {
|
||||||
|
respondError(c, domain.ErrValidation)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
del := true
|
||||||
|
if body.DeleteDiscarded != nil {
|
||||||
|
del = *body.DeleteDiscarded
|
||||||
|
}
|
||||||
|
f, err := h.dupSvc.Resolve(c.Request.Context(), service.MergeSpec{
|
||||||
|
Keep: ids[0],
|
||||||
|
Discard: ids[1],
|
||||||
|
Fields: body.Fields,
|
||||||
|
DeleteDiscarded: del,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
respondError(c, err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
respondJSON(c, http.StatusOK, toFileJSON(*f))
|
||||||
|
}
|
||||||
@@ -26,6 +26,7 @@ func NewRouter(
|
|||||||
auth *AuthMiddleware,
|
auth *AuthMiddleware,
|
||||||
authHandler *AuthHandler,
|
authHandler *AuthHandler,
|
||||||
fileHandler *FileHandler,
|
fileHandler *FileHandler,
|
||||||
|
duplicateHandler *DuplicateHandler,
|
||||||
tagHandler *TagHandler,
|
tagHandler *TagHandler,
|
||||||
categoryHandler *CategoryHandler,
|
categoryHandler *CategoryHandler,
|
||||||
poolHandler *PoolHandler,
|
poolHandler *PoolHandler,
|
||||||
@@ -80,7 +81,11 @@ func NewRouter(
|
|||||||
files.GET("", fileHandler.List)
|
files.GET("", fileHandler.List)
|
||||||
files.POST("", fileHandler.Upload)
|
files.POST("", fileHandler.Upload)
|
||||||
|
|
||||||
// Bulk + import routes registered before /:id to prevent param collision.
|
// Bulk + import + duplicates routes registered before /:id to prevent
|
||||||
|
// param collision (e.g. "duplicates" being captured as :id).
|
||||||
|
files.GET("/duplicates", duplicateHandler.List)
|
||||||
|
files.POST("/duplicates/dismiss", duplicateHandler.Dismiss)
|
||||||
|
files.POST("/duplicates/resolve", duplicateHandler.Resolve)
|
||||||
files.POST("/bulk/tags", fileHandler.BulkSetTags)
|
files.POST("/bulk/tags", fileHandler.BulkSetTags)
|
||||||
files.POST("/bulk/delete", fileHandler.BulkDelete)
|
files.POST("/bulk/delete", fileHandler.BulkDelete)
|
||||||
files.POST("/bulk/review", fileHandler.BulkReview)
|
files.POST("/bulk/review", fileHandler.BulkReview)
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ import "testing"
|
|||||||
func TestNewRouterRegisters(t *testing.T) {
|
func TestNewRouterRegisters(t *testing.T) {
|
||||||
r, err := NewRouter(
|
r, err := NewRouter(
|
||||||
(*AuthMiddleware)(nil), (*AuthHandler)(nil),
|
(*AuthMiddleware)(nil), (*AuthHandler)(nil),
|
||||||
(*FileHandler)(nil), (*TagHandler)(nil), (*CategoryHandler)(nil), (*PoolHandler)(nil),
|
(*FileHandler)(nil), (*DuplicateHandler)(nil), (*TagHandler)(nil), (*CategoryHandler)(nil), (*PoolHandler)(nil),
|
||||||
(*UserHandler)(nil), (*ACLHandler)(nil), (*AuditHandler)(nil),
|
(*UserHandler)(nil), (*ACLHandler)(nil), (*AuditHandler)(nil),
|
||||||
"", nil,
|
"", nil,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -56,6 +56,9 @@ type harness struct {
|
|||||||
client *http.Client
|
client *http.Client
|
||||||
importDir string
|
importDir string
|
||||||
pool *pgxpool.Pool
|
pool *pgxpool.Pool
|
||||||
|
// dupSvc lets duplicate tests trigger a pairs rescan directly: rebuilding the
|
||||||
|
// pairs table is a CLI/maintenance action with no HTTP endpoint.
|
||||||
|
dupSvc *service.DuplicateService
|
||||||
}
|
}
|
||||||
|
|
||||||
// setupSuite creates an ephemeral database, runs migrations, wires the full
|
// setupSuite creates an ephemeral database, runs migrations, wires the full
|
||||||
@@ -125,6 +128,8 @@ func setupSuite(t *testing.T) *harness {
|
|||||||
tagRuleRepo := postgres.NewTagRuleRepo(pool)
|
tagRuleRepo := postgres.NewTagRuleRepo(pool)
|
||||||
categoryRepo := postgres.NewCategoryRepo(pool)
|
categoryRepo := postgres.NewCategoryRepo(pool)
|
||||||
poolRepo := postgres.NewPoolRepo(pool)
|
poolRepo := postgres.NewPoolRepo(pool)
|
||||||
|
duplicatePairRepo := postgres.NewDuplicatePairRepo(pool)
|
||||||
|
dismissalRepo := postgres.NewDismissalRepo(pool)
|
||||||
transactor := postgres.NewTransactor(pool)
|
transactor := postgres.NewTransactor(pool)
|
||||||
|
|
||||||
// --- Services ------------------------------------------------------------
|
// --- Services ------------------------------------------------------------
|
||||||
@@ -134,6 +139,7 @@ func setupSuite(t *testing.T) *harness {
|
|||||||
tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor)
|
tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor)
|
||||||
categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc)
|
categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc)
|
||||||
poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc)
|
poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc)
|
||||||
|
duplicateSvc := service.NewDuplicateService(fileRepo, duplicatePairRepo, dismissalRepo, aclSvc, auditSvc, transactor, 10)
|
||||||
fileSvc := service.NewFileService(fileRepo, mimeRepo, diskStorage, aclSvc, auditSvc, tagSvc, transactor, importDir)
|
fileSvc := service.NewFileService(fileRepo, mimeRepo, diskStorage, aclSvc, auditSvc, tagSvc, transactor, importDir)
|
||||||
userSvc := service.NewUserService(userRepo, sessionRepo, auditSvc)
|
userSvc := service.NewUserService(userRepo, sessionRepo, auditSvc)
|
||||||
|
|
||||||
@@ -145,6 +151,7 @@ func setupSuite(t *testing.T) *harness {
|
|||||||
authMiddleware := handler.NewAuthMiddleware(authSvc)
|
authMiddleware := handler.NewAuthMiddleware(authSvc)
|
||||||
authHandler := handler.NewAuthHandler(authSvc)
|
authHandler := handler.NewAuthHandler(authSvc)
|
||||||
fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, 500<<20)
|
fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, 500<<20)
|
||||||
|
duplicateHandler := handler.NewDuplicateHandler(duplicateSvc)
|
||||||
tagHandler := handler.NewTagHandler(tagSvc, fileSvc)
|
tagHandler := handler.NewTagHandler(tagSvc, fileSvc)
|
||||||
categoryHandler := handler.NewCategoryHandler(categorySvc)
|
categoryHandler := handler.NewCategoryHandler(categorySvc)
|
||||||
poolHandler := handler.NewPoolHandler(poolSvc)
|
poolHandler := handler.NewPoolHandler(poolSvc)
|
||||||
@@ -154,7 +161,7 @@ func setupSuite(t *testing.T) *harness {
|
|||||||
|
|
||||||
r, err := handler.NewRouter(
|
r, err := handler.NewRouter(
|
||||||
authMiddleware, authHandler,
|
authMiddleware, authHandler,
|
||||||
fileHandler, tagHandler, categoryHandler, poolHandler,
|
fileHandler, duplicateHandler, tagHandler, categoryHandler, poolHandler,
|
||||||
userHandler, aclHandler, auditHandler,
|
userHandler, aclHandler, auditHandler,
|
||||||
"",
|
"",
|
||||||
nil,
|
nil,
|
||||||
@@ -170,6 +177,7 @@ func setupSuite(t *testing.T) *harness {
|
|||||||
client: srv.Client(),
|
client: srv.Client(),
|
||||||
importDir: importDir,
|
importDir: importDir,
|
||||||
pool: pool,
|
pool: pool,
|
||||||
|
dupSvc: duplicateSvc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1643,3 +1651,103 @@ var (
|
|||||||
_ = freePort
|
_ = freePort
|
||||||
_ = writeFile
|
_ = writeFile
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// dupListResponse decodes GET /files/duplicates.
|
||||||
|
type dupListResponse struct {
|
||||||
|
Items []struct {
|
||||||
|
Files []struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
Tags []struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
} `json:"tags"`
|
||||||
|
} `json:"files"`
|
||||||
|
} `json:"items"`
|
||||||
|
Total int `json:"total"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestDuplicateDetection exercises the full duplicate lifecycle: perceptual hashes
|
||||||
|
// are computed on upload, a rescan builds the pairs table, the cluster surfaces,
|
||||||
|
// a field-by-field merge unions tags and trashes the discarded file, and a
|
||||||
|
// dismissal hides a pair permanently (surviving a re-rescan).
|
||||||
|
//
|
||||||
|
// minimalJPEG() is a 1×1 image, so every upload hashes identically — in a fresh
|
||||||
|
// database any two uploads form one duplicate pair, which keeps this deterministic.
|
||||||
|
func TestDuplicateDetection(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("skipping integration test in short mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
h := setupSuite(t)
|
||||||
|
ctx := context.Background()
|
||||||
|
admin := h.login("admin", "admin")
|
||||||
|
|
||||||
|
// --- two uploads => one duplicate pair after a rescan ---------------------
|
||||||
|
f1 := h.uploadJPEG(admin, "a.jpg")["id"].(string)
|
||||||
|
f2 := h.uploadJPEG(admin, "b.jpg")["id"].(string)
|
||||||
|
|
||||||
|
// Tag f2 so the merge has something to union onto the survivor.
|
||||||
|
resp := h.doJSON("POST", "/tags", map[string]any{"name": "kept", "is_public": true}, admin)
|
||||||
|
require.Equal(t, http.StatusCreated, resp.StatusCode, resp.String())
|
||||||
|
var tag map[string]any
|
||||||
|
resp.decode(t, &tag)
|
||||||
|
tagID := tag["id"].(string)
|
||||||
|
resp = h.doJSON("PUT", "/files/"+f2+"/tags", map[string]any{"tag_ids": []string{tagID}}, admin)
|
||||||
|
require.Equal(t, http.StatusOK, resp.StatusCode, resp.String())
|
||||||
|
|
||||||
|
require.NoError(t, h.dupSvc.Rescan(ctx, nil))
|
||||||
|
|
||||||
|
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
|
||||||
|
require.Equal(t, http.StatusOK, resp.StatusCode, resp.String())
|
||||||
|
var list dupListResponse
|
||||||
|
resp.decode(t, &list)
|
||||||
|
require.Equal(t, 1, list.Total, "expected one duplicate cluster: %s", resp)
|
||||||
|
require.Len(t, list.Items, 1)
|
||||||
|
require.Len(t, list.Items[0].Files, 2)
|
||||||
|
|
||||||
|
// --- resolve: keep f1, union tags from f2, trash f2 ----------------------
|
||||||
|
resp = h.doJSON("POST", "/files/duplicates/resolve", map[string]any{
|
||||||
|
"keep": f1,
|
||||||
|
"discard": f2,
|
||||||
|
"fields": map[string]any{"tags": "both"},
|
||||||
|
"delete_discarded": true,
|
||||||
|
}, admin)
|
||||||
|
require.Equal(t, http.StatusOK, resp.StatusCode, resp.String())
|
||||||
|
var survivor struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
Tags []struct {
|
||||||
|
ID string `json:"id"`
|
||||||
|
} `json:"tags"`
|
||||||
|
}
|
||||||
|
resp.decode(t, &survivor)
|
||||||
|
assert.Equal(t, f1, survivor.ID)
|
||||||
|
require.Len(t, survivor.Tags, 1, "survivor should have inherited the discarded file's tag")
|
||||||
|
assert.Equal(t, tagID, survivor.Tags[0].ID)
|
||||||
|
|
||||||
|
// f2 is now trashed, so the pair drops out of the duplicates view.
|
||||||
|
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
|
||||||
|
resp.decode(t, &list)
|
||||||
|
assert.Equal(t, 0, list.Total, "resolved pair should no longer surface: %s", resp)
|
||||||
|
|
||||||
|
// --- dismiss: a new pair, hidden and staying hidden across a rescan -------
|
||||||
|
f3 := h.uploadJPEG(admin, "c.jpg")["id"].(string)
|
||||||
|
require.NoError(t, h.dupSvc.Rescan(ctx, nil))
|
||||||
|
|
||||||
|
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
|
||||||
|
resp.decode(t, &list)
|
||||||
|
require.Equal(t, 1, list.Total, "f1 and f3 should now form a cluster: %s", resp)
|
||||||
|
|
||||||
|
resp = h.doJSON("POST", "/files/duplicates/dismiss", map[string]any{
|
||||||
|
"file_id_a": f1, "file_id_b": f3,
|
||||||
|
}, admin)
|
||||||
|
require.Equal(t, http.StatusNoContent, resp.StatusCode, resp.String())
|
||||||
|
|
||||||
|
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
|
||||||
|
resp.decode(t, &list)
|
||||||
|
assert.Equal(t, 0, list.Total, "dismissed pair should be hidden")
|
||||||
|
|
||||||
|
// A rescan re-finds the pair but the dismissal still hides it.
|
||||||
|
require.NoError(t, h.dupSvc.Rescan(ctx, nil))
|
||||||
|
resp = h.doJSON("GET", "/files/duplicates", nil, admin)
|
||||||
|
resp.decode(t, &list)
|
||||||
|
assert.Equal(t, 0, list.Total, "dismissal must survive a rescan")
|
||||||
|
}
|
||||||
|
|||||||
@@ -52,6 +52,15 @@ type FileRepo interface {
|
|||||||
SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bool) error
|
SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bool) error
|
||||||
// SetPHash sets (or clears, when nil) the perceptual hash of a file.
|
// SetPHash sets (or clears, when nil) the perceptual hash of a file.
|
||||||
SetPHash(ctx context.Context, id uuid.UUID, phash *int64) error
|
SetPHash(ctx context.Context, id uuid.UUID, phash *int64) error
|
||||||
|
// ListMissingPHash returns live image/video files that have no perceptual
|
||||||
|
// hash yet (the dedup backfill work list).
|
||||||
|
ListMissingPHash(ctx context.Context) ([]domain.File, error)
|
||||||
|
// ListAllPHashes returns the id and perceptual hash of every live, hashed
|
||||||
|
// file (the global input to the dedup rescan; not ACL-filtered).
|
||||||
|
ListAllPHashes(ctx context.Context) ([]domain.PHashEntry, error)
|
||||||
|
// CopyPoolMemberships adds targetID to every pool sourceID belongs to,
|
||||||
|
// skipping pools target is already in (used by the duplicate merge).
|
||||||
|
CopyPoolMemberships(ctx context.Context, targetID, sourceID uuid.UUID) error
|
||||||
// SoftDelete moves a file to trash (sets is_deleted = true).
|
// SoftDelete moves a file to trash (sets is_deleted = true).
|
||||||
SoftDelete(ctx context.Context, id uuid.UUID) error
|
SoftDelete(ctx context.Context, id uuid.UUID) error
|
||||||
// Restore moves a file out of trash (sets is_deleted = false).
|
// Restore moves a file out of trash (sets is_deleted = false).
|
||||||
@@ -72,6 +81,21 @@ type FileRepo interface {
|
|||||||
RecordTagUses(ctx context.Context, userID int16, filterDSL string) error
|
RecordTagUses(ctx context.Context, userID int16, filterDSL string) error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DuplicatePairRepo persists the precomputed near-duplicate candidate pairs.
|
||||||
|
type DuplicatePairRepo interface {
|
||||||
|
// ReplaceAll atomically replaces the whole pairs table (used by the rescan).
|
||||||
|
ReplaceAll(ctx context.Context, pairs []domain.DuplicatePair) error
|
||||||
|
// ListVisible returns pairs whose both files are live, not dismissed, and
|
||||||
|
// (for non-admins) visible to the viewer.
|
||||||
|
ListVisible(ctx context.Context, viewerID int16, isAdmin bool) ([]domain.DuplicatePair, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DismissalRepo persists "not a duplicate" decisions.
|
||||||
|
type DismissalRepo interface {
|
||||||
|
// Add records a pair as dismissed (canonical order, idempotent).
|
||||||
|
Add(ctx context.Context, a, b uuid.UUID, userID int16) error
|
||||||
|
}
|
||||||
|
|
||||||
// TagRepo is the persistence interface for tags.
|
// TagRepo is the persistence interface for tags.
|
||||||
type TagRepo interface {
|
type TagRepo interface {
|
||||||
List(ctx context.Context, params OffsetParams) (*domain.TagOffsetPage, error)
|
List(ctx context.Context, params OffsetParams) (*domain.TagOffsetPage, error)
|
||||||
|
|||||||
@@ -0,0 +1,148 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"math/bits"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
|
||||||
|
"tanabata/backend/internal/domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
// hamming returns the number of differing bits between two perceptual hashes.
|
||||||
|
func hamming(a, b uint64) int { return bits.OnesCount64(a ^ b) }
|
||||||
|
|
||||||
|
// bkNode is a node in a BK-tree over Hamming distance. Files that share the exact
|
||||||
|
// same hash are collected in ids (a distance-0 collision), so identical images
|
||||||
|
// don't degenerate the tree into a chain.
|
||||||
|
type bkNode struct {
|
||||||
|
hash uint64
|
||||||
|
ids []uuid.UUID
|
||||||
|
children map[int]*bkNode
|
||||||
|
}
|
||||||
|
|
||||||
|
// bkTree indexes perceptual hashes for sublinear radius queries. Building one and
|
||||||
|
// querying every element with a small radius is far cheaper than the O(N²) all-
|
||||||
|
// pairs comparison at 100k+ files.
|
||||||
|
type bkTree struct{ root *bkNode }
|
||||||
|
|
||||||
|
func (t *bkTree) insert(hash uint64, id uuid.UUID) {
|
||||||
|
if t.root == nil {
|
||||||
|
t.root = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
node := t.root
|
||||||
|
for {
|
||||||
|
d := hamming(hash, node.hash)
|
||||||
|
if d == 0 {
|
||||||
|
node.ids = append(node.ids, id)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
child, ok := node.children[d]
|
||||||
|
if !ok {
|
||||||
|
node.children[d] = &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
node = child
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// query visits every node whose hash is within radius of target. The triangle
|
||||||
|
// inequality bounds which children can hold a match to [d-radius, d+radius].
|
||||||
|
func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) {
|
||||||
|
if t.root == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
stack := []*bkNode{t.root}
|
||||||
|
for len(stack) > 0 {
|
||||||
|
node := stack[len(stack)-1]
|
||||||
|
stack = stack[:len(stack)-1]
|
||||||
|
|
||||||
|
d := hamming(target, node.hash)
|
||||||
|
if d <= radius {
|
||||||
|
visit(node, d)
|
||||||
|
}
|
||||||
|
lo, hi := d-radius, d+radius
|
||||||
|
for cd, child := range node.children {
|
||||||
|
if cd >= lo && cd <= hi {
|
||||||
|
stack = append(stack, child)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildPairs returns every unordered pair of files whose hashes are within
|
||||||
|
// threshold, each emitted exactly once with FileA < FileB (UUID byte order).
|
||||||
|
// onProgress, if set, is called periodically with (processed, total).
|
||||||
|
func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair {
|
||||||
|
tree := &bkTree{}
|
||||||
|
for _, e := range entries {
|
||||||
|
tree.insert(uint64(e.PHash), e.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
var pairs []domain.DuplicatePair
|
||||||
|
total := len(entries)
|
||||||
|
for i := range entries {
|
||||||
|
e := entries[i]
|
||||||
|
tree.query(uint64(e.PHash), threshold, func(node *bkNode, dist int) {
|
||||||
|
for _, other := range node.ids {
|
||||||
|
// Emit each pair once, from the smaller id, which also skips self.
|
||||||
|
if bytes.Compare(e.ID[:], other[:]) < 0 {
|
||||||
|
pairs = append(pairs, domain.DuplicatePair{FileA: e.ID, FileB: other, Distance: dist})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
|
if onProgress != nil && (i+1)%1000 == 0 {
|
||||||
|
onProgress(i+1, total)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if onProgress != nil {
|
||||||
|
onProgress(total, total)
|
||||||
|
}
|
||||||
|
return pairs
|
||||||
|
}
|
||||||
|
|
||||||
|
// clusterPairs groups pairs into connected components (transitive closure) via
|
||||||
|
// union-find. Every returned cluster has at least two files; clusters and the ids
|
||||||
|
// within them are sorted by UUID for stable pagination.
|
||||||
|
func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID {
|
||||||
|
parent := map[uuid.UUID]uuid.UUID{}
|
||||||
|
var find func(uuid.UUID) uuid.UUID
|
||||||
|
find = func(x uuid.UUID) uuid.UUID {
|
||||||
|
p, ok := parent[x]
|
||||||
|
if !ok {
|
||||||
|
parent[x] = x
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
if p != x {
|
||||||
|
parent[x] = find(p)
|
||||||
|
}
|
||||||
|
return parent[x]
|
||||||
|
}
|
||||||
|
union := func(a, b uuid.UUID) {
|
||||||
|
ra, rb := find(a), find(b)
|
||||||
|
if ra != rb {
|
||||||
|
parent[ra] = rb
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for _, p := range pairs {
|
||||||
|
union(p.FileA, p.FileB)
|
||||||
|
}
|
||||||
|
|
||||||
|
groups := map[uuid.UUID][]uuid.UUID{}
|
||||||
|
for node := range parent {
|
||||||
|
root := find(node)
|
||||||
|
groups[root] = append(groups[root], node)
|
||||||
|
}
|
||||||
|
|
||||||
|
clusters := make([][]uuid.UUID, 0, len(groups))
|
||||||
|
for _, ids := range groups {
|
||||||
|
sort.Slice(ids, func(i, j int) bool { return bytes.Compare(ids[i][:], ids[j][:]) < 0 })
|
||||||
|
clusters = append(clusters, ids)
|
||||||
|
}
|
||||||
|
sort.Slice(clusters, func(i, j int) bool {
|
||||||
|
return bytes.Compare(clusters[i][0][:], clusters[j][0][:]) < 0
|
||||||
|
})
|
||||||
|
return clusters
|
||||||
|
}
|
||||||
@@ -0,0 +1,361 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
|
||||||
|
"tanabata/backend/internal/domain"
|
||||||
|
"tanabata/backend/internal/port"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Merge field source values.
|
||||||
|
const (
|
||||||
|
mergeKeep = "keep"
|
||||||
|
mergeDiscard = "discard"
|
||||||
|
mergeBoth = "both"
|
||||||
|
mergeMerge = "merge"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MergeFields chooses, per field, which file supplies the survivor's value when
|
||||||
|
// resolving a duplicate. Scalars accept "keep"/"discard"; metadata also accepts
|
||||||
|
// "merge" (shallow object merge, survivor wins on key conflicts); relations
|
||||||
|
// (tags, pools) accept "keep"/"both" (union) — there is deliberately no option
|
||||||
|
// to drop the survivor's own tags/pools. An empty value defaults to "keep".
|
||||||
|
type MergeFields struct {
|
||||||
|
OriginalName string `json:"original_name"`
|
||||||
|
Notes string `json:"notes"`
|
||||||
|
ContentDatetime string `json:"content_datetime"`
|
||||||
|
IsPublic string `json:"is_public"`
|
||||||
|
Metadata string `json:"metadata"`
|
||||||
|
Tags string `json:"tags"`
|
||||||
|
Pools string `json:"pools"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// MergeSpec is the input to a duplicate resolution: keep one file, fold chosen
|
||||||
|
// fields in from the other, and (usually) trash the other.
|
||||||
|
type MergeSpec struct {
|
||||||
|
Keep uuid.UUID
|
||||||
|
Discard uuid.UUID
|
||||||
|
Fields MergeFields
|
||||||
|
DeleteDiscarded bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize fills empty choices with "keep" and rejects unknown values.
|
||||||
|
func (m *MergeSpec) normalize() error {
|
||||||
|
scalar := func(v *string) error {
|
||||||
|
if *v == "" {
|
||||||
|
*v = mergeKeep
|
||||||
|
}
|
||||||
|
if *v != mergeKeep && *v != mergeDiscard {
|
||||||
|
return domain.ErrValidation
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
relation := func(v *string) error {
|
||||||
|
if *v == "" {
|
||||||
|
*v = mergeKeep
|
||||||
|
}
|
||||||
|
if *v != mergeKeep && *v != mergeBoth {
|
||||||
|
return domain.ErrValidation
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
f := &m.Fields
|
||||||
|
if err := scalar(&f.OriginalName); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := scalar(&f.Notes); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := scalar(&f.ContentDatetime); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := scalar(&f.IsPublic); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if f.Metadata == "" {
|
||||||
|
f.Metadata = mergeKeep
|
||||||
|
}
|
||||||
|
if f.Metadata != mergeKeep && f.Metadata != mergeDiscard && f.Metadata != mergeMerge {
|
||||||
|
return domain.ErrValidation
|
||||||
|
}
|
||||||
|
if err := relation(&f.Tags); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := relation(&f.Pools); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DuplicateService finds near-duplicate clusters and resolves them.
|
||||||
|
type DuplicateService struct {
|
||||||
|
files port.FileRepo
|
||||||
|
pairs port.DuplicatePairRepo
|
||||||
|
dismissals port.DismissalRepo
|
||||||
|
acl *ACLService
|
||||||
|
audit *AuditService
|
||||||
|
tx port.Transactor
|
||||||
|
threshold int
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDuplicateService creates a DuplicateService. threshold is the maximum
|
||||||
|
// Hamming distance for two files to be treated as duplicate candidates.
|
||||||
|
func NewDuplicateService(
|
||||||
|
files port.FileRepo,
|
||||||
|
pairs port.DuplicatePairRepo,
|
||||||
|
dismissals port.DismissalRepo,
|
||||||
|
acl *ACLService,
|
||||||
|
audit *AuditService,
|
||||||
|
tx port.Transactor,
|
||||||
|
threshold int,
|
||||||
|
) *DuplicateService {
|
||||||
|
return &DuplicateService{
|
||||||
|
files: files,
|
||||||
|
pairs: pairs,
|
||||||
|
dismissals: dismissals,
|
||||||
|
acl: acl,
|
||||||
|
audit: audit,
|
||||||
|
tx: tx,
|
||||||
|
threshold: threshold,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clusters returns a page of duplicate clusters visible to the caller. Pairs are
|
||||||
|
// read from the precomputed table (no all-pairs scan here) and grouped into
|
||||||
|
// connected components; pagination is over whole clusters.
|
||||||
|
func (s *DuplicateService) Clusters(ctx context.Context, limit, offset int) (clusters [][]domain.File, total int, err error) {
|
||||||
|
userID, isAdmin, _ := domain.UserFromContext(ctx)
|
||||||
|
|
||||||
|
pairs, err := s.pairs.ListVisible(ctx, userID, isAdmin)
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
groups := clusterPairs(pairs)
|
||||||
|
total = len(groups)
|
||||||
|
|
||||||
|
if offset < 0 {
|
||||||
|
offset = 0
|
||||||
|
}
|
||||||
|
if offset >= len(groups) {
|
||||||
|
return [][]domain.File{}, total, nil
|
||||||
|
}
|
||||||
|
end := offset + limit
|
||||||
|
if end > len(groups) || limit <= 0 {
|
||||||
|
end = len(groups)
|
||||||
|
}
|
||||||
|
|
||||||
|
out := make([][]domain.File, 0, end-offset)
|
||||||
|
for _, ids := range groups[offset:end] {
|
||||||
|
files := make([]domain.File, 0, len(ids))
|
||||||
|
for _, id := range ids {
|
||||||
|
f, err := s.files.GetByID(ctx, id)
|
||||||
|
if err != nil {
|
||||||
|
// A file deleted between the pair read and now just drops out.
|
||||||
|
if errors.Is(err, domain.ErrNotFound) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
files = append(files, *f)
|
||||||
|
}
|
||||||
|
if len(files) >= 2 {
|
||||||
|
out = append(out, files)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out, total, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rescan recomputes the entire duplicate_pairs table from the current set of
|
||||||
|
// perceptual hashes. It is the only thing that populates the table, so the
|
||||||
|
// duplicates view reflects state as of the last rescan. Called by the dedup CLI.
|
||||||
|
func (s *DuplicateService) Rescan(ctx context.Context, onProgress func(done, total int)) error {
|
||||||
|
entries, err := s.files.ListAllPHashes(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
pairs := buildPairs(entries, s.threshold, onProgress)
|
||||||
|
return s.pairs.ReplaceAll(ctx, pairs)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dismiss records two files as "not a duplicate" so the pair stops surfacing.
|
||||||
|
// The caller must be able to view both files.
|
||||||
|
func (s *DuplicateService) Dismiss(ctx context.Context, a, b uuid.UUID) error {
|
||||||
|
if a == b {
|
||||||
|
return domain.ErrValidation
|
||||||
|
}
|
||||||
|
userID, isAdmin, _ := domain.UserFromContext(ctx)
|
||||||
|
for _, id := range []uuid.UUID{a, b} {
|
||||||
|
f, err := s.files.GetByID(ctx, id)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
ok, err := s.acl.CanView(ctx, userID, isAdmin, f.CreatorID, f.IsPublic, fileObjectTypeID, id)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
return domain.ErrForbidden
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := s.dismissals.Add(ctx, a, b, userID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
objType := fileObjectType
|
||||||
|
_ = s.audit.Log(ctx, "duplicate_dismiss", &objType, &a, map[string]any{"other": b.String()})
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Resolve merges a duplicate pair: the survivor (keep) takes the chosen fields
|
||||||
|
// from the other (discard), and the other is trashed when DeleteDiscarded is set.
|
||||||
|
// The caller must be able to edit both files. Returns the updated survivor.
|
||||||
|
func (s *DuplicateService) Resolve(ctx context.Context, spec MergeSpec) (*domain.File, error) {
|
||||||
|
if spec.Keep == spec.Discard {
|
||||||
|
return nil, domain.ErrValidation
|
||||||
|
}
|
||||||
|
if err := spec.normalize(); err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
keep, err := s.files.GetByID(ctx, spec.Keep)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
discard, err := s.files.GetByID(ctx, spec.Discard)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
userID, isAdmin, _ := domain.UserFromContext(ctx)
|
||||||
|
for _, f := range []*domain.File{keep, discard} {
|
||||||
|
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, f.ID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
return nil, domain.ErrForbidden
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// FileRepo.Update rewrites all editable scalar columns, so build the complete
|
||||||
|
// resolved set (each field from keep or discard) rather than a sparse patch.
|
||||||
|
patch := &domain.File{
|
||||||
|
OriginalName: pickPtr(spec.Fields.OriginalName, keep.OriginalName, discard.OriginalName),
|
||||||
|
Notes: pickPtr(spec.Fields.Notes, keep.Notes, discard.Notes),
|
||||||
|
ContentDatetime: pickTime(spec.Fields.ContentDatetime, keep.ContentDatetime, discard.ContentDatetime),
|
||||||
|
IsPublic: pickBool(spec.Fields.IsPublic, keep.IsPublic, discard.IsPublic),
|
||||||
|
Metadata: pickMetadata(spec.Fields.Metadata, keep.Metadata, discard.Metadata),
|
||||||
|
}
|
||||||
|
|
||||||
|
var result *domain.File
|
||||||
|
txErr := s.tx.WithTx(ctx, func(ctx context.Context) error {
|
||||||
|
updated, err := s.files.Update(ctx, keep.ID, patch)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if spec.Fields.Tags == mergeBoth {
|
||||||
|
if err := s.files.SetTags(ctx, keep.ID, unionTagIDs(keep.Tags, discard.Tags)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tags, err := s.files.ListTags(ctx, keep.ID)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
updated.Tags = tags
|
||||||
|
}
|
||||||
|
if spec.Fields.Pools == mergeBoth {
|
||||||
|
if err := s.files.CopyPoolMemberships(ctx, keep.ID, discard.ID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if spec.DeleteDiscarded {
|
||||||
|
if err := s.files.SoftDelete(ctx, discard.ID); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result = updated
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
if txErr != nil {
|
||||||
|
return nil, txErr
|
||||||
|
}
|
||||||
|
|
||||||
|
objType := fileObjectType
|
||||||
|
_ = s.audit.Log(ctx, "file_merge", &objType, &keep.ID, map[string]any{
|
||||||
|
"discard": spec.Discard.String(),
|
||||||
|
"fields": spec.Fields,
|
||||||
|
"deleted_discarded": spec.DeleteDiscarded,
|
||||||
|
})
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- field pickers ---------------------------------------------------------
|
||||||
|
|
||||||
|
func pickPtr(choice string, keep, discard *string) *string {
|
||||||
|
if choice == mergeDiscard {
|
||||||
|
return discard
|
||||||
|
}
|
||||||
|
return keep
|
||||||
|
}
|
||||||
|
|
||||||
|
func pickBool(choice string, keep, discard bool) bool {
|
||||||
|
if choice == mergeDiscard {
|
||||||
|
return discard
|
||||||
|
}
|
||||||
|
return keep
|
||||||
|
}
|
||||||
|
|
||||||
|
func pickTime(choice string, keep, discard time.Time) time.Time {
|
||||||
|
if choice == mergeDiscard {
|
||||||
|
return discard
|
||||||
|
}
|
||||||
|
return keep
|
||||||
|
}
|
||||||
|
|
||||||
|
func unionTagIDs(a, b []domain.Tag) []uuid.UUID {
|
||||||
|
seen := make(map[uuid.UUID]bool, len(a)+len(b))
|
||||||
|
ids := make([]uuid.UUID, 0, len(a)+len(b))
|
||||||
|
for _, t := range append(append([]domain.Tag{}, a...), b...) {
|
||||||
|
if !seen[t.ID] {
|
||||||
|
seen[t.ID] = true
|
||||||
|
ids = append(ids, t.ID)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ids
|
||||||
|
}
|
||||||
|
|
||||||
|
// pickMetadata returns keep's metadata, discard's, or a shallow merge in which
|
||||||
|
// the survivor's keys win on conflict.
|
||||||
|
func pickMetadata(choice string, keep, discard json.RawMessage) json.RawMessage {
|
||||||
|
switch choice {
|
||||||
|
case mergeDiscard:
|
||||||
|
return discard
|
||||||
|
case mergeMerge:
|
||||||
|
km := map[string]json.RawMessage{}
|
||||||
|
dm := map[string]json.RawMessage{}
|
||||||
|
_ = json.Unmarshal(keep, &km)
|
||||||
|
_ = json.Unmarshal(discard, &dm)
|
||||||
|
out := make(map[string]json.RawMessage, len(km)+len(dm))
|
||||||
|
for k, v := range dm {
|
||||||
|
out[k] = v
|
||||||
|
}
|
||||||
|
for k, v := range km { // survivor wins
|
||||||
|
out[k] = v
|
||||||
|
}
|
||||||
|
if len(out) == 0 {
|
||||||
|
return keep
|
||||||
|
}
|
||||||
|
b, err := json.Marshal(out)
|
||||||
|
if err != nil {
|
||||||
|
return keep
|
||||||
|
}
|
||||||
|
return b
|
||||||
|
default:
|
||||||
|
return keep
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,152 @@
|
|||||||
|
package service
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/json"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"sort"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/google/uuid"
|
||||||
|
|
||||||
|
"tanabata/backend/internal/domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
// id builds a deterministic UUID whose byte order matches n, so tests can reason
|
||||||
|
// about the canonical (FileA < FileB) ordering buildPairs produces.
|
||||||
|
func id(n int) uuid.UUID {
|
||||||
|
return uuid.MustParse(fmt.Sprintf("00000000-0000-0000-0000-%012d", n))
|
||||||
|
}
|
||||||
|
|
||||||
|
func entry(n int, hash uint64) domain.PHashEntry {
|
||||||
|
return domain.PHashEntry{ID: id(n), PHash: int64(hash)}
|
||||||
|
}
|
||||||
|
|
||||||
|
// pairKey canonicalises a pair for set comparison regardless of emission order.
|
||||||
|
func pairKey(p domain.DuplicatePair) string {
|
||||||
|
a, b := p.FileA, p.FileB
|
||||||
|
if bytes.Compare(a[:], b[:]) > 0 {
|
||||||
|
a, b = b, a
|
||||||
|
}
|
||||||
|
return fmt.Sprintf("%s|%s|%d", a, b, p.Distance)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildPairs_ThresholdAndCanonicalOrder(t *testing.T) {
|
||||||
|
entries := []domain.PHashEntry{
|
||||||
|
entry(1, 0x0000000000000000),
|
||||||
|
entry(2, 0x0000000000000001), // distance 1 from #1
|
||||||
|
entry(3, 0x00000000000000FF), // distance 8 from #1, 7 from #2
|
||||||
|
entry(4, 0xFFFFFFFFFFFFFFFF), // distance 64 from #1
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tight threshold: only the distance-1 pair qualifies.
|
||||||
|
got := buildPairs(entries, 2, nil)
|
||||||
|
if len(got) != 1 {
|
||||||
|
t.Fatalf("threshold 2: got %d pairs, want 1: %+v", len(got), got)
|
||||||
|
}
|
||||||
|
if got[0].FileA != id(1) || got[0].FileB != id(2) || got[0].Distance != 1 {
|
||||||
|
t.Errorf("threshold 2: unexpected pair %+v", got[0])
|
||||||
|
}
|
||||||
|
// Canonical order always FileA < FileB.
|
||||||
|
if bytes.Compare(got[0].FileA[:], got[0].FileB[:]) >= 0 {
|
||||||
|
t.Error("pair not in canonical FileA < FileB order")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Looser threshold pulls in #3's pairs but never #4.
|
||||||
|
got8 := buildPairs(entries, 8, nil)
|
||||||
|
want := map[string]bool{
|
||||||
|
pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(2), Distance: 1}): true,
|
||||||
|
pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(3), Distance: 8}): true,
|
||||||
|
pairKey(domain.DuplicatePair{FileA: id(2), FileB: id(3), Distance: 7}): true,
|
||||||
|
}
|
||||||
|
if len(got8) != len(want) {
|
||||||
|
t.Fatalf("threshold 8: got %d pairs, want %d: %+v", len(got8), len(want), got8)
|
||||||
|
}
|
||||||
|
for _, p := range got8 {
|
||||||
|
if !want[pairKey(p)] {
|
||||||
|
t.Errorf("threshold 8: unexpected pair %+v", p)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestBuildPairs_IdenticalHashesPairAtDistanceZero(t *testing.T) {
|
||||||
|
entries := []domain.PHashEntry{
|
||||||
|
entry(1, 0xABCDABCDABCDABCD),
|
||||||
|
entry(2, 0xABCDABCDABCDABCD),
|
||||||
|
}
|
||||||
|
got := buildPairs(entries, 0, nil)
|
||||||
|
if len(got) != 1 || got[0].Distance != 0 || got[0].FileA != id(1) || got[0].FileB != id(2) {
|
||||||
|
t.Fatalf("identical hashes: got %+v, want one distance-0 pair (1,2)", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestClusterPairs_ConnectedComponents(t *testing.T) {
|
||||||
|
pairs := []domain.DuplicatePair{
|
||||||
|
{FileA: id(1), FileB: id(2)},
|
||||||
|
{FileA: id(2), FileB: id(3)}, // transitively joins 1-2-3
|
||||||
|
{FileA: id(5), FileB: id(6)},
|
||||||
|
}
|
||||||
|
clusters := clusterPairs(pairs)
|
||||||
|
if len(clusters) != 2 {
|
||||||
|
t.Fatalf("got %d clusters, want 2: %+v", len(clusters), clusters)
|
||||||
|
}
|
||||||
|
// Sorted by smallest id: {1,2,3} then {5,6}.
|
||||||
|
if len(clusters[0]) != 3 || clusters[0][0] != id(1) || clusters[0][2] != id(3) {
|
||||||
|
t.Errorf("cluster 0 = %v, want [1 2 3]", clusters[0])
|
||||||
|
}
|
||||||
|
if len(clusters[1]) != 2 || clusters[1][0] != id(5) {
|
||||||
|
t.Errorf("cluster 1 = %v, want [5 6]", clusters[1])
|
||||||
|
}
|
||||||
|
// Each cluster's ids are sorted.
|
||||||
|
for _, c := range clusters {
|
||||||
|
if !sort.SliceIsSorted(c, func(i, j int) bool { return bytes.Compare(c[i][:], c[j][:]) < 0 }) {
|
||||||
|
t.Errorf("cluster not sorted: %v", c)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestPickMetadata_Merge(t *testing.T) {
|
||||||
|
keep := json.RawMessage(`{"a":1,"b":2}`)
|
||||||
|
discard := json.RawMessage(`{"b":9,"c":3}`)
|
||||||
|
|
||||||
|
out := pickMetadata(mergeMerge, keep, discard)
|
||||||
|
var m map[string]int
|
||||||
|
if err := json.Unmarshal(out, &m); err != nil {
|
||||||
|
t.Fatalf("merge result not valid JSON: %v (%s)", err, out)
|
||||||
|
}
|
||||||
|
want := map[string]int{"a": 1, "b": 2, "c": 3} // survivor wins on "b"
|
||||||
|
if fmt.Sprint(m) != fmt.Sprint(want) {
|
||||||
|
t.Errorf("merge = %v, want %v", m, want)
|
||||||
|
}
|
||||||
|
|
||||||
|
if string(pickMetadata(mergeKeep, keep, discard)) != string(keep) {
|
||||||
|
t.Error("keep choice should return survivor metadata unchanged")
|
||||||
|
}
|
||||||
|
if string(pickMetadata(mergeDiscard, keep, discard)) != string(discard) {
|
||||||
|
t.Error("discard choice should return the other file's metadata")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMergeSpec_Normalize(t *testing.T) {
|
||||||
|
// Empty fields default to "keep".
|
||||||
|
spec := MergeSpec{Keep: id(1), Discard: id(2)}
|
||||||
|
if err := spec.normalize(); err != nil {
|
||||||
|
t.Fatalf("normalize empty: %v", err)
|
||||||
|
}
|
||||||
|
if spec.Fields.OriginalName != mergeKeep || spec.Fields.Tags != mergeKeep || spec.Fields.Metadata != mergeKeep {
|
||||||
|
t.Errorf("empty fields not defaulted to keep: %+v", spec.Fields)
|
||||||
|
}
|
||||||
|
|
||||||
|
// "both" is invalid for a scalar field.
|
||||||
|
bad := MergeSpec{Keep: id(1), Discard: id(2), Fields: MergeFields{Notes: mergeBoth}}
|
||||||
|
if err := bad.normalize(); !errors.Is(err, domain.ErrValidation) {
|
||||||
|
t.Errorf("scalar=both: got %v, want ErrValidation", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// "discard" is invalid for a relation field.
|
||||||
|
badRel := MergeSpec{Keep: id(1), Discard: id(2), Fields: MergeFields{Tags: mergeDiscard}}
|
||||||
|
if err := badRel.normalize(); !errors.Is(err, domain.ErrValidation) {
|
||||||
|
t.Errorf("relation=discard: got %v, want ErrValidation", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -92,6 +92,31 @@ CREATE TABLE data.file_pool (
|
|||||||
PRIMARY KEY (file_id, pool_id)
|
PRIMARY KEY (file_id, pool_id)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
-- Precomputed near-duplicate candidates (phash Hamming distance <= threshold),
|
||||||
|
-- (re)built in full by the dedup rescan. Stored once per unordered pair with a
|
||||||
|
-- canonical file_a < file_b ordering so a pair is never duplicated as (a,b)/(b,a).
|
||||||
|
CREATE TABLE data.duplicate_pairs (
|
||||||
|
file_a uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
|
||||||
|
file_b uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
|
||||||
|
distance smallint NOT NULL,
|
||||||
|
|
||||||
|
CONSTRAINT chk__duplicate_pairs__order CHECK (file_a < file_b),
|
||||||
|
PRIMARY KEY (file_a, file_b)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- "Not a duplicate" decisions: a global overlay that hides a candidate pair from
|
||||||
|
-- the duplicates view. Survives rescans (the pair may be re-found but stays
|
||||||
|
-- hidden). Same canonical file_a < file_b ordering as data.duplicate_pairs.
|
||||||
|
CREATE TABLE data.duplicate_dismissals (
|
||||||
|
file_a uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
|
||||||
|
file_b uuid NOT NULL REFERENCES data.files(id) ON UPDATE CASCADE ON DELETE CASCADE,
|
||||||
|
dismissed_by smallint NOT NULL REFERENCES core.users(id) ON UPDATE CASCADE ON DELETE RESTRICT,
|
||||||
|
dismissed_at timestamptz NOT NULL DEFAULT clock_timestamp(),
|
||||||
|
|
||||||
|
CONSTRAINT chk__duplicate_dismissals__order CHECK (file_a < file_b),
|
||||||
|
PRIMARY KEY (file_a, file_b)
|
||||||
|
);
|
||||||
|
|
||||||
COMMENT ON TABLE data.categories IS 'Logical grouping of tags';
|
COMMENT ON TABLE data.categories IS 'Logical grouping of tags';
|
||||||
COMMENT ON TABLE data.tags IS 'File labels/tags';
|
COMMENT ON TABLE data.tags IS 'File labels/tags';
|
||||||
COMMENT ON TABLE data.tag_rules IS 'Auto-tagging rules: when when_tag is assigned, then_tag follows';
|
COMMENT ON TABLE data.tag_rules IS 'Auto-tagging rules: when when_tag is assigned, then_tag follows';
|
||||||
@@ -99,6 +124,8 @@ COMMENT ON TABLE data.files IS 'Managed files; actual content stored on di
|
|||||||
COMMENT ON TABLE data.file_tag IS 'Many-to-many: files <-> tags';
|
COMMENT ON TABLE data.file_tag IS 'Many-to-many: files <-> tags';
|
||||||
COMMENT ON TABLE data.pools IS 'Ordered collections of files';
|
COMMENT ON TABLE data.pools IS 'Ordered collections of files';
|
||||||
COMMENT ON TABLE data.file_pool IS 'Many-to-many: files <-> pools, with ordering';
|
COMMENT ON TABLE data.file_pool IS 'Many-to-many: files <-> pools, with ordering';
|
||||||
|
COMMENT ON TABLE data.duplicate_pairs IS 'Precomputed near-duplicate candidate pairs (perceptual-hash distance)';
|
||||||
|
COMMENT ON TABLE data.duplicate_dismissals IS 'Pairs marked "not a duplicate"; hidden from the duplicates view';
|
||||||
|
|
||||||
COMMENT ON COLUMN data.files.original_name IS 'Original filename at upload time';
|
COMMENT ON COLUMN data.files.original_name IS 'Original filename at upload time';
|
||||||
COMMENT ON COLUMN data.files.content_datetime IS 'Content datetime (e.g. when photo was taken); falls back to EXIF DateTimeOriginal';
|
COMMENT ON COLUMN data.files.content_datetime IS 'Content datetime (e.g. when photo was taken); falls back to EXIF DateTimeOriginal';
|
||||||
@@ -110,6 +137,8 @@ COMMENT ON COLUMN data.file_pool.position IS 'Manual ordering within pool; u
|
|||||||
|
|
||||||
-- +goose Down
|
-- +goose Down
|
||||||
|
|
||||||
|
DROP TABLE IF EXISTS data.duplicate_dismissals;
|
||||||
|
DROP TABLE IF EXISTS data.duplicate_pairs;
|
||||||
DROP TABLE IF EXISTS data.file_pool;
|
DROP TABLE IF EXISTS data.file_pool;
|
||||||
DROP TABLE IF EXISTS data.pools;
|
DROP TABLE IF EXISTS data.pools;
|
||||||
DROP TABLE IF EXISTS data.file_tag;
|
DROP TABLE IF EXISTS data.file_tag;
|
||||||
|
|||||||
@@ -26,6 +26,12 @@ CREATE INDEX idx__files__needs_review ON data.files USING btree (id) WHERE
|
|||||||
CREATE INDEX idx__file_tag__tag_id ON data.file_tag USING hash (tag_id);
|
CREATE INDEX idx__file_tag__tag_id ON data.file_tag USING hash (tag_id);
|
||||||
CREATE INDEX idx__file_tag__file_id ON data.file_tag USING hash (file_id);
|
CREATE INDEX idx__file_tag__file_id ON data.file_tag USING hash (file_id);
|
||||||
|
|
||||||
|
-- data.duplicate_pairs / data.duplicate_dismissals
|
||||||
|
-- The composite primary keys cover lookups on file_a; these add the file_b side
|
||||||
|
-- (used by the ON DELETE CASCADE and by the visibility join on the second file).
|
||||||
|
CREATE INDEX idx__duplicate_pairs__file_b ON data.duplicate_pairs USING hash (file_b);
|
||||||
|
CREATE INDEX idx__duplicate_dismissals__file_b ON data.duplicate_dismissals USING hash (file_b);
|
||||||
|
|
||||||
-- data.pools
|
-- data.pools
|
||||||
CREATE INDEX idx__pools__creator_id ON data.pools USING hash (creator_id);
|
CREATE INDEX idx__pools__creator_id ON data.pools USING hash (creator_id);
|
||||||
|
|
||||||
@@ -70,6 +76,8 @@ DROP INDEX IF EXISTS activity.idx__sessions__token_hash;
|
|||||||
DROP INDEX IF EXISTS activity.idx__sessions__user_id;
|
DROP INDEX IF EXISTS activity.idx__sessions__user_id;
|
||||||
DROP INDEX IF EXISTS acl.idx__acl__user;
|
DROP INDEX IF EXISTS acl.idx__acl__user;
|
||||||
DROP INDEX IF EXISTS acl.idx__acl__object;
|
DROP INDEX IF EXISTS acl.idx__acl__object;
|
||||||
|
DROP INDEX IF EXISTS data.idx__duplicate_dismissals__file_b;
|
||||||
|
DROP INDEX IF EXISTS data.idx__duplicate_pairs__file_b;
|
||||||
DROP INDEX IF EXISTS data.idx__file_pool__file_id;
|
DROP INDEX IF EXISTS data.idx__file_pool__file_id;
|
||||||
DROP INDEX IF EXISTS data.idx__file_pool__pool_id;
|
DROP INDEX IF EXISTS data.idx__file_pool__pool_id;
|
||||||
DROP INDEX IF EXISTS data.idx__pools__creator_id;
|
DROP INDEX IF EXISTS data.idx__pools__creator_id;
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ INSERT INTO activity.action_types (name) VALUES
|
|||||||
-- Files
|
-- Files
|
||||||
('file_create'), ('file_edit'), ('file_delete'), ('file_restore'),
|
('file_create'), ('file_edit'), ('file_delete'), ('file_restore'),
|
||||||
('file_permanent_delete'), ('file_replace'), ('file_review'),
|
('file_permanent_delete'), ('file_replace'), ('file_review'),
|
||||||
|
('file_merge'), ('duplicate_dismiss'),
|
||||||
-- Tags
|
-- Tags
|
||||||
('tag_create'), ('tag_edit'), ('tag_delete'),
|
('tag_create'), ('tag_edit'), ('tag_delete'),
|
||||||
-- Categories
|
-- Categories
|
||||||
|
|||||||
Reference in New Issue
Block a user