feat(backend): duplicate pairs, dismissals, and merge resolution
Adds the duplicate-detection backend on top of perceptual hashing:
- Two tables (edited into the original migrations): data.duplicate_pairs holds
precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and
data.duplicate_dismissals is a global "not a duplicate" overlay that survives
rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
- Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over
the perceptual hashes and replaces the pairs table. This is the only thing
that populates pairs, so GET never compares all-vs-all (scales to 110k+).
- Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-
dismissed), groups them into connected components via union-find, and
paginates whole clusters.
- Resolve merges a pair field-by-field: each scalar from keep or discard,
metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes
the discarded file. Enforces edit ACL on both.
- Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss,
POST /files/duplicates/resolve (registered before /:id to avoid collision).
Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the
DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.
Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and
field validation; an integration test covers rescan -> list -> merge -> dismiss
(including that a dismissal survives a re-rescan).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -70,6 +70,8 @@ func main() {
|
||||
tagRuleRepo := postgres.NewTagRuleRepo(pool)
|
||||
categoryRepo := postgres.NewCategoryRepo(pool)
|
||||
poolRepo := postgres.NewPoolRepo(pool)
|
||||
duplicatePairRepo := postgres.NewDuplicatePairRepo(pool)
|
||||
dismissalRepo := postgres.NewDismissalRepo(pool)
|
||||
transactor := postgres.NewTransactor(pool)
|
||||
|
||||
// Services
|
||||
@@ -86,6 +88,9 @@ func main() {
|
||||
tagSvc := service.NewTagService(tagRepo, tagRuleRepo, aclSvc, auditSvc, transactor)
|
||||
categorySvc := service.NewCategoryService(categoryRepo, tagRepo, aclSvc, auditSvc)
|
||||
poolSvc := service.NewPoolService(poolRepo, aclSvc, auditSvc)
|
||||
duplicateSvc := service.NewDuplicateService(
|
||||
fileRepo, duplicatePairRepo, dismissalRepo, aclSvc, auditSvc, transactor, cfg.DuplicateHashThreshold,
|
||||
)
|
||||
fileSvc := service.NewFileService(
|
||||
fileRepo,
|
||||
mimeRepo,
|
||||
@@ -108,6 +113,7 @@ func main() {
|
||||
authMiddleware := handler.NewAuthMiddleware(authSvc)
|
||||
authHandler := handler.NewAuthHandler(authSvc)
|
||||
fileHandler := handler.NewFileHandler(fileSvc, tagSvc, authSvc, cfg.MaxUploadBytes)
|
||||
duplicateHandler := handler.NewDuplicateHandler(duplicateSvc)
|
||||
tagHandler := handler.NewTagHandler(tagSvc, fileSvc)
|
||||
categoryHandler := handler.NewCategoryHandler(categorySvc)
|
||||
poolHandler := handler.NewPoolHandler(poolSvc)
|
||||
@@ -117,7 +123,7 @@ func main() {
|
||||
|
||||
r, err := handler.NewRouter(
|
||||
authMiddleware, authHandler,
|
||||
fileHandler, tagHandler, categoryHandler, poolHandler,
|
||||
fileHandler, duplicateHandler, tagHandler, categoryHandler, poolHandler,
|
||||
userHandler, aclHandler, auditHandler,
|
||||
cfg.StaticDir,
|
||||
cfg.TrustedProxies,
|
||||
|
||||
Reference in New Issue
Block a user