feat(backend): duplicate pairs, dismissals, and merge resolution

Adds the duplicate-detection backend on top of perceptual hashing:

- Two tables (edited into the original migrations): data.duplicate_pairs holds precomputed near-duplicate candidates (rebuilt wholesale by the rescan), and data.duplicate_dismissals is a global "not a duplicate" overlay that survives rescans. New audit actions file_merge / duplicate_dismiss.
- DuplicateService:
  - Rescan builds every pair within DUPLICATE_HASH_THRESHOLD via a BK-tree over the perceptual hashes and replaces the pairs table. This is the only thing that populates pairs, so GET never compares all-vs-all (scales to 110k+).
  - Clusters reads the precomputed pairs (ACL-filtered, non-trashed, non-dismissed), groups them into connected components via union-find, and paginates whole clusters.
  - Resolve merges a pair field-by-field: each scalar from keep or discard, metadata keep/discard/shallow-merge, tags/pools keep or union; then trashes the discarded file. Enforces edit ACL on both.
  - Dismiss records a canonical pair (view ACL on both).
- Endpoints under /files: GET /files/duplicates, POST /files/duplicates/dismiss, POST /files/duplicates/resolve (registered before /:id to avoid collision). Plain delete reuses /files/bulk/delete.
- Repo support: ListMissingPHash, ListAllPHashes, CopyPoolMemberships, plus the DuplicatePairRepo (ReplaceAll via COPY, ListVisible) and DismissalRepo.

Unit tests cover the BK-tree pairing, union-find clustering, metadata merge and field validation; an integration test covers rescan -> list -> merge -> dismiss (including that a dismissal survives a re-rescan).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:42:37 +03:00
parent 88849cc16b
commit 9216a8687f
15 changed files with 1214 additions and 4 deletions
@@ -0,0 +1,148 @@
+package service
+
+import (
+	"bytes"
+	"math/bits"
+	"sort"
+
+	"github.com/google/uuid"
+
+	"tanabata/backend/internal/domain"
+)
+
+// hamming reports the Hamming distance between two 64-bit perceptual hashes,
+// i.e. the number of bit positions in which a and b differ.
+func hamming(a, b uint64) int {
+	diff := a ^ b // set bits mark the positions where the hashes disagree
+	return bits.OnesCount64(diff)
+}
+
+// bkNode is a node in a BK-tree over Hamming distance. Files that share the exact
+// same hash are collected in ids (a distance-0 collision), so identical images
+// don't degenerate the tree into a chain.
+type bkNode struct {
+	// hash is the perceptual hash this node stands for.
+	hash uint64
+	// ids lists every file that was inserted with exactly this hash.
+	ids []uuid.UUID
+	// children maps a Hamming distance from hash to the subtree reached through
+	// that distance.
+	children map[int]*bkNode
+}
+
+// bkTree indexes perceptual hashes for sublinear radius queries. Building one and
+// querying every element with a small radius is far cheaper than the O(N²)
+// all-pairs comparison at 100k+ files. A nil root means the tree is empty.
+type bkTree struct{ root *bkNode }
+
+// insert files id under hash. Starting at the root it follows, at each node, the
+// child edge labelled with the Hamming distance between hash and that node's
+// hash. An exact match (distance 0) appends id to the node; a missing edge gets
+// a new leaf holding id.
+func (t *bkTree) insert(hash uint64, id uuid.UUID) {
+	leaf := func() *bkNode {
+		return &bkNode{hash: hash, ids: []uuid.UUID{id}, children: map[int]*bkNode{}}
+	}
+	if t.root == nil {
+		t.root = leaf()
+		return
+	}
+
+	cur := t.root
+	for dist := hamming(hash, cur.hash); dist != 0; dist = hamming(hash, cur.hash) {
+		next, found := cur.children[dist]
+		if !found {
+			cur.children[dist] = leaf()
+			return
+		}
+		cur = next
+	}
+	// The loop only ends normally on a node whose hash equals hash.
+	cur.ids = append(cur.ids, id)
+}
+
+// query calls visit for each node whose hash lies within radius of target,
+// passing the node and its distance. By the triangle inequality only children
+// reached through an edge in [dist-radius, dist+radius] can contain a match, so
+// every other subtree is skipped. The traversal uses an explicit stack; the
+// order in which nodes are visited is unspecified.
+func (t *bkTree) query(target uint64, radius int, visit func(node *bkNode, dist int)) {
+	if t.root == nil {
+		return
+	}
+	pending := []*bkNode{t.root}
+	for len(pending) > 0 {
+		last := len(pending) - 1
+		cur := pending[last]
+		pending = pending[:last]
+
+		dist := hamming(target, cur.hash)
+		if dist <= radius {
+			visit(cur, dist)
+		}
+		for edge, sub := range cur.children {
+			if edge < dist-radius || edge > dist+radius {
+				continue
+			}
+			pending = append(pending, sub)
+		}
+	}
+}
+
+// buildPairs indexes entries in a BK-tree and returns every unordered pair of
+// files whose hashes are at most threshold apart. Each pair is emitted once,
+// with FileA < FileB in UUID byte order. If onProgress is non-nil it is called
+// with (processed, total) after every 1000 entries and once more at the end.
+func buildPairs(entries []domain.PHashEntry, threshold int, onProgress func(done, total int)) []domain.DuplicatePair {
+	index := &bkTree{}
+	for i := range entries {
+		index.insert(uint64(entries[i].PHash), entries[i].ID)
+	}
+
+	total := len(entries)
+	report := func(done int) {
+		if onProgress != nil {
+			onProgress(done, total)
+		}
+	}
+
+	var pairs []domain.DuplicatePair
+	for i, cur := range entries {
+		collect := func(node *bkNode, dist int) {
+			for _, other := range node.ids {
+				// Only the smaller id of a pair emits it; an equal id is the
+				// entry itself and is skipped as well.
+				if bytes.Compare(cur.ID[:], other[:]) >= 0 {
+					continue
+				}
+				pairs = append(pairs, domain.DuplicatePair{FileA: cur.ID, FileB: other, Distance: dist})
+			}
+		}
+		index.query(uint64(cur.PHash), threshold, collect)
+		if done := i + 1; done%1000 == 0 {
+			report(done)
+		}
+	}
+	report(total)
+	return pairs
+}
+
+// clusterPairs groups pairs into connected components (transitive closure) via
+// union-find. Every returned cluster has at least two files; clusters and the ids
+// within them are sorted by UUID for stable pagination.
+func clusterPairs(pairs []domain.DuplicatePair) [][]uuid.UUID {
+	parent := map[uuid.UUID]uuid.UUID{}
+	var find func(uuid.UUID) uuid.UUID
+	find = func(x uuid.UUID) uuid.UUID {
+		p, ok := parent[x]
+		if !ok {
+			parent[x] = x
+			return x
+		}
+		if p != x {
+			parent[x] = find(p)
+		}
+		return parent[x]
+	}
+	union := func(a, b uuid.UUID) {
+		ra, rb := find(a), find(b)
+		if ra != rb {
+			parent[ra] = rb
+		}
+	}
+	for _, p := range pairs {
+		union(p.FileA, p.FileB)
+	}
+
+	groups := map[uuid.UUID][]uuid.UUID{}
+	for node := range parent {
+		root := find(node)
+		groups[root] = append(groups[root], node)
+	}
+
+	clusters := make([][]uuid.UUID, 0, len(groups))
+	for _, ids := range groups {
+		sort.Slice(ids, func(i, j int) bool { return bytes.Compare(ids[i][:], ids[j][:]) < 0 })
+		clusters = append(clusters, ids)
+	}
+	sort.Slice(clusters, func(i, j int) bool {
+		return bytes.Compare(clusters[i][0][:], clusters[j][0][:]) < 0
+	})
+	return clusters
+}
@@ -0,0 +1,361 @@
+package service
+
+import (
+	"context"
+	"encoding/json"
+	"errors"
+	"time"
+
+	"github.com/google/uuid"
+
+	"tanabata/backend/internal/domain"
+	"tanabata/backend/internal/port"
+)
+
+// Merge field source values. They are the strings accepted in MergeFields;
+// which of them a given field allows is enforced by MergeSpec.normalize.
+const (
+	mergeKeep    = "keep"    // take the survivor's value (also the default for an empty choice)
+	mergeDiscard = "discard" // take the discarded file's value
+	mergeBoth    = "both"    // relations only: union of both files
+	mergeMerge   = "merge"   // metadata only: shallow object merge
+)
+
+// MergeFields chooses, per field, which file supplies the survivor's value when
+// resolving a duplicate. Scalars accept "keep"/"discard"; metadata also accepts
+// "merge" (shallow object merge, survivor wins on key conflicts); relations
+// (tags, pools) accept "keep"/"both" (union) — there is deliberately no option
+// to drop the survivor's own tags/pools. An empty value defaults to "keep".
+type MergeFields struct {
+	// Scalar fields: "keep" or "discard".
+	OriginalName    string `json:"original_name"`
+	Notes           string `json:"notes"`
+	ContentDatetime string `json:"content_datetime"`
+	IsPublic        string `json:"is_public"`
+	// Metadata: "keep", "discard" or "merge".
+	Metadata string `json:"metadata"`
+	// Relations: "keep" or "both".
+	Tags  string `json:"tags"`
+	Pools string `json:"pools"`
+}
+
+// MergeSpec is the input to a duplicate resolution: keep one file, fold chosen
+// fields in from the other, and (usually) trash the other.
+type MergeSpec struct {
+	// Keep is the surviving file.
+	Keep uuid.UUID
+	// Discard is the other file of the pair; it must differ from Keep.
+	Discard uuid.UUID
+	// Fields selects the source of each merged field.
+	Fields MergeFields
+	// DeleteDiscarded, when set, makes Resolve soft-delete Discard.
+	DeleteDiscarded bool
+}
+
+// normalize replaces every empty field choice with "keep" and returns
+// domain.ErrValidation as soon as a field holds a value its kind does not
+// allow. Fields are processed in declaration order, so fields after the first
+// invalid one are left untouched.
+func (m *MergeSpec) normalize() error {
+	// resolve defaults an empty choice and then requires it to be one of allowed.
+	resolve := func(choice *string, allowed ...string) error {
+		if *choice == "" {
+			*choice = mergeKeep
+		}
+		for _, candidate := range allowed {
+			if *choice == candidate {
+				return nil
+			}
+		}
+		return domain.ErrValidation
+	}
+
+	scalar := []string{mergeKeep, mergeDiscard}
+	metadata := []string{mergeKeep, mergeDiscard, mergeMerge}
+	relation := []string{mergeKeep, mergeBoth}
+
+	fields := &m.Fields
+	checks := []struct {
+		choice  *string
+		allowed []string
+	}{
+		{&fields.OriginalName, scalar},
+		{&fields.Notes, scalar},
+		{&fields.ContentDatetime, scalar},
+		{&fields.IsPublic, scalar},
+		{&fields.Metadata, metadata},
+		{&fields.Tags, relation},
+		{&fields.Pools, relation},
+	}
+	for _, c := range checks {
+		if err := resolve(c.choice, c.allowed...); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// DuplicateService finds near-duplicate clusters and resolves them.
+type DuplicateService struct {
+	files      port.FileRepo          // file reads, updates, tag/pool changes and soft deletes
+	pairs      port.DuplicatePairRepo // precomputed duplicate pairs
+	dismissals port.DismissalRepo     // "not a duplicate" records
+	acl        *ACLService            // view/edit permission checks
+	audit      *AuditService          // audit log of dismissals and merges
+	tx         port.Transactor        // wraps the merge in a transaction
+	threshold  int                    // maximum Hamming distance for a candidate pair
+}
+
+// NewDuplicateService wires a DuplicateService from its collaborators.
+// threshold is the largest Hamming distance at which two files still count as
+// duplicate candidates.
+func NewDuplicateService(
+	files port.FileRepo,
+	pairs port.DuplicatePairRepo,
+	dismissals port.DismissalRepo,
+	acl *ACLService,
+	audit *AuditService,
+	tx port.Transactor,
+	threshold int,
+) *DuplicateService {
+	svc := &DuplicateService{threshold: threshold}
+	svc.files, svc.pairs, svc.dismissals = files, pairs, dismissals
+	svc.acl, svc.audit, svc.tx = acl, audit, tx
+	return svc
+}
+
+// Clusters returns a page of duplicate clusters visible to the caller. Pairs are
+// read from the precomputed table (no all-pairs scan here) and grouped into
+// connected components; pagination is over whole clusters.
+//
+// A negative offset is treated as 0 and a non-positive limit means "up to the
+// end". total is the number of clusters derived from the pairs; it is computed
+// before files that have vanished since the pairs were read are dropped, so a
+// page may hold fewer clusters than total suggests.
+func (s *DuplicateService) Clusters(ctx context.Context, limit, offset int) (clusters [][]domain.File, total int, err error) {
+	userID, isAdmin, _ := domain.UserFromContext(ctx)
+
+	pairs, err := s.pairs.ListVisible(ctx, userID, isAdmin)
+	if err != nil {
+		return nil, 0, err
+	}
+	groups := clusterPairs(pairs)
+	total = len(groups)
+
+	if offset < 0 {
+		offset = 0
+	}
+	if offset >= len(groups) {
+		return [][]domain.File{}, total, nil
+	}
+	// Clamp limit against what is left instead of computing offset+limit first:
+	// that sum can overflow for a very large limit and produce a negative end.
+	remaining := len(groups) - offset
+	if limit <= 0 || limit > remaining {
+		limit = remaining
+	}
+
+	out := make([][]domain.File, 0, limit)
+	for _, ids := range groups[offset : offset+limit] {
+		files := make([]domain.File, 0, len(ids))
+		for _, id := range ids {
+			file, getErr := s.files.GetByID(ctx, id)
+			if getErr != nil {
+				// A file deleted between the pair read and now just drops out.
+				if errors.Is(getErr, domain.ErrNotFound) {
+					continue
+				}
+				return nil, 0, getErr
+			}
+			files = append(files, *file)
+		}
+		// A cluster that shrank below two files is no longer a duplicate group.
+		if len(files) >= 2 {
+			out = append(out, files)
+		}
+	}
+	return out, total, nil
+}
+
+// Rescan rebuilds the whole duplicate_pairs table: it loads every perceptual
+// hash, computes all pairs within the configured threshold and hands them to
+// ReplaceAll. Nothing else populates the table, so the duplicates view reflects
+// the state as of the last rescan. Called by the dedup CLI. onProgress is
+// forwarded to the pair builder.
+func (s *DuplicateService) Rescan(ctx context.Context, onProgress func(done, total int)) error {
+	hashes, err := s.files.ListAllPHashes(ctx)
+	if err != nil {
+		return err
+	}
+	return s.pairs.ReplaceAll(ctx, buildPairs(hashes, s.threshold, onProgress))
+}
+
+// Dismiss marks a and b as "not a duplicate" so the pair stops surfacing.
+// It returns domain.ErrValidation when a == b and domain.ErrForbidden when the
+// caller cannot view one of the files; lookup and ACL errors are returned as-is.
+// The audit entry is written after the dismissal is stored and its error is
+// discarded, so a logging failure does not fail the call.
+func (s *DuplicateService) Dismiss(ctx context.Context, a, b uuid.UUID) error {
+	if a == b {
+		return domain.ErrValidation
+	}
+	userID, isAdmin, _ := domain.UserFromContext(ctx)
+
+	// viewable loads the file and checks that the caller may view it.
+	viewable := func(fileID uuid.UUID) error {
+		file, err := s.files.GetByID(ctx, fileID)
+		if err != nil {
+			return err
+		}
+		allowed, err := s.acl.CanView(ctx, userID, isAdmin, file.CreatorID, file.IsPublic, fileObjectTypeID, fileID)
+		switch {
+		case err != nil:
+			return err
+		case !allowed:
+			return domain.ErrForbidden
+		}
+		return nil
+	}
+	if err := viewable(a); err != nil {
+		return err
+	}
+	if err := viewable(b); err != nil {
+		return err
+	}
+
+	if err := s.dismissals.Add(ctx, a, b, userID); err != nil {
+		return err
+	}
+	objType := fileObjectType
+	_ = s.audit.Log(ctx, "duplicate_dismiss", &objType, &a, map[string]any{"other": b.String()})
+	return nil
+}
+
+// Resolve merges a duplicate pair: the survivor (keep) takes the chosen fields
+// from the other (discard), and the other is trashed when DeleteDiscarded is set.
+// The caller must be able to edit both files. Returns the updated survivor.
+//
+// It returns domain.ErrValidation when Keep equals Discard or a field choice is
+// invalid, and domain.ErrForbidden when the caller cannot edit one of the
+// files. Both files are loaded and permission-checked before the transaction
+// starts; the scalar update, tag union, pool copy and soft delete then run
+// inside one transaction. The audit entry is written after the transaction
+// succeeds and its error is discarded.
+func (s *DuplicateService) Resolve(ctx context.Context, spec MergeSpec) (*domain.File, error) {
+	if spec.Keep == spec.Discard {
+		return nil, domain.ErrValidation
+	}
+	if err := spec.normalize(); err != nil {
+		return nil, err
+	}
+
+	keep, err := s.files.GetByID(ctx, spec.Keep)
+	if err != nil {
+		return nil, err
+	}
+	discard, err := s.files.GetByID(ctx, spec.Discard)
+	if err != nil {
+		return nil, err
+	}
+
+	// Edit permission is required on both files, not just the survivor.
+	userID, isAdmin, _ := domain.UserFromContext(ctx)
+	for _, f := range []*domain.File{keep, discard} {
+		ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, f.ID)
+		if err != nil {
+			return nil, err
+		}
+		if !ok {
+			return nil, domain.ErrForbidden
+		}
+	}
+
+	// FileRepo.Update rewrites all editable scalar columns, so build the complete
+	// resolved set (each field from keep or discard) rather than a sparse patch.
+	patch := &domain.File{
+		OriginalName:    pickPtr(spec.Fields.OriginalName, keep.OriginalName, discard.OriginalName),
+		Notes:           pickPtr(spec.Fields.Notes, keep.Notes, discard.Notes),
+		ContentDatetime: pickTime(spec.Fields.ContentDatetime, keep.ContentDatetime, discard.ContentDatetime),
+		IsPublic:        pickBool(spec.Fields.IsPublic, keep.IsPublic, discard.IsPublic),
+		Metadata:        pickMetadata(spec.Fields.Metadata, keep.Metadata, discard.Metadata),
+	}
+
+	var result *domain.File
+	txErr := s.tx.WithTx(ctx, func(ctx context.Context) error {
+		updated, err := s.files.Update(ctx, keep.ID, patch)
+		if err != nil {
+			return err
+		}
+
+		if spec.Fields.Tags == mergeBoth {
+			// Replace the survivor's tags with the union, then re-read them so the
+			// returned file carries the stored tag set.
+			if err := s.files.SetTags(ctx, keep.ID, unionTagIDs(keep.Tags, discard.Tags)); err != nil {
+				return err
+			}
+			tags, err := s.files.ListTags(ctx, keep.ID)
+			if err != nil {
+				return err
+			}
+			updated.Tags = tags
+		}
+		if spec.Fields.Pools == mergeBoth {
+			if err := s.files.CopyPoolMemberships(ctx, keep.ID, discard.ID); err != nil {
+				return err
+			}
+		}
+		if spec.DeleteDiscarded {
+			if err := s.files.SoftDelete(ctx, discard.ID); err != nil {
+				return err
+			}
+		}
+		// Only publish the survivor once every step has succeeded.
+		result = updated
+		return nil
+	})
+	if txErr != nil {
+		return nil, txErr
+	}
+
+	objType := fileObjectType
+	_ = s.audit.Log(ctx, "file_merge", &objType, &keep.ID, map[string]any{
+		"discard":           spec.Discard.String(),
+		"fields":            spec.Fields,
+		"deleted_discarded": spec.DeleteDiscarded,
+	})
+	return result, nil
+}
+
+// --- field pickers ---------------------------------------------------------
+
+// pickPtr selects the survivor's string pointer: discard's when choice is
+// "discard", keep's for any other choice. The chosen pointer is returned as-is
+// and may be nil.
+func pickPtr(choice string, keep, discard *string) *string {
+	chosen := keep
+	if choice == mergeDiscard {
+		chosen = discard
+	}
+	return chosen
+}
+
+// pickBool selects the survivor's flag: discard's when choice is "discard",
+// keep's for any other choice.
+func pickBool(choice string, keep, discard bool) bool {
+	switch choice {
+	case mergeDiscard:
+		return discard
+	default:
+		return keep
+	}
+}
+
+// pickTime selects the survivor's timestamp: discard's when choice is
+// "discard", keep's for any other choice.
+func pickTime(choice string, keep, discard time.Time) time.Time {
+	if choice != mergeDiscard {
+		return keep
+	}
+	return discard
+}
+
+// unionTagIDs returns the ids of the tags in a followed by those in b, keeping
+// only the first occurrence of each id. The result is non-nil even when both
+// inputs are empty.
+func unionTagIDs(a, b []domain.Tag) []uuid.UUID {
+	seen := make(map[uuid.UUID]bool, len(a)+len(b))
+	ids := make([]uuid.UUID, 0, len(a)+len(b))
+	for _, tags := range [][]domain.Tag{a, b} {
+		for _, tag := range tags {
+			if seen[tag.ID] {
+				continue
+			}
+			seen[tag.ID] = true
+			ids = append(ids, tag.ID)
+		}
+	}
+	return ids
+}
+
+// pickMetadata resolves the survivor's metadata according to choice:
+//   - "discard": the discarded file's metadata, verbatim;
+//   - "merge": a shallow merge of the two top-level JSON objects in which the
+//     survivor's keys win on conflict;
+//   - anything else: the survivor's metadata, verbatim.
+//
+// A merge is only defined between JSON objects. Empty or null metadata counts
+// as an empty object. If either side is any other non-object (array, scalar,
+// malformed JSON) the survivor's metadata is returned untouched, so it is never
+// silently replaced by the other file's keys.
+func pickMetadata(choice string, keep, discard json.RawMessage) json.RawMessage {
+	switch choice {
+	case mergeDiscard:
+		return discard
+	case mergeMerge:
+		// asObject decodes raw as a JSON object; ok is false when raw holds
+		// something that cannot be decoded as one.
+		asObject := func(raw json.RawMessage) (obj map[string]json.RawMessage, ok bool) {
+			if len(raw) == 0 {
+				return nil, true
+			}
+			if err := json.Unmarshal(raw, &obj); err != nil {
+				return nil, false
+			}
+			return obj, true
+		}
+		km, ok := asObject(keep)
+		if !ok {
+			return keep
+		}
+		dm, ok := asObject(discard)
+		if !ok {
+			return keep
+		}
+
+		out := make(map[string]json.RawMessage, len(km)+len(dm))
+		for k, v := range dm {
+			out[k] = v
+		}
+		for k, v := range km { // survivor wins
+			out[k] = v
+		}
+		if len(out) == 0 {
+			// Nothing to merge: keep the survivor's bytes exactly as stored.
+			return keep
+		}
+		b, err := json.Marshal(out)
+		if err != nil {
+			return keep
+		}
+		return b
+	default:
+		return keep
+	}
+}
@@ -0,0 +1,152 @@
+package service
+
+import (
+	"bytes"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"sort"
+	"testing"
+
+	"github.com/google/uuid"
+
+	"tanabata/backend/internal/domain"
+)
+
+// id returns a deterministic UUID whose last group is n, zero-padded to twelve
+// decimal digits. Larger n therefore sorts later in byte order, which lets
+// tests reason about the canonical (FileA < FileB) ordering buildPairs produces.
+func id(n int) uuid.UUID {
+	text := fmt.Sprintf("00000000-0000-0000-0000-%012d", n)
+	return uuid.MustParse(text)
+}
+
+func entry(n int, hash uint64) domain.PHashEntry {
+	return domain.PHashEntry{ID: id(n), PHash: int64(hash)}
+}
+
+// pairKey renders a pair as "smaller|larger|distance" so that pairs can be
+// compared as set members regardless of the order they were emitted in.
+func pairKey(p domain.DuplicatePair) string {
+	if bytes.Compare(p.FileA[:], p.FileB[:]) > 0 {
+		return fmt.Sprintf("%s|%s|%d", p.FileB, p.FileA, p.Distance)
+	}
+	return fmt.Sprintf("%s|%s|%d", p.FileA, p.FileB, p.Distance)
+}
+
+// TestBuildPairs_ThresholdAndCanonicalOrder checks that buildPairs honours the
+// distance threshold, emits each pair exactly once, and always orders a pair as
+// FileA < FileB.
+func TestBuildPairs_ThresholdAndCanonicalOrder(t *testing.T) {
+	entries := []domain.PHashEntry{
+		entry(1, 0x0000000000000000),
+		entry(2, 0x0000000000000001), // distance 1 from #1
+		entry(3, 0x00000000000000FF), // distance 8 from #1, 7 from #2
+		entry(4, 0xFFFFFFFFFFFFFFFF), // distance 64 from #1
+	}
+
+	// Tight threshold: only the distance-1 pair qualifies.
+	got := buildPairs(entries, 2, nil)
+	if len(got) != 1 {
+		t.Fatalf("threshold 2: got %d pairs, want 1: %+v", len(got), got)
+	}
+	if got[0].FileA != id(1) || got[0].FileB != id(2) || got[0].Distance != 1 {
+		t.Errorf("threshold 2: unexpected pair %+v", got[0])
+	}
+	// Canonical order always FileA < FileB.
+	if bytes.Compare(got[0].FileA[:], got[0].FileB[:]) >= 0 {
+		t.Error("pair not in canonical FileA < FileB order")
+	}
+
+	// Looser threshold pulls in #3's pairs but never #4.
+	got8 := buildPairs(entries, 8, nil)
+	want := map[string]bool{
+		pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(2), Distance: 1}): true,
+		pairKey(domain.DuplicatePair{FileA: id(1), FileB: id(3), Distance: 8}): true,
+		pairKey(domain.DuplicatePair{FileA: id(2), FileB: id(3), Distance: 7}): true,
+	}
+	if len(got8) != len(want) {
+		t.Fatalf("threshold 8: got %d pairs, want %d: %+v", len(got8), len(want), got8)
+	}
+	for _, p := range got8 {
+		if bytes.Compare(p.FileA[:], p.FileB[:]) >= 0 {
+			t.Errorf("threshold 8: pair not in canonical FileA < FileB order: %+v", p)
+		}
+		key := pairKey(p)
+		if !want[key] {
+			t.Errorf("threshold 8: unexpected or duplicate pair %+v", p)
+		}
+		// Consume the expectation so a repeated pair cannot stand in for a
+		// missing one.
+		delete(want, key)
+	}
+	for key := range want {
+		t.Errorf("threshold 8: missing pair %s", key)
+	}
+}
+
+func TestBuildPairs_IdenticalHashesPairAtDistanceZero(t *testing.T) {
+	entries := []domain.PHashEntry{
+		entry(1, 0xABCDABCDABCDABCD),
+		entry(2, 0xABCDABCDABCDABCD),
+	}
+	got := buildPairs(entries, 0, nil)
+	if len(got) != 1 || got[0].Distance != 0 || got[0].FileA != id(1) || got[0].FileB != id(2) {
+		t.Fatalf("identical hashes: got %+v, want one distance-0 pair (1,2)", got)
+	}
+}
+
+// TestClusterPairs_ConnectedComponents checks that pairs are grouped
+// transitively and that both the clusters and their members come back sorted.
+func TestClusterPairs_ConnectedComponents(t *testing.T) {
+	pairs := []domain.DuplicatePair{
+		{FileA: id(1), FileB: id(2)},
+		{FileA: id(2), FileB: id(3)}, // transitively joins 1-2-3
+		{FileA: id(5), FileB: id(6)},
+	}
+	clusters := clusterPairs(pairs)
+
+	// Sorted by smallest id: {1,2,3} then {5,6}.
+	want := [][]uuid.UUID{
+		{id(1), id(2), id(3)},
+		{id(5), id(6)},
+	}
+	if len(clusters) != len(want) {
+		t.Fatalf("got %d clusters, want %d: %+v", len(clusters), len(want), clusters)
+	}
+	// Compare every member, not just the ends of each cluster.
+	for i := range want {
+		if len(clusters[i]) != len(want[i]) {
+			t.Errorf("cluster %d = %v, want %v", i, clusters[i], want[i])
+			continue
+		}
+		for j := range want[i] {
+			if clusters[i][j] != want[i][j] {
+				t.Errorf("cluster %d = %v, want %v", i, clusters[i], want[i])
+				break
+			}
+		}
+	}
+	// Each cluster's ids are sorted.
+	for _, c := range clusters {
+		if !sort.SliceIsSorted(c, func(i, j int) bool { return bytes.Compare(c[i][:], c[j][:]) < 0 }) {
+			t.Errorf("cluster not sorted: %v", c)
+		}
+	}
+}
+
+// TestPickMetadata_Merge checks the shallow merge (survivor wins on shared
+// keys) and that "keep" / "discard" pass the chosen metadata through unchanged.
+func TestPickMetadata_Merge(t *testing.T) {
+	keep := json.RawMessage(`{"a":1,"b":2}`)
+	discard := json.RawMessage(`{"b":9,"c":3}`)
+
+	merged := pickMetadata(mergeMerge, keep, discard)
+	var got map[string]int
+	if err := json.Unmarshal(merged, &got); err != nil {
+		t.Fatalf("merge result not valid JSON: %v (%s)", err, merged)
+	}
+	want := map[string]int{"a": 1, "b": 2, "c": 3} // survivor wins on "b"
+	if fmt.Sprint(got) != fmt.Sprint(want) {
+		t.Errorf("merge = %v, want %v", got, want)
+	}
+
+	passthrough := []struct {
+		choice string
+		want   json.RawMessage
+		msg    string
+	}{
+		{mergeKeep, keep, "keep choice should return survivor metadata unchanged"},
+		{mergeDiscard, discard, "discard choice should return the other file's metadata"},
+	}
+	for _, tc := range passthrough {
+		if string(pickMetadata(tc.choice, keep, discard)) != string(tc.want) {
+			t.Error(tc.msg)
+		}
+	}
+}
+
+// TestMergeSpec_Normalize checks that empty choices default to "keep" and that
+// a value belonging to the wrong field kind is rejected with ErrValidation.
+func TestMergeSpec_Normalize(t *testing.T) {
+	// Empty fields default to "keep".
+	spec := MergeSpec{Keep: id(1), Discard: id(2)}
+	if err := spec.normalize(); err != nil {
+		t.Fatalf("normalize empty: %v", err)
+	}
+	defaulted := spec.Fields
+	if defaulted.OriginalName != mergeKeep || defaulted.Tags != mergeKeep || defaulted.Metadata != mergeKeep {
+		t.Errorf("empty fields not defaulted to keep: %+v", spec.Fields)
+	}
+
+	invalid := []struct {
+		label  string
+		fields MergeFields
+	}{
+		// "both" is invalid for a scalar field.
+		{"scalar=both", MergeFields{Notes: mergeBoth}},
+		// "discard" is invalid for a relation field.
+		{"relation=discard", MergeFields{Tags: mergeDiscard}},
+	}
+	for _, tc := range invalid {
+		bad := MergeSpec{Keep: id(1), Discard: id(2), Fields: tc.fields}
+		if err := bad.normalize(); !errors.Is(err, domain.ErrValidation) {
+			t.Errorf("%s: got %v, want ErrValidation", tc.label, err)
+		}
+	}
+}