feat(backend): perceptual hashing for images and video
Adds a 64-bit dHash perceptual hash (internal/imagehash, built on the existing disintegration/imaging — no new dependency) and starts populating the long-unused data.files.phash column: - Upload sets phash inline for images (cheap, from the in-memory bytes). - Replace recomputes it from new content for images and clears it for anything else, so a stale hash never survives a content swap. - FileRepo.SetPHash sets/clears the hash (used by Replace and, later, the dedup backfill). - DiskStorage.VideoFrameMiddle extracts a frame from the middle of a clip (ffprobe duration -> ffmpeg -ss duration/2), avoiding the shared-intro collision a fixed early offset causes. It is a concrete method, not part of the storage port: only the dedup CLI needs it, keeping ffmpeg off the upload path. Video phashes are therefore computed by that CLI, not at upload time. - DUPLICATE_HASH_THRESHOLD config (default 10/64) for the later pair rescan. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -63,6 +63,12 @@ type Config struct {
|
||||
// Import
|
||||
ImportPath string
|
||||
|
||||
// DuplicateHashThreshold is the maximum Hamming distance (out of 64) between
|
||||
// two perceptual hashes for the files to be treated as duplicate candidates.
|
||||
// Lower = stricter (fewer, more confident matches); higher = looser. Used only
|
||||
// by the dedup rescan that (re)builds data.duplicate_pairs.
|
||||
DuplicateHashThreshold int
|
||||
|
||||
// Static SPA. When set, the server serves the built frontend (and falls
|
||||
// back to index.html for client routes) on the same port as the API. Empty
|
||||
// in local development, where the Vite dev server serves the UI separately.
|
||||
@@ -176,6 +182,8 @@ func Load() (*Config, error) {
|
||||
|
||||
ImportPath: requireStr("IMPORT_PATH"),
|
||||
|
||||
DuplicateHashThreshold: parseInt("DUPLICATE_HASH_THRESHOLD", 10),
|
||||
|
||||
StaticDir: defaultStr("STATIC_DIR", ""),
|
||||
}
|
||||
|
||||
|
||||
@@ -434,6 +434,18 @@ func (r *FileRepo) SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bo
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetPHash sets (or clears, when phash is nil) the perceptual hash of a file.
|
||||
// Used by the dedup backfill and on content replacement; phash is non-critical,
|
||||
// recomputable metadata, so callers may treat failures as best-effort.
|
||||
func (r *FileRepo) SetPHash(ctx context.Context, id uuid.UUID, phash *int64) error {
|
||||
const sqlStr = `UPDATE data.files SET phash = $2 WHERE id = $1`
|
||||
q := connOrTx(ctx, r.pool)
|
||||
if _, err := q.Exec(ctx, sqlStr, id, phash); err != nil {
|
||||
return fmt.Errorf("FileRepo.SetPHash: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// SoftDelete / Restore / DeletePermanent
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
// Package imagehash computes a 64-bit perceptual hash (dHash) of an image and
|
||||
// compares two hashes by Hamming distance. It is used for near-duplicate
|
||||
// detection: visually similar images (re-encoded, resized, recompressed) produce
|
||||
// hashes a small distance apart, while unrelated images are far apart.
|
||||
//
|
||||
// dHash is chosen for its robustness and simplicity: the image is reduced to a
|
||||
// 9×8 grayscale and each pixel is compared to its right-hand neighbour, yielding
|
||||
// 64 gradient-direction bits. It tolerates scaling and brightness/contrast
|
||||
// changes well, which is exactly what re-encoded duplicates exhibit.
|
||||
package imagehash
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
_ "image/gif" // register GIF decoder
|
||||
_ "image/jpeg" // register JPEG decoder
|
||||
_ "image/png" // register PNG decoder
|
||||
"math/bits"
|
||||
|
||||
"github.com/disintegration/imaging"
|
||||
_ "golang.org/x/image/webp" // register WebP decoder
|
||||
)
|
||||
|
||||
// hashHeight is the number of rows in the reduced grayscale used for dHash.
const hashHeight = 8

// hashWidth is the number of columns in the reduced grayscale: one more than
// hashHeight, so that every one of the 8 compared pixels in a row has a
// right-hand neighbour (8 rows × 8 comparisons = 64 hash bits).
const hashWidth = hashHeight + 1
|
||||
|
||||
// FromImage reduces img to a 9×8 grayscale and returns its 64-bit dHash. The
|
||||
// uint64 of gradient bits is returned as int64 (a plain bit reinterpretation) so
|
||||
// it fits PostgreSQL's bigint; equality and Distance are bitwise, so the signed
|
||||
// interpretation never matters.
|
||||
func FromImage(img image.Image) int64 {
|
||||
small := imaging.Grayscale(imaging.Resize(img, hashWidth, hashHeight, imaging.Lanczos))
|
||||
|
||||
var hash uint64
|
||||
bit := 0
|
||||
for y := 0; y < hashHeight; y++ {
|
||||
for x := 0; x < hashHeight; x++ {
|
||||
// After Grayscale, R == G == B, so the red channel is the luminance.
|
||||
left := small.Pix[small.PixOffset(x, y)]
|
||||
right := small.Pix[small.PixOffset(x+1, y)]
|
||||
if left < right {
|
||||
hash |= 1 << uint(63-bit)
|
||||
}
|
||||
bit++
|
||||
}
|
||||
}
|
||||
return int64(hash)
|
||||
}
|
||||
|
||||
// FromBytes decodes data (JPEG/PNG/GIF/WebP) and returns its dHash. ok is false
|
||||
// when the bytes are not a decodable image, so callers can simply skip hashing
|
||||
// (e.g. leave phash NULL) rather than fail.
|
||||
func FromBytes(data []byte) (hash int64, ok bool) {
|
||||
img, _, err := image.Decode(bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return FromImage(img), true
|
||||
}
|
||||
|
||||
// Distance reports how many bit positions differ between a and b — their
// Hamming distance, in the range 0–64. Equal hashes give 0; near-duplicates
// give small values.
func Distance(a, b int64) int {
	differing := uint64(a ^ b)
	return bits.OnesCount64(differing)
}
|
||||
@@ -0,0 +1,99 @@
|
||||
package imagehash
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"image"
|
||||
"image/color"
|
||||
"image/jpeg"
|
||||
"image/png"
|
||||
"math"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// radial builds a w×h opaque gray image that is brightest at (cx, cy) and
// darkens linearly with distance from that point, normalised by the image
// diagonal. Smooth gradients like this are the realistic case for perceptual
// hashing and hold up under JPEG re-encoding, which makes them stable fixtures.
func radial(w, h int, cx, cy float64) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, w, h))
	diagonal := math.Hypot(float64(w), float64(h))

	for row := 0; row < h; row++ {
		dy := float64(row) - cy
		for col := 0; col < w; col++ {
			dist := math.Hypot(float64(col)-cx, dy)
			level := uint8(255 * (1 - dist/diagonal))
			canvas.SetRGBA(col, row, color.RGBA{R: level, G: level, B: level, A: 255})
		}
	}
	return canvas
}
|
||||
|
||||
// encodePNG returns img serialised as PNG, aborting the test on encoder failure.
func encodePNG(t *testing.T, img image.Image) []byte {
	t.Helper()

	out := new(bytes.Buffer)
	err := png.Encode(out, img)
	if err != nil {
		t.Fatalf("png encode: %v", err)
	}
	return out.Bytes()
}
|
||||
|
||||
// encodeJPEG returns img serialised as JPEG at the given quality, aborting the
// test on encoder failure.
func encodeJPEG(t *testing.T, img image.Image, quality int) []byte {
	t.Helper()

	out := new(bytes.Buffer)
	opts := jpeg.Options{Quality: quality}
	err := jpeg.Encode(out, img, &opts)
	if err != nil {
		t.Fatalf("jpeg encode: %v", err)
	}
	return out.Bytes()
}
|
||||
|
||||
// The same image re-encoded as PNG (lossless) and JPEG (lossy) must hash to a
|
||||
// small Hamming distance — that is the whole point of a perceptual hash.
|
||||
func TestFromBytes_SameImageAcrossEncodings(t *testing.T) {
|
||||
img := radial(64, 64, 32, 32)
|
||||
|
||||
pngHash, ok := FromBytes(encodePNG(t, img))
|
||||
if !ok {
|
||||
t.Fatal("FromBytes(PNG): ok=false")
|
||||
}
|
||||
jpgHash, ok := FromBytes(encodeJPEG(t, img, 90))
|
||||
if !ok {
|
||||
t.Fatal("FromBytes(JPEG): ok=false")
|
||||
}
|
||||
|
||||
if d := Distance(pngHash, jpgHash); d > 8 {
|
||||
t.Errorf("same image, different encodings: distance = %d, want <= 8", d)
|
||||
}
|
||||
}
|
||||
|
||||
// Visually different images must be far apart, and clearly farther than the same
|
||||
// image across encodings.
|
||||
func TestDistance_DifferentImagesAreFarApart(t *testing.T) {
|
||||
a := FromImage(radial(64, 64, 32, 32)) // centred
|
||||
b := FromImage(radial(64, 64, 0, 0)) // corner
|
||||
|
||||
same, _ := FromBytes(encodeJPEG(t, radial(64, 64, 32, 32), 90))
|
||||
|
||||
d := Distance(a, b)
|
||||
if d < 12 {
|
||||
t.Errorf("different images: distance = %d, want >= 12", d)
|
||||
}
|
||||
if d <= Distance(a, same) {
|
||||
t.Errorf("different images (%d) not farther than re-encoded same image (%d)", d, Distance(a, same))
|
||||
}
|
||||
}
|
||||
|
||||
func TestDistance_SymmetricAndZeroForEqual(t *testing.T) {
|
||||
a := FromImage(radial(64, 64, 20, 40))
|
||||
b := FromImage(radial(64, 64, 40, 20))
|
||||
|
||||
if Distance(a, a) != 0 {
|
||||
t.Errorf("Distance(a, a) = %d, want 0", Distance(a, a))
|
||||
}
|
||||
if Distance(a, b) != Distance(b, a) {
|
||||
t.Errorf("Distance not symmetric: %d vs %d", Distance(a, b), Distance(b, a))
|
||||
}
|
||||
}
|
||||
|
||||
func TestFromBytes_RejectsNonImage(t *testing.T) {
|
||||
if _, ok := FromBytes([]byte("definitely not an image")); ok {
|
||||
t.Error("FromBytes on garbage: ok=true, want false")
|
||||
}
|
||||
}
|
||||
@@ -50,6 +50,8 @@ type FileRepo interface {
|
||||
Update(ctx context.Context, id uuid.UUID, f *domain.File) (*domain.File, error)
|
||||
// SetNeedsReview sets the review status on the given (non-trashed) files.
|
||||
SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bool) error
|
||||
// SetPHash sets (or clears, when nil) the perceptual hash of a file.
|
||||
SetPHash(ctx context.Context, id uuid.UUID, phash *int64) error
|
||||
// SoftDelete moves a file to trash (sets is_deleted = true).
|
||||
SoftDelete(ctx context.Context, id uuid.UUID) error
|
||||
// Restore moves a file out of trash (sets is_deleted = false).
|
||||
|
||||
@@ -16,6 +16,7 @@ import (
|
||||
"github.com/google/uuid"
|
||||
|
||||
"tanabata/backend/internal/domain"
|
||||
"tanabata/backend/internal/imagehash"
|
||||
"tanabata/backend/internal/port"
|
||||
)
|
||||
|
||||
@@ -154,6 +155,17 @@ func (s *FileService) Upload(ctx context.Context, p UploadParams) (*domain.File,
|
||||
}
|
||||
exifData, exifDatetime := extractMetadata(data, origName, p.ContentDatetimeFallback)
|
||||
|
||||
// Compute a perceptual hash for images so duplicate detection can later match
|
||||
// near-identical files. Best-effort: a decode failure just leaves phash unset
|
||||
// (the dedup CLI backfills it). Video is hashed by that CLI, not inline, to keep
|
||||
// ffmpeg off the upload path.
|
||||
var phash *int64
|
||||
if strings.HasPrefix(mime.Name, "image/") {
|
||||
if h, ok := imagehash.FromBytes(data); ok {
|
||||
phash = &h
|
||||
}
|
||||
}
|
||||
|
||||
// Resolve content datetime: explicit > metadata date > fallback (e.g. import mtime) > zero.
|
||||
var contentDatetime time.Time
|
||||
if p.ContentDatetime != nil {
|
||||
@@ -187,6 +199,7 @@ func (s *FileService) Upload(ctx context.Context, p UploadParams) (*domain.File,
|
||||
Notes: p.Notes,
|
||||
Metadata: p.Metadata,
|
||||
EXIF: exifData,
|
||||
PHash: phash,
|
||||
CreatorID: userID,
|
||||
IsPublic: p.IsPublic,
|
||||
}
|
||||
@@ -453,6 +466,18 @@ func (s *FileService) Replace(ctx context.Context, id uuid.UUID, p UploadParams)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Recompute the perceptual hash from the new content: images inline, anything
|
||||
// else cleared to NULL so the old content's hash never lingers (the dedup CLI
|
||||
// recomputes video). Best-effort, like on upload — phash is recomputable.
|
||||
var phash *int64
|
||||
if strings.HasPrefix(mime.Name, "image/") {
|
||||
if h, ok := imagehash.FromBytes(data); ok {
|
||||
phash = &h
|
||||
}
|
||||
}
|
||||
_ = s.files.SetPHash(ctx, id, phash)
|
||||
updated.PHash = phash
|
||||
|
||||
objType := fileObjectType
|
||||
_ = s.audit.Log(ctx, "file_replace", &objType, &id, nil)
|
||||
return updated, nil
|
||||
|
||||
@@ -16,6 +16,8 @@ import (
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/disintegration/imaging"
|
||||
@@ -155,6 +157,30 @@ func (s *DiskStorage) Preview(ctx context.Context, id uuid.UUID) (io.ReadCloser,
|
||||
return s.serveGenerated(ctx, id, s.previewCachePath(id), s.previewWidth, s.previewHeight)
|
||||
}
|
||||
|
||||
// VideoFrameMiddle decodes a representative frame from the middle of a video
|
||||
// (duration/2). The midpoint avoids the shared intros, title cards and black
|
||||
// lead-in frames that make a fixed early offset collide across unrelated clips,
|
||||
// so it is the right source for the video's perceptual (duplicate-detection)
|
||||
// hash. The file must already exist in storage; ffmpeg/ffprobe must be installed.
|
||||
// This is not part of port.FileStorage — only the dedup CLI needs it, with a
|
||||
// concrete *DiskStorage — so the interface stays lean and ffmpeg stays out of the
|
||||
// upload path.
|
||||
func (s *DiskStorage) VideoFrameMiddle(ctx context.Context, id uuid.UUID) (image.Image, error) {
|
||||
srcPath := s.originalPath(id)
|
||||
if _, err := os.Stat(srcPath); err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return nil, domain.ErrNotFound
|
||||
}
|
||||
return nil, fmt.Errorf("storage: stat %q: %w", srcPath, err)
|
||||
}
|
||||
// Fall back to a 1s offset if duration can't be probed — better a frame than none.
|
||||
at := 1.0
|
||||
if d, err := videoDurationSeconds(ctx, srcPath); err == nil && d > 0 {
|
||||
at = d / 2
|
||||
}
|
||||
return extractVideoFrameAt(ctx, srcPath, at)
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
@@ -342,19 +368,25 @@ func (s *DiskStorage) vipsThumbnail(ctx context.Context, srcPath, cachePath stri
|
||||
return f, nil
|
||||
}
|
||||
|
||||
// extractVideoFrame uses ffmpeg to extract a single frame from a video file.
|
||||
// It seeks 1 second in (keyframe-accurate fast seek) and pipes the frame out
|
||||
// as PNG. If the video is shorter than 1 s the seek is silently ignored by
|
||||
// ffmpeg and the first available frame is returned instead.
|
||||
// Returns an error if ffmpeg is not installed or produces no output. The run is
|
||||
// bounded by a timeout so a malformed file cannot hang the request indefinitely.
|
||||
// extractVideoFrame extracts a single frame ~1 second into the video — a safe
|
||||
// default for thumbnails. See extractVideoFrameAt for the mechanics.
|
||||
func extractVideoFrame(ctx context.Context, srcPath string) (image.Image, error) {
|
||||
return extractVideoFrameAt(ctx, srcPath, 1)
|
||||
}
|
||||
|
||||
// extractVideoFrameAt uses ffmpeg to extract a single frame at atSec seconds into
|
||||
// the video, piped out as PNG. The fast input seek (-ss before -i) is keyframe-
|
||||
// accurate and cheap; if atSec is past the end the seek is silently ignored and
|
||||
// the first available frame is returned instead. Returns an error if ffmpeg is
|
||||
// not installed or produces no output. The run is bounded by a timeout so a
|
||||
// malformed file cannot hang the caller indefinitely.
|
||||
func extractVideoFrameAt(ctx context.Context, srcPath string, atSec float64) (image.Image, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
|
||||
var out bytes.Buffer
|
||||
cmd := exec.CommandContext(ctx, "ffmpeg",
|
||||
"-ss", "1", // fast input seek; ignored gracefully on short files
|
||||
"-ss", strconv.FormatFloat(atSec, 'f', 3, 64), // fast input seek; ignored gracefully past end
|
||||
"-i", srcPath,
|
||||
"-vframes", "1",
|
||||
"-f", "image2",
|
||||
@@ -370,6 +402,29 @@ func extractVideoFrame(ctx context.Context, srcPath string) (image.Image, error)
|
||||
return imaging.Decode(&out)
|
||||
}
|
||||
|
||||
// videoDurationSeconds runs ffprobe on srcPath and returns the format-level
// duration it prints, in seconds. The probe is bounded by a 30-second timeout
// derived from ctx. An error is returned when ffprobe cannot be run or exits
// unsuccessfully, or when its trimmed output does not parse as a float.
// Used to seek to the middle of a clip for perceptual hashing.
func videoDurationSeconds(ctx context.Context, srcPath string) (float64, error) {
	probeCtx, stop := context.WithTimeout(ctx, 30*time.Second)
	defer stop()

	args := []string{
		"-v", "error",
		"-show_entries", "format=duration",
		"-of", "default=noprint_wrappers=1:nokey=1",
		srcPath,
	}
	raw, err := exec.CommandContext(probeCtx, "ffprobe", args...).Output()
	if err != nil {
		return 0, fmt.Errorf("ffprobe duration: %w", err)
	}

	seconds, err := strconv.ParseFloat(strings.TrimSpace(string(raw)), 64)
	if err != nil {
		return 0, fmt.Errorf("ffprobe duration parse %q: %w", raw, err)
	}
	return seconds, nil
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Path helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user