feat(backend): perceptual hashing for images and video

Adds a 64-bit dHash perceptual hash (internal/imagehash, built on the existing
disintegration/imaging — no new dependency) and starts populating the long-unused
data.files.phash column:

- Upload sets phash inline for images (cheap, from the in-memory bytes).
- Replace recomputes it from new content for images and clears it for anything
  else, so a stale hash never survives a content swap.
- FileRepo.SetPHash sets/clears the hash (used by Replace and, later, the dedup
  backfill).
- DiskStorage.VideoFrameMiddle extracts a frame from the middle of a clip
  (ffprobe duration -> ffmpeg -ss duration/2), avoiding the shared-intro collision
  a fixed early offset causes. It is a concrete method, not part of the storage
  port: only the dedup CLI needs it, keeping ffmpeg off the upload path. Video
  phashes are therefore computed by that CLI, not at upload time.
- DUPLICATE_HASH_THRESHOLD config (default 10/64) for the later pair rescan.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-16 12:20:52 +03:00
parent 58cea88f52
commit 88849cc16b
7 changed files with 278 additions and 7 deletions
+70
View File
@@ -0,0 +1,70 @@
// Package imagehash computes a 64-bit perceptual hash (dHash) of an image and
// compares two hashes by Hamming distance. It is used for near-duplicate
// detection: visually similar images (re-encoded, resized, recompressed) produce
// hashes a small distance apart, while unrelated images are far apart.
//
// dHash is chosen for its robustness and simplicity: the image is reduced to a
// 9×8 grayscale and each pixel is compared to its right-hand neighbour, yielding
// 64 gradient-direction bits. It tolerates scaling and brightness/contrast
// changes well, which is exactly what re-encoded duplicates exhibit.
package imagehash
import (
"bytes"
"image"
_ "image/gif" // register GIF decoder
_ "image/jpeg" // register JPEG decoder
_ "image/png" // register PNG decoder
"math/bits"
"github.com/disintegration/imaging"
_ "golang.org/x/image/webp" // register WebP decoder
)
// hashHeight is the number of rows in the reduced grayscale used for dHash, and
// also the number of horizontal comparisons made per row (8 × 8 = 64 bits).
const hashHeight = 8

// hashWidth is the number of columns in the reduced grayscale. It is one more
// than hashHeight so that the last compared pixel of every row still has a
// right-hand neighbour.
const hashWidth = hashHeight + 1
// FromImage returns the 64-bit dHash of img.
//
// The image is resized to hashWidth×hashHeight (9×8) with a Lanczos filter and
// converted to grayscale. Rows are scanned top to bottom and, within a row,
// left to right; each comparison contributes one bit, most significant bit
// first. A bit is set when a pixel is strictly darker than its right-hand
// neighbour, so equal neighbours yield a 0 bit.
//
// The uint64 of gradient bits is returned as int64 (a plain bit
// reinterpretation) so it fits PostgreSQL's bigint; equality and Distance are
// bitwise, so the signed interpretation never matters.
//
// A nil image or one with empty bounds has no gradients to measure and hashes
// to 0 (the same value a perfectly flat image produces) instead of panicking.
func FromImage(img image.Image) int64 {
	// Guard against empty sources: there is nothing to resize, and the pixel
	// indexing below would otherwise run past an empty buffer.
	if img == nil || img.Bounds().Empty() {
		return 0
	}

	small := imaging.Grayscale(imaging.Resize(img, hashWidth, hashHeight, imaging.Lanczos))

	var hash uint64
	for y := 0; y < hashHeight; y++ {
		for x := 0; x < hashHeight; x++ {
			// After Grayscale, R == G == B, so the red channel (the first byte
			// of the pixel) is the luminance.
			left := small.Pix[small.PixOffset(x, y)]
			right := small.Pix[small.PixOffset(x+1, y)]

			// Shift-accumulate: after all 64 comparisons the first one sits
			// in bit 63 and the last one in bit 0.
			hash <<= 1
			if left < right {
				hash |= 1
			}
		}
	}
	return int64(hash)
}
// FromBytes decodes data with whichever image decoder is registered for its
// format (this file registers JPEG, PNG, GIF and WebP) and returns the dHash of
// the decoded image.
//
// ok reports whether decoding succeeded. When it is false the returned hash is
// 0 and carries no meaning, so callers can skip hashing (e.g. leave phash NULL)
// instead of treating undecodable bytes as an error.
func FromBytes(data []byte) (hash int64, ok bool) {
	decoded, _, decodeErr := image.Decode(bytes.NewReader(data))
	if decodeErr == nil {
		hash, ok = FromImage(decoded), true
	}
	return hash, ok
}
// Distance returns the Hamming distance (0–64) between two hashes, i.e. the
// number of bit positions in which a and b differ. 0 means the hashes are
// identical; small values indicate near-duplicates.
func Distance(a, b int64) int {
	// XOR leaves a 1 exactly where the two hashes disagree; the sign of the
	// int64 is irrelevant because only the bit pattern is counted.
	differing := uint64(a ^ b)
	return bits.OnesCount64(differing)
}
@@ -0,0 +1,99 @@
package imagehash
import (
"bytes"
"image"
"image/color"
"image/jpeg"
"image/png"
"math"
"testing"
)
// radial builds a w×h opaque grayscale test image that is brightest at
// (cx, cy) and darkens linearly with distance from that point, the distance
// being normalised by the image diagonal. Smooth gradients are the realistic
// case for perceptual hashing and survive JPEG re-encoding well, so they make
// stable test fixtures.
func radial(w, h int, cx, cy float64) image.Image {
	canvas := image.NewRGBA(image.Rect(0, 0, w, h))
	diagonal := math.Hypot(float64(w), float64(h))
	for row := 0; row < h; row++ {
		dy := float64(row) - cy
		for col := 0; col < w; col++ {
			dist := math.Hypot(float64(col)-cx, dy)
			level := uint8(255 * (1 - dist/diagonal))
			canvas.SetRGBA(col, row, color.RGBA{R: level, G: level, B: level, A: 255})
		}
	}
	return canvas
}
// encodePNG returns img encoded as PNG, failing the test immediately if the
// encoder reports an error.
func encodePNG(t *testing.T, img image.Image) []byte {
	t.Helper()
	out := new(bytes.Buffer)
	err := png.Encode(out, img)
	if err != nil {
		t.Fatalf("png encode: %v", err)
	}
	return out.Bytes()
}
// encodeJPEG returns img encoded as JPEG at the given quality, failing the test
// immediately if the encoder reports an error.
func encodeJPEG(t *testing.T, img image.Image, quality int) []byte {
	t.Helper()
	out := new(bytes.Buffer)
	opts := jpeg.Options{Quality: quality}
	err := jpeg.Encode(out, img, &opts)
	if err != nil {
		t.Fatalf("jpeg encode: %v", err)
	}
	return out.Bytes()
}
// TestFromBytes_SameImageAcrossEncodings hashes one fixture twice, once from
// its lossless PNG encoding and once from a lossy JPEG encoding, and requires
// the two hashes to be at most 8 bits apart. Staying close across re-encodings
// is the whole point of a perceptual hash.
func TestFromBytes_SameImageAcrossEncodings(t *testing.T) {
	fixture := radial(64, 64, 32, 32)

	lossless, decoded := FromBytes(encodePNG(t, fixture))
	if !decoded {
		t.Fatal("FromBytes(PNG): ok=false")
	}

	lossy, decoded := FromBytes(encodeJPEG(t, fixture, 90))
	if !decoded {
		t.Fatal("FromBytes(JPEG): ok=false")
	}

	gap := Distance(lossless, lossy)
	if gap <= 8 {
		return
	}
	t.Errorf("same image, different encodings: distance = %d, want <= 8", gap)
}
// TestDistance_DifferentImagesAreFarApart checks that two visually different
// fixtures (gradient centred vs. gradient anchored in a corner) hash at least
// 12 bits apart, and clearly farther apart than the centred fixture is from its
// own JPEG re-encoding.
func TestDistance_DifferentImagesAreFarApart(t *testing.T) {
	a := FromImage(radial(64, 64, 32, 32)) // centred
	b := FromImage(radial(64, 64, 0, 0))   // corner

	// The re-encoded baseline must actually decode; otherwise same would be a
	// meaningless 0 and the comparison below would prove nothing.
	same, ok := FromBytes(encodeJPEG(t, radial(64, 64, 32, 32), 90))
	if !ok {
		t.Fatal("FromBytes(JPEG): ok=false")
	}

	d := Distance(a, b)
	if d < 12 {
		t.Errorf("different images: distance = %d, want >= 12", d)
	}
	if baseline := Distance(a, same); d <= baseline {
		t.Errorf("different images (%d) not farther than re-encoded same image (%d)", d, baseline)
	}
}
// TestDistance_SymmetricAndZeroForEqual checks two basic metric properties of
// Distance: a hash is at distance 0 from itself, and swapping the arguments
// does not change the result.
func TestDistance_SymmetricAndZeroForEqual(t *testing.T) {
	first := FromImage(radial(64, 64, 20, 40))
	second := FromImage(radial(64, 64, 40, 20))

	if self := Distance(first, first); self != 0 {
		t.Errorf("Distance(a, a) = %d, want 0", self)
	}

	forward, backward := Distance(first, second), Distance(second, first)
	if forward != backward {
		t.Errorf("Distance not symmetric: %d vs %d", forward, backward)
	}
}
// TestFromBytes_RejectsNonImage checks that bytes which are not a decodable
// image make FromBytes report ok=false.
func TestFromBytes_RejectsNonImage(t *testing.T) {
	garbage := []byte("definitely not an image")
	_, ok := FromBytes(garbage)
	if ok {
		t.Error("FromBytes on garbage: ok=true, want false")
	}
}