Files
tanabata/backend/internal/service/file_service.go
T
H1K0 88849cc16b feat(backend): perceptual hashing for images and video
Adds a 64-bit dHash perceptual hash (internal/imagehash, built on the existing
disintegration/imaging — no new dependency) and starts populating the long-unused
data.files.phash column:

- Upload sets phash inline for images (cheap, from the in-memory bytes).
- Replace recomputes it from new content for images and clears it for anything
  else, so a stale hash never survives a content swap.
- FileRepo.SetPHash sets/clears the hash (used by Replace and, later, the dedup
  backfill).
- DiskStorage.VideoFrameMiddle extracts a frame from the middle of a clip
  (ffprobe duration -> ffmpeg -ss duration/2), avoiding the shared-intro collision
  a fixed early offset causes. It is a concrete method, not part of the storage
  port: only the dedup CLI needs it, keeping ffmpeg off the upload path. Video
  phashes are therefore computed by that CLI, not at upload time.
- DUPLICATE_HASH_THRESHOLD config (default 10/64) for the later pair rescan.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-16 12:20:52 +03:00

798 lines
23 KiB
Go

package service
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/gabriel-vasile/mimetype"
"github.com/google/uuid"
"tanabata/backend/internal/domain"
"tanabata/backend/internal/imagehash"
"tanabata/backend/internal/port"
)
const fileObjectType = "file"
// fileObjectTypeID is the primary key of the "file" row in core.object_types.
// It matches the first value inserted in 007_seed_data.sql.
const fileObjectTypeID int16 = 1
// UploadParams holds the parameters for uploading a new file.
type UploadParams struct {
Reader io.Reader
MIMEType string
OriginalName *string
Notes *string
Metadata json.RawMessage
ContentDatetime *time.Time
// ContentDatetimeFallback is used for content_datetime only when neither an
// explicit ContentDatetime nor an EXIF date is available (e.g. the source
// file's mtime on a server-side import).
ContentDatetimeFallback *time.Time
IsPublic bool
TagIDs []uuid.UUID
}
// UpdateParams holds the parameters for updating file metadata.
type UpdateParams struct {
OriginalName *string
Notes *string
Metadata json.RawMessage
ContentDatetime *time.Time
IsPublic *bool
TagIDs *[]uuid.UUID // nil means don't change tags
}
// ContentResult holds the open reader and metadata for a file download.
type ContentResult struct {
Body io.ReadCloser
MIMEType string
OriginalName *string
}
// ImportFileError records a failed file during an import operation.
type ImportFileError struct {
Filename string `json:"filename"`
Reason string `json:"reason"`
}
// ImportResult summarises a directory import.
type ImportResult struct {
Imported int `json:"imported"`
Skipped int `json:"skipped"`
Errors []ImportFileError `json:"errors"`
}
// ImportEvent is one progress message streamed during an import, letting the UI
// show a live progress bar and a per-file status list. Type is the discriminator:
//
// "start" — total is the number of entries about to be processed.
// "file" — one entry finished: index (1-based), filename, status, optional reason.
// "done" — final tallies (imported/skipped/errors).
type ImportEvent struct {
Type string `json:"type"`
Total int `json:"total,omitempty"`
Index int `json:"index,omitempty"`
Filename string `json:"filename,omitempty"`
Status string `json:"status,omitempty"` // "imported" | "skipped" | "error"
Reason string `json:"reason,omitempty"`
Imported int `json:"imported,omitempty"`
Skipped int `json:"skipped,omitempty"`
Errors int `json:"errors,omitempty"`
}
// FileService handles business logic for file records.
type FileService struct {
files port.FileRepo
mimes port.MimeRepo
storage port.FileStorage
acl *ACLService
audit *AuditService
tags *TagService
tx port.Transactor
importPath string // default server-side import directory
}
// NewFileService creates a FileService.
func NewFileService(
files port.FileRepo,
mimes port.MimeRepo,
storage port.FileStorage,
acl *ACLService,
audit *AuditService,
tags *TagService,
tx port.Transactor,
importPath string,
) *FileService {
return &FileService{
files: files,
mimes: mimes,
storage: storage,
acl: acl,
audit: audit,
tags: tags,
tx: tx,
importPath: importPath,
}
}
// ---------------------------------------------------------------------------
// Core CRUD
// ---------------------------------------------------------------------------
// Upload validates the MIME type, saves the file to storage, creates the DB
// record, and applies any initial tags — all within a single transaction.
// If ContentDatetime is nil and the metadata carries a capture date, it is used.
func (s *FileService) Upload(ctx context.Context, p UploadParams) (*domain.File, error) {
userID, _, _ := domain.UserFromContext(ctx)
// Validate MIME type against the whitelist.
mime, err := s.mimes.GetByName(ctx, p.MIMEType)
if err != nil {
return nil, err // ErrUnsupportedMIME or DB error
}
// Buffer the upload so we can extract EXIF without re-reading storage.
var buf bytes.Buffer
if _, err := io.Copy(&buf, p.Reader); err != nil {
return nil, fmt.Errorf("FileService.Upload: read body: %w", err)
}
data := buf.Bytes()
// Extract rich metadata (best-effort; covers images, video and audio).
var origName string
if p.OriginalName != nil {
origName = *p.OriginalName
}
exifData, exifDatetime := extractMetadata(data, origName, p.ContentDatetimeFallback)
// Compute a perceptual hash for images so duplicate detection can later match
// near-identical files. Best-effort: a decode failure just leaves phash unset
// (the dedup CLI backfills it). Video is hashed by that CLI, not inline, to keep
// ffmpeg off the upload path.
var phash *int64
if strings.HasPrefix(mime.Name, "image/") {
if h, ok := imagehash.FromBytes(data); ok {
phash = &h
}
}
// Resolve content datetime: explicit > metadata date > fallback (e.g. import mtime) > zero.
var contentDatetime time.Time
if p.ContentDatetime != nil {
contentDatetime = *p.ContentDatetime
} else if exifDatetime != nil {
contentDatetime = *exifDatetime
} else if p.ContentDatetimeFallback != nil {
contentDatetime = *p.ContentDatetimeFallback
}
// Assign UUID v7 so CreatedAt can be derived from it later.
fileID, err := uuid.NewV7()
if err != nil {
return nil, fmt.Errorf("FileService.Upload: generate UUID: %w", err)
}
// Save file bytes to disk before opening the transaction so that a disk
// failure does not abort an otherwise healthy DB transaction.
if _, err := s.storage.Save(ctx, fileID, bytes.NewReader(data)); err != nil {
return nil, fmt.Errorf("FileService.Upload: save to storage: %w", err)
}
var created *domain.File
txErr := s.tx.WithTx(ctx, func(ctx context.Context) error {
f := &domain.File{
ID: fileID,
OriginalName: p.OriginalName,
MIMEType: mime.Name,
MIMEExtension: mime.Extension,
ContentDatetime: contentDatetime,
Notes: p.Notes,
Metadata: p.Metadata,
EXIF: exifData,
PHash: phash,
CreatorID: userID,
IsPublic: p.IsPublic,
}
var createErr error
created, createErr = s.files.Create(ctx, f)
if createErr != nil {
return createErr
}
if len(p.TagIDs) > 0 {
tags, err := s.tags.SetFileTags(ctx, created.ID, p.TagIDs)
if err != nil {
return err
}
created.Tags = tags
}
return nil
})
if txErr != nil {
// Attempt to clean up the orphaned file; ignore cleanup errors.
_ = s.storage.Delete(ctx, fileID)
return nil, txErr
}
objType := fileObjectType
_ = s.audit.Log(ctx, "file_create", &objType, &created.ID, nil)
return created, nil
}
// Get returns a file by ID, enforcing view ACL.
func (s *FileService) Get(ctx context.Context, id uuid.UUID) (*domain.File, error) {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return nil, err
}
ok, err := s.acl.CanView(ctx, userID, isAdmin, f.CreatorID, f.IsPublic, fileObjectTypeID, id)
if err != nil {
return nil, err
}
if !ok {
return nil, domain.ErrForbidden
}
return f, nil
}
// RecordView appends a view-history entry for the current user, enforcing view
// ACL (you can only record a view of a file you may see).
func (s *FileService) RecordView(ctx context.Context, id uuid.UUID) error {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return err
}
ok, err := s.acl.CanView(ctx, userID, isAdmin, f.CreatorID, f.IsPublic, fileObjectTypeID, id)
if err != nil {
return err
}
if !ok {
return domain.ErrForbidden
}
return s.files.RecordView(ctx, id, userID)
}
// Update applies metadata changes to a file, enforcing edit ACL.
func (s *FileService) Update(ctx context.Context, id uuid.UUID, p UpdateParams) (*domain.File, error) {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return nil, err
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return nil, err
}
if !ok {
return nil, domain.ErrForbidden
}
patch := &domain.File{}
if p.OriginalName != nil {
patch.OriginalName = p.OriginalName
}
if p.Notes != nil {
patch.Notes = p.Notes
}
if p.Metadata != nil {
patch.Metadata = p.Metadata
}
if p.ContentDatetime != nil {
patch.ContentDatetime = *p.ContentDatetime
}
if p.IsPublic != nil {
patch.IsPublic = *p.IsPublic
}
var updated *domain.File
txErr := s.tx.WithTx(ctx, func(ctx context.Context) error {
var updateErr error
updated, updateErr = s.files.Update(ctx, id, patch)
if updateErr != nil {
return updateErr
}
if p.TagIDs != nil {
tags, err := s.tags.SetFileTags(ctx, id, *p.TagIDs)
if err != nil {
return err
}
updated.Tags = tags
}
return nil
})
if txErr != nil {
return nil, txErr
}
objType := fileObjectType
_ = s.audit.Log(ctx, "file_edit", &objType, &id, nil)
return updated, nil
}
// Delete soft-deletes a file (moves to trash), enforcing edit ACL.
func (s *FileService) Delete(ctx context.Context, id uuid.UUID) error {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return err
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return err
}
if !ok {
return domain.ErrForbidden
}
if err := s.files.SoftDelete(ctx, id); err != nil {
return err
}
objType := fileObjectType
_ = s.audit.Log(ctx, "file_delete", &objType, &id, nil)
return nil
}
// Restore moves a soft-deleted file out of trash, enforcing edit ACL.
func (s *FileService) Restore(ctx context.Context, id uuid.UUID) (*domain.File, error) {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return nil, err
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return nil, err
}
if !ok {
return nil, domain.ErrForbidden
}
restored, err := s.files.Restore(ctx, id)
if err != nil {
return nil, err
}
objType := fileObjectType
_ = s.audit.Log(ctx, "file_restore", &objType, &id, nil)
return restored, nil
}
// PermanentDelete removes the file record and its stored bytes. Only allowed
// when the file is already in trash.
func (s *FileService) PermanentDelete(ctx context.Context, id uuid.UUID) error {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return err
}
if !f.IsDeleted {
return domain.ErrConflict
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return err
}
if !ok {
return domain.ErrForbidden
}
if err := s.files.DeletePermanent(ctx, id); err != nil {
return err
}
_ = s.storage.Delete(ctx, id)
_ = s.storage.InvalidateCache(ctx, id)
objType := fileObjectType
_ = s.audit.Log(ctx, "file_permanent_delete", &objType, &id, nil)
return nil
}
// Replace swaps the stored bytes for a file with new content.
func (s *FileService) Replace(ctx context.Context, id uuid.UUID, p UploadParams) (*domain.File, error) {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return nil, err
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return nil, err
}
if !ok {
return nil, domain.ErrForbidden
}
mime, err := s.mimes.GetByName(ctx, p.MIMEType)
if err != nil {
return nil, err
}
var buf bytes.Buffer
if _, err := io.Copy(&buf, p.Reader); err != nil {
return nil, fmt.Errorf("FileService.Replace: read body: %w", err)
}
data := buf.Bytes()
var origName string
if p.OriginalName != nil {
origName = *p.OriginalName
}
exifData, _ := extractMetadata(data, origName, nil)
if _, err := s.storage.Save(ctx, id, bytes.NewReader(data)); err != nil {
return nil, fmt.Errorf("FileService.Replace: save to storage: %w", err)
}
// Drop stale thumbnail/preview so they regenerate from the new content.
_ = s.storage.InvalidateCache(ctx, id)
patch := &domain.File{
MIMEType: mime.Name,
MIMEExtension: mime.Extension,
EXIF: exifData,
}
if p.OriginalName != nil {
patch.OriginalName = p.OriginalName
}
updated, err := s.files.Update(ctx, id, patch)
if err != nil {
return nil, err
}
// Recompute the perceptual hash from the new content: images inline, anything
// else cleared to NULL so the old content's hash never lingers (the dedup CLI
// recomputes video). Best-effort, like on upload — phash is recomputable.
var phash *int64
if strings.HasPrefix(mime.Name, "image/") {
if h, ok := imagehash.FromBytes(data); ok {
phash = &h
}
}
_ = s.files.SetPHash(ctx, id, phash)
updated.PHash = phash
objType := fileObjectType
_ = s.audit.Log(ctx, "file_replace", &objType, &id, nil)
return updated, nil
}
// List delegates to FileRepo with the given params, restricting results to
// files the caller may see (unless they are an admin).
func (s *FileService) List(ctx context.Context, params domain.FileListParams) (*domain.FilePage, error) {
params.ViewerID, params.ViewerIsAdmin, _ = domain.UserFromContext(ctx)
page, err := s.files.List(ctx, params)
if err != nil {
return nil, err
}
// Log tag usage when a filter is first applied — not on pagination (cursor)
// or an anchored return, so a single browse counts once. Best-effort
// analytics; a failed write never breaks the listing.
if params.Filter != "" && params.Cursor == "" && params.Anchor == nil && params.ViewerID != 0 {
_ = s.files.RecordTagUses(ctx, params.ViewerID, params.Filter)
}
return page, nil
}
// AuthorizeView ensures the caller may view the file. Returns ErrNotFound if the
// file does not exist or ErrForbidden if the caller lacks view access.
func (s *FileService) AuthorizeView(ctx context.Context, id uuid.UUID) error {
_, err := s.Get(ctx, id)
return err
}
// AuthorizeEdit ensures the caller may edit the file. Returns ErrNotFound if the
// file does not exist or ErrForbidden if the caller lacks edit access.
func (s *FileService) AuthorizeEdit(ctx context.Context, id uuid.UUID) error {
userID, isAdmin, _ := domain.UserFromContext(ctx)
f, err := s.files.GetByID(ctx, id)
if err != nil {
return err
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return err
}
if !ok {
return domain.ErrForbidden
}
return nil
}
// ---------------------------------------------------------------------------
// Content / thumbnail / preview streaming
// ---------------------------------------------------------------------------
// GetContent opens the raw file for download, enforcing view ACL.
func (s *FileService) GetContent(ctx context.Context, id uuid.UUID) (*ContentResult, error) {
f, err := s.Get(ctx, id) // ACL checked inside Get
if err != nil {
return nil, err
}
rc, err := s.storage.Read(ctx, id)
if err != nil {
return nil, err
}
return &ContentResult{
Body: rc,
MIMEType: f.MIMEType,
OriginalName: f.OriginalName,
}, nil
}
// GetThumbnail returns the thumbnail JPEG, enforcing view ACL.
func (s *FileService) GetThumbnail(ctx context.Context, id uuid.UUID) (io.ReadCloser, error) {
if _, err := s.Get(ctx, id); err != nil {
return nil, err
}
return s.storage.Thumbnail(ctx, id)
}
// GetPreview returns the preview JPEG, enforcing view ACL.
func (s *FileService) GetPreview(ctx context.Context, id uuid.UUID) (io.ReadCloser, error) {
if _, err := s.Get(ctx, id); err != nil {
return nil, err
}
return s.storage.Preview(ctx, id)
}
// ---------------------------------------------------------------------------
// Bulk operations
// ---------------------------------------------------------------------------
// BulkDelete soft-deletes multiple files. Files the caller cannot edit are silently skipped.
func (s *FileService) BulkDelete(ctx context.Context, fileIDs []uuid.UUID) error {
for _, id := range fileIDs {
if err := s.Delete(ctx, id); err != nil {
if err == domain.ErrNotFound || err == domain.ErrForbidden {
continue
}
return err
}
}
return nil
}
// SetNeedsReview sets the review status ("needs tagging" vs marked done) on the
// given files. Each file is checked against edit ACL; files the caller cannot
// edit, or that do not exist, are skipped (same forgiving semantics as
// BulkDelete). Authorized files are updated in a single statement and each is
// audit-logged. Works for one file (single-element slice) or many.
func (s *FileService) SetNeedsReview(ctx context.Context, ids []uuid.UUID, value bool) error {
userID, isAdmin, _ := domain.UserFromContext(ctx)
authorized := make([]uuid.UUID, 0, len(ids))
for _, id := range ids {
f, err := s.files.GetByID(ctx, id)
if err != nil {
if err == domain.ErrNotFound {
continue
}
return err
}
ok, err := s.acl.CanEdit(ctx, userID, isAdmin, f.CreatorID, fileObjectTypeID, id)
if err != nil {
return err
}
if ok {
authorized = append(authorized, id)
}
}
if len(authorized) == 0 {
return nil
}
if err := s.files.SetNeedsReview(ctx, authorized, value); err != nil {
return err
}
objType := fileObjectType
details := map[string]any{"needs_review": value}
for i := range authorized {
_ = s.audit.Log(ctx, "file_review", &objType, &authorized[i], details)
}
return nil
}
// ---------------------------------------------------------------------------
// Import
// ---------------------------------------------------------------------------
// Import scans a server-side directory and uploads all supported files.
// If path is empty, the configured default import path is used.
//
// onProgress, when non-nil, receives a "start" event, one "file" event per
// directory entry as it is processed, and a final "done" event — letting a
// caller stream live progress. It is always called from this goroutine (never
// concurrently). The aggregate result is also returned for non-streaming callers.
func (s *FileService) Import(ctx context.Context, path string, onProgress func(ImportEvent)) (*ImportResult, error) {
if s.importPath == "" {
return nil, domain.ErrValidation
}
dir := s.importPath
if path != "" {
// Confine caller-supplied paths to the configured import directory so a
// directory-traversal value cannot read arbitrary host files.
confined, err := confineToBase(s.importPath, path)
if err != nil {
return nil, err
}
dir = confined
}
entries, err := os.ReadDir(dir)
if err != nil {
return nil, fmt.Errorf("FileService.Import: read dir %q: %w", dir, err)
}
// Import oldest-first: process entries in ascending mtime order so the
// resulting records' creation order matches the files' chronological order.
// mtimes are cached once (re-stat'ing per comparison would be wasteful) and
// reused below as the content_datetime fallback. Entries whose info can't be
// read get a zero time (sort first); they surface their error in the loop.
modTimes := make(map[string]time.Time, len(entries))
for _, e := range entries {
if info, infoErr := e.Info(); infoErr == nil {
modTimes[e.Name()] = info.ModTime()
}
}
// SliceStable preserves ReadDir's name order as a deterministic tiebreak for
// entries sharing an mtime.
sort.SliceStable(entries, func(a, b int) bool {
return modTimes[entries[a].Name()].Before(modTimes[entries[b].Name()])
})
emit := func(ev ImportEvent) {
if onProgress != nil {
onProgress(ev)
}
}
result := &ImportResult{Errors: []ImportFileError{}}
total := len(entries)
emit(ImportEvent{Type: "start", Total: total})
for i, entry := range entries {
name := entry.Name()
file := func(status, reason string) {
emit(ImportEvent{
Type: "file", Index: i + 1, Total: total,
Filename: name, Status: status, Reason: reason,
})
}
fail := func(reason string) {
result.Errors = append(result.Errors, ImportFileError{Filename: name, Reason: reason})
file("error", reason)
}
if entry.IsDir() {
result.Skipped++
file("skipped", "directory")
continue
}
fullPath := filepath.Join(dir, name)
mt, err := mimetype.DetectFile(fullPath)
if err != nil {
fail(fmt.Sprintf("MIME detection failed: %s", err))
continue
}
mimeStr := mt.String()
// Strip parameters (e.g. "text/plain; charset=utf-8" → "text/plain").
if j := strings.IndexByte(mimeStr, ';'); j >= 0 {
mimeStr = mimeStr[:j]
}
if _, err := s.mimes.GetByName(ctx, mimeStr); err != nil {
result.Skipped++
file("skipped", "unsupported type: "+mimeStr)
continue
}
f, err := os.Open(fullPath)
if err != nil {
fail(fmt.Sprintf("open failed: %s", err))
continue
}
// Preserve the file's mtime as a content_datetime fallback (used only when
// the file has no EXIF date) — once the source is removed below it's the
// only date left for non-photo files. Reuses the mtime cached for sorting.
var mtime *time.Time
if t, ok := modTimes[name]; ok {
mtime = &t
}
_, uploadErr := s.Upload(ctx, UploadParams{
Reader: f,
MIMEType: mimeStr,
OriginalName: &name,
ContentDatetimeFallback: mtime,
})
f.Close()
if uploadErr != nil {
fail(uploadErr.Error())
continue
}
result.Imported++
// Remove the source on success so the import folder drains and re-running
// doesn't duplicate. The file is already safely copied into storage; a
// removal failure is reported but doesn't undo the import.
if rmErr := os.Remove(fullPath); rmErr != nil {
reason := fmt.Sprintf("imported, but failed to remove source: %s", rmErr)
result.Errors = append(result.Errors, ImportFileError{Filename: name, Reason: reason})
file("imported", reason) // imported, with a warning
continue
}
file("imported", "")
}
emit(ImportEvent{
Type: "done", Total: total,
Imported: result.Imported, Skipped: result.Skipped, Errors: len(result.Errors),
})
return result, nil
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
// confineToBase resolves target and verifies it does not escape base (after
// cleaning and resolving "..") so a caller cannot read files outside the
// configured import directory. Returns the cleaned absolute path on success.
func confineToBase(base, target string) (string, error) {
absBase, err := filepath.Abs(base)
if err != nil {
return "", domain.ErrValidation
}
absTarget, err := filepath.Abs(target)
if err != nil {
return "", domain.ErrValidation
}
rel, err := filepath.Rel(absBase, absTarget)
if err != nil {
return "", domain.ErrValidation
}
if rel == ".." || strings.HasPrefix(rel, ".."+string(os.PathSeparator)) {
return "", domain.ErrForbidden
}
return absTarget, nil
}