Files
tanabata/scripts/migrate-legacy/transform.sql
T
H1K0 76942721ad
deploy / deploy (push) Successful in 5s
chore(scripts): add legacy data migration
One-time migration from the old Python/Flask Tanabata DB into the new
core/data/acl/activity schema.

- transform.sql: reads a `legacy` schema and writes the new one in a single,
  idempotent transaction. Remaps user/mime ids (uuid -> smallint by name),
  inverts is_private -> is_public, lifts EXIF out of files.metadata into the
  exif column, preserves pool hierarchy/created under metadata, synthesises
  file_pool ordering, derives acl object types, sanitises colors/notes.
- migrate.sh: links the new DB to the old one via postgres_fdw, imports the
  old public schema as `legacy`, runs the transform, tears the link down.
- README.md: mapping table, decisions/lossy points, and the separate
  physical-blob copy step.
- docs/reference/schema.sql: the old DB schema the migration is built from
  (referenced by the README).

Verified end-to-end on PostgreSQL 16 (synthetic legacy data, all
transformations and idempotency checked).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 12:43:43 +03:00

221 lines
10 KiB
PL/PgSQL

-- =============================================================================
-- Tanabata legacy -> new schema data migration (transform step)
--
-- Reads the OLD database (exposed as the `legacy` schema — see migrate.sh, which
-- imports it via postgres_fdw) and inserts the transformed rows into the new
-- core / data / acl / activity schemas.
--
-- Assumes the new schema already exists (goose migrations applied) and is seeded
-- (core.mime_types, core.object_types from 007_seed_data.sql).
--
-- Idempotent: ON CONFLICT DO NOTHING everywhere + preserved UUID PKs, so a
-- re-run inserts only what is missing. Runs as one transaction — all or nothing.
--
-- Run with: psql "<new-dsn>" -v ON_ERROR_STOP=1 -f transform.sql
-- (migrate.sh does this for you after setting up the `legacy` schema.)
-- =============================================================================
\set ON_ERROR_STOP on

-- Fail early and clearly if the legacy data hasn't been made available.
-- Every legacy table read by the transform below is probed, not only
-- legacy.users, so a partially imported schema is reported up front with the
-- full list of missing tables instead of surfacing later as a bare
-- "relation does not exist" error in the middle of the transaction.
DO $$
DECLARE
    -- Comma-separated names of the required legacy tables that do not resolve;
    -- stays NULL when all of them are present.
    missing_tables text;
BEGIN
    SELECT string_agg(required.qualified_name, ', ' ORDER BY required.qualified_name)
    INTO missing_tables
    FROM unnest(ARRAY[
        'legacy.users',
        'legacy.mime',
        'legacy.categories',
        'legacy.tags',
        'legacy.autotags',
        'legacy.files',
        'legacy.file_tag',
        'legacy.pools',
        'legacy.file_pool',
        'legacy.acl',
        'legacy.file_views'
    ]) AS required (qualified_name)
    -- to_regclass() returns NULL (rather than raising) for an unknown relation.
    WHERE to_regclass(required.qualified_name) IS NULL;

    IF missing_tables IS NOT NULL THEN
        RAISE EXCEPTION
            'legacy.* tables not found: %. Populate the "legacy" schema first '
            '(run migrate.sh, or load the old dump into a schema named legacy).',
            missing_tables;
    END IF;
END $$;
-- Everything from here to COMMIT runs as a single transaction.
BEGIN;

-- ---------------------------------------------------------------------------
-- 1. Users.
--    Legacy users are keyed by uuid, the new table by a smallint identity, so
--    rows are inserted by their unique `name` and the ids are paired up
--    afterwards. Legacy `can_edit` feeds the new `can_create`; every imported
--    user starts with is_blocked = false.
-- ---------------------------------------------------------------------------
INSERT INTO core.users (name, password, is_admin, can_create, is_blocked)
SELECT
    legacy_user.name,
    legacy_user.password,
    legacy_user.is_admin,
    legacy_user.can_edit AS can_create,
    FALSE AS is_blocked
FROM legacy.users AS legacy_user
ON CONFLICT (name) DO NOTHING;

-- Legacy uuid -> new id lookup, matched on name. Used by every later step that
-- carries a user reference; dropped automatically at COMMIT.
CREATE TEMPORARY TABLE user_id_map (old_id, new_id) ON COMMIT DROP AS
SELECT
    legacy_user.id,
    new_user.id
FROM legacy.users AS legacy_user
INNER JOIN core.users AS new_user
    ON new_user.name = legacy_user.name;
-- ---------------------------------------------------------------------------
-- 2. MIME types.
--    Same uuid -> new-id remap as for users, this time keyed by the MIME name.
--    Names already present in core.mime_types (e.g. from the seed data) are
--    left untouched; only legacy names that are missing get inserted.
-- ---------------------------------------------------------------------------
INSERT INTO core.mime_types (name, extension)
SELECT
    legacy_mime.name,
    legacy_mime.extension
FROM legacy.mime AS legacy_mime
ON CONFLICT (name) DO NOTHING;

-- Legacy uuid -> new id lookup for MIME types, matched on name; dropped
-- automatically at COMMIT.
CREATE TEMPORARY TABLE mime_id_map (old_id, new_id) ON COMMIT DROP AS
SELECT
    legacy_mime.id,
    new_mime.id
FROM legacy.mime AS legacy_mime
INNER JOIN core.mime_types AS new_mime
    ON new_mime.name = legacy_mime.name;
-- ---------------------------------------------------------------------------
-- 3. Categories. The uuid primary key is carried over unchanged.
--      * is_private is inverted into is_public
--      * empty-string notes become NULL
--      * a color that is not exactly six hex digits becomes NULL, so it cannot
--        trip the hex CHECK on the new table (the old NOT VALID constraint may
--        not have been enforced on existing rows)
--    A category whose creator has no entry in user_id_map is not inserted.
-- ---------------------------------------------------------------------------
INSERT INTO data.categories (id, name, notes, color, metadata, creator_id, is_public)
SELECT
    legacy_cat.id,
    legacy_cat.name,
    NULLIF(legacy_cat.notes, '') AS notes,
    CASE
        WHEN legacy_cat.color ~* '^[A-Fa-f0-9]{6}$' THEN legacy_cat.color
        ELSE NULL
    END AS color,
    NULL AS metadata,
    creator_map.new_id AS creator_id,
    (NOT legacy_cat.is_private) AS is_public
FROM legacy.categories AS legacy_cat
INNER JOIN user_id_map AS creator_map
    ON creator_map.old_id = legacy_cat.creator_id
ON CONFLICT (id) DO NOTHING;
-- ---------------------------------------------------------------------------
-- 4. Tags. Both the uuid primary key and category_id are carried over as-is.
--    Notes, color and privacy get the same clean-up as categories: '' -> NULL,
--    non-hex color -> NULL, is_private inverted into is_public.
--    A tag whose creator has no entry in user_id_map is not inserted.
--
--    NOTE(review): category_id is copied without checking that the category
--    was actually inserted by step 3, whereas the later junction steps do
--    guard against orphans. If data.tags.category_id carries a foreign key,
--    a tag pointing at a skipped category would abort the transaction —
--    confirm that is the intended behaviour.
-- ---------------------------------------------------------------------------
INSERT INTO data.tags (id, name, notes, color, category_id, metadata, creator_id, is_public)
SELECT
    legacy_tag.id,
    legacy_tag.name,
    NULLIF(legacy_tag.notes, '') AS notes,
    CASE
        WHEN legacy_tag.color ~* '^[A-Fa-f0-9]{6}$' THEN legacy_tag.color
        ELSE NULL
    END AS color,
    legacy_tag.category_id,
    NULL AS metadata,
    creator_map.new_id AS creator_id,
    (NOT legacy_tag.is_private) AS is_public
FROM legacy.tags AS legacy_tag
INNER JOIN user_id_map AS creator_map
    ON creator_map.old_id = legacy_tag.creator_id
ON CONFLICT (id) DO NOTHING;
-- ---------------------------------------------------------------------------
-- 5. Tag rules (legacy `autotags`).
--    parent_id becomes when_tag_id, child_id becomes then_tag_id.
--    A rule is copied only if both of its tags exist in data.tags.
-- ---------------------------------------------------------------------------
INSERT INTO data.tag_rules (when_tag_id, then_tag_id, is_active)
SELECT
    legacy_rule.parent_id AS when_tag_id,
    legacy_rule.child_id AS then_tag_id,
    legacy_rule.is_active
FROM legacy.autotags AS legacy_rule
WHERE
    EXISTS (
        SELECT 1
        FROM data.tags AS when_tag
        WHERE when_tag.id = legacy_rule.parent_id
    )
    AND EXISTS (
        SELECT 1
        FROM data.tags AS then_tag
        WHERE then_tag.id = legacy_rule.child_id
    )
ON CONFLICT (when_tag_id, then_tag_id) DO NOTHING;
-- ---------------------------------------------------------------------------
-- 6. Files. The uuid primary key is carried over unchanged.
--      * legacy `datetime`  -> content_datetime
--      * legacy `orig_name` -> original_name
--      * the top-level 'exif' key of the legacy metadata goes to the exif
--        column; the rest of the metadata stays in metadata, or NULL when
--        nothing but an empty object is left
--      * phash is NULL and is_deleted is false for every imported file
--        (the legacy schema had neither phash nor soft-delete)
--    A file is inserted only if both its creator and its MIME type resolve
--    through the id maps.
-- ---------------------------------------------------------------------------
INSERT INTO data.files (
    id,
    original_name,
    mime_id,
    content_datetime,
    notes,
    metadata,
    exif,
    phash,
    creator_id,
    is_public,
    is_deleted
)
SELECT
    legacy_file.id,
    legacy_file.orig_name,
    mime_map.new_id,
    legacy_file.datetime,
    NULLIF(legacy_file.notes, ''),
    NULLIF(legacy_file.metadata - 'exif', CAST('{}' AS jsonb)),
    legacy_file.metadata -> 'exif',
    NULL,
    creator_map.new_id,
    (NOT legacy_file.is_private),
    FALSE
FROM legacy.files AS legacy_file
INNER JOIN mime_id_map AS mime_map
    ON mime_map.old_id = legacy_file.mime_id
INNER JOIN user_id_map AS creator_map
    ON creator_map.old_id = legacy_file.creator_id
ON CONFLICT (id) DO NOTHING;
-- ---------------------------------------------------------------------------
-- 7. File <-> tag links.
--    A link is copied only when both its file and its tag exist in the new
--    schema; orphan junction rows are left behind. The joins are on the
--    unique id columns, so they filter rows without multiplying them.
-- ---------------------------------------------------------------------------
INSERT INTO data.file_tag (file_id, tag_id)
SELECT
    legacy_link.file_id,
    legacy_link.tag_id
FROM legacy.file_tag AS legacy_link
INNER JOIN data.files AS migrated_file
    ON migrated_file.id = legacy_link.file_id
INNER JOIN data.tags AS migrated_tag
    ON migrated_tag.id = legacy_link.tag_id
ON CONFLICT DO NOTHING;
-- ---------------------------------------------------------------------------
-- 8. Pools. The uuid primary key is carried over unchanged.
--    The new schema has no pool hierarchy and no `created` column, so the
--    legacy parent_id and created values are stored in metadata under the
--    keys 'legacy_parent_id' and 'legacy_created' (pool ids are random v4, so
--    the creation time cannot be recovered from the id). Keys whose value is
--    NULL are stripped from the object. is_private is inverted into is_public.
--    A pool whose creator has no entry in user_id_map is not inserted.
-- ---------------------------------------------------------------------------
INSERT INTO data.pools (id, name, notes, metadata, creator_id, is_public)
SELECT
    legacy_pool.id,
    legacy_pool.name,
    NULLIF(legacy_pool.notes, '') AS notes,
    jsonb_strip_nulls(
        jsonb_build_object(
            'legacy_parent_id', legacy_pool.parent_id,
            'legacy_created', legacy_pool.created
        )
    ) AS metadata,
    creator_map.new_id AS creator_id,
    (NOT legacy_pool.is_private) AS is_public
FROM legacy.pools AS legacy_pool
INNER JOIN user_id_map AS creator_map
    ON creator_map.old_id = legacy_pool.creator_id
ON CONFLICT (id) DO NOTHING;
-- ---------------------------------------------------------------------------
-- 9. File <-> pool links.
--    The legacy table has no ordering column. A position is synthesised per
--    pool: members are numbered by file id (UUID v7, so roughly chronological)
--    and the number is multiplied by 1000, leaving gaps for the app's
--    gap-based reordering. Links whose file or pool is absent from the new
--    schema are skipped before numbering.
-- ---------------------------------------------------------------------------
INSERT INTO data.file_pool (file_id, pool_id, position)
SELECT
    legacy_member.file_id,
    legacy_member.pool_id,
    CAST(row_number() OVER members_of_pool AS int) * 1000
FROM legacy.file_pool AS legacy_member
WHERE
    EXISTS (
        SELECT 1
        FROM data.files AS migrated_file
        WHERE migrated_file.id = legacy_member.file_id
    )
    AND EXISTS (
        SELECT 1
        FROM data.pools AS migrated_pool
        WHERE migrated_pool.id = legacy_member.pool_id
    )
WINDOW members_of_pool AS (
    PARTITION BY legacy_member.pool_id
    ORDER BY legacy_member.file_id
)
ON CONFLICT DO NOTHING;
-- ---------------------------------------------------------------------------
-- 10. ACL.
--     Legacy rows carry no object type, so it is derived by looking the
--     object id up in the migrated tables, in this order: files, tags,
--     categories, pools — the first table containing the id wins.
--     read -> can_view, write -> can_edit.
--     When the id is found nowhere, type_name is NULL, the join to
--     core.object_types matches nothing, and the row is skipped.
-- ---------------------------------------------------------------------------
INSERT INTO acl.permissions (user_id, object_type_id, object_id, can_view, can_edit)
SELECT
    grantee_map.new_id,
    object_type.id,
    legacy_acl.object_id,
    legacy_acl.read,
    legacy_acl.write
FROM legacy.acl AS legacy_acl
INNER JOIN user_id_map AS grantee_map
    ON grantee_map.old_id = legacy_acl.user_id
CROSS JOIN LATERAL (
    SELECT
        CASE
            WHEN EXISTS (
                SELECT 1 FROM data.files AS f WHERE f.id = legacy_acl.object_id
            ) THEN 'file'
            WHEN EXISTS (
                SELECT 1 FROM data.tags AS t WHERE t.id = legacy_acl.object_id
            ) THEN 'tag'
            WHEN EXISTS (
                SELECT 1 FROM data.categories AS c WHERE c.id = legacy_acl.object_id
            ) THEN 'category'
            WHEN EXISTS (
                SELECT 1 FROM data.pools AS p WHERE p.id = legacy_acl.object_id
            ) THEN 'pool'
        END AS type_name
) AS resolved
INNER JOIN core.object_types AS object_type
    ON object_type.name = resolved.type_name
ON CONFLICT (user_id, object_type_id, object_id) DO NOTHING;
-- ---------------------------------------------------------------------------
-- 11. File view history.
--     Legacy `datetime` becomes viewed_at and the viewer is remapped through
--     user_id_map. Views of files that are absent from data.files are skipped;
--     the join is on the unique file id, so it only filters rows.
-- ---------------------------------------------------------------------------
INSERT INTO activity.file_views (file_id, user_id, viewed_at)
SELECT
    legacy_view.file_id,
    viewer_map.new_id AS user_id,
    legacy_view.datetime AS viewed_at
FROM legacy.file_views AS legacy_view
INNER JOIN user_id_map AS viewer_map
    ON viewer_map.old_id = legacy_view.user_id
INNER JOIN data.files AS migrated_file
    ON migrated_file.id = legacy_view.file_id
ON CONFLICT DO NOTHING;

-- End of the transaction opened by BEGIN; the ON COMMIT DROP id maps go away here.
COMMIT;
-- ---------------------------------------------------------------------------
-- Post-commit report: one row per migrated table with its current row count,
-- sorted by table name. The counts are table totals, so they include any rows
-- that were already present before this run.
-- ---------------------------------------------------------------------------
\echo ''
\echo 'Migration committed. New row counts:'
SELECT 'core.users' AS "table", count(*)
FROM core.users
UNION ALL
SELECT 'core.mime_types', count(*)
FROM core.mime_types
UNION ALL
SELECT 'data.categories', count(*)
FROM data.categories
UNION ALL
SELECT 'data.tags', count(*)
FROM data.tags
UNION ALL
SELECT 'data.tag_rules', count(*)
FROM data.tag_rules
UNION ALL
SELECT 'data.files', count(*)
FROM data.files
UNION ALL
SELECT 'data.file_tag', count(*)
FROM data.file_tag
UNION ALL
SELECT 'data.pools', count(*)
FROM data.pools
UNION ALL
SELECT 'data.file_pool', count(*)
FROM data.file_pool
UNION ALL
SELECT 'acl.permissions', count(*)
FROM acl.permissions
UNION ALL
SELECT 'activity.file_views', count(*)
FROM activity.file_views
ORDER BY "table";