diff --git a/doc/postgres-migration.md b/doc/postgres-migration.md new file mode 100644 index 0000000..e9d24d2 --- /dev/null +++ b/doc/postgres-migration.md @@ -0,0 +1,271 @@ +# Migrating PostgreSQL cluster to a new major version + +## Summary + +1. Dump from a replica +2. Restore to fresh VM running new major version +3. Add logical replication for delta sync from current/old primary +4. Switch primary to new server +5. Remove logical replication on new server + +## Runbook + +* Primary host: `PRIMARY_HOST` +* Replica host: `REPLICA_HOST` +* New PG14 host: `NEW_HOST` +* PostgreSQL superuser: `postgres` +* Running locally on each machine via `sudo -u postgres` + +Adjust hostnames/IPs/etc. where needed. + +--- + +### 🟢 0. PRIMARY — Pre-checks + +```bash +sudo -u postgres psql -c "SHOW wal_level;" +sudo -u postgres psql -c "SHOW max_replication_slots;" +``` + +If needed, edit config: + +```bash +sudo -u postgres vi $PGDATA/postgresql.conf +``` + +Ensure: + +```conf +wal_level = logical +max_replication_slots = 10 +``` + +Restart if changed: + +```bash +sudo systemctl restart postgresql +``` + +--- + +### 🔵🟡 3. Create keypair for syncing dump later + +🔵 On NEW_HOST: + +```bash +sudo mkdir -p /home/postgres/.ssh && \ +sudo chown -R postgres:postgres /home/postgres && \ +sudo chmod 700 /home/postgres/.ssh && \ +sudo -u postgres bash -c 'ssh-keygen -t ecdsa -b 256 -f /home/postgres/.ssh/id_ecdsa -N "" -C "postgres@$(hostname)"' && \ +sudo cat /home/postgres/.ssh/id_ecdsa.pub +``` + +Copy the public key from the above output + +🟡 On replica: + +```bash +sudo mkdir -p /home/postgres/.ssh && \ +sudo chown -R postgres:postgres /home/postgres && \ +sudo chmod 700 /home/postgres/.ssh && \ +echo [public_key] | sudo tee /home/postgres/.ssh/authorized_keys > /dev/null && \ +sudo chmod 700 /home/postgres/.ssh +``` + +--- + +### 🟢 1. 
PRIMARY — Create publication and replication slots + +```bash +sudo -u postgres pg_create_replication_publications +``` + +or + +```bash +sudo -u postgres pg_create_replication_publication [db_name] +``` + +Listing publications and slots: + +```bash +sudo -u postgres pg_list_replication_publications +sudo -u postgres pg_list_replication_slots +``` + +--- + +### 🟡 3. REPLICA — Pause replication + +```bash +sudo -u postgres psql -c "SELECT pg_wal_replay_pause();" +``` + +Verify: + +```bash +sudo -u postgres psql -c "SELECT pg_is_wal_replay_paused();" +``` + +--- + +### 🟡 4. REPLICA — Run dump + +```bash +sudo -u postgres pg_dump_all_databases +``` + +or + +```bash +sudo -u postgres bash -c "pg_dumpall --globals-only > /tmp/globals.sql" +sudo -u postgres pg_dump_database [db_name] +``` + +--- + +### 🟡 5. REPLICA — Resume replication + +```bash +sudo -u postgres psql -c "SELECT pg_wal_replay_resume();" +``` + +--- + +### 🔵 6. COPY dumps to NEW HOST + +From NEW_HOST: + +```bash +export REPLICA_HOST=[private_ip] && \ +cd /tmp && \ +sudo -u postgres scp "postgres@$REPLICA_HOST:/tmp/globals.sql" . && \ +sudo -u postgres scp "postgres@$REPLICA_HOST:/tmp/dump_*.tar.zst" . +``` + +--- + +### 🔵 7. NEW HOST (PostgreSQL 14) — Restore + +#### 7.1 Restore globals + +```bash +sudo -u postgres psql -f /tmp/globals.sql +``` + +--- + +#### 7.2 Create databases + +```bash +sudo -u postgres psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template1')" | \ +xargs -I{} sudo -u postgres createdb {} +``` + +or + +```bash +sudo -u postgres createdb [db_name] +``` + +--- + +#### 7.3 Restore each database + +```bash +sudo -u postgres pg_restore_all_databases +``` + +or + +```bash +sudo -u postgres pg_restore_database [db_name] +``` + +--- + +### 🔵 8. 
NEW HOST — Create subscriptions + +```bash +sudo -u postgres pg_create_replication_subscriptions +``` + +or + +```bash +sudo -u postgres pg_create_replication_subscription [db_name] +``` + +--- + +### 🔵 9. NEW HOST — Monitor replication + +```bash +sudo -u postgres pg_list_replication_subscriptions +``` + +--- + +### 🔴 11. CUTOVER + +#### 11.1 Stop writes on old primary + +(app maintenance mode, stop app/daemons) + +--- + +#### 11.2 Wait for replication to catch up + +TODO: not the best way to check, since WAL LSNs keep increasing + +```bash +sudo -u postgres psql -d [db_name] -c "SELECT * FROM pg_stat_subscription;" +``` + +--- + +#### 11.3 Fix sequences + +Run for all databases: + +```bash +sudo -u postgres pg_fix_sequences_in_all_databases +``` + +or + +```bash +sudo -u postgres pg_fix_sequences [db_name] +``` + +--- + +#### 11.4 Point app to NEW_HOST + +* Update pg.kosmos.local in /etc/hosts on app server(s) (maybe override + attribute for role and converge) +* Start app/daemons, deactivate maintenance mode + +--- + +### 🧹 12. CLEANUP (NEW_HOST) + +```bash +sudo -u postgres pg_drop_replication_subscriptions +``` + +--- + +### 🧹 13. 
CLEANUP (PRIMARY) + +TODO: Looks like slots are dropped automatically when subscriptions are dropped + +```bash +sudo -u postgres pg_drop_replication_publications +``` + +--- + +### ✅ DONE + +--- diff --git a/roles/postgresql_replica_logical.rb b/roles/postgresql_replica_logical.rb new file mode 100644 index 0000000..1e1a7c9 --- /dev/null +++ b/roles/postgresql_replica_logical.rb @@ -0,0 +1,8 @@ +name "postgresql_replica_logical" + +run_list [ + "kosmos_postgresql::hostsfile", + "kosmos_postgresql::replica_logical", + "kosmos_postgresql::firewall", + "kosmos_postgresql::management_scripts" +] diff --git a/site-cookbooks/kosmos_postgresql/files/create_publication.sh b/site-cookbooks/kosmos_postgresql/files/create_publication.sh new file mode 100644 index 0000000..2aa7e8f --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/create_publication.sh @@ -0,0 +1,31 @@ +#!/bin/bash +set -euo pipefail + +DB_NAME="${1:?Usage: $0 }" + +echo "== Processing DB: $DB_NAME ==" + +# Create publication (idempotent) +psql -d "$DB_NAME" -v ON_ERROR_STOP=1 <<'SQL' +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_publication WHERE pubname = 'migrate_pub' + ) THEN + CREATE PUBLICATION migrate_pub FOR ALL TABLES; + END IF; +END +$$; +SQL + +# Create logical replication slot (idempotent-ish) +SLOT="migrate_slot_${DB_NAME}" + +if ! 
psql -d "$DB_NAME" -Atqc "SELECT 1 FROM pg_replication_slots WHERE slot_name = '$SLOT'" | grep -q 1; then + echo " Creating slot: $SLOT" + psql -d "$DB_NAME" -c "SELECT pg_create_logical_replication_slot('$SLOT', 'pgoutput');" +else + echo " Slot already exists: $SLOT" +fi + +echo "== Done ==" diff --git a/site-cookbooks/kosmos_postgresql/files/create_publications.sh b/site-cookbooks/kosmos_postgresql/files/create_publications.sh new file mode 100644 index 0000000..eda54ee --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/create_publications.sh @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + +echo "== Creating publication in each database ==" + +for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template1')"); do + echo "Processing DB: $db" + + # Create publication (idempotent) + psql -d "$db" -v ON_ERROR_STOP=1 < globals.sql) && \ +psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN (''template1'')" | \ +xargs -I{} -P4 sh -c " + pg_dump -Fd -j 4 -d \"{}\" -f dump_{} && + tar -cf - dump_{} | zstd -19 -T0 > dump_{}.tar.zst && + rm -rf dump_{} +" diff --git a/site-cookbooks/kosmos_postgresql/files/dump_database.sh b/site-cookbooks/kosmos_postgresql/files/dump_database.sh new file mode 100644 index 0000000..028ad77 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/dump_database.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -euo pipefail + +DB_NAME="${1:?Usage: $0 }" + +cd /tmp + +pg_dump -Fd -j 4 -d "$DB_NAME" -f "dump_${DB_NAME}" +tar -cf - "dump_${DB_NAME}" | zstd -19 -T0 > "dump_${DB_NAME}.tar.zst" +rm -rf "dump_${DB_NAME}" diff --git a/site-cookbooks/kosmos_postgresql/files/fix_sequences.sh b/site-cookbooks/kosmos_postgresql/files/fix_sequences.sh new file mode 100644 index 0000000..a158cdd --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/fix_sequences.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -e + +DB="$1" + +if [ -z "$DB" ]; then + echo "Usage: $0 " + exit 1 +fi + +echo "== Fixing 
sequences in database: $DB ==" + +SQL=$(psql -d "$DB" -Atqc " + SELECT + 'SELECT setval(' || + quote_literal(pg_get_serial_sequence(quote_ident(n.nspname)||'.'||quote_ident(c.relname), a.attname)) || + ', COALESCE(MAX(' || quote_ident(a.attname) || '), 0) + 1, false) FROM ' || + quote_ident(n.nspname)||'.'||quote_ident(c.relname) || ';' + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN pg_attribute a ON a.attrelid = c.oid + WHERE c.relkind = 'r' + AND a.attnum > 0 + AND NOT a.attisdropped + AND pg_get_serial_sequence(quote_ident(n.nspname)||'.'||quote_ident(c.relname), a.attname) IS NOT NULL; +") + +if [ -z "$SQL" ]; then + echo "No sequences found in $DB" + exit 0 +fi + +echo "$SQL" | psql -d "$DB" + +echo "== Done ==" diff --git a/site-cookbooks/kosmos_postgresql/files/fix_sequences_in_all_databases.sh b/site-cookbooks/kosmos_postgresql/files/fix_sequences_in_all_databases.sh new file mode 100644 index 0000000..73b9a7d --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/fix_sequences_in_all_databases.sh @@ -0,0 +1,38 @@ +#!/bin/bash +set -e + +echo "== Fixing sequences across all databases ==" + +for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template1')"); do + echo "---- DB: $db ----" + + # Generate fix statements + SQL=$(psql -d "$db" -Atqc " + SELECT + 'SELECT setval(' || + quote_literal(pg_get_serial_sequence(quote_ident(n.nspname)||'.'||quote_ident(c.relname), a.attname)) || + ', COALESCE(MAX(' || quote_ident(a.attname) || '), 0) + 1, false) FROM ' || + quote_ident(n.nspname)||'.'||quote_ident(c.relname) || ';' + FROM pg_class c + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN pg_attribute a ON a.attrelid = c.oid + WHERE c.relkind = 'r' + AND a.attnum > 0 + AND NOT a.attisdropped + AND pg_get_serial_sequence(quote_ident(n.nspname)||'.'||quote_ident(c.relname), a.attname) IS NOT NULL; + ") + + if [ -z "$SQL" ]; then + echo "No sequences found in $db" + continue + fi + + echo 
"Fixing sequences in $db..." + + # Execute generated statements + echo "$SQL" | psql -d "$db" + +done + +echo "== Done fixing sequences ==" + diff --git a/site-cookbooks/kosmos_postgresql/files/list_publications.sh b/site-cookbooks/kosmos_postgresql/files/list_publications.sh new file mode 100644 index 0000000..7459005 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/list_publications.sh @@ -0,0 +1,5 @@ +#!/bin/bash +for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template1')"); do + echo "DB: $db" + psql -d "$db" -Atqc "SELECT pubname FROM pg_publication;" +done diff --git a/site-cookbooks/kosmos_postgresql/files/list_replication_slots.sh b/site-cookbooks/kosmos_postgresql/files/list_replication_slots.sh new file mode 100644 index 0000000..591479b --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/list_replication_slots.sh @@ -0,0 +1,5 @@ +#!/bin/bash +psql -c " +SELECT slot_name, +pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) +FROM pg_replication_slots;" diff --git a/site-cookbooks/kosmos_postgresql/files/list_subscriptions.sh b/site-cookbooks/kosmos_postgresql/files/list_subscriptions.sh new file mode 100644 index 0000000..30796d5 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/list_subscriptions.sh @@ -0,0 +1,5 @@ +#!/bin/bash +for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template1')"); do + echo "==== DB: $db ====" + psql -d "$db" -c "SELECT * FROM pg_stat_subscription;" +done diff --git a/site-cookbooks/kosmos_postgresql/files/restore_all_databases.sh b/site-cookbooks/kosmos_postgresql/files/restore_all_databases.sh new file mode 100644 index 0000000..18c4a7b --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/restore_all_databases.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -euo pipefail + +cd /tmp + +for f in dump_*.tar.zst; do + db=$(echo $f | sed "s/dump_\(.*\)\.tar\.zst/\1/") + echo "Restoring $db" + zstd 
-d "$f" -c | tar -xf - + pg_restore -j 4 -d "$db" "dump_$db" + rm -rf "dump_$db" +done diff --git a/site-cookbooks/kosmos_postgresql/files/restore_database.sh b/site-cookbooks/kosmos_postgresql/files/restore_database.sh new file mode 100644 index 0000000..49e7ce2 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/files/restore_database.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -euo pipefail + +DB_NAME="${1:?Usage: $0 }" + +cd /tmp + +FILE="dump_${DB_NAME}.tar.zst" +DIR="dump_${DB_NAME}" + +echo "Restoring $DB_NAME" +zstd -d "$FILE" -c | tar -xf - +pg_restore -j 4 -d "$DB_NAME" "$DIR" +rm -rf "$DIR" diff --git a/site-cookbooks/kosmos_postgresql/recipes/management_scripts.rb b/site-cookbooks/kosmos_postgresql/recipes/management_scripts.rb new file mode 100644 index 0000000..7a90bc2 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/recipes/management_scripts.rb @@ -0,0 +1,121 @@ +# +# Cookbook:: kosmos_postgresql +# Recipe:: management_scripts +# + +credentials = data_bag_item('credentials', 'postgresql') + +cookbook_file "/usr/local/bin/pg_dump_all_databases" do + source "dump_all_databases.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_dump_database" do + source "dump_database.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_restore_all_databases" do + source "restore_all_databases.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_restore_database" do + source "restore_database.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_create_replication_publications" do + source "create_publications.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_create_replication_publication" do + source "create_publication.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_drop_replication_publications" do + 
source "drop_publications.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_list_replication_publications" do + source "list_publications.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_list_replication_slots" do + source "list_replication_slots.sh" + user "postgres" + group "postgres" + mode "0744" +end + +template "/usr/local/bin/pg_create_replication_subscriptions" do + source "create_subscriptions.sh.erb" + user "postgres" + group "postgres" + mode "0740" + variables pg_host: "pg.kosmos.local", + pg_port: 5432, + pg_user: "replication", + pg_pass: credentials["replication_password"] + sensitive true +end + +template "/usr/local/bin/pg_create_replication_subscription" do + source "create_subscription.sh.erb" + user "postgres" + group "postgres" + mode "0740" + variables pg_host: "pg.kosmos.local", + pg_port: 5432, + pg_user: "replication", + pg_pass: credentials["replication_password"] + sensitive true +end + +cookbook_file "/usr/local/bin/pg_drop_replication_subscriptions" do + source "drop_subscriptions.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_list_replication_subscriptions" do + source "list_subscriptions.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_fix_sequences_in_all_databases" do + source "fix_sequences.sh" + user "postgres" + group "postgres" + mode "0744" +end + +cookbook_file "/usr/local/bin/pg_fix_sequences" do + source "fix_sequences.sh" + user "postgres" + group "postgres" + mode "0744" +end diff --git a/site-cookbooks/kosmos_postgresql/recipes/primary.rb b/site-cookbooks/kosmos_postgresql/recipes/primary.rb index 406e2ae..73c0224 100644 --- a/site-cookbooks/kosmos_postgresql/recipes/primary.rb +++ b/site-cookbooks/kosmos_postgresql/recipes/primary.rb @@ -6,4 +6,3 @@ postgresql_custom_server postgresql_version do role "primary" end - diff --git 
a/site-cookbooks/kosmos_postgresql/recipes/replica.rb b/site-cookbooks/kosmos_postgresql/recipes/replica.rb index 69d5cd0..95a037a 100644 --- a/site-cookbooks/kosmos_postgresql/recipes/replica.rb +++ b/site-cookbooks/kosmos_postgresql/recipes/replica.rb @@ -3,10 +3,6 @@ # Recipe:: replica # -service postgresql_service do - supports restart: true, status: true, reload: true -end - postgresql_custom_server postgresql_version do role "replica" end @@ -20,6 +16,9 @@ if primary.nil? return end +# TODO Replace pg.kosmos.local with private IP once available +# via proper node attribute +# https://gitea.kosmos.org/kosmos/chef/issues/263 execute "set up replication" do command <<-EOF systemctl stop #{postgresql_service} diff --git a/site-cookbooks/kosmos_postgresql/recipes/replica_logical.rb b/site-cookbooks/kosmos_postgresql/recipes/replica_logical.rb new file mode 100644 index 0000000..1f3dab9 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/recipes/replica_logical.rb @@ -0,0 +1,15 @@ +# +# Cookbook:: kosmos_postgresql +# Recipe:: replica_logical +# + +postgresql_custom_server postgresql_version do + role "replica_logical" +end + +# primary = postgresql_primary +# +# if primary.nil? +# Chef::Log.warn("No PostgreSQL primary node found. 
Skipping replication setup.") +# return +# end diff --git a/site-cookbooks/kosmos_postgresql/resources/server.rb b/site-cookbooks/kosmos_postgresql/resources/server.rb index f08a70d..e6c72f7 100644 --- a/site-cookbooks/kosmos_postgresql/resources/server.rb +++ b/site-cookbooks/kosmos_postgresql/resources/server.rb @@ -56,13 +56,15 @@ action :create do timezone: "UTC", # default is GMT listen_addresses: "0.0.0.0", promote_trigger_file: "#{postgresql_data_dir}/failover.trigger", - wal_keep_size: 4096 # 256 segments, 16MB each + wal_level: "logical", + wal_keep_size: 4096, # 256 segments, 16MB each + max_replication_slots: 16 } postgresql_server_conf "main" do version postgresql_version additional_config additional_config - notifies :reload, "service[#{postgresql_service}]", :delayed + notifies :restart, "service[#{postgresql_service}]", :delayed end postgresql_user "replication" do diff --git a/site-cookbooks/kosmos_postgresql/templates/create_subscription.sh.erb b/site-cookbooks/kosmos_postgresql/templates/create_subscription.sh.erb new file mode 100644 index 0000000..0839775 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/templates/create_subscription.sh.erb @@ -0,0 +1,31 @@ +#!/bin/bash +set -euo pipefail + +DB_NAME="${1:?Usage: $0 }" + +echo "== Processing DB: $DB_NAME ==" + +SLOT="migrate_slot_${DB_NAME}" +SUB="migrate_sub_${DB_NAME}" + +psql -d "$DB_NAME" -v ON_ERROR_STOP=1 < port=<%= @pg_port %> dbname=$DB_NAME user=<%= @pg_user %> password=<%= @pg_pass %>' + PUBLICATION migrate_pub + WITH ( + slot_name = '$SLOT', + create_slot = false, + copy_data = false, + enabled = true + ); + END IF; +END +\$\$; +SQL + +echo "== Done ==" diff --git a/site-cookbooks/kosmos_postgresql/templates/create_subscriptions.sh.erb b/site-cookbooks/kosmos_postgresql/templates/create_subscriptions.sh.erb new file mode 100644 index 0000000..8d79c73 --- /dev/null +++ b/site-cookbooks/kosmos_postgresql/templates/create_subscriptions.sh.erb @@ -0,0 +1,34 @@ +#!/bin/bash +set -e + 
+echo "== Creating subscriptions for all databases ==" + +for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template1')"); do + echo "Processing DB: $db" + + SLOT="migrate_slot_${db}" + SUB="migrate_sub_${db}" + + psql -d "$db" -v ON_ERROR_STOP=1 < port=<%= @pg_port %> dbname=$db user=<%= @pg_user %> password=<%= @pg_pass %>' + PUBLICATION migrate_pub + WITH ( + slot_name = '$SLOT', + create_slot = false, + copy_data = false, + enabled = true + ); + END IF; +END +\$\$; +SQL + +done + +echo "== Done =="