3 Commits

Author SHA1 Message Date
22d459b558 Create new VMs with Ubuntu 24.04 2026-04-12 08:54:47 +04:00
5ed5af6d50 Use hardware clock sync on Ubuntu 24.04+ VMs 2026-04-12 08:53:50 +04:00
41e6b29b97 Add AGENTS.md 2026-04-11 15:36:54 +04:00
20 changed files with 106 additions and 301 deletions

41
AGENTS.md Normal file
View File

@@ -0,0 +1,41 @@
# AGENTS.md
Welcome, AI Agent! This file contains essential context and rules for interacting with the Kosmos Chef repository. Read this carefully before planning or executing any changes.
## 🏢 Project Overview
This repository contains the infrastructure automation code used by Kosmos to provision and configure bare metal servers (KVM hosts) and Ubuntu virtual machines (KVM guests).
We use **Chef Infra**, managed locally via **Knife Zero** (agentless Chef), and **Berkshelf** for dependency management.
## 📂 Directory Structure & Rules
* **`site-cookbooks/`**: 🟢 **EDITABLE.** This directory contains all custom, internal cookbooks written specifically for Kosmos services (e.g., `kosmos-postgresql`, `kosmos_gitea`, `kosmos-mastodon`). *Active development happens here.*
* **`cookbooks/`**: 🔴 **DO NOT EDIT.** This directory contains third-party/community cookbooks that are vendored. These are managed by Berkshelf. Modifying them directly will result in lost changes.
* **`roles/`**: 🟢 **EDITABLE.** Contains Chef roles written in Ruby (e.g., `base.rb`, `kvm_guest.rb`, `postgresql_primary.rb`). These define run-lists and role-specific default attributes for servers.
* **`environments/`**: Contains Chef environment definitions (like `production.rb`).
* **`data_bags/`**: Contains data bag configurations, often encrypted. Be cautious and do not expose secrets. (Note: Agents should not manage data bag secrets directly unless provided the `.chef/encrypted_data_bag_secret`).
* **`nodes/`**: Contains JSON state files for bootstrapped nodes. *Agents typically do not edit these directly unless cleaning up a deleted node.*
* **`Berksfile`**: Defines community cookbook dependencies.
* **`Vagrantfile` / `.kitchen/`**: Used for local virtualization and integration testing.
## 🛠️ Tooling & Workflows
1. **Dependency Management (Berkshelf)**
If a new community cookbook is required:
- Add it to the `Berksfile` at the root.
- Instruct the user to run `berks install` and `berks vendor cookbooks/ --delete` (or run it via the `bash` tool if permitted).
2. **Provisioning (Knife Zero)**
- Bootstrapping and converging nodes is done using `knife zero`.
- *Example:* `knife zero converge name:server-name.kosmos.org`
3. **Code Style & Conventions**
- Chef recipes, resources, and roles are written in **Ruby**.
- Follow standard Chef and Ruby (RuboCop) idioms. Look at neighboring files in `site-cookbooks/` or `roles/` to match formatting and naming conventions.
## 🚨 Core Directives for AI Agents
1. **Infrastructure as Code**: Manual server configurations are highly discouraged. All changes must be codified in a cookbook or role.
2. **Test Safety Nets**: Look for `.kitchen.yml` within specific `site-cookbooks/<name>` to understand if local integration tests are available.
3. **No Assumptions**: Do not assume standard test commands. Check `README.md` and repository config files first.
4. **Secret Handling**: Avoid hardcoding passwords or API keys in recipes or roles. Assume sensitive information is managed via Chef `data_bags`.

View File

@@ -1,7 +0,0 @@
name "postgresql_replica_logical"
run_list %w(
kosmos_postgresql::hostsfile
kosmos_postgresql::replica_logical
kosmos_postgresql::firewall
)

View File

@@ -26,9 +26,9 @@
include_recipe 'apt'
include_recipe 'timezone_iii'
include_recipe 'ntp'
include_recipe 'kosmos-base::journald_conf'
include_recipe 'kosmos-base::systemd_emails'
include_recipe "ntp" if node["platform"] == "ubuntu" && node["platform_version"].to_f < 24.04
node.override["apt"]["unattended_upgrades"]["enable"] = true
node.override["apt"]["unattended_upgrades"]["mail_only_on_error"] = false

View File

@@ -1,9 +1,9 @@
release = "20260320"
img_filename = "ubuntu-22.04-server-cloudimg-amd64-disk-kvm"
release = "20260321"
img_filename = "ubuntu-24.04-server-cloudimg-amd64"
node.default["kosmos_kvm"]["host"]["qemu_base_image"] = {
"url" => "https://cloud-images.ubuntu.com/releases/jammy/release-#{release}/#{img_filename}.img",
"checksum" => "f7173eb7137b4f0ebeaea8fffe68ecdab1e3c787bde1fd8dfdf27103554332b3",
"url" => "https://cloud-images.ubuntu.com/releases/noble/release-#{release}/#{img_filename}.img",
"checksum" => "5c3ddb00f60bc455dac0862fabe9d8bacec46c33ac1751143c5c3683404b110d",
"path" => "/var/lib/libvirt/images/base/#{img_filename}-#{release}.qcow2"
}

View File

@@ -70,7 +70,7 @@ virt-install \
--vcpus "$CPUS" \
--cpu host \
--arch x86_64 \
--osinfo detect=on,name=ubuntujammy \
--osinfo detect=on,name=ubuntu24.04 \
--hvm \
--virt-type kvm \
--disk "$IMAGE_PATH" \

View File

@@ -1,8 +1,3 @@
node.default['kosmos_postgresql']['postgresql_version'] = "14"
# This is set to false by default, and set to true in the server resource
# for replicas.
node.default['kosmos_postgresql']['ready_to_set_up_replica'] = false
# Address space from which clients are allowed to connect
node.default['kosmos_postgresql']['access_addr'] = "10.1.1.0/24"

View File

@@ -1,34 +0,0 @@
#!/bin/bash
set -e
echo "== Creating publication in each database =="
for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template0','template1')"); do
echo "Processing DB: $db"
# Create publication (idempotent)
psql -d "$db" -v ON_ERROR_STOP=1 <<SQL
DO \$\$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_publication WHERE pubname = 'migrate_pub'
) THEN
CREATE PUBLICATION migrate_pub FOR ALL TABLES;
END IF;
END
\$\$;
SQL
# Create logical replication slot (idempotent-ish)
SLOT="migrate_slot_${db}"
if ! psql -d "$db" -Atqc "SELECT 1 FROM pg_replication_slots WHERE slot_name = '$SLOT'" | grep -q 1; then
echo " Creating slot: $SLOT"
psql -d "$db" -c "SELECT pg_create_logical_replication_slot('$SLOT', 'pgoutput');"
else
echo " Slot already exists: $SLOT"
fi
done
echo "== Done =="

View File

@@ -1,33 +0,0 @@
set -e
echo "== Dropping subscriptions slots and publications on PRIMARY =="
for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template0','template1')"); do
echo "Processing DB: $db"
SLOT="migrate_slot_${db}"
# Drop slot if exists
if psql -d "$db" -Atqc "SELECT 1 FROM pg_replication_slots WHERE slot_name = '$SLOT'" | grep -q 1; then
echo " Dropping slot: $SLOT"
psql -d "$db" -c "SELECT pg_drop_replication_slot('$SLOT');"
else
echo " Slot not found: $SLOT"
fi
# Drop publication if exists
psql -d "$db" -v ON_ERROR_STOP=1 <<SQL
DO \$\$
BEGIN
IF EXISTS (
SELECT 1 FROM pg_publication WHERE pubname = 'migrate_pub'
) THEN
DROP PUBLICATION migrate_pub;
END IF;
END
\$\$;
SQL
done
echo "== Done =="

View File

@@ -1,28 +0,0 @@
set -e
echo "== Dropping subscriptions on PG14 =="
for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template0,'template1'')"); do
echo "Processing DB: $db"
SUB="migrate_sub_${db}"
# Disable first (important)
psql -d "$db" -c "ALTER SUBSCRIPTION $SUB DISABLE;" 2>/dev/null || true
# Drop subscription if exists
psql -d "$db" -v ON_ERROR_STOP=1 <<SQL
DO \$\$
BEGIN
IF EXISTS (
SELECT 1 FROM pg_subscription WHERE subname = '$SUB'
) THEN
DROP SUBSCRIPTION $SUB;
END IF;
END
\$\$;
SQL
done
echo "== Done =="

View File

@@ -1,9 +0,0 @@
#!/bin/bash
cd /tmp && \
(pg_dumpall --globals-only > globals.sql) && \
psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN (''template0'')" | \
xargs -I{} -P4 sh -c "
pg_dump -Fd -j 4 -d \"{}\" -f dump_{} &&
tar -cf - dump_{} | zstd -19 -T0 > dump_{}.tar.zst &&
rm -rf dump_{}
"

View File

@@ -1,5 +0,0 @@
#!/bin/bash
for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn"); do
echo "DB: $db"
psql -d "$db" -Atqc "SELECT pubname FROM pg_publication;"
done

View File

@@ -1,5 +0,0 @@
#!/bin/bash
psql -c "
SELECT slot_name,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn))
FROM pg_replication_slots;"

View File

@@ -1,5 +0,0 @@
#!/bin/bash
for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template0','template1')"); do
echo "==== DB: $db ===="
psql -d "$db" -c "SELECT * FROM pg_stat_subscription;"
done

View File

@@ -1,8 +0,0 @@
#!/bin/bash
cd /tmp
for f in dump_*.tar.zst; do
db=$(echo $f | sed "s/dump_\(.*\)\.tar\.zst/\1/")
echo "Restoring $db"
zstd -d "$f" -c | tar -xf -
pg_restore -j 4 -d "$db" dump_$db
done

View File

@@ -36,16 +36,10 @@ class Chef
end
end
def postgresql_version
node['kosmos_postgresql']['postgresql_version']
end
def postgresql_service_name
postgresql_version = "12"
def postgresql_service
"postgresql@#{postgresql_version}-main"
end
def postgresql_data_dir
"/var/lib/postgresql/#{postgresql_version}/main"
end
end
end

View File

@@ -3,41 +3,31 @@
# Recipe:: primary
#
postgresql_version = "12"
postgresql_service = "postgresql@#{postgresql_version}-main"
service postgresql_service do
supports restart: true, status: true, reload: true
end
postgresql_custom_server postgresql_version do
role "primary"
end
cookbook_file "/usr/local/bin/pg_dump_all_databases" do
source "dump_all_databases.sh"
user "postgres"
group "postgres"
mode "0744"
postgresql_access "zerotier members" do
access_type "host"
access_db "all"
access_user "all"
access_addr "10.1.1.0/24"
access_method "md5"
notifies :reload, "service[#{postgresql_service}]", :immediately
end
cookbook_file "/usr/local/bin/pg_create_replication_publications" do
source "create_publications.sh"
user "postgres"
group "postgres"
mode "0744"
end
cookbook_file "/usr/local/bin/pg_drop_replication_publications" do
source "drop_publications.sh"
user "postgres"
group "postgres"
mode "0744"
end
cookbook_file "/usr/local/bin/pg_list_replication_publications" do
source "list_publications.sh"
user "postgres"
group "postgres"
mode "0744"
end
cookbook_file "/usr/local/bin/pg_list_replication_slots" do
source "list_replication_slots.sh"
user "postgres"
group "postgres"
mode "0744"
postgresql_access "zerotier members replication" do
access_type "host"
access_db "replication"
access_user "replication"
access_addr "10.1.1.0/24"
access_method "md5"
notifies :reload, "service[#{postgresql_service}]", :immediately
end

View File

@@ -3,32 +3,54 @@
# Recipe:: replica
#
service postgresql_service do
supports restart: true, status: true, reload: true
end
postgresql_version = "12"
postgresql_service = "postgresql@#{postgresql_version}-main"
postgresql_custom_server postgresql_version do
role "replica"
end
service postgresql_service do
supports restart: true, status: true, reload: true
end
postgresql_data_bag_item = data_bag_item('credentials', 'postgresql')
primary = postgresql_primary
if primary.nil?
Chef::Log.warn("No PostgreSQL primary node found. Skipping replication setup.")
return
end
unless primary.nil?
# TODO
postgresql_data_dir = "/var/lib/postgresql/#{postgresql_version}/main"
execute "set up replication" do
command <<-EOF
# FIXME get zerotier IP
execute "set up replication" do
command <<-EOF
systemctl stop #{postgresql_service}
mv #{postgresql_data_dir} #{postgresql_data_dir}.old
pg_basebackup -h pg.kosmos.local -U replication -D #{postgresql_data_dir} -R
chown -R postgres:postgres #{postgresql_data_dir}
systemctl start #{postgresql_service}
EOF
environment 'PGPASSWORD' => postgresql_data_bag_item['replication_password']
sensitive true
not_if { ::File.exist? "#{postgresql_data_dir}/standby.signal" }
EOF
environment 'PGPASSWORD' => postgresql_data_bag_item['replication_password']
sensitive true
not_if { ::File.exist? "#{postgresql_data_dir}/standby.signal" }
end
postgresql_access "zerotier members" do
access_type "host"
access_db "all"
access_user "all"
access_addr "10.1.1.0/24"
access_method "md5"
notifies :reload, "service[#{postgresql_service}]", :immediately
end
postgresql_access "zerotier members replication" do
access_type "host"
access_db "replication"
access_user "replication"
access_addr "10.1.1.0/24"
access_method "md5"
notifies :reload, "service[#{postgresql_service}]", :immediately
end
end

View File

@@ -1,50 +0,0 @@
#
# Cookbook:: kosmos_postgresql
# Recipe:: replica_logical
#
service postgresql_service do
supports restart: true, status: true, reload: true
end
postgresql_custom_server postgresql_version do
role "replica_logical"
end
postgresql_data_bag_item = data_bag_item('credentials', 'postgresql')
primary = postgresql_primary
if primary.nil?
Chef::Log.warn("No PostgreSQL primary node found. Skipping replication setup.")
return
end
template "/usr/local/bin/pg_create_replication_subscriptions" do
source "create_subscriptions.sh.erb"
user "postgres"
group "postgres"
mode "0740"
sensitive true
end
cookbook_file "/usr/local/bin/pg_drop_replication_subscriptions" do
source "drop_subscriptions.sh"
user "postgres"
group "postgres"
mode "0744"
end
cookbook_file "/usr/local/bin/pg_list_replication_subscriptions" do
source "list_subscriptions.sh"
user "postgres"
group "postgres"
mode "0744"
end
cookbook_file "/usr/local/bin/pg_restore_all_databases" do
source "restore_all_databases.sh"
user "postgres"
group "postgres"
mode "0744"
end

View File

@@ -56,9 +56,7 @@ action :create do
timezone: "UTC", # default is GMT
listen_addresses: "0.0.0.0",
promote_trigger_file: "#{postgresql_data_dir}/failover.trigger",
wal_level: "logical",
wal_keep_size: 4096, # 256 segments, 16MB each
max_replication_slots: 16
wal_keep_segments: 256
}
postgresql_server_conf "main" do
@@ -72,24 +70,6 @@ action :create do
replication true
password postgresql_credentials['replication_password']
end
postgresql_access "all members" do
access_type "host"
access_db "all"
access_user "all"
access_addr node['kosmos_postgresql']['access_addr']
access_method "md5"
notifies :reload, "service[#{postgresql_service}]", :immediately
end
postgresql_access "replication members" do
access_type "host"
access_db "replication"
access_user "replication"
access_addr node['kosmos_postgresql']['access_addr']
access_method "md5"
notifies :reload, "service[#{postgresql_service}]", :immediately
end
end
action_class do

View File

@@ -1,33 +0,0 @@
set -e
echo "== Creating subscriptions for all databases =="
for db in $(psql -Atqc "SELECT datname FROM pg_database WHERE datallowconn AND datname NOT IN ('template0','template1')"); do
echo "Processing DB: $db"
SLOT="migrate_slot_${db}"
SUB="migrate_sub_${db}"
psql -d "$db" -v ON_ERROR_STOP=1 <<SQL
DO \$\$
BEGIN
IF NOT EXISTS (
SELECT 1 FROM pg_subscription WHERE subname = '$SUB'
) THEN
CREATE SUBSCRIPTION $SUB
CONNECTION 'host=<%= @pg_host %> port=<%= @pg_port %> dbname=$db user=<%= @pg_user %> password=<%= @pg_pass %>'
PUBLICATION migrate_pub
WITH (
slot_name = '$SLOT',
create_slot = false,
copy_data = false,
enabled = true
);
END IF;
END
\$\$;
SQL
done
echo "== Done =="