Remove upstream prometheus cookbook, migrate to our own

This commit is contained in:
2026-07-04 15:27:18 +02:00
parent 2d835335b5
commit 63534e1cf5
37 changed files with 230 additions and 1480 deletions
@@ -1,8 +1,19 @@
# Default attributes: pinned release versions and tarball checksums for
# Prometheus, Alertmanager and node_exporter, plus defaults for the rendered
# Prometheus configuration.
# NOTE(review): every version/checksum pair appears under both the "prometheus"
# and the "kosmos_prometheus" namespace here — presumably the removed and added
# sides of the diff shown together; confirm which lines are in the final file.
node.default["prometheus"]["version"] = "3.13.0"
node.default["prometheus"]["checksum"] = "744d93324cc024d82089921737bd797474d7f1e5dbbfd1c6b387bad258538cb9"
node.default["kosmos_prometheus"]["version"] = "3.13.0"
node.default["kosmos_prometheus"]["checksum"] = "744d93324cc024d82089921737bd797474d7f1e5dbbfd1c6b387bad258538cb9"
node.default["prometheus"]["alertmanager"]["version"] = "0.33.0"
node.default["prometheus"]["alertmanager"]["checksum"] = "8ce11c42e8a6dfbbf93a59c0b193cb1329210b36d0c7ef3df7b745608675a1d1"
node.default["kosmos_prometheus"]["alertmanager"]["version"] = "0.33.0"
node.default["kosmos_prometheus"]["alertmanager"]["checksum"] = "8ce11c42e8a6dfbbf93a59c0b193cb1329210b36d0c7ef3df7b745608675a1d1"
node.default["prometheus"]["node_exporter"]["version"] = "1.11.1"
node.default["prometheus"]["node_exporter"]["checksum"] = "9f5ea48e5bc7b656f8a91a32e7d7deb89f70f73dabd0d974418aca15f37d6810"
node.default["kosmos_prometheus"]["node_exporter"]["version"] = "1.11.1"
node.default["kosmos_prometheus"]["node_exporter"]["checksum"] = "9f5ea48e5bc7b656f8a91a32e7d7deb89f70f73dabd0d974418aca15f37d6810"
# Settings passed to the prometheus.yml template as `global_config`.
node.default["kosmos_prometheus"]["global"] = {
"scrape_interval" => "30s",
"evaluation_interval" => "30s",
}
# Scrape jobs passed to the prometheus.yml template as `jobs`
# (job name => settings hash; only "targets" is set by default).
node.default["kosmos_prometheus"]["jobs"] = {
"prometheus" => { "targets" => ["localhost:9090"] }
}
# Rule file paths passed to the template as `rule_files`; empty by default.
node.default["kosmos_prometheus"]["rule_files"] = []
+1 -1
View File
@@ -6,7 +6,7 @@ description 'Installs/Configures prometheus'
version '0.1.0'
chef_version '>= 16.0'
depends "prometheus"
depends "firewall"
# The `issues_url` points to the location where issues for this cookbook are
# tracked. A `View Issues` link will be displayed on this cookbook's page when
@@ -5,20 +5,89 @@
include_recipe "firewall"
prometheus_alertmanager_install "alertmanager" do
version node["prometheus"]["alertmanager"]["version"]
checksum node["prometheus"]["alertmanager"]["checksum"]
# Release coordinates for Alertmanager. `version`, `checksum`, `tarball` and
# `binary_url` are read by the download/install resources further down.
alertmanager_attrs = node["kosmos_prometheus"]["alertmanager"]
version = alertmanager_attrs["version"]
checksum = alertmanager_attrs["checksum"]
release_name = "alertmanager-#{version}.linux-amd64"
tarball = "#{Chef::Config[:file_cache_path]}/#{release_name}.tar.gz"
binary_url = "https://github.com/prometheus/alertmanager/releases/download/v#{version}/#{release_name}.tar.gz"

# Dedicated system account the service runs as (no login shell, no real home).
group "alertmanager"

user "alertmanager" do
  system true
  gid "alertmanager"
  home "/nonexistent"
  shell "/bin/false"
end
prometheus_alertmanager_config "alertmanager"
# Storage directory handed to the service via --storage.path; owned by the
# alertmanager account so the daemon can write to it.
directory "/var/lib/alertmanager" do
  recursive true
  mode "0755"
  group "alertmanager"
  owner "alertmanager"
end
execute "restart alertmanager config" do
command "systemctl restart alertmanager.service"
# Root-owned directory that holds the rendered alertmanager.yml.
directory "/etc/prometheus" do
  recursive true
  mode "0755"
  group "root"
  owner "root"
end

# Archive tools installed ahead of unpacking the release tarball.
package %w[tar bzip2]
# Downloads the release tarball into Chef's file cache using the pinned
# checksum, and triggers the unpack step right away whenever the file changes.
remote_file tarball do
  action :create
  checksum checksum
  source binary_url
  notifies :run, "execute[install_alertmanager]", :immediately
end
# Unpacks only the alertmanager binary from the downloaded release tarball
# into /usr/local/bin. It does nothing on its own (action :nothing) and runs
# only when the remote_file resource for the tarball notifies it; afterwards
# it schedules a delayed restart of the service.
#
# The subscription to template[/opt/prometheus/alertmanager.yml] is dropped:
# this recipe renders its configuration to /etc/prometheus/alertmanager.yml,
# so the old path is stale, and a configuration change is no reason to unpack
# the binary again.
execute "install_alertmanager" do
  command "tar -xzf #{tarball} -C /usr/local/bin --strip-components=1 alertmanager-#{version}.linux-amd64/alertmanager"
  action :nothing
  notifies :restart, "service[alertmanager]", :delayed
end
prometheus_alertmanager_service "alertmanager"
# Keeps the installed binary root-owned and executable; a change here
# restarts the service at the end of the run.
file "/usr/local/bin/alertmanager" do
  mode "0755"
  group "root"
  owner "root"
  notifies :restart, "service[alertmanager]", :delayed
end

# Renders the Alertmanager configuration from the cookbook template; a changed
# file restarts the service at the end of the run.
template "/etc/prometheus/alertmanager.yml" do
  source "alertmanager.yml.erb"
  mode "0644"
  group "alertmanager"
  owner "root"
  notifies :restart, "service[alertmanager]", :delayed
end
# systemd unit for Alertmanager: runs the binary from /usr/local/bin as the
# alertmanager user/group, reads /etc/prometheus/alertmanager.yml, stores state
# in /var/lib/alertmanager, listens on :9093 and is restarted by systemd five
# seconds after a failure.
systemd_unit "alertmanager.service" do
  content({
    Unit: {
      Description: "Prometheus Alertmanager",
      After: "network.target",
    },
    Service: {
      Type: "simple",
      User: "alertmanager",
      Group: "alertmanager",
      ExecStart: "/usr/local/bin/alertmanager --config.file=/etc/prometheus/alertmanager.yml --storage.path=/var/lib/alertmanager --web.listen-address=:9093",
      Restart: "on-failure",
      RestartSec: "5",
    },
    Install: {
      WantedBy: "multi-user.target",
    },
  })
  # Makes systemd re-read unit files after the unit changes.
  triggers_reload true
  action :create
  # A daemon-reload alone does not affect the already running process, so a
  # changed unit (e.g. new ExecStart flags) must also restart the service,
  # the same way the binary and configuration resources do.
  notifies :restart, "service[alertmanager]", :delayed
end
# Enables the unit at boot and makes sure it is running; also the target of
# the restart notifications sent by the resources above.
service "alertmanager" do
  action %i[enable start]
end
firewall_rule "prometheus alertmanager" do
port 9093
@@ -5,8 +5,8 @@
include_recipe "firewall"
version = node["prometheus"]["node_exporter"]["version"]
checksum = node["prometheus"]["node_exporter"]["checksum"]
# Release coordinates for node_exporter: version and checksum come from the
# kosmos_prometheus node attributes; the tarball path lives in Chef's file
# cache and the URL points at the matching GitHub release asset.
version = node["kosmos_prometheus"]["node_exporter"]["version"]
checksum = node["kosmos_prometheus"]["node_exporter"]["checksum"]
tarball = "#{Chef::Config[:file_cache_path]}/node_exporter-#{version}.linux-amd64.tar.gz"
binary_url = "https://github.com/prometheus/node_exporter/releases/download/v#{version}/node_exporter-#{version}.linux-amd64.tar.gz"
@@ -5,45 +5,101 @@
include_recipe "firewall"
prometheus_install "prometheus" do
version node["prometheus"]["version"]
checksum node["prometheus"]["checksum"]
# Release coordinates for Prometheus. `version`, `checksum`, `tarball` and
# `binary_url` are read by the download/install resources further down.
prometheus_attrs = node["kosmos_prometheus"]
version = prometheus_attrs["version"]
checksum = prometheus_attrs["checksum"]
release_name = "prometheus-#{version}.linux-amd64"
tarball = "#{Chef::Config[:file_cache_path]}/#{release_name}.tar.gz"
binary_url = "https://github.com/prometheus/prometheus/releases/download/v#{version}/#{release_name}.tar.gz"

# Dedicated system account the service runs as (no login shell, no real home).
group "prometheus"

user "prometheus" do
  system true
  gid "prometheus"
  home "/nonexistent"
  shell "/bin/false"
end
prometheus_config "prometheus" do
global_config(
"scrape_interval" => "30s",
"evaluation_interval" => "30s"
# Data directory passed to the server as --storage.tsdb.path; owned by the
# prometheus account so the daemon can write to it.
directory "/var/lib/prometheus" do
  recursive true
  mode "0755"
  group "prometheus"
  owner "prometheus"
end

# Root-owned configuration tree: the main config directory first, then the
# rules directory beneath it.
%w[/etc/prometheus /etc/prometheus/rules].each do |config_dir|
  directory config_dir do
    recursive true
    mode "0755"
    group "root"
    owner "root"
  end
end

# Archive tools installed ahead of unpacking the release tarball.
package %w[tar bzip2]
# Downloads the release tarball into Chef's file cache using the pinned
# checksum, and triggers the unpack step right away whenever the file changes.
remote_file tarball do
  action :create
  checksum checksum
  source binary_url
  notifies :run, "execute[install_prometheus]", :immediately
end

# Unpacks only the prometheus binary into /usr/local/bin. Idle unless notified
# by the download above; schedules a service restart once it has run.
execute "install_prometheus" do
  action :nothing
  command "tar -xzf #{tarball} -C /usr/local/bin --strip-components=1 prometheus-#{version}.linux-amd64/prometheus"
  notifies :restart, "service[prometheus]", :delayed
end
# Keeps the installed binary root-owned and executable; a change here
# restarts the service at the end of the run.
file "/usr/local/bin/prometheus" do
  mode "0755"
  group "root"
  owner "root"
  notifies :restart, "service[prometheus]", :delayed
end

# Renders the server configuration from node attributes. A changed file only
# reloads the service (the unit defines ExecReload) instead of restarting it.
settings = node["kosmos_prometheus"]

template "/etc/prometheus/prometheus.yml" do
  source "prometheus.yml.erb"
  mode "0644"
  group "prometheus"
  owner "root"
  variables(
    rule_files: settings["rule_files"],
    jobs: settings["jobs"],
    global_config: settings["global"]
  )
  notifies :reload, "service[prometheus]", :delayed
end
# NOTE(review): the resources below rely on the prometheus_job custom resource
# and on the /opt/prometheus path — presumably the removed side of this diff,
# superseded by the attribute-driven template; confirm before relying on them.
# Presumably declares a scrape job for the Prometheus server itself.
prometheus_job "prometheus" do
target "localhost:9090"
end
# Presumably declares a scrape job for a local node_exporter on port 9100.
prometheus_job "node" do
target "localhost:9100"
end
# Declared in the root run context; the execute does nothing by itself and
# runs `systemctl reload` only when the subscribed template resource changes.
with_run_context :root do
execute "reload prometheus config" do
command "systemctl reload prometheus.service"
action :nothing
subscribes :run, "template[/opt/prometheus/prometheus.yml]", :delayed
end
end
prometheus_service "prometheus" do
cli_options({
"config.file" => "/opt/prometheus/prometheus.yml",
"log.level" => "info",
"query.max-concurrency" => 20,
"query.lookback-delta" => "5m",
"query.timeout" => "2m",
"storage.tsdb.path" => "/var/lib/prometheus",
"storage.tsdb.retention.time" => "15d",
"web.listen-address" => ":9090"
# systemd unit for the Prometheus server: runs the binary from /usr/local/bin
# as the prometheus user/group, reads /etc/prometheus/prometheus.yml, stores
# data in /var/lib/prometheus with 15 days of retention, listens on :9090,
# reloads its configuration on SIGHUP (ExecReload) and is restarted by systemd
# five seconds after a failure.
#
# NOTE(review): --web.enable-lifecycle additionally exposes HTTP reload/quit
# endpoints on the web port. Reloads in this recipe go through ExecReload, so
# the flag looks unnecessary — confirm nothing depends on it and that the
# firewall rule for the web port is restricted accordingly.
systemd_unit "prometheus.service" do
  content({
    Unit: {
      Description: "Prometheus",
      After: "network.target",
    },
    Service: {
      Type: "simple",
      User: "prometheus",
      Group: "prometheus",
      ExecStart: "/usr/local/bin/prometheus --config.file=/etc/prometheus/prometheus.yml --storage.tsdb.path=/var/lib/prometheus --storage.tsdb.retention.time=15d --web.listen-address=:9090 --web.enable-lifecycle",
      ExecReload: "/bin/kill -HUP $MAINPID",
      Restart: "on-failure",
      RestartSec: "5",
    },
    Install: {
      WantedBy: "multi-user.target",
    },
  })
  # Makes systemd re-read unit files after the unit changes.
  triggers_reload true
  action :create
  # A daemon-reload alone does not affect the already running process, so a
  # changed unit (e.g. new ExecStart flags) must also restart the service,
  # the same way the binary resources do.
  notifies :restart, "service[prometheus]", :delayed
end
# Enables the unit at boot and makes sure it is running; also the target of
# the restart/reload notifications sent by the resources above.
service "prometheus" do
  action %i[enable start]
end
firewall_rule "prometheus web" do
@@ -0,0 +1,12 @@
# Minimal Alertmanager configuration: every alert is grouped by alert name and
# routed to the single "default" receiver, which has no notification
# integrations configured here.
global:
  resolve_timeout: 5m

route:
  receiver: default
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h

receivers:
  - name: default
@@ -0,0 +1,31 @@
<%# Prometheus server configuration rendered from three template variables: %>
<%#   @global_config - hash written under `global:`; each value is emitted as a quoted string %>
<%#   @jobs          - hash of job name => settings ('targets', optional 'scrape_interval', %>
<%#                    'scrape_timeout', 'metrics_path' (default '/metrics') and 'labels') %>
<%#   @rule_files    - list of rule file paths; the section is omitted when nil or empty %>
global:
<% @global_config.each do |k, v| %>
  <%= k %>: "<%= v %>"
<% end %>

scrape_configs:
<% @jobs.each do |name, job| %>
  - job_name: "<%= name %>"
<% if job['scrape_interval'] %>
    scrape_interval: "<%= job['scrape_interval'] %>"
<% end %>
<% if job['scrape_timeout'] %>
    scrape_timeout: "<%= job['scrape_timeout'] %>"
<% end %>
    metrics_path: "<%= job.fetch('metrics_path', '/metrics') %>"
    static_configs:
<%# Targets are rendered explicitly as a quoted flow list instead of relying on Array#inspect. %>
      - targets: [<%= Array(job['targets']).map { |target| "\"#{target}\"" }.join(', ') %>]
<% if job['labels'] %>
        labels:
<% job['labels'].each do |label, label_config| %>
          <%= label %>: "<%= label_config %>"
<% end %>
<% end %>
<% end %>
<% if @rule_files && !@rule_files.empty? %>

rule_files:
<% @rule_files.each do |filename| %>
  - "<%= filename %>"
<% end %>
<% end %>