diff --git a/.gitignore b/.gitignore index 0e5d31d..2ba2638 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ */__pycache__/ check_patroni.egg-info test/*.state_file +test/vagrant/.vagrant .*.swp diff --git a/test/vagrant/LICENSE b/test/vagrant/LICENSE new file mode 100644 index 0000000..e149d75 --- /dev/null +++ b/test/vagrant/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2019, Jehan-Guillaume (ioguix) de Rorthais +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/test/vagrant/Makefile b/test/vagrant/Makefile new file mode 100644 index 0000000..64b27c3 --- /dev/null +++ b/test/vagrant/Makefile @@ -0,0 +1,26 @@ +# Disable Vagrant's box update check and its checkpoint (version/alert) calls. +export VAGRANT_BOX_UPDATE_CHECK_DISABLE=1 +export VAGRANT_CHECKPOINT_DISABLE=1 + +.PHONY: all prov clean validate + +all: prov + +# Create the VM(s) and run the provisioners. +prov: + vagrant up --provision + +# Destroy the VM(s) without asking for confirmation. +clean: + vagrant destroy -f + +# Validate the Vagrantfile, then lint provision/* if shellcheck is installed. +validate: + @vagrant validate + @if command -v shellcheck >/dev/null 2>&1 ;\ + then shellcheck provision/* ;\ + else echo "WARNING: shellcheck is not in PATH, not checking bash syntax" ;\ + fi + + + diff --git a/test/vagrant/README.md b/test/vagrant/README.md new file mode 100644 index 0000000..81339bc --- /dev/null +++ b/test/vagrant/README.md @@ -0,0 +1,127 @@ +# Icinga + +## Install + +Create the VM: + +``` +make +``` + +## IcingaWeb + +Configure Icingaweb : + +``` +http://$IP/icingaweb2/setup +``` + +* Screen 1: Welcome + + Use the icinga token given at the end of the `icinga2-setup` provision, or: + + ``` + sudo icingacli setup token show + ``` + + Next + +* Screen 2: Modules + + Activate Monitor (already set) + + Next + +* Screen 3: Icinga Web 2 + + Next + +* Screen 4: Authentication + + Next + +* Screen 5: Database Resource + + Database Name: icingaweb_db + Username: supervisor + Password: th3Pass + Charset: UTF8 + + Validate + Next + +* Screen 6: Authentication Backend + + Next + +* Screen 7: Administration + + Fill the blanks + Next + +* Screen 8: Application Configuration + + Next + +* Screen 9: Summary + + Next + +* Screen 10: Welcome ... 
again + + Next + +* Screen 11: Monitoring IDO Resource + + Database Name: icinga2 + Username: supervisor + Password: th3Pass + Charset: UTF8 + + Validate + Next + +* Screen 12: Command Transport + + Transport name: icinga2 + Transport Type: API + Host: 127.0.0.1 + Port: 5665 + User: icinga_api + Password: th3Pass + + Next + +* Screen 13: Monitoring Security + + Next + +* Screen 14: Summary + + Finish + +* Screen 15: Hopefully success + + Login + +## Add servers to icinga + +``` +# Connect to the vm +vagrant ssh s1 + +# Create /etc/icinga2/conf.d/check_patroni.conf +sudo /vagrant/provision/director.bash init cluster1 p1=10.20.89.54 p2=10.20.89.55 + +# Check and load conf +sudo icinga2 daemon -C +sudo systemctl restart icinga2.service +``` + +# Grafana + +Connect to: http://10.20.89.52:3000/login +User / pass: admin/admin + +Import the dashboards from the grafana directory. They are created for cluster1, +and servers p1, p2. diff --git a/test/vagrant/Vagrantfile b/test/vagrant/Vagrantfile new file mode 100644 index 0000000..03d35c0 --- /dev/null +++ b/test/vagrant/Vagrantfile @@ -0,0 +1,63 @@ +require 'ipaddr' +#require 'yaml' + +ENV["LC_ALL"] = 'en_US.utf8' + +myBox = 'debian/buster64' +myProvider = 'libvirt' + +pgver = 11 +start_ip = '10.20.89.51' +etcd_nodes = [] +patroni_nodes = [] +sup_nodes = ['s1'] + +Vagrant.configure(2) do |config| + config.vm.provider myProvider + + next_ip = IPAddr.new(start_ip).succ + host_ip = (IPAddr.new(start_ip) & "255.255.255.0").succ.to_s + nodes_ips = {} + + ( patroni_nodes + etcd_nodes + sup_nodes ).each do |node| + nodes_ips[node] = next_ip.to_s + next_ip = next_ip.succ + end + + # don't mind about insecure ssh key + config.ssh.insert_key = false + + # https://vagrantcloud.com/search. 
+ config.vm.box = myBox + + # hardware and host settings + config.vm.provider 'libvirt' do |lv| + lv.cpus = 1 + lv.memory = 512 + lv.watchdog model: 'i6300esb' + lv.default_prefix = 'patroni_' + lv.qemu_use_session = false + end + + # disable default share (NFS is not working directly in DEBIAN 10) + config.vm.synced_folder ".", "/vagrant", type: "rsync" + # share the repository root (two levels up; relative host paths resolve from this Vagrantfile's directory) + config.vm.synced_folder "../..", "/check_patroni", type: "rsync" + + ## allow root@vm to ssh to ssh_login@network_1 + #config.vm.synced_folder 'ssh', '/root/.ssh', type: 'rsync', + # owner: 'root', group: 'root', + # rsync__args: [ "--verbose", "--archive", "--delete", "--copy-links", "--no-perms" ] + + # system setup for sup nodes + (sup_nodes).each do |node| + config.vm.define node do |conf| + conf.vm.network 'private_network', ip: nodes_ips[node] + conf.vm.provision 'icinga2-setup', type: 'shell', path: 'provision/icinga2.bash', + args: [ node ], + preserve_order: true + conf.vm.provision 'check_patroni', type: 'shell', path: 'provision/check_patroni.bash', + preserve_order: true + end + end +end diff --git a/test/vagrant/grafana/cluster_status_cluster1.json b/test/vagrant/grafana/cluster_status_cluster1.json new file mode 100644 index 0000000..f6ab13b --- /dev/null +++ b/test/vagrant/grafana/cluster_status_cluster1.json @@ -0,0 +1,793 @@ +{ + "__inputs": [ + { + "name": "DS_OPM", + "label": "opm", + "description": "", + "type": "datasource", + "pluginId": "postgres", + "pluginName": "PostgreSQL" + }, + { + "name": "VAR_CLUSTER_NAME", + "type": "constant", + "label": "cluster_name", + "value": "cluster1", + "description": "" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.3.3" + }, + { + "type": "datasource", + "id": "postgres", + "name": "PostgreSQL", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", 
"name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1640960519458, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 20, + "x": 0, + "y": 0 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE 
m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$cluster_name' AND s.service = 'check_patroni_cluster_has_replica'\n ) \n AND m.label ilike '%lag%' \nGROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Cluster replica lag", + "type": "timeseries" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$cluster_name' AND s.service = 'check_patroni_cluster_has_leader'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + 
"title": "Cluster has primary", + "type": "stat" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 2 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$cluster_name' AND s.service = 'check_patroni_cluster_config_has_changed'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Cluster config has changed", + "type": "stat" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 
2, + "w": 4, + "x": 20, + "y": 4 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$cluster_name' AND s.service = 'check_patroni_cluster_is_in_maintenance'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Cluster is in maintenance", + "type": "stat" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "role_leader" + }, + "properties": [ + { + "id": "displayName", + "value": "leader" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "role_replica" + }, + "properties": [ + { + "id": "displayName", + "value": "replicas" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "state_running" + }, + "properties": [ + { + "id": "displayName", + "value": "running" + } + ] + } + ] + }, + 
"gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM public.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id \n FROM public.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$cluster_name' AND s.service = 'check_patroni_cluster_node_count'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Cluster node count", + "type": "stat" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + 
"value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "healthy_replica" + }, + "properties": [ + { + "id": "displayName", + "value": "healthy" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "unhealthy_replica" + }, + "properties": [ + { + "id": "displayName", + "value": "unhealthy" + } + ] + } + ] + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM public.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id \n FROM public.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$cluster_name' AND s.service = 'check_patroni_cluster_has_replica'\n )\n AND m.label IN('healthy_replica','unhealthy_replica') \n GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Cluster has replica", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 34, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "hide": 2, + "name": "cluster_name", + "query": "${VAR_CLUSTER_NAME}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_CLUSTER_NAME}", + "text": "${VAR_CLUSTER_NAME}", + "selected": false + }, + "options": [ + { + "value": "${VAR_CLUSTER_NAME}", + "text": "${VAR_CLUSTER_NAME}", + 
"selected": false + } + ] + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": true, + "text": "1m", + "value": "1m" + }, + "hide": 0, + "name": "interval", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Cluster status: cluster1", + "uid": "4BullO0nk", + "version": 10, + "weekStart": "" +} \ No newline at end of file diff --git a/test/vagrant/grafana/node_status_p1.json b/test/vagrant/grafana/node_status_p1.json new file mode 100644 index 0000000..3b868c7 --- /dev/null +++ b/test/vagrant/grafana/node_status_p1.json @@ -0,0 +1,496 @@ +{ + "__inputs": [ + { + "name": "DS_OPM", + "label": "opm", + "description": "", + "type": "datasource", + "pluginId": "postgres", + "pluginName": "PostgreSQL" + }, + { + "name": "VAR_NODE_NAME", + "type": "constant", + "label": "node_name", + "value": "p1", + "description": "" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.3.3" + }, + { + "type": "datasource", + "id": "postgres", + "name": "PostgreSQL", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", 
+ "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1640961009033, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "is_primary" + }, + "properties": [ + { + "id": "displayName", + "value": "Primaire" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "is_replica" + }, + "properties": [ + { + "id": "displayName", + "value": "Secondaire" + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 
'check_patroni_node_is_primary'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_replica'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "B", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Node type", + "type": "stat" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "is_alive" + }, + "properties": [ + { + "id": "displayName", + "value": "Node is alive" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "is_pending_restart" + }, + "properties": [ + { + "id": "displayName", + "value": "Node is pending restart" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "timeline" + }, + "properties": [ + { + "id": "displayName", + "value": "Current timeline" + } + ] + } + ] + }, + "gridPos": { + "h": 9, + 
"w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_alive'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_tl_has_changed'\n )\nAND m.label = 'timeline'\nGROUP BY time, m.label ORDER BY time", + "refId": "B", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + }, + { + "datasource": { + 
"type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_pending_restart'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "D", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Health stats", + "type": "stat" + } + ], + "schemaVersion": 34, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "hide": 2, + "name": "node_name", + "query": "${VAR_NODE_NAME}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_NAME}", + "text": "${VAR_NODE_NAME}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_NAME}", + "text": "${VAR_NODE_NAME}", + "selected": false + } + ] + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "1m", + "value": "1m" + }, + "hide": 0, + "name": "interval", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": 
"14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node status: p1", + "uid": "2LfUnFAnk", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/test/vagrant/grafana/node_status_p2.json b/test/vagrant/grafana/node_status_p2.json new file mode 100644 index 0000000..b2eb6f9 --- /dev/null +++ b/test/vagrant/grafana/node_status_p2.json @@ -0,0 +1,496 @@ +{ + "__inputs": [ + { + "name": "DS_OPM", + "label": "opm", + "description": "", + "type": "datasource", + "pluginId": "postgres", + "pluginName": "PostgreSQL" + }, + { + "name": "VAR_NODE_NAME", + "type": "constant", + "label": "node_name", + "value": "p2", + "description": "" + } + ], + "__elements": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.3.3" + }, + { + "type": "datasource", + "id": "postgres", + "name": "PostgreSQL", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "iteration": 1640960994907, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": 
null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "is_primary" + }, + "properties": [ + { + "id": "displayName", + "value": "Primaire" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "is_replica" + }, + "properties": [ + { + "id": "displayName", + "value": "Secondaire" + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_primary'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON 
h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_replica'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "B", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + } + ], + "title": "Node type", + "type": "stat" + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "is_alive" + }, + "properties": [ + { + "id": "displayName", + "value": "Node is alive" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "is_pending_restart" + }, + "properties": [ + { + "id": "displayName", + "value": "Node is pending restart" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "timeline" + }, + "properties": [ + { + "id": "displayName", + "value": "Current timeline" + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN 
public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_alive'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "A", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_tl_has_changed'\n )\nAND m.label = 'timeline'\nGROUP BY time, m.label ORDER BY time", + "refId": "B", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + "params": [], + "type": "macro" + } + ] + }, + { + "datasource": { + "type": "postgres", + "uid": "${DS_OPM}" + }, + "format": "time_series", + "group": [], + "hide": false, + "metricColumn": "none", + "rawQuery": true, + "rawSql": " SELECT $__timeGroup(timet, $interval) AS time, MAX(d.value), m.label AS metric\n FROM wh_nagios.metrics m,\nLATERAL wh_nagios.get_metric_data(m.id, $__timeFrom(), $__timeTo()) d\n WHERE m.id_service = (\n SELECT s.id FROM wh_nagios.services s \n JOIN public.servers h ON h.id=s.id_server\n WHERE h.hostname = '$node_name' AND s.service = 'check_patroni_node_is_pending_restart'\n ) GROUP BY time, m.label ORDER BY time", + "refId": "D", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "column" + } + ] + ], + "timeColumn": "time", + "where": [ + { + "name": "$__timeFilter", + 
"params": [], + "type": "macro" + } + ] + } + ], + "title": "Health stats", + "type": "stat" + } + ], + "schemaVersion": 34, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "hide": 2, + "name": "node_name", + "query": "${VAR_NODE_NAME}", + "skipUrlSync": false, + "type": "constant", + "current": { + "value": "${VAR_NODE_NAME}", + "text": "${VAR_NODE_NAME}", + "selected": false + }, + "options": [ + { + "value": "${VAR_NODE_NAME}", + "text": "${VAR_NODE_NAME}", + "selected": false + } + ] + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "1m", + "value": "1m" + }, + "hide": 0, + "name": "interval", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Node status: p2", + "uid": "2LfUnFAnkr", + "version": 1, + "weekStart": "" +} \ No newline at end of file diff --git a/test/vagrant/provision/check_patroni.bash b/test/vagrant/provision/check_patroni.bash new file mode 100755 index 0000000..cd67369 --- /dev/null +++ b/test/vagrant/provision/check_patroni.bash @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +info (){ + echo "$1" +} + +set -o errexit +set -o nounset +set -o 
pipefail + +info "#=============================================================================" +info "# check_patroni" +info "#=============================================================================" + +DEBIAN_FRONTEND=noninteractive apt install -q -y git python3-pip +pip3 install --upgrade pip + +cd /check_patroni +pip3 install . +ln -s /usr/local/bin/check_patroni /usr/lib/nagios/plugins/check_patroni + +check_patroni --version diff --git a/test/vagrant/provision/director.bash b/test/vagrant/provision/director.bash new file mode 100755 index 0000000..37cfbfe --- /dev/null +++ b/test/vagrant/provision/director.bash @@ -0,0 +1,392 @@ +#!/usr/bin/env bash + +info(){ + echo "$1" +} + +usage(){ + echo "$0 ACTION CLUSTER_NAME [NODE..]" + echo "" + echo " ACTION: init | add" + echo " CLUSTER: cluster name" + echo " NODE: HOST=IP" + echo " HOST: any name for icinga" + echo " IP: the IP" +} + +if [ "$#" -le "3" ]; then + usage + exit 1 +fi + +ACTION="$1" +shift +CLUSTER="$1" +shift +NODES=( "$@" ) + +TARGET="/etc/icinga2/conf.d/check_patroni.conf" + +#set -o errexit +set -o nounset +set -o pipefail + +init(){ + cat << '__EOF__' > "$TARGET" +# =================================================================== +# Check Commands +# =================================================================== +template CheckCommand "check_patroni" { + command = [ PluginDir + "/check_patroni" ] + + arguments = { + "--endpoints" = { + value = "$endpoints$" + order = -2 + repeat_key = true + } + "--timeout" = { + value = "$timeout$" + order = -1 + } + } +} + +object CheckCommand "check_patroni_node_is_alive" { + import "check_patroni" + + arguments += { + "node_is_alive" = { + order = 1 + } + } +} + +object CheckCommand "check_patroni_node_is_primary" { + import "check_patroni" + + arguments += { + "node_is_primary" = { + order = 1 + } + } +} + +object CheckCommand "check_patroni_node_is_replica" { + import "check_patroni" + + arguments += { + "node_is_replica" = { + order = 1 + 
} + } +} + +object CheckCommand "check_patroni_node_is_pending_restart" { + import "check_patroni" + + arguments += { + "node_is_pending_restart" = { + order = 1 + } + } +} + +object CheckCommand "check_patroni_node_patroni_version" { + import "check_patroni" + + arguments += { + "node_patroni_version" = { + order = 1 + } + "--patroni-version" = { + value = "$patroni_version$" + order = 2 + } + } +} + +object CheckCommand "check_patroni_node_tl_has_changed" { + import "check_patroni" + + arguments += { + "node_tl_has_changed" = { + order = 1 + } + "--state-file" = { + value = "/tmp/$state_file$" # a quick and dirty way for this poc + order = 2 + } + } +} + +# ------------------------------------------------------------------- + +object CheckCommand "check_patroni_cluster_has_leader" { + import "check_patroni" + + arguments += { + "cluster_has_leader" = { + order = 1 + } + } +} + +object CheckCommand "check_patroni_cluster_has_replica" { + import "check_patroni" + + arguments += { + "cluster_has_replica" = { + order = 1 + } + "--warning" = { + value = "$has_replica_warning$" + order = 2 + } + "--critical" = { + value = "$has_replica_critical$" + order = 3 + } + } +} + +object CheckCommand "check_patroni_cluster_config_has_changed" { + import "check_patroni" + + arguments += { + "cluster_config_has_changed" = { + order = 1 + } + "--state-file" = { + value = "/tmp/$state_file$" # a quick and dirty way for this poc + order = 2 + } + } +} + +object CheckCommand "check_patroni_cluster_is_in_maintenance" { + import "check_patroni" + + arguments += { + "cluster_is_in_maintenance" = { + order = 1 + } + } +} + +object CheckCommand "check_patroni_cluster_node_count" { + import "check_patroni" + + arguments += { + "cluster_node_count" = { + order = 1 + } + "--warning" = { + value = "$node_count_warning$" + order = 2 + } + "--critical" = { + value = "$node_count_critical$" + order = 3 + } + "--running-warning" = { + value = "$node_count_running_warning$" + order = 4 + } + 
"--running-critical" = { + value = "$node_count_running_critical$" + order = 5 + } + } +} + +# =================================================================== +# Services +# =================================================================== +template Service "check_patroni" { + max_check_attempts = 3 + check_interval = 1m # we spam a little for the sake of testing + retry_interval = 15 # we spam a little for the sake of testing + enable_perfdata = true + vars.timeout = 10 +} + +apply Service "check_patroni_node_is_alive" { + import "check_patroni" + check_command = "check_patroni_node_is_alive" + + assign where "patroni_servers" in host.groups +} + +apply Service "check_patroni_node_is_primary" { + import "check_patroni" + check_command = "check_patroni_node_is_primary" + + assign where "patroni_servers" in host.groups +} + +apply Service "check_patroni_node_is_replica" { + import "check_patroni" + check_command = "check_patroni_node_is_replica" + + assign where "patroni_servers" in host.groups +} + +apply Service "check_patroni_node_is_pending_restart" { + import "check_patroni" + check_command = "check_patroni_node_is_pending_restart" + + assign where "patroni_servers" in host.groups +} + +apply Service "check_patroni_node_patroni_version" { + import "check_patroni" + check_command = "check_patroni_node_patroni_version" + + assign where "patroni_servers" in host.groups +} + +apply Service "check_patroni_node_tl_has_changed" { + import "check_patroni" + vars.state_file = host.name + ".state" + check_command = "check_patroni_node_tl_has_changed" + + assign where "patroni_servers" in host.groups +} + +# ------------------------------------------------------------------- + +apply Service "check_patroni_cluster_has_leader" { + import "check_patroni" + check_command = "check_patroni_cluster_has_leader" + + assign where "patroni_clusters" in host.groups +} + +apply Service "check_patroni_cluster_has_replica" { + import "check_patroni" + check_command = 
"check_patroni_cluster_has_replica" + + assign where "patroni_clusters" in host.groups +} + +apply Service "check_patroni_cluster_config_has_changed" { + import "check_patroni" + vars.state_file = host.name + ".state" + check_command = "check_patroni_cluster_config_has_changed" + + assign where "patroni_clusters" in host.groups +} + +apply Service "check_patroni_cluster_is_in_maintenance" { + import "check_patroni" + check_command = "check_patroni_cluster_is_in_maintenance" + + assign where "patroni_clusters" in host.groups +} + +apply Service "check_patroni_cluster_node_count" { + import "check_patroni" + check_command = "check_patroni_cluster_node_count" + + assign where "patroni_clusters" in host.groups +} + +# =================================================================== +# Hosts meta +# =================================================================== +object HostGroup "patroni_servers" { + display_name = "patroni servers" +} + +template Host "patroni_servers" { + groups = [ "patroni_servers" ] + check_command = "hostalive" + + vars.patroni_version = "2.1.2" +} + +# ------------------------------------------------------------------- + +object HostGroup "patroni_clusters" { + display_name = "patroni clusters" +} + +template Host "patroni_clusters" { + groups = [ "patroni_clusters" ] + check_command = "dummy" +} + +# =================================================================== +# Hosts meta +# =================================================================== +__EOF__ +} + +add_hosts(){ + NODES=$@ + for N in "${NODES[@]}"; do + IP="${N##*=}" + HOST="${N%=*}" + + cat << __EOF__ >> "$TARGET" + +object Host "$HOST" { + import "patroni_servers" + + display_name = "Server patroni $HOST" + address = "$IP" + + vars.endpoints = [ "http://" + address + ":8008" ] +} +__EOF__ + done +} + +add_cluster(){ + CLUSTER=$1 + NODES=$2 + + NAME="" + IPS=" " + for N in "${NODES[@]}"; do + IP="${N##*=}" + HOST="${N%=*}" + + NAME="$NAME $HOST" + 
IPS="$IPS\"http://${IP}:8008\", " + done + + cat << __EOF__ >> "$TARGET" + +object Host "$CLUSTER" { + import "patroni_clusters" + + display_name = "Cluster: $CLUSTER ($NAME )" + + vars.endpoints = [$IPS ] + vars.has_replica_warning = "1:" + vars.has_replica_critical = "1:" + vars.node_count_warning = "2:" + vars.node_count_critical = "1:" + vars.node_count_running_warning = "2:" + vars.node_count_running_critical = "1:" +} +__EOF__ +} + +case "$ACTION" in + "init") + init + add_hosts "$NODES" + add_cluster "$CLUSTER" "$NODES" + ;; + "add") + add_hosts "$NODES" + add_cluster "$CLUSTER" "$NODES" + ;; + *) + usage + echo "error: invalid action" + exit 1 +esac diff --git a/test/vagrant/provision/icinga2.bash b/test/vagrant/provision/icinga2.bash new file mode 100755 index 0000000..023f45a --- /dev/null +++ b/test/vagrant/provision/icinga2.bash @@ -0,0 +1,347 @@ +#!/usr/bin/env bash + +info (){ + echo "$1" +} + +#set -o errexit +set -o nounset +set -o pipefail + +NODENAME="$1" +shift + +PG_ICINGA_USER_NAME="supervisor" +PG_ICINGA_USER_PWD="th3Pass" +PG_ICINGAWEB_USER_NAME="supervisor" +PG_ICINGAWEB_USER_PWD="th3Pass" +PG_DIRECTOR_USER_NAME="supervisor" +PG_DIRECTOR_USER_PWD="th3Pass" +PG_OPM_USER_NAME="opm" +PG_OPM_USER_PWD="th3Pass" +PG_GRAFANA_USER_NAME="supervisor" +PG_GRAFANA_USER_PWD="th3Pass" + +set_hostname(){ + info "#=============================================================================" + info "# hostname and /etc/hosts setup" + info "#=============================================================================" + hostnamectl set-hostname "${NODENAME}" + sed --in-place -e "s/\(127\.0\.0\.1\s*localhost$\)/\1 ${NODENAME}/" /etc/hosts +} + +packages(){ + info "#=============================================================================" + info "# install required repos and packages" + info "#=============================================================================" + apt-get update || true + apt-get -y install apt-transport-https wget gnupg 
software-properties-common + + DIST=$(awk -F"[)(]+" '/VERSION=/ {print $2}' /etc/os-release) + echo "deb https://packages.icinga.com/debian icinga-${DIST} main" > "/etc/apt/sources.list.d/${DIST}-icinga.list" + echo "deb-src https://packages.icinga.com/debian icinga-${DIST} main" >> "/etc/apt/sources.list.d/${DIST}-icinga.list" + echo "deb https://packages.grafana.com/oss/deb stable main" > /etc/apt/sources.list.d/grafana.list + echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list + + wget -q -O - https://packages.icinga.com/icinga.key | apt-key add - + wget -q -O - https://packages.grafana.com/gpg.key | sudo apt-key add - + wget -q -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - + + apt-get update || true + + PACKAGES=( + grafana + icinga2 icinga2-ido-pgsql icingaweb2 icingaweb2-module-monitoring icingacli + postgresql-client postgresql-14 + php7.3-pgsql php7.3-imagick php7.3-intl + nagios-plugins + ) + DEBIAN_FRONTEND=noninteractive apt install -q -y "${PACKAGES[@]}" + + systemctl --quiet --now enable postgresql@14 +} + +icinga_setup(){ + info "#=============================================================================" + info "# Icinga setup" + info "#=============================================================================" + +## this part is already done by the standart icinga install with the user icinga2 +## and a random password, here we dont really care + + cat << __EOF__ | sudo -u postgres psql +DROP ROLE IF EXISTS supervisor; +DROP DATABASE IF EXISTS icinga2; +CREATE ROLE ${PG_ICINGA_USER_NAME} WITH LOGIN SUPERUSER PASSWORD '${PG_ICINGA_USER_PWD}'; +CREATE DATABASE icinga2; +__EOF__ + echo "*:*:*:${PG_ICINGA_USER_NAME}:${PG_ICINGA_USER_PWD}" > ~postgres/.pgpass + chown postgres:postgres ~postgres/.pgpass + chmod 600 ~postgres/.pgpass + PGPASSFILE=~postgres/.pgpass psql -U $PG_ICINGA_USER_NAME -h 127.0.0.1 -d icinga2 -f 
/usr/share/icinga2-ido-pgsql/schema/pgsql.sql + + icingacli setup config directory --group icingaweb2 + icingacli setup token create + +## this part is already done by the standart icinga install with the user icinga2 + cat << __EOF__ > /etc/icinga2/features-available/ido-pgsql.conf +/** + * The db_ido_pgsql library implements IDO functionality + * for PostgreSQL. + */ + +library "db_ido_pgsql" + +object IdoPgsqlConnection "ido-pgsql" { + user = "${PG_ICINGA_USER_NAME}", + password = "${PG_ICINGA_USER_PWD}", + host = "localhost", + database = "icinga2" +} +__EOF__ + icinga2 feature enable ido-pgsql + icinga2 feature enable command + icinga2 feature enable perfdata + +#icinga2 node wizard + icinga2 node setup --master --cn s1 --zone master + + systemctl restart icinga2.service +} + +icinga_API(){ + info "#=============================================================================" + info "# Icinga API" + info "#=============================================================================" + icinga2 api setup + + cat <<__EOF__ >> /etc/icinga2/conf.d/api-users.conf +object ApiUser "icingaapi" { + password = "th3Pass" + permissions = [ "*" ] +} +__EOF__ + systemctl restart icinga2.service +} + +icinga_web(){ + info "#=============================================================================" + info "# Icinga2 Web" + info "#=============================================================================" + if [ "$PG_ICINGA_USER_NAME" != "$PG_ICINGAWEB_USER_NAME" ]; then + sudo -u postgres psql -c "CREATE ROLE ${PG_ICINGAWEB_USER_NAME} WITH LOGIN PASSWORD '${PG_ICINGAWEB_USER_PWD}';" + fi + sudo -u postgres psql -c "CREATE DATABASE icingaweb_db OWNER ${PG_ICINGAWEB_USER_NAME};" + + sed --in-place -e "s/;date\.timezone =/date.timezone = europe\/paris/" /etc/php/7.3/apache2/php.ini + a2enconf icingaweb2 + a2enmod rewrite + a2dismod mpm_event + a2enmod php7.3 + + systemctl restart apache2 +} + +director(){ + info 
"#=============================================================================" + info "# Icinga director" + info "#=============================================================================" +# Create the database + if [ "$PG_ICINGA_USER_NAME" != "$PG_DIRECTOR_USER_NAME" ]; then + sudo -u postgres psql -c "CREATE ROLE ${PG_DIRECTOR_USER_NAME} WITH LOGIN PASSWORD '${PG_DIRECTOR_USER_PWD}';" + fi + sudo -u postgres psql -c "CREATE DATABASE director_db OWNER ${PG_DIRECTOR_USER_NAME};" + sudo -iu postgres psql -d director_db -c "CREATE EXTENSION pgcrypto;" + +## Prereq + MODULE_NAME=incubator + MODULE_VERSION=v0.11.0 + MODULES_PATH="/usr/share/icingaweb2/modules" + MODULE_PATH="${MODULES_PATH}/${MODULE_NAME}" + RELEASES="https://github.com/Icinga/icingaweb2-module-${MODULE_NAME}/archive" + mkdir "$MODULE_PATH" \ + && wget -q $RELEASES/${MODULE_VERSION}.tar.gz -O - \ + | tar xfz - -C "$MODULE_PATH" --strip-components 1 + icingacli module enable "${MODULE_NAME}" + +## Director + MODULE_VERSION="1.8.1" + ICINGAWEB_MODULEPATH="/usr/share/icingaweb2/modules" + REPO_URL="https://github.com/icinga/icingaweb2-module-director" + TARGET_DIR="${ICINGAWEB_MODULEPATH}/director" + URL="${REPO_URL}/archive/v${MODULE_VERSION}.tar.gz" + + useradd -r -g icingaweb2 -d /var/lib/icingadirector -s /bin/false icingadirector + install -d -o icingadirector -g icingaweb2 -m 0750 /var/lib/icingadirector + install -d -m 0755 "${TARGET_DIR}" + wget -q -O - "$URL" | tar xfz - -C "${TARGET_DIR}" --strip-components 1 + cp "${TARGET_DIR}/contrib/systemd/icinga-director.service" /etc/systemd/system/ + + icingacli module enable director + systemctl daemon-reload + systemctl enable icinga-director.service + systemctl start icinga-director.service + +# The permission have to be like this to let icingaweb activate modules + chown -R www-data:icingaweb2 /etc/icingaweb2 +} + +grafana(){ + info "#=============================================================================" + info "# Grafana" + info 
"#=============================================================================" + if [ "$PG_ICINGA_USER_NAME" != "$PG_GRAFANA_USER_NAME" ]; then + sudo -u postgres psql -c "CREATE ROLE ${PG_GRAFANA_USER_NAME} WITH LOGIN PASSWORD '${PG_GRAFANA_USER_PWD}';" + fi + sudo -u postgres psql -c "CREATE DATABASE grafana OWNER ${PG_GRAFANA_USER_NAME};" + + cat << __EOF__ > /etc/grafana/grafana.ini +[database] +# You can configure the database connection by specifying type, host, name, user and password +# as seperate properties or as on string using the url propertie. + +# Either "mysql", "postgres" or "sqlite3", it's your choice +type = postgres +host = 127.0.0.1:5432 +name = grafana +user = $PG_GRAFANA_USER_NAME +password = $PG_GRAFANA_USER_PWD +__EOF__ + systemctl --quiet --now enable grafana-server.service +} + +opm(){ + info "#=============================================================================" + info "# OPM" + info "#=============================================================================" + +## OPM Install + + DEBIAN_FRONTEND=noninteractive apt install -q -y postgresql-server-dev-10 libdbd-pg-perl git build-essential + + cd /usr/local/src || exit 1 + git clone https://github.com/OPMDG/opm-core.git + git clone https://github.com/OPMDG/opm-wh_nagios.git + cd /usr/local/src/opm-wh_nagios/pg/ || exit 1 + make install + cd /usr/local/src/opm-core/pg/ || exit 1 + make install + +## OPM db setup + + cat << __EOF__ | sudo -iu postgres psql +CREATE ROLE ${PG_OPM_USER_NAME} WITH LOGIN PASSWORD '${PG_OPM_USER_PWD}'; +CREATE DATABASE opm OWNER ${PG_OPM_USER_NAME}; +__EOF__ + cat << __EOF__ | sudo -iu postgres psql -d opm +CREATE EXTENSION opm_core; +CREATE EXTENSION wh_nagios CASCADE; +SELECT * FROM grant_dispatcher('wh_nagios', 'opm'); +__EOF__ + +## OPM dispatcher + + cat < /etc/opm_dispatcher.conf +daemon=0 +directory=/var/spool/icinga2/perfdata +frequency=5 +db_connection_string=dbi:Pg:dbname=opm host=localhost +db_user=${PG_OPM_USER_NAME} 
+db_password=${PG_OPM_USER_PWD} +debug=0 +syslog=1 +hostname_filter = /^$/ # Empty hostname. Never happens +service_filter = /^$/ # Empty service +label_filter = /^$/ # Empty label +EOF + + cat <<'EOF' > /etc/systemd/system/opm_dispatcher.service +[Unit] +Description=dispatcher nagios, import perf files from icinga to opm + +[Service] +User=nagios +Group=nagios +ExecStart=/usr/local/src/opm-wh_nagios/bin/nagios_dispatcher.pl -c /etc/opm_dispatcher.conf + +# start right after boot +Type=simple +# restart on crash +Restart=always +# after 10s +RestartSec=10 + +[Install] +WantedBy=multi-user.target +EOF + +## OPM planned task + + cat <<'EOF' > /etc/systemd/system/opm_dispatch_record.service +[Unit] +Description=Run wh_nagios.dispatch_record() on OPM database + +[Service] +Type=oneshot +User=postgres +Group=postgres +SyslogIdentifier=opm_dispatch_record +ExecStart=/usr/bin/psql -U postgres -d opm -c "SELECT * FROM wh_nagios.dispatch_record()" +EOF + + cat <<'EOF' > /etc/systemd/system/opm_dispatch_record.timer +[Unit] +Description=Timer to run wh_nagios.dispatch_record() on OPM + +[Timer] +OnBootSec=60s +OnUnitInactiveSec=1min + +[Install] +WantedBy=timers.target +EOF + + systemctl daemon-reload + systemctl enable opm_dispatcher + systemctl start opm_dispatcher + systemctl enable opm_dispatch_record.timer + systemctl start opm_dispatch_record.timer + +## To check once everything is setup (icingaweb is setup) +# sudo journalctl -fu opm_dispatcher +# sudo ournalctl -ft opm_dispatch_record + +## Grants for graphana + + sudo -iu postgres psql -c "CREATE ROLE grafana WITH LOGIN PASSWORD 'th3Pass'" + cat <