docknetwork
diff --git a/Diff for: ‎.maintain/monitoring/README.md
+13 b/Diff for: ‎.maintain/monitoring/README.md
+13
diff --git a/Diff for: ‎.maintain/monitoring/alerting-rules.yaml
+29 b/Diff for: ‎.maintain/monitoring/alerting-rules.yaml
+29
@@ -0,0 +1,13 @@
+## Substrate Dashboard
+
+We are using a very slightly modified version of the Robonomics dashboard (https://grafana.com/grafana/dashboards/13015), which shows Substrate Prometheus metrics as well as Node Exporter metrics.
+
+You can find our version in `./grafana-dashboard.json`
+
+## Prometheus and Alert Manager config
+
+Two files, `prometheus.yaml` and `alerting-rules.yaml`, are used for the Prometheus and Alert Manager config respectively. This simple configuration lets us scrape Substrate and Node Exporter metrics and gives us alerts through Alert Manager if there's node downtime. Please refer to the setup guide for more information.
+
+## Setup guide
+
+The good people at Robonomics have created a nice guide to get you started: https://github.com/hubobubo/robonomics/wiki/Robonomics-(XRT)-metrics-using-Prometheus-and-Grafana - you can follow it and then import either the dashboard JSON from here or their panel from Grafana.
@@ -0,0 +1,29 @@
+groups:
+  - name: alert_rules
+    rules:
+      # Critical alert: fires once `up == 0` has held for a target for a
+      # full minute. The annotations name the affected instance and job.
+      - alert: InstanceDown
+        expr: 'up == 0'
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Instance [{{ $labels.instance }}] down'
+          # Folded scalar: the two lines below are joined with a single space.
+          description: >-
+            [{{ $labels.instance }}] of job [{{ $labels.job }}] has been down
+            for more than 1 minute.
+
+      # Warning alert: fires when available memory, as a percentage of total
+      # memory, has stayed below 10 for two minutes.
+      - alert: HostOutOfMemory
+        expr: 'node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10'
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          # Quoted so the `{{ ... }}` template can never be read as YAML flow syntax.
+          summary: 'Host out of memory (instance {{ $labels.instance }})'
+          description: 'Node memory is filling up (< 10% left)'
+
+      # Warning alert: fires when the per-instance non-idle CPU percentage,
+      # derived from the idle-mode rate over a 2m window, exceeds 80.
+      - alert: HostHighCpuLoad
+        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80'
+        # NOTE(review): 0m means no hold period beyond the 2m rate window,
+        # unlike the other rules here - confirm this is intended.
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: 'Host high CPU load (instance {{ $labels.instance }})'
+          description: 'CPU load is > 80%'