# k8s/charts/lcm-bricks/templates/prometheus/alertingRules.yaml
# ConfigMap carrying the LCM Prometheus recording and alerting rules.
# It is created in the fixed "monitoring" namespace, not in the release
# namespace; presumably that is where the Prometheus rule loader looks for
# it (confirm against the monitoring stack configuration).
apiVersion: v1
kind: ConfigMap
metadata:
  name: lcm-bricks-monitoring-rules
  namespace: monitoring
  labels:
    # Every templated value goes through `quote`: label values must be strings,
    # and a release name such as "123" would otherwise render as a YAML integer.
    app.kubernetes.io/name: {{ include "lcm-bricks.name" . | quote }}
    team: lcm
    helm.sh/chart: {{ include "lcm-bricks.chart" . | quote }}
    app.kubernetes.io/instance: {{ .Release.Name | quote }}
    app.kubernetes.io/managed-by: {{ .Release.Service | quote }}
# Prometheus rule file stored under a single ConfigMap key.
# Plain template actions are expanded by Helm at render time. The
# backtick-escaped actions are emitted verbatim so that Prometheus expands
# $labels itself when an alert fires. Explanatory notes inside the block
# scalar are Helm template comments, so they never reach the rendered file.
data:
  lcm-bricks-monitoring-rules.yaml: |+
    groups:
      - name: lcm-bricks-monitoring-rules
        rules:
          {{- /* Container restarts over a 10 minute window, release namespace only. */}}
          - record: "container_pod:lcm_pod_container_status_restarts:increase10m"
            expr: increase(kube_pod_container_status_restarts_total{namespace='{{ .Release.Namespace }}'}[10m])
          {{- /* First tier: at least one restart inside the window. */}}
          - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 1
            labels:
              severity: warning
              team: lcm
              {{- /* quote keeps a numeric-looking cluster id a string label value */}}
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "There is more than 0 restarts of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
              summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
          {{- /*
            Second tier: at least two restarts inside the window.
            NOTE(review): the alert name and severity equal the first tier, so
            both rules produce an identical label set whenever this one fires.
            Presumably this tier was meant to escalate (for example to
            severity critical); confirm the intent before changing it.
          */}}
          - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 2
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "There is more than 1 restart of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
              summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
          {{- /*
            OOMKilled terminations over a 10 minute window.
            NOTE(review): kube-state-metrics documents
            kube_pod_container_status_terminated_reason as a 0/1 gauge while
            increase() is designed for counters; verify that this series
            behaves as intended.
          */}}
          - record: "container_pod:lcm_pod_container_status_oomkilled:increase10m"
            expr: increase(kube_pod_container_status_terminated_reason{namespace='{{ .Release.Namespace }}', reason='OOMKilled'}[10m])
          {{- /*
            First OOMKill tier. The description states 10 minutes to match the
            window of the recorded series it evaluates.
          */}}
          - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 1
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
              summary: "{{`{{ $labels.pod }}`}} OOMKill occured"
          {{- /*
            Second OOMKill tier.
            NOTE(review): same name and severity as the first tier; see the
            matching remark on the restart alerts.
          */}}
          - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
            expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 2
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
              summary: "{{`{{ $labels.pod }}`}} OOMKill occured"
          {{- /*
            CPU throttling: more than one second of CFS throttled time per
            second, sustained for 5 minutes.
            The annotations prefer the pod label and fall back to pod_name, so
            the pod is named both on kubelets that still expose the legacy
            cAdvisor label and on those that only expose pod.
          */}}
          - alert: "[LCM] Container is being throttled on cluster={{ .Values.clusterId }}"
            expr: rate(container_cpu_cfs_throttled_seconds_total{namespace='{{ .Release.Namespace }}'}[1m]) > 1
            for: 5m
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "{{`{{ or $labels.pod $labels.pod_name }}`}} container is beeing throttled and probably hit CPU limit. Investigate root cause and increase limit and/or number of replicas if necessary."
              summary: "{{`{{ or $labels.pod $labels.pod_name }}`}} Container is being throttled"
          {{- /* JVM GC pauses: more than one second of pause per second, sustained for 5 minutes. */}}
          - alert: "[LCM] is doing too much pause GC on cluster={{ .Values.clusterId }}"
            expr: rate(jvm_gc_pause_seconds_sum{kubernetes_namespace='{{ .Release.Namespace }}'}[1m]) > 1
            for: 5m
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "{{`{{ $labels.kubernetes_pod_name }}`}} container is spending too much time in pause garbage collector. Investigate root cause and increase heap size and/or number of replicas if necessary."
              summary: "{{`{{ $labels.kubernetes_pod_name }}`}} is doing too much pause GC"
          {{- /*
            Job pile-up: more than 100 Job objects exist. The selector follows
            the release namespace like every other rule in this group instead
            of a hard-coded namespace name.
          */}}
          - alert: "[LCM] there is more than 100 jobs on cluster={{ .Values.clusterId }}"
            expr: count(kube_job_info{namespace='{{ .Release.Namespace }}'}) > 100
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "There is more than 100 jobs in LCM namespace. They are likely not deleted."
              summary: "There is more than 100 jobs in LCM namespace."
          {{- /* CPU quota exhausted: hard limits.cpu minus used limits.cpu equals zero. */}}
          - alert: "[LCM] Resource quotas hit CPU limit on cluster={{ .Values.clusterId }}"
            expr: kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="hard"} - ignoring(type) kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="used"} == 0
            labels:
              severity: warning
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "We are hitting CPU limit in LCM namespace."
              summary: "We are hitting CPU limit in LCM namespace."
          {{- /*
            Pod stuck in any phase other than Running, Succeeded or Failed for
            5 minutes. This is the only rule in the group with critical
            severity and a runbook link.
          */}}
          - alert: "[LCM] POD is in undesirable state on cluster={{ .Values.clusterId }}"
            expr: kube_pod_status_phase{namespace='{{ .Release.Namespace }}', phase!~"Running|Succeeded|Failed"} > 0
            for: 5m
            labels:
              severity: critical
              team: lcm
              cluster_id: {{ .Values.clusterId | quote }}
            annotations:
              description: "POD {{`{{ $labels.pod }}`}} is not in desirable state"
              summary: "POD is not in desirable state"
              runbook: "https://confluence.intgdc.com/display/plat/Generic+runbooks#Genericrunbooks-Podisinundesirablestate"