gooddata/gooddata-ruby

View on GitHub
k8s/charts/lcm-bricks/templates/prometheus/alertingRules.yaml

Summary

Maintainability
Test Coverage
# ConfigMap carrying the Prometheus rule group for the LCM bricks release.
# Templated label values are piped through "quote" so that a release or chart
# name that looks like a number or boolean still renders as a YAML string.
apiVersion: v1
kind: ConfigMap
metadata:
  name: lcm-bricks-monitoring-rules
  # Fixed namespace rather than the release namespace.
  # NOTE(review): presumably where the Prometheus rule loader watches - confirm.
  namespace: monitoring
  labels:
    app.kubernetes.io/name: {{ include "lcm-bricks.name" . | quote }}
    team: lcm
    helm.sh/chart: {{ include "lcm-bricks.chart" . | quote }}
    app.kubernetes.io/instance: {{ .Release.Name | quote }}
    app.kubernetes.io/managed-by: {{ .Release.Service | quote }}
data:
  # Literal block scalar: everything below is the text of the rule file.
  # The "+" chomping indicator keeps any trailing newlines in the stored value.
  lcm-bricks-monitoring-rules.yaml: |+
    groups:
    - name: lcm-bricks-monitoring-rules
      rules:
      # Increase of kube_pod_container_status_restarts_total over the trailing
      # 10 minutes, restricted to the namespace the release is installed into.
      - record: "container_pod:lcm_pod_container_status_restarts:increase10m"
        expr: increase(kube_pod_container_status_restarts_total{namespace='{{ .Release.Namespace }}'}[10m])
      # Warning tier: the recorded increase reached 1 within the window.
      # The backtick-wrapped actions in the annotations make Helm emit the
      # Prometheus template text verbatim instead of evaluating $labels itself.
      - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 1
        labels:
          severity: warning
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "There is more than 0 restarts of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
          summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
      # Critical tier: the recorded increase reached 2 within the window.
      # This rule shares its alert name with the warning tier, so the severity
      # label is what distinguishes the two; with identical labels the rules
      # would be duplicates of each other.
      # NOTE(review): escalating to critical changes routing - confirm intent.
      - alert: "[LCM] Pod has too many restarts on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_restarts:increase10m >= 2
        labels:
          severity: critical
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "There is more than 1 restart of {{`{{ $labels.pod }}`}} pod in the last 10 minutes"
          summary: "{{`{{ $labels.pod }}`}} pod has too many restarts"
      # Increase of the OOMKilled terminated-reason series over the trailing
      # 10 minutes, restricted to the namespace the release is installed into.
      # NOTE(review): kube-state-metrics documents this metric as a gauge, while
      # increase() is meant for counters - confirm it behaves as intended.
      - record: "container_pod:lcm_pod_container_status_oomkilled:increase10m"
        expr: increase(kube_pod_container_status_terminated_reason{namespace='{{ .Release.Namespace }}', reason='OOMKilled'}[10m])
      # Warning tier: the recorded increase reached 1 within the window.
      # The description states 10 minutes to match the window of the recording
      # rule. The alert name keeps its original spelling so that anything
      # matching on alertname is unaffected.
      - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 1
        labels:
          severity: warning
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
          summary: "{{`{{ $labels.pod }}`}} OOMKill occurred"
      # Critical tier: the recorded increase reached 2 within the window.
      # This rule shares its alert name with the warning tier, so the severity
      # label is what distinguishes the two; with identical labels the rules
      # would be duplicates of each other.
      # NOTE(review): escalating to critical changes routing - confirm intent.
      - alert: "[LCM] OOMKill occured on cluster={{ .Values.clusterId }}"
        expr: container_pod:lcm_pod_container_status_oomkilled:increase10m >= 2
        labels:
          severity: critical
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.pod }}`}} was OOMKilled in the last 10 minutes. Investigate and/or increase memoryRequest or memoryLimit."
          summary: "{{`{{ $labels.pod }}`}} OOMKill occurred"
      # Fires when the per-second rate of CFS-throttled CPU seconds, taken over
      # a 1-minute range, stays above 1 for 5 minutes in the release namespace.
      # NOTE(review): the annotations read $labels.pod_name; cAdvisor exposes
      # this label as "pod" on Kubernetes 1.16 and later - confirm which one
      # this cluster provides.
      - alert: "[LCM] Container is being throttled on cluster={{ .Values.clusterId }}"
        expr: rate(container_cpu_cfs_throttled_seconds_total{namespace='{{ .Release.Namespace }}'}[1m]) > 1
        for: 5m
        labels:
          severity: warning
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.pod_name }}`}} container is being throttled and probably hit CPU limit. Investigate root cause and increase limit and/or number of replicas if necessary."
          summary: "{{`{{ $labels.pod_name }}`}} Container is being throttled"
      # Fires when a jvm_gc_pause_seconds_sum series grows by more than one
      # second per second (1-minute rate) for 5 minutes.
      # Unlike the other rules, this selector and its annotations use the
      # kubernetes_namespace and kubernetes_pod_name labels.
      # NOTE(review): presumably added by scrape relabeling - confirm.
      - alert: "[LCM] is doing too much pause GC on cluster={{ .Values.clusterId }}"
        expr: rate(jvm_gc_pause_seconds_sum{kubernetes_namespace='{{ .Release.Namespace }}'}[1m]) > 1
        for: 5m
        labels:
          severity: warning
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "{{`{{ $labels.kubernetes_pod_name }}`}} container is spending too much time in pause garbage collector. Investigate root cause and increase heap size and/or number of replicas if necessary."
          summary: "{{`{{ $labels.kubernetes_pod_name }}`}} is doing too much pause GC"
      # Fires when more than 100 kube_job_info series exist in the release
      # namespace. The namespace is taken from the release, like every other
      # selector in this group, instead of being hard-coded.
      # There is no "for" clause, so the alert fires on the first evaluation
      # that exceeds the threshold.
      - alert: "[LCM] there is more than 100 jobs on cluster={{ .Values.clusterId }}"
        expr: count(kube_job_info{namespace='{{ .Release.Namespace }}'}) > 100
        labels:
          severity: warning
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "There is more than 100 jobs in LCM namespace. They are likely not deleted."
          summary: "There is more than 100 jobs in LCM namespace."
      # Fires when the hard limits.cpu quota minus the used limits.cpu quota of
      # the release namespace equals zero. ignoring(type) lets the two series
      # match although their "type" label differs.
      # The expression is a folded scalar: its lines are joined with single
      # spaces, producing the same one-line query.
      - alert: "[LCM] Resource quotas hit CPU limit on cluster={{ .Values.clusterId }}"
        expr: >-
          kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="hard"}
          - ignoring(type)
          kube_resourcequota{namespace='{{ .Release.Namespace }}',resource="limits.cpu",type="used"}
          == 0
        labels:
          severity: warning
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "We are hitting CPU limit in LCM namespace."
          summary: "We are hitting CPU limit in LCM namespace."
      # Fires when a pod in the release namespace reports a phase other than
      # Running, Succeeded or Failed for 5 minutes. This is the only rule in
      # the group that starts out at critical severity and links a runbook.
      - alert: "[LCM] POD is in undesirable state on cluster={{ .Values.clusterId }}"
        expr: kube_pod_status_phase{namespace='{{ .Release.Namespace }}', phase!~"Running|Succeeded|Failed"} > 0
        for: 5m
        labels:
          severity: critical
          team: lcm
          cluster_id: {{ .Values.clusterId | quote }}
        annotations:
          description: "POD {{`{{ $labels.pod }}`}} is not in desirable state"
          summary: "POD is not in desirable state"
          runbook: "https://confluence.intgdc.com/display/plat/Generic+runbooks#Genericrunbooks-Podisinundesirablestate"