ministryofjustice/Claim-for-Crown-Court-Defence

View on GitHub
.k8s/live/dev/prometheus-custom-rules.yaml

Summary

Maintainability
Test Coverage
# PrometheusRule custom resource holding the CCCD alerting rules for the
# cccd-dev namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-custom-rules-cccd
  namespace: cccd-dev
  labels:
    role: alert-rules
    prometheus: cloud-platform
spec:
  groups:
  # One rule group; the alert entries that follow form its rules list.
  - name: application-rules
    rules:
    # Warns when any resource quota in the cccd-dev namespace has been more than
    # 90% consumed for at least one minute.
    - alert: Quota-Exceeded
      annotations:
        runbook_url: >-
          https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
        message: >-
          cccd-dev is using {{ printf "%0.0f" $value}}% of its {{ $labels.resource }} quota.
      labels:
        severity: laa-cccd-alerts
      for: 1m
      # Percentage used = 100 * used / hard. The "> 0" filter drops quotas whose
      # hard limit is zero, so the division never sees a zero denominator.
      expr: >-
        100 * kube_resourcequota{job="kube-state-metrics",type="used",namespace="cccd-dev"}
        / ignoring(instance, job, type)
        (kube_resourcequota{job="kube-state-metrics",type="hard",namespace="cccd-dev"} > 0)
        > 90
    # Fires when the ingress has served more than 100 HTTP 404 responses for
    # cccd-dev over the trailing 24 hours (per-second rate over an 86400s window,
    # multiplied back up by 86400), with the condition held for one minute.
    - alert: NotFound-Threshold-Reached
      # Match status 404 (Not Found), as the alert name and message describe;
      # 400 is Bad Request and would count the wrong responses.
      expr: sum(rate(nginx_ingress_controller_requests{exported_namespace="cccd-dev", status="404"}[86400s])) * 86400 > 100
      for: 1m
      labels:
        severity: laa-cccd-alerts
      annotations:
        message: cccd-dev More than a hundred 404 errors in one day
        # Kibana search over the last 24h of cccd-dev logs for "RoutingError".
        runbook_url: https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/kibana#/discover?_g=(refreshInterval:(pause:!t,value:0),time:(from:now-24h,mode:quick,to:now))&_a=(columns:!(_source),filters:!(('$state':(store:appState),meta:(alias:!n,disabled:!f,index:ec9109a0-2b35-11e9-ac82-95e56bd45b02,key:kubernetes.namespace_name,negate:!f,params:(query:cccd-dev,type:phrase),type:phrase,value:cccd-dev),query:(match:(kubernetes.namespace_name:(query:cccd-dev,type:phrase))))),index:ec9109a0-2b35-11e9-ac82-95e56bd45b02,interval:auto,query:(language:lucene,query:'log:%22RoutingError%22'),sort:!('@timestamp',desc))
    # Fires when the ingress has served more than 5 responses with a 5xx status
    # for cccd-dev over the trailing 5 minutes (per-second rate * 300), with the
    # condition held for one minute.
    - alert: nginx-5xx-error
      expr: sum(rate(nginx_ingress_controller_requests{exported_namespace="cccd-dev", status=~"5.."}[5m])) * 300 > 5
      for: 1m
      labels:
        severity: laa-cccd-alerts
      annotations:
        message: cccd-dev An HTTP 5xx error has occurred
        # Kibana search over the last 24h of cccd-dev logs with status 500.
        # The namespace filter's params now say cccd-dev, matching its value
        # and match query (it previously said cccd-production).
        runbook_url: https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/kibana#/discover?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-24h,to:now))&_a=(columns:!(log_processed.status,log_processed.http_referer,log_processed.request_uri),filters:!(('$state':(store:appState),meta:(alias:!n,disabled:!f,index:'71644ed0-d648-11ea-b6f0-6bf964cd13a4',key:log_processed.kubernetes_namespace,negate:!f,params:(query:cccd-dev),type:phrase,value:cccd-dev),query:(match:(log_processed.kubernetes_namespace:(query:cccd-dev,type:phrase)))),('$state':(store:appState),meta:(alias:!n,disabled:!f,index:'71644ed0-d648-11ea-b6f0-6bf964cd13a4',key:log_processed.status,negate:!f,params:(query:'500'),type:phrase,value:'500'),query:(match:(log_processed.status:(query:'500',type:phrase))))),index:'71644ed0-d648-11ea-b6f0-6bf964cd13a4',interval:auto,query:(language:lucene,query:''),sort:!(!('@timestamp',desc)))
    # Fires when the oldest message on the DEV responses-for-cccd queue has
    # reached the age threshold below for one minute.
    - alert: DEV-SQS-Responses-For-CCCD-oldest-message
      annotations:
        message: DEV SQS queue 'laa-get-paid-dev-responses-for-cccd' has messages older than or equal to 10 mins, check consumers are healthy.
        # Grafana explore link plotting the alert condition; its threshold is
        # 600 so that it matches the expr below (it previously used 10).
        dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_age_of_oldest_message_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-responses-for-cccd%5C%22%7D%20%3E%3D%20bool%20600%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
      # 10 * 60 = 600; this equals the "10 mins" in the message provided the
      # metric is reported in seconds (NOTE(review): confirm against the exporter).
      # Exact-match "=" is used because the queue name is a literal, not a pattern.
      expr: aws_sqs_approximate_age_of_oldest_message_maximum{queue_name="laa-get-paid-dev-responses-for-cccd"} >= 10 * 60
      for: 1m
      labels:
        severity: laa-cccd-alerts
    # Fires when the DEV responses-for-cccd queue has had 10 or more visible
    # messages for one minute.
    - alert: DEV-SQS-Responses-For-CCCD-Message-Threshold-Reached
      expr: >-
        aws_sqs_approximate_number_of_messages_visible_maximum{queue_name="laa-get-paid-dev-responses-for-cccd"}
        >= 10
      for: 1m
      labels:
        severity: laa-cccd-alerts
      annotations:
        # Grafana explore link plotting the same condition over the last 10 minutes.
        dashboard_url: >-
          https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22queryMode%22:%22Metrics%22,%22namespace%22:%22%22,%22metricName%22:%22%22,%22expression%22:%22%22,%22dimensions%22:%7B%7D,%22region%22:%22default%22,%22id%22:%22%22,%22statistic%22:%22Average%22,%22period%22:%22%22,%22metricQueryType%22:0,%22metricEditorMode%22:0,%22sqlExpression%22:%22%22,%22matchExact%22:true,%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_number_of_messages_visible_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-responses-for-cccd%5C%22%7D%20%3E%3D%20bool%2010%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false,%22label%22:%22%22,%22exemplar%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
        message: >-
          DEV SQS queue 'laa-get-paid-dev-responses-for-cccd' has more than or equal to 10 messages, check consumers are healthy.
    # Fires when the oldest message on the DEV cccd-claims-for-ccr queue has
    # reached the age threshold below for one minute.
    - alert: DEV-SQS-CCCD-Claims-For-CCR-oldest-message
      annotations:
        message: DEV SQS queue 'laa-get-paid-dev-cccd-claims-for-ccr' has messages older than or equal to 10 mins, check consumers are healthy.
        # Grafana explore link plotting the alert condition; its threshold is
        # 600 so that it matches the expr below (it previously used 10).
        dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_age_of_oldest_message_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-cccd-claims-for-ccr%5C%22%7D%20%3E%3D%20bool%20600%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
      # 10 * 60 = 600; this equals the "10 mins" in the message provided the
      # metric is reported in seconds (NOTE(review): confirm against the exporter).
      # Exact-match "=" is used because the queue name is a literal, not a pattern.
      expr: aws_sqs_approximate_age_of_oldest_message_maximum{queue_name="laa-get-paid-dev-cccd-claims-for-ccr"} >= 10 * 60
      for: 1m
      labels:
        severity: laa-cccd-alerts
    # Fires when the DEV cccd-claims-for-ccr queue has had 10 or more visible
    # messages for one minute.
    - alert: DEV-SQS-CCCD-Claims-For-CCR-Message-Threshold-Reached
      expr: >-
        aws_sqs_approximate_number_of_messages_visible_maximum{queue_name="laa-get-paid-dev-cccd-claims-for-ccr"}
        >= 10
      for: 1m
      labels:
        severity: laa-cccd-alerts
      annotations:
        # Grafana explore link plotting the same condition over the last 10 minutes.
        dashboard_url: >-
          https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22queryMode%22:%22Metrics%22,%22namespace%22:%22%22,%22metricName%22:%22%22,%22expression%22:%22%22,%22dimensions%22:%7B%7D,%22region%22:%22default%22,%22id%22:%22%22,%22statistic%22:%22Average%22,%22period%22:%22%22,%22metricQueryType%22:0,%22metricEditorMode%22:0,%22sqlExpression%22:%22%22,%22matchExact%22:true,%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_number_of_messages_visible_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-cccd-claims-for-ccr%5C%22%7D%20%3E%3D%20bool%2010%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false,%22label%22:%22%22,%22exemplar%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
        message: >-
          DEV SQS queue 'laa-get-paid-dev-cccd-claims-for-ccr' has more than or equal to 10 messages, check consumers are healthy.
    # Fires when the oldest message on the DEV cccd-claims-for-cclf queue has
    # reached the age threshold below for one minute.
    - alert: DEV-SQS-CCCD-Claims-For-CCLF-oldest-message
      annotations:
        message: DEV SQS queue 'laa-get-paid-dev-cccd-claims-for-cclf' has messages older than or equal to 10 mins, check consumers are healthy.
        # Grafana explore link plotting the alert condition; its threshold is
        # 600 so that it matches the expr below (it previously used 10).
        dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_age_of_oldest_message_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-cccd-claims-for-cclf%5C%22%7D%20%3E%3D%20bool%20600%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
      # 10 * 60 = 600; this equals the "10 mins" in the message provided the
      # metric is reported in seconds (NOTE(review): confirm against the exporter).
      # Exact-match "=" is used because the queue name is a literal, not a pattern.
      expr: aws_sqs_approximate_age_of_oldest_message_maximum{queue_name="laa-get-paid-dev-cccd-claims-for-cclf"} >= 10 * 60
      for: 1m
      labels:
        severity: laa-cccd-alerts
    # Fires when the DEV cccd-claims-for-cclf queue has had 10 or more visible
    # messages for one minute.
    - alert: DEV-SQS-CCCD-Claims-For-CCLF-Message-Threshold-Reached
      expr: >-
        aws_sqs_approximate_number_of_messages_visible_maximum{queue_name="laa-get-paid-dev-cccd-claims-for-cclf"}
        >= 10
      for: 1m
      labels:
        severity: laa-cccd-alerts
      annotations:
        # Grafana explore link plotting the same condition over the last 10 minutes.
        dashboard_url: >-
          https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22queryMode%22:%22Metrics%22,%22namespace%22:%22%22,%22metricName%22:%22%22,%22expression%22:%22%22,%22dimensions%22:%7B%7D,%22region%22:%22default%22,%22id%22:%22%22,%22statistic%22:%22Average%22,%22period%22:%22%22,%22metricQueryType%22:0,%22metricEditorMode%22:0,%22sqlExpression%22:%22%22,%22matchExact%22:true,%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_number_of_messages_visible_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-cccd-claims-for-cclf%5C%22%7D%20%3E%3D%20bool%2010%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false,%22label%22:%22%22,%22exemplar%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
        message: >-
          DEV SQS queue 'laa-get-paid-dev-cccd-claims-for-cclf' has more than or equal to 10 messages, check consumers are healthy.
    # Fires when the DEV responses dead-letter queue has had at least one
    # visible message for one minute.
    # NOTE(review): the queue name is spelled "reponses" in the message, the
    # dashboard link and the expr — confirm it matches the real SQS queue name
    # before changing it.
    - alert: DEV-SQS-CCCD-Dead-Letter-Queue-Threshold-Reached
      expr: >-
        aws_sqs_approximate_number_of_messages_visible_maximum{queue_name="laa-get-paid-dev-reponses-for-cccd-dlq"}
        >= 1
      for: 1m
      labels:
        severity: laa-cccd-alerts
      annotations:
        # Grafana explore link plotting the same condition over the last 10 minutes.
        dashboard_url: >-
          https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22queryMode%22:%22Metrics%22,%22namespace%22:%22%22,%22metricName%22:%22%22,%22expression%22:%22%22,%22dimensions%22:%7B%7D,%22region%22:%22default%22,%22id%22:%22%22,%22statistic%22:%22Average%22,%22period%22:%22%22,%22metricQueryType%22:0,%22metricEditorMode%22:0,%22sqlExpression%22:%22%22,%22matchExact%22:true,%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_number_of_messages_visible_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-reponses-for-cccd-dlq%5C%22%7D%20%3E%3D%20bool%201%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false,%22label%22:%22%22,%22exemplar%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
        message: >-
          DEV SQS queue 'laa-get-paid-dev-reponses-for-cccd-dlq' has more than or equal to 1 message, check consumers are healthy.
    # Fires when the DEV CCR claims-submitted dead-letter queue has had at least
    # one visible message for one minute.
    - alert: DEV-SQS-CCR-Dead-Letter-Queue-Threshold-Reached
      annotations:
        message: DEV SQS queue 'laa-get-paid-dev-cccd-claims-submitted-ccr-dlq' has more than or equal to 1 message, check consumers are healthy.
        # Grafana explore link plotting the same message-count condition.
        dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22queryMode%22:%22Metrics%22,%22namespace%22:%22%22,%22metricName%22:%22%22,%22expression%22:%22%22,%22dimensions%22:%7B%7D,%22region%22:%22default%22,%22id%22:%22%22,%22statistic%22:%22Average%22,%22period%22:%22%22,%22metricQueryType%22:0,%22metricEditorMode%22:0,%22sqlExpression%22:%22%22,%22matchExact%22:true,%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_number_of_messages_visible_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-cccd-claims-submitted-ccr-dlq%5C%22%7D%20%3E%3D%20bool%201%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false,%22label%22:%22%22,%22exemplar%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
      # Alert on the visible-message count, which is what the message and the
      # dashboard link describe; the age-of-oldest-message metric used
      # previously measures a different quantity.
      expr: aws_sqs_approximate_number_of_messages_visible_maximum{queue_name="laa-get-paid-dev-cccd-claims-submitted-ccr-dlq"} >= 1
      for: 1m
      labels:
        severity: laa-cccd-alerts
    # Fires when the DEV CCLF claims-submitted dead-letter queue has had at
    # least one visible message for one minute.
    - alert: DEV-SQS-CCLF-Dead-Letter-Queue-Threshold-Reached
      annotations:
        message: DEV SQS queue 'laa-get-paid-dev-cccd-claims-submitted-cclf-dlq' has more than or equal to 1 message, check consumers are healthy.
        # Grafana explore link plotting the same message-count condition.
        dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/explore?orgId=1&left=%7B%22datasource%22:%22prometheus%22,%22queries%22:%5B%7B%22queryMode%22:%22Metrics%22,%22namespace%22:%22%22,%22metricName%22:%22%22,%22expression%22:%22%22,%22dimensions%22:%7B%7D,%22region%22:%22default%22,%22id%22:%22%22,%22statistic%22:%22Average%22,%22period%22:%22%22,%22metricQueryType%22:0,%22metricEditorMode%22:0,%22sqlExpression%22:%22%22,%22matchExact%22:true,%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22prometheus%22,%22uid%22:%22prometheus%22%7D,%22editorMode%22:%22builder%22,%22expr%22:%22aws_sqs_approximate_number_of_messages_visible_maximum%7Bqueue_name%3D%5C%22laa-get-paid-dev-cccd-claims-submitted-cclf-dlq%5C%22%7D%20%3E%3D%20bool%201%22,%22legendFormat%22:%22__auto%22,%22range%22:true,%22instant%22:false,%22label%22:%22%22,%22exemplar%22:false%7D%5D,%22range%22:%7B%22from%22:%22now-10m%22,%22to%22:%22now%22%7D%7D
      # Alert on the visible-message count, which is what the message and the
      # dashboard link describe; the age-of-oldest-message metric used
      # previously measures a different quantity.
      expr: aws_sqs_approximate_number_of_messages_visible_maximum{queue_name="laa-get-paid-dev-cccd-claims-submitted-cclf-dlq"} >= 1
      for: 1m
      labels:
        severity: laa-cccd-alerts