BluePrint

Prometheus

# Docker Compose stack for a single Prometheus server. The container joins the
# pre-existing "monitoring" network, which must be created before `up`.
version: '3.8'
services:
  prometheus:
    container_name: prom
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # `--storage.tsdb.retention` is deprecated and was removed in
      # Prometheus 3.0; with the `latest` image the old flag aborts startup.
      - '--storage.tsdb.retention.time=1h'
      # Size-based retention; applies together with the time-based limit.
      - '--storage.tsdb.retention.size=1TB'
      - '--storage.tsdb.wal-compression'
      # Enables the HTTP reload/shutdown endpoints (/-/reload, /-/quit).
      - '--web.enable-lifecycle'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    volumes:
      # Main configuration file, kept outside this compose directory.
      - ../../config/prometheus-ec2/prometheus.yml:/etc/prometheus/prometheus.yml
      # Alerting and recording rule files.
      - ./rules:/etc/prometheus/rules
    ports:
      - "9090:9090"

networks:
  default:
    # `external: {name: ...}` is deprecated since file format 3.5; the
    # supported spelling is `external: true` plus a top-level `name`.
    external: true
    name: monitoring
config/prometheus-ec2/prometheus.yml
# Prometheus server configuration: scrapes Prometheus itself and every EC2
# instance tagged `monitoring=true` (node-exporter on port 9100).
global:
  scrape_interval: 15s
  evaluation_interval: 15s

  external_labels:
    monitor: 'devops-monitoring'

# Load every rule file from the directory the compose file mounts at
# /etc/prometheus/rules. Leaving `rule_files:` empty parses as null, so no
# alerting or recording rules would ever be evaluated.
rule_files:
  - '/etc/prometheus/rules/*.yaml'

scrape_configs:
  # Self-scrape; overrides the global interval with a shorter one.
  - job_name: 'prometheus'
    scrape_interval: 5s
    static_configs:
      - targets: [ 'localhost:9090' ]

  # node-exporter targets discovered through the EC2 API.
  - job_name: 'node'
    metrics_path: /metrics
    ec2_sd_configs:
      - region: ap-northeast-2
        port: 9100 # node-exporter
        refresh_interval: 10s # default 60s
        # Only instances carrying the tag monitoring=true are discovered.
        filters:
          - name: tag:monitoring
            values:
              - 'true'
    relabel_configs:
      # Scrape address = <private IP>:9100.
      - source_labels: [ __meta_ec2_private_ip ]
        regex: '(.*)'
        replacement: '${1}:9100'
        action: replace
        target_label: __address__
      # Copy selected EC2 metadata onto the scraped series as plain labels.
      - source_labels: [ __meta_ec2_tag_Name ]
        target_label: instance
      - source_labels: [ __meta_ec2_tag_group ]
        target_label: group
      - source_labels: [ __meta_ec2_instance_type ]
        target_label: instance_type
      - source_labels: [ __meta_ec2_architecture ]
        target_label: arch
Rules - alerts.yaml
# Alerting rules: Prometheus process CPU usage and blackbox probe failures.
groups:
  - name: prometheus-cpu
    interval: 10s
    rules:
      - alert: alerts:cpu_usage:prometheus:80
        # Per-second CPU time over the last minute, scaled to a percentage.
        # The threshold is 80 to match the alert name and title; the previous
        # `> 0` comparison fired for practically any running process.
        expr: rate(process_cpu_seconds_total{job=~"prometheus"}[1m]) * 100 > 80
        for: 5s
        labels:
          service: prometheus
          # Was misspelled `serverity`, which routing or grouping on the
          # conventional `severity` label would never match.
          severity: critical
        annotations:
          title: "Prometheus CPU alert 80%"
          summary: "{{ $labels.instance }}: {{ $value }} ( {{ $labels.job }} )"
          description: "{{ $labels.instance }}: {{ $value }} ( {{ $labels.job }} )"

  - name: blackbox-monitor
    interval: 10s
    # 0 means no limit on the number of alerts/series the group may produce.
    limit: 0
    rules:
      - alert: alerts:probe_success
        # Fires when a probe reports failure.
        expr: probe_success == 0
        for: 5s
        labels:
          # Was misspelled `serverity`.
          severity: critical
        annotations:
          title: "black-box monitoring alert"
          summary: "black-box monitoring alert ( {{ $labels.job }} )"
          description: "{{ $labels.instance }} is down"


Rules - rules.yaml
# Recording rules. The single rule divides the summed query duration by the
# query count, matched on the `slice` label, and stores the quotient as a new
# series tagged `dj="test"`.
groups:
  - name: 'dj-rule'
    limit: 0
    interval: 10s
    rules:
      - record: 'rules:prometheus_engine_query:speed'
        labels:
          dj: 'test'
        # Folded scalar: the three lines join with single spaces into one
        # PromQL expression.
        expr: >-
          prometheus_engine_query_duration_seconds_sum
          / on (slice) group_left
          prometheus_engine_query_duration_seconds_count

Grafana

# Docker Compose stack for Grafana plus its image renderer. Both containers
# join the pre-existing "monitoring" network, which must be created before `up`.
version: '3.8'
services:
  grafana:
    container_name: grafana
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      # Provisioned datasource definition.
      - ../../config/grafana/datasource.yml:/etc/grafana/provisioning/datasources/datasource.yml
    environment:
      # Rendering is delegated to the `renderer` service below; the callback
      # URL is how the renderer reaches Grafana over the shared network.
      GF_RENDERING_SERVER_URL: "http://renderer:8081/render"
      GF_RENDERING_CALLBACK_URL: "http://grafana:3000/"
      GF_LOG_FILTERS: "rendering:debug"
      GF_USERS_DEFAULT_THEME: "light"
      # NOTE(review): anonymous access with the Admin role gives every
      # visitor of port 3000 full control of Grafana. Acceptable only on a
      # private test host; use "Viewer" or disable anonymous auth otherwise.
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
      GF_AUTH_ANONYMOUS_ENABLED: "true"
  renderer:
    image: grafana/grafana-image-renderer:latest
    ports:
      - "8081:8081"
    environment:
      IGNORE_HTTPS_ERRORS: "true"

networks:
  default:
    # `external: {name: ...}` is deprecated since file format 3.5; the
    # supported spelling is `external: true` plus a top-level `name`.
    external: true
    name: monitoring
config/grafana/datasource.yml
# Grafana datasource provisioning file.
# `apiVersion` is the version of the provisioning file format.
apiVersion: 1

datasources:
  # One Prometheus datasource, queried through the Grafana backend
  # (`access: proxy`). It is not the default datasource and cannot be
  # edited from the UI.
  - name: 'Prometheus'
    type: 'prometheus'
    orgId: 1
    version: 1
    url: 'http://prometheus:9090'
    access: 'proxy'
    basicAuth: false
    isDefault: false
    editable: false

EC2 Provisioning

#!/bin/bash
#
# Provisions a monitoring host on EC2:
#   1. installs and starts the CloudWatch agent with a downloaded config,
#   2. installs Docker, git and docker-compose,
#   3. creates the shared "monitoring" Docker network,
#   4. clones the compose repository and starts the Prometheus, Grafana and
#      node-exporter stacks.
#
# Every privileged step goes through sudo, so the script behaves the same
# whether it is run as root or as ec2-user. It is safe to re-run.

# Stop at the first failing step instead of continuing on a half-provisioned host.
set -euo pipefail

readonly CWAGENT_CONFIG=/opt/aws/amazon-cloudwatch-agent/bin/config.json
readonly COMPOSE_BIN=/usr/local/bin/docker-compose
readonly REPO_DIR=/home/ec2-user/devops_prometheus

# install cwagent
sudo yum install -y amazon-cloudwatch-agent

# download config (sudo so the write under /opt/aws succeeds when not root)
sudo wget \
  https://raw.githubusercontent.com/dev-chulbuji/devops_infra/master/apne2/dev/ec2/bastion/templates/cloudwatch-agent-config.json \
  -O "${CWAGENT_CONFIG}"

# run agent
sudo amazon-cloudwatch-agent-ctl \
  -a fetch-config \
  -m ec2 \
  -c "file:${CWAGENT_CONFIG}" \
  -s

# check agent status (informational only; must not abort provisioning)
amazon-cloudwatch-agent-ctl -m ec2 -a status || true

# docker
# -y: without it yum waits for confirmation and an unattended run stops here.
sudo yum update -y
sudo yum install -y docker git
sudo systemctl enable docker
sudo systemctl start docker
# Group membership only applies to new login sessions, which is why the
# docker commands below still go through sudo.
sudo usermod -aG docker ec2-user

# docker-compose
# -f: fail on an HTTP error instead of saving the error page as the binary.
sudo curl -fL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o "${COMPOSE_BIN}"
sudo chmod +x "${COMPOSE_BIN}"

# shared network used by all compose stacks; create it only if missing
sudo docker network inspect monitoring >/dev/null 2>&1 \
  || sudo docker network create monitoring

# fetch the compose files; skip the clone when the repository is already there
if [ ! -d "${REPO_DIR}/.git" ]; then
  git clone https://github.com/dev-chulbuji/devops_prometheus.git "${REPO_DIR}"
fi

# prometheus, grafana, node-exporter
# docker-compose is called by full path because sudo's secure_path may not
# include /usr/local/bin.
for stack in prometheus-ec2 grafana node-exporter; do
  cd "${REPO_DIR}/compose-files/${stack}"
  sudo "${COMPOSE_BIN}" up -d
done
  • No labels
Write a comment…