- Created by Sansae, last modified on Feb 23, 2022
You are viewing an old version of this page. View the current version.
Compare with Current View Page History
« Previous Version 3 Current »
Prometheus
# Docker Compose stack that runs a single Prometheus server and attaches it
# to the pre-existing "monitoring" network.
version: '3.8'

services:
  prometheus:
    container_name: prom
    image: prom/prometheus:latest
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Use the explicit time-based retention flag. The bare
      # `--storage.tsdb.retention` spelling is deprecated and was removed in
      # Prometheus 3.0, so it fails to start with the `latest` image.
      - '--storage.tsdb.retention.time=1h'
      - '--storage.tsdb.retention.size=1TB'
      - '--storage.tsdb.wal-compression'
      # Enables the HTTP reload/shutdown endpoints.
      - '--web.enable-lifecycle'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
    volumes:
      # Main configuration file (see prometheus-ec2/prometheus.yml).
      - ../../config/prometheus-ec2/prometheus.yml:/etc/prometheus/prometheus.yml
      # Alerting and recording rule files.
      - ./rules:/etc/prometheus/rules
    ports:
      - "9090:9090"

networks:
  default:
    # `external: { name: ... }` is deprecated since Compose file format 3.5;
    # mark the network external and give its name separately.
    external: true
    name: monitoring
prometheus-ec2/prometheus.yml
# Prometheus server configuration: scrapes itself and discovers
# node-exporter targets on EC2 instances tagged monitoring=true.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    monitor: 'devops-monitoring'

# Load the rule files mounted by the compose file at /etc/prometheus/rules
# (alerts.yaml, rules.yaml). Previously this key was left empty, so no
# alerting or recording rule was ever evaluated.
rule_files:
  - '/etc/prometheus/rules/*.yaml'

scrape_configs:
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    scrape_interval: 5s
    static_configs:
      - targets: [ 'localhost:9090' ]

  # node-exporter on EC2 instances found through EC2 service discovery.
  - job_name: 'node'
    metrics_path: /metrics
    ec2_sd_configs:
      - region: ap-northeast-2
        port: 9100  # node-exporter
        refresh_interval: 10s  # default 60s
        filters:
          # Only instances carrying the tag monitoring=true are discovered.
          - name: 'tag:monitoring'
            values:
              - 'true'
    relabel_configs:
      # Scrape the instance on its private IP, port 9100.
      - source_labels: [ __meta_ec2_private_ip ]
        regex: '(.*)'
        replacement: '${1}:9100'
        action: replace
        target_label: __address__
      # Copy EC2 metadata into regular target labels.
      - source_labels: [ __meta_ec2_tag_Name ]
        target_label: instance
      - source_labels: [ __meta_ec2_tag_group ]
        target_label: group
      - source_labels: [ __meta_ec2_instance_type ]
        target_label: instance_type
      - source_labels: [ __meta_ec2_architecture ]
        target_label: arch
Rules - alerts.yaml
# Alerting rules: Prometheus CPU usage and blackbox probe failures.
groups:
  - name: prometheus-cpu
    interval: 10s
    rules:
      - alert: 'alerts:cpu_usage:prometheus:80'
        # Threshold matches the 80% stated in the alert name and title.
        # The previous `> 0` comparison made the alert fire whenever the
        # process used any CPU at all.
        expr: 'rate(process_cpu_seconds_total{job=~"prometheus"}[1m]) * 100 > 80'
        for: 5s
        labels:
          service: prometheus
          # Was misspelled `serverity`. NOTE(review): update any Alertmanager
          # route or dashboard that matched on the old label name.
          severity: critical
        annotations:
          title: "Prometheus CPU alert 80%"
          summary: "{{ $labels.instance }}: {{ $value }} ( {{ $labels.job }} )"
          description: "{{ $labels.instance }}: {{ $value }} ( {{ $labels.job }} )"

  - name: blackbox-monitor
    interval: 10s
    limit: 0
    rules:
      - alert: 'alerts:probe_success'
        # probe_success is 0 when a probe fails.
        expr: probe_success == 0
        for: 5s
        labels:
          # Was misspelled `serverity`.
          severity: critical
        annotations:
          title: "black-box monitoring alert"
          summary: "black-box monitoring alert ( {{ $labels.job }} )"
          description: "{{ $labels.instance }} is down"
Rules - rules.yaml
# Recording rules.
groups:
  - name: dj-rule
    interval: 10s
    limit: 0
    rules:
      # Divides the query-duration sum by the query count, matched on the
      # `slice` label, and stores the result as a new series.
      - record: 'rules:prometheus_engine_query:speed'
        expr: >-
          prometheus_engine_query_duration_seconds_sum
          / on (slice) group_left
          prometheus_engine_query_duration_seconds_count
        labels:
          dj: test
Grafana
# Docker Compose stack running Grafana together with its image renderer,
# both attached to the pre-existing "monitoring" network.
version: '3.8'

services:
  grafana:
    container_name: grafana
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    volumes:
      # Provisioned data sources (see /datasources/datasource.yml).
      - ../../config/grafana/datasource.yml:/etc/grafana/provisioning/datasources/datasource.yml
    environment:
      # Rendering is delegated to the `renderer` service below, which calls
      # back into this `grafana` service.
      GF_RENDERING_SERVER_URL: "http://renderer:8081/render"
      GF_RENDERING_CALLBACK_URL: "http://grafana:3000/"
      GF_LOG_FILTERS: "rendering:debug"
      GF_USERS_DEFAULT_THEME: "light"
      # NOTE(review): anonymous access is enabled with the Admin role, so
      # anyone who can reach port 3000 has admin rights. Confirm this is
      # intended before exposing the stack outside a trusted network.
      GF_AUTH_ANONYMOUS_ORG_ROLE: "Admin"
      GF_AUTH_ANONYMOUS_ENABLED: "true"

  renderer:
    image: grafana/grafana-image-renderer:latest
    ports:
      - "8081:8081"
    environment:
      IGNORE_HTTPS_ERRORS: "true"

networks:
  default:
    # `external: { name: ... }` is deprecated since Compose file format 3.5;
    # mark the network external and give its name separately.
    external: true
    name: monitoring
/datasources/datasource.yml
# Grafana data source provisioning file.

# config file version
apiVersion: 1

datasources:
  # Prometheus server, accessed through the Grafana backend (proxy mode).
  - name: Prometheus
    type: prometheus
    access: proxy
    orgId: 1
    url: 'http://prometheus:9090'
    basicAuth: false
    isDefault: false
    version: 1
    # Provisioned read-only: the data source cannot be edited from the UI.
    editable: false
- No labels