# compose.yml

# Compose file reference: https://docs.docker.com/reference/compose-file/
# Project name; corresponds to the COMPOSE_PROJECT_NAME variable.
name: zjusct-observability

services:
# https://opentelemetry.io/docs/collector/installation/
  # Gateway collector. Publishes the syslog and OTLP HTTP receivers and is
  # started only after clickhouse, elasticsearch, prometheus and influxdb
  # report healthy.
  otelcol-gateway:
    image: otel/opentelemetry-collector-contrib
    restart: unless-stopped
    # Environment variables for the collector, see
    # https://opentelemetry.io/docs/collector/configuration/#environment-variables
    environment:
      - OTEL_BEARER_TOKEN=${OTEL_BEARER_TOKEN}
    volumes:
      # Gateway configuration, mounted read-only.
      - ./config/otelcol/gateway.yaml:/etc/otelcol-contrib/config.yaml:ro
    ports:
      # Extension ports, published on the host loopback interface only.
      - "127.0.0.1:13133:13133"  # health_check extension
      - "127.0.0.1:1777:1777"  # pprof
      - "127.0.0.1:55679:55679"  # zpages
      # Receiver ports.
      - "514:514/udp"  # syslog receiver
      # - "4317:4317"  # OTLP gRPC receiver, only for internal use
      # OTLP HTTP receiver; published on host port 4319 because of a conflict
      # with the host otel-collector.
      - "4319:4318"
    # No healthcheck is defined: cannot use any command in this image.
    # If clickhouse is not ready, otel-collector will shut down with
    # "cannot start pipelines", so wait for every dependency to be healthy
    # and restart this service whenever one of them is restarted.
    depends_on:
      clickhouse:
        restart: true
        condition: service_healthy
      elasticsearch:
        restart: true
        condition: service_healthy
      prometheus:
        restart: true
        condition: service_healthy
      influxdb:
        restart: true
        condition: service_healthy

  # Second collector instance running the SNMP configuration. It uses the
  # same image as otelcol-gateway but publishes no ports.
  otelcol-snmp:
    image: otel/opentelemetry-collector-contrib
    restart: unless-stopped
    # SNMP credentials are taken from the Compose environment.
    environment:
      - SNMP_PRIVATE_KEY=${SNMP_PRIVATE_KEY}
      - SNMP_AUTH_KEY=${SNMP_AUTH_KEY}
    volumes:
      # SNMP configuration, mounted read-only.
      - ./config/otelcol/snmp.yaml:/etc/otelcol-contrib/config.yaml:ro
    # Started after otelcol-gateway has started (the gateway defines no
    # healthcheck) and restarted together with it.
    depends_on:
      otelcol-gateway:
        restart: true
        condition: service_started

# https://hub.docker.com/r/clickhouse/clickhouse-server/
  clickhouse:
    image: clickhouse/clickhouse-server
    restart: unless-stopped
    # https://github.com/plausible/analytics/discussions/4629#discussioncomment-10774222
    cap_add:
      - CAP_SYS_NICE
    volumes:
      # Data directory, stored in the named volume "clickhouse".
      - clickhouse:/var/lib/clickhouse
    # Both interfaces are published on the host loopback interface only.
    ports:
      - "127.0.0.1:8123:8123"  # HTTP interface
      - "127.0.0.1:9000:9000"  # Native Client
    # Healthy once clickhouse-client can run a trivial query.
    healthcheck:
      test:
        - "CMD"
        - "clickhouse-client"
        - "--query"
        - "SHOW DATABASES"
      retries: 3
      interval: 10s
      timeout: 3s

# https://hub.docker.com/r/prom/prometheus/
  prometheus:
    image: prom/prometheus
    volumes:
      # Configuration file, mounted read-only like the otelcol configs in
      # this file.
      - ./config/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      # TSDB data directory, stored in the named volume "prometheus".
      - prometheus:/prometheus
    # Published on the host loopback interface only.
    ports:
      - "127.0.0.1:9090:9090"
    restart: unless-stopped
    # Replaces the image's default arguments, see
    # https://github.com/prometheus/prometheus/blob/main/Dockerfile
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=180d"
      - "--web.enable-remote-write-receiver"
      - "--web.enable-admin-api"
    healthcheck:
      # wget is used because there is no curl command in the image.
      test:
        - "CMD"
        - "wget"
        - "-q"
        - "--spider"
        - "http://prometheus:9090/-/healthy"
      interval: 10s
      timeout: 3s
      retries: 3

# https://hub.docker.com/_/influxdb
  influxdb:
    image: influxdb:2
    restart: unless-stopped
    # Initial setup values passed to the image; the password comes from the
    # Compose environment.
    environment:
      - DOCKER_INFLUXDB_INIT_MODE=setup
      - DOCKER_INFLUXDB_INIT_USERNAME=zjusct
      - DOCKER_INFLUXDB_INIT_PASSWORD=${INFLUXDB_PASSWORD}
      - DOCKER_INFLUXDB_INIT_ORG=zjusct
      - DOCKER_INFLUXDB_INIT_BUCKET=zjusct
      - DOCKER_INFLUXDB_INIT_RETENTION=180d
    volumes:
      # Data directory, stored in the named volume "influxdb".
      - influxdb:/var/lib/influxdb2
    # Unlike the other databases, this port is published on all host
    # interfaces, not only on localhost.
    ports:
      - "8086:8086"
    # Healthy once "influx ping" succeeds inside the container.
    healthcheck:
      test:
        - "CMD"
        - "influx"
        - "ping"
      retries: 3
      interval: 10s
      timeout: 3s

# https://www.elastic.co/guide/en/elasticsearch/reference/8.15/docker.html
  elasticsearch:
    image: docker.elastic.co/elasticsearch/elasticsearch:8.15.3
    restart: unless-stopped
    volumes:
      # Node configuration file from the repository.
      - ./config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
      # Data directory, stored in the named volume "elasticsearch".
      - elasticsearch:/usr/share/elasticsearch/data
    # Published on the host loopback interface only.
    ports:
      - "127.0.0.1:9200:9200"
    # Probed every 10s for up to 120 attempts before the container is
    # marked unhealthy.
    # NOTE(review): curl runs without -f, so an HTTP error response still
    # counts as healthy - confirm this is intended.
    healthcheck:
      test:
        - "CMD-SHELL"
        - "curl http://elasticsearch:9200"
      retries: 120
      interval: 10s
      timeout: 10s

# https://hub.docker.com/r/grafana/grafana-enterprise/
  grafana:
    image: grafana/grafana-enterprise
    restart: unless-stopped
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD}
      # Plugin installed through the environment, see
      # https://github.com/grafana/grafana/issues/11409
      - GF_INSTALL_PLUGINS=grafana-clickhouse-datasource
      - GF_TG_BOT_TOKEN=${GF_TG_BOT_TOKEN}
      - GF_TG_CHAT_ID=${GF_TG_CHAT_ID}
      - INFLUXDB_TOKEN=${INFLUXDB_TOKEN}
      # Optional outbound proxy, currently disabled:
      # - http_proxy=http://bridge.internal.zjusct.io:7890
      # - https_proxy=http://bridge.internal.zjusct.io:7890
      # https://unix.stackexchange.com/questions/23452/set-a-network-range-in-the-no-proxy-environment-variable
      # - no_proxy=127.0.0.1,.cn,.zjusct.io,grafana,prometheus,clickhouse,otelcol,elasticsearch,influxdb
    volumes:
      # Configuration and provisioning files, see
      # https://grafana.com/docs/grafana/latest/administration/provisioning/
      # NOTE(review): only /etc/grafana is mounted here - confirm that no
      # other Grafana state needs to survive container re-creation.
      - ./config/grafana:/etc/grafana
    # Published on all host interfaces.
    ports:
      - "3000:3000"
    # Data sources that must be healthy before Grafana starts.
    depends_on:
      prometheus:
        condition: service_healthy
      clickhouse:
        condition: service_healthy
    healthcheck:
      test:
        - "CMD"
        - "curl"
        - "-f"
        - "http://grafana:3000/"
      retries: 3
      interval: 10s
      timeout: 3s

volumes:
  # Every named volume below uses the local driver with bind options, so its
  # data lives in a directory under ./database. Meaning of type/o/device:
  # https://stackoverflow.com/questions/74079078/what-is-the-meaning-of-the-type-o-device-flags-in-driver-opts-in-the-docker-comp
  clickhouse:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ./database/clickhouse
  prometheus:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ./database/prometheus
  elasticsearch:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ./database/elasticsearch
  influxdb:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: ./database/influxdb