Why do Docker Swarm Environments Periodically Freeze

53 views Asked by At

I have an 18-node Docker Swarm setup using 5 managers. This is an AWS setup, using c6a.2xlarge EC2 instances for all nodes. Attaching Docker info data below. We have approximately 300 "website environments" running about 6 services per environment. 2 of these services are exposed by Traefik. Each of these environments is running its own local docker stack network, as well as an externally defined network "traefik-public-<name>" to expose it via Traefik.

My problem is that sometimes these environments return a 504. I don't see any related events in Docker events, and I don't see anything unusually problematic in syslog.

Docker info:

Client: Docker Engine - Community
 Version:    24.0.6
 Context:    default
 Debug Mode: false
 Plugins:
  buildx: Docker Buildx (Docker Inc.)
    Version:  v0.11.2
    Path:     /usr/libexec/docker/cli-plugins/docker-buildx
  compose: Docker Compose (Docker Inc.)
    Version:  v2.21.0
    Path:     /usr/libexec/docker/cli-plugins/docker-compose

Server:
 Containers: 104
  Running: 104
  Paused: 0
  Stopped: 0
 Images: 8
 Server Version: 24.0.6
 Storage Driver: overlay2
  Backing Filesystem: extfs
  Supports d_type: true
  Using metacopy: false
  Native Overlay Diff: true
  userxattr: false
 Logging Driver: json-file
 Cgroup Driver: systemd
 Cgroup Version: 2
 Plugins:
  Volume: local
  Network: bridge host ipvlan macvlan null overlay
  Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog
 Swarm: active
  NodeID: b19thmeod7yhmwpzqe34srqfu
  Is Manager: true
  ClusterID: vq0wn6wif1q8fm4j73ka58240
  Managers: 5
  Nodes: 18
  Default Address Pool: 10.0.0.0/8
  SubnetSize: 24
  Data Path Port: 4789
  Orchestration:
   Task History Retention Limit: 5
  Raft:
   Snapshot Interval: 10000
   Number of Old Snapshots to Retain: 0
   Heartbeat Tick: 1
   Election Tick: 10
  Dispatcher:
   Heartbeat Period: 5 seconds
  CA Configuration:
   Expiry Duration: 3 months
   Force Rotate: 0
  Autolock Managers: false
  Root Rotation In Progress: false
  Node Address: 10.0.21.176
  Manager Addresses:
   10.0.16.23:2377
   10.0.21.176:2377
   10.0.27.74:2377
   10.0.28.132:2377
   10.0.30.241:2377
 Runtimes: io.containerd.runc.v2 runc
 Default Runtime: runc
 Init Binary: docker-init
 containerd version: 8165feabfdfe38c65b599c4993d227328c231fca
 runc version: v1.1.8-0-g82f18fe
 init version: de40ad0
 Security Options:
  apparmor
  seccomp
   Profile: builtin
  cgroupns
 Kernel Version: 6.2.0-1011-aws
 Operating System: Ubuntu 22.04.3 LTS
 OSType: linux
 Architecture: x86_64
 CPUs: 8
 Total Memory: 15.26GiB
 Name: <redacted-host-name>
 ID: bbcdfb17-0d75-4edf-880f-050eb96f02dd
 Docker Root Dir: /var/lib/docker
 Debug Mode: false
 Experimental: true
 Insecure Registries:
 Live Restore Enabled: false

Traefik YAML:

version: '3'

services:
  # Traefik edge router. Five replicas, constrained to manager nodes, publishing
  # HTTP (80), the API/dashboard port (8083 -> 8080) and metrics (8082).
  lb:
    image: traefik:v2.9
    env_file: .env
    command:
      # Docker provider in swarm mode, reading the local Docker socket.
      - "--providers.docker.endpoint=unix:///var/run/docker.sock"
      - "--providers.docker.swarmMode=true"
      - "--providers.docker.exposedbydefault=true"
      - "--providers.docker.network=traefik-public"
      # API / dashboard.
      # NOTE(review): with api.insecure=true Traefik serves the API directly on
      # the auto-created `traefik` entrypoint (port 8080, published as 8083
      # below) - presumably without the traefik-auth middleware; confirm this
      # is intended.
      - "--api.dashboard=true"
      - "--api.insecure=true"
      # JSON access log that keeps request headers.
      - "--accesslog=true"
      - "--accesslog.format=json"
      - "--accesslog.fields.headers.defaultmode=keep"
      # `web` entrypoint on :80, trusting forwarded headers / PROXY protocol.
      - "--entrypoints.web.address=:80"
      - "--entryPoints.web.forwardedHeaders.insecure"
      - "--entryPoints.web.proxyProtocol.trustedIPs=127.0.0.1/32,10.10.0.0/16"
      - "--entryPoints.web.proxyProtocol.insecure"
      # Prometheus metrics on a dedicated `metrics` entrypoint (:8082).
      - "--metrics.prometheus=true"
      - "--metrics.prometheus.addEntryPointsLabels=true"
      - "--metrics.prometheus.addrouterslabels=true"
      - "--entryPoints.metrics.address=:8082"
      - "--metrics.prometheus.entryPoint=metrics"
      # NOTE(review): providers.docker.network is a single default-network
      # setting and is already passed above; repeating the flag presumably just
      # overrides the earlier value - confirm. Per-service
      # `traefik.docker.network` labels override it either way.
      - "--providers.docker.network=traefik-public-docx"
      # ... (further flags elided in the original post)

    environment:
      - TZ=US/Chicago
    ports:
      # Quoted so no YAML parser can reinterpret the colon-separated digits.
      - "80:80"
      - "8083:8080"
      - "8082:8082"
    volumes:
      # So that Traefik can listen to the Docker events
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
      - traefik-public
      - traefik-public-docx
      # ... (further networks elided in the original post)

    logging:
      driver: "json-file"
      options:
        max-size: "50k"
    deploy:
      mode: replicated
      replicas: 5
      placement:
        constraints: [node.role == manager]
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.traefik.loadbalancer.server.port=888" # required by swarm but not used.
        # Plain backticks: `\`` is not a valid escape inside a double-quoted
        # YAML scalar.
        - "traefik.http.routers.traefik.rule=Host(`traefik.example.com`)"
        # The `traefik` entrypoint is not declared in the flags above; Traefik
        # creates it on port 8080 when api.insecure is enabled.
        - "traefik.http.routers.traefik.entrypoints=traefik"
        - "traefik.http.routers.traefik.service=api@internal"
        - "traefik.http.routers.traefik.middlewares=traefik-auth"
        - "traefik.http.middlewares.traefik-auth.basicauth.users=fake-user:fake-pass"
# Overlay networks created outside this stack; Traefik attaches to each one to
# reach the services published on it.
networks:
  traefik-public:
    external: true
  traefik-public-docx:
    external: true
  # ... (further external networks elided in the original post)

Example Customer Environment YAML:

version: "3.3"

services:
  # Database service: pinned to one host, not routed through Traefik.
  db:
    image: <ecr-url>/db:stable
    env_file: .env
    networks:
      - default
    configs:
      # Externally managed MariaDB config, mounted as a conf.d override.
      - source: mariadb-1694946020
        target: /etc/mysql/mariadb.conf.d/99-custom.cnf
    volumes:
      - dbtmp:/tmp
      - dbdata:/var/lib/mysql
    ports:
      # Container port 3306 is published in host mode on a per-environment
      # port.
      - target: 3306
        published: <dynamically-assigned-port>
        protocol: tcp
        mode: host
    deploy:
      placement:
        constraints:
          - node.hostname == <static-host-value>
      labels:
        - "traefik.enable=false"

  # The API: runs on worker nodes and is exposed by Traefik on the `web`
  # entrypoint at <name>-api.example.com via the environment's public network.
  api:
    env_file: .env
    image: <ecr-url>/api:stable
    networks:
      - traefik-public-<name>
      - default
    configs:
      # Externally managed application .env, mounted into the web root.
      - source: api-1694946020
        target: /var/www/html/.env
    deploy:
      placement:
        constraints: [node.role == worker]
      resources:
        limits:
          cpus: '1.0'
          memory: '512M'
      labels:
        - "traefik.enable=true"
        # Tells Traefik which of the service's networks to connect through.
        - "traefik.docker.network=traefik-public-<name>"
        - "traefik.http.routers.<name>_api.rule=Host(`<name>-api.example.com`)"
        - "traefik.http.routers.<name>_api.entrypoints=web"
        - "traefik.http.routers.<name>_api.middlewares=<name>_api_security"
        - "traefik.http.services.<name>_api.loadbalancer.server.port=80"

        # Security headers. Every option must use the middleware name the
        # router attaches above (<name>_api_security), or it is never applied.
        - "traefik.http.middlewares.<name>_api_security.headers.frameDeny=true"
        - "traefik.http.middlewares.<name>_api_security.headers.browserXssFilter=true"
        - "traefik.http.middlewares.<name>_api_security.headers.contentTypeNosniff=true"

  # Front-end site: runs on worker nodes and is exposed by Traefik on the `web`
  # entrypoint at <name>.example.com via the environment's public network.
  front:
    image: <ecr-url>/front:stable
    env_file: .env
    networks:
      - traefik-public-<name>
      - default
    deploy:
      resources:
        limits:
          cpus: "1.0"
          memory: "512M"
      placement:
        constraints:
          - node.role == worker
      labels:
        # Routing: host rule -> `web` entrypoint -> container port 80.
        - "traefik.enable=true"
        - "traefik.docker.network=traefik-public-<name>"
        - "traefik.http.routers.<name>_front.rule=Host(`<name>.example.com`)"
        - "traefik.http.routers.<name>_front.entrypoints=web"
        - "traefik.http.routers.<name>_front.middlewares=<name>_front_security"
        - "traefik.http.services.<name>_front.loadbalancer.server.port=80"

        # Response-header hardening, attached through <name>_front_security.
        # NOTE(review): featurepolicy appears to be deprecated in recent
        # Traefik v2 releases in favour of permissionsPolicy - confirm.
        - "traefik.http.middlewares.<name>_front_security.headers.customFrameOptionsValue=SAMEORIGIN"
        - "traefik.http.middlewares.<name>_front_security.headers.browserXssFilter=true"
        - "traefik.http.middlewares.<name>_front_security.headers.contentTypeNosniff=true"
        - "traefik.http.middlewares.<name>_front_security.headers.featurepolicy=camera 'none'; geolocation 'none'; microphone 'none'; payment 'none'; usb 'none'; vr 'none';"
        - "traefik.http.middlewares.<name>_front_security.headers.customresponseheaders.X-Robots-Tag=none,noarchive,nosnippet,notranslate,noimageindex"

# Volumes used by the db service.
volumes:
  # Local-driver volume with bind options pointing at /mnt/dbtmp/<name>.
  dbtmp:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /mnt/dbtmp/<name>
  # Declared with no options.
  dbdata:

# Swarm configs created outside this stack and referenced by name; the
# services above mount them as files.
configs:
  mariadb-1694946020:
    external: true
  api-1694946020:
    external: true

# Networks: the per-environment public network is created outside this stack
# (external); `default` is declared with no options.
networks:
  traefik-public-<name>:
    external: true
  default:
0

There are 0 answers