I have an 18-node Docker Swarm setup with 5 managers. It runs on AWS, using c6a.2xlarge EC2 instances for all nodes; the `docker info` output is attached below. We have approximately 300 "website environments", each running about 6 services, 2 of which are exposed through Traefik. Each environment runs on its own stack-local Docker network, plus an externally defined network "traefik-public-<name>" that exposes it via Traefik.
My problem is that these environments sometimes return a 504 (Gateway Timeout). I don't see any related events in `docker events`, and I don't see anything unusually problematic in syslog.
Docker info:
Client: Docker Engine - Community
Version: 24.0.6
Context: default
Debug Mode: false
Plugins:
buildx: Docker Buildx (Docker Inc.)
Version: v0.11.2
Path: /usr/libexec/docker/cli-plugins/docker-buildx
compose: Docker Compose (Docker Inc.)
Version: v2.21.0
Path: /usr/libexec/docker/cli-plugins/docker-compose
Server:
Containers: 104
Running: 104
Paused: 0
Stopped: 0
Images: 8
Server Version: 24.0.6
Storage Driver: overlay2
Backing Filesystem: extfs
Supports d_type: true
Using metacopy: false
Native Overlay Diff: true
userxattr: false
Logging Driver: json-file
Cgroup Driver: systemd
Cgroup Version: 2
Plugins:
Volume: local
Network: bridge host ipvlan macvlan null overlay
Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog
Swarm: active
NodeID: b19thmeod7yhmwpzqe34srqfu
Is Manager: true
ClusterID: vq0wn6wif1q8fm4j73ka58240
Managers: 5
Nodes: 18
Default Address Pool: 10.0.0.0/8
SubnetSize: 24
Data Path Port: 4789
Orchestration:
Task History Retention Limit: 5
Raft:
Snapshot Interval: 10000
Number of Old Snapshots to Retain: 0
Heartbeat Tick: 1
Election Tick: 10
Dispatcher:
Heartbeat Period: 5 seconds
CA Configuration:
Expiry Duration: 3 months
Force Rotate: 0
Autolock Managers: false
Root Rotation In Progress: false
Node Address: 10.0.21.176
Manager Addresses:
10.0.16.23:2377
10.0.21.176:2377
10.0.27.74:2377
10.0.28.132:2377
10.0.30.241:2377
Runtimes: io.containerd.runc.v2 runc
Default Runtime: runc
Init Binary: docker-init
containerd version: 8165feabfdfe38c65b599c4993d227328c231fca
runc version: v1.1.8-0-g82f18fe
init version: de40ad0
Security Options:
apparmor
seccomp
Profile: builtin
cgroupns
Kernel Version: 6.2.0-1011-aws
Operating System: Ubuntu 22.04.3 LTS
OSType: linux
Architecture: x86_64
CPUs: 8
Total Memory: 15.26GiB
Name: <redacted-host-name>
ID: bbcdfb17-0d75-4edf-880f-050eb96f02dd
Docker Root Dir: /var/lib/docker
Debug Mode: false
Experimental: true
Insecure Registries:
Live Restore Enabled: false
Traefik YAML:
# Traefik edge-router stack for the Swarm cluster.
# One `lb` service, 5 replicas pinned to manager nodes, reading routing
# configuration from Docker service labels through the mounted Docker socket.
version: '3'

services:
  lb:
    image: traefik:v2.9
    env_file: .env
    command:
      # --- Docker (Swarm) provider ---
      - "--providers.docker.endpoint=unix:///var/run/docker.sock"
      - "--providers.docker.swarmMode=true"
      - "--providers.docker.exposedbydefault=true"
      # NOTE(review): providers.docker.network is passed a second time at the
      # end of this list. The option holds a single default network name, so
      # repeating it does not register several networks -- presumably only one
      # of the values takes effect. Services attached to other networks must
      # set the `traefik.docker.network` label themselves. TODO confirm which
      # value is intended and drop the other.
      - "--providers.docker.network=traefik-public"
      # --- API / dashboard ---
      # NOTE(review): per the Traefik docs, api.insecure serves the API and
      # dashboard directly on the implicit `traefik` entrypoint (:8080,
      # published below as 8083); verify whether the basic-auth router defined
      # in the deploy labels is actually what protects it.
      - "--api.dashboard=true"
      - "--api.insecure=true"
      # --- Access log ---
      - "--accesslog=true"
      - "--accesslog.format=json"
      - "--accesslog.fields.headers.defaultmode=keep"
      # --- Entrypoints ---
      - "--entrypoints.web.address=:80"
      - "--entryPoints.web.forwardedHeaders.insecure"
      - "--entryPoints.web.proxyProtocol.trustedIPs=127.0.0.1/32,10.10.0.0/16"
      - "--entryPoints.web.proxyProtocol.insecure"
      # --- Prometheus metrics on a dedicated entrypoint ---
      - "--metrics.prometheus=true"
      - "--metrics.prometheus.addEntryPointsLabels=true"
      - "--metrics.prometheus.addrouterslabels=true"
      - "--entryPoints.metrics.address=:8082"
      - "--metrics.prometheus.entryPoint=metrics"
      # Second occurrence of the default-network flag (see the note above).
      - "--providers.docker.network=traefik-public-docx"
      # ... (remaining entries omitted in the original post)
    environment:
      - TZ=US/Chicago
    # Quoted so the mappings are always read as strings.
    ports:
      - "80:80"
      - "8083:8080"
      - "8082:8082"
    volumes:
      # So that Traefik can listen to the Docker events
      - /var/run/docker.sock:/var/run/docker.sock:ro
    networks:
      - traefik-public
      - traefik-public-docx
      # ... (remaining entries omitted in the original post)
    logging:
      driver: "json-file"
      options:
        max-size: "50k"
    deploy:
      mode: replicated
      replicas: 5
      placement:
        constraints:
          - node.role == manager
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.traefik.loadbalancer.server.port=888"  # required by swarm but not used.
        # Backticks need no escaping inside a double-quoted YAML scalar;
        # a backslash before a backtick is an invalid escape sequence.
        - "traefik.http.routers.traefik.rule=Host(`traefik.example.com`)"
        - "traefik.http.routers.traefik.entrypoints=traefik"
        - "traefik.http.routers.traefik.service=api@internal"
        - "traefik.http.routers.traefik.middlewares=traefik-auth"
        - "traefik.http.middlewares.traefik-auth.basicauth.users=fake-user:fake-pass"

networks:
  traefik-public:
    external: true
  traefik-public-docx:
    external: true
  # ... (remaining entries omitted in the original post)
Example Customer Environment YAML:
# Per-customer "website environment" stack.
# `<name>`, `<ecr-url>`, `<dynamically-assigned-port>` and
# `<static-host-value>` are placeholders substituted per environment.
# `api` and `front` are routed by Traefik over the external
# `traefik-public-<name>` network; `db` is only on the stack-local network.
version: "3.3"

services:
  # The main database instance
  db:
    env_file: .env
    image: <ecr-url>/db:stable
    ports:
      # Published in host mode on the node the task runs on.
      - target: 3306
        published: <dynamically-assigned-port>
        protocol: tcp
        mode: host
    volumes:
      - dbtmp:/tmp
      - dbdata:/var/lib/mysql
    configs:
      - source: mariadb-1694946020
        target: /etc/mysql/mariadb.conf.d/99-custom.cnf
    networks:
      - default
    deploy:
      labels:
        # Required because the Traefik stack runs with exposedbydefault=true.
        - "traefik.enable=false"
      placement:
        # Pinned to a single host; the dbtmp volume is a bind mount of a
        # host path.
        constraints:
          - node.hostname == <static-host-value>

  # The API
  api:
    env_file: .env
    image: <ecr-url>/api:stable
    networks:
      - traefik-public-<name>
      - default
    configs:
      - source: api-1694946020
        target: /var/www/html/.env
    deploy:
      placement:
        constraints:
          - node.role == worker
      resources:
        limits:
          cpus: '1.0'
          memory: '512M'
      labels:
        - "traefik.enable=true"
        # Tells Traefik which of the two attached networks to reach the task on.
        - "traefik.docker.network=traefik-public-<name>"
        - "traefik.http.routers.<name>_api.rule=Host(`<name>-api.example.com`)"
        - "traefik.http.routers.<name>_api.entrypoints=web"
        - "traefik.http.routers.<name>_api.middlewares=<name>_api_security"
        - "traefik.http.services.<name>_api.loadbalancer.server.port=80"
        # Security headers
        # All three options must use the middleware name referenced by the
        # router above, otherwise they define a separate, unused middleware.
        - "traefik.http.middlewares.<name>_api_security.headers.frameDeny=true"
        - "traefik.http.middlewares.<name>_api_security.headers.browserXssFilter=true"
        - "traefik.http.middlewares.<name>_api_security.headers.contentTypeNosniff=true"

  # The front end
  front:
    env_file: .env
    image: <ecr-url>/front:stable
    networks:
      - traefik-public-<name>
      - default
    deploy:
      placement:
        constraints:
          - node.role == worker
      resources:
        limits:
          cpus: '1.0'
          memory: '512M'
      labels:
        - "traefik.enable=true"
        # Tells Traefik which of the two attached networks to reach the task on.
        - "traefik.docker.network=traefik-public-<name>"
        - "traefik.http.routers.<name>_front.rule=Host(`<name>.example.com`)"
        - "traefik.http.routers.<name>_front.entrypoints=web"
        - "traefik.http.routers.<name>_front.middlewares=<name>_front_security"
        - "traefik.http.services.<name>_front.loadbalancer.server.port=80"
        # Security headers
        - "traefik.http.middlewares.<name>_front_security.headers.customFrameOptionsValue=SAMEORIGIN"
        - "traefik.http.middlewares.<name>_front_security.headers.browserXssFilter=true"
        - "traefik.http.middlewares.<name>_front_security.headers.contentTypeNosniff=true"
        - "traefik.http.middlewares.<name>_front_security.headers.featurepolicy=camera 'none'; geolocation 'none'; microphone 'none'; payment 'none'; usb 'none'; vr 'none';"
        - "traefik.http.middlewares.<name>_front_security.headers.customresponseheaders.X-Robots-Tag=none,noarchive,nosnippet,notranslate,noimageindex"

# define the volumes
volumes:
  # Bind mount of a host directory, exposed as a named volume.
  dbtmp:
    driver: local
    driver_opts:
      o: bind
      device: /mnt/dbtmp/<name>
      type: none
  # Default local volume; explicit empty mapping instead of a bare key.
  dbdata: {}

# external configuration
configs:
  api-1694946020:
    external: true
  mariadb-1694946020:
    external: true

# network definitions
networks:
  # Stack-local network with default settings.
  default: {}
  traefik-public-<name>:
    external: true