Merge branch 'fix/autoheal' into 'master'

grafolean · grafolean · commit b9e253636d69 · 2019-10-09T19:03:13.000Z
Restart SNMP collector if BrokenProcessPool exception is caught

See merge request grafolean/grafolean-collector-snmp!10
diff --git a/Dockerfile b/Dockerfile
@@ -40,4 +40,5 @@ RUN \
     echo "alias l='ls -altr'" >> /root/.bashrc
 COPY --from=build-backend /snmpcollector/ /snmpcollector/
 WORKDIR /snmpcollector
+HEALTHCHECK --interval=10s --retries=1 CMD /bin/bash -c "[ ! -f /tmp/fail_health_check ]"
 CMD ["python", "-m", "snmpcollector"]
diff --git a/collector.py b/collector.py
@@ -154,6 +154,32 @@ def run_job(job, jobstore_alias, run_times, logger_name):
 
         return events
 
+    def _run_job_error(self, job_id, exc, traceback=None):
+        """
+            > Called by the executor with the exception if there is an error calling `run_job`.
+
+            Sometimes we start getting the following traceback, after which the collector no longer works:
+            -----
+                2019-10-04 19:45:38 | ERR | Error submitting job "SNMPCollector.do_snmp (trigger: <collector.MultipleIntervalsTrigger object at 0x7fd866b9aee8>, next run at: 2019-10-04 19:45:38 UTC)" to executor "iaexecutor"
+                Traceback (most recent call last):
+                File "/usr/local/lib/python3.6/site-packages/apscheduler/schedulers/base.py", line 974, in _process_jobs
+                    executor.submit_job(job, run_times)
+                File "/usr/local/lib/python3.6/site-packages/apscheduler/executors/base.py", line 71, in submit_job
+                    self._do_submit_job(job, run_times)
+                File "./collector.py", line 92, in _do_submit_job
+                File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 452, in submit
+                    raise BrokenProcessPool('A child process terminated '
+                concurrent.futures.process.BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore
+            -----
+
+            The idea is to remember that we are in this state, so that we can make the Docker health check fail.
+        """
+        super()._run_job_error(job_id, exc, traceback)
+
+        if 'BrokenProcessPool' in exc.__class__.__name__:
+            # This file is checked by the Docker health check; if it exists, the container should be restarted:
+            open('/tmp/fail_health_check', 'a').close()
+
 
 class Collector(object):
     __slots__ = 'backend_url', 'bot_token', 'scheduler', 'known_jobs', 'jobs_refresh_interval'
diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml
@@ -32,11 +32,21 @@ services:
     # so that Docker networking is bypassed.
     network_mode: "host"
 
+
   redis:
     image: redis:5-alpine
     container_name: grafolean-collector-snmp-redis
     ports:
-      - "127.0.0.1:6379:6379"
-    # We advise not to use `network_mode: "host"` in production, because it would expose Redis to host network
-    # (even if access is limited to 127.0.0.1).
+      - "6379:6379"
+    # We advise not to use `network_mode: "host"` in production, because it would expose Redis to the network.
     network_mode: "host"
+
+
+  autoheal:
+    image: willfarrell/autoheal
+    container_name: autoheal-snmp
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=all
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    restart: always
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -26,7 +26,19 @@ services:
       - REDIS_HOST=redis
     restart: always
 
+
   redis:
     image: redis:5-alpine
     container_name: grafolean-collector-snmp-redis
     restart: always
+
+
+  autoheal:
+    # This container automatically restarts any container that fails its health check. Not a bullet-proof solution, but better than nothing.
+    image: willfarrell/autoheal
+    container_name: autoheal-snmp
+    environment:
+      - AUTOHEAL_CONTAINER_LABEL=all
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    restart: always