@@ -154,6 +154,32 @@ def run_job(job, jobstore_alias, run_times, logger_name):
154154
155155 return events
156156
def _run_job_error(self, job_id, exc, traceback=None):
    """
    Handle a job failure reported by the executor and flag a broken process pool.

    According to the original author's note, the executor invokes this hook
    with the exception whenever calling `run_job` fails. The parent
    implementation always runs first; afterwards, if the exception's class
    name contains 'BrokenProcessPool', an (empty) marker file is created.

    Background: every now and then job submission starts failing with the
    traceback below, and from that point on the collector no longer works:
    -----
    2019-10-04 19:45:38 | ERR | Error submitting job "SNMPCollector.do_snmp (trigger: <collector.MultipleIntervalsTrigger object at 0x7fd866b9aee8>, next run at: 2019-10-04 19:45:38 UTC)" to executor "iaexecutor"
    Traceback (most recent call last):
      File "/usr/local/lib/python3.6/site-packages/apscheduler/schedulers/base.py", line 974, in _process_jobs
        executor.submit_job(job, run_times)
      File "/usr/local/lib/python3.6/site-packages/apscheduler/executors/base.py", line 71, in submit_job
        self._do_submit_job(job, run_times)
      File "./collector.py", line 92, in _do_submit_job
      File "/usr/local/lib/python3.6/concurrent/futures/process.py", line 452, in submit
        raise BrokenProcessPool('A child process terminated '
    concurrent.futures.process.BrokenProcessPool: A child process terminated abruptly, the process pool is not usable anymore
    -----

    The intent is to persist the fact that we have entered this state, so
    that the Docker health check can be made to fail.

    :param job_id: forwarded unchanged to the parent implementation
    :param exc: the exception that was raised; only its class name is inspected here
    :param traceback: forwarded unchanged to the parent implementation (defaults to None)
    """
    # Let the parent class do its regular error handling before anything else.
    super()._run_job_error(job_id, exc, traceback)

    exc_type_name = exc.__class__.__name__
    if 'BrokenProcessPool' not in exc_type_name:
        # Any other kind of failure does not affect the health marker.
        return

    # Per the original note, the Docker health check looks for this file and,
    # when it exists, the container should be restarted. Append mode creates
    # the file if it is missing and leaves existing content untouched.
    with open('/tmp/fail_health_check', 'a'):
        pass
182+
157183
158184class Collector (object ):
159185 __slots__ = 'backend_url' , 'bot_token' , 'scheduler' , 'known_jobs' , 'jobs_refresh_interval'
0 commit comments