Skip to content

Commit dc98053

Browse files
author
Anze
committed
Gracefully handle DB connection loss and startup time connectivity problems
1 parent 69e7091 commit dc98053

File tree

2 files changed

+55
-21
lines changed

2 files changed

+55
-21
lines changed

dbutils.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import sys
66
import copy
77
import json
8+
import time
89

910
import psycopg2
1011
from psycopg2.pool import ThreadedConnectionPool
@@ -21,6 +22,10 @@
2122
log = logging.getLogger("{}.{}".format(__name__, "dbutils"))
2223

2324

25+
class DBConnectionError(Exception):
26+
pass
27+
28+
2429
db_pool = None
2530
DB_PREFIX = 'snmp_'
2631
register_adapter(dict, Json)
@@ -32,24 +37,35 @@ def get_db_connection():
3237
global db_pool
3338
if db_pool is None:
3439
db_connect()
35-
3640
try:
41+
if db_pool is None:
42+
# connecting to DB failed
43+
raise DBConnectionError()
3744
conn = db_pool.getconn()
45+
if conn is None:
46+
# pool wasn't able to return a valid connection
47+
raise DBConnectionError()
48+
3849
conn.autocommit = True
3950
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
4051
yield conn
52+
except DBConnectionError:
53+
yield None
4154
finally:
42-
db_pool.putconn(conn)
55+
if db_pool is not None and conn is not None:
56+
db_pool.putconn(conn)
4357

4458

4559
@contextmanager
46-
def get_db_cursor(commit=False):
60+
def get_db_cursor():
4761
with get_db_connection() as connection:
62+
if connection is None:
63+
yield None
64+
return
65+
4866
cursor = connection.cursor()
4967
try:
5068
yield cursor
51-
if commit:
52-
connection.commit()
5369
finally:
5470
cursor.close()
5571

@@ -74,7 +90,7 @@ def db_connect():
7490
connect_timeout=connect_timeout)
7591
except:
7692
db_pool = None
77-
log.exception("DB connection failed")
93+
log.warning("DB connection failed")
7894

7995

8096
def db_disconnect():
@@ -86,6 +102,15 @@ def db_disconnect():
86102
log.info("DB connection is closed")
87103

88104

105+
def initial_wait_for_db():
106+
while True:
107+
with get_db_cursor() as c:
108+
if c is not None:
109+
return
110+
log.info("DB connection failed - waiting for DB to become available, sleeping 5s")
111+
time.sleep(5)
112+
113+
89114
###########################
90115
# DB schema migration #
91116
###########################

snmpbot.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
import psycopg2
1515

1616
from grafoleancollector import Collector
17-
from dbutils import get_db_cursor, DB_PREFIX, migrate_if_needed, db_disconnect
17+
from dbutils import get_db_cursor, DB_PREFIX, initial_wait_for_db, migrate_if_needed, db_disconnect, DBConnectionError
1818

1919

2020
logging.basicConfig(format='%(asctime)s | %(levelname)s | %(message)s',
@@ -40,6 +40,8 @@ class InvalidOutputPath(Exception):
4040

4141
def _get_previous_counter_value(counter_ident):
4242
with get_db_cursor() as c:
43+
if c is None:
44+
raise DBConnectionError()
4345
try:
4446
c.execute(f'SELECT value, ts FROM {DB_PREFIX}bot_counters WHERE id = %s;', (counter_ident,))
4547
rec = c.fetchone()
@@ -54,6 +56,8 @@ def _get_previous_counter_value(counter_ident):
5456

5557
def _save_current_counter_value(new_value, now, counter_ident):
5658
with get_db_cursor() as c:
59+
if c is None:
60+
raise DBConnectionError()
5761
c.execute(f"INSERT INTO {DB_PREFIX}bot_counters (id, value, ts) VALUES (%s, %s, %s) ON CONFLICT (id) DO UPDATE SET value = %s, ts = %s;",
5862
(counter_ident, new_value, now, new_value, now))
5963

@@ -67,24 +71,28 @@ def _convert_counters_to_values(results, now, counter_ident_prefix):
6771
if v.snmp_type not in ['COUNTER', 'COUNTER64']:
6872
new_results.append(v)
6973
continue
74+
7075
# counter - deal with it:
71-
counter_ident = counter_ident_prefix + f'/{i}/{v.oid}/{v.oid_index}'
72-
old_value, t = _get_previous_counter_value(counter_ident)
7376
new_value = int(float(v.value))
74-
_save_current_counter_value(new_value, now, counter_ident)
75-
if old_value is None:
76-
new_results.append(SNMPVariable(oid=v.oid, oid_index=v.oid_index, value=None, snmp_type='COUNTER_PER_S'))
77-
continue
77+
counter_ident = counter_ident_prefix + f'/{i}/{v.oid}/{v.oid_index}'
78+
try:
79+
old_value, t = _get_previous_counter_value(counter_ident)
80+
_save_current_counter_value(new_value, now, counter_ident)
81+
if old_value is None:
82+
new_results.append(SNMPVariable(oid=v.oid, oid_index=v.oid_index, value=None, snmp_type='COUNTER_PER_S'))
83+
continue
7884

79-
# it seems like the counter overflow happened, discard result:
80-
if new_value < old_value:
81-
new_results.append(SNMPVariable(oid=v.oid, oid_index=v.oid_index, value=None, snmp_type='COUNTER_PER_S'))
82-
log.warning(f"Counter overflow detected for oid {v.oid}, oid index {v.oid_index}, discarding value - if this happens often, consider using OIDS with 64bit counters (if available) or decreasing polling interval.")
83-
continue
85+
# it seems like the counter overflow happened, discard result:
86+
if new_value < old_value:
87+
new_results.append(SNMPVariable(oid=v.oid, oid_index=v.oid_index, value=None, snmp_type='COUNTER_PER_S'))
88+
log.warning(f"Counter overflow detected for oid {v.oid}, oid index {v.oid_index}, discarding value - if this happens often, consider using OIDS with 64bit counters (if available) or decreasing polling interval.")
89+
continue
8490

85-
dt = now - t
86-
dv = (new_value - old_value) / dt
87-
new_results.append(SNMPVariable(oid=v.oid, oid_index=v.oid_index, value=dv, snmp_type='COUNTER_PER_S'))
91+
dt = now - t
92+
dv = (new_value - old_value) / dt
93+
new_results.append(SNMPVariable(oid=v.oid, oid_index=v.oid_index, value=dv, snmp_type='COUNTER_PER_S'))
94+
except DBConnectionError:
95+
log.error(f"Could not convert counter due to DB error: {counter_ident} / {new_value}")
8896
return new_results
8997

9098

@@ -443,6 +451,7 @@ def wait_for_grafolean(backend_url):
443451
if __name__ == "__main__":
444452
dotenv.load_dotenv()
445453

454+
initial_wait_for_db()
446455
migrate_if_needed()
447456
db_disconnect() # each worker should open their own connection pool
448457

0 commit comments

Comments
 (0)