From 5aa8c6553621764d615d7bb11b21bb08c3389f36 Mon Sep 17 00:00:00 2001 From: nil0x42 Date: Fri, 16 May 2025 18:37:34 +0200 Subject: [PATCH] Fix thpool_destroy() hang: unlock all workers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `thpool_destroy()` is supposed to wake every worker, wait until the pool is empty, and return quickly. In practice it can block **indefinitely** (or for dozens of seconds) as soon as the pool size out-grows its one-second “fast-exit” window. * `bsem_post_all()` sets `v = 1` and broadcasts. * The **first** worker that wakes executes `bsem_wait() → while (v==0) … ; v = 0; …` → the single ticket is consumed, all other awakened threads fall straight back into `pthread_cond_wait()`. * After that first second, the destroy code slow-polls: one `bsem_post_all()` + `sleep(1)` per loop → **one ticket per second**. With 4000 threads the destructor needs \~4000 s; even with only **16 threads** (real-world `duplicut` run) it can exceed a watchdog such as `timeout 5`, hanging about 1 ‰ of the executions. --- tests/src/thpool_destroy_hang.c | 55 +++++++++++++++++++++++++++++++++ tests/thpool_destroy_hang.sh | 25 +++++++++++++++ thpool.c | 7 +++-- 3 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 tests/src/thpool_destroy_hang.c create mode 100755 tests/thpool_destroy_hang.sh diff --git a/tests/src/thpool_destroy_hang.c b/tests/src/thpool_destroy_hang.c new file mode 100644 index 0000000..1136a7d --- /dev/null +++ b/tests/src/thpool_destroy_hang.c @@ -0,0 +1,55 @@ +/* + * Reproduces the “hang inside thpool_destroy()” bug. + * – Fails (SIGALRM) with the original binary-semaphore implementation. + * – Succeeds instantly once bsem_wait / bsem_post_all are patched. + */ + +#include +#include +#include +#include +#include +#include "../../thpool.h" + +#define TIMEOUT 1 + +/* -------------------------------------------------------------- */ +/* 2. SIGALRM handler: triggers when thpool_destroy() does not */ +/* return within 1 sec, indicating the bug is reproduced. */ +/* -------------------------------------------------------------- */ +static void timeout_handler(int sig) +{ + (void)sig; + fprintf(stderr, + "FAIL: thpool_destroy() did not finish within %d s " + "(bug reproduced)\n", TIMEOUT); + _exit(EXIT_FAILURE); +} + +int main(void) +{ + const int THREADS = 4000; + + /* Watchdog: if we are still alive after 3 s, the bug is present */ + signal(SIGALRM, timeout_handler); + alarm(TIMEOUT); + + printf("Creating pool with %d threads …\n", THREADS); + threadpool tp = thpool_init(THREADS); + + struct timeval t0, t1; + gettimeofday(&t0, NULL); + + printf("Calling thpool_destroy() …\n"); + thpool_destroy(tp); /* ← hangs here when not patched */ + + gettimeofday(&t1, NULL); + alarm(0); /* disarm watchdog */ + + double elapsed = + (t1.tv_sec - t0.tv_sec) + (t1.tv_usec - t0.tv_usec) / 1e6; + + printf("PASS: thpool_destroy() returned in %.3f s (patch OK)\n", + elapsed); + return 0; +} diff --git a/tests/thpool_destroy_hang.sh b/tests/thpool_destroy_hang.sh new file mode 100755 index 0000000..1a7b959 --- /dev/null +++ b/tests/thpool_destroy_hang.sh @@ -0,0 +1,25 @@ +#! /bin/bash +# +# Test for the deterministic thpool_destroy() hang +# + +. funcs.sh + +# -------------------------------------------------- # +# Single test: must finish without hitting SIGALRM # +# -------------------------------------------------- # +function test_thpool_destroy_hang { + compile src/thpool_destroy_hang.c + echo "Testing for thpool_destroy() hang" + output=$(timeout 2 ./test) + if [[ $? != 0 ]]; then + err "thpool_destroy() hang reproduced" "$output" + exit 1 + fi + +} + +# Run the test +test_thpool_destroy_hang + +echo "No errors" diff --git a/thpool.c b/thpool.c index 83885c3..9dfa704 100644 --- a/thpool.c +++ b/thpool.c @@ -25,6 +25,7 @@ #include #include #include +#include #if defined(__linux__) #include #endif @@ -554,7 +555,7 @@ static void bsem_post(bsem *bsem_p) { /* Post to all threads */ static void bsem_post_all(bsem *bsem_p) { pthread_mutex_lock(&bsem_p->mutex); - bsem_p->v = 1; + bsem_p->v = INT_MAX; pthread_cond_broadcast(&bsem_p->cond); pthread_mutex_unlock(&bsem_p->mutex); } @@ -563,9 +564,9 @@ static void bsem_post_all(bsem *bsem_p) { /* Wait on semaphore until semaphore has value 0 */ static void bsem_wait(bsem* bsem_p) { pthread_mutex_lock(&bsem_p->mutex); - while (bsem_p->v != 1) { + while (bsem_p->v == 0) { pthread_cond_wait(&bsem_p->cond, &bsem_p->mutex); } - bsem_p->v = 0; + bsem_p->v--; pthread_mutex_unlock(&bsem_p->mutex); }