From 6b3da7a977db47516a1db54523bcca2da612ff1c Mon Sep 17 00:00:00 2001 From: David Greenberg Date: Fri, 2 Sep 2016 17:01:57 -0600 Subject: [PATCH 1/3] Initial tracing gc draft without testing --- src/hitchhiker/tracing_gc.clj | 69 +++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/hitchhiker/tracing_gc.clj diff --git a/src/hitchhiker/tracing_gc.clj b/src/hitchhiker/tracing_gc.clj new file mode 100644 index 0000000..0576ab6 --- /dev/null +++ b/src/hitchhiker/tracing_gc.clj @@ -0,0 +1,69 @@ +(ns hitchhiker.tracing-gc + (:require [hitchhiker.tree.core :as hh])) + +;; Note: this implementation is single-threaded, and could be made parallel without too much effort + +;; We might need to trace millions or billions of keys. That might not fit in memory, so this could be backed +;; by leveldb or hsql so that we can spill to disk when necessary. We don't need a functional datastructure here. +(defprotocol IGCScratch + (add-to-work-queue! [this addr] "Adds the given address to the work queue to be processed") + (pop-from-work-queue! [this] "Pops the next element off of the work queue, or returns nil if we're done") + (observe-addr! [this addr] "Marks the given addr as being currently active") + (observed? [this addr] "Returns true if the given addr was observed")) +; +;;; The workq is a ref containing a collection of addresses we still need to scan. +;;; The observed-set is a ref containing the set of addresses we know are active +;;; For simplicity, adding an addr to the workq automatically observes it as well +;;; ^^ this allows us to only add new addrs to the workq, without a separate set of "in workq" +(defrecord InMemScratch [workq observed-set] + IGCScratch + (add-to-work-queue! [_ addr] + (dosync + (when-not (contains? @observed-set addr) + (alter workq conj addr) + (alter observed-set conj addr)))) + (pop-from-work-queue! [_] + (dosync + (when (seq @workq) + (let [head (peek @workq)] + (alter workq pop) + head)))) + (observe-addr! [_ addr] + (dosync + (alter conj observed-set addr))) + (observed? [_ addr] + (contains? @observed-set addr))) + +(defn in-mem-scratch + "Creates an instance of in memory GC scratch" + [] + (->InMemScratch [] #{})) + +(defn trace-gc! + "Does a tracing GC and frees up all unused keys. + This is a simple mark-sweep algorithm. + + gc-scratch should be an instance of IGCScratch + gc-roots should be a list of the roots, which should implement IResolve. These are generated by calls to anchor-root. + all-keys should be a lazy sequence that will contain every key in storage. This algorithm will not hold the whole sequence in memory + delete-fn will be called on every key that should be deleted during the sweep phase" + [gc-scratch gc-roots all-keys delete-fn] + ;; First, we'll initialize the work queue + (doseq [root gc-roots] + (add-to-work-queue! gc-scratch root)) + ;; Now, we'll do the mark phase + (loop [] + (when-let [addr (pop-from-work-queue! gc-scratch)] + (observe-addr! gc-scratch addr) + (let [node (hh/resolve addr)] + (when (hh/index-node? node) + (doseq [c (:children node)] + (add-to-work-queue! gc-scratch c)))) + (recur))) + ;; Next, we do the sweep + (loop [ks all-keys] + (when (seq ks) + (let [head (first ks)] + (when-not (observed? gc-scratch head) + (delete-fn head))) + (recur (next ks))))) From 5602e6176403ed0b43afa6c6b016f8323849a452 Mon Sep 17 00:00:00 2001 From: Andy Chambers Date: Sun, 4 Sep 2016 20:09:37 -0700 Subject: [PATCH 2/3] Add basic test for tracing garbage collection The test builds two random b-trees, and then uses one of them as a gc root, and including the other in the lazy sequence of "all keys" to be dealt with by the collector. After running a gc, we assert that the `deleted-fn` has been invoked against all the "dead" nodes. --- src/hitchhiker/tracing_gc.clj | 6 ++--- test/hitchhiker/tracing_gc_test.clj | 38 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 test/hitchhiker/tracing_gc_test.clj diff --git a/src/hitchhiker/tracing_gc.clj b/src/hitchhiker/tracing_gc.clj index 0576ab6..655876f 100644 --- a/src/hitchhiker/tracing_gc.clj +++ b/src/hitchhiker/tracing_gc.clj @@ -30,19 +30,19 @@ head)))) (observe-addr! [_ addr] (dosync - (alter conj observed-set addr))) + (alter observed-set conj addr))) (observed? [_ addr] (contains? @observed-set addr))) (defn in-mem-scratch "Creates an instance of in memory GC scratch" [] - (->InMemScratch [] #{})) + (->InMemScratch (ref []) (ref #{}))) (defn trace-gc! "Does a tracing GC and frees up all unused keys. This is a simple mark-sweep algorithm. - + gc-scratch should be an instance of IGCScratch gc-roots should be a list of the roots, which should implement IResolve. These are generated by calls to anchor-root. all-keys should be a lazy sequence that will contain every key in storage. This algorithm will not hold the whole sequence in memory diff --git a/test/hitchhiker/tracing_gc_test.clj b/test/hitchhiker/tracing_gc_test.clj new file mode 100644 index 0000000..761fa67 --- /dev/null +++ b/test/hitchhiker/tracing_gc_test.clj @@ -0,0 +1,38 @@ +(ns hitchhiker.tracing-gc-test + (:require + [clojure.set :as set] + [clojure.test.check :as tc] + [clojure.test.check.clojure-test :refer [defspec]] + [clojure.test.check.generators :as gen] + [clojure.test.check.properties :as prop] + [hitchhiker.tree.core :as tree] + [hitchhiker.tracing-gc :as gc])) + +(defn tree-insert + [t k] + (tree/insert t k k)) + +(def gen-tree + (gen/bind (gen/vector gen/int) + (fn [vs] + (let [tree (reduce tree-insert (tree/b-tree (tree/->Config 3 2 1)) + vs)] + (gen/return tree))))) + +(defn tree-keys + [b-tree] + (tree-seq tree/index-node? :children b-tree)) + +(defspec unreferenced-keys-are-deleted + 1000 + (prop/for-all [live gen-tree + dead gen-tree] + (let [deleted (atom (set [])) + delete-fn (fn [item] + (swap! deleted conj item))] + (gc/trace-gc! (gc/in-mem-scratch) [live] + (-> (concat (tree-keys live) + (tree-keys dead)) + (shuffle)) + delete-fn) + (= (set @deleted) (set (tree-keys dead)))))) From 5824058f2d5c03b15c9f9c754b9639fba790222e Mon Sep 17 00:00:00 2001 From: David Greenberg Date: Tue, 6 Sep 2016 10:37:13 -0600 Subject: [PATCH 3/3] Reduce IO costs for tracing GC --- project.clj | 3 ++- src/hitchhiker/tracing_gc.clj | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/project.clj b/project.clj index 47d2d1b..68c4a08 100644 --- a/project.clj +++ b/project.clj @@ -10,7 +10,8 @@ :aliases {"bench" ["with-profile" "profiling" "run" "-m" "hitchhiker.bench"]} :jvm-opts ["-server" "-Xmx3700m" "-Xms3700m"] :profiles {:test - {:dependencies [[org.clojure/test.check "0.9.0"]]} + {:dependencies [[org.clojure/test.check "0.9.0"] + [org.clojure/tools.nrepl "0.2.11"]]} :profiling {:main hitchhiker.bench :source-paths ["env/profiling"] diff --git a/src/hitchhiker/tracing_gc.clj b/src/hitchhiker/tracing_gc.clj index 655876f..2c21f67 100644 --- a/src/hitchhiker/tracing_gc.clj +++ b/src/hitchhiker/tracing_gc.clj @@ -55,8 +55,8 @@ (loop [] (when-let [addr (pop-from-work-queue! gc-scratch)] (observe-addr! gc-scratch addr) - (let [node (hh/resolve addr)] - (when (hh/index-node? node) + (when (hh/index? addr) + (let [node (hh/resolve addr)] (doseq [c (:children node)] (add-to-work-queue! gc-scratch c)))) (recur)))