diff --git a/project.clj b/project.clj index 47d2d1b..68c4a08 100644 --- a/project.clj +++ b/project.clj @@ -10,7 +10,8 @@ :aliases {"bench" ["with-profile" "profiling" "run" "-m" "hitchhiker.bench"]} :jvm-opts ["-server" "-Xmx3700m" "-Xms3700m"] :profiles {:test - {:dependencies [[org.clojure/test.check "0.9.0"]]} + {:dependencies [[org.clojure/test.check "0.9.0"] + [org.clojure/tools.nrepl "0.2.11"]]} :profiling {:main hitchhiker.bench :source-paths ["env/profiling"] diff --git a/src/hitchhiker/tracing_gc.clj b/src/hitchhiker/tracing_gc.clj new file mode 100644 index 0000000..2c21f67 --- /dev/null +++ b/src/hitchhiker/tracing_gc.clj @@ -0,0 +1,69 @@ +(ns hitchhiker.tracing-gc + (:require [hitchhiker.tree.core :as hh])) + +;; Note: this implementation is single-threaded, and could be made parallel without too much effort + +;; We might need to trace millions or billions of keys. That might not fit in memory, so this could be backed +;; by leveldb or hsql so that we can spill to disk when necessary. We don't need a functional datastructure here. +(defprotocol IGCScratch + (add-to-work-queue! [this addr] "Adds the given address to the work queue to be processed") + (pop-from-work-queue! [this] "Pops the next element off of the work queue, or returns nil if we're done") + (observe-addr! [this addr] "Marks the given addr as being currently active") + (observed? [this addr] "Returns true if the given addr was observed")) +; +;;; The workq is a ref containing a collection of addresses we still need to scan. +;;; The observed-set is a ref containing the set of addresses we know are active +;;; For simplicity, adding an addr to the workq automatically observes it as well +;;; ^^ this allows us to only add new addrs to the workq, without a separate set of "in workq" +(defrecord InMemScratch [workq observed-set] + IGCScratch + (add-to-work-queue! [_ addr] + (dosync + (when-not (contains? @observed-set addr) + (alter workq conj addr) + (alter observed-set conj addr)))) + (pop-from-work-queue! [_] + (dosync + (when (seq @workq) + (let [head (peek @workq)] + (alter workq pop) + head)))) + (observe-addr! [_ addr] + (dosync + (alter observed-set conj addr))) + (observed? [_ addr] + (contains? @observed-set addr))) + +(defn in-mem-scratch + "Creates an instance of in memory GC scratch" + [] + (->InMemScratch (ref []) (ref #{}))) + +(defn trace-gc! + "Does a tracing GC and frees up all unused keys. + This is a simple mark-sweep algorithm. + + gc-scratch should be an instance of IGCScratch + gc-roots should be a list of the roots, which should implement IResolve. These are generated by calls to anchor-root. + all-keys should be a lazy sequence that will contain every key in storage. This algorithm will not hold the whole sequence in memory + delete-fn will be called on every key that should be deleted during the sweep phase" + [gc-scratch gc-roots all-keys delete-fn] + ;; First, we'll initialize the work queue + (doseq [root gc-roots] + (add-to-work-queue! gc-scratch root)) + ;; Now, we'll do the mark phase + (loop [] + (when-let [addr (pop-from-work-queue! gc-scratch)] + (observe-addr! gc-scratch addr) + (when (hh/index? addr) + (let [node (hh/resolve addr)] + (doseq [c (:children node)] + (add-to-work-queue! gc-scratch c)))) + (recur))) + ;; Next, we do the sweep + (loop [ks all-keys] + (when (seq ks) + (let [head (first ks)] + (when-not (observed? gc-scratch head) + (delete-fn head))) + (recur (next ks))))) diff --git a/test/hitchhiker/tracing_gc_test.clj b/test/hitchhiker/tracing_gc_test.clj new file mode 100644 index 0000000..761fa67 --- /dev/null +++ b/test/hitchhiker/tracing_gc_test.clj @@ -0,0 +1,38 @@ +(ns hitchhiker.tracing-gc-test + (:require + [clojure.set :as set] + [clojure.test.check :as tc] + [clojure.test.check.clojure-test :refer [defspec]] + [clojure.test.check.generators :as gen] + [clojure.test.check.properties :as prop] + [hitchhiker.tree.core :as tree] + [hitchhiker.tracing-gc :as gc])) + +(defn tree-insert + [t k] + (tree/insert t k k)) + +(def gen-tree + (gen/bind (gen/vector gen/int) + (fn [vs] + (let [tree (reduce tree-insert (tree/b-tree (tree/->Config 3 2 1)) + vs)] + (gen/return tree))))) + +(defn tree-keys + [b-tree] + (tree-seq tree/index-node? :children b-tree)) + +(defspec unreferenced-keys-are-deleted + 1000 + (prop/for-all [live gen-tree + dead gen-tree] + (let [deleted (atom (set [])) + delete-fn (fn [item] + (swap! deleted conj item))] + (gc/trace-gc! (gc/in-mem-scratch) [live] + (-> (concat (tree-keys live) + (tree-keys dead)) + (shuffle)) + delete-fn) + (= (set @deleted) (set (tree-keys dead))))))