diff --git a/Makefile b/Makefile
index c5433bb..60d14fe 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 CXX := g++
-CXXFLAGS := -std=c++17 -O3 -fPIC -march=native -fopenmp
+BASE_CXXFLAGS := -std=c++17 -O3 -fPIC
 
 # Python / pybind11 include flags
 PYBIND11_INCLUDES := $(shell python3 -m pybind11 --includes)
@@ -28,35 +28,116 @@ SOURCES := \
 
 # Extension suffix (.so or .cpython-XYm-x86_64-linux-gnu.so, etc.)
 EXT_SUFFIX := $(shell python3-config --extension-suffix)
-TARGET := build/zenann$(EXT_SUFFIX)
 
 # Platform‐specific linker flags
 UNAME_S := $(shell uname -s)
 ifeq ($(UNAME_S),Darwin)
     # on macOS use dynamic_lookup
-    LDFLAGS := -undefined dynamic_lookup
+    BASE_LDFLAGS := -undefined dynamic_lookup
 else
     # on Linux embed rpath to pick up our extern/faiss libfaiss.so
-    LDFLAGS := -Wl,-rpath,$$ORIGIN/../extern/faiss/build/install/lib
+    BASE_LDFLAGS := -Wl,-rpath,$$ORIGIN/../extern/faiss/build/install/lib
 endif
 
-# Add OpenMP linking
-LDFLAGS += -fopenmp
+# ============================================================================
+# Version-specific build configurations
+# ============================================================================
+
+# Output target (all versions output to the same file)
+TARGET := build/zenann$(EXT_SUFFIX)
+
+# NAIVE: baseline build; base flags only, neither ENABLE_OPENMP nor ENABLE_SIMD is defined
+NAIVE_CXXFLAGS := $(BASE_CXXFLAGS)
+NAIVE_LDFLAGS := $(BASE_LDFLAGS)
+
+# OPENMP: OpenMP parallelization only; -fopenmp is passed at both compile and link time
+OPENMP_CXXFLAGS := $(BASE_CXXFLAGS) -fopenmp -DENABLE_OPENMP
+OPENMP_LDFLAGS := $(BASE_LDFLAGS) -fopenmp
+
+# SIMD: SIMD only; -march=native targets the build host, so the AVX2 code path needs an AVX2-capable CPU
+SIMD_CXXFLAGS := $(BASE_CXXFLAGS) -march=native -DENABLE_SIMD
+SIMD_LDFLAGS := $(BASE_LDFLAGS)
 
-.PHONY: all clean prepare
+# FULL: OpenMP + SIMD combined (fully optimized); this is what the default `all` target builds
+FULL_CXXFLAGS := $(BASE_CXXFLAGS) -march=native -fopenmp -DENABLE_OPENMP -DENABLE_SIMD
+FULL_LDFLAGS := $(BASE_LDFLAGS) -fopenmp
 
-all: prepare $(TARGET)
+# CUDA: placeholder flags for future CUDA support; the `cuda` rule in this file does not use them yet
+CUDA_CXXFLAGS := $(BASE_CXXFLAGS) -DENABLE_CUDA
+CUDA_LDFLAGS := $(BASE_LDFLAGS) -lcuda -lcudart
+
+# ============================================================================
+# Targets
+# ============================================================================
+
+.PHONY: all clean prepare naive openmp simd full cuda help
+
+# Default target: build full version
+all: full
 
 prepare:
-	mkdir -p build
+	@mkdir -p build
 
-# Build the Python extension, linking against our Faiss
-$(TARGET): $(SOURCES)
-	$(CXX) $(CXXFLAGS) $(ALL_INCLUDES) -shared -o $@ \
+# Build naive version: base flags only (no -fopenmp, no -march=native); phony, so it recompiles on every run
+naive: prepare
+	$(CXX) $(NAIVE_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
 	    $(SOURCES) \
 	    -L$(FAISS_ROOT)/lib -lfaiss \
 	    $(ALL_LIBS) \
-	    $(LDFLAGS)
+	    $(NAIVE_LDFLAGS)
+	@echo "✓ Built NAIVE version: $(TARGET)"
 
+# Build OpenMP-only version: adds -fopenmp and -DENABLE_OPENMP, no -march=native; phony, overwrites $(TARGET)
+openmp: prepare
+	$(CXX) $(OPENMP_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
+	    $(SOURCES) \
+	    -L$(FAISS_ROOT)/lib -lfaiss \
+	    $(ALL_LIBS) \
+	    $(OPENMP_LDFLAGS)
+	@echo "✓ Built OPENMP version: $(TARGET)"
+
+# Build SIMD-only version: adds -march=native and -DENABLE_SIMD, no OpenMP; phony, overwrites $(TARGET)
+simd: prepare
+	$(CXX) $(SIMD_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
+	    $(SOURCES) \
+	    -L$(FAISS_ROOT)/lib -lfaiss \
+	    $(ALL_LIBS) \
+	    $(SIMD_LDFLAGS)
+	@echo "✓ Built SIMD version: $(TARGET)"
+
+# Build full version (OpenMP + SIMD); run by plain `make` / `make all`; phony, overwrites $(TARGET)
+full: prepare
+	$(CXX) $(FULL_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
+	    $(SOURCES) \
+	    -L$(FAISS_ROOT)/lib -lfaiss \
+	    $(ALL_LIBS) \
+	    $(FULL_LDFLAGS)
+	@echo "✓ Built FULL version: $(TARGET)"
+
+# CUDA placeholder: only prints a notice; nothing is compiled, so any existing $(TARGET) is left as it was
+cuda: prepare
+	@echo "CUDA version not yet implemented"
+	@echo "Will output to: $(TARGET)"
+
+# Clean all builds
 clean:
 	rm -rf build
+
+# Help message: lists the targets; prints $(TARGET) because the real file name carries the Python extension suffix
+help:
+	@echo "ZenANN Build System - Multiple Optimization Versions"
+	@echo ""
+	@echo "Available targets:"
+	@echo "  make naive   - Build naive version (no parallelization)"
+	@echo "  make openmp  - Build OpenMP-only version"
+	@echo "  make simd    - Build SIMD-only version (AVX2 if the host CPU supports it)"
+	@echo "  make full    - Build fully optimized version (OpenMP + SIMD)"
+	@echo "  make cuda    - Build CUDA version (not yet implemented)"
+	@echo "  make all     - Build full version (default)"
+	@echo "  make clean   - Remove all built files"
+	@echo ""
+	@echo "Note: All versions output to $(TARGET)"
+	@echo "Each build will overwrite the previous one."
+	@echo ""
+	@echo "Usage:"
+	@echo "  import build.zenann as zenann    # Always works regardless of version"
diff --git a/include/zenann/SimdUtils.h b/include/zenann/SimdUtils.h
index c1e522e..92d23b8 100644
--- a/include/zenann/SimdUtils.h
+++ b/include/zenann/SimdUtils.h
@@ -1,12 +1,16 @@
 #pragma once
 #include <cstddef>
+
+#if defined(ENABLE_SIMD)
 #include <immintrin.h>
+#endif
 
 namespace zenann {
-inline float l2_simd(const float* __restrict a,
-                     const float* __restrict b,
-                     size_t dim) {
-#if defined(__AVX2__)
+
+// L2 distance calculation with optional SIMD optimization
+inline float l2_distance(const float* a, const float* b, size_t dim) {
+#if defined(ENABLE_SIMD) && defined(__AVX2__)
+    // SIMD version using AVX2
     const size_t step = 8;            // 8 × 32-bit floats
     __m256 acc       = _mm256_setzero_ps();
     size_t i         = 0;
@@ -27,6 +31,7 @@ inline float l2_simd(const float* __restrict a,
     }
     return d;
 #else
+    // Naive version (no SIMD or AVX2 not available)
     float d = 0.f;
     for (size_t i = 0; i < dim; ++i) {
         float diff = a[i] - b[i];
diff --git a/src/IVFFlatIndex.cpp b/src/IVFFlatIndex.cpp
index 45465ba..37c7375 100644
--- a/src/IVFFlatIndex.cpp
+++ b/src/IVFFlatIndex.cpp
@@ -1,6 +1,8 @@
 #include "IVFFlatIndex.h"
 #include "SimdUtils.h"
+#ifdef ENABLE_OPENMP
 #include <omp.h>
+#endif
 #include <limits>
 #include <random>
 #include <algorithm>
@@ -48,10 +50,12 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
     std::vector<Pair> cdist(nlist_);
     std::vector<Pair> heap;
 
-    // Calculate distance from query to all centroids (parallelized)
+    // Calculate distance from query to all centroids
+#ifdef ENABLE_OPENMP
     #pragma omp parallel for schedule(static)
+#endif
     for (size_t c = 0; c < nlist_; ++c) {
-        float d = l2_simd(query.data(), centroids_[c].data(), dimension_);
+        float d = l2_distance(query.data(), centroids_[c].data(), dimension_);
         cdist[c] = {d, c};
     }
 
@@ -62,23 +66,27 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
         }
     );
 
-    // Probe nprobe nearest lists in parallel
-    // Each thread maintains a local heap, then merges into global heap
+    // Probe nprobe nearest lists
     heap.reserve(k);
     const auto& data = datastore_->getAll();
 
+#ifdef ENABLE_OPENMP
     #pragma omp parallel for schedule(dynamic)
+#endif
     for (size_t pi = 0; pi < nprobe_; ++pi) {
         size_t c = cdist[pi].second;
 
+#ifdef ENABLE_OPENMP
         // Thread-local heap for this cluster
         std::vector<Pair> local;
         local.reserve(k);
+#endif
 
         // Search within this cluster's inverted list
         for (size_t id : lists_[c]) {
-            float dist = l2_simd(query.data(), data[id].data(), dimension_);
+            float dist = l2_distance(query.data(), data[id].data(), dimension_);
 
+#ifdef ENABLE_OPENMP
             if (local.size() < k) {
                 local.emplace_back(dist, id);
                 if (local.size() == k) {
@@ -89,8 +97,21 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
                 local.back() = {dist, id};
                 std::push_heap(local.begin(), local.end());
             }
+#else
+            if (heap.size() < k) {
+                heap.emplace_back(dist, id);
+                if (heap.size() == k) {
+                    std::make_heap(heap.begin(), heap.end());
+                }
+            } else if (dist < heap.front().first) {
+                std::pop_heap(heap.begin(), heap.end());
+                heap.back() = {dist, id};
+                std::push_heap(heap.begin(), heap.end());
+            }
+#endif
         }
 
+#ifdef ENABLE_OPENMP
         // Merge local results into global heap (thread-safe)
         #pragma omp critical
         {
@@ -107,6 +128,7 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
                 }
             }
         }
+#endif
     }
 
     std::sort(heap.begin(), heap.end(),
@@ -134,8 +156,10 @@ std::vector<SearchResult> IVFFlatIndex::search_batch(const Dataset& queries, siz
     const size_t nq = queries.size();
     std::vector<SearchResult> results(nq);
 
-    // Parallel batch search with dynamic scheduling
+    // Batch search with optional parallelization
+#ifdef ENABLE_OPENMP
     #pragma omp parallel for schedule(dynamic)
+#endif
     for (size_t i = 0; i < nq; ++i) {
         results[i] = search(queries[i], k);
     }