diff --git a/Makefile b/Makefile index c5433bb..60d14fe 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CXX := g++ -CXXFLAGS := -std=c++17 -O3 -fPIC -march=native -fopenmp +BASE_CXXFLAGS := -std=c++17 -O3 -fPIC # Python / pybind11 include flags PYBIND11_INCLUDES := $(shell python3 -m pybind11 --includes) @@ -28,35 +28,116 @@ SOURCES := \ # Extension suffix (.so or .cpython-XYm-x86_64-linux-gnu.so, etc.) EXT_SUFFIX := $(shell python3-config --extension-suffix) -TARGET := build/zenann$(EXT_SUFFIX) # Platform‐specific linker flags UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) # on macOS use dynamic_lookup - LDFLAGS := -undefined dynamic_lookup + BASE_LDFLAGS := -undefined dynamic_lookup else # on Linux embed rpath to pick up our extern/faiss libfaiss.so - LDFLAGS := -Wl,-rpath,$$ORIGIN/../extern/faiss/build/install/lib + BASE_LDFLAGS := -Wl,-rpath,$$ORIGIN/../extern/faiss/build/install/lib endif -# Add OpenMP linking -LDFLAGS += -fopenmp +# ============================================================================ +# Version-specific build configurations +# ============================================================================ + +# Output target (all versions output to the same file) +TARGET := build/zenann$(EXT_SUFFIX) + +# NAIVE: No parallelization (baseline) +NAIVE_CXXFLAGS := $(BASE_CXXFLAGS) +NAIVE_LDFLAGS := $(BASE_LDFLAGS) + +# OPENMP: Only OpenMP parallelization +OPENMP_CXXFLAGS := $(BASE_CXXFLAGS) -fopenmp -DENABLE_OPENMP +OPENMP_LDFLAGS := $(BASE_LDFLAGS) -fopenmp + +# SIMD: Only SIMD vectorization (AVX2) +SIMD_CXXFLAGS := $(BASE_CXXFLAGS) -march=native -DENABLE_SIMD +SIMD_LDFLAGS := $(BASE_LDFLAGS) -.PHONY: all clean prepare +# FULL: OpenMP + SIMD (fully optimized) +FULL_CXXFLAGS := $(BASE_CXXFLAGS) -march=native -fopenmp -DENABLE_OPENMP -DENABLE_SIMD +FULL_LDFLAGS := $(BASE_LDFLAGS) -fopenmp -all: prepare $(TARGET) +# CUDA: CUDA acceleration (placeholder for future) +CUDA_CXXFLAGS := $(BASE_CXXFLAGS) -DENABLE_CUDA +CUDA_LDFLAGS := $(BASE_LDFLAGS) -lcuda -lcudart + +# ============================================================================ +# Targets +# ============================================================================ + +.PHONY: all clean prepare naive openmp simd full cuda help + +# Default target: build full version +all: full prepare: - mkdir -p build + @mkdir -p build -# Build the Python extension, linking against our Faiss -$(TARGET): $(SOURCES) - $(CXX) $(CXXFLAGS) $(ALL_INCLUDES) -shared -o $@ \ +# Build naive version (no parallelization) +naive: prepare + $(CXX) $(NAIVE_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \ $(SOURCES) \ -L$(FAISS_ROOT)/lib -lfaiss \ $(ALL_LIBS) \ - $(LDFLAGS) + $(NAIVE_LDFLAGS) + @echo "✓ Built NAIVE version: $(TARGET)" +# Build OpenMP-only version +openmp: prepare + $(CXX) $(OPENMP_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \ + $(SOURCES) \ + -L$(FAISS_ROOT)/lib -lfaiss \ + $(ALL_LIBS) \ + $(OPENMP_LDFLAGS) + @echo "✓ Built OPENMP version: $(TARGET)" + +# Build SIMD-only version +simd: prepare + $(CXX) $(SIMD_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \ + $(SOURCES) \ + -L$(FAISS_ROOT)/lib -lfaiss \ + $(ALL_LIBS) \ + $(SIMD_LDFLAGS) + @echo "✓ Built SIMD version: $(TARGET)" + +# Build full version (OpenMP + SIMD) +full: prepare + $(CXX) $(FULL_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \ + $(SOURCES) \ + -L$(FAISS_ROOT)/lib -lfaiss \ + $(ALL_LIBS) \ + $(FULL_LDFLAGS) + @echo "✓ Built FULL version: $(TARGET)" + +# Build CUDA version (placeholder) +cuda: prepare + @echo "CUDA version not yet implemented" + @echo "Will output to: $(TARGET)" + +# Clean all builds clean: rm -rf build + +# Help message +help: + @echo "ZenANN Build System - Multiple Optimization Versions" + @echo "" + @echo "Available targets:" + @echo " make naive - Build naive version (no parallelization)" + @echo " make openmp - Build OpenMP-only version" + @echo " make simd - Build SIMD-only version (AVX2)" + @echo " make full - Build fully optimized version (OpenMP + SIMD)" + @echo " make cuda - Build CUDA version (not yet implemented)" + @echo " make all - Build full version (default)" + @echo " make clean - Remove all built files" + @echo "" + @echo "Note: All versions output to build/zenann.so" + @echo "Each build will overwrite the previous one." + @echo "" + @echo "Usage:" + @echo " import build.zenann as zenann # Always works regardless of version" diff --git a/include/zenann/SimdUtils.h b/include/zenann/SimdUtils.h index c1e522e..92d23b8 100644 --- a/include/zenann/SimdUtils.h +++ b/include/zenann/SimdUtils.h @@ -1,12 +1,16 @@ #pragma once #include + +#if defined(ENABLE_SIMD) #include +#endif namespace zenann { -inline float l2_simd(const float* __restrict a, - const float* __restrict b, - size_t dim) { -#if defined(__AVX2__) + +// L2 distance calculation with optional SIMD optimization +inline float l2_distance(const float* a, const float* b, size_t dim) { +#if defined(ENABLE_SIMD) && defined(__AVX2__) + // SIMD version using AVX2 const size_t step = 8; // 8 × 32-bit floats __m256 acc = _mm256_setzero_ps(); size_t i = 0; @@ -27,6 +31,7 @@ inline float l2_simd(const float* __restrict a, } return d; #else + // Naive version (no SIMD or AVX2 not available) float d = 0.f; for (size_t i = 0; i < dim; ++i) { float diff = a[i] - b[i]; diff --git a/src/IVFFlatIndex.cpp b/src/IVFFlatIndex.cpp index 45465ba..37c7375 100644 --- a/src/IVFFlatIndex.cpp +++ b/src/IVFFlatIndex.cpp @@ -1,6 +1,8 @@ #include "IVFFlatIndex.h" #include "SimdUtils.h" +#ifdef ENABLE_OPENMP #include +#endif #include #include #include @@ -48,10 +50,12 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const { std::vector cdist(nlist_); std::vector heap; - // Calculate distance from query to all centroids (parallelized) + // Calculate distance from query to all centroids +#ifdef ENABLE_OPENMP #pragma omp parallel for schedule(static) +#endif for (size_t c = 0; c < nlist_; ++c) { - float d = l2_simd(query.data(), centroids_[c].data(), dimension_); + float d = l2_distance(query.data(), centroids_[c].data(), dimension_); cdist[c] = {d, c}; } @@ -62,23 +66,27 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const { } ); - // Probe nprobe nearest lists in parallel - // Each thread maintains a local heap, then merges into global heap + // Probe nprobe nearest lists heap.reserve(k); const auto& data = datastore_->getAll(); +#ifdef ENABLE_OPENMP #pragma omp parallel for schedule(dynamic) +#endif for (size_t pi = 0; pi < nprobe_; ++pi) { size_t c = cdist[pi].second; +#ifdef ENABLE_OPENMP // Thread-local heap for this cluster std::vector local; local.reserve(k); +#endif // Search within this cluster's inverted list for (size_t id : lists_[c]) { - float dist = l2_simd(query.data(), data[id].data(), dimension_); + float dist = l2_distance(query.data(), data[id].data(), dimension_); +#ifdef ENABLE_OPENMP if (local.size() < k) { local.emplace_back(dist, id); if (local.size() == k) { @@ -89,8 +97,21 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const { local.back() = {dist, id}; std::push_heap(local.begin(), local.end()); } +#else + if (heap.size() < k) { + heap.emplace_back(dist, id); + if (heap.size() == k) { + std::make_heap(heap.begin(), heap.end()); + } + } else if (dist < heap.front().first) { + std::pop_heap(heap.begin(), heap.end()); + heap.back() = {dist, id}; + std::push_heap(heap.begin(), heap.end()); + } +#endif } +#ifdef ENABLE_OPENMP // Merge local results into global heap (thread-safe) #pragma omp critical { @@ -107,6 +128,7 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const { } } } +#endif } std::sort(heap.begin(), heap.end(), @@ -134,8 +156,10 @@ std::vector IVFFlatIndex::search_batch(const Dataset& queries, siz const size_t nq = queries.size(); std::vector results(nq); - // Parallel batch search with dynamic scheduling + // Batch search with optional parallelization +#ifdef ENABLE_OPENMP #pragma omp parallel for schedule(dynamic) +#endif for (size_t i = 0; i < nq; ++i) { results[i] = search(queries[i], k); }