107 changes: 94 additions & 13 deletions Makefile
@@ -1,5 +1,5 @@
CXX := g++
CXXFLAGS := -std=c++17 -O3 -fPIC -march=native -fopenmp
BASE_CXXFLAGS := -std=c++17 -O3 -fPIC

# Python / pybind11 include flags
PYBIND11_INCLUDES := $(shell python3 -m pybind11 --includes)
@@ -28,35 +28,116 @@ SOURCES := \

# Extension suffix (.so or .cpython-XYm-x86_64-linux-gnu.so, etc.)
EXT_SUFFIX := $(shell python3-config --extension-suffix)
TARGET := build/zenann$(EXT_SUFFIX)

# Platform‐specific linker flags
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
# on macOS use dynamic_lookup
LDFLAGS := -undefined dynamic_lookup
BASE_LDFLAGS := -undefined dynamic_lookup
else
# on Linux embed rpath to pick up our extern/faiss libfaiss.so
LDFLAGS := -Wl,-rpath,$$ORIGIN/../extern/faiss/build/install/lib
BASE_LDFLAGS := -Wl,-rpath,$$ORIGIN/../extern/faiss/build/install/lib
endif

# Add OpenMP linking
LDFLAGS += -fopenmp
# ============================================================================
# Version-specific build configurations
# ============================================================================

# Output target (all versions output to the same file)
TARGET := build/zenann$(EXT_SUFFIX)

# NAIVE: No parallelization (baseline)
NAIVE_CXXFLAGS := $(BASE_CXXFLAGS)
NAIVE_LDFLAGS := $(BASE_LDFLAGS)

# OPENMP: Only OpenMP parallelization
OPENMP_CXXFLAGS := $(BASE_CXXFLAGS) -fopenmp -DENABLE_OPENMP
OPENMP_LDFLAGS := $(BASE_LDFLAGS) -fopenmp

# SIMD: Only SIMD vectorization (AVX2)
SIMD_CXXFLAGS := $(BASE_CXXFLAGS) -march=native -DENABLE_SIMD
SIMD_LDFLAGS := $(BASE_LDFLAGS)

.PHONY: all clean prepare
# FULL: OpenMP + SIMD (fully optimized)
FULL_CXXFLAGS := $(BASE_CXXFLAGS) -march=native -fopenmp -DENABLE_OPENMP -DENABLE_SIMD
FULL_LDFLAGS := $(BASE_LDFLAGS) -fopenmp

all: prepare $(TARGET)
# CUDA: CUDA acceleration (placeholder for future)
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) -DENABLE_CUDA
CUDA_LDFLAGS := $(BASE_LDFLAGS) -lcuda -lcudart

# ============================================================================
# Targets
# ============================================================================

.PHONY: all clean prepare naive openmp simd full cuda help

# Default target: build full version
all: full

prepare:
mkdir -p build
@mkdir -p build

# Build the Python extension, linking against our Faiss
$(TARGET): $(SOURCES)
$(CXX) $(CXXFLAGS) $(ALL_INCLUDES) -shared -o $@ \
# Build naive version (no parallelization)
naive: prepare
$(CXX) $(NAIVE_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
$(SOURCES) \
-L$(FAISS_ROOT)/lib -lfaiss \
$(ALL_LIBS) \
$(LDFLAGS)
$(NAIVE_LDFLAGS)
@echo "✓ Built NAIVE version: $(TARGET)"

# Build OpenMP-only version
openmp: prepare
$(CXX) $(OPENMP_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
$(SOURCES) \
-L$(FAISS_ROOT)/lib -lfaiss \
$(ALL_LIBS) \
$(OPENMP_LDFLAGS)
@echo "✓ Built OPENMP version: $(TARGET)"

# Build SIMD-only version
simd: prepare
$(CXX) $(SIMD_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
$(SOURCES) \
-L$(FAISS_ROOT)/lib -lfaiss \
$(ALL_LIBS) \
$(SIMD_LDFLAGS)
@echo "✓ Built SIMD version: $(TARGET)"

# Build full version (OpenMP + SIMD)
full: prepare
$(CXX) $(FULL_CXXFLAGS) $(ALL_INCLUDES) -shared -o $(TARGET) \
$(SOURCES) \
-L$(FAISS_ROOT)/lib -lfaiss \
$(ALL_LIBS) \
$(FULL_LDFLAGS)
@echo "✓ Built FULL version: $(TARGET)"

# Build CUDA version (placeholder)
cuda: prepare
@echo "CUDA version not yet implemented"
@echo "Will output to: $(TARGET)"

# Clean all builds
clean:
rm -rf build

# Help message
help:
@echo "ZenANN Build System - Multiple Optimization Versions"
@echo ""
@echo "Available targets:"
@echo " make naive - Build naive version (no parallelization)"
@echo " make openmp - Build OpenMP-only version"
@echo " make simd - Build SIMD-only version (AVX2)"
@echo " make full - Build fully optimized version (OpenMP + SIMD)"
@echo " make cuda - Build CUDA version (not yet implemented)"
@echo " make all - Build full version (default)"
@echo " make clean - Remove all built files"
@echo ""
@echo "Note: All versions output to build/zenann.so"
@echo "Each build will overwrite the previous one."
@echo ""
@echo "Usage:"
@echo " import build.zenann as zenann # Always works regardless of version"
Comment on lines +127 to +143 (Owner Author):

make help
Explains the make options.
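
Note: all five variants above overwrite the same build/zenann$(EXT_SUFFIX), so the only record of which flag set produced a given binary is the set of -DENABLE_* macros baked into it. A minimal, hypothetical helper (not part of this PR; the function name is illustrative) shows how those macros are meant to gate code at compile time and could report the variant of the currently built module:

#include <string>

// Hypothetical helper, not in the PR: maps the -DENABLE_* flags passed by each
// Makefile target back to the variant name they correspond to.
inline std::string zenann_build_variant() {
#if defined(ENABLE_OPENMP) && defined(ENABLE_SIMD)
  return "full";    // make full:   -march=native -fopenmp -DENABLE_OPENMP -DENABLE_SIMD
#elif defined(ENABLE_OPENMP)
  return "openmp";  // make openmp: -fopenmp -DENABLE_OPENMP
#elif defined(ENABLE_SIMD)
  return "simd";    // make simd:   -march=native -DENABLE_SIMD
#else
  return "naive";   // make naive:  BASE_CXXFLAGS only
#endif
}

Exposing something like this through the pybind11 module would let a user confirm which version of build/zenann.so is currently loaded.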

13 changes: 9 additions & 4 deletions include/zenann/SimdUtils.h
@@ -1,12 +1,16 @@
#pragma once
#include <cstddef>

#if defined(ENABLE_SIMD)
#include <immintrin.h>
#endif

namespace zenann {
inline float l2_simd(const float* __restrict a,
const float* __restrict b,
size_t dim) {
#if defined(__AVX2__)

// L2 distance calculation with optional SIMD optimization
inline float l2_distance(const float* a, const float* b, size_t dim) {
#if defined(ENABLE_SIMD) && defined(__AVX2__)
// SIMD version using AVX2
const size_t step = 8; // 8 × 32-bit floats
__m256 acc = _mm256_setzero_ps();
size_t i = 0;
@@ -27,6 +31,7 @@ inline float l2_simd(const float* __restrict a,
}
return d;
#else
// Naive version (no SIMD or AVX2 not available)
float d = 0.f;
for (size_t i = 0; i < dim; ++i) {
float diff = a[i] - b[i];
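The hunk above collapses the middle of the AVX2 path, so only the loop setup and the scalar fallback are visible. A self-contained sketch of what the complete l2_distance plausibly looks like after this change — the vector loop body and the horizontal sum are reconstructions under the assumptions of unaligned loads and a scalar tail for dimensions not divisible by 8, not the PR's exact code:

#pragma once
#include <cstddef>

#if defined(ENABLE_SIMD)
#include <immintrin.h>
#endif

namespace zenann {

// L2 distance with optional AVX2 acceleration; returns the sum of squared
// differences, matching the visible naive branch. Falls back to the plain loop
// when ENABLE_SIMD is off or AVX2 is unavailable.
inline float l2_distance(const float* a, const float* b, size_t dim) {
#if defined(ENABLE_SIMD) && defined(__AVX2__)
  const size_t step = 8;                    // 8 x 32-bit floats per __m256
  __m256 acc = _mm256_setzero_ps();
  size_t i = 0;
  for (; i + step <= dim; i += step) {
    __m256 va = _mm256_loadu_ps(a + i);     // unaligned loads: callers give no alignment guarantee
    __m256 vb = _mm256_loadu_ps(b + i);
    __m256 diff = _mm256_sub_ps(va, vb);
    acc = _mm256_add_ps(acc, _mm256_mul_ps(diff, diff));
  }
  float buf[8];
  _mm256_storeu_ps(buf, acc);               // horizontal sum of the 8 partial sums
  float d = buf[0] + buf[1] + buf[2] + buf[3] + buf[4] + buf[5] + buf[6] + buf[7];
  for (; i < dim; ++i) {                    // scalar tail when dim % 8 != 0
    float diff = a[i] - b[i];
    d += diff * diff;
  }
  return d;
#else
  // Naive version (no SIMD or AVX2 not available)
  float d = 0.f;
  for (size_t i = 0; i < dim; ++i) {
    float diff = a[i] - b[i];
    d += diff * diff;
  }
  return d;
#endif
}

}  // namespace zenann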
36 changes: 30 additions & 6 deletions src/IVFFlatIndex.cpp
@@ -1,6 +1,8 @@
#include "IVFFlatIndex.h"
#include "SimdUtils.h"
#ifdef ENABLE_OPENMP
#include <omp.h>
#endif
#include <limits>
#include <random>
#include <algorithm>
@@ -48,10 +50,12 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
std::vector<Pair> cdist(nlist_);
std::vector<Pair> heap;

// Calculate distance from query to all centroids (parallelized)
// Calculate distance from query to all centroids
#ifdef ENABLE_OPENMP
#pragma omp parallel for schedule(static)
#endif
for (size_t c = 0; c < nlist_; ++c) {
float d = l2_simd(query.data(), centroids_[c].data(), dimension_);
float d = l2_distance(query.data(), centroids_[c].data(), dimension_);
cdist[c] = {d, c};
}

@@ -62,23 +66,27 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
}
);

// Probe nprobe nearest lists in parallel
// Each thread maintains a local heap, then merges into global heap
// Probe nprobe nearest lists
heap.reserve(k);
const auto& data = datastore_->getAll();

#ifdef ENABLE_OPENMP
#pragma omp parallel for schedule(dynamic)
#endif
for (size_t pi = 0; pi < nprobe_; ++pi) {
size_t c = cdist[pi].second;

#ifdef ENABLE_OPENMP
// Thread-local heap for this cluster
std::vector<Pair> local;
local.reserve(k);
#endif

// Search within this cluster's inverted list
for (size_t id : lists_[c]) {
float dist = l2_simd(query.data(), data[id].data(), dimension_);
float dist = l2_distance(query.data(), data[id].data(), dimension_);

#ifdef ENABLE_OPENMP
if (local.size() < k) {
local.emplace_back(dist, id);
if (local.size() == k) {
@@ -89,8 +97,21 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
local.back() = {dist, id};
std::push_heap(local.begin(), local.end());
}
#else
if (heap.size() < k) {
heap.emplace_back(dist, id);
if (heap.size() == k) {
std::make_heap(heap.begin(), heap.end());
}
} else if (dist < heap.front().first) {
std::pop_heap(heap.begin(), heap.end());
heap.back() = {dist, id};
std::push_heap(heap.begin(), heap.end());
}
#endif
}

#ifdef ENABLE_OPENMP
// Merge local results into global heap (thread-safe)
#pragma omp critical
{
@@ -107,6 +128,7 @@ SearchResult IVFFlatIndex::search(const Vector& query, size_t k) const {
}
}
}
#endif
}

std::sort(heap.begin(), heap.end(),
@@ -134,8 +156,10 @@ std::vector<SearchResult> IVFFlatIndex::search_batch(const Dataset& queries, siz
const size_t nq = queries.size();
std::vector<SearchResult> results(nq);

// Parallel batch search with dynamic scheduling
// Batch search with optional parallelization
#ifdef ENABLE_OPENMP
#pragma omp parallel for schedule(dynamic)
#endif
for (size_t i = 0; i < nq; ++i) {
results[i] = search(queries[i], k);
}
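Both the per-cluster scan and the non-OpenMP fallback above use the same top-k pattern: a max-heap of (distance, id) pairs capped at k elements, heapified once the buffer fills, after which a new candidate only replaces the current worst (heap.front()) when it is strictly closer. With ENABLE_OPENMP defined, each thread applies this pattern to a thread-local heap and merges the result into the shared heap inside the #pragma omp critical block. A standalone sketch of the heap pattern itself — Pair, push_topk, and topk are illustrative names, not identifiers from the PR:

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

using Pair = std::pair<float, std::size_t>;  // (distance, id); std::pair compares by distance first

// Insert one candidate into a max-heap capped at k elements.
// heap.front() holds the worst (largest) distance currently kept.
inline void push_topk(std::vector<Pair>& heap, std::size_t k, float dist, std::size_t id) {
  if (heap.size() < k) {
    heap.emplace_back(dist, id);
    if (heap.size() == k) {
      std::make_heap(heap.begin(), heap.end());   // heapify once the buffer is full
    }
  } else if (dist < heap.front().first) {
    std::pop_heap(heap.begin(), heap.end());      // evict the current worst
    heap.back() = {dist, id};
    std::push_heap(heap.begin(), heap.end());
  }
}

// Collect the k nearest candidates, sorted ascending by distance.
inline std::vector<Pair> topk(const std::vector<Pair>& candidates, std::size_t k) {
  std::vector<Pair> heap;
  heap.reserve(k);
  for (const auto& c : candidates) {
    push_topk(heap, k, c.first, c.second);
  }
  std::sort(heap.begin(), heap.end());
  return heap;
}

Because std::pair compares by its first element, the default std::make_heap/std::push_heap comparator already places the largest distance at the root — exactly the element to evict — so the capped max-heap needs no custom comparator.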