Changes from all commits
65 commits
4a9d6a7
Implement Kernel Optimization and Custom Operators for Inference (#412)
claude Nov 8, 2025
828b1fc
merge: integrate master into inference optimization branch
ooples Dec 14, 2025
781ac66
refactor: move simdkernels and platformdetector to aidotnet.tensors
ooples Dec 14, 2025
9128f40
refactor: move optimization utilities to aidotnet.tensors
ooples Dec 15, 2025
510b57d
fix: update tensor api from dimensions to shape in kernels
ooples Dec 15, 2025
337def0
chore: remove outdated examples from inferenceoptimization
ooples Dec 15, 2025
41cc4b6
fix: correct simd api usage for runtime intrinsics compatibility
ooples Dec 15, 2025
e45fcae
feat: add data property for direct array access in tensor and vector
ooples Dec 15, 2025
e586b54
fix: resolve nullable reference type warnings in inference optimization
ooples Dec 15, 2025
8852012
fix: enable ilgpu algorithms extension for roundtoeven support
ooples Dec 15, 2025
810481a
fix: correct gpu stress test performance assertion and update readme …
ooples Dec 15, 2025
1c14ccd
Merge branch 'master' into claude/fix-issue-412-011CUvkJr1v1wzQk6GydfWbN
ooples Dec 15, 2025
c1c4628
fix: address pr review comments for code quality improvements
ooples Dec 15, 2025
5c5a1aa
fix: address pr review comments for inference optimization
ooples Dec 15, 2025
646dd21
Merge branch 'claude/fix-issue-412-011CUvkJr1v1wzQk6GydfWbN' of https…
ooples Dec 15, 2025
6f8589e
fix: remove unused scope stack from performanceprofiler
ooples Dec 15, 2025
257da21
fix: remove stubs and fix net471 compatibility issues
ooples Dec 15, 2025
41c0c77
refactor: use mathhelper.clamp for net471 compatibility in inferenceo…
ooples Dec 15, 2025
836a9fa
Update src/AiDotNet.Tensors/Engines/Optimization/PerformanceProfiler.cs
ooples Dec 15, 2025
982ae25
Update src/InferenceOptimization/CustomOperatorRegistry.cs
ooples Dec 15, 2025
b1d45eb
Update src/InferenceOptimization/Kernels/ConvolutionKernel.cs
ooples Dec 15, 2025
769f7de
Merge branch 'master' into claude/fix-issue-412-011CUvkJr1v1wzQk6GydfWbN
ooples Dec 15, 2025
ce511e6
fix: integrate PR433 inference optimizations + address review
ooples Dec 16, 2025
e1014cf
feat: add speculation policy + continuous batcher support
ooples Dec 16, 2025
8eb096e
fix: add inference diagnostics and stability guardrails
ooples Dec 16, 2025
301ebed
feat: optimize self-attention via cached attention rewrite
ooples Dec 16, 2025
f657130
feat: add kv-cache fp16 option
ooples Dec 16, 2025
e1a43c0
fix: make cloning preserve layer parameters
ooples Dec 16, 2025
b4354e4
fix: make speculative decoding draft selection non-throwing
ooples Dec 16, 2025
90c15e9
feat: add dynamic speculative decoding backoff
ooples Dec 16, 2025
f9b8a54
feat: add int8 kv-cache quantization option
ooples Dec 16, 2025
26cd7ba
feat: route serving requests via adapter header
ooples Dec 16, 2025
10b3c1c
docs: update inference MVP plan with implemented hooks
ooples Dec 16, 2025
8cb39c6
test: tighten speculative draft fallback assertion
ooples Dec 16, 2025
babcbc3
bench: group SIMD benchmarks by category
ooples Dec 16, 2025
f11b222
docs: add phase mapping table for MVP sequencing
ooples Dec 16, 2025
9e81f92
fix: improve adapter model lookup error
ooples Dec 16, 2025
09a0141
fix: guard large unbatched predict requests
ooples Dec 16, 2025
74d7432
fix: bound paged kv-cache sequence allocation retries
ooples Dec 16, 2025
9d32e89
fix: rescale int8 kv-cache across all batches
ooples Dec 16, 2025
391b7c9
fix: mark paged cached attention as inference-only
ooples Dec 16, 2025
1b4f39b
docs: document paged cached attention batch limitation
ooples Dec 16, 2025
2cdc0ff
perf: cache paged attention weights and reuse buffers
ooples Dec 16, 2025
8c7c6cb
perf: use optimized output projection in paged attention fallback
ooples Dec 16, 2025
94ff07f
perf: use matmul for paged attention qkv
ooples Dec 16, 2025
4812a7d
fix: harden attention kernel shape validation
ooples Dec 16, 2025
ffb1e60
fix: validate conv2d kernel in-channels
ooples Dec 16, 2025
e84d0d7
fix: round-trip inference optimization config
ooples Dec 16, 2025
70b42d0
fix: stabilize paged attention allocation and tests
ooples Dec 16, 2025
a5eb3d7
feat: add speculation policies and method hooks
ooples Dec 16, 2025
25fd49f
feat: add weight-only int8 dense quantization
ooples Dec 16, 2025
20292e1
docs: address PR433 review feedback
ooples Dec 16, 2025
a7bb3b9
feat: support Multi-LoRA deep clone and session isolation
ooples Dec 16, 2025
d23342f
test: cover int8 KV-cache quantization
ooples Dec 16, 2025
3f3e887
docs: add strict PR433 phase audit and gap plan
ooples Dec 16, 2025
4e66820
fix: avoid swallowing unexpected deserialization errors
ooples Dec 16, 2025
f529b6a
feat: integrate tree speculation and paged attention WOQ
ooples Dec 17, 2025
2e8b8e3
test: add phase 5/7/8 coverage
ooples Dec 17, 2025
5df1c72
fix: make inference diagnostics runtime-toggleable
ooples Dec 17, 2025
9e49323
test: close remaining PR433 mvp gaps
ooples Dec 17, 2025
47478c9
docs: update PR433 phase audit
ooples Dec 17, 2025
40247bb
fix: address PR433 review feedback
ooples Dec 17, 2025
205caa4
test: serialize diagnostics env var tests
ooples Dec 17, 2025
5ab4bca
fix: tighten deserialization and speculation safety
ooples Dec 17, 2025
3d5ff71
fix: satisfy CodeQL unused-collection
ooples Dec 17, 2025
1 change: 1 addition & 0 deletions AiDotNetBenchmarkTests/AiDotNetBenchmarkTests.csproj
@@ -6,6 +6,7 @@
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
     <LangVersion>latest</LangVersion>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <!-- CA1822: BenchmarkDotNet requires instance methods for benchmarks -->
     <NoWarn>$(NoWarn);CA1822</NoWarn>
   </PropertyGroup>
135 changes: 135 additions & 0 deletions AiDotNetBenchmarkTests/InferenceOptimization/AttentionBenchmark.cs
@@ -0,0 +1,135 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using AiDotNet.InferenceOptimization;
using AiDotNet.InferenceOptimization.Kernels;
using AiDotNet.LinearAlgebra;
using System;

namespace AiDotNetBenchmarkTests.InferenceOptimization
{
    /// <summary>
    /// Benchmarks for fused attention kernel
    /// </summary>
    [SimpleJob(RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [CsvExporter]
    [HtmlExporter]
    public class AttentionBenchmark
    {
        private Tensor<float> _q;
        private Tensor<float> _k;
        private Tensor<float> _v;
        private AttentionKernel _attentionKernel;

        [Params(64, 128, 256)]
        public int SequenceLength { get; set; }

        [Params(32, 64)]
        public int FeatureDim { get; set; }

        [GlobalSetup]
        public void Setup()
        {
            OptimizationInitializer.Initialize(enableProfiling: false);

            _attentionKernel = new AttentionKernel();

            // Initialize Q, K, V tensors
            var random = new Random(42);
            _q = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });
            _k = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });
            _v = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });

            for (int i = 0; i < _q.Data.Length; i++)
            {
                _q.Data[i] = (float)random.NextDouble();
            }

            for (int i = 0; i < _k.Data.Length; i++)
            {
                _k.Data[i] = (float)random.NextDouble();
            }

            for (int i = 0; i < _v.Data.Length; i++)
            {
                _v.Data[i] = (float)random.NextDouble();
            }
        }

        [Benchmark(Baseline = true)]
        public Tensor<float> NaiveAttention()
        {
            // Naive implementation: QK^T, softmax, multiply by V
            float scale = 1.0f / MathF.Sqrt(FeatureDim);

            // Compute attention scores
            var scores = new float[SequenceLength * SequenceLength];

            for (int i = 0; i < SequenceLength; i++)
            {
                for (int j = 0; j < SequenceLength; j++)
                {
                    float score = 0.0f;
                    for (int k = 0; k < FeatureDim; k++)
                    {
                        score += _q.Data[i * FeatureDim + k] * _k.Data[j * FeatureDim + k];
                    }
                    scores[i * SequenceLength + j] = score * scale;
                }
            }

            // Apply softmax
            for (int i = 0; i < SequenceLength; i++)
            {
                float maxVal = float.NegativeInfinity;
                for (int j = 0; j < SequenceLength; j++)
                {
                    if (scores[i * SequenceLength + j] > maxVal)
                        maxVal = scores[i * SequenceLength + j];
                }

                float sum = 0.0f;
                for (int j = 0; j < SequenceLength; j++)
                {
                    scores[i * SequenceLength + j] = MathF.Exp(scores[i * SequenceLength + j] - maxVal);
                    sum += scores[i * SequenceLength + j];
                }

                for (int j = 0; j < SequenceLength; j++)
                {
                    scores[i * SequenceLength + j] /= sum;
                }
            }

            // Multiply by V
            var result = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });

            for (int i = 0; i < SequenceLength; i++)
            {
                for (int j = 0; j < FeatureDim; j++)
                {
                    float sum = 0.0f;
                    for (int k = 0; k < SequenceLength; k++)
                    {
                        sum += scores[i * SequenceLength + k] * _v.Data[k * FeatureDim + j];
                    }
                    result.Data[i * FeatureDim + j] = sum;
                }
            }

            return result;
        }

        [Benchmark]
        public Tensor<float> OptimizedAttention()
        {
            return _attentionKernel.Execute(_q, _k, _v);
        }

        [Benchmark]
        public Tensor<float> MultiHeadAttention()
        {
            return _attentionKernel.MultiHeadAttention(_q, _k, _v, numHeads: 8);
        }
    }
}
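The benchmark above only measures speed; it does not assert that the fused kernel reproduces the naive baseline. A minimal sanity-check sketch follows. It assumes only what is shown in this PR (the public Setup, NaiveAttention, and OptimizedAttention members and the Tensor<float>.Data array); the tolerance value and the check itself are illustrative and not part of the change.

using System;
using AiDotNetBenchmarkTests.InferenceOptimization;

// Hypothetical sanity check (not part of this PR): confirm the fused kernel and
// the naive baseline agree numerically before comparing their timings.
var bench = new AttentionBenchmark { SequenceLength = 64, FeatureDim = 32 };
bench.Setup();

var expected = bench.NaiveAttention();   // reference result
var actual = bench.OptimizedAttention(); // fused kernel result

for (int i = 0; i < expected.Data.Length; i++)
{
    // float32 reassociation in a fused kernel justifies a loose tolerance
    if (Math.Abs(expected.Data[i] - actual.Data[i]) > 1e-3f)
        throw new InvalidOperationException($"Mismatch at index {i}: {expected.Data[i]} vs {actual.Data[i]}");
}

Console.WriteLine("Fused attention matches the naive baseline within tolerance.");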
84 changes: 84 additions & 0 deletions AiDotNetBenchmarkTests/InferenceOptimization/GemmBenchmark.cs
@@ -0,0 +1,84 @@
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using AiDotNet.InferenceOptimization;
using AiDotNet.InferenceOptimization.Kernels;
using AiDotNet.LinearAlgebra;
using System;

namespace AiDotNetBenchmarkTests.InferenceOptimization
{
    /// <summary>
    /// Benchmarks for GEMM (General Matrix Multiplication) kernel.
    /// Tests optimized implementation against naive implementation.
    /// </summary>
    [SimpleJob(RuntimeMoniker.Net80)]
    [MemoryDiagnoser]
    [CsvExporter]
    [HtmlExporter]
    public class GemmBenchmark
    {
        private Tensor<float> _matrixA;
        private Tensor<float> _matrixB;
        private GemmKernel _gemmKernel;

        [Params(64, 128, 256, 512, 1024)]
        public int MatrixSize { get; set; }

        [GlobalSetup]
        public void Setup()
        {
            OptimizationInitializer.Initialize(enableProfiling: false);

            _gemmKernel = new GemmKernel();

            // Initialize matrices with random data
            var random = new Random(42);
            _matrixA = new Tensor<float>(new[] { MatrixSize, MatrixSize });
            _matrixB = new Tensor<float>(new[] { MatrixSize, MatrixSize });

            for (int i = 0; i < _matrixA.Data.Length; i++)
            {
                _matrixA.Data[i] = (float)random.NextDouble();
            }

            for (int i = 0; i < _matrixB.Data.Length; i++)
            {
                _matrixB.Data[i] = (float)random.NextDouble();
            }
        }

        [Benchmark(Baseline = true)]
        public Tensor<float> NaiveGemm()
        {
            // Naive triple-nested loop implementation
            var result = new Tensor<float>(new[] { MatrixSize, MatrixSize });

            for (int i = 0; i < MatrixSize; i++)
            {
                for (int j = 0; j < MatrixSize; j++)
                {
                    float sum = 0.0f;
                    for (int k = 0; k < MatrixSize; k++)
                    {
                        sum += _matrixA.Data[i * MatrixSize + k] * _matrixB.Data[k * MatrixSize + j];
                    }
                    result.Data[i * MatrixSize + j] = sum;
                }
            }

            return result;
        }

        [Benchmark]
        public Tensor<float> OptimizedGemm()
        {
            return _gemmKernel.Execute(_matrixA, _matrixB);
        }

        [Benchmark]
        public Tensor<float> OptimizedGemmTranspose()
        {
            return _gemmKernel.GemmTransposeB(_matrixA, _matrixB);
        }
    }
}
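Neither new file includes an entry point, and the portion of the PR shown here does not add one. A minimal sketch of how these classes are typically launched with BenchmarkDotNet follows; the Program class, its location, and the command-line example are assumptions for illustration, not part of this change.

using BenchmarkDotNet.Running;
using AiDotNetBenchmarkTests.InferenceOptimization;

// Hypothetical runner (not part of this PR). BenchmarkSwitcher lets a single
// benchmark class be selected from the command line, e.g.:
//   dotnet run -c Release -- --filter *GemmBenchmark*
public static class Program
{
    public static void Main(string[] args)
    {
        BenchmarkSwitcher
            .FromTypes(new[] { typeof(AttentionBenchmark), typeof(GemmBenchmark) })
            .Run(args);
    }
}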