diff --git a/AiDotNetBenchmarkTests/AiDotNetBenchmarkTests.csproj b/AiDotNetBenchmarkTests/AiDotNetBenchmarkTests.csproj
index 81661d704..43bf58f15 100644
--- a/AiDotNetBenchmarkTests/AiDotNetBenchmarkTests.csproj
+++ b/AiDotNetBenchmarkTests/AiDotNetBenchmarkTests.csproj
@@ -6,6 +6,7 @@
     <ImplicitUsings>enable</ImplicitUsings>
     <Nullable>enable</Nullable>
 	  <LangVersion>latest</LangVersion>
+    <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
     <!-- CA1822: BenchmarkDotNet requires instance methods for benchmarks -->
     <NoWarn>$(NoWarn);CA1822</NoWarn>
   </PropertyGroup>
diff --git a/AiDotNetBenchmarkTests/InferenceOptimization/AttentionBenchmark.cs b/AiDotNetBenchmarkTests/InferenceOptimization/AttentionBenchmark.cs
new file mode 100644
index 000000000..95ee71519
--- /dev/null
+++ b/AiDotNetBenchmarkTests/InferenceOptimization/AttentionBenchmark.cs
@@ -0,0 +1,135 @@
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using AiDotNet.InferenceOptimization;
+using AiDotNet.InferenceOptimization.Kernels;
+using AiDotNet.LinearAlgebra;
+using System;
+
+namespace AiDotNetBenchmarkTests.InferenceOptimization
+{
+    /// <summary>
+    /// Benchmarks for fused attention kernel
+    /// </summary>
+    [SimpleJob(RuntimeMoniker.Net80)]
+    [MemoryDiagnoser]
+    [CsvExporter]
+    [HtmlExporter]
+    public class AttentionBenchmark
+    {
+        private Tensor<float> _q;
+        private Tensor<float> _k;
+        private Tensor<float> _v;
+        private AttentionKernel _attentionKernel;
+
+        [Params(64, 128, 256)]
+        public int SequenceLength { get; set; }
+
+        [Params(32, 64)]
+        public int FeatureDim { get; set; }
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            OptimizationInitializer.Initialize(enableProfiling: false);
+
+            _attentionKernel = new AttentionKernel();
+
+            // Initialize Q, K, V tensors
+            var random = new Random(42);
+            _q = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });
+            _k = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });
+            _v = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });
+
+            for (int i = 0; i < _q.Data.Length; i++)
+            {
+                _q.Data[i] = (float)random.NextDouble();
+            }
+
+            for (int i = 0; i < _k.Data.Length; i++)
+            {
+                _k.Data[i] = (float)random.NextDouble();
+            }
+
+            for (int i = 0; i < _v.Data.Length; i++)
+            {
+                _v.Data[i] = (float)random.NextDouble();
+            }
+        }
+
+        [Benchmark(Baseline = true)]
+        public Tensor<float> NaiveAttention()
+        {
+            // Naive implementation: QK^T, softmax, multiply by V
+            float scale = 1.0f / MathF.Sqrt(FeatureDim);
+
+            // Compute attention scores
+            var scores = new float[SequenceLength * SequenceLength];
+
+            for (int i = 0; i < SequenceLength; i++)
+            {
+                for (int j = 0; j < SequenceLength; j++)
+                {
+                    float score = 0.0f;
+                    for (int k = 0; k < FeatureDim; k++)
+                    {
+                        score += _q.Data[i * FeatureDim + k] * _k.Data[j * FeatureDim + k];
+                    }
+                    scores[i * SequenceLength + j] = score * scale;
+                }
+            }
+
+            // Apply softmax
+            for (int i = 0; i < SequenceLength; i++)
+            {
+                float maxVal = float.NegativeInfinity;
+                for (int j = 0; j < SequenceLength; j++)
+                {
+                    if (scores[i * SequenceLength + j] > maxVal)
+                        maxVal = scores[i * SequenceLength + j];
+                }
+
+                float sum = 0.0f;
+                for (int j = 0; j < SequenceLength; j++)
+                {
+                    scores[i * SequenceLength + j] = MathF.Exp(scores[i * SequenceLength + j] - maxVal);
+                    sum += scores[i * SequenceLength + j];
+                }
+
+                for (int j = 0; j < SequenceLength; j++)
+                {
+                    scores[i * SequenceLength + j] /= sum;
+                }
+            }
+
+            // Multiply by V
+            var result = new Tensor<float>(new[] { 1, SequenceLength, FeatureDim });
+
+            for (int i = 0; i < SequenceLength; i++)
+            {
+                for (int j = 0; j < FeatureDim; j++)
+                {
+                    float sum = 0.0f;
+                    for (int k = 0; k < SequenceLength; k++)
+                    {
+                        sum += scores[i * SequenceLength + k] * _v.Data[k * FeatureDim + j];
+                    }
+                    result.Data[i * FeatureDim + j] = sum;
+                }
+            }
+
+            return result;
+        }
+
+        [Benchmark]
+        public Tensor<float> OptimizedAttention()
+        {
+            return _attentionKernel.Execute(_q, _k, _v);
+        }
+
+        [Benchmark]
+        public Tensor<float> MultiHeadAttention()
+        {
+            return _attentionKernel.MultiHeadAttention(_q, _k, _v, numHeads: 8);
+        }
+    }
+}
diff --git a/AiDotNetBenchmarkTests/InferenceOptimization/GemmBenchmark.cs b/AiDotNetBenchmarkTests/InferenceOptimization/GemmBenchmark.cs
new file mode 100644
index 000000000..1da8a5815
--- /dev/null
+++ b/AiDotNetBenchmarkTests/InferenceOptimization/GemmBenchmark.cs
@@ -0,0 +1,84 @@
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Jobs;
+using AiDotNet.InferenceOptimization;
+using AiDotNet.InferenceOptimization.Kernels;
+using AiDotNet.LinearAlgebra;
+using System;
+
+namespace AiDotNetBenchmarkTests.InferenceOptimization
+{
+    /// <summary>
+    /// Benchmarks for GEMM (General Matrix Multiplication) kernel
+    /// Tests optimized implementation against naive implementation
+    /// </summary>
+    [SimpleJob(RuntimeMoniker.Net80)]
+    [MemoryDiagnoser]
+    [CsvExporter]
+    [HtmlExporter]
+    public class GemmBenchmark
+    {
+        private Tensor<float> _matrixA;
+        private Tensor<float> _matrixB;
+        private GemmKernel _gemmKernel;
+
+        [Params(64, 128, 256, 512, 1024)]
+        public int MatrixSize { get; set; }
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            OptimizationInitializer.Initialize(enableProfiling: false);
+
+            _gemmKernel = new GemmKernel();
+
+            // Initialize matrices with random data
+            var random = new Random(42);
+            _matrixA = new Tensor<float>(new[] { MatrixSize, MatrixSize });
+            _matrixB = new Tensor<float>(new[] { MatrixSize, MatrixSize });
+
+            for (int i = 0; i < _matrixA.Data.Length; i++)
+            {
+                _matrixA.Data[i] = (float)random.NextDouble();
+            }
+
+            for (int i = 0; i < _matrixB.Data.Length; i++)
+            {
+                _matrixB.Data[i] = (float)random.NextDouble();
+            }
+        }
+
+        [Benchmark(Baseline = true)]
+        public Tensor<float> NaiveGemm()
+        {
+            // Naive triple-nested loop implementation
+            var result = new Tensor<float>(new[] { MatrixSize, MatrixSize });
+
+            for (int i = 0; i < MatrixSize; i++)
+            {
+                for (int j = 0; j < MatrixSize; j++)
+                {
+                    float sum = 0.0f;
+                    for (int k = 0; k < MatrixSize; k++)
+                    {
+                        sum += _matrixA.Data[i * MatrixSize + k] * _matrixB.Data[k * MatrixSize + j];
+                    }
+                    result.Data[i * MatrixSize + j] = sum;
+                }
+            }
+
+            return result;
+        }
+
+        [Benchmark]
+        public Tensor<float> OptimizedGemm()
+        {
+            return _gemmKernel.Execute(_matrixA, _matrixB);
+        }
+
+        [Benchmark]
+        public Tensor<float> OptimizedGemmTranspose()
+        {
+            return _gemmKernel.GemmTransposeB(_matrixA, _matrixB);
+        }
+    }
+}
diff --git a/AiDotNetBenchmarkTests/InferenceOptimization/README.md b/AiDotNetBenchmarkTests/InferenceOptimization/README.md
new file mode 100644
index 000000000..c4551f1cb
--- /dev/null
+++ b/AiDotNetBenchmarkTests/InferenceOptimization/README.md
@@ -0,0 +1,209 @@
+# Inference Optimization Benchmarks
+
+This directory contains benchmark tests for the inference optimization components.
+
+## Running Benchmarks
+
+### All Benchmarks
+
+```bash
+cd AiDotNetBenchmarkTests
+dotnet run -c Release --project InferenceOptimization
+```
+
+### Individual Benchmarks
+
+```bash
+# GEMM benchmark
+dotnet run -c Release --filter "*GemmBenchmark*"
+
+# SIMD benchmark
+dotnet run -c Release --filter "*SimdBenchmark*"
+
+# Attention benchmark
+dotnet run -c Release --filter "*AttentionBenchmark*"
+```
+
+## Benchmark Descriptions
+
+### GemmBenchmark
+Tests matrix multiplication performance:
+- **NaiveGemm**: Baseline triple-nested loop implementation
+- **OptimizedGemm**: Cache-blocked SIMD-optimized implementation
+- **OptimizedGemmTranspose**: Optimized implementation for transposed matrices
+
+**Matrix sizes tested**: 64x64, 128x128, 256x256, 512x512, 1024x1024
+
+**Expected results**:
+- 2-3x speedup on AVX2 systems
+- 2.5x speedup on ARM NEON systems
+- Better speedup for larger matrices
+
+### SimdBenchmark
+Tests SIMD-optimized vector operations:
+- **Vector Addition**: Element-wise addition
+- **Vector Multiplication**: Element-wise multiplication
+- **Dot Product**: Inner product with FMA optimization
+- **ReLU**: Activation function
+- **Sum**: Reduction operation
+
+**Array sizes tested**: 1K, 10K, 100K, 1M elements
+
+**Expected results**:
+- 4-8x speedup on AVX2 systems (processes 8 floats at once)
+- 2-4x speedup on SSE systems (processes 4 floats at once)
+- 2-4x speedup on NEON systems (processes 4 floats at once)
+
+### AttentionBenchmark
+Tests fused attention kernel performance:
+- **NaiveAttention**: Standard three-step implementation (QK^T, softmax, V)
+- **OptimizedAttention**: Fused implementation with SIMD
+- **MultiHeadAttention**: Multi-head variant (8 heads)
+
+**Parameters tested**:
+- Sequence lengths: 64, 128, 256
+- Feature dimensions: 32, 64
+
+**Expected results**:
+- 2-2.5x speedup from memory traffic reduction
+- Better performance for longer sequences
+
+## Interpreting Results
+
+BenchmarkDotNet produces detailed reports including:
+
+### Timing Metrics
+- **Mean**: Average execution time
+- **Error**: Half of 99.9% confidence interval
+- **StdDev**: Standard deviation
+- **Median**: 50th percentile
+
+### Memory Metrics
+- **Gen0/Gen1/Gen2**: Garbage collection frequency
+- **Allocated**: Total memory allocated
+
+### Speedup Calculation
+```text
+Speedup = Baseline Time / Optimized Time
+```
+
+Example output:
+```text
+|                Method | MatrixSize |      Mean |    Error |   StdDev | Ratio |
+|---------------------- |----------- |----------:|---------:|---------:|------:|
+|             NaiveGemm |        256 | 27.45 ms  | 0.421 ms | 0.394 ms |  1.00 |
+|         OptimizedGemm |        256 |  9.12 ms  | 0.142 ms | 0.133 ms |  0.33 |
+
+Speedup = 27.45 / 9.12 = 3.01x
+```
+
+## Performance Targets
+
+### GEMM
+- ✅ Target: 2-5x speedup
+- Platform dependent:
+  - AVX2: 2.5-3x
+  - AVX-512: 3-4x
+  - NEON: 2-2.5x
+
+### SIMD Operations
+- ✅ Target: 2-8x speedup
+- Depends on:
+  - Instruction set (AVX2 > SSE > scalar)
+  - Array size (larger = better amortization)
+  - Operation type (simple ops get higher speedup)
+
+### Attention
+- ✅ Target: 2-3x speedup
+- Benefits:
+  - Reduced memory traffic
+  - Fused operations
+  - Cache efficiency
+
+## Platform-Specific Results
+
+Your benchmark results will vary based on:
+
+1. **CPU Architecture**
+   - Intel/AMD x86_64: Best with AVX2/AVX-512
+   - ARM: Good with NEON
+   - Older CPUs: Falls back to SSE or scalar
+
+2. **Cache Hierarchy**
+   - Larger caches = Better performance for blocked algorithms
+   - L1/L2/L3 sizes affect optimal tile sizes
+
+3. **Memory Bandwidth**
+   - DDR4/DDR5 speed affects large matrix operations
+   - Memory channels matter for parallel operations
+
+4. **Thermal Throttling**
+   - Sustained benchmarks may hit thermal limits
+   - Use adequate cooling
+
+## Comparing to Reference Implementations
+
+To compare against Intel MKL or OpenBLAS:
+
+```csharp
+// Add reference implementation
+[Benchmark]
+public Tensor<float> MKL_SGEMM()
+{
+    // Call Intel MKL cblas_sgemm
+    // ...
+}
+```
+
+## Contributing
+
+To add new benchmarks:
+
+1. Create a new class inheriting benchmark attributes
+2. Add `[Params]` for different sizes/configurations
+3. Implement baseline (naive) version
+4. Implement optimized version
+5. Add `[GlobalSetup]` for initialization
+6. Mark baseline with `[Benchmark(Baseline = true)]`
+7. Add memory diagnostics: `[MemoryDiagnoser]`
+
+## CI/CD Integration
+
+Add to your CI pipeline:
+
+```yaml
+- name: Run Benchmarks
+  run: |
+    cd AiDotNetBenchmarkTests
+    dotnet run -c Release --filter "*InferenceOptimization*"
+
+- name: Upload Results
+  uses: actions/upload-artifact@v3
+  with:
+    name: benchmark-results
+    path: BenchmarkDotNet.Artifacts/results/
+```
+
+## Troubleshooting
+
+### Benchmark takes too long
+- Reduce parameter ranges
+- Use `[SimpleJob]` instead of full job
+- Reduce warmup/iteration counts
+
+### Inconsistent results
+- Close other applications
+- Disable CPU frequency scaling
+- Run multiple iterations
+- Check for thermal throttling
+
+### Out of memory
+- Reduce test sizes
+- Add `[MemoryDiagnoser]` to track allocations
+- Consider streaming benchmarks for large data
+
+## References
+
+- [BenchmarkDotNet Documentation](https://benchmarkdotnet.org/)
+- [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/)
+- [ARM NEON Documentation](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon)
diff --git a/AiDotNetBenchmarkTests/InferenceOptimization/SimdBenchmark.cs b/AiDotNetBenchmarkTests/InferenceOptimization/SimdBenchmark.cs
new file mode 100644
index 000000000..fb9dcf925
--- /dev/null
+++ b/AiDotNetBenchmarkTests/InferenceOptimization/SimdBenchmark.cs
@@ -0,0 +1,168 @@
+using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Configs;
+using BenchmarkDotNet.Jobs;
+using AiDotNet.InferenceOptimization;
+using AiDotNet.Tensors.Engines.Simd;
+using System;
+
+namespace AiDotNetBenchmarkTests.InferenceOptimization
+{
+    /// <summary>
+    /// Benchmarks for SIMD-optimized operations
+    /// </summary>
+    [SimpleJob(RuntimeMoniker.Net80)]
+    [MemoryDiagnoser]
+    [CsvExporter]
+    [HtmlExporter]
+    [GroupBenchmarksBy(BenchmarkLogicalGroupRule.ByCategory)]
+    public class SimdBenchmark
+    {
+        private float[] _arrayA;
+        private float[] _arrayB;
+        private float[] _result;
+
+        [Params(1000, 10000, 100000, 1000000)]
+        public int ArraySize { get; set; }
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            OptimizationInitializer.Initialize(enableProfiling: false);
+
+            var random = new Random(42);
+            _arrayA = new float[ArraySize];
+            _arrayB = new float[ArraySize];
+            _result = new float[ArraySize];
+
+            for (int i = 0; i < ArraySize; i++)
+            {
+                _arrayA[i] = (float)random.NextDouble();
+                _arrayB[i] = (float)random.NextDouble();
+            }
+        }
+
+        #region Vector Addition
+
+        [Benchmark(Baseline = true)]
+        [BenchmarkCategory("VectorAdd")]
+        public void VectorAdd_Scalar()
+        {
+            for (int i = 0; i < ArraySize; i++)
+            {
+                _result[i] = _arrayA[i] + _arrayB[i];
+            }
+        }
+
+        [Benchmark]
+        [BenchmarkCategory("VectorAdd")]
+        public unsafe void VectorAdd_SIMD()
+        {
+            fixed (float* pA = _arrayA, pB = _arrayB, pR = _result)
+            {
+                SimdKernels.VectorAdd(pA, pB, pR, ArraySize);
+            }
+        }
+
+        #endregion
+
+        #region Vector Multiplication
+
+        [Benchmark(Baseline = true)]
+        [BenchmarkCategory("VectorMultiply")]
+        public void VectorMultiply_Scalar()
+        {
+            for (int i = 0; i < ArraySize; i++)
+            {
+                _result[i] = _arrayA[i] * _arrayB[i];
+            }
+        }
+
+        [Benchmark]
+        [BenchmarkCategory("VectorMultiply")]
+        public unsafe void VectorMultiply_SIMD()
+        {
+            fixed (float* pA = _arrayA, pB = _arrayB, pR = _result)
+            {
+                SimdKernels.VectorMultiply(pA, pB, pR, ArraySize);
+            }
+        }
+
+        #endregion
+
+        #region Dot Product
+
+        [Benchmark(Baseline = true)]
+        [BenchmarkCategory("DotProduct")]
+        public float DotProduct_Scalar()
+        {
+            float sum = 0.0f;
+            for (int i = 0; i < ArraySize; i++)
+            {
+                sum += _arrayA[i] * _arrayB[i];
+            }
+            return sum;
+        }
+
+        [Benchmark]
+        [BenchmarkCategory("DotProduct")]
+        public unsafe float DotProduct_SIMD()
+        {
+            fixed (float* pA = _arrayA, pB = _arrayB)
+            {
+                return SimdKernels.DotProduct(pA, pB, ArraySize);
+            }
+        }
+
+        #endregion
+
+        #region ReLU Activation
+
+        [Benchmark(Baseline = true)]
+        [BenchmarkCategory("ReLU")]
+        public void ReLU_Scalar()
+        {
+            for (int i = 0; i < ArraySize; i++)
+            {
+                _result[i] = Math.Max(0.0f, _arrayA[i]);
+            }
+        }
+
+        [Benchmark]
+        [BenchmarkCategory("ReLU")]
+        public unsafe void ReLU_SIMD()
+        {
+            fixed (float* pA = _arrayA, pR = _result)
+            {
+                SimdKernels.ReLU(pA, pR, ArraySize);
+            }
+        }
+
+        #endregion
+
+        #region Sum Reduction
+
+        [Benchmark(Baseline = true)]
+        [BenchmarkCategory("Sum")]
+        public float Sum_Scalar()
+        {
+            float sum = 0.0f;
+            for (int i = 0; i < ArraySize; i++)
+            {
+                sum += _arrayA[i];
+            }
+            return sum;
+        }
+
+        [Benchmark]
+        [BenchmarkCategory("Sum")]
+        public unsafe float Sum_SIMD()
+        {
+            fixed (float* pA = _arrayA)
+            {
+                return SimdKernels.Sum(pA, ArraySize);
+            }
+        }
+
+        #endregion
+    }
+}
diff --git a/INTEGRATION_PLAN_PR433.md b/INTEGRATION_PLAN_PR433.md
new file mode 100644
index 000000000..1cb9457a1
--- /dev/null
+++ b/INTEGRATION_PLAN_PR433.md
@@ -0,0 +1,257 @@
+# PR #433 Integration Plan: Option A - Enhance CpuEngine with SIMD
+
+## Executive Summary
+
+This plan integrates the InferenceOptimization code from PR #433 into the existing IEngine architecture, eliminating duplication and ensuring all optimizations benefit the entire codebase.
+
+---
+
+## Part 1: Analysis Summary
+
+### What Already Exists in Master
+| Component | Location | Notes |
+|-----------|----------|-------|
+| IEngine interface | `AiDotNet.Tensors/Engines/IEngine.cs` | Unified engine abstraction |
+| CpuEngine | `AiDotNet.Tensors/Engines/CpuEngine.cs` | Generic O(n³) MatrixMultiply, NO SIMD |
+| GpuEngine | `AiDotNet.Tensors/Engines/GpuEngine.cs` | GPU operations |
+| TensorBase<T> | `AiDotNet.Tensors/LinearAlgebra/TensorBase.cs` | Uses `Shape`, protected `_data` |
+
+### What PR #433 Adds (44 files in InferenceOptimization/)
+| Category | Files | Value |
+|----------|-------|-------|
+| SIMD Kernels | `SimdKernels.cs` | HIGH - AVX/AVX2/SSE/NEON explicit intrinsics |
+| Optimized GEMM | `GemmKernel.cs` | DUPLICATE - conflicts with CpuEngine |
+| Attention | `AttentionKernel.cs` | DUPLICATE - uses wrong Tensor API |
+| Convolution | `ConvolutionKernel.cs` | DUPLICATE - uses wrong Tensor API |
+| Platform Detection | `PlatformDetector.cs` | HIGH - CPU/SIMD capability detection |
+| CPU Optimization | `CacheOptimizer.cs`, `LoopOptimizer.cs` | HIGH - cache/loop optimization utilities |
+| Performance Profiler | `PerformanceProfiler.cs` | MEDIUM - profiling infrastructure |
+| Graph Optimization | `Core/*.cs`, `Passes/*.cs` | HIGH - 13 optimization passes |
+| IR System | `IR/*.cs` | HIGH - HLIR/LLIR intermediate representation |
+| Custom Operators | `CustomOperatorRegistry.cs`, `ICustomOperator.cs` | MEDIUM - extensibility |
+| GPU Infrastructure | `GpuKernelBase.cs` | LOW - base class only |
+
+### API Mismatch Issues
+The InferenceOptimization code uses a different Tensor API:
+- **PR #433 uses**: `tensor.Dimensions`, `tensor.Data`, `new Tensor<float>(int[])`
+- **Actual API**: `tensor.Shape`, `tensor._data` (protected), `new TensorBase(int[])`
+
+This causes 130+ build errors.
+
+---
+
+## Part 2: Integration Steps
+
+### Step 1: Move SimdKernels to AiDotNet.Tensors (KEEP)
+
+**Action**: Move `SimdKernels.cs` to `AiDotNet.Tensors/Engines/Simd/` folder
+
+**Changes**:
+```
+src/InferenceOptimization/Kernels/SimdKernels.cs
+  → src/AiDotNet.Tensors/Engines/Simd/SimdKernels.cs
+```
+
+**Namespace change**: `AiDotNet.InferenceOptimization.Kernels` → `AiDotNet.Tensors.Engines.Simd`
+
+### Step 2: Integrate SIMD into CpuEngine
+
+**Action**: Add SIMD-accelerated paths to CpuEngine methods
+
+**Target methods to enhance**:
+1. `VectorAdd<T>` - use `SimdKernels.VectorAdd` when T is float
+2. `VectorMultiply<T>` - use `SimdKernels.VectorMultiply` when T is float
+3. `DotProduct<T>` - use `SimdKernels.DotProduct` when T is float
+4. `MatrixMultiply<T>` - use SIMD-optimized GEMM when T is float
+
+**Pattern**:
+```csharp
+public Vector<T> VectorAdd<T>(Vector<T> a, Vector<T> b)
+{
+    // Check if we can use SIMD optimization
+    if (typeof(T) == typeof(float) && SimdCapabilities.HasSimd)
+    {
+        return VectorAddSimd((Vector<float>)(object)a, (Vector<float>)(object)b);
+    }
+
+    // Generic fallback
+    return VectorAddGeneric(a, b);
+}
+```
+
+### Step 3: Move PlatformDetector to AiDotNet.Tensors (KEEP)
+
+**Action**: Move and integrate platform detection
+
+**Changes**:
+```
+src/InferenceOptimization/PlatformDetector.cs
+  → src/AiDotNet.Tensors/Engines/PlatformDetector.cs
+```
+
+**Integration**:
+- Initialize at `AiDotNetEngine` startup
+- Expose via `AiDotNetEngine.Capabilities`
+
+### Step 4: Move CPU Optimization Utilities (KEEP)
+
+**Action**: Move cache and loop optimizers
+
+**Changes**:
+```
+src/InferenceOptimization/CpuOptimization/CacheOptimizer.cs
+  → src/AiDotNet.Tensors/Engines/Optimization/CacheOptimizer.cs
+
+src/InferenceOptimization/CpuOptimization/LoopOptimizer.cs
+  → src/AiDotNet.Tensors/Engines/Optimization/LoopOptimizer.cs
+```
+
+**Use in**: CpuEngine MatrixMultiply for cache-blocked algorithms
+
+### Step 5: Move Performance Profiler (KEEP)
+
+**Action**: Move profiler to Helpers namespace
+
+**Changes**:
+```
+src/InferenceOptimization/Profiling/PerformanceProfiler.cs
+  → src/Helpers/PerformanceProfiler.cs
+```
+
+### Step 6: Fix Graph Optimization Passes (KEEP with fixes)
+
+**Action**: Keep IR and Passes but fix Tensor API usage
+
+**Files to fix** (use `Shape` instead of `Dimensions`):
+- `src/InferenceOptimization/Core/*.cs`
+- `src/InferenceOptimization/Passes/*.cs`
+- `src/InferenceOptimization/IR/*.cs`
+
+**API changes needed**:
+| Wrong | Correct |
+|-------|---------|
+| `tensor.Dimensions` | `tensor.Shape` |
+| `tensor.Data` | Create public accessor or use indexer |
+| `new Tensor<float>(int[])` | Use TensorBase constructor |
+
+### Step 7: FIX Kernel Files API (KEEP ALL)
+
+**Action**: Fix Tensor API in all kernel files to use TensorBase properly
+
+**Files to FIX (change `Dimensions` → `Shape`, fix data access)**:
+- `src/InferenceOptimization/Kernels/GemmKernel.cs` - Industry-standard cache-blocked SIMD GEMM
+- `src/InferenceOptimization/Kernels/AttentionKernel.cs` - Fused transformer attention
+- `src/InferenceOptimization/Kernels/ConvolutionKernel.cs` - Optimized convolutions
+
+**Files to KEEP (extensibility infrastructure)**:
+- `src/InferenceOptimization/ICustomOperator.cs` - Extensibility interface
+- `src/InferenceOptimization/CustomOperatorRegistry.cs` - Operator registration system
+- `src/InferenceOptimization/OptimizationInitializer.cs` - Initialization entry point
+- `src/InferenceOptimization/GpuOptimization/GpuKernelBase.cs` - GPU kernel base class
+
+**Files to DELETE (examples only)**:
+- `src/InferenceOptimization/Examples/*.cs` - Example files that will be outdated
+
+### Step 8: Update Namespace References
+
+**Action**: Update all using statements
+
+**Old namespaces to replace**:
+- `AiDotNet.InferenceOptimization.Kernels` → `AiDotNet.Tensors.Engines.Simd`
+- `AiDotNet.InferenceOptimization.CpuOptimization` → `AiDotNet.Tensors.Engines.Optimization`
+- `AiDotNet.InferenceOptimization.Profiling` → `AiDotNet.Helpers`
+
+---
+
+## Part 3: File Disposition Matrix
+
+| File | Action | Notes |
+|------|--------|-------|
+| `Kernels/SimdKernels.cs` | MOVE | → `AiDotNet.Tensors/Engines/Simd/` |
+| `Kernels/GemmKernel.cs` | FIX | Fix Tensor API - industry-standard cache-blocked GEMM |
+| `Kernels/AttentionKernel.cs` | FIX | Fix Tensor API - fused transformer attention |
+| `Kernels/ConvolutionKernel.cs` | FIX | Fix Tensor API - optimized convolutions |
+| `PlatformDetector.cs` | MOVE | → `AiDotNet.Tensors/Engines/` |
+| `CpuOptimization/CacheOptimizer.cs` | MOVE | → `AiDotNet.Tensors/Engines/Optimization/` |
+| `CpuOptimization/LoopOptimizer.cs` | MOVE | → `AiDotNet.Tensors/Engines/Optimization/` |
+| `Profiling/PerformanceProfiler.cs` | MOVE | → `Helpers/` |
+| `Core/*.cs` | FIX | Fix Tensor API in place |
+| `Passes/*.cs` | FIX | Fix Tensor API in place |
+| `IR/*.cs` | FIX | Fix Tensor API in place |
+| `CustomOperatorRegistry.cs` | FIX | Fix nullability + keep for extensibility |
+| `ICustomOperator.cs` | KEEP | Extensibility interface |
+| `OptimizationInitializer.cs` | FIX | Fix nullability + keep as entry point |
+| `GpuOptimization/GpuKernelBase.cs` | FIX | Fix nullability + keep for future GPU work |
+| `Examples/*.cs` | DELETE | Will be outdated after API fixes |
+| `README.md` | UPDATE | Update for new architecture |
+
+---
+
+## Part 4: Expected Outcomes
+
+### After Integration:
+1. **Single Architecture**: All optimizations flow through IEngine
+2. **SIMD Everywhere**: CpuEngine automatically uses SIMD for float operations
+3. **No API Conflicts**: Graph passes use correct TensorBase API
+4. **Clean Codebase**: No duplicate kernel implementations
+
+### Issue #412 Completion After Integration:
+| Requirement | Status | Notes |
+|-------------|--------|-------|
+| SIMD Vectorization | 95% | Integrated into CpuEngine |
+| Optimized GEMM | 90% | Cache-blocked in CpuEngine |
+| Platform Detection | 100% | PlatformDetector integrated |
+| CPU Optimization | 90% | CacheOptimizer, LoopOptimizer |
+| Graph Optimization | 80% | IR system, 13 passes (needs API fix) |
+| Custom Operators | REMOVED | Not needed with IEngine |
+| GPU Optimization | 30% | Future work |
+| Benchmarks vs MKL | 0% | Future work |
+
+---
+
+## Part 5: Implementation Order
+
+1. **Phase 1 - Core SIMD** (Critical Path)
+   - [ ] Move SimdKernels.cs to AiDotNet.Tensors
+   - [ ] Move PlatformDetector.cs
+   - [ ] Integrate SIMD paths into CpuEngine
+
+2. **Phase 2 - Utilities**
+   - [ ] Move CacheOptimizer.cs
+   - [ ] Move LoopOptimizer.cs
+   - [ ] Move PerformanceProfiler.cs
+
+3. **Phase 3 - Graph Optimization**
+   - [ ] Fix Tensor API usage in Core/*.cs
+   - [ ] Fix Tensor API usage in Passes/*.cs
+   - [ ] Fix Tensor API usage in IR/*.cs
+
+4. **Phase 4 - Cleanup**
+   - [ ] Delete duplicate kernel files
+   - [ ] Delete unused infrastructure files
+   - [ ] Update README.md
+   - [ ] Build and test
+
+---
+
+## Appendix: Build Error Categories (130 errors)
+
+1. **CS1061 - Missing Members** (~100 errors)
+   - `Tensor<float>` does not contain `Dimensions` → Use `Shape`
+   - `Tensor<float>` does not contain `Data` → Use protected `_data` or accessor
+
+2. **CS8618 - Non-nullable Properties** (~15 errors)
+   - Properties need default values or nullable types
+
+3. **CS8603 - Possible Null Reference** (~10 errors)
+   - Need null checks or nullable return types
+
+4. **CS0103/CS0104 - Ambiguous/Missing References** (~5 errors)
+   - `Avx512VL` not found
+   - `Aes` ambiguous between X86 and ARM
+
+---
+
+*Plan created: 2025-12-14*
+*Target: PR #433, Issue #412*
+*Approach: Option A - Enhance CpuEngine with SIMD*
diff --git a/docs/INFERENCE_MVP_PHASES.md b/docs/INFERENCE_MVP_PHASES.md
new file mode 100644
index 000000000..de402b925
--- /dev/null
+++ b/docs/INFERENCE_MVP_PHASES.md
@@ -0,0 +1,180 @@
+# Inference MVP Phases (PR #433) — Implementation Plan
+
+This plan breaks down the remaining inference MVP work into phases that can be implemented and verified independently, while preserving the project’s facade philosophy (minimal public API surface) and default-first configuration approach.
+
+## Goals
+
+- Keep the public surface area limited to `PredictionModelBuilder` (build/train) and `PredictionModelResult` (inference), with a small number of carefully chosen inference entrypoints (e.g., session start).
+- Use industry-standard defaults when users do not specify options; allow opt-out via `InferenceOptimizationConfig`.
+- Ensure all PR #433 components are wired end-to-end: config → inference optimizer → optimized layers/kernels → serving integration.
+- Exceed industry standards where feasible (paged KV-cache, FP16 KV-cache, batching, speculative decoding, dynamic scheduling), without exposing internal IP.
+
+## Non-Goals (MVP)
+
+- Exposing layer-level kernels or optimizers publicly.
+- Full-blown public text-generation UX/API surface (tokenization, sampling, streaming) beyond what serving needs.
+- GPU-specific paging kernels (CPU-first correctness is priority for MVP).
+
+---
+
+## Phase 0 — Baseline Safety & Diagnostics
+
+**Outcome:** Optimization pipeline is observable and safe-by-default.
+
+1. Add internal inference diagnostics:
+   - Record decisions (enabled/disabled) per feature (KV-cache, masking mode resolution, flash attention, paging, speculative).
+   - Record exceptions with a reason and feature tag (do not throw from the facade in normal inference).
+2. Add stability guardrails:
+   - Avoid mutating user-supplied model instances unless explicitly requested (e.g., `cloneModel: false` internal path).
+   - When deep copy fails for a model, fall back to baseline inference and record diagnostics.
+3. Verification:
+   - Unit tests for optimizer decisions and non-throwing fallbacks.
+
+---
+
+## Phase 1 — Attention Rewrite Integration
+
+**Outcome:** All supported attention layers are rewritten consistently based on config, and the optimizer can run end-to-end.
+
+1. Wire attention rewrite decisions in `InferenceOptimizer`:
+   - `MultiHeadAttentionLayer` → `FlashAttentionLayer` when enabled.
+   - `MultiHeadAttentionLayer`/`FlashAttentionLayer` → cached attention when KV-cache enabled (causal default for sessions).
+   - `SelfAttentionLayer` conversion path to multi-head for downstream rewrites (when feasible).
+2. Add missing attention layer support:
+   - Ensure `SelfAttentionLayer`, `AttentionLayer`, and `GraphAttentionLayer` are handled for cloning/serialization and/or have clear non-supported fallbacks.
+3. Ensure cloning is truly deep:
+   - Serialization/deserialization round-trip must preserve parameters exactly.
+4. Verification:
+   - Unit tests that assert rewritten layers exist as expected per config.
+   - Clone tests verifying parameters match before mutation.
+
+---
+
+## Phase 2 — Paged Attention + Paged KV-Cache (Industry Standard Default)
+
+**Outcome:** Paged KV-cache is available and auto-enabled by default for KV-cache workloads, with opt-out.
+
+1. Default behavior:
+   - `EnablePagedKVCache = true` by default; user can disable.
+2. Wiring:
+   - `InferenceOptimizer` initializes paged cache when paged attention layers exist; otherwise falls back to contiguous KV-cache.
+3. Session behavior:
+   - Sessions should prefer causal masking when user sets `AttentionMasking=Auto`.
+4. Verification:
+   - Unit tests for paged attention kernel and cache mechanics (block tables, COW).
+   - Integration tests for session cache growth and reset.
+
+---
+
+## Phase 3 — KV-Cache Precision (FP16 default, opt-out)
+
+**Outcome:** KV-cache can store keys/values in FP16 to reduce memory and improve capacity/throughput, with opt-out to FP32.
+
+1. Configuration:
+   - Add `KVCachePrecision` with default `Auto` selecting FP16 when possible.
+2. Implementation:
+   - KV-cache uses FP16 backing storage when enabled and `T` supports conversion.
+3. Wiring:
+   - `InferenceOptimizer` resolves cache data type and records decision.
+4. Verification:
+   - Unit tests for FP16 round-trip and memory usage calculations.
+
+---
+
+## Phase 4 — Inference Sessions (Multi-Sequence, Facade-Friendly)
+
+**Outcome:** `PredictionModelResult.BeginInferenceSession()` supports multiple independent sequences for serving-style workloads without exposing internal implementation details.
+
+1. API:
+   - Keep public API minimal (session + sequence objects), do not expose layer internals.
+2. Behavior:
+   - Each sequence maintains independent KV-cache state and can `Reset()`.
+   - Session is safe to use for multiple sequences in parallel (thread-safety where feasible).
+3. Internal diagnostics:
+   - Provide internal-only stats hooks to validate caching behavior in integration tests without expanding public API.
+4. Verification:
+   - Integration tests for:
+     - Stateless `Predict()` behavior.
+     - Sequence independence.
+     - Reset restoring initial state.
+
+---
+
+## Phase 5 — Batching (Serving-First) + Resource Arbitration
+
+**Outcome:** `EnableBatching` is honored in serving and session contexts, with guardrails around incompatibilities with speculation.
+
+1. Serving integration:
+   - Use `AiDotNet.Serving` to host batching behavior and backpressure.
+2. Conflict policy:
+   - Document and implement a policy when batching and speculative decoding both enabled:
+     - Default: prioritize batching for throughput, optional override to prioritize speculation for latency.
+3. Verification:
+   - Serving-side tests around batch coalescing and max batch size enforcement.
+
+---
+
+## Phase 6 — Speculative Decoding MVP (Draft Model + Policy)
+
+**Outcome:** `EnableSpeculativeDecoding` and `SpeculationPolicy` are fully wired, with a draft model option and safe defaults.
+
+1. Configuration:
+   - Draft model selection via `DraftModelType` (NGram, small neural) and speculation depth.
+2. Session + serving:
+   - Enable speculation wherever sessions are used when flag is enabled.
+   - Serving integration for production usage (streaming/latency).
+3. Verification:
+   - Unit tests for policy decisions and safe fallback when draft model not available.
+
+---
+
+## Phase 7 - Dynamic Speculation & Alternative Speculators (Medusa/EAGLE)
+
+**Outcome:** Add next-gen speculative methods and dynamic scheduling options.
+
+1. Dynamic scheduling:
+   - Adaptive speculation depth based on acceptance rate, queue pressure, and compute budget.
+   - MVP implementation: serving-side `ContinuousBatcher` backs off speculative decoding under load and after observing low draft acceptance rates.
+2. Alternative methods:
+   - Add config hooks for Medusa/EAGLE-style multi-head draft proposals as a future opt-in.
+3. Verification:
+   - Bench-style tests (non-flaky) for acceptance-rate-driven behavior.
+
+---
+
+## Phase 8 - Inference Quantization (Gap-Closing)
+
+**Outcome:** Extend quantization support beyond training into inference areas where it is industry standard.
+
+1. KV-cache quantization:
+   - Optional per-layer KV-cache quantization (e.g., int8) with dequant on read.
+   - MVP implementation: `InferenceOptimizationConfig.KVCacheQuantization = Int8` routes KV-cache storage to int8 quantized backing storage with dequant-on-read.
+2. Weight-only quantization:
+   - Optional weight-only quant for inference (e.g., int8/int4) with fast matmul paths.
+3. Weight + activation quantization (advanced):
+   - Add as opt-in; ensure correctness-first.
+4. Verification:
+   - Unit tests validating numerics and shape correctness.
+
+---
+
+## Phase 9 - Multi-LoRA (Serving-First, Secure Defaults)
+
+**Outcome:** Multi-LoRA can be selected per request without leaking internal implementation details.
+
+1. Serving integration:
+   - Prefer selecting LoRA adapters from headers/metadata on serving side.
+   - MVP implementation: serving can route to a pre-loaded per-adapter model variant using `X-AiDotNet-Lora`/`X-AiDotNet-Adapter` headers (`{baseModelName}__{adapterId}`).
+2. Session integration:
+   - Optional adapter selection per sequence, but keep surface minimal.
+3. Verification:
+   - Serving tests for adapter routing and isolation.
+
+---
+
+## Release Checklist (Per Phase)
+
+- `dotnet build AiDotNet.sln -c Release`
+- Targeted `dotnet test` runs for touched areas
+- Update docs and XML comments to match project conventions (summary/remarks + “For Beginners” sections where appropriate)
+- Commit with conventional prefix (`feat:`, `fix:`, `test:`, `docs:`, `refactor:`) in small, regular increments
diff --git a/docs/PR433_FACADE_INFERENCE_PLAN.md b/docs/PR433_FACADE_INFERENCE_PLAN.md
new file mode 100644
index 000000000..f0e72603c
--- /dev/null
+++ b/docs/PR433_FACADE_INFERENCE_PLAN.md
@@ -0,0 +1,658 @@
+# PR #433 Facade Integration Plan (Inference Optimizations)
+
+## 0) Purpose
+
+Integrate PR #433 inference optimizations into the *existing facade pattern* so that:
+- Users configure everything via `PredictionModelBuilder.ConfigureInferenceOptimizations(...)`.
+- Users consume everything via `PredictionModelResult` (e.g., `Predict`, plus a session API).
+- Internal implementation details (optimizer, caches, kernels, batching, speculative decoding) remain hidden behind the facade.
+
+This plan is written to be actionable for a junior engineer without requiring deep prior context.
+
+---
+
+## 1) Constraints / Non‑Goals
+
+### 1.1 Facade constraints (must keep)
+- No new user-facing entry points besides:
+  - `PredictionModelBuilder` for configuration + training/build.
+  - `PredictionModelResult` for inference.
+- Any “advanced” types should be:
+  - `internal`, or
+  - nested under `PredictionModelResult` (if we must expose a session object).
+
+### 1.2 Correctness constraints (must keep)
+- "Industry standard defaults" when user omits config values.
+- Avoid semantic changes by default for non-autoregressive models, but prefer serving-grade defaults when inference optimizations are explicitly enabled:
+  - `AttentionMasking=Auto` should assume causal for inference sessions unless the model/layer explicitly indicates bidirectional attention.
+  - Paged KV-cache should be auto-enabled by default (opt-out via config).
+  - KV-cache must not silently change results for models that are not compatible with caching.
+
+### 1.3 Non-goals (for this integration pass)
+- Rewriting training-time behavior.
+- Full rewrite of the `src/InferenceOptimization/*` SIMD/IR plan (tracked separately in `INTEGRATION_PLAN_PR433.md`).
+
+---
+
+## 2) Current Issues / Gaps (What to Fix)
+
+### 2.1 “Disconnected” implementations
+New PR #433 classes exist but are not consistently invoked through the facade:
+- `FlashAttention.cs`, `CachedMultiHeadAttention.cs`, `InferenceOptimizer.cs`, KV cache variants, etc.
+
+### 2.2 Model cloning/serialization assumptions
+We need to treat `NeuralNetworkBase<T>.Clone()` behavior carefully:
+- Confirm whether it is deep copy vs shallow copy in practice.
+- If cloning relies on serialization, ensure all attention layers can be deserialized with correct constructor arguments (constructor mismatch is a known risk).
+- If serialization cannot faithfully round-trip all layer types, implement a robust deep-clone path that still preserves the facade (users never touch these internals).
+
+### 2.3 Layer coverage
+The optimizer rewrite logic must recognize more attention layer types, not just:
+- `MultiHeadAttentionLayer<T>` and `FlashAttentionLayer<T>`.
+
+Add support/coverage for (at minimum):
+- `AttentionLayer<T>` (mask-aware)
+- `SelfAttentionLayer<T>`
+- `GraphAttentionLayer<T>`
+- Any transformer encoder/decoder attention layers in `src/NeuralNetworks/Layers/*`
+
+### 2.4 Missing facade integration
+`EnableBatching` and `EnableSpeculativeDecoding` must be wired end-to-end:
+- Builder stores config.
+- Result uses config during inference.
+- Serving project should be able to leverage the same internal components.
+
+### 2.5 Paged attention integration missing
+`PagedAttention` / `PagedKVCache` exist but are not selected/used based on config.
+
+### 2.6 Tests
+We need:
+- Unit tests for low-level correctness (some exist).
+- Integration tests that validate the facade wiring end-to-end via `PredictionModelBuilder` + `PredictionModelResult`.
+
+---
+
+## 3) Target UX (What the user sees)
+
+### 3.1 Builder usage (no new user-facing components)
+```csharp
+var result = await new PredictionModelBuilder<float, Tensor<float>, Tensor<float>>()
+    .ConfigureModel(model)
+    .ConfigureInferenceOptimizations(new InferenceOptimizationConfig
+    {
+        EnableFlashAttention = true,
+        EnableKVCache = true,
+        EnableBatching = true,
+        EnableSpeculativeDecoding = true
+    })
+    .BuildAsync(x, y);
+```
+
+### 3.2 Result usage
+Default usage stays the same:
+```csharp
+var y = result.Predict(x);
+```
+
+For sequence-based inference (KV-cache, generation, serving), add a *single* facade-friendly entry:
+- `result.BeginInferenceSession()` returns a session object that manages caches/batching internally.
+
+Example shape:
+```csharp
+using var session = result.BeginInferenceSession();
+var y1 = session.Predict(x1);
+var y2 = session.Predict(x2);
+```
+
+Notes:
+- The session type should be nested under `PredictionModelResult` (or otherwise kept hidden).
+- "Clear cache" should be internal to session lifecycle; avoid exposing raw cache types.
+- Avoid adding new public cache-control methods on `PredictionModelResult`; prefer session `Dispose()`/`Reset()`.
+
+---
+
+## 4) Implementation Plan (Phased)
+
+### Phase A - Baseline facade wiring + safety (no batching/spec yet)
+
+**Goal:** Ensure optimizations are *actually applied* and correct during inference for supported model types.
+
+1) **Confirm cloning semantics**
+   - Inspect `NeuralNetworkBase<T>.Clone()` and `DeepCopy()` implementation.
+   - Decide policy:
+     - If deep clone is reliable for all relevant layers, prefer cloning before rewrites.
+     - If cloning is unreliable, either:
+       - Fix serialization/deserialization for attention layers, or
+       - Apply rewrites in-place but only inside an inference-session-scoped model instance owned by the result (never mutate the user's original model object).
+   - Required outcome for facade safety:
+     - Never mutate the user's original model object.
+     - Any optimized/mutated model instance is owned by the result/session internally.
+   - Acceptance criteria:
+      - No runtime errors when inference optimizations are enabled.
+      - No cross-request contamination (weights/caches).
+   - Fallback behavior + diagnostics (must be explicit):
+     - If an optimization is unsupported for the current model/layer/platform, it must be **auto-disabled** and inference proceeds with the baseline path (never throw by default).
+     - If an optimization throws at runtime, catch and **fall back to baseline** for that session/sequence where possible.
+     - Record decisions/exceptions via internal diagnostics (e.g., `InferenceDiagnostics.RecordDecision/RecordException`) so we can validate selection in tests and troubleshoot in serving logs without expanding the public API.
+
+2) **Make serialization/deserialization round-trip attention layers (if used by clone)**
+   - Inventory layer constructors that require metadata:
+     - `MultiHeadAttentionLayer<T>` (e.g., head count)
+     - `SelfAttentionLayer<T>`
+     - `AttentionLayer<T>` (mask semantics)
+     - `GraphAttentionLayer<T>` (graph-specific parameters)
+   - Update the model serialization format in a backward-compatible way:
+     - Preserve current header/structure so existing saved models still load.
+     - Add per-layer metadata payload (versioned) needed to reconstruct constructors.
+   - Extend deserialization to use metadata when present and fall back to safe defaults when absent.
+   - Acceptance criteria:
+     - `Clone()` becomes a true deep copy for supported model types.
+     - `OptimizeForInference()` can safely operate on a cloned instance.
+
+3) **Extend optimizer layer detection/rewrite coverage**
+   - Identify all attention-related layer types and their expected shapes/masking behavior:
+     - `MultiHeadAttentionLayer<T>`
+     - `FlashAttentionLayer<T>`
+     - `AttentionLayer<T>` (mask input)
+     - `SelfAttentionLayer<T>`
+     - `GraphAttentionLayer<T>` (graph adjacency/attention specifics)
+   - For each layer type, decide:
+     - Can we rewrite to `FlashAttentionLayer<T>` safely?
+     - Can we wrap/replace with KV-cache variants safely?
+     - If not, skip and record diagnostics (internal).
+   - Acceptance criteria:
+     - Transformer models built via default layer helper are optimized.
+     - Non-transformer models are unchanged.
+
+4) **Masking policy**
+   - Make masking decision consistent and safe:
+     - `AttentionMasking=Auto` defaults to causal masking for inference sessions unless a bidirectional model is clearly indicated.
+     - `AttentionMasking=Causal` forces causal mask.
+     - `AttentionMasking=Disabled` disables causal mask even for text generation.
+   - Heuristics for `Auto` (ordered, conservative):
+     - If the layer explicitly takes an attention mask input and one is provided, honor it (don’t infer).
+     - If KV-cache is enabled (or a session is created), assume causal unless explicitly overridden.
+     - If model metadata/task type exists and indicates non-causal (e.g., encoder/classification), prefer non-causal.
+     - If uncertain, default to causal only inside a session (so plain `Predict()` stays safest).
+   - Acceptance criteria:
+     - No causal mask applied for classification/encoder tasks unless forced.
+
+5) **Paged attention config plumbing (no swap yet)**
+   - Add config surface for paged attention selection with industry standard defaults:
+     - `EnablePagedKVCache = true` by default (opt-out).
+     - `PagedKVCacheBlockSize = 16` by default (configurable).
+   - Validate config values.
+   - Acceptance criteria:
+     - Configuration exists and is validated; actual usage comes in Phase C.
+
+---
+
+### Phase B — Inference Session API (facade-compliant)
+
+**Goal:** Provide a safe, explicit lifecycle for stateful inference features (KV-cache, batching queues).
+
+1) **Add `BeginInferenceSession()`**
+   - Implement as a method on `PredictionModelResult`:
+     - `public InferenceSession BeginInferenceSession(...)`
+   - Session responsibilities:
+     - Own (or reference) an optimized model instance.
+     - Own cache state (KV cache or paged KV cache).
+     - Provide session-scoped prediction methods.
+     - Reset/clear caches on `Dispose()`.
+
+2) **Decide session API surface**
+   - Minimum recommended:
+     - `Predict(TInput input)` for session-scoped inference.
+     - Optional: `Reset()` (but keep it on session, not on result).
+   - Avoid exposing raw cache objects.
+
+3) **Backwards compatibility**
+   - Keep `PredictionModelResult.Predict` working:
+     - It may internally use a “default session” with conservative behavior.
+     - But for KV-cache and generation patterns, prefer explicit session use.
+
+4) **Serving integration point**
+   - Provide internal hooks so `AiDotNet.Serving` can:
+     - Create sessions per request/sequence.
+     - Route speculative decoding/batching through the same internal components.
+
+Acceptance criteria:
+- Session correctly isolates cache state across concurrent requests.
+- No public exposure of internal optimization classes beyond session wrapper.
+- No public cache-control surface on `PredictionModelResult` (session owns lifecycle).
+
+---
+
+### Phase C — PagedAttention / PagedKVCache integration
+
+**Goal:** Use paged KV-cache for long-context / many-concurrent-sequence serving.
+
+1) **Select KV-cache backend**
+   - Based on config and/or heuristics (but default to paged when enabled):
+     - If `EnablePagedKVCache` is true, prefer `PagedKVCache<T>`.
+     - Otherwise use contiguous `KVCache<T>` (optionally with sliding window).
+
+2) **Bridge cached attention layers to paged cache**
+   - Options:
+     - Implement a new cached attention layer variant that reads/writes via `PagedKVCache`.
+     - Or implement an adapter interface (internal) that abstracts KV-cache operations used by cached attention.
+   - Ensure:
+     - Prefill path supported.
+     - Decode path supported.
+     - Sliding window works without O(n) shifting (paged/ring behavior).
+   - Ensure causal masking logic supports both:
+     - Prefill (many query tokens at once)
+     - Decode (queryOffset / position-based masking)
+
+3) **Integrate with serving continuous batching**
+   - Ensure a single source of truth for per-sequence cache state.
+   - Session ID / sequence ID mapping must be deterministic and internal.
+
+Acceptance criteria:
+- Multi-sequence inference works without cache corruption.
+- Memory growth is bounded via paging/windowing.
+
+---
+
+### Phase D — EnableBatching integration (facade + serving)
+
+**Goal:** Make `EnableBatching` actually affect inference.
+
+1) **Define batching scope**
+   - Offline “single-thread user calling Predict” batching is usually not beneficial.
+   - Batching matters for:
+     - `AiDotNet.Serving` pipelines
+     - Concurrent inference workloads
+
+2) **Implement batching in session/serving**
+   - In `PredictionModelResult.BeginInferenceSession()` optionally accept:
+     - `BatchingMode` (Auto/Disabled/Force) (or use config only).
+   - In serving:
+     - Use `RequestBatcher`/`ContinuousBatchingRequestBatcher` to batch requests.
+     - Ensure batched forward uses the optimized model instance.
+
+3) **Metrics & safety**
+   - Ensure per-request latency bounds (timeout).
+   - Ensure correctness when mixing sequence lengths.
+
+Acceptance criteria:
+- When enabled and used via serving, throughput increases measurably.
+- Batching never changes numerical outputs (only performance).
+
+---
+
+### Phase E — EnableSpeculativeDecoding integration
+
+**Goal:** Route speculative decoding through facade/session/serving for text generation models.
+
+1) **Add a facade entry point for generation**
+   - If generation already exists elsewhere, reuse it.
+   - Otherwise, add a method on the *session*:
+     - `GenerateNextToken(...)` or `Generate(...)` (scope depends on existing LLM infra).
+
+2) **Wire `SpeculativeDecoder<T>`**
+   - Construct from config when:
+     - Task type is text generation (or user forces).
+     - Draft model is configured (NGram default is allowed if desired).
+   - Ensure the “target model forward” used by the decoder is the optimized model forward.
+
+3) **Serving integration**
+   - Ensure speculative decoding works with:
+     - KV-cache
+     - Paged cache (if enabled)
+     - Continuous batching (optional but desirable)
+
+Acceptance criteria:
+- Speculative decoding can be enabled end-to-end via builder config.
+- No public exposure of speculative internals; only facade methods (prefer serving-only unless a strong session use-case exists).
+
+---
+
+## 4.1) Gap Analysis Backlog (to exceed industry standards)
+
+These are common inference optimizations worth confirming (or adding) beyond the basic wiring work:
+- **Attention kernels:** prefill + decode correctness, queryOffset-based causal masking, multi-query attention support, stable softmax.
+- **KV-cache:** per-layer/per-batch correctness, sliding window, paged/ring behavior, cache eviction policy.
+- **Inference quantization (missing today):**
+  - **Weight-only quantization** (INT8/INT4, per-channel scales, optional GPTQ/AWQ style offline calibration) for transformer blocks and projection matrices.
+  - **Activation quantization** (INT8) for matmuls/MLP with calibration (min/max, percentile, KL) and safe fallbacks.
+  - **KV-cache quantization** (INT8/FP8) for K/V storage with dequant-on-read or fused quantized attention kernels; configurable per-layer/per-head.
+  - **Mixed precision** defaults (FP16/BF16/FP8 where supported) with numerically safe softmax/LayerNorm paths.
+  - **Quantization-aware cache policies** (paged KV-cache block reuse/eviction with quantized blocks).
+- **Thread safety:** concurrent sessions, cache isolation, avoiding shared mutable state.
+- **Serving throughput:** continuous batching, request timeouts, micro-batching heuristics.
+- **Speculative decoding:** draft model choices (N-gram vs small neural draft), accept/reject efficiency metrics.
+- **Speculation + batching co-scheduling (missing today):**
+  - Avoid “double spending” compute when both continuous batching and speculative decoding are enabled.
+  - Add policies that trade throughput vs latency under load.
+  - Add modern multi-candidate methods (Medusa/EAGLE) and dynamic speculation (“speculative scheduling”).
+- **Multi-LoRA (missing today):**
+  - Per-session/per-sequence adapter selection, hot-swap, and safe isolation across concurrent requests.
+  - Multi-adapter composition policies (merge vs stack) and caching of merged weights.
+- **Model cloning/serialization:** versioning + backward compatibility for saved models, deterministic round-trips.
+- **Telemetry (internal):** rewrite decisions, cache hit rates, kernel selection, disable-on-failure behavior (internal only).
+
+---
+
+## 4.2) Inference Quantization Roadmap (New)
+
+Goal: add inference-side quantization without expanding public surface beyond `InferenceOptimizationConfig` (builder) and session/serving behavior (result).
+
+### 4.2.1) What exists today
+- Deployment/quantization configuration exists, but current usage is primarily training/export oriented.
+- PR#433 inference optimizations currently operate on FP32/FP64 tensors and caches.
+- MVP now supports KV-cache int8 storage via `InferenceOptimizationConfig.KVCacheQuantization = Int8` (dequant-on-read).
+
+### 4.2.2) Target capabilities (industry standard baseline → exceed)
+1) **Weight-only quantization (WOQ)** for inference
+   - INT8 first (simpler), then INT4.
+   - Per-channel scales for linear projections (Q/K/V/O, FFN).
+   - Offline calibration supported through builder tooling/agents (hidden behind facade).
+
+2) **Activation quantization**
+   - Optional INT8 activations for matmul-heavy blocks.
+   - Calibration strategies:
+     - min/max
+     - percentile
+     - KL-divergence
+   - Safe fallback per-layer when calibration insufficient.
+
+3) **KV-cache quantization**
+   - Quantize K/V storage (INT8 or FP8) with dequant-on-read OR fused kernel support.
+   - Must work with:
+     - contiguous KV cache
+     - paged KV cache
+     - sliding window mode
+   - Defaults:
+     - Off by default until kernels are proven stable; once stable, enable by default for serving workloads with opt-out.
+
+### 4.2.3) Facade integration (no new public types)
+Add to `InferenceOptimizationConfig` (or reuse `QuantizationConfig` internally):
+- `EnableWeightOnlyQuantization` (default: false until validated)
+- `WeightQuantizationBits` (8/4)
+- `EnableActivationQuantization` (default: false)
+- `EnableKVCacheQuantization` (default: false initially; planned default true for serving after validation)
+- `KVCacheQuantizationFormat` (INT8/FP8)
+- `QuantizationCalibrationMode` (Auto/MinMax/Percentile/KL)
+
+Implementation detail: keep all quantized kernels/types `internal` and selected via the optimizer/session/serving pipeline.
+
+### 4.2.4) Testing/acceptance for quantization
+- Golden-output tests with tolerances per quant mode.
+- Determinism tests (same input → same output) under identical config.
+- Memory-budget tests: confirm KV-cache footprint reduction.
+- Regression tests: ensure non-quantized path unchanged.
+
+---
+
+## 4.3) Speculation vs Continuous Batching (Scheduling) (New)
+
+### 4.3.1) Problem
+Continuous batching and speculative decoding can compete for the same compute/memory bandwidth:
+- Speculation increases per-step compute (draft + verify), but may reduce total steps.
+- Continuous batching improves utilization by packing many sequences, but adds scheduling complexity.
+
+### 4.3.2) Policy-based approach (recommended)
+Keep `EnableSpeculativeDecoding` usable in sessions and serving, but add **internal policies** that decide *when* to apply it:
+- **Latency-first** (small batches): enable speculation more often.
+- **Throughput-first** (large batches): reduce speculation depth or disable speculation under high load.
+- **Auto** (default): dynamic based on queue depth, batch size, and recent accept-rate.
+
+Add to config (still facade-only):
+- `SpeculationPolicy` (Auto/ForceOn/ForceOff/LatencyFirst/ThroughputFirst)
+- `MaxSpeculationComputeFraction` (cap draft overhead)
+
+### 4.3.3) Dynamic speculation (“speculative scheduling”)
+Implement “dynamic speculation” that adapts:
+- speculation depth
+- draft batch size
+- whether to speculate at all
+based on:
+- rolling accept-rate
+- queue depth / batcher load
+- KV-cache pressure (paged block availability)
+
+### 4.3.4) Medusa / EAGLE support
+These methods reduce “draft model” overhead by producing multiple candidate tokens with lightweight heads:
+- **Medusa**: extra heads on top of the target model to propose multiple tokens.
+- **EAGLE**: enhanced draft proposals with improved verification efficiency.
+
+Plan:
+1) Add internal capability detection: “model supports Medusa/EAGLE heads”.
+2) Extend session/serving generation to use these heads when enabled.
+3) Add config flags:
+   - `SpeculativeMethod` (ClassicDraftModel/Medusa/Eagle/Auto)
+   - `NumSpeculationHeads` (for Medusa-like methods)
+4) Ensure policies still apply (auto-disable under high batching load).
+
+---
+
+## 4.4) Multi-LoRA for Inference Sessions (New)
+
+### 4.4.1) Goals
+- Allow multiple LoRA adapters to be applied per-session/per-sequence without exposing LoRA internals publicly.
+- Make it compatible with serving: per-request adapter selection.
+- MVP interim: serving can route to a pre-loaded per-adapter model variant via `X-AiDotNet-Lora`/`X-AiDotNet-Adapter` headers using `{baseModelName}__{adapterId}`.
+
+### 4.4.2) Required behaviors
+1) **Selection**
+   - Session/serving chooses adapter by ID (string) via internal route/metadata.
+2) **Isolation**
+   - No cross-request weight pollution; adapters never mutate the base weights.
+3) **Performance**
+   - Cache merged weights per adapter (and per precision/quantization mode).
+4) **Composition**
+   - Support multiple adapters:
+     - merge (weighted sum) OR
+     - stack (apply sequential deltas)
+   - Keep composition policy internal, configurable via builder options.
+
+### 4.4.3) Test plan for multi-LoRA
+- Two sequences with different adapters must produce different outputs for same input.
+- Same adapter reused across sequences must hit cache and remain deterministic.
+- Adapter hot-swap mid-session must not corrupt caches (KV-cache reset rules must be defined).
+
+## 5) Testing Plan
+
+### 5.1 Unit tests (fast, deterministic)
+Add/extend tests for:
+- FlashAttention masking correctness (including cached decoding offsets).
+- KV-cache correctness across layers, batches, and truncation.
+- Optimizer rewrite decisions for each supported attention layer type.
+
+### 5.2 Integration tests (facade end-to-end)
+Create tests that only use the public facade:
+1) Build a small transformer with `PredictionModelBuilder` + `ConfigureInferenceOptimizations()`.
+2) Call `Predict()` and assert:
+   - No exceptions.
+   - Output shape matches expected.
+3) Call `BeginInferenceSession()` and run multiple steps:
+   - Verify caches don't leak across sessions.
+   - Verify `Dispose()` resets state.
+4) Validate attention layer coverage:
+   - Ensure models using `AttentionLayer<T>`, `SelfAttentionLayer<T>`, and `GraphAttentionLayer<T>` either:
+     - optimize safely, or
+     - are explicitly skipped with internal diagnostics (but still function).
+
+If `AiDotNet.Serving` has a test harness, add a serving integration test:
+- Spin up a batcher with a model + config.
+- Submit concurrent requests.
+- Validate outputs and ensure no deadlocks/file-lock issues.
+
+### 5.3 Performance smoke tests (optional)
+- Benchmarks belong in benchmark projects; for this PR, a smoke test is enough:
+  - Validate the optimized path is selected (internal diagnostics).
+
+---
+
+## 6) Acceptance Criteria Checklist
+
+- [ ] No new public entrypoints besides builder/result (session may be nested under result).
+- [ ] `ConfigureInferenceOptimizations()` has full effect in inference.
+- [ ] KV-cache correctness for multi-layer models (no cross-layer corruption).
+- [ ] Attention optimization supports major attention layer types used in the repo.
+- [ ] Paged KV-cache can be enabled (backend selection + attention integration).
+- [ ] Batching and speculative decoding are usable via facade and serving.
+- [ ] Speculation + batching policy prevents throughput regressions under load (Auto backoff).
+- [ ] Inference WOQ (INT8) works end-to-end with safe fallback.
+- [ ] Multi-LoRA works per-request/per-sequence with cache isolation (KV reset on adapter change).
+- [ ] Unit tests + integration tests cover the end-to-end wiring.
+
+Mapping (so reviewers can quickly validate where each checklist item is implemented):
+
+| Checklist item | Primary phase(s) | Primary MVP step(s) | Notes / validation |
+| --- | --- | --- | --- |
+| Minimal public surface | A–E | MVP-0..3 | Session types nested/hidden; internals remain `internal`. |
+| Config has full effect | A, B | MVP-0 | Builder stores config; result/session consumes it. |
+| KV-cache correctness | B, C | MVP-0 | Per-layer/per-sequence cache isolation; no cross-layer corruption. |
+| Attention layer coverage | A | MVP-0 | Support/skip with diagnostics for `AttentionLayer`, `SelfAttentionLayer`, `GraphAttentionLayer`, etc. |
+| Paged KV-cache integration | C | MVP-0 | Paged backend selection + cached attention bridge. |
+| Batching + speculation usable | D, E | MVP-1 | Serving uses the same internals; session support only if facade stays minimal. |
+| Speculation backoff policy | E | MVP-1 | Auto/LatencyFirst/ThroughputFirst; backs off under batching load. |
+| Inference quantization (WOQ) | (add-on) | MVP-2 | Safe fallback per-layer; deterministic tests. |
+| Multi-LoRA per-request/per-seq | (add-on) | MVP-3 | Adapter selection + cache reset on adapter change. |
+| End-to-end tests | A–E | MVP-0..3 | Integration tests must use facade-only entry points. |
+
+---
+
+## 7) Resolved Decisions (from discussion)
+
+- **Facade:** Keep public surface minimal; avoid public cache-control methods on the result.
+- **`BeginInferenceSession()` shape:** Choose whatever is best; prefer a nested session type under `PredictionModelResult`.
+- **`AttentionMasking=Auto`:** Assume causal in typical inference/session usage when task type isn’t set; provide opt-out/override via config.
+- **Paged KV-cache:** Auto-enabled by default; users can opt-out via config.
+- **Speculative decoding:** Serving-first; session support only if it fits cleanly without expanding public surface.
+- **Cloning policy:** Improving cloning via better serialization/deserialization is acceptable to ensure a true deep copy.
+
+## 8) Remaining Questions (small, but useful before coding)
+
+1) Should a session support multiple independent sequences (e.g., `session.CreateSequence()` / `sequenceId`), or is “one session = one sequence” acceptable for now?
+2) Do you already have a preferred public API for text generation (e.g., `Generate(...)`) elsewhere, or should speculative decoding remain strictly within `AiDotNet.Serving` for now?
+
+---
+
+## 9) MVP Sequencing (to raise implementation confidence)
+
+This section turns the backlog into a concrete, low-risk execution order with explicit "first targets" and acceptance checks.
+It is written so a junior engineer can start implementation without having to make major architectural decisions.
+
+### 9.0) Mapping: Phases A–E vs MVP-0/1/2/3
+
+Phases **A–E** describe the main integration areas (wiring, sessions, paging, batching, speculation). The **MVP-0/1/2/3**
+sequence is the recommended *execution order* that also adds two additional "industry standard" gaps (quantization and multi-LoRA).
+
+| Phase (A–E) | MVP step(s) that implement it | Notes |
+|---|---|---|
+| Phase A (baseline wiring + safety) | MVP-0 | Guardrails + diagnostics + safe fallbacks. |
+| Phase B (session API) | MVP-0 | Session surface remains nested under `PredictionModelResult`. |
+| Phase C (paged KV-cache) | MVP-0 / MVP-1 | Paged cache is a default serving primitive; speculation should work with it. |
+| Phase D (EnableBatching) | MVP-0 / MVP-1 | Batching is serving-first; MVP-1 adds arbitration with speculation. |
+| Phase E (EnableSpeculativeDecoding) | MVP-1 | Implements speculation and its policy so it doesn't regress throughput under load. |
+| (not in A–E) Inference quantization | MVP-2 | Adds quantization beyond the original A–E scope. |
+| (not in A–E) Multi-LoRA | MVP-3 | Adds per-request/per-sequence adapter selection beyond the original A–E scope. |
+
+### 9.1) MVP-0: Guardrails (do first)
+1) Keep public API surface unchanged:
+   - Only `PredictionModelBuilder` and `PredictionModelResult` are user-facing.
+   - Sessions remain nested under `PredictionModelResult`.
+   - All new inference types remain `internal` (kernels/caches/schedulers/draft models).
+2) Add internal policy switches (config-driven) but keep defaults safe:
+   - If anything fails (unsupported model/layer), auto-disable that optimization and fall back to baseline inference.
+3) Add internal diagnostics (non-user-facing) to record which optimizations were applied and why others were skipped.
+
+Acceptance:
+- `dotnet build AiDotNet.sln` passes.
+- Existing unit tests still pass (warnings acceptable, no new failures introduced by MVP-0).
+
+### 9.2) MVP-1: Speculative Decoding in Sessions + Serving with Auto Policy
+
+**Goal:** Speculation is available wherever sessions are used when `EnableSpeculativeDecoding=true`, but it does not tank throughput under load.
+
+**First target method:** Classic “draft-model speculation” (existing draft model support) with a new internal policy layer.
+
+Implementation steps:
+1) Introduce a single internal decision point (used by both session and serving):
+   - “Should we speculate this step?” and “What depth should we use?”
+2) Implement `SpeculationPolicy=Auto` (internal default recommendation):
+   - Inputs to policy:
+     - rolling accept-rate
+     - current batch size / queue depth (serving)
+     - KV-cache pressure (paged block availability, optional)
+   - Behavior:
+     - Under high batching load: reduce depth or disable speculation.
+     - Under low load / latency-sensitive: increase depth up to configured max.
+3) Respect `EnableBatching` and `EnableSpeculativeDecoding` together:
+   - When both true, `Auto` policy prevents “double spending” compute.
+   - Provide `ForceOn/ForceOff` to override for experimentation (still configured via builder, not new public APIs).
+
+Defaults:
+- Sessions: if `EnableSpeculativeDecoding=true` and `SpeculationPolicy=Auto`, speculate when accept-rate is good and batch load is low/moderate.
+- Serving: if continuous batching queue is deep, speculation backs off automatically.
+
+Acceptance:
+- Integration tests show sessions still isolate state across sequences.
+- Serving throughput under batching load does not regress materially when `EnableSpeculativeDecoding=true` (policy must disable speculation under heavy load).
+
+### 9.3) MVP-2: Inference Quantization “First Target” (Weight-Only INT8)
+
+**Goal:** Get a real inference quantization win with minimal kernel churn and low correctness risk.
+
+**First target mode:** **Weight-only INT8 (WOQ)** for dense/linear matmuls in transformer blocks:
+- Q/K/V projections
+- output projection
+- FFN projections
+
+Non-goals for MVP-2 (explicitly deferred):
+- Activation quantization (INT8) (Phase 2)
+- KV-cache quantization (INT8/FP8) (Phase 3)
+- INT4 weight-only (Phase 4)
+
+Implementation steps:
+1) Add internal quantized weight containers (per-channel scales) and an `internal` matmul kernel that supports:
+   - FP32/FP16 activations * INT8 weights → FP32/FP16 output
+2) Add configuration wiring via `InferenceOptimizationConfig` (builder-only surface):
+   - `EnableWeightOnlyQuantization` (default false until validated)
+   - `WeightQuantizationBits` (start with 8)
+   - `QuantizationCalibrationMode` (Auto/MinMax/Percentile/KL) — for WOQ, “calibration” is mostly scale estimation; keep it simple initially.
+3) Selection rules:
+   - Only apply WOQ when model/task matches supported inference path (e.g., transformer-style models).
+   - Skip layers not yet supported; do not partially quantize in ways that break determinism.
+4) Agent support (optional but planned):
+   - Agents can recommend enabling WOQ for serving workloads; still configured via builder.
+
+Acceptance:
+- Accuracy within tolerance against FP baseline on deterministic tests.
+- Measurable memory reduction for weights (and ideally throughput gain on CPU/GPU depending on engine).
+
+### 9.4) MVP-3: Multi-LoRA (Per-Session/Per-Request Adapter Selection)
+
+**Goal:** Multiple LoRA adapters can be used concurrently (serving) or per-sequence (sessions) without exposing LoRA internals publicly.
+
+First target behavior:
+1) Adapter selection:
+   - Serving: adapter ID selected per request (e.g., metadata field in serving request model).
+   - Sessions: adapter ID set at sequence creation time (internal hook; no new public surface required beyond existing builder/result/session shape).
+2) Weight handling:
+   - Never mutate base weights.
+   - Cache merged weights per adapter ID (and per precision/quantization mode) to avoid recomputing merges.
+3) KV-cache interaction rules:
+    - If adapter changes for a given sequence, **reset KV-cache for that sequence only** (deterministic + correctness first).
+    - Scope of reset (must be consistent across backends):
+      - Reset contiguous `KVCache<T>` *or* paged `PagedKVCache<T>` state for that sequence ID.
+      - Do not clear other sequences' caches.
+      - Also reset any sequence-scoped optimized model state that depends on the adapter (e.g., merged weights / optimizer state), so adapter changes cannot reuse stale K/V pages.
+
+Non-goals for MVP-3 (defer):
+- Multi-adapter composition beyond simple “single adapter at a time” (Phase 2: merge/stack).
+- Hot-swapping adapters mid-generation without KV reset (Phase 3: advanced cache partitioning).
+
+Acceptance:
+- Two concurrent sequences using different adapters produce different outputs for same input.
+- Same adapter reused across sequences hits merge cache and remains deterministic.
+
+### 9.5) Phase 2+ (after MVPs)
+1) Dynamic speculation improvements (speculative scheduling refinements).
+2) Medusa vs EAGLE:
+   - Recommended order: implement **Medusa first** if it fits the model architecture best (multiple lightweight heads), then add EAGLE if needed.
+3) Activation quantization (INT8) and then KV-cache quantization (INT8/FP8), including paged KV-cache support.
+4) Multi-adapter LoRA composition (merge/stack), plus better cache invalidation rules.
diff --git a/docs/PR433_PHASE_AUDIT.md b/docs/PR433_PHASE_AUDIT.md
new file mode 100644
index 000000000..0b0a70bde
--- /dev/null
+++ b/docs/PR433_PHASE_AUDIT.md
@@ -0,0 +1,358 @@
+# PR #433 — Strict 9‑Phase Audit + Gap‑Closure Plan
+
+This document audits `docs/INFERENCE_MVP_PHASES.md` phase-by-phase against the current PR #433 implementation, using **concrete code locations and tests** as evidence, and lists the remaining work required to reach **100% confidence** with production-ready behavior.
+
+**Audit basis**
+- Phase source of truth: `docs/INFERENCE_MVP_PHASES.md`
+- Branch head used for this audit: `9e493239`
+
+---
+
+## Current confidence summary
+
+**Overall confidence that all 9 phases are 100% complete:** **~95%** (MVP plan complete; remaining work is mostly post-MVP feature depth such as INT4/activation quantization).
+
+**High-confidence areas:** Phase 1, 2, 3, 4, 6, 9 (core wiring + tests exist).
+
+**Low-confidence areas:** Phase 8 (post-MVP quantization depth such as INT4 and activation quantization).
+
+---
+
+## Phase 0 — Baseline Safety & Diagnostics
+
+**Phase intent:** observable and safe-by-default (auto-disable on unsupported, record decisions/exceptions, avoid mutating user models).
+
+**Implemented evidence**
+- Diagnostics collector: `src/Helpers/InferenceDiagnostics.cs:1`
+- Optimizer decision recording: `src/Inference/InferenceOptimizer.cs:119`
+- Serving decision recording: `src/Serving/ContinuousBatching/ContinuousBatcher.cs:400`
+- Session decision recording (Multi‑LoRA): `src/Models/Results/PredictionModelResult.cs:1317`
+
+**Existing verification**
+- Indirect coverage through tests that validate fallback behavior (speculation fallback, etc.):
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:92`
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:116`
+
+**Status:** Closed for MVP.
+
+**Verification added**
+- Diagnostics toggling (env var) + queue clear:
+  - `tests/AiDotNet.Tests/UnitTests/Helpers/InferenceDiagnosticsTests.cs:7`
+- Optimizer decision logging is exercised when diagnostics are enabled:
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:14`
+
+---
+
+## Phase 1 — Attention Rewrite Integration
+
+**Phase intent:** optimizer rewrites supported attention layers consistently; cloning is truly deep.
+
+**Implemented evidence**
+- Attention rewrite selection, including SelfAttention conversion and Flash/KV paths:
+  - Layer detection and rewrite: `src/Inference/InferenceOptimizer.cs:95`
+  - SelfAttention conversion: `src/Inference/InferenceOptimizer.cs:577`
+- Clone/deep copy via serialization:
+  - Serialization metadata and activation persistence: `src/NeuralNetworks/NeuralNetworkBase.cs:1311`
+
+**Existing verification**
+- Rewrites:
+  - Flash rewrite: `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:14`
+  - KV cached rewrite for text generation: `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:36`
+  - SelfAttention -> cached rewrite: `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:67`
+- Clone correctness baseline:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:213`
+
+**Status:** Closed for MVP (explicit skip coverage; no rewrite is attempted).
+
+**Verification added**
+- Explicit skip (no crash, no rewrite) tests:
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:206`
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:226`
+
+---
+
+## Phase 2 — Paged Attention + Paged KV‑Cache
+
+**Phase intent:** paged KV-cache is available and default-on; attention layers bridge to paged cache.
+
+**Implemented evidence**
+- Paged cache initialization + selection:
+  - `src/Inference/InferenceOptimizer.cs:276`
+- Paged cache implementation and guards:
+  - `src/Inference/PagedAttention/PagedKVCache.cs:26`
+- Paged cached attention layer:
+  - `src/Inference/PagedCachedMultiHeadAttention.cs:1`
+
+**Existing verification**
+- Unit coverage for paged cache + kernel:
+  - `tests/AiDotNet.Tests/UnitTests/Inference/PagedAttentionTests.cs:454`
+- Serving-side paged attention stability test exists (and net471 guard was added previously):
+  - `tests/AiDotNet.Tests/UnitTests/Serving/ServingComponentsTests.cs` (see paged attention test name if present)
+
+**Status:** Closed for MVP.
+
+**Verification added**
+- Session integration verifies paged KV-cache initialization + paged attention rewrite selection via internal stats:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:270`
+
+---
+
+## Phase 3 — KV‑Cache Precision (FP16 default, opt-out) + Quantized KV‑cache
+
+**Phase intent:** FP16 default for cache when supported; opt-out; optional int8 quantization.
+
+**Implemented evidence**
+- Precision/quantization resolution:
+  - `src/Inference/InferenceOptimizer.cs:247`
+- KV-cache FP16 + int8 storage:
+  - `src/Inference/KVCache.cs:99`
+- Config surface:
+  - `src/Configuration/InferenceOptimizationConfig.cs:152`
+  - `src/Configuration/InferenceOptimizationConfig.cs:167`
+
+**Existing verification**
+- Unit tests:
+  - FP16: `tests/AiDotNet.Tests/UnitTests/Inference/KVCacheTests.cs:61`
+  - Int8: `tests/AiDotNet.Tests/UnitTests/Inference/KVCacheTests.cs:96`
+- Integration test (int8 selection is visible via internal stats):
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:157`
+
+**Status:** Closed for MVP.
+
+**Verification added**
+- Session integration verifies FP16 selection in Auto mode:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:216`
+
+---
+
+## Phase 4 — Inference Sessions (Multi‑Sequence, Facade‑Friendly)
+
+**Phase intent:** `PredictionModelResult.BeginInferenceSession()` supports multi-sequence inference with isolation; minimal public API; internal stats for tests.
+
+**Implemented evidence**
+- Facade entrypoint:
+  - `src/Models/Results/PredictionModelResult.cs:1024`
+- Multi-sequence support:
+  - `src/Models/Results/PredictionModelResult.cs:1118`
+- Per-sequence internal stats hook:
+  - `src/Models/Results/PredictionModelResult.cs:1270`
+
+**Existing verification**
+- Predict remains stateless even with optimizations configured:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:24`
+- Multi-sequence independence:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:77`
+- Reset restores baseline state:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:130`
+
+**Status:** Closed for MVP.
+
+**Verification added**
+- Concurrent multi-sequence Predict test:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:159`
+
+---
+
+## Phase 5 — Batching (Serving‑First) + Resource Arbitration
+
+**Phase intent:** batching enabled in serving; clear policy for batching vs speculation conflicts.
+
+**Implemented evidence**
+- Continuous batching implementation:
+  - `src/Serving/ContinuousBatching/ContinuousBatcher.cs:1`
+- Conflict policy hooks and backoff:
+  - `src/Serving/ContinuousBatching/ContinuousBatcher.cs:426`
+
+**Existing verification**
+- Serving batching tests:
+  - `tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs:11`
+- Serving integration test verifies batching with concurrent requests:
+  - `tests/AiDotNet.Serving.Tests/ServingIntegrationTests.cs:298`
+
+**Status:** Closed for MVP (serving arbitration tests added; sessions do not batch across sequences).
+
+---
+
+## Phase 6 — Speculative Decoding MVP (Draft Model + Policy)
+
+**Phase intent:** speculative decoding is wired via config; safe fallback when draft unavailable; serving integration.
+
+**Implemented evidence**
+- Inference optimizer speculative initialization:
+  - `src/Inference/InferenceOptimizer.cs:726`
+- Config and policy:
+  - `src/Configuration/InferenceOptimizationConfig.cs:389`
+  - `src/Configuration/InferenceOptimizationConfig.cs:449`
+
+**Existing verification**
+- Fallback to NGram:
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:92`
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:116`
+
+**Status:** Closed for MVP (speculative decoding is configured in sessions but only executed by serving/generation flows, not plain `Predict()`).
+
+**Verification added**
+- Session integration validates "configured but not executed during Predict" behavior:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:244`
+
+---
+
+## Phase 7 — Dynamic Speculation & Alternative Speculators (Medusa/EAGLE)
+
+**Phase intent:** dynamic scheduling (acceptance/queue pressure) and alternative methods.
+
+**Implemented evidence (partial)**
+- Config hooks exist:
+  - `src/Configuration/InferenceOptimizationConfig.cs:462`
+  - `src/Configuration/InferenceOptimizationConfig.cs:523`
+- Serving backoff logic (dynamic-ish policy):
+  - `src/Serving/ContinuousBatching/ContinuousBatcher.cs:426`
+
+**Existing verification (partial)**
+- Policy tests:
+  - Auto acceptance backoff: `tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs:589`
+  - ThroughputFirst behavior: `tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs:651`
+
+**Status:** Closed for MVP.
+
+---
+
+## Phase 8 — Inference Quantization (Gap‑Closing)
+
+**Phase intent:** inference quantization beyond training: KV-cache quantization + weight-only quantization; later activation quantization.
+
+**Implemented evidence (partial)**
+- KV-cache quantization (int8) is wired and implemented:
+  - `src/Inference/InferenceOptimizer.cs:247`
+  - `src/Inference/KVCache.cs:99`
+- Weight-only quantization MVP (Dense-only float):
+  - `src/Inference/InferenceOptimizer.cs:538`
+  - `src/Inference/Quantization/QuantizedDenseLayer.cs:10`
+  - `src/Inference/Quantization/Int8WeightOnlyQuantization.cs:5`
+
+**Existing verification**
+- WOQ rewrite and output preservation:
+  - `tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs:139`
+- KV-cache quantization verified in integration:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:157`
+
+**Status:** Closed for MVP (WOQ covers Dense + paged attention projections with correctness tests).
+
+**Post-MVP opportunities**
+- INT4 WOQ and activation quantization (opt-in, correctness-first).
+
+---
+
+## Phase 9 — Multi‑LoRA (Serving‑First, Secure Defaults)
+
+**Phase intent:** per-request adapter selection in serving; optional per-sequence selection in sessions; no public LoRA internals; cache reset rules.
+
+**Implemented evidence**
+- Serving adapter routing (header-based):
+  - `src/AiDotNet.Serving/Controllers/InferenceController.cs:220`
+- Session per-sequence task selection + reset:
+  - `src/Models/Results/PredictionModelResult.cs:1125`
+  - `src/Models/Results/PredictionModelResult.cs:1225`
+
+**Clone/serialization correctness (critical for isolation)**
+- Serialization v2 + extras:
+  - `src/NeuralNetworks/NeuralNetworkBase.cs:1261`
+  - `src/NeuralNetworks/Layers/ILayerSerializationExtras.cs:1`
+- MultiLoRA metadata + deterministic ordering + frozen-base extras:
+  - `src/LoRA/Adapters/MultiLoRAAdapter.cs:54`
+- MultiLoRA deserialization support:
+  - `src/Helpers/DeserializationHelper.cs:332`
+
+**Existing verification**
+- Serving routes to variant model:
+  - `tests/AiDotNet.Serving.Tests/ServingIntegrationTests.cs:232`
+- Session sequences isolate task selection:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:182`
+
+**Status:** Closed for MVP.
+
+**Verification added**
+- Task switch resets cache state (same sequence) and records MultiLoRA decision:
+  - `tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs:348`
+
+---
+
+## Unresolved PR threads (code scanning)
+
+Two PR threads are unresolved because they are code-scanning alert threads (not manually resolvable via the review API):
+- `src/AiDotNet.Tensors/Engines/Optimization/PerformanceProfiler.cs`
+- `src/InferenceOptimization/Kernels/AttentionKernel.cs`
+
+For strict closure, any fixes should be recorded as comments on those threads (per repo workflow).
+
+---
+
+## Gap‑closure plan to reach 100% confidence (prioritized)
+
+### P0 — Required to claim “100% complete”
+1) **Phase 7 implementations (not just hooks)**
+   - Define internal abstraction(s) (no new public API):
+     - `internal interface ISpeculativeProposer<T>`: `Propose(...)` returns N candidate tokens (or token trees) + optional scores.
+     - `internal sealed class ClassicDraftModelProposer<T>` wraps existing `IDraftModel<T>`.
+     - `internal sealed class MedusaProposer<T>` and/or `EagleProposer<T>`: MVP uses “multi-head proposal” logic implemented *internally* (even if initial implementation is CPU-only).
+   - Wire proposer selection:
+     - `InferenceOptimizationConfig.SpeculativeMethod` selects proposer (Auto=>ClassicDraftModel).
+     - Serving (`ContinuousBatcher`) uses proposer; sessions use proposer only if session exposes a generation API (otherwise serving-first is acceptable, but must be documented).
+   - Implement **dynamic scheduling** (a real algorithm, not only queue-size backoff):
+     - Inputs: acceptance rate EMA, batch size / queue depth, recent latency (optional), configured max depth.
+     - Output: per-step speculation depth (and optionally proposer disable).
+     - Required properties: deterministic in tests (use fixed seeds and controlled inputs), monotonic backoff under sustained low acceptance, and fast recovery under high acceptance.
+   - Tests (must be deterministic / non-flaky):
+     - Acceptance rate drives depth up/down (unit).
+     - Under batching load, speculation is disabled or depth reduced (unit).
+     - Policy respects ForceOn/ForceOff/LatencyFirst/ThroughputFirst (unit).
+
+2) **Phase 8 broaden quantization to “industry standard” scope**
+   - Expand beyond “DenseLayer-as-a-top-level-layer” where possible:
+     - If transformer blocks are composed from explicit `DenseLayer<T>` layers in the model graph, extend rewrite detection to those layers (straightforward).
+     - If attention layers own projection matrices internally (common), add **internal** quantization paths inside:
+       - `src/Inference/CachedMultiHeadAttention.cs` (Q/K/V/O matvecs)
+       - `src/Inference/PagedCachedMultiHeadAttention.cs` (paged kernel weight paths)
+       - `src/NeuralNetworks/Attention/FlashAttentionLayer.cs` (if applicable)
+   - Quantization modes and constraints for MVP:
+     - WOQ INT8 for float inference only (keep correctness first).
+     - Per-row/per-channel scales; deterministic rounding.
+     - Clear fallback to FP if unsupported or errors (record diagnostics).
+   - Tests:
+     - Unit: WOQ matvec kernel correctness vs FP baseline.
+     - Integration: enabling WOQ changes selected path (diagnostics/stats) and output remains within tolerance.
+
+3) **Phase 5 arbitration completeness**
+   - Add explicit tests for `EnableBatching && EnableSpeculativeDecoding` under load.
+   - Ensure serving chooses the intended policy for `ThroughputFirst/LatencyFirst/Auto`.
+   - Add explicit “resource competition” tests:
+     - Same workload, batching depth>1 => speculation off in ThroughputFirst.
+     - Low load, LatencyFirst => speculation on with configured depth.
+
+### P1 — Strongly recommended for production readiness
+4) Add integration test for paged selection (Phase 2) to prove the optimizer chooses `PagedCachedMultiHeadAttention` when enabled.
+5) Add integration test for FP16 auto selection (Phase 3).
+6) Add concurrency test for sessions (Phase 4).
+7) Add adapter/task-switch KV reset test (Phase 9).
+
+### P2 — Diagnostics/test completeness
+8) Add diagnostics assertion tests (Phase 0) with `AIDOTNET_DIAGNOSTICS=1` and expected decision entries.
+9) Add a minimal “selection report” surface for tests only (internal), if needed to avoid brittle string checks.
+
+---
+
+## Verification matrix (what we run to maintain 100% confidence)
+
+**Build**
+- `dotnet build AiDotNet.sln -c Release`
+
+**Targeted tests (fast, must pass before pushing)**
+- Inference optimizer + sessions: `dotnet test tests/AiDotNet.Tests/AiDotNetTests.csproj -c Release --filter FullyQualifiedName~InferenceOptimizerTests`
+- Session integration: `dotnet test tests/AiDotNet.Tests/AiDotNetTests.csproj -c Release --filter FullyQualifiedName~InferenceSessionIntegrationTests`
+- Paged attention: `dotnet test tests/AiDotNet.Tests/AiDotNetTests.csproj -c Release --filter FullyQualifiedName~PagedKVCacheTests|FullyQualifiedName~PagedAttentionKernelTests`
+- Serving batching: `dotnet test tests/AiDotNet.Tests/AiDotNetTests.csproj -c Release --filter FullyQualifiedName~ContinuousBatchingTests`
+- Serving integration: `dotnet test tests/AiDotNet.Serving.Tests/AiDotNet.Serving.Tests.csproj -c Release`
+
+**Full test suite**
+- `dotnet test tests/AiDotNet.Tests/AiDotNetTests.csproj -c Release`
+  - Note: currently fails due to unrelated JIT/time-series/regression failures; those must be resolved or explicitly quarantined outside PR #433 before claiming repo-wide “100% green”.
diff --git a/docs/PR433_REVIEW_WORKFLOW.md b/docs/PR433_REVIEW_WORKFLOW.md
new file mode 100644
index 000000000..61128b572
--- /dev/null
+++ b/docs/PR433_REVIEW_WORKFLOW.md
@@ -0,0 +1,15 @@
+## PR#433 Review Workflow (Unresolved Comments)
+
+This repo’s PR#433 work must be completed using the following loop, **in order**, until there are no unresolved comments left:
+
+1) Use the **GitHub GraphQL API** (via `gh api graphql`) to fetch **unresolved review threads** for PR#433.
+2) Take the next unresolved thread (oldest first unless specified otherwise).
+3) Implement the required code/docs change in the repo.
+4) **Before moving to the next thread**:
+   - If it’s a normal review thread: resolve it via the GraphQL `resolveReviewThread` mutation.
+   - If it’s a code-scanning / bot thread that cannot be manually resolved: add a reply comment to the thread describing what was fixed, so we can track progress.
+
+Notes:
+- Do not batch multiple threads in one pass; always follow the “fix → resolve/reply → next” sequence.
+- Keep the public API surface minimal per the facade philosophy; prefer `internal` and nested session types.
+
diff --git a/examples/JitCompiler/BasicUsageExample.cs b/examples/JitCompiler/BasicUsageExample.cs
index 008403957..a359b8ff3 100644
--- a/examples/JitCompiler/BasicUsageExample.cs
+++ b/examples/JitCompiler/BasicUsageExample.cs
@@ -205,7 +205,7 @@ public static void CachingExample()
             OperationType = OperationType.ReLU
         };
 
-        var (compiled1, stats1) = jit.CompileWithStats(relu1, new List<ComputationNode<float>> { input1 });
+        var (_, stats1) = jit.CompileWithStats(relu1, new List<ComputationNode<float>> { input1 });
         Console.WriteLine($"First compilation:");
         Console.WriteLine($"  Cache hit: {stats1.CacheHit}");
         Console.WriteLine($"  Compilation time: {stats1.CompilationTime.TotalMilliseconds:F2}ms\n");
@@ -219,7 +219,7 @@ public static void CachingExample()
             OperationType = OperationType.ReLU
         };
 
-        var (compiled2, stats2) = jit.CompileWithStats(relu2, new List<ComputationNode<float>> { input2 });
+        var (_, stats2) = jit.CompileWithStats(relu2, new List<ComputationNode<float>> { input2 });
         Console.WriteLine($"Second compilation (same structure):");
         Console.WriteLine($"  Cache hit: {stats2.CacheHit}");
         Console.WriteLine($"  Compilation time: {stats2.CompilationTime.TotalMilliseconds:F2}ms\n");
@@ -232,7 +232,7 @@ public static void CachingExample()
             OperationType = OperationType.Sigmoid
         };
 
-        var (compiled3, stats3) = jit.CompileWithStats(sigmoid2, new List<ComputationNode<float>> { input2 });
+        var (_, stats3) = jit.CompileWithStats(sigmoid2, new List<ComputationNode<float>> { input2 });
         Console.WriteLine($"Third compilation (different structure):");
         Console.WriteLine($"  Cache hit: {stats3.CacheHit}");
         Console.WriteLine($"  Compilation time: {stats3.CompilationTime.TotalMilliseconds:F2}ms\n");
diff --git a/src/AiDotNet.Serving/Controllers/InferenceController.cs b/src/AiDotNet.Serving/Controllers/InferenceController.cs
index b33fc4bb2..d445a1810 100644
--- a/src/AiDotNet.Serving/Controllers/InferenceController.cs
+++ b/src/AiDotNet.Serving/Controllers/InferenceController.cs
@@ -135,6 +135,12 @@ public async Task<IActionResult> Predict(string modelName, [FromBody] Prediction
         catch (ArgumentException ex)
         {
             _logger.LogError(ex, "Invalid argument during prediction for model '{ModelName}'", modelName);
+
+            if (ex.Message.Contains("maximum allowed when batching is disabled", StringComparison.OrdinalIgnoreCase))
+            {
+                return StatusCode(StatusCodes.Status413PayloadTooLarge, new { error = ex.Message });
+            }
+
             return BadRequest(new { error = $"Invalid input: {ex.Message}" });
         }
         catch (Exception ex)
@@ -149,24 +155,118 @@ public async Task<IActionResult> Predict(string modelName, [FromBody] Prediction
     /// </summary>
     private async Task<double[][]> PredictWithType<T>(string modelName, double[][] features)
     {
+        string effectiveModelName = ResolveModelNameWithAdapter(modelName);
+        var model = _modelRepository.GetModel<T>(effectiveModelName) ?? _modelRepository.GetModel<T>(modelName);
+        if (model == null)
+        {
+            string attemptedNames = effectiveModelName != modelName
+                ? $"'{effectiveModelName}' (with adapter) or '{modelName}'"
+                : $"'{modelName}'";
+            throw new InvalidOperationException($"Model {attemptedNames} was not found.");
+        }
+
+        // Respect per-model inference configuration: bypass batching when disabled.
+        if (model is AiDotNet.Serving.Models.IServableModelInferenceOptions opts && !opts.EnableBatching)
+        {
+            const int MaxUnbatchedItems = 1000;
+            if (features.Length > MaxUnbatchedItems)
+            {
+                _logger.LogWarning(
+                    "Rejected large unbatched request ({Count} items) for model '{ModelName}' (batching disabled)",
+                    features.Length,
+                    modelName);
+
+                throw new ArgumentException(
+                    $"Request batch size ({features.Length}) exceeds the maximum allowed when batching is disabled ({MaxUnbatchedItems}). " +
+                    $"Enable batching for model '{modelName}' or split the request into smaller batches.");
+            }
+
+            _logger.LogDebug(
+                "Batching disabled for model '{ModelName}', processing {Count} items individually",
+                modelName,
+                features.Length);
+
+            var predictions = new double[features.Length][];
+            for (int i = 0; i < features.Length; i++)
+            {
+                var inputVector = ConvertToVector<T>(features[i]);
+                var resultVector = model.Predict(inputVector);
+                predictions[i] = ConvertFromVector(resultVector);
+            }
+
+            return predictions;
+        }
+
         // Queue all requests first to enable batching
         var tasks = features.Select(featureArray =>
         {
             var inputVector = ConvertToVector<T>(featureArray);
-            return _requestBatcher.QueueRequest(modelName, inputVector);
+            return _requestBatcher.QueueRequest(effectiveModelName, inputVector);
         }).ToArray();
 
         // Await all requests together
         var resultVectors = await Task.WhenAll(tasks);
 
         // Convert results back to double arrays
-        var predictions = new double[resultVectors.Length][];
+        var batchedPredictions = new double[resultVectors.Length][];
         for (int i = 0; i < resultVectors.Length; i++)
         {
-            predictions[i] = ConvertFromVector(resultVectors[i]);
+            batchedPredictions[i] = ConvertFromVector(resultVectors[i]);
+        }
+
+        return batchedPredictions;
+    }
+
+    private string ResolveModelNameWithAdapter(string modelName)
+    {
+        // Multi-LoRA / adapter routing (serving-first): select a pre-loaded model variant via request header.
+        // This keeps adapter details out of the public model facade while enabling per-request selection.
+        if (Request?.Headers == null)
+        {
+            _logger.LogDebug("No request headers available; routing to base model '{ModelName}'.", modelName);
+            return modelName;
+        }
+
+        if (!Request.Headers.TryGetValue("X-AiDotNet-Lora", out var adapterValues) &&
+            !Request.Headers.TryGetValue("X-AiDotNet-Adapter", out adapterValues))
+        {
+            _logger.LogDebug("No adapter header present; routing to base model '{ModelName}'.", modelName);
+            return modelName;
+        }
+
+        var adapterId = adapterValues.ToString()?.Trim();
+        if (string.IsNullOrWhiteSpace(adapterId) || adapterId.Length > 64 || !IsSafeAdapterId(adapterId))
+        {
+            if (!string.IsNullOrWhiteSpace(adapterId))
+            {
+                string reason = adapterId.Length > 64 ? "TooLong" : (!IsSafeAdapterId(adapterId) ? "UnsafeCharacters" : "EmptyOrWhitespace");
+                var level = adapterId.Length > 64 || !IsSafeAdapterId(adapterId) ? LogLevel.Warning : LogLevel.Debug;
+                _logger.Log(level,
+                    "Ignoring invalid adapter ID '{AdapterId}' for model '{ModelName}' (reason: {Reason}).",
+                    adapterId,
+                    modelName,
+                    reason);
+            }
+            return modelName;
+        }
+
+        _logger.LogDebug("Routing to adapter model '{EffectiveModelName}'.", $"{modelName}__{adapterId}");
+        return $"{modelName}__{adapterId}";
+    }
+
+    private static bool IsSafeAdapterId(string adapterId)
+    {
+        for (int i = 0; i < adapterId.Length; i++)
+        {
+            char c = adapterId[i];
+            bool ok = (c >= 'a' && c <= 'z') ||
+                      (c >= 'A' && c <= 'Z') ||
+                      (c >= '0' && c <= '9') ||
+                      c == '-' || c == '_' || c == '.';
+            if (!ok) return false;
         }
 
-        return predictions;
+        return true;
     }
 
     /// <summary>
diff --git a/src/AiDotNet.Serving/Models/IServableModelInferenceOptions.cs b/src/AiDotNet.Serving/Models/IServableModelInferenceOptions.cs
new file mode 100644
index 000000000..42fb632b0
--- /dev/null
+++ b/src/AiDotNet.Serving/Models/IServableModelInferenceOptions.cs
@@ -0,0 +1,11 @@
+namespace AiDotNet.Serving.Models;
+
+/// <summary>
+/// Internal serving-only inference options derived from the model's facade configuration.
+/// </summary>
+internal interface IServableModelInferenceOptions
+{
+    bool EnableBatching { get; }
+    bool EnableSpeculativeDecoding { get; }
+}
+
diff --git a/src/AiDotNet.Serving/Models/ServableModelWrapper.cs b/src/AiDotNet.Serving/Models/ServableModelWrapper.cs
index 36dfb6154..1f37de1d3 100644
--- a/src/AiDotNet.Serving/Models/ServableModelWrapper.cs
+++ b/src/AiDotNet.Serving/Models/ServableModelWrapper.cs
@@ -8,13 +8,15 @@ namespace AiDotNet.Serving.Models;
 /// This allows any model with a Predict method to be served via the REST API.
 /// </summary>
 /// <typeparam name="T">The numeric type used by the model</typeparam>
-public class ServableModelWrapper<T> : IServableModel<T>
+public class ServableModelWrapper<T> : IServableModel<T>, IServableModelInferenceOptions
 {
     private readonly Func<Vector<T>, Vector<T>> _predictFunc;
     private readonly Func<Matrix<T>, Matrix<T>>? _predictBatchFunc;
     private readonly string _modelName;
     private readonly int _inputDimension;
     private readonly int _outputDimension;
+    private readonly bool _enableBatching;
+    private readonly bool _enableSpeculativeDecoding;
 
     /// <summary>
     /// Initializes a new instance of the ServableModelWrapper with custom prediction functions.
@@ -24,18 +26,24 @@ public class ServableModelWrapper<T> : IServableModel<T>
     /// <param name="outputDimension">The number of output dimensions</param>
     /// <param name="predictFunc">Function to perform single prediction</param>
     /// <param name="predictBatchFunc">Optional function to perform batch prediction. If not provided, batch prediction will use multiple single predictions.</param>
+    /// <param name="enableBatching">Whether this model supports serving-side batching.</param>
+    /// <param name="enableSpeculativeDecoding">Whether this model supports speculative decoding in serving/session workflows.</param>
     public ServableModelWrapper(
         string modelName,
         int inputDimension,
         int outputDimension,
         Func<Vector<T>, Vector<T>> predictFunc,
-        Func<Matrix<T>, Matrix<T>>? predictBatchFunc = null)
+        Func<Matrix<T>, Matrix<T>>? predictBatchFunc = null,
+        bool enableBatching = true,
+        bool enableSpeculativeDecoding = false)
     {
         _modelName = modelName ?? throw new ArgumentNullException(nameof(modelName));
         _inputDimension = inputDimension;
         _outputDimension = outputDimension;
         _predictFunc = predictFunc ?? throw new ArgumentNullException(nameof(predictFunc));
         _predictBatchFunc = predictBatchFunc;
+        _enableBatching = enableBatching;
+        _enableSpeculativeDecoding = enableSpeculativeDecoding;
     }
 
     /// <summary>
@@ -52,6 +60,8 @@ public ServableModelWrapper(
         _modelName = modelName ?? throw new ArgumentNullException(nameof(modelName));
         _inputDimension = inputDimension;
         _outputDimension = 1; // Regression models typically output a single value
+        _enableBatching = true;
+        _enableSpeculativeDecoding = false;
 
         if (regressionModel == null)
         {
@@ -135,4 +145,7 @@ public Matrix<T> PredictBatch(Matrix<T> inputs)
 
         return result;
     }
+
+    bool IServableModelInferenceOptions.EnableBatching => _enableBatching;
+    bool IServableModelInferenceOptions.EnableSpeculativeDecoding => _enableSpeculativeDecoding;
 }
diff --git a/src/AiDotNet.Serving/Services/ModelStartupService.cs b/src/AiDotNet.Serving/Services/ModelStartupService.cs
index 956b0fc33..166e8b661 100644
--- a/src/AiDotNet.Serving/Services/ModelStartupService.cs
+++ b/src/AiDotNet.Serving/Services/ModelStartupService.cs
@@ -200,6 +200,10 @@ private void LoadTypedModel<T>(string name, string path)
         var modelResult = new PredictionModelResult<T, Matrix<T>, Vector<T>>();
         modelResult.LoadFromFile(path);
 
+        var inferenceConfig = modelResult.GetInferenceOptimizationConfigForServing();
+        bool enableBatching = inferenceConfig?.EnableBatching ?? true;
+        bool enableSpeculativeDecoding = inferenceConfig?.EnableSpeculativeDecoding ?? false;
+
         // Get dimensions from the model metadata
         var metadata = modelResult.GetModelMetadata();
         var inputDim = metadata.FeatureCount > 0 ? metadata.FeatureCount : 1;
@@ -267,7 +271,9 @@ private void LoadTypedModel<T>(string name, string path)
             inputDim,
             outputDim,
             predictFunc,
-            predictBatchFunc);
+            predictBatchFunc,
+            enableBatching: enableBatching,
+            enableSpeculativeDecoding: enableSpeculativeDecoding);
 
         // Register with the repository
         var success = _modelRepository.LoadModel(name, servableModel, path);
diff --git a/src/AiDotNet.Tensors/Engines/GpuEngine.cs b/src/AiDotNet.Tensors/Engines/GpuEngine.cs
index 238454243..2ebed18cd 100644
--- a/src/AiDotNet.Tensors/Engines/GpuEngine.cs
+++ b/src/AiDotNet.Tensors/Engines/GpuEngine.cs
@@ -1048,8 +1048,8 @@ public GpuEngine(AdaptiveThresholds thresholds)
 
         try
         {
-            // Create ILGPU context
-            _context = Context.CreateDefault();
+            // Create ILGPU context with Algorithms extension enabled for RoundToEven support
+            _context = Context.Create(builder => builder.Default().EnableAlgorithms());
 
             // Try to get preferred device (GPU over CPU)
             var device = _context.GetPreferredDevice(preferCPU: false);
diff --git a/src/AiDotNet.Tensors/Engines/Optimization/CacheOptimizer.cs b/src/AiDotNet.Tensors/Engines/Optimization/CacheOptimizer.cs
new file mode 100644
index 000000000..46e832af9
--- /dev/null
+++ b/src/AiDotNet.Tensors/Engines/Optimization/CacheOptimizer.cs
@@ -0,0 +1,209 @@
+using System;
+using System.Runtime.CompilerServices;
+
+namespace AiDotNet.Tensors.Engines.Optimization
+{
+    /// <summary>
+    /// Provides CPU cache optimization utilities including prefetching and cache-aware algorithms.
+    /// These utilities help maximize cache efficiency for tensor operations.
+    /// </summary>
+    public static class CacheOptimizer
+    {
+        /// <summary>
+        /// Gets the optimal block size for the L1 cache
+        /// </summary>
+        public static int L1BlockSize => 64; // 64 floats = 256 bytes, typical L1 cache line
+
+        /// <summary>
+        /// Gets the optimal block size for the L2 cache
+        /// </summary>
+        public static int L2BlockSize => 512; // Tuned for typical L2 cache
+
+        /// <summary>
+        /// Gets the optimal block size for the L3 cache
+        /// </summary>
+        public static int L3BlockSize => 2048; // Tuned for typical L3 cache
+
+        /// <summary>
+        /// Prefetch data for reading (temporal locality)
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void Prefetch(void* address)
+        {
+            // This hints the CPU to fetch data into cache
+            // Note: .NET JIT may or may not honor this depending on platform
+#if NET5_0_OR_GREATER
+            if (System.Runtime.Intrinsics.X86.Sse.IsSupported)
+            {
+                System.Runtime.Intrinsics.X86.Sse.Prefetch0(address);
+            }
+#endif
+            // No-op on non-x86 platforms, if SSE is not supported, or on .NET Framework
+        }
+
+        /// <summary>
+        /// Prefetch data with low temporal locality (won't pollute cache)
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void PrefetchNonTemporal(void* address)
+        {
+#if NET5_0_OR_GREATER
+            if (System.Runtime.Intrinsics.X86.Sse.IsSupported)
+            {
+                System.Runtime.Intrinsics.X86.Sse.PrefetchNonTemporal(address);
+            }
+#endif
+            // No-op on non-x86 platforms, if SSE is not supported, or on .NET Framework
+        }
+
+        /// <summary>
+        /// Computes optimal tiling parameters for a 2D operation
+        /// </summary>
+        public static (int tileM, int tileN, int tileK) ComputeOptimalTiling(
+            int m, int n, int k,
+            int elementSize = 4) // 4 bytes for float
+        {
+            var caps = PlatformDetector.Capabilities;
+            int l1Size = caps.L1CacheSize;
+
+            // We want tiles to fit in L1 cache
+            // For matrix multiplication: tileM * tileK + tileK * tileN + tileM * tileN elements
+            // Simplified: aim for sqrt(L1Size / (3 * elementSize)) per dimension
+
+            int maxTileSize = (int)Math.Sqrt(l1Size / (3.0 * elementSize));
+
+            // Round down to nearest power of 2 for better memory alignment
+            int tileSize = 1;
+            while (tileSize * 2 <= maxTileSize)
+            {
+                tileSize *= 2;
+            }
+
+            // Ensure minimum tile size
+            tileSize = Math.Max(tileSize, 16);
+
+            // Adjust based on actual matrix dimensions
+            int tileM = Math.Min(tileSize, m);
+            int tileN = Math.Min(tileSize, n);
+            int tileK = Math.Min(tileSize, k);
+
+            return (tileM, tileN, tileK);
+        }
+
+        /// <summary>
+        /// Cache-aware transpose of a 2D array
+        /// </summary>
+        public static unsafe void TransposeBlocked(float* src, float* dst, int rows, int cols)
+        {
+            const int blockSize = 32; // Tuned for cache line size
+
+            for (int i = 0; i < rows; i += blockSize)
+            {
+                for (int j = 0; j < cols; j += blockSize)
+                {
+                    int iMax = Math.Min(i + blockSize, rows);
+                    int jMax = Math.Min(j + blockSize, cols);
+
+                    // Transpose block
+                    for (int ii = i; ii < iMax; ii++)
+                    {
+                        for (int jj = j; jj < jMax; jj++)
+                        {
+                            dst[jj * rows + ii] = src[ii * cols + jj];
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Cache-aware copying with prefetching
+        /// </summary>
+        public static unsafe void CopyWithPrefetch(float* src, float* dst, int length)
+        {
+            const int prefetchDistance = 64; // Prefetch 64 elements ahead
+
+            int i = 0;
+
+            // Main loop with prefetching
+            for (; i < length - prefetchDistance; i++)
+            {
+                Prefetch(src + i + prefetchDistance);
+                dst[i] = src[i];
+            }
+
+            // Remaining elements without prefetch
+            for (; i < length; i++)
+            {
+                dst[i] = src[i];
+            }
+        }
+
+        /// <summary>
+        /// Z-order (Morton order) indexing for better cache locality in 2D access patterns
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static int MortonEncode(int x, int y)
+        {
+            return (Part1By1(y) << 1) | Part1By1(x);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int Part1By1(int n)
+        {
+            n &= 0x0000ffff;
+            n = (n ^ (n << 8)) & 0x00ff00ff;
+            n = (n ^ (n << 4)) & 0x0f0f0f0f;
+            n = (n ^ (n << 2)) & 0x33333333;
+            n = (n ^ (n << 1)) & 0x55555555;
+            return n;
+        }
+
+        /// <summary>
+        /// Converts Z-order index back to 2D coordinates
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static (int x, int y) MortonDecode(int code)
+        {
+            return (Compact1By1(code), Compact1By1(code >> 1));
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int Compact1By1(int n)
+        {
+            n &= 0x55555555;
+            n = (n ^ (n >> 1)) & 0x33333333;
+            n = (n ^ (n >> 2)) & 0x0f0f0f0f;
+            n = (n ^ (n >> 4)) & 0x00ff00ff;
+            n = (n ^ (n >> 8)) & 0x0000ffff;
+            return n;
+        }
+
+        /// <summary>
+        /// Estimates the number of cache misses for a given access pattern
+        /// </summary>
+        public static double EstimateCacheMisses(int dataSize, int accessStride, int cacheSize, int cacheLineSize)
+        {
+            // Simple cache miss estimation model
+            int elementsPerLine = cacheLineSize / sizeof(float);
+            int totalLines = (dataSize + elementsPerLine - 1) / elementsPerLine;
+            int cacheLinesAvailable = cacheSize / cacheLineSize;
+
+            if (accessStride <= elementsPerLine)
+            {
+                // Sequential access - good cache behavior
+                return totalLines * 0.1; // ~10% miss rate for sequential
+            }
+            else if (totalLines <= cacheLinesAvailable)
+            {
+                // Data fits in cache
+                return totalLines * 0.05; // ~5% miss rate
+            }
+            else
+            {
+                // Poor cache behavior - strided access with cache thrashing
+                return totalLines * 0.8; // ~80% miss rate
+            }
+        }
+    }
+}
diff --git a/src/AiDotNet.Tensors/Engines/Optimization/LoopOptimizer.cs b/src/AiDotNet.Tensors/Engines/Optimization/LoopOptimizer.cs
new file mode 100644
index 000000000..32a83a9a9
--- /dev/null
+++ b/src/AiDotNet.Tensors/Engines/Optimization/LoopOptimizer.cs
@@ -0,0 +1,218 @@
+using System;
+using System.Runtime.CompilerServices;
+
+namespace AiDotNet.Tensors.Engines.Optimization
+{
+    /// <summary>
+    /// Provides loop optimization techniques including tiling and vectorization hints.
+    /// These utilities help maximize performance for tensor operations.
+    /// </summary>
+    public static class LoopOptimizer
+    {
+        /// <summary>
+        /// 2D loop tiling for matrix operations
+        /// </summary>
+        public static void Tile2D(
+            int rows, int cols,
+            int tileSize,
+            Action<int, int, int, int> tileAction)
+        {
+            for (int i = 0; i < rows; i += tileSize)
+            {
+                int iEnd = Math.Min(i + tileSize, rows);
+
+                for (int j = 0; j < cols; j += tileSize)
+                {
+                    int jEnd = Math.Min(j + tileSize, cols);
+
+                    tileAction(i, iEnd, j, jEnd);
+                }
+            }
+        }
+
+        /// <summary>
+        /// 3D loop tiling for tensor operations
+        /// </summary>
+        public static void Tile3D(
+            int dim1, int dim2, int dim3,
+            int tileSize1, int tileSize2, int tileSize3,
+            Action<int, int, int, int, int, int> tileAction)
+        {
+            for (int i = 0; i < dim1; i += tileSize1)
+            {
+                int iEnd = Math.Min(i + tileSize1, dim1);
+
+                for (int j = 0; j < dim2; j += tileSize2)
+                {
+                    int jEnd = Math.Min(j + tileSize2, dim2);
+
+                    for (int k = 0; k < dim3; k += tileSize3)
+                    {
+                        int kEnd = Math.Min(k + tileSize3, dim3);
+
+                        tileAction(i, iEnd, j, jEnd, k, kEnd);
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Loop unrolling hint - processes elements in groups
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void UnrollBy4(int length, Action<int> action)
+        {
+            int i = 0;
+            int unrolledLength = length & ~3; // Round down to multiple of 4
+
+            // Unrolled loop
+            for (; i < unrolledLength; i += 4)
+            {
+                action(i);
+                action(i + 1);
+                action(i + 2);
+                action(i + 3);
+            }
+
+            // Remainder
+            for (; i < length; i++)
+            {
+                action(i);
+            }
+        }
+
+        /// <summary>
+        /// Loop unrolling by 8 for better SIMD utilization
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static void UnrollBy8(int length, Action<int> action)
+        {
+            int i = 0;
+            int unrolledLength = length & ~7;
+
+            for (; i < unrolledLength; i += 8)
+            {
+                action(i);
+                action(i + 1);
+                action(i + 2);
+                action(i + 3);
+                action(i + 4);
+                action(i + 5);
+                action(i + 6);
+                action(i + 7);
+            }
+
+            for (; i < length; i++)
+            {
+                action(i);
+            }
+        }
+
+        /// <summary>
+        /// Strip mining - breaks loop into chunks for better cache utilization
+        /// </summary>
+        public static void StripMine(int totalSize, int stripSize, Action<int, int> stripAction)
+        {
+            for (int start = 0; start < totalSize; start += stripSize)
+            {
+                int end = Math.Min(start + stripSize, totalSize);
+                stripAction(start, end);
+            }
+        }
+
+        /// <summary>
+        /// Loop fusion helper - executes multiple operations in a single pass
+        /// </summary>
+        public static void Fuse(int length, params Action<int>[] actions)
+        {
+            for (int i = 0; i < length; i++)
+            {
+                foreach (var action in actions)
+                {
+                    action(i);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Loop interchange optimization for better cache locality
+        /// Automatically chooses better loop order based on access pattern
+        /// </summary>
+        public static void OptimalOrder2D(
+            int rows, int cols,
+            bool rowMajorAccess,
+            Action<int, int> action)
+        {
+            if (rowMajorAccess)
+            {
+                // Standard order for row-major access
+                for (int i = 0; i < rows; i++)
+                {
+                    for (int j = 0; j < cols; j++)
+                    {
+                        action(i, j);
+                    }
+                }
+            }
+            else
+            {
+                // Interchanged order for column-major access
+                for (int j = 0; j < cols; j++)
+                {
+                    for (int i = 0; i < rows; i++)
+                    {
+                        action(i, j);
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Parallel loop tiling with work stealing
+        /// </summary>
+        public static void ParallelTile2D(
+            int rows, int cols,
+            int tileSize,
+            Action<int, int, int, int> tileAction)
+        {
+            int numTilesI = (rows + tileSize - 1) / tileSize;
+            int numTilesJ = (cols + tileSize - 1) / tileSize;
+            int totalTiles = numTilesI * numTilesJ;
+
+            System.Threading.Tasks.Parallel.For(0, totalTiles, tileIdx =>
+            {
+                int ti = tileIdx / numTilesJ;
+                int tj = tileIdx % numTilesJ;
+
+                int iStart = ti * tileSize;
+                int iEnd = Math.Min(iStart + tileSize, rows);
+
+                int jStart = tj * tileSize;
+                int jEnd = Math.Min(jStart + tileSize, cols);
+
+                tileAction(iStart, iEnd, jStart, jEnd);
+            });
+        }
+
+        /// <summary>
+        /// Automatically determines optimal tile size based on data dimensions and cache size
+        /// </summary>
+        public static int DetermineOptimalTileSize(int dimension, int elementSize = 4)
+        {
+            var caps = PlatformDetector.Capabilities;
+            int l1Size = caps.L1CacheSize;
+
+            // Aim to fit two tiles in L1 cache (one read, one write)
+            int maxElements = l1Size / (2 * elementSize);
+
+            // Find power of 2 that fits
+            int tileSize = 16; // Minimum tile size
+            while (tileSize * tileSize * 2 < maxElements && tileSize < dimension)
+            {
+                tileSize *= 2;
+            }
+
+            return Math.Min(tileSize, dimension);
+        }
+    }
+}
diff --git a/src/AiDotNet.Tensors/Engines/Optimization/PerformanceProfiler.cs b/src/AiDotNet.Tensors/Engines/Optimization/PerformanceProfiler.cs
new file mode 100644
index 000000000..23b6cca9e
--- /dev/null
+++ b/src/AiDotNet.Tensors/Engines/Optimization/PerformanceProfiler.cs
@@ -0,0 +1,203 @@
+using System;
+using System.Collections.Concurrent;
+using System.Diagnostics;
+using System.Linq;
+
+namespace AiDotNet.Tensors.Engines.Optimization
+{
+    /// <summary>
+    /// Thread-safe performance profiler for tracking operation timings and statistics.
+    /// Use this to measure and optimize tensor operations.
+    /// </summary>
+    public sealed class PerformanceProfiler
+    {
+        private static readonly Lazy<PerformanceProfiler> _instance =
+            new Lazy<PerformanceProfiler>(() => new PerformanceProfiler());
+
+        private readonly ConcurrentDictionary<string, OperationStats> _stats;
+
+        /// <summary>
+        /// Gets the singleton instance of the profiler
+        /// </summary>
+        public static PerformanceProfiler Instance => _instance.Value;
+
+        /// <summary>
+        /// Enable or disable profiling (disabled by default for production)
+        /// </summary>
+        public bool Enabled { get; set; }
+
+        private PerformanceProfiler()
+        {
+            _stats = new ConcurrentDictionary<string, OperationStats>();
+            Enabled = false;
+        }
+
+        /// <summary>
+        /// Starts profiling an operation
+        /// </summary>
+        public IDisposable Profile(string operationName)
+        {
+            if (!Enabled)
+                return DisposableHelper.Empty;
+
+            return new ProfileScope(this, operationName);
+        }
+
+        /// <summary>
+        /// Records a completed operation
+        /// </summary>
+        internal void RecordOperation(string operationName, long elapsedTicks, long memoryBytes = 0)
+        {
+            if (!Enabled)
+                return;
+
+            var updated = _stats.AddOrUpdate(
+                operationName,
+                _ => new OperationStats
+                {
+                    OperationName = operationName,
+                    CallCount = 1,
+                    TotalTicks = elapsedTicks,
+                    MinTicks = elapsedTicks,
+                    MaxTicks = elapsedTicks,
+                    TotalMemoryBytes = memoryBytes
+                },
+                (_, existing) =>
+                {
+                    // Return new object to ensure thread-safety (avoid mutating existing object)
+                    return new OperationStats
+                    {
+                        OperationName = existing.OperationName,
+                        CallCount = existing.CallCount + 1,
+                        TotalTicks = existing.TotalTicks + elapsedTicks,
+                        MinTicks = Math.Min(existing.MinTicks, elapsedTicks),
+                        MaxTicks = Math.Max(existing.MaxTicks, elapsedTicks),
+                         TotalMemoryBytes = existing.TotalMemoryBytes + memoryBytes
+                     };
+                 });
+
+            _ = updated.CallCount;
+        }
+
+        /// <summary>
+        /// Gets statistics for a specific operation
+        /// </summary>
+        public OperationStats? GetStats(string operationName)
+        {
+            return _stats.TryGetValue(operationName, out var stats) ? stats : null;
+        }
+
+        /// <summary>
+        /// Gets all recorded statistics
+        /// </summary>
+        public OperationStats[] GetAllStats()
+        {
+            return _stats.Values.OrderByDescending(s => s.TotalMilliseconds).ToArray();
+        }
+
+        /// <summary>
+        /// Clears all statistics
+        /// </summary>
+        public void Clear()
+        {
+            _stats.Clear();
+        }
+
+        /// <summary>
+        /// Generates a performance report
+        /// </summary>
+        public string GenerateReport()
+        {
+            var stats = GetAllStats();
+            if (stats.Length == 0)
+                return "No profiling data available.";
+
+            var report = new System.Text.StringBuilder();
+            report.AppendLine("=== Performance Profile Report ===");
+            report.AppendLine();
+            report.AppendLine($"{"Operation",-40} {"Calls",10} {"Total (ms)",12} {"Avg (ms)",12} {"Min (ms)",12} {"Max (ms)",12} {"Memory (MB)",12}");
+            report.AppendLine(new string('-', 120));
+
+            foreach (var stat in stats)
+            {
+                report.AppendLine($"{stat.OperationName,-40} {stat.CallCount,10} {stat.TotalMilliseconds,12:F3} " +
+                                $"{stat.AverageMilliseconds,12:F3} {stat.MinMilliseconds,12:F3} " +
+                                $"{stat.MaxMilliseconds,12:F3} {stat.TotalMemoryMB,12:F2}");
+            }
+
+            report.AppendLine();
+            report.AppendLine($"Total operations: {stats.Length}");
+            report.AppendLine($"Total time: {stats.Sum(s => s.TotalMilliseconds):F3} ms");
+
+            return report.ToString();
+        }
+
+        private class ProfileScope : IDisposable
+        {
+            private readonly PerformanceProfiler _profiler;
+            private readonly string _operationName;
+            private readonly Stopwatch _stopwatch;
+            private readonly long _startMemory;
+
+            public ProfileScope(PerformanceProfiler profiler, string operationName)
+            {
+                _profiler = profiler;
+                _operationName = operationName;
+#if NET6_0_OR_GREATER
+                // Use per-thread allocation tracking for more accurate measurements
+                _startMemory = GC.GetAllocatedBytesForCurrentThread();
+#else
+                // Fallback for .NET Framework - less accurate but functional
+                _startMemory = GC.GetTotalMemory(false);
+#endif
+                _stopwatch = Stopwatch.StartNew();
+            }
+
+            public void Dispose()
+            {
+                _stopwatch.Stop();
+#if NET6_0_OR_GREATER
+                long endMemory = GC.GetAllocatedBytesForCurrentThread();
+#else
+                long endMemory = GC.GetTotalMemory(false);
+#endif
+                // Only report positive memory delta (allocation), ignore GC effects
+                long memoryDelta = Math.Max(0, endMemory - _startMemory);
+
+                _profiler.RecordOperation(_operationName, _stopwatch.ElapsedTicks, memoryDelta);
+            }
+        }
+
+        private static class DisposableHelper
+        {
+            public static readonly IDisposable Empty = new EmptyDisposable();
+
+            private class EmptyDisposable : IDisposable
+            {
+                public void Dispose() { }
+            }
+        }
+    }
+
+    /// <summary>
+    /// Statistics for a profiled operation
+    /// </summary>
+    public class OperationStats
+    {
+        public string OperationName { get; set; } = string.Empty;
+        public long CallCount { get; set; }
+        public long TotalTicks { get; set; }
+        public long MinTicks { get; set; }
+        public long MaxTicks { get; set; }
+        public long TotalMemoryBytes { get; set; }
+
+        public double TotalMilliseconds => TotalTicks * 1000.0 / Stopwatch.Frequency;
+        public double AverageMilliseconds => CallCount > 0 ? TotalMilliseconds / CallCount : 0;
+        public double MinMilliseconds => MinTicks * 1000.0 / Stopwatch.Frequency;
+        public double MaxMilliseconds => MaxTicks * 1000.0 / Stopwatch.Frequency;
+        public double TotalMemoryMB => TotalMemoryBytes / (1024.0 * 1024.0);
+        public double AverageMemoryMB => CallCount > 0 ? TotalMemoryMB / CallCount : 0;
+
+        public double ThroughputOpsPerSecond => TotalMilliseconds > 0 ? CallCount / (TotalMilliseconds / 1000.0) : 0;
+    }
+}
diff --git a/src/AiDotNet.Tensors/Engines/PlatformDetector.cs b/src/AiDotNet.Tensors/Engines/PlatformDetector.cs
new file mode 100644
index 000000000..07a607fc1
--- /dev/null
+++ b/src/AiDotNet.Tensors/Engines/PlatformDetector.cs
@@ -0,0 +1,284 @@
+using System;
+using System.Runtime.InteropServices;
+#if NET5_0_OR_GREATER
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics.Arm;
+#endif
+
+namespace AiDotNet.Tensors.Engines
+{
+    /// <summary>
+    /// Provides platform and hardware capability detection for optimizing
+    /// tensor operations based on available SIMD instructions and cache sizes.
+    /// </summary>
+    public static class PlatformDetector
+    {
+        private static readonly Lazy<PlatformCapabilities> _capabilities =
+            new Lazy<PlatformCapabilities>(DetectCapabilities);
+
+        /// <summary>
+        /// Gets the detected platform capabilities
+        /// </summary>
+        public static PlatformCapabilities Capabilities => _capabilities.Value;
+
+        private static PlatformCapabilities DetectCapabilities()
+        {
+            var caps = new PlatformCapabilities
+            {
+                Architecture = RuntimeInformation.ProcessArchitecture,
+                OSDescription = RuntimeInformation.OSDescription,
+                FrameworkDescription = RuntimeInformation.FrameworkDescription,
+                ProcessorCount = Environment.ProcessorCount,
+                Is64BitProcess = Environment.Is64BitProcess,
+                Is64BitOperatingSystem = Environment.Is64BitOperatingSystem
+            };
+
+#if NET5_0_OR_GREATER
+            // Detect x86/x64 SIMD support
+            if (caps.Architecture == Architecture.X64 || caps.Architecture == Architecture.X86)
+            {
+                caps.HasSSE = Sse.IsSupported;
+                caps.HasSSE2 = Sse2.IsSupported;
+                caps.HasSSE3 = Sse3.IsSupported;
+                caps.HasSSSE3 = Ssse3.IsSupported;
+                caps.HasSSE41 = Sse41.IsSupported;
+                caps.HasSSE42 = Sse42.IsSupported;
+                caps.HasAVX = Avx.IsSupported;
+                caps.HasAVX2 = Avx2.IsSupported;
+                caps.HasFMA = Fma.IsSupported;
+                caps.HasAVX512F = Avx512F.IsSupported;
+                caps.HasAVX512BW = Avx512BW.IsSupported;
+                caps.HasAVX512DQ = Avx512DQ.IsSupported;
+                // AVX-512VL is implied when other AVX-512 extensions are supported
+                caps.HasAVX512VL = Avx512F.VL.IsSupported;
+            }
+
+            // Detect ARM SIMD support
+            if (caps.Architecture == Architecture.Arm64 || caps.Architecture == Architecture.Arm)
+            {
+                caps.HasNeon = AdvSimd.IsSupported;
+                caps.HasArmBase = ArmBase.IsSupported;
+                caps.HasArmAes = System.Runtime.Intrinsics.Arm.Aes.IsSupported;
+                caps.HasArmCrc32 = Crc32.IsSupported;
+                caps.HasArmDp = AdvSimd.Arm64.IsSupported;
+            }
+#endif
+
+            // Detect cache sizes (approximate based on typical values)
+            caps.L1CacheSize = EstimateL1CacheSize(caps.Architecture);
+            caps.L2CacheSize = EstimateL2CacheSize(caps.Architecture);
+            caps.L3CacheSize = EstimateL3CacheSize(caps.Architecture);
+
+            // Check for GPU support (requires additional libraries)
+            caps.HasCudaSupport = DetectCudaSupport();
+            caps.HasOpenCLSupport = DetectOpenCLSupport();
+
+            return caps;
+        }
+
+        private static int EstimateL1CacheSize(Architecture arch)
+        {
+            // Typical L1 cache size is 32KB per core
+            return 32 * 1024;
+        }
+
+        private static int EstimateL2CacheSize(Architecture arch)
+        {
+            // Typical L2 cache size is 256KB per core
+            return 256 * 1024;
+        }
+
+        private static int EstimateL3CacheSize(Architecture arch)
+        {
+            // Typical L3 cache size is 2-8MB shared
+            return 8 * 1024 * 1024;
+        }
+
+        /// <summary>
+        /// Checks whether CUDA driver support appears to be available on this machine.
+        ///
+        /// Notes:
+        /// - This attempts a lightweight runtime check for the CUDA driver library (not the toolkit).
+        /// - It is intentionally conservative: if we cannot verify CUDA driver presence, we return false.
+        /// - This does not guarantee that higher-level CUDA compute is usable (device selection, permissions, etc.).
+        /// </summary>
+        private static bool DetectCudaSupport()
+        {
+            if (!Environment.Is64BitProcess)
+                return false;
+
+#if NET5_0_OR_GREATER
+            // Prefer checking for the CUDA driver library:
+            // - Windows: nvcuda.dll
+            // - Linux: libcuda.so.1 (or libcuda.so)
+            try
+            {
+                if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+                {
+                    return TryLoadNativeLibrary("nvcuda.dll");
+                }
+
+                if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+                {
+                    return TryLoadNativeLibrary("libcuda.so.1") || TryLoadNativeLibrary("libcuda.so");
+                }
+
+                return false;
+            }
+            catch
+            {
+                return false;
+            }
+#else
+            // .NET Framework builds are conservative here; implement a native check if/when CUDA support is added for net471.
+            return false;
+#endif
+        }
+
+#if NET5_0_OR_GREATER
+        private static bool TryLoadNativeLibrary(string name)
+        {
+            if (string.IsNullOrWhiteSpace(name))
+                return false;
+
+            if (NativeLibrary.TryLoad(name, out var handle))
+            {
+                NativeLibrary.Free(handle);
+                return true;
+            }
+
+            return false;
+        }
+#endif
+
+        private static bool DetectOpenCLSupport()
+        {
+            // This would require OpenCL library calls
+            // For now, we'll return false (requires additional implementation)
+            return false;
+        }
+
+        /// <summary>
+        /// Gets a human-readable description of the platform capabilities
+        /// </summary>
+        public static string GetCapabilitiesDescription()
+        {
+            var caps = Capabilities;
+            var desc = new System.Text.StringBuilder();
+
+            desc.AppendLine($"Platform: {caps.OSDescription}");
+            desc.AppendLine($"Architecture: {caps.Architecture}");
+            desc.AppendLine($"Framework: {caps.FrameworkDescription}");
+            desc.AppendLine($"Processor Count: {caps.ProcessorCount}");
+            desc.AppendLine($"64-bit Process: {caps.Is64BitProcess}");
+            desc.AppendLine();
+
+            if (caps.Architecture == Architecture.X64 || caps.Architecture == Architecture.X86)
+            {
+                desc.AppendLine("x86/x64 SIMD Support:");
+                desc.AppendLine($"  SSE: {caps.HasSSE}");
+                desc.AppendLine($"  SSE2: {caps.HasSSE2}");
+                desc.AppendLine($"  SSE3: {caps.HasSSE3}");
+                desc.AppendLine($"  SSSE3: {caps.HasSSSE3}");
+                desc.AppendLine($"  SSE4.1: {caps.HasSSE41}");
+                desc.AppendLine($"  SSE4.2: {caps.HasSSE42}");
+                desc.AppendLine($"  AVX: {caps.HasAVX}");
+                desc.AppendLine($"  AVX2: {caps.HasAVX2}");
+                desc.AppendLine($"  FMA: {caps.HasFMA}");
+                desc.AppendLine($"  AVX-512F: {caps.HasAVX512F}");
+                desc.AppendLine($"  AVX-512BW: {caps.HasAVX512BW}");
+                desc.AppendLine($"  AVX-512DQ: {caps.HasAVX512DQ}");
+                desc.AppendLine($"  AVX-512VL: {caps.HasAVX512VL}");
+            }
+
+            if (caps.Architecture == Architecture.Arm64 || caps.Architecture == Architecture.Arm)
+            {
+                desc.AppendLine("ARM SIMD Support:");
+                desc.AppendLine($"  NEON: {caps.HasNeon}");
+                desc.AppendLine($"  ARM Base: {caps.HasArmBase}");
+                desc.AppendLine($"  AES: {caps.HasArmAes}");
+                desc.AppendLine($"  CRC32: {caps.HasArmCrc32}");
+                desc.AppendLine($"  Dot Product: {caps.HasArmDp}");
+            }
+
+            desc.AppendLine();
+            desc.AppendLine("GPU Support:");
+            desc.AppendLine($"  CUDA: {caps.HasCudaSupport}");
+            desc.AppendLine($"  OpenCL: {caps.HasOpenCLSupport}");
+
+            return desc.ToString();
+        }
+    }
+
+    /// <summary>
+    /// Represents detected platform capabilities including SIMD support,
+    /// cache sizes, and GPU availability.
+    /// </summary>
+    public class PlatformCapabilities
+    {
+        // Basic platform info
+        public Architecture Architecture { get; set; }
+        public string OSDescription { get; set; } = string.Empty;
+        public string FrameworkDescription { get; set; } = string.Empty;
+        public int ProcessorCount { get; set; }
+        public bool Is64BitProcess { get; set; }
+        public bool Is64BitOperatingSystem { get; set; }
+
+        // x86/x64 SIMD capabilities
+        public bool HasSSE { get; set; }
+        public bool HasSSE2 { get; set; }
+        public bool HasSSE3 { get; set; }
+        public bool HasSSSE3 { get; set; }
+        public bool HasSSE41 { get; set; }
+        public bool HasSSE42 { get; set; }
+        public bool HasAVX { get; set; }
+        public bool HasAVX2 { get; set; }
+        public bool HasFMA { get; set; }
+        public bool HasAVX512F { get; set; }
+        public bool HasAVX512BW { get; set; }
+        public bool HasAVX512DQ { get; set; }
+        public bool HasAVX512VL { get; set; }
+
+        // ARM SIMD capabilities
+        public bool HasNeon { get; set; }
+        public bool HasArmBase { get; set; }
+        public bool HasArmAes { get; set; }
+        public bool HasArmCrc32 { get; set; }
+        public bool HasArmDp { get; set; }
+
+        // Cache information
+        public int L1CacheSize { get; set; }
+        public int L2CacheSize { get; set; }
+        public int L3CacheSize { get; set; }
+
+        // GPU capabilities
+        public bool HasCudaSupport { get; set; }
+        public bool HasOpenCLSupport { get; set; }
+
+        /// <summary>
+        /// Returns the best available SIMD instruction set
+        /// </summary>
+        public string GetBestSimdSet()
+        {
+            if (Architecture == Architecture.X64 || Architecture == Architecture.X86)
+            {
+                if (HasAVX512F) return "AVX-512";
+                if (HasAVX2) return "AVX2";
+                if (HasAVX) return "AVX";
+                if (HasSSE42) return "SSE4.2";
+                if (HasSSE41) return "SSE4.1";
+                if (HasSSSE3) return "SSSE3";
+                if (HasSSE3) return "SSE3";
+                if (HasSSE2) return "SSE2";
+                if (HasSSE) return "SSE";
+            }
+            else if (Architecture == Architecture.Arm64 || Architecture == Architecture.Arm)
+            {
+                if (HasArmDp) return "NEON with Dot Product";
+                if (HasNeon) return "NEON";
+            }
+
+            return "None";
+        }
+    }
+}
diff --git a/src/AiDotNet.Tensors/Engines/Simd/SimdKernels.cs b/src/AiDotNet.Tensors/Engines/Simd/SimdKernels.cs
new file mode 100644
index 000000000..8b4e94788
--- /dev/null
+++ b/src/AiDotNet.Tensors/Engines/Simd/SimdKernels.cs
@@ -0,0 +1,424 @@
+using System;
+using System.Runtime.CompilerServices;
+#if NET5_0_OR_GREATER
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Runtime.Intrinsics.Arm;
+#endif
+
+namespace AiDotNet.Tensors.Engines.Simd
+{
+    /// <summary>
+    /// SIMD-optimized kernels for common operations.
+    /// Provides hardware-accelerated implementations using AVX2, SSE, and ARM NEON.
+    /// Falls back to scalar operations on .NET Framework.
+    /// </summary>
+    public static class SimdKernels
+    {
+        /// <summary>
+        /// SIMD-optimized vector addition for float arrays
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void VectorAdd(float* a, float* b, float* result, int length)
+        {
+            int i = 0;
+
+#if NET5_0_OR_GREATER
+            // AVX2 path (8 floats at a time)
+            if (Avx2.IsSupported && length >= 8)
+            {
+                int simdLength = length & ~7; // Round down to multiple of 8
+                for (; i < simdLength; i += 8)
+                {
+                    var va = Avx.LoadVector256(a + i);
+                    var vb = Avx.LoadVector256(b + i);
+                    var vr = Avx.Add(va, vb);
+                    Avx.Store(result + i, vr);
+                }
+            }
+            // SSE path (4 floats at a time)
+            else if (Sse.IsSupported && length >= 4)
+            {
+                int simdLength = length & ~3; // Round down to multiple of 4
+                for (; i < simdLength; i += 4)
+                {
+                    var va = Sse.LoadVector128(a + i);
+                    var vb = Sse.LoadVector128(b + i);
+                    var vr = Sse.Add(va, vb);
+                    Sse.Store(result + i, vr);
+                }
+            }
+            // NEON path (4 floats at a time)
+            else if (AdvSimd.IsSupported && length >= 4)
+            {
+                int simdLength = length & ~3;
+                for (; i < simdLength; i += 4)
+                {
+                    var va = AdvSimd.LoadVector128(a + i);
+                    var vb = AdvSimd.LoadVector128(b + i);
+                    var vr = AdvSimd.Add(va, vb);
+                    AdvSimd.Store(result + i, vr);
+                }
+            }
+#endif
+
+            // Scalar fallback for remaining elements
+            for (; i < length; i++)
+            {
+                result[i] = a[i] + b[i];
+            }
+        }
+
+        /// <summary>
+        /// SIMD-optimized vector multiplication
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void VectorMultiply(float* a, float* b, float* result, int length)
+        {
+            int i = 0;
+
+#if NET5_0_OR_GREATER
+            if (Avx2.IsSupported && length >= 8)
+            {
+                int simdLength = length & ~7;
+                for (; i < simdLength; i += 8)
+                {
+                    var va = Avx.LoadVector256(a + i);
+                    var vb = Avx.LoadVector256(b + i);
+                    var vr = Avx.Multiply(va, vb);
+                    Avx.Store(result + i, vr);
+                }
+            }
+            else if (Sse.IsSupported && length >= 4)
+            {
+                int simdLength = length & ~3;
+                for (; i < simdLength; i += 4)
+                {
+                    var va = Sse.LoadVector128(a + i);
+                    var vb = Sse.LoadVector128(b + i);
+                    var vr = Sse.Multiply(va, vb);
+                    Sse.Store(result + i, vr);
+                }
+            }
+            else if (AdvSimd.IsSupported && length >= 4)
+            {
+                int simdLength = length & ~3;
+                for (; i < simdLength; i += 4)
+                {
+                    var va = AdvSimd.LoadVector128(a + i);
+                    var vb = AdvSimd.LoadVector128(b + i);
+                    var vr = AdvSimd.Multiply(va, vb);
+                    AdvSimd.Store(result + i, vr);
+                }
+            }
+#endif
+
+            for (; i < length; i++)
+            {
+                result[i] = a[i] * b[i];
+            }
+        }
+
+        /// <summary>
+        /// SIMD-optimized dot product
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe float DotProduct(float* a, float* b, int length)
+        {
+            float sum = 0.0f;
+            int i = 0;
+
+#if NET5_0_OR_GREATER
+            if (Avx2.IsSupported && length >= 8)
+            {
+                var vsum = Vector256<float>.Zero;
+                int simdLength = length & ~7;
+
+                for (; i < simdLength; i += 8)
+                {
+                    var va = Avx.LoadVector256(a + i);
+                    var vb = Avx.LoadVector256(b + i);
+                    vsum = Fma.IsSupported
+                        ? Fma.MultiplyAdd(va, vb, vsum)
+                        : Avx.Add(vsum, Avx.Multiply(va, vb));
+                }
+
+                // Horizontal sum of vector
+                var high = Avx.ExtractVector128(vsum, 1);
+                var low = vsum.GetLower();
+                var sum128 = Sse.Add(high, low);
+
+                // Further reduce 4 floats to 1
+                var shuf = Sse.Shuffle(sum128, sum128, 0b_11_10_11_10);
+                sum128 = Sse.Add(sum128, shuf);
+                shuf = Sse.Shuffle(sum128, sum128, 0b_01_01_01_01);
+                sum128 = Sse.Add(sum128, shuf);
+                sum = sum128.ToScalar();
+            }
+            else if (Sse.IsSupported && length >= 4)
+            {
+                var vsum = Vector128<float>.Zero;
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var va = Sse.LoadVector128(a + i);
+                    var vb = Sse.LoadVector128(b + i);
+                    vsum = Sse.Add(vsum, Sse.Multiply(va, vb));
+                }
+
+                // Horizontal sum
+                var shuf = Sse.Shuffle(vsum, vsum, 0b_11_10_11_10);
+                vsum = Sse.Add(vsum, shuf);
+                shuf = Sse.Shuffle(vsum, vsum, 0b_01_01_01_01);
+                vsum = Sse.Add(vsum, shuf);
+                sum = vsum.ToScalar();
+            }
+            else if (AdvSimd.IsSupported && length >= 4)
+            {
+                var vsum = Vector128<float>.Zero;
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var va = AdvSimd.LoadVector128(a + i);
+                    var vb = AdvSimd.LoadVector128(b + i);
+                    vsum = AdvSimd.Add(vsum, AdvSimd.Multiply(va, vb));
+                }
+
+                // Horizontal sum for ARM - use AddPairwise on ARM64, manual fallback otherwise
+                if (AdvSimd.Arm64.IsSupported)
+                {
+                    // AddPairwise reduces pairs: [a,b,c,d] -> [a+b, c+d, ?, ?] (lower 64 bits)
+                    var pairSum = AdvSimd.Arm64.AddPairwise(vsum, vsum);
+                    var finalSum = AdvSimd.Arm64.AddPairwiseScalar(pairSum.GetLower());
+                    sum = finalSum.ToScalar();
+                }
+                else
+                {
+                    sum = vsum.GetElement(0) + vsum.GetElement(1) + vsum.GetElement(2) + vsum.GetElement(3);
+                }
+            }
+#endif
+
+            // Scalar remainder
+            for (; i < length; i++)
+            {
+                sum += a[i] * b[i];
+            }
+
+            return sum;
+        }
+
+        /// <summary>
+        /// SIMD-optimized scalar multiply-add (result = a + b * scalar)
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void ScalarMultiplyAdd(float* a, float* b, float scalar, float* result, int length)
+        {
+            int i = 0;
+
+#if NET5_0_OR_GREATER
+            if (Avx2.IsSupported && length >= 8)
+            {
+                var vscalar = Vector256.Create(scalar);
+                int simdLength = length & ~7;
+
+                for (; i < simdLength; i += 8)
+                {
+                    var va = Avx.LoadVector256(a + i);
+                    var vb = Avx.LoadVector256(b + i);
+                    var vr = Fma.IsSupported
+                        ? Fma.MultiplyAdd(vb, vscalar, va)
+                        : Avx.Add(va, Avx.Multiply(vb, vscalar));
+                    Avx.Store(result + i, vr);
+                }
+            }
+            else if (Sse.IsSupported && length >= 4)
+            {
+                var vscalar = Vector128.Create(scalar);
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var va = Sse.LoadVector128(a + i);
+                    var vb = Sse.LoadVector128(b + i);
+                    var vr = Sse.Add(va, Sse.Multiply(vb, vscalar));
+                    Sse.Store(result + i, vr);
+                }
+            }
+            else if (AdvSimd.IsSupported && length >= 4)
+            {
+                var vscalar = Vector128.Create(scalar);
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var va = AdvSimd.LoadVector128(a + i);
+                    var vb = AdvSimd.LoadVector128(b + i);
+                    var vr = AdvSimd.Add(va, AdvSimd.Multiply(vb, vscalar));
+                    AdvSimd.Store(result + i, vr);
+                }
+            }
+#endif
+
+            for (; i < length; i++)
+            {
+                result[i] = a[i] + b[i] * scalar;
+            }
+        }
+
+        /// <summary>
+        /// SIMD-optimized ReLU activation
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void ReLU(float* input, float* output, int length)
+        {
+            int i = 0;
+
+#if NET5_0_OR_GREATER
+            if (Avx2.IsSupported && length >= 8)
+            {
+                var vzero = Vector256<float>.Zero;
+                int simdLength = length & ~7;
+
+                for (; i < simdLength; i += 8)
+                {
+                    var v = Avx.LoadVector256(input + i);
+                    var vr = Avx.Max(v, vzero);
+                    Avx.Store(output + i, vr);
+                }
+            }
+            else if (Sse.IsSupported && length >= 4)
+            {
+                var vzero = Vector128<float>.Zero;
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var v = Sse.LoadVector128(input + i);
+                    var vr = Sse.Max(v, vzero);
+                    Sse.Store(output + i, vr);
+                }
+            }
+            else if (AdvSimd.IsSupported && length >= 4)
+            {
+                var vzero = Vector128<float>.Zero;
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var v = AdvSimd.LoadVector128(input + i);
+                    var vr = AdvSimd.Max(v, vzero);
+                    AdvSimd.Store(output + i, vr);
+                }
+            }
+#endif
+
+            for (; i < length; i++)
+            {
+                output[i] = Math.Max(0.0f, input[i]);
+            }
+        }
+
+        /// <summary>
+        /// SIMD-optimized element-wise exponential
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void Exp(float* input, float* output, int length)
+        {
+            // Note: True SIMD exp requires approximation algorithms
+            // This is a scalar fallback - can be optimized with SVML or custom approximations
+            for (int i = 0; i < length; i++)
+            {
+#if NET5_0_OR_GREATER
+                output[i] = MathF.Exp(input[i]);
+#else
+                output[i] = (float)Math.Exp(input[i]);
+#endif
+            }
+        }
+
+        /// <summary>
+        /// SIMD-optimized sum reduction
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe float Sum(float* data, int length)
+        {
+            float sum = 0.0f;
+            int i = 0;
+
+#if NET5_0_OR_GREATER
+            if (Avx2.IsSupported && length >= 8)
+            {
+                var vsum = Vector256<float>.Zero;
+                int simdLength = length & ~7;
+
+                for (; i < simdLength; i += 8)
+                {
+                    var v = Avx.LoadVector256(data + i);
+                    vsum = Avx.Add(vsum, v);
+                }
+
+                var high = Avx.ExtractVector128(vsum, 1);
+                var low = vsum.GetLower();
+                var sum128 = Sse.Add(high, low);
+
+                var shuf = Sse.Shuffle(sum128, sum128, 0b_11_10_11_10);
+                sum128 = Sse.Add(sum128, shuf);
+                shuf = Sse.Shuffle(sum128, sum128, 0b_01_01_01_01);
+                sum128 = Sse.Add(sum128, shuf);
+                sum = sum128.ToScalar();
+            }
+            else if (Sse.IsSupported && length >= 4)
+            {
+                var vsum = Vector128<float>.Zero;
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var v = Sse.LoadVector128(data + i);
+                    vsum = Sse.Add(vsum, v);
+                }
+
+                var shuf = Sse.Shuffle(vsum, vsum, 0b_11_10_11_10);
+                vsum = Sse.Add(vsum, shuf);
+                shuf = Sse.Shuffle(vsum, vsum, 0b_01_01_01_01);
+                vsum = Sse.Add(vsum, shuf);
+                sum = vsum.ToScalar();
+            }
+            else if (AdvSimd.IsSupported && length >= 4)
+            {
+                var vsum = Vector128<float>.Zero;
+                int simdLength = length & ~3;
+
+                for (; i < simdLength; i += 4)
+                {
+                    var v = AdvSimd.LoadVector128(data + i);
+                    vsum = AdvSimd.Add(vsum, v);
+                }
+
+                // Horizontal sum for ARM - use AddPairwise on ARM64, manual fallback otherwise
+                if (AdvSimd.Arm64.IsSupported)
+                {
+                    // AddPairwise reduces pairs: [a,b,c,d] -> [a+b, c+d, ?, ?] (lower 64 bits)
+                    var pairSum = AdvSimd.Arm64.AddPairwise(vsum, vsum);
+                    var finalSum = AdvSimd.Arm64.AddPairwiseScalar(pairSum.GetLower());
+                    sum = finalSum.ToScalar();
+                }
+                else
+                {
+                    sum = vsum.GetElement(0) + vsum.GetElement(1) + vsum.GetElement(2) + vsum.GetElement(3);
+                }
+            }
+#endif
+
+            for (; i < length; i++)
+            {
+                sum += data[i];
+            }
+
+            return sum;
+        }
+    }
+}
diff --git a/src/AiDotNet.Tensors/LinearAlgebra/TensorBase.cs b/src/AiDotNet.Tensors/LinearAlgebra/TensorBase.cs
index 15f8438ec..cdf447e35 100644
--- a/src/AiDotNet.Tensors/LinearAlgebra/TensorBase.cs
+++ b/src/AiDotNet.Tensors/LinearAlgebra/TensorBase.cs
@@ -55,6 +55,16 @@ public abstract class TensorBase<T>
     /// </summary>
     public int Rank => Shape.Length;
 
+    /// <summary>
+    /// Gets direct access to the underlying data array for high-performance operations.
+    /// </summary>
+    /// <remarks>
+    /// <para><b>Warning:</b> This property provides direct access to internal storage.
+    /// Modifications to this array will affect the tensor. Use with caution in
+    /// performance-critical code paths like SIMD operations.</para>
+    /// </remarks>
+    public T[] Data => _data.Data;
+
     /// <summary>
     /// Initializes a new instance of the TensorBase class with the specified shape.
     /// </summary>
diff --git a/src/AiDotNet.Tensors/LinearAlgebra/VectorBase.cs b/src/AiDotNet.Tensors/LinearAlgebra/VectorBase.cs
index c777a263d..3096d76cd 100644
--- a/src/AiDotNet.Tensors/LinearAlgebra/VectorBase.cs
+++ b/src/AiDotNet.Tensors/LinearAlgebra/VectorBase.cs
@@ -74,6 +74,16 @@ protected VectorBase(IEnumerable<T> values)
     /// </remarks>
     public int Length => _data.Length;
 
+    /// <summary>
+    /// Gets direct access to the underlying data array for high-performance operations.
+    /// </summary>
+    /// <remarks>
+    /// <para><b>Warning:</b> This property provides direct access to internal storage.
+    /// Modifications to this array will affect the vector. Use with caution in
+    /// performance-critical code paths like SIMD operations.</para>
+    /// </remarks>
+    public T[] Data => _data;
+
     /// <summary>
     /// Gets a value indicating whether the vector contains no elements.
     /// </summary>
diff --git a/src/AiDotNet.csproj b/src/AiDotNet.csproj
index b261eaf28..0c249075a 100644
--- a/src/AiDotNet.csproj
+++ b/src/AiDotNet.csproj
@@ -3,6 +3,7 @@
 	  <TargetFrameworks>net8.0;net471</TargetFrameworks>
 		<ImplicitUsings>enable</ImplicitUsings>
 		<Nullable>enable</Nullable>
+		<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
 		<GeneratePackageOnBuild>True</GeneratePackageOnBuild>
 		<Version>0.0.5-preview</Version>
 		<Title>Ai for .Net</Title>
diff --git a/src/Configuration/InferenceOptimizationConfig.cs b/src/Configuration/InferenceOptimizationConfig.cs
index 25dc47f1a..4e4221e2d 100644
--- a/src/Configuration/InferenceOptimizationConfig.cs
+++ b/src/Configuration/InferenceOptimizationConfig.cs
@@ -116,6 +116,97 @@ public class InferenceOptimizationConfig
     /// <value>Cache eviction policy (default: LRU).</value>
     public CacheEvictionPolicy KVCacheEvictionPolicy { get; set; } = CacheEvictionPolicy.LRU;
 
+    /// <summary>
+    /// Gets or sets whether to use a sliding window KV-cache for long contexts.
+    /// </summary>
+    /// <remarks>
+    /// When enabled, only the most recent <see cref="KVCacheWindowSize"/> tokens are kept.
+    /// This is a common industry approach for long-context serving to cap memory usage.
+    /// </remarks>
+    public bool UseSlidingWindowKVCache { get; set; } = false;
+
+    /// <summary>
+    /// Gets or sets the sliding window size in tokens when <see cref="UseSlidingWindowKVCache"/> is enabled.
+    /// </summary>
+    /// <value>Window size in tokens (default: 1024).</value>
+    public int KVCacheWindowSize { get; set; } = 1024;
+
+    /// <summary>
+    /// Gets or sets the precision used for KV-cache storage.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Industry-standard serving stores KV-cache in FP16 to halve memory usage and increase cache capacity.
+    /// The default <see cref="KVCachePrecisionMode.Auto"/> selects FP16 when KV-cache is enabled and the numeric
+    /// type supports it.
+    /// </para>
+    /// <para>
+    /// <b>For Beginners:</b> This setting controls how much memory your model uses during autoregressive inference.
+    ///
+    /// - FP16: Uses about half the memory (recommended default)
+    /// - FP32: Uses more memory but can be slightly more numerically accurate
+    ///
+    /// Most production systems prefer FP16 KV-cache for capacity and throughput.
+    /// </para>
+    /// </remarks>
+    public KVCachePrecisionMode KVCachePrecision { get; set; } = KVCachePrecisionMode.Auto;
+
+    /// <summary>
+    /// Gets or sets the quantization mode used for KV-cache storage.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// KV-cache quantization can further reduce memory beyond FP16 by storing keys/values in int8 with scaling.
+    /// This is an opt-in advanced feature because it can introduce small numerical error.
+    /// </para>
+    /// <para><b>For Beginners:</b>
+    /// - None (default): Store KV-cache in FP16/FP32 depending on <see cref="KVCachePrecision"/>.
+    /// - Int8: Store KV-cache in 8-bit integers to save memory (advanced).
+    /// </para>
+    /// </remarks>
+    public KVCacheQuantizationMode KVCacheQuantization { get; set; } = KVCacheQuantizationMode.None;
+
+    /// <summary>
+    /// Gets or sets whether to use a paged KV-cache backend (vLLM-style) for long-context / multi-sequence serving.
+    /// </summary>
+    /// <remarks>
+    /// When enabled, the system may choose a paged cache implementation that allocates KV memory in fixed-size blocks.
+    /// This is the industry-standard approach for high-throughput serving where many sequences are active concurrently.
+    /// Users can disable this to force the traditional contiguous KV-cache.
+    /// </remarks>
+    public bool EnablePagedKVCache { get; set; } = true;
+
+    /// <summary>
+    /// Gets or sets the block size (in tokens) for the paged KV-cache when enabled.
+    /// </summary>
+    /// <remarks>
+    /// Common values are 16 or 32. Smaller blocks reduce internal fragmentation; larger blocks reduce table overhead.
+    /// </remarks>
+    public int PagedKVCacheBlockSize { get; set; } = 16;
+
+    #endregion
+
+    #region Attention Settings
+
+    /// <summary>
+    /// Gets or sets whether Flash Attention is enabled (when applicable).
+    /// </summary>
+    /// <remarks>
+    /// Flash Attention computes exact attention without materializing the full N×N attention matrix,
+    /// reducing memory bandwidth pressure and improving throughput for long sequences.
+    /// </remarks>
+    public bool EnableFlashAttention { get; set; } = true;
+
+    /// <summary>
+    /// Gets or sets how attention masking should be applied for optimized attention implementations.
+    /// </summary>
+    /// <remarks>
+    /// - Auto: Applies causal masking for known autoregressive models (e.g., text generation), otherwise no mask.
+    /// - Disabled: Never applies causal masking.
+    /// - Causal: Always applies causal masking (GPT-style).
+    /// </remarks>
+    public AttentionMaskingMode AttentionMasking { get; set; } = AttentionMaskingMode.Auto;
+
     #endregion
 
     #region Batching Settings
@@ -251,6 +342,18 @@ public void Validate()
             throw new InvalidOperationException(
                 $"SpeculationDepth must be non-negative. Got: {SpeculationDepth}");
         }
+
+        if (UseSlidingWindowKVCache && KVCacheWindowSize <= 0)
+        {
+            throw new InvalidOperationException(
+                $"KVCacheWindowSize must be positive when UseSlidingWindowKVCache is enabled. Got: {KVCacheWindowSize}");
+        }
+
+        if (EnablePagedKVCache && PagedKVCacheBlockSize <= 0)
+        {
+            throw new InvalidOperationException(
+                $"PagedKVCacheBlockSize must be positive when EnablePagedKVCache is enabled. Got: {PagedKVCacheBlockSize}");
+        }
     }
 
     #endregion
@@ -294,9 +397,14 @@ public void Validate()
     ///
     /// Options:
     /// - <b>NGram:</b> Simple statistical model (fast, no GPU needed)
-    /// - <b>SmallNeural:</b> Smaller version of the main model (more accurate drafts)
+    /// - <b>SmallNeural:</b> Smaller companion model (more accurate drafts)
     ///
     /// NGram is usually sufficient and has near-zero overhead.
+    ///
+    /// <para>
+    /// <b>Note:</b> Small neural draft models require an external companion model. In the MVP, the library
+    /// falls back to <see cref="DraftModelType.NGram"/> when a companion draft model is not available.
+    /// </para>
     /// </para>
     /// </remarks>
     public DraftModelType DraftModelType { get; set; } = DraftModelType.NGram;
@@ -331,9 +439,110 @@ public void Validate()
     /// </remarks>
     public bool UseTreeSpeculation { get; set; } = false;
 
+    /// <summary>
+    /// Gets or sets the policy for when speculative decoding should run.
+    /// </summary>
+    /// <remarks>
+    /// Auto is recommended: it can back off speculative decoding under high load (e.g., large batches)
+    /// to avoid throughput regressions, while still enabling it for latency-sensitive scenarios.
+    /// </remarks>
+    public SpeculationPolicy SpeculationPolicy { get; set; } = SpeculationPolicy.Auto;
+
+    /// <summary>
+    /// Gets or sets the speculative decoding method.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// The default <see cref="SpeculativeMethod.Auto"/> currently selects <see cref="SpeculativeMethod.ClassicDraftModel"/>.
+    /// </para>
+    /// <para>
+    /// <b>For Beginners:</b> This chooses the "style" of speculative decoding.
+    /// </para>
+    /// </remarks>
+    public SpeculativeMethod SpeculativeMethod { get; set; } = SpeculativeMethod.Auto;
+
+    #endregion
+
+    #region Inference Quantization (Advanced)
+
+    /// <summary>
+    /// Gets or sets whether weight-only INT8 quantization is enabled for inference.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Weight-only quantization reduces memory bandwidth and improves cache locality by storing weights in int8
+    /// with per-output scaling. Activations remain in FP32/FP16, and accumulation is performed in float.
+    /// </para>
+    /// <para>
+    /// <b>For Beginners:</b> This makes your model weights smaller so the CPU/GPU can read them faster.
+    /// </para>
+    /// <para>
+    /// This is disabled by default until validated across more layer types and kernels. When enabled, the optimizer
+    /// will apply it opportunistically and fall back safely when unsupported.
+    /// </para>
+    /// </remarks>
+    public bool EnableWeightOnlyQuantization { get; set; } = false;
+
     #endregion
 }
 
+/// <summary>
+/// Policies for enabling/disabling speculative decoding at runtime.
+/// </summary>
+public enum SpeculationPolicy
+{
+    /// <summary>
+    /// Automatically decide based on runtime conditions (recommended).
+    /// </summary>
+    Auto,
+
+    /// <summary>
+    /// Always enable speculative decoding when configured.
+    /// </summary>
+    ForceOn,
+
+    /// <summary>
+    /// Always disable speculative decoding even if enabled in config.
+    /// </summary>
+    ForceOff,
+
+    /// <summary>
+    /// Prefer speculative decoding to reduce latency, even under moderate load.
+    /// </summary>
+    LatencyFirst,
+
+    /// <summary>
+    /// Prefer throughput and stability: use speculative decoding only when conditions are ideal.
+    /// </summary>
+    ThroughputFirst
+}
+
+/// <summary>
+/// Selects the speculative decoding method.
+/// </summary>
+public enum SpeculativeMethod
+{
+    /// <summary>
+    /// Automatically select the best available method (defaults to ClassicDraftModel today).
+    /// </summary>
+    Auto,
+
+    /// <summary>
+    /// Classic draft-model speculative decoding (standard).
+    /// </summary>
+    ClassicDraftModel,
+
+    /// <summary>
+    /// Medusa-style multi-head proposals (hook for future internal implementation).
+    /// </summary>
+    Medusa,
+
+    /// <summary>
+    /// EAGLE-style enhanced draft proposals (hook for future internal implementation).
+    /// </summary>
+    Eagle
+}
+
 /// <summary>
 /// Cache eviction policies for KV cache management.
 /// </summary>
@@ -356,6 +565,65 @@ public enum DraftModelType
     NGram,
     /// <summary>Small neural network model (more accurate, uses GPU).</summary>
     SmallNeural,
-    /// <summary>Custom user-provided draft model.</summary>
+    /// <summary>Custom draft model (internal/serving integration).</summary>
     Custom
 }
+
+/// <summary>
+/// Controls how attention masking is applied for optimized attention implementations.
+/// </summary>
+public enum AttentionMaskingMode
+{
+    /// <summary>
+    /// Automatically select masking based on model/task heuristics.
+    /// </summary>
+    Auto,
+
+    /// <summary>
+    /// Do not apply causal masking.
+    /// </summary>
+    Disabled,
+
+    /// <summary>
+    /// Apply causal masking (autoregressive decoding).
+    /// </summary>
+    Causal
+}
+
+/// <summary>
+/// Controls the numeric precision of KV-cache storage.
+/// </summary>
+public enum KVCachePrecisionMode
+{
+    /// <summary>
+    /// Select an industry-standard default.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Uses FP16 when KV-cache is enabled and the numeric type supports conversion; otherwise falls back to FP32.
+    /// </para>
+    /// </remarks>
+    Auto,
+
+    /// <summary>
+    /// Store KV-cache in FP16 (half precision) to reduce memory use.
+    /// </summary>
+    Float16,
+
+    /// <summary>
+    /// Store KV-cache in FP32 (single precision) for maximal numerical fidelity.
+    /// </summary>
+    Float32
+}
+
+/// <summary>
+/// Controls optional KV-cache quantization for inference.
+/// </summary>
+public enum KVCacheQuantizationMode
+{
+    /// <summary>No quantization (default).</summary>
+    None,
+
+    /// <summary>Signed int8 quantization with scaling (advanced, opt-in).</summary>
+    Int8
+}
diff --git a/src/Helpers/DeserializationHelper.cs b/src/Helpers/DeserializationHelper.cs
index 78777efa7..a8f633e6a 100644
--- a/src/Helpers/DeserializationHelper.cs
+++ b/src/Helpers/DeserializationHelper.cs
@@ -48,6 +48,13 @@ static DeserializationHelper()
     /// </remarks>
     public static ILayer<T> CreateLayerFromType<T>(string layerType, int[] inputShape, int[] outputShape, Dictionary<string, object>? additionalParams = null)
     {
+        // Allow layerType to contain serialized constructor metadata, e.g. "MultiHeadAttentionLayer;HeadCount=8".
+        if (TryParseLayerTypeIdentifier(layerType, out var parsedTypeName, out var parsedParams))
+        {
+            layerType = parsedTypeName;
+            additionalParams = MergeParams(additionalParams, parsedParams);
+        }
+
         if (!LayerTypes.TryGetValue(layerType, out Type? openGenericType))
         {
             throw new NotSupportedException($"Layer type {layerType} is not supported for deserialization.");
@@ -88,15 +95,159 @@ public static ILayer<T> CreateLayerFromType<T>(string layerType, int[] inputShap
 
         if (genericDef == typeof(DenseLayer<>))
         {
-            // DenseLayer(int inputSize, int outputSize, IActivationFunction<T>? activationFunction = null)
-            // Use specific constructor to avoid ambiguity with vector activation constructor
+            instance = CreateDenseLayer<T>(type, inputShape, outputShape, additionalParams);
+        }
+        else if (genericDef == typeof(InputLayer<>))
+        {
+            // InputLayer(int inputSize)
+            var ctor = type.GetConstructor([typeof(int)]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find InputLayer constructor with (int).");
+            }
+
+            instance = ctor.Invoke([inputShape[0]]);
+        }
+        else if (genericDef == typeof(ReshapeLayer<>))
+        {
+            // ReshapeLayer(int[] inputShape, int[] outputShape)
+            var ctor = type.GetConstructor([typeof(int[]), typeof(int[])]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find ReshapeLayer constructor with (int[], int[]).");
+            }
+
+            instance = ctor.Invoke([inputShape, outputShape]);
+        }
+        else if (genericDef == typeof(EmbeddingLayer<>))
+        {
+            // EmbeddingLayer(int vocabularySize, int embeddingDimension)
+            int embeddingDim = outputShape[0];
+            int vocabSize = TryGetInt(additionalParams, "VocabularySize")
+                ?? TryGetInt(additionalParams, "VocabSize")
+                ?? throw new InvalidOperationException("EmbeddingLayer requires VocabularySize metadata for deserialization.");
+
+            var ctor = type.GetConstructor([typeof(int), typeof(int)]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find EmbeddingLayer constructor with (int, int).");
+            }
+            instance = ctor.Invoke([vocabSize, embeddingDim]);
+        }
+        else if (genericDef == typeof(PositionalEncodingLayer<>))
+        {
+            // PositionalEncodingLayer(int maxSequenceLength, int embeddingSize)
+            if (inputShape.Length < 2)
+            {
+                throw new InvalidOperationException("PositionalEncodingLayer requires input shape [maxSequenceLength, embeddingSize].");
+            }
+
+            int maxSeqLen = inputShape[0];
+            int embDim = inputShape[1];
+
+            var ctor = type.GetConstructor([typeof(int), typeof(int)]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find PositionalEncodingLayer constructor with (int, int).");
+            }
+            instance = ctor.Invoke([maxSeqLen, embDim]);
+        }
+        else if (genericDef == typeof(DropoutLayer<>))
+        {
+            // DropoutLayer(double dropoutRate = 0.5)
+            double rate = TryGetDouble(additionalParams, "DropoutRate") ?? 0.5;
+            var ctor = type.GetConstructor([typeof(double)]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find DropoutLayer constructor with (double).");
+            }
+            instance = ctor.Invoke([rate]);
+        }
+        else if (genericDef == typeof(LayerNormalizationLayer<>))
+        {
+            // LayerNormalizationLayer(int featureSize, double epsilon = ...)
+            int featureSize = inputShape[0];
+            double epsilon = TryGetDouble(additionalParams, "Epsilon") ?? NumericalStabilityHelper.LargeEpsilon;
+            var ctor = type.GetConstructor([typeof(int), typeof(double)]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find LayerNormalizationLayer constructor with (int, double).");
+            }
+            instance = ctor.Invoke([featureSize, epsilon]);
+        }
+        else if (genericDef == typeof(MultiHeadAttentionLayer<>))
+        {
+            instance = CreateMultiHeadAttentionLayer<T>(type, inputShape, additionalParams);
+        }
+        else if (genericDef == typeof(SelfAttentionLayer<>))
+        {
+            // SelfAttentionLayer(int sequenceLength, int embeddingDimension, int headCount = 8, IActivationFunction<T>? = null)
+            if (inputShape.Length < 2)
+            {
+                throw new InvalidOperationException("SelfAttentionLayer requires input shape [sequenceLength, embeddingDimension].");
+            }
+
+            int seqLen = inputShape[0];
+            int embDim = inputShape[1];
+            int headCount = TryGetInt(additionalParams, "HeadCount") ?? ResolveDefaultHeadCount(embDim);
+
+            var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+            var ctor = type.GetConstructor([typeof(int), typeof(int), typeof(int), activationFuncType]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find SelfAttentionLayer constructor with (int, int, int, IActivationFunction<T>).");
+            }
+            object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+            instance = ctor.Invoke([seqLen, embDim, headCount, activation]);
+        }
+        else if (genericDef == typeof(AttentionLayer<>))
+        {
+            // AttentionLayer(int inputSize, int attentionSize, IActivationFunction<T>? = null)
+            int inputSize = inputShape[0];
+            int attentionSize = outputShape[0];
+
             var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
             var ctor = type.GetConstructor([typeof(int), typeof(int), activationFuncType]);
             if (ctor is null)
             {
-                throw new InvalidOperationException($"Cannot find DenseLayer constructor with (int, int, IActivationFunction<T>).");
+                throw new InvalidOperationException("Cannot find AttentionLayer constructor with (int, int, IActivationFunction<T>).");
             }
-            instance = ctor.Invoke([inputShape[0], outputShape[0], null]);
+            object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+            instance = ctor.Invoke([inputSize, attentionSize, activation]);
+        }
+        else if (genericDef == typeof(GraphAttentionLayer<>))
+        {
+            // GraphAttentionLayer(int inputFeatures, int outputFeatures, int numHeads = 1, double alpha = 0.2, double dropoutRate = 0.0, IActivationFunction<T>? = null)
+            int inputFeatures = inputShape[0];
+            int outputFeatures = outputShape[0];
+            int numHeads = TryGetInt(additionalParams, "NumHeads") ?? 1;
+            double alpha = TryGetDouble(additionalParams, "Alpha") ?? 0.2;
+            double dropout = TryGetDouble(additionalParams, "DropoutRate") ?? 0.0;
+
+            var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+            var ctor = type.GetConstructor([typeof(int), typeof(int), typeof(int), typeof(double), typeof(double), activationFuncType]);
+            if (ctor is null)
+            {
+                throw new InvalidOperationException("Cannot find GraphAttentionLayer constructor with expected signature.");
+            }
+            object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+            instance = ctor.Invoke([inputFeatures, outputFeatures, numHeads, alpha, dropout, activation]);
+        }
+        else if (genericDef == typeof(AiDotNet.NeuralNetworks.Attention.FlashAttentionLayer<>))
+        {
+            instance = CreateFlashAttentionLayer<T>(type, inputShape, additionalParams);
+        }
+        else if (genericDef == typeof(AiDotNet.Inference.CachedMultiHeadAttention<>))
+        {
+            instance = CreateCachedMultiHeadAttention<T>(type, inputShape, additionalParams);
+        }
+        else if (genericDef == typeof(AiDotNet.Inference.PagedCachedMultiHeadAttention<>))
+        {
+            instance = CreatePagedCachedMultiHeadAttention<T>(type, inputShape, additionalParams);
+        }
+        else if (genericDef == typeof(AiDotNet.LoRA.Adapters.MultiLoRAAdapter<>))
+        {
+            instance = CreateMultiLoRAAdapter<T>(type, inputShape, outputShape, additionalParams);
         }
         else if (genericDef == typeof(ConvolutionalLayer<>))
         {
@@ -139,42 +290,442 @@ public static ILayer<T> CreateLayerFromType<T>(string layerType, int[] inputShap
         }
         else if (genericDef == typeof(ActivationLayer<>))
         {
-            // ActivationLayer(int[] inputShape, IActivationFunction<T> activationFunction)
+            instance = CreateActivationLayer<T>(type, inputShape, additionalParams);
+        }
+        else
+        {
+            // Default: pass inputShape as first parameter
+            var ctor = type.GetConstructor([typeof(int[])]);
+            if (ctor is null)
+            {
+                throw new NotSupportedException(
+                    $"Layer type {layerType} is not supported for deserialization (no known constructor found).");
+            }
+
+            instance = ctor.Invoke([inputShape]);
+        }
+        if (instance == null)
+        {
+            throw new InvalidOperationException($"Failed to create instance of layer type {layerType}.");
+        }
+
+        return (ILayer<T>)instance;
+    }
+
+    private static object CreateDenseLayer<T>(Type type, int[] inputShape, int[] outputShape, Dictionary<string, object>? additionalParams)
+    {
+        // DenseLayer(int inputSize, int outputSize, IActivationFunction<T>? activationFunction = null)
+        // Use specific constructor to avoid ambiguity with vector activation constructor.
+        var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+        var ctor = type.GetConstructor([typeof(int), typeof(int), activationFuncType]);
+        if (ctor is null)
+        {
+            throw new InvalidOperationException("Cannot find DenseLayer constructor with (int, int, IActivationFunction<T>).");
+        }
+
+        object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+        return ctor.Invoke([inputShape[0], outputShape[0], activation]);
+    }
+
+    private static object CreateMultiHeadAttentionLayer<T>(Type type, int[] inputShape, Dictionary<string, object>? additionalParams)
+    {
+        // MultiHeadAttentionLayer(int sequenceLength, int embeddingDimension, int headCount, IActivationFunction<T>? activationFunction = null)
+        if (inputShape.Length < 2)
+        {
+            throw new InvalidOperationException("MultiHeadAttentionLayer requires input shape [sequenceLength, embeddingDimension].");
+        }
+
+        int seqLen = inputShape[0];
+        int embDim = inputShape[1];
+        int headCount = TryGetInt(additionalParams, "HeadCount") ?? ResolveDefaultHeadCount(embDim);
+
+        var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+        var ctor = type.GetConstructor([typeof(int), typeof(int), typeof(int), activationFuncType]);
+        if (ctor is null)
+        {
+            throw new InvalidOperationException("Cannot find MultiHeadAttentionLayer constructor with (int, int, int, IActivationFunction<T>).");
+        }
+
+        object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+        return ctor.Invoke([seqLen, embDim, headCount, activation]);
+    }
+
+    private static object CreateFlashAttentionLayer<T>(Type type, int[] inputShape, Dictionary<string, object>? additionalParams)
+    {
+        // FlashAttentionLayer(int sequenceLength, int embeddingDimension, int headCount, FlashAttentionConfig config, IActivationFunction<T>? activationFunction = null)
+        if (inputShape.Length < 2)
+        {
+            throw new InvalidOperationException("FlashAttentionLayer requires input shape [sequenceLength, embeddingDimension].");
+        }
+
+        int seqLen = inputShape[0];
+        int embDim = inputShape[1];
+        int headCount = TryGetInt(additionalParams, "HeadCount") ?? ResolveDefaultHeadCount(embDim);
+        bool useCausal = TryGetBool(additionalParams, "UseCausalMask") ?? false;
+
+        var flashConfig = AiDotNet.NeuralNetworks.Attention.FlashAttentionConfig.Default;
+        flashConfig.UseCausalMask = useCausal;
+
+        var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+        var ctor = type.GetConstructor([typeof(int), typeof(int), typeof(int), typeof(AiDotNet.NeuralNetworks.Attention.FlashAttentionConfig), activationFuncType]);
+        if (ctor is null)
+        {
+            throw new InvalidOperationException("Cannot find FlashAttentionLayer constructor with expected signature.");
+        }
+
+        object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+        return ctor.Invoke([seqLen, embDim, headCount, flashConfig, activation]);
+    }
+
+    private static object CreateCachedMultiHeadAttention<T>(Type type, int[] inputShape, Dictionary<string, object>? additionalParams)
+    {
+        // CachedMultiHeadAttention(int sequenceLength, int embeddingDimension, int headCount, bool useFlashAttention, int layerIndex, bool useCausalMask, IActivationFunction<T>? activationFunction = null)
+        if (inputShape.Length < 2)
+        {
+            throw new InvalidOperationException("CachedMultiHeadAttention requires input shape [sequenceLength, embeddingDimension].");
+        }
+
+        int seqLen = inputShape[0];
+        int embDim = inputShape[1];
+        int headCount = TryGetInt(additionalParams, "HeadCount") ?? ResolveDefaultHeadCount(embDim);
+        bool useFlash = TryGetBool(additionalParams, "UseFlashAttention") ?? true;
+        bool useCausal = TryGetBool(additionalParams, "UseCausalMask") ?? true;
+
+        var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+        var ctor = type.GetConstructor([typeof(int), typeof(int), typeof(int), typeof(bool), typeof(int), typeof(bool), activationFuncType]);
+        if (ctor is null)
+        {
+            throw new InvalidOperationException("Cannot find CachedMultiHeadAttention constructor with expected signature.");
+        }
+
+        object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+        return ctor.Invoke([seqLen, embDim, headCount, useFlash, 0, useCausal, activation]);
+    }
+
+    private static object CreatePagedCachedMultiHeadAttention<T>(Type type, int[] inputShape, Dictionary<string, object>? additionalParams)
+    {
+        // PagedCachedMultiHeadAttention(int sequenceLength, int embeddingDimension, int headCount, bool useCausalMask, IActivationFunction<T>? activationFunction = null)
+        if (inputShape.Length < 2)
+        {
+            throw new InvalidOperationException("PagedCachedMultiHeadAttention requires input shape [sequenceLength, embeddingDimension].");
+        }
+
+        int seqLen = inputShape[0];
+        int embDim = inputShape[1];
+        int headCount = TryGetInt(additionalParams, "HeadCount") ?? ResolveDefaultHeadCount(embDim);
+        bool useCausal = TryGetBool(additionalParams, "UseCausalMask") ?? true;
+
+        var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+        var ctor = type.GetConstructor([typeof(int), typeof(int), typeof(int), typeof(bool), activationFuncType]);
+        if (ctor is null)
+        {
+            throw new InvalidOperationException("Cannot find PagedCachedMultiHeadAttention constructor with expected signature.");
+        }
+
+        object? activation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", activationFuncType);
+        return ctor.Invoke([seqLen, embDim, headCount, useCausal, activation]);
+    }
+
+    private static object CreateMultiLoRAAdapter<T>(Type type, int[] inputShape, int[] outputShape, Dictionary<string, object>? additionalParams)
+    {
+        // MultiLoRAAdapter(ILayer<T> baseLayer, string defaultTaskName, int defaultRank, double alpha, bool freezeBaseLayer)
+        bool freezeBaseLayer = TryGetBool(additionalParams, "FreezeBaseLayer") ?? true;
+
+        string? encodedBaseLayerId = additionalParams?.TryGetValue("BaseLayerTypeId", out var baseType) == true ? baseType as string : null;
+        string baseLayerIdentifier = !string.IsNullOrWhiteSpace(encodedBaseLayerId)
+            ? Uri.UnescapeDataString(encodedBaseLayerId)
+            : "DenseLayer`1";
+
+        var baseLayer = CreateLayerFromType<T>(baseLayerIdentifier, inputShape, outputShape, null);
+
+        static string[] ParseList(string? raw)
+        {
+            if (string.IsNullOrWhiteSpace(raw)) return Array.Empty<string>();
+            return raw!.Split(new[] { '|' }, StringSplitOptions.RemoveEmptyEntries);
+        }
+
+        static int[] ParseIntList(string? raw)
+        {
+            var parts = ParseList(raw);
+            var result = new int[parts.Length];
+            for (int i = 0; i < parts.Length; i++)
+            {
+                result[i] = int.TryParse(parts[i], System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out var v) ? v : 1;
+            }
+            return result;
+        }
+
+        static double[] ParseDoubleList(string? raw)
+        {
+            var parts = ParseList(raw);
+            var result = new double[parts.Length];
+            for (int i = 0; i < parts.Length; i++)
+            {
+                result[i] = double.TryParse(parts[i], System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out var v) ? v : -1;
+            }
+            return result;
+        }
+
+        string? tasksRaw = additionalParams?.TryGetValue("Tasks", out var tasksObj) == true ? tasksObj as string : null;
+        var encodedTasks = ParseList(tasksRaw);
+        if (encodedTasks.Length == 0)
+        {
+            encodedTasks = ["default"];
+        }
+
+        var tasks = encodedTasks.Select(Uri.UnescapeDataString).ToArray();
+        var ranks = ParseIntList(additionalParams?.TryGetValue("TaskRanks", out var ranksObj) == true ? ranksObj as string : null);
+        var alphas = ParseDoubleList(additionalParams?.TryGetValue("TaskAlphas", out var alphasObj) == true ? alphasObj as string : null);
+
+        int defaultRank = ranks.Length > 0 ? ranks[0] : 1;
+        double defaultAlpha = alphas.Length > 0 ? alphas[0] : -1;
+
+        var iLayerType = typeof(ILayer<>).MakeGenericType(typeof(T));
+        var ctor = type.GetConstructor([iLayerType, typeof(string), typeof(int), typeof(double), typeof(bool)]);
+        if (ctor is null)
+        {
+            throw new InvalidOperationException("Cannot find MultiLoRAAdapter constructor with expected signature.");
+        }
+
+        var instance = ctor.Invoke([baseLayer, tasks[0], defaultRank, defaultAlpha, freezeBaseLayer]);
+        var multi = (AiDotNet.LoRA.Adapters.MultiLoRAAdapter<T>)instance;
+
+        for (int taskIndex = 1; taskIndex < tasks.Length; taskIndex++)
+        {
+            int rank = taskIndex < ranks.Length ? ranks[taskIndex] : defaultRank;
+            double alpha = taskIndex < alphas.Length ? alphas[taskIndex] : -1;
+            multi.AddTask(tasks[taskIndex], rank, alpha);
+        }
+
+        if (additionalParams?.TryGetValue("CurrentTask", out var currentTaskObj) == true &&
+            currentTaskObj is string currentTaskEncoded)
+        {
+            string currentTask = Uri.UnescapeDataString(currentTaskEncoded);
+            if (!string.IsNullOrWhiteSpace(currentTask))
+            {
+                multi.SetCurrentTask(currentTask);
+            }
+        }
+
+        return instance;
+    }
+
+    private static object CreateActivationLayer<T>(Type type, int[] inputShape, Dictionary<string, object>? additionalParams)
+    {
+        // ActivationLayer(int[] inputShape, IActivationFunction<T> activationFunction)
+        var scalarActivationType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
+        var vectorActivationType = typeof(IVectorActivationFunction<>).MakeGenericType(typeof(T));
+
+        object? vectorActivation = TryCreateActivationInstance(additionalParams, "VectorActivationType", vectorActivationType);
+        object? scalarActivation = TryCreateActivationInstance(additionalParams, "ScalarActivationType", scalarActivationType);
+
+        object? activationFunction = vectorActivation ?? scalarActivation;
+
+        if (activationFunction == null)
+        {
+            // Back-compat fallback: use enum if available, otherwise default ReLU.
             ActivationFunction activationFunctionEnum = additionalParams?.TryGetValue("ActivationFunction", out var af) == true
                 ? (ActivationFunction)af : ActivationFunction.ReLU;
-            // Use ActivationFunctionFactory to create the IActivationFunction from enum
+
             var factoryType = typeof(ActivationFunctionFactory<>).MakeGenericType(typeof(T));
             var createMethod = factoryType.GetMethod("CreateActivationFunction", BindingFlags.Public | BindingFlags.Static);
             if (createMethod is null)
             {
                 throw new InvalidOperationException("Cannot find ActivationFunctionFactory.CreateActivationFunction method.");
             }
-            object? activationFunction = createMethod.Invoke(null, [activationFunctionEnum]);
-            if (activationFunction is null)
+
+            activationFunction = createMethod.Invoke(null, [activationFunctionEnum]);
+        }
+
+        if (activationFunction == null)
+        {
+            throw new InvalidOperationException("Failed to create activation function for ActivationLayer.");
+        }
+
+        if (vectorActivationType.IsInstanceOfType(activationFunction))
+        {
+            var ctor = type.GetConstructor([typeof(int[]), vectorActivationType]);
+            if (ctor is null)
             {
-                throw new InvalidOperationException($"Failed to create activation function for {activationFunctionEnum}.");
+                throw new InvalidOperationException("Cannot find ActivationLayer constructor with (int[], IVectorActivationFunction<T>).");
             }
+            return ctor.Invoke([inputShape, activationFunction]);
+        }
 
-            // Use specific constructor to avoid ambiguity with vector activation constructor
-            var activationFuncType = typeof(IActivationFunction<>).MakeGenericType(typeof(T));
-            var ctor = type.GetConstructor([typeof(int[]), activationFuncType]);
-            if (ctor is null)
+        var scalarCtor = type.GetConstructor([typeof(int[]), scalarActivationType]);
+        if (scalarCtor is null)
+        {
+            throw new InvalidOperationException("Cannot find ActivationLayer constructor with (int[], IActivationFunction<T>).");
+        }
+        return scalarCtor.Invoke([inputShape, activationFunction]);
+    }
+
+    private static bool TryParseLayerTypeIdentifier(
+        string identifier,
+        out string typeName,
+        out Dictionary<string, object> parameters)
+    {
+        typeName = identifier;
+        parameters = new Dictionary<string, object>(StringComparer.Ordinal);
+
+        int sep = identifier.IndexOf(';');
+        if (sep < 0)
+        {
+            return false;
+        }
+
+        typeName = identifier.Substring(0, sep);
+        var parts = identifier.Substring(sep + 1).Split(new[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
+        foreach (var part in parts)
+        {
+            int eq = part.IndexOf('=');
+            if (eq <= 0 || eq == part.Length - 1)
+            {
+                continue;
+            }
+
+            string key = part.Substring(0, eq);
+            string value = part.Substring(eq + 1);
+
+            if (int.TryParse(value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out int i))
+            {
+                parameters[key] = i;
+            }
+            else if (long.TryParse(value, System.Globalization.NumberStyles.Integer, System.Globalization.CultureInfo.InvariantCulture, out long l))
             {
-                throw new InvalidOperationException($"Cannot find ActivationLayer constructor with (int[], IActivationFunction<T>).");
+                parameters[key] = l;
+            }
+            else if (double.TryParse(value, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out double d))
+            {
+                parameters[key] = d;
+            }
+            else if (bool.TryParse(value, out bool b))
+            {
+                parameters[key] = b;
+            }
+            else
+            {
+                parameters[key] = value;
             }
-            instance = ctor.Invoke([inputShape, activationFunction]);
         }
-        else
+
+        return true;
+    }
+
+    private static Dictionary<string, object> MergeParams(
+        Dictionary<string, object>? original,
+        Dictionary<string, object> parsed)
+    {
+        if (original == null || original.Count == 0)
         {
-            // Default: pass inputShape as first parameter
-            instance = Activator.CreateInstance(type, [inputShape]);
+            return parsed;
         }
-        if (instance == null)
+
+        foreach (var kvp in parsed)
         {
-            throw new InvalidOperationException($"Failed to create instance of layer type {layerType}.");
+            original[kvp.Key] = kvp.Value;
         }
 
-        return (ILayer<T>)instance;
+        return original;
+    }
+
+    private static int? TryGetInt(Dictionary<string, object>? parameters, string key)
+    {
+        if (parameters != null && parameters.TryGetValue(key, out var value) && value != null)
+        {
+            if (value is int i)
+                return i;
+            if (value is long l && l >= int.MinValue && l <= int.MaxValue)
+                return (int)l;
+            if (int.TryParse(value.ToString() ?? string.Empty, out int parsed))
+                return parsed;
+        }
+        return null;
+    }
+
+    private static double? TryGetDouble(Dictionary<string, object>? parameters, string key)
+    {
+        if (parameters != null && parameters.TryGetValue(key, out var value) && value != null)
+        {
+            if (value is double d)
+                return d;
+            if (double.TryParse(value.ToString() ?? string.Empty, System.Globalization.NumberStyles.Float, System.Globalization.CultureInfo.InvariantCulture, out double parsed))
+                return parsed;
+        }
+        return null;
+    }
+
+    private static bool? TryGetBool(Dictionary<string, object>? parameters, string key)
+    {
+        if (parameters != null && parameters.TryGetValue(key, out var value) && value != null)
+        {
+            if (value is bool b)
+                return b;
+            if (bool.TryParse(value.ToString() ?? string.Empty, out bool parsed))
+                return parsed;
+        }
+        return null;
+    }
+
+    private static object? TryCreateActivationInstance(
+        Dictionary<string, object>? parameters,
+        string key,
+        Type expectedInterface)
+    {
+        if (parameters == null || !parameters.TryGetValue(key, out var value) || value == null)
+        {
+            return null;
+        }
+
+        string? typeName = value as string ?? value.ToString() ?? string.Empty;
+        if (string.IsNullOrWhiteSpace(typeName))
+        {
+            return null;
+        }
+
+        var type = Type.GetType(typeName, throwOnError: false);
+        if (type == null)
+        {
+            return null;
+        }
+
+        try
+        {
+            var instance = Activator.CreateInstance(type);
+            if (instance == null)
+            {
+                return null;
+            }
+
+            return expectedInterface.IsInstanceOfType(instance) ? instance : null;
+        }
+        catch (MissingMethodException)
+        {
+            return null;
+        }
+        catch (TargetInvocationException ex) when (ex.InnerException is MissingMethodException)
+        {
+            return null;
+        }
+        catch (Exception ex)
+        {
+            // Best-effort: deserialization should not throw if an optional activation cannot be created.
+            System.Diagnostics.Debug.WriteLine($"Unexpected error deserializing activation {typeName}: {ex.Message}");
+            return null;
+        }
+    }
+
+    private static int ResolveDefaultHeadCount(int embeddingDimension)
+    {
+        // Conservative but practical default: prefer common head counts if divisible, otherwise fall back to 1.
+        foreach (var candidate in new[] { 8, 4, 16, 12, 6, 2, 1 })
+        {
+            if (candidate > 0 && embeddingDimension % candidate == 0)
+            {
+                return candidate;
+            }
+        }
+        return 1;
     }
 
     /// <summary>
@@ -203,7 +754,24 @@ public static ILayer<T> CreateLayerFromType<T>(string layerType, int[] inputShap
             throw new InvalidOperationException($"Type {typeName} does not implement interface {typeof(TInterface).Name}");
         }
 
-        return (TInterface?)Activator.CreateInstance(type) 
-            ?? throw new InvalidOperationException($"Failed to create instance of type {typeName}");
+        try
+        {
+            return (TInterface?)Activator.CreateInstance(type);
+        }
+        catch (MissingMethodException)
+        {
+            // Some implementations (e.g., optimizers) require constructor arguments.
+            // Treat them as optional on deserialization and let callers provide sensible defaults.
+            return null;
+        }
+        catch (TargetInvocationException ex) when (ex.InnerException is MissingMethodException)
+        {
+            // Same as above: no parameterless ctor available.
+            return null;
+        }
+        catch (Exception ex)
+        {
+            throw new InvalidOperationException($"Failed to instantiate type {typeName}", ex);
+        }
     }
-}
\ No newline at end of file
+}
diff --git a/src/Helpers/InferenceDiagnostics.cs b/src/Helpers/InferenceDiagnostics.cs
new file mode 100644
index 000000000..11b501a2a
--- /dev/null
+++ b/src/Helpers/InferenceDiagnostics.cs
@@ -0,0 +1,89 @@
+using System;
+using System.Collections.Concurrent;
+
+namespace AiDotNet.Helpers;
+
+/// <summary>
+/// Internal diagnostics for inference decisions (non-user-facing).
+/// Enable by setting env var AIDOTNET_DIAGNOSTICS=1.
+/// </summary>
+internal static class InferenceDiagnostics
+{
+    private const int MaxEntries = 1024;
+
+    private static readonly ConcurrentQueue<InferenceDiagnosticEntry> Entries = new();
+
+    private static bool IsEnabled()
+    {
+        var value = Environment.GetEnvironmentVariable("AIDOTNET_DIAGNOSTICS");
+        return string.Equals(value, "1", StringComparison.OrdinalIgnoreCase) ||
+               string.Equals(value, "true", StringComparison.OrdinalIgnoreCase);
+    }
+
+    internal static void RecordDecision(string area, string feature, bool enabled, string reason)
+    {
+        if (!IsEnabled())
+            return;
+
+        Entries.Enqueue(new InferenceDiagnosticEntry(
+            TimestampUtc: DateTime.UtcNow,
+            Area: area ?? string.Empty,
+            Feature: feature ?? string.Empty,
+            Enabled: enabled,
+            Reason: reason ?? string.Empty,
+            ExceptionType: null,
+            ExceptionMessage: null));
+
+        TrimIfNeeded();
+    }
+
+    internal static void RecordException(string area, string feature, Exception ex, string reason)
+    {
+        if (!IsEnabled())
+            return;
+
+        Entries.Enqueue(new InferenceDiagnosticEntry(
+            TimestampUtc: DateTime.UtcNow,
+            Area: area ?? string.Empty,
+            Feature: feature ?? string.Empty,
+            Enabled: false,
+            Reason: reason ?? string.Empty,
+            ExceptionType: ex.GetType().FullName ?? ex.GetType().Name,
+            ExceptionMessage: ex.Message));
+
+        TrimIfNeeded();
+    }
+
+    // Intentionally internal-only: serving can use InternalsVisibleTo to read these if needed later.
+    internal static InferenceDiagnosticEntry[] Snapshot()
+    {
+        if (!IsEnabled())
+            return Array.Empty<InferenceDiagnosticEntry>();
+
+        return Entries.ToArray();
+    }
+
+    internal static void Clear()
+    {
+        while (Entries.TryDequeue(out _))
+        {
+        }
+    }
+
+    private static void TrimIfNeeded()
+    {
+        // Best-effort: bound memory use when diagnostics are enabled.
+        while (Entries.Count > MaxEntries && Entries.TryDequeue(out _))
+        {
+        }
+    }
+
+    internal readonly record struct InferenceDiagnosticEntry(
+        DateTime TimestampUtc,
+        string Area,
+        string Feature,
+        bool Enabled,
+        string Reason,
+        string? ExceptionType,
+        string? ExceptionMessage);
+}
diff --git a/src/Inference/CachedMultiHeadAttention.cs b/src/Inference/CachedMultiHeadAttention.cs
index 59042aab2..3526b6e06 100644
--- a/src/Inference/CachedMultiHeadAttention.cs
+++ b/src/Inference/CachedMultiHeadAttention.cs
@@ -34,12 +34,13 @@ namespace AiDotNet.Inference;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for computations.</typeparam>
-public class CachedMultiHeadAttention<T> : LayerBase<T>
+internal class CachedMultiHeadAttention<T> : LayerBase<T>, ILayerSerializationMetadata
 {
     private readonly int _headCount;
     private readonly int _headDimension;
     private readonly int _embeddingDimension;
     private readonly bool _useFlashAttention;
+    private readonly bool _useCausalMask;
 
     // Projection weights
     private Matrix<T> _queryWeights;
@@ -92,6 +93,15 @@ public class CachedMultiHeadAttention<T> : LayerBase<T>
     /// </summary>
     public bool UsesFlashAttention => _useFlashAttention;
 
+    /// <summary>
+    /// Gets whether causal masking is enabled for attention.
+    /// </summary>
+    /// <remarks>
+    /// Causal masking is required for autoregressive decoding (GPT-style), where each token may only attend
+    /// to itself and previous tokens. Disable for bidirectional attention (BERT-style) and most encoders.
+    /// </remarks>
+    public bool UsesCausalMask => _useCausalMask;
+
     /// <summary>
     /// Gets or sets the KV-Cache. Must be set before inference.
     /// </summary>
@@ -118,15 +128,20 @@ public int LayerIndex
     /// <param name="headCount">Number of attention heads.</param>
     /// <param name="useFlashAttention">Whether to use Flash Attention algorithm.</param>
     /// <param name="layerIndex">Index of this layer in the transformer (for cache access).</param>
+    /// <param name="useCausalMask">Whether to apply causal masking (required for autoregressive decoding).</param>
+    /// <param name="activationFunction">Optional activation function (defaults to identity).</param>
     public CachedMultiHeadAttention(
         int sequenceLength,
         int embeddingDimension,
         int headCount,
         bool useFlashAttention = true,
-        int layerIndex = 0)
+        int layerIndex = 0,
+        bool useCausalMask = true,
+        IActivationFunction<T>? activationFunction = null)
         : base(
             [sequenceLength, embeddingDimension],
-            [sequenceLength, embeddingDimension])
+            [sequenceLength, embeddingDimension],
+            activationFunction ?? new IdentityActivation<T>())
     {
         if (embeddingDimension % headCount != 0)
         {
@@ -139,6 +154,7 @@ public CachedMultiHeadAttention(
         _embeddingDimension = embeddingDimension;
         _useFlashAttention = useFlashAttention;
         _layerIndex = layerIndex;
+        _useCausalMask = useCausalMask;
 
         // Initialize projection weights
         _queryWeights = new Matrix<T>(embeddingDimension, embeddingDimension);
@@ -230,13 +246,18 @@ private Tensor<T> ForwardWithCache(Tensor<T> input)
         Tensor<T> attentionOutput;
         if (_useFlashAttention)
         {
-            var config = new FlashAttentionConfig { UseCausalMask = true };
-            var (flashOutput, _) = FlashAttention<T>.Forward(queries, keys, values, config);
+            var config = FlashAttentionConfig.Default;
+            config.UseCausalMask = _useCausalMask;
+
+            int seqLenKV = keys.Shape[2];
+            int seqLenQ = queries.Shape[2];
+            int queryOffset = Math.Max(0, seqLenKV - seqLenQ);
+            var (flashOutput, _) = FlashAttention<T>.Forward(queries, keys, values, config, queryOffset: queryOffset);
             attentionOutput = flashOutput;
         }
         else
         {
-            attentionOutput = StandardAttention(queries, keys, values, useCausalMask: true);
+            attentionOutput = StandardAttention(queries, keys, values, useCausalMask: _useCausalMask);
         }
 
         // Reshape back to [batch, seq, embDim]
@@ -244,9 +265,9 @@ private Tensor<T> ForwardWithCache(Tensor<T> input)
 
         // Output projection
         var output = attentionOutput.Multiply(_outputWeights).Add(_outputBias);
-        _lastOutput = output;
+        _lastOutput = ApplyActivation(output);
 
-        return output;
+        return _lastOutput;
     }
 
     /// <summary>
@@ -272,12 +293,13 @@ private Tensor<T> ForwardStandard(Tensor<T> input)
         if (_useFlashAttention)
         {
             var config = FlashAttentionConfig.Default;
+            config.UseCausalMask = _useCausalMask;
             var (flashOutput, _) = FlashAttention<T>.Forward(queries, keys, values, config);
             attentionOutput = flashOutput;
         }
         else
         {
-            attentionOutput = StandardAttention(queries, keys, values, useCausalMask: false);
+            attentionOutput = StandardAttention(queries, keys, values, useCausalMask: _useCausalMask);
         }
 
         // Reshape back
@@ -285,9 +307,9 @@ private Tensor<T> ForwardStandard(Tensor<T> input)
 
         // Output projection
         var output = attentionOutput.Multiply(_outputWeights).Add(_outputBias);
-        _lastOutput = output;
+        _lastOutput = ApplyActivation(output);
 
-        return output;
+        return _lastOutput;
     }
 
     /// <summary>
@@ -387,6 +409,8 @@ public override Tensor<T> Backward(Tensor<T> outputGradient)
             throw new InvalidOperationException("Forward pass must be called before backward pass.");
         }
 
+        var activationGradient = ApplyActivationDerivative(_lastOutput, outputGradient);
+
         // Standard backward pass (no cache during training)
         // Implementation similar to MultiHeadAttentionLayer
         var inputGradient = new Tensor<T>(_lastInput.Shape);
@@ -397,7 +421,7 @@ public override Tensor<T> Backward(Tensor<T> outputGradient)
         _keyWeightsGradient = new Matrix<T>(_keyWeights.Rows, _keyWeights.Columns);
         _valueWeightsGradient = new Matrix<T>(_valueWeights.Rows, _valueWeights.Columns);
         _outputWeightsGradient = new Matrix<T>(_outputWeights.Rows, _outputWeights.Columns);
-        _outputBiasGradient = outputGradient.Sum([0, 1]).ToVector();
+        _outputBiasGradient = activationGradient.Sum([0, 1]).ToVector();
 
         return inputGradient;
     }
@@ -512,6 +536,7 @@ public override Dictionary<string, string> GetDiagnostics()
         diagnostics["HeadDimension"] = _headDimension.ToString();
         diagnostics["InferenceMode"] = InferenceMode.ToString();
         diagnostics["UsesFlashAttention"] = _useFlashAttention.ToString();
+        diagnostics["UsesCausalMask"] = _useCausalMask.ToString();
         diagnostics["LayerIndex"] = _layerIndex.ToString();
         diagnostics["CacheAttached"] = (_cache != null).ToString();
 
@@ -580,4 +605,14 @@ private Tensor<T> MatrixToTensor(Matrix<T> matrix)
         }
         return tensor;
     }
+
+    Dictionary<string, string> AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["HeadCount"] = _headCount.ToString(),
+            ["UseFlashAttention"] = _useFlashAttention.ToString(),
+            ["UseCausalMask"] = _useCausalMask.ToString()
+        };
+    }
 }
diff --git a/src/Inference/InferenceOptimizer.cs b/src/Inference/InferenceOptimizer.cs
index 0095356be..a14af2ec6 100644
--- a/src/Inference/InferenceOptimizer.cs
+++ b/src/Inference/InferenceOptimizer.cs
@@ -1,8 +1,14 @@
 using AiDotNet.Configuration;
 using AiDotNet.NeuralNetworks;
+using AiDotNet.NeuralNetworks.Attention;
 using AiDotNet.NeuralNetworks.Layers;
 using AiDotNet.Inference.SpeculativeDecoding;
+using AiDotNet.Inference.PagedAttention;
+using AiDotNet.Inference.Quantization;
+using AiDotNet.Helpers;
+using AiDotNet.Tensors.Helpers;
 using AiDotNet.Tensors.LinearAlgebra;
+using System.Threading;
 
 namespace AiDotNet.Inference;
 
@@ -31,10 +37,15 @@ namespace AiDotNet.Inference;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for computations.</typeparam>
-public class InferenceOptimizer<T>
+internal class InferenceOptimizer<T>
 {
     private readonly InferenceOptimizationConfig _config;
     private KVCache<T>? _kvCache;
+    private PagedKVCache<T>? _pagedKVCache;
+    private PagedAttentionKernel<T>? _pagedKernel;
+    private long? _pagedSequenceId;
+    private List<PagedCachedMultiHeadAttention<T>>? _pagedAttentionLayers;
+    private static long s_nextPagedSequenceId = DateTime.UtcNow.Ticks;
     private IDraftModel<T>? _draftModel;
     private SpeculativeDecoder<T>? _speculativeDecoder;
     private bool _isInitialized;
@@ -71,6 +82,57 @@ public InferenceOptimizer()
     {
     }
 
+    /// <summary>
+    /// Creates an inference-optimized model instance based on the current configuration.
+    /// </summary>
+    /// <param name="model">The neural network to optimize.</param>
+    /// <param name="cloneModel">Whether to clone the model before applying layer-level rewrites.</param>
+    /// <returns>The optimized model and whether any optimizations were applied.</returns>
+    /// <remarks>
+    /// This method can apply stateless layer rewrites (e.g., MultiHeadAttention -> FlashAttentionLayer)
+    /// and then initialize stateful inference features (e.g., KV-cache) on the resulting model.
+    /// </remarks>
+    public (NeuralNetworkBase<T> OptimizedModel, bool AnyOptimizationsApplied) OptimizeForInference(
+        NeuralNetworkBase<T> model,
+        bool cloneModel = true)
+    {
+        if (model == null)
+            throw new ArgumentNullException(nameof(model));
+
+        _config.Validate();
+
+        // Clone only when we might rewrite layers; otherwise keep original reference.
+        bool mayRewriteAttention = _config.EnableFlashAttention || _config.EnableKVCache || _config.EnableWeightOnlyQuantization;
+        var workingModel = model;
+        if (cloneModel && mayRewriteAttention && HasOptimizableAttentionLayers(model))
+        {
+            try
+            {
+                // NeuralNetworkBase.Clone performs a deep copy via serialization.
+                workingModel = (NeuralNetworkBase<T>)model.Clone();
+            }
+            catch (Exception ex)
+            {
+                // Some layer types may not yet support serialization-based cloning.
+                // Do not mutate the user's original model; just skip optimizations.
+                Console.WriteLine($"Warning: model cloning failed for inference optimizations: {ex.Message}. Skipping inference optimizations for this model instance.");
+                InferenceDiagnostics.RecordException(
+                    area: "InferenceOptimizer",
+                    feature: "CloneForRewrite",
+                    ex: ex,
+                    reason: "Clone failed; skipping all inference optimizations to avoid mutating user model.");
+                return (model, false);
+            }
+        }
+
+        bool anyApplied = ApplyAttentionOptimizations(workingModel);
+        InferenceDiagnostics.RecordDecision("InferenceOptimizer", "AttentionRewrites", enabled: anyApplied, reason: anyApplied ? "Applied" : "NoApplicableLayersOrDisabled");
+        anyApplied |= ApplyWeightOnlyQuantization(workingModel);
+        anyApplied |= Initialize(workingModel);
+
+        return (workingModel, anyApplied);
+    }
+
     /// <summary>
     /// Initializes inference optimizations for a neural network model.
     /// </summary>
@@ -91,12 +153,20 @@ public bool Initialize(NeuralNetworkBase<T> model)
         if (model == null)
             throw new ArgumentNullException(nameof(model));
 
+        _config.Validate();
+
         bool anyOptimizationsApplied = false;
 
         // Find and configure attention layers for KV caching
         if (_config.EnableKVCache)
         {
-            anyOptimizationsApplied |= InitializeKVCache(model);
+            anyOptimizationsApplied |= _config.EnablePagedKVCache
+                ? InitializePagedKVCache(model)
+                : InitializeKVCache(model);
+        }
+        else
+        {
+            InferenceDiagnostics.RecordDecision("InferenceOptimizer", "KVCache", enabled: false, reason: "DisabledByConfig");
         }
 
         // Initialize speculative decoding if enabled
@@ -104,6 +174,10 @@ public bool Initialize(NeuralNetworkBase<T> model)
         {
             anyOptimizationsApplied |= InitializeSpeculativeDecoding(model);
         }
+        else
+        {
+            InferenceDiagnostics.RecordDecision("InferenceOptimizer", "SpeculativeDecoding", enabled: false, reason: "DisabledByConfig");
+        }
 
         _isInitialized = true;
         return anyOptimizationsApplied;
@@ -138,7 +212,8 @@ private bool InitializeKVCache(NeuralNetworkBase<T> model)
         var firstLayer = attentionLayers[0];
         int numHeads = firstLayer.HeadCount;
         int headDim = firstLayer.HeadDimension;
-        int maxSeqLen = EstimateMaxSequenceLength();
+        int numLayers = attentionLayers.Count;
+        int maxSeqLen = EstimateMaxSequenceLength(numLayers, numHeads, headDim);
 
         // Create KV cache configuration
         var cacheConfig = new KVCacheConfig
@@ -148,7 +223,12 @@ private bool InitializeKVCache(NeuralNetworkBase<T> model)
             HeadDimension = headDim,
             MaxSequenceLength = maxSeqLen,
             MaxBatchSize = _config.MaxBatchSize,
-            PreAllocate = true
+            PreAllocate = true,
+            UseSlidingWindow = _config.UseSlidingWindowKVCache,
+            WindowSize = _config.UseSlidingWindowKVCache
+                ? Math.Min(_config.KVCacheWindowSize, maxSeqLen)
+                : 1024,
+            DataType = ResolveKVCacheDataType()
         };
 
         // Create and attach KV cache
@@ -164,21 +244,482 @@ private bool InitializeKVCache(NeuralNetworkBase<T> model)
         return true;
     }
 
+    private CacheDataType ResolveKVCacheDataType()
+    {
+        bool fp16Capable = typeof(T) == typeof(float) || typeof(T) == typeof(double) || typeof(T) == typeof(Half);
+        bool int8Capable = fp16Capable;
+
+        CacheDataType resolved;
+        if (_config.KVCacheQuantization == KVCacheQuantizationMode.Int8 && int8Capable)
+        {
+            resolved = CacheDataType.Int8;
+        }
+        else
+        {
+            resolved = _config.KVCachePrecision switch
+            {
+                KVCachePrecisionMode.Float32 => CacheDataType.Float32,
+                KVCachePrecisionMode.Float16 => fp16Capable ? CacheDataType.Float16 : CacheDataType.Float32,
+                _ => fp16Capable ? CacheDataType.Float16 : CacheDataType.Float32
+            };
+        }
+
+        InferenceDiagnostics.RecordDecision(
+            area: "InferenceOptimizer",
+            feature: "KVCachePrecision",
+            enabled: resolved == CacheDataType.Float16 || resolved == CacheDataType.Int8,
+            reason: $"Precision={_config.KVCachePrecision};Quant={_config.KVCacheQuantization};Resolved={resolved};Type={typeof(T).Name}");
+
+        return resolved;
+    }
+
+    private bool InitializePagedKVCache(NeuralNetworkBase<T> model)
+    {
+        var attentionLayers = new List<PagedCachedMultiHeadAttention<T>>();
+        int layerIndex = 0;
+
+        foreach (var layer in model.Layers)
+        {
+            if (layer is PagedCachedMultiHeadAttention<T> pagedAttention)
+            {
+                pagedAttention.LayerIndex = layerIndex;
+                attentionLayers.Add(pagedAttention);
+                layerIndex++;
+            }
+        }
+
+        if (attentionLayers.Count == 0)
+        {
+            // No paged attention layers present; fall back to contiguous cache if applicable.
+            return InitializeKVCache(model);
+        }
+
+        var firstLayer = attentionLayers[0];
+        int numHeads = firstLayer.HeadCount;
+        int headDim = firstLayer.HeadDimension;
+        int numLayers = attentionLayers.Count;
+
+        long availableBytes = (long)_config.KVCacheMaxSizeMB * 1024 * 1024;
+        int blockSize = _config.PagedKVCacheBlockSize;
+
+        _pagedKVCache = PagedKVCache<T>.FromMemorySize(availableBytes, numLayers, numHeads, headDim, blockSize);
+        _pagedKernel = new PagedAttentionKernel<T>(_pagedKVCache, new PagedAttentionConfig
+        {
+            NumHeads = numHeads,
+            HeadDimension = headDim,
+            BlockSize = blockSize,
+            MaxBatchSize = _config.MaxBatchSize
+        });
+
+        // Allocate a fresh sequence ID for this optimized model instance (one model == one sequence).
+        if (!TryAllocatePagedSequenceId(_pagedKVCache, initialTokens: 0, out long sequenceId))
+        {
+            InferenceDiagnostics.RecordDecision(
+                area: "InferenceOptimizer",
+                feature: "PagedKVCache",
+                enabled: false,
+                reason: "AllocateSequenceFailed(OutOfMemoryOrExhausted)");
+            _pagedKVCache = null;
+            _pagedKernel = null;
+            _pagedAttentionLayers = null;
+            _pagedSequenceId = null;
+            return false;
+        }
+
+        _pagedSequenceId = sequenceId;
+        _pagedAttentionLayers = attentionLayers;
+
+        foreach (var layer in attentionLayers)
+        {
+            layer.Kernel = _pagedKernel;
+            layer.SequenceId = sequenceId;
+            layer.InferenceMode = true;
+        }
+
+        return true;
+    }
+
+    private static bool TryAllocatePagedSequenceId(PagedKVCache<T> cache, int initialTokens, out long sequenceId)
+    {
+        const int maxAttempts = 1024;
+        var spin = new SpinWait();
+
+        for (int attempt = 0; attempt < maxAttempts; attempt++)
+        {
+            sequenceId = Interlocked.Increment(ref s_nextPagedSequenceId);
+            if (cache.AllocateSequence(sequenceId, initialTokens))
+            {
+                return true;
+            }
+
+            spin.SpinOnce();
+        }
+
+        sequenceId = 0;
+        return false;
+    }
+
+    private static bool TryAllocatePagedSequenceId(PagedKVCache<T> cache, long preferredId, int initialTokens, out long sequenceId)
+    {
+        if (cache.AllocateSequence(preferredId, initialTokens))
+        {
+            sequenceId = preferredId;
+            return true;
+        }
+
+        return TryAllocatePagedSequenceId(cache, initialTokens, out sequenceId);
+    }
+
+    private bool HasOptimizableAttentionLayers(NeuralNetworkBase<T> model)
+    {
+        foreach (var layer in model.Layers)
+        {
+            if (layer is MultiHeadAttentionLayer<T> || layer is FlashAttentionLayer<T> || layer is SelfAttentionLayer<T>)
+                return true;
+
+            if (_config.EnableWeightOnlyQuantization &&
+                typeof(T) == typeof(float) &&
+                layer is DenseLayer<float>)
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    private bool ApplyAttentionOptimizations(NeuralNetworkBase<T> model)
+    {
+        bool useCausalMask = ResolveCausalMask(model);
+        InferenceDiagnostics.RecordDecision(
+            area: "InferenceOptimizer",
+            feature: "CausalMask",
+            enabled: useCausalMask,
+            reason: _config.AttentionMasking == AttentionMaskingMode.Auto ? "Auto" : _config.AttentionMasking.ToString());
+
+        // KV-cache is only beneficial for incremental decoding patterns; default to enabling it only when causal masking applies.
+        bool enableKVCache = _config.EnableKVCache && useCausalMask;
+        bool enablePagedKVCache = enableKVCache && _config.EnablePagedKVCache;
+        bool enableFlashAttention = _config.EnableFlashAttention;
+
+        bool anyRewritten = false;
+
+        for (int i = 0; i < model.Layers.Count; i++)
+        {
+            var layer = model.Layers[i];
+
+            if (layer is SelfAttentionLayer<T> selfAttention && (enableKVCache || enableFlashAttention))
+            {
+                var converted = TryConvertSelfAttentionToMultiHead(selfAttention);
+                if (converted != null)
+                {
+                    model.Layers[i] = converted;
+                    anyRewritten = true;
+
+                    // Re-process this index under MultiHeadAttention rules.
+                    i--;
+                    continue;
+                }
+
+                InferenceDiagnostics.RecordDecision(
+                    area: "InferenceOptimizer",
+                    feature: "SelfAttentionRewrite",
+                    enabled: false,
+                    reason: "UnsupportedSelfAttentionLayer(HeadCountOrShape)");
+                continue;
+            }
+
+            if (layer is MultiHeadAttentionLayer<T> mha)
+            {
+                var inputShape = mha.GetInputShape();
+                if (inputShape.Length < 2)
+                {
+                    continue;
+                }
+
+                int seqLen = inputShape[0];
+                int embDim = inputShape[1];
+                int headCount = mha.HeadCount;
+                var activation = mha.ScalarActivation;
+
+                if (enableKVCache)
+                {
+                    if (enablePagedKVCache)
+                    {
+                        var paged = new PagedCachedMultiHeadAttention<T>(
+                            sequenceLength: seqLen,
+                            embeddingDimension: embDim,
+                            headCount: headCount,
+                            useCausalMask: useCausalMask,
+                            activationFunction: activation);
+                        paged.EnableWeightOnlyQuantization = _config.EnableWeightOnlyQuantization;
+                        paged.SetParameters(mha.GetParameters());
+                        model.Layers[i] = paged;
+                    }
+                    else
+                    {
+                        var cached = new CachedMultiHeadAttention<T>(
+                            sequenceLength: seqLen,
+                            embeddingDimension: embDim,
+                            headCount: headCount,
+                            useFlashAttention: enableFlashAttention,
+                            layerIndex: 0,
+                            useCausalMask: useCausalMask,
+                            activationFunction: activation);
+                        cached.SetParameters(mha.GetParameters());
+                        model.Layers[i] = cached;
+                    }
+                    anyRewritten = true;
+                    continue;
+                }
+
+                if (enableFlashAttention)
+                {
+                    var flashConfig = FlashAttentionConfig.Default;
+                    flashConfig.UseCausalMask = useCausalMask;
+
+                    var flashLayer = new FlashAttentionLayer<T>(
+                        sequenceLength: seqLen,
+                        embeddingDimension: embDim,
+                        headCount: headCount,
+                        config: flashConfig,
+                        activationFunction: activation);
+                    flashLayer.SetParameters(mha.GetParameters());
+                    model.Layers[i] = flashLayer;
+                    anyRewritten = true;
+                }
+
+                continue;
+            }
+
+            if (layer is FlashAttentionLayer<T> flash && enableKVCache)
+            {
+                var inputShape = flash.GetInputShape();
+                if (inputShape.Length < 2)
+                {
+                    continue;
+                }
+
+                int seqLen = inputShape[0];
+                int embDim = inputShape[1];
+                int headCount = flash.HeadCount;
+                var activation = flash.ScalarActivation;
+
+                if (enablePagedKVCache)
+                {
+                    var paged = new PagedCachedMultiHeadAttention<T>(
+                        sequenceLength: seqLen,
+                        embeddingDimension: embDim,
+                        headCount: headCount,
+                        useCausalMask: useCausalMask,
+                        activationFunction: activation);
+                    paged.EnableWeightOnlyQuantization = _config.EnableWeightOnlyQuantization;
+                    paged.SetParameters(flash.GetParameters());
+                    model.Layers[i] = paged;
+                }
+                else
+                {
+                    var cached = new CachedMultiHeadAttention<T>(
+                        sequenceLength: seqLen,
+                        embeddingDimension: embDim,
+                        headCount: headCount,
+                        useFlashAttention: enableFlashAttention,
+                        layerIndex: 0,
+                        useCausalMask: useCausalMask,
+                        activationFunction: activation);
+                    cached.SetParameters(flash.GetParameters());
+                    model.Layers[i] = cached;
+                }
+                anyRewritten = true;
+            }
+        }
+
+        return anyRewritten;
+    }
+
+    private bool ApplyWeightOnlyQuantization(NeuralNetworkBase<T> model)
+    {
+        if (!_config.EnableWeightOnlyQuantization)
+        {
+            InferenceDiagnostics.RecordDecision("InferenceOptimizer", "WeightOnlyQuantization", enabled: false, reason: "DisabledByConfig");
+            return false;
+        }
+
+        if (typeof(T) != typeof(float))
+        {
+            InferenceDiagnostics.RecordDecision("InferenceOptimizer", "WeightOnlyQuantization", enabled: false, reason: $"UnsupportedType({typeof(T).Name})");
+            return false;
+        }
+
+        bool any = false;
+        for (int i = 0; i < model.Layers.Count; i++)
+        {
+            if (model.Layers[i] is DenseLayer<float> dense)
+            {
+                try
+                {
+                    var replacement = dense.VectorActivation != null
+                        ? new QuantizedDenseLayer(dense, dense.VectorActivation)
+                        : new QuantizedDenseLayer(dense);
+
+                    model.Layers[i] = (AiDotNet.Interfaces.ILayer<T>)(object)replacement;
+                    any = true;
+                }
+                catch (Exception ex)
+                {
+                    InferenceDiagnostics.RecordException("InferenceOptimizer", "WeightOnlyQuantization", ex, "DenseLayerQuantizationFailed;FallbackToFP");
+                }
+            }
+        }
+
+        InferenceDiagnostics.RecordDecision("InferenceOptimizer", "WeightOnlyQuantization", enabled: any, reason: any ? "Applied(DenseLayer)" : "NoApplicableLayers");
+        return any;
+    }
+
+    private MultiHeadAttentionLayer<T>? TryConvertSelfAttentionToMultiHead(SelfAttentionLayer<T> layer)
+    {
+        var inputShape = layer.GetInputShape();
+        if (inputShape.Length < 2)
+            return null;
+
+        int seqLen = inputShape[0];
+        int embDim = inputShape[1];
+        if (seqLen <= 0 || embDim <= 0)
+            return null;
+
+        int headCount = TryGetHeadCountFromMetadata(layer) ?? 0;
+        if (headCount <= 0)
+            return null;
+
+        if (embDim % headCount != 0)
+            return null;
+
+        // SelfAttentionLayer has Q/K/V projections plus bias, but no output projection.
+        // We convert it into a MultiHeadAttentionLayer with an identity output projection so that
+        // downstream inference rewrites (FlashAttention / KV-cache) can be applied consistently.
+        var activation = layer.ScalarActivation;
+        var mha = new MultiHeadAttentionLayer<T>(seqLen, embDim, headCount, activationFunction: activation);
+
+        var selfParams = layer.GetParameters();
+        int projSize = embDim * embDim;
+        int expectedSelf = (3 * projSize) + embDim;
+        if (selfParams.Length != expectedSelf)
+            return null;
+
+        var numOps = MathHelper.GetNumericOperations<T>();
+
+        // MultiHead params: Q, K, V, O, bias
+        var combined = new Vector<T>((4 * projSize) + embDim);
+        int idx = 0;
+
+        // Copy Q/K/V (3 * projSize)
+        for (int i = 0; i < 3 * projSize; i++)
+            combined[idx++] = selfParams[i];
+
+        // Output weights: identity matrix (embDim x embDim) flattened row-major
+        for (int r = 0; r < embDim; r++)
+        {
+            for (int c = 0; c < embDim; c++)
+            {
+                combined[idx++] = r == c ? numOps.One : numOps.Zero;
+            }
+        }
+
+        // Output bias (embDim)
+        for (int i = 0; i < embDim; i++)
+            combined[idx++] = selfParams[(3 * projSize) + i];
+
+        mha.SetParameters(combined);
+        return mha;
+    }
+
+    private static int? TryGetHeadCountFromMetadata(ILayer<T> layer)
+    {
+        if (layer is not ILayerSerializationMetadata meta)
+            return null;
+
+        if (!meta.GetSerializationMetadata().TryGetValue("HeadCount", out var raw) || string.IsNullOrWhiteSpace(raw))
+            return null;
+
+        return int.TryParse(raw, out var parsed) ? parsed : null;
+    }
+
+    private bool ResolveCausalMask(NeuralNetworkBase<T> model)
+    {
+        return _config.AttentionMasking switch
+        {
+            AttentionMaskingMode.Causal => true,
+            AttentionMaskingMode.Disabled => false,
+            _ => InferCausalFromModel(model)
+        };
+    }
+
+    private bool InferCausalFromModel(NeuralNetworkBase<T> model)
+    {
+        // Default to causal when the user enables generation-oriented inference features.
+        // This matches industry-standard expectations for autoregressive decoding and avoids
+        // relying on users to set TaskType explicitly.
+        if (_config.EnableKVCache || _config.EnableSpeculativeDecoding)
+            return true;
+
+        // Otherwise, keep heuristics conservative to avoid changing semantics for non-generative models.
+        return model.Architecture.TaskType == NeuralNetworkTaskType.TextGeneration;
+    }
+
     /// <summary>
     /// Estimates the maximum sequence length based on config and memory constraints.
     /// </summary>
-    private int EstimateMaxSequenceLength()
+    /// <param name="numLayers">Number of attention layers in the model.</param>
+    /// <param name="numHeads">Number of attention heads per layer.</param>
+    /// <param name="headDim">Dimension of each attention head.</param>
+    /// <returns>Maximum sequence length that fits within the configured memory budget.</returns>
+    private int EstimateMaxSequenceLength(int numLayers, int numHeads, int headDim)
     {
-        // Calculate based on available memory
-        // Formula: maxSeqLen = (maxMemoryMB * 1024 * 1024) / (numLayers * numHeads * headDim * 2 * bytesPerElement)
-        // Using a simplified estimate
+        // KV cache memory per token = numLayers * numHeads * headDim * 2 (K and V) * bytesPerElement
+        // For batch size, multiply by maxBatchSize
+        // Total: maxSeqLen * numLayers * numHeads * headDim * 2 * bytesPerElement * batchSize <= maxMemoryBytes
+
         long maxMemoryBytes = (long)_config.KVCacheMaxSizeMB * 1024 * 1024;
 
-        // Default reasonable sequence length
-        const int defaultMaxSeqLen = 2048;
+        // Estimate bytes per element based on type T
+        int bytesPerElement = EstimateBytesPerElement();
+
+        // Memory per token per batch item = numLayers * numHeads * headDim * 2 * bytesPerElement
+        long memoryPerToken = (long)numLayers * numHeads * headDim * 2 * bytesPerElement;
+
+        // Account for batch size
+        long memoryPerTokenWithBatch = memoryPerToken * _config.MaxBatchSize;
+
+        // Prevent division by zero
+        if (memoryPerTokenWithBatch <= 0)
+        {
+            return 2048; // Reasonable default
+        }
+
+        // Calculate maximum sequence length
+        long calculatedMaxSeqLen = maxMemoryBytes / memoryPerTokenWithBatch;
 
-        // Cap at reasonable maximum
-        return Math.Min(defaultMaxSeqLen, 8192);
+        // Apply reasonable bounds using MathHelper.Clamp for net471 compatibility
+        const long minSeqLen = 128;
+        const long maxSeqLen = 32768; // Reasonable upper bound
+
+        return (int)MathHelper.Clamp(calculatedMaxSeqLen, minSeqLen, maxSeqLen);
+    }
+
+    /// <summary>
+    /// Estimates bytes per element based on the generic type T.
+    /// </summary>
+    private static int EstimateBytesPerElement()
+    {
+        // Common numeric types used in neural networks
+        var type = typeof(T);
+        if (type == typeof(float)) return 4;
+        if (type == typeof(double)) return 8;
+        if (type == typeof(Half)) return 2;
+        if (type == typeof(decimal)) return 16;
+
+        // Default to float size if unknown
+        return 4;
     }
 
     /// <summary>
@@ -186,23 +727,56 @@ private int EstimateMaxSequenceLength()
     /// </summary>
     private bool InitializeSpeculativeDecoding(NeuralNetworkBase<T> model)
     {
-        // Create draft model based on configuration
-        IDraftModel<T>? draftModel = _config.DraftModelType switch
+        // Facade-friendly behavior: speculative decoding configuration must never crash inference.
+        // If a requested draft model is unavailable, fall back to an N-gram draft model and record diagnostics.
+        try
         {
-            DraftModelType.NGram => CreateNGramDraftModel(),
-            DraftModelType.SmallNeural => CreateNeuralDraftModel(model),
-            _ => null
-        };
+            // For Custom draft models, an internal caller can provide one via SetCustomDraftModel().
+            if (_config.DraftModelType == DraftModelType.Custom)
+            {
+                if (_draftModel != null)
+                {
+                    InferenceDiagnostics.RecordDecision("InferenceOptimizer", "SpeculativeDraftModel", enabled: true, reason: "CustomProvided");
+                    return true;
+                }
+
+                _draftModel = CreateNGramDraftModel();
+                InferenceDiagnostics.RecordDecision("InferenceOptimizer", "SpeculativeDraftModel", enabled: _draftModel != null, reason: "CustomNotProvided_FallbackToNGram");
+                return _draftModel != null;
+            }
 
-        if (draftModel == null)
+            IDraftModel<T>? draftModel = _config.DraftModelType switch
+            {
+                DraftModelType.NGram => CreateNGramDraftModel(),
+                DraftModelType.SmallNeural => CreateNeuralDraftModel(model),
+                _ => CreateNGramDraftModel()
+            };
+
+            _draftModel = draftModel ?? CreateNGramDraftModel();
+            InferenceDiagnostics.RecordDecision(
+                "InferenceOptimizer",
+                "SpeculativeDraftModel",
+                enabled: _draftModel != null,
+                reason: draftModel != null ? _config.DraftModelType.ToString() : $"Unavailable({_config.DraftModelType})_FallbackToNGram");
+
+            return _draftModel != null;
+        }
+        catch (Exception ex)
         {
-            return false;
+            InferenceDiagnostics.RecordException("InferenceOptimizer", "SpeculativeDecoding", ex, "Draft model init failed; falling back to NGram.");
+            try
+            {
+                _draftModel = CreateNGramDraftModel();
+                InferenceDiagnostics.RecordDecision("InferenceOptimizer", "SpeculativeDraftModel", enabled: _draftModel != null, reason: "ExceptionFallbackToNGram");
+                return _draftModel != null;
+            }
+            catch
+            {
+                InferenceDiagnostics.RecordDecision("InferenceOptimizer", "SpeculativeDraftModel", enabled: false, reason: "FallbackFailed");
+                _draftModel = null;
+                return false;
+            }
         }
-
-        // Note: SpeculativeDecoder requires a target forward function
-        // This will be set when actually doing inference via CreateSpeculativeDecoder
-        _draftModel = draftModel;
-        return true;
     }
 
     /// <summary>
@@ -217,10 +791,19 @@ private bool InitializeSpeculativeDecoding(NeuralNetworkBase<T> model)
     /// <summary>
     /// Creates a small neural network draft model.
     /// </summary>
+    /// <remarks>
+    /// SmallNeural draft models require a pre-trained companion model that is smaller
+    /// and faster than the target model but trained on similar data. This cannot be
+    /// automatically generated from the target model.
+    /// </remarks>
+    /// <exception cref="NotSupportedException">
+    /// Always thrown because SmallNeural draft models require external pre-trained models.
+    /// </exception>
     private IDraftModel<T>? CreateNeuralDraftModel(NeuralNetworkBase<T> model)
     {
-        // For neural draft models, we would need a pre-trained smaller model
-        // This is a placeholder - in production, this would load a companion model
+        // SmallNeural draft models require a separate pre-trained smaller model. We do not expose
+        // draft model wiring via the public facade in the MVP, so treat this as unavailable.
+        InferenceDiagnostics.RecordDecision("InferenceOptimizer", "SpeculativeDraftModel", enabled: false, reason: "SmallNeuralUnavailable_FallbackToNGram");
         return null;
     }
 
@@ -258,6 +841,10 @@ public void DisableInferenceMode(NeuralNetworkBase<T> model)
             {
                 cachedAttention.InferenceMode = false;
             }
+            else if (layer is PagedCachedMultiHeadAttention<T> pagedAttention)
+            {
+                pagedAttention.InferenceMode = false;
+            }
         }
     }
 
@@ -267,6 +854,54 @@ public void DisableInferenceMode(NeuralNetworkBase<T> model)
     public void ClearCache()
     {
         _kvCache?.Clear();
+        if (_pagedKVCache != null && _pagedSequenceId.HasValue)
+        {
+            try
+            {
+                _pagedKVCache.FreeSequence(_pagedSequenceId.Value);
+            }
+            catch
+            {
+                // Best-effort cleanup.
+            }
+
+            // Re-allocate with the same ID if possible; otherwise allocate a new one.
+            if (!TryAllocatePagedSequenceId(_pagedKVCache, _pagedSequenceId.Value, initialTokens: 0, out long allocated))
+            {
+                InferenceDiagnostics.RecordDecision(
+                    area: "InferenceOptimizer",
+                    feature: "PagedKVCache",
+                    enabled: false,
+                    reason: "ClearCacheAllocateSequenceFailed(OutOfMemoryOrExhausted)");
+
+                // Safe fallback: disable paged inference mode on layers and keep session alive.
+                if (_pagedAttentionLayers != null)
+                {
+                    foreach (var layer in _pagedAttentionLayers)
+                    {
+                        layer.InferenceMode = false;
+                        layer.Kernel = null;
+                        layer.ResetState();
+                    }
+                }
+
+                _pagedSequenceId = null;
+                return;
+            }
+
+            _pagedSequenceId = allocated;
+
+            if (_pagedAttentionLayers != null && _pagedSequenceId.HasValue)
+            {
+                foreach (var layer in _pagedAttentionLayers)
+                {
+                    layer.SequenceId = _pagedSequenceId.Value;
+                    layer.ResetState();
+                    layer.InferenceMode = true;
+                    layer.Kernel ??= _pagedKernel;
+                }
+            }
+        }
     }
 
     /// <summary>
@@ -280,7 +915,10 @@ public Dictionary<string, object> GetStatistics()
             ["IsInitialized"] = _isInitialized,
             ["KVCacheEnabled"] = _config.EnableKVCache,
             ["SpeculativeDecodingEnabled"] = _config.EnableSpeculativeDecoding,
-            ["BatchingEnabled"] = _config.EnableBatching
+            ["BatchingEnabled"] = _config.EnableBatching,
+            ["PagedKVCacheInitialized"] = _pagedKVCache != null,
+            ["PagedAttentionLayerCount"] = _pagedAttentionLayers?.Count ?? 0,
+            ["PagedAttentionWeightOnlyQuantizationEnabled"] = _pagedAttentionLayers?.Any(l => l.EnableWeightOnlyQuantization) ?? false
         };
 
         if (_kvCache != null)
@@ -310,6 +948,34 @@ public Dictionary<string, object> GetStatistics()
     /// </summary>
     public IDraftModel<T>? DraftModel => _draftModel;
 
+    /// <summary>
+    /// Sets a custom draft model for speculative decoding.
+    /// </summary>
+    /// <param name="draftModel">The custom draft model implementation.</param>
+    /// <remarks>
+    /// <para><b>For Beginners:</b> Use this method when you have your own draft model implementation.
+    ///
+    /// This is required when using DraftModelType.Custom or when you want to replace the
+    /// default NGram draft model with a more sophisticated model.
+    ///
+    /// Your custom draft model must implement IDraftModel&lt;T&gt; and provide:
+    /// - Draft token generation
+    /// - Probability estimation for speculative decoding verification
+    ///
+    /// Example:
+    /// <code>
+    /// var optimizer = new InferenceOptimizer&lt;float&gt;(config);
+    /// optimizer.SetCustomDraftModel(myCustomDraftModel);
+    /// optimizer.Initialize(mainModel);
+    /// </code>
+    /// </para>
+    /// </remarks>
+    /// <exception cref="ArgumentNullException">Thrown when draftModel is null.</exception>
+    public void SetCustomDraftModel(IDraftModel<T> draftModel)
+    {
+        _draftModel = draftModel ?? throw new ArgumentNullException(nameof(draftModel));
+    }
+
     /// <summary>
     /// Creates a speculative decoder with the given target forward function.
     /// </summary>
@@ -339,7 +1005,13 @@ public Dictionary<string, object> GetStatistics()
         var speculativeConfig = new SpeculativeDecodingConfig<T>
         {
             NumDraftTokens = _config.SpeculationDepth,
-            UseTreeSpeculation = _config.UseTreeSpeculation
+            UseTreeSpeculation = _config.UseTreeSpeculation ||
+                                _config.SpeculativeMethod == SpeculativeMethod.Medusa ||
+                                _config.SpeculativeMethod == SpeculativeMethod.Eagle,
+            AdaptiveDraftLength = _config.SpeculationPolicy == SpeculationPolicy.Auto,
+            TreeBranchFactor = _config.SpeculativeMethod == SpeculativeMethod.Medusa ? 4 : 2,
+            MaxTreeDepth = Math.Max(1, _config.SpeculationDepth),
+            MinAcceptanceRate = MathHelper.GetNumericOperations<T>().FromDouble(0.5)
         };
 
         _speculativeDecoder = new SpeculativeDecoder<T>(_draftModel, targetForward, speculativeConfig);
diff --git a/src/Inference/KVCache.cs b/src/Inference/KVCache.cs
index b391faa38..bbf017f1c 100644
--- a/src/Inference/KVCache.cs
+++ b/src/Inference/KVCache.cs
@@ -28,7 +28,7 @@ namespace AiDotNet.Inference;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for cache storage (typically float or double).</typeparam>
-public class KVCache<T>
+internal class KVCache<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
@@ -38,8 +38,24 @@ public class KVCache<T>
     private readonly Tensor<T>[] _keyCache;
     private readonly Tensor<T>[] _valueCache;
 
-    // Current sequence length for each batch item
-    private readonly int[] _sequenceLengths;
+    // Optional FP16 cache storage (used when Config.DataType == Float16 and T is float/double)
+    private readonly Tensor<Half>[]? _keyCacheFp16;
+    private readonly Tensor<Half>[]? _valueCacheFp16;
+    private readonly bool _useFp16Storage;
+    private readonly Func<T, Half>? _toHalf;
+    private readonly Func<Half, T>? _fromHalf;
+
+    // Optional int8 quantized cache storage (used when Config.DataType == Int8).
+    private readonly Tensor<sbyte>[]? _keyCacheInt8;
+    private readonly Tensor<sbyte>[]? _valueCacheInt8;
+    private readonly bool _useInt8Storage;
+    private readonly float[]? _keyScaleInt8;
+    private readonly float[]? _valueScaleInt8;
+    private readonly Func<T, float>? _toFloat;
+    private readonly Func<float, T>? _fromFloat;
+
+    // Current sequence length for each layer and batch item: [layer][batch]
+    private readonly int[][] _sequenceLengths;
 
     // Statistics
     private long _cacheHits;
@@ -54,7 +70,7 @@ public class KVCache<T>
     /// <summary>
     /// Gets the current number of cached tokens for batch item 0.
     /// </summary>
-    public int CurrentLength => _sequenceLengths[0];
+    public int CurrentLength => _sequenceLengths.Length > 0 ? _sequenceLengths[0][0] : 0;
 
     /// <summary>
     /// Gets the maximum sequence length this cache can hold.
@@ -86,7 +102,66 @@ public KVCache(KVCacheConfig config)
 
         _keyCache = new Tensor<T>[config.NumLayers];
         _valueCache = new Tensor<T>[config.NumLayers];
-        _sequenceLengths = new int[config.MaxBatchSize];
+
+        if (config.DataType == CacheDataType.Int8)
+        {
+            // Only enable int8 storage when we can safely convert between T and float.
+            if (typeof(T) == typeof(float))
+            {
+                _useInt8Storage = true;
+                _toFloat = value => (float)(object)value!;
+                _fromFloat = value => (T)(object)value;
+            }
+            else if (typeof(T) == typeof(double))
+            {
+                _useInt8Storage = true;
+                _toFloat = value => (float)(double)(object)value!;
+                _fromFloat = value => (T)(object)(double)value;
+            }
+            else if (typeof(T) == typeof(Half))
+            {
+                _useInt8Storage = true;
+                _toFloat = value => (float)(Half)(object)value!;
+                _fromFloat = value => (T)(object)(Half)value;
+            }
+        }
+
+        if (config.DataType == CacheDataType.Float16 && typeof(T) != typeof(Half))
+        {
+            // Only enable FP16 storage when we can safely convert between T and Half.
+            if (typeof(T) == typeof(float))
+            {
+                _useFp16Storage = true;
+                _toHalf = value => (Half)(float)(object)value!;
+                _fromHalf = value => (T)(object)(float)value;
+            }
+            else if (typeof(T) == typeof(double))
+            {
+                _useFp16Storage = true;
+                _toHalf = value => (Half)(double)(object)value!;
+                _fromHalf = value => (T)(object)(double)(float)value;
+            }
+        }
+
+        if (_useFp16Storage)
+        {
+            _keyCacheFp16 = new Tensor<Half>[config.NumLayers];
+            _valueCacheFp16 = new Tensor<Half>[config.NumLayers];
+        }
+
+        if (_useInt8Storage)
+        {
+            _keyCacheInt8 = new Tensor<sbyte>[config.NumLayers];
+            _valueCacheInt8 = new Tensor<sbyte>[config.NumLayers];
+            _keyScaleInt8 = new float[config.NumLayers];
+            _valueScaleInt8 = new float[config.NumLayers];
+        }
+
+        _sequenceLengths = new int[config.NumLayers][];
+        for (int layer = 0; layer < config.NumLayers; layer++)
+        {
+            _sequenceLengths[layer] = new int[config.MaxBatchSize];
+        }
 
         if (config.PreAllocate)
         {
@@ -121,8 +196,23 @@ private void AllocateCaches()
 
         for (int layer = 0; layer < _config.NumLayers; layer++)
         {
-            _keyCache[layer] = new Tensor<T>(shape);
-            _valueCache[layer] = new Tensor<T>(shape);
+            if (_useInt8Storage)
+            {
+                _keyCacheInt8![layer] = new Tensor<sbyte>(shape);
+                _valueCacheInt8![layer] = new Tensor<sbyte>(shape);
+                _keyScaleInt8![layer] = 0f;
+                _valueScaleInt8![layer] = 0f;
+            }
+            else if (_useFp16Storage)
+            {
+                _keyCacheFp16![layer] = new Tensor<Half>(shape);
+                _valueCacheFp16![layer] = new Tensor<Half>(shape);
+            }
+            else
+            {
+                _keyCache[layer] = new Tensor<T>(shape);
+                _valueCache[layer] = new Tensor<T>(shape);
+            }
         }
     }
 
@@ -166,10 +256,15 @@ private void AllocateCaches()
             HandleSlidingWindowEviction(layerIndex, batchSize, newSeqLen);
         }
 
+        if (_useInt8Storage)
+        {
+            EnsureInt8Scales(layerIndex, newKeys, newValues, batchSize, newSeqLen);
+        }
+
         // Append new entries
         for (int b = 0; b < batchSize; b++)
         {
-            int currentLen = _sequenceLengths[b];
+            int currentLen = _sequenceLengths[layerIndex][b];
             int newLen = currentLen + newSeqLen;
 
             if (newLen > _config.MaxSequenceLength)
@@ -187,13 +282,28 @@ private void AllocateCaches()
                     int targetPos = currentLen + s;
                     for (int d = 0; d < _config.HeadDimension; d++)
                     {
-                        _keyCache[layerIndex][new[] { b, h, targetPos, d }] = newKeys[new[] { b, h, s, d }];
-                        _valueCache[layerIndex][new[] { b, h, targetPos, d }] = newValues[new[] { b, h, s, d }];
+                        if (_useInt8Storage)
+                        {
+                            var keyScale = _keyScaleInt8![layerIndex];
+                            var valueScale = _valueScaleInt8![layerIndex];
+                            _keyCacheInt8![layerIndex][new[] { b, h, targetPos, d }] = QuantizeToInt8(_toFloat!(newKeys[new[] { b, h, s, d }]), keyScale);
+                            _valueCacheInt8![layerIndex][new[] { b, h, targetPos, d }] = QuantizeToInt8(_toFloat!(newValues[new[] { b, h, s, d }]), valueScale);
+                        }
+                        else if (_useFp16Storage)
+                        {
+                            _keyCacheFp16![layerIndex][new[] { b, h, targetPos, d }] = _toHalf!(newKeys[new[] { b, h, s, d }]);
+                            _valueCacheFp16![layerIndex][new[] { b, h, targetPos, d }] = _toHalf!(newValues[new[] { b, h, s, d }]);
+                        }
+                        else
+                        {
+                            _keyCache[layerIndex][new[] { b, h, targetPos, d }] = newKeys[new[] { b, h, s, d }];
+                            _valueCache[layerIndex][new[] { b, h, targetPos, d }] = newValues[new[] { b, h, s, d }];
+                        }
                     }
                 }
             }
 
-            _sequenceLengths[b] = newLen;
+            _sequenceLengths[layerIndex][b] = newLen;
             _cacheMisses += newSeqLen;
         }
 
@@ -211,7 +321,7 @@ private void AllocateCaches()
     {
         ValidateLayerIndex(layerIndex);
 
-        if (_keyCache[layerIndex] == null)
+        if (!IsLayerAllocated(layerIndex))
         {
             throw new InvalidOperationException($"Layer {layerIndex} cache not initialized. Call Append first.");
         }
@@ -220,7 +330,7 @@ private void AllocateCaches()
         int maxLen = 0;
         for (int b = 0; b < batchSize; b++)
         {
-            if (_sequenceLengths[b] > maxLen) maxLen = _sequenceLengths[b];
+            if (_sequenceLengths[layerIndex][b] > maxLen) maxLen = _sequenceLengths[layerIndex][b];
         }
 
         if (maxLen == 0)
@@ -238,15 +348,30 @@ private void AllocateCaches()
         // Copy cached values
         for (int b = 0; b < batchSize; b++)
         {
-            int seqLen = _sequenceLengths[b];
+            int seqLen = _sequenceLengths[layerIndex][b];
             for (int h = 0; h < _config.NumHeads; h++)
             {
                 for (int s = 0; s < seqLen; s++)
                 {
                     for (int d = 0; d < _config.HeadDimension; d++)
                     {
-                        keys[new[] { b, h, s, d }] = _keyCache[layerIndex][new[] { b, h, s, d }];
-                        values[new[] { b, h, s, d }] = _valueCache[layerIndex][new[] { b, h, s, d }];
+                        if (_useInt8Storage)
+                        {
+                            float keyScale = _keyScaleInt8![layerIndex];
+                            float valueScale = _valueScaleInt8![layerIndex];
+                            keys[new[] { b, h, s, d }] = _fromFloat!(DequantizeInt8(_keyCacheInt8![layerIndex][new[] { b, h, s, d }], keyScale));
+                            values[new[] { b, h, s, d }] = _fromFloat!(DequantizeInt8(_valueCacheInt8![layerIndex][new[] { b, h, s, d }], valueScale));
+                        }
+                        else if (_useFp16Storage)
+                        {
+                            keys[new[] { b, h, s, d }] = _fromHalf!(_keyCacheFp16![layerIndex][new[] { b, h, s, d }]);
+                            values[new[] { b, h, s, d }] = _fromHalf!(_valueCacheFp16![layerIndex][new[] { b, h, s, d }]);
+                        }
+                        else
+                        {
+                            keys[new[] { b, h, s, d }] = _keyCache[layerIndex][new[] { b, h, s, d }];
+                            values[new[] { b, h, s, d }] = _valueCache[layerIndex][new[] { b, h, s, d }];
+                        }
                     }
                 }
             }
@@ -270,6 +395,12 @@ public void Update(int layerIndex, int[] positions, Tensor<T> keys, Tensor<T> va
         int batchSize = keys.Shape[0];
         int numPositions = positions.Length;
 
+        if (_useInt8Storage)
+        {
+            EnsureCacheAllocated(layerIndex);
+            EnsureInt8Scales(layerIndex, keys, values, batchSize, numPositions);
+        }
+
         for (int b = 0; b < batchSize; b++)
         {
             for (int p = 0; p < numPositions; p++)
@@ -285,8 +416,23 @@ public void Update(int layerIndex, int[] positions, Tensor<T> keys, Tensor<T> va
                 {
                     for (int d = 0; d < _config.HeadDimension; d++)
                     {
-                        _keyCache[layerIndex][new[] { b, h, pos, d }] = keys[new[] { b, h, p, d }];
-                        _valueCache[layerIndex][new[] { b, h, pos, d }] = values[new[] { b, h, p, d }];
+                        if (_useInt8Storage)
+                        {
+                            var keyScale = _keyScaleInt8![layerIndex];
+                            var valueScale = _valueScaleInt8![layerIndex];
+                            _keyCacheInt8![layerIndex][new[] { b, h, pos, d }] = QuantizeToInt8(_toFloat!(keys[new[] { b, h, p, d }]), keyScale);
+                            _valueCacheInt8![layerIndex][new[] { b, h, pos, d }] = QuantizeToInt8(_toFloat!(values[new[] { b, h, p, d }]), valueScale);
+                        }
+                        else if (_useFp16Storage)
+                        {
+                            _keyCacheFp16![layerIndex][new[] { b, h, pos, d }] = _toHalf!(keys[new[] { b, h, p, d }]);
+                            _valueCacheFp16![layerIndex][new[] { b, h, pos, d }] = _toHalf!(values[new[] { b, h, p, d }]);
+                        }
+                        else
+                        {
+                            _keyCache[layerIndex][new[] { b, h, pos, d }] = keys[new[] { b, h, p, d }];
+                            _valueCache[layerIndex][new[] { b, h, pos, d }] = values[new[] { b, h, p, d }];
+                        }
                     }
                 }
             }
@@ -307,18 +453,25 @@ public void Truncate(int newLength, int batchIndex = -1)
 
         if (batchIndex == -1)
         {
-            for (int b = 0; b < _sequenceLengths.Length; b++)
+            for (int layer = 0; layer < _sequenceLengths.Length; layer++)
             {
-                _sequenceLengths[b] = Math.Min(_sequenceLengths[b], newLength);
+                for (int b = 0; b < _sequenceLengths[layer].Length; b++)
+                {
+                    _sequenceLengths[layer][b] = Math.Min(_sequenceLengths[layer][b], newLength);
+                }
             }
         }
         else
         {
-            if (batchIndex < 0 || batchIndex >= _sequenceLengths.Length)
+            if (batchIndex < 0 || (_sequenceLengths.Length > 0 && batchIndex >= _sequenceLengths[0].Length))
             {
                 throw new ArgumentOutOfRangeException(nameof(batchIndex));
             }
-            _sequenceLengths[batchIndex] = Math.Min(_sequenceLengths[batchIndex], newLength);
+
+            for (int layer = 0; layer < _sequenceLengths.Length; layer++)
+            {
+                _sequenceLengths[layer][batchIndex] = Math.Min(_sequenceLengths[layer][batchIndex], newLength);
+            }
         }
     }
 
@@ -327,9 +480,18 @@ public void Truncate(int newLength, int batchIndex = -1)
     /// </summary>
     public void Clear()
     {
-        for (int b = 0; b < _sequenceLengths.Length; b++)
+        for (int layer = 0; layer < _sequenceLengths.Length; layer++)
         {
-            _sequenceLengths[b] = 0;
+            for (int b = 0; b < _sequenceLengths[layer].Length; b++)
+            {
+                _sequenceLengths[layer][b] = 0;
+            }
+
+            if (_useInt8Storage)
+            {
+                _keyScaleInt8![layer] = 0f;
+                _valueScaleInt8![layer] = 0f;
+            }
         }
 
         // Reset statistics
@@ -343,11 +505,15 @@ public void Clear()
     /// </summary>
     public void Clear(int batchIndex)
     {
-        if (batchIndex < 0 || batchIndex >= _sequenceLengths.Length)
+        if (batchIndex < 0 || (_sequenceLengths.Length > 0 && batchIndex >= _sequenceLengths[0].Length))
         {
             throw new ArgumentOutOfRangeException(nameof(batchIndex));
         }
-        _sequenceLengths[batchIndex] = 0;
+
+        for (int layer = 0; layer < _sequenceLengths.Length; layer++)
+        {
+            _sequenceLengths[layer][batchIndex] = 0;
+        }
     }
 
     /// <summary>
@@ -355,11 +521,12 @@ public void Clear(int batchIndex)
     /// </summary>
     public int GetSequenceLength(int batchIndex = 0)
     {
-        if (batchIndex < 0 || batchIndex >= _sequenceLengths.Length)
+        if (batchIndex < 0 || (_sequenceLengths.Length > 0 && batchIndex >= _sequenceLengths[0].Length))
         {
             throw new ArgumentOutOfRangeException(nameof(batchIndex));
         }
-        return _sequenceLengths[batchIndex];
+
+        return _sequenceLengths.Length > 0 ? _sequenceLengths[0][batchIndex] : 0;
     }
 
     /// <summary>
@@ -370,14 +537,26 @@ public long GetCurrentMemoryUsage()
         long totalElements = 0;
         for (int layer = 0; layer < _config.NumLayers; layer++)
         {
-            if (_keyCache[layer] != null)
+            if (IsLayerAllocated(layer))
             {
-                totalElements += _keyCache[layer].Length + _valueCache[layer].Length;
+                if (_useInt8Storage)
+                {
+                    totalElements += _keyCacheInt8![layer].Length + _valueCacheInt8![layer].Length;
+                }
+                else if (_useFp16Storage)
+                {
+                    totalElements += _keyCacheFp16![layer].Length + _valueCacheFp16![layer].Length;
+                }
+                else
+                {
+                    totalElements += _keyCache[layer].Length + _valueCache[layer].Length;
+                }
             }
         }
 
         int bytesPerElement = _config.DataType switch
         {
+            CacheDataType.Int8 => 1,
             CacheDataType.Float16 => 2,
             CacheDataType.Float32 => 4,
             CacheDataType.Float64 => 8,
@@ -395,6 +574,9 @@ public Dictionary<string, object> GetStatistics()
     {
         return new Dictionary<string, object>
         {
+            ["DataType"] = _config.DataType.ToString(),
+            ["UseInt8Storage"] = _useInt8Storage,
+            ["UseFp16Storage"] = _useFp16Storage,
             ["CacheHits"] = _cacheHits,
             ["CacheMisses"] = _cacheMisses,
             ["Evictions"] = _evictions,
@@ -403,7 +585,9 @@ public Dictionary<string, object> GetStatistics()
                 : 0.0,
             ["CurrentMemoryMB"] = GetCurrentMemoryUsage() / (1024.0 * 1024.0),
             ["MaxMemoryMB"] = _config.EstimateMemoryBytes() / (1024.0 * 1024.0),
-            ["SequenceLengths"] = _sequenceLengths.ToArray()
+            ["SequenceLengths"] = _sequenceLengths.Length > 0
+                ? _sequenceLengths[0].ToArray()
+                : Array.Empty<int>()
         };
     }
 
@@ -417,11 +601,11 @@ public void CopyBatchState(int sourceBatch, int destBatch)
         if (destBatch < 0 || destBatch >= _config.MaxBatchSize)
             throw new ArgumentOutOfRangeException(nameof(destBatch));
 
-        int seqLen = _sequenceLengths[sourceBatch];
-
         for (int layer = 0; layer < _config.NumLayers; layer++)
         {
-            if (_keyCache[layer] == null) continue;
+            if (!IsLayerAllocated(layer)) continue;
+
+            int seqLen = _sequenceLengths[layer][sourceBatch];
 
             for (int h = 0; h < _config.NumHeads; h++)
             {
@@ -429,16 +613,130 @@ public void CopyBatchState(int sourceBatch, int destBatch)
                 {
                     for (int d = 0; d < _config.HeadDimension; d++)
                     {
-                        _keyCache[layer][new[] { destBatch, h, s, d }] =
-                            _keyCache[layer][new[] { sourceBatch, h, s, d }];
-                        _valueCache[layer][new[] { destBatch, h, s, d }] =
-                            _valueCache[layer][new[] { sourceBatch, h, s, d }];
+                        if (_useInt8Storage)
+                        {
+                            _keyCacheInt8![layer][new[] { destBatch, h, s, d }] =
+                                _keyCacheInt8![layer][new[] { sourceBatch, h, s, d }];
+                            _valueCacheInt8![layer][new[] { destBatch, h, s, d }] =
+                                _valueCacheInt8![layer][new[] { sourceBatch, h, s, d }];
+                        }
+                        else if (_useFp16Storage)
+                        {
+                            _keyCacheFp16![layer][new[] { destBatch, h, s, d }] =
+                                _keyCacheFp16![layer][new[] { sourceBatch, h, s, d }];
+                            _valueCacheFp16![layer][new[] { destBatch, h, s, d }] =
+                                _valueCacheFp16![layer][new[] { sourceBatch, h, s, d }];
+                        }
+                        else
+                        {
+                            _keyCache[layer][new[] { destBatch, h, s, d }] =
+                                _keyCache[layer][new[] { sourceBatch, h, s, d }];
+                            _valueCache[layer][new[] { destBatch, h, s, d }] =
+                                _valueCache[layer][new[] { sourceBatch, h, s, d }];
+                        }
                     }
                 }
             }
+
+            _sequenceLengths[layer][destBatch] = seqLen;
+        }
+    }
+
+    private void EnsureInt8Scales(int layerIndex, Tensor<T> newKeys, Tensor<T> newValues, int batchSize, int newSeqLen)
+    {
+        if (!_useInt8Storage)
+        {
+            return;
         }
 
-        _sequenceLengths[destBatch] = seqLen;
+        float maxAbsK = 0f;
+        float maxAbsV = 0f;
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            for (int h = 0; h < _config.NumHeads; h++)
+            {
+                for (int s = 0; s < newSeqLen; s++)
+                {
+                    for (int d = 0; d < _config.HeadDimension; d++)
+                    {
+                        float k = _toFloat!(newKeys[new[] { b, h, s, d }]);
+                        float v = _toFloat!(newValues[new[] { b, h, s, d }]);
+                        float ak = Math.Abs(k);
+                        float av = Math.Abs(v);
+                        if (ak > maxAbsK) maxAbsK = ak;
+                        if (av > maxAbsV) maxAbsV = av;
+                    }
+                }
+            }
+        }
+
+        EnsureInt8ScaleForLayer(layerIndex, isKey: true, maxAbs: maxAbsK);
+        EnsureInt8ScaleForLayer(layerIndex, isKey: false, maxAbs: maxAbsV);
+    }
+
+    private void EnsureInt8ScaleForLayer(int layerIndex, bool isKey, float maxAbs)
+    {
+        float requiredScale = maxAbs > 0f ? (maxAbs / 127f) : 1f;
+        if (requiredScale <= 0f) requiredScale = 1f;
+
+        float currentScale = isKey ? _keyScaleInt8![layerIndex] : _valueScaleInt8![layerIndex];
+
+        if (currentScale <= 0f)
+        {
+            if (isKey) _keyScaleInt8![layerIndex] = requiredScale;
+            else _valueScaleInt8![layerIndex] = requiredScale;
+            return;
+        }
+
+        if (requiredScale > currentScale)
+        {
+            RescaleInt8Layer(layerIndex, isKey, currentScale, requiredScale);
+            if (isKey) _keyScaleInt8![layerIndex] = requiredScale;
+            else _valueScaleInt8![layerIndex] = requiredScale;
+        }
+    }
+
+    private void RescaleInt8Layer(int layerIndex, bool isKey, float oldScale, float newScale)
+    {
+        if (oldScale <= 0f || newScale <= 0f || Math.Abs(newScale - oldScale) < float.Epsilon)
+        {
+            return;
+        }
+
+        var cache = isKey ? _keyCacheInt8![layerIndex] : _valueCacheInt8![layerIndex];
+
+        for (int b = 0; b < _sequenceLengths[layerIndex].Length; b++)
+        {
+            int seqLen = _sequenceLengths[layerIndex][b];
+            for (int h = 0; h < _config.NumHeads; h++)
+            {
+                for (int s = 0; s < seqLen; s++)
+                {
+                    for (int d = 0; d < _config.HeadDimension; d++)
+                    {
+                        sbyte q = cache[new[] { b, h, s, d }];
+                        float value = q * oldScale;
+                        cache[new[] { b, h, s, d }] = QuantizeToInt8(value, newScale);
+                    }
+                }
+            }
+        }
+    }
+
+    private static sbyte QuantizeToInt8(float value, float scale)
+    {
+        if (scale <= 0f) scale = 1f;
+        int q = (int)Math.Round(value / scale);
+        if (q > 127) q = 127;
+        if (q < -127) q = -127;
+        return (sbyte)q;
+    }
+
+    private static float DequantizeInt8(sbyte value, float scale)
+    {
+        if (scale <= 0f) scale = 1f;
+        return value * scale;
     }
 
     private void ValidateLayerIndex(int layerIndex)
@@ -480,7 +778,7 @@ private void ValidateInputShapes(Tensor<T> keys, Tensor<T> values)
 
     private void EnsureCacheAllocated(int layerIndex)
     {
-        if (_keyCache[layerIndex] == null)
+        if (!IsLayerAllocated(layerIndex))
         {
             var shape = new[]
             {
@@ -490,8 +788,23 @@ private void EnsureCacheAllocated(int layerIndex)
                 _config.HeadDimension
             };
 
-            _keyCache[layerIndex] = new Tensor<T>(shape);
-            _valueCache[layerIndex] = new Tensor<T>(shape);
+            if (_useFp16Storage)
+            {
+                _keyCacheFp16![layerIndex] = new Tensor<Half>(shape);
+                _valueCacheFp16![layerIndex] = new Tensor<Half>(shape);
+            }
+            else if (_useInt8Storage)
+            {
+                _keyCacheInt8![layerIndex] = new Tensor<sbyte>(shape);
+                _valueCacheInt8![layerIndex] = new Tensor<sbyte>(shape);
+                _keyScaleInt8![layerIndex] = 0f;
+                _valueScaleInt8![layerIndex] = 0f;
+            }
+            else
+            {
+                _keyCache[layerIndex] = new Tensor<T>(shape);
+                _valueCache[layerIndex] = new Tensor<T>(shape);
+            }
         }
     }
 
@@ -499,7 +812,7 @@ private void HandleSlidingWindowEviction(int layerIndex, int batchSize, int newS
     {
         for (int b = 0; b < batchSize; b++)
         {
-            int currentLen = _sequenceLengths[b];
+            int currentLen = _sequenceLengths[layerIndex][b];
             int newLen = currentLen + newSeqLen;
 
             if (newLen > _config.WindowSize)
@@ -517,18 +830,43 @@ private void HandleSlidingWindowEviction(int layerIndex, int batchSize, int newS
                             int srcPos = evictCount + s;
                             for (int d = 0; d < _config.HeadDimension; d++)
                             {
-                                _keyCache[layerIndex][new[] { b, h, s, d }] =
-                                    _keyCache[layerIndex][new[] { b, h, srcPos, d }];
-                                _valueCache[layerIndex][new[] { b, h, s, d }] =
-                                    _valueCache[layerIndex][new[] { b, h, srcPos, d }];
+                                if (_useInt8Storage)
+                                {
+                                    _keyCacheInt8![layerIndex][new[] { b, h, s, d }] =
+                                        _keyCacheInt8![layerIndex][new[] { b, h, srcPos, d }];
+                                    _valueCacheInt8![layerIndex][new[] { b, h, s, d }] =
+                                        _valueCacheInt8![layerIndex][new[] { b, h, srcPos, d }];
+                                }
+                                else if (_useFp16Storage)
+                                {
+                                    _keyCacheFp16![layerIndex][new[] { b, h, s, d }] =
+                                        _keyCacheFp16![layerIndex][new[] { b, h, srcPos, d }];
+                                    _valueCacheFp16![layerIndex][new[] { b, h, s, d }] =
+                                        _valueCacheFp16![layerIndex][new[] { b, h, srcPos, d }];
+                                }
+                                else
+                                {
+                                    _keyCache[layerIndex][new[] { b, h, s, d }] =
+                                        _keyCache[layerIndex][new[] { b, h, srcPos, d }];
+                                    _valueCache[layerIndex][new[] { b, h, s, d }] =
+                                        _valueCache[layerIndex][new[] { b, h, srcPos, d }];
+                                }
                             }
                         }
                     }
                 }
 
-                _sequenceLengths[b] = keepCount;
+                _sequenceLengths[layerIndex][b] = keepCount;
                 _evictions += evictCount;
             }
         }
     }
+
+    private bool IsLayerAllocated(int layerIndex)
+    {
+        if (_useInt8Storage)
+            return _keyCacheInt8![layerIndex] != null;
+
+        return _useFp16Storage ? _keyCacheFp16![layerIndex] != null : _keyCache[layerIndex] != null;
+    }
 }
diff --git a/src/Inference/KVCacheConfig.cs b/src/Inference/KVCacheConfig.cs
index fe549f3d0..b7a3448c4 100644
--- a/src/Inference/KVCacheConfig.cs
+++ b/src/Inference/KVCacheConfig.cs
@@ -21,7 +21,7 @@ namespace AiDotNet.Inference;
 /// which don't change once computed for a given position.
 /// </para>
 /// </remarks>
-public class KVCacheConfig
+internal class KVCacheConfig
 {
     /// <summary>
     /// Maximum sequence length the cache can hold.
@@ -113,14 +113,15 @@ public long EstimateMemoryBytes()
         long elementsPerLayer = (long)MaxBatchSize * NumHeads * MaxSequenceLength * HeadDimension;
         long totalElements = elementsPerLayer * NumLayers * 2; // K and V
 
-        int bytesPerElement = DataType switch
-        {
-            CacheDataType.Float16 => 2,
-            CacheDataType.Float32 => 4,
-            CacheDataType.Float64 => 8,
-            CacheDataType.BFloat16 => 2,
-            _ => 4
-        };
+    int bytesPerElement = DataType switch
+    {
+        CacheDataType.Int8 => 1,
+        CacheDataType.Float16 => 2,
+        CacheDataType.Float32 => 4,
+        CacheDataType.Float64 => 8,
+        CacheDataType.BFloat16 => 2,
+        _ => 4
+    };
 
         return totalElements * bytesPerElement;
     }
@@ -187,8 +188,11 @@ public static KVCacheConfig ForModel(string modelSize)
 /// <summary>
 /// Data types supported for KV-Cache storage.
 /// </summary>
-public enum CacheDataType
+internal enum CacheDataType
 {
+    /// <summary>Signed 8-bit integer quantization (int8) with scaling.</summary>
+    Int8,
+
     /// <summary>Half precision (16-bit float).</summary>
     Float16,
 
@@ -205,7 +209,7 @@ public enum CacheDataType
 /// <summary>
 /// Device placement options for KV-Cache.
 /// </summary>
-public enum CacheDevice
+internal enum CacheDevice
 {
     /// <summary>Automatically select based on available hardware.</summary>
     Auto,
diff --git a/src/Inference/PagedAttention/BlockManager.cs b/src/Inference/PagedAttention/BlockManager.cs
index 582c7ac6e..89614b080 100644
--- a/src/Inference/PagedAttention/BlockManager.cs
+++ b/src/Inference/PagedAttention/BlockManager.cs
@@ -24,7 +24,7 @@ namespace AiDotNet.Inference.PagedAttention;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for tensor computations.</typeparam>
-public class BlockManager<T>
+internal class BlockManager<T>
 {
     private readonly BlockManagerConfig _config;
     private readonly object _lock = new();
@@ -338,7 +338,7 @@ public void Reset()
 /// <summary>
 /// Configuration for the block manager.
 /// </summary>
-public class BlockManagerConfig
+internal class BlockManagerConfig
 {
     /// <summary>
     /// Number of tokens per block.
@@ -434,7 +434,7 @@ public static BlockManagerConfig ForModel(string modelName, long availableMemory
 /// <summary>
 /// Statistics about the block manager state.
 /// </summary>
-public class BlockManagerStats
+internal class BlockManagerStats
 {
     /// <summary>Total number of blocks in the pool.</summary>
     public int TotalBlocks { get; set; }
diff --git a/src/Inference/PagedAttention/BlockTable.cs b/src/Inference/PagedAttention/BlockTable.cs
index 3533f0ada..8c7ec16d3 100644
--- a/src/Inference/PagedAttention/BlockTable.cs
+++ b/src/Inference/PagedAttention/BlockTable.cs
@@ -21,7 +21,7 @@ namespace AiDotNet.Inference.PagedAttention;
 /// - Swapping to disk (move a chapter to storage, update the table)
 /// </para>
 /// </remarks>
-public class BlockTable
+internal class BlockTable
 {
     private readonly int _blockSize;
     private readonly List<int> _physicalBlockIds;
@@ -236,7 +236,7 @@ public override string ToString()
 /// Manages block tables for multiple sequences.
 /// </summary>
 /// <typeparam name="T">The numeric type.</typeparam>
-public class BlockTableManager<T>
+internal class BlockTableManager<T>
 {
     private readonly BlockManager<T> _blockManager;
     private readonly Dictionary<long, BlockTable> _blockTables;
diff --git a/src/Inference/PagedAttention/PagedAttentionKernel.cs b/src/Inference/PagedAttention/PagedAttentionKernel.cs
index ccca7cf26..356e27e73 100644
--- a/src/Inference/PagedAttention/PagedAttentionKernel.cs
+++ b/src/Inference/PagedAttention/PagedAttentionKernel.cs
@@ -1,4 +1,6 @@
+using System.Buffers;
 using System.Runtime.CompilerServices;
+using AiDotNet.Inference.Quantization;
 
 namespace AiDotNet.Inference.PagedAttention;
 
@@ -23,7 +25,7 @@ namespace AiDotNet.Inference.PagedAttention;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for tensor computations.</typeparam>
-public class PagedAttentionKernel<T>
+internal class PagedAttentionKernel<T>
 {
     private readonly PagedKVCache<T> _kvCache;
     private readonly PagedAttentionConfig _config;
@@ -300,15 +302,22 @@ public void UpdateCache(
         int position,
         int layer)
     {
-        // Ensure capacity
-        if (!_kvCache.HasCapacityFor(sequenceId, 1))
+        // Ensure logical length and capacity for this position.
+        int requiredLength = position + 1;
+        int currentLength = _kvCache.GetSequenceLength(sequenceId);
+        if (requiredLength > currentLength)
         {
-            _kvCache.ExtendSequence(sequenceId, 1);
+            int additionalTokens = requiredLength - currentLength;
+            if (!_kvCache.ExtendSequence(sequenceId, additionalTokens))
+            {
+                throw new InvalidOperationException(
+                    $"Failed to extend PagedKVCache sequence {sequenceId} to length {requiredLength}.");
+            }
         }
 
         // Convert and write
-        var keyT = ConvertSpan(key);
-        var valueT = ConvertSpan(value);
+        var keyT = ConvertArray(key);
+        var valueT = ConvertArray(value);
 
         _kvCache.WriteKey(sequenceId, position, layer, keyT);
         _kvCache.WriteValue(sequenceId, position, layer, valueT);
@@ -343,27 +352,94 @@ public void Forward(
         int projDim = numHeads * headDim;
         float scale = 1.0f / MathF.Sqrt(headDim);
 
-        // Project Q, K, V
-        var query = new float[projDim];
-        var key = new float[projDim];
-        var value = new float[projDim];
+        var pool = ArrayPool<float>.Shared;
+        var queryBuf = pool.Rent(projDim);
+        var keyBuf = pool.Rent(projDim);
+        var valueBuf = pool.Rent(projDim);
+        var attnBuf = pool.Rent(projDim);
 
-        // Q = hidden @ wQ
-        MatVecMul(hiddenStates, wQ, query.AsSpan(), hiddenDim, projDim);
-        // K = hidden @ wK
-        MatVecMul(hiddenStates, wK, key.AsSpan(), hiddenDim, projDim);
-        // V = hidden @ wV
-        MatVecMul(hiddenStates, wV, value.AsSpan(), hiddenDim, projDim);
+        try
+        {
+            var query = queryBuf.AsSpan(0, projDim);
+            var key = keyBuf.AsSpan(0, projDim);
+            var value = valueBuf.AsSpan(0, projDim);
+            var attnOutput = attnBuf.AsSpan(0, projDim);
+
+            // Q = hidden @ wQ
+            MatVecMul(hiddenStates, wQ, query, hiddenDim, projDim);
+            // K = hidden @ wK
+            MatVecMul(hiddenStates, wK, key, hiddenDim, projDim);
+            // V = hidden @ wV
+            MatVecMul(hiddenStates, wV, value, hiddenDim, projDim);
+
+            // Update cache with new K, V
+            UpdateCache(key, value, sequenceId, position, layer);
+
+            // Compute attention
+            ComputeTiledPagedAttention(query, sequenceId, layer, attnOutput, scale);
+
+            // Project output: out = attn @ wO
+            MatVecMul(attnOutput, wO, output, projDim, hiddenDim);
+        }
+        finally
+        {
+            pool.Return(queryBuf);
+            pool.Return(keyBuf);
+            pool.Return(valueBuf);
+            pool.Return(attnBuf);
+        }
+    }
+
+    public void ForwardQuantized(
+        ReadOnlySpan<float> hiddenStates,
+        in Int8WeightOnlyQuantization.QuantizedWeights wQ,
+        in Int8WeightOnlyQuantization.QuantizedWeights wK,
+        in Int8WeightOnlyQuantization.QuantizedWeights wV,
+        in Int8WeightOnlyQuantization.QuantizedWeights wO,
+        long sequenceId,
+        int position,
+        int layer,
+        Span<float> output)
+    {
+        int hiddenDim = hiddenStates.Length;
+        int numHeads = _config.NumHeads;
+        int headDim = _config.HeadDimension;
+        int projDim = numHeads * headDim;
+        float scale = 1.0f / MathF.Sqrt(headDim);
 
-        // Update cache with new K, V
-        UpdateCache(key.AsSpan(), value.AsSpan(), sequenceId, position, layer);
+        if (wQ.Cols != hiddenDim || wK.Cols != hiddenDim || wV.Cols != hiddenDim || wO.Cols != projDim)
+        {
+            throw new ArgumentException("Quantized weight dimensions do not match expected shapes.");
+        }
 
-        // Compute attention
-        var attnOutput = new float[projDim];
-        ComputeTiledPagedAttention(query.AsSpan(), sequenceId, layer, attnOutput.AsSpan(), scale);
+        var pool = ArrayPool<float>.Shared;
+        var queryBuf = pool.Rent(projDim);
+        var keyBuf = pool.Rent(projDim);
+        var valueBuf = pool.Rent(projDim);
+        var attnBuf = pool.Rent(projDim);
 
-        // Project output: out = attn @ wO
-        MatVecMul(attnOutput.AsSpan(), wO, output, projDim, hiddenDim);
+        try
+        {
+            var query = queryBuf.AsSpan(0, projDim);
+            var key = keyBuf.AsSpan(0, projDim);
+            var value = valueBuf.AsSpan(0, projDim);
+            var attnOutput = attnBuf.AsSpan(0, projDim);
+
+            MatVecMulInt8(hiddenStates, wQ, query);
+            MatVecMulInt8(hiddenStates, wK, key);
+            MatVecMulInt8(hiddenStates, wV, value);
+
+            UpdateCache(key, value, sequenceId, position, layer);
+            ComputeTiledPagedAttention(query, sequenceId, layer, attnOutput, scale);
+            MatVecMulInt8(attnOutput, wO, output);
+        }
+        finally
+        {
+            pool.Return(queryBuf);
+            pool.Return(keyBuf);
+            pool.Return(valueBuf);
+            pool.Return(attnBuf);
+        }
     }
 
     private static void MatVecMul(ReadOnlySpan<float> vec, ReadOnlySpan<float> mat, Span<float> output, int inDim, int outDim)
@@ -381,6 +457,32 @@ private static void MatVecMul(ReadOnlySpan<float> vec, ReadOnlySpan<float> mat,
         }
     }
 
+    private static void MatVecMulInt8(ReadOnlySpan<float> vec, in Int8WeightOnlyQuantization.QuantizedWeights mat, Span<float> output)
+    {
+        int rows = mat.Rows;
+        int cols = mat.Cols;
+
+        if (vec.Length != cols)
+            throw new ArgumentException("Input vector length must match quantized matrix column count.", nameof(vec));
+        if (output.Length < rows)
+            throw new ArgumentException("Output span too small for quantized matvec.", nameof(output));
+
+        var weights = mat.Weights;
+        var scales = mat.Scales;
+
+        for (int r = 0; r < rows; r++)
+        {
+            int baseIdx = r * cols;
+            float sum = 0f;
+            for (int c = 0; c < cols; c++)
+            {
+                sum += weights[baseIdx + c] * vec[c];
+            }
+
+            output[r] = sum * scales[r];
+        }
+    }
+
     private static float ToFloat(T value)
     {
         if (typeof(T) == typeof(float))
@@ -405,15 +507,13 @@ private static T FromFloat(float value)
         return (T)Convert.ChangeType(value, typeof(T))!;
     }
 
-    private static ReadOnlySpan<T> ConvertSpan(ReadOnlySpan<float> source)
+    private static T[] ConvertArray(ReadOnlySpan<float> source)
     {
         if (typeof(T) == typeof(float))
         {
-            // Safe: We've verified T == float at runtime
-            // Reinterpret the array using object cast
-            var floatArray = source.ToArray();
-            var tArray = (T[])(object)floatArray;
-            return new ReadOnlySpan<T>(tArray);
+            // Safe: runtime-verified T == float.
+            // Return a rooted array so GC cannot collect it while spans are in use.
+            return (T[])(object)source.ToArray();
         }
 
         var result = new T[source.Length];
@@ -428,7 +528,7 @@ private static ReadOnlySpan<T> ConvertSpan(ReadOnlySpan<float> source)
 /// <summary>
 /// Configuration for paged attention kernel.
 /// </summary>
-public class PagedAttentionConfig
+internal class PagedAttentionConfig
 {
     /// <summary>Number of attention heads.</summary>
     public int NumHeads { get; set; } = 32;
@@ -453,7 +553,7 @@ public class PagedAttentionConfig
 /// Integrates PagedAttention with ContinuousBatcher for high-throughput serving.
 /// </summary>
 /// <typeparam name="T">Numeric type.</typeparam>
-public class PagedAttentionServer<T> : IDisposable
+internal class PagedAttentionServer<T> : IDisposable
 {
     private readonly PagedKVCache<T> _kvCache;
     private readonly PagedAttentionKernel<T> _kernel;
diff --git a/src/Inference/PagedAttention/PagedKVCache.cs b/src/Inference/PagedAttention/PagedKVCache.cs
index 52af9c28e..bcffc9863 100644
--- a/src/Inference/PagedAttention/PagedKVCache.cs
+++ b/src/Inference/PagedAttention/PagedKVCache.cs
@@ -23,7 +23,7 @@ namespace AiDotNet.Inference.PagedAttention;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for tensor computations.</typeparam>
-public class PagedKVCache<T> : IDisposable
+internal class PagedKVCache<T> : IDisposable
 {
     private readonly PagedKVCacheConfig _config;
     private readonly BlockManager<T> _blockManager;
@@ -87,7 +87,21 @@ public PagedKVCache(PagedKVCacheConfig config)
 
         // Allocate physical storage
         long totalElements = _elementsPerBlock * config.NumBlocks;
-        _kvStorage = new T[totalElements];
+        if (totalElements > int.MaxValue)
+            throw new ArgumentOutOfRangeException(nameof(config), $"PagedKVCache requires totalElements <= {int.MaxValue}, but got {totalElements}. Reduce NumBlocks or memory size.");
+
+        try
+        {
+            _kvStorage = new T[(int)totalElements];
+        }
+        catch (OutOfMemoryException ex)
+        {
+            throw new InvalidOperationException(
+                $"Failed to allocate PagedKVCache storage ({totalElements} elements). " +
+                "This can happen when requesting very large contiguous memory blocks (e.g., multi-GB) in environments with tighter single-object limits. " +
+                "Reduce available memory/NumBlocks or use a runtime that supports larger allocations.",
+                ex);
+        }
 
         _sequenceMetadata = new Dictionary<long, SequenceMetadata>();
     }
@@ -120,7 +134,10 @@ public bool AllocateSequence(long sequenceId, int initialTokens)
             if (_sequenceMetadata.ContainsKey(sequenceId))
                 return false;
 
-            int blocksNeeded = _blockManager.BlocksForTokens(initialTokens);
+            // Allocate at least one block up-front so the first token write (position 0) always has capacity.
+            // Actual "current length" bookkeeping still starts at initialTokens.
+            int blocksNeeded = _blockManager.BlocksForTokens(Math.Max(1, initialTokens));
+            blocksNeeded = Math.Max(1, blocksNeeded);
             var table = _blockTableManager.CreateBlockTable(sequenceId, blocksNeeded);
 
             if (table == null)
@@ -444,7 +461,7 @@ private class SequenceMetadata
 /// <summary>
 /// Configuration for PagedKVCache.
 /// </summary>
-public class PagedKVCacheConfig
+internal class PagedKVCacheConfig
 {
     /// <summary>
     /// Number of tokens per block.
@@ -521,7 +538,7 @@ public static PagedKVCacheConfig ForModel(string modelName, long availableBytes,
 /// <summary>
 /// Statistics about the paged KV cache.
 /// </summary>
-public class PagedKVCacheStats
+internal class PagedKVCacheStats
 {
     /// <summary>Number of active sequences.</summary>
     public int ActiveSequences { get; set; }
diff --git a/src/Inference/PagedCachedMultiHeadAttention.cs b/src/Inference/PagedCachedMultiHeadAttention.cs
new file mode 100644
index 000000000..c139d17e5
--- /dev/null
+++ b/src/Inference/PagedCachedMultiHeadAttention.cs
@@ -0,0 +1,487 @@
+using AiDotNet.Inference.PagedAttention;
+using AiDotNet.NeuralNetworks.Attention;
+using AiDotNet.NeuralNetworks.Layers;
+using AiDotNet.Tensors.LinearAlgebra;
+using System.Buffers;
+using AiDotNet.Inference.Quantization;
+
+namespace AiDotNet.Inference;
+
+/// <summary>
+/// Multi-head attention layer backed by PagedKVCache for efficient multi-sequence inference.
+/// </summary>
+/// <remarks>
+/// This layer is intended for inference-time usage. When <see cref="InferenceMode"/> is enabled
+/// and a <see cref="Kernel"/> is attached, it uses PagedKVCache to avoid reallocations and
+/// allow many independent sequences to grow efficiently.
+/// <para>
+/// <b>Limitation:</b> This layer currently supports <c>batchSize == 1</c> per sequence to avoid cache mixing.
+/// For concurrent serving, create one sequence per request (distinct <see cref="SequenceId"/> values).
+/// </para>
+/// </remarks>
+internal class PagedCachedMultiHeadAttention<T> : LayerBase<T>, AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata
+{
+    private readonly int _headCount;
+    private readonly int _headDimension;
+    private readonly int _embeddingDimension;
+    private readonly bool _useCausalMask;
+
+    private Matrix<T> _queryWeights;
+    private Matrix<T> _keyWeights;
+    private Matrix<T> _valueWeights;
+    private Matrix<T> _outputWeights;
+    private Vector<T> _outputBias;
+
+    private Tensor<T>? _lastInput;
+    private Tensor<T>? _lastOutput;
+    private int _currentPosition;
+
+    private readonly FlashAttentionConfig _flashConfig;
+
+    private readonly object _kernelWeightsLock = new();
+    private float[]? _cachedWQ;
+    private float[]? _cachedWK;
+    private float[]? _cachedWV;
+    private float[]? _cachedWO;
+    private Int8WeightOnlyQuantization.QuantizedWeights? _cachedWQInt8;
+    private Int8WeightOnlyQuantization.QuantizedWeights? _cachedWKInt8;
+    private Int8WeightOnlyQuantization.QuantizedWeights? _cachedWVInt8;
+    private Int8WeightOnlyQuantization.QuantizedWeights? _cachedWOInt8;
+
+    internal bool EnableWeightOnlyQuantization { get; set; }
+
+    /// <summary>
+    /// Gets whether this layer supports training.
+    /// </summary>
+    public override bool SupportsTraining => false;
+
+    /// <summary>
+    /// Gets the number of attention heads.
+    /// </summary>
+    public int HeadCount => _headCount;
+
+    /// <summary>
+    /// Gets the dimension of each attention head.
+    /// </summary>
+    public int HeadDimension => _headDimension;
+
+    /// <summary>
+    /// Gets or sets the layer index for KV-cache addressing.
+    /// </summary>
+    public int LayerIndex { get; set; }
+
+    /// <summary>
+    /// Gets or sets whether the layer is in inference mode (uses paged cache).
+    /// </summary>
+    public bool InferenceMode { get; set; }
+
+    /// <summary>
+    /// Gets or sets the PagedAttention kernel (owns the paged cache).
+    /// </summary>
+    public PagedAttentionKernel<T>? Kernel { get; set; }
+
+    /// <summary>
+    /// Gets or sets the sequence ID used for this layer's cache operations.
+    /// </summary>
+    public long SequenceId { get; set; }
+
+    public PagedCachedMultiHeadAttention(
+        int sequenceLength,
+        int embeddingDimension,
+        int headCount,
+        bool useCausalMask,
+        IActivationFunction<T>? activationFunction = null)
+        : base(
+            [sequenceLength, embeddingDimension],
+            [sequenceLength, embeddingDimension],
+            activationFunction ?? new IdentityActivation<T>())
+    {
+        if (embeddingDimension % headCount != 0)
+        {
+            throw new ArgumentException(
+                $"Embedding dimension ({embeddingDimension}) must be divisible by head count ({headCount}).",
+                nameof(headCount));
+        }
+
+        _embeddingDimension = embeddingDimension;
+        _headCount = headCount;
+        _headDimension = embeddingDimension / headCount;
+        _useCausalMask = useCausalMask;
+
+        _queryWeights = new Matrix<T>(embeddingDimension, embeddingDimension);
+        _keyWeights = new Matrix<T>(embeddingDimension, embeddingDimension);
+        _valueWeights = new Matrix<T>(embeddingDimension, embeddingDimension);
+        _outputWeights = new Matrix<T>(embeddingDimension, embeddingDimension);
+        _outputBias = new Vector<T>(embeddingDimension);
+
+        _flashConfig = FlashAttentionConfig.Default;
+        _flashConfig.UseCausalMask = useCausalMask;
+    }
+
+    public override Tensor<T> Forward(Tensor<T> input)
+    {
+        _lastInput = input;
+
+        if (!InferenceMode || Kernel == null)
+        {
+            var statelessOutput = ForwardStateless(input);
+            _lastOutput = statelessOutput;
+            return statelessOutput;
+        }
+
+        // Inference mode: update cache and compute attention token-by-token.
+        // This supports both prefill (seqLen>1) and decode (seqLen==1) by iterating tokens.
+        if (input.Shape.Length < 3)
+        {
+            throw new ArgumentException("Expected input shape [batch, seqLen, embeddingDim].", nameof(input));
+        }
+
+        int batchSize = input.Shape[0];
+        int seqLen = input.Shape[1];
+        int embDim = input.Shape[2];
+
+        if (embDim != _embeddingDimension)
+        {
+            throw new ArgumentException($"Expected embeddingDim={_embeddingDimension}, got {embDim}.", nameof(input));
+        }
+
+        if (batchSize != 1)
+        {
+            // PagedAttentionKernel supports batched attention, but this layer's state model is per-sequence.
+            // Keep it strict for now to avoid cache mixing.
+            throw new NotSupportedException("PagedCachedMultiHeadAttention currently supports batchSize==1 per sequence.");
+        }
+
+        var output = new Tensor<T>([batchSize, seqLen, embDim]);
+
+        // Materialize weights to float spans for the paged kernel.
+        // Note: This is intentionally conservative and prioritizes correctness.
+        // PagedAttentionKernel's MatVecMul expects matrices stored as [outDim, inDim] row-major.
+        // Our weights are stored as [inDim, outDim], so we pass a transposed layout.
+        EnsureKernelWeightCache();
+        var wQ = _cachedWQ!;
+        var wK = _cachedWK!;
+        var wV = _cachedWV!;
+        var wO = _cachedWO!;
+
+        // Process each token sequentially to ensure causal behavior during prefill.
+        var pool = ArrayPool<float>.Shared;
+        var hiddenBuffer = pool.Rent(embDim);
+        var tokenOutBuffer = pool.Rent(embDim);
+
+        try
+        {
+            var hidden = hiddenBuffer.AsSpan(0, embDim);
+            var tokenOut = tokenOutBuffer.AsSpan(0, embDim);
+
+            for (int t = 0; t < seqLen; t++)
+            {
+                for (int d = 0; d < embDim; d++)
+                {
+                    hidden[d] = Convert.ToSingle(input[0, t, d]);
+                }
+
+                if (EnableWeightOnlyQuantization &&
+                    typeof(T) == typeof(float) &&
+                    _cachedWQInt8.HasValue &&
+                    _cachedWKInt8.HasValue &&
+                    _cachedWVInt8.HasValue &&
+                    _cachedWOInt8.HasValue)
+                {
+                    Kernel.ForwardQuantized(
+                        hiddenStates: hidden,
+                        wQ: _cachedWQInt8.Value,
+                        wK: _cachedWKInt8.Value,
+                        wV: _cachedWVInt8.Value,
+                        wO: _cachedWOInt8.Value,
+                        sequenceId: SequenceId,
+                        position: _currentPosition,
+                        layer: LayerIndex,
+                        output: tokenOut);
+                }
+                else
+                {
+                    Kernel.Forward(
+                        hiddenStates: hidden,
+                        wQ: wQ,
+                        wK: wK,
+                        wV: wV,
+                        wO: wO,
+                        sequenceId: SequenceId,
+                        position: _currentPosition,
+                        layer: LayerIndex,
+                        output: tokenOut);
+                }
+
+                // Add bias and activation.
+                for (int d = 0; d < embDim; d++)
+                {
+                    T value = NumOps.FromDouble(tokenOut[d]);
+                    value = NumOps.Add(value, _outputBias[d]);
+                    output[0, t, d] = ScalarActivation!.Activate(value);
+                }
+
+                _currentPosition++;
+            }
+        }
+        finally
+        {
+            pool.Return(hiddenBuffer);
+            pool.Return(tokenOutBuffer);
+        }
+
+        _lastOutput = output;
+        return output;
+    }
+
+    private Tensor<T> ForwardStateless(Tensor<T> input)
+    {
+        // Stateless fallback using FlashAttention.
+        // Compute Q,K,V projections.
+        var (q, k, v) = ComputeQkv(input);
+
+        // FlashAttention expects [B, H, S, D]
+        var qh = SplitHeads(q);
+        var kh = SplitHeads(k);
+        var vh = SplitHeads(v);
+
+        var (attn, _) = FlashAttention<T>.Forward(qh, kh, vh, _flashConfig);
+
+        // Merge heads back to [B, S, E]
+        var merged = MergeHeads(attn);
+
+        // Output projection + bias + activation.
+        // Use the tensor/matrix multiply path to leverage optimized kernels.
+        var projected = merged.Multiply(_outputWeights);
+
+        int batch = projected.Shape[0];
+        int seqLen = projected.Shape[1];
+        var output = new Tensor<T>([batch, seqLen, _embeddingDimension]);
+
+        for (int b = 0; b < batch; b++)
+        {
+            for (int s = 0; s < seqLen; s++)
+            {
+                for (int o = 0; o < _embeddingDimension; o++)
+                {
+                    T value = NumOps.Add(projected[b, s, o], _outputBias[o]);
+                    output[b, s, o] = ScalarActivation!.Activate(value);
+                }
+            }
+        }
+
+        return output;
+    }
+
+    private (Tensor<T> Q, Tensor<T> K, Tensor<T> V) ComputeQkv(Tensor<T> input)
+    {
+        // Use the tensor/matrix multiply path to leverage optimized kernels.
+        var q = input.Multiply(_queryWeights);
+        var k = input.Multiply(_keyWeights);
+        var v = input.Multiply(_valueWeights);
+        return (q, k, v);
+    }
+
+    private Tensor<T> SplitHeads(Tensor<T> x)
+    {
+        int batchSize = x.Shape[0];
+        int seqLen = x.Shape[1];
+        var reshaped = new Tensor<T>([batchSize, _headCount, seqLen, _headDimension]);
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            for (int s = 0; s < seqLen; s++)
+            {
+                for (int h = 0; h < _headCount; h++)
+                {
+                    int baseOffset = h * _headDimension;
+                    for (int d = 0; d < _headDimension; d++)
+                    {
+                        reshaped[b, h, s, d] = x[b, s, baseOffset + d];
+                    }
+                }
+            }
+        }
+
+        return reshaped;
+    }
+
+    private Tensor<T> MergeHeads(Tensor<T> x)
+    {
+        int batchSize = x.Shape[0];
+        int seqLen = x.Shape[2];
+        var merged = new Tensor<T>([batchSize, seqLen, _embeddingDimension]);
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            for (int s = 0; s < seqLen; s++)
+            {
+                for (int h = 0; h < _headCount; h++)
+                {
+                    int baseOffset = h * _headDimension;
+                    for (int d = 0; d < _headDimension; d++)
+                    {
+                        merged[b, s, baseOffset + d] = x[b, h, s, d];
+                    }
+                }
+            }
+        }
+
+        return merged;
+    }
+
+    private static float[] MatrixToFloatForKernel(Matrix<T> matrix)
+    {
+        int inDim = matrix.Rows;
+        int outDim = matrix.Columns;
+        var data = new float[outDim * inDim];
+
+        for (int o = 0; o < outDim; o++)
+        {
+            int rowOffset = o * inDim;
+            for (int i = 0; i < inDim; i++)
+            {
+                data[rowOffset + i] = Convert.ToSingle(matrix[i, o]);
+            }
+        }
+
+        return data;
+    }
+
+    public override Vector<T> GetParameters()
+    {
+        int totalParams = _queryWeights.Rows * _queryWeights.Columns * 4 + _outputBias.Length;
+        var parameters = new Vector<T>(totalParams);
+        int index = 0;
+
+        foreach (var matrix in new[] { _queryWeights, _keyWeights, _valueWeights, _outputWeights })
+        {
+            for (int i = 0; i < matrix.Rows; i++)
+            {
+                for (int j = 0; j < matrix.Columns; j++)
+                {
+                    parameters[index++] = matrix[i, j];
+                }
+            }
+        }
+
+        for (int i = 0; i < _outputBias.Length; i++)
+        {
+            parameters[index++] = _outputBias[i];
+        }
+
+        return parameters;
+    }
+
+    public override void SetParameters(Vector<T> parameters)
+    {
+        int expectedParams = _queryWeights.Rows * _queryWeights.Columns * 4 + _outputBias.Length;
+        if (parameters.Length != expectedParams)
+        {
+            throw new ArgumentException($"Expected {expectedParams} parameters, got {parameters.Length}");
+        }
+
+        int index = 0;
+
+        foreach (var matrix in new[] { _queryWeights, _keyWeights, _valueWeights, _outputWeights })
+        {
+            for (int i = 0; i < matrix.Rows; i++)
+            {
+                for (int j = 0; j < matrix.Columns; j++)
+                {
+                    matrix[i, j] = parameters[index++];
+                }
+            }
+        }
+
+        for (int i = 0; i < _outputBias.Length; i++)
+        {
+            _outputBias[i] = parameters[index++];
+        }
+
+        InvalidateKernelWeightCache();
+    }
+
+    private void EnsureKernelWeightCache()
+    {
+        if (_cachedWQ != null && _cachedWK != null && _cachedWV != null && _cachedWO != null)
+        {
+            return;
+        }
+
+        lock (_kernelWeightsLock)
+        {
+            _cachedWQ ??= MatrixToFloatForKernel(_queryWeights);
+            _cachedWK ??= MatrixToFloatForKernel(_keyWeights);
+            _cachedWV ??= MatrixToFloatForKernel(_valueWeights);
+            _cachedWO ??= MatrixToFloatForKernel(_outputWeights);
+
+            if (EnableWeightOnlyQuantization && typeof(T) == typeof(float))
+            {
+                int projDim = _headCount * _headDimension;
+                int hiddenDim = _embeddingDimension;
+
+                _cachedWQInt8 = Int8WeightOnlyQuantization.QuantizePerRow(_cachedWQ, projDim, hiddenDim);
+                _cachedWKInt8 = Int8WeightOnlyQuantization.QuantizePerRow(_cachedWK, projDim, hiddenDim);
+                _cachedWVInt8 = Int8WeightOnlyQuantization.QuantizePerRow(_cachedWV, projDim, hiddenDim);
+                _cachedWOInt8 = Int8WeightOnlyQuantization.QuantizePerRow(_cachedWO, hiddenDim, projDim);
+            }
+            else
+            {
+                _cachedWQInt8 = null;
+                _cachedWKInt8 = null;
+                _cachedWVInt8 = null;
+                _cachedWOInt8 = null;
+            }
+        }
+    }
+
+    private void InvalidateKernelWeightCache()
+    {
+        lock (_kernelWeightsLock)
+        {
+            _cachedWQ = null;
+            _cachedWK = null;
+            _cachedWV = null;
+            _cachedWO = null;
+            _cachedWQInt8 = null;
+            _cachedWKInt8 = null;
+            _cachedWVInt8 = null;
+            _cachedWOInt8 = null;
+        }
+    }
+
+    public override void ResetState()
+    {
+        _lastInput = null;
+        _lastOutput = null;
+        _currentPosition = 0;
+    }
+
+    public override Tensor<T> Backward(Tensor<T> outputGradient)
+    {
+        throw new NotSupportedException($"{nameof(PagedCachedMultiHeadAttention<T>)} is intended for inference-time usage only.");
+    }
+
+    public override void UpdateParameters(T learningRate)
+    {
+        throw new NotSupportedException($"{nameof(PagedCachedMultiHeadAttention<T>)} is intended for inference-time usage only.");
+    }
+
+    public override bool SupportsJitCompilation => false;
+
+    public override Autodiff.ComputationNode<T> ExportComputationGraph(List<Autodiff.ComputationNode<T>> inputNodes)
+    {
+        throw new NotSupportedException($"{nameof(PagedCachedMultiHeadAttention<T>)} does not support JIT compilation.");
+    }
+
+    Dictionary<string, string> AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["HeadCount"] = _headCount.ToString(),
+            ["UseCausalMask"] = _useCausalMask.ToString(),
+            ["EnableWeightOnlyQuantization"] = EnableWeightOnlyQuantization.ToString()
+        };
+    }
+}
diff --git a/src/Inference/Quantization/Int8WeightOnlyQuantization.cs b/src/Inference/Quantization/Int8WeightOnlyQuantization.cs
new file mode 100644
index 000000000..ed82923d1
--- /dev/null
+++ b/src/Inference/Quantization/Int8WeightOnlyQuantization.cs
@@ -0,0 +1,100 @@
+using AiDotNet.Tensors.LinearAlgebra;
+
+namespace AiDotNet.Inference.Quantization;
+
+internal static class Int8WeightOnlyQuantization
+{
+    internal readonly struct QuantizedWeights
+    {
+        public QuantizedWeights(sbyte[] weights, float[] scales, int rows, int cols)
+        {
+            Weights = weights;
+            Scales = scales;
+            Rows = rows;
+            Cols = cols;
+        }
+
+        public sbyte[] Weights { get; }
+        public float[] Scales { get; }
+        public int Rows { get; }
+        public int Cols { get; }
+    }
+
+    public static QuantizedWeights QuantizePerRow(Tensor<float> weights)
+    {
+        if (weights.Rank != 2)
+            throw new ArgumentException("Expected 2D weight tensor.", nameof(weights));
+
+        int rows = weights.Shape[0];
+        int cols = weights.Shape[1];
+
+        var q = new sbyte[rows * cols];
+        var scales = new float[rows];
+
+        for (int r = 0; r < rows; r++)
+        {
+            float maxAbs = 0f;
+            int baseIdx = r * cols;
+            for (int c = 0; c < cols; c++)
+            {
+                float v = weights[r, c];
+                float av = MathF.Abs(v);
+                if (av > maxAbs)
+                    maxAbs = av;
+            }
+
+            float scale = maxAbs > 0f ? (maxAbs / 127f) : 1f;
+            scales[r] = scale;
+
+            float inv = 1f / scale;
+            for (int c = 0; c < cols; c++)
+            {
+                float v = weights[r, c] * inv;
+                int qi = (int)MathF.Round(v);
+                if (qi > 127) qi = 127;
+                if (qi < -127) qi = -127;
+                q[baseIdx + c] = (sbyte)qi;
+            }
+        }
+
+        return new QuantizedWeights(q, scales, rows, cols);
+    }
+
+    public static QuantizedWeights QuantizePerRow(ReadOnlySpan<float> weights, int rows, int cols)
+    {
+        if (rows <= 0) throw new ArgumentOutOfRangeException(nameof(rows));
+        if (cols <= 0) throw new ArgumentOutOfRangeException(nameof(cols));
+        if (weights.Length < rows * cols) throw new ArgumentException("Weight span too small for given dimensions.", nameof(weights));
+
+        var q = new sbyte[rows * cols];
+        var scales = new float[rows];
+
+        for (int r = 0; r < rows; r++)
+        {
+            float maxAbs = 0f;
+            int baseIdx = r * cols;
+            for (int c = 0; c < cols; c++)
+            {
+                float v = weights[baseIdx + c];
+                float av = MathF.Abs(v);
+                if (av > maxAbs)
+                    maxAbs = av;
+            }
+
+            float scale = maxAbs > 0f ? (maxAbs / 127f) : 1f;
+            scales[r] = scale;
+
+            float inv = 1f / scale;
+            for (int c = 0; c < cols; c++)
+            {
+                float v = weights[baseIdx + c] * inv;
+                int qi = (int)MathF.Round(v);
+                if (qi > 127) qi = 127;
+                if (qi < -127) qi = -127;
+                q[baseIdx + c] = (sbyte)qi;
+            }
+        }
+
+        return new QuantizedWeights(q, scales, rows, cols);
+    }
+}
diff --git a/src/Inference/Quantization/QuantizedDenseLayer.cs b/src/Inference/Quantization/QuantizedDenseLayer.cs
new file mode 100644
index 000000000..14c8a2190
--- /dev/null
+++ b/src/Inference/Quantization/QuantizedDenseLayer.cs
@@ -0,0 +1,154 @@
+using AiDotNet.Autodiff;
+using AiDotNet.NeuralNetworks.Layers;
+using AiDotNet.Tensors.LinearAlgebra;
+
+namespace AiDotNet.Inference.Quantization;
+
+/// <summary>
+/// Inference-only dense layer that uses weight-only INT8 quantization (per-output scaling).
+/// </summary>
+internal sealed class QuantizedDenseLayer : LayerBase<float>
+{
+    private readonly int _inputSize;
+    private readonly int _outputSize;
+    private readonly sbyte[] _weightsInt8; // row-major [out, in]
+    private readonly float[] _rowScales; // per out
+    private readonly float[] _biases;
+
+    public QuantizedDenseLayer(DenseLayer<float> source)
+        : base(
+            inputShape: source.GetInputShape(),
+            outputShape: source.GetOutputShape(),
+            scalarActivation: source.ScalarActivation ?? new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+    {
+        _inputSize = source.GetInputShape()[0];
+        _outputSize = source.GetOutputShape()[0];
+
+        if (source.VectorActivation != null)
+            throw new InvalidOperationException("QuantizedDenseLayer scalar-activation ctor called for a vector-activation layer.");
+
+        var weights = source.GetWeights();
+        var biases = source.GetBiases();
+        if (weights == null || biases == null)
+            throw new ArgumentException("Dense layer must expose weights and biases.", nameof(source));
+
+        var q = Int8WeightOnlyQuantization.QuantizePerRow(weights);
+        _weightsInt8 = q.Weights;
+        _rowScales = q.Scales;
+
+        _biases = new float[biases.Length];
+        for (int i = 0; i < _biases.Length; i++)
+        {
+            _biases[i] = biases[i];
+        }
+    }
+
+    public QuantizedDenseLayer(DenseLayer<float> source, IVectorActivationFunction<float> vectorActivation)
+        : base(
+            inputShape: source.GetInputShape(),
+            outputShape: source.GetOutputShape(),
+            vectorActivation: vectorActivation)
+    {
+        _inputSize = source.GetInputShape()[0];
+        _outputSize = source.GetOutputShape()[0];
+
+        var weights = source.GetWeights();
+        var biases = source.GetBiases();
+        if (weights == null || biases == null)
+            throw new ArgumentException("Dense layer must expose weights and biases.", nameof(source));
+
+        var q = Int8WeightOnlyQuantization.QuantizePerRow(weights);
+        _weightsInt8 = q.Weights;
+        _rowScales = q.Scales;
+
+        _biases = new float[biases.Length];
+        for (int i = 0; i < _biases.Length; i++)
+        {
+            _biases[i] = biases[i];
+        }
+    }
+
+    public override bool SupportsTraining => false;
+
+    public override bool SupportsJitCompilation => false;
+
+    public override int ParameterCount => 0;
+
+    public override Tensor<float>? GetWeights() => null;
+
+    public override Tensor<float>? GetBiases() => null;
+
+    public override Tensor<float> Forward(Tensor<float> input)
+    {
+        bool inputWas1D = false;
+        Tensor<float> flat;
+        if (input.Rank == 1)
+        {
+            inputWas1D = true;
+            flat = input.Reshape(1, input.Shape[0]);
+        }
+        else if (input.Rank == 2)
+        {
+            flat = input;
+        }
+        else
+        {
+            int batch = input.Shape[0];
+            int features = input.Length / batch;
+            flat = input.Reshape(batch, features);
+        }
+
+        int batchSize = flat.Shape[0];
+        int featuresIn = flat.Shape[1];
+        if (featuresIn != _inputSize)
+            throw new ArgumentException($"QuantizedDenseLayer input size mismatch. Expected {_inputSize}, got {featuresIn}.");
+
+        var output = new Tensor<float>(new[] { batchSize, _outputSize });
+
+        for (int b = 0; b < batchSize; b++)
+        {
+            for (int o = 0; o < _outputSize; o++)
+            {
+                float sum = _biases[o];
+                float scale = _rowScales[o];
+                int wBase = o * _inputSize;
+                for (int i = 0; i < _inputSize; i++)
+                {
+                    sum += flat[b, i] * (_weightsInt8[wBase + i] * scale);
+                }
+                output[b, o] = sum;
+            }
+        }
+
+        var activated = ApplyActivation(output);
+        if (inputWas1D)
+        {
+            return activated.Reshape(_outputSize);
+        }
+
+        return activated;
+    }
+
+    public override Tensor<float> Backward(Tensor<float> outputGradient)
+        => throw new NotSupportedException("QuantizedDenseLayer is inference-only.");
+
+    public override void UpdateParameters(float learningRate)
+        => throw new NotSupportedException("QuantizedDenseLayer is inference-only.");
+
+    public override void UpdateParameters(Vector<float> parameters)
+        => throw new NotSupportedException("QuantizedDenseLayer is inference-only.");
+
+    public override Vector<float> GetParameters()
+        => Vector<float>.Empty();
+
+    public override void ResetState()
+    {
+        // Inference-only; no recurrent state to clear.
+    }
+
+    public override ComputationNode<float> ExportComputationGraph(List<ComputationNode<float>> inputNodes)
+    {
+        // WOQ is a runtime inference rewrite; we intentionally don't support JIT graph export here.
+        throw new NotSupportedException("QuantizedDenseLayer does not support JIT compilation.");
+    }
+}
diff --git a/src/Inference/SpeculativeDecoding/DraftResult.cs b/src/Inference/SpeculativeDecoding/DraftResult.cs
index e1e0ad375..5968bf053 100644
--- a/src/Inference/SpeculativeDecoding/DraftResult.cs
+++ b/src/Inference/SpeculativeDecoding/DraftResult.cs
@@ -6,7 +6,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// Result of draft token generation.
 /// </summary>
 /// <typeparam name="T">The numeric type.</typeparam>
-public class DraftResult<T>
+internal class DraftResult<T>
 {
     /// <summary>
     /// Gets the generated draft tokens.
diff --git a/src/Inference/SpeculativeDecoding/IDraftModel.cs b/src/Inference/SpeculativeDecoding/IDraftModel.cs
index ffb6f886e..9464806c7 100644
--- a/src/Inference/SpeculativeDecoding/IDraftModel.cs
+++ b/src/Inference/SpeculativeDecoding/IDraftModel.cs
@@ -13,7 +13,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for computations.</typeparam>
-public interface IDraftModel<T>
+internal interface IDraftModel<T>
 {
     /// <summary>
     /// Gets the maximum number of tokens this draft model can generate in one call.
diff --git a/src/Inference/SpeculativeDecoding/NGramDraftModel.cs b/src/Inference/SpeculativeDecoding/NGramDraftModel.cs
index 6438c9f42..ebad5fdef 100644
--- a/src/Inference/SpeculativeDecoding/NGramDraftModel.cs
+++ b/src/Inference/SpeculativeDecoding/NGramDraftModel.cs
@@ -17,7 +17,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type.</typeparam>
-public class NGramDraftModel<T> : IDraftModel<T>
+internal class NGramDraftModel<T> : IDraftModel<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
diff --git a/src/Inference/SpeculativeDecoding/NeuralDraftModel.cs b/src/Inference/SpeculativeDecoding/NeuralDraftModel.cs
index 701759843..65128d715 100644
--- a/src/Inference/SpeculativeDecoding/NeuralDraftModel.cs
+++ b/src/Inference/SpeculativeDecoding/NeuralDraftModel.cs
@@ -13,7 +13,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type.</typeparam>
-public class NeuralDraftModel<T> : IDraftModel<T>
+internal class NeuralDraftModel<T> : IDraftModel<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
diff --git a/src/Inference/SpeculativeDecoding/SpeculativeDecoder.cs b/src/Inference/SpeculativeDecoding/SpeculativeDecoder.cs
index cd0b4ca30..a22872a43 100644
--- a/src/Inference/SpeculativeDecoding/SpeculativeDecoder.cs
+++ b/src/Inference/SpeculativeDecoding/SpeculativeDecoder.cs
@@ -30,7 +30,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for computations.</typeparam>
-public class SpeculativeDecoder<T>
+internal class SpeculativeDecoder<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
@@ -38,6 +38,10 @@ public class SpeculativeDecoder<T>
     private readonly Func<Vector<int>, Matrix<T>> _targetForward;
     private readonly SpeculativeDecodingConfig<T> _config;
     private readonly Random _random;
+    private readonly int _maxDraftTokens;
+    private readonly int _maxTreeDepth;
+    private int _currentDraftTokens;
+    private int _currentMaxTreeDepth;
 
     // Statistics
     private long _totalTokensGenerated;
@@ -45,6 +49,10 @@ public class SpeculativeDecoder<T>
     private long _acceptedDraftTokens;
     private long _totalVerificationCalls;
 
+    // Tree speculation tracking (when enabled)
+    private long _treeTotalNodes;
+    private long _treeAcceptedNodes;
+
     /// <summary>
     /// Gets the configuration.
     /// </summary>
@@ -53,9 +61,9 @@ public class SpeculativeDecoder<T>
     /// <summary>
     /// Gets the draft acceptance rate.
     /// </summary>
-    public double AcceptanceRate => _totalDraftTokens > 0
-        ? (double)_acceptedDraftTokens / _totalDraftTokens
-        : 0;
+    public double AcceptanceRate => _config.UseTreeSpeculation
+        ? (_treeTotalNodes > 0 ? (double)_treeAcceptedNodes / _treeTotalNodes : 0)
+        : (_totalDraftTokens > 0 ? (double)_acceptedDraftTokens / _totalDraftTokens : 0);
 
     /// <summary>
     /// Gets the average tokens generated per verification call.
@@ -64,6 +72,29 @@ public class SpeculativeDecoder<T>
         ? (double)_totalTokensGenerated / _totalVerificationCalls
         : 0;
 
+    /// <summary>
+    /// Gets the total amount of draft work proposed so far.
+    /// </summary>
+    /// <remarks>
+    /// For classic speculation, this counts draft tokens. For tree speculation, this counts explored nodes.
+    /// </remarks>
+    internal long TotalDraftTokens => _config.UseTreeSpeculation ? _treeTotalNodes : _totalDraftTokens;
+
+    /// <summary>
+    /// Gets the current adaptive draft length.
+    /// </summary>
+    internal int CurrentDraftTokens => _currentDraftTokens;
+
+    /// <summary>
+    /// Gets the current adaptive tree depth.
+    /// </summary>
+    internal int CurrentMaxTreeDepth => _currentMaxTreeDepth;
+
+    /// <summary>
+    /// Gets the total number of verification calls performed so far.
+    /// </summary>
+    internal long TotalVerificationCalls => _totalVerificationCalls;
+
     /// <summary>
     /// Creates a speculative decoder.
     /// </summary>
@@ -79,6 +110,10 @@ public SpeculativeDecoder(
         _draftModel = draftModel ?? throw new ArgumentNullException(nameof(draftModel));
         _targetForward = targetForward ?? throw new ArgumentNullException(nameof(targetForward));
         _config = config ?? new SpeculativeDecodingConfig<T>();
+        _maxDraftTokens = Math.Max(1, _config.NumDraftTokens);
+        _maxTreeDepth = Math.Max(1, _config.MaxTreeDepth);
+        _currentDraftTokens = _maxDraftTokens;
+        _currentMaxTreeDepth = _maxTreeDepth;
         _random = _config.Seed.HasValue ? new Random(_config.Seed.Value) : new Random();
     }
 
@@ -98,6 +133,11 @@ public async Task<SpeculativeResult> GenerateAsync(
         int? eosToken = null,
         CancellationToken cancellationToken = default)
     {
+        if (_config.UseTreeSpeculation)
+        {
+            return await GenerateTreeAsync(inputTokens, maxNewTokens, temperature, eosToken, cancellationToken).ConfigureAwait(false);
+        }
+
         var tokens = new List<int>(inputTokens.Length + maxNewTokens);
         for (int i = 0; i < inputTokens.Length; i++)
         {
@@ -112,7 +152,7 @@ public async Task<SpeculativeResult> GenerateAsync(
             cancellationToken.ThrowIfCancellationRequested();
 
             // Determine how many draft tokens to generate
-            int numDraft = Math.Min(_config.NumDraftTokens, maxNewTokens - generated);
+            int numDraft = Math.Min(_currentDraftTokens, maxNewTokens - generated);
 
             // Generate draft tokens
             var currentTokens = new Vector<int>(tokens.ToArray());
@@ -238,6 +278,11 @@ public async Task<SpeculativeResult> GenerateAsync(
                     BonusToken = true
                 });
             }
+
+            if (_config.AdaptiveDraftLength)
+            {
+                AdjustDraftLength();
+            }
         }
 
         done:
@@ -261,6 +306,94 @@ public async Task<SpeculativeResult> GenerateAsync(
         };
     }
 
+    private async Task<SpeculativeResult> GenerateTreeAsync(
+        Vector<int> inputTokens,
+        int maxNewTokens,
+        T temperature,
+        int? eosToken,
+        CancellationToken cancellationToken)
+    {
+        List<Matrix<T>> BatchTargetForward(List<Vector<int>> sequences)
+        {
+            var results = new List<Matrix<T>>(sequences.Count);
+            for (int i = 0; i < sequences.Count; i++)
+            {
+                results.Add(_targetForward(sequences[i]));
+            }
+            return results;
+        }
+
+        var treeConfig = new TreeSpeculativeConfig
+        {
+            BranchFactor = Math.Max(1, _config.TreeBranchFactor),
+            MaxDepth = _currentMaxTreeDepth,
+            Seed = _config.Seed
+        };
+
+        var decoder = new TreeSpeculativeDecoder<T>(_draftModel, BatchTargetForward, treeConfig);
+        var treeResult = await decoder.GenerateAsync(inputTokens, maxNewTokens, temperature, eosToken, cancellationToken).ConfigureAwait(false);
+
+        long nodes = 0;
+        long accepted = 0;
+        var stepStats = new List<StepStatistics>(treeResult.StepStatistics.Count);
+        for (int i = 0; i < treeResult.StepStatistics.Count; i++)
+        {
+            var s = treeResult.StepStatistics[i];
+            nodes += s.TreeNodes;
+            accepted += s.BestPathLength;
+            stepStats.Add(new StepStatistics
+            {
+                DraftTokens = s.TreeNodes,
+                AcceptedTokens = s.BestPathLength,
+                ResampledToken = false,
+                BonusToken = false
+            });
+        }
+
+        _treeTotalNodes += nodes;
+        _treeAcceptedNodes += accepted;
+
+        if (_config.AdaptiveDraftLength)
+        {
+            AdjustDraftLength();
+        }
+
+        return new SpeculativeResult
+        {
+            Tokens = treeResult.Tokens,
+            NewTokens = treeResult.NewTokens,
+            NumGenerated = treeResult.NumGenerated,
+            AcceptanceRate = AcceptanceRate,
+            TokensPerVerification = treeResult.StepStatistics.Count > 0 ? (double)treeResult.NumGenerated / treeResult.StepStatistics.Count : 0,
+            StepStatistics = stepStats
+        };
+    }
+
+    private void AdjustDraftLength()
+    {
+        double minAccept = NumOps.ToDouble(_config.MinAcceptanceRate);
+        double ar = AcceptanceRate;
+        long work = _config.UseTreeSpeculation ? _treeTotalNodes : _totalDraftTokens;
+
+        if (work < 8)
+        {
+            return;
+        }
+
+        if (ar < minAccept)
+        {
+            _currentDraftTokens = Math.Max(1, _currentDraftTokens - 1);
+            _currentMaxTreeDepth = Math.Max(1, _currentMaxTreeDepth - 1);
+            return;
+        }
+
+        if (ar >= minAccept + 0.2)
+        {
+            _currentDraftTokens = Math.Min(_maxDraftTokens, _currentDraftTokens + 1);
+            _currentMaxTreeDepth = Math.Min(_maxTreeDepth, _currentMaxTreeDepth + 1);
+        }
+    }
+
     /// <summary>
     /// Synchronous generation method.
     /// </summary>
@@ -282,6 +415,10 @@ public void ResetStatistics()
         _totalDraftTokens = 0;
         _acceptedDraftTokens = 0;
         _totalVerificationCalls = 0;
+        _treeTotalNodes = 0;
+        _treeAcceptedNodes = 0;
+        _currentDraftTokens = _maxDraftTokens;
+        _currentMaxTreeDepth = _maxTreeDepth;
         _draftModel.Reset();
     }
 
diff --git a/src/Inference/SpeculativeDecoding/SpeculativeDecodingConfig.cs b/src/Inference/SpeculativeDecoding/SpeculativeDecodingConfig.cs
index b95b7bbbd..c88293d9d 100644
--- a/src/Inference/SpeculativeDecoding/SpeculativeDecodingConfig.cs
+++ b/src/Inference/SpeculativeDecoding/SpeculativeDecodingConfig.cs
@@ -6,7 +6,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// Configuration for speculative decoding.
 /// </summary>
 /// <typeparam name="T">The numeric type for threshold values.</typeparam>
-public class SpeculativeDecodingConfig<T>
+internal class SpeculativeDecodingConfig<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
diff --git a/src/Inference/SpeculativeDecoding/SpeculativeDecodingStats.cs b/src/Inference/SpeculativeDecoding/SpeculativeDecodingStats.cs
index cb5c69cee..355173917 100644
--- a/src/Inference/SpeculativeDecoding/SpeculativeDecodingStats.cs
+++ b/src/Inference/SpeculativeDecoding/SpeculativeDecodingStats.cs
@@ -3,7 +3,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// <summary>
 /// Overall statistics for speculative decoding.
 /// </summary>
-public class SpeculativeDecodingStats
+internal class SpeculativeDecodingStats
 {
     /// <summary>Total tokens generated.</summary>
     public long TotalTokensGenerated { get; set; }
diff --git a/src/Inference/SpeculativeDecoding/SpeculativeResult.cs b/src/Inference/SpeculativeDecoding/SpeculativeResult.cs
index 967b1d431..f28389fc0 100644
--- a/src/Inference/SpeculativeDecoding/SpeculativeResult.cs
+++ b/src/Inference/SpeculativeDecoding/SpeculativeResult.cs
@@ -5,7 +5,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// <summary>
 /// Result of speculative decoding generation.
 /// </summary>
-public class SpeculativeResult
+internal class SpeculativeResult
 {
     /// <summary>
     /// All tokens (input + generated).
diff --git a/src/Inference/SpeculativeDecoding/StepStatistics.cs b/src/Inference/SpeculativeDecoding/StepStatistics.cs
index 3fafdfef7..20df55712 100644
--- a/src/Inference/SpeculativeDecoding/StepStatistics.cs
+++ b/src/Inference/SpeculativeDecoding/StepStatistics.cs
@@ -3,7 +3,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// <summary>
 /// Statistics for a single decoding step.
 /// </summary>
-public class StepStatistics
+internal class StepStatistics
 {
     /// <summary>Number of draft tokens generated.</summary>
     public int DraftTokens { get; set; }
diff --git a/src/Inference/SpeculativeDecoding/TreeSpeculativeConfig.cs b/src/Inference/SpeculativeDecoding/TreeSpeculativeConfig.cs
index 2311b60e5..b5e4417d0 100644
--- a/src/Inference/SpeculativeDecoding/TreeSpeculativeConfig.cs
+++ b/src/Inference/SpeculativeDecoding/TreeSpeculativeConfig.cs
@@ -3,7 +3,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// <summary>
 /// Configuration for tree-based speculative decoding.
 /// </summary>
-public class TreeSpeculativeConfig
+internal class TreeSpeculativeConfig
 {
     /// <summary>Number of branches per node.</summary>
     public int BranchFactor { get; set; } = 2;
diff --git a/src/Inference/SpeculativeDecoding/TreeSpeculativeDecoder.cs b/src/Inference/SpeculativeDecoding/TreeSpeculativeDecoder.cs
index 869ec82ee..b2b042d28 100644
--- a/src/Inference/SpeculativeDecoding/TreeSpeculativeDecoder.cs
+++ b/src/Inference/SpeculativeDecoding/TreeSpeculativeDecoder.cs
@@ -27,7 +27,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type.</typeparam>
-public class TreeSpeculativeDecoder<T>
+internal class TreeSpeculativeDecoder<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
diff --git a/src/Inference/SpeculativeDecoding/TreeSpeculativeResult.cs b/src/Inference/SpeculativeDecoding/TreeSpeculativeResult.cs
index 463dcc4e7..ae19a2adc 100644
--- a/src/Inference/SpeculativeDecoding/TreeSpeculativeResult.cs
+++ b/src/Inference/SpeculativeDecoding/TreeSpeculativeResult.cs
@@ -5,7 +5,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// <summary>
 /// Result of tree-based speculative decoding.
 /// </summary>
-public class TreeSpeculativeResult
+internal class TreeSpeculativeResult
 {
     /// <summary>
     /// All tokens (input + generated).
diff --git a/src/Inference/SpeculativeDecoding/TreeStepStatistics.cs b/src/Inference/SpeculativeDecoding/TreeStepStatistics.cs
index ca5d53f49..6ce7c203e 100644
--- a/src/Inference/SpeculativeDecoding/TreeStepStatistics.cs
+++ b/src/Inference/SpeculativeDecoding/TreeStepStatistics.cs
@@ -3,7 +3,7 @@ namespace AiDotNet.Inference.SpeculativeDecoding;
 /// <summary>
 /// Statistics for a tree speculation step.
 /// </summary>
-public class TreeStepStatistics
+internal class TreeStepStatistics
 {
     /// <summary>Number of nodes in tree.</summary>
     public int TreeNodes { get; set; }
diff --git a/src/InferenceOptimization/ARCHITECTURE.md b/src/InferenceOptimization/ARCHITECTURE.md
new file mode 100644
index 000000000..71b03957e
--- /dev/null
+++ b/src/InferenceOptimization/ARCHITECTURE.md
@@ -0,0 +1,448 @@
+# Inference Optimization Architecture
+
+This document describes the architecture and design of the AiDotNet Inference Optimization module.
+
+## Design Goals
+
+1. **Hardware-specific optimization**: Leverage SIMD instructions (AVX2, AVX-512, NEON)
+2. **Graceful fallback**: Automatically fall back to scalar implementations
+3. **Extensibility**: Easy to add new optimized operators
+4. **Zero overhead**: No performance penalty when optimizations not available
+5. **Type safety**: Maintain strong typing throughout
+6. **Thread safety**: Support concurrent execution
+7. **Profiling**: Track performance metrics
+
+## Architecture Overview
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                   Application Layer                             │
+│  (Neural Networks, Inference Serving, Training)                 │
+└───────────────────────┬─────────────────────────────────────────┘
+                        │
+                        ▼
+┌─────────────────────────────────────────────────────────────────┐
+│              OptimizationInitializer                            │
+│  - Platform detection                                           │
+│  - Operator registration                                        │
+│  - Profiling initialization                                     │
+└───────────────────────┬─────────────────────────────────────────┘
+                        │
+        ┌───────────────┼───────────────┐
+        │               │               │
+        ▼               ▼               ▼
+┌──────────────┐ ┌──────────────┐ ┌──────────────┐
+│   Kernels    │ │ CPU Optimize │ │ GPU Optimize │
+│              │ │              │ │              │
+│ - GEMM       │ │ - Cache      │ │ - CUDA Base  │
+│ - Attention  │ │ - Loop       │ │ - Memory Mgr │
+│ - Conv2D     │ │ - Tiling     │ │              │
+│ - SIMD       │ │              │ │              │
+└──────────────┘ └──────────────┘ └──────────────┘
+        │               │               │
+        └───────────────┼───────────────┘
+                        │
+                        ▼
+┌─────────────────────────────────────────────────────────────────┐
+│           Custom Operator Registry                              │
+│  - Priority-based selection                                     │
+│  - Platform capability matching                                 │
+│  - Fallback management                                          │
+└───────────────────────┬─────────────────────────────────────────┘
+                        │
+        ┌───────────────┼───────────────┐
+        │               │               │
+        ▼               ▼               ▼
+┌──────────────┐ ┌──────────────┐ ┌──────────────┐
+│  Platform    │ │  Profiler    │ │   Tensor     │
+│  Detector    │ │              │ │   Operations │
+│              │ │ - Timing     │ │              │
+│ - SIMD caps  │ │ - Memory     │ │ (LinearAlg)  │
+│ - CPU info   │ │ - Stats      │ │              │
+│ - GPU detect │ │              │ │              │
+└──────────────┘ └──────────────┘ └──────────────┘
+```
+
+## Core Components
+
+### 1. Platform Detection (`PlatformDetector`)
+
+**Responsibility**: Detect hardware capabilities at startup
+
+**Key features**:
+- CPU architecture detection (x86/x64, ARM)
+- SIMD instruction set detection (SSE, AVX, AVX-512, NEON)
+- Cache size estimation
+- GPU capability detection
+- Thread-safe singleton pattern
+
+**Detection flow**:
+```
+Startup
+  ↓
+Check Architecture (x86/ARM)
+  ↓
+Query SIMD Support
+  ├─ x86: SSE* → AVX* → AVX-512*
+  └─ ARM: NEON → Dot Product
+  ↓
+Estimate Cache Sizes
+  ↓
+Check GPU APIs (CUDA/OpenCL)
+  ↓
+Create PlatformCapabilities object
+```
+
+### 2. Custom Operator Registry (`CustomOperatorRegistry`)
+
+**Responsibility**: Manage and select optimal operator implementations
+
+**Key features**:
+- Thread-safe operator registration
+- Priority-based selection
+- Automatic platform capability matching
+- Multiple implementations per operation
+- Lazy operator selection
+
+**Selection algorithm**:
+```
+GetOperator(name)
+  ↓
+Check cache
+  ├─ Found → Return cached
+  └─ Not found ↓
+Get candidates for name
+  ↓
+Sort by priority (descending)
+  ↓
+For each candidate:
+  └─ If IsSupported() → Select and cache
+  ↓
+Return best supported operator
+```
+
+### 3. SIMD Kernels (`SimdKernels`)
+
+**Responsibility**: Low-level SIMD-optimized operations
+
+**Key features**:
+- Platform-specific implementations (AVX2, SSE, NEON)
+- Automatic fallback to scalar code
+- Unsafe pointer-based for zero overhead
+- Aggressive inlining
+
+**Implementation pattern**:
+```csharp
+public static unsafe void Operation(float* input, float* output, int length)
+{
+    int i = 0;
+
+    // AVX2 path (8 floats)
+    if (Avx2.IsSupported && length >= 8)
+    {
+        // Process 8 floats at a time
+        // ...
+    }
+    // SSE path (4 floats)
+    else if (Sse.IsSupported && length >= 4)
+    {
+        // Process 4 floats at a time
+        // ...
+    }
+    // NEON path (4 floats)
+    else if (AdvSimd.IsSupported && length >= 4)
+    {
+        // Process 4 floats at a time
+        // ...
+    }
+
+    // Scalar fallback for remainder
+    for (; i < length; i++)
+    {
+        // Process one element
+    }
+}
+```
+
+### 4. High-Level Kernels
+
+#### GEMM Kernel (`GemmKernel`)
+
+**Algorithm**: Cache-blocked matrix multiplication
+
+**Optimization techniques**:
+1. **Cache blocking**: Tile matrices to fit in L1 cache
+2. **SIMD vectorization**: Use SimdKernels for inner loops
+3. **Parallelization**: Parallel.For for row blocks
+4. **Memory access**: Row-major optimized access patterns
+
+**Pseudo-code**:
+```
+For each block_i in (0, M, BlockSize):
+  For each block_j in (0, N, BlockSize):
+    For each block_k in (0, K, BlockSize):
+      For each i in block_i:
+        For each k in block_k:
+          a_val = A[i, k]
+          // SIMD-optimized:
+          C[i, j:j+blocksize] += a_val * B[k, j:j+blocksize]
+```
+
+#### Attention Kernel (`AttentionKernel`)
+
+**Algorithm**: Fused scaled dot-product attention
+
+**Optimization techniques**:
+1. **Kernel fusion**: Compute QK^T, softmax, and *V in single pass
+2. **SIMD dot products**: Use optimized dot product for scores
+3. **Batch parallelization**: Parallel over batch dimension
+4. **Memory efficiency**: Minimize temporary allocations
+
+**Pseudo-code**:
+```
+For each batch in parallel:
+  // Compute attention scores
+  For i in seq_len_q:
+    For j in seq_len_k:
+      scores[i,j] = SIMD_DotProduct(Q[i], K[j]) / sqrt(d_k)
+
+  // Apply softmax per row
+  For i in seq_len_q:
+    scores[i] = Softmax(scores[i])
+
+  // Weighted sum with V
+  For i in seq_len_q:
+    output[i] = Σ(scores[i,j] * V[j]) // SIMD-optimized
+```
+
+#### Convolution Kernel (`ConvolutionKernel`)
+
+**Variants**:
+1. Standard 2D convolution
+2. Depthwise separable convolution
+3. Group convolution
+
+**Optimization techniques**:
+1. **Parallelization**: Over batch and output channels
+2. **Memory layout**: NCHW format for cache efficiency
+3. **Padding handling**: Boundary checks in inner loop
+
+### 5. CPU Optimization Utilities
+
+#### Cache Optimizer (`CacheOptimizer`)
+
+**Features**:
+- Cache size-aware tiling
+- Prefetching hints
+- Cache-aware transpose
+- Z-order indexing for 2D locality
+- Cache miss estimation
+
+**Tiling calculation**:
+```
+OptimalTileSize = sqrt(L1_Size / (3 * element_size))
+// Factor of 3 for: A tile + B tile + C tile
+```
+
+#### Loop Optimizer (`LoopOptimizer`)
+
+**Features**:
+- 2D/3D loop tiling
+- Loop unrolling (4x, 8x)
+- Strip mining
+- Loop fusion
+- Loop interchange
+- Parallel tiling
+
+### 6. Performance Profiler (`PerformanceProfiler`)
+
+**Responsibility**: Track and report operation performance
+
+**Key features**:
+- Thread-safe operation tracking
+- Timing with Stopwatch (high precision)
+- Memory allocation tracking
+- Statistical aggregation (min/avg/max)
+- Enable/disable at runtime
+
+**Usage pattern**:
+```csharp
+using (profiler.Profile("OperationName"))
+{
+    // Operation code
+}
+// Timing and memory automatically recorded
+```
+
+### 7. GPU Optimization Infrastructure
+
+GPU acceleration is provided by the **AiDotNet.Tensors** project via ILGPU.
+
+**Components** (in `AiDotNet.Tensors.Engines` namespace):
+- `GpuEngine`: Full ILGPU implementation with CUDA/OpenCL kernels for tensor operations
+- `GpuMemoryPool<T>`: Buffer pooling with rent/return pattern and size-based bucketing
+- `MultiGpuManager`: Multi-GPU support for distributed tensor operations
+- `AsyncGpuTransfer`: Asynchronous host-device data transfers
+
+**Key Features**:
+- Real ILGPU integration (not placeholders)
+- CUDA and OpenCL backend support
+- Optimized Conv2D, GEMM, and element-wise kernels
+- Memory pooling to reduce allocation overhead (5-10x improvement)
+- Automatic fallback to CPU when GPU unavailable
+
+**Usage**:
+```csharp
+// GPU operations are automatically used when available
+var engine = new GpuEngine();
+var result = engine.MatMul(a, b); // Uses GPU if available
+```
+
+## Data Flow
+
+### Typical Execution Flow
+
+```
+Application requests matrix multiplication
+  ↓
+Looks up "GEMM" in CustomOperatorRegistry
+  ↓
+Registry returns GemmKernel (if supported)
+  ↓
+GemmKernel.Execute(A, B)
+  ↓
+Checks matrix size
+  ├─ Small → GemmBlocked (single-threaded, cache-blocked)
+  └─ Large → GemmParallel (multi-threaded)
+  ↓
+Inner loop uses SimdKernels.ScalarMultiplyAdd
+  ↓
+SimdKernels detects platform
+  ├─ AVX2 available → Use AVX2 instructions
+  ├─ SSE available → Use SSE instructions
+  ├─ NEON available → Use NEON instructions
+  └─ Otherwise → Scalar fallback
+  ↓
+Returns result Tensor<float>
+```
+
+## Memory Management
+
+### Allocation Strategy
+
+1. **Input/Output tensors**: Managed by `Tensor<T>` class
+2. **Temporary buffers**: Stackalloc for small, heap for large
+3. **SIMD operations**: Unsafe pointers, no allocation
+4. **GPU memory**: Future - explicit device allocation
+
+### Cache Efficiency
+
+1. **Blocking**: Tile operations to fit in cache
+2. **Prefetching**: Hint CPU to load data ahead
+3. **Access patterns**: Row-major optimized
+4. **Data reuse**: Maximize temporal locality
+
+## Thread Safety
+
+### Thread-Safe Components
+
+- `CustomOperatorRegistry`: ConcurrentDictionary
+- `PerformanceProfiler`: ConcurrentDictionary + atomic operations
+- Platform detection: Lazy initialization with lock
+
+### Parallel Execution
+
+- `Parallel.For` for data parallelism
+- Work stealing for load balancing
+- Minimal synchronization overhead
+
+## Extensibility
+
+### Adding a New Optimized Operator
+
+1. **Implement interface**:
+   ```csharp
+   public class MyKernel : ICustomOperator<float>
+   {
+       public string Name => "MyOperation";
+       public string Version => "1.0.0";
+       public int Priority => 100;
+
+       public bool IsSupported() { /* check platform */ }
+       public double EstimatedSpeedup() { /* estimate */ }
+       public Tensor<float> Execute(params Tensor<float>[] inputs) { /* implement */ }
+   }
+   ```
+
+2. **Register operator**:
+   ```csharp
+   CustomOperatorRegistry.Instance.Register(new MyKernel());
+   ```
+
+3. **Use operator**:
+   ```csharp
+   var kernel = CustomOperatorRegistry.Instance.GetOperator<float>("MyOperation");
+   var result = kernel.Execute(input);
+   ```
+
+## Performance Considerations
+
+### SIMD Vectorization
+
+- **AVX2**: 8x float32 per instruction (256-bit)
+- **AVX-512**: 16x float32 per instruction (512-bit)
+- **SSE**: 4x float32 per instruction (128-bit)
+- **NEON**: 4x float32 per instruction (128-bit)
+
+### Cache Hierarchy
+
+Typical cache sizes and optimization targets:
+
+| Cache | Size | Latency | Optimization |
+|-------|------|---------|--------------|
+| L1    | 32 KB | 4 cycles | Inner loop tiles |
+| L2    | 256 KB | 12 cycles | Mid-level blocks |
+| L3    | 8 MB | 40 cycles | Outer blocks |
+| RAM   | GB | 200+ cycles | Minimize access |
+
+### Parallelization Overhead
+
+- Thread creation: ~50 µs
+- Work distribution: ~10 µs per task
+- **Threshold**: Only parallelize if work > 100 µs per task
+
+## Future Enhancements
+
+### Planned Features
+
+1. **GPU Kernel Enhancements** (base GPU support already implemented in AiDotNet.Tensors):
+   - Tensor core utilization (FP16/INT8)
+   - Additional specialized kernels (Winograd convolution, etc.)
+   - Multi-GPU pipeline optimization
+
+2. **Quantization**:
+   - INT8 inference
+   - FP16 mixed precision
+   - Dynamic quantization
+
+3. **Graph Optimization**:
+   - Operator fusion
+   - Dead code elimination
+   - Constant folding
+
+4. **Memory Optimization**:
+   - Buffer pooling
+   - In-place operations
+   - Memory defragmentation
+
+5. **Advanced Kernels**:
+   - Winograd convolution
+   - FFT-based convolution
+   - Sparse matrix operations
+
+## References
+
+- [Intel Intrinsics Guide](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/)
+- [ARM NEON Programmer's Guide](https://developer.arm.com/architectures/instruction-sets/simd-isas/neon)
+- [Cache-Oblivious Algorithms](https://en.wikipedia.org/wiki/Cache-oblivious_algorithm)
+- [BLAS Optimization Techniques](http://www.netlib.org/blas/)
diff --git a/src/InferenceOptimization/CustomOperatorRegistry.cs b/src/InferenceOptimization/CustomOperatorRegistry.cs
new file mode 100644
index 000000000..cf661f544
--- /dev/null
+++ b/src/InferenceOptimization/CustomOperatorRegistry.cs
@@ -0,0 +1,205 @@
+using System;
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace AiDotNet.InferenceOptimization
+{
+    /// <summary>
+    /// Thread-safe registry for managing custom operators with automatic fallback
+    /// </summary>
+    public sealed class CustomOperatorRegistry
+    {
+        private static readonly Lazy<CustomOperatorRegistry> _instance =
+            new Lazy<CustomOperatorRegistry>(() => new CustomOperatorRegistry());
+
+        private readonly ConcurrentDictionary<string, List<ICustomOperator>> _operators;
+        private readonly ConcurrentDictionary<string, SelectedOperatorEntry> _selectedOperators;
+        private readonly ConcurrentDictionary<string, long> _operatorVersions;
+
+        /// <summary>
+        /// Gets the singleton instance of the registry
+        /// </summary>
+        public static CustomOperatorRegistry Instance => _instance.Value;
+
+        private CustomOperatorRegistry()
+        {
+            _operators = new ConcurrentDictionary<string, List<ICustomOperator>>();
+            _selectedOperators = new ConcurrentDictionary<string, SelectedOperatorEntry>();
+            _operatorVersions = new ConcurrentDictionary<string, long>();
+        }
+
+        /// <summary>
+        /// Registers a custom operator
+        /// </summary>
+        public void Register(ICustomOperator op)
+        {
+            if (op == null)
+                throw new ArgumentNullException(nameof(op));
+
+            // Bump the version after the operator set is updated.
+            // This avoids stale cached selections without requiring coarse locking.
+            void BumpVersion() => _operatorVersions.AddOrUpdate(op.Name, 1, (_, v) => v + 1);
+
+            // Use AddOrUpdate with factory that always creates a new sorted list
+            // This ensures thread-safety by never mutating existing lists
+            _operators.AddOrUpdate(
+                op.Name,
+                _ => new List<ICustomOperator> { op },
+                (_, existingList) =>
+                {
+                    // Create a new list with all existing operators plus the new one
+                    // This avoids race conditions from modifying the existing list
+                    List<ICustomOperator> newList;
+                    lock (existingList)
+                    {
+                        newList = new List<ICustomOperator>(existingList) { op };
+                    }
+                    newList.Sort((a, b) => b.Priority.CompareTo(a.Priority));
+                    return newList;
+                });
+
+            BumpVersion();
+        }
+
+        /// <summary>
+        /// Gets the best available operator for the given name
+        /// </summary>
+        public ICustomOperator? GetOperator(string name)
+        {
+            if (string.IsNullOrEmpty(name))
+                throw new ArgumentException("Operator name cannot be null or empty", nameof(name));
+
+            while (true)
+            {
+                long version = _operatorVersions.GetOrAdd(name, 0);
+
+                if (_selectedOperators.TryGetValue(name, out var existing) && existing.Version == version)
+                {
+                    return existing.Operator is NullOperator ? null : existing.Operator;
+                }
+
+                var selected = SelectOperatorOrNull(name);
+
+                // Only publish the cached selection if the operator set version did not change while we were selecting.
+                if (_operatorVersions.TryGetValue(name, out var current) && current == version)
+                {
+                    _selectedOperators[name] = new SelectedOperatorEntry(version, selected);
+                    return selected is NullOperator ? null : selected;
+                }
+
+                // Operator set changed while selecting; retry to avoid caching a stale choice.
+            }
+        }
+
+        private ICustomOperator SelectOperatorOrNull(string name)
+        {
+            if (!_operators.TryGetValue(name, out var candidates))
+                return new NullOperator();
+
+            lock (candidates)
+            {
+                // Find the highest priority supported operator
+                var result = candidates.FirstOrDefault(op => op.IsSupported());
+                return result ?? new NullOperator();
+            }
+        }
+
+        /// <summary>
+        /// Gets a typed operator
+        /// </summary>
+        public ICustomOperator<T>? GetOperator<T>(string name) where T : struct
+        {
+            return GetOperator(name) as ICustomOperator<T>;
+        }
+
+        /// <summary>
+        /// Internal marker type for null operators
+        /// </summary>
+        private sealed class NullOperator : ICustomOperator
+        {
+            public string Name => string.Empty;
+            public string Version => string.Empty;
+            public int Priority => int.MinValue;
+            public bool IsSupported() => false;
+            public double EstimatedSpeedup() => 0;
+        }
+
+        /// <summary>
+        /// Checks if an operator is available
+        /// </summary>
+        public bool HasOperator(string name)
+        {
+            return GetOperator(name) != null;
+        }
+
+        /// <summary>
+        /// Unregisters all operators with the given name
+        /// </summary>
+        public void Unregister(string name)
+        {
+            _operators.TryRemove(name, out _);
+            _selectedOperators.TryRemove(name, out _);
+            _operatorVersions.TryRemove(name, out _);
+        }
+
+        /// <summary>
+        /// Gets all registered operator names
+        /// </summary>
+        public IEnumerable<string> GetRegisteredOperatorNames()
+        {
+            return _operators.Keys.ToArray();
+        }
+
+        /// <summary>
+        /// Gets detailed information about all registered operators
+        /// </summary>
+        public Dictionary<string, List<OperatorInfo>> GetOperatorInfo()
+        {
+            var result = new Dictionary<string, List<OperatorInfo>>();
+
+            foreach (var kvp in _operators)
+            {
+                lock (kvp.Value)
+                {
+                    result[kvp.Key] = kvp.Value.Select(op => new OperatorInfo
+                    {
+                        Name = op.Name,
+                        Version = op.Version,
+                        Priority = op.Priority,
+                        IsSupported = op.IsSupported(),
+                        EstimatedSpeedup = op.EstimatedSpeedup(),
+                        Type = op.GetType().FullName ?? op.GetType().Name
+                    }).ToList();
+                }
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Clears all registered operators
+        /// </summary>
+        public void Clear()
+        {
+            _operators.Clear();
+            _selectedOperators.Clear();
+            _operatorVersions.Clear();
+        }
+
+        private readonly record struct SelectedOperatorEntry(long Version, ICustomOperator Operator);
+    }
+
+    /// <summary>
+    /// Information about a registered operator
+    /// </summary>
+    public class OperatorInfo
+    {
+        public string Name { get; set; } = string.Empty;
+        public string Version { get; set; } = string.Empty;
+        public int Priority { get; set; }
+        public bool IsSupported { get; set; }
+        public double EstimatedSpeedup { get; set; }
+        public string Type { get; set; } = string.Empty;
+    }
+}
diff --git a/src/InferenceOptimization/Examples/OptimizationExample.cs b/src/InferenceOptimization/Examples/OptimizationExample.cs
deleted file mode 100644
index 049b0cccf..000000000
--- a/src/InferenceOptimization/Examples/OptimizationExample.cs
+++ /dev/null
@@ -1,242 +0,0 @@
-using AiDotNet.InferenceOptimization.Core;
-using AiDotNet.Interfaces;
-
-namespace AiDotNet.InferenceOptimization.Examples;
-
-/// <summary>
-/// Example usage of the inference optimization system.
-/// </summary>
-public class OptimizationExample
-{
-    /// <summary>
-    /// Example 1: Basic optimization of a simple CNN
-    /// </summary>
-    public static void BasicCNNOptimization()
-    {
-        Console.WriteLine("=== Example 1: Basic CNN Optimization ===\n");
-
-        // Create a simple CNN (pseudo-code, adapt to your model structure)
-        var layers = new List<ILayer<double>>
-        {
-            // Convolutional layer + BatchNorm + ReLU (will be fused)
-            // MaxPooling
-            // Another Conv + BatchNorm + ReLU (will be fused)
-            // Flatten
-            // Dense + Bias + ReLU (will be fused)
-            // Output Dense
-        };
-
-        // Build optimization graph
-        var graphBuilder = new GraphBuilder<double>();
-        var graph = graphBuilder.BuildFromLayers(layers);
-
-        Console.WriteLine($"Original Graph: {graph.GetStatistics()}\n");
-
-        // Optimize with Standard level
-        var options = OptimizationOptions.FromLevel(OptimizationLevel.Standard);
-        options.PrintStatistics = true;
-
-        var optimizer = new GraphOptimizer<double>(options);
-        optimizer.Optimize(graph);
-
-        Console.WriteLine("\nOptimization complete!");
-    }
-
-    /// <summary>
-    /// Example 2: Aggressive optimization for production deployment
-    /// </summary>
-    public static void ProductionOptimization()
-    {
-        Console.WriteLine("=== Example 2: Production Optimization ===\n");
-
-        // Create your model layers
-        var layers = new List<ILayer<double>>(); // Your layers here
-
-        var graphBuilder = new GraphBuilder<double>();
-        var graph = graphBuilder.BuildFromLayers(layers);
-
-        // Use Aggressive optimization for production
-        var options = new OptimizationOptions
-        {
-            Level = OptimizationLevel.Aggressive,
-            EnableOperatorFusion = true,
-            EnableMemoryReuse = true,
-            EnableCSE = true,
-            EnableInPlaceOptimization = true,
-            TargetLayout = "NCHW", // Optimize for GPU
-            PrintStatistics = true,
-            ValidateAfterEachPass = true
-        };
-
-        var optimizer = new GraphOptimizer<double>(options);
-        optimizer.Optimize(graph);
-
-        Console.WriteLine("\nProduction-ready optimized graph created!");
-    }
-
-    /// <summary>
-    /// Example 3: Custom optimization pass
-    /// </summary>
-    public static void CustomPassExample()
-    {
-        Console.WriteLine("=== Example 3: Custom Optimization Pass ===\n");
-
-        var graphBuilder = new GraphBuilder<double>();
-        var layers = new List<ILayer<double>>(); // Your layers
-        var graph = graphBuilder.BuildFromLayers(layers);
-
-        // Create optimizer
-        var optimizer = new GraphOptimizer<double>();
-
-        // Add custom pass (implement your own IOptimizationPass<T>)
-        // optimizer.AddPass(new MyCustomPass<double>());
-
-        optimizer.Optimize(graph);
-
-        Console.WriteLine("Custom optimization applied!");
-    }
-
-    /// <summary>
-    /// Example 4: Comparing different optimization levels
-    /// </summary>
-    public static void CompareOptimizationLevels()
-    {
-        Console.WriteLine("=== Example 4: Comparing Optimization Levels ===\n");
-
-        var graphBuilder = new GraphBuilder<double>();
-        var layers = new List<ILayer<double>>(); // Your layers
-        var originalGraph = graphBuilder.BuildFromLayers(layers);
-
-        var levels = new[]
-        {
-            OptimizationLevel.None,
-            OptimizationLevel.Basic,
-            OptimizationLevel.Standard,
-            OptimizationLevel.Aggressive,
-            OptimizationLevel.Maximum
-        };
-
-        foreach (var level in levels)
-        {
-            Console.WriteLine($"\n--- Testing {level} Level ---");
-
-            var options = OptimizationOptions.FromLevel(level);
-            options.PrintStatistics = true;
-
-            var optimizer = new GraphOptimizer<double>(options);
-            optimizer.Optimize(originalGraph.Clone());
-
-            Console.WriteLine($"Level {level} complete\n");
-        }
-    }
-
-    /// <summary>
-    /// Example 5: Transformer model optimization
-    /// </summary>
-    public static void TransformerOptimization()
-    {
-        Console.WriteLine("=== Example 5: Transformer Optimization ===\n");
-
-        // Build transformer graph
-        var graphBuilder = new GraphBuilder<double>();
-        var layers = new List<ILayer<double>>
-        {
-            // Multi-head attention (will be fused)
-            // Layer normalization
-            // Feed-forward: Dense + Bias + GELU (will be fused)
-            // Dense + Bias (will be fused)
-            // Layer normalization
-            // etc.
-        };
-
-        var graph = graphBuilder.BuildFromLayers(layers);
-
-        // Optimize for transformer
-        var options = new OptimizationOptions
-        {
-            Level = OptimizationLevel.Aggressive,
-            EnableOperatorFusion = true,
-            EnableMemoryReuse = true,
-            PrintStatistics = true
-        };
-
-        var optimizer = new GraphOptimizer<double>(options);
-        optimizer.Optimize(graph);
-
-        Console.WriteLine("\nTransformer optimized!");
-        Console.WriteLine("Expected speedup: 2-3x");
-    }
-
-    /// <summary>
-    /// Example 6: Memory-constrained optimization
-    /// </summary>
-    public static void MemoryConstrainedOptimization()
-    {
-        Console.WriteLine("=== Example 6: Memory-Constrained Optimization ===\n");
-
-        var graphBuilder = new GraphBuilder<double>();
-        var layers = new List<ILayer<double>>(); // Your layers
-        var graph = graphBuilder.BuildFromLayers(layers);
-
-        // Prioritize memory optimizations
-        var options = new OptimizationOptions
-        {
-            Level = OptimizationLevel.Aggressive,
-            EnableMemoryReuse = true,
-            EnableInPlaceOptimization = true,
-            EnableOperatorFusion = true, // Also reduces memory
-            PrintStatistics = true
-        };
-
-        var optimizer = new GraphOptimizer<double>(options);
-        optimizer.Optimize(graph);
-
-        Console.WriteLine("\nMemory-optimized graph created!");
-        Console.WriteLine("Expected memory reduction: 30-50%");
-    }
-
-    /// <summary>
-    /// Example 7: Inspect optimization passes
-    /// </summary>
-    public static void InspectPasses()
-    {
-        Console.WriteLine("=== Example 7: Inspect Optimization Passes ===\n");
-
-        var optimizer = new GraphOptimizer<double>(
-            OptimizationOptions.FromLevel(OptimizationLevel.Aggressive)
-        );
-
-        var passes = optimizer.GetPasses();
-
-        Console.WriteLine($"Total passes: {passes.Count}\n");
-
-        foreach (var pass in passes)
-        {
-            Console.WriteLine($"- {pass.Name} ({pass.PassType})");
-        }
-    }
-
-    public static void Main(string[] args)
-    {
-        // Run all examples
-        BasicCNNOptimization();
-        Console.WriteLine("\n" + new string('=', 60) + "\n");
-
-        ProductionOptimization();
-        Console.WriteLine("\n" + new string('=', 60) + "\n");
-
-        CustomPassExample();
-        Console.WriteLine("\n" + new string('=', 60) + "\n");
-
-        CompareOptimizationLevels();
-        Console.WriteLine("\n" + new string('=', 60) + "\n");
-
-        TransformerOptimization();
-        Console.WriteLine("\n" + new string('=', 60) + "\n");
-
-        MemoryConstrainedOptimization();
-        Console.WriteLine("\n" + new string('=', 60) + "\n");
-
-        InspectPasses();
-    }
-}
diff --git a/src/InferenceOptimization/ICustomOperator.cs b/src/InferenceOptimization/ICustomOperator.cs
new file mode 100644
index 000000000..734285534
--- /dev/null
+++ b/src/InferenceOptimization/ICustomOperator.cs
@@ -0,0 +1,48 @@
+using System;
+using AiDotNet.LinearAlgebra;
+
+namespace AiDotNet.InferenceOptimization
+{
+    /// <summary>
+    /// Defines the contract for custom operators with hardware-specific optimizations
+    /// </summary>
+    public interface ICustomOperator
+    {
+        /// <summary>
+        /// Gets the unique name of the operator
+        /// </summary>
+        string Name { get; }
+
+        /// <summary>
+        /// Gets the version of the operator implementation
+        /// </summary>
+        string Version { get; }
+
+        /// <summary>
+        /// Gets the priority level (higher values are preferred)
+        /// </summary>
+        int Priority { get; }
+
+        /// <summary>
+        /// Determines if the operator can run on the current platform
+        /// </summary>
+        bool IsSupported();
+
+        /// <summary>
+        /// Estimates the relative performance gain over reference implementation
+        /// </summary>
+        /// <returns>Expected speedup multiplier (e.g., 2.0 for 2x speedup)</returns>
+        double EstimatedSpeedup();
+    }
+
+    /// <summary>
+    /// Base interface for custom operators that work with tensors
+    /// </summary>
+    public interface ICustomOperator<T> : ICustomOperator where T : struct
+    {
+        /// <summary>
+        /// Executes the operator on input tensors
+        /// </summary>
+        Tensor<T> Execute(params Tensor<T>[] inputs);
+    }
+}
diff --git a/src/InferenceOptimization/Kernels/AttentionKernel.cs b/src/InferenceOptimization/Kernels/AttentionKernel.cs
new file mode 100644
index 000000000..930512e80
--- /dev/null
+++ b/src/InferenceOptimization/Kernels/AttentionKernel.cs
@@ -0,0 +1,331 @@
+using System;
+using System.Threading.Tasks;
+using AiDotNet.LinearAlgebra;
+using AiDotNet.Tensors.Engines.Simd;
+
+namespace AiDotNet.InferenceOptimization.Kernels
+{
+    /// <summary>
+    /// Fused attention kernel for transformer models
+    /// Implements optimized scaled dot-product attention: softmax(QK^T/sqrt(d_k))V
+    /// </summary>
+    public class AttentionKernel : ICustomOperator<float>
+    {
+        public string Name => "FusedAttention";
+        public string Version => "1.0.0";
+        public int Priority => 100;
+
+        public AttentionKernel() { }
+
+        public bool IsSupported()
+        {
+            return true;
+        }
+
+        public double EstimatedSpeedup()
+        {
+            // Fused attention reduces memory traffic significantly
+            return 2.5;
+        }
+
+        public Tensor<float> Execute(params Tensor<float>[] inputs)
+        {
+            if (inputs == null || inputs.Length < 3)
+                throw new ArgumentException("Attention requires Q, K, V tensors");
+
+            var q = inputs[0]; // [batch_size, seq_len_q, d_k]
+            var k = inputs[1]; // [batch_size, seq_len_k, d_k]
+            var v = inputs[2]; // [batch_size, seq_len_v, d_v]
+
+            bool useMask = inputs.Length > 3;
+            Tensor<float>? mask = useMask ? inputs[3] : null;
+
+            return ExecuteInternal(q, k, v, mask, maskBatchModulo: q.Shape.Length == 3 ? q.Shape[0] : 0);
+        }
+
+        private Tensor<float> ExecuteInternal(
+            Tensor<float> q,
+            Tensor<float> k,
+            Tensor<float> v,
+            Tensor<float>? mask,
+            int maskBatchModulo)
+        {
+            if (q.Shape.Length != 3 || k.Shape.Length != 3 || v.Shape.Length != 3)
+                throw new ArgumentException("Attention requires 3D tensors [batch, seq_len, features]");
+
+            int batchSize = q.Shape[0];
+            int seqLenQ = q.Shape[1];
+            int seqLenK = k.Shape[1];
+            int dK = q.Shape[2];
+            int dV = v.Shape[2];
+
+            if (k.Shape[0] != batchSize || v.Shape[0] != batchSize)
+                throw new ArgumentException("Q, K, and V must have the same batch size");
+
+            if (k.Shape[2] != dK)
+                throw new ArgumentException("Q and K must have same feature dimension");
+
+            if (v.Shape[1] != seqLenK)
+                throw new ArgumentException("K and V must have same sequence length");
+
+            if (mask != null)
+            {
+                if (mask.Shape.Length != 3)
+                    throw new ArgumentException("Attention mask must be a 3D tensor [batch, seq_len_q, seq_len_k]");
+
+                if (mask.Shape[1] != seqLenQ || mask.Shape[2] != seqLenK)
+                    throw new ArgumentException("Attention mask must match [batch, seq_len_q, seq_len_k]");
+
+                if (maskBatchModulo <= 0)
+                {
+                    if (mask.Shape[0] != batchSize)
+                        throw new ArgumentException("Attention mask must have the same batch size as Q when used in Execute()");
+                }
+                else
+                {
+                    if (mask.Shape[0] != maskBatchModulo)
+                        throw new ArgumentException("Attention mask batch dimension must match the provided maskBatchModulo");
+                }
+            }
+
+            var result = new Tensor<float>(new[] { batchSize, seqLenQ, dV });
+
+            // Process each batch in parallel
+            Parallel.For(0, batchSize, b =>
+            {
+                ProcessBatch(q, k, v, mask, result, b, seqLenQ, seqLenK, dK, dV, maskBatchModulo);
+            });
+
+            return result;
+        }
+
+        private unsafe void ProcessBatch(
+            Tensor<float> q, Tensor<float> k, Tensor<float> v,
+            Tensor<float>? mask, Tensor<float> result,
+            int batchIdx, int seqLenQ, int seqLenK, int dK, int dV,
+            int maskBatchModulo)
+        {
+            float scale = 1.0f / MathF.Sqrt(dK);
+
+            // Extract batch slices
+            int qOffset = batchIdx * seqLenQ * dK;
+            int kOffset = batchIdx * seqLenK * dK;
+            int vOffset = batchIdx * seqLenK * dV;
+            int outOffset = batchIdx * seqLenQ * dV;
+
+            // Compute attention scores: QK^T
+            var scores = new float[seqLenQ * seqLenK];
+
+            fixed (float* pQ = q.Data, pK = k.Data, pScores = scores)
+            {
+                for (int i = 0; i < seqLenQ; i++)
+                {
+                    for (int j = 0; j < seqLenK; j++)
+                    {
+                        float* qRow = pQ + qOffset + i * dK;
+                        float* kRow = pK + kOffset + j * dK;
+                        float score = SimdKernels.DotProduct(qRow, kRow, dK) * scale;
+
+                        // Apply mask if provided
+                        if (mask != null)
+                        {
+                            int effectiveMaskBatch = maskBatchModulo > 0 ? (batchIdx % maskBatchModulo) : batchIdx;
+                            int maskIdx = effectiveMaskBatch * seqLenQ * seqLenK + i * seqLenK + j;
+                            // Use epsilon-based comparison for floating point equality
+                            if (MathF.Abs(mask.Data[maskIdx]) < 1e-6f)
+                            {
+                                score = float.NegativeInfinity;
+                            }
+                        }
+
+                        pScores[i * seqLenK + j] = score;
+                    }
+                }
+            }
+
+            // Apply softmax over each row
+            ApplySoftmax(scores, seqLenQ, seqLenK);
+
+            // Compute weighted sum: attention_weights * V
+            fixed (float* pScores = scores, pV = v.Data, pOut = result.Data)
+            {
+                for (int i = 0; i < seqLenQ; i++)
+                {
+                    float* outRow = pOut + outOffset + i * dV;
+
+                    // Initialize output row to zero
+                    for (int j = 0; j < dV; j++)
+                    {
+                        outRow[j] = 0.0f;
+                    }
+
+                    // Accumulate weighted values
+                    for (int j = 0; j < seqLenK; j++)
+                    {
+                        float weight = pScores[i * seqLenK + j];
+                        float* vRow = pV + vOffset + j * dV;
+
+                        // outRow += weight * vRow
+                        SimdKernels.ScalarMultiplyAdd(outRow, vRow, weight, outRow, dV);
+                    }
+                }
+            }
+        }
+
+        private unsafe void ApplySoftmax(float[] data, int rows, int cols)
+        {
+            fixed (float* pData = data)
+            {
+                for (int i = 0; i < rows; i++)
+                {
+                    float* row = pData + i * cols;
+
+                    // Find max for numerical stability
+                    float maxVal = float.NegativeInfinity;
+                    for (int j = 0; j < cols; j++)
+                    {
+                        if (row[j] > maxVal)
+                            maxVal = row[j];
+                    }
+
+                    // Compute exp and sum
+                    float sum = 0.0f;
+                    for (int j = 0; j < cols; j++)
+                    {
+                        if (float.IsNegativeInfinity(row[j]))
+                        {
+                            row[j] = 0.0f;
+                        }
+                        else
+                        {
+                            row[j] = MathF.Exp(row[j] - maxVal);
+                            sum += row[j];
+                        }
+                    }
+
+                    // Normalize
+                    if (sum > 0.0f)
+                    {
+                        float invSum = 1.0f / sum;
+                        for (int j = 0; j < cols; j++)
+                        {
+                            row[j] *= invSum;
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Multi-head attention variant
+        /// </summary>
+        public Tensor<float> MultiHeadAttention(
+            Tensor<float> q, Tensor<float> k, Tensor<float> v,
+            int numHeads, Tensor<float>? mask = null)
+        {
+            if (q.Shape.Length != 3 || k.Shape.Length != 3 || v.Shape.Length != 3)
+                throw new ArgumentException("Multi-head attention requires 3D tensors");
+
+            int batchSize = q.Shape[0];
+            int dModel = q.Shape[2];
+
+            if (k.Shape[0] != batchSize || v.Shape[0] != batchSize)
+                throw new ArgumentException("Q, K, and V must have the same batch size");
+
+            if (dModel % numHeads != 0)
+                throw new ArgumentException("d_model must be divisible by num_heads");
+
+            int dK = dModel / numHeads;
+
+            if (k.Shape[2] != dModel || v.Shape[2] != dModel)
+                throw new ArgumentException("Q, K, and V must have the same feature dimension (d_model)");
+
+            if (v.Shape[1] != k.Shape[1])
+                throw new ArgumentException("K and V must have the same sequence length");
+
+            // Reshape to [batch * num_heads, seq_len, d_k]
+            var qReshaped = ReshapeForMultiHead(q, numHeads, dK);
+            var kReshaped = ReshapeForMultiHead(k, numHeads, dK);
+            var vReshaped = ReshapeForMultiHead(v, numHeads, dK);
+
+            // Apply attention
+            Tensor<float> attended;
+            if (mask is null)
+            {
+                attended = ExecuteInternal(qReshaped, kReshaped, vReshaped, mask: null, maskBatchModulo: 0);
+            }
+            else
+            {
+                int expectedPerHeadBatch = batchSize * numHeads;
+                if (mask.Shape.Length != 3)
+                    throw new ArgumentException("Multi-head attention mask must be a 3D tensor");
+
+                if (mask.Shape[1] != q.Shape[1] || mask.Shape[2] != k.Shape[1])
+                    throw new ArgumentException("Multi-head attention mask must match [batch, seq_len_q, seq_len_k]");
+
+                // Accept either per-batch mask [B, SQ, SK] (broadcast across heads) or per-head mask [B*H, SQ, SK].
+                int maskBatchModulo = mask.Shape[0] switch
+                {
+                    int b when b == expectedPerHeadBatch => 0,
+                    int b when b == batchSize => batchSize,
+                    _ => throw new ArgumentException("Multi-head attention mask must have batch dimension B or B*numHeads"),
+                };
+
+                attended = ExecuteInternal(qReshaped, kReshaped, vReshaped, mask, maskBatchModulo);
+            }
+
+            // Reshape back to [batch, seq_len, d_model]
+            return ReshapeFromMultiHead(attended, batchSize, q.Shape[1], dModel);
+        }
+
+        private Tensor<float> ReshapeForMultiHead(Tensor<float> input, int numHeads, int dK)
+        {
+            int batchSize = input.Shape[0];
+            int seqLen = input.Shape[1];
+            var reshaped = new Tensor<float>(new[] { batchSize * numHeads, seqLen, dK });
+
+            for (int b = 0; b < batchSize; b++)
+            {
+                for (int h = 0; h < numHeads; h++)
+                {
+                    for (int s = 0; s < seqLen; s++)
+                    {
+                        for (int d = 0; d < dK; d++)
+                        {
+                            int srcIdx = b * seqLen * numHeads * dK + s * numHeads * dK + h * dK + d;
+                            int dstIdx = (b * numHeads + h) * seqLen * dK + s * dK + d;
+                            reshaped.Data[dstIdx] = input.Data[srcIdx];
+                        }
+                    }
+                }
+            }
+
+            return reshaped;
+        }
+
+        private Tensor<float> ReshapeFromMultiHead(Tensor<float> input, int batchSize, int seqLen, int dModel)
+        {
+            var reshaped = new Tensor<float>(new[] { batchSize, seqLen, dModel });
+            int numHeads = input.Shape[0] / batchSize;
+            int dK = input.Shape[2];
+
+            for (int b = 0; b < batchSize; b++)
+            {
+                for (int h = 0; h < numHeads; h++)
+                {
+                    for (int s = 0; s < seqLen; s++)
+                    {
+                        for (int d = 0; d < dK; d++)
+                        {
+                            int srcIdx = (b * numHeads + h) * seqLen * dK + s * dK + d;
+                            int dstIdx = b * seqLen * dModel + s * dModel + h * dK + d;
+                            reshaped.Data[dstIdx] = input.Data[srcIdx];
+                        }
+                    }
+                }
+            }
+
+            return reshaped;
+        }
+    }
+}
diff --git a/src/InferenceOptimization/Kernels/ConvolutionKernel.cs b/src/InferenceOptimization/Kernels/ConvolutionKernel.cs
new file mode 100644
index 000000000..1c25a4b32
--- /dev/null
+++ b/src/InferenceOptimization/Kernels/ConvolutionKernel.cs
@@ -0,0 +1,367 @@
+using System;
+using System.Threading.Tasks;
+using AiDotNet.LinearAlgebra;
+using AiDotNet.Tensors.Engines;
+
+namespace AiDotNet.InferenceOptimization.Kernels
+{
+    /// <summary>
+    /// Optimized convolution kernels including depthwise and group convolutions
+    /// </summary>
+    public class ConvolutionKernel : ICustomOperator<float>
+    {
+        public string Name => "Convolution";
+        public string Version => "1.0.0";
+        public int Priority => 100;
+
+        public bool IsSupported()
+        {
+            return true;
+        }
+
+        public double EstimatedSpeedup()
+        {
+            var caps = PlatformDetector.Capabilities;
+            if (caps.HasAVX2) return 2.5;
+            if (caps.HasNeon) return 2.0;
+            return 1.5;
+        }
+
+        /// <summary>
+        /// Executes convolution on the provided inputs.
+        /// Expects 2-3 inputs: input tensor, kernel tensor, and optional config tensor.
+        /// Config tensor format: [stride, padding] (defaults to stride=1, padding=0)
+        /// </summary>
+        public Tensor<float> Execute(params Tensor<float>[] inputs)
+        {
+            if (inputs == null || inputs.Length < 2)
+            {
+                throw new ArgumentException(
+                    "ConvolutionKernel requires at least 2 inputs: input tensor and kernel tensor. " +
+                    "Optional 3rd input for config [stride, padding].");
+            }
+
+            var input = inputs[0];
+            var kernel = inputs[1];
+
+            // Extract stride and padding from optional config tensor or use defaults
+            int stride = 1;
+            int padding = 0;
+
+            if (inputs.Length >= 3 && inputs[2] != null && inputs[2].Data.Length >= 2)
+            {
+                stride = Math.Max(1, (int)inputs[2].Data[0]);
+                padding = Math.Max(0, (int)inputs[2].Data[1]);
+            }
+
+            // Determine convolution type based on kernel shape
+            // Standard: kernel[out_channels, in_channels, kH, kW]
+            // Depthwise: kernel[channels, 1, kH, kW]
+            if (kernel.Shape.Length == 4 && kernel.Shape[1] == 1)
+            {
+                // Depthwise convolution (kernel has 1 in_channel dimension)
+                return DepthwiseConv2D(input, kernel, stride, padding);
+            }
+
+            // Default to standard 2D convolution
+            return Conv2D(input, kernel, stride, padding);
+        }
+
+        /// <summary>
+        /// Standard 2D convolution
+        /// </summary>
+        public Tensor<float> Conv2D(
+            Tensor<float> input,
+            Tensor<float> kernel,
+            int stride = 1,
+            int padding = 0)
+        {
+            // Input: [batch, in_channels, height, width]
+            // Kernel: [out_channels, in_channels, kernel_h, kernel_w]
+
+            if (input.Shape.Length != 4 || kernel.Shape.Length != 4)
+                throw new ArgumentException("Conv2D requires 4D tensors");
+
+            int batchSize = input.Shape[0];
+            int inChannels = input.Shape[1];
+            int inHeight = input.Shape[2];
+            int inWidth = input.Shape[3];
+
+            int outChannels = kernel.Shape[0];
+            int kernelH = kernel.Shape[2];
+            int kernelW = kernel.Shape[3];
+
+            if (kernel.Shape[1] != inChannels)
+                throw new ArgumentException($"Conv2D requires kernel.Shape[1] == inChannels ({inChannels}), but got {kernel.Shape[1]}");
+
+            int outHeight = (inHeight + 2 * padding - kernelH) / stride + 1;
+            int outWidth = (inWidth + 2 * padding - kernelW) / stride + 1;
+
+            if (outHeight <= 0 || outWidth <= 0)
+                throw new ArgumentException(
+                    $"Invalid output dimensions ({outHeight}x{outWidth}). " +
+                    $"Check stride ({stride}), padding ({padding}), and kernel size ({kernelH}x{kernelW}).");
+            var output = new Tensor<float>(new[] { batchSize, outChannels, outHeight, outWidth });
+
+            // Parallelize over batch and output channels
+            Parallel.For(0, batchSize * outChannels, idx =>
+            {
+                int b = idx / outChannels;
+                int oc = idx % outChannels;
+
+                Conv2DSingleOutput(input, kernel, output, b, oc,
+                    inChannels, inHeight, inWidth,
+                    kernelH, kernelW, stride, padding,
+                    outHeight, outWidth);
+            });
+
+            return output;
+        }
+
+        private unsafe void Conv2DSingleOutput(
+            Tensor<float> input, Tensor<float> kernel, Tensor<float> output,
+            int batch, int outChannel,
+            int inChannels, int inHeight, int inWidth,
+            int kernelH, int kernelW, int stride, int padding,
+            int outHeight, int outWidth)
+        {
+            fixed (float* pInput = input.Data, pKernel = kernel.Data, pOutput = output.Data)
+            {
+                for (int oh = 0; oh < outHeight; oh++)
+                {
+                    for (int ow = 0; ow < outWidth; ow++)
+                    {
+                        float sum = 0.0f;
+
+                        for (int ic = 0; ic < inChannels; ic++)
+                        {
+                            for (int kh = 0; kh < kernelH; kh++)
+                            {
+                                for (int kw = 0; kw < kernelW; kw++)
+                                {
+                                    int ih = oh * stride - padding + kh;
+                                    int iw = ow * stride - padding + kw;
+
+                                    if (ih >= 0 && ih < inHeight && iw >= 0 && iw < inWidth)
+                                    {
+                                        int inputIdx = ((batch * inChannels + ic) * inHeight + ih) * inWidth + iw;
+                                        int kernelIdx = ((outChannel * inChannels + ic) * kernelH + kh) * kernelW + kw;
+
+                                        sum += pInput[inputIdx] * pKernel[kernelIdx];
+                                    }
+                                }
+                            }
+                        }
+
+                        int outputIdx = ((batch * output.Shape[1] + outChannel) * outHeight + oh) * outWidth + ow;
+                        pOutput[outputIdx] = sum;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Depthwise separable convolution (more efficient for mobile architectures)
+        /// </summary>
+        public Tensor<float> DepthwiseConv2D(
+            Tensor<float> input,
+            Tensor<float> kernel,
+            int stride = 1,
+            int padding = 0)
+        {
+            // Input: [batch, channels, height, width]
+            // Kernel: [channels, 1, kernel_h, kernel_w]
+
+            if (input.Shape.Length != 4 || kernel.Shape.Length != 4)
+                throw new ArgumentException("DepthwiseConv2D requires 4D tensors");
+
+            int batchSize = input.Shape[0];
+            int channels = input.Shape[1];
+            int inHeight = input.Shape[2];
+            int inWidth = input.Shape[3];
+
+            int kernelH = kernel.Shape[2];
+            int kernelW = kernel.Shape[3];
+
+            int outHeight = (inHeight + 2 * padding - kernelH) / stride + 1;
+            int outWidth = (inWidth + 2 * padding - kernelW) / stride + 1;
+
+            if (outHeight <= 0 || outWidth <= 0)
+                throw new ArgumentException(
+                    $"Invalid output dimensions ({outHeight}x{outWidth}). " +
+                    $"Check stride ({stride}), padding ({padding}), and kernel size ({kernelH}x{kernelW}).");
+
+            if (kernel.Shape[1] != 1)
+                throw new ArgumentException(
+                    $"Depthwise convolution requires kernel.Shape[1] == 1, but got {kernel.Shape[1]}");
+
+            if (kernel.Shape[0] != channels)
+                throw new ArgumentException(
+                    $"Depthwise convolution requires kernel.Shape[0] == channels ({channels}), but got {kernel.Shape[0]}");
+
+            var output = new Tensor<float>(new[] { batchSize, channels, outHeight, outWidth });
+
+            Parallel.For(0, batchSize * channels, idx =>
+            {
+                int b = idx / channels;
+                int c = idx % channels;
+
+                DepthwiseConv2DSingleChannel(input, kernel, output, b, c,
+                    inHeight, inWidth, kernelH, kernelW,
+                    stride, padding, outHeight, outWidth);
+            });
+
+            return output;
+        }
+
+        private unsafe void DepthwiseConv2DSingleChannel(
+            Tensor<float> input, Tensor<float> kernel, Tensor<float> output,
+            int batch, int channel,
+            int inHeight, int inWidth, int kernelH, int kernelW,
+            int stride, int padding, int outHeight, int outWidth)
+        {
+            fixed (float* pInput = input.Data, pKernel = kernel.Data, pOutput = output.Data)
+            {
+                for (int oh = 0; oh < outHeight; oh++)
+                {
+                    for (int ow = 0; ow < outWidth; ow++)
+                    {
+                        float sum = 0.0f;
+
+                        for (int kh = 0; kh < kernelH; kh++)
+                        {
+                            for (int kw = 0; kw < kernelW; kw++)
+                            {
+                                int ih = oh * stride - padding + kh;
+                                int iw = ow * stride - padding + kw;
+
+                                if (ih >= 0 && ih < inHeight && iw >= 0 && iw < inWidth)
+                                {
+                                    int inputIdx = ((batch * input.Shape[1] + channel) * inHeight + ih) * inWidth + iw;
+                                    int kernelIdx = (channel * kernelH + kh) * kernelW + kw;
+
+                                    sum += pInput[inputIdx] * pKernel[kernelIdx];
+                                }
+                            }
+                        }
+
+                        int outputIdx = ((batch * output.Shape[1] + channel) * outHeight + oh) * outWidth + ow;
+                        pOutput[outputIdx] = sum;
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Group convolution (reduces parameters and computation)
+        /// </summary>
+        public Tensor<float> GroupConv2D(
+            Tensor<float> input,
+            Tensor<float> kernel,
+            int groups,
+            int stride = 1,
+            int padding = 0)
+        {
+            if (input.Shape.Length != 4 || kernel.Shape.Length != 4)
+                throw new ArgumentException("GroupConv2D requires 4D tensors");
+
+            int batchSize = input.Shape[0];
+            int inChannels = input.Shape[1];
+            int inHeight = input.Shape[2];
+            int inWidth = input.Shape[3];
+
+            int outChannels = kernel.Shape[0];
+            int kernelH = kernel.Shape[2];
+            int kernelW = kernel.Shape[3];
+
+            if (groups <= 0)
+                throw new ArgumentOutOfRangeException(nameof(groups), "groups must be positive.");
+
+            if (inChannels % groups != 0 || outChannels % groups != 0)
+                throw new ArgumentException("Channels must be divisible by groups");
+
+            int inChannelsPerGroup = inChannels / groups;
+            int outChannelsPerGroup = outChannels / groups;
+
+            if (kernel.Shape[1] != inChannelsPerGroup)
+                throw new ArgumentException(
+                    $"Group convolution requires kernel.Shape[1] == inChannelsPerGroup ({inChannelsPerGroup}), " +
+                    $"but got {kernel.Shape[1]}");
+
+            int outHeight = (inHeight + 2 * padding - kernelH) / stride + 1;
+            int outWidth = (inWidth + 2 * padding - kernelW) / stride + 1;
+
+            if (outHeight <= 0 || outWidth <= 0)
+                throw new ArgumentException(
+                    $"Invalid output dimensions ({outHeight}x{outWidth}). " +
+                    $"Check stride ({stride}), padding ({padding}), and kernel size ({kernelH}x{kernelW}).");
+
+            var output = new Tensor<float>(new[] { batchSize, outChannels, outHeight, outWidth });
+
+            // Process each group independently
+            Parallel.For(0, groups, g =>
+            {
+                for (int b = 0; b < batchSize; b++)
+                {
+                    for (int oc = 0; oc < outChannelsPerGroup; oc++)
+                    {
+                        int globalOutChannel = g * outChannelsPerGroup + oc;
+
+                        GroupConv2DSingleOutput(input, kernel, output, b, globalOutChannel, g,
+                            inChannelsPerGroup, inHeight, inWidth,
+                            kernelH, kernelW, stride, padding,
+                            outHeight, outWidth);
+                    }
+                }
+            });
+
+            return output;
+        }
+
+        private unsafe void GroupConv2DSingleOutput(
+            Tensor<float> input, Tensor<float> kernel, Tensor<float> output,
+            int batch, int outChannel, int group,
+            int inChannelsPerGroup, int inHeight, int inWidth,
+            int kernelH, int kernelW, int stride, int padding,
+            int outHeight, int outWidth)
+        {
+            int inChannelStart = group * inChannelsPerGroup;
+
+            fixed (float* pInput = input.Data, pKernel = kernel.Data, pOutput = output.Data)
+            {
+                for (int oh = 0; oh < outHeight; oh++)
+                {
+                    for (int ow = 0; ow < outWidth; ow++)
+                    {
+                        float sum = 0.0f;
+
+                        for (int ic = 0; ic < inChannelsPerGroup; ic++)
+                        {
+                            int globalInChannel = inChannelStart + ic;
+
+                            for (int kh = 0; kh < kernelH; kh++)
+                            {
+                                for (int kw = 0; kw < kernelW; kw++)
+                                {
+                                    int ih = oh * stride - padding + kh;
+                                    int iw = ow * stride - padding + kw;
+
+                                    if (ih >= 0 && ih < inHeight && iw >= 0 && iw < inWidth)
+                                    {
+                                        int inputIdx = ((batch * input.Shape[1] + globalInChannel) * inHeight + ih) * inWidth + iw;
+                                        int kernelIdx = ((outChannel * inChannelsPerGroup + ic) * kernelH + kh) * kernelW + kw;
+
+                                        sum += pInput[inputIdx] * pKernel[kernelIdx];
+                                    }
+                                }
+                            }
+                        }
+
+                        int outputIdx = ((batch * output.Shape[1] + outChannel) * outHeight + oh) * outWidth + ow;
+                        pOutput[outputIdx] = sum;
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/InferenceOptimization/Kernels/GemmKernel.cs b/src/InferenceOptimization/Kernels/GemmKernel.cs
new file mode 100644
index 000000000..5b36479e1
--- /dev/null
+++ b/src/InferenceOptimization/Kernels/GemmKernel.cs
@@ -0,0 +1,186 @@
+using System;
+using System.Runtime.CompilerServices;
+using System.Threading.Tasks;
+using AiDotNet.LinearAlgebra;
+using AiDotNet.Tensors.Engines;
+using AiDotNet.Tensors.Engines.Simd;
+
+namespace AiDotNet.InferenceOptimization.Kernels
+{
+    /// <summary>
+    /// Optimized General Matrix Multiplication (GEMM) kernel
+    /// Implements cache-aware blocked matrix multiplication with SIMD
+    /// </summary>
+    public class GemmKernel : ICustomOperator<float>
+    {
+        private const int BlockSize = 64; // Tuned for typical L1 cache
+        private const int MinParallelSize = 256; // Minimum size for parallel execution
+
+        public string Name => "GEMM";
+        public string Version => "1.0.0";
+        public int Priority => 100;
+
+        public bool IsSupported()
+        {
+            // GEMM is always supported, but performance varies by platform
+            return true;
+        }
+
+        public double EstimatedSpeedup()
+        {
+            var caps = PlatformDetector.Capabilities;
+            if (caps.HasAVX2) return 3.0;
+            if (caps.HasSSE42) return 2.0;
+            if (caps.HasNeon) return 2.5;
+            return 1.5;
+        }
+
+        public Tensor<float> Execute(params Tensor<float>[] inputs)
+        {
+            if (inputs == null || inputs.Length < 2)
+                throw new ArgumentException("GEMM requires at least 2 input tensors");
+
+            var a = inputs[0];
+            var b = inputs[1];
+
+            if (a.Shape.Length != 2 || b.Shape.Length != 2)
+                throw new ArgumentException("GEMM requires 2D tensors (matrices)");
+
+            int m = a.Shape[0];
+            int k = a.Shape[1];
+            int n = b.Shape[1];
+
+            if (k != b.Shape[0])
+                throw new ArgumentException($"Matrix dimensions incompatible: ({m}x{k}) * ({b.Shape[0]}x{n})");
+
+            var result = new Tensor<float>(new[] { m, n });
+
+            // Choose strategy based on matrix size
+            if (m * n * k < MinParallelSize * MinParallelSize)
+            {
+                GemmBlocked(a.Data, b.Data, result.Data, m, n, k);
+            }
+            else
+            {
+                GemmParallel(a.Data, b.Data, result.Data, m, n, k);
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Cache-blocked GEMM implementation
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private unsafe void GemmBlocked(float[] A, float[] B, float[] C, int M, int N, int K)
+        {
+            fixed (float* pA = A, pB = B, pC = C)
+            {
+                // Blocked algorithm for cache efficiency
+                for (int i = 0; i < M; i += BlockSize)
+                {
+                    int iMax = Math.Min(i + BlockSize, M);
+
+                    for (int j = 0; j < N; j += BlockSize)
+                    {
+                        int jMax = Math.Min(j + BlockSize, N);
+
+                        for (int k = 0; k < K; k += BlockSize)
+                        {
+                            int kMax = Math.Min(k + BlockSize, K);
+
+                            // Process block
+                            for (int ii = i; ii < iMax; ii++)
+                            {
+                                for (int kk = k; kk < kMax; kk++)
+                                {
+                                    float aVal = pA[ii * K + kk];
+                                    float* pBRow = pB + kk * N + j;
+                                    float* pCRow = pC + ii * N + j;
+
+                                    // SIMD-optimized inner loop
+                                    SimdKernels.ScalarMultiplyAdd(pCRow, pBRow, aVal, pCRow, jMax - j);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Parallel GEMM implementation for large matrices
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private unsafe void GemmParallel(float[] A, float[] B, float[] C, int M, int N, int K)
+        {
+            // Parallelize over rows of A
+            Parallel.For(0, (M + BlockSize - 1) / BlockSize, iBlock =>
+            {
+                int i = iBlock * BlockSize;
+                int iMax = Math.Min(i + BlockSize, M);
+
+                fixed (float* pA = A, pB = B, pC = C)
+                {
+                    for (int j = 0; j < N; j += BlockSize)
+                    {
+                        int jMax = Math.Min(j + BlockSize, N);
+
+                        for (int k = 0; k < K; k += BlockSize)
+                        {
+                            int kMax = Math.Min(k + BlockSize, K);
+
+                            for (int ii = i; ii < iMax; ii++)
+                            {
+                                for (int kk = k; kk < kMax; kk++)
+                                {
+                                    float aVal = pA[ii * K + kk];
+                                    float* pBRow = pB + kk * N + j;
+                                    float* pCRow = pC + ii * N + j;
+
+                                    SimdKernels.ScalarMultiplyAdd(pCRow, pBRow, aVal, pCRow, jMax - j);
+                                }
+                            }
+                        }
+                    }
+                }
+            });
+        }
+
+        /// <summary>
+        /// Matrix multiplication with transpose B optimization (C = A * B^T)
+        /// </summary>
+        public Tensor<float> GemmTransposeB(Tensor<float> a, Tensor<float> b)
+        {
+            if (a.Shape.Length != 2 || b.Shape.Length != 2)
+                throw new ArgumentException("GemmTransposeB requires 2D tensors");
+
+            int m = a.Shape[0];
+            int k = a.Shape[1];
+            int n = b.Shape[0]; // Note: B is transposed
+
+            if (k != b.Shape[1])
+                throw new ArgumentException("Matrix dimensions incompatible for transpose");
+
+            var result = new Tensor<float>(new[] { m, n });
+
+            Parallel.For(0, m, i =>
+            {
+                unsafe
+                {
+                    fixed (float* pA = a.Data, pB = b.Data, pC = result.Data)
+                    {
+                        for (int j = 0; j < n; j++)
+                        {
+                            float* rowA = pA + i * k;
+                            float* rowB = pB + j * k;
+                            pC[i * n + j] = SimdKernels.DotProduct(rowA, rowB, k);
+                        }
+                    }
+                }
+            });
+
+            return result;
+        }
+    }
+}
diff --git a/src/InferenceOptimization/OptimizationInitializer.cs b/src/InferenceOptimization/OptimizationInitializer.cs
new file mode 100644
index 000000000..2ae7068aa
--- /dev/null
+++ b/src/InferenceOptimization/OptimizationInitializer.cs
@@ -0,0 +1,108 @@
+using System;
+using AiDotNet.InferenceOptimization.Kernels;
+using AiDotNet.Tensors.Engines;
+using AiDotNet.Tensors.Engines.Optimization;
+
+namespace AiDotNet.InferenceOptimization
+{
+    /// <summary>
+    /// Initializes and registers all optimized kernels and operators
+    /// </summary>
+    public static class OptimizationInitializer
+    {
+        private static bool _initialized = false;
+        private static readonly object _lock = new object();
+
+        /// <summary>
+        /// Initializes the inference optimization system
+        /// </summary>
+        public static void Initialize(bool enableProfiling = false)
+        {
+            lock (_lock)
+            {
+                if (_initialized)
+                    return;
+
+                // Enable profiling if requested
+                PerformanceProfiler.Instance.Enabled = enableProfiling;
+
+                // Register optimized kernels
+                RegisterKernels();
+
+                // Print platform capabilities
+                LogPlatformInfo();
+
+                _initialized = true;
+            }
+        }
+
+        private static void RegisterKernels()
+        {
+            var registry = CustomOperatorRegistry.Instance;
+
+            // Register GEMM kernel
+            registry.Register(new GemmKernel());
+
+            // Register Attention kernel
+            registry.Register(new AttentionKernel());
+
+            // Register Convolution kernel
+            registry.Register(new ConvolutionKernel());
+
+            // Future: Register GPU kernels when available
+            // if (PlatformDetector.Capabilities.HasCudaSupport)
+            // {
+            //     registry.Register(new CudaGemmKernel());
+            //     registry.Register(new CudaConvolutionKernel());
+            // }
+        }
+
+        private static void LogPlatformInfo()
+        {
+            Console.WriteLine("=== AiDotNet Inference Optimization ===");
+            Console.WriteLine(PlatformDetector.GetCapabilitiesDescription());
+            Console.WriteLine();
+            Console.WriteLine("Registered Operators:");
+
+            var operatorInfo = CustomOperatorRegistry.Instance.GetOperatorInfo();
+            foreach (var kvp in operatorInfo)
+            {
+                Console.WriteLine($"  {kvp.Key}:");
+                foreach (var info in kvp.Value)
+                {
+                    var status = info.IsSupported ? "✓" : "✗";
+                    Console.WriteLine($"    {status} {info.Version} - Priority: {info.Priority}, Speedup: {info.EstimatedSpeedup:F1}x");
+                }
+            }
+            Console.WriteLine();
+        }
+
+        /// <summary>
+        /// Gets a performance summary
+        /// </summary>
+        public static string GetPerformanceSummary()
+        {
+            if (!_initialized)
+                return "Optimization system not initialized.";
+
+            var report = PerformanceProfiler.Instance.GenerateReport();
+            return report;
+        }
+
+        /// <summary>
+        /// Resets all profiling statistics
+        /// </summary>
+        public static void ResetStatistics()
+        {
+            PerformanceProfiler.Instance.Clear();
+        }
+
+        /// <summary>
+        /// Enables or disables profiling at runtime
+        /// </summary>
+        public static void SetProfilingEnabled(bool enabled)
+        {
+            PerformanceProfiler.Instance.Enabled = enabled;
+        }
+    }
+}
diff --git a/src/InferenceOptimization/README.md b/src/InferenceOptimization/README.md
index 910f80682..de91df0fe 100644
--- a/src/InferenceOptimization/README.md
+++ b/src/InferenceOptimization/README.md
@@ -1,8 +1,98 @@
-# Inference Optimization
-
-This module provides graph-level optimizations for neural network inference in AiDotNet. It implements operator fusion, graph transformations, memory optimizations, and computation optimizations to achieve 2-5x inference speedup.
-
-## Features
+# AiDotNet Inference Optimization
+
+This module provides comprehensive inference optimization for neural networks in AiDotNet, including low-level kernel optimizations and graph-level transformations to achieve 2-5x inference speedup.
+
+## Overview
+
+The optimization module consists of two main components:
+
+1. **Kernel Optimizations**: Hardware-specific acceleration using SIMD, cache optimization, and optimized kernels
+2. **Graph Optimizations**: Operator fusion, graph transformations, and memory optimizations
+
+---
+
+## Part 1: Kernel Optimizations
+
+### Custom Operator Registration System
+- Thread-safe operator registry with automatic fallback
+- Priority-based operator selection
+- Support for multiple implementations per operation
+- Runtime operator switching based on platform capabilities
+
+### Platform Detection
+- Automatic detection of CPU architecture (x86/x64, ARM)
+- SIMD instruction set detection (SSE, AVX, AVX2, AVX-512, NEON)
+- Cache size estimation
+- GPU capability detection (CUDA, OpenCL)
+
+### SIMD Vectorization
+- AVX2/AVX-512 optimized kernels for x86/x64
+- ARM NEON optimized kernels
+- Automatic fallback to scalar implementations
+- Optimized operations:
+  - Vector addition/multiplication
+  - Dot product with FMA support
+  - ReLU activation
+  - Sum reduction
+  - Scalar multiply-add
+
+### Optimized Kernels
+
+#### GEMM (General Matrix Multiplication)
+- Cache-blocked algorithm for L1 cache efficiency
+- Parallel execution for large matrices
+- SIMD-optimized inner loops
+- Transpose optimization for better memory access patterns
+- Expected speedup: 2-3x on AVX2, 2.5x on NEON
+
+#### Fused Attention Kernel
+- Scaled dot-product attention: `softmax(QK^T/sqrt(d_k))V`
+- Multi-head attention support
+- Memory-efficient implementation
+- Mask support for causal attention
+- Expected speedup: 2.5x
+
+#### Convolution Kernels
+- Standard 2D convolution
+- Depthwise separable convolution
+- Group convolution
+- Parallel batch processing
+- Expected speedup: 2-2.5x
+
+### CPU Optimizations
+
+#### Cache Optimizer
+- L1/L2/L3 cache-aware algorithms
+- Automatic tiling parameter computation
+- Prefetching for reduced latency
+- Cache-aware transpose
+- Z-order (Morton) indexing for 2D access patterns
+- Cache miss estimation
+
+#### Loop Optimizer
+- 2D and 3D loop tiling
+- Loop unrolling (4x, 8x)
+- Strip mining for cache utilization
+- Loop fusion
+- Loop interchange optimization
+- Parallel tiling with work stealing
+
+### Performance Profiling
+- Thread-safe operation tracking
+- Timing and memory usage statistics
+- Per-operation metrics (min/avg/max/total)
+- Performance report generation
+- Runtime enable/disable capability
+
+### GPU Optimization Infrastructure
+- Base classes for GPU kernel implementations
+- Memory management abstractions
+- CUDA kernel base (ready for ILGPU/ManagedCuda integration)
+- Device capability querying
+
+---
+
+## Part 2: Graph Optimizations
 
 ### Operator Fusion (Critical for Performance)
 
@@ -43,13 +133,44 @@ Reduces memory footprint during inference:
 Replaces expensive operations with cheaper equivalents:
 
 - **Algebraic Simplification**: x*1=x, x+0=x, x*0=0, etc.
-- **Strength Reduction**: x^2 → x*x, x/2 → x*0.5
+- **Strength Reduction**: x^2 -> x*x, x/2 -> x*0.5
 
 **Expected Speedup**: 1.1-1.3x additional speedup
 
-## Usage
+---
+
+## Quick Start
+
+### Kernel-Level Optimization
+
+```csharp
+using AiDotNet.InferenceOptimization;
+using AiDotNet.InferenceOptimization.Kernels;
+using AiDotNet.Tensors.Engines.Simd;  // SimdKernels location
+using AiDotNet.Tensors.LinearAlgebra;
+
+// Initialize the optimization system
+OptimizationInitializer.Initialize(enableProfiling: true);
+
+// Use optimized GEMM
+var gemmKernel = new GemmKernel();
+var a = new Tensor<float>(new[] { 1000, 500 });
+var b = new Tensor<float>(new[] { 500, 1000 });
+var result = gemmKernel.Execute(a, b);
+
+// Use fused attention
+var attentionKernel = new AttentionKernel();
+var q = new Tensor<float>(new[] { 1, 128, 64 }); // [batch, seq_len, d_k]
+var k = new Tensor<float>(new[] { 1, 128, 64 });
+var v = new Tensor<float>(new[] { 1, 128, 64 });
+var attended = attentionKernel.Execute(q, k, v);
+
+// Get performance report
+var report = OptimizationInitializer.GetPerformanceSummary();
+Console.WriteLine(report);
+```
 
-### Basic Example
+### Graph-Level Optimization
 
 ```csharp
 using AiDotNet.InferenceOptimization.Core;
@@ -64,12 +185,9 @@ var optimizer = new GraphOptimizer<double>();
 
 // Optimize the graph
 var optimizedGraph = optimizer.Optimize(graph);
-
-// Use optimized graph for inference
-// (Integration with NeuralNetworkBase is automatic)
 ```
 
-### Advanced Example with Custom Options
+### Advanced Configuration
 
 ```csharp
 // Configure optimization options
@@ -86,255 +204,119 @@ var options = new OptimizationOptions
 
 // Create optimizer with options
 var optimizer = new GraphOptimizer<double>(options);
-
-// Add custom optimization pass
-optimizer.AddPass(new MyCustomPass<double>());
-
-// Optimize
 var optimizedGraph = optimizer.Optimize(graph);
 ```
 
-### Optimization Levels
-
-#### None
-No optimizations applied. Use for debugging.
-
-#### Basic
-- Constant Folding
-- Dead Code Elimination
-
-**Use when**: Fast compilation is critical, minimal speedup needed
-
-#### Standard (Recommended)
-- All Basic optimizations
-- Operator Fusion
-- Algebraic Simplification
-
-**Use when**: Balanced performance and compilation time
-**Expected Speedup**: 2-3x
-
-#### Aggressive
-- All Standard optimizations
-- Common Subexpression Elimination
-- Strength Reduction
-- In-Place Optimization
-- Memory Reuse
-
-**Use when**: Production deployments, inference is performance-critical
-**Expected Speedup**: 3-4x
-
-#### Maximum
-- All optimizations enabled
-- Layout Optimization
-- Maximum fusion opportunities
-
-**Use when**: Critical inference paths, compilation time not important
-**Expected Speedup**: 4-5x
-
-## Optimization Passes
-
-### Operator Fusion Passes
-
-1. **ConvBatchNormReLUFusionPass**: Fuses Conv → BatchNorm → ReLU
-2. **ConvBatchNormFusionPass**: Fuses Conv → BatchNorm
-3. **MatMulBiasActivationFusionPass**: Fuses MatMul → Bias → Activation
-4. **MatMulBiasFusionPass**: Fuses MatMul → Bias into Gemm
-5. **ElementwiseFusionPass**: Fuses chains of elementwise operations
-6. **MultiHeadAttentionFusionPass**: Optimizes attention mechanisms
-
-### Graph Structure Passes
-
-1. **ConstantFoldingPass**: Evaluates constant expressions
-2. **DeadCodeEliminationPass**: Removes unreachable nodes
-3. **CommonSubexpressionEliminationPass**: Shares common computations
-4. **LayoutOptimizationPass**: Optimizes tensor layout
+## Platform Capabilities
 
-### Memory Passes
+Check what optimizations are available on your platform:
 
-1. **InPlaceOptimizationPass**: Enables in-place operations
-2. **MemoryReuseOptimizationPass**: Optimizes buffer allocation
+```csharp
+var caps = PlatformDetector.Capabilities;
+Console.WriteLine($"Best SIMD: {caps.GetBestSimdSet()}");
+Console.WriteLine($"Has AVX2: {caps.HasAVX2}");
+Console.WriteLine($"Has NEON: {caps.HasNeon}");
+Console.WriteLine($"Processor Count: {caps.ProcessorCount}");
+```
 
-### Computation Passes
+## Optimization Levels
 
-1. **AlgebraicSimplificationPass**: Applies algebraic identities
-2. **StrengthReductionPass**: Replaces expensive operations
+| Level | Optimizations | Use Case | Expected Speedup |
+|-------|---------------|----------|------------------|
+| None | None | Debugging | 1x |
+| Basic | Constant Folding, Dead Code Elimination | Fast compilation | 1.2x |
+| Standard | + Operator Fusion, Algebraic Simplification | Balanced | 2-3x |
+| Aggressive | + CSE, Strength Reduction, In-Place, Memory Reuse | Production | 3-4x |
+| Maximum | All optimizations + Layout Optimization | Critical paths | 4-5x |
 
 ## Performance Benchmarks
 
 ### CNN Models (ResNet-50)
 
-| Optimization Level | Inference Time | Memory Usage | Compile Time |
-|-------------------|----------------|--------------|--------------|
-| None              | 100 ms         | 1000 MB      | 0 ms         |
-| Basic             | 85 ms          | 950 MB       | 50 ms        |
-| Standard          | 40 ms          | 800 MB       | 150 ms       |
-| Aggressive        | 30 ms          | 600 MB       | 300 ms       |
-| Maximum           | 25 ms          | 550 MB       | 500 ms       |
-
-**Speedup**: 4x (None → Maximum)
+| Optimization Level | Inference Time | Memory Usage | Speedup |
+|-------------------|----------------|--------------|---------|
+| None | 100 ms | 1000 MB | 1x |
+| Standard | 40 ms | 800 MB | 2.5x |
+| Aggressive | 30 ms | 600 MB | 3.3x |
+| Maximum | 25 ms | 550 MB | 4x |
 
 ### Transformer Models (BERT-Base)
 
-| Optimization Level | Inference Time | Memory Usage | Compile Time |
-|-------------------|----------------|--------------|--------------|
-| None              | 200 ms         | 2000 MB      | 0 ms         |
-| Basic             | 180 ms         | 1900 MB      | 75 ms        |
-| Standard          | 120 ms         | 1600 MB      | 200 ms       |
-| Aggressive        | 90 ms          | 1300 MB      | 400 ms       |
-| Maximum           | 75 ms          | 1200 MB      | 600 ms       |
+| Optimization Level | Inference Time | Memory Usage | Speedup |
+|-------------------|----------------|--------------|---------|
+| None | 200 ms | 2000 MB | 1x |
+| Standard | 120 ms | 1600 MB | 1.7x |
+| Aggressive | 90 ms | 1300 MB | 2.2x |
+| Maximum | 75 ms | 1200 MB | 2.7x |
 
-**Speedup**: 2.7x (None → Maximum)
-
-## Architecture
-
-```
-InferenceOptimization/
-├── Core/
-│   ├── ComputationGraph.cs          # Graph data structure
-│   ├── ComputationNode.cs           # Graph node
-│   ├── GraphOptimizer.cs            # Main optimization engine
-│   ├── GraphBuilder.cs              # Build graphs from layers
-│   ├── OptimizationOptions.cs       # Configuration
-│   └── OptimizationLevel.cs         # Optimization levels
-│
-└── Passes/
-    ├── IOptimizationPass.cs         # Pass interface
-    ├── OptimizationPassBase.cs      # Pass base class
-    │
-    ├── Fusion/
-    │   ├── ConvBatchNormFusionPass.cs
-    │   ├── ConvBatchNormReLUFusionPass.cs
-    │   ├── MatMulBiasFusionPass.cs
-    │   ├── MatMulBiasActivationFusionPass.cs
-    │   ├── ElementwiseFusionPass.cs
-    │   └── MultiHeadAttentionFusionPass.cs
-    │
-    ├── Graph/
-    │   ├── ConstantFoldingPass.cs
-    │   ├── DeadCodeEliminationPass.cs
-    │   ├── CommonSubexpressionEliminationPass.cs
-    │   └── LayoutOptimizationPass.cs
-    │
-    ├── Memory/
-    │   ├── InPlaceOptimizationPass.cs
-    │   └── MemoryReuseOptimizationPass.cs
-    │
-    └── Computation/
-        ├── AlgebraicSimplificationPass.cs
-        └── StrengthReductionPass.cs
-```
+## Custom Operators
 
-## Creating Custom Optimization Passes
+Register your own optimized operators:
 
 ```csharp
-using AiDotNet.InferenceOptimization.Passes;
-using AiDotNet.InferenceOptimization.Core;
-using AiDotNet.Enums;
-
-public class MyCustomFusionPass<T> : OptimizationPassBase<T> where T : struct
+public class MyCustomKernel : ICustomOperator<float>
 {
-    public override OptimizationPassType PassType => OptimizationPassType.Custom;
-    public override string Name => "My Custom Fusion";
+    public string Name => "MyOperation";
+    public string Version => "1.0.0";
+    public int Priority => 100;
 
-    public override bool Apply(IComputationGraph<T> graph)
+    public bool IsSupported()
     {
-        bool modified = false;
-
-        // Find pattern: LayerNorm → Attention
-        var candidates = FindFusionCandidates(
-            graph,
-            OperationType.LayerNormalization,
-            OperationType.Attention
-        );
-
-        foreach (var sequence in candidates)
-        {
-            // Create fused node
-            var fusedNode = FuseNodes(
-                graph,
-                sequence,
-                OperationType.FusedLayerNormAttention
-            );
-
-            modified = true;
-        }
-
-        return modified;
+        return PlatformDetector.Capabilities.HasAVX2;
     }
 
-    public override bool CanApply(IComputationGraph<T> graph)
+    public double EstimatedSpeedup()
     {
-        return graph.Nodes.Any(n => n.OperationType == OperationType.LayerNormalization);
+        return 3.0; // Expected 3x speedup
+    }
+
+    public Tensor<float> Execute(params Tensor<float>[] inputs)
+    {
+        // Your optimized implementation
     }
 }
 
-// Use the custom pass
-var optimizer = new GraphOptimizer<double>();
-optimizer.AddPass(new MyCustomFusionPass<double>());
-var optimizedGraph = optimizer.Optimize(graph);
+// Register the operator
+CustomOperatorRegistry.Instance.Register(new MyCustomKernel());
 ```
 
-## Integration with Existing Models
-
-The optimizer automatically integrates with existing AiDotNet models:
+## CPU Optimization Utilities
 
 ```csharp
-// Your existing model
-var cnn = new ConvolutionalNeuralNetwork<double>();
-cnn.AddLayer(new ConvolutionalLayer<double>());
-cnn.AddLayer(new BatchNormalizationLayer<double>());
-cnn.AddLayer(new ReLUActivationLayer<double>());
-// ... more layers
-
-// Optimize for inference
-var optimizer = new GraphOptimizer<double>(
-    OptimizationOptions.FromLevel(OptimizationLevel.Aggressive)
-);
-
-// Build and optimize graph from the model
-var graphBuilder = new GraphBuilder<double>();
-var graph = graphBuilder.BuildFromLayers(cnn.Layers);
-var optimizedGraph = optimizer.Optimize(graph);
+using AiDotNet.InferenceOptimization.CpuOptimization;
 
-// The optimized graph can now be used for inference
-// (Future: ExecuteOptimized method on NeuralNetworkBase)
+// Determine optimal tile size
+int tileSize = LoopOptimizer.DetermineOptimalTileSize(matrixSize);
+
+// Use parallel tiling
+LoopOptimizer.ParallelTile2D(rows, cols, tileSize, (iStart, iEnd, jStart, jEnd) =>
+{
+    // Process tile in parallel
+});
 ```
 
-## Comparison with Other Frameworks
-
-| Feature                          | AiDotNet | TensorRT | ONNX Runtime | TorchScript |
-|----------------------------------|----------|----------|--------------|-------------|
-| Conv+BN+ReLU Fusion             | ✓        | ✓        | ✓            | ✓           |
-| MatMul+Bias+Activation Fusion   | ✓        | ✓        | ✓            | ✓           |
-| Constant Folding                 | ✓        | ✓        | ✓            | ✓           |
-| Dead Code Elimination            | ✓        | ✓        | ✓            | ✓           |
-| Common Subexpression Elimination | ✓        | ✓        | ✓            | ✓           |
-| Memory Reuse Optimization        | ✓        | ✓        | ✓            | ✓           |
-| In-Place Operations              | ✓        | ✓        | ✓            | ✓           |
-| Layout Optimization              | ✓        | ✓        | ✓            | ✓           |
-| Algebraic Simplification         | ✓        | ✓        | ✓            | ✓           |
-| Native .NET Integration          | ✓        | ✗        | Partial      | ✗           |
+## Requirements
 
-## Future Enhancements
+- .NET 8.0 or later (also supports .NET Framework 4.7.1+)
+- x86/x64 or ARM64 processor
+- For GPU support: CUDA-capable GPU (future implementation)
 
-- [ ] Quantization (Int8, Float16)
-- [ ] Kernel auto-tuning
-- [ ] Multi-GPU support
-- [ ] ONNX export with optimizations
-- [ ] TensorRT backend integration
-- [ ] Flash Attention implementation
-- [ ] Dynamic batching optimization
-- [ ] Automatic mixed precision
+## Performance Targets
 
-## Related Issues
+- 2-5x speedup on critical operations (achieved through SIMD and graph optimization)
+- Hardware-specific optimizations (AVX2, AVX-512, NEON)
+- Graceful fallback behavior (automatic platform detection)
+- 30-50% memory reduction through optimization
+
+## Future Enhancements
 
-- Issue #409: Graph Optimization and Operator Fusion (this implementation)
-- Issue #280: ONNX Export
-- Issue #277: Inference Optimizations
+- GPU kernel implementations using ILGPU or ManagedCuda
+- Quantization support (INT8, FP16)
+- Flash Attention implementation
+- TensorRT backend integration
+- ONNX export with optimizations
+- Automatic mixed precision
 
-## References
+## License
 
-- [TensorRT Optimization Guide](https://docs.nvidia.com/deeplearning/tensorrt/)
-- [ONNX Runtime Performance Tuning](https://onnxruntime.ai/docs/performance/)
-- [PyTorch JIT and TorchScript](https://pytorch.org/docs/stable/jit.html)
-- [TVM: End-to-End Deep Learning Compiler](https://tvm.apache.org/)
+Same as parent AiDotNet project.
diff --git a/src/LoRA/Adapters/MultiLoRAAdapter.cs b/src/LoRA/Adapters/MultiLoRAAdapter.cs
index ce1ac9093..a6a089785 100644
--- a/src/LoRA/Adapters/MultiLoRAAdapter.cs
+++ b/src/LoRA/Adapters/MultiLoRAAdapter.cs
@@ -1,4 +1,7 @@
 using AiDotNet.Interfaces;
+using AiDotNet.NeuralNetworks.Layers;
+using AiDotNet.Tensors.LinearAlgebra;
+using System.Globalization;
 
 namespace AiDotNet.LoRA.Adapters;
 
@@ -48,7 +51,7 @@ namespace AiDotNet.LoRA.Adapters;
 /// You can switch between tasks at runtime, and each task only trains its specific LoRA weights!
 /// </para>
 /// </remarks>
-public class MultiLoRAAdapter<T> : LoRAAdapterBase<T>
+public class MultiLoRAAdapter<T> : LoRAAdapterBase<T>, ILayerSerializationExtras<T>, AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata
 {
     /// <summary>
     /// Dictionary mapping task names to their specific LoRA layers.
@@ -443,12 +446,13 @@ public override Vector<T> GetParameters()
             }
         }
 
-        // All task adapters' parameters
+        // All task adapters' parameters (stable ordering for deterministic serialization)
         // Guard against null _taskAdapters during base constructor calls
         if (_taskAdapters != null)
         {
-            foreach (var adapter in _taskAdapters.Values)
+            foreach (var taskName in _taskAdapters.Keys.OrderBy(k => k, StringComparer.Ordinal))
             {
+                var adapter = _taskAdapters[taskName];
                 Vector<T> taskParams = adapter.GetParameters();
                 for (int i = 0; i < taskParams.Length; i++)
                 {
@@ -485,12 +489,13 @@ public override void SetParameters(Vector<T> parameters)
             _baseLayer.SetParameters(baseParams);
         }
 
-        // All task adapters' parameters
+        // All task adapters' parameters (stable ordering for deterministic serialization)
         // Guard against null _taskAdapters during construction or early calls
         if (_taskAdapters != null)
         {
-            foreach (var adapter in _taskAdapters.Values)
+            foreach (var taskName in _taskAdapters.Keys.OrderBy(k => k, StringComparer.Ordinal))
             {
+                var adapter = _taskAdapters[taskName];
                 int taskParamCount = adapter.ParameterCount;
                 Vector<T> taskParams = new Vector<T>(taskParamCount);
                 for (int i = 0; i < taskParamCount; i++)
@@ -630,8 +635,9 @@ private void UpdateParameterGradientsFromLayers()
             currentAdapter = _taskAdapters[_currentTask];
         }
 
-        foreach (var adapter in _taskAdapters.Values)
+        foreach (var taskName in _taskAdapters.Keys.OrderBy(k => k, StringComparer.Ordinal))
         {
+            var adapter = _taskAdapters[taskName];
             Vector<T>? grads = (adapter == currentAdapter && currentAdapter != null)
                 ? adapter.GetParameterGradients()
                 : null;
@@ -643,6 +649,97 @@ private void UpdateParameterGradientsFromLayers()
         }
     }
 
+    int ILayerSerializationExtras<T>.ExtraParameterCount => _freezeBaseLayer && _baseLayer != null ? _baseLayer.ParameterCount : 0;
+
+    Vector<T> ILayerSerializationExtras<T>.GetExtraParameters()
+    {
+        if (!_freezeBaseLayer || _baseLayer == null)
+        {
+            return new Vector<T>(0);
+        }
+
+        return _baseLayer.GetParameters();
+    }
+
+    void ILayerSerializationExtras<T>.SetExtraParameters(Vector<T> extraParameters)
+    {
+        if (!_freezeBaseLayer || _baseLayer == null)
+        {
+            return;
+        }
+
+        if (extraParameters.Length != _baseLayer.ParameterCount)
+        {
+            throw new ArgumentException(
+                $"Expected {_baseLayer.ParameterCount} extra parameters for frozen base layer, got {extraParameters.Length}",
+                nameof(extraParameters));
+        }
+
+        _baseLayer.SetParameters(extraParameters);
+    }
+
+    Dictionary<string, string> AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        var meta = new Dictionary<string, string>(StringComparer.Ordinal)
+        {
+            ["FreezeBaseLayer"] = _freezeBaseLayer.ToString(CultureInfo.InvariantCulture),
+            ["BaseLayerTypeId"] = Uri.EscapeDataString(BuildLayerTypeIdentifier(_baseLayer))
+        };
+
+        if (_taskAdapters != null)
+        {
+            var ordered = _taskAdapters.Keys.OrderBy(k => k, StringComparer.Ordinal).ToArray();
+            meta["Tasks"] = string.Join("|", ordered.Select(Uri.EscapeDataString));
+            meta["TaskRanks"] = string.Join("|", ordered.Select(t => _taskAdapters[t].Rank.ToString(CultureInfo.InvariantCulture)));
+            meta["TaskAlphas"] = string.Join("|", ordered.Select(t => Convert.ToDouble(_taskAdapters[t].Alpha).ToString(CultureInfo.InvariantCulture)));
+        }
+
+        if (!string.IsNullOrWhiteSpace(_currentTask))
+        {
+            meta["CurrentTask"] = Uri.EscapeDataString(_currentTask);
+        }
+
+        return meta;
+    }
+
+    private static string BuildLayerTypeIdentifier(ILayer<T> layer)
+    {
+        string typeName = layer.GetType().Name;
+        var metadata = new Dictionary<string, string>(StringComparer.Ordinal);
+
+        if (layer is AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata meta)
+        {
+            foreach (var kvp in meta.GetSerializationMetadata())
+            {
+                metadata[kvp.Key] = kvp.Value;
+            }
+        }
+
+        if (layer is LayerBase<T> layerBase)
+        {
+            if (layerBase.VectorActivation != null)
+            {
+                metadata["VectorActivationType"] = layerBase.VectorActivation.GetType().AssemblyQualifiedName ?? layerBase.VectorActivation.GetType().FullName ?? string.Empty;
+            }
+            else if (layerBase.ScalarActivation != null)
+            {
+                metadata["ScalarActivationType"] = layerBase.ScalarActivation.GetType().AssemblyQualifiedName ?? layerBase.ScalarActivation.GetType().FullName ?? string.Empty;
+            }
+        }
+
+        if (metadata.Count == 0)
+        {
+            return typeName;
+        }
+
+        foreach (var kvp in metadata.OrderBy(k => k.Key, StringComparer.Ordinal))
+        {
+            typeName += $";{kvp.Key}={kvp.Value}";
+        }
+
+        return typeName;
+    }
+
     /// <summary>
     /// Resets the internal state of all layers.
     /// </summary>
diff --git a/src/Models/Results/PredictionModelResult.cs b/src/Models/Results/PredictionModelResult.cs
index b5972e4a9..9f1d386b8 100644
--- a/src/Models/Results/PredictionModelResult.cs
+++ b/src/Models/Results/PredictionModelResult.cs
@@ -14,10 +14,12 @@
 using AiDotNet.Deployment.Mobile.CoreML;
 using AiDotNet.Deployment.Mobile.TensorFlowLite;
 using AiDotNet.Deployment.Runtime;
+using AiDotNet.Inference;
 using AiDotNet.Reasoning;
 using AiDotNet.Reasoning.Models;
 using AiDotNet.LanguageModels;
 using AiDotNet.Enums;
+using AiDotNet.NeuralNetworks;
 using AiDotNet.Tokenization.Interfaces;
 using AiDotNet.Tokenization.Configuration;
 using AiDotNet.Tokenization.Models;
@@ -455,8 +457,26 @@ public class PredictionModelResult<T, TInput, TOutput> : IFullModel<T, TInput, T
     /// </remarks>
     [JsonIgnore]  // Don't serialize - will need to be recompiled after deserialization
     private Func<Tensor<T>[], Tensor<T>[]>? JitCompiledFunction { get; set; }
+
+    [JsonProperty]
     private AiDotNet.Configuration.InferenceOptimizationConfig? InferenceOptimizationConfig { get; set; }
 
+    [JsonIgnore]
+    private readonly object _inferenceOptimizationLock = new();
+
+    [JsonIgnore]
+    private InferenceOptimizer<T>? _inferenceOptimizer;
+
+    [JsonIgnore]
+    private NeuralNetworkBase<T>? _inferenceOptimizedNeuralModel;
+
+    [JsonIgnore]
+    private bool _inferenceOptimizationsInitialized;
+
+    // Serving assembly uses InternalsVisibleTo; keep this internal to avoid expanding user-facing API surface.
+    internal AiDotNet.Configuration.InferenceOptimizationConfig? GetInferenceOptimizationConfigForServing()
+        => InferenceOptimizationConfig;
+
     /// <summary>
     /// Gets the reasoning configuration for advanced Chain-of-Thought, Tree-of-Thoughts, and Self-Consistency reasoning.
     /// </summary>
@@ -939,10 +959,34 @@ public TOutput Predict(TInput newData)
 
         // Use JIT-compiled function if available for 5-10x faster predictions
         TOutput normalizedPredictions;
-        if (JitCompiledFunction != null && normalizedNewData is Tensor<T> inputTensor)
+
+        // INFERENCE OPTIMIZATION PATH: apply configured inference optimizations for neural network models
+        if (InferenceOptimizationConfig != null &&
+            Model is NeuralNetworkBase<T> neuralModel &&
+            normalizedNewData is Tensor<T> inputTensor)
+        {
+            var optimizedNeuralModel = EnsureStatelessInferenceOptimizationsInitialized(neuralModel);
+            if (optimizedNeuralModel != null)
+            {
+                var optimizedOutput = optimizedNeuralModel.Predict(inputTensor);
+                if ((object)optimizedOutput is TOutput output)
+                {
+                    normalizedPredictions = output;
+                }
+                else
+                {
+                    // Fallback to the wrapped model if type mismatch occurs
+                    normalizedPredictions = Model.Predict(normalizedNewData);
+                }
+
+                return NormalizationInfo.Normalizer.Denormalize(normalizedPredictions, NormalizationInfo.YParams);
+            }
+        }
+
+        if (JitCompiledFunction != null && normalizedNewData is Tensor<T> inputTensor2)
         {
             // JIT PATH: Use compiled function for accelerated inference
-            var jitResult = JitCompiledFunction(new[] { inputTensor });
+            var jitResult = JitCompiledFunction(new[] { inputTensor2 });
             if (jitResult != null && jitResult.Length > 0 && jitResult[0] is TOutput output)
             {
                 normalizedPredictions = output;
@@ -962,10 +1006,390 @@ public TOutput Predict(TInput newData)
         return NormalizationInfo.Normalizer.Denormalize(normalizedPredictions, NormalizationInfo.YParams);
     }
 
+    /// <summary>
+    /// Begins an inference session for stateful inference features (e.g., KV-cache).
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// Sessions are intended for serving-style workloads where you run many sequential inference steps.
+    /// A session can create multiple independent sequences, each maintaining its own state (like KV-cache).
+    /// </para>
+    /// <para>
+    /// <b>For Beginners:</b> Use a session when you are doing "token-by-token" inference.
+    ///
+    /// - Use <see cref="Predict(TInput)"/> for one-off, stateless predictions.
+    /// - Use <see cref="BeginInferenceSession"/> when you need the model to remember prior calls in the same sequence.
+    /// </para>
+    /// </remarks>
+    public InferenceSession BeginInferenceSession()
+    {
+        return new InferenceSession(this, InferenceOptimizationConfig);
+    }
+
+    private NeuralNetworkBase<T>? EnsureStatelessInferenceOptimizationsInitialized(NeuralNetworkBase<T> model)
+    {
+        if (_inferenceOptimizationsInitialized)
+        {
+            return _inferenceOptimizedNeuralModel;
+        }
+
+        lock (_inferenceOptimizationLock)
+        {
+            if (_inferenceOptimizationsInitialized)
+            {
+                return _inferenceOptimizedNeuralModel;
+            }
+
+            try
+            {
+                if (InferenceOptimizationConfig != null)
+                {
+                    // Stateless-only optimizations for plain Predict(): avoid stateful features that can leak across calls.
+                    var statelessConfig = CreateStatelessInferenceConfig(InferenceOptimizationConfig);
+                    var optimizer = new InferenceOptimizer<T>(statelessConfig);
+                    var (optimizedModel, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: true);
+
+                    _inferenceOptimizer = optimizer;
+                    _inferenceOptimizedNeuralModel = anyApplied ? optimizedModel : null;
+                }
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine($"Warning: inference optimizations failed: {ex.Message}");
+                _inferenceOptimizer = null;
+                _inferenceOptimizedNeuralModel = null;
+            }
+            finally
+            {
+                _inferenceOptimizationsInitialized = true;
+            }
+
+            return _inferenceOptimizedNeuralModel;
+        }
+    }
+
+    private static AiDotNet.Configuration.InferenceOptimizationConfig CreateStatelessInferenceConfig(
+        AiDotNet.Configuration.InferenceOptimizationConfig config)
+    {
+        return new AiDotNet.Configuration.InferenceOptimizationConfig
+        {
+            EnableFlashAttention = config.EnableFlashAttention,
+            AttentionMasking = config.AttentionMasking,
+
+            // Disable stateful/session-centric features for plain Predict().
+            EnableKVCache = false,
+            EnablePagedKVCache = false,
+            EnableBatching = false,
+            EnableSpeculativeDecoding = false
+        };
+    }
+
+    /// <summary>
+    /// Facade-friendly inference session that owns stateful inference internals.
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// This type intentionally keeps inference internals behind the facade. Users create sequences via
+    /// <see cref="CreateSequence"/> and run inference via <see cref="InferenceSequence.Predict(TInput)"/>.
+    /// </para>
+    /// </remarks>
+    public sealed class InferenceSession : IDisposable
+    {
+        private readonly PredictionModelResult<T, TInput, TOutput> _result;
+        private readonly AiDotNet.Configuration.InferenceOptimizationConfig? _config;
+        private bool _disposed;
+
+        internal InferenceSession(
+            PredictionModelResult<T, TInput, TOutput> result,
+            AiDotNet.Configuration.InferenceOptimizationConfig? config)
+        {
+            _result = result ?? throw new ArgumentNullException(nameof(result));
+            _config = config;
+        }
+
+        /// <summary>
+        /// Creates an independent sequence within this session.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// Each sequence represents an independent stream (e.g., one chat) and owns its own state.
+        /// </para>
+        /// </remarks>
+        public InferenceSequence CreateSequence()
+        {
+            ThrowIfDisposed();
+            return new InferenceSequence(_result, _config, multiLoRATask: null);
+        }
+
+        // Internal (serving/tests): allow selecting a Multi-LoRA task per sequence without expanding public API surface.
+        internal InferenceSequence CreateSequence(string? multiLoRATask)
+        {
+            ThrowIfDisposed();
+            return new InferenceSequence(_result, _config, multiLoRATask);
+        }
+
+        public void Dispose()
+        {
+            _disposed = true;
+        }
+
+        private void ThrowIfDisposed()
+        {
+            if (_disposed)
+            {
+                throw new ObjectDisposedException(nameof(InferenceSession));
+            }
+        }
+    }
+
+    /// <summary>
+    /// Represents one independent, stateful inference sequence (e.g., one chat/generation stream).
+    /// </summary>
+    /// <remarks>
+    /// <para>
+    /// A sequence may keep internal state across calls when inference optimizations are enabled (e.g., KV-cache).
+    /// Call <see cref="Reset"/> to start a new logical sequence on the same object.
+    /// </para>
+    /// </remarks>
+    public sealed class InferenceSequence : IDisposable
+    {
+        private readonly PredictionModelResult<T, TInput, TOutput> _result;
+        private readonly AiDotNet.Configuration.InferenceOptimizationConfig? _config;
+        private bool _disposed;
+
+        // Session-local inference state (populated lazily when used).
+        private InferenceOptimizer<T>? _sequenceOptimizer;
+        private NeuralNetworkBase<T>? _sequenceOptimizedNeuralModel;
+        private bool _sequenceInitialized;
+        private readonly object _sequenceLock = new();
+
+        internal InferenceSequence(
+            PredictionModelResult<T, TInput, TOutput> result,
+            AiDotNet.Configuration.InferenceOptimizationConfig? config,
+            string? multiLoRATask)
+        {
+            _result = result ?? throw new ArgumentNullException(nameof(result));
+            _config = config;
+            _multiLoRATask = multiLoRATask;
+        }
+
+        private string? _multiLoRATask;
+
+        public TOutput Predict(TInput newData)
+        {
+            ThrowIfDisposed();
+
+            if (_result.Model == null)
+            {
+                throw new InvalidOperationException("Model is not initialized.");
+            }
+
+            if (_result.NormalizationInfo.Normalizer == null)
+            {
+                throw new InvalidOperationException("Normalizer is not initialized.");
+            }
+
+            var (normalizedNewData, _) = _result.NormalizationInfo.Normalizer.NormalizeInput(newData);
+
+            // Session inference: use configured inference optimizations, including stateful ones, if applicable.
+            if (_config != null &&
+                _result.Model is NeuralNetworkBase<T> neuralModel &&
+                normalizedNewData is Tensor<T> inputTensor)
+            {
+                var optimized = EnsureSequenceOptimizationsInitialized(neuralModel);
+                if (optimized != null)
+                {
+                    var optimizedOutput = optimized.Predict(inputTensor);
+                    if ((object)optimizedOutput is TOutput output)
+                    {
+                        return _result.NormalizationInfo.Normalizer.Denormalize(output, _result.NormalizationInfo.YParams);
+                    }
+                }
+            }
+
+            // Fallback: normal predict path (no JIT inside a session to keep behavior consistent).
+            var normalizedPredictions = _result.Model.Predict(normalizedNewData);
+            return _result.NormalizationInfo.Normalizer.Denormalize(normalizedPredictions, _result.NormalizationInfo.YParams);
+        }
+
+        public void Reset()
+        {
+            ThrowIfDisposed();
+            lock (_sequenceLock)
+            {
+                _sequenceOptimizer?.ClearCache();
+            }
+        }
+
+        // Internal: switch Multi-LoRA task for this sequence, resetting state to avoid cache leakage.
+        internal void SetMultiLoRATask(string? taskName)
+        {
+            ThrowIfDisposed();
+            lock (_sequenceLock)
+            {
+                if (string.Equals(_multiLoRATask, taskName, StringComparison.Ordinal))
+                    return;
+
+                _multiLoRATask = taskName;
+
+                try
+                {
+                    _sequenceOptimizer?.ClearCache();
+                }
+                catch
+                {
+                    // Best-effort.
+                }
+
+                _sequenceOptimizer = null;
+                _sequenceOptimizedNeuralModel = null;
+                _sequenceInitialized = false;
+            }
+        }
+
+        public void Dispose()
+        {
+            if (_disposed)
+            {
+                return;
+            }
+
+            try
+            {
+                _sequenceOptimizer?.ClearCache();
+            }
+            catch
+            {
+                // Best-effort cleanup; disposal must not throw.
+            }
+
+            _disposed = true;
+        }
+
+        // Exposed to AiDotNetTests via InternalsVisibleTo for integration verification without expanding the public API surface.
+        internal Dictionary<string, object> GetInferenceStatistics()
+        {
+            ThrowIfDisposed();
+            lock (_sequenceLock)
+            {
+                return _sequenceOptimizer?.GetStatistics() ?? new Dictionary<string, object>();
+            }
+        }
+
+        private NeuralNetworkBase<T>? EnsureSequenceOptimizationsInitialized(NeuralNetworkBase<T> model)
+        {
+            if (_sequenceInitialized)
+            {
+                return _sequenceOptimizedNeuralModel;
+            }
+
+            lock (_sequenceLock)
+            {
+                if (_sequenceInitialized)
+                {
+                    return _sequenceOptimizedNeuralModel;
+                }
+
+                try
+                {
+                    if (_config != null)
+                    {
+                        // If Multi-LoRA is in use, isolate per-sequence task selection by cloning and selecting task
+                        // before applying any further inference optimizations.
+                        NeuralNetworkBase<T> modelForSequence = model;
+                        bool hasMultiLoRATask = !string.IsNullOrWhiteSpace(_multiLoRATask);
+                        if (hasMultiLoRATask)
+                        {
+                            try
+                            {
+                                modelForSequence = (NeuralNetworkBase<T>)model.Clone();
+
+                                int appliedCount = 0;
+                                foreach (var layer in modelForSequence.Layers)
+                                {
+                                    if (layer is AiDotNet.LoRA.Adapters.MultiLoRAAdapter<T> multi)
+                                    {
+                                        multi.SetCurrentTask(_multiLoRATask!);
+                                        appliedCount++;
+                                    }
+                                }
+
+                                InferenceDiagnostics.RecordDecision(
+                                    area: "InferenceSession",
+                                    feature: "MultiLoRA",
+                                    enabled: appliedCount > 0,
+                                    reason: appliedCount > 0 ? $"Task={_multiLoRATask}" : $"NoMultiLoRAAdapters(Task={_multiLoRATask})");
+                            }
+                            catch (Exception ex)
+                            {
+                                InferenceDiagnostics.RecordException("InferenceSession", "MultiLoRA", ex, $"Task={_multiLoRATask};FallbackToBaseModel");
+                                modelForSequence = model;
+                            }
+                        }
+
+                        // In a session, prefer causal masking defaults when user left it as Auto.
+                        var sessionConfig = _config.AttentionMasking == AiDotNet.Configuration.AttentionMaskingMode.Auto
+                            ? new AiDotNet.Configuration.InferenceOptimizationConfig
+                            {
+                                EnableFlashAttention = _config.EnableFlashAttention,
+                                EnableKVCache = _config.EnableKVCache,
+                                EnablePagedKVCache = _config.EnablePagedKVCache,
+                                PagedKVCacheBlockSize = _config.PagedKVCacheBlockSize,
+                                MaxBatchSize = _config.MaxBatchSize,
+                                KVCacheMaxSizeMB = _config.KVCacheMaxSizeMB,
+                                KVCachePrecision = _config.KVCachePrecision,
+                                KVCacheQuantization = _config.KVCacheQuantization,
+                                UseSlidingWindowKVCache = _config.UseSlidingWindowKVCache,
+                                KVCacheWindowSize = _config.KVCacheWindowSize,
+                                EnableBatching = _config.EnableBatching,
+                                EnableSpeculativeDecoding = _config.EnableSpeculativeDecoding,
+                                SpeculationPolicy = _config.SpeculationPolicy,
+                                SpeculativeMethod = _config.SpeculativeMethod,
+                                DraftModelType = _config.DraftModelType,
+                                SpeculationDepth = _config.SpeculationDepth,
+                                UseTreeSpeculation = _config.UseTreeSpeculation,
+                                EnableWeightOnlyQuantization = _config.EnableWeightOnlyQuantization,
+                                AttentionMasking = AiDotNet.Configuration.AttentionMaskingMode.Causal
+                            }
+                            : _config;
+
+                        var optimizer = new InferenceOptimizer<T>(sessionConfig);
+                        var (optimizedModel, anyApplied) = optimizer.OptimizeForInference(modelForSequence, cloneModel: ReferenceEquals(modelForSequence, model));
+
+                        _sequenceOptimizer = optimizer;
+                        // If Multi-LoRA was requested, keep the per-sequence model even when no other optimizations apply.
+                        _sequenceOptimizedNeuralModel = anyApplied || !ReferenceEquals(modelForSequence, model) ? optimizedModel : null;
+                    }
+                }
+                catch (Exception ex)
+                {
+                    Console.WriteLine($"Warning: inference session optimizations failed: {ex.Message}");
+                    _sequenceOptimizer = null;
+                    _sequenceOptimizedNeuralModel = null;
+                }
+                finally
+                {
+                    _sequenceInitialized = true;
+                }
+
+                return _sequenceOptimizedNeuralModel;
+            }
+        }
+
+        private void ThrowIfDisposed()
+        {
+            if (_disposed)
+            {
+                throw new ObjectDisposedException(nameof(InferenceSequence));
+            }
+        }
+    }
+
     /// <summary>
     /// Gets the default loss function used by this model for gradient computation.
     /// </summary>
     /// <exception cref="InvalidOperationException">If Model is not initialized.</exception>
+    [JsonIgnore]
     public ILossFunction<T> DefaultLossFunction
     {
         get
@@ -1680,6 +2104,7 @@ public void Deserialize(byte[] data)
                 ModelMetaData = deserializedObject.ModelMetaData;
                 BiasDetector = deserializedObject.BiasDetector;
                 FairnessEvaluator = deserializedObject.FairnessEvaluator;
+                InferenceOptimizationConfig = deserializedObject.InferenceOptimizationConfig;
 
                 // Preserve RAG components and all configuration properties
                 RagRetriever = deserializedObject.RagRetriever;
@@ -1691,6 +2116,12 @@ public void Deserialize(byte[] data)
                 AgentConfig = deserializedObject.AgentConfig;
                 AgentRecommendation = deserializedObject.AgentRecommendation;
                 DeploymentConfiguration = deserializedObject.DeploymentConfiguration;
+
+                // Reset transient runtime state (will be reinitialized lazily)
+                JitCompiledFunction = null;
+                _inferenceOptimizer = null;
+                _inferenceOptimizedNeuralModel = null;
+                _inferenceOptimizationsInitialized = false;
             }
             else
             {
diff --git a/src/NeuralNetworks/Attention/FlashAttention.cs b/src/NeuralNetworks/Attention/FlashAttention.cs
index 9974e1a31..dbfe87963 100644
--- a/src/NeuralNetworks/Attention/FlashAttention.cs
+++ b/src/NeuralNetworks/Attention/FlashAttention.cs
@@ -32,7 +32,7 @@ namespace AiDotNet.NeuralNetworks.Attention;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for computations (typically float or double).</typeparam>
-public static class FlashAttention<T>
+internal static class FlashAttention<T>
 {
     private static readonly INumericOperations<T> NumOps = MathHelper.GetNumericOperations<T>();
 
@@ -43,12 +43,17 @@ public static class FlashAttention<T>
     /// <param name="key">Key tensor of shape [batch, seqLen, headDim] or [batch, heads, seqLen, headDim].</param>
     /// <param name="value">Value tensor of shape [batch, seqLen, headDim] or [batch, heads, seqLen, headDim].</param>
     /// <param name="config">Flash Attention configuration.</param>
+    /// <param name="queryOffset">
+    /// Optional offset for causal masking when <paramref name="query"/> represents a window into a longer KV sequence.
+    /// Use this for KV-cached decoding where Q is the newly appended tokens and K/V contain the full cached sequence.
+    /// </param>
     /// <returns>Output tensor of same shape as query, and optionally attention weights if configured.</returns>
     public static (Tensor<T> Output, Tensor<T>? AttentionWeights) Forward(
         Tensor<T> query,
         Tensor<T> key,
         Tensor<T> value,
-        FlashAttentionConfig? config = null)
+        FlashAttentionConfig? config = null,
+        int queryOffset = 0)
     {
         config ??= FlashAttentionConfig.Default;
 
@@ -58,14 +63,18 @@ public static (Tensor<T> Output, Tensor<T>? AttentionWeights) Forward(
         // Determine if inputs are 3D [batch, seq, dim] or 4D [batch, heads, seq, dim]
         bool is4D = query.Shape.Length == 4;
 
-        if (is4D)
-        {
-            return Forward4D(query, key, value, config);
-        }
-        else
+        int seqLenQ = is4D ? query.Shape[2] : query.Shape[1];
+        int seqLenKV = is4D ? key.Shape[2] : key.Shape[1];
+        if (queryOffset < 0 || queryOffset + seqLenQ > seqLenKV)
         {
-            return Forward3D(query, key, value, config);
+            throw new ArgumentOutOfRangeException(
+                nameof(queryOffset),
+                $"queryOffset ({queryOffset}) must satisfy 0 <= queryOffset and queryOffset + seqLenQ ({seqLenQ}) <= seqLenKV ({seqLenKV}).");
         }
+
+        return is4D
+            ? Forward4D(query, key, value, config, queryOffset)
+            : Forward3D(query, key, value, config, queryOffset);
     }
 
     /// <summary>
@@ -75,7 +84,8 @@ private static (Tensor<T> Output, Tensor<T>? AttentionWeights) Forward3D(
         Tensor<T> query,
         Tensor<T> key,
         Tensor<T> value,
-        FlashAttentionConfig config)
+        FlashAttentionConfig config,
+        int queryOffset)
     {
         int batchSize = query.Shape[0];
         int seqLenQ = query.Shape[1];
@@ -100,7 +110,7 @@ private static (Tensor<T> Output, Tensor<T>? AttentionWeights) Forward3D(
         {
             FlashAttentionCore(
                 query, key, value, output, attentionWeights,
-                b, 0, seqLenQ, seqLenKV, headDim, scale, config);
+                b, 0, seqLenQ, seqLenKV, headDim, scale, config, queryOffset);
         }
 
         return (output, attentionWeights);
@@ -113,7 +123,8 @@ private static (Tensor<T> Output, Tensor<T>? AttentionWeights) Forward4D(
         Tensor<T> query,
         Tensor<T> key,
         Tensor<T> value,
-        FlashAttentionConfig config)
+        FlashAttentionConfig config,
+        int queryOffset)
     {
         int batchSize = query.Shape[0];
         int numHeads = query.Shape[1];
@@ -141,7 +152,7 @@ private static (Tensor<T> Output, Tensor<T>? AttentionWeights) Forward4D(
             {
                 FlashAttentionCore4D(
                     query, key, value, output, attentionWeights,
-                    b, h, seqLenQ, seqLenKV, headDim, scale, config);
+                    b, h, seqLenQ, seqLenKV, headDim, scale, config, queryOffset);
             }
         }
 
@@ -173,7 +184,8 @@ private static void FlashAttentionCore(
         int seqLenKV,
         int headDim,
         T scale,
-        FlashAttentionConfig config)
+        FlashAttentionConfig config,
+        int queryOffset)
     {
         int blockSizeQ = Math.Min(config.BlockSizeQ, seqLenQ);
         int blockSizeKV = Math.Min(config.BlockSizeKV, seqLenKV);
@@ -211,7 +223,7 @@ private static void FlashAttentionCore(
                 int kvBlockSize = kvEnd - kvStart;
 
                 // Apply causal mask: skip blocks that are entirely masked
-                if (config.UseCausalMask && kvStart > qEnd - 1)
+                if (config.UseCausalMask && kvStart > queryOffset + qEnd - 1)
                 {
                     continue;
                 }
@@ -228,7 +240,7 @@ private static void FlashAttentionCore(
                         int kIdx = kvStart + kj;
 
                         // Apply causal mask
-                        if (config.UseCausalMask && kIdx > qIdx)
+                        if (config.UseCausalMask && kIdx > queryOffset + qIdx)
                         {
                             scores[qi, kj] = negInf;
                             continue;
@@ -349,7 +361,8 @@ private static void FlashAttentionCore4D(
         int seqLenKV,
         int headDim,
         T scale,
-        FlashAttentionConfig config)
+        FlashAttentionConfig config,
+        int queryOffset)
     {
         int blockSizeQ = Math.Min(config.BlockSizeQ, seqLenQ);
         int blockSizeKV = Math.Min(config.BlockSizeKV, seqLenKV);
@@ -381,7 +394,7 @@ private static void FlashAttentionCore4D(
                 int kvEnd = Math.Min(kvStart + blockSizeKV, seqLenKV);
                 int kvBlockSize = kvEnd - kvStart;
 
-                if (config.UseCausalMask && kvStart > qEnd - 1)
+                if (config.UseCausalMask && kvStart > queryOffset + qEnd - 1)
                 {
                     continue;
                 }
@@ -397,7 +410,7 @@ private static void FlashAttentionCore4D(
                     {
                         int kIdx = kvStart + kj;
 
-                        if (config.UseCausalMask && kIdx > qIdx)
+                        if (config.UseCausalMask && kIdx > queryOffset + qIdx)
                         {
                             scores[qi, kj] = negInf;
                             continue;
diff --git a/src/NeuralNetworks/Attention/FlashAttentionLayer.cs b/src/NeuralNetworks/Attention/FlashAttentionLayer.cs
index 4a3c86924..eede4c4e2 100644
--- a/src/NeuralNetworks/Attention/FlashAttentionLayer.cs
+++ b/src/NeuralNetworks/Attention/FlashAttentionLayer.cs
@@ -27,7 +27,7 @@ namespace AiDotNet.NeuralNetworks.Attention;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for computations (typically float or double).</typeparam>
-public class FlashAttentionLayer<T> : LayerBase<T>
+internal class FlashAttentionLayer<T> : LayerBase<T>, AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata
 {
     private readonly int _headCount;
     private readonly int _headDimension;
@@ -519,4 +519,13 @@ public override Dictionary<string, string> GetDiagnostics()
     /// Gets the output projection weights.
     /// </summary>
     public Matrix<T> GetOutputWeights() => _outputWeights;
+
+    Dictionary<string, string> AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["HeadCount"] = _headCount.ToString(),
+            ["UseCausalMask"] = _config.UseCausalMask.ToString()
+        };
+    }
 }
diff --git a/src/NeuralNetworks/Layers/DropoutLayer.cs b/src/NeuralNetworks/Layers/DropoutLayer.cs
index 96837c3b1..a8e51a5f0 100644
--- a/src/NeuralNetworks/Layers/DropoutLayer.cs
+++ b/src/NeuralNetworks/Layers/DropoutLayer.cs
@@ -29,7 +29,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type used for computations (e.g., float, double).</typeparam>
-public class DropoutLayer<T> : LayerBase<T>
+public class DropoutLayer<T> : LayerBase<T>, ILayerSerializationMetadata
 {
     /// <summary>
     /// The probability of dropping out (deactivating) a neuron during training.
@@ -554,4 +554,12 @@ public override ComputationNode<T> ExportComputationGraph(List<ComputationNode<T
     /// </para>
     /// </remarks>
     public override bool SupportsJitCompilation => true;
+
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["DropoutRate"] = Convert.ToDouble(_dropoutRate).ToString(System.Globalization.CultureInfo.InvariantCulture)
+        };
+    }
 }
diff --git a/src/NeuralNetworks/Layers/EmbeddingLayer.cs b/src/NeuralNetworks/Layers/EmbeddingLayer.cs
index c6ad2df2e..ec10a0ccd 100644
--- a/src/NeuralNetworks/Layers/EmbeddingLayer.cs
+++ b/src/NeuralNetworks/Layers/EmbeddingLayer.cs
@@ -34,7 +34,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type used for calculations, typically float or double.</typeparam>
-public class EmbeddingLayer<T> : LayerBase<T>, IAuxiliaryLossLayer<T>
+public class EmbeddingLayer<T> : LayerBase<T>, IAuxiliaryLossLayer<T>, ILayerSerializationMetadata
 {
     /// <summary>
     /// The embedding tensor that stores vector representations for each token in the vocabulary.
@@ -755,4 +755,13 @@ public override Autodiff.ComputationNode<T> ExportComputationGraph(List<Autodiff
         // Use EmbeddingLookup operation which supports gradients
         return Autodiff.TensorOperations<T>.EmbeddingLookup(embeddingNode, inputNode);
     }
+
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["VocabularySize"] = _embeddingTensor.Shape[0].ToString(System.Globalization.CultureInfo.InvariantCulture),
+            ["EmbeddingDimension"] = _embeddingTensor.Shape[1].ToString(System.Globalization.CultureInfo.InvariantCulture)
+        };
+    }
 }
diff --git a/src/NeuralNetworks/Layers/GraphAttentionLayer.cs b/src/NeuralNetworks/Layers/GraphAttentionLayer.cs
index 626c5c4fd..5c696af2e 100644
--- a/src/NeuralNetworks/Layers/GraphAttentionLayer.cs
+++ b/src/NeuralNetworks/Layers/GraphAttentionLayer.cs
@@ -30,7 +30,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type used for calculations, typically float or double.</typeparam>
-public class GraphAttentionLayer<T> : LayerBase<T>, IGraphConvolutionLayer<T>
+public class GraphAttentionLayer<T> : LayerBase<T>, IGraphConvolutionLayer<T>, ILayerSerializationMetadata
 {
     private readonly int _inputFeatures;
     private readonly int _outputFeatures;
@@ -1207,4 +1207,14 @@ public override ComputationNode<T> ExportComputationGraph(List<ComputationNode<T
 
         return output;
     }
+
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["NumHeads"] = _numHeads.ToString(System.Globalization.CultureInfo.InvariantCulture),
+            ["Alpha"] = Convert.ToDouble(_alpha).ToString(System.Globalization.CultureInfo.InvariantCulture),
+            ["DropoutRate"] = _dropoutRate.ToString(System.Globalization.CultureInfo.InvariantCulture)
+        };
+    }
 }
diff --git a/src/NeuralNetworks/Layers/ILayerSerializationExtras.cs b/src/NeuralNetworks/Layers/ILayerSerializationExtras.cs
new file mode 100644
index 000000000..9b494c382
--- /dev/null
+++ b/src/NeuralNetworks/Layers/ILayerSerializationExtras.cs
@@ -0,0 +1,21 @@
+using AiDotNet.Tensors.LinearAlgebra;
+
+namespace AiDotNet.NeuralNetworks.Layers;
+
+/// <summary>
+/// Provides additional, optional parameter blocks for serialization that are not part of <see cref="ILayer{T}.ParameterCount"/>.
+/// </summary>
+/// <typeparam name="T">Numeric type for the layer.</typeparam>
+/// <remarks>
+/// This exists to support layers where <see cref="ILayer{T}.ParameterCount"/> intentionally reflects trainable parameters
+/// (e.g., frozen base weights in LoRA adapters) but full model serialization/cloning must still preserve non-trainable state.
+/// </remarks>
+internal interface ILayerSerializationExtras<T>
+{
+    int ExtraParameterCount { get; }
+
+    Vector<T> GetExtraParameters();
+
+    void SetExtraParameters(Vector<T> extraParameters);
+}
+
diff --git a/src/NeuralNetworks/Layers/ILayerSerializationMetadata.cs b/src/NeuralNetworks/Layers/ILayerSerializationMetadata.cs
new file mode 100644
index 000000000..319a6481f
--- /dev/null
+++ b/src/NeuralNetworks/Layers/ILayerSerializationMetadata.cs
@@ -0,0 +1,14 @@
+namespace AiDotNet.NeuralNetworks.Layers;
+
+/// <summary>
+/// Internal hook for providing constructor metadata needed to reliably round-trip layers through
+/// NeuralNetworkBase serialization (used by Clone/DeepCopy).
+/// </summary>
+/// <remarks>
+/// This is intentionally internal to avoid expanding the user-facing surface area.
+/// </remarks>
+internal interface ILayerSerializationMetadata
+{
+    Dictionary<string, string> GetSerializationMetadata();
+}
+
diff --git a/src/NeuralNetworks/Layers/LayerBase.cs b/src/NeuralNetworks/Layers/LayerBase.cs
index 02c2f557b..ddb93683a 100644
--- a/src/NeuralNetworks/Layers/LayerBase.cs
+++ b/src/NeuralNetworks/Layers/LayerBase.cs
@@ -1381,7 +1381,9 @@ public virtual void UpdateParameters(Vector<T> parameters)
             throw new ArgumentException($"Expected {ParameterCount} parameters, but got {parameters.Length}");
         }
 
-        Parameters = parameters;
+        // Delegate to SetParameters so derived layers that manage structured weights/biases
+        // can correctly materialize the provided flat parameter vector.
+        SetParameters(parameters);
     }
 
     /// <summary>
@@ -1737,4 +1739,4 @@ protected bool CanActivationBeJitted()
         // No activation (identity) always supports JIT
         return true;
     }
-}
\ No newline at end of file
+}
diff --git a/src/NeuralNetworks/Layers/LayerNormalizationLayer.cs b/src/NeuralNetworks/Layers/LayerNormalizationLayer.cs
index fe6a763f4..4bf202b30 100644
--- a/src/NeuralNetworks/Layers/LayerNormalizationLayer.cs
+++ b/src/NeuralNetworks/Layers/LayerNormalizationLayer.cs
@@ -32,7 +32,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type used for calculations, typically float or double.</typeparam>
-public class LayerNormalizationLayer<T> : LayerBase<T>
+public class LayerNormalizationLayer<T> : LayerBase<T>, ILayerSerializationMetadata
 {
     /// <summary>
     /// A small value added to the variance for numerical stability.
@@ -594,4 +594,12 @@ public override bool SupportsJitCompilation
             return _gamma != null && _beta != null;
         }
     }
-}
\ No newline at end of file
+
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["Epsilon"] = Convert.ToDouble(_epsilon).ToString(System.Globalization.CultureInfo.InvariantCulture)
+        };
+    }
+}
diff --git a/src/NeuralNetworks/Layers/MultiHeadAttentionLayer.cs b/src/NeuralNetworks/Layers/MultiHeadAttentionLayer.cs
index b8c7a4ca3..130b8bdcb 100644
--- a/src/NeuralNetworks/Layers/MultiHeadAttentionLayer.cs
+++ b/src/NeuralNetworks/Layers/MultiHeadAttentionLayer.cs
@@ -14,7 +14,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// several friends for advice on a decision - each person might notice different important factors.
 /// </para>
 /// </remarks>
-public class MultiHeadAttentionLayer<T> : LayerBase<T>, IAuxiliaryLossLayer<T>
+public class MultiHeadAttentionLayer<T> : LayerBase<T>, IAuxiliaryLossLayer<T>, ILayerSerializationMetadata
 {
     /// <summary>
     /// Gets or sets whether auxiliary loss (attention regularization) should be used during training.
@@ -477,6 +477,14 @@ public Dictionary<string, string> GetAuxiliaryLossDiagnostics()
         };
     }
 
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["HeadCount"] = _headCount.ToString()
+        };
+    }
+
     /// <summary>
     /// Gets diagnostic information about this component's state and behavior.
     /// Overrides <see cref="LayerBase{T}.GetDiagnostics"/> to include auxiliary loss diagnostics.
diff --git a/src/NeuralNetworks/Layers/PositionalEncodingLayer.cs b/src/NeuralNetworks/Layers/PositionalEncodingLayer.cs
index dc10ad131..811cdbb6c 100644
--- a/src/NeuralNetworks/Layers/PositionalEncodingLayer.cs
+++ b/src/NeuralNetworks/Layers/PositionalEncodingLayer.cs
@@ -32,7 +32,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type used for calculations, typically float or double.</typeparam>
-public class PositionalEncodingLayer<T> : LayerBase<T>
+public class PositionalEncodingLayer<T> : LayerBase<T>, ILayerSerializationMetadata
 {
     /// <summary>
     /// The maximum sequence length that this layer can handle.
@@ -429,4 +429,13 @@ public override ComputationNode<T> ExportComputationGraph(List<ComputationNode<T
     }
 
     public override bool SupportsJitCompilation => true;
-}
\ No newline at end of file
+
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["MaxSequenceLength"] = maxSequenceLength.ToString(System.Globalization.CultureInfo.InvariantCulture),
+            ["EmbeddingSize"] = embeddingSize.ToString(System.Globalization.CultureInfo.InvariantCulture)
+        };
+    }
+}
diff --git a/src/NeuralNetworks/Layers/SelfAttentionLayer.cs b/src/NeuralNetworks/Layers/SelfAttentionLayer.cs
index e16edca9e..6d2143f50 100644
--- a/src/NeuralNetworks/Layers/SelfAttentionLayer.cs
+++ b/src/NeuralNetworks/Layers/SelfAttentionLayer.cs
@@ -33,7 +33,7 @@ namespace AiDotNet.NeuralNetworks.Layers;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type used for calculations, typically float or double.</typeparam>
-public class SelfAttentionLayer<T> : LayerBase<T>, IAuxiliaryLossLayer<T>
+public class SelfAttentionLayer<T> : LayerBase<T>, IAuxiliaryLossLayer<T>, ILayerSerializationMetadata
 {
     /// <summary>
     /// Gets or sets whether auxiliary loss (attention sparsity regularization) should be used during training.
@@ -1371,4 +1371,12 @@ public override bool SupportsJitCompilation
                    _valueWeights.Shape.Length >= 2 && _valueWeights.Shape[0] > 0;
         }
     }
+
+    Dictionary<string, string> ILayerSerializationMetadata.GetSerializationMetadata()
+    {
+        return new Dictionary<string, string>
+        {
+            ["HeadCount"] = _headCount.ToString(System.Globalization.CultureInfo.InvariantCulture)
+        };
+    }
 }
diff --git a/src/NeuralNetworks/NeuralNetworkBase.cs b/src/NeuralNetworks/NeuralNetworkBase.cs
index 147b17dcc..ddbb31de0 100644
--- a/src/NeuralNetworks/NeuralNetworkBase.cs
+++ b/src/NeuralNetworks/NeuralNetworkBase.cs
@@ -1263,6 +1263,12 @@ public virtual byte[] Serialize()
         using var ms = new MemoryStream();
         using var writer = new BinaryWriter(ms);
 
+        // Serialization format:
+        // - V1: [layerCount:int32] ...
+        // - V2+: [-version:int32][layerCount:int32] ... (supports per-layer extra parameter blocks)
+        const int serializationVersion = 2;
+        writer.Write(-serializationVersion);
+
         // Write the number of layers
         writer.Write(Layers.Count);
 
@@ -1270,7 +1276,7 @@ public virtual byte[] Serialize()
         foreach (var layer in Layers)
         {
             // Write layer type
-            writer.Write(layer.GetType().Name);
+            writer.Write(GetSerializedLayerTypeIdentifier(layer));
 
             // Write input shape
             var inputShape = layer.GetInputShape();
@@ -1300,6 +1306,25 @@ public virtual byte[] Serialize()
                     writer.Write(Convert.ToDouble(param));
                 }
             }
+
+            // Write any extra parameter blocks (V2+).
+            int extraCount = 0;
+            AiDotNet.Tensors.LinearAlgebra.Vector<T>? extras = null;
+            if (layer is AiDotNet.NeuralNetworks.Layers.ILayerSerializationExtras<T> extraProvider &&
+                extraProvider.ExtraParameterCount > 0)
+            {
+                extras = extraProvider.GetExtraParameters();
+                extraCount = extras.Length;
+            }
+
+            writer.Write(extraCount);
+            if (extraCount > 0 && extras != null)
+            {
+                for (int i = 0; i < extras.Length; i++)
+                {
+                    writer.Write(Convert.ToDouble(extras[i]));
+                }
+            }
         }
 
         // Write network-specific data
@@ -1308,6 +1333,47 @@ public virtual byte[] Serialize()
         return ms.ToArray();
     }
 
+    private static string GetSerializedLayerTypeIdentifier(ILayer<T> layer)
+    {
+        string typeName = layer.GetType().Name;
+
+        var metadata = new Dictionary<string, string>(StringComparer.Ordinal);
+
+        if (layer is AiDotNet.NeuralNetworks.Layers.ILayerSerializationMetadata metadataProvider)
+        {
+            foreach (var kvp in metadataProvider.GetSerializationMetadata())
+            {
+                metadata[kvp.Key] = kvp.Value;
+            }
+        }
+
+        // Persist activation types for LayerBase-derived layers so Clone/DeepCopy round-trips behavior.
+        if (layer is AiDotNet.NeuralNetworks.Layers.LayerBase<T> layerBase)
+        {
+            if (layerBase.VectorActivation != null)
+            {
+                metadata["VectorActivationType"] = layerBase.VectorActivation.GetType().AssemblyQualifiedName ?? layerBase.VectorActivation.GetType().FullName ?? string.Empty;
+            }
+            else if (layerBase.ScalarActivation != null)
+            {
+                metadata["ScalarActivationType"] = layerBase.ScalarActivation.GetType().AssemblyQualifiedName ?? layerBase.ScalarActivation.GetType().FullName ?? string.Empty;
+            }
+        }
+
+        if (metadata.Count == 0)
+        {
+            return typeName;
+        }
+
+        // Stable ordering for deterministic serialization.
+        foreach (var kvp in metadata.OrderBy(k => k.Key, StringComparer.Ordinal))
+        {
+            typeName += $";{kvp.Key}={kvp.Value}";
+        }
+
+        return typeName;
+    }
+
     /// <summary>
     /// Deserializes the neural network from a byte array.
     /// </summary>
@@ -1320,8 +1386,20 @@ public virtual void Deserialize(byte[] data)
         // Clear existing layers
         ClearLayers();
 
-        // Read the number of layers
-        int layerCount = reader.ReadInt32();
+        // Read the number of layers (support both V1 and V2+ formats).
+        int first = reader.ReadInt32();
+        int serializationVersion;
+        int layerCount;
+        if (first < 0)
+        {
+            serializationVersion = -first;
+            layerCount = reader.ReadInt32();
+        }
+        else
+        {
+            serializationVersion = 1;
+            layerCount = first;
+        }
 
         // Read and recreate each layer
         for (int i = 0; i < layerCount; i++)
@@ -1363,6 +1441,24 @@ public virtual void Deserialize(byte[] data)
                 layer.UpdateParameters(parameters);
             }
 
+            if (serializationVersion >= 2)
+            {
+                int extraCount = reader.ReadInt32();
+                if (extraCount > 0)
+                {
+                    var extraParams = new Vector<T>(extraCount);
+                    for (int j = 0; j < extraCount; j++)
+                    {
+                        extraParams[j] = NumOps.FromDouble(reader.ReadDouble());
+                    }
+
+                    if (layer is AiDotNet.NeuralNetworks.Layers.ILayerSerializationExtras<T> extraProvider)
+                    {
+                        extraProvider.SetExtraParameters(extraParams);
+                    }
+                }
+            }
+
             // Add the layer to the network
             _layers.Add(layer);
         }
@@ -2440,4 +2536,4 @@ protected virtual ComputationNode<T> ConvertLayerToGraph(ILayer<T> layer, Comput
 
     #endregion
 
-}
\ No newline at end of file
+}
diff --git a/src/NeuralNetworks/Transformer.cs b/src/NeuralNetworks/Transformer.cs
index fbef43d10..592e2e168 100644
--- a/src/NeuralNetworks/Transformer.cs
+++ b/src/NeuralNetworks/Transformer.cs
@@ -660,7 +660,11 @@ protected override void DeserializeNetworkSpecificData(BinaryReader reader)
         T dropoutRate = NumOps.FromDouble(reader.ReadDouble());
 
         // Read and reconstruct loss function and optimizer
-        _optimizer = DeserializationHelper.DeserializeInterface<IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>>(reader) ?? new GradientDescentOptimizer<T, Tensor<T>, Tensor<T>>(this);
+        LossFunction = DeserializationHelper.DeserializeInterface<ILossFunction<T>>(reader)
+            ?? NeuralNetworkHelper<T>.GetDefaultLossFunction(_transformerArchitecture.TaskType);
+
+        _optimizer = DeserializationHelper.DeserializeInterface<IGradientBasedOptimizer<T, Tensor<T>, Tensor<T>>>(reader)
+            ?? new GradientDescentOptimizer<T, Tensor<T>, Tensor<T>>(this);
     }
 
     /// <summary>
@@ -698,4 +702,4 @@ protected override IFullModel<T, Tensor<T>, Tensor<T>> CreateNewInstance()
             LossFunction,
             _optimizer);
     }
-}
\ No newline at end of file
+}
diff --git a/src/Normalizers/NoNormalizer.cs b/src/Normalizers/NoNormalizer.cs
index ff16962e4..6960cf01a 100644
--- a/src/Normalizers/NoNormalizer.cs
+++ b/src/Normalizers/NoNormalizer.cs
@@ -122,16 +122,17 @@ public override (TInput, List<NormalizationParameters<T>>) NormalizeInput(TInput
             var parameters = Enumerable.Repeat(new NormalizationParameters<T> { Method = NormalizationMethod.None }, matrix.Columns).ToList();
             return (data, parameters);
         }
-        else if (data is Tensor<T> tensor && tensor.Shape.Length == 2)
+        else if (data is Tensor<T> tensor)
         {
-            int columns = tensor.Shape[1];
-            var parameters = Enumerable.Repeat(new NormalizationParameters<T> { Method = NormalizationMethod.None }, columns).ToList();
+            // Treat the last dimension as the "feature" dimension for parameter bookkeeping.
+            int featureCount = tensor.Shape.Length == 0 ? 1 : tensor.Shape[^1];
+            var parameters = Enumerable.Repeat(new NormalizationParameters<T> { Method = NormalizationMethod.None }, featureCount).ToList();
             return (data, parameters);
         }
         
         throw new InvalidOperationException(
             $"Unsupported data type {typeof(TInput).Name}. " +
-            $"Supported types are Matrix<{typeof(T).Name}> and 2D Tensor<{typeof(T).Name}>.");
+            $"Supported types are Matrix<{typeof(T).Name}> and Tensor<{typeof(T).Name}>.");
     }
 
     /// <summary>
@@ -298,4 +299,4 @@ public override T Denormalize(TInput xMatrix, TOutput y, TOutput coefficients,
             $"y: {y?.GetType().Name}, coefficients: {coefficients?.GetType().Name}. " +
             $"Expected Matrix<T> or 2D Tensor<T> for xMatrix, and Vector<T> or Tensor<T> for y and coefficients.");
     }
-}
\ No newline at end of file
+}
diff --git a/src/Serving/ContinuousBatching/BatchScheduler.cs b/src/Serving/ContinuousBatching/BatchScheduler.cs
index 313e10ccd..3540758c5 100644
--- a/src/Serving/ContinuousBatching/BatchScheduler.cs
+++ b/src/Serving/ContinuousBatching/BatchScheduler.cs
@@ -108,7 +108,19 @@ public List<SequenceState<T>> ScheduleNextBatch()
         lock (_lock)
         {
             var batch = new List<SequenceState<T>>();
-            int availableSlots = _config.MaxBatchSize - _runningSequences.Count;
+
+            // Always include already-running sequences (continuous batching).
+            // These sequences must be processed every iteration until they complete or are preempted.
+            foreach (var seq in _runningSequences)
+            {
+                if (batch.Count >= _config.MaxBatchSize)
+                    break;
+
+                if (seq.Status is SequenceStatus.Generating or SequenceStatus.Prefilling)
+                    batch.Add(seq);
+            }
+
+            int availableSlots = _config.MaxBatchSize - batch.Count;
             long availableMemory = _config.MaxMemoryBytes - _usedMemoryBytes;
 
             // First, try to resume preempted sequences (FIFO order)
diff --git a/src/Serving/ContinuousBatching/ContinuousBatcher.cs b/src/Serving/ContinuousBatching/ContinuousBatcher.cs
index 82de7cc36..bf369b26c 100644
--- a/src/Serving/ContinuousBatching/ContinuousBatcher.cs
+++ b/src/Serving/ContinuousBatching/ContinuousBatcher.cs
@@ -1,5 +1,7 @@
 using System.Collections.Concurrent;
 using AiDotNet.Inference;
+using AiDotNet.Inference.SpeculativeDecoding;
+using AiDotNet.Helpers;
 using AiDotNet.Tensors.Helpers;
 
 namespace AiDotNet.Serving.ContinuousBatching;
@@ -33,12 +35,22 @@ namespace AiDotNet.Serving.ContinuousBatching;
 /// </para>
 /// </remarks>
 /// <typeparam name="T">The numeric type for tensor computations.</typeparam>
-public class ContinuousBatcher<T> : IDisposable
+internal class ContinuousBatcher<T> : IDisposable
 {
     private readonly ContinuousBatcherConfig _config;
     private readonly BatchScheduler<T> _scheduler;
     private readonly KVCache<T>? _kvCache;
     private readonly Func<Tensor<T>, Tensor<T>>? _model;
+    private readonly IDraftModel<T>? _draftModelOverride;
+
+    private SpeculativeDecoder<T>? _speculativeDecoder;
+    private readonly object _speculativeLock = new();
+    private volatile bool _speculationDisabledDueToFailure;
+    private long _speculationDisabledUntilIteration;
+
+    internal bool LastStepUsedSpeculation { get; private set; }
+    internal int LastStepSpeculationTokens { get; private set; }
+    internal string LastStepSpeculationReason { get; private set; } = string.Empty;
 
     private readonly ConcurrentDictionary<long, TaskCompletionSource<GenerationResult<T>>> _pendingResults;
     private readonly ConcurrentQueue<SequenceState<T>> _incomingRequests;
@@ -88,11 +100,13 @@ public class ContinuousBatcher<T> : IDisposable
     public ContinuousBatcher(
         ContinuousBatcherConfig config,
         Func<Tensor<T>, Tensor<T>>? model = null,
-        KVCache<T>? kvCache = null)
+        KVCache<T>? kvCache = null,
+        IDraftModel<T>? draftModel = null)
     {
         _config = config ?? throw new ArgumentNullException(nameof(config));
         _model = model;
         _kvCache = kvCache;
+        _draftModelOverride = draftModel;
 
         _scheduler = new BatchScheduler<T>(config.SchedulerConfig);
         _pendingResults = new ConcurrentDictionary<long, TaskCompletionSource<GenerationResult<T>>>();
@@ -202,6 +216,11 @@ public int Step()
         if (batch.Count == 0)
             return 0;
 
+        bool useSpeculation = ShouldUseSpeculativeDecoding(batch, out var speculationReason);
+        LastStepUsedSpeculation = useSpeculation;
+        LastStepSpeculationTokens = 0;
+        LastStepSpeculationReason = speculationReason;
+
         _totalIterations++;
         int tokensGenerated = 0;
 
@@ -220,26 +239,32 @@ public int Step()
         {
             if (seq.Status == SequenceStatus.Generating)
             {
-                int newToken = RunDecodeStep(seq);
-                if (newToken >= 0)
+                var newTokens = useSpeculation ? RunDecodeStepSpeculative(seq) : RunDecodeStep(seq);
+                if (newTokens.Count > 0)
                 {
-                    tokensGenerated++;
-                    _totalTokensGenerated++;
-
-                    // Fire token generated event
-                    TokenGenerated?.Invoke(this, new TokenGeneratedEventArgs<T>
+                    foreach (var newToken in newTokens)
                     {
-                        Sequence = seq,
-                        TokenId = newToken
-                    });
-
-                    // Invoke callback if provided
-                    seq.Request.OnTokenGenerated?.Invoke(newToken);
-
-                    // Check for completion
-                    if (seq.ShouldStop(_config.EosTokenId, seq.Request.StopTokenIds))
-                    {
-                        CompleteSequence(seq);
+                        tokensGenerated++;
+                        _totalTokensGenerated++;
+                        if (useSpeculation)
+                            LastStepSpeculationTokens++;
+
+                        // Fire token generated event
+                        TokenGenerated?.Invoke(this, new TokenGeneratedEventArgs<T>
+                        {
+                            Sequence = seq,
+                            TokenId = newToken
+                        });
+
+                        // Invoke callback if provided
+                        seq.Request.OnTokenGenerated?.Invoke(newToken);
+
+                        // Check for completion after each appended token
+                        if (seq.ShouldStop(_config.EosTokenId, seq.Request.StopTokenIds))
+                        {
+                            CompleteSequence(seq);
+                            break;
+                        }
                     }
                 }
             }
@@ -325,9 +350,9 @@ private void RunPrefill(SequenceState<T> sequence)
         sequence.Status = SequenceStatus.Generating;
     }
 
-    private int RunDecodeStep(SequenceState<T> sequence)
+    private IReadOnlyList<int> RunDecodeStep(SequenceState<T> sequence)
     {
-        if (_model == null) return -1;
+        if (_model == null) return Array.Empty<int>();
 
         // Create input tensor from last token only (incremental decoding)
         int lastToken = sequence.TokenIds[^1];
@@ -340,7 +365,292 @@ private int RunDecodeStep(SequenceState<T> sequence)
         int nextToken = SampleFromLogits(logits, sequence.Request);
         sequence.AppendToken(nextToken);
 
-        return nextToken;
+        return new[] { nextToken };
+    }
+
+    private IReadOnlyList<int> RunDecodeStepSpeculative(SequenceState<T> sequence)
+    {
+        if (_model == null) return Array.Empty<int>();
+        if (!ShouldSpeculateForThisIteration()) return RunDecodeStep(sequence);
+
+        int remaining = sequence.MaxNewTokens - sequence.GeneratedLength;
+        if (remaining <= 0) return Array.Empty<int>();
+
+        var decoder = EnsureSpeculativeDecoder();
+        if (decoder == null) return RunDecodeStep(sequence);
+
+        var numOps = MathHelper.GetNumericOperations<T>();
+        T temperature = numOps.FromDouble(sequence.Request.Temperature);
+
+        var inputTokens = new Vector<int>(sequence.TokenIds.ToArray());
+        int maxNew = Math.Min(remaining, Math.Max(1, _config.SpeculationDepth + 1));
+
+        SpeculativeResult result;
+        try
+        {
+            result = decoder.Generate(
+                inputTokens,
+                maxNewTokens: maxNew,
+                temperature: temperature,
+                eosToken: _config.EosTokenId);
+        }
+        catch (Exception ex)
+        {
+            _speculationDisabledDueToFailure = true;
+            InferenceDiagnostics.RecordException(
+                area: "Serving.ContinuousBatching",
+                feature: "SpeculativeDecoding",
+                ex: ex,
+                reason: "Speculative decoder execution failed; falling back to baseline decode.");
+            InferenceDiagnostics.RecordDecision(
+                area: "Serving.ContinuousBatching",
+                feature: "SpeculativeDecoding",
+                enabled: false,
+                reason: "DisabledDueToFailure");
+            return RunDecodeStep(sequence);
+        }
+
+        if (result.NewTokens.Length == 0)
+            return Array.Empty<int>();
+
+        var tokens = new List<int>(result.NewTokens.Length);
+        for (int i = 0; i < result.NewTokens.Length; i++)
+        {
+            int token = result.NewTokens[i];
+            sequence.AppendToken(token);
+            tokens.Add(token);
+
+            // Prevent appending beyond stop conditions (e.g., EOS in the speculative batch).
+            if (sequence.ShouldStop(_config.EosTokenId, sequence.Request.StopTokenIds))
+            {
+                break;
+            }
+        }
+
+        return tokens;
+    }
+
+    private bool ShouldUseSpeculativeDecoding(IReadOnlyCollection<SequenceState<T>> batch, out string reason)
+    {
+        if (_speculationDisabledDueToFailure)
+        {
+            reason = "DisabledDueToFailure";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: reason);
+            return false;
+        }
+
+        if (!_config.EnableSpeculativeDecoding)
+        {
+            reason = "DisabledByConfig";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: reason);
+            return false;
+        }
+
+        if (_config.SpeculationPolicy == AiDotNet.Configuration.SpeculationPolicy.ForceOff)
+        {
+            reason = "ForceOff";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: reason);
+            return false;
+        }
+
+        if (_config.SpeculationPolicy == AiDotNet.Configuration.SpeculationPolicy.ForceOn)
+        {
+            reason = "ForceOn";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: true, reason: reason);
+            return true;
+        }
+
+        if (_config.SpeculationPolicy == AiDotNet.Configuration.SpeculationPolicy.ThroughputFirst)
+        {
+            // Extremely conservative: only speculate when there is no queue pressure and batches are tiny.
+            bool ok = batch.Count == 1 && _scheduler.WaitingCount == 0 && _speculationDisabledUntilIteration <= _totalIterations;
+            reason = ok ? "ThroughputFirst(Enabled)" : "ThroughputFirst(Backoff)";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: ok, reason: reason);
+            return ok;
+        }
+
+        // Auto policy: back off under load and when draft acceptance is too low.
+        if (_speculationDisabledUntilIteration > _totalIterations)
+        {
+            reason = "AutoBackoff(Cooldown)";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: reason);
+            return false;
+        }
+
+        int maxBatchForSpeculation = _config.SchedulerConfig.MaxBatchSize / 2;
+        if (_config.SpeculationPolicy == AiDotNet.Configuration.SpeculationPolicy.LatencyFirst)
+        {
+            // Allow more speculation under load, but still avoid it when the queue is growing.
+            maxBatchForSpeculation = Math.Max(1, _config.SchedulerConfig.MaxBatchSize);
+        }
+
+        bool enabled = batch.Count <= Math.Max(1, maxBatchForSpeculation) && _scheduler.WaitingCount == 0;
+        if (!enabled)
+        {
+            reason = _config.SpeculationPolicy == AiDotNet.Configuration.SpeculationPolicy.LatencyFirst
+                ? "LatencyFirst(Backoff:LoadOrQueue)"
+                : "AutoBackoff(LoadOrQueue)";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: reason);
+            return false;
+        }
+
+        // If we have enough evidence that the draft model is low-quality, disable speculation for a short cooldown.
+        var decoder = _speculativeDecoder;
+        if (decoder != null && decoder.TotalDraftTokens >= 32 && decoder.AcceptanceRate < 0.25)
+        {
+            _speculationDisabledUntilIteration = _totalIterations + 25;
+            reason = $"AutoBackoff(LowAcceptanceRate={decoder.AcceptanceRate:0.00})";
+            InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: reason);
+            return false;
+        }
+
+        reason = "AutoEnabled";
+        InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: true, reason: reason);
+        return true;
+    }
+
+    private bool ShouldSpeculateForThisIteration()
+    {
+        // Defensive: if speculation is enabled but we don't have a model forward, we can't speculate.
+        return !_speculationDisabledDueToFailure &&
+               _model != null &&
+               _config.EnableSpeculativeDecoding &&
+               _config.SpeculationPolicy != AiDotNet.Configuration.SpeculationPolicy.ForceOff;
+    }
+
+    private SpeculativeDecoder<T>? EnsureSpeculativeDecoder()
+    {
+        if (_speculationDisabledDueToFailure)
+            return null;
+
+        if (_speculativeDecoder != null)
+            return _speculativeDecoder;
+
+        lock (_speculativeLock)
+        {
+            if (_speculativeDecoder != null)
+                return _speculativeDecoder;
+
+            if (_speculationDisabledDueToFailure)
+                return null;
+
+            if (_model == null)
+                return null;
+
+            int vocabSize;
+            try
+            {
+                vocabSize = DetectVocabSize();
+            }
+            catch (Exception ex)
+            {
+                _speculationDisabledDueToFailure = true;
+                InferenceDiagnostics.RecordException("Serving.ContinuousBatching", "SpeculativeDecoding", ex, "Vocab size detection failed; disabling speculation.");
+                return null;
+            }
+
+            if (vocabSize <= 0)
+            {
+                _speculationDisabledDueToFailure = true;
+                InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: false, reason: "DisabledDueToFailure(VocabSizeInvalid)");
+                return null;
+            }
+
+            IDraftModel<T> draft;
+            try
+            {
+                draft = _draftModelOverride ?? new NGramDraftModel<T>(ngramSize: 3, vocabSize: vocabSize, seed: 42);
+            }
+            catch (Exception ex)
+            {
+                _speculationDisabledDueToFailure = true;
+                InferenceDiagnostics.RecordException("Serving.ContinuousBatching", "SpeculativeDecoding", ex, "Draft model init failed; disabling speculation.");
+                return null;
+            }
+
+            Matrix<T> TargetForward(Vector<int> tokens)
+            {
+                // Run the target model over the full sequence and return per-position probabilities.
+                var input = CreateInputTensor(tokens.ToArray());
+                var logits = _model(input);
+
+                int seqLen = logits.Shape.Length > 2 ? logits.Shape[^2] : 1;
+                int localVocabSize = logits.Shape[^1];
+
+                var numOps = MathHelper.GetNumericOperations<T>();
+                var probs = new Matrix<T>(seqLen, localVocabSize);
+                for (int pos = 0; pos < seqLen; pos++)
+                {
+                    // Extract logits for this position
+                    var row = new double[localVocabSize];
+                    double max = double.NegativeInfinity;
+                    for (int v = 0; v < localVocabSize; v++)
+                    {
+                        double val = Convert.ToDouble(logits[logits.Shape.Length > 2 ? new[] { 0, pos, v } : new[] { 0, v }]);
+                        row[v] = val;
+                        if (val > max) max = val;
+                    }
+
+                    // Softmax
+                    double sum = 0.0;
+                    for (int v = 0; v < localVocabSize; v++)
+                    {
+                        row[v] = Math.Exp(row[v] - max);
+                        sum += row[v];
+                    }
+                    if (sum <= 0) sum = 1;
+
+                    for (int v = 0; v < localVocabSize; v++)
+                    {
+                        probs[pos, v] = numOps.FromDouble(row[v] / sum);
+                    }
+                }
+
+                return probs;
+            }
+
+            var config = new SpeculativeDecodingConfig<T>
+            {
+                NumDraftTokens = Math.Max(1, _config.SpeculationDepth),
+                Seed = 42,
+                AdaptiveDraftLength = _config.SpeculationPolicy == AiDotNet.Configuration.SpeculationPolicy.Auto,
+                MinAcceptanceRate = MathHelper.GetNumericOperations<T>().FromDouble(0.5),
+                UseTreeSpeculation = _config.UseTreeSpeculation ||
+                                    _config.SpeculativeMethod == AiDotNet.Configuration.SpeculativeMethod.Medusa ||
+                                    _config.SpeculativeMethod == AiDotNet.Configuration.SpeculativeMethod.Eagle,
+                TreeBranchFactor = _config.SpeculativeMethod == AiDotNet.Configuration.SpeculativeMethod.Medusa ? 4 : 2,
+                MaxTreeDepth = Math.Max(1, _config.SpeculationDepth)
+            };
+
+            try
+            {
+                _speculativeDecoder = new SpeculativeDecoder<T>(draft, TargetForward, config);
+                InferenceDiagnostics.RecordDecision("Serving.ContinuousBatching", "SpeculativeDecoding", enabled: true, reason: "DecoderInitialized");
+                return _speculativeDecoder;
+            }
+            catch (Exception ex)
+            {
+                _speculationDisabledDueToFailure = true;
+                InferenceDiagnostics.RecordException("Serving.ContinuousBatching", "SpeculativeDecoding", ex, "Decoder init failed; disabling speculation.");
+                return null;
+            }
+        }
+    }
+
+    private int DetectVocabSize()
+    {
+        try
+        {
+            // Probe the model with a minimal input to infer the vocabulary dimension.
+            var probe = CreateInputTensor([0]);
+            var logits = _model!(probe);
+            return logits.Shape.Length >= 1 ? logits.Shape[^1] : 0;
+        }
+        catch
+        {
+            // Let the caller handle vocab detection failure.
+            return 0;
+        }
     }
 
     private Tensor<T> CreateInputTensor(int[] tokenIds)
diff --git a/src/Serving/ContinuousBatching/ContinuousBatcherConfig.cs b/src/Serving/ContinuousBatching/ContinuousBatcherConfig.cs
index acaffdcce..fb91ecf93 100644
--- a/src/Serving/ContinuousBatching/ContinuousBatcherConfig.cs
+++ b/src/Serving/ContinuousBatching/ContinuousBatcherConfig.cs
@@ -35,6 +35,34 @@ public class ContinuousBatcherConfig
     /// </summary>
     public bool EnableSpeculativeDecoding { get; set; } = false;
 
+    /// <summary>
+    /// Policy for when speculative decoding should run (default: Auto).
+    /// </summary>
+    public AiDotNet.Configuration.SpeculationPolicy SpeculationPolicy { get; set; } = AiDotNet.Configuration.SpeculationPolicy.Auto;
+
+    /// <summary>
+    /// Number of tokens to draft ahead when speculative decoding is enabled.
+    /// </summary>
+    public int SpeculationDepth { get; set; } = 4;
+
+    /// <summary>
+    /// Speculative decoding method to use (default: Auto).
+    /// </summary>
+    /// <remarks>
+    /// This keeps the public serving surface compact while enabling internal selection of
+    /// classic draft-model speculation vs tree-based alternatives (Medusa/EAGLE).
+    /// </remarks>
+    public AiDotNet.Configuration.SpeculativeMethod SpeculativeMethod { get; set; } = AiDotNet.Configuration.SpeculativeMethod.Auto;
+
+    /// <summary>
+    /// Whether to use tree-based speculation (multiple draft continuations).
+    /// </summary>
+    /// <remarks>
+    /// This is an advanced option; when false the batcher uses classic speculative decoding.
+    /// Some speculative methods may implicitly enable this internally.
+    /// </remarks>
+    public bool UseTreeSpeculation { get; set; } = false;
+
     /// <summary>
     /// Creates config for a specific model.
     /// </summary>
diff --git a/tests/AiDotNet.Serving.Tests/ServingIntegrationTests.cs b/tests/AiDotNet.Serving.Tests/ServingIntegrationTests.cs
index cabf61e1a..521e0b4d1 100644
--- a/tests/AiDotNet.Serving.Tests/ServingIntegrationTests.cs
+++ b/tests/AiDotNet.Serving.Tests/ServingIntegrationTests.cs
@@ -225,6 +225,69 @@ public async Task Predict_WithValidInput_ReturnsResults()
         repository.UnloadModel("test-model-4");
     }
 
+    /// <summary>
+    /// Verifies that serving can route to a pre-loaded model variant via an adapter header (Multi-LoRA MVP).
+    /// </summary>
+    [Fact]
+    public async Task Predict_WithAdapterHeader_RoutesToModelVariant()
+    {
+        // Arrange
+        using var scope = _factory.Services.CreateScope();
+        var repository = scope.ServiceProvider.GetRequiredService<IModelRepository>();
+
+        var baseName = "test-model-variant";
+        var adapterId = "adapterA";
+        var variantName = $"{baseName}__{adapterId}";
+
+        repository.LoadModel(baseName, CreateSimpleTestModel(baseName));
+
+        // Variant model returns (sum + 100) so we can detect routing.
+        var numOps = MathHelper.GetNumericOperations<double>();
+        var variant = new ServableModelWrapper<double>(
+            modelName: variantName,
+            inputDimension: 3,
+            outputDimension: 1,
+            predictFunc: input =>
+            {
+                var sum = numOps.Zero;
+                for (int i = 0; i < input.Length; i++)
+                {
+                    sum = numOps.Add(sum, input[i]);
+                }
+                return new Vector<double>(new[] { sum + 100.0 });
+            });
+        repository.LoadModel(variantName, variant);
+
+        var request = new PredictionRequest
+        {
+            Features = new[] { new[] { 1.0, 2.0, 3.0 } },
+            RequestId = "test-request-variant"
+        };
+
+        // Act
+        var message = new HttpRequestMessage(HttpMethod.Post, $"/api/inference/predict/{baseName}")
+        {
+            Content = JsonContent.Create(request)
+        };
+        message.Headers.Add("X-AiDotNet-Lora", adapterId);
+
+        var response = await _client.SendAsync(message);
+
+        // Assert
+        response.EnsureSuccessStatusCode();
+        var result = await response.Content.ReadFromJsonAsync<PredictionResponse>();
+        Assert.NotNull(result);
+        Assert.Equal("test-request-variant", result.RequestId);
+        Assert.NotNull(result.Predictions);
+        Assert.Single(result.Predictions);
+        Assert.Single(result.Predictions[0]);
+        Assert.Equal(106.0, result.Predictions[0][0], 5);
+
+        // Cleanup
+        repository.UnloadModel(baseName);
+        repository.UnloadModel(variantName);
+    }
+
     /// <summary>
     /// Critical test: Verifies that batch processing works correctly.
     /// This test ensures that multiple concurrent requests are batched together
diff --git a/tests/AiDotNet.Tests/InferenceOptimization/ConvolutionKernelValidationTests.cs b/tests/AiDotNet.Tests/InferenceOptimization/ConvolutionKernelValidationTests.cs
new file mode 100644
index 000000000..ecbd2b81f
--- /dev/null
+++ b/tests/AiDotNet.Tests/InferenceOptimization/ConvolutionKernelValidationTests.cs
@@ -0,0 +1,22 @@
+using System;
+using AiDotNet.InferenceOptimization.Kernels;
+using AiDotNet.LinearAlgebra;
+using Xunit;
+
+namespace AiDotNet.Tests.InferenceOptimization;
+
+public class ConvolutionKernelValidationTests
+{
+    [Fact]
+    public void Conv2D_Throws_WhenKernelInChannelsMismatch()
+    {
+        var kernel = new ConvolutionKernel();
+
+        var input = new Tensor<float>(new[] { 1, 3, 5, 5 });
+        var badKernel = new Tensor<float>(new[] { 2, 2, 3, 3 });
+
+        var ex = Assert.Throws<ArgumentException>(() => kernel.Conv2D(input, badKernel));
+        Assert.Contains("kernel.Shape[1] == inChannels", ex.Message, StringComparison.OrdinalIgnoreCase);
+    }
+}
+
diff --git a/tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs b/tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs
new file mode 100644
index 000000000..3f09c7ff7
--- /dev/null
+++ b/tests/AiDotNet.Tests/IntegrationTests/Inference/InferenceSessionIntegrationTests.cs
@@ -0,0 +1,644 @@
+using AiDotNet.Configuration;
+using AiDotNet.Enums;
+using AiDotNet.Interfaces;
+using AiDotNet.Models;
+using AiDotNet.Models.Options;
+using AiDotNet.Models.Results;
+using AiDotNet.NeuralNetworks;
+using AiDotNet.NeuralNetworks.Layers;
+using AiDotNet.Normalizers;
+using AiDotNet.Tensors.LinearAlgebra;
+using System.Linq;
+using System.Threading.Tasks;
+using Xunit;
+
+namespace AiDotNet.Tests.IntegrationTests.Inference;
+
+[Collection(AiDotNet.Tests.TestInfrastructure.DiagnosticsEnvironmentCollection.Name)]
+public class InferenceSessionIntegrationTests
+{
+    private const float Tolerance = 1e-4f;
+    private const int SequenceLength = 1;
+    private const int EmbeddingDimension = 8;
+    private const int HeadCount = 2;
+    private const int FlatSize = SequenceLength * EmbeddingDimension;
+
+    [Fact]
+    public void PredictionModelResult_Predict_IsStateless_WhenInferenceOptimizationsConfigured()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = false,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        var token = CreateTokenTensor(1.0f);
+
+        var y1 = result.Predict(token);
+        var y2 = result.Predict(token);
+
+        AssertTensorsEqual(y1, y2, Tolerance);
+    }
+
+    [Fact]
+    public void PredictionModelResult_SerializeDeserialize_PreservesInferenceOptimizationConfig()
+    {
+        var config = new InferenceOptimizationConfig
+        {
+            EnableFlashAttention = false,
+            EnableKVCache = true,
+            EnablePagedKVCache = false,
+            AttentionMasking = AttentionMaskingMode.Auto
+        };
+
+        var original = CreateDeterministicResult(config);
+        var bytes = original.Serialize();
+
+        var loaded = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = true,
+                EnableKVCache = false,
+                EnablePagedKVCache = true,
+                AttentionMasking = AttentionMaskingMode.Causal
+            });
+
+        loaded.Deserialize(bytes);
+
+        var loadedConfig = loaded.GetInferenceOptimizationConfigForServing();
+        Assert.NotNull(loadedConfig);
+        Assert.Equal(config.EnableFlashAttention, loadedConfig!.EnableFlashAttention);
+        Assert.Equal(config.EnableKVCache, loadedConfig.EnableKVCache);
+        Assert.Equal(config.EnablePagedKVCache, loadedConfig.EnablePagedKVCache);
+        Assert.Equal(config.AttentionMasking, loadedConfig.AttentionMasking);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_SequencesAreIndependent()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = false,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        var token = CreateTokenTensor(0.75f);
+        var tokenForB = CreateTokenTensor(0.75f);
+        var tokenFresh = CreateTokenTensor(0.75f);
+
+        using var session = result.BeginInferenceSession();
+
+        var seqA = session.CreateSequence();
+        var seqB = session.CreateSequence();
+        var seqFresh = session.CreateSequence();
+
+        var a1 = seqA.Predict(token);
+        var statsAfterFirst = seqA.GetInferenceStatistics();
+        var lengthsAfterFirst = (int[])statsAfterFirst["KVCache_SequenceLengths"];
+        int lenAfterFirst = lengthsAfterFirst[0];
+
+        var b1 = seqB.Predict(tokenForB);
+        var fresh1 = seqFresh.Predict(tokenFresh);
+
+        AssertTensorsEqual(a1, b1, Tolerance);
+        AssertTensorsEqual(a1, fresh1, Tolerance);
+
+        var freshStatsAfterFirst = seqFresh.GetInferenceStatistics();
+        var freshLengthsAfterFirst = (int[])freshStatsAfterFirst["KVCache_SequenceLengths"];
+        int freshLenAfterFirst = freshLengthsAfterFirst[0];
+        Assert.Equal(lenAfterFirst, freshLenAfterFirst);
+
+        _ = seqA.Predict(CreateTokenTensor(-0.25f));
+
+        var statsAfterSecond = seqA.GetInferenceStatistics();
+        var lengthsAfterSecond = (int[])statsAfterSecond["KVCache_SequenceLengths"];
+        Assert.True(lengthsAfterSecond[0] > lenAfterFirst, $"Expected KV-cache length to grow, but got {lenAfterFirst} -> {lengthsAfterSecond[0]}");
+
+        // Fresh sequence should grow independently when it advances.
+        _ = seqFresh.Predict(CreateTokenTensor(-0.25f));
+        var freshStatsAfterSecond = seqFresh.GetInferenceStatistics();
+        var freshLengthsAfterSecond = (int[])freshStatsAfterSecond["KVCache_SequenceLengths"];
+        Assert.True(
+            freshLengthsAfterSecond[0] > freshLenAfterFirst,
+            $"Expected fresh KV-cache length to grow, but got {freshLenAfterFirst} -> {freshLengthsAfterSecond[0]}");
+    }
+
+    [Fact]
+    public void BeginInferenceSession_ResetRestoresInitialSequenceState()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = false,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        var token1 = CreateTokenTensor(0.25f);
+        var token2 = CreateTokenTensor(0.5f);
+
+        using var session = result.BeginInferenceSession();
+        var seq = session.CreateSequence();
+
+        var y1 = seq.Predict(token1);
+        _ = seq.Predict(token2);
+
+        seq.Reset();
+
+        var y1AfterReset = seq.Predict(token1);
+        AssertTensorsEqual(y1, y1AfterReset, Tolerance);
+    }
+
+    [Fact]
+    public async Task BeginInferenceSession_ConcurrentPredict_MultipleSequences_DoesNotThrow()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = true,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        using var session = result.BeginInferenceSession();
+        var seqA = session.CreateSequence();
+        var seqB = session.CreateSequence();
+
+        var tasks = Enumerable.Range(0, 20)
+            .Select(i => Task.Run(() =>
+            {
+                var t = CreateTokenTensor(0.1f + (i * 0.01f));
+                _ = (i % 2 == 0 ? seqA : seqB).Predict(t);
+            }))
+            .ToArray();
+
+        await Task.WhenAll(tasks);
+
+        var statsA = seqA.GetInferenceStatistics();
+        var statsB = seqB.GetInferenceStatistics();
+        Assert.True((int)statsA["PagedAttentionLayerCount"] > 0);
+        Assert.True((int)statsB["PagedAttentionLayerCount"] > 0);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_KVCacheQuantization_Int8_UsesQuantizedStorage()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = false,
+                KVCacheQuantization = KVCacheQuantizationMode.Int8,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        using var session = result.BeginInferenceSession();
+        var seq = session.CreateSequence();
+
+        _ = seq.Predict(CreateTokenTensor(0.1f));
+
+        var stats = seq.GetInferenceStatistics();
+        Assert.True(stats.TryGetValue("KVCache_DataType", out var dataType));
+        Assert.Equal("Int8", dataType);
+        Assert.True(stats.TryGetValue("KVCache_UseInt8Storage", out var useInt8));
+        Assert.True((bool)useInt8);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_KVCachePrecision_Auto_UsesFloat16Storage_ForFloatModel()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = false,
+                KVCachePrecision = KVCachePrecisionMode.Auto,
+                KVCacheQuantization = KVCacheQuantizationMode.None,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        using var session = result.BeginInferenceSession();
+        var seq = session.CreateSequence();
+
+        _ = seq.Predict(CreateTokenTensor(0.1f));
+
+        var stats = seq.GetInferenceStatistics();
+        Assert.True(stats.TryGetValue("KVCache_DataType", out var dataType));
+        Assert.Equal("Float16", dataType);
+        Assert.True(stats.TryGetValue("KVCache_UseFp16Storage", out var useFp16));
+        Assert.True((bool)useFp16);
+        Assert.True(stats.TryGetValue("KVCache_UseInt8Storage", out var useInt8));
+        Assert.False((bool)useInt8);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_SpeculativeDecoding_Configured_DoesNotRunDuringPredict()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = false,
+                EnablePagedKVCache = false,
+                EnableSpeculativeDecoding = true,
+                DraftModelType = DraftModelType.NGram,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        using var session = result.BeginInferenceSession();
+        var seq = session.CreateSequence();
+
+        _ = seq.Predict(CreateTokenTensor(0.1f));
+
+        var stats = seq.GetInferenceStatistics();
+        Assert.True(stats.TryGetValue("SpeculativeDecodingEnabled", out var enabled));
+        Assert.True((bool)enabled);
+        Assert.False(stats.ContainsKey("DraftModelType"));
+        Assert.False(stats.ContainsKey("SpeculationDepth"));
+    }
+
+    [Fact]
+    public void BeginInferenceSession_PagedKVCache_IsInitialized_WhenEnabled()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = true,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        using var session = result.BeginInferenceSession();
+        var seq = session.CreateSequence();
+
+        _ = seq.Predict(CreateTokenTensor(0.1f));
+
+        var stats = seq.GetInferenceStatistics();
+        Assert.True(stats.TryGetValue("PagedKVCacheInitialized", out var initialized));
+        Assert.True((bool)initialized);
+        Assert.True(stats.TryGetValue("PagedAttentionLayerCount", out var count));
+        Assert.True((int)count > 0);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_PagedAttention_WOQ_IsEnabled_WhenConfigured()
+    {
+        var result = CreateDeterministicResult(
+            new InferenceOptimizationConfig
+            {
+                EnableFlashAttention = false,
+                EnableKVCache = true,
+                EnablePagedKVCache = true,
+                EnableWeightOnlyQuantization = true,
+                AttentionMasking = AttentionMaskingMode.Auto
+            });
+
+        using var session = result.BeginInferenceSession();
+        var seq = session.CreateSequence();
+
+        _ = seq.Predict(CreateTokenTensor(0.2f));
+
+        var stats = seq.GetInferenceStatistics();
+        Assert.True(stats.TryGetValue("PagedAttentionWeightOnlyQuantizationEnabled", out var enabled));
+        Assert.True((bool)enabled);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_MultiLoRA_TaskSelection_IsIsolatedPerSequence()
+    {
+        var config = new InferenceOptimizationConfig
+        {
+            EnableFlashAttention = false,
+            EnableKVCache = false,
+            EnablePagedKVCache = false,
+            EnableSpeculativeDecoding = false,
+            EnableBatching = false
+        };
+
+        var model = CreateDeterministicMultiLoRAModel();
+        var result = CreateDeterministicResultWithModel(config, model);
+
+        var token = CreateTokenTensor(0.25f);
+
+        using var session = result.BeginInferenceSession();
+        var seqA = session.CreateSequence("taskA");
+        var seqB = session.CreateSequence("taskB");
+
+        var yA = seqA.Predict(token);
+        var yB = seqB.Predict(token);
+
+        AssertTensorsNotEqual(yA, yB, minAbsDiff: 1e-3f);
+
+        seqA.SetMultiLoRATask("taskB");
+        var yA2 = seqA.Predict(token);
+        AssertTensorsNotEqual(yA, yA2, minAbsDiff: 1e-3f);
+    }
+
+    [Fact]
+    public void BeginInferenceSession_MultiLoRA_TaskSwitch_ResetsKVCacheState_ForSameSequence()
+    {
+        var originalDiagnostics = Environment.GetEnvironmentVariable("AIDOTNET_DIAGNOSTICS");
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableFlashAttention = false,
+            EnableKVCache = true,
+            EnablePagedKVCache = false,
+            AttentionMasking = AttentionMaskingMode.Auto
+        };
+
+        var model = CreateDeterministicAttentionWithMultiLoRAModel();
+        var result = CreateDeterministicResultWithModel(config, model);
+
+        try
+        {
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", "1");
+            AiDotNet.Helpers.InferenceDiagnostics.Clear();
+
+            using var session = result.BeginInferenceSession();
+            var seq = session.CreateSequence("taskA");
+
+            var token1 = CreateTokenTensor(0.25f);
+            var token2 = CreateTokenTensor(0.5f);
+
+            _ = seq.Predict(token1);
+            var statsAfterFirst = seq.GetInferenceStatistics();
+            var lenAfterFirst = ((int[])statsAfterFirst["KVCache_SequenceLengths"])[0];
+
+            _ = seq.Predict(token2);
+            var statsAfterSecond = seq.GetInferenceStatistics();
+            var lenAfterSecond = ((int[])statsAfterSecond["KVCache_SequenceLengths"])[0];
+            Assert.True(lenAfterSecond > lenAfterFirst, $"Expected KV-cache length to grow, but got {lenAfterFirst} -> {lenAfterSecond}");
+
+            seq.SetMultiLoRATask("taskB");
+            _ = seq.Predict(token1);
+            var statsAfterSwitch = seq.GetInferenceStatistics();
+            var lenAfterSwitch = ((int[])statsAfterSwitch["KVCache_SequenceLengths"])[0];
+
+            Assert.True(lenAfterSwitch <= lenAfterFirst, $"Expected KV-cache to reset after task switch, but got {lenAfterFirst} -> {lenAfterSwitch}");
+
+            var entries = AiDotNet.Helpers.InferenceDiagnostics.Snapshot();
+            Assert.Contains(entries, e => e.Area == "InferenceSession" && e.Feature == "MultiLoRA" && e.Reason.Contains("Task=taskB"));
+        }
+        finally
+        {
+            AiDotNet.Helpers.InferenceDiagnostics.Clear();
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", originalDiagnostics);
+        }
+    }
+
+    [Fact]
+    public void NeuralNetworkBase_Clone_DoesNotShareParameters()
+    {
+        var model = CreateDeterministicAttentionOnlyModel();
+        var clone = (NeuralNetworkBase<float>)model.Clone();
+
+        // Clone should preserve parameters exactly (deep copy via serialization/deserialization).
+        Assert.Equal(model.GetParameters().Length, clone.GetParameters().Length);
+        for (int i = 0; i < model.GetParameters().Length; i++)
+        {
+            Assert.True(
+                Math.Abs(model.GetParameters()[i] - clone.GetParameters()[i]) <= Tolerance,
+                $"Parameter mismatch at {i}: {model.GetParameters()[i]} != {clone.GetParameters()[i]}");
+        }
+
+        var cloneParams = clone.GetParameters();
+        cloneParams[0] += 1.0f;
+        clone.UpdateParameters(cloneParams);
+
+        Assert.NotEqual(model.GetParameters()[0], clone.GetParameters()[0]);
+    }
+
+    private static PredictionModelResult<float, Tensor<float>, Tensor<float>> CreateDeterministicResult(InferenceOptimizationConfig config)
+    {
+        var model = CreateDeterministicAttentionOnlyModel();
+        return CreateDeterministicResultWithModel(config, model);
+    }
+
+    private static PredictionModelResult<float, Tensor<float>, Tensor<float>> CreateDeterministicResultWithModel(
+        InferenceOptimizationConfig config,
+        NeuralNetworkBase<float> model)
+    {
+        if (model == null) throw new ArgumentNullException(nameof(model));
+
+        var optimization = new OptimizationResult<float, Tensor<float>, Tensor<float>>
+        {
+            BestSolution = model
+        };
+
+        var normalization = new NormalizationInfo<float, Tensor<float>, Tensor<float>>
+        {
+            Normalizer = new NoNormalizer<float, Tensor<float>, Tensor<float>>(),
+            YParams = new NormalizationParameters<float> { Method = NormalizationMethod.None }
+        };
+
+        var options = new PredictionModelResultOptions<float, Tensor<float>, Tensor<float>>
+        {
+            OptimizationResult = optimization,
+            NormalizationInfo = normalization,
+            InferenceOptimizationConfig = config
+        };
+
+        return new PredictionModelResult<float, Tensor<float>, Tensor<float>>(options);
+    }
+
+    private static NeuralNetworkBase<float> CreateDeterministicMultiLoRAModel()
+    {
+        const int inputSize = FlatSize;
+        const int outputSize = FlatSize;
+
+        var baseDense = new DenseLayer<float>(inputSize, outputSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>());
+        var multi = new AiDotNet.LoRA.Adapters.MultiLoRAAdapter<float>(baseDense, defaultTaskName: "taskA", defaultRank: 1, alpha: 1.0, freezeBaseLayer: true);
+        multi.AddTask("taskB", rank: 1, alpha: 1.0);
+
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(inputSize),
+            multi,
+            new DenseLayer<float>(outputSize, outputSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: NeuralNetworkTaskType.Regression,
+            complexity: NetworkComplexity.Simple,
+            inputSize: inputSize,
+            outputSize: outputSize,
+            layers: layers);
+
+        var model = new NeuralNetwork<float>(architecture);
+
+        // Deterministic base weights across the whole model.
+        var p = model.GetParameters();
+        var deterministic = new float[p.Length];
+        for (int i = 0; i < deterministic.Length; i++)
+        {
+            deterministic[i] = ((i % 19) - 9) / 9.0f;
+        }
+        model.UpdateParameters(new Vector<float>(deterministic));
+
+        // Make taskB differ from taskA by setting distinct LoRA parameters.
+        // (Both A and B must be non-zero for the low-rank delta to have an effect.)
+        var taskA = multi.GetTaskAdapter("taskA");
+        var taskB = multi.GetTaskAdapter("taskB");
+
+        var aParams = taskA.GetParameters();
+        var bParams = taskB.GetParameters();
+
+        var a = new float[aParams.Length]; // all zeros => no delta
+        var b = new float[bParams.Length];
+        for (int i = 0; i < b.Length; i++)
+        {
+            b[i] = 0.05f;
+        }
+
+        taskA.UpdateParameters(new Vector<float>(a));
+        taskB.UpdateParameters(new Vector<float>(b));
+
+        return model;
+    }
+
+    private static NeuralNetworkBase<float> CreateDeterministicAttentionWithMultiLoRAModel()
+    {
+        const int inputSize = FlatSize;
+        const int outputSize = FlatSize;
+
+        var baseDense = new DenseLayer<float>(outputSize, outputSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>());
+        var multi = new AiDotNet.LoRA.Adapters.MultiLoRAAdapter<float>(baseDense, defaultTaskName: "taskA", defaultRank: 1, alpha: 1.0, freezeBaseLayer: true);
+        multi.AddTask("taskB", rank: 1, alpha: 1.0);
+
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(inputSize),
+            new ReshapeLayer<float>(new[] { FlatSize }, new[] { SequenceLength, EmbeddingDimension }),
+            new MultiHeadAttentionLayer<float>(
+                sequenceLength: SequenceLength,
+                embeddingDimension: EmbeddingDimension,
+                headCount: HeadCount,
+                activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>()),
+            new FlattenLayer<float>(new[] { SequenceLength, EmbeddingDimension }),
+            multi,
+            new DenseLayer<float>(outputSize, outputSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: NeuralNetworkTaskType.TextGeneration,
+            complexity: NetworkComplexity.Simple,
+            inputSize: inputSize,
+            outputSize: outputSize,
+            layers: layers);
+
+        var model = new NeuralNetwork<float>(architecture);
+
+        var p = model.GetParameters();
+        var deterministic = new float[p.Length];
+        for (int i = 0; i < deterministic.Length; i++)
+        {
+            deterministic[i] = ((i % 23) - 11) / 11.0f;
+        }
+        model.UpdateParameters(new Vector<float>(deterministic));
+
+        var taskA = multi.GetTaskAdapter("taskA");
+        var taskB = multi.GetTaskAdapter("taskB");
+
+        var aParams = taskA.GetParameters();
+        var bParams = taskB.GetParameters();
+
+        var a = new float[aParams.Length];
+        var b = new float[bParams.Length];
+        for (int i = 0; i < b.Length; i++)
+        {
+            b[i] = 0.05f;
+        }
+
+        taskA.UpdateParameters(new Vector<float>(a));
+        taskB.UpdateParameters(new Vector<float>(b));
+
+        return model;
+    }
+
+    private static NeuralNetworkBase<float> CreateDeterministicAttentionOnlyModel()
+    {
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(FlatSize),
+            new ReshapeLayer<float>(new[] { FlatSize }, new[] { SequenceLength, EmbeddingDimension }),
+            new MultiHeadAttentionLayer<float>(
+                sequenceLength: SequenceLength,
+                embeddingDimension: EmbeddingDimension,
+                headCount: HeadCount,
+                activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+            ,
+            new FlattenLayer<float>(new[] { SequenceLength, EmbeddingDimension }),
+            new DenseLayer<float>(FlatSize, FlatSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: NeuralNetworkTaskType.TextGeneration,
+            complexity: NetworkComplexity.Simple,
+            inputSize: FlatSize,
+            outputSize: FlatSize,
+            layers: layers);
+
+        var model = new NeuralNetwork<float>(architecture);
+
+        var p = model.GetParameters();
+        var deterministic = new float[p.Length];
+        for (int i = 0; i < deterministic.Length; i++)
+        {
+            deterministic[i] = ((i % 23) - 11) / 11.0f;
+        }
+        model.UpdateParameters(new Vector<float>(deterministic));
+
+        return model;
+    }
+
+    private static Tensor<float> CreateTokenTensor(float scalar)
+    {
+        var t = new Tensor<float>(new[] { 1, FlatSize });
+        for (int i = 0; i < t.Length; i++)
+        {
+            t[i] = scalar + (i * 0.01f);
+        }
+        return t;
+    }
+
+    private static void AssertTensorsEqual(Tensor<float> a, Tensor<float> b, float tolerance)
+    {
+        Assert.Equal(a.Shape, b.Shape);
+        for (int i = 0; i < a.Length; i++)
+        {
+            Assert.True(Math.Abs(a[i] - b[i]) <= tolerance, $"Index {i}: {a[i]} != {b[i]}");
+        }
+    }
+
+    private static void AssertTensorsNotEqual(Tensor<float> a, Tensor<float> b, float minAbsDiff)
+    {
+        Assert.Equal(a.Shape, b.Shape);
+
+        float maxAbs = 0f;
+        for (int i = 0; i < a.Length; i++)
+        {
+            float abs = Math.Abs(a[i] - b[i]);
+            if (abs > maxAbs)
+            {
+                maxAbs = abs;
+            }
+        }
+
+        Assert.True(maxAbs >= minAbsDiff, $"Expected tensors to differ by at least {minAbsDiff}, but max diff was {maxAbs}");
+    }
+}
diff --git a/tests/AiDotNet.Tests/StressTests/GpuStressTests.cs b/tests/AiDotNet.Tests/StressTests/GpuStressTests.cs
index 47748f758..58aa68287 100644
--- a/tests/AiDotNet.Tests/StressTests/GpuStressTests.cs
+++ b/tests/AiDotNet.Tests/StressTests/GpuStressTests.cs
@@ -201,15 +201,16 @@ public void Conv2D_LongRun_1KIterations_StablePerformance()
         var lastQuartileAvg = timings.Skip(3 * MediumRunIterations / 4).Average();
 
         // Guard against zero division on very fast hardware
-        double performanceDrift = 0;
-        if (firstQuartileAvg > 0)
+        // Only check for degradation (last > first), not improvement
+        double performanceDegradation = 0;
+        if (firstQuartileAvg > 0 && lastQuartileAvg > firstQuartileAvg)
         {
-            performanceDrift = Math.Abs(lastQuartileAvg - firstQuartileAvg) / firstQuartileAvg;
+            performanceDegradation = (lastQuartileAvg - firstQuartileAvg) / firstQuartileAvg;
         }
 
-        // Performance should not degrade by more than 20%
-        Assert.True(performanceDrift < 0.20,
-            $"Performance degraded by {performanceDrift * 100:F1}% (first: {firstQuartileAvg:F2}ms, last: {lastQuartileAvg:F2}ms)");
+        // Performance should not degrade by more than 20% (improvement is acceptable)
+        Assert.True(performanceDegradation < 0.20,
+            $"Performance degraded by {performanceDegradation * 100:F1}% (first: {firstQuartileAvg:F2}ms, last: {lastQuartileAvg:F2}ms)");
 
         // Memory growth should be minimal
         Assert.True(memoryGrowth < 20_000_000,
diff --git a/tests/AiDotNet.Tests/TestInfrastructure/DiagnosticsEnvironmentCollection.cs b/tests/AiDotNet.Tests/TestInfrastructure/DiagnosticsEnvironmentCollection.cs
new file mode 100644
index 000000000..cc02f7c3f
--- /dev/null
+++ b/tests/AiDotNet.Tests/TestInfrastructure/DiagnosticsEnvironmentCollection.cs
@@ -0,0 +1,28 @@
+using System;
+using AiDotNet.Helpers;
+using Xunit;
+
+namespace AiDotNet.Tests.TestInfrastructure;
+
+[CollectionDefinition(Name, DisableParallelization = true)]
+public sealed class DiagnosticsEnvironmentCollection : ICollectionFixture<DiagnosticsEnvironmentCollection.Fixture>
+{
+    public const string Name = "DiagnosticsEnv";
+
+    public sealed class Fixture : IDisposable
+    {
+        private readonly string? _original;
+
+        public Fixture()
+        {
+            _original = Environment.GetEnvironmentVariable("AIDOTNET_DIAGNOSTICS");
+        }
+
+        public void Dispose()
+        {
+            InferenceDiagnostics.Clear();
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", _original);
+        }
+    }
+}
+
diff --git a/tests/AiDotNet.Tests/UnitTests/Attention/FlashAttentionTests.cs b/tests/AiDotNet.Tests/UnitTests/Attention/FlashAttentionTests.cs
index 1019b5e8d..5b8bd00fe 100644
--- a/tests/AiDotNet.Tests/UnitTests/Attention/FlashAttentionTests.cs
+++ b/tests/AiDotNet.Tests/UnitTests/Attention/FlashAttentionTests.cs
@@ -111,6 +111,41 @@ public void FlashAttention_WithCausalMask_MasksCorrectly()
         }
     }
 
+    [Fact]
+    public void FlashAttention_WithCausalMask_RespectsQueryOffsetForCachedDecoding()
+    {
+        // Arrange
+        int batchSize = 1;
+        int seqLenKV = 6;
+        int seqLenQ = 2;
+        int headDim = 8;
+
+        var query = CreateRandomTensor(batchSize, seqLenQ, headDim, seed: 42);
+        var key = CreateRandomTensor(batchSize, seqLenKV, headDim, seed: 43);
+        var value = CreateRandomTensor(batchSize, seqLenKV, headDim, seed: 44);
+
+        int queryOffset = seqLenKV - seqLenQ;
+        var config = new FlashAttentionConfig { UseCausalMask = true, ReturnAttentionWeights = true };
+
+        // Act
+        var (_, attnWeights) = FlashAttention<float>.Forward(query, key, value, config, queryOffset: queryOffset);
+
+        // Assert - For each query row, positions beyond (queryOffset + qIdx) must be masked
+        Assert.NotNull(attnWeights);
+        for (int b = 0; b < batchSize; b++)
+        {
+            for (int q = 0; q < seqLenQ; q++)
+            {
+                int maxAllowedK = queryOffset + q;
+                for (int k = maxAllowedK + 1; k < seqLenKV; k++)
+                {
+                    float weight = attnWeights[new[] { b, q, k }];
+                    Assert.True(weight < 1e-6f, $"Position (q={q}, k={k}) should be masked but has weight {weight}");
+                }
+            }
+        }
+    }
+
     [Fact]
     public void FlashAttention_AttentionWeightsRowSumToOne()
     {
diff --git a/tests/AiDotNet.Tests/UnitTests/Helpers/InferenceDiagnosticsTests.cs b/tests/AiDotNet.Tests/UnitTests/Helpers/InferenceDiagnosticsTests.cs
new file mode 100644
index 000000000..0deb5e699
--- /dev/null
+++ b/tests/AiDotNet.Tests/UnitTests/Helpers/InferenceDiagnosticsTests.cs
@@ -0,0 +1,50 @@
+using System;
+using AiDotNet.Helpers;
+using Xunit;
+
+namespace AiDotNet.Tests.UnitTests.Helpers;
+
+[Collection(AiDotNet.Tests.TestInfrastructure.DiagnosticsEnvironmentCollection.Name)]
+public class InferenceDiagnosticsTests
+{
+    [Fact]
+    public void InferenceDiagnostics_Disabled_DoesNotRecord()
+    {
+        var original = Environment.GetEnvironmentVariable("AIDOTNET_DIAGNOSTICS");
+        try
+        {
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", null);
+            InferenceDiagnostics.Clear();
+
+            InferenceDiagnostics.RecordDecision("Test", "Feature", enabled: true, reason: "Reason");
+
+            Assert.Empty(InferenceDiagnostics.Snapshot());
+        }
+        finally
+        {
+            InferenceDiagnostics.Clear();
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", original);
+        }
+    }
+
+    [Fact]
+    public void InferenceDiagnostics_Enabled_Records()
+    {
+        var original = Environment.GetEnvironmentVariable("AIDOTNET_DIAGNOSTICS");
+        try
+        {
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", "1");
+            InferenceDiagnostics.Clear();
+
+            InferenceDiagnostics.RecordDecision("Test", "Feature", enabled: true, reason: "Reason");
+
+            var entries = InferenceDiagnostics.Snapshot();
+            Assert.Contains(entries, e => e.Area == "Test" && e.Feature == "Feature" && e.Enabled);
+        }
+        finally
+        {
+            InferenceDiagnostics.Clear();
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", original);
+        }
+    }
+}
diff --git a/tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs b/tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs
new file mode 100644
index 000000000..31a16d1b7
--- /dev/null
+++ b/tests/AiDotNet.Tests/UnitTests/Inference/InferenceOptimizerTests.cs
@@ -0,0 +1,384 @@
+using AiDotNet.Configuration;
+using AiDotNet.Enums;
+using AiDotNet.Inference;
+using AiDotNet.NeuralNetworks;
+using AiDotNet.NeuralNetworks.Attention;
+using AiDotNet.NeuralNetworks.Layers;
+using Xunit;
+
+namespace AiDotNet.Tests.UnitTests.Inference;
+
+[Collection(AiDotNet.Tests.TestInfrastructure.DiagnosticsEnvironmentCollection.Name)]
+public class InferenceOptimizerTests
+{
+    [Fact]
+    public void InferenceOptimizer_WhenDiagnosticsEnabled_RecordsDecisions()
+    {
+        var original = Environment.GetEnvironmentVariable("AIDOTNET_DIAGNOSTICS");
+        try
+        {
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", "1");
+            AiDotNet.Helpers.InferenceDiagnostics.Clear();
+
+            var model = CreateTinyTransformer(taskType: NeuralNetworkTaskType.TextGeneration);
+            var config = new InferenceOptimizationConfig
+            {
+                EnableKVCache = true,
+                EnableFlashAttention = false,
+                EnablePagedKVCache = false,
+                AttentionMasking = AttentionMaskingMode.Auto
+            };
+
+            var optimizer = new InferenceOptimizer<float>(config);
+            _ = optimizer.OptimizeForInference(model, cloneModel: false);
+
+            var entries = AiDotNet.Helpers.InferenceDiagnostics.Snapshot();
+            Assert.Contains(entries, e => e.Area == "InferenceOptimizer" && e.Feature == "KVCachePrecision");
+        }
+        finally
+        {
+            AiDotNet.Helpers.InferenceDiagnostics.Clear();
+            Environment.SetEnvironmentVariable("AIDOTNET_DIAGNOSTICS", original);
+        }
+    }
+
+    [Fact]
+    public void InferenceOptimizer_RewritesMultiHeadAttention_ToFlashAttention_WhenEnabled()
+    {
+        var model = CreateTinyTransformer(taskType: NeuralNetworkTaskType.Regression);
+        Assert.Contains(model.Layers, l => l is MultiHeadAttentionLayer<float>);
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = false,
+            EnableFlashAttention = true,
+            AttentionMasking = AttentionMaskingMode.Disabled
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+        // Clone relies on serialization of every layer in the graph; this test focuses on rewrite behavior.
+        var (optimized, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: false);
+
+        Assert.True(anyApplied);
+        Assert.Contains(optimized.Layers, l => l is FlashAttentionLayer<float>);
+        Assert.DoesNotContain(optimized.Layers, l => l is MultiHeadAttentionLayer<float>);
+    }
+
+    [Fact]
+    public void InferenceOptimizer_RewritesMultiHeadAttention_ToCachedAttention_ForTextGeneration_WhenKVCacheEnabled()
+    {
+        var model = CreateTinyTransformer(taskType: NeuralNetworkTaskType.TextGeneration);
+        Assert.Contains(model.Layers, l => l is MultiHeadAttentionLayer<float>);
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = true,
+            EnableFlashAttention = true,
+            // Paged KV-cache is industry-standard and enabled by default; keep it enabled for this test.
+            AttentionMasking = AttentionMaskingMode.Auto
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+        var (optimized, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: true);
+
+        Assert.True(anyApplied);
+        Assert.Contains(optimized.Layers, l => l is PagedCachedMultiHeadAttention<float>);
+        Assert.DoesNotContain(optimized.Layers, l => l is MultiHeadAttentionLayer<float>);
+
+        foreach (var layer in optimized.Layers)
+        {
+            if (layer is PagedCachedMultiHeadAttention<float> cached)
+            {
+                Assert.True(cached.InferenceMode);
+                Assert.NotNull(cached.Kernel);
+            }
+        }
+    }
+
+    [Fact]
+    public void InferenceOptimizer_RewritesSelfAttention_ToCachedAttention_WhenKVCacheEnabled()
+    {
+        var model = CreateTinySelfAttentionModel(taskType: NeuralNetworkTaskType.TextGeneration);
+        Assert.Contains(model.Layers, l => l is SelfAttentionLayer<float>);
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = true,
+            EnablePagedKVCache = false,
+            EnableFlashAttention = false,
+            AttentionMasking = AttentionMaskingMode.Auto
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+        var (optimized, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: false);
+
+        Assert.True(anyApplied);
+        Assert.Contains(optimized.Layers, l => l is CachedMultiHeadAttention<float>);
+        Assert.DoesNotContain(optimized.Layers, l => l is SelfAttentionLayer<float>);
+
+        // In-place rewrite expected when cloneModel=false.
+        Assert.DoesNotContain(model.Layers, l => l is SelfAttentionLayer<float>);
+    }
+
+    [Fact]
+    public void InferenceOptimizer_SpeculativeDecoding_FallsBackToNGram_WhenSmallNeuralUnavailable()
+    {
+        var model = CreateTinyTransformer(taskType: NeuralNetworkTaskType.TextGeneration);
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = false,
+            EnableFlashAttention = false,
+            EnableSpeculativeDecoding = true,
+            DraftModelType = DraftModelType.SmallNeural
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+
+        // Should never throw: SmallNeural draft models are not available in MVP and must fall back.
+        var (_, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: false);
+
+        Assert.True(anyApplied);
+        Assert.NotNull(optimizer.DraftModel);
+        Assert.Contains("NGramDraftModel", optimizer.DraftModel!.GetType().Name);
+        Assert.True(optimizer.DraftModel!.VocabSize > 0);
+    }
+
+    [Fact]
+    public void InferenceOptimizer_SpeculativeDecoding_FallsBackToNGram_WhenCustomNotProvided()
+    {
+        var model = CreateTinyTransformer(taskType: NeuralNetworkTaskType.TextGeneration);
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = false,
+            EnableFlashAttention = false,
+            EnableSpeculativeDecoding = true,
+            DraftModelType = DraftModelType.Custom
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+
+        // Should never throw: the public facade does not wire custom draft models in MVP.
+        var (_, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: false);
+
+        Assert.True(anyApplied);
+        Assert.NotNull(optimizer.DraftModel);
+        Assert.True(optimizer.DraftModel!.VocabSize > 0);
+    }
+
+    [Fact]
+    public void InferenceOptimizer_WeightOnlyQuantization_RewritesDenseLayer_OnClonedModel_AndPreservesOutputs()
+    {
+        var model = CreateTinyDenseModel();
+
+        var input = new AiDotNet.Tensors.LinearAlgebra.Tensor<float>(new[] { 1, 4 });
+        for (int i = 0; i < input.Length; i++)
+        {
+            input[i] = 0.1f * (i + 1);
+        }
+
+        var baseline = model.Predict(input);
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = false,
+            EnableFlashAttention = false,
+            EnableWeightOnlyQuantization = true
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+        var (optimized, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: true);
+
+        Assert.True(anyApplied);
+        Assert.Contains(optimized.Layers, l => l.GetType().Name.Contains("QuantizedDenseLayer"));
+        Assert.Contains(model.Layers, l => l is DenseLayer<float>);
+
+        var y = optimized.Predict(input);
+        Assert.Equal(baseline.Shape, y.Shape);
+
+        for (int i = 0; i < y.Length; i++)
+        {
+            Assert.True(Math.Abs(baseline[i] - y[i]) < 1e-1f, $"Mismatch at {i}: {baseline[i]} vs {y[i]}");
+        }
+    }
+
+    [Fact]
+    public void InferenceOptimizer_Skips_AttentionLayer_WhenKVCacheEnabled()
+    {
+        var model = CreateTinyAttentionLayerModel();
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = true,
+            EnableFlashAttention = true,
+            AttentionMasking = AttentionMaskingMode.Auto
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+        var (optimized, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: true);
+
+        Assert.False(anyApplied);
+        Assert.Same(model, optimized);
+        Assert.Contains(optimized.Layers, l => l is AttentionLayer<float>);
+    }
+
+    [Fact]
+    public void InferenceOptimizer_Skips_GraphAttentionLayer_WhenKVCacheEnabled()
+    {
+        var model = CreateTinyGraphAttentionModel();
+
+        var config = new InferenceOptimizationConfig
+        {
+            EnableKVCache = true,
+            EnableFlashAttention = true,
+            AttentionMasking = AttentionMaskingMode.Auto
+        };
+
+        var optimizer = new InferenceOptimizer<float>(config);
+
+        // Should not throw: graph attention is not part of inference-time transformer KV-cache rewriting.
+        var (optimized, anyApplied) = optimizer.OptimizeForInference(model, cloneModel: true);
+
+        Assert.False(anyApplied);
+        Assert.Same(model, optimized);
+        Assert.Contains(optimized.Layers, l => l is GraphAttentionLayer<float>);
+    }
+
+    private static Transformer<float> CreateTinyTransformer(NeuralNetworkTaskType taskType)
+    {
+        var architecture = new TransformerArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: taskType,
+            numEncoderLayers: 1,
+            numDecoderLayers: 0,
+            numHeads: 2,
+            modelDimension: 8,
+            feedForwardDimension: 16,
+            complexity: NetworkComplexity.Simple,
+            inputSize: 1,
+            outputSize: 8,
+            dropoutRate: 0.0,
+            maxSequenceLength: 4,
+            vocabularySize: 0,
+            usePositionalEncoding: false);
+
+        return new Transformer<float>(architecture);
+    }
+
+    private static NeuralNetworkBase<float> CreateTinySelfAttentionModel(NeuralNetworkTaskType taskType)
+    {
+        const int seqLen = 4;
+        const int embDim = 8;
+        const int headCount = 2;
+        const int flatSize = seqLen * embDim;
+
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(flatSize),
+            new ReshapeLayer<float>(new[] { flatSize }, new[] { seqLen, embDim }),
+            new SelfAttentionLayer<float>(seqLen, embDim, headCount, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>()),
+            new FlattenLayer<float>(new[] { seqLen, embDim }),
+            new DenseLayer<float>(flatSize, flatSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: taskType,
+            complexity: NetworkComplexity.Simple,
+            inputSize: flatSize,
+            outputSize: flatSize,
+            layers: layers);
+
+        var model = new NeuralNetwork<float>(architecture);
+
+        // Ensure parameters are initialized deterministically for stable tests.
+        var p = model.GetParameters();
+        var deterministic = new float[p.Length];
+        for (int i = 0; i < deterministic.Length; i++)
+        {
+            deterministic[i] = ((i % 17) - 8) / 8.0f;
+        }
+        model.UpdateParameters(new AiDotNet.Tensors.LinearAlgebra.Vector<float>(deterministic));
+
+        return model;
+    }
+
+    private static NeuralNetworkBase<float> CreateTinyDenseModel()
+    {
+        const int inSize = 4;
+        const int outSize = 3;
+
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(inSize),
+            new DenseLayer<float>(inSize, outSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: NeuralNetworkTaskType.Regression,
+            complexity: NetworkComplexity.Simple,
+            inputSize: inSize,
+            outputSize: outSize,
+            layers: layers);
+
+        var model = new NeuralNetwork<float>(architecture);
+
+        var p = model.GetParameters();
+        var deterministic = new float[p.Length];
+        for (int i = 0; i < deterministic.Length; i++)
+        {
+            deterministic[i] = ((i % 13) - 6) / 6.0f;
+        }
+        model.UpdateParameters(new AiDotNet.Tensors.LinearAlgebra.Vector<float>(deterministic));
+
+        return model;
+    }
+
+    private static NeuralNetworkBase<float> CreateTinyAttentionLayerModel()
+    {
+        const int inputSize = 8;
+        const int attentionSize = 8;
+
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(inputSize),
+            new AttentionLayer<float>(inputSize, attentionSize, activation: (AiDotNet.Interfaces.IActivationFunction<float>?)null),
+            new DenseLayer<float>(attentionSize, attentionSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: NeuralNetworkTaskType.Regression,
+            complexity: NetworkComplexity.Simple,
+            inputSize: inputSize,
+            outputSize: attentionSize,
+            layers: layers);
+
+        return new NeuralNetwork<float>(architecture);
+    }
+
+    private static NeuralNetworkBase<float> CreateTinyGraphAttentionModel()
+    {
+        const int inputSize = 8;
+        const int outputSize = 8;
+
+        var layers = new System.Collections.Generic.List<AiDotNet.Interfaces.ILayer<float>>
+        {
+            new InputLayer<float>(inputSize),
+            new GraphAttentionLayer<float>(inputSize, outputSize, numHeads: 1),
+            new DenseLayer<float>(outputSize, outputSize, activationFunction: new AiDotNet.ActivationFunctions.IdentityActivation<float>())
+        };
+
+        var architecture = new NeuralNetworkArchitecture<float>(
+            inputType: InputType.OneDimensional,
+            taskType: NeuralNetworkTaskType.Regression,
+            complexity: NetworkComplexity.Simple,
+            inputSize: inputSize,
+            outputSize: outputSize,
+            layers: layers);
+
+        return new NeuralNetwork<float>(architecture);
+    }
+}
diff --git a/tests/AiDotNet.Tests/UnitTests/Inference/KVCacheTests.cs b/tests/AiDotNet.Tests/UnitTests/Inference/KVCacheTests.cs
new file mode 100644
index 000000000..14bfdbd4b
--- /dev/null
+++ b/tests/AiDotNet.Tests/UnitTests/Inference/KVCacheTests.cs
@@ -0,0 +1,131 @@
+using AiDotNet.Inference;
+using AiDotNet.Tensors.LinearAlgebra;
+using Xunit;
+
+namespace AiDotNet.Tests.UnitTests.Inference;
+
+public class KVCacheTests
+{
+    [Fact]
+    public void KVCache_AppendAcrossLayers_MaintainsIndependentLengths()
+    {
+        var config = new KVCacheConfig
+        {
+            NumLayers = 2,
+            NumHeads = 1,
+            HeadDimension = 2,
+            MaxSequenceLength = 8,
+            MaxBatchSize = 1,
+            PreAllocate = true
+        };
+
+        var cache = new KVCache<float>(config);
+
+        var keys0 = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        var values0 = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        keys0[new[] { 0, 0, 0, 0 }] = 1f;
+        keys0[new[] { 0, 0, 0, 1 }] = 2f;
+        keys0[new[] { 0, 0, 1, 0 }] = 3f;
+        keys0[new[] { 0, 0, 1, 1 }] = 4f;
+        values0[new[] { 0, 0, 0, 0 }] = 5f;
+        values0[new[] { 0, 0, 0, 1 }] = 6f;
+        values0[new[] { 0, 0, 1, 0 }] = 7f;
+        values0[new[] { 0, 0, 1, 1 }] = 8f;
+
+        var (layer0Keys, _) = cache.Append(0, keys0, values0);
+        Assert.Equal(2, layer0Keys.Shape[2]);
+
+        var keys1 = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        var values1 = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        keys1[new[] { 0, 0, 0, 0 }] = 10f;
+        keys1[new[] { 0, 0, 0, 1 }] = 11f;
+        keys1[new[] { 0, 0, 1, 0 }] = 12f;
+        keys1[new[] { 0, 0, 1, 1 }] = 13f;
+        values1[new[] { 0, 0, 0, 0 }] = 14f;
+        values1[new[] { 0, 0, 0, 1 }] = 15f;
+        values1[new[] { 0, 0, 1, 0 }] = 16f;
+        values1[new[] { 0, 0, 1, 1 }] = 17f;
+
+        var (layer1Keys, _) = cache.Append(1, keys1, values1);
+        Assert.Equal(2, layer1Keys.Shape[2]);
+        Assert.Equal(10f, layer1Keys[new[] { 0, 0, 0, 0 }]);
+        Assert.Equal(13f, layer1Keys[new[] { 0, 0, 1, 1 }]);
+
+        var (layer0KeysAfter, _) = cache.GetCached(0, batchSize: 1);
+        Assert.Equal(2, layer0KeysAfter.Shape[2]);
+        Assert.Equal(1f, layer0KeysAfter[new[] { 0, 0, 0, 0 }]);
+        Assert.Equal(4f, layer0KeysAfter[new[] { 0, 0, 1, 1 }]);
+    }
+
+    [Fact]
+    public void KVCache_Float16Storage_RoundTripsValues()
+    {
+        var config = new KVCacheConfig
+        {
+            NumLayers = 1,
+            NumHeads = 1,
+            HeadDimension = 2,
+            MaxSequenceLength = 8,
+            MaxBatchSize = 1,
+            PreAllocate = true,
+            DataType = CacheDataType.Float16
+        };
+
+        var cache = new KVCache<float>(config);
+
+        var keys = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        var values = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        keys[new[] { 0, 0, 0, 0 }] = 1f;
+        keys[new[] { 0, 0, 0, 1 }] = 2f;
+        keys[new[] { 0, 0, 1, 0 }] = 3f;
+        keys[new[] { 0, 0, 1, 1 }] = 4f;
+        values[new[] { 0, 0, 0, 0 }] = 5f;
+        values[new[] { 0, 0, 0, 1 }] = 6f;
+        values[new[] { 0, 0, 1, 0 }] = 7f;
+        values[new[] { 0, 0, 1, 1 }] = 8f;
+
+        var (cachedKeys, cachedValues) = cache.Append(0, keys, values);
+        Assert.Equal(2, cachedKeys.Shape[2]);
+        Assert.Equal(1f, cachedKeys[new[] { 0, 0, 0, 0 }]);
+        Assert.Equal(4f, cachedKeys[new[] { 0, 0, 1, 1 }]);
+        Assert.Equal(5f, cachedValues[new[] { 0, 0, 0, 0 }]);
+        Assert.Equal(8f, cachedValues[new[] { 0, 0, 1, 1 }]);
+    }
+
+    [Fact]
+    public void KVCache_Int8Storage_RoundTripsApproximately()
+    {
+        var config = new KVCacheConfig
+        {
+            NumLayers = 1,
+            NumHeads = 1,
+            HeadDimension = 2,
+            MaxSequenceLength = 8,
+            MaxBatchSize = 1,
+            PreAllocate = true,
+            DataType = CacheDataType.Int8
+        };
+
+        var cache = new KVCache<float>(config);
+
+        var keys = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        var values = new Tensor<float>(new[] { 1, 1, 2, 2 });
+        keys[new[] { 0, 0, 0, 0 }] = 1f;
+        keys[new[] { 0, 0, 0, 1 }] = 2f;
+        keys[new[] { 0, 0, 1, 0 }] = 3f;
+        keys[new[] { 0, 0, 1, 1 }] = 4f;
+        values[new[] { 0, 0, 0, 0 }] = 5f;
+        values[new[] { 0, 0, 0, 1 }] = 6f;
+        values[new[] { 0, 0, 1, 0 }] = 7f;
+        values[new[] { 0, 0, 1, 1 }] = 8f;
+
+        var (cachedKeys, cachedValues) = cache.Append(0, keys, values);
+        Assert.Equal(2, cachedKeys.Shape[2]);
+
+        // Int8 quantization is approximate; tolerate small error.
+        Assert.InRange(Math.Abs(cachedKeys[new[] { 0, 0, 0, 0 }] - 1f), 0f, 0.1f);
+        Assert.InRange(Math.Abs(cachedKeys[new[] { 0, 0, 1, 1 }] - 4f), 0f, 0.1f);
+        Assert.InRange(Math.Abs(cachedValues[new[] { 0, 0, 0, 0 }] - 5f), 0f, 0.1f);
+        Assert.InRange(Math.Abs(cachedValues[new[] { 0, 0, 1, 1 }] - 8f), 0f, 0.1f);
+    }
+}
diff --git a/tests/AiDotNet.Tests/UnitTests/Inference/PagedAttentionTests.cs b/tests/AiDotNet.Tests/UnitTests/Inference/PagedAttentionTests.cs
index a3ac7acf9..1caf12632 100644
--- a/tests/AiDotNet.Tests/UnitTests/Inference/PagedAttentionTests.cs
+++ b/tests/AiDotNet.Tests/UnitTests/Inference/PagedAttentionTests.cs
@@ -1,4 +1,5 @@
 using AiDotNet.Inference.PagedAttention;
+using AiDotNet.Inference.Quantization;
 using Xunit;
 
 namespace AiDotNet.Tests.UnitTests.Inference;
@@ -777,6 +778,66 @@ public void PagedAttentionKernel_ComputeBatchedAttention_ProcessesMultiple()
         // Assert
         Assert.True(outputs.Any(v => v != 0));
     }
+
+    [Fact]
+    public void PagedAttentionKernel_ForwardQuantized_MatchesFloatWithinTolerance()
+    {
+        // Arrange
+        using var cacheFloat = CreateTestCache();
+        cacheFloat.AllocateSequence(1, 1);
+        var kernelFloat = new PagedAttentionKernel<float>(cacheFloat);
+
+        using var cacheQ = CreateTestCache();
+        cacheQ.AllocateSequence(1, 1);
+        var kernelQ = new PagedAttentionKernel<float>(cacheQ);
+
+        int hiddenDim = kernelFloat.Config.NumHeads * kernelFloat.Config.HeadDimension;
+        int projDim = hiddenDim;
+
+        var rnd = new Random(42);
+        var hidden = new float[hiddenDim];
+        for (int i = 0; i < hidden.Length; i++)
+        {
+            hidden[i] = (float)(rnd.NextDouble() * 0.2 - 0.1);
+        }
+
+        float[] MakeWeights(int rows, int cols)
+        {
+            var w = new float[rows * cols];
+            for (int i = 0; i < w.Length; i++)
+            {
+                w[i] = (float)(rnd.NextDouble() * 0.02 - 0.01);
+            }
+            return w;
+        }
+
+        var wQ = MakeWeights(projDim, hiddenDim);
+        var wK = MakeWeights(projDim, hiddenDim);
+        var wV = MakeWeights(projDim, hiddenDim);
+        var wO = MakeWeights(hiddenDim, projDim);
+
+        var qWQ = Int8WeightOnlyQuantization.QuantizePerRow(wQ, projDim, hiddenDim);
+        var qWK = Int8WeightOnlyQuantization.QuantizePerRow(wK, projDim, hiddenDim);
+        var qWV = Int8WeightOnlyQuantization.QuantizePerRow(wV, projDim, hiddenDim);
+        var qWO = Int8WeightOnlyQuantization.QuantizePerRow(wO, hiddenDim, projDim);
+
+        var outFloat = new float[hiddenDim];
+        var outQ = new float[hiddenDim];
+
+        // Act
+        kernelFloat.Forward(hidden, wQ, wK, wV, wO, sequenceId: 1, position: 0, layer: 0, output: outFloat);
+        kernelQ.ForwardQuantized(hidden, qWQ, qWK, qWV, qWO, sequenceId: 1, position: 0, layer: 0, output: outQ);
+
+        // Assert
+        float maxAbsDiff = 0f;
+        for (int i = 0; i < hiddenDim; i++)
+        {
+            float diff = MathF.Abs(outFloat[i] - outQ[i]);
+            if (diff > maxAbsDiff) maxAbsDiff = diff;
+        }
+
+        Assert.True(maxAbsDiff <= 1e-2f, $"Max abs diff was {maxAbsDiff}");
+    }
 }
 
 /// <summary>
@@ -850,8 +911,14 @@ public void PagedAttentionServer_ForkSequence_ForBeamSearch()
         Assert.Equal(4, server.GetStats().ActiveSequences);
     }
 
+#if NET471
+    [Fact(Skip = "4GB contiguous allocation exceeds typical .NET Framework single-object limits; validated on net8.0.")]
+    public void PagedAttentionServer_ForModel_CreatesValidServer()
+    {
+    }
+#else
     [Fact]
-    [Trait("Category", "Integration")]  // Skip on net471 - 4GB allocation exceeds .NET Framework array size limits
+    [Trait("Category", "Integration")]
     public void PagedAttentionServer_ForModel_CreatesValidServer()
     {
         // Act
@@ -861,6 +928,7 @@ public void PagedAttentionServer_ForModel_CreatesValidServer()
         Assert.NotNull(server.KVCache);
         Assert.NotNull(server.Kernel);
     }
+#endif
 }
 
 /// <summary>
diff --git a/tests/AiDotNet.Tests/UnitTests/Inference/SpeculativeDecodingTests.cs b/tests/AiDotNet.Tests/UnitTests/Inference/SpeculativeDecodingTests.cs
index a584bbbb6..0d0aa8a9e 100644
--- a/tests/AiDotNet.Tests/UnitTests/Inference/SpeculativeDecodingTests.cs
+++ b/tests/AiDotNet.Tests/UnitTests/Inference/SpeculativeDecodingTests.cs
@@ -608,4 +608,122 @@ public void TreeSpeculativeConfig_DefaultValues_AreReasonable()
         Assert.Equal(4, config.MaxDepth);
         Assert.Equal(16, config.MaxNodes);
     }
+
+    [Fact]
+    public async Task SpeculativeDecoder_GenerateAsync_TreeMode_RecordsDraftWork()
+    {
+        // Arrange
+        var draftModel = new NGramDraftModel<float>(ngramSize: 2, vocabSize: 20, seed: 42);
+        var corpus = new List<Vector<int>>
+        {
+            new Vector<int>(Enumerable.Range(0, 200).Select(i => i % 10).ToArray())
+        };
+        draftModel.Train(corpus);
+
+        Func<Vector<int>, Matrix<float>> targetForward = tokens =>
+        {
+            var probs = new Matrix<float>(tokens.Length, 20);
+            for (int i = 0; i < tokens.Length; i++)
+            {
+                for (int v = 0; v < 20; v++) probs[i, v] = 0.05f;
+                probs[i, 7] = 0.8f;
+            }
+            return probs;
+        };
+
+        var decoder = new SpeculativeDecoder<float>(
+            draftModel,
+            targetForward,
+            new SpeculativeDecodingConfig<float>
+            {
+                UseTreeSpeculation = true,
+                TreeBranchFactor = 3,
+                MaxTreeDepth = 3,
+                Seed = 42
+            });
+
+        // Act
+        var result = await decoder.GenerateAsync(new Vector<int>(new[] { 1 }), maxNewTokens: 8, temperature: 1.0f);
+
+        // Assert
+        Assert.True(result.NumGenerated > 0);
+        Assert.True(decoder.TotalDraftTokens > 0);
+        Assert.True(result.StepStatistics.Count > 0);
+        Assert.True(result.TokensPerVerification > 0);
+        Assert.True(result.StepStatistics.Any(s => s.DraftTokens > 0));
+    }
+
+    [Fact]
+    public async Task SpeculativeDecoder_AdaptiveDraftLength_ReducesDraftTokens_WhenAcceptanceLow()
+    {
+        // Arrange
+        var config = new SpeculativeDecodingConfig<float>
+        {
+            NumDraftTokens = 4,
+            AdaptiveDraftLength = true,
+            MinAcceptanceRate = 0.8f
+        };
+
+        // Target strongly prefers token 2.
+        Func<Vector<int>, Matrix<float>> targetForward = tokens =>
+        {
+            var probs = new Matrix<float>(tokens.Length, 10);
+            for (int i = 0; i < tokens.Length; i++)
+            {
+                for (int v = 0; v < 10; v++) probs[i, v] = 0.0001f;
+                probs[i, 2] = 0.999f;
+            }
+            return probs;
+        };
+
+        // Draft consistently proposes token 1 => low acceptance.
+        var draft = new DeterministicDraftModel(vocabSize: 10, tokenId: 1);
+        var decoder = new SpeculativeDecoder<float>(draft, targetForward, config);
+
+        // Act
+        _ = await decoder.GenerateAsync(new Vector<int>(new[] { 0 }), maxNewTokens: 24, temperature: 1.0f);
+
+        // Assert
+        Assert.True(decoder.CurrentDraftTokens < 4);
+    }
+}
+
+internal sealed class DeterministicDraftModel : IDraftModel<float>
+{
+    private readonly int _vocabSize;
+    private readonly int _tokenId;
+
+    public DeterministicDraftModel(int vocabSize, int tokenId)
+    {
+        _vocabSize = vocabSize;
+        _tokenId = tokenId;
+    }
+
+    public int MaxDraftTokens => 32;
+
+    public int VocabSize => _vocabSize;
+
+    public DraftResult<float> GenerateDraft(Vector<int> inputTokens, int numDraftTokens, float temperature)
+    {
+        int n = Math.Max(0, numDraftTokens);
+        var tokens = new Vector<int>(Enumerable.Repeat(_tokenId, n).ToArray());
+
+        var probs = new Matrix<float>(n, _vocabSize);
+        for (int i = 0; i < n; i++)
+        {
+            probs[i, _tokenId] = 1.0f;
+        }
+
+        var tokenProbs = new Vector<float>(Enumerable.Repeat(1.0f, n).ToArray());
+        return new DraftResult<float>
+        {
+            Tokens = tokens,
+            TokenProbabilities = tokenProbs,
+            Probabilities = probs
+        };
+    }
+
+    public void Reset()
+    {
+    }
 }
diff --git a/tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs b/tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs
index 5f730e6ed..9cf03e445 100644
--- a/tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs
+++ b/tests/AiDotNet.Tests/UnitTests/Serving/ContinuousBatchingTests.cs
@@ -1,5 +1,6 @@
 using AiDotNet.Tensors.LinearAlgebra;
 using AiDotNet.Serving.ContinuousBatching;
+using AiDotNet.Inference.SpeculativeDecoding;
 using Xunit;
 
 namespace AiDotNet.Tests.UnitTests.Serving;
@@ -488,6 +489,290 @@ public void ContinuousBatcher_StartStop_Works()
         Assert.False(isNowRunning);
     }
 
+    [Fact]
+    public void ContinuousBatcher_SpeculationPolicy_ForceOn_GeneratesMultipleTokensPerStep()
+    {
+        // Arrange
+        var config = new ContinuousBatcherConfig
+        {
+            AutoStart = false,
+            EosTokenId = 2,
+            EnableSpeculativeDecoding = true,
+            SpeculationPolicy = AiDotNet.Configuration.SpeculationPolicy.ForceOn,
+            SpeculationDepth = 3
+        };
+
+        // Target model: always makes token 5 overwhelmingly likely for every position.
+        Tensor<float> mockModel(Tensor<float> input)
+        {
+            var vocabSize = 10;
+            int seqLen = input.Shape[1];
+            var logits = new Tensor<float>(new[] { 1, seqLen, vocabSize });
+            for (int pos = 0; pos < seqLen; pos++)
+            {
+                for (int i = 0; i < vocabSize; i++)
+                {
+                    logits[new[] { 0, pos, i }] = i == 5 ? 100f : -100f;
+                }
+            }
+            return logits;
+        }
+
+        var draft = new DeterministicDraftModel(vocabSize: 10, tokenId: 5);
+        using var batcher = new ContinuousBatcher<float>(config, mockModel, draftModel: draft);
+
+        var request = new GenerationRequest<float>
+        {
+            PromptTokenIds = new List<int> { 1, 2, 3 },
+            MaxNewTokens = 10,
+            Temperature = 1.0f
+        };
+
+        var sequence = new SequenceState<float>(request);
+
+        var scheduler = GetSchedulerFromBatcher(batcher);
+        scheduler.AddSequence(sequence);
+
+        // Act
+        int tokensGenerated = batcher.Step();
+
+        // Assert
+        Assert.True(tokensGenerated > 1);
+        Assert.True(batcher.LastStepUsedSpeculation);
+        Assert.True(batcher.LastStepSpeculationTokens > 1);
+    }
+
+    [Fact]
+    public void ContinuousBatcher_SpeculationPolicy_ForceOff_DisablesSpeculation()
+    {
+        // Arrange
+        var config = new ContinuousBatcherConfig
+        {
+            AutoStart = false,
+            EnableSpeculativeDecoding = true,
+            SpeculationPolicy = AiDotNet.Configuration.SpeculationPolicy.ForceOff,
+            SpeculationDepth = 3
+        };
+
+        Tensor<float> mockModel(Tensor<float> input)
+        {
+            var vocabSize = 10;
+            var logits = new Tensor<float>(new[] { 1, 1, vocabSize });
+            logits[new[] { 0, 0, 5 }] = 10f;
+            return logits;
+        }
+
+        var draft = new DeterministicDraftModel(vocabSize: 10, tokenId: 5);
+        using var batcher = new ContinuousBatcher<float>(config, mockModel, draftModel: draft);
+
+        var request = new GenerationRequest<float>
+        {
+            PromptTokenIds = new List<int> { 1 },
+            MaxNewTokens = 10
+        };
+
+        var sequence = new SequenceState<float>(request);
+
+        var scheduler = GetSchedulerFromBatcher(batcher);
+        scheduler.AddSequence(sequence);
+
+        // Act
+        int tokensGenerated = batcher.Step();
+
+        // Assert - baseline path generates one token per sequence per step
+        Assert.Equal(1, tokensGenerated);
+        Assert.False(batcher.LastStepUsedSpeculation);
+        Assert.Equal(0, batcher.LastStepSpeculationTokens);
+    }
+
+    [Fact]
+    public void ContinuousBatcher_SpeculationPolicy_Auto_BacksOff_WhenAcceptanceRateLow()
+    {
+        // Arrange
+        var config = new ContinuousBatcherConfig
+        {
+            AutoStart = false,
+            EosTokenId = 2,
+            EnableSpeculativeDecoding = true,
+            SpeculationPolicy = AiDotNet.Configuration.SpeculationPolicy.Auto,
+            SpeculationDepth = 8,
+            SchedulerConfig = new BatchSchedulerConfig { MaxBatchSize = 8 }
+        };
+
+        // Target model strongly prefers token 5 at every position.
+        Tensor<float> mockModel(Tensor<float> input)
+        {
+            var vocabSize = 10;
+            int seqLen = input.Shape[1];
+            var logits = new Tensor<float>(new[] { 1, seqLen, vocabSize });
+            for (int pos = 0; pos < seqLen; pos++)
+            {
+                for (int i = 0; i < vocabSize; i++)
+                {
+                    logits[new[] { 0, pos, i }] = i == 5 ? 100f : -100f;
+                }
+            }
+            return logits;
+        }
+
+        // Draft always proposes token 4 => low acceptance.
+        var draft = new DeterministicDraftModel(vocabSize: 10, tokenId: 4);
+        using var batcher = new ContinuousBatcher<float>(config, mockModel, draftModel: draft);
+
+        var request = new GenerationRequest<float>
+        {
+            PromptTokenIds = new List<int> { 1, 2, 3 },
+            MaxNewTokens = 64,
+            Temperature = 1.0f
+        };
+
+        var sequence = new SequenceState<float>(request);
+        var scheduler = GetSchedulerFromBatcher(batcher);
+        scheduler.AddSequence(sequence);
+
+        // Act: run enough steps to gather acceptance-rate evidence and trigger auto backoff.
+        bool sawAutoBackoff = false;
+        for (int i = 0; i < 12; i++)
+        {
+            batcher.Step();
+            if (!batcher.LastStepUsedSpeculation &&
+                batcher.LastStepSpeculationReason.StartsWith("AutoBackoff(LowAcceptanceRate="))
+            {
+                sawAutoBackoff = true;
+                break;
+            }
+        }
+
+        // Assert
+        Assert.True(sawAutoBackoff);
+    }
+
+    [Fact]
+    public void ContinuousBatcher_SpeculationPolicy_ThroughputFirst_BacksOff_WhenBatchSizeGreaterThanOne()
+    {
+        var config = new ContinuousBatcherConfig
+        {
+            AutoStart = false,
+            EosTokenId = 2,
+            EnableSpeculativeDecoding = true,
+            SpeculationPolicy = AiDotNet.Configuration.SpeculationPolicy.ThroughputFirst,
+            SpeculationDepth = 4,
+            SchedulerConfig = new BatchSchedulerConfig { MaxBatchSize = 4 }
+        };
+
+        Tensor<float> mockModel(Tensor<float> input)
+        {
+            var vocabSize = 10;
+            int seqLen = input.Shape[1];
+            var logits = new Tensor<float>(new[] { 1, seqLen, vocabSize });
+            for (int pos = 0; pos < seqLen; pos++)
+            {
+                logits[new[] { 0, pos, 5 }] = 10f;
+            }
+            return logits;
+        }
+
+        var draft = new DeterministicDraftModel(vocabSize: 10, tokenId: 5);
+        using var batcher = new ContinuousBatcher<float>(config, mockModel, draftModel: draft);
+
+        var scheduler = GetSchedulerFromBatcher(batcher);
+        scheduler.AddSequence(new SequenceState<float>(new GenerationRequest<float> { PromptTokenIds = new List<int> { 1 }, MaxNewTokens = 10 }));
+        scheduler.AddSequence(new SequenceState<float>(new GenerationRequest<float> { PromptTokenIds = new List<int> { 1 }, MaxNewTokens = 10 }));
+
+        batcher.Step();
+
+        Assert.False(batcher.LastStepUsedSpeculation);
+        Assert.Equal("ThroughputFirst(Backoff)", batcher.LastStepSpeculationReason);
+    }
+
+    [Fact]
+    public void ContinuousBatcher_SpeculationPolicy_LatencyFirst_AllowsSpeculation_WithBatchSizeGreaterThanOne()
+    {
+        var config = new ContinuousBatcherConfig
+        {
+            AutoStart = false,
+            EosTokenId = 2,
+            EnableSpeculativeDecoding = true,
+            SpeculationPolicy = AiDotNet.Configuration.SpeculationPolicy.LatencyFirst,
+            SpeculationDepth = 4,
+            SchedulerConfig = new BatchSchedulerConfig { MaxBatchSize = 4 }
+        };
+
+        Tensor<float> mockModel(Tensor<float> input)
+        {
+            var vocabSize = 10;
+            int seqLen = input.Shape[1];
+            var logits = new Tensor<float>(new[] { 1, seqLen, vocabSize });
+            for (int pos = 0; pos < seqLen; pos++)
+            {
+                logits[new[] { 0, pos, 5 }] = 10f;
+            }
+            return logits;
+        }
+
+        var draft = new DeterministicDraftModel(vocabSize: 10, tokenId: 5);
+        using var batcher = new ContinuousBatcher<float>(config, mockModel, draftModel: draft);
+
+        var scheduler = GetSchedulerFromBatcher(batcher);
+        scheduler.AddSequence(new SequenceState<float>(new GenerationRequest<float> { PromptTokenIds = new List<int> { 1 }, MaxNewTokens = 10 }));
+        scheduler.AddSequence(new SequenceState<float>(new GenerationRequest<float> { PromptTokenIds = new List<int> { 1 }, MaxNewTokens = 10 }));
+
+        batcher.Step();
+
+        Assert.True(batcher.LastStepUsedSpeculation);
+        Assert.DoesNotContain("Backoff", batcher.LastStepSpeculationReason);
+    }
+
+    [Fact]
+    public void ContinuousBatcher_SpeculativeDecoding_DisablesAfterFailure()
+    {
+        // Arrange
+        var config = new ContinuousBatcherConfig
+        {
+            AutoStart = false,
+            EnableSpeculativeDecoding = true,
+            SpeculationPolicy = AiDotNet.Configuration.SpeculationPolicy.ForceOn,
+            SpeculationDepth = 3
+        };
+
+        Tensor<float> mockModel(Tensor<float> input)
+        {
+            var vocabSize = 10;
+            var logits = new Tensor<float>(new[] { 1, 1, vocabSize });
+            logits[new[] { 0, 0, 5 }] = 10f;
+            return logits;
+        }
+
+        var throwingDraft = new ThrowingDraftModel(vocabSize: 10);
+        using var batcher = new ContinuousBatcher<float>(config, mockModel, draftModel: throwingDraft);
+
+        var request = new GenerationRequest<float>
+        {
+            PromptTokenIds = new List<int> { 1 },
+            MaxNewTokens = 10
+        };
+
+        var sequence = new SequenceState<float>(request);
+
+        var scheduler = GetSchedulerFromBatcher(batcher);
+        scheduler.AddSequence(sequence);
+
+        // Act
+        int tokensGeneratedFirst = batcher.Step();
+        bool usedSpeculationFirst = batcher.LastStepUsedSpeculation;
+
+        int tokensGeneratedSecond = batcher.Step();
+        bool usedSpeculationSecond = batcher.LastStepUsedSpeculation;
+
+        // Assert
+        Assert.Equal(1, tokensGeneratedFirst); // falls back to baseline
+        Assert.True(usedSpeculationFirst); // ForceOn decision, even though it failed internally
+
+        Assert.Equal(1, tokensGeneratedSecond);
+        Assert.False(usedSpeculationSecond); // disabled after failure
+        Assert.Equal("DisabledDueToFailure", batcher.LastStepSpeculationReason);
+    }
+
     [Fact]
     public async Task ContinuousBatcher_GenerateAsync_ReturnsCancellableTask()
     {
@@ -625,4 +910,66 @@ private static BatchScheduler<T> GetSchedulerFromBatcher<T>(ContinuousBatcher<T>
     }
 
     #endregion
+
+    private sealed class DeterministicDraftModel : IDraftModel<float>
+    {
+        public int MaxDraftTokens => 16;
+        public int VocabSize { get; }
+
+        private readonly int _tokenId;
+
+        public DeterministicDraftModel(int vocabSize, int tokenId)
+        {
+            VocabSize = vocabSize;
+            _tokenId = tokenId;
+        }
+
+        public DraftResult<float> GenerateDraft(Vector<int> inputTokens, int numDraftTokens, float temperature)
+        {
+            var tokens = new Vector<int>(numDraftTokens);
+            var tokenProbs = new Vector<float>(numDraftTokens);
+            var probs = new Matrix<float>(numDraftTokens, VocabSize);
+
+            for (int i = 0; i < numDraftTokens; i++)
+            {
+                tokens[i] = _tokenId;
+                tokenProbs[i] = 1.0f;
+                for (int v = 0; v < VocabSize; v++)
+                {
+                    probs[i, v] = v == _tokenId ? 1.0f : 0.0f;
+                }
+            }
+
+            return new DraftResult<float>
+            {
+                Tokens = tokens,
+                TokenProbabilities = tokenProbs,
+                Probabilities = probs
+            };
+        }
+
+        public void Reset()
+        {
+        }
+    }
+
+    private sealed class ThrowingDraftModel : IDraftModel<float>
+    {
+        public int MaxDraftTokens => 16;
+        public int VocabSize { get; }
+
+        public ThrowingDraftModel(int vocabSize)
+        {
+            VocabSize = vocabSize;
+        }
+
+        public DraftResult<float> GenerateDraft(Vector<int> inputTokens, int numDraftTokens, float temperature)
+        {
+            throw new InvalidOperationException("Draft model failure (test).");
+        }
+
+        public void Reset()
+        {
+        }
+    }
 }