intel
diff --git a/‎IGC/VectorCompiler/CMCL/lib/Headers/cm-cl/detail/builtins.h‎
Lines changed: 4 additions & 1 deletion b/‎IGC/VectorCompiler/CMCL/lib/Headers/cm-cl/detail/builtins.h‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/CMCL/lib/Headers/cm-cl/math.h‎
Lines changed: 22 additions & 1 deletion b/‎IGC/VectorCompiler/CMCL/lib/Headers/cm-cl/math.h‎
Lines changed: 22 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/CMCL/lib/Support/TranslationDescription.json‎
Lines changed: 16 additions & 1 deletion b/‎IGC/VectorCompiler/CMCL/lib/Support/TranslationDescription.json‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/cmake/supported_platforms_list.cmake‎
Lines changed: 2 additions & 1 deletion b/‎IGC/VectorCompiler/cmake/supported_platforms_list.cmake‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/igcdeps/include/vc/igcdeps/cmc.h‎
Lines changed: 3 additions & 1 deletion b/‎IGC/VectorCompiler/igcdeps/include/vc/igcdeps/cmc.h‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/igcdeps/src/TranslationInterface.cpp‎
Lines changed: 10 additions & 0 deletions b/‎IGC/VectorCompiler/igcdeps/src/TranslationInterface.cpp‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎IGC/VectorCompiler/igcdeps/src/cmc.cpp‎
Lines changed: 23 additions & 0 deletions b/‎IGC/VectorCompiler/igcdeps/src/cmc.cpp‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎IGC/VectorCompiler/include/GenX.h‎
Lines changed: 2 additions & 1 deletion b/‎IGC/VectorCompiler/include/GenX.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/include/GenXPassRegistry.h‎
Lines changed: 5 additions & 1 deletion b/‎IGC/VectorCompiler/include/GenXPassRegistry.h‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎IGC/VectorCompiler/include/GenXSubtarget.h‎
Lines changed: 62 additions & 2 deletions b/‎IGC/VectorCompiler/include/GenXSubtarget.h‎
Lines changed: 62 additions & 2 deletions
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2021-2024 Intel Corporation
+Copyright (C) 2021-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -185,6 +185,9 @@ uint32_t __cm_cl_group_id_z();
 
 int __cm_cl_hw_thread_id();
 
+template <typename T, typename U>
+T __cm_cl_packed_4bit_upconvert_lut(T lut, U src);
+
 void __cm_cl_barrier();
 void __cm_cl_sbarrier(uint8_t);
 void __cm_cl_fence(memory_order semantics, memory_scope scope);
 
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2021-2024 Intel Corporation
+Copyright (C) 2021-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -379,6 +379,27 @@ vector<T, width> cosine(vector<T, width> src, cm::tag::fast_t) {
 
 /*==========================================================*/
 
+template <int Index, typename T, int NumElts> // Index is forwarded as the argument of the strided select on Src below
+auto upconvert_4bit_lut(vector<uint32_t, 16> Lut, vector<T, NumElts> Src) { // deduced return type is vector<uint32_t, Width>
+  constexpr int Stride = sizeof(uint32_t) / sizeof(T); // number of T-sized elements that fit in one 32-bit dword
+  constexpr int Width = NumElts / Stride; // element count of both the selected source and the result
+  constexpr int VWidth = Width / 16; // 1 when Width == 16, 2 when Width == 32 (the only widths the static_assert allows)
+
+  static_assert(Width == 16 || Width == 32,
+                "upconvert_4bit_lut expects 16 or 32 elements");
+
+  vector<uint32_t, Width> Res;
+
+  vector<T, Width> _Src = Src.template select<Width, Stride>(Index); // NOTE(review): presumably picks Width elements with step Stride starting at Index - confirm against vector::select
+
+  auto _Lut =
+      detail::read_region<VWidth, 0, 16, 1, uint32_t, 16>(Lut.cl_vector(), 0); // NOTE(review): presumably repeats the 16-entry LUT VWidth times so its width matches Width - confirm against detail::read_region
+
+  Res = detail::__cm_cl_packed_4bit_upconvert_lut(_Lut, _Src.cl_vector()); // builtin is declared as T f(T lut, U src): the result has the type of _Lut
+
+  return Res;
+}
+
 } // namespace math
 } // namespace cm
 
 
@@ -2,7 +2,7 @@
   "copyright": [
     "============================ begin_copyright_notice ============================",
     "",
-    "Copyright (C) 2021-2024 Intel Corporation",
+    "Copyright (C) 2021-2025 Intel Corporation",
     "",
     "SPDX-License-Identifier: MIT",
     "",
@@ -910,6 +910,21 @@
         ]
       }
     },
+    "UpconvertLut": {
+      "Name": "packed_4bit_upconvert_lut",
+      "Operands": [
+        {"Name": "Lut", "Kind": "Input"},
+        {"Name": "Source", "Kind": "Input"}
+      ],
+      "TranslateInto": {
+        "VC-Intrinsic": "packed_4bit_upconvert_lut",
+        "ReturnType": {"GetBuiltinReturnType": []},
+        "Operands": [
+          {"GetBuiltinOperand": ["Lut"]},
+          {"GetBuiltinOperand": ["Source"]}
+        ]
+      }
+    },
     "AtomicRMW": {
         "Name": "atomicrmw",
         "Operands": [
 
@@ -1,6 +1,6 @@
 #=========================== begin_copyright_notice ============================
 #
-# Copyright (C) 2020-2022 Intel Corporation
+# Copyright (C) 2020-2025 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -20,4 +20,5 @@ set(SUPPORTED_VC_PLATFORMS
     "XeHPCVG"
     "Xe2"
     "Xe3"
+    "Xe3P"
     )
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2019-2024 Intel Corporation
+Copyright (C) 2019-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -108,6 +108,8 @@ class CMKernel {
                                      unsigned Size, unsigned ArgOffset);
 
   void createImplArgsBufferAnnotation(unsigned Size, unsigned ArgOffset);
+  void createIndirectDataBufferAnnotation(unsigned Size, unsigned ArgOffset);
+  void createScratchBufferAnnotation(unsigned Size, unsigned ArgOffset);
 
   void RecomputeBTLayout(int numUAVs, int numResources);
 };
 
@@ -182,6 +182,10 @@ getPlatformName(const PLATFORM &Platform) {
     if (Product == IGFX_NVL_XE3G)
       return {"Xe3", RevId};
     LLVM_FALLTHROUGH;
+  case IGFX_XE3P_CORE:
+    if (Product == IGFX_CRI)
+      return {"Xe3P", RevId};
+    break;
   default:
     break;
   }
@@ -203,6 +207,12 @@ static void adjustPlatform(const IGC::CPlatform &IGCPlatform,
       IGCPlatform.hasL3FlushOnGPUScopeInvalidate();
   Opts.HasHalfSIMDLSC = IGCPlatform.hasHalfSIMDLSC();
   Opts.WATable = &IGCPlatform.getWATable();
+
+  if (IGCPlatform.hasEfficient64bEnabled()) {
+    if (!Opts.FeaturesString.empty())
+      Opts.FeaturesString.append(",");
+    Opts.FeaturesString.append("+efficient_64b_enabled");
+  }
 }
 
 static void adjustFileType(TC::TB_DATA_FORMAT DataFormat,
 
@@ -307,6 +307,20 @@ void CMKernel::createImplArgsBufferAnnotation(unsigned Size,
       zebin::PreDefinedAttrGetter::ArgType::implicit_arg_buffer, ArgOffset,
       Size);
 }
+void CMKernel::createIndirectDataBufferAnnotation(unsigned Size,
+                                                  unsigned ArgOffset) { // Size and ArgOffset are forwarded unchanged to the zebin builder
+  zebin::ZEInfoBuilder::addPayloadArgumentImplicit( // same builder call shape as the implicit-args-buffer annotation just above in this file
+      m_kernelInfo.m_zePayloadArgs, // payload-argument list of this kernel's info structure
+      zebin::PreDefinedAttrGetter::ArgType::indirect_data_pointer, ArgOffset, // argument is tagged as indirect_data_pointer
+      Size);
+}
+
+void CMKernel::createScratchBufferAnnotation(unsigned Size,
+                                             unsigned ArgOffset) { // Size and ArgOffset are forwarded unchanged to the zebin builder
+  zebin::ZEInfoBuilder::addPayloadArgumentImplicit( // same builder call shape as the other implicit payload annotations in this file
+      m_kernelInfo.m_zePayloadArgs, // payload-argument list of this kernel's info structure
+      zebin::PreDefinedAttrGetter::ArgType::scratch_pointer, ArgOffset, Size); // argument is tagged as scratch_pointer
+}
 
 // TODO: refactor this function with the OCL part.
 void CMKernel::RecomputeBTLayout(int numUAVs, int numResources) {
@@ -466,6 +480,13 @@ static void setArgumentsInfo(const GenXOCLRuntimeInfo::KernelInfo &Info,
     case ArgKind::ImplicitArgsBuffer:
       Kernel.createImplArgsBufferAnnotation(Arg.getSizeInBytes(), ArgOffset);
       break;
+    case ArgKind::IndirectDataBuffer:
+      Kernel.createIndirectDataBufferAnnotation(Arg.getSizeInBytes(),
+                                                ArgOffset);
+      break;
+    case ArgKind::ScratchBuffer:
+      Kernel.createScratchBufferAnnotation(Arg.getSizeInBytes(), ArgOffset);
+      break;
     }
   }
 
@@ -522,6 +543,8 @@ static void setExecutionInfo(const GenXOCLRuntimeInfo::KernelInfo &BackendInfo,
   ExecEnv.HasLscStoresWithNonDefaultL1CacheControls =
       BackendInfo.hasLscStoresWithNonDefaultL1CacheControls();
 
+  auto &ThreadPayload = Kernel.m_kernelInfo.m_threadPayload;
+  ThreadPayload.PassInlineDataSize = BackendInfo.getInlineDataPayloadSize();
 
   // Allocate spill-fill buffer
   if (JitterInfo.hasStackcalls) {
 
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2024 Intel Corporation
+Copyright (C) 2017-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -159,6 +159,7 @@ FunctionPass *createGenXLegacyToLscTranslatorPass();
 ModulePass *createGenXSLMResolution();
 FunctionPass *createGenXLscAddrCalcFoldingPass();
 ModulePass *createGenXDetectPointerArgPass();
+ModulePass *createGenXPropagateSurfaceStatePass();
 FunctionPass *createGenXLCECalculationPass();
 FunctionPass *createGenXFloatControlPass();
 ModulePass *createGenXCountIndirectStatelessPass();
 
@@ -16,10 +16,13 @@ MODULE_PASS("CMABI", CMABIPass())
 MODULE_PASS("CMImpParam", CMImpParamPass())
 MODULE_PASS("CMKernelArgOffset",
             CMKernelArgOffsetPass(GTM->getGenXSubtarget().getGRFByteSize(),
+                                  GTM->getGenXSubtarget().hasEfficient64b(),
                                   BC->useBindlessImages()))
 MODULE_PASS("GenXPacketize", GenXPacketizePass())
 MODULE_PASS("GenXBIFFlagCtrlResolution", GenXBIFFlagCtrlResolutionPass())
-MODULE_PASS("GenXBTIAssignment", GenXBTIAssignmentPass(BC->getResult()))
+MODULE_PASS("GenXBTIAssignment",
+            GenXBTIAssignmentPass(BC->getResult(),
+                                  GTM->getGenXSubtarget().hasEfficient64b()))
 
 MODULE_PASS("GenXImportOCLBiF", GenXImportOCLBiFPass())
 MODULE_PASS("GenXLegalizeGVLoadUses", GenXLegalizeGVLoadUsesPass())
@@ -42,6 +45,7 @@ MODULE_PASS("GenXVerify", GenXVerifyPass())
 #endif
 
 FUNCTION_PASS("GenXSimplify", GenXSimplifyPass())
+FUNCTION_PASS("GenXStatePointerFence", GenXStatePointerFencePass())
 FUNCTION_PASS("CMLowerVLoadVStore", CMLowerVLoadVStorePass())
 FUNCTION_PASS("GenXTypeLegalization", GenXTypeLegalizationPass())
 FUNCTION_PASS("GenXTranslateIntrinsics", GenXTranslateIntrinsicsPass())
 
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2024 Intel Corporation
+Copyright (C) 2017-2025 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -63,6 +63,7 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
     XeHPCVG,
     Xe2,
     Xe3,
+    Xe3P,
     Invalid,
   };
 
@@ -79,6 +80,12 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   // HasFP64 - True if subtarget supports double type
   bool HasFP64 = false;
 
+  // HasNativeBFloat16 - True if subtarget supports bfloat16 arithmetic
+  bool HasNativeBFloat16 = false;
+
+  // HasMxfp - True if subtarget supports mxfp* operations
+  bool HasMxfp = false;
+
   // HasIEEEDivSqrt - True if subtarget supports IEEE-754 div and sqrt
   bool HasIEEEDivSqrt = false;
 
@@ -123,6 +130,10 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   // True if it is profitable to use native DxD->Q multiplication
   bool UseMulDDQ = false;
 
+  // True if it is profitable to use native DxD+D->Q and DxD+Q->Q multiply-add
+  // operations
+  bool UseMadDDQ = false;
+
   // True if codegenerating for OCL runtime (set by default since CMRT removed)
   bool OCLRuntime = true;
 
@@ -175,6 +186,15 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   /// True if subtarget supports half SIMD LSC messages
   bool HasHalfSIMDLSC = false;
 
+  /// True if subtarget supports efficient 64-bit addressing mode
+  bool HasEfficient64b = false;
+
+  /// True if efficient 64-bit mode is enabled
+  bool EnabledEfficient64b = false;
+
+  /// Number of supported cache levels
+  unsigned NumCacheLevels = 2;
+
   /// True if subtarget supports sampler messages
   bool HasSampler = false;
 
@@ -235,12 +255,21 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   // True if target supports global double precision atomic add/sub
   bool HasGlobalAtomicAddF64 = false;
 
+  // True if target supports half precision atomics
+  bool HasInstrAtomicHF16 = false;
+
+  // True if target supports local single precision atomic add/sub
+  bool HasInstrLocalAtomicAddF32 = false;
+
   /// Max supported SLM size (in kbytes)
   int MaxSLMSize = 64;
 
   // Number of elements in Address Register
   unsigned AddressRegisterElements = 16;
 
+  // True if subtarget supports SIMD32 programming model
+  bool HasEfficientSIMD32 = false;
+
   // Shows which surface should we use for stack
   PreDefined_Surface StackSurf;
 
@@ -303,6 +332,18 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
 
   bool hasLSCOffset() const { return HasLSCOffset; }
 
+  // * efficient 64-bit addressing is supported
+  bool supportEfficient64b() const { return HasEfficient64b; }
+
+  // * efficient 64-bit addressing is supported and enabled
+  bool hasEfficient64b() const {
+    return HasEfficient64b && EnabledEfficient64b;
+  }
+
+  bool hasLSCBase() const { return hasEfficient64b(); }
+
+  unsigned getLSCScaleMax() const { return hasEfficient64b() ? 32 : 1; }
+
   bool translateLegacyMessages() const {
     return HasLSCMessages && TranslateLegacyMessages;
   }
@@ -332,6 +373,12 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   /// * hasFP64 - true if target supports double fp
   bool hasFP64() const { return HasFP64; }
 
+  /// * hasNativeBFloat16 - true if target supports bfloat16 arithmetic
+  bool hasNativeBFloat16() const { return HasNativeBFloat16; }
+
+  /// * hasMxfp - true if target supports mxfp* operations
+  bool hasMxfp() const { return HasMxfp; }
+
   /// * hasIEEEDivSqrt - true if target supports IEEE-754 div and sqrt
   bool hasIEEEDivSqrt() const { return HasIEEEDivSqrt; }
 
@@ -344,6 +391,10 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   /// * useMulDDQ - true if is desired to emit DxD->Q mul instruction
   bool useMulDDQ() const { return UseMulDDQ; }
 
+  /// * useMadDDQ - true if it is desired to emit DxD+Q->Q and DxD+D->Q mad
+  /// instructions
+  bool useMadDDQ() const { return UseMadDDQ; }
+
   /// * disableJmpi - true if jmpi is disabled.
   bool disableJmpi() const { return DisableJmpi; }
 
@@ -411,6 +462,9 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
 
   bool hasGlobalAtomicAddF64() const { return HasGlobalAtomicAddF64; }
 
+  bool hasInstrAtomicHF16() const { return HasInstrAtomicHF16; }
+  bool hasInstrLocalAtomicAddF32() const { return HasInstrLocalAtomicAddF32; }
+
   bool hasL1ReadOnlyCache() const { return HasL1ReadOnlyCache; }
   bool hasLocalMemFenceSupress() const { return HasLocalMemFenceSupress; }
   bool hasMultiTile() const { return HasMultiTile; };
@@ -447,13 +501,19 @@ class GenXSubtarget final : public GenXGenSubtargetInfo {
   /// bit fields for ThreadID (from lsb to msb).
   ArrayRef<std::pair<int, int>> getThreadIdBits() const;
 
-  unsigned getNumCacheLevels() const { return 2; }
+  unsigned getNumCacheLevels() const { // replaces the previous one-line version that always returned 2
+    if (hasEfficient64b()) // true only when efficient 64-bit mode is both supported and enabled
+      return NumCacheLevels; // configurable member; its in-class default is 2
+    return 2; // without efficient 64-bit mode the previous fixed value is kept
+  }
 
   // Address Register size in elements.
   unsigned getAddressRegisterElements() const {
     return AddressRegisterElements;
   }
 
+  bool hasEfficientSIMD32() const { return HasEfficientSIMD32; }
+
   // Generic helper functions...
   const Triple &getTargetTriple() const { return TargetTriple; }
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`#=========================== begin_copyright_notice ============================`
`2`	`2`	`#`
`3`		`-# Copyright (C) 2020-2022 Intel Corporation`
	`3`	`+# Copyright (C) 2020-2025 Intel Corporation`
`4`	`4`	`#`
`5`	`5`	`# SPDX-License-Identifier: MIT`
`6`	`6`	`#`
`@@ -20,4 +20,5 @@ set(SUPPORTED_VC_PLATFORMS`
`20`	`20`	`"XeHPCVG"`
`21`	`21`	`"Xe2"`
`22`	`22`	`"Xe3"`
	`23`	`+ "Xe3P"`
`23`	`24`	`)`