@@ -125,31 +125,27 @@ void CComputeShader::ParseShaderSpecificOpcode(llvm::Instruction* inst)
125125 }
126126}
127127
128- void CComputeShader::CreateThreadPayloadData (void * & pThreadPayload, uint& curbeTotalDataLength, uint& curbeReadLength )
128+ void CComputeShader::CreateThreadPayloadData (void * & pThreadPayload, uint& threadPayloadSize )
129129{
130- typedef uint16_t ThreadPayloadEntry;
131-
132130 // Find the max thread group dimension
133131 const OctEltUnit SIZE_OF_DQWORD = OctEltUnit (2 );
134- const OctEltUnit SIZE_OF_OWORD = OctEltUnit (1 );
135132 uint numberOfId = GetNumberOfId ();
136133 uint dimX = numLanes (m_dispatchSize);
137- // dimX must align to alignment_X bytes (one GRF)
138- uint alignment_X = EltUnit (SIZE_OF_OWORD). Count () * sizeof (DWORD);
139- uint dimX_aligned = iSTD::Align (dimX * sizeof ( ThreadPayloadEntry), alignment_X) / sizeof (ThreadPayloadEntry) ;
140- uint dimY = ( iSTD::Align (m_threadGroupSize, dimX) / dimX) * numberOfId;
141- curbeReadLength = dimX_aligned * numberOfId * sizeof (ThreadPayloadEntry) / alignment_X;
134+ uint dimY = ( iSTD::Align (m_threadGroupSize, dimX)/dimX) * numberOfId;
135+
136+ typedef uint ThreadPayloadEntry;
137+
138+ uint alignedVal = EltUnit (SIZE_OF_DQWORD). Count () * sizeof (DWORD); // Oct Element is 8 DWORDS
142139
143- uint alignedVal = EltUnit (SIZE_OF_DQWORD).Count () * sizeof (ThreadPayloadEntry); // Oct Element is 8 Entries
144140 // m_NOSBufferSize is the additional space for cross-thread constant data (constants set by driver).
145- curbeTotalDataLength = iSTD::Align (dimX_aligned * dimY * sizeof (ThreadPayloadEntry) + m_NOSBufferSize, alignedVal);
141+ threadPayloadSize = iSTD::Align ( dimX * dimY * sizeof ( ThreadPayloadEntry ) + m_NOSBufferSize, alignedVal );
146142
147143 assert (pThreadPayload == nullptr && " Thread payload should be a null variable" );
148144
149- unsigned threadPayloadEntries = curbeTotalDataLength / sizeof (ThreadPayloadEntry);
145+ unsigned threadPayloadEntries = threadPayloadSize / sizeof (ThreadPayloadEntry);
150146
151- ThreadPayloadEntry* pThreadPayloadMem =
152- (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries * sizeof (ThreadPayloadEntry), 16 );
147+ ThreadPayloadEntry* pThreadPayloadMem =
148+ (ThreadPayloadEntry*)IGC::aligned_malloc (threadPayloadEntries* sizeof (ThreadPayloadEntry), 16 );
153149 std::fill (pThreadPayloadMem, pThreadPayloadMem + threadPayloadEntries, 0 );
154150
155151 pThreadPayload = pThreadPayloadMem;
@@ -173,17 +169,17 @@ void CComputeShader::CreateThreadPayloadData(void* & pThreadPayload, uint& curbe
173169 uint lane = 0 ;
174170 if (m_pThread_ID_in_Group_X)
175171 {
176- pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadX;
172+ pThreadPayloadMem[(y + lane) * dimX + x] = currThreadX;
177173 lane++;
178174 }
179175 if (m_pThread_ID_in_Group_Y)
180176 {
181- pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadY;
177+ pThreadPayloadMem[(y + lane) * dimX + x] = currThreadY;
182178 lane++;
183179 }
184180 if (m_pThread_ID_in_Group_Z)
185181 {
186- pThreadPayloadMem[(y + lane) * dimX_aligned + x] = currThreadZ;
182+ pThreadPayloadMem[(y + lane) * dimX + x] = currThreadZ;
187183 lane++;
188184 }
189185
@@ -263,19 +259,19 @@ CVariable* CComputeShader::CreateThreadIDinGroup(uint channelNum)
263259 case 0 :
264260 if (m_pThread_ID_in_Group_X == nullptr )
265261 {
266- m_pThread_ID_in_Group_X = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
262+ m_pThread_ID_in_Group_X = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
267263 }
268264 return m_pThread_ID_in_Group_X;
269265 case 1 :
270266 if (m_pThread_ID_in_Group_Y == nullptr )
271267 {
272- m_pThread_ID_in_Group_Y = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
268+ m_pThread_ID_in_Group_Y = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
273269 }
274270 return m_pThread_ID_in_Group_Y;
275271 case 2 :
276272 if (m_pThread_ID_in_Group_Z == nullptr )
277273 {
278- m_pThread_ID_in_Group_Z = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_W , EALIGN_GRF, false , m_numberInstance);
274+ m_pThread_ID_in_Group_Z = GetNewVariable (numLanes (m_SIMDSize), ISA_TYPE_D , EALIGN_GRF, false , m_numberInstance);
279275 }
280276 return m_pThread_ID_in_Group_Z;
281277 default :
@@ -339,7 +335,6 @@ void CComputeShader::AllocatePayload()
339335 {
340336 AllocateInput (m_pThread_ID_in_Group_X, offset, i);
341337 offset += m_pThread_ID_in_Group_X->GetSize ();
342- offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_X->GetAlign ()]);
343338 }
344339 }
345340
@@ -349,7 +344,6 @@ void CComputeShader::AllocatePayload()
349344 {
350345 AllocateInput (m_pThread_ID_in_Group_Y, offset, i);
351346 offset += m_pThread_ID_in_Group_Y->GetSize ();
352- offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Y->GetAlign ()]);
353347 }
354348 }
355349
@@ -359,7 +353,6 @@ void CComputeShader::AllocatePayload()
359353 {
360354 AllocateInput (m_pThread_ID_in_Group_Z, offset, i);
361355 offset += m_pThread_ID_in_Group_Z->GetSize ();
362- offset = iSTD::Round (offset, alignmentSize[m_pThread_ID_in_Group_Z->GetAlign ()]);
363356 }
364357 }
365358
@@ -473,6 +466,8 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
473466 pKernelProgram->FloatingPointMode = USC::GFX3DSTATE_FLOATING_POINT_IEEE_754;
474467 pKernelProgram->SingleProgramFlow = USC::GFX3DSTATE_PROGRAM_FLOW_MULTIPLE;
475468 pKernelProgram->CurbeReadOffset = 0 ;
469+ pKernelProgram->CurbeReadLength = GetNumberOfId () * (numLanes (m_dispatchSize) / numLanes (SIMDMode::SIMD8));
470+
476471 pKernelProgram->PhysicalThreadsInGroup = static_cast <int >(
477472 std::ceil ((static_cast <float >(m_threadGroupSize) /
478473 static_cast <float >((numLanes (m_dispatchSize))))));
@@ -492,8 +487,7 @@ void CComputeShader::FillProgram(SComputeShaderKernelProgram* pKernelProgram)
492487 pKernelProgram->ThreadPayloadData = nullptr ;
493488 CreateThreadPayloadData (
494489 pKernelProgram->ThreadPayloadData ,
495- pKernelProgram->CurbeTotalDataLength ,
496- pKernelProgram->CurbeReadLength );
490+ pKernelProgram->CurbeTotalDataLength );
497491
498492 pKernelProgram->ThreadGroupSize = m_threadGroupSize;
499493
0 commit comments