@@ -5175,6 +5175,162 @@ void EmitPass::emitSimdShuffleDown(llvm::Instruction* inst)
     }
 }

+void EmitPass::emitSimdShuffleXor(llvm::Instruction* inst)
+{
+    CVariable* pData = m_currShader->GetSymbol(inst->getOperand(0));
+    CVariable* pXorValue = m_currShader->GetSymbol(inst->getOperand(1));
+
+    IGC_ASSERT_MESSAGE(pXorValue->IsImmediate(), "simdShuffleXor must have constant xorValue parameter");
+
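+    // For every lane i the result takes the source value from lane (i ^ xorValue):
+    // lanes whose indices differ only in the set bits of xorValue exchange values.
+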
+    // Emit the move sequence for a single xor bit:
+    // case 0: 1 2 3 4 5 6 7 8 => 2 1 4 3 6 5 8 7
+    // case 1: 1 2 3 4 5 6 7 8 => 3 4 1 2 7 8 5 6
+    // case 2: 1 2 3 4 5 6 7 8 => 5 6 7 8 1 2 3 4
+    // case 3: 1 2 .. 8 9 .. 15 16 => 9 10 .. 15 16 1 2 .. 7 8
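+    // The helper below swaps every pair of 2^xorBit-wide lane groups whose indices
+    // differ only in bit xorBit and returns the shuffled data in a fresh temporary.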
+    auto emitShuffleXor1Bit = [&](CVariable* pData, uint xorBit) -> CVariable*
+    {
+        VISA_Type type = pData->GetType();
+        bool is64bitType = type == ISA_TYPE_Q || type == ISA_TYPE_UQ || type == ISA_TYPE_DF;
+
+        CVariable* pResult = m_currShader->GetNewVariable(
+            pData->GetNumberElement(),
+            pData->GetType(),
+            pData->GetAlign(),
+            false,
+            1,
+            "simdShuffleXorTmp");
+
+        if (xorBit == 0 || (xorBit == 1 && !is64bitType)) {
+            // Use strided access of the maximum possible length.
+            // For simd16 and xorBit == 0:
+            // mov (M1_NM, 8) simdShuffleXorTmp(0,0)<2> V0040(0,1)<2;1,0> /// $11
+            // mov (M1_NM, 8) simdShuffleXorTmp(0,1)<2> V0040(0,0)<2;1,0> /// $12
+            // For 32-bit types this is just 2 movs; for the 64-bit double type let the finalizer split the vars:
+            // r10 is the source
+            // (W) mov (16|M0) r19.0<1>:ud r10.2<4;2,1>:ud {$4.dst} // $13
+            // (W) mov (8|M0) r18.0<1>:df r19.0<1;1,0>:df {I@1} // $13
+            // (W) mov (8|M0) r12.0<4>:ud r18.0<2;1,0>:ud {Compacted,L@1} // $13
+            // (W) mov (8|M0) r12.1<4>:ud r18.1<2;1,0>:ud // $13
+            // (W) mov (16|M0) r21.0<1>:ud r10.0<4;2,1>:ud // $14
+            // (W) mov (8|M0) r20.0<1>:df r21.0<1;1,0>:df {I@1} // $14
+            // (W) mov (8|M0) r12.2<4>:ud r20.0<2;1,0>:ud {Compacted,L@1} // $14
+            // (W) mov (8|M0) r12.3<4>:ud r20.1<2;1,0>:ud // $14
+
+            // For int32 and xorBit == 1:
+            // mov (M1_NM, 4) simdShuffleXorTmp(0,0)<4> V0040(0,2)<4;1,0> /// $11
+            // mov (M1_NM, 4) simdShuffleXorTmp(0,2)<4> V0040(0,0)<4;1,0> /// $12
+            // mov (M1_NM, 4) simdShuffleXorTmp(0,1)<4> V0040(0,3)<4;1,0> /// $13
+            // mov (M1_NM, 4) simdShuffleXorTmp(0,3)<4> V0040(0,1)<4;1,0> /// $14
+            // For xorBit == 1 strided moves are beneficial only if the type is narrower than 64 bits
+            // (fewer moves will be generated).
+
+            // For xorBit > 1 it is always more beneficial to copy whole consecutive chunks.
+
+            auto stride = 2 * (xorBit + 1);
+            auto width = pData->GetNumberElement() / stride;
+            auto currentSimdMode = lanesToSIMDMode(width);
+
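+            // Each of the (xorBit + 1) iterations emits two strided movs that swap
+            // subelements i and (i + xorBit + 1) within every group of 'stride' lanes.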
+            for (uint i = 0; i < xorBit + 1; i++) {
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, stride, 1, 0);
+                m_encoder->SetSrcSubReg(0, i + xorBit + 1);
+                m_encoder->SetDstRegion(stride);
+                m_encoder->SetDstSubReg(i);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, stride, 1, 0);
+                m_encoder->SetSrcSubReg(0, i);
+                m_encoder->SetDstRegion(stride);
+                m_encoder->SetDstSubReg(i + xorBit + 1);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+            }
+        }
+        else if ((xorBit >= 1) && (xorBit <= 3)) {
+            // Use contiguous accesses to copy whole consecutive chunks.
+            // For xorBit == 2:
+            // mov (M1_NM, 4) simdShuffleXorTmp(0,0)<1> V0043(0,4)<1;1,0> /// $13
+            // mov (M1_NM, 4) simdShuffleXorTmp(0,4)<1> V0043(0,0)<1;1,0> /// $14
+            // mov (M1_NM, 4) simdShuffleXorTmp(1,0)<1> V0043(1,4)<1;1,0> /// $15
+            // mov (M1_NM, 4) simdShuffleXorTmp(1,4)<1> V0043(1,0)<1;1,0> /// $16
+            // For 64-bit types the accesses will be 2x widened by the finalizer:
+            // (W) mov (8|M0) r12.0<1>:ud r10.8<1;1,0>:ud {Compacted,$4.dst} // $13
+            // (W) mov (8|M0) r12.8<1>:ud r10.0<1;1,0>:ud {Compacted} // $14
+            // (W) mov (8|M0) r13.0<1>:ud r11.8<1;1,0>:ud {Compacted} // $15
+            // (W) mov (8|M0) r13.8<1>:ud r11.0<1;1,0>:ud {Compacted} // $16
+            // The number of chunks grows with the larger SIMD width.
+
+            auto width = 1 << xorBit;
+            auto currentSimdMode = lanesToSIMDMode(width);
+
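+            // Each iteration swaps two adjacent chunks of 'width' lanes:
+            // lanes [i, i + width) exchange values with lanes [i + width, i + 2 * width).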
+            for (uint i = 0; i < pData->GetNumberElement(); i += width * 2) {
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, 1, 1, 0);
+                m_encoder->SetSrcSubReg(0, i + width);
+                m_encoder->SetDstRegion(1);
+                m_encoder->SetDstSubReg(i);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+
+                m_encoder->SetSimdSize(currentSimdMode);
+                m_encoder->SetSrcRegion(0, 1, 1, 0);
+                m_encoder->SetSrcSubReg(0, i);
+                m_encoder->SetDstRegion(1);
+                m_encoder->SetDstSubReg(i + width);
+                m_encoder->SetNoMask();
+                m_encoder->Copy(pResult, pData);
+                m_encoder->Push();
+            }
+        }
+        else {
+            IGC_ASSERT_MESSAGE(false, "simdShuffleXor is only implemented for 0 <= xor_value <= 15");
+        }
+
+        return pResult;
+    };
+
+    // just broadcast the value if the value is uniform
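+    // (a uniform source holds the same value in every lane, so the shuffle is an identity)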
+    if (pData->IsUniform()) {
+        m_encoder->SetSrcRegion(0, 0, 1, 0);
+        m_encoder->SetSrcSubReg(0, 0);
+        m_encoder->SetDstRegion(1);
+        m_encoder->SetDstSubReg(0);
+        m_encoder->Copy(m_destination, pData);
+        m_encoder->Push();
+        return;
+    }
+
+    // emit the move sequence for every set bit of xorValue, one bit at a time
+    const auto xorValue = pXorValue->GetImmediateValue();
+    CVariable* tempValue = pData;
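+    // XOR-ing the lane index with xorValue is the composition of the independent
+    // single-bit swaps, so the 1-bit shuffles can simply be chained.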
+    for (uint i = 0; i < 5; i++)
+    {
+        if (((xorValue >> i) & 0x1) == 0x1)
+        {
+            tempValue = emitShuffleXor1Bit(tempValue, i);
+        }
+    }
+
+    // final copy, respecting the execution mask if in divergent CF
+    if (!m_currShader->InsideDivergentCF(inst))
+    {
+        m_encoder->SetNoMask();
+    }
+    m_encoder->SetSrcRegion(0, 1, 1, 0);
+    m_encoder->SetSrcSubReg(0, 0);
+    m_encoder->SetDstRegion(1);
+    m_encoder->SetDstSubReg(0);
+    m_encoder->Copy(m_destination, tempValue);
+    m_encoder->Push();
+}
+
 static uint32_t getBlockMsgSize(uint32_t bytesRemaining, uint32_t maxSize)
 {
     uint32_t size = 0;
@@ -7235,6 +7391,9 @@ void EmitPass::EmitGenIntrinsicMessage(llvm::GenIntrinsicInst* inst)
     case GenISAIntrinsic::GenISA_simdShuffleDown:
         emitSimdShuffleDown(inst);
         break;
+    case GenISAIntrinsic::GenISA_simdShuffleXor:
+        emitSimdShuffleXor(inst);
+        break;
     case GenISAIntrinsic::GenISA_simdBlockRead:
         emitSimdBlockRead(inst);
         break;