Skip to content

Commit bcf93a6

Browse files
Use EU per DSS to calculate max work group size
Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
1 parent 6b0b5ef commit bcf93a6

File tree

8 files changed

+35
-8
lines changed

8 files changed

+35
-8
lines changed

opencl/source/kernel/kernel.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &c
7777
program->retainForKernel();
7878
imageTransformer.reset(new ImageTransformer);
7979
if (kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize == 1u) {
80-
maxKernelWorkGroupSize = HwHelper::get(getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroup(getHardwareInfo(), static_cast<uint32_t>(getDevice().getDevice().getDeviceInfo().maxNumEUsPerSubSlice));
80+
auto deviceInfo = getDevice().getDevice().getDeviceInfo();
81+
maxKernelWorkGroupSize = HwHelper::get(getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
8182
} else {
8283
maxKernelWorkGroupSize = static_cast<uint32_t>(clDevice.getSharedDeviceInfo().maxWorkGroupSize);
8384
}

opencl/test/unit_test/device/device_caps_tests.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1685,3 +1685,20 @@ HWTEST_F(QueueFamilyNameTest, givenBcsWhenGettingQueueFamilyNameThenReturnProper
16851685
HWTEST_F(QueueFamilyNameTest, givenInvalidEngineGroupWhenGettingQueueFamilyNameThenReturnEmptyName) {
16861686
verify(EngineGroupType::MaxEngineGroups, "");
16871687
}
1688+
HWCMDTEST_F(IGFX_GEN8_CORE, DeviceGetCapsTest, givenSysInfoWhenDeviceCreatedThenMaxWorkGroupCalculatedCorrectly) {
1689+
HardwareInfo myHwInfo = *defaultHwInfo;
1690+
GT_SYSTEM_INFO &mySysInfo = myHwInfo.gtSystemInfo;
1691+
PLATFORM &myPlatform = myHwInfo.platform;
1692+
1693+
mySysInfo.EUCount = 16;
1694+
mySysInfo.SubSliceCount = 4;
1695+
mySysInfo.DualSubSliceCount = 2;
1696+
mySysInfo.ThreadCount = 16 * 8;
1697+
myPlatform.usRevId = 0x4;
1698+
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&myHwInfo));
1699+
auto minSimd = 8;
1700+
1701+
auto expectedWG = (mySysInfo.ThreadCount / mySysInfo.EUCount) * (mySysInfo.EUCount / mySysInfo.SubSliceCount) * minSimd;
1702+
1703+
EXPECT_EQ(expectedWG, device->sharedDeviceInfo.maxWorkGroupSize);
1704+
}

opencl/test/unit_test/kernel/kernel_tests.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2515,6 +2515,7 @@ HWTEST_F(KernelTest, givenKernelWhenDebugFlagToUseMaxSimdForCalculationsIsUsedTh
25152515

25162516
mySysInfo.EUCount = 24;
25172517
mySysInfo.SubSliceCount = 3;
2518+
mySysInfo.DualSubSliceCount = 3;
25182519
mySysInfo.ThreadCount = 24 * 7;
25192520
auto device = std::make_unique<MockClDevice>(MockDevice::createWithNewExecutionEnvironment<MockDevice>(&myHwInfo));
25202521

@@ -3166,7 +3167,8 @@ TEST_F(KernelTests, givenKernelWithSimdEqual1WhenKernelCreatedThenMaxWorgGroupSi
31663167
std::unique_ptr<MockKernel> pKernel(new MockKernel(pProgram, *pKernelInfo, *pClDevice));
31673168

31683169
auto deviceMaxWorkGroupSize = pDevice->getDeviceInfo().maxWorkGroupSize;
3169-
auto maxThreadsPerWG = HwHelper::get(pKernel->getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroup(pKernel->getHardwareInfo(), static_cast<uint32_t>(pClDevice->getDevice().getDeviceInfo().maxNumEUsPerSubSlice));
3170+
auto deviceInfo = pClDevice->getDevice().getDeviceInfo();
3171+
auto maxThreadsPerWG = HwHelper::get(pKernel->getHardwareInfo().platform.eRenderCoreFamily).getMaxThreadsForWorkgroupInDSSOrSS(pKernel->getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
31703172

31713173
EXPECT_LT(pKernel->getMaxKernelWorkGroupSize(), deviceMaxWorkGroupSize);
31723174
EXPECT_EQ(pKernel->getMaxKernelWorkGroupSize(), maxThreadsPerWG);

shared/source/device/device_caps.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,14 @@ void Device::initializeCaps() {
107107
deviceInfo.maxNumEUsPerSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.featureTable.ftrPooledEuEnabled == 0)
108108
? (systemInfo.EUCount / systemInfo.SubSliceCount)
109109
: systemInfo.EuCountPerPoolMin;
110+
111+
deviceInfo.maxNumEUsPerDualSubSlice = (systemInfo.EuCountPerPoolMin == 0 || hwInfo.featureTable.ftrPooledEuEnabled == 0)
112+
? (systemInfo.EUCount / systemInfo.DualSubSliceCount)
113+
: systemInfo.EuCountPerPoolMin;
114+
110115
deviceInfo.numThreadsPerEU = systemInfo.ThreadCount / systemInfo.EUCount;
111116
deviceInfo.threadsPerEUConfigs = hwHelper.getThreadsPerEUConfigs();
112-
auto maxWS = hwHelper.getMaxThreadsForWorkgroup(hwInfo, static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice)) * simdSizeUsed;
117+
auto maxWS = hwHelper.getMaxThreadsForWorkgroupInDSSOrSS(hwInfo, static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice)) * simdSizeUsed;
113118

114119
maxWS = Math::prevPowerOfTwo(maxWS);
115120
deviceInfo.maxWorkGroupSize = std::min(maxWS, 1024u);

shared/source/device/device_info.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct DeviceInfo {
2626
size_t imageMaxArraySize;
2727
size_t imageMaxBufferSize;
2828
size_t maxNumEUsPerSubSlice;
29+
size_t maxNumEUsPerDualSubSlice;
2930
size_t maxParameterSize;
3031
size_t maxWorkGroupSize;
3132
size_t maxWorkItemSizes[3];

shared/source/helpers/hw_helper.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ class HwHelper {
9797
virtual std::string getExtensions() const = 0;
9898
static uint32_t getMaxThreadsForVfe(const HardwareInfo &hwInfo);
9999
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const;
100+
virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0;
100101
virtual uint32_t getMetricsLibraryGenId() const = 0;
101102
virtual uint32_t getMocsIndex(const GmmHelper &gmmHelper, bool l3enabled, bool l1enabled) const = 0;
102103
virtual bool tilingAllowed(bool isSharedContext, bool isImage1d, bool forceLinearStorage) = 0;
@@ -211,7 +212,7 @@ class HwHelperHw : public HwHelper {
211212

212213
size_t getPaddingForISAAllocation() const override;
213214

214-
uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
215+
uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override;
215216

216217
uint32_t getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const override;
217218

shared/source/helpers/hw_helper_bdw_plus.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ uint32_t HwHelperHw<GfxFamily>::getPlanarYuvMaxHeight() const {
111111
}
112112

113113
template <typename GfxFamily>
114-
uint32_t HwHelperHw<GfxFamily>::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const {
114+
uint32_t HwHelperHw<GfxFamily>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
115115
return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice);
116116
}
117117

shared/source/helpers/hw_helper_xehp_plus.inl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,11 +194,11 @@ inline bool HwHelperHw<GfxFamily>::preferSmallWorkgroupSizeForKernel(const size_
194194
}
195195

196196
template <typename GfxFamily>
197-
inline uint32_t HwHelperHw<GfxFamily>::getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const {
197+
inline uint32_t HwHelperHw<GfxFamily>::getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const {
198198
if (isWorkaroundRequired(REVISION_A0, REVISION_B, hwInfo)) {
199-
return std::min(HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice), 64u);
199+
return std::min(HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerDualSubSlice), 64u);
200200
}
201-
return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerSubSlice);
201+
return HwHelper::getMaxThreadsForWorkgroup(hwInfo, maxNumEUsPerDualSubSlice);
202202
}
203203

204204
} // namespace NEO

0 commit comments

Comments
 (0)