Skip to content

Commit a9a3ad2

Browse files
Add Windows ARM64 I8MM detection. (#24563)
### Description

Add logic to detect whether I8MM is actually supported on Windows ARM64. This information can be read from the registry: each `HARDWARE\DESCRIPTION\System\CentralProcessor\<n>` key exposes the value of the `ID_AA64ISAR1_EL1` register as `CP 4031`, whose bits [55:52] hold the I8MM field. See the helpful comments here for more details: https://github.com/Dr-Noob/cpufetch/blob/a0c08ccc0b64b524ad2122e0595099f73cbba9c4/src/arm/midr.c#L30-L52

### Motivation and Context

Detect I8MM correctly to enable better performance.

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent ad5b97b commit a9a3ad2

File tree

1 file changed

+57
-41
lines changed

1 file changed

+57
-41
lines changed

onnxruntime/core/common/cpuid_info.cc

Lines changed: 57 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@
4242

4343
#include <Windows.h>
4444

45-
#define HAS_WINDOWS_DESKTOP WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
46-
4745
#ifndef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
4846
#define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43
4947
#endif
@@ -207,21 +205,22 @@ void CPUIDInfo::ArmWindowsInit() {
207205
// Get the ARM vendor string from the registry
208206
vendor_ = GetArmWindowsVendor();
209207

210-
// ARM32 certainly doesn't have fp16, so we will skip the logic to avoid using RegGetValueA Windows API
211-
#if !defined(_M_ARM)
212-
#pragma region Application Family or OneCore Family
213-
#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
214-
// Read MIDR from windows registry
208+
// Read MIDR and ID_AA64ISAR1_EL1 register values from Windows registry
209+
// There should be one per CPU
210+
std::vector<uint64_t> midr_values{}, id_aa64isar1_el1_values{};
211+
215212
// TODO!! Don't support multiple processor group yet!!
216213
constexpr int MAX_CORES = 64;
217214
constexpr int MAX_VALUE_NAME = 4096;
218215

219-
CHAR midrKey[MAX_VALUE_NAME] = ""; // buffer for processor registry name
220-
uint32_t lastUarch = cpuinfo_uarch_unknown;
221-
for (int i = 0; i < MAX_CORES - 1; i++) {
222-
snprintf(midrKey, MAX_VALUE_NAME, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\%d", i);
223-
uint64_t midrVal;
224-
unsigned long midrSize = sizeof(uint64_t);
216+
CHAR processor_subkey[MAX_VALUE_NAME] = ""; // buffer for processor registry name
217+
218+
for (size_t i = 0; i < MAX_CORES - 1; i++) {
219+
snprintf(processor_subkey, MAX_VALUE_NAME, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\%d",
220+
static_cast<int>(i));
221+
222+
uint64_t midr_value;
223+
unsigned long data_size = sizeof(midr_value);
225224

226225
/*
227226
* ARM lists for each coprocessor register 5 fields: op0/op1/CRn/CRm/op2.
@@ -236,48 +235,65 @@ void CPUIDInfo::ArmWindowsInit() {
236235
*
237236
* For the CP value of MIDR, op0 = 3 and the others are all = 0, so we come up with 0x4000,
238237
*/
239-
auto retCode = ::RegGetValueA(HKEY_LOCAL_MACHINE, midrKey, "CP 4000", RRF_RT_REG_QWORD, nullptr, &midrVal, &midrSize);
240-
if (retCode != ERROR_SUCCESS) {
238+
if (::RegGetValueA(HKEY_LOCAL_MACHINE, processor_subkey, "CP 4000", RRF_RT_REG_QWORD,
239+
nullptr, &midr_value, &data_size) != ERROR_SUCCESS) {
241240
break;
242241
}
243-
uint32_t uarch = cpuinfo_uarch_unknown;
244-
decodeMIDR((uint32_t)midrVal, &uarch);
245-
core_uarchs_.push_back(uarch);
246-
if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
247-
uarch == cpuinfo_uarch_cortex_a55) {
248-
is_armv8_narrow_ld_.push_back(true);
249-
} else {
250-
is_armv8_narrow_ld_.push_back(false);
242+
243+
uint64_t id_aa64isar1_el1_value;
244+
data_size = sizeof(id_aa64isar1_el1_value);
245+
246+
// CP 4031 corresponds to ID_AA64ISAR1_EL1 register
247+
if (::RegGetValueA(HKEY_LOCAL_MACHINE, processor_subkey, "CP 4031", RRF_RT_REG_QWORD,
248+
nullptr, &id_aa64isar1_el1_value, &data_size) != ERROR_SUCCESS) {
249+
break;
251250
}
252251

253-
if (i == 0) {
254-
lastUarch = uarch;
255-
} else if (lastUarch != uarch) {
256-
is_hybrid_ = true;
257-
lastUarch = uarch;
252+
midr_values.push_back(midr_value);
253+
id_aa64isar1_el1_values.push_back(id_aa64isar1_el1_value);
254+
}
255+
256+
// process midr_values
257+
{
258+
uint32_t lastUarch = cpuinfo_uarch_unknown;
259+
for (size_t i = 0; i < midr_values.size(); ++i) {
260+
uint32_t uarch = cpuinfo_uarch_unknown;
261+
decodeMIDR(static_cast<uint32_t>(midr_values[i]), &uarch);
262+
core_uarchs_.push_back(uarch);
263+
if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
264+
uarch == cpuinfo_uarch_cortex_a55) {
265+
is_armv8_narrow_ld_.push_back(true);
266+
} else {
267+
is_armv8_narrow_ld_.push_back(false);
268+
}
269+
270+
if (i == 0) {
271+
lastUarch = uarch;
272+
} else if (lastUarch != uarch) {
273+
is_hybrid_ = true;
274+
lastUarch = uarch;
275+
}
258276
}
259277
}
260-
#endif // WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
278+
279+
has_arm_neon_i8mm_ = std::all_of(
280+
id_aa64isar1_el1_values.begin(), id_aa64isar1_el1_values.end(),
281+
[](uint64_t id_aa64isar1_el1_value) {
282+
// I8MM, bits [55:52]
283+
return ((id_aa64isar1_el1_value >> 52) & 0xF) != 0;
284+
});
261285

262286
has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0);
263-
#else // ^ !defined(_M_ARM) / v defined(_M_ARM)
264-
has_arm_neon_dot_ = false;
265-
#endif // defined(_M_ARM)
266287

267288
#if defined(CPUINFO_SUPPORTED)
268289
if (pytorch_cpuinfo_init_) {
269290
has_fp16_ = cpuinfo_has_arm_neon_fp16_arith();
270-
has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm();
271-
has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm();
291+
// cpuinfo_has_arm_i8mm() doesn't work on Windows yet. See https://github.com/pytorch/cpuinfo/issues/279.
292+
// has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm();
293+
has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && has_arm_neon_i8mm_;
272294
has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16();
273-
} else
274-
#endif // defined(CPUINFO_SUPPORTED)
275-
{
276-
has_fp16_ = false;
277-
has_arm_neon_i8mm_ = false;
278-
has_arm_sve_i8mm_ = false;
279-
has_arm_neon_bf16_ = false;
280295
}
296+
#endif // defined(CPUINFO_SUPPORTED)
281297
}
282298

283299
std::string CPUIDInfo::GetArmWindowsVendor() {

0 commit comments

Comments
 (0)