42
42
43
43
#include < Windows.h>
44
44
45
- #define HAS_WINDOWS_DESKTOP WINAPI_FAMILY_PARTITION (WINAPI_PARTITION_DESKTOP)
46
-
47
45
#ifndef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
48
46
#define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43
49
47
#endif
@@ -207,21 +205,22 @@ void CPUIDInfo::ArmWindowsInit() {
207
205
// Get the ARM vendor string from the registry
208
206
vendor_ = GetArmWindowsVendor ();
209
207
210
- // ARM32 certainly doesn't have fp16, so we will skip the logic to avoid using RegGetValueA Windows API
211
- #if !defined(_M_ARM)
212
- #pragma region Application Family or OneCore Family
213
- #if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
214
- // Read MIDR from windows registry
208
+ // Read MIDR and ID_AA64ISAR1_EL1 register values from Windows registry
209
+ // There should be one per CPU
210
+ std::vector<uint64_t > midr_values{}, id_aa64isar1_el1_values{};
211
+
215
212
// TODO!! Don't support multiple processor group yet!!
216
213
constexpr int MAX_CORES = 64 ;
217
214
constexpr int MAX_VALUE_NAME = 4096 ;
218
215
219
- CHAR midrKey[MAX_VALUE_NAME] = " " ; // buffer for processor registry name
220
- uint32_t lastUarch = cpuinfo_uarch_unknown;
221
- for (int i = 0 ; i < MAX_CORES - 1 ; i++) {
222
- snprintf (midrKey, MAX_VALUE_NAME, " HARDWARE\\ DESCRIPTION\\ System\\ CentralProcessor\\ %d" , i);
223
- uint64_t midrVal;
224
- unsigned long midrSize = sizeof (uint64_t );
216
+ CHAR processor_subkey[MAX_VALUE_NAME] = " " ; // buffer for processor registry name
217
+
218
+ for (size_t i = 0 ; i < MAX_CORES - 1 ; i++) {
219
+ snprintf (processor_subkey, MAX_VALUE_NAME, " HARDWARE\\ DESCRIPTION\\ System\\ CentralProcessor\\ %d" ,
220
+ static_cast <int >(i));
221
+
222
+ uint64_t midr_value;
223
+ unsigned long data_size = sizeof (midr_value);
225
224
226
225
/*
227
226
* ARM lists for each coprocessor register 5 fields: op0/op1/CRn/CRm/op2.
@@ -236,48 +235,65 @@ void CPUIDInfo::ArmWindowsInit() {
236
235
*
237
236
* For the CP value of MIDR, op0 = 3 and the others are all = 0, so we come up with 0x4000,
238
237
*/
239
- auto retCode = ::RegGetValueA (HKEY_LOCAL_MACHINE, midrKey , " CP 4000" , RRF_RT_REG_QWORD, nullptr , &midrVal, &midrSize);
240
- if (retCode != ERROR_SUCCESS) {
238
+ if ( ::RegGetValueA (HKEY_LOCAL_MACHINE, processor_subkey , " CP 4000" , RRF_RT_REG_QWORD,
239
+ nullptr , &midr_value, &data_size) != ERROR_SUCCESS) {
241
240
break ;
242
241
}
243
- uint32_t uarch = cpuinfo_uarch_unknown;
244
- decodeMIDR (( uint32_t )midrVal, &uarch) ;
245
- core_uarchs_. push_back (uarch );
246
- if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
247
- uarch == cpuinfo_uarch_cortex_a55) {
248
- is_armv8_narrow_ld_. push_back ( true );
249
- } else {
250
- is_armv8_narrow_ld_. push_back ( false ) ;
242
+
243
+ uint64_t id_aa64isar1_el1_value ;
244
+ data_size = sizeof (id_aa64isar1_el1_value );
245
+
246
+ // CP 4031 corresponds to ID_AA64ISAR1_EL1 register
247
+ if (:: RegGetValueA (HKEY_LOCAL_MACHINE, processor_subkey, " CP 4031 " , RRF_RT_REG_QWORD,
248
+ nullptr , &id_aa64isar1_el1_value, &data_size) != ERROR_SUCCESS) {
249
+ break ;
251
250
}
252
251
253
- if (i == 0 ) {
254
- lastUarch = uarch;
255
- } else if (lastUarch != uarch) {
256
- is_hybrid_ = true ;
257
- lastUarch = uarch;
252
+ midr_values.push_back (midr_value);
253
+ id_aa64isar1_el1_values.push_back (id_aa64isar1_el1_value);
254
+ }
255
+
256
+ // process midr_values
257
+ {
258
+ uint32_t lastUarch = cpuinfo_uarch_unknown;
259
+ for (size_t i = 0 ; i < midr_values.size (); ++i) {
260
+ uint32_t uarch = cpuinfo_uarch_unknown;
261
+ decodeMIDR (static_cast <uint32_t >(midr_values[i]), &uarch);
262
+ core_uarchs_.push_back (uarch);
263
+ if (uarch == cpuinfo_uarch_cortex_a53 || uarch == cpuinfo_uarch_cortex_a55r0 ||
264
+ uarch == cpuinfo_uarch_cortex_a55) {
265
+ is_armv8_narrow_ld_.push_back (true );
266
+ } else {
267
+ is_armv8_narrow_ld_.push_back (false );
268
+ }
269
+
270
+ if (i == 0 ) {
271
+ lastUarch = uarch;
272
+ } else if (lastUarch != uarch) {
273
+ is_hybrid_ = true ;
274
+ lastUarch = uarch;
275
+ }
258
276
}
259
277
}
260
- #endif // WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP | WINAPI_PARTITION_SYSTEM)
278
+
279
+ has_arm_neon_i8mm_ = std::all_of (
280
+ id_aa64isar1_el1_values.begin (), id_aa64isar1_el1_values.end (),
281
+ [](uint64_t id_aa64isar1_el1_value) {
282
+ // I8MM, bits [55:52]
283
+ return ((id_aa64isar1_el1_value >> 52 ) & 0xF ) != 0 ;
284
+ });
261
285
262
286
has_arm_neon_dot_ = (IsProcessorFeaturePresent (PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0 );
263
- #else // ^ !defined(_M_ARM) / v defined(_M_ARM)
264
- has_arm_neon_dot_ = false ;
265
- #endif // defined(_M_ARM)
266
287
267
288
#if defined(CPUINFO_SUPPORTED)
268
289
if (pytorch_cpuinfo_init_) {
269
290
has_fp16_ = cpuinfo_has_arm_neon_fp16_arith ();
270
- has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm ();
271
- has_arm_sve_i8mm_ = cpuinfo_has_arm_sve () && cpuinfo_has_arm_i8mm ();
291
+ // cpuinfo_has_arm_i8mm() doesn't work on Windows yet. See https://github.com/pytorch/cpuinfo/issues/279.
292
+ // has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm();
293
+ has_arm_sve_i8mm_ = cpuinfo_has_arm_sve () && has_arm_neon_i8mm_;
272
294
has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16 ();
273
- } else
274
- #endif // defined(CPUINFO_SUPPORTED)
275
- {
276
- has_fp16_ = false ;
277
- has_arm_neon_i8mm_ = false ;
278
- has_arm_sve_i8mm_ = false ;
279
- has_arm_neon_bf16_ = false ;
280
295
}
296
+ #endif // defined(CPUINFO_SUPPORTED)
281
297
}
282
298
283
299
std::string CPUIDInfo::GetArmWindowsVendor () {
0 commit comments