@@ -3135,8 +3135,6 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
     be_parameter maskp(*this, inst.param(3), PTYPE_MRI);
 
     const a64::Gp output = dstp.select_register(TEMP_REG1, inst.size());
-    const a64::Gp shift = shiftp.select_register(TEMP_REG2, inst.size());
-    const a64::Gp scratch = select_register(FUNC_SCRATCH_REG, inst.size());
     const uint64_t instbits = inst.size() * 8;
 
     bool optimized = false;
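Note on the rewrite below: AArch64 has no rotate-left instruction, so UML's ROLAND (rotate left, then AND) has to be emitted as a rotate right by the complementary count. The deleted temporaries become unnecessary because a single `neg` already yields that count: the low bits of `-s` equal `instbits - s` modulo the operand width. A minimal sketch of the identity (illustrative only, C++20 `<bit>`, not backend code):

```cpp
#include <bit>
#include <cassert>
#include <cstdint>

// rotl(x, s) == rotr(x, -s mod bits), so neg + and_ + ror can replace the
// old sequence that materialized instbits and subtracted the count from it.
int main()
{
    const uint32_t x = 0xdeadbeef;
    for (int s = 0; s < 32; ++s)
        assert(std::rotl(x, s) == std::rotr(x, int(uint32_t(-s) & 31)));
}
```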
@@ -3187,44 +3185,41 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 
     if (!optimized)
     {
+        const a64::Gp shift = shiftp.select_register(TEMP_REG2, inst.size());
+        const a64::Gp rshift = select_register(TEMP_REG2, inst.size());
+        const a64::Gp mask = (dstp != maskp) ? maskp.select_register(SCRATCH_REG1, inst.size()) : select_register(SCRATCH_REG1, inst.size());
+
+        if (!shiftp.is_immediate())
+        {
+            // do this first as dst and shift could be the same register
+            mov_reg_param(a, inst.size(), shift, shiftp);
+
+            a.neg(rshift, shift);
+        }
+
+        // mask and dst could also be the same register so do this before rotating dst
+        if (!maskp.is_immediate() || !is_valid_immediate_mask(maskp.immediate(), inst.size()))
+            mov_reg_param(a, inst.size(), mask, maskp);
+
         mov_reg_param(a, inst.size(), output, srcp);
 
         if (shiftp.is_immediate())
         {
            const auto s = -int64_t(shiftp.immediate()) & (instbits - 1);
-
            if (s != 0)
                a.ror(output, output, s);
        }
        else
        {
-            const a64::Gp scratch2 = select_register(SCRATCH_REG2, inst.size());
-
-            mov_reg_param(a, inst.size(), shift, shiftp);
-
-            a.and_(scratch, shift, inst.size() * 8 - 1);
-            a.mov(scratch2, instbits);
-            a.sub(scratch, scratch2, scratch);
-            a.ror(output, output, scratch);
+            a.and_(rshift, rshift, (inst.size() * 8) - 1);
+            a.ror(output, output, rshift);
        }
 
-        // srcp and the results of the rors above are already going to the output register, so if the mask is all 1s this can all be skipped
         const a64::Inst::Id maskop = inst.flags() ? a64::Inst::kIdAnds : a64::Inst::kIdAnd;
         if (maskp.is_immediate() && is_valid_immediate_mask(maskp.immediate(), inst.size()))
-        {
             a.emit(maskop, output, output, maskp.immediate());
-        }
-        else if (!maskp.is_immediate() || maskp.immediate() != util::make_bitmask<uint64_t>(instbits))
-        {
-            const a64::Gp mask = maskp.select_register(TEMP_REG2, inst.size());
-            mov_reg_param(a, inst.size(), mask, maskp);
-
-            a.emit(maskop, output, output, mask);
-        }
         else
-        {
-            optimized = true; // need explicit tst if flags are requested
-        }
+            a.emit(maskop, output, output, mask);
     }
 
     mov_param_reg(a, inst.size(), dstp, output);
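For reference, what the reworked fallback path above computes, as a behavioral sketch (the `roland32_ref` name and the 32-bit specialization are illustrative; the semantics are read off the emitted ror/and sequence, not copied from the backend):

```cpp
#include <bit>
#include <cstdint>

// dst = rotl(src, shift) & mask: ror by the masked negated count, then
// AND (ANDS instead when inst.flags() requests flag updates).
uint32_t roland32_ref(uint32_t src, uint32_t shift, uint32_t mask)
{
    const uint32_t rshift = uint32_t(-int32_t(shift)) & 31;  // neg + and_
    return std::rotr(src, int(rshift)) & mask;               // ror + and/ands
}
```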
@@ -3250,15 +3245,9 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
     be_parameter maskp(*this, inst.param(3), PTYPE_MRI);
     const uint64_t instbits = inst.size() * 8;
 
-    a64::Gp dst;
-
-    bool can_use_dst_reg = dstp.is_int_register();
-    if (can_use_dst_reg && srcp.is_int_register())
-        can_use_dst_reg = srcp.ireg() != dstp.ireg();
-    if (can_use_dst_reg && maskp.is_int_register())
-        can_use_dst_reg = maskp.ireg() != dstp.ireg();
-    if (can_use_dst_reg && shiftp.is_int_register())
-        can_use_dst_reg = shiftp.ireg() != dstp.ireg();
+    const a64::Gp dst = dstp.select_register(TEMP_REG2, inst.size());
+    const a64::Gp src = srcp.select_register(TEMP_REG1, inst.size());
+    const a64::Gp scratch = select_register(TEMP_REG1, inst.size());
 
     bool optimized = false;
     if (maskp.is_immediate() && shiftp.is_immediate())
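The remaining hunks rework ROLINS (rotate left and insert under mask). Going by the val1/val2 comments kept further down, the operation being emitted is, as a hedged 32-bit sketch (`rolins32_ref` is an illustrative name):

```cpp
#include <bit>
#include <cstdint>

// val1 = dst & ~PARAM3; val2 = rotl(src, shift) & PARAM3; result = val1 | val2
uint32_t rolins32_ref(uint32_t dst, uint32_t src, uint32_t shift, uint32_t mask)
{
    return (dst & ~mask) | (std::rotl(src, int(shift & 31)) & mask);
}
```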
@@ -3270,16 +3259,12 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
         const bool is_contiguous = (invlamask & (invlamask + 1)) == 0;
         const auto s = shiftp.immediate() & (instbits - 1);
 
-        const a64::Gp src = select_register(SCRATCH_REG2, inst.size());
-
         if (is_right_aligned || is_contiguous)
         {
-            dst = can_use_dst_reg ? dstp.select_register(SCRATCH_REG1, inst.size()) : select_register(SCRATCH_REG1, inst.size());
             mov_reg_param(a, inst.size(), dst, dstp);
 
             uint32_t rot = 0;
             uint32_t lsb = 0;
-
             if (is_right_aligned)
             {
                 // Optimize a contiguous right-aligned mask
@@ -3292,63 +3277,61 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
                 lsb = instbits - pop - lz;
             }
 
-            if (srcp.is_immediate() && rot > 0)
+            if (srcp.is_immediate() && (rot > 0))
             {
-                // save some instructions by avoid mov to register by computing the ror and storing it into src directly
-                uint64_t result = 0;
-
+                // save some instructions by avoiding a mov to a register: compute the ror and store it into scratch directly
+                uint64_t result;
                 if (inst.size() == 4)
                     result = rotr_32(srcp.immediate(), rot);
                 else
                     result = rotr_64(srcp.immediate(), rot);
 
-                a.mov(src, result);
+                a.mov(scratch, result);
             }
-            else
+            else if (rot > 0)
             {
                 mov_reg_param(a, inst.size(), src, srcp);
 
-                if (rot > 0)
-                    a.ror(src, src, rot);
+                a.ror(scratch, src, rot);
+            }
+            else
+            {
+                mov_reg_param(a, inst.size(), scratch, srcp);
             }
 
-            a.bfi(dst, src, lsb, pop);
+            a.bfi(dst, scratch, lsb, pop);
 
             optimized = true;
         }
         else if (srcp.is_immediate())
         {
-            const a64::Gp scratch = select_register(SCRATCH_REG1, inst.size());
-
-            dst = dstp.select_register(TEMP_REG2, inst.size());
-
             // val1 = src & ~PARAM3
-            if (is_valid_immediate_mask(maskp.immediate(), inst.size()))
+            if (is_valid_immediate_mask(~maskp.immediate() & util::make_bitmask<uint64_t>(instbits), inst.size()))
             {
-                a.and_(dst, dst, ~maskp.immediate());
+                a.and_(dst, dst, ~maskp.immediate() & util::make_bitmask<uint64_t>(instbits));
             }
             else
             {
-                a.mov(scratch, ~maskp.immediate());
+                get_imm_relative(a, scratch, ~maskp.immediate() & util::make_bitmask<uint64_t>(instbits));
                 a.and_(dst, dst, scratch);
             }
 
-            uint64_t result = 0;
+            uint64_t result;
             if (inst.size() == 4)
                 result = rotl_32(srcp.immediate(), s) & maskp.immediate();
             else
                 result = rotl_64(srcp.immediate(), s) & maskp.immediate();
 
             if (result != 0)
             {
-                if (is_valid_immediate(result, 12))
+                if (is_valid_immediate_mask(result, inst.size()))
                {
                    a.orr(dst, dst, result);
                }
                else
                {
-                    a.mov(scratch, result);
-                    a.orr(dst, dst, select_register(scratch, inst.size()));
+                    get_imm_relative(a, scratch, result);
+                    a.orr(dst, dst, scratch);
                }
             }
 
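When both mask and shift are immediates and the mask is one contiguous run of set bits, the fast path above collapses the insert into a single `bfi` of a pre-rotated source. A sketch of why that is equivalent (the rot/lsb arithmetic is an illustrative recomputation assuming a nonzero contiguous mask, not the backend's exact code):

```cpp
#include <bit>
#include <cstdint>

// Rotate src right so the wanted field sits at bits [0, pop), then do what
// BFI does: replace bits [lsb, lsb + pop) of dst with that field.
uint32_t rolins32_bfi_ref(uint32_t dst, uint32_t src, uint32_t s, uint32_t mask)
{
    const unsigned pop = unsigned(std::popcount(mask));    // field width
    const unsigned lsb = unsigned(std::countr_zero(mask)); // field position
    const uint32_t field = std::rotr(src, int((lsb - s) & 31));
    const uint32_t fmask = (0xffffffffu >> (32 - pop)) << lsb; // == mask here
    return (dst & ~fmask) | ((field << lsb) & fmask);
}
```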
@@ -3358,40 +3341,57 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 
     if (!optimized)
     {
-        dst = can_use_dst_reg ? dstp.select_register(TEMP_REG2, inst.size()) : select_register(TEMP_REG2, inst.size());
-        mov_reg_param(a, inst.size(), dst, dstp);
+        const a64::Gp shift = shiftp.select_register(SCRATCH_REG1, inst.size());
+        const a64::Gp rshift = select_register(SCRATCH_REG1, inst.size());
 
-        const a64::Gp src = srcp.select_register(TEMP_REG1, inst.size());
-        const a64::Gp scratch = select_register(SCRATCH_REG1, inst.size());
+        if (!shiftp.is_immediate())
+        {
+            // do this first as dst could be the same register as shift
+            mov_reg_param(a, inst.size(), shift, shiftp);
 
-        mov_reg_param(a, inst.size(), src, srcp);
+            a.neg(rshift, shift);
+        }
+
+        mov_reg_param(a, inst.size(), dst, dstp);
 
         if (shiftp.is_immediate())
         {
             const auto shift = -int64_t(shiftp.immediate()) & ((inst.size() * 8) - 1);
 
             if (shift != 0)
+            {
+                mov_reg_param(a, inst.size(), src, srcp);
                 a.ror(scratch, src, shift);
+            }
             else
-                a.mov(scratch, src);
+            {
+                mov_reg_param(a, inst.size(), scratch, srcp);
+            }
         }
         else
         {
-            const a64::Gp shift = shiftp.select_register(SCRATCH_REG2, inst.size());
-            const a64::Gp scratch2 = select_register(FUNC_SCRATCH_REG, inst.size());
-            mov_reg_param(a, inst.size(), shift, shiftp);
+            mov_reg_param(a, inst.size(), src, srcp);
 
-            a.mov(scratch, inst.size() * 8);
-            a.and_(scratch2, shift, inst.size() * 8 - 1);
-            a.sub(scratch2, scratch, scratch2);
-            a.ror(scratch, src, scratch2);
+            a.and_(rshift, rshift, (inst.size() * 8) - 1);
+            a.ror(scratch, src, rshift);
         }
 
-        const a64::Gp mask = maskp.select_register(SCRATCH_REG2, inst.size());
-        mov_reg_param(a, inst.size(), mask, maskp);
+        const a64::Gp mask = maskp.select_register(SCRATCH_REG1, inst.size());
+        if (!maskp.is_immediate() || !is_valid_immediate_mask(maskp.immediate(), inst.size()) || !is_valid_immediate_mask(~maskp.immediate() & util::make_bitmask<uint64_t>(instbits), inst.size()))
+            mov_reg_param(a, inst.size(), mask, maskp);
+
+        // val2 = val2 & PARAM3
+        if (maskp.is_immediate() && is_valid_immediate_mask(maskp.immediate(), inst.size()))
+            a.and_(scratch, scratch, maskp.immediate());
+        else
+            a.and_(scratch, scratch, mask);
+
+        // val1 = src & ~PARAM3
+        if (maskp.is_immediate() && is_valid_immediate_mask(~maskp.immediate() & util::make_bitmask<uint64_t>(instbits), inst.size()))
+            a.and_(dst, dst, ~maskp.immediate() & util::make_bitmask<uint64_t>(instbits));
+        else
+            a.bic(dst, dst, mask);
 
-        a.bic(dst, dst, mask); // val1 = src & ~PARAM3
-        a.and_(scratch, scratch, mask); // val2 = val2 & PARAM3
         a.orr(dst, dst, scratch); // val1 | val2
     }
 
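A quick self-check tying the three sketches above together (hypothetical values; compile as C++20 for `<bit>`):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    // ROLAND: rotate 0x80000001 left by 1 -> 0x00000003, then mask to a byte.
    assert(roland32_ref(0x80000001u, 1, 0xffu) == 0x03u);

    // ROLINS: only bits under the mask are replaced.
    assert(rolins32_ref(0xffffffffu, 0u, 0u, 0x00ff0000u) == 0xff00ffffu);

    // The single-BFI fast path agrees with the general form.
    assert(rolins32_bfi_ref(0x12345678u, 0x9abcdef0u, 8, 0x00ff0000u)
            == rolins32_ref(0x12345678u, 0x9abcdef0u, 8, 0x00ff0000u));
}
```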