Commit 519cbd9

cpu/drcbearm64.cpp: Fixed more cases where ROLAND clobbered source registers.
ROLAND could clobber source registers or produce incorrect results if the destination was the same UML register as the shift or mask. Fixed a bug in the logic for deciding whether to use a temporary register for the mask in ROLINS. Also optimised ROLAND and ROLINS some more (particularly the translation from UML's left rotate to ARM's right rotate) and removed a special case that the simplifier now takes care of.
1 parent ae41239 commit 519cbd9
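
The rotate translation mentioned in the message relies on a standard identity: UML's ROLAND/ROLINS take a left-rotate amount, but AArch64 only has a right rotate, so the backend negates the amount and masks it to the operand width. A minimal standalone sketch of the identity in plain C++ (rotl32/rotr32 are local stand-ins, not the backend's helpers):

    #include <cassert>
    #include <cstdint>

    // local stand-ins; C++20 provides std::rotl/std::rotr in <bit>
    constexpr uint32_t rotl32(uint32_t x, unsigned s) { return (x << (s & 31)) | (x >> ((32 - s) & 31)); }
    constexpr uint32_t rotr32(uint32_t x, unsigned s) { return (x >> (s & 31)) | (x << ((32 - s) & 31)); }

    int main()
    {
        for (unsigned s = 0; s < 32; s++)
        {
            // rotating left by s equals rotating right by the negated amount
            // masked to the operand width - the neg + and the commit emits
            assert(rotl32(0x12345678U, s) == rotr32(0x12345678U, (0u - s) & 31));
        }
    }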

File tree

1 file changed: +74 -74 lines changed

src/devices/cpu/drcbearm64.cpp

@@ -3135,8 +3135,6 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 	be_parameter maskp(*this, inst.param(3), PTYPE_MRI);
 
 	const a64::Gp output = dstp.select_register(TEMP_REG1, inst.size());
-	const a64::Gp shift = shiftp.select_register(TEMP_REG2, inst.size());
-	const a64::Gp scratch = select_register(FUNC_SCRATCH_REG, inst.size());
 	const uint64_t instbits = inst.size() * 8;
 
 	bool optimized = false;
@@ -3187,44 +3185,41 @@ void drcbe_arm64::op_roland(a64::Assembler &a, const uml::instruction &inst)
 
 	if (!optimized)
 	{
+		const a64::Gp shift = shiftp.select_register(TEMP_REG2, inst.size());
+		const a64::Gp rshift = select_register(TEMP_REG2, inst.size());
+		const a64::Gp mask = (dstp != maskp) ? maskp.select_register(SCRATCH_REG1, inst.size()) : select_register(SCRATCH_REG1, inst.size());
+
+		if (!shiftp.is_immediate())
+		{
+			// do this first as dst and shift could be the same register
+			mov_reg_param(a, inst.size(), shift, shiftp);
+
+			a.neg(rshift, shift);
+		}
+
+		// mask and dst could also be the same register so do this before rotating dst
+		if (!maskp.is_immediate() || !is_valid_immediate_mask(maskp.immediate(), inst.size()))
+			mov_reg_param(a, inst.size(), mask, maskp);
+
 		mov_reg_param(a, inst.size(), output, srcp);
 
 		if (shiftp.is_immediate())
 		{
 			const auto s = -int64_t(shiftp.immediate()) & (instbits - 1);
-
 			if (s != 0)
 				a.ror(output, output, s);
 		}
 		else
 		{
-			const a64::Gp scratch2 = select_register(SCRATCH_REG2, inst.size());
-
-			mov_reg_param(a, inst.size(), shift, shiftp);
-
-			a.and_(scratch, shift, inst.size() * 8 - 1);
-			a.mov(scratch2, instbits);
-			a.sub(scratch, scratch2, scratch);
-			a.ror(output, output, scratch);
+			a.and_(rshift, rshift, (inst.size() * 8) - 1);
+			a.ror(output, output, rshift);
 		}
 
-		// srcp and the results of the rors above are already going to the output register, so if the mask is all 1s this can all be skipped
 		const a64::Inst::Id maskop = inst.flags() ? a64::Inst::kIdAnds : a64::Inst::kIdAnd;
 		if (maskp.is_immediate() && is_valid_immediate_mask(maskp.immediate(), inst.size()))
-		{
 			a.emit(maskop, output, output, maskp.immediate());
-		}
-		else if (!maskp.is_immediate() || maskp.immediate() != util::make_bitmask<uint64_t>(instbits))
-		{
-			const a64::Gp mask = maskp.select_register(TEMP_REG2, inst.size());
-			mov_reg_param(a, inst.size(), mask, maskp);
-
-			a.emit(maskop, output, output, mask);
-		}
 		else
-		{
-			optimized = true; // need explicit tst if flags are requested
-		}
+			a.emit(maskop, output, output, mask);
 	}
 
 	mov_param_reg(a, inst.size(), dstp, output);
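
The op_roland rework above reads the shift (negated into rshift) and loads the mask before anything is written to the output register, because dstp may be the same UML register as shiftp or maskp. A contrived sketch of the hazard the old ordering allowed, using a hypothetical regs[] array in place of the backend's real register mapping:

    #include <cassert>
    #include <cstdint>

    // hypothetical illustration: regs[] stands in for the UML register file,
    // and dst/src/shift are slot indices that may alias
    static uint32_t roland_buggy(uint32_t regs[], int dst, int src, int shift, uint32_t mask)
    {
        regs[dst] = regs[src];                 // clobbers regs[shift] when dst == shift
        unsigned s = regs[shift] & 31;         // reads the overwritten amount
        regs[dst] = ((regs[dst] << s) | (regs[dst] >> ((32 - s) & 31))) & mask;
        return regs[dst];
    }

    static uint32_t roland_fixed(uint32_t regs[], int dst, int src, int shift, uint32_t mask)
    {
        unsigned s = regs[shift] & 31;         // read the shift before touching dst
        uint32_t v = regs[src];
        regs[dst] = ((v << s) | (v >> ((32 - s) & 31))) & mask;
        return regs[dst];
    }

    int main()
    {
        uint32_t a[2] = { 0x000000ffU, 4 };    // slot 1 is both shift and destination
        uint32_t b[2] = { 0x000000ffU, 4 };
        assert(roland_fixed(a, 1, 0, 1, 0xffffU) == 0x0ff0U);
        assert(roland_buggy(b, 1, 0, 1, 0xffffU) != 0x0ff0U);  // uses 0xff as the shift
    }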
@@ -3250,15 +3245,9 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 	be_parameter maskp(*this, inst.param(3), PTYPE_MRI);
 	const uint64_t instbits = inst.size() * 8;
 
-	a64::Gp dst;
-
-	bool can_use_dst_reg = dstp.is_int_register();
-	if (can_use_dst_reg && srcp.is_int_register())
-		can_use_dst_reg = srcp.ireg() != dstp.ireg();
-	if (can_use_dst_reg && maskp.is_int_register())
-		can_use_dst_reg = maskp.ireg() != dstp.ireg();
-	if (can_use_dst_reg && shiftp.is_int_register())
-		can_use_dst_reg = shiftp.ireg() != dstp.ireg();
+	const a64::Gp dst = dstp.select_register(TEMP_REG2, inst.size());
+	const a64::Gp src = srcp.select_register(TEMP_REG1, inst.size());
+	const a64::Gp scratch = select_register(TEMP_REG1, inst.size());
 
 	bool optimized = false;
 	if (maskp.is_immediate() && shiftp.is_immediate())
@@ -3270,16 +3259,12 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 		const bool is_contiguous = (invlamask & (invlamask + 1)) == 0;
 		const auto s = shiftp.immediate() & (instbits - 1);
 
-		const a64::Gp src = select_register(SCRATCH_REG2, inst.size());
-
 		if (is_right_aligned || is_contiguous)
 		{
-			dst = can_use_dst_reg ? dstp.select_register(SCRATCH_REG1, inst.size()) : select_register(SCRATCH_REG1, inst.size());
 			mov_reg_param(a, inst.size(), dst, dstp);
 
 			uint32_t rot = 0;
 			uint32_t lsb = 0;
-
 			if (is_right_aligned)
 			{
 				// Optimize a contiguous right-aligned mask
@@ -3292,63 +3277,61 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 				lsb = instbits - pop - lz;
 			}
 
-			if (srcp.is_immediate() && rot > 0)
+			if (srcp.is_immediate() && (rot > 0))
 			{
-				// save some instructions by avoid mov to register by computing the ror and storing it into src directly
-				uint64_t result = 0;
-
+				// save some instructions by avoid mov to register by computing the ror and storing it into scratch directly
+				uint64_t result;
 				if (inst.size() == 4)
 					result = rotr_32(srcp.immediate(), rot);
 				else
 					result = rotr_64(srcp.immediate(), rot);
 
-				a.mov(src, result);
+				a.mov(scratch, result);
 			}
-			else
+			else if (rot > 0)
 			{
 				mov_reg_param(a, inst.size(), src, srcp);
 
-				if (rot > 0)
-					a.ror(src, src, rot);
+				a.ror(scratch, src, rot);
+			}
+			else
+			{
+				mov_reg_param(a, inst.size(), scratch, srcp);
 			}
 
-			a.bfi(dst, src, lsb, pop);
+			a.bfi(dst, scratch, lsb, pop);
 
 			optimized = true;
 		}
 		else if (srcp.is_immediate())
 		{
-			const a64::Gp scratch = select_register(SCRATCH_REG1, inst.size());
-
-			dst = dstp.select_register(TEMP_REG2, inst.size());
-
 			// val1 = src & ~PARAM3
-			if (is_valid_immediate_mask(maskp.immediate(), inst.size()))
+			if (is_valid_immediate_mask(~maskp.immediate() & util::make_bitmask<uint64_t>(instbits), inst.size()))
 			{
-				a.and_(dst, dst, ~maskp.immediate());
+				a.and_(dst, dst, ~maskp.immediate() & util::make_bitmask<uint64_t>(instbits));
 			}
 			else
 			{
-				a.mov(scratch, ~maskp.immediate());
+				get_imm_relative(a, scratch, ~maskp.immediate() & util::make_bitmask<uint64_t>(instbits));
 				a.and_(dst, dst, scratch);
 			}
 
-			uint64_t result = 0;
+			uint64_t result;
 			if (inst.size() == 4)
 				result = rotl_32(srcp.immediate(), s) & maskp.immediate();
 			else
 				result = rotl_64(srcp.immediate(), s) & maskp.immediate();
 
 			if (result != 0)
 			{
-				if (is_valid_immediate(result, 12))
+				if (is_valid_immediate_mask(result, inst.size()))
 				{
					a.orr(dst, dst, result);
 				}
 				else
 				{
-					a.mov(scratch, result);
-					a.orr(dst, dst, select_register(scratch, inst.size()));
+					get_imm_relative(a, scratch, result);
+					a.orr(dst, dst, scratch);
 				}
 			}
 
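
For a contiguous immediate mask, the path above collapses the insertion into one ror plus one bfi: rotate the source so the field sits at the bottom of its target slot, then bitfield-insert pop bits at lsb. A hedged C-level model of that pairing (rotr32 and bfi32 are stand-ins for the generated instructions, and rot = (lsb - s) mod width is one way to express the required rotate amount):

    #include <cassert>
    #include <cstdint>

    constexpr uint32_t rotr32(uint32_t x, unsigned s) { return (x >> (s & 31)) | (x << ((32 - s) & 31)); }

    // model of AArch64 BFI: insert the low `width` bits of src into dst at `lsb`
    constexpr uint32_t bfi32(uint32_t dst, uint32_t src, unsigned lsb, unsigned width)
    {
        const uint32_t field = (width < 32) ? ((1U << width) - 1) : ~0U;
        return (dst & ~(field << lsb)) | ((src & field) << lsb);
    }

    int main()
    {
        // ROLINS with shift 12 and the contiguous mask 0x00ffff00
        const uint32_t dst = 0xaabbccddU, src = 0x00123456U, mask = 0x00ffff00U;
        const unsigned s = 12, lsb = 8, pop = 16;   // 16 mask bits starting at bit 8

        // reference semantics: (dst & ~mask) | (rotl(src, s) & mask)
        const uint32_t rol = (src << s) | (src >> ((32 - s) & 31));
        const uint32_t expected = (dst & ~mask) | (rol & mask);

        // optimized form: one right rotate, then a single bitfield insert
        const unsigned rot = (lsb - s) & 31;
        assert(bfi32(dst, rotr32(src, rot), lsb, pop) == expected);
    }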
@@ -3358,40 +3341,57 @@ void drcbe_arm64::op_rolins(a64::Assembler &a, const uml::instruction &inst)
 
 	if (!optimized)
 	{
-		dst = can_use_dst_reg ? dstp.select_register(TEMP_REG2, inst.size()) : select_register(TEMP_REG2, inst.size());
-		mov_reg_param(a, inst.size(), dst, dstp);
+		const a64::Gp shift = shiftp.select_register(SCRATCH_REG1, inst.size());
+		const a64::Gp rshift = select_register(SCRATCH_REG1, inst.size());
 
-		const a64::Gp src = srcp.select_register(TEMP_REG1, inst.size());
-		const a64::Gp scratch = select_register(SCRATCH_REG1, inst.size());
+		if (!shiftp.is_immediate())
+		{
+			// do this first as dst could be the same register as shift
+			mov_reg_param(a, inst.size(), shift, shiftp);
 
-		mov_reg_param(a, inst.size(), src, srcp);
+			a.neg(rshift, shift);
+		}
+
+		mov_reg_param(a, inst.size(), dst, dstp);
 
 		if (shiftp.is_immediate())
 		{
 			const auto shift = -int64_t(shiftp.immediate()) & ((inst.size() * 8) - 1);
 
 			if (shift != 0)
+			{
+				mov_reg_param(a, inst.size(), src, srcp);
 				a.ror(scratch, src, shift);
+			}
 			else
-				a.mov(scratch, src);
+			{
+				mov_reg_param(a, inst.size(), scratch, srcp);
+			}
 		}
 		else
 		{
-			const a64::Gp shift = shiftp.select_register(SCRATCH_REG2, inst.size());
-			const a64::Gp scratch2 = select_register(FUNC_SCRATCH_REG, inst.size());
-			mov_reg_param(a, inst.size(), shift, shiftp);
+			mov_reg_param(a, inst.size(), src, srcp);
 
-			a.mov(scratch, inst.size() * 8);
-			a.and_(scratch2, shift, inst.size() * 8 - 1);
-			a.sub(scratch2, scratch, scratch2);
-			a.ror(scratch, src, scratch2);
+			a.and_(rshift, rshift, (inst.size() * 8) - 1);
+			a.ror(scratch, src, rshift);
 		}
 
-		const a64::Gp mask = maskp.select_register(SCRATCH_REG2, inst.size());
-		mov_reg_param(a, inst.size(), mask, maskp);
+		const a64::Gp mask = maskp.select_register(SCRATCH_REG1, inst.size());
+		if (!maskp.is_immediate() || !is_valid_immediate_mask(maskp.immediate(), inst.size()) || !is_valid_immediate_mask(~maskp.immediate() & util::make_bitmask<uint64_t>(instbits), inst.size()))
+			mov_reg_param(a, inst.size(), mask, maskp);
+
+		// val2 = val2 & PARAM3
+		if (maskp.is_immediate() && is_valid_immediate_mask(maskp.immediate(), inst.size()))
+			a.and_(scratch, scratch, maskp.immediate());
+		else
+			a.and_(scratch, scratch, mask);
+
+		// val1 = src & ~PARAM3
+		if (maskp.is_immediate() && is_valid_immediate_mask(~maskp.immediate() & util::make_bitmask<uint64_t>(instbits), inst.size()))
+			a.and_(dst, dst, ~maskp.immediate() & util::make_bitmask<uint64_t>(instbits));
+		else
+			a.bic(dst, dst, mask);
 
-		a.bic(dst, dst, mask); // val1 = src & ~PARAM3
-		a.and_(scratch, scratch, mask); // val2 = val2 & PARAM3
 		a.orr(dst, dst, scratch); // val1 | val2
 	}
 
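The fallback path now emitted above follows the UML reference semantics for ROLINS directly: rotate the source left, keep the bits selected by the mask, and preserve the destination bits outside it. A compact plain-C++ restatement of what the and/bic/orr sequence computes, including the neg-based rotate translation (a sketch, not the backend's code):

    #include <cassert>
    #include <cstdint>

    // reference semantics of a 32-bit ROLINS:
    // dst = (dst & ~mask) | (rotl(src, shift) & mask)
    constexpr uint32_t rolins32(uint32_t dst, uint32_t src, uint32_t shift, uint32_t mask)
    {
        const unsigned s = shift & 31;
        const uint32_t rotated = (src << s) | (src >> ((32 - s) & 31));
        return (dst & ~mask) | (rotated & mask);
    }

    // what the generated sequence computes: negate the left-rotate amount and
    // mask it to the width (neg + and), rotate right (AArch64 has ror but no
    // rol), then combine with and (val2 &= mask), bic (val1 = dst & ~mask)
    // and orr
    constexpr uint32_t rolins32_ror(uint32_t dst, uint32_t src, uint32_t shift, uint32_t mask)
    {
        const unsigned rshift = (0u - shift) & 31;
        const uint32_t val2 = ((src >> rshift) | (src << ((32 - rshift) & 31))) & mask;
        return (dst & ~mask) | val2;
    }

    int main()
    {
        for (uint32_t s = 0; s < 64; s++)
            assert(rolins32(0xdeadbeefU, 0x12345678U, s, 0x0ff00ff0U)
                    == rolins32_ror(0xdeadbeefU, 0x12345678U, s, 0x0ff00ff0U));
    }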