target/arm: Implement SVE2p1 PEXT

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Message-id: 20250704142112.1018902-83-richard.henderson@linaro.org
Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
Richard Henderson 2025-07-04 08:20:45 -06:00 committed by Peter Maydell
parent 8f7e127b66
commit 16fe3bb942
5 changed files with 146 additions and 0 deletions

View file

@ -2953,3 +2953,5 @@ DEF_HELPER_FLAGS_4(sve2p1_uminqv_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2p1_uminqv_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2p1_uminqv_s, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
DEF_HELPER_FLAGS_4(sve2p1_uminqv_d, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
/*
 * PEXT helper: the arguments are a pointer to the destination predicate
 * register, the 32-bit predicate-as-counter value, and a 32-bit descriptor.
 */
DEF_HELPER_FLAGS_3(pext, TCG_CALL_NO_RWG, void, ptr, i32, i32)

View file

@ -60,6 +60,7 @@
%rn_ax2 6:4 !function=times_2
# Predicate-as-counter register fields (destination and source).  Each is a
# 3-bit encoding passed through plus_8 -- presumably selecting one of the
# upper eight predicate registers; confirm against the plus_8 definition.
%pnd 0:3 !function=plus_8
%pnn 5:3 !function=plus_8
###########################################################################
# Named attribute sets. These are used to make nice(er) names
@ -823,6 +824,11 @@ WHILE_lt_cnt4 00100101 .. 1 ..... 0110 . 1 ..... 1 . ... @while_cnt
WHILE_gt_cnt2 00100101 .. 1 ..... 0100 . 0 ..... 1 . ... @while_cnt
WHILE_gt_cnt4 00100101 .. 1 ..... 0110 . 0 ..... 1 . ... @while_cnt
# SVE2.1 extract mask predicate from predicate-as-counter
#
# Both forms share the &pext argument set: destination predicate (rd),
# source predicate-as-counter (rn, taken from the %pnn field), element
# size (esz) and a portion selector (imm).
# PEXT_1 carries a 2-bit imm and PEXT_2 a 1-bit imm; the translator
# instantiates them with one and two destination registers respectively.
&pext rd rn esz imm
PEXT_1 00100101 esz:2 1 00000 0111 00 imm:2 ... 1 rd:4 &pext rn=%pnn
PEXT_2 00100101 esz:2 1 00000 0111 010 imm:1 ... 1 rd:4 &pext rn=%pnn
### SVE Integer Wide Immediate - Unpredicated Group
# SVE broadcast floating-point immediate (unpredicated)

View file

@ -7821,3 +7821,31 @@ DO_FCVTLT(sve2_fcvtlt_sd, uint64_t, uint32_t, H1_8, H1_4, float32_to_float64)
#undef DO_FCVTLT
#undef DO_FCVTNT
void HELPER(pext)(void *vd, uint32_t png, uint32_t desc)
{
int pl = FIELD_EX32(desc, PREDDESC, OPRSZ);
int vl = pl * 8;
unsigned v_esz = FIELD_EX32(desc, PREDDESC, ESZ);
int part = FIELD_EX32(desc, PREDDESC, DATA);
DecodeCounter p = decode_counter(png, vl, v_esz);
uint64_t mask = pred_esz_masks[v_esz + p.lg2_stride];
ARMPredicateReg *d = vd;
/*
* Convert from element count to byte count and adjust
* for the portion of the 4*VL counter to be extracted.
*/
int b_count = (p.count << v_esz) - vl * part;
memset(d, 0, sizeof(*d));
if (p.invert) {
if (b_count <= 0) {
do_whilel(vd, mask, vl, vl);
} else if (b_count < vl) {
do_whileg(vd, mask, vl - b_count, vl);
}
} else if (b_count > 0) {
do_whilel(vd, mask, MIN(b_count, vl), vl);
}
}

View file

@ -3336,6 +3336,42 @@ static bool trans_WHILE_ptr(DisasContext *s, arg_WHILE_ptr *a)
return true;
}
/*
 * Emit code for PEXT with @n destination predicates (1 or 2 as
 * instantiated below).  Returns true in all cases, including when the
 * SVE access check fails.
 */
static bool do_pext(DisasContext *s, arg_pext *a, int n)
{
    TCGv_i32 counter;
    TCGv_ptr dest;
    unsigned desc_base = 0;
    int pred_size;

    if (!sve_access_check(s)) {
        return true;
    }

    /*
     * Only 16 bits of the source predicate are loaded.  The XOR with 6
     * presumably redirects the load to the least-significant halfword
     * of the first 64-bit word on big-endian hosts.
     */
    counter = tcg_temp_new_i32();
    tcg_gen_ld16u_i32(counter, tcg_env,
                      pred_full_reg_offset(s, a->rn) ^
                      (HOST_BIG_ENDIAN ? 6 : 0));

    dest = tcg_temp_new_ptr();
    pred_size = pred_full_reg_size(s);

    /* Descriptor fields that are the same for every destination. */
    desc_base = FIELD_DP32(desc_base, PREDDESC, OPRSZ, pred_size);
    desc_base = FIELD_DP32(desc_base, PREDDESC, ESZ, a->esz);

    for (int k = 0; k < n; ++k) {
        /* Destination registers are consecutive, wrapping modulo 16. */
        int dreg = (a->rd + k) % 16;
        /* Slice index handed to the helper for this destination. */
        int slice = a->imm * n + k;
        unsigned desc = FIELD_DP32(desc_base, PREDDESC, DATA, slice);

        tcg_gen_addi_ptr(dest, tcg_env, pred_full_reg_offset(s, dreg));
        gen_helper_pext(dest, counter, tcg_constant_i32(desc));
    }
    return true;
}
/*
 * PEXT_1 is translated by do_pext with n = 1 (single destination) and
 * PEXT_2 with n = 2 (pair of destinations).  Both name
 * aa64_sme2_or_sve2p1 as their feature test.
 */
TRANS_FEAT(PEXT_1, aa64_sme2_or_sve2p1, do_pext, a, 1)
TRANS_FEAT(PEXT_2, aa64_sme2_or_sve2p1, do_pext, a, 2)
/*
*** SVE Integer Wide Immediate - Unpredicated Group
*/

View file

@ -337,4 +337,78 @@ bfloat16 helper_sme2_ah_fmin_b16(bfloat16 a, bfloat16 b, float_status *fpst);
float32 sve_f16_to_f32(float16 f, float_status *fpst);
float16 sve_f32_to_f16(float32 f, float_status *fpst);
/*
* Decode helper functions for predicate as counter.
*/
/* Result of decode_counter(): a predicate-as-counter value in decoded form. */
typedef struct {
/* Element count, already rescaled to the caller's vector element size. */
unsigned count;
/* log2 of the element stride; non-zero only when the counter's own
 * element size is larger than the vector element size. */
unsigned lg2_stride;
/* Bit 15 of the counter value. */
bool invert;
} DecodeCounter;
/*
 * Decode the predicate-as-counter value @png for a vector of @vl bytes
 * whose elements have log2 size @v_esz.
 *
 * Returns the element count, the log2 element stride and the invert flag.
 * All three are zero when the low four bits of @png are clear.
 * C.f. Arm pseudocode CounterToPredicate.
 */
static inline DecodeCounter
decode_counter(unsigned png, unsigned vl, unsigned v_esz)
{
    DecodeCounter out = { };
    unsigned p_esz, count_mask, elems;

    if (unlikely((png & 0xf) == 0)) {
        return out;
    }

    /* The lowest set bit encodes the counter's own element size. */
    p_esz = ctz32(png);

    /*
     * The count field ends at bit maxbit, where
     *   maxbit = log2(pl(bits) * 4) = log2(vl(bytes) * 4) = log2(vl) + 2
     * so the mask ones<maxbit:0> is
     *   (1 << (log2(vl) + 3)) - 1 = (pow2ceil(vl) << 3) - 1
     * The count itself sits just above the element-size marker bit.
     */
    count_mask = ((unsigned)pow2ceil(vl) << 3) - 1;
    elems = (png & count_mask) >> (p_esz + 1);

    out.invert = (png >> 15) & 1;

    /*
     * The Arm pseudocode expands the count into a bit-per-byte predicate
     * and then treats it like any other predicate.  The expansion is
     * avoided here by rescaling the count and, where needed, handing the
     * caller an element stride.
     */
    if (p_esz < v_esz) {
        /*
         * Counter elements are narrower than vector elements, so the
         * expanded predicate would set more bits than are consumed.
         * Scale the count down, rounding up.  For example with
         * p_esz = MO_8, v_esz = MO_32 and count 14 the expansion is
         *     0011 1111 1111 1111
         * of which only
         *     ...1 ...1 ...1 ...1
         * are significant, giving four active elements.
         */
        unsigned shift = v_esz - p_esz;
        unsigned low_bits = elems & ((1u << shift) - 1);

        elems = (elems >> shift) + (low_bits != 0);
    } else if (p_esz > v_esz) {
        /*
         * Counter elements are wider than vector elements, so the
         * expanded predicate has bits only at power-of-two multiples of
         * the vector element size.  Scale the count up and report the
         * stride of elements to skip.
         */
        unsigned shift = p_esz - v_esz;

        elems <<= shift;
        out.lg2_stride = shift;
    }

    out.count = elems;
    return out;
}
#endif /* TARGET_ARM_VEC_INTERNAL_H */