tcg/i386: Use vgf2p8affineqb for MO_8 vector shifts
A constant matrix can describe the movement of the 8 bits, so these shifts can be performed with one instruction.

Logic courtesy of Andi Kleen <ak@linux.intel.com>:
https://gcc.gnu.org/pipermail/gcc-patches/2025-August/691624.html

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
parent
6c76a1f687
commit
cb25409792
1 changed file with 71 additions and 4 deletions
|
|
@ -4342,12 +4342,46 @@ int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Emit one INDEX_op_x86_vgf2p8affineqb_vec operation with MO_8 elements.
 *
 * @type:   vector type, forwarded both to the emitted op and to the
 *          constant-vector constructor.
 * @v0:     first vector operand handed to the op (the callers in this
 *          file pass their destination here).
 * @v1:     second vector operand (the callers pass their source here).
 * @matrix: 64-bit pattern turned into a constant vector of MO_64
 *          elements and used as the third vector operand.
 *
 * The trailing argument is always 0.
 * NOTE(review): presumably this is the instruction's imm8 (the additive
 * constant of the affine transform), which would explain the "0" suffix
 * in the function name -- confirm against the opcode definition.
 */
static void gen_vgf2p8affineqb0(TCGType type, TCGv_vec v0,
                                TCGv_vec v1, uint64_t matrix)
{
    TCGv_vec bit_matrix = tcg_constant_vec(type, MO_64, matrix);

    vec_gen_4(INDEX_op_x86_vgf2p8affineqb_vec, type, MO_8,
              tcgv_vec_arg(v0),
              tcgv_vec_arg(v1),
              tcgv_vec_arg(bit_matrix),
              0);
}
|
||||||
|
|
||||||
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
|
static void expand_vec_shi(TCGType type, unsigned vece, bool right,
|
||||||
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
||||||
{
|
{
|
||||||
|
static const uint64_t gf2_shi[2][8] = {
|
||||||
|
/* left shift */
|
||||||
|
{ 0,
|
||||||
|
0x0001020408102040ull,
|
||||||
|
0x0000010204081020ull,
|
||||||
|
0x0000000102040810ull,
|
||||||
|
0x0000000001020408ull,
|
||||||
|
0x0000000000010204ull,
|
||||||
|
0x0000000000000102ull,
|
||||||
|
0x0000000000000001ull },
|
||||||
|
/* right shift */
|
||||||
|
{ 0,
|
||||||
|
0x0204081020408000ull,
|
||||||
|
0x0408102040800000ull,
|
||||||
|
0x0810204080000000ull,
|
||||||
|
0x1020408000000000ull,
|
||||||
|
0x2040800000000000ull,
|
||||||
|
0x4080000000000000ull,
|
||||||
|
0x8000000000000000ull }
|
||||||
|
};
|
||||||
uint8_t mask;
|
uint8_t mask;
|
||||||
|
|
||||||
tcg_debug_assert(vece == MO_8);
|
tcg_debug_assert(vece == MO_8);
|
||||||
|
|
||||||
|
if (cpuinfo & CPUINFO_GFNI) {
|
||||||
|
gen_vgf2p8affineqb0(type, v0, v1, gf2_shi[right][imm]);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (right) {
|
if (right) {
|
||||||
mask = 0xff >> imm;
|
mask = 0xff >> imm;
|
||||||
tcg_gen_shri_vec(MO_16, v0, v1, imm);
|
tcg_gen_shri_vec(MO_16, v0, v1, imm);
|
||||||
|
|
@ -4361,6 +4395,16 @@ static void expand_vec_shi(TCGType type, unsigned vece, bool right,
|
||||||
static void expand_vec_sari(TCGType type, unsigned vece,
|
static void expand_vec_sari(TCGType type, unsigned vece,
|
||||||
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
||||||
{
|
{
|
||||||
|
static const uint64_t gf2_sar[8] = {
|
||||||
|
0,
|
||||||
|
0x0204081020408080ull,
|
||||||
|
0x0408102040808080ull,
|
||||||
|
0x0810204080808080ull,
|
||||||
|
0x1020408080808080ull,
|
||||||
|
0x2040808080808080ull,
|
||||||
|
0x4080808080808080ull,
|
||||||
|
0x8080808080808080ull,
|
||||||
|
};
|
||||||
TCGv_vec t1, t2;
|
TCGv_vec t1, t2;
|
||||||
|
|
||||||
if (imm >= (8 << vece) - 1) {
|
if (imm >= (8 << vece) - 1) {
|
||||||
|
|
@ -4371,6 +4415,11 @@ static void expand_vec_sari(TCGType type, unsigned vece,
|
||||||
|
|
||||||
switch (vece) {
|
switch (vece) {
|
||||||
case MO_8:
|
case MO_8:
|
||||||
|
if (cpuinfo & CPUINFO_GFNI) {
|
||||||
|
gen_vgf2p8affineqb0(type, v0, v1, gf2_sar[imm]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
/* Unpack to 16-bit, shift, and repack. */
|
/* Unpack to 16-bit, shift, and repack. */
|
||||||
t1 = tcg_temp_new_vec(type);
|
t1 = tcg_temp_new_vec(type);
|
||||||
t2 = tcg_temp_new_vec(type);
|
t2 = tcg_temp_new_vec(type);
|
||||||
|
|
@ -4422,12 +4471,30 @@ static void expand_vec_sari(TCGType type, unsigned vece,
|
||||||
static void expand_vec_rotli(TCGType type, unsigned vece,
|
static void expand_vec_rotli(TCGType type, unsigned vece,
|
||||||
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
TCGv_vec v0, TCGv_vec v1, TCGArg imm)
|
||||||
{
|
{
|
||||||
|
static const uint64_t gf2_rol[8] = {
|
||||||
|
0,
|
||||||
|
0x8001020408102040ull,
|
||||||
|
0x4080010204081020ull,
|
||||||
|
0x2040800102040810ull,
|
||||||
|
0x1020408001020408ull,
|
||||||
|
0x0810204080010204ull,
|
||||||
|
0x0408102040800102ull,
|
||||||
|
0x0204081020408001ull,
|
||||||
|
};
|
||||||
TCGv_vec t;
|
TCGv_vec t;
|
||||||
|
|
||||||
if (vece != MO_8 && have_avx512vbmi2) {
|
if (vece == MO_8) {
|
||||||
vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
|
if (cpuinfo & CPUINFO_GFNI) {
|
||||||
tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
|
gen_vgf2p8affineqb0(type, v0, v1, gf2_rol[imm]);
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (have_avx512vbmi2) {
|
||||||
|
vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
|
||||||
|
tcgv_vec_arg(v0), tcgv_vec_arg(v1),
|
||||||
|
tcgv_vec_arg(v1), imm);
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
t = tcg_temp_new_vec(type);
|
t = tcg_temp_new_vec(type);
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue