Since Boolean values are either -1 (true) or 0 (false), b2f(inot(a)) maps
-1 => 0.0 and 0 => 1.0. This is equivalent to
1.0 + float(boolBitsToInt(a)). On Intel GPUs, ADD is one of the few
instructions that can type-convert during write to destination, so we can
achieve this in a single instruction:

    add g47F, g26D, 1D

v2: Fix swizzles.

v3: Fix typos in comments. Noticed by Ken.

All Gen6+ platforms had similar results. (Skylake shown)

Skylake
total instructions in shared programs: 15185583 -> 15184683 (<.01%)
instructions in affected programs: 239389 -> 238489 (-0.38%)
helped: 899
HURT: 1
helped stats (abs) min: 1 max: 2 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.15% max: 1.85% x̄: 0.49% x̃: 0.44%
HURT stats (abs) min: 2 max: 2 x̄: 2.00 x̃: 2
HURT stats (rel) min: 0.09% max: 0.09% x̄: 0.09% x̃: 0.09%
95% mean confidence interval for instructions value: -1.01 -0.99
95% mean confidence interval for instructions %-change: -0.51% -0.48%
Instructions are helped.

total cycles in shared programs: 370964249 -> 370961508 (<.01%)
cycles in affected programs: 1487586 -> 1484845 (-0.18%)
helped: 420
HURT: 268
helped stats (abs) min: 1 max: 232 x̄: 22.41 x̃: 6
helped stats (rel) min: 0.05% max: 22.60% x̄: 1.30% x̃: 0.41%
HURT stats (abs) min: 1 max: 230 x̄: 24.90 x̃: 10
HURT stats (rel) min: <.01% max: 21.60% x̄: 1.45% x̃: 0.52%
95% mean confidence interval for cycles value: -7.61 -0.36
95% mean confidence interval for cycles %-change: -0.44% -0.02%
Cycles are helped.

No changes on Iron Lake or GM45.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
@@ -205,6 +205,8 @@ public: | |||
void nir_emit_block(nir_block *block); | |||
void nir_emit_instr(nir_instr *instr); | |||
void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr); | |||
bool try_emit_b2fi_of_inot(const brw::fs_builder &bld, fs_reg result, | |||
nir_alu_instr *instr); | |||
void nir_emit_load_const(const brw::fs_builder &bld, | |||
nir_load_const_instr *instr); | |||
void nir_emit_vs_intrinsic(const brw::fs_builder &bld, |
@@ -753,6 +753,42 @@ fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr, | |||
} | |||
} | |||
bool | |||
fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld, | |||
fs_reg result, | |||
nir_alu_instr *instr) | |||
{ | |||
if (devinfo->gen < 6 || devinfo->gen >= 12) | |||
return false; | |||
nir_alu_instr *const inot_instr = nir_src_as_alu_instr(&instr->src[0].src); | |||
if (inot_instr == NULL || inot_instr->op != nir_op_inot) | |||
return false; | |||
/* HF is also possible as a destination on BDW+. For nir_op_b2i, the set | |||
* of valid size-changing combinations is a bit more complex. | |||
* | |||
* The source restriction is just because I was lazy about generating the | |||
* constant below. | |||
*/ | |||
if (nir_dest_bit_size(instr->dest.dest) != 32 || | |||
nir_src_bit_size(inot_instr->src[0].src) != 32) | |||
return false; | |||
/* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1, | |||
* this is float(1 + a). | |||
*/ | |||
fs_reg op; | |||
prepare_alu_destination_and_sources(bld, inot_instr, &op, false); | |||
bld.ADD(result, op, brw_imm_d(1)); | |||
assert(!instr->dest.saturate); | |||
return true; | |||
} | |||
void | |||
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) | |||
{ | |||
@@ -844,6 +880,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) | |||
case nir_op_b2f16: | |||
case nir_op_b2f32: | |||
case nir_op_b2f64: | |||
if (try_emit_b2fi_of_inot(bld, result, instr)) | |||
break; | |||
op[0].type = BRW_REGISTER_TYPE_D; | |||
op[0].negate = !op[0].negate; | |||
/* fallthrough */ |