浏览代码

i965/fs: Combine pixel center calculation into one inst.

The X and Y values come interleaved in g1 (.4-.11 inclusive), so we can
calculate them together with a single instruction: an add(32) in SIMD16
on Broadwell and newer, or an add(16) in SIMD8 elsewhere.

Note that I also moved the PIXEL_X/PIXEL_Y virtual opcodes from before
LINTERP to after it. That's because the writes_accumulator_implicitly()
function in backend_instruction tests for <= LINTERP for determining
whether the instruction indeed writes the accumulator implicitly. The
old FS_OPCODE_PIXEL_X/Y emitted ADD instructions, which did write it,
but the new opcodes just emit MOVs, which don't. In practice it doesn't
matter, since we don't use these opcodes on Gen4/5 anymore, but moving
them keeps that check correct in case we ever do.

On Broadwell:
total instructions in shared programs: 7192355 -> 7186224 (-0.09%)
instructions in affected programs:     1190700 -> 1184569 (-0.51%)
helped:                                6131

On Haswell:
total instructions in shared programs: 6155979 -> 6152800 (-0.05%)
instructions in affected programs:     652362 -> 649183 (-0.49%)
helped:                                3179

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
tags/10.6-branchpoint
Matt Turner 10 年前
父节点
当前提交
529064f6a8

+ 2
- 0
src/mesa/drivers/dri/i965/brw_defines.h 查看文件

@@ -925,6 +925,8 @@ enum opcode {
FS_OPCODE_DDY_FINE,
FS_OPCODE_CINTERP,
FS_OPCODE_LINTERP,
FS_OPCODE_PIXEL_X,
FS_OPCODE_PIXEL_Y,
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,

+ 10
- 0
src/mesa/drivers/dri/i965/brw_fs_generator.cpp 查看文件

@@ -1940,6 +1940,16 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
case FS_OPCODE_LINTERP:
generate_linterp(inst, dst, src);
break;
case FS_OPCODE_PIXEL_X:
assert(src[0].type == BRW_REGISTER_TYPE_UW);
src[0].subnr = 0 * type_sz(src[0].type);
brw_MOV(p, dst, stride(src[0], 8, 4, 1));
break;
case FS_OPCODE_PIXEL_Y:
assert(src[0].type == BRW_REGISTER_TYPE_UW);
src[0].subnr = 4 * type_sz(src[0].type);
brw_MOV(p, dst, stride(src[0], 8, 4, 1));
break;
case SHADER_OPCODE_TEX:
case FS_OPCODE_TXB:
case SHADER_OPCODE_TXD:

+ 51
- 20
src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 查看文件

@@ -3478,27 +3478,58 @@ fs_visitor::emit_interpolation_setup_gen6()
{
struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

/* If the pixel centers end up used, the setup is the same as for gen4. */
this->current_annotation = "compute pixel centers";
fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
int_pixel_x.type = BRW_REGISTER_TYPE_UW;
int_pixel_y.type = BRW_REGISTER_TYPE_UW;
emit(ADD(int_pixel_x,
fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
fs_reg(brw_imm_v(0x10101010))));
emit(ADD(int_pixel_y,
fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
fs_reg(brw_imm_v(0x11001100))));

/* As of gen6, we can no longer mix float and int sources. We have
* to turn the integer pixel centers into floats for their actual
* use.
*/
this->pixel_x = vgrf(glsl_type::float_type);
this->pixel_y = vgrf(glsl_type::float_type);
emit(MOV(this->pixel_x, int_pixel_x));
emit(MOV(this->pixel_y, int_pixel_y));
if (brw->gen >= 8 || dispatch_width == 8) {
/* The "Register Region Restrictions" page says for BDW (and newer,
* presumably):
*
* "When destination spans two registers, the source may be one or
* two registers. The destination elements must be evenly split
* between the two registers."
*
* Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
* compute our pixel centers.
*/
fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
BRW_REGISTER_TYPE_UW, dispatch_width * 2);
emit(ADD(int_pixel_xy,
fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
fs_reg(brw_imm_v(0x11001010))))
->force_writemask_all = true;

this->pixel_x = vgrf(glsl_type::float_type);
this->pixel_y = vgrf(glsl_type::float_type);
emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
} else {
/* The "Register Region Restrictions" page says for SNB, IVB, HSW:
*
* "When destination spans two registers, the source MUST span two
* registers."
*
* Since the GRF source of the ADD will only read a single register, we
* must do two separate ADDs in SIMD16.
*/
fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
int_pixel_x.type = BRW_REGISTER_TYPE_UW;
int_pixel_y.type = BRW_REGISTER_TYPE_UW;
emit(ADD(int_pixel_x,
fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
fs_reg(brw_imm_v(0x10101010))));
emit(ADD(int_pixel_y,
fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
fs_reg(brw_imm_v(0x11001100))));

/* As of gen6, we can no longer mix float and int sources. We have
* to turn the integer pixel centers into floats for their actual
* use.
*/
this->pixel_x = vgrf(glsl_type::float_type);
this->pixel_y = vgrf(glsl_type::float_type);
emit(MOV(this->pixel_x, int_pixel_x));
emit(MOV(this->pixel_y, int_pixel_y));
}

this->current_annotation = "compute pos.w";
this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));

正在加载...
取消
保存