This lets SIMD16 programs on G45 and Gen5 use the PLN instruction.

On Ironlake:

total instructions in shared programs: 5634757 -> 5518055 (-2.07%)
instructions in affected programs: 1745837 -> 1629135 (-6.68%)
helped: 11439
HURT: 4

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
@@ -1265,8 +1265,7 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, | |||
emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)))); | |||
} else { | |||
emit(FS_OPCODE_LINTERP, wpos, | |||
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
interp_reg(VARYING_SLOT_POS, 2)); | |||
} | |||
wpos = offset(wpos, 1); | |||
@@ -1308,8 +1307,7 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, | |||
barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; | |||
} | |||
return emit(FS_OPCODE_LINTERP, attr, | |||
this->delta_x[barycoord_mode], | |||
this->delta_y[barycoord_mode], interp); | |||
this->delta_xy[barycoord_mode], interp); | |||
} | |||
void | |||
@@ -1859,8 +1857,8 @@ fs_visitor::assign_urb_setup() | |||
*/ | |||
foreach_block_and_inst(block, fs_inst, inst, cfg) { | |||
if (inst->opcode == FS_OPCODE_LINTERP) { | |||
assert(inst->src[2].file == HW_REG); | |||
inst->src[2].fixed_hw_reg.nr += urb_start; | |||
assert(inst->src[1].file == HW_REG); | |||
inst->src[1].fixed_hw_reg.nr += urb_start; | |||
} | |||
if (inst->opcode == FS_OPCODE_CINTERP) { | |||
@@ -2114,25 +2112,16 @@ fs_visitor::compact_virtual_grfs() | |||
} | |||
} | |||
/* Patch all the references to delta_x/delta_y, since they're used in | |||
* register allocation. If they're unused, switch them to BAD_FILE so | |||
* we don't think some random VGRF is delta_x/delta_y. | |||
/* Patch all the references to delta_xy, since they're used in register | |||
* allocation. If they're unused, switch them to BAD_FILE so we don't | |||
* think some random VGRF is delta_xy. | |||
*/ | |||
for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) { | |||
if (delta_x[i].file == GRF) { | |||
if (remap_table[delta_x[i].reg] != -1) { | |||
delta_x[i].reg = remap_table[delta_x[i].reg]; | |||
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { | |||
if (delta_xy[i].file == GRF) { | |||
if (remap_table[delta_xy[i].reg] != -1) { | |||
delta_xy[i].reg = remap_table[delta_xy[i].reg]; | |||
} else { | |||
delta_x[i].file = BAD_FILE; | |||
} | |||
} | |||
} | |||
for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) { | |||
if (delta_y[i].file == GRF) { | |||
if (remap_table[delta_y[i].reg] != -1) { | |||
delta_y[i].reg = remap_table[delta_y[i].reg]; | |||
} else { | |||
delta_y[i].file = BAD_FILE; | |||
delta_xy[i].file = BAD_FILE; | |||
} | |||
} | |||
} | |||
@@ -2685,14 +2674,9 @@ fs_visitor::opt_register_renaming() | |||
if (progress) { | |||
invalidate_live_intervals(); | |||
for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) { | |||
if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) { | |||
delta_x[i].reg = remap[delta_x[i].reg]; | |||
} | |||
} | |||
for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) { | |||
if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) { | |||
delta_y[i].reg = remap[delta_y[i].reg]; | |||
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { | |||
if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) { | |||
delta_xy[i].reg = remap[delta_xy[i].reg]; | |||
} | |||
} | |||
} |
@@ -514,8 +514,7 @@ public: | |||
fs_reg pixel_y; | |||
fs_reg wpos_w; | |||
fs_reg pixel_w; | |||
fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; | |||
fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; | |||
fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; | |||
fs_reg shader_start_time; | |||
fs_reg userplane[MAX_CLIP_PLANES]; | |||
@@ -391,12 +391,31 @@ void | |||
fs_generator::generate_linterp(fs_inst *inst, | |||
struct brw_reg dst, struct brw_reg *src) | |||
{ | |||
/* PLN reads: | |||
* / in SIMD16 \ | |||
* ----------------------------------- | |||
* | src1+0 | src1+1 | src1+2 | src1+3 | | |||
* |-----------------------------------| | |||
* |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)| | |||
* ----------------------------------- | |||
* | |||
* but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys: | |||
* | |||
* ----------------------------------- | |||
* | src1+0 | src1+1 | src1+2 | src1+3 | | |||
* |-----------------------------------| | |||
* |(x0, x1)|(y0, y1)| | | in SIMD8 | |||
* |-----------------------------------| | |||
* |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16 | |||
* ----------------------------------- | |||
* | |||
* See also: emit_interpolation_setup_gen4(). | |||
*/ | |||
struct brw_reg delta_x = src[0]; | |||
struct brw_reg delta_y = src[1]; | |||
struct brw_reg interp = src[2]; | |||
struct brw_reg delta_y = offset(src[0], dispatch_width / 8); | |||
struct brw_reg interp = src[1]; | |||
if (brw->has_pln && | |||
delta_y.nr == delta_x.nr + 1 && | |||
(brw->gen >= 7 || (delta_x.nr & 1) == 0)) { | |||
brw_PLN(p, dst, interp, delta_x); | |||
} else { |
@@ -1482,8 +1482,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) | |||
*/ | |||
no16("interpolate_at_* not yet supported in SIMD16 mode."); | |||
fs_reg dst_x = vgrf(2); | |||
fs_reg dst_y = offset(dst_x, 1); | |||
fs_reg dst_xy = vgrf(2); | |||
/* For most messages, we need one reg of ignored data; the hardware | |||
* requires mlen==1 even when there is no payload. in the per-slot | |||
@@ -1495,7 +1494,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) | |||
switch (instr->intrinsic) { | |||
case nir_intrinsic_interp_var_at_centroid: | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u)); | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u)); | |||
break; | |||
case nir_intrinsic_interp_var_at_sample: { | |||
@@ -1503,7 +1502,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) | |||
nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); | |||
assert(const_sample); | |||
unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0; | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, | |||
fs_reg(msg_data)); | |||
break; | |||
} | |||
@@ -1515,7 +1514,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) | |||
unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf; | |||
unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf; | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src, | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src, | |||
fs_reg(off_x | (off_y << 4))); | |||
} else { | |||
src = vgrf(glsl_type::ivec2_type); | |||
@@ -1548,7 +1547,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) | |||
} | |||
mlen = 2; | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src, | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, | |||
fs_reg(0u)); | |||
} | |||
break; | |||
@@ -1567,7 +1566,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) | |||
fs_reg src = interp_reg(instr->variables[0]->var->data.location, j); | |||
src.type = dest.type; | |||
emit(FS_OPCODE_LINTERP, dest, dst_x, dst_y, src); | |||
emit(FS_OPCODE_LINTERP, dest, dst_xy, src); | |||
dest = offset(dest, 1); | |||
} | |||
break; |
@@ -244,7 +244,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) | |||
} | |||
assert(reg == ra_reg_count); | |||
/* Add a special class for aligned pairs, which we'll put delta_x/y | |||
/* Add a special class for aligned pairs, which we'll put delta_xy | |||
* in on Gen <= 6 so that we can do PLN. | |||
*/ | |||
if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) { | |||
@@ -558,14 +558,14 @@ fs_visitor::assign_regs(bool allow_spilling) | |||
* second operand of a PLN instruction needs to be an | |||
* even-numbered register, so we have a special register class | |||
* wm_aligned_pairs_class to handle this case. pre-GEN6 always | |||
* uses this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the | |||
* uses this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the | |||
* second operand of a PLN instruction (since it doesn't support | |||
* any other interpolation modes). So all we need to do is find | |||
* that register and set it to the appropriate class. | |||
*/ | |||
if (screen->wm_reg_sets[rsi].aligned_pairs_class >= 0 && | |||
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF && | |||
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) { | |||
this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF && | |||
this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) { | |||
c = screen->wm_reg_sets[rsi].aligned_pairs_class; | |||
} | |||
@@ -593,8 +593,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) | |||
/* 1. collect interpolation factors */ | |||
fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1)); | |||
fs_reg dst_y = offset(dst_x, 1); | |||
fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1)); | |||
/* for most messages, we need one reg of ignored data; the hardware requires mlen==1 | |||
* even when there is no payload. in the per-slot offset case, we'll replace this with | |||
@@ -606,7 +605,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) | |||
switch (ir->operation) { | |||
case ir_unop_interpolate_at_centroid: | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u)); | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u)); | |||
break; | |||
case ir_binop_interpolate_at_sample: { | |||
@@ -614,7 +613,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) | |||
assert(sample_num || !"nonconstant sample number should have been lowered."); | |||
unsigned msg_data = sample_num->value.i[0] << 4; | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data)); | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data)); | |||
break; | |||
} | |||
@@ -623,7 +622,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) | |||
if (const_offset) { | |||
unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) | | |||
(pack_pixel_offset(const_offset->value.f[1]) << 4); | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src, | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src, | |||
fs_reg(msg_data)); | |||
} else { | |||
/* pack the operands: hw wants offsets as 4 bit signed ints */ | |||
@@ -656,7 +655,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) | |||
} | |||
mlen = 2 * reg_width; | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src, | |||
inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src, | |||
fs_reg(0u)); | |||
} | |||
break; | |||
@@ -678,8 +677,7 @@ fs_visitor::emit_interpolate_expression(ir_expression *ir) | |||
for (int i = 0; i < ir->type->vector_elements; i++) { | |||
int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i; | |||
emit(FS_OPCODE_LINTERP, res, | |||
dst_x, dst_y, | |||
emit(FS_OPCODE_LINTERP, res, dst_xy, | |||
fs_reg(interp_reg(var->data.location, ch))); | |||
res = offset(res, 1); | |||
} | |||
@@ -3443,31 +3441,31 @@ fs_visitor::emit_interpolation_setup_gen4() | |||
fs_reg(brw_imm_v(0x11001100)))); | |||
this->current_annotation = "compute pixel deltas from v0"; | |||
if (brw->has_pln) { | |||
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = | |||
vgrf(glsl_type::vec2_type); | |||
this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = | |||
offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1); | |||
this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = | |||
vgrf(glsl_type::vec2_type); | |||
const fs_reg &delta_xy = this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC]; | |||
const fs_reg xstart(negate(brw_vec1_grf(1, 0))); | |||
const fs_reg ystart(negate(brw_vec1_grf(1, 1))); | |||
if (brw->has_pln && dispatch_width == 16) { | |||
emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart)); | |||
emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart)); | |||
emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart)) | |||
->force_sechalf = true; | |||
emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart)) | |||
->force_sechalf = true; | |||
} else { | |||
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = | |||
vgrf(glsl_type::float_type); | |||
this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] = | |||
vgrf(glsl_type::float_type); | |||
emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart)); | |||
emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart)); | |||
} | |||
emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))))); | |||
emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))))); | |||
this->current_annotation = "compute pos.w and 1/pos.w"; | |||
/* Compute wpos.w. It's always in our setup, since it's needed to | |||
* interpolate the other attributes. | |||
*/ | |||
this->wpos_w = vgrf(glsl_type::float_type); | |||
emit(FS_OPCODE_LINTERP, wpos_w, | |||
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], | |||
interp_reg(VARYING_SLOT_POS, 3)); | |||
emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3)); | |||
/* Compute the pixel 1/W value from wpos.w. */ | |||
this->pixel_w = vgrf(glsl_type::float_type); | |||
emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w); | |||
@@ -3509,8 +3507,7 @@ fs_visitor::emit_interpolation_setup_gen6() | |||
for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) { | |||
uint8_t reg = payload.barycentric_coord_reg[i]; | |||
this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0)); | |||
this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0)); | |||
this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0)); | |||
} | |||
this->current_annotation = NULL; |
@@ -704,6 +704,13 @@ brw_vec8_grf(unsigned nr, unsigned subnr) | |||
return brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); | |||
} | |||
/**
 * Construct float[16] general-purpose register.
 *
 * Thin wrapper around brw_vec16_reg(): nr and subnr are forwarded
 * unchanged and the register file is fixed to BRW_GENERAL_REGISTER_FILE,
 * in the same way brw_vec8_grf() wraps brw_vec8_reg().
 */ | |||
static inline struct brw_reg | |||
brw_vec16_grf(unsigned nr, unsigned subnr) | |||
{ | |||
return brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr); | |||
} | |||
static inline struct brw_reg | |||
brw_uw8_grf(unsigned nr, unsigned subnr) |