Browse Source

nvc0: detect no-op MIN/MAX, do CSE earlier to succeed more often

tags/android-x86-2.2-r2
Christoph Bumiller 14 years ago
parent
commit
f0d7429623
1 changed files with 79 additions and 48 deletions
  1. 79
    48
      src/gallium/drivers/nvc0/nvc0_pc_optimize.c

+ 79
- 48
src/gallium/drivers/nvc0/nvc0_pc_optimize.c View File

@@ -607,25 +607,83 @@ constant_operand(struct nv_pc *pc,
}
}

static void
handle_min_max(struct nv_pass *ctx, struct nv_instruction *nvi)
{
struct nv_value *src0 = nvi->src[0]->value;
struct nv_value *src1 = nvi->src[1]->value;

if (src0 != src1 || (nvi->src[0]->mod | nvi->src[1]->mod))
return;
if (src0->reg.file != NV_FILE_GPR)
return;
nvc0_pc_replace_value(ctx->pc, nvi->def[0], src0);
nvc0_insn_delete(nvi);
}

/* check if we can MUL + ADD -> MAD/FMA */
static void
handle_add_mul(struct nv_pass *ctx, struct nv_instruction *nvi)
{
struct nv_value *src0 = nvi->src[0]->value;
struct nv_value *src1 = nvi->src[1]->value;
struct nv_value *src;
int s;
uint8_t mod[4];

if (SRC_IS_MUL(src0) && src0->refc == 1) s = 0;
else
if (SRC_IS_MUL(src1) && src1->refc == 1) s = 1;
else
return;

if ((src0->insn && src0->insn->bb != nvi->bb) ||
(src1->insn && src1->insn->bb != nvi->bb))
return;

/* check for immediates from prior constant folding */
if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
return;
src = nvi->src[s]->value;

mod[0] = nvi->src[0]->mod;
mod[1] = nvi->src[1]->mod;
mod[2] = src->insn->src[0]->mod;
mod[3] = src->insn->src[1]->mod;

if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
return;

nvi->opcode = NV_OP_MAD_F32;

nv_reference(ctx->pc, nvi, s, NULL);
nvi->src[2] = nvi->src[!s];
nvi->src[!s] = NULL;

nv_reference(ctx->pc, nvi, 0, src->insn->src[0]->value);
nvi->src[0]->mod = mod[2] ^ mod[s];
nv_reference(ctx->pc, nvi, 1, src->insn->src[1]->value);
nvi->src[1]->mod = mod[3];
}

static int
nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
nv_pass_algebraic_opt(struct nv_pass *ctx, struct nv_basic_block *b)
{
struct nv_instruction *nvi, *next;
int j;

for (nvi = b->entry; nvi; nvi = next) {
struct nv_value *src0, *src1, *src;
int s;
uint8_t mod[4];
struct nv_value *src0, *src1;
uint baseop = NV_BASEOP(nvi->opcode);

next = nvi->next;

src0 = nvc0_pc_find_immediate(nvi->src[0]);
src1 = nvc0_pc_find_immediate(nvi->src[1]);

if (src0 && src1)
if (src0 && src1) {
constant_expression(ctx->pc, nvi, src0, src1);
else {
} else {
if (src0)
constant_operand(ctx->pc, nvi, src0, 0);
else
@@ -633,44 +691,13 @@ nv_pass_lower_arith(struct nv_pass *ctx, struct nv_basic_block *b)
constant_operand(ctx->pc, nvi, src1, 1);
}

/* check if we can MUL + ADD -> MAD/FMA */
if (nvi->opcode != NV_OP_ADD)
continue;

src0 = nvi->src[0]->value;
src1 = nvi->src[1]->value;

if (SRC_IS_MUL(src0) && src0->refc == 1)
src = src0;
else
if (SRC_IS_MUL(src1) && src1->refc == 1)
src = src1;
if (baseop == NV_OP_MIN || baseop == NV_OP_MAX)
handle_min_max(ctx, nvi);
else
continue;

/* could have an immediate from above constant_* */
if (src0->reg.file != NV_FILE_GPR || src1->reg.file != NV_FILE_GPR)
continue;
s = (src == src0) ? 0 : 1;

mod[0] = nvi->src[0]->mod;
mod[1] = nvi->src[1]->mod;
mod[2] = src->insn->src[0]->mod;
mod[3] = src->insn->src[0]->mod;

if ((mod[0] | mod[1] | mod[2] | mod[3]) & ~NV_MOD_NEG)
continue;

nvi->opcode = NV_OP_MAD;
nv_reference(ctx->pc, nvi, s, NULL);
nvi->src[2] = nvi->src[!s];

nvi->src[0] = new_ref(ctx->pc, src->insn->src[0]->value);
nvi->src[1] = new_ref(ctx->pc, src->insn->src[1]->value);
nvi->src[0]->mod = mod[2] ^ mod[s];
nvi->src[1]->mod = mod[3];
if (nvi->opcode == NV_OP_ADD_F32)
handle_add_mul(ctx, nvi);
}
DESCEND_ARBITRARY(j, nv_pass_lower_arith);
DESCEND_ARBITRARY(j, nv_pass_algebraic_opt);

return 0;
}
@@ -1158,11 +1185,17 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
pass.n = 0;
pass.pc = pc;

/* Do CSE so we can just compare values by pointer in subsequent passes. */
pc->pass_seq++;
ret = nv_pass_cse(&pass, root);
if (ret)
return ret;

/* Do this first, so we don't have to pay attention
* to whether sources are supported memory loads.
*/
pc->pass_seq++;
ret = nv_pass_lower_arith(&pass, root);
ret = nv_pass_algebraic_opt(&pass, root);
if (ret)
return ret;

@@ -1190,11 +1223,9 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
reldelim->pc = pc;
}

pc->pass_seq++;
ret = nv_pass_cse(&pass, root);
if (ret)
return ret;

/* May run DCE before load-combining since that pass will clean up
* after itself.
*/
dce.pc = pc;
do {
dce.removed = 0;

Loading…
Cancel
Save