소스 검색

i965/fs: Add empirically-determined instruction latencies for gen7.

v2: Actually switch on the other math instructions mentioned in the
    comment.
v3: Add timing data for textureSize(), and clean up some long comment
    lines.

Testing shader_time of fs16 shaders on a few frames of various apps:
nexuiz improved by 2.9% +/- 1.5% (n=10)
no difference on GLB2.5 (n=36, outliers removed)
no difference on GLB2.7 (n=25)
etqw improved by 2.6% +/- 2.2% (n=25)
no difference on lightsmark (n=25)

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
tags/gles3-fmt-v1
Eric Anholt 13 년 전
부모
커밋
2cae9f2d4a
1개의 변경된 파일, 179개의 추가 및 3개의 삭제
  1. 179
    3
      src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp

+ 179
- 3
src/mesa/drivers/dri/i965/brw_fs_schedule_instructions.cpp 파일 보기

@@ -57,7 +57,7 @@ static bool debug = false;
class schedule_node : public exec_node
{
public:
schedule_node(fs_inst *inst)
schedule_node(fs_inst *inst, int gen)
{
this->inst = inst;
this->child_array_size = 0;
@@ -67,10 +67,14 @@ public:
this->parent_count = 0;
this->unblocked_time = 0;

set_latency_gen4();
if (gen >= 7)
set_latency_gen7();
else
set_latency_gen4();
}

void set_latency_gen4();
void set_latency_gen7();

fs_inst *inst;
schedule_node **children;
@@ -120,6 +124,178 @@ schedule_node::set_latency_gen4()
}
}

/**
 * Fill in this node's latency from timings measured on gen7 hardware.
 *
 * Every case quotes the measurements behind its number: the cycle count
 * for the instruction sequence by itself, followed by the count once an
 * instruction reading the result is appended.  For the MAD, math, POW and
 * default cases the value chosen equals the difference between those two
 * counts; for the message-based cases (sampler, pull constants) the
 * per-case comment explains how the estimate was picked.
 */
void
schedule_node::set_latency_gen7()
{
   int cycles;

   switch (inst->opcode) {
   case BRW_OPCODE_MAD:
      /* The MAD by itself: 3 cycles (sometimes said to be 4, depending on
       * the register numbers used by the sources):
       * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
       *
       * With a MOV consuming the result: 20 cycles:
       * mad(8) g4<1>F g2.2<4,1,1>F.x g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
       * mov(8) null g4<4,4,1>F { align16 WE_normal 1Q };
       */
      cycles = 17;
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* The math instruction by itself: 2 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       *
       * With a MOV consuming the result: 18 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F null { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * exp2, log2, rsq, sqrt, sin and cos all measured the same.
       */
      cycles = 16;
      break;

   case SHADER_OPCODE_POW:
      /* The POW by itself: 2 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       *
       * With a MOV consuming the result: 26 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F g2.1<0,1,0>F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      cycles = 24;
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
      /* Issuing the sampler message without reading it back: 18 cycles:
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       *
       * Adding a MOV that reads the result: 697 +/-49 cycles
       * (min 610, n=26):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * That is the first texture load of the batchbuffer, when the caches
       * are still cold, so a cold load costs ~700 cycles.
       *
       * Appending a second load and MOV: 840 +/- 92 cycles
       * (min 720, n=25):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * The second load only adds ~140 cycles; taking off the 14 cycles of
       * the MOV's latency leaves ~130.
       *
       * Two back-to-back loads followed by a single MOV: 683 +/- 49 cycles
       * (min = 602, n=47):
       * mov(8) g115<1>F 0F { align1 WE_normal 1Q };
       * mov(8) g114<1>F 0F { align1 WE_normal 1Q };
       * send(8) g4<1>UW g114<8,8,1>F
       * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * send(8) g50<1>UW g114<8,8,1>F
       * sampler (10, 0, 0, 1) mlen 2 rlen 4 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * This lines up with the single cache-cold load even though two
       * loads were issued, so the unit appears to be pipelined.  Reading
       * g50 instead of g4 in the final MOV still gives 693 +/- 52 (n=39).
       *
       * The value used here is simply some number between the cache-hot
       * 140 cycles and the cache-cold 700 cycles; no particular tuning was
       * done on it.
       *
       * The non-TEX opcodes have not been tested significantly.  TXL at
       * least looked about the same as TEX.
       */
      cycles = 200;
      break;

   case SHADER_OPCODE_TXS:
      /* Measured with textureSize(sampler2D, 0).  One load took
       * 420 +/- 41 cycles (n=15):
       * mov(8) g114<1>UD 0D { align1 WE_normal 1Q };
       * send(8) g6<1>UW g114<8,8,1>F
       * sampler (10, 0, 10, 1) mlen 1 rlen 4 { align1 WE_normal 1Q };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1Q };
       *
       *
       * Two loads took 535 +/- 30 cycles (n=19):
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * send(16) g6<1>UW g114<8,8,1>F
       * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g114<1>UD 0D { align1 WE_normal 1H };
       * mov(16) g6<1>F g6<8,8,1>D { align1 WE_normal 1H };
       * send(16) g8<1>UW g114<8,8,1>F
       * sampler (10, 0, 10, 2) mlen 2 rlen 8 { align1 WE_normal 1H };
       * mov(16) g8<1>F g8<8,8,1>D { align1 WE_normal 1H };
       * add(16) g6<1>F g6<8,8,1>F g8<8,8,1>F { align1 WE_normal 1H };
       *
       * The only caches that should matter here are the instruction/state
       * cache holding the surface state, so assume the caches are always
       * hot.
       */
      cycles = 100;
      break;

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      /* Measured using varying-index pull constants.
       *
       * Issuing the load without reading it back: 16 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       *
       * Adding a MOV that reads the result: ~480 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * Appending a second load and MOV: ~620 cycles:
       * mov(8) g4<1>D g2.1<0,1,0>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       * send(8) g4<1>F g4<8,8,1>D
       * data (9, 2, 3) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       *
       * So a cache-hot load is about 140 and a cache-cold one about 460.
       * We expect to be cache hot most of the time, so pick something
       * closer to that end.
       */
      cycles = 200;
      break;

   default:
      /* Any other instruction, measured with a MUL.  By itself: 2 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       *
       * With a MOV consuming the result: 16 cycles:
       * mul(8) g4<1>F g2<0,1,0>F 0.5F { align1 WE_normal 1Q };
       * mov(8) null g4<8,8,1>F { align1 WE_normal 1Q };
       */
      cycles = 14;
      break;
   }

   /* Every switch arm above sets `cycles`, so this is the only write. */
   latency = cycles;
}

class instruction_scheduler {
public:
instruction_scheduler(fs_visitor *v, void *mem_ctx, int grf_count,
@@ -159,7 +335,7 @@ public:
void
instruction_scheduler::add_inst(fs_inst *inst)
{
schedule_node *n = new(mem_ctx) schedule_node(inst);
schedule_node *n = new(mem_ctx) schedule_node(inst, v->intel->gen);

assert(!inst->is_head_sentinel());
assert(!inst->is_tail_sentinel());

Loading…
취소
저장