|
|
|
@@ -24,6 +24,7 @@ |
|
|
|
|
|
|
|
#include <algorithm> |
|
|
|
#include <map> |
|
|
|
#include <stack> |
|
|
|
|
|
|
|
#include "aco_ir.h" |
|
|
|
#include "vulkan/radv_shader.h" |
|
|
|
@@ -34,8 +35,9 @@ namespace { |
|
|
|
|
|
|
|
/** |
|
|
|
* The general idea of this pass is: |
|
|
|
* The CFG is traversed in reverse postorder (forward). |
|
|
|
* Per BB one wait_ctx is maintained. |
|
|
|
* The CFG is traversed in reverse postorder (forward) and loops are processed |
|
|
|
* several times until no progress is made. |
|
|
|
* Per BB two wait_ctx is maintained: an in-context and out-context. |
|
|
|
* The in-context is the joined out-contexts of the predecessors. |
|
|
|
* The context contains a map: gpr -> wait_entry |
|
|
|
* consisting of the information about the cnt values to be waited for. |
|
|
|
@@ -114,6 +116,19 @@ struct wait_imm { |
|
|
|
wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : |
|
|
|
vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} |
|
|
|
|
|
|
|
wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) |
|
|
|
{ |
|
|
|
vm = packed & 0xf; |
|
|
|
if (chip >= GFX9) |
|
|
|
vm |= (packed >> 10) & 0x30; |
|
|
|
|
|
|
|
exp = (packed >> 4) & 0x7; |
|
|
|
|
|
|
|
lgkm = (packed >> 8) & 0xf; |
|
|
|
if (chip >= GFX10) |
|
|
|
lgkm |= (packed >> 8) & 0x30; |
|
|
|
} |
|
|
|
|
|
|
|
uint16_t pack(enum chip_class chip) const |
|
|
|
{ |
|
|
|
uint16_t imm = 0; |
|
|
|
@@ -142,12 +157,14 @@ struct wait_imm { |
|
|
|
return imm; |
|
|
|
} |
|
|
|
|
|
|
|
void combine(const wait_imm& other) |
|
|
|
bool combine(const wait_imm& other) |
|
|
|
{ |
|
|
|
bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; |
|
|
|
vm = std::min(vm, other.vm); |
|
|
|
exp = std::min(exp, other.exp); |
|
|
|
lgkm = std::min(lgkm, other.lgkm); |
|
|
|
vs = std::min(vs, other.vs); |
|
|
|
return changed; |
|
|
|
} |
|
|
|
|
|
|
|
bool empty() const |
|
|
|
@@ -168,13 +185,17 @@ struct wait_entry { |
|
|
|
: imm(imm), events(event), counters(get_counters_for_event(event)), |
|
|
|
wait_on_read(wait_on_read), logical(logical) {} |
|
|
|
|
|
|
|
void join(const wait_entry& other) |
|
|
|
bool join(const wait_entry& other) |
|
|
|
{ |
|
|
|
bool changed = (other.events & ~events) || |
|
|
|
(other.counters & ~counters) || |
|
|
|
(other.wait_on_read && !wait_on_read); |
|
|
|
events |= other.events; |
|
|
|
counters |= other.counters; |
|
|
|
imm.combine(other.imm); |
|
|
|
changed |= imm.combine(other.imm); |
|
|
|
wait_on_read = wait_on_read || other.wait_on_read; |
|
|
|
assert(logical == other.logical); |
|
|
|
return changed; |
|
|
|
} |
|
|
|
|
|
|
|
void remove_counter(counter_type counter) |
|
|
|
@@ -237,8 +258,15 @@ struct wait_ctx { |
|
|
|
max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), |
|
|
|
unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {} |
|
|
|
|
|
|
|
void join(const wait_ctx* other, bool logical) |
|
|
|
bool join(const wait_ctx* other, bool logical) |
|
|
|
{ |
|
|
|
bool changed = other->exp_cnt > exp_cnt || |
|
|
|
other->vm_cnt > vm_cnt || |
|
|
|
other->lgkm_cnt > lgkm_cnt || |
|
|
|
other->vs_cnt > vs_cnt || |
|
|
|
(other->pending_flat_lgkm && !pending_flat_lgkm) || |
|
|
|
(other->pending_flat_vm && !pending_flat_vm); |
|
|
|
|
|
|
|
exp_cnt = std::max(exp_cnt, other->exp_cnt); |
|
|
|
vm_cnt = std::max(vm_cnt, other->vm_cnt); |
|
|
|
lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt); |
|
|
|
@@ -253,14 +281,18 @@ struct wait_ctx { |
|
|
|
if (entry.second.logical != logical) |
|
|
|
continue; |
|
|
|
|
|
|
|
if (it != gpr_map.end()) |
|
|
|
it->second.join(entry.second); |
|
|
|
else |
|
|
|
if (it != gpr_map.end()) { |
|
|
|
changed |= it->second.join(entry.second); |
|
|
|
} else { |
|
|
|
gpr_map.insert(entry); |
|
|
|
changed = true; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
for (unsigned i = 0; i < barrier_count; i++) |
|
|
|
barrier_imm[i].combine(other->barrier_imm[i]); |
|
|
|
changed |= barrier_imm[i].combine(other->barrier_imm[i]); |
|
|
|
|
|
|
|
return changed; |
|
|
|
} |
|
|
|
}; |
|
|
|
|
|
|
|
@@ -319,12 +351,27 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) |
|
|
|
return wait; |
|
|
|
} |
|
|
|
|
|
|
|
wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) |
|
|
|
{ |
|
|
|
if (instr->opcode == aco_opcode::s_waitcnt_vscnt && |
|
|
|
instr->definitions[0].physReg() == sgpr_null) { |
|
|
|
wait_imm imm; |
|
|
|
imm.vs = std::min<uint8_t>(imm.vs, static_cast<SOPK_instruction*>(instr)->imm); |
|
|
|
return imm; |
|
|
|
} else if (instr->opcode == aco_opcode::s_waitcnt) { |
|
|
|
return wait_imm(ctx.chip_class, static_cast<SOPK_instruction*>(instr)->imm); |
|
|
|
} |
|
|
|
return wait_imm(); |
|
|
|
} |
|
|
|
|
|
|
|
wait_imm kill(Instruction* instr, wait_ctx& ctx) |
|
|
|
{ |
|
|
|
wait_imm imm; |
|
|
|
if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) |
|
|
|
imm.combine(check_instr(instr, ctx)); |
|
|
|
|
|
|
|
imm.combine(parse_wait_instr(ctx, instr)); |
|
|
|
|
|
|
|
if (ctx.chip_class >= GFX10) { |
|
|
|
/* Seems to be required on GFX10 to achieve correct behaviour. |
|
|
|
* It shouldn't cost anything anyways since we're about to do s_endpgm. |
|
|
|
@@ -665,39 +712,23 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) |
|
|
|
{ |
|
|
|
std::vector<aco_ptr<Instruction>> new_instructions; |
|
|
|
|
|
|
|
wait_imm queued_imm; |
|
|
|
for (aco_ptr<Instruction>& instr : block.instructions) { |
|
|
|
wait_imm imm = kill(instr.get(), ctx); |
|
|
|
bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); |
|
|
|
|
|
|
|
if (!imm.empty()) |
|
|
|
emit_waitcnt(ctx, new_instructions, imm); |
|
|
|
queued_imm.combine(kill(instr.get(), ctx)); |
|
|
|
|
|
|
|
gen(instr.get(), ctx); |
|
|
|
|
|
|
|
if (instr->format != Format::PSEUDO_BARRIER) |
|
|
|
if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { |
|
|
|
if (!queued_imm.empty()) { |
|
|
|
emit_waitcnt(ctx, new_instructions, queued_imm); |
|
|
|
queued_imm = wait_imm(); |
|
|
|
} |
|
|
|
new_instructions.emplace_back(std::move(instr)); |
|
|
|
} |
|
|
|
|
|
|
|
/* check if this block is at the end of a loop */ |
|
|
|
for (unsigned succ_idx : block.linear_succs) { |
|
|
|
/* eliminate any remaining counters */ |
|
|
|
if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) { |
|
|
|
// TODO: we could do better if we only wait if the regs between the block and other predecessors differ |
|
|
|
|
|
|
|
aco_ptr<Instruction> branch = std::move(new_instructions.back()); |
|
|
|
new_instructions.pop_back(); |
|
|
|
|
|
|
|
wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter, |
|
|
|
ctx.exp_cnt ? 0 : wait_imm::unset_counter, |
|
|
|
ctx.lgkm_cnt ? 0 : wait_imm::unset_counter, |
|
|
|
ctx.vs_cnt ? 0 : wait_imm::unset_counter); |
|
|
|
emit_waitcnt(ctx, new_instructions, imm); |
|
|
|
|
|
|
|
new_instructions.push_back(std::move(branch)); |
|
|
|
|
|
|
|
ctx = wait_ctx(program); |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
block.instructions.swap(new_instructions); |
|
|
|
} |
|
|
|
|
|
|
|
@@ -705,23 +736,55 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) |
|
|
|
|
|
|
|
void insert_wait_states(Program* program) |
|
|
|
{ |
|
|
|
wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */ |
|
|
|
/* per BB ctx */ |
|
|
|
std::vector<bool> done(program->blocks.size()); |
|
|
|
wait_ctx in_ctx[program->blocks.size()]; |
|
|
|
wait_ctx out_ctx[program->blocks.size()]; |
|
|
|
for (unsigned i = 0; i < program->blocks.size(); i++) |
|
|
|
out_ctx[i] = wait_ctx(program); |
|
|
|
|
|
|
|
for (unsigned i = 0; i < program->blocks.size(); i++) { |
|
|
|
Block& current = program->blocks[i]; |
|
|
|
wait_ctx& in = out_ctx[current.index]; |
|
|
|
in_ctx[i] = wait_ctx(program); |
|
|
|
std::stack<unsigned> loop_header_indices; |
|
|
|
unsigned loop_progress = 0; |
|
|
|
|
|
|
|
for (unsigned i = 0; i < program->blocks.size();) { |
|
|
|
Block& current = program->blocks[i++]; |
|
|
|
wait_ctx ctx = in_ctx[current.index]; |
|
|
|
|
|
|
|
if (current.kind & block_kind_loop_header) { |
|
|
|
loop_header_indices.push(current.index); |
|
|
|
} else if (current.kind & block_kind_loop_exit) { |
|
|
|
bool repeat = false; |
|
|
|
if (loop_progress == loop_header_indices.size()) { |
|
|
|
i = loop_header_indices.top(); |
|
|
|
repeat = true; |
|
|
|
} |
|
|
|
loop_header_indices.pop(); |
|
|
|
loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size()); |
|
|
|
if (repeat) |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
bool changed = false; |
|
|
|
for (unsigned b : current.linear_preds) |
|
|
|
in.join(&out_ctx[b], false); |
|
|
|
changed |= ctx.join(&out_ctx[b], false); |
|
|
|
for (unsigned b : current.logical_preds) |
|
|
|
in.join(&out_ctx[b], true); |
|
|
|
changed |= ctx.join(&out_ctx[b], true); |
|
|
|
|
|
|
|
in_ctx[current.index] = ctx; |
|
|
|
|
|
|
|
if (done[current.index] && !changed) |
|
|
|
continue; |
|
|
|
|
|
|
|
if (current.instructions.empty()) |
|
|
|
if (current.instructions.empty()) { |
|
|
|
out_ctx[current.index] = ctx; |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth); |
|
|
|
done[current.index] = true; |
|
|
|
|
|
|
|
handle_block(program, current, ctx); |
|
|
|
|
|
|
|
handle_block(program, current, in); |
|
|
|
out_ctx[current.index] = ctx; |
|
|
|
} |
|
|
|
} |
|
|
|
|