|  |  | @@ -24,6 +24,7 @@ | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | #include <algorithm> | 
		
	
		
			
			|  |  |  | #include <map> | 
		
	
		
			
			|  |  |  | #include <stack> | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | #include "aco_ir.h" | 
		
	
		
			
			|  |  |  | #include "vulkan/radv_shader.h" | 
		
	
	
		
			
			|  |  | @@ -34,8 +35,9 @@ namespace { | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | /** | 
		
	
		
			
			|  |  |  | * The general idea of this pass is: | 
		
	
		
			
			|  |  |  | * The CFG is traversed in reverse postorder (forward). | 
		
	
		
			
			|  |  |  | * Per BB one wait_ctx is maintained. | 
		
	
		
			
			|  |  |  | * The CFG is traversed in reverse postorder (forward) and loops are processed | 
		
	
		
			
			|  |  |  | * several times until no progress is made. | 
		
	
		
			
			|  |  |  | * Per BB two wait_ctx are maintained: an in-context and an out-context. | 
		
	
		
			
			|  |  |  | * The in-context is the joined out-contexts of the predecessors. | 
		
	
		
			
			|  |  |  | * The context contains a map: gpr -> wait_entry | 
		
	
		
			
			|  |  |  | * consisting of the information about the cnt values to be waited for. | 
		
	
	
		
			
			|  |  | @@ -114,6 +116,19 @@ struct wait_imm { | 
		
	
		
			
			|  |  |  | wait_imm(uint16_t vm_, uint16_t exp_, uint16_t lgkm_, uint16_t vs_) : | 
		
	
		
			
			|  |  |  | vm(vm_), exp(exp_), lgkm(lgkm_), vs(vs_) {} | 
		
	
		
			
			|  |  |  | /* Decode a packed wait-count immediate into vm/exp/lgkm for the given chip; vs starts as unset_counter. */
 | 
		
	
		
			
			|  |  |  | wait_imm(enum chip_class chip, uint16_t packed) : vs(unset_counter) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | vm = packed & 0xf; /* vm[3:0] comes from packed[3:0] */ | 
		
	
		
			
			|  |  |  | if (chip >= GFX9) | 
		
	
		
			
			|  |  |  | vm |= (packed >> 10) & 0x30; /* GFX9+: packed[15:14] supply vm[5:4] */ | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | exp = (packed >> 4) & 0x7; /* exp is the 3-bit field packed[6:4] */ | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | lgkm = (packed >> 8) & 0xf; /* lgkm[3:0] comes from packed[11:8] */ | 
		
	
		
			
			|  |  |  | if (chip >= GFX10) | 
		
	
		
			
			|  |  |  | lgkm |= (packed >> 8) & 0x30; /* GFX10+: packed[13:12] supply lgkm[5:4] */ | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | uint16_t pack(enum chip_class chip) const | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | uint16_t imm = 0; | 
		
	
	
		
			
			|  |  | @@ -142,12 +157,14 @@ struct wait_imm { | 
		
	
		
			
			|  |  |  | return imm; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | void combine(const wait_imm& other) | 
		
	
		
			
			|  |  |  | bool combine(const wait_imm& other) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | bool changed = other.vm < vm || other.exp < exp || other.lgkm < lgkm || other.vs < vs; | 
		
	
		
			
			|  |  |  | vm = std::min(vm, other.vm); | 
		
	
		
			
			|  |  |  | exp = std::min(exp, other.exp); | 
		
	
		
			
			|  |  |  | lgkm = std::min(lgkm, other.lgkm); | 
		
	
		
			
			|  |  |  | vs = std::min(vs, other.vs); | 
		
	
		
			
			|  |  |  | return changed; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | bool empty() const | 
		
	
	
		
			
			|  |  | @@ -168,13 +185,17 @@ struct wait_entry { | 
		
	
		
			
			|  |  |  | : imm(imm), events(event), counters(get_counters_for_event(event)), | 
		
	
		
			
			|  |  |  | wait_on_read(wait_on_read), logical(logical) {} | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | void join(const wait_entry& other) | 
		
	
		
			
			|  |  |  | bool join(const wait_entry& other) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | bool changed = (other.events & ~events) || | 
		
	
		
			
			|  |  |  | (other.counters & ~counters) || | 
		
	
		
			
			|  |  |  | (other.wait_on_read && !wait_on_read); | 
		
	
		
			
			|  |  |  | events |= other.events; | 
		
	
		
			
			|  |  |  | counters |= other.counters; | 
		
	
		
			
			|  |  |  | imm.combine(other.imm); | 
		
	
		
			
			|  |  |  | changed |= imm.combine(other.imm); | 
		
	
		
			
			|  |  |  | wait_on_read = wait_on_read || other.wait_on_read; | 
		
	
		
			
			|  |  |  | assert(logical == other.logical); | 
		
	
		
			
			|  |  |  | return changed; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | void remove_counter(counter_type counter) | 
		
	
	
		
			
			|  |  | @@ -237,8 +258,15 @@ struct wait_ctx { | 
		
	
		
			
			|  |  |  | max_vs_cnt(program_->chip_class >= GFX10 ? 62 : 0), | 
		
	
		
			
			|  |  |  | unordered_events(event_smem | (program_->chip_class < GFX10 ? event_flat : 0)) {} | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | void join(const wait_ctx* other, bool logical) | 
		
	
		
			
			|  |  |  | bool join(const wait_ctx* other, bool logical) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | bool changed = other->exp_cnt > exp_cnt || | 
		
	
		
			
			|  |  |  | other->vm_cnt > vm_cnt || | 
		
	
		
			
			|  |  |  | other->lgkm_cnt > lgkm_cnt || | 
		
	
		
			
			|  |  |  | other->vs_cnt > vs_cnt || | 
		
	
		
			
			|  |  |  | (other->pending_flat_lgkm && !pending_flat_lgkm) || | 
		
	
		
			
			|  |  |  | (other->pending_flat_vm && !pending_flat_vm); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | exp_cnt = std::max(exp_cnt, other->exp_cnt); | 
		
	
		
			
			|  |  |  | vm_cnt = std::max(vm_cnt, other->vm_cnt); | 
		
	
		
			
			|  |  |  | lgkm_cnt = std::max(lgkm_cnt, other->lgkm_cnt); | 
		
	
	
		
			
			|  |  | @@ -253,14 +281,18 @@ struct wait_ctx { | 
		
	
		
			
			|  |  |  | if (entry.second.logical != logical) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (it != gpr_map.end()) | 
		
	
		
			
			|  |  |  | it->second.join(entry.second); | 
		
	
		
			
			|  |  |  | else | 
		
	
		
			
			|  |  |  | if (it != gpr_map.end()) { | 
		
	
		
			
			|  |  |  | changed |= it->second.join(entry.second); | 
		
	
		
			
			|  |  |  | } else { | 
		
	
		
			
			|  |  |  | gpr_map.insert(entry); | 
		
	
		
			
			|  |  |  | changed = true; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | for (unsigned i = 0; i < barrier_count; i++) | 
		
	
		
			
			|  |  |  | barrier_imm[i].combine(other->barrier_imm[i]); | 
		
	
		
			
			|  |  |  | changed |= barrier_imm[i].combine(other->barrier_imm[i]); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | return changed; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | }; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
	
		
			
			|  |  | @@ -319,12 +351,27 @@ wait_imm check_instr(Instruction* instr, wait_ctx& ctx) | 
		
	
		
			
			|  |  |  | return wait; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | /* Returns the wait_imm described by an existing s_waitcnt / s_waitcnt_vscnt instruction, or a default-constructed wait_imm for anything else. */
 | 
		
	
		
			
			|  |  |  | wait_imm parse_wait_instr(wait_ctx& ctx, Instruction *instr) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | if (instr->opcode == aco_opcode::s_waitcnt_vscnt && | 
		
	
		
			
			|  |  |  | instr->definitions[0].physReg() == sgpr_null) { /* only handled when the first definition is sgpr_null */ | 
		
	
		
			
			|  |  |  | wait_imm imm; | 
		
	
		
			
			|  |  |  | imm.vs = std::min<uint8_t>(imm.vs, static_cast<SOPK_instruction*>(instr)->imm); /* only the vs counter is taken from the SOPK immediate (compared as uint8_t) */ | 
		
	
		
			
			|  |  |  | return imm; | 
		
	
		
			
			|  |  |  | } else if (instr->opcode == aco_opcode::s_waitcnt) { | 
		
	
		
			
			|  |  |  | return wait_imm(ctx.chip_class, static_cast<SOPK_instruction*>(instr)->imm); /* decoded by the wait_imm(chip, packed) constructor */ | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | return wait_imm(); /* neither opcode matched; presumably an "empty" wait_imm - confirm against the default constructor */ | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | wait_imm kill(Instruction* instr, wait_ctx& ctx) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | wait_imm imm; | 
		
	
		
			
			|  |  |  | if (ctx.exp_cnt || ctx.vm_cnt || ctx.lgkm_cnt) | 
		
	
		
			
			|  |  |  | imm.combine(check_instr(instr, ctx)); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | imm.combine(parse_wait_instr(ctx, instr)); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (ctx.chip_class >= GFX10) { | 
		
	
		
			
			|  |  |  | /* Seems to be required on GFX10 to achieve correct behaviour. | 
		
	
		
			
			|  |  |  | * It shouldn't cost anything anyways since we're about to do s_endpgm. | 
		
	
	
		
			
			|  |  | @@ -665,39 +712,23 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | std::vector<aco_ptr<Instruction>> new_instructions; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | wait_imm queued_imm; | 
		
	
		
			
			|  |  |  | for (aco_ptr<Instruction>& instr : block.instructions) { | 
		
	
		
			
			|  |  |  | wait_imm imm = kill(instr.get(), ctx); | 
		
	
		
			
			|  |  |  | bool is_wait = !parse_wait_instr(ctx, instr.get()).empty(); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (!imm.empty()) | 
		
	
		
			
			|  |  |  | emit_waitcnt(ctx, new_instructions, imm); | 
		
	
		
			
			|  |  |  | queued_imm.combine(kill(instr.get(), ctx)); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | gen(instr.get(), ctx); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (instr->format != Format::PSEUDO_BARRIER) | 
		
	
		
			
			|  |  |  | if (instr->format != Format::PSEUDO_BARRIER && !is_wait) { | 
		
	
		
			
			|  |  |  | if (!queued_imm.empty()) { | 
		
	
		
			
			|  |  |  | emit_waitcnt(ctx, new_instructions, queued_imm); | 
		
	
		
			
			|  |  |  | queued_imm = wait_imm(); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | new_instructions.emplace_back(std::move(instr)); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | /* check if this block is at the end of a loop */ | 
		
	
		
			
			|  |  |  | for (unsigned succ_idx : block.linear_succs) { | 
		
	
		
			
			|  |  |  | /* eliminate any remaining counters */ | 
		
	
		
			
			|  |  |  | if (succ_idx <= block.index && (ctx.vm_cnt || ctx.exp_cnt || ctx.lgkm_cnt || ctx.vs_cnt)) { | 
		
	
		
			
			|  |  |  | // TODO: we could do better if we only wait if the regs between the block and other predecessors differ | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | aco_ptr<Instruction> branch = std::move(new_instructions.back()); | 
		
	
		
			
			|  |  |  | new_instructions.pop_back(); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | wait_imm imm(ctx.vm_cnt ? 0 : wait_imm::unset_counter, | 
		
	
		
			
			|  |  |  | ctx.exp_cnt ? 0 : wait_imm::unset_counter, | 
		
	
		
			
			|  |  |  | ctx.lgkm_cnt ? 0 : wait_imm::unset_counter, | 
		
	
		
			
			|  |  |  | ctx.vs_cnt ? 0 : wait_imm::unset_counter); | 
		
	
		
			
			|  |  |  | emit_waitcnt(ctx, new_instructions, imm); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | new_instructions.push_back(std::move(branch)); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | ctx = wait_ctx(program); | 
		
	
		
			
			|  |  |  | break; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | block.instructions.swap(new_instructions); | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
	
		
			
			|  |  | @@ -705,23 +736,55 @@ void handle_block(Program *program, Block& block, wait_ctx& ctx) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | void insert_wait_states(Program* program) | 
		
	
		
			
			|  |  |  | { | 
		
	
		
			
			|  |  |  | wait_ctx out_ctx[program->blocks.size()]; /* per BB ctx */ | 
		
	
		
			
			|  |  |  | /* per BB ctx */ | 
		
	
		
			
			|  |  |  | std::vector<bool> done(program->blocks.size()); | 
		
	
		
			
			|  |  |  | wait_ctx in_ctx[program->blocks.size()]; | 
		
	
		
			
			|  |  |  | wait_ctx out_ctx[program->blocks.size()]; | 
		
	
		
			
			|  |  |  | for (unsigned i = 0; i < program->blocks.size(); i++) | 
		
	
		
			
			|  |  |  | out_ctx[i] = wait_ctx(program); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | for (unsigned i = 0; i < program->blocks.size(); i++) { | 
		
	
		
			
			|  |  |  | Block& current = program->blocks[i]; | 
		
	
		
			
			|  |  |  | wait_ctx& in = out_ctx[current.index]; | 
		
	
		
			
			|  |  |  | in_ctx[i] = wait_ctx(program); | 
		
	
		
			
			|  |  |  | std::stack<unsigned> loop_header_indices; | 
		
	
		
			
			|  |  |  | unsigned loop_progress = 0; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | for (unsigned i = 0; i < program->blocks.size();) { | 
		
	
		
			
			|  |  |  | Block& current = program->blocks[i++]; | 
		
	
		
			
			|  |  |  | wait_ctx ctx = in_ctx[current.index]; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (current.kind & block_kind_loop_header) { | 
		
	
		
			
			|  |  |  | loop_header_indices.push(current.index); | 
		
	
		
			
			|  |  |  | } else if (current.kind & block_kind_loop_exit) { | 
		
	
		
			
			|  |  |  | bool repeat = false; | 
		
	
		
			
			|  |  |  | if (loop_progress == loop_header_indices.size()) { | 
		
	
		
			
			|  |  |  | i = loop_header_indices.top(); | 
		
	
		
			
			|  |  |  | repeat = true; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | loop_header_indices.pop(); | 
		
	
		
			
			|  |  |  | loop_progress = std::min<unsigned>(loop_progress, loop_header_indices.size()); | 
		
	
		
			
			|  |  |  | if (repeat) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | bool changed = false; | 
		
	
		
			
			|  |  |  | for (unsigned b : current.linear_preds) | 
		
	
		
			
			|  |  |  | in.join(&out_ctx[b], false); | 
		
	
		
			
			|  |  |  | changed |= ctx.join(&out_ctx[b], false); | 
		
	
		
			
			|  |  |  | for (unsigned b : current.logical_preds) | 
		
	
		
			
			|  |  |  | in.join(&out_ctx[b], true); | 
		
	
		
			
			|  |  |  | changed |= ctx.join(&out_ctx[b], true); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | in_ctx[current.index] = ctx; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (done[current.index] && !changed) | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | if (current.instructions.empty()) | 
		
	
		
			
			|  |  |  | if (current.instructions.empty()) { | 
		
	
		
			
			|  |  |  | out_ctx[current.index] = ctx; | 
		
	
		
			
			|  |  |  | continue; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | loop_progress = std::max<unsigned>(loop_progress, current.loop_nest_depth); | 
		
	
		
			
			|  |  |  | done[current.index] = true; | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | handle_block(program, current, ctx); | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | handle_block(program, current, in); | 
		
	
		
			
			|  |  |  | out_ctx[current.index] = ctx; | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | } | 
		
	
		
			
			|  |  |  | 
 |