Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Tag: tags/19.3-branchpoint
@@ -162,6 +162,11 @@ static Temp emit_bpermute(isel_context *ctx, Builder &bld, Temp index, Temp data | |||
* properly support subgroup shuffle like older generations (or wave32 mode), so we | |||
* emulate it here. | |||
*/ | |||
if (!ctx->has_gfx10_wave64_bpermute) { | |||
ctx->has_gfx10_wave64_bpermute = true; | |||
ctx->program->config->num_shared_vgprs = 8; /* Shared VGPRs are allocated in groups of 8 */ | |||
ctx->program->vgpr_limit -= 4; /* We allocate 8 shared VGPRs, so we'll have 4 fewer normal VGPRs */ | |||
} | |||
Temp lane_id = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), Operand((uint32_t) -1), Operand(0u)); | |||
lane_id = bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32, bld.def(v1), Operand((uint32_t) -1), lane_id); |
@@ -79,6 +79,7 @@ struct isel_context { | |||
std::unique_ptr<Temp[]> allocated; | |||
std::unordered_map<unsigned, std::array<Temp,4>> allocated_vec; | |||
Stage stage; /* Stage */ | |||
bool has_gfx10_wave64_bpermute = false; | |||
struct { | |||
bool has_branch; | |||
uint16_t loop_nest_depth = 0; | |||
@@ -1255,6 +1256,7 @@ setup_isel_context(Program* program, | |||
program->lds_alloc_granule = options->chip_class >= GFX7 ? 512 : 256; | |||
program->lds_limit = options->chip_class >= GFX7 ? 65536 : 32768; | |||
program->vgpr_limit = 256; | |||
if (options->chip_class >= GFX10) { | |||
program->physical_sgprs = 2560; /* doesn't matter as long as it's at least 128 * 20 */ |
@@ -1080,6 +1080,8 @@ public: | |||
uint16_t lds_alloc_granule; | |||
uint32_t lds_limit; /* in bytes */ | |||
uint16_t vgpr_limit; | |||
uint16_t physical_sgprs; | |||
uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ | |||
uint16_t sgpr_limit; |
@@ -244,7 +244,7 @@ void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) | |||
const int16_t vgpr_alloc = std::max<int16_t>(4, (new_demand.vgpr + 3) & ~3); | |||
/* this won't compile, register pressure reduction necessary */ | |||
if (new_demand.vgpr > 256 || new_demand.sgpr > program->sgpr_limit) { | |||
if (new_demand.vgpr > program->vgpr_limit || new_demand.sgpr > program->sgpr_limit) { | |||
program->num_waves = 0; | |||
program->max_reg_demand = new_demand; | |||
} else { |
@@ -823,9 +823,6 @@ void lower_to_hw_instr(Program* program) | |||
assert(instr->operands[2].regClass() == v1); /* Indices x4 */ | |||
assert(instr->operands[3].regClass() == v1); /* Input data */ | |||
/* Shared VGPRs are allocated in groups of 8 */ | |||
program->config->num_shared_vgprs = 8; | |||
PhysReg shared_vgpr_reg_lo = PhysReg(align(program->config->num_vgprs, 4) + 256); | |||
PhysReg shared_vgpr_reg_hi = PhysReg(shared_vgpr_reg_lo + 1); | |||
Operand compare = instr->operands[0]; |
@@ -668,7 +668,8 @@ PhysReg get_reg(ra_ctx& ctx, | |||
/* try using more registers */ | |||
uint16_t max_addressible_sgpr = ctx.program->sgpr_limit; | |||
if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < 256) { | |||
uint16_t max_addressible_vgpr = ctx.program->vgpr_limit; | |||
if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) { | |||
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr)); | |||
return get_reg(ctx, reg_file, rc, parallelcopies, instr); | |||
} else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) { |