浏览代码

aco: always set scratch_offset in startpgm

This patch also moves private_segment_buffer and
scratch_offset to Program so that they can be accessed easily.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
tags/19.3-branchpoint
Daniel Schürmann 6 年前
父节点
当前提交
c79972b604

+ 11
- 11
src/amd/compiler/aco_instruction_selection.cpp 查看文件

Temp get_scratch_resource(isel_context *ctx) Temp get_scratch_resource(isel_context *ctx)
{ {
Builder bld(ctx->program, ctx->block); Builder bld(ctx->program, ctx->block);
Temp scratch_addr = ctx->private_segment_buffer;
Temp scratch_addr = ctx->program->private_segment_buffer;
if (ctx->stage != compute_cs) if (ctx->stage != compute_cs)
scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), ctx->private_segment_buffer, Operand(0u));
scratch_addr = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand(0u));


uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) | uint32_t rsrc_conf = S_008F0C_ADD_TID_ENABLE(1) |
S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);; S_008F0C_INDEX_STRIDE(ctx->options->wave_size == 64 ? 3 : 2);;
std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems; std::array<Temp,NIR_MAX_VEC_COMPONENTS> elems;
Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4, Temp lower = bld.mubuf(aco_opcode::buffer_load_dwordx4,
bld.def(v4), offset, rsrc, bld.def(v4), offset, rsrc,
ctx->scratch_offset, 0, true);
ctx->program->scratch_offset, 0, true);
Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 : Temp upper = bld.mubuf(dst.size() == 6 ? aco_opcode::buffer_load_dwordx2 :
aco_opcode::buffer_load_dwordx4, aco_opcode::buffer_load_dwordx4,
dst.size() == 6 ? bld.def(v2) : bld.def(v4), dst.size() == 6 ? bld.def(v2) : bld.def(v4),
offset, rsrc, ctx->scratch_offset, 16, true);
offset, rsrc, ctx->program->scratch_offset, 16, true);
emit_split_vector(ctx, lower, 2); emit_split_vector(ctx, lower, 2);
elems[0] = emit_extract_vector(ctx, lower, 0, v2); elems[0] = emit_extract_vector(ctx, lower, 0, v2);
elems[1] = emit_extract_vector(ctx, lower, 1, v2); elems[1] = emit_extract_vector(ctx, lower, 1, v2);
unreachable("Wrong dst size for nir_intrinsic_load_scratch"); unreachable("Wrong dst size for nir_intrinsic_load_scratch");
} }


bld.mubuf(op, Definition(dst), offset, rsrc, ctx->scratch_offset, 0, true);
bld.mubuf(op, Definition(dst), offset, rsrc, ctx->program->scratch_offset, 0, true);
emit_split_vector(ctx, dst, instr->num_components); emit_split_vector(ctx, dst, instr->num_components);
} }


unreachable("Invalid data size for nir_intrinsic_store_scratch."); unreachable("Invalid data size for nir_intrinsic_store_scratch.");
} }


bld.mubuf(op, offset, rsrc, ctx->scratch_offset, write_data, start * elem_size_bytes, true);
bld.mubuf(op, offset, rsrc, ctx->program->scratch_offset, write_data, start * elem_size_bytes, true);
} }
} }


Temp sample_pos; Temp sample_pos;
Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); Temp addr = get_ssa_temp(ctx, instr->src[0].ssa);
nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]);
Temp private_segment_buffer = ctx->program->private_segment_buffer;
if (addr.type() == RegType::sgpr) { if (addr.type() == RegType::sgpr) {
Operand offset; Operand offset;
if (const_addr) { if (const_addr) {
offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u)); offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, Operand(3u));
offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset)); offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), addr, Operand(sample_pos_offset));
} }
addr = ctx->private_segment_buffer;
sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), addr, Operand(offset));
sample_pos = bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand(offset));


} else if (ctx->options->chip_class >= GFX9) { } else if (ctx->options->chip_class >= GFX9) {
addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr); addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand(3u), addr);
sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, ctx->private_segment_buffer, sample_pos_offset);
sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, private_segment_buffer, sample_pos_offset);
} else { } else {
/* addr += ctx->private_segment_buffer + sample_pos_offset */
/* addr += private_segment_buffer + sample_pos_offset */
Temp tmp0 = bld.tmp(s1); Temp tmp0 = bld.tmp(s1);
Temp tmp1 = bld.tmp(s1); Temp tmp1 = bld.tmp(s1);
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), ctx->private_segment_buffer);
bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), private_segment_buffer);
Definition scc_tmp = bld.def(s1, scc); Definition scc_tmp = bld.def(s1, scc);
tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset)); tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, Operand(sample_pos_offset));
tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp())); tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, Operand(0u), bld.scc(scc_tmp.getTemp()));

+ 7
- 9
src/amd/compiler/aco_instruction_selection_setup.cpp 查看文件



/* scratch */ /* scratch */
bool scratch_enabled = false; bool scratch_enabled = false;
Temp private_segment_buffer = Temp(0, s2); /* also the part of the scratch descriptor on compute */
Temp scratch_offset = Temp(0, s1);


/* inputs common for merged stages */ /* inputs common for merged stages */
Temp merged_wave_info = Temp(0, s1); Temp merged_wave_info = Temp(0, s1);


/* this needs to be in sgprs 0 and 1 */ /* this needs to be in sgprs 0 and 1 */
if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets || ctx->scratch_enabled) { if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets || ctx->scratch_enabled) {
add_arg(&args, s2, &ctx->private_segment_buffer, 0);
add_arg(&args, s2, &ctx->program->private_segment_buffer, 0);
set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_info.user_sgpr_idx); set_loc_shader_ptr(ctx, AC_UD_SCRATCH_RING_OFFSETS, &user_sgpr_info.user_sgpr_idx);
} }


else else
declare_streamout_sgprs(ctx, &args, &idx); declare_streamout_sgprs(ctx, &args, &idx);


if (ctx->scratch_enabled)
add_arg(&args, s1, &ctx->scratch_offset, idx++);
if (ctx->options->supports_spill || ctx->scratch_enabled)
add_arg(&args, s1, &ctx->program->scratch_offset, idx++);


declare_vs_input_vgprs(ctx, &args); declare_vs_input_vgprs(ctx, &args);
break; break;
assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr); assert(user_sgpr_info.user_sgpr_idx == user_sgpr_info.num_sgpr);
add_arg(&args, s1, &ctx->prim_mask, user_sgpr_info.user_sgpr_idx); add_arg(&args, s1, &ctx->prim_mask, user_sgpr_info.user_sgpr_idx);


if (ctx->scratch_enabled)
add_arg(&args, s1, &ctx->scratch_offset, user_sgpr_info.user_sgpr_idx + 1);
if (ctx->options->supports_spill || ctx->scratch_enabled)
add_arg(&args, s1, &ctx->program->scratch_offset, user_sgpr_info.user_sgpr_idx + 1);


ctx->program->config->spi_ps_input_addr = 0; ctx->program->config->spi_ps_input_addr = 0;
ctx->program->config->spi_ps_input_ena = 0; ctx->program->config->spi_ps_input_ena = 0;


if (ctx->program->info->cs.uses_local_invocation_idx) if (ctx->program->info->cs.uses_local_invocation_idx)
add_arg(&args, s1, &ctx->tg_size, idx++); add_arg(&args, s1, &ctx->tg_size, idx++);
if (ctx->scratch_enabled)
add_arg(&args, s1, &ctx->scratch_offset, idx++);
if (ctx->options->supports_spill || ctx->scratch_enabled)
add_arg(&args, s1, &ctx->program->scratch_offset, idx++);


add_arg(&args, v1, &ctx->local_invocation_ids[0], vgpr_idx++); add_arg(&args, v1, &ctx->local_invocation_ids[0], vgpr_idx++);
add_arg(&args, v1, &ctx->local_invocation_ids[1], vgpr_idx++); add_arg(&args, v1, &ctx->local_invocation_ids[1], vgpr_idx++);

+ 4
- 3
src/amd/compiler/aco_ir.h 查看文件

bool wb_smem_l1_on_end = false; bool wb_smem_l1_on_end = false;


std::vector<uint8_t> constant_data; std::vector<uint8_t> constant_data;
Temp private_segment_buffer;
Temp scratch_offset;


uint16_t lds_alloc_granule; uint16_t lds_alloc_granule;
uint32_t lds_limit; /* in bytes */ uint32_t lds_limit; /* in bytes */

uint16_t vgpr_limit; uint16_t vgpr_limit;
uint16_t sgpr_limit;
uint16_t physical_sgprs; uint16_t physical_sgprs;
uint16_t sgpr_alloc_granule; /* minus one. must be power of two */ uint16_t sgpr_alloc_granule; /* minus one. must be power of two */
uint16_t sgpr_limit;
bool needs_vcc = false; bool needs_vcc = false;
bool needs_xnack_mask = false; bool needs_xnack_mask = false;
bool needs_flat_scr = false; bool needs_flat_scr = false;

正在加载...
取消
保存