It is unclear whether programming the block size twice (once for the userdata and once for the dispatch) can be avoided.

Reviewed-by: Edward O'Callaghan <funfunctor@folklore1984.net>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
tags/13.0-branchpoint
@@ -279,7 +279,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve | |||
GL_ARB_bindless_texture started (airlied) | |||
GL_ARB_cl_event not started | |||
GL_ARB_compute_variable_group_size DONE (nvc0) | |||
GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi) | |||
GL_ARB_ES3_2_compatibility DONE (i965/gen8+) | |||
GL_ARB_fragment_shader_interlock not started | |||
GL_ARB_gl_spirv not started |
@@ -49,7 +49,7 @@ Note: some of the new features are only available with certain drivers. | |||
<li>GL_ARB_ES3_1_compatibility on i965</li> | |||
<li>GL_ARB_ES3_2_compatibility on i965/gen8+</li> | |||
<li>GL_ARB_clear_texture on r600, radeonsi</li> | |||
<li>GL_ARB_compute_variable_group_size on nvc0</li> | |||
<li>GL_ARB_compute_variable_group_size on nvc0, radeonsi</li> | |||
<li>GL_ARB_cull_distance on radeonsi</li> | |||
<li>GL_ARB_enhanced_layouts on i965</li> | |||
<li>GL_ARB_indirect_parameters on radeonsi</li> |
@@ -1037,7 +1037,15 @@ static int r600_get_compute_param(struct pipe_screen *screen, | |||
} | |||
return sizeof(uint32_t); | |||
case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: | |||
return 0; | |||
if (ret) { | |||
uint64_t *max_variable_threads_per_block = ret; | |||
if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && | |||
ir_type == PIPE_SHADER_IR_TGSI) | |||
*max_variable_threads_per_block = SI_MAX_VARIABLE_THREADS_PER_BLOCK; | |||
else | |||
*max_variable_threads_per_block = 0; | |||
} | |||
return sizeof(uint64_t); | |||
} | |||
fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); |
@@ -106,6 +106,8 @@ | |||
#define R600_MAP_BUFFER_ALIGNMENT 64 | |||
#define R600_MAX_VIEWPORTS 16 | |||
#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 | |||
enum r600_coherency { | |||
R600_COHERENCY_NONE, /* no cache flushes needed */ | |||
R600_COHERENCY_SHADER, |
@@ -601,11 +601,19 @@ static void si_setup_tgsi_grid(struct si_context *sctx, | |||
radeon_emit(cs, 0); | |||
} | |||
} else { | |||
struct si_compute *program = sctx->cs_shader_state.program; | |||
bool variable_group_size = | |||
program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; | |||
radeon_set_sh_reg_seq(cs, grid_size_reg, 3); | |||
radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? 6 : 3); | |||
radeon_emit(cs, info->grid[0]); | |||
radeon_emit(cs, info->grid[1]); | |||
radeon_emit(cs, info->grid[2]); | |||
if (variable_group_size) { | |||
radeon_emit(cs, info->block[0]); | |||
radeon_emit(cs, info->block[1]); | |||
radeon_emit(cs, info->block[2]); | |||
} | |||
} | |||
} | |||
@@ -1770,16 +1770,21 @@ static void declare_system_value( | |||
LLVMValueRef values[3]; | |||
unsigned i; | |||
unsigned *properties = ctx->shader->selector->info.properties; | |||
unsigned sizes[3] = { | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] | |||
}; | |||
for (i = 0; i < 3; ++i) | |||
values[i] = lp_build_const_int32(gallivm, sizes[i]); | |||
if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { | |||
unsigned sizes[3] = { | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] | |||
}; | |||
for (i = 0; i < 3; ++i) | |||
values[i] = lp_build_const_int32(gallivm, sizes[i]); | |||
value = lp_build_gather_values(gallivm, values, 3); | |||
value = lp_build_gather_values(gallivm, values, 3); | |||
} else { | |||
value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_SIZE); | |||
} | |||
break; | |||
} | |||
@@ -5680,6 +5685,7 @@ static void create_function(struct si_shader_context *ctx) | |||
case PIPE_SHADER_COMPUTE: | |||
params[SI_PARAM_GRID_SIZE] = v3i32; | |||
params[SI_PARAM_BLOCK_SIZE] = v3i32; | |||
params[SI_PARAM_BLOCK_ID] = v3i32; | |||
last_sgpr = SI_PARAM_BLOCK_ID; | |||
@@ -5716,7 +5722,12 @@ static void create_function(struct si_shader_context *ctx) | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * | |||
properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; | |||
assert(max_work_group_size); | |||
if (!max_work_group_size) { | |||
/* This is a variable group size compute shader, | |||
* compile it for the maximum possible group size. | |||
*/ | |||
max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; | |||
} | |||
radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, | |||
"amdgpu-max-work-group-size", | |||
@@ -6653,11 +6664,16 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, | |||
unsigned max_vgprs = 256; | |||
unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512; | |||
unsigned max_sgprs_per_wave = 128; | |||
unsigned min_waves_per_cu = | |||
DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * | |||
props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * | |||
props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH], | |||
wave_size); | |||
unsigned max_block_threads; | |||
if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]) | |||
max_block_threads = props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * | |||
props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * | |||
props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; | |||
else | |||
max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK; | |||
unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, wave_size); | |||
unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); | |||
max_vgprs = max_vgprs / min_waves_per_simd; |
@@ -129,7 +129,8 @@ enum { | |||
/* CS only */ | |||
SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS, | |||
SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3 | |||
SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3, | |||
SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3 | |||
}; | |||
/* LLVM function parameter indices */ | |||
@@ -219,6 +220,7 @@ enum { | |||
/* CS only parameters */ | |||
SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS, | |||
SI_PARAM_BLOCK_SIZE, | |||
SI_PARAM_BLOCK_ID, | |||
SI_PARAM_THREAD_ID, | |||