It can be enabled with RADV_PERFTEST=cswave32. Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>tags/19.2-branchpoint
@@ -64,6 +64,7 @@ enum { | |||
RADV_PERFTEST_BO_LIST = 0x20, | |||
RADV_PERFTEST_SHADER_BALLOT = 0x40, | |||
RADV_PERFTEST_TC_COMPAT_CMASK = 0x80, | |||
RADV_PERFTEST_CS_WAVE_32 = 0x100, | |||
}; | |||
bool |
@@ -383,6 +383,14 @@ radv_physical_device_init(struct radv_physical_device *device, | |||
device->use_shader_ballot = device->instance->perftest_flags & RADV_PERFTEST_SHADER_BALLOT; | |||
/* Determine the number of threads per wave for all stages. */ | |||
device->cs_wave_size = 64; | |||
if (device->rad_info.chip_class >= GFX10) { | |||
if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32) | |||
device->cs_wave_size = 32; | |||
} | |||
radv_physical_device_init_mem_types(device); | |||
radv_fill_device_extension_table(device, &device->supported_extensions); | |||
@@ -494,6 +502,7 @@ static const struct debug_control radv_perftest_options[] = { | |||
{"bolist", RADV_PERFTEST_BO_LIST}, | |||
{"shader_ballot", RADV_PERFTEST_SHADER_BALLOT}, | |||
{"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK}, | |||
{"cswave32", RADV_PERFTEST_CS_WAVE_32}, | |||
{NULL, 0} | |||
}; | |||
@@ -1930,7 +1939,8 @@ VkResult radv_CreateDevice( | |||
device->scratch_waves = MAX2(32 * physical_device->rad_info.num_good_compute_units, | |||
max_threads_per_block / 64); | |||
device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1); | |||
device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1) | | |||
S_00B800_CS_W32_EN(device->physical_device->cs_wave_size == 32); | |||
if (device->physical_device->rad_info.chip_class >= GFX7) { | |||
/* If the KMD allows it (there is a KMD hw register for it), |
@@ -4317,6 +4317,15 @@ static void declare_esgs_ring(struct radv_shader_context *ctx) | |||
LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); | |||
} | |||
static uint8_t | |||
radv_nir_shader_wave_size(struct nir_shader *const *shaders, int shader_count, | |||
const struct radv_nir_compiler_options *options) | |||
{ | |||
if (shaders[0]->info.stage == MESA_SHADER_COMPUTE) | |||
return options->cs_wave_size; | |||
return 64; | |||
} | |||
static | |||
LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, | |||
struct nir_shader *const *shaders, | |||
@@ -4333,8 +4342,11 @@ LLVMModuleRef ac_translate_nir_to_llvm(struct ac_llvm_compiler *ac_llvm, | |||
options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH : | |||
AC_FLOAT_MODE_DEFAULT; | |||
uint8_t wave_size = radv_nir_shader_wave_size(shaders, | |||
shader_count, options); | |||
ac_llvm_context_init(&ctx.ac, ac_llvm, options->chip_class, | |||
options->family, float_mode, 64); | |||
options->family, float_mode, wave_size); | |||
ctx.context = ctx.ac.context; | |||
radv_nir_shader_info_init(&shader_info->info); |
@@ -4649,7 +4649,8 @@ radv_compute_generate_pm4(struct radv_pipeline *pipeline) | |||
threads_per_threadgroup = compute_shader->info.cs.block_size[0] * | |||
compute_shader->info.cs.block_size[1] * | |||
compute_shader->info.cs.block_size[2]; | |||
waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, 64); | |||
waves_per_threadgroup = DIV_ROUND_UP(threads_per_threadgroup, | |||
device->physical_device->cs_wave_size); | |||
if (device->physical_device->rad_info.chip_class >= GFX10 && | |||
waves_per_threadgroup == 1) |
@@ -301,6 +301,9 @@ struct radv_physical_device { | |||
/* Whether DISABLE_CONSTANT_ENCODE_REG is supported. */ | |||
bool has_dcc_constant_encode; | |||
/* Number of threads per wave. */ | |||
uint8_t cs_wave_size; | |||
/* This is the drivers on-disk cache used as a fallback as opposed to | |||
* the pipeline cache defined by apps. | |||
*/ |
@@ -667,6 +667,16 @@ radv_get_shader_binary_size(size_t code_size) | |||
return code_size + DEBUGGER_NUM_MARKERS * 4; | |||
} | |||
static uint8_t | |||
radv_get_shader_wave_size(const struct radv_physical_device *pdevice, | |||
gl_shader_stage stage) | |||
{ | |||
if (stage == MESA_SHADER_COMPUTE) | |||
return pdevice->cs_wave_size; | |||
return 64; | |||
} | |||
static void radv_postprocess_config(const struct radv_physical_device *pdevice, | |||
const struct ac_shader_config *config_in, | |||
const struct radv_shader_variant_info *info, | |||
@@ -674,6 +684,7 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice, | |||
struct ac_shader_config *config_out) | |||
{ | |||
bool scratch_enabled = config_in->scratch_bytes_per_wave > 0; | |||
uint8_t wave_size = radv_get_shader_wave_size(pdevice, stage); | |||
unsigned vgpr_comp_cnt = 0; | |||
unsigned num_input_vgprs = info->num_input_vgprs; | |||
@@ -743,7 +754,8 @@ static void radv_postprocess_config(const struct radv_physical_device *pdevice, | |||
S_00B12C_SO_BASE3_EN(!!info->info.so.strides[3]) | | |||
S_00B12C_SO_EN(!!info->info.so.num_outputs); | |||
config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / 4) | | |||
config_out->rsrc1 = S_00B848_VGPRS((num_vgprs - 1) / | |||
(wave_size == 32 ? 8 : 4)) | | |||
S_00B848_DX10_CLAMP(1) | | |||
S_00B848_FLOAT_MODE(config_out->float_mode); | |||
@@ -1009,10 +1021,15 @@ radv_shader_variant_create(struct radv_device *device, | |||
if (binary->variant_info.is_ngg) | |||
sym->size -= 32; | |||
} | |||
uint8_t wave_size = | |||
radv_get_shader_wave_size(device->physical_device, | |||
binary->stage); | |||
struct ac_rtld_open_info open_info = { | |||
.info = &device->physical_device->rad_info, | |||
.shader_type = binary->stage, | |||
.wave_size = 64, | |||
.wave_size = wave_size, | |||
.num_parts = 1, | |||
.elf_ptrs = &elf_data, | |||
.elf_sizes = &elf_size, | |||
@@ -1124,6 +1141,7 @@ shader_variant_compile(struct radv_device *device, | |||
options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR; | |||
options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size; | |||
options->address32_hi = device->physical_device->rad_info.address32_hi; | |||
options->cs_wave_size = device->physical_device->cs_wave_size; | |||
if (options->supports_spill) | |||
tm_options |= AC_TM_SUPPORTS_SPILL; | |||
@@ -1273,6 +1291,7 @@ generate_shader_stats(struct radv_device *device, | |||
{ | |||
enum chip_class chip_class = device->physical_device->rad_info.chip_class; | |||
unsigned lds_increment = chip_class >= GFX7 ? 512 : 256; | |||
uint8_t wave_size = radv_get_shader_wave_size(device->physical_device, stage); | |||
struct ac_shader_config *conf; | |||
unsigned max_simd_waves; | |||
unsigned lds_per_wave = 0; | |||
@@ -1289,7 +1308,7 @@ generate_shader_stats(struct radv_device *device, | |||
unsigned max_workgroup_size = | |||
radv_nir_get_max_workgroup_size(chip_class, stage, variant->nir); | |||
lds_per_wave = (conf->lds_size * lds_increment) / | |||
DIV_ROUND_UP(max_workgroup_size, 64); | |||
DIV_ROUND_UP(max_workgroup_size, wave_size); | |||
} | |||
if (conf->num_sgprs) |
@@ -128,6 +128,7 @@ struct radv_nir_compiler_options { | |||
enum chip_class chip_class; | |||
uint32_t tess_offchip_block_dw_size; | |||
uint32_t address32_hi; | |||
uint8_t cs_wave_size; | |||
}; | |||
enum radv_ud_index { |