Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
tags/19.2-branchpoint
@@ -895,3 +895,35 @@ ac_get_harvested_configs(struct radeon_info *info, | |||
} | |||
} | |||
} | |||
unsigned ac_get_compute_resource_limits(struct radeon_info *info, | |||
unsigned waves_per_threadgroup, | |||
unsigned max_waves_per_sh, | |||
unsigned threadgroups_per_cu) | |||
{ | |||
unsigned compute_resource_limits = | |||
S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); | |||
if (info->chip_class >= GFX7) { | |||
unsigned num_cu_per_se = info->num_good_compute_units / | |||
info->max_se; | |||
/* Force even distribution on all SIMDs in CU if the workgroup | |||
* size is 64. This has shown some good improvements if # of CUs | |||
* per SE is not a multiple of 4. | |||
*/ | |||
if (num_cu_per_se % 4 && waves_per_threadgroup == 1) | |||
compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); | |||
assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); | |||
compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) | | |||
S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); | |||
} else { | |||
/* GFX6 */ | |||
if (max_waves_per_sh) { | |||
unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); | |||
compute_resource_limits |= S_00B854_WAVES_PER_SH_SI(limit_div16); | |||
} | |||
} | |||
return compute_resource_limits; | |||
} |
@@ -167,6 +167,10 @@ void ac_get_harvested_configs(struct radeon_info *info, | |||
unsigned raster_config, | |||
unsigned *cik_raster_config_1_p, | |||
unsigned *raster_config_se); | |||
/* Returns the packed value for the COMPUTE_RESOURCE_LIMITS register.
 * Reads chip_class, max_se and num_good_compute_units from info.
 * On GFX7 and newer, threadgroups_per_cu is asserted to be in [1, 8].
 */
unsigned ac_get_compute_resource_limits(struct radeon_info *info, | |||
unsigned waves_per_threadgroup, | |||
unsigned max_waves_per_sh, | |||
unsigned threadgroups_per_cu); | |||
static inline unsigned ac_get_max_simd_waves(enum radeon_family family) | |||
{ |
@@ -772,38 +772,6 @@ static void si_setup_tgsi_user_data(struct si_context *sctx, | |||
} | |||
} | |||
unsigned si_get_compute_resource_limits(struct si_screen *sscreen, | |||
unsigned waves_per_threadgroup, | |||
unsigned max_waves_per_sh, | |||
unsigned threadgroups_per_cu) | |||
{ | |||
unsigned compute_resource_limits = | |||
S_00B854_SIMD_DEST_CNTL(waves_per_threadgroup % 4 == 0); | |||
if (sscreen->info.chip_class >= GFX7) { | |||
unsigned num_cu_per_se = sscreen->info.num_good_compute_units / | |||
sscreen->info.max_se; | |||
/* Force even distribution on all SIMDs in CU if the workgroup | |||
* size is 64. This has shown some good improvements if # of CUs | |||
* per SE is not a multiple of 4. | |||
*/ | |||
if (num_cu_per_se % 4 && waves_per_threadgroup == 1) | |||
compute_resource_limits |= S_00B854_FORCE_SIMD_DIST(1); | |||
assert(threadgroups_per_cu >= 1 && threadgroups_per_cu <= 8); | |||
compute_resource_limits |= S_00B854_WAVES_PER_SH(max_waves_per_sh) | | |||
S_00B854_CU_GROUP_COUNT(threadgroups_per_cu - 1); | |||
} else { | |||
/* GFX6 */ | |||
if (max_waves_per_sh) { | |||
unsigned limit_div16 = DIV_ROUND_UP(max_waves_per_sh, 16); | |||
compute_resource_limits |= S_00B854_WAVES_PER_SH_SI(limit_div16); | |||
} | |||
} | |||
return compute_resource_limits; | |||
} | |||
static void si_emit_dispatch_packets(struct si_context *sctx, | |||
const struct pipe_grid_info *info) | |||
{ | |||
@@ -820,7 +788,8 @@ static void si_emit_dispatch_packets(struct si_context *sctx, | |||
threadgroups_per_cu = 2; | |||
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, | |||
si_get_compute_resource_limits(sscreen, waves_per_threadgroup, | |||
ac_get_compute_resource_limits(&sscreen->info, | |||
waves_per_threadgroup, | |||
sctx->cs_max_waves_per_sh, | |||
threadgroups_per_cu)); | |||
@@ -1426,8 +1426,10 @@ void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, | |||
S_00B84C_LDS_SIZE(shader->config.lds_size)); | |||
radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, | |||
si_get_compute_resource_limits(sctx->screen, WAVES_PER_TG, | |||
MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); | |||
ac_get_compute_resource_limits(&sctx->screen->info, | |||
WAVES_PER_TG, | |||
MAX_WAVES_PER_SH, | |||
THREADGROUPS_PER_CU)); | |||
sctx->compute_ib_last_shader = shader; | |||
} | |||
@@ -1396,10 +1396,6 @@ unsigned si_end_counter(struct si_screen *sscreen, unsigned type, | |||
/* si_compute.c */ | |||
void si_emit_initial_compute_regs(struct si_context *sctx, struct radeon_cmdbuf *cs); | |||
unsigned si_get_compute_resource_limits(struct si_screen *sscreen, | |||
unsigned waves_per_threadgroup, | |||
unsigned max_waves_per_sh, | |||
unsigned threadgroups_per_cu); | |||
void si_init_compute_functions(struct si_context *sctx); | |||
/* si_compute_prim_discard.c */ |