|
|
@@ -328,6 +328,16 @@ v3d_emit_wait_for_tf_if_needed(struct v3d_context *v3d, struct v3d_job *job) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/* VPM memory configuration selected for one program (bin or render).
 *
 * The short field names are the symbols used by the VPM sizing arithmetic
 * in compute_vpm_config_gs() and by the shader state emission code.
 */
struct vpm_config {
        /* Minimum number of shader input segments required in play. */
        uint32_t As;

        /* VCM cache size, emitted as the number of 16-vertex batches. */
        uint32_t Vc;

        /* Multiplier applied to Gd when sizing the GS output allocation;
         * presumably the number of GS output segments (TODO confirm).
         */
        uint32_t Gs;

        /* GS output segment size, in sectors. */
        uint32_t Gd;

        /* Maximum number of vertex segments per GS batch. */
        uint32_t Gv;

        /* Minimum number of shader output segments required in play in
         * addition to the VCM cache size.
         */
        uint32_t Ve;

        /* GS SIMD dispatch width (1, 4, 8 or 16). */
        uint32_t gs_width;
};
|
|
|
|
|
|
|
#if V3D_VERSION >= 41 |
|
|
|
static void |
|
|
|
v3d_emit_gs_state_record(struct v3d_job *job, |
|
|
@@ -398,9 +408,28 @@ v3d_emit_tes_gs_common_params(struct v3d_job *job, |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static uint8_t |
|
|
|
simd_width_to_gs_pack_mode(uint32_t width) |
|
|
|
{ |
|
|
|
switch (width) { |
|
|
|
case 16: |
|
|
|
return V3D_PACK_MODE_16_WAY; |
|
|
|
case 8: |
|
|
|
return V3D_PACK_MODE_8_WAY; |
|
|
|
case 4: |
|
|
|
return V3D_PACK_MODE_4_WAY; |
|
|
|
case 1: |
|
|
|
return V3D_PACK_MODE_1_WAY; |
|
|
|
default: |
|
|
|
unreachable("Invalid SIMD width"); |
|
|
|
}; |
|
|
|
} |
|
|
|
|
|
|
|
static void |
|
|
|
v3d_emit_tes_gs_shader_params(struct v3d_job *job, |
|
|
|
struct v3d_gs_prog_data *gs) |
|
|
|
uint32_t gs_simd, |
|
|
|
uint32_t gs_vpm_output_size, |
|
|
|
uint32_t gs_max_vpm_input_size_per_batch) |
|
|
|
{ |
|
|
|
cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) { |
|
|
|
shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED; |
|
|
@@ -409,9 +438,9 @@ v3d_emit_tes_gs_shader_params(struct v3d_job *job, |
|
|
|
shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; |
|
|
|
shader.tes_output_segment_size_in_sectors = 1; |
|
|
|
shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; |
|
|
|
shader.gs_output_segment_size_in_sectors = |
|
|
|
gs->vpm_output_size; |
|
|
|
shader.gs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY; /* FIXME*/ |
|
|
|
shader.gs_output_segment_size_in_sectors = gs_vpm_output_size; |
|
|
|
shader.gs_output_segment_pack_mode = |
|
|
|
simd_width_to_gs_pack_mode(gs_simd); |
|
|
|
shader.tbg_max_patches_per_tcs_batch = 1; |
|
|
|
shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0; |
|
|
|
shader.tbg_min_tcs_output_segments_required_in_play = 1; |
|
|
@@ -420,11 +449,156 @@ v3d_emit_tes_gs_shader_params(struct v3d_job *job, |
|
|
|
shader.tpg_max_vertex_segments_per_tes_batch = 0; |
|
|
|
shader.tpg_max_tcs_output_segments_per_tes_batch = 1; |
|
|
|
shader.tpg_min_tes_output_segments_required_in_play = 1; |
|
|
|
shader.gbg_max_tes_output_vertex_segments_per_gs_batch = 0; |
|
|
|
shader.gbg_max_tes_output_vertex_segments_per_gs_batch = |
|
|
|
gs_max_vpm_input_size_per_batch; |
|
|
|
shader.gbg_min_gs_output_segments_required_in_play = 1; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
static inline uint32_t |
|
|
|
compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo) |
|
|
|
{ |
|
|
|
assert(devinfo->vpm_size > 0); |
|
|
|
const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8; |
|
|
|
return devinfo->vpm_size / sector_size; |
|
|
|
} |
|
|
|
|
|
|
|
/* Computes various parameters affecting VPM memory configuration for programs |
|
|
|
* involving geometry shaders to ensure the program fits in memory and honors |
|
|
|
* requirements described in section "VPM usage" of the programming manual. |
|
|
|
*/ |
|
|
|
static void |
|
|
|
compute_vpm_config_gs(struct v3d_device_info *devinfo, |
|
|
|
struct v3d_vs_prog_data *vs, |
|
|
|
struct v3d_gs_prog_data *gs, |
|
|
|
struct vpm_config *vpm_cfg_out) |
|
|
|
{ |
|
|
|
const uint32_t A = vs->separate_segments ? 1 : 0; |
|
|
|
const uint32_t Ad = vs->vpm_input_size; |
|
|
|
const uint32_t Vd = vs->vpm_output_size; |
|
|
|
|
|
|
|
const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo); |
|
|
|
|
|
|
|
/* Try to fit program into our VPM memory budget by adjusting |
|
|
|
* configurable parameters iteratively. We do this in two phases: |
|
|
|
* the first phase tries to fit the program into the total available |
|
|
|
* VPM memory. If we suceed at that, then the second phase attempts |
|
|
|
* to fit the program into half of that budget so we can run bin and |
|
|
|
* render programs in parallel. |
|
|
|
*/ |
|
|
|
struct vpm_config vpm_cfg[2]; |
|
|
|
struct vpm_config *final_vpm_cfg = NULL; |
|
|
|
uint32_t phase = 0; |
|
|
|
|
|
|
|
vpm_cfg[phase].As = 1; |
|
|
|
vpm_cfg[phase].Gs = 1; |
|
|
|
vpm_cfg[phase].Gd = gs->vpm_output_size; |
|
|
|
vpm_cfg[phase].gs_width = gs->simd_width; |
|
|
|
|
|
|
|
/* While there is a requirement that Vc >= [Vn / 16], this is |
|
|
|
* always the case when tessellation is not present because in that |
|
|
|
* case Vn can only be 6 at most (when input primitive is triangles |
|
|
|
* with adjacency). |
|
|
|
* |
|
|
|
* We always choose Vc=2. We can't go lower than this due to GFXH-1744, |
|
|
|
* and Broadcom has not found it worth it to increase it beyond this |
|
|
|
* in general. Increasing Vc also increases VPM memory pressure which |
|
|
|
* can turn up being detrimental for performance in some scenarios. |
|
|
|
*/ |
|
|
|
vpm_cfg[phase].Vc = 2; |
|
|
|
|
|
|
|
/* Gv is a constraint on the hardware to not exceed the |
|
|
|
* specified number of vertex segments per GS batch. If adding a |
|
|
|
* new primitive to a GS batch would result in a range of more |
|
|
|
* than Gv vertex segments being referenced by the batch, then |
|
|
|
* the hardware will flush the batch and start a new one. This |
|
|
|
* means that we can choose any value we want, we just need to |
|
|
|
* be aware that larger values improve GS batch utilization |
|
|
|
* at the expense of more VPM memory pressure (which can affect |
|
|
|
* other performance aspects, such as GS dispatch width). |
|
|
|
* We start with the largest value, and will reduce it if we |
|
|
|
* find that total memory pressure is too high. |
|
|
|
*/ |
|
|
|
vpm_cfg[phase].Gv = 3; |
|
|
|
do { |
|
|
|
/* When GS is present in absence of TES, then we need to satisfy |
|
|
|
* that Ve >= Gv. We go with the smallest value of Ve to avoid |
|
|
|
* increasing memory pressure. |
|
|
|
*/ |
|
|
|
vpm_cfg[phase].Ve = vpm_cfg[phase].Gv; |
|
|
|
|
|
|
|
uint32_t vpm_sectors = |
|
|
|
A * vpm_cfg[phase].As * Ad + |
|
|
|
(vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd + |
|
|
|
vpm_cfg[phase].Gs * vpm_cfg[phase].Gd; |
|
|
|
|
|
|
|
/* Ideally we want to use no more than half of the available |
|
|
|
* memory so we can execute a bin and render program in parallel |
|
|
|
* without stalls. If we achieved that then we are done. |
|
|
|
*/ |
|
|
|
if (vpm_sectors <= vpm_size / 2) { |
|
|
|
final_vpm_cfg = &vpm_cfg[phase]; |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
/* At the very least, we should not allocate more than the |
|
|
|
* total available VPM memory. If we have a configuration that |
|
|
|
* succeeds at this we save it and continue to see if we can |
|
|
|
* meet the half-memory-use criteria too. |
|
|
|
*/ |
|
|
|
if (phase == 0 && vpm_sectors <= vpm_size) { |
|
|
|
vpm_cfg[1] = vpm_cfg[0]; |
|
|
|
phase = 1; |
|
|
|
} |
|
|
|
|
|
|
|
/* Try lowering Gv */ |
|
|
|
if (vpm_cfg[phase].Gv > 0) { |
|
|
|
vpm_cfg[phase].Gv--; |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
/* Try lowering GS dispatch width */ |
|
|
|
if (vpm_cfg[phase].gs_width > 1) { |
|
|
|
do { |
|
|
|
vpm_cfg[phase].gs_width >>= 1; |
|
|
|
vpm_cfg[phase].Gd = |
|
|
|
align(vpm_cfg[phase].Gd, 2) / 2; |
|
|
|
} while (vpm_cfg[phase].gs_width == 2); |
|
|
|
|
|
|
|
/* Reset Gv to max after dropping dispatch width */ |
|
|
|
vpm_cfg[phase].Gv = 3; |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
/* We ran out of options to reduce memory pressure. If we |
|
|
|
* are at phase 1 we have at least a valid configuration, so we |
|
|
|
* we use that. |
|
|
|
*/ |
|
|
|
if (phase == 1) |
|
|
|
final_vpm_cfg = &vpm_cfg[0]; |
|
|
|
break; |
|
|
|
} while (true); |
|
|
|
|
|
|
|
if (!final_vpm_cfg) { |
|
|
|
/* FIXME: maybe return a boolean to indicate failure and use |
|
|
|
* that to stop the submission for this draw call. |
|
|
|
*/ |
|
|
|
fprintf(stderr, "Failed to allocate VPM memory.\n"); |
|
|
|
abort(); |
|
|
|
} |
|
|
|
|
|
|
|
assert(final_vpm_cfg); |
|
|
|
assert(final_vpm_cfg->Gd <= 16); |
|
|
|
assert(final_vpm_cfg->Gv < 4); |
|
|
|
assert(final_vpm_cfg->Ve < 4); |
|
|
|
assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4); |
|
|
|
assert(final_vpm_cfg->gs_width == 1 || |
|
|
|
final_vpm_cfg->gs_width == 4 || |
|
|
|
final_vpm_cfg->gs_width == 8 || |
|
|
|
final_vpm_cfg->gs_width == 16); |
|
|
|
|
|
|
|
*vpm_cfg_out = *final_vpm_cfg; |
|
|
|
} |
|
|
|
#endif |
|
|
|
|
|
|
|
static void |
|
|
@@ -498,20 +672,51 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, |
|
|
|
* compile time, so that we mostly just have to OR the VS and FS |
|
|
|
* records together at draw time. |
|
|
|
*/ |
|
|
|
|
|
|
|
struct vpm_config vpm_cfg_bin, vpm_cfg; |
|
|
|
|
|
|
|
assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs); |
|
|
|
if (!v3d->prog.gs) { |
|
|
|
vpm_cfg_bin.As = 1; |
|
|
|
vpm_cfg_bin.Ve = 0; |
|
|
|
vpm_cfg_bin.Vc = v3d->prog.cs->prog_data.vs->vcm_cache_size; |
|
|
|
|
|
|
|
vpm_cfg.As = 1; |
|
|
|
vpm_cfg.Ve = 0; |
|
|
|
vpm_cfg.Vc = v3d->prog.vs->prog_data.vs->vcm_cache_size; |
|
|
|
} |
|
|
|
#if V3D_VERSION >= 41 |
|
|
|
if (v3d->prog.gs) { |
|
|
|
v3d_emit_gs_state_record(v3d->job, |
|
|
|
v3d->prog.gs_bin, gs_bin_uniforms, |
|
|
|
v3d->prog.gs, gs_uniforms); |
|
|
|
|
|
|
|
struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs; |
|
|
|
struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs; |
|
|
|
|
|
|
|
v3d_emit_tes_gs_common_params(v3d->job, |
|
|
|
gs->out_prim_type, |
|
|
|
gs->num_invocations); |
|
|
|
v3d_emit_tes_gs_shader_params(v3d->job, gs_bin); |
|
|
|
v3d_emit_tes_gs_shader_params(v3d->job, gs); |
|
|
|
else { |
|
|
|
v3d_emit_gs_state_record(v3d->job, |
|
|
|
v3d->prog.gs_bin, gs_bin_uniforms, |
|
|
|
v3d->prog.gs, gs_uniforms); |
|
|
|
|
|
|
|
struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs; |
|
|
|
struct v3d_gs_prog_data *gs_bin = v3d->prog.gs_bin->prog_data.gs; |
|
|
|
|
|
|
|
v3d_emit_tes_gs_common_params(v3d->job, |
|
|
|
gs->out_prim_type, |
|
|
|
gs->num_invocations); |
|
|
|
|
|
|
|
/* Bin Tes/Gs params */ |
|
|
|
struct v3d_vs_prog_data *vs_bin = v3d->prog.cs->prog_data.vs; |
|
|
|
compute_vpm_config_gs(&v3d->screen->devinfo, |
|
|
|
vs_bin, gs_bin, &vpm_cfg_bin); |
|
|
|
|
|
|
|
v3d_emit_tes_gs_shader_params(v3d->job, |
|
|
|
vpm_cfg_bin.gs_width, |
|
|
|
vpm_cfg_bin.Gd, |
|
|
|
vpm_cfg_bin.Gv); |
|
|
|
|
|
|
|
/* Render Tes/Gs params */ |
|
|
|
struct v3d_vs_prog_data *vs = v3d->prog.vs->prog_data.vs; |
|
|
|
compute_vpm_config_gs(&v3d->screen->devinfo, |
|
|
|
vs, gs, &vpm_cfg); |
|
|
|
|
|
|
|
v3d_emit_tes_gs_shader_params(v3d->job, |
|
|
|
vpm_cfg.gs_width, |
|
|
|
vpm_cfg.Gd, |
|
|
|
vpm_cfg.Gv); |
|
|
|
} |
|
|
|
#endif |
|
|
|
|
|
|
@@ -593,8 +798,15 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, |
|
|
|
shader.fragment_shader_uniforms_address = fs_uniforms; |
|
|
|
|
|
|
|
#if V3D_VERSION >= 41 |
|
|
|
shader.min_coord_shader_input_segments_required_in_play = 1; |
|
|
|
shader.min_vertex_shader_input_segments_required_in_play = 1; |
|
|
|
shader.min_coord_shader_input_segments_required_in_play = |
|
|
|
vpm_cfg_bin.As; |
|
|
|
shader.min_vertex_shader_input_segments_required_in_play = |
|
|
|
vpm_cfg.As; |
|
|
|
|
|
|
|
shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = |
|
|
|
vpm_cfg_bin.Ve; |
|
|
|
shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size = |
|
|
|
vpm_cfg.Ve; |
|
|
|
|
|
|
|
shader.coordinate_shader_4_way_threadable = |
|
|
|
v3d->prog.cs->prog_data.vs->base.threads == 4; |
|
|
@@ -698,10 +910,8 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d, |
|
|
|
} |
|
|
|
|
|
|
|
cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) { |
|
|
|
vcm.number_of_16_vertex_batches_for_binning = |
|
|
|
v3d->prog.cs->prog_data.vs->vcm_cache_size; |
|
|
|
vcm.number_of_16_vertex_batches_for_rendering = |
|
|
|
v3d->prog.vs->prog_data.vs->vcm_cache_size; |
|
|
|
vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc; |
|
|
|
vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc; |
|
|
|
} |
|
|
|
|
|
|
|
#if V3D_VERSION >= 41 |