So far I assume that all the SSBOs get written. If they weren't, you'd probably be using UBOs instead.
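That assumption shows up in the v3dx_draw.c hunk below, where every bound SSBO gets added to the job as a write resource. If it ever matters, the compiler could record which SSBO bindings the shader actually stores to (or hits with atomics) and the draw path could add the rest with plain v3d_job_add_bo() instead. A rough sketch of that idea — the helper and the notion of stashing its result on the compiled shader are mine, not part of this series:

/* Sketch: compute which SSBO blocks a shader can write, so the draw path
 * could add read-only buffers with v3d_job_add_bo() instead of
 * v3d_job_add_write_resource().  Hypothetical helper, not in the commit.
 */
#include "compiler/nir/nir.h"

static uint32_t
v3d_nir_ssbo_write_mask(nir_shader *s)
{
        uint32_t mask = 0;
        nir_function_impl *impl = nir_shader_get_entrypoint(s);

        nir_foreach_block(block, impl) {
                nir_foreach_instr(instr, block) {
                        if (instr->type != nir_instr_type_intrinsic)
                                continue;

                        nir_intrinsic_instr *intr =
                                nir_instr_as_intrinsic(instr);
                        int block_src;

                        switch (intr->intrinsic) {
                        case nir_intrinsic_store_ssbo:
                                /* The SSBO index is src[1] for stores... */
                                block_src = 1;
                                break;
                        case nir_intrinsic_ssbo_atomic_add:
                        case nir_intrinsic_ssbo_atomic_imin:
                        case nir_intrinsic_ssbo_atomic_umin:
                        case nir_intrinsic_ssbo_atomic_imax:
                        case nir_intrinsic_ssbo_atomic_umax:
                        case nir_intrinsic_ssbo_atomic_and:
                        case nir_intrinsic_ssbo_atomic_or:
                        case nir_intrinsic_ssbo_atomic_xor:
                        case nir_intrinsic_ssbo_atomic_exchange:
                        case nir_intrinsic_ssbo_atomic_comp_swap:
                                /* ...and src[0] for the atomics. */
                                block_src = 0;
                                break;
                        default:
                                continue;
                        }

                        if (nir_src_is_const(intr->src[block_src])) {
                                mask |= 1u << nir_src_as_uint(intr->src[block_src]);
                        } else {
                                /* Indirectly-indexed SSBO: give up and treat
                                 * everything as written.
                                 */
                                return ~0u;
                        }
                }
        }

        return mask;
}

For now the conservative version keeps the bookkeeping trivial, at the cost of serializing against jobs that only ever read those buffers.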
@@ -107,16 +107,89 @@ vir_emit_thrsw(struct v3d_compile *c)
        c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
 }

static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
                return GENERAL_TMU_READ_OP_READ;
        case nir_intrinsic_store_ssbo:
                return GENERAL_TMU_WRITE_OP_WRITE;
        case nir_intrinsic_ssbo_atomic_add:
                return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
        case nir_intrinsic_ssbo_atomic_imin:
                return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
        case nir_intrinsic_ssbo_atomic_umin:
                return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
        case nir_intrinsic_ssbo_atomic_imax:
                return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
        case nir_intrinsic_ssbo_atomic_umax:
                return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
        case nir_intrinsic_ssbo_atomic_and:
                return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
        case nir_intrinsic_ssbo_atomic_or:
                return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
        case nir_intrinsic_ssbo_atomic_xor:
                return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
        case nir_intrinsic_ssbo_atomic_exchange:
                return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
        case nir_intrinsic_ssbo_atomic_comp_swap:
                return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
        default:
                unreachable("unknown intrinsic op");
        }
}

/**
 * Implements indirect uniform loads through the TMU general memory access
 * interface.
 * Implements indirect uniform loads and SSBO accesses through the TMU general
 * memory access interface.
 */
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
        bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
        int offset_src = 0 + has_index;
        /* XXX perf: We should turn add/sub of 1 to inc/dec.  Perhaps NIR
         * wants to have support for inc/dec?
         */
        uint32_t tmu_op = v3d_general_tmu_op(instr);
        bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;
        int offset_src;
        int tmu_writes = 1; /* address */

        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                offset_src = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_ubo) {
                offset_src = 1;
        } else if (is_store) {
                offset_src = 2;
                for (int i = 0; i < instr->num_components; i++) {
                        vir_MOV_dest(c,
                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                                     ntq_get_src(c, instr->src[0], i));
                        tmu_writes++;
                }
        } else {
                offset_src = 1;
                vir_MOV_dest(c,
                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             ntq_get_src(c, instr->src[2], 0));
                tmu_writes++;
                if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
                        vir_MOV_dest(c,
                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                                     ntq_get_src(c, instr->src[3], 0));
                        tmu_writes++;
                }
        }

        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
         * storing at the same time.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        struct qreg offset;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -149,12 +222,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
                if (base != 0)
                        offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
        } else {
        } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                 * 1 (0 is gallium's constant buffer 0).
                 */
                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                     nir_src_as_uint(instr->src[0]) + 1);
        } else {
                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
                                     nir_src_as_uint(instr->src[is_store ?
                                                                1 : 0]));
        }

        uint32_t config = (0xffffff00 |

@@ -167,6 +244,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
                                   instr->num_components - 2);
        }

        if (c->execute.file != QFILE_NULL)
                vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);

        struct qreg dest;
        if (config == ~0)
                dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);

@@ -188,10 +268,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
                        vir_uniform_ui(c, config);
        }

        if (c->execute.file != QFILE_NULL)
                vir_set_cond(tmu, V3D_QPU_COND_IFA);

        vir_emit_thrsw(c);

        /* Read the result, or wait for the TMU op to complete. */
        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));

        if (nir_intrinsic_dest_components(instr) == 0)
                vir_TMUWT(c);
}
static struct qreg *

@@ -1549,6 +1636,9 @@ ntq_setup_uniforms(struct v3d_compile *c)
                                                           false);
                unsigned vec4_size = 4 * sizeof(float);

                if (var->data.mode != nir_var_uniform)
                        continue;

                declare_uniform_range(c, var->data.driver_location * vec4_size,
                                      vec4_count * vec4_size);
@@ -1629,6 +1719,27 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                ntq_emit_tmu_general(c, instr);
                break;

        case nir_intrinsic_ssbo_atomic_add:
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_ssbo_atomic_comp_swap:
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_store_ssbo:
                ntq_emit_tmu_general(c, instr);
                break;

        case nir_intrinsic_get_buffer_size:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE,
                                           nir_src_as_uint(instr->src[0])));
                break;

        case nir_intrinsic_load_user_clip_plane:
                for (int i = 0; i < instr->num_components; i++) {
                        ntq_store_dest(c, &instr->dest, i,

@@ -1732,6 +1843,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
                break;
        }

        case nir_intrinsic_memory_barrier:
        case nir_intrinsic_memory_barrier_atomic_counter:
        case nir_intrinsic_memory_barrier_buffer:
                /* We don't do any instruction scheduling of these NIR
                 * instructions between each other, so we just need to make
                 * sure that the TMU operations before the barrier are flushed
                 * before the ones after the barrier.  That is currently
                 * handled by having a THRSW in each of them and a LDTMU
                 * series or a TMUWT after.
                 */
                break;

        default:
                fprintf(stderr, "Unknown intrinsic: ");
                nir_print_instr(&instr->instr, stderr);
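To put numbers on the tmu_writes clamp in ntq_emit_tmu_general() above: a four-component SSBO store queues one address write plus four data writes, so tmu_writes is 5. At 4-way threading each thread's share of the 16-entry fifo is 16/4 = 4 entries, so the loop drops the shader to 2-way threading, where 16/2 = 8 entries is plenty. That's just my arithmetic on the comment in the code, but it shows why wide stores can cost you thread parallelism.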
@@ -243,6 +243,12 @@ enum quniform_contents {
        QUNIFORM_TEXRECT_SCALE_X,
        QUNIFORM_TEXRECT_SCALE_Y,

        /* Returns the base offset of the SSBO given by the data value. */
        QUNIFORM_SSBO_OFFSET,

        /* Returns the size of the SSBO given by the data value. */
        QUNIFORM_GET_BUFFER_SIZE,

        QUNIFORM_ALPHA_REF,

        /**
@@ -81,6 +81,14 @@ vir_dump_uniform(enum quniform_contents contents,
                fprintf(stderr, "ubo[%d]", data);
                break;

        case QUNIFORM_SSBO_OFFSET:
                fprintf(stderr, "ssbo[%d]", data);
                break;

        case QUNIFORM_GET_BUFFER_SIZE:
                fprintf(stderr, "ssbo_size[%d]", data);
                break;

        default:
                if (quniform_contents_is_texture_p0(contents)) {
                        fprintf(stderr, "tex[%d].p0: 0x%08x",
@@ -65,6 +65,16 @@ v3d_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
        }
}

static void
v3d_memory_barrier(struct pipe_context *pctx, unsigned int flags)
{
        struct v3d_context *v3d = v3d_context(pctx);

        /* We only need to flush jobs writing to SSBOs/images. */
        perf_debug("Flushing all jobs for glMemoryBarrier(), could do better");
        v3d_flush(pctx);
}

static void
v3d_set_debug_callback(struct pipe_context *pctx,
                       const struct pipe_debug_callback *cb)

@@ -172,6 +182,7 @@ v3d_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
        pctx->priv = priv;
        pctx->destroy = v3d_context_destroy;
        pctx->flush = v3d_pipe_flush;
        pctx->memory_barrier = v3d_memory_barrier;
        pctx->set_debug_callback = v3d_set_debug_callback;
        pctx->invalidate_resource = v3d_invalidate_resource;
        pctx->get_sample_position = v3d_get_sample_position;
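The perf_debug() in v3d_memory_barrier() above is the other known shortcut: a barrier only needs to flush jobs that may have written memory through the TMU, and the draw path below already tags those with job->tmu_dirty_rcl. Here's a sketch of the narrower version, reusing the same hash_table_foreach pattern that v3d_flush() uses — again just an illustration, not what landed:

/* Sketch of a narrower memory barrier: only submit jobs that did TMU
 * writes (job->tmu_dirty_rcl is set by the SSBO path in v3d_draw_vbo()
 * below).  Assumes the usual v3d_context.h environment; "flags" is still
 * ignored here.
 */
static void
v3d_memory_barrier_tmu_only(struct pipe_context *pctx, unsigned int flags)
{
        struct v3d_context *v3d = v3d_context(pctx);

        hash_table_foreach(v3d->jobs, entry) {
                struct v3d_job *job = entry->data;

                if (job->tmu_dirty_rcl)
                        v3d_job_submit(v3d, job);
        }
}

Going further you could look at the barrier bits in flags and at which resources each job actually writes, but skipping jobs that never touched an SSBO already avoids most of the unnecessary draining.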
@@ -82,6 +82,7 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
#define VC5_DIRTY_OQ                  (1 << 28)
#define VC5_DIRTY_CENTROID_FLAGS      (1 << 29)
#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1 << 30)
#define VC5_DIRTY_SSBO                (1 << 31)

#define VC5_MAX_FS_INPUTS 64

@@ -203,6 +204,11 @@ struct v3d_streamout_stateobj {
        unsigned num_targets;
};

struct v3d_ssbo_stateobj {
        struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS];
        uint32_t enabled_mask;
};

/* Hash table key for v3d->jobs */
struct v3d_job_key {
        struct pipe_surface *cbufs[4];

@@ -433,6 +439,7 @@ struct v3d_context {
        struct pipe_poly_stipple stipple;
        struct pipe_clip_state clip;
        struct pipe_viewport_state viewport;
        struct v3d_ssbo_stateobj ssbo[PIPE_SHADER_TYPES];
        struct v3d_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
        struct v3d_texture_stateobj tex[PIPE_SHADER_TYPES];
        struct v3d_vertexbuf_stateobj vertexbuf;
@@ -299,8 +299,11 @@ v3d_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
        case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
        case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
        case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
                return VC5_MAX_TEXTURE_SAMPLERS;

        case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
                return PIPE_MAX_SHADER_BUFFERS;

        case PIPE_SHADER_CAP_PREFERRED_IR:
                return PIPE_SHADER_IR_NIR;
        case PIPE_SHADER_CAP_SUPPORTED_IRS:
@@ -276,6 +276,21 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader,
                        }
                        break;

                case QUNIFORM_SSBO_OFFSET: {
                        struct pipe_shader_buffer *sb =
                                &v3d->ssbo[stage].sb[data];

                        cl_aligned_reloc(&job->indirect, &uniforms,
                                         v3d_resource(sb->buffer)->bo,
                                         sb->buffer_offset);
                        break;
                }

                case QUNIFORM_GET_BUFFER_SIZE:
                        cl_aligned_u32(&uniforms,
                                       v3d->ssbo[stage].sb[data].buffer_size);
                        break;

                case QUNIFORM_TEXTURE_FIRST_LEVEL:
                        cl_aligned_f(&uniforms,
                                     texstate->textures[data]->u.tex.first_level);

@@ -362,6 +377,11 @@ v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader)
                        dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX;
                        break;

                case QUNIFORM_SSBO_OFFSET:
                case QUNIFORM_GET_BUFFER_SIZE:
                        dirty |= VC5_DIRTY_SSBO;
                        break;

                case QUNIFORM_ALPHA_REF:
                        dirty |= VC5_DIRTY_ZSA;
                        break;
@@ -478,6 +478,17 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                job->submit.in_sync_bcl = v3d->out_sync;
        }

        /* Mark SSBOs as being written.  We don't actually know which ones are
         * read vs written, so just assume the worst
         */
        for (int s = 0; s < PIPE_SHADER_TYPES; s++) {
                foreach_bit(i, v3d->ssbo[s].enabled_mask) {
                        v3d_job_add_write_resource(job,
                                                   v3d->ssbo[s].sb[i].buffer);
                        job->tmu_dirty_rcl = true;
                }
        }

        /* Get space to emit our draw call into the BCL, using a branch to
         * jump to a new BO if necessary.
         */
@@ -986,6 +986,53 @@ v3d_set_stream_output_targets(struct pipe_context *pctx,
        ctx->dirty |= VC5_DIRTY_STREAMOUT;
}

static void
v3d_set_shader_buffers(struct pipe_context *pctx,
                       enum pipe_shader_type shader,
                       unsigned start, unsigned count,
                       const struct pipe_shader_buffer *buffers)
{
        struct v3d_context *v3d = v3d_context(pctx);
        struct v3d_ssbo_stateobj *so = &v3d->ssbo[shader];
        unsigned mask = 0;

        if (buffers) {
                for (unsigned i = 0; i < count; i++) {
                        unsigned n = i + start;
                        struct pipe_shader_buffer *buf = &so->sb[n];

                        if ((buf->buffer == buffers[i].buffer) &&
                            (buf->buffer_offset == buffers[i].buffer_offset) &&
                            (buf->buffer_size == buffers[i].buffer_size))
                                continue;

                        mask |= 1 << n;

                        buf->buffer_offset = buffers[i].buffer_offset;
                        buf->buffer_size = buffers[i].buffer_size;
                        pipe_resource_reference(&buf->buffer, buffers[i].buffer);

                        if (buf->buffer)
                                so->enabled_mask |= 1 << n;
                        else
                                so->enabled_mask &= ~(1 << n);
                }
        } else {
                mask = ((1 << count) - 1) << start;

                for (unsigned i = 0; i < count; i++) {
                        unsigned n = i + start;
                        struct pipe_shader_buffer *buf = &so->sb[n];

                        pipe_resource_reference(&buf->buffer, NULL);
                }

                so->enabled_mask &= ~mask;
        }

        v3d->dirty |= VC5_DIRTY_SSBO;
}

void
v3dX(state_init)(struct pipe_context *pctx)
{

@@ -1025,6 +1072,8 @@ v3dX(state_init)(struct pipe_context *pctx)
        pctx->sampler_view_destroy = v3d_sampler_view_destroy;
        pctx->set_sampler_views = v3d_set_sampler_views;

        pctx->set_shader_buffers = v3d_set_shader_buffers;

        pctx->create_stream_output_target = v3d_create_stream_output_target;
        pctx->stream_output_target_destroy = v3d_stream_output_target_destroy;
        pctx->set_stream_output_targets = v3d_set_stream_output_targets;