
v3d: Add SSBO/atomic counters support.

So far I assume that all the buffers get written.  If they weren't, you'd
probably be using UBOs instead.
Eric Anholt, 7 years ago
commit 5932c2f0b9 (tags/19.0-branchpoint)

src/broadcom/compiler/nir_to_vir.c (+129, -6)

@@ -107,16 +107,89 @@ vir_emit_thrsw(struct v3d_compile *c)
c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL);
}

static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
case nir_intrinsic_load_ssbo:
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_uniform:
return GENERAL_TMU_READ_OP_READ;
case nir_intrinsic_store_ssbo:
return GENERAL_TMU_WRITE_OP_WRITE;
case nir_intrinsic_ssbo_atomic_add:
return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
case nir_intrinsic_ssbo_atomic_imin:
return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
case nir_intrinsic_ssbo_atomic_umin:
return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
case nir_intrinsic_ssbo_atomic_imax:
return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
case nir_intrinsic_ssbo_atomic_umax:
return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
case nir_intrinsic_ssbo_atomic_and:
return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
case nir_intrinsic_ssbo_atomic_or:
return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
case nir_intrinsic_ssbo_atomic_xor:
return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
case nir_intrinsic_ssbo_atomic_exchange:
return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
case nir_intrinsic_ssbo_atomic_comp_swap:
return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
default:
unreachable("unknown intrinsic op");
}
}

/**
- * Implements indirect uniform loads through the TMU general memory access
- * interface.
* Implements indirect uniform loads and SSBO accesses through the TMU general
* memory access interface.
*/
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
- uint32_t tmu_op = GENERAL_TMU_READ_OP_READ;
- bool has_index = instr->intrinsic == nir_intrinsic_load_ubo;
- int offset_src = 0 + has_index;
/* XXX perf: We should turn add/sub of 1 to inc/dec. Perhaps NIR
* wants to have support for inc/dec?
*/

uint32_t tmu_op = v3d_general_tmu_op(instr);
bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;

int offset_src;
int tmu_writes = 1; /* address */
if (instr->intrinsic == nir_intrinsic_load_uniform) {
offset_src = 0;
} else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
instr->intrinsic == nir_intrinsic_load_ubo) {
offset_src = 1;
} else if (is_store) {
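/* store_ssbo: the value lives in src[0], the buffer index in src[1]
 * and the offset in src[2]; push each component of the value into
 * the TMU data fifo.
 */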
offset_src = 2;
for (int i = 0; i < instr->num_components; i++) {
vir_MOV_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
ntq_get_src(c, instr->src[0], i));
tmu_writes++;
}
} else {
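/* SSBO atomics: the data operand is in src[2], with src[3] holding
 * the second operand for compare-and-swap.
 */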
offset_src = 1;
vir_MOV_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
ntq_get_src(c, instr->src[2], 0));
tmu_writes++;
if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
vir_MOV_dest(c,
vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
ntq_get_src(c, instr->src[3], 0));
tmu_writes++;
}
}

/* Make sure we won't exceed the 16-entry TMU fifo if each thread is
* storing at the same time.
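* For example, a vec4 SSBO store issues 5 TMU writes (the address plus
* 4 data components); at 4 threads that exceeds the 16/4 = 4 fifo
* entries available per thread, so we drop to 2 threads.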
*/
while (tmu_writes > 16 / c->threads)
c->threads /= 2;

struct qreg offset;
if (instr->intrinsic == nir_intrinsic_load_uniform) {
@@ -149,12 +222,16 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)

if (base != 0)
offset = vir_ADD(c, offset, vir_uniform_ui(c, base));
- } else {
} else if (instr->intrinsic == nir_intrinsic_load_ubo) {
/* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
* 1 (0 is gallium's constant buffer 0).
*/
offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
nir_src_as_uint(instr->src[0]) + 1);
} else {
offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
nir_src_as_uint(instr->src[is_store ?
1 : 0]));
}

uint32_t config = (0xffffff00 |
@@ -167,6 +244,9 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
instr->num_components - 2);
}

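/* In non-uniform control flow, predicate the TMU access so it only
 * happens for channels that are currently active.
 */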
if (c->execute.file != QFILE_NULL)
vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ);

struct qreg dest;
if (config == ~0)
dest = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
@@ -188,10 +268,17 @@ ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr)
vir_uniform_ui(c, config);
}

if (c->execute.file != QFILE_NULL)
vir_set_cond(tmu, V3D_QPU_COND_IFA);

vir_emit_thrsw(c);

/* Read the result, or wait for the TMU op to complete. */
for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));

if (nir_intrinsic_dest_components(instr) == 0)
vir_TMUWT(c);
}

static struct qreg *
@@ -1549,6 +1636,9 @@ ntq_setup_uniforms(struct v3d_compile *c)
false);
unsigned vec4_size = 4 * sizeof(float);

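/* The shader's uniform variable list can now also contain SSBO
 * variables; only declare uniform ranges for actual uniforms.
 */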
if (var->data.mode != nir_var_uniform)
continue;

declare_uniform_range(c, var->data.driver_location * vec4_size,
vec4_count * vec4_size);

@@ -1629,6 +1719,27 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
ntq_emit_tmu_general(c, instr);
break;

case nir_intrinsic_ssbo_atomic_add:
case nir_intrinsic_ssbo_atomic_imin:
case nir_intrinsic_ssbo_atomic_umin:
case nir_intrinsic_ssbo_atomic_imax:
case nir_intrinsic_ssbo_atomic_umax:
case nir_intrinsic_ssbo_atomic_and:
case nir_intrinsic_ssbo_atomic_or:
case nir_intrinsic_ssbo_atomic_xor:
case nir_intrinsic_ssbo_atomic_exchange:
case nir_intrinsic_ssbo_atomic_comp_swap:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_store_ssbo:
ntq_emit_tmu_general(c, instr);
break;

case nir_intrinsic_get_buffer_size:
ntq_store_dest(c, &instr->dest, 0,
vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE,
nir_src_as_uint(instr->src[0])));
break;

case nir_intrinsic_load_user_clip_plane:
for (int i = 0; i < instr->num_components; i++) {
ntq_store_dest(c, &instr->dest, i,
@@ -1732,6 +1843,18 @@ ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
break;
}

case nir_intrinsic_memory_barrier:
case nir_intrinsic_memory_barrier_atomic_counter:
case nir_intrinsic_memory_barrier_buffer:
/* We don't do any instruction scheduling of these NIR
* instructions between each other, so we just need to make
* sure that the TMU operations before the barrier are flushed
* before the ones after the barrier. That is currently
* handled by having a THRSW in each of them and a LDTMU
* series or a TMUWT after.
*/
break;

default:
fprintf(stderr, "Unknown intrinsic: ");
nir_print_instr(&instr->instr, stderr);

src/broadcom/compiler/v3d_compiler.h (+6, -0)

@@ -243,6 +243,12 @@ enum quniform_contents {
QUNIFORM_TEXRECT_SCALE_X,
QUNIFORM_TEXRECT_SCALE_Y,

/* Returns the base offset of the SSBO given by the data value. */
QUNIFORM_SSBO_OFFSET,

/* Returns the size of the SSBO given by the data value. */
QUNIFORM_GET_BUFFER_SIZE,

QUNIFORM_ALPHA_REF,

/**

src/broadcom/compiler/vir_dump.c (+8, -0)

@@ -81,6 +81,14 @@ vir_dump_uniform(enum quniform_contents contents,
fprintf(stderr, "ubo[%d]", data);
break;

case QUNIFORM_SSBO_OFFSET:
fprintf(stderr, "ssbo[%d]", data);
break;

case QUNIFORM_GET_BUFFER_SIZE:
fprintf(stderr, "ssbo_size[%d]", data);
break;

default:
if (quniform_contents_is_texture_p0(contents)) {
fprintf(stderr, "tex[%d].p0: 0x%08x",

src/gallium/drivers/v3d/v3d_context.c (+11, -0)

@@ -65,6 +65,16 @@ v3d_pipe_flush(struct pipe_context *pctx, struct pipe_fence_handle **fence,
}
}

static void
v3d_memory_barrier(struct pipe_context *pctx, unsigned int flags)
{
struct v3d_context *v3d = v3d_context(pctx);

/* We only need to flush jobs writing to SSBOs/images. */
perf_debug("Flushing all jobs for glMemoryBarrier(), could do better");
v3d_flush(pctx);
}

static void
v3d_set_debug_callback(struct pipe_context *pctx,
const struct pipe_debug_callback *cb)
@@ -172,6 +182,7 @@ v3d_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
pctx->priv = priv;
pctx->destroy = v3d_context_destroy;
pctx->flush = v3d_pipe_flush;
pctx->memory_barrier = v3d_memory_barrier;
pctx->set_debug_callback = v3d_set_debug_callback;
pctx->invalidate_resource = v3d_invalidate_resource;
pctx->get_sample_position = v3d_get_sample_position;

src/gallium/drivers/v3d/v3d_context.h (+7, -0)

@@ -82,6 +82,7 @@ void v3d_job_add_bo(struct v3d_job *job, struct v3d_bo *bo);
#define VC5_DIRTY_OQ (1 << 28)
#define VC5_DIRTY_CENTROID_FLAGS (1 << 29)
#define VC5_DIRTY_NOPERSPECTIVE_FLAGS (1 << 30)
#define VC5_DIRTY_SSBO (1 << 31)

#define VC5_MAX_FS_INPUTS 64

@@ -203,6 +204,11 @@ struct v3d_streamout_stateobj {
unsigned num_targets;
};

struct v3d_ssbo_stateobj {
struct pipe_shader_buffer sb[PIPE_MAX_SHADER_BUFFERS];
uint32_t enabled_mask;
};

/* Hash table key for v3d->jobs */
struct v3d_job_key {
struct pipe_surface *cbufs[4];
@@ -433,6 +439,7 @@ struct v3d_context {
struct pipe_poly_stipple stipple;
struct pipe_clip_state clip;
struct pipe_viewport_state viewport;
struct v3d_ssbo_stateobj ssbo[PIPE_SHADER_TYPES];
struct v3d_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
struct v3d_texture_stateobj tex[PIPE_SHADER_TYPES];
struct v3d_vertexbuf_stateobj vertexbuf;

src/gallium/drivers/v3d/v3d_screen.c (+4, -1)

@@ -299,8 +299,11 @@ v3d_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
- case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
return VC5_MAX_TEXTURE_SAMPLERS;

case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
return PIPE_MAX_SHADER_BUFFERS;

case PIPE_SHADER_CAP_PREFERRED_IR:
return PIPE_SHADER_IR_NIR;
case PIPE_SHADER_CAP_SUPPORTED_IRS:

src/gallium/drivers/v3d/v3d_uniforms.c (+20, -0)

@@ -276,6 +276,21 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_compiled_shader *shader,
}
break;

case QUNIFORM_SSBO_OFFSET: {
struct pipe_shader_buffer *sb =
&v3d->ssbo[stage].sb[data];

cl_aligned_reloc(&job->indirect, &uniforms,
v3d_resource(sb->buffer)->bo,
sb->buffer_offset);
break;
}

case QUNIFORM_GET_BUFFER_SIZE:
cl_aligned_u32(&uniforms,
v3d->ssbo[stage].sb[data].buffer_size);
break;

case QUNIFORM_TEXTURE_FIRST_LEVEL:
cl_aligned_f(&uniforms,
texstate->textures[data]->u.tex.first_level);
@@ -362,6 +377,11 @@ v3d_set_shader_uniform_dirty_flags(struct v3d_compiled_shader *shader)
dirty |= VC5_DIRTY_FRAGTEX | VC5_DIRTY_VERTTEX;
break;

case QUNIFORM_SSBO_OFFSET:
case QUNIFORM_GET_BUFFER_SIZE:
dirty |= VC5_DIRTY_SSBO;
break;

case QUNIFORM_ALPHA_REF:
dirty |= VC5_DIRTY_ZSA;
break;

src/gallium/drivers/v3d/v3dx_draw.c (+11, -0)

@@ -478,6 +478,17 @@ v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
job->submit.in_sync_bcl = v3d->out_sync;
}

/* Mark SSBOs as being written. We don't actually know which ones are
* read vs written, so just assume the worst
*/
for (int s = 0; s < PIPE_SHADER_TYPES; s++) {
foreach_bit(i, v3d->ssbo[s].enabled_mask) {
v3d_job_add_write_resource(job,
v3d->ssbo[s].sb[i].buffer);
job->tmu_dirty_rcl = true;
}
}

/* Get space to emit our draw call into the BCL, using a branch to
* jump to a new BO if necessary.
*/

src/gallium/drivers/v3d/v3dx_state.c (+49, -0)

@@ -986,6 +986,53 @@ v3d_set_stream_output_targets(struct pipe_context *pctx,
ctx->dirty |= VC5_DIRTY_STREAMOUT;
}

static void
v3d_set_shader_buffers(struct pipe_context *pctx,
enum pipe_shader_type shader,
unsigned start, unsigned count,
const struct pipe_shader_buffer *buffers)
{
struct v3d_context *v3d = v3d_context(pctx);
struct v3d_ssbo_stateobj *so = &v3d->ssbo[shader];
unsigned mask = 0;

if (buffers) {
for (unsigned i = 0; i < count; i++) {
unsigned n = i + start;
struct pipe_shader_buffer *buf = &so->sb[n];

if ((buf->buffer == buffers[i].buffer) &&
(buf->buffer_offset == buffers[i].buffer_offset) &&
(buf->buffer_size == buffers[i].buffer_size))
continue;

mask |= 1 << n;

buf->buffer_offset = buffers[i].buffer_offset;
buf->buffer_size = buffers[i].buffer_size;
pipe_resource_reference(&buf->buffer, buffers[i].buffer);

if (buf->buffer)
so->enabled_mask |= 1 << n;
else
so->enabled_mask &= ~(1 << n);
}
} else {
mask = ((1 << count) - 1) << start;

for (unsigned i = 0; i < count; i++) {
unsigned n = i + start;
struct pipe_shader_buffer *buf = &so->sb[n];

pipe_resource_reference(&buf->buffer, NULL);
}

so->enabled_mask &= ~mask;
}

v3d->dirty |= VC5_DIRTY_SSBO;
}

void
v3dX(state_init)(struct pipe_context *pctx)
{
@@ -1025,6 +1072,8 @@ v3dX(state_init)(struct pipe_context *pctx)
pctx->sampler_view_destroy = v3d_sampler_view_destroy;
pctx->set_sampler_views = v3d_set_sampler_views;

pctx->set_shader_buffers = v3d_set_shader_buffers;

pctx->create_stream_output_target = v3d_create_stream_output_target;
pctx->stream_output_target_destroy = v3d_stream_output_target_destroy;
pctx->set_stream_output_targets = v3d_set_stream_output_targets;
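
Usage note (illustrative, not part of the commit): a gallium state tracker reaches the new v3d_set_shader_buffers hook through pipe_context::set_shader_buffers. The sketch below assumes an existing struct pipe_context *pctx and struct pipe_resource *buf_resource; it binds one buffer as SSBO slot 0 of the fragment shader and then unbinds it again. Passing NULL buffers clears "count" slots starting at "start", exactly as the else branch above handles it.

/* Hypothetical usage sketch -- pctx and buf_resource are assumed to exist. */
struct pipe_shader_buffer sb = {
        .buffer = buf_resource,
        .buffer_offset = 0,
        .buffer_size = buf_resource->width0,
};

/* Bind the buffer as SSBO 0 for the fragment shader; this sets
 * VC5_DIRTY_SSBO so the next draw re-emits the SSBO uniforms.
 */
pctx->set_shader_buffers(pctx, PIPE_SHADER_FRAGMENT, 0, 1, &sb);

/* Unbind again: a NULL buffer array releases the resource references
 * and clears the enabled_mask bits for the range.
 */
pctx->set_shader_buffers(pctx, PIPE_SHADER_FRAGMENT, 0, 1, NULL);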
