
u_vbuf: implement another upload codepath which unrolls indices

Improves performance from about 1 fps to 23 fps in Cogs.
This new codepath is not always used; a heuristic decides whether to
take it. Using translate for uploads is generally slower than the
existing path, so it is a win only in a few cases.
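For context, "unrolling indices" here means turning an indexed draw into a
non-indexed one by fetching vertices through the index buffer during the
translate fallback. A minimal sketch of the idea, with hypothetical names
(the real work is done by translate's run_elts* hooks, see the u_vbuf.c
diff below):

   #include <stdint.h>
   #include <string.h>

   /* Hypothetical illustration: emit one output vertex per index, so the
    * resulting linear stream can be drawn non-indexed.  The cost scales
    * with the number of indices, not with the span of the index range. */
   static void unroll_one_attrib(const uint8_t *verts, unsigned stride,
                                 const uint16_t *indices, unsigned num_indices,
                                 uint8_t *out)
   {
      for (unsigned i = 0; i < num_indices; i++)
         memcpy(out + i * stride, verts + indices[i] * stride, stride);
   }

That trade-off is what the heuristic in u_vbuf_draw_begin checks: unrolling
wins when the index range spans many more vertices than the draw actually
references.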
tags/mesa-8.0-rc1
Marek Olšák, 13 years ago · commit ce44bae366

src/gallium/auxiliary/util/u_vbuf.c (+119, -26)

@@ -166,7 +166,9 @@ void u_vbuf_destroy(struct u_vbuf *mgrb)
 static void
 u_vbuf_translate_buffers(struct u_vbuf_priv *mgr, struct translate_key *key,
                          unsigned vb_mask, unsigned out_vb,
-                         int start_vertex, unsigned num_vertices)
+                         int start_vertex, unsigned num_vertices,
+                         int start_index, unsigned num_indices, int min_index,
+                         bool unroll_indices)
 {
    struct translate *tr;
    struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
@@ -198,21 +200,65 @@ u_vbuf_translate_buffers(struct u_vbuf_priv *mgr, struct translate_key *key,
                                     PIPE_TRANSFER_READ, &vb_transfer[i]);
          }
 
+         /* Subtract min_index so that indexing with the index buffer works. */
+         if (unroll_indices) {
+            map -= vb->stride * min_index;
+         }
+
          tr->set_buffer(tr, i, map, vb->stride, ~0);
       }
    }
 
-   /* Create and map the output buffer. */
-   u_upload_alloc(mgr->b.uploader,
-                  key->output_stride * start_vertex,
-                  key->output_stride * num_vertices,
-                  &out_offset, &out_buffer,
-                  (void**)&out_map);
-
-   out_offset -= key->output_stride * start_vertex;
-
-   /* Translate. */
-   tr->run(tr, 0, num_vertices, 0, out_map);
+   /* Translate. */
+   if (unroll_indices) {
+      struct pipe_index_buffer *ib = &mgr->b.index_buffer;
+      struct pipe_transfer *transfer = NULL;
+      unsigned offset = ib->offset + start_index * ib->index_size;
+      uint8_t *map;
+
+      assert(ib->buffer && ib->index_size);
+
+      if (u_vbuf_resource(ib->buffer)->user_ptr) {
+         map = u_vbuf_resource(ib->buffer)->user_ptr + offset;
+      } else {
+         map = pipe_buffer_map_range(mgr->pipe, ib->buffer, offset,
+                                     num_indices * ib->index_size,
+                                     PIPE_TRANSFER_READ, &transfer);
+      }
+
+      /* Create and map the output buffer. */
+      u_upload_alloc(mgr->b.uploader, 0,
+                     key->output_stride * num_indices,
+                     &out_offset, &out_buffer,
+                     (void**)&out_map);
+
+      switch (ib->index_size) {
+      case 4:
+         tr->run_elts(tr, (unsigned*)map, num_indices, 0, out_map);
+         break;
+      case 2:
+         tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, out_map);
+         break;
+      case 1:
+         tr->run_elts8(tr, map, num_indices, 0, out_map);
+         break;
+      }
+
+      if (transfer) {
+         pipe_buffer_unmap(mgr->pipe, transfer);
+      }
+   } else {
+      /* Create and map the output buffer. */
+      u_upload_alloc(mgr->b.uploader,
+                     key->output_stride * start_vertex,
+                     key->output_stride * num_vertices,
+                     &out_offset, &out_buffer,
+                     (void**)&out_map);
+
+      out_offset -= key->output_stride * start_vertex;
+
+      tr->run(tr, 0, num_vertices, 0, out_map);
+   }
 
    /* Unmap all buffers. */
    for (i = 0; i < mgr->b.nr_vertex_buffers; i++) {
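A note on the min_index rebasing in the hunk above (illustrative arithmetic,
not Gallium API): the vertex buffer is mapped starting at vertex min_index,
while run_elts* is fed the application's original index values, so the input
pointer is shifted back by stride * min_index to make base + index * stride
land inside the mapped range.

   #include <stddef.h>
   #include <stdint.h>

   /* Illustrative only: after rebasing, indexing with an original index
    * value >= min_index addresses the correct mapped vertex. */
   static const uint8_t *vertex_addr(const uint8_t *mapped_at_min_index,
                                     unsigned stride, unsigned min_index,
                                     unsigned index)
   {
      const uint8_t *base = mapped_at_min_index - (size_t)stride * min_index;
      return base + (size_t)index * stride;
      /* == mapped_at_min_index + (size_t)(index - min_index) * stride */
   }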
@@ -283,7 +329,9 @@ u_vbuf_translate_find_free_vb_slots(struct u_vbuf_priv *mgr,
 static boolean
 u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
                        int start_vertex, unsigned num_vertices,
-                       int start_instance, unsigned num_instances)
+                       int start_instance, unsigned num_instances,
+                       int start_index, unsigned num_indices, int min_index,
+                       bool unroll_indices)
 {
    unsigned mask[VB_NUM] = {0};
    struct translate_key key[VB_NUM];
@@ -310,16 +358,24 @@ u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
    for (i = 0; i < mgr->ve->count; i++) {
       unsigned vb_index = mgr->ve->ve[i].vertex_buffer_index;
 
-      if (!mgr->ve->incompatible_layout_elem[i] &&
-          !mgr->incompatible_vb[vb_index]) {
-         continue;
-      }
-
       if (!mgr->b.vertex_buffer[vb_index].stride) {
+         if (!mgr->ve->incompatible_layout_elem[i] &&
+             !mgr->incompatible_vb[vb_index]) {
+            continue;
+         }
          mask[VB_CONST] |= 1 << vb_index;
       } else if (mgr->ve->ve[i].instance_divisor) {
+         if (!mgr->ve->incompatible_layout_elem[i] &&
+             !mgr->incompatible_vb[vb_index]) {
+            continue;
+         }
         mask[VB_INSTANCE] |= 1 << vb_index;
       } else {
+         if (!unroll_indices &&
+             !mgr->ve->incompatible_layout_elem[i] &&
+             !mgr->incompatible_vb[vb_index]) {
+            continue;
+         }
          mask[VB_VERTEX] |= 1 << vb_index;
       }
    }
@@ -336,15 +392,16 @@ u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
       struct translate_key *k;
       struct translate_element *te;
       unsigned bit, vb_index = mgr->ve->ve[i].vertex_buffer_index;
+      bit = 1 << vb_index;
 
       if (!mgr->ve->incompatible_layout_elem[i] &&
-          !mgr->incompatible_vb[vb_index]) {
+          !mgr->incompatible_vb[vb_index] &&
+          (!unroll_indices || !(mask[VB_VERTEX] & bit))) {
          continue;
       }
 
      /* Set type to what we will translate.
       * Whether vertex, instance, or constant attribs. */
-      bit = 1 << vb_index;
       for (type = 0; type < VB_NUM; type++) {
          if (mask[type] & bit) {
             break;
@@ -376,7 +433,9 @@ u_vbuf_translate_begin(struct u_vbuf_priv *mgr,
       if (key[type].nr_elements) {
          u_vbuf_translate_buffers(mgr, &key[type], mask[type],
                                   mgr->fallback_vbs[type],
-                                  start[type], num[type]);
+                                  start[type], num[type],
+                                  start_index, num_indices, min_index,
+                                  unroll_indices && type == VB_VERTEX);
 
          /* Fixup the stride for constant attribs. */
          if (type == VB_CONST) {
@@ -917,11 +976,12 @@ static void u_vbuf_get_minmax_index(struct pipe_context *pipe,
 
 enum u_vbuf_return_flags
 u_vbuf_draw_begin(struct u_vbuf *mgrb,
-                  const struct pipe_draw_info *info)
+                  struct pipe_draw_info *info)
 {
    struct u_vbuf_priv *mgr = (struct u_vbuf_priv*)mgrb;
-   int start_vertex;
+   int start_vertex, min_index;
    unsigned num_vertices;
+   bool unroll_indices = false;
 
    if (!mgr->incompatible_vb_layout &&
        !mgr->ve->incompatible_layout &&
@@ -930,7 +990,7 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
    }
 
    if (info->indexed) {
-      int min_index, max_index;
+      int max_index;
       bool index_bounds_valid = false;
 
       if (info->max_index != ~0) {
@@ -950,6 +1010,17 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
 
       start_vertex = min_index + info->index_bias;
       num_vertices = max_index + 1 - min_index;
+
+      /* Primitive restart doesn't work when unrolling indices.
+       * We would have to break this drawing operation into several ones. */
+      /* Use some heuristic to see if unrolling indices improves
+       * performance. */
+      if (!info->primitive_restart &&
+          num_vertices > info->count*2 &&
+          num_vertices-info->count > 32) {
+         /*printf("num_vertices=%i count=%i\n", num_vertices, info->count);*/
+         unroll_indices = true;
+      }
    } else {
       /* Nothing to do for per-vertex attribs. */
       start_vertex = 0;
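The heuristic above compares the two costs: the old path translates every
vertex in the index range (num_vertices), while unrolling translates one
vertex per index (info->count). A hedged restatement with a worked example;
the function name is mine, the condition is taken from the hunk:

   #include <stdbool.h>

   /* Example: count = 100 indices scattered over a 1000-vertex range gives
    * num_vertices = 1000 > 2*100 and 1000 - 100 = 900 > 32, so unrolling
    * 100 vertices beats translating 1000.  A dense draw where num_vertices
    * is close to count fails both tests and keeps the old path. */
   static bool should_unroll_indices(bool primitive_restart,
                                     unsigned num_vertices, unsigned count)
   {
      return !primitive_restart &&
             num_vertices > count * 2 &&
             num_vertices - count > 32;
   }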
@@ -959,13 +1030,18 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
    } else {
       start_vertex = info->start;
       num_vertices = info->count;
+      min_index = 0;
    }
 
    /* Translate vertices with non-native layouts or formats. */
-   if (mgr->incompatible_vb_layout || mgr->ve->incompatible_layout) {
+   if (unroll_indices ||
+       mgr->incompatible_vb_layout ||
+       mgr->ve->incompatible_layout) {
       /* XXX check the return value */
       u_vbuf_translate_begin(mgr, start_vertex, num_vertices,
-                             info->start_instance, info->instance_count);
+                             info->start_instance, info->instance_count,
+                             info->start, info->count, min_index,
+                             unroll_indices);
    }
 
    /* Upload user buffers. */
@@ -974,7 +1050,15 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
                            info->start_instance, info->instance_count);
    }
 
-   /*unsigned i;
+   /*
+   if (unroll_indices) {
+      printf("unrolling indices: start_vertex = %i, num_vertices = %i\n",
+             start_vertex, num_vertices);
+      util_dump_draw_info(stdout, info);
+      printf("\n");
+   }
+
+   unsigned i;
    for (i = 0; i < mgr->b.nr_vertex_buffers; i++) {
       printf("input %i: ", i);
       util_dump_vertex_buffer(stdout, mgr->b.vertex_buffer+i);
@@ -984,7 +1068,16 @@ u_vbuf_draw_begin(struct u_vbuf *mgrb,
       printf("real %i: ", i);
       util_dump_vertex_buffer(stdout, mgr->b.real_vertex_buffer+i);
       printf("\n");
-   }*/
+   }
+   */
+
+   if (unroll_indices) {
+      info->indexed = FALSE;
+      info->index_bias = 0;
+      info->min_index = 0;
+      info->max_index = info->count - 1;
+      info->start = 0;
+   }
 
    return U_VBUF_BUFFERS_UPDATED;
 }
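Since the unrolled stream is linear, u_vbuf_draw_begin rewrites the caller's
pipe_draw_info into an equivalent non-indexed draw, which is why the const
qualifier is dropped in the header and driver hunks below. A sketch of the
calling pattern a driver ends up with; the context struct and emit function
are hypothetical:

   #include "pipe/p_context.h"
   #include "util/u_vbuf.h"

   struct example_context {
      struct pipe_context base;
      struct u_vbuf *vbuf_mgr;
   };

   static void example_emit_draw(struct example_context *ectx,
                                 const struct pipe_draw_info *info);

   static void example_draw_vbo(struct pipe_context *ctx,
                                const struct pipe_draw_info *dinfo)
   {
      struct example_context *ectx = (struct example_context *)ctx;
      struct pipe_draw_info info = *dinfo;   /* mutable local copy */

      u_vbuf_draw_begin(ectx->vbuf_mgr, &info);

      /* If indices were unrolled, info.indexed is now FALSE and
       * info.start/info.count describe the linear vertex stream. */
      example_emit_draw(ectx, &info);
   }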

src/gallium/auxiliary/util/u_vbuf.h (+1, -1)

@@ -130,7 +130,7 @@ void u_vbuf_set_index_buffer(struct u_vbuf *mgr,
                              const struct pipe_index_buffer *ib);
 
 enum u_vbuf_return_flags u_vbuf_draw_begin(struct u_vbuf *mgr,
-                                           const struct pipe_draw_info *info);
+                                           struct pipe_draw_info *info);
 
 unsigned u_vbuf_draw_max_vertex_count(struct u_vbuf *mgr);

src/gallium/drivers/r600/r600_state_common.c (+1, -1)

@@ -605,7 +605,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *dinfo)
 
    r600_update_derived_state(rctx);
 
-   u_vbuf_draw_begin(rctx->vbuf_mgr, dinfo);
+   u_vbuf_draw_begin(rctx->vbuf_mgr, &info);
    r600_vertex_buffer_update(rctx);
 
    rdraw.vgt_num_indices = info.count;
