Browse Source

panfrost: Pack invocation_shifts manually instead of a bit field

gcc generates exceptionally bad code for panfrost_pack_work_groups_fused
when the shifts are written through bit-field members, so pack them into a
single 32-bit word by hand. That routine is somehow still hot, though.

Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/merge_requests/3067>
master
Alyssa Rosenzweig 5 years ago
parent
commit
6378797a6d

+ 19
- 19
src/panfrost/encoder/pan_invocation.c View File

shifts[i + 1] = shifts[i] + bit_count; shifts[i + 1] = shifts[i] + bit_count;
} }


/* We're packed, so upload everything */
out->invocation_count = packed;
out->size_y_shift = shifts[1];
out->size_z_shift = shifts[2];
out->workgroups_x_shift = shifts[3];
out->workgroups_y_shift = shifts[4];
out->workgroups_z_shift = shifts[5];

/* Quirk: for non-instanced graphics, the blob sets workgroups_z_shift
* = 32. This doesn't appear to matter to the hardware, but it's good
* to be bit-identical. */


if (quirk_graphics && (num_z <= 1)) if (quirk_graphics && (num_z <= 1))
out->workgroups_z_shift = 32;
shifts[5] = 32;


/* Quirk: for graphics, workgroups_x_shift_2 must be at least 2,
* whereas for OpenCL it is simply equal to workgroups_x_shift. For GL
* compute, it seems it might *always* be 2, but this is suspicious and
* needs further investigation. (I'm probably just using GL wrong). */


unsigned shift_2 = shifts[3];

if (quirk_graphics) if (quirk_graphics)
out->workgroups_x_shift_2 = MAX2(out->workgroups_x_shift, 2);
else
out->workgroups_x_shift_2 = out->workgroups_x_shift;
shift_2 = MAX2(shift_2, 2);

/* Pack them in */
uint32_t packed_shifts =
(shifts[1] << 0) |
(shifts[2] << 5) |
(shifts[3] << 10) |
(shifts[4] << 16) |
(shifts[5] << 22) |
(shift_2 << 28);

/* Upload the packed bitfields */
out->invocation_count = packed;
out->invocation_shifts = packed_shifts;


/* TODO: Compute workgroups_x_shift_3 */ /* TODO: Compute workgroups_x_shift_3 */
out->workgroups_x_shift_3 = out->workgroups_x_shift_2;
out->workgroups_x_shift_3 = shift_2;
} }


/* Packs vertex/tiler descriptors simultaneously */ /* Packs vertex/tiler descriptors simultaneously */


/* Copy results over */ /* Copy results over */
tiler->invocation_count = vertex->invocation_count; tiler->invocation_count = vertex->invocation_count;
tiler->size_y_shift = vertex->size_y_shift;
tiler->size_z_shift = vertex->size_z_shift;
tiler->workgroups_x_shift = vertex->workgroups_x_shift;
tiler->workgroups_x_shift_2 = vertex->workgroups_x_shift_2;
tiler->workgroups_y_shift = vertex->workgroups_y_shift;
tiler->workgroups_z_shift = vertex->workgroups_z_shift;
tiler->invocation_shifts = vertex->invocation_shifts;


/* Set special fields for each */ /* Set special fields for each */
vertex->workgroups_x_shift_3 = 5; vertex->workgroups_x_shift_3 = 5;

+ 10
- 7
src/panfrost/include/panfrost-job.h View File

*/ */
u32 invocation_count; u32 invocation_count;


u32 size_y_shift : 5;
u32 size_z_shift : 5;
u32 workgroups_x_shift : 6;
u32 workgroups_y_shift : 6;
u32 workgroups_z_shift : 6;
/* This is max(workgroups_x_shift, 2) in all the cases I've seen. */
u32 workgroups_x_shift_2 : 4;
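/* Bitfield for shifts, packed from the least significant bit upwards:
*
* size_y_shift : 5 (bits 0-4)
* size_z_shift : 5 (bits 5-9)
* workgroups_x_shift : 6 (bits 10-15)
* workgroups_y_shift : 6 (bits 16-21)
* workgroups_z_shift : 6 (bits 22-27)
* workgroups_x_shift_2 : 4 (bits 28-31)
*/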
u32 invocation_shifts;


u32 draw_mode : 4; u32 draw_mode : 4;
u32 unknown_draw : 22; u32 unknown_draw : 22;

+ 22
- 25
src/panfrost/pandecode/decode.c View File

* invocation_count for an explanation. * invocation_count for an explanation.
*/ */


unsigned size_x = bits(p->invocation_count, 0, p->size_y_shift) + 1;
unsigned size_y = bits(p->invocation_count, p->size_y_shift, p->size_z_shift) + 1;
unsigned size_z = bits(p->invocation_count, p->size_z_shift, p->workgroups_x_shift) + 1;
unsigned size_y_shift = bits(p->invocation_shifts, 0, 5);
unsigned size_z_shift = bits(p->invocation_shifts, 5, 10);
unsigned workgroups_x_shift = bits(p->invocation_shifts, 10, 16);
unsigned workgroups_y_shift = bits(p->invocation_shifts, 16, 22);
unsigned workgroups_z_shift = bits(p->invocation_shifts, 22, 28);
unsigned workgroups_x_shift_2 = bits(p->invocation_shifts, 28, 32);


unsigned groups_x = bits(p->invocation_count, p->workgroups_x_shift, p->workgroups_y_shift) + 1;
unsigned groups_y = bits(p->invocation_count, p->workgroups_y_shift, p->workgroups_z_shift) + 1;
unsigned groups_z = bits(p->invocation_count, p->workgroups_z_shift, 32) + 1;
unsigned size_x = bits(p->invocation_count, 0, size_y_shift) + 1;
unsigned size_y = bits(p->invocation_count, size_y_shift, size_z_shift) + 1;
unsigned size_z = bits(p->invocation_count, size_z_shift, workgroups_x_shift) + 1;

unsigned groups_x = bits(p->invocation_count, workgroups_x_shift, workgroups_y_shift) + 1;
unsigned groups_y = bits(p->invocation_count, workgroups_y_shift, workgroups_z_shift) + 1;
unsigned groups_z = bits(p->invocation_count, workgroups_z_shift, 32) + 1;


/* Even though we have this decoded, we want to ensure that the
* representation is "unique" so we don't lose anything by printing only


bool canonical = bool canonical =
(p->invocation_count == ref.invocation_count) && (p->invocation_count == ref.invocation_count) &&
(p->size_y_shift == ref.size_y_shift) &&
(p->size_z_shift == ref.size_z_shift) &&
(p->workgroups_x_shift == ref.workgroups_x_shift) &&
(p->workgroups_y_shift == ref.workgroups_y_shift) &&
(p->workgroups_z_shift == ref.workgroups_z_shift) &&
(p->workgroups_x_shift_2 == ref.workgroups_x_shift_2);
(p->invocation_shifts == ref.invocation_shifts);


if (!canonical) { if (!canonical) {
pandecode_msg("XXX: non-canonical workgroups packing\n"); pandecode_msg("XXX: non-canonical workgroups packing\n");
pandecode_msg("expected: %X, %d, %d, %d, %d, %d, %d\n",
pandecode_msg("expected: %X, %X",
ref.invocation_count, ref.invocation_count,
ref.size_y_shift,
ref.size_z_shift,
ref.workgroups_x_shift,
ref.workgroups_y_shift,
ref.workgroups_z_shift,
ref.workgroups_x_shift_2);
ref.invocation_shifts);


pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count); pandecode_prop("invocation_count = 0x%" PRIx32, p->invocation_count);
pandecode_prop("size_y_shift = %d", p->size_y_shift);
pandecode_prop("size_z_shift = %d", p->size_z_shift);
pandecode_prop("workgroups_x_shift = %d", p->workgroups_x_shift);
pandecode_prop("workgroups_y_shift = %d", p->workgroups_y_shift);
pandecode_prop("workgroups_z_shift = %d", p->workgroups_z_shift);
pandecode_prop("workgroups_x_shift_2 = %d", p->workgroups_x_shift_2);
pandecode_prop("size_y_shift = %d", size_y_shift);
pandecode_prop("size_z_shift = %d", size_z_shift);
pandecode_prop("workgroups_x_shift = %d", workgroups_x_shift);
pandecode_prop("workgroups_y_shift = %d", workgroups_y_shift);
pandecode_prop("workgroups_z_shift = %d", workgroups_z_shift);
pandecode_prop("workgroups_x_shift_2 = %d", workgroups_x_shift_2);
} }


/* Regardless, print the decode */ /* Regardless, print the decode */

Loading…
Cancel
Save