Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
--- a/docs/relnotes/10.6.0.html
+++ b/docs/relnotes/10.6.0.html
@@ -50,6 +50,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_gpu_shader_fp64 on nvc0, softpipe</li>
 <li>GL_ARB_instanced_arrays on freedreno</li>
 <li>GL_ARB_pipeline_statistics_query on i965, nv50, nvc0, r600, radeonsi, softpipe</li>
+<li>GL_ARB_uniform_buffer_object on freedreno</li>
 <li>GL_EXT_draw_buffers2 on freedreno</li>
 <li>GL_ARB_clip_control on i965</li>
 </ul>
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -87,11 +87,12 @@ static void
 emit_constants(struct fd_ringbuffer *ring,
 		enum adreno_state_block sb,
 		struct fd_constbuf_stateobj *constbuf,
-		struct ir3_shader_variant *shader)
+		struct ir3_shader_variant *shader,
+		bool emit_immediates)
 {
 	uint32_t enabled_mask = constbuf->enabled_mask;
-	uint32_t first_immediate;
-	uint32_t base = 0;
+	uint32_t max_const;
+	int i;

 	// XXX TODO only emit dirty consts.. but we need to keep track if
 	// they are clobbered by a clear, gmem2mem, or mem2gmem..
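A note on the signature change: the shader variant previously doubled as the "emit immediates" flag (callers passed NULL to skip them), but the UBO pointer table added below always needs the variant for first_driver_param and constlen, so the flag now travels separately as emit_immediates; see the updated call sites in fd3_emit_state further down.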
@@ -102,42 +103,57 @@ emit_constants(struct fd_ringbuffer *ring,
 	 * than first_immediate.  In that case truncate the user consts
 	 * early to avoid HLSQ lockup caused by writing too many consts
 	 */
-	first_immediate = MIN2(shader->first_immediate, shader->constlen);
+	max_const = MIN2(shader->first_driver_param, shader->constlen);

 	/* emit user constants: */
-	while (enabled_mask) {
-		unsigned index = ffs(enabled_mask) - 1;
+	if (enabled_mask & 1) {
+		const unsigned index = 0;
 		struct pipe_constant_buffer *cb = &constbuf->cb[index];
 		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */

 		// I expect that size should be a multiple of vec4's:
 		assert(size == align(size, 4));

-		/* gallium could leave const buffers bound above what the
-		 * current shader uses.. don't let that confuse us.
+		/* and even if the start of the const buffer is before
+		 * first_immediate, the end may not be:
 		 */
-		if (base >= (4 * first_immediate))
-			break;
+		size = MIN2(size, 4 * max_const);

-		if (constbuf->dirty_mask & (1 << index)) {
-			/* and even if the start of the const buffer is before
-			 * first_immediate, the end may not be:
-			 */
-			size = MIN2(size, (4 * first_immediate) - base);
-			fd3_emit_constant(ring, sb, base,
-					cb->buffer_offset, size,
-					cb->user_buffer, cb->buffer);
+		if (size && constbuf->dirty_mask & (1 << index)) {
+			fd3_emit_constant(ring, sb, 0,
+					cb->buffer_offset, size,
+					cb->user_buffer, cb->buffer);
 			constbuf->dirty_mask &= ~(1 << index);
 		}

-		base += size;
 		enabled_mask &= ~(1 << index);
 	}

+	if (shader->constlen > shader->first_driver_param) {
+		uint32_t params = MIN2(4, shader->constlen - shader->first_driver_param);
+		/* emit ubos: */
+		OUT_PKT3(ring, CP_LOAD_STATE, 2 + params * 4);
+		OUT_RING(ring, CP_LOAD_STATE_0_DST_OFF(shader->first_driver_param * 2) |
+				CP_LOAD_STATE_0_STATE_SRC(SS_DIRECT) |
+				CP_LOAD_STATE_0_STATE_BLOCK(sb) |
+				CP_LOAD_STATE_0_NUM_UNIT(params * 2));
+		OUT_RING(ring, CP_LOAD_STATE_1_EXT_SRC_ADDR(0) |
+				CP_LOAD_STATE_1_STATE_TYPE(ST_CONSTANTS));
+
+		for (i = 1; i <= params * 4; i++) {
+			struct pipe_constant_buffer *cb = &constbuf->cb[i];
+			assert(!cb->user_buffer);
+
+			if ((enabled_mask & (1 << i)) && cb->buffer)
+				OUT_RELOC(ring, fd_resource(cb->buffer)->bo, cb->buffer_offset, 0, 0);
+			else
+				OUT_RING(ring, 0xbad00000 | ((i - 1) << 16));
+		}
+	}
+
 	/* emit shader immediates: */
-	if (shader) {
+	if (shader && emit_immediates) {
 		int size = shader->immediates_count;
-		base = shader->first_immediate;
+		uint32_t base = shader->first_immediate;

 		/* truncate size to avoid writing constants that shader
 		 * does not use:
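To make the new UBO block concrete: for a shader with first_driver_param = 10 and only UBO 1 bound, the packet it emits would look roughly like this (a sketch following the code above, not captured command-stream output; macro arguments abbreviated):

	OUT_PKT3 (ring, CP_LOAD_STATE, 2 + 16);      /* params = 4 vec4s -> 16 payload dwords */
	OUT_RING (ring, DST_OFF(20) | ... | NUM_UNIT(8)); /* vec4 10 = 2-dword unit 20; 8 vec2 units */
	OUT_RING (ring, EXT_SRC_ADDR(0) | ST_CONSTANTS);
	OUT_RELOC(ring, ubo1_bo, 0, 0, 0);           /* i = 1:  c10.x = GPU address of UBO 1 */
	OUT_RING (ring, 0xbad10000);                 /* i = 2:  unbound slot -> poison marker */
	...
	OUT_RING (ring, 0xbadf0000);                 /* i = 16: c13.w */

Each base address is a single dword, so consts c10..c13 hold up to 16 buffer pointers; these are the c[first_driver_param + n] values that the compiler-generated ldg sequences dereference.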
@@ -619,11 +635,11 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		fd_wfi(ctx, ring);
 		emit_constants(ring, SB_VERT_SHADER,
 				&ctx->constbuf[PIPE_SHADER_VERTEX],
-				(emit->prog->dirty & FD_SHADER_DIRTY_VP) ? vp : NULL);
+				vp, emit->prog->dirty & FD_SHADER_DIRTY_VP);

 		if (!emit->key.binning_pass) {
 			emit_constants(ring, SB_FRAG_SHADER,
 					&ctx->constbuf[PIPE_SHADER_FRAGMENT],
-					(emit->prog->dirty & FD_SHADER_DIRTY_FP) ? fp : NULL);
+					fp, emit->prog->dirty & FD_SHADER_DIRTY_FP);
 		}
 	}
@@ -635,8 +651,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 			0,
 			0
 		};

-		if (vp->constlen > vp->first_driver_param) {
-			fd3_emit_constant(ring, SB_VERT_SHADER, vp->first_driver_param * 4,
+		if (vp->constlen >= vp->first_driver_param + 4) {
+			fd3_emit_constant(ring, SB_VERT_SHADER,
+					(vp->first_driver_param + 4) * 4,
 					0, 4, vertex_params, NULL);
 		}
 	}
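Note the adjusted bounds check: the four vec4s starting at first_driver_param are now occupied by UBO base addresses, so vertex_params moves to vec4 slot first_driver_param + 4 (hence the dword offset (first_driver_param + 4) * 4), and it is only emitted when the shader's const file actually extends past the UBO pointer region. With first_driver_param = 10, for example, vertex_params lands at c14.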
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -356,9 +356,9 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 		 * split between VS and FS.  Use lower limit of 256 to
 		 * avoid getting into impossible situations:
 		 */
-		return ((is_a3xx(screen) || is_a4xx(screen)) ? 256 : 64) * sizeof(float[4]);
+		return ((is_a3xx(screen) || is_a4xx(screen)) ? 4096 : 64) * sizeof(float[4]);
 	case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-		return 1;
+		return is_a3xx(screen) ? 16 : 1;
 	case PIPE_SHADER_CAP_MAX_PREDS:
 		return 0; /* nothing uses this */
 	case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED:
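The new limits work out to 4096 vec4s x 16 bytes = 64 KiB per constant buffer, comfortably above the 16 KiB minimum GL requires for GL_MAX_UNIFORM_BLOCK_SIZE, and 16 binding points: cb0 stays reserved for user uniforms, leaving the remaining slots for UBOs, which is what the four-vec4 pointer table emitted in fd3_emit.c covers.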
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -487,7 +487,7 @@ static int emit_cat6(struct ir3_instruction *instr, void *ptr,
 	iassert(instr->regs_count >= 2);

-	if (instr->cat6.offset) {
+	if (instr->cat6.offset || instr->opc == OPC_LDG) {
 		instr_cat6a_t *cat6a = ptr;

 		cat6->has_off = true;
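The cat6 change matters because the new UBO lowering produces ldg (global load) instructions whose immediate byte offset can legitimately be zero (e.g. ubo[n][0].x ends up with cat6.offset == 0); without the opc check such a load would fall through to the offset-less cat6 form rather than the cat6a encoding that ldg apparently needs.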
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -151,6 +151,7 @@ static void vectorize(struct ir3_compile_context *ctx,
 static void create_mov(struct ir3_compile_context *ctx,
 		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
 static type_t get_ftype(struct ir3_compile_context *ctx);
+static type_t get_utype(struct ir3_compile_context *ctx);

 static unsigned setup_arrays(struct ir3_compile_context *ctx, unsigned file, unsigned i)
 {
@@ -252,7 +253,7 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
 	 * the assembler what the max addr reg value can be:
 	 */
 	if (info->indirect_files & FM(CONSTANT))
-		so->constlen = ctx->info.file_max[TGSI_FILE_CONSTANT] + 1;
+		so->constlen = MIN2(255, ctx->info.const_file_max[0] + 1);

 	i = 0;
 	i += setup_arrays(ctx, TGSI_FILE_INPUT, i);
@@ -261,12 +262,13 @@ compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
 	/* any others? we don't track arrays for const..*/

 	/* Immediates go after constants: */
-	if (so->type == SHADER_VERTEX) {
-		so->first_driver_param = info->file_max[TGSI_FILE_CONSTANT] + 1;
-		so->first_immediate = so->first_driver_param + 1;
-	} else {
-		so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
-	}
+	so->first_immediate = so->first_driver_param =
+			info->const_file_max[0] + 1;
+	/* 1 unit for the vertex id base */
+	if (so->type == SHADER_VERTEX)
+		so->first_immediate++;
+	/* 4 (vec4) units for ubo base addresses */
+	so->first_immediate += 4;
 	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

 	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
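Taken together, these reservations give the following vec4 const-file layout for a vertex shader (a sketch derived from the offsets in this patch; N = const_file_max[0], so first_driver_param = N + 1):

	c0   .. cN       user uniforms (cb0)
	cN+1 .. cN+4     UBO base addresses, ubo[1..16], one dword each
	cN+5             driver params (vertex id base); VS only
	cN+6 ..          shader immediates (first_immediate)

A fragment shader skips the driver-param slot, so its immediates start at cN+5.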
@@ -717,6 +719,80 @@ ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
 		reg->offset = regid(off, chan);
 		instr = array_fanin(ctx, aid, src->File);
+	} else if (src->File == TGSI_FILE_CONSTANT && src->Dimension) {
+		const struct tgsi_full_src_register *fsrc = (const void *)src;
+		struct ir3_instruction *temp = NULL;
+		int ubo_regid = regid(ctx->so->first_driver_param, 0) +
+				fsrc->Dimension.Index - 1;
+		int offset = 0;
+
+		/* We don't handle indirect UBO array accesses... yet. */
+		compile_assert(ctx, !fsrc->Dimension.Indirect);
+		/* UBOs start at index 1. */
+		compile_assert(ctx, fsrc->Dimension.Index > 0);
+
+		if (src->Indirect) {
+			/* In case of an indirect index, it will have been loaded into an
+			 * address register. There will be a sequence of
+			 *
+			 *   shl.b x, val, 2
+			 *   mova a0, x
+			 *
+			 * We rely on this sequence to get the original val out and shift
+			 * it by 4, since we're dealing in vec4 units.
+			 */
+			compile_assert(ctx, ctx->block->address);
+			compile_assert(ctx, ctx->block->address->regs[1]->instr->opc ==
+					OPC_SHL_B);
+
+			temp = instr = instr_create(ctx, 2, OPC_SHL_B);
+			ir3_reg_create(instr, 0, 0);
+			ir3_reg_create(instr, 0, IR3_REG_HALF | IR3_REG_SSA)->instr =
+					ctx->block->address->regs[1]->instr->regs[1]->instr;
+			ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
+		} else if (src->Index >= 64) {
+			/* Otherwise it's a plain index (in vec4 units). Move it into a
+			 * register.
+			 */
+			temp = instr = instr_create(ctx, 1, 0);
+			instr->cat1.src_type = get_utype(ctx);
+			instr->cat1.dst_type = get_utype(ctx);
+			ir3_reg_create(instr, 0, 0);
+			ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = src->Index * 16;
+		} else {
+			/* The offset is small enough to fit into the ldg instruction
+			 * directly.
+			 */
+			offset = src->Index * 16;
+		}
+
+		if (temp) {
+			/* If there was an offset (most common), add it to the buffer
+			 * address.
+			 */
+			instr = instr_create(ctx, 2, OPC_ADD_S);
+			ir3_reg_create(instr, 0, 0);
+			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
+			ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
+		} else {
+			/* Otherwise just load the buffer address directly */
+			instr = instr_create(ctx, 1, 0);
+			instr->cat1.src_type = get_utype(ctx);
+			instr->cat1.dst_type = get_utype(ctx);
+			ir3_reg_create(instr, 0, 0);
+			ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
+		}
+
+		temp = instr;
+
+		instr = instr_create(ctx, 6, OPC_LDG);
+		instr->cat6.type = TYPE_U32;
+		instr->cat6.offset = offset + chan * 4;
+		ir3_reg_create(instr, 0, 0);
+		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
+		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
+
+		reg->flags &= ~(IR3_REG_RELATIV | IR3_REG_CONST);
 	} else {
 		/* normal case (not relative addressed GPR) */
 		instr = ssa_instr_get(ctx, src->File, regid(src->Index, chan));
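For a concrete feel of what this lowering emits, an access like ubo[1].const[5].y with first_driver_param = 10 would come out roughly as (illustrative ir3 assembly, not actual compiler output):

	mov.u32u32 r0.x, c10.x          ; UBO 1 base address (direct, small index:
	ldg.u32    r0.y, g[r0.x], 1     ;   byte offset 5*16 + 1*4 = 84 goes in cat6.offset)

while an indirectly indexed access re-derives the byte offset from the address register's source and adds it to the base pointer:

	shl.b      r0.x, val, 4         ; vec4 index -> byte offset (x16)
	add.s      r0.x, r0.x, c10.x    ; add UBO base address
	ldg.u32    r0.y, g[r0.x], 1     ; cat6.offset = chan * 4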
@@ -3183,7 +3259,8 @@ decl_sv(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
 		instr->cat1.src_type = get_stype(ctx);
 		instr->cat1.dst_type = get_stype(ctx);
 		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, regid(so->first_driver_param, 0), IR3_REG_CONST);
+		ir3_reg_create(instr, regid(so->first_driver_param + 4, 0),
+				IR3_REG_CONST);
 		break;
 	case TGSI_SEMANTIC_INSTANCEID:
 		ctx->instance_id = instr = create_input(ctx->block, NULL, r);
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -175,7 +175,7 @@ static void legalize(struct ir3_legalize_ctx *ctx)
 		/* both tex/sfu appear to not always immediately consume
 		 * their src register(s):
 		 */
-		if (is_tex(n) || is_sfu(n)) {
+		if (is_tex(n) || is_sfu(n) || is_mem(n)) {
 			foreach_src(reg, n) {
 				if (reg_gpr(reg))
 					regmask_set(&needs_ss_war, reg);
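ldg falls in the is_mem() class, and like tex/sfu it appears not to consume its source registers immediately, so a later write to one of them needs the same (ss)-bit write-after-read protection; without this the new UBO loads could see their address register clobbered before the load issues.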
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -116,7 +116,7 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
 	 * the compiler (to worst-case value) since we don't know in
 	 * the assembler what the max addr reg value can be:
 	 */
-	v->constlen = MAX2(v->constlen, v->info.max_const + 1);
+	v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));

 	fixup_regfootprint(v);