diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index acd4d34f899..19522cc97b1 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -428,20 +428,6 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
 	}
 }
 
-static LLVMValueRef get_instance_index_for_fetch(
-	struct si_shader_context *ctx,
-	unsigned param_start_instance, LLVMValueRef divisor)
-{
-	LLVMValueRef result = ctx->abi.instance_id;
-
-	/* The division must be done before START_INSTANCE is added. */
-	if (divisor != ctx->i32_1)
-		result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
-	return LLVMBuildAdd(ctx->ac.builder, result,
-			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
 /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
  * to float. */
 static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
@@ -7302,22 +7288,32 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
 		bool divisor_is_fetched =
 			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
-		LLVMValueRef index;
+		LLVMValueRef index = NULL;
+
+		if (divisor_is_one) {
+			index = ctx->abi.instance_id;
+		} else if (divisor_is_fetched) {
+			LLVMValueRef udiv_factors[4];
+
+			for (unsigned j = 0; j < 4; j++) {
+				udiv_factors[j] =
+					buffer_load_const(ctx, instance_divisor_constbuf,
+							  LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+				udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
+			}
+			/* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+			 * Such InstanceID might not be achievable in a reasonable time though.
+			 */
+			index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+						       udiv_factors[0], udiv_factors[1],
+						       udiv_factors[2], udiv_factors[3]);
+		}
 
 		if (divisor_is_one || divisor_is_fetched) {
-			LLVMValueRef divisor = ctx->i32_1;
-
-			if (divisor_is_fetched) {
-				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
-							    LLVMConstInt(ctx->i32, i * 4, 0));
-				divisor = ac_to_integer(&ctx->ac, divisor);
-			}
-
-			/* InstanceID / Divisor + StartInstance */
-			index = get_instance_index_for_fetch(ctx,
-							     user_sgpr_base +
-							     SI_SGPR_START_INSTANCE,
-							     divisor);
+			/* Add StartInstance. */
+			index = LLVMBuildAdd(ctx->ac.builder, index,
+					     LLVMGetParam(ctx->main_fn, user_sgpr_base +
+							  SI_SGPR_START_INSTANCE), "");
 		} else {
 			/* VertexID + BaseVertex */
 			index = LLVMBuildAdd(ctx->ac.builder,
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 827d8495006..8e4cdddf0b9 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -32,6 +32,7 @@
 #include "util/u_memory.h"
 #include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
 
 static unsigned si_map_swizzle(unsigned swizzle)
 {
@@ -4372,6 +4373,34 @@ static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
  * Vertex elements & buffers
  */
 
+/* Fast-division factors narrowed to 32 bits per field, so that one entry is
+ * exactly 4 dwords. The array is uploaded as-is and the VS prolog loads the
+ * four dwords of attribute i starting at byte offset i*16.
+ */
+struct util_fast_udiv_info32 {
+	unsigned multiplier; /* the "magic number" multiplier */
+	unsigned pre_shift; /* shift for the dividend before multiplying */
+	unsigned post_shift; /* shift for the dividend after multiplying */
+	int increment; /* 0 or 1; if set then increment the numerator, using one of
+			  the two strategies */
+};
+
+/* Wrap util_compute_fast_udiv_info() and copy its fields into the 32-bit struct. */
+static struct util_fast_udiv_info32
+util_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
+{
+	struct util_fast_udiv_info info =
+		util_compute_fast_udiv_info(D, num_bits, 32);
+
+	struct util_fast_udiv_info32 result = {
+		info.multiplier,
+		info.pre_shift,
+		info.post_shift,
+		info.increment,
+	};
+	return result;
+}
+
 static void *si_create_vertex_elements(struct pipe_context *ctx,
 				       unsigned count,
 				       const struct pipe_vertex_element *elements)
@@ -4379,6 +4408,12 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 	struct si_screen *sscreen = (struct si_screen*)ctx->screen;
 	struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
 	bool used[SI_NUM_VERTEX_BUFFERS] = {};
+	struct util_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
+	STATIC_ASSERT(sizeof(struct util_fast_udiv_info32) == 16);
+	STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+	STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
 	int i;
 
 	assert(count <= SI_MAX_ATTRIBS);
@@ -4401,14 +4436,17 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 			return NULL;
 		}
 
-		if (elements[i].instance_divisor) {
+		unsigned instance_divisor = elements[i].instance_divisor;
+		if (instance_divisor) {
 			v->uses_instance_divisors = true;
-			v->instance_divisors[i] = elements[i].instance_divisor;
 
-			if (v->instance_divisors[i] == 1)
+			if (instance_divisor == 1) {
 				v->instance_divisor_is_one |= 1u << i;
-			else
+			} else {
 				v->instance_divisor_is_fetched |= 1u << i;
+				divisor_factors[i] =
+					util_compute_fast_udiv_info32(instance_divisor, 32);
+			}
 		}
 
 		if (!used[vbo_index]) {
@@ -4518,6 +4556,28 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 				   S_008F0C_NUM_FORMAT(num_format) |
 				   S_008F0C_DATA_FORMAT(data_format);
 	}
+
+	if (v->instance_divisor_is_fetched) {
+		unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+		v->instance_divisor_factor_buffer =
+			(struct r600_resource*)
+			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   num_divisors * sizeof(divisor_factors[0]));
+		if (!v->instance_divisor_factor_buffer) {
+			FREE(v);
+			return NULL;
+		}
+		void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
+						    NULL, PIPE_TRANSFER_WRITE);
+		if (!map) {
+			/* Don't memcpy through a failed mapping; release the buffer. */
+			r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
+			FREE(v);
+			return NULL;
+		}
+		memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+	}
 	return v;
 }
 
@@ -4541,10 +4601,10 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 	if (v && v->instance_divisor_is_fetched) {
 		struct pipe_constant_buffer cb;
 
-		cb.buffer = NULL;
-		cb.user_buffer = v->instance_divisors;
+		cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+		cb.user_buffer = NULL;
 		cb.buffer_offset = 0;
-		cb.buffer_size = sizeof(uint32_t) * v->count;
+		cb.buffer_size = 0xffffffff;
 		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
 	}
 }
@@ -4552,9 +4612,13 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
+	struct si_vertex_elements *v = (struct si_vertex_elements*)state;
 
 	if (sctx->vertex_elements == state)
 		sctx->vertex_elements = NULL;
+	/* Tolerate a NULL state, as this function did before it dereferenced v. */
+	if (v)
+		r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
 	FREE(state);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 16fd223d00a..f52296d1119 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -132,7 +132,7 @@ struct si_stencil_ref {
 
 struct si_vertex_elements
 {
-	uint32_t			instance_divisors[SI_MAX_ATTRIBS];
+	struct r600_resource		*instance_divisor_factor_buffer;
 	uint32_t			rsrc_word3[SI_MAX_ATTRIBS];
 	uint16_t			src_offset[SI_MAX_ATTRIBS];
 	uint8_t				fix_fetch[SI_MAX_ATTRIBS];