diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 026f715edca..33012d4bb01 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -866,6 +866,14 @@ load("output_u8_as_fp16_pan", 0, [], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { sampler_index }
 load("sampler_lod_parameters_pan", 1, [CAN_ELIMINATE, CAN_REORDER])
 
+# R600 specific intrinsics
+#
+# R600 can only fetch 16 byte aligned data from an UBO, and the actual offset
+# is given in vec4 units, so we have to fetch a vec4 and get the component
+# later
+# src[] = { buffer_index, offset }.
+load("ubo_r600", 2, [ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # V3D-specific instrinc for tile buffer color reads.
 #
 # The hardware requires that we read the samples and components of a pixel
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 38e32e4ce3d..e0da743144a 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -531,6 +531,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 
    case nir_intrinsic_load_deref: {
       nir_deref_instr *src = nir_src_as_deref(instr->src[0]);
+      assert(src);
       validate_assert(state, glsl_type_is_vector_or_scalar(src->type) ||
                       (src->mode == nir_var_uniform &&
                        glsl_get_base_type(src->type) == GLSL_TYPE_SUBROUTINE));
@@ -545,6 +546,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 
    case nir_intrinsic_store_deref: {
       nir_deref_instr *dst = nir_src_as_deref(instr->src[0]);
+      assert(dst);
       validate_assert(state, glsl_type_is_vector_or_scalar(dst->type));
       validate_assert(state, instr->num_components ==
                              glsl_get_vector_elements(dst->type));
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 206435528a6..ff7bee26b4f 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -386,12 +386,130 @@ bool r600_lower_scratch_addresses(nir_shader *shader)
    return progress;
 }
 
+/* Lower a load_ubo intrinsic to load_ubo_r600.
+ *
+ * When both the buffer index and the offset are constant, the byte offset is
+ * passed through unchanged and the load keeps its component count; the
+ * component is then decoded in the backend. Otherwise the offset is shifted
+ * right by four (converted to vec4 units), a full vec4 is loaded, and the
+ * requested components are selected here based on bits 2-3 of the original
+ * byte offset.
+ *
+ * Returns the SSA value that replaces the original load.
+ */
+static nir_ssa_def *
+r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
+{
+   b->cursor = nir_before_instr(instr);
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   assert(op->intrinsic == nir_intrinsic_load_ubo);
+
+   bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
+
+   nir_ssa_def *offset = op->src[1].ssa;
+
+   /* This is ugly: With const addressing we can actually set a proper fetch target mask,
+    * but for this we need the component encoded, so we don't shift and do the decoding in
+    * the backend. Otherwise we shift by four and resolve the component here
+    * (TODO: encode the start component in the intrinsic when the offset base is non-constant
+    * but a multiple of 16) */
+
+   nir_ssa_def *new_offset = offset;
+   if (!const_address)
+      new_offset = nir_ishr(b, offset, nir_imm_int(b, 4));
+
+   nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
+   load->num_components = const_address ? op->num_components : 4;
+   load->src[0] = op->src[0];
+   load->src[1] = nir_src_for_ssa(new_offset);
+   /* Copy align_mul/align_offset verbatim; nir_intrinsic_align() returns the
+    * combined alignment and not align_mul. */
+   nir_intrinsic_set_align(load, nir_intrinsic_align_mul(op), nir_intrinsic_align_offset(op));
+
+   nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+
+   /* when four components are loaded or both the offset and the location
+    * are constant, then the backend can deal with it better */
+   if (op->num_components == 4 || const_address)
+      return &load->dest.ssa;
+
+   /* What comes below is a performance disaster when the offset is not constant
+    * because then we have to assume that any component can be the first one and we
+    * have to pick the result manually. */
+   nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset, nir_imm_int(b, 2)),
+                                      nir_imm_int(b,3));
+
+   const unsigned swz_000[4] = {0, 0, 0, 0};
+   nir_ssa_def *component_select = nir_ieq(b, nir_imm_ivec4(b, 0, 1, 2, 3),
+                                           nir_swizzle(b, first_comp, swz_000, 4));
+
+   const unsigned szw_0[1] = {0};
+   const unsigned szw_1[1] = {1};
+   const unsigned szw_2[1] = {2};
+
+   if (op->num_components == 1) {
+      const unsigned szw_3[1] = {3};
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_3, 1));
+      nir_ssa_def *check1 = nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_1, 1),
+                                      check0);
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_2, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_2, 1),
+                       check1);
+   } else if (op->num_components == 2) {
+      const unsigned szw_01[2] = {0, 1};
+      const unsigned szw_12[2] = {1, 2};
+      const unsigned szw_23[2] = {2, 3};
+
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_01, 2),
+                                      nir_swizzle(b, &load->dest.ssa, szw_23, 2));
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_12, 2),
+                       check0);
+   } else {
+      /* Three components: only start components 0 and 1 are possible
+       * within one vec4, so a single select is enough. */
+      const unsigned szw_012[3] = {0, 1, 2};
+      const unsigned szw_123[3] = {1, 2, 3};
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_012, 3),
+                       nir_swizzle(b, &load->dest.ssa, szw_123, 3));
+   }
+}
+
+/* Instruction filter: match only load_ubo intrinsics. */
+bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   return op->intrinsic == nir_intrinsic_load_ubo;
+}
+
+
+/* Run the load_ubo -> load_ubo_r600 lowering over the whole shader and
+ * return the result reported by nir_shader_lower_instructions. */
+bool r600_lower_ubo_to_align16(nir_shader *shader)
+{
+   return nir_shader_lower_instructions(shader,
+                                        r600_lower_ubo_to_align16_filter,
+                                        r600_lower_ubo_to_align16_impl,
+                                        nullptr);
+}
+
 }
 
 using r600::r600_nir_lower_int_tg4;
 using r600::r600_nir_lower_pack_unpack_2x16;
 using r600::r600_lower_scratch_addresses;
 using r600::r600_lower_fs_out_to_vector;
+using r600::r600_lower_ubo_to_align16;
 
 int
 r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
@@ -512,7 +630,10 @@ int r600_shader_from_nir(struct r600_context *rctx,
 
    const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
    bool optimize = func->impl->registers.length() == 0 && !has_saturate(func);
-
+   if (optimize) {
+      optimize_once(sel->nir);
+      NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
+   }
    /* It seems the output of this optimization is cached somewhere, and
     * when there are registers, then we can no longer copy propagate, so
     * skip the optimization then. (There is probably a better way, but yeah)
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
index 96c8e804e2b..0ee6bbd6313 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
@@ -447,7 +447,7 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if:
       return emit_discard_if(instr);
-   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_r600:
       return emit_load_ubo(instr);
    case nir_intrinsic_copy_deref:
    case nir_intrinsic_load_constant: