diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c index 9838f062de5..ef6ae9efa08 100644 --- a/src/compiler/nir/nir_opt_load_store_vectorize.c +++ b/src/compiler/nir/nir_opt_load_store_vectorize.c @@ -56,7 +56,7 @@ struct intrinsic_info { nir_variable_mode mode; /* 0 if the mode is obtained from the deref. */ nir_intrinsic_op op; - bool is_atomic; + bool is_unvectorizable; /* Indices into nir_intrinsic::src[] or -1 if not applicable. */ int resource_src; /* resource (e.g. from vulkan_resource_index) */ int base_src; /* offset which it loads/stores from */ @@ -71,9 +71,9 @@ static const struct intrinsic_info * get_info(nir_intrinsic_op op) { switch (op) { -#define INFO(mode, op, atomic, res, base, deref, val, scale) \ +#define INFO(mode, op, unvectorizable, res, base, deref, val, scale) \ case nir_intrinsic_##op: { \ - static const struct intrinsic_info op##_info = { mode, nir_intrinsic_##op, atomic, res, base, deref, val, scale }; \ + static const struct intrinsic_info op##_info = { mode, nir_intrinsic_##op, unvectorizable, res, base, deref, val, scale }; \ return &op##_info; \ } #define LOAD(mode, op, res, base, deref, scale) INFO(mode, load_##op, false, res, base, deref, -1, scale) @@ -90,6 +90,8 @@ get_info(nir_intrinsic_op op) STORE(0, deref, -1, -1, 0, 1, 1) LOAD(nir_var_mem_shared, shared, -1, 0, -1, 1) STORE(nir_var_mem_shared, shared, -1, 1, -1, 0, 1) + INFO(nir_var_mem_shared, load_shared2_amd, true, -1, 0, -1, -1, 1); + INFO(nir_var_mem_shared, store_shared2_amd, true, -1, 1, -1, 0, 1) LOAD(nir_var_mem_global, global, -1, 0, -1, 1) STORE(nir_var_mem_global, global, -1, 1, -1, 0, 1) LOAD(nir_var_mem_global, global_constant, -1, 0, -1, 1) @@ -594,6 +596,9 @@ create_entry(void *mem_ctx, const struct intrinsic_info *info, nir_intrinsic_instr *intrin) { + bool is_shared2 = intrin->intrinsic == nir_intrinsic_load_shared2_amd || + intrin->intrinsic == nir_intrinsic_store_shared2_amd; + struct entry *entry = rzalloc(mem_ctx, struct entry); entry->intrin = intrin; entry->instr = &intrin->instr; @@ -601,6 +606,8 @@ create_entry(void *mem_ctx, entry->is_store = entry->info->value_src >= 0; entry->num_components = entry->is_store ? intrin->num_components : nir_def_last_component_read(&intrin->def) + 1; + if (is_shared2) + entry->num_components = 1; if (entry->info->deref_src >= 0) { entry->deref = nir_src_as_deref(intrin->src[entry->info->deref_src]); @@ -1035,11 +1042,37 @@ bindings_different_restrict(nir_shader *shader, struct entry *a, struct entry *b } static int64_t -compare_entries(struct entry *a, struct entry *b) +may_alias_internal(struct entry *a, struct entry *b, uint32_t a_offset, uint32_t b_offset) { + /* use adjacency information */ + /* TODO: we can look closer at the entry keys */ if (!entry_key_equals(a->key, b->key)) - return INT64_MAX; - return b->offset_signed - a->offset_signed; + return true; + + int64_t diff = (b->offset_signed + b_offset) - (a->offset_signed + a_offset); + + /* with atomics, nir_intrinsic_instr::num_components can be 0 */ + if (diff < 0) + return llabs(diff) < MAX2(b->num_components, 1u) * (get_bit_size(b) / 8u); + else + return diff < MAX2(a->num_components, 1u) * (get_bit_size(a) / 8u); +} + +static unsigned +parse_shared2_offsets(struct entry *entry, uint32_t offsets[2]) +{ + if (entry->intrin->intrinsic != nir_intrinsic_load_shared2_amd && + entry->intrin->intrinsic != nir_intrinsic_store_shared2_amd) { + offsets[0] = 0; + return 1; + } + + uint32_t stride = get_bit_size(entry) / 8u; + if (nir_intrinsic_st64(entry->intrin)) + stride *= 64; + offsets[0] = nir_intrinsic_offset0(entry->intrin) * stride; + offsets[1] = nir_intrinsic_offset1(entry->intrin) * stride; + return 2; } static bool @@ -1078,20 +1111,19 @@ may_alias(nir_shader *shader, struct entry *a, struct entry *b) return true; } - /* use adjacency information */ - /* TODO: we can look closer at the entry keys */ - int64_t diff = compare_entries(a, b); - if (diff != INT64_MAX) { - /* with atomics, nir_intrinsic_instr::num_components can be 0 */ - if (diff < 0) - return llabs(diff) < MAX2(b->num_components, 1u) * (get_bit_size(b) / 8u); - else - return diff < MAX2(a->num_components, 1u) * (get_bit_size(a) / 8u); + uint32_t a_offsets[2], b_offsets[2] = { 0, 0 }; + unsigned a_count = parse_shared2_offsets(a, a_offsets); + unsigned b_count = parse_shared2_offsets(b, b_offsets); + for (unsigned i = 0; i < a_count; i++) { + for (unsigned j = 0; j < b_count; j++) { + if (may_alias_internal(a, b, a_offsets[i], b_offsets[j])) + return true; + } } /* TODO: we can use deref information */ - return true; + return false; } static bool @@ -1223,7 +1255,7 @@ can_vectorize(struct vectorize_ctx *ctx, struct entry *first, struct entry *seco /* we can only vectorize non-volatile loads/stores of the same type and with * the same access */ if (first->info != second->info || first->access != second->access || - (first->access & ACCESS_VOLATILE) || first->info->is_atomic) + (first->access & ACCESS_VOLATILE) || first->info->is_unvectorizable) return false; if (first->intrin->intrinsic == nir_intrinsic_load_buffer_amd ||