diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index 9a30da63a1a..9591a1a497f 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1259,25 +1259,9 @@ intrinsic("copy_ubo_to_uniform_ir3", [1, 1], indices=[BASE, RANGE]) # DXIL specific intrinsics # src[] = { value, mask, index, offset }. intrinsic("store_ssbo_masked_dxil", [1, 1, 1, 1]) -# src[] = { value, index }. -intrinsic("store_shared_dxil", [1, 1]) -# src[] = { value, mask, index }. -intrinsic("store_shared_masked_dxil", [1, 1, 1]) -# src[] = { value, index }. -intrinsic("store_scratch_dxil", [1, 1]) -# src[] = { index }. -load("shared_dxil", [1], [], [CAN_ELIMINATE]) -# src[] = { index }. -load("scratch_dxil", [1], [], [CAN_ELIMINATE]) # src[] = { index, 16-byte-based-offset } load("ubo_dxil", [1, 1], [], [CAN_ELIMINATE, CAN_REORDER]) -# DXIL Shared atomic intrinsics -# -# src0 is the index in the i32 array for by the shared memory region -intrinsic("shared_atomic_dxil", src_comp=[1, 1], dest_comp=1, indices=[ATOMIC_OP]) -intrinsic("shared_atomic_swap_dxil", src_comp=[1, 1, 1], dest_comp=1, indices=[ATOMIC_OP]) - # Intrinsics used by the Midgard/Bifrost blend pipeline. These are defined # within a blend shader to read/write the raw value from the tile buffer, # without applying any format conversion in the process. If the shader needs diff --git a/src/gallium/drivers/d3d12/d3d12_compiler.cpp b/src/gallium/drivers/d3d12/d3d12_compiler.cpp index befe646eb9c..34d650e2936 100644 --- a/src/gallium/drivers/d3d12/d3d12_compiler.cpp +++ b/src/gallium/drivers/d3d12/d3d12_compiler.cpp @@ -144,7 +144,6 @@ compile_nir(struct d3d12_context *ctx, struct d3d12_shader_selector *sel, NIR_PASS_V(nir, d3d12_lower_state_vars, shader); const struct dxil_nir_lower_loads_stores_options loads_stores_options = {}; NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options); - NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil); NIR_PASS_V(nir, dxil_nir_lower_double_math); if (key->stage == PIPE_SHADER_FRAGMENT && key->fs.multisample_disabled) diff --git a/src/microsoft/clc/clc_compiler.c b/src/microsoft/clc/clc_compiler.c index f29f6cdbc54..fc73b2e730b 100644 --- a/src/microsoft/clc/clc_compiler.c +++ b/src/microsoft/clc/clc_compiler.c @@ -976,30 +976,8 @@ clc_spirv_to_dxil(struct clc_libclc *lib, const struct dxil_nir_lower_loads_stores_options loads_stores_options = { .use_16bit_ssbo = false, }; - NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options); - NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs); - NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil); - NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_all); - NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL); - - // Convert pack to pack_split - NIR_PASS_V(nir, nir_lower_pack); - // Lower pack_split to bit math - NIR_PASS_V(nir, nir_opt_algebraic); - - NIR_PASS_V(nir, nir_opt_dce); - - nir_validate_shader(nir, "Validate before feeding NIR to the DXIL compiler"); - struct nir_to_dxil_options opts = { - .interpolate_at_vertex = false, - .lower_int16 = (conf && (conf->lower_bit_size & 16) != 0), - .disable_math_refactoring = true, - .num_kernel_globals = num_global_inputs, - .environment = DXIL_ENVIRONMENT_CL, - .shader_model_max = conf && conf->max_shader_model ? conf->max_shader_model : SHADER_MODEL_6_2, - .validator_version_max = conf ? conf->validator_version : DXIL_VALIDATOR_1_4, - }; - + + /* Now that function-declared local vars have been sized, append args */ for (unsigned i = 0; i < out_dxil->kernel->num_args; i++) { if (out_dxil->kernel->args[i].address_qualifier != CLC_KERNEL_ARG_ADDRESS_LOCAL) continue; @@ -1026,6 +1004,29 @@ clc_spirv_to_dxil(struct clc_libclc *lib, nir->info.shared_size += size; } + NIR_PASS_V(nir, dxil_nir_lower_loads_stores_to_dxil, &loads_stores_options); + NIR_PASS_V(nir, dxil_nir_opt_alu_deref_srcs); + NIR_PASS_V(nir, nir_lower_fp16_casts, nir_lower_fp16_all); + NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL); + + // Convert pack to pack_split + NIR_PASS_V(nir, nir_lower_pack); + // Lower pack_split to bit math + NIR_PASS_V(nir, nir_opt_algebraic); + + NIR_PASS_V(nir, nir_opt_dce); + + nir_validate_shader(nir, "Validate before feeding NIR to the DXIL compiler"); + struct nir_to_dxil_options opts = { + .interpolate_at_vertex = false, + .lower_int16 = (conf && (conf->lower_bit_size & 16) != 0), + .disable_math_refactoring = true, + .num_kernel_globals = num_global_inputs, + .environment = DXIL_ENVIRONMENT_CL, + .shader_model_max = conf && conf->max_shader_model ? conf->max_shader_model : SHADER_MODEL_6_2, + .validator_version_max = conf ? conf->validator_version : DXIL_VALIDATOR_1_4, + }; + metadata->local_mem_size = nir->info.shared_size; metadata->priv_mem_size = nir->scratch_size; diff --git a/src/microsoft/compiler/dxil_module.c b/src/microsoft/compiler/dxil_module.c index 39849b001de..8b13c04c24a 100644 --- a/src/microsoft/compiler/dxil_module.c +++ b/src/microsoft/compiler/dxil_module.c @@ -3372,11 +3372,10 @@ dxil_emit_extractval(struct dxil_module *m, const struct dxil_value *src, const struct dxil_value * dxil_emit_alloca(struct dxil_module *m, const struct dxil_type *alloc_type, - const struct dxil_type *size_type, const struct dxil_value *size, unsigned int align) { - assert(size_type && size_type->type == TYPE_INTEGER); + assert(size->type->type == TYPE_INTEGER); const struct dxil_type *return_type = dxil_module_get_pointer_type(m, alloc_type); @@ -3388,7 +3387,7 @@ dxil_emit_alloca(struct dxil_module *m, const struct dxil_type *alloc_type, return NULL; instr->alloca.alloc_type = alloc_type; - instr->alloca.size_type = size_type; + instr->alloca.size_type = size->type; instr->alloca.size = size; instr->alloca.align = util_logbase2(align) + 1; assert(instr->alloca.align < (1 << 5)); diff --git a/src/microsoft/compiler/dxil_module.h b/src/microsoft/compiler/dxil_module.h index 7d34458be0d..f07603dfafe 100644 --- a/src/microsoft/compiler/dxil_module.h +++ b/src/microsoft/compiler/dxil_module.h @@ -542,7 +542,6 @@ dxil_emit_ret_void(struct dxil_module *m); const struct dxil_value * dxil_emit_alloca(struct dxil_module *m, const struct dxil_type *alloc_type, - const struct dxil_type *size_type, const struct dxil_value *size, unsigned int align); diff --git a/src/microsoft/compiler/dxil_nir.c b/src/microsoft/compiler/dxil_nir.c index 36e0a3d483a..454b80aa0d5 100644 --- a/src/microsoft/compiler/dxil_nir.c +++ b/src/microsoft/compiler/dxil_nir.c @@ -332,23 +332,8 @@ lower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr, unsigned min_bit_siz return true; } -static void -lower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op) -{ - for (unsigned i = 0; i < num_comps; i++) { - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(b->shader, op); - - load->num_components = 1; - load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); - nir_ssa_dest_init(&load->instr, &load->dest, 1, 32); - nir_builder_instr_insert(b, &load->instr); - comps[i] = &load->dest.ssa; - } -} - static bool -lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) +lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { assert(intr->dest.is_ssa); unsigned bit_size = nir_dest_bit_size(intr->dest); @@ -356,17 +341,13 @@ lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) unsigned num_bits = num_components * bit_size; b->cursor = nir_before_instr(&intr->instr); - nir_intrinsic_op op = intr->intrinsic; assert(intr->src[0].is_ssa); nir_ssa_def *offset = intr->src[0].ssa; - if (op == nir_intrinsic_load_shared) { + if (intr->intrinsic == nir_intrinsic_load_shared) offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); - op = nir_intrinsic_load_shared_dxil; - } else { + else offset = nir_u2u32(b, offset); - op = nir_intrinsic_load_scratch_dxil; - } nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2]; @@ -375,7 +356,8 @@ lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) * is an i32 array and DXIL does not support type casts. */ unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32); - lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op); + for (unsigned i = 0; i < num_32bit_comps; i++) + comps_32bit[i] = nir_load_array_var(b, var, nir_iadd_imm(b, index, i)); unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4); for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) { @@ -407,23 +389,9 @@ lower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) return true; } -static void -lower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op) -{ - for (unsigned i = 0; i < vec32->num_components; i++) { - nir_intrinsic_instr *store = - nir_intrinsic_instr_create(b->shader, op); - - store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i)); - store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); - store->num_components = 1; - nir_builder_instr_insert(b, &store->instr); - } -} - static void lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index, - nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op, unsigned alignment) + nir_ssa_def *vec32, unsigned num_bits, nir_variable *var, unsigned alignment) { nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1); @@ -436,24 +404,26 @@ lower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index mask = nir_ishl(b, mask, shift); } - if (op == nir_intrinsic_store_shared_dxil) { + if (var->data.mode == nir_var_mem_shared) { /* Use the dedicated masked intrinsic */ - nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index); + nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index); + nir_build_deref_atomic(b, 32, &deref->dest.ssa, nir_inot(b, mask), .atomic_op = nir_atomic_op_iand); + nir_build_deref_atomic(b, 32, &deref->dest.ssa, vec32, .atomic_op = nir_atomic_op_ior); } else { /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */ - nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index); + nir_ssa_def *load = nir_load_array_var(b, var, index); nir_ssa_def *new_val = nir_ior(b, vec32, nir_iand(b, nir_inot(b, mask), load)); - lower_store_vec32(b, index, new_val, op); + nir_store_array_var(b, var, index, new_val, 1); } } static bool -lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) +lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) { assert(intr->src[0].is_ssa); unsigned num_components = nir_src_num_components(intr->src[0]); @@ -461,16 +431,12 @@ lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) unsigned num_bits = num_components * bit_size; b->cursor = nir_before_instr(&intr->instr); - nir_intrinsic_op op = intr->intrinsic; nir_ssa_def *offset = intr->src[1].ssa; - if (op == nir_intrinsic_store_shared) { + if (intr->intrinsic == nir_intrinsic_store_shared) offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); - op = nir_intrinsic_store_shared_dxil; - } else { + else offset = nir_u2u32(b, offset); - op = nir_intrinsic_store_scratch_dxil; - } nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; unsigned comp_idx = 0; @@ -489,9 +455,10 @@ lower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) /* For anything less than 32bits we need to use the masked version of the * intrinsic to preserve data living in the same 32bit slot. */ if (substore_num_bits < 32) { - lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op, nir_intrinsic_align(intr)); + lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, var, nir_intrinsic_align(intr)); } else { - lower_store_vec32(b, index, vec32, op); + for (unsigned i = 0; i < vec32->num_components; ++i) + nir_store_array_var(b, var, nir_iadd_imm(b, index, i), nir_channel(b, vec32, i), 1); } comp_idx += substore_num_bits / bit_size; @@ -941,20 +908,60 @@ lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr) return true; } +static bool +lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr, nir_variable *var) +{ + b->cursor = nir_before_instr(&intr->instr); + + assert(intr->src[0].is_ssa); + nir_ssa_def *offset = + nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr))); + nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); + + nir_deref_instr *deref = nir_build_deref_array(b, nir_build_deref_var(b, var), index); + nir_ssa_def *result; + if (intr->intrinsic == nir_intrinsic_shared_atomic_swap) + result = nir_build_deref_atomic_swap(b, 32, &deref->dest.ssa, intr->src[1].ssa, intr->src[2].ssa, + .atomic_op = nir_intrinsic_atomic_op(intr)); + else + result = nir_build_deref_atomic(b, 32, &deref->dest.ssa, intr->src[1].ssa, + .atomic_op = nir_intrinsic_atomic_op(intr)); + + nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); + nir_instr_remove(&intr->instr); + return true; +} + bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, const struct dxil_nir_lower_loads_stores_options *options) { - bool progress = false; + bool progress = nir_remove_dead_variables(nir, nir_var_function_temp | nir_var_mem_shared, NULL); + nir_variable *shared_var = NULL; + if (nir->info.shared_size) { + shared_var = nir_variable_create(nir, nir_var_mem_shared, + glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->info.shared_size, 4), 4), + "lowered_shared_mem"); + } - foreach_list_typed(nir_function, func, node, &nir->functions) { - if (!func->is_entrypoint) + unsigned ptr_size = nir->info.cs.ptr_size; + if (nir->info.stage == MESA_SHADER_KERNEL) { + /* All the derefs created here will be used as GEP indices so force 32-bit */ + nir->info.cs.ptr_size = 32; + } + nir_foreach_function(func, nir) { + if (!func->impl) continue; - assert(func->impl); nir_builder b; nir_builder_init(&b, func->impl); + nir_variable *scratch_var = NULL; + if (nir->scratch_size) { + const struct glsl_type *scratch_type = glsl_array_type(glsl_uint_type(), DIV_ROUND_UP(nir->scratch_size, 4), 4); + scratch_var = nir_local_variable_create(func->impl, scratch_type, "lowered_scratch_mem"); + } + nir_foreach_block(block, func->impl) { nir_foreach_instr_safe(instr, block) { if (instr->type != nir_instr_type_intrinsic) @@ -963,8 +970,10 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, switch (intr->intrinsic) { case nir_intrinsic_load_shared: + progress |= lower_32b_offset_load(&b, intr, shared_var); + break; case nir_intrinsic_load_scratch: - progress |= lower_32b_offset_load(&b, intr); + progress |= lower_32b_offset_load(&b, intr, scratch_var); break; case nir_intrinsic_load_ssbo: progress |= lower_load_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32); @@ -973,75 +982,17 @@ dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir, progress |= lower_load_ubo(&b, intr); break; case nir_intrinsic_store_shared: + progress |= lower_32b_offset_store(&b, intr, shared_var); + break; case nir_intrinsic_store_scratch: - progress |= lower_32b_offset_store(&b, intr); + progress |= lower_32b_offset_store(&b, intr, scratch_var); break; case nir_intrinsic_store_ssbo: progress |= lower_store_ssbo(&b, intr, options->use_16bit_ssbo ? 16 : 32); break; - default: - break; - } - } - } - } - - return progress; -} - -static bool -lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr) -{ - b->cursor = nir_before_instr(&intr->instr); - - assert(intr->src[0].is_ssa); - nir_ssa_def *offset = - nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr))); - nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); - - nir_intrinsic_op dxil_op = intr->intrinsic == nir_intrinsic_shared_atomic_swap ? - nir_intrinsic_shared_atomic_swap_dxil : nir_intrinsic_shared_atomic_dxil; - nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op); - atomic->src[0] = nir_src_for_ssa(index); - assert(intr->src[1].is_ssa); - atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa); - if (dxil_op == nir_intrinsic_shared_atomic_swap_dxil) { - assert(intr->src[2].is_ssa); - atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa); - } - atomic->num_components = 0; - nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32); - nir_intrinsic_set_atomic_op(atomic, nir_intrinsic_atomic_op(intr)); - - nir_builder_instr_insert(b, &atomic->instr); - nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa); - nir_instr_remove(&intr->instr); - return true; -} - -bool -dxil_nir_lower_atomics_to_dxil(nir_shader *nir) -{ - bool progress = false; - - foreach_list_typed(nir_function, func, node, &nir->functions) { - if (!func->is_entrypoint) - continue; - assert(func->impl); - - nir_builder b; - nir_builder_init(&b, func->impl); - - nir_foreach_block(block, func->impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); - - switch (intr->intrinsic) { case nir_intrinsic_shared_atomic: case nir_intrinsic_shared_atomic_swap: - progress |= lower_shared_atomic(&b, intr); + progress |= lower_shared_atomic(&b, intr, shared_var); break; default: break; @@ -1049,6 +1000,9 @@ dxil_nir_lower_atomics_to_dxil(nir_shader *nir) } } } + if (nir->info.stage == MESA_SHADER_KERNEL) { + nir->info.cs.ptr_size = ptr_size; + } return progress; } diff --git a/src/microsoft/compiler/dxil_nir.h b/src/microsoft/compiler/dxil_nir.h index 44789fdade1..03651db0bc0 100644 --- a/src/microsoft/compiler/dxil_nir.h +++ b/src/microsoft/compiler/dxil_nir.h @@ -45,7 +45,6 @@ struct dxil_nir_lower_loads_stores_options { }; bool dxil_nir_lower_loads_stores_to_dxil(nir_shader *shader, const struct dxil_nir_lower_loads_stores_options *options); -bool dxil_nir_lower_atomics_to_dxil(nir_shader *shader); bool dxil_nir_lower_deref_ssbo(nir_shader *shader); bool dxil_nir_opt_alu_deref_srcs(nir_shader *shader); bool dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size); diff --git a/src/microsoft/compiler/nir_to_dxil.c b/src/microsoft/compiler/nir_to_dxil.c index ea92c6fa4c0..204e40345de 100644 --- a/src/microsoft/compiler/nir_to_dxil.c +++ b/src/microsoft/compiler/nir_to_dxil.c @@ -153,7 +153,7 @@ nir_options = { nir_lower_dceil | nir_lower_dround_even, .max_unroll_iterations = 32, /* arbitrary */ - .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), + .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out), .lower_device_index_to_zero = true, .linker_ignore_precision = true, .support_16bit_alu = true, @@ -582,8 +582,8 @@ struct ntd_context { unsigned num_defs; struct hash_table *phis; - const struct dxil_value *sharedvars; - const struct dxil_value *scratchvars; + const struct dxil_value **sharedvars; + const struct dxil_value **scratchvars; const struct dxil_value **consts; nir_variable *ps_front_face; @@ -1529,6 +1529,31 @@ emit_global_consts(struct ntd_context *ctx) return true; } +static bool +emit_shared_vars(struct ntd_context *ctx) +{ + uint32_t index = 0; + nir_foreach_variable_with_modes(var, ctx->shader, nir_var_mem_shared) + var->data.driver_location = index++; + + ctx->sharedvars = ralloc_array(ctx->ralloc_ctx, const struct dxil_value *, index); + + nir_foreach_variable_with_modes(var, ctx->shader, nir_var_mem_shared) { + if (!var->name) + var->name = ralloc_asprintf(var, "shared_%d", var->data.driver_location); + const struct dxil_value *gvar = dxil_add_global_ptr_var(&ctx->mod, var->name, + get_type_for_glsl_type(&ctx->mod, var->type), + DXIL_AS_GROUPSHARED, 16, + NULL); + if (!gvar) + return false; + + ctx->sharedvars[var->data.driver_location] = gvar; + } + + return true; +} + static bool emit_cbv(struct ntd_context *ctx, unsigned binding, unsigned space, unsigned size, unsigned count, char *name) @@ -3531,92 +3556,6 @@ emit_store_ssbo_masked(struct ntd_context *ctx, nir_intrinsic_instr *intr) emit_atomic_binop(ctx, handle, DXIL_ATOMIC_OR, coord, value) != NULL; } -static bool -emit_store_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr) -{ - const struct dxil_value *zero, *index; - - /* All shared mem accesses should have been lowered to scalar 32bit - * accesses. - */ - assert(nir_src_bit_size(intr->src[0]) == 32); - assert(nir_src_num_components(intr->src[0]) == 1); - - zero = dxil_module_get_int32_const(&ctx->mod, 0); - if (!zero) - return false; - - if (intr->intrinsic == nir_intrinsic_store_shared_dxil) - index = get_src(ctx, &intr->src[1], 0, nir_type_uint); - else - index = get_src(ctx, &intr->src[2], 0, nir_type_uint); - if (!index) - return false; - - const struct dxil_value *ops[] = { ctx->sharedvars, zero, index }; - const struct dxil_value *ptr, *value; - - ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); - if (!ptr) - return false; - - value = get_src(ctx, &intr->src[0], 0, nir_type_uint); - if (!value) - return false; - - if (intr->intrinsic == nir_intrinsic_store_shared_dxil) - return dxil_emit_store(&ctx->mod, value, ptr, 4, false); - - const struct dxil_value *mask = get_src(ctx, &intr->src[1], 0, nir_type_uint); - if (!mask) - return false; - - if (!dxil_emit_atomicrmw(&ctx->mod, mask, ptr, DXIL_RMWOP_AND, false, - DXIL_ATOMIC_ORDERING_ACQREL, - DXIL_SYNC_SCOPE_CROSSTHREAD)) - return false; - - if (!dxil_emit_atomicrmw(&ctx->mod, value, ptr, DXIL_RMWOP_OR, false, - DXIL_ATOMIC_ORDERING_ACQREL, - DXIL_SYNC_SCOPE_CROSSTHREAD)) - return false; - - return true; -} - -static bool -emit_store_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr) -{ - const struct dxil_value *zero, *index; - - /* All scratch mem accesses should have been lowered to scalar 32bit - * accesses. - */ - assert(nir_src_bit_size(intr->src[0]) == 32); - assert(nir_src_num_components(intr->src[0]) == 1); - - zero = dxil_module_get_int32_const(&ctx->mod, 0); - if (!zero) - return false; - - index = get_src(ctx, &intr->src[1], 0, nir_type_uint); - if (!index) - return false; - - const struct dxil_value *ops[] = { ctx->scratchvars, zero, index }; - const struct dxil_value *ptr, *value; - - ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); - if (!ptr) - return false; - - value = get_src(ctx, &intr->src[0], 0, nir_type_uint); - if (!value) - return false; - - return dxil_emit_store(&ctx->mod, value, ptr, 4, false); -} - static bool emit_load_ubo(struct ntd_context *ctx, nir_intrinsic_instr *intr) { @@ -4020,7 +3959,14 @@ deref_to_gep(struct ntd_context *ctx, nir_deref_instr *deref) const struct dxil_value *, count + 1); nir_variable *var = path.path[0]->var; - gep_indices[0] = ctx->consts[var->data.driver_location]; + const struct dxil_value **var_array; + switch (deref->modes) { + case nir_var_mem_constant: var_array = ctx->consts; break; + case nir_var_mem_shared: var_array = ctx->sharedvars; break; + case nir_var_function_temp: var_array = ctx->scratchvars; break; + default: unreachable("Invalid deref mode"); + } + gep_indices[0] = var_array[var->data.driver_location]; for (uint32_t i = 0; i < count; ++i) gep_indices[i + 1] = get_src_ssa(ctx, &path.path[i]->dest.ssa, 0); @@ -4045,34 +3991,32 @@ emit_load_deref(struct ntd_context *ctx, nir_intrinsic_instr *intr) } static bool -emit_load_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr) +emit_store_deref(struct ntd_context *ctx, nir_intrinsic_instr *intr) { - const struct dxil_value *zero, *index; - unsigned bit_size = nir_dest_bit_size(intr->dest); - unsigned align = bit_size / 8; - - /* All shared mem accesses should have been lowered to scalar 32bit - * accesses. - */ - assert(bit_size == 32); - assert(nir_dest_num_components(intr->dest) == 1); - - zero = dxil_module_get_int32_const(&ctx->mod, 0); - if (!zero) - return false; - - index = get_src(ctx, &intr->src[0], 0, nir_type_uint); - if (!index) - return false; - - const struct dxil_value *ops[] = { ctx->sharedvars, zero, index }; - const struct dxil_value *ptr, *retval; - - ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); + nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); + const struct dxil_value *ptr = deref_to_gep(ctx, deref); if (!ptr) return false; - retval = dxil_emit_load(&ctx->mod, ptr, align, false); + const struct dxil_value *value = get_src(ctx, &intr->src[1], 0, nir_get_nir_type_for_glsl_type(deref->type)); + return dxil_emit_store(&ctx->mod, value, ptr, nir_src_bit_size(intr->src[1]) / 8, false); +} + +static bool +emit_atomic_deref(struct ntd_context *ctx, nir_intrinsic_instr *intr) +{ + const struct dxil_value *ptr = deref_to_gep(ctx, nir_src_as_deref(intr->src[0])); + if (!ptr) + return false; + + const struct dxil_value *value = get_src(ctx, &intr->src[1], 0, nir_type_uint); + if (!value) + return false; + + enum dxil_rmw_op dxil_op = nir_atomic_to_dxil_rmw(nir_intrinsic_atomic_op(intr)); + const struct dxil_value *retval = dxil_emit_atomicrmw(&ctx->mod, value, ptr, dxil_op, false, + DXIL_ATOMIC_ORDERING_ACQREL, + DXIL_SYNC_SCOPE_CROSSTHREAD); if (!retval) return false; @@ -4081,34 +4025,20 @@ emit_load_shared(struct ntd_context *ctx, nir_intrinsic_instr *intr) } static bool -emit_load_scratch(struct ntd_context *ctx, nir_intrinsic_instr *intr) +emit_atomic_deref_swap(struct ntd_context *ctx, nir_intrinsic_instr *intr) { - const struct dxil_value *zero, *index; - unsigned bit_size = nir_dest_bit_size(intr->dest); - unsigned align = bit_size / 8; - - /* All scratch mem accesses should have been lowered to scalar 32bit - * accesses. - */ - assert(bit_size == 32); - assert(nir_dest_num_components(intr->dest) == 1); - - zero = dxil_module_get_int32_const(&ctx->mod, 0); - if (!zero) - return false; - - index = get_src(ctx, &intr->src[0], 0, nir_type_uint); - if (!index) - return false; - - const struct dxil_value *ops[] = { ctx->scratchvars, zero, index }; - const struct dxil_value *ptr, *retval; - - ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); + const struct dxil_value *ptr = deref_to_gep(ctx, nir_src_as_deref(intr->src[0])); if (!ptr) return false; - retval = dxil_emit_load(&ctx->mod, ptr, align, false); + const struct dxil_value *cmp = get_src(ctx, &intr->src[1], 0, nir_type_uint); + const struct dxil_value *value = get_src(ctx, &intr->src[2], 0, nir_type_uint); + if (!value) + return false; + + const struct dxil_value *retval = dxil_emit_cmpxchg(&ctx->mod, cmp, value, ptr, false, + DXIL_ATOMIC_ORDERING_ACQREL, + DXIL_SYNC_SCOPE_CROSSTHREAD); if (!retval) return false; @@ -4571,82 +4501,6 @@ emit_ssbo_atomic_comp_swap(struct ntd_context *ctx, nir_intrinsic_instr *intr) return true; } -static bool -emit_shared_atomic(struct ntd_context *ctx, nir_intrinsic_instr *intr) -{ - const struct dxil_value *zero, *index; - - assert(nir_src_bit_size(intr->src[1]) == 32); - - zero = dxil_module_get_int32_const(&ctx->mod, 0); - if (!zero) - return false; - - index = get_src(ctx, &intr->src[0], 0, nir_type_uint); - if (!index) - return false; - - const struct dxil_value *ops[] = { ctx->sharedvars, zero, index }; - const struct dxil_value *ptr, *value, *retval; - - ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); - if (!ptr) - return false; - - nir_atomic_op nir_op = nir_intrinsic_atomic_op(intr); - enum dxil_rmw_op dxil_op = nir_atomic_to_dxil_rmw(nir_op); - nir_alu_type type = nir_atomic_op_type(nir_op); - value = get_src(ctx, &intr->src[1], 0, type); - if (!value) - return false; - - retval = dxil_emit_atomicrmw(&ctx->mod, value, ptr, dxil_op, false, - DXIL_ATOMIC_ORDERING_ACQREL, - DXIL_SYNC_SCOPE_CROSSTHREAD); - if (!retval) - return false; - - store_dest(ctx, &intr->dest, 0, retval); - return true; -} - -static bool -emit_shared_atomic_comp_swap(struct ntd_context *ctx, nir_intrinsic_instr *intr) -{ - const struct dxil_value *zero, *index; - - assert(nir_src_bit_size(intr->src[1]) == 32); - - zero = dxil_module_get_int32_const(&ctx->mod, 0); - if (!zero) - return false; - - index = get_src(ctx, &intr->src[0], 0, nir_type_uint); - if (!index) - return false; - - const struct dxil_value *ops[] = { ctx->sharedvars, zero, index }; - const struct dxil_value *ptr, *cmpval, *newval, *retval; - - ptr = dxil_emit_gep_inbounds(&ctx->mod, ops, ARRAY_SIZE(ops)); - if (!ptr) - return false; - - cmpval = get_src(ctx, &intr->src[1], 0, nir_type_uint); - newval = get_src(ctx, &intr->src[2], 0, nir_type_uint); - if (!cmpval || !newval) - return false; - - retval = dxil_emit_cmpxchg(&ctx->mod, cmpval, newval, ptr, false, - DXIL_ATOMIC_ORDERING_ACQREL, - DXIL_SYNC_SCOPE_CROSSTHREAD); - if (!retval) - return false; - - store_dest(ctx, &intr->dest, 0, retval); - return true; -} - static bool emit_vulkan_resource_index(struct ntd_context *ctx, nir_intrinsic_instr *intr) { @@ -5016,13 +4870,14 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) return emit_store_ssbo(ctx, intr); case nir_intrinsic_store_ssbo_masked_dxil: return emit_store_ssbo_masked(ctx, intr); - case nir_intrinsic_store_shared_dxil: - case nir_intrinsic_store_shared_masked_dxil: - return emit_store_shared(ctx, intr); - case nir_intrinsic_store_scratch_dxil: - return emit_store_scratch(ctx, intr); case nir_intrinsic_load_deref: return emit_load_deref(ctx, intr); + case nir_intrinsic_store_deref: + return emit_store_deref(ctx, intr); + case nir_intrinsic_deref_atomic: + return emit_atomic_deref(ctx, intr); + case nir_intrinsic_deref_atomic_swap: + return emit_atomic_deref_swap(ctx, intr); case nir_intrinsic_load_ubo: return emit_load_ubo(ctx, intr); case nir_intrinsic_load_ubo_dxil: @@ -5052,10 +4907,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) return emit_load_sample_mask_in(ctx, intr); case nir_intrinsic_load_tess_coord: return emit_load_tess_coord(ctx, intr); - case nir_intrinsic_load_shared_dxil: - return emit_load_shared(ctx, intr); - case nir_intrinsic_load_scratch_dxil: - return emit_load_scratch(ctx, intr); case nir_intrinsic_discard_if: case nir_intrinsic_demote_if: return emit_discard_if(ctx, intr); @@ -5072,10 +4923,6 @@ emit_intrinsic(struct ntd_context *ctx, nir_intrinsic_instr *intr) return emit_ssbo_atomic(ctx, intr); case nir_intrinsic_ssbo_atomic_swap: return emit_ssbo_atomic_comp_swap(ctx, intr); - case nir_intrinsic_shared_atomic_dxil: - return emit_shared_atomic(ctx, intr); - case nir_intrinsic_shared_atomic_swap_dxil: - return emit_shared_atomic_comp_swap(ctx, intr); case nir_intrinsic_image_deref_atomic: case nir_intrinsic_image_atomic: case nir_intrinsic_bindless_image_atomic: @@ -6080,60 +5927,30 @@ emit_cbvs(struct ntd_context *ctx) } static bool -emit_scratch(struct ntd_context *ctx) +emit_scratch(struct ntd_context *ctx, nir_function_impl *impl) { - if (ctx->shader->scratch_size) { - /* - * We always allocate an u32 array, no matter the actual variable types. - * According to the DXIL spec, the minimum load/store granularity is - * 32-bit, anything smaller requires using a read-extract/read-write-modify - * approach. - */ - unsigned size = ALIGN_POT(ctx->shader->scratch_size, sizeof(uint32_t)); - const struct dxil_type *int32 = dxil_module_get_int_type(&ctx->mod, 32); - const struct dxil_value *array_length = dxil_module_get_int32_const(&ctx->mod, size / sizeof(uint32_t)); - if (!int32 || !array_length) + uint32_t index = 0; + nir_foreach_function_temp_variable(var, impl) + var->data.driver_location = index++; + + if (ctx->scratchvars) + ralloc_free((void *)ctx->scratchvars); + + ctx->scratchvars = ralloc_array(ctx->ralloc_ctx, const struct dxil_value *, index); + + nir_foreach_function_temp_variable(var, impl) { + const struct dxil_type *type = get_type_for_glsl_type(&ctx->mod, var->type); + const struct dxil_value *length = dxil_module_get_int32_const(&ctx->mod, 1); + const struct dxil_value *ptr = dxil_emit_alloca(&ctx->mod, type, length, 16); + if (!ptr) return false; - const struct dxil_type *type = dxil_module_get_array_type( - &ctx->mod, int32, size / sizeof(uint32_t)); - if (!type) - return false; - - ctx->scratchvars = dxil_emit_alloca(&ctx->mod, type, int32, array_length, 4); - if (!ctx->scratchvars) - return false; + ctx->scratchvars[var->data.driver_location] = ptr; } return true; } -/* The validator complains if we don't have ops that reference a global variable. */ -static bool -shader_has_shared_ops(struct nir_shader *s) -{ - nir_foreach_function(func, s) { - if (!func->impl) - continue; - nir_foreach_block(block, func->impl) { - nir_foreach_instr(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - switch (intrin->intrinsic) { - case nir_intrinsic_load_shared_dxil: - case nir_intrinsic_store_shared_dxil: - case nir_intrinsic_shared_atomic_dxil: - case nir_intrinsic_shared_atomic_swap_dxil: - return true; - default: break; - } - } - } - } - return false; -} - static bool emit_function(struct ntd_context *ctx, nir_function *func) { @@ -6178,7 +5995,7 @@ emit_function(struct ntd_context *ctx, nir_function *func) nir_gather_ssa_types(impl, ctx->float_types, ctx->int_types); - if (!emit_scratch(ctx)) + if (!emit_scratch(ctx, impl)) return false; if (!emit_static_indexing_handles(ctx)) @@ -6243,37 +6060,16 @@ emit_module(struct ntd_context *ctx, const struct nir_to_dxil_options *opts) } } - if (ctx->shader->info.shared_size && shader_has_shared_ops(ctx->shader)) { - const struct dxil_type *type; - unsigned size; - - /* - * We always allocate an u32 array, no matter the actual variable types. - * According to the DXIL spec, the minimum load/store granularity is - * 32-bit, anything smaller requires using a read-extract/read-write-modify - * approach. Non-atomic 64-bit accesses are allowed, but the - * GEP(cast(gvar, u64[] *), offset) and cast(GEP(gvar, offset), u64 *)) - * sequences don't seem to be accepted by the DXIL validator when the - * pointer is in the groupshared address space, making the 32-bit -> 64-bit - * pointer cast impossible. - */ - size = ALIGN_POT(ctx->shader->info.shared_size, sizeof(uint32_t)); - type = dxil_module_get_array_type(&ctx->mod, - dxil_module_get_int_type(&ctx->mod, 32), - size / sizeof(uint32_t)); - ctx->sharedvars = dxil_add_global_ptr_var(&ctx->mod, "shared", type, - DXIL_AS_GROUPSHARED, - ffs(sizeof(uint64_t)), - NULL); - } + if (!emit_shared_vars(ctx)) + return false; + if (!emit_global_consts(ctx)) + return false; /* UAVs */ if (ctx->shader->info.stage == MESA_SHADER_KERNEL) { if (!emit_globals(ctx, opts->num_kernel_globals)) return false; - if (!emit_global_consts(ctx)) - return false; } else if (ctx->opts->environment == DXIL_ENVIRONMENT_VULKAN) { /* Handle read/write SSBOs as UAVs */ nir_foreach_variable_with_modes(var, ctx->shader, nir_var_mem_ssbo) { @@ -6435,7 +6231,7 @@ optimize_nir(struct nir_shader *s, const struct nir_to_dxil_options *opts) do { progress = false; NIR_PASS_V(s, nir_lower_vars_to_ssa); - NIR_PASS(progress, s, nir_lower_indirect_derefs, nir_var_function_temp, UINT32_MAX); + NIR_PASS(progress, s, nir_lower_indirect_derefs, nir_var_function_temp, 4); NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); NIR_PASS(progress, s, nir_copy_prop); NIR_PASS(progress, s, nir_opt_copy_prop_vars); @@ -6751,7 +6547,7 @@ nir_to_dxil(struct nir_shader *s, const struct nir_to_dxil_options *opts, optimize_nir(s, opts); NIR_PASS_V(s, nir_remove_dead_variables, - nir_var_function_temp | nir_var_mem_constant, NULL); + nir_var_function_temp | nir_var_mem_constant | nir_var_mem_shared, NULL); if (!allocate_sysvalues(ctx)) return false; diff --git a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c index 0f9beaa672d..90762a648f9 100644 --- a/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c +++ b/src/microsoft/spirv_to_dxil/dxil_spirv_nir.c @@ -1075,7 +1075,6 @@ dxil_spirv_nir_passes(nir_shader *nir, NIR_PASS_V(nir, nir_lower_explicit_io, nir_var_mem_shared, nir_address_format_32bit_offset); - NIR_PASS_V(nir, dxil_nir_lower_atomics_to_dxil); NIR_PASS_V(nir, dxil_nir_lower_int_cubemaps, false); NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);