diff --git a/src/intel/compiler/brw_compile_vs.cpp b/src/intel/compiler/brw_compile_vs.cpp index 0e3c20ed213..4c1fb610dcc 100644 --- a/src/intel/compiler/brw_compile_vs.cpp +++ b/src/intel/compiler/brw_compile_vs.cpp @@ -28,6 +28,159 @@ brw_assign_vs_urb_setup(brw_shader &s) } } +static unsigned +brw_nir_pack_vs_input(nir_shader *nir, struct brw_vs_prog_data *prog_data) +{ + struct vf_attribute { + unsigned reg_offset; + uint8_t component_mask; + bool is_64bit:1; + bool is_used:1; + } attributes[MAX_HW_VERT_ATTRIB] = {}; + + /* IO lowering is going to break dmat inputs into a location each, so we + * need to propagate the 64-bit nature of the variable to each slot. + */ + nir_foreach_shader_in_variable(var, nir) { + const bool is_64bit = glsl_type_is_64bit(var->type); + const uint32_t slots = glsl_count_vec4_slots(var->type, true, false); + for (uint32_t i = 0; i < slots; i++) + attributes[var->data.location + i].is_64bit = is_64bit; + } + + /* First mark all used inputs and the components they read */ + nir_foreach_function_impl(impl, nir) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_input) + continue; + + assert(intrin->def.bit_size == 32); + + const struct nir_io_semantics io = + nir_intrinsic_io_semantics(intrin); + + attributes[io.location].is_used = true; + + /* SKL PRMs, Vol 2a: Command Reference: Instructions, + * 3DSTATE_VF_COMPONENT_PACKING: + * + * "Software shall enable all components (XYZW) for any and all + * VERTEX_ELEMENTs associated with a 256-bit SURFACE_FORMAT. + * It is INVALID to disable any components in these cases." + * + * Enable XYZW for any dual-slot (> 128-bit) input. 
+ */ + if (nir->info.dual_slot_inputs & BITFIELD64_BIT(io.location)) { + attributes[io.location].component_mask |= 0xff; + } else { + const uint8_t mask = + nir_component_mask(intrin->num_components) << + nir_intrinsic_component(intrin); + + attributes[io.location].component_mask |= mask; + } + } + } + } + + /* Compute the register offsets, packing used attributes back to back */ + unsigned reg_offset = 0; + unsigned vertex_element = 0; + for (unsigned a = 0; a < ARRAY_SIZE(attributes); a++) { + if (!attributes[a].is_used) + continue; + + /* SKL PRMs, Vol 2a: Command Reference: Instructions, + * 3DSTATE_VF_COMPONENT_PACKING: + * + * "No enable bits are provided for Vertex Elements [32-33], + * and therefore no packing is performed on these elements (if + * Valid, all 4 components are stored)." + */ + if (vertex_element >= 32) + attributes[a].component_mask = 0xf; + + attributes[a].reg_offset = reg_offset; + + reg_offset += util_bitcount(attributes[a].component_mask); + vertex_element++; + } + + /* Remap inputs to their packed slot and component */ + nir_foreach_function_impl(impl, nir) { + nir_foreach_block(block, impl) { + nir_foreach_instr(instr, block) { + if (instr->type != nir_instr_type_intrinsic) + continue; + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_input) + continue; + + struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin); + + unsigned slot = attributes[io.location].reg_offset / 4; + unsigned slot_component = + attributes[io.location].reg_offset % 4 + + util_bitcount(attributes[io.location].component_mask & + BITFIELD_MASK(io.high_dvec2 * 4 + + nir_intrinsic_component(intrin))); + + slot += slot_component / 4; + slot_component %= 4; + + nir_intrinsic_set_base(intrin, slot); + nir_intrinsic_set_component(intrin, slot_component); + } + } + } + + /* Generate the packing array: a 4-bit mask per used attribute */ + unsigned vf_offset = 0; + for (unsigned a = 0; a < ARRAY_SIZE(attributes) && vf_offset < 32; a++) { + if (!attributes[a].is_used) + continue; + + uint32_t mask; + /* Stored 
masks in attributes[a].component_mask are in terms of 32-bit + * components, but the HW, depending on the format, will interpret + * prog_data->vf_component_packing[] bits as either a 32-bit or 64-bit + * component. So for 64-bit attributes we fold each pair of bits into one. + */ + if (attributes[a].is_64bit) { + mask = 0; + u_foreach_bit(b, attributes[a].component_mask) + mask |= BITFIELD_BIT(b / 2); + } else { + mask = attributes[a].component_mask; + } + /* We should only have 4 bits enabled max */ + assert((mask & ~0xfu) == 0); + prog_data->vf_component_packing[vf_offset / 8] |= + mask << (4 * (vf_offset % 8)); + vf_offset++; + } + + /* SKL PRMs, Vol 2a: Command Reference: Instructions, + * 3DSTATE_VF_COMPONENT_PACKING: + * + * "At least one component of one "valid" Vertex Element must be + * enabled." + */ + if (prog_data->vf_component_packing[0] == 0 && + prog_data->vf_component_packing[1] == 0 && + prog_data->vf_component_packing[2] == 0 && + prog_data->vf_component_packing[3] == 0) + prog_data->vf_component_packing[0] = 0x1; + + return reg_offset; +} + static bool run_vs(brw_shader &s) { @@ -83,6 +236,13 @@ brw_compile_vs(const struct brw_compiler *compiler, brw_nir_lower_vs_inputs(nir); brw_nir_lower_vue_outputs(nir); + + memset(prog_data->vf_component_packing, 0, + sizeof(prog_data->vf_component_packing)); + unsigned nr_packed_regs = 0; + if (key->vf_component_packing) + nr_packed_regs = brw_nir_pack_vs_input(nir, prog_data); + brw_postprocess_nir(nir, compiler, debug_enabled, key->base.robust_flags); @@ -127,8 +287,14 @@ brw_compile_vs(const struct brw_compiler *compiler, if (BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID)) prog_data->uses_drawid = true; - prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); - unsigned nr_attribute_regs = 4 * nr_attribute_slots; + unsigned nr_attribute_regs; + if (key->vf_component_packing) { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); + nr_attribute_regs = nr_packed_regs; + } else 
{ + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); + nr_attribute_regs = 4 * (nr_attribute_slots); + } /* Since vertex shaders reuse the same VUE entry for inputs and outputs * (overwriting the original contents), we need to make sure the size is diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 9a102e304ee..21185d92ae5 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -233,6 +233,16 @@ struct brw_base_prog_key { /** The program key for Vertex Shaders. */ struct brw_vs_prog_key { struct brw_base_prog_key base; + + /** Enable vertex input component packing + * + * Using this option requires that the driver programs + * 3DSTATE_VF_COMPONENT_PACKING with the values provided in + * brw_vs_prog_data::vf_component_packing + */ + bool vf_component_packing : 1; + + uint32_t padding : 31; }; /** The program key for Tessellation Control Shaders. */ @@ -1028,6 +1038,8 @@ struct brw_vs_prog_data { bool uses_firstvertex; bool uses_baseinstance; bool uses_drawid; + + uint32_t vf_component_packing[4]; /* 4 bits per input */ }; struct brw_tcs_prog_data