diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 2547833aea5..135e63a077b 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -12221,6 +12221,9 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */ unsigned num_sgprs = 0; + const struct ac_vtx_format_info* vtx_info_table = + ac_get_vtx_format_info_table(GFX8, CHIP_POLARIS10); + for (unsigned loc = 0; loc < key->num_attributes;) { unsigned num_descs = load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc); @@ -12285,9 +12288,12 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade /* perform load */ PhysReg cur_desc = desc.advance(i * 16); if ((key->misaligned_mask & (1u << loc))) { - unsigned dfmt = key->state.formats[loc] & 0xf; - unsigned nfmt = key->state.formats[loc] >> 4; - const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); + const struct ac_vtx_format_info* vtx_info = &vtx_info_table[key->state.formats[loc]]; + + assert(vtx_info->has_hw_format & 0x1); + unsigned dfmt = vtx_info->hw_format[0] & 0xf; + unsigned nfmt = vtx_info->hw_format[0] >> 4; + for (unsigned j = 0; j < vtx_info->num_channels; j++) { bool post_shuffle = key->state.post_shuffle & (1u << loc); unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j); @@ -12297,14 +12303,14 @@ select_vs_prolog(Program* program, const struct aco_vs_prolog_key* key, ac_shade * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't * care). */ - if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32) + if (dfmt == V_008F0C_BUF_DATA_FORMAT_32) bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1), Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false, false, true); else bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1), - Operand(cur_desc, s4), fetch_index, Operand::c32(0u), - vtx_info->chan_format, nfmt, offset, false, true); + Operand(cur_desc, s4), fetch_index, Operand::c32(0u), dfmt, nfmt, + offset, false, true); } uint32_t one = nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c index d3dc79e232a..b4dd00fa631 100644 --- a/src/amd/vulkan/radv_cmd_buffer.c +++ b/src/amd/vulkan/radv_cmd_buffer.c @@ -308,8 +308,6 @@ radv_destroy_cmd_buffer(struct radv_cmd_buffer *cmd_buffer) { list_del(&cmd_buffer->pool_link); - util_dynarray_fini(&cmd_buffer->cached_vertex_formats); - list_for_each_entry_safe(struct radv_cmd_buffer_upload, up, &cmd_buffer->upload.list, list) { cmd_buffer->device->ws->buffer_destroy(cmd_buffer->device->ws, up->upload_bo); @@ -380,8 +378,6 @@ radv_create_cmd_buffer(struct radv_device *device, struct radv_cmd_pool *pool, vk_object_base_init(&device->vk, &cmd_buffer->meta_push_descriptors.base, VK_OBJECT_TYPE_DESCRIPTOR_SET); - util_dynarray_init(&cmd_buffer->cached_vertex_formats, NULL); - for (unsigned i = 0; i < MAX_BIND_POINTS; i++) vk_object_base_init(&device->vk, &cmd_buffer->descriptors[i].push_set.set.base, VK_OBJECT_TYPE_DESCRIPTOR_SET); @@ -2964,6 +2960,7 @@ lookup_vs_prolog(struct radv_cmd_buffer *cmd_buffer, struct radv_shader *vs_shad cmd_buffer->state.vbo_misaligned_mask = misaligned_mask; cmd_buffer->state.vbo_misaligned_mask_invalid &= ~attribute_mask; } + misaligned_mask |= state->nontrivial_formats; /* try to use a pre-compiled prolog first */ struct radv_shader_part *prolog = NULL; @@ -3540,39 +3537,6 @@ radv_flush_constants(struct radv_cmd_buffer *cmd_buffer, VkShaderStageFlags stag cmd_buffer->push_constant_stages |= dirty_stages; } -enum radv_dst_sel { - DST_SEL_0001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), - DST_SEL_X001 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_0) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), - DST_SEL_XY01 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_0) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), - DST_SEL_XYZ1 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_1), - DST_SEL_XYZW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), - DST_SEL_ZYXW = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | - S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W), -}; - -static const uint32_t data_format_dst_sel[] = { - [V_008F0C_BUF_DATA_FORMAT_INVALID] = DST_SEL_0001, - [V_008F0C_BUF_DATA_FORMAT_8] = DST_SEL_X001, - [V_008F0C_BUF_DATA_FORMAT_16] = DST_SEL_X001, - [V_008F0C_BUF_DATA_FORMAT_8_8] = DST_SEL_XY01, - [V_008F0C_BUF_DATA_FORMAT_32] = DST_SEL_X001, - [V_008F0C_BUF_DATA_FORMAT_16_16] = DST_SEL_XY01, - [V_008F0C_BUF_DATA_FORMAT_10_11_11] = DST_SEL_XYZ1, - [V_008F0C_BUF_DATA_FORMAT_11_11_10] = DST_SEL_XYZ1, - [V_008F0C_BUF_DATA_FORMAT_10_10_10_2] = DST_SEL_XYZW, - [V_008F0C_BUF_DATA_FORMAT_2_10_10_10] = DST_SEL_XYZW, - [V_008F0C_BUF_DATA_FORMAT_8_8_8_8] = DST_SEL_XYZW, - [V_008F0C_BUF_DATA_FORMAT_32_32] = DST_SEL_XY01, - [V_008F0C_BUF_DATA_FORMAT_16_16_16_16] = DST_SEL_XYZW, - [V_008F0C_BUF_DATA_FORMAT_32_32_32] = DST_SEL_XYZ1, - [V_008F0C_BUF_DATA_FORMAT_32_32_32_32] = DST_SEL_XYZW, -}; - void radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, const struct radv_graphics_pipeline *pipeline, @@ -3580,6 +3544,7 @@ radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, { struct radv_shader *vs_shader = radv_get_shader(&pipeline->base, MESA_SHADER_VERTEX); enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level; + enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family; unsigned desc_index = 0; uint32_t mask = pipeline->vb_desc_usage_mask; uint64_t va; @@ -3587,6 +3552,9 @@ radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, vs_shader->info.vs.dynamic_inputs ? &cmd_buffer->state.dynamic_vs_input : NULL; assert(!vs_state || pipeline->use_per_attribute_vb_descs); + const struct ac_vtx_format_info *vtx_info_table = + vs_state ? ac_get_vtx_format_info_table(chip, family) : NULL; + while (mask) { unsigned i = u_bit_scan(&mask); uint32_t *desc = &((uint32_t *)vb_ptr)[desc_index++ * 4]; @@ -3598,23 +3566,25 @@ radv_write_vertex_descriptors(const struct radv_cmd_buffer *cmd_buffer, unsigned num_records; unsigned stride; - if (vs_state) { - unsigned format = vs_state->formats[i]; - unsigned dfmt = format & 0xf; - unsigned nfmt = (format >> 4) & 0x7; + if (vs_state && !(vs_state->nontrivial_formats & BITFIELD_BIT(i))) { + const struct ac_vtx_format_info *vtx_info = &vtx_info_table[vs_state->formats[i]]; + unsigned hw_format = vtx_info->hw_format[vtx_info->num_channels - 1]; - rsrc_word3 = vs_state->post_shuffle & (1u << i) ? DST_SEL_ZYXW : data_format_dst_sel[dfmt]; - - if (chip >= GFX10) - rsrc_word3 |= S_008F0C_FORMAT(ac_get_tbuffer_format(chip, dfmt, nfmt)); - else - rsrc_word3 |= S_008F0C_NUM_FORMAT(nfmt) | S_008F0C_DATA_FORMAT(dfmt); + if (chip >= GFX10) { + rsrc_word3 = vtx_info->dst_sel | S_008F0C_FORMAT(hw_format); + } else { + rsrc_word3 = vtx_info->dst_sel | S_008F0C_NUM_FORMAT((hw_format >> 4) & 0x7) | + S_008F0C_DATA_FORMAT(hw_format & 0xf); + } } else { + rsrc_word3 = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); if (chip >= GFX10) - rsrc_word3 = DST_SEL_XYZW | S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT); + rsrc_word3 |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_UINT); else - rsrc_word3 = DST_SEL_XYZW | S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | - S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); + rsrc_word3 |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } if (pipeline->dynamic_states & (RADV_DYNAMIC_VERTEX_INPUT_BINDING_STRIDE | @@ -5934,6 +5904,9 @@ radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingD state->bindings_match_attrib = true; enum amd_gfx_level chip = cmd_buffer->device->physical_device->rad_info.gfx_level; + enum radeon_family family = cmd_buffer->device->physical_device->rad_info.family; + const struct ac_vtx_format_info *vtx_info_table = ac_get_vtx_format_info_table(chip, family); + for (unsigned i = 0; i < vertexAttributeDescriptionCount; i++) { const VkVertexInputAttributeDescription2EXT *attrib = &pVertexAttributeDescriptions[i]; const VkVertexInputBindingDescription2EXT *binding = bindings[attrib->binding]; @@ -5955,50 +5928,27 @@ radv_CmdSetVertexInputEXT(VkCommandBuffer commandBuffer, uint32_t vertexBindingD cmd_buffer->vertex_bindings[attrib->binding].stride = binding->stride; state->offsets[loc] = attrib->offset; - struct dynamic_vertex_format_cache *found = NULL; - util_dynarray_foreach(&cmd_buffer->cached_vertex_formats, - struct dynamic_vertex_format_cache, - vf) { - if (vf->format == attrib->format) { - found = vf; - break; - } - } - if (!found) { - unsigned nfmt, dfmt; - bool post_shuffle; - enum ac_vs_input_alpha_adjust alpha_adjust; - const struct util_format_description *format_desc = vk_format_description(attrib->format); + enum pipe_format format = vk_format_to_pipe_format(attrib->format); + const struct ac_vtx_format_info *vtx_info = &vtx_info_table[format]; - found = util_dynarray_grow(&cmd_buffer->cached_vertex_formats, - struct dynamic_vertex_format_cache, 1); - radv_translate_vertex_format(cmd_buffer->device->physical_device, attrib->format, format_desc, - &dfmt, &nfmt, &post_shuffle, &alpha_adjust); - found->format = attrib->format; - found->hw_fmt = dfmt | (nfmt << 4); - const uint8_t format_align_req_minus_1 = format_desc->channel[0].size >= 32 ? 3 : - (format_desc->block.bits / 8u - 1); - found->fmt_align_req_minus_1 = format_align_req_minus_1; - found->fmt_size = format_desc->block.bits / 8u; - found->post_shuffle = post_shuffle; - found->alpha_adjust_lo = alpha_adjust & 0x1; - found->alpha_adjust_hi = (alpha_adjust >> 1) & 0x1; - } + state->formats[loc] = format; + uint8_t align_req_minus_1 = vtx_info->chan_byte_size >= 4 ? 3 : (vtx_info->element_size - 1); + state->format_align_req_minus_1[loc] = align_req_minus_1; + state->format_sizes[loc] = vtx_info->element_size; + state->alpha_adjust_lo |= (vtx_info->alpha_adjust & 0x1) << loc; + state->alpha_adjust_hi |= (vtx_info->alpha_adjust >> 1) << loc; + if (G_008F0C_DST_SEL_X(vtx_info->dst_sel) == V_008F0C_SQ_SEL_Z) + state->post_shuffle |= BITFIELD_BIT(loc); - state->formats[loc] = found->hw_fmt; - state->format_align_req_minus_1[loc] = found->fmt_align_req_minus_1; - state->format_sizes[loc] = found->fmt_size; - state->alpha_adjust_lo |= found->alpha_adjust_lo << loc; - state->alpha_adjust_hi |= found->alpha_adjust_hi << loc; - if (found->post_shuffle) - state->post_shuffle |= 1u << loc; + if (!(vtx_info->has_hw_format & BITFIELD_BIT(vtx_info->num_channels - 1))) + state->nontrivial_formats |= BITFIELD_BIT(loc); if ((chip == GFX6 || chip >= GFX10) && cmd_buffer->state.vbo_bound_mask & BITFIELD_BIT(attrib->binding)) { - if (binding->stride & found->fmt_align_req_minus_1) { + if (binding->stride & align_req_minus_1) { cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc); } else if ((cmd_buffer->vertex_bindings[attrib->binding].offset + state->offsets[loc]) & - found->fmt_align_req_minus_1) { + align_req_minus_1) { cmd_buffer->state.vbo_misaligned_mask |= BITFIELD_BIT(loc); } } diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index 10a7986af40..043022bd760 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -1568,16 +1568,6 @@ enum radv_cmd_buffer_status { RADV_CMD_BUFFER_STATUS_PENDING, }; -struct dynamic_vertex_format_cache { - VkFormat format; - uint8_t hw_fmt; - uint8_t fmt_align_req_minus_1; - uint8_t fmt_size; - bool post_shuffle; - bool alpha_adjust_lo; - bool alpha_adjust_hi; -}; - struct radv_cmd_buffer { struct vk_command_buffer vk; @@ -1586,7 +1576,6 @@ struct radv_cmd_buffer { struct radv_cmd_pool *pool; struct list_head pool_link; - struct util_dynarray cached_vertex_formats; VkCommandBufferUsageFlags usage_flags; enum radv_cmd_buffer_status status; struct radeon_cmdbuf *cs; diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 2b2d2181629..137048e0c76 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -374,6 +374,7 @@ struct radv_vs_input_state { */ uint32_t alpha_adjust_lo; uint32_t alpha_adjust_hi; + uint32_t nontrivial_formats; uint8_t bindings[MAX_VERTEX_ATTRIBS]; uint32_t divisors[MAX_VERTEX_ATTRIBS];