From a9c057d5a3568282d40ee9fc4c7b101464bb10cd Mon Sep 17 00:00:00 2001 From: Konstantin Seurer Date: Fri, 27 Oct 2023 10:01:09 +0200 Subject: [PATCH] radv: Implement NIR debug printf Adds radv_build_printf which can print nir_defs. The arguments are written to a buffer and printed after the submit. Reviewed-by: Samuel Pitoiset Part-of: --- src/amd/vulkan/meson.build | 1 + src/amd/vulkan/radv_device.c | 8 + src/amd/vulkan/radv_pipeline_cache.c | 4 + src/amd/vulkan/radv_printf.c | 330 +++++++++++++++++++++++++++ src/amd/vulkan/radv_private.h | 35 +++ src/amd/vulkan/radv_queue.c | 2 + src/amd/vulkan/radv_shader.c | 2 + 7 files changed, 382 insertions(+) create mode 100644 src/amd/vulkan/radv_printf.c diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build index 7aa8501d3d1..b7922a888d6 100644 --- a/src/amd/vulkan/meson.build +++ b/src/amd/vulkan/meson.build @@ -124,6 +124,7 @@ libradv_files = files( 'radv_pipeline_compute.c', 'radv_pipeline_graphics.c', 'radv_pipeline_rt.c', + 'radv_printf.c', 'radv_private.h', 'radv_queue.c', 'radv_radeon_winsys.h', diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index ac0fe78b680..9169f99b485 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -1194,6 +1194,10 @@ radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCr device->capture_replay_arena_vas = _mesa_hash_table_u64_create(NULL); } + result = radv_printf_data_init(device); + if (result != VK_SUCCESS) + goto fail_cache; + *pDevice = radv_device_to_handle(device); return VK_SUCCESS; @@ -1202,6 +1206,8 @@ fail_cache: fail_meta: radv_device_finish_meta(device); fail: + radv_printf_data_finish(device); + radv_sqtt_finish(device); radv_spm_finish(device); @@ -1313,6 +1319,8 @@ radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator) radv_destroy_shader_arenas(device); + radv_printf_data_finish(device); + radv_sqtt_finish(device); radv_rra_trace_finish(_device, &device->rra_trace); diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c index 75aa3a4ec14..939e4f2158e 100644 --- a/src/amd/vulkan/radv_pipeline_cache.c +++ b/src/amd/vulkan/radv_pipeline_cache.c @@ -38,6 +38,10 @@ static bool radv_is_cache_disabled(struct radv_device *device) { + /* The buffer address used for debug printf is hardcoded. */ + if (device->printf.buffer_addr) + return true; + /* Pipeline caches can be disabled with RADV_DEBUG=nocache, with MESA_GLSL_CACHE_DISABLE=1 and * when ACO_DEBUG is used. MESA_GLSL_CACHE_DISABLE is done elsewhere. */ diff --git a/src/amd/vulkan/radv_printf.c b/src/amd/vulkan/radv_printf.c new file mode 100644 index 00000000000..eca2cb56d37 --- /dev/null +++ b/src/amd/vulkan/radv_printf.c @@ -0,0 +1,330 @@ +/* + * Copyright © 2024 Valve Corporation + * + * SPDX-License-Identifier: MIT + */ + +#include "radv_private.h" + +#include "util/hash_table.h" +#include "util/strndup.h" +#include "util/u_printf.h" + +#include "nir.h" +#include "nir_builder.h" + +static struct hash_table *device_ht = NULL; + +VkResult +radv_printf_data_init(struct radv_device *device) +{ + util_dynarray_init(&device->printf.formats, NULL); + + device->printf.buffer_size = debug_get_num_option("RADV_PRINTF_BUFFER_SIZE", 0); + if (device->printf.buffer_size < sizeof(struct radv_printf_buffer_header)) + return VK_SUCCESS; + + VkBufferCreateInfo buffer_create_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = + &(VkBufferUsageFlags2CreateInfoKHR){ + .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR, + .usage = VK_BUFFER_USAGE_2_TRANSFER_SRC_BIT_KHR | VK_BUFFER_USAGE_2_SHADER_DEVICE_ADDRESS_BIT_KHR, + }, + .size = device->printf.buffer_size, + }; + + VkDevice _device = radv_device_to_handle(device); + VkResult result = device->vk.dispatch_table.CreateBuffer(_device, &buffer_create_info, NULL, &device->printf.buffer); + if (result != VK_SUCCESS) + return result; + + VkMemoryRequirements requirements; + device->vk.dispatch_table.GetBufferMemoryRequirements(_device, device->printf.buffer, &requirements); + + VkMemoryAllocateInfo alloc_info = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = requirements.size, + .memoryTypeIndex = radv_find_memory_index(device->physical_device, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT), + }; + + result = device->vk.dispatch_table.AllocateMemory(_device, &alloc_info, NULL, &device->printf.memory); + if (result != VK_SUCCESS) + return result; + + result = device->vk.dispatch_table.MapMemory(_device, device->printf.memory, 0, VK_WHOLE_SIZE, 0, + (void **)&device->printf.data); + if (result != VK_SUCCESS) + return result; + + result = device->vk.dispatch_table.BindBufferMemory(_device, device->printf.buffer, device->printf.memory, 0); + if (result != VK_SUCCESS) + return result; + + struct radv_printf_buffer_header *header = device->printf.data; + header->offset = sizeof(struct radv_printf_buffer_header); + header->size = device->printf.buffer_size; + + VkBufferDeviceAddressInfo addr_info = { + .sType = VK_STRUCTURE_TYPE_BUFFER_DEVICE_ADDRESS_INFO, + .buffer = device->printf.buffer, + }; + device->printf.buffer_addr = device->vk.dispatch_table.GetBufferDeviceAddress(_device, &addr_info); + + return VK_SUCCESS; +} + +void +radv_printf_data_finish(struct radv_device *device) +{ + VkDevice _device = radv_device_to_handle(device); + + device->vk.dispatch_table.DestroyBuffer(_device, device->printf.buffer, NULL); + if (device->printf.memory) + device->vk.dispatch_table.UnmapMemory(_device, device->printf.memory); + device->vk.dispatch_table.FreeMemory(_device, device->printf.memory, NULL); + + util_dynarray_foreach (&device->printf.formats, struct radv_printf_format, format) + free(format->string); + + util_dynarray_fini(&device->printf.formats); +} + +void +radv_build_printf(nir_builder *b, nir_def *cond, const char *format_string, ...) +{ + if (!device_ht) + return; + + struct radv_device *device = _mesa_hash_table_search(device_ht, b->shader)->data; + if (!device->printf.buffer_addr) + return; + + struct radv_printf_format format = {0}; + format.string = strdup(format_string); + if (!format.string) + return; + + uint32_t format_index = util_dynarray_num_elements(&device->printf.formats, struct radv_printf_format); + + if (cond) + nir_push_if(b, cond); + + nir_def *size = nir_imm_int(b, 4); + + va_list arg_list; + va_start(arg_list, format_string); + + uint32_t num_args = 0; + for (uint32_t i = 0; i < strlen(format_string); i++) + if (format_string[i] == '%') + num_args++; + + nir_def **args = malloc(num_args * sizeof(nir_def *)); + nir_def **strides = malloc(num_args * sizeof(nir_def *)); + + nir_def *ballot = nir_ballot(b, 1, 64, nir_imm_true(b)); + nir_def *active_invocation_count = nir_bit_count(b, ballot); + + for (uint32_t i = 0; i < num_args; i++) { + nir_def *arg = va_arg(arg_list, nir_def *); + + if (arg->bit_size == 1) + arg = nir_b2i32(b, arg); + + args[i] = arg; + + uint32_t arg_size = arg->bit_size == 1 ? 32 : arg->bit_size / 8; + format.element_sizes[i] = arg_size; + + nir_update_instr_divergence(b->shader, arg->parent_instr); + + if (arg->divergent) { + strides[i] = nir_imul_imm(b, active_invocation_count, arg_size); + format.divergence_mask |= BITFIELD_BIT(i); + } else { + strides[i] = nir_imm_int(b, arg_size); + } + + size = nir_iadd(b, size, strides[i]); + } + + va_end(arg_list); + + nir_def *offset; + nir_def *undef; + + nir_push_if(b, nir_elect(b, 1)); + { + offset = nir_global_atomic( + b, 32, nir_imm_int64(b, device->printf.buffer_addr + offsetof(struct radv_printf_buffer_header, offset)), size, + .atomic_op = nir_atomic_op_iadd); + } + nir_push_else(b, NULL); + { + undef = nir_undef(b, 1, 32); + } + nir_pop_if(b, NULL); + + offset = nir_read_first_invocation(b, nir_if_phi(b, offset, undef)); + + nir_def *buffer_size = nir_load_global( + b, nir_imm_int64(b, device->printf.buffer_addr + offsetof(struct radv_printf_buffer_header, size)), 4, 1, 32); + + nir_push_if(b, nir_ige(b, buffer_size, nir_iadd(b, offset, size))); + { + nir_def *addr = nir_iadd_imm(b, nir_u2u64(b, offset), device->printf.buffer_addr); + + /* header */ + nir_store_global(b, addr, 4, nir_ior_imm(b, active_invocation_count, format_index << 16), 1); + addr = nir_iadd_imm(b, addr, 4); + + for (uint32_t i = 0; i < num_args; i++) { + nir_def *arg = args[i]; + + if (arg->divergent) { + nir_def *invocation_index = nir_mbcnt_amd(b, ballot, nir_imm_int(b, 0)); + nir_store_global( + b, nir_iadd(b, addr, nir_u2u64(b, nir_imul_imm(b, invocation_index, format.element_sizes[i]))), 4, arg, + 1); + } else { + nir_store_global(b, addr, 4, arg, 1); + } + + addr = nir_iadd(b, addr, nir_u2u64(b, strides[i])); + } + } + nir_pop_if(b, NULL); + + if (cond) + nir_pop_if(b, NULL); + + free(args); + free(strides); + + util_dynarray_append(&device->printf.formats, struct radv_printf_format, format); +} + +void +radv_dump_printf_data(struct radv_device *device) +{ + if (!device->printf.data) + return; + + device->vk.dispatch_table.DeviceWaitIdle(radv_device_to_handle(device)); + + struct radv_printf_buffer_header *header = device->printf.data; + uint8_t *data = device->printf.data; + + for (uint32_t offset = sizeof(struct radv_printf_buffer_header); offset < header->offset;) { + uint32_t printf_header = *(uint32_t *)&data[offset]; + offset += sizeof(uint32_t); + + uint32_t format_index = printf_header >> 16; + struct radv_printf_format *printf_format = + util_dynarray_element(&device->printf.formats, struct radv_printf_format, format_index); + + uint32_t invocation_count = printf_header & 0xFFFF; + + uint32_t num_args = 0; + for (uint32_t i = 0; i < strlen(printf_format->string); i++) + if (printf_format->string[i] == '%') + num_args++; + + char *format = printf_format->string; + + for (uint32_t i = 0; i <= num_args; i++) { + size_t spec_pos = util_printf_next_spec_pos(format, 0); + + if (spec_pos == -1) { + printf("%s", format); + continue; + } + + const char *token = util_printf_prev_tok(&format[spec_pos]); + char *next_format = &format[spec_pos + 1]; + + /* print the part before the format token */ + if (token != format) { + fwrite(format, token - format, 1, stdout); + fflush(stdout); + } + + char *print_str = strndup(token, next_format - token); + /* rebase spec_pos so we can use it with print_str */ + spec_pos += format - token; + + size_t element_size = printf_format->element_sizes[i]; + bool is_float = strpbrk(print_str, "fFeEgGaA") != NULL; + + uint32_t lane_count = (printf_format->divergence_mask & BITFIELD_BIT(i)) ? invocation_count : 1; + for (uint32_t lane = 0; lane < lane_count; lane++) { + switch (element_size) { + case 1: { + uint8_t v; + memcpy(&v, &data[offset], element_size); + printf(print_str, v); + break; + } + case 2: { + uint16_t v; + memcpy(&v, &data[offset], element_size); + printf(print_str, v); + break; + } + case 4: { + if (is_float) { + float v; + memcpy(&v, &data[offset], element_size); + printf(print_str, v); + } else { + uint32_t v; + memcpy(&v, &data[offset], element_size); + printf(print_str, v); + } + break; + } + case 8: { + if (is_float) { + double v; + memcpy(&v, &data[offset], element_size); + printf(print_str, v); + } else { + uint64_t v; + memcpy(&v, &data[offset], element_size); + printf(print_str, v); + } + break; + } + default: + unreachable("Unsupported data type"); + } + + if (lane != lane_count - 1) + printf(" "); + + offset += element_size; + } + + /* rebase format */ + format = next_format; + free(print_str); + } + } + + header->offset = sizeof(struct radv_printf_buffer_header); +} + +void +radv_device_associate_nir(struct radv_device *device, nir_shader *nir) +{ + if (!device->printf.buffer_addr) + return; + + if (!device_ht) + device_ht = _mesa_pointer_hash_table_create(NULL); + + _mesa_hash_table_insert(device_ht, nir, device); +} diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h index ed4ffd02144..48eaa68f6bb 100644 --- a/src/amd/vulkan/radv_private.h +++ b/src/amd/vulkan/radv_private.h @@ -906,6 +906,39 @@ struct radv_device_cache_key { uint32_t use_ngg_culling : 1; }; +struct radv_printf_format { + char *string; + uint32_t divergence_mask; + uint8_t element_sizes[32]; +}; + +struct radv_printf_data { + uint32_t buffer_size; + VkBuffer buffer; + VkDeviceMemory memory; + VkDeviceAddress buffer_addr; + void *data; + struct util_dynarray formats; +}; + +VkResult radv_printf_data_init(struct radv_device *device); + +void radv_printf_data_finish(struct radv_device *device); + +struct radv_printf_buffer_header { + uint32_t offset; + uint32_t size; +}; + +typedef struct nir_builder nir_builder; +typedef struct nir_def nir_def; + +void radv_build_printf(nir_builder *b, nir_def *cond, const char *format, ...); + +void radv_dump_printf_data(struct radv_device *device); + +void radv_device_associate_nir(struct radv_device *device, nir_shader *nir); + struct radv_device { struct vk_device vk; @@ -1092,6 +1125,8 @@ struct radv_device { struct hash_table *rt_handles; simple_mtx_t rt_handles_mtx; + struct radv_printf_data printf; + struct radv_device_cache_key cache_key; blake3_hash cache_hash; }; diff --git a/src/amd/vulkan/radv_queue.c b/src/amd/vulkan/radv_queue.c index 68224501220..97d38627740 100644 --- a/src/amd/vulkan/radv_queue.c +++ b/src/amd/vulkan/radv_queue.c @@ -1679,6 +1679,8 @@ radv_queue_submit_normal(struct radv_queue *queue, struct vk_queue_submit *submi queue->last_shader_upload_seq = MAX2(queue->last_shader_upload_seq, shader_upload_seq); + radv_dump_printf_data(queue->device); + fail: free(cs_array); if (waits != submission->waits) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index fe7eca90528..6bd40256786 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -434,6 +434,8 @@ radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_st free(spec_entries); + radv_device_associate_nir(device, nir); + /* TODO: This can be removed once GCM (which is more general) is used. */ NIR_PASS(_, nir, nir_opt_reuse_constants);