diff --git a/src/amd/common/ac_gpu_info.c b/src/amd/common/ac_gpu_info.c index 4f9997359c6..233e95946ba 100644 --- a/src/amd/common/ac_gpu_info.c +++ b/src/amd/common/ac_gpu_info.c @@ -233,6 +233,74 @@ static bool handle_env_var_force_family(struct radeon_info *info) return true; } +/* Fill info->cu_info (max waves, SGPR and VGPR limits, SIMDs per compute unit) from info->gfx_level and info->family. device_info may be NULL; its num_shader_visible_vgprs is read only when it is non-NULL and info->drm_minor >= 45. */ void +ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_info) +{ + struct ac_cu_info *cu_info = &info->cu_info; + + if (info->gfx_level >= GFX10_3) /* Maximum waves per SIMD, selected by gfx level and chip family. */ + cu_info->max_waves_per_simd = 16; + else if (info->gfx_level == GFX10) + cu_info->max_waves_per_simd = 20; + else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM) + cu_info->max_waves_per_simd = 8; + else + cu_info->max_waves_per_simd = 10; + + if (info->gfx_level >= GFX10) { /* SGPR limits: GFX10 and newer set min, max and granularity all to 108 and scale the per-SIMD total with max_waves_per_simd. */ + cu_info->num_physical_sgprs_per_simd = 108 * cu_info->max_waves_per_simd; + cu_info->min_sgpr_alloc = 108; + cu_info->max_sgpr_alloc = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ + cu_info->sgpr_alloc_granularity = 108; + } else if (info->family == CHIP_TONGA || info->family == CHIP_ICELAND) { + /* SGPRInitBug: Due to a HW bug, we always have to allocate the same amount of SGPRs. */ + cu_info->num_physical_sgprs_per_simd = 800; + cu_info->min_sgpr_alloc = 96; + cu_info->max_sgpr_alloc = 96; + cu_info->sgpr_alloc_granularity = 96; + } else if (info->gfx_level >= GFX8) { /* Remaining chips at GFX8 or newer (below GFX10). */ + cu_info->num_physical_sgprs_per_simd = 800; + cu_info->min_sgpr_alloc = 16; + cu_info->max_sgpr_alloc = 102; + cu_info->sgpr_alloc_granularity = 16; + } else { /* Gfx levels older than GFX8. */ + cu_info->num_physical_sgprs_per_simd = 512; + cu_info->min_sgpr_alloc = 8; + cu_info->max_sgpr_alloc = 104; + cu_info->sgpr_alloc_granularity = 8; + } + + /* Some GPU info was broken before DRM 3.45.0. */ + if (info->drm_minor >= 45 && device_info && device_info->num_shader_visible_vgprs) { /* device_info is optional: NULL or a zero count falls through to the defaults in the else branch. */ + /* The Gfx10 VGPR count is in Wave32, so divide it by 2 for Wave64. + * Gfx6-9 numbers are in Wave64. CDNA also includes Accumulation VGPRs. 
+ */ + if (info->gfx_level >= GFX10 || (info->gfx_level == GFX9 && info->family >= CHIP_MI100)) + cu_info->num_physical_wave64_vgprs_per_simd = device_info->num_shader_visible_vgprs / 2; + else + cu_info->num_physical_wave64_vgprs_per_simd = device_info->num_shader_visible_vgprs; + } else { /* No usable count from device_info: fall back to hard-coded per-chip values. */ + if (info->family == CHIP_NAVI31 || info->family == CHIP_NAVI32 || + info->family == CHIP_STRIX_HALO || info->gfx_level == GFX12) { + cu_info->num_physical_wave64_vgprs_per_simd = 768; + } else if (info->gfx_level >= GFX10) { + cu_info->num_physical_wave64_vgprs_per_simd = 512; + } else { + cu_info->num_physical_wave64_vgprs_per_simd = 256; + } + } + if (info->gfx_level >= GFX10_3) /* Wave64 VGPR allocation granularity: 1/64 of the physical count on GFX10.3 and newer, fixed values otherwise. */ + cu_info->wave64_vgpr_alloc_granularity = cu_info->num_physical_wave64_vgprs_per_simd / 64; + else if (info->gfx_level == GFX9 && info->family >= CHIP_MI200) + cu_info->wave64_vgpr_alloc_granularity = 8; + else + cu_info->wave64_vgpr_alloc_granularity = 4; + cu_info->min_wave64_vgpr_alloc = cu_info->wave64_vgpr_alloc_granularity; /* The minimum allocation is set equal to the granularity. */ + cu_info->max_vgpr_alloc = 256; + + cu_info->num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 
2 : 4; /* 2 SIMDs per compute unit on GFX10 and newer, 4 on older levels. */ +} + enum ac_query_gpu_info_result ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, bool require_pci_bus_info) @@ -1259,39 +1327,6 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, } } - /* Fill ac_cu_info */ - if (info->gfx_level >= GFX10_3) - info->cu_info.max_waves_per_simd = 16; - else if (info->gfx_level == GFX10) - info->cu_info.max_waves_per_simd = 20; - else if (info->family >= CHIP_POLARIS10 && info->family <= CHIP_VEGAM) - info->cu_info.max_waves_per_simd = 8; - else - info->cu_info.max_waves_per_simd = 10; - - if (info->gfx_level >= GFX10) { - info->cu_info.num_physical_sgprs_per_simd = 108 * info->cu_info.max_waves_per_simd; - info->cu_info.min_sgpr_alloc = 108; - info->cu_info.max_sgpr_alloc = 108; /* includes VCC, which can be treated as s[106-107] on GFX10+ */ - info->cu_info.sgpr_alloc_granularity = 108; - } else if (info->family == CHIP_TONGA || info->family == CHIP_ICELAND) { - /* SGPRInitBug: Due to a HW bug, we always have to allocate the same amount of SGPRs. */ - info->cu_info.num_physical_sgprs_per_simd = 800; - info->cu_info.min_sgpr_alloc = 96; - info->cu_info.max_sgpr_alloc = 96; - info->cu_info.sgpr_alloc_granularity = 96; - } else if (info->gfx_level >= GFX8) { - info->cu_info.num_physical_sgprs_per_simd = 800; - info->cu_info.min_sgpr_alloc = 16; - info->cu_info.max_sgpr_alloc = 102; - info->cu_info.sgpr_alloc_granularity = 16; - } else { - info->cu_info.num_physical_sgprs_per_simd = 512; - info->cu_info.min_sgpr_alloc = 8; - info->cu_info.max_sgpr_alloc = 104; - info->cu_info.sgpr_alloc_granularity = 8; - } - info->has_3d_cube_border_color_mipmap = info->has_graphics || info->family == CHIP_MI100; info->has_image_opcodes = debug_get_bool_option("AMD_IMAGE_OPCODES", info->has_graphics || info->family < CHIP_GFX940); @@ -1308,35 +1343,7 @@ ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, /* On GFX10.3, the polarity of AUTO_FLUSH_MODE is inverted. 
*/ info->has_sqtt_auto_flush_mode_bug = info->gfx_level == GFX10_3; - /* Some GPU info was broken before DRM 3.45.0. */ - if (info->drm_minor >= 45 && device_info.num_shader_visible_vgprs) { - /* The Gfx10 VGPR count is in Wave32, so divide it by 2 for Wave64. - * Gfx6-9 numbers are in Wave64. CDNA also includes Accumulation VGPRs. - */ - if (info->gfx_level >= GFX10 || (info->gfx_level == GFX9 && info->family >= CHIP_MI100)) - info->cu_info.num_physical_wave64_vgprs_per_simd = device_info.num_shader_visible_vgprs / 2; - else - info->cu_info.num_physical_wave64_vgprs_per_simd = device_info.num_shader_visible_vgprs; - } else { - if (info->family == CHIP_NAVI31 || info->family == CHIP_NAVI32 || - info->family == CHIP_STRIX_HALO || info->gfx_level == GFX12) { - info->cu_info.num_physical_wave64_vgprs_per_simd = 768; - } else if (info->gfx_level >= GFX10) { - info->cu_info.num_physical_wave64_vgprs_per_simd = 512; - } else { - info->cu_info.num_physical_wave64_vgprs_per_simd = 256; - } - } - if (info->gfx_level >= GFX10_3) - info->cu_info.wave64_vgpr_alloc_granularity = info->cu_info.num_physical_wave64_vgprs_per_simd / 64; - else if (info->gfx_level == GFX9 && info->family >= CHIP_MI200) - info->cu_info.wave64_vgpr_alloc_granularity = 8; - else - info->cu_info.wave64_vgpr_alloc_granularity = 4; - info->cu_info.min_wave64_vgpr_alloc = info->cu_info.wave64_vgpr_alloc_granularity; - info->cu_info.max_vgpr_alloc = 256; - - info->cu_info.num_simd_per_compute_unit = info->gfx_level >= GFX10 ? 2 : 4; + ac_fill_cu_info(info, &device_info); /* BIG_PAGE is supported since gfx10.3 and requires VRAM. VRAM is only guaranteed * with AMDGPU_GEM_CREATE_DISCARDABLE. DISCARDABLE was added in DRM 3.47.0. 
diff --git a/src/amd/common/ac_gpu_info.h b/src/amd/common/ac_gpu_info.h index 4f71c02f954..6687b9be569 100644 --- a/src/amd/common/ac_gpu_info.h +++ b/src/amd/common/ac_gpu_info.h @@ -20,6 +20,7 @@ extern "C" { #define AMD_MAX_WGP 60 struct amdgpu_gpu_info; +struct drm_amdgpu_info_device; /* Forward declaration for the pointer parameter of ac_fill_cu_info(). */ struct amd_ip_info { uint8_t ver_major; @@ -364,6 +365,7 @@ enum ac_query_gpu_info_result { enum ac_query_gpu_info_result ac_query_gpu_info(int fd, void *dev_p, struct radeon_info *info, bool require_pci_bus_info); +void ac_fill_cu_info(struct radeon_info *info, struct drm_amdgpu_info_device *device_info); /* Fills info->cu_info; device_info may be NULL. */ void ac_compute_driver_uuid(char *uuid, size_t size); diff --git a/src/amd/common/ac_null_device.c b/src/amd/common/ac_null_device.c index 597fd39c02e..d299f905bf6 100644 --- a/src/amd/common/ac_null_device.c +++ b/src/amd/common/ac_null_device.c @@ -46,21 +46,6 @@ ac_null_device_create(struct radeon_info *gpu_info, const char *family) gpu_info->pci_id = pci_ids[gpu_info->family].pci_id; gpu_info->max_se = pci_ids[gpu_info->family].has_dedicated_vram ? 
4 : 1; gpu_info->num_se = gpu_info->max_se; - if (gpu_info->gfx_level >= GFX10_3) - gpu_info->cu_info.max_waves_per_simd = 16; - else if (gpu_info->gfx_level >= GFX10) - gpu_info->cu_info.max_waves_per_simd = 20; - else if (gpu_info->family >= CHIP_POLARIS10 && gpu_info->family <= CHIP_VEGAM) - gpu_info->cu_info.max_waves_per_simd = 8; - else - gpu_info->cu_info.max_waves_per_simd = 10; - - if (gpu_info->gfx_level >= GFX10) - gpu_info->cu_info.num_physical_sgprs_per_simd = 128 * gpu_info->cu_info.max_waves_per_simd; - else if (gpu_info->gfx_level >= GFX8) - gpu_info->cu_info.num_physical_sgprs_per_simd = 800; - else - gpu_info->cu_info.num_physical_sgprs_per_simd = 512; gpu_info->has_timeline_syncobj = true; gpu_info->has_vm_always_valid = true; @@ -71,13 +56,6 @@ ac_null_device_create(struct radeon_info *gpu_info, const char *family) gpu_info->has_ngg_fully_culled_bug = gpu_info->gfx_level == GFX10; gpu_info->has_ngg_passthru_no_msg = gpu_info->family >= CHIP_NAVI23; - if (gpu_info->family == CHIP_NAVI31 || gpu_info->family == CHIP_NAVI32 || gpu_info->gfx_level >= GFX12) - gpu_info->cu_info.num_physical_wave64_vgprs_per_simd = 768; - else if (gpu_info->gfx_level >= GFX10) - gpu_info->cu_info.num_physical_wave64_vgprs_per_simd = 512; - else - gpu_info->cu_info.num_physical_wave64_vgprs_per_simd = 256; - gpu_info->cu_info.num_simd_per_compute_unit = gpu_info->gfx_level >= GFX10 ? 2 : 4; gpu_info->lds_size_per_workgroup = gpu_info->gfx_level >= GFX7 ? 
64 * 1024 : 32 * 1024; gpu_info->max_render_backends = pci_ids[gpu_info->family].num_render_backends; @@ -114,6 +92,9 @@ ac_null_device_create(struct radeon_info *gpu_info, const char *family) gpu_info->ip[AMD_IP_GFX].num_queues = 1; gpu_info->gart_page_size = 4096; + + ac_fill_cu_info(gpu_info, NULL); + gpu_info->family_overridden = true; return true; diff --git a/src/gallium/winsys/radeon/drm/meson.build b/src/gallium/winsys/radeon/drm/meson.build index 90fa00e8082..04ef74c4d56 100644 --- a/src/gallium/winsys/radeon/drm/meson.build +++ b/src/gallium/winsys/radeon/drm/meson.build @@ -3,10 +3,12 @@ libradeonwinsys_deps = [idep_mesautil, dep_libdrm] libradeonwinsys_c_args = [] +amd_common_libs = [] if with_gallium_radeonsi libradeonwinsys_deps += [idep_amdgfxregs_h] libradeonwinsys_c_args = ['-DHAVE_GALLIUM_RADEONSI'] + amd_common_libs += [libamd_common] endif libradeonwinsys = static_library( @@ -22,6 +24,7 @@ libradeonwinsys = static_library( 'radeon_surface.h'), include_directories : [inc_src, inc_include, inc_gallium, inc_gallium_aux], gnu_symbol_visibility : 'hidden', + link_with : amd_common_libs, c_args : libradeonwinsys_c_args, dependencies : libradeonwinsys_deps, ) diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c index c812de6cbe5..242869fd94a 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c @@ -632,9 +632,6 @@ static bool do_winsys_init(struct radeon_drm_winsys *ws) ws->info.max_alignment = 1024*1024; ws->info.has_graphics = true; ws->info.cpdma_prefetch_writes_memory = true; - ws->info.cu_info.max_waves_per_simd = 10; - ws->info.cu_info.num_physical_sgprs_per_simd = 512; - ws->info.cu_info.num_physical_wave64_vgprs_per_simd = 256; ws->info.has_3d_cube_border_color_mipmap = true; ws->info.has_image_opcodes = true; ws->info.spi_cu_en_has_effect = false; @@ -644,15 +641,12 @@ static bool do_winsys_init(struct 
radeon_drm_winsys *ws) ws->info.max_gflops = 128 * ws->info.num_cu * ws->info.max_gpu_freq_mhz / 1000; ws->info.num_tcc_blocks = ws->info.max_tcc_blocks; ws->info.tcp_cache_size = 16 * 1024; - ws->info.cu_info.num_simd_per_compute_unit = 4; - ws->info.cu_info.min_sgpr_alloc = 8; - ws->info.cu_info.max_sgpr_alloc = 104; - ws->info.cu_info.sgpr_alloc_granularity = 8; - ws->info.cu_info.min_wave64_vgpr_alloc = 4; - ws->info.cu_info.max_vgpr_alloc = 256; - ws->info.cu_info.wave64_vgpr_alloc_granularity = 4; ws->info.lds_size_per_workgroup = ws->info.gfx_level == GFX7 ? 64 * 1024 : 32 * 1024; +#ifdef HAVE_GALLIUM_RADEONSI + ac_fill_cu_info(&ws->info, NULL); +#endif + for (unsigned se = 0; se < ws->info.max_se; se++) { for (unsigned sa = 0; sa < ws->info.max_sa_per_se; sa++) ws->info.cu_mask[se][sa] = BITFIELD_MASK(ws->info.max_good_cu_per_sa);