Merge remote-tracking branch 'public/master' into vulkan

This commit is contained in:
Jason Ekstrand
2016-03-24 17:30:14 -07:00
266 changed files with 4907 additions and 3626 deletions
+5 -16
View File
@@ -704,8 +704,10 @@ test "x$enable_asm" = xno && AC_MSG_RESULT([no])
if test "x$enable_asm" = xyes -a "x$cross_compiling" = xyes; then
case "$host_cpu" in
i?86 | x86_64 | amd64)
enable_asm=no
AC_MSG_RESULT([no, cross compiling])
if test "x$host_cpu" != "x$target_cpu"; then
enable_asm=no
AC_MSG_RESULT([no, cross compiling])
fi
;;
esac
fi
@@ -929,12 +931,6 @@ AC_ARG_ENABLE([xlib-glx],
[enable_xlib_glx="$enableval"],
[enable_xlib_glx=no])
AC_ARG_ENABLE([r600-llvm-compiler],
[AS_HELP_STRING([--enable-r600-llvm-compiler],
[Enable experimental LLVM backend for graphics shaders @<:@default=disabled@:>@])],
[enable_r600_llvm="$enableval"],
[enable_r600_llvm=no])
AC_ARG_ENABLE([gallium-tests],
[AS_HELP_STRING([--enable-gallium-tests],
[Enable optional Gallium tests) @<:@default=disabled@:>@])],
@@ -2238,14 +2234,8 @@ if test -n "$with_gallium_drivers"; then
PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
gallium_require_drm "Gallium R600"
gallium_require_drm_loader
if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
radeon_llvm_check "r600g"
LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
fi
if test "x$enable_r600_llvm" = xyes; then
USE_R600_LLVM_COMPILER=yes;
fi
if test "x$enable_opencl" = xyes; then
radeon_llvm_check "r600g"
LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
fi
;;
@@ -2416,7 +2406,6 @@ AM_CONDITIONAL(NEED_RADEON_DRM_WINSYS, test "x$HAVE_GALLIUM_R300" = xyes -o \
"x$HAVE_GALLIUM_RADEONSI" = xyes)
AM_CONDITIONAL(NEED_WINSYS_XLIB, test "x$NEED_WINSYS_XLIB" = xyes)
AM_CONDITIONAL(NEED_RADEON_LLVM, test x$NEED_RADEON_LLVM = xyes)
AM_CONDITIONAL(USE_R600_LLVM_COMPILER, test x$USE_R600_LLVM_COMPILER = xyes)
AM_CONDITIONAL(HAVE_GALLIUM_COMPUTE, test x$enable_opencl = xyes)
AM_CONDITIONAL(HAVE_MESA_LLVM, test x$MESA_LLVM = x1)
AM_CONDITIONAL(USE_VC4_SIMULATOR, test x$USE_VC4_SIMULATOR = xyes)
+184 -169
View File
@@ -1,13 +1,28 @@
# Status of OpenGL extensions in Mesa
Status of OpenGL 3.x features in Mesa
Here's how to read this file:
all DONE: <driver>, ...
All the extensions are done for the given list of drivers.
Note: when an item is marked as "DONE" it means all the core Mesa
infrastructure is complete but it may be the case that few (if any) drivers
implement the features.
DONE
The extension is done for Mesa and no implementation is necessary on the
driver-side.
DONE ()
The extension is done for Mesa and all the drivers in the "all DONE" list.
OpenGL Core and Compatibility context support
DONE (<driver>, ...)
The extension is done for Mesa, all the drivers in the "all DONE" list, and
all the drivers in the brackets.
in progress
The extension is started but not finished yet.
not started
The extension isn't started yet.
# OpenGL Core and Compatibility context support
OpenGL 3.1 and later versions are only supported with the Core profile.
There are no plans to support GL_ARB_compatibility. The last supported OpenGL
@@ -15,30 +30,30 @@ version with all deprecated features is 3.0. Some of the later GL features
are exposed in the 3.0 context as extensions.
Feature Status
----------------------------------------------------- ------------------------
Feature Status
------------------------------------------------------- ------------------------
GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
glBindFragDataLocation, glGetFragDataLocation DONE
Conditional rendering (GL_NV_conditional_render) DONE ()
Map buffer subranges (GL_ARB_map_buffer_range) DONE ()
Clamping controls (GL_ARB_color_buffer_float) DONE ()
Float textures, renderbuffers (GL_ARB_texture_float) DONE ()
GL_NV_conditional_render (Conditional rendering) DONE ()
GL_ARB_map_buffer_range (Map buffer subranges) DONE ()
GL_ARB_color_buffer_float (Clamping controls) DONE ()
GL_ARB_texture_float (Float textures, renderbuffers) DONE ()
GL_EXT_packed_float DONE ()
GL_EXT_texture_shared_exponent DONE ()
Float depth buffers (GL_ARB_depth_buffer_float) DONE ()
Framebuffer objects (GL_ARB_framebuffer_object) DONE ()
GL_ARB_depth_buffer_float (Float depth buffers) DONE ()
GL_ARB_framebuffer_object (Framebuffer objects) DONE ()
GL_ARB_half_float_pixel DONE (all drivers)
GL_ARB_half_float_vertex DONE ()
GL_EXT_texture_integer DONE ()
GL_EXT_texture_array DONE ()
Per-buffer blend and masks (GL_EXT_draw_buffers2) DONE ()
GL_EXT_draw_buffers2 (Per-buffer blend and masks) DONE ()
GL_EXT_texture_compression_rgtc DONE ()
GL_ARB_texture_rg DONE ()
Transform feedback (GL_EXT_transform_feedback) DONE ()
Vertex array objects (GL_ARB_vertex_array_object) DONE ()
sRGB framebuffer format (GL_EXT_framebuffer_sRGB) DONE ()
GL_EXT_transform_feedback (Transform feedback) DONE ()
GL_ARB_vertex_array_object (Vertex array objects) DONE ()
GL_EXT_framebuffer_sRGB (sRGB framebuffer format) DONE ()
glClearBuffer commands DONE
glGetStringi command DONE
glTexParameterI, glGetTexParameterI commands DONE
@@ -53,28 +68,28 @@ GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
GL 3.1, GLSL 1.40 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
Forward compatible context support/deprecations DONE ()
Instanced drawing (GL_ARB_draw_instanced) DONE ()
Buffer copying (GL_ARB_copy_buffer) DONE ()
Primitive restart (GL_NV_primitive_restart) DONE ()
GL_ARB_draw_instanced (Instanced drawing) DONE ()
GL_ARB_copy_buffer (Buffer copying) DONE ()
GL_NV_primitive_restart (Primitive restart) DONE ()
16 vertex texture image units DONE ()
Texture buffer objs (GL_ARB_texture_buffer_object) DONE for OpenGL 3.1 contexts ()
Rectangular textures (GL_ARB_texture_rectangle) DONE ()
Uniform buffer objs (GL_ARB_uniform_buffer_object) DONE ()
Signed normalized textures (GL_EXT_texture_snorm) DONE ()
GL_ARB_texture_buffer_object (Texture buffer objs) DONE (for OpenGL 3.1 contexts)
GL_ARB_texture_rectangle (Rectangular textures) DONE ()
GL_ARB_uniform_buffer_object (Uniform buffer objs) DONE ()
GL_EXT_texture_snorm (Signed normalized textures) DONE ()
GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
Core/compatibility profiles DONE
Geometry shaders DONE ()
BGRA vertex order (GL_ARB_vertex_array_bgra) DONE ()
Base vertex offset(GL_ARB_draw_elements_base_vertex) DONE ()
Frag shader coord (GL_ARB_fragment_coord_conventions) DONE ()
Provoking vertex (GL_ARB_provoking_vertex) DONE ()
Seamless cubemaps (GL_ARB_seamless_cube_map) DONE ()
Multisample textures (GL_ARB_texture_multisample) DONE ()
Frag depth clamp (GL_ARB_depth_clamp) DONE ()
Fence objects (GL_ARB_sync) DONE ()
GL_ARB_vertex_array_bgra (BGRA vertex order) DONE ()
GL_ARB_draw_elements_base_vertex (Base vertex offset) DONE ()
GL_ARB_fragment_coord_conventions (Frag shader coord) DONE ()
GL_ARB_provoking_vertex (Provoking vertex) DONE ()
GL_ARB_seamless_cube_map (Seamless cubemaps) DONE ()
GL_ARB_texture_multisample (Multisample textures) DONE ()
GL_ARB_depth_clamp (Frag depth clamp) DONE ()
GL_ARB_sync (Fence objects) DONE ()
GLX_ARB_create_context_profile DONE
@@ -94,170 +109,170 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
GL 4.0, GLSL 4.00 --- all DONE: nvc0, r600, radeonsi
GL_ARB_draw_buffers_blend DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_draw_indirect DONE (i965, llvmpipe, softpipe)
GL_ARB_gpu_shader5 DONE (i965)
- 'precise' qualifier DONE
- Dynamically uniform sampler array indices DONE (softpipe)
- Dynamically uniform UBO array indices DONE ()
- Implicit signed -> unsigned conversions DONE
- Fused multiply-add DONE ()
- Packing/bitfield/conversion functions DONE (softpipe)
- Enhanced textureGather DONE (softpipe)
- Geometry shader instancing DONE (llvmpipe, softpipe)
- Geometry shader multiple streams DONE ()
- Enhanced per-sample shading DONE ()
- Interpolation functions DONE ()
- New overload resolution rules DONE
GL_ARB_gpu_shader_fp64 DONE (llvmpipe, softpipe)
GL_ARB_sample_shading DONE (i965, nv50)
GL_ARB_shader_subroutine DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_tessellation_shader DONE (i965)
GL_ARB_texture_buffer_object_rgb32 DONE (i965, llvmpipe, softpipe)
GL_ARB_texture_cube_map_array DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_texture_gather DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_texture_query_lod DONE (i965, nv50, softpipe)
GL_ARB_transform_feedback2 DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_transform_feedback3 DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_draw_buffers_blend DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_draw_indirect DONE (i965, llvmpipe, softpipe)
GL_ARB_gpu_shader5 DONE (i965)
- 'precise' qualifier DONE
- Dynamically uniform sampler array indices DONE (softpipe)
- Dynamically uniform UBO array indices DONE ()
- Implicit signed -> unsigned conversions DONE
- Fused multiply-add DONE ()
- Packing/bitfield/conversion functions DONE (softpipe)
- Enhanced textureGather DONE (softpipe)
- Geometry shader instancing DONE (llvmpipe, softpipe)
- Geometry shader multiple streams DONE ()
- Enhanced per-sample shading DONE ()
- Interpolation functions DONE ()
- New overload resolution rules DONE
GL_ARB_gpu_shader_fp64 DONE (llvmpipe, softpipe)
GL_ARB_sample_shading DONE (i965, nv50)
GL_ARB_shader_subroutine DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_tessellation_shader DONE (i965)
GL_ARB_texture_buffer_object_rgb32 DONE (i965, llvmpipe, softpipe)
GL_ARB_texture_cube_map_array DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_texture_gather DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_texture_query_lod DONE (i965, nv50, softpipe)
GL_ARB_transform_feedback2 DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_transform_feedback3 DONE (i965, nv50, llvmpipe, softpipe)
GL 4.1, GLSL 4.10 --- all DONE: nvc0, r600, radeonsi
GL_ARB_ES2_compatibility DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_get_program_binary DONE (0 binary formats)
GL_ARB_separate_shader_objects DONE (all drivers)
GL_ARB_shader_precision DONE (all drivers that support GLSL 4.10)
GL_ARB_vertex_attrib_64bit DONE (llvmpipe, softpipe)
GL_ARB_viewport_array DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_ES2_compatibility DONE (i965, nv50, llvmpipe, softpipe)
GL_ARB_get_program_binary DONE (0 binary formats)
GL_ARB_separate_shader_objects DONE (all drivers)
GL_ARB_shader_precision DONE (all drivers that support GLSL 4.10)
GL_ARB_vertex_attrib_64bit DONE (llvmpipe, softpipe)
GL_ARB_viewport_array DONE (i965, nv50, llvmpipe, softpipe)
GL 4.2, GLSL 4.20:
GL_ARB_texture_compression_bptc DONE (i965, nvc0, r600, radeonsi)
GL_ARB_compressed_texture_pixel_storage DONE (all drivers)
GL_ARB_shader_atomic_counters DONE (i965, nvc0)
GL_ARB_texture_storage DONE (all drivers)
GL_ARB_transform_feedback_instanced DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_base_instance DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_shader_image_load_store DONE (i965)
GL_ARB_conservative_depth DONE (all drivers that support GLSL 1.30)
GL_ARB_shading_language_420pack DONE (all drivers that support GLSL 1.30)
GL_ARB_shading_language_packing DONE (all drivers)
GL_ARB_internalformat_query DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_map_buffer_alignment DONE (all drivers)
GL_ARB_texture_compression_bptc DONE (i965, nvc0, r600, radeonsi)
GL_ARB_compressed_texture_pixel_storage DONE (all drivers)
GL_ARB_shader_atomic_counters DONE (i965, nvc0)
GL_ARB_texture_storage DONE (all drivers)
GL_ARB_transform_feedback_instanced DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_base_instance DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_shader_image_load_store DONE (i965, radeonsi)
GL_ARB_conservative_depth DONE (all drivers that support GLSL 1.30)
GL_ARB_shading_language_420pack DONE (all drivers that support GLSL 1.30)
GL_ARB_shading_language_packing DONE (all drivers)
GL_ARB_internalformat_query DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_map_buffer_alignment DONE (all drivers)
GL 4.3, GLSL 4.30:
GL_ARB_arrays_of_arrays DONE (all drivers that support GLSL 1.30)
GL_ARB_ES3_compatibility DONE (all drivers that support GLSL 3.30)
GL_ARB_clear_buffer_object DONE (all drivers)
GL_ARB_compute_shader DONE (i965)
GL_ARB_copy_image DONE (i965, nv50, nvc0, r600, radeonsi)
GL_KHR_debug DONE (all drivers)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)
GL_ARB_fragment_layer_viewport DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
GL_ARB_framebuffer_no_attachments DONE (i965)
GL_ARB_internalformat_query2 DONE (i965)
GL_ARB_invalidate_subdata DONE (all drivers)
GL_ARB_multi_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_program_interface_query DONE (all drivers)
GL_ARB_robust_buffer_access_behavior not started
GL_ARB_shader_image_size DONE (i965)
GL_ARB_shader_storage_buffer_object DONE (i965, nvc0)
GL_ARB_stencil_texturing DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_buffer_range DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
GL_ARB_texture_query_levels DONE (all drivers that support GLSL 1.30)
GL_ARB_texture_storage_multisample DONE (all drivers that support GL_ARB_texture_multisample)
GL_ARB_texture_view DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_attrib_binding DONE (all drivers)
GL_ARB_arrays_of_arrays DONE (all drivers that support GLSL 1.30)
GL_ARB_ES3_compatibility DONE (all drivers that support GLSL 3.30)
GL_ARB_clear_buffer_object DONE (all drivers)
GL_ARB_compute_shader DONE (i965)
GL_ARB_copy_image DONE (i965, nv50, nvc0, r600, radeonsi)
GL_KHR_debug DONE (all drivers)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)
GL_ARB_fragment_layer_viewport DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
GL_ARB_framebuffer_no_attachments DONE (i965)
GL_ARB_internalformat_query2 DONE (all drivers)
GL_ARB_invalidate_subdata DONE (all drivers)
GL_ARB_multi_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_program_interface_query DONE (all drivers)
GL_ARB_robust_buffer_access_behavior not started
GL_ARB_shader_image_size DONE (i965, radeonsi)
GL_ARB_shader_storage_buffer_object DONE (i965, nvc0)
GL_ARB_stencil_texturing DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_buffer_range DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
GL_ARB_texture_query_levels DONE (all drivers that support GLSL 1.30)
GL_ARB_texture_storage_multisample DONE (all drivers that support GL_ARB_texture_multisample)
GL_ARB_texture_view DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_attrib_binding DONE (all drivers)
GL 4.4, GLSL 4.40:
GL_MAX_VERTEX_ATTRIB_STRIDE DONE (all drivers)
GL_ARB_buffer_storage DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_clear_texture DONE (i965, nv50, nvc0)
GL_ARB_enhanced_layouts in progress (Timothy)
- compile-time constant expressions DONE
- explicit byte offsets for blocks DONE
- forced alignment within blocks DONE
- specified vec4-slot component numbers in progress
- specified transform/feedback layout in progress
- input/output block locations DONE
GL_ARB_multi_bind DONE (all drivers)
GL_ARB_query_buffer_object DONE (nvc0)
GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_stencil8 DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_type_10f_11f_11f_rev DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_MAX_VERTEX_ATTRIB_STRIDE DONE (all drivers)
GL_ARB_buffer_storage DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_clear_texture DONE (i965, nv50, nvc0)
GL_ARB_enhanced_layouts in progress (Timothy)
- compile-time constant expressions DONE
- explicit byte offsets for blocks DONE
- forced alignment within blocks DONE
- specified vec4-slot component numbers in progress
- specified transform/feedback layout in progress
- input/output block locations DONE
GL_ARB_multi_bind DONE (all drivers)
GL_ARB_query_buffer_object DONE (nvc0)
GL_ARB_texture_mirror_clamp_to_edge DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_stencil8 DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_vertex_type_10f_11f_11f_rev DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL 4.5, GLSL 4.50:
GL_ARB_ES3_1_compatibility not started
GL_ARB_clip_control DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_conditional_render_inverted DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_cull_distance in progress (Tobias)
GL_ARB_derivative_control DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_direct_state_access DONE (all drivers)
GL_ARB_get_texture_sub_image DONE (all drivers)
GL_ARB_shader_texture_image_samples DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_texture_barrier DONE (i965, nv50, nvc0, r600, radeonsi)
GL_KHR_context_flush_control DONE (all - but needs GLX/EGL extension to be useful)
GL_KHR_robust_buffer_access_behavior not started
GL_KHR_robustness 90% done (the ARB variant)
GL_EXT_shader_integer_mix DONE (all drivers that support GLSL)
GL_ARB_ES3_1_compatibility not started
GL_ARB_clip_control DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_conditional_render_inverted DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_cull_distance in progress (Tobias)
GL_ARB_derivative_control DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_direct_state_access DONE (all drivers)
GL_ARB_get_texture_sub_image DONE (all drivers)
GL_ARB_shader_texture_image_samples DONE (i965, nv50, nvc0, r600, radeonsi)
GL_ARB_texture_barrier DONE (i965, nv50, nvc0, r600, radeonsi)
GL_KHR_context_flush_control DONE (all - but needs GLX/EGL extension to be useful)
GL_KHR_robust_buffer_access_behavior not started
GL_KHR_robustness not started (90% done with the ARB variant)
GL_EXT_shader_integer_mix DONE (all drivers that support GLSL)
These are the extensions cherry-picked to make GLES 3.1
GLES3.1, GLSL ES 3.1
GL_ARB_arrays_of_arrays DONE (all drivers that support GLSL 1.30)
GL_ARB_compute_shader DONE (i965)
GL_ARB_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)
GL_ARB_framebuffer_no_attachments DONE (i965)
GL_ARB_program_interface_query DONE (all drivers)
GL_ARB_shader_atomic_counters DONE (i965, nvc0)
GL_ARB_shader_image_load_store DONE (i965)
GL_ARB_shader_image_size DONE (i965)
GL_ARB_shader_storage_buffer_object DONE (i965, nvc0)
GL_ARB_shading_language_packing DONE (all drivers)
GL_ARB_separate_shader_objects DONE (all drivers)
GL_ARB_stencil_texturing DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
Multisample textures (GL_ARB_texture_multisample) DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_storage_multisample DONE (all drivers that support GL_ARB_texture_multisample)
GL_ARB_vertex_attrib_binding DONE (all drivers)
GS5 Enhanced textureGather DONE (i965, nvc0, r600, radeonsi)
GS5 Packing/bitfield/conversion functions DONE (i965, nvc0, r600, radeonsi)
GL_EXT_shader_integer_mix DONE (all drivers that support GLSL)
GL_ARB_arrays_of_arrays DONE (all drivers that support GLSL 1.30)
GL_ARB_compute_shader DONE (i965)
GL_ARB_draw_indirect DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_explicit_uniform_location DONE (all drivers that support GLSL)
GL_ARB_framebuffer_no_attachments DONE (i965)
GL_ARB_program_interface_query DONE (all drivers)
GL_ARB_shader_atomic_counters DONE (i965, nvc0)
GL_ARB_shader_image_load_store DONE (i965)
GL_ARB_shader_image_size DONE (i965)
GL_ARB_shader_storage_buffer_object DONE (i965, nvc0)
GL_ARB_shading_language_packing DONE (all drivers)
GL_ARB_separate_shader_objects DONE (all drivers)
GL_ARB_stencil_texturing DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_multisample (Multisample textures) DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
GL_ARB_texture_storage_multisample DONE (all drivers that support GL_ARB_texture_multisample)
GL_ARB_vertex_attrib_binding DONE (all drivers)
GS5 Enhanced textureGather DONE (i965, nvc0, r600, radeonsi)
GS5 Packing/bitfield/conversion functions DONE (i965, nvc0, r600, radeonsi)
GL_EXT_shader_integer_mix DONE (all drivers that support GLSL)
Additional functionality not covered above:
glMemoryBarrierByRegion DONE
glGetTexLevelParameter[fi]v - needs updates DONE
glMemoryBarrierByRegion DONE
glGetTexLevelParameter[fi]v - needs updates DONE
glGetBooleani_v - restrict to GLES enums
gl_HelperInvocation support DONE (i965, nvc0, r600)
gl_HelperInvocation support DONE (i965, nvc0, r600)
GLES3.2, GLSL ES 3.2
GL_EXT_color_buffer_float DONE (all drivers)
GL_KHR_blend_equation_advanced not started
GL_KHR_debug DONE (all drivers)
GL_KHR_robustness 90% done (the ARB variant)
GL_KHR_texture_compression_astc_ldr DONE (i965/gen9+)
GL_OES_copy_image not started (based on GL_ARB_copy_image, which is done for some drivers)
GL_OES_draw_buffers_indexed not started
GL_OES_draw_elements_base_vertex DONE (all drivers)
GL_OES_geometry_shader started (Marta)
GL_OES_gpu_shader5 DONE (all drivers that support GL_ARB_gpu_shader5)
GL_OES_primitive_bounding_box not started
GL_OES_sample_shading not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
GL_OES_sample_variables not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
GL_OES_shader_image_atomic DONE (all drivers that support GL_ARB_shader_image_load_store)
GL_OES_shader_io_blocks not started (based on parts of GLSL 1.50, which is done)
GL_OES_shader_multisample_interpolation not started (based on parts of GL_ARB_gpu_shader5, which is done)
GL_OES_tessellation_shader not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
GL_OES_texture_border_clamp DONE (all drivers)
GL_OES_texture_buffer not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
GL_OES_texture_cube_map_array not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
GL_OES_texture_stencil8 DONE (all drivers that support GL_ARB_texture_stencil8)
GL_OES_texture_storage_multisample_2d_array DONE (all drivers that support GL_ARB_texture_multisample)
GL_EXT_color_buffer_float DONE (all drivers)
GL_KHR_blend_equation_advanced not started
GL_KHR_debug DONE (all drivers)
GL_KHR_robustness not started (90% done with the ARB variant)
GL_KHR_texture_compression_astc_ldr DONE (i965/gen9+)
GL_OES_copy_image not started (based on GL_ARB_copy_image, which is done for some drivers)
GL_OES_draw_buffers_indexed not started
GL_OES_draw_elements_base_vertex DONE (all drivers)
GL_OES_geometry_shader started (Marta)
GL_OES_gpu_shader5 DONE (all drivers that support GL_ARB_gpu_shader5)
GL_OES_primitive_bounding_box not started
GL_OES_sample_shading not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
GL_OES_sample_variables not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
GL_OES_shader_image_atomic DONE (all drivers that support GL_ARB_shader_image_load_store)
GL_OES_shader_io_blocks not started (based on parts of GLSL 1.50, which is done)
GL_OES_shader_multisample_interpolation not started (based on parts of GL_ARB_gpu_shader5, which is done)
GL_OES_tessellation_shader not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
GL_OES_texture_border_clamp DONE (all drivers)
GL_OES_texture_buffer not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
GL_OES_texture_cube_map_array not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
GL_OES_texture_stencil8 DONE (all drivers that support GL_ARB_texture_stencil8)
GL_OES_texture_storage_multisample_2d_array DONE (all drivers that support GL_ARB_texture_multisample)
More info about these features and the work involved can be found at
http://dri.freedesktop.org/wiki/MissingFunctionality
+3
View File
@@ -163,6 +163,9 @@ See the <a href="xlibdriver.html">Xlib software driver page</a> for details.
<li>blorp - emit messages about the blorp operations (blits &amp; clears)</li>
<li>nodualobj - suppress generation of dual-object geometry shader code</li>
<li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
<li>vec4 - force vec4 mode in vertex shader</li>
<li>spill_fs - force spilling of all registers in the scalar backend (useful to debug spilling code)</li>
<li>spill_vec4 - force spilling of all registers in the vec4 backend (useful to debug spilling code)</li>
</ul>
</ul>
+1 -2
View File
@@ -73,8 +73,7 @@ The following are required for DRI-based hardware acceleration with Mesa:
<ul>
<li><a href="http://xorg.freedesktop.org/releases/individual/proto/">
dri2proto</a> version 2.6 or later
<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a>
version 2.4.33 or later
<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a> latest version
<li>Xorg server version 1.5 or later
<li>Linux 2.6.28 or later
</ul>
+3 -1
View File
@@ -44,8 +44,10 @@ Note: some of the new features are only available with certain drivers.
</p>
<ul>
<li>GL_ARB_internalformat_query2 on i965</li>
<li>GL_ARB_internalformat_query2 on all drivers</li>
<li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
<li>GL_ARB_shader_image_load_store on radeonsi</li>
<li>GL_ARB_shader_image_size on radeonsi</li>
<li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
<li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
</ul>
+1
View File
@@ -129,6 +129,7 @@ LIBGLSL_FILES = \
glsl/opt_tree_grafting.cpp \
glsl/opt_vectorize.cpp \
glsl/program.h \
glsl/propagate_invariance.cpp \
glsl/s_expression.cpp \
glsl/s_expression.h
+1
View File
@@ -217,6 +217,7 @@ LIBGLSL_FILES = \
opt_tree_grafting.cpp \
opt_vectorize.cpp \
program.h \
propagate_invariance.cpp \
s_expression.cpp \
s_expression.h
+3 -1
View File
@@ -2125,7 +2125,9 @@ process_array_size(exec_node *node,
}
ir_constant *const size = ir->constant_expression_value();
if (size == NULL || array_size->has_sequence_subexpression()) {
if (size == NULL ||
(state->is_version(120, 300) &&
array_size->has_sequence_subexpression())) {
_mesa_glsl_error(& loc, state, "array size must be a "
"constant valued expression");
return 0;
+1
View File
@@ -1887,6 +1887,7 @@ do_common_optimization(exec_list *ir, bool linked,
OPT(do_dead_functions, ir);
OPT(do_structure_splitting, ir);
}
propagate_invariance(ir);
OPT(do_if_simplification, ir);
OPT(opt_flatten_nested_if_blocks, ir);
OPT(opt_conditional_discard, ir);
+7
View File
@@ -719,6 +719,13 @@ public:
*/
unsigned is_unmatched_generic_inout:1;
/**
* Is this varying used only by transform feedback?
*
* This is used by the linker to decide if it's safe to pack the varying.
*/
unsigned is_xfb_only:1;
/**
* If non-zero, then this variable may be packed along with other variables
* into a single varying slot, so this offset should be applied when
+3 -1
View File
@@ -124,7 +124,8 @@ void lower_shared_reference(struct gl_shader *shader, unsigned *shared_size);
void lower_ubo_reference(struct gl_shader *shader);
void lower_packed_varyings(void *mem_ctx,
unsigned locations_used, ir_variable_mode mode,
unsigned gs_input_vertices, gl_shader *shader);
unsigned gs_input_vertices, gl_shader *shader,
bool disable_varying_packing, bool xfb_enabled);
bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index);
bool lower_vector_derefs(gl_shader *shader);
void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader);
@@ -138,6 +139,7 @@ bool lower_tess_level(gl_shader *shader);
bool lower_vertex_id(gl_shader *shader);
bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);
void propagate_invariance(exec_list *instructions);
ir_rvalue *
compare_index_block(exec_list *instructions, ir_variable *index,
+158 -52
View File
@@ -826,7 +826,7 @@ namespace {
class varying_matches
{
public:
varying_matches(bool disable_varying_packing,
varying_matches(bool disable_varying_packing, bool xfb_enabled,
gl_shader_stage producer_stage,
gl_shader_stage consumer_stage);
~varying_matches();
@@ -836,13 +836,29 @@ public:
void store_locations() const;
private:
bool is_varying_packing_safe(const glsl_type *type,
const ir_variable *var);
/**
* If true, this driver disables varying packing, so all varyings need to
* be aligned on slot boundaries, and take up a number of slots equal to
* their number of matrix columns times their array size.
*
* Packing may also be disabled because our current packing method is not
* safe in SSO or versions of OpenGL where interpolation qualifiers are not
* guaranteed to match across stages.
*/
const bool disable_varying_packing;
/**
* If true, this driver has transform feedback enabled. The transform
* feedback code requires at least some packing be done even when varying
* packing is disabled; fortunately, where transform feedback requires
* packing, it's safe to override the disabled setting. See
* is_varying_packing_safe().
*/
const bool xfb_enabled;
/**
* Enum representing the order in which varyings are packed within a
* packing class.
@@ -862,6 +878,7 @@ private:
static unsigned compute_packing_class(const ir_variable *var);
static packing_order_enum compute_packing_order(const ir_variable *var);
static int match_comparator(const void *x_generic, const void *y_generic);
static int xfb_comparator(const void *x_generic, const void *y_generic);
/**
* Structure recording the relationship between a single producer output
@@ -917,9 +934,11 @@ private:
} /* anonymous namespace */
varying_matches::varying_matches(bool disable_varying_packing,
bool xfb_enabled,
gl_shader_stage producer_stage,
gl_shader_stage consumer_stage)
: disable_varying_packing(disable_varying_packing),
xfb_enabled(xfb_enabled),
producer_stage(producer_stage),
consumer_stage(consumer_stage)
{
@@ -941,6 +960,24 @@ varying_matches::~varying_matches()
}
/**
 * Decide whether a varying may be packed even though varying packing has
 * been disabled.
 *
 * Packing is never reported as safe when the consumer is a tessellation
 * evaluation or tessellation control stage, or when the producer is a
 * tessellation control stage.  Otherwise it is reported as safe only when
 * transform feedback is enabled and the varying is either an individual
 * array, structure or matrix, or is used only by transform feedback.
 *
 * \param type  type of the varying being recorded
 * \param var   variable whose data.is_xfb_only flag is consulted
 * \return true if the varying can be packed despite packing being disabled
 */
bool
varying_matches::is_varying_packing_safe(const glsl_type *type,
                                         const ir_variable *var)
{
   const bool touches_tessellation =
      consumer_stage == MESA_SHADER_TESS_EVAL ||
      consumer_stage == MESA_SHADER_TESS_CTRL ||
      producer_stage == MESA_SHADER_TESS_CTRL;

   /* Neither tessellation interfaces nor xfb-less links qualify. */
   if (touches_tessellation || !xfb_enabled)
      return false;

   /* Aggregates can always be packed on their own. */
   if (type->is_array() || type->is_record() || type->is_matrix())
      return true;

   /* Anything else is only safe if nothing but transform feedback reads it. */
   return var->data.is_xfb_only;
}
/**
* Record the given producer/consumer variable pair in the list of variables
* that should later be assigned locations.
@@ -1020,7 +1057,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
= this->compute_packing_class(var);
this->matches[this->num_matches].packing_order
= this->compute_packing_order(var);
if (this->disable_varying_packing) {
if (this->disable_varying_packing && !is_varying_packing_safe(type, var)) {
unsigned slots = type->count_attribute_slots(false);
this->matches[this->num_matches].num_components = slots * 4;
} else {
@@ -1046,37 +1083,28 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
uint64_t reserved_slots,
bool separate_shader)
{
/* We disable varying sorting for separate shader programs for the
* following reasons:
*
* 1/ All programs must sort the code in the same order to guarantee the
* interface matching. However varying_matches::record() will change the
* interpolation qualifier of some stages.
*
* 2/ GLSL version 4.50 removes the matching constrain on the interpolation
* qualifier.
*
* From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec:
*
* "The type and presence of interpolation qualifiers of variables with
* the same name declared in all linked shaders for the same cross-stage
* interface must match, otherwise the link command will fail.
*
* When comparing an output from one stage to an input of a subsequent
* stage, the input and output don't match if their interpolation
* qualifiers (or lack thereof) are not the same."
*
* "It is a link-time error if, within the same stage, the interpolation
* qualifiers of variables of the same name do not match."
/* If packing has been disabled then we cannot safely sort the varyings by
* class as it may mean we are using a version of OpenGL where
* interpolation qualifiers are not guaranteed to be matching across
* shaders, sorting in this case could result in mismatching shader
* interfaces.
* When packing is disabled the sort orders varyings used by transform
* feedback first, but also depends on *undefined behaviour* of qsort to
* reverse the order of the varyings. See: xfb_comparator().
*/
if (!separate_shader) {
if (!this->disable_varying_packing) {
/* Sort varying matches into an order that makes them easy to pack. */
qsort(this->matches, this->num_matches, sizeof(*this->matches),
&varying_matches::match_comparator);
} else {
/* Only sort varyings that are only used by transform feedback. */
qsort(this->matches, this->num_matches, sizeof(*this->matches),
&varying_matches::xfb_comparator);
}
unsigned generic_location = 0;
unsigned generic_patch_location = MAX_VARYING*4;
bool previous_var_xfb_only = false;
for (unsigned i = 0; i < this->num_matches; i++) {
unsigned *location = &generic_location;
@@ -1100,16 +1128,30 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
/* Advance to the next slot if this varying has a different packing
* class than the previous one, and we're not already on a slot
* boundary.
*
* Also advance to the next slot if packing is disabled. This makes sure
* we don't assign varyings the same locations which is possible
* because we still pack individual arrays, records and matrices even
* when packing is disabled. Note we don't advance to the next slot if
* we can pack varyings together that are only used for transform
* feedback.
*/
if (i > 0 &&
this->matches[i - 1].packing_class
!= this->matches[i].packing_class) {
if ((this->disable_varying_packing &&
!(previous_var_xfb_only && var->data.is_xfb_only)) ||
(i > 0 && this->matches[i - 1].packing_class
!= this->matches[i].packing_class )) {
*location = ALIGN(*location, 4);
}
previous_var_xfb_only = var->data.is_xfb_only;
unsigned num_elements = type->count_attribute_slots(is_vertex_input);
unsigned slot_end = this->disable_varying_packing ? 4 :
type->without_array()->vector_elements;
unsigned slot_end;
if (this->disable_varying_packing &&
!is_varying_packing_safe(type, var))
slot_end = 4;
else
slot_end = type->without_array()->vector_elements;
slot_end += *location - 1;
/* FIXME: We could be smarter in the below code and loop back over
@@ -1133,7 +1175,8 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
/* Increase the slot to make sure there is enough room for next
* array element.
*/
if (this->disable_varying_packing)
if (this->disable_varying_packing &&
!is_varying_packing_safe(type, var))
slot_end += 4;
else
slot_end += type->without_array()->vector_elements;
@@ -1258,6 +1301,32 @@ varying_matches::match_comparator(const void *x_generic, const void *y_generic)
}
/**
* Comparison function passed to qsort() to sort varyings used only by
* transform feedback when packing of other varyings is disabled.
*/
int
varying_matches::xfb_comparator(const void *x_generic, const void *y_generic)
{
const match *x = (const match *) x_generic;
if (x->producer_var != NULL && x->producer_var->data.is_xfb_only)
return match_comparator(x_generic, y_generic);
/* FIXME: When the comparator returns 0 it means the elements being
* compared are equivalent. However the qsort documentation says:
*
* "The order of equivalent elements is undefined."
*
* In practice the sort ends up reversing the order of the varyings which
* means locations are also assigned in this reversed order and happens to
* be what we want. This is also whats happening in
* varying_matches::match_comparator().
*/
return 0;
}
/**
* Is the given variable a varying variable to be counted against the
* limit in ctx->Const.MaxVarying?
@@ -1573,26 +1642,60 @@ assign_varying_locations(struct gl_context *ctx,
unsigned num_tfeedback_decls,
tfeedback_decl *tfeedback_decls)
{
if (ctx->Const.DisableVaryingPacking) {
/* Transform feedback code assumes varyings are packed, so if the driver
* has disabled varying packing, make sure it does not support transform
* feedback.
*/
assert(!ctx->Extensions.EXT_transform_feedback);
}
/* Tessellation shaders treat inputs and outputs as shared memory and can
* access inputs and outputs of other invocations.
* Therefore, they can't be lowered to temps easily (and definitely not
* efficiently).
*/
bool disable_varying_packing =
ctx->Const.DisableVaryingPacking ||
bool unpackable_tess =
(consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) ||
(consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) ||
(producer && producer->Stage == MESA_SHADER_TESS_CTRL);
varying_matches matches(disable_varying_packing,
/* Transform feedback code assumes varying arrays are packed, so if the
* driver has disabled varying packing, make sure to at least enable
* packing required by transform feedback.
*/
bool xfb_enabled =
ctx->Extensions.EXT_transform_feedback && !unpackable_tess;
/* Disable varying packing for GL 4.4+ as there is no guarantee
* that interpolation qualifiers will match between shaders in these
* versions. We also disable packing on outward facing interfaces for
* SSO because in ES we need to retain the unpacked varying information
* for draw time validation. For desktop GL we could allow packing for
* versions < 4.4 but it's just safer not to do packing.
*
* Packing is still enabled on individual arrays, structs, and matrices as
* these are required by the transform feedback code and it is still safe
* to do so. We also enable packing when a varying is only used for
* transform feedback and it's not an SSO.
*
* Varying packing currently only packs together varyings with matching
* interpolation qualifiers as the backends assume all packed components
* are to be processed in the same way. Therefore we cannot do packing in
* these versions of GL without the risk of mismatching interfaces.
*
* From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.30 spec:
*
* "The type and presence of interpolation qualifiers of variables with
* the same name declared in all linked shaders for the same cross-stage
* interface must match, otherwise the link command will fail.
*
* When comparing an output from one stage to an input of a subsequent
* stage, the input and output don't match if their interpolation
* qualifiers (or lack thereof) are not the same."
*
* This text was also in at least revision 7 of the 4.40 spec but is no
* longer in revision 9 and not in the 4.50 spec.
*/
bool disable_varying_packing =
ctx->Const.DisableVaryingPacking || unpackable_tess;
if ((ctx->API == API_OPENGL_CORE && ctx->Version >= 44) ||
(prog->SeparateShader && (producer == NULL || consumer == NULL)))
disable_varying_packing = true;
varying_matches matches(disable_varying_packing, xfb_enabled,
producer ? producer->Stage : (gl_shader_stage)-1,
consumer ? consumer->Stage : (gl_shader_stage)-1);
hash_table *tfeedback_candidates
@@ -1711,8 +1814,10 @@ assign_varying_locations(struct gl_context *ctx,
return false;
}
if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout)
if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout) {
matched_candidate->toplevel_var->data.is_xfb_only = 1;
matches.record(matched_candidate->toplevel_var, NULL);
}
}
const uint64_t reserved_slots =
@@ -1784,15 +1889,16 @@ assign_varying_locations(struct gl_context *ctx,
ir_var_shader_in);
}
if (!disable_varying_packing) {
if (producer) {
lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
0, producer);
}
if (consumer) {
lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
consumer_vertices, consumer);
}
if (producer) {
lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
0, producer, disable_varying_packing,
xfb_enabled);
}
if (consumer) {
lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
consumer_vertices, consumer,
disable_varying_packing, xfb_enabled);
}
return true;
+28 -6
View File
@@ -168,7 +168,9 @@ public:
ir_variable_mode mode,
unsigned gs_input_vertices,
exec_list *out_instructions,
exec_list *out_variables);
exec_list *out_variables,
bool disable_varying_packing,
bool xfb_enabled);
void run(struct gl_shader *shader);
@@ -231,6 +233,9 @@ private:
* Exec list into which the visitor should insert any new variables.
*/
exec_list *out_variables;
bool disable_varying_packing;
bool xfb_enabled;
};
} /* anonymous namespace */
@@ -238,7 +243,8 @@ private:
lower_packed_varyings_visitor::lower_packed_varyings_visitor(
void *mem_ctx, unsigned locations_used, ir_variable_mode mode,
unsigned gs_input_vertices, exec_list *out_instructions,
exec_list *out_variables)
exec_list *out_variables, bool disable_varying_packing,
bool xfb_enabled)
: mem_ctx(mem_ctx),
locations_used(locations_used),
packed_varyings((ir_variable **)
@@ -247,7 +253,9 @@ lower_packed_varyings_visitor::lower_packed_varyings_visitor(
mode(mode),
gs_input_vertices(gs_input_vertices),
out_instructions(out_instructions),
out_variables(out_variables)
out_variables(out_variables),
disable_varying_packing(disable_varying_packing),
xfb_enabled(xfb_enabled)
{
}
@@ -656,7 +664,18 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
if (var->data.explicit_location)
return false;
const glsl_type *type = var->type->without_array();
/* Override disable_varying_packing if the var is only used by transform
* feedback. Also override it if transform feedback is enabled and the
* variable is an array, struct or matrix as the elements of these types
* will always have the same interpolation and therefore are safe to pack.
*/
const glsl_type *type = var->type;
if (disable_varying_packing && !var->data.is_xfb_only &&
!((type->is_array() || type->is_record() || type->is_matrix()) &&
xfb_enabled))
return false;
type = type->without_array();
if (type->vector_elements == 4 && !type->is_double())
return false;
return true;
@@ -709,7 +728,8 @@ lower_packed_varyings_gs_splicer::visit_leave(ir_emit_vertex *ev)
void
lower_packed_varyings(void *mem_ctx, unsigned locations_used,
ir_variable_mode mode, unsigned gs_input_vertices,
gl_shader *shader)
gl_shader *shader, bool disable_varying_packing,
bool xfb_enabled)
{
exec_list *instructions = shader->ir;
ir_function *main_func = shader->symbols->get_function("main");
@@ -720,7 +740,9 @@ lower_packed_varyings(void *mem_ctx, unsigned locations_used,
lower_packed_varyings_visitor visitor(mem_ctx, locations_used, mode,
gs_input_vertices,
&new_instructions,
&new_variables);
&new_variables,
disable_varying_packing,
xfb_enabled);
visitor.run(shader);
if (mode == ir_var_shader_out) {
if (shader->Stage == MESA_SHADER_GEOMETRY) {
+19
View File
@@ -58,6 +58,8 @@ public:
{
}
virtual ir_visitor_status visit_enter(ir_assignment *ir);
ir_rvalue *handle_expression(ir_expression *ir);
void handle_rvalue(ir_rvalue **rvalue);
bool reassociate_constant(ir_expression *ir1,
@@ -80,6 +82,23 @@ public:
} /* unnamed namespace */
/**
 * Decide whether the algebraic optimizer should look inside an assignment.
 *
 * Returns visit_continue_with_parent when the variable referenced by the
 * left-hand side is marked invariant or precise, and visit_continue
 * otherwise.
 */
ir_visitor_status
ir_algebraic_visitor::visit_enter(ir_assignment *ir)
{
   const ir_variable *const dest = ir->lhs->variable_referenced();
   const bool must_stay_exact = dest->data.invariant || dest->data.precise;

   /* When assigning to an invariant or precise variable, just bail.
    * Most of the algebraic optimizations aren't precision-safe.
    *
    * FINISHME: Find out which optimizations are precision-safe and enable
    * them only for invariant or precise trees.
    */
   return must_stay_exact ? visit_continue_with_parent : visit_continue;
}
static inline bool
is_vec_zero(ir_constant *ir)
{
+16
View File
@@ -131,6 +131,8 @@ public:
progress = false;
}
virtual ir_visitor_status visit_enter(ir_assignment *ir);
void handle_rvalue(ir_rvalue **rvalue);
bool progress;
@@ -146,6 +148,20 @@ struct is_reduction_data {
} /* anonymous namespace */
/**
 * Decide whether tree rebalancing should look inside an assignment.
 *
 * Returns visit_continue_with_parent when the variable referenced by the
 * left-hand side is marked invariant or precise, and visit_continue
 * otherwise.
 */
ir_visitor_status
ir_rebalance_visitor::visit_enter(ir_assignment *ir)
{
   const ir_variable *const dest = ir->lhs->variable_referenced();

   /* Tree rebalancing (reassociation) isn't precision-safe, so bail when
    * the destination is an invariant or precise variable.
    */
   if (dest->data.invariant || dest->data.precise)
      return visit_continue_with_parent;

   return visit_continue;
}
static bool
is_reduction_operation(ir_expression_operation operation)
{
+125
View File
@@ -0,0 +1,125 @@
/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file propagate_invariance.cpp
* Propagate the "invariant" and "precise" qualifiers to variables used to
* compute invariant or precise values.
*
* The GLSL spec (depending on what version you read) says, among the
* conditions for getting bit-for-bit the same values on an invariant output:
*
* "All operations in the consuming expressions and any intermediate
* expressions must be the same, with the same order of operands and same
* associativity, to give the same order of evaluation."
*
* This effectively means that if a variable is used to compute an invariant
* value then that variable becomes invariant. The same should apply to the
* "precise" qualifier.
*/
#include "ir.h"
#include "ir_visitor.h"
#include "ir_rvalue_visitor.h"
#include "ir_optimization.h"
#include "compiler/glsl_types.h"
namespace {
class ir_invariance_propagation_visitor : public ir_hierarchical_visitor {
public:
ir_invariance_propagation_visitor()
{
this->progress = false;
this->dst_var = NULL;
}
virtual ~ir_invariance_propagation_visitor()
{
/* empty */
}
virtual ir_visitor_status visit_enter(ir_assignment *ir);
virtual ir_visitor_status visit_leave(ir_assignment *ir);
virtual ir_visitor_status visit(ir_dereference_variable *ir);
ir_variable *dst_var;
bool progress;
};
} /* unnamed namespace */
/**
 * Begin visiting an assignment.
 *
 * When the variable referenced by the left-hand side is invariant or precise
 * it is remembered in dst_var and visit_continue is returned; otherwise
 * visit_continue_with_parent is returned and dst_var is left untouched.
 */
ir_visitor_status
ir_invariance_propagation_visitor::visit_enter(ir_assignment *ir)
{
   /* No other assignment may be in flight at this point. */
   assert(this->dst_var == NULL);

   ir_variable *const target = ir->lhs->variable_referenced();
   if (!target->data.invariant && !target->data.precise)
      return visit_continue_with_parent;

   this->dst_var = target;
   return visit_continue;
}
/**
 * Finish visiting an assignment: forget the remembered destination variable
 * so that later dereferences are no longer tied to it.
 */
ir_visitor_status
ir_invariance_propagation_visitor::visit_leave(ir_assignment *ir)
{
   (void) ir; /* the assignment itself is not inspected here */

   this->dst_var = NULL;
   return visit_continue;
}
/**
 * Visit a variable dereference.
 *
 * Does nothing unless a destination variable is currently remembered in
 * dst_var.  Otherwise each of the "invariant" and "precise" flags that is
 * set on dst_var is also set on the dereferenced variable, and progress is
 * recorded whenever a flag was not already set there.
 */
ir_visitor_status
ir_invariance_propagation_visitor::visit(ir_dereference_variable *ir)
{
   const ir_variable *const dst = this->dst_var;
   if (dst == NULL)
      return visit_continue;

   ir_variable *const src = ir->var;

   if (dst->data.invariant && !src->data.invariant) {
      src->data.invariant = true;
      this->progress = true;
   }

   if (dst->data.precise && !src->data.precise) {
      src->data.precise = true;
      this->progress = true;
   }

   return visit_continue;
}
/**
 * Run the invariance propagation visitor over an instruction list, repeating
 * the walk until a full pass reports no progress.
 *
 * \param instructions  instruction list to walk
 */
void
propagate_invariance(exec_list *instructions)
{
   ir_invariance_propagation_visitor visitor;

   for (;;) {
      visitor.progress = false;
      visit_list_elements(&visitor, instructions);

      /* A pass that changed nothing means a fixed point was reached. */
      if (!visitor.progress)
         break;
   }
}
+16 -9
View File
@@ -731,7 +731,7 @@ nir_visitor::visit(ir_call *ir)
ir_dereference *param =
(ir_dereference *) ir->actual_parameters.get_head();
instr->variables[0] = evaluate_deref(&instr->instr, param);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
nir_builder_instr_insert(&b, &instr->instr);
break;
}
@@ -765,7 +765,7 @@ nir_visitor::visit(ir_call *ir)
const nir_intrinsic_info *info =
&nir_intrinsic_infos[instr->intrinsic];
nir_ssa_dest_init(&instr->instr, &instr->dest,
info->dest_components, NULL);
info->dest_components, 32, NULL);
}
if (op == nir_intrinsic_image_size ||
@@ -826,7 +826,7 @@ nir_visitor::visit(ir_call *ir)
nir_builder_instr_insert(&b, &instr->instr);
break;
case nir_intrinsic_shader_clock:
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
nir_builder_instr_insert(&b, &instr->instr);
break;
case nir_intrinsic_store_ssbo: {
@@ -867,7 +867,7 @@ nir_visitor::visit(ir_call *ir)
/* Setup destination register */
nir_ssa_dest_init(&instr->instr, &instr->dest,
type->vector_elements, NULL);
type->vector_elements, 32, NULL);
/* Insert the created nir instruction now since in the case of boolean
* result we will need to emit another instruction after it
@@ -890,7 +890,7 @@ nir_visitor::visit(ir_call *ir)
load_ssbo_compare->src[1].swizzle[i] = 0;
nir_ssa_dest_init(&load_ssbo_compare->instr,
&load_ssbo_compare->dest.dest,
type->vector_elements, NULL);
type->vector_elements, 32, NULL);
load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1;
nir_builder_instr_insert(&b, &load_ssbo_compare->instr);
dest = &load_ssbo_compare->dest.dest;
@@ -936,7 +936,7 @@ nir_visitor::visit(ir_call *ir)
/* Atomic result */
assert(ir->return_deref);
nir_ssa_dest_init(&instr->instr, &instr->dest,
ir->return_deref->type->vector_elements, NULL);
ir->return_deref->type->vector_elements, 32, NULL);
nir_builder_instr_insert(&b, &instr->instr);
break;
}
@@ -951,8 +951,9 @@ nir_visitor::visit(ir_call *ir)
instr->num_components = type->vector_elements;
/* Setup destination register */
unsigned bit_size = glsl_get_bit_size(type->base_type);
nir_ssa_dest_init(&instr->instr, &instr->dest,
type->vector_elements, NULL);
type->vector_elements, bit_size, NULL);
nir_builder_instr_insert(&b, &instr->instr);
break;
@@ -1013,8 +1014,10 @@ nir_visitor::visit(ir_call *ir)
/* Atomic result */
assert(ir->return_deref);
unsigned bit_size = glsl_get_bit_size(ir->return_deref->type->base_type);
nir_ssa_dest_init(&instr->instr, &instr->dest,
ir->return_deref->type->vector_elements, NULL);
ir->return_deref->type->vector_elements,
bit_size, NULL);
nir_builder_instr_insert(&b, &instr->instr);
break;
}
@@ -1061,6 +1064,9 @@ nir_visitor::visit(ir_assignment *ir)
{
unsigned num_components = ir->lhs->type->vector_elements;
b.exact = ir->lhs->variable_referenced()->data.invariant ||
ir->lhs->variable_referenced()->data.precise;
if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) &&
(ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) {
/* We're doing a plain-as-can-be copy, so emit a copy_var */
@@ -1163,7 +1169,7 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components)
nir_dest *dest = get_instr_dest(instr);
if (dest)
nir_ssa_dest_init(instr, dest, num_components, NULL);
nir_ssa_dest_init(instr, dest, num_components, 32, NULL);
nir_builder_instr_insert(&b, instr);
@@ -1203,6 +1209,7 @@ nir_visitor::visit(ir_expression *ir)
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo);
load->num_components = ir->type->vector_elements;
load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type);
load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
add_instr(&load->instr, ir->type->vector_elements);
+16 -17
View File
@@ -70,6 +70,7 @@ reg_create(void *mem_ctx, struct exec_list *list)
list_inithead(&reg->if_uses);
reg->num_components = 0;
reg->bit_size = 32;
reg->num_array_elems = 0;
reg->is_packed = false;
reg->name = NULL;
@@ -473,7 +474,7 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components)
nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr);
instr_init(&instr->instr, nir_instr_type_load_const);
nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
return instr;
}
@@ -562,7 +563,7 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components)
nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr);
instr_init(&instr->instr, nir_instr_type_ssa_undef);
nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
return instr;
}
@@ -699,10 +700,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
case GLSL_TYPE_FLOAT:
case GLSL_TYPE_INT:
case GLSL_TYPE_UINT:
load->value.u[i] = constant->value.u[matrix_offset + i];
load->value.u32[i] = constant->value.u[matrix_offset + i];
break;
case GLSL_TYPE_BOOL:
load->value.u[i] = constant->value.b[matrix_offset + i] ?
load->value.u32[i] = constant->value.b[matrix_offset + i] ?
NIR_TRUE : NIR_FALSE;
break;
default:
@@ -731,18 +732,11 @@ reduce_cursor(nir_cursor cursor)
{
switch (cursor.option) {
case nir_cursor_before_block:
assert(nir_cf_node_prev(&cursor.block->cf_node) == NULL ||
nir_cf_node_prev(&cursor.block->cf_node)->type != nir_cf_node_block);
if (exec_list_is_empty(&cursor.block->instr_list)) {
/* Empty block. After is as good as before. */
cursor.option = nir_cursor_after_block;
} else {
/* Try to switch to after the previous block if there is one.
* (This isn't likely, but it can happen.)
*/
nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node);
if (prev_node && prev_node->type == nir_cf_node_block) {
cursor.block = nir_cf_node_as_block(prev_node);
cursor.option = nir_cursor_after_block;
}
}
return cursor;
@@ -1379,15 +1373,18 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest)
src_add_all_uses(dest->reg.indirect, instr, NULL);
}
/* note: does *not* take ownership of 'name' */
void
nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
unsigned num_components, const char *name)
unsigned num_components,
unsigned bit_size, const char *name)
{
def->name = name;
def->name = ralloc_strdup(instr, name);
def->parent_instr = instr;
list_inithead(&def->uses);
list_inithead(&def->if_uses);
def->num_components = num_components;
def->bit_size = bit_size;
if (instr->block) {
nir_function_impl *impl =
@@ -1399,12 +1396,14 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
}
}
/* note: does *not* take ownership of 'name' */
void
nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
unsigned num_components, const char *name)
unsigned num_components, unsigned bit_size,
const char *name)
{
dest->is_ssa = true;
nir_ssa_def_init(instr, &dest->ssa, num_components, name);
nir_ssa_def_init(instr, &dest->ssa, num_components, bit_size, name);
}
void
+70 -7
View File
@@ -101,6 +101,7 @@ union nir_constant_data {
int i[16];
float f[16];
bool b[16];
double d[16];
};
typedef struct nir_constant {
@@ -381,6 +382,9 @@ typedef struct nir_register {
unsigned num_components; /** < number of vector components */
unsigned num_array_elems; /** < size of array (0 for no array) */
/* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
uint8_t bit_size;
/** generic register index. */
unsigned index;
@@ -488,6 +492,9 @@ typedef struct nir_ssa_def {
struct list_head if_uses;
uint8_t num_components;
/* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
uint8_t bit_size;
} nir_ssa_def;
struct nir_src;
@@ -594,6 +601,18 @@ nir_dest_for_reg(nir_register *reg)
return dest;
}
/** Return the per-channel bit size of the SSA def or register behind a source. */
static inline unsigned
nir_src_bit_size(nir_src src)
{
   if (src.is_ssa)
      return src.ssa->bit_size;

   return src.reg.reg->bit_size;
}
/** Return the per-channel bit size of the SSA def or register behind a destination. */
static inline unsigned
nir_dest_bit_size(nir_dest dest)
{
   if (dest.is_ssa)
      return dest.ssa.bit_size;

   return dest.reg.reg->bit_size;
}
void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if);
void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr);
@@ -649,9 +668,36 @@ typedef enum {
nir_type_float,
nir_type_int,
nir_type_uint,
nir_type_bool
nir_type_bool,
nir_type_bool32 = 32 | nir_type_bool,
nir_type_int8 = 8 | nir_type_int,
nir_type_int16 = 16 | nir_type_int,
nir_type_int32 = 32 | nir_type_int,
nir_type_int64 = 64 | nir_type_int,
nir_type_uint8 = 8 | nir_type_uint,
nir_type_uint16 = 16 | nir_type_uint,
nir_type_uint32 = 32 | nir_type_uint,
nir_type_uint64 = 64 | nir_type_uint,
nir_type_float16 = 16 | nir_type_float,
nir_type_float32 = 32 | nir_type_float,
nir_type_float64 = 64 | nir_type_float,
} nir_alu_type;
#define NIR_ALU_TYPE_SIZE_MASK 0xfffffff8
#define NIR_ALU_TYPE_BASE_TYPE_MASK 0x00000007
/** Return the bits of an ALU type selected by NIR_ALU_TYPE_SIZE_MASK. */
static inline unsigned
nir_alu_type_get_type_size(nir_alu_type type)
{
   const unsigned size_bits = type & NIR_ALU_TYPE_SIZE_MASK;

   return size_bits;
}
/** Return the bits of an ALU type selected by NIR_ALU_TYPE_BASE_TYPE_MASK. */
static inline unsigned
nir_alu_type_get_base_type(nir_alu_type type)
{
   const unsigned base_bits = type & NIR_ALU_TYPE_BASE_TYPE_MASK;

   return base_bits;
}
typedef enum {
NIR_OP_IS_COMMUTATIVE = (1 << 0),
NIR_OP_IS_ASSOCIATIVE = (1 << 1),
@@ -708,6 +754,17 @@ extern const nir_op_info nir_op_infos[nir_num_opcodes];
typedef struct nir_alu_instr {
nir_instr instr;
nir_op op;
/** Indicates that this ALU instruction generates an exact value
*
* This is kind of a mixture of GLSL "precise" and "invariant" and not
* really equivalent to either. This indicates that the value generated by
* this operation is high-precision and any code transformations that touch
* it must ensure that the resulting value is bit-for-bit identical to the
* original.
*/
bool exact;
nir_alu_dest dest;
nir_alu_src src[];
} nir_alu_instr;
@@ -1218,9 +1275,12 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type)
typedef struct {
union {
float f[4];
int32_t i[4];
uint32_t u[4];
float f32[4];
double f64[4];
int32_t i32[4];
uint32_t u32[4];
int64_t i64[4];
uint64_t u64[4];
};
} nir_const_value;
@@ -2061,9 +2121,11 @@ void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest,
nir_dest new_dest);
void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
unsigned num_components, const char *name);
unsigned num_components, unsigned bit_size,
const char *name);
void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
unsigned num_components, const char *name);
unsigned num_components, unsigned bit_size,
const char *name);
void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
nir_instr *after_me);
@@ -2094,9 +2156,10 @@ void nir_index_blocks(nir_function_impl *impl);
void nir_print_shader(nir_shader *shader, FILE *fp);
void nir_print_instr(const nir_instr *instr, FILE *fp);
nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader);
#ifdef DEBUG
void nir_validate_shader(nir_shader *shader);
+27 -4
View File
@@ -63,12 +63,13 @@ class Value(object):
static const ${val.c_type} ${val.name} = {
{ ${val.type_enum} },
% if isinstance(val, Constant):
{ ${hex(val)} /* ${val.value} */ },
${val.type()}, { ${hex(val)} /* ${val.value} */ },
% elif isinstance(val, Variable):
${val.index}, /* ${val.var_name} */
${'true' if val.is_constant else 'false'},
nir_type_${ val.required_type or 'invalid' },
${val.type() or 'nir_type_invalid' },
% elif isinstance(val, Expression):
${'true' if val.inexact else 'false'},
nir_op_${val.opcode},
{ ${', '.join(src.c_ptr for src in val.sources)} },
% endif
@@ -107,10 +108,18 @@ class Constant(Value):
if isinstance(self.value, (int, long)):
return hex(self.value)
elif isinstance(self.value, float):
return hex(struct.unpack('I', struct.pack('f', self.value))[0])
return hex(struct.unpack('Q', struct.pack('d', self.value))[0])
else:
assert False
# Map the Python type of this constant's value to a nir_alu_type name.
# bool is tested before int because isinstance(True, int) is also true in
# Python; a value of any other type falls through and yields None.
def type(self):
if isinstance(self.value, (bool)):
return "nir_type_bool32"
elif isinstance(self.value, (int, long)):
return "nir_type_int"
elif isinstance(self.value, float):
return "nir_type_float"
_var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)(?:@(?P<type>\w+))?")
class Variable(Value):
@@ -129,12 +138,26 @@ class Variable(Value):
self.index = varset[self.var_name]
# Map this variable's required_type to a nir_alu_type name: 'bool' gives
# nir_type_bool32, 'int' and 'unsigned' both give nir_type_int, and 'float'
# gives nir_type_float. Any other required_type falls through and yields
# None.
def type(self):
if self.required_type == 'bool':
return "nir_type_bool32"
elif self.required_type in ('int', 'unsigned'):
return "nir_type_int"
elif self.required_type == 'float':
return "nir_type_float"
_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)")
class Expression(Value):
def __init__(self, expr, name_base, varset):
Value.__init__(self, name_base, "expression")
assert isinstance(expr, tuple)
self.opcode = expr[0]
m = _opcode_re.match(expr[0])
assert m and m.group('opcode') is not None
self.opcode = m.group('opcode')
self.inexact = m.group('inexact') is not None
self.sources = [ Value.create(src, "{0}_{1}".format(name_base, i), varset)
for (i, src) in enumerate(expr[1:]) ]
+47 -15
View File
@@ -31,6 +31,9 @@ struct exec_list;
typedef struct nir_builder {
nir_cursor cursor;
/* Whether new ALU instructions will be marked "exact" */
bool exact;
nir_shader *shader;
nir_function_impl *impl;
} nir_builder;
@@ -39,6 +42,7 @@ static inline void
nir_builder_init(nir_builder *build, nir_function_impl *impl)
{
memset(build, 0, sizeof(*build));
build->exact = false;
build->impl = impl;
build->shader = impl->function->shader;
}
@@ -50,6 +54,7 @@ nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx,
{
build->shader = nir_shader_create(mem_ctx, stage, options);
nir_function *func = nir_function_create(build->shader, "main");
build->exact = false;
build->impl = nir_function_impl_create(func);
build->cursor = nir_after_cf_list(&build->impl->body);
}
@@ -104,7 +109,7 @@ nir_imm_float(nir_builder *build, float x)
nir_const_value v;
memset(&v, 0, sizeof(v));
v.f[0] = x;
v.f32[0] = x;
return nir_build_imm(build, 1, v);
}
@@ -115,10 +120,10 @@ nir_imm_vec4(nir_builder *build, float x, float y, float z, float w)
nir_const_value v;
memset(&v, 0, sizeof(v));
v.f[0] = x;
v.f[1] = y;
v.f[2] = z;
v.f[3] = w;
v.f32[0] = x;
v.f32[1] = y;
v.f32[2] = z;
v.f32[3] = w;
return nir_build_imm(build, 4, v);
}
@@ -129,7 +134,7 @@ nir_imm_int(nir_builder *build, int x)
nir_const_value v;
memset(&v, 0, sizeof(v));
v.i[0] = x;
v.i32[0] = x;
return nir_build_imm(build, 1, v);
}
@@ -140,10 +145,10 @@ nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w)
nir_const_value v;
memset(&v, 0, sizeof(v));
v.i[0] = x;
v.i[1] = y;
v.i[2] = z;
v.i[3] = w;
v.i32[0] = x;
v.i32[1] = y;
v.i32[2] = z;
v.i32[3] = w;
return nir_build_imm(build, 4, v);
}
@@ -157,6 +162,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
if (!instr)
return NULL;
instr->exact = build->exact;
instr->src[0].src = nir_src_for_ssa(src0);
if (src1)
instr->src[1].src = nir_src_for_ssa(src1);
@@ -178,6 +185,25 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
}
assert(num_components != 0);
/* Figure out the bitwidth based on the source bitwidth if the instruction
* is variable-width.
*/
unsigned bit_size = nir_alu_type_get_type_size(op_info->output_type);
if (bit_size == 0) {
for (unsigned i = 0; i < op_info->num_inputs; i++) {
unsigned src_bit_size = instr->src[i].src.ssa->bit_size;
if (nir_alu_type_get_type_size(op_info->input_types[i]) == 0) {
if (bit_size)
assert(src_bit_size == bit_size);
else
bit_size = src_bit_size;
} else {
assert(src_bit_size ==
nir_alu_type_get_type_size(op_info->input_types[i]));
}
}
}
/* Make sure we don't swizzle from outside of our source vector (like if a
* scalar value was passed into a multiply with a vector).
*/
@@ -187,7 +213,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
}
}
nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
bit_size, NULL);
instr->dest.write_mask = (1 << num_components) - 1;
nir_builder_instr_insert(build, &instr->instr);
@@ -252,7 +279,9 @@ static inline nir_ssa_def *
nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
{
nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov);
nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
nir_src_bit_size(src.src), NULL);
mov->exact = build->exact;
mov->dest.write_mask = (1 << num_components) - 1;
mov->src[0] = src;
nir_builder_instr_insert(build, &mov->instr);
@@ -264,7 +293,9 @@ static inline nir_ssa_def *
nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
{
nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov);
nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
nir_src_bit_size(src.src), NULL);
mov->exact = build->exact;
mov->dest.write_mask = (1 << num_components) - 1;
mov->src[0] = src;
nir_builder_instr_insert(build, &mov->instr);
@@ -360,7 +391,8 @@ nir_load_var(nir_builder *build, nir_variable *var)
nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var);
load->num_components = num_components;
load->variables[0] = nir_deref_var_create(load, var);
nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
nir_ssa_dest_init(&load->instr, &load->dest, num_components,
glsl_get_bit_size(glsl_get_base_type(var->type)), NULL);
nir_builder_instr_insert(build, &load->instr);
return &load->dest.ssa;
}
@@ -426,7 +458,7 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
load->num_components = nir_intrinsic_infos[op].dest_components;
load->const_index[0] = index;
nir_ssa_dest_init(&load->instr, &load->dest,
nir_intrinsic_infos[op].dest_components, NULL);
nir_intrinsic_infos[op].dest_components, 32, NULL);
nir_builder_instr_insert(build, &load->instr);
return &load->dest.ssa;
}
+15 -5
View File
@@ -127,11 +127,10 @@ nir_constant_clone(const nir_constant *c, nir_variable *nvar)
/* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid
* having to deal with locals and globals separately:
*/
static nir_variable *
clone_variable(clone_state *state, const nir_variable *var)
nir_variable *
nir_variable_clone(const nir_variable *var, nir_shader *shader)
{
nir_variable *nvar = rzalloc(state->ns, nir_variable);
add_remap(state, nvar, var);
nir_variable *nvar = rzalloc(shader, nir_variable);
nvar->type = var->type;
nvar->name = ralloc_strdup(nvar, var->name);
@@ -149,6 +148,15 @@ clone_variable(clone_state *state, const nir_variable *var)
return nvar;
}
/* Clone a variable into the shader being built by this clone operation
 * (state->ns) and register the new variable in the clone state's remap
 * table, keyed by the original.
 */
static nir_variable *
clone_variable(clone_state *state, const nir_variable *var)
{
   nir_variable *copy = nir_variable_clone(var, state->ns);

   /* NOTE(review): presumably this lets later references to `var` resolve
    * to `copy` -- confirm against add_remap().
    */
   add_remap(state, copy, var);

   return copy;
}
/* clone list of nir_variable: */
static void
clone_var_list(clone_state *state, struct exec_list *dst,
@@ -220,7 +228,8 @@ __clone_dst(clone_state *state, nir_instr *ninstr,
{
ndst->is_ssa = dst->is_ssa;
if (dst->is_ssa) {
nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name);
nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components,
dst->ssa.bit_size, dst->ssa.name);
add_remap(state, &ndst->ssa, &dst->ssa);
} else {
ndst->reg.reg = remap_reg(state, dst->reg.reg);
@@ -303,6 +312,7 @@ static nir_alu_instr *
clone_alu(clone_state *state, const nir_alu_instr *alu)
{
nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op);
nalu->exact = alu->exact;
__clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest);
nalu->dest.saturate = alu->dest.saturate;
+1 -1
View File
@@ -28,4 +28,4 @@
#include "nir.h"
nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
nir_const_value *src);
unsigned bit_size, nir_const_value *src);
+158 -86
View File
@@ -1,4 +1,43 @@
#! /usr/bin/python2
def type_has_size(type_):
    """Return True when the type name ends in a decimal digit.

    A trailing digit means the name already carries an explicit bit size
    (for example "float32"); an unsized name such as "float" gives False.
    """
    # Slicing (rather than indexing) keeps this safe for an empty name.
    last_char = type_[-1:]
    return last_char.isdigit()
def type_sizes(type_):
    """Return the list of bit sizes a type name can take.

    A name ending in "8", "16", "32" or "64" is pinned to that single size;
    any other name is treated as unsized and may be either 32 or 64 bits.
    """
    # Checked in the same order as the suffixes are listed.
    for suffix, sizes in (("8", [8]), ("16", [16]), ("32", [32]), ("64", [64])):
        if type_.endswith(suffix):
            return sizes
    return [32, 64]
def type_add_size(type_, size):
    """Return the type name with a bit size appended, if it lacks one.

    A name that already ends in a size (per type_has_size) is returned
    unchanged; otherwise str(size) is appended, e.g. ("float", 32) gives
    "float32".
    """
    return type_ if type_has_size(type_) else type_ + str(size)
def get_const_field(type_):
    """Map a sized type name to the nir_const_value field that holds it.

    For example "float32" maps to "f32" and "uint64" maps to "u64".
    "bool32" maps to "u32": booleans are read from and written to the
    unsigned 32-bit field.

    Raises Exception(str(type_)) for any name not in the table, including
    unsized names such as "float".
    """
    const_fields = {
        "int32": "i32",
        "uint32": "u32",
        "int64": "i64",
        "uint64": "u64",
        "bool32": "u32",
        "float32": "f32",
        "float64": "f64",
    }
    field = const_fields.get(type_)
    if field is None:
        # Unknown type name: fail loudly so a bad opcode definition is
        # caught when the file is generated.
        raise Exception(str(type_))
    return field
template = """\
/*
* Copyright (C) 2014 Intel Corporation
@@ -205,110 +244,140 @@ unpack_half_1x16(uint16_t u)
}
/* Some typed vector structures to make things like src0.y work */
% for type in ["float", "int", "uint", "bool"]:
struct ${type}_vec {
${type} x;
${type} y;
${type} z;
${type} w;
typedef float float32_t;
typedef double float64_t;
typedef bool bool32_t;
% for type in ["float", "int", "uint"]:
% for width in [32, 64]:
struct ${type}${width}_vec {
${type}${width}_t x;
${type}${width}_t y;
${type}${width}_t z;
${type}${width}_t w;
};
% endfor
% endfor
struct bool32_vec {
bool x;
bool y;
bool z;
bool w;
};
% for name, op in sorted(opcodes.iteritems()):
static nir_const_value
evaluate_${name}(unsigned num_components, nir_const_value *_src)
evaluate_${name}(unsigned num_components, unsigned bit_size,
nir_const_value *_src)
{
nir_const_value _dst_val = { { {0, 0, 0, 0} } };
## For each non-per-component input, create a variable srcN that
## contains x, y, z, and w elements which are filled in with the
## appropriately-typed values.
% for j in range(op.num_inputs):
% if op.input_sizes[j] == 0:
<% continue %>
% elif "src" + str(j) not in op.const_expr:
## Avoid unused variable warnings
<% continue %>
%endif
switch (bit_size) {
% for bit_size in [32, 64]:
case ${bit_size}: {
<%
output_type = type_add_size(op.output_type, bit_size)
input_types = [type_add_size(type_, bit_size) for type_ in op.input_types]
%>
struct ${op.input_types[j]}_vec src${j} = {
% for k in range(op.input_sizes[j]):
% if op.input_types[j] == "bool":
_src[${j}].u[${k}] != 0,
% else:
_src[${j}].${op.input_types[j][:1]}[${k}],
% endif
% endfor
};
% endfor
## For each non-per-component input, create a variable srcN that
## contains x, y, z, and w elements which are filled in with the
## appropriately-typed values.
% for j in range(op.num_inputs):
% if op.input_sizes[j] == 0:
<% continue %>
% elif "src" + str(j) not in op.const_expr:
## Avoid unused variable warnings
<% continue %>
%endif
% if op.output_size == 0:
## For per-component instructions, we need to iterate over the
## components and apply the constant expression one component
## at a time.
for (unsigned _i = 0; _i < num_components; _i++) {
## For each per-component input, create a variable srcN that
## contains the value of the current (_i'th) component.
% for j in range(op.num_inputs):
% if op.input_sizes[j] != 0:
<% continue %>
% elif "src" + str(j) not in op.const_expr:
## Avoid unused variable warnings
<% continue %>
% elif op.input_types[j] == "bool":
bool src${j} = _src[${j}].u[_i] != 0;
struct ${input_types[j]}_vec src${j} = {
% for k in range(op.input_sizes[j]):
% if input_types[j] == "bool32":
_src[${j}].u32[${k}] != 0,
% else:
${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i];
_src[${j}].${get_const_field(input_types[j])}[${k}],
% endif
% endfor
};
% endfor
% if op.output_size == 0:
## For per-component instructions, we need to iterate over the
## components and apply the constant expression one component
## at a time.
for (unsigned _i = 0; _i < num_components; _i++) {
## For each per-component input, create a variable srcN that
## contains the value of the current (_i'th) component.
% for j in range(op.num_inputs):
% if op.input_sizes[j] != 0:
<% continue %>
% elif "src" + str(j) not in op.const_expr:
## Avoid unused variable warnings
<% continue %>
% elif input_types[j] == "bool32":
bool src${j} = _src[${j}].u32[_i] != 0;
% else:
${input_types[j]}_t src${j} =
_src[${j}].${get_const_field(input_types[j])}[_i];
% endif
% endfor
## Create an appropriately-typed variable dst and assign the
## result of the const_expr to it. If const_expr already contains
## writes to dst, just include const_expr directly.
% if "dst" in op.const_expr:
${output_type}_t dst;
${op.const_expr}
% else:
${output_type}_t dst = ${op.const_expr};
% endif
## Store the current component of the actual destination to the
## value of dst.
% if output_type == "bool32":
## Sanitize the C value to a proper NIR bool
_dst_val.u32[_i] = dst ? NIR_TRUE : NIR_FALSE;
% else:
_dst_val.${get_const_field(output_type)}[_i] = dst;
% endif
}
% else:
## In the non-per-component case, create a struct dst with
## appropriately-typed elements x, y, z, and w and assign the result
## of the const_expr to all components of dst, or include the
## const_expr directly if it writes to dst already.
struct ${output_type}_vec dst;
## Create an appropriately-typed variable dst and assign the
## result of the const_expr to it. If const_expr already contains
## writes to dst, just include const_expr directly.
% if "dst" in op.const_expr:
${op.output_type} dst;
${op.const_expr}
% else:
${op.output_type} dst = ${op.const_expr};
## Splat the value to all components. This way expressions which
## write the same value to all components don't need to explicitly
## write to dest. One such example is fnoise which has a
## const_expr of 0.0f.
dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
% endif
## Store the current component of the actual destination to the
## value of dst.
% if op.output_type == "bool":
## Sanitize the C value to a proper NIR bool
_dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
% else:
_dst_val.${op.output_type[:1]}[_i] = dst;
% endif
}
% else:
## In the non-per-component case, create a struct dst with
## appropriately-typed elements x, y, z, and w and assign the result
## of the const_expr to all components of dst, or include the
## const_expr directly if it writes to dst already.
struct ${op.output_type}_vec dst;
% if "dst" in op.const_expr:
${op.const_expr}
% else:
## Splat the value to all components. This way expressions which
## write the same value to all components don't need to explicitly
## write to dest. One such example is fnoise which has a
## const_expr of 0.0f.
dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
## For each component in the destination, copy the value of dst to
## the actual destination.
% for k in range(op.output_size):
% if output_type == "bool32":
## Sanitize the C value to a proper NIR bool
_dst_val.u32[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
% else:
_dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]};
% endif
% endfor
% endif
## For each component in the destination, copy the value of dst to
## the actual destination.
% for k in range(op.output_size):
% if op.output_type == "bool":
## Sanitize the C value to a proper NIR bool
_dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
% else:
_dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]};
% endif
% endfor
% endif
break;
}
% endfor
default:
unreachable("unknown bit width");
}
return _dst_val;
}
@@ -316,12 +385,12 @@ evaluate_${name}(unsigned num_components, nir_const_value *_src)
nir_const_value
nir_eval_const_opcode(nir_op op, unsigned num_components,
nir_const_value *src)
unsigned bit_width, nir_const_value *src)
{
switch (op) {
% for name in sorted(opcodes.iterkeys()):
case nir_op_${name}: {
return evaluate_${name}(num_components, src);
return evaluate_${name}(num_components, bit_width, src);
break;
}
% endfor
@@ -333,4 +402,7 @@ nir_eval_const_opcode(nir_op op, unsigned num_components,
from nir_opcodes import opcodes
from mako.template import Template
print Template(template).render(opcodes=opcodes)
print Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
type_has_size=type_has_size,
type_add_size=type_add_size,
get_const_field=get_const_field)
+4 -2
View File
@@ -342,7 +342,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
nir_parallel_copy_entry);
nir_ssa_dest_init(&pcopy->instr, &entry->dest,
phi->dest.ssa.num_components, src->src.ssa->name);
phi->dest.ssa.num_components,
phi->dest.ssa.bit_size, src->src.ssa->name);
exec_list_push_tail(&pcopy->entries, &entry->node);
assert(src->src.is_ssa);
@@ -355,7 +356,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
nir_parallel_copy_entry);
nir_ssa_dest_init(&block_pcopy->instr, &entry->dest,
phi->dest.ssa.num_components, phi->dest.ssa.name);
phi->dest.ssa.num_components, phi->dest.ssa.bit_size,
phi->dest.ssa.name);
exec_list_push_tail(&block_pcopy->entries, &entry->node);
nir_ssa_def_rewrite_uses(&phi->dest.ssa,
+2 -2
View File
@@ -77,13 +77,13 @@ nir_gs_count_vertices(const nir_shader *shader)
return -1;
if (count == -1)
count = val->i[0];
count = val->i32[0];
/* We've found contradictory set_vertex_count intrinsics.
* This can happen if there are early-returns in main() and
* different paths emit different numbers of vertices.
*/
if (count != val->i[0])
if (count != val->i32[0])
return -1;
}
}
+18 -6
View File
@@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr)
{
hash = HASH(hash, instr->op);
hash = HASH(hash, instr->dest.dest.ssa.num_components);
/* We explicitly don't hash instr->exact */
if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
assert(nir_op_infos[instr->op].num_inputs == 2);
@@ -81,9 +82,9 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
{
hash = HASH(hash, instr->def.num_components);
hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f,
hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32,
instr->def.num_components
* sizeof(instr->value.f[0]));
* sizeof(instr->value.f32[0]));
return hash;
}
@@ -267,6 +268,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
return false;
/* We explicitly don't compare alu1->exact and alu2->exact */
if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
assert(nir_op_infos[alu1->op].num_inputs == 2);
return (nir_alu_srcs_equal(alu1, alu2, 0, 0) &&
@@ -322,8 +325,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
if (load1->def.num_components != load2->def.num_components)
return false;
return memcmp(load1->value.f, load2->value.f,
load1->def.num_components * sizeof(*load2->value.f)) == 0;
return memcmp(load1->value.f32, load2->value.f32,
load1->def.num_components * sizeof(*load2->value.f32)) == 0;
}
case nir_instr_type_phi: {
nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
@@ -496,8 +499,17 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr)
struct set_entry *entry = _mesa_set_search(instr_set, instr);
if (entry) {
nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr);
nir_ssa_def *new_def =
nir_instr_get_dest_ssa_def((nir_instr *) entry->key);
nir_instr *match = (nir_instr *) entry->key;
nir_ssa_def *new_def = nir_instr_get_dest_ssa_def(match);
/* It's safe to replace an exact instruction with an inexact one as
* long as we make it exact. If we got here, the two instructions are
* exactly identical in every other way so, once we've set the exact
* bit, they are the same.
*/
if (instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->exact)
nir_instr_as_alu(match)->exact = true;
nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def));
return true;
}
+7 -4
View File
@@ -31,9 +31,11 @@
*/
static void
nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components)
nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components,
unsigned bit_size)
{
nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
bit_size, NULL);
instr->dest.write_mask = (1 << num_components) - 1;
}
@@ -46,7 +48,7 @@ lower_reduction(nir_alu_instr *instr, nir_op chan_op, nir_op merge_op,
nir_ssa_def *last = NULL;
for (unsigned i = 0; i < num_components; i++) {
nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op);
nir_alu_ssa_dest_init(chan, 1);
nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size);
nir_alu_src_copy(&chan->src[0], &instr->src[0], chan);
chan->src[0].swizzle[0] = chan->src[0].swizzle[i];
if (nir_op_infos[chan_op].num_inputs > 1) {
@@ -80,6 +82,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
assert(instr->dest.write_mask != 0);
b->cursor = nir_before_instr(&instr->instr);
b->exact = instr->exact;
#define LOWER_REDUCTION(name, chan, merge) \
case name##2: \
@@ -220,7 +223,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan];
}
nir_alu_ssa_dest_init(lower, 1);
nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size);
lower->dest.saturate = instr->dest.saturate;
comps[chan] = &lower->dest.dest.ssa;
+6 -6
View File
@@ -75,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr,
state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index);
nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
offset_const->value.u[0] = instr->variables[0]->var->data.offset;
offset_const->value.u32[0] = instr->variables[0]->var->data.offset;
nir_instr_insert_before(&instr->instr, &offset_const->instr);
@@ -90,17 +90,17 @@ lower_instr(nir_intrinsic_instr *instr,
unsigned child_array_elements = tail->child != NULL ?
glsl_get_aoa_size(tail->type) : 1;
offset_const->value.u[0] += deref_array->base_offset *
offset_const->value.u32[0] += deref_array->base_offset *
child_array_elements * ATOMIC_COUNTER_SIZE;
if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
nir_load_const_instr *atomic_counter_size =
nir_load_const_instr_create(mem_ctx, 1);
atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
mul->dest.write_mask = 0x1;
nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul);
mul->src[1].src.is_ssa = true;
@@ -108,7 +108,7 @@ lower_instr(nir_intrinsic_instr *instr,
nir_instr_insert_before(&instr->instr, &mul->instr);
nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd);
nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
add->dest.write_mask = 0x1;
add->src[0].src.is_ssa = true;
add->src[0].src.ssa = &mul->dest.dest.ssa;
@@ -125,7 +125,7 @@ lower_instr(nir_intrinsic_instr *instr,
if (instr->dest.is_ssa) {
nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
instr->dest.ssa.num_components, NULL);
instr->dest.ssa.num_components, 32, NULL);
nir_ssa_def_rewrite_uses(&instr->dest.ssa,
nir_src_for_ssa(&new_instr->dest.ssa));
} else {
+1 -1
View File
@@ -88,7 +88,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val)
load->num_components = 4;
nir_intrinsic_set_base(load, in->data.driver_location);
load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
val[0] = nir_channel(b, &load->dest.ssa, 0);
+4 -2
View File
@@ -75,8 +75,9 @@ emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
if (src == NULL) {
/* We're a load. We need to insert a phi node */
nir_phi_instr *phi = nir_phi_instr_create(b->shader);
unsigned bit_size = then_dest->bit_size;
nir_ssa_dest_init(&phi->instr, &phi->dest,
then_dest->num_components, NULL);
then_dest->num_components, bit_size, NULL);
nir_phi_src *src0 = ralloc(phi, nir_phi_src);
src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt));
@@ -125,8 +126,9 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
load->num_components = orig_instr->num_components;
load->variables[0] =
nir_deref_as_var(nir_copy_deref(load, &deref->deref));
unsigned bit_size = orig_instr->dest.ssa.bit_size;
nir_ssa_dest_init(&load->instr, &load->dest,
load->num_components, NULL);
load->num_components, bit_size, NULL);
nir_builder_instr_insert(b, &load->instr);
*dest = &load->dest.ssa;
} else {
+4 -2
View File
@@ -289,7 +289,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
if (intrin->dest.is_ssa) {
nir_ssa_dest_init(&load->instr, &load->dest,
intrin->num_components, NULL);
intrin->num_components,
intrin->dest.ssa.bit_size, NULL);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
nir_src_for_ssa(&load->dest.ssa));
} else {
@@ -369,7 +370,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
if (intrin->dest.is_ssa) {
nir_ssa_dest_init(&atomic->instr, &atomic->dest,
intrin->dest.ssa.num_components, NULL);
intrin->dest.ssa.num_components,
intrin->dest.ssa.bit_size, NULL);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
nir_src_for_ssa(&atomic->dest.ssa));
} else {
@@ -49,7 +49,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower)
nir_ssa_def *loads[4];
for (unsigned i = 0; i < lower->def.num_components; i++) {
nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1);
load_comp->value.u[0] = lower->value.u[i];
load_comp->value.u32[0] = lower->value.u32[i];
nir_builder_instr_insert(&b, &load_comp->instr);
loads[i] = &load_comp->def;
}
+5 -4
View File
@@ -161,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
if (src.reg.indirect) {
nir_load_const_instr *load_const =
nir_load_const_instr_create(state->shader, 1);
load_const->value.u[0] = glsl_get_length(parent_type);
load_const->value.u32[0] = glsl_get_length(parent_type);
nir_instr_insert_before(instr, &load_const->instr);
nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul);
@@ -169,7 +169,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
mul->src[1].src.is_ssa = true;
mul->src[1].src.ssa = &load_const->def;
mul->dest.write_mask = 1;
nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
nir_instr_insert_before(instr, &mul->instr);
src.reg.indirect->is_ssa = true;
@@ -187,7 +187,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
add->src[0].src = *src.reg.indirect;
nir_src_copy(&add->src[1].src, &deref_array->indirect, add);
add->dest.write_mask = 1;
nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
nir_instr_insert_before(instr, &add->instr);
src.reg.indirect->is_ssa = true;
@@ -221,7 +221,8 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
mov->dest.write_mask = (1 << intrin->num_components) - 1;
if (intrin->dest.is_ssa) {
nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
intrin->num_components, NULL);
intrin->num_components,
intrin->dest.ssa.bit_size, NULL);
nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
nir_src_for_ssa(&mov->dest.dest.ssa));
} else {
+7 -3
View File
@@ -188,6 +188,8 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
if (!should_lower_phi(phi, state))
continue;
unsigned bit_size = phi->dest.ssa.bit_size;
/* Create a vecN operation to combine the results. Most of these
* will be redundant, but copy propagation should clean them up for
* us. No need to add the complexity here.
@@ -202,12 +204,14 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op);
nir_ssa_dest_init(&vec->instr, &vec->dest.dest,
phi->dest.ssa.num_components, NULL);
phi->dest.ssa.num_components,
bit_size, NULL);
vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) {
nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx);
nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, NULL);
nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1,
phi->dest.ssa.bit_size, NULL);
vec->src[i].src = nir_src_for_ssa(&new_phi->dest.ssa);
@@ -215,7 +219,7 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
/* We need to insert a mov to grab the i'th component of src */
nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx,
nir_op_imov);
nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL);
nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL);
mov->dest.write_mask = 1;
nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx);
mov->src[0].swizzle[0] = i;
+3 -3
View File
@@ -65,9 +65,9 @@ convert_block(nir_block *block, void *void_state)
*/
nir_const_value local_size;
local_size.u[0] = b->shader->info.cs.local_size[0];
local_size.u[1] = b->shader->info.cs.local_size[1];
local_size.u[2] = b->shader->info.cs.local_size[2];
local_size.u32[0] = b->shader->info.cs.local_size[0];
local_size.u32[1] = b->shader->info.cs.local_size[1];
local_size.u32[2] = b->shader->info.cs.local_size[2];
nir_ssa_def *group_id =
nir_load_system_value(b, nir_intrinsic_load_work_group_id, 0);
+4 -4
View File
@@ -140,7 +140,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex)
txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0));
txs->src[0].src_type = nir_tex_src_lod;
nir_ssa_dest_init(&txs->instr, &txs->dest, 2, NULL);
nir_ssa_dest_init(&txs->instr, &txs->dest, 2, 32, NULL);
nir_builder_instr_insert(b, &txs->instr);
return nir_i2f(b, &txs->dest.ssa);
@@ -223,13 +223,13 @@ get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val)
memset(&v, 0, sizeof(v));
if (swizzle_val == 4) {
v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0;
v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 0;
} else {
assert(swizzle_val == 5);
if (type == nir_type_float)
v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0;
v.f32[0] = v.f32[1] = v.f32[2] = v.f32[3] = 1.0;
else
v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1;
v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 1;
}
return nir_build_imm(b, 4, v);
+1 -1
View File
@@ -74,7 +74,7 @@ load_input(nir_builder *b, nir_variable *in)
load->num_components = 4;
nir_intrinsic_set_base(load, in->data.driver_location);
load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
return &load->dest.ssa;
+4 -1
View File
@@ -116,12 +116,15 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
assert(src_tail->type == dest_tail->type);
unsigned num_components = glsl_get_vector_elements(src_tail->type);
unsigned bit_size =
glsl_get_bit_size(glsl_get_base_type(src_tail->type));
nir_intrinsic_instr *load =
nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_load_var);
load->num_components = num_components;
load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &src_head->deref));
nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size,
NULL);
nir_instr_insert_before(&copy_instr->instr, &load->instr);
+4 -1
View File
@@ -505,6 +505,7 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
nir_ssa_undef_instr *undef =
nir_ssa_undef_instr_create(state->shader,
intrin->num_components);
undef->def.bit_size = intrin->dest.ssa.bit_size;
nir_instr_insert_before(&intrin->instr, &undef->instr);
nir_instr_remove(&intrin->instr);
@@ -528,7 +529,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
mov->dest.write_mask = (1 << intrin->num_components) - 1;
nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
intrin->num_components, NULL);
intrin->num_components,
intrin->dest.ssa.bit_size, NULL);
nir_instr_insert_before(&intrin->instr, &mov->instr);
nir_instr_remove(&intrin->instr);
@@ -719,6 +721,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
node->pb_value =
nir_phi_builder_add_value(state.phi_builder,
glsl_get_vector_elements(node->type),
glsl_get_bit_size(glsl_get_base_type(node->type)),
store_blocks);
if (node->deref->var->constant_initializer) {
+71 -67
View File
@@ -90,8 +90,12 @@ class Opcode(object):
# helper variables for strings
tfloat = "float"
tint = "int"
tbool = "bool"
tbool = "bool32"
tuint = "uint"
tfloat32 = "float32"
tint32 = "int32"
tuint32 = "uint32"
tfloat64 = "float64"
commutative = "commutative "
associative = "associative "
@@ -155,57 +159,57 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)")
unop("fsqrt", tfloat, "sqrtf(src0)")
unop("fexp2", tfloat, "exp2f(src0)")
unop("flog2", tfloat, "log2f(src0)")
unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion
unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion.
unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
# Float-to-boolean conversion
unop_convert("f2b", tbool, tfloat, "src0 != 0.0f")
unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
# Boolean-to-float conversion
unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f")
unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
# Int-to-boolean conversion
unop_convert("i2b", tbool, tint, "src0 != 0")
unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion.
unop_convert("i2b", tbool, tint32, "src0 != 0")
unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
# Unary floating-point rounding operations.
unop("ftrunc", tfloat, "truncf(src0)")
unop("fceil", tfloat, "ceilf(src0)")
unop("ffloor", tfloat, "floorf(src0)")
unop("ffract", tfloat, "src0 - floorf(src0)")
unop("fround_even", tfloat, "_mesa_roundevenf(src0)")
unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
# Trigonometric operations.
unop("fsin", tfloat, "sinf(src0)")
unop("fcos", tfloat, "cosf(src0)")
unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
# Partial derivatives.
unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0f")
unop("fddx_fine", tfloat, "0.0f")
unop("fddy_fine", tfloat, "0.0f")
unop("fddx_coarse", tfloat, "0.0f")
unop("fddy_coarse", tfloat, "0.0f")
unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
unop("fddy", tfloat, "0.0")
unop("fddx_fine", tfloat, "0.0")
unop("fddy_fine", tfloat, "0.0")
unop("fddx_coarse", tfloat, "0.0")
unop("fddy_coarse", tfloat, "0.0")
# Floating point pack and unpack operations.
def pack_2x16(fmt):
unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """
unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
""".replace("fmt", fmt))
def pack_4x8(fmt):
unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """
unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
@@ -213,13 +217,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
""".replace("fmt", fmt))
def unpack_2x16(fmt):
unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """
# dst.x decodes the low half of src0.x and dst.y decodes the high half. The
# high half has to be shifted down before the uint16_t cast; shifting it up
# leaves only zeros in the low 16 bits that the cast keeps.
unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
def unpack_4x8(fmt):
unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """
unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
@@ -238,11 +242,11 @@ unpack_2x16("unorm")
unpack_4x8("unorm")
unpack_2x16("half")
unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
# Packs two components into one 32-bit word: src0.x supplies the low 16 bits
# and src0.y is shifted up so that it lands in the high 16 bits.
unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")
unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
dst.x = (src0.x << 0) |
(src0.y << 8) |
(src0.z << 16) |
@@ -252,22 +256,22 @@ dst.x = (src0.x << 0) |
# Lowered floating point unpacking operations.
unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint,
unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
"unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint,
unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
"unpack_half_1x16((uint16_t)(src0.x >> 16))")
# Bit operations, part of ARB_gpu_shader5.
unop("bitfield_reverse", tuint, """
unop("bitfield_reverse", tuint32, """
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
unop("bit_count", tuint, """
unop("bit_count", tuint32, """
dst = 0;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1)
@@ -275,7 +279,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
}
""")
unop_convert("ufind_msb", tint, tuint, """
unop_convert("ufind_msb", tint32, tuint32, """
dst = -1;
for (int bit = 31; bit > 0; bit--) {
if ((src0 >> bit) & 1) {
@@ -285,7 +289,7 @@ for (int bit = 31; bit > 0; bit--) {
}
""")
unop("ifind_msb", tint, """
unop("ifind_msb", tint32, """
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
/* If src0 < 0, we're looking for the first 0 bit.
@@ -299,7 +303,7 @@ for (int bit = 31; bit >= 0; bit--) {
}
""")
unop("find_lsb", tint, """
unop("find_lsb", tint32, """
dst = -1;
for (unsigned bit = 0; bit < 32; bit++) {
if ((src0 >> bit) & 1) {
@@ -359,10 +363,10 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, commutative + associative, "src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint, commutative,
binop("imul_high", tint32, commutative,
"(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint, commutative,
binop("umul_high", tuint32, commutative,
"(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
binop("fdiv", tfloat, "", "src0 / src1")
@@ -427,18 +431,18 @@ binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
binop_reduce("fall_equal", 1, tfloat, tfloat, "{src0} == {src1}",
binop_reduce("fall_equal", 1, tfloat32, tfloat32, "{src0} == {src1}",
"{src0} && {src1}", "{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
"{src0} || {src1}", "{src} ? 1.0f : 0.0f")
# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
binop("ishl", tint, "", "src0 << src1")
@@ -461,11 +465,11 @@ binop("ixor", tuint, commutative + associative, "src0 ^ src1")
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false
binop("fand", tfloat, commutative,
binop("fand", tfloat32, commutative,
"((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat, commutative,
binop("for", tfloat32, commutative,
"((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat, commutative,
binop("fxor", tfloat32, commutative,
"(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
@@ -487,7 +491,7 @@ binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint, commutative + associative, """
binop("usadd_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
@@ -495,7 +499,7 @@ for (int i = 0; i < 32; i += 8) {
""")
# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint, "", """
binop("ussub_4x8", tint32, "", """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
@@ -506,7 +510,7 @@ for (int i = 0; i < 32; i += 8) {
""")
# vector min for 4 8bit ints.
binop("umin_4x8", tint, commutative + associative, """
binop("umin_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -514,7 +518,7 @@ for (int i = 0; i < 32; i += 8) {
""")
# vector max for 4 8bit ints.
binop("umax_4x8", tint, commutative + associative, """
binop("umax_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -522,7 +526,7 @@ for (int i = 0; i < 32; i += 8) {
""")
# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint, commutative + associative, """
binop("umul_unorm_4x8", tint32, commutative + associative, """
dst = 0;
for (int i = 0; i < 32; i += 8) {
int src0_chan = (src0 >> i) & 0xff;
@@ -531,15 +535,15 @@ for (int i = 0; i < 32; i += 8) {
}
""")
binop("fpow", tfloat, "", "powf(src0, src1)")
# Same selection as the other sized float ops in this file: the double
# function (pow) for 64-bit operands, the float function (powf) otherwise.
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
"pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", tuint, tint, "", """
binop_convert("bfm", tuint32, tint32, "", """
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
dst = 0; /* undefined */
@@ -548,7 +552,7 @@ else
""")
opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
dst = ldexpf(src0, src1);
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
dst = copysignf(0.0f, src0);
@@ -588,12 +592,12 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
opcode("bcsel", 0, tuint, [0, 0, 0],
[tbool, tuint, tuint], "", "src0 ? src1 : src2")
# SM5 bfi assembly
triop("bfi", tuint, """
triop("bfi", tuint32, """
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
dst = base;
@@ -608,8 +612,8 @@ if (mask == 0) {
""")
# SM5 ubfe/ibfe assembly
opcode("ubfe", 0, tuint,
[0, 0, 0], [tuint, tint, tint], "", """
opcode("ubfe", 0, tuint32,
[0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
@@ -622,8 +626,8 @@ if (bits == 0) {
dst = base >> offset;
}
""")
opcode("ibfe", 0, tint,
[0, 0, 0], [tint, tint, tint], "", """
opcode("ibfe", 0, tint32,
[0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
@@ -638,8 +642,8 @@ if (bits == 0) {
""")
# GLSL bitfieldExtract()
opcode("ubitfield_extract", 0, tuint,
[0, 0, 0], [tuint, tint, tint], "", """
opcode("ubitfield_extract", 0, tuint32,
[0, 0, 0], [tuint32, tint32, tint32], "", """
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
@@ -650,8 +654,8 @@ if (bits == 0) {
dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
opcode("ibitfield_extract", 0, tint,
[0, 0, 0], [tint, tint, tint], "", """
opcode("ibitfield_extract", 0, tint32,
[0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
@@ -678,8 +682,8 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
[tuint, tuint, tuint, tuint],
"", const_expr)
opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0],
[tuint, tuint, tint, tint], "", """
opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
[tuint32, tuint32, tint32, tint32], "", """
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
+92 -59
View File
@@ -35,10 +35,17 @@ d = 'd'
# Written in the form (<search>, <replace>) where <search> is an expression
# and <replace> is either an expression or a value. An expression is
# defined as a tuple of the form (<op>, <src0>, <src1>, <src2>, <src3>)
# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
# where each source is either an expression or a value. A value can be
# either a numeric constant or a string representing a variable name.
#
# If the opcode in a search expression is prefixed by a '~' character, this
# indicates that the operation is inexact. Such operations will only get
# applied to SSA values that do not have the exact bit set. This should be
# used by any optimizations that are not bit-for-bit exact. It should not,
# however, be used for backend-requested lowering operations as those need to
# happen regardless of precision.
#
# Variable names are specified as "[#]name[@type]" where "#" indicates that
# the given variable will only match constants and the type indicates that
# the given variable will only match values from ALU instructions with the
@@ -55,19 +62,19 @@ optimizations = [
(('fabs', ('fneg', a)), ('fabs', a)),
(('iabs', ('iabs', a)), ('iabs', a)),
(('iabs', ('ineg', a)), ('iabs', a)),
(('fadd', a, 0.0), a),
(('~fadd', a, 0.0), a),
(('iadd', a, 0), a),
(('usadd_4x8', a, 0), a),
(('usadd_4x8', a, ~0), ~0),
(('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
(('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
(('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
(('fadd', ('fneg', a), a), 0.0),
(('~fadd', ('fneg', a), a), 0.0),
(('iadd', ('ineg', a), a), 0),
(('iadd', ('ineg', a), ('iadd', a, b)), b),
(('iadd', a, ('iadd', ('ineg', a), b)), b),
(('fadd', ('fneg', a), ('fadd', a, b)), b),
(('fadd', a, ('fadd', ('fneg', a), b)), b),
(('fmul', a, 0.0), 0.0),
(('~fadd', ('fneg', a), ('fadd', a, b)), b),
(('~fadd', a, ('fadd', ('fneg', a), b)), b),
(('~fmul', a, 0.0), 0.0),
(('imul', a, 0), 0),
(('umul_unorm_4x8', a, 0), 0),
(('umul_unorm_4x8', a, ~0), a),
@@ -76,32 +83,48 @@ optimizations = [
(('fmul', a, -1.0), ('fneg', a)),
(('imul', a, -1), ('ineg', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
(('ffma', 0.0, a, b), b),
(('ffma', a, 0.0, b), b),
(('ffma', a, b, 0.0), ('fmul', a, b)),
(('~ffma', 0.0, a, b), b),
(('~ffma', a, 0.0, b), b),
(('~ffma', a, b, 0.0), ('fmul', a, b)),
(('ffma', a, 1.0, b), ('fadd', a, b)),
(('ffma', 1.0, a, b), ('fadd', a, b)),
(('flrp', a, b, 0.0), a),
(('flrp', a, b, 1.0), b),
(('flrp', a, a, b), a),
(('flrp', 0.0, a, b), ('fmul', a, b)),
(('~flrp', a, b, 0.0), a),
(('~flrp', a, b, 1.0), b),
(('~flrp', a, a, b), a),
(('~flrp', 0.0, a, b), ('fmul', a, b)),
(('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
(('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'),
(('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
(('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp'),
(('fadd', a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
(('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
(('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp'),
(('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
(('~fadd', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
(('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
(('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
(('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
# Comparison simplifications
(('inot', ('flt', a, b)), ('fge', a, b)),
(('inot', ('fge', a, b)), ('flt', a, b)),
(('inot', ('feq', a, b)), ('fne', a, b)),
(('inot', ('fne', a, b)), ('feq', a, b)),
(('~inot', ('flt', a, b)), ('fge', a, b)),
(('~inot', ('fge', a, b)), ('flt', a, b)),
(('~inot', ('feq', a, b)), ('fne', a, b)),
(('~inot', ('fne', a, b)), ('feq', a, b)),
(('inot', ('ilt', a, b)), ('ige', a, b)),
(('inot', ('ige', a, b)), ('ilt', a, b)),
(('inot', ('ieq', a, b)), ('ine', a, b)),
(('inot', ('ine', a, b)), ('ieq', a, b)),
# 0.0 >= b2f(a)
# b2f(a) <= 0.0
# b2f(a) == 0.0 because b2f(a) can only be 0 or 1
# inot(a)
(('fge', 0.0, ('b2f', a)), ('inot', a)),
# 0.0 < fabs(a)
# fabs(a) > 0.0
# fabs(a) != 0.0 because fabs(a) must be >= 0
# a != 0.0
(('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
(('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
(('bcsel', ('flt', a, b), a, b), ('fmin', a, b)),
(('bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
(('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
(('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
(('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
@@ -111,15 +134,19 @@ optimizations = [
(('imax', a, a), a),
(('umin', a, a), a),
(('umax', a, a), a),
(('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
(('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
(('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
(('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
(('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
(('fsat', ('fsat', a)), ('fsat', a)),
(('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
(('ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
(('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
(('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
(('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
(('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
(('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
(('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
(('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
(('fabs', ('slt', a, b)), ('slt', a, b)),
(('fabs', ('sge', a, b)), ('sge', a, b)),
(('fabs', ('seq', a, b)), ('seq', a, b)),
(('fabs', ('sne', a, b)), ('sne', a, b)),
(('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
(('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
(('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
@@ -151,7 +178,6 @@ optimizations = [
(('ior', a, 0), a),
(('fxor', a, a), 0.0),
(('ixor', a, a), 0),
(('fxor', a, 0.0), a),
(('ixor', a, 0), a),
(('inot', ('inot', a)), a),
# DeMorgan's Laws
@@ -167,35 +193,35 @@ optimizations = [
(('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
(('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
# Exponential/logarithmic identities
(('fexp2', ('flog2', a)), a), # 2^lg2(a) = a
(('flog2', ('fexp2', a)), a), # lg2(2^a) = a
(('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
(('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
(('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
(('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
(('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
(('fpow', a, 1.0), a),
(('fpow', a, 2.0), ('fmul', a, a)),
(('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
(('fpow', 2.0, a), ('fexp2', a)),
(('fpow', ('fpow', a, 2.2), 0.454545), a),
(('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
(('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
(('frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
(('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
(('flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
(('flog2', ('frcp', a)), ('fneg', ('flog2', a))),
(('flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
(('flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
(('fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
(('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
(('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
(('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
(('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
(('~fpow', a, 1.0), a),
(('~fpow', a, 2.0), ('fmul', a, a)),
(('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
(('~fpow', 2.0, a), ('fexp2', a)),
(('~fpow', ('fpow', a, 2.2), 0.454545), a),
(('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
(('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
(('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
(('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
(('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
(('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
(('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
(('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
(('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
(('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
(('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
# Division and reciprocal
(('fdiv', 1.0, a), ('frcp', a)),
(('~fdiv', 1.0, a), ('frcp', a)),
(('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
(('frcp', ('frcp', a)), a),
(('frcp', ('fsqrt', a)), ('frsq', a)),
(('~frcp', ('frcp', a)), a),
(('~frcp', ('fsqrt', a)), ('frsq', a)),
(('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
(('frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
(('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
# Boolean simplifications
(('ieq', 'a@bool', True), a),
(('ine', 'a@bool', True), ('inot', a)),
@@ -216,6 +242,10 @@ optimizations = [
(('i2b', ('b2i', a)), a),
(('f2i', ('ftrunc', a)), ('f2i', a)),
(('f2u', ('ftrunc', a)), ('f2u', a)),
(('i2b', ('ineg', a)), ('i2b', a)),
(('i2b', ('iabs', a)), ('i2b', a)),
(('fabs', ('b2f', a)), ('b2f', a)),
(('iabs', ('b2i', a)), ('b2i', a)),
# Byte extraction
(('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
@@ -228,7 +258,7 @@ optimizations = [
(('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
# Subtracts
(('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
(('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
(('isub', a, ('isub', 0, b)), ('iadd', a, b)),
(('ussub_4x8', a, 0), a),
(('ussub_4x8', a, ~0), 0),
@@ -236,7 +266,7 @@ optimizations = [
(('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
(('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
(('ineg', a), ('isub', 0, a), 'options->lower_negate'),
(('fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
(('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
(('iadd', a, ('isub', 0, b)), ('isub', a, b)),
(('fabs', ('fsub', 0.0, a)), ('fabs', a)),
(('iabs', ('isub', 0, a)), ('iabs', a)),
@@ -368,10 +398,13 @@ for op in ['flt', 'fge', 'feq', 'fne',
# they help code generation but do not necessarily produce code that is
# more easily optimizable.
late_optimizations = [
# Most of these optimizations aren't quite safe when you get infinity or
# Nan involved but the first one should be fine.
(('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))),
(('fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
(('feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
(('fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
(('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
(('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
(('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
(('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
(('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
(('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
+28 -3
View File
@@ -46,10 +46,28 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
if (!instr->dest.dest.is_ssa)
return false;
/* In the case that any outputs/inputs have unsized types, then we need to
* guess the bit-size. In this case, the validator ensures that all
* bit-sizes match so we can just take the bit-size from first
* output/input with an unsized type. If all the outputs/inputs are sized
* then we don't need to guess the bit-size at all because the code we
* generate for constant opcodes in this case already knows the sizes of
* the types involved and does not need the provided bit-size for anything
* (although it still requires to receive a valid bit-size).
*/
unsigned bit_size = 0;
if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type))
bit_size = instr->dest.dest.ssa.bit_size;
for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
if (!instr->src[i].src.is_ssa)
return false;
/* Mirror the output_type check above: the size query takes an ALU type, so
 * consult this operand's type entry rather than its input_sizes[] entry.
 * The first unsized operand found supplies the bit-size.
 */
if (bit_size == 0 &&
    !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_types[i])) {
   bit_size = instr->src[i].src.ssa->bit_size;
}
nir_instr *src_instr = instr->src[i].src.ssa->parent_instr;
if (src_instr->type != nir_instr_type_load_const)
@@ -58,24 +76,31 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i);
j++) {
src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]];
if (load_const->def.bit_size == 64)
src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]];
else
src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]];
}
/* We shouldn't have any source modifiers in the optimization loop. */
assert(!instr->src[i].abs && !instr->src[i].negate);
}
if (bit_size == 0)
bit_size = 32;
/* We shouldn't have any saturate modifiers in the optimization loop. */
assert(!instr->dest.saturate);
nir_const_value dest =
nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components,
src);
bit_size, src);
nir_load_const_instr *new_instr =
nir_load_const_instr_create(mem_ctx,
instr->dest.dest.ssa.num_components);
new_instr->def.bit_size = instr->dest.dest.ssa.bit_size;
new_instr->value = dest;
nir_instr_insert_before(&instr->instr, &new_instr->instr);
@@ -106,7 +131,7 @@ constant_fold_deref(nir_instr *instr, nir_deref_var *deref)
nir_load_const_instr *indirect =
nir_instr_as_load_const(arr->indirect.ssa->parent_instr);
arr->base_offset += indirect->value.u[0];
arr->base_offset += indirect->value.u32[0];
/* Clear out the source */
nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL));
+1 -1
View File
@@ -228,7 +228,7 @@ dead_cf_block(nir_block *block)
if (!const_value)
return false;
opt_constant_if(following_if, const_value->u[0] != 0);
opt_constant_if(following_if, const_value->u32[0] != 0);
return true;
}
+2 -1
View File
@@ -210,7 +210,8 @@ nir_opt_peephole_select_block(nir_block *block, void *void_state)
}
nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
phi->dest.ssa.num_components, phi->dest.ssa.name);
phi->dest.ssa.num_components,
phi->dest.ssa.bit_size, phi->dest.ssa.name);
sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
nir_ssa_def_rewrite_uses(&phi->dest.ssa,
+53 -12
View File
@@ -52,6 +52,7 @@ struct nir_phi_builder_value {
/* Needed so we can create phis and undefs */
unsigned num_components;
unsigned bit_size;
/* The list of phi nodes associated with this value. Phi nodes are not
* added directly. Instead, they are created, the instr->block pointer
@@ -61,8 +62,18 @@ struct nir_phi_builder_value {
*/
struct exec_list phis;
/* Array of SSA defs, indexed by block. If a phi needs to be inserted
* in a given block, it will have the magic value NEEDS_PHI.
/* Array of SSA defs, indexed by block. For each block, this array has
* one of three types of values:
*
* - NULL. Indicates that there is no known definition in this block. If
* you need to find one, look at the block's immediate dominator.
*
* - NEEDS_PHI. Indicates that the block may need a phi node but none has
* been created yet. If a def is requested for a block, a phi will need
* to be created.
*
* - A regular SSA def. This will be either the result of a phi node or
* one of the defs provided by nir_phi_builder_value_set_block_def().
*/
nir_ssa_def *defs[0];
};
@@ -101,7 +112,7 @@ nir_phi_builder_create(nir_function_impl *impl)
struct nir_phi_builder_value *
nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
const BITSET_WORD *defs)
unsigned bit_size, const BITSET_WORD *defs)
{
struct nir_phi_builder_value *val;
unsigned i, w_start = 0, w_end = 0;
@@ -109,6 +120,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks);
val->builder = pb;
val->num_components = num_components;
val->bit_size = bit_size;
exec_list_make_empty(&val->phis);
exec_list_push_tail(&pb->values, &val->node);
@@ -127,8 +139,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
set_foreach(cur->dom_frontier, dom_entry) {
nir_block *next = (nir_block *) dom_entry->key;
/*
* If there's more than one return statement, then the end block
/* If there's more than one return statement, then the end block
* can be a join point for some definitions. However, there are
* no instructions in the end block, so nothing would use those
* phi nodes. Of course, we couldn't place those phi nodes
@@ -139,6 +150,10 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
continue;
if (val->defs[next->index] == NULL) {
/* Instead of creating a phi node immediately, we simply set the
* value to the magic value NEEDS_PHI. Later, we create phi nodes
* on demand in nir_phi_builder_value_get_block_def().
*/
val->defs[next->index] = NEEDS_PHI;
if (pb->work[next->index] < pb->iter_count) {
@@ -163,7 +178,9 @@ nir_ssa_def *
nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
nir_block *block)
{
/* For each block, we have one of three types of values */
if (val->defs[block->index] == NULL) {
/* NULL indicates that we have no SSA def for this block. */
if (block->imm_dom) {
/* Grab it from our immediate dominator. We'll stash it here for
* easy access later.
@@ -185,17 +202,36 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
return &undef->def;
}
} else if (val->defs[block->index] == NEEDS_PHI) {
/* If we need a phi instruction, go ahead and create one but don't
* add it to the program yet. Later, we'll go through and set up phi
* sources and add the instructions will be added at that time.
/* The magic value NEEDS_PHI indicates that the block needs a phi node
* but none has been created. We need to create one now so we can
* return it to the caller.
*
* Because a phi node may use SSA defs that it does not dominate (this
* happens in loops), we do not yet have enough information to fully
* fill out the phi node. Instead, the phi nodes we create here will be
* empty (have no sources) and won't actually be placed in the block's
* instruction list yet. Later, in nir_phi_builder_finish(), we walk
* over all of the phi instructions, fill out the sources lists, and
* place them at the top of their respective block's instruction list.
*
* Creating phi nodes on-demand allows us to avoid creating dead phi
* nodes that will just get deleted later. While this probably isn't a
* big win for a full into-SSA pass, other users may use the phi builder
* to make small SSA form repairs where most of the phi nodes will never
* be used.
*/
nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader);
nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, NULL);
nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components,
val->bit_size, NULL);
phi->instr.block = block;
exec_list_push_tail(&val->phis, &phi->instr.node);
val->defs[block->index] = &phi->dest.ssa;
return &phi->dest.ssa;
} else {
/* In this case, we have an actual SSA def. It's either the result of a
* phi node created by the case above or one passed to us through
* nir_phi_builder_value_set_block_def().
*/
return val->defs[block->index];
}
}
@@ -216,9 +252,14 @@ nir_phi_builder_finish(struct nir_phi_builder *pb)
NIR_VLA(nir_block *, preds, num_blocks);
foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) {
/* We can't iterate over the list of phis normally because we are
* removing them as we go and, in some cases, adding new phis as we
* build the source lists of others.
/* We treat the linked list of phi nodes like a worklist. The list is
* pre-populated by calls to nir_phi_builder_value_get_block_def() that
* create phi nodes. As we fill in the sources of phi nodes, more may
* be created and are added to the end of the list.
*
* Because we are adding and removing phi nodes from the list as we go,
* we can't iterate over it normally. Instead, we just iterate until
* the list is empty.
*/
while (!exec_list_is_empty(&val->phis)) {
struct exec_node *head = exec_list_get_head(&val->phis);
+32 -1
View File
@@ -25,7 +25,38 @@
#include "nir.h"
/** A helper for placing phi nodes in a NIR shader
*
* Basic usage goes something like this:
*
* each variable, var, has:
* a bitset var.defs of blocks where the variable is defined
* a struct nir_phi_builder_value *pb_val
*
* // initialize bitsets
* foreach block:
* foreach def of variable var:
* var.defs[def.block] = true;
*
* // initialize phi builder
* pb = nir_phi_builder_create()
* foreach var:
* var.pb_val = nir_phi_builder_add_value(pb, var.defs)
*
* // Visit each block. This needs to visit dominators first;
* // nir_for_each_block() will be ok.
* foreach block:
* foreach instruction:
* foreach use of variable var:
* replace use with
* nir_phi_builder_value_get_block_def(var.pb_val, block)
* foreach def of variable var:
* create ssa def, register with
* nir_phi_builder_value_set_block_def(var.pb_val, block, def)
*
* nir_phi_builder_finish(pb)
*/
struct nir_phi_builder;
struct nir_phi_builder_value;
/* Create a new phi builder.
@@ -43,7 +74,7 @@ struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl);
*/
struct nir_phi_builder_value *
nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
const BITSET_WORD *defs);
unsigned bit_size, const BITSET_WORD *defs);
/* Register a definition for the given value and block.
*
+3 -1
View File
@@ -207,6 +207,8 @@ print_alu_instr(nir_alu_instr *instr, print_state *state)
print_alu_dest(&instr->dest, state);
fprintf(fp, " = %s", nir_op_infos[instr->op].name);
if (instr->exact)
fprintf(fp, "!");
if (instr->dest.saturate)
fprintf(fp, ".sat");
fprintf(fp, " ");
@@ -714,7 +716,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state)
* and then print the float in a comment for readability.
*/
fprintf(fp, "0x%08x /* %f */", instr->value.u[i], instr->value.f[i]);
fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
}
fprintf(fp, ")");
+2 -1
View File
@@ -85,7 +85,8 @@ repair_ssa_def(nir_ssa_def *def, void *void_state)
BITSET_SET(state->def_set, def->parent_instr->block->index);
struct nir_phi_builder_value *val =
nir_phi_builder_add_value(pb, def->num_components, state->def_set);
nir_phi_builder_add_value(pb, def->num_components, def->bit_size,
state->def_set);
nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def);
+232 -24
View File
@@ -62,7 +62,8 @@ alu_instr_is_bool(nir_alu_instr *instr)
case nir_op_inot:
return src_is_bool(instr->src[0].src);
default:
return nir_op_infos[instr->op].output_type == nir_type_bool;
return (nir_alu_type_get_base_type(nir_op_infos[instr->op].output_type)
== nir_type_bool);
}
}
@@ -125,8 +126,10 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
nir_alu_instr *src_alu =
nir_instr_as_alu(instr->src[src].src.ssa->parent_instr);
if (nir_op_infos[src_alu->op].output_type != var->type &&
!(var->type == nir_type_bool && alu_instr_is_bool(src_alu)))
if (nir_alu_type_get_base_type(nir_op_infos[src_alu->op].output_type) !=
var->type &&
!(nir_alu_type_get_base_type(var->type) == nir_type_bool &&
alu_instr_is_bool(src_alu)))
return false;
}
@@ -158,21 +161,65 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
nir_load_const_instr *load =
nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr);
switch (nir_op_infos[instr->op].input_types[src]) {
switch (const_val->type) {
case nir_type_float:
for (unsigned i = 0; i < num_components; ++i) {
if (load->value.f[new_swizzle[i]] != const_val->data.f)
double val;
switch (load->def.bit_size) {
case 32:
val = load->value.f32[new_swizzle[i]];
break;
case 64:
val = load->value.f64[new_swizzle[i]];
break;
default:
unreachable("unknown bit size");
}
if (val != const_val->data.d)
return false;
}
return true;
case nir_type_int:
case nir_type_uint:
case nir_type_bool:
for (unsigned i = 0; i < num_components; ++i) {
if (load->value.i[new_swizzle[i]] != const_val->data.i)
int64_t val;
switch (load->def.bit_size) {
case 32:
val = load->value.i32[new_swizzle[i]];
break;
case 64:
val = load->value.i64[new_swizzle[i]];
break;
default:
unreachable("unknown bit size");
}
if (val != const_val->data.i)
return false;
}
return true;
case nir_type_uint:
case nir_type_bool32:
for (unsigned i = 0; i < num_components; ++i) {
uint64_t val;
switch (load->def.bit_size) {
case 32:
val = load->value.u32[new_swizzle[i]];
break;
case 64:
val = load->value.u64[new_swizzle[i]];
break;
default:
unreachable("unknown bit size");
}
if (val != const_val->data.u)
return false;
}
return true;
default:
unreachable("Invalid alu source type");
}
@@ -191,6 +238,10 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
if (instr->op != expr->opcode)
return false;
assert(instr->dest.dest.is_ssa);
if (expr->inexact && instr->exact)
return false;
assert(!instr->dest.saturate);
assert(nir_op_infos[instr->op].num_inputs > 0);
@@ -244,9 +295,123 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
}
}
/* Mirrors the shape of a search value (expression, variable or constant)
 * and records the bit size of each node's destination and sources.  A size
 * of 0 is treated as "not known yet" by the passes that fill the tree in.
 */
typedef struct bitsize_tree {
/* Number of valid entries in srcs[]; 0 for variable and constant leaves. */
unsigned num_srcs;
/* One child tree per source of an expression node. */
struct bitsize_tree *srcs[4];
/* Bit size shared by every source (and the destination) whose type does
 * not carry an explicit size; 0 until it has been determined.
 */
unsigned common_size;
/* True when the opcode's input type for that source has an explicit size. */
bool is_src_sized[4];
/* True when the destination size is fixed: set for variables, and for
 * expressions whose opcode output type has an explicit size.
 */
bool is_dest_sized;
/* Bit size of the value this node produces; 0 while unknown. */
unsigned dest_size;
/* Bit size of each source; 0 while unknown. */
unsigned src_size[4];
} bitsize_tree;
/**
 * Recursively build a bitsize_tree mirroring the given search value.
 *
 * Expression nodes get one child per opcode input and record which of their
 * sources/destination have an explicit bit size in the opcode description.
 * Variable nodes take their size from the matched source in \p state.
 * Constant nodes have no size of their own.
 *
 * \param mem_ctx  ralloc context that owns every node of the tree
 * \param state    match state holding the sources bound to search variables
 * \param value    search value to mirror
 * \return the newly allocated node for \p value
 */
static bitsize_tree *
build_bitsize_tree(void *mem_ctx, struct match_state *state,
                   const nir_search_value *value)
{
   /* The node must start zeroed: the filter passes interpret a size of 0 as
    * "unknown", and not every path below assigns dest_size, common_size and
    * src_size[].  Plain ralloc() would leave those fields indeterminate.
    */
   bitsize_tree *tree = rzalloc(mem_ctx, bitsize_tree);

   switch (value->type) {
   case nir_search_value_expression: {
      nir_search_expression *expr = nir_search_value_as_expression(value);
      nir_op_info info = nir_op_infos[expr->opcode];
      tree->num_srcs = info.num_inputs;
      tree->common_size = 0;
      for (unsigned i = 0; i < info.num_inputs; i++) {
         tree->is_src_sized[i] = !!nir_alu_type_get_type_size(info.input_types[i]);
         if (tree->is_src_sized[i])
            tree->src_size[i] = nir_alu_type_get_type_size(info.input_types[i]);
         tree->srcs[i] = build_bitsize_tree(mem_ctx, state, expr->srcs[i]);
      }
      tree->is_dest_sized = !!nir_alu_type_get_type_size(info.output_type);
      if (tree->is_dest_sized)
         tree->dest_size = nir_alu_type_get_type_size(info.output_type);
      break;
   }

   case nir_search_value_variable: {
      nir_search_variable *var = nir_search_value_as_variable(value);
      tree->num_srcs = 0;
      tree->is_dest_sized = true;
      /* A variable has the size of whatever source it was matched to. */
      tree->dest_size = nir_src_bit_size(state->variables[var->variable].src);
      break;
   }

   case nir_search_value_constant: {
      /* Sizes stay 0 (unknown) here; they are assigned by the filter passes. */
      tree->num_srcs = 0;
      tree->is_dest_sized = false;
      tree->common_size = 0;
      break;
   }
   }

   return tree;
}
/**
 * Bottom-up pass: pull known bit sizes from the children into this node.
 *
 * Every child is visited first.  A child that reports a size either has to
 * agree with the explicit size of the source it feeds, or it defines (or has
 * to agree with) the node's common size.  Once a common size is known it is
 * used for the destination, if that is still 0, and for every source that
 * has no size yet.
 *
 * \return the node's destination size, 0 if it is still unknown
 */
static unsigned
bitsize_tree_filter_up(bitsize_tree *tree)
{
   for (unsigned s = 0; s < tree->num_srcs; s++) {
      const unsigned child_size = bitsize_tree_filter_up(tree->srcs[s]);

      /* Nothing to learn from a child whose size is still unknown. */
      if (child_size != 0) {
         if (tree->is_src_sized[s]) {
            assert(child_size == tree->src_size[s]);
         } else {
            if (tree->common_size == 0) {
               tree->common_size = child_size;
            } else {
               assert(child_size == tree->common_size);
            }
            tree->src_size[s] = child_size;
         }
      }
   }

   /* Leaves, and nodes without a common size, have nothing to propagate. */
   if (tree->num_srcs == 0 || tree->common_size == 0)
      return tree->dest_size;

   if (tree->dest_size == 0) {
      tree->dest_size = tree->common_size;
   } else if (!tree->is_dest_sized) {
      assert(tree->dest_size == tree->common_size);
   }

   for (unsigned s = 0; s < tree->num_srcs; s++) {
      if (tree->src_size[s] == 0)
         tree->src_size[s] = tree->common_size;
   }

   return tree->dest_size;
}
/**
 * Top-down pass: push the required destination size into this node and on
 * to its children.
 *
 * \p size becomes the node's destination size, or must match it if one is
 * already recorded.  When the destination has no explicit size, \p size is
 * also the node's common size.  Each source still lacking a size takes the
 * common size, and every child is then visited with its source's size.
 */
static void
bitsize_tree_filter_down(bitsize_tree *tree, unsigned size)
{
   if (tree->dest_size == 0) {
      tree->dest_size = size;
   } else {
      assert(tree->dest_size == size);
   }

   if (!tree->is_dest_sized) {
      if (tree->common_size == 0) {
         tree->common_size = size;
      } else {
         assert(tree->common_size == size);
      }
   }

   for (unsigned s = 0; s < tree->num_srcs; s++) {
      if (tree->src_size[s] == 0) {
         /* An unsized source can only be resolved through the common size. */
         assert(tree->common_size);
         tree->src_size[s] = tree->common_size;
      }

      bitsize_tree_filter_down(tree->srcs[s], tree->src_size[s]);
   }
}
static nir_alu_src
construct_value(const nir_search_value *value, nir_alu_type type,
unsigned num_components, struct match_state *state,
construct_value(const nir_search_value *value,
unsigned num_components, bitsize_tree *bitsize, bool exact,
struct match_state *state,
nir_instr *instr, void *mem_ctx)
{
switch (value->type) {
@@ -257,7 +422,9 @@ construct_value(const nir_search_value *value, nir_alu_type type,
num_components = nir_op_infos[expr->opcode].output_size;
nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode);
nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, NULL);
nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components,
bitsize->dest_size, NULL);
alu->exact = exact;
alu->dest.write_mask = (1 << num_components) - 1;
alu->dest.saturate = false;
@@ -269,8 +436,7 @@ construct_value(const nir_search_value *value, nir_alu_type type,
num_components = nir_op_infos[alu->op].input_sizes[i];
alu->src[i] = construct_value(expr->srcs[i],
nir_op_infos[alu->op].input_types[i],
num_components,
num_components, bitsize->srcs[i], exact,
state, instr, mem_ctx);
}
@@ -301,23 +467,57 @@ construct_value(const nir_search_value *value, nir_alu_type type,
const nir_search_constant *c = nir_search_value_as_constant(value);
nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1);
switch (type) {
switch (c->type) {
case nir_type_float:
load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f);
load->value.f[0] = c->data.f;
load->def.name = ralloc_asprintf(load, "%f", c->data.d);
switch (bitsize->dest_size) {
case 32:
load->value.f32[0] = c->data.d;
break;
case 64:
load->value.f64[0] = c->data.d;
break;
default:
unreachable("unknown bit size");
}
break;
case nir_type_int:
load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i);
load->value.i[0] = c->data.i;
load->def.name = ralloc_asprintf(load, "%ld", c->data.i);
switch (bitsize->dest_size) {
case 32:
load->value.i32[0] = c->data.i;
break;
case 64:
load->value.i64[0] = c->data.i;
break;
default:
unreachable("unknown bit size");
}
break;
case nir_type_uint:
case nir_type_bool:
load->value.u[0] = c->data.u;
load->def.name = ralloc_asprintf(load, "%lu", c->data.u);
switch (bitsize->dest_size) {
case 32:
load->value.u32[0] = c->data.u;
break;
case 64:
load->value.u64[0] = c->data.u;
break;
default:
unreachable("unknown bit size");
}
case nir_type_bool32:
load->value.u32[0] = c->data.u;
break;
default:
unreachable("Invalid alu source type");
}
load->def.bit_size = bitsize->dest_size;
nir_instr_insert_before(instr, &load->instr);
nir_alu_src val;
@@ -352,6 +552,11 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
swizzle, &state))
return NULL;
void *bitsize_ctx = ralloc_context(NULL);
bitsize_tree *tree = build_bitsize_tree(bitsize_ctx, &state, replace);
bitsize_tree_filter_up(tree);
bitsize_tree_filter_down(tree, instr->dest.dest.ssa.bit_size);
/* Inserting a mov may be unnecessary. However, it's much easier to
* simply let copy propagation clean this up than to try to go through
* and rewrite swizzles ourselves.
@@ -359,11 +564,12 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
mov->dest.write_mask = instr->dest.write_mask;
nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
instr->dest.dest.ssa.num_components, NULL);
instr->dest.dest.ssa.num_components,
instr->dest.dest.ssa.bit_size, NULL);
mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type,
instr->dest.dest.ssa.num_components, &state,
&instr->instr, mem_ctx);
mov->src[0] = construct_value(replace,
instr->dest.dest.ssa.num_components, tree,
instr->exact, &state, &instr->instr, mem_ctx);
nir_instr_insert_before(&instr->instr, &mov->instr);
nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa,
@@ -375,5 +581,7 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
*/
nir_instr_remove(&instr->instr);
ralloc_free(bitsize_ctx);
return mov;
}
+11 -3
View File
@@ -71,16 +71,24 @@ typedef struct {
typedef struct {
nir_search_value value;
nir_alu_type type;
union {
uint32_t u;
int32_t i;
float f;
uint64_t u;
int64_t i;
double d;
} data;
} nir_search_constant;
typedef struct {
nir_search_value value;
/* When set on a search expression, the expression will only match an SSA
* value that does *not* have the exact bit set. If unset, the exact bit
* on the SSA value is ignored.
*/
bool inexact;
nir_op opcode;
const nir_search_value *srcs[4];
} nir_search_expression;
+6 -2
View File
@@ -219,7 +219,9 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
state->states[index].num_defs);
list_del(&dest->reg.def_link);
nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name);
nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
reg->bit_size, name);
ralloc_free(name);
/* push our SSA destination on the stack */
state->states[index].index++;
@@ -271,7 +273,9 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)
instr->dest.write_mask = (1 << num_components) - 1;
list_del(&instr->dest.dest.reg.def_link);
nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name);
nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
reg->bit_size, name);
ralloc_free(name);
if (nir_op_infos[instr->op].output_size == 0) {
/*
+38 -4
View File
@@ -179,9 +179,12 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
nir_alu_src *src = &instr->src[index];
unsigned num_components;
if (src->src.is_ssa)
unsigned src_bit_size;
if (src->src.is_ssa) {
src_bit_size = src->src.ssa->bit_size;
num_components = src->src.ssa->num_components;
else {
} else {
src_bit_size = src->src.reg.reg->bit_size;
if (src->src.reg.reg->is_packed)
num_components = 4; /* can't check anything */
else
@@ -194,6 +197,24 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
assert(src->swizzle[i] < num_components);
}
nir_alu_type src_type = nir_op_infos[instr->op].input_types[index];
/* 8-bit float isn't a thing */
if (nir_alu_type_get_base_type(src_type) == nir_type_float)
assert(src_bit_size == 16 || src_bit_size == 32 || src_bit_size == 64);
if (nir_alu_type_get_type_size(src_type)) {
/* This source has an explicit bit size */
assert(nir_alu_type_get_type_size(src_type) == src_bit_size);
} else {
if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) {
unsigned dest_bit_size =
instr->dest.dest.is_ssa ? instr->dest.dest.ssa.bit_size
: instr->dest.dest.reg.reg->bit_size;
assert(dest_bit_size == src_bit_size);
}
}
validate_src(&src->src, state);
}
@@ -263,8 +284,10 @@ validate_dest(nir_dest *dest, validate_state *state)
}
static void
validate_alu_dest(nir_alu_dest *dest, validate_state *state)
validate_alu_dest(nir_alu_instr *instr, validate_state *state)
{
nir_alu_dest *dest = &instr->dest;
unsigned dest_size =
dest->dest.is_ssa ? dest->dest.ssa.num_components
: dest->dest.reg.reg->num_components;
@@ -282,6 +305,17 @@ validate_alu_dest(nir_alu_dest *dest, validate_state *state)
assert(nir_op_infos[alu->op].output_type == nir_type_float ||
!dest->saturate);
unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size
: dest->dest.reg.reg->bit_size;
nir_alu_type type = nir_op_infos[instr->op].output_type;
/* 8-bit float isn't a thing */
if (nir_alu_type_get_base_type(type) == nir_type_float)
assert(bit_size == 16 || bit_size == 32 || bit_size == 64);
assert(nir_alu_type_get_type_size(type) == 0 ||
nir_alu_type_get_type_size(type) == bit_size);
validate_dest(&dest->dest, state);
}
@@ -294,7 +328,7 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state)
validate_alu_src(instr, i, state);
}
validate_alu_dest(&instr->dest, state);
validate_alu_dest(instr, state);
}
static void
+22 -14
View File
@@ -92,7 +92,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
nir_load_const_instr_create(b->shader, num_components);
for (unsigned i = 0; i < num_components; i++)
load->value.u[i] = constant->value.u[i];
load->value.u32[i] = constant->value.u[i];
nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
val->def = &load->def;
@@ -109,7 +109,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
nir_load_const_instr_create(b->shader, rows);
for (unsigned j = 0; j < rows; j++)
load->value.u[j] = constant->value.u[rows * i + j];
load->value.u32[j] = constant->value.u[rows * i + j];
nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
col_val->def = &load->def;
@@ -1035,6 +1035,8 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap);
unsigned num_components = glsl_get_vector_elements(val->const_type);
unsigned bit_size =
glsl_get_bit_size(glsl_get_base_type(val->const_type));
nir_const_value src[3];
assert(count <= 7);
@@ -1043,14 +1045,16 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
vtn_value(b, w[4 + i], vtn_value_type_constant)->constant;
unsigned j = swap ? 1 - i : i;
assert(bit_size == 32);
for (unsigned k = 0; k < num_components; k++)
src[j].u[k] = c->value.u[k];
src[j].u32[k] = c->value.u[k];
}
nir_const_value res = nir_eval_const_opcode(op, num_components, src);
nir_const_value res = nir_eval_const_opcode(op, num_components,
bit_size, src);
for (unsigned k = 0; k < num_components; k++)
val->constant->value.u[k] = res.u[k];
val->constant->value.u[k] = res.u32[k];
return;
} /* default */
@@ -1414,7 +1418,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
}
nir_ssa_dest_init(&instr->instr, &instr->dest,
nir_tex_instr_dest_size(instr), NULL);
nir_tex_instr_dest_size(instr), 32, NULL);
assert(glsl_get_vector_elements(ret_type->type) ==
nir_tex_instr_dest_size(instr));
@@ -1600,7 +1604,7 @@ vtn_handle_image(struct vtn_builder *b, SpvOp opcode,
if (opcode != SpvOpImageWrite) {
struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, NULL);
nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, 32, NULL);
nir_builder_instr_insert(&b->nb, &intrin->instr);
@@ -1738,7 +1742,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
fill_common_atomic_sources(b, opcode, w, &atomic->src[2]);
}
nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, NULL);
nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);
struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
@@ -1750,7 +1754,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
}
static nir_alu_instr *
create_vec(nir_shader *shader, unsigned num_components)
create_vec(nir_shader *shader, unsigned num_components, unsigned bit_size)
{
nir_op op;
switch (num_components) {
@@ -1762,7 +1766,8 @@ create_vec(nir_shader *shader, unsigned num_components)
}
nir_alu_instr *vec = nir_alu_instr_create(shader, op);
nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL);
nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components,
bit_size, NULL);
vec->dest.write_mask = (1 << num_components) - 1;
return vec;
@@ -1779,7 +1784,8 @@ vtn_ssa_transpose(struct vtn_builder *b, struct vtn_ssa_value *src)
for (unsigned i = 0; i < glsl_get_matrix_columns(dest->type); i++) {
nir_alu_instr *vec = create_vec(b->shader,
glsl_get_matrix_columns(src->type));
glsl_get_matrix_columns(src->type),
glsl_get_bit_size(glsl_get_base_type(src->type)));
if (glsl_type_is_vector_or_scalar(src->type)) {
vec->src[0].src = nir_src_for_ssa(src->def);
vec->src[0].swizzle[0] = i;
@@ -1809,7 +1815,8 @@ nir_ssa_def *
vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert,
unsigned index)
{
nir_alu_instr *vec = create_vec(b->shader, src->num_components);
nir_alu_instr *vec = create_vec(b->shader, src->num_components,
src->bit_size);
for (unsigned i = 0; i < src->num_components; i++) {
if (i == index) {
@@ -1854,7 +1861,7 @@ vtn_vector_shuffle(struct vtn_builder *b, unsigned num_components,
nir_ssa_def *src0, nir_ssa_def *src1,
const uint32_t *indices)
{
nir_alu_instr *vec = create_vec(b->shader, num_components);
nir_alu_instr *vec = create_vec(b->shader, num_components, src0->bit_size);
nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(b->shader, 1);
nir_builder_instr_insert(&b->nb, &undef->instr);
@@ -1884,7 +1891,8 @@ static nir_ssa_def *
vtn_vector_construct(struct vtn_builder *b, unsigned num_components,
unsigned num_srcs, nir_ssa_def **srcs)
{
nir_alu_instr *vec = create_vec(b->shader, num_components);
nir_alu_instr *vec = create_vec(b->shader, num_components,
srcs[0]->bit_size);
unsigned dest_idx = 0;
for (unsigned i = 0; i < num_srcs; i++) {
+3 -1
View File
@@ -627,7 +627,9 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,
nir_alu_instr *instr = nir_alu_instr_create(b->shader, op);
nir_ssa_dest_init(&instr->instr, &instr->dest.dest,
glsl_get_vector_elements(val->ssa->type), val->name);
glsl_get_vector_elements(val->ssa->type),
glsl_get_bit_size(glsl_get_base_type(val->ssa->type)),
val->name);
instr->dest.write_mask = (1 << instr->dest.dest.ssa.num_components) - 1;
val->ssa->def = &instr->dest.dest.ssa;
+7 -4
View File
@@ -190,7 +190,9 @@ _vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_var *deref,
if (load) {
nir_ssa_dest_init(&intrin->instr, &intrin->dest,
intrin->num_components, NULL);
intrin->num_components,
glsl_get_bit_size(glsl_get_base_type(tail->type)),
NULL);
inout->def = &intrin->dest.ssa;
} else {
nir_intrinsic_set_write_mask(intrin, (1 << intrin->num_components) - 1);
@@ -322,7 +324,7 @@ get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain,
nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set);
nir_intrinsic_set_binding(instr, chain->var->binding);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
nir_builder_instr_insert(&b->nb, &instr->instr);
return &instr->dest.ssa;
@@ -411,7 +413,8 @@ _vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load,
if (load) {
nir_ssa_dest_init(&instr->instr, &instr->dest,
instr->num_components, NULL);
instr->num_components,
glsl_get_bit_size(glsl_get_base_type(type)), NULL);
(*inout)->def = &instr->dest.ssa;
}
@@ -1385,7 +1388,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
nir_intrinsic_instr_create(b->nb.shader,
nir_intrinsic_get_buffer_size);
instr->src[0] = nir_src_for_ssa(index);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
nir_builder_instr_insert(&b->nb, &instr->instr);
nir_ssa_def *buf_size = &instr->dest.ssa;
+21
View File
@@ -80,6 +80,27 @@ enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type);
unsigned glsl_get_record_location_offset(const struct glsl_type *type,
unsigned length);
/* Return the width in bits of one component of the given GLSL base type:
 * 64 for doubles, 32 for the other scalar base types listed below.  Any
 * other base type is not expected here.
 */
static inline unsigned
glsl_get_bit_size(enum glsl_base_type type)
{
   unsigned bits = 0;

   switch (type) {
   case GLSL_TYPE_DOUBLE:
      bits = 64;
      break;

   case GLSL_TYPE_FLOAT: /* TODO handle mediump */
   case GLSL_TYPE_INT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_SUBROUTINE:
      bits = 32;
      break;

   default:
      unreachable("unknown base type");
   }

   return bits;
}
bool glsl_type_is_void(const struct glsl_type *type);
bool glsl_type_is_error(const struct glsl_type *type);
bool glsl_type_is_vector(const struct glsl_type *type);
-1
View File
@@ -44,7 +44,6 @@
#include "egllog.h"
#define MIN2(A, B) (((A) < (B)) ? (A) : (B))
/**
+7
View File
@@ -40,9 +40,16 @@ extern "C" {
#define _EGL_MAX_EXTENSIONS_LEN 1000
/* Hardcoded, conservative maximum pbuffer dimensions.  When a surface is
* created with EGL_LARGEST_PBUFFER set, its width and height are clamped
* to these values.
*/
#define _EGL_MAX_PBUFFER_WIDTH 4096
#define _EGL_MAX_PBUFFER_HEIGHT 4096
#define _EGL_VENDOR_STRING "Mesa Project"
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
#define MIN2(A, B) (((A) < (B)) ? (A) : (B))
#ifdef __cplusplus
}
+6
View File
@@ -307,6 +307,12 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
if (err != EGL_SUCCESS)
return _eglError(err, func);
/* if EGL_LARGEST_PBUFFER in use, clamp width and height */
if (surf->LargestPbuffer) {
surf->Width = MIN2(surf->Width, _EGL_MAX_PBUFFER_WIDTH);
surf->Height = MIN2(surf->Height, _EGL_MAX_PBUFFER_HEIGHT);
}
return EGL_TRUE;
}
-6
View File
@@ -206,12 +206,6 @@ static unsigned tgsi_gs_run(struct draw_geometry_shader *shader,
{
struct tgsi_exec_machine *machine = shader->machine;
tgsi_set_exec_mask(machine,
1,
input_primitives > 1,
input_primitives > 2,
input_primitives > 3);
/* run interpreter */
tgsi_exec_machine_run(machine);
@@ -264,11 +264,11 @@ aa_transform_epilog(struct tgsi_transform_context *ctx)
if (aactx->colorOutput != -1) {
/* insert texture sampling code for antialiasing. */
/* TEX texTemp, input_coord, sampler */
tgsi_transform_tex_2d_inst(ctx,
TGSI_FILE_TEMPORARY, aactx->texTemp,
TGSI_FILE_INPUT, aactx->maxInput + 1,
aactx->freeSampler);
/* TEX texTemp, input_coord, sampler, 2D */
tgsi_transform_tex_inst(ctx,
TGSI_FILE_TEMPORARY, aactx->texTemp,
TGSI_FILE_INPUT, aactx->maxInput + 1,
TGSI_TEXTURE_2D, aactx->freeSampler);
/* MOV rgb */
tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
@@ -159,12 +159,6 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
input = (const float (*)[4])((const char *)input + input_stride);
}
tgsi_set_exec_mask(machine,
1,
max_vertices > 1,
max_vertices > 2,
max_vertices > 3);
/* run interpreter */
tgsi_exec_machine_run( machine );
+1
View File
@@ -1191,6 +1191,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
"FRAG\n"
"DCL IN[0], GENERIC[0], LINEAR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], RECT, FLOAT\n"
"DCL OUT[0], COLOR[0]\n"
"DCL TEMP[0]\n"
+8 -8
View File
@@ -459,7 +459,7 @@ ttn_emit_immediate(struct ttn_compile *c)
c->next_imm++;
for (i = 0; i < 4; i++)
load_const->value.u[i] = tgsi_imm->u[i].Uint;
load_const->value.u32[i] = tgsi_imm->u[i].Uint;
nir_builder_instr_insert(b, &load_const->instr);
}
@@ -515,8 +515,8 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
nir_intrinsic_load_var);
load->num_components = 4;
load->variables[0] = ttn_array_deref(c, load, var, offset, indirect);
nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
nir_ssa_dest_init(&load->instr, &load->dest,
4, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
src = nir_src_for_ssa(&load->dest.ssa);
@@ -567,7 +567,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
load = nir_intrinsic_instr_create(b->shader, op);
load->num_components = ncomp;
nir_ssa_dest_init(&load->instr, &load->dest, ncomp, NULL);
nir_ssa_dest_init(&load->instr, &load->dest, ncomp, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
src = nir_src_for_ssa(&load->dest.ssa);
@@ -632,7 +632,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
}
load->src[srcn++] = nir_src_for_ssa(offset);
nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
nir_builder_instr_insert(b, &load->instr);
src = nir_src_for_ssa(&load->dest.ssa);
@@ -1425,7 +1425,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
assert(src_number == num_srcs);
nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
nir_builder_instr_insert(b, &instr->instr);
/* Resolve the writemask on the texture op. */
@@ -1464,10 +1464,10 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
txs->src[0].src = nir_src_for_ssa(ttn_channel(b, src[0], X));
txs->src[0].src_type = nir_tex_src_lod;
nir_ssa_dest_init(&txs->instr, &txs->dest, 3, NULL);
nir_ssa_dest_init(&txs->instr, &txs->dest, 3, 32, NULL);
nir_builder_instr_insert(b, &txs->instr);
nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, NULL);
nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32, NULL);
nir_builder_instr_insert(b, &qlv->instr);
ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ);
-2
View File
@@ -23,8 +23,6 @@
#include "compiler/nir/nir.h"
struct nir_shader_compiler_options *options;
struct nir_shader *
tgsi_to_nir(const void *tgsi_tokens,
const struct nir_shader_compiler_options *options);
@@ -33,6 +33,7 @@ static const char nored[] = "FRAG\n"
"DCL IN[0], GENERIC[0], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL TEMP[0]\n"
"IMM FLT32 { 0.0000, 0.0000, 0.0000, 0.0000}\n"
" 0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -46,6 +47,7 @@ static const char nogreen[] = "FRAG\n"
"DCL IN[0], GENERIC[0], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL TEMP[0]\n"
"IMM FLT32 { 0.0000, 0.0000, 0.0000, 0.0000}\n"
" 0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -59,6 +61,7 @@ static const char noblue[] = "FRAG\n"
"DCL IN[0], GENERIC[0], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL TEMP[0]\n"
"IMM FLT32 { 0.0000, 0.0000, 0.0000, 0.0000}\n"
" 0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -50,6 +50,7 @@ static const char depth1fs[] = "FRAG\n"
"DCL IN[2], GENERIC[11], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL TEMP[0..2]\n"
"IMM FLT32 { 0.0030, 0.0000, 1.0000, 0.0000}\n"
" 0: TEX TEMP[0].x, IN[1].xyyy, SAMP[0], 2D\n"
@@ -80,6 +81,7 @@ static const char color1fs[] = "FRAG\n"
"DCL IN[2], GENERIC[11], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL TEMP[0..2]\n"
"IMM FLT32 { 0.2126, 0.7152, 0.0722, 0.1000}\n"
"IMM FLT32 { 1.0000, 0.0000, 0.0000, 0.0000}\n"
@@ -112,6 +114,7 @@ static const char neigh3fs[] = "FRAG\n"
"DCL IN[2], GENERIC[11], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL SAMP[1]\n"
"DCL TEMP[0..8]\n"
"IMM FLT32 { 1.0000, 0.00001, 0.0000, 0.0000}\n"
@@ -175,8 +178,11 @@ static const char blend2fs_1[] = "FRAG\n"
"DCL IN[0], GENERIC[0], PERSPECTIVE\n"
"DCL OUT[0], COLOR\n"
"DCL SAMP[0]\n"
"DCL SVIEW[0], 2D, FLOAT\n"
"DCL SAMP[1]\n"
"DCL SVIEW[1], 2D, FLOAT\n"
"DCL SAMP[2]\n"
"DCL SVIEW[2], 2D, FLOAT\n"
"DCL CONST[0]\n"
"DCL TEMP[0..6]\n"
"IMM FLT32 { 0.0000, -0.2500, 0.00609756, 0.5000}\n"
+11 -1
View File
@@ -111,7 +111,7 @@ tgsi_default_declaration( void )
declaration.Local = 0;
declaration.Array = 0;
declaration.Atomic = 0;
declaration.Shared = 0;
declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
declaration.Padding = 0;
return declaration;
@@ -127,6 +127,8 @@ tgsi_build_declaration(
unsigned invariant,
unsigned local,
unsigned array,
unsigned atomic,
unsigned mem_type,
struct tgsi_header *header )
{
struct tgsi_declaration declaration;
@@ -143,6 +145,8 @@ tgsi_build_declaration(
declaration.Invariant = invariant;
declaration.Local = local;
declaration.Array = array;
declaration.Atomic = atomic;
declaration.MemType = mem_type;
header_bodysize_grow( header );
return declaration;
@@ -401,6 +405,8 @@ tgsi_build_full_declaration(
full_decl->Declaration.Invariant,
full_decl->Declaration.Local,
full_decl->Declaration.Array,
full_decl->Declaration.Atomic,
full_decl->Declaration.MemType,
header );
if (maxsize <= size)
@@ -775,6 +781,8 @@ tgsi_default_instruction_memory( void )
struct tgsi_instruction_memory instruction_memory;
instruction_memory.Qualifier = 0;
instruction_memory.Texture = 0;
instruction_memory.Format = 0;
instruction_memory.Padding = 0;
return instruction_memory;
@@ -790,6 +798,8 @@ tgsi_build_instruction_memory(
struct tgsi_instruction_memory instruction_memory;
instruction_memory.Qualifier = qualifier;
instruction_memory.Texture = 0;
instruction_memory.Format = 0;
instruction_memory.Padding = 0;
instruction->Memory = 1;
+7 -2
View File
@@ -365,8 +365,13 @@ iter_declaration(
}
if (decl->Declaration.File == TGSI_FILE_MEMORY) {
if (decl->Declaration.Shared)
TXT(", SHARED");
switch (decl->Declaration.MemType) {
/* Note: ,GLOBAL is optional / the default */
case TGSI_MEMORY_TYPE_GLOBAL: TXT(", GLOBAL"); break;
case TGSI_MEMORY_TYPE_SHARED: TXT(", SHARED"); break;
case TGSI_MEMORY_TYPE_PRIVATE: TXT(", PRIVATE"); break;
case TGSI_MEMORY_TYPE_INPUT: TXT(", INPUT"); break;
}
}
if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
-25
View File
@@ -196,10 +196,6 @@ struct tgsi_sampler
#define TGSI_EXEC_TEMP_HALF_I (TGSI_EXEC_NUM_TEMPS + 3)
#define TGSI_EXEC_TEMP_HALF_C 0
/* execution mask, each value is either 0 or ~0 */
#define TGSI_EXEC_MASK_I (TGSI_EXEC_NUM_TEMPS + 3)
#define TGSI_EXEC_MASK_C 1
/* 4 register buffer for various purposes */
#define TGSI_EXEC_TEMP_R0 (TGSI_EXEC_NUM_TEMPS + 4)
#define TGSI_EXEC_NUM_TEMP_R 4
@@ -397,27 +393,6 @@ boolean
tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst);
static inline void
tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
{
mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
mask;
}
/** Set execution mask values prior to executing the shader */
static inline void
tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
boolean ch0, boolean ch1, boolean ch2, boolean ch3)
{
int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
mask[0] = ch0 ? ~0 : 0;
mask[1] = ch1 ? ~0 : 0;
mask[2] = ch2 ? ~0 : 0;
mask[3] = ch3 ? ~0 : 0;
}
extern void
tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
unsigned num_bufs,
+23 -2
View File
@@ -38,6 +38,7 @@
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/u_prim.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_scan.h"
@@ -192,8 +193,17 @@ scan_instruction(struct tgsi_shader_info *info,
}
}
if (is_memory_file(src->Register.File))
if (is_memory_file(src->Register.File)) {
is_mem_inst = true;
if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store) {
info->writes_memory = TRUE;
if (src->Register.File == TGSI_FILE_IMAGE &&
!src->Register.Indirect)
info->images_writemask |= 1 << src->Register.Index;
}
}
}
/* check for indirect register writes */
@@ -204,8 +214,16 @@ scan_instruction(struct tgsi_shader_info *info,
info->indirect_files_written |= (1 << dst->Register.File);
}
if (is_memory_file(dst->Register.File))
if (is_memory_file(dst->Register.File)) {
assert(fullinst->Instruction.Opcode == TGSI_OPCODE_STORE);
is_mem_inst = true;
info->writes_memory = TRUE;
if (dst->Register.File == TGSI_FILE_IMAGE &&
!dst->Register.Indirect)
info->images_writemask |= 1 << dst->Register.Index;
}
}
if (is_mem_inst)
@@ -413,6 +431,9 @@ scan_declaration(struct tgsi_shader_info *info,
}
} else if (file == TGSI_FILE_SAMPLER) {
info->samplers_declared |= 1 << reg;
} else if (file == TGSI_FILE_IMAGE) {
if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
info->images_buffers |= 1 << reg;
}
}
}
+10
View File
@@ -111,12 +111,22 @@ struct tgsi_shader_info
boolean writes_clipvertex;
boolean writes_viewport_index;
boolean writes_layer;
boolean writes_memory; /**< contains stores or atomics to buffers or images */
boolean is_msaa_sampler[PIPE_MAX_SAMPLERS];
boolean uses_doubles; /**< uses any of the double instructions */
unsigned clipdist_writemask;
unsigned culldist_writemask;
unsigned num_written_culldistance;
unsigned num_written_clipdistance;
/**
* Bitmask indicating which images are written to (STORE / ATOM*).
* Indirect image accesses are not reflected in this mask.
*/
unsigned images_writemask;
/**
* Bitmask indicating which declared images are buffers.
*/
unsigned images_buffers;
/**
* Bitmask indicating which register files are accessed with
* indirect addressing. The bits are (1 << TGSI_FILE_x), etc.
@@ -145,6 +145,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
"NUM_CLIPDIST_ENABLED",
"NUM_CULLDIST_ENABLED",
"FS_EARLY_DEPTH_STENCIL",
"NEXT_SHADER",
};
const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
+12 -2
View File
@@ -1390,8 +1390,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
ctx->cur = cur;
}
} else if (file == TGSI_FILE_MEMORY) {
if (str_match_nocase_whole(&cur, "SHARED")) {
decl.Declaration.Shared = 1;
if (str_match_nocase_whole(&cur, "GLOBAL")) {
/* Note: this is a no-op; GLOBAL is the default */
decl.Declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
ctx->cur = cur;
} else if (str_match_nocase_whole(&cur, "SHARED")) {
decl.Declaration.MemType = TGSI_MEMORY_TYPE_SHARED;
ctx->cur = cur;
} else if (str_match_nocase_whole(&cur, "PRIVATE")) {
decl.Declaration.MemType = TGSI_MEMORY_TYPE_PRIVATE;
ctx->cur = cur;
} else if (str_match_nocase_whole(&cur, "INPUT")) {
decl.Declaration.MemType = TGSI_MEMORY_TYPE_INPUT;
ctx->cur = cur;
}
} else {
+44 -7
View File
@@ -301,6 +301,40 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
}
/**
 * Build and emit an instruction with one destination and three sources.
 *
 * The instruction starts from tgsi_default_full_instruction(); only the
 * opcode, the register counts, the destination file/index/writemask and the
 * file/index of each source are filled in.  Every other field keeps the value
 * returned by tgsi_default_full_instruction().
 *
 * \param ctx            transform context whose emit_instruction hook is called
 * \param opcode         value stored in Instruction.Opcode
 * \param dst_file       register file of the destination
 * \param dst_index      register index of the destination
 * \param dst_writemask  write mask of the destination
 * \param src0_file      register file of source 0
 * \param src0_index     register index of source 0
 * \param src1_file      register file of source 1
 * \param src1_index     register index of source 1
 * \param src2_file      register file of source 2
 * \param src2_index     register index of source 2
 */
static inline void
tgsi_transform_op3_inst(struct tgsi_transform_context *ctx,
                        unsigned opcode,
                        unsigned dst_file,
                        unsigned dst_index,
                        unsigned dst_writemask,
                        unsigned src0_file,
                        unsigned src0_index,
                        unsigned src1_file,
                        unsigned src1_index,
                        unsigned src2_file,
                        unsigned src2_index)
{
   struct tgsi_full_instruction inst;

   inst = tgsi_default_full_instruction();
   inst.Instruction.Opcode = opcode;

   /* Single destination register. */
   inst.Instruction.NumDstRegs = 1;
   inst.Dst[0].Register.File = dst_file;
   inst.Dst[0].Register.Index = dst_index;
   inst.Dst[0].Register.WriteMask = dst_writemask;

   /* Three source registers. */
   inst.Instruction.NumSrcRegs = 3;
   inst.Src[0].Register.File = src0_file;
   inst.Src[0].Register.Index = src0_index;
   inst.Src[1].Register.File = src1_file;
   inst.Src[1].Register.Index = src1_index;
   inst.Src[2].Register.File = src2_file;
   inst.Src[2].Register.Index = src2_index;

   ctx->emit_instruction(ctx, &inst);
}
static inline void
tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
unsigned opcode,
@@ -482,15 +516,18 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
static inline void
tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
unsigned dst_file,
unsigned dst_index,
unsigned src_file,
unsigned src_index,
unsigned sampler_index)
tgsi_transform_tex_inst(struct tgsi_transform_context *ctx,
unsigned dst_file,
unsigned dst_index,
unsigned src_file,
unsigned src_index,
unsigned tex_target,
unsigned sampler_index)
{
struct tgsi_full_instruction inst;
assert(tex_target < TGSI_TEXTURE_COUNT);
inst = tgsi_default_full_instruction();
inst.Instruction.Opcode = TGSI_OPCODE_TEX;
inst.Instruction.NumDstRegs = 1;
@@ -498,7 +535,7 @@ tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
inst.Dst[0].Register.Index = dst_index;
inst.Instruction.NumSrcRegs = 2;
inst.Instruction.Texture = TRUE;
inst.Texture.Texture = TGSI_TEXTURE_2D;
inst.Texture.Texture = tex_target;
inst.Src[0].Register.File = src_file;
inst.Src[0].Register.Index = src_index;
inst.Src[1].Register.File = TGSI_FILE_SAMPLER;
+33 -11
View File
@@ -101,6 +101,7 @@ struct ureg_program
{
unsigned processor;
bool supports_any_inout_decl_range;
int next_shader_processor;
struct {
unsigned semantic_name;
@@ -190,7 +191,7 @@ struct ureg_program
struct ureg_tokens domain[2];
bool use_shared_memory;
bool use_memory[TGSI_MEMORY_TYPE_COUNT];
};
static union tgsi_any_token error_tokens[32];
@@ -729,13 +730,14 @@ struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
return reg;
}
/* Allocate a shared memory area.
/* Allocate a memory area.
*/
struct ureg_src ureg_DECL_shared_memory(struct ureg_program *ureg)
struct ureg_src ureg_DECL_memory(struct ureg_program *ureg,
unsigned memory_type)
{
struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, 0);
struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, memory_type);
ureg->use_shared_memory = true;
ureg->use_memory[memory_type] = true;
return reg;
}
@@ -1672,7 +1674,7 @@ emit_decl_buffer(struct ureg_program *ureg,
}
static void
emit_decl_shared_memory(struct ureg_program *ureg)
emit_decl_memory(struct ureg_program *ureg, unsigned memory_type)
{
union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
@@ -1681,11 +1683,11 @@ emit_decl_shared_memory(struct ureg_program *ureg)
out[0].decl.NrTokens = 2;
out[0].decl.File = TGSI_FILE_MEMORY;
out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
out[0].decl.Shared = true;
out[0].decl.MemType = memory_type;
out[1].value = 0;
out[1].decl_range.First = 0;
out[1].decl_range.Last = 0;
out[1].decl_range.First = memory_type;
out[1].decl_range.Last = memory_type;
}
static void
@@ -1860,8 +1862,10 @@ static void emit_decls( struct ureg_program *ureg )
emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
}
if (ureg->use_shared_memory)
emit_decl_shared_memory(ureg);
for (i = 0; i < TGSI_MEMORY_TYPE_COUNT; i++) {
if (ureg->use_memory[i])
emit_decl_memory(ureg, i);
}
if (ureg->const_decls.nr_constant_ranges) {
for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
@@ -1966,6 +1970,16 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
{
const struct tgsi_token *tokens;
switch (ureg->processor) {
case TGSI_PROCESSOR_VERTEX:
case TGSI_PROCESSOR_TESS_EVAL:
ureg_property(ureg, TGSI_PROPERTY_NEXT_SHADER,
ureg->next_shader_processor == -1 ?
TGSI_PROCESSOR_FRAGMENT :
ureg->next_shader_processor);
break;
}
emit_header( ureg );
emit_decls( ureg );
copy_instructions( ureg );
@@ -2079,6 +2093,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen)
screen->get_shader_param(screen,
util_pipe_shader_from_tgsi_processor(processor),
PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0;
ureg->next_shader_processor = -1;
for (i = 0; i < Elements(ureg->properties); i++)
ureg->properties[i] = ~0;
@@ -2108,6 +2123,13 @@ no_ureg:
}
/**
 * Record which shader stage is expected to follow this program.
 *
 * The value is only stored here.  ureg_finalize() later emits it as
 * TGSI_PROPERTY_NEXT_SHADER for vertex and tessellation-evaluation programs,
 * substituting TGSI_PROCESSOR_FRAGMENT while the field still holds the -1
 * that ureg_create_with_screen() initialises it to.
 *
 * \param ureg       program being built
 * \param processor  stage to record; stored as-is, no validation is done here
 */
void
ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor)
{
ureg->next_shader_processor = processor;
}
unsigned
ureg_get_nr_outputs( const struct ureg_program *ureg )
{
+3 -1
View File
@@ -114,6 +114,8 @@ ureg_create_shader( struct ureg_program *,
struct pipe_context *pipe,
const struct pipe_stream_output_info *so );
void
ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor);
/* Alternately, return the built token stream and hand ownership of
* that memory to the caller:
@@ -338,7 +340,7 @@ struct ureg_src
ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);
struct ureg_src
ureg_DECL_shared_memory(struct ureg_program *ureg);
ureg_DECL_memory(struct ureg_program *ureg, unsigned memory_type);
static inline struct ureg_src
ureg_imm4f( struct ureg_program *ureg,
+5 -5
View File
@@ -344,11 +344,11 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
pctx->wincoordFile, wincoordInput,
TGSI_FILE_IMMEDIATE, pctx->numImmed);
/* TEX texTemp, texTemp, sampler; */
tgsi_transform_tex_2d_inst(ctx,
TGSI_FILE_TEMPORARY, texTemp,
TGSI_FILE_TEMPORARY, texTemp,
sampIdx);
/* TEX texTemp, texTemp, sampler, 2D; */
tgsi_transform_tex_inst(ctx,
TGSI_FILE_TEMPORARY, texTemp,
TGSI_FILE_TEMPORARY, texTemp,
TGSI_TEXTURE_2D, sampIdx);
/* KILL_IF -texTemp; # if -texTemp < 0, kill fragment */
tgsi_transform_kill_inst(ctx,
@@ -646,6 +646,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
"FRAG\n"
"DCL IN[0], GENERIC[0], LINEAR\n"
"DCL SAMP[0..1]\n"
"DCL SVIEW[0..1], %s, FLOAT\n"
"DCL OUT[0], POSITION\n"
"DCL OUT[1], STENCIL\n"
"DCL TEMP[0]\n"
@@ -663,7 +664,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA ||
tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA);
sprintf(text, shader_templ, type, type);
sprintf(text, shader_templ, type, type, type);
if (!tgsi_text_translate(text, tokens, Elements(tokens))) {
assert(0);
+8
View File
@@ -3213,6 +3213,14 @@ Whether depth test, stencil test, and occlusion query should run before
the fragment shader (regardless of fragment shader side effects). Corresponds
to GLSL early_fragment_tests.
NEXT_SHADER
"""""""""""
The shader stage that will MOST LIKELY follow this shader when the shader
is bound. This is only a hint to the driver and doesn't have to be precise.
It is only set for VS and TES.
Texture Sampling and Texture Formats
------------------------------------
@@ -1017,7 +1017,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
const_offset = nir_src_as_const_value(intr->src[1]);
if (const_offset) {
off += const_offset->u[0];
off += const_offset->u32[0];
} else {
/* For load_ubo_indirect, second src is indirect offset: */
src1 = get_src(ctx, &intr->src[1])[0];
@@ -1159,7 +1159,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
idx = nir_intrinsic_base(intr);
const_offset = nir_src_as_const_value(intr->src[0]);
if (const_offset) {
idx += const_offset->u[0];
idx += const_offset->u32[0];
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
dst[i] = create_uniform(ctx, n);
@@ -1186,7 +1186,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
idx = nir_intrinsic_base(intr);
const_offset = nir_src_as_const_value(intr->src[0]);
if (const_offset) {
idx += const_offset->u[0];
idx += const_offset->u32[0];
for (int i = 0; i < intr->num_components; i++) {
unsigned n = idx * 4 + i;
dst[i] = ctx->ir->inputs[n];
@@ -1213,7 +1213,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
idx = nir_intrinsic_base(intr);
const_offset = nir_src_as_const_value(intr->src[1]);
compile_assert(ctx, const_offset != NULL);
idx += const_offset->u[0];
idx += const_offset->u32[0];
src = get_src(ctx, &intr->src[0]);
for (int i = 0; i < intr->num_components; i++) {
@@ -1301,7 +1301,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
instr->def.num_components);
for (int i = 0; i < instr->def.num_components; i++)
dst[i] = create_immed(ctx->block, instr->value.u[i]);
dst[i] = create_immed(ctx->block, instr->value.u32[i]);
}
static void
@@ -290,7 +290,7 @@ lower_if_else_block(nir_block *block, void *void_state)
}
nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
phi->dest.ssa.num_components, phi->dest.ssa.name);
phi->dest.ssa.num_components, 32, phi->dest.ssa.name);
sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
nir_ssa_def_rewrite_uses(&phi->dest.ssa,
@@ -160,7 +160,7 @@ struct nv50_ir_prog_info
uint8_t clipDistances; /* number of clip distance outputs */
uint8_t cullDistances; /* number of cull distance outputs */
int8_t genUserClip; /* request user clip planes for ClipVertex */
uint8_t auxCBSlot; /* constant buffer index of UCP/draw data */
uint8_t auxCBSlot; /* driver constant buffer slot */
uint16_t ucpBase; /* base address for UCPs */
uint16_t drawInfoBase; /* base address for draw parameters */
uint8_t pointSize; /* output index for PointSize */
@@ -175,7 +175,6 @@ struct nv50_ir_prog_info
uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */
bool fp64; /* program uses fp64 math */
bool nv50styleSurfaces; /* generate gX[] access for raw buffers */
uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */
uint16_t texBindBase; /* base address for tex handles (nve4) */
uint16_t suInfoBase; /* base address for surface info (nve4) */
uint16_t sampleInfoBase; /* base address for sample positions */
@@ -1655,10 +1655,8 @@ CodeEmitterGK110::emitSTORE(const Instruction *i)
break;
}
if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
offset &= 0xffffff;
if (code[0] & 0x2) {
offset &= 0xffffff;
emitLoadStoreType(i->dType, 0x33);
if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
emitCachingMode(i->cache, 0x2f);
@@ -1634,7 +1634,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
code[1] |= (i->tex.mask & 0xc) << 12;
if (i->tex.liveOnly)
code[1] |= 4;
code[1] |= 1 << 2;
if (i->tex.derivAll)
code[1] |= 1 << 3;
defId(i->def(0), 2);
@@ -856,15 +856,17 @@ public:
};
std::vector<TextureView> textureViews;
/*
struct Resource {
uint8_t target; // TGSI_TEXTURE_*
bool raw;
uint8_t slot; // $surface index
};
std::vector<Resource> resources;
*/
struct MemoryFile {
bool shared;
uint8_t mem_type; // TGSI_MEMORY_TYPE_*
};
std::vector<MemoryFile> memoryFiles;
@@ -1037,6 +1039,9 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
info->io.cullDistances = prop->u[0].Data;
break;
case TGSI_PROPERTY_NEXT_SHADER:
/* Do not need to know the next shader stage. */
break;
default:
INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
break;
@@ -1222,7 +1227,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
break;
case TGSI_FILE_MEMORY:
for (i = first; i <= last; ++i)
memoryFiles[i].shared = decl->Declaration.Shared;
memoryFiles[i].mem_type = decl->Declaration.MemType;
break;
case TGSI_FILE_NULL:
case TGSI_FILE_TEMPORARY:
@@ -1261,9 +1266,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
info->numBarriers = 1;
if (insn.dstCount()) {
if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) {
Instruction::DstRegister dst = insn.getDst(0);
Instruction::DstRegister dst = insn.getDst(0);
if (dst.getFile() == TGSI_FILE_OUTPUT) {
if (dst.isIndirect(0))
for (unsigned i = 0; i < info->numOutputs; ++i)
info->out[i].mask = 0xf;
@@ -1280,11 +1285,11 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
if (isEdgeFlagPassthrough(insn))
info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
} else
if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
if (insn.getDst(0).isIndirect(0))
indirectTempArrays.insert(insn.getDst(0).getArrayId());
if (dst.getFile() == TGSI_FILE_TEMPORARY) {
if (dst.isIndirect(0))
indirectTempArrays.insert(dst.getArrayId());
} else
if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) {
if (dst.getFile() == TGSI_FILE_BUFFER) {
info->io.globalAccess |= 0x2;
}
}
@@ -1419,8 +1424,8 @@ private:
void handleLIT(Value *dst0[4]);
void handleUserClipPlanes();
Symbol *getResourceBase(int r);
void getResourceCoords(std::vector<Value *>&, int r, int s);
// Symbol *getResourceBase(int r);
// void getResourceCoords(std::vector<Value *>&, int r, int s);
void handleLOAD(Value *dst0[4]);
void handleSTORE();
@@ -1527,8 +1532,21 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
sym->reg.fileIndex = fileIdx;
if (tgsiFile == TGSI_FILE_MEMORY && code->memoryFiles[fileIdx].shared)
sym->setFile(FILE_MEMORY_SHARED);
if (tgsiFile == TGSI_FILE_MEMORY) {
switch (code->memoryFiles[fileIdx].mem_type) {
case TGSI_MEMORY_TYPE_SHARED:
sym->setFile(FILE_MEMORY_SHARED);
break;
case TGSI_MEMORY_TYPE_INPUT:
assert(prog->getType() == Program::TYPE_COMPUTE);
assert(idx == -1);
sym->setFile(FILE_SHADER_INPUT);
address += info->prop.cp.inputOffset;
break;
default:
assert(0); /* TODO: Add support for global and private memory */
}
}
if (idx >= 0) {
if (sym->reg.file == FILE_SHADER_INPUT)
@@ -1989,7 +2007,6 @@ Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
void
Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
{
Value *val;
Value *arg[4], *src[8];
Value *lod = NULL, *shd = NULL;
unsigned int s, c, d;
@@ -2032,17 +2049,6 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
shd = src[n - 1];
}
if (tgt.isCube()) {
for (c = 0; c < 3; ++c)
src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
val = getScratch();
mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
mkOp1(OP_RCP, TYPE_F32, val, val);
for (c = 0; c < 3; ++c)
src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
}
for (c = 0, d = 0; c < 4; ++c) {
if (dst[c]) {
texi->setDef(d++, dst[c]);
@@ -2148,6 +2154,7 @@ Converter::handleLIT(Value *dst0[4])
}
}
/* Keep this around for now as reference when adding img support
static inline bool
isResourceSpecial(const int r)
{
@@ -2178,7 +2185,8 @@ Converter::getResourceBase(const int r)
switch (r) {
case TGSI_RESOURCE_GLOBAL:
sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15);
sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL,
info->io.auxCBSlot);
break;
case TGSI_RESOURCE_LOCAL:
assert(prog->getType() == Program::TYPE_COMPUTE);
@@ -2243,6 +2251,7 @@ partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask)
}
return n + 1;
}
*/
// For raw loads, granularity is 4 byte.
// Usage of the texture read mask on OP_SULDP is not allowed.
@@ -2253,8 +2262,9 @@ Converter::handleLOAD(Value *dst0[4])
int c;
std::vector<Value *> off, src, ldv, def;
if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
switch (tgsi.getSrc(0).getFile()) {
case TGSI_FILE_BUFFER:
case TGSI_FILE_MEMORY:
for (c = 0; c < 4; ++c) {
if (!dst0[c])
continue;
@@ -2274,9 +2284,12 @@ Converter::handleLOAD(Value *dst0[4])
if (tgsi.getSrc(0).isIndirect(0))
ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
}
return;
break;
default:
assert(!"Unsupported srcFile for LOAD");
}
/* Keep this around for now as reference when adding img support
getResourceCoords(off, r, 1);
if (isResourceRaw(code, r)) {
@@ -2342,6 +2355,7 @@ Converter::handleLOAD(Value *dst0[4])
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
if (dst0[c] != def[c])
mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]);
*/
}
// For formatted stores, the write mask on OP_SUSTP can be used.
@@ -2353,8 +2367,9 @@ Converter::handleSTORE()
int c;
std::vector<Value *> off, src, dummy;
if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER ||
tgsi.getDst(0).getFile() == TGSI_FILE_MEMORY) {
switch (tgsi.getDst(0).getFile()) {
case TGSI_FILE_BUFFER:
case TGSI_FILE_MEMORY:
for (c = 0; c < 4; ++c) {
if (!(tgsi.getDst(0).getMask() & (1 << c)))
continue;
@@ -2375,9 +2390,12 @@ Converter::handleSTORE()
if (tgsi.getDst(0).isIndirect(0))
st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
}
return;
break;
default:
assert(!"Unsupported dstFile for STORE");
}
/* Keep this around for now as reference when adding img support
getResourceCoords(off, r, 0);
src = off;
const int s = src.size();
@@ -2425,6 +2443,7 @@ Converter::handleSTORE()
mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0,
dummy, src)->tex.mask = tgsi.getDst(0).getMask();
}
*/
}
// XXX: These only work on resources with the single-component u32/s32 formats.
@@ -2439,8 +2458,9 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
std::vector<Value *> defv;
LValue *dst = getScratch();
if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
switch (tgsi.getSrc(0).getFile()) {
case TGSI_FILE_BUFFER:
case TGSI_FILE_MEMORY:
for (int c = 0; c < 4; ++c) {
if (!dst0[c])
continue;
@@ -2468,10 +2488,12 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
for (int c = 0; c < 4; ++c)
if (dst0[c])
dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
return;
break;
default:
assert(!"Unsupported srcFile for ATOM");
}
/* Keep this around for now as reference when adding img support
getResourceCoords(srcv, r, 1);
if (isResourceSpecial(r)) {
@@ -2499,6 +2521,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
for (int c = 0; c < 4; ++c)
if (dst0[c])
dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
*/
}
void
@@ -67,6 +67,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
tmp = bld.getScratch();
for (l = 0; l < 4; ++l) {
Value *src[3], *val;
// mov coordinates from lane l to all lanes
bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
for (c = 0; c < dim; ++c) {
@@ -92,10 +93,25 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
add->lanes = 1; /* abused for .ndv */
}
// normalize cube coordinates if necessary
if (i->tex.target.isCube()) {
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
val = bld.getScratch();
bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
bld.mkOp1(OP_RCP, TYPE_F32, val, val);
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
} else {
for (c = 0; c < dim; ++c)
src[c] = crd[c];
}
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
tex->setSrc(c + array, crd[c]);
tex->setSrc(c + array, src[c]);
bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
// save results
@@ -682,7 +682,7 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
Value **ms_x, Value **ms_y) {
// This loads the texture-indexed ms setting from the constant buffer
Value *tmp = new_LValue(func, FILE_GPR);
uint8_t b = prog->driver->io.resInfoCBSlot;
uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.suInfoBase;
if (prog->getType() > Program::TYPE_VERTEX)
off += 16 * 2 * 4;
@@ -724,6 +724,23 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
const int dref = arg;
const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
/* Only normalize in the non-explicit derivatives case.
*/
if (i->tex.target.isCube() && i->op != OP_TXD) {
Value *src[3], *val;
int c;
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
val = bld.getScratch();
bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
bld.mkOp1(OP_RCP, TYPE_F32, val, val);
for (c = 0; c < 3; ++c) {
i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
i->getSrc(c), val));
}
}
// handle MS, which means looking up the MS params for this texture, and
// adjusting the input coordinates to point at the right sample.
if (i->tex.target.isMS()) {
@@ -934,12 +951,14 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
handleTEX(i);
i->op = OP_TEX; // no need to clone dPdx/dPdy later
i->tex.derivAll = true;
for (c = 0; c < dim; ++c)
crd[c] = bld.getScratch();
bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
for (l = 0; l < 4; ++l) {
Value *src[3], *val;
// mov coordinates from lane l to all lanes
for (c = 0; c < dim; ++c)
bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -949,10 +968,24 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
// add dPdy from lane l to lanes dy
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
// normalize cube coordinates if necessary
if (i->tex.target.isCube()) {
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
val = bld.getScratch();
bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
bld.mkOp1(OP_RCP, TYPE_F32, val, val);
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
} else {
for (c = 0; c < dim; ++c)
src[c] = crd[c];
}
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
tex->setSrc(c, crd[c]);
tex->setSrc(c, src[c]);
// save results
for (c = 0; i->defExists(c); ++c) {
Instruction *mov;
@@ -1174,7 +1207,7 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
bld.mkLoad(TYPE_F32,
def,
bld.mkSymbol(
FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
off);
break;
@@ -600,7 +600,7 @@ NVC0LoweringPass::visit(BasicBlock *bb)
inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
uint8_t b = prog->driver->io.resInfoCBSlot;
uint8_t b = prog->driver->io.auxCBSlot;
uint32_t off = prog->driver->io.texBindBase + slot * 4;
return bld.
mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -615,6 +615,24 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
const int chipset = prog->getTarget()->getChipset();
/* Only normalize in the non-explicit derivatives case. For explicit
* derivatives, this is handled in handleManualTXD.
*/
if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
Value *src[3], *val;
int c;
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
val = bld.getScratch();
bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
bld.mkOp1(OP_RCP, TYPE_F32, val, val);
for (c = 0; c < 3; ++c) {
i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
i->getSrc(c), val));
}
}
// Arguments to the TEX instruction are a little insane. Even though the
// encoding is identical between SM20 and SM30, the arguments mean
// different things between Fermi and Kepler+. A lot of arguments are
@@ -728,9 +746,13 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
}
Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
for (int s = dim; s >= 1; --s)
i->setSrc(s, i->getSrc(s - 1));
i->setSrc(0, arrayIndex);
if (arrayIndex) {
for (int s = dim; s >= 1; --s)
i->setSrc(s, i->getSrc(s - 1));
i->setSrc(0, arrayIndex);
} else {
i->moveSources(0, 1);
}
if (arrayIndex) {
int sat = (i->op == OP_TXF) ? 1 : 0;
@@ -861,6 +883,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
for (l = 0; l < 4; ++l) {
Value *src[3], *val;
// mov coordinates from lane l to all lanes
for (c = 0; c < dim; ++c)
bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
@@ -870,10 +893,24 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
// add dPdy from lane l to lanes dy
for (c = 0; c < dim; ++c)
bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
// normalize cube coordinates
if (i->tex.target.isCube()) {
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
val = bld.getScratch();
bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
bld.mkOp1(OP_RCP, TYPE_F32, val, val);
for (c = 0; c < 3; ++c)
src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
} else {
for (c = 0; c < dim; ++c)
src[c] = crd[c];
}
// texture
bld.insert(tex = cloneForward(func, i));
for (c = 0; c < dim; ++c)
tex->setSrc(c + array, crd[c]);
tex->setSrc(c + array, src[c]);
// save results
for (c = 0; i->defExists(c); ++c) {
Instruction *mov;
@@ -1098,6 +1135,7 @@ NVC0LoweringPass::handleSharedATOM(Instruction *atom)
break;
default:
assert(0);
return;
}
Instruction *i =
@@ -1204,7 +1242,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
{
uint8_t b = prog->driver->io.resInfoCBSlot;
uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.suInfoBase;
return bld.
mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -1213,7 +1251,7 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
inline Value *
NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
{
uint8_t b = prog->driver->io.resInfoCBSlot;
uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.suInfoBase;
if (ptr)
@@ -1226,7 +1264,7 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
inline Value *
NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
{
uint8_t b = prog->driver->io.resInfoCBSlot;
uint8_t b = prog->driver->io.auxCBSlot;
off += prog->driver->io.suInfoBase;
if (ptr)
@@ -1540,7 +1578,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
call->indirect = 1;
call->absolute = 1;
call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
prog->driver->io.resInfoCBSlot, TYPE_U32,
prog->driver->io.auxCBSlot, TYPE_U32,
prog->driver->io.suInfoBase + base));
call->setSrc(1, r[2]);
call->setSrc(2, r[4]);
@@ -1698,7 +1736,8 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
}
addr += prog->driver->prop.cp.gridInfoBase;
bld.mkLoad(TYPE_U32, i->getDef(0),
bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
TYPE_U32, addr), NULL);
break;
case SV_SAMPLE_INDEX:
// TODO: Properly pass source as an address in the PIX address space
@@ -1715,7 +1754,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
bld.mkLoad(TYPE_F32,
i->getDef(0),
bld.mkSymbol(
FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
TYPE_U32, prog->driver->io.sampleInfoBase +
4 * sym->reg.data.sv.index),
off);
@@ -1780,7 +1819,7 @@ NVC0LoweringPass::handleSQRT(Instruction *i)
{
if (i->dType == TYPE_F64) {
Value *pred = bld.getSSA(1, FILE_PREDICATE);
Value *zero = bld.loadImm(NULL, 0.0d);
Value *zero = bld.loadImm(NULL, 0.0);
Value *dst = bld.getSSA(8);
bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
@@ -204,6 +204,11 @@ static const char *ldstSubOpStr[] =
"", "lock", "unlock"
};
// Printable names for OP_SUBFM sub-operations.  Instruction::print() indexes
// this table with subOp and prints nothing for a subOp past the end of it.
static const char *subfmOpStr[] =
{
"", "3d"
};
static const char *DataTypeStr[] =
{
"-",
@@ -548,6 +553,10 @@ void Instruction::print() const
if (subOp < Elements(ldstSubOpStr))
PRINT("%s ", ldstSubOpStr[subOp]);
break;
case OP_SUBFM:
if (subOp < Elements(subfmOpStr))
PRINT("%s ", subfmOpStr[subOp]);
break;
default:
if (subOp)
PRINT("(SUBOP:%u) ", subOp);
@@ -114,8 +114,6 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
info.io.auxCBSlot = 15;
info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
info.io.resInfoCBSlot = 15;
info.io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
info.io.msInfoCBSlot = 15;
info.io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
+59 -88
View File
@@ -67,122 +67,94 @@ nv50_screen_compute_setup(struct nv50_screen *screen,
if (ret)
return ret;
BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
PUSH_DATA (push, screen->compute->handle);
BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
BEGIN_NV04(push, NV50_CP(UNK02A0), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
BEGIN_NV04(push, NV50_CP(DMA_STACK), 1);
PUSH_DATA (push, fifo->vram);
BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2);
PUSH_DATAh(push, screen->stack_bo->offset);
PUSH_DATA (push, screen->stack_bo->offset);
BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1);
PUSH_DATA (push, 4);
BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
BEGIN_NV04(push, NV50_CP(UNK0290), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
BEGIN_NV04(push, NV50_CP(REG_MODE), 1);
PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
BEGIN_NV04(push, NV50_CP(UNK0384), 1);
PUSH_DATA (push, 0x100);
BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1);
PUSH_DATA (push, fifo->vram);
for (i = 0; i < 15; i++) {
BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1);
PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
}
BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2);
PUSH_DATA (push, 0);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1);
PUSH_DATA (push, ~0);
BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1);
PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1);
PUSH_DATA (push, 7);
BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1);
PUSH_DATA (push, 7);
BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1);
PUSH_DATA (push, fifo->vram);
BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
BEGIN_NV04(push, NV50_CP(TEX_LIMITS), 1);
PUSH_DATA (push, 0x54);
BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1);
PUSH_DATA (push, 0);
BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
BEGIN_NV04(push, NV50_CP(DMA_TIC), 1);
PUSH_DATA (push, fifo->vram);
BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3);
PUSH_DATAh(push, screen->txc->offset);
PUSH_DATA (push, screen->txc->offset);
PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
BEGIN_NV04(push, NV50_CP(DMA_TSC), 1);
PUSH_DATA (push, fifo->vram);
BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3);
PUSH_DATAh(push, screen->txc->offset + 65536);
PUSH_DATA (push, screen->txc->offset + 65536);
PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1);
PUSH_DATA (push, fifo->vram);
BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1);
PUSH_DATA (push, fifo->vram);
BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2);
PUSH_DATAh(push, screen->tls_bo->offset + 65536);
PUSH_DATA (push, screen->tls_bo->offset + 65536);
BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1);
PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
return 0;
}
/* Ensure the bound compute program is translated and uploaded.
 *
 * Returns true when the program already has memory assigned (cp->mem set) or
 * when the upload succeeds; in the latter case a CODE_CB_FLUSH method with
 * data 0 is emitted on the context's push buffer.  Returns false when
 * translation fails, when the translated program has a code size of zero, or
 * when the upload fails.
 */
static bool
nv50_compute_validate_program(struct nv50_context *nv50)
{
   struct nv50_program *cp = nv50->compprog;
   struct nouveau_pushbuf *push;

   /* cp->mem already set: nothing left to do. */
   if (cp->mem)
      return true;

   /* Translate on first use and remember the outcome on the program. */
   if (!cp->translated) {
      cp->translated = nv50_program_translate(
         cp, nv50->screen->base.device->chipset, &nv50->base.debug);
      if (!cp->translated)
         return false;
   }

   /* A code size of zero is reported as failure. */
   if (unlikely(!cp->code_size))
      return false;

   if (!nv50_program_upload_code(nv50, cp))
      return false;

   push = nv50->base.pushbuf;
   BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
   PUSH_DATA (push, 0);
   return true;
}
static void
nv50_compute_validate_globals(struct nv50_context *nv50)
{
@@ -198,26 +170,25 @@ nv50_compute_validate_globals(struct nv50_context *nv50)
}
}
static bool
nv50_compute_state_validate(struct nv50_context *nv50)
{
if (!nv50_compute_validate_program(nv50))
return false;
static struct nv50_state_validate
validate_list_cp[] = {
{ nv50_compprog_validate, NV50_NEW_CP_PROGRAM },
{ nv50_compute_validate_globals, NV50_NEW_CP_GLOBALS },
};
if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
nv50_compute_validate_globals(nv50);
static bool
nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask)
{
bool ret;
/* TODO: validate textures, samplers, surfaces */
ret = nv50_state_validate(nv50, mask, validate_list_cp,
ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp,
nv50->bufctx_cp);
nv50_bufctx_fence(nv50->bufctx_cp, false);
nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp);
if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf)))
return false;
if (unlikely(nv50->state.flushed))
nv50_bufctx_fence(nv50->bufctx_cp, true);
return true;
return ret;
}
static void
@@ -227,7 +198,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
struct nouveau_pushbuf *push = screen->base.pushbuf;
unsigned size = align(nv50->compprog->parm_size, 0x4);
BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
PUSH_DATA (push, (size / 4) << 8);
if (size) {
@@ -245,7 +216,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
nouveau_pushbuf_bufctx(push, nv50->bufctx);
nouveau_pushbuf_validate(push);
BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), size / 4);
nouveau_pushbuf_data(push, bo, offset, size);
nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
@@ -278,7 +249,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
struct nv50_program *cp = nv50->compprog;
bool ret;
ret = !nv50_compute_state_validate(nv50);
ret = !nv50_state_validate_cp(nv50, ~0);
if (ret) {
NOUVEAU_ERR("Failed to launch grid !\n");
return;
@@ -286,33 +257,33 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
nv50_compute_upload_input(nv50, info->input);
BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);
PUSH_DATA (push, nv50_compute_find_symbol(nv50, info->pc));
BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);
PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);
PUSH_DATA (push, cp->max_gpr);
/* grid/block setup */
BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2);
PUSH_DATA (push, info->block[1] << 16 | info->block[0]);
PUSH_DATA (push, info->block[2]);
BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1);
PUSH_DATA (push, 1 << 16 | block_size);
BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1);
PUSH_DATA (push, 1);
BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
BEGIN_NV04(push, NV50_CP(GRIDDIM), 1);
PUSH_DATA (push, info->grid[1] << 16 | info->grid[0]);
BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
BEGIN_NV04(push, NV50_CP(GRIDID), 1);
PUSH_DATA (push, 1);
/* kernel launching */
BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
BEGIN_NV04(push, NV50_CP(LAUNCH), 1);
PUSH_DATA (push, 0);
BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
PUSH_DATA (push, 0);
/* binding a compute shader clobbers fragment shader state */
nv50->dirty |= NV50_NEW_FRAGPROG;
nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
}
+17 -17
View File
@@ -176,8 +176,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
if (nv50->framebuffer.cbufs[i] &&
nv50->framebuffer.cbufs[i]->texture == res) {
nv50->dirty |= NV50_NEW_FRAMEBUFFER;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
if (!--ref)
return ref;
}
@@ -186,8 +186,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
if (bind & PIPE_BIND_DEPTH_STENCIL) {
if (nv50->framebuffer.zsbuf &&
nv50->framebuffer.zsbuf->texture == res) {
nv50->dirty |= NV50_NEW_FRAMEBUFFER;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
if (!--ref)
return ref;
}
@@ -202,8 +202,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
for (i = 0; i < nv50->num_vtxbufs; ++i) {
if (nv50->vtxbuf[i].buffer == res) {
nv50->dirty |= NV50_NEW_ARRAYS;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
if (!--ref)
return ref;
}
@@ -211,8 +211,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
if (nv50->idxbuf.buffer == res) {
/* Just rebind to the bufctx as there is no separate dirty bit */
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(res), RD);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(res), RD);
if (!--ref)
return ref;
}
@@ -222,8 +222,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
for (i = 0; i < nv50->num_textures[s]; ++i) {
if (nv50->textures[s][i] &&
nv50->textures[s][i]->texture == res) {
nv50->dirty |= NV50_NEW_TEXTURES;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
if (!--ref)
return ref;
}
@@ -236,9 +236,9 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
continue;
if (!nv50->constbuf[s][i].user &&
nv50->constbuf[s][i].u.buf == res) {
nv50->dirty |= NV50_NEW_CONSTBUF;
nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
nv50->constbuf_dirty[s] |= 1 << i;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
if (!--ref)
return ref;
}
@@ -345,10 +345,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->code);
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->code);
BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->uniforms);
BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->txc);
BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->stack_bo);
if (screen->compute) {
BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
@@ -357,7 +357,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->fence.bo);
BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
if (screen->compute)
BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
+43 -34
View File
@@ -26,43 +26,43 @@
#include "nv50/nv50_3d.xml.h"
#include "nv50/nv50_2d.xml.h"
#define NV50_NEW_BLEND (1 << 0)
#define NV50_NEW_RASTERIZER (1 << 1)
#define NV50_NEW_ZSA (1 << 2)
#define NV50_NEW_VERTPROG (1 << 3)
#define NV50_NEW_GMTYPROG (1 << 6)
#define NV50_NEW_FRAGPROG (1 << 7)
#define NV50_NEW_BLEND_COLOUR (1 << 8)
#define NV50_NEW_STENCIL_REF (1 << 9)
#define NV50_NEW_CLIP (1 << 10)
#define NV50_NEW_SAMPLE_MASK (1 << 11)
#define NV50_NEW_FRAMEBUFFER (1 << 12)
#define NV50_NEW_STIPPLE (1 << 13)
#define NV50_NEW_SCISSOR (1 << 14)
#define NV50_NEW_VIEWPORT (1 << 15)
#define NV50_NEW_ARRAYS (1 << 16)
#define NV50_NEW_VERTEX (1 << 17)
#define NV50_NEW_CONSTBUF (1 << 18)
#define NV50_NEW_TEXTURES (1 << 19)
#define NV50_NEW_SAMPLERS (1 << 20)
#define NV50_NEW_STRMOUT (1 << 21)
#define NV50_NEW_MIN_SAMPLES (1 << 22)
#define NV50_NEW_CONTEXT (1 << 31)
/* Dirty bits for 3d state (used with nv50_context::dirty_3d).
 * Bits 4 and 5 are not assigned.
 */
#define NV50_NEW_3D_BLEND         (1 << 0)
#define NV50_NEW_3D_RASTERIZER    (1 << 1)
#define NV50_NEW_3D_ZSA           (1 << 2)
#define NV50_NEW_3D_VERTPROG      (1 << 3)
#define NV50_NEW_3D_GMTYPROG      (1 << 6)
#define NV50_NEW_3D_FRAGPROG      (1 << 7)
#define NV50_NEW_3D_BLEND_COLOUR  (1 << 8)
#define NV50_NEW_3D_STENCIL_REF   (1 << 9)
#define NV50_NEW_3D_CLIP          (1 << 10)
#define NV50_NEW_3D_SAMPLE_MASK   (1 << 11)
#define NV50_NEW_3D_FRAMEBUFFER   (1 << 12)
#define NV50_NEW_3D_STIPPLE       (1 << 13)
#define NV50_NEW_3D_SCISSOR       (1 << 14)
#define NV50_NEW_3D_VIEWPORT      (1 << 15)
#define NV50_NEW_3D_ARRAYS        (1 << 16)
#define NV50_NEW_3D_VERTEX        (1 << 17)
#define NV50_NEW_3D_CONSTBUF      (1 << 18)
#define NV50_NEW_3D_TEXTURES      (1 << 19)
#define NV50_NEW_3D_SAMPLERS      (1 << 20)
#define NV50_NEW_3D_STRMOUT       (1 << 21)
#define NV50_NEW_3D_MIN_SAMPLES   (1 << 22)
/* Unsigned literal on purpose: (1 << 31) shifts into the sign bit of a
 * signed int, which is undefined behaviour in C.  The value is unchanged
 * when stored in a uint32_t dirty word.
 */
#define NV50_NEW_3D_CONTEXT       (1u << 31)

/* Dirty bits for compute state (used with nv50_context::dirty_cp). */
#define NV50_NEW_CP_PROGRAM   (1 << 0)
#define NV50_NEW_CP_GLOBALS   (1 << 1)
/* 3d bufctx (during draw_vbo, blit_3d) */
#define NV50_BIND_FB 0
#define NV50_BIND_VERTEX 1
#define NV50_BIND_VERTEX_TMP 2
#define NV50_BIND_INDEX 3
#define NV50_BIND_TEXTURES 4
#define NV50_BIND_CB(s, i) (5 + 16 * (s) + (i))
#define NV50_BIND_SO 53
#define NV50_BIND_SCREEN 54
#define NV50_BIND_TLS 55
#define NV50_BIND_3D_COUNT 56
#define NV50_BIND_3D_FB 0
#define NV50_BIND_3D_VERTEX 1
#define NV50_BIND_3D_VERTEX_TMP 2
#define NV50_BIND_3D_INDEX 3
#define NV50_BIND_3D_TEXTURES 4
#define NV50_BIND_3D_CB(s, i) (5 + 16 * (s) + (i))
#define NV50_BIND_3D_SO 53
#define NV50_BIND_3D_SCREEN 54
#define NV50_BIND_3D_TLS 55
#define NV50_BIND_3D_COUNT 56
/* compute bufctx (during launch_grid) */
#define NV50_BIND_CP_GLOBAL 0
@@ -115,7 +115,7 @@ struct nv50_context {
struct nouveau_bufctx *bufctx;
struct nouveau_bufctx *bufctx_cp;
uint32_t dirty;
uint32_t dirty_3d; /* dirty flags for 3d state */
uint32_t dirty_cp; /* dirty flags for compute state */
bool cb_dirty;
@@ -221,6 +221,7 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
void nv50_vertprog_validate(struct nv50_context *);
void nv50_gmtyprog_validate(struct nv50_context *);
void nv50_fragprog_validate(struct nv50_context *);
void nv50_compprog_validate(struct nv50_context *);
void nv50_fp_linkage_validate(struct nv50_context *);
void nv50_gp_linkage_validate(struct nv50_context *);
void nv50_constbufs_validate(struct nv50_context *);
@@ -231,7 +232,15 @@ void nv50_stream_output_validate(struct nv50_context *);
extern void nv50_init_state_functions(struct nv50_context *);
/* nv50_state_validate.c */
bool nv50_state_validate(struct nv50_context *, uint32_t state_mask);
/* One entry of a state validation list: a validation callback paired with a
 * mask of dirty-state bits (NV50_NEW_* flags).
 * NOTE(review): presumably func is run when any bit of states is dirty;
 * confirm against the definition of nv50_state_validate().
 */
struct nv50_state_validate {
void (*func)(struct nv50_context *);
uint32_t states;
};
/* Generic state validation entry point.  Judging by the call made from
 * nv50_state_validate_cp(), the arguments are, in order: the context, a state
 * mask, the validation list, the number of list entries, a pointer to the
 * dirty-flags word, and the bufctx to use.
 * NOTE(review): the parameters are unnamed here; confirm against the
 * definition.
 */
bool nv50_state_validate(struct nv50_context *, uint32_t,
struct nv50_state_validate *, int, uint32_t *,
struct nouveau_bufctx *);
/* 3d variant; the uint32_t is presumably a state mask (TODO confirm). */
bool nv50_state_validate_3d(struct nv50_context *, uint32_t);
/* nv50_surface.c */
extern void nv50_clear(struct pipe_context *, unsigned buffers,
@@ -335,7 +335,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
info->io.genUserClip = prog->vp.clpd_nr;
info->io.resInfoCBSlot = 15;
info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
info->io.msInfoCBSlot = 15;
@@ -202,10 +202,10 @@ nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
func = nv50_hw_sm_get_func(c);
/* configure and reset the counter(s) */
BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
| cfg->ctr[i].unit | cfg->ctr[i].mode);
BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1);
BEGIN_NV04(push, NV50_CP(MP_PM_SET(c)), 1);
PUSH_DATA (push, 0);
}
return true;
@@ -240,7 +240,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
PUSH_SPACE(push, 8);
for (c = 0; c < 4; c++) {
if (screen->pm.mp_counter[c]) {
BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
PUSH_DATA (push, 0);
}
}
@@ -257,7 +257,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
hq->bo);
PUSH_SPACE(push, 2);
BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
PUSH_DATA (push, 0);
pipe->bind_compute_state(pipe, screen->pm.prog);
@@ -295,7 +295,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
mask |= 1 << hsq->ctr[i];
func = nv50_hw_sm_get_func(hsq->ctr[i]);
BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1);
BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(hsq->ctr[i])), 1);
PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
| cfg->ctr[i].unit | cfg->ctr[i].mode);
}
@@ -29,6 +29,8 @@
#include "nv50/nv50_context.h"
#include "nv50/nv50_query_hw.h"
#include "nv50/nv50_compute.xml.h"
void
nv50_constbufs_validate(struct nv50_context *nv50)
{
@@ -94,7 +96,7 @@ nv50_constbufs_validate(struct nv50_context *nv50)
BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
BCTX_REFN(nv50->bufctx_3d, 3D_CB(s, i), res, RD);
nv50->cb_dirty = 1; /* Force cache flush for UBO. */
} else {
@@ -131,14 +133,14 @@ nv50_program_update_context_state(struct nv50_context *nv50,
if (prog && prog->tls_space) {
if (nv50->state.new_tls_space)
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
if (!nv50->state.tls_required || nv50->state.new_tls_space)
BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
BCTX_REFN_bo(nv50->bufctx_3d, 3D_TLS, flags, nv50->screen->tls_bo);
nv50->state.new_tls_space = false;
nv50->state.tls_required |= 1 << stage;
} else {
if (nv50->state.tls_required == (1 << stage))
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
nv50->state.tls_required &= ~(1 << stage);
}
}
@@ -181,7 +183,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
fp->fp.force_persample_interp = rast->force_persample_interp;
}
if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_MIN_SAMPLES)))
return;
if (!nv50_program_validate(nv50, fp))
@@ -238,6 +240,19 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
/* GP_ENABLE is updated in linkage validation */
}
/* Validate the bound compute program and emit a CODE_CB_FLUSH method (data 0)
 * on the context's push buffer.
 *
 * If a compute program is bound and nv50_program_validate() fails for it, the
 * function returns without emitting anything.  The flush is emitted both after
 * a successful validation and when no compute program is bound.
 */
void
nv50_compprog_validate(struct nv50_context *nv50)
{
   struct nouveau_pushbuf *pb = nv50->base.pushbuf;
   struct nv50_program *prog = nv50->compprog;

   if (prog) {
      /* A bound program that fails validation aborts the update. */
      if (!nv50_program_validate(nv50, prog))
         return;
   }

   BEGIN_NV04(pb, NV50_CP(CODE_CB_FLUSH), 1);
   PUSH_DATA (pb, 0);
}
static void
nv50_sprite_coords_validate(struct nv50_context *nv50)
{
@@ -309,7 +324,7 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
}
if (nv50->dirty & NV50_NEW_FRAGPROG)
if (nv50->dirty_3d & NV50_NEW_3D_FRAGPROG)
return;
psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN;
@@ -378,9 +393,9 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
uint8_t map[64];
uint8_t so_map[64];
if (!(nv50->dirty & (NV50_NEW_VERTPROG |
NV50_NEW_FRAGPROG |
NV50_NEW_GMTYPROG))) {
if (!(nv50->dirty_3d & (NV50_NEW_3D_VERTPROG |
NV50_NEW_3D_FRAGPROG |
NV50_NEW_3D_GMTYPROG))) {
uint8_t bfc, ffc;
ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK);
bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK)
@@ -633,8 +648,6 @@ nv50_stream_output_validate(struct nv50_context *nv50)
BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
PUSH_DATA (push, ctrl);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
for (i = 0; i < nv50->num_so_targets; ++i) {
struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
@@ -664,7 +677,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
prims = MIN2(prims, limit);
}
targ->stride = so->stride[i];
BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
BCTX_REFN(nv50->bufctx_3d, 3D_SO, buf, WR);
}
if (prims != ~0) {
BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
+30 -28
View File
@@ -200,7 +200,7 @@ nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->blend = hwcso;
nv50->dirty |= NV50_NEW_BLEND;
nv50->dirty_3d |= NV50_NEW_3D_BLEND;
}
static void
@@ -337,7 +337,7 @@ nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->rast = hwcso;
nv50->dirty |= NV50_NEW_RASTERIZER;
nv50->dirty_3d |= NV50_NEW_3D_RASTERIZER;
}
static void
@@ -426,7 +426,7 @@ nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->zsa = hwcso;
nv50->dirty |= NV50_NEW_ZSA;
nv50->dirty_3d |= NV50_NEW_3D_ZSA;
}
static void
@@ -605,7 +605,7 @@ nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
nv50->num_samplers[s] = nr;
nv50->dirty |= NV50_NEW_SAMPLERS;
nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS;
}
static void
@@ -698,9 +698,9 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
nv50->num_textures[s] = nr;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
nv50->dirty |= NV50_NEW_TEXTURES;
nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
}
static void
@@ -776,7 +776,7 @@ nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->vertprog = hwcso;
nv50->dirty |= NV50_NEW_VERTPROG;
nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
}
static void *
@@ -792,7 +792,7 @@ nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->fragprog = hwcso;
nv50->dirty |= NV50_NEW_FRAGPROG;
nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
}
static void *
@@ -808,7 +808,7 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->gmtyprog = hwcso;
nv50->dirty |= NV50_NEW_GMTYPROG;
nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
}
static void *
@@ -857,7 +857,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
nv50->constbuf[s][i].u.buf = NULL;
else
if (nv50->constbuf[s][i].u.buf)
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);
@@ -882,7 +882,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
}
nv50->constbuf_dirty[s] |= 1 << i;
nv50->dirty |= NV50_NEW_CONSTBUF;
nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
}
/* =============================================================================
@@ -895,7 +895,7 @@ nv50_set_blend_color(struct pipe_context *pipe,
struct nv50_context *nv50 = nv50_context(pipe);
nv50->blend_colour = *bcol;
nv50->dirty |= NV50_NEW_BLEND_COLOUR;
nv50->dirty_3d |= NV50_NEW_3D_BLEND_COLOUR;
}
static void
@@ -905,7 +905,7 @@ nv50_set_stencil_ref(struct pipe_context *pipe,
struct nv50_context *nv50 = nv50_context(pipe);
nv50->stencil_ref = *sr;
nv50->dirty |= NV50_NEW_STENCIL_REF;
nv50->dirty_3d |= NV50_NEW_3D_STENCIL_REF;
}
static void
@@ -916,7 +916,7 @@ nv50_set_clip_state(struct pipe_context *pipe,
memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp));
nv50->dirty |= NV50_NEW_CLIP;
nv50->dirty_3d |= NV50_NEW_3D_CLIP;
}
static void
@@ -925,7 +925,7 @@ nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->sample_mask = sample_mask;
nv50->dirty |= NV50_NEW_SAMPLE_MASK;
nv50->dirty_3d |= NV50_NEW_3D_SAMPLE_MASK;
}
static void
@@ -935,7 +935,7 @@ nv50_set_min_samples(struct pipe_context *pipe, unsigned min_samples)
if (nv50->min_samples != min_samples) {
nv50->min_samples = min_samples;
nv50->dirty |= NV50_NEW_MIN_SAMPLES;
nv50->dirty_3d |= NV50_NEW_3D_MIN_SAMPLES;
}
}
@@ -945,11 +945,11 @@ nv50_set_framebuffer_state(struct pipe_context *pipe,
{
struct nv50_context *nv50 = nv50_context(pipe);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
util_copy_framebuffer_state(&nv50->framebuffer, fb);
nv50->dirty |= NV50_NEW_FRAMEBUFFER;
nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
}
static void
@@ -959,7 +959,7 @@ nv50_set_polygon_stipple(struct pipe_context *pipe,
struct nv50_context *nv50 = nv50_context(pipe);
nv50->stipple = *stipple;
nv50->dirty |= NV50_NEW_STIPPLE;
nv50->dirty_3d |= NV50_NEW_3D_STIPPLE;
}
static void
@@ -977,7 +977,7 @@ nv50_set_scissor_states(struct pipe_context *pipe,
continue;
nv50->scissors[start_slot + i] = scissor[i];
nv50->scissors_dirty |= 1 << (start_slot + i);
nv50->dirty |= NV50_NEW_SCISSOR;
nv50->dirty_3d |= NV50_NEW_3D_SCISSOR;
}
}
@@ -996,7 +996,7 @@ nv50_set_viewport_states(struct pipe_context *pipe,
continue;
nv50->viewports[start_slot + i] = vpt[i];
nv50->viewports_dirty |= 1 << (start_slot + i);
nv50->dirty |= NV50_NEW_VIEWPORT;
nv50->dirty_3d |= NV50_NEW_3D_VIEWPORT;
}
}
@@ -1008,8 +1008,8 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
struct nv50_context *nv50 = nv50_context(pipe);
unsigned i;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
nv50->dirty |= NV50_NEW_ARRAYS;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
start_slot, count);
@@ -1051,14 +1051,14 @@ nv50_set_index_buffer(struct pipe_context *pipe,
struct nv50_context *nv50 = nv50_context(pipe);
if (nv50->idxbuf.buffer)
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
if (ib) {
pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer);
nv50->idxbuf.index_size = ib->index_size;
if (ib->buffer) {
nv50->idxbuf.offset = ib->offset;
BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(ib->buffer), RD);
BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(ib->buffer), RD);
} else {
nv50->idxbuf.user_buffer = ib->user_buffer;
}
@@ -1073,7 +1073,7 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
struct nv50_context *nv50 = nv50_context(pipe);
nv50->vertex = hwcso;
nv50->dirty |= NV50_NEW_VERTEX;
nv50->dirty_3d |= NV50_NEW_3D_VERTEX;
}
static struct pipe_stream_output_target *
@@ -1180,8 +1180,10 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
}
nv50->num_so_targets = num_targets;
if (nv50->so_targets_dirty)
nv50->dirty |= NV50_NEW_STRMOUT;
if (nv50->so_targets_dirty) {
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO);
nv50->dirty_3d |= NV50_NEW_3D_STRMOUT;
}
}
static void
@@ -25,7 +25,7 @@ nv50_validate_fb(struct nv50_context *nv50)
unsigned ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1;
uint32_t array_size = 0xffff, array_mode = 0;
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
@@ -90,7 +90,7 @@ nv50_validate_fb(struct nv50_context *nv50)
mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
/* only register for writing, otherwise we'd always serialize here */
BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
}
if (fb->zsbuf) {
@@ -118,7 +118,7 @@ nv50_validate_fb(struct nv50_context *nv50)
mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
} else {
BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
PUSH_DATA (push, 0);
@@ -187,8 +187,8 @@ nv50_validate_scissor(struct nv50_context *nv50)
#ifdef NV50_SCISSORS_CLIPPING
int minx, maxx, miny, maxy, i;
if (!(nv50->dirty &
(NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) &&
if (!(nv50->dirty_3d &
(NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT | NV50_NEW_3D_FRAMEBUFFER)) &&
nv50->state.scissor == nv50->rast->pipe.scissor)
return;
@@ -197,7 +197,7 @@ nv50_validate_scissor(struct nv50_context *nv50)
nv50->state.scissor = nv50->rast->pipe.scissor;
if ((nv50->dirty & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor)
if ((nv50->dirty_3d & NV50_NEW_3D_FRAMEBUFFER) && !nv50->state.scissor)
nv50->scissors_dirty = (1 << NV50_MAX_VIEWPORTS) - 1;
for (i = 0; i < NV50_MAX_VIEWPORTS; i++) {
@@ -290,10 +290,10 @@ nv50_check_program_ucps(struct nv50_context *nv50,
vp->vp.clpd_nr = n;
if (likely(vp == nv50->vertprog)) {
nv50->dirty |= NV50_NEW_VERTPROG;
nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
nv50_vertprog_validate(nv50);
} else {
nv50->dirty |= NV50_NEW_GMTYPROG;
nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
nv50_gmtyprog_validate(nv50);
}
nv50_fp_linkage_validate(nv50);
@@ -342,7 +342,7 @@ nv50_validate_clip(struct nv50_context *nv50)
struct nv50_program *vp;
uint8_t clip_enable;
if (nv50->dirty & NV50_NEW_CLIP) {
if (nv50->dirty_3d & NV50_NEW_3D_CLIP) {
BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
PUSH_DATA (push, (NV50_CB_AUX_UCP_OFFSET << 8) | NV50_CB_AUX);
BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4);
@@ -436,7 +436,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
else
ctx_to->state = ctx_to->screen->save_state;
ctx_to->dirty = ~0;
ctx_to->dirty_3d = ~0;
ctx_to->dirty_cp = ~0;
ctx_to->viewports_dirty = ~0;
ctx_to->scissors_dirty = ~0;
@@ -445,71 +446,71 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1;
if (!ctx_to->vertex)
ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
ctx_to->dirty_3d &= ~(NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS);
if (!ctx_to->vertprog)
ctx_to->dirty &= ~NV50_NEW_VERTPROG;
ctx_to->dirty_3d &= ~NV50_NEW_3D_VERTPROG;
if (!ctx_to->fragprog)
ctx_to->dirty &= ~NV50_NEW_FRAGPROG;
ctx_to->dirty_3d &= ~NV50_NEW_3D_FRAGPROG;
if (!ctx_to->blend)
ctx_to->dirty &= ~NV50_NEW_BLEND;
ctx_to->dirty_3d &= ~NV50_NEW_3D_BLEND;
if (!ctx_to->rast)
#ifdef NV50_SCISSORS_CLIPPING
ctx_to->dirty &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
ctx_to->dirty_3d &= ~(NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_SCISSOR);
#else
ctx_to->dirty &= ~NV50_NEW_RASTERIZER;
ctx_to->dirty_3d &= ~NV50_NEW_3D_RASTERIZER;
#endif
if (!ctx_to->zsa)
ctx_to->dirty &= ~NV50_NEW_ZSA;
ctx_to->dirty_3d &= ~NV50_NEW_3D_ZSA;
ctx_to->screen->cur_ctx = ctx_to;
}
/* NOTE(review): this span appears to be a rendered diff hunk in which the
 * removed lines (struct state_validate / validate_list[] / NV50_NEW_* flags)
 * and the added lines (struct nv50_state_validate / validate_list_3d[] /
 * NV50_NEW_3D_* flags) are interleaved without +/- markers -- confirm against
 * the real file before treating it as compilable C.
 *
 * Both tables pair a validation callback (func) with a mask of dirty bits
 * (states). nv50_state_validate() walks a table in index order and calls an
 * entry's func whenever any of its states bits is set in the masked dirty word.
 */
static struct state_validate {
void (*func)(struct nv50_context *);
uint32_t states;
} validate_list[] = {
{ nv50_validate_fb, NV50_NEW_FRAMEBUFFER },
{ nv50_validate_blend, NV50_NEW_BLEND },
{ nv50_validate_zsa, NV50_NEW_ZSA },
{ nv50_validate_sample_mask, NV50_NEW_SAMPLE_MASK },
{ nv50_validate_rasterizer, NV50_NEW_RASTERIZER },
{ nv50_validate_blend_colour, NV50_NEW_BLEND_COLOUR },
{ nv50_validate_stencil_ref, NV50_NEW_STENCIL_REF },
{ nv50_validate_stipple, NV50_NEW_STIPPLE },
/* Table keyed on the NV50_NEW_3D_* flags; it lists the same callbacks, in the
 * same order, as the NV50_NEW_* entries shown alongside it. */
static struct nv50_state_validate
validate_list_3d[] = {
{ nv50_validate_fb, NV50_NEW_3D_FRAMEBUFFER },
{ nv50_validate_blend, NV50_NEW_3D_BLEND },
{ nv50_validate_zsa, NV50_NEW_3D_ZSA },
{ nv50_validate_sample_mask, NV50_NEW_3D_SAMPLE_MASK },
{ nv50_validate_rasterizer, NV50_NEW_3D_RASTERIZER },
{ nv50_validate_blend_colour, NV50_NEW_3D_BLEND_COLOUR },
{ nv50_validate_stencil_ref, NV50_NEW_3D_STENCIL_REF },
{ nv50_validate_stipple, NV50_NEW_3D_STIPPLE },
/* With NV50_SCISSORS_CLIPPING defined, the scissor validator is additionally
 * keyed on the viewport, rasterizer and framebuffer flags. */
#ifdef NV50_SCISSORS_CLIPPING
{ nv50_validate_scissor, NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT |
NV50_NEW_RASTERIZER |
NV50_NEW_FRAMEBUFFER },
{ nv50_validate_scissor, NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT |
NV50_NEW_3D_RASTERIZER |
NV50_NEW_3D_FRAMEBUFFER },
#else
{ nv50_validate_scissor, NV50_NEW_SCISSOR },
{ nv50_validate_scissor, NV50_NEW_3D_SCISSOR },
#endif
{ nv50_validate_viewport, NV50_NEW_VIEWPORT },
{ nv50_vertprog_validate, NV50_NEW_VERTPROG },
{ nv50_gmtyprog_validate, NV50_NEW_GMTYPROG },
{ nv50_fragprog_validate, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
NV50_NEW_MIN_SAMPLES },
{ nv50_fp_linkage_validate, NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },
{ nv50_gp_linkage_validate, NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
{ nv50_validate_derived_rs, NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
{ nv50_validate_derived_2, NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
{ nv50_validate_derived_3, NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
{ nv50_validate_clip, NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
{ nv50_constbufs_validate, NV50_NEW_CONSTBUF },
{ nv50_validate_textures, NV50_NEW_TEXTURES },
{ nv50_validate_samplers, NV50_NEW_SAMPLERS },
{ nv50_stream_output_validate, NV50_NEW_STRMOUT |
NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
{ nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS },
{ nv50_validate_min_samples, NV50_NEW_MIN_SAMPLES },
{ nv50_validate_viewport, NV50_NEW_3D_VIEWPORT },
{ nv50_vertprog_validate, NV50_NEW_3D_VERTPROG },
{ nv50_gmtyprog_validate, NV50_NEW_3D_GMTYPROG },
{ nv50_fragprog_validate, NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
NV50_NEW_3D_MIN_SAMPLES },
{ nv50_fp_linkage_validate, NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_VERTPROG |
NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_RASTERIZER },
{ nv50_gp_linkage_validate, NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_VERTPROG },
{ nv50_validate_derived_rs, NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
{ nv50_validate_derived_2, NV50_NEW_3D_ZSA | NV50_NEW_3D_FRAMEBUFFER },
{ nv50_validate_derived_3, NV50_NEW_3D_BLEND | NV50_NEW_3D_FRAMEBUFFER },
{ nv50_validate_clip, NV50_NEW_3D_CLIP | NV50_NEW_3D_RASTERIZER |
NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
{ nv50_constbufs_validate, NV50_NEW_3D_CONSTBUF },
{ nv50_validate_textures, NV50_NEW_3D_TEXTURES },
{ nv50_validate_samplers, NV50_NEW_3D_SAMPLERS },
{ nv50_stream_output_validate, NV50_NEW_3D_STRMOUT |
NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
{ nv50_vertex_arrays_validate, NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS },
{ nv50_validate_min_samples, NV50_NEW_3D_MIN_SAMPLES },
};
/* Generic state validation driver.
 *
 * As far as the visible lines show: switches to this context if it is not the
 * screen's current one, computes (*dirty & mask), calls the func of every
 * validate_list entry (size entries, index order) whose states mask
 * intersects that value, clears those bits from *dirty, fences bufctx, then
 * attaches bufctx to the context's pushbuf and validates the pushbuf.
 * Returns true when nouveau_pushbuf_validate() returns 0.
 *
 * NOTE(review): this span appears to be a rendered diff. The one-argument
 * signature, the nv50->dirty / nv50->bufctx_3d statements and the
 * ARRAY_SIZE(validate_list) loop look like the removed side of lines whose
 * replacement follows immediately; the `@@` lines are hunk headers, and code
 * between hunks (e.g. the declaration of `i`) is not shown here -- confirm
 * against the real file.
 */
bool
nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
nv50_state_validate(struct nv50_context *nv50, uint32_t mask,
struct nv50_state_validate *validate_list, int size,
uint32_t *dirty, struct nouveau_bufctx *bufctx)
{
uint32_t state_mask;
int ret;
@@ -518,16 +519,16 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
if (nv50->screen->cur_ctx != nv50)
nv50_switch_pipe_context(nv50);
state_mask = nv50->dirty & mask;
state_mask = *dirty & mask;
if (state_mask) {
for (i = 0; i < ARRAY_SIZE(validate_list); ++i) {
struct state_validate *validate = &validate_list[i];
for (i = 0; i < size; i++) {
struct nv50_state_validate *validate = &validate_list[i];
if (state_mask & validate->states)
validate->func(nv50);
}
nv50->dirty &= ~state_mask;
*dirty &= ~state_mask;
if (nv50->state.rt_serialize) {
nv50->state.rt_serialize = false;
@@ -535,14 +536,26 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
PUSH_DATA (nv50->base.pushbuf, 0);
}
nv50_bufctx_fence(nv50->bufctx_3d, false);
nv50_bufctx_fence(bufctx, false);
}
nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
nouveau_pushbuf_bufctx(nv50->base.pushbuf, bufctx);
ret = nouveau_pushbuf_validate(nv50->base.pushbuf);
return !ret;
}
/* 3D wrapper around nv50_state_validate(): passes validate_list_3d, its
 * element count, &nv50->dirty_3d and nv50->bufctx_3d, then, if
 * nv50->state.flushed is set, clears it and calls
 * nv50_bufctx_fence(nv50->bufctx_3d, true).
 *
 * NOTE(review): the two consecutive return statements below look like the
 * removed (`return !ret;`) and added (`return ret;`) sides of one diff line
 * rather than live code -- confirm against the real file.
 */
bool
nv50_state_validate_3d(struct nv50_context *nv50, uint32_t mask)
{
bool ret;
ret = nv50_state_validate(nv50, mask, validate_list_3d,
ARRAY_SIZE(validate_list_3d), &nv50->dirty_3d,
nv50->bufctx_3d);
if (unlikely(nv50->state.flushed)) {
nv50->state.flushed = false;
nv50_bufctx_fence(nv50->bufctx_3d, true);
}
return !ret;
return ret;
}

Some files were not shown because too many files have changed in this diff Show More