Merge remote-tracking branch 'public/master' into vulkan

2016-03-24 17:30:14 -07:00
parent a5dc3c0f02 22b343a8ec
commit 2c3f95d6aa
266 changed files with 4907 additions and 3626 deletions
@@ -704,8 +704,10 @@ test "x$enable_asm" = xno && AC_MSG_RESULT([no])
 if test "x$enable_asm" = xyes -a "x$cross_compiling" = xyes; then
    case "$host_cpu" in
    i?86 | x86_64 | amd64)
-        enable_asm=no
-        AC_MSG_RESULT([no, cross compiling])
+        if test "x$host_cpu" != "x$target_cpu"; then
+            enable_asm=no
+            AC_MSG_RESULT([no, cross compiling])
+        fi
        ;;
    esac
 fi
@@ -929,12 +931,6 @@ AC_ARG_ENABLE([xlib-glx],
    [enable_xlib_glx="$enableval"],
    [enable_xlib_glx=no])

-AC_ARG_ENABLE([r600-llvm-compiler],
-    [AS_HELP_STRING([--enable-r600-llvm-compiler],
-        [Enable experimental LLVM backend for graphics shaders @<:@default=disabled@:>@])],
-    [enable_r600_llvm="$enableval"],
-    [enable_r600_llvm=no])
-
 AC_ARG_ENABLE([gallium-tests],
    [AS_HELP_STRING([--enable-gallium-tests],
        [Enable optional Gallium tests) @<:@default=disabled@:>@])],
@@ -2238,14 +2234,8 @@ if test -n "$with_gallium_drivers"; then
            PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
            gallium_require_drm "Gallium R600"
            gallium_require_drm_loader
-            if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
-                radeon_llvm_check "r600g"
-                LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
-            fi
-            if test "x$enable_r600_llvm" = xyes; then
-                USE_R600_LLVM_COMPILER=yes;
-            fi
            if test "x$enable_opencl" = xyes; then
+                radeon_llvm_check "r600g"
                LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
            fi
            ;;
@@ -2416,7 +2406,6 @@ AM_CONDITIONAL(NEED_RADEON_DRM_WINSYS, test "x$HAVE_GALLIUM_R300" = xyes -o \
                                            "x$HAVE_GALLIUM_RADEONSI" = xyes)
 AM_CONDITIONAL(NEED_WINSYS_XLIB, test "x$NEED_WINSYS_XLIB" = xyes)
 AM_CONDITIONAL(NEED_RADEON_LLVM, test x$NEED_RADEON_LLVM = xyes)
-AM_CONDITIONAL(USE_R600_LLVM_COMPILER, test x$USE_R600_LLVM_COMPILER = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_COMPUTE, test x$enable_opencl = xyes)
 AM_CONDITIONAL(HAVE_MESA_LLVM, test x$MESA_LLVM = x1)
 AM_CONDITIONAL(USE_VC4_SIMULATOR, test x$USE_VC4_SIMULATOR = xyes)
@@ -1,13 +1,28 @@
+# Status of OpenGL extensions in Mesa

-Status of OpenGL 3.x features in Mesa
+Here's how to read this file:

+all DONE: <driver>, ...
+    All the extensions are done for the given list of drivers.

-Note: when an item is marked as "DONE" it means all the core Mesa
-infrastructure is complete but it may be the case that few (if any) drivers
-implement the features.
+DONE
+    The extension is done for Mesa and no implementation is necessary on the
+    driver-side.

+DONE ()
+    The extension is done for Mesa and all the drivers in the "all DONE" list.

-OpenGL Core and Compatibility context support
+DONE (<driver>, ...)
+    The extension is done for Mesa, all the drivers in the "all DONE" list, and
+    all the drivers in the brackets.
+
+in progress
+    The extension is started but not finished yet.
+
+not started
+    The extension isn't started yet.
+
+# OpenGL Core and Compatibility context support

 OpenGL 3.1 and later versions are only supported with the Core profile.
 There are no plans to support GL_ARB_compatibility. The last supported OpenGL
@@ -15,30 +30,30 @@ version with all deprecated features is 3.0. Some of the later GL features
 are exposed in the 3.0 context as extensions.


-Feature                                               Status
----------------------------------------------------- ------------------------
+Feature                                                 Status
+------------------------------------------------------- ------------------------

 GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe

  glBindFragDataLocation, glGetFragDataLocation         DONE
-  Conditional rendering (GL_NV_conditional_render)      DONE ()
-  Map buffer subranges (GL_ARB_map_buffer_range)        DONE ()
-  Clamping controls (GL_ARB_color_buffer_float)         DONE ()
-  Float textures, renderbuffers (GL_ARB_texture_float)  DONE ()
+  GL_NV_conditional_render (Conditional rendering)      DONE ()
+  GL_ARB_map_buffer_range (Map buffer subranges)        DONE ()
+  GL_ARB_color_buffer_float (Clamping controls)         DONE ()
+  GL_ARB_texture_float (Float textures, renderbuffers)  DONE ()
  GL_EXT_packed_float                                   DONE ()
  GL_EXT_texture_shared_exponent                        DONE ()
-  Float depth buffers (GL_ARB_depth_buffer_float)       DONE ()
-  Framebuffer objects (GL_ARB_framebuffer_object)       DONE ()
+  GL_ARB_depth_buffer_float (Float depth buffers)       DONE ()
+  GL_ARB_framebuffer_object (Framebuffer objects)       DONE ()
  GL_ARB_half_float_pixel                               DONE (all drivers)
  GL_ARB_half_float_vertex                              DONE ()
  GL_EXT_texture_integer                                DONE ()
  GL_EXT_texture_array                                  DONE ()
-  Per-buffer blend and masks (GL_EXT_draw_buffers2)     DONE ()
+  GL_EXT_draw_buffers2 (Per-buffer blend and masks)     DONE ()
  GL_EXT_texture_compression_rgtc                       DONE ()
  GL_ARB_texture_rg                                     DONE ()
-  Transform feedback (GL_EXT_transform_feedback)        DONE ()
-  Vertex array objects (GL_ARB_vertex_array_object)     DONE ()
-  sRGB framebuffer format (GL_EXT_framebuffer_sRGB)     DONE ()
+  GL_EXT_transform_feedback (Transform feedback)        DONE ()
+  GL_ARB_vertex_array_object (Vertex array objects)     DONE ()
+  GL_EXT_framebuffer_sRGB (sRGB framebuffer format)     DONE ()
  glClearBuffer commands                                DONE
  glGetStringi command                                  DONE
  glTexParameterI, glGetTexParameterI commands          DONE
@@ -53,28 +68,28 @@ GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
 GL 3.1, GLSL 1.40 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe

  Forward compatible context support/deprecations       DONE ()
-  Instanced drawing (GL_ARB_draw_instanced)             DONE ()
-  Buffer copying (GL_ARB_copy_buffer)                   DONE ()
-  Primitive restart (GL_NV_primitive_restart)           DONE ()
+  GL_ARB_draw_instanced (Instanced drawing)             DONE ()
+  GL_ARB_copy_buffer (Buffer copying)                   DONE ()
+  GL_NV_primitive_restart (Primitive restart)           DONE ()
  16 vertex texture image units                         DONE ()
-  Texture buffer objs (GL_ARB_texture_buffer_object)    DONE for OpenGL 3.1 contexts ()
-  Rectangular textures (GL_ARB_texture_rectangle)       DONE ()
-  Uniform buffer objs (GL_ARB_uniform_buffer_object)    DONE ()
-  Signed normalized textures (GL_EXT_texture_snorm)     DONE ()
+  GL_ARB_texture_buffer_object (Texture buffer objs)    DONE (for OpenGL 3.1 contexts)
+  GL_ARB_texture_rectangle (Rectangular textures)       DONE ()
+  GL_ARB_uniform_buffer_object (Uniform buffer objs)    DONE ()
+  GL_EXT_texture_snorm (Signed normalized textures)     DONE ()


 GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe

  Core/compatibility profiles                           DONE
  Geometry shaders                                      DONE ()
-  BGRA vertex order (GL_ARB_vertex_array_bgra)          DONE ()
-  Base vertex offset(GL_ARB_draw_elements_base_vertex)  DONE ()
-  Frag shader coord (GL_ARB_fragment_coord_conventions) DONE ()
-  Provoking vertex (GL_ARB_provoking_vertex)            DONE ()
-  Seamless cubemaps (GL_ARB_seamless_cube_map)          DONE ()
-  Multisample textures (GL_ARB_texture_multisample)     DONE ()
-  Frag depth clamp (GL_ARB_depth_clamp)                 DONE ()
-  Fence objects (GL_ARB_sync)                           DONE ()
+  GL_ARB_vertex_array_bgra (BGRA vertex order)          DONE ()
+  GL_ARB_draw_elements_base_vertex (Base vertex offset) DONE ()
+  GL_ARB_fragment_coord_conventions (Frag shader coord) DONE ()
+  GL_ARB_provoking_vertex (Provoking vertex)            DONE ()
+  GL_ARB_seamless_cube_map (Seamless cubemaps)          DONE ()
+  GL_ARB_texture_multisample (Multisample textures)     DONE ()
+  GL_ARB_depth_clamp (Frag depth clamp)                 DONE ()
+  GL_ARB_sync (Fence objects)                           DONE ()
  GLX_ARB_create_context_profile                        DONE


@@ -94,170 +109,170 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft

 GL 4.0, GLSL 4.00 --- all DONE: nvc0, r600, radeonsi

-  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965)
-  - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (softpipe)
-  - Dynamically uniform UBO array indices              DONE ()
-  - Implicit signed -> unsigned conversions            DONE
-  - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (softpipe)
-  - Enhanced textureGather                             DONE (softpipe)
-  - Geometry shader instancing                         DONE (llvmpipe, softpipe)
-  - Geometry shader multiple streams                   DONE ()
-  - Enhanced per-sample shading                        DONE ()
-  - Interpolation functions                            DONE ()
-  - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
-  GL_ARB_sample_shading                                DONE (i965, nv50)
-  GL_ARB_shader_subroutine                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE (i965)
-  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, llvmpipe, softpipe)
-  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_texture_query_lod                             DONE (i965, nv50, softpipe)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_draw_buffers_blend                             DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                  DONE (i965, llvmpipe, softpipe)
+  GL_ARB_gpu_shader5                                    DONE (i965)
+  - 'precise' qualifier                                 DONE
+  - Dynamically uniform sampler array indices           DONE (softpipe)
+  - Dynamically uniform UBO array indices               DONE ()
+  - Implicit signed -> unsigned conversions             DONE
+  - Fused multiply-add                                  DONE ()
+  - Packing/bitfield/conversion functions               DONE (softpipe)
+  - Enhanced textureGather                              DONE (softpipe)
+  - Geometry shader instancing                          DONE (llvmpipe, softpipe)
+  - Geometry shader multiple streams                    DONE ()
+  - Enhanced per-sample shading                         DONE ()
+  - Interpolation functions                             DONE ()
+  - New overload resolution rules                       DONE
+  GL_ARB_gpu_shader_fp64                                DONE (llvmpipe, softpipe)
+  GL_ARB_sample_shading                                 DONE (i965, nv50)
+  GL_ARB_shader_subroutine                              DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_tessellation_shader                            DONE (i965)
+  GL_ARB_texture_buffer_object_rgb32                    DONE (i965, llvmpipe, softpipe)
+  GL_ARB_texture_cube_map_array                         DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_gather                                 DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_query_lod                              DONE (i965, nv50, softpipe)
+  GL_ARB_transform_feedback2                            DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                            DONE (i965, nv50, llvmpipe, softpipe)


 GL 4.1, GLSL 4.10 --- all DONE: nvc0, r600, radeonsi

-  GL_ARB_ES2_compatibility                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_get_program_binary                            DONE (0 binary formats)
-  GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
-  GL_ARB_vertex_attrib_64bit                           DONE (llvmpipe, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_ES2_compatibility                              DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_get_program_binary                             DONE (0 binary formats)
+  GL_ARB_separate_shader_objects                        DONE (all drivers)
+  GL_ARB_shader_precision                               DONE (all drivers that support GLSL 4.10)
+  GL_ARB_vertex_attrib_64bit                            DONE (llvmpipe, softpipe)
+  GL_ARB_viewport_array                                 DONE (i965, nv50, llvmpipe, softpipe)


 GL 4.2, GLSL 4.20:

-  GL_ARB_texture_compression_bptc                      DONE (i965, nvc0, r600, radeonsi)
-  GL_ARB_compressed_texture_pixel_storage              DONE (all drivers)
-  GL_ARB_shader_atomic_counters                        DONE (i965, nvc0)
-  GL_ARB_texture_storage                               DONE (all drivers)
-  GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_base_instance                                 DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                       DONE (i965)
-  GL_ARB_conservative_depth                            DONE (all drivers that support GLSL 1.30)
-  GL_ARB_shading_language_420pack                      DONE (all drivers that support GLSL 1.30)
-  GL_ARB_shading_language_packing                      DONE (all drivers)
-  GL_ARB_internalformat_query                          DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_map_buffer_alignment                          DONE (all drivers)
+  GL_ARB_texture_compression_bptc                       DONE (i965, nvc0, r600, radeonsi)
+  GL_ARB_compressed_texture_pixel_storage               DONE (all drivers)
+  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0)
+  GL_ARB_texture_storage                                DONE (all drivers)
+  GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_base_instance                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_shader_image_load_store                        DONE (i965, radeonsi)
+  GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
+  GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
+  GL_ARB_shading_language_packing                       DONE (all drivers)
+  GL_ARB_internalformat_query                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_map_buffer_alignment                           DONE (all drivers)


 GL 4.3, GLSL 4.30:

-  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
-  GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
-  GL_ARB_clear_buffer_object                           DONE (all drivers)
-  GL_ARB_compute_shader                                DONE (i965)
-  GL_ARB_copy_image                                    DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_KHR_debug                                         DONE (all drivers)
-  GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                       DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
-  GL_ARB_framebuffer_no_attachments                    DONE (i965)
-  GL_ARB_internalformat_query2                         DONE (i965)
-  GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_program_interface_query                       DONE (all drivers)
-  GL_ARB_robust_buffer_access_behavior                 not started
-  GL_ARB_shader_image_size                             DONE (i965)
-  GL_ARB_shader_storage_buffer_object                  DONE (i965, nvc0)
-  GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
-  GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
-  GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_vertex_attrib_binding                         DONE (all drivers)
+  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
+  GL_ARB_ES3_compatibility                              DONE (all drivers that support GLSL 3.30)
+  GL_ARB_clear_buffer_object                            DONE (all drivers)
+  GL_ARB_compute_shader                                 DONE (i965)
+  GL_ARB_copy_image                                     DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_KHR_debug                                          DONE (all drivers)
+  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
+  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_internalformat_query2                          DONE (all drivers)
+  GL_ARB_invalidate_subdata                             DONE (all drivers)
+  GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_program_interface_query                        DONE (all drivers)
+  GL_ARB_robust_buffer_access_behavior                  not started
+  GL_ARB_shader_image_size                              DONE (i965, radeonsi)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_buffer_range                           DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
+  GL_ARB_texture_query_levels                           DONE (all drivers that support GLSL 1.30)
+  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_ARB_texture_view                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_vertex_attrib_binding                          DONE (all drivers)


 GL 4.4, GLSL 4.40:

-  GL_MAX_VERTEX_ATTRIB_STRIDE                          DONE (all drivers)
-  GL_ARB_buffer_storage                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_clear_texture                                 DONE (i965, nv50, nvc0)
-  GL_ARB_enhanced_layouts                              in progress (Timothy)
-  - compile-time constant expressions                  DONE
-  - explicit byte offsets for blocks                   DONE
-  - forced alignment within blocks                     DONE
-  - specified vec4-slot component numbers              in progress
-  - specified transform/feedback layout                in progress
-  - input/output block locations                       DONE
-  GL_ARB_multi_bind                                    DONE (all drivers)
-  GL_ARB_query_buffer_object                           DONE (nvc0)
-  GL_ARB_texture_mirror_clamp_to_edge                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_stencil8                              DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_vertex_type_10f_11f_11f_rev                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_MAX_VERTEX_ATTRIB_STRIDE                           DONE (all drivers)
+  GL_ARB_buffer_storage                                 DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_clear_texture                                  DONE (i965, nv50, nvc0)
+  GL_ARB_enhanced_layouts                               in progress (Timothy)
+  - compile-time constant expressions                   DONE
+  - explicit byte offsets for blocks                    DONE
+  - forced alignment within blocks                      DONE
+  - specified vec4-slot component numbers               in progress
+  - specified transform/feedback layout                 in progress
+  - input/output block locations                        DONE
+  GL_ARB_multi_bind                                     DONE (all drivers)
+  GL_ARB_query_buffer_object                            DONE (nvc0)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_stencil8                               DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)

 GL 4.5, GLSL 4.50:

-  GL_ARB_ES3_1_compatibility                           not started
-  GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_cull_distance                                 in progress (Tobias)
-  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_direct_state_access                           DONE (all drivers)
-  GL_ARB_get_texture_sub_image                         DONE (all drivers)
-  GL_ARB_shader_texture_image_samples                  DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_texture_barrier                               DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_KHR_context_flush_control                         DONE (all - but needs GLX/EGL extension to be useful)
-  GL_KHR_robust_buffer_access_behavior                 not started
-  GL_KHR_robustness                                    90% done (the ARB variant)
-  GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)
+  GL_ARB_ES3_1_compatibility                            not started
+  GL_ARB_clip_control                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_conditional_render_inverted                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_cull_distance                                  in progress (Tobias)
+  GL_ARB_derivative_control                             DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_direct_state_access                            DONE (all drivers)
+  GL_ARB_get_texture_sub_image                          DONE (all drivers)
+  GL_ARB_shader_texture_image_samples                   DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_texture_barrier                                DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_KHR_context_flush_control                          DONE (all - but needs GLX/EGL extension to be useful)
+  GL_KHR_robust_buffer_access_behavior                  not started
+  GL_KHR_robustness                                     not started (90% done with the ARB variant)
+  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)

 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
-  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
-  GL_ARB_compute_shader                                DONE (i965)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_framebuffer_no_attachments                    DONE (i965)
-  GL_ARB_program_interface_query                       DONE (all drivers)
-  GL_ARB_shader_atomic_counters                        DONE (i965, nvc0)
-  GL_ARB_shader_image_load_store                       DONE (i965)
-  GL_ARB_shader_image_size                             DONE (i965)
-  GL_ARB_shader_storage_buffer_object                  DONE (i965, nvc0)
-  GL_ARB_shading_language_packing                      DONE (all drivers)
-  GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  Multisample textures (GL_ARB_texture_multisample)    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_vertex_attrib_binding                         DONE (all drivers)
-  GS5 Enhanced textureGather                           DONE (i965, nvc0, r600, radeonsi)
-  GS5 Packing/bitfield/conversion functions            DONE (i965, nvc0, r600, radeonsi)
-  GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)
+  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
+  GL_ARB_compute_shader                                 DONE (i965)
+  GL_ARB_draw_indirect                                  DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_program_interface_query                        DONE (all drivers)
+  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0)
+  GL_ARB_shader_image_load_store                        DONE (i965)
+  GL_ARB_shader_image_size                              DONE (i965)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
+  GL_ARB_shading_language_packing                       DONE (all drivers)
+  GL_ARB_separate_shader_objects                        DONE (all drivers)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_multisample (Multisample textures)     DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
+  GS5 Enhanced textureGather                            DONE (i965, nvc0, r600, radeonsi)
+  GS5 Packing/bitfield/conversion functions             DONE (i965, nvc0, r600, radeonsi)
+  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)

  Additional functionality not covered above:
-      glMemoryBarrierByRegion                          DONE
-      glGetTexLevelParameter[fi]v - needs updates      DONE
+      glMemoryBarrierByRegion                           DONE
+      glGetTexLevelParameter[fi]v - needs updates       DONE
      glGetBooleani_v - restrict to GLES enums
-      gl_HelperInvocation support                      DONE (i965, nvc0, r600)
+      gl_HelperInvocation support                       DONE (i965, nvc0, r600)

 GLES3.2, GLSL ES 3.2
-  GL_EXT_color_buffer_float                            DONE (all drivers)
-  GL_KHR_blend_equation_advanced                       not started
-  GL_KHR_debug                                         DONE (all drivers)
-  GL_KHR_robustness                                    90% done (the ARB variant)
-  GL_KHR_texture_compression_astc_ldr                  DONE (i965/gen9+)
-  GL_OES_copy_image                                    not started (based on GL_ARB_copy_image, which is done for some drivers)
-  GL_OES_draw_buffers_indexed                          not started
-  GL_OES_draw_elements_base_vertex                     DONE (all drivers)
-  GL_OES_geometry_shader                               started (Marta)
-  GL_OES_gpu_shader5                                   DONE (all drivers that support GL_ARB_gpu_shader5)
-  GL_OES_primitive_bounding box                        not started
-  GL_OES_sample_shading                                not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_sample_variables                              not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_shader_image_atomic                           DONE (all drivers that support GL_ARB_shader_image_load_store)
-  GL_OES_shader_io_blocks                              not started (based on parts of GLSL 1.50, which is done)
-  GL_OES_shader_multisample_interpolation              not started (based on parts of GL_ARB_gpu_shader5, which is done)
-  GL_OES_tessellation_shader                           not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
-  GL_OES_texture_border_clamp                          DONE (all drivers)
-  GL_OES_texture_buffer                                not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
-  GL_OES_texture_cube_map_array                        not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
-  GL_OES_texture_stencil8                              DONE (all drivers that support GL_ARB_texture_stencil8)
-  GL_OES_texture_storage_multisample_2d_array          DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_EXT_color_buffer_float                             DONE (all drivers)
+  GL_KHR_blend_equation_advanced                        not started
+  GL_KHR_debug                                          DONE (all drivers)
+  GL_KHR_robustness                                     not started (90% done with the ARB variant)
+  GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
+  GL_OES_copy_image                                     not started (based on GL_ARB_copy_image, which is done for some drivers)
+  GL_OES_draw_buffers_indexed                           not started
+  GL_OES_draw_elements_base_vertex                      DONE (all drivers)
+  GL_OES_geometry_shader                                in progress (Marta)
+  GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
+  GL_OES_primitive_bounding_box                         not started
+  GL_OES_sample_shading                                 not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_sample_variables                               not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_shader_image_atomic                            DONE (all drivers that support GL_ARB_shader_image_load_store)
+  GL_OES_shader_io_blocks                               not started (based on parts of GLSL 1.50, which is done)
+  GL_OES_shader_multisample_interpolation               not started (based on parts of GL_ARB_gpu_shader5, which is done)
+  GL_OES_tessellation_shader                            not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
+  GL_OES_texture_border_clamp                           DONE (all drivers)
+  GL_OES_texture_buffer                                 not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
+  GL_OES_texture_cube_map_array                         not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
+  GL_OES_texture_stencil8                               DONE (all drivers that support GL_ARB_texture_stencil8)
+  GL_OES_texture_storage_multisample_2d_array           DONE (all drivers that support GL_ARB_texture_multisample)

 More info about these features and the work involved can be found at
 http://dri.freedesktop.org/wiki/MissingFunctionality
@@ -163,6 +163,9 @@ See the <a href="xlibdriver.html">Xlib software driver page</a> for details.
   <li>blorp - emit messages about the blorp operations (blits &amp; clears)</li>
   <li>nodualobj - suppress generation of dual-object geometry shader code</li>
   <li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
+   <li>vec4 - force vec4 mode in vertex shader</li>
+   <li>spill_fs - force spilling of all registers in the scalar backend (useful to debug spilling code)</li>
+   <li>spill_vec4 - force spilling of all registers in the vec4 backend (useful to debug spilling code)</li>
 </ul>
 </ul>

@@ -73,8 +73,7 @@ The following are required for DRI-based hardware acceleration with Mesa:
 <ul>
 <li><a href="http://xorg.freedesktop.org/releases/individual/proto/">
 dri2proto</a> version 2.6 or later
-<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a>
-version 2.4.33 or later
+<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a> latest version
 <li>Xorg server version 1.5 or later
 <li>Linux 2.6.28 or later
 </ul>
@@ -44,8 +44,10 @@ Note: some of the new features are only available with certain drivers.
 </p>

 <ul>
-<li>GL_ARB_internalformat_query2 on i965</li>
+<li>GL_ARB_internalformat_query2 on all drivers</li>
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
+<li>GL_ARB_shader_image_load_store on radeonsi</li>
+<li>GL_ARB_shader_image_size on radeonsi</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>
@@ -129,6 +129,7 @@ LIBGLSL_FILES = \
 	glsl/opt_tree_grafting.cpp \
 	glsl/opt_vectorize.cpp \
 	glsl/program.h \
+	glsl/propagate_invariance.cpp \
 	glsl/s_expression.cpp \
 	glsl/s_expression.h

@@ -217,6 +217,7 @@ LIBGLSL_FILES = \
 	opt_tree_grafting.cpp \
 	opt_vectorize.cpp \
 	program.h \
+	propagate_invariance.cpp \
 	s_expression.cpp \
 	s_expression.h

@@ -2125,7 +2125,9 @@ process_array_size(exec_node *node,
   }

   ir_constant *const size = ir->constant_expression_value();
-   if (size == NULL || array_size->has_sequence_subexpression()) {
+   if (size == NULL ||
+       (state->is_version(120, 300) &&
+        array_size->has_sequence_subexpression())) {
      _mesa_glsl_error(& loc, state, "array size must be a "
                       "constant valued expression");
      return 0;
@@ -1887,6 +1887,7 @@ do_common_optimization(exec_list *ir, bool linked,
      OPT(do_dead_functions, ir);
      OPT(do_structure_splitting, ir);
   }
+   propagate_invariance(ir);
   OPT(do_if_simplification, ir);
   OPT(opt_flatten_nested_if_blocks, ir);
   OPT(opt_conditional_discard, ir);
@@ -719,6 +719,13 @@ public:
       */
      unsigned is_unmatched_generic_inout:1;

+      /**
+       * Is this varying used only by transform feedback?
+       *
+       * This is used by the linker to decide if it's safe to pack the varying.
+       */
+      unsigned is_xfb_only:1;
+
      /**
       * If non-zero, then this variable may be packed along with other variables
       * into a single varying slot, so this offset should be applied when
@@ -124,7 +124,8 @@ void lower_shared_reference(struct gl_shader *shader, unsigned *shared_size);
 void lower_ubo_reference(struct gl_shader *shader);
 void lower_packed_varyings(void *mem_ctx,
                           unsigned locations_used, ir_variable_mode mode,
-                           unsigned gs_input_vertices, gl_shader *shader);
+                           unsigned gs_input_vertices, gl_shader *shader,
+                           bool disable_varying_packing, bool xfb_enabled);
 bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index);
 bool lower_vector_derefs(gl_shader *shader);
 void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader);
@@ -138,6 +139,7 @@ bool lower_tess_level(gl_shader *shader);
 bool lower_vertex_id(gl_shader *shader);

 bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);
+void propagate_invariance(exec_list *instructions);

 ir_rvalue *
 compare_index_block(exec_list *instructions, ir_variable *index,
@@ -826,7 +826,7 @@ namespace {
 class varying_matches
 {
 public:
-   varying_matches(bool disable_varying_packing,
+   varying_matches(bool disable_varying_packing, bool xfb_enabled,
                   gl_shader_stage producer_stage,
                   gl_shader_stage consumer_stage);
   ~varying_matches();
@@ -836,13 +836,29 @@ public:
   void store_locations() const;

 private:
+   bool is_varying_packing_safe(const glsl_type *type,
+                                const ir_variable *var);
+
   /**
    * If true, this driver disables varying packing, so all varyings need to
    * be aligned on slot boundaries, and take up a number of slots equal to
    * their number of matrix columns times their array size.
+    *
+    * Packing may also be disabled because our current packing method is not
+    * safe in SSO or versions of OpenGL where interpolation qualifiers are not
+    * guaranteed to match across stages.
    */
   const bool disable_varying_packing;

+   /**
+    * If true, this driver has transform feedback enabled. The transform
+    * feedback code requires at least some packing be done even when varying
+    * packing is disabled, fortunately where transform feedback requires
+    * packing it's safe to override the disabled setting. See
+    * is_varying_packing_safe().
+    */
+   const bool xfb_enabled;
+
   /**
    * Enum representing the order in which varyings are packed within a
    * packing class.
@@ -862,6 +878,7 @@ private:
   static unsigned compute_packing_class(const ir_variable *var);
   static packing_order_enum compute_packing_order(const ir_variable *var);
   static int match_comparator(const void *x_generic, const void *y_generic);
+   static int xfb_comparator(const void *x_generic, const void *y_generic);

   /**
    * Structure recording the relationship between a single producer output
@@ -917,9 +934,11 @@ private:
 } /* anonymous namespace */

 varying_matches::varying_matches(bool disable_varying_packing,
+                                 bool xfb_enabled,
                                 gl_shader_stage producer_stage,
                                 gl_shader_stage consumer_stage)
   : disable_varying_packing(disable_varying_packing),
+     xfb_enabled(xfb_enabled),
     producer_stage(producer_stage),
     consumer_stage(consumer_stage)
 {
@@ -941,6 +960,24 @@ varying_matches::~varying_matches()
 }


+/**
+ * Packing is always safe on individual arrays, structures and matrices. It is
+ * also safe if the varying is only used for transform feedback.
+ */
+bool
+varying_matches::is_varying_packing_safe(const glsl_type *type,
+                                         const ir_variable *var)
+{
+   if (consumer_stage == MESA_SHADER_TESS_EVAL ||
+       consumer_stage == MESA_SHADER_TESS_CTRL ||
+       producer_stage == MESA_SHADER_TESS_CTRL)
+      return false;
+
+   return xfb_enabled && (type->is_array() || type->is_record() ||
+                          type->is_matrix() || var->data.is_xfb_only);
+}
+
+
 /**
 * Record the given producer/consumer variable pair in the list of variables
 * that should later be assigned locations.
@@ -1020,7 +1057,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
      = this->compute_packing_class(var);
   this->matches[this->num_matches].packing_order
      = this->compute_packing_order(var);
-   if (this->disable_varying_packing) {
+   if (this->disable_varying_packing && !is_varying_packing_safe(type, var)) {
      unsigned slots = type->count_attribute_slots(false);
      this->matches[this->num_matches].num_components = slots * 4;
   } else {
@@ -1046,37 +1083,28 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
                                  uint64_t reserved_slots,
                                  bool separate_shader)
 {
-   /* We disable varying sorting for separate shader programs for the
-    * following reasons:
-    *
-    * 1/ All programs must sort the code in the same order to guarantee the
-    *    interface matching. However varying_matches::record() will change the
-    *    interpolation qualifier of some stages.
-    *
-    * 2/ GLSL version 4.50 removes the matching constrain on the interpolation
-    *    qualifier.
-    *
-    * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec:
-    *
-    *    "The type and presence of interpolation qualifiers of variables with
-    *    the same name declared in all linked shaders for the same cross-stage
-    *    interface must match, otherwise the link command will fail.
-    *
-    *    When comparing an output from one stage to an input of a subsequent
-    *    stage, the input and output don't match if their interpolation
-    *    qualifiers (or lack thereof) are not the same."
-    *
-    *    "It is a link-time error if, within the same stage, the interpolation
-    *    qualifiers of variables of the same name do not match."
+   /* If packing has been disabled then we cannot safely sort the varyings by
+    * class as it may mean we are using a version of OpenGL where
+    * interpolation qualifiers are not guaranteed to be matching across
+    * shaders, sorting in this case could result in mismatching shader
+    * interfaces.
+    * When packing is disabled the sort orders varyings used by transform
+    * feedback first, but also depends on *undefined behaviour* of qsort to
+    * reverse the order of the varyings. See: xfb_comparator().
    */
-   if (!separate_shader) {
+   if (!this->disable_varying_packing) {
      /* Sort varying matches into an order that makes them easy to pack. */
      qsort(this->matches, this->num_matches, sizeof(*this->matches),
            &varying_matches::match_comparator);
+   } else {
+      /* Only sort varyings that are only used by transform feedback. */
+      qsort(this->matches, this->num_matches, sizeof(*this->matches),
+            &varying_matches::xfb_comparator);
   }

   unsigned generic_location = 0;
   unsigned generic_patch_location = MAX_VARYING*4;
+   bool previous_var_xfb_only = false;

   for (unsigned i = 0; i < this->num_matches; i++) {
      unsigned *location = &generic_location;
@@ -1100,16 +1128,30 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
      /* Advance to the next slot if this varying has a different packing
       * class than the previous one, and we're not already on a slot
       * boundary.
+       *
+       * Also advance to the next slot if packing is disabled. This makes sure
+       * we don't assign varyings the same locations which is possible
+       * because we still pack individual arrays, records and matrices even
+       * when packing is disabled. Note we don't advance to the next slot if
+       * we can pack varyings together that are only used for transform
+       * feedback.
       */
-      if (i > 0 &&
-          this->matches[i - 1].packing_class
-          != this->matches[i].packing_class) {
+      if ((this->disable_varying_packing &&
+           !(previous_var_xfb_only && var->data.is_xfb_only)) ||
+          (i > 0 && this->matches[i - 1].packing_class
+          != this->matches[i].packing_class )) {
         *location = ALIGN(*location, 4);
      }

+      previous_var_xfb_only = var->data.is_xfb_only;
+
      unsigned num_elements =  type->count_attribute_slots(is_vertex_input);
-      unsigned slot_end = this->disable_varying_packing ? 4 :
-         type->without_array()->vector_elements;
+      unsigned slot_end;
+      if (this->disable_varying_packing &&
+          !is_varying_packing_safe(type, var))
+         slot_end = 4;
+      else
+         slot_end = type->without_array()->vector_elements;
      slot_end += *location - 1;

      /* FIXME: We could be smarter in the below code and loop back over
@@ -1133,7 +1175,8 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
         /* Increase the slot to make sure there is enough room for next
          * array element.
          */
-         if (this->disable_varying_packing)
+         if (this->disable_varying_packing &&
+             !is_varying_packing_safe(type, var))
            slot_end += 4;
         else
            slot_end += type->without_array()->vector_elements;
@@ -1258,6 +1301,32 @@ varying_matches::match_comparator(const void *x_generic, const void *y_generic)
 }


+/**
+ * Comparison function passed to qsort() to sort varyings used only by
+ * transform feedback when packing of other varyings is disabled.
+ */
+int
+varying_matches::xfb_comparator(const void *x_generic, const void *y_generic)
+{
+   const match *x = (const match *) x_generic;
+
+   if (x->producer_var != NULL && x->producer_var->data.is_xfb_only)
+         return match_comparator(x_generic, y_generic);
+
+   /* FIXME: When the comparator returns 0 it means the elements being
+    * compared are equivalent. However the qsort documentation says:
+    *
+    *    "The order of equivalent elements is undefined."
+    *
+    * In practice the sort ends up reversing the order of the varyings which
+    * means locations are also assigned in this reversed order and happens to
+    * be what we want. This is also what's happening in
+    * varying_matches::match_comparator().
+    */
+   return 0;
+}
+
+
 /**
 * Is the given variable a varying variable to be counted against the
 * limit in ctx->Const.MaxVarying?
@@ -1573,26 +1642,60 @@ assign_varying_locations(struct gl_context *ctx,
                         unsigned num_tfeedback_decls,
                         tfeedback_decl *tfeedback_decls)
 {
-   if (ctx->Const.DisableVaryingPacking) {
-      /* Transform feedback code assumes varyings are packed, so if the driver
-       * has disabled varying packing, make sure it does not support transform
-       * feedback.
-       */
-      assert(!ctx->Extensions.EXT_transform_feedback);
-   }
-
   /* Tessellation shaders treat inputs and outputs as shared memory and can
    * access inputs and outputs of other invocations.
    * Therefore, they can't be lowered to temps easily (and definitely not
    * efficiently).
    */
-   bool disable_varying_packing =
-      ctx->Const.DisableVaryingPacking ||
+   bool unpackable_tess =
      (consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) ||
      (consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) ||
      (producer && producer->Stage == MESA_SHADER_TESS_CTRL);

-   varying_matches matches(disable_varying_packing,
+   /* Transform feedback code assumes varying arrays are packed, so if the
+    * driver has disabled varying packing, make sure to at least enable
+    * packing required by transform feedback.
+    */
+   bool xfb_enabled =
+      ctx->Extensions.EXT_transform_feedback && !unpackable_tess;
+
+   /* Disable varying packing for GL 4.4+ as there is no guarantee
+    * that interpolation qualifiers will match between shaders in these
+    * versions. We also disable packing on outward facing interfaces for
+    * SSO because in ES we need to retain the unpacked varying information
+    * for draw time validation. For desktop GL we could allow packing for
+    * versions < 4.4 but it's just safer not to do packing.
+    *
+    * Packing is still enabled on individual arrays, structs, and matrices as
+    * these are required by the transform feedback code and it is still safe
+    * to do so. We also enable packing when a varying is only used for
+    * transform feedback and it's not an SSO.
+    *
+    * Varying packing currently only packs together varyings with matching
+    * interpolation qualifiers as the backends assume all packed components
+    * are to be processed in the same way. Therefore we cannot do packing in
+    * these versions of GL without the risk of mismatching interfaces.
+    *
+    * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.30 spec:
+    *
+    *    "The type and presence of interpolation qualifiers of variables with
+    *    the same name declared in all linked shaders for the same cross-stage
+    *    interface must match, otherwise the link command will fail.
+    *
+    *    When comparing an output from one stage to an input of a subsequent
+    *    stage, the input and output don't match if their interpolation
+    *    qualifiers (or lack thereof) are not the same."
+    *
+    * This text was also in at least revision 7 of the 4.40 spec but is no
+    * longer in revision 9 and not in the 4.50 spec.
+    */
+   bool disable_varying_packing =
+      ctx->Const.DisableVaryingPacking || unpackable_tess;
+   if ((ctx->API == API_OPENGL_CORE && ctx->Version >= 44) ||
+       (prog->SeparateShader && (producer == NULL || consumer == NULL)))
+      disable_varying_packing = true;
+
+   varying_matches matches(disable_varying_packing, xfb_enabled,
                           producer ? producer->Stage : (gl_shader_stage)-1,
                           consumer ? consumer->Stage : (gl_shader_stage)-1);
   hash_table *tfeedback_candidates
@@ -1711,8 +1814,10 @@ assign_varying_locations(struct gl_context *ctx,
         return false;
      }

-      if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout)
+      if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout) {
+         matched_candidate->toplevel_var->data.is_xfb_only = 1;
         matches.record(matched_candidate->toplevel_var, NULL);
+      }
   }

   const uint64_t reserved_slots =
@@ -1784,15 +1889,16 @@ assign_varying_locations(struct gl_context *ctx,
                                              ir_var_shader_in);
   }

-   if (!disable_varying_packing) {
-      if (producer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                               0, producer);
-      }
-      if (consumer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               consumer_vertices, consumer);
-      }
+   if (producer) {
+      lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
+                            0, producer, disable_varying_packing,
+                            xfb_enabled);
+   }
+
+   if (consumer) {
+      lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
+                            consumer_vertices, consumer,
+                            disable_varying_packing, xfb_enabled);
   }

   return true;
@@ -168,7 +168,9 @@ public:
                                 ir_variable_mode mode,
                                 unsigned gs_input_vertices,
                                 exec_list *out_instructions,
-                                 exec_list *out_variables);
+                                 exec_list *out_variables,
+                                 bool disable_varying_packing,
+                                 bool xfb_enabled);

   void run(struct gl_shader *shader);

@@ -231,6 +233,9 @@ private:
    * Exec list into which the visitor should insert any new variables.
    */
   exec_list *out_variables;
+
+   bool disable_varying_packing;
+   bool xfb_enabled;
 };

 } /* anonymous namespace */
@@ -238,7 +243,8 @@ private:
 lower_packed_varyings_visitor::lower_packed_varyings_visitor(
      void *mem_ctx, unsigned locations_used, ir_variable_mode mode,
      unsigned gs_input_vertices, exec_list *out_instructions,
-      exec_list *out_variables)
+      exec_list *out_variables, bool disable_varying_packing,
+      bool xfb_enabled)
   : mem_ctx(mem_ctx),
     locations_used(locations_used),
     packed_varyings((ir_variable **)
@@ -247,7 +253,9 @@ lower_packed_varyings_visitor::lower_packed_varyings_visitor(
     mode(mode),
     gs_input_vertices(gs_input_vertices),
     out_instructions(out_instructions),
-     out_variables(out_variables)
+     out_variables(out_variables),
+     disable_varying_packing(disable_varying_packing),
+     xfb_enabled(xfb_enabled)
 {
 }

@@ -656,7 +664,18 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
   if (var->data.explicit_location)
      return false;

-   const glsl_type *type = var->type->without_array();
+   /* Override disable_varying_packing if the var is only used by transform
+    * feedback. Also override it if transform feedback is enabled and the
+    * variable is an array, struct or matrix as the elements of these types
+    * will always have the same interpolation and therefore are safe to pack.
+    */
+   const glsl_type *type = var->type;
+   if (disable_varying_packing && !var->data.is_xfb_only &&
+       !((type->is_array() || type->is_record() || type->is_matrix()) &&
+         xfb_enabled))
+      return false;
+
+   type = type->without_array();
   if (type->vector_elements == 4 && !type->is_double())
      return false;
   return true;
@@ -709,7 +728,8 @@ lower_packed_varyings_gs_splicer::visit_leave(ir_emit_vertex *ev)
 void
 lower_packed_varyings(void *mem_ctx, unsigned locations_used,
                      ir_variable_mode mode, unsigned gs_input_vertices,
-                      gl_shader *shader)
+                      gl_shader *shader, bool disable_varying_packing,
+                      bool xfb_enabled)
 {
   exec_list *instructions = shader->ir;
   ir_function *main_func = shader->symbols->get_function("main");
@@ -720,7 +740,9 @@ lower_packed_varyings(void *mem_ctx, unsigned locations_used,
   lower_packed_varyings_visitor visitor(mem_ctx, locations_used, mode,
                                         gs_input_vertices,
                                         &new_instructions,
-                                         &new_variables);
+                                         &new_variables,
+                                         disable_varying_packing,
+                                         xfb_enabled);
   visitor.run(shader);
   if (mode == ir_var_shader_out) {
      if (shader->Stage == MESA_SHADER_GEOMETRY) {
@@ -58,6 +58,8 @@ public:
   {
   }

+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
   ir_rvalue *handle_expression(ir_expression *ir);
   void handle_rvalue(ir_rvalue **rvalue);
   bool reassociate_constant(ir_expression *ir1,
@@ -80,6 +82,23 @@ public:

 } /* unnamed namespace */

+ir_visitor_status
+ir_algebraic_visitor::visit_enter(ir_assignment *ir)
+{
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      /* If we're assigning to an invariant or precise variable, just bail.
+       * Most of the algebraic optimizations aren't precision-safe.
+       *
+       * FINISHME: Find out which optimizations are precision-safe and enable
+       * them only for invariant or precise trees.
+       */
+      return visit_continue_with_parent;
+   } else {
+      return visit_continue;
+   }
+}
+
 static inline bool
 is_vec_zero(ir_constant *ir)
 {
@@ -131,6 +131,8 @@ public:
      progress = false;
   }

+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
   void handle_rvalue(ir_rvalue **rvalue);

   bool progress;
@@ -146,6 +148,20 @@ struct is_reduction_data {

 } /* anonymous namespace */

+ir_visitor_status
+ir_rebalance_visitor::visit_enter(ir_assignment *ir)
+{
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      /* If we're assigning to an invariant or precise variable, just bail.
+       * Tree rebalancing (reassociation) isn't precision-safe.
+       */
+      return visit_continue_with_parent;
+   } else {
+      return visit_continue;
+   }
+}
+
 static bool
 is_reduction_operation(ir_expression_operation operation)
 {
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file propagate_invariance.cpp
+ * Propagate the "invariant" and "precise" qualifiers to variables used to
+ * compute invariant or precise values.
+ *
+ * The GLSL spec (depending on what version you read) says, among the
+ * conditions for getting bit-for-bit the same values on an invariant output:
+ *
+ *    "All operations in the consuming expressions and any intermediate
+ *    expressions must be the same, with the same order of operands and same
+ *    associativity, to give the same order of evaluation."
+ *
+ * This effectively means that if a variable is used to compute an invariant
+ * value then that variable becomes invariant.  The same should apply to the
+ * "precise" qualifier.
+ */
+
+#include "ir.h"
+#include "ir_visitor.h"
+#include "ir_rvalue_visitor.h"
+#include "ir_optimization.h"
+#include "compiler/glsl_types.h"
+
+namespace {
+
+class ir_invariance_propagation_visitor : public ir_hierarchical_visitor {
+public:
+   ir_invariance_propagation_visitor()
+   {
+      this->progress = false;
+      this->dst_var = NULL;
+   }
+
+   virtual ~ir_invariance_propagation_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+   virtual ir_visitor_status visit_leave(ir_assignment *ir);
+   virtual ir_visitor_status visit(ir_dereference_variable *ir);
+
+   ir_variable *dst_var;
+   bool progress;
+};
+
+} /* unnamed namespace */
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit_enter(ir_assignment *ir)
+{
+   assert(this->dst_var == NULL);
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      this->dst_var = var;
+      return visit_continue;
+   } else {
+      return visit_continue_with_parent;
+   }
+}
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit_leave(ir_assignment *ir)
+{
+   this->dst_var = NULL;
+
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit(ir_dereference_variable *ir)
+{
+   if (this->dst_var == NULL)
+      return visit_continue;
+
+   if (this->dst_var->data.invariant) {
+      if (!ir->var->data.invariant)
+         this->progress = true;
+
+      ir->var->data.invariant = true;
+   }
+
+   if (this->dst_var->data.precise) {
+      if (!ir->var->data.precise)
+         this->progress = true;
+
+      ir->var->data.precise = true;
+   }
+
+   return visit_continue;
+}
+
+void
+propagate_invariance(exec_list *instructions)
+{
+   ir_invariance_propagation_visitor visitor;
+
+   do {
+      visitor.progress = false;
+      visit_list_elements(&visitor, instructions);
+   } while (visitor.progress);
+}
@@ -731,7 +731,7 @@ nir_visitor::visit(ir_call *ir)
         ir_dereference *param =
            (ir_dereference *) ir->actual_parameters.get_head();
         instr->variables[0] = evaluate_deref(&instr->instr, param);
-         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
         nir_builder_instr_insert(&b, &instr->instr);
         break;
      }
@@ -765,7 +765,7 @@ nir_visitor::visit(ir_call *ir)
            const nir_intrinsic_info *info =
                    &nir_intrinsic_infos[instr->intrinsic];
            nir_ssa_dest_init(&instr->instr, &instr->dest,
-                              info->dest_components, NULL);
+                              info->dest_components, 32, NULL);
         }

         if (op == nir_intrinsic_image_size ||
@@ -826,7 +826,7 @@ nir_visitor::visit(ir_call *ir)
         nir_builder_instr_insert(&b, &instr->instr);
         break;
      case nir_intrinsic_shader_clock:
-         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
         nir_builder_instr_insert(&b, &instr->instr);
         break;
      case nir_intrinsic_store_ssbo: {
@@ -867,7 +867,7 @@ nir_visitor::visit(ir_call *ir)

         /* Setup destination register */
         nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, NULL);
+                           type->vector_elements, 32, NULL);

         /* Insert the created nir instruction now since in the case of boolean
          * result we will need to emit another instruction after it
@@ -890,7 +890,7 @@ nir_visitor::visit(ir_call *ir)
               load_ssbo_compare->src[1].swizzle[i] = 0;
            nir_ssa_dest_init(&load_ssbo_compare->instr,
                              &load_ssbo_compare->dest.dest,
-                              type->vector_elements, NULL);
+                              type->vector_elements, 32, NULL);
            load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1;
            nir_builder_instr_insert(&b, &load_ssbo_compare->instr);
            dest = &load_ssbo_compare->dest.dest;
@@ -936,7 +936,7 @@ nir_visitor::visit(ir_call *ir)
         /* Atomic result */
         assert(ir->return_deref);
         nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           ir->return_deref->type->vector_elements, NULL);
+                           ir->return_deref->type->vector_elements, 32, NULL);
         nir_builder_instr_insert(&b, &instr->instr);
         break;
      }
@@ -951,8 +951,9 @@ nir_visitor::visit(ir_call *ir)
         instr->num_components = type->vector_elements;

         /* Setup destination register */
+         unsigned bit_size = glsl_get_bit_size(type->base_type);
         nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, NULL);
+                           type->vector_elements, bit_size, NULL);

         nir_builder_instr_insert(&b, &instr->instr);
         break;
@@ -1013,8 +1014,10 @@ nir_visitor::visit(ir_call *ir)

         /* Atomic result */
         assert(ir->return_deref);
+         unsigned bit_size = glsl_get_bit_size(ir->return_deref->type->base_type);
         nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           ir->return_deref->type->vector_elements, NULL);
+                           ir->return_deref->type->vector_elements,
+                           bit_size, NULL);
         nir_builder_instr_insert(&b, &instr->instr);
         break;
      }
@@ -1061,6 +1064,9 @@ nir_visitor::visit(ir_assignment *ir)
 {
   unsigned num_components = ir->lhs->type->vector_elements;

+   b.exact = ir->lhs->variable_referenced()->data.invariant ||
+             ir->lhs->variable_referenced()->data.precise;
+
   if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) &&
       (ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) {
      /* We're doing a plain-as-can-be copy, so emit a copy_var */
@@ -1163,7 +1169,7 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components)
   nir_dest *dest = get_instr_dest(instr);

   if (dest)
-      nir_ssa_dest_init(instr, dest, num_components, NULL);
+      nir_ssa_dest_init(instr, dest, num_components, 32, NULL);

   nir_builder_instr_insert(&b, instr);

@@ -1203,6 +1209,7 @@ nir_visitor::visit(ir_expression *ir)
      nir_intrinsic_instr *load =
         nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo);
      load->num_components = ir->type->vector_elements;
+      load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type);
      load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
      load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
      add_instr(&load->instr, ir->type->vector_elements);
@@ -70,6 +70,7 @@ reg_create(void *mem_ctx, struct exec_list *list)
   list_inithead(&reg->if_uses);

   reg->num_components = 0;
+   reg->bit_size = 32;
   reg->num_array_elems = 0;
   reg->is_packed = false;
   reg->name = NULL;
@@ -473,7 +474,7 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components)
   nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr);
   instr_init(&instr->instr, nir_instr_type_load_const);

-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);

   return instr;
 }
@@ -562,7 +563,7 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components)
   nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr);
   instr_init(&instr->instr, nir_instr_type_ssa_undef);

-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);

   return instr;
 }
@@ -699,10 +700,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
      case GLSL_TYPE_FLOAT:
      case GLSL_TYPE_INT:
      case GLSL_TYPE_UINT:
-         load->value.u[i] = constant->value.u[matrix_offset + i];
+         load->value.u32[i] = constant->value.u[matrix_offset + i];
         break;
      case GLSL_TYPE_BOOL:
-         load->value.u[i] = constant->value.b[matrix_offset + i] ?
+         load->value.u32[i] = constant->value.b[matrix_offset + i] ?
                             NIR_TRUE : NIR_FALSE;
         break;
      default:
@@ -731,18 +732,11 @@ reduce_cursor(nir_cursor cursor)
 {
   switch (cursor.option) {
   case nir_cursor_before_block:
+      assert(nir_cf_node_prev(&cursor.block->cf_node) == NULL ||
+             nir_cf_node_prev(&cursor.block->cf_node)->type != nir_cf_node_block);
      if (exec_list_is_empty(&cursor.block->instr_list)) {
         /* Empty block.  After is as good as before. */
         cursor.option = nir_cursor_after_block;
-      } else {
-         /* Try to switch to after the previous block if there is one.
-          * (This isn't likely, but it can happen.)
-          */
-         nir_cf_node *prev_node = nir_cf_node_prev(&cursor.block->cf_node);
-         if (prev_node && prev_node->type == nir_cf_node_block) {
-            cursor.block = nir_cf_node_as_block(prev_node);
-            cursor.option = nir_cursor_after_block;
-         }
      }
      return cursor;

@@ -1379,15 +1373,18 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest)
      src_add_all_uses(dest->reg.indirect, instr, NULL);
 }

+/* note: does *not* take ownership of 'name' */
 void
 nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
-                 unsigned num_components, const char *name)
+                 unsigned num_components,
+                 unsigned bit_size, const char *name)
 {
-   def->name = name;
+   def->name = ralloc_strdup(instr, name);
   def->parent_instr = instr;
   list_inithead(&def->uses);
   list_inithead(&def->if_uses);
   def->num_components = num_components;
+   def->bit_size = bit_size;

   if (instr->block) {
      nir_function_impl *impl =
@@ -1399,12 +1396,14 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
   }
 }

+/* note: does *not* take ownership of 'name' */
 void
 nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
-                 unsigned num_components, const char *name)
+                 unsigned num_components, unsigned bit_size,
+                 const char *name)
 {
   dest->is_ssa = true;
-   nir_ssa_def_init(instr, &dest->ssa, num_components, name);
+   nir_ssa_def_init(instr, &dest->ssa, num_components, bit_size, name);
 }

 void
@@ -101,6 +101,7 @@ union nir_constant_data {
   int i[16];
   float f[16];
   bool b[16];
+   double d[16];
 };

 typedef struct nir_constant {
@@ -381,6 +382,9 @@ typedef struct nir_register {
   unsigned num_components; /** < number of vector components */
   unsigned num_array_elems; /** < size of array (0 for no array) */

+   /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
+   uint8_t bit_size;
+
   /** generic register index. */
   unsigned index;

@@ -488,6 +492,9 @@ typedef struct nir_ssa_def {
   struct list_head if_uses;

   uint8_t num_components;
+
+   /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
+   uint8_t bit_size;
 } nir_ssa_def;

 struct nir_src;
@@ -594,6 +601,18 @@ nir_dest_for_reg(nir_register *reg)
   return dest;
 }

+static inline unsigned
+nir_src_bit_size(nir_src src)
+{
+   return src.is_ssa ? src.ssa->bit_size : src.reg.reg->bit_size;
+}
+
+static inline unsigned
+nir_dest_bit_size(nir_dest dest)
+{
+   return dest.is_ssa ? dest.ssa.bit_size : dest.reg.reg->bit_size;
+}
+
 void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if);
 void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr);

@@ -649,9 +668,36 @@ typedef enum {
   nir_type_float,
   nir_type_int,
   nir_type_uint,
-   nir_type_bool
+   nir_type_bool,
+   nir_type_bool32 =    32 | nir_type_bool,
+   nir_type_int8 =      8  | nir_type_int,
+   nir_type_int16 =     16 | nir_type_int,
+   nir_type_int32 =     32 | nir_type_int,
+   nir_type_int64 =     64 | nir_type_int,
+   nir_type_uint8 =     8  | nir_type_uint,
+   nir_type_uint16 =    16 | nir_type_uint,
+   nir_type_uint32 =    32 | nir_type_uint,
+   nir_type_uint64 =    64 | nir_type_uint,
+   nir_type_float16 =   16 | nir_type_float,
+   nir_type_float32 =   32 | nir_type_float,
+   nir_type_float64 =   64 | nir_type_float,
 } nir_alu_type;

+#define NIR_ALU_TYPE_SIZE_MASK 0xfffffff8
+#define NIR_ALU_TYPE_BASE_TYPE_MASK 0x00000007
+
+static inline unsigned
+nir_alu_type_get_type_size(nir_alu_type type)
+{
+   return type & NIR_ALU_TYPE_SIZE_MASK;
+}
+
+static inline unsigned
+nir_alu_type_get_base_type(nir_alu_type type)
+{
+   return type & NIR_ALU_TYPE_BASE_TYPE_MASK;
+}
+
 typedef enum {
   NIR_OP_IS_COMMUTATIVE = (1 << 0),
   NIR_OP_IS_ASSOCIATIVE = (1 << 1),
@@ -708,6 +754,17 @@ extern const nir_op_info nir_op_infos[nir_num_opcodes];
 typedef struct nir_alu_instr {
   nir_instr instr;
   nir_op op;
+
+   /** Indicates that this ALU instruction generates an exact value
+    *
+    * This is kind of a mixture of GLSL "precise" and "invariant" and not
+    * really equivalent to either.  This indicates that the value generated by
+    * this operation is high-precision and any code transformations that touch
+    * it must ensure that the resulting value is bit-for-bit identical to the
+    * original.
+    */
+   bool exact;
+
   nir_alu_dest dest;
   nir_alu_src src[];
 } nir_alu_instr;
@@ -1218,9 +1275,12 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type)

 typedef struct {
   union {
-      float f[4];
-      int32_t i[4];
-      uint32_t u[4];
+      float f32[4];
+      double f64[4];
+      int32_t i32[4];
+      uint32_t u32[4];
+      int64_t i64[4];
+      uint64_t u64[4];
   };
 } nir_const_value;

@@ -2061,9 +2121,11 @@ void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest,
                            nir_dest new_dest);

 void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
-                       unsigned num_components, const char *name);
+                       unsigned num_components, unsigned bit_size,
+                       const char *name);
 void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
-                      unsigned num_components, const char *name);
+                      unsigned num_components, unsigned bit_size,
+                      const char *name);
 void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
 void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
                                    nir_instr *after_me);
@@ -2094,9 +2156,10 @@ void nir_index_blocks(nir_function_impl *impl);
 void nir_print_shader(nir_shader *shader, FILE *fp);
 void nir_print_instr(const nir_instr *instr, FILE *fp);

-nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
+nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
 nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
 nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
+nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader);

 #ifdef DEBUG
 void nir_validate_shader(nir_shader *shader);
@@ -63,12 +63,13 @@ class Value(object):
 static const ${val.c_type} ${val.name} = {
   { ${val.type_enum} },
 % if isinstance(val, Constant):
-   { ${hex(val)} /* ${val.value} */ },
+   ${val.type()}, { ${hex(val)} /* ${val.value} */ },
 % elif isinstance(val, Variable):
   ${val.index}, /* ${val.var_name} */
   ${'true' if val.is_constant else 'false'},
-   nir_type_${ val.required_type or 'invalid' },
+   ${val.type() or 'nir_type_invalid' },
 % elif isinstance(val, Expression):
+   ${'true' if val.inexact else 'false'},
   nir_op_${val.opcode},
   { ${', '.join(src.c_ptr for src in val.sources)} },
 % endif
@@ -107,10 +108,18 @@ class Constant(Value):
      if isinstance(self.value, (int, long)):
         return hex(self.value)
      elif isinstance(self.value, float):
-         return hex(struct.unpack('I', struct.pack('f', self.value))[0])
+         return hex(struct.unpack('Q', struct.pack('d', self.value))[0])
      else:
         assert False

+   def type(self):
+      if isinstance(self.value, (bool)):
+         return "nir_type_bool32"
+      elif isinstance(self.value, (int, long)):
+         return "nir_type_int"
+      elif isinstance(self.value, float):
+         return "nir_type_float"
+
 _var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)(?:@(?P<type>\w+))?")

 class Variable(Value):
@@ -129,12 +138,26 @@ class Variable(Value):

      self.index = varset[self.var_name]

+   def type(self):
+      if self.required_type == 'bool':
+         return "nir_type_bool32"
+      elif self.required_type in ('int', 'unsigned'):
+         return "nir_type_int"
+      elif self.required_type == 'float':
+         return "nir_type_float"
+
+_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)")
+
 class Expression(Value):
   def __init__(self, expr, name_base, varset):
      Value.__init__(self, name_base, "expression")
      assert isinstance(expr, tuple)

-      self.opcode = expr[0]
+      m = _opcode_re.match(expr[0])
+      assert m and m.group('opcode') is not None
+
+      self.opcode = m.group('opcode')
+      self.inexact = m.group('inexact') is not None
      self.sources = [ Value.create(src, "{0}_{1}".format(name_base, i), varset)
                       for (i, src) in enumerate(expr[1:]) ]

@@ -31,6 +31,9 @@ struct exec_list;
 typedef struct nir_builder {
   nir_cursor cursor;

+   /* Whether new ALU instructions will be marked "exact" */
+   bool exact;
+
   nir_shader *shader;
   nir_function_impl *impl;
 } nir_builder;
@@ -39,6 +42,7 @@ static inline void
 nir_builder_init(nir_builder *build, nir_function_impl *impl)
 {
   memset(build, 0, sizeof(*build));
+   build->exact = false;
   build->impl = impl;
   build->shader = impl->function->shader;
 }
@@ -50,6 +54,7 @@ nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx,
 {
   build->shader = nir_shader_create(mem_ctx, stage, options);
   nir_function *func = nir_function_create(build->shader, "main");
+   build->exact = false;
   build->impl = nir_function_impl_create(func);
   build->cursor = nir_after_cf_list(&build->impl->body);
 }
@@ -104,7 +109,7 @@ nir_imm_float(nir_builder *build, float x)
   nir_const_value v;

   memset(&v, 0, sizeof(v));
-   v.f[0] = x;
+   v.f32[0] = x;

   return nir_build_imm(build, 1, v);
 }
@@ -115,10 +120,10 @@ nir_imm_vec4(nir_builder *build, float x, float y, float z, float w)
   nir_const_value v;

   memset(&v, 0, sizeof(v));
-   v.f[0] = x;
-   v.f[1] = y;
-   v.f[2] = z;
-   v.f[3] = w;
+   v.f32[0] = x;
+   v.f32[1] = y;
+   v.f32[2] = z;
+   v.f32[3] = w;

   return nir_build_imm(build, 4, v);
 }
@@ -129,7 +134,7 @@ nir_imm_int(nir_builder *build, int x)
   nir_const_value v;

   memset(&v, 0, sizeof(v));
-   v.i[0] = x;
+   v.i32[0] = x;

   return nir_build_imm(build, 1, v);
 }
@@ -140,10 +145,10 @@ nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w)
   nir_const_value v;

   memset(&v, 0, sizeof(v));
-   v.i[0] = x;
-   v.i[1] = y;
-   v.i[2] = z;
-   v.i[3] = w;
+   v.i32[0] = x;
+   v.i32[1] = y;
+   v.i32[2] = z;
+   v.i32[3] = w;

   return nir_build_imm(build, 4, v);
 }
@@ -157,6 +162,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
   if (!instr)
      return NULL;

+   instr->exact = build->exact;
+
   instr->src[0].src = nir_src_for_ssa(src0);
   if (src1)
      instr->src[1].src = nir_src_for_ssa(src1);
@@ -178,6 +185,25 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
   }
   assert(num_components != 0);

+   /* Figure out the bitwidth based on the source bitwidth if the instruction
+    * is variable-width.
+    */
+   unsigned bit_size = nir_alu_type_get_type_size(op_info->output_type);
+   if (bit_size == 0) {
+      for (unsigned i = 0; i < op_info->num_inputs; i++) {
+         unsigned src_bit_size = instr->src[i].src.ssa->bit_size;
+         if (nir_alu_type_get_type_size(op_info->input_types[i]) == 0) {
+            if (bit_size)
+               assert(src_bit_size == bit_size);
+            else
+               bit_size = src_bit_size;
+         } else {
+            assert(src_bit_size ==
+               nir_alu_type_get_type_size(op_info->input_types[i]));
+         }
+      }
+   }
+
   /* Make sure we don't swizzle from outside of our source vector (like if a
    * scalar value was passed into a multiply with a vector).
    */
@@ -187,7 +213,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
      }
   }

-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                     bit_size, NULL);
   instr->dest.write_mask = (1 << num_components) - 1;

   nir_builder_instr_insert(build, &instr->instr);
@@ -252,7 +279,9 @@ static inline nir_ssa_def *
 nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
 {
   nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov);
-   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
+                     nir_src_bit_size(src.src), NULL);
+   mov->exact = build->exact;
   mov->dest.write_mask = (1 << num_components) - 1;
   mov->src[0] = src;
   nir_builder_instr_insert(build, &mov->instr);
@@ -264,7 +293,9 @@ static inline nir_ssa_def *
 nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
 {
   nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov);
-   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
+                     nir_src_bit_size(src.src), NULL);
+   mov->exact = build->exact;
   mov->dest.write_mask = (1 << num_components) - 1;
   mov->src[0] = src;
   nir_builder_instr_insert(build, &mov->instr);
@@ -360,7 +391,8 @@ nir_load_var(nir_builder *build, nir_variable *var)
      nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var);
   load->num_components = num_components;
   load->variables[0] = nir_deref_var_create(load, var);
-   nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, num_components,
+                     glsl_get_bit_size(glsl_get_base_type(var->type)), NULL);
   nir_builder_instr_insert(build, &load->instr);
   return &load->dest.ssa;
 }
@@ -426,7 +458,7 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
   load->num_components = nir_intrinsic_infos[op].dest_components;
   load->const_index[0] = index;
   nir_ssa_dest_init(&load->instr, &load->dest,
-                     nir_intrinsic_infos[op].dest_components, NULL);
+                     nir_intrinsic_infos[op].dest_components, 32, NULL);
   nir_builder_instr_insert(build, &load->instr);
   return &load->dest.ssa;
 }
@@ -127,11 +127,10 @@ nir_constant_clone(const nir_constant *c, nir_variable *nvar)
 /* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid
 * having to deal with locals and globals separately:
 */
-static nir_variable *
-clone_variable(clone_state *state, const nir_variable *var)
+nir_variable *
+nir_variable_clone(const nir_variable *var, nir_shader *shader)
 {
-   nir_variable *nvar = rzalloc(state->ns, nir_variable);
-   add_remap(state, nvar, var);
+   nir_variable *nvar = rzalloc(shader, nir_variable);

   nvar->type = var->type;
   nvar->name = ralloc_strdup(nvar, var->name);
@@ -149,6 +148,15 @@ clone_variable(clone_state *state, const nir_variable *var)
   return nvar;
 }

+static nir_variable *
+clone_variable(clone_state *state, const nir_variable *var)
+{
+   nir_variable *nvar = nir_variable_clone(var, state->ns);
+   add_remap(state, nvar, var);
+
+   return nvar;
+}
+
 /* clone list of nir_variable: */
 static void
 clone_var_list(clone_state *state, struct exec_list *dst,
@@ -220,7 +228,8 @@ __clone_dst(clone_state *state, nir_instr *ninstr,
 {
   ndst->is_ssa = dst->is_ssa;
   if (dst->is_ssa) {
-      nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name);
+      nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components,
+                        dst->ssa.bit_size, dst->ssa.name);
      add_remap(state, &ndst->ssa, &dst->ssa);
   } else {
      ndst->reg.reg = remap_reg(state, dst->reg.reg);
@@ -303,6 +312,7 @@ static nir_alu_instr *
 clone_alu(clone_state *state, const nir_alu_instr *alu)
 {
   nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op);
+   nalu->exact = alu->exact;

   __clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest);
   nalu->dest.saturate = alu->dest.saturate;
@@ -28,4 +28,4 @@
 #include "nir.h"

 nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
-                                      nir_const_value *src);
+                                      unsigned bit_size, nir_const_value *src);
@@ -1,4 +1,43 @@
 #! /usr/bin/python2
+
+def type_has_size(type_):
+    return type_[-1:].isdigit()
+
+def type_sizes(type_):
+    if type_.endswith("8"):
+        return [8]
+    elif type_.endswith("16"):
+        return [16]
+    elif type_.endswith("32"):
+        return [32]
+    elif type_.endswith("64"):
+        return [64]
+    else:
+        return [32, 64]
+
+def type_add_size(type_, size):
+    if type_has_size(type_):
+        return type_
+    return type_ + str(size)
+
+def get_const_field(type_):
+    if type_ == "int32":
+        return "i32"
+    if type_ == "uint32":
+        return "u32"
+    if type_ == "int64":
+        return "i64"
+    if type_ == "uint64":
+        return "u64"
+    if type_ == "bool32":
+        return "u32"
+    if type_ == "float32":
+        return "f32"
+    if type_ == "float64":
+        return "f64"
+    raise Exception(str(type_))
+    assert(0)
+
 template = """\
 /*
 * Copyright (C) 2014 Intel Corporation
@@ -205,110 +244,140 @@ unpack_half_1x16(uint16_t u)
 }

 /* Some typed vector structures to make things like src0.y work */
-% for type in ["float", "int", "uint", "bool"]:
-struct ${type}_vec {
-   ${type} x;
-   ${type} y;
-   ${type} z;
-   ${type} w;
+typedef float float32_t;
+typedef double float64_t;
+typedef bool bool32_t;
+% for type in ["float", "int", "uint"]:
+% for width in [32, 64]:
+struct ${type}${width}_vec {
+   ${type}${width}_t x;
+   ${type}${width}_t y;
+   ${type}${width}_t z;
+   ${type}${width}_t w;
 };
 % endfor
+% endfor
+
+struct bool32_vec {
+    bool x;
+    bool y;
+    bool z;
+    bool w;
+};

 % for name, op in sorted(opcodes.iteritems()):
 static nir_const_value
-evaluate_${name}(unsigned num_components, nir_const_value *_src)
+evaluate_${name}(unsigned num_components, unsigned bit_size,
+                 nir_const_value *_src)
 {
   nir_const_value _dst_val = { { {0, 0, 0, 0} } };

-   ## For each non-per-component input, create a variable srcN that
-   ## contains x, y, z, and w elements which are filled in with the
-   ## appropriately-typed values.
-   % for j in range(op.num_inputs):
-      % if op.input_sizes[j] == 0:
-         <% continue %>
-      % elif "src" + str(j) not in op.const_expr:
-         ## Avoid unused variable warnings
-         <% continue %>
-      %endif
+   switch (bit_size) {
+   % for bit_size in [32, 64]:
+   case ${bit_size}: {
+      <%
+      output_type = type_add_size(op.output_type, bit_size)
+      input_types = [type_add_size(type_, bit_size) for type_ in op.input_types]
+      %>

-      struct ${op.input_types[j]}_vec src${j} = {
-      % for k in range(op.input_sizes[j]):
-         % if op.input_types[j] == "bool":
-            _src[${j}].u[${k}] != 0,
-         % else:
-            _src[${j}].${op.input_types[j][:1]}[${k}],
-         % endif
-      % endfor
-      };
-   % endfor
+      ## For each non-per-component input, create a variable srcN that
+      ## contains x, y, z, and w elements which are filled in with the
+      ## appropriately-typed values.
+      % for j in range(op.num_inputs):
+         % if op.input_sizes[j] == 0:
+            <% continue %>
+         % elif "src" + str(j) not in op.const_expr:
+            ## Avoid unused variable warnings
+            <% continue %>
+         %endif

-   % if op.output_size == 0:
-      ## For per-component instructions, we need to iterate over the
-      ## components and apply the constant expression one component
-      ## at a time.
-      for (unsigned _i = 0; _i < num_components; _i++) {
-         ## For each per-component input, create a variable srcN that
-         ## contains the value of the current (_i'th) component.
-         % for j in range(op.num_inputs):
-            % if op.input_sizes[j] != 0:
-               <% continue %>
-            % elif "src" + str(j) not in op.const_expr:
-               ## Avoid unused variable warnings
-               <% continue %>
-            % elif op.input_types[j] == "bool":
-               bool src${j} = _src[${j}].u[_i] != 0;
+         struct ${input_types[j]}_vec src${j} = {
+         % for k in range(op.input_sizes[j]):
+            % if input_types[j] == "bool32":
+               _src[${j}].u32[${k}] != 0,
            % else:
-               ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i];
+               _src[${j}].${get_const_field(input_types[j])}[${k}],
            % endif
         % endfor
+         };
+      % endfor
+
+      % if op.output_size == 0:
+         ## For per-component instructions, we need to iterate over the
+         ## components and apply the constant expression one component
+         ## at a time.
+         for (unsigned _i = 0; _i < num_components; _i++) {
+            ## For each per-component input, create a variable srcN that
+            ## contains the value of the current (_i'th) component.
+            % for j in range(op.num_inputs):
+               % if op.input_sizes[j] != 0:
+                  <% continue %>
+               % elif "src" + str(j) not in op.const_expr:
+                  ## Avoid unused variable warnings
+                  <% continue %>
+               % elif input_types[j] == "bool32":
+                  bool src${j} = _src[${j}].u32[_i] != 0;
+               % else:
+                  ${input_types[j]}_t src${j} =
+                     _src[${j}].${get_const_field(input_types[j])}[_i];
+               % endif
+            % endfor
+
+            ## Create an appropriately-typed variable dst and assign the
+            ## result of the const_expr to it.  If const_expr already contains
+            ## writes to dst, just include const_expr directly.
+            % if "dst" in op.const_expr:
+               ${output_type}_t dst;
+               ${op.const_expr}
+            % else:
+               ${output_type}_t dst = ${op.const_expr};
+            % endif
+
+            ## Store the current component of the actual destination to the
+            ## value of dst.
+            % if output_type == "bool32":
+               ## Sanitize the C value to a proper NIR bool
+               _dst_val.u32[_i] = dst ? NIR_TRUE : NIR_FALSE;
+            % else:
+               _dst_val.${get_const_field(output_type)}[_i] = dst;
+            % endif
+         }
+      % else:
+         ## In the non-per-component case, create a struct dst with
+         ## appropriately-typed elements x, y, z, and w and assign the result
+         ## of the const_expr to all components of dst, or include the
+         ## const_expr directly if it writes to dst already.
+         struct ${output_type}_vec dst;

-         ## Create an appropriately-typed variable dst and assign the
-         ## result of the const_expr to it.  If const_expr already contains
-         ## writes to dst, just include const_expr directly.
         % if "dst" in op.const_expr:
-            ${op.output_type} dst;
            ${op.const_expr}
         % else:
-            ${op.output_type} dst = ${op.const_expr};
+            ## Splat the value to all components.  This way expressions which
+            ## write the same value to all components don't need to explicitly
+            ## write to dest.  One such example is fnoise which has a
+            ## const_expr of 0.0f.
+            dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
         % endif

-         ## Store the current component of the actual destination to the
-         ## value of dst.
-         % if op.output_type == "bool":
-            ## Sanitize the C value to a proper NIR bool
-            _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
-         % else:
-            _dst_val.${op.output_type[:1]}[_i] = dst;
-         % endif
-      }
-   % else:
-      ## In the non-per-component case, create a struct dst with
-      ## appropriately-typed elements x, y, z, and w and assign the result
-      ## of the const_expr to all components of dst, or include the
-      ## const_expr directly if it writes to dst already.
-      struct ${op.output_type}_vec dst;
-
-      % if "dst" in op.const_expr:
-         ${op.const_expr}
-      % else:
-         ## Splat the value to all components.  This way expressions which
-         ## write the same value to all components don't need to explicitly
-         ## write to dest.  One such example is fnoise which has a
-         ## const_expr of 0.0f.
-         dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
+         ## For each component in the destination, copy the value of dst to
+         ## the actual destination.
+         % for k in range(op.output_size):
+            % if output_type == "bool32":
+               ## Sanitize the C value to a proper NIR bool
+               _dst_val.u32[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
+            % else:
+               _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]};
+            % endif
+         % endfor
      % endif

-      ## For each component in the destination, copy the value of dst to
-      ## the actual destination.
-      % for k in range(op.output_size):
-         % if op.output_type == "bool":
-            ## Sanitize the C value to a proper NIR bool
-            _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
-         % else:
-            _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]};
-         % endif
-      % endfor
-   % endif
+      break;
+   }
+   % endfor
+
+   default:
+      unreachable("unknown bit width");
+   }

   return _dst_val;
 }
@@ -316,12 +385,12 @@ evaluate_${name}(unsigned num_components, nir_const_value *_src)

 nir_const_value
 nir_eval_const_opcode(nir_op op, unsigned num_components,
-                      nir_const_value *src)
+                      unsigned bit_width, nir_const_value *src)
 {
   switch (op) {
 % for name in sorted(opcodes.iterkeys()):
   case nir_op_${name}: {
-      return evaluate_${name}(num_components, src);
+      return evaluate_${name}(num_components, bit_width, src);
      break;
   }
 % endfor
@@ -333,4 +402,7 @@ nir_eval_const_opcode(nir_op op, unsigned num_components,
 from nir_opcodes import opcodes
 from mako.template import Template

-print Template(template).render(opcodes=opcodes)
+print Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
+                                type_has_size=type_has_size,
+                                type_add_size=type_add_size,
+                                get_const_field=get_const_field)
@@ -342,7 +342,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
         nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                  nir_parallel_copy_entry);
         nir_ssa_dest_init(&pcopy->instr, &entry->dest,
-                           phi->dest.ssa.num_components, src->src.ssa->name);
+                           phi->dest.ssa.num_components,
+                           phi->dest.ssa.bit_size, src->src.ssa->name);
         exec_list_push_tail(&pcopy->entries, &entry->node);

         assert(src->src.is_ssa);
@@ -355,7 +356,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
      nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                               nir_parallel_copy_entry);
      nir_ssa_dest_init(&block_pcopy->instr, &entry->dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.name);
+                        phi->dest.ssa.num_components, phi->dest.ssa.bit_size,
+                        phi->dest.ssa.name);
      exec_list_push_tail(&block_pcopy->entries, &entry->node);

      nir_ssa_def_rewrite_uses(&phi->dest.ssa,
@@ -77,13 +77,13 @@ nir_gs_count_vertices(const nir_shader *shader)
               return -1;

            if (count == -1)
-               count = val->i[0];
+               count = val->i32[0];

            /* We've found contradictory set_vertex_count intrinsics.
             * This can happen if there are early-returns in main() and
             * different paths emit different numbers of vertices.
             */
-            if (count != val->i[0])
+            if (count != val->i32[0])
               return -1;
         }
      }
@@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr)
 {
   hash = HASH(hash, instr->op);
   hash = HASH(hash, instr->dest.dest.ssa.num_components);
+   /* We explicitly don't hash instr->exact */

   if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
      assert(nir_op_infos[instr->op].num_inputs == 2);
@@ -81,9 +82,9 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
 {
   hash = HASH(hash, instr->def.num_components);

-   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f,
+   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32,
                                          instr->def.num_components
-                                             * sizeof(instr->value.f[0]));
+                                             * sizeof(instr->value.f32[0]));

   return hash;
 }
@@ -267,6 +268,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
      if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
         return false;

+      /* We explicitly don't compare alu1->exact and alu2->exact */
+
      if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
         assert(nir_op_infos[alu1->op].num_inputs == 2);
         return (nir_alu_srcs_equal(alu1, alu2, 0, 0) &&
@@ -322,8 +325,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
      if (load1->def.num_components != load2->def.num_components)
         return false;

-      return memcmp(load1->value.f, load2->value.f,
-                    load1->def.num_components * sizeof(*load2->value.f)) == 0;
+      return memcmp(load1->value.f32, load2->value.f32,
+                    load1->def.num_components * sizeof(*load2->value.f32)) == 0;
   }
   case nir_instr_type_phi: {
      nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
@@ -496,8 +499,17 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr)
   struct set_entry *entry = _mesa_set_search(instr_set, instr);
   if (entry) {
      nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr);
-      nir_ssa_def *new_def =
-         nir_instr_get_dest_ssa_def((nir_instr *) entry->key);
+      nir_instr *match = (nir_instr *) entry->key;
+      nir_ssa_def *new_def = nir_instr_get_dest_ssa_def(match);
+
+      /* It's safe to replace an exact instruction with an inexact one as
+       * long as we make it exact.  If we got here, the two instructions are
+       * exactly identical in every other way so, once we've set the exact
+       * bit, they are the same.
+       */
+      if (instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->exact)
+         nir_instr_as_alu(match)->exact = true;
+
      nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def));
      return true;
   }
@@ -31,9 +31,11 @@
 */

 static void
-nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components)
+nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components,
+                      unsigned bit_size)
 {
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                     bit_size, NULL);
   instr->dest.write_mask = (1 << num_components) - 1;
 }

@@ -46,7 +48,7 @@ lower_reduction(nir_alu_instr *instr, nir_op chan_op, nir_op merge_op,
   nir_ssa_def *last = NULL;
   for (unsigned i = 0; i < num_components; i++) {
      nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op);
-      nir_alu_ssa_dest_init(chan, 1);
+      nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size);
      nir_alu_src_copy(&chan->src[0], &instr->src[0], chan);
      chan->src[0].swizzle[0] = chan->src[0].swizzle[i];
      if (nir_op_infos[chan_op].num_inputs > 1) {
@@ -80,6 +82,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
   assert(instr->dest.write_mask != 0);

   b->cursor = nir_before_instr(&instr->instr);
+   b->exact = instr->exact;

 #define LOWER_REDUCTION(name, chan, merge) \
   case name##2: \
@@ -220,7 +223,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
            lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan];
      }

-      nir_alu_ssa_dest_init(lower, 1);
+      nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size);
      lower->dest.saturate = instr->dest.saturate;
      comps[chan] = &lower->dest.dest.ssa;

@@ -75,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr,
      state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index);

   nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
-   offset_const->value.u[0] = instr->variables[0]->var->data.offset;
+   offset_const->value.u32[0] = instr->variables[0]->var->data.offset;

   nir_instr_insert_before(&instr->instr, &offset_const->instr);

@@ -90,17 +90,17 @@ lower_instr(nir_intrinsic_instr *instr,
      unsigned child_array_elements = tail->child != NULL ?
         glsl_get_aoa_size(tail->type) : 1;

-      offset_const->value.u[0] += deref_array->base_offset *
+      offset_const->value.u32[0] += deref_array->base_offset *
         child_array_elements * ATOMIC_COUNTER_SIZE;

      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
         nir_load_const_instr *atomic_counter_size =
               nir_load_const_instr_create(mem_ctx, 1);
-         atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
+         atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
         nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);

         nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
         mul->dest.write_mask = 0x1;
         nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul);
         mul->src[1].src.is_ssa = true;
@@ -108,7 +108,7 @@ lower_instr(nir_intrinsic_instr *instr,
         nir_instr_insert_before(&instr->instr, &mul->instr);

         nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd);
-         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
         add->dest.write_mask = 0x1;
         add->src[0].src.is_ssa = true;
         add->src[0].src.ssa = &mul->dest.dest.ssa;
@@ -125,7 +125,7 @@ lower_instr(nir_intrinsic_instr *instr,

   if (instr->dest.is_ssa) {
      nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
-                        instr->dest.ssa.num_components, NULL);
+                        instr->dest.ssa.num_components, 32, NULL);
      nir_ssa_def_rewrite_uses(&instr->dest.ssa,
                               nir_src_for_ssa(&new_instr->dest.ssa));
   } else {
@@ -88,7 +88,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val)
   load->num_components = 4;
   nir_intrinsic_set_base(load, in->data.driver_location);
   load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-   nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &load->instr);

   val[0] = nir_channel(b, &load->dest.ssa, 0);
@@ -75,8 +75,9 @@ emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
      if (src == NULL) {
         /* We're a load.  We need to insert a phi node */
         nir_phi_instr *phi = nir_phi_instr_create(b->shader);
+         unsigned bit_size = then_dest->bit_size;
         nir_ssa_dest_init(&phi->instr, &phi->dest,
-                           then_dest->num_components, NULL);
+                           then_dest->num_components, bit_size, NULL);

         nir_phi_src *src0 = ralloc(phi, nir_phi_src);
         src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt));
@@ -125,8 +126,9 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
      load->num_components = orig_instr->num_components;
      load->variables[0] =
         nir_deref_as_var(nir_copy_deref(load, &deref->deref));
+      unsigned bit_size = orig_instr->dest.ssa.bit_size;
      nir_ssa_dest_init(&load->instr, &load->dest,
-                        load->num_components, NULL);
+                        load->num_components, bit_size, NULL);
      nir_builder_instr_insert(b, &load->instr);
      *dest = &load->dest.ssa;
   } else {
@@ -289,7 +289,8 @@ nir_lower_io_block(nir_block *block, void *void_state)

         if (intrin->dest.is_ssa) {
            nir_ssa_dest_init(&load->instr, &load->dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     nir_src_for_ssa(&load->dest.ssa));
         } else {
@@ -369,7 +370,8 @@ nir_lower_io_block(nir_block *block, void *void_state)

         if (intrin->dest.is_ssa) {
            nir_ssa_dest_init(&atomic->instr, &atomic->dest,
-                              intrin->dest.ssa.num_components, NULL);
+                              intrin->dest.ssa.num_components,
+                              intrin->dest.ssa.bit_size, NULL);
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     nir_src_for_ssa(&atomic->dest.ssa));
         } else {
@@ -49,7 +49,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower)
   nir_ssa_def *loads[4];
   for (unsigned i = 0; i < lower->def.num_components; i++) {
      nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1);
-      load_comp->value.u[0] = lower->value.u[i];
+      load_comp->value.u32[0] = lower->value.u32[i];
      nir_builder_instr_insert(&b, &load_comp->instr);
      loads[i] = &load_comp->def;
   }
@@ -161,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
      if (src.reg.indirect) {
         nir_load_const_instr *load_const =
            nir_load_const_instr_create(state->shader, 1);
-         load_const->value.u[0] = glsl_get_length(parent_type);
+         load_const->value.u32[0] = glsl_get_length(parent_type);
         nir_instr_insert_before(instr, &load_const->instr);

         nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul);
@@ -169,7 +169,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
         mul->src[1].src.is_ssa = true;
         mul->src[1].src.ssa = &load_const->def;
         mul->dest.write_mask = 1;
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
         nir_instr_insert_before(instr, &mul->instr);

         src.reg.indirect->is_ssa = true;
@@ -187,7 +187,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
            add->src[0].src = *src.reg.indirect;
            nir_src_copy(&add->src[1].src, &deref_array->indirect, add);
            add->dest.write_mask = 1;
-            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
            nir_instr_insert_before(instr, &add->instr);

            src.reg.indirect->is_ssa = true;
@@ -221,7 +221,8 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
         mov->dest.write_mask = (1 << intrin->num_components) - 1;
         if (intrin->dest.is_ssa) {
            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     nir_src_for_ssa(&mov->dest.dest.ssa));
         } else {
@@ -188,6 +188,8 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
      if (!should_lower_phi(phi, state))
         continue;

+      unsigned bit_size = phi->dest.ssa.bit_size;
+
      /* Create a vecN operation to combine the results.  Most of these
       * will be redundant, but copy propagation should clean them up for
       * us.  No need to add the complexity here.
@@ -202,12 +204,14 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)

      nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op);
      nir_ssa_dest_init(&vec->instr, &vec->dest.dest,
-                        phi->dest.ssa.num_components, NULL);
+                        phi->dest.ssa.num_components,
+                        bit_size, NULL);
      vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;

      for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) {
         nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx);
-         nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, NULL);
+         nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1,
+                           phi->dest.ssa.bit_size, NULL);

         vec->src[i].src = nir_src_for_ssa(&new_phi->dest.ssa);

@@ -215,7 +219,7 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
            /* We need to insert a mov to grab the i'th component of src */
            nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx,
                                                      nir_op_imov);
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL);
            mov->dest.write_mask = 1;
            nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx);
            mov->src[0].swizzle[0] = i;
@@ -65,9 +65,9 @@ convert_block(nir_block *block, void *void_state)
          */

         nir_const_value local_size;
-         local_size.u[0] = b->shader->info.cs.local_size[0];
-         local_size.u[1] = b->shader->info.cs.local_size[1];
-         local_size.u[2] = b->shader->info.cs.local_size[2];
+         local_size.u32[0] = b->shader->info.cs.local_size[0];
+         local_size.u32[1] = b->shader->info.cs.local_size[1];
+         local_size.u32[2] = b->shader->info.cs.local_size[2];

         nir_ssa_def *group_id =
            nir_load_system_value(b, nir_intrinsic_load_work_group_id, 0);
@@ -140,7 +140,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex)
   txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0));
   txs->src[0].src_type = nir_tex_src_lod;

-   nir_ssa_dest_init(&txs->instr, &txs->dest, 2, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest, 2, 32, NULL);
   nir_builder_instr_insert(b, &txs->instr);

   return nir_i2f(b, &txs->dest.ssa);
@@ -223,13 +223,13 @@ get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val)
   memset(&v, 0, sizeof(v));

   if (swizzle_val == 4) {
-      v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0;
+      v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 0;
   } else {
      assert(swizzle_val == 5);
      if (type == nir_type_float)
-         v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0;
+         v.f32[0] = v.f32[1] = v.f32[2] = v.f32[3] = 1.0;
      else
-         v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1;
+         v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 1;
   }

   return nir_build_imm(b, 4, v);
@@ -74,7 +74,7 @@ load_input(nir_builder *b, nir_variable *in)
   load->num_components = 4;
   nir_intrinsic_set_base(load, in->data.driver_location);
   load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-   nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &load->instr);

   return &load->dest.ssa;
@@ -116,12 +116,15 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
      assert(src_tail->type == dest_tail->type);

      unsigned num_components = glsl_get_vector_elements(src_tail->type);
+      unsigned bit_size =
+         glsl_get_bit_size(glsl_get_base_type(src_tail->type));

      nir_intrinsic_instr *load =
         nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_load_var);
      load->num_components = num_components;
      load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &src_head->deref));
-      nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size,
+                        NULL);

      nir_instr_insert_before(&copy_instr->instr, &load->instr);

@@ -505,6 +505,7 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
            nir_ssa_undef_instr *undef =
               nir_ssa_undef_instr_create(state->shader,
                                          intrin->num_components);
+            undef->def.bit_size = intrin->dest.ssa.bit_size;

            nir_instr_insert_before(&intrin->instr, &undef->instr);
            nir_instr_remove(&intrin->instr);
@@ -528,7 +529,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)

         mov->dest.write_mask = (1 << intrin->num_components) - 1;
         nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                           intrin->num_components, NULL);
+                           intrin->num_components,
+                           intrin->dest.ssa.bit_size, NULL);

         nir_instr_insert_before(&intrin->instr, &mov->instr);
         nir_instr_remove(&intrin->instr);
@@ -719,6 +721,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
      node->pb_value =
         nir_phi_builder_add_value(state.phi_builder,
                                   glsl_get_vector_elements(node->type),
+                                   glsl_get_bit_size(glsl_get_base_type(node->type)),
                                   store_blocks);

      if (node->deref->var->constant_initializer) {
@@ -90,8 +90,12 @@ class Opcode(object):
 # helper variables for strings
 tfloat = "float"
 tint = "int"
-tbool = "bool"
+tbool = "bool32"
 tuint = "uint"
+tfloat32 = "float32"
+tint32 = "int32"
+tuint32 = "uint32"
+tfloat64 = "float64"

 commutative = "commutative "
 associative = "associative "
@@ -155,57 +159,57 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)")
 unop("fsqrt", tfloat, "sqrtf(src0)")
 unop("fexp2", tfloat, "exp2f(src0)")
 unop("flog2", tfloat, "log2f(src0)")
-unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion.
-unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion
-unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion.
+unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
+unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
+unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
 # Float-to-boolean conversion
-unop_convert("f2b", tbool, tfloat, "src0 != 0.0f")
+unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
 # Boolean-to-float conversion
-unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f")
+unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 # Int-to-boolean conversion
-unop_convert("i2b", tbool, tint, "src0 != 0")
-unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
-unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion.
+unop_convert("i2b", tbool, tint32, "src0 != 0")
+unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.

 # Unary floating-point rounding operations.


-unop("ftrunc", tfloat, "truncf(src0)")
-unop("fceil", tfloat, "ceilf(src0)")
-unop("ffloor", tfloat, "floorf(src0)")
-unop("ffract", tfloat, "src0 - floorf(src0)")
-unop("fround_even", tfloat, "_mesa_roundevenf(src0)")
+unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
+unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
+unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
+unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
+unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")

 unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")

 # Trigonometric operations.


-unop("fsin", tfloat, "sinf(src0)")
-unop("fcos", tfloat, "cosf(src0)")
+unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
+unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")


 # Partial derivatives.


-unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
-unop("fddy", tfloat, "0.0f")
-unop("fddx_fine", tfloat, "0.0f")
-unop("fddy_fine", tfloat, "0.0f")
-unop("fddx_coarse", tfloat, "0.0f")
-unop("fddy_coarse", tfloat, "0.0f")
+unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
+unop("fddy", tfloat, "0.0")
+unop("fddx_fine", tfloat, "0.0")
+unop("fddy_fine", tfloat, "0.0")
+unop("fddx_coarse", tfloat, "0.0")
+unop("fddy_coarse", tfloat, "0.0")


 # Floating point pack and unpack operations.

 def pack_2x16(fmt):
-   unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """
+   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 """.replace("fmt", fmt))

 def pack_4x8(fmt):
-   unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """
+   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
@@ -213,13 +217,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 """.replace("fmt", fmt))

 def unpack_2x16(fmt):
-   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """
+   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 """.replace("fmt", fmt))

 def unpack_4x8(fmt):
-   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """
+   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
@@ -238,11 +242,11 @@ unpack_2x16("unorm")
 unpack_4x8("unorm")
 unpack_2x16("half")

-unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
+unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 dst.x = (src0.x & 0xffff) | (src0.y >> 16);
 """)

-unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
+unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
@@ -252,22 +256,22 @@ dst.x = (src0.x <<  0) |
 # Lowered floating point unpacking operations.


-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint,
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint,
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
           "unpack_half_1x16((uint16_t)(src0.x >> 16))")


 # Bit operations, part of ARB_gpu_shader5.


-unop("bitfield_reverse", tuint, """
+unop("bitfield_reverse", tuint32, """
 /* we're not winning any awards for speed here, but that's ok */
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
 """)
-unop("bit_count", tuint, """
+unop("bit_count", tuint32, """
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1)
@@ -275,7 +279,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
 }
 """)

-unop_convert("ufind_msb", tint, tuint, """
+unop_convert("ufind_msb", tint32, tuint32, """
 dst = -1;
 for (int bit = 31; bit > 0; bit--) {
   if ((src0 >> bit) & 1) {
@@ -285,7 +289,7 @@ for (int bit = 31; bit > 0; bit--) {
 }
 """)

-unop("ifind_msb", tint, """
+unop("ifind_msb", tint32, """
 dst = -1;
 for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
@@ -299,7 +303,7 @@ for (int bit = 31; bit >= 0; bit--) {
 }
 """)

-unop("find_lsb", tint, """
+unop("find_lsb", tint32, """
 dst = -1;
 for (unsigned bit = 0; bit < 32; bit++) {
   if ((src0 >> bit) & 1) {
@@ -359,10 +363,10 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1")
 # low 32-bits of signed/unsigned integer multiply
 binop("imul", tint, commutative + associative, "src0 * src1")
 # high 32-bits of signed integer multiply
-binop("imul_high", tint, commutative,
+binop("imul_high", tint32, commutative,
      "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 # high 32-bits of unsigned integer multiply
-binop("umul_high", tuint, commutative,
+binop("umul_high", tuint32, commutative,
      "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

 binop("fdiv", tfloat, "", "src0 / src1")
@@ -427,18 +431,18 @@ binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",

 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

-binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
+binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
             "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
-binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
+binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
             "{src0} || {src1}", "{src} ? 1.0f : 0.0f")

 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 # and false respectively

-binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
-binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
-binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
-binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
+binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
+binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
+binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal


 binop("ishl", tint, "", "src0 << src1")
@@ -461,11 +465,11 @@ binop("ixor", tuint, commutative + associative, "src0 ^ src1")
 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 # for true and 0.0 for false

-binop("fand", tfloat, commutative,
+binop("fand", tfloat32, commutative,
      "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
-binop("for", tfloat, commutative,
+binop("for", tfloat32, commutative,
      "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
-binop("fxor", tfloat, commutative,
+binop("fxor", tfloat32, commutative,
      "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
@@ -487,7 +491,7 @@ binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")

 # Saturated vector add for 4 8bit ints.
-binop("usadd_4x8", tint, commutative + associative, """
+binop("usadd_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
@@ -495,7 +499,7 @@ for (int i = 0; i < 32; i += 8) {
 """)

 # Saturated vector subtract for 4 8bit ints.
-binop("ussub_4x8", tint, "", """
+binop("ussub_4x8", tint32, "", """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
@@ -506,7 +510,7 @@ for (int i = 0; i < 32; i += 8) {
 """)

 # vector min for 4 8bit ints.
-binop("umin_4x8", tint, commutative + associative, """
+binop("umin_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -514,7 +518,7 @@ for (int i = 0; i < 32; i += 8) {
 """)

 # vector max for 4 8bit ints.
-binop("umax_4x8", tint, commutative + associative, """
+binop("umax_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -522,7 +526,7 @@ for (int i = 0; i < 32; i += 8) {
 """)

 # unorm multiply: (a * b) / 255.
-binop("umul_unorm_4x8", tint, commutative + associative, """
+binop("umul_unorm_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
@@ -531,15 +535,15 @@ for (int i = 0; i < 32; i += 8) {
 }
 """)

-binop("fpow", tfloat, "", "powf(src0, src1)")
+binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")  # double -> pow(), float -> powf()

-binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
+binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")

 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 # if either of its arguments are 32.
-binop_convert("bfm", tuint, tint, "", """
+binop_convert("bfm", tuint32, tint32, "", """
 int bits = src0, offset = src1;
 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
@@ -548,7 +552,7 @@ else
 """)

 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
-dst = ldexpf(src0, src1);
+dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 /* flush denormals to zero. */
 if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
@@ -588,12 +592,12 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


-triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
+triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 opcode("bcsel", 0, tuint, [0, 0, 0],
      [tbool, tuint, tuint], "", "src0 ? src1 : src2")

 # SM5 bfi assembly
-triop("bfi", tuint, """
+triop("bfi", tuint32, """
 unsigned mask = src0, insert = src1, base = src2;
 if (mask == 0) {
   dst = base;
@@ -608,8 +612,8 @@ if (mask == 0) {
 """)

 # SM5 ubfe/ibfe assembly
-opcode("ubfe", 0, tuint,
-       [0, 0, 0], [tuint, tint, tint], "", """
+opcode("ubfe", 0, tuint32,
+       [0, 0, 0], [tuint32, tint32, tint32], "", """
 unsigned base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -622,8 +626,8 @@ if (bits == 0) {
   dst = base >> offset;
 }
 """)
-opcode("ibfe", 0, tint,
-       [0, 0, 0], [tint, tint, tint], "", """
+opcode("ibfe", 0, tint32,
+       [0, 0, 0], [tint32, tint32, tint32], "", """
 int base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -638,8 +642,8 @@ if (bits == 0) {
 """)

 # GLSL bitfieldExtract()
-opcode("ubitfield_extract", 0, tuint,
-       [0, 0, 0], [tuint, tint, tint], "", """
+opcode("ubitfield_extract", 0, tuint32,
+       [0, 0, 0], [tuint32, tint32, tint32], "", """
 unsigned base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -650,8 +654,8 @@ if (bits == 0) {
   dst = (base >> offset) & ((1ull << bits) - 1);
 }
 """)
-opcode("ibitfield_extract", 0, tint,
-       [0, 0, 0], [tint, tint, tint], "", """
+opcode("ibitfield_extract", 0, tint32,
+       [0, 0, 0], [tint32, tint32, tint32], "", """
 int base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -678,8 +682,8 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
          [tuint, tuint, tuint, tuint],
          "", const_expr)

-opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0],
-       [tuint, tuint, tint, tint], "", """
+opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
+       [tuint32, tuint32, tint32, tint32], "", """
 unsigned base = src0, insert = src1;
 int offset = src2, bits = src3;
 if (bits == 0) {
@@ -35,10 +35,17 @@ d = 'd'

 # Written in the form (<search>, <replace>) where <search> is an expression
 # and <replace> is either an expression or a value.  An expression is
-# defined as a tuple of the form (<op>, <src0>, <src1>, <src2>, <src3>)
+# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
 # where each source is either an expression or a value.  A value can be
 # either a numeric constant or a string representing a variable name.
 #
+# If the opcode in a search expression is prefixed by a '~' character, this
+# indicates that the operation is inexact.  Such operations will only get
+# applied to SSA values that do not have the exact bit set.  This should be
+# used by any optimizations that are not bit-for-bit exact.  It should not,
+# however, be used for backend-requested lowering operations as those need to
+# happen regardless of precision.
+#
-# Variable names are specified as "[#]name[@type]" where "#" inicates that
+# Variable names are specified as "[#]name[@type]" where "#" indicates that
 # the given variable will only match constants and the type indicates that
 # the given variable will only match values from ALU instructions with the
@@ -55,19 +62,19 @@ optimizations = [
   (('fabs', ('fneg', a)), ('fabs', a)),
   (('iabs', ('iabs', a)), ('iabs', a)),
   (('iabs', ('ineg', a)), ('iabs', a)),
-   (('fadd', a, 0.0), a),
+   (('~fadd', a, 0.0), a),
   (('iadd', a, 0), a),
   (('usadd_4x8', a, 0), a),
   (('usadd_4x8', a, ~0), ~0),
-   (('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
+   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
   (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
-   (('fadd', ('fneg', a), a), 0.0),
+   (('~fadd', ('fneg', a), a), 0.0),
   (('iadd', ('ineg', a), a), 0),
   (('iadd', ('ineg', a), ('iadd', a, b)), b),
   (('iadd', a, ('iadd', ('ineg', a), b)), b),
-   (('fadd', ('fneg', a), ('fadd', a, b)), b),
-   (('fadd', a, ('fadd', ('fneg', a), b)), b),
-   (('fmul', a, 0.0), 0.0),
+   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
+   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
+   (('~fmul', a, 0.0), 0.0),
   (('imul', a, 0), 0),
   (('umul_unorm_4x8', a, 0), 0),
   (('umul_unorm_4x8', a, ~0), a),
@@ -76,32 +83,48 @@ optimizations = [
   (('fmul', a, -1.0), ('fneg', a)),
   (('imul', a, -1), ('ineg', a)),
   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
-   (('ffma', 0.0, a, b), b),
-   (('ffma', a, 0.0, b), b),
-   (('ffma', a, b, 0.0), ('fmul', a, b)),
+   (('~ffma', 0.0, a, b), b),
+   (('~ffma', a, 0.0, b), b),
+   (('~ffma', a, b, 0.0), ('fmul', a, b)),
   (('ffma', a, 1.0, b), ('fadd', a, b)),
   (('ffma', 1.0, a, b), ('fadd', a, b)),
-   (('flrp', a, b, 0.0), a),
-   (('flrp', a, b, 1.0), b),
-   (('flrp', a, a, b), a),
-   (('flrp', 0.0, a, b), ('fmul', a, b)),
+   (('~flrp', a, b, 0.0), a),
+   (('~flrp', a, b, 1.0), b),
+   (('~flrp', a, a, b), a),
+   (('~flrp', 0.0, a, b), ('fmul', a, b)),
+   (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
   (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'),
   (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
-   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp'),
-   (('fadd', a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg',         c ))), ('fmul', b,         c )), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~fadd', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
   (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
-   (('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
+   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
   # Comparison simplifications
-   (('inot', ('flt', a, b)), ('fge', a, b)),
-   (('inot', ('fge', a, b)), ('flt', a, b)),
-   (('inot', ('feq', a, b)), ('fne', a, b)),
-   (('inot', ('fne', a, b)), ('feq', a, b)),
+   (('~inot', ('flt', a, b)), ('fge', a, b)),
+   (('~inot', ('fge', a, b)), ('flt', a, b)),
+   (('~inot', ('feq', a, b)), ('fne', a, b)),
+   (('~inot', ('fne', a, b)), ('feq', a, b)),
   (('inot', ('ilt', a, b)), ('ige', a, b)),
   (('inot', ('ige', a, b)), ('ilt', a, b)),
   (('inot', ('ieq', a, b)), ('ine', a, b)),
   (('inot', ('ine', a, b)), ('ieq', a, b)),
+
+   # 0.0 >= b2f(a)
+   # b2f(a) <= 0.0
+   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
+   # inot(a)
+   (('fge', 0.0, ('b2f', a)), ('inot', a)),
+
+   # 0.0 < fabs(a)
+   # fabs(a) > 0.0
+   # fabs(a) != 0.0 because fabs(a) must be >= 0
+   # a != 0.0
+   (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
+
   (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
-   (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)),
+   (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
   (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
@@ -111,15 +134,19 @@ optimizations = [
   (('imax', a, a), a),
   (('umin', a, a), a),
   (('umax', a, a), a),
-   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
-   (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
+   (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
   (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
   (('fsat', ('fsat', a)), ('fsat', a)),
   (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
-   (('ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
-   (('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
-   (('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
-   (('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
+   (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
+   (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
+   (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
+   (('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
+   (('fabs', ('slt', a, b)), ('slt', a, b)),
+   (('fabs', ('sge', a, b)), ('sge', a, b)),
+   (('fabs', ('seq', a, b)), ('seq', a, b)),
+   (('fabs', ('sne', a, b)), ('sne', a, b)),
   (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
   (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
   (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
@@ -151,7 +178,6 @@ optimizations = [
   (('ior', a, 0), a),
   (('fxor', a, a), 0.0),
   (('ixor', a, a), 0),
-   (('fxor', a, 0.0), a),
   (('ixor', a, 0), a),
   (('inot', ('inot', a)), a),
   # DeMorgan's Laws
@@ -167,35 +193,35 @@ optimizations = [
   (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
   (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
   # Exponential/logarithmic identities
-   (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a
-   (('flog2', ('fexp2', a)), a), # lg2(2^a) = a
+   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
+   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
   (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
-   (('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
-   (('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
-    ('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
-   (('fpow', a, 1.0), a),
-   (('fpow', a, 2.0), ('fmul', a, a)),
-   (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
-   (('fpow', 2.0, a), ('fexp2', a)),
-   (('fpow', ('fpow', a, 2.2), 0.454545), a),
-   (('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
-   (('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
-   (('frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
-   (('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
-   (('flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
-   (('flog2', ('frcp', a)), ('fneg', ('flog2', a))),
-   (('flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
-   (('flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
-   (('fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
-   (('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
-   (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
+   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
+   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
+    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
+   (('~fpow', a, 1.0), a),
+   (('~fpow', a, 2.0), ('fmul', a, a)),
+   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
+   (('~fpow', 2.0, a), ('fexp2', a)),
+   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
+   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
+   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
+   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
+   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
+   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
+   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
+   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
+   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
+   (('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
+   (('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
+   (('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
   # Division and reciprocal
-   (('fdiv', 1.0, a), ('frcp', a)),
+   (('~fdiv', 1.0, a), ('frcp', a)),
   (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
-   (('frcp', ('frcp', a)), a),
-   (('frcp', ('fsqrt', a)), ('frsq', a)),
+   (('~frcp', ('frcp', a)), a),
+   (('~frcp', ('fsqrt', a)), ('frsq', a)),
   (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
-   (('frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
+   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
   # Boolean simplifications
   (('ieq', 'a@bool', True), a),
   (('ine', 'a@bool', True), ('inot', a)),
@@ -216,6 +242,10 @@ optimizations = [
   (('i2b', ('b2i', a)), a),
   (('f2i', ('ftrunc', a)), ('f2i', a)),
   (('f2u', ('ftrunc', a)), ('f2u', a)),
+   (('i2b', ('ineg', a)), ('i2b', a)),
+   (('i2b', ('iabs', a)), ('i2b', a)),
+   (('fabs', ('b2f', a)), ('b2f', a)),
+   (('iabs', ('b2i', a)), ('b2i', a)),

   # Byte extraction
   (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
@@ -228,7 +258,7 @@ optimizations = [
   (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),

   # Subtracts
-   (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
+   (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
   (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
   (('ussub_4x8', a, 0), a),
   (('ussub_4x8', a, ~0), 0),
@@ -236,7 +266,7 @@ optimizations = [
   (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
   (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
   (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
-   (('fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
+   (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
   (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
   (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
   (('iabs', ('isub', 0, a)), ('iabs', a)),
@@ -368,10 +398,13 @@ for op in ['flt', 'fge', 'feq', 'fne',
 # they help code generation but do not necessarily produce code that is
 # more easily optimizable.
 late_optimizations = [
+   # Most of these optimizations aren't quite safe when you get infinity or
+   # NaN involved but the first one should be fine.
   (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))),
-   (('fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
-   (('feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
-   (('fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+   (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
+   (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
+   (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+
   (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
   (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
   (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
@@ -46,10 +46,28 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
   if (!instr->dest.dest.is_ssa)
      return false;

+   /* In the case that any outputs/inputs have unsized types, then we need to
+    * guess the bit-size. In this case, the validator ensures that all
+    * bit-sizes match so we can just take the bit-size from the first
+    * output/input with an unsized type. If all the outputs/inputs are sized
+    * then we don't need to guess the bit-size at all because the code we
+    * generate for constant opcodes in this case already knows the sizes of
+    * the types involved and does not need the provided bit-size for anything
+    * (although it must still be given a valid bit-size).
+    */
+   unsigned bit_size = 0;
+   if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type))
+      bit_size = instr->dest.dest.ssa.bit_size;
+
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      if (!instr->src[i].src.is_ssa)
         return false;

+      if (bit_size == 0 &&
+          !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_sizes[i])) {
+         bit_size = instr->src[i].src.ssa->bit_size;
+      }
+
      nir_instr *src_instr = instr->src[i].src.ssa->parent_instr;

      if (src_instr->type != nir_instr_type_load_const)
@@ -58,24 +76,31 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)

      for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i);
           j++) {
-         src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]];
+         if (load_const->def.bit_size == 64)
+            src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]];
+         else
+            src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]];
      }

      /* We shouldn't have any source modifiers in the optimization loop. */
      assert(!instr->src[i].abs && !instr->src[i].negate);
   }

+   if (bit_size == 0)
+      bit_size = 32;
+
   /* We shouldn't have any saturate modifiers in the optimization loop. */
   assert(!instr->dest.saturate);

   nir_const_value dest =
      nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components,
-                            src);
+                            bit_size, src);

   nir_load_const_instr *new_instr =
      nir_load_const_instr_create(mem_ctx,
                                  instr->dest.dest.ssa.num_components);

+   new_instr->def.bit_size = instr->dest.dest.ssa.bit_size;
   new_instr->value = dest;

   nir_instr_insert_before(&instr->instr, &new_instr->instr);
@@ -106,7 +131,7 @@ constant_fold_deref(nir_instr *instr, nir_deref_var *deref)
         nir_load_const_instr *indirect =
            nir_instr_as_load_const(arr->indirect.ssa->parent_instr);

-         arr->base_offset += indirect->value.u[0];
+         arr->base_offset += indirect->value.u32[0];

         /* Clear out the source */
         nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL));
@@ -228,7 +228,7 @@ dead_cf_block(nir_block *block)
     if (!const_value)
        return false;

-      opt_constant_if(following_if, const_value->u[0] != 0);
+      opt_constant_if(following_if, const_value->u32[0] != 0);
      return true;
   }

@@ -210,7 +210,8 @@ nir_opt_peephole_select_block(nir_block *block, void *void_state)
      }

      nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.name);
+                        phi->dest.ssa.num_components,
+                        phi->dest.ssa.bit_size, phi->dest.ssa.name);
      sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;

      nir_ssa_def_rewrite_uses(&phi->dest.ssa,
@@ -52,6 +52,7 @@ struct nir_phi_builder_value {

   /* Needed so we can create phis and undefs */
   unsigned num_components;
+   unsigned bit_size;

   /* The list of phi nodes associated with this value.  Phi nodes are not
    * added directly.  Instead, they are created, the instr->block pointer
@@ -61,8 +62,18 @@ struct nir_phi_builder_value {
    */
   struct exec_list phis;

-   /* Array of SSA defs, indexed by block.  If a phi needs to be inserted
-    * in a given block, it will have the magic value NEEDS_PHI.
+   /* Array of SSA defs, indexed by block.  For each block, this array has
+    * one of three types of values:
+    *
+    *  - NULL. Indicates that there is no known definition in this block.  If
+    *    you need to find one, look at the block's immediate dominator.
+    *
+    *  - NEEDS_PHI. Indicates that the block may need a phi node but none has
+    *    been created yet.  If a def is requested for a block, a phi will need
+    *    to be created.
+    *
+    *  - A regular SSA def.  This will be either the result of a phi node or
+    *    one of the defs provided by nir_phi_builder_value_set_block_def().
    */
   nir_ssa_def *defs[0];
 };
@@ -101,7 +112,7 @@ nir_phi_builder_create(nir_function_impl *impl)

 struct nir_phi_builder_value *
 nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
-                          const BITSET_WORD *defs)
+                          unsigned bit_size, const BITSET_WORD *defs)
 {
   struct nir_phi_builder_value *val;
   unsigned i, w_start = 0, w_end = 0;
@@ -109,6 +120,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
   val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks);
   val->builder = pb;
   val->num_components = num_components;
+   val->bit_size = bit_size;
   exec_list_make_empty(&val->phis);
   exec_list_push_tail(&pb->values, &val->node);

@@ -127,8 +139,7 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
      set_foreach(cur->dom_frontier, dom_entry) {
         nir_block *next = (nir_block *) dom_entry->key;

-         /*
-          * If there's more than one return statement, then the end block
+         /* If there's more than one return statement, then the end block
          * can be a join point for some definitions. However, there are
          * no instructions in the end block, so nothing would use those
          * phi nodes. Of course, we couldn't place those phi nodes
@@ -139,6 +150,10 @@ nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
            continue;

         if (val->defs[next->index] == NULL) {
+            /* Instead of creating a phi node immediately, we simply set the
+             * value to the magic value NEEDS_PHI.  Later, we create phi nodes
+             * on demand in nir_phi_builder_value_get_block_def().
+             */
            val->defs[next->index] = NEEDS_PHI;

            if (pb->work[next->index] < pb->iter_count) {
@@ -163,7 +178,9 @@ nir_ssa_def *
 nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
                                    nir_block *block)
 {
+   /* For each block, we have one of three types of values */
   if (val->defs[block->index] == NULL) {
+      /* NULL indicates that we have no SSA def for this block. */
      if (block->imm_dom) {
         /* Grab it from our immediate dominator.  We'll stash it here for
          * easy access later.
@@ -185,17 +202,36 @@ nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
         return &undef->def;
      }
   } else if (val->defs[block->index] == NEEDS_PHI) {
-      /* If we need a phi instruction, go ahead and create one but don't
-       * add it to the program yet.  Later, we'll go through and set up phi
-       * sources and add the instructions will be added at that time.
+      /* The magic value NEEDS_PHI indicates that the block needs a phi node
+       * but none has been created.  We need to create one now so we can
+       * return it to the caller.
+       *
+       * Because a phi node may use SSA defs that it does not dominate (this
+       * happens in loops), we do not yet have enough information to fully
+       * fill out the phi node.  Instead, the phi nodes we create here will be
+       * empty (have no sources) and won't actually be placed in the block's
+       * instruction list yet.  Later, in nir_phi_builder_finish(), we walk
+       * over all of the phi instructions, fill out the sources lists, and
+       * place them at the top of their respective block's instruction list.
+       *
+       * Creating phi nodes on-demand allows us to avoid creating dead phi
+       * nodes that will just get deleted later. While this probably isn't a
+       * big win for a full into-SSA pass, other users may use the phi builder
+       * to make small SSA form repairs where most of the phi nodes will never
+       * be used.
       */
      nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader);
-      nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components, NULL);
+      nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components,
+                        val->bit_size, NULL);
      phi->instr.block = block;
      exec_list_push_tail(&val->phis, &phi->instr.node);
      val->defs[block->index] = &phi->dest.ssa;
      return &phi->dest.ssa;
   } else {
+      /* In this case, we have an actual SSA def.  It's either the result of a
+       * phi node created by the case above or one passed to us through
+       * nir_phi_builder_value_set_block_def().
+       */
      return val->defs[block->index];
   }
 }
@@ -216,9 +252,14 @@ nir_phi_builder_finish(struct nir_phi_builder *pb)
   NIR_VLA(nir_block *, preds, num_blocks);

   foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) {
-      /* We can't iterate over the list of phis normally because we are
-       * removing them as we go and, in some cases, adding new phis as we
-       * build the source lists of others.
+      /* We treat the linked list of phi nodes like a worklist.  The list is
+       * pre-populated by calls to nir_phi_builder_value_get_block_def() that
+       * create phi nodes.  As we fill in the sources of phi nodes, more may
+       * be created and are added to the end of the list.
+       *
+       * Because we are adding and removing phi nodes from the list as we go,
+       * we can't iterate over it normally.  Instead, we just iterate until
+       * the list is empty.
       */
      while (!exec_list_is_empty(&val->phis)) {
         struct exec_node *head = exec_list_get_head(&val->phis);
@@ -25,7 +25,38 @@

 #include "nir.h"

+/** A helper for placing phi nodes in a NIR shader
+ *
+ * Basic usage goes something like this:
+ *
+ *     each variable, var, has:
+ *         a bitset var.defs of blocks where the variable is defined
+ *         a struct nir_phi_builder_value *pb_val
+ *
+ *     // initialize bitsets
+ *     foreach block:
+ *         foreach def of variable var:
+ *             var.defs[def.block] = true;
+ *
+ *     // initialize phi builder
+ *     pb = nir_phi_builder_create()
+ *     foreach var:
+ *         var.pb_val = nir_phi_builder_add_value(pb, var.defs)
+ *
+ *     // Visit each block.  This needs to visit dominators first;
+ *     // nir_for_each_block() will be ok.
+ *     foreach block:
+ *         foreach instruction:
+ *             foreach use of variable var:
+ *                 replace use with
+ *                 nir_phi_builder_value_get_block_def(var.pb_val)
+ *             foreach def of variable var:
+ *                 create ssa def, register with
+ *                 nir_phi_builder_value_set_block_def(var.pb_val)
+ *
+ *     nir_phi_builder_finish(pb)
+ */
 struct nir_phi_builder;
+
 struct nir_phi_builder_value;

 /* Create a new phi builder.
@@ -43,7 +74,7 @@ struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl);
 */
 struct nir_phi_builder_value *
 nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
-                          const BITSET_WORD *defs);
+                          unsigned bit_size, const BITSET_WORD *defs);

 /* Register a definition for the given value and block.
 *
@@ -207,6 +207,8 @@ print_alu_instr(nir_alu_instr *instr, print_state *state)
   print_alu_dest(&instr->dest, state);

   fprintf(fp, " = %s", nir_op_infos[instr->op].name);
+   if (instr->exact)
+      fprintf(fp, "!");
   if (instr->dest.saturate)
      fprintf(fp, ".sat");
   fprintf(fp, " ");
@@ -714,7 +716,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state)
       * and then print the float in a comment for readability.
       */

-      fprintf(fp, "0x%08x /* %f */", instr->value.u[i], instr->value.f[i]);
+      fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
   }

   fprintf(fp, ")");
@@ -85,7 +85,8 @@ repair_ssa_def(nir_ssa_def *def, void *void_state)
   BITSET_SET(state->def_set, def->parent_instr->block->index);

   struct nir_phi_builder_value *val =
-      nir_phi_builder_add_value(pb, def->num_components, state->def_set);
+      nir_phi_builder_add_value(pb, def->num_components, def->bit_size,
+                                state->def_set);

   nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def);

@@ -62,7 +62,8 @@ alu_instr_is_bool(nir_alu_instr *instr)
   case nir_op_inot:
      return src_is_bool(instr->src[0].src);
   default:
-      return nir_op_infos[instr->op].output_type == nir_type_bool;
+      return (nir_alu_type_get_base_type(nir_op_infos[instr->op].output_type)
+             == nir_type_bool);
   }
 }

@@ -125,8 +126,10 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
            nir_alu_instr *src_alu =
               nir_instr_as_alu(instr->src[src].src.ssa->parent_instr);

-            if (nir_op_infos[src_alu->op].output_type != var->type &&
-                !(var->type == nir_type_bool && alu_instr_is_bool(src_alu)))
+            if (nir_alu_type_get_base_type(nir_op_infos[src_alu->op].output_type) !=
+                var->type &&
+                !(nir_alu_type_get_base_type(var->type) == nir_type_bool &&
+                  alu_instr_is_bool(src_alu)))
               return false;
         }

@@ -158,21 +161,65 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
      nir_load_const_instr *load =
         nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr);

-      switch (nir_op_infos[instr->op].input_types[src]) {
+      switch (const_val->type) {
      case nir_type_float:
         for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.f[new_swizzle[i]] != const_val->data.f)
+            double val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.f32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.f64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.d)
               return false;
         }
         return true;
+
      case nir_type_int:
-      case nir_type_uint:
-      case nir_type_bool:
         for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.i[new_swizzle[i]] != const_val->data.i)
+            int64_t val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.i32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.i64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.i)
               return false;
         }
         return true;
+
+      case nir_type_uint:
+      case nir_type_bool32:
+         for (unsigned i = 0; i < num_components; ++i) {
+            uint64_t val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.u32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.u64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.u)
+               return false;
+         }
+         return true;
+
      default:
         unreachable("Invalid alu source type");
      }
@@ -191,6 +238,10 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
   if (instr->op != expr->opcode)
      return false;

+   assert(instr->dest.dest.is_ssa);
+   if (expr->inexact && instr->exact)
+      return false;
+
   assert(!instr->dest.saturate);
   assert(nir_op_infos[instr->op].num_inputs > 0);

@@ -244,9 +295,123 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
   }
 }

+typedef struct bitsize_tree { /* one node per nir_search_value; holds inferred bit sizes */
+   unsigned num_srcs; /* entries of srcs[] in use; 0 for variables and constants */
+   struct bitsize_tree *srcs[4]; /* sub-tree for each source of an expression */
+
+   unsigned common_size; /* size shared by the unsized srcs/dest; filter passes treat 0 as unknown */
+   bool is_src_sized[4]; /* true when the opcode's input type carries an explicit size */
+   bool is_dest_sized; /* true when the result size is fixed (sized output type, or a variable) */
+
+   unsigned dest_size; /* bit size of this node's result; filter passes treat 0 as unknown */
+   unsigned src_size[4]; /* bit size of each source; filter passes treat 0 as unknown */
+} bitsize_tree;
+
+/* Builds a bitsize_tree that mirrors the given search value.
+ *
+ *  - Expressions record, for every source and for the destination, whether
+ *    the opcode's ALU type carries an explicit size (and that size if it
+ *    does), and recurse into each source.
+ *
+ *  - Variables take the bit size of the source bound to them in
+ *    state->variables, so they are always sized.
+ *
+ *  - Constants have no size of their own and are left unsized.
+ *
+ * The node is zero-allocated.  bitsize_tree_filter_up() and
+ * bitsize_tree_filter_down() interpret a zero common_size, dest_size or
+ * src_size[i] as "not known yet", and this function does not assign those
+ * fields on every path, so they must start out as 0 rather than as
+ * uninitialized memory.
+ *
+ * All nodes are allocated out of mem_ctx.
+ */
+static bitsize_tree *
+build_bitsize_tree(void *mem_ctx, struct match_state *state,
+                   const nir_search_value *value)
+{
+   bitsize_tree *tree = rzalloc(mem_ctx, bitsize_tree);
+
+   switch (value->type) {
+   case nir_search_value_expression: {
+      nir_search_expression *expr = nir_search_value_as_expression(value);
+      const nir_op_info *info = &nir_op_infos[expr->opcode];
+      tree->num_srcs = info->num_inputs;
+      tree->common_size = 0;
+      for (unsigned i = 0; i < info->num_inputs; i++) {
+         /* A non-zero type size means the opcode fixes this source's size. */
+         tree->is_src_sized[i] = !!nir_alu_type_get_type_size(info->input_types[i]);
+         if (tree->is_src_sized[i])
+            tree->src_size[i] = nir_alu_type_get_type_size(info->input_types[i]);
+         tree->srcs[i] = build_bitsize_tree(mem_ctx, state, expr->srcs[i]);
+      }
+      tree->is_dest_sized = !!nir_alu_type_get_type_size(info->output_type);
+      if (tree->is_dest_sized)
+         tree->dest_size = nir_alu_type_get_type_size(info->output_type);
+      break;
+   }
+
+   case nir_search_value_variable: {
+      nir_search_variable *var = nir_search_value_as_variable(value);
+      tree->num_srcs = 0;
+      tree->is_dest_sized = true;
+      tree->dest_size = nir_src_bit_size(state->variables[var->variable].src);
+      break;
+   }
+
+   case nir_search_value_constant: {
+      /* dest_size stays 0 here; it is filled in by the filter passes. */
+      tree->num_srcs = 0;
+      tree->is_dest_sized = false;
+      tree->common_size = 0;
+      break;
+   }
+   }
+
+   return tree;
+}
+
+/* Bottom-up pass of bit-size inference.
+ *
+ * Recurses into every source first and collects the size each one reports.
+ * A source that reports 0 contributes nothing.  For a source whose type is
+ * explicitly sized the reported size must match; for an unsized source the
+ * first non-zero size becomes the node's common_size and later ones must
+ * agree with it.
+ *
+ * If the node has sources and a common size was found, that size is used
+ * for a still-zero dest_size and for every still-zero src_size[] entry.
+ *
+ * Returns the node's dest_size.
+ */
+static unsigned
+bitsize_tree_filter_up(bitsize_tree *tree)
+{
+   for (unsigned s = 0; s < tree->num_srcs; s++) {
+      const unsigned child_size = bitsize_tree_filter_up(tree->srcs[s]);
+
+      if (child_size != 0) {
+         if (tree->is_src_sized[s]) {
+            assert(child_size == tree->src_size[s]);
+         } else {
+            if (tree->common_size == 0) {
+               tree->common_size = child_size;
+            } else {
+               assert(child_size == tree->common_size);
+            }
+            tree->src_size[s] = child_size;
+         }
+      }
+   }
+
+   /* Nothing to propagate for leaves or when no common size is known. */
+   if (tree->num_srcs == 0 || tree->common_size == 0)
+      return tree->dest_size;
+
+   if (tree->dest_size == 0) {
+      tree->dest_size = tree->common_size;
+   } else if (!tree->is_dest_sized) {
+      assert(tree->dest_size == tree->common_size);
+   }
+
+   for (unsigned s = 0; s < tree->num_srcs; s++) {
+      if (tree->src_size[s] == 0)
+         tree->src_size[s] = tree->common_size;
+   }
+
+   return tree->dest_size;
+}
+
+/* Top-down pass of bit-size inference.
+ *
+ * Pushes `size` into this node as its destination size (or checks that it
+ * matches one that is already set).  If the destination is unsized, the
+ * same value is also used as, or checked against, the node's common_size.
+ * Every source whose size is still zero then takes common_size, and the
+ * pass recurses into each source with that source's size.
+ */
+static void
+bitsize_tree_filter_down(bitsize_tree *tree, unsigned size)
+{
+   if (tree->dest_size == 0) {
+      tree->dest_size = size;
+   } else {
+      assert(tree->dest_size == size);
+   }
+
+   if (!tree->is_dest_sized) {
+      if (tree->common_size == 0) {
+         tree->common_size = size;
+      } else {
+         assert(tree->common_size == size);
+      }
+   }
+
+   for (unsigned s = 0; s < tree->num_srcs; s++) {
+      unsigned *child_size = &tree->src_size[s];
+
+      if (*child_size == 0) {
+         assert(tree->common_size);
+         *child_size = tree->common_size;
+      }
+      bitsize_tree_filter_down(tree->srcs[s], *child_size);
+   }
+}
+
 static nir_alu_src
-construct_value(const nir_search_value *value, nir_alu_type type,
-                unsigned num_components, struct match_state *state,
+construct_value(const nir_search_value *value,
+                unsigned num_components, bitsize_tree *bitsize, bool exact,
+                struct match_state *state,
                nir_instr *instr, void *mem_ctx)
 {
   switch (value->type) {
@@ -257,7 +422,9 @@ construct_value(const nir_search_value *value, nir_alu_type type,
         num_components = nir_op_infos[expr->opcode].output_size;

      nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode);
-      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, NULL);
+      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components,
+                        bitsize->dest_size, NULL);
+      alu->exact = exact;
      alu->dest.write_mask = (1 << num_components) - 1;
      alu->dest.saturate = false;

@@ -269,8 +436,7 @@ construct_value(const nir_search_value *value, nir_alu_type type,
            num_components = nir_op_infos[alu->op].input_sizes[i];

         alu->src[i] = construct_value(expr->srcs[i],
-                                       nir_op_infos[alu->op].input_types[i],
-                                       num_components,
+                                       num_components, bitsize->srcs[i], exact,
                                       state, instr, mem_ctx);
      }

@@ -301,23 +467,57 @@ construct_value(const nir_search_value *value, nir_alu_type type,
      const nir_search_constant *c = nir_search_value_as_constant(value);
      nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1);

-      switch (type) {
+      switch (c->type) {
      case nir_type_float:
-         load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f);
-         load->value.f[0] = c->data.f;
+         load->def.name = ralloc_asprintf(load, "%f", c->data.d);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.f32[0] = c->data.d;
+            break;
+         case 64:
+            load->value.f64[0] = c->data.d;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
         break;
+
      case nir_type_int:
-         load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i);
-         load->value.i[0] = c->data.i;
+         load->def.name = ralloc_asprintf(load, "%ld", c->data.i);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.i32[0] = c->data.i;
+            break;
+         case 64:
+            load->value.i64[0] = c->data.i;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
         break;
+
      case nir_type_uint:
-      case nir_type_bool:
-         load->value.u[0] = c->data.u;
+         load->def.name = ralloc_asprintf(load, "%lu", c->data.u);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.u32[0] = c->data.u;
+            break;
+         case 64:
+            load->value.u64[0] = c->data.u;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
+         /* End the uint case here, like the float and int cases above.
+          * Without this break control falls through into the
+          * nir_type_bool32 case and performs its u32[0] store as well,
+          * after the sized store has already been done.
+          */
+         break;
+
+      case nir_type_bool32:
+         load->value.u32[0] = c->data.u;
         break;
      default:
         unreachable("Invalid alu source type");
      }

+      load->def.bit_size = bitsize->dest_size;
+
      nir_instr_insert_before(instr, &load->instr);

      nir_alu_src val;
@@ -352,6 +552,11 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
                         swizzle, &state))
      return NULL;

+   void *bitsize_ctx = ralloc_context(NULL);
+   bitsize_tree *tree = build_bitsize_tree(bitsize_ctx, &state, replace);
+   bitsize_tree_filter_up(tree);
+   bitsize_tree_filter_down(tree, instr->dest.dest.ssa.bit_size);
+
   /* Inserting a mov may be unnecessary.  However, it's much easier to
    * simply let copy propagation clean this up than to try to go through
    * and rewrite swizzles ourselves.
@@ -359,11 +564,12 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
   nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
   mov->dest.write_mask = instr->dest.write_mask;
   nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                     instr->dest.dest.ssa.num_components, NULL);
+                     instr->dest.dest.ssa.num_components,
+                     instr->dest.dest.ssa.bit_size, NULL);

-   mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type,
-                                 instr->dest.dest.ssa.num_components, &state,
-                                 &instr->instr, mem_ctx);
+   mov->src[0] = construct_value(replace,
+                                 instr->dest.dest.ssa.num_components, tree,
+                                 instr->exact, &state, &instr->instr, mem_ctx);
   nir_instr_insert_before(&instr->instr, &mov->instr);

   nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa,
@@ -375,5 +581,7 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
    */
   nir_instr_remove(&instr->instr);

+   ralloc_free(bitsize_ctx);
+
   return mov;
 }
@@ -71,16 +71,24 @@ typedef struct {
 typedef struct {
   nir_search_value value;

+   nir_alu_type type;
+
   union {
-      uint32_t u;
-      int32_t i;
-      float f;
+      uint64_t u;
+      int64_t i;
+      double d;
   } data;
 } nir_search_constant;

 typedef struct {
   nir_search_value value;

+   /* When set on a search expression, the expression will only match an SSA
+    * value that does *not* have the exact bit set.  If unset, the exact bit
+    * on the SSA value is ignored.
+    */
+   bool inexact;
+
   nir_op opcode;
   const nir_search_value *srcs[4];
 } nir_search_expression;
@@ -219,7 +219,9 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
                             state->states[index].num_defs);

   list_del(&dest->reg.def_link);
-   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name);
+   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
+                     reg->bit_size, name);
+   ralloc_free(name);

   /* push our SSA destination on the stack */
   state->states[index].index++;
@@ -271,7 +273,9 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)

      instr->dest.write_mask = (1 << num_components) - 1;
      list_del(&instr->dest.dest.reg.def_link);
-      nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name);
+      nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                        reg->bit_size, name);
+      ralloc_free(name);

      if (nir_op_infos[instr->op].output_size == 0) {
         /*
@@ -179,9 +179,12 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
   nir_alu_src *src = &instr->src[index];

   unsigned num_components;
-   if (src->src.is_ssa)
+   unsigned src_bit_size;
+   if (src->src.is_ssa) {
+      src_bit_size = src->src.ssa->bit_size;
      num_components = src->src.ssa->num_components;
-   else {
+   } else {
+      src_bit_size = src->src.reg.reg->bit_size;
      if (src->src.reg.reg->is_packed)
         num_components = 4; /* can't check anything */
      else
@@ -194,6 +197,24 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
         assert(src->swizzle[i] < num_components);
   }

+   nir_alu_type src_type = nir_op_infos[instr->op].input_types[index];
+
+   /* 8-bit float isn't a thing */
+   if (nir_alu_type_get_base_type(src_type) == nir_type_float)
+      assert(src_bit_size == 16 || src_bit_size == 32 || src_bit_size == 64);
+
+   if (nir_alu_type_get_type_size(src_type)) {
+      /* This source has an explicit bit size */
+      assert(nir_alu_type_get_type_size(src_type) == src_bit_size);
+   } else {
+      if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) {
+         unsigned dest_bit_size =
+            instr->dest.dest.is_ssa ? instr->dest.dest.ssa.bit_size
+                                    : instr->dest.dest.reg.reg->bit_size;
+         assert(dest_bit_size == src_bit_size);
+      }
+   }
+
   validate_src(&src->src, state);
 }

@@ -263,8 +284,10 @@ validate_dest(nir_dest *dest, validate_state *state)
 }

 static void
-validate_alu_dest(nir_alu_dest *dest, validate_state *state)
+validate_alu_dest(nir_alu_instr *instr, validate_state *state)
 {
+   nir_alu_dest *dest = &instr->dest;
+
   unsigned dest_size =
      dest->dest.is_ssa ? dest->dest.ssa.num_components
                        : dest->dest.reg.reg->num_components;
@@ -282,6 +305,17 @@ validate_alu_dest(nir_alu_dest *dest, validate_state *state)
   assert(nir_op_infos[alu->op].output_type == nir_type_float ||
          !dest->saturate);

+   unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size
+                                         : dest->dest.reg.reg->bit_size;
+   nir_alu_type type = nir_op_infos[instr->op].output_type;
+
+   /* 8-bit float isn't a thing */
+   if (nir_alu_type_get_base_type(type) == nir_type_float)
+      assert(bit_size == 16 || bit_size == 32 || bit_size == 64);
+
+   assert(nir_alu_type_get_type_size(type) == 0 ||
+          nir_alu_type_get_type_size(type) == bit_size);
+
   validate_dest(&dest->dest, state);
 }

@@ -294,7 +328,7 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state)
      validate_alu_src(instr, i, state);
   }

-   validate_alu_dest(&instr->dest, state);
+   validate_alu_dest(instr, state);
 }

 static void
@@ -92,7 +92,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
            nir_load_const_instr_create(b->shader, num_components);

         for (unsigned i = 0; i < num_components; i++)
-            load->value.u[i] = constant->value.u[i];
+            load->value.u32[i] = constant->value.u[i];

         nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
         val->def = &load->def;
@@ -109,7 +109,7 @@ vtn_const_ssa_value(struct vtn_builder *b, nir_constant *constant,
               nir_load_const_instr_create(b->shader, rows);

            for (unsigned j = 0; j < rows; j++)
-               load->value.u[j] = constant->value.u[rows * i + j];
+               load->value.u32[j] = constant->value.u[rows * i + j];

            nir_instr_insert_before_cf_list(&b->impl->body, &load->instr);
            col_val->def = &load->def;
@@ -1035,6 +1035,8 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
         nir_op op = vtn_nir_alu_op_for_spirv_opcode(opcode, &swap);

         unsigned num_components = glsl_get_vector_elements(val->const_type);
+         unsigned bit_size =
+            glsl_get_bit_size(glsl_get_base_type(val->const_type));

         nir_const_value src[3];
         assert(count <= 7);
@@ -1043,14 +1045,16 @@ vtn_handle_constant(struct vtn_builder *b, SpvOp opcode,
               vtn_value(b, w[4 + i], vtn_value_type_constant)->constant;

            unsigned j = swap ? 1 - i : i;
+            assert(bit_size == 32);
            for (unsigned k = 0; k < num_components; k++)
-               src[j].u[k] = c->value.u[k];
+               src[j].u32[k] = c->value.u[k];
         }

-         nir_const_value res = nir_eval_const_opcode(op, num_components, src);
+         nir_const_value res = nir_eval_const_opcode(op, num_components,
+                                                     bit_size, src);

         for (unsigned k = 0; k < num_components; k++)
-            val->constant->value.u[k] = res.u[k];
+            val->constant->value.u[k] = res.u32[k];

         return;
      } /* default */
@@ -1414,7 +1418,7 @@ vtn_handle_texture(struct vtn_builder *b, SpvOp opcode,
   }

   nir_ssa_dest_init(&instr->instr, &instr->dest,
-                     nir_tex_instr_dest_size(instr), NULL);
+                     nir_tex_instr_dest_size(instr), 32, NULL);

   assert(glsl_get_vector_elements(ret_type->type) ==
          nir_tex_instr_dest_size(instr));
@@ -1600,7 +1604,7 @@ vtn_handle_image(struct vtn_builder *b, SpvOp opcode,
   if (opcode != SpvOpImageWrite) {
      struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
      struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
-      nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, NULL);
+      nir_ssa_dest_init(&intrin->instr, &intrin->dest, 4, 32, NULL);

      nir_builder_instr_insert(&b->nb, &intrin->instr);

@@ -1738,7 +1742,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
      fill_common_atomic_sources(b, opcode, w, &atomic->src[2]);
   }

-   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, NULL);
+   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);

   struct vtn_type *type = vtn_value(b, w[1], vtn_value_type_type)->type;
   struct vtn_value *val = vtn_push_value(b, w[2], vtn_value_type_ssa);
@@ -1750,7 +1754,7 @@ vtn_handle_ssbo_or_shared_atomic(struct vtn_builder *b, SpvOp opcode,
 }

 static nir_alu_instr *
-create_vec(nir_shader *shader, unsigned num_components)
+create_vec(nir_shader *shader, unsigned num_components, unsigned bit_size)
 {
   nir_op op;
   switch (num_components) {
@@ -1762,7 +1766,8 @@ create_vec(nir_shader *shader, unsigned num_components)
   }

   nir_alu_instr *vec = nir_alu_instr_create(shader, op);
-   nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&vec->instr, &vec->dest.dest, num_components,
+                     bit_size, NULL);
   vec->dest.write_mask = (1 << num_components) - 1;

   return vec;
@@ -1779,7 +1784,8 @@ vtn_ssa_transpose(struct vtn_builder *b, struct vtn_ssa_value *src)

   for (unsigned i = 0; i < glsl_get_matrix_columns(dest->type); i++) {
      nir_alu_instr *vec = create_vec(b->shader,
-                                      glsl_get_matrix_columns(src->type));
+                                      glsl_get_matrix_columns(src->type),
+                                      glsl_get_bit_size(glsl_get_base_type(src->type)));
      if (glsl_type_is_vector_or_scalar(src->type)) {
          vec->src[0].src = nir_src_for_ssa(src->def);
          vec->src[0].swizzle[0] = i;
@@ -1809,7 +1815,8 @@ nir_ssa_def *
 vtn_vector_insert(struct vtn_builder *b, nir_ssa_def *src, nir_ssa_def *insert,
                  unsigned index)
 {
-   nir_alu_instr *vec = create_vec(b->shader, src->num_components);
+   nir_alu_instr *vec = create_vec(b->shader, src->num_components,
+                                   src->bit_size);

   for (unsigned i = 0; i < src->num_components; i++) {
      if (i == index) {
@@ -1854,7 +1861,7 @@ vtn_vector_shuffle(struct vtn_builder *b, unsigned num_components,
                   nir_ssa_def *src0, nir_ssa_def *src1,
                   const uint32_t *indices)
 {
-   nir_alu_instr *vec = create_vec(b->shader, num_components);
+   nir_alu_instr *vec = create_vec(b->shader, num_components, src0->bit_size);

   nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(b->shader, 1);
   nir_builder_instr_insert(&b->nb, &undef->instr);
@@ -1884,7 +1891,8 @@ static nir_ssa_def *
 vtn_vector_construct(struct vtn_builder *b, unsigned num_components,
                     unsigned num_srcs, nir_ssa_def **srcs)
 {
-   nir_alu_instr *vec = create_vec(b->shader, num_components);
+   nir_alu_instr *vec = create_vec(b->shader, num_components,
+                                   srcs[0]->bit_size);

   unsigned dest_idx = 0;
   for (unsigned i = 0; i < num_srcs; i++) {
@@ -627,7 +627,9 @@ handle_glsl450_alu(struct vtn_builder *b, enum GLSLstd450 entrypoint,

   nir_alu_instr *instr = nir_alu_instr_create(b->shader, op);
   nir_ssa_dest_init(&instr->instr, &instr->dest.dest,
-                     glsl_get_vector_elements(val->ssa->type), val->name);
+                     glsl_get_vector_elements(val->ssa->type),
+                     glsl_get_bit_size(glsl_get_base_type(val->ssa->type)),
+                     val->name);
   instr->dest.write_mask = (1 << instr->dest.dest.ssa.num_components) - 1;
   val->ssa->def = &instr->dest.dest.ssa;

@@ -190,7 +190,9 @@ _vtn_local_load_store(struct vtn_builder *b, bool load, nir_deref_var *deref,

      if (load) {
         nir_ssa_dest_init(&intrin->instr, &intrin->dest,
-                           intrin->num_components, NULL);
+                           intrin->num_components,
+                           glsl_get_bit_size(glsl_get_base_type(tail->type)),
+                           NULL);
         inout->def = &intrin->dest.ssa;
      } else {
         nir_intrinsic_set_write_mask(intrin, (1 << intrin->num_components) - 1);
@@ -322,7 +324,7 @@ get_vulkan_resource_index(struct vtn_builder *b, struct vtn_access_chain *chain,
   nir_intrinsic_set_desc_set(instr, chain->var->descriptor_set);
   nir_intrinsic_set_binding(instr, chain->var->binding);

-   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
   nir_builder_instr_insert(&b->nb, &instr->instr);

   return &instr->dest.ssa;
@@ -411,7 +413,8 @@ _vtn_load_store_tail(struct vtn_builder *b, nir_intrinsic_op op, bool load,

   if (load) {
      nir_ssa_dest_init(&instr->instr, &instr->dest,
-                        instr->num_components, NULL);
+                        instr->num_components,
+                        glsl_get_bit_size(glsl_get_base_type(type)), NULL);
      (*inout)->def = &instr->dest.ssa;
   }

@@ -1385,7 +1388,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
         nir_intrinsic_instr_create(b->nb.shader,
                                    nir_intrinsic_get_buffer_size);
      instr->src[0] = nir_src_for_ssa(index);
-      nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+      nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
      nir_builder_instr_insert(&b->nb, &instr->instr);
      nir_ssa_def *buf_size = &instr->dest.ssa;

@@ -80,6 +80,27 @@ enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type);
 unsigned glsl_get_record_location_offset(const struct glsl_type *type,
                                         unsigned length);

+/* Returns the bit size associated with a GLSL base type: 32 for INT, UINT,
+ * BOOL, FLOAT and SUBROUTINE, and 64 for DOUBLE.  Any other base type hits
+ * unreachable().
+ */
+static inline unsigned
+glsl_get_bit_size(enum glsl_base_type type)
+{
+   switch (type) {
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_FLOAT: /* TODO handle mediump */
+   case GLSL_TYPE_SUBROUTINE:
+      return 32;
+
+   case GLSL_TYPE_DOUBLE:
+      return 64;
+
+   default:
+      unreachable("unknown base type");
+   }
+
+   /* Not reached from the cases above: each one either returns or hits
+    * unreachable().
+    */
+   return 0;
+}
+
 bool glsl_type_is_void(const struct glsl_type *type);
 bool glsl_type_is_error(const struct glsl_type *type);
 bool glsl_type_is_vector(const struct glsl_type *type);
@@ -44,7 +44,6 @@
 #include "egllog.h"


-#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))


 /**
@@ -40,9 +40,16 @@ extern "C" {

 #define _EGL_MAX_EXTENSIONS_LEN 1000

+/* Hardcoded, conservative maximum pbuffer dimensions.  The surface width
+ * and height are clamped to these when EGL_LARGEST_PBUFFER is in use.
+ */
+#define _EGL_MAX_PBUFFER_WIDTH 4096
+#define _EGL_MAX_PBUFFER_HEIGHT 4096
+
 #define _EGL_VENDOR_STRING "Mesa Project"

 #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))

 #ifdef __cplusplus
 }
@@ -307,6 +307,12 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
   if (err != EGL_SUCCESS)
      return _eglError(err, func);

+   /* if EGL_LARGEST_PBUFFER in use, clamp width and height */
+   if (surf->LargestPbuffer) {
+      surf->Width = MIN2(surf->Width, _EGL_MAX_PBUFFER_WIDTH);
+      surf->Height = MIN2(surf->Height, _EGL_MAX_PBUFFER_HEIGHT);
+   }
+
   return EGL_TRUE;
 }

@@ -206,12 +206,6 @@ static unsigned tgsi_gs_run(struct draw_geometry_shader *shader,
 {
   struct tgsi_exec_machine *machine = shader->machine;

-   tgsi_set_exec_mask(machine,
-                      1,
-                      input_primitives > 1,
-                      input_primitives > 2,
-                      input_primitives > 3);
-
   /* run interpreter */
   tgsi_exec_machine_run(machine);

@@ -264,11 +264,11 @@ aa_transform_epilog(struct tgsi_transform_context *ctx)
   if (aactx->colorOutput != -1) {
      /* insert texture sampling code for antialiasing. */

-      /* TEX texTemp, input_coord, sampler */
-      tgsi_transform_tex_2d_inst(ctx,
-                                 TGSI_FILE_TEMPORARY, aactx->texTemp,
-                                 TGSI_FILE_INPUT, aactx->maxInput + 1,
-                                 aactx->freeSampler);
+      /* TEX texTemp, input_coord, sampler, 2D */
+      tgsi_transform_tex_inst(ctx,
+                              TGSI_FILE_TEMPORARY, aactx->texTemp,
+                              TGSI_FILE_INPUT, aactx->maxInput + 1,
+                              TGSI_TEXTURE_2D, aactx->freeSampler);

      /* MOV rgb */
      tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
@@ -159,12 +159,6 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
         input = (const float (*)[4])((const char *)input + input_stride);
      } 

-      tgsi_set_exec_mask(machine,
-                         1,
-                         max_vertices > 1,
-                         max_vertices > 2,
-                         max_vertices > 3);
-
      /* run interpreter */
      tgsi_exec_machine_run( machine );

@@ -1191,6 +1191,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
         "FRAG\n"
         "DCL IN[0], GENERIC[0], LINEAR\n"
         "DCL SAMP[0]\n"
+         "DCL SVIEW[0], RECT, FLOAT\n"
         "DCL OUT[0], COLOR[0]\n"
         "DCL TEMP[0]\n"

@@ -459,7 +459,7 @@ ttn_emit_immediate(struct ttn_compile *c)
   c->next_imm++;

   for (i = 0; i < 4; i++)
-      load_const->value.u[i] = tgsi_imm->u[i].Uint;
+      load_const->value.u32[i] = tgsi_imm->u[i].Uint;

   nir_builder_instr_insert(b, &load_const->instr);
 }
@@ -515,8 +515,8 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
                                           nir_intrinsic_load_var);
         load->num_components = 4;
         load->variables[0] = ttn_array_deref(c, load, var, offset, indirect);
-
-         nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+         nir_ssa_dest_init(&load->instr, &load->dest,
+                           4, 32, NULL);
         nir_builder_instr_insert(b, &load->instr);

         src = nir_src_for_ssa(&load->dest.ssa);
@@ -567,7 +567,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
      load = nir_intrinsic_instr_create(b->shader, op);
      load->num_components = ncomp;

-      nir_ssa_dest_init(&load->instr, &load->dest, ncomp, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, ncomp, 32, NULL);
      nir_builder_instr_insert(b, &load->instr);

      src = nir_src_for_ssa(&load->dest.ssa);
@@ -632,7 +632,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
      }
      load->src[srcn++] = nir_src_for_ssa(offset);

-      nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
      nir_builder_instr_insert(b, &load->instr);

      src = nir_src_for_ssa(&load->dest.ssa);
@@ -1425,7 +1425,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)

   assert(src_number == num_srcs);

-   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
   nir_builder_instr_insert(b, &instr->instr);

   /* Resolve the writemask on the texture op. */
@@ -1464,10 +1464,10 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
   txs->src[0].src = nir_src_for_ssa(ttn_channel(b, src[0], X));
   txs->src[0].src_type = nir_tex_src_lod;

-   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, 32, NULL);
   nir_builder_instr_insert(b, &txs->instr);

-   nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, NULL);
+   nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32, NULL);
   nir_builder_instr_insert(b, &qlv->instr);

   ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ);
@@ -23,8 +23,6 @@

 #include "compiler/nir/nir.h"

-struct nir_shader_compiler_options *options;
-
 struct nir_shader *
 tgsi_to_nir(const void *tgsi_tokens,
            const struct nir_shader_compiler_options *options);
@@ -33,6 +33,7 @@ static const char nored[] = "FRAG\n"
   "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL TEMP[0]\n"
   "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
   "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -46,6 +47,7 @@ static const char nogreen[] = "FRAG\n"
   "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL TEMP[0]\n"
   "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
   "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -59,6 +61,7 @@ static const char noblue[] = "FRAG\n"
   "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL TEMP[0]\n"
   "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
   "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -50,6 +50,7 @@ static const char depth1fs[] = "FRAG\n"
   "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL TEMP[0..2]\n"
   "IMM FLT32 {    0.0030,     0.0000,     1.0000,     0.0000}\n"
   "  0: TEX TEMP[0].x, IN[1].xyyy, SAMP[0], 2D\n"
@@ -80,6 +81,7 @@ static const char color1fs[] = "FRAG\n"
   "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL TEMP[0..2]\n"
   "IMM FLT32 {    0.2126,     0.7152,     0.0722,     0.1000}\n"
   "IMM FLT32 {    1.0000,     0.0000,     0.0000,     0.0000}\n"
@@ -112,6 +114,7 @@ static const char neigh3fs[] = "FRAG\n"
   "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL SAMP[1]\n"
   "DCL TEMP[0..8]\n"
   "IMM FLT32 {    1.0000,     0.00001,     0.0000,     0.0000}\n"
@@ -175,8 +178,11 @@ static const char blend2fs_1[] = "FRAG\n"
   "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
   "DCL OUT[0], COLOR\n"
   "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
   "DCL SAMP[1]\n"
+   "DCL SVIEW[1], 2D, FLOAT\n"
   "DCL SAMP[2]\n"
+   "DCL SVIEW[2], 2D, FLOAT\n"
   "DCL CONST[0]\n"
   "DCL TEMP[0..6]\n"
   "IMM FLT32 {    0.0000,    -0.2500,     0.00609756,     0.5000}\n"
@@ -111,7 +111,7 @@ tgsi_default_declaration( void )
   declaration.Local = 0;
   declaration.Array = 0;
   declaration.Atomic = 0;
-   declaration.Shared = 0;
+   declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
   declaration.Padding = 0;

   return declaration;
@@ -127,6 +127,8 @@ tgsi_build_declaration(
   unsigned invariant,
   unsigned local,
   unsigned array,
+   unsigned atomic,
+   unsigned mem_type,
   struct tgsi_header *header )
 {
   struct tgsi_declaration declaration;
@@ -143,6 +145,8 @@ tgsi_build_declaration(
   declaration.Invariant = invariant;
   declaration.Local = local;
   declaration.Array = array;
+   declaration.Atomic = atomic;
+   declaration.MemType = mem_type;
   header_bodysize_grow( header );

   return declaration;
@@ -401,6 +405,8 @@ tgsi_build_full_declaration(
      full_decl->Declaration.Invariant,
      full_decl->Declaration.Local,
      full_decl->Declaration.Array,
+      full_decl->Declaration.Atomic,
+      full_decl->Declaration.MemType,
      header );

   if (maxsize <= size)
@@ -775,6 +781,8 @@ tgsi_default_instruction_memory( void )
   struct tgsi_instruction_memory instruction_memory;

   instruction_memory.Qualifier = 0;
+   instruction_memory.Texture = 0;
+   instruction_memory.Format = 0;
   instruction_memory.Padding = 0;

   return instruction_memory;
@@ -790,6 +798,8 @@ tgsi_build_instruction_memory(
   struct tgsi_instruction_memory instruction_memory;

   instruction_memory.Qualifier = qualifier;
+   instruction_memory.Texture = 0;
+   instruction_memory.Format = 0;
   instruction_memory.Padding = 0;
   instruction->Memory = 1;

@@ -365,8 +365,13 @@ iter_declaration(
   }

   if (decl->Declaration.File == TGSI_FILE_MEMORY) {
-      if (decl->Declaration.Shared)
-         TXT(", SHARED");
+      switch (decl->Declaration.MemType) {
+      /* Note: ,GLOBAL is optional / the default */
+      case TGSI_MEMORY_TYPE_GLOBAL:  TXT(", GLOBAL");  break;
+      case TGSI_MEMORY_TYPE_SHARED:  TXT(", SHARED");  break;
+      case TGSI_MEMORY_TYPE_PRIVATE: TXT(", PRIVATE"); break;
+      case TGSI_MEMORY_TYPE_INPUT:   TXT(", INPUT");   break;
+      }
   }

   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
@@ -196,10 +196,6 @@ struct tgsi_sampler
 #define TGSI_EXEC_TEMP_HALF_I       (TGSI_EXEC_NUM_TEMPS + 3)
 #define TGSI_EXEC_TEMP_HALF_C       0

-/* execution mask, each value is either 0 or ~0 */
-#define TGSI_EXEC_MASK_I            (TGSI_EXEC_NUM_TEMPS + 3)
-#define TGSI_EXEC_MASK_C            1
-
 /* 4 register buffer for various purposes */
 #define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
 #define TGSI_EXEC_NUM_TEMP_R        4
@@ -397,27 +393,6 @@ boolean
 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst);


-static inline void
-tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
-{
-   mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
-      mask;
-}
-
-
-/** Set execution mask values prior to executing the shader */
-static inline void
-tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
-                   boolean ch0, boolean ch1, boolean ch2, boolean ch3)
-{
-   int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
-   mask[0] = ch0 ? ~0 : 0;
-   mask[1] = ch1 ? ~0 : 0;
-   mask[2] = ch2 ? ~0 : 0;
-   mask[3] = ch3 ? ~0 : 0;
-}
-
-
 extern void
 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
                               unsigned num_bufs,
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
+#include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_scan.h"
@@ -192,8 +193,17 @@ scan_instruction(struct tgsi_shader_info *info,
         }
      }

-      if (is_memory_file(src->Register.File))
+      if (is_memory_file(src->Register.File)) {
         is_mem_inst = true;
+
+         if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store) {
+            info->writes_memory = TRUE;
+
+            if (src->Register.File == TGSI_FILE_IMAGE &&
+                !src->Register.Indirect)
+               info->images_writemask |= 1 << src->Register.Index;
+         }
+      }
   }

   /* check for indirect register writes */
@@ -204,8 +214,16 @@ scan_instruction(struct tgsi_shader_info *info,
         info->indirect_files_written |= (1 << dst->Register.File);
      }

-      if (is_memory_file(dst->Register.File))
+      if (is_memory_file(dst->Register.File)) {
+         assert(fullinst->Instruction.Opcode == TGSI_OPCODE_STORE);
+
         is_mem_inst = true;
+         info->writes_memory = TRUE;
+
+         if (dst->Register.File == TGSI_FILE_IMAGE &&
+             !dst->Register.Indirect)
+            info->images_writemask |= 1 << dst->Register.Index;
+      }
   }

   if (is_mem_inst)
@@ -413,6 +431,9 @@ scan_declaration(struct tgsi_shader_info *info,
         }
      } else if (file == TGSI_FILE_SAMPLER) {
         info->samplers_declared |= 1 << reg;
+      } else if (file == TGSI_FILE_IMAGE) {
+         if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
+            info->images_buffers |= 1 << reg;
      }
   }
 }
@@ -111,12 +111,22 @@ struct tgsi_shader_info
   boolean writes_clipvertex;
   boolean writes_viewport_index;
   boolean writes_layer;
+   boolean writes_memory; /**< contains stores or atomics to buffers or images */
   boolean is_msaa_sampler[PIPE_MAX_SAMPLERS];
   boolean uses_doubles; /**< uses any of the double instructions */
   unsigned clipdist_writemask;
   unsigned culldist_writemask;
   unsigned num_written_culldistance;
   unsigned num_written_clipdistance;
+   /**
+    * Bitmask indicating which images are written to (STORE / ATOM*).
+    * Indirect image accesses are not reflected in this mask.
+    */
+   unsigned images_writemask;
+   /**
+    * Bitmask indicating which declared images are buffers.
+    */
+   unsigned images_buffers;
   /**
    * Bitmask indicating which register files are accessed with
    * indirect addressing.  The bits are (1 << TGSI_FILE_x), etc.
@@ -145,6 +145,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
   "NUM_CLIPDIST_ENABLED",
   "NUM_CULLDIST_ENABLED",
   "FS_EARLY_DEPTH_STENCIL",
+   "NEXT_SHADER",
 };

 const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
@@ -1390,8 +1390,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
            ctx->cur = cur;
         }
      } else if (file == TGSI_FILE_MEMORY) {
-         if (str_match_nocase_whole(&cur, "SHARED")) {
-            decl.Declaration.Shared = 1;
+         if (str_match_nocase_whole(&cur, "GLOBAL")) {
+            /* Note: this is a no-op, since global is the default. */
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "SHARED")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_SHARED;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "PRIVATE")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_PRIVATE;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "INPUT")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_INPUT;
            ctx->cur = cur;
         }
      } else {
@@ -301,6 +301,40 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
 }


+/** Emit an instruction with one destination and three source registers. */
+static inline void
+tgsi_transform_op3_inst(struct tgsi_transform_context *ctx,
+                        unsigned opcode,
+                        unsigned dst_file,
+                        unsigned dst_index,
+                        unsigned dst_writemask,
+                        unsigned src0_file,
+                        unsigned src0_index,
+                        unsigned src1_file,
+                        unsigned src1_index,
+                        unsigned src2_file,
+                        unsigned src2_index)
+{
+   struct tgsi_full_instruction inst;
+
+   inst = tgsi_default_full_instruction();
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.Dst[0].Register.File = dst_file;
+   inst.Dst[0].Register.Index = dst_index;
+   inst.Dst[0].Register.WriteMask = dst_writemask;
+   inst.Instruction.NumSrcRegs = 3;
+   inst.Src[0].Register.File = src0_file;
+   inst.Src[0].Register.Index = src0_index;
+   inst.Src[1].Register.File = src1_file;
+   inst.Src[1].Register.Index = src1_index;
+   inst.Src[2].Register.File = src2_file;
+   inst.Src[2].Register.Index = src2_index;
+
+   ctx->emit_instruction(ctx, &inst);
+}
+
+
 static inline void
 tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
                            unsigned opcode,
@@ -482,15 +516,18 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,


 static inline void
-tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
-                           unsigned dst_file,
-                           unsigned dst_index,
-                           unsigned src_file,
-                           unsigned src_index,
-                           unsigned sampler_index)
+tgsi_transform_tex_inst(struct tgsi_transform_context *ctx,
+                        unsigned dst_file,
+                        unsigned dst_index,
+                        unsigned src_file,
+                        unsigned src_index,
+                        unsigned tex_target,
+                        unsigned sampler_index)
 {
   struct tgsi_full_instruction inst;

+   assert(tex_target < TGSI_TEXTURE_COUNT);
+
   inst = tgsi_default_full_instruction();
   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
   inst.Instruction.NumDstRegs = 1;
@@ -498,7 +535,7 @@ tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
   inst.Dst[0].Register.Index = dst_index;
   inst.Instruction.NumSrcRegs = 2;
   inst.Instruction.Texture = TRUE;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
+   inst.Texture.Texture = tex_target;
   inst.Src[0].Register.File = src_file;
   inst.Src[0].Register.Index = src_index;
   inst.Src[1].Register.File = TGSI_FILE_SAMPLER;
@@ -101,6 +101,7 @@ struct ureg_program
 {
   unsigned processor;
   bool supports_any_inout_decl_range;
+   int next_shader_processor;

   struct {
      unsigned semantic_name;
@@ -190,7 +191,7 @@ struct ureg_program

   struct ureg_tokens domain[2];

-   bool use_shared_memory;
+   bool use_memory[TGSI_MEMORY_TYPE_COUNT];
 };

 static union tgsi_any_token error_tokens[32];
@@ -729,13 +730,14 @@ struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
   return reg;
 }

-/* Allocate a shared memory area.
+/* Allocate a memory area.
 */
-struct ureg_src ureg_DECL_shared_memory(struct ureg_program *ureg)
+struct ureg_src ureg_DECL_memory(struct ureg_program *ureg,
+                                 unsigned memory_type)
 {
-   struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, 0);
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, memory_type);

-   ureg->use_shared_memory = true;
+   ureg->use_memory[memory_type] = true;
   return reg;
 }

@@ -1672,7 +1674,7 @@ emit_decl_buffer(struct ureg_program *ureg,
 }

 static void
-emit_decl_shared_memory(struct ureg_program *ureg)
+emit_decl_memory(struct ureg_program *ureg, unsigned memory_type)
 {
   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);

@@ -1681,11 +1683,11 @@ emit_decl_shared_memory(struct ureg_program *ureg)
   out[0].decl.NrTokens = 2;
   out[0].decl.File = TGSI_FILE_MEMORY;
   out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
-   out[0].decl.Shared = true;
+   out[0].decl.MemType = memory_type;

   out[1].value = 0;
-   out[1].decl_range.First = 0;
-   out[1].decl_range.Last = 0;
+   out[1].decl_range.First = memory_type;
+   out[1].decl_range.Last = memory_type;
 }

 static void
@@ -1860,8 +1862,10 @@ static void emit_decls( struct ureg_program *ureg )
      emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
   }

-   if (ureg->use_shared_memory)
-      emit_decl_shared_memory(ureg);
+   for (i = 0; i < TGSI_MEMORY_TYPE_COUNT; i++) {
+      if (ureg->use_memory[i])
+         emit_decl_memory(ureg, i);
+   }

   if (ureg->const_decls.nr_constant_ranges) {
      for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
@@ -1966,6 +1970,16 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
 {
   const struct tgsi_token *tokens;

+   switch (ureg->processor) {
+   case TGSI_PROCESSOR_VERTEX:
+   case TGSI_PROCESSOR_TESS_EVAL:
+      ureg_property(ureg, TGSI_PROPERTY_NEXT_SHADER,
+                    ureg->next_shader_processor == -1 ?
+                       TGSI_PROCESSOR_FRAGMENT :
+                       ureg->next_shader_processor);
+      break;
+   }
+
   emit_header( ureg );
   emit_decls( ureg );
   copy_instructions( ureg );
@@ -2079,6 +2093,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen)
      screen->get_shader_param(screen,
                               util_pipe_shader_from_tgsi_processor(processor),
                               PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0;
+   ureg->next_shader_processor = -1;

   for (i = 0; i < Elements(ureg->properties); i++)
      ureg->properties[i] = ~0;
@@ -2108,6 +2123,13 @@ no_ureg:
 }


+void
+ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor)
+{
+   ureg->next_shader_processor = processor; /* overrides the -1 default */
+}
+
+
 unsigned
 ureg_get_nr_outputs( const struct ureg_program *ureg )
 {
@@ -114,6 +114,8 @@ ureg_create_shader( struct ureg_program *,
                    struct pipe_context *pipe,
 		    const struct pipe_stream_output_info *so );

+void
+ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor);

 /* Alternately, return the built token stream and hand ownership of
 * that memory to the caller:
@@ -338,7 +340,7 @@ struct ureg_src
 ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);

 struct ureg_src
-ureg_DECL_shared_memory(struct ureg_program *ureg);
+ureg_DECL_memory(struct ureg_program *ureg, unsigned memory_type);

 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
@@ -344,11 +344,11 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
                           pctx->wincoordFile, wincoordInput,
                           TGSI_FILE_IMMEDIATE, pctx->numImmed);

-   /* TEX texTemp, texTemp, sampler; */
-   tgsi_transform_tex_2d_inst(ctx,
-                              TGSI_FILE_TEMPORARY, texTemp,
-                              TGSI_FILE_TEMPORARY, texTemp,
-                              sampIdx);
+   /* TEX texTemp, texTemp, sampler, 2D; */
+   tgsi_transform_tex_inst(ctx,
+                           TGSI_FILE_TEMPORARY, texTemp,
+                           TGSI_FILE_TEMPORARY, texTemp,
+                           TGSI_TEXTURE_2D, sampIdx);

   /* KILL_IF -texTemp;   # if -texTemp < 0, kill fragment */
   tgsi_transform_kill_inst(ctx,
@@ -646,6 +646,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
         "FRAG\n"
         "DCL IN[0], GENERIC[0], LINEAR\n"
         "DCL SAMP[0..1]\n"
+         "DCL SVIEW[0..1], %s, FLOAT\n"
         "DCL OUT[0], POSITION\n"
         "DCL OUT[1], STENCIL\n"
         "DCL TEMP[0]\n"
@@ -663,7 +664,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
   assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA ||
          tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA);

-   sprintf(text, shader_templ, type, type);
+   sprintf(text, shader_templ, type, type, type);

   if (!tgsi_text_translate(text, tokens, Elements(tokens))) {
      assert(0);
@@ -3213,6 +3213,14 @@ Whether depth test, stencil test, and occlusion query should run before
 the fragment shader (regardless of fragment shader side effects). Corresponds
 to GLSL early_fragment_tests.

+NEXT_SHADER
+"""""""""""
+
+The shader stage that will MOST LIKELY follow this shader when the shader is
+bound. This is only a hint to the driver and doesn't have to be precise.
+It is only set for VS and TES.
+
+
 Texture Sampling and Texture Formats
 ------------------------------------

@@ -1017,7 +1017,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,

 	const_offset = nir_src_as_const_value(intr->src[1]);
 	if (const_offset) {
-		off += const_offset->u[0];
+		off += const_offset->u32[0];
 	} else {
 		/* For load_ubo_indirect, second src is indirect offset: */
 		src1 = get_src(ctx, &intr->src[1])[0];
@@ -1159,7 +1159,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = create_uniform(ctx, n);
@@ -1186,7 +1186,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = ctx->ir->inputs[n];
@@ -1213,7 +1213,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[1]);
 		compile_assert(ctx, const_offset != NULL);
-		idx += const_offset->u[0];
+		idx += const_offset->u32[0];

 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
@@ -1301,7 +1301,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
 	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
 			instr->def.num_components);
 	for (int i = 0; i < instr->def.num_components; i++)
-		dst[i] = create_immed(ctx->block, instr->value.u[i]);
+		dst[i] = create_immed(ctx->block, instr->value.u32[i]);
 }

 static void
@@ -290,7 +290,7 @@ lower_if_else_block(nir_block *block, void *void_state)
 		}

 		nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
-				phi->dest.ssa.num_components, phi->dest.ssa.name);
+				phi->dest.ssa.num_components, 32, phi->dest.ssa.name);
 		sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;

 		nir_ssa_def_rewrite_uses(&phi->dest.ssa,
@@ -160,7 +160,7 @@ struct nv50_ir_prog_info
      uint8_t clipDistances;     /* number of clip distance outputs */
      uint8_t cullDistances;     /* number of cull distance outputs */
      int8_t genUserClip;        /* request user clip planes for ClipVertex */
-      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
+      uint8_t auxCBSlot;         /* driver constant buffer slot */
      uint16_t ucpBase;          /* base address for UCPs */
      uint16_t drawInfoBase;     /* base address for draw parameters */
      uint8_t pointSize;         /* output index for PointSize */
@@ -175,7 +175,6 @@ struct nv50_ir_prog_info
      uint8_t globalAccess;      /* 1 for read, 2 for wr, 3 for rw */
      bool fp64;                 /* program uses fp64 math */
      bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
-      uint8_t resInfoCBSlot;     /* cX[] used for tex handles, surface info */
      uint16_t texBindBase;      /* base address for tex handles (nve4) */
      uint16_t suInfoBase;       /* base address for surface info (nve4) */
      uint16_t sampleInfoBase;   /* base address for sample positions */
@@ -1655,10 +1655,8 @@ CodeEmitterGK110::emitSTORE(const Instruction *i)
      break;
   }

-   if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
-      offset &= 0xffffff;
-
   if (code[0] & 0x2) {
+      offset &= 0xffffff;
      emitLoadStoreType(i->dType, 0x33);
      if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
         emitCachingMode(i->cache, 0x2f);
@@ -1634,7 +1634,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
   code[1] |= (i->tex.mask & 0xc) << 12;

   if (i->tex.liveOnly)
-      code[1] |= 4;
+      code[1] |= 1 << 2;
+   if (i->tex.derivAll)
+      code[1] |= 1 << 3;

   defId(i->def(0), 2);

@@ -856,15 +856,17 @@ public:
   };
   std::vector<TextureView> textureViews;

+   /*
   struct Resource {
      uint8_t target; // TGSI_TEXTURE_*
      bool raw;
      uint8_t slot; // $surface index
   };
   std::vector<Resource> resources;
+   */

   struct MemoryFile {
-      bool shared;
+      uint8_t mem_type; // TGSI_MEMORY_TYPE_*
   };
   std::vector<MemoryFile> memoryFiles;

@@ -1037,6 +1039,9 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
   case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
      info->io.cullDistances = prop->u[0].Data;
      break;
+   case TGSI_PROPERTY_NEXT_SHADER:
+      /* Do not need to know the next shader stage. */
+      break;
   default:
      INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
      break;
@@ -1222,7 +1227,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
      break;
   case TGSI_FILE_MEMORY:
      for (i = first; i <= last; ++i)
-         memoryFiles[i].shared = decl->Declaration.Shared;
+         memoryFiles[i].mem_type = decl->Declaration.MemType;
      break;
   case TGSI_FILE_NULL:
   case TGSI_FILE_TEMPORARY:
@@ -1261,9 +1266,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
      info->numBarriers = 1;

   if (insn.dstCount()) {
-      if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) {
-         Instruction::DstRegister dst = insn.getDst(0);
+      Instruction::DstRegister dst = insn.getDst(0);

+      if (dst.getFile() == TGSI_FILE_OUTPUT) {
         if (dst.isIndirect(0))
            for (unsigned i = 0; i < info->numOutputs; ++i)
               info->out[i].mask = 0xf;
@@ -1280,11 +1285,11 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
         if (isEdgeFlagPassthrough(insn))
            info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
      } else
-      if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
-         if (insn.getDst(0).isIndirect(0))
-            indirectTempArrays.insert(insn.getDst(0).getArrayId());
+      if (dst.getFile() == TGSI_FILE_TEMPORARY) {
+         if (dst.isIndirect(0))
+            indirectTempArrays.insert(dst.getArrayId());
      } else
-      if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) {
+      if (dst.getFile() == TGSI_FILE_BUFFER) {
         info->io.globalAccess |= 0x2;
      }
   }
@@ -1419,8 +1424,8 @@ private:
   void handleLIT(Value *dst0[4]);
   void handleUserClipPlanes();

-   Symbol *getResourceBase(int r);
-   void getResourceCoords(std::vector<Value *>&, int r, int s);
+   // Symbol *getResourceBase(int r);
+   // void getResourceCoords(std::vector<Value *>&, int r, int s);

   void handleLOAD(Value *dst0[4]);
   void handleSTORE();
@@ -1527,8 +1532,21 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)

   sym->reg.fileIndex = fileIdx;

-   if (tgsiFile == TGSI_FILE_MEMORY && code->memoryFiles[fileIdx].shared)
-      sym->setFile(FILE_MEMORY_SHARED);
+   if (tgsiFile == TGSI_FILE_MEMORY) {
+      switch (code->memoryFiles[fileIdx].mem_type) {
+      case TGSI_MEMORY_TYPE_SHARED:
+         sym->setFile(FILE_MEMORY_SHARED);
+         break;
+      case TGSI_MEMORY_TYPE_INPUT:
+         assert(prog->getType() == Program::TYPE_COMPUTE);
+         assert(idx == -1);
+         sym->setFile(FILE_SHADER_INPUT);
+         address += info->prop.cp.inputOffset;
+         break;
+      default:
+         assert(0); /* TODO: Add support for global and private memory */
+      }
+   }

   if (idx >= 0) {
      if (sym->reg.file == FILE_SHADER_INPUT)
@@ -1989,7 +2007,6 @@ Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
 void
 Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
 {
-   Value *val;
   Value *arg[4], *src[8];
   Value *lod = NULL, *shd = NULL;
   unsigned int s, c, d;
@@ -2032,17 +2049,6 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
         shd = src[n - 1];
   }

-   if (tgt.isCube()) {
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
-      val = getScratch();
-      mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
-      mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
-      mkOp1(OP_RCP, TYPE_F32, val, val);
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
-   }
-
   for (c = 0, d = 0; c < 4; ++c) {
      if (dst[c]) {
         texi->setDef(d++, dst[c]);
@@ -2148,6 +2154,7 @@ Converter::handleLIT(Value *dst0[4])
   }
 }

+/* Keep this around for now as reference when adding img support
 static inline bool
 isResourceSpecial(const int r)
 {
@@ -2178,7 +2185,8 @@ Converter::getResourceBase(const int r)

   switch (r) {
   case TGSI_RESOURCE_GLOBAL:
-      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15);
+      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL,
+                       info->io.auxCBSlot);
      break;
   case TGSI_RESOURCE_LOCAL:
      assert(prog->getType() == Program::TYPE_COMPUTE);
@@ -2243,6 +2251,7 @@ partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask)
   }
   return n + 1;
 }
+*/

 // For raw loads, granularity is 4 byte.
 // Usage of the texture read mask on OP_SULDP is not allowed.
@@ -2253,8 +2262,9 @@ Converter::handleLOAD(Value *dst0[4])
   int c;
   std::vector<Value *> off, src, ldv, def;

-   if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getSrc(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
      for (c = 0; c < 4; ++c) {
         if (!dst0[c])
            continue;
@@ -2274,9 +2284,12 @@ Converter::handleLOAD(Value *dst0[4])
         if (tgsi.getSrc(0).isIndirect(0))
            ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
      }
-      return;
+      break;
+   default:
+      assert(!"Unsupported srcFile for LOAD");
   }

+/* Keep this around for now as reference when adding img support
   getResourceCoords(off, r, 1);

   if (isResourceRaw(code, r)) {
@@ -2342,6 +2355,7 @@ Converter::handleLOAD(Value *dst0[4])
   FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
      if (dst0[c] != def[c])
         mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]);
+*/
 }

 // For formatted stores, the write mask on OP_SUSTP can be used.
@@ -2353,8 +2367,9 @@ Converter::handleSTORE()
   int c;
   std::vector<Value *> off, src, dummy;

-   if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getDst(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getDst(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
      for (c = 0; c < 4; ++c) {
         if (!(tgsi.getDst(0).getMask() & (1 << c)))
            continue;
@@ -2375,9 +2390,12 @@ Converter::handleSTORE()
         if (tgsi.getDst(0).isIndirect(0))
            st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
      }
-      return;
+      break;
+   default:
+      assert(!"Unsupported dstFile for STORE");
   }

+/* Keep this around for now as reference when adding img support
   getResourceCoords(off, r, 0);
   src = off;
   const int s = src.size();
@@ -2425,6 +2443,7 @@ Converter::handleSTORE()
      mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0,
            dummy, src)->tex.mask = tgsi.getDst(0).getMask();
   }
+*/
 }

 // XXX: These only work on resources with the single-component u32/s32 formats.
@@ -2439,8 +2458,9 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
   std::vector<Value *> defv;
   LValue *dst = getScratch();

-   if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getSrc(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
      for (int c = 0; c < 4; ++c) {
         if (!dst0[c])
            continue;
@@ -2468,10 +2488,12 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
      for (int c = 0; c < 4; ++c)
         if (dst0[c])
            dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
-      return;
+      break;
+   default:
+      assert(!"Unsupported srcFile for ATOM");
   }

-
+/* Keep this around for now as reference when adding img support
   getResourceCoords(srcv, r, 1);

   if (isResourceSpecial(r)) {
@@ -2499,6 +2521,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
   for (int c = 0; c < 4; ++c)
      if (dst0[c])
         dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
+*/
 }

 void
@@ -67,6 +67,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
   tmp = bld.getScratch();

   for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
      for (c = 0; c < dim; ++c) {
@@ -92,10 +93,25 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
         add->lanes = 1; /* abused for .ndv */
      }

+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
+
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
      bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

      // save results
@@ -682,7 +682,7 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                       Value **ms_x, Value **ms_y) {
   // This loads the texture-indexed ms setting from the constant buffer
   Value *tmp = new_LValue(func, FILE_GPR);
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;
   if (prog->getType() > Program::TYPE_VERTEX)
      off += 16 * 2 * 4;
@@ -724,6 +724,23 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
   const int dref = arg;
   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;

+   /* Only normalize in the non-explicit derivatives case.
+    */
+   if (i->tex.target.isCube() && i->op != OP_TXD) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
   // handle MS, which means looking up the MS params for this texture, and
   // adjusting the input coordinates to point at the right sample.
   if (i->tex.target.isMS()) {
@@ -934,12 +951,14 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)

   handleTEX(i);
   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+   i->tex.derivAll = true;

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -949,10 +968,24 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
-         tex->setSrc(c, crd[c]);
+         tex->setSrc(c, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
@@ -1174,7 +1207,7 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
      bld.mkLoad(TYPE_F32,
                 def,
                 bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                 off);
      break;
@@ -600,7 +600,7 @@ NVC0LoweringPass::visit(BasicBlock *bb)
 inline Value *
 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -615,6 +615,24 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
   const int chipset = prog->getTarget()->getChipset();

+   /* Only normalize in the non-explicit derivatives case. For explicit
+    * derivatives, this is handled in handleManualTXD.
+    */
+   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
   // Arguments to the TEX instruction are a little insane. Even though the
   // encoding is identical between SM20 and SM30, the arguments mean
   // different things between Fermi and Kepler+. A lot of arguments are
@@ -728,9 +746,13 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
      }

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
-      for (int s = dim; s >= 1; --s)
-         i->setSrc(s, i->getSrc(s - 1));
-      i->setSrc(0, arrayIndex);
+      if (arrayIndex) {
+         for (int s = dim; s >= 1; --s)
+            i->setSrc(s, i->getSrc(s - 1));
+         i->setSrc(0, arrayIndex);
+      } else {
+         i->moveSources(0, 1);
+      }

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
@@ -861,6 +883,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
@@ -870,10 +893,24 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
@@ -1098,6 +1135,7 @@ NVC0LoweringPass::handleSharedATOM(Instruction *atom)
         break;
      default:
         assert(0);
+         return;
      }

      Instruction *i =
@@ -1204,7 +1242,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 inline Value *
 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -1213,7 +1251,7 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 inline Value *
 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;

   if (ptr)
@@ -1226,7 +1264,7 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 inline Value *
 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
   off += prog->driver->io.suInfoBase;

   if (ptr)
@@ -1540,7 +1578,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
      call->indirect = 1;
      call->absolute = 1;
      call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
-                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
+                                   prog->driver->io.auxCBSlot, TYPE_U32,
                                   prog->driver->io.suInfoBase + base));
      call->setSrc(1, r[2]);
      call->setSrc(2, r[4]);
@@ -1698,7 +1736,8 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
      }
      addr += prog->driver->prop.cp.gridInfoBase;
      bld.mkLoad(TYPE_U32, i->getDef(0),
-                 bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
+                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                              TYPE_U32, addr), NULL);
      break;
   case SV_SAMPLE_INDEX:
      // TODO: Properly pass source as an address in the PIX address space
@@ -1715,7 +1754,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
      bld.mkLoad(TYPE_F32,
                 i->getDef(0),
                 bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                       TYPE_U32, prog->driver->io.sampleInfoBase +
                       4 * sym->reg.data.sv.index),
                 off);
@@ -1780,7 +1819,7 @@ NVC0LoweringPass::handleSQRT(Instruction *i)
 {
   if (i->dType == TYPE_F64) {
      Value *pred = bld.getSSA(1, FILE_PREDICATE);
-      Value *zero = bld.loadImm(NULL, 0.0d);
+      Value *zero = bld.loadImm(NULL, 0.0);
      Value *dst = bld.getSSA(8);
      bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
      bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
@@ -204,6 +204,11 @@ static const char *ldstSubOpStr[] =
   "", "lock", "unlock"
 };

+static const char *subfmOpStr[] =
+{
+   "", "3d"
+};
+
 static const char *DataTypeStr[] =
 {
   "-",
@@ -548,6 +553,10 @@ void Instruction::print() const
         if (subOp < Elements(ldstSubOpStr))
            PRINT("%s ", ldstSubOpStr[subOp]);
         break;
+      case OP_SUBFM:
+         if (subOp < Elements(subfmOpStr))
+            PRINT("%s ", subfmOpStr[subOp]);
+         break;
      default:
         if (subOp)
            PRINT("(SUBOP:%u) ", subOp);
@@ -114,8 +114,6 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],

   info.io.auxCBSlot = 15;
   info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
-
-   info.io.resInfoCBSlot = 15;
   info.io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
   info.io.msInfoCBSlot = 15;
   info.io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
@@ -67,122 +67,94 @@ nv50_screen_compute_setup(struct nv50_screen *screen,
   if (ret)
      return ret;

-   BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
   PUSH_DATA (push, screen->compute->handle);

-   BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+   BEGIN_NV04(push, NV50_CP(UNK02A0), 1);
   PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_STACK), 1);
   PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+   BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->stack_bo->offset);
   PUSH_DATA (push, screen->stack_bo->offset);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1);
   PUSH_DATA (push, 4);

-   BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+   BEGIN_NV04(push, NV50_CP(UNK0290), 1);
   PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+   BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1);
   PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+   BEGIN_NV04(push, NV50_CP(REG_MODE), 1);
   PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
-   BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+   BEGIN_NV04(push, NV50_CP(UNK0384), 1);
   PUSH_DATA (push, 0x100);
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1);
   PUSH_DATA (push, fifo->vram);

   for (i = 0; i < 15; i++) {
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2);
      PUSH_DATA (push, 0);
      PUSH_DATA (push, 0);
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1);
      PUSH_DATA (push, 0);
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1);
      PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
   }

-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2);
   PUSH_DATA (push, 0);
   PUSH_DATA (push, 0);
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1);
   PUSH_DATA (push, ~0);
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1);
   PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);

-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1);
   PUSH_DATA (push, 7);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1);
   PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1);
   PUSH_DATA (push, 7);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1);
   PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
   PUSH_DATA (push, 0);

-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1);
   PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+   BEGIN_NV04(push, NV50_CP(TEX_LIMITS), 1);
   PUSH_DATA (push, 0x54);
-   BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+   BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1);
   PUSH_DATA (push, 0);

-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TIC), 1);
   PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset);
   PUSH_DATA (push, screen->txc->offset);
   PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);

-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TSC), 1);
   PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3);
   PUSH_DATAh(push, screen->txc->offset + 65536);
   PUSH_DATA (push, screen->txc->offset + 65536);
   PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);

-   BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1);
   PUSH_DATA (push, fifo->vram);

-   BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1);
   PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+   BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->tls_bo->offset + 65536);
   PUSH_DATA (push, screen->tls_bo->offset + 65536);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1);
   PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));

   return 0;
 }

-static bool
-nv50_compute_validate_program(struct nv50_context *nv50)
-{
-   struct nv50_program *prog = nv50->compprog;
-
-   if (prog->mem)
-      return true;
-
-   if (!prog->translated) {
-      prog->translated = nv50_program_translate(
-         prog, nv50->screen->base.device->chipset, &nv50->base.debug);
-      if (!prog->translated)
-         return false;
-   }
-   if (unlikely(!prog->code_size))
-      return false;
-
-   if (likely(prog->code_size)) {
-      if (nv50_program_upload_code(nv50, prog)) {
-         struct nouveau_pushbuf *push = nv50->base.pushbuf;
-         BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
-         PUSH_DATA (push, 0);
-         return true;
-      }
-   }
-   return false;
-}
-
 static void
 nv50_compute_validate_globals(struct nv50_context *nv50)
 {
@@ -198,26 +170,25 @@ nv50_compute_validate_globals(struct nv50_context *nv50)
   }
 }

-static bool
-nv50_compute_state_validate(struct nv50_context *nv50)
-{
-   if (!nv50_compute_validate_program(nv50))
-      return false;
+static struct nv50_state_validate
+validate_list_cp[] = {
+   { nv50_compprog_validate,              NV50_NEW_CP_PROGRAM     },
+   { nv50_compute_validate_globals,       NV50_NEW_CP_GLOBALS     },
+};

-   if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
-      nv50_compute_validate_globals(nv50);
+static bool
+nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask)
+{
+   bool ret;

   /* TODO: validate textures, samplers, surfaces */
+   ret = nv50_state_validate(nv50, mask, validate_list_cp,
+                             ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp,
+                             nv50->bufctx_cp);

-   nv50_bufctx_fence(nv50->bufctx_cp, false);
-
-   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp);
-   if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf)))
-      return false;
   if (unlikely(nv50->state.flushed))
      nv50_bufctx_fence(nv50->bufctx_cp, true);
-
-   return true;
+   return ret;
 }

 static void
@@ -227,7 +198,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
   struct nouveau_pushbuf *push = screen->base.pushbuf;
   unsigned size = align(nv50->compprog->parm_size, 0x4);

-   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
   PUSH_DATA (push, (size / 4) << 8);

   if (size) {
@@ -245,7 +216,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
      nouveau_pushbuf_bufctx(push, nv50->bufctx);
      nouveau_pushbuf_validate(push);

-      BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+      BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), size / 4);
      nouveau_pushbuf_data(push, bo, offset, size);

      nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
@@ -278,7 +249,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
   struct nv50_program *cp = nv50->compprog;
   bool ret;

-   ret = !nv50_compute_state_validate(nv50);
+   ret = !nv50_state_validate_cp(nv50, ~0);
   if (ret) {
      NOUVEAU_ERR("Failed to launch grid !\n");
      return;
@@ -286,33 +257,33 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)

   nv50_compute_upload_input(nv50, info->input);

-   BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+   BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);
   PUSH_DATA (push, nv50_compute_find_symbol(nv50, info->pc));

-   BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+   BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);
   PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
-   BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+   BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);
   PUSH_DATA (push, cp->max_gpr);

   /* grid/block setup */
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+   BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2);
   PUSH_DATA (push, info->block[1] << 16 | info->block[0]);
   PUSH_DATA (push, info->block[2]);
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1);
   PUSH_DATA (push, 1 << 16 | block_size);
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+   BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1);
   PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+   BEGIN_NV04(push, NV50_CP(GRIDDIM), 1);
   PUSH_DATA (push, info->grid[1] << 16 | info->grid[0]);
-   BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+   BEGIN_NV04(push, NV50_CP(GRIDID), 1);
   PUSH_DATA (push, 1);

   /* kernel launching */
-   BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+   BEGIN_NV04(push, NV50_CP(LAUNCH), 1);
   PUSH_DATA (push, 0);
-   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   /* bind a compute shader clobbers fragment shader state */
-   nv50->dirty |= NV50_NEW_FRAGPROG;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
 }
@@ -176,8 +176,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
      for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
         if (nv50->framebuffer.cbufs[i] &&
             nv50->framebuffer.cbufs[i]->texture == res) {
-            nv50->dirty |= NV50_NEW_FRAMEBUFFER;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+            nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
            if (!--ref)
               return ref;
         }
@@ -186,8 +186,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
   if (bind & PIPE_BIND_DEPTH_STENCIL) {
      if (nv50->framebuffer.zsbuf &&
          nv50->framebuffer.zsbuf->texture == res) {
-         nv50->dirty |= NV50_NEW_FRAMEBUFFER;
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+         nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
         if (!--ref)
            return ref;
      }
@@ -202,8 +202,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
      assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
      for (i = 0; i < nv50->num_vtxbufs; ++i) {
         if (nv50->vtxbuf[i].buffer == res) {
-            nv50->dirty |= NV50_NEW_ARRAYS;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+            nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
            if (!--ref)
               return ref;
         }
@@ -211,8 +211,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,

      if (nv50->idxbuf.buffer == res) {
         /* Just rebind to the bufctx as there is no separate dirty bit */
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
-         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(res), RD);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
+         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(res), RD);
         if (!--ref)
            return ref;
      }
@@ -222,8 +222,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
      for (i = 0; i < nv50->num_textures[s]; ++i) {
         if (nv50->textures[s][i] &&
             nv50->textures[s][i]->texture == res) {
-            nv50->dirty |= NV50_NEW_TEXTURES;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+            nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
            if (!--ref)
               return ref;
         }
@@ -236,9 +236,9 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
            continue;
         if (!nv50->constbuf[s][i].user &&
             nv50->constbuf[s][i].u.buf == res) {
-            nv50->dirty |= NV50_NEW_CONSTBUF;
+            nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
            nv50->constbuf_dirty[s] |= 1 << i;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
            if (!--ref)
               return ref;
         }
@@ -345,10 +345,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)

   flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;

-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->code);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->code);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->uniforms);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->txc);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->stack_bo);
   if (screen->compute) {
      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
@@ -357,7 +357,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)

   flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;

-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->fence.bo);
   BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
   if (screen->compute)
      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
@@ -26,43 +26,43 @@
 #include "nv50/nv50_3d.xml.h"
 #include "nv50/nv50_2d.xml.h"

-#define NV50_NEW_BLEND        (1 << 0)
-#define NV50_NEW_RASTERIZER   (1 << 1)
-#define NV50_NEW_ZSA          (1 << 2)
-#define NV50_NEW_VERTPROG     (1 << 3)
-#define NV50_NEW_GMTYPROG     (1 << 6)
-#define NV50_NEW_FRAGPROG     (1 << 7)
-#define NV50_NEW_BLEND_COLOUR (1 << 8)
-#define NV50_NEW_STENCIL_REF  (1 << 9)
-#define NV50_NEW_CLIP         (1 << 10)
-#define NV50_NEW_SAMPLE_MASK  (1 << 11)
-#define NV50_NEW_FRAMEBUFFER  (1 << 12)
-#define NV50_NEW_STIPPLE      (1 << 13)
-#define NV50_NEW_SCISSOR      (1 << 14)
-#define NV50_NEW_VIEWPORT     (1 << 15)
-#define NV50_NEW_ARRAYS       (1 << 16)
-#define NV50_NEW_VERTEX       (1 << 17)
-#define NV50_NEW_CONSTBUF     (1 << 18)
-#define NV50_NEW_TEXTURES     (1 << 19)
-#define NV50_NEW_SAMPLERS     (1 << 20)
-#define NV50_NEW_STRMOUT      (1 << 21)
-#define NV50_NEW_MIN_SAMPLES  (1 << 22)
-#define NV50_NEW_CONTEXT      (1 << 31)
+#define NV50_NEW_3D_BLEND        (1 << 0)
+#define NV50_NEW_3D_RASTERIZER   (1 << 1)
+#define NV50_NEW_3D_ZSA          (1 << 2)
+#define NV50_NEW_3D_VERTPROG     (1 << 3)
+#define NV50_NEW_3D_GMTYPROG     (1 << 6)
+#define NV50_NEW_3D_FRAGPROG     (1 << 7)
+#define NV50_NEW_3D_BLEND_COLOUR (1 << 8)
+#define NV50_NEW_3D_STENCIL_REF  (1 << 9)
+#define NV50_NEW_3D_CLIP         (1 << 10)
+#define NV50_NEW_3D_SAMPLE_MASK  (1 << 11)
+#define NV50_NEW_3D_FRAMEBUFFER  (1 << 12)
+#define NV50_NEW_3D_STIPPLE      (1 << 13)
+#define NV50_NEW_3D_SCISSOR      (1 << 14)
+#define NV50_NEW_3D_VIEWPORT     (1 << 15)
+#define NV50_NEW_3D_ARRAYS       (1 << 16)
+#define NV50_NEW_3D_VERTEX       (1 << 17)
+#define NV50_NEW_3D_CONSTBUF     (1 << 18)
+#define NV50_NEW_3D_TEXTURES     (1 << 19)
+#define NV50_NEW_3D_SAMPLERS     (1 << 20)
+#define NV50_NEW_3D_STRMOUT      (1 << 21)
+#define NV50_NEW_3D_MIN_SAMPLES  (1 << 22)
+#define NV50_NEW_3D_CONTEXT      (1 << 31)

 #define NV50_NEW_CP_PROGRAM   (1 << 0)
 #define NV50_NEW_CP_GLOBALS   (1 << 1)

 /* 3d bufctx (during draw_vbo, blit_3d) */
-#define NV50_BIND_FB          0
-#define NV50_BIND_VERTEX      1
-#define NV50_BIND_VERTEX_TMP  2
-#define NV50_BIND_INDEX       3
-#define NV50_BIND_TEXTURES    4
-#define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
-#define NV50_BIND_SO         53
-#define NV50_BIND_SCREEN     54
-#define NV50_BIND_TLS        55
-#define NV50_BIND_3D_COUNT   56
+#define NV50_BIND_3D_FB          0
+#define NV50_BIND_3D_VERTEX      1
+#define NV50_BIND_3D_VERTEX_TMP  2
+#define NV50_BIND_3D_INDEX       3
+#define NV50_BIND_3D_TEXTURES    4
+#define NV50_BIND_3D_CB(s, i)   (5 + 16 * (s) + (i))
+#define NV50_BIND_3D_SO         53
+#define NV50_BIND_3D_SCREEN     54
+#define NV50_BIND_3D_TLS        55
+#define NV50_BIND_3D_COUNT      56

 /* compute bufctx (during launch_grid) */
 #define NV50_BIND_CP_GLOBAL   0
@@ -115,7 +115,7 @@ struct nv50_context {
   struct nouveau_bufctx *bufctx;
   struct nouveau_bufctx *bufctx_cp;

-   uint32_t dirty;
+   uint32_t dirty_3d; /* dirty flags for 3d state */
   uint32_t dirty_cp; /* dirty flags for compute state */
   bool cb_dirty;

@@ -221,6 +221,7 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 void nv50_vertprog_validate(struct nv50_context *);
 void nv50_gmtyprog_validate(struct nv50_context *);
 void nv50_fragprog_validate(struct nv50_context *);
+void nv50_compprog_validate(struct nv50_context *);
 void nv50_fp_linkage_validate(struct nv50_context *);
 void nv50_gp_linkage_validate(struct nv50_context *);
 void nv50_constbufs_validate(struct nv50_context *);
@@ -231,7 +232,15 @@ void nv50_stream_output_validate(struct nv50_context *);
 extern void nv50_init_state_functions(struct nv50_context *);

 /* nv50_state_validate.c */
-bool nv50_state_validate(struct nv50_context *, uint32_t state_mask);
+struct nv50_state_validate {
+   void (*func)(struct nv50_context *);
+   uint32_t states;
+};
+
+bool nv50_state_validate(struct nv50_context *, uint32_t,
+                         struct nv50_state_validate *, int, uint32_t *,
+                         struct nouveau_bufctx *);
+bool nv50_state_validate_3d(struct nv50_context *, uint32_t);

 /* nv50_surface.c */
 extern void nv50_clear(struct pipe_context *, unsigned buffers,
@@ -335,7 +335,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
   info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
   info->io.genUserClip = prog->vp.clpd_nr;

-   info->io.resInfoCBSlot = 15;
   info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
   info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
   info->io.msInfoCBSlot = 15;
@@ -202,10 +202,10 @@ nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
      func = nv50_hw_sm_get_func(c);

      /* configure and reset the counter(s) */
-      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+      BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
      PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                        | cfg->ctr[i].unit | cfg->ctr[i].mode);
-      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1);
+      BEGIN_NV04(push, NV50_CP(MP_PM_SET(c)), 1);
      PUSH_DATA (push, 0);
   }
   return true;
@@ -240,7 +240,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
   PUSH_SPACE(push, 8);
   for (c = 0; c < 4; c++) {
      if (screen->pm.mp_counter[c]) {
-         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+         BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
         PUSH_DATA (push, 0);
      }
   }
@@ -257,7 +257,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
                hq->bo);

   PUSH_SPACE(push, 2);
-   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
   PUSH_DATA (push, 0);

   pipe->bind_compute_state(pipe, screen->pm.prog);
@@ -295,7 +295,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
         mask |= 1 << hsq->ctr[i];
         func  = nv50_hw_sm_get_func(hsq->ctr[i]);

-         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1);
+         BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(hsq->ctr[i])), 1);
         PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                    | cfg->ctr[i].unit | cfg->ctr[i].mode);
      }
@@ -29,6 +29,8 @@
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_query_hw.h"

+#include "nv50/nv50_compute.xml.h"
+
 void
 nv50_constbufs_validate(struct nv50_context *nv50)
 {
@@ -94,7 +96,7 @@ nv50_constbufs_validate(struct nv50_context *nv50)
               BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
               PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);

-               BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
+               BCTX_REFN(nv50->bufctx_3d, 3D_CB(s, i), res, RD);

               nv50->cb_dirty = 1; /* Force cache flush for UBO. */
            } else {
@@ -131,14 +133,14 @@ nv50_program_update_context_state(struct nv50_context *nv50,

   if (prog && prog->tls_space) {
      if (nv50->state.new_tls_space)
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
      if (!nv50->state.tls_required || nv50->state.new_tls_space)
-         BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
+         BCTX_REFN_bo(nv50->bufctx_3d, 3D_TLS, flags, nv50->screen->tls_bo);
      nv50->state.new_tls_space = false;
      nv50->state.tls_required |= 1 << stage;
   } else {
      if (nv50->state.tls_required == (1 << stage))
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
      nv50->state.tls_required &= ~(1 << stage);
   }
 }
@@ -181,7 +183,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
      fp->fp.force_persample_interp = rast->force_persample_interp;
   }

-   if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
+   if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_MIN_SAMPLES)))
      return;

   if (!nv50_program_validate(nv50, fp))
@@ -238,6 +240,19 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
   /* GP_ENABLE is updated in linkage validation */
 }

+void
+nv50_compprog_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_program *cp = nv50->compprog;
+
+   if (cp && !nv50_program_validate(nv50, cp))
+      return;
+
+   BEGIN_NV04(push, NV50_CP(CODE_CB_FLUSH), 1);
+   PUSH_DATA (push, 0);
+}
+
 static void
 nv50_sprite_coords_validate(struct nv50_context *nv50)
 {
@@ -309,7 +324,7 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
      PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
   }

-   if (nv50->dirty & NV50_NEW_FRAGPROG)
+   if (nv50->dirty_3d & NV50_NEW_3D_FRAGPROG)
      return;
   psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
   color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN;
@@ -378,9 +393,9 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
   uint8_t map[64];
   uint8_t so_map[64];

-   if (!(nv50->dirty & (NV50_NEW_VERTPROG |
-                        NV50_NEW_FRAGPROG |
-                        NV50_NEW_GMTYPROG))) {
+   if (!(nv50->dirty_3d & (NV50_NEW_3D_VERTPROG |
+                           NV50_NEW_3D_FRAGPROG |
+                           NV50_NEW_3D_GMTYPROG))) {
      uint8_t bfc, ffc;
      ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK);
      bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK)
@@ -633,8 +648,6 @@ nv50_stream_output_validate(struct nv50_context *nv50)
   BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
   PUSH_DATA (push, ctrl);

-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
-
   for (i = 0; i < nv50->num_so_targets; ++i) {
      struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
      struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
@@ -664,7 +677,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
         prims = MIN2(prims, limit);
      }
      targ->stride = so->stride[i];
-      BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_SO, buf, WR);
   }
   if (prims != ~0) {
      BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
@@ -200,7 +200,7 @@ nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->blend = hwcso;
-   nv50->dirty |= NV50_NEW_BLEND;
+   nv50->dirty_3d |= NV50_NEW_3D_BLEND;
 }

 static void
@@ -337,7 +337,7 @@ nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->rast = hwcso;
-   nv50->dirty |= NV50_NEW_RASTERIZER;
+   nv50->dirty_3d |= NV50_NEW_3D_RASTERIZER;
 }

 static void
@@ -426,7 +426,7 @@ nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->zsa = hwcso;
-   nv50->dirty |= NV50_NEW_ZSA;
+   nv50->dirty_3d |= NV50_NEW_3D_ZSA;
 }

 static void
@@ -605,7 +605,7 @@ nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,

   nv50->num_samplers[s] = nr;

-   nv50->dirty |= NV50_NEW_SAMPLERS;
+   nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS;
 }

 static void
@@ -698,9 +698,9 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,

   nv50->num_textures[s] = nr;

-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);

-   nv50->dirty |= NV50_NEW_TEXTURES;
+   nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
 }

 static void
@@ -776,7 +776,7 @@ nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);

    nv50->vertprog = hwcso;
-    nv50->dirty |= NV50_NEW_VERTPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
 }

 static void *
@@ -792,7 +792,7 @@ nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);

    nv50->fragprog = hwcso;
-    nv50->dirty |= NV50_NEW_FRAGPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
 }

 static void *
@@ -808,7 +808,7 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);

    nv50->gmtyprog = hwcso;
-    nv50->dirty |= NV50_NEW_GMTYPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
 }

 static void *
@@ -857,7 +857,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
      nv50->constbuf[s][i].u.buf = NULL;
   else
   if (nv50->constbuf[s][i].u.buf)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));

   pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);

@@ -882,7 +882,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
   }
   nv50->constbuf_dirty[s] |= 1 << i;

-   nv50->dirty |= NV50_NEW_CONSTBUF;
+   nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
 }

 /* =============================================================================
@@ -895,7 +895,7 @@ nv50_set_blend_color(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->blend_colour = *bcol;
-   nv50->dirty |= NV50_NEW_BLEND_COLOUR;
+   nv50->dirty_3d |= NV50_NEW_3D_BLEND_COLOUR;
 }

 static void
@@ -905,7 +905,7 @@ nv50_set_stencil_ref(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->stencil_ref = *sr;
-   nv50->dirty |= NV50_NEW_STENCIL_REF;
+   nv50->dirty_3d |= NV50_NEW_3D_STENCIL_REF;
 }

 static void
@@ -916,7 +916,7 @@ nv50_set_clip_state(struct pipe_context *pipe,

   memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp));

-   nv50->dirty |= NV50_NEW_CLIP;
+   nv50->dirty_3d |= NV50_NEW_3D_CLIP;
 }

 static void
@@ -925,7 +925,7 @@ nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->sample_mask = sample_mask;
-   nv50->dirty |= NV50_NEW_SAMPLE_MASK;
+   nv50->dirty_3d |= NV50_NEW_3D_SAMPLE_MASK;
 }

 static void
@@ -935,7 +935,7 @@ nv50_set_min_samples(struct pipe_context *pipe, unsigned min_samples)

   if (nv50->min_samples != min_samples) {
      nv50->min_samples = min_samples;
-      nv50->dirty |= NV50_NEW_MIN_SAMPLES;
+      nv50->dirty_3d |= NV50_NEW_3D_MIN_SAMPLES;
   }
 }

@@ -945,11 +945,11 @@ nv50_set_framebuffer_state(struct pipe_context *pipe,
 {
   struct nv50_context *nv50 = nv50_context(pipe);

-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);

   util_copy_framebuffer_state(&nv50->framebuffer, fb);

-   nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
 }

 static void
@@ -959,7 +959,7 @@ nv50_set_polygon_stipple(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->stipple = *stipple;
-   nv50->dirty |= NV50_NEW_STIPPLE;
+   nv50->dirty_3d |= NV50_NEW_3D_STIPPLE;
 }

 static void
@@ -977,7 +977,7 @@ nv50_set_scissor_states(struct pipe_context *pipe,
         continue;
      nv50->scissors[start_slot + i] = scissor[i];
      nv50->scissors_dirty |= 1 << (start_slot + i);
-      nv50->dirty |= NV50_NEW_SCISSOR;
+      nv50->dirty_3d |= NV50_NEW_3D_SCISSOR;
   }
 }

@@ -996,7 +996,7 @@ nv50_set_viewport_states(struct pipe_context *pipe,
         continue;
      nv50->viewports[start_slot + i] = vpt[i];
      nv50->viewports_dirty |= 1 << (start_slot + i);
-      nv50->dirty |= NV50_NEW_VIEWPORT;
+      nv50->dirty_3d |= NV50_NEW_3D_VIEWPORT;
   }
 }

@@ -1008,8 +1008,8 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);
   unsigned i;

-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-   nv50->dirty |= NV50_NEW_ARRAYS;
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
+   nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;

   util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                 start_slot, count);
@@ -1051,14 +1051,14 @@ nv50_set_index_buffer(struct pipe_context *pipe,
   struct nv50_context *nv50 = nv50_context(pipe);

   if (nv50->idxbuf.buffer)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);

   if (ib) {
      pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer);
      nv50->idxbuf.index_size = ib->index_size;
      if (ib->buffer) {
         nv50->idxbuf.offset = ib->offset;
-         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(ib->buffer), RD);
+         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(ib->buffer), RD);
      } else {
         nv50->idxbuf.user_buffer = ib->user_buffer;
      }
@@ -1073,7 +1073,7 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
   struct nv50_context *nv50 = nv50_context(pipe);

   nv50->vertex = hwcso;
-   nv50->dirty |= NV50_NEW_VERTEX;
+   nv50->dirty_3d |= NV50_NEW_3D_VERTEX;
 }

 static struct pipe_stream_output_target *
@@ -1180,8 +1180,10 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
   }
   nv50->num_so_targets = num_targets;

-   if (nv50->so_targets_dirty)
-      nv50->dirty |= NV50_NEW_STRMOUT;
+   if (nv50->so_targets_dirty) {
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO);
+      nv50->dirty_3d |= NV50_NEW_3D_STRMOUT;
+   }
 }

 static void
@@ -25,7 +25,7 @@ nv50_validate_fb(struct nv50_context *nv50)
   unsigned ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1;
   uint32_t array_size = 0xffff, array_mode = 0;

-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);

   BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
   PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
@@ -90,7 +90,7 @@ nv50_validate_fb(struct nv50_context *nv50)
      mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;

      /* only register for writing, otherwise we'd always serialize here */
-      BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
   }

   if (fb->zsbuf) {
@@ -118,7 +118,7 @@ nv50_validate_fb(struct nv50_context *nv50)
      mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
      mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;

-      BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
   } else {
      BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
      PUSH_DATA (push, 0);
@@ -187,8 +187,8 @@ nv50_validate_scissor(struct nv50_context *nv50)
 #ifdef NV50_SCISSORS_CLIPPING
   int minx, maxx, miny, maxy, i;

-   if (!(nv50->dirty &
-         (NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) &&
+   if (!(nv50->dirty_3d &
+         (NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT | NV50_NEW_3D_FRAMEBUFFER)) &&
       nv50->state.scissor == nv50->rast->pipe.scissor)
      return;

@@ -197,7 +197,7 @@ nv50_validate_scissor(struct nv50_context *nv50)

   nv50->state.scissor = nv50->rast->pipe.scissor;

-   if ((nv50->dirty & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor)
+   if ((nv50->dirty_3d & NV50_NEW_3D_FRAMEBUFFER) && !nv50->state.scissor)
      nv50->scissors_dirty = (1 << NV50_MAX_VIEWPORTS) - 1;

   for (i = 0; i < NV50_MAX_VIEWPORTS; i++) {
@@ -290,10 +290,10 @@ nv50_check_program_ucps(struct nv50_context *nv50,

   vp->vp.clpd_nr = n;
   if (likely(vp == nv50->vertprog)) {
-      nv50->dirty |= NV50_NEW_VERTPROG;
+      nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
      nv50_vertprog_validate(nv50);
   } else {
-      nv50->dirty |= NV50_NEW_GMTYPROG;
+      nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
      nv50_gmtyprog_validate(nv50);
   }
   nv50_fp_linkage_validate(nv50);
@@ -342,7 +342,7 @@ nv50_validate_clip(struct nv50_context *nv50)
   struct nv50_program *vp;
   uint8_t clip_enable;

-   if (nv50->dirty & NV50_NEW_CLIP) {
+   if (nv50->dirty_3d & NV50_NEW_3D_CLIP) {
      BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
      PUSH_DATA (push, (NV50_CB_AUX_UCP_OFFSET << 8) | NV50_CB_AUX);
      BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4);
@@ -436,7 +436,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
   else
      ctx_to->state = ctx_to->screen->save_state;

-   ctx_to->dirty = ~0;
+   ctx_to->dirty_3d = ~0;
+   ctx_to->dirty_cp = ~0;
   ctx_to->viewports_dirty = ~0;
   ctx_to->scissors_dirty = ~0;

@@ -445,71 +446,71 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
   ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1;

   if (!ctx_to->vertex)
-      ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
+      ctx_to->dirty_3d &= ~(NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS);

   if (!ctx_to->vertprog)
-      ctx_to->dirty &= ~NV50_NEW_VERTPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_VERTPROG;
   if (!ctx_to->fragprog)
-      ctx_to->dirty &= ~NV50_NEW_FRAGPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_FRAGPROG;

   if (!ctx_to->blend)
-      ctx_to->dirty &= ~NV50_NEW_BLEND;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_BLEND;
   if (!ctx_to->rast)
 #ifdef NV50_SCISSORS_CLIPPING
-      ctx_to->dirty &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
+      ctx_to->dirty_3d &= ~(NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_SCISSOR);
 #else
-      ctx_to->dirty &= ~NV50_NEW_RASTERIZER;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_RASTERIZER;
 #endif
   if (!ctx_to->zsa)
-      ctx_to->dirty &= ~NV50_NEW_ZSA;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_ZSA;

   ctx_to->screen->cur_ctx = ctx_to;
 }

-static struct state_validate {
-    void (*func)(struct nv50_context *);
-    uint32_t states;
-} validate_list[] = {
-    { nv50_validate_fb,            NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_blend,         NV50_NEW_BLEND },
-    { nv50_validate_zsa,           NV50_NEW_ZSA },
-    { nv50_validate_sample_mask,   NV50_NEW_SAMPLE_MASK },
-    { nv50_validate_rasterizer,    NV50_NEW_RASTERIZER },
-    { nv50_validate_blend_colour,  NV50_NEW_BLEND_COLOUR },
-    { nv50_validate_stencil_ref,   NV50_NEW_STENCIL_REF },
-    { nv50_validate_stipple,       NV50_NEW_STIPPLE },
+static struct nv50_state_validate
+validate_list_3d[] = {
+    { nv50_validate_fb,            NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_blend,         NV50_NEW_3D_BLEND },
+    { nv50_validate_zsa,           NV50_NEW_3D_ZSA },
+    { nv50_validate_sample_mask,   NV50_NEW_3D_SAMPLE_MASK },
+    { nv50_validate_rasterizer,    NV50_NEW_3D_RASTERIZER },
+    { nv50_validate_blend_colour,  NV50_NEW_3D_BLEND_COLOUR },
+    { nv50_validate_stencil_ref,   NV50_NEW_3D_STENCIL_REF },
+    { nv50_validate_stipple,       NV50_NEW_3D_STIPPLE },
 #ifdef NV50_SCISSORS_CLIPPING
-    { nv50_validate_scissor,       NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT |
-                                   NV50_NEW_RASTERIZER |
-                                   NV50_NEW_FRAMEBUFFER },
+    { nv50_validate_scissor,       NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT |
+                                   NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_FRAMEBUFFER },
 #else
-    { nv50_validate_scissor,       NV50_NEW_SCISSOR },
+    { nv50_validate_scissor,       NV50_NEW_3D_SCISSOR },
 #endif
-    { nv50_validate_viewport,      NV50_NEW_VIEWPORT },
-    { nv50_vertprog_validate,      NV50_NEW_VERTPROG },
-    { nv50_gmtyprog_validate,      NV50_NEW_GMTYPROG },
-    { nv50_fragprog_validate,      NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_MIN_SAMPLES },
-    { nv50_fp_linkage_validate,    NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
-                                   NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },
-    { nv50_gp_linkage_validate,    NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
-    { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_derived_3,     NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
-    { nv50_validate_textures,      NV50_NEW_TEXTURES },
-    { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
-    { nv50_stream_output_validate, NV50_NEW_STRMOUT |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS },
-    { nv50_validate_min_samples,   NV50_NEW_MIN_SAMPLES },
+    { nv50_validate_viewport,      NV50_NEW_3D_VIEWPORT },
+    { nv50_vertprog_validate,      NV50_NEW_3D_VERTPROG },
+    { nv50_gmtyprog_validate,      NV50_NEW_3D_GMTYPROG },
+    { nv50_fragprog_validate,      NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_MIN_SAMPLES },
+    { nv50_fp_linkage_validate,    NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_VERTPROG |
+                                   NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_RASTERIZER },
+    { nv50_gp_linkage_validate,    NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_VERTPROG },
+    { nv50_validate_derived_rs,    NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_validate_derived_2,     NV50_NEW_3D_ZSA | NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_derived_3,     NV50_NEW_3D_BLEND | NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_clip,          NV50_NEW_3D_CLIP | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_constbufs_validate,     NV50_NEW_3D_CONSTBUF },
+    { nv50_validate_textures,      NV50_NEW_3D_TEXTURES },
+    { nv50_validate_samplers,      NV50_NEW_3D_SAMPLERS },
+    { nv50_stream_output_validate, NV50_NEW_3D_STRMOUT |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_vertex_arrays_validate, NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS },
+    { nv50_validate_min_samples,   NV50_NEW_3D_MIN_SAMPLES },
 };

 bool
-nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
+nv50_state_validate(struct nv50_context *nv50, uint32_t mask,
+                    struct nv50_state_validate *validate_list, int size,
+                    uint32_t *dirty, struct nouveau_bufctx *bufctx)
 {
   uint32_t state_mask;
   int ret;
@@ -518,16 +519,16 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
   if (nv50->screen->cur_ctx != nv50)
      nv50_switch_pipe_context(nv50);

-   state_mask = nv50->dirty & mask;
+   state_mask = *dirty & mask;

   if (state_mask) {
-      for (i = 0; i < ARRAY_SIZE(validate_list); ++i) {
-         struct state_validate *validate = &validate_list[i];
+      for (i = 0; i < size; i++) {
+         struct nv50_state_validate *validate = &validate_list[i];

         if (state_mask & validate->states)
            validate->func(nv50);
      }
-      nv50->dirty &= ~state_mask;
+      *dirty &= ~state_mask;

      if (nv50->state.rt_serialize) {
         nv50->state.rt_serialize = false;
@@ -535,14 +536,26 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
         PUSH_DATA (nv50->base.pushbuf, 0);
      }

-      nv50_bufctx_fence(nv50->bufctx_3d, false);
+      nv50_bufctx_fence(bufctx, false);
   }
-   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
+   nouveau_pushbuf_bufctx(nv50->base.pushbuf, bufctx);
   ret = nouveau_pushbuf_validate(nv50->base.pushbuf);

+   return !ret;
+}
+
+bool
+nv50_state_validate_3d(struct nv50_context *nv50, uint32_t mask)
+{
+   bool ret;
+
+   ret = nv50_state_validate(nv50, mask, validate_list_3d,
+                             ARRAY_SIZE(validate_list_3d), &nv50->dirty_3d,
+                             nv50->bufctx_3d);
+
   if (unlikely(nv50->state.flushed)) {
      nv50->state.flushed = false;
      nv50_bufctx_fence(nv50->bufctx_3d, true);
   }
-   return !ret;
+   return ret;
 }
--- a/Show More
+++ b/Show More