Merge remote-tracking branch 'public/master' into vulkan

2016-04-01 14:59:38 -07:00
parent cf2257069c 14c46954c9
commit 95106f6bfb
253 changed files with 8673 additions and 3663 deletions
@@ -179,10 +179,10 @@ NIR_FILES = \
 	nir/nir_gather_info.c \
 	nir/nir_gs_count_vertices.c \
 	nir/nir_inline_functions.c \
-	nir/nir_intrinsics.c \
-	nir/nir_intrinsics.h \
 	nir/nir_instr_set.c \
 	nir/nir_instr_set.h \
+	nir/nir_intrinsics.c \
+	nir/nir_intrinsics.h \
 	nir/nir_liveness.c \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
@@ -214,6 +214,7 @@ public:
      subexpressions[2] = NULL;
      primary_expression.identifier = identifier;
      this->non_lvalue_description = NULL;
+      this->is_lhs = false;
   }

   static const char *operator_string(enum ast_operators op);
@@ -263,6 +264,11 @@ public:
    * This pointer may be \c NULL.
    */
   const char *non_lvalue_description;
+
+   void set_is_lhs(bool new_value);
+
+private:
+   bool is_lhs;
 };

 class ast_expression_bin : public ast_expression {
@@ -556,6 +562,15 @@ struct ast_type_qualifier {
         unsigned explicit_stream:1; /**< stream value assigned explicitly by shader code */
         /** \} */

+         /** \name Layout qualifiers for GL_ARB_enhanced_layouts */
+         /** \{ */
+         unsigned explicit_xfb_offset:1; /**< xfb_offset value assigned explicitly by shader code */
+         unsigned xfb_buffer:1; /**< Has xfb_buffer value assigned  */
+         unsigned explicit_xfb_buffer:1; /**< xfb_buffer value assigned explicitly by shader code */
+         unsigned xfb_stride:1; /**< Is xfb_stride value yet to be merged with global values  */
+         unsigned explicit_xfb_stride:1; /**< xfb_stride value assigned explicitly by shader code */
+         /** \} */
+
 	 /** \name Layout qualifiers for GL_ARB_tessellation_shader */
 	 /** \{ */
 	 /* tess eval input layout */
@@ -612,6 +627,15 @@ struct ast_type_qualifier {
   /** Stream in GLSL 1.50 geometry shaders. */
   ast_expression *stream;

+   /** xfb_buffer specified via the GL_ARB_enhanced_layouts keyword. */
+   ast_expression *xfb_buffer;
+
+   /** xfb_stride specified via the GL_ARB_enhanced_layouts keyword. */
+   ast_expression *xfb_stride;
+
+   /** global xfb_stride values for each buffer */
+   ast_layout_expression *out_xfb_stride[MAX_FEEDBACK_BUFFERS];
+
   /**
    * Input or output primitive type in GLSL 1.50 geometry shaders
    * and tessellation shaders.
@@ -627,8 +651,9 @@ struct ast_type_qualifier {
   ast_expression *binding;

   /**
-    * Offset specified via GL_ARB_shader_atomic_counter's "offset"
-    * keyword.
+    * Offset specified via GL_ARB_shader_atomic_counter's or
+    * GL_ARB_enhanced_layouts "offset" keyword, or by GL_ARB_enhanced_layouts
+    * "xfb_offset" keyword.
    *
    * \note
    * This field is only valid if \c explicit_offset is set.
@@ -1199,4 +1224,10 @@ extern void _mesa_ast_process_interface_block(YYLTYPE *locp,
                                              ast_interface_block *const block,
                                              const struct ast_type_qualifier &q);

+extern bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value);
 #endif /* AST_H */
@@ -1727,6 +1727,10 @@ ast_function_expression::handle_method(exec_list *instructions,
   const char *method;
   method = field->primary_expression.identifier;

+   /* Flag the object expression as an lvalue so that calling array.length
+    * does not raise an "uninitialized variable" warning.
+    */
+   field->subexpressions[0]->set_is_lhs(true);
   op = field->subexpressions[0]->hir(instructions, state);
   if (strcmp(method, "length") == 0) {
      if (!this->expressions.is_empty()) {
@@ -54,6 +54,7 @@
 #include "ast.h"
 #include "compiler/glsl_types.h"
 #include "program/hash_table.h"
+#include "main/macros.h"
 #include "main/shaderobj.h"
 #include "ir.h"
 #include "ir_builder.h"
@@ -819,7 +820,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
    * if the expression indicating the vertex number is not the identifier
    * `gl_InvocationID`.
    */
-   if (state->stage == MESA_SHADER_TESS_CTRL) {
+   if (state->stage == MESA_SHADER_TESS_CTRL && !lhs->type->is_error()) {
      ir_variable *var = lhs->variable_referenced();
      if (var->data.mode == ir_var_shader_out && !var->data.patch) {
         ir_rvalue *index = find_innermost_array_index(lhs);
@@ -1248,6 +1249,24 @@ ast_expression::hir_no_rvalue(exec_list *instructions,
   do_hir(instructions, state, false);
 }

+void
+ast_expression::set_is_lhs(bool new_value)
+{
+   /* is_lhs is tracked only to print "variable used uninitialized" warnings;
+    * if we lack an identifier we can just skip it.
+    */
+   if (this->primary_expression.identifier == NULL)
+      return;
+
+   this->is_lhs = new_value;
+
+   /* We need to go through the subexpressions tree to cover cases like
+    * ast_field_selection
+    */
+   if (this->subexpressions[0] != NULL)
+      this->subexpressions[0]->set_is_lhs(new_value);
+}
+
 ir_rvalue *
 ast_expression::do_hir(exec_list *instructions,
                       struct _mesa_glsl_parse_state *state,
@@ -1323,6 +1342,7 @@ ast_expression::do_hir(exec_list *instructions,
      break;

   case ast_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
      op[0] = this->subexpressions[0]->hir(instructions, state);
      op[1] = this->subexpressions[1]->hir(instructions, state);

@@ -1592,6 +1612,7 @@ ast_expression::do_hir(exec_list *instructions,
   case ast_div_assign:
   case ast_add_assign:
   case ast_sub_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
      op[0] = this->subexpressions[0]->hir(instructions, state);
      op[1] = this->subexpressions[1]->hir(instructions, state);

@@ -1618,6 +1639,7 @@ ast_expression::do_hir(exec_list *instructions,
   }

   case ast_mod_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
      op[0] = this->subexpressions[0]->hir(instructions, state);
      op[1] = this->subexpressions[1]->hir(instructions, state);

@@ -1640,6 +1662,7 @@ ast_expression::do_hir(exec_list *instructions,

   case ast_ls_assign:
   case ast_rs_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
      op[0] = this->subexpressions[0]->hir(instructions, state);
      op[1] = this->subexpressions[1]->hir(instructions, state);
      type = shift_result_type(op[0]->type, op[1]->type, this->oper, state,
@@ -1658,6 +1681,7 @@ ast_expression::do_hir(exec_list *instructions,
   case ast_and_assign:
   case ast_xor_assign:
   case ast_or_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
      op[0] = this->subexpressions[0]->hir(instructions, state);
      op[1] = this->subexpressions[1]->hir(instructions, state);
      type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
@@ -1839,6 +1863,11 @@ ast_expression::do_hir(exec_list *instructions,
   case ast_array_index: {
      YYLTYPE index_loc = subexpressions[1]->get_location();

+      /* Detecting whether an array is used uninitialized is beyond what we
+       * get from ir_variable.data.assigned. Setting is_lhs to true
+       * suppresses the uninitialized warning when an array is indexed.
+       */
+      subexpressions[0]->set_is_lhs(true);
      op[0] = subexpressions[0]->hir(instructions, state);
      op[1] = subexpressions[1]->hir(instructions, state);

@@ -1873,6 +1902,14 @@ ast_expression::do_hir(exec_list *instructions,
      if (var != NULL) {
         var->data.used = true;
         result = new(ctx) ir_dereference_variable(var);
+
+         if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_shader_out)
+             && !this->is_lhs
+             && result->variable_referenced()->data.assigned != true
+             && !is_gl_identifier(var->name)) {
+            _mesa_glsl_warning(&loc, state, "`%s' used uninitialized",
+                               this->primary_expression.identifier);
+         }
      } else {
         _mesa_glsl_error(& loc, state, "`%s' undeclared",
                          this->primary_expression.identifier);
@@ -2318,11 +2355,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
            return names[type_idx];
         }
         case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "samplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "samplerBuffer", NULL, NULL, NULL,
+              "imageBuffer", NULL, NULL, NULL
            };
-            return names[type_idx];
+            return names[offset + type_idx];
         }
         case GLSL_SAMPLER_DIM_EXTERNAL: {
            assert(type->base_type == GLSL_TYPE_SAMPLER);
@@ -2380,11 +2417,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
            return names[type_idx];
         }
         case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "isamplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "isamplerBuffer", NULL, NULL, NULL,
+              "iimageBuffer", NULL, NULL, NULL
            };
-            return names[type_idx];
+            return names[offset + type_idx];
         }
         default:
            unreachable("Unsupported isampler/iimage dimensionality");
@@ -2435,11 +2472,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
            return names[type_idx];
         }
         case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "usamplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "usamplerBuffer", NULL, NULL, NULL,
+              "uimageBuffer", NULL, NULL, NULL
            };
-            return names[type_idx];
+            return names[offset + type_idx];
         }
         default:
            unreachable("Unsupported usampler/uimage dimensionality");
@@ -2550,43 +2587,79 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
 }

 static bool
-process_qualifier_constant(struct _mesa_glsl_parse_state *state,
-                           YYLTYPE *loc,
-                           const char *qual_indentifier,
-                           ast_expression *const_expression,
-                           unsigned *value)
-{
-   exec_list dummy_instructions;
-
-   if (const_expression == NULL) {
-      *value = 0;
-      return true;
-   }
-
-   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
-
-   ir_constant *const const_int = ir->constant_expression_value();
-   if (const_int == NULL || !const_int->type->is_integer()) {
-      _mesa_glsl_error(loc, state, "%s must be an integral constant "
-                       "expression", qual_indentifier);
+validate_xfb_buffer_qualifier(YYLTYPE *loc,
+                              struct _mesa_glsl_parse_state *state,
+                              unsigned xfb_buffer) {
+   if (xfb_buffer >= state->Const.MaxTransformFeedbackBuffers) {
+      _mesa_glsl_error(loc, state,
+                       "invalid xfb_buffer specified %d is larger than "
+                       "MAX_TRANSFORM_FEEDBACK_BUFFERS - 1 (%d).",
+                       xfb_buffer,
+                       state->Const.MaxTransformFeedbackBuffers - 1);
      return false;
   }

-   if (const_int->value.i[0] < 0) {
-      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
-                       qual_indentifier, const_int->value.u[0]);
+   return true;
+}
+
+/* From the ARB_enhanced_layouts spec:
+ *
+ *    "Variables and block members qualified with *xfb_offset* can be
+ *    scalars, vectors, matrices, structures, and (sized) arrays of these.
+ *    The offset must be a multiple of the size of the first component of
+ *    the first qualified variable or block member, or a compile-time error
+ *    results.  Further, if applied to an aggregate containing a double,
+ *    the offset must also be a multiple of 8, and the space taken in the
+ *    buffer will be a multiple of 8."
+ */
+static bool
+validate_xfb_offset_qualifier(YYLTYPE *loc,
+                              struct _mesa_glsl_parse_state *state,
+                              int xfb_offset, const glsl_type *type,
+                              unsigned component_size) {
+  const glsl_type *t_without_array = type->without_array();
+
+   if (xfb_offset != -1 && type->is_unsized_array()) {
+      _mesa_glsl_error(loc, state,
+                       "xfb_offset can't be used with unsized arrays.");
      return false;
   }

-   /* If the location is const (and we've verified that
-    * it is) then no instructions should have been emitted
-    * when we converted it to HIR. If they were emitted,
-    * then either the location isn't const after all, or
-    * we are emitting unnecessary instructions.
+   /* Make sure nested structs don't contain unsized arrays, and validate
+    * any xfb_offsets on interface members.
    */
-   assert(dummy_instructions.is_empty());
+   if (t_without_array->is_record() || t_without_array->is_interface())
+      for (unsigned int i = 0; i < t_without_array->length; i++) {
+         const glsl_type *member_t = t_without_array->fields.structure[i].type;
+
+         /* When the interface block doesn't have an xfb_offset qualifier then
+          * we apply the component size rules at the member level.
+          */
+         if (xfb_offset == -1)
+            component_size = member_t->contains_double() ? 8 : 4;
+
+         int xfb_offset = t_without_array->fields.structure[i].offset;
+         validate_xfb_offset_qualifier(loc, state, xfb_offset, member_t,
+                                       component_size);
+      }
+
+  /* Nested structs or interface block without offset may not have had an
+   * offset applied yet so return.
+   */
+   if (xfb_offset == -1) {
+     return true;
+   }
+
+   if (xfb_offset % component_size) {
+      _mesa_glsl_error(loc, state,
+                       "invalid qualifier xfb_offset=%d must be a multiple "
+                       "of the first component size of the first qualified "
+                       "variable or block member. Or double if an aggregate "
+                       "that contains a double (%d).",
+                       xfb_offset, component_size);
+      return false;
+   }

-   *value = const_int->value.u[0];
   return true;
 }

@@ -3151,6 +3224,39 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
      }
   }

+   if (qual->flags.q.out && qual->flags.q.xfb_buffer) {
+      unsigned qual_xfb_buffer;
+      if (process_qualifier_constant(state, loc, "xfb_buffer",
+                                     qual->xfb_buffer, &qual_xfb_buffer) &&
+          validate_xfb_buffer_qualifier(loc, state, qual_xfb_buffer)) {
+         var->data.xfb_buffer = qual_xfb_buffer;
+         if (qual->flags.q.explicit_xfb_buffer)
+            var->data.explicit_xfb_buffer = true;
+      }
+   }
+
+   if (qual->flags.q.explicit_xfb_offset) {
+      unsigned qual_xfb_offset;
+      unsigned component_size = var->type->contains_double() ? 8 : 4;
+
+      if (process_qualifier_constant(state, loc, "xfb_offset",
+                                     qual->offset, &qual_xfb_offset) &&
+          validate_xfb_offset_qualifier(loc, state, (int) qual_xfb_offset,
+                                        var->type, component_size)) {
+         var->data.offset = qual_xfb_offset;
+         var->data.explicit_xfb_offset = true;
+      }
+   }
+
+   if (qual->flags.q.explicit_xfb_stride) {
+      unsigned qual_xfb_stride;
+      if (process_qualifier_constant(state, loc, "xfb_stride",
+                                     qual->xfb_stride, &qual_xfb_stride)) {
+         var->data.xfb_stride = qual_xfb_stride;
+         var->data.explicit_xfb_stride = true;
+      }
+   }
+
   if (var->type->contains_atomic()) {
      if (var->data.mode == ir_var_uniform) {
         if (var->data.explicit_binding) {
@@ -5746,6 +5852,11 @@ ast_switch_statement::test_to_hir(exec_list *instructions,
 {
   void *ctx = state;

+   /* Set is_lhs to avoid a duplicate "use of uninitialized variable" warning
+    * on the switch test expression. The first one would already have been
+    * raised when evaluating test_expression in ast_switch_statement::hir.
+    */
+   test_expression->set_is_lhs(true);
   /* Cache value of test expression. */
   ir_rvalue *const test_val =
      test_expression->hir(instructions,
@@ -6258,6 +6369,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                          ir_variable_mode var_mode,
                                          ast_type_qualifier *layout,
                                          unsigned block_stream,
+                                          unsigned block_xfb_buffer,
+                                          unsigned block_xfb_offset,
                                          unsigned expl_location,
                                          unsigned expl_align)
 {
@@ -6413,6 +6526,35 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
         }
      }

+      int xfb_buffer;
+      unsigned explicit_xfb_buffer = 0;
+      if (qual->flags.q.explicit_xfb_buffer) {
+         unsigned qual_xfb_buffer;
+         if (process_qualifier_constant(state, &loc, "xfb_buffer",
+                                        qual->xfb_buffer, &qual_xfb_buffer)) {
+            explicit_xfb_buffer = 1;
+            if (qual_xfb_buffer != block_xfb_buffer)
+               _mesa_glsl_error(&loc, state, "xfb_buffer layout qualifier on "
+                                "interface block member does not match "
+                                "the interface block (%u vs %u)",
+                                qual_xfb_buffer, block_xfb_buffer);
+         }
+         xfb_buffer = (int) qual_xfb_buffer;
+      } else {
+         if (layout)
+            explicit_xfb_buffer = layout->flags.q.xfb_buffer;
+         xfb_buffer = (int) block_xfb_buffer;
+      }
+
+      int xfb_stride = -1;
+      if (qual->flags.q.explicit_xfb_stride) {
+         unsigned qual_xfb_stride;
+         if (process_qualifier_constant(state, &loc, "xfb_stride",
+                                        qual->xfb_stride, &qual_xfb_stride)) {
+            xfb_stride = (int) qual_xfb_stride;
+         }
+      }
+
      if (qual->flags.q.uniform && qual->has_interpolation()) {
         _mesa_glsl_error(&loc, state,
                          "interpolation qualifiers cannot be used "
@@ -6458,6 +6600,10 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
         fields[i].sample = qual->flags.q.sample ? 1 : 0;
         fields[i].patch = qual->flags.q.patch ? 1 : 0;
         fields[i].precision = qual->precision;
+         fields[i].offset = -1;
+         fields[i].explicit_xfb_buffer = explicit_xfb_buffer;
+         fields[i].xfb_buffer = xfb_buffer;
+         fields[i].xfb_stride = xfb_stride;

         if (qual->flags.q.explicit_location) {
            unsigned qual_location;
@@ -6520,8 +6666,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                   "with std430 and std140 layouts");
               }
            }
-         } else {
-            fields[i].offset = -1;
         }

         if (qual->flags.q.explicit_align || expl_align != 0) {
@@ -6554,6 +6698,32 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
               next_offset = glsl_align(next_offset + size, align);
         }

+         /* From the ARB_enhanced_layouts spec:
+          *
+          *    "The given offset applies to the first component of the first
+          *    member of the qualified entity.  Then, within the qualified
+          *    entity, subsequent components are each assigned, in order, to
+          *    the next available offset aligned to a multiple of that
+          *    component's size.  Aggregate types are flattened down to the
+          *    component level to get this sequence of components."
+          */
+         if (qual->flags.q.explicit_xfb_offset) {
+            unsigned xfb_offset;
+            if (process_qualifier_constant(state, &loc, "xfb_offset",
+                                           qual->offset, &xfb_offset)) {
+               fields[i].offset = xfb_offset;
+               block_xfb_offset = fields[i].offset +
+                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
+            }
+         } else {
+            if (layout && layout->flags.q.explicit_xfb_offset) {
+               unsigned align = field_type->is_double() ? 8 : 4;
+               fields[i].offset = glsl_align(block_xfb_offset, align);
+               block_xfb_offset +=
+                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
+            }
+         }
+
         /* Propogate row- / column-major information down the fields of the
          * structure or interface block.  Structures need this data because
          * the structure may contain a structure that contains ... a matrix
@@ -6648,6 +6818,8 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                ir_var_auto,
                                                layout,
                                                0, /* for interface only */
+                                                0, /* for interface only */
+                                                0, /* for interface only */
                                                expl_location,
                                                0 /* for interface only */);

@@ -6807,6 +6979,29 @@ ast_interface_block::hir(exec_list *instructions,
      return NULL;
   }

+   unsigned qual_xfb_buffer;
+   if (!process_qualifier_constant(state, &loc, "xfb_buffer",
+                                   layout.xfb_buffer, &qual_xfb_buffer) ||
+       !validate_xfb_buffer_qualifier(&loc, state, qual_xfb_buffer)) {
+      return NULL;
+   }
+
+   unsigned qual_xfb_offset;
+   if (layout.flags.q.explicit_xfb_offset) {
+      if (!process_qualifier_constant(state, &loc, "xfb_offset",
+                                      layout.offset, &qual_xfb_offset)) {
+         return NULL;
+      }
+   }
+
+   unsigned qual_xfb_stride;
+   if (layout.flags.q.explicit_xfb_stride) {
+      if (!process_qualifier_constant(state, &loc, "xfb_stride",
+                                      layout.xfb_stride, &qual_xfb_stride)) {
+         return NULL;
+      }
+   }
+
   unsigned expl_location = 0;
   if (layout.flags.q.explicit_location) {
      if (!process_qualifier_constant(state, &loc, "location",
@@ -6842,6 +7037,8 @@ ast_interface_block::hir(exec_list *instructions,
                                                var_mode,
                                                &this->layout,
                                                qual_stream,
+                                                qual_xfb_buffer,
+                                                qual_xfb_offset,
                                                expl_location,
                                                expl_align);

@@ -6956,6 +7153,12 @@ ast_interface_block::hir(exec_list *instructions,
               earlier_per_vertex->fields.structure[j].patch;
            fields[i].precision =
               earlier_per_vertex->fields.structure[j].precision;
+            fields[i].explicit_xfb_buffer =
+               earlier_per_vertex->fields.structure[j].explicit_xfb_buffer;
+            fields[i].xfb_buffer =
+               earlier_per_vertex->fields.structure[j].xfb_buffer;
+            fields[i].xfb_stride =
+               earlier_per_vertex->fields.structure[j].xfb_stride;
         }
      }

@@ -6986,6 +7189,12 @@ ast_interface_block::hir(exec_list *instructions,
                                        packing,
                                        this->block_name);

+   unsigned component_size = block_type->contains_double() ? 8 : 4;
+   int xfb_offset =
+      layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
+   validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type,
+                                 component_size);
+
   if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
      YYLTYPE loc = this->get_location();
      _mesa_glsl_error(&loc, state, "interface block `%s' with type `%s' "
@@ -7207,8 +7416,17 @@ ast_interface_block::hir(exec_list *instructions,
         var->data.patch = fields[i].patch;
         var->data.stream = qual_stream;
         var->data.location = fields[i].location;
+
         if (fields[i].location != -1)
            var->data.explicit_location = true;
+
+         var->data.explicit_xfb_buffer = fields[i].explicit_xfb_buffer;
+         var->data.xfb_buffer = fields[i].xfb_buffer;
+
+         if (fields[i].offset != -1)
+            var->data.explicit_xfb_offset = true;
+         var->data.offset = fields[i].offset;
+
         var->init_interface_type(block_type);

         if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
@@ -79,7 +79,10 @@ ast_type_qualifier::has_layout() const
          || this->flags.q.explicit_index
          || this->flags.q.explicit_binding
          || this->flags.q.explicit_offset
-          || this->flags.q.explicit_stream;
+          || this->flags.q.explicit_stream
+          || this->flags.q.explicit_xfb_buffer
+          || this->flags.q.explicit_xfb_offset
+          || this->flags.q.explicit_xfb_stride;
 }

 bool
@@ -229,6 +232,43 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
      }
   }

+   if (state->has_enhanced_layouts()) {
+      if (!this->flags.q.explicit_xfb_buffer) {
+         if (q.flags.q.xfb_buffer) {
+            this->flags.q.xfb_buffer = 1;
+            this->xfb_buffer = q.xfb_buffer;
+         } else if (!this->flags.q.xfb_buffer && this->flags.q.out) {
+            /* Assign global xfb_buffer value */
+            this->flags.q.xfb_buffer = 1;
+            this->xfb_buffer = state->out_qualifier->xfb_buffer;
+         }
+      }
+
+      if (q.flags.q.explicit_xfb_stride)
+         this->xfb_stride = q.xfb_stride;
+
+      /* Merge all the xfb_stride qualifiers into the global out */
+      if (q.flags.q.explicit_xfb_stride || this->flags.q.xfb_stride) {
+
+         /* Set xfb_stride flag to 0 to avoid adding duplicates every time
+          * there is a merge.
+          */
+         this->flags.q.xfb_stride = 0;
+
+         unsigned buff_idx;
+         if (process_qualifier_constant(state, loc, "xfb_buffer",
+                                        this->xfb_buffer, &buff_idx)) {
+            if (state->out_qualifier->out_xfb_stride[buff_idx]) {
+               state->out_qualifier->out_xfb_stride[buff_idx]->merge_qualifier(
+                  new(state) ast_layout_expression(*loc, this->xfb_stride));
+            } else {
+               state->out_qualifier->out_xfb_stride[buff_idx] =
+                  new(state) ast_layout_expression(*loc, this->xfb_stride);
+            }
+         }
+      }
+   }
+
   if (q.flags.q.vertices) {
      if (this->vertices) {
         this->vertices->merge_qualifier(q.vertices);
@@ -300,7 +340,7 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
   if (q.flags.q.explicit_binding)
      this->binding = q.binding;

-   if (q.flags.q.explicit_offset)
+   if (q.flags.q.explicit_offset || q.flags.q.explicit_xfb_offset)
      this->offset = q.offset;

   if (q.precision != ast_precision_none)
@@ -322,6 +362,8 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
 {
   void *mem_ctx = state;
   const bool r = this->merge_qualifier(loc, state, q, false);
+   ast_type_qualifier valid_out_mask;
+   valid_out_mask.flags.i = 0;

   if (state->stage == MESA_SHADER_GEOMETRY) {
      if (q.flags.q.prim_type) {
@@ -340,13 +382,45 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,

      /* Allow future assigments of global out's stream id value */
      this->flags.q.explicit_stream = 0;
+
+      valid_out_mask.flags.q.stream = 1;
+      valid_out_mask.flags.q.explicit_stream = 1;
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+      valid_out_mask.flags.q.max_vertices = 1;
+      valid_out_mask.flags.q.prim_type = 1;
   } else if (state->stage == MESA_SHADER_TESS_CTRL) {
      if (create_node) {
         node = new(mem_ctx) ast_tcs_output_layout(*loc);
      }
+      valid_out_mask.flags.q.vertices = 1;
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+   } else if (state->stage == MESA_SHADER_TESS_EVAL ||
+              state->stage == MESA_SHADER_VERTEX) {
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
   } else {
      _mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
-                       "tessellation control or geometry shaders");
+                       "geometry, tessellation and vertex shaders");
+      return false;
+   }
+
+   /* Allow future assignments of global out's xfb_buffer and xfb_stride */
+   this->flags.q.explicit_xfb_buffer = 0;
+   this->flags.q.explicit_xfb_stride = 0;
+
+   /* Generate an error when invalid output layout qualifiers are used. */
+   if ((q.flags.i & ~valid_out_mask.flags.i) != 0) {
+      _mesa_glsl_error(loc, state,
+		       "invalid output layout qualifiers used");
+      return false;
   }

   return r;
@@ -566,3 +640,44 @@ ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state

   return true;
 }
+
+bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value)
+{
+   exec_list dummy_instructions;
+
+   if (const_expression == NULL) {
+      *value = 0;
+      return true;
+   }
+
+   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+
+   ir_constant *const const_int = ir->constant_expression_value();
+   if (const_int == NULL || !const_int->type->is_integer()) {
+      _mesa_glsl_error(loc, state, "%s must be an integral constant "
+                       "expression", qual_indentifier);
+      return false;
+   }
+
+   if (const_int->value.i[0] < 0) {
+      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
+                       qual_indentifier, const_int->value.u[0]);
+      return false;
+   }
+
+   /* If the location is const (and we've verified that
+    * it is) then no instructions should have been emitted
+    * when we converted it to HIR. If they were emitted,
+    * then either the location isn't const after all, or
+    * we are emitting unnecessary instructions.
+    */
+   assert(dummy_instructions.is_empty());
+
+   *value = const_int->value.u[0];
+   return true;
+}
@@ -129,12 +129,6 @@ v130_fs_only(const _mesa_glsl_parse_state *state)
          state->stage == MESA_SHADER_FRAGMENT;
 }

-static bool
-v140(const _mesa_glsl_parse_state *state)
-{
-   return state->is_version(140, 0);
-}
-
 static bool
 v140_or_es3(const _mesa_glsl_parse_state *state)
 {
@@ -183,6 +177,14 @@ v110_lod(const _mesa_glsl_parse_state *state)
   return !state->es_shader && lod_exists_in_stage(state);
 }

+/**
+ * Availability predicate used for the samplerBuffer overloads of the
+ * built-ins.
+ *
+ * Holds when state->is_version(140, 320) is true, or when either the EXT or
+ * the OES texture_buffer extension flag is set on the parse state.
+ */
+static bool
+texture_buffer(const _mesa_glsl_parse_state *state)
+{
+   if (state->is_version(140, 320))
+      return true;
+
+   return state->EXT_texture_buffer_enable ||
+          state->OES_texture_buffer_enable;
+}
+
 static bool
 shader_texture_lod(const _mesa_glsl_parse_state *state)
 {
@@ -262,10 +264,12 @@ shader_packing_or_es31_or_gpu_shader5(const _mesa_glsl_parse_state *state)
 }

+/**
+ * Availability predicate passed to MAKE_SIG by the _interpolateAtCentroid,
+ * _interpolateAtOffset and _interpolateAtSample builders.
+ *
+ * Requires the fragment stage, and in addition one of:
+ * state->is_version(400, 320), ARB_gpu_shader5 enabled, or
+ * OES_shader_multisample_interpolation enabled.
+ */
 static bool
-fs_gpu_shader5(const _mesa_glsl_parse_state *state)
+fs_interpolate_at(const _mesa_glsl_parse_state *state)
 {
    return state->stage == MESA_SHADER_FRAGMENT &&
-          (state->is_version(400, 0) || state->ARB_gpu_shader5_enable);
+          (state->is_version(400, 320) ||
+           state->ARB_gpu_shader5_enable ||
+           state->OES_shader_multisample_interpolation_enable);
 }


@@ -1581,9 +1585,9 @@ builtin_builder::create_builtins()
                _textureSize(v130, glsl_type::ivec2_type, glsl_type::usampler2DRect_type),
                _textureSize(v130, glsl_type::ivec2_type, glsl_type::sampler2DRectShadow_type),

-                _textureSize(v140, glsl_type::int_type,   glsl_type::samplerBuffer_type),
-                _textureSize(v140, glsl_type::int_type,   glsl_type::isamplerBuffer_type),
-                _textureSize(v140, glsl_type::int_type,   glsl_type::usamplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::samplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::isamplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::usamplerBuffer_type),
                _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::sampler2DMS_type),
                _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::isampler2DMS_type),
                _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::usampler2DMS_type),
@@ -1855,9 +1859,9 @@ builtin_builder::create_builtins()
                _texelFetch(v130, glsl_type::ivec4_type, glsl_type::isampler2DArray_type, glsl_type::ivec3_type),
                _texelFetch(v130, glsl_type::uvec4_type, glsl_type::usampler2DArray_type, glsl_type::ivec3_type),

-                _texelFetch(v140, glsl_type::vec4_type,  glsl_type::samplerBuffer_type,  glsl_type::int_type),
-                _texelFetch(v140, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
-                _texelFetch(v140, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::vec4_type,  glsl_type::samplerBuffer_type,  glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),

                _texelFetch(texture_multisample, glsl_type::vec4_type,  glsl_type::sampler2DMS_type,  glsl_type::ivec2_type),
                _texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMS_type, glsl_type::ivec2_type),
@@ -5163,7 +5167,7 @@ builtin_builder::_interpolateAtCentroid(const glsl_type *type)
 {
   ir_variable *interpolant = in_var(type, "interpolant");
   interpolant->data.must_be_shader_input = 1;
-   MAKE_SIG(type, fs_gpu_shader5, 1, interpolant);
+   MAKE_SIG(type, fs_interpolate_at, 1, interpolant);

   body.emit(ret(interpolate_at_centroid(interpolant)));

@@ -5176,7 +5180,7 @@ builtin_builder::_interpolateAtOffset(const glsl_type *type)
   ir_variable *interpolant = in_var(type, "interpolant");
   interpolant->data.must_be_shader_input = 1;
   ir_variable *offset = in_var(glsl_type::vec2_type, "offset");
-   MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, offset);
+   MAKE_SIG(type, fs_interpolate_at, 2, interpolant, offset);

   body.emit(ret(interpolate_at_offset(interpolant, offset)));

@@ -5189,7 +5193,7 @@ builtin_builder::_interpolateAtSample(const glsl_type *type)
   ir_variable *interpolant = in_var(type, "interpolant");
   interpolant->data.must_be_shader_input = 1;
   ir_variable *sample_num = in_var(glsl_type::int_type, "sample_num");
-   MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, sample_num);
+   MAKE_SIG(type, fs_interpolate_at, 2, interpolant, sample_num);

   body.emit(ret(interpolate_at_sample(interpolant, sample_num)));

@@ -179,7 +179,7 @@ static const struct builtin_type_versions {
   T(sampler2DArray,                  130, 300)
   T(samplerCubeArray,                400, 999)
   T(sampler2DRect,                   140, 999)
-   T(samplerBuffer,                   140, 999)
+   T(samplerBuffer,                   140, 320)
   T(sampler2DMS,                     150, 310)
   T(sampler2DMSArray,                150, 999)

@@ -191,7 +191,7 @@ static const struct builtin_type_versions {
   T(isampler2DArray,                 130, 300)
   T(isamplerCubeArray,               400, 999)
   T(isampler2DRect,                  140, 999)
-   T(isamplerBuffer,                  140, 999)
+   T(isamplerBuffer,                  140, 320)
   T(isampler2DMS,                    150, 310)
   T(isampler2DMSArray,               150, 999)

@@ -203,7 +203,7 @@ static const struct builtin_type_versions {
   T(usampler2DArray,                 130, 300)
   T(usamplerCubeArray,               400, 999)
   T(usampler2DRect,                  140, 999)
-   T(usamplerBuffer,                  140, 999)
+   T(usamplerBuffer,                  140, 320)
   T(usampler2DMS,                    150, 310)
   T(usampler2DMSArray,               150, 999)

@@ -222,7 +222,7 @@ static const struct builtin_type_versions {
   T(image3D,                         420, 310)
   T(image2DRect,                     420, 999)
   T(imageCube,                       420, 310)
-   T(imageBuffer,                     420, 999)
+   T(imageBuffer,                     420, 320)
   T(image1DArray,                    420, 999)
   T(image2DArray,                    420, 310)
   T(imageCubeArray,                  420, 999)
@@ -233,7 +233,7 @@ static const struct builtin_type_versions {
   T(iimage3D,                        420, 310)
   T(iimage2DRect,                    420, 999)
   T(iimageCube,                      420, 310)
-   T(iimageBuffer,                    420, 999)
+   T(iimageBuffer,                    420, 320)
   T(iimage1DArray,                   420, 999)
   T(iimage2DArray,                   420, 310)
   T(iimageCubeArray,                 420, 999)
@@ -244,7 +244,7 @@ static const struct builtin_type_versions {
   T(uimage3D,                        420, 310)
   T(uimage2DRect,                    420, 999)
   T(uimageCube,                      420, 310)
-   T(uimageBuffer,                    420, 999)
+   T(uimageBuffer,                    420, 320)
   T(uimage1DArray,                   420, 999)
   T(uimage2DArray,                   420, 310)
   T(uimageCubeArray,                 420, 999)
@@ -371,6 +371,16 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
      add_type(symbols, glsl_type::uimage2DMSArray_type);
   }

+   if (state->EXT_texture_buffer_enable || state->OES_texture_buffer_enable) {
+      add_type(symbols, glsl_type::samplerBuffer_type);
+      add_type(symbols, glsl_type::isamplerBuffer_type);
+      add_type(symbols, glsl_type::usamplerBuffer_type);
+
+      add_type(symbols, glsl_type::imageBuffer_type);
+      add_type(symbols, glsl_type::iimageBuffer_type);
+      add_type(symbols, glsl_type::uimageBuffer_type);
+   }
+
   if (state->has_atomic_counters()) {
      add_type(symbols, glsl_type::atomic_uint_type);
   }
@@ -334,6 +334,9 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
   this->fields[this->num_fields].image_coherent = 0;
   this->fields[this->num_fields].image_volatile = 0;
   this->fields[this->num_fields].image_restrict = 0;
+   this->fields[this->num_fields].explicit_xfb_buffer = 0;
+   this->fields[this->num_fields].xfb_buffer = -1;
+   this->fields[this->num_fields].xfb_stride = -1;
   this->num_fields++;
 }

@@ -812,6 +815,13 @@ builtin_variable_generator::generate_constants()
       */
   }

+   if (state->has_enhanced_layouts()) {
+      add_const("gl_MaxTransformFeedbackBuffers",
+                state->Const.MaxTransformFeedbackBuffers);
+      add_const("gl_MaxTransformFeedbackInterleavedComponents",
+                state->Const.MaxTransformFeedbackInterleavedComponents);
+   }
+
   if (state->is_version(420, 310) ||
       state->ARB_shader_image_load_store_enable) {
      add_const("gl_MaxImageUnits",
@@ -868,6 +878,10 @@ builtin_variable_generator::generate_constants()
      add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents);
      add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
   }
+
+   if (state->is_version(450, 320) ||
+       state->OES_sample_variables_enable)
+      add_const("gl_MaxSamples", state->Const.MaxSamples);
 }


@@ -877,7 +891,9 @@ builtin_variable_generator::generate_constants()
 void
 builtin_variable_generator::generate_uniforms()
 {
-   if (state->is_version(400, 0) || state->ARB_sample_shading_enable)
+   if (state->is_version(400, 320) ||
+       state->ARB_sample_shading_enable ||
+       state->OES_sample_variables_enable)
      add_uniform(int_t, "gl_NumSamples");
   add_uniform(type("gl_DepthRangeParameters"), "gl_DepthRange");
   add_uniform(array(vec4_t, VERT_ATTRIB_MAX), "gl_CurrentAttribVertMESA");
@@ -1130,7 +1146,9 @@ builtin_variable_generator::generate_fs_special_vars()
         var->enable_extension_warning("GL_AMD_shader_stencil_export");
   }

-   if (state->is_version(400, 0) || state->ARB_sample_shading_enable) {
+   if (state->is_version(400, 320) ||
+       state->ARB_sample_shading_enable ||
+       state->OES_sample_variables_enable) {
      add_system_value(SYSTEM_VALUE_SAMPLE_ID, int_t, "gl_SampleID");
      add_system_value(SYSTEM_VALUE_SAMPLE_POS, vec2_t, "gl_SamplePosition");
      /* From the ARB_sample_shading specification:
@@ -1143,7 +1161,9 @@ builtin_variable_generator::generate_fs_special_vars()
      add_output(FRAG_RESULT_SAMPLE_MASK, array(int_t, 1), "gl_SampleMask");
   }

-   if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable) {
+   if (state->is_version(400, 320) ||
+       state->ARB_gpu_shader5_enable ||
+       state->OES_sample_variables_enable) {
      add_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN, array(int_t, 1), "gl_SampleMaskIn");
   }

@@ -2371,6 +2371,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	   if (extensions != NULL) {
 	      if (extensions->OES_EGL_image_external)
 	         add_builtin_define(parser, "GL_OES_EGL_image_external", 1);
+              if (extensions->OES_sample_variables) {
+                 add_builtin_define(parser, "GL_OES_sample_variables", 1);
+                 add_builtin_define(parser, "GL_OES_shader_multisample_interpolation", 1);
+              }
              if (extensions->OES_standard_derivatives)
                 add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
              if (extensions->ARB_texture_multisample)
@@ -2390,6 +2394,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
                    add_builtin_define(parser, "GL_EXT_gpu_shader5", 1);
                    add_builtin_define(parser, "GL_OES_gpu_shader5", 1);
                 }
+                 if (extensions->OES_texture_buffer) {
+                    add_builtin_define(parser, "GL_EXT_texture_buffer", 1);
+                    add_builtin_define(parser, "GL_OES_texture_buffer", 1);
+                 }
              }
 	   }
 	} else {
@@ -369,7 +369,7 @@ image2D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 image3D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE3D);
 image2DRect     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DRECT);
 imageCube       KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE);
-imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGEBUFFER);
+imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IMAGEBUFFER);
 image1DArray    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1DARRAY);
 image2DArray    KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY);
 imageCubeArray  KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBEARRAY);
@@ -380,7 +380,7 @@ iimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 iimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D);
 iimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DRECT);
 iimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE);
-iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGEBUFFER);
+iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IIMAGEBUFFER);
 iimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1DARRAY);
 iimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY);
 iimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBEARRAY);
@@ -391,7 +391,7 @@ uimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 uimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D);
 uimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DRECT);
 uimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE);
-uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGEBUFFER);
+uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, UIMAGEBUFFER);
 uimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1DARRAY);
 uimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY);
 uimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBEARRAY);
@@ -472,6 +472,13 @@ layout		{
 \.[0-9]+([eE][+-]?[0-9]+)?[fF]?		|
 [0-9]+\.([eE][+-]?[0-9]+)?[fF]?		|
 [0-9]+[eE][+-]?[0-9]+[fF]?		{
+			    struct _mesa_glsl_parse_state *state = yyextra;
+			    char suffix = yytext[strlen(yytext) - 1];
+			    if (!state->is_version(120, 300) &&
+			        (suffix == 'f' || suffix == 'F')) {
+			        _mesa_glsl_error(yylloc, state,
+			                         "Float suffixes are invalid in GLSL 1.10");
+			    }
 			    yylval->real = _mesa_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
@@ -565,19 +572,19 @@ common		KEYWORD(130, 300, 0, 0, COMMON);
 partition	KEYWORD(130, 300, 0, 0, PARTITION);
 active		KEYWORD(130, 300, 0, 0, ACTIVE);
 superp		KEYWORD(130, 100, 0, 0, SUPERP);
-samplerBuffer	KEYWORD(130, 300, 140, 0, SAMPLERBUFFER);
+samplerBuffer	KEYWORD_WITH_ALT(130, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, SAMPLERBUFFER);
 filter		KEYWORD(130, 300, 0, 0, FILTER);
 row_major	KEYWORD_WITH_ALT(130, 0, 140, 0, yyextra->ARB_uniform_buffer_object_enable && !yyextra->es_shader, ROW_MAJOR);

    /* Additional reserved words in GLSL 1.40 */
 isampler2DRect	KEYWORD(140, 300, 140, 0, ISAMPLER2DRECT);
 usampler2DRect	KEYWORD(140, 300, 140, 0, USAMPLER2DRECT);
-isamplerBuffer	KEYWORD(140, 300, 140, 0, ISAMPLERBUFFER);
-usamplerBuffer	KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
+isamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, ISAMPLERBUFFER);
+usamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, USAMPLERBUFFER);

    /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
-sample		KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
+sample		KEYWORD_WITH_ALT(400, 300, 400, 320, yyextra->ARB_gpu_shader5_enable || yyextra->OES_shader_multisample_interpolation_enable, SAMPLE);
 subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE);


@@ -1541,6 +1541,25 @@ layout_qualifier_id:
         }
      }

+      if (state->has_enhanced_layouts()) {
+         if (match_layout_qualifier("xfb_buffer", $1, state) == 0) {
+            $$.flags.q.xfb_buffer = 1;
+            $$.flags.q.explicit_xfb_buffer = 1;
+            $$.xfb_buffer = $3;
+         }
+
+         if (match_layout_qualifier("xfb_offset", $1, state) == 0) {
+            $$.flags.q.explicit_xfb_offset = 1;
+            $$.offset = $3;
+         }
+
+         if (match_layout_qualifier("xfb_stride", $1, state) == 0) {
+            $$.flags.q.xfb_stride = 1;
+            $$.flags.q.explicit_xfb_stride = 1;
+            $$.xfb_stride = $3;
+         }
+      }
+
      static const char * const local_size_qualifiers[3] = {
         "local_size_x",
         "local_size_y",
@@ -1915,6 +1934,12 @@ storage_qualifier:
          $$.flags.q.explicit_stream = 0;
          $$.stream = state->out_qualifier->stream;
      }
+
+      if (state->has_enhanced_layouts()) {
+          $$.flags.q.xfb_buffer = 1;
+          $$.flags.q.explicit_xfb_buffer = 0;
+          $$.xfb_buffer = state->out_qualifier->xfb_buffer;
+      }
   }
   | UNIFORM
   {
@@ -140,6 +140,10 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
   this->Const.MaxAtomicCounterBufferSize =
      ctx->Const.MaxAtomicBufferSize;

+   /* ARB_enhanced_layouts constants */
+   this->Const.MaxTransformFeedbackBuffers = ctx->Const.MaxTransformFeedbackBuffers;
+   this->Const.MaxTransformFeedbackInterleavedComponents = ctx->Const.MaxTransformFeedbackInterleavedComponents;
+
   /* Compute shader constants */
   for (unsigned i = 0; i < ARRAY_SIZE(this->Const.MaxComputeWorkGroupCount); i++)
      this->Const.MaxComputeWorkGroupCount[i] = ctx->Const.MaxComputeWorkGroupCount[i];
@@ -177,6 +181,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
   this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents;
   this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents;

+   /* GL 4.5 / OES_sample_variables */
+   this->Const.MaxSamples = ctx->Const.MaxSamples;
+
   this->current_function = NULL;
   this->toplevel_ir = NULL;
   this->found_return = false;
@@ -610,9 +617,12 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
   EXT(OES_geometry_point_size,        false, true,      OES_geometry_shader),
   EXT(OES_geometry_shader,            false, true,      OES_geometry_shader),
   EXT(OES_gpu_shader5,                false, true,      ARB_gpu_shader5),
+   EXT(OES_sample_variables,           false, true,      OES_sample_variables),
   EXT(OES_shader_image_atomic,        false, true,      ARB_shader_image_load_store),
+   EXT(OES_shader_multisample_interpolation, false, true, OES_sample_variables),
   EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
   EXT(OES_texture_3D,                 false, true,      dummy_true),
+   EXT(OES_texture_buffer,             false, true,      OES_texture_buffer),
   EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample),

   /* All other extensions go here, sorted alphabetically.
@@ -629,6 +639,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
   EXT(EXT_shader_integer_mix,         true,  true,      EXT_shader_integer_mix),
   EXT(EXT_shader_samples_identical,   true,  true,      EXT_shader_samples_identical),
   EXT(EXT_texture_array,              true,  false,     EXT_texture_array),
+   EXT(EXT_texture_buffer,             false, true,      OES_texture_buffer),
 };

 #undef EXT
@@ -935,6 +946,13 @@ _mesa_ast_process_interface_block(YYLTYPE *locp,
      block->layout.stream = state->out_qualifier->stream;
   }

+   if (state->has_enhanced_layouts() && block->layout.flags.q.out) {
+      /* Assign global layout's xfb_buffer value. */
+      block->layout.flags.q.xfb_buffer = 1;
+      block->layout.flags.q.explicit_xfb_buffer = 0;
+      block->layout.xfb_buffer = state->out_qualifier->xfb_buffer;
+   }
+
   foreach_list_typed (ast_declarator_list, member, link, &block->declarations) {
      ast_type_qualifier& qualifier = member->type->qualifier;
      if ((qualifier.flags.i & interface_type_mask) == 0) {
@@ -1206,6 +1224,7 @@ ast_expression::ast_expression(int oper,
   this->subexpressions[1] = ex1;
   this->subexpressions[2] = ex2;
   this->non_lvalue_description = NULL;
+   this->is_lhs = false;
 }


@@ -1583,13 +1602,12 @@ set_shader_inout_layout(struct gl_shader *shader,
 		     struct _mesa_glsl_parse_state *state)
 {
   /* Should have been prevented by the parser. */
-   if (shader->Stage == MESA_SHADER_TESS_CTRL) {
+   if (shader->Stage == MESA_SHADER_TESS_CTRL ||
+       shader->Stage == MESA_SHADER_VERTEX) {
      assert(!state->in_qualifier->flags.i);
-   } else if (shader->Stage == MESA_SHADER_TESS_EVAL) {
-      assert(!state->out_qualifier->flags.i);
-   } else if (shader->Stage != MESA_SHADER_GEOMETRY) {
+   } else if (shader->Stage != MESA_SHADER_GEOMETRY &&
+              shader->Stage != MESA_SHADER_TESS_EVAL) {
      assert(!state->in_qualifier->flags.i);
-      assert(!state->out_qualifier->flags.i);
   }

   if (shader->Stage != MESA_SHADER_COMPUTE) {
@@ -1606,6 +1624,17 @@ set_shader_inout_layout(struct gl_shader *shader,
      assert(!state->fs_early_fragment_tests);
   }

+   for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
+      if (state->out_qualifier->out_xfb_stride[i]) {
+         unsigned xfb_stride;
+         if (state->out_qualifier->out_xfb_stride[i]->
+                process_qualifier_constant(state, "xfb_stride", &xfb_stride,
+                true)) {
+            shader->TransformFeedback.BufferStride[i] = xfb_stride;
+         }
+      }
+   }
+
   switch (shader->Stage) {
   case MESA_SHADER_TESS_CTRL:
      shader->TessCtrl.VerticesOut = 0;
@@ -383,6 +383,10 @@ struct _mesa_glsl_parse_state {
      /* ARB_draw_buffers */
      unsigned MaxDrawBuffers;

+      /* ARB_enhanced_layouts */
+      unsigned MaxTransformFeedbackBuffers;
+      unsigned MaxTransformFeedbackInterleavedComponents;
+
      /* ARB_blend_func_extended */
      unsigned MaxDualSourceDrawBuffers;

@@ -457,6 +461,9 @@ struct _mesa_glsl_parse_state {
      unsigned MaxTessControlTotalOutputComponents;
      unsigned MaxTessControlUniformComponents;
      unsigned MaxTessEvaluationUniformComponents;
+
+      /* GL 4.5 / OES_sample_variables */
+      unsigned MaxSamples;
   } Const;

   /**
@@ -597,12 +604,18 @@ struct _mesa_glsl_parse_state {
   bool OES_geometry_shader_warn;
   bool OES_gpu_shader5_enable;
   bool OES_gpu_shader5_warn;
+   bool OES_sample_variables_enable;
+   bool OES_sample_variables_warn;
   bool OES_shader_image_atomic_enable;
   bool OES_shader_image_atomic_warn;
+   bool OES_shader_multisample_interpolation_enable;
+   bool OES_shader_multisample_interpolation_warn;
   bool OES_standard_derivatives_enable;
   bool OES_standard_derivatives_warn;
   bool OES_texture_3D_enable;
   bool OES_texture_3D_warn;
+   bool OES_texture_buffer_enable;
+   bool OES_texture_buffer_warn;
   bool OES_texture_storage_multisample_2d_array_enable;
   bool OES_texture_storage_multisample_2d_array_warn;

@@ -632,6 +645,8 @@ struct _mesa_glsl_parse_state {
   bool EXT_shader_samples_identical_warn;
   bool EXT_texture_array_enable;
   bool EXT_texture_array_warn;
+   bool EXT_texture_buffer_enable;
+   bool EXT_texture_buffer_warn;
   /*@}*/

   /** Extensions supported by the OpenGL implementation. */
@@ -726,6 +726,21 @@ public:
       */
      unsigned is_xfb_only:1;

+      /**
+       * Was a transform feedback buffer set in the shader?
+       */
+      unsigned explicit_xfb_buffer:1;
+
+      /**
+       * Was a transform feedback offset set in the shader?
+       */
+      unsigned explicit_xfb_offset:1;
+
+      /**
+       * Was a transform feedback stride set in the shader?
+       */
+      unsigned explicit_xfb_stride:1;
+
      /**
       * If non-zero, then this variable may be packed along with other variables
       * into a single varying slot, so this offset should be applied when
@@ -742,21 +757,9 @@ public:

      /**
       * Non-zero if this variable was created by lowering a named interface
-       * block which was not an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_array will never
-       * both be non-zero.
+       * block.
       */
-      unsigned from_named_ifc_block_nonarray:1;
-
-      /**
-       * Non-zero if this variable was created by lowering a named interface
-       * block which was an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_nonarray will never
-       * both be non-zero.
-       */
-      unsigned from_named_ifc_block_array:1;
+      unsigned from_named_ifc_block:1;

      /**
       * Non-zero if the variable must be a shader input. This is useful for
@@ -873,7 +876,7 @@ public:
      unsigned stream;

      /**
-       * Atomic or block member offset.
+       * Atomic, transform feedback or block member offset.
       */
      unsigned offset;

@@ -884,6 +887,16 @@ public:
       */
      unsigned max_array_access;

+      /**
+       * Transform feedback buffer.
+       */
+      unsigned xfb_buffer;
+
+      /**
+       * Transform feedback stride.
+       */
+      unsigned xfb_stride;
+
      /**
       * Allow (only) ir_variable direct access private members.
       */
@@ -105,11 +105,6 @@ struct gl_uniform_storage {
    */
   unsigned array_elements;

-   /**
-    * Has this uniform ever been set?
-    */
-   bool initialized;
-
   struct gl_opaque_uniform_index opaque[MESA_SHADER_STAGES];

   /**
@@ -242,7 +242,8 @@ public:
         return entry ? (ir_variable *) entry->data : NULL;
      } else {
         const struct hash_entry *entry =
-            _mesa_hash_table_search(ht, var->get_interface_type()->name);
+            _mesa_hash_table_search(ht,
+               var->get_interface_type()->without_array()->name);
         return entry ? (ir_variable *) entry->data : NULL;
      }
   }
@@ -263,7 +264,8 @@ public:
         snprintf(location_str, 11, "%d", var->data.location);
         _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var);
      } else {
-         _mesa_hash_table_insert(ht, var->get_interface_type()->name, var);
+         _mesa_hash_table_insert(ht,
+            var->get_interface_type()->without_array()->name, var);
      }
   }

@@ -162,8 +162,6 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
            }
         }
      }
-
-      storage->initialized = true;
   }
 }

@@ -183,7 +181,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)

         if (stage_index != -1) {
            struct gl_shader *sh = prog->_LinkedShaders[i];
-            sh->BufferInterfaceBlocks[stage_index].Binding = binding;
+            sh->BufferInterfaceBlocks[stage_index]->Binding = binding;
         }
      }
 }
@@ -267,8 +265,6 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
         }
      }
   }
-
-   storage->initialized = true;
 }
 }

@@ -68,7 +68,7 @@ program_resource_visitor::process(const glsl_type *type, const char *name)
   unsigned packing = type->interface_packing;

   recursion(type, &name_copy, strlen(name), false, NULL, packing, false,
-             record_array_count);
+             record_array_count, NULL);
   ralloc_free(name_copy);
 }

@@ -76,8 +76,6 @@ void
 program_resource_visitor::process(ir_variable *var)
 {
   unsigned record_array_count = 1;
-   const glsl_type *t = var->type;
-   const glsl_type *t_without_array = var->type->without_array();
   const bool row_major =
      var->data.matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;

@@ -85,80 +83,28 @@ program_resource_visitor::process(ir_variable *var)
      var->get_interface_type()->interface_packing :
      var->type->interface_packing;

+   const glsl_type *t =
+      var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+   const glsl_type *t_without_array = t->without_array();
+
   /* false is always passed for the row_major parameter to the other
    * processing functions because no information is available to do
    * otherwise.  See the warning in linker.h.
    */
-
-   /* Only strdup the name if we actually will need to modify it. */
-   if (var->data.from_named_ifc_block_array) {
-      /* lower_named_interface_blocks created this variable by lowering an
-       * interface block array to an array variable.  For example if the
-       * original source code was:
-       *
-       *     out Blk { vec4 bar } foo[3];
-       *
-       * Then the variable is now:
-       *
-       *     out vec4 bar[3];
-       *
-       * We need to visit each array element using the names constructed like
-       * so:
-       *
-       *     Blk[0].bar
-       *     Blk[1].bar
-       *     Blk[2].bar
-       */
-      assert(t->is_array());
-      const glsl_type *ifc_type = var->get_interface_type();
-      char *name = ralloc_strdup(NULL, ifc_type->name);
-      size_t name_length = strlen(name);
-      for (unsigned i = 0; i < t->length; i++) {
-         size_t new_length = name_length;
-         ralloc_asprintf_rewrite_tail(&name, &new_length, "[%u].%s", i,
-                                      var->name);
-         /* Note: row_major is only meaningful for uniform blocks, and
-          * lowering is only applied to non-uniform interface blocks, so we
-          * can safely pass false for row_major.
-          */
-         recursion(var->type, &name, new_length, row_major, NULL, packing,
-                   false, record_array_count);
-      }
-      ralloc_free(name);
-   } else if (var->data.from_named_ifc_block_nonarray) {
-      /* lower_named_interface_blocks created this variable by lowering a
-       * named interface block (non-array) to an ordinary variable.  For
-       * example if the original source code was:
-       *
-       *     out Blk { vec4 bar } foo;
-       *
-       * Then the variable is now:
-       *
-       *     out vec4 bar;
-       *
-       * We need to visit this variable using the name:
-       *
-       *     Blk.bar
-       */
-      const glsl_type *ifc_type = var->get_interface_type();
-      char *name = ralloc_asprintf(NULL, "%s.%s", ifc_type->name, var->name);
-      /* Note: row_major is only meaningful for uniform blocks, and lowering
-       * is only applied to non-uniform interface blocks, so we can safely
-       * pass false for row_major.
-       */
-      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
-      ralloc_free(name);
-   } else if (t_without_array->is_record() ||
+   if (t_without_array->is_record() ||
              (t->is_array() && t->fields.array->is_array())) {
      char *name = ralloc_strdup(NULL, var->name);
      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
+                false, record_array_count, NULL);
      ralloc_free(name);
   } else if (t_without_array->is_interface()) {
      char *name = ralloc_strdup(NULL, t_without_array->name);
-      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
+      const glsl_struct_field *ifc_member = var->data.from_named_ifc_block ?
+         &t_without_array->
+            fields.structure[t_without_array->field_index(var->name)] : NULL;
+
+      recursion(t, &name, strlen(name), row_major, NULL, packing,
+                false, record_array_count, ifc_member);
      ralloc_free(name);
   } else {
      this->set_record_array_count(record_array_count);
@@ -172,7 +118,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
                                    const glsl_type *record_type,
                                    const unsigned packing,
                                    bool last_field,
-                                    unsigned record_array_count)
+                                    unsigned record_array_count,
+                                    const glsl_struct_field *named_ifc_member)
 {
   /* Records need to have each field processed individually.
    *
@@ -180,7 +127,12 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
    * individually, then each field of the resulting array elements processed
    * individually.
    */
-   if (t->is_record() || t->is_interface()) {
+   if (t->is_interface() && named_ifc_member) {
+      ralloc_asprintf_rewrite_tail(name, &name_length, ".%s",
+                                   named_ifc_member->name);
+      recursion(named_ifc_member->type, name, name_length, row_major, NULL,
+                packing, false, record_array_count, NULL);
+   } else if (t->is_record() || t->is_interface()) {
      if (record_type == NULL && t->is_record())
         record_type = t;

@@ -223,7 +175,7 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
                   field_row_major,
                   record_type,
                   packing,
-                   (i + 1) == t->length, record_array_count);
+                   (i + 1) == t->length, record_array_count, NULL);

         /* Only the first leaf-field of the record gets called with the
          * record type pointer.
@@ -258,7 +210,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
         recursion(t->fields.array, name, new_length, row_major,
                   record_type,
                   packing,
-                   (i + 1) == t->length, record_array_count);
+                   (i + 1) == t->length, record_array_count,
+                   named_ifc_member);

         /* Only the first leaf-field of the record gets called with the
          * record type pointer.
@@ -799,7 +752,6 @@ private:

      this->uniforms[id].name = ralloc_strdup(this->uniforms, name);
      this->uniforms[id].type = base_type;
-      this->uniforms[id].initialized = 0;
      this->uniforms[id].num_driver_storage = 0;
      this->uniforms[id].driver_storage = NULL;
      this->uniforms[id].atomic_buffer_index = -1;
@@ -954,6 +906,8 @@ link_cross_validate_uniform_block(void *mem_ctx,
          new_block->Uniforms,
          sizeof(*linked_block->Uniforms) * linked_block->NumUniforms);

+   linked_block->Name = ralloc_strdup(*linked_blocks, linked_block->Name);
+
   for (unsigned int i = 0; i < linked_block->NumUniforms; i++) {
      struct gl_uniform_buffer_variable *ubo_var =
         &linked_block->Uniforms[i];
@@ -1005,9 +959,9 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)

      const unsigned l = strlen(var->name);
      for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) {
-         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) {
+         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i]->NumUniforms; j++) {
            if (sentinel) {
-               const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name;
+               const char *begin = shader->BufferInterfaceBlocks[i]->Uniforms[j].Name;
               const char *end = strchr(begin, sentinel);

               if (end == NULL)
@@ -1022,7 +976,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
                  break;
               }
            } else if (!strcmp(var->name,
-                               shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) {
+                               shader->BufferInterfaceBlocks[i]->Uniforms[j].Name)) {
               found = true;
               var->data.location = j;
               break;
@@ -1148,9 +1102,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
      sh->num_combined_uniform_components = sh->num_uniform_components;

      for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) {
-         if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) {
+         if (!sh->BufferInterfaceBlocks[i]->IsShaderStorage) {
            sh->num_combined_uniform_components +=
-               sh->BufferInterfaceBlocks[i].UniformBufferSize / 4;
+               sh->BufferInterfaceBlocks[i]->UniformBufferSize / 4;
         }
      }
   }
@@ -63,6 +63,125 @@ get_varying_type(const ir_variable *var, gl_shader_stage stage)
   return type;
 }

+/**
+ * Recursively expand one xfb-qualified output into the names of its leaf
+ * varyings.
+ *
+ * \param mem_ctx          ralloc context used for the generated name strings
+ * \param t                type currently being walked
+ * \param name             scratch string holding the name built so far; its
+ *                         tail is rewritten in place at every level
+ * \param name_length      length of the prefix of \c *name that is kept
+ * \param count            in/out index of the next free \c varying_names slot
+ * \param ifc_member_name  member to select once an interface type is reached
+ * \param ifc_member_t     type of that interface member
+ * \param varying_names    in/out array receiving one string per leaf
+ */
+static void
+create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name,
+                         size_t name_length, unsigned *count,
+                         const char *ifc_member_name,
+                         const glsl_type *ifc_member_t, char ***varying_names)
+{
+   if (t->is_interface()) {
+      size_t tail = name_length;
+
+      assert(ifc_member_name && ifc_member_t);
+      ralloc_asprintf_rewrite_tail(name, &tail, ".%s", ifc_member_name);
+
+      /* The member has been selected, so it is not forwarded any further. */
+      create_xfb_varying_names(mem_ctx, ifc_member_t, name, tail, count,
+                               NULL, NULL, varying_names);
+      return;
+   }
+
+   if (t->is_record()) {
+      /* Visit every field as "<prefix>.<field>". */
+      for (unsigned f = 0; f < t->length; f++) {
+         const glsl_struct_field &member = t->fields.structure[f];
+         size_t tail = name_length;
+
+         ralloc_asprintf_rewrite_tail(name, &tail, ".%s", member.name);
+         create_xfb_varying_names(mem_ctx, member.type, name, tail, count,
+                                  NULL, NULL, varying_names);
+      }
+      return;
+   }
+
+   const glsl_type *const base = t->without_array();
+   const bool needs_per_element_walk =
+      base->is_record() ||
+      base->is_interface() ||
+      (t->is_array() && t->fields.array->is_array());
+
+   if (!needs_per_element_walk) {
+      /* Leaf: record the name accumulated so far. */
+      char **const out = *varying_names;
+
+      out[*count] = ralloc_strdup(mem_ctx, *name);
+      (*count)++;
+      return;
+   }
+
+   /* Visit every element as "<prefix>[i]", still carrying the interface
+    * member selection down to the element type.
+    */
+   for (unsigned elem = 0; elem < t->length; elem++) {
+      size_t tail = name_length;
+
+      ralloc_asprintf_rewrite_tail(name, &tail, "[%u]", elem);
+      create_xfb_varying_names(mem_ctx, t->fields.array, name, tail, count,
+                               ifc_member_name, ifc_member_t,
+                               varying_names);
+   }
+}
+
+/**
+ * Scan the outputs of \c sh for xfb_* layout qualifiers and build the list
+ * of varying names selected for capture by xfb_offset.
+ *
+ * \param mem_ctx              ralloc context for the name array and strings
+ * \param sh                   shader whose IR is scanned
+ * \param num_tfeedback_decls  incremented by the varying count of every
+ *                             output with an explicit xfb_offset; it is only
+ *                             added to here, never reset
+ * \param varying_names        set to a newly allocated array of names when
+ *                             at least one varying was counted
+ *
+ * \return true if any xfb_buffer, xfb_stride or xfb_offset qualifier, or any
+ *         non-zero per-buffer stride on the shader, was found.
+ */
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names)
+{
+   bool uses_xfb = false;
+
+   /* A stride applied only to a global out must still switch transform
+    * feedback mode on.  xfb_stride is not propagated to interface block
+    * members, so this check covers that case as well.
+    */
+   for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++) {
+      if (sh->TransformFeedback.BufferStride[buf] != 0)
+         uses_xfb = true;
+   }
+
+   /* First pass: detect qualifier usage and count the captured varyings. */
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *const out = node->as_variable();
+      if (out == NULL || out->data.mode != ir_var_shader_out)
+         continue;
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "Any shader making any static use (after preprocessing) of any of
+       *     these *xfb_* qualifiers will cause the shader to be in a
+       *     transform feedback capturing mode and hence responsible for
+       *     describing the transform feedback setup.  This mode will capture
+       *     any output selected by *xfb_offset*, directly or indirectly, to
+       *     a transform feedback buffer."
+       */
+      if (out->data.explicit_xfb_buffer ||
+          out->data.explicit_xfb_stride ||
+          out->data.explicit_xfb_offset)
+         uses_xfb = true;
+
+      if (out->data.explicit_xfb_offset)
+         *num_tfeedback_decls += out->type->varying_count();
+   }
+
+   if (*num_tfeedback_decls == 0)
+      return uses_xfb;
+
+   /* Second pass: generate one name per captured leaf varying. */
+   unsigned filled = 0;
+   *varying_names = ralloc_array(mem_ctx, char *, *num_tfeedback_decls);
+
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *const out = node->as_variable();
+      if (out == NULL ||
+          out->data.mode != ir_var_shader_out ||
+          !out->data.explicit_xfb_offset)
+         continue;
+
+      const bool from_block = out->data.from_named_ifc_block;
+      const glsl_type *const root =
+         from_block ? out->get_interface_type() : out->type;
+      const glsl_type *member_type = NULL;
+      char *name;
+
+      if (from_block) {
+         /* Find the member type before it was altered by lowering */
+         member_type =
+            root->fields.structure[root->field_index(out->name)].type;
+         name = ralloc_strdup(NULL, root->without_array()->name);
+      } else {
+         name = ralloc_strdup(NULL, out->name);
+      }
+
+      create_xfb_varying_names(mem_ctx, root, &name, strlen(name), &filled,
+                               out->name, member_type, varying_names);
+      ralloc_free(name);
+   }
+
+   assert(filled == *num_tfeedback_decls);
+   return uses_xfb;
+}
+
 /**
 * Validate the types and qualifiers of an output from one stage against the
 * matching input to another stage.
@@ -397,6 +516,8 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
   this->next_buffer_separator = false;
   this->matched_candidate = NULL;
   this->stream_id = 0;
+   this->buffer = 0;
+   this->offset = 0;

   if (ctx->Extensions.ARB_transform_feedback3) {
      /* Parse gl_NextBuffer. */
@@ -489,6 +610,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
      = this->matched_candidate->toplevel_var->data.location * 4
      + this->matched_candidate->toplevel_var->data.location_frac
      + this->matched_candidate->offset;
+   const unsigned dmul =
+      this->matched_candidate->type->without_array()->is_double() ? 2 : 1;

   if (this->matched_candidate->type->is_array()) {
      /* Array variable */
@@ -496,8 +619,6 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
         this->matched_candidate->type->fields.array->matrix_columns;
      const unsigned vector_elements =
         this->matched_candidate->type->fields.array->vector_elements;
-      const unsigned dmul =
-         this->matched_candidate->type->fields.array->is_double() ? 2 : 1;
      unsigned actual_array_size;
      switch (this->lowered_builtin_array_variable) {
      case clip_distance:
@@ -575,6 +696,12 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
    */
   this->stream_id = this->matched_candidate->toplevel_var->data.stream;

+   unsigned array_offset = this->array_subscript * 4 * dmul;
+   unsigned struct_offset = this->matched_candidate->offset * 4 * dmul;
+   this->buffer = this->matched_candidate->toplevel_var->data.xfb_buffer;
+   this->offset = this->matched_candidate->toplevel_var->data.offset +
+      array_offset + struct_offset;
+
   return true;
 }

@@ -598,55 +725,108 @@ tfeedback_decl::get_num_outputs() const
 bool
 tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
                      struct gl_transform_feedback_info *info,
-                      unsigned buffer, const unsigned max_outputs) const
+                      unsigned buffer, unsigned buffer_index,
+                      const unsigned max_outputs, bool *explicit_stride,
+                      bool has_xfb_qualifiers) const
 {
   assert(!this->next_buffer_separator);

   /* Handle gl_SkipComponents. */
   if (this->skip_components) {
-      info->BufferStride[buffer] += this->skip_components;
+      info->Buffers[buffer].Stride += this->skip_components;
      return true;
   }

+   unsigned xfb_offset = 0;
+   if (has_xfb_qualifiers) {
+      xfb_offset = this->offset / 4;
+   } else {
+      xfb_offset = info->Buffers[buffer].Stride;
+   }
+   info->Varyings[info->NumVarying].Offset = xfb_offset * 4;
+
+   unsigned location = this->location;
+   unsigned location_frac = this->location_frac;
+   unsigned num_components = this->num_components();
+   while (num_components > 0) {
+      unsigned output_size = MIN2(num_components, 4 - location_frac);
+      assert((info->NumOutputs == 0 && max_outputs == 0) ||
+             info->NumOutputs < max_outputs);
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "If such a block member or variable is not written during a shader
+       *    invocation, the buffer contents at the assigned offset will be
+       *    undefined.  Even if there are no static writes to a variable or
+       *    member that is assigned a transform feedback offset, the space is
+       *    still allocated in the buffer and still affects the stride."
+       */
+      if (this->is_varying_written()) {
+         info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
+         info->Outputs[info->NumOutputs].OutputRegister = location;
+         info->Outputs[info->NumOutputs].NumComponents = output_size;
+         info->Outputs[info->NumOutputs].StreamId = stream_id;
+         info->Outputs[info->NumOutputs].OutputBuffer = buffer;
+         info->Outputs[info->NumOutputs].DstOffset = xfb_offset;
+         ++info->NumOutputs;
+      }
+      info->Buffers[buffer].Stream = this->stream_id;
+      xfb_offset += output_size;
+
+      num_components -= output_size;
+      location++;
+      location_frac = 0;
+   }
+
+   if (explicit_stride && explicit_stride[buffer]) {
+      if (this->is_double() && info->Buffers[buffer].Stride % 2) {
+         linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                      "multiple of 8 as its applied to a type that is or "
+                      "contains a double.",
+                      info->Buffers[buffer].Stride * 4);
+         return false;
+      }
+
+      if ((this->offset / 4) / info->Buffers[buffer].Stride !=
+          (xfb_offset - 1) / info->Buffers[buffer].Stride) {
+         linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for "
+                      "buffer (%d)", xfb_offset * 4,
+                      info->Buffers[buffer].Stride * 4, buffer);
+         return false;
+      }
+   } else {
+      info->Buffers[buffer].Stride = xfb_offset;
+   }
+
   /* From GL_EXT_transform_feedback:
    *   A program will fail to link if:
    *
    *     * the total number of components to capture is greater than
    *       the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT
    *       and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
+    *
+    * From GL_ARB_enhanced_layouts:
+    *
+    *   "The resulting stride (implicit or explicit) must be less than or
+    *   equal to the implementation-dependent constant
+    *   gl_MaxTransformFeedbackInterleavedComponents."
    */
-   if (prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS &&
-       info->BufferStride[buffer] + this->num_components() >
+   if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS ||
+        has_xfb_qualifiers) &&
+       info->Buffers[buffer].Stride >
       ctx->Const.MaxTransformFeedbackInterleavedComponents) {
      linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
                   "limit has been exceeded.");
      return false;
   }

-   unsigned location = this->location;
-   unsigned location_frac = this->location_frac;
-   unsigned num_components = this->num_components();
-   while (num_components > 0) {
-      unsigned output_size = MIN2(num_components, 4 - location_frac);
-      assert(info->NumOutputs < max_outputs);
-      info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
-      info->Outputs[info->NumOutputs].OutputRegister = location;
-      info->Outputs[info->NumOutputs].NumComponents = output_size;
-      info->Outputs[info->NumOutputs].StreamId = stream_id;
-      info->Outputs[info->NumOutputs].OutputBuffer = buffer;
-      info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer];
-      ++info->NumOutputs;
-      info->BufferStride[buffer] += output_size;
-      info->BufferStream[buffer] = this->stream_id;
-      num_components -= output_size;
-      location++;
-      location_frac = 0;
-   }
-
-   info->Varyings[info->NumVarying].Name = ralloc_strdup(prog, this->orig_name);
+   info->Varyings[info->NumVarying].Name = ralloc_strdup(prog,
+                                                         this->orig_name);
   info->Varyings[info->NumVarying].Type = this->type;
   info->Varyings[info->NumVarying].Size = this->size;
+   info->Varyings[info->NumVarying].BufferIndex = buffer_index;
   info->NumVarying++;
+   info->Buffers[buffer].NumVaryings++;

   return true;
 }
@@ -731,6 +911,17 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
 }


+/**
+ * qsort() comparator that orders tfeedback_decl entries by xfb buffer and,
+ * within the same buffer, by xfb offset.
+ *
+ * \return a negative value, zero, or a positive value if \c x_generic sorts
+ *         before, equal to, or after \c y_generic.
+ */
+static int
+cmp_xfb_offset(const void * x_generic, const void * y_generic)
+{
+   const tfeedback_decl *x = static_cast<const tfeedback_decl *>(x_generic);
+   const tfeedback_decl *y = static_cast<const tfeedback_decl *>(y_generic);
+
+   /* get_buffer() and get_offset() return unsigned values.  Compare them
+    * explicitly rather than returning their difference: an unsigned
+    * difference narrowed to int has the wrong sign once it exceeds INT_MAX.
+    */
+   if (x->get_buffer() != y->get_buffer())
+      return x->get_buffer() < y->get_buffer() ? -1 : 1;
+
+   if (x->get_offset() != y->get_offset())
+      return x->get_offset() < y->get_offset() ? -1 : 1;
+
+   return 0;
+}
+
 /**
 * Store transform feedback location assignments into
 * prog->LinkedTransformFeedback based on the data stored in tfeedback_decls.
@@ -741,8 +932,13 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                     unsigned num_tfeedback_decls,
-                     tfeedback_decl *tfeedback_decls)
+                     tfeedback_decl *tfeedback_decls, bool has_xfb_qualifiers)
 {
+   /* Make sure MaxTransformFeedbackBuffers is less than 32 so the bitmask for
+    * tracking the number of buffers doesn't overflow.
+    */
+   assert(ctx->Const.MaxTransformFeedbackBuffers < 32);
+
   bool separate_attribs_mode =
      prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS;

@@ -752,14 +948,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
   memset(&prog->LinkedTransformFeedback, 0,
          sizeof(prog->LinkedTransformFeedback));

+   /* The xfb_offset qualifier does not have to be used in increasing order
+    * however some drivers expect to receive the list of transform feedback
+    * declarations in order so sort it now for convenience.
+    */
+   if (has_xfb_qualifiers)
+      qsort(tfeedback_decls, num_tfeedback_decls, sizeof(*tfeedback_decls),
+            cmp_xfb_offset);
+
   prog->LinkedTransformFeedback.Varyings =
      rzalloc_array(prog,
                    struct gl_transform_feedback_varying_info,
                    num_tfeedback_decls);

   unsigned num_outputs = 0;
-   for (unsigned i = 0; i < num_tfeedback_decls; ++i)
-      num_outputs += tfeedback_decls[i].get_num_outputs();
+   for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+      if (tfeedback_decls[i].is_varying_written())
+         num_outputs += tfeedback_decls[i].get_num_outputs();
+   }

   prog->LinkedTransformFeedback.Outputs =
      rzalloc_array(prog,
@@ -767,21 +973,47 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                    num_outputs);

   unsigned num_buffers = 0;
+   unsigned buffers = 0;

-   if (separate_attribs_mode) {
+   if (!has_xfb_qualifiers && separate_attribs_mode) {
      /* GL_SEPARATE_ATTRIBS */
      for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
         if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs))
+                                       num_buffers, num_buffers, num_outputs,
+                                       NULL, has_xfb_qualifiers))
            return false;

+         buffers |= 1 << num_buffers;
         num_buffers++;
      }
   }
   else {
      /* GL_INVERLEAVED_ATTRIBS */
      int buffer_stream_id = -1;
+      unsigned buffer =
+         num_tfeedback_decls ? tfeedback_decls[0].get_buffer() : 0;
+      bool explicit_stride[MAX_FEEDBACK_BUFFERS] = { false };
+
+      /* Apply any xfb_stride global qualifiers */
+      if (has_xfb_qualifiers) {
+         for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+            if (prog->TransformFeedback.BufferStride[j]) {
+               buffers |= 1 << j;
+               explicit_stride[j] = true;
+               prog->LinkedTransformFeedback.Buffers[j].Stride =
+                  prog->TransformFeedback.BufferStride[j] / 4;
+            }
+         }
+      }
+
      for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+         if (has_xfb_qualifiers &&
+             buffer != tfeedback_decls[i].get_buffer()) {
+            /* we have moved to the next buffer so reset stream id */
+            buffer_stream_id = -1;
+            num_buffers++;
+         }
+
         if (tfeedback_decls[i].is_next_buffer_separator()) {
            num_buffers++;
            buffer_stream_id = -1;
@@ -803,17 +1035,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
            return false;
         }

+         if (has_xfb_qualifiers) {
+            buffer = tfeedback_decls[i].get_buffer();
+         } else {
+            buffer = num_buffers;
+         }
+         buffers |= 1 << buffer;
+
         if (!tfeedback_decls[i].store(ctx, prog,
                                       &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs))
+                                       buffer, num_buffers, num_outputs,
+                                       explicit_stride, has_xfb_qualifiers))
            return false;
      }
-      num_buffers++;
   }

   assert(prog->LinkedTransformFeedback.NumOutputs == num_outputs);

-   prog->LinkedTransformFeedback.NumBuffers = num_buffers;
+   prog->LinkedTransformFeedback.ActiveBuffers = buffers;
   return true;
 }

@@ -1466,8 +1705,8 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
         } else if (input_var->get_interface_type() != NULL) {
            char *const iface_field_name =
               ralloc_asprintf(mem_ctx, "%s.%s",
-                               input_var->get_interface_type()->name,
-                               input_var->name);
+                  input_var->get_interface_type()->without_array()->name,
+                  input_var->name);
            hash_table_insert(consumer_interface_inputs, input_var,
                              iface_field_name);
         } else {
@@ -1498,8 +1737,8 @@ get_matching_input(void *mem_ctx,
   } else if (output_var->get_interface_type() != NULL) {
      char *const iface_field_name =
         ralloc_asprintf(mem_ctx, "%s.%s",
-                         output_var->get_interface_type()->name,
-                         output_var->name);
+            output_var->get_interface_type()->without_array()->name,
+            output_var->name);
      input_var =
         (ir_variable *) hash_table_find(consumer_interface_inputs,
                                         iface_field_name);
@@ -98,7 +98,8 @@ public:
   unsigned get_num_outputs() const;
   bool store(struct gl_context *ctx, struct gl_shader_program *prog,
              struct gl_transform_feedback_info *info, unsigned buffer,
-              const unsigned max_outputs) const;
+              unsigned buffer_index, const unsigned max_outputs,
+              bool *explicit_stride, bool has_xfb_qualifiers) const;
   const tfeedback_candidate *find_candidate(gl_shader_program *prog,
                                             hash_table *tfeedback_candidates);

@@ -107,6 +108,14 @@ public:
      return this->next_buffer_separator;
   }

+   /**
+    * Whether this declaration names a real varying whose top-level variable
+    * is flagged as assigned.  Next-buffer separators and skip-components
+    * entries always report false.
+    */
+   bool is_varying_written() const
+   {
+      const bool names_varying =
+         !this->next_buffer_separator && !this->skip_components;
+
+      return names_varying &&
+             this->matched_candidate->toplevel_var->data.assigned;
+   }
+
   bool is_varying() const
   {
      return !this->next_buffer_separator && !this->skip_components;
@@ -122,6 +131,16 @@ public:
      return this->stream_id;
   }

+   /** Value of the \c buffer member (the buffer assigned by xfb_buffer). */
+   unsigned get_buffer() const
+   {
+      return buffer;
+   }
+
+   /** Value of the \c offset member (the offset assigned by xfb_offset). */
+   unsigned get_offset() const
+   {
+      return offset;
+   }
+
   /**
    * The total number of varying components taken up by this variable.  Only
    * valid if assign_location() has been called.
@@ -201,6 +220,16 @@ private:
    */
   int location;

+   /**
+    * Used to store the buffer assigned by xfb_buffer.
+    */
+   unsigned buffer;
+
+   /**
+    * Used to store the offset assigned by xfb_offset.
+    */
+   unsigned offset;
+
   /**
    * If non-zero, then this variable may be packed along with other variables
    * into a single varying slot, so this offset should be applied when
@@ -268,6 +297,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                      const void *mem_ctx, unsigned num_names,
                      char **varying_names, tfeedback_decl *decls);

+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names);
+
 void
 remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
                                        gl_shader *sh,
@@ -276,7 +310,8 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                     unsigned num_tfeedback_decls,
-                     tfeedback_decl *tfeedback_decls);
+                     tfeedback_decl *tfeedback_decls,
+                     bool has_xfb_qualifiers);

 bool
 assign_varying_locations(struct gl_context *ctx,
@@ -1192,11 +1192,11 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
 	 int index = link_cross_validate_uniform_block(prog,
 						       &prog->BufferInterfaceBlocks,
 						       &prog->NumBufferInterfaceBlocks,
-						       &sh->BufferInterfaceBlocks[j]);
+						       sh->BufferInterfaceBlocks[j]);

 	 if (index == -1) {
 	    linker_error(prog, "uniform block `%s' has mismatching definitions\n",
-			 sh->BufferInterfaceBlocks[j].Name);
+			 sh->BufferInterfaceBlocks[j]->Name);
 	    return false;
 	 }

@@ -1204,6 +1204,23 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
      }
   }

+   /* Update per stage block pointers to point to the program list.
+    * FIXME: We should be able to free the per stage blocks here.
+    */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
+	 int stage_index =
+            prog->InterfaceBlockStageIndex[i][j];
+
+	 if (stage_index != -1) {
+	    struct gl_shader *sh = prog->_LinkedShaders[i];
+
+            sh->BufferInterfaceBlocks[stage_index] =
+               &prog->BufferInterfaceBlocks[j];
+	 }
+      }
+   }
+
   return true;
 }

@@ -1567,6 +1584,69 @@ private:
   hash_table *unnamed_interfaces;
 };

+/**
+ * Merge the xfb_stride default qualifiers of all shaders of one stage.
+ *
+ * The per-buffer strides of \c linked_shader are cleared and then filled
+ * from every entry of \c shader_list.  Two shaders giving different non-zero
+ * strides for the same buffer is a link error.  Each merged stride is then
+ * copied to \c prog and checked to be a multiple of 4 and to stay within
+ * ctx->Const.MaxTransformFeedbackInterleavedComponents.
+ *
+ * Errors are reported through linker_error(); the function returns at the
+ * first one.
+ */
+static void
+link_xfb_stride_layout_qualifiers(struct gl_context *ctx,
+                                  struct gl_shader_program *prog,
+                                  struct gl_shader *linked_shader,
+                                  struct gl_shader **shader_list,
+                                  unsigned num_shaders)
+{
+   for (unsigned buf = 0; buf < MAX_FEEDBACK_BUFFERS; buf++)
+      linked_shader->TransformFeedback.BufferStride[buf] = 0;
+
+   /* Gather the strides, rejecting contradictory declarations. */
+   for (unsigned s = 0; s < num_shaders; s++) {
+      struct gl_shader *const shader = shader_list[s];
+
+      for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+         /* This shader says nothing about buffer j. */
+         if (!shader->TransformFeedback.BufferStride[j])
+            continue;
+
+         const bool already_set =
+            linked_shader->TransformFeedback.BufferStride[j] != 0;
+
+         if (already_set &&
+             linked_shader->TransformFeedback.BufferStride[j] !=
+                shader->TransformFeedback.BufferStride[j]) {
+            linker_error(prog,
+                         "intrastage shaders defined with conflicting "
+                         "xfb_stride for buffer %d (%d and %d)\n", j,
+                         linked_shader->TransformFeedback.BufferStride[j],
+                         shader->TransformFeedback.BufferStride[j]);
+            return;
+         }
+
+         linked_shader->TransformFeedback.BufferStride[j] =
+            shader->TransformFeedback.BufferStride[j];
+      }
+   }
+
+   /* Publish the merged strides on the program and validate them. */
+   for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+      if (!linked_shader->TransformFeedback.BufferStride[j])
+         continue;
+
+      prog->TransformFeedback.BufferStride[j] =
+         linked_shader->TransformFeedback.BufferStride[j];
+
+      /* We will validate doubles at a later stage */
+      if (prog->TransformFeedback.BufferStride[j] % 4) {
+         linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                      "multiple of 4 or if its applied to a type that is "
+                      "or contains a double a multiple of 8.",
+                      prog->TransformFeedback.BufferStride[j]);
+         return;
+      }
+
+      if (prog->TransformFeedback.BufferStride[j] / 4 >
+          ctx->Const.MaxTransformFeedbackInterleavedComponents) {
+         linker_error(prog,
+                      "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
+                      "limit has been exceeded.");
+         return;
+      }
+   }
+}

 /**
 * Performs the cross-validation of tessellation control shader vertices and
@@ -2069,15 +2149,23 @@ link_intrastage_shaders(void *mem_ctx,
   linked->ir = new(linked) exec_list;
   clone_ir_list(mem_ctx, linked->ir, main->ir);

-   linked->BufferInterfaceBlocks = uniform_blocks;
+   linked->BufferInterfaceBlocks =
+      ralloc_array(linked, gl_uniform_block *, num_uniform_blocks);
+
+   ralloc_steal(linked, uniform_blocks);
+   for (unsigned i = 0; i < num_uniform_blocks; i++) {
+      linked->BufferInterfaceBlocks[i] = &uniform_blocks[i];
+   }
+
   linked->NumBufferInterfaceBlocks = num_uniform_blocks;
-   ralloc_steal(linked, linked->BufferInterfaceBlocks);

   link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
   link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
   link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders);
   link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders);
   link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_xfb_stride_layout_qualifiers(ctx, prog, linked, shader_list,
+                                     num_shaders);

   populate_symbol_table(linked);

@@ -2869,7 +2957,8 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 	 if (prog->InterfaceBlockStageIndex[j][i] != -1) {
            struct gl_shader *sh = prog->_LinkedShaders[j];
            int stage_index = prog->InterfaceBlockStageIndex[j][i];
-            if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
+            if (sh &&
+                sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage) {
               shader_blocks[j]++;
               total_shader_storage_blocks++;
            } else {
@@ -2986,7 +3075,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)

         for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
            int stage_index = prog->InterfaceBlockStageIndex[i][j];
-            if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
+            if (stage_index != -1 &&
+                sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage)
               total_shader_storage_blocks++;
         }

@@ -3762,7 +3852,8 @@ write_top_level_array_size_and_stride:
 * resource data.
 */
 void
-build_program_resource_list(struct gl_shader_program *shProg)
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg)
 {
   /* Rebuild resource list. */
   if (shProg->ProgramResourceList) {
@@ -3820,6 +3911,17 @@ build_program_resource_list(struct gl_shader_program *shProg)
      }
   }

+   /* Add transform feedback buffers. */
+   for (unsigned i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((shProg->LinkedTransformFeedback.ActiveBuffers >> i) & 1) {
+         shProg->LinkedTransformFeedback.Buffers[i].Binding = i;
+         if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_BUFFER,
+                                   &shProg->LinkedTransformFeedback.Buffers[i],
+                                   0))
+         return;
+      }
+   }
+
   /* Add uniforms from uniform storage. */
   for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
      /* Do not add uniforms internally used by Mesa. */
@@ -4006,20 +4108,22 @@ link_assign_subroutine_types(struct gl_shader_program *prog)

 static void
 split_ubos_and_ssbos(void *mem_ctx,
-                     struct gl_uniform_block *blocks,
+                     struct gl_uniform_block **s_blks,
+                     struct gl_uniform_block *p_blks,
                     unsigned num_blocks,
                     struct gl_uniform_block ***ubos,
                     unsigned *num_ubos,
-                     unsigned **ubo_interface_block_indices,
                     struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos,
-                     unsigned **ssbo_interface_block_indices)
+                     unsigned *num_ssbos)
 {
   unsigned num_ubo_blocks = 0;
   unsigned num_ssbo_blocks = 0;

+   /* Are we splitting the list of blocks for the shader or the program? */
+   bool is_shader = p_blks == NULL;
+
   for (unsigned i = 0; i < num_blocks; i++) {
-      if (blocks[i].IsShaderStorage)
+      if (is_shader ? s_blks[i]->IsShaderStorage : p_blks[i].IsShaderStorage)
         num_ssbo_blocks++;
      else
         num_ubo_blocks++;
@@ -4031,24 +4135,13 @@ split_ubos_and_ssbos(void *mem_ctx,
   *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
   *num_ssbos = 0;

-   if (ubo_interface_block_indices)
-      *ubo_interface_block_indices =
-         ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
-
-   if (ssbo_interface_block_indices)
-      *ssbo_interface_block_indices =
-         ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
-
   for (unsigned i = 0; i < num_blocks; i++) {
-      if (blocks[i].IsShaderStorage) {
-         (*ssbos)[*num_ssbos] = &blocks[i];
-         if (ssbo_interface_block_indices)
-            (*ssbo_interface_block_indices)[*num_ssbos] = i;
+      struct gl_uniform_block *blk = is_shader ? s_blks[i] : &p_blks[i];
+      if (blk->IsShaderStorage) {
+         (*ssbos)[*num_ssbos] = blk;
         (*num_ssbos)++;
      } else {
-         (*ubos)[*num_ubos] = &blocks[i];
-         if (ubo_interface_block_indices)
-            (*ubo_interface_block_indices)[*num_ubos] = i;
+         (*ubos)[*num_ubos] = blk;
         (*num_ubos)++;
      }
   }
@@ -4153,9 +4246,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
      return;
   }

-   tfeedback_decl *tfeedback_decls = NULL;
-   unsigned num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+   unsigned num_tfeedback_decls = 0;
   unsigned int num_explicit_uniform_locs = 0;
+   bool has_xfb_qualifiers = false;
+   char **varying_names = NULL;
+   tfeedback_decl *tfeedback_decls = NULL;

   void *mem_ctx = ralloc_context(NULL); // temporary linker context

@@ -4465,6 +4560,30 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
      goto done;
   }

+   /* From the ARB_enhanced_layouts spec:
+    *
+    *    "If the shader used to record output variables for transform feedback
+    *    varyings uses the "xfb_buffer", "xfb_offset", or "xfb_stride" layout
+    *    qualifiers, the values specified by TransformFeedbackVaryings are
+    *    ignored, and the set of variables captured for transform feedback is
+    *    instead derived from the specified layout qualifiers."
+    */
+   for (int i = MESA_SHADER_FRAGMENT - 1; i >= 0; i--) {
+      /* Find last stage before fragment shader */
+      if (prog->_LinkedShaders[i]) {
+         has_xfb_qualifiers =
+            process_xfb_layout_qualifiers(mem_ctx, prog->_LinkedShaders[i],
+                                          &num_tfeedback_decls,
+                                          &varying_names);
+         break;
+      }
+   }
+
+   if (!has_xfb_qualifiers) {
+      num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+      varying_names = prog->TransformFeedback.VaryingNames;
+   }
+
   if (num_tfeedback_decls != 0) {
      /* From GL_EXT_transform_feedback:
       *   A program will fail to link if:
@@ -4481,10 +4600,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
      }

      tfeedback_decls = ralloc_array(mem_ctx, tfeedback_decl,
-                                     prog->TransformFeedback.NumVarying);
+                                     num_tfeedback_decls);
      if (!parse_tfeedback_decls(ctx, prog, mem_ctx, num_tfeedback_decls,
-                                 prog->TransformFeedback.VaryingNames,
-                                 tfeedback_decls))
+                                 varying_names, tfeedback_decls))
         goto done;
   }

@@ -4564,7 +4682,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
      }
   }

-   if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls))
+   if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls,
+                             has_xfb_qualifiers))
      goto done;

   update_array_sizes(prog);
@@ -4627,25 +4746,23 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
         gl_shader *sh = prog->_LinkedShaders[i];
         split_ubos_and_ssbos(sh,
                              sh->BufferInterfaceBlocks,
+                              NULL,
                              sh->NumBufferInterfaceBlocks,
                              &sh->UniformBlocks,
                              &sh->NumUniformBlocks,
-                              NULL,
                              &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks,
-                              NULL);
+                              &sh->NumShaderStorageBlocks);
      }
   }

   split_ubos_and_ssbos(prog,
+                        NULL,
                        prog->BufferInterfaceBlocks,
                        prog->NumBufferInterfaceBlocks,
                        &prog->UniformBlocks,
                        &prog->NumUniformBlocks,
-                        &prog->UboInterfaceBlockIndex,
                        &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks,
-                        &prog->SsboInterfaceBlockIndex);
+                        &prog->NumShaderStorageBlocks);

   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
      if (prog->_LinkedShaders[i] == NULL)
@@ -197,7 +197,8 @@ private:
   void recursion(const glsl_type *t, char **name, size_t name_length,
                  bool row_major, const glsl_type *record_type,
                  const unsigned packing,
-                  bool last_field, unsigned record_array_count);
+                  bool last_field, unsigned record_array_count,
+                  const glsl_struct_field *named_ifc_member);
 };

 void
@@ -169,7 +169,6 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                  new(mem_ctx) ir_variable(iface_t->fields.structure[i].type,
                                           var_name,
                                           (ir_variable_mode) var->data.mode);
-               new_var->data.from_named_ifc_block_nonarray = 1;
            } else {
               const glsl_type *new_array_type =
                  process_array_type(var->type, i);
@@ -177,10 +176,16 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                  new(mem_ctx) ir_variable(new_array_type,
                                           var_name,
                                           (ir_variable_mode) var->data.mode);
-               new_var->data.from_named_ifc_block_array = 1;
            }
            new_var->data.location = iface_t->fields.structure[i].location;
            new_var->data.explicit_location = (new_var->data.location >= 0);
+            new_var->data.offset = iface_t->fields.structure[i].offset;
+            new_var->data.explicit_xfb_offset =
+               (iface_t->fields.structure[i].offset >= 0);
+            new_var->data.xfb_buffer =
+               iface_t->fields.structure[i].xfb_buffer;
+            new_var->data.explicit_xfb_buffer =
+               iface_t->fields.structure[i].explicit_xfb_buffer;
            new_var->data.interpolation =
               iface_t->fields.structure[i].interpolation;
            new_var->data.centroid = iface_t->fields.structure[i].centroid;
@@ -188,8 +193,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
            new_var->data.patch = iface_t->fields.structure[i].patch;
            new_var->data.stream = var->data.stream;
            new_var->data.how_declared = var->data.how_declared;
+            new_var->data.from_named_ifc_block = 1;

-            new_var->init_interface_type(iface_t);
+            new_var->init_interface_type(var->type);
            hash_table_insert(interface_namespace, new_var,
                              iface_field_name);
            insert_pos->insert_after(new_var);
@@ -211,12 +217,23 @@ ir_visitor_status
 flatten_named_interface_blocks_declarations::visit_leave(ir_assignment *ir)
 {
   ir_dereference_record *lhs_rec = ir->lhs->as_dereference_record();
+
+   ir_variable *lhs_var =  ir->lhs->variable_referenced();
+   if (lhs_var && lhs_var->get_interface_type()) {
+      lhs_var->data.assigned = 1;
+   }
+
   if (lhs_rec) {
      ir_rvalue *lhs_rec_tmp = lhs_rec;
      handle_rvalue(&lhs_rec_tmp);
      if (lhs_rec_tmp != lhs_rec) {
         ir->set_lhs(lhs_rec_tmp);
      }
+
+      ir_variable *lhs_var =  lhs_rec_tmp->variable_referenced();
+      if (lhs_var) {
+         lhs_var->data.assigned = 1;
+      }
   }
   return rvalue_visit(ir);
 }
@@ -43,7 +43,8 @@ extern void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog);

 extern void
-build_program_resource_list(struct gl_shader_program *shProg);
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg);

 extern void
 linker_error(struct gl_shader_program *prog, const char *fmt, ...)
@@ -130,11 +130,6 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
      shProg->InterfaceBlockStageIndex[i] = NULL;
   }

-   ralloc_free(shProg->UboInterfaceBlockIndex);
-   shProg->UboInterfaceBlockIndex = NULL;
-   ralloc_free(shProg->SsboInterfaceBlockIndex);
-   shProg->SsboInterfaceBlockIndex = NULL;
-
   ralloc_free(shProg->AtomicBuffers);
   shProg->AtomicBuffers = NULL;
   shProg->NumAtomicBuffers = 0;
@@ -115,7 +115,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
   prog->UniformStorage[index_to_set].name = (char *) name;
   prog->UniformStorage[index_to_set].type = type;
   prog->UniformStorage[index_to_set].array_elements = array_size;
-   prog->UniformStorage[index_to_set].initialized = false;
   for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
      prog->UniformStorage[index_to_set].opaque[sh].index = ~0;
      prog->UniformStorage[index_to_set].opaque[sh].active = false;
@@ -136,7 +135,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
      prog->UniformStorage[i].name = (char *) "invalid slot";
      prog->UniformStorage[i].type = glsl_type::void_type;
      prog->UniformStorage[i].array_elements = 0;
-      prog->UniformStorage[i].initialized = false;
      for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
         prog->UniformStorage[i].opaque[sh].index = ~0;
         prog->UniformStorage[i].opaque[sh].active = false;
@@ -149,21 +147,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
   return red_zone_components;
 }

-/**
- * Verify that the correct uniform is marked as having been initialized.
- */
-static void
-verify_initialization(struct gl_shader_program *prog, unsigned actual_index)
-{
-   for (unsigned i = 0; i < prog->NumUniformStorage; i++) {
-      if (i == actual_index) {
-	 EXPECT_TRUE(prog->UniformStorage[actual_index].initialized);
-      } else {
-	 EXPECT_FALSE(prog->UniformStorage[i].initialized);
-      }
-   }
-}
-
 static void
 non_array_test(void *mem_ctx, struct gl_shader_program *prog,
 	       unsigned actual_index, const char *name,
@@ -181,7 +164,6 @@ non_array_test(void *mem_ctx, struct gl_shader_program *prog,

   linker::set_uniform_initializer(mem_ctx, prog, name, type, val, 0xF00F);

-   verify_initialization(prog, actual_index);
   verify_data(prog->UniformStorage[actual_index].storage, 0, val,
 	       red_zone_components, 0xF00F);
 }
@@ -338,7 +320,6 @@ array_test(void *mem_ctx, struct gl_shader_program *prog,
   linker::set_uniform_initializer(mem_ctx, prog, name, element_type, val,
                                   0xF00F);

-   verify_initialization(prog, actual_index);
   verify_data(prog->UniformStorage[actual_index].storage, array_size,
 	       val, red_zone_components, 0xF00F);
 }
@@ -132,6 +132,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
      this->fields.structure[i].image_volatile = fields[i].image_volatile;
      this->fields.structure[i].image_restrict = fields[i].image_restrict;
      this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].explicit_xfb_buffer =
+         fields[i].explicit_xfb_buffer;
+      this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+      this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
   }

   mtx_unlock(&glsl_type::mutex);
@@ -172,6 +176,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
      this->fields.structure[i].image_volatile = fields[i].image_volatile;
      this->fields.structure[i].image_restrict = fields[i].image_restrict;
      this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].explicit_xfb_buffer =
+         fields[i].explicit_xfb_buffer;
+      this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+      this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
   }

   mtx_unlock(&glsl_type::mutex);
@@ -915,6 +923,15 @@ glsl_type::record_compare(const glsl_type *b) const
      if (this->fields.structure[i].precision
          != b->fields.structure[i].precision)
         return false;
+      if (this->fields.structure[i].explicit_xfb_buffer
+          != b->fields.structure[i].explicit_xfb_buffer)
+         return false;
+      if (this->fields.structure[i].xfb_buffer
+          != b->fields.structure[i].xfb_buffer)
+         return false;
+      if (this->fields.structure[i].xfb_stride
+          != b->fields.structure[i].xfb_stride)
+         return false;
   }

   return true;
@@ -1333,6 +1350,38 @@ glsl_type::uniform_locations() const
   }
 }

+unsigned
+glsl_type::varying_count() const
+{
+   unsigned size = 0;
+
+   switch (this->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_BOOL:
+      return 1;
+
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_INTERFACE:
+      for (unsigned i = 0; i < this->length; i++)
+         size += this->fields.structure[i].type->varying_count();
+      return size;
+   case GLSL_TYPE_ARRAY:
+      /* Don't count innermost array elements */
+      if (this->without_array()->is_record() ||
+          this->without_array()->is_interface() ||
+          this->fields.array->is_array())
+         return this->length * this->fields.array->varying_count();
+      else
+         return this->fields.array->varying_count();
+   default:
+      assert(!"unsupported varying type");
+      return 0;
+   }
+}
+
 bool
 glsl_type::can_implicitly_convert_to(const glsl_type *desired,
                                     _mesa_glsl_parse_state *state) const
@@ -326,6 +326,12 @@ struct glsl_type {
    */
   unsigned uniform_locations() const;

+   /**
+    * Used to count the number of varyings contained in the type ignoring
+    * innermost array elements.
+    */
+   unsigned varying_count() const;
+
   /**
    * Calculate the number of attribute slots required to hold this type
    *
@@ -839,12 +845,24 @@ struct glsl_struct_field {

   /**
    * For interface blocks, members may have an explicit byte offset
-    * specified; -1 otherwise.
+    * specified; -1 otherwise. Also used for xfb_offset layout qualifier.
    *
-    * Ignored for structs.
+    * Unless used for xfb_offset this field is ignored for structs.
    */
   int offset;

+   /**
+    * For interface blocks, members may define a transform feedback buffer;
+    * -1 otherwise.
+    */
+   int xfb_buffer;
+
+   /**
+    * For interface blocks, members may define a transform feedback stride;
+    * -1 otherwise.
+    */
+   int xfb_stride;
+
   /**
    * For interface blocks, the interpolation mode (as in
    * ir_variable::interpolation).  0 otherwise.
@@ -889,6 +907,13 @@ struct glsl_struct_field {
   unsigned image_volatile:1;
   unsigned image_restrict:1;

+   /**
+    * Any of the xfb_* qualifiers trigger the shader to be in transform
+    * feedback mode so we need to keep track of whether the buffer was
+    * explicitly set or if it's just been assigned the default global value.
+    */
+   unsigned explicit_xfb_buffer:1;
+
 #ifdef __cplusplus
   glsl_struct_field(const struct glsl_type *_type, const char *_name)
      : type(_type), name(_name), location(-1), interpolation(0), centroid(0),
@@ -22,10 +22,10 @@ NIR_FILES = \
 	nir_gather_info.c \
 	nir_gs_count_vertices.c \
 	nir_inline_functions.c \
-	nir_intrinsics.c \
-	nir_intrinsics.h \
 	nir_instr_set.c \
 	nir_instr_set.h \
+	nir_intrinsics.c \
+	nir_intrinsics.h \
 	nir_liveness.c \
 	nir_lower_alu_to_scalar.c \
 	nir_lower_atomics.c \
@@ -143,16 +143,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
   v2.run(sh->ir);
   visit_exec_list(sh->ir, &v1);

-   nir_function *main = NULL;
-   nir_foreach_function(shader, func) {
-      if (strcmp(func->name, "main") == 0) {
-         main = func;
-         break;
-      }
-   }
-   assert(main);
-
-   nir_lower_outputs_to_temporaries(shader, main);
+   nir_lower_outputs_to_temporaries(shader, nir_shader_get_entrypoint(shader));

   shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
   if (shader_prog->Label)
@@ -1822,6 +1822,8 @@ nir_shader_get_entrypoint(nir_shader *shader)
   assert(exec_list_length(&shader->functions) == 1);
   struct exec_node *func_node = exec_list_get_head(&shader->functions);
   nir_function *func = exec_node_data(nir_function, func_node, node);
+   assert(func->return_type == glsl_void_type());
+   assert(func->num_params == 0);
   return func;
 }

@@ -127,6 +127,7 @@ optimizations = [
   (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
   (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
   (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
+   (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
   (('fmin', a, a), a),
   (('fmax', a, a), a),
   (('imin', a, a), a),
@@ -270,6 +271,10 @@ optimizations = [
   (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
   (('iabs', ('isub', 0, a)), ('iabs', a)),

+   # Propagate negation up multiplication chains
+   (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
+
   # Misc. lowering
   (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
   (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
@@ -31,7 +31,7 @@ extern "C" {
 #endif

 /**
- * Shader stages. Note that these will become 5 with tessellation.
+ * Shader stages.
 *
 * The order must match how shaders are ordered in the pipeline.
 * The GLSL linker assumes that if i<j, then the j-th shader is
@@ -537,6 +537,8 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
   EGLint config_attrs[] = {
     EGL_NATIVE_VISUAL_ID,   0,
     EGL_NATIVE_VISUAL_TYPE, 0,
+     EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE,
+     EGL_RECORDABLE_ANDROID, EGL_TRUE,
     EGL_NONE
   };
   int count, i, j;
@@ -714,7 +716,9 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
      goto cleanup_screen;
   }

+   dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE;
   dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
+   dpy->Extensions.ANDROID_recordable = EGL_TRUE;
   dpy->Extensions.KHR_image_base = EGL_TRUE;

   /* Fill vtbl last to prevent accidentally calling virtual function during
@@ -381,7 +381,9 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
   char *exts = dpy->ExtensionsString;

   /* Please keep these sorted alphabetically. */
+   _EGL_CHECK_EXTENSION(ANDROID_framebuffer_target);
   _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
+   _EGL_CHECK_EXTENSION(ANDROID_recordable);

   _EGL_CHECK_EXTENSION(CHROMIUM_sync_control);

@@ -245,7 +245,13 @@ static const struct {
   /* extensions */
   { EGL_Y_INVERTED_NOK,            ATTRIB_TYPE_BOOLEAN,
                                    ATTRIB_CRITERION_EXACT,
-                                    EGL_DONT_CARE }
+                                    EGL_DONT_CARE },
+   { EGL_FRAMEBUFFER_TARGET_ANDROID, ATTRIB_TYPE_BOOLEAN,
+                                    ATTRIB_CRITERION_EXACT,
+                                    EGL_DONT_CARE },
+   { EGL_RECORDABLE_ANDROID,        ATTRIB_TYPE_BOOLEAN,
+                                    ATTRIB_CRITERION_EXACT,
+                                    EGL_DONT_CARE },
 };


@@ -488,6 +494,10 @@ _eglIsConfigAttribValid(_EGLConfig *conf, EGLint attr)
   switch (attr) {
   case EGL_Y_INVERTED_NOK:
      return conf->Display->Extensions.NOK_texture_from_pixmap;
+   case EGL_FRAMEBUFFER_TARGET_ANDROID:
+      return conf->Display->Extensions.ANDROID_framebuffer_target;
+   case EGL_RECORDABLE_ANDROID:
+      return conf->Display->Extensions.ANDROID_recordable;
   default:
      break;
   }
@@ -86,6 +86,8 @@ struct _egl_config

   /* extensions */
   EGLint YInvertedNOK;
+   EGLint FramebufferTargetAndroid;
+   EGLint RecordableAndroid;
 };


@@ -133,6 +135,8 @@ _eglOffsetOfConfig(EGLint attr)
   ATTRIB_MAP(EGL_CONFORMANT,                Conformant);
   /* extensions */
   ATTRIB_MAP(EGL_Y_INVERTED_NOK,            YInvertedNOK);
+   ATTRIB_MAP(EGL_FRAMEBUFFER_TARGET_ANDROID, FramebufferTargetAndroid);
+   ATTRIB_MAP(EGL_RECORDABLE_ANDROID,        RecordableAndroid);
 #undef ATTRIB_MAP
   default:
      return -1;
@@ -90,7 +90,9 @@ struct _egl_resource
 struct _egl_extensions
 {
   /* Please keep these sorted alphabetically. */
+   EGLBoolean ANDROID_framebuffer_target;
   EGLBoolean ANDROID_image_native_buffer;
+   EGLBoolean ANDROID_recordable;

   EGLBoolean CHROMIUM_sync_control;

@@ -731,6 +731,24 @@ draw_texture_sampler(struct draw_context *draw,
   }
 }

+/**
+ * Provide TGSI image objects for vertex/geometry shaders that use
+ * texture fetches.  This state only needs to be set once per context.
+ * This might only be used by software drivers for the time being.
+ */
+void
+draw_image(struct draw_context *draw,
+           uint shader,
+           struct tgsi_image *image)
+{
+   if (shader == PIPE_SHADER_VERTEX) {
+      draw->vs.tgsi.image = image;
+   } else {
+      debug_assert(shader == PIPE_SHADER_GEOMETRY);
+      draw->gs.tgsi.image = image;
+   }
+}
+



@@ -48,6 +48,7 @@ struct draw_vertex_shader;
 struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
+struct tgsi_image;

 /*
 * structure to contain driver internal information 
@@ -154,6 +155,11 @@ draw_texture_sampler(struct draw_context *draw,
                     uint shader_type,
                     struct tgsi_sampler *sampler);

+void
+draw_image(struct draw_context *draw,
+           uint shader_type,
+           struct tgsi_image *image);
+
 void
 draw_set_sampler_views(struct draw_context *draw,
                       unsigned shader_stage,
@@ -681,7 +681,7 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
   if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) {
      tgsi_exec_machine_bind_shader(shader->machine,
                                    shader->state.tokens,
-                                    draw->gs.tgsi.sampler);
+                                    draw->gs.tgsi.sampler, draw->gs.tgsi.image);
   }
 }

@@ -66,6 +66,7 @@ struct draw_stage;
 struct vbuf_render;
 struct tgsi_exec_machine;
 struct tgsi_sampler;
+struct tgsi_image;
 struct draw_pt_front_end;
 struct draw_assembler;
 struct draw_llvm;
@@ -267,6 +268,7 @@ struct draw_context
         struct tgsi_exec_machine *machine;

         struct tgsi_sampler *sampler;
+         struct tgsi_image *image;
      } tgsi;

      struct translate *fetch;
@@ -286,6 +288,7 @@ struct draw_context
         struct tgsi_exec_machine *machine;

         struct tgsi_sampler *sampler;
+         struct tgsi_image *image;
      } tgsi;

   } gs;
@@ -70,7 +70,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
   if (evs->machine->Tokens != shader->state.tokens) {
      tgsi_exec_machine_bind_shader(evs->machine,
                                    shader->state.tokens,
-                                    draw->vs.tgsi.sampler);
+                                    draw->vs.tgsi.sampler, draw->vs.tgsi.image);
   }
 }

@@ -128,7 +128,7 @@ lp_debug_dump_value(LLVMValueRef value)
 * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
 */
 static size_t
-disassemble(const void* func, std::stringstream &buffer)
+disassemble(const void* func, std::ostream &buffer)
 {
   const uint8_t *bytes = (const uint8_t *)func;

@@ -235,15 +235,16 @@ disassemble(const void* func, std::stringstream &buffer)


 extern "C" void
-lp_disassemble(LLVMValueRef func, const void *code) {
-   std::stringstream buffer;
+lp_disassemble(LLVMValueRef func, const void *code)
+{
+   std::ostringstream buffer;
   std::string s;

   buffer << LLVMGetValueName(func) << ":\n";
   disassemble(code, buffer);
   s = buffer.str();
-   _debug_printf("%s", s.c_str());
-   _debug_printf("\n");
+   os_log_message(s.c_str());
+   os_log_message("\n");
 }


@@ -259,7 +260,6 @@ extern "C" void
 lp_profile(LLVMValueRef func, const void *code)
 {
 #if defined(__linux__) && defined(PROFILE)
-   std::stringstream buffer;
   static std::ofstream perf_asm_file;
   static boolean first_time = TRUE;
   static FILE *perf_map_file = NULL;
@@ -283,9 +283,9 @@ lp_profile(LLVMValueRef func, const void *code)
   if (perf_map_file) {
      const char *symbol = LLVMGetValueName(func);
      unsigned long addr = (uintptr_t)code;
-      buffer << symbol << ":\n";
-      unsigned long size = disassemble(code, buffer);
-      perf_asm_file << buffer.rdbuf() << std::flush;
+      perf_asm_file << symbol << ":\n";
+      unsigned long size = disassemble(code, perf_asm_file);
+      perf_asm_file.flush();
      fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol);
      fflush(perf_map_file);
   }
@@ -314,11 +314,13 @@ lp_build_select(struct lp_build_context *bld,
      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
-   else if (0) {
+   else if (HAVE_LLVM >= 0x0303) {
      /* Generate a vector select.
       *
-       * XXX: Using vector selects would avoid emitting intrinsics, but they aren't
-       * properly supported yet.
+       * Using vector selects would avoid emitting intrinsics, but they weren't
+       * properly supported for a long time.
+       *
+       * LLVM 3.3 appears to reliably support it.
       *
       * LLVM 3.1 supports it, but it yields buggy code (e.g. lp_blend_test).
       *
@@ -108,14 +108,14 @@ struct fenced_manager
 */
 struct fenced_buffer
 {
-   /*
+   /**
    * Immutable members.
    */

   struct pb_buffer base;
   struct fenced_manager *mgr;

-   /*
+   /**
    * Following members are mutable and protected by fenced_manager::mutex.
    */

@@ -205,7 +205,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)

   curr = fenced_mgr->unfenced.next;
   next = curr->next;
-   while(curr != &fenced_mgr->unfenced) {
+   while (curr != &fenced_mgr->unfenced) {
      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
      assert(!fenced_buf->fence);
      debug_printf("%10p %7u %8u %7s\n",
@@ -219,7 +219,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)

   curr = fenced_mgr->fenced.next;
   next = curr->next;
-   while(curr != &fenced_mgr->fenced) {
+   while (curr != &fenced_mgr->fenced) {
      int signaled;
      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
      assert(fenced_buf->buffer);
@@ -340,7 +340,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
   assert(pipe_is_referenced(&fenced_buf->base.reference));
   assert(fenced_buf->fence);

-   if(fenced_buf->fence) {
+   if (fenced_buf->fence) {
      struct pipe_fence_handle *fence = NULL;
      int finished;
      boolean proceed;
@@ -355,8 +355,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,

      assert(pipe_is_referenced(&fenced_buf->base.reference));

-      /*
-       * Only proceed if the fence object didn't change in the meanwhile.
+      /* Only proceed if the fence object didn't change in the meanwhile.
       * Otherwise assume the work has been already carried out by another
       * thread that re-aquired the lock before us.
       */
@@ -364,14 +363,9 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,

      ops->fence_reference(ops, &fence, NULL);

-      if(proceed && finished == 0) {
-         /*
-          * Remove from the fenced list
-          */
-
-         boolean destroyed;
-
-         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+      if (proceed && finished == 0) {
+         /* Remove from the fenced list. */
+         boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);

         /* TODO: remove consequents buffers with the same fence? */

@@ -405,36 +399,33 @@ fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr,

   curr = fenced_mgr->fenced.next;
   next = curr->next;
-   while(curr != &fenced_mgr->fenced) {
+   while (curr != &fenced_mgr->fenced) {
      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);

-      if(fenced_buf->fence != prev_fence) {
-	 int signaled;
+      if (fenced_buf->fence != prev_fence) {
+         int signaled;

-	 if (wait) {
-	    signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
+         if (wait) {
+            signaled = ops->fence_finish(ops, fenced_buf->fence, 0);

-	    /*
-	     * Don't return just now. Instead preemptively check if the
-	     * following buffers' fences already expired, without further waits.
-	     */
-	    wait = FALSE;
-	 }
-	 else {
-	    signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-	 }
-
-	 if (signaled != 0) {
-	    return ret;
+            /* Don't return just now. Instead preemptively check if the
+             * following buffers' fences already expired, without further waits.
+             */
+            wait = FALSE;
+         } else {
+            signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
         }

-	 prev_fence = fenced_buf->fence;
-      }
-      else {
+         if (signaled != 0) {
+            return ret;
+         }
+
+         prev_fence = fenced_buf->fence;
+      } else {
         /* This buffer's fence object is identical to the previous buffer's
          * fence object, so no need to check the fence again.
          */
-	 assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
+         assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
      }

      fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
@@ -462,22 +453,21 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)

   curr = fenced_mgr->unfenced.next;
   next = curr->next;
-   while(curr != &fenced_mgr->unfenced) {
+   while (curr != &fenced_mgr->unfenced) {
      fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);

-      /*
-       * We can only move storage if the buffer is not mapped and not
+      /* We can only move storage if the buffer is not mapped and not
       * validated.
       */
-      if(fenced_buf->buffer &&
+      if (fenced_buf->buffer &&
         !fenced_buf->mapcount &&
         !fenced_buf->vl) {
         enum pipe_error ret;

         ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
-         if(ret == PIPE_OK) {
+         if (ret == PIPE_OK) {
            ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf);
-            if(ret == PIPE_OK) {
+            if (ret == PIPE_OK) {
               fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
               return TRUE;
            }
@@ -499,7 +489,7 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
 static void
 fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf)
 {
-   if(fenced_buf->data) {
+   if (fenced_buf->data) {
      align_free(fenced_buf->data);
      fenced_buf->data = NULL;
      assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size);
@@ -516,14 +506,14 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
                                        struct fenced_buffer *fenced_buf)
 {
   assert(!fenced_buf->data);
-   if(fenced_buf->data)
+   if (fenced_buf->data)
      return PIPE_OK;

   if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size)
      return PIPE_ERROR_OUT_OF_MEMORY;

   fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment);
-   if(!fenced_buf->data)
+   if (!fenced_buf->data)
      return PIPE_ERROR_OUT_OF_MEMORY;

   fenced_mgr->cpu_total_size += fenced_buf->size;
@@ -538,7 +528,7 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
 static void
 fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
 {
-   if(fenced_buf->buffer) {
+   if (fenced_buf->buffer) {
      pb_reference(&fenced_buf->buffer, NULL);
   }
 }
@@ -575,41 +565,37 @@ fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
 {
   assert(!fenced_buf->buffer);

-   /*
-    * Check for signaled buffers before trying to allocate.
-    */
+   /* Check for signaled buffers before trying to allocate. */
   fenced_manager_check_signalled_locked(fenced_mgr, FALSE);

   fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);

-   /*
-    * Keep trying while there is some sort of progress:
+   /* Keep trying while there is some sort of progress:
    * - fences are expiring,
    * - or buffers are being swapped out from GPU memory into CPU memory.
    */
-   while(!fenced_buf->buffer &&
+   while (!fenced_buf->buffer &&
         (fenced_manager_check_signalled_locked(fenced_mgr, FALSE) ||
          fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
      fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
   }

-   if(!fenced_buf->buffer && wait) {
-      /*
-       * Same as before, but this time around, wait to free buffers if
+   if (!fenced_buf->buffer && wait) {
+      /* Same as before, but this time around, wait to free buffers if
       * necessary.
       */
-      while(!fenced_buf->buffer &&
+      while (!fenced_buf->buffer &&
            (fenced_manager_check_signalled_locked(fenced_mgr, TRUE) ||
             fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
         fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
      }
   }

-   if(!fenced_buf->buffer) {
-      if(0)
+   if (!fenced_buf->buffer) {
+      if (0)
         fenced_manager_dump_locked(fenced_mgr);

-      /* give up */
+      /* Give up. */
      return PIPE_ERROR_OUT_OF_MEMORY;
   }

@@ -686,18 +672,16 @@ fenced_buffer_map(struct pb_buffer *buf,

   assert(!(flags & PB_USAGE_GPU_READ_WRITE));

-   /*
-    * Serialize writes.
-    */
-   while((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
+   /* Serialize writes. */
+   while ((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
         ((fenced_buf->flags & PB_USAGE_GPU_READ) &&
          (flags & PB_USAGE_CPU_WRITE))) {

-      /* 
-       * Don't wait for the GPU to finish accessing it, if blocking is forbidden.
+      /* Don't wait for the GPU to finish accessing it,
+       * if blocking is forbidden.
       */
-      if((flags & PB_USAGE_DONTBLOCK) &&
-          ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
+      if ((flags & PB_USAGE_DONTBLOCK) &&
+         ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
         goto done;
      }

@@ -705,17 +689,15 @@ fenced_buffer_map(struct pb_buffer *buf,
         break;
      }

-      /*
-       * Wait for the GPU to finish accessing. This will release and re-acquire
+      /* Wait for the GPU to finish accessing. This will release and re-acquire
       * the mutex, so all copies of mutable state must be discarded.
       */
      fenced_buffer_finish_locked(fenced_mgr, fenced_buf);
   }

-   if(fenced_buf->buffer) {
+   if (fenced_buf->buffer) {
      map = pb_map(fenced_buf->buffer, flags, flush_ctx);
-   }
-   else {
+   } else {
      assert(fenced_buf->data);
      map = fenced_buf->data;
   }
@@ -725,7 +707,7 @@ fenced_buffer_map(struct pb_buffer *buf,
      fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE;
   }

-done:
+ done:
   pipe_mutex_unlock(fenced_mgr->mutex);

   return map;
@@ -741,12 +723,12 @@ fenced_buffer_unmap(struct pb_buffer *buf)
   pipe_mutex_lock(fenced_mgr->mutex);

   assert(fenced_buf->mapcount);
-   if(fenced_buf->mapcount) {
+   if (fenced_buf->mapcount) {
      if (fenced_buf->buffer)
         pb_unmap(fenced_buf->buffer);
      --fenced_buf->mapcount;
-      if(!fenced_buf->mapcount)
-	 fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
+      if (!fenced_buf->mapcount)
+         fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
   }

   pipe_mutex_unlock(fenced_mgr->mutex);
@@ -765,7 +747,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
   pipe_mutex_lock(fenced_mgr->mutex);

   if (!vl) {
-      /* invalidate */
+      /* Invalidate. */
      fenced_buf->vl = NULL;
      fenced_buf->validation_flags = 0;
      ret = PIPE_OK;
@@ -776,40 +758,37 @@ fenced_buffer_validate(struct pb_buffer *buf,
   assert(!(flags & ~PB_USAGE_GPU_READ_WRITE));
   flags &= PB_USAGE_GPU_READ_WRITE;

-   /* Buffer cannot be validated in two different lists */
-   if(fenced_buf->vl && fenced_buf->vl != vl) {
+   /* Buffer cannot be validated in two different lists. */
+   if (fenced_buf->vl && fenced_buf->vl != vl) {
      ret = PIPE_ERROR_RETRY;
      goto done;
   }

-   if(fenced_buf->vl == vl &&
+   if (fenced_buf->vl == vl &&
      (fenced_buf->validation_flags & flags) == flags) {
-      /* Nothing to do -- buffer already validated */
+      /* Nothing to do -- buffer already validated. */
      ret = PIPE_OK;
      goto done;
   }

-   /*
-    * Create and update GPU storage.
-    */
-   if(!fenced_buf->buffer) {
+   /* Create and update GPU storage. */
+   if (!fenced_buf->buffer) {
      assert(!fenced_buf->mapcount);

      ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
-      if(ret != PIPE_OK) {
+      if (ret != PIPE_OK) {
         goto done;
      }

      ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf);
-      if(ret != PIPE_OK) {
+      if (ret != PIPE_OK) {
         fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
         goto done;
      }

-      if(fenced_buf->mapcount) {
+      if (fenced_buf->mapcount) {
         debug_printf("warning: validating a buffer while it is still mapped\n");
-      }
-      else {
+      } else {
         fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
      }
   }
@@ -821,7 +800,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
   fenced_buf->vl = vl;
   fenced_buf->validation_flags |= flags;

-done:
+ done:
   pipe_mutex_unlock(fenced_mgr->mutex);

   return ret;
@@ -841,13 +820,12 @@ fenced_buffer_fence(struct pb_buffer *buf,
   assert(pipe_is_referenced(&fenced_buf->base.reference));
   assert(fenced_buf->buffer);

-   if(fence != fenced_buf->fence) {
+   if (fence != fenced_buf->fence) {
      assert(fenced_buf->vl);
      assert(fenced_buf->validation_flags);

      if (fenced_buf->fence) {
-         boolean destroyed;
-         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+         boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
         assert(!destroyed);
      }
      if (fence) {
@@ -876,16 +854,15 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,

   pipe_mutex_lock(fenced_mgr->mutex);

-   /*
-    * This should only be called when the buffer is validated. Typically
+   /* This should only be called when the buffer is validated. Typically
    * when processing relocations.
    */
   assert(fenced_buf->vl);
   assert(fenced_buf->buffer);

-   if(fenced_buf->buffer)
+   if (fenced_buf->buffer) {
      pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
-   else {
+   } else {
      *base_buf = buf;
      *offset = 0;
   }
@@ -896,12 +873,12 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,

 static const struct pb_vtbl
 fenced_buffer_vtbl = {
-      fenced_buffer_destroy,
-      fenced_buffer_map,
-      fenced_buffer_unmap,
-      fenced_buffer_validate,
-      fenced_buffer_fence,
-      fenced_buffer_get_base_buffer
+   fenced_buffer_destroy,
+   fenced_buffer_map,
+   fenced_buffer_unmap,
+   fenced_buffer_validate,
+   fenced_buffer_fence,
+   fenced_buffer_get_base_buffer
 };


@@ -917,12 +894,11 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
   struct fenced_buffer *fenced_buf;
   enum pipe_error ret;

-   /*
-    * Don't stall the GPU, waste time evicting buffers, or waste memory
+   /* Don't stall the GPU, waste time evicting buffers, or waste memory
    * trying to create a buffer that will most likely never fit into the
    * graphics aperture.
    */
-   if(size > fenced_mgr->max_buffer_size) {
+   if (size > fenced_mgr->max_buffer_size) {
      goto no_buffer;
   }

@@ -942,29 +918,21 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,

   pipe_mutex_lock(fenced_mgr->mutex);

-   /*
-    * Try to create GPU storage without stalling,
-    */
+   /* Try to create GPU storage without stalling. */
   ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE);

-   /*
-    * Attempt to use CPU memory to avoid stalling the GPU.
-    */
-   if(ret != PIPE_OK) {
+   /* Attempt to use CPU memory to avoid stalling the GPU. */
+   if (ret != PIPE_OK) {
      ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
   }

-   /*
-    * Create GPU storage, waiting for some to be available.
-    */
-   if(ret != PIPE_OK) {
+   /* Create GPU storage, waiting for some to be available. */
+   if (ret != PIPE_OK) {
      ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
   }

-   /*
-    * Give up.
-    */
-   if(ret != PIPE_OK) {
+   /* Give up. */
+   if (ret != PIPE_OK) {
      goto no_storage;
   }

@@ -976,10 +944,10 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,

   return &fenced_buf->base;

-no_storage:
+ no_storage:
   pipe_mutex_unlock(fenced_mgr->mutex);
   FREE(fenced_buf);
-no_buffer:
+ no_buffer:
   return NULL;
 }

@@ -990,12 +958,12 @@ fenced_bufmgr_flush(struct pb_manager *mgr)
   struct fenced_manager *fenced_mgr = fenced_manager(mgr);

   pipe_mutex_lock(fenced_mgr->mutex);
-   while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+   while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
      ;
   pipe_mutex_unlock(fenced_mgr->mutex);

   assert(fenced_mgr->provider->flush);
-   if(fenced_mgr->provider->flush)
+   if (fenced_mgr->provider->flush)
      fenced_mgr->provider->flush(fenced_mgr->provider);
 }

@@ -1007,25 +975,25 @@ fenced_bufmgr_destroy(struct pb_manager *mgr)

   pipe_mutex_lock(fenced_mgr->mutex);

-   /* Wait on outstanding fences */
+   /* Wait on outstanding fences. */
   while (fenced_mgr->num_fenced) {
      pipe_mutex_unlock(fenced_mgr->mutex);
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
      sched_yield();
 #endif
      pipe_mutex_lock(fenced_mgr->mutex);
-      while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+      while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
         ;
   }

 #ifdef DEBUG
-   /*assert(!fenced_mgr->num_unfenced);*/
+   /* assert(!fenced_mgr->num_unfenced); */
 #endif

   pipe_mutex_unlock(fenced_mgr->mutex);
   pipe_mutex_destroy(fenced_mgr->mutex);

-   if(fenced_mgr->provider)
+   if (fenced_mgr->provider)
      fenced_mgr->provider->destroy(fenced_mgr->provider);

   fenced_mgr->ops->destroy(fenced_mgr->ops);
@@ -853,7 +853,8 @@ void
 tgsi_exec_machine_bind_shader(
   struct tgsi_exec_machine *mach,
   const struct tgsi_token *tokens,
-   struct tgsi_sampler *sampler)
+   struct tgsi_sampler *sampler,
+   struct tgsi_image *image)
 {
   uint k;
   struct tgsi_parse_context parse;
@@ -871,6 +872,7 @@ tgsi_exec_machine_bind_shader(

   mach->Tokens = tokens;
   mach->Sampler = sampler;
+   mach->Image = image;

   if (!tokens) {
      /* unbind and free all */
@@ -1994,12 +1996,12 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                   const struct tgsi_full_instruction *inst,
                   uint sampler)
 {
-   uint unit;
-
+   uint unit = 0;
+   int i;
   if (inst->Src[sampler].Register.Indirect) {
      const struct tgsi_full_src_register *reg = &inst->Src[sampler];
      union tgsi_exec_channel indir_index, index2;
-
+      const uint execmask = mach->ExecMask;
      index2.i[0] =
      index2.i[1] =
      index2.i[2] =
@@ -2012,7 +2014,13 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                             &index2,
                             &ZeroVec,
                             &indir_index);
-      unit = inst->Src[sampler].Register.Index + indir_index.i[0];
+      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+         if (execmask & (1 << i)) {
+            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
+            break;
+         }
+      }
+
   } else {
      unit = inst->Src[sampler].Register.Index;
   }
@@ -2046,7 +2054,8 @@ exec_tex(struct tgsi_exec_machine *mach,
   assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
   assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);

-   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, &shadow_ref);
+   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
+   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);

   assert(dim <= 4);
   if (shadow_ref >= 0)
@@ -2145,7 +2154,7 @@ exec_lodq(struct tgsi_exec_machine *mach,
   union tgsi_exec_channel r[2];

   unit = fetch_sampler_unit(mach, inst, 1);
-   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, NULL);
+   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
   assert(dim <= Elements(coords));
   /* fetch coordinates */
   for (i = 0; i < dim; i++) {
@@ -3700,6 +3709,247 @@ exec_dfracexp(struct tgsi_exec_machine *mach,
   }
 }

+/**
+ * Return how many integer coordinate components are needed to address a
+ * texel of an image with the given TGSI_TEXTURE_x target.
+ *
+ * An unrecognised target trips an assertion and yields 0.
+ */
+static int
+get_image_coord_dim(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+      return 1;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+      return 2;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      return 3;
+   default:
+      assert(!"unknown texture target");
+      return 0;
+   }
+}
+
+/**
+ * Return the channel of the coordinate operand that carries the sample
+ * index for a multisampled image target, or 0 when the target has no
+ * sample index (callers test the result for non-zero before fetching).
+ *
+ * Callers fetch channel TGSI_CHAN_X + <result>, so the value must name one
+ * of the existing source channels.  Both MSAA targets therefore use the
+ * last channel (3), the same channel the texture-sampling helpers use for
+ * the sample index of these targets.
+ */
+static int
+get_image_coord_sample(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return 3;
+   default:
+      /* Not a multisampled target. */
+      return 0;
+   }
+}
+
+/**
+ * Execute TGSI_OPCODE_LOAD: read one quad of texels through the bound
+ * image interface and write them to the destination register.
+ *
+ * Source 0 selects the image unit; source 1 supplies the integer
+ * coordinates and, for multisampled targets, the sample index.
+ */
+static void
+exec_load(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   const unsigned target = inst->Memory.Texture;
+   const int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   const uint unit = fetch_sampler_unit(mach, inst, 0);
+   const int num_coords = get_image_coord_dim(target);
+   const int sample_chan = get_image_coord_sample(target);
+   union tgsi_exec_channel coord[3], sample_idx;
+   union tgsi_exec_channel texel[TGSI_NUM_CHANNELS];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   uint chan;
+   int c, lane;
+
+   assert(num_coords <= 3);
+
+   /* Only live, non-helper, non-killed lanes may touch the image. */
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = target;
+   params.format = inst->Memory.Format;
+
+   /* Coordinates beyond num_coords are left unfetched. */
+   for (c = 0; c < num_coords; c++) {
+      IFETCH(&coord[c], 1, TGSI_CHAN_X + c);
+   }
+
+   if (sample_chan != 0) {
+      IFETCH(&sample_idx, 1, TGSI_CHAN_X + sample_chan);
+   }
+
+   mach->Image->load(mach->Image, &params,
+                     coord[0].i, coord[1].i, coord[2].i, sample_idx.i,
+                     rgba);
+
+   /* Repack each channel and store it if the write mask selects it. */
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+         texel[chan].f[lane] = rgba[chan][lane];
+      }
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &texel[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+/**
+ * Execute TGSI_OPCODE_STORE: write one quad of texels through the bound
+ * image interface.
+ *
+ * The image unit is taken directly from the destination register index.
+ * Source 0 supplies the integer coordinates (and the sample index for
+ * multisampled targets); source 1 supplies the four-channel value.
+ */
+static void
+exec_store(struct tgsi_exec_machine *mach,
+           const struct tgsi_full_instruction *inst)
+{
+   const unsigned target = inst->Memory.Texture;
+   const int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   const uint unit = inst->Dst[0].Register.Index;
+   const int num_coords = get_image_coord_dim(target);
+   const int sample_chan = get_image_coord_sample(target);
+   union tgsi_exec_channel coord[3], sample_idx;
+   union tgsi_exec_channel texel[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int c, lane;
+
+   assert(num_coords <= 3);
+
+   /* Only live, non-helper, non-killed lanes may touch the image. */
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = target;
+   params.format = inst->Memory.Format;
+
+   /* Coordinates beyond num_coords are left unfetched. */
+   for (c = 0; c < num_coords; c++) {
+      IFETCH(&coord[c], 0, TGSI_CHAN_X + c);
+   }
+
+   for (c = 0; c < 4; c++) {
+      FETCH(&texel[c], 1, TGSI_CHAN_X + c);
+   }
+
+   if (sample_chan != 0) {
+      IFETCH(&sample_idx, 0, TGSI_CHAN_X + sample_chan);
+   }
+
+   /* Repack the fetched value into the layout the callback expects. */
+   for (c = 0; c < 4; c++) {
+      for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+         rgba[c][lane] = texel[c].f[lane];
+      }
+   }
+
+   mach->Image->store(mach->Image, &params,
+                      coord[0].i, coord[1].i, coord[2].i, sample_idx.i,
+                      rgba);
+}
+
+/**
+ * Execute an image atomic opcode through the bound image interface and
+ * write the value the callback hands back to the destination register.
+ *
+ * Source 0 selects the image unit, source 1 supplies the integer
+ * coordinates (and the sample index for multisampled targets) and source 2
+ * the operand.  TGSI_OPCODE_ATOMCAS additionally reads a second operand
+ * from source 3; for every other opcode the second operand array is passed
+ * to the callback unfilled.
+ */
+static void
+exec_atomop(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   const unsigned target = inst->Memory.Texture;
+   const int is_cas = (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS);
+   const int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   const uint unit = fetch_sampler_unit(mach, inst, 0);
+   const int num_coords = get_image_coord_dim(target);
+   const int sample_chan = get_image_coord_sample(target);
+   union tgsi_exec_channel coord[3], sample_idx;
+   union tgsi_exec_channel operand[4], operand2[4];
+   union tgsi_exec_channel result[TGSI_NUM_CHANNELS];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   uint chan;
+   int c, lane;
+
+   assert(num_coords <= 3);
+
+   /* Only live, non-helper, non-killed lanes may touch the image. */
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = target;
+   params.format = inst->Memory.Format;
+
+   /* Coordinates beyond num_coords are left unfetched. */
+   for (c = 0; c < num_coords; c++) {
+      IFETCH(&coord[c], 1, TGSI_CHAN_X + c);
+   }
+
+   for (c = 0; c < 4; c++) {
+      FETCH(&operand[c], 2, TGSI_CHAN_X + c);
+      if (is_cas) {
+         FETCH(&operand2[c], 3, TGSI_CHAN_X + c);
+      }
+   }
+
+   if (sample_chan != 0) {
+      IFETCH(&sample_idx, 1, TGSI_CHAN_X + sample_chan);
+   }
+
+   /* Repack the operands into the layout the callback expects. */
+   for (c = 0; c < 4; c++) {
+      for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+         rgba[c][lane] = operand[c].f[lane];
+      }
+   }
+   if (is_cas) {
+      for (c = 0; c < 4; c++) {
+         for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+            rgba2[c][lane] = operand2[c].f[lane];
+         }
+      }
+   }
+
+   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
+                   coord[0].i, coord[1].i, coord[2].i, sample_idx.i,
+                   rgba, rgba2);
+
+   /* rgba now holds what the callback returned; store the masked channels. */
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+         result[chan].f[lane] = rgba[chan][lane];
+      }
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &result[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+/**
+ * Execute TGSI_OPCODE_RESQ: query the dimensions of the image selected by
+ * source 0 and write them, replicated across every lane of the quad, to
+ * the destination register as integers.
+ */
+static void
+exec_resq(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   const int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   const uint unit = fetch_sampler_unit(mach, inst, 0);
+   union tgsi_exec_channel dims_chan[4];
+   struct tgsi_image_params params;
+   int dims[4];
+   int chan, lane;
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   mach->Image->get_dims(mach->Image, &params, dims);
+
+   /* The query result is uniform, so broadcast it to all lanes. */
+   for (chan = 0; chan < 4; chan++) {
+      for (lane = 0; lane < TGSI_QUAD_SIZE; lane++) {
+         dims_chan[chan].i[lane] = dims[chan];
+      }
+   }
+
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dims_chan[chan], &inst->Dst[0], inst, chan,
+                    TGSI_EXEC_DATA_INT);
+      }
+   }
+}

 static void
 micro_i2f(union tgsi_exec_channel *dst,
@@ -5166,6 +5416,34 @@ exec_instruction(
   case TGSI_OPCODE_D2U:
      exec_d2u(mach, inst);
      break;
+
+   case TGSI_OPCODE_LOAD:
+      exec_load(mach, inst);
+      break;
+
+   case TGSI_OPCODE_STORE:
+      exec_store(mach, inst);
+      break;
+
+   case TGSI_OPCODE_ATOMUADD:
+   case TGSI_OPCODE_ATOMXCHG:
+   case TGSI_OPCODE_ATOMCAS:
+   case TGSI_OPCODE_ATOMAND:
+   case TGSI_OPCODE_ATOMOR:
+   case TGSI_OPCODE_ATOMXOR:
+   case TGSI_OPCODE_ATOMUMIN:
+   case TGSI_OPCODE_ATOMUMAX:
+   case TGSI_OPCODE_ATOMIMIN:
+   case TGSI_OPCODE_ATOMIMAX:
+      exec_atomop(mach, inst);
+      break;
+
+   case TGSI_OPCODE_RESQ:
+      exec_resq(mach, inst);
+      break;
+   case TGSI_OPCODE_BARRIER:
+   case TGSI_OPCODE_MEMBAR:
+      break;
   default:
      assert( 0 );
   }
@@ -5193,6 +5471,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
      default_mask = 0x1;
   }

+   if (mach->NonHelperMask == 0)
+      mach->NonHelperMask = default_mask;
   mach->CondMask = default_mask;
   mach->LoopMask = default_mask;
   mach->ContMask = default_mask;
@@ -98,6 +98,46 @@ enum tgsi_sampler_control
   TGSI_SAMPLER_GATHER,
 };

+/**
+ * Per-instruction parameters the TGSI executor hands to every
+ * tgsi_image callback.
+ */
+struct tgsi_image_params {
+   /** Image unit the instruction addresses. */
+   unsigned unit;
+   /** TGSI_TEXTURE_x target, copied from the instruction's Memory.Texture. */
+   unsigned tgsi_tex_instr;
+   /** Format copied from the instruction's Memory.Format. */
+   enum pipe_format format;
+   /**
+    * Bitmask of quad lanes allowed to access the image: the executor passes
+    * ExecMask & NonHelperMask & ~kill-mask.
+    */
+   unsigned execmask;
+};
+
+/**
+ * Image access callbacks, implemented by code outside the TGSI executor
+ * and invoked by it for the image load/store/atomic/query opcodes.
+ *
+ * All coordinate arrays hold one integer per quad lane.  The executor only
+ * fetches as many coordinates as the target needs (and the sample index
+ * only for multisampled targets); the remaining arrays are passed
+ * unfilled, so implementations should read only what the target in
+ * \c params calls for.
+ */
+struct tgsi_image {
+   /* image interfaces */
+
+   /**
+    * Read texels at (s, t, r, sample) and return them in \c rgba; the
+    * executor then writes \c rgba to the destination register.
+    */
+   void (*load)(const struct tgsi_image *image,
+                const struct tgsi_image_params *params,
+                const int s[TGSI_QUAD_SIZE],
+                const int t[TGSI_QUAD_SIZE],
+                const int r[TGSI_QUAD_SIZE],
+                const int sample[TGSI_QUAD_SIZE],
+                float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   /**
+    * Write the values supplied in \c rgba to the texels at
+    * (s, t, r, sample).
+    */
+   void (*store)(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 const int s[TGSI_QUAD_SIZE],
+                 const int t[TGSI_QUAD_SIZE],
+                 const int r[TGSI_QUAD_SIZE],
+                 const int sample[TGSI_QUAD_SIZE],
+                 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   /**
+    * Perform the atomic operation \c opcode (a TGSI_OPCODE_ATOM* value) at
+    * (s, t, r, sample).  \c rgba carries the operand in; after the call the
+    * executor writes whatever \c rgba then holds to the destination
+    * register.  \c rgba2 is filled by the executor only for
+    * TGSI_OPCODE_ATOMCAS.
+    */
+   void (*op)(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              unsigned opcode,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   /**
+    * Query the image's dimensions into \c dims; the executor replicates
+    * the four values across all quad lanes of the destination register.
+    */
+   void (*get_dims)(const struct tgsi_image *image,
+                    const struct tgsi_image_params *params,
+                    int dims[4]);
+};
+
 /**
 * Information for sampling textures, which must be implemented
 * by code outside the TGSI executor.
@@ -201,12 +241,13 @@ struct tgsi_sampler
 #define TGSI_EXEC_NUM_TEMP_R        4

 #define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 8)
+#define TGSI_EXEC_NUM_ADDRS         3

 /* predicate register */
-#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 11)
 #define TGSI_EXEC_NUM_PREDS         1

-#define TGSI_EXEC_NUM_TEMP_EXTRAS   10
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   12



@@ -292,6 +333,7 @@ struct tgsi_exec_machine

   struct tgsi_sampler           *Sampler;

+   struct tgsi_image             *Image;
   unsigned                      ImmLimit;

   const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
@@ -311,6 +353,9 @@ struct tgsi_exec_machine
   struct tgsi_exec_vector       QuadPos;
   float                         Face;    /**< +1 if front facing, -1 if back facing */
   bool                          flatshade_color;
+
+   /* See GLSL 4.50 specification for definition of helper invocations */
+   uint NonHelperMask;  /**< non-helpers */
   /* Conditional execution masks */
   uint CondMask;  /**< For IF/ELSE/ENDIF */
   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
@@ -378,7 +423,8 @@ void
 tgsi_exec_machine_bind_shader(
   struct tgsi_exec_machine *mach,
   const struct tgsi_token *tokens,
-   struct tgsi_sampler *sampler);
+   struct tgsi_sampler *sampler,
+   struct tgsi_image *image);

 uint
 tgsi_exec_machine_run(
@@ -451,8 +497,10 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
   case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
   case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
   case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
      return 0;
+   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+      return PIPE_MAX_SHADER_IMAGES;
+
   case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
      return 32;
   }
@@ -54,6 +54,20 @@ is_memory_file(unsigned file)
 }


+/**
+ * Tell whether an opcode is a "true" texture instruction, i.e. one that
+ * samples from a texture map.
+ *
+ * The query opcodes (TXQ, TXQS, TXQ_LZ and LODQ) never count; for every
+ * other opcode the answer comes from the opcode info table's is_tex flag.
+ */
+static bool
+is_texture_inst(unsigned opcode)
+{
+   switch (opcode) {
+   case TGSI_OPCODE_TXQ:
+   case TGSI_OPCODE_TXQS:
+   case TGSI_OPCODE_TXQ_LZ:
+   case TGSI_OPCODE_LODQ:
+      return false;
+   default:
+      return tgsi_get_opcode_info(opcode)->is_tex != 0;
+   }
+}
+
 static void
 scan_instruction(struct tgsi_shader_info *info,
                 const struct tgsi_full_instruction *fullinst,
@@ -181,15 +195,35 @@ scan_instruction(struct tgsi_shader_info *info,
         info->indirect_files_read |= (1 << src->Register.File);
      }

-      /* MSAA samplers */
+      /* Texture samplers */
      if (src->Register.File == TGSI_FILE_SAMPLER) {
-         assert(fullinst->Instruction.Texture);
-         assert(src->Register.Index < Elements(info->is_msaa_sampler));
+         const unsigned index = src->Register.Index;

-         if (fullinst->Instruction.Texture &&
-             (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
-              fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
-            info->is_msaa_sampler[src->Register.Index] = TRUE;
+         assert(fullinst->Instruction.Texture);
+         assert(index < Elements(info->is_msaa_sampler));
+         assert(index < PIPE_MAX_SAMPLERS);
+
+         if (is_texture_inst(fullinst->Instruction.Opcode)) {
+            const unsigned target = fullinst->Texture.Texture;
+            assert(target < TGSI_TEXTURE_UNKNOWN);
+            /* for texture instructions, check that the texture instruction
+             * target matches the previous sampler view declaration (if there
+             * was one.)
+             */
+            if (info->sampler_targets[index] == TGSI_TEXTURE_UNKNOWN) {
+               /* probably no sampler view declaration */
+               info->sampler_targets[index] = target;
+            } else {
+               /* Make sure the texture instruction's sampler/target info
+                * agrees with the sampler view declaration.
+                */
+               assert(info->sampler_targets[index] == target);
+            }
+            /* MSAA samplers */
+            if (target == TGSI_TEXTURE_2D_MSAA ||
+                target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+               info->is_msaa_sampler[src->Register.Index] = TRUE;
+            }
         }
      }

@@ -431,6 +465,16 @@ scan_declaration(struct tgsi_shader_info *info,
         }
      } else if (file == TGSI_FILE_SAMPLER) {
         info->samplers_declared |= 1 << reg;
+      } else if (file == TGSI_FILE_SAMPLER_VIEW) {
+         unsigned target = fulldecl->SamplerView.Resource;
+         assert(target < TGSI_TEXTURE_UNKNOWN);
+         if (info->sampler_targets[reg] == TGSI_TEXTURE_UNKNOWN) {
+            /* Save sampler target for this sampler index */
+            info->sampler_targets[reg] = target;
+         } else {
+            /* if previously declared, make sure targets agree */
+            assert(info->sampler_targets[reg] == target);
+         }
      } else if (file == TGSI_FILE_IMAGE) {
         if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
            info->images_buffers |= 1 << reg;
@@ -493,6 +537,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
   for (i = 0; i < Elements(info->const_file_max); i++)
      info->const_file_max[i] = -1;
   info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = 1;
+   for (i = 0; i < Elements(info->sampler_targets); i++)
+      info->sampler_targets[i] = TGSI_TEXTURE_UNKNOWN;

   /**
    ** Setup to begin parsing input shader
@@ -65,6 +65,7 @@ struct tgsi_shader_info
   int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
   int const_file_max[PIPE_MAX_CONSTANT_BUFFERS];
   unsigned samplers_declared; /**< bitmask of declared samplers */
+   ubyte sampler_targets[PIPE_MAX_SHADER_SAMPLER_VIEWS];  /**< TGSI_TEXTURE_x values */

   ubyte input_array_first[PIPE_MAX_SHADER_INPUTS];
   ubyte input_array_last[PIPE_MAX_SHADER_INPUTS];
@@ -375,10 +375,8 @@ tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg)
 * sample index.
 */
 int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex)
 {
-   int dim;
-
   /*
    * Depending on the texture target, (src0.xyzw, src1.x) is interpreted
    * differently:
@@ -407,8 +405,7 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
   case TGSI_TEXTURE_BUFFER:
   case TGSI_TEXTURE_1D:
   case TGSI_TEXTURE_SHADOW1D:
-      dim = 1;
-      break;
+      return 1;
   case TGSI_TEXTURE_2D:
   case TGSI_TEXTURE_RECT:
   case TGSI_TEXTURE_1D_ARRAY:
@@ -416,52 +413,48 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
   case TGSI_TEXTURE_SHADOWRECT:
   case TGSI_TEXTURE_SHADOW1D_ARRAY:
   case TGSI_TEXTURE_2D_MSAA:
-      dim = 2;
-      break;
+      return 2;
   case TGSI_TEXTURE_3D:
   case TGSI_TEXTURE_CUBE:
   case TGSI_TEXTURE_2D_ARRAY:
   case TGSI_TEXTURE_SHADOWCUBE:
   case TGSI_TEXTURE_SHADOW2D_ARRAY:
   case TGSI_TEXTURE_2D_ARRAY_MSAA:
-      dim = 3;
-      break;
+      return 3;
   case TGSI_TEXTURE_CUBE_ARRAY:
   case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-      dim = 4;
-      break;
+      return 4;
   default:
      assert(!"unknown texture target");
-      dim = 0;
-      break;
+      return 0;
   }
+}

-   if (shadow_or_sample) {
-      switch (tgsi_tex) {
-      case TGSI_TEXTURE_SHADOW1D:
-         /* there is a gap */
-         *shadow_or_sample = 2;
-         break;
-      case TGSI_TEXTURE_SHADOW2D:
-      case TGSI_TEXTURE_SHADOWRECT:
-      case TGSI_TEXTURE_SHADOWCUBE:
-      case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-         *shadow_or_sample = dim;
-         break;
-      case TGSI_TEXTURE_2D_MSAA:
-      case TGSI_TEXTURE_2D_ARRAY_MSAA:
-         *shadow_or_sample = 3;
-         break;
-      default:
-         /* no shadow nor sample */
-         *shadow_or_sample = -1;
-         break;
-      }
+
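+/**
+ * Given a TGSI_TEXTURE_x target, return the index of the texture coordinate
+ * component that holds the shadow reference value (or, for MSAA targets,
+ * the sample index).  Returns -1 if the target has neither.
+ */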
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return 2;
+   case TGSI_TEXTURE_SHADOWCUBE:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return 3;
+   case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      return 4;
+   default:
+      /* no shadow nor sample */
+      return -1;
   }
-
-   return dim;
 }


@@ -80,7 +80,10 @@ struct tgsi_src_register
 tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg);

 int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample);
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex);
+
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex);

 boolean
 tgsi_is_shadow_target(unsigned target);
@@ -55,16 +55,16 @@ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
       dst->height != src->height)
      return FALSE;

-   for (i = 0; i < Elements(src->cbufs); i++) {
+   if (dst->nr_cbufs != src->nr_cbufs) {
+      return FALSE;
+   }
+
+   for (i = 0; i < src->nr_cbufs; i++) {
      if (dst->cbufs[i] != src->cbufs[i]) {
         return FALSE;
      }
   }

-   if (dst->nr_cbufs != src->nr_cbufs) {
-      return FALSE;
-   }
-
   if (dst->zsbuf != src->zsbuf) {
      return FALSE;
   }
@@ -2095,7 +2095,7 @@ after lookup.
 .. opcode:: SAMPLE

  Using provided address, sample data from the specified texture using the
-  filtering mode identified by the gven sampler. The source data may come from
+  filtering mode identified by the given sampler. The source data may come from
  any resource type other than buffers.

  Syntax: ``SAMPLE dst, address, sampler_view, sampler``
@@ -1109,7 +1109,7 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	default:
 		compile_error(ctx, "Unhandled store deref type: %u\n",
 				darr->deref_array_type);
-		break;
+		return;
 	}

 	for (int i = 0; i < intr->num_components; i++) {
@@ -1258,7 +1258,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			ctx->frag_face = create_input(b, 0);
 			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 		}
-		dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+		/* for fragface, we always get -1 or 0, but that is inverse
+		 * of what nir expects (where ~0 is true).  Unfortunately
+		 * trying to widen from half to full in add.s seems to do a
+		 * non-sign-extending widen (resulting in something that
+		 * gets interpreted as float Inf??)
+		 */
+		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
 		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {
@@ -740,7 +740,9 @@ fs_prepare_tgsi_sampling(struct fs_compile_context *fcc,
      break;
   }

-   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+   ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
+
   tsrc_transpose(inst->src[0], coords);
   bias_or_lod = tsrc_null();
   ref_or_si = tsrc_null();
@@ -407,7 +407,8 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
   num_derivs = 0;
   sampler_src = 1;

-   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+   ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);

   /* extract the parameters */
   switch (inst->opcode) {
@@ -177,9 +177,11 @@ struct nv50_ir_prog_info
      bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
      uint16_t texBindBase;      /* base address for tex handles (nve4) */
      uint16_t suInfoBase;       /* base address for surface info (nve4) */
+      uint16_t bufInfoBase;      /* base address for buffer info */
      uint16_t sampleInfoBase;   /* base address for sample positions */
      uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
      uint16_t msInfoBase;       /* base address for multisample info */
+      uint16_t uboInfoBase;      /* base address for compute UBOs (gk104+) */
   } io;

   /* driver callback to assign input/output locations */
@@ -1858,7 +1858,10 @@ CodeEmitterNVC0::emitLOAD(const Instruction *i)
   if (i->src(0).getFile() == FILE_MEMORY_SHARED) {
      if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
         assert(i->defExists(1));
-         defId(i->def(1), 32 + 18);
+         if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+            defId(i->def(1), 8);
+         else
+            defId(i->def(1), 32 + 18);
      }
   }

@@ -3536,8 +3536,11 @@ Converter::exportOutputs()
         Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32,
                                info->out[i].slot[c] * 4);
         Value *val = oData.load(sub.cur->values, i, c, NULL);
-         if (val)
+         if (val) {
+            if (info->out[i].sn == TGSI_SEMANTIC_POSITION)
+               mkOp1(OP_SAT, TYPE_F32, val, val);
            mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val);
+         }
      }
   }
 }
@@ -874,7 +874,17 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
-   const int array = i->tex.target.isArray();
+
+   // This function is invoked after handleTEX lowering, so we have to expect
+   // the arguments in the order that the hw wants them. For Fermi, array and
+   // indirect are both in the leading arg, while for Kepler, array and
+   // indirect are separate (and both precede the coordinates). Maxwell is
+   // handled in a separate function.
+   unsigned array;
+   if (targ->getChipset() < NVISA_GK104_CHIPSET)
+      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
+   else
+      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

@@ -1063,13 +1073,115 @@ bool
 NVC0LoweringPass::handleSUQ(Instruction *suq)
 {
   suq->op = OP_MOV;
-   suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1),
+   suq->setSrc(0, loadBufLength32(suq->getIndirect(0, 1),
                                  suq->getSrc(0)->reg.fileIndex * 16));
   suq->setIndirect(0, 0, NULL);
   suq->setIndirect(0, 1, NULL);
   return true;
 }

+void
+NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
+{ // Replaces a shared-memory atomic with a load-locked / store-unlocked retry loop.
+   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
+
+   BasicBlock *currBB = atom->bb;
+   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
+   BasicBlock *joinBB = atom->bb->splitAfter(atom);
+   BasicBlock *setAndUnlockBB = new BasicBlock(func);
+   BasicBlock *failLockBB = new BasicBlock(func);
+
+   bld.setPosition(currBB, true);
+   assert(!currBB->joinAt);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   CmpInstruction *pred =
+      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                TYPE_U32, bld.mkImm(0), bld.mkImm(1)); // 0 == 1: pred starts out false
+
+   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
+   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
+
+   bld.setPosition(tryLockBB, true);
+
+   Instruction *ld =
+      bld.mkLoad(TYPE_U32, atom->getDef(0),
+                 bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0), NULL);
+   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE)); // presumably set when the lock was acquired
+   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
+
+   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
+   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
+   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
+
+   tryLockBB->cfg.detach(&joinBB->cfg);
+   bld.remove(atom);
+
+   bld.setPosition(setAndUnlockBB, true);
+   Value *stVal;
+   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
+      // Read the old value, and write the new one.
+      stVal = atom->getSrc(1);
+   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+      CmpInstruction *set =
+         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
+                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
+
+      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
+                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
+   } else {
+      operation op;
+
+      switch (atom->subOp) {
+      case NV50_IR_SUBOP_ATOM_ADD:
+         op = OP_ADD;
+         break;
+      case NV50_IR_SUBOP_ATOM_AND:
+         op = OP_AND;
+         break;
+      case NV50_IR_SUBOP_ATOM_OR:
+         op = OP_OR;
+         break;
+      case NV50_IR_SUBOP_ATOM_XOR:
+         op = OP_XOR;
+         break;
+      case NV50_IR_SUBOP_ATOM_MIN:
+         op = OP_MIN;
+         break;
+      case NV50_IR_SUBOP_ATOM_MAX:
+         op = OP_MAX;
+         break;
+      default:
+         assert(0);
+         return;
+      }
+
+      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
+                         atom->getSrc(1));
+   }
+
+   Instruction *st =
+      bld.mkStore(OP_STORE, TYPE_U32,
+                  bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0),
+                  NULL, stVal);
+   st->setDef(0, pred->getDef(0)); // the store redefines pred (presumably with its success status)
+   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
+
+   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
+
+   // Retry the lock until the store has been performed.
+   bld.setPosition(failLockBB, true);
+   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
+   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
+   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
+   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
+
+   bld.setPosition(joinBB, false);
+   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
+}
+
 void
 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
 {
@@ -1176,11 +1288,16 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
-      handleSharedATOM(atom);
+      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
+      // operations on shared memory. For Maxwell, ATOMS is enough.
+      if (targ->getChipset() < NVISA_GK104_CHIPSET)
+         handleSharedATOM(atom);
+      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
+         handleSharedATOMNVE4(atom);
      return true;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
-      base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
+      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
      assert(base->reg.size == 8);
      if (ptr)
         base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
@@ -1204,9 +1321,11 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
 bool
 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 {
-   if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
-      // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
-      return false;
+   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
+      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
+         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
+         return false;
+      }
   }

   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
@@ -1240,19 +1359,20 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 }

 inline Value *
-NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
 {
   uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
+
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 }

 inline Value *
-NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
 {
   uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1262,10 +1382,10 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 }

 inline Value *
-NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
 {
   uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;

   if (ptr)
      ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1274,6 +1394,60 @@ NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
 }

+inline Value *
+NVC0LoweringPass::loadSuInfo32(Value *ptr, uint32_t off)
+{ // forwards to loadResInfo32() using the surface info base (io.suInfoBase)
+   return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuInfo64(Value *ptr, uint32_t off)
+{ // forwards to loadResInfo64() using the surface info base (io.suInfoBase)
+   return loadResInfo64(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuLength32(Value *ptr, uint32_t off)
+{ // forwards to loadResLength32() using the surface info base (io.suInfoBase)
+   return loadResLength32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo32(Value *ptr, uint32_t off)
+{ // forwards to loadResInfo32() using the buffer info base (io.bufInfoBase)
+   return loadResInfo32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
+{ // forwards to loadResInfo64() using the buffer info base (io.bufInfoBase)
+   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
+{ // forwards to loadResLength32() using the buffer info base (io.bufInfoBase)
+   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo32(Value *ptr, uint32_t off)
+{ // forwards to loadResInfo32() using the compute UBO info base (io.uboInfoBase)
+   return loadResInfo32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
+{ // forwards to loadResInfo64() using the compute UBO info base (io.uboInfoBase)
+   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
+{ // forwards to loadResLength32() using the compute UBO info base (io.uboInfoBase)
+   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
 inline Value *
 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
 {
@@ -1354,8 +1528,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();

-   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
-   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
+   Value *ms_x = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(0));
+   Value *ms_y = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
@@ -1408,9 +1582,9 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
   for (c = 0; c < arg; ++c) {
      src[c] = bld.getScratch();
      if (c == 0 && raw)
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
      else
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
         ->subOp = getSuClampSubOp(su, c);
   }
@@ -1432,16 +1606,16 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
   } else
   if (dim == 3) {
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l

-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
   } else {
      assert(dim == 2);
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
         ->subOp = su->tex.target.isArray() ?
         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
@@ -1452,7 +1626,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
      if (raw) {
         bf = src[0];
      } else {
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
      }
@@ -1469,7 +1643,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
      case 2:
         z = off;
         if (!su->tex.target.isArray()) {
-            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+            z = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
            subOp = NV50_IR_SUBOP_SUBFM_3D;
         }
         break;
@@ -1484,7 +1658,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
   }

   // part 2
-   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
+   v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ADDR);

   if (su->tex.target == TEX_TARGET_BUFFER) {
      eau = v;
@@ -1493,7 +1667,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
   }
   // add array layer offset
   if (su->tex.target.isArray()) {
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
      if (dim == 1)
         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
@@ -1533,7 +1707,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)

   // let's just set it 0 for raw access and hope it works
   v = raw ?
-      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+      bld.mkImm(0) : loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);

   // get rid of old coordinate sources, make space for fmt info and predicate
   su->moveSources(arg, 3 - arg);
@@ -1644,6 +1818,100 @@ NVC0LoweringPass::handleWRSV(Instruction *i)
   return true;
 }

+void
+NVC0LoweringPass::handleLDST(Instruction *i)
+{ // Lowers a load/store according to the register file of its first source.
+   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+      if (prog->getType() == Program::TYPE_COMPUTE) {
+         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+         i->getSrc(0)->reg.fileIndex = 0; // compute inputs are read from c0[]
+      } else
+      if (prog->getType() == Program::TYPE_GEOMETRY &&
+          i->src(0).isIndirect(0)) {
+         // XXX: this assumes vec4 units
+         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                                 i->getIndirect(0, 0), bld.mkImm(4));
+         i->setIndirect(0, 0, ptr);
+         i->op = OP_VFETCH;
+      } else {
+         i->op = OP_VFETCH;
+         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+      }
+   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
+          prog->getType() == Program::TYPE_COMPUTE) {
+         // The launch descriptor only allows setting up 8 CBs, but OpenGL
+         // requires at least 12 UBOs. To bypass this limitation, we store the
+         // addresses in the driver constbuf and load directly from global
+         // memory.
+         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1; // -1: presumably c0[] has no UBO info entry
+         Value *ind = i->getIndirect(0, 1);
+         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
+
+         // TODO: clamp the offset to the maximum number of const buf.
+         if (i->src(0).isIndirect(1)) {
+            Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+            Value *length = loadUboLength32(ind, fileIndex * 16);
+            Value *pred = new_LValue(func, FILE_PREDICATE);
+            if (i->src(0).isIndirect(0)) {
+               bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+               bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+            }
+            i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+            bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+            i->setPredicate(CC_NOT_P, pred); // skip the access when offset > length
+            if (i->defExists(0)) {
+               bld.mkMov(i->getDef(0), bld.mkImm(0));
+            }
+         } else if (fileIndex >= 0) {
+            if (i->src(0).isIndirect(0)) {
+               bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+            }
+            i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+         }
+      } else if (i->src(0).isIndirect(1)) {
+         Value *ptr;
+         if (i->src(0).isIndirect(0))
+            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
+                             i->getIndirect(0, 1), bld.mkImm(0x1010),
+                             i->getIndirect(0, 0));
+         else
+            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                             i->getIndirect(0, 1), bld.mkImm(16));
+         i->setIndirect(0, 1, NULL);
+         i->setIndirect(0, 0, ptr);
+         i->subOp = NV50_IR_SUBOP_LDC_IS;
+      }
+   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+      i->op = OP_VFETCH;
+   } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      Value *ind = i->getIndirect(0, 1);
+      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
+      // XXX come up with a way not to do this for EVERY little access but
+      // rather to batch these up somehow. Unfortunately we've lost the
+      // information about the field width by the time we get here.
+      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
+      Value *pred = new_LValue(func, FILE_PREDICATE);
+      if (i->src(0).isIndirect(0)) {
+         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+      }
+      i->setIndirect(0, 1, NULL);
+      i->setIndirect(0, 0, ptr);
+      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+      i->setPredicate(CC_NOT_P, pred); // skip the access when offset > length
+      if (i->defExists(0)) {
+         bld.mkMov(i->getDef(0), bld.mkImm(0));
+      }
+   }
+}
+
 void
 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
 {
@@ -1969,60 +2237,7 @@ NVC0LoweringPass::visit(Instruction *i)
      return handleWRSV(i);
   case OP_STORE:
   case OP_LOAD:
-      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
-         if (prog->getType() == Program::TYPE_COMPUTE) {
-            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
-            i->getSrc(0)->reg.fileIndex = 0;
-         } else
-         if (prog->getType() == Program::TYPE_GEOMETRY &&
-             i->src(0).isIndirect(0)) {
-            // XXX: this assumes vec4 units
-            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                    i->getIndirect(0, 0), bld.mkImm(4));
-            i->setIndirect(0, 0, ptr);
-            i->op = OP_VFETCH;
-         } else {
-            i->op = OP_VFETCH;
-            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
-         }
-      } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
-         if (i->src(0).isIndirect(1)) {
-            Value *ptr;
-            if (i->src(0).isIndirect(0))
-               ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(0x1010),
-                                i->getIndirect(0, 0));
-            else
-               ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(16));
-            i->setIndirect(0, 1, NULL);
-            i->setIndirect(0, 0, ptr);
-            i->subOp = NV50_IR_SUBOP_LDC_IS;
-         }
-      } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
-         assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
-         i->op = OP_VFETCH;
-      } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
-         Value *ind = i->getIndirect(0, 1);
-         Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
-         // XXX come up with a way not to do this for EVERY little access but
-         // rather to batch these up somehow. Unfortunately we've lost the
-         // information about the field width by the time we get here.
-         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
-         Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
-         Value *pred = new_LValue(func, FILE_PREDICATE);
-         if (i->src(0).isIndirect(0)) {
-            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
-            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
-         }
-         i->setIndirect(0, 1, NULL);
-         i->setIndirect(0, 0, ptr);
-         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
-         i->setPredicate(CC_NOT_P, pred);
-         if (i->defExists(0)) {
-            bld.mkMov(i->getDef(0), bld.mkImm(0));
-         }
-      }
+      handleLDST(i);
      break;
   case OP_ATOM:
   {
@@ -106,6 +106,8 @@ protected:
   bool handleCasExch(Instruction *, bool needCctl);
   void handleSurfaceOpNVE4(TexInstruction *);
   void handleSharedATOM(Instruction *);
+   void handleSharedATOMNVE4(Instruction *);
+   void handleLDST(Instruction *);

   void checkPredicate(Instruction *);

@@ -117,9 +119,18 @@ private:

   void readTessCoord(LValue *dst, int c);

-   Value *loadResInfo32(Value *ptr, uint32_t off);
-   Value *loadResInfo64(Value *ptr, uint32_t off);
-   Value *loadResLength32(Value *ptr, uint32_t off);
+   Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadSuInfo32(Value *ptr, uint32_t off);
+   Value *loadSuInfo64(Value *ptr, uint32_t off);
+   Value *loadSuLength32(Value *ptr, uint32_t off);
+   Value *loadBufInfo32(Value *ptr, uint32_t off);
+   Value *loadBufInfo64(Value *ptr, uint32_t off);
+   Value *loadBufLength32(Value *ptr, uint32_t off);
+   Value *loadUboInfo32(Value *ptr, uint32_t off);
+   Value *loadUboInfo64(Value *ptr, uint32_t off);
+   Value *loadUboLength32(Value *ptr, uint32_t off);
   Value *loadMsInfo32(Value *ptr, uint32_t off);
   Value *loadTexHandle(Value *ptr, unsigned int slot);

@@ -853,7 +853,7 @@ isShortRegOp(Instruction *insn)
 static bool
 isShortRegVal(LValue *lval)
 {
-   if (lval->defs.size() == 0)
+   if (lval->getInsn() == NULL)
      return false;
   for (Value::DefCIterator def = lval->defs.begin();
        def != lval->defs.end(); ++def)
@@ -1467,7 +1467,7 @@ GCRA::allocateRegisters(ArrayList& insns)
         nodes[i].init(regs, lval);
         RIG.insert(&nodes[i]);

-         if (lval->inFile(FILE_GPR) && lval->defs.size() > 0 &&
+         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL &&
             prog->getTarget()->getChipset() < 0xc0) {
            Instruction *insn = lval->getInsn();
            if (insn->op == OP_MAD || insn->op == OP_SAD)
@@ -67,9 +67,18 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
      break;
   }

+   if (bindings & PIPE_BIND_LINEAR)
+      if (util_format_is_depth_or_stencil(format) ||
+          (target != PIPE_TEXTURE_1D &&
+           target != PIPE_TEXTURE_2D &&
+           target != PIPE_TEXTURE_RECT) ||
+          sample_count > 1)
+         return false;
+
   /* transfers & shared are always supported */
   bindings &= ~(PIPE_BIND_TRANSFER_READ |
                 PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_LINEAR |
                 PIPE_BIND_SHARED);

   return (( nv50_format_table[format].usage |
@@ -110,9 +110,18 @@
 /* 32 textures handles, at 1 32-bits integer each */
 #define NVC0_CB_AUX_TEX_INFO(i)     0x020 + (i) * 4
 #define NVC0_CB_AUX_TEX_SIZE        (32 * 4)
+/* 8 sets of 32-bits coordinate offsets */
+#define NVC0_CB_AUX_MS_INFO         0x0a0 /* CP */
+#define NVC0_CB_AUX_MS_SIZE         (8 * 2 * 4)
+/* block/grid size, at 3 32-bits integers each and gridid */
+#define NVC0_CB_AUX_GRID_INFO       0x0e0 /* CP */
+#define NVC0_CB_AUX_GRID_SIZE       (7 * 4)
 /* 8 user clip planes, at 4 32-bits floats each */
 #define NVC0_CB_AUX_UCP_INFO        0x100
 #define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
+/* 13 ubos, at 4 32-bits integer each */
+#define NVC0_CB_AUX_UBO_INFO(i)     0x100 + (i) * 4 * 4 /* CP */
+#define NVC0_CB_AUX_UBO_SIZE        ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
 /* 8 sets of 32-bits integer pairs sample offsets */
 #define NVC0_CB_AUX_SAMPLE_INFO     0x180 /* FP */
 #define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 4 * 2)
@@ -540,24 +540,24 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,

   if (prog->type == PIPE_SHADER_COMPUTE) {
      if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.auxCBSlot = 0;
-         info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
-         info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
-         info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
-      } else {
-         info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+         info->io.auxCBSlot = 7;
+         info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
+         info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
+         info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
      }
      info->io.msInfoCBSlot = 0;
-      info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
+      info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
+      info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+      info->io.suInfoBase = 0; /* TODO */
   } else {
      if (chipset >= NVISA_GK104_CHIPSET) {
         info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
-         info->io.suInfoBase = 0; /* TODO */
      }
      info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
-      info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+      info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
      info->io.msInfoCBSlot = 15;
      info->io.msInfoBase = 0; /* TODO */
+      info->io.suInfoBase = 0; /* TODO */
   }

   info->assignSlots = nvc0_program_assign_varying_slots;
@@ -57,9 +57,18 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
      if (util_format_get_blocksizebits(format) == 3 * 32)
         return false;

+   if (bindings & PIPE_BIND_LINEAR)
+      if (util_format_is_depth_or_stencil(format) ||
+          (target != PIPE_TEXTURE_1D &&
+           target != PIPE_TEXTURE_2D &&
+           target != PIPE_TEXTURE_RECT) ||
+          sample_count > 1)
+         return false;
+
   /* transfers & shared are always supported */
   bindings &= ~(PIPE_BIND_TRANSFER_READ |
                 PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_LINEAR |
                 PIPE_BIND_SHARED);

   return (( nvc0_format_table[format].usage |
@@ -282,7 +291,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_PREFERRED_IR:
      return PIPE_SHADER_IR_TGSI;
   case PIPE_SHADER_CAP_SUPPORTED_IRS:
-      if (class_3d >= NVE4_3D_CLASS)
+      if (class_3d == NVF0_3D_CLASS &&
+          !debug_get_bool_option("NVF0_COMPUTE", false))
         return 0;
      return 1 << PIPE_SHADER_IR_TGSI;
   case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
@@ -311,8 +321,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
   case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
      return 65536;
   case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-      if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
-         return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
      return NVC0_MAX_PIPE_CONSTBUFS;
   case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
      return shader != PIPE_SHADER_FRAGMENT;
@@ -16,7 +16,6 @@

 /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
 #define NVC0_MAX_PIPE_CONSTBUFS         14
-#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE  7

 #define NVC0_MAX_SURFACE_SLOTS 16

@@ -1295,6 +1295,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
                      NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 |
                      NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST);
   }
+   for (i = 1; i < n; ++i)
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
   if (nvc0->state.instance_elts) {
      nvc0->state.instance_elts = 0;
      BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
@@ -1303,6 +1305,17 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
   }
   nvc0->state.num_vtxelts = 2;

+   if (nvc0->state.prim_restart) {
+      IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+      nvc0->state.prim_restart = 0;
+   }
+
+   if (nvc0->state.index_bias) {
+      IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
+      nvc0->state.index_bias = 0;
+   }
+
   for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
      if (info->dst.box.z + i) {
         BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
@@ -41,6 +41,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
   int i;
   int ret;
   uint32_t obj_class;
+   uint64_t address;

   switch (dev->chipset & ~0xf) {
   case 0x100:
@@ -65,7 +66,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
      return ret;
   }

-   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
                        &screen->parm);
   if (ret)
      return ret;
@@ -95,9 +96,9 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    *  accessible. We cannot prevent that at the moment, so expect failure.
    */
   BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
-   PUSH_DATA (push, 1 << 24);
+   PUSH_DATA (push, 0xff << 24);
   BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
-   PUSH_DATA (push, 2 << 24);
+   PUSH_DATA (push, 0xfe << 24);

   BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->text->offset);
@@ -128,15 +129,17 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
   }

   BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
-   PUSH_DATA (push, 0); /* does not interefere with 3D */
+   PUSH_DATA (push, 7); /* does not interfere with 3D */

   if (obj_class == NVF0_COMPUTE_CLASS)
      IMMED_NVC0(push, SUBC_CP(0x02c4), 1);

+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
+
   /* MS sample coordinate offsets: these do not work with _ALT modes ! */
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 64);
   PUSH_DATA (push, 1);
@@ -159,7 +162,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
   PUSH_DATA (push, 3); /* 7 */
   PUSH_DATA (push, 1);

-#ifdef DEBUG
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
@@ -194,6 +197,9 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
   uint32_t mask;
   unsigned i;
   const unsigned t = 1;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   mask = nvc0->surfaces_dirty[t];
   while (mask) {
@@ -205,8 +211,8 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
       * directly instead of via binding points, so we have to supply them.
       */
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
-      PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
+      PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i));
+      PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i));
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
      PUSH_DATA (push, 64);
      PUSH_DATA (push, 1);
@@ -271,6 +277,7 @@ static void
 nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 {
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
   uint64_t address;
   const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
   unsigned i, n;
@@ -282,11 +289,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
   n = util_logbase2(dirty) + 1 - i;
   assert(n);

-   address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i);
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);

   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, address);
-   PUSH_DATA (push, address);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
+   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, n * 4);
   PUSH_DATA (push, 0x1);
@@ -301,6 +308,103 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
   nvc0->samplers_dirty[s] = 0;
 }

+/* Upload the dirty compute-stage constant buffers for the NVE4 compute path.
+ *
+ * For every bit set in nvc0->constbuf_dirty[s]:
+ *  - a user constbuf (expected only in slot 0) has its contents copied into
+ *    screen->uniform_bo at NVC0_CB_USR_INFO(s);
+ *  - a buffer-backed constbuf (expected only in slots > 0) gets a four-word
+ *    entry {address low, address high, size, 0} written at
+ *    NVC0_CB_AUX_UBO_INFO(i - 1) inside the stage's auxiliary area, and the
+ *    resource is referenced in the compute bufctx.
+ *
+ * An NVE4_COMPUTE_FLUSH_CB flush is always emitted at the end.
+ */
+static void
+nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const int s = 5;
+
+   while (nvc0->constbuf_dirty[s]) {
+      /* take the lowest dirty slot and clear its bit */
+      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
+      nvc0->constbuf_dirty[s] &= ~(1 << i);
+
+      if (nvc0->constbuf[s][i].user) {
+         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+         const unsigned base = NVC0_CB_USR_INFO(s);
+         const unsigned size = nvc0->constbuf[s][0].size;
+         assert(i == 0); /* we really only want OpenGL uniforms here */
+         assert(nvc0->constbuf[s][0].u.data);
+
+         /* copy the user data inline through the pushbuf */
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, bo->offset + base);
+         PUSH_DATA (push, bo->offset + base);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, size);
+         PUSH_DATA (push, 0x1);
+         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
+         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
+      }
+      else {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->constbuf[s][i].u.buf);
+         if (res) {
+            uint64_t address
+               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+            assert(i > 0); /* we really only want uniform buffer objects */
+
+            /* slot i maps to UBO entry i - 1 (slot 0 is the user constbuf) */
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+            PUSH_DATA (push, 4 * 4);
+            PUSH_DATA (push, 0x1);
+            BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+            PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+            /* entry layout: address low, address high, size, padding */
+            PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATA (push, nvc0->constbuf[s][i].size);
+            PUSH_DATA (push, 0);
+            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+
+            res->cb_bindings[s] |= 1 << i;
+         }
+      }
+   }
+
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
+
+/* Write the compute-stage buffer table into the auxiliary area of the
+ * uniform BO, starting at NVC0_CB_AUX_BUF_INFO(0): one four-word entry
+ * {address low, address high, size, 0} per slot, with an all-zero entry for
+ * every slot that has no buffer bound. Bound resources are referenced in the
+ * compute bufctx for read/write access.
+ */
+static void
+nve4_compute_validate_buffers(struct nvc0_context *nvc0)
+{
+   const int s = 5;
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const uint64_t address =
+      nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+   int slot;
+
+   /* one linear upload covering the whole table */
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
+   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
+   PUSH_DATA (push, 0x1);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+   for (slot = 0; slot < NVC0_MAX_BUFFERS; slot++) {
+      struct nv04_resource *res;
+
+      if (!nvc0->buffers[s][slot].buffer) {
+         /* unbound slot: emit a zeroed entry */
+         int word;
+
+         for (word = 0; word < 4; word++)
+            PUSH_DATA (push, 0);
+         continue;
+      }
+
+      res = nv04_resource(nvc0->buffers[s][slot].buffer);
+      PUSH_DATA (push, res->address + nvc0->buffers[s][slot].buffer_offset);
+      PUSH_DATAh(push, res->address + nvc0->buffers[s][slot].buffer_offset);
+      PUSH_DATA (push, nvc0->buffers[s][slot].buffer_size);
+      PUSH_DATA (push, 0);
+      BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
+   }
+}
+
 static struct nvc0_state_validate
 validate_list_cp[] = {
   { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
@@ -310,6 +414,8 @@ validate_list_cp[] = {
                                          NVC0_NEW_CP_SAMPLERS    },
   { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
   { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
+   { nve4_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
+   { nve4_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
 };

 static bool
@@ -327,13 +433,16 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
 }

 static void
-nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
-                          const uint *block_layout,
-                          const uint *grid_layout)
+nve4_compute_upload_input(struct nvc0_context *nvc0,
+                          struct nve4_cp_launch_desc *desc,
+                          const struct pipe_grid_info *info)
 {
   struct nvc0_screen *screen = nvc0->screen;
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   struct nvc0_program *cp = nvc0->compprog;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);

   if (cp->parm_size) {
      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
@@ -344,18 +453,38 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
      PUSH_DATA (push, 0x1);
      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-      PUSH_DATAp(push, input, cp->parm_size / 4);
+      PUSH_DATAp(push, info->input, cp->parm_size / 4);
+
+      /* Bind user parameters coming from clover. */
+      /* TODO: This should be harmonized with uniform_bo. */
+      assert(!(desc->cb_mask & (1 << 0)));
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
   }
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
+   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO);
   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
   PUSH_DATA (push, 7 * 4);
   PUSH_DATA (push, 0x1);
-   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-   PUSH_DATAp(push, block_layout, 3);
-   PUSH_DATAp(push, grid_layout, 3);
+
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+   } else {
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      PUSH_DATAp(push, info->grid, 3);
+   }
   PUSH_DATA (push, 0);

   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
@@ -375,24 +504,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                               struct nve4_cp_launch_desc *desc,
-                               uint32_t label,
-                               const uint *block_layout,
-                               const uint *grid_layout)
+                               const struct pipe_grid_info *info)
 {
   const struct nvc0_screen *screen = nvc0->screen;
   const struct nvc0_program *cp = nvc0->compprog;
-   unsigned i;

   nve4_cp_launch_desc_init_default(desc);

-   desc->entry = nvc0_program_symbol_offset(cp, label);
+   desc->entry = nvc0_program_symbol_offset(cp, info->pc);

-   desc->griddim_x = grid_layout[0];
-   desc->griddim_y = grid_layout[1];
-   desc->griddim_z = grid_layout[2];
-   desc->blockdim_x = block_layout[0];
-   desc->blockdim_y = block_layout[1];
-   desc->blockdim_z = block_layout[2];
+   desc->griddim_x = info->grid[0];
+   desc->griddim_y = info->grid[1];
+   desc->griddim_z = info->grid[2];
+   desc->blockdim_x = info->block[0];
+   desc->blockdim_y = info->block[1];
+   desc->blockdim_z = info->block[2];

   desc->shared_size = align(cp->cp.smem_size, 0x100);
   desc->local_size_p = align(cp->cp.lmem_size, 0x10);
@@ -403,12 +529,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
   desc->gpr_alloc = cp->num_gprs;
   desc->bar_alloc = cp->num_barriers;

-   for (i = 0; i < 7; ++i) {
-      const unsigned s = 5;
-      if (nvc0->constbuf[s][i].u.buf)
-         nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+   // Only bind OpenGL uniforms and the driver constant buffer through the
+   // launch descriptor: UBOs are stored in the driver cb instead, to avoid
+   // the limitation of 8 CBs.
+   if (nvc0->constbuf[5][0].user) {
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+                                 NVC0_CB_USR_INFO(5), 1 << 16);
   }
-   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+   nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+                              NVC0_CB_AUX_INFO(5), 1 << 10);
 }

 static inline struct nve4_cp_launch_desc *
@@ -448,29 +577,62 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
   if (ret)
      goto out;

-   nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
-                                  info->block, info->grid);
+   nve4_compute_setup_launch_desc(nvc0, desc, info);
+
+   nve4_compute_upload_input(nvc0, desc, info);
+
 #ifdef DEBUG
   if (debug_get_num_option("NV50_PROG_DEBUG", 0))
      nve4_compute_dump_launch_desc(desc);
 #endif

-   nve4_compute_upload_input(nvc0, info->input, info->block, info->grid);
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      /* upload the descriptor */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr);
+      PUSH_DATA (push, desc_gpuaddr);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 256);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+      /* overwrite griddim_x and griddim_y as two 32-bit integers even
+       * if griddim_y must be a 16-bit integer */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 48);
+      PUSH_DATA (push, desc_gpuaddr + 48);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 8);
+      PUSH_DATA (push, 1);
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+      /* overwrite the 16 high bits of griddim_y with griddim_z because
+       * we need (z << 16) | y */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 54);
+      PUSH_DATA (push, desc_gpuaddr + 54);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 4);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset + 8,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+   }

   /* upload descriptor and flush */
-#if 0
-   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, desc_gpuaddr);
-   PUSH_DATA (push, desc_gpuaddr);
-   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-   PUSH_DATA (push, 256);
-   PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
-   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
-   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
   BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
   PUSH_DATA (push, desc_gpuaddr >> 8);
   BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
@@ -495,7 +657,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
   const unsigned s = 5;
   unsigned i;
-   uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+   uint32_t commands[2][32];
   unsigned n[2] = { 0, 0 };

   for (i = 0; i < nvc0->num_textures[s]; ++i) {
@@ -4,31 +4,6 @@

 #include "nvc0/nve4_compute.xml.h"

-/* Input space is implemented as c0[], to which we bind the screen->parm bo.
- */
-#define NVE4_CP_INPUT_USER           0x0000
-#define NVE4_CP_INPUT_USER_LIMIT     0x1000
-#define NVE4_CP_INPUT_GRID_INFO(i)  (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NTID(i)       (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NCTAID(i)     (0x100c + (i) * 4)
-#define NVE4_CP_INPUT_GRIDID         0x1018
-#define NVE4_CP_INPUT_TEX(i)        (0x1040 + (i) * 4)
-#define NVE4_CP_INPUT_TEX_STRIDE     4
-#define NVE4_CP_INPUT_TEX_MAX        32
-#define NVE4_CP_INPUT_MS_OFFSETS     0x10c0
-#define NVE4_CP_INPUT_SUF_STRIDE     64
-#define NVE4_CP_INPUT_SUF(i)        (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
-#define NVE4_CP_INPUT_SUF_MAX        32
-#define NVE4_CP_INPUT_TRAP_INFO_PTR  0x1900
-#define NVE4_CP_INPUT_TEMP_PTR       0x1908
-#define NVE4_CP_INPUT_MP_TEMP_SIZE   0x1910
-#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914
-#define NVE4_CP_INPUT_CSTACK_SIZE    0x1918
-#define NVE4_CP_INPUT_SIZE           0x1a00
-#define NVE4_CP_PARAM_TRAP_INFO      0x2000
-#define NVE4_CP_PARAM_TRAP_INFO_SZ  (1 << 16)
-#define NVE4_CP_PARAM_SIZE          (NVE4_CP_PARAM_TRAP_INFO + (1 << 16))
-
 struct nve4_cp_launch_desc
 {
   u32 unk0[8];
@@ -81,7 +56,7 @@ static inline void
 nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
                           unsigned index,
                           struct nouveau_bo *bo,
-                           uint32_t base, uint16_t size)
+                           uint32_t base, uint32_t size)
 {
   uint64_t address = bo->offset + base;

@@ -95,23 +70,6 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
   desc->cb_mask |= 1 << index;
 }

-static inline void
-nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
-                               unsigned index,
-                               const struct nvc0_constbuf *cb)
-{
-   assert(index < 8);
-
-   if (!cb->u.buf) {
-      desc->cb_mask &= ~(1 << index);
-   } else {
-      const struct nv04_resource *buf = nv04_resource(cb->u.buf);
-      assert(!cb->user);
-      nve4_cp_launch_desc_set_cb(desc, index,
-                                 buf->bo, buf->offset + cb->offset, cb->size);
-   }
-}
-
 struct nve4_mp_trap_info {
   u32 lock;
   u32 pc;
@@ -299,6 +299,11 @@ boolean evergreen_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;

+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }

@@ -239,6 +239,11 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;

+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }

@@ -467,6 +467,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
 	case CHIP_FIJI: return "AMD FIJI";
+	case CHIP_POLARIS10: return "AMD POLARIS10";
+	case CHIP_POLARIS11: return "AMD POLARIS11";
 	case CHIP_STONEY: return "AMD STONEY";
 	default: return "AMD unknown";
 	}
@@ -597,6 +599,13 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 #else
 	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
+#endif
+#if HAVE_LLVM <= 0x0308
+	case CHIP_POLARIS10: return "tonga";
+	case CHIP_POLARIS11: return "tonga";
+#else
+	case CHIP_POLARIS10: return "polaris10";
+	case CHIP_POLARIS11: return "polaris11";
 #endif
 	default: return "";
 	}
@@ -1066,7 +1066,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 			item_mask = 0x3;
 		}

-		while(num_tile_pipes--) {
+		while (num_tile_pipes--) {
 			i = backend_map & item_mask;
 			mask |= (1<<i);
 			backend_map >>= item_width;
@@ -335,7 +335,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 	 */
 	if (resource->target != PIPE_BUFFER &&
 	    (resource->nr_samples > 1 || rtex->is_depth))
-		return NULL;
+		return false;

 	if (!res->is_shared) {
 		res->is_shared = true;
@@ -50,6 +50,7 @@
 #define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
 #define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
 #define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
+#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8))

 /**
 * flush commands to the hardware
@@ -408,7 +409,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
            rscreen->info.drm_major == 3)
 		enc->use_vui = true;
 	if (rscreen->info.family >= CHIP_TONGA &&
-             rscreen->info.family != CHIP_STONEY)
+	    rscreen->info.family != CHIP_STONEY &&
+	    rscreen->info.family != CHIP_POLARIS11)
 		enc->dual_pipe = true;
 	/* TODO enable B frame with dual instance */
 	if ((rscreen->info.family >= CHIP_TONGA) &&
@@ -482,6 +484,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 		break;

 	case FW_52_0_3:
+	case FW_52_4_3:
 		radeon_vce_52_init(enc);
 		break;

@@ -514,6 +517,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 	case FW_50_10_2:
 	case FW_50_17_3:
 	case FW_52_0_3:
+	case FW_52_4_3:
 		return true;
 	default:
 		return false;
@@ -124,6 +124,8 @@ enum radeon_family {
    CHIP_CARRIZO,
    CHIP_FIJI,
    CHIP_STONEY,
+    CHIP_POLARIS10,
+    CHIP_POLARIS11,
    CHIP_LAST,
 };

@@ -598,6 +598,8 @@ static bool si_init_gs_info(struct si_screen *sscreen)
 	case CHIP_HAWAII:
 	case CHIP_TONGA:
 	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
 		sscreen->gs_table_depth = 32;
 		return true;
 	default:
@@ -39,6 +39,7 @@
 #include "radeon/radeon_llvm_emit.h"
 #include "util/u_memory.h"
 #include "util/u_pstipple.h"
+#include "util/u_string.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
@@ -2874,8 +2875,7 @@ static LLVMValueRef image_fetch_coords(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	unsigned target = inst->Memory.Texture;
-	int sample;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 	LLVMValueRef coords[4];
 	LLVMValueRef tmp;
 	int chan;
@@ -3387,8 +3387,8 @@ static void tex_fetch_args(
 	unsigned target = inst->Texture.Texture;
 	LLVMValueRef coords[5], derivs[6];
 	LLVMValueRef address[16];
-	int ref_pos;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
+	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
 	unsigned count = 0;
 	unsigned chan;
 	unsigned num_deriv_channels = 0;
@@ -4996,7 +4996,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary

 			line = binary->disasm_string;
 			while (*line) {
-				p = strchrnul(line, '\n');
+				p = util_strchrnul(line, '\n');
 				count = p - line;

 				if (count) {
@@ -2046,6 +2046,11 @@ boolean si_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;

+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }

@@ -3946,6 +3951,14 @@ static void si_init_config(struct si_context *sctx)
 			raster_config_1 = 0x0000002e;
 		}
 		break;
+	case CHIP_POLARIS10:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	case CHIP_POLARIS11:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
 	case CHIP_TONGA:
 		raster_config = 0x16000012;
 		raster_config_1 = 0x0000002a;
@@ -10,6 +10,7 @@ C_SOURCES := \
 	sp_flush.h \
 	sp_fs_exec.c \
 	sp_fs.h \
+	sp_image.c \
 	sp_limits.h \
 	sp_prim_vbuf.c \
 	sp_prim_vbuf.h \
@@ -31,6 +32,7 @@ C_SOURCES := \
 	sp_state_blend.c \
 	sp_state_clip.c \
 	sp_state_derived.c \
+	sp_state_image.c \
 	sp_state.h \
 	sp_state_rasterizer.c \
 	sp_state_sampler.c \
@@ -50,7 +50,7 @@
 #include "sp_query.h"
 #include "sp_screen.h"
 #include "sp_tex_sample.h"
-
+#include "sp_image.h"

 static void
 softpipe_destroy( struct pipe_context *pipe )
@@ -199,6 +199,10 @@ softpipe_create_context(struct pipe_screen *screen,
      softpipe->tgsi.sampler[i] = sp_create_tgsi_sampler();
   }

+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      softpipe->tgsi.image[i] = sp_create_tgsi_image();
+   }
+
   softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
   softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );

@@ -216,6 +220,7 @@ softpipe_create_context(struct pipe_screen *screen,
   softpipe_init_streamout_funcs(&softpipe->pipe);
   softpipe_init_texture_funcs( &softpipe->pipe );
   softpipe_init_vertex_funcs(&softpipe->pipe);
+   softpipe_init_image_funcs(&softpipe->pipe);

   softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state;

@@ -223,7 +228,8 @@ softpipe_create_context(struct pipe_screen *screen,

   softpipe->pipe.clear = softpipe_clear;
   softpipe->pipe.flush = softpipe_flush_wrapped;
-
+   softpipe->pipe.texture_barrier = softpipe_texture_barrier;
+   softpipe->pipe.memory_barrier = softpipe_memory_barrier;
   softpipe->pipe.render_condition = softpipe_render_condition;
   
   /*
@@ -272,6 +278,16 @@ softpipe_create_context(struct pipe_screen *screen,
                        (struct tgsi_sampler *)
                           softpipe->tgsi.sampler[PIPE_SHADER_GEOMETRY]);

+   draw_image(softpipe->draw,
+              PIPE_SHADER_VERTEX,
+              (struct tgsi_image *)
+              softpipe->tgsi.image[PIPE_SHADER_VERTEX]);
+
+   draw_image(softpipe->draw,
+              PIPE_SHADER_GEOMETRY,
+              (struct tgsi_image *)
+              softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]);
+
   if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE ))
      softpipe->no_rast = TRUE;

@@ -83,6 +83,7 @@ struct softpipe_context {
   struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS];
   struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];

+   struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
   struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
   struct pipe_index_buffer index_buffer;
@@ -172,9 +173,12 @@ struct softpipe_context {
   /** TGSI exec things */
   struct {
      struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES];
+      struct sp_tgsi_image *image[PIPE_SHADER_TYPES];
   } tgsi;

   struct tgsi_exec_machine *fs_machine;
+   /** whether early depth testing is enabled */
+   bool early_depth;

   /** The primitive drawing context */
   struct draw_context *draw;
@@ -168,3 +168,29 @@ softpipe_flush_resource(struct pipe_context *pipe,

   return TRUE;
 }
+
+/* Flush the texture tile caches of every shader stage's bound sampler
+ * views, then the colour-buffer and depth/stencil tile caches that exist,
+ * and finally clear dirty_render_cache.
+ */
+void softpipe_texture_barrier(struct pipe_context *pipe)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   uint stage, view, cb;
+
+   for (stage = 0; stage < Elements(sp->tex_cache); stage++) {
+      for (view = 0; view < sp->num_sampler_views[stage]; view++)
+         sp_flush_tex_tile_cache(sp->tex_cache[stage][view]);
+   }
+
+   for (cb = 0; cb < sp->framebuffer.nr_cbufs; cb++) {
+      if (!sp->cbuf_cache[cb])
+         continue;
+      sp_flush_tile_cache(sp->cbuf_cache[cb]);
+   }
+
+   if (sp->zsbuf_cache) {
+      sp_flush_tile_cache(sp->zsbuf_cache);
+   }
+
+   sp->dirty_render_cache = FALSE;
+}
+
+/* Memory barrier hook: `flags` is not inspected; every request is handled
+ * by the same flush that softpipe_texture_barrier() performs.
+ */
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags)
+{
+   (void) flags;
+   softpipe_texture_barrier(pipe);
+}
@@ -55,4 +55,6 @@ softpipe_flush_resource(struct pipe_context *pipe,
                        boolean cpu_access,
                        boolean do_not_block);

+void softpipe_texture_barrier(struct pipe_context *pipe);
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags);
 #endif
@@ -62,14 +62,15 @@ sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var)
 static void
 exec_prepare( const struct sp_fragment_shader_variant *var,
              struct tgsi_exec_machine *machine,
-              struct tgsi_sampler *sampler )
+              struct tgsi_sampler *sampler,
+              struct tgsi_image *image )
 {
   /*
    * Bind tokens/shader to the interpreter's machine state.
    */
   tgsi_exec_machine_bind_shader(machine,
                                 var->tokens,
-                                 sampler);
+                                 sampler, image);
 }


@@ -116,7 +117,8 @@ setup_pos_vector(const struct tgsi_interp_coef *coef,
 static unsigned 
 exec_run( const struct sp_fragment_shader_variant *var,
 	  struct tgsi_exec_machine *machine,
-	  struct quad_header *quad )
+	  struct quad_header *quad,
+	  bool early_depth_test )
 {
   /* Compute X, Y, Z, W vals for this quad */
   setup_pos_vector(quad->posCoef, 
@@ -126,6 +128,7 @@ exec_run( const struct sp_fragment_shader_variant *var,
   /* convert 0 to 1.0 and 1 to -1.0 */
   machine->Face = (float) (quad->input.facing * -2 + 1);

+   machine->NonHelperMask = quad->inout.mask;
   quad->inout.mask &= tgsi_exec_machine_run( machine );
   if (quad->inout.mask == 0)
      return FALSE;
@@ -155,16 +158,19 @@ exec_run( const struct sp_fragment_shader_variant *var,
            {
               uint j;

-               for (j = 0; j < 4; j++)
-                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               if (!early_depth_test) {
+                  for (j = 0; j < 4; j++)
+                     quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
            }
            break;
         case TGSI_SEMANTIC_STENCIL:
            {
               uint j;
-
-               for (j = 0; j < 4; j++)
-                  quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+               if (!early_depth_test) {
+                  for (j = 0; j < 4; j++)
+                     quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+               }
            }
            break;
         }
@@ -180,7 +186,7 @@ exec_delete(struct sp_fragment_shader_variant *var,
            struct tgsi_exec_machine *machine)
 {
   if (machine->Tokens == var->tokens) {
-      tgsi_exec_machine_bind_shader(machine, NULL, NULL);
+      tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL);
   }

   FREE( (void *) var->tokens );
@@ -0,0 +1,762 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_image.h"
+#include "sp_texture.h"
+
+#include "util/u_format.h"
+
+/*
+ * Compute the offset into the resource data at which an access starts:
+ * the view's first element for a buffer, or the selected mip level and
+ * layer/slice for a texture.
+ */
+static uint32_t
+get_image_offset(const struct softpipe_resource *spr,
+                 const struct pipe_image_view *iview,
+                 enum pipe_format format, unsigned r_coord)
+{
+   int layer;
+
+   switch (spr->base.target) {
+   case PIPE_BUFFER:
+      return iview->u.buf.first_element * util_format_get_blocksize(format);
+   case PIPE_TEXTURE_1D_ARRAY:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+   case PIPE_TEXTURE_CUBE:
+   case PIPE_TEXTURE_3D:
+      /* r_coord is relative to the first layer of the view */
+      layer = r_coord + iview->u.tex.first_layer;
+      break;
+   default:
+      /* targets without layers or slices */
+      layer = 0;
+      break;
+   }
+
+   return softpipe_get_tex_image_offset(spr, iview->u.tex.level, layer);
+}
+
+/*
+ * Return true when the TGSI texture target carries a layer or depth
+ * coordinate (3D, cube and all array targets).
+ */
+static inline bool
+has_layer_or_depth(unsigned tgsi_tex_instr)
+{
+   switch (tgsi_tex_instr) {
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/*
+ * Return true when the TGSI texture target is addressed by a single
+ * non-array coordinate (buffer, 1D and 1D array targets).
+ */
+static inline bool
+has_1coord(unsigned tgsi_tex_instr)
+{
+   switch (tgsi_tex_instr) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_1D_ARRAY:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/*
+ * Return true when (s, t, r) lies inside the half-open box
+ * [0, width) x [0, height) x [0, depth).
+ */
+static inline bool
+bounds_check(int width, int height, int depth,
+             int s, int t, int r)
+{
+   const bool s_inside = s >= 0 && s < width;
+   const bool t_inside = t >= 0 && t < height;
+   const bool r_inside = r >= 0 && r < depth;
+
+   return s_inside && t_inside && r_inside;
+}
+
+/*
+ * Check whether the TGSI texture target used by the instruction is
+ * compatible with the pipe target of the bound image resource.
+ * Layered resources (3D, cube, arrays) also accept the matching
+ * non-layered target.
+ */
+static inline bool
+has_compat_target(unsigned pipe_target, unsigned tgsi_target)
+{
+   switch (pipe_target) {
+   case PIPE_BUFFER:
+      return tgsi_target == TGSI_TEXTURE_BUFFER;
+   case PIPE_TEXTURE_1D:
+      return tgsi_target == TGSI_TEXTURE_1D;
+   case PIPE_TEXTURE_2D:
+      return tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_RECT:
+      return tgsi_target == TGSI_TEXTURE_RECT;
+   case PIPE_TEXTURE_3D:
+      return tgsi_target == TGSI_TEXTURE_3D ||
+             tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_CUBE:
+      return tgsi_target == TGSI_TEXTURE_CUBE ||
+             tgsi_target == TGSI_TEXTURE_2D;
+   case PIPE_TEXTURE_1D_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_1D ||
+             tgsi_target == TGSI_TEXTURE_1D_ARRAY;
+   case PIPE_TEXTURE_2D_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_2D ||
+             tgsi_target == TGSI_TEXTURE_2D_ARRAY;
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_CUBE ||
+             tgsi_target == TGSI_TEXTURE_CUBE_ARRAY ||
+             tgsi_target == TGSI_TEXTURE_2D;
+   default:
+      /* unknown pipe target: never compatible */
+      return false;
+   }
+}
+
+/*
+ * Compute the addressable width/height/depth of an image view and check
+ * that the access format fits the underlying resource.
+ *
+ * For TGSI_TEXTURE_BUFFER the width is the number of elements in the view
+ * and height and depth are 1; the view is rejected when its byte size in
+ * the access format exceeds the byte size of the resource.
+ *
+ * For textures the width and height are those of the selected mip level,
+ * and the depth is the minified depth for 3D resources or the array size
+ * otherwise; the view is rejected when the access format's block is
+ * larger than the resource format's block.
+ *
+ * Returns false when the view is rejected, true otherwise.
+ */
+static bool
+get_dimensions(const struct pipe_image_view *iview,
+               const struct softpipe_resource *spr,
+               unsigned tgsi_tex_instr,
+               enum pipe_format pformat,
+               unsigned *width,
+               unsigned *height,
+               unsigned *depth)
+{
+   if (tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      *width = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      *height = 1;
+      *depth = 1;
+      /*
+       * Bounds check the buffer size from the view
+       * and the buffer size from the underlying buffer.
+       */
+      if (util_format_get_stride(pformat, *width) >
+          util_format_get_stride(spr->base.format, spr->base.width0))
+         return false;
+   } else {
+      unsigned level;
+
+      level = spr->base.target == PIPE_BUFFER ? 0 : iview->u.tex.level;
+      *width = u_minify(spr->base.width0, level);
+      *height = u_minify(spr->base.height0, level);
+
+      /* base.target is a pipe target, so compare against the PIPE_ enum */
+      if (spr->base.target == PIPE_TEXTURE_3D)
+         *depth = u_minify(spr->base.depth0, level);
+      else
+         *depth = spr->base.array_size;
+
+      /* Make sure the resource and view have compatible formats */
+      if (util_format_get_blocksize(pformat) >
+          util_format_get_blocksize(spr->base.format))
+         return false;
+   }
+   return true;
+}
+
+/*
+ * Pick the s/t/r coordinates of quad entry 'index' according to the
+ * instruction's texture target: t is forced to 0 for single-coordinate
+ * targets, and r is 0 unless the target has a layer or depth, in which
+ * case it comes from t[] for 1D arrays and from r[] otherwise.
+ */
+static void
+fill_coords(const struct tgsi_image_params *params,
+            unsigned index,
+            const int s[TGSI_QUAD_SIZE],
+            const int t[TGSI_QUAD_SIZE],
+            const int r[TGSI_QUAD_SIZE],
+            int *s_coord, int *t_coord, int *r_coord)
+{
+   *s_coord = s[index];
+
+   if (has_1coord(params->tgsi_tex_instr))
+      *t_coord = 0;
+   else
+      *t_coord = t[index];
+
+   if (!has_layer_or_depth(params->tgsi_tex_instr))
+      *r_coord = 0;
+   else if (params->tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY)
+      *r_coord = t[index];   /* the layer of a 1D array is passed in t */
+   else
+      *r_coord = r[index];
+}
+/*
+ * Implement the image LOAD operation.
+ *
+ * For each of the TGSI_QUAD_SIZE quad entries, read the texel addressed by
+ * (s, t, r) from the image bound to params->unit and store its channels
+ * in rgba[channel][entry].  Pure integer formats are stored as raw 32-bit
+ * integers in the float slots.
+ *
+ * Entries whose execmask bit is clear, or whose coordinates are out of
+ * bounds, read back 0, with the fourth channel set to 1 when the format
+ * has fewer than four components.  If the unit is out of range, no
+ * resource is bound, the instruction target is incompatible with the
+ * resource, or the view dimensions are rejected, all of rgba is zeroed.
+ *
+ * The sample argument is not used.
+ */
+static void
+sp_tgsi_load(const struct tgsi_image *image,
+             const struct tgsi_image_params *params,
+             const int s[TGSI_QUAD_SIZE],
+             const int t[TGSI_QUAD_SIZE],
+             const int r[TGSI_QUAD_SIZE],
+             const int sample[TGSI_QUAD_SIZE],
+             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int c, j;
+   char *data_ptr;
+   unsigned offset = 0;
+
+   /* sp_iview[] has PIPE_MAX_SHADER_IMAGES entries; that value itself is out of range */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   /* a rejected view must still produce a defined result */
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero;
+
+   stride = util_format_get_stride(params->format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool fill_zero = false;
+
+      /* inactive entries get the default value instead of a fetch */
+      if (!(params->execmask & (1 << j)))
+         fill_zero = true;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         fill_zero = true;
+
+      if (fill_zero) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            /* a missing fourth channel reads as 1 (integer or float) */
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(params->format)) {
+         int32_t sdata[4];
+
+         util_format_read_4i(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((int32_t *)rgba[c])[j] = sdata[c];
+      } else if (util_format_is_pure_uint(params->format)) {
+         uint32_t sdata[4];
+         util_format_read_4ui(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((uint32_t *)rgba[c])[j] = sdata[c];
+      } else {
+         float sdata[4];
+         util_format_read_4f(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            rgba[c][j] = sdata[c];
+      }
+   }
+   return;
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/*
+ * Implement the image STORE operation.
+ *
+ * For each quad entry whose execmask bit is set and whose coordinates are
+ * in bounds, write rgba[channel][entry] to the texel addressed by
+ * (s, t, r) in the image bound to params->unit.  When params->format is
+ * PIPE_FORMAT_NONE the resource's own format is used.  Pure integer
+ * formats take the raw 32-bit integers held in the float slots.
+ *
+ * The store is silently dropped if the unit is out of range, no resource
+ * is bound, the instruction target is incompatible with the resource, or
+ * the view dimensions are rejected.
+ *
+ * The sample argument is not used.
+ */
+static void
+sp_tgsi_store(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   char *data_ptr;
+   int j, c;
+   unsigned offset = 0;
+   unsigned pformat = params->format;
+
+   /* sp_iview[] has PIPE_MAX_SHADER_IMAGES entries; that value itself is out of range */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      return;
+
+   /* no format on the instruction: fall back to the resource format */
+   if (params->format == PIPE_FORMAT_NONE)
+      pformat = spr->base.format;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       pformat, &width, &height, &depth))
+      return;
+
+   stride = util_format_get_stride(pformat, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+
+      /* inactive entries must not write */
+      if (!(params->execmask & (1 << j)))
+         continue;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      /* out-of-bounds stores are discarded */
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         continue;
+
+      offset = get_image_offset(spr, iview, pformat, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(pformat)) {
+         int32_t sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((int32_t *)rgba[c])[j];
+         util_format_write_4i(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      } else if (util_format_is_pure_uint(pformat)) {
+         uint32_t sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((uint32_t *)rgba[c])[j];
+         util_format_write_4ui(pformat, sdata, 0, data_ptr, stride,
+                               s_coord, t_coord, 1, 1);
+      } else {
+         float sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = rgba[c][j];
+         util_format_write_4f(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      }
+   }
+}
+
+/*
+ * Implement atomic operations on unsigned integers.
+ *
+ * Reads the texel at (s, t) relative to data_ptr as four unsigned values
+ * using params->format.  When just_read is set, the first nc components
+ * (nc = component count of params->format) are copied into rgba[][qi] and
+ * nothing is written back.  Otherwise the operation selected by opcode is
+ * applied per component with rgba[][qi] as the operand (rgba2[][qi] is the
+ * replacement value for ATOMCAS), rgba[][qi] receives the previous texel
+ * value, and the updated texel is written back.
+ *
+ * The iview argument is not referenced here.
+ */
+static void
+handle_op_uint(const struct pipe_image_view *iview,
+               const struct tgsi_image_params *params,
+               bool just_read,
+               char *data_ptr,
+               uint qi,
+               unsigned stride,
+               unsigned opcode,
+               int s,
+               int t,
+               float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+               float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   int nc = util_format_get_nr_components(params->format);
+   unsigned sdata[4];   /* current texel value, modified in place by the op */
+
+   util_format_read_4ui(params->format,
+                        sdata, 0,
+                        data_ptr, stride,
+                        s, t, 1, 1);
+
+   if (just_read) {   /* read back only, no modification and no write */
+      for (c = 0; c < nc; c++) {
+         ((uint32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];   /* previous value, returned to the shader */
+         sdata[c] += ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] = ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned cmp_x = ((uint32_t *)rgba[c])[qi];    /* value to compare against */
+         unsigned src_x = ((uint32_t *)rgba2[c])[qi];   /* value stored on a match */
+         unsigned temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] &= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] |= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] ^= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];   /* signed variants compare the values as int */
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;   /* with asserts disabled the unmodified texel is written back below */
+   }
+   util_format_write_4ui(params->format, sdata, 0, data_ptr, stride,
+                         s, t, 1, 1);
+}
+
+/*
+ * Implement atomic operations on signed integers.
+ *
+ * Reads the texel at (s, t) relative to data_ptr as four signed values
+ * using params->format.  When just_read is set, the first nc components
+ * (nc = component count of params->format) are copied into rgba[][qi] and
+ * nothing is written back.  Otherwise the operation selected by opcode is
+ * applied per component with rgba[][qi] as the operand (rgba2[][qi] is the
+ * replacement value for ATOMCAS), rgba[][qi] receives the previous texel
+ * value, and the updated texel is written back.
+ *
+ * The iview argument is not referenced here.
+ */
+static void
+handle_op_int(const struct pipe_image_view *iview,
+              const struct tgsi_image_params *params,
+              bool just_read,
+              char *data_ptr,
+              uint qi,
+              unsigned stride,
+              unsigned opcode,
+              int s,
+              int t,
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   int nc = util_format_get_nr_components(params->format);
+   int sdata[4];   /* current texel value, modified in place by the op */
+   util_format_read_4i(params->format,
+                       sdata, 0,
+                       data_ptr, stride,
+                       s, t, 1, 1);
+
+   if (just_read) {   /* read back only, no modification and no write */
+      for (c = 0; c < nc; c++) {
+         ((int32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];   /* previous value, returned to the shader */
+         sdata[c] += ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] = ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int cmp_x = ((int32_t *)rgba[c])[qi];    /* value to compare against */
+         int src_x = ((int32_t *)rgba2[c])[qi];   /* value stored on a match */
+         int temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] &= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] |= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] ^= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:   /* NOTE(review): operands are compared as signed int here - confirm intended */
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:   /* NOTE(review): operands are compared as signed int here - confirm intended */
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;   /* with asserts disabled the unmodified texel is written back below */
+   }
+   util_format_write_4i(params->format, sdata, 0, data_ptr, stride,
+                        s, t, 1, 1);
+}
+
+/*
+ * Implement atomic image operations.
+ *
+ * For each quad entry, apply the atomic selected by opcode to the texel
+ * addressed by (s, t, r) in the image bound to params->unit.  rgba holds
+ * the operand on entry and receives the previous texel value; rgba2
+ * holds the second operand used by ATOMCAS.  Entries whose execmask bit
+ * is clear only read the texel back without modifying it.
+ *
+ * Out-of-bounds entries read back 0, with the fourth channel set to 1
+ * when the format has fewer than four components.  If the unit is out of
+ * range, no resource is bound, the instruction target is incompatible
+ * with the resource, or the view dimensions are rejected, all of rgba is
+ * zeroed.
+ *
+ * The sample argument is not used.
+ */
+static void
+sp_tgsi_op(const struct tgsi_image *image,
+           const struct tgsi_image_params *params,
+           unsigned opcode,
+           const int s[TGSI_QUAD_SIZE],
+           const int t[TGSI_QUAD_SIZE],
+           const int r[TGSI_QUAD_SIZE],
+           const int sample[TGSI_QUAD_SIZE],
+           float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+           float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int j, c;
+   unsigned offset;
+   char *data_ptr;
+
+   /*
+    * sp_iview[] has PIPE_MAX_SHADER_IMAGES entries; that value itself is
+    * out of range.  Zero the result like every other failure path.
+    */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero;
+
+   /* NOTE(review): stride uses the resource format while texels are
+    * accessed with params->format - confirm the two always agree here. */
+   stride = util_format_get_stride(spr->base.format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool just_read = false;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord)) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            /* a missing fourth channel reads as 1 (integer or float) */
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+
+      /* just readback the value for atomic if execmask isn't set */
+      if (!(params->execmask & (1 << j))) {
+         just_read = true;
+      }
+
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      /* we should see atomic operations on r32 formats */
+      if (util_format_is_pure_uint(params->format))
+         handle_op_uint(iview, params, just_read, data_ptr, j, stride,
+                        opcode, s_coord, t_coord, rgba, rgba2);
+      else if (util_format_is_pure_sint(params->format))
+         handle_op_int(iview, params, just_read, data_ptr, j, stride,
+                       opcode, s_coord, t_coord, rgba, rgba2);
+      else
+         assert(0);
+   }
+   return;
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/*
+ * Report the dimensions of the image bound to params->unit.
+ *
+ * dims[0] is the width (element count for buffers).  Further entries are
+ * filled according to the instruction's texture target: height, depth
+ * and/or layer count.  For buffers dims[1..3] are set to 0; for texture
+ * targets, entries that do not apply to the target are left untouched.
+ * When the unit is out of range or no resource is bound, dims is not
+ * modified at all.
+ */
+static void
+sp_tgsi_get_dims(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 int dims[4])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   int level;
+
+   /* sp_iview[] has PIPE_MAX_SHADER_IMAGES entries; that value itself is out of range */
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+
+   if (params->tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      dims[0] = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      dims[1] = dims[2] = dims[3] = 0;
+      return;
+   }
+
+   level = iview->u.tex.level;
+   dims[0] = u_minify(spr->base.width0, level);
+   switch (params->tgsi_tex_instr) {
+   case TGSI_TEXTURE_1D_ARRAY:
+      /* the layer count of a 1D array is reported in the second slot */
+      dims[1] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_1D:
+      return;
+   case TGSI_TEXTURE_2D_ARRAY:
+      dims[2] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_RECT:
+      dims[1] = u_minify(spr->base.height0, level);
+      return;
+   case TGSI_TEXTURE_3D:
+      dims[1] = u_minify(spr->base.height0, level);
+      dims[2] = u_minify(spr->base.depth0, level);
+      return;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      dims[1] = u_minify(spr->base.height0, level);
+      /* the view's layers are cube faces: six per cube */
+      dims[2] = (iview->u.tex.last_layer - iview->u.tex.first_layer + 1) / 6;
+      return;
+   default:
+      assert(!"unexpected texture target in sp_get_dims()");
+      return;
+   }
+}
+
+/*
+ * Allocate a zero-initialised sp_tgsi_image and hook up softpipe's image
+ * load/store/atomic/get_dims callbacks.  Returns NULL if the allocation
+ * fails.
+ */
+struct sp_tgsi_image *
+sp_create_tgsi_image(void)
+{
+   struct sp_tgsi_image *img = CALLOC_STRUCT(sp_tgsi_image);
+   if (!img)
+      return NULL;
+
+   img->base.load = sp_tgsi_load;
+   img->base.store = sp_tgsi_store;
+   img->base.op = sp_tgsi_op;
+   img->base.get_dims = sp_tgsi_get_dims;
+   return img;
+}
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SP_IMAGE_H
+#define SP_IMAGE_H
+#include "tgsi/tgsi_exec.h"
+
+struct sp_tgsi_image   /* softpipe image state passed to the TGSI image callbacks */
+{
+   struct tgsi_image base;   /* first member: the callbacks cast struct tgsi_image * back to this struct */
+   struct pipe_image_view sp_iview[PIPE_MAX_SHADER_IMAGES];   /* bound image views, indexed by image unit */
+};
+
+struct sp_tgsi_image *
+sp_create_tgsi_image(void);
+
+#endif
@@ -782,7 +782,7 @@ depth_test_quads_fallback(struct quad_stage *qs,
 {
   unsigned i, pass = 0;
   const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
-   boolean interp_depth = !fsInfo->writes_z;
+   boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
   boolean shader_stencil_ref = fsInfo->writes_stencil;
   struct depth_data data;
   unsigned vp_idx = quads[0]->input.viewport_index;
@@ -902,7 +902,7 @@ choose_depth_test(struct quad_stage *qs,
 {
   const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;

-   boolean interp_depth = !fsInfo->writes_z;
+   boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;

   boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;

@@ -80,7 +80,7 @@ shade_quad(struct quad_stage *qs, struct quad_header *quad)

   /* run shader */
   machine->flatshade_color = softpipe->rasterizer->flatshade ? TRUE : FALSE;
-   return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad );
+   return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad, softpipe->early_depth );
 }


@@ -43,15 +43,17 @@ void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
   boolean early_depth_test =
-      sp->depth_stencil->depth.enabled &&
+      (sp->depth_stencil->depth.enabled &&
      sp->framebuffer.zsbuf &&
      !sp->depth_stencil->alpha.enabled &&
      !sp->fs_variant->info.uses_kill &&
      !sp->fs_variant->info.writes_z &&
-      !sp->fs_variant->info.writes_stencil;
+       !sp->fs_variant->info.writes_stencil) ||
+      sp->fs_variant->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL];

   sp->quad.first = sp->quad.blend;

+   sp->early_depth = early_depth_test;
   if (early_depth_test) {
      insert_stage_at_head( sp, sp->quad.shade );
      insert_stage_at_head( sp, sp->quad.depth_test );
@@ -56,6 +56,7 @@


 struct tgsi_sampler;
+struct tgsi_image;
 struct tgsi_exec_machine;
 struct vertex_info;

@@ -81,11 +82,13 @@ struct sp_fragment_shader_variant

   void (*prepare)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
-		   struct tgsi_sampler *sampler);
+		   struct tgsi_sampler *sampler,
+		   struct tgsi_image *image);

   unsigned (*run)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
-		   struct quad_header *quad);
+		   struct quad_header *quad,
+		   bool early_depth_test);

   /* Deletes this instance of the object */
   void (*delete)(struct sp_fragment_shader_variant *shader,
@@ -148,6 +151,9 @@ softpipe_init_streamout_funcs(struct pipe_context *pipe);
 void
 softpipe_init_vertex_funcs(struct pipe_context *pipe);

+void
+softpipe_init_image_funcs(struct pipe_context *pipe);
+
 void
 softpipe_set_framebuffer_state(struct pipe_context *,
                               const struct pipe_framebuffer_state *);
@@ -343,7 +343,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim)
      softpipe->fs_variant->prepare(softpipe->fs_variant, 
                                    softpipe->fs_machine,
                                    (struct tgsi_sampler *) softpipe->
-                                    tgsi.sampler[PIPE_SHADER_FRAGMENT]);
+                                    tgsi.sampler[PIPE_SHADER_FRAGMENT],
+                                    (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]);
   }
   else {
      softpipe->fs_variant = NULL;
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_image.h"
+
+/**
+ * Bind \p num image views for stage \p shader starting at slot \p start.
+ * When \p images is NULL the slots are unbound instead: their resource
+ * references are dropped and the views are zeroed.
+ */
+static void softpipe_set_shader_images(struct pipe_context *pipe,
+                                       unsigned shader,
+                                       unsigned start,
+                                       unsigned num,
+                                       struct pipe_image_view *images)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned i;
+   assert(shader < PIPE_SHADER_TYPES);
+   /* Bound the range by the image slots written below, not the samplers. */
+   assert(start + num <= Elements(softpipe->tgsi.image[shader]->sp_iview));
+
+   for (i = 0; i < num; i++) {
+      struct pipe_image_view *slot =
+         &softpipe->tgsi.image[shader]->sp_iview[start + i];
+
+      /* Swap the resource reference first, then overwrite the whole slot. */
+      pipe_resource_reference(&slot->resource,
+                              images ? images[i].resource : NULL);
+      if (images)
+         *slot = images[i];
+      else
+         memset(slot, 0, sizeof(struct pipe_image_view));
+   }
+}
+
+/* Install the shader-image state setter on the given context. */
+void
+softpipe_init_image_funcs(struct pipe_context *pipe)
+{
+   pipe->set_shader_images = softpipe_set_shader_images;
+}
@@ -270,9 +270,9 @@ softpipe_resource_get_handle(struct pipe_screen *screen,
 * Helper function to compute offset (in bytes) for a particular
 * texture level/face/slice from the start of the buffer.
 */
-static unsigned
-sp_get_tex_image_offset(const struct softpipe_resource *spr,
-                        unsigned level, unsigned layer)
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+                              unsigned level, unsigned layer)
 {
   unsigned offset = spr->level_offset[level];

@@ -422,7 +422,7 @@ softpipe_transfer_map(struct pipe_context *pipe,
   pt->stride = spr->stride[level];
   pt->layer_stride = spr->img_stride[level];

-   spt->offset = sp_get_tex_image_offset(spr, level, box->z);
+   spt->offset = softpipe_get_tex_image_offset(spr, level, box->z);

   spt->offset +=
         box->y / util_format_get_blockheight(format) * spt->base.stride +
@@ -116,5 +116,7 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
 extern void
 softpipe_init_texture_funcs(struct pipe_context *pipe);

-
+/*
+ * Compute the offset (in bytes) from the start of the buffer for a
+ * particular texture level/face/slice.
+ */
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+                              unsigned level, unsigned layer);
 #endif /* SP_TEXTURE */
@@ -50,15 +50,6 @@
 */
 static char err_buf[128];

-#if 0
-static void
-svga_destroy_shader_emitter(struct svga_shader_emitter *emit)
-{
-   if (emit->buf != err_buf)
-      FREE(emit->buf);
-}
-#endif
-

 static boolean
 svga_shader_expand(struct svga_shader_emitter *emit)
@@ -265,6 +256,7 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,

 fail:
   FREE(variant);
-   FREE(emit.buf);
+   if (emit.buf != err_buf)
+      FREE(emit.buf);
   return NULL;
 }
@@ -535,7 +535,6 @@ svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx)

 static boolean
 ps30_sampler( struct svga_shader_emitter *emit,
-              struct tgsi_declaration_semantic semantic,
              unsigned idx )
 {
   SVGA3DOpDclArgs dcl;
@@ -553,6 +552,17 @@ ps30_sampler( struct svga_shader_emitter *emit,
           svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
 }

+/**
+ * Emit a ps30 sampler declaration for every sampler unit counted in
+ * emit->num_samplers.
+ *
+ * \return FALSE as soon as one declaration fails to emit, TRUE otherwise.
+ */
+boolean
+svga_shader_emit_samplers_decl( struct svga_shader_emitter *emit )
+{
+   unsigned unit = 0;
+
+   while (unit < emit->num_samplers) {
+      if (!ps30_sampler(emit, unit))
+         return FALSE;
+      unit++;
+   }
+
+   return TRUE;
+}

 boolean
 svga_translate_decl_sm30( struct svga_shader_emitter *emit,
@@ -563,12 +573,15 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit,
   unsigned idx;

   for( idx = first; idx <= last; idx++ ) {
-      boolean ok;
+      boolean ok = TRUE;

      switch (decl->Declaration.File) {
      case TGSI_FILE_SAMPLER:
         assert (emit->unit == PIPE_SHADER_FRAGMENT);
-         ok = ps30_sampler( emit, decl->Semantic, idx );
+         /* Just keep track of the number of samplers here; the
+          * declarations are emitted by svga_shader_emit_samplers_decl().
+          */
+         emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
         break;

      case TGSI_FILE_INPUT:
@@ -137,6 +137,7 @@ struct svga_shader_emitter

   unsigned pstipple_sampler_unit;

+   int num_samplers;
   uint8_t sampler_target[PIPE_MAX_SAMPLERS];
 };

@@ -156,6 +157,9 @@ boolean
 svga_shader_emit_instructions(struct svga_shader_emitter *emit,
                              const struct tgsi_token *tokens);

+/*
+ * Emit a sampler declaration for each of the emit->num_samplers sampler
+ * units; returns FALSE if any declaration fails to emit.
+ */
+boolean
+svga_shader_emit_samplers_decl(struct svga_shader_emitter *emit);
+
 boolean
 svga_translate_decl_sm30(struct svga_shader_emitter *emit,
                         const struct tgsi_full_declaration *decl);
--- a/Show More
+++ b/Show More