From 78f19194295e3ea3cf219c15bf67dcab5dea4a34 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 15 May 2015 09:21:23 -0700
Subject: [PATCH 001/197] nir: Add explicitly sized types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: Fix size/type mask to properly handle 8-bit types.

v3: Add helpers to get the bitsize and base type of a
nir_alu_type enum.

Signed-off-by: Juan A. Suarez Romero <jasuarez@igalia.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/nir.h | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 00f107d4243..418682f2caf 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -613,9 +613,36 @@ typedef enum {
    nir_type_float,
    nir_type_int,
    nir_type_uint,
-   nir_type_bool
+   nir_type_bool,
+   nir_type_bool32 =    32 | nir_type_bool,
+   nir_type_int8 =      8  | nir_type_int,
+   nir_type_int16 =     16 | nir_type_int,
+   nir_type_int32 =     32 | nir_type_int,
+   nir_type_int64 =     64 | nir_type_int,
+   nir_type_uint8 =     8  | nir_type_uint,
+   nir_type_uint16 =    16 | nir_type_uint,
+   nir_type_uint32 =    32 | nir_type_uint,
+   nir_type_uint64 =    64 | nir_type_uint,
+   nir_type_float16 =   16 | nir_type_float,
+   nir_type_float32 =   32 | nir_type_float,
+   nir_type_float64 =   64 | nir_type_float,
 } nir_alu_type;
 
+#define NIR_ALU_TYPE_SIZE_MASK 0xfffffff8
+#define NIR_ALU_TYPE_BASE_TYPE_MASK 0x00000007
+
+static inline unsigned
+nir_alu_type_get_type_size(nir_alu_type type)
+{
+   return type & NIR_ALU_TYPE_SIZE_MASK;
+}
+
+static inline unsigned
+nir_alu_type_get_base_type(nir_alu_type type)
+{
+   return type & NIR_ALU_TYPE_BASE_TYPE_MASK;
+}
+
 typedef enum {
    NIR_OP_IS_COMMUTATIVE = (1 << 0),
    NIR_OP_IS_ASSOCIATIVE = (1 << 1),

From c38a25af2f47c6de093c9b0ac0ccdfa9fe48007e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20Iglesias=20Gons=C3=A1lvez?= <siglesias@igalia.com>
Date: Wed, 24 Feb 2016 10:51:22 +0100
Subject: [PATCH 002/197] i965/nir: fix check to resolve booleans to work with
 sized nir_alu_type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As nir_alu_type now embeds the data size, the check for the
instruction's output type (to see if a boolean resolve is required)
should ignore the data size part.

Signed-off-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index 56e15ef935f..12ac853478a 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -165,7 +165,7 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
          }
 
          default:
-            if (nir_op_infos[alu->op].output_type == nir_type_bool) {
+            if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
                /* This instructions will turn into a CMP when we actually emit
                 * them so the result will have to be resolved before it can be
                 * used.

From 3d37de930d04da1d067b40593b55fc248eaf7b3b Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Fri, 14 Aug 2015 10:36:15 -0700
Subject: [PATCH 003/197] nir/types: add a function to get the bitsize of a
 base type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: fix it for GLSL_TYPE_SUBROUTINE (Iago)

Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir_types.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/compiler/nir_types.h b/src/compiler/nir_types.h
index 18d64b768d4..07487838b50 100644
--- a/src/compiler/nir_types.h
+++ b/src/compiler/nir_types.h
@@ -77,6 +77,27 @@ enum glsl_base_type glsl_get_sampler_result_type(const struct glsl_type *type);
 unsigned glsl_get_record_location_offset(const struct glsl_type *type,
                                          unsigned length);
 
+static inline unsigned
+glsl_get_bit_size(enum glsl_base_type type)
+{
+   switch (type) {
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_FLOAT: /* TODO handle mediump */
+   case GLSL_TYPE_SUBROUTINE:
+      return 32;
+
+   case GLSL_TYPE_DOUBLE:
+      return 64;
+
+   default:
+      unreachable("unknown base type");
+   }
+
+   return 0;
+}
+
 bool glsl_type_is_void(const struct glsl_type *type);
 bool glsl_type_is_error(const struct glsl_type *type);
 bool glsl_type_is_vector(const struct glsl_type *type);

From e172dbe5d2e9a4effd92823cd8ebc342e0928a36 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 17 Nov 2015 15:45:18 +0100
Subject: [PATCH 004/197] nir: Add a bit_size to nir_register and nir_ssa_def
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This really hacky commit adds a bit size to registers and SSA values.  It
also adds rules in the validator to validate that they do the right things.

It's still an open question as to whether or not we want a bit_size in
nir_alu_instr or if we just want to let it inherit from the destination.
I'm inclined to just let it inherit from the destination.  A similar
question needs to be asked about intrinsics.

v2 (Connor):
  - Relax validation: comparisons have explicit destination sizes
    and implicit source sizes.

v3 (Sam):
- Use helpers to get size and base types of nir_alu_type enum.

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/nir.c          |  2 ++
 src/compiler/nir/nir.h          |  6 +++++
 src/compiler/nir/nir_validate.c | 42 +++++++++++++++++++++++++++++----
 3 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 655dc884382..d07550a6b03 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -70,6 +70,7 @@ reg_create(void *mem_ctx, struct exec_list *list)
    list_inithead(&reg->if_uses);
 
    reg->num_components = 0;
+   reg->bit_size = 32;
    reg->num_array_elems = 0;
    reg->is_packed = false;
    reg->name = NULL;
@@ -1325,6 +1326,7 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
    list_inithead(&def->uses);
    list_inithead(&def->if_uses);
    def->num_components = num_components;
+   def->bit_size = 32; /* FIXME: Add an input paremeter or guess? */
 
    if (instr->block) {
       nir_function_impl *impl =
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 418682f2caf..8f411793d9d 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -345,6 +345,9 @@ typedef struct nir_register {
    unsigned num_components; /** < number of vector components */
    unsigned num_array_elems; /** < size of array (0 for no array) */
 
+   /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
+   uint8_t bit_size;
+
    /** generic register index. */
    unsigned index;
 
@@ -452,6 +455,9 @@ typedef struct nir_ssa_def {
    struct list_head if_uses;
 
    uint8_t num_components;
+
+   /* The bit-size of each channel; must be one of 8, 16, 32, or 64 */
+   uint8_t bit_size;
 } nir_ssa_def;
 
 struct nir_src;
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 0c32d5fe07a..9f18d1c33e4 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -179,9 +179,12 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
    nir_alu_src *src = &instr->src[index];
 
    unsigned num_components;
-   if (src->src.is_ssa)
+   unsigned src_bit_size;
+   if (src->src.is_ssa) {
+      src_bit_size = src->src.ssa->bit_size;
       num_components = src->src.ssa->num_components;
-   else {
+   } else {
+      src_bit_size = src->src.reg.reg->bit_size;
       if (src->src.reg.reg->is_packed)
          num_components = 4; /* can't check anything */
       else
@@ -194,6 +197,24 @@ validate_alu_src(nir_alu_instr *instr, unsigned index, validate_state *state)
          assert(src->swizzle[i] < num_components);
    }
 
+   nir_alu_type src_type = nir_op_infos[instr->op].input_types[index];
+
+   /* 8-bit float isn't a thing */
+   if (nir_alu_type_get_base_type(src_type) == nir_type_float)
+      assert(src_bit_size == 16 || src_bit_size == 32 || src_bit_size == 64);
+
+   if (nir_alu_type_get_type_size(src_type)) {
+      /* This source has an explicit bit size */
+      assert(nir_alu_type_get_type_size(src_type) == src_bit_size);
+   } else {
+      if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type)) {
+         unsigned dest_bit_size =
+            instr->dest.dest.is_ssa ? instr->dest.dest.ssa.bit_size
+                                    : instr->dest.dest.reg.reg->bit_size;
+         assert(dest_bit_size == src_bit_size);
+      }
+   }
+
    validate_src(&src->src, state);
 }
 
@@ -263,8 +284,10 @@ validate_dest(nir_dest *dest, validate_state *state)
 }
 
 static void
-validate_alu_dest(nir_alu_dest *dest, validate_state *state)
+validate_alu_dest(nir_alu_instr *instr, validate_state *state)
 {
+   nir_alu_dest *dest = &instr->dest;
+
    unsigned dest_size =
       dest->dest.is_ssa ? dest->dest.ssa.num_components
                         : dest->dest.reg.reg->num_components;
@@ -282,6 +305,17 @@ validate_alu_dest(nir_alu_dest *dest, validate_state *state)
    assert(nir_op_infos[alu->op].output_type == nir_type_float ||
           !dest->saturate);
 
+   unsigned bit_size = dest->dest.is_ssa ? dest->dest.ssa.bit_size
+                                         : dest->dest.reg.reg->bit_size;
+   nir_alu_type type = nir_op_infos[instr->op].output_type;
+
+   /* 8-bit float isn't a thing */
+   if (nir_alu_type_get_base_type(type) == nir_type_float)
+      assert(bit_size == 16 || bit_size == 32 || bit_size == 64);
+
+   assert(nir_alu_type_get_type_size(type) == 0 ||
+          nir_alu_type_get_type_size(type) == bit_size);
+
    validate_dest(&dest->dest, state);
 }
 
@@ -294,7 +328,7 @@ validate_alu_instr(nir_alu_instr *instr, validate_state *state)
       validate_alu_src(instr, i, state);
    }
 
-   validate_alu_dest(&instr->dest, state);
+   validate_alu_dest(instr, state);
 }
 
 static void

From 6700d7e423bb2d7c5f0b46740bd92b5e65679eaf Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Fri, 14 Aug 2015 10:18:39 -0700
Subject: [PATCH 005/197] nir: add nir_{src,dest}_bit_size() helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: use a ternary (Jason)

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/nir.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 8f411793d9d..6413f438ee3 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -564,6 +564,18 @@ nir_dest_for_reg(nir_register *reg)
    return dest;
 }
 
+static inline unsigned
+nir_src_bit_size(nir_src src)
+{
+   return src.is_ssa ? src.ssa->bit_size : src.reg.reg->bit_size;
+}
+
+static inline unsigned
+nir_dest_bit_size(nir_dest dest)
+{
+   return dest.is_ssa ? dest.ssa.bit_size : dest.reg.reg->bit_size;
+}
+
 void nir_src_copy(nir_src *dest, const nir_src *src, void *instr_or_if);
 void nir_dest_copy(nir_dest *dest, const nir_dest *src, nir_instr *instr);
 

From 9076c4e289de0debf1fb2a7237bdeb9c11002347 Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Fri, 14 Aug 2015 10:45:06 -0700
Subject: [PATCH 006/197] nir: update opcode definitions for different bit
 sizes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some opcodes need explicit bitsizes, and sometimes we need to use the
double version when constant folding.

v2: fix output type for u2f (Iago)

v3: do not change vecN opcodes to be float. The next commit will add
    infrastructure to enable 64-bit integer constant folding so this isn't
    really necessary. Also, that created problems with source modifiers in
    some cases (Iago)

v4 (Jason):
  - do not change bcsel to work in terms of floats
  - leave ldexp generic

Squashed changes to handle different bit sizes when constant
folding since otherwise we would break the build.

v2:
- Use the bit-size information from the opcode information if defined (Iago)
- Use helpers to get type size and base type of nir_alu_type enum (Sam)
- Do not fall back to sized types to guess bit-size information. (Jason)

Squashed changes in i965 and gallium/nir drivers to support sized types.
These functions should only see sized types, but we can't make that change
until we make sure that nir uses the sized versions in all the relevant places.
A later commit will address this.

Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Signed-off-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/nir.h                       |   4 +
 src/compiler/nir/nir_constant_expressions.h  |   2 +-
 src/compiler/nir/nir_constant_expressions.py | 244 ++++++++++++-------
 src/compiler/nir/nir_opcodes.py              | 138 ++++++-----
 src/compiler/nir/nir_opt_constant_folding.c  |  29 ++-
 src/gallium/drivers/vc4/vc4_program.c        |   4 +-
 src/mesa/drivers/dri/i965/brw_nir.c          |  18 ++
 7 files changed, 282 insertions(+), 157 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 6413f438ee3..824f4e20706 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -101,6 +101,7 @@ union nir_constant_data {
    int i[16];
    float f[16];
    bool b[16];
+   double d[16];
 };
 
 typedef struct nir_constant {
@@ -1209,8 +1210,11 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type)
 typedef struct {
    union {
       float f[4];
+      double d[4];
       int32_t i[4];
       uint32_t u[4];
+      int64_t l[4];
+      uint64_t ul[4];
    };
 } nir_const_value;
 
diff --git a/src/compiler/nir/nir_constant_expressions.h b/src/compiler/nir/nir_constant_expressions.h
index 97997f2e514..201f278c71c 100644
--- a/src/compiler/nir/nir_constant_expressions.h
+++ b/src/compiler/nir/nir_constant_expressions.h
@@ -28,4 +28,4 @@
 #include "nir.h"
 
 nir_const_value nir_eval_const_opcode(nir_op op, unsigned num_components,
-                                      nir_const_value *src);
+                                      unsigned bit_size, nir_const_value *src);
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index 32784f6398d..972d2819af9 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -1,4 +1,43 @@
 #! /usr/bin/python2
+
+def type_has_size(type_):
+    return type_[-1:].isdigit()
+
+def type_sizes(type_):
+    if type_.endswith("8"):
+        return [8]
+    elif type_.endswith("16"):
+        return [16]
+    elif type_.endswith("32"):
+        return [32]
+    elif type_.endswith("64"):
+        return [64]
+    else:
+        return [32, 64]
+
+def type_add_size(type_, size):
+    if type_has_size(type_):
+        return type_
+    return type_ + str(size)
+
+def get_const_field(type_):
+    if type_ == "int32":
+        return "i"
+    if type_ == "uint32":
+        return "u"
+    if type_ == "int64":
+        return "l"
+    if type_ == "uint64":
+        return "ul"
+    if type_ == "bool32":
+        return "b"
+    if type_ == "float32":
+        return "f"
+    if type_ == "float64":
+        return "d"
+    raise Exception(str(type_))
+    assert(0)
+
 template = """\
 /*
  * Copyright (C) 2014 Intel Corporation
@@ -205,110 +244,140 @@ unpack_half_1x16(uint16_t u)
 }
 
 /* Some typed vector structures to make things like src0.y work */
-% for type in ["float", "int", "uint", "bool"]:
-struct ${type}_vec {
-   ${type} x;
-   ${type} y;
-   ${type} z;
-   ${type} w;
+typedef float float32_t;
+typedef double float64_t;
+typedef bool bool32_t;
+% for type in ["float", "int", "uint"]:
+% for width in [32, 64]:
+struct ${type}${width}_vec {
+   ${type}${width}_t x;
+   ${type}${width}_t y;
+   ${type}${width}_t z;
+   ${type}${width}_t w;
 };
 % endfor
+% endfor
+
+struct bool32_vec {
+    bool x;
+    bool y;
+    bool z;
+    bool w;
+};
 
 % for name, op in sorted(opcodes.iteritems()):
 static nir_const_value
-evaluate_${name}(unsigned num_components, nir_const_value *_src)
+evaluate_${name}(unsigned num_components, unsigned bit_size,
+                 nir_const_value *_src)
 {
    nir_const_value _dst_val = { { {0, 0, 0, 0} } };
 
-   ## For each non-per-component input, create a variable srcN that
-   ## contains x, y, z, and w elements which are filled in with the
-   ## appropriately-typed values.
-   % for j in range(op.num_inputs):
-      % if op.input_sizes[j] == 0:
-         <% continue %>
-      % elif "src" + str(j) not in op.const_expr:
-         ## Avoid unused variable warnings
-         <% continue %>
-      %endif
+   switch (bit_size) {
+   % for bit_size in [32, 64]:
+   case ${bit_size}: {
+      <%
+      output_type = type_add_size(op.output_type, bit_size)
+      input_types = [type_add_size(type_, bit_size) for type_ in op.input_types]
+      %>
 
-      struct ${op.input_types[j]}_vec src${j} = {
-      % for k in range(op.input_sizes[j]):
-         % if op.input_types[j] == "bool":
-            _src[${j}].u[${k}] != 0,
-         % else:
-            _src[${j}].${op.input_types[j][:1]}[${k}],
-         % endif
-      % endfor
-      };
-   % endfor
+      ## For each non-per-component input, create a variable srcN that
+      ## contains x, y, z, and w elements which are filled in with the
+      ## appropriately-typed values.
+      % for j in range(op.num_inputs):
+         % if op.input_sizes[j] == 0:
+            <% continue %>
+         % elif "src" + str(j) not in op.const_expr:
+            ## Avoid unused variable warnings
+            <% continue %>
+         %endif
 
-   % if op.output_size == 0:
-      ## For per-component instructions, we need to iterate over the
-      ## components and apply the constant expression one component
-      ## at a time.
-      for (unsigned _i = 0; _i < num_components; _i++) {
-         ## For each per-component input, create a variable srcN that
-         ## contains the value of the current (_i'th) component.
-         % for j in range(op.num_inputs):
-            % if op.input_sizes[j] != 0:
-               <% continue %>
-            % elif "src" + str(j) not in op.const_expr:
-               ## Avoid unused variable warnings
-               <% continue %>
-            % elif op.input_types[j] == "bool":
-               bool src${j} = _src[${j}].u[_i] != 0;
+         struct ${input_types[j]}_vec src${j} = {
+         % for k in range(op.input_sizes[j]):
+            % if input_types[j] == "bool32":
+               _src[${j}].u[${k}] != 0,
             % else:
-               ${op.input_types[j]} src${j} = _src[${j}].${op.input_types[j][:1]}[_i];
+               _src[${j}].${get_const_field(input_types[j])}[${k}],
             % endif
          % endfor
+         };
+      % endfor
+
+      % if op.output_size == 0:
+         ## For per-component instructions, we need to iterate over the
+         ## components and apply the constant expression one component
+         ## at a time.
+         for (unsigned _i = 0; _i < num_components; _i++) {
+            ## For each per-component input, create a variable srcN that
+            ## contains the value of the current (_i'th) component.
+            % for j in range(op.num_inputs):
+               % if op.input_sizes[j] != 0:
+                  <% continue %>
+               % elif "src" + str(j) not in op.const_expr:
+                  ## Avoid unused variable warnings
+                  <% continue %>
+               % elif input_types[j] == "bool32":
+                  bool src${j} = _src[${j}].u[_i] != 0;
+               % else:
+                  ${input_types[j]}_t src${j} =
+                     _src[${j}].${get_const_field(input_types[j])}[_i];
+               % endif
+            % endfor
+
+            ## Create an appropriately-typed variable dst and assign the
+            ## result of the const_expr to it.  If const_expr already contains
+            ## writes to dst, just include const_expr directly.
+            % if "dst" in op.const_expr:
+               ${output_type}_t dst;
+               ${op.const_expr}
+            % else:
+               ${output_type}_t dst = ${op.const_expr};
+            % endif
+
+            ## Store the current component of the actual destination to the
+            ## value of dst.
+            % if output_type == "bool32":
+               ## Sanitize the C value to a proper NIR bool
+               _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
+            % else:
+               _dst_val.${get_const_field(output_type)}[_i] = dst;
+            % endif
+         }
+      % else:
+         ## In the non-per-component case, create a struct dst with
+         ## appropriately-typed elements x, y, z, and w and assign the result
+         ## of the const_expr to all components of dst, or include the
+         ## const_expr directly if it writes to dst already.
+         struct ${output_type}_vec dst;
 
-         ## Create an appropriately-typed variable dst and assign the
-         ## result of the const_expr to it.  If const_expr already contains
-         ## writes to dst, just include const_expr directly.
          % if "dst" in op.const_expr:
-            ${op.output_type} dst;
             ${op.const_expr}
          % else:
-            ${op.output_type} dst = ${op.const_expr};
+            ## Splat the value to all components.  This way expressions which
+            ## write the same value to all components don't need to explicitly
+            ## write to dest.  One such example is fnoise which has a
+            ## const_expr of 0.0f.
+            dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
          % endif
 
-         ## Store the current component of the actual destination to the
-         ## value of dst.
-         % if op.output_type == "bool":
-            ## Sanitize the C value to a proper NIR bool
-            _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
-         % else:
-            _dst_val.${op.output_type[:1]}[_i] = dst;
-         % endif
-      }
-   % else:
-      ## In the non-per-component case, create a struct dst with
-      ## appropriately-typed elements x, y, z, and w and assign the result
-      ## of the const_expr to all components of dst, or include the
-      ## const_expr directly if it writes to dst already.
-      struct ${op.output_type}_vec dst;
-
-      % if "dst" in op.const_expr:
-         ${op.const_expr}
-      % else:
-         ## Splat the value to all components.  This way expressions which
-         ## write the same value to all components don't need to explicitly
-         ## write to dest.  One such example is fnoise which has a
-         ## const_expr of 0.0f.
-         dst.x = dst.y = dst.z = dst.w = ${op.const_expr};
+         ## For each component in the destination, copy the value of dst to
+         ## the actual destination.
+         % for k in range(op.output_size):
+            % if output_type == "bool32":
+               ## Sanitize the C value to a proper NIR bool
+               _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
+            % else:
+               _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]};
+            % endif
+         % endfor
       % endif
 
-      ## For each component in the destination, copy the value of dst to
-      ## the actual destination.
-      % for k in range(op.output_size):
-         % if op.output_type == "bool":
-            ## Sanitize the C value to a proper NIR bool
-            _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
-         % else:
-            _dst_val.${op.output_type[:1]}[${k}] = dst.${"xyzw"[k]};
-         % endif
-      % endfor
-   % endif
+      break;
+   }
+   % endfor
+
+   default:
+      unreachable("unknown bit width");
+   }
 
    return _dst_val;
 }
@@ -316,12 +385,12 @@ evaluate_${name}(unsigned num_components, nir_const_value *_src)
 
 nir_const_value
 nir_eval_const_opcode(nir_op op, unsigned num_components,
-                      nir_const_value *src)
+                      unsigned bit_width, nir_const_value *src)
 {
    switch (op) {
 % for name in sorted(opcodes.iterkeys()):
    case nir_op_${name}: {
-      return evaluate_${name}(num_components, src);
+      return evaluate_${name}(num_components, bit_width, src);
       break;
    }
 % endfor
@@ -333,4 +402,7 @@ nir_eval_const_opcode(nir_op op, unsigned num_components,
 from nir_opcodes import opcodes
 from mako.template import Template
 
-print Template(template).render(opcodes=opcodes)
+print Template(template).render(opcodes=opcodes, type_sizes=type_sizes,
+                                type_has_size=type_has_size,
+                                type_add_size=type_add_size,
+                                get_const_field=get_const_field)
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index a37fe2dc060..553f924afc5 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -90,8 +90,12 @@ class Opcode(object):
 # helper variables for strings
 tfloat = "float"
 tint = "int"
-tbool = "bool"
+tbool = "bool32"
 tuint = "uint"
+tfloat32 = "float32"
+tint32 = "int32"
+tuint32 = "uint32"
+tfloat64 = "float64"
 
 commutative = "commutative "
 associative = "associative "
@@ -155,56 +159,56 @@ unop("frsq", tfloat, "1.0f / sqrtf(src0)")
 unop("fsqrt", tfloat, "sqrtf(src0)")
 unop("fexp2", tfloat, "exp2f(src0)")
 unop("flog2", tfloat, "log2f(src0)")
-unop_convert("f2i", tint, tfloat, "src0") # Float-to-integer conversion.
-unop_convert("f2u", tuint, tfloat, "src0") # Float-to-unsigned conversion
-unop_convert("i2f", tfloat, tint, "src0") # Integer-to-float conversion.
+unop_convert("f2i", tint32, tfloat32, "src0") # Float-to-integer conversion.
+unop_convert("f2u", tuint32, tfloat32, "src0") # Float-to-unsigned conversion
+unop_convert("i2f", tfloat32, tint32, "src0") # Integer-to-float conversion.
 # Float-to-boolean conversion
-unop_convert("f2b", tbool, tfloat, "src0 != 0.0f")
+unop_convert("f2b", tbool, tfloat32, "src0 != 0.0f")
 # Boolean-to-float conversion
-unop_convert("b2f", tfloat, tbool, "src0 ? 1.0f : 0.0f")
+unop_convert("b2f", tfloat32, tbool, "src0 ? 1.0f : 0.0f")
 # Int-to-boolean conversion
-unop_convert("i2b", tbool, tint, "src0 != 0")
-unop_convert("b2i", tint, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
-unop_convert("u2f", tfloat, tuint, "src0") # Unsigned-to-float conversion.
+unop_convert("i2b", tbool, tint32, "src0 != 0")
+unop_convert("b2i", tint32, tbool, "src0 ? 1 : 0") # Boolean-to-int conversion
+unop_convert("u2f", tfloat32, tuint32, "src0") # Unsigned-to-float conversion.
 
 # Unary floating-point rounding operations.
 
 
-unop("ftrunc", tfloat, "truncf(src0)")
-unop("fceil", tfloat, "ceilf(src0)")
-unop("ffloor", tfloat, "floorf(src0)")
-unop("ffract", tfloat, "src0 - floorf(src0)")
-unop("fround_even", tfloat, "_mesa_roundevenf(src0)")
+unop("ftrunc", tfloat, "bit_size == 64 ? trunc(src0) : truncf(src0)")
+unop("fceil", tfloat, "bit_size == 64 ? ceil(src0) : ceilf(src0)")
+unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
+unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
+unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 
 
 # Trigonometric operations.
 
 
-unop("fsin", tfloat, "sinf(src0)")
-unop("fcos", tfloat, "cosf(src0)")
+unop("fsin", tfloat, "bit_size == 64 ? sin(src0) : sinf(src0)")
+unop("fcos", tfloat, "bit_size == 64 ? cos(src0) : cosf(src0)")
 
 
 # Partial derivatives.
 
 
-unop("fddx", tfloat, "0.0f") # the derivative of a constant is 0.
-unop("fddy", tfloat, "0.0f")
-unop("fddx_fine", tfloat, "0.0f")
-unop("fddy_fine", tfloat, "0.0f")
-unop("fddx_coarse", tfloat, "0.0f")
-unop("fddy_coarse", tfloat, "0.0f")
+unop("fddx", tfloat, "0.0") # the derivative of a constant is 0.
+unop("fddy", tfloat, "0.0")
+unop("fddx_fine", tfloat, "0.0")
+unop("fddy_fine", tfloat, "0.0")
+unop("fddx_coarse", tfloat, "0.0")
+unop("fddy_coarse", tfloat, "0.0")
 
 
 # Floating point pack and unpack operations.
 
 def pack_2x16(fmt):
-   unop_horiz("pack_" + fmt + "_2x16", 1, tuint, 2, tfloat, """
+   unop_horiz("pack_" + fmt + "_2x16", 1, tuint32, 2, tfloat32, """
 dst.x = (uint32_t) pack_fmt_1x16(src0.x);
 dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
 """.replace("fmt", fmt))
 
 def pack_4x8(fmt):
-   unop_horiz("pack_" + fmt + "_4x8", 1, tuint, 4, tfloat, """
+   unop_horiz("pack_" + fmt + "_4x8", 1, tuint32, 4, tfloat32, """
 dst.x = (uint32_t) pack_fmt_1x8(src0.x);
 dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
 dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
@@ -212,13 +216,13 @@ dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
 """.replace("fmt", fmt))
 
 def unpack_2x16(fmt):
-   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat, 1, tuint, """
+   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
 dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
 dst.y = unpack_fmt_1x16((uint16_t)(src0.x << 16));
 """.replace("fmt", fmt))
 
 def unpack_4x8(fmt):
-   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat, 1, tuint, """
+   unop_horiz("unpack_" + fmt + "_4x8", 4, tfloat32, 1, tuint32, """
 dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
 dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
 dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
@@ -237,11 +241,11 @@ unpack_2x16("unorm")
 unpack_4x8("unorm")
 unpack_2x16("half")
 
-unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
+unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
 dst.x = (src0.x & 0xffff) | (src0.y >> 16);
 """)
 
-unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
+unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
 dst.x = (src0.x <<  0) |
         (src0.y <<  8) |
         (src0.z << 16) |
@@ -251,22 +255,22 @@ dst.x = (src0.x <<  0) |
 # Lowered floating point unpacking operations.
 
 
-unop_horiz("unpack_half_2x16_split_x", 1, tfloat, 1, tuint,
+unop_horiz("unpack_half_2x16_split_x", 1, tfloat32, 1, tuint32,
            "unpack_half_1x16((uint16_t)(src0.x & 0xffff))")
-unop_horiz("unpack_half_2x16_split_y", 1, tfloat, 1, tuint,
+unop_horiz("unpack_half_2x16_split_y", 1, tfloat32, 1, tuint32,
            "unpack_half_1x16((uint16_t)(src0.x >> 16))")
 
 
 # Bit operations, part of ARB_gpu_shader5.
 
 
-unop("bitfield_reverse", tuint, """
+unop("bitfield_reverse", tuint32, """
 /* we're not winning any awards for speed here, but that's ok */
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++)
    dst |= ((src0 >> bit) & 1) << (31 - bit);
 """)
-unop("bit_count", tuint, """
+unop("bit_count", tuint32, """
 dst = 0;
 for (unsigned bit = 0; bit < 32; bit++) {
    if ((src0 >> bit) & 1)
@@ -274,7 +278,7 @@ for (unsigned bit = 0; bit < 32; bit++) {
 }
 """)
 
-unop_convert("ufind_msb", tint, tuint, """
+unop_convert("ufind_msb", tint32, tuint32, """
 dst = -1;
 for (int bit = 31; bit > 0; bit--) {
    if ((src0 >> bit) & 1) {
@@ -284,7 +288,7 @@ for (int bit = 31; bit > 0; bit--) {
 }
 """)
 
-unop("ifind_msb", tint, """
+unop("ifind_msb", tint32, """
 dst = -1;
 for (int bit = 31; bit >= 0; bit--) {
    /* If src0 < 0, we're looking for the first 0 bit.
@@ -298,7 +302,7 @@ for (int bit = 31; bit >= 0; bit--) {
 }
 """)
 
-unop("find_lsb", tint, """
+unop("find_lsb", tint32, """
 dst = -1;
 for (unsigned bit = 0; bit < 32; bit++) {
    if ((src0 >> bit) & 1) {
@@ -358,10 +362,10 @@ binop("fmul", tfloat, commutative + associative, "src0 * src1")
 # low 32-bits of signed/unsigned integer multiply
 binop("imul", tint, commutative + associative, "src0 * src1")
 # high 32-bits of signed integer multiply
-binop("imul_high", tint, commutative,
+binop("imul_high", tint32, commutative,
       "(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
 # high 32-bits of unsigned integer multiply
-binop("umul_high", tuint, commutative,
+binop("umul_high", tuint32, commutative,
       "(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")
 
 binop("fdiv", tfloat, "", "src0 / src1")
@@ -412,18 +416,18 @@ binop_reduce("bany_inequal", 1, tbool, tint, "{src0} != {src1}",
 
 # non-integer-aware GLSL-style comparisons that return 0.0 or 1.0
 
-binop_reduce("fall_equal",  1, tfloat, tfloat, "{src0} == {src1}",
+binop_reduce("fall_equal",  1, tfloat32, tfloat32, "{src0} == {src1}",
              "{src0} && {src1}", "{src} ? 1.0f : 0.0f")
-binop_reduce("fany_nequal", 1, tfloat, tfloat, "{src0} != {src1}",
+binop_reduce("fany_nequal", 1, tfloat32, tfloat32, "{src0} != {src1}",
              "{src0} || {src1}", "{src} ? 1.0f : 0.0f")
 
 # These comparisons for integer-less hardware return 1.0 and 0.0 for true
 # and false respectively
 
-binop("slt", tfloat, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
-binop("sge", tfloat, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
-binop("seq", tfloat, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
-binop("sne", tfloat, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
+binop("slt", tfloat32, "", "(src0 < src1) ? 1.0f : 0.0f") # Set on Less Than
+binop("sge", tfloat32, "", "(src0 >= src1) ? 1.0f : 0.0f") # Set on Greater or Equal
+binop("seq", tfloat32, commutative, "(src0 == src1) ? 1.0f : 0.0f") # Set on Equal
+binop("sne", tfloat32, commutative, "(src0 != src1) ? 1.0f : 0.0f") # Set on Not Equal
 
 
 binop("ishl", tint, "", "src0 << src1")
@@ -446,11 +450,11 @@ binop("ixor", tuint, commutative + associative, "src0 ^ src1")
 # These use (src != 0.0) for testing the truth of the input, and output 1.0
 # for true and 0.0 for false
 
-binop("fand", tfloat, commutative,
+binop("fand", tfloat32, commutative,
       "((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
-binop("for", tfloat, commutative,
+binop("for", tfloat32, commutative,
       "((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
-binop("fxor", tfloat, commutative,
+binop("fxor", tfloat32, commutative,
       "(src0 != 0.0f && src1 == 0.0f) || (src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")
 
 binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
@@ -472,7 +476,7 @@ binop("imax", tint, commutative + associative, "src1 > src0 ? src1 : src0")
 binop("umax", tuint, commutative + associative, "src1 > src0 ? src1 : src0")
 
 # Saturated vector add for 4 8bit ints.
-binop("usadd_4x8", tint, commutative + associative, """
+binop("usadd_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
@@ -480,7 +484,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # Saturated vector subtract for 4 8bit ints.
-binop("ussub_4x8", tint, "", """
+binop("ussub_4x8", tint32, "", """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    int src0_chan = (src0 >> i) & 0xff;
@@ -491,7 +495,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # vector min for 4 8bit ints.
-binop("umin_4x8", tint, commutative + associative, """
+binop("umin_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -499,7 +503,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # vector max for 4 8bit ints.
-binop("umax_4x8", tint, commutative + associative, """
+binop("umax_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
@@ -507,7 +511,7 @@ for (int i = 0; i < 32; i += 8) {
 """)
 
 # unorm multiply: (a * b) / 255.
-binop("umul_unorm_4x8", tint, commutative + associative, """
+binop("umul_unorm_4x8", tint32, commutative + associative, """
 dst = 0;
 for (int i = 0; i < 32; i += 8) {
    int src0_chan = (src0 >> i) & 0xff;
@@ -516,15 +520,15 @@ for (int i = 0; i < 32; i += 8) {
 }
 """)
 
-binop("fpow", tfloat, "", "powf(src0, src1)")
+binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
 
-binop_horiz("pack_half_2x16_split", 1, tuint, 1, tfloat, 1, tfloat,
+binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
             "pack_half_1x16(src0.x) | (pack_half_1x16(src1.x) << 16)")
 
 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
 # and that of the "bfi1" i965 instruction. That is, it has undefined behavior
 # if either of its arguments are 32.
-binop_convert("bfm", tuint, tint, "", """
+binop_convert("bfm", tuint32, tint32, "", """
 int bits = src0, offset = src1;
 if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
    dst = 0; /* undefined */
@@ -533,7 +537,7 @@ else
 """)
 
 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint], "", """
-dst = ldexpf(src0, src1);
+dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
 /* flush denormals to zero. */
 if (!isnormal(dst))
    dst = copysignf(0.0f, src0);
@@ -573,12 +577,12 @@ triop("flrp", tfloat, "src0 * (1 - src2) + src1 * src2")
 # bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).
 
 
-triop("fcsel", tfloat, "(src0 != 0.0f) ? src1 : src2")
+triop("fcsel", tfloat32, "(src0 != 0.0f) ? src1 : src2")
 opcode("bcsel", 0, tuint, [0, 0, 0],
       [tbool, tuint, tuint], "", "src0 ? src1 : src2")
 
 # SM5 bfi assembly
-triop("bfi", tuint, """
+triop("bfi", tuint32, """
 unsigned mask = src0, insert = src1, base = src2;
 if (mask == 0) {
    dst = base;
@@ -593,8 +597,8 @@ if (mask == 0) {
 """)
 
 # SM5 ubfe/ibfe assembly
-opcode("ubfe", 0, tuint,
-       [0, 0, 0], [tuint, tint, tint], "", """
+opcode("ubfe", 0, tuint32,
+       [0, 0, 0], [tuint32, tint32, tint32], "", """
 unsigned base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -607,8 +611,8 @@ if (bits == 0) {
    dst = base >> offset;
 }
 """)
-opcode("ibfe", 0, tint,
-       [0, 0, 0], [tint, tint, tint], "", """
+opcode("ibfe", 0, tint32,
+       [0, 0, 0], [tint32, tint32, tint32], "", """
 int base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -623,8 +627,8 @@ if (bits == 0) {
 """)
 
 # GLSL bitfieldExtract()
-opcode("ubitfield_extract", 0, tuint,
-       [0, 0, 0], [tuint, tint, tint], "", """
+opcode("ubitfield_extract", 0, tuint32,
+       [0, 0, 0], [tuint32, tint32, tint32], "", """
 unsigned base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -635,8 +639,8 @@ if (bits == 0) {
    dst = (base >> offset) & ((1ull << bits) - 1);
 }
 """)
-opcode("ibitfield_extract", 0, tint,
-       [0, 0, 0], [tint, tint, tint], "", """
+opcode("ibitfield_extract", 0, tint32,
+       [0, 0, 0], [tint32, tint32, tint32], "", """
 int base = src0;
 int offset = src1, bits = src2;
 if (bits == 0) {
@@ -663,8 +667,8 @@ def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
           [tuint, tuint, tuint, tuint],
           "", const_expr)
 
-opcode("bitfield_insert", 0, tuint, [0, 0, 0, 0],
-       [tuint, tuint, tint, tint], "", """
+opcode("bitfield_insert", 0, tuint32, [0, 0, 0, 0],
+       [tuint32, tuint32, tint32, tint32], "", """
 unsigned base = src0, insert = src1;
 int offset = src2, bits = src3;
 if (bits == 0) {
diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index 04876a42fd7..63eca1c31cc 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -46,10 +46,28 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
    if (!instr->dest.dest.is_ssa)
       return false;
 
+   /* In the case that any outputs/inputs have unsized types, then we need to
+    * guess the bit-size. In this case, the validator ensures that all
+    * bit-sizes match so we can just take the bit-size from the first
+    * output/input with an unsized type. If all the outputs/inputs are sized
+    * then we don't need to guess the bit-size at all because the code we
+    * generate for constant opcodes in this case already knows the sizes of
+    * the types involved and does not need the provided bit-size for anything
+    * (although it must still be passed a valid bit-size).
+    */
+   unsigned bit_size = 0;
+   if (!nir_alu_type_get_type_size(nir_op_infos[instr->op].output_type))
+      bit_size = instr->dest.dest.ssa.bit_size;
+
    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
       if (!instr->src[i].src.is_ssa)
          return false;
 
+      if (bit_size == 0 &&
+          !nir_alu_type_get_type_size(nir_op_infos[instr->op].input_types[i])) {
+         bit_size = instr->src[i].src.ssa->bit_size;
+      }
+
       nir_instr *src_instr = instr->src[i].src.ssa->parent_instr;
 
       if (src_instr->type != nir_instr_type_load_const)
@@ -58,24 +76,31 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
 
       for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i);
            j++) {
-         src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]];
+         if (load_const->def.bit_size == 64)
+            src[i].ul[j] = load_const->value.ul[instr->src[i].swizzle[j]];
+         else
+            src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]];
       }
 
       /* We shouldn't have any source modifiers in the optimization loop. */
       assert(!instr->src[i].abs && !instr->src[i].negate);
    }
 
+   if (bit_size == 0)
+      bit_size = 32;
+
    /* We shouldn't have any saturate modifiers in the optimization loop. */
    assert(!instr->dest.saturate);
 
    nir_const_value dest =
       nir_eval_const_opcode(instr->op, instr->dest.dest.ssa.num_components,
-                            src);
+                            bit_size, src);
 
    nir_load_const_instr *new_instr =
       nir_load_const_instr_create(mem_ctx,
                                   instr->dest.dest.ssa.num_components);
 
+   new_instr->def.bit_size = instr->dest.dest.ssa.bit_size;
    new_instr->value = dest;
 
    nir_instr_insert_before(&instr->instr, &new_instr->instr);
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index f5826d85174..bfa1a23ae49 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -885,7 +885,9 @@ ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
         struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
         struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);
 
-        if (nir_op_infos[compare_instr->op].input_types[0] == nir_type_float)
+        unsigned unsized_type =
+                nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
+        if (unsized_type == nir_type_float)
                 qir_SF(c, qir_FSUB(c, src0, src1));
         else
                 qir_SF(c, qir_SUB(c, src0, src1));
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 24350460466..ed941a8c8c7 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -623,12 +623,24 @@ brw_type_for_nir_type(nir_alu_type type)
 {
    switch (type) {
    case nir_type_uint:
+   case nir_type_uint32:
       return BRW_REGISTER_TYPE_UD;
    case nir_type_bool:
    case nir_type_int:
+   case nir_type_bool32:
+   case nir_type_int32:
       return BRW_REGISTER_TYPE_D;
    case nir_type_float:
+   case nir_type_float32:
       return BRW_REGISTER_TYPE_F;
+   case nir_type_float64:
+      return BRW_REGISTER_TYPE_DF;
+   case nir_type_int64:
+   case nir_type_uint64:
+      /* TODO we should only see these in moves, so for now it's ok, but when
+       * we add actual 64-bit integer support we should fix this.
+       */
+      return BRW_REGISTER_TYPE_DF;
    default:
       unreachable("unknown type");
    }
@@ -644,12 +656,18 @@ brw_glsl_base_type_for_nir_type(nir_alu_type type)
 {
    switch (type) {
    case nir_type_float:
+   case nir_type_float32:
       return GLSL_TYPE_FLOAT;
 
+   case nir_type_float64:
+      return GLSL_TYPE_DOUBLE;
+
    case nir_type_int:
+   case nir_type_int32:
       return GLSL_TYPE_INT;
 
    case nir_type_uint:
+   case nir_type_uint32:
       return GLSL_TYPE_UINT;
 
    default:

From 084b24f5582567ebf5aa94b7f40ae3bdcb71316b Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Wed, 16 Mar 2016 12:11:34 +0100
Subject: [PATCH 007/197] nir: rename nir_const_value fields to include bitsize
 information
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/compiler/nir/nir.c                        |  4 +-
 src/compiler/nir/nir.h                        | 12 ++--
 src/compiler/nir/nir_builder.h                | 20 +++---
 src/compiler/nir/nir_constant_expressions.py  | 22 +++---
 src/compiler/nir/nir_gs_count_vertices.c      |  4 +-
 src/compiler/nir/nir_instr_set.c              |  8 +--
 src/compiler/nir/nir_lower_atomics.c          |  6 +-
 .../nir/nir_lower_load_const_to_scalar.c      |  2 +-
 src/compiler/nir/nir_lower_locals_to_regs.c   |  2 +-
 src/compiler/nir/nir_lower_tex.c              |  6 +-
 src/compiler/nir/nir_opt_constant_folding.c   |  6 +-
 src/compiler/nir/nir_opt_dead_cf.c            |  2 +-
 src/compiler/nir/nir_print.c                  |  2 +-
 src/compiler/nir/nir_search.c                 | 10 +--
 src/gallium/auxiliary/nir/tgsi_to_nir.c       |  2 +-
 .../drivers/freedreno/ir3/ir3_compiler_nir.c  | 10 +--
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    |  6 +-
 src/gallium/drivers/vc4/vc4_program.c         | 10 +--
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp      | 70 +++++++++----------
 src/mesa/drivers/dri/i965/brw_nir.c           |  4 +-
 .../i965/brw_nir_analyze_boolean_resolves.c   |  2 +-
 src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp |  4 +-
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp    | 44 ++++++------
 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp    |  2 +-
 24 files changed, 130 insertions(+), 130 deletions(-)

diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index d07550a6b03..8fa75e4e5dc 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -700,10 +700,10 @@ nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref)
       case GLSL_TYPE_FLOAT:
       case GLSL_TYPE_INT:
       case GLSL_TYPE_UINT:
-         load->value.u[i] = constant->value.u[matrix_offset + i];
+         load->value.u32[i] = constant->value.u[matrix_offset + i];
          break;
       case GLSL_TYPE_BOOL:
-         load->value.u[i] = constant->value.b[matrix_offset + i] ?
+         load->value.u32[i] = constant->value.b[matrix_offset + i] ?
                              NIR_TRUE : NIR_FALSE;
          break;
       default:
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 824f4e20706..2ddc2dc4404 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1209,12 +1209,12 @@ nir_tex_instr_src_index(nir_tex_instr *instr, nir_tex_src_type type)
 
 typedef struct {
    union {
-      float f[4];
-      double d[4];
-      int32_t i[4];
-      uint32_t u[4];
-      int64_t l[4];
-      uint64_t ul[4];
+      float f32[4];
+      double f64[4];
+      int32_t i32[4];
+      uint32_t u32[4];
+      int64_t i64[4];
+      uint64_t u64[4];
    };
 } nir_const_value;
 
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index d546e41b5fe..02e4526dcaa 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -90,7 +90,7 @@ nir_imm_float(nir_builder *build, float x)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.f[0] = x;
+   v.f32[0] = x;
 
    return nir_build_imm(build, 1, v);
 }
@@ -101,10 +101,10 @@ nir_imm_vec4(nir_builder *build, float x, float y, float z, float w)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.f[0] = x;
-   v.f[1] = y;
-   v.f[2] = z;
-   v.f[3] = w;
+   v.f32[0] = x;
+   v.f32[1] = y;
+   v.f32[2] = z;
+   v.f32[3] = w;
 
    return nir_build_imm(build, 4, v);
 }
@@ -115,7 +115,7 @@ nir_imm_int(nir_builder *build, int x)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.i[0] = x;
+   v.i32[0] = x;
 
    return nir_build_imm(build, 1, v);
 }
@@ -126,10 +126,10 @@ nir_imm_ivec4(nir_builder *build, int x, int y, int z, int w)
    nir_const_value v;
 
    memset(&v, 0, sizeof(v));
-   v.i[0] = x;
-   v.i[1] = y;
-   v.i[2] = z;
-   v.i[3] = w;
+   v.i32[0] = x;
+   v.i32[1] = y;
+   v.i32[2] = z;
+   v.i32[3] = w;
 
    return nir_build_imm(build, 4, v);
 }
diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
index 972d2819af9..e36dc4853b5 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -22,19 +22,19 @@ def type_add_size(type_, size):
 
 def get_const_field(type_):
     if type_ == "int32":
-        return "i"
+        return "i32"
     if type_ == "uint32":
-        return "u"
+        return "u32"
     if type_ == "int64":
-        return "l"
+        return "i64"
     if type_ == "uint64":
-        return "ul"
+        return "u64"
     if type_ == "bool32":
-        return "b"
+        return "u32"
     if type_ == "float32":
-        return "f"
+        return "f32"
     if type_ == "float64":
-        return "d"
+        return "f64"
     raise Exception(str(type_))
     assert(0)
 
@@ -294,7 +294,7 @@ evaluate_${name}(unsigned num_components, unsigned bit_size,
          struct ${input_types[j]}_vec src${j} = {
          % for k in range(op.input_sizes[j]):
             % if input_types[j] == "bool32":
-               _src[${j}].u[${k}] != 0,
+               _src[${j}].u32[${k}] != 0,
             % else:
                _src[${j}].${get_const_field(input_types[j])}[${k}],
             % endif
@@ -316,7 +316,7 @@ evaluate_${name}(unsigned num_components, unsigned bit_size,
                   ## Avoid unused variable warnings
                   <% continue %>
                % elif input_types[j] == "bool32":
-                  bool src${j} = _src[${j}].u[_i] != 0;
+                  bool src${j} = _src[${j}].u32[_i] != 0;
                % else:
                   ${input_types[j]}_t src${j} =
                      _src[${j}].${get_const_field(input_types[j])}[_i];
@@ -337,7 +337,7 @@ evaluate_${name}(unsigned num_components, unsigned bit_size,
             ## value of dst.
             % if output_type == "bool32":
                ## Sanitize the C value to a proper NIR bool
-               _dst_val.u[_i] = dst ? NIR_TRUE : NIR_FALSE;
+               _dst_val.u32[_i] = dst ? NIR_TRUE : NIR_FALSE;
             % else:
                _dst_val.${get_const_field(output_type)}[_i] = dst;
             % endif
@@ -364,7 +364,7 @@ evaluate_${name}(unsigned num_components, unsigned bit_size,
          % for k in range(op.output_size):
             % if output_type == "bool32":
                ## Sanitize the C value to a proper NIR bool
-               _dst_val.u[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
+               _dst_val.u32[${k}] = dst.${"xyzw"[k]} ? NIR_TRUE : NIR_FALSE;
             % else:
                _dst_val.${get_const_field(output_type)}[${k}] = dst.${"xyzw"[k]};
             % endif
diff --git a/src/compiler/nir/nir_gs_count_vertices.c b/src/compiler/nir/nir_gs_count_vertices.c
index db15d160ee7..3c1bd2a59bd 100644
--- a/src/compiler/nir/nir_gs_count_vertices.c
+++ b/src/compiler/nir/nir_gs_count_vertices.c
@@ -77,13 +77,13 @@ nir_gs_count_vertices(const nir_shader *shader)
                return -1;
 
             if (count == -1)
-               count = val->i[0];
+               count = val->i32[0];
 
             /* We've found contradictory set_vertex_count intrinsics.
              * This can happen if there are early-returns in main() and
              * different paths emit different numbers of vertices.
              */
-            if (count != val->i[0])
+            if (count != val->i32[0])
                return -1;
          }
       }
diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c
index 159ded0e72b..3f5da496092 100644
--- a/src/compiler/nir/nir_instr_set.c
+++ b/src/compiler/nir/nir_instr_set.c
@@ -81,9 +81,9 @@ hash_load_const(uint32_t hash, const nir_load_const_instr *instr)
 {
    hash = HASH(hash, instr->def.num_components);
 
-   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f,
+   hash = _mesa_fnv32_1a_accumulate_block(hash, instr->value.f32,
                                           instr->def.num_components
-                                             * sizeof(instr->value.f[0]));
+                                             * sizeof(instr->value.f32[0]));
 
    return hash;
 }
@@ -322,8 +322,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
       if (load1->def.num_components != load2->def.num_components)
          return false;
 
-      return memcmp(load1->value.f, load2->value.f,
-                    load1->def.num_components * sizeof(*load2->value.f)) == 0;
+      return memcmp(load1->value.f32, load2->value.f32,
+                    load1->def.num_components * sizeof(*load2->value.f32)) == 0;
    }
    case nir_instr_type_phi: {
       nir_phi_instr *phi1 = nir_instr_as_phi(instr1);
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index eefcb55a0a6..e066cf222a7 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -75,7 +75,7 @@ lower_instr(nir_intrinsic_instr *instr,
       state->shader_program->UniformStorage[uniform_loc].opaque[state->shader->stage].index);
 
    nir_load_const_instr *offset_const = nir_load_const_instr_create(mem_ctx, 1);
-   offset_const->value.u[0] = instr->variables[0]->var->data.offset;
+   offset_const->value.u32[0] = instr->variables[0]->var->data.offset;
 
    nir_instr_insert_before(&instr->instr, &offset_const->instr);
 
@@ -90,13 +90,13 @@ lower_instr(nir_intrinsic_instr *instr,
       unsigned child_array_elements = tail->child != NULL ?
          glsl_get_aoa_size(tail->type) : 1;
 
-      offset_const->value.u[0] += deref_array->base_offset *
+      offset_const->value.u32[0] += deref_array->base_offset *
          child_array_elements * ATOMIC_COUNTER_SIZE;
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
          nir_load_const_instr *atomic_counter_size =
                nir_load_const_instr_create(mem_ctx, 1);
-         atomic_counter_size->value.u[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
+         atomic_counter_size->value.u32[0] = child_array_elements * ATOMIC_COUNTER_SIZE;
          nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
 
          nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
diff --git a/src/compiler/nir/nir_lower_load_const_to_scalar.c b/src/compiler/nir/nir_lower_load_const_to_scalar.c
index 1eeed13cbac..b5df46413f1 100644
--- a/src/compiler/nir/nir_lower_load_const_to_scalar.c
+++ b/src/compiler/nir/nir_lower_load_const_to_scalar.c
@@ -49,7 +49,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower)
    nir_ssa_def *loads[4];
    for (unsigned i = 0; i < lower->def.num_components; i++) {
       nir_load_const_instr *load_comp = nir_load_const_instr_create(b.shader, 1);
-      load_comp->value.u[0] = lower->value.u[i];
+      load_comp->value.u32[0] = lower->value.u32[i];
       nir_builder_instr_insert(&b, &load_comp->instr);
       loads[i] = &load_comp->def;
    }
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index 45036fa7787..235cb842908 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -161,7 +161,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
       if (src.reg.indirect) {
          nir_load_const_instr *load_const =
             nir_load_const_instr_create(state->shader, 1);
-         load_const->value.u[0] = glsl_get_length(parent_type);
+         load_const->value.u32[0] = glsl_get_length(parent_type);
          nir_instr_insert_before(instr, &load_const->instr);
 
          nir_alu_instr *mul = nir_alu_instr_create(state->shader, nir_op_imul);
diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c
index 806acd8333c..f737463b877 100644
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -223,13 +223,13 @@ get_zero_or_one(nir_builder *b, nir_alu_type type, uint8_t swizzle_val)
    memset(&v, 0, sizeof(v));
 
    if (swizzle_val == 4) {
-      v.u[0] = v.u[1] = v.u[2] = v.u[3] = 0;
+      v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 0;
    } else {
       assert(swizzle_val == 5);
       if (type == nir_type_float)
-         v.f[0] = v.f[1] = v.f[2] = v.f[3] = 1.0;
+         v.f32[0] = v.f32[1] = v.f32[2] = v.f32[3] = 1.0;
       else
-         v.u[0] = v.u[1] = v.u[2] = v.u[3] = 1;
+         v.u32[0] = v.u32[1] = v.u32[2] = v.u32[3] = 1;
    }
 
    return nir_build_imm(b, 4, v);
diff --git a/src/compiler/nir/nir_opt_constant_folding.c b/src/compiler/nir/nir_opt_constant_folding.c
index 63eca1c31cc..e64ca369bbc 100644
--- a/src/compiler/nir/nir_opt_constant_folding.c
+++ b/src/compiler/nir/nir_opt_constant_folding.c
@@ -77,9 +77,9 @@ constant_fold_alu_instr(nir_alu_instr *instr, void *mem_ctx)
       for (unsigned j = 0; j < nir_ssa_alu_instr_src_components(instr, i);
            j++) {
          if (load_const->def.bit_size == 64)
-            src[i].ul[j] = load_const->value.ul[instr->src[i].swizzle[j]];
+            src[i].u64[j] = load_const->value.u64[instr->src[i].swizzle[j]];
          else
-            src[i].u[j] = load_const->value.u[instr->src[i].swizzle[j]];
+            src[i].u32[j] = load_const->value.u32[instr->src[i].swizzle[j]];
       }
 
       /* We shouldn't have any source modifiers in the optimization loop. */
@@ -131,7 +131,7 @@ constant_fold_deref(nir_instr *instr, nir_deref_var *deref)
          nir_load_const_instr *indirect =
             nir_instr_as_load_const(arr->indirect.ssa->parent_instr);
 
-         arr->base_offset += indirect->value.u[0];
+         arr->base_offset += indirect->value.u32[0];
 
          /* Clear out the source */
          nir_instr_rewrite_src(instr, &arr->indirect, nir_src_for_ssa(NULL));
diff --git a/src/compiler/nir/nir_opt_dead_cf.c b/src/compiler/nir/nir_opt_dead_cf.c
index 4cc6798702b..4658b23c57b 100644
--- a/src/compiler/nir/nir_opt_dead_cf.c
+++ b/src/compiler/nir/nir_opt_dead_cf.c
@@ -228,7 +228,7 @@ dead_cf_block(nir_block *block)
      if (!const_value)
         return false;
 
-      opt_constant_if(following_if, const_value->u[0] != 0);
+      opt_constant_if(following_if, const_value->u32[0] != 0);
       return true;
    }
 
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index b8943b83f46..d3d5b84a024 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -711,7 +711,7 @@ print_load_const_instr(nir_load_const_instr *instr, print_state *state)
        * and then print the float in a comment for readability.
        */
 
-      fprintf(fp, "0x%08x /* %f */", instr->value.u[i], instr->value.f[i]);
+      fprintf(fp, "0x%08x /* %f */", instr->value.u32[i], instr->value.f32[i]);
    }
 
    fprintf(fp, ")");
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 56d7e8162f3..e7164a76110 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -161,7 +161,7 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
       switch (nir_op_infos[instr->op].input_types[src]) {
       case nir_type_float:
          for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.f[new_swizzle[i]] != const_val->data.f)
+            if (load->value.f32[new_swizzle[i]] != const_val->data.f)
                return false;
          }
          return true;
@@ -169,7 +169,7 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
       case nir_type_uint:
       case nir_type_bool:
          for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.i[new_swizzle[i]] != const_val->data.i)
+            if (load->value.i32[new_swizzle[i]] != const_val->data.i)
                return false;
          }
          return true;
@@ -304,15 +304,15 @@ construct_value(const nir_search_value *value, nir_alu_type type,
       switch (type) {
       case nir_type_float:
          load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f);
-         load->value.f[0] = c->data.f;
+         load->value.f32[0] = c->data.f;
          break;
       case nir_type_int:
          load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i);
-         load->value.i[0] = c->data.i;
+         load->value.i32[0] = c->data.i;
          break;
       case nir_type_uint:
       case nir_type_bool:
-         load->value.u[0] = c->data.u;
+         load->value.u32[0] = c->data.u;
          break;
       default:
          unreachable("Invalid alu source type");
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index b71917618c1..80fd3b69d19 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -459,7 +459,7 @@ ttn_emit_immediate(struct ttn_compile *c)
    c->next_imm++;
 
    for (i = 0; i < 4; i++)
-      load_const->value.u[i] = tgsi_imm->u[i].Uint;
+      load_const->value.u32[i] = tgsi_imm->u[i].Uint;
 
    nir_builder_instr_insert(b, &load_const->instr);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 7a1812f2518..54315d2f592 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1017,7 +1017,7 @@ emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 
 	const_offset = nir_src_as_const_value(intr->src[1]);
 	if (const_offset) {
-		off += const_offset->u[0];
+		off += const_offset->u32[0];
 	} else {
 		/* For load_ubo_indirect, second src is indirect offset: */
 		src1 = get_src(ctx, &intr->src[1])[0];
@@ -1159,7 +1159,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = create_uniform(ctx, n);
@@ -1186,7 +1186,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[0]);
 		if (const_offset) {
-			idx += const_offset->u[0];
+			idx += const_offset->u32[0];
 			for (int i = 0; i < intr->num_components; i++) {
 				unsigned n = idx * 4 + i;
 				dst[i] = ctx->ir->inputs[n];
@@ -1213,7 +1213,7 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		idx = nir_intrinsic_base(intr);
 		const_offset = nir_src_as_const_value(intr->src[1]);
 		compile_assert(ctx, const_offset != NULL);
-		idx += const_offset->u[0];
+		idx += const_offset->u32[0];
 
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
@@ -1301,7 +1301,7 @@ emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
 	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
 			instr->def.num_components);
 	for (int i = 0; i < instr->def.num_components; i++)
-		dst[i] = create_immed(ctx->block, instr->value.u[i]);
+		dst[i] = create_immed(ctx->block, instr->value.u32[i]);
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index d47e3bf52b0..941673c80fa 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -183,7 +183,7 @@ vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
          * with an offset value of 0.
          */
         assert(nir_src_as_const_value(intr->src[0]) &&
-               nir_src_as_const_value(intr->src[0])->u[0] == 0);
+               nir_src_as_const_value(intr->src[0])->u32[0] == 0);
 
         /* Generate dword loads for the VPM values (Since these intrinsics may
          * be reordered, the actual reads will be generated at the top of the
@@ -256,7 +256,7 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
          * with an offset value of 0.
          */
         assert(nir_src_as_const_value(intr->src[0]) &&
-               nir_src_as_const_value(intr->src[0])->u[0] == 0);
+               nir_src_as_const_value(intr->src[0])->u32[0] == 0);
 
         /* Generate scalar loads equivalent to the original VEC4. */
         nir_ssa_def *dests[4];
@@ -339,7 +339,7 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
          * with an offset value of 0.
          */
         assert(nir_src_as_const_value(intr->src[1]) &&
-               nir_src_as_const_value(intr->src[1])->u[0] == 0);
+               nir_src_as_const_value(intr->src[1])->u32[0] == 0);
 
         b->cursor = nir_before_instr(&intr->instr);
 
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index bfa1a23ae49..7deca8761b8 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1521,7 +1521,7 @@ ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
 {
         struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
         for (int i = 0; i < instr->def.num_components; i++)
-                qregs[i] = qir_uniform_ui(c, instr->value.u[i]);
+                qregs[i] = qir_uniform_ui(c, instr->value.u32[i]);
 
         _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
 }
@@ -1555,7 +1555,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 assert(instr->num_components == 1);
                 const_offset = nir_src_as_const_value(instr->src[0]);
                 if (const_offset) {
-                        offset = instr->const_index[0] + const_offset->u[0];
+                        offset = instr->const_index[0] + const_offset->u32[0];
                         assert(offset % 4 == 0);
                         /* We need dwords */
                         offset = offset / 4;
@@ -1586,7 +1586,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 const_offset = nir_src_as_const_value(instr->src[0]);
                 assert(const_offset && "vc4 doesn't support indirect inputs");
                 if (instr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT) {
-                        assert(const_offset->u[0] == 0);
+                        assert(const_offset->u32[0] == 0);
                         /* Reads of the per-sample color need to be done in
                          * order.
                          */
@@ -1600,7 +1600,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                         }
                         *dest = c->color_reads[sample_index];
                 } else {
-                        offset = instr->const_index[0] + const_offset->u[0];
+                        offset = instr->const_index[0] + const_offset->u32[0];
                         *dest = c->inputs[offset];
                 }
                 break;
@@ -1608,7 +1608,7 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
         case nir_intrinsic_store_output:
                 const_offset = nir_src_as_const_value(instr->src[1]);
                 assert(const_offset && "vc4 doesn't support indirect outputs");
-                offset = instr->const_index[0] + const_offset->u[0];
+                offset = instr->const_index[0] + const_offset->u32[0];
 
                 /* MSAA color outputs are the only case where we have an
                  * output that's not lowered to being a store of a single 32
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index cde8f0b6381..4de559941ce 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -527,10 +527,10 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 
    enum opcode extract_op;
    if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) {
-      assert(element->u[0] <= 1);
+      assert(element->u32[0] <= 1);
       extract_op = SHADER_OPCODE_EXTRACT_WORD;
    } else {
-      assert(element->u[0] <= 3);
+      assert(element->u32[0] <= 3);
       extract_op = SHADER_OPCODE_EXTRACT_BYTE;
    }
 
@@ -539,7 +539,7 @@ fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 
    set_saturate(instr->dest.saturate,
-                bld.emit(extract_op, result, op0, brw_imm_ud(element->u[0])));
+                bld.emit(extract_op, result, op0, brw_imm_ud(element->u32[0])));
    return true;
 }
 
@@ -558,11 +558,11 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
       return false;
 
    nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-   if (!value1 || fabsf(value1->f[0]) != 1.0f)
+   if (!value1 || fabsf(value1->f32[0]) != 1.0f)
       return false;
 
    nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
-   if (!value2 || fabsf(value2->f[0]) != 1.0f)
+   if (!value2 || fabsf(value2->f32[0]) != 1.0f)
       return false;
 
    fs_reg tmp = vgrf(glsl_type::int_type);
@@ -582,7 +582,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
        * surely be TRIANGLES
        */
 
-      if (value1->f[0] == -1.0f) {
+      if (value1->f32[0] == -1.0f) {
          g0.negate = true;
       }
 
@@ -610,7 +610,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
        * surely be TRIANGLES
        */
 
-      if (value1->f[0] == -1.0f) {
+      if (value1->f32[0] == -1.0f) {
          g1_6.negate = true;
       }
 
@@ -1115,7 +1115,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_extract_i8: {
       nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
       bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
-               result, op[0], brw_imm_ud(byte->u[0]));
+               result, op[0], brw_imm_ud(byte->u32[0]));
       break;
    }
 
@@ -1123,7 +1123,7 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
    case nir_op_extract_i16: {
       nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
       bld.emit(SHADER_OPCODE_EXTRACT_WORD,
-               result, op[0], brw_imm_ud(word->u[0]));
+               result, op[0], brw_imm_ud(word->u32[0]));
       break;
    }
 
@@ -1150,7 +1150,7 @@ fs_visitor::nir_emit_load_const(const fs_builder &bld,
    fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, instr->def.num_components);
 
    for (unsigned i = 0; i < instr->def.num_components; i++)
-      bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i[i]));
+      bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
 
    nir_ssa_values[instr->def.index] = reg;
 }
@@ -1686,9 +1686,9 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
    const bool is_point_size = (base_offset == 0);
 
    if (offset_const != NULL && vertex_const != NULL &&
-       4 * (base_offset + offset_const->u[0]) < push_reg_count) {
-      int imm_offset = (base_offset + offset_const->u[0]) * 4 +
-                       vertex_const->u[0] * push_reg_count;
+       4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
+      int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
+                       vertex_const->u32[0] * push_reg_count;
       /* This input was pushed into registers. */
       if (is_point_size) {
          /* gl_PointSize comes in .w */
@@ -1710,7 +1710,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       if (vertex_const) {
          /* The vertex index is constant; just select the proper URB handle. */
          icp_handle =
-            retype(brw_vec8_grf(first_icp_handle + vertex_const->i[0], 0),
+            retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
                    BRW_REGISTER_TYPE_UD);
       } else {
          /* The vertex index is non-constant.  We need to use indirect
@@ -1754,7 +1754,7 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
       if (offset_const) {
          /* Constant indexing - use global offset. */
          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
-         inst->offset = base_offset + offset_const->u[0];
+         inst->offset = base_offset + offset_const->u32[0];
          inst->base_mrf = -1;
          inst->mlen = 1;
          inst->regs_written = num_components;
@@ -1792,7 +1792,7 @@ fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
        * add_const_offset_to_base() will fold other constant offsets
        * into instr->const_index[0].
        */
-      assert(const_value->u[0] == 0);
+      assert(const_value->u32[0] == 0);
       return fs_reg();
    }
 
@@ -2110,7 +2110,7 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
 
          if (const_sample) {
-            unsigned msg_data = const_sample->i[0] << 4;
+            unsigned msg_data = const_sample->i32[0] << 4;
 
             emit_pixel_interpolater_send(bld,
                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
@@ -2177,8 +2177,8 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
 
          if (const_offset) {
-            unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
-            unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
+            unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
+            unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
 
             emit_pixel_interpolater_send(bld,
                                          FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
@@ -2536,8 +2536,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
          /* Offsets are in bytes but they should always be multiples of 4 */
-         assert(const_offset->u[0] % 4 == 0);
-         src.reg_offset = const_offset->u[0] / 4;
+         assert(const_offset->u32[0] % 4 == 0);
+         src.reg_offset = const_offset->u32[0] / 4;
       } else {
          src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
       }
@@ -2554,7 +2554,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       if (const_index) {
          const unsigned index = stage_prog_data->binding_table.ubo_start +
-                                const_index->u[0];
+                                const_index->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(prog_data, index);
       } else {
@@ -2587,12 +2587,12 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg packed_consts = vgrf(glsl_type::float_type);
          packed_consts.type = dest.type;
 
-         struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u[0] & ~15);
+         struct brw_reg const_offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
          bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
                   surf_index, const_offset_reg);
 
          for (unsigned i = 0; i < instr->num_components; i++) {
-            packed_consts.set_smear(const_offset->u[0] % 16 / 4 + i);
+            packed_consts.set_smear(const_offset->u32[0] % 16 / 4 + i);
 
             /* The std140 packing rules don't allow vectors to cross 16-byte
              * boundaries, and a reg is 32 bytes.
@@ -2615,7 +2615,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg surf_index;
       if (const_uniform_block) {
          unsigned index = stage_prog_data->binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(prog_data, index);
       } else {
@@ -2634,7 +2634,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u[0]);
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
          offset_reg = get_nir_src(instr->src[1]);
       }
@@ -2660,7 +2660,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       fs_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
+         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
       } else {
          offset_reg = vgrf(glsl_type::uint_type);
          bld.ADD(offset_reg,
@@ -2704,7 +2704,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
          if (const_offset) {
-            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
+            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
                                     4 * first_component);
          } else {
             offset_reg = vgrf(glsl_type::uint_type);
@@ -2738,7 +2738,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       assert(const_offset && "Indirect input loads not allowed");
-      src = offset(src, bld, const_offset->u[0]);
+      src = offset(src, bld, const_offset->u32[0]);
 
       for (unsigned j = 0; j < instr->num_components; j++) {
          bld.MOV(offset(dest, bld, j), offset(src, bld, j));
@@ -2755,7 +2755,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          nir_src_as_const_value(instr->src[1]);
       if (const_uniform_block) {
          unsigned index = stage_prog_data->binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(prog_data, index);
       } else {
@@ -2786,7 +2786,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          fs_reg offset_reg;
          nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
          if (const_offset) {
-            offset_reg = brw_imm_ud(const_offset->u[0] + 4 * first_component);
+            offset_reg = brw_imm_ud(const_offset->u32[0] + 4 * first_component);
          } else {
             offset_reg = vgrf(glsl_type::uint_type);
             bld.ADD(offset_reg,
@@ -2814,7 +2814,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       assert(const_offset && "Indirect output stores not allowed");
-      new_dest = offset(new_dest, bld, const_offset->u[0]);
+      new_dest = offset(new_dest, bld, const_offset->u32[0]);
 
       for (unsigned j = 0; j < instr->num_components; j++) {
          bld.MOV(offset(new_dest, bld, j), offset(src, bld, j));
@@ -2855,7 +2855,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
 
    case nir_intrinsic_get_buffer_size: {
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
       int reg_width = dispatch_width / 8;
 
       /* Set LOD = 0 */
@@ -2906,7 +2906,7 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
    if (const_surface) {
       unsigned surf_index = stage_prog_data->binding_table.ssbo_start +
-                            const_surface->u[0];
+                            const_surface->u32[0];
       surface = brw_imm_ud(surf_index);
       brw_mark_surface_used(prog_data, surf_index);
    } else {
@@ -3031,7 +3031,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
          nir_const_value *const_offset =
             nir_src_as_const_value(instr->src[i].src);
          if (const_offset) {
-            tex_offset = brw_imm_ud(brw_texture_offset(const_offset->i, 3));
+            tex_offset = brw_imm_ud(brw_texture_offset(const_offset->i32, 3));
          } else {
             tex_offset = retype(src, BRW_REGISTER_TYPE_D);
          }
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index ed941a8c8c7..c62840a6e15 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -77,7 +77,7 @@ add_const_offset_to_base_block(nir_block *block, void *closure)
          nir_const_value *const_offset = nir_src_as_const_value(*offset);
 
          if (const_offset) {
-            intrin->const_index[0] += const_offset->u[0];
+            intrin->const_index[0] += const_offset->u32[0];
             b->cursor = nir_before_instr(&intrin->instr);
             nir_instr_rewrite_src(&intrin->instr, offset,
                                   nir_src_for_ssa(nir_imm_int(b, 0)));
@@ -175,7 +175,7 @@ remap_patch_urb_offsets(nir_block *block, void *closure)
          if (vertex) {
             nir_const_value *const_vertex = nir_src_as_const_value(*vertex);
             if (const_vertex) {
-               intrin->const_index[0] += const_vertex->u[0] *
+               intrin->const_index[0] += const_vertex->u32[0] *
                                          state->vue_map->num_per_vertex_slots;
             } else {
                state->b.cursor = nir_before_instr(&intrin->instr);
diff --git a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
index 12ac853478a..22eeb1a1296 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_analyze_boolean_resolves.c
@@ -225,7 +225,7 @@ analyze_boolean_resolves_block(nir_block *block, void *void_state)
           * have to worry about resolving them.
           */
          instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
-         if (load->value.u[0] == NIR_TRUE || load->value.u[0] == NIR_FALSE) {
+         if (load->value.u32[0] == NIR_TRUE || load->value.u32[0] == NIR_FALSE) {
             instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
          } else {
             instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
index d9c048e1764..e915aee3bd0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_nir.cpp
@@ -70,8 +70,8 @@ vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       /* Make up a type...we have no way of knowing... */
       const glsl_type *const type = glsl_type::ivec(instr->num_components);
 
-      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u[0] +
-                          instr->const_index[0] + offset->u[0],
+      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
+                          instr->const_index[0] + offset->u32[0],
                     type);
       /* gl_PointSize is passed in the .w component of the VUE header */
       if (instr->const_index[0] == VARYING_SLOT_PSIZ)
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 52977f1c12b..eef3940b643 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -352,7 +352,7 @@ vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
        * add_const_offset_to_base() will fold other constant offsets
        * into instr->const_index[0].
        */
-      assert(const_value->u[0] == 0);
+      assert(const_value->u32[0] == 0);
       return src_reg();
    }
 
@@ -378,13 +378,13 @@ vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
          continue;
 
       for (unsigned j = i; j < instr->def.num_components; j++) {
-         if (instr->value.u[i] == instr->value.u[j]) {
+         if (instr->value.u32[i] == instr->value.u32[j]) {
             writemask |= 1 << j;
          }
       }
 
       reg.writemask = writemask;
-      emit(MOV(reg, brw_imm_d(instr->value.i[i])));
+      emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
 
       remaining &= ~writemask;
    }
@@ -409,7 +409,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       /* We set EmitNoIndirectInput for VS */
       assert(const_offset);
 
-      src = src_reg(ATTR, instr->const_index[0] + const_offset->u[0],
+      src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
                     glsl_type::uvec4_type);
 
       dest = get_nir_dest(instr->dest, src.type);
@@ -423,7 +423,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       assert(const_offset);
 
-      int varying = instr->const_index[0] + const_offset->u[0];
+      int varying = instr->const_index[0] + const_offset->u32[0];
 
       src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
                         instr->num_components);
@@ -434,7 +434,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
    case nir_intrinsic_get_buffer_size: {
       nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
-      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u[0] : 0;
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
 
       const unsigned index =
          prog_data->base.binding_table.ssbo_start + ssbo_index;
@@ -467,7 +467,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          nir_src_as_const_value(instr->src[1]);
       if (const_uniform_block) {
          unsigned index = prog_data->base.binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(&prog_data->base, index);
       } else {
@@ -485,7 +485,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u[0]);
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
          offset_reg = get_nir_src(instr->src[2], 1);
       }
@@ -605,7 +605,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg surf_index;
       if (const_uniform_block) {
          unsigned index = prog_data->base.binding_table.ssbo_start +
-                          const_uniform_block->u[0];
+                          const_uniform_block->u32[0];
          surf_index = brw_imm_ud(index);
 
          brw_mark_surface_used(&prog_data->base, index);
@@ -626,7 +626,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg offset_reg;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset_reg = brw_imm_ud(const_offset->u[0]);
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
       } else {
          offset_reg = get_nir_src(instr->src[1], 1);
       }
@@ -706,8 +706,8 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
       if (const_offset) {
          /* Offsets are in bytes but they should always be multiples of 16 */
-         assert(const_offset->u[0] % 16 == 0);
-         src.reg_offset = const_offset->u[0] / 16;
+         assert(const_offset->u32[0] % 16 == 0);
+         src.reg_offset = const_offset->u32[0] / 16;
       } else {
          src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
          src.reladdr = new(mem_ctx) src_reg(tmp);
@@ -767,7 +767,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           * as an immediate.
           */
          const unsigned index = prog_data->base.binding_table.ubo_start +
-                                const_block_index->u[0];
+                                const_block_index->u32[0];
          surf_index = brw_imm_ud(index);
          brw_mark_surface_used(&prog_data->base, index);
       } else {
@@ -792,7 +792,7 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       src_reg offset;
       nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
       if (const_offset) {
-         offset = brw_imm_ud(const_offset->u[0] & ~15);
+         offset = brw_imm_ud(const_offset->u32[0] & ~15);
       } else {
          offset = get_nir_src(instr->src[1], nir_type_int, 1);
       }
@@ -807,10 +807,10 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
       if (const_offset) {
-         packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u[0] % 16 / 4,
-                                               const_offset->u[0] % 16 / 4,
-                                               const_offset->u[0] % 16 / 4,
-                                               const_offset->u[0] % 16 / 4);
+         packed_consts.swizzle += BRW_SWIZZLE4(const_offset->u32[0] % 16 / 4,
+                                               const_offset->u32[0] % 16 / 4,
+                                               const_offset->u32[0] % 16 / 4,
+                                               const_offset->u32[0] % 16 / 4);
       }
 
       emit(MOV(dest, packed_consts));
@@ -852,7 +852,7 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
    nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
    if (const_surface) {
       unsigned surf_index = prog_data->base.binding_table.ssbo_start +
-                            const_surface->u[0];
+                            const_surface->u32[0];
       surface = brw_imm_ud(surf_index);
       brw_mark_surface_used(&prog_data->base, surf_index);
    } else {
@@ -1049,12 +1049,12 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
           * operand. If we can determine that one of the args is in the low
           * 16 bits, though, we can just emit a single MUL.
           */
-         if (value0 && value0->u[0] < (1 << 16)) {
+         if (value0 && value0->u32[0] < (1 << 16)) {
             if (devinfo->gen < 7)
                emit(MUL(dst, op[0], op[1]));
             else
                emit(MUL(dst, op[1], op[0]));
-         } else if (value1 && value1->u[0] < (1 << 16)) {
+         } else if (value1 && value1->u32[0] < (1 << 16)) {
             if (devinfo->gen < 7)
                emit(MUL(dst, op[1], op[0]));
             else
@@ -1725,7 +1725,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
          nir_const_value *const_offset =
             nir_src_as_const_value(instr->src[i].src);
          if (const_offset) {
-            constant_offset = brw_texture_offset(const_offset->i, 3);
+            constant_offset = brw_texture_offset(const_offset->i32, 3);
          } else {
             offset_value =
                get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 8f77b59ea03..cb345157f81 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -355,7 +355,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
       src_reg vertex_index =
-         vertex_const ? src_reg(brw_imm_ud(vertex_const->u[0]))
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
                       : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
 
       dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);

From 3124ce699bb3844e793f00e00bfbea5c91744f90 Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Tue, 17 Nov 2015 13:57:54 +0100
Subject: [PATCH 008/197] nir: add a bit_size parameter to nir_ssa_dest_init
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: Squash multiple commits addressing the new parameter in different
    files so we don't break the build (Iago)

v3: Fix tgsi (Samuel)

v4: Fix nir_clone.c (Samuel)

v5: Fix vc4 and freedreno (Iago)

v6 (Sam):
- Fix build errors in nir_lower_indirect_derefs
- Use helper to get type size from nir_alu_type.

Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Signed-off-by: Samuel Iglesias Gonsalvez <siglesias@igalia.com>
Tested-by: Rob Clark <robdclark@gmail.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/glsl_to_nir.cpp              | 22 ++++++++-----
 src/compiler/nir/nir.c                        | 14 ++++----
 src/compiler/nir/nir.h                        |  6 ++--
 src/compiler/nir/nir_builder.h                | 33 ++++++++++++++++---
 src/compiler/nir/nir_clone.c                  |  3 +-
 src/compiler/nir/nir_from_ssa.c               |  6 ++--
 src/compiler/nir/nir_lower_alu_to_scalar.c    | 10 +++---
 src/compiler/nir/nir_lower_atomics.c          |  6 ++--
 src/compiler/nir/nir_lower_clip.c             |  2 +-
 src/compiler/nir/nir_lower_indirect_derefs.c  |  6 ++--
 src/compiler/nir/nir_lower_io.c               |  6 ++--
 src/compiler/nir/nir_lower_locals_to_regs.c   |  7 ++--
 src/compiler/nir/nir_lower_phis_to_scalar.c   | 10 ++++--
 src/compiler/nir/nir_lower_tex.c              |  2 +-
 src/compiler/nir/nir_lower_two_sided_color.c  |  2 +-
 src/compiler/nir/nir_lower_var_copies.c       |  5 ++-
 src/compiler/nir/nir_lower_vars_to_ssa.c      | 12 +++++--
 src/compiler/nir/nir_opt_peephole_select.c    |  3 +-
 src/compiler/nir/nir_search.c                 |  5 +--
 src/compiler/nir/nir_to_ssa.c                 |  6 ++--
 src/gallium/auxiliary/nir/tgsi_to_nir.c       | 14 ++++----
 .../freedreno/ir3/ir3_nir_lower_if_else.c     |  2 +-
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c |  4 +--
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    |  6 ++--
 .../drivers/vc4/vc4_nir_lower_txf_ms.c        |  2 +-
 src/gallium/drivers/vc4/vc4_program.c         |  2 +-
 .../dri/i965/brw_nir_opt_peephole_ffma.c      |  7 ++--
 src/mesa/program/prog_to_nir.c                | 10 +++---
 28 files changed, 137 insertions(+), 76 deletions(-)

diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index 613b138ae59..ee9c05308d6 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -718,7 +718,7 @@ nir_visitor::visit(ir_call *ir)
          ir_dereference *param =
             (ir_dereference *) ir->actual_parameters.get_head();
          instr->variables[0] = evaluate_deref(&instr->instr, param);
-         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
@@ -752,7 +752,7 @@ nir_visitor::visit(ir_call *ir)
             const nir_intrinsic_info *info =
                     &nir_intrinsic_infos[instr->intrinsic];
             nir_ssa_dest_init(&instr->instr, &instr->dest,
-                              info->dest_components, NULL);
+                              info->dest_components, 32, NULL);
          }
 
          if (op == nir_intrinsic_image_size ||
@@ -813,7 +813,7 @@ nir_visitor::visit(ir_call *ir)
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       case nir_intrinsic_shader_clock:
-         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, 32, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       case nir_intrinsic_store_ssbo: {
@@ -854,7 +854,7 @@ nir_visitor::visit(ir_call *ir)
 
          /* Setup destination register */
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, NULL);
+                           type->vector_elements, 32, NULL);
 
          /* Insert the created nir instruction now since in the case of boolean
           * result we will need to emit another instruction after it
@@ -877,7 +877,7 @@ nir_visitor::visit(ir_call *ir)
                load_ssbo_compare->src[1].swizzle[i] = 0;
             nir_ssa_dest_init(&load_ssbo_compare->instr,
                               &load_ssbo_compare->dest.dest,
-                              type->vector_elements, NULL);
+                              type->vector_elements, 32, NULL);
             load_ssbo_compare->dest.write_mask = (1 << type->vector_elements) - 1;
             nir_builder_instr_insert(&b, &load_ssbo_compare->instr);
             dest = &load_ssbo_compare->dest.dest;
@@ -923,7 +923,7 @@ nir_visitor::visit(ir_call *ir)
          /* Atomic result */
          assert(ir->return_deref);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           ir->return_deref->type->vector_elements, NULL);
+                           ir->return_deref->type->vector_elements, 32, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
@@ -938,8 +938,9 @@ nir_visitor::visit(ir_call *ir)
          instr->num_components = type->vector_elements;
 
          /* Setup destination register */
+         unsigned bit_size = glsl_get_bit_size(type->base_type);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           type->vector_elements, NULL);
+                           type->vector_elements, bit_size, NULL);
 
          nir_builder_instr_insert(&b, &instr->instr);
          break;
@@ -1000,8 +1001,10 @@ nir_visitor::visit(ir_call *ir)
 
          /* Atomic result */
          assert(ir->return_deref);
+         unsigned bit_size = glsl_get_bit_size(ir->return_deref->type->base_type);
          nir_ssa_dest_init(&instr->instr, &instr->dest,
-                           ir->return_deref->type->vector_elements, NULL);
+                           ir->return_deref->type->vector_elements,
+                           bit_size, NULL);
          nir_builder_instr_insert(&b, &instr->instr);
          break;
       }
@@ -1150,7 +1153,7 @@ nir_visitor::add_instr(nir_instr *instr, unsigned num_components)
    nir_dest *dest = get_instr_dest(instr);
 
    if (dest)
-      nir_ssa_dest_init(instr, dest, num_components, NULL);
+      nir_ssa_dest_init(instr, dest, num_components, 32, NULL);
 
    nir_builder_instr_insert(&b, instr);
 
@@ -1190,6 +1193,7 @@ nir_visitor::visit(ir_expression *ir)
       nir_intrinsic_instr *load =
          nir_intrinsic_instr_create(this->shader, nir_intrinsic_load_ubo);
       load->num_components = ir->type->vector_elements;
+      load->dest.ssa.bit_size = glsl_get_bit_size(ir->type->base_type);
       load->src[0] = nir_src_for_ssa(evaluate_rvalue(ir->operands[0]));
       load->src[1] = nir_src_for_ssa(evaluate_rvalue(ir->operands[1]));
       add_instr(&load->instr, ir->type->vector_elements);
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 8fa75e4e5dc..b11498132a6 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -474,7 +474,7 @@ nir_load_const_instr_create(nir_shader *shader, unsigned num_components)
    nir_load_const_instr *instr = ralloc(shader, nir_load_const_instr);
    instr_init(&instr->instr, nir_instr_type_load_const);
 
-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
 
    return instr;
 }
@@ -563,7 +563,7 @@ nir_ssa_undef_instr_create(nir_shader *shader, unsigned num_components)
    nir_ssa_undef_instr *instr = ralloc(shader, nir_ssa_undef_instr);
    instr_init(&instr->instr, nir_instr_type_ssa_undef);
 
-   nir_ssa_def_init(&instr->instr, &instr->def, num_components, NULL);
+   nir_ssa_def_init(&instr->instr, &instr->def, num_components, 32, NULL);
 
    return instr;
 }
@@ -1319,14 +1319,15 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest)
 
 void
 nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
-                 unsigned num_components, const char *name)
+                 unsigned num_components,
+                 unsigned bit_size, const char *name)
 {
    def->name = name;
    def->parent_instr = instr;
    list_inithead(&def->uses);
    list_inithead(&def->if_uses);
    def->num_components = num_components;
-   def->bit_size = 32; /* FIXME: Add an input paremeter or guess? */
+   def->bit_size = bit_size;
 
    if (instr->block) {
       nir_function_impl *impl =
@@ -1340,10 +1341,11 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
 
 void
 nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
-                 unsigned num_components, const char *name)
+                 unsigned num_components, unsigned bit_size,
+                 const char *name)
 {
    dest->is_ssa = true;
-   nir_ssa_def_init(instr, &dest->ssa, num_components, name);
+   nir_ssa_def_init(instr, &dest->ssa, num_components, bit_size, name);
 }
 
 void
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 2ddc2dc4404..36f90fc6fb7 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2014,9 +2014,11 @@ void nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest,
                             nir_dest new_dest);
 
 void nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
-                       unsigned num_components, const char *name);
+                       unsigned num_components, unsigned bit_size,
+                       const char *name);
 void nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
-                      unsigned num_components, const char *name);
+                      unsigned num_components, unsigned bit_size,
+                      const char *name);
 void nir_ssa_def_rewrite_uses(nir_ssa_def *def, nir_src new_src);
 void nir_ssa_def_rewrite_uses_after(nir_ssa_def *def, nir_src new_src,
                                     nir_instr *after_me);
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 02e4526dcaa..e2000200ea7 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -164,6 +164,25 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
    }
    assert(num_components != 0);
 
+   /* Figure out the bitwidth based on the source bitwidth if the instruction
+    * is variable-width.
+    */
+   unsigned bit_size = nir_alu_type_get_type_size(op_info->output_type);
+   if (bit_size == 0) {
+      for (unsigned i = 0; i < op_info->num_inputs; i++) {
+         unsigned src_bit_size = instr->src[i].src.ssa->bit_size;
+         if (nir_alu_type_get_type_size(op_info->input_types[i]) == 0) {
+            if (bit_size)
+               assert(src_bit_size == bit_size);
+            else
+               bit_size = src_bit_size;
+         } else {
+            assert(src_bit_size ==
+               nir_alu_type_get_type_size(op_info->input_types[i]));
+         }
+      }
+   }
+
    /* Make sure we don't swizzle from outside of our source vector (like if a
     * scalar value was passed into a multiply with a vector).
     */
@@ -173,7 +192,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
       }
    }
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                     bit_size, NULL);
    instr->dest.write_mask = (1 << num_components) - 1;
 
    nir_builder_instr_insert(build, &instr->instr);
@@ -238,7 +258,8 @@ static inline nir_ssa_def *
 nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
 {
    nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov);
-   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
+                     nir_src_bit_size(src.src), NULL);
    mov->dest.write_mask = (1 << num_components) - 1;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);
@@ -250,7 +271,8 @@ static inline nir_ssa_def *
 nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
 {
    nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov);
-   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
+                     nir_src_bit_size(src.src), NULL);
    mov->dest.write_mask = (1 << num_components) - 1;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);
@@ -329,7 +351,8 @@ nir_load_var(nir_builder *build, nir_variable *var)
       nir_intrinsic_instr_create(build->shader, nir_intrinsic_load_var);
    load->num_components = num_components;
    load->variables[0] = nir_deref_var_create(load, var);
-   nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, num_components,
+                     glsl_get_bit_size(glsl_get_base_type(var->type)), NULL);
    nir_builder_instr_insert(build, &load->instr);
    return &load->dest.ssa;
 }
@@ -356,7 +379,7 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
    load->num_components = nir_intrinsic_infos[op].dest_components;
    load->const_index[0] = index;
    nir_ssa_dest_init(&load->instr, &load->dest,
-                     nir_intrinsic_infos[op].dest_components, NULL);
+                     nir_intrinsic_infos[op].dest_components, 32, NULL);
    nir_builder_instr_insert(build, &load->instr);
    return &load->dest.ssa;
 }
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 0b426e940b4..7444dfe6e59 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -220,7 +220,8 @@ __clone_dst(clone_state *state, nir_instr *ninstr,
 {
    ndst->is_ssa = dst->is_ssa;
    if (dst->is_ssa) {
-      nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components, dst->ssa.name);
+      nir_ssa_dest_init(ninstr, ndst, dst->ssa.num_components,
+                        dst->ssa.bit_size, dst->ssa.name);
       add_remap(state, &ndst->ssa, &dst->ssa);
    } else {
       ndst->reg.reg = remap_reg(state, dst->reg.reg);
diff --git a/src/compiler/nir/nir_from_ssa.c b/src/compiler/nir/nir_from_ssa.c
index 8bc9f24e406..82317c21b62 100644
--- a/src/compiler/nir/nir_from_ssa.c
+++ b/src/compiler/nir/nir_from_ssa.c
@@ -342,7 +342,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
          nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                   nir_parallel_copy_entry);
          nir_ssa_dest_init(&pcopy->instr, &entry->dest,
-                           phi->dest.ssa.num_components, src->src.ssa->name);
+                           phi->dest.ssa.num_components,
+                           phi->dest.ssa.bit_size, src->src.ssa->name);
          exec_list_push_tail(&pcopy->entries, &entry->node);
 
          assert(src->src.is_ssa);
@@ -355,7 +356,8 @@ isolate_phi_nodes_block(nir_block *block, void *void_state)
       nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                nir_parallel_copy_entry);
       nir_ssa_dest_init(&block_pcopy->instr, &entry->dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.name);
+                        phi->dest.ssa.num_components, phi->dest.ssa.bit_size,
+                        phi->dest.ssa.name);
       exec_list_push_tail(&block_pcopy->entries, &entry->node);
 
       nir_ssa_def_rewrite_uses(&phi->dest.ssa,
diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index 312d2f99a1c..5b3281e0a13 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -31,9 +31,11 @@
  */
 
 static void
-nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components)
+nir_alu_ssa_dest_init(nir_alu_instr *instr, unsigned num_components,
+                      unsigned bit_size)
 {
-   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                     bit_size, NULL);
    instr->dest.write_mask = (1 << num_components) - 1;
 }
 
@@ -46,7 +48,7 @@ lower_reduction(nir_alu_instr *instr, nir_op chan_op, nir_op merge_op,
    nir_ssa_def *last = NULL;
    for (unsigned i = 0; i < num_components; i++) {
       nir_alu_instr *chan = nir_alu_instr_create(builder->shader, chan_op);
-      nir_alu_ssa_dest_init(chan, 1);
+      nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size);
       nir_alu_src_copy(&chan->src[0], &instr->src[0], chan);
       chan->src[0].swizzle[0] = chan->src[0].swizzle[i];
       if (nir_op_infos[chan_op].num_inputs > 1) {
@@ -220,7 +222,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
             lower->src[i].swizzle[j] = instr->src[i].swizzle[src_chan];
       }
 
-      nir_alu_ssa_dest_init(lower, 1);
+      nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size);
       lower->dest.saturate = instr->dest.saturate;
       comps[chan] = &lower->dest.dest.ssa;
 
diff --git a/src/compiler/nir/nir_lower_atomics.c b/src/compiler/nir/nir_lower_atomics.c
index e066cf222a7..70381a7968a 100644
--- a/src/compiler/nir/nir_lower_atomics.c
+++ b/src/compiler/nir/nir_lower_atomics.c
@@ -100,7 +100,7 @@ lower_instr(nir_intrinsic_instr *instr,
          nir_instr_insert_before(&instr->instr, &atomic_counter_size->instr);
 
          nir_alu_instr *mul = nir_alu_instr_create(mem_ctx, nir_op_imul);
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
          mul->dest.write_mask = 0x1;
          nir_src_copy(&mul->src[0].src, &deref_array->indirect, mul);
          mul->src[1].src.is_ssa = true;
@@ -108,7 +108,7 @@ lower_instr(nir_intrinsic_instr *instr,
          nir_instr_insert_before(&instr->instr, &mul->instr);
 
          nir_alu_instr *add = nir_alu_instr_create(mem_ctx, nir_op_iadd);
-         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
          add->dest.write_mask = 0x1;
          add->src[0].src.is_ssa = true;
          add->src[0].src.ssa = &mul->dest.dest.ssa;
@@ -125,7 +125,7 @@ lower_instr(nir_intrinsic_instr *instr,
 
    if (instr->dest.is_ssa) {
       nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
-                        instr->dest.ssa.num_components, NULL);
+                        instr->dest.ssa.num_components, 32, NULL);
       nir_ssa_def_rewrite_uses(&instr->dest.ssa,
                                nir_src_for_ssa(&new_instr->dest.ssa));
    } else {
diff --git a/src/compiler/nir/nir_lower_clip.c b/src/compiler/nir/nir_lower_clip.c
index bcbad536874..c711230ad5b 100644
--- a/src/compiler/nir/nir_lower_clip.c
+++ b/src/compiler/nir/nir_lower_clip.c
@@ -88,7 +88,7 @@ load_clipdist_input(nir_builder *b, nir_variable *in, nir_ssa_def **val)
    load->num_components = 4;
    nir_intrinsic_set_base(load, in->data.driver_location);
    load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-   nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &load->instr);
 
    val[0] = nir_channel(b, &load->dest.ssa, 0);
diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c
index a4affa7bdcf..62b8c84a956 100644
--- a/src/compiler/nir/nir_lower_indirect_derefs.c
+++ b/src/compiler/nir/nir_lower_indirect_derefs.c
@@ -75,8 +75,9 @@ emit_indirect_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
       if (src == NULL) {
          /* We're a load.  We need to insert a phi node */
          nir_phi_instr *phi = nir_phi_instr_create(b->shader);
+         unsigned bit_size = then_dest->bit_size;
          nir_ssa_dest_init(&phi->instr, &phi->dest,
-                           then_dest->num_components, NULL);
+                           then_dest->num_components, bit_size, NULL);
 
          nir_phi_src *src0 = ralloc(phi, nir_phi_src);
          src0->pred = nir_cf_node_as_block(nir_if_last_then_node(if_stmt));
@@ -125,8 +126,9 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr,
       load->num_components = orig_instr->num_components;
       load->variables[0] =
          nir_deref_as_var(nir_copy_deref(load, &deref->deref));
+      unsigned bit_size = orig_instr->dest.ssa.bit_size;
       nir_ssa_dest_init(&load->instr, &load->dest,
-                        load->num_components, NULL);
+                        load->num_components, bit_size, NULL);
       nir_builder_instr_insert(b, &load->instr);
       *dest = &load->dest.ssa;
    } else {
diff --git a/src/compiler/nir/nir_lower_io.c b/src/compiler/nir/nir_lower_io.c
index 408a221355d..d9af8bf3c7a 100644
--- a/src/compiler/nir/nir_lower_io.c
+++ b/src/compiler/nir/nir_lower_io.c
@@ -284,7 +284,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&load->instr, &load->dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                      nir_src_for_ssa(&load->dest.ssa));
          } else {
@@ -364,7 +365,8 @@ nir_lower_io_block(nir_block *block, void *void_state)
 
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&atomic->instr, &atomic->dest,
-                              intrin->dest.ssa.num_components, NULL);
+                              intrin->dest.ssa.num_components,
+                              intrin->dest.ssa.bit_size, NULL);
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                      nir_src_for_ssa(&atomic->dest.ssa));
          } else {
diff --git a/src/compiler/nir/nir_lower_locals_to_regs.c b/src/compiler/nir/nir_lower_locals_to_regs.c
index 235cb842908..0438802d3b2 100644
--- a/src/compiler/nir/nir_lower_locals_to_regs.c
+++ b/src/compiler/nir/nir_lower_locals_to_regs.c
@@ -169,7 +169,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
          mul->src[1].src.is_ssa = true;
          mul->src[1].src.ssa = &load_const->def;
          mul->dest.write_mask = 1;
-         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, NULL);
+         nir_ssa_dest_init(&mul->instr, &mul->dest.dest, 1, 32, NULL);
          nir_instr_insert_before(instr, &mul->instr);
 
          src.reg.indirect->is_ssa = true;
@@ -187,7 +187,7 @@ get_deref_reg_src(nir_deref_var *deref, nir_instr *instr,
             add->src[0].src = *src.reg.indirect;
             nir_src_copy(&add->src[1].src, &deref_array->indirect, add);
             add->dest.write_mask = 1;
-            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&add->instr, &add->dest.dest, 1, 32, NULL);
             nir_instr_insert_before(instr, &add->instr);
 
             src.reg.indirect->is_ssa = true;
@@ -221,7 +221,8 @@ lower_locals_to_regs_block(nir_block *block, void *void_state)
          mov->dest.write_mask = (1 << intrin->num_components) - 1;
          if (intrin->dest.is_ssa) {
             nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                      nir_src_for_ssa(&mov->dest.dest.ssa));
          } else {
diff --git a/src/compiler/nir/nir_lower_phis_to_scalar.c b/src/compiler/nir/nir_lower_phis_to_scalar.c
index dd2abcf72f8..026c8665f91 100644
--- a/src/compiler/nir/nir_lower_phis_to_scalar.c
+++ b/src/compiler/nir/nir_lower_phis_to_scalar.c
@@ -188,6 +188,8 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
       if (!should_lower_phi(phi, state))
          continue;
 
+      unsigned bit_size = phi->dest.ssa.bit_size;
+
       /* Create a vecN operation to combine the results.  Most of these
        * will be redundant, but copy propagation should clean them up for
        * us.  No need to add the complexity here.
@@ -202,12 +204,14 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
 
       nir_alu_instr *vec = nir_alu_instr_create(state->mem_ctx, vec_op);
       nir_ssa_dest_init(&vec->instr, &vec->dest.dest,
-                        phi->dest.ssa.num_components, NULL);
+                        phi->dest.ssa.num_components,
+                        bit_size, NULL);
       vec->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
 
       for (unsigned i = 0; i < phi->dest.ssa.num_components; i++) {
          nir_phi_instr *new_phi = nir_phi_instr_create(state->mem_ctx);
-         nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1, NULL);
+         nir_ssa_dest_init(&new_phi->instr, &new_phi->dest, 1,
+                           phi->dest.ssa.bit_size, NULL);
 
          vec->src[i].src = nir_src_for_ssa(&new_phi->dest.ssa);
 
@@ -215,7 +219,7 @@ lower_phis_to_scalar_block(nir_block *block, void *void_state)
             /* We need to insert a mov to grab the i'th component of src */
             nir_alu_instr *mov = nir_alu_instr_create(state->mem_ctx,
                                                       nir_op_imov);
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, bit_size, NULL);
             mov->dest.write_mask = 1;
             nir_src_copy(&mov->src[0].src, &src->src, state->mem_ctx);
             mov->src[0].swizzle[0] = i;
diff --git a/src/compiler/nir/nir_lower_tex.c b/src/compiler/nir/nir_lower_tex.c
index f737463b877..4999603e592 100644
--- a/src/compiler/nir/nir_lower_tex.c
+++ b/src/compiler/nir/nir_lower_tex.c
@@ -140,7 +140,7 @@ get_texture_size(nir_builder *b, nir_tex_instr *tex)
    txs->src[0].src = nir_src_for_ssa(nir_imm_int(b, 0));
    txs->src[0].src_type = nir_tex_src_lod;
 
-   nir_ssa_dest_init(&txs->instr, &txs->dest, 2, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest, 2, 32, NULL);
    nir_builder_instr_insert(b, &txs->instr);
 
    return nir_i2f(b, &txs->dest.ssa);
diff --git a/src/compiler/nir/nir_lower_two_sided_color.c b/src/compiler/nir/nir_lower_two_sided_color.c
index fe3507cb7a3..c7fb67e4f27 100644
--- a/src/compiler/nir/nir_lower_two_sided_color.c
+++ b/src/compiler/nir/nir_lower_two_sided_color.c
@@ -74,7 +74,7 @@ load_input(nir_builder *b, nir_variable *in)
    load->num_components = 4;
    nir_intrinsic_set_base(load, in->data.driver_location);
    load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-   nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+   nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &load->instr);
 
    return &load->dest.ssa;
diff --git a/src/compiler/nir/nir_lower_var_copies.c b/src/compiler/nir/nir_lower_var_copies.c
index 7db9839c369..c994f0fe12c 100644
--- a/src/compiler/nir/nir_lower_var_copies.c
+++ b/src/compiler/nir/nir_lower_var_copies.c
@@ -116,12 +116,15 @@ emit_copy_load_store(nir_intrinsic_instr *copy_instr,
       assert(src_tail->type == dest_tail->type);
 
       unsigned num_components = glsl_get_vector_elements(src_tail->type);
+      unsigned bit_size =
+         glsl_get_bit_size(glsl_get_base_type(src_tail->type));
 
       nir_intrinsic_instr *load =
          nir_intrinsic_instr_create(mem_ctx, nir_intrinsic_load_var);
       load->num_components = num_components;
       load->variables[0] = nir_deref_as_var(nir_copy_deref(load, &src_head->deref));
-      nir_ssa_dest_init(&load->instr, &load->dest, num_components, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, num_components, bit_size,
+                        NULL);
 
       nir_instr_insert_before(&copy_instr->instr, &load->instr);
 
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index 5e81f237c1a..2331791d135 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -650,7 +650,8 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
 
             mov->dest.write_mask = (1 << intrin->num_components) - 1;
             nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components, NULL);
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
 
             nir_instr_insert_before(&intrin->instr, &mov->instr);
             nir_instr_remove(&intrin->instr);
@@ -808,6 +809,8 @@ insert_phi_nodes(struct lower_variables_state *state)
       if (!node->lower_to_ssa)
          continue;
 
+      unsigned bit_size = glsl_get_bit_size(glsl_get_base_type(node->type));
+
       w_start = w_end = 0;
       iter_count++;
 
@@ -839,7 +842,8 @@ insert_phi_nodes(struct lower_variables_state *state)
             if (has_already[next->index] < iter_count) {
                nir_phi_instr *phi = nir_phi_instr_create(state->shader);
                nir_ssa_dest_init(&phi->instr, &phi->dest,
-                                 glsl_get_vector_elements(node->type), NULL);
+                                 glsl_get_vector_elements(node->type),
+                                 bit_size, NULL);
                nir_instr_insert_before_block(next, &phi->instr);
 
                _mesa_hash_table_insert(state->phi_table, phi, node);
@@ -932,7 +936,9 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
          nir_load_const_instr *load =
             nir_deref_get_const_initializer_load(state.shader, deref);
          nir_ssa_def_init(&load->instr, &load->def,
-                          glsl_get_vector_elements(node->type), NULL);
+                          glsl_get_vector_elements(node->type),
+                          glsl_get_bit_size(glsl_get_base_type(node->type)),
+                          NULL);
          nir_instr_insert_before_cf_list(&impl->body, &load->instr);
          def_stack_push(node, &load->def, &state);
       }
diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
index 0fc658df861..bad9dc457ad 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -210,7 +210,8 @@ nir_opt_peephole_select_block(nir_block *block, void *void_state)
       }
 
       nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
-                        phi->dest.ssa.num_components, phi->dest.ssa.name);
+                        phi->dest.ssa.num_components,
+                        phi->dest.ssa.bit_size, phi->dest.ssa.name);
       sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
 
       nir_ssa_def_rewrite_uses(&phi->dest.ssa,
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index e7164a76110..5a033bd1288 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -257,7 +257,7 @@ construct_value(const nir_search_value *value, nir_alu_type type,
          num_components = nir_op_infos[expr->opcode].output_size;
 
       nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode);
-      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, NULL);
+      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, 32, NULL);
       alu->dest.write_mask = (1 << num_components) - 1;
       alu->dest.saturate = false;
 
@@ -359,7 +359,8 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
    nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
    mov->dest.write_mask = instr->dest.write_mask;
    nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                     instr->dest.dest.ssa.num_components, NULL);
+                     instr->dest.dest.ssa.num_components,
+                     instr->dest.dest.ssa.bit_size, NULL);
 
    mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type,
                                  instr->dest.dest.ssa.num_components, &state,
diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c
index 44a50547738..06406071166 100644
--- a/src/compiler/nir/nir_to_ssa.c
+++ b/src/compiler/nir/nir_to_ssa.c
@@ -219,7 +219,8 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
                              state->states[index].num_defs);
 
    list_del(&dest->reg.def_link);
-   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components, name);
+   nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
+                     reg->bit_size, name);
 
    /* push our SSA destination on the stack */
    state->states[index].index++;
@@ -271,7 +272,8 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)
 
       instr->dest.write_mask = (1 << num_components) - 1;
       list_del(&instr->dest.dest.reg.def_link);
-      nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components, name);
+      nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+                        reg->bit_size, name);
 
       if (nir_op_infos[instr->op].output_size == 0) {
          /*
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 80fd3b69d19..7ec8b662200 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -515,8 +515,8 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
                                            nir_intrinsic_load_var);
          load->num_components = 4;
          load->variables[0] = ttn_array_deref(c, load, var, offset, indirect);
-
-         nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+         nir_ssa_dest_init(&load->instr, &load->dest,
+                           4, 32, NULL);
          nir_builder_instr_insert(b, &load->instr);
 
          src = nir_src_for_ssa(&load->dest.ssa);
@@ -567,7 +567,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
       load = nir_intrinsic_instr_create(b->shader, op);
       load->num_components = ncomp;
 
-      nir_ssa_dest_init(&load->instr, &load->dest, ncomp, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, ncomp, 32, NULL);
       nir_builder_instr_insert(b, &load->instr);
 
       src = nir_src_for_ssa(&load->dest.ssa);
@@ -632,7 +632,7 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
       }
       load->src[srcn++] = nir_src_for_ssa(offset);
 
-      nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
       nir_builder_instr_insert(b, &load->instr);
 
       src = nir_src_for_ssa(&load->dest.ssa);
@@ -1425,7 +1425,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
 
    assert(src_number == num_srcs);
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
@@ -1464,10 +1464,10 @@ ttn_txq(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    txs->src[0].src = nir_src_for_ssa(ttn_channel(b, src[0], X));
    txs->src[0].src_type = nir_tex_src_lod;
 
-   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, NULL);
+   nir_ssa_dest_init(&txs->instr, &txs->dest, 3, 32, NULL);
    nir_builder_instr_insert(b, &txs->instr);
 
-   nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, NULL);
+   nir_ssa_dest_init(&qlv->instr, &qlv->dest, 1, 32, NULL);
    nir_builder_instr_insert(b, &qlv->instr);
 
    ttn_move_dest_masked(b, dest, &txs->dest.ssa, TGSI_WRITEMASK_XYZ);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index 8815ac981eb..ec76b0bdc4d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -290,7 +290,7 @@ lower_if_else_block(nir_block *block, void *void_state)
 		}
 
 		nir_ssa_dest_init(&sel->instr, &sel->dest.dest,
-				phi->dest.ssa.num_components, phi->dest.ssa.name);
+				phi->dest.ssa.num_components, 32, phi->dest.ssa.name);
 		sel->dest.write_mask = (1 << phi->dest.ssa.num_components) - 1;
 
 		nir_ssa_def_rewrite_uses(&phi->dest.ssa,
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index a13e309985a..49a314cdb25 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -62,7 +62,7 @@ vc4_nir_get_dst_color(nir_builder *b, int sample)
         load->num_components = 1;
         load->const_index[0] = VC4_NIR_TLB_COLOR_READ_INPUT + sample;
         load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-        nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+        nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
         nir_builder_instr_insert(b, &load->instr);
         return &load->dest.ssa;
 }
@@ -627,7 +627,7 @@ vc4_nir_lower_blend_instr(struct vc4_compile *c, nir_builder *b,
                         nir_intrinsic_instr_create(b->shader,
                                                    nir_intrinsic_load_sample_mask_in);
                 load->num_components = 1;
-                nir_ssa_dest_init(&load->instr, &load->dest, 1, NULL);
+                nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
                 nir_builder_instr_insert(b, &load->instr);
 
                 nir_ssa_def *bitmask = &load->dest.ssa;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 941673c80fa..d08ad588e5b 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -197,7 +197,7 @@ vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
                 intr_comp->num_components = 1;
                 intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
                 intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
                 nir_builder_instr_insert(b, &intr_comp->instr);
 
                 vpm_reads[i] = &intr_comp->dest.ssa;
@@ -267,7 +267,7 @@ vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
                 intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
                 intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
 
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
                 nir_builder_instr_insert(b, &intr_comp->instr);
 
                 dests[i] = &intr_comp->dest.ssa;
@@ -378,7 +378,7 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                 nir_intrinsic_instr *intr_comp =
                         nir_intrinsic_instr_create(c->s, intr->intrinsic);
                 intr_comp->num_components = 1;
-                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, NULL);
+                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest, 1, 32, NULL);
 
                 /* Convert the uniform (not user_clip_plane) offset to bytes.
                  * If it happens to be a constant, constant-folding will clean
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index f6ba5b802ad..a2d89ef3349 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -123,7 +123,7 @@ vc4_nir_lower_txf_ms_instr(struct vc4_compile *c, nir_builder *b,
 
         txf->src[0].src_type = nir_tex_src_coord;
         txf->src[0].src = nir_src_for_ssa(nir_vec2(b, addr, nir_imm_int(b, 0)));
-        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, NULL);
+        nir_ssa_dest_init(&txf->instr, &txf->dest, 4, 32, NULL);
         nir_builder_instr_insert(b, &txf->instr);
         nir_ssa_def_rewrite_uses(&txf_ms->dest.ssa,
                                  nir_src_for_ssa(&txf->dest.ssa));
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 7deca8761b8..71a1ebbb313 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -118,7 +118,7 @@ nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,
         intr->const_index[0] = (VC4_NIR_STATE_UNIFORM_OFFSET + contents) * 4;
         intr->num_components = 1;
         intr->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
-        nir_ssa_dest_init(&intr->instr, &intr->dest, 1, NULL);
+        nir_ssa_dest_init(&intr->instr, &intr->dest, 1, 32, NULL);
         nir_builder_instr_insert(b, &intr->instr);
         return &intr->dest.ssa;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
index 5ff2cba0464..49810c22cfa 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
@@ -201,6 +201,8 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (mul == NULL)
          continue;
 
+      unsigned bit_size = add->dest.dest.ssa.bit_size;
+
       nir_ssa_def *mul_src[2];
       mul_src[0] = mul->src[0].src.ssa;
       mul_src[1] = mul->src[1].src.ssa;
@@ -220,7 +222,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
                                                       nir_op_fabs);
             abs->src[0].src = nir_src_for_ssa(mul_src[i]);
             nir_ssa_dest_init(&abs->instr, &abs->dest.dest,
-                              mul_src[i]->num_components, NULL);
+                              mul_src[i]->num_components, bit_size, NULL);
             abs->dest.write_mask = (1 << mul_src[i]->num_components) - 1;
             nir_instr_insert_before(&add->instr, &abs->instr);
             mul_src[i] = &abs->dest.dest.ssa;
@@ -232,7 +234,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
                                                    nir_op_fneg);
          neg->src[0].src = nir_src_for_ssa(mul_src[0]);
          nir_ssa_dest_init(&neg->instr, &neg->dest.dest,
-                           mul_src[0]->num_components, NULL);
+                           mul_src[0]->num_components, bit_size, NULL);
          neg->dest.write_mask = (1 << mul_src[0]->num_components) - 1;
          nir_instr_insert_before(&add->instr, &neg->instr);
          mul_src[0] = &neg->dest.dest.ssa;
@@ -253,6 +255,7 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
 
       nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
                         add->dest.dest.ssa.num_components,
+                        bit_size,
                         add->dest.dest.ssa.name);
       nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
                                nir_src_for_ssa(&ffma->dest.dest.ssa));
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 1f916ab9299..16b79c94c84 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -142,7 +142,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
       load->num_components = 4;
       load->variables[0] = nir_deref_var_create(load, c->input_vars[prog_src->Index]);
 
-      nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+      nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
       nir_builder_instr_insert(b, &load->instr);
 
       src.src = nir_src_for_ssa(&load->dest.ssa);
@@ -171,7 +171,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
 
          nir_intrinsic_instr *load =
             nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
-         nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
+         nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
          load->num_components = 4;
 
          load->variables[0] = nir_deref_var_create(load, c->parameters);
@@ -246,7 +246,7 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
          } else {
             assert(swizzle != SWIZZLE_NIL);
             nir_alu_instr *mov = nir_alu_instr_create(b->shader, nir_op_fmov);
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, NULL);
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest, 1, 32, NULL);
             mov->dest.write_mask = 0x1;
             mov->src[0] = src;
             mov->src[0].swizzle[0] = swizzle;
@@ -676,7 +676,7 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
 
    assert(src_number == num_srcs);
 
-   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, NULL);
+   nir_ssa_dest_init(&instr->instr, &instr->dest, 4, 32, NULL);
    nir_builder_instr_insert(b, &instr->instr);
 
    /* Resolve the writemask on the texture op. */
@@ -974,7 +974,7 @@ setup_registers_and_variables(struct ptn_compile *c)
                nir_intrinsic_instr_create(shader, nir_intrinsic_load_var);
             load_x->num_components = 1;
             load_x->variables[0] = nir_deref_var_create(load_x, var);
-            nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, NULL);
+            nir_ssa_dest_init(&load_x->instr, &load_x->dest, 1, 32, NULL);
             nir_builder_instr_insert(b, &load_x->instr);
 
             nir_ssa_def *f001 = nir_vec4(b, &load_x->dest.ssa, nir_imm_float(b, 0.0),

From 58fe7837b844da0c466a8573702d745f6f9975e6 Mon Sep 17 00:00:00 2001
From: Connor Abbott <connor.w.abbott@intel.com>
Date: Fri, 14 Aug 2015 11:45:30 -0700
Subject: [PATCH 009/197] nir: propagate bitsize information in nir_search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we replace an expression we have to compute bitsize information for the
replacement. We do this in two passes to validate that bitsize information
is consistent and correct: first we propagate bitsize from child nodes to
parent, then we do it the other way around, starting from the original
instruction's destination bitsize.

v2 (Iago):
- Always use nir_type_bool32 instead of nir_type_bool when generating
  algebraic optimizations. Before we used nir_type_bool32 with constants
  and nir_type_bool with variables.
- Fix bool comparisons in nir_search.c to account for bitsized types.

v3 (Sam):
- Unpack the double constant value as unsigned long long (8 bytes) in
nir_algebraic.py.

v4 (Sam):
- Use helpers to get type size and base type from nir_alu_type.

Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
Signed-off-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/nir_algebraic.py |  22 ++-
 src/compiler/nir/nir_search.c     | 246 +++++++++++++++++++++++++++---
 src/compiler/nir/nir_search.h     |   8 +-
 3 files changed, 248 insertions(+), 28 deletions(-)

diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py
index 2357b57117a..1818877a216 100644
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -63,11 +63,11 @@ class Value(object):
 static const ${val.c_type} ${val.name} = {
    { ${val.type_enum} },
 % if isinstance(val, Constant):
-   { ${hex(val)} /* ${val.value} */ },
+   ${val.type()}, { ${hex(val)} /* ${val.value} */ },
 % elif isinstance(val, Variable):
    ${val.index}, /* ${val.var_name} */
    ${'true' if val.is_constant else 'false'},
-   nir_type_${ val.required_type or 'invalid' },
+   ${val.type() or 'nir_type_invalid' },
 % elif isinstance(val, Expression):
    nir_op_${val.opcode},
    { ${', '.join(src.c_ptr for src in val.sources)} },
@@ -107,10 +107,18 @@ class Constant(Value):
       if isinstance(self.value, (int, long)):
          return hex(self.value)
       elif isinstance(self.value, float):
-         return hex(struct.unpack('I', struct.pack('f', self.value))[0])
+         return hex(struct.unpack('Q', struct.pack('d', self.value))[0])
       else:
          assert False
 
+   def type(self):
+      if isinstance(self.value, (bool)):
+         return "nir_type_bool32"
+      elif isinstance(self.value, (int, long)):
+         return "nir_type_int"
+      elif isinstance(self.value, float):
+         return "nir_type_float"
+
 _var_name_re = re.compile(r"(?P<const>#)?(?P<name>\w+)(?:@(?P<type>\w+))?")
 
 class Variable(Value):
@@ -129,6 +137,14 @@ class Variable(Value):
 
       self.index = varset[self.var_name]
 
+   def type(self):
+      if self.required_type == 'bool':
+         return "nir_type_bool32"
+      elif self.required_type in ('int', 'unsigned'):
+         return "nir_type_int"
+      elif self.required_type == 'float':
+         return "nir_type_float"
+
 class Expression(Value):
    def __init__(self, expr, name_base, varset):
       Value.__init__(self, name_base, "expression")
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 5a033bd1288..6df662aa531 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -62,7 +62,8 @@ alu_instr_is_bool(nir_alu_instr *instr)
    case nir_op_inot:
       return src_is_bool(instr->src[0].src);
    default:
-      return nir_op_infos[instr->op].output_type == nir_type_bool;
+      return (nir_alu_type_get_base_type(nir_op_infos[instr->op].output_type)
+             == nir_type_bool);
    }
 }
 
@@ -125,8 +126,10 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
             nir_alu_instr *src_alu =
                nir_instr_as_alu(instr->src[src].src.ssa->parent_instr);
 
-            if (nir_op_infos[src_alu->op].output_type != var->type &&
-                !(var->type == nir_type_bool && alu_instr_is_bool(src_alu)))
+            if (nir_alu_type_get_base_type(nir_op_infos[src_alu->op].output_type) !=
+                var->type &&
+                !(nir_alu_type_get_base_type(var->type) == nir_type_bool &&
+                  alu_instr_is_bool(src_alu)))
                return false;
          }
 
@@ -158,21 +161,65 @@ match_value(const nir_search_value *value, nir_alu_instr *instr, unsigned src,
       nir_load_const_instr *load =
          nir_instr_as_load_const(instr->src[src].src.ssa->parent_instr);
 
-      switch (nir_op_infos[instr->op].input_types[src]) {
+      switch (const_val->type) {
       case nir_type_float:
          for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.f32[new_swizzle[i]] != const_val->data.f)
+            double val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.f32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.f64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.d)
                return false;
          }
          return true;
+
       case nir_type_int:
-      case nir_type_uint:
-      case nir_type_bool:
          for (unsigned i = 0; i < num_components; ++i) {
-            if (load->value.i32[new_swizzle[i]] != const_val->data.i)
+            int64_t val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.i32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.i64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.i)
                return false;
          }
          return true;
+
+      case nir_type_uint:
+      case nir_type_bool32:
+         for (unsigned i = 0; i < num_components; ++i) {
+            uint64_t val;
+            switch (load->def.bit_size) {
+            case 32:
+               val = load->value.u32[new_swizzle[i]];
+               break;
+            case 64:
+               val = load->value.u64[new_swizzle[i]];
+               break;
+            default:
+               unreachable("unknown bit size");
+            }
+
+            if (val != const_val->data.u)
+               return false;
+         }
+         return true;
+
       default:
          unreachable("Invalid alu source type");
       }
@@ -244,9 +291,123 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
    }
 }
 
+typedef struct bitsize_tree {
+   unsigned num_srcs;
+   struct bitsize_tree *srcs[4];
+
+   unsigned common_size;
+   bool is_src_sized[4];
+   bool is_dest_sized;
+
+   unsigned dest_size;
+   unsigned src_size[4];
+} bitsize_tree;
+
+static bitsize_tree *
+build_bitsize_tree(void *mem_ctx, struct match_state *state,
+                   const nir_search_value *value)
+{
+   bitsize_tree *tree = rzalloc(mem_ctx, bitsize_tree); /* 0 = unsized */
+
+   switch (value->type) {
+   case nir_search_value_expression: {
+      nir_search_expression *expr = nir_search_value_as_expression(value);
+      nir_op_info info = nir_op_infos[expr->opcode];
+      tree->num_srcs = info.num_inputs;
+      tree->common_size = 0;
+      for (unsigned i = 0; i < info.num_inputs; i++) {
+         tree->is_src_sized[i] = !!nir_alu_type_get_type_size(info.input_types[i]);
+         if (tree->is_src_sized[i])
+            tree->src_size[i] = nir_alu_type_get_type_size(info.input_types[i]);
+         tree->srcs[i] = build_bitsize_tree(mem_ctx, state, expr->srcs[i]);
+      }
+      tree->is_dest_sized = !!nir_alu_type_get_type_size(info.output_type);
+      if (tree->is_dest_sized)
+         tree->dest_size = nir_alu_type_get_type_size(info.output_type);
+      break;
+   }
+
+   case nir_search_value_variable: {
+      nir_search_variable *var = nir_search_value_as_variable(value);
+      tree->num_srcs = 0;
+      tree->is_dest_sized = true;
+      tree->dest_size = nir_src_bit_size(state->variables[var->variable].src);
+      break;
+   }
+
+   case nir_search_value_constant: {
+      tree->num_srcs = 0;
+      tree->is_dest_sized = false;
+      tree->common_size = 0;
+      break;
+   }
+   }
+
+   return tree;
+}
+
+static unsigned
+bitsize_tree_filter_up(bitsize_tree *tree)
+{
+   for (unsigned i = 0; i < tree->num_srcs; i++) {
+      unsigned src_size = bitsize_tree_filter_up(tree->srcs[i]);
+      if (src_size == 0)
+         continue;
+
+      if (tree->is_src_sized[i]) {
+         assert(src_size == tree->src_size[i]);
+      } else if (tree->common_size != 0) {
+         assert(src_size == tree->common_size);
+         tree->src_size[i] = src_size;
+      } else {
+         tree->common_size = src_size;
+         tree->src_size[i] = src_size;
+      }
+   }
+
+   if (tree->num_srcs && tree->common_size) {
+      if (tree->dest_size == 0)
+         tree->dest_size = tree->common_size;
+      else if (!tree->is_dest_sized)
+         assert(tree->dest_size == tree->common_size);
+
+      for (unsigned i = 0; i < tree->num_srcs; i++) {
+         if (!tree->src_size[i])
+            tree->src_size[i] = tree->common_size;
+      }
+   }
+
+   return tree->dest_size;
+}
+
+static void
+bitsize_tree_filter_down(bitsize_tree *tree, unsigned size)
+{
+   if (tree->dest_size)
+      assert(tree->dest_size == size);
+   else
+      tree->dest_size = size;
+
+   if (!tree->is_dest_sized) {
+      if (tree->common_size)
+         assert(tree->common_size == size);
+      else
+         tree->common_size = size;
+   }
+
+   for (unsigned i = 0; i < tree->num_srcs; i++) {
+      if (!tree->src_size[i]) {
+         assert(tree->common_size);
+         tree->src_size[i] = tree->common_size;
+      }
+      bitsize_tree_filter_down(tree->srcs[i], tree->src_size[i]);
+   }
+}
+
 static nir_alu_src
-construct_value(const nir_search_value *value, nir_alu_type type,
-                unsigned num_components, struct match_state *state,
+construct_value(const nir_search_value *value,
+                unsigned num_components, bitsize_tree *bitsize,
+                struct match_state *state,
                 nir_instr *instr, void *mem_ctx)
 {
    switch (value->type) {
@@ -257,7 +418,8 @@ construct_value(const nir_search_value *value, nir_alu_type type,
          num_components = nir_op_infos[expr->opcode].output_size;
 
       nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode);
-      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components, 32, NULL);
+      nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components,
+                        bitsize->dest_size, NULL);
       alu->dest.write_mask = (1 << num_components) - 1;
       alu->dest.saturate = false;
 
@@ -269,8 +431,7 @@ construct_value(const nir_search_value *value, nir_alu_type type,
             num_components = nir_op_infos[alu->op].input_sizes[i];
 
          alu->src[i] = construct_value(expr->srcs[i],
-                                       nir_op_infos[alu->op].input_types[i],
-                                       num_components,
+                                       num_components, bitsize->srcs[i],
                                        state, instr, mem_ctx);
       }
 
@@ -301,23 +462,57 @@ construct_value(const nir_search_value *value, nir_alu_type type,
       const nir_search_constant *c = nir_search_value_as_constant(value);
       nir_load_const_instr *load = nir_load_const_instr_create(mem_ctx, 1);
 
-      switch (type) {
+      switch (c->type) {
       case nir_type_float:
-         load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.f);
-         load->value.f32[0] = c->data.f;
+         load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.d);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.f32[0] = c->data.d;
+            break;
+         case 64:
+            load->value.f64[0] = c->data.d;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
          break;
+
       case nir_type_int:
-         load->def.name = ralloc_asprintf(mem_ctx, "%d", c->data.i);
-         load->value.i32[0] = c->data.i;
+         load->def.name = ralloc_asprintf(mem_ctx, "%ld", c->data.i);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.i32[0] = c->data.i;
+            break;
+         case 64:
+            load->value.i64[0] = c->data.i;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
          break;
+
       case nir_type_uint:
-      case nir_type_bool:
+         load->def.name = ralloc_asprintf(mem_ctx, "%lu", c->data.u);
+         switch (bitsize->dest_size) {
+         case 32:
+            load->value.u32[0] = c->data.u;
+            break;
+         case 64:
+            load->value.u64[0] = c->data.u;
+            break;
+         default:
+            unreachable("unknown bit size");
+         }
+         break; /* uint is done; must not fall into bool32's u32 store */
+      case nir_type_bool32:
          load->value.u32[0] = c->data.u;
          break;
       default:
          unreachable("Invalid alu source type");
       }
 
+      load->def.bit_size = bitsize->dest_size;
+
       nir_instr_insert_before(instr, &load->instr);
 
       nir_alu_src val;
@@ -352,6 +547,11 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
                          swizzle, &state))
       return NULL;
 
+   void *bitsize_ctx = ralloc_context(NULL);
+   bitsize_tree *tree = build_bitsize_tree(bitsize_ctx, &state, replace);
+   bitsize_tree_filter_up(tree);
+   bitsize_tree_filter_down(tree, instr->dest.dest.ssa.bit_size);
+
    /* Inserting a mov may be unnecessary.  However, it's much easier to
     * simply let copy propagation clean this up than to try to go through
     * and rewrite swizzles ourselves.
@@ -362,9 +562,9 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
                      instr->dest.dest.ssa.num_components,
                      instr->dest.dest.ssa.bit_size, NULL);
 
-   mov->src[0] = construct_value(replace, nir_op_infos[instr->op].output_type,
-                                 instr->dest.dest.ssa.num_components, &state,
-                                 &instr->instr, mem_ctx);
+   mov->src[0] = construct_value(replace,
+                                 instr->dest.dest.ssa.num_components,
+                                 tree, &state, &instr->instr, mem_ctx);
    nir_instr_insert_before(&instr->instr, &mov->instr);
 
    nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa,
@@ -376,5 +576,7 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
     */
    nir_instr_remove(&instr->instr);
 
+   ralloc_free(bitsize_ctx);
+
    return mov;
 }
diff --git a/src/compiler/nir/nir_search.h b/src/compiler/nir/nir_search.h
index 7d47792945e..321d6d00355 100644
--- a/src/compiler/nir/nir_search.h
+++ b/src/compiler/nir/nir_search.h
@@ -71,10 +71,12 @@ typedef struct {
 typedef struct {
    nir_search_value value;
 
+   nir_alu_type type;
+
    union {
-      uint32_t u;
-      int32_t i;
-      float f;
+      uint64_t u;
+      int64_t i;
+      double d;
    } data;
 } nir_search_constant;
 

From 29d26f1522d7f7be8f7d7791e37c3fcd8ac4544a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 12 Jan 2016 15:19:54 +0100
Subject: [PATCH 010/197] gallium/winsys/drm: add offset to struct
 winsys_handle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are going to need this for EGL_EXT_image_dma_buf_import.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/state_tracker/drm_driver.h    | 5 +++++
 src/gallium/state_trackers/dri/dri2.c             | 2 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c         | 1 +
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c     | 1 +
 src/gallium/winsys/svga/drm/vmw_screen_dri.c      | 1 +
 src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c | 3 +++
 6 files changed, 13 insertions(+)

diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h
index 959a7625e30..d81da8911e0 100644
--- a/src/gallium/include/state_tracker/drm_driver.h
+++ b/src/gallium/include/state_tracker/drm_driver.h
@@ -35,6 +35,11 @@ struct winsys_handle
     * Output for texture_get_handle.
     */
    unsigned stride;
+   /**
+    * Input to texture_from_handle.
+    * Output for texture_get_handle.
+    */
+   unsigned offset;
 };
 
 
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 7f7fbc47e6d..fb0a1802cf2 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -534,6 +534,7 @@ dri2_allocate_textures(struct dri_context *ctx,
          templ.bind = bind;
          whandle.handle = buf->name;
          whandle.stride = buf->pitch;
+         whandle.offset = 0;
          if (screen->can_share_buffer)
             whandle.type = DRM_API_HANDLE_TYPE_SHARED;
          else
@@ -756,6 +757,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen,
    templ.array_size = 1;
 
    whandle->stride = pitch * util_format_get_blocksize(pf);
+   whandle->offset = 0;
 
    img->texture = screen->base.screen->resource_from_handle(screen->base.screen,
          &templ, whandle, PIPE_HANDLE_USAGE_READ_WRITE);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index b670f263329..70993297ceb 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -637,6 +637,7 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
       return FALSE;
 
    whandle->stride = stride;
+   whandle->offset = 0;
    bo->is_shared = true;
    return TRUE;
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 978df52447e..9e5d7d2e7a0 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -1025,6 +1025,7 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
     }
 
     whandle->stride = stride;
+    whandle->offset = 0;
     return TRUE;
 }
 
diff --git a/src/gallium/winsys/svga/drm/vmw_screen_dri.c b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
index 01bb0e2d753..baa22a90beb 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen_dri.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen_dri.c
@@ -357,6 +357,7 @@ vmw_drm_surface_get_handle(struct svga_winsys_screen *sws,
     vsrf = vmw_svga_winsys_surface(surface);
     whandle->handle = vsrf->sid;
     whandle->stride = stride;
+    whandle->offset = 0;
 
     switch (whandle->type) {
     case DRM_API_HANDLE_TYPE_SHARED:
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 1e859717f1c..9aaee8844be 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -309,17 +309,20 @@ kms_sw_displaytarget_get_handle(struct sw_winsys *winsys,
    case DRM_API_HANDLE_TYPE_KMS:
       whandle->handle = kms_sw_dt->handle;
       whandle->stride = kms_sw_dt->stride;
+      whandle->offset = 0;
       return TRUE;
    case DRM_API_HANDLE_TYPE_FD:
       if (!drmPrimeHandleToFD(kms_sw->fd, kms_sw_dt->handle,
                              DRM_CLOEXEC, (int*)&whandle->handle)) {
          whandle->stride = kms_sw_dt->stride;
+         whandle->offset = 0;
          return TRUE;
       }
       /* fallthrough */
    default:
       whandle->handle = 0;
       whandle->stride = 0;
+      whandle->offset = 0;
       return FALSE;
    }
 }

From f1e78a48f2ec2645eb87d134c6961815dc89a307 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 14 Jan 2016 13:51:18 +0100
Subject: [PATCH 011/197] gallium/winsys/drm: add layer to struct winsys_handle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For exporting a specific layer of an array texture.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/state_tracker/drm_driver.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/include/state_tracker/drm_driver.h b/src/gallium/include/state_tracker/drm_driver.h
index d81da8911e0..fefab11cccd 100644
--- a/src/gallium/include/state_tracker/drm_driver.h
+++ b/src/gallium/include/state_tracker/drm_driver.h
@@ -25,6 +25,11 @@ struct winsys_handle
     * to select handle for kms, flink, or prime.
     */
    unsigned type;
+   /**
+    * Input for texture_get_handle, allows to export the offset
+    * of a specific layer of an array texture.
+    */
+   unsigned layer;
    /**
     * Input to texture_from_handle.
     * Output for texture_get_handle.

From 04bc082f6a8bfc3b3774bb102d3200317609432e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 12 Jan 2016 15:59:11 +0100
Subject: [PATCH 012/197] radeon/winsys: add offset support for BO
 import/export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add offset support to handle NV12 offsets as well.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r300/r300_texture.c       |  6 ++---
 src/gallium/drivers/radeon/r600_texture.c     | 23 +++++++++++++------
 src/gallium/drivers/radeon/radeon_winsys.h    |  4 ++--
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     |  9 +++++---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c |  9 +++++---
 5 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 57456c6d867..22a613f1e52 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -981,8 +981,8 @@ boolean r300_resource_get_handle(struct pipe_screen* screen,
         return FALSE;
     }
 
-    return rws->buffer_get_handle(tex->buf,
-                                  tex->tex.stride_in_bytes[0], whandle);
+    return rws->buffer_get_handle(tex->buf, tex->tex.stride_in_bytes[0],
+                                  0, whandle);
 }
 
 static const struct u_resource_vtbl r300_texture_vtbl =
@@ -1116,7 +1116,7 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen,
         return NULL;
     }
 
-    buffer = rws->buffer_from_handle(rws, whandle, &stride);
+    buffer = rws->buffer_from_handle(rws, whandle, &stride, NULL);
     if (!buffer)
         return NULL;
 
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 115c7289c4c..15818aaae6f 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -201,9 +201,11 @@ static int r600_init_surface(struct r600_common_screen *rscreen,
 
 static int r600_setup_surface(struct pipe_screen *screen,
 			      struct r600_texture *rtex,
-			      unsigned pitch_in_bytes_override)
+			      unsigned pitch_in_bytes_override,
+			      unsigned offset)
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
+	unsigned i;
 	int r;
 
 	r = rscreen->ws->surface_init(rscreen->ws, &rtex->surface);
@@ -225,6 +227,11 @@ static int r600_setup_surface(struct pipe_screen *screen,
 			rtex->surface.stencil_level[0].offset = rtex->surface.level[0].slice_size;
 		}
 	}
+
+	if (offset) {
+		for (i = 0; i < Elements(rtex->surface.level); ++i)
+			rtex->surface.level[i].offset += offset;
+	}
 	return 0;
 }
 
@@ -366,6 +373,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 
 	return rscreen->ws->buffer_get_handle(res->buf,
 					      rtex->surface.level[0].pitch_bytes,
+					      rtex->surface.level[0].offset,
 					      whandle);
 }
 
@@ -791,6 +799,7 @@ static struct r600_texture *
 r600_texture_create_object(struct pipe_screen *screen,
 			   const struct pipe_resource *base,
 			   unsigned pitch_in_bytes_override,
+			   unsigned offset,
 			   struct pb_buffer *buf,
 			   struct radeon_surf *surface)
 {
@@ -812,7 +821,7 @@ r600_texture_create_object(struct pipe_screen *screen,
 	rtex->is_depth = util_format_has_depth(util_format_description(rtex->resource.b.b.format));
 
 	rtex->surface = *surface;
-	if (r600_setup_surface(screen, rtex, pitch_in_bytes_override)) {
+	if (r600_setup_surface(screen, rtex, pitch_in_bytes_override, offset)) {
 		FREE(rtex);
 		return NULL;
 	}
@@ -979,7 +988,7 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
 	if (r) {
 		return NULL;
 	}
-	return (struct pipe_resource *)r600_texture_create_object(screen, templ,
+	return (struct pipe_resource *)r600_texture_create_object(screen, templ, 0,
 								  0, NULL, &surface);
 }
 
@@ -990,7 +999,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 {
 	struct r600_common_screen *rscreen = (struct r600_common_screen*)screen;
 	struct pb_buffer *buf = NULL;
-	unsigned stride = 0;
+	unsigned stride = 0, offset = 0;
 	unsigned array_mode;
 	struct radeon_surf surface;
 	int r;
@@ -1002,7 +1011,7 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 	      templ->depth0 != 1 || templ->last_level != 0)
 		return NULL;
 
-	buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride);
+	buf = rscreen->ws->buffer_from_handle(rscreen->ws, whandle, &stride, &offset);
 	if (!buf)
 		return NULL;
 
@@ -1029,8 +1038,8 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen
 	if (metadata.scanout)
 		surface.flags |= RADEON_SURF_SCANOUT;
 
-	rtex = r600_texture_create_object(screen, templ,
-					  stride, buf, &surface);
+	rtex = r600_texture_create_object(screen, templ, stride,
+					  offset, buf, &surface);
 	if (!rtex)
 		return NULL;
 
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index b8a065957a7..fb3f051be2b 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -515,7 +515,7 @@ struct radeon_winsys {
      */
     struct pb_buffer *(*buffer_from_handle)(struct radeon_winsys *ws,
                                             struct winsys_handle *whandle,
-                                            unsigned *stride);
+                                            unsigned *stride, unsigned *offset);
 
     /**
      * Get a winsys buffer from a user pointer. The resulting buffer can't
@@ -546,7 +546,7 @@ struct radeon_winsys {
      * \return          TRUE on success.
      */
     boolean (*buffer_get_handle)(struct pb_buffer *buf,
-                                 unsigned stride,
+                                 unsigned stride, unsigned offset,
                                  struct winsys_handle *whandle);
 
     /**
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 70993297ceb..3f74dd7da5e 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -519,7 +519,8 @@ amdgpu_bo_create(struct radeon_winsys *rws,
 
 static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
                                                struct winsys_handle *whandle,
-                                               unsigned *stride)
+                                               unsigned *stride,
+                                               unsigned *offset)
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_winsys_bo *bo;
@@ -587,6 +588,8 @@ static struct pb_buffer *amdgpu_bo_from_handle(struct radeon_winsys *rws,
 
    if (stride)
       *stride = whandle->stride;
+   if (offset)
+      *offset = whandle->offset;
 
    if (bo->initial_domain & RADEON_DOMAIN_VRAM)
       ws->allocated_vram += align(bo->base.size, ws->gart_page_size);
@@ -609,7 +612,7 @@ error:
 }
 
 static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
-                                    unsigned stride,
+                                    unsigned stride, unsigned offset,
                                     struct winsys_handle *whandle)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
@@ -637,7 +640,7 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
       return FALSE;
 
    whandle->stride = stride;
-   whandle->offset = 0;
+   whandle->offset = offset;
    bo->is_shared = true;
    return TRUE;
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 9e5d7d2e7a0..3f39827b307 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -851,7 +851,8 @@ static struct pb_buffer *radeon_winsys_bo_from_ptr(struct radeon_winsys *rws,
 
 static struct pb_buffer *radeon_winsys_bo_from_handle(struct radeon_winsys *rws,
                                                       struct winsys_handle *whandle,
-                                                      unsigned *stride)
+                                                      unsigned *stride,
+                                                      unsigned *offset)
 {
     struct radeon_drm_winsys *ws = radeon_drm_winsys(rws);
     struct radeon_bo *bo;
@@ -941,6 +942,8 @@ done:
 
     if (stride)
         *stride = whandle->stride;
+    if (offset)
+        *offset = whandle->offset;
 
     if (ws->info.has_virtual_memory && !bo->va) {
         struct drm_radeon_gem_va va;
@@ -991,7 +994,7 @@ fail:
 }
 
 static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
-                                           unsigned stride,
+                                           unsigned stride, unsigned offset,
                                            struct winsys_handle *whandle)
 {
     struct drm_gem_flink flink;
@@ -1025,7 +1028,7 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
     }
 
     whandle->stride = stride;
-    whandle->offset = 0;
+    whandle->offset = offset;
     return TRUE;
 }
 

From 5aea0d691988af945e09e1d7cca28ca0759cc309 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 14 Jan 2016 15:01:39 +0100
Subject: [PATCH 013/197] radeon/winsys: add layer support for BO export
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add layer support to export individual array layers.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/r600_texture.c     | 1 +
 src/gallium/drivers/radeon/radeon_winsys.h    | 1 +
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     | 2 ++
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 3 +++
 4 files changed, 7 insertions(+)

diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 15818aaae6f..1ad70765781 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -374,6 +374,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 	return rscreen->ws->buffer_get_handle(res->buf,
 					      rtex->surface.level[0].pitch_bytes,
 					      rtex->surface.level[0].offset,
+					      rtex->surface.level[0].slice_size,
 					      whandle);
 }
 
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index fb3f051be2b..daa15db2812 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -547,6 +547,7 @@ struct radeon_winsys {
      */
     boolean (*buffer_get_handle)(struct pb_buffer *buf,
                                  unsigned stride, unsigned offset,
+                                 unsigned slice_size,
                                  struct winsys_handle *whandle);
 
     /**
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 3f74dd7da5e..c79bed45753 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -613,6 +613,7 @@ error:
 
 static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
                                     unsigned stride, unsigned offset,
+                                    unsigned slice_size,
                                     struct winsys_handle *whandle)
 {
    struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(buffer);
@@ -641,6 +642,7 @@ static boolean amdgpu_bo_get_handle(struct pb_buffer *buffer,
 
    whandle->stride = stride;
    whandle->offset = offset;
+   whandle->offset += slice_size * whandle->layer;
    bo->is_shared = true;
    return TRUE;
 }
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 3f39827b307..08856dff430 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -995,6 +995,7 @@ fail:
 
 static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
                                            unsigned stride, unsigned offset,
+                                           unsigned slice_size,
                                            struct winsys_handle *whandle)
 {
     struct drm_gem_flink flink;
@@ -1029,6 +1030,8 @@ static boolean radeon_winsys_bo_get_handle(struct pb_buffer *buffer,
 
     whandle->stride = stride;
     whandle->offset = offset;
+    whandle->offset += slice_size * whandle->layer;
+
     return TRUE;
 }
 

From 84b961dd53a0509a6865d8417301838b34a40096 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 17 Mar 2016 09:47:21 -0600
Subject: [PATCH 014/197] r300g: add missing layer argument to
 rws->buffer_get_handle() call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes compilation error since 5aea0d691.

Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/drivers/r300/r300_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 22a613f1e52..709345a492e 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -982,7 +982,7 @@ boolean r300_resource_get_handle(struct pipe_screen* screen,
     }
 
     return rws->buffer_get_handle(tex->buf, tex->tex.stride_in_bytes[0],
-                                  0, whandle);
+                                  0, 0, whandle);
 }
 
 static const struct u_resource_vtbl r300_texture_vtbl =

From e571f11de8c325f9a254c9f5f724d672e48530f1 Mon Sep 17 00:00:00 2001
From: Romain Failliot <romain.failliot@foolstep.com>
Date: Tue, 15 Mar 2016 16:14:08 -0400
Subject: [PATCH 015/197] docs: howto to read and edit GL3.txt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added a small guide on how to read and edit GL3.txt.
I think this would help the devs as much as the users reading this file.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 docs/GL3.txt | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index ee7facafc95..1ed5c1a8c68 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -1,13 +1,28 @@
+# Status of OpenGL extensions in Mesa
 
-Status of OpenGL 3.x features in Mesa
+Here's how to read this file:
 
+all DONE: <driver>, ...
+    All the extensions are done for the given list of drivers.
 
-Note: when an item is marked as "DONE" it means all the core Mesa
-infrastructure is complete but it may be the case that few (if any) drivers
-implement the features.
+DONE
+    The extension is done for Mesa and no implementation is necessary on the
+    driver-side.
 
+DONE ()
+    The extension is done for Mesa and all the drivers in the "all DONE" list.
 
-OpenGL Core and Compatibility context support
+DONE (<driver>, ...)
+    The extension is done for Mesa, all the drivers in the "all DONE" list, and
+    all the drivers in the brackets.
+
+in progress
+    The extension is started but not finished yet.
+
+not started
+    The extension isn't started yet.
+
+# OpenGL Core and Compatibility context support
 
 OpenGL 3.1 and later versions are only supported with the Core profile.
 There are no plans to support GL_ARB_compatibility. The last supported OpenGL

From 3671bb3eaf12d8b2c351fadf276e2136218ec38b Mon Sep 17 00:00:00 2001
From: Romain Failliot <romain.failliot@foolstep.com>
Date: Tue, 15 Mar 2016 16:14:09 -0400
Subject: [PATCH 016/197] docs: Realign the "Status" column.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "Status" column was misaligned in some GL sections.
This is a lot of diffs, but it's only spaces in the end.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 docs/GL3.txt | 278 +++++++++++++++++++++++++--------------------------
 1 file changed, 139 insertions(+), 139 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 1ed5c1a8c68..17522d53ad1 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -30,8 +30,8 @@ version with all deprecated features is 3.0. Some of the later GL features
 are exposed in the 3.0 context as extensions.
 
 
-Feature                                               Status
------------------------------------------------------ ------------------------
+Feature                                                 Status
+------------------------------------------------------- ------------------------
 
 GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
@@ -109,170 +109,170 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
 
 GL 4.0, GLSL 4.00 --- all DONE: nvc0, r600, radeonsi
 
-  GL_ARB_draw_buffers_blend                            DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, llvmpipe, softpipe)
-  GL_ARB_gpu_shader5                                   DONE (i965)
-  - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (softpipe)
-  - Dynamically uniform UBO array indices              DONE ()
-  - Implicit signed -> unsigned conversions            DONE
-  - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (softpipe)
-  - Enhanced textureGather                             DONE (softpipe)
-  - Geometry shader instancing                         DONE (llvmpipe, softpipe)
-  - Geometry shader multiple streams                   DONE ()
-  - Enhanced per-sample shading                        DONE ()
-  - Interpolation functions                            DONE ()
-  - New overload resolution rules                      DONE
-  GL_ARB_gpu_shader_fp64                               DONE (llvmpipe, softpipe)
-  GL_ARB_sample_shading                                DONE (i965, nv50)
-  GL_ARB_shader_subroutine                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_tessellation_shader                           DONE (i965)
-  GL_ARB_texture_buffer_object_rgb32                   DONE (i965, llvmpipe, softpipe)
-  GL_ARB_texture_cube_map_array                        DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_texture_query_lod                             DONE (i965, nv50, softpipe)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_draw_buffers_blend                             DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                  DONE (i965, llvmpipe, softpipe)
+  GL_ARB_gpu_shader5                                    DONE (i965)
+  - 'precise' qualifier                                 DONE
+  - Dynamically uniform sampler array indices           DONE (softpipe)
+  - Dynamically uniform UBO array indices               DONE ()
+  - Implicit signed -> unsigned conversions             DONE
+  - Fused multiply-add                                  DONE ()
+  - Packing/bitfield/conversion functions               DONE (softpipe)
+  - Enhanced textureGather                              DONE (softpipe)
+  - Geometry shader instancing                          DONE (llvmpipe, softpipe)
+  - Geometry shader multiple streams                    DONE ()
+  - Enhanced per-sample shading                         DONE ()
+  - Interpolation functions                             DONE ()
+  - New overload resolution rules                       DONE
+  GL_ARB_gpu_shader_fp64                                DONE (llvmpipe, softpipe)
+  GL_ARB_sample_shading                                 DONE (i965, nv50)
+  GL_ARB_shader_subroutine                              DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_tessellation_shader                            DONE (i965)
+  GL_ARB_texture_buffer_object_rgb32                    DONE (i965, llvmpipe, softpipe)
+  GL_ARB_texture_cube_map_array                         DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_gather                                 DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_texture_query_lod                              DONE (i965, nv50, softpipe)
+  GL_ARB_transform_feedback2                            DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                            DONE (i965, nv50, llvmpipe, softpipe)
 
 
 GL 4.1, GLSL 4.10 --- all DONE: nvc0, r600, radeonsi
 
-  GL_ARB_ES2_compatibility                             DONE (i965, nv50, llvmpipe, softpipe)
-  GL_ARB_get_program_binary                            DONE (0 binary formats)
-  GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_shader_precision                              DONE (all drivers that support GLSL 4.10)
-  GL_ARB_vertex_attrib_64bit                           DONE (llvmpipe, softpipe)
-  GL_ARB_viewport_array                                DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_ES2_compatibility                              DONE (i965, nv50, llvmpipe, softpipe)
+  GL_ARB_get_program_binary                             DONE (0 binary formats)
+  GL_ARB_separate_shader_objects                        DONE (all drivers)
+  GL_ARB_shader_precision                               DONE (all drivers that support GLSL 4.10)
+  GL_ARB_vertex_attrib_64bit                            DONE (llvmpipe, softpipe)
+  GL_ARB_viewport_array                                 DONE (i965, nv50, llvmpipe, softpipe)
 
 
 GL 4.2, GLSL 4.20:
 
-  GL_ARB_texture_compression_bptc                      DONE (i965, nvc0, r600, radeonsi)
-  GL_ARB_compressed_texture_pixel_storage              DONE (all drivers)
-  GL_ARB_shader_atomic_counters                        DONE (i965, nvc0)
-  GL_ARB_texture_storage                               DONE (all drivers)
-  GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_base_instance                                 DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                       DONE (i965)
-  GL_ARB_conservative_depth                            DONE (all drivers that support GLSL 1.30)
-  GL_ARB_shading_language_420pack                      DONE (all drivers that support GLSL 1.30)
-  GL_ARB_shading_language_packing                      DONE (all drivers)
-  GL_ARB_internalformat_query                          DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_map_buffer_alignment                          DONE (all drivers)
+  GL_ARB_texture_compression_bptc                       DONE (i965, nvc0, r600, radeonsi)
+  GL_ARB_compressed_texture_pixel_storage               DONE (all drivers)
+  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0)
+  GL_ARB_texture_storage                                DONE (all drivers)
+  GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_base_instance                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_shader_image_load_store                        DONE (i965)
+  GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
+  GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
+  GL_ARB_shading_language_packing                       DONE (all drivers)
+  GL_ARB_internalformat_query                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_map_buffer_alignment                           DONE (all drivers)
 
 
 GL 4.3, GLSL 4.30:
 
-  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
-  GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
-  GL_ARB_clear_buffer_object                           DONE (all drivers)
-  GL_ARB_compute_shader                                DONE (i965)
-  GL_ARB_copy_image                                    DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_KHR_debug                                         DONE (all drivers)
-  GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_fragment_layer_viewport                       DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
-  GL_ARB_framebuffer_no_attachments                    DONE (i965)
-  GL_ARB_internalformat_query2                         DONE (i965)
-  GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_program_interface_query                       DONE (all drivers)
-  GL_ARB_robust_buffer_access_behavior                 not started
-  GL_ARB_shader_image_size                             DONE (i965)
-  GL_ARB_shader_storage_buffer_object                  DONE (i965, nvc0)
-  GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
-  GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
-  GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_vertex_attrib_binding                         DONE (all drivers)
+  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
+  GL_ARB_ES3_compatibility                              DONE (all drivers that support GLSL 3.30)
+  GL_ARB_clear_buffer_object                            DONE (all drivers)
+  GL_ARB_compute_shader                                 DONE (i965)
+  GL_ARB_copy_image                                     DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_KHR_debug                                          DONE (all drivers)
+  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
+  GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_internalformat_query2                          DONE (i965)
+  GL_ARB_invalidate_subdata                             DONE (all drivers)
+  GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_program_interface_query                        DONE (all drivers)
+  GL_ARB_robust_buffer_access_behavior                  not started
+  GL_ARB_shader_image_size                              DONE (i965)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_buffer_range                           DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
+  GL_ARB_texture_query_levels                           DONE (all drivers that support GLSL 1.30)
+  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_ARB_texture_view                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
 
 
 GL 4.4, GLSL 4.40:
 
-  GL_MAX_VERTEX_ATTRIB_STRIDE                          DONE (all drivers)
-  GL_ARB_buffer_storage                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_clear_texture                                 DONE (i965, nv50, nvc0)
-  GL_ARB_enhanced_layouts                              in progress (Timothy)
-  - compile-time constant expressions                  DONE
-  - explicit byte offsets for blocks                   DONE
-  - forced alignment within blocks                     DONE
-  - specified vec4-slot component numbers              in progress
-  - specified transform/feedback layout                in progress
-  - input/output block locations                       DONE
-  GL_ARB_multi_bind                                    DONE (all drivers)
-  GL_ARB_query_buffer_object                           DONE (nvc0)
-  GL_ARB_texture_mirror_clamp_to_edge                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_stencil8                              DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_vertex_type_10f_11f_11f_rev                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_MAX_VERTEX_ATTRIB_STRIDE                           DONE (all drivers)
+  GL_ARB_buffer_storage                                 DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_clear_texture                                  DONE (i965, nv50, nvc0)
+  GL_ARB_enhanced_layouts                               in progress (Timothy)
+  - compile-time constant expressions                   DONE
+  - explicit byte offsets for blocks                    DONE
+  - forced alignment within blocks                      DONE
+  - specified vec4-slot component numbers               in progress
+  - specified transform/feedback layout                 in progress
+  - input/output block locations                        DONE
+  GL_ARB_multi_bind                                     DONE (all drivers)
+  GL_ARB_query_buffer_object                            DONE (nvc0)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_stencil8                               DONE (nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
 
 GL 4.5, GLSL 4.50:
 
-  GL_ARB_ES3_1_compatibility                           not started
-  GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_cull_distance                                 in progress (Tobias)
-  GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_direct_state_access                           DONE (all drivers)
-  GL_ARB_get_texture_sub_image                         DONE (all drivers)
-  GL_ARB_shader_texture_image_samples                  DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_texture_barrier                               DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_KHR_context_flush_control                         DONE (all - but needs GLX/EGL extension to be useful)
-  GL_KHR_robust_buffer_access_behavior                 not started
-  GL_KHR_robustness                                    90% done (the ARB variant)
-  GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)
+  GL_ARB_ES3_1_compatibility                            not started
+  GL_ARB_clip_control                                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_conditional_render_inverted                    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_cull_distance                                  in progress (Tobias)
+  GL_ARB_derivative_control                             DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_direct_state_access                            DONE (all drivers)
+  GL_ARB_get_texture_sub_image                          DONE (all drivers)
+  GL_ARB_shader_texture_image_samples                   DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_texture_barrier                                DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_KHR_context_flush_control                          DONE (all - but needs GLX/EGL extension to be useful)
+  GL_KHR_robust_buffer_access_behavior                  not started
+  GL_KHR_robustness                                     90% done (the ARB variant)
+  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)
 
 These are the extensions cherry-picked to make GLES 3.1
 GLES3.1, GLSL ES 3.1
-  GL_ARB_arrays_of_arrays                              DONE (all drivers that support GLSL 1.30)
-  GL_ARB_compute_shader                                DONE (i965)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_framebuffer_no_attachments                    DONE (i965)
-  GL_ARB_program_interface_query                       DONE (all drivers)
-  GL_ARB_shader_atomic_counters                        DONE (i965, nvc0)
-  GL_ARB_shader_image_load_store                       DONE (i965)
-  GL_ARB_shader_image_size                             DONE (i965)
-  GL_ARB_shader_storage_buffer_object                  DONE (i965, nvc0)
-  GL_ARB_shading_language_packing                      DONE (all drivers)
-  GL_ARB_separate_shader_objects                       DONE (all drivers)
-  GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  Multisample textures (GL_ARB_texture_multisample)    DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_vertex_attrib_binding                         DONE (all drivers)
-  GS5 Enhanced textureGather                           DONE (i965, nvc0, r600, radeonsi)
-  GS5 Packing/bitfield/conversion functions            DONE (i965, nvc0, r600, radeonsi)
-  GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)
+  GL_ARB_arrays_of_arrays                               DONE (all drivers that support GLSL 1.30)
+  GL_ARB_compute_shader                                 DONE (i965)
+  GL_ARB_draw_indirect                                  DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
+  GL_ARB_framebuffer_no_attachments                     DONE (i965)
+  GL_ARB_program_interface_query                        DONE (all drivers)
+  GL_ARB_shader_atomic_counters                         DONE (i965, nvc0)
+  GL_ARB_shader_image_load_store                        DONE (i965)
+  GL_ARB_shader_image_size                              DONE (i965)
+  GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
+  GL_ARB_shading_language_packing                       DONE (all drivers)
+  GL_ARB_separate_shader_objects                        DONE (all drivers)
+  GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  Multisample textures (GL_ARB_texture_multisample)     DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_ARB_vertex_attrib_binding                          DONE (all drivers)
+  GS5 Enhanced textureGather                            DONE (i965, nvc0, r600, radeonsi)
+  GS5 Packing/bitfield/conversion functions             DONE (i965, nvc0, r600, radeonsi)
+  GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)
 
   Additional functionality not covered above:
-      glMemoryBarrierByRegion                          DONE
-      glGetTexLevelParameter[fi]v - needs updates      DONE
+      glMemoryBarrierByRegion                           DONE
+      glGetTexLevelParameter[fi]v - needs updates       DONE
       glGetBooleani_v - restrict to GLES enums
-      gl_HelperInvocation support                      DONE (i965, nvc0, r600)
+      gl_HelperInvocation support                       DONE (i965, nvc0, r600)
 
 GLES3.2, GLSL ES 3.2
-  GL_EXT_color_buffer_float                            DONE (all drivers)
-  GL_KHR_blend_equation_advanced                       not started
-  GL_KHR_debug                                         DONE (all drivers)
-  GL_KHR_robustness                                    90% done (the ARB variant)
-  GL_KHR_texture_compression_astc_ldr                  DONE (i965/gen9+)
-  GL_OES_copy_image                                    not started (based on GL_ARB_copy_image, which is done for some drivers)
-  GL_OES_draw_buffers_indexed                          not started
-  GL_OES_draw_elements_base_vertex                     DONE (all drivers)
-  GL_OES_geometry_shader                               started (Marta)
-  GL_OES_gpu_shader5                                   DONE (all drivers that support GL_ARB_gpu_shader5)
-  GL_OES_primitive_bounding box                        not started
-  GL_OES_sample_shading                                not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_sample_variables                              not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_shader_image_atomic                           DONE (all drivers that support GL_ARB_shader_image_load_store)
-  GL_OES_shader_io_blocks                              not started (based on parts of GLSL 1.50, which is done)
-  GL_OES_shader_multisample_interpolation              not started (based on parts of GL_ARB_gpu_shader5, which is done)
-  GL_OES_tessellation_shader                           not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
-  GL_OES_texture_border_clamp                          DONE (all drivers)
-  GL_OES_texture_buffer                                not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
-  GL_OES_texture_cube_map_array                        not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
-  GL_OES_texture_stencil8                              DONE (all drivers that support GL_ARB_texture_stencil8)
-  GL_OES_texture_storage_multisample_2d_array          DONE (all drivers that support GL_ARB_texture_multisample)
+  GL_EXT_color_buffer_float                             DONE (all drivers)
+  GL_KHR_blend_equation_advanced                        not started
+  GL_KHR_debug                                          DONE (all drivers)
+  GL_KHR_robustness                                     90% done (the ARB variant)
+  GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
+  GL_OES_copy_image                                     not started (based on GL_ARB_copy_image, which is done for some drivers)
+  GL_OES_draw_buffers_indexed                           not started
+  GL_OES_draw_elements_base_vertex                      DONE (all drivers)
+  GL_OES_geometry_shader                                started (Marta)
+  GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
+  GL_OES_primitive_bounding box                         not started
+  GL_OES_sample_shading                                 not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_sample_variables                               not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_shader_image_atomic                            DONE (all drivers that support GL_ARB_shader_image_load_store)
+  GL_OES_shader_io_blocks                               not started (based on parts of GLSL 1.50, which is done)
+  GL_OES_shader_multisample_interpolation               not started (based on parts of GL_ARB_gpu_shader5, which is done)
+  GL_OES_tessellation_shader                            not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
+  GL_OES_texture_border_clamp                           DONE (all drivers)
+  GL_OES_texture_buffer                                 not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
+  GL_OES_texture_cube_map_array                         not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
+  GL_OES_texture_stencil8                               DONE (all drivers that support GL_ARB_texture_stencil8)
+  GL_OES_texture_storage_multisample_2d_array           DONE (all drivers that support GL_ARB_texture_multisample)
 
 More info about these features and the work involved can be found at
 http://dri.freedesktop.org/wiki/MissingFunctionality

From f5d47dd428c8f7f5b4df08ee15496554ef19d50d Mon Sep 17 00:00:00 2001
From: Romain Failliot <romain.failliot@foolstep.com>
Date: Tue, 15 Mar 2016 16:14:10 -0400
Subject: [PATCH 017/197] docs: Renormalize some extensions.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes some exceptions I have to deal with in mesamatrix.net.
The extension GL_ARB_texture_buffer_object had a comment between "DONE"
and the brackets.
And the extension GL_KHR_robustness (in GL 4.5 and GLES 3.2) was using
"90% done" instead of a standard status; it is now "not started". The
"90% done" is still here though, but as an extension comment.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 docs/GL3.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 17522d53ad1..49ba7d7f5a1 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -72,7 +72,7 @@ GL 3.1, GLSL 1.40 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
   Buffer copying (GL_ARB_copy_buffer)                   DONE ()
   Primitive restart (GL_NV_primitive_restart)           DONE ()
   16 vertex texture image units                         DONE ()
-  Texture buffer objs (GL_ARB_texture_buffer_object)    DONE for OpenGL 3.1 contexts ()
+  Texture buffer objs (GL_ARB_texture_buffer_object)    DONE (for OpenGL 3.1 contexts)
   Rectangular textures (GL_ARB_texture_rectangle)       DONE ()
   Uniform buffer objs (GL_ARB_uniform_buffer_object)    DONE ()
   Signed normalized textures (GL_EXT_texture_snorm)     DONE ()
@@ -219,7 +219,7 @@ GL 4.5, GLSL 4.50:
   GL_ARB_texture_barrier                                DONE (i965, nv50, nvc0, r600, radeonsi)
   GL_KHR_context_flush_control                          DONE (all - but needs GLX/EGL extension to be useful)
   GL_KHR_robust_buffer_access_behavior                  not started
-  GL_KHR_robustness                                     90% done (the ARB variant)
+  GL_KHR_robustness                                     not started (90% done with the ARB variant)
   GL_EXT_shader_integer_mix                             DONE (all drivers that support GLSL)
 
 These are the extensions cherry-picked to make GLES 3.1
@@ -254,7 +254,7 @@ GLES3.2, GLSL ES 3.2
   GL_EXT_color_buffer_float                             DONE (all drivers)
   GL_KHR_blend_equation_advanced                        not started
   GL_KHR_debug                                          DONE (all drivers)
-  GL_KHR_robustness                                     90% done (the ARB variant)
+  GL_KHR_robustness                                     not started (90% done with the ARB variant)
   GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
   GL_OES_copy_image                                     not started (based on GL_ARB_copy_image, which is done for some drivers)
   GL_OES_draw_buffers_indexed                           not started

From 151724159d00dac4fa0ebddc76389e3e6412fb5b Mon Sep 17 00:00:00 2001
From: Romain Failliot <romain.failliot@foolstep.com>
Date: Tue, 15 Mar 2016 16:14:11 -0400
Subject: [PATCH 018/197] docs: Renormalize older extensions.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For older extensions, there is an explanation first and the extension
name in brackets, like that:
    Clamping controls (GL_ARB_color_buffer_float)
I inverted that so we have the extension first and then the explanation
in brackets, like that:
    GL_ARB_color_buffer_float (Clamping controls)

It will help me later to parse the few extensions that use this syntax:
    all drivers that support <GL_extension>

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 docs/GL3.txt | 52 ++++++++++++++++++++++++++--------------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 49ba7d7f5a1..3058996ad16 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -36,24 +36,24 @@ Feature                                                 Status
 GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
   glBindFragDataLocation, glGetFragDataLocation         DONE
-  Conditional rendering (GL_NV_conditional_render)      DONE ()
-  Map buffer subranges (GL_ARB_map_buffer_range)        DONE ()
-  Clamping controls (GL_ARB_color_buffer_float)         DONE ()
-  Float textures, renderbuffers (GL_ARB_texture_float)  DONE ()
+  GL_NV_conditional_render (Conditional rendering)      DONE ()
+  GL_ARB_map_buffer_range (Map buffer subranges)        DONE ()
+  GL_ARB_color_buffer_float (Clamping controls)         DONE ()
+  GL_ARB_texture_float (Float textures, renderbuffers)  DONE ()
   GL_EXT_packed_float                                   DONE ()
   GL_EXT_texture_shared_exponent                        DONE ()
-  Float depth buffers (GL_ARB_depth_buffer_float)       DONE ()
-  Framebuffer objects (GL_ARB_framebuffer_object)       DONE ()
+  GL_ARB_depth_buffer_float (Float depth buffers)       DONE ()
+  GL_ARB_framebuffer_object (Framebuffer objects)       DONE ()
   GL_ARB_half_float_pixel                               DONE (all drivers)
   GL_ARB_half_float_vertex                              DONE ()
   GL_EXT_texture_integer                                DONE ()
   GL_EXT_texture_array                                  DONE ()
-  Per-buffer blend and masks (GL_EXT_draw_buffers2)     DONE ()
+  GL_EXT_draw_buffers2 (Per-buffer blend and masks)     DONE ()
   GL_EXT_texture_compression_rgtc                       DONE ()
   GL_ARB_texture_rg                                     DONE ()
-  Transform feedback (GL_EXT_transform_feedback)        DONE ()
-  Vertex array objects (GL_ARB_vertex_array_object)     DONE ()
-  sRGB framebuffer format (GL_EXT_framebuffer_sRGB)     DONE ()
+  GL_EXT_transform_feedback (Transform feedback)        DONE ()
+  GL_ARB_vertex_array_object (Vertex array objects)     DONE ()
+  GL_EXT_framebuffer_sRGB (sRGB framebuffer format)     DONE ()
   glClearBuffer commands                                DONE
   glGetStringi command                                  DONE
   glTexParameterI, glGetTexParameterI commands          DONE
@@ -68,28 +68,28 @@ GL 3.0, GLSL 1.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
 GL 3.1, GLSL 1.40 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
   Forward compatible context support/deprecations       DONE ()
-  Instanced drawing (GL_ARB_draw_instanced)             DONE ()
-  Buffer copying (GL_ARB_copy_buffer)                   DONE ()
-  Primitive restart (GL_NV_primitive_restart)           DONE ()
+  GL_ARB_draw_instanced (Instanced drawing)             DONE ()
+  GL_ARB_copy_buffer (Buffer copying)                   DONE ()
+  GL_NV_primitive_restart (Primitive restart)           DONE ()
   16 vertex texture image units                         DONE ()
-  Texture buffer objs (GL_ARB_texture_buffer_object)    DONE (for OpenGL 3.1 contexts)
-  Rectangular textures (GL_ARB_texture_rectangle)       DONE ()
-  Uniform buffer objs (GL_ARB_uniform_buffer_object)    DONE ()
-  Signed normalized textures (GL_EXT_texture_snorm)     DONE ()
+  GL_ARB_texture_buffer_object (Texture buffer objs)    DONE (for OpenGL 3.1 contexts)
+  GL_ARB_texture_rectangle (Rectangular textures)       DONE ()
+  GL_ARB_uniform_buffer_object (Uniform buffer objs)    DONE ()
+  GL_EXT_texture_snorm (Signed normalized textures)     DONE ()
 
 
 GL 3.2, GLSL 1.50 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe
 
   Core/compatibility profiles                           DONE
   Geometry shaders                                      DONE ()
-  BGRA vertex order (GL_ARB_vertex_array_bgra)          DONE ()
-  Base vertex offset(GL_ARB_draw_elements_base_vertex)  DONE ()
-  Frag shader coord (GL_ARB_fragment_coord_conventions) DONE ()
-  Provoking vertex (GL_ARB_provoking_vertex)            DONE ()
-  Seamless cubemaps (GL_ARB_seamless_cube_map)          DONE ()
-  Multisample textures (GL_ARB_texture_multisample)     DONE ()
-  Frag depth clamp (GL_ARB_depth_clamp)                 DONE ()
-  Fence objects (GL_ARB_sync)                           DONE ()
+  GL_ARB_vertex_array_bgra (BGRA vertex order)          DONE ()
+  GL_ARB_draw_elements_base_vertex (Base vertex offset) DONE ()
+  GL_ARB_fragment_coord_conventions (Frag shader coord) DONE ()
+  GL_ARB_provoking_vertex (Provoking vertex)            DONE ()
+  GL_ARB_seamless_cube_map (Seamless cubemaps)          DONE ()
+  GL_ARB_texture_multisample (Multisample textures)     DONE ()
+  GL_ARB_depth_clamp (Frag depth clamp)                 DONE ()
+  GL_ARB_sync (Fence objects)                           DONE ()
   GLX_ARB_create_context_profile                        DONE
 
 
@@ -237,7 +237,7 @@ GLES3.1, GLSL ES 3.1
   GL_ARB_shading_language_packing                       DONE (all drivers)
   GL_ARB_separate_shader_objects                        DONE (all drivers)
   GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  Multisample textures (GL_ARB_texture_multisample)     DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_texture_multisample (Multisample textures)     DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_storage_multisample                    DONE (all drivers that support GL_ARB_texture_multisample)
   GL_ARB_vertex_attrib_binding                          DONE (all drivers)
   GS5 Enhanced textureGather                            DONE (i965, nvc0, r600, radeonsi)

From 4ab2ac334921ae9bbd1791adaf8977fccf744580 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 15 Mar 2016 21:49:54 +0100
Subject: [PATCH 019/197] radeonsi: fix Hyper-Z hangs on P2 configs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cc: 11.1 11.2 <mesa-stable@lists.freedesktop.org>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeon/r600_texture.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 1ad70765781..c573b438b01 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -638,8 +638,14 @@ static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen,
 	    rscreen->info.drm_major == 2 && rscreen->info.drm_minor < 38)
 		return 0;
 
-	/* Overalign HTILE on Stoney to fix piglit/depthstencil-render-miplevels 585. */
-	if (rscreen->family == CHIP_STONEY)
+	/* Overalign HTILE on P2 configs to work around GPU hangs in
+	 * piglit/depthstencil-render-miplevels 585.
+	 *
+	 * This has been confirmed to help Kabini & Stoney, where the hangs
+	 * are always reproducible. I think I have seen the test hang
+	 * on Carrizo too, though it was very rare there.
+	 */
+	if (rscreen->chip_class >= CIK && num_pipes < 4)
 		num_pipes = 4;
 
 	switch (num_pipes) {

From c0ae6eeb3b0ea42344cc91cd0caa7bd0296172d4 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 24 Nov 2015 12:56:45 +1100
Subject: [PATCH 020/197] glsl: pass disable_varying_packing bool to the
 lowering pass

This will allow us to choose to ignore the disable which will be
useful for more fine grained control over when to enable or disable
packing.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/compiler/glsl/ir_optimization.h         |  3 ++-
 src/compiler/glsl/link_varyings.cpp         | 18 +++++++++---------
 src/compiler/glsl/lower_packed_varyings.cpp | 18 +++++++++++++-----
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/src/compiler/glsl/ir_optimization.h b/src/compiler/glsl/ir_optimization.h
index b56413a1500..30c95f4772a 100644
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -124,7 +124,8 @@ void lower_shared_reference(struct gl_shader *shader, unsigned *shared_size);
 void lower_ubo_reference(struct gl_shader *shader);
 void lower_packed_varyings(void *mem_ctx,
                            unsigned locations_used, ir_variable_mode mode,
-                           unsigned gs_input_vertices, gl_shader *shader);
+                           unsigned gs_input_vertices, gl_shader *shader,
+                           bool disable_varying_packing);
 bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index);
 bool lower_vector_derefs(gl_shader *shader);
 void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader);
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 34eb848a9c1..806191bd404 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -1784,15 +1784,15 @@ assign_varying_locations(struct gl_context *ctx,
                                               ir_var_shader_in);
    }
 
-   if (!disable_varying_packing) {
-      if (producer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                               0, producer);
-      }
-      if (consumer) {
-         lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
-                               consumer_vertices, consumer);
-      }
+   if (producer) {
+      lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
+                            0, producer, disable_varying_packing);
+   }
+
+   if (consumer) {
+      lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
+                            consumer_vertices, consumer,
+                            disable_varying_packing);
    }
 
    return true;
diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp
index 8d1eb1725d5..d91aa22c2a4 100644
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -168,7 +168,8 @@ public:
                                  ir_variable_mode mode,
                                  unsigned gs_input_vertices,
                                  exec_list *out_instructions,
-                                 exec_list *out_variables);
+                                 exec_list *out_variables,
+                                 bool disable_varying_packing);
 
    void run(struct gl_shader *shader);
 
@@ -231,6 +232,8 @@ private:
     * Exec list into which the visitor should insert any new variables.
     */
    exec_list *out_variables;
+
+   bool disable_varying_packing;
 };
 
 } /* anonymous namespace */
@@ -238,7 +241,7 @@ private:
 lower_packed_varyings_visitor::lower_packed_varyings_visitor(
       void *mem_ctx, unsigned locations_used, ir_variable_mode mode,
       unsigned gs_input_vertices, exec_list *out_instructions,
-      exec_list *out_variables)
+      exec_list *out_variables, bool disable_varying_packing)
    : mem_ctx(mem_ctx),
      locations_used(locations_used),
      packed_varyings((ir_variable **)
@@ -247,7 +250,8 @@ lower_packed_varyings_visitor::lower_packed_varyings_visitor(
      mode(mode),
      gs_input_vertices(gs_input_vertices),
      out_instructions(out_instructions),
-     out_variables(out_variables)
+     out_variables(out_variables),
+     disable_varying_packing(disable_varying_packing)
 {
 }
 
@@ -656,6 +660,9 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
    if (var->data.explicit_location)
       return false;
 
+   if (disable_varying_packing)
+      return false;
+
    const glsl_type *type = var->type->without_array();
    if (type->vector_elements == 4 && !type->is_double())
       return false;
@@ -709,7 +716,7 @@ lower_packed_varyings_gs_splicer::visit_leave(ir_emit_vertex *ev)
 void
 lower_packed_varyings(void *mem_ctx, unsigned locations_used,
                       ir_variable_mode mode, unsigned gs_input_vertices,
-                      gl_shader *shader)
+                      gl_shader *shader, bool disable_varying_packing)
 {
    exec_list *instructions = shader->ir;
    ir_function *main_func = shader->symbols->get_function("main");
@@ -720,7 +727,8 @@ lower_packed_varyings(void *mem_ctx, unsigned locations_used,
    lower_packed_varyings_visitor visitor(mem_ctx, locations_used, mode,
                                          gs_input_vertices,
                                          &new_instructions,
-                                         &new_variables);
+                                         &new_variables,
+                                         disable_varying_packing);
    visitor.run(shader);
    if (mode == ir_var_shader_out) {
       if (shader->Stage == MESA_SHADER_GEOMETRY) {

From d6b9202873f015174592e32f3325d00c57153d2d Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 29 Feb 2016 11:46:37 +1100
Subject: [PATCH 021/197] glsl: disable varying packing when its not safe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In GL 4.4+ there is no guarantee that interpolation qualifiers will
match between stages so we cannot safely pack varyings using the
current packing pass in Mesa.

We also disable packing on outward facing interfaces for SSO
because in ES we need to retain the unpacked varying information
for draw time validation. For desktop GL we could allow packing for
SSO in versions < 4.4 but it's just safer not to do so.

We do however enable packing on individual arrays, structs, and
matrices as these are required by the transform feedback code and it
is still safe to do so.

Finally we also enable packing when a varying is only used for
transform feedback and it's not an SSO.

This fixes all remaining rendering issues with the dEQP SSO tests;
the only issues remaining with those tests are to do with validation.

Note: There is still one remaining SSO bug that this patch doesn't fix.
There is a chance that VS -> TCS will have mismatching interfaces
because we pack VS output in case it's used by transform feedback but
don't pack TCS input for performance reasons. This patch will make the
situation better but doesn't fix it.

V4: fix out of order function params after rebase, make sure packing
still disabled in tess stages. Update comments as to why we disable
packing on SSO.

V3: ES 3.1 *does* require interpolation to match so don't disable
packing there. Rebased on master rather than on enhanced layouts
component packing series.

V2: Make is_varying_packing_safe() a function in the varying_matches
class, fix spelling (Matt) and make sure to remove the outer array
when dealing with Geom and Tess shaders where appropriate.
Lastly fix piglit regression in new piglit test and document the
undefined behaviour it depends on:
arb_separate_shader_objects/execution/vs-gs-linking.shader_test

Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
---
 src/compiler/glsl/ir.h                      |   7 +
 src/compiler/glsl/ir_optimization.h         |   2 +-
 src/compiler/glsl/link_varyings.cpp         | 196 +++++++++++++++-----
 src/compiler/glsl/lower_packed_varyings.cpp |  28 ++-
 4 files changed, 180 insertions(+), 53 deletions(-)

diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index f4519679ff3..b74d68a605b 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -719,6 +719,13 @@ public:
        */
       unsigned is_unmatched_generic_inout:1;
 
+      /**
+       * Is this varying used only by transform feedback?
+       *
+       * This is used by the linker to decide if its safe to pack the varying.
+       */
+      unsigned is_xfb_only:1;
+
       /**
        * If non-zero, then this variable may be packed along with other variables
        * into a single varying slot, so this offset should be applied when
diff --git a/src/compiler/glsl/ir_optimization.h b/src/compiler/glsl/ir_optimization.h
index 30c95f4772a..2d773760f90 100644
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -125,7 +125,7 @@ void lower_ubo_reference(struct gl_shader *shader);
 void lower_packed_varyings(void *mem_ctx,
                            unsigned locations_used, ir_variable_mode mode,
                            unsigned gs_input_vertices, gl_shader *shader,
-                           bool disable_varying_packing);
+                           bool disable_varying_packing, bool xfb_enabled);
 bool lower_vector_insert(exec_list *instructions, bool lower_nonconstant_index);
 bool lower_vector_derefs(gl_shader *shader);
 void lower_named_interface_blocks(void *mem_ctx, gl_shader *shader);
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 806191bd404..44fc8f617f8 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -826,7 +826,7 @@ namespace {
 class varying_matches
 {
 public:
-   varying_matches(bool disable_varying_packing,
+   varying_matches(bool disable_varying_packing, bool xfb_enabled,
                    gl_shader_stage producer_stage,
                    gl_shader_stage consumer_stage);
    ~varying_matches();
@@ -836,13 +836,29 @@ public:
    void store_locations() const;
 
 private:
+   bool is_varying_packing_safe(const glsl_type *type,
+                                const ir_variable *var);
+
    /**
     * If true, this driver disables varying packing, so all varyings need to
     * be aligned on slot boundaries, and take up a number of slots equal to
     * their number of matrix columns times their array size.
+    *
+    * Packing may also be disabled because our current packing method is not
+    * safe in SSO or versions of OpenGL where interpolation qualifiers are not
+    * guaranteed to match across stages.
     */
    const bool disable_varying_packing;
 
+   /**
+    * If true, this driver has transform feedback enabled. The transform
+    * feedback code requires at least some packing be done even when varying
+    * packing is disabled. Fortunately, where transform feedback requires
+    * packing, it's safe to override the disabled setting. See
+    * is_varying_packing_safe().
+    */
+   const bool xfb_enabled;
+
    /**
     * Enum representing the order in which varyings are packed within a
     * packing class.
@@ -862,6 +878,7 @@ private:
    static unsigned compute_packing_class(const ir_variable *var);
    static packing_order_enum compute_packing_order(const ir_variable *var);
    static int match_comparator(const void *x_generic, const void *y_generic);
+   static int xfb_comparator(const void *x_generic, const void *y_generic);
 
    /**
     * Structure recording the relationship between a single producer output
@@ -917,9 +934,11 @@ private:
 } /* anonymous namespace */
 
 varying_matches::varying_matches(bool disable_varying_packing,
+                                 bool xfb_enabled,
                                  gl_shader_stage producer_stage,
                                  gl_shader_stage consumer_stage)
    : disable_varying_packing(disable_varying_packing),
+     xfb_enabled(xfb_enabled),
      producer_stage(producer_stage),
      consumer_stage(consumer_stage)
 {
@@ -941,6 +960,24 @@ varying_matches::~varying_matches()
 }
 
 
+/**
+ * Packing is always safe on individual arrays, structures and matrices. It is
+ * also safe if the varying is only used for transform feedback.
+ */
+bool
+varying_matches::is_varying_packing_safe(const glsl_type *type,
+                                         const ir_variable *var)
+{
+   if (consumer_stage == MESA_SHADER_TESS_EVAL ||
+       consumer_stage == MESA_SHADER_TESS_CTRL ||
+       producer_stage == MESA_SHADER_TESS_CTRL)
+      return false;
+
+   return xfb_enabled && (type->is_array() || type->is_record() ||
+                          type->is_matrix() || var->data.is_xfb_only);
+}
+
+
 /**
  * Record the given producer/consumer variable pair in the list of variables
  * that should later be assigned locations.
@@ -1020,7 +1057,7 @@ varying_matches::record(ir_variable *producer_var, ir_variable *consumer_var)
       = this->compute_packing_class(var);
    this->matches[this->num_matches].packing_order
       = this->compute_packing_order(var);
-   if (this->disable_varying_packing) {
+   if (this->disable_varying_packing && !is_varying_packing_safe(type, var)) {
       unsigned slots = type->count_attribute_slots(false);
       this->matches[this->num_matches].num_components = slots * 4;
    } else {
@@ -1046,37 +1083,28 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
                                   uint64_t reserved_slots,
                                   bool separate_shader)
 {
-   /* We disable varying sorting for separate shader programs for the
-    * following reasons:
-    *
-    * 1/ All programs must sort the code in the same order to guarantee the
-    *    interface matching. However varying_matches::record() will change the
-    *    interpolation qualifier of some stages.
-    *
-    * 2/ GLSL version 4.50 removes the matching constrain on the interpolation
-    *    qualifier.
-    *
-    * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec:
-    *
-    *    "The type and presence of interpolation qualifiers of variables with
-    *    the same name declared in all linked shaders for the same cross-stage
-    *    interface must match, otherwise the link command will fail.
-    *
-    *    When comparing an output from one stage to an input of a subsequent
-    *    stage, the input and output don't match if their interpolation
-    *    qualifiers (or lack thereof) are not the same."
-    *
-    *    "It is a link-time error if, within the same stage, the interpolation
-    *    qualifiers of variables of the same name do not match."
+   /* If packing has been disabled then we cannot safely sort the varyings by
+    * class as it may mean we are using a version of OpenGL where
+    * interpolation qualifiers are not guaranteed to be matching across
+    * shaders; sorting in this case could result in mismatching shader
+    * interfaces.
+    * When packing is disabled the sort orders varyings used by transform
+    * feedback first, but also depends on *undefined behaviour* of qsort to
+    * reverse the order of the varyings. See: xfb_comparator().
     */
-   if (!separate_shader) {
+   if (!this->disable_varying_packing) {
       /* Sort varying matches into an order that makes them easy to pack. */
       qsort(this->matches, this->num_matches, sizeof(*this->matches),
             &varying_matches::match_comparator);
+   } else {
+      /* Only sort varyings that are only used by transform feedback. */
+      qsort(this->matches, this->num_matches, sizeof(*this->matches),
+            &varying_matches::xfb_comparator);
    }
 
    unsigned generic_location = 0;
    unsigned generic_patch_location = MAX_VARYING*4;
+   bool previous_var_xfb_only = false;
 
    for (unsigned i = 0; i < this->num_matches; i++) {
       unsigned *location = &generic_location;
@@ -1100,16 +1128,30 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
       /* Advance to the next slot if this varying has a different packing
        * class than the previous one, and we're not already on a slot
        * boundary.
+       *
+       * Also advance to the next slot if packing is disabled. This makes sure
+       * we don't assign varyings the same locations which is possible
+       * because we still pack individual arrays, records and matrices even
+       * when packing is disabled. Note we don't advance to the next slot if
+       * we can pack varyings together that are only used for transform
+       * feedback.
        */
-      if (i > 0 &&
-          this->matches[i - 1].packing_class
-          != this->matches[i].packing_class) {
+      if ((this->disable_varying_packing &&
+           !(previous_var_xfb_only && var->data.is_xfb_only)) ||
+          (i > 0 && this->matches[i - 1].packing_class
+          != this->matches[i].packing_class )) {
          *location = ALIGN(*location, 4);
       }
 
+      previous_var_xfb_only = var->data.is_xfb_only;
+
       unsigned num_elements =  type->count_attribute_slots(is_vertex_input);
-      unsigned slot_end = this->disable_varying_packing ? 4 :
-         type->without_array()->vector_elements;
+      unsigned slot_end;
+      if (this->disable_varying_packing &&
+          !is_varying_packing_safe(type, var))
+         slot_end = 4;
+      else
+         slot_end = type->without_array()->vector_elements;
       slot_end += *location - 1;
 
       /* FIXME: We could be smarter in the below code and loop back over
@@ -1133,7 +1175,8 @@ varying_matches::assign_locations(struct gl_shader_program *prog,
          /* Increase the slot to make sure there is enough room for next
           * array element.
           */
-         if (this->disable_varying_packing)
+         if (this->disable_varying_packing &&
+             !is_varying_packing_safe(type, var))
             slot_end += 4;
          else
             slot_end += type->without_array()->vector_elements;
@@ -1258,6 +1301,32 @@ varying_matches::match_comparator(const void *x_generic, const void *y_generic)
 }
 
 
+/**
+ * Comparison function passed to qsort() to sort varyings used only by
+ * transform feedback when packing of other varyings is disabled.
+ */
+int
+varying_matches::xfb_comparator(const void *x_generic, const void *y_generic)
+{
+   const match *x = (const match *) x_generic;
+
+   if (x->producer_var != NULL && x->producer_var->data.is_xfb_only)
+         return match_comparator(x_generic, y_generic);
+
+   /* FIXME: When the comparator returns 0 it means the elements being
+    * compared are equivalent. However the qsort documentation says:
+    *
+    *    "The order of equivalent elements is undefined."
+    *
+    * In practice the sort ends up reversing the order of the varyings which
+    * means locations are also assigned in this reversed order and happens to
+    * be what we want. This is also what's happening in
+    * varying_matches::match_comparator().
+    */
+   return 0;
+}
+
+
 /**
  * Is the given variable a varying variable to be counted against the
  * limit in ctx->Const.MaxVarying?
@@ -1573,26 +1642,60 @@ assign_varying_locations(struct gl_context *ctx,
                          unsigned num_tfeedback_decls,
                          tfeedback_decl *tfeedback_decls)
 {
-   if (ctx->Const.DisableVaryingPacking) {
-      /* Transform feedback code assumes varyings are packed, so if the driver
-       * has disabled varying packing, make sure it does not support transform
-       * feedback.
-       */
-      assert(!ctx->Extensions.EXT_transform_feedback);
-   }
-
    /* Tessellation shaders treat inputs and outputs as shared memory and can
     * access inputs and outputs of other invocations.
     * Therefore, they can't be lowered to temps easily (and definitely not
     * efficiently).
     */
-   bool disable_varying_packing =
-      ctx->Const.DisableVaryingPacking ||
+   bool unpackable_tess =
       (consumer && consumer->Stage == MESA_SHADER_TESS_EVAL) ||
       (consumer && consumer->Stage == MESA_SHADER_TESS_CTRL) ||
       (producer && producer->Stage == MESA_SHADER_TESS_CTRL);
 
-   varying_matches matches(disable_varying_packing,
+   /* Transform feedback code assumes varying arrays are packed, so if the
+    * driver has disabled varying packing, make sure to at least enable
+    * packing required by transform feedback.
+    */
+   bool xfb_enabled =
+      ctx->Extensions.EXT_transform_feedback && !unpackable_tess;
+
+   /* Disable varying packing for GL 4.4+ as there is no guarantee
+    * that interpolation qualifiers will match between shaders in these
+    * versions. We also disable packing on outward facing interfaces for
+    * SSO because in ES we need to retain the unpacked varying information
+    * for draw time validation. For desktop GL we could allow packing for
+    * versions < 4.4 but it's just safer not to do packing.
+    *
+    * Packing is still enabled on individual arrays, structs, and matrices as
+    * these are required by the transform feedback code and it is still safe
+    * to do so. We also enable packing when a varying is only used for
+    * transform feedback and it's not an SSO.
+    *
+    * Varying packing currently only packs together varyings with matching
+    * interpolation qualifiers as the backends assume all packed components
+    * are to be processed in the same way. Therefore we cannot do packing in
+    * these versions of GL without the risk of mismatching interfaces.
+    *
+    * From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.30 spec:
+    *
+    *    "The type and presence of interpolation qualifiers of variables with
+    *    the same name declared in all linked shaders for the same cross-stage
+    *    interface must match, otherwise the link command will fail.
+    *
+    *    When comparing an output from one stage to an input of a subsequent
+    *    stage, the input and output don't match if their interpolation
+    *    qualifiers (or lack thereof) are not the same."
+    *
+    * This text was also in at least revision 7 of the 4.40 spec but is no
+    * longer in revision 9 and not in the 4.50 spec.
+    */
+   bool disable_varying_packing =
+      ctx->Const.DisableVaryingPacking || unpackable_tess;
+   if ((ctx->API == API_OPENGL_CORE && ctx->Version >= 44) ||
+       (prog->SeparateShader && (producer == NULL || consumer == NULL)))
+      disable_varying_packing = true;
+
+   varying_matches matches(disable_varying_packing, xfb_enabled,
                            producer ? producer->Stage : (gl_shader_stage)-1,
                            consumer ? consumer->Stage : (gl_shader_stage)-1);
    hash_table *tfeedback_candidates
@@ -1711,8 +1814,10 @@ assign_varying_locations(struct gl_context *ctx,
          return false;
       }
 
-      if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout)
+      if (matched_candidate->toplevel_var->data.is_unmatched_generic_inout) {
+         matched_candidate->toplevel_var->data.is_xfb_only = 1;
          matches.record(matched_candidate->toplevel_var, NULL);
+      }
    }
 
    const uint64_t reserved_slots =
@@ -1786,13 +1891,14 @@ assign_varying_locations(struct gl_context *ctx,
 
    if (producer) {
       lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_out,
-                            0, producer, disable_varying_packing);
+                            0, producer, disable_varying_packing,
+                            xfb_enabled);
    }
 
    if (consumer) {
       lower_packed_varyings(mem_ctx, slots_used, ir_var_shader_in,
                             consumer_vertices, consumer,
-                            disable_varying_packing);
+                            disable_varying_packing, xfb_enabled);
    }
 
    return true;
diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp
index d91aa22c2a4..825cc9ee8cd 100644
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -169,7 +169,8 @@ public:
                                  unsigned gs_input_vertices,
                                  exec_list *out_instructions,
                                  exec_list *out_variables,
-                                 bool disable_varying_packing);
+                                 bool disable_varying_packing,
+                                 bool xfb_enabled);
 
    void run(struct gl_shader *shader);
 
@@ -234,6 +235,7 @@ private:
    exec_list *out_variables;
 
    bool disable_varying_packing;
+   bool xfb_enabled;
 };
 
 } /* anonymous namespace */
@@ -241,7 +243,8 @@ private:
 lower_packed_varyings_visitor::lower_packed_varyings_visitor(
       void *mem_ctx, unsigned locations_used, ir_variable_mode mode,
       unsigned gs_input_vertices, exec_list *out_instructions,
-      exec_list *out_variables, bool disable_varying_packing)
+      exec_list *out_variables, bool disable_varying_packing,
+      bool xfb_enabled)
    : mem_ctx(mem_ctx),
      locations_used(locations_used),
      packed_varyings((ir_variable **)
@@ -251,7 +254,8 @@ lower_packed_varyings_visitor::lower_packed_varyings_visitor(
      gs_input_vertices(gs_input_vertices),
      out_instructions(out_instructions),
      out_variables(out_variables),
-     disable_varying_packing(disable_varying_packing)
+     disable_varying_packing(disable_varying_packing),
+     xfb_enabled(xfb_enabled)
 {
 }
 
@@ -660,10 +664,18 @@ lower_packed_varyings_visitor::needs_lowering(ir_variable *var)
    if (var->data.explicit_location)
       return false;
 
-   if (disable_varying_packing)
+   /* Override disable_varying_packing if the var is only used by transform
+    * feedback. Also override it if transform feedback is enabled and the
+    * variable is an array, struct or matrix as the elements of these types
+    * will always have the same interpolation and therefore are safe to pack.
+    */
+   const glsl_type *type = var->type;
+   if (disable_varying_packing && !var->data.is_xfb_only &&
+       !((type->is_array() || type->is_record() || type->is_matrix()) &&
+         xfb_enabled))
       return false;
 
-   const glsl_type *type = var->type->without_array();
+   type = type->without_array();
    if (type->vector_elements == 4 && !type->is_double())
       return false;
    return true;
@@ -716,7 +728,8 @@ lower_packed_varyings_gs_splicer::visit_leave(ir_emit_vertex *ev)
 void
 lower_packed_varyings(void *mem_ctx, unsigned locations_used,
                       ir_variable_mode mode, unsigned gs_input_vertices,
-                      gl_shader *shader, bool disable_varying_packing)
+                      gl_shader *shader, bool disable_varying_packing,
+                      bool xfb_enabled)
 {
    exec_list *instructions = shader->ir;
    ir_function *main_func = shader->symbols->get_function("main");
@@ -728,7 +741,8 @@ lower_packed_varyings(void *mem_ctx, unsigned locations_used,
                                          gs_input_vertices,
                                          &new_instructions,
                                          &new_variables,
-                                         disable_varying_packing);
+                                         disable_varying_packing,
+                                         xfb_enabled);
    visitor.run(shader);
    if (mode == ir_var_shader_out) {
       if (shader->Stage == MESA_SHADER_GEOMETRY) {

From 49eb5e75bdc54d230c87669dacf24efb8217e756 Mon Sep 17 00:00:00 2001
From: Dongwon Kim <dongwon.kim@intel.com>
Date: Tue, 16 Feb 2016 10:05:24 -0800
Subject: [PATCH 022/197] configure.ac: enable_asm=yes when x-compiling across
 same X86 arch

Currently, the configure script forces 'enable_asm' to 'no'
whenever cross-compilation is performed on an X86 host. This is
based on the assumption that the target architecture differs
from the host's (e.g. ARM). However, we may also cross-compile
for a target that is X86 based just like the host, in which case
the same ASM code is supported. 'enable_asm' should no longer be
forced to "no" in that case.

v2: corrected commit message

Reviewed-by: Matt Turner <mattst88@gmail.com>
Signed-off-by: Dongwon Kim <dongwon.kim@intel.com>
---
 configure.ac | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/configure.ac b/configure.ac
index 31703b58301..1ece6fa4e16 100644
--- a/configure.ac
+++ b/configure.ac
@@ -704,8 +704,10 @@ test "x$enable_asm" = xno && AC_MSG_RESULT([no])
 if test "x$enable_asm" = xyes -a "x$cross_compiling" = xyes; then
     case "$host_cpu" in
     i?86 | x86_64 | amd64)
-        enable_asm=no
-        AC_MSG_RESULT([no, cross compiling])
+        if test "x$host_cpu" != "x$target_cpu"; then
+            enable_asm=no
+            AC_MSG_RESULT([no, cross compiling])
+        fi
         ;;
     esac
 fi

From a8eea696b8966d119e213d532158f63c5b280740 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Mon, 14 Mar 2016 15:33:34 -0500
Subject: [PATCH 023/197] st/mesa: honour sized internal formats in
 st_choose_format (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bitcasting which is possible with shader images (and texture views?)
requires that when the user specifies a sized internal format for a
texture, we really allocate that format. To this end:

(1) find_exact_format should ignore sized internal formats and

(2) some of the entries in the mapping table corresponding to sized
    internal formats are reordered to use an RGBA format instead of
    a BGRA one.

This fixes arb_shader_image_load_store-bitcast in the (work in progress)
ARB_shader_image_load_store implementation for radeonsi.

v2: don't change the mapping of GL_RGB10: the change caused a regression
    because it preferred a format with an alpha channel, and GL_RGB10
    is not among the supported formats for shader images

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_format.c | 27 ++++++---------------------
 1 file changed, 6 insertions(+), 21 deletions(-)

diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 5392c23ec00..161c7678236 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -1114,12 +1114,12 @@ static const struct format_mapping format_map[] = {
    },
    {
       { GL_RGB10_A2, 0 },
-      { PIPE_FORMAT_B10G10R10A2_UNORM, PIPE_FORMAT_R10G10B10A2_UNORM,
+      { PIPE_FORMAT_R10G10B10A2_UNORM, PIPE_FORMAT_B10G10R10A2_UNORM,
         DEFAULT_RGBA_FORMATS }
    },
    {
       { 4, GL_RGBA, GL_RGBA8, 0 },
-      { DEFAULT_RGBA_FORMATS }
+      { PIPE_FORMAT_R8G8B8A8_UNORM, DEFAULT_RGBA_FORMATS }
    },
    {
       { GL_BGRA, 0 },
@@ -1127,7 +1127,7 @@ static const struct format_mapping format_map[] = {
    },
    {
       { 3, GL_RGB, GL_RGB8, 0 },
-      { DEFAULT_RGB_FORMATS }
+      { PIPE_FORMAT_R8G8B8X8_UNORM, DEFAULT_RGB_FORMATS }
    },
    {
       { GL_RGB12, GL_RGB16, 0 },
@@ -2022,20 +2022,10 @@ static const struct exact_format_mapping rgbx8888_tbl[] =
    { 0,           0,                              0                          }
 };
 
-static const struct exact_format_mapping rgba1010102_tbl[] =
-{
-   { GL_BGRA,     GL_UNSIGNED_INT_2_10_10_10_REV, PIPE_FORMAT_B10G10R10A2_UNORM },
-   /* No Mesa formats for these Gallium formats:
-   { GL_RGBA,     GL_UNSIGNED_INT_2_10_10_10_REV, PIPE_FORMAT_R10G10B10A2_UNORM },
-   { GL_ABGR_EXT, GL_UNSIGNED_INT_10_10_10_2,     PIPE_FORMAT_R10G10B10A2_UNORM },
-   { GL_ABGR_EXT, GL_UNSIGNED_INT,                PIPE_FORMAT_R10G10B10A2_UNORM },
-   */
-   { 0,           0,                              0                             }
-};
-
 /**
- * If there is an exact pipe_format match for {internalFormat, format, type}
- * return that, otherwise return PIPE_FORMAT_NONE so we can do fuzzy matching.
+ * For unsized/base internal formats, we may choose a convenient effective
+ * internal format for {format, type}. If one exists, return that, otherwise
+ * return PIPE_FORMAT_NONE.
  */
 static enum pipe_format
 find_exact_format(GLint internalFormat, GLenum format, GLenum type)
@@ -2049,17 +2039,12 @@ find_exact_format(GLint internalFormat, GLenum format, GLenum type)
    switch (internalFormat) {
    case 4:
    case GL_RGBA:
-   case GL_RGBA8:
       tbl = rgba8888_tbl;
       break;
    case 3:
    case GL_RGB:
-   case GL_RGB8:
       tbl = rgbx8888_tbl;
       break;
-   case GL_RGB10_A2:
-      tbl = rgba1010102_tbl;
-      break;
    default:
       return PIPE_FORMAT_NONE;
    }

From 7794b22a84166193c59f86dc33bde95c180edee8 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 18 Mar 2016 11:09:32 +1100
Subject: [PATCH 024/197] mesa: remove unused function

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/program/prog_parameter.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index c04d7a2e634..acc5b6c317b 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -99,12 +99,6 @@ _mesa_new_parameter_list_sized(unsigned size);
 extern void
 _mesa_free_parameter_list(struct gl_program_parameter_list *paramList);
 
-static inline GLuint
-_mesa_num_parameters(const struct gl_program_parameter_list *list)
-{
-   return list ? list->NumParameters : 0;
-}
-
 extern void
 _mesa_reserve_parameter_storage(struct gl_program_parameter_list *paramList,
                                 unsigned reserve_slots);

From 350b1ef027167af12156df92f449f370a0f8d396 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 18 Mar 2016 11:10:06 +1100
Subject: [PATCH 025/197] mesa: make _mesa_lookup_parameter_constant static

This is not used outside of prog_parameter.c

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/program/prog_parameter.c | 189 +++++++++++++++---------------
 src/mesa/program/prog_parameter.h |   5 -
 2 files changed, 94 insertions(+), 100 deletions(-)

diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 34183d4d95f..19b57ee1bc1 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -37,6 +37,99 @@
 #include "prog_statevars.h"
 
 
+/**
+ * Look for a float vector in the given parameter list.  The float vector
+ * may be of length 1, 2, 3 or 4.  If swizzleOut is non-null, we'll try
+ * swizzling to find a match.
+ * \param list  the parameter list to search
+ * \param v  the float vector to search for
+ * \param vSize  number of elements in v
+ * \param posOut  returns the position of the constant, if found
+ * \param swizzleOut  returns a swizzle mask describing location of the
+ *                    vector elements if found.
+ * \return GL_TRUE if found, GL_FALSE if not found
+ */
+static GLboolean
+lookup_parameter_constant(const struct gl_program_parameter_list *list,
+                          const gl_constant_value v[], GLuint vSize,
+                          GLint *posOut, GLuint *swizzleOut)
+{
+   GLuint i;
+
+   assert(vSize >= 1);
+   assert(vSize <= 4);
+
+   if (!list) {
+      *posOut = -1;
+      return GL_FALSE;
+   }
+
+   for (i = 0; i < list->NumParameters; i++) {
+      if (list->Parameters[i].Type == PROGRAM_CONSTANT) {
+         if (!swizzleOut) {
+            /* swizzle not allowed */
+            GLuint j, match = 0;
+            for (j = 0; j < vSize; j++) {
+               if (v[j].u == list->ParameterValues[i][j].u)
+                  match++;
+            }
+            if (match == vSize) {
+               *posOut = i;
+               return GL_TRUE;
+            }
+         }
+         else {
+            /* try matching w/ swizzle */
+             if (vSize == 1) {
+                /* look for v[0] anywhere within float[4] value */
+                GLuint j;
+                for (j = 0; j < list->Parameters[i].Size; j++) {
+                   if (list->ParameterValues[i][j].u == v[0].u) {
+                      /* found it */
+                      *posOut = i;
+                      *swizzleOut = MAKE_SWIZZLE4(j, j, j, j);
+                      return GL_TRUE;
+                   }
+                }
+             }
+             else if (vSize <= list->Parameters[i].Size) {
+                /* see if we can match this constant (with a swizzle) */
+                GLuint swz[4];
+                GLuint match = 0, j, k;
+                for (j = 0; j < vSize; j++) {
+                   if (v[j].u == list->ParameterValues[i][j].u) {
+                      swz[j] = j;
+                      match++;
+                   }
+                   else {
+                      for (k = 0; k < list->Parameters[i].Size; k++) {
+                         if (v[j].u == list->ParameterValues[i][k].u) {
+                            swz[j] = k;
+                            match++;
+                            break;
+                         }
+                      }
+                   }
+                }
+                /* smear last value to remaining positions */
+                for (; j < 4; j++)
+                   swz[j] = swz[j-1];
+
+                if (match == vSize) {
+                   *posOut = i;
+                   *swizzleOut = MAKE_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]);
+                   return GL_TRUE;
+                }
+             }
+         }
+      }
+   }
+
+   *posOut = -1;
+   return GL_FALSE;
+}
+
+
 struct gl_program_parameter_list *
 _mesa_new_parameter_list(void)
 {
@@ -228,8 +321,7 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList,
    assert(size <= 4);
 
    if (swizzleOut &&
-       _mesa_lookup_parameter_constant(paramList, values,
-                                       size, &pos, swizzleOut)) {
+       lookup_parameter_constant(paramList, values, size, &pos, swizzleOut)) {
       return pos;
    }
 
@@ -361,96 +453,3 @@ _mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList,
    }
    return -1;
 }
-
-
-/**
- * Look for a float vector in the given parameter list.  The float vector
- * may be of length 1, 2, 3 or 4.  If swizzleOut is non-null, we'll try
- * swizzling to find a match.
- * \param list  the parameter list to search
- * \param v  the float vector to search for
- * \param vSize  number of element in v
- * \param posOut  returns the position of the constant, if found
- * \param swizzleOut  returns a swizzle mask describing location of the
- *                    vector elements if found.
- * \return GL_TRUE if found, GL_FALSE if not found
- */
-GLboolean
-_mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list,
-                                const gl_constant_value v[], GLuint vSize,
-                                GLint *posOut, GLuint *swizzleOut)
-{
-   GLuint i;
-
-   assert(vSize >= 1);
-   assert(vSize <= 4);
-
-   if (!list) {
-      *posOut = -1;
-      return GL_FALSE;
-   }
-
-   for (i = 0; i < list->NumParameters; i++) {
-      if (list->Parameters[i].Type == PROGRAM_CONSTANT) {
-         if (!swizzleOut) {
-            /* swizzle not allowed */
-            GLuint j, match = 0;
-            for (j = 0; j < vSize; j++) {
-               if (v[j].u == list->ParameterValues[i][j].u)
-                  match++;
-            }
-            if (match == vSize) {
-               *posOut = i;
-               return GL_TRUE;
-            }
-         }
-         else {
-            /* try matching w/ swizzle */
-             if (vSize == 1) {
-                /* look for v[0] anywhere within float[4] value */
-                GLuint j;
-                for (j = 0; j < list->Parameters[i].Size; j++) {
-                   if (list->ParameterValues[i][j].u == v[0].u) {
-                      /* found it */
-                      *posOut = i;
-                      *swizzleOut = MAKE_SWIZZLE4(j, j, j, j);
-                      return GL_TRUE;
-                   }
-                }
-             }
-             else if (vSize <= list->Parameters[i].Size) {
-                /* see if we can match this constant (with a swizzle) */
-                GLuint swz[4];
-                GLuint match = 0, j, k;
-                for (j = 0; j < vSize; j++) {
-                   if (v[j].u == list->ParameterValues[i][j].u) {
-                      swz[j] = j;
-                      match++;
-                   }
-                   else {
-                      for (k = 0; k < list->Parameters[i].Size; k++) {
-                         if (v[j].u == list->ParameterValues[i][k].u) {
-                            swz[j] = k;
-                            match++;
-                            break;
-                         }
-                      }
-                   }
-                }
-                /* smear last value to remaining positions */
-                for (; j < 4; j++)
-                   swz[j] = swz[j-1];
-
-                if (match == vSize) {
-                   *posOut = i;
-                   *swizzleOut = MAKE_SWIZZLE4(swz[0], swz[1], swz[2], swz[3]);
-                   return GL_TRUE;
-                }
-             }
-         }
-      }
-   }
-
-   *posOut = -1;
-   return GL_FALSE;
-}
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index acc5b6c317b..c17d703040d 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -128,11 +128,6 @@ extern GLint
 _mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList,
                              GLsizei nameLen, const char *name);
 
-extern GLboolean
-_mesa_lookup_parameter_constant(const struct gl_program_parameter_list *list,
-                                const gl_constant_value v[], GLuint vSize,
-                                GLint *posOut, GLuint *swizzleOut);
-
 #ifdef __cplusplus
 }
 #endif

From fa9bd6b663a1c78d5a17e3ad5407ff5530fbb0c9 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 18 Mar 2016 11:21:13 +1100
Subject: [PATCH 026/197] mesa: simplify and inline
 _mesa_lookup_parameter_index()

The function has only one user and strings are always null terminated.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/program/ir_to_mesa.cpp   |  2 +-
 src/mesa/program/prog_parameter.c | 38 -------------------------------
 src/mesa/program/prog_parameter.h | 19 ++++++++++++++--
 3 files changed, 18 insertions(+), 41 deletions(-)

diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 10d931c8b6b..1d9047ee6fd 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2356,7 +2356,7 @@ add_uniform_to_shader::visit_field(const glsl_type *type, const char *name,
       file = PROGRAM_UNIFORM;
    }
 
-   int index = _mesa_lookup_parameter_index(params, -1, name);
+   int index = _mesa_lookup_parameter_index(params, name);
    if (index < 0) {
       index = _mesa_add_parameter(params, file, name, size, type->gl_type,
 				  NULL, NULL);
diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 19b57ee1bc1..25d38353ddd 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -415,41 +415,3 @@ _mesa_add_state_reference(struct gl_program_parameter_list *paramList,
 
    return index;
 }
-
-
-/**
- * Given a program parameter name, find its position in the list of parameters.
- * \param paramList  the parameter list to search
- * \param nameLen  length of name (in chars).
- *                 If length is negative, assume that name is null-terminated.
- * \param name  the name to search for
- * \return index of parameter in the list.
- */
-GLint
-_mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList,
-                             GLsizei nameLen, const char *name)
-{
-   GLint i;
-
-   if (!paramList)
-      return -1;
-
-   if (nameLen == -1) {
-      /* name is null-terminated */
-      for (i = 0; i < (GLint) paramList->NumParameters; i++) {
-         if (paramList->Parameters[i].Name &&
-	     strcmp(paramList->Parameters[i].Name, name) == 0)
-            return i;
-      }
-   }
-   else {
-      /* name is not null-terminated, use nameLen */
-      for (i = 0; i < (GLint) paramList->NumParameters; i++) {
-         if (paramList->Parameters[i].Name &&
-	     strncmp(paramList->Parameters[i].Name, name, nameLen) == 0
-             && strlen(paramList->Parameters[i].Name) == (size_t)nameLen)
-            return i;
-      }
-   }
-   return -1;
-}
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index c17d703040d..b4b24a11af3 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -34,6 +34,7 @@
 #include "main/mtypes.h"
 #include "prog_statevars.h"
 
+#include <string.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -124,9 +125,23 @@ extern GLint
 _mesa_add_state_reference(struct gl_program_parameter_list *paramList,
                           const gl_state_index stateTokens[STATE_LENGTH]);
 
-extern GLint
+
+static inline GLint
 _mesa_lookup_parameter_index(const struct gl_program_parameter_list *paramList,
-                             GLsizei nameLen, const char *name);
+                             const char *name)
+{
+   if (!paramList)
+      return -1;
+
+   /* name must be null-terminated */
+   for (GLint i = 0; i < (GLint) paramList->NumParameters; i++) {
+      if (paramList->Parameters[i].Name &&
+         strcmp(paramList->Parameters[i].Name, name) == 0)
+         return i;
+   }
+
+   return -1;
+}
 
 #ifdef __cplusplus
 }

From ce9c042ab3d1c86837285c4e4d6de07646c1952f Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 18 Mar 2016 11:32:15 +1100
Subject: [PATCH 027/197] mesa: inline _mesa_add_unnamed_constant()

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/program/prog_parameter.c | 22 ----------------------
 src/mesa/program/prog_parameter.h |  8 ++++++--
 2 files changed, 6 insertions(+), 24 deletions(-)

diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 25d38353ddd..470c98eb44e 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -356,28 +356,6 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList,
    return pos;
 }
 
-/**
- * Add a new unnamed constant to the parameter list.  This will be used
- * when a fragment/vertex program contains something like this:
- *    MOV r, { 0, 1, 2, 3 };
- * If swizzleOut is non-null we'll search the parameter list for an
- * existing instance of the constant which matches with a swizzle.
- *
- * \param paramList  the parameter list
- * \param values  four float values
- * \param swizzleOut  returns swizzle mask for accessing the constant
- * \return index/position of the new parameter in the parameter list.
- * \sa _mesa_add_typed_unnamed_constant
- */
-GLint
-_mesa_add_unnamed_constant(struct gl_program_parameter_list *paramList,
-                           const gl_constant_value values[4], GLuint size,
-                           GLuint *swizzleOut)
-{
-   return _mesa_add_typed_unnamed_constant(paramList, values, size, GL_NONE,
-                                           swizzleOut);
-}
-
 
 /**
  * Add a new state reference to the parameter list.
diff --git a/src/mesa/program/prog_parameter.h b/src/mesa/program/prog_parameter.h
index b4b24a11af3..320f64f3f54 100644
--- a/src/mesa/program/prog_parameter.h
+++ b/src/mesa/program/prog_parameter.h
@@ -116,10 +116,14 @@ _mesa_add_typed_unnamed_constant(struct gl_program_parameter_list *paramList,
                            const gl_constant_value values[4], GLuint size,
                            GLenum datatype, GLuint *swizzleOut);
 
-extern GLint
+static inline GLint
 _mesa_add_unnamed_constant(struct gl_program_parameter_list *paramList,
                            const gl_constant_value values[4], GLuint size,
-                           GLuint *swizzleOut);
+                           GLuint *swizzleOut)
+{
+   return _mesa_add_typed_unnamed_constant(paramList, values, size, GL_NONE,
+                                           swizzleOut);
+}
 
 extern GLint
 _mesa_add_state_reference(struct gl_program_parameter_list *paramList,

From 952c166170aaf44af10e7463359e7a3e5e6fe65d Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 18 Mar 2016 12:33:27 +1100
Subject: [PATCH 028/197] mesa: remove remaining tabs in prog_parameter.c

Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/program/prog_parameter.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/mesa/program/prog_parameter.c b/src/mesa/program/prog_parameter.c
index 470c98eb44e..02d84f20cd8 100644
--- a/src/mesa/program/prog_parameter.c
+++ b/src/mesa/program/prog_parameter.c
@@ -147,17 +147,17 @@ _mesa_new_parameter_list_sized(unsigned size)
 
       /* alloc arrays */
       p->Parameters = (struct gl_program_parameter *)
-	 calloc(size, sizeof(struct gl_program_parameter));
+         calloc(size, sizeof(struct gl_program_parameter));
 
       p->ParameterValues = (gl_constant_value (*)[4])
          _mesa_align_malloc(size * 4 *sizeof(gl_constant_value), 16);
 
 
       if ((p->Parameters == NULL) || (p->ParameterValues == NULL)) {
-	 free(p->Parameters);
-	 _mesa_align_free(p->ParameterValues);
-	 free(p);
-	 p = NULL;
+         free(p->Parameters);
+         _mesa_align_free(p->ParameterValues);
+         free(p);
+         p = NULL;
       }
    }
 
@@ -284,7 +284,7 @@ _mesa_add_parameter(struct gl_program_parameter_list *paramList,
          else {
             /* silence valgrind */
             for (j = 0; j < 4; j++)
-            	paramList->ParameterValues[oldNum + i][j].f = 0;
+               paramList->ParameterValues[oldNum + i][j].f = 0;
          }
          size -= 4;
       }
@@ -377,8 +377,8 @@ _mesa_add_state_reference(struct gl_program_parameter_list *paramList,
    /* Check if the state reference is already in the list */
    for (index = 0; index < (GLint) paramList->NumParameters; index++) {
       if (!memcmp(paramList->Parameters[index].StateIndexes,
-		  stateTokens, STATE_LENGTH * sizeof(gl_state_index))) {
-	 return index;
+                  stateTokens, STATE_LENGTH * sizeof(gl_state_index))) {
+         return index;
       }
    }
 

From dd63fa28f14f8ddeeeca1847eb7d38f4e2bc2234 Mon Sep 17 00:00:00 2001
From: George Kyriazis <george.kyriazis@intel.com>
Date: Mon, 14 Mar 2016 17:40:14 -0500
Subject: [PATCH 029/197] gallium/swr: Cleaned up some context-resource
 management

Removed bound_to_context.  We now pick up the context from the screen
instead of the resource itself.  The resource could be out-of-date
and point to a pipe that is already freed.

Fixes manywin mesa xdemo.

Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
---
 src/gallium/drivers/swr/swr_context.cpp | 16 +++++++++++-----
 src/gallium/drivers/swr/swr_resource.h  | 18 ++++++------------
 src/gallium/drivers/swr/swr_screen.cpp  |  8 ++++----
 src/gallium/drivers/swr/swr_screen.h    |  1 +
 src/gallium/drivers/swr/swr_state.cpp   | 10 +++++-----
 5 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index c8cb145d334..78b8fdf619b 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -129,7 +129,7 @@ swr_transfer_map(struct pipe_context *pipe,
                swr_fence_submit(swr_context(pipe), screen->flush_fence);
 
             swr_fence_finish(pipe->screen, screen->flush_fence, 0);
-            swr_resource_unused(pipe, spr);
+            swr_resource_unused(resource);
          }
       }
    }
@@ -206,8 +206,8 @@ swr_resource_copy(struct pipe_context *pipe,
    swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED);
 
    swr_fence_finish(pipe->screen, screen->flush_fence, 0);
-   swr_resource_unused(pipe, swr_resource(src));
-   swr_resource_unused(pipe, swr_resource(dst));
+   swr_resource_unused(src);
+   swr_resource_unused(dst);
 
    if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER)
        || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) {
@@ -293,6 +293,7 @@ static void
 swr_destroy(struct pipe_context *pipe)
 {
    struct swr_context *ctx = swr_context(pipe);
+   struct swr_screen *screen = swr_screen(pipe->screen);
 
    if (ctx->blitter)
       util_blitter_destroy(ctx->blitter);
@@ -306,6 +307,9 @@ swr_destroy(struct pipe_context *pipe)
 
    swr_destroy_scratch_buffers(ctx);
 
+   assert(screen);
+   screen->pipe = NULL;
+
    FREE(ctx);
 }
 
@@ -324,9 +328,10 @@ swr_render_condition(struct pipe_context *pipe,
 }
 
 struct pipe_context *
-swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
+swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
 {
    struct swr_context *ctx = CALLOC_STRUCT(swr_context);
+   struct swr_screen *screen = swr_screen(p_screen);
    ctx->blendJIT =
       new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>;
 
@@ -347,7 +352,8 @@ swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
    if (ctx->swrContext == NULL)
       goto fail;
 
-   ctx->pipe.screen = screen;
+   screen->pipe = &ctx->pipe;
+   ctx->pipe.screen = p_screen;
    ctx->pipe.destroy = swr_destroy;
    ctx->pipe.priv = priv;
    ctx->pipe.create_surface = swr_create_surface;
diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h
index 2fdc7683cb8..59cf0284461 100644
--- a/src/gallium/drivers/swr/swr_resource.h
+++ b/src/gallium/drivers/swr/swr_resource.h
@@ -54,9 +54,6 @@ struct swr_resource {
    unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS];
 
    enum swr_resource_status status;
-
-   /* pipe_context to which resource is currently bound. */
-   struct pipe_context *bound_to_context;
 };
 
 
@@ -120,24 +117,21 @@ swr_resource_status & operator|=(enum swr_resource_status & a,
 }
 
 static INLINE void
-swr_resource_read(struct pipe_context *pipe, struct swr_resource *resource)
+swr_resource_read(struct pipe_resource *resource)
 {
-   resource->status |= SWR_RESOURCE_READ;
-   resource->bound_to_context = pipe;
+   swr_resource(resource)->status |= SWR_RESOURCE_READ;
 }
 
 static INLINE void
-swr_resource_write(struct pipe_context *pipe, struct swr_resource *resource)
+swr_resource_write(struct pipe_resource *resource)
 {
-   resource->status |= SWR_RESOURCE_WRITE;
-   resource->bound_to_context = pipe;
+   swr_resource(resource)->status |= SWR_RESOURCE_WRITE;
 }
 
 static INLINE void
-swr_resource_unused(struct pipe_context *pipe, struct swr_resource *resource)
+swr_resource_unused(struct pipe_resource *resource)
 {
-   resource->status = SWR_RESOURCE_UNUSED;
-   resource->bound_to_context = nullptr;
+   swr_resource(resource)->status = SWR_RESOURCE_UNUSED;
 }
 
 #endif
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp
index e46df47570f..f9e52be2367 100644
--- a/src/gallium/drivers/swr/swr_screen.cpp
+++ b/src/gallium/drivers/swr/swr_screen.cpp
@@ -620,7 +620,7 @@ swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt)
 {
    struct swr_screen *screen = swr_screen(p_screen);
    struct swr_resource *spr = swr_resource(pt);
-   struct pipe_context *pipe = spr->bound_to_context;
+   struct pipe_context *pipe = screen->pipe;
 
    /* Only wait on fence if the resource is being used */
    if (pipe && spr->status) {
@@ -630,7 +630,7 @@ swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt)
          swr_fence_submit(swr_context(pipe), screen->flush_fence);
 
       swr_fence_finish(p_screen, screen->flush_fence, 0);
-      swr_resource_unused(pipe, spr);
+      swr_resource_unused(pt);
    }
 
    /*
@@ -661,11 +661,11 @@ swr_flush_frontbuffer(struct pipe_screen *p_screen,
    struct swr_screen *screen = swr_screen(p_screen);
    struct sw_winsys *winsys = screen->winsys;
    struct swr_resource *spr = swr_resource(resource);
-   struct pipe_context *pipe = spr->bound_to_context;
+   struct pipe_context *pipe = screen->pipe;
 
    if (pipe) {
       swr_fence_finish(p_screen, screen->flush_fence, 0);
-      swr_resource_unused(pipe, spr);
+      swr_resource_unused(resource);
       SwrEndFrame(swr_context(pipe)->swrContext);
    }
 
diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h
index a96dc44cf66..0c82a2eff7a 100644
--- a/src/gallium/drivers/swr/swr_screen.h
+++ b/src/gallium/drivers/swr/swr_screen.h
@@ -32,6 +32,7 @@ struct sw_winsys;
 
 struct swr_screen {
    struct pipe_screen base;
+   struct pipe_context *pipe;
 
    struct pipe_fence_handle *flush_fence;
 
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp
index 47ee3cb2664..e7bf3618a7d 100644
--- a/src/gallium/drivers/swr/swr_state.cpp
+++ b/src/gallium/drivers/swr/swr_state.cpp
@@ -646,24 +646,24 @@ swr_update_resource_status(struct pipe_context *pipe,
    if (fb->nr_cbufs)
       for (uint32_t i = 0; i < fb->nr_cbufs; ++i)
          if (fb->cbufs[i])
-            swr_resource_write(pipe, swr_resource(fb->cbufs[i]->texture));
+            swr_resource_write(fb->cbufs[i]->texture);
 
    /* depth/stencil target */
    if (fb->zsbuf)
-      swr_resource_write(pipe, swr_resource(fb->zsbuf->texture));
+      swr_resource_write(fb->zsbuf->texture);
 
    /* VBO vertex buffers */
    for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) {
       struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i];
       if (!vb->user_buffer)
-         swr_resource_read(pipe, swr_resource(vb->buffer));
+         swr_resource_read(vb->buffer);
    }
 
    /* VBO index buffer */
    if (p_draw_info && p_draw_info->indexed) {
       struct pipe_index_buffer *ib = &ctx->index_buffer;
       if (!ib->user_buffer)
-         swr_resource_read(pipe, swr_resource(ib->buffer));
+         swr_resource_read(ib->buffer);
    }
 
    /* texture sampler views */
@@ -671,7 +671,7 @@ swr_update_resource_status(struct pipe_context *pipe,
       struct pipe_sampler_view *view =
          ctx->sampler_views[PIPE_SHADER_FRAGMENT][i];
       if (view)
-         swr_resource_read(pipe, swr_resource(view->texture));
+         swr_resource_read(view->texture);
    }
 }
 

From d4714512e4077b8079efe526d7823e19fdb9be37 Mon Sep 17 00:00:00 2001
From: Daniel Czarnowski <daniel.czarnowski@intel.com>
Date: Mon, 22 Feb 2016 08:00:14 +0200
Subject: [PATCH 030/197] egl: support EGL_LARGEST_PBUFFER in
 eglCreatePbufferSurface(...)

This patch clamps the requested pbuffer surface size to a default
maximum when EGL_LARGEST_PBUFFER is set by the client. The MIN2 macro
is moved to egldefines.h so that it can be shared.

Fixes following Piglit test:
   egl-create-largest-pbuffer-surface

From EGL 1.5 spec:
   "Use EGL_LARGEST_PBUFFER to get the largest available pbuffer
   when the allocation of the pbuffer would otherwise fail."

Currently there is no API to query the largest available pixmap size
using xlib or xcb, so right now this seems the most straightforward way
to ensure that we fulfill the above spec text and that we don't attempt
to allocate a 'too big' pixmap, which might succeed on the server side
but not work in practice when the driver starts to use it as a texture.

v2: add more explanation about the change (Emil)

Signed-off-by: Matt Roper <matthew.d.roper@intel.com>
Cc: "11.0 11.1" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/egl/main/eglconfig.c  | 1 -
 src/egl/main/egldefines.h | 7 +++++++
 src/egl/main/eglsurface.c | 6 ++++++
 3 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index c445d9b0c92..d79c0e15422 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -44,7 +44,6 @@
 #include "egllog.h"
 
 
-#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))
 
 
 /**
diff --git a/src/egl/main/egldefines.h b/src/egl/main/egldefines.h
index a32cab26408..13a7563ce04 100644
--- a/src/egl/main/egldefines.h
+++ b/src/egl/main/egldefines.h
@@ -40,9 +40,16 @@ extern "C" {
 
 #define _EGL_MAX_EXTENSIONS_LEN 1000
 
+/* Hardcoded, conservative upper bounds on pbuffer size; width and height
+ * are clamped to these when EGL_LARGEST_PBUFFER is requested.
+ */
+#define _EGL_MAX_PBUFFER_WIDTH 4096
+#define _EGL_MAX_PBUFFER_HEIGHT 4096
+
 #define _EGL_VENDOR_STRING "Mesa Project"
 
 #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))
 
 #ifdef __cplusplus
 }
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 4fa43f3e2b1..2971bb0983a 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -307,6 +307,12 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
    if (err != EGL_SUCCESS)
       return _eglError(err, func);
 
+   /* if EGL_LARGEST_PBUFFER in use, clamp width and height */
+   if (surf->LargestPbuffer) {
+      surf->Width = MIN2(surf->Width, _EGL_MAX_PBUFFER_WIDTH);
+      surf->Height = MIN2(surf->Height, _EGL_MAX_PBUFFER_HEIGHT);
+   }
+
    return EGL_TRUE;
 }
 

From 7a712e64d6d59c3543fd307f9e029ad0886be622 Mon Sep 17 00:00:00 2001
From: "Juan A. Suarez Romero" <jasuarez@igalia.com>
Date: Fri, 18 Mar 2016 17:29:55 +0100
Subject: [PATCH 031/197] doc: add 'vec4' option in INTEL_DEBUG

Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 docs/envvars.html | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/envvars.html b/docs/envvars.html
index 06aa0ac9369..e21b7c1aaa0 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -163,6 +163,7 @@ See the <a href="xlibdriver.html">Xlib software driver page</a> for details.
    <li>blorp - emit messages about the blorp operations (blits &amp; clears)</li>
    <li>nodualobj - suppress generation of dual-object geometry shader code</li>
    <li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
+   <li>vec4 - force vec4 mode in vertex shader</li>
 </ul>
 </ul>
 

From e9d5e68d1b3f2ce21486a17799e2345bb54116f6 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 15:47:41 -0600
Subject: [PATCH 032/197] tgsi: add tgsi_transform_op3_inst() function

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_transform.h | 34 +++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 27e6179c9ee..4dd7dda25fd 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -301,6 +301,40 @@ tgsi_transform_op2_inst(struct tgsi_transform_context *ctx,
 }
 
 
+/* Emit "opcode dst.writemask, src0, src1, src2" via ctx->emit_instruction(). */
+static inline void
+tgsi_transform_op3_inst(struct tgsi_transform_context *ctx,
+                        unsigned opcode,
+                        unsigned dst_file,
+                        unsigned dst_index,
+                        unsigned dst_writemask,
+                        unsigned src0_file,
+                        unsigned src0_index,
+                        unsigned src1_file,
+                        unsigned src1_index,
+                        unsigned src2_file,
+                        unsigned src2_index)
+{
+   struct tgsi_full_instruction inst = tgsi_default_full_instruction();
+
+   inst.Instruction.Opcode = opcode;
+   inst.Instruction.NumDstRegs = 1;
+   inst.Dst[0].Register.File = dst_file;
+   inst.Dst[0].Register.Index = dst_index;
+   inst.Dst[0].Register.WriteMask = dst_writemask;
+   inst.Instruction.NumSrcRegs = 3;
+   inst.Src[0].Register.File = src0_file;
+   inst.Src[0].Register.Index = src0_index;
+   inst.Src[1].Register.File = src1_file;
+   inst.Src[1].Register.Index = src1_index;
+   inst.Src[2].Register.File = src2_file;
+   inst.Src[2].Register.Index = src2_index;
+
+   ctx->emit_instruction(ctx, &inst);
+}
+
+
+
 static inline void
 tgsi_transform_op1_swz_inst(struct tgsi_transform_context *ctx,
                             unsigned opcode,

From 373910f4e764179ec3c970a9ed13435cd6087631 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 15:49:34 -0600
Subject: [PATCH 033/197] st/mesa: simplify bitmap shader code with tgsi
 transform helper functions

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_cb_bitmap_shader.c | 45 ++++----------------
 1 file changed, 8 insertions(+), 37 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c
index cddea36d4f6..88779bc627d 100644
--- a/src/mesa/state_tracker/st_cb_bitmap_shader.c
+++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c
@@ -52,7 +52,6 @@ transform_instr(struct tgsi_transform_context *tctx,
 		struct tgsi_full_instruction *current_inst)
 {
    struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx);
-   struct tgsi_full_declaration decl;
    struct tgsi_full_instruction inst;
    unsigned i, semantic;
    int texcoord_index = -1;
@@ -66,9 +65,7 @@ transform_instr(struct tgsi_transform_context *tctx,
 
    /* Add TEMP[0] if it's missing. */
    if (ctx->info.file_max[TGSI_FILE_TEMPORARY] == -1) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_TEMPORARY;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_temp_decl(tctx, 0);
    }
 
    /* Add TEXCOORD[0] if it's missing. */
@@ -83,45 +80,19 @@ transform_instr(struct tgsi_transform_context *tctx,
    }
 
    if (texcoord_index == -1) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_INPUT;
-      decl.Declaration.Semantic = 1;
-      decl.Semantic.Name = semantic;
-      decl.Declaration.Interpolate = 1;
-      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
-      decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
       texcoord_index = ctx->info.num_inputs;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_input_decl(tctx, texcoord_index,
+                                semantic, 0, TGSI_INTERPOLATE_PERSPECTIVE);
    }
 
    /* Declare the sampler. */
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = TGSI_FILE_SAMPLER;
-   decl.Range.First = decl.Range.Last = ctx->sampler_index;
-   tctx->emit_declaration(tctx, &decl);
+   tgsi_transform_sampler_decl(tctx, ctx->sampler_index);
 
    /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
-   inst = tgsi_default_full_instruction();
-   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
-   inst.Instruction.Texture = 1;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
-
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
-   inst.Dst[0].Register.Index = 0;
-   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
-
-   inst.Instruction.NumSrcRegs = 2;
-   inst.Src[0].Register.File  = TGSI_FILE_INPUT;
-   inst.Src[0].Register.Index = texcoord_index;
-   inst.Src[0].Register.SwizzleX = TGSI_SWIZZLE_X;
-   inst.Src[0].Register.SwizzleY = TGSI_SWIZZLE_Y;
-   inst.Src[0].Register.SwizzleZ = TGSI_SWIZZLE_Z;
-   inst.Src[0].Register.SwizzleW = TGSI_SWIZZLE_W;
-   inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
-   inst.Src[1].Register.Index = ctx->sampler_index;
-
-   tctx->emit_instruction(tctx, &inst);
+   tgsi_transform_tex_2d_inst(tctx,
+                              TGSI_FILE_TEMPORARY, 0,
+                              TGSI_FILE_INPUT, texcoord_index,
+                              ctx->sampler_index);
 
    /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
    inst = tgsi_default_full_instruction();

From 0f73c3ab2577abce8c978fb91a0521ffc6079378 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 15:50:21 -0600
Subject: [PATCH 034/197] st/mesa: simplify drawpixels shader code with tgsi
 transform helper functions

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 .../state_tracker/st_cb_drawpixels_shader.c   | 82 ++++---------------
 1 file changed, 18 insertions(+), 64 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
index 749b46cfbf7..2cf75f8bd77 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
@@ -72,8 +72,6 @@ transform_instr(struct tgsi_transform_context *tctx,
 		struct tgsi_full_instruction *current_inst)
 {
    struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx);
-   struct tgsi_full_declaration decl;
-   struct tgsi_full_instruction inst;
    unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD :
                                                   TGSI_SEMANTIC_GENERIC;
    int texcoord_index = -1;
@@ -86,33 +84,21 @@ transform_instr(struct tgsi_transform_context *tctx,
    /* Add scale and bias constants. */
    if (ctx->scale_and_bias) {
       if (ctx->info.const_file_max[0] < (int)ctx->scale_const) {
-         decl = tgsi_default_full_declaration();
-         decl.Declaration.File = TGSI_FILE_CONSTANT;
-         decl.Range.First = decl.Range.Last = ctx->scale_const;
-         tctx->emit_declaration(tctx, &decl);
+         tgsi_transform_const_decl(tctx, ctx->scale_const, ctx->scale_const);
       }
 
       if (ctx->info.const_file_max[0] < (int)ctx->bias_const) {
-         decl = tgsi_default_full_declaration();
-         decl.Declaration.File = TGSI_FILE_CONSTANT;
-         decl.Range.First = decl.Range.Last = ctx->bias_const;
-         tctx->emit_declaration(tctx, &decl);
+         tgsi_transform_const_decl(tctx, ctx->bias_const, ctx->bias_const);
       }
    }
 
    if (ctx->info.const_file_max[0] < (int)ctx->texcoord_const) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_CONSTANT;
-      decl.Range.First = decl.Range.Last = ctx->texcoord_const;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_const_decl(tctx, ctx->texcoord_const, ctx->texcoord_const);
    }
 
    /* Add a new temp. */
    ctx->color_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1;
-   decl = tgsi_default_full_declaration();
-   decl.Declaration.File = TGSI_FILE_TEMPORARY;
-   decl.Range.First = decl.Range.Last = ctx->color_temp;
-   tctx->emit_declaration(tctx, &decl);
+   tgsi_transform_temp_decl(tctx, ctx->color_temp);
 
    /* Add TEXCOORD[texcoord_slot] if it's missing. */
    for (i = 0; i < ctx->info.num_inputs; i++) {
@@ -124,75 +110,43 @@ transform_instr(struct tgsi_transform_context *tctx,
    }
 
    if (texcoord_index == -1) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_INPUT;
-      decl.Declaration.Semantic = 1;
-      decl.Semantic.Name = sem_texcoord;
-      decl.Declaration.Interpolate = 1;
-      decl.Interp.Interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
-      decl.Range.First = decl.Range.Last = ctx->info.num_inputs;
       texcoord_index = ctx->info.num_inputs;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_input_decl(tctx, texcoord_index, sem_texcoord, 0,
+                                TGSI_INTERPOLATE_PERSPECTIVE);
    }
 
    /* Declare the drawpix sampler if it's missing. */
    if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_SAMPLER;
-      decl.Range.First = decl.Range.Last = ctx->drawpix_sampler;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_sampler_decl(tctx, ctx->drawpix_sampler);
    }
 
    /* Declare the pixel map sampler if it's missing. */
    if (ctx->pixel_maps &&
        !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) {
-      decl = tgsi_default_full_declaration();
-      decl.Declaration.File = TGSI_FILE_SAMPLER;
-      decl.Range.First = decl.Range.Last = ctx->pixelmap_sampler;
-      tctx->emit_declaration(tctx, &decl);
+      tgsi_transform_sampler_decl(tctx, ctx->pixelmap_sampler);
    }
 
    /* Get initial pixel color from the texture.
     * TEX temp, fragment.texcoord[0], texture[0], 2D;
     */
-   inst = tgsi_default_full_instruction();
-   inst.Instruction.Opcode = TGSI_OPCODE_TEX;
-   inst.Instruction.Texture = 1;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
-
-   inst.Instruction.NumDstRegs = 1;
-   inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
-   inst.Dst[0].Register.Index = ctx->color_temp;
-   inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
-
-   inst.Instruction.NumSrcRegs = 2;
-   SET_SRC(&inst, 0, TGSI_FILE_INPUT, texcoord_index, X, Y, Z, W);
-   inst.Src[1].Register.File  = TGSI_FILE_SAMPLER;
-   inst.Src[1].Register.Index = ctx->drawpix_sampler;
-
-   tctx->emit_instruction(tctx, &inst);
+   tgsi_transform_tex_2d_inst(tctx, TGSI_FILE_TEMPORARY, ctx->color_temp,
+                              TGSI_FILE_INPUT, texcoord_index,
+                              ctx->drawpix_sampler);
 
    /* Apply the scale and bias. */
    if (ctx->scale_and_bias) {
       /* MAD temp, temp, scale, bias; */
-      inst = tgsi_default_full_instruction();
-      inst.Instruction.Opcode = TGSI_OPCODE_MAD;
-
-      inst.Instruction.NumDstRegs = 1;
-      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
-      inst.Dst[0].Register.Index = ctx->color_temp;
-      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
-
-      inst.Instruction.NumSrcRegs = 3;
-      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->color_temp, X, Y, Z, W);
-      SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, ctx->scale_const, X, Y, Z, W);
-      SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, ctx->bias_const, X, Y, Z, W);
-
-      tctx->emit_instruction(tctx, &inst);
+      tgsi_transform_op3_inst(tctx, TGSI_OPCODE_MAD,
+                              TGSI_FILE_TEMPORARY, ctx->color_temp,
+                              TGSI_WRITEMASK_XYZW,
+                              TGSI_FILE_TEMPORARY, ctx->color_temp,
+                              TGSI_FILE_CONSTANT, ctx->scale_const,
+                              TGSI_FILE_CONSTANT, ctx->bias_const);
    }
 
    if (ctx->pixel_maps) {
       /* do four pixel map look-ups with two TEX instructions: */
+      struct tgsi_full_instruction inst;
 
       /* TEX temp.xy, temp.xyyy, texture[1], 2D; */
       inst = tgsi_default_full_instruction();

From 9211b68ad3dae7cc6cc49f77cf2393b215e25c59 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 15:12:42 -0600
Subject: [PATCH 035/197] st/mesa: clean up st_translate_texture_target()

Reformat code.  Improve assertion.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_mesa_to_tgsi.c | 69 +++++++++++++++---------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 8772efb0944..8a12ce4c685 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -241,37 +241,56 @@ src_register( struct st_translate *t,
  * Map mesa texture target to TGSI texture target.
  */
 unsigned
-st_translate_texture_target( GLuint textarget,
-                          GLboolean shadow )
+st_translate_texture_target(GLuint textarget, GLboolean shadow)
 {
    if (shadow) {
-      switch( textarget ) {
-      case TEXTURE_1D_INDEX:   return TGSI_TEXTURE_SHADOW1D;
-      case TEXTURE_2D_INDEX:   return TGSI_TEXTURE_SHADOW2D;
-      case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_SHADOWRECT;
-      case TEXTURE_1D_ARRAY_INDEX: return TGSI_TEXTURE_SHADOW1D_ARRAY;
-      case TEXTURE_2D_ARRAY_INDEX: return TGSI_TEXTURE_SHADOW2D_ARRAY;
-      case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_SHADOWCUBE;
-      case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_SHADOWCUBE_ARRAY;
-      default: break;
+      switch (textarget) {
+      case TEXTURE_1D_INDEX:
+         return TGSI_TEXTURE_SHADOW1D;
+      case TEXTURE_2D_INDEX:
+         return TGSI_TEXTURE_SHADOW2D;
+      case TEXTURE_RECT_INDEX:
+         return TGSI_TEXTURE_SHADOWRECT;
+      case TEXTURE_1D_ARRAY_INDEX:
+         return TGSI_TEXTURE_SHADOW1D_ARRAY;
+      case TEXTURE_2D_ARRAY_INDEX:
+         return TGSI_TEXTURE_SHADOW2D_ARRAY;
+      case TEXTURE_CUBE_INDEX:
+         return TGSI_TEXTURE_SHADOWCUBE;
+      case TEXTURE_CUBE_ARRAY_INDEX:
+         return TGSI_TEXTURE_SHADOWCUBE_ARRAY;
+      default:
+         break;
       }
    }
 
-   switch( textarget ) {
-   case TEXTURE_2D_MULTISAMPLE_INDEX: return TGSI_TEXTURE_2D_MSAA;
-   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY_MSAA;
-   case TEXTURE_BUFFER_INDEX: return TGSI_TEXTURE_BUFFER;
-   case TEXTURE_1D_INDEX:   return TGSI_TEXTURE_1D;
-   case TEXTURE_2D_INDEX:   return TGSI_TEXTURE_2D;
-   case TEXTURE_3D_INDEX:   return TGSI_TEXTURE_3D;
-   case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_CUBE;
-   case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_CUBE_ARRAY;
-   case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_RECT;
-   case TEXTURE_1D_ARRAY_INDEX:   return TGSI_TEXTURE_1D_ARRAY;
-   case TEXTURE_2D_ARRAY_INDEX:   return TGSI_TEXTURE_2D_ARRAY;
-   case TEXTURE_EXTERNAL_INDEX:   return TGSI_TEXTURE_2D;
+   switch (textarget) {
+   case TEXTURE_2D_MULTISAMPLE_INDEX:
+      return TGSI_TEXTURE_2D_MSAA;
+   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX:
+      return TGSI_TEXTURE_2D_ARRAY_MSAA;
+   case TEXTURE_BUFFER_INDEX:
+      return TGSI_TEXTURE_BUFFER;
+   case TEXTURE_1D_INDEX:
+      return TGSI_TEXTURE_1D;
+   case TEXTURE_2D_INDEX:
+      return TGSI_TEXTURE_2D;
+   case TEXTURE_3D_INDEX:
+      return TGSI_TEXTURE_3D;
+   case TEXTURE_CUBE_INDEX:
+      return TGSI_TEXTURE_CUBE;
+   case TEXTURE_CUBE_ARRAY_INDEX:
+      return TGSI_TEXTURE_CUBE_ARRAY;
+   case TEXTURE_RECT_INDEX:
+      return TGSI_TEXTURE_RECT;
+   case TEXTURE_1D_ARRAY_INDEX:
+      return TGSI_TEXTURE_1D_ARRAY;
+   case TEXTURE_2D_ARRAY_INDEX:
+      return TGSI_TEXTURE_2D_ARRAY;
+   case TEXTURE_EXTERNAL_INDEX:
+      return TGSI_TEXTURE_2D;
    default:
-      debug_assert( 0 );
+      debug_assert(!"unexpected texture target index");
       return TGSI_TEXTURE_1D;
    }
 }

From dedb46f582b527d722f431be9f591d20a5513030 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 14:16:00 +0100
Subject: [PATCH 036/197] nv50: rename nv50_context::dirty to
 nv50_context::dirty_3d

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_compute.c       |  2 +-
 .../drivers/nouveau/nv50/nv50_context.c       | 10 ++---
 .../drivers/nouveau/nv50/nv50_context.h       |  2 +-
 .../drivers/nouveau/nv50/nv50_shader_state.c  | 10 ++---
 src/gallium/drivers/nouveau/nv50/nv50_state.c | 42 +++++++++----------
 .../nouveau/nv50/nv50_state_validate.c        | 30 ++++++-------
 .../drivers/nouveau/nv50/nv50_surface.c       | 14 +++----
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   |  4 +-
 8 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index 04488d6d0a6..444e59df443 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -314,5 +314,5 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    PUSH_DATA (push, 0);
 
    /* bind a compute shader clobbers fragment shader state */
-   nv50->dirty |= NV50_NEW_FRAGPROG;
+   nv50->dirty_3d |= NV50_NEW_FRAGPROG;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 4874b77b1e1..08981f5258b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -176,7 +176,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
          if (nv50->framebuffer.cbufs[i] &&
              nv50->framebuffer.cbufs[i]->texture == res) {
-            nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+            nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
             if (!--ref)
                return ref;
@@ -186,7 +186,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
    if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nv50->framebuffer.zsbuf &&
           nv50->framebuffer.zsbuf->texture == res) {
-         nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+         nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER;
          nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
          if (!--ref)
             return ref;
@@ -202,7 +202,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
          if (nv50->vtxbuf[i].buffer == res) {
-            nv50->dirty |= NV50_NEW_ARRAYS;
+            nv50->dirty_3d |= NV50_NEW_ARRAYS;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
             if (!--ref)
                return ref;
@@ -222,7 +222,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->num_textures[s]; ++i) {
          if (nv50->textures[s][i] &&
              nv50->textures[s][i]->texture == res) {
-            nv50->dirty |= NV50_NEW_TEXTURES;
+            nv50->dirty_3d |= NV50_NEW_TEXTURES;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
             if (!--ref)
                return ref;
@@ -236,7 +236,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
             continue;
          if (!nv50->constbuf[s][i].user &&
              nv50->constbuf[s][i].u.buf == res) {
-            nv50->dirty |= NV50_NEW_CONSTBUF;
+            nv50->dirty_3d |= NV50_NEW_CONSTBUF;
             nv50->constbuf_dirty[s] |= 1 << i;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
             if (!--ref)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 2620d03b999..dc82bdc4002 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -115,7 +115,7 @@ struct nv50_context {
    struct nouveau_bufctx *bufctx;
    struct nouveau_bufctx *bufctx_cp;
 
-   uint32_t dirty;
+   uint32_t dirty_3d; /* dirty flags for 3d state */
    uint32_t dirty_cp; /* dirty flags for compute state */
    bool cb_dirty;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 8e4b2b42bda..7e8acd290e8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -181,7 +181,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
       fp->fp.force_persample_interp = rast->force_persample_interp;
    }
 
-   if (fp->mem && !(nv50->dirty & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
+   if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
       return;
 
    if (!nv50_program_validate(nv50, fp))
@@ -309,7 +309,7 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
       PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
    }
 
-   if (nv50->dirty & NV50_NEW_FRAGPROG)
+   if (nv50->dirty_3d & NV50_NEW_FRAGPROG)
       return;
    psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
    color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN;
@@ -378,9 +378,9 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    uint8_t map[64];
    uint8_t so_map[64];
 
-   if (!(nv50->dirty & (NV50_NEW_VERTPROG |
-                        NV50_NEW_FRAGPROG |
-                        NV50_NEW_GMTYPROG))) {
+   if (!(nv50->dirty_3d & (NV50_NEW_VERTPROG |
+                           NV50_NEW_FRAGPROG |
+                           NV50_NEW_GMTYPROG))) {
       uint8_t bfc, ffc;
       ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK);
       bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 8504ba466cc..cfbb0b3bde0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -200,7 +200,7 @@ nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->blend = hwcso;
-   nv50->dirty |= NV50_NEW_BLEND;
+   nv50->dirty_3d |= NV50_NEW_BLEND;
 }
 
 static void
@@ -337,7 +337,7 @@ nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->rast = hwcso;
-   nv50->dirty |= NV50_NEW_RASTERIZER;
+   nv50->dirty_3d |= NV50_NEW_RASTERIZER;
 }
 
 static void
@@ -426,7 +426,7 @@ nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->zsa = hwcso;
-   nv50->dirty |= NV50_NEW_ZSA;
+   nv50->dirty_3d |= NV50_NEW_ZSA;
 }
 
 static void
@@ -605,7 +605,7 @@ nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
 
    nv50->num_samplers[s] = nr;
 
-   nv50->dirty |= NV50_NEW_SAMPLERS;
+   nv50->dirty_3d |= NV50_NEW_SAMPLERS;
 }
 
 static void
@@ -700,7 +700,7 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
 
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
 
-   nv50->dirty |= NV50_NEW_TEXTURES;
+   nv50->dirty_3d |= NV50_NEW_TEXTURES;
 }
 
 static void
@@ -776,7 +776,7 @@ nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->vertprog = hwcso;
-    nv50->dirty |= NV50_NEW_VERTPROG;
+    nv50->dirty_3d |= NV50_NEW_VERTPROG;
 }
 
 static void *
@@ -792,7 +792,7 @@ nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->fragprog = hwcso;
-    nv50->dirty |= NV50_NEW_FRAGPROG;
+    nv50->dirty_3d |= NV50_NEW_FRAGPROG;
 }
 
 static void *
@@ -808,7 +808,7 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->gmtyprog = hwcso;
-    nv50->dirty |= NV50_NEW_GMTYPROG;
+    nv50->dirty_3d |= NV50_NEW_GMTYPROG;
 }
 
 static void *
@@ -882,7 +882,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
-   nv50->dirty |= NV50_NEW_CONSTBUF;
+   nv50->dirty_3d |= NV50_NEW_CONSTBUF;
 }
 
 /* =============================================================================
@@ -895,7 +895,7 @@ nv50_set_blend_color(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->blend_colour = *bcol;
-   nv50->dirty |= NV50_NEW_BLEND_COLOUR;
+   nv50->dirty_3d |= NV50_NEW_BLEND_COLOUR;
 }
 
 static void
@@ -905,7 +905,7 @@ nv50_set_stencil_ref(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->stencil_ref = *sr;
-   nv50->dirty |= NV50_NEW_STENCIL_REF;
+   nv50->dirty_3d |= NV50_NEW_STENCIL_REF;
 }
 
 static void
@@ -916,7 +916,7 @@ nv50_set_clip_state(struct pipe_context *pipe,
 
    memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp));
 
-   nv50->dirty |= NV50_NEW_CLIP;
+   nv50->dirty_3d |= NV50_NEW_CLIP;
 }
 
 static void
@@ -925,7 +925,7 @@ nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->sample_mask = sample_mask;
-   nv50->dirty |= NV50_NEW_SAMPLE_MASK;
+   nv50->dirty_3d |= NV50_NEW_SAMPLE_MASK;
 }
 
 static void
@@ -935,7 +935,7 @@ nv50_set_min_samples(struct pipe_context *pipe, unsigned min_samples)
 
    if (nv50->min_samples != min_samples) {
       nv50->min_samples = min_samples;
-      nv50->dirty |= NV50_NEW_MIN_SAMPLES;
+      nv50->dirty_3d |= NV50_NEW_MIN_SAMPLES;
    }
 }
 
@@ -949,7 +949,7 @@ nv50_set_framebuffer_state(struct pipe_context *pipe,
 
    util_copy_framebuffer_state(&nv50->framebuffer, fb);
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER;
+   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER;
 }
 
 static void
@@ -959,7 +959,7 @@ nv50_set_polygon_stipple(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->stipple = *stipple;
-   nv50->dirty |= NV50_NEW_STIPPLE;
+   nv50->dirty_3d |= NV50_NEW_STIPPLE;
 }
 
 static void
@@ -977,7 +977,7 @@ nv50_set_scissor_states(struct pipe_context *pipe,
          continue;
       nv50->scissors[start_slot + i] = scissor[i];
       nv50->scissors_dirty |= 1 << (start_slot + i);
-      nv50->dirty |= NV50_NEW_SCISSOR;
+      nv50->dirty_3d |= NV50_NEW_SCISSOR;
    }
 }
 
@@ -996,7 +996,7 @@ nv50_set_viewport_states(struct pipe_context *pipe,
          continue;
       nv50->viewports[start_slot + i] = vpt[i];
       nv50->viewports_dirty |= 1 << (start_slot + i);
-      nv50->dirty |= NV50_NEW_VIEWPORT;
+      nv50->dirty_3d |= NV50_NEW_VIEWPORT;
    }
 }
 
@@ -1009,7 +1009,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    unsigned i;
 
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-   nv50->dirty |= NV50_NEW_ARRAYS;
+   nv50->dirty_3d |= NV50_NEW_ARRAYS;
 
    util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                  start_slot, count);
@@ -1073,7 +1073,7 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->vertex = hwcso;
-   nv50->dirty |= NV50_NEW_VERTEX;
+   nv50->dirty_3d |= NV50_NEW_VERTEX;
 }
 
 static struct pipe_stream_output_target *
@@ -1181,7 +1181,7 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
    nv50->num_so_targets = num_targets;
 
    if (nv50->so_targets_dirty)
-      nv50->dirty |= NV50_NEW_STRMOUT;
+      nv50->dirty_3d |= NV50_NEW_STRMOUT;
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 55369781606..601f9b1449d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -187,7 +187,7 @@ nv50_validate_scissor(struct nv50_context *nv50)
 #ifdef NV50_SCISSORS_CLIPPING
    int minx, maxx, miny, maxy, i;
 
-   if (!(nv50->dirty &
+   if (!(nv50->dirty_3d &
          (NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) &&
        nv50->state.scissor == nv50->rast->pipe.scissor)
       return;
@@ -197,7 +197,7 @@ nv50_validate_scissor(struct nv50_context *nv50)
 
    nv50->state.scissor = nv50->rast->pipe.scissor;
 
-   if ((nv50->dirty & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor)
+   if ((nv50->dirty_3d & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor)
       nv50->scissors_dirty = (1 << NV50_MAX_VIEWPORTS) - 1;
 
    for (i = 0; i < NV50_MAX_VIEWPORTS; i++) {
@@ -290,10 +290,10 @@ nv50_check_program_ucps(struct nv50_context *nv50,
 
    vp->vp.clpd_nr = n;
    if (likely(vp == nv50->vertprog)) {
-      nv50->dirty |= NV50_NEW_VERTPROG;
+      nv50->dirty_3d |= NV50_NEW_VERTPROG;
       nv50_vertprog_validate(nv50);
    } else {
-      nv50->dirty |= NV50_NEW_GMTYPROG;
+      nv50->dirty_3d |= NV50_NEW_GMTYPROG;
       nv50_gmtyprog_validate(nv50);
    }
    nv50_fp_linkage_validate(nv50);
@@ -342,7 +342,7 @@ nv50_validate_clip(struct nv50_context *nv50)
    struct nv50_program *vp;
    uint8_t clip_enable;
 
-   if (nv50->dirty & NV50_NEW_CLIP) {
+   if (nv50->dirty_3d & NV50_NEW_CLIP) {
       BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
       PUSH_DATA (push, (NV50_CB_AUX_UCP_OFFSET << 8) | NV50_CB_AUX);
       BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4);
@@ -436,7 +436,7 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
    else
       ctx_to->state = ctx_to->screen->save_state;
 
-   ctx_to->dirty = ~0;
+   ctx_to->dirty_3d = ~0;
    ctx_to->viewports_dirty = ~0;
    ctx_to->scissors_dirty = ~0;
 
@@ -445,23 +445,23 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
    ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1;
 
    if (!ctx_to->vertex)
-      ctx_to->dirty &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
+      ctx_to->dirty_3d &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
 
    if (!ctx_to->vertprog)
-      ctx_to->dirty &= ~NV50_NEW_VERTPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_VERTPROG;
    if (!ctx_to->fragprog)
-      ctx_to->dirty &= ~NV50_NEW_FRAGPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_FRAGPROG;
 
    if (!ctx_to->blend)
-      ctx_to->dirty &= ~NV50_NEW_BLEND;
+      ctx_to->dirty_3d &= ~NV50_NEW_BLEND;
    if (!ctx_to->rast)
 #ifdef NV50_SCISSORS_CLIPPING
-      ctx_to->dirty &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
+      ctx_to->dirty_3d &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
 #else
-      ctx_to->dirty &= ~NV50_NEW_RASTERIZER;
+      ctx_to->dirty_3d &= ~NV50_NEW_RASTERIZER;
 #endif
    if (!ctx_to->zsa)
-      ctx_to->dirty &= ~NV50_NEW_ZSA;
+      ctx_to->dirty_3d &= ~NV50_NEW_ZSA;
 
    ctx_to->screen->cur_ctx = ctx_to;
 }
@@ -518,7 +518,7 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
    if (nv50->screen->cur_ctx != nv50)
       nv50_switch_pipe_context(nv50);
 
-   state_mask = nv50->dirty & mask;
+   state_mask = nv50->dirty_3d & mask;
 
    if (state_mask) {
       for (i = 0; i < ARRAY_SIZE(validate_list); ++i) {
@@ -527,7 +527,7 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
          if (state_mask & validate->states)
             validate->func(nv50);
       }
-      nv50->dirty &= ~state_mask;
+      nv50->dirty_3d &= ~state_mask;
 
       if (nv50->state.rt_serialize) {
          nv50->state.rt_serialize = false;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index 84646f6adb1..e595524dcc4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -353,7 +353,7 @@ nv50_clear_render_target(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
    PUSH_DATA (push, nv50->cond_condmode);
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }
 
 static void
@@ -436,7 +436,7 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
    PUSH_DATA (push, nv50->cond_condmode);
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }
 
 void
@@ -798,7 +798,7 @@ nv50_clear_buffer(struct pipe_context *pipe,
                              data, data_size);
    }
 
-   nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
 }
 
 /* =============================== BLIT CODE ===================================
@@ -834,7 +834,7 @@ struct nv50_blitctx
       struct pipe_sampler_view *texture[2];
       struct nv50_tsc_entry *sampler[2];
       unsigned min_samples;
-      uint32_t dirty;
+      uint32_t dirty_3d;
    } saved;
    struct nv50_rasterizer_stateobj rast;
 };
@@ -1253,12 +1253,12 @@ nv50_blitctx_pre_blit(struct nv50_blitctx *ctx)
 
    nv50->min_samples = 1;
 
-   ctx->saved.dirty = nv50->dirty;
+   ctx->saved.dirty_3d = nv50->dirty_3d;
 
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
 
-   nv50->dirty =
+   nv50->dirty_3d =
       NV50_NEW_FRAMEBUFFER | NV50_NEW_MIN_SAMPLES |
       NV50_NEW_VERTPROG | NV50_NEW_FRAGPROG | NV50_NEW_GMTYPROG |
       NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS;
@@ -1305,7 +1305,7 @@ nv50_blitctx_post_blit(struct nv50_blitctx *blit)
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
 
-   nv50->dirty = blit->saved.dirty |
+   nv50->dirty_3d = blit->saved.dirty_3d |
       (NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR | NV50_NEW_SAMPLE_MASK |
        NV50_NEW_RASTERIZER | NV50_NEW_ZSA | NV50_NEW_BLEND |
        NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS |
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 6f60445d8d2..11f2471ed5b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -779,9 +779,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    nv50->vbo_push_hint = /* the 64 is heuristic */
       !(info->indexed && ((nv50->vb_elt_limit + 64) < info->count));
 
-   if (nv50->vbo_user && !(nv50->dirty & (NV50_NEW_ARRAYS | NV50_NEW_VERTEX))) {
+   if (nv50->vbo_user && !(nv50->dirty_3d & (NV50_NEW_ARRAYS | NV50_NEW_VERTEX))) {
       if (!!nv50->vbo_fifo != nv50->vbo_push_hint)
-         nv50->dirty |= NV50_NEW_ARRAYS;
+         nv50->dirty_3d |= NV50_NEW_ARRAYS;
       else
       if (!nv50->vbo_fifo)
          nv50_update_user_vbufs(nv50);

From e844aac40bbcbdfef373cc7cf720cc8b0552aff0 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 14:09:05 +0100
Subject: [PATCH 037/197] nv50: rename NV50_COMPUTE to NV50_CP

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_compute.c       | 90 +++++++++----------
 .../drivers/nouveau/nv50/nv50_query_hw_sm.c   | 10 +--
 .../drivers/nouveau/nv50/nv50_winsys.h        |  4 +-
 3 files changed, 52 insertions(+), 52 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index 444e59df443..ac411ee0e31 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -67,89 +67,89 @@ nv50_screen_compute_setup(struct nv50_screen *screen,
    if (ret)
       return ret;
 
-   BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   BEGIN_NV04(push, SUBC_CP(NV01_SUBCHAN_OBJECT), 1);
    PUSH_DATA (push, screen->compute->handle);
 
-   BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+   BEGIN_NV04(push, NV50_CP(UNK02A0), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_STACK), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+   BEGIN_NV04(push, NV50_CP(STACK_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->stack_bo->offset);
    PUSH_DATA (push, screen->stack_bo->offset);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_SIZE_LOG), 1);
    PUSH_DATA (push, 4);
 
-   BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+   BEGIN_NV04(push, NV50_CP(UNK0290), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+   BEGIN_NV04(push, NV50_CP(LANES32_ENABLE), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+   BEGIN_NV04(push, NV50_CP(REG_MODE), 1);
    PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
-   BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+   BEGIN_NV04(push, NV50_CP(UNK0384), 1);
    PUSH_DATA (push, 0x100);
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_GLOBAL), 1);
    PUSH_DATA (push, fifo->vram);
 
    for (i = 0; i < 15; i++) {
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(i)), 2);
       PUSH_DATA (push, 0);
       PUSH_DATA (push, 0);
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(i)), 1);
       PUSH_DATA (push, 0);
-      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+      BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(i)), 1);
       PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
    }
 
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_ADDRESS_HIGH(15)), 2);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_LIMIT(15)), 1);
    PUSH_DATA (push, ~0);
-   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+   BEGIN_NV04(push, NV50_CP(GLOBAL_MODE(15)), 1);
    PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
 
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_LOG_ALLOC), 1);
    PUSH_DATA (push, 7);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_WARPS_NO_CLAMP), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_WARPS_LOG_ALLOC), 1);
    PUSH_DATA (push, 7);
-   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+   BEGIN_NV04(push, NV50_CP(STACK_WARPS_NO_CLAMP), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
    PUSH_DATA (push, 0);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TEXTURE), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+   BEGIN_NV04(push, NV50_CP(TEX_LIMITS), 1);
    PUSH_DATA (push, 0x54);
-   BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+   BEGIN_NV04(push, NV50_CP(LINKED_TSC), 1);
    PUSH_DATA (push, 0);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TIC), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   BEGIN_NV04(push, NV50_CP(TIC_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->txc->offset);
    PUSH_DATA (push, screen->txc->offset);
    PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_TSC), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   BEGIN_NV04(push, NV50_CP(TSC_ADDRESS_HIGH), 3);
    PUSH_DATAh(push, screen->txc->offset + 65536);
    PUSH_DATA (push, screen->txc->offset + 65536);
    PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_CODE_CB), 1);
    PUSH_DATA (push, fifo->vram);
 
-   BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+   BEGIN_NV04(push, NV50_CP(DMA_LOCAL), 1);
    PUSH_DATA (push, fifo->vram);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+   BEGIN_NV04(push, NV50_CP(LOCAL_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->tls_bo->offset + 65536);
    PUSH_DATA (push, screen->tls_bo->offset + 65536);
-   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+   BEGIN_NV04(push, NV50_CP(LOCAL_SIZE_LOG), 1);
    PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
 
    return 0;
@@ -175,7 +175,7 @@ nv50_compute_validate_program(struct nv50_context *nv50)
    if (likely(prog->code_size)) {
       if (nv50_program_upload_code(nv50, prog)) {
          struct nouveau_pushbuf *push = nv50->base.pushbuf;
-         BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
+         BEGIN_NV04(push, NV50_CP(CODE_CB_FLUSH), 1);
          PUSH_DATA (push, 0);
          return true;
       }
@@ -227,7 +227,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
    struct nouveau_pushbuf *push = screen->base.pushbuf;
    unsigned size = align(nv50->compprog->parm_size, 0x4);
 
-   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   BEGIN_NV04(push, NV50_CP(USER_PARAM_COUNT), 1);
    PUSH_DATA (push, (size / 4) << 8);
 
    if (size) {
@@ -245,7 +245,7 @@ nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
       nouveau_pushbuf_bufctx(push, nv50->bufctx);
       nouveau_pushbuf_validate(push);
 
-      BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+      BEGIN_NV04(push, NV50_CP(USER_PARAM(0)), size / 4);
       nouveau_pushbuf_data(push, bo, offset, size);
 
       nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
@@ -286,31 +286,31 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 
    nv50_compute_upload_input(nv50, info->input);
 
-   BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+   BEGIN_NV04(push, NV50_CP(CP_START_ID), 1);
    PUSH_DATA (push, nv50_compute_find_symbol(nv50, info->pc));
 
-   BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+   BEGIN_NV04(push, NV50_CP(SHARED_SIZE), 1);
    PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
-   BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+   BEGIN_NV04(push, NV50_CP(CP_REG_ALLOC_TEMP), 1);
    PUSH_DATA (push, cp->max_gpr);
 
    /* grid/block setup */
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+   BEGIN_NV04(push, NV50_CP(BLOCKDIM_XY), 2);
    PUSH_DATA (push, info->block[1] << 16 | info->block[0]);
    PUSH_DATA (push, info->block[2]);
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+   BEGIN_NV04(push, NV50_CP(BLOCK_ALLOC), 1);
    PUSH_DATA (push, 1 << 16 | block_size);
-   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+   BEGIN_NV04(push, NV50_CP(BLOCKDIM_LATCH), 1);
    PUSH_DATA (push, 1);
-   BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+   BEGIN_NV04(push, NV50_CP(GRIDDIM), 1);
    PUSH_DATA (push, info->grid[1] << 16 | info->grid[0]);
-   BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+   BEGIN_NV04(push, NV50_CP(GRIDID), 1);
    PUSH_DATA (push, 1);
 
    /* kernel launching */
-   BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+   BEGIN_NV04(push, NV50_CP(LAUNCH), 1);
    PUSH_DATA (push, 0);
-   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
    /* bind a compute shader clobbers fragment shader state */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
index be19c0fdc85..0a73090d78d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -202,10 +202,10 @@ nv50_hw_sm_begin_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
       func = nv50_hw_sm_get_func(c);
 
       /* configure and reset the counter(s) */
-      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+      BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
       PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                         | cfg->ctr[i].unit | cfg->ctr[i].mode);
-      BEGIN_NV04(push, NV50_COMPUTE(MP_PM_SET(c)), 1);
+      BEGIN_NV04(push, NV50_CP(MP_PM_SET(c)), 1);
       PUSH_DATA (push, 0);
    }
    return true;
@@ -240,7 +240,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
    PUSH_SPACE(push, 8);
    for (c = 0; c < 4; c++) {
       if (screen->pm.mp_counter[c]) {
-         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(c)), 1);
+         BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(c)), 1);
          PUSH_DATA (push, 0);
       }
    }
@@ -257,7 +257,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
                 hq->bo);
 
    PUSH_SPACE(push, 2);
-   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   BEGIN_NV04(push, SUBC_CP(NV50_GRAPH_SERIALIZE), 1);
    PUSH_DATA (push, 0);
 
    pipe->bind_compute_state(pipe, screen->pm.prog);
@@ -295,7 +295,7 @@ nv50_hw_sm_end_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
          mask |= 1 << hsq->ctr[i];
          func  = nv50_hw_sm_get_func(hsq->ctr[i]);
 
-         BEGIN_NV04(push, NV50_COMPUTE(MP_PM_CONTROL(hsq->ctr[i])), 1);
+         BEGIN_NV04(push, NV50_CP(MP_PM_CONTROL(hsq->ctr[i])), 1);
          PUSH_DATA (push, (cfg->ctr[i].sig << 24) | (func << 8)
                     | cfg->ctr[i].unit | cfg->ctr[i].mode);
       }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
index 68002305d72..7056258d1bf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_winsys.h
@@ -58,8 +58,8 @@ PUSH_REFN(struct nouveau_pushbuf *push, struct nouveau_bo *bo, uint32_t flags)
 #define SUBC_M2MF(m) 5, (m)
 #define NV50_M2MF(n) SUBC_M2MF(NV50_M2MF_##n)
 
-#define SUBC_COMPUTE(m) 6, (m)
-#define NV50_COMPUTE(n) SUBC_COMPUTE(NV50_COMPUTE_##n)
+#define SUBC_CP(m) 6, (m)
+#define NV50_CP(n) SUBC_CP(NV50_COMPUTE_##n)
 
 
 static inline uint32_t

From 9374fc1e67f524a27fa3f2eb8821be1af5ea9895 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 14:24:49 +0100
Subject: [PATCH 038/197] nv50: rename 3d dirty flags to NV50_NEW_3D_XXX

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_compute.c       |  2 +-
 .../drivers/nouveau/nv50/nv50_context.c       | 10 +--
 .../drivers/nouveau/nv50/nv50_context.h       | 44 ++++-----
 .../drivers/nouveau/nv50/nv50_shader_state.c  | 10 +--
 src/gallium/drivers/nouveau/nv50/nv50_state.c | 42 ++++-----
 .../nouveau/nv50/nv50_state_validate.c        | 90 +++++++++----------
 .../drivers/nouveau/nv50/nv50_surface.c       | 22 ++---
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   |  4 +-
 8 files changed, 112 insertions(+), 112 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index ac411ee0e31..da448c43e6c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -314,5 +314,5 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    PUSH_DATA (push, 0);
 
    /* bind a compute shader clobbers fragment shader state */
-   nv50->dirty_3d |= NV50_NEW_FRAGPROG;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 08981f5258b..18e2ab018dc 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -176,7 +176,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->framebuffer.nr_cbufs; ++i) {
          if (nv50->framebuffer.cbufs[i] &&
              nv50->framebuffer.cbufs[i]->texture == res) {
-            nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER;
+            nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
             if (!--ref)
                return ref;
@@ -186,7 +186,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
    if (bind & PIPE_BIND_DEPTH_STENCIL) {
       if (nv50->framebuffer.zsbuf &&
           nv50->framebuffer.zsbuf->texture == res) {
-         nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER;
+         nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
          nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
          if (!--ref)
             return ref;
@@ -202,7 +202,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       assert(nv50->num_vtxbufs <= PIPE_MAX_ATTRIBS);
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
          if (nv50->vtxbuf[i].buffer == res) {
-            nv50->dirty_3d |= NV50_NEW_ARRAYS;
+            nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
             if (!--ref)
                return ref;
@@ -222,7 +222,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->num_textures[s]; ++i) {
          if (nv50->textures[s][i] &&
              nv50->textures[s][i]->texture == res) {
-            nv50->dirty_3d |= NV50_NEW_TEXTURES;
+            nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
             if (!--ref)
                return ref;
@@ -236,7 +236,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
             continue;
          if (!nv50->constbuf[s][i].user &&
              nv50->constbuf[s][i].u.buf == res) {
-            nv50->dirty_3d |= NV50_NEW_CONSTBUF;
+            nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
             nv50->constbuf_dirty[s] |= 1 << i;
             nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
             if (!--ref)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index dc82bdc4002..89a4da501c2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -26,28 +26,28 @@
 #include "nv50/nv50_3d.xml.h"
 #include "nv50/nv50_2d.xml.h"
 
-#define NV50_NEW_BLEND        (1 << 0)
-#define NV50_NEW_RASTERIZER   (1 << 1)
-#define NV50_NEW_ZSA          (1 << 2)
-#define NV50_NEW_VERTPROG     (1 << 3)
-#define NV50_NEW_GMTYPROG     (1 << 6)
-#define NV50_NEW_FRAGPROG     (1 << 7)
-#define NV50_NEW_BLEND_COLOUR (1 << 8)
-#define NV50_NEW_STENCIL_REF  (1 << 9)
-#define NV50_NEW_CLIP         (1 << 10)
-#define NV50_NEW_SAMPLE_MASK  (1 << 11)
-#define NV50_NEW_FRAMEBUFFER  (1 << 12)
-#define NV50_NEW_STIPPLE      (1 << 13)
-#define NV50_NEW_SCISSOR      (1 << 14)
-#define NV50_NEW_VIEWPORT     (1 << 15)
-#define NV50_NEW_ARRAYS       (1 << 16)
-#define NV50_NEW_VERTEX       (1 << 17)
-#define NV50_NEW_CONSTBUF     (1 << 18)
-#define NV50_NEW_TEXTURES     (1 << 19)
-#define NV50_NEW_SAMPLERS     (1 << 20)
-#define NV50_NEW_STRMOUT      (1 << 21)
-#define NV50_NEW_MIN_SAMPLES  (1 << 22)
-#define NV50_NEW_CONTEXT      (1 << 31)
+#define NV50_NEW_3D_BLEND        (1 << 0)
+#define NV50_NEW_3D_RASTERIZER   (1 << 1)
+#define NV50_NEW_3D_ZSA          (1 << 2)
+#define NV50_NEW_3D_VERTPROG     (1 << 3)
+#define NV50_NEW_3D_GMTYPROG     (1 << 6)
+#define NV50_NEW_3D_FRAGPROG     (1 << 7)
+#define NV50_NEW_3D_BLEND_COLOUR (1 << 8)
+#define NV50_NEW_3D_STENCIL_REF  (1 << 9)
+#define NV50_NEW_3D_CLIP         (1 << 10)
+#define NV50_NEW_3D_SAMPLE_MASK  (1 << 11)
+#define NV50_NEW_3D_FRAMEBUFFER  (1 << 12)
+#define NV50_NEW_3D_STIPPLE      (1 << 13)
+#define NV50_NEW_3D_SCISSOR      (1 << 14)
+#define NV50_NEW_3D_VIEWPORT     (1 << 15)
+#define NV50_NEW_3D_ARRAYS       (1 << 16)
+#define NV50_NEW_3D_VERTEX       (1 << 17)
+#define NV50_NEW_3D_CONSTBUF     (1 << 18)
+#define NV50_NEW_3D_TEXTURES     (1 << 19)
+#define NV50_NEW_3D_SAMPLERS     (1 << 20)
+#define NV50_NEW_3D_STRMOUT      (1 << 21)
+#define NV50_NEW_3D_MIN_SAMPLES  (1 << 22)
+#define NV50_NEW_3D_CONTEXT      (1 << 31)
 
 #define NV50_NEW_CP_PROGRAM   (1 << 0)
 #define NV50_NEW_CP_GLOBALS   (1 << 1)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 7e8acd290e8..134ad837fab 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -181,7 +181,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
       fp->fp.force_persample_interp = rast->force_persample_interp;
    }
 
-   if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_FRAGPROG | NV50_NEW_MIN_SAMPLES)))
+   if (fp->mem && !(nv50->dirty_3d & (NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_MIN_SAMPLES)))
       return;
 
    if (!nv50_program_validate(nv50, fp))
@@ -309,7 +309,7 @@ nv50_validate_derived_rs(struct nv50_context *nv50)
       PUSH_DATA (push, !nv50->rast->pipe.rasterizer_discard);
    }
 
-   if (nv50->dirty_3d & NV50_NEW_FRAGPROG)
+   if (nv50->dirty_3d & NV50_NEW_3D_FRAGPROG)
       return;
    psize = nv50->state.semantic_psize & ~NV50_3D_SEMANTIC_PTSZ_PTSZ_EN__MASK;
    color = nv50->state.semantic_color & ~NV50_3D_SEMANTIC_COLOR_CLMP_EN;
@@ -378,9 +378,9 @@ nv50_fp_linkage_validate(struct nv50_context *nv50)
    uint8_t map[64];
    uint8_t so_map[64];
 
-   if (!(nv50->dirty_3d & (NV50_NEW_VERTPROG |
-                           NV50_NEW_FRAGPROG |
-                           NV50_NEW_GMTYPROG))) {
+   if (!(nv50->dirty_3d & (NV50_NEW_3D_VERTPROG |
+                           NV50_NEW_3D_FRAGPROG |
+                           NV50_NEW_3D_GMTYPROG))) {
       uint8_t bfc, ffc;
       ffc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_FFC0_ID__MASK);
       bfc = (nv50->state.semantic_color & NV50_3D_SEMANTIC_COLOR_BFC0_ID__MASK)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index cfbb0b3bde0..2e47df3b5b2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -200,7 +200,7 @@ nv50_blend_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->blend = hwcso;
-   nv50->dirty_3d |= NV50_NEW_BLEND;
+   nv50->dirty_3d |= NV50_NEW_3D_BLEND;
 }
 
 static void
@@ -337,7 +337,7 @@ nv50_rasterizer_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->rast = hwcso;
-   nv50->dirty_3d |= NV50_NEW_RASTERIZER;
+   nv50->dirty_3d |= NV50_NEW_3D_RASTERIZER;
 }
 
 static void
@@ -426,7 +426,7 @@ nv50_zsa_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->zsa = hwcso;
-   nv50->dirty_3d |= NV50_NEW_ZSA;
+   nv50->dirty_3d |= NV50_NEW_3D_ZSA;
 }
 
 static void
@@ -605,7 +605,7 @@ nv50_stage_sampler_states_bind(struct nv50_context *nv50, int s,
 
    nv50->num_samplers[s] = nr;
 
-   nv50->dirty_3d |= NV50_NEW_SAMPLERS;
+   nv50->dirty_3d |= NV50_NEW_3D_SAMPLERS;
 }
 
 static void
@@ -700,7 +700,7 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
 
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
 
-   nv50->dirty_3d |= NV50_NEW_TEXTURES;
+   nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
 }
 
 static void
@@ -776,7 +776,7 @@ nv50_vp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->vertprog = hwcso;
-    nv50->dirty_3d |= NV50_NEW_VERTPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
 }
 
 static void *
@@ -792,7 +792,7 @@ nv50_fp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->fragprog = hwcso;
-    nv50->dirty_3d |= NV50_NEW_FRAGPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_FRAGPROG;
 }
 
 static void *
@@ -808,7 +808,7 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     struct nv50_context *nv50 = nv50_context(pipe);
 
     nv50->gmtyprog = hwcso;
-    nv50->dirty_3d |= NV50_NEW_GMTYPROG;
+    nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
 }
 
 static void *
@@ -882,7 +882,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
    }
    nv50->constbuf_dirty[s] |= 1 << i;
 
-   nv50->dirty_3d |= NV50_NEW_CONSTBUF;
+   nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
 }
 
 /* =============================================================================
@@ -895,7 +895,7 @@ nv50_set_blend_color(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->blend_colour = *bcol;
-   nv50->dirty_3d |= NV50_NEW_BLEND_COLOUR;
+   nv50->dirty_3d |= NV50_NEW_3D_BLEND_COLOUR;
 }
 
 static void
@@ -905,7 +905,7 @@ nv50_set_stencil_ref(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->stencil_ref = *sr;
-   nv50->dirty_3d |= NV50_NEW_STENCIL_REF;
+   nv50->dirty_3d |= NV50_NEW_3D_STENCIL_REF;
 }
 
 static void
@@ -916,7 +916,7 @@ nv50_set_clip_state(struct pipe_context *pipe,
 
    memcpy(nv50->clip.ucp, clip->ucp, sizeof(clip->ucp));
 
-   nv50->dirty_3d |= NV50_NEW_CLIP;
+   nv50->dirty_3d |= NV50_NEW_3D_CLIP;
 }
 
 static void
@@ -925,7 +925,7 @@ nv50_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->sample_mask = sample_mask;
-   nv50->dirty_3d |= NV50_NEW_SAMPLE_MASK;
+   nv50->dirty_3d |= NV50_NEW_3D_SAMPLE_MASK;
 }
 
 static void
@@ -935,7 +935,7 @@ nv50_set_min_samples(struct pipe_context *pipe, unsigned min_samples)
 
    if (nv50->min_samples != min_samples) {
       nv50->min_samples = min_samples;
-      nv50->dirty_3d |= NV50_NEW_MIN_SAMPLES;
+      nv50->dirty_3d |= NV50_NEW_3D_MIN_SAMPLES;
    }
 }
 
@@ -949,7 +949,7 @@ nv50_set_framebuffer_state(struct pipe_context *pipe,
 
    util_copy_framebuffer_state(&nv50->framebuffer, fb);
 
-   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
 }
 
 static void
@@ -959,7 +959,7 @@ nv50_set_polygon_stipple(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->stipple = *stipple;
-   nv50->dirty_3d |= NV50_NEW_STIPPLE;
+   nv50->dirty_3d |= NV50_NEW_3D_STIPPLE;
 }
 
 static void
@@ -977,7 +977,7 @@ nv50_set_scissor_states(struct pipe_context *pipe,
          continue;
       nv50->scissors[start_slot + i] = scissor[i];
       nv50->scissors_dirty |= 1 << (start_slot + i);
-      nv50->dirty_3d |= NV50_NEW_SCISSOR;
+      nv50->dirty_3d |= NV50_NEW_3D_SCISSOR;
    }
 }
 
@@ -996,7 +996,7 @@ nv50_set_viewport_states(struct pipe_context *pipe,
          continue;
       nv50->viewports[start_slot + i] = vpt[i];
       nv50->viewports_dirty |= 1 << (start_slot + i);
-      nv50->dirty_3d |= NV50_NEW_VIEWPORT;
+      nv50->dirty_3d |= NV50_NEW_3D_VIEWPORT;
    }
 }
 
@@ -1009,7 +1009,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    unsigned i;
 
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
-   nv50->dirty_3d |= NV50_NEW_ARRAYS;
+   nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
 
    util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
                                  start_slot, count);
@@ -1073,7 +1073,7 @@ nv50_vertex_state_bind(struct pipe_context *pipe, void *hwcso)
    struct nv50_context *nv50 = nv50_context(pipe);
 
    nv50->vertex = hwcso;
-   nv50->dirty_3d |= NV50_NEW_VERTEX;
+   nv50->dirty_3d |= NV50_NEW_3D_VERTEX;
 }
 
 static struct pipe_stream_output_target *
@@ -1181,7 +1181,7 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
    nv50->num_so_targets = num_targets;
 
    if (nv50->so_targets_dirty)
-      nv50->dirty_3d |= NV50_NEW_STRMOUT;
+      nv50->dirty_3d |= NV50_NEW_3D_STRMOUT;
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 601f9b1449d..c5decb0b3fd 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -188,7 +188,7 @@ nv50_validate_scissor(struct nv50_context *nv50)
    int minx, maxx, miny, maxy, i;
 
    if (!(nv50->dirty_3d &
-         (NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT | NV50_NEW_FRAMEBUFFER)) &&
+         (NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT | NV50_NEW_3D_FRAMEBUFFER)) &&
        nv50->state.scissor == nv50->rast->pipe.scissor)
       return;
 
@@ -197,7 +197,7 @@ nv50_validate_scissor(struct nv50_context *nv50)
 
    nv50->state.scissor = nv50->rast->pipe.scissor;
 
-   if ((nv50->dirty_3d & NV50_NEW_FRAMEBUFFER) && !nv50->state.scissor)
+   if ((nv50->dirty_3d & NV50_NEW_3D_FRAMEBUFFER) && !nv50->state.scissor)
       nv50->scissors_dirty = (1 << NV50_MAX_VIEWPORTS) - 1;
 
    for (i = 0; i < NV50_MAX_VIEWPORTS; i++) {
@@ -290,10 +290,10 @@ nv50_check_program_ucps(struct nv50_context *nv50,
 
    vp->vp.clpd_nr = n;
    if (likely(vp == nv50->vertprog)) {
-      nv50->dirty_3d |= NV50_NEW_VERTPROG;
+      nv50->dirty_3d |= NV50_NEW_3D_VERTPROG;
       nv50_vertprog_validate(nv50);
    } else {
-      nv50->dirty_3d |= NV50_NEW_GMTYPROG;
+      nv50->dirty_3d |= NV50_NEW_3D_GMTYPROG;
       nv50_gmtyprog_validate(nv50);
    }
    nv50_fp_linkage_validate(nv50);
@@ -342,7 +342,7 @@ nv50_validate_clip(struct nv50_context *nv50)
    struct nv50_program *vp;
    uint8_t clip_enable;
 
-   if (nv50->dirty_3d & NV50_NEW_CLIP) {
+   if (nv50->dirty_3d & NV50_NEW_3D_CLIP) {
       BEGIN_NV04(push, NV50_3D(CB_ADDR), 1);
       PUSH_DATA (push, (NV50_CB_AUX_UCP_OFFSET << 8) | NV50_CB_AUX);
       BEGIN_NI04(push, NV50_3D(CB_DATA(0)), PIPE_MAX_CLIP_PLANES * 4);
@@ -445,23 +445,23 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
    ctx_to->constbuf_dirty[2] = (1 << NV50_MAX_PIPE_CONSTBUFS) - 1;
 
    if (!ctx_to->vertex)
-      ctx_to->dirty_3d &= ~(NV50_NEW_VERTEX | NV50_NEW_ARRAYS);
+      ctx_to->dirty_3d &= ~(NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS);
 
    if (!ctx_to->vertprog)
-      ctx_to->dirty_3d &= ~NV50_NEW_VERTPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_VERTPROG;
    if (!ctx_to->fragprog)
-      ctx_to->dirty_3d &= ~NV50_NEW_FRAGPROG;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_FRAGPROG;
 
    if (!ctx_to->blend)
-      ctx_to->dirty_3d &= ~NV50_NEW_BLEND;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_BLEND;
    if (!ctx_to->rast)
 #ifdef NV50_SCISSORS_CLIPPING
-      ctx_to->dirty_3d &= ~(NV50_NEW_RASTERIZER | NV50_NEW_SCISSOR);
+      ctx_to->dirty_3d &= ~(NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_SCISSOR);
 #else
-      ctx_to->dirty_3d &= ~NV50_NEW_RASTERIZER;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_RASTERIZER;
 #endif
    if (!ctx_to->zsa)
-      ctx_to->dirty_3d &= ~NV50_NEW_ZSA;
+      ctx_to->dirty_3d &= ~NV50_NEW_3D_ZSA;
 
    ctx_to->screen->cur_ctx = ctx_to;
 }
@@ -470,42 +470,42 @@ static struct state_validate {
     void (*func)(struct nv50_context *);
     uint32_t states;
 } validate_list[] = {
-    { nv50_validate_fb,            NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_blend,         NV50_NEW_BLEND },
-    { nv50_validate_zsa,           NV50_NEW_ZSA },
-    { nv50_validate_sample_mask,   NV50_NEW_SAMPLE_MASK },
-    { nv50_validate_rasterizer,    NV50_NEW_RASTERIZER },
-    { nv50_validate_blend_colour,  NV50_NEW_BLEND_COLOUR },
-    { nv50_validate_stencil_ref,   NV50_NEW_STENCIL_REF },
-    { nv50_validate_stipple,       NV50_NEW_STIPPLE },
+    { nv50_validate_fb,            NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_blend,         NV50_NEW_3D_BLEND },
+    { nv50_validate_zsa,           NV50_NEW_3D_ZSA },
+    { nv50_validate_sample_mask,   NV50_NEW_3D_SAMPLE_MASK },
+    { nv50_validate_rasterizer,    NV50_NEW_3D_RASTERIZER },
+    { nv50_validate_blend_colour,  NV50_NEW_3D_BLEND_COLOUR },
+    { nv50_validate_stencil_ref,   NV50_NEW_3D_STENCIL_REF },
+    { nv50_validate_stipple,       NV50_NEW_3D_STIPPLE },
 #ifdef NV50_SCISSORS_CLIPPING
-    { nv50_validate_scissor,       NV50_NEW_SCISSOR | NV50_NEW_VIEWPORT |
-                                   NV50_NEW_RASTERIZER |
-                                   NV50_NEW_FRAMEBUFFER },
+    { nv50_validate_scissor,       NV50_NEW_3D_SCISSOR | NV50_NEW_3D_VIEWPORT |
+                                   NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_FRAMEBUFFER },
 #else
-    { nv50_validate_scissor,       NV50_NEW_SCISSOR },
+    { nv50_validate_scissor,       NV50_NEW_3D_SCISSOR },
 #endif
-    { nv50_validate_viewport,      NV50_NEW_VIEWPORT },
-    { nv50_vertprog_validate,      NV50_NEW_VERTPROG },
-    { nv50_gmtyprog_validate,      NV50_NEW_GMTYPROG },
-    { nv50_fragprog_validate,      NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_MIN_SAMPLES },
-    { nv50_fp_linkage_validate,    NV50_NEW_FRAGPROG | NV50_NEW_VERTPROG |
-                                   NV50_NEW_GMTYPROG | NV50_NEW_RASTERIZER },
-    { nv50_gp_linkage_validate,    NV50_NEW_GMTYPROG | NV50_NEW_VERTPROG },
-    { nv50_validate_derived_rs,    NV50_NEW_FRAGPROG | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_validate_derived_2,     NV50_NEW_ZSA | NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_derived_3,     NV50_NEW_BLEND | NV50_NEW_FRAMEBUFFER },
-    { nv50_validate_clip,          NV50_NEW_CLIP | NV50_NEW_RASTERIZER |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_constbufs_validate,     NV50_NEW_CONSTBUF },
-    { nv50_validate_textures,      NV50_NEW_TEXTURES },
-    { nv50_validate_samplers,      NV50_NEW_SAMPLERS },
-    { nv50_stream_output_validate, NV50_NEW_STRMOUT |
-                                   NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG },
-    { nv50_vertex_arrays_validate, NV50_NEW_VERTEX | NV50_NEW_ARRAYS },
-    { nv50_validate_min_samples,   NV50_NEW_MIN_SAMPLES },
+    { nv50_validate_viewport,      NV50_NEW_3D_VIEWPORT },
+    { nv50_vertprog_validate,      NV50_NEW_3D_VERTPROG },
+    { nv50_gmtyprog_validate,      NV50_NEW_3D_GMTYPROG },
+    { nv50_fragprog_validate,      NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_MIN_SAMPLES },
+    { nv50_fp_linkage_validate,    NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_VERTPROG |
+                                   NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_RASTERIZER },
+    { nv50_gp_linkage_validate,    NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_VERTPROG },
+    { nv50_validate_derived_rs,    NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_validate_derived_2,     NV50_NEW_3D_ZSA | NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_derived_3,     NV50_NEW_3D_BLEND | NV50_NEW_3D_FRAMEBUFFER },
+    { nv50_validate_clip,          NV50_NEW_3D_CLIP | NV50_NEW_3D_RASTERIZER |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_constbufs_validate,     NV50_NEW_3D_CONSTBUF },
+    { nv50_validate_textures,      NV50_NEW_3D_TEXTURES },
+    { nv50_validate_samplers,      NV50_NEW_3D_SAMPLERS },
+    { nv50_stream_output_validate, NV50_NEW_3D_STRMOUT |
+                                   NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG },
+    { nv50_vertex_arrays_validate, NV50_NEW_3D_VERTEX | NV50_NEW_3D_ARRAYS },
+    { nv50_validate_min_samples,   NV50_NEW_3D_MIN_SAMPLES },
 };
 
 bool
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index e595524dcc4..dd094d5c070 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -353,7 +353,7 @@ nv50_clear_render_target(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
    PUSH_DATA (push, nv50->cond_condmode);
 
-   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR;
 }
 
 static void
@@ -436,7 +436,7 @@ nv50_clear_depth_stencil(struct pipe_context *pipe,
    BEGIN_NV04(push, NV50_3D(COND_MODE), 1);
    PUSH_DATA (push, nv50->cond_condmode);
 
-   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR;
 }
 
 void
@@ -525,7 +525,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
    uint32_t mode = 0;
 
    /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */
-   if (!nv50_state_validate(nv50, NV50_NEW_FRAMEBUFFER))
+   if (!nv50_state_validate(nv50, NV50_NEW_3D_FRAMEBUFFER))
       return;
 
    /* We have to clear ALL of the layers, not up to the min number of layers
@@ -798,7 +798,7 @@ nv50_clear_buffer(struct pipe_context *pipe,
                              data, data_size);
    }
 
-   nv50->dirty_3d |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR;
+   nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR;
 }
 
 /* =============================== BLIT CODE ===================================
@@ -1259,9 +1259,9 @@ nv50_blitctx_pre_blit(struct nv50_blitctx *ctx)
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
 
    nv50->dirty_3d =
-      NV50_NEW_FRAMEBUFFER | NV50_NEW_MIN_SAMPLES |
-      NV50_NEW_VERTPROG | NV50_NEW_FRAGPROG | NV50_NEW_GMTYPROG |
-      NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS;
+      NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_MIN_SAMPLES |
+      NV50_NEW_3D_VERTPROG | NV50_NEW_3D_FRAGPROG | NV50_NEW_3D_GMTYPROG |
+      NV50_NEW_3D_TEXTURES | NV50_NEW_3D_SAMPLERS;
 }
 
 static void
@@ -1306,10 +1306,10 @@ nv50_blitctx_post_blit(struct nv50_blitctx *blit)
    nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
 
    nv50->dirty_3d = blit->saved.dirty_3d |
-      (NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR | NV50_NEW_SAMPLE_MASK |
-       NV50_NEW_RASTERIZER | NV50_NEW_ZSA | NV50_NEW_BLEND |
-       NV50_NEW_TEXTURES | NV50_NEW_SAMPLERS |
-       NV50_NEW_VERTPROG | NV50_NEW_GMTYPROG | NV50_NEW_FRAGPROG);
+      (NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR | NV50_NEW_3D_SAMPLE_MASK |
+       NV50_NEW_3D_RASTERIZER | NV50_NEW_3D_ZSA | NV50_NEW_3D_BLEND |
+       NV50_NEW_3D_TEXTURES | NV50_NEW_3D_SAMPLERS |
+       NV50_NEW_3D_VERTPROG | NV50_NEW_3D_GMTYPROG | NV50_NEW_3D_FRAGPROG);
    nv50->scissors_dirty |= 1;
 
    nv50->base.pipe.set_min_samples(&nv50->base.pipe, blit->saved.min_samples);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 11f2471ed5b..06f8462b579 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -779,9 +779,9 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    nv50->vbo_push_hint = /* the 64 is heuristic */
       !(info->indexed && ((nv50->vb_elt_limit + 64) < info->count));
 
-   if (nv50->vbo_user && !(nv50->dirty_3d & (NV50_NEW_ARRAYS | NV50_NEW_VERTEX))) {
+   if (nv50->vbo_user && !(nv50->dirty_3d & (NV50_NEW_3D_ARRAYS | NV50_NEW_3D_VERTEX))) {
       if (!!nv50->vbo_fifo != nv50->vbo_push_hint)
-         nv50->dirty_3d |= NV50_NEW_ARRAYS;
+         nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
       else
       if (!nv50->vbo_fifo)
          nv50_update_user_vbufs(nv50);

From 517d2c97e1c94d9d5a3c03b4fac3e8d80d5334c4 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 14:34:34 +0100
Subject: [PATCH 039/197] nv50: rename 3d binding points to NV50_BIND_3D_XXX

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_context.c       | 24 +++++++++----------
 .../drivers/nouveau/nv50/nv50_context.h       | 20 ++++++++--------
 .../drivers/nouveau/nv50/nv50_shader_state.c  | 12 +++++-----
 src/gallium/drivers/nouveau/nv50/nv50_state.c | 12 +++++-----
 .../nouveau/nv50/nv50_state_validate.c        |  6 ++---
 .../drivers/nouveau/nv50/nv50_surface.c       |  8 +++----
 src/gallium/drivers/nouveau/nv50/nv50_tex.c   |  2 +-
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   |  8 +++----
 8 files changed, 46 insertions(+), 46 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 18e2ab018dc..61a52c4b366 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -177,7 +177,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
          if (nv50->framebuffer.cbufs[i] &&
              nv50->framebuffer.cbufs[i]->texture == res) {
             nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
             if (!--ref)
                return ref;
          }
@@ -187,7 +187,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       if (nv50->framebuffer.zsbuf &&
           nv50->framebuffer.zsbuf->texture == res) {
          nv50->dirty_3d |= NV50_NEW_3D_FRAMEBUFFER;
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
          if (!--ref)
             return ref;
       }
@@ -203,7 +203,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
       for (i = 0; i < nv50->num_vtxbufs; ++i) {
          if (nv50->vtxbuf[i].buffer == res) {
             nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
             if (!--ref)
                return ref;
          }
@@ -211,8 +211,8 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
 
       if (nv50->idxbuf.buffer == res) {
          /* Just rebind to the bufctx as there is no separate dirty bit */
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
-         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(res), RD);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
+         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(res), RD);
          if (!--ref)
             return ref;
       }
@@ -223,7 +223,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
          if (nv50->textures[s][i] &&
              nv50->textures[s][i]->texture == res) {
             nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
             if (!--ref)
                return ref;
          }
@@ -238,7 +238,7 @@ nv50_invalidate_resource_storage(struct nouveau_context *ctx,
              nv50->constbuf[s][i].u.buf == res) {
             nv50->dirty_3d |= NV50_NEW_3D_CONSTBUF;
             nv50->constbuf_dirty[s] |= 1 << i;
-            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+            nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
             if (!--ref)
                return ref;
          }
@@ -345,10 +345,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
 
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->code);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->code);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->uniforms);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->txc);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->stack_bo);
    if (screen->compute) {
       BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
       BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
@@ -357,7 +357,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
 
-   BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
+   BCTX_REFN_bo(nv50->bufctx_3d, 3D_SCREEN, flags, screen->fence.bo);
    BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
    if (screen->compute)
       BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 89a4da501c2..314b99783c7 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -53,16 +53,16 @@
 #define NV50_NEW_CP_GLOBALS   (1 << 1)
 
 /* 3d bufctx (during draw_vbo, blit_3d) */
-#define NV50_BIND_FB          0
-#define NV50_BIND_VERTEX      1
-#define NV50_BIND_VERTEX_TMP  2
-#define NV50_BIND_INDEX       3
-#define NV50_BIND_TEXTURES    4
-#define NV50_BIND_CB(s, i)   (5 + 16 * (s) + (i))
-#define NV50_BIND_SO         53
-#define NV50_BIND_SCREEN     54
-#define NV50_BIND_TLS        55
-#define NV50_BIND_3D_COUNT   56
+#define NV50_BIND_3D_FB          0
+#define NV50_BIND_3D_VERTEX      1
+#define NV50_BIND_3D_VERTEX_TMP  2
+#define NV50_BIND_3D_INDEX       3
+#define NV50_BIND_3D_TEXTURES    4
+#define NV50_BIND_3D_CB(s, i)   (5 + 16 * (s) + (i))
+#define NV50_BIND_3D_SO         53
+#define NV50_BIND_3D_SCREEN     54
+#define NV50_BIND_3D_TLS        55
+#define NV50_BIND_3D_COUNT      56
 
 /* compute bufctx (during launch_grid) */
 #define NV50_BIND_CP_GLOBAL   0
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 134ad837fab..693920e6870 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -94,7 +94,7 @@ nv50_constbufs_validate(struct nv50_context *nv50)
                BEGIN_NV04(push, NV50_3D(SET_PROGRAM_CB), 1);
                PUSH_DATA (push, (b << 12) | (i << 8) | p | 1);
 
-               BCTX_REFN(nv50->bufctx_3d, CB(s, i), res, RD);
+               BCTX_REFN(nv50->bufctx_3d, 3D_CB(s, i), res, RD);
 
                nv50->cb_dirty = 1; /* Force cache flush for UBO. */
             } else {
@@ -131,14 +131,14 @@ nv50_program_update_context_state(struct nv50_context *nv50,
 
    if (prog && prog->tls_space) {
       if (nv50->state.new_tls_space)
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
       if (!nv50->state.tls_required || nv50->state.new_tls_space)
-         BCTX_REFN_bo(nv50->bufctx_3d, TLS, flags, nv50->screen->tls_bo);
+         BCTX_REFN_bo(nv50->bufctx_3d, 3D_TLS, flags, nv50->screen->tls_bo);
       nv50->state.new_tls_space = false;
       nv50->state.tls_required |= 1 << stage;
    } else {
       if (nv50->state.tls_required == (1 << stage))
-         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TLS);
+         nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TLS);
       nv50->state.tls_required &= ~(1 << stage);
    }
 }
@@ -633,7 +633,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
    BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
    PUSH_DATA (push, ctrl);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_SO);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO);
 
    for (i = 0; i < nv50->num_so_targets; ++i) {
       struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
@@ -664,7 +664,7 @@ nv50_stream_output_validate(struct nv50_context *nv50)
          prims = MIN2(prims, limit);
       }
       targ->stride = so->stride[i];
-      BCTX_REFN(nv50->bufctx_3d, SO, buf, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_SO, buf, WR);
    }
    if (prims != ~0) {
       BEGIN_NV04(push, NV50_3D(STRMOUT_PRIMITIVE_LIMIT), 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 2e47df3b5b2..4d77bf1f711 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -698,7 +698,7 @@ nv50_stage_set_sampler_views(struct nv50_context *nv50, int s,
 
    nv50->num_textures[s] = nr;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
 
    nv50->dirty_3d |= NV50_NEW_3D_TEXTURES;
 }
@@ -857,7 +857,7 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
       nv50->constbuf[s][i].u.buf = NULL;
    else
    if (nv50->constbuf[s][i].u.buf)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_CB(s, i));
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_CB(s, i));
 
    pipe_resource_reference(&nv50->constbuf[s][i].u.buf, res);
 
@@ -945,7 +945,7 @@ nv50_set_framebuffer_state(struct pipe_context *pipe,
 {
    struct nv50_context *nv50 = nv50_context(pipe);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
 
    util_copy_framebuffer_state(&nv50->framebuffer, fb);
 
@@ -1008,7 +1008,7 @@ nv50_set_vertex_buffers(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
    unsigned i;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX);
    nv50->dirty_3d |= NV50_NEW_3D_ARRAYS;
 
    util_set_vertex_buffers_count(nv50->vtxbuf, &nv50->num_vtxbufs, vb,
@@ -1051,14 +1051,14 @@ nv50_set_index_buffer(struct pipe_context *pipe,
    struct nv50_context *nv50 = nv50_context(pipe);
 
    if (nv50->idxbuf.buffer)
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_INDEX);
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_INDEX);
 
    if (ib) {
       pipe_resource_reference(&nv50->idxbuf.buffer, ib->buffer);
       nv50->idxbuf.index_size = ib->index_size;
       if (ib->buffer) {
          nv50->idxbuf.offset = ib->offset;
-         BCTX_REFN(nv50->bufctx_3d, INDEX, nv04_resource(ib->buffer), RD);
+         BCTX_REFN(nv50->bufctx_3d, 3D_INDEX, nv04_resource(ib->buffer), RD);
       } else {
          nv50->idxbuf.user_buffer = ib->user_buffer;
       }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index c5decb0b3fd..fd22877a459 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -25,7 +25,7 @@ nv50_validate_fb(struct nv50_context *nv50)
    unsigned ms_mode = NV50_3D_MULTISAMPLE_MODE_MS1;
    uint32_t array_size = 0xffff, array_mode = 0;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
 
    BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
    PUSH_DATA (push, (076543210 << 4) | fb->nr_cbufs);
@@ -90,7 +90,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
       /* only register for writing, otherwise we'd always serialize here */
-      BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
    }
 
    if (fb->zsbuf) {
@@ -118,7 +118,7 @@ nv50_validate_fb(struct nv50_context *nv50)
       mt->base.status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       mt->base.status &= ~NOUVEAU_BUFFER_STATUS_GPU_READING;
 
-      BCTX_REFN(nv50->bufctx_3d, FB, &mt->base, WR);
+      BCTX_REFN(nv50->bufctx_3d, 3D_FB, &mt->base, WR);
    } else {
       BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1);
       PUSH_DATA (push, 0);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index dd094d5c070..ceb734a9847 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -1255,8 +1255,8 @@ nv50_blitctx_pre_blit(struct nv50_blitctx *ctx)
 
    ctx->saved.dirty_3d = nv50->dirty_3d;
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
 
    nv50->dirty_3d =
       NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_MIN_SAMPLES |
@@ -1302,8 +1302,8 @@ nv50_blitctx_post_blit(struct nv50_blitctx *blit)
       nv50->base.pipe.render_condition(&nv50->base.pipe, nv50->cond_query,
                                        nv50->cond_cond, nv50->cond_mode);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_FB);
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_TEXTURES);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_FB);
+   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_TEXTURES);
 
    nv50->dirty_3d = blit->saved.dirty_3d |
       (NV50_NEW_3D_FRAMEBUFFER | NV50_NEW_3D_SCISSOR | NV50_NEW_3D_SAMPLE_MASK |
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index 4b69c3bd504..414d326eeed 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -299,7 +299,7 @@ nv50_validate_tic(struct nv50_context *nv50, int s)
       res->status &= ~NOUVEAU_BUFFER_STATUS_GPU_WRITING;
       res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;
 
-      BCTX_REFN(nv50->bufctx_3d, TEXTURES, res, RD);
+      BCTX_REFN(nv50->bufctx_3d, 3D_TEXTURES, res, RD);
 
       BEGIN_NV04(push, NV50_3D(BIND_TIC(s)), 1);
       PUSH_DATA (push, (tic->id << 9) | (i << 1) | 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 06f8462b579..47cca012b41 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -230,7 +230,7 @@ nv50_upload_user_buffers(struct nv50_context *nv50,
       addrs[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer, base, size,
                                       &bo);
       if (addrs[b])
-         BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, NOUVEAU_BO_GART |
+         BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, NOUVEAU_BO_GART |
                       NOUVEAU_BO_RD, bo);
    }
    nv50->base.vbo_dirty = true;
@@ -269,7 +269,7 @@ nv50_update_user_vbufs(struct nv50_context *nv50)
          address[b] = nouveau_scratch_data(&nv50->base, vb->user_buffer,
                                            base, size, &bo);
          if (address[b])
-            BCTX_REFN_bo(nv50->bufctx_3d, VERTEX_TMP, bo_flags, bo);
+            BCTX_REFN_bo(nv50->bufctx_3d, 3D_VERTEX_TMP, bo_flags, bo);
       }
 
       BEGIN_NV04(push, NV50_3D(VERTEX_ARRAY_LIMIT_HIGH(i)), 2);
@@ -286,7 +286,7 @@ static inline void
 nv50_release_user_vbufs(struct nv50_context *nv50)
 {
    if (nv50->vbo_user) {
-      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_VERTEX_TMP);
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_VERTEX_TMP);
       nouveau_scratch_done(&nv50->base);
    }
 }
@@ -394,7 +394,7 @@ nv50_vertex_arrays_validate(struct nv50_context *nv50)
          struct nv04_resource *buf = nv04_resource(vb->buffer);
          if (!(refd & (1 << b))) {
             refd |= 1 << b;
-            BCTX_REFN(nv50->bufctx_3d, VERTEX, buf, RD);
+            BCTX_REFN(nv50->bufctx_3d, 3D_VERTEX, buf, RD);
          }
          address = buf->address + vb->buffer_offset + ve->pipe.src_offset;
          limit = buf->address + buf->base.width0 - 1;

From a07ebc1993069306a1c80db8de726a8981e62092 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 14:49:39 +0100
Subject: [PATCH 040/197] nv50: rework the validation path for 3D

This exposes an interface for state validation that will also be used
to rework the compute validation path.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_context.h       | 10 +++++-
 .../nouveau/nv50/nv50_state_validate.c        | 36 ++++++++++++-------
 .../drivers/nouveau/nv50/nv50_surface.c       |  4 +--
 src/gallium/drivers/nouveau/nv50/nv50_vbo.c   |  2 +-
 4 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 314b99783c7..486ba4fc05c 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -231,7 +231,15 @@ void nv50_stream_output_validate(struct nv50_context *);
 extern void nv50_init_state_functions(struct nv50_context *);
 
 /* nv50_state_validate.c */
-bool nv50_state_validate(struct nv50_context *, uint32_t state_mask);
+struct nv50_state_validate {
+   void (*func)(struct nv50_context *);
+   uint32_t states;
+};
+
+bool nv50_state_validate(struct nv50_context *, uint32_t,
+                         struct nv50_state_validate *, int, uint32_t *,
+                         struct nouveau_bufctx *);
+bool nv50_state_validate_3d(struct nv50_context *, uint32_t);
 
 /* nv50_surface.c */
 extern void nv50_clear(struct pipe_context *, unsigned buffers,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index fd22877a459..e7631bb1fcf 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -466,10 +466,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
    ctx_to->screen->cur_ctx = ctx_to;
 }
 
-static struct state_validate {
-    void (*func)(struct nv50_context *);
-    uint32_t states;
-} validate_list[] = {
+static struct nv50_state_validate
+validate_list_3d[] = {
     { nv50_validate_fb,            NV50_NEW_3D_FRAMEBUFFER },
     { nv50_validate_blend,         NV50_NEW_3D_BLEND },
     { nv50_validate_zsa,           NV50_NEW_3D_ZSA },
@@ -509,7 +507,9 @@ static struct state_validate {
 };
 
 bool
-nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
+nv50_state_validate(struct nv50_context *nv50, uint32_t mask,
+                    struct nv50_state_validate *validate_list, int size,
+                    uint32_t *dirty, struct nouveau_bufctx *bufctx)
 {
    uint32_t state_mask;
    int ret;
@@ -518,16 +518,16 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
    if (nv50->screen->cur_ctx != nv50)
       nv50_switch_pipe_context(nv50);
 
-   state_mask = nv50->dirty_3d & mask;
+   state_mask = *dirty & mask;
 
    if (state_mask) {
-      for (i = 0; i < ARRAY_SIZE(validate_list); ++i) {
-         struct state_validate *validate = &validate_list[i];
+      for (i = 0; i < size; i++) {
+         struct nv50_state_validate *validate = &validate_list[i];
 
          if (state_mask & validate->states)
             validate->func(nv50);
       }
-      nv50->dirty_3d &= ~state_mask;
+      *dirty &= ~state_mask;
 
       if (nv50->state.rt_serialize) {
          nv50->state.rt_serialize = false;
@@ -535,14 +535,26 @@ nv50_state_validate(struct nv50_context *nv50, uint32_t mask)
          PUSH_DATA (nv50->base.pushbuf, 0);
       }
 
-      nv50_bufctx_fence(nv50->bufctx_3d, false);
+      nv50_bufctx_fence(bufctx, false);
    }
-   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_3d);
+   nouveau_pushbuf_bufctx(nv50->base.pushbuf, bufctx);
    ret = nouveau_pushbuf_validate(nv50->base.pushbuf);
 
+   return !ret;
+}
+
+bool
+nv50_state_validate_3d(struct nv50_context *nv50, uint32_t mask)
+{
+   bool ret;
+
+   ret = nv50_state_validate(nv50, mask, validate_list_3d,
+                             ARRAY_SIZE(validate_list_3d), &nv50->dirty_3d,
+                             nv50->bufctx_3d);
+
    if (unlikely(nv50->state.flushed)) {
       nv50->state.flushed = false;
       nv50_bufctx_fence(nv50->bufctx_3d, true);
    }
-   return !ret;
+   return ret;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
index ceb734a9847..68b0e18ef8f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c
@@ -525,7 +525,7 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers,
    uint32_t mode = 0;
 
    /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */
-   if (!nv50_state_validate(nv50, NV50_NEW_3D_FRAMEBUFFER))
+   if (!nv50_state_validate_3d(nv50, NV50_NEW_3D_FRAMEBUFFER))
       return;
 
    /* We have to clear ALL of the layers, not up to the min number of layers
@@ -1344,7 +1344,7 @@ nv50_blit_3d(struct nv50_context *nv50, const struct pipe_blit_info *info)
 
    nv50_blitctx_prepare_state(blit);
 
-   nv50_state_validate(nv50, ~0);
+   nv50_state_validate_3d(nv50, ~0);
 
    x_range = (float)info->src.box.width / (float)info->dst.box.width;
    y_range = (float)info->src.box.height / (float)info->dst.box.height;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 47cca012b41..a11cdf847b1 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -790,7 +790,7 @@ nv50_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (unlikely(nv50->num_so_targets && !nv50->gmtyprog))
       nv50->state.prim_size = nv50_pipe_prim_to_prim_size[info->mode];
 
-   nv50_state_validate(nv50, ~0);
+   nv50_state_validate_3d(nv50, ~0);
 
    push->kick_notify = nv50_draw_vbo_kick_notify;
 

From 5ed387675d61e739b548fde9ff40d284160d6269 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 14:58:20 +0100
Subject: [PATCH 041/197] nv50: rework nv50_compute_validate_program()

Reduce the amount of duplicated code by re-using
nv50_program_validate(). While we are at it, change the prototype to
return void. We no longer check whether the translation fails, but
improving the state validation is a long process.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_compute.c       | 32 +------------------
 .../drivers/nouveau/nv50/nv50_context.h       |  1 +
 .../drivers/nouveau/nv50/nv50_shader_state.c  | 15 +++++++++
 3 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index da448c43e6c..562a64e5673 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -155,34 +155,6 @@ nv50_screen_compute_setup(struct nv50_screen *screen,
    return 0;
 }
 
-static bool
-nv50_compute_validate_program(struct nv50_context *nv50)
-{
-   struct nv50_program *prog = nv50->compprog;
-
-   if (prog->mem)
-      return true;
-
-   if (!prog->translated) {
-      prog->translated = nv50_program_translate(
-         prog, nv50->screen->base.device->chipset, &nv50->base.debug);
-      if (!prog->translated)
-         return false;
-   }
-   if (unlikely(!prog->code_size))
-      return false;
-
-   if (likely(prog->code_size)) {
-      if (nv50_program_upload_code(nv50, prog)) {
-         struct nouveau_pushbuf *push = nv50->base.pushbuf;
-         BEGIN_NV04(push, NV50_CP(CODE_CB_FLUSH), 1);
-         PUSH_DATA (push, 0);
-         return true;
-      }
-   }
-   return false;
-}
-
 static void
 nv50_compute_validate_globals(struct nv50_context *nv50)
 {
@@ -201,9 +173,7 @@ nv50_compute_validate_globals(struct nv50_context *nv50)
 static bool
 nv50_compute_state_validate(struct nv50_context *nv50)
 {
-   if (!nv50_compute_validate_program(nv50))
-      return false;
-
+   nv50_compprog_validate(nv50);
    if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
       nv50_compute_validate_globals(nv50);
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 486ba4fc05c..2317fa2ccf8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -221,6 +221,7 @@ extern struct draw_stage *nv50_draw_render_stage(struct nv50_context *);
 void nv50_vertprog_validate(struct nv50_context *);
 void nv50_gmtyprog_validate(struct nv50_context *);
 void nv50_fragprog_validate(struct nv50_context *);
+void nv50_compprog_validate(struct nv50_context *);
 void nv50_fp_linkage_validate(struct nv50_context *);
 void nv50_gp_linkage_validate(struct nv50_context *);
 void nv50_constbufs_validate(struct nv50_context *);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 693920e6870..56a3df9d578 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -29,6 +29,8 @@
 #include "nv50/nv50_context.h"
 #include "nv50/nv50_query_hw.h"
 
+#include "nv50/nv50_compute.xml.h"
+
 void
 nv50_constbufs_validate(struct nv50_context *nv50)
 {
@@ -238,6 +240,19 @@ nv50_gmtyprog_validate(struct nv50_context *nv50)
    /* GP_ENABLE is updated in linkage validation */
 }
 
+void
+nv50_compprog_validate(struct nv50_context *nv50)
+{
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   struct nv50_program *cp = nv50->compprog;
+
+   if (cp && !nv50_program_validate(nv50, cp))
+      return;
+
+   BEGIN_NV04(push, NV50_CP(CODE_CB_FLUSH), 1);
+   PUSH_DATA (push, 0);
+}
+
 static void
 nv50_sprite_coords_validate(struct nv50_context *nv50)
 {

From af0c97fb90f21dae79a7221b19e8e13dd32ee5e8 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 15:03:26 +0100
Subject: [PATCH 042/197] nv50: add a new validation path for compute

This makes use of the new state validation interface to be consistent
with 3d.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Pierre Moreau <pierre.morrow@free.fr>
Tested-by: Pierre Moreau <pierre.morrow@free.fr>
---
 .../drivers/nouveau/nv50/nv50_compute.c       | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
index 562a64e5673..d781f6fd7d4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_compute.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -170,24 +170,25 @@ nv50_compute_validate_globals(struct nv50_context *nv50)
    }
 }
 
+static struct nv50_state_validate
+validate_list_cp[] = {
+   { nv50_compprog_validate,              NV50_NEW_CP_PROGRAM     },
+   { nv50_compute_validate_globals,       NV50_NEW_CP_GLOBALS     },
+};
+
 static bool
-nv50_compute_state_validate(struct nv50_context *nv50)
+nv50_state_validate_cp(struct nv50_context *nv50, uint32_t mask)
 {
-   nv50_compprog_validate(nv50);
-   if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
-      nv50_compute_validate_globals(nv50);
+   bool ret;
 
    /* TODO: validate textures, samplers, surfaces */
+   ret = nv50_state_validate(nv50, mask, validate_list_cp,
+                             ARRAY_SIZE(validate_list_cp), &nv50->dirty_cp,
+                             nv50->bufctx_cp);
 
-   nv50_bufctx_fence(nv50->bufctx_cp, false);
-
-   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp);
-   if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf)))
-      return false;
    if (unlikely(nv50->state.flushed))
       nv50_bufctx_fence(nv50->bufctx_cp, true);
-
-   return true;
+   return ret;
 }
 
 static void
@@ -248,7 +249,7 @@ nv50_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    struct nv50_program *cp = nv50->compprog;
    bool ret;
 
-   ret = !nv50_compute_state_validate(nv50);
+   ret = !nv50_state_validate_cp(nv50, ~0);
    if (ret) {
       NOUVEAU_ERR("Failed to launch grid !\n");
       return;

From a734c0f8ba4a99e3ae42fe7ca9b1af16a01587b3 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 17 Mar 2016 23:24:54 +0100
Subject: [PATCH 043/197] nv50/ir: print SUBFM subops

Only the 3d subop is currently emitted.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
index cfa85ec123c..066faa367d2 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -204,6 +204,11 @@ static const char *ldstSubOpStr[] =
    "", "lock", "unlock"
 };
 
+static const char *subfmOpStr[] =
+{
+   "", "3d"
+};
+
 static const char *DataTypeStr[] =
 {
    "-",
@@ -548,6 +553,10 @@ void Instruction::print() const
          if (subOp < Elements(ldstSubOpStr))
             PRINT("%s ", ldstSubOpStr[subOp]);
          break;
+      case OP_SUBFM:
+         if (subOp < Elements(subfmOpStr))
+            PRINT("%s ", subfmOpStr[subOp]);
+         break;
       default:
          if (subOp)
             PRINT("(SUBOP:%u) ", subOp);

From 1282146d4e41eb3b73678a71389706f16c245979 Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierre.morrow@free.fr>
Date: Sun, 13 Mar 2016 22:11:42 +0100
Subject: [PATCH 044/197] nv50: Mark compute states as dirty on context switch

Signed-off-by: Pierre Moreau <pierre.morrow@free.fr>
[ Samuel Pitoiset: Trivial rebase conflict ]
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/gallium/drivers/nouveau/nv50/nv50_state_validate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index e7631bb1fcf..51204930031 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -437,6 +437,7 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
       ctx_to->state = ctx_to->screen->save_state;
 
    ctx_to->dirty_3d = ~0;
+   ctx_to->dirty_cp = ~0;
    ctx_to->viewports_dirty = ~0;
    ctx_to->scissors_dirty = ~0;
 

From 8679d40dc792df76e292194ec443b33444c1d4a3 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 15 Mar 2016 01:00:18 -0700
Subject: [PATCH 045/197] i965: Account for TES in is_drawing_points().

Now that we implement tessellation shaders, the TES might be the last
stage enabled.  If it's outputting points, then the primitive type
reaching the SF is points.  We need to account for this.

Caught by Ilia Mirkin.

v2: Update dirty bit comment above caller (caught by Iago)

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/mesa/drivers/dri/i965/gen6_sf_state.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 2634e6ba6fd..7de2e8e7c47 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -160,6 +160,10 @@ is_drawing_points(const struct brw_context *brw)
    if (brw->geometry_program) {
       /* BRW_NEW_GEOMETRY_PROGRAM */
       return brw->geometry_program->OutputType == GL_POINTS;
+   } else if (brw->tes.prog_data) {
+      /* BRW_NEW_TES_PROG_DATA */
+      return brw->tes.prog_data->output_topology ==
+             BRW_TESS_OUTPUT_TOPOLOGY_POINT;
    } else {
       /* BRW_NEW_PRIMITIVE */
       return brw->primitive == _3DPRIM_POINTLIST;
@@ -216,8 +220,10 @@ calculate_attr_overrides(const struct brw_context *brw,
     * This is not required on Haswell, as the hardware ignores this state
     * when drawing non-points -- although we do still need to be careful to
     * correctly set the attr overrides.
+    *
+    * _NEW_POLYGON
+    * BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA
     */
-   /* BRW_NEW_PRIMITIVE | BRW_NEW_GEOMETRY_PROGRAM */
    bool drawing_points = is_drawing_points(brw);
 
    /* Initialize all the attr_overrides to 0.  In the loop below we'll modify
@@ -484,6 +490,7 @@ const struct brw_tracked_state gen6_sf_state = {
                BRW_NEW_FS_PROG_DATA |
                BRW_NEW_GEOMETRY_PROGRAM |
                BRW_NEW_PRIMITIVE |
+               BRW_NEW_TES_PROG_DATA |
                BRW_NEW_VUE_MAP_GEOM_OUT,
    },
    .emit = upload_sf_state,

From 24298b7e2ffe0d69ef996ab2c279b380bcb4a269 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 16 Mar 2016 17:01:10 -0700
Subject: [PATCH 046/197] i965: Decode non-normalized coordinates bit in
 SAMPLER_STATE.

We weren't printing this for some reason.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/mesa/drivers/dri/i965/brw_state_dump.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 46667884125..b7b0a86f1c7 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -423,11 +423,12 @@ static void gen7_dump_sampler_state(struct brw_context *brw,
                 GET_BITS(samp[1], 15, 8)
                );
       batch_out(brw, name, offset, i+2, "Border Color\n"); /* FINISHME: gen8+ */
-      batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s\n",
+      batch_out(brw, name, offset, i+3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s, %snormalized coords\n",
                 (GET_FIELD(samp[3], BRW_SAMPLER_MAX_ANISOTROPY) + 1) * 2,
                 sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCX_WRAP_MODE)],
                 sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCY_WRAP_MODE)],
-                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)]
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)],
+                (samp[3] & GEN7_SAMPLER_NON_NORMALIZED_COORDINATES) ? "non-" : ""
                );
 
       samp += 4;

From 5b2d8c2273c6f48e764a1386240ec674cb4aa4ad Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 14 Mar 2016 14:22:39 -0700
Subject: [PATCH 047/197] i965: Fix gl_TessLevelOuter[] for isolines.

Thanks to James Legg for finding this!

From the ARB_tessellation_shader spec:
"The number of isolines generated is derived from the first outer
 tessellation level; the number of segments in each isoline is
 derived from the second outer tessellation level."

According to the PRM, "TF.LineDensity determines # lines" while
"TF.LineDetail determines # segments".  Line Density is stored at
DWord 6, while Line Detail is at DWord 7.  So, they're not reversed
like they are for triangles and quads.

Fixes Piglit's spec/arb_tessellation_shader/execution/isoline,
and about 24 dEQP isoline tests (with GL_EXT_tessellation_shader
hacked on - it's not normally enabled).

Cc: mesa-stable@lists.freedesktop.org
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94524
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp | 16 +++++++++++++---
 src/mesa/drivers/dri/i965/brw_vec4_tes.cpp | 12 +++++++++---
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index cb345157f81..2046b94bca1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -402,6 +402,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          }
       } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
          dst.type = BRW_REGISTER_TYPE_F;
+         unsigned swiz = BRW_SWIZZLE_WZYX;
 
          /* This is a read of gl_TessLevelOuter[], which lives in the
           * high 4 DWords of the Patch URB header, in reverse order.
@@ -414,6 +415,8 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             dst.writemask = WRITEMASK_XYZ;
             break;
          case GL_ISOLINES:
+            /* Isolines are not reversed; swizzle .zw -> .xy */
+            swiz = BRW_SWIZZLE_ZWZW;
             dst.writemask = WRITEMASK_XY;
             return;
          default:
@@ -422,7 +425,7 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
          dst_reg tmp(this, glsl_type::vec4_type);
          emit_output_urb_read(tmp, 1, src_reg());
-         emit(MOV(dst, swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
+         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
       } else {
          emit_output_urb_read(dst, imm_offset, indirect_offset);
       }
@@ -475,8 +478,15 @@ vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           * Patch URB Header at DWords 4-7.  However, it's reversed, so
           * instead of .xyzw we have .wzyx.
           */
-         swiz = BRW_SWIZZLE_WZYX;
-         mask = writemask_for_backwards_vector(mask);
+         if (key->tes_primitive_mode == GL_ISOLINES) {
+            /* Isolines .xy should be stored in .zw, in order. */
+            swiz = BRW_SWIZZLE4(0, 0, 0, 1);
+            mask <<= 2;
+         } else {
+            /* Other domains are reversed; store .wzyx instead of .xyzw. */
+            swiz = BRW_SWIZZLE_WZYX;
+            mask = writemask_for_backwards_vector(mask);
+         }
       }
 
       emit_urb_write(swizzle(value, swiz), mask,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
index e3c23f1a52f..7ba494fbffc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tes.cpp
@@ -149,9 +149,15 @@ vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                src_reg(brw_vec8_grf(1, 0))));
       break;
    case nir_intrinsic_load_tess_level_outer:
-      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
-               swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
-                       BRW_SWIZZLE_WZYX)));
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_ZWZW)));
+      } else {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_WZYX)));
+      }
       break;
    case nir_intrinsic_load_tess_level_inner:
       if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {

From 757674e8d00772ce091e75df186e6c27821bd53d Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 10 Mar 2016 15:46:34 -0800
Subject: [PATCH 048/197] i965: Move is_drawing_points to brw_state.h.

I need to use this in multiple source files.

v2: Rebase on TES output domain fix.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_state.h     | 24 +++++++++++++++++++++++
 src/mesa/drivers/dri/i965/gen6_sf_state.c | 24 -----------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 6b85eac77d6..79ee5029bd4 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -34,6 +34,7 @@
 #define BRW_STATE_H
 
 #include "brw_context.h"
+#include "brw_defines.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -406,6 +407,29 @@ void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);
 void
 gen7_restore_default_l3_config(struct brw_context *brw);
 
+static inline bool
+is_drawing_points(const struct brw_context *brw)
+{
+   /* Determine if the primitives *reaching the SF* are points */
+   /* _NEW_POLYGON */
+   if (brw->ctx.Polygon.FrontMode == GL_POINT ||
+       brw->ctx.Polygon.BackMode == GL_POINT) {
+      return true;
+   }
+
+   if (brw->geometry_program) {
+      /* BRW_NEW_GEOMETRY_PROGRAM */
+      return brw->geometry_program->OutputType == GL_POINTS;
+   } else if (brw->tes.prog_data) {
+      /* BRW_NEW_TES_PROG_DATA */
+      return brw->tes.prog_data->output_topology ==
+             BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+   } else {
+      /* BRW_NEW_PRIMITIVE */
+      return brw->primitive == _3DPRIM_POINTLIST;
+   }
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 7de2e8e7c47..3a337e817b0 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -147,30 +147,6 @@ get_attr_override(const struct brw_vue_map *vue_map, int urb_entry_read_offset,
 }
 
 
-static bool
-is_drawing_points(const struct brw_context *brw)
-{
-   /* Determine if the primitives *reaching the SF* are points */
-   /* _NEW_POLYGON */
-   if (brw->ctx.Polygon.FrontMode == GL_POINT ||
-       brw->ctx.Polygon.BackMode == GL_POINT) {
-      return true;
-   }
-
-   if (brw->geometry_program) {
-      /* BRW_NEW_GEOMETRY_PROGRAM */
-      return brw->geometry_program->OutputType == GL_POINTS;
-   } else if (brw->tes.prog_data) {
-      /* BRW_NEW_TES_PROG_DATA */
-      return brw->tes.prog_data->output_topology ==
-             BRW_TESS_OUTPUT_TOPOLOGY_POINT;
-   } else {
-      /* BRW_NEW_PRIMITIVE */
-      return brw->primitive == _3DPRIM_POINTLIST;
-   }
-}
-
-
 /**
  * Create the mapping from the FS inputs we produce to the previous pipeline
  * stage (GS or VS) outputs they source from.

From 47be5a64c786e04578bebe21601b3c0821be75a0 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 10 Mar 2016 15:51:56 -0800
Subject: [PATCH 049/197] i965: Introduce an is_drawing_lines() helper.

Similar to is_drawing_points().

v2: Account for isoline tessellation output topology.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_state.h | 30 +++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 79ee5029bd4..783af78479e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -430,6 +430,36 @@ is_drawing_points(const struct brw_context *brw)
    }
 }
 
+static inline bool
+is_drawing_lines(const struct brw_context *brw)
+{
+   /* Determine if the primitives *reaching the SF* are lines */
+   /* _NEW_POLYGON */
+   if (brw->ctx.Polygon.FrontMode == GL_LINE ||
+       brw->ctx.Polygon.BackMode == GL_LINE) {
+      return true;
+   }
+
+   if (brw->geometry_program) {
+      /* BRW_NEW_GEOMETRY_PROGRAM */
+      return brw->geometry_program->OutputType == GL_LINE_STRIP;
+   } else if (brw->tes.prog_data) {
+      /* BRW_NEW_TES_PROG_DATA */
+      return brw->tes.prog_data->output_topology ==
+             BRW_TESS_OUTPUT_TOPOLOGY_LINE;
+   } else {
+      /* BRW_NEW_PRIMITIVE */
+      switch (brw->primitive) {
+      case _3DPRIM_LINELIST:
+      case _3DPRIM_LINESTRIP:
+      case _3DPRIM_LINELOOP:
+         return true;
+      }
+   }
+   return false;
+}
+
+
 #ifdef __cplusplus
 }
 #endif

From d000a4989f66921832a33742ea07fa191393231b Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 10 Mar 2016 16:04:01 -0800
Subject: [PATCH 050/197] i965: Include the viewport in the scissor rectangle.

We'll need to use scissoring to restrict fragments to the viewport
soon.  It seems harmless to include it generally, so let's do that.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94453
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94454
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/gen6_scissor_state.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 17b4a7fba96..a20673282f2 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -58,10 +58,10 @@ gen6_upload_scissor_state(struct brw_context *brw)
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
       int bbox[4];
 
-      bbox[0] = 0;
-      bbox[1] = fb_width;
-      bbox[2] = 0;
-      bbox[3] = fb_height;
+      bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
+      bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
+      bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
+      bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
       _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
 
       if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {

From 0de64ab7881cc4d343fecf8a2b5b9b9ca7b34416 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 10 Mar 2016 15:30:36 -0800
Subject: [PATCH 051/197] i965: Scissor to the viewport when rendering
 points/lines.

We're about to start allowing wide points/lines whose vertices are
outside the viewport past the clipper.  This scissoring hack ensures
that any fragments generated are still restricted to the viewport.

It is not necessary on Gen8+ as those platforms already discard
fragments which are outside the viewport.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94453
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94454
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/gen6_sf_state.c | 5 +++--
 src/mesa/drivers/dri/i965/gen7_sf_state.c | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index 3a337e817b0..42f9a5ca8b6 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -351,8 +351,9 @@ upload_sf_state(struct brw_context *brw)
        unreachable("not reached");
    }
 
-   /* _NEW_SCISSOR */
-   if (ctx->Scissor.EnableFlags)
+   /* _NEW_SCISSOR _NEW_POLYGON BRW_NEW_GEOMETRY_PROGRAM BRW_NEW_PRIMITIVE */
+   if (ctx->Scissor.EnableFlags ||
+       is_drawing_points(brw) || is_drawing_lines(brw))
       dw3 |= GEN6_SF_SCISSOR_ENABLE;
 
    /* _NEW_POLYGON */
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index b1f13aceba4..7c98c73edf8 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -188,8 +188,9 @@ upload_sf_state(struct brw_context *brw)
       dw2 |= GEN6_SF_CULL_NONE;
    }
 
-   /* _NEW_SCISSOR */
-   if (ctx->Scissor.EnableFlags)
+   /* _NEW_SCISSOR _NEW_POLYGON BRW_NEW_GEOMETRY_PROGRAM BRW_NEW_PRIMITIVE */
+   if (ctx->Scissor.EnableFlags ||
+       is_drawing_points(brw) || is_drawing_lines(brw))
       dw2 |= GEN6_SF_SCISSOR_ENABLE;
 
    /* _NEW_LINE */
@@ -254,7 +255,8 @@ const struct brw_tracked_state gen7_sf_state = {
                _NEW_POLYGON |
                _NEW_PROGRAM |
                _NEW_SCISSOR,
-      .brw   = BRW_NEW_CONTEXT,
+      .brw   = BRW_NEW_CONTEXT |
+               BRW_NEW_PRIMITIVE,
    },
    .emit = upload_sf_state,
 };

From 88d28aa4d9edec33ef7bcf1f56b77fbb756a24f8 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 10 Mar 2016 14:36:25 -0800
Subject: [PATCH 052/197] i965: Stop XY clipping point and line primitives.

Wide points and lines are not supposed to be clipped by the viewport.
Rather, they should be rendered, and any fragments outside of the
viewport should be discarded.

The traditional use case for this behavior is rendering moving wide
point particles.  When the center of the point approaches the viewport
edge, clipping would make it pop out of view early.

Fixes:
- dEQP-GLES2.functional.clipping.point.wide_point_clip
- dEQP-GLES3.functional.clipping.point.wide_point_clip
- dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_center
- dEQP-GLES3.functional.clipping.point.wide_point_clip_viewport_corner
- dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_center
- dEQP-GLES3.functional.clipping.line.wide_line_clip_viewport_corner

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94453
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94454
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/gen6_clip_state.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 9a29366f0e0..004ecebc69e 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -174,12 +174,14 @@ upload_clip_state(struct brw_context *brw)
    else
       enable = GEN6_CLIP_ENABLE;
 
+   if (!is_drawing_points(brw) && !is_drawing_lines(brw))
+      dw2 |= GEN6_CLIP_XY_TEST;
+
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
    OUT_BATCH(dw1);
    OUT_BATCH(enable |
 	     GEN6_CLIP_MODE_NORMAL |
-	     GEN6_CLIP_XY_TEST |
 	     dw2);
    OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
              U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
@@ -195,7 +197,9 @@ const struct brw_tracked_state gen6_clip_state = {
                _NEW_TRANSFORM,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FS_PROG_DATA |
+               BRW_NEW_GEOMETRY_PROGRAM |
                BRW_NEW_META_IN_PROGRESS |
+               BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD,
    },
    .emit = upload_clip_state,
@@ -209,7 +213,9 @@ const struct brw_tracked_state gen7_clip_state = {
                _NEW_TRANSFORM,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FS_PROG_DATA |
+               BRW_NEW_GEOMETRY_PROGRAM |
                BRW_NEW_META_IN_PROGRESS |
+               BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD,
    },
    .emit = upload_clip_state,

From f1b05735108c6733893dfbe762366f2676501c0d Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 15 Mar 2016 00:30:05 -0700
Subject: [PATCH 053/197] mesa: Add color renderable/texture filterable format
 info for ES 3.x.

OpenGL ES 3.x contains a table of sized internal formats and their
required properties.  In particular, each format is marked as
"Color Renderable" or "Texture Filterable".

This patch introduces two functions that can be used to query the
information from that table.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/glformats.c | 83 +++++++++++++++++++++++++++++++++++++++
 src/mesa/main/glformats.h |  7 ++++
 2 files changed, 90 insertions(+)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index cf6495885b6..96ab393c0e1 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -3556,3 +3556,86 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type)
     */
    unreachable("Unsupported format");
 }
+
+/**
+ * Returns true if \p internal_format is a sized internal format that
+ * is marked "Color Renderable" in Table 8.10 of the ES 3.2 specification.
+ */
+bool
+_mesa_is_es3_color_renderable(GLenum internal_format)
+{
+   switch (internal_format) {
+   case GL_R8:
+   case GL_RG8:
+   case GL_RGB8:
+   case GL_RGB565:
+   case GL_RGBA4:
+   case GL_RGB5_A1:
+   case GL_RGBA8:
+   case GL_RGB10_A2:
+   case GL_RGB10_A2UI:
+   case GL_SRGB8_ALPHA8:
+   case GL_R16F:
+   case GL_RG16F:
+   case GL_RGBA16F:
+   case GL_R32F:
+   case GL_RG32F:
+   case GL_RGBA32F:
+   case GL_R11F_G11F_B10F:
+   case GL_R8I:
+   case GL_R8UI:
+   case GL_R16I:
+   case GL_R16UI:
+   case GL_R32I:
+   case GL_R32UI:
+   case GL_RG8I:
+   case GL_RG8UI:
+   case GL_RG16I:
+   case GL_RG16UI:
+   case GL_RG32I:
+   case GL_RG32UI:
+   case GL_RGBA8I:
+   case GL_RGBA8UI:
+   case GL_RGBA16I:
+   case GL_RGBA16UI:
+   case GL_RGBA32I:
+   case GL_RGBA32UI:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Returns true if \p internal_format is a sized internal format that
+ * is marked "Texture Filterable" in Table 8.10 of the ES 3.2 specification.
+ */
+bool
+_mesa_is_es3_texture_filterable(GLenum internal_format)
+{
+   switch (internal_format) {
+   case GL_R8:
+   case GL_R8_SNORM:
+   case GL_RG8:
+   case GL_RG8_SNORM:
+   case GL_RGB8:
+   case GL_RGB8_SNORM:
+   case GL_RGB565:
+   case GL_RGBA4:
+   case GL_RGB5_A1:
+   case GL_RGBA8:
+   case GL_RGBA8_SNORM:
+   case GL_RGB10_A2:
+   case GL_SRGB8:
+   case GL_SRGB8_ALPHA8:
+   case GL_R16F:
+   case GL_RG16F:
+   case GL_RGB16F:
+   case GL_RGBA16F:
+   case GL_R11F_G11F_B10F:
+   case GL_RGB9_E5:
+      return true;
+   default:
+      return false;
+   }
+}
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 00d2767085d..c73f464e5f9 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -28,6 +28,7 @@
 #define GLFORMATS_H
 
 
+#include <stdbool.h>
 #include <GL/gl.h>
 
 
@@ -144,6 +145,12 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat );
 extern uint32_t
 _mesa_format_from_format_and_type(GLenum format, GLenum type);
 
+extern bool
+_mesa_is_es3_color_renderable(GLenum internal_format);
+
+extern bool
+_mesa_is_es3_texture_filterable(GLenum internal_format);
+
 #ifdef __cplusplus
 }
 #endif

From 46610238e0a8db47c293f75ad8d667747d6256af Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 15 Mar 2016 00:41:16 -0700
Subject: [PATCH 054/197] mesa: Do proper format error checks for
 GenerateMipmap in ES 3.x.

According to the OpenGL ES 3.2 spec's description of GenerateMipmap:

"An INVALID_OPERATION error is generated if the levelbase array was not
 specified with an unsized internal format from table 8.3 or a sized
 internal format that is both color-renderable and texture-filterable
 according to table 8.10."

Similar text exists in the ES 3.0 specification as well.

Our existing rules are pretty close, but miss a few things.  The
OpenGL specification actually doesn't have any text about internal
format checking - our existing code comes from a Khronos bug report.
The ES 3.x spec provides a clearer description.

Fixes dEQP-GLES3.functional.negative_api.texture.generatemipmap and
dEQP-GLES2.functional.negative_api.texture.generatemipmap_zero_level
_array_compressed.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/genmipmap.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index 6eacd424df7..1a6ae9a5f3c 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -79,6 +79,20 @@ bool
 _mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx,
                                                       GLenum internalformat)
 {
+   if (_mesa_is_gles3(ctx)) {
+      /* From the ES 3.2 specification's description of GenerateMipmap():
+       * "An INVALID_OPERATION error is generated if the levelbase array was
+       *  not specified with an unsized internal format from table 8.3 or a
+       *  sized internal format that is both color-renderable and
+       *  texture-filterable according to table 8.10."
+       */
+      return internalformat == GL_RGBA || internalformat == GL_RGB ||
+             internalformat == GL_LUMINANCE_ALPHA ||
+             internalformat == GL_LUMINANCE || internalformat == GL_ALPHA ||
+             (_mesa_is_es3_color_renderable(internalformat) &&
+              _mesa_is_es3_texture_filterable(internalformat));
+   }
+
    return (!_mesa_is_enum_format_integer(internalformat) &&
            !_mesa_is_depthstencil_format(internalformat) &&
            !_mesa_is_astc_format(internalformat) &&

From a100d89d09981d2ebb42a7e4643a48e78db8dfe3 Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@freedesktop.org>
Date: Fri, 18 Mar 2016 18:28:28 -0700
Subject: [PATCH 055/197] nv50,nvc0: Fix invalid constant.

Fix clang build error.

  CXX      codegen/nv50_ir_lowering_nvc0.lo
codegen/nv50_ir_lowering_nvc0.cpp:1783:42: error: invalid suffix 'd' on floating constant
      Value *zero = bld.loadImm(NULL, 0.0d);
                                         ^

Fixes: c1e4a6bfbf01 ("nv50,nvc0: handle SQRT lowering inside the driver")
Signed-off-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index d0936d88d60..01364b3b7e6 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1780,7 +1780,7 @@ NVC0LoweringPass::handleSQRT(Instruction *i)
 {
    if (i->dType == TYPE_F64) {
       Value *pred = bld.getSSA(1, FILE_PREDICATE);
-      Value *zero = bld.loadImm(NULL, 0.0d);
+      Value *zero = bld.loadImm(NULL, 0.0); // F64 zero for the SET below
       Value *dst = bld.getSSA(8);
       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);

From e05492fd7f0e1a9454482a9174f5870b8cb5a41e Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Sat, 19 Mar 2016 16:52:45 +0100
Subject: [PATCH 056/197] nv50/ir: fix compilation warning in
 handleSharedATOM()

In release build mode only, op may be used uninitialized because
the assertion has been removed.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 01364b3b7e6..1c56d16abc8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1098,6 +1098,7 @@ NVC0LoweringPass::handleSharedATOM(Instruction *atom)
          break;
       default:
          assert(0);
+         return;
       }
 
       Instruction *i =

From d86933e6f42b9c2f5bb617c66c91795c560a9abd Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 13:16:44 +0100
Subject: [PATCH 057/197] nv50,nvc0: replace resInfoCBSlot by auxCBSlot

Having two different variables for the driver constant buffer slot
is confusing and really useless.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Pierre Moreau <pierre.morrow@free.fr>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h |  3 +--
 .../nouveau/codegen/nv50_ir_lowering_nv50.cpp        |  4 ++--
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp        | 12 ++++++------
 src/gallium/drivers/nouveau/nouveau_compiler.c       |  2 --
 src/gallium/drivers/nouveau/nv50/nv50_program.c      |  1 -
 src/gallium/drivers/nouveau/nvc0/nvc0_program.c      |  4 +---
 6 files changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 9f7d2572bbe..21523a27761 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -160,7 +160,7 @@ struct nv50_ir_prog_info
       uint8_t clipDistances;     /* number of clip distance outputs */
       uint8_t cullDistances;     /* number of cull distance outputs */
       int8_t genUserClip;        /* request user clip planes for ClipVertex */
-      uint8_t auxCBSlot;         /* constant buffer index of UCP/draw data */
+      uint8_t auxCBSlot;         /* driver constant buffer slot */
       uint16_t ucpBase;          /* base address for UCPs */
       uint16_t drawInfoBase;     /* base address for draw parameters */
       uint8_t pointSize;         /* output index for PointSize */
@@ -175,7 +175,6 @@ struct nv50_ir_prog_info
       uint8_t globalAccess;      /* 1 for read, 2 for wr, 3 for rw */
       bool fp64;                 /* program uses fp64 math */
       bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
-      uint8_t resInfoCBSlot;     /* cX[] used for tex handles, surface info */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
       uint16_t sampleInfoBase;   /* base address for sample positions */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 12c5f699603..5a46ede8528 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -682,7 +682,7 @@ void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
                                        Value **ms_x, Value **ms_y) {
    // This loads the texture-indexed ms setting from the constant buffer
    Value *tmp = new_LValue(func, FILE_GPR);
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
    if (prog->getType() > Program::TYPE_VERTEX)
       off += 16 * 2 * 4;
@@ -1174,7 +1174,7 @@ NV50LoweringPreSSA::handleRDSV(Instruction *i)
       bld.mkLoad(TYPE_F32,
                  def,
                  bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                        TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
                  off);
       break;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 1c56d16abc8..8d3cf5ac2ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -600,7 +600,7 @@ NVC0LoweringPass::visit(BasicBlock *bb)
 inline Value *
 NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    uint32_t off = prog->driver->io.texBindBase + slot * 4;
    return bld.
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -1205,7 +1205,7 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 inline Value *
 NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
    return bld.
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
@@ -1214,7 +1214,7 @@ NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
 inline Value *
 NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
 
    if (ptr)
@@ -1227,7 +1227,7 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 inline Value *
 NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
 {
-   uint8_t b = prog->driver->io.resInfoCBSlot;
+   uint8_t b = prog->driver->io.auxCBSlot;
    off += prog->driver->io.suInfoBase;
 
    if (ptr)
@@ -1541,7 +1541,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
       call->indirect = 1;
       call->absolute = 1;
       call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
-                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
+                                   prog->driver->io.auxCBSlot, TYPE_U32,
                                    prog->driver->io.suInfoBase + base));
       call->setSrc(1, r[2]);
       call->setSrc(2, r[4]);
@@ -1716,7 +1716,7 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       bld.mkLoad(TYPE_F32,
                  i->getDef(0),
                  bld.mkSymbol(
-                       FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
+                       FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
                        TYPE_U32, prog->driver->io.sampleInfoBase +
                        4 * sym->reg.data.sv.index),
                  off);
diff --git a/src/gallium/drivers/nouveau/nouveau_compiler.c b/src/gallium/drivers/nouveau/nouveau_compiler.c
index cd44aa1e1d9..ca73fd17a43 100644
--- a/src/gallium/drivers/nouveau/nouveau_compiler.c
+++ b/src/gallium/drivers/nouveau/nouveau_compiler.c
@@ -114,8 +114,6 @@ nouveau_codegen(int chipset, int type, struct tgsi_token tokens[],
 
    info.io.auxCBSlot = 15;
    info.io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
-
-   info.io.resInfoCBSlot = 15;
    info.io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
    info.io.msInfoCBSlot = 15;
    info.io.msInfoBase = NV50_CB_AUX_MS_OFFSET;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index a67ef28abf8..3444b3110de 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -335,7 +335,6 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    info->io.ucpBase = NV50_CB_AUX_UCP_OFFSET;
    info->io.genUserClip = prog->vp.clpd_nr;
 
-   info->io.resInfoCBSlot = 15;
    info->io.suInfoBase = NV50_CB_AUX_TEX_MS_OFFSET;
    info->io.sampleInfoBase = NV50_CB_AUX_SAMPLE_OFFSET;
    info->io.msInfoCBSlot = 15;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index bc884d6c08f..48e3475a95f 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -540,12 +540,11 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.resInfoCBSlot = 0;
+         info->io.auxCBSlot = 0;
          info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
          info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
          info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
       } else {
-         info->io.resInfoCBSlot = 15;
          info->io.suInfoBase = 512;
       }
       info->io.msInfoCBSlot = 0;
@@ -555,7 +554,6 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
          info->io.texBindBase = 0x20;
          info->io.suInfoBase = 0; /* TODO */
       }
-      info->io.resInfoCBSlot = 15;
       info->io.sampleInfoBase = 256 + 128;
       info->io.suInfoBase = 512;
       info->io.msInfoCBSlot = 15;

From 26cc411db87f924003f227874d7a047dd8b5e5a4 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 15 Mar 2016 13:06:44 +0100
Subject: [PATCH 058/197] nv50/ir: make use of auxCBSlot instead of magic
 numbers

This avoids using magic numbers for the driver constbuf slot which
is always 15, except for compute shaders on gk104+ where slot 0
is used.

For gk104+, some special compute-related values like the thread
index are uploaded to screen->parm which is currently bound on c0.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Pierre Moreau <pierre.morrow@free.fr>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp     | 3 ++-
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index d284446f5d9..4bebfdc0a7b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2178,7 +2178,8 @@ Converter::getResourceBase(const int r)
 
    switch (r) {
    case TGSI_RESOURCE_GLOBAL:
-      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15);
+      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL,
+                       info->io.auxCBSlot);
       break;
    case TGSI_RESOURCE_LOCAL:
       assert(prog->getType() == Program::TYPE_COMPUTE);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 8d3cf5ac2ce..6f1ebef74fb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1699,7 +1699,8 @@ NVC0LoweringPass::handleRDSV(Instruction *i)
       }
       addr += prog->driver->prop.cp.gridInfoBase;
       bld.mkLoad(TYPE_U32, i->getDef(0),
-                 bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
+                 bld.mkSymbol(FILE_MEMORY_CONST, prog->driver->io.auxCBSlot,
+                              TYPE_U32, addr), NULL);
       break;
    case SV_SAMPLE_INDEX:
       // TODO: Properly pass source as an address in the PIX address space

From 902bbda81b31bacb2a8c60ca6a8ba8ca34ae73d3 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 24 Feb 2016 21:35:25 +0100
Subject: [PATCH 059/197] nvc0: avoid using magic numbers for the uniform_bo
 offsets

Instead make use of constants to improve readability.

The first 32 bytes of the driver constant buffer are unknown... This
doesn't seem to be used in the codegen part, but if the texBindBase
offset is shifted from 0x20 to 0x00, this breaks the universe for
really weird reasons. This seems to be related to textures.

Anyway, naming this NVC0_CB_AUX_UNK_INFO and adding a TODO should be
enough for now.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nvc0_compute.c       | 13 +++++----
 .../drivers/nouveau/nvc0/nvc0_context.h       | 25 +++++++++++++++++
 .../drivers/nouveau/nvc0/nvc0_program.c       | 12 ++++----
 .../drivers/nouveau/nvc0/nvc0_screen.c        | 14 +++++-----
 .../drivers/nouveau/nvc0/nvc0_screen.h        |  2 +-
 .../nouveau/nvc0/nvc0_state_validate.c        | 28 ++++++++++---------
 src/gallium/drivers/nouveau/nvc0/nvc0_tex.c   |  9 +++---
 src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c   | 14 ++++++----
 8 files changed, 73 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index ffbb16f79de..6aaa7ce1aaf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -153,7 +153,7 @@ nvc0_compute_validate_constbufs(struct nvc0_context *nvc0)
 
       if (nvc0->constbuf[s][i].user) {
          struct nouveau_bo *bo = nvc0->screen->uniform_bo;
-         const unsigned base = s << 16;
+         const unsigned base = NVC0_CB_USR_INFO(s);
          const unsigned size = nvc0->constbuf[s][0].size;
          assert(i == 0); /* we really only want OpenGL uniforms here */
          assert(nvc0->constbuf[s][0].u.data);
@@ -207,8 +207,8 @@ nvc0_compute_validate_driverconst(struct nvc0_context *nvc0)
 
    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
    PUSH_DATA (push, 1024);
-   PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (5 << 10));
-   PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (5 << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5));
    BEGIN_NVC0(push, NVC0_CP(CB_BIND), 1);
    PUSH_DATA (push, (15 << 8) | 1);
 
@@ -219,15 +219,16 @@ static void
 nvc0_compute_validate_buffers(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    const int s = 5;
    int i;
 
    BEGIN_NVC0(push, NVC0_CP(CB_SIZE), 3);
    PUSH_DATA (push, 1024);
-   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
-   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
    BEGIN_1IC0(push, NVC0_CP(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
-   PUSH_DATA (push, 512);
+   PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0));
 
    for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
       if (nvc0->buffers[s][i].buffer) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 54afe887ebd..31e1272aeed 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -98,6 +98,31 @@
 #define NVC0_BIND_M2MF          0
 #define NVC0_BIND_FENCE         1
 
+/* 6 user uniform buffers, at 64K each */
+#define NVC0_CB_USR_INFO(s)         (s << 16)
+#define NVC0_CB_USR_SIZE            (6 << 16)
+/* 6 driver constbuts, at 1K each */
+#define NVC0_CB_AUX_INFO(s)         NVC0_CB_USR_SIZE + (s << 10)
+#define NVC0_CB_AUX_SIZE            (6 << 10)
+/* XXX: Figure out what this UNK data is. */
+#define NVC0_CB_AUX_UNK_INFO        0x000
+#define NVC0_CB_AUX_UNK_SIZE        (8 * 4)
+/* 32 textures handles, at 1 32-bits integer each */
+#define NVC0_CB_AUX_TEX_INFO(i)     0x020 + (i) * 4
+#define NVC0_CB_AUX_TEX_SIZE        (32 * 4)
+/* 8 user clip planes, at 4 32-bits floats each */
+#define NVC0_CB_AUX_UCP_INFO        0x100
+#define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
+/* 8 sets of 32-bits integer pairs sample offsets */
+#define NVC0_CB_AUX_SAMPLE_INFO     0x180 /* FP */
+#define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 4 * 2)
+/* draw parameters (index bais, base instance, drawid) */
+#define NVC0_CB_AUX_DRAW_INFO       0x180 /* VP */
+/* 32 user buffers, at 4 32-bits integers each */
+#define NVC0_CB_AUX_BUF_INFO(i)     0x200 + (i) * 4 * 4
+#define NVC0_CB_AUX_BUF_SIZE        (NVC0_MAX_BUFFERS * 4 * 4)
+/* 4 32-bits floats for the vertex runout, put at the end */
+#define NVC0_CB_AUX_RUNOUT_INFO     NVC0_CB_USR_SIZE + NVC0_CB_AUX_SIZE
 
 struct nvc0_blitctx;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 48e3475a95f..b7c6faf9cde 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -535,8 +535,8 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
 
    info->io.genUserClip = prog->vp.num_ucps;
    info->io.auxCBSlot = 15;
-   info->io.ucpBase = 256;
-   info->io.drawInfoBase = 256 + 128;
+   info->io.ucpBase = NVC0_CB_AUX_UCP_INFO;
+   info->io.drawInfoBase = NVC0_CB_AUX_DRAW_INFO;
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
@@ -545,17 +545,17 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
          info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
          info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
       } else {
-         info->io.suInfoBase = 512;
+         info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       }
       info->io.msInfoCBSlot = 0;
       info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.texBindBase = 0x20;
+         info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
          info->io.suInfoBase = 0; /* TODO */
       }
-      info->io.sampleInfoBase = 256 + 128;
-      info->io.suInfoBase = 512;
+      info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
+      info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       info->io.msInfoCBSlot = 15;
       info->io.msInfoBase = 0; /* TODO */
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 3c5b1da2063..553c001cd2b 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -922,14 +922,14 @@ nvc0_screen_create(struct nouveau_device *dev)
       /* auxiliary constants (6 user clip planes, base instance id) */
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
-      PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
       BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
       PUSH_DATA (push, (15 << 4) | 1);
       if (screen->eng3d->oclass >= NVE4_3D_CLASS) {
          unsigned j;
          BEGIN_1IC0(push, NVC0_3D(CB_POS), 9);
-         PUSH_DATA (push, 0);
+         PUSH_DATA (push, NVC0_CB_AUX_UNK_INFO);
          for (j = 0; j < 8; ++j)
             PUSH_DATA(push, j);
       } else {
@@ -943,8 +943,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    /* return { 0.0, 0.0, 0.0, 0.0 } for out-of-bounds vtxbuf access */
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
    PUSH_DATA (push, 256);
-   PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
-   PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
    BEGIN_1IC0(push, NVC0_3D(CB_POS), 5);
    PUSH_DATA (push, 0);
    PUSH_DATAf(push, 0.0f);
@@ -952,8 +952,8 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATAf(push, 0.0f);
    PUSH_DATAf(push, 0.0f);
    BEGIN_NVC0(push, NVC0_3D(VERTEX_RUNOUT_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
-   PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (6 << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_RUNOUT_INFO);
 
    if (screen->base.drm->version >= 0x01000101) {
       ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 8487abcf999..46b692df2e3 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -66,7 +66,7 @@ struct nvc0_screen {
 
    struct nouveau_bo *text;
    struct nouveau_bo *parm;       /* for COMPUTE */
-   struct nouveau_bo *uniform_bo; /* for 3D */
+   struct nouveau_bo *uniform_bo;
    struct nouveau_bo *tls;
    struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
    struct nouveau_bo *poly_cache;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index c0ed5c0043d..9c64482f2e2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -72,6 +72,7 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
 {
     struct nouveau_pushbuf *push = nvc0->base.pushbuf;
     struct pipe_framebuffer_state *fb = &nvc0->framebuffer;
+    struct nvc0_screen *screen = nvc0->screen;
     unsigned i, ms;
     unsigned ms_mode = NVC0_3D_MULTISAMPLE_MODE_MS1;
     bool serialize = false;
@@ -183,10 +184,10 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
     ms = 1 << ms_mode;
     BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
     PUSH_DATA (push, 1024);
-    PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10));
-    PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (4 << 10));
+    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
+    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(4));
     BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 2 * ms);
-    PUSH_DATA (push, 256 + 128);
+    PUSH_DATA (push, NVC0_CB_AUX_SAMPLE_INFO);
     for (i = 0; i < ms; i++) {
        float xy[2];
        nvc0->base.pipe.get_sample_position(&nvc0->base.pipe, ms, i, xy);
@@ -313,14 +314,14 @@ static inline void
 nvc0_upload_uclip_planes(struct nvc0_context *nvc0, unsigned s)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+   struct nvc0_screen *screen = nvc0->screen;
 
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
    PUSH_DATA (push, 1024);
-   PUSH_DATAh(push, bo->offset + (6 << 16) + (s << 10));
-   PUSH_DATA (push, bo->offset + (6 << 16) + (s << 10));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
    BEGIN_1IC0(push, NVC0_3D(CB_POS), PIPE_MAX_CLIP_PLANES * 4 + 1);
-   PUSH_DATA (push, 256);
+   PUSH_DATA (push, NVC0_CB_AUX_UCP_INFO);
    PUSH_DATAp(push, &nvc0->clip.ucp[0][0], PIPE_MAX_CLIP_PLANES * 4);
 }
 
@@ -424,7 +425,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
 
          if (nvc0->constbuf[s][i].user) {
             struct nouveau_bo *bo = nvc0->screen->uniform_bo;
-            const unsigned base = s << 16;
+            const unsigned base = NVC0_CB_USR_INFO(s);
             const unsigned size = nvc0->constbuf[s][0].size;
             assert(i == 0); /* we really only want OpenGL uniforms here */
             assert(nvc0->constbuf[s][0].u.data);
@@ -478,15 +479,16 @@ static void
 nvc0_validate_buffers(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    int i, s;
 
    for (s = 0; s < 5; s++) {
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
-      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (s << 10));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
       BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 4 * NVC0_MAX_BUFFERS);
-      PUSH_DATA (push, 512);
+      PUSH_DATA (push, NVC0_CB_AUX_BUF_INFO(0));
       for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
          if (nvc0->buffers[s][i].buffer) {
             struct nv04_resource *res =
@@ -550,8 +552,8 @@ nvc0_validate_driverconst(struct nvc0_context *nvc0)
    for (i = 0; i < 5; ++i) {
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
-      PUSH_DATA (push, screen->uniform_bo->offset + (6 << 16) + (i << 10));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(i));
       BEGIN_NVC0(push, NVC0_3D(CB_BIND(i)), 1);
       PUSH_DATA (push, (15 << 4) | 1);
    }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 53332400a4f..ce6a6dce39c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -707,21 +707,20 @@ void
 nve4_set_tex_handles(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
-   uint64_t address;
+   struct nvc0_screen *screen = nvc0->screen;
    unsigned s;
 
    if (nvc0->screen->base.class_3d < NVE4_3D_CLASS)
       return;
-   address = nvc0->screen->uniform_bo->offset + (6 << 16);
 
-   for (s = 0; s < 5; ++s, address += (1 << 10)) {
+   for (s = 0; s < 5; ++s) {
       uint32_t dirty = nvc0->textures_dirty[s] | nvc0->samplers_dirty[s];
       if (!dirty)
          continue;
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 1024);
-      PUSH_DATAh(push, address);
-      PUSH_DATA (push, address);
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s));
       do {
          int i = ffs(dirty) - 1;
          dirty &= ~(1 << i);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index e0e0ad2a0f7..4d9cd5752b5 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -820,6 +820,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    struct nv04_resource *buf_count = nv04_resource(info->indirect_params);
    unsigned size, macro, count = info->indirect_count, drawid = info->drawid;
    uint32_t offset = buf->offset + info->indirect_offset;
+   struct nvc0_screen *screen = nvc0->screen;
 
    PUSH_SPACE(push, 7);
 
@@ -833,10 +834,10 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    /* Queue things up to let the macros write params to the driver constbuf */
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
    PUSH_DATA (push, 512);
-   PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
-   PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
+   PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+   PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
    BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
-   PUSH_DATA (push, 256 + 128);
+   PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
 
    if (info->indexed) {
       assert(nvc0->idxbuf.buffer);
@@ -934,6 +935,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    int s;
 
    /* NOTE: caller must ensure that (min_index + index_bias) is >= 0 */
@@ -975,11 +977,11 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       PUSH_SPACE(push, 9);
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
       PUSH_DATA (push, 512);
-      PUSH_DATAh(push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
-      PUSH_DATA (push, nvc0->screen->uniform_bo->offset + (6 << 16) + (0 << 9));
+      PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
+      PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
       if (!info->indirect) {
          BEGIN_1IC0(push, NVC0_3D(CB_POS), 1 + 3);
-         PUSH_DATA (push, 256 + 128);
+         PUSH_DATA (push, NVC0_CB_AUX_DRAW_INFO);
          PUSH_DATA (push, info->index_bias);
          PUSH_DATA (push, info->start_instance);
          PUSH_DATA (push, info->drawid);

From d1b85dbffa0eec2b44bb2a9f339a2617a39730da Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 19 Mar 2016 11:43:37 -0400
Subject: [PATCH 060/197] nv50: reset TFB bufctx when we no longer hold a
 reference to the buffers

This fix is analogous to commit ff085d014.

This fixes some use-after-free situations in dEQP when an xfb state is
removed, and then a clear is triggered, which only does a partial
validation. It would attempt to read the no-longer-valid buffers,
resulting in crashes.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/nv50/nv50_shader_state.c | 2 --
 src/gallium/drivers/nouveau/nv50/nv50_state.c        | 4 +++-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
index 56a3df9d578..3d2ebfbcc46 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_shader_state.c
@@ -648,8 +648,6 @@ nv50_stream_output_validate(struct nv50_context *nv50)
    BEGIN_NV04(push, NV50_3D(STRMOUT_BUFFERS_CTRL), 1);
    PUSH_DATA (push, ctrl);
 
-   nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO);
-
    for (i = 0; i < nv50->num_so_targets; ++i) {
       struct nv50_so_target *targ = nv50_so_target(nv50->so_target[i]);
       struct nv04_resource *buf = nv04_resource(targ->pipe.buffer);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 4d77bf1f711..86e74d68b11 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -1180,8 +1180,10 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
    }
    nv50->num_so_targets = num_targets;
 
-   if (nv50->so_targets_dirty)
+   if (nv50->so_targets_dirty) {
+      nouveau_bufctx_reset(nv50->bufctx_3d, NV50_BIND_3D_SO);
       nv50->dirty_3d |= NV50_NEW_3D_STRMOUT;
+   }
 }
 
 static void

From d2445b00837c9123b59a1ac743c136546f334504 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 19 Mar 2016 11:46:11 -0400
Subject: [PATCH 061/197] nv50/ir: force-enable derivatives on TXD ops

This matters especially in vertex shaders, where derivatives are
disabled by default. This fixes textureGrad in vertex shaders on nv50.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp     | 4 +++-
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 682a19d6d78..bd6200687ed 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1634,7 +1634,9 @@ CodeEmitterNV50::emitTEX(const TexInstruction *i)
    code[1] |= (i->tex.mask & 0xc) << 12;
 
    if (i->tex.liveOnly)
-      code[1] |= 4;
+      code[1] |= 1 << 2;
+   if (i->tex.derivAll)
+      code[1] |= 1 << 3;
 
    defId(i->def(0), 2);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 5a46ede8528..6987503f9ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -934,6 +934,7 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 
    handleTEX(i);
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
+   i->tex.derivAll = true;
 
    for (c = 0; c < dim; ++c)
       crd[c] = bld.getScratch();

From 789e0965941533b0eeb2bc822012985e7c36d9c9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 8 Mar 2016 23:59:37 -0800
Subject: [PATCH 062/197] mesa: Disallow GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME
 on winsys FBO.

Fixes:
dEQP-GLES3.functional.negative_api.state.get_framebuffer_attachment_parameteriv

Apparently, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME is not allowed when
GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is GL_FRAMEBUFFER_DEFAULT, and
is expected to result in a GL_INVALID_ENUM error.

No GL specification actually defines what GL_FRAMEBUFFER_DEFAULT means.
It probably means the window system FBO.  It also doesn't mention the
behavior of any queries for that type.  Various ARB folks seem fairly
confused about it too.  For now, just do something vaguely like what
dEQP expects.

I think we probably need to check the visual bits against 0 for the
attachment, but we haven't been doing that thusfar, and given how
confusingly this is specified, I can't imagine anyone relying on it.

v2: Improve comments, move error condition above the
    _mesa_get_fb0_attachment call, add forgotten "return"
    (all suggested/caught by Jordan Justen).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/main/fbobject.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index d490918b816..bb8d4c3112b 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -3623,6 +3623,23 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
                      _mesa_enum_to_string(attachment));
          return;
       }
+
+      /* The specs are not clear about how to handle
+       * GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME with the default framebuffer,
+       * but dEQP-GLES3 expects an INVALID_ENUM error. This has also been
+       * discussed in:
+       *
+       * https://cvs.khronos.org/bugzilla/show_bug.cgi?id=12928#c1
+       * and https://bugs.freedesktop.org/show_bug.cgi?id=31947
+       */
+      if (pname == GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME) {
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "%s(requesting GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME "
+                     "when GL_FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is "
+                     "GL_FRAMEBUFFER_DEFAULT is not allowed)", caller);
+         return;
+      }
+
       /* the default / window-system FBO */
       att = _mesa_get_fb0_attachment(ctx, buffer, attachment);
    }

From 9184d9a0bbe8a8b88d676a20f95d66ceee9eaf21 Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierre.morrow@free.fr>
Date: Sat, 19 Mar 2016 17:56:03 +0100
Subject: [PATCH 063/197] nvc0/ir: Use double constant in handleSQRT

Fixes: a100d89d0998 (nv50,nvc0: Fix invalid constant.)
Signed-off-by: Pierre Moreau <pierre.morrow@free.fr>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 6f1ebef74fb..c88a2695a4c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1782,7 +1782,7 @@ NVC0LoweringPass::handleSQRT(Instruction *i)
 {
    if (i->dType == TYPE_F64) {
       Value *pred = bld.getSSA(1, FILE_PREDICATE);
-      Value *zero = bld.loadImm(NULL, 0);
+      Value *zero = bld.loadImm(NULL, 0.0);
       Value *dst = bld.getSSA(8);
       bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0));
       bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);

From fbe6e92899f90e7ee85420e88c807a1f2fd2be14 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 10 Mar 2016 13:20:36 +0100
Subject: [PATCH 064/197] gallium: add TGSI property NEXT_SHADER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Radeonsi needs to know which shader stage will execute after a shader
in order to make the best decision about which shader variant to compile
first.

This is only set for VS and TES, because we don't need it elsewhere.

VS has 3 variants:
- next shader is FS
- next shader is GS
- next shader is TCS

TES has 2 variants:
- next shader is FS
- next shader is GS

Currently, radeonsi always assumes the next shader is FS, which is suboptimal,
since st/mesa always knows which shader is next if the GLSL program is not
a "separate shader".

By default, ureg always sets "next shader is FS".

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_strings.c  |  1 +
 src/gallium/auxiliary/tgsi/tgsi_ureg.c     | 19 +++++++++++++++++++
 src/gallium/auxiliary/tgsi/tgsi_ureg.h     |  2 ++
 src/gallium/docs/source/tgsi.rst           |  8 ++++++++
 src/gallium/include/pipe/p_shader_tokens.h |  3 ++-
 5 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 6bd1a2e14d2..ae779a8320a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -145,6 +145,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
    "NUM_CLIPDIST_ENABLED",
    "NUM_CULLDIST_ENABLED",
    "FS_EARLY_DEPTH_STENCIL",
+   "NEXT_SHADER",
 };
 
 const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index ab1d03458ef..0dd5ea76f33 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -101,6 +101,7 @@ struct ureg_program
 {
    unsigned processor;
    bool supports_any_inout_decl_range;
+   int next_shader_processor;
 
    struct {
       unsigned semantic_name;
@@ -1966,6 +1967,16 @@ const struct tgsi_token *ureg_finalize( struct ureg_program *ureg )
 {
    const struct tgsi_token *tokens;
 
+   switch (ureg->processor) {
+   case TGSI_PROCESSOR_VERTEX:
+   case TGSI_PROCESSOR_TESS_EVAL:
+      ureg_property(ureg, TGSI_PROPERTY_NEXT_SHADER,
+                    ureg->next_shader_processor == -1 ?
+                       TGSI_PROCESSOR_FRAGMENT :
+                       ureg->next_shader_processor);
+      break;
+   }
+
    emit_header( ureg );
    emit_decls( ureg );
    copy_instructions( ureg );
@@ -2079,6 +2090,7 @@ ureg_create_with_screen(unsigned processor, struct pipe_screen *screen)
       screen->get_shader_param(screen,
                                util_pipe_shader_from_tgsi_processor(processor),
                                PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0;
+   ureg->next_shader_processor = -1;
 
    for (i = 0; i < Elements(ureg->properties); i++)
       ureg->properties[i] = ~0;
@@ -2108,6 +2120,13 @@ no_ureg:
 }
 
 
+void
+ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor)
+{
+   ureg->next_shader_processor = processor;
+}
+
+
 unsigned
 ureg_get_nr_outputs( const struct ureg_program *ureg )
 {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 04a62a6e160..74324678a99 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -114,6 +114,8 @@ ureg_create_shader( struct ureg_program *,
                     struct pipe_context *pipe,
 		    const struct pipe_stream_output_info *so );
 
+void
+ureg_set_next_shader_processor(struct ureg_program *ureg, unsigned processor);
 
 /* Alternately, return the built token stream and hand ownership of
  * that memory to the caller:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index af2df2251da..6366f7e802d 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -3213,6 +3213,14 @@ Whether depth test, stencil test, and occlusion query should run before
 the fragment shader (regardless of fragment shader side effects). Corresponds
 to GLSL early_fragment_tests.
 
+NEXT_SHADER
+"""""""""""
+
+The shader stage that will MOST LIKELY follow this shader when the shader is
+bound. This is only a hint to the driver and doesn't have to be precise.
+Only set for vertex (VS) and tessellation evaluation (TES) shaders.
+
+
 Texture Sampling and Texture Formats
 ------------------------------------
 
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 7a34841088a..5c460276d73 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -278,7 +278,8 @@ union tgsi_immediate_data
 #define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED   15
 #define TGSI_PROPERTY_NUM_CULLDIST_ENABLED   16
 #define TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL 17
-#define TGSI_PROPERTY_COUNT                  18
+#define TGSI_PROPERTY_NEXT_SHADER            18
+#define TGSI_PROPERTY_COUNT                  19
 
 struct tgsi_property {
    unsigned Type         : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */

From 2bdd7a46a92fcfa983bd53294342a0ef14098d7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 10 Mar 2016 13:28:08 +0100
Subject: [PATCH 065/197] st/mesa: set TGSI property NEXT_SHADER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 36 ++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 18414055549..bdfd5ebb9f1 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6370,6 +6370,42 @@ st_translate_program(
                        t->insn[t->labels[i].branch_target]);
    }
 
+   /* Set the next shader stage hint for VS and TES. */
+   switch (procType) {
+   case TGSI_PROCESSOR_VERTEX:
+   case TGSI_PROCESSOR_TESS_EVAL:
+      if (program->shader_program->SeparateShader)
+         break;
+
+      for (i = program->shader->Stage+1; i <= MESA_SHADER_FRAGMENT; i++) {
+         if (program->shader_program->_LinkedShaders[i]) {
+            unsigned next;
+
+            switch (i) {
+            case MESA_SHADER_TESS_CTRL:
+               next = TGSI_PROCESSOR_TESS_CTRL;
+               break;
+            case MESA_SHADER_TESS_EVAL:
+               next = TGSI_PROCESSOR_TESS_EVAL;
+               break;
+            case MESA_SHADER_GEOMETRY:
+               next = TGSI_PROCESSOR_GEOMETRY;
+               break;
+            case MESA_SHADER_FRAGMENT:
+               next = TGSI_PROCESSOR_FRAGMENT;
+               break;
+            default:
+               assert(0);
+               continue;
+            }
+
+            ureg_set_next_shader_processor(ureg, next);
+            break;
+         }
+      }
+      break;
+   }
+
 out:
    if (t) {
       free(t->arrays);

From a73a657def40375e0c5788bd8c3db7c6b987a934 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Thu, 10 Mar 2016 13:29:12 +0100
Subject: [PATCH 066/197] radeonsi: process TGSI property NEXT_SHADER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This allows compiling the main shader part as ES or LS.

If we get the correct hint, non-separable GLSL shaders no longer have to be
compiled as VS first, followed by LS or ES compiled on demand.

The result is that fewer shaders are compiled by piglit, but it doesn't
improve piglit running time.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c      |  9 ++++---
 .../drivers/radeonsi/si_state_shaders.c       | 27 +++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 8c1151aa493..151615eb4e7 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5897,12 +5897,15 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
 	struct si_shader *mainp = shader->selector->main_shader_part;
 	int r;
 
-	/* LS and ES are always compiled on demand. */
+	/* LS, ES, VS are compiled on demand if the main part hasn't been
+	 * compiled for that stage.
+	 */
 	if (!mainp ||
 	    (shader->selector->type == PIPE_SHADER_VERTEX &&
-	     (shader->key.vs.as_es || shader->key.vs.as_ls)) ||
+	     (shader->key.vs.as_es != mainp->key.vs.as_es ||
+	      shader->key.vs.as_ls != mainp->key.vs.as_ls)) ||
 	    (shader->selector->type == PIPE_SHADER_TESS_EVAL &&
-	     shader->key.tes.as_es)) {
+	     shader->key.tes.as_es != mainp->key.tes.as_es)) {
 		/* Monolithic shader (compiled as a whole, has many variants,
 		 * may take a long time to compile).
 		 */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5fe1f7960f3..d69bb2e317a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1042,6 +1042,31 @@ static int si_shader_select(struct pipe_context *ctx,
 	return si_shader_select_with_key(ctx, state, &key);
 }
 
+static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
+					  union si_shader_key *key)
+{
+	unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
+
+	switch (info->processor) {
+	case TGSI_PROCESSOR_VERTEX:
+		switch (next_shader) {
+		case TGSI_PROCESSOR_GEOMETRY:
+			key->vs.as_es = 1;
+			break;
+		case TGSI_PROCESSOR_TESS_CTRL:
+		case TGSI_PROCESSOR_TESS_EVAL:
+			key->vs.as_ls = 1;
+			break;
+		}
+		break;
+
+	case TGSI_PROCESSOR_TESS_EVAL:
+		if (next_shader == TGSI_PROCESSOR_GEOMETRY)
+			key->tes.as_es = 1;
+		break;
+	}
+}
+
 static void *si_create_shader_selector(struct pipe_context *ctx,
 				       const struct pipe_shader_state *state)
 {
@@ -1167,6 +1192,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 			goto error;
 
 		shader->selector = sel;
+		si_parse_next_shader_property(&sel->info, &shader->key);
 
 		tgsi_binary = si_get_tgsi_binary(sel);
 
@@ -1202,6 +1228,7 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 		union si_shader_key key;
 
 		memset(&key, 0, sizeof(key));
+		si_parse_next_shader_property(&sel->info, &key);
 
 		/* Set reasonable defaults, so that the shader key doesn't
 		 * cause any code to be eliminated.

From 8140154ae92c6bd022e409790bb069966a857aed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 11 Mar 2016 15:24:05 +0100
Subject: [PATCH 067/197] gallium/radeon: remove old CS tracing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Cons:
- it was only integrated in r600g
- it doesn't work with GPUVM
- it records buffer contents at the end of IBs instead of at the beginning,
  so the replay isn't exact
- it lacks an IB parser and user-friendliness

A better solution is apitrace in combination with gallium/ddebug, which
has a complete IB parser and can pinpoint hanging CP packets.

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r300/r300_context.c       |   2 +-
 src/gallium/drivers/r300/r300_flush.c         |   6 +-
 src/gallium/drivers/r600/r600_hw_context.c    |  11 +-
 src/gallium/drivers/r600/r600_pipe.c          |   4 +-
 src/gallium/drivers/r600/r600_pipe.h          |   6 -
 src/gallium/drivers/r600/r600_state_common.c  |  23 --
 src/gallium/drivers/radeon/r600_pipe_common.c |  21 +-
 src/gallium/drivers/radeon/r600_pipe_common.h |   6 +-
 src/gallium/drivers/radeon/radeon_uvd.c       |   4 +-
 src/gallium/drivers/radeon/radeon_vce.c       |   4 +-
 src/gallium/drivers/radeon/radeon_winsys.h    |   8 +-
 src/gallium/drivers/radeonsi/si_hw_context.c  |   3 +-
 src/gallium/drivers/radeonsi/si_pipe.c        |   5 +-
 src/gallium/winsys/amdgpu/drm/amdgpu_cs.c     |   6 +-
 src/gallium/winsys/radeon/drm/Makefile.am     |   2 -
 .../winsys/radeon/drm/Makefile.sources        |   4 -
 src/gallium/winsys/radeon/drm/radeon_ctx.h    | 205 ------------------
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c |  13 +-
 src/gallium/winsys/radeon/drm/radeon_drm_cs.h |   5 -
 .../winsys/radeon/drm/radeon_drm_cs_dump.c    | 161 --------------
 20 files changed, 23 insertions(+), 476 deletions(-)
 delete mode 100644 src/gallium/winsys/radeon/drm/radeon_ctx.h
 delete mode 100644 src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c

diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 6fa892089ec..d100a9df55b 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -385,7 +385,7 @@ struct pipe_context* r300_create_context(struct pipe_screen* screen,
     if (!r300->ctx)
         goto fail;
 
-    r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300, NULL);
+    r300->cs = rws->cs_create(r300->ctx, RING_GFX, r300_flush_callback, r300);
     if (r300->cs == NULL)
         goto fail;
 
diff --git a/src/gallium/drivers/r300/r300_flush.c b/src/gallium/drivers/r300/r300_flush.c
index 7a75b43a53e..63182cba2b2 100644
--- a/src/gallium/drivers/r300/r300_flush.c
+++ b/src/gallium/drivers/r300/r300_flush.c
@@ -53,7 +53,7 @@ static void r300_flush_and_cleanup(struct r300_context *r300, unsigned flags,
     }
 
     r300->flush_counter++;
-    r300->rws->cs_flush(r300->cs, flags, fence, 0);
+    r300->rws->cs_flush(r300->cs, flags, fence);
     r300->dirty_hw = 0;
 
     /* New kitchen sink, baby. */
@@ -88,11 +88,11 @@ void r300_flush(struct pipe_context *pipe,
              * and we cannot emit an empty CS. Let's write to some reg. */
             CS_LOCALS(r300);
             OUT_CS_REG(RB3D_COLOR_CHANNEL_MASK, 0);
-            r300->rws->cs_flush(r300->cs, flags, fence, 0);
+            r300->rws->cs_flush(r300->cs, flags, fence);
         } else {
             /* Even if hw is not dirty, we should at least reset the CS in case
              * the space checking failed for the first draw operation. */
-            r300->rws->cs_flush(r300->cs, flags, NULL, 0);
+            r300->rws->cs_flush(r300->cs, flags, NULL);
         }
     }
 
diff --git a/src/gallium/drivers/r600/r600_hw_context.c b/src/gallium/drivers/r600/r600_hw_context.c
index 4951297df42..7a6f957945b 100644
--- a/src/gallium/drivers/r600/r600_hw_context.c
+++ b/src/gallium/drivers/r600/r600_hw_context.c
@@ -57,18 +57,11 @@ void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
 
 		/* The number of dwords all the dirty states would take. */
 		mask = ctx->dirty_atoms;
-		while (mask != 0) {
+		while (mask != 0)
 			num_dw += ctx->atoms[u_bit_scan64(&mask)]->num_dw;
-			if (ctx->screen->b.trace_bo) {
-				num_dw += R600_TRACE_CS_DWORDS;
-			}
-		}
 
 		/* The upper-bound of how much space a draw command would take. */
 		num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
-		if (ctx->screen->b.trace_bo) {
-			num_dw += R600_TRACE_CS_DWORDS;
-		}
 	}
 
 	/* Count in queries_suspend. */
@@ -273,7 +266,7 @@ void r600_context_gfx_flush(void *context, unsigned flags,
 	flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
 
 	/* Flush the CS. */
-	ctx->b.ws->cs_flush(cs, flags, fence, ctx->screen->b.cs_count++);
+	ctx->b.ws->cs_flush(cs, flags, fence);
 
 	r600_begin_new_cs(ctx);
 }
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 7018088d204..88c500a162a 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -187,9 +187,7 @@ static struct pipe_context *r600_create_context(struct pipe_screen *screen,
 	}
 
 	rctx->b.gfx.cs = ws->cs_create(rctx->b.ctx, RING_GFX,
-				       r600_context_gfx_flush, rctx,
-				       rscreen->b.trace_bo ?
-					       rscreen->b.trace_bo->buf : NULL);
+				       r600_context_gfx_flush, rctx);
 	rctx->b.gfx.flush = r600_context_gfx_flush;
 
 	rctx->allocator_fetch_shader = u_suballocator_create(&rctx->b.b, 64 * 1024, 256,
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index f8a20398355..72aa64233a9 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -60,7 +60,6 @@
 /* the number of CS dwords for flushing and drawing */
 #define R600_MAX_FLUSH_CS_DWORDS	16
 #define R600_MAX_DRAW_CS_DWORDS		58
-#define R600_TRACE_CS_DWORDS		7
 
 #define R600_MAX_USER_CONST_BUFFERS 13
 #define R600_MAX_DRIVER_CONST_BUFFERS 3
@@ -571,15 +570,10 @@ static inline void r600_mark_atom_dirty(struct r600_context *rctx,
 	r600_set_atom_dirty(rctx, atom, true);
 }
 
-void r600_trace_emit(struct r600_context *rctx);
-
 static inline void r600_emit_atom(struct r600_context *rctx, struct r600_atom *atom)
 {
 	atom->emit(&rctx->b, atom);
 	r600_set_atom_dirty(rctx, atom, false);
-	if (rctx->screen->b.trace_bo) {
-		r600_trace_emit(rctx);
-	}
 }
 
 static inline void r600_set_cso_state(struct r600_context *rctx,
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 2211e07ceba..df41d3f028d 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -2029,10 +2029,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		cs->buf[cs->cdw++] = EVENT_TYPE(EVENT_TYPE_SQ_NON_EVENT);
 	}
 
-	if (rctx->screen->b.trace_bo) {
-		r600_trace_emit(rctx);
-	}
-
 	/* Set the depth buffer as dirty. */
 	if (rctx->framebuffer.state.zsbuf) {
 		struct pipe_surface *surf = rctx->framebuffer.state.zsbuf;
@@ -2927,22 +2923,3 @@ void r600_init_common_state_functions(struct r600_context *rctx)
 	rctx->b.set_occlusion_query_state = r600_set_occlusion_query_state;
 	rctx->b.need_gfx_cs_space = r600_need_gfx_cs_space;
 }
-
-void r600_trace_emit(struct r600_context *rctx)
-{
-	struct r600_screen *rscreen = rctx->screen;
-	struct radeon_winsys_cs *cs = rctx->b.gfx.cs;
-	uint64_t va;
-	uint32_t reloc;
-
-	va = rscreen->b.trace_bo->gpu_address;
-	reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, rscreen->b.trace_bo,
-				      RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
-	radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
-	radeon_emit(cs, va & 0xFFFFFFFFUL);
-	radeon_emit(cs, (va >> 32UL) & 0xFFUL);
-	radeon_emit(cs, cs->cdw);
-	radeon_emit(cs, rscreen->b.cs_count);
-	radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
-	radeon_emit(cs, reloc);
-}
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index ea028272ccd..eed9d83ee49 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -229,7 +229,7 @@ static void r600_flush_dma_ring(void *ctx, unsigned flags,
 	struct radeon_winsys_cs *cs = rctx->dma.cs;
 
 	if (cs->cdw)
-		rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence, 0);
+		rctx->ws->cs_flush(cs, flags, &rctx->last_sdma_fence);
 	if (fence)
 		rctx->ws->fence_reference(fence, rctx->last_sdma_fence);
 }
@@ -318,7 +318,7 @@ bool r600_common_context_init(struct r600_common_context *rctx,
 	if (rscreen->info.has_sdma && !(rscreen->debug_flags & DBG_NO_ASYNC_DMA)) {
 		rctx->dma.cs = rctx->ws->cs_create(rctx->ctx, RING_DMA,
 						   r600_flush_dma_ring,
-						   rctx, NULL);
+						   rctx);
 		rctx->dma.flush = r600_flush_dma_ring;
 	}
 
@@ -379,7 +379,6 @@ static const struct debug_named_value common_debug_options[] = {
 	{ "tex", DBG_TEX, "Print texture info" },
 	{ "compute", DBG_COMPUTE, "Print compute info" },
 	{ "vm", DBG_VM, "Print virtual addresses when creating resources" },
-	{ "trace_cs", DBG_TRACE_CS, "Trace cs and write rlockup_<csid>.c file with faulty cs" },
 	{ "info", DBG_INFO, "Print driver information" },
 
 	/* shaders */
@@ -893,19 +892,6 @@ bool r600_common_screen_init(struct r600_common_screen *rscreen,
 	pipe_mutex_init(rscreen->aux_context_lock);
 	pipe_mutex_init(rscreen->gpu_load_mutex);
 
-	if (((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 28) ||
-	     rscreen->info.drm_major == 3) &&
-	    (rscreen->debug_flags & DBG_TRACE_CS)) {
-		rscreen->trace_bo = (struct r600_resource*)pipe_buffer_create(&rscreen->b,
-										PIPE_BIND_CUSTOM,
-										PIPE_USAGE_STAGING,
-										4096);
-		if (rscreen->trace_bo) {
-			rscreen->trace_ptr = rscreen->ws->buffer_map(rscreen->trace_bo->buf, NULL,
-									PIPE_TRANSFER_UNSYNCHRONIZED);
-		}
-	}
-
 	if (rscreen->debug_flags & DBG_INFO) {
 		printf("pci_id = 0x%x\n", rscreen->info.pci_id);
 		printf("family = %i (%s)\n", rscreen->info.family,
@@ -951,9 +937,6 @@ void r600_destroy_common_screen(struct r600_common_screen *rscreen)
 	pipe_mutex_destroy(rscreen->aux_context_lock);
 	rscreen->aux_context->destroy(rscreen->aux_context);
 
-	if (rscreen->trace_bo)
-		pipe_resource_reference((struct pipe_resource**)&rscreen->trace_bo, NULL);
-
 	rscreen->ws->destroy(rscreen->ws);
 	FREE(rscreen);
 }
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index cf8dcf7ea88..a9de71a8734 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -61,7 +61,7 @@
 /* gap - reuse */
 #define DBG_COMPUTE		(1 << 2)
 #define DBG_VM			(1 << 3)
-#define DBG_TRACE_CS		(1 << 4)
+/* gap - reuse */
 /* shader logging */
 #define DBG_FS			(1 << 5)
 #define DBG_VS			(1 << 6)
@@ -303,10 +303,6 @@ struct r600_common_screen {
 	struct pipe_context		*aux_context;
 	pipe_mutex			aux_context_lock;
 
-	struct r600_resource		*trace_bo;
-	uint32_t			*trace_ptr;
-	unsigned			cs_count;
-
 	/* This must be in the screen, because UE4 uses one context for
 	 * compilation and another one for rendering.
 	 */
diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c
index b8efc58eaab..233f46091a4 100644
--- a/src/gallium/drivers/radeon/radeon_uvd.c
+++ b/src/gallium/drivers/radeon/radeon_uvd.c
@@ -92,7 +92,7 @@ struct ruvd_decoder {
 /* flush IB to the hardware */
 static void flush(struct ruvd_decoder *dec)
 {
-	dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+	dec->ws->cs_flush(dec->cs, RADEON_FLUSH_ASYNC, NULL);
 }
 
 /* add a new set register command to the IB */
@@ -1142,7 +1142,7 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context,
 	dec->stream_handle = rvid_alloc_stream_handle();
 	dec->screen = context->screen;
 	dec->ws = ws;
-	dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL, NULL);
+	dec->cs = ws->cs_create(rctx->ctx, RING_UVD, NULL, NULL);
 	if (!dec->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 087d9422c04..2ab74e9eb6c 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -56,7 +56,7 @@
  */
 static void flush(struct rvce_encoder *enc)
 {
-	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL, 0);
+	enc->ws->cs_flush(enc->cs, RADEON_FLUSH_ASYNC, NULL);
 	enc->task_info_idx = 0;
 	enc->bs_idx = 0;
 }
@@ -429,7 +429,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	enc->screen = context->screen;
 	enc->ws = ws;
-	enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc, NULL);
+	enc->cs = ws->cs_create(rctx->ctx, RING_VCE, rvce_cs_flush, enc);
 	if (!enc->cs) {
 		RVID_ERR("Can't get command submission context.\n");
 		goto error;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index daa15db2812..d35e963133e 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -593,14 +593,12 @@ struct radeon_winsys {
      * \param ring_type The ring type (GFX, DMA, UVD)
      * \param flush     Flush callback function associated with the command stream.
      * \param user      User pointer that will be passed to the flush callback.
-     * \param trace_buf Trace buffer when tracing is enabled
      */
     struct radeon_winsys_cs *(*cs_create)(struct radeon_winsys_ctx *ctx,
                                           enum ring_type ring_type,
                                           void (*flush)(void *ctx, unsigned flags,
 							struct pipe_fence_handle **fence),
-                                          void *flush_ctx,
-                                          struct pb_buffer *trace_buf);
+                                          void *flush_ctx);
 
     /**
      * Destroy a command stream.
@@ -673,12 +671,10 @@ struct radeon_winsys {
      * \param flags,      RADEON_FLUSH_ASYNC or 0.
      * \param fence       Pointer to a fence. If non-NULL, a fence is inserted
      *                    after the CS and is returned through this parameter.
-     * \param cs_trace_id A unique identifier of the cs, used for tracing.
      */
     void (*cs_flush)(struct radeon_winsys_cs *cs,
                      unsigned flags,
-                     struct pipe_fence_handle **fence,
-                     uint32_t cs_trace_id);
+                     struct pipe_fence_handle **fence);
 
     /**
      * Return TRUE if a buffer is referenced by a command stream.
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index b5a4034cc12..8c900a4ecb6 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -118,8 +118,7 @@ void si_context_gfx_flush(void *context, unsigned flags,
 	}
 
 	/* Flush the CS. */
-	ws->cs_flush(cs, flags, &ctx->last_gfx_fence,
-		     ctx->screen->b.cs_count++);
+	ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
 
 	if (fence)
 		ws->fence_reference(fence, ctx->last_gfx_fence);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 8b50a49cba0..042cfc764fd 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -140,9 +140,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		sctx->b.b.create_video_buffer = vl_video_buffer_create;
 	}
 
-	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX, si_context_gfx_flush,
-				       sctx, sscreen->b.trace_bo ?
-					       sscreen->b.trace_bo->buf : NULL);
+	sctx->b.gfx.cs = ws->cs_create(sctx->b.ctx, RING_GFX,
+				       si_context_gfx_flush, sctx);
 	sctx->b.gfx.flush = si_context_gfx_flush;
 
 	/* Border colors. */
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
index 83da740f649..a9fc55f4a5a 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.c
@@ -335,8 +335,7 @@ amdgpu_cs_create(struct radeon_winsys_ctx *rwctx,
                  enum ring_type ring_type,
                  void (*flush)(void *ctx, unsigned flags,
                                struct pipe_fence_handle **fence),
-                 void *flush_ctx,
-                 struct pb_buffer *trace_buf)
+                 void *flush_ctx)
 {
    struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
    struct amdgpu_cs *cs;
@@ -609,8 +608,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(all_bos, "RADEON_ALL_BOS", FALSE)
 
 static void amdgpu_cs_flush(struct radeon_winsys_cs *rcs,
                             unsigned flags,
-                            struct pipe_fence_handle **fence,
-                            uint32_t cs_trace_id)
+                            struct pipe_fence_handle **fence)
 {
    struct amdgpu_cs *cs = amdgpu_cs(rcs);
    struct amdgpu_winsys *ws = cs->ctx->ws;
diff --git a/src/gallium/winsys/radeon/drm/Makefile.am b/src/gallium/winsys/radeon/drm/Makefile.am
index 0320aca01f9..b413b0b93a0 100644
--- a/src/gallium/winsys/radeon/drm/Makefile.am
+++ b/src/gallium/winsys/radeon/drm/Makefile.am
@@ -8,5 +8,3 @@ AM_CFLAGS = \
 noinst_LTLIBRARIES = libradeonwinsys.la
 
 libradeonwinsys_la_SOURCES = $(C_SOURCES)
-
-EXTRA_DIST = $(TOOLS_HDR)
diff --git a/src/gallium/winsys/radeon/drm/Makefile.sources b/src/gallium/winsys/radeon/drm/Makefile.sources
index a00c84d35b3..2762c91e216 100644
--- a/src/gallium/winsys/radeon/drm/Makefile.sources
+++ b/src/gallium/winsys/radeon/drm/Makefile.sources
@@ -2,12 +2,8 @@ C_SOURCES := \
 	radeon_drm_bo.c \
 	radeon_drm_bo.h \
 	radeon_drm_cs.c \
-	radeon_drm_cs_dump.c \
 	radeon_drm_cs.h \
 	radeon_drm_public.h \
 	radeon_drm_surface.c \
 	radeon_drm_winsys.c \
 	radeon_drm_winsys.h
-
-TOOLS_HDR := \
-	radeon_ctx.h
diff --git a/src/gallium/winsys/radeon/drm/radeon_ctx.h b/src/gallium/winsys/radeon/drm/radeon_ctx.h
deleted file mode 100644
index 5618b3a8d00..00000000000
--- a/src/gallium/winsys/radeon/drm/radeon_ctx.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright 2011 Jerome Glisse <glisse@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *      Jérôme Glisse
- */
-#ifndef RADEON_CTX_H
-#define RADEON_CTX_H
-
-#define _FILE_OFFSET_BITS 64
-#include <sys/mman.h>
-
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include "xf86drm.h"
-#include "radeon_drm.h"
-
-struct ctx {
-    int                         fd;
-};
-
-struct bo {
-    uint32_t                    handle;
-    uint32_t                    alignment;
-    uint64_t                    size;
-    uint64_t                    va;
-    void                        *ptr;
-};
-
-static void ctx_init(struct ctx *ctx)
-{
-    ctx->fd = drmOpen("radeon", NULL);
-    if (ctx->fd < 0) {
-        fprintf(stderr, "failed to open radeon drm device file\n");
-        exit(-1);
-    }
-}
-
-static void bo_wait(struct ctx *ctx, struct bo *bo)
-{
-    struct drm_radeon_gem_wait_idle args;
-    void *ptr;
-    int r;
-
-    /* Zero out args to make valgrind happy */
-    memset(&args, 0, sizeof(args));
-    args.handle = bo->handle;
-    do {
-        r = drmCommandWrite(ctx->fd, DRM_RADEON_GEM_WAIT_IDLE, &args, sizeof(args));
-    } while (r == -EBUSY);
-}
-
-
-static void ctx_cs(struct ctx *ctx, uint32_t *cs, uint32_t cs_flags[2], unsigned ndw,
-                   struct bo **bo, uint32_t *bo_relocs, unsigned nbo)
-{
-    struct drm_radeon_cs args;
-    struct drm_radeon_cs_chunk chunks[3];
-    uint64_t chunk_array[3];
-    unsigned i;
-    int r;
-
-    /* update handle */
-    for (i = 0; i < nbo; i++) {
-        bo_relocs[i*4+0] = bo[i]->handle;
-    }
-
-    args.num_chunks = 2;
-    if (cs_flags[0] || cs_flags[1]) {
-        /* enable RADEON_CHUNK_ID_FLAGS */
-        args.num_chunks = 3;
-    }
-    args.chunks = (uint64_t)(uintptr_t)chunk_array;
-    chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
-    chunks[0].length_dw = ndw;
-    chunks[0].chunk_data = (uintptr_t)cs;
-    chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
-    chunks[1].length_dw = nbo * 4;
-    chunks[1].chunk_data = (uintptr_t)bo_relocs;
-    chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
-    chunks[2].length_dw = 2;
-    chunks[2].chunk_data = (uintptr_t)cs_flags;
-    chunk_array[0] = (uintptr_t)&chunks[0];
-    chunk_array[1] = (uintptr_t)&chunks[1];
-    chunk_array[2] = (uintptr_t)&chunks[2];
-
-    fprintf(stderr, "emiting cs %ddw with %d bo\n", ndw, nbo);
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_CS, &args, sizeof(args));
-    if (r) {
-        fprintf(stderr, "cs submission failed with %d\n", r);
-        return;
-    }
-}
-
-static void bo_map(struct ctx *ctx, struct bo *bo)
-{
-    struct drm_radeon_gem_mmap args;
-    void *ptr;
-    int r;
-
-    /* Zero out args to make valgrind happy */
-    memset(&args, 0, sizeof(args));
-    args.handle = bo->handle;
-    args.offset = 0;
-    args.size = (uint64_t)bo->size;
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_MMAP, &args, sizeof(args));
-    if (r) {
-        fprintf(stderr, "error mapping %p 0x%08X (error = %d)\n", bo, bo->handle, r);
-        exit(-1);
-    }
-    ptr = mmap(0, args.size, PROT_READ|PROT_WRITE, MAP_SHARED, ctx->fd, args.addr_ptr);
-    if (ptr == MAP_FAILED) {
-        fprintf(stderr, "%s failed to map bo\n", __func__);
-        exit(-1);
-    }
-    bo->ptr = ptr;
-}
-
-static void bo_va(struct ctx *ctx, struct bo *bo)
-{
-    struct drm_radeon_gem_va args;
-    int r;
-
-    args.handle = bo->handle;
-    args.vm_id = 0;
-    args.operation = RADEON_VA_MAP;
-    args.flags = RADEON_VM_PAGE_READABLE | RADEON_VM_PAGE_WRITEABLE | RADEON_VM_PAGE_SNOOPED;
-    args.offset = bo->va;
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_VA, &args, sizeof(args));
-    if (r && args.operation == RADEON_VA_RESULT_ERROR) {
-        fprintf(stderr, "radeon: Failed to allocate virtual address for buffer:\n");
-        fprintf(stderr, "radeon:    size      : %d bytes\n", bo->size);
-        fprintf(stderr, "radeon:    alignment : %d bytes\n", bo->alignment);
-        fprintf(stderr, "radeon:    va        : 0x%016llx\n", (unsigned long long)bo->va);
-        exit(-1);
-    }
-}
-
-static struct bo *bo_new(struct ctx *ctx, unsigned ndw, uint32_t *data, uint64_t va, uint32_t alignment)
-{
-    struct drm_radeon_gem_create args;
-    struct bo *bo;
-    int r;
-
-    bo = calloc(1, sizeof(*bo));
-    if (bo == NULL) {
-        fprintf(stderr, "failed to malloc bo struct\n");
-        exit(-1);
-    }
-    bo->size = ndw * 4ULL;
-    bo->va = va;
-    bo->alignment = alignment;
-
-    args.size = bo->size;
-    args.alignment = bo->alignment;
-    args.initial_domain = RADEON_GEM_DOMAIN_GTT;
-    args.flags = 0;
-    args.handle = 0;
-
-    r = drmCommandWriteRead(ctx->fd, DRM_RADEON_GEM_CREATE, &args, sizeof(args));
-    bo->handle = args.handle;
-    if (r) {
-        fprintf(stderr, "Failed to allocate :\n");
-        fprintf(stderr, "   size      : %d bytes\n", bo->size);
-        fprintf(stderr, "   alignment : %d bytes\n", bo->alignment);
-        free(bo);
-        exit(-1);
-    }
-
-    if (data) {
-        bo_map(ctx, bo);
-        memcpy(bo->ptr, data, bo->size);
-    }
-
-    if (va) {
-        bo_va(ctx, bo);
-    }
-
-    return bo;
-}
-
-
-#endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index 155a13008a4..b50e19c0381 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -168,8 +168,7 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
                      enum ring_type ring_type,
                      void (*flush)(void *ctx, unsigned flags,
                                    struct pipe_fence_handle **fence),
-                     void *flush_ctx,
-                     struct pb_buffer *trace_buf)
+                     void *flush_ctx)
 {
     struct radeon_drm_winsys *ws = (struct radeon_drm_winsys*)ctx;
     struct radeon_drm_cs *cs;
@@ -183,7 +182,6 @@ radeon_drm_cs_create(struct radeon_winsys_ctx *ctx,
     cs->ws = ws;
     cs->flush_cs = flush;
     cs->flush_data = flush_ctx;
-    cs->trace_buf = (struct radeon_bo*)trace_buf;
 
     if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
         FREE(cs);
@@ -439,10 +437,6 @@ void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs
         }
     }
 
-    if (cs->trace_buf) {
-        radeon_dump_cs_on_lockup(cs, csc);
-    }
-
     for (i = 0; i < csc->crelocs; i++)
         p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
 
@@ -467,8 +461,7 @@ DEBUG_GET_ONCE_BOOL_OPTION(noop, "RADEON_NOOP", FALSE)
 
 static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
                                 unsigned flags,
-                                struct pipe_fence_handle **fence,
-                                uint32_t cs_trace_id)
+                                struct pipe_fence_handle **fence)
 {
     struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
     struct radeon_cs_context *tmp;
@@ -520,8 +513,6 @@ static void radeon_drm_cs_flush(struct radeon_winsys_cs *rcs,
     cs->csc = cs->cst;
     cs->cst = tmp;
 
-    cs->cst->cs_trace_id = cs_trace_id;
-
     /* If the CS is not empty or overflowed, emit it in a separate thread. */
     if (cs->base.cdw && cs->base.cdw <= cs->base.max_dw && !debug_get_option_noop()) {
         unsigned i, crelocs;
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
index 81f66f56d99..4ffa91ae8b2 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.h
@@ -43,8 +43,6 @@ struct radeon_cs_context {
     uint64_t                    chunk_array[3];
     uint32_t                    flags[2];
 
-    uint32_t                    cs_trace_id;
-
     /* Buffers. */
     unsigned                    nrelocs;
     unsigned                    crelocs;
@@ -80,7 +78,6 @@ struct radeon_drm_cs {
     void *flush_data;
 
     pipe_semaphore flush_completed;
-    struct radeon_bo                    *trace_buf;
 };
 
 int radeon_lookup_buffer(struct radeon_cs_context *csc, struct radeon_bo *bo);
@@ -126,6 +123,4 @@ void radeon_drm_cs_sync_flush(struct radeon_winsys_cs *rcs);
 void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws);
 void radeon_drm_cs_emit_ioctl_oneshot(struct radeon_drm_cs *cs, struct radeon_cs_context *csc);
 
-void radeon_dump_cs_on_lockup(struct radeon_drm_cs *cs, struct radeon_cs_context *csc);
-
 #endif
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c
deleted file mode 100644
index 99585956a49..00000000000
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs_dump.c
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright © 2013 Jérôme Glisse
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
- * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- */
-/*
- * Authors:
- *      Jérôme Glisse <jglisse@redhat.com>
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <xf86drm.h>
-#include "radeon_drm_cs.h"
-#include "radeon_drm_bo.h"
-
-#define RADEON_CS_DUMP_AFTER_MS_TIMEOUT         500
-
-void radeon_dump_cs_on_lockup(struct radeon_drm_cs *cs, struct radeon_cs_context *csc)
-{
-    struct drm_radeon_gem_busy args;
-    FILE *dump;
-    unsigned i, lockup;
-    uint32_t *ptr;
-    char fname[32];
-
-    /* only dump the first cs to cause a lockup */
-    if (!csc->crelocs) {
-        /* can not determine if there was a lockup if no bo were use by
-         * the cs and most likely in such case no lockup occurs
-         */
-        return;
-    }
-
-    memset(&args, 0, sizeof(args));
-    args.handle = csc->relocs_bo[0].bo->handle;
-    for (i = 0; i < RADEON_CS_DUMP_AFTER_MS_TIMEOUT; i++) {
-        usleep(1);
-        lockup = drmCommandWriteRead(csc->fd, DRM_RADEON_GEM_BUSY, &args, sizeof(args));
-        if (!lockup) {
-            break;
-        }
-    }
-    if (!lockup || i < RADEON_CS_DUMP_AFTER_MS_TIMEOUT) {
-        return;
-    }
-
-    ptr = radeon_bo_do_map(cs->trace_buf);
-    fprintf(stderr, "timeout on cs lockup likely happen at cs 0x%08x dw 0x%08x\n", ptr[1], ptr[0]);
-
-    if (csc->cs_trace_id != ptr[1]) {
-        return;
-    }
-
-    /* ok we are most likely facing a lockup write the standalone replay file */
-    snprintf(fname, sizeof(fname), "rlockup_0x%08x.c", csc->cs_trace_id);
-    dump = fopen(fname, "w");
-    if (dump == NULL) {
-        return;
-    }
-    fprintf(dump, "/* To build this file you will need to copy radeon_ctx.h\n");
-    fprintf(dump, " * in same directory. You can find radeon_ctx.h in mesa tree :\n");
-    fprintf(dump, " * mesa/src/gallium/winsys/radeon/drm/radeon_ctx.h\n");
-    fprintf(dump, " * Build with :\n");
-    fprintf(dump, " * gcc -O0 -g `pkg-config --cflags --libs libdrm` %s -o rlockup_0x%08x \n", fname, csc->cs_trace_id);
-    fprintf(dump, " */\n");
-    fprintf(dump, " /* timeout on cs lockup likely happen at cs 0x%08x dw 0x%08x*/\n", ptr[1], ptr[0]);
-    fprintf(dump, "#include <stdio.h>\n");
-    fprintf(dump, "#include <stdint.h>\n");
-    fprintf(dump, "#include \"radeon_ctx.h\"\n");
-    fprintf(dump, "\n");
-    fprintf(dump, "#define ARRAY_SIZE(x)  (sizeof(x)/sizeof(x[0]))\n");
-    fprintf(dump, "\n");
-
-    for (i = 0; i < csc->crelocs; i++) {
-        unsigned j, ndw = (csc->relocs_bo[i].bo->base.size + 3) >> 2;
-
-        ptr = radeon_bo_do_map(csc->relocs_bo[i].bo);
-        if (ptr) {
-            fprintf(dump, "static uint32_t bo_%04d_data[%d] = {\n   ", i, ndw);
-            for (j = 0; j < ndw; j++) {
-                if (j && !(j % 8)) {
-                    uint32_t offset = (j - 8) << 2;
-                    fprintf(dump, "  /* [0x%08x] va[0x%016"PRIx64"] */\n   ", offset, offset + csc->relocs_bo[i].bo->va);
-                }
-                fprintf(dump, " 0x%08x,", ptr[j]);
-            }
-            fprintf(dump, "};\n\n");
-        }
-    }
-
-    fprintf(dump, "static uint32_t bo_relocs[%d] = {\n", csc->crelocs * 4);
-    for (i = 0; i < csc->crelocs; i++) {
-        fprintf(dump, "    0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
-                0, csc->relocs[i].read_domains, csc->relocs[i].write_domain, csc->relocs[i].flags);
-    }
-    fprintf(dump, "};\n\n");
-
-    fprintf(dump, "/* cs %d dw */\n", csc->chunks[0].length_dw);
-    fprintf(dump, "static uint32_t cs[] = {\n");
-    ptr = csc->buf;
-    for (i = 0; i < csc->chunks[0].length_dw; i++) {
-        fprintf(dump, "    0x%08x,\n", ptr[i]);
-    }
-    fprintf(dump, "};\n\n");
-
-    fprintf(dump, "static uint32_t cs_flags[2] = {\n");
-    fprintf(dump, "    0x%08x,\n", csc->flags[0]);
-    fprintf(dump, "    0x%08x,\n", csc->flags[1]);
-    fprintf(dump, "};\n\n");
-
-    fprintf(dump, "int main(int argc, char *argv[])\n");
-    fprintf(dump, "{\n");
-    fprintf(dump, "    struct bo *bo[%d];\n", csc->crelocs);
-    fprintf(dump, "    struct ctx ctx;\n");
-    fprintf(dump, "\n");
-    fprintf(dump, "    ctx_init(&ctx);\n");
-    fprintf(dump, "\n");
-
-    for (i = 0; i < csc->crelocs; i++) {
-        unsigned ndw = (csc->relocs_bo[i].bo->base.size + 3) >> 2;
-        uint32_t *ptr;
-
-        ptr = radeon_bo_do_map(csc->relocs_bo[i].bo);
-        if (ptr) {
-            fprintf(dump, "    bo[%d] = bo_new(&ctx, %d, bo_%04d_data, 0x%016"PRIx64", 0x%08x);\n",
-                    i, ndw, i, csc->relocs_bo[i].bo->va, csc->relocs_bo[i].bo->base.alignment);
-        } else {
-            fprintf(dump, "    bo[%d] = bo_new(&ctx, %d, NULL, 0x%016"PRIx64", 0x%08x);\n",
-                    i, ndw, csc->relocs_bo[i].bo->va, csc->relocs_bo[i].bo->base.alignment);
-        }
-    }
-    fprintf(dump, "\n");
-    fprintf(dump, "    ctx_cs(&ctx, cs, cs_flags, ARRAY_SIZE(cs), bo, bo_relocs, %d);\n", csc->crelocs);
-    fprintf(dump, "\n");
-    fprintf(dump, "    fprintf(stderr, \"waiting for cs execution to end ....\\n\");\n");
-    fprintf(dump, "    bo_wait(&ctx, bo[0]);\n");
-    fprintf(dump, "}\n");
-    fclose(dump);
-}

From 20a09897a6c757a93cfb385ede7a7eb5e79cc18f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 11 Mar 2016 15:49:21 +0100
Subject: [PATCH 068/197] r600g: remove TGSI->LLVM translation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It was useful for testing and as a prototype for radeonsi bringup,
but it's no longer used and doesn't even support OpenGL 3.3.

v2: try to fix OpenCL build

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Tested-by: Jan Vesely <jan.vesely@rutgers.edu>
---
 configure.ac                                  |  15 +-
 src/gallium/drivers/r600/Makefile.am          |   8 -
 src/gallium/drivers/r600/Makefile.sources     |   4 -
 src/gallium/drivers/r600/evergreen_compute.c  |  65 +-
 .../drivers/r600/evergreen_compute_internal.h |   4 +
 src/gallium/drivers/r600/r600_llvm.c          | 943 ------------------
 src/gallium/drivers/r600/r600_llvm.h          |  42 -
 src/gallium/drivers/r600/r600_pipe.c          |   5 -
 src/gallium/drivers/r600/r600_pipe.h          |   3 -
 src/gallium/drivers/r600/r600_shader.c        | 122 +--
 10 files changed, 90 insertions(+), 1121 deletions(-)
 delete mode 100644 src/gallium/drivers/r600/r600_llvm.c
 delete mode 100644 src/gallium/drivers/r600/r600_llvm.h

diff --git a/configure.ac b/configure.ac
index 1ece6fa4e16..ffd51db31b7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -931,12 +931,6 @@ AC_ARG_ENABLE([xlib-glx],
     [enable_xlib_glx="$enableval"],
     [enable_xlib_glx=no])
 
-AC_ARG_ENABLE([r600-llvm-compiler],
-    [AS_HELP_STRING([--enable-r600-llvm-compiler],
-        [Enable experimental LLVM backend for graphics shaders @<:@default=disabled@:>@])],
-    [enable_r600_llvm="$enableval"],
-    [enable_r600_llvm=no])
-
 AC_ARG_ENABLE([gallium-tests],
     [AS_HELP_STRING([--enable-gallium-tests],
         [Enable optional Gallium tests) @<:@default=disabled@:>@])],
@@ -2238,14 +2232,8 @@ if test -n "$with_gallium_drivers"; then
             PKG_CHECK_MODULES([RADEON], [libdrm_radeon >= $LIBDRM_RADEON_REQUIRED])
             gallium_require_drm "Gallium R600"
             gallium_require_drm_loader
-            if test "x$enable_r600_llvm" = xyes -o "x$enable_opencl" = xyes; then
-                radeon_llvm_check "r600g"
-                LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
-            fi
-            if test "x$enable_r600_llvm" = xyes; then
-                USE_R600_LLVM_COMPILER=yes;
-            fi
             if test "x$enable_opencl" = xyes; then
+                radeon_llvm_check "r600g"
                 LLVM_COMPONENTS="${LLVM_COMPONENTS} bitreader asmparser"
             fi
             ;;
@@ -2416,7 +2404,6 @@ AM_CONDITIONAL(NEED_RADEON_DRM_WINSYS, test "x$HAVE_GALLIUM_R300" = xyes -o \
                                             "x$HAVE_GALLIUM_RADEONSI" = xyes)
 AM_CONDITIONAL(NEED_WINSYS_XLIB, test "x$NEED_WINSYS_XLIB" = xyes)
 AM_CONDITIONAL(NEED_RADEON_LLVM, test x$NEED_RADEON_LLVM = xyes)
-AM_CONDITIONAL(USE_R600_LLVM_COMPILER, test x$USE_R600_LLVM_COMPILER = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_COMPUTE, test x$enable_opencl = xyes)
 AM_CONDITIONAL(HAVE_MESA_LLVM, test x$MESA_LLVM = x1)
 AM_CONDITIONAL(USE_VC4_SIMULATOR, test x$USE_VC4_SIMULATOR = xyes)
diff --git a/src/gallium/drivers/r600/Makefile.am b/src/gallium/drivers/r600/Makefile.am
index 8317da727a2..f3bb03e54be 100644
--- a/src/gallium/drivers/r600/Makefile.am
+++ b/src/gallium/drivers/r600/Makefile.am
@@ -21,14 +21,6 @@ AM_CFLAGS += \
 	$(LLVM_CFLAGS) \
 	-I$(top_srcdir)/src/gallium/drivers/radeon/
 
-libr600_la_SOURCES += \
-	$(LLVM_C_SOURCES)
-
-endif
-
-if USE_R600_LLVM_COMPILER
-AM_CFLAGS += \
-	-DR600_USE_LLVM
 endif
 
 if HAVE_GALLIUM_COMPUTE
diff --git a/src/gallium/drivers/r600/Makefile.sources b/src/gallium/drivers/r600/Makefile.sources
index 024dea3a002..8bf8083bbab 100644
--- a/src/gallium/drivers/r600/Makefile.sources
+++ b/src/gallium/drivers/r600/Makefile.sources
@@ -64,7 +64,3 @@ CXX_SOURCES = \
 	sb/sb_shader.h \
 	sb/sb_ssa_builder.cpp \
 	sb/sb_valtable.cpp
-
-LLVM_C_SOURCES = \
-	r600_llvm.c \
-	r600_llvm.h
diff --git a/src/gallium/drivers/r600/evergreen_compute.c b/src/gallium/drivers/r600/evergreen_compute.c
index 2a1b2519ec7..f4b669000dc 100644
--- a/src/gallium/drivers/r600/evergreen_compute.c
+++ b/src/gallium/drivers/r600/evergreen_compute.c
@@ -192,6 +192,69 @@ static const struct u_resource_vtbl r600_global_buffer_vtbl =
 	r600_compute_global_transfer_inline_write /* transfer_inline_write */
 };
 
+/* We need to define these R600 registers here, because we can't include
+ * both evergreend.h and r600d.h.
+ */
+#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
+#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
+
+#ifdef HAVE_OPENCL
+
+static void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
+					   struct r600_bytecode *bc,
+					   uint64_t symbol_offset,
+					   boolean *use_kill)
+{
+       unsigned i;
+       const unsigned char *config =
+               radeon_shader_binary_config_start(binary, symbol_offset);
+
+       for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
+               unsigned reg =
+                       util_le32_to_cpu(*(uint32_t*)(config + i));
+               unsigned value =
+                       util_le32_to_cpu(*(uint32_t*)(config + i + 4));
+               switch (reg) {
+               /* R600 / R700 */
+               case R_028850_SQ_PGM_RESOURCES_PS:
+               case R_028868_SQ_PGM_RESOURCES_VS:
+               /* Evergreen / Northern Islands */
+               case R_028844_SQ_PGM_RESOURCES_PS:
+               case R_028860_SQ_PGM_RESOURCES_VS:
+               case R_0288D4_SQ_PGM_RESOURCES_LS:
+                       bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
+                       bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
+                       break;
+               case R_02880C_DB_SHADER_CONTROL:
+                       *use_kill = G_02880C_KILL_ENABLE(value);
+                       break;
+               case R_0288E8_SQ_LDS_ALLOC:
+                       bc->nlds_dw = value;
+                       break;
+               }
+       }
+}
+
+static unsigned r600_create_shader(struct r600_bytecode *bc,
+				   const struct radeon_shader_binary *binary,
+				   boolean *use_kill)
+
+{
+	assert(binary->code_size % 4 == 0);
+	bc->bytecode = CALLOC(1, binary->code_size);
+	memcpy(bc->bytecode, binary->code, binary->code_size);
+	bc->ndw = binary->code_size / 4;
+
+	r600_shader_binary_read_config(binary, bc, 0, use_kill);
+	return 0;
+}
+
+#endif
+
+static void r600_destroy_shader(struct r600_bytecode *bc)
+{
+	FREE(bc->bytecode);
+}
 
 void *evergreen_create_compute_state(
 	struct pipe_context *ctx_,
@@ -236,13 +299,11 @@ void evergreen_delete_compute_state(struct pipe_context *ctx_, void* state)
 	if (!shader)
 		return;
 
-#ifdef HAVE_OPENCL
 	radeon_shader_binary_clean(&shader->binary);
 	r600_destroy_shader(&shader->bc);
 
 	/* TODO destroy shader->code_bo, shader->const_bo
 	 * we'll need something like r600_buffer_free */
-#endif
 	FREE(shader);
 }
 
diff --git a/src/gallium/drivers/r600/evergreen_compute_internal.h b/src/gallium/drivers/r600/evergreen_compute_internal.h
index c8998d00f5a..e6ff7609aea 100644
--- a/src/gallium/drivers/r600/evergreen_compute_internal.h
+++ b/src/gallium/drivers/r600/evergreen_compute_internal.h
@@ -26,6 +26,10 @@
 #define EVERGREEN_COMPUTE_INTERNAL_H
 
 #include "r600_asm.h"
+#ifdef HAVE_OPENCL
+#include "radeon/radeon_llvm.h"
+#include <llvm-c/Core.h>
+#endif
 
 struct r600_pipe_compute {
 	struct r600_context *ctx;
diff --git a/src/gallium/drivers/r600/r600_llvm.c b/src/gallium/drivers/r600/r600_llvm.c
deleted file mode 100644
index 7eab29c6eb4..00000000000
--- a/src/gallium/drivers/r600/r600_llvm.c
+++ /dev/null
@@ -1,943 +0,0 @@
-#include "r600_llvm.h"
-
-#include "gallivm/lp_bld_const.h"
-#include "gallivm/lp_bld_intr.h"
-#include "gallivm/lp_bld_gather.h"
-#include "tgsi/tgsi_parse.h"
-#include "util/list.h"
-#include "util/u_memory.h"
-
-#include "evergreend.h"
-#include "r600_asm.h"
-#include "r600_sq.h"
-#include "r600_opcodes.h"
-#include "r600_shader.h"
-#include "r600_pipe.h"
-#include "radeon_llvm.h"
-#include "radeon_llvm_emit.h"
-#include "radeon_elf_util.h"
-
-#include <stdio.h>
-
-#if defined R600_USE_LLVM || defined HAVE_OPENCL
-
-#define CONSTANT_BUFFER_0_ADDR_SPACE 8
-#define CONSTANT_BUFFER_1_ADDR_SPACE (CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER)
-#define LLVM_R600_BUFFER_INFO_CONST_BUFFER \
-	(CONSTANT_BUFFER_0_ADDR_SPACE + R600_BUFFER_INFO_CONST_BUFFER)
-
-static LLVMValueRef llvm_load_const_buffer(
-	struct lp_build_tgsi_context * bld_base,
-	LLVMValueRef OffsetValue,
-	unsigned ConstantAddressSpace)
-{
-	LLVMValueRef offset[2] = {
-		LLVMConstInt(LLVMInt64TypeInContext(bld_base->base.gallivm->context), 0, false),
-		OffsetValue
-	};
-
-	LLVMTypeRef const_ptr_type = LLVMPointerType(LLVMArrayType(LLVMVectorType(bld_base->base.elem_type, 4), 1024),
-							ConstantAddressSpace);
-	LLVMValueRef const_ptr = LLVMBuildIntToPtr(bld_base->base.gallivm->builder, lp_build_const_int32(bld_base->base.gallivm, 0), const_ptr_type, "");
-	LLVMValueRef ptr = LLVMBuildGEP(bld_base->base.gallivm->builder, const_ptr, offset, 2, "");
-	return LLVMBuildLoad(bld_base->base.gallivm->builder, ptr, "");
-}
-
-static LLVMValueRef llvm_fetch_const(
-	struct lp_build_tgsi_context * bld_base,
-	const struct tgsi_full_src_register *reg,
-	enum tgsi_opcode_type type,
-	unsigned swizzle)
-{
-	LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, reg->Register.Index);
-	if (reg->Register.Indirect) {
-		struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-		LLVMValueRef index = LLVMBuildLoad(bld_base->base.gallivm->builder, bld->addr[reg->Indirect.Index][reg->Indirect.Swizzle], "");
-		offset = LLVMBuildAdd(bld_base->base.gallivm->builder, offset, index, "");
-	}
-	unsigned ConstantAddressSpace = CONSTANT_BUFFER_0_ADDR_SPACE ;
-	if (reg->Register.Dimension) {
-		ConstantAddressSpace += reg->Dimension.Index;
-	}
-	LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset, ConstantAddressSpace);
-	LLVMValueRef cval = LLVMBuildExtractElement(bld_base->base.gallivm->builder, cvecval, lp_build_const_int32(bld_base->base.gallivm, swizzle), "");
-	return bitcast(bld_base, type, cval);
-}
-
-static void llvm_load_system_value(
-		struct radeon_llvm_context * ctx,
-		unsigned index,
-		const struct tgsi_full_declaration *decl)
-{
-	unsigned chan;
-
-	switch (decl->Semantic.Name) {
-	case TGSI_SEMANTIC_INSTANCEID: chan = 3; break;
-	case TGSI_SEMANTIC_VERTEXID: chan = 0; break;
-	default: assert(!"unknown system value");
-	}
-
-	ctx->system_values[index] = LLVMBuildExtractElement(ctx->gallivm.builder,
-		LLVMGetParam(ctx->main_fn, 0), lp_build_const_int32(&(ctx->gallivm), chan),
-		"");
-}
-
-static LLVMValueRef
-llvm_load_input_vector(
-	struct radeon_llvm_context * ctx, unsigned location, unsigned ijregs,
-	boolean interp)
-{
-		LLVMTypeRef VecType;
-		LLVMValueRef Args[3] = {
-			lp_build_const_int32(&(ctx->gallivm), location)
-		};
-		unsigned ArgCount = 1;
-		if (interp) {
-			VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 2);
-			LLVMValueRef IJIndex = LLVMGetParam(ctx->main_fn, ijregs / 2);
-			Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex,
-				lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2)), "");
-			Args[ArgCount++] = LLVMBuildExtractElement(ctx->gallivm.builder, IJIndex,
-				lp_build_const_int32(&(ctx->gallivm), 2 * (ijregs % 2) + 1), "");
-			LLVMValueRef HalfVec[2] = {
-				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.xy",
-					VecType, Args, ArgCount, LLVMReadNoneAttribute),
-				lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.zw",
-					VecType, Args, ArgCount, LLVMReadNoneAttribute)
-			};
-			LLVMValueRef MaskInputs[4] = {
-				lp_build_const_int32(&(ctx->gallivm), 0),
-				lp_build_const_int32(&(ctx->gallivm), 1),
-				lp_build_const_int32(&(ctx->gallivm), 2),
-				lp_build_const_int32(&(ctx->gallivm), 3)
-			};
-			LLVMValueRef Mask = LLVMConstVector(MaskInputs, 4);
-			return LLVMBuildShuffleVector(ctx->gallivm.builder, HalfVec[0], HalfVec[1],
-				Mask, "");
-		} else {
-			VecType = LLVMVectorType(ctx->soa.bld_base.base.elem_type, 4);
-			return lp_build_intrinsic(ctx->gallivm.builder, "llvm.R600.interp.const",
-				VecType, Args, ArgCount, LLVMReadNoneAttribute);
-		}
-}
-
-static LLVMValueRef
-llvm_face_select_helper(
-	struct radeon_llvm_context * ctx,
-	LLVMValueRef face, LLVMValueRef front_color, LLVMValueRef back_color)
-{
-	const struct lp_build_context * bb = &ctx->soa.bld_base.base;
-	LLVMValueRef is_front = LLVMBuildFCmp(
-		bb->gallivm->builder, LLVMRealUGT, face,
-		lp_build_const_float(bb->gallivm, 0.0f),	"");
-	return LLVMBuildSelect(bb->gallivm->builder, is_front,
-		front_color, back_color, "");
-}
-
-static void llvm_load_input(
-	struct radeon_llvm_context * ctx,
-	unsigned input_index,
-	const struct tgsi_full_declaration *decl)
-{
-	const struct r600_shader_io * input = &ctx->r600_inputs[input_index];
-	unsigned chan;
-	int two_side = (ctx->two_side && input->name == TGSI_SEMANTIC_COLOR);
-	LLVMValueRef v;
-	boolean require_interp_intrinsic = ctx->chip_class >= EVERGREEN &&
-		ctx->type == TGSI_PROCESSOR_FRAGMENT;
-
-	if (require_interp_intrinsic && input->spi_sid) {
-		v = llvm_load_input_vector(ctx, input->lds_pos, input->ij_index,
-			(input->interpolate > 0));
-	} else
-		v = LLVMGetParam(ctx->main_fn, input->gpr);
-
-	if (two_side) {
-		struct r600_shader_io * back_input =
-			&ctx->r600_inputs[input->back_color_input];
-		LLVMValueRef v2;
-		LLVMValueRef face = LLVMGetParam(ctx->main_fn, ctx->face_gpr);
-		face = LLVMBuildExtractElement(ctx->gallivm.builder, face,
-			lp_build_const_int32(&(ctx->gallivm), 0), "");
-
-		if (require_interp_intrinsic && back_input->spi_sid)
-			v2 = llvm_load_input_vector(ctx, back_input->lds_pos,
-				back_input->ij_index, (back_input->interpolate > 0));
-		else
-			v2 = LLVMGetParam(ctx->main_fn, back_input->gpr);
-		v = llvm_face_select_helper(ctx, face, v, v2);
-	}
-
-	for (chan = 0; chan < 4; chan++) {
-		unsigned soa_index = radeon_llvm_reg_index_soa(input_index, chan);
-
-		ctx->inputs[soa_index] = LLVMBuildExtractElement(ctx->gallivm.builder, v,
-			lp_build_const_int32(&(ctx->gallivm), chan), "");
-
-		if (input->name == TGSI_SEMANTIC_POSITION &&
-				ctx->type == TGSI_PROCESSOR_FRAGMENT && chan == 3) {
-		/* RCP for fragcoord.w */
-		ctx->inputs[soa_index] = LLVMBuildFDiv(ctx->gallivm.builder,
-				lp_build_const_float(&(ctx->gallivm), 1.0f),
-				ctx->inputs[soa_index], "");
-	}
-	}
-}
-
-static void llvm_emit_prologue(struct lp_build_tgsi_context * bld_base)
-{
-	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-	radeon_llvm_shader_type(ctx->main_fn, ctx->type);
-
-}
-
-static void llvm_emit_epilogue(struct lp_build_tgsi_context * bld_base)
-{
-	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-	struct lp_build_context * base = &bld_base->base;
-	struct pipe_stream_output_info * so = ctx->stream_outputs;
-	unsigned i;
-	unsigned next_pos = 60;
-	unsigned next_param = 0;
-
-	unsigned color_count = 0;
-	boolean has_color = false;
-
-	if (ctx->type == TGSI_PROCESSOR_VERTEX && so->num_outputs) {
-		for (i = 0; i < so->num_outputs; i++) {
-			unsigned register_index = so->output[i].register_index;
-			unsigned start_component = so->output[i].start_component;
-			unsigned num_components = so->output[i].num_components;
-			unsigned dst_offset = so->output[i].dst_offset;
-			unsigned chan;
-			LLVMValueRef elements[4];
-			if (dst_offset < start_component) {
-				for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-					elements[chan] = LLVMBuildLoad(base->gallivm->builder,
-						ctx->soa.outputs[register_index][(chan + start_component) % TGSI_NUM_CHANNELS], "");
-				}
-				start_component = 0;
-			} else {
-				for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-					elements[chan] = LLVMBuildLoad(base->gallivm->builder,
-						ctx->soa.outputs[register_index][chan], "");
-				}
-			}
-			LLVMValueRef output = lp_build_gather_values(base->gallivm, elements, 4);
-			LLVMValueRef args[4];
-			args[0] = output;
-			args[1] = lp_build_const_int32(base->gallivm, dst_offset - start_component);
-			args[2] = lp_build_const_int32(base->gallivm, so->output[i].output_buffer);
-			args[3] = lp_build_const_int32(base->gallivm, ((1 << num_components) - 1) << start_component);
-			lp_build_intrinsic(base->gallivm->builder, "llvm.R600.store.stream.output",
-				LLVMVoidTypeInContext(base->gallivm->context), args, 4, 0);
-		}
-	}
-
-	/* Add the necessary export instructions */
-	for (i = 0; i < ctx->output_reg_count; i++) {
-		unsigned chan;
-		LLVMValueRef elements[4];
-		for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-			elements[chan] = LLVMBuildLoad(base->gallivm->builder,
-				ctx->soa.outputs[i][chan], "");
-		}
-		if (ctx->alpha_to_one && ctx->type == TGSI_PROCESSOR_FRAGMENT && ctx->r600_outputs[i].name == TGSI_SEMANTIC_COLOR)
-			elements[3] = lp_build_const_float(base->gallivm, 1.0f);
-		LLVMValueRef output = lp_build_gather_values(base->gallivm, elements, 4);
-
-		if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-			switch (ctx->r600_outputs[i].name) {
-			case TGSI_SEMANTIC_POSITION:
-			case TGSI_SEMANTIC_PSIZE: {
-				LLVMValueRef args[3];
-				args[0] = output;
-				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			case TGSI_SEMANTIC_CLIPVERTEX: {
-				LLVMValueRef args[3];
-				unsigned reg_index;
-				LLVMValueRef adjusted_elements[4];
-				for (reg_index = 0; reg_index < 2; reg_index ++) {
-					for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
-						LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, reg_index * 4 + chan);
-						LLVMValueRef base_vector = llvm_load_const_buffer(bld_base, offset, CONSTANT_BUFFER_1_ADDR_SPACE);
-						args[0] = output;
-						args[1] = base_vector;
-						adjusted_elements[chan] = lp_build_intrinsic(base->gallivm->builder,
-							"llvm.AMDGPU.dp4", bld_base->base.elem_type,
-							args, 2, LLVMReadNoneAttribute);
-					}
-					args[0] = lp_build_gather_values(base->gallivm,
-						adjusted_elements, 4);
-					args[1] = lp_build_const_int32(base->gallivm, next_pos++);
-					args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-					lp_build_intrinsic(
-						base->gallivm->builder,
-						"llvm.R600.store.swizzle",
-						LLVMVoidTypeInContext(base->gallivm->context),
-						args, 3, 0);
-				}
-				break;
-			}
-			case TGSI_SEMANTIC_CLIPDIST : {
-				LLVMValueRef args[3];
-				args[0] = output;
-				args[1] = lp_build_const_int32(base->gallivm, next_pos++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				args[1] = lp_build_const_int32(base->gallivm, next_param++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			case TGSI_SEMANTIC_FOG: {
-				elements[0] = LLVMBuildLoad(base->gallivm->builder,
-					ctx->soa.outputs[i][0], "");
-				elements[1] = elements[2] = lp_build_const_float(base->gallivm, 0.0f);
-				elements[3] = lp_build_const_float(base->gallivm, 1.0f);
-
-				LLVMValueRef args[3];
-				args[0] = lp_build_gather_values(base->gallivm, elements, 4);
-				args[1] = lp_build_const_int32(base->gallivm, next_param++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			default: {
-				LLVMValueRef args[3];
-				args[0] = output;
-				args[1] = lp_build_const_int32(base->gallivm, next_param++);
-				args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM);
-				lp_build_intrinsic(
-					base->gallivm->builder,
-					"llvm.R600.store.swizzle",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					args, 3, 0);
-				break;
-			}
-			}
-		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-			switch (ctx->r600_outputs[i].name) {
-			case TGSI_SEMANTIC_COLOR:
-				has_color = true;
-				if ( color_count < ctx->color_buffer_count) {
-					LLVMValueRef args[3];
-					args[0] = output;
-					if (ctx->fs_color_all) {
-						for (unsigned j = 0; j < ctx->color_buffer_count; j++) {
-							args[1] = lp_build_const_int32(base->gallivm, j);
-							args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-							lp_build_intrinsic(
-								base->gallivm->builder,
-								"llvm.R600.store.swizzle",
-								LLVMVoidTypeInContext(base->gallivm->context),
-								args, 3, 0);
-						}
-					} else {
-						args[1] = lp_build_const_int32(base->gallivm, color_count++);
-						args[2] = lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL);
-						lp_build_intrinsic(
-							base->gallivm->builder,
-							"llvm.R600.store.swizzle",
-							LLVMVoidTypeInContext(base->gallivm->context),
-							args, 3, 0);
-					}
-				}
-				break;
-			case TGSI_SEMANTIC_POSITION:
-				lp_build_intrinsic_unary(
-					base->gallivm->builder,
-					"llvm.R600.store.pixel.depth",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					LLVMBuildLoad(base->gallivm->builder, ctx->soa.outputs[i][2], ""));
-				break;
-			case TGSI_SEMANTIC_STENCIL:
-				lp_build_intrinsic_unary(
-					base->gallivm->builder,
-					"llvm.R600.store.pixel.stencil",
-					LLVMVoidTypeInContext(base->gallivm->context),
-					LLVMBuildLoad(base->gallivm->builder, ctx->soa.outputs[i][1], ""));
-				break;
-			}
-		}
-	}
-	// Add dummy exports
-	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-		if (!next_param) {
-			lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy",
-				LLVMVoidTypeInContext(base->gallivm->context),
-				lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM));
-		}
-		if (!(next_pos-60)) {
-			lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy",
-				LLVMVoidTypeInContext(base->gallivm->context),
-				lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS));
-		}
-	}
-	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-		if (!has_color) {
-			lp_build_intrinsic_unary(base->gallivm->builder, "llvm.R600.store.dummy",
-				LLVMVoidTypeInContext(base->gallivm->context),
-				lp_build_const_int32(base->gallivm, V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL));
-		}
-	}
-
-}
-
-static void llvm_emit_tex(
-	const struct lp_build_tgsi_action * action,
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	struct gallivm_state * gallivm = bld_base->base.gallivm;
-	LLVMValueRef args[7];
-	unsigned c, sampler_src;
-	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-
-	if (emit_data->inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
-		switch (emit_data->inst->Instruction.Opcode) {
-		case TGSI_OPCODE_TXQ: {
-			struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-			ctx->uses_tex_buffers = true;
-			bool isEgPlus = (ctx->chip_class >= EVERGREEN);
-			LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm,
-				isEgPlus ? 0 : 1);
-			LLVMValueRef cvecval = llvm_load_const_buffer(bld_base, offset,
-				LLVM_R600_BUFFER_INFO_CONST_BUFFER);
-			if (!isEgPlus) {
-				LLVMValueRef maskval[4] = {
-					lp_build_const_int32(gallivm, 1),
-					lp_build_const_int32(gallivm, 2),
-					lp_build_const_int32(gallivm, 3),
-					lp_build_const_int32(gallivm, 0),
-				};
-				LLVMValueRef mask = LLVMConstVector(maskval, 4);
-				cvecval = LLVMBuildShuffleVector(gallivm->builder, cvecval, cvecval,
-					mask, "");
-			}
-			emit_data->output[0] = cvecval;
-			return;
-		}
-		case TGSI_OPCODE_TXF: {
-			args[0] = LLVMBuildExtractElement(gallivm->builder, emit_data->args[0], lp_build_const_int32(gallivm, 0), "");
-			args[1] = lp_build_const_int32(gallivm, R600_MAX_CONST_BUFFERS);
-			emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
-							"llvm.R600.load.texbuf",
-							emit_data->dst_type, args, 2, LLVMReadNoneAttribute);
-			if (ctx->chip_class >= EVERGREEN)
-				return;
-			ctx->uses_tex_buffers = true;
-			LLVMDumpValue(emit_data->output[0]);
-			emit_data->output[0] = LLVMBuildBitCast(gallivm->builder,
-				emit_data->output[0], LLVMVectorType(bld_base->base.int_elem_type, 4),
-				"");
-			LLVMValueRef Mask = llvm_load_const_buffer(bld_base,
-				lp_build_const_int32(gallivm, 0),
-				LLVM_R600_BUFFER_INFO_CONST_BUFFER);
-			Mask = LLVMBuildBitCast(gallivm->builder, Mask,
-				LLVMVectorType(bld_base->base.int_elem_type, 4), "");
-			emit_data->output[0] = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_AND,
-				emit_data->output[0],
-				Mask);
-			LLVMValueRef WComponent = LLVMBuildExtractElement(gallivm->builder,
-				emit_data->output[0], lp_build_const_int32(gallivm, 3), "");
-			Mask = llvm_load_const_buffer(bld_base, lp_build_const_int32(gallivm, 1),
-				LLVM_R600_BUFFER_INFO_CONST_BUFFER);
-			Mask = LLVMBuildExtractElement(gallivm->builder, Mask,
-				lp_build_const_int32(gallivm, 0), "");
-			Mask = LLVMBuildBitCast(gallivm->builder, Mask,
-				bld_base->base.int_elem_type, "");
-			WComponent = lp_build_emit_llvm_binary(bld_base, TGSI_OPCODE_OR,
-				WComponent, Mask);
-			emit_data->output[0] = LLVMBuildInsertElement(gallivm->builder,
-				emit_data->output[0], WComponent, lp_build_const_int32(gallivm, 3), "");
-			emit_data->output[0] = LLVMBuildBitCast(gallivm->builder,
-				emit_data->output[0], LLVMVectorType(bld_base->base.elem_type, 4), "");
-		}
-			return;
-		default:
-			break;
-		}
-	}
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TEX ||
-		emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
-		LLVMValueRef Vector[4] = {
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 0), ""),
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 1), ""),
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 2), ""),
-			LLVMBuildExtractElement(gallivm->builder, emit_data->args[0],
-				lp_build_const_int32(gallivm, 3), ""),
-		};
-		switch (emit_data->inst->Texture.Texture) {
-		case TGSI_TEXTURE_2D:
-		case TGSI_TEXTURE_RECT:
-			Vector[2] = Vector[3] = LLVMGetUndef(bld_base->base.elem_type);
-			break;
-		case TGSI_TEXTURE_1D:
-			Vector[1] = Vector[2] = Vector[3] = LLVMGetUndef(bld_base->base.elem_type);
-			break;
-		default:
-			break;
-		}
-		args[0] = lp_build_gather_values(gallivm, Vector, 4);
-	} else {
-		args[0] = emit_data->args[0];
-	}
-
-	assert(emit_data->arg_count + 2 <= Elements(args));
-
-	for (c = 1; c < emit_data->arg_count; ++c)
-		args[c] = emit_data->args[c];
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
-		args[1] = LLVMBuildShl(gallivm->builder, args[1], lp_build_const_int32(gallivm, 1), "");
-		args[2] = LLVMBuildShl(gallivm->builder, args[2], lp_build_const_int32(gallivm, 1), "");
-		args[3] = LLVMBuildShl(gallivm->builder, args[3], lp_build_const_int32(gallivm, 1), "");
-	}
-
-	sampler_src = emit_data->inst->Instruction.NumSrcRegs-1;
-
-	args[c++] = lp_build_const_int32(gallivm,
-					emit_data->inst->Src[sampler_src].Register.Index + R600_MAX_CONST_BUFFERS);
-	args[c++] = lp_build_const_int32(gallivm,
-					emit_data->inst->Src[sampler_src].Register.Index);
-	args[c++] = lp_build_const_int32(gallivm,
-					emit_data->inst->Texture.Texture);
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
-		(emit_data->inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
-		emit_data->inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
-
-		switch (emit_data->inst->Texture.Texture) {
-		case TGSI_TEXTURE_2D_MSAA:
-			args[6] = lp_build_const_int32(gallivm, TGSI_TEXTURE_2D);
-			break;
-		case TGSI_TEXTURE_2D_ARRAY_MSAA:
-			args[6] = lp_build_const_int32(gallivm, TGSI_TEXTURE_2D_ARRAY);
-			break;
-		default:
-			break;
-		}
-
-		if (ctx->has_compressed_msaa_texturing) {
-			LLVMValueRef ldptr_args[10] = {
-				args[0], // Coord
-				args[1], // Offset X
-				args[2], // Offset Y
-				args[3], // Offset Z
-				args[4],
-				args[5],
-				lp_build_const_int32(gallivm, 1),
-				lp_build_const_int32(gallivm, 1),
-				lp_build_const_int32(gallivm, 1),
-				lp_build_const_int32(gallivm, 1)
-			};
-			LLVMValueRef ptr = lp_build_intrinsic(gallivm->builder,
-				"llvm.R600.ldptr",
-				emit_data->dst_type, ldptr_args, 10, LLVMReadNoneAttribute);
-			LLVMValueRef Tmp = LLVMBuildExtractElement(gallivm->builder, args[0],
-				lp_build_const_int32(gallivm, 3), "");
-			Tmp = LLVMBuildMul(gallivm->builder, Tmp,
-				lp_build_const_int32(gallivm, 4), "");
-			LLVMValueRef ResX = LLVMBuildExtractElement(gallivm->builder, ptr,
-				lp_build_const_int32(gallivm, 0), "");
-			ResX = LLVMBuildBitCast(gallivm->builder, ResX,
-				bld_base->base.int_elem_type, "");
-			Tmp = LLVMBuildLShr(gallivm->builder, ResX, Tmp, "");
-			Tmp = LLVMBuildAnd(gallivm->builder, Tmp,
-				lp_build_const_int32(gallivm, 0xF), "");
-			args[0] = LLVMBuildInsertElement(gallivm->builder, args[0], Tmp,
-				lp_build_const_int32(gallivm, 3), "");
-			args[c++] = lp_build_const_int32(gallivm,
-				emit_data->inst->Texture.Texture);
-		}
-	}
-
-	emit_data->output[0] = lp_build_intrinsic(gallivm->builder,
-					action->intr_name,
-					emit_data->dst_type, args, c, LLVMReadNoneAttribute);
-
-	if (emit_data->inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
-		((emit_data->inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-		emit_data->inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
-		if (emit_data->inst->Dst[0].Register.WriteMask & 4) {
-			LLVMValueRef offset = lp_build_const_int32(bld_base->base.gallivm, 0);
-			LLVMValueRef ZLayer = LLVMBuildExtractElement(gallivm->builder,
-				llvm_load_const_buffer(bld_base, offset, LLVM_R600_BUFFER_INFO_CONST_BUFFER),
-				lp_build_const_int32(gallivm, 0), "");
-
-			emit_data->output[0] = LLVMBuildInsertElement(gallivm->builder, emit_data->output[0], ZLayer, lp_build_const_int32(gallivm, 2), "");
-			struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
-			ctx->has_txq_cube_array_z_comp = true;
-		}
-}
-
-static void emit_cndlt(
-		const struct lp_build_tgsi_action * action,
-		struct lp_build_tgsi_context * bld_base,
-		struct lp_build_emit_data * emit_data)
-{
-	LLVMBuilderRef builder = bld_base->base.gallivm->builder;
-	LLVMValueRef float_zero = lp_build_const_float(
-		bld_base->base.gallivm, 0.0f);
-	LLVMValueRef cmp = LLVMBuildFCmp(
-		builder, LLVMRealULT, emit_data->args[0], float_zero, "");
-	emit_data->output[emit_data->chan] = LLVMBuildSelect(builder,
-		cmp, emit_data->args[1], emit_data->args[2], "");
-}
-
-static void dp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	struct lp_build_context * base = &bld_base->base;
-	unsigned chan;
-	LLVMValueRef elements[2][4];
-	unsigned opcode = emit_data->inst->Instruction.Opcode;
-	unsigned dp_components = (opcode == TGSI_OPCODE_DP2 ? 2 :
-					(opcode == TGSI_OPCODE_DP3 ? 3 : 4));
-	for (chan = 0 ; chan < dp_components; chan++) {
-		elements[0][chan] = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		elements[1][chan] = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 1, chan);
-	}
-
-	for ( ; chan < 4; chan++) {
-		elements[0][chan] = base->zero;
-		elements[1][chan] = base->zero;
-	}
-
-	 /* Fix up for DPH */
-	if (opcode == TGSI_OPCODE_DPH) {
-		elements[0][TGSI_CHAN_W] = base->one;
-	}
-
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-							elements[0], 4);
-	emit_data->args[1] = lp_build_gather_values(bld_base->base.gallivm,
-							elements[1], 4);
-	emit_data->arg_count = 2;
-
-	emit_data->dst_type = base->elem_type;
-}
-
-static struct lp_build_tgsi_action dot_action = {
-	.fetch_args = dp_fetch_args,
-	.emit = build_tgsi_intrinsic_nomem,
-	.intr_name = "llvm.AMDGPU.dp4"
-};
-
-static void txd_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[4];
-	unsigned chan, src;
-	for (src = 0; src < 3; src++) {
-		for (chan = 0; chan < 4; chan++)
-			coords[chan] = lp_build_emit_fetch(bld_base, inst, src, chan);
-
-		emit_data->args[src] = lp_build_gather_values(bld_base->base.gallivm,
-				coords, 4);
-	}
-	emit_data->arg_count = 3;
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-
-static void txp_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	LLVMValueRef src_w;
-	unsigned chan;
-	LLVMValueRef coords[5];
-
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-	src_w = lp_build_emit_fetch(bld_base, emit_data->inst, 0, TGSI_CHAN_W);
-
-	for (chan = 0; chan < 3; chan++ ) {
-		LLVMValueRef arg = lp_build_emit_fetch(bld_base,
-						emit_data->inst, 0, chan);
-		coords[chan] = lp_build_emit_llvm_binary(bld_base,
-					TGSI_OPCODE_DIV, arg, src_w);
-	}
-	coords[3] = bld_base->base.one;
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
-	}
-
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->arg_count = 1;
-}
-
-static void tex_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-
-	LLVMValueRef coords[5];
-	unsigned chan;
-	for (chan = 0; chan < 4; chan++) {
-		coords[chan] = lp_build_emit_fetch(bld_base, inst, 0, chan);
-	}
-
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
-		inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
-		/* These instructions have additional operand that should be packed
-		 * into the cube coord vector by radeon_llvm_emit_prepare_cube_coords.
-		 * That operand should be passed as a float value in the args array
-		 * right after the coord vector. After packing it's not used anymore,
-		 * that's why arg_count is not increased */
-		coords[4] = lp_build_emit_fetch(bld_base, inst, 1, TGSI_CHAN_X);
-	}
-
-	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
-	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
-	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
-		radeon_llvm_emit_prepare_cube_coords(bld_base, emit_data, coords, NULL);
-	}
-
-	emit_data->arg_count = 1;
-	emit_data->args[0] = lp_build_gather_values(bld_base->base.gallivm,
-						coords, 4);
-	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
-}
-
-static void txf_fetch_args(
-	struct lp_build_tgsi_context * bld_base,
-	struct lp_build_emit_data * emit_data)
-{
-	const struct tgsi_full_instruction * inst = emit_data->inst;
-	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
-	const struct tgsi_texture_offset * off = inst->TexOffsets;
-	LLVMTypeRef offset_type = bld_base->int_bld.elem_type;
-
-	/* fetch tex coords */
-	tex_fetch_args(bld_base, emit_data);
-
-	/* fetch tex offsets */
-	if (inst->Texture.NumOffsets) {
-		assert(inst->Texture.NumOffsets == 1);
-
-		emit_data->args[1] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleX],
-			offset_type);
-		emit_data->args[2] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleY],
-			offset_type);
-		emit_data->args[3] = LLVMConstBitCast(
-			bld->immediates[off->Index][off->SwizzleZ],
-			offset_type);
-	} else {
-		emit_data->args[1] = bld_base->int_bld.zero;
-		emit_data->args[2] = bld_base->int_bld.zero;
-		emit_data->args[3] = bld_base->int_bld.zero;
-	}
-
-	emit_data->arg_count = 4;
-}
-
-LLVMModuleRef r600_tgsi_llvm(
-	struct radeon_llvm_context * ctx,
-	const struct tgsi_token * tokens)
-{
-	struct tgsi_shader_info shader_info;
-	struct lp_build_tgsi_context * bld_base = &ctx->soa.bld_base;
-	radeon_llvm_context_init(ctx, "r600--");
-	LLVMTypeRef Arguments[32];
-	unsigned ArgumentsCount = 0;
-	for (unsigned i = 0; i < ctx->inputs_count; i++)
-		Arguments[ArgumentsCount++] = LLVMVectorType(bld_base->base.elem_type, 4);
-	radeon_llvm_create_func(ctx, NULL, 0, Arguments, ArgumentsCount);
-	for (unsigned i = 0; i < ctx->inputs_count; i++) {
-		LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
-		LLVMAddAttribute(P, LLVMInRegAttribute);
-	}
-	tgsi_scan_shader(tokens, &shader_info);
-
-	bld_base->info = &shader_info;
-	bld_base->userdata = ctx;
-	bld_base->emit_fetch_funcs[TGSI_FILE_CONSTANT] = llvm_fetch_const;
-	bld_base->emit_prologue = llvm_emit_prologue;
-	bld_base->emit_epilogue = llvm_emit_epilogue;
-	ctx->load_input = llvm_load_input;
-	ctx->load_system_value = llvm_load_system_value;
-
-	bld_base->op_actions[TGSI_OPCODE_DP2] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DP3] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DP4] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DPH] = dot_action;
-	bld_base->op_actions[TGSI_OPCODE_DDX].intr_name = "llvm.AMDGPU.ddx";
-	bld_base->op_actions[TGSI_OPCODE_DDX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_DDX].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_DDY].intr_name = "llvm.AMDGPU.ddy";
-	bld_base->op_actions[TGSI_OPCODE_DDY].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_DDY].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TEX].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TEX].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TEX2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TEX2].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TEX2].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXB].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXB].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXB2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXB2].intr_name = "llvm.AMDGPU.txb";
-	bld_base->op_actions[TGSI_OPCODE_TXB2].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXD].fetch_args = txd_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXD].intr_name = "llvm.AMDGPU.txd";
-	bld_base->op_actions[TGSI_OPCODE_TXD].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXF].fetch_args = txf_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXF].intr_name = "llvm.AMDGPU.txf";
-	bld_base->op_actions[TGSI_OPCODE_TXF].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXL].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXL2].intr_name = "llvm.AMDGPU.txl";
-	bld_base->op_actions[TGSI_OPCODE_TXL2].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXP].fetch_args = txp_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXP].intr_name = "llvm.AMDGPU.tex";
-	bld_base->op_actions[TGSI_OPCODE_TXP].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].fetch_args = tex_fetch_args;
-	bld_base->op_actions[TGSI_OPCODE_TXQ].intr_name = "llvm.AMDGPU.txq";
-	bld_base->op_actions[TGSI_OPCODE_TXQ].emit = llvm_emit_tex;
-	bld_base->op_actions[TGSI_OPCODE_CMP].emit = emit_cndlt;
-
-	lp_build_tgsi_llvm(bld_base, tokens);
-
-	LLVMBuildRetVoid(bld_base->base.gallivm->builder);
-	radeon_llvm_finalize_module(ctx);
-
-	return ctx->gallivm.module;
-}
-
-/* We need to define these R600 registers here, because we can't include
- * evergreend.h and r600d.h.
- */
-#define R_028868_SQ_PGM_RESOURCES_VS                 0x028868
-#define R_028850_SQ_PGM_RESOURCES_PS                 0x028850
-
-void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
-					struct r600_bytecode *bc,
-					uint64_t symbol_offset,
-					boolean *use_kill)
-{
-	unsigned i;
-	const unsigned char *config =
-		radeon_shader_binary_config_start(binary, symbol_offset);
-
-	for (i = 0; i < binary->config_size_per_symbol; i+= 8) {
-		unsigned reg =
-			util_le32_to_cpu(*(uint32_t*)(config + i));
-		unsigned value =
-			util_le32_to_cpu(*(uint32_t*)(config + i + 4));
-		switch (reg) {
-		/* R600 / R700 */
-		case R_028850_SQ_PGM_RESOURCES_PS:
-		case R_028868_SQ_PGM_RESOURCES_VS:
-		/* Evergreen / Northern Islands */
-		case R_028844_SQ_PGM_RESOURCES_PS:
-		case R_028860_SQ_PGM_RESOURCES_VS:
-		case R_0288D4_SQ_PGM_RESOURCES_LS:
-			bc->ngpr = MAX2(bc->ngpr, G_028844_NUM_GPRS(value));
-			bc->nstack = MAX2(bc->nstack, G_028844_STACK_SIZE(value));
-			break;
-		case R_02880C_DB_SHADER_CONTROL:
-			*use_kill = G_02880C_KILL_ENABLE(value);
-			break;
-		case R_0288E8_SQ_LDS_ALLOC:
-			bc->nlds_dw = value;
-			break;
-		}
-	}
-
-}
-
-unsigned r600_create_shader(struct r600_bytecode *bc,
-		const struct radeon_shader_binary *binary,
-		boolean *use_kill)
-
-{
-	assert(binary->code_size % 4 == 0);
-	bc->bytecode = CALLOC(1, binary->code_size);
-	memcpy(bc->bytecode, binary->code, binary->code_size);
-	bc->ndw = binary->code_size / 4;
-
-	r600_shader_binary_read_config(binary, bc, 0, use_kill);
-
-	return 0;
-}
-
-void r600_destroy_shader(struct r600_bytecode *bc)
-{
-	FREE(bc->bytecode);
-}
-
-unsigned r600_llvm_compile(
-	LLVMModuleRef mod,
-	enum radeon_family family,
-	struct r600_bytecode *bc,
-	boolean *use_kill,
-	unsigned dump,
-	struct pipe_debug_callback *debug)
-{
-	unsigned r;
-	struct radeon_shader_binary binary;
-	const char * gpu_family = r600_get_llvm_processor_name(family);
-
-	radeon_shader_binary_init(&binary);
-	if (dump)
-		LLVMDumpModule(mod);
-	r = radeon_llvm_compile(mod, &binary, gpu_family, NULL, debug);
-
-	r = r600_create_shader(bc, &binary, use_kill);
-
-	radeon_shader_binary_clean(&binary);
-
-	return r;
-}
-
-#endif
diff --git a/src/gallium/drivers/r600/r600_llvm.h b/src/gallium/drivers/r600/r600_llvm.h
deleted file mode 100644
index 3f7fc4bef7e..00000000000
--- a/src/gallium/drivers/r600/r600_llvm.h
+++ /dev/null
@@ -1,42 +0,0 @@
-
-#ifndef R600_LLVM_H
-#define R600_LLVM_H
-
-#if defined R600_USE_LLVM || defined HAVE_OPENCL
-
-#include "radeon/radeon_llvm.h"
-#include <llvm-c/Core.h>
-
-struct pipe_debug_callback;
-struct r600_bytecode;
-struct r600_shader_ctx;
-struct radeon_llvm_context;
-struct radeon_shader_binary;
-enum radeon_family;
-
-LLVMModuleRef r600_tgsi_llvm(
-	struct radeon_llvm_context * ctx,
-	const struct tgsi_token * tokens);
-
-unsigned r600_llvm_compile(
-	LLVMModuleRef mod,
-	enum radeon_family family,
-	struct r600_bytecode *bc,
-	boolean *use_kill,
-	unsigned dump,
-	struct pipe_debug_callback *debug);
-
-unsigned r600_create_shader(struct r600_bytecode *bc,
-		const struct radeon_shader_binary *binary,
-		boolean *use_kill);
-
-void r600_destroy_shader(struct r600_bytecode *bc);
-
-void r600_shader_binary_read_config(const struct radeon_shader_binary *binary,
-		struct r600_bytecode *bc,
-		uint64_t symbol_offset,
-		boolean *use_kill);
-
-#endif /* defined R600_USE_LLVM || defined HAVE_OPENCL */
-
-#endif /* R600_LLVM_H */
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 88c500a162a..b8011917907 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -43,9 +43,6 @@
 
 static const struct debug_named_value r600_debug_options[] = {
 	/* features */
-#if defined(R600_USE_LLVM)
-	{ "llvm", DBG_LLVM, "Enable the LLVM shader compiler" },
-#endif
 	{ "nocpdma", DBG_NO_CP_DMA, "Disable CP DMA" },
 
 	/* shader backend */
@@ -620,8 +617,6 @@ struct pipe_screen *r600_screen_create(struct radeon_winsys *ws)
 		rscreen->b.debug_flags |= DBG_FS | DBG_VS | DBG_GS | DBG_PS | DBG_CS | DBG_TCS | DBG_TES;
 	if (!debug_get_bool_option("R600_HYPERZ", TRUE))
 		rscreen->b.debug_flags |= DBG_NO_HYPERZ;
-	if (debug_get_bool_option("R600_LLVM", FALSE))
-		rscreen->b.debug_flags |= DBG_LLVM;
 
 	if (rscreen->b.family == CHIP_UNKNOWN) {
 		fprintf(stderr, "r600: Unknown chipset 0x%04X\n", rscreen->b.info.pci_id);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 72aa64233a9..cd0052a519f 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -28,8 +28,6 @@
 
 #include "radeon/r600_pipe_common.h"
 #include "radeon/r600_cs.h"
-
-#include "r600_llvm.h"
 #include "r600_public.h"
 
 #include "util/u_suballoc.h"
@@ -243,7 +241,6 @@ struct r600_gs_rings_state {
 
 /* This must start from 16. */
 /* features */
-#define DBG_LLVM		(1 << 29)
 #define DBG_NO_CP_DMA		(1 << 30)
 /* shader backend */
 #define DBG_NO_SB		(1 << 21)
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index df40f94bdcf..49a18768af3 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -21,7 +21,6 @@
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 #include "r600_sq.h"
-#include "r600_llvm.h"
 #include "r600_formats.h"
 #include "r600_opcodes.h"
 #include "r600_shader.h"
@@ -194,10 +193,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
 	/* disable SB for shaders using doubles */
 	use_sb &= !shader->shader.uses_doubles;
 
-	/* Check if the bytecode has already been built.  When using the llvm
-	 * backend, r600_shader_from_tgsi() will take care of building the
-	 * bytecode.
-	 */
+	/* Check if the bytecode has already been built. */
 	if (!shader->shader.bc.bytecode) {
 		r = r600_bytecode_build(&shader->shader.bc);
 		if (r) {
@@ -332,7 +328,6 @@ struct r600_shader_ctx {
 	uint32_t				*literals;
 	uint32_t				nliterals;
 	uint32_t				max_driver_temp_used;
-	boolean use_llvm;
 	/* needed for evergreen interpolation */
 	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
 	/* evergreen/cayman also store sample mask in face register */
@@ -661,11 +656,9 @@ static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
 		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
 		if (ctx->shader->input[index].interpolate > 0) {
 			evergreen_interp_assign_ij_index(ctx, index);
-			if (!ctx->use_llvm)
-				r = evergreen_interp_alu(ctx, index);
+			r = evergreen_interp_alu(ctx, index);
 		} else {
-			if (!ctx->use_llvm)
-				r = evergreen_interp_flat(ctx, index);
+			r = evergreen_interp_flat(ctx, index);
 		}
 	}
 	return r;
@@ -2936,22 +2929,16 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	int i, j, k, r = 0;
 	int next_param_base = 0, next_clip_base;
 	int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
-	/* Declarations used by llvm code */
-	bool use_llvm = false;
 	bool indirect_gprs;
 	bool ring_outputs = false;
 	bool lds_outputs = false;
 	bool lds_inputs = false;
 	bool pos_emitted = false;
 
-#ifdef R600_USE_LLVM
-	use_llvm = rscreen->b.debug_flags & DBG_LLVM;
-#endif
 	ctx.bc = &shader->bc;
 	ctx.shader = shader;
 	ctx.native_integers = true;
 
-
 	r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
 			   rscreen->has_compressed_msaa_texturing);
 	ctx.tokens = tokens;
@@ -3043,19 +3030,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		ctx.file_offset[i] = 0;
 	}
 
-#ifdef R600_USE_LLVM
-	if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
-		fprintf(stderr, "Warning: R600 LLVM backend does not support "
-				"indirect adressing.  Falling back to TGSI "
-				"backend.\n");
-		use_llvm = 0;
-	}
-#endif
 	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
 		ctx.file_offset[TGSI_FILE_INPUT] = 1;
-		if (!use_llvm) {
-			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
-		}
+		r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
 	}
 	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
 		if (ctx.bc->chip_class >= EVERGREEN)
@@ -3085,16 +3062,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		if (add_tess_inout)
 			ctx.file_offset[TGSI_FILE_INPUT]+=2;
 	}
-	ctx.use_llvm = use_llvm;
 
-	if (use_llvm) {
-		ctx.file_offset[TGSI_FILE_OUTPUT] =
-			ctx.file_offset[TGSI_FILE_INPUT];
-	} else {
-	   ctx.file_offset[TGSI_FILE_OUTPUT] =
+	ctx.file_offset[TGSI_FILE_OUTPUT] =
 			ctx.file_offset[TGSI_FILE_INPUT] +
 			ctx.info.file_max[TGSI_FILE_INPUT] + 1;
-	}
 	ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
 						ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
 
@@ -3234,54 +3205,10 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 		}
 	}
 
-/* LLVM backend setup */
-#ifdef R600_USE_LLVM
-	if (use_llvm) {
-		struct radeon_llvm_context radeon_llvm_ctx;
-		LLVMModuleRef mod;
-		bool dump = r600_can_dump_shader(&rscreen->b,
-						 tgsi_get_processor_type(tokens));
-		boolean use_kill = false;
-
-		memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
-		radeon_llvm_ctx.type = ctx.type;
-		radeon_llvm_ctx.two_side = shader->two_side;
-		radeon_llvm_ctx.face_gpr = ctx.face_gpr;
-		radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
-		radeon_llvm_ctx.r600_inputs = ctx.shader->input;
-		radeon_llvm_ctx.r600_outputs = ctx.shader->output;
-		radeon_llvm_ctx.color_buffer_count = max_color_exports;
-		radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
-		radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
-		radeon_llvm_ctx.stream_outputs = &so;
-		radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
-		radeon_llvm_ctx.has_compressed_msaa_texturing =
-			ctx.bc->has_compressed_msaa_texturing;
-		mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
-		ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
-		ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
-
-		if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill,
-				      dump, &rctx->b.debug)) {
-			radeon_llvm_dispose(&radeon_llvm_ctx);
-			use_llvm = 0;
-			fprintf(stderr, "R600 LLVM backend failed to compile "
-				"shader.  Falling back to TGSI\n");
-		} else {
-			ctx.file_offset[TGSI_FILE_OUTPUT] =
-					ctx.file_offset[TGSI_FILE_INPUT];
-		}
-		if (use_kill)
-			ctx.shader->uses_kill = use_kill;
-		radeon_llvm_dispose(&radeon_llvm_ctx);
-	}
-#endif
-/* End of LLVM backend setup */
-
 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
 		shader->nr_ps_max_color_exports = 8;
 
-	if (!use_llvm) {
+	if (1) {
 		if (ctx.fragcoord_input >= 0) {
 			if (ctx.bc->chip_class == CAYMAN) {
 				for (j = 0 ; j < 4; j++) {
@@ -3437,8 +3364,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 				alu.dst.write = (j == ochan);
 				if (j == 3)
 					alu.last = 1;
-				if (!use_llvm)
-					r = r600_bytecode_add_alu(ctx.bc, &alu);
+				r = r600_bytecode_add_alu(ctx.bc, &alu);
 				if (r)
 					return r;
 			}
@@ -3446,7 +3372,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	}
 
 	/* Add stream outputs. */
-	if (!use_llvm && so.num_outputs) {
+	if (so.num_outputs) {
 		bool emit = false;
 		if (!lds_outputs && !ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX)
 			emit = true;
@@ -3709,31 +3635,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			}
 		}
 		/* add output to bytecode */
-		if (!use_llvm) {
-			for (i = 0; i < noutput; i++) {
-				r = r600_bytecode_add_output(ctx.bc, &output[i]);
-				if (r)
-					goto out_err;
-			}
+		for (i = 0; i < noutput; i++) {
+			r = r600_bytecode_add_output(ctx.bc, &output[i]);
+			if (r)
+				goto out_err;
 		}
 	}
 
 	/* add program end */
-	if (!use_llvm) {
-		if (ctx.bc->chip_class == CAYMAN)
-			cm_bytecode_add_cf_end(ctx.bc);
-		else {
-			const struct cf_op_info *last = NULL;
+	if (ctx.bc->chip_class == CAYMAN)
+		cm_bytecode_add_cf_end(ctx.bc);
+	else {
+		const struct cf_op_info *last = NULL;
 
-			if (ctx.bc->cf_last)
-				last = r600_isa_cf(ctx.bc->cf_last->op);
+		if (ctx.bc->cf_last)
+			last = r600_isa_cf(ctx.bc->cf_last->op);
 
-			/* alu clause instructions don't have EOP bit, so add NOP */
-			if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
-				r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
+		/* alu clause instructions don't have EOP bit, so add NOP */
+		if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS || ctx.bc->cf_last->op == CF_OP_POP || ctx.bc->cf_last->op == CF_OP_GDS)
+			r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
 
-			ctx.bc->cf_last->end_of_program = 1;
-		}
+		ctx.bc->cf_last->end_of_program = 1;
 	}
 
 	/* check GPR limit - we have 124 = 128 - 4

From 4e5dc69af158dd5eef73ad4fcd7990e49ec6ec57 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 11 Mar 2016 15:53:55 +0100
Subject: [PATCH 069/197] r600g: flatten if (1) statement after removal of
 TGSI->LLVM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/r600/r600_shader.c | 172 ++++++++++++-------------
 1 file changed, 85 insertions(+), 87 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 49a18768af3..77658f53551 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -3208,24 +3208,9 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
 		shader->nr_ps_max_color_exports = 8;
 
-	if (1) {
-		if (ctx.fragcoord_input >= 0) {
-			if (ctx.bc->chip_class == CAYMAN) {
-				for (j = 0 ; j < 4; j++) {
-					struct r600_bytecode_alu alu;
-					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-					alu.op = ALU_OP1_RECIP_IEEE;
-					alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
-					alu.src[0].chan = 3;
-
-					alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
-					alu.dst.chan = j;
-					alu.dst.write = (j == 3);
-					alu.last = 1;
-					if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
-						return r;
-				}
-			} else {
+	if (ctx.fragcoord_input >= 0) {
+		if (ctx.bc->chip_class == CAYMAN) {
+			for (j = 0 ; j < 4; j++) {
 				struct r600_bytecode_alu alu;
 				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
 				alu.op = ALU_OP1_RECIP_IEEE;
@@ -3233,87 +3218,100 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 				alu.src[0].chan = 3;
 
 				alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
-				alu.dst.chan = 3;
-				alu.dst.write = 1;
+				alu.dst.chan = j;
+				alu.dst.write = (j == 3);
 				alu.last = 1;
 				if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
 					return r;
 			}
-		}
-
-		if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+		} else {
 			struct r600_bytecode_alu alu;
-			int r;
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_RECIP_IEEE;
+			alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
+			alu.src[0].chan = 3;
 
-			/* GS thread with no output workaround - emit a cut at start of GS */
-			if (ctx.bc->chip_class == R600)
-				r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
-
-			for (j = 0; j < 4; j++) {
-				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-				alu.op = ALU_OP1_MOV;
-				alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
-				alu.src[0].value = 0;
-				alu.dst.sel = ctx.gs_export_gpr_tregs[j];
-				alu.dst.write = 1;
-				alu.last = 1;
-				r = r600_bytecode_add_alu(ctx.bc, &alu);
-				if (r)
-					return r;
-			}
-		}
-
-		if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
-			r600_fetch_tess_io_info(&ctx);
-
-		if (shader->two_side && ctx.colors_used) {
-			if ((r = process_twoside_color_inputs(&ctx)))
+			alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+			if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
 				return r;
 		}
+	}
 
-		tgsi_parse_init(&ctx.parse, tokens);
-		while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
-			tgsi_parse_token(&ctx.parse);
-			switch (ctx.parse.FullToken.Token.Type) {
-			case TGSI_TOKEN_TYPE_INSTRUCTION:
-				r = tgsi_is_supported(&ctx);
-				if (r)
-					goto out_err;
-				ctx.max_driver_temp_used = 0;
-				/* reserve first tmp for everyone */
-				r600_get_temp(&ctx);
+	if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+		struct r600_bytecode_alu alu;
+		int r;
 
-				opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
-				if ((r = tgsi_split_constant(&ctx)))
-					goto out_err;
-				if ((r = tgsi_split_literal_constant(&ctx)))
-					goto out_err;
-				if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
-					if ((r = tgsi_split_gs_inputs(&ctx)))
-						goto out_err;
-				} else if (lds_inputs) {
-					if ((r = tgsi_split_lds_inputs(&ctx)))
-						goto out_err;
-				}
-				if (ctx.bc->chip_class == CAYMAN)
-					ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
-				else if (ctx.bc->chip_class >= EVERGREEN)
-					ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
-				else
-					ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
-				r = ctx.inst_info->process(&ctx);
-				if (r)
-					goto out_err;
+		/* GS thread with no output workaround - emit a cut at start of GS */
+		if (ctx.bc->chip_class == R600)
+			r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
 
-				if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
-					r = r600_store_tcs_output(&ctx);
-					if (r)
-						goto out_err;
-				}
-				break;
-			default:
-				break;
+		for (j = 0; j < 4; j++) {
+			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+			alu.op = ALU_OP1_MOV;
+			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
+			alu.src[0].value = 0;
+			alu.dst.sel = ctx.gs_export_gpr_tregs[j];
+			alu.dst.write = 1;
+			alu.last = 1;
+			r = r600_bytecode_add_alu(ctx.bc, &alu);
+			if (r)
+				return r;
+		}
+	}
+
+	if (ctx.type == TGSI_PROCESSOR_TESS_CTRL)
+		r600_fetch_tess_io_info(&ctx);
+
+	if (shader->two_side && ctx.colors_used) {
+		if ((r = process_twoside_color_inputs(&ctx)))
+			return r;
+	}
+
+	tgsi_parse_init(&ctx.parse, tokens);
+	while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
+		tgsi_parse_token(&ctx.parse);
+		switch (ctx.parse.FullToken.Token.Type) {
+		case TGSI_TOKEN_TYPE_INSTRUCTION:
+			r = tgsi_is_supported(&ctx);
+			if (r)
+				goto out_err;
+			ctx.max_driver_temp_used = 0;
+			/* reserve first tmp for everyone */
+			r600_get_temp(&ctx);
+
+			opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
+			if ((r = tgsi_split_constant(&ctx)))
+				goto out_err;
+			if ((r = tgsi_split_literal_constant(&ctx)))
+				goto out_err;
+			if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
+				if ((r = tgsi_split_gs_inputs(&ctx)))
+					goto out_err;
+			} else if (lds_inputs) {
+				if ((r = tgsi_split_lds_inputs(&ctx)))
+					goto out_err;
 			}
+			if (ctx.bc->chip_class == CAYMAN)
+				ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
+			else if (ctx.bc->chip_class >= EVERGREEN)
+				ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
+			else
+				ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
+			r = ctx.inst_info->process(&ctx);
+			if (r)
+				goto out_err;
+
+			if (ctx.type == TGSI_PROCESSOR_TESS_CTRL) {
+				r = r600_store_tcs_output(&ctx);
+				if (r)
+					goto out_err;
+			}
+			break;
+		default:
+			break;
 		}
 	}
 

From ea2bff1d115ef00aaa06797fffd4334f6a50570f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 11 Mar 2016 15:59:28 +0100
Subject: [PATCH 070/197] gallium/radeon: remove remnants of R600 TGSI->LLVM
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/drivers/radeon/radeon_llvm.h        | 17 -----------------
 .../drivers/radeon/radeon_setup_tgsi_llvm.c     |  3 ---
 2 files changed, 20 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index bdee2f8020a..0a164bba307 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -51,24 +51,8 @@ struct radeon_llvm_loop {
 };
 
 struct radeon_llvm_context {
-
 	struct lp_build_tgsi_soa_context soa;
 
-	unsigned chip_class;
-	unsigned type;
-	unsigned face_gpr;
-	unsigned two_side;
-	unsigned inputs_count;
-	struct r600_shader_io * r600_inputs;
-	struct r600_shader_io * r600_outputs;
-	struct pipe_stream_output_info *stream_outputs;
-	unsigned color_buffer_count;
-	unsigned fs_color_all;
-	unsigned alpha_to_one;
-	unsigned has_txq_cube_array_z_comp;
-	unsigned uses_tex_buffers;
-	unsigned has_compressed_msaa_texturing;
-
 	/*=== Front end configuration ===*/
 
 	/* Instructions that are not described by any of the TGSI opcodes. */
@@ -90,7 +74,6 @@ struct radeon_llvm_context {
 	  */
 	LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
 	LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
-	unsigned output_reg_count;
 
 	/** This pointer is used to contain the temporary values.
 	  * The amount of temporary used in tgsi can't be bound to a max value and
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index c74397fb5c9..fb883cb585e 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -363,9 +363,6 @@ static void emit_declaration(
 					ctx->soa.bld_base.base.elem_type, "");
 			}
 		}
-
-		ctx->output_reg_count = MAX2(ctx->output_reg_count,
-							 decl->Range.Last + 1);
 		break;
 	}
 

From 6eeb284e4f74a2fe5ae6cba90f97f219935e24df Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 19 Mar 2016 11:58:25 -0400
Subject: [PATCH 071/197] nv50/ir: normalize cube coordinates after derivatives
 have been computed

In "manual" derivative mode (always used on nv50 and sometimes on nvc0
but always for cube), the idea is that using the quadop instruction, we
set up the "other" quads to have values such that the derivatives work
out, and then run the texture instruction as if nothing were strange. It
pulls values from the other lanes, and does its magic.

However cube coordinates have to be normalized - one of the 3 coords has
to be 1, to determine which is the major axis, to say which face is
being sampled. We were normalizing the coordinates first, and then
adding the derivatives. This is wrong for two reasons:

- the coordinates got normalized by a scaling factor but the
  derivatives didn't
- the result of the addition didn't end up normalized

To resolve this, we flip the logic around to normalize *after* the
per-lane coordinates are set up.

This fixes a bunch of textureGrad cube dEQP tests.

NOTE: nv50 cube arrays with explicit derivatives are still broken, to be
resolved at a later date.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 12 -------
 .../codegen/nv50_ir_lowering_gm107.cpp        | 18 +++++++++-
 .../nouveau/codegen/nv50_ir_lowering_nv50.cpp | 34 +++++++++++++++++-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 35 ++++++++++++++++++-
 4 files changed, 84 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 4bebfdc0a7b..39cd986c23a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1989,7 +1989,6 @@ Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
 void
 Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
 {
-   Value *val;
    Value *arg[4], *src[8];
    Value *lod = NULL, *shd = NULL;
    unsigned int s, c, d;
@@ -2032,17 +2031,6 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
          shd = src[n - 1];
    }
 
-   if (tgt.isCube()) {
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
-      val = getScratch();
-      mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
-      mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
-      mkOp1(OP_RCP, TYPE_F32, val, val);
-      for (c = 0; c < 3; ++c)
-         src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
-   }
-
    for (c = 0, d = 0; c < 4; ++c) {
       if (dst[c]) {
          texi->setDef(d++, dst[c]);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 0b903780614..a5deaef14e0 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -67,6 +67,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
    tmp = bld.getScratch();
 
    for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
       for (c = 0; c < dim; ++c) {
@@ -92,10 +93,25 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
          add->lanes = 1; /* abused for .ndv */
       }
 
+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
+
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
       bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
 
       // save results
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 6987503f9ce..02c4f1a4ca8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -724,6 +724,23 @@ NV50LoweringPreSSA::handleTEX(TexInstruction *i)
    const int dref = arg;
    const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
 
+   /* Only normalize in the non-explicit derivatives case.
+    */
+   if (i->tex.target.isCube() && i->op != OP_TXD) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
    // handle MS, which means looking up the MS params for this texture, and
    // adjusting the input coordinates to point at the right sample.
    if (i->tex.target.isMS()) {
@@ -941,6 +958,7 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
 
    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
    for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
@@ -950,10 +968,24 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
       // add dPdy from lane l to lanes dy
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates if necessary
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c, crd[c]);
+         tex->setSrc(c, src[c]);
       // save results
       for (c = 0; i->defExists(c); ++c) {
          Instruction *mov;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index c88a2695a4c..8e3529ffe5c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -615,6 +615,24 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
    const int chipset = prog->getTarget()->getChipset();
 
+   /* Only normalize in the non-explicit derivatives case. For explicit
+    * derivatives, this is handled in handleManualTXD.
+    */
+   if (i->tex.target.isCube() && i->dPdx[0].get() == NULL) {
+      Value *src[3], *val;
+      int c;
+      for (c = 0; c < 3; ++c)
+         src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), i->getSrc(c));
+      val = bld.getScratch();
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+      bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+      bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+      for (c = 0; c < 3; ++c) {
+         i->setSrc(c, bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(),
+                                 i->getSrc(c), val));
+      }
+   }
+
    // Arguments to the TEX instruction are a little insane. Even though the
    // encoding is identical between SM20 and SM30, the arguments mean
    // different things between Fermi and Kepler+. A lot of arguments are
@@ -861,6 +879,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
 
    bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
    for (l = 0; l < 4; ++l) {
+      Value *src[3], *val;
       // mov coordinates from lane l to all lanes
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
@@ -870,10 +889,24 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
       // add dPdy from lane l to lanes dy
       for (c = 0; c < dim; ++c)
          bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // normalize cube coordinates
+      if (i->tex.target.isCube()) {
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp1v(OP_ABS, TYPE_F32, bld.getSSA(), crd[c]);
+         val = bld.getScratch();
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
+         bld.mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
+         bld.mkOp1(OP_RCP, TYPE_F32, val, val);
+         for (c = 0; c < 3; ++c)
+            src[c] = bld.mkOp2v(OP_MUL, TYPE_F32, bld.getSSA(), crd[c], val);
+      } else {
+         for (c = 0; c < dim; ++c)
+            src[c] = crd[c];
+      }
       // texture
       bld.insert(tex = cloneForward(func, i));
       for (c = 0; c < dim; ++c)
-         tex->setSrc(c + array, crd[c]);
+         tex->setSrc(c + array, src[c]);
       // save results
       for (c = 0; i->defExists(c); ++c) {
          Instruction *mov;

From adb40a739943d62508b9c79cbd85e3c67ee3b43b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 19 Mar 2016 21:25:36 -0400
Subject: [PATCH 072/197] st/mesa: only minify depth for 3d targets

We make sure that the image depth matches the level's depth before
copying it into place. However we should only be minifying the first
level's depth for 3d textures - array textures have the same depth for
all levels.

This fixes tests such as
dEQP-GLES3.functional.texture.specification.texsubimage3d_depth.* and I
suspect accounts for a number of other odd situations I've run into where
level > 0 of array textures was messed up.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/state_tracker/st_cb_texture.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index bffa4d026cb..460c1790663 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -2886,10 +2886,13 @@ st_finalize_texture(struct gl_context *ctx,
          /* Need to import images in main memory or held in other textures.
           */
          if (stImage && stObj->pt != stImage->pt) {
+            GLuint depth = stObj->depth0;
+            if (stObj->base.Target == GL_TEXTURE_3D)
+               depth = u_minify(depth, level);
             if (level == 0 ||
                 (stImage->base.Width == u_minify(stObj->width0, level) &&
                  stImage->base.Height == u_minify(stObj->height0, level) &&
-                 stImage->base.Depth == u_minify(stObj->depth0, level))) {
+                 stImage->base.Depth == depth)) {
                /* src image fits expected dest mipmap level size */
                copy_image_data_to_texture(st, stObj, level, stImage);
             }

From 7d98bfedd73d632041d27ff12ccf7c7be74a2ddd Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Mar 2016 13:43:43 -0400
Subject: [PATCH 073/197] nv50/ir: fix indirect texturing for non-array
 textures on nvc0

If a layer parameter is provided, we want to flip it to position 0 (and
combine it with any indirect params). However if the target is not an
array, there is no layer, so we have to shift all of the arguments down
by one to make room for it.

This fixes situations where there were non-coordinate parameters, such
as bias, lod, depth compare, explicit derivatives. Instead of adding a
new parameter at the front for the indirect reference, we would swap one
of those in its place.

Fixes dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.uniform.compute.*shadow

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reported-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Tested-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 .../drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp  | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 8e3529ffe5c..e8f8e30918b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -746,9 +746,13 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
       }
 
       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
-      for (int s = dim; s >= 1; --s)
-         i->setSrc(s, i->getSrc(s - 1));
-      i->setSrc(0, arrayIndex);
+      if (arrayIndex) {
+         for (int s = dim; s >= 1; --s)
+            i->setSrc(s, i->getSrc(s - 1));
+         i->setSrc(0, arrayIndex);
+      } else {
+         i->moveSources(0, 1);
+      }
 
       if (arrayIndex) {
          int sat = (i->op == OP_TXF) ? 1 : 0;

From 46de6bbb775602ab237d0054e5351b0fc90d942b Mon Sep 17 00:00:00 2001
From: Nishanth Peethambaran <nishanth.peethambaran@amd.com>
Date: Tue, 15 Mar 2016 01:56:18 -0400
Subject: [PATCH 074/197] st/omx: Remove trailing spaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Nishanth Peethambaran <nishanth.peethambaran@amd.com>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/state_trackers/omx/vid_dec.c      | 10 ++---
 src/gallium/state_trackers/omx/vid_dec_h264.c |  8 ++--
 src/gallium/state_trackers/omx/vid_enc.c      | 44 +++++++++----------
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/gallium/state_trackers/omx/vid_dec.c b/src/gallium/state_trackers/omx/vid_dec.c
index 5584348761e..9fcf20ebebf 100644
--- a/src/gallium/state_trackers/omx/vid_dec.c
+++ b/src/gallium/state_trackers/omx/vid_dec.c
@@ -140,7 +140,7 @@ static OMX_ERRORTYPE vid_dec_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
 
    r = omx_base_filter_Constructor(comp, name);
    if (r)
-	return r;
+      return r;
 
    priv->profile = PIPE_VIDEO_PROFILE_UNKNOWN;
 
@@ -268,7 +268,7 @@ static OMX_ERRORTYPE vid_dec_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
       r = checkHeader(param, sizeof(OMX_PARAM_COMPONENTROLETYPE));
       if (r)
          return r;
- 
+
       if (!strcmp((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE)) {
          priv->profile = PIPE_VIDEO_PROFILE_MPEG2_MAIN;
       } else if (!strcmp((char *)role->cRole, OMX_VID_DEC_AVC_ROLE)) {
@@ -321,7 +321,7 @@ static OMX_ERRORTYPE vid_dec_GetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
          strcpy((char *)role->cRole, OMX_VID_DEC_MPEG2_ROLE);
       else if (priv->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH)
          strcpy((char *)role->cRole, OMX_VID_DEC_AVC_ROLE);
- 
+
       break;
    }
 
@@ -474,7 +474,7 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
       if (r)
          return r;
    }
- 
+
    return OMX_ErrorNone;
 }
 
@@ -513,7 +513,7 @@ static void vid_dec_FillOutput(vid_dec_PrivateType *priv, struct pipe_video_buff
 
    box.width = def->nFrameWidth / 2;
    box.height = def->nFrameHeight / 2;
- 
+
    src = priv->pipe->transfer_map(priv->pipe, views[1]->texture, 0,
                                   PIPE_TRANSFER_READ, &box, &transfer);
    util_copy_rect(dst, views[1]->texture->format, def->nStride, 0, 0,
diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c
index b4536828909..75f27d2f6b0 100644
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -91,7 +91,7 @@ void vid_dec_h264_Init(vid_dec_PrivateType *priv)
    priv->Decode = vid_dec_h264_Decode;
    priv->EndFrame = vid_dec_h264_EndFrame;
    priv->Flush = vid_dec_h264_Flush;
-   
+
    LIST_INITHEAD(&priv->codec_data.h264.dpb_list);
    priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX;
 }
@@ -829,7 +829,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
          priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0];
          priv->picture.h264.field_order_cnt[1] = priv->picture.h264.field_order_cnt[0] +
             sps->offset_for_top_to_bottom_field + priv->codec_data.h264.delta_pic_order_cnt[1];
-         
+
       } else if (!priv->picture.h264.bottom_field_flag)
          priv->picture.h264.field_order_cnt[0] = expectedPicOrderCnt + priv->codec_data.h264.delta_pic_order_cnt[0];
       else
@@ -859,7 +859,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
       if (!priv->picture.h264.field_pic_flag) {
          priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt;
          priv->picture.h264.field_order_cnt[1] = tempPicOrderCnt;
-         
+
       } else if (!priv->picture.h264.bottom_field_flag)
          priv->picture.h264.field_order_cnt[0] = tempPicOrderCnt;
       else
@@ -876,7 +876,7 @@ static void slice_header(vid_dec_PrivateType *priv, struct vl_rbsp *rbsp,
 
    priv->picture.h264.num_ref_idx_l0_active_minus1 = pps->num_ref_idx_l0_default_active_minus1;
    priv->picture.h264.num_ref_idx_l1_active_minus1 = pps->num_ref_idx_l1_default_active_minus1;
- 
+
    if (slice_type == PIPE_H264_SLICE_TYPE_P ||
        slice_type == PIPE_H264_SLICE_TYPE_SP ||
        slice_type == PIPE_H264_SLICE_TYPE_B) {
diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index df22a97a42c..4505fe1a693 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -179,7 +179,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
    if (!screen->get_video_param(screen, PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE, PIPE_VIDEO_CAP_SUPPORTED))
       return OMX_ErrorBadParameter;
- 
+
    priv->stacked_frames_num = screen->get_video_param(screen,
                                 PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH,
                                 PIPE_VIDEO_ENTRYPOINT_ENCODE,
@@ -242,7 +242,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
 
    port->Port_AllocateBuffer = vid_enc_AllocateOutBuffer;
    port->Port_FreeBuffer = vid_enc_FreeOutBuffer;
- 
+
    priv->bitrate.eControlRate = OMX_Video_ControlRateDisable;
    priv->bitrate.nTargetBitrate = 0;
 
@@ -253,7 +253,7 @@ static OMX_ERRORTYPE vid_enc_Constructor(OMX_COMPONENTTYPE *comp, OMX_STRING nam
    priv->profile_level.eProfile = OMX_VIDEO_AVCProfileBaseline;
    priv->profile_level.eLevel = OMX_VIDEO_AVCLevel42;
 
-   priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; 
+   priv->force_pic_type.IntraRefreshVOP = OMX_FALSE;
    priv->frame_num = 0;
    priv->pic_order_cnt = 0;
    priv->restricted_b_frames = debug_get_bool_option("OMX_USE_RESTRICTED_B_FRAMES", FALSE);
@@ -380,7 +380,7 @@ static OMX_ERRORTYPE vid_enc_SetParameter(OMX_HANDLETYPE handle, OMX_INDEXTYPE i
 
          port = (omx_base_video_PortType *)priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX];
          port->sPortParam.nBufferSize = framesize * 512 / (16*16);
-      
+
          priv->frame_rate = def->format.video.xFramerate;
 
          priv->callbacks->EventHandler(comp, priv->callbackData, OMX_EventPortSettingsChanged,
@@ -532,10 +532,10 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
    vid_enc_PrivateType *priv = comp->pComponentPrivate;
    OMX_ERRORTYPE r;
    int i;
- 
+
    if (!config)
       return OMX_ErrorBadParameter;
-                         
+
    switch(idx) {
    case OMX_IndexConfigVideoIntraVOPRefresh: {
       OMX_CONFIG_INTRAREFRESHVOPTYPE *type = config;
@@ -543,9 +543,9 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
       r = checkHeader(config, sizeof(OMX_CONFIG_INTRAREFRESHVOPTYPE));
       if (r)
          return r;
-      
+
       priv->force_pic_type = *type;
-      
+
       break;
    }
    case OMX_IndexConfigCommonScale: {
@@ -568,11 +568,11 @@ static OMX_ERRORTYPE vid_enc_SetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
       priv->scale = *scale;
       if (priv->scale.xWidth != 0xffffffff && priv->scale.xHeight != 0xffffffff) {
          struct pipe_video_buffer templat = {};
- 
+
          templat.buffer_format = PIPE_FORMAT_NV12;
          templat.chroma_format = PIPE_VIDEO_CHROMA_FORMAT_420;
-         templat.width = priv->scale.xWidth; 
-         templat.height = priv->scale.xHeight; 
+         templat.width = priv->scale.xWidth;
+         templat.height = priv->scale.xHeight;
          templat.interlaced = false;
          for (i = 0; i < OMX_VID_ENC_NUM_SCALING_BUFFERS; ++i) {
             priv->scale_buffer[i] = priv->s_pipe->create_video_buffer(priv->s_pipe, &templat);
@@ -615,7 +615,7 @@ static OMX_ERRORTYPE vid_enc_GetConfig(OMX_HANDLETYPE handle, OMX_INDEXTYPE idx,
    default:
       return omx_base_component_GetConfig(handle, idx, config);
    }
-   
+
    return OMX_ErrorNone;
 }
 
@@ -1010,10 +1010,10 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
    switch (priv->bitrate.eControlRate) {
    case OMX_Video_ControlRateVariable:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE;
-      break; 
+      break;
    case OMX_Video_ControlRateConstant:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_CONSTANT;
-      break; 
+      break;
    case OMX_Video_ControlRateVariableSkipFrames:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_VARIABLE_SKIP;
       break;
@@ -1023,8 +1023,8 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
    default:
       rate_ctrl->rate_ctrl_method = PIPE_H264_ENC_RATE_CONTROL_METHOD_DISABLE;
       break;
-   } 
-      
+   }
+
    rate_ctrl->frame_rate_den = OMX_VID_ENC_CONTROL_FRAME_RATE_DEN_DEFAULT;
    rate_ctrl->frame_rate_num = ((priv->frame_rate) >> 16) * rate_ctrl->frame_rate_den;
 
@@ -1035,7 +1035,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
          rate_ctrl->target_bitrate = priv->bitrate.nTargetBitrate;
       else
          rate_ctrl->target_bitrate = OMX_VID_ENC_BITRATE_MAX;
-      rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate;    
+      rate_ctrl->peak_bitrate = rate_ctrl->target_bitrate;
       if (rate_ctrl->target_bitrate < OMX_VID_ENC_BITRATE_MEDIAN)
          rate_ctrl->vbv_buffer_size = MIN2((rate_ctrl->target_bitrate * 2.75), OMX_VID_ENC_BITRATE_MEDIAN);
       else
@@ -1051,7 +1051,7 @@ static void enc_ControlPicture(omx_base_PortType *port, struct pipe_h264_enc_pic
       rate_ctrl->peak_bits_picture_integer = rate_ctrl->target_bits_picture;
       rate_ctrl->peak_bits_picture_fraction = 0;
    }
-   
+
    picture->quant_i_frames = priv->quant.nQpI;
    picture->quant_p_frames = priv->quant.nQpP;
    picture->quant_b_frames = priv->quant.nQpB;
@@ -1069,7 +1069,7 @@ static void enc_HandleTask(omx_base_PortType *port, struct encode_task *task,
    unsigned size = priv->ports[OMX_BASE_FILTER_OUTPUTPORT_INDEX]->sPortParam.nBufferSize;
    struct pipe_video_buffer *vbuf = task->buf;
    struct pipe_h264_enc_picture_desc picture = {};
- 
+
    /* -------------- scale input image --------- */
    enc_ScaleInput(port, &vbuf, &size);
    priv->s_pipe->flush(priv->s_pipe, NULL, 0);
@@ -1160,7 +1160,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
        priv->force_pic_type.IntraRefreshVOP) {
       enc_ClearBframes(port, inp);
       picture_type = PIPE_H264_ENC_PICTURE_TYPE_IDR;
-      priv->force_pic_type.IntraRefreshVOP = OMX_FALSE; 
+      priv->force_pic_type.IntraRefreshVOP = OMX_FALSE;
       priv->frame_num = 0;
    } else if (priv->codec->profile == PIPE_VIDEO_PROFILE_MPEG4_AVC_BASELINE ||
               !(priv->pic_order_cnt % OMX_VID_ENC_P_PERIOD_DEFAULT) ||
@@ -1169,7 +1169,7 @@ static OMX_ERRORTYPE vid_enc_EncodeFrame(omx_base_PortType *port, OMX_BUFFERHEAD
    } else {
       picture_type = PIPE_H264_ENC_PICTURE_TYPE_B;
    }
-   
+
    task->pic_order_cnt = priv->pic_order_cnt++;
 
    if (picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
@@ -1245,7 +1245,7 @@ static void vid_enc_BufferEncoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
    output->pBuffer = priv->t_pipe->transfer_map(priv->t_pipe, outp->bitstream, 0,
                                                 PIPE_TRANSFER_READ_WRITE,
                                                 &box, &outp->transfer);
- 
+
    /* ------------- get size of result ----------------- */
 
    priv->codec->get_feedback(priv->codec, task->feedback, &size);

From eeb117a09d6c0eb2b4fa94d55e8015c8aa982727 Mon Sep 17 00:00:00 2001
From: Nishanth Peethambaran <nishanth.peethambaran@amd.com>
Date: Fri, 11 Mar 2016 01:23:00 -0500
Subject: [PATCH 075/197] st/omx/dec: Correct the timestamping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Attach the timestamp to the dpb buffer and use that timestamp
while pushing buffer from dpb list to the omx client.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Nishanth Peethambaran <nishanth.peethambaran@amd.com>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/state_trackers/omx/vid_dec.c       | 11 +++++++++--
 src/gallium/state_trackers/omx/vid_dec.h       |  7 ++++++-
 src/gallium/state_trackers/omx/vid_dec_h264.c  | 18 +++++++++++++++---
 .../state_trackers/omx/vid_dec_mpeg12.c        |  6 ++++--
 4 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/src/gallium/state_trackers/omx/vid_dec.c b/src/gallium/state_trackers/omx/vid_dec.c
index 9fcf20ebebf..108a46029e0 100644
--- a/src/gallium/state_trackers/omx/vid_dec.c
+++ b/src/gallium/state_trackers/omx/vid_dec.c
@@ -419,6 +419,7 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
    priv->in_buffers[i] = buf;
    priv->sizes[i] = buf->nFilledLen;
    priv->inputs[i] = buf->pBuffer;
+   priv->timestamps[i] = buf->nTimeStamp;
 
    while (priv->num_in_buffers > (!!(buf->nFlags & OMX_BUFFERFLAG_EOS) ? 0 : 1)) {
       bool eos = !!(priv->in_buffers[0]->nFlags & OMX_BUFFERFLAG_EOS);
@@ -469,6 +470,7 @@ static OMX_ERRORTYPE vid_dec_DecodeBuffer(omx_base_PortType *port, OMX_BUFFERHEA
          priv->in_buffers[0] = priv->in_buffers[1];
          priv->sizes[0] = priv->sizes[1] - delta;
          priv->inputs[0] = priv->inputs[1] + delta;
+         priv->timestamps[0] = priv->timestamps[1];
       }
 
       if (r)
@@ -526,9 +528,13 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
 {
    vid_dec_PrivateType *priv = comp->pComponentPrivate;
    bool eos = !!(input->nFlags & OMX_BUFFERFLAG_EOS);
+   OMX_TICKS timestamp;
 
-   if (!input->pInputPortPrivate)
-      input->pInputPortPrivate = priv->Flush(priv);
+   if (!input->pInputPortPrivate) {
+      input->pInputPortPrivate = priv->Flush(priv, &timestamp);
+      if (timestamp != OMX_VID_DEC_TIMESTAMP_INVALID)
+         input->nTimeStamp = timestamp;
+   }
 
    if (input->pInputPortPrivate) {
       if (output->pInputPortPrivate) {
@@ -539,6 +545,7 @@ static void vid_dec_FrameDecoded(OMX_COMPONENTTYPE *comp, OMX_BUFFERHEADERTYPE*
          vid_dec_FillOutput(priv, input->pInputPortPrivate, output);
       }
       output->nFilledLen = output->nAllocLen;
+      output->nTimeStamp = input->nTimeStamp;
    }
 
    if (eos && input->pInputPortPrivate)
diff --git a/src/gallium/state_trackers/omx/vid_dec.h b/src/gallium/state_trackers/omx/vid_dec.h
index 3b39826b8a7..649d745dfc0 100644
--- a/src/gallium/state_trackers/omx/vid_dec.h
+++ b/src/gallium/state_trackers/omx/vid_dec.h
@@ -59,6 +59,8 @@
 #define OMX_VID_DEC_AVC_NAME "OMX.mesa.video_decoder.avc"
 #define OMX_VID_DEC_AVC_ROLE "video_decoder.avc"
 
+#define OMX_VID_DEC_TIMESTAMP_INVALID ((OMX_TICKS) -1)
+
 struct vl_vlc;
 
 DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
@@ -69,7 +71,7 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
    struct pipe_video_codec *codec; \
    void (*Decode)(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left); \
    void (*EndFrame)(vid_dec_PrivateType *priv); \
-   struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv); \
+   struct pipe_video_buffer *(*Flush)(vid_dec_PrivateType *priv, OMX_TICKS *timestamp); \
    struct pipe_video_buffer *target, *shadow; \
    union { \
       struct { \
@@ -100,6 +102,9 @@ DERIVEDCLASS(vid_dec_PrivateType, omx_base_filter_PrivateType)
    OMX_BUFFERHEADERTYPE *in_buffers[2]; \
    const void *inputs[2]; \
    unsigned sizes[2]; \
+   OMX_TICKS timestamps[2]; \
+   OMX_TICKS timestamp; \
+   bool first_buf_in_frame; \
    bool frame_finished; \
    bool frame_started; \
    unsigned bytes_left; \
diff --git a/src/gallium/state_trackers/omx/vid_dec_h264.c b/src/gallium/state_trackers/omx/vid_dec_h264.c
index 75f27d2f6b0..9aab6d1a1a3 100644
--- a/src/gallium/state_trackers/omx/vid_dec_h264.c
+++ b/src/gallium/state_trackers/omx/vid_dec_h264.c
@@ -45,6 +45,7 @@
 struct dpb_list {
    struct list_head list;
    struct pipe_video_buffer *buffer;
+   OMX_TICKS timestamp;
    unsigned poc;
 };
 
@@ -82,7 +83,7 @@ static const uint8_t Default_8x8_Inter[64] = {
 
 static void vid_dec_h264_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left);
 static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv);
-static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv);
+static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp);
 
 void vid_dec_h264_Init(vid_dec_PrivateType *priv)
 {
@@ -94,6 +95,7 @@ void vid_dec_h264_Init(vid_dec_PrivateType *priv)
 
    LIST_INITHEAD(&priv->codec_data.h264.dpb_list);
    priv->picture.h264.field_order_cnt[0] = priv->picture.h264.field_order_cnt[1] = INT_MAX;
+   priv->first_buf_in_frame = true;
 }
 
 static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
@@ -104,6 +106,9 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
       return;
 
    vid_dec_NeedTarget(priv);
+   if (priv->first_buf_in_frame)
+      priv->timestamp = priv->timestamps[0];
+   priv->first_buf_in_frame = false;
 
    priv->picture.h264.num_ref_frames = priv->picture.h264.pps->sps->max_num_ref_frames;
 
@@ -127,7 +132,8 @@ static void vid_dec_h264_BeginFrame(vid_dec_PrivateType *priv)
    priv->frame_started = true;
 }
 
-static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv)
+static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv,
+                                                    OMX_TICKS *timestamp)
 {
    struct dpb_list *entry, *result = NULL;
    struct pipe_video_buffer *buf;
@@ -146,6 +152,8 @@ static struct pipe_video_buffer *vid_dec_h264_Flush(vid_dec_PrivateType *priv)
       return NULL;
 
    buf = result->buffer;
+   if (timestamp)
+      *timestamp = result->timestamp;
 
    --priv->codec_data.h264.dpb_num;
    LIST_DEL(&result->list);
@@ -159,6 +167,7 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
    struct dpb_list *entry;
    struct pipe_video_buffer *tmp;
    bool top_field_first;
+   OMX_TICKS timestamp;
 
    if (!priv->frame_started)
       return;
@@ -181,7 +190,9 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
    if (!entry)
       return;
 
+   priv->first_buf_in_frame = true;
    entry->buffer = priv->target;
+   entry->timestamp = priv->timestamp;
    entry->poc = MIN2(priv->picture.h264.field_order_cnt[0], priv->picture.h264.field_order_cnt[1]);
    LIST_ADDTAIL(&entry->list, &priv->codec_data.h264.dpb_list);
    ++priv->codec_data.h264.dpb_num;
@@ -192,7 +203,8 @@ static void vid_dec_h264_EndFrame(vid_dec_PrivateType *priv)
       return;
 
    tmp = priv->in_buffers[0]->pInputPortPrivate;
-   priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv);
+   priv->in_buffers[0]->pInputPortPrivate = vid_dec_h264_Flush(priv, &timestamp);
+   priv->in_buffers[0]->nTimeStamp = timestamp;
    priv->target = tmp;
    priv->frame_finished = priv->in_buffers[0]->pInputPortPrivate != NULL;
 }
diff --git a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
index bef83ecd85a..7b2df8f40fa 100644
--- a/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
+++ b/src/gallium/state_trackers/omx/vid_dec_mpeg12.c
@@ -61,7 +61,7 @@ static uint8_t default_non_intra_matrix[64] = {
 
 static void vid_dec_mpeg12_Decode(vid_dec_PrivateType *priv, struct vl_vlc *vlc, unsigned min_bits_left);
 static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv);
-static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv);
+static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp);
 
 void vid_dec_mpeg12_Init(vid_dec_PrivateType *priv)
 {
@@ -131,10 +131,12 @@ static void vid_dec_mpeg12_EndFrame(vid_dec_PrivateType *priv)
    priv->in_buffers[0]->pInputPortPrivate = done;
 }
 
-static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv)
+static struct pipe_video_buffer *vid_dec_mpeg12_Flush(vid_dec_PrivateType *priv, OMX_TICKS *timestamp)
 {
    struct pipe_video_buffer *result = priv->picture.mpeg12.ref[1];
    priv->picture.mpeg12.ref[1] = NULL;
+   if (timestamp)
+      *timestamp = OMX_VID_DEC_TIMESTAMP_INVALID;
    return result;
 }
 

From bbbdcdcf75f05f97ea346fd6d84ecc3d8ec61d24 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 19 Mar 2016 23:27:56 -0400
Subject: [PATCH 076/197] st/mesa: report correct precision information for
 low/medium/high ints

When we have native integers, these have full precision. Whether they're
low/medium/high isn't piped through the TGSI yet, but eventually those
might have differing precisions. For now they're just 32-bit ints.

Fixes the following dEQP tests:

  dEQP-GLES3.functional.state_query.shader.precision_vertex_highp_int
  dEQP-GLES3.functional.state_query.shader.precision_fragment_highp_int

which expected highp ints to have full 32-bit precision, not the default
23-bit float precision.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_extensions.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 3666ece8ee7..988e9049a20 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -253,6 +253,13 @@ void st_init_limits(struct pipe_screen *screen,
       pc->MaxLocalParams = MIN2(pc->MaxParameters, MAX_PROGRAM_LOCAL_PARAMS);
       pc->MaxEnvParams = MIN2(pc->MaxParameters, MAX_PROGRAM_ENV_PARAMS);
 
+      if (screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_INTEGERS)) {
+         pc->LowInt.RangeMin = 31;
+         pc->LowInt.RangeMax = 30;
+         pc->LowInt.Precision = 0;
+         pc->MediumInt = pc->HighInt = pc->LowInt;
+      }
+
       options->EmitNoNoise = TRUE;
 
       /* TODO: make these more fine-grained if anyone needs it */

From b72156c8e0fe9dd2c4d9614f3f7d1e8bdea0e4dd Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 16 Mar 2016 09:46:05 +0100
Subject: [PATCH 077/197] tgsi: Fix return of uninitialized memory in
 tgsi_*_instruction_memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tgsi_default_instruction_memory / tgsi_build_instruction_memory were
returning uninitialized memory for tgsi_instruction_memory.Texture and
tgsi_instruction_memory.Format. Note 0 means not set, and thus is a
correct default initializer for these.

Fixes: 3243b6fc97 ("tgsi: add Texture and Format to tgsi_instruction_memory")
Cc: Nicolai Hähnle <nicolai.haehnle@amd.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_build.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index e5355f573bb..638730acf95 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -775,6 +775,8 @@ tgsi_default_instruction_memory( void )
    struct tgsi_instruction_memory instruction_memory;
 
    instruction_memory.Qualifier = 0;
+   instruction_memory.Texture = 0;
+   instruction_memory.Format = 0;
    instruction_memory.Padding = 0;
 
    return instruction_memory;
@@ -790,6 +792,8 @@ tgsi_build_instruction_memory(
    struct tgsi_instruction_memory instruction_memory;
 
    instruction_memory.Qualifier = qualifier;
+   instruction_memory.Texture = 0;
+   instruction_memory.Format = 0;
    instruction_memory.Padding = 0;
    instruction->Memory = 1;
 

From 8f45691cdaedd5e743c11fbd8ae71c192644f715 Mon Sep 17 00:00:00 2001
From: Iago Toral Quiroga <itoral@igalia.com>
Date: Fri, 18 Mar 2016 08:39:23 +0100
Subject: [PATCH 078/197] doc: document spilling options accepted by
 INTEL_DEBUG

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 docs/envvars.html | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/envvars.html b/docs/envvars.html
index e21b7c1aaa0..253aaf26dcd 100644
--- a/docs/envvars.html
+++ b/docs/envvars.html
@@ -164,6 +164,8 @@ See the <a href="xlibdriver.html">Xlib software driver page</a> for details.
    <li>nodualobj - suppress generation of dual-object geometry shader code</li>
    <li>optimizer - dump shader assembly to files at each optimization pass and iteration that make progress</li>
    <li>vec4 - force vec4 mode in vertex shader</li>
+   <li>spill_fs - force spilling of all registers in the scalar backend (useful to debug spilling code)</li>
+   <li>spill_vec4 - force spilling of all registers in the vec4 backend (useful to debug spilling code)</li>
 </ul>
 </ul>
 

From 43ddec2f435b3e42a1c271ef485f6959bd59b148 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 10 Mar 2016 15:26:21 +0100
Subject: [PATCH 079/197] tgsi: Fix decl.Atomic and .Shared not propagating
 when parsing tgsi text

When support for decl.Atomic and .Shared was added, tgsi_build_declaration
was not updated to propagate these properly.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu> (v1)
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> (v2)
---
 src/gallium/auxiliary/tgsi/tgsi_build.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 638730acf95..f767b089133 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -127,6 +127,8 @@ tgsi_build_declaration(
    unsigned invariant,
    unsigned local,
    unsigned array,
+   unsigned atomic,
+   unsigned shared,
    struct tgsi_header *header )
 {
    struct tgsi_declaration declaration;
@@ -143,6 +145,8 @@ tgsi_build_declaration(
    declaration.Invariant = invariant;
    declaration.Local = local;
    declaration.Array = array;
+   declaration.Atomic = atomic;
+   declaration.Shared = shared;
    header_bodysize_grow( header );
 
    return declaration;
@@ -401,6 +405,8 @@ tgsi_build_full_declaration(
       full_decl->Declaration.Invariant,
       full_decl->Declaration.Local,
       full_decl->Declaration.Array,
+      full_decl->Declaration.Atomic,
+      full_decl->Declaration.Shared,
       header );
 
    if (maxsize <= size)

From 3788e1bf748eca3186377dfa60dbba1c37f8939e Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 10 Mar 2016 13:52:00 +0100
Subject: [PATCH 080/197] tgsi: Add support for global / private / input MEMORY

Extend the MEMORY file support to differentiate between global, private
and shared memory, as well as "input" memory.

"MEMORY[x], INPUT" is intended to access OpenCL kernel parameters. A
special memory type is added for this, since the actual storage of these
(e.g. UBO-s) may differ per implementation. The uploading of kernel
parameters is handled by launch_grid, "MEMORY[x], INPUT" allows drivers
to use an access mechanism for parameter reads which matches with the
upload method.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu> (v1)
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> (v2)
---
 src/gallium/auxiliary/tgsi/tgsi_build.c       |  8 +++---
 src/gallium/auxiliary/tgsi/tgsi_dump.c        |  9 +++++--
 src/gallium/auxiliary/tgsi/tgsi_text.c        | 14 +++++++++--
 src/gallium/auxiliary/tgsi/tgsi_ureg.c        | 25 +++++++++++--------
 src/gallium/auxiliary/tgsi/tgsi_ureg.h        |  2 +-
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     |  7 +++---
 src/gallium/include/pipe/p_shader_tokens.h    | 10 ++++++--
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp    |  2 +-
 8 files changed, 51 insertions(+), 26 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index f767b089133..7e30bb646e2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -111,7 +111,7 @@ tgsi_default_declaration( void )
    declaration.Local = 0;
    declaration.Array = 0;
    declaration.Atomic = 0;
-   declaration.Shared = 0;
+   declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
    declaration.Padding = 0;
 
    return declaration;
@@ -128,7 +128,7 @@ tgsi_build_declaration(
    unsigned local,
    unsigned array,
    unsigned atomic,
-   unsigned shared,
+   unsigned mem_type,
    struct tgsi_header *header )
 {
    struct tgsi_declaration declaration;
@@ -146,7 +146,7 @@ tgsi_build_declaration(
    declaration.Local = local;
    declaration.Array = array;
    declaration.Atomic = atomic;
-   declaration.Shared = shared;
+   declaration.MemType = mem_type;
    header_bodysize_grow( header );
 
    return declaration;
@@ -406,7 +406,7 @@ tgsi_build_full_declaration(
       full_decl->Declaration.Local,
       full_decl->Declaration.Array,
       full_decl->Declaration.Atomic,
-      full_decl->Declaration.Shared,
+      full_decl->Declaration.MemType,
       header );
 
    if (maxsize <= size)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index c8b91bba534..6d39ef21083 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -365,8 +365,13 @@ iter_declaration(
    }
 
    if (decl->Declaration.File == TGSI_FILE_MEMORY) {
-      if (decl->Declaration.Shared)
-         TXT(", SHARED");
+      switch (decl->Declaration.MemType) {
+      /* Note: ,GLOBAL is optional / the default */
+      case TGSI_MEMORY_TYPE_GLOBAL:  TXT(", GLOBAL");  break;
+      case TGSI_MEMORY_TYPE_SHARED:  TXT(", SHARED");  break;
+      case TGSI_MEMORY_TYPE_PRIVATE: TXT(", PRIVATE"); break;
+      case TGSI_MEMORY_TYPE_INPUT:   TXT(", INPUT");   break;
+      }
    }
 
    if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index 77598d2cb79..028633c9412 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -1390,8 +1390,18 @@ static boolean parse_declaration( struct translate_ctx *ctx )
             ctx->cur = cur;
          }
       } else if (file == TGSI_FILE_MEMORY) {
-         if (str_match_nocase_whole(&cur, "SHARED")) {
-            decl.Declaration.Shared = 1;
+         if (str_match_nocase_whole(&cur, "GLOBAL")) {
+            /* Note: this is a no-op, GLOBAL is the default */
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_GLOBAL;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "SHARED")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_SHARED;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "PRIVATE")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_PRIVATE;
+            ctx->cur = cur;
+         } else if (str_match_nocase_whole(&cur, "INPUT")) {
+            decl.Declaration.MemType = TGSI_MEMORY_TYPE_INPUT;
             ctx->cur = cur;
          }
       } else {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index 0dd5ea76f33..297e257b322 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -191,7 +191,7 @@ struct ureg_program
 
    struct ureg_tokens domain[2];
 
-   bool use_shared_memory;
+   bool use_memory[TGSI_MEMORY_TYPE_COUNT];
 };
 
 static union tgsi_any_token error_tokens[32];
@@ -730,13 +730,14 @@ struct ureg_src ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr,
    return reg;
 }
 
-/* Allocate a shared memory area.
+/* Allocate a memory area.
  */
-struct ureg_src ureg_DECL_shared_memory(struct ureg_program *ureg)
+struct ureg_src ureg_DECL_memory(struct ureg_program *ureg,
+                                 unsigned memory_type)
 {
-   struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, 0);
+   struct ureg_src reg = ureg_src_register(TGSI_FILE_MEMORY, memory_type);
 
-   ureg->use_shared_memory = true;
+   ureg->use_memory[memory_type] = true;
    return reg;
 }
 
@@ -1673,7 +1674,7 @@ emit_decl_buffer(struct ureg_program *ureg,
 }
 
 static void
-emit_decl_shared_memory(struct ureg_program *ureg)
+emit_decl_memory(struct ureg_program *ureg, unsigned memory_type)
 {
    union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 2);
 
@@ -1682,11 +1683,11 @@ emit_decl_shared_memory(struct ureg_program *ureg)
    out[0].decl.NrTokens = 2;
    out[0].decl.File = TGSI_FILE_MEMORY;
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW;
-   out[0].decl.Shared = true;
+   out[0].decl.MemType = memory_type;
 
    out[1].value = 0;
-   out[1].decl_range.First = 0;
-   out[1].decl_range.Last = 0;
+   out[1].decl_range.First = memory_type;
+   out[1].decl_range.Last = memory_type;
 }
 
 static void
@@ -1861,8 +1862,10 @@ static void emit_decls( struct ureg_program *ureg )
       emit_decl_buffer(ureg, ureg->buffer[i].index, ureg->buffer[i].atomic);
    }
 
-   if (ureg->use_shared_memory)
-      emit_decl_shared_memory(ureg);
+   for (i = 0; i < TGSI_MEMORY_TYPE_COUNT; i++) {
+      if (ureg->use_memory[i])
+         emit_decl_memory(ureg, i);
+   }
 
    if (ureg->const_decls.nr_constant_ranges) {
       for (i = 0; i < ureg->const_decls.nr_constant_ranges; i++) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 74324678a99..b4258fdbaa2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -340,7 +340,7 @@ struct ureg_src
 ureg_DECL_buffer(struct ureg_program *ureg, unsigned nr, bool atomic);
 
 struct ureg_src
-ureg_DECL_shared_memory(struct ureg_program *ureg);
+ureg_DECL_memory(struct ureg_program *ureg, unsigned memory_type);
 
 static inline struct ureg_src
 ureg_imm4f( struct ureg_program *ureg,
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 39cd986c23a..5d744aff338 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -864,7 +864,7 @@ public:
    std::vector<Resource> resources;
 
    struct MemoryFile {
-      bool shared;
+      uint8_t mem_type; // TGSI_MEMORY_TYPE_*
    };
    std::vector<MemoryFile> memoryFiles;
 
@@ -1222,7 +1222,7 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
       break;
    case TGSI_FILE_MEMORY:
       for (i = first; i <= last; ++i)
-         memoryFiles[i].shared = decl->Declaration.Shared;
+         memoryFiles[i].mem_type = decl->Declaration.MemType;
       break;
    case TGSI_FILE_NULL:
    case TGSI_FILE_TEMPORARY:
@@ -1527,7 +1527,8 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
 
    sym->reg.fileIndex = fileIdx;
 
-   if (tgsiFile == TGSI_FILE_MEMORY && code->memoryFiles[fileIdx].shared)
+   if (tgsiFile == TGSI_FILE_MEMORY &&
+       code->memoryFiles[fileIdx].mem_type == TGSI_MEMORY_TYPE_SHARED)
       sym->setFile(FILE_MEMORY_SHARED);
 
    if (idx >= 0) {
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 5c460276d73..e1979dd1ebd 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -117,6 +117,12 @@ enum tgsi_file_type {
 #define TGSI_CYLINDRICAL_WRAP_Z (1 << 2)
 #define TGSI_CYLINDRICAL_WRAP_W (1 << 3)
 
+#define TGSI_MEMORY_TYPE_GLOBAL        0 /* OpenCL global              */
+#define TGSI_MEMORY_TYPE_SHARED        1 /* OpenCL local / GLSL shared */
+#define TGSI_MEMORY_TYPE_PRIVATE       2 /* OpenCL private             */
+#define TGSI_MEMORY_TYPE_INPUT         3 /* OpenCL kernel input params */
+#define TGSI_MEMORY_TYPE_COUNT         4
+
 struct tgsi_declaration
 {
    unsigned Type        : 4;  /**< TGSI_TOKEN_TYPE_DECLARATION */
@@ -130,8 +136,8 @@ struct tgsi_declaration
    unsigned Local       : 1;  /**< optimize as subroutine local variable? */
    unsigned Array       : 1;  /**< extra array info? */
    unsigned Atomic      : 1;  /**< atomic only? for TGSI_FILE_BUFFER */
-   unsigned Shared      : 1;  /**< shared storage for TGSI_FILE_MEMORY */
-   unsigned Padding     : 4;
+   unsigned MemType     : 2;  /**< TGSI_MEMORY_TYPE_x for TGSI_FILE_MEMORY */
+   unsigned Padding     : 3;
 };
 
 struct tgsi_declaration_range
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index bdfd5ebb9f1..06b4bb41a9b 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6345,7 +6345,7 @@ st_translate_program(
    }
 
    if (program->use_shared_memory)
-      t->shared_memory = ureg_DECL_shared_memory(ureg);
+      t->shared_memory = ureg_DECL_memory(ureg, TGSI_MEMORY_TYPE_SHARED);
 
    for (i = 0; i < program->shader->NumImages; i++) {
       if (program->images_used & (1 << i)) {

From 54cdde5effd8540eb45b55c71ed34ff5c058a6f0 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 10 Mar 2016 16:02:06 +0100
Subject: [PATCH 081/197] nouveau: codegen: Add support for clover / OpenCL
 kernel input parameters

Add support for clover / OpenCL kernel input parameters.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu> (v1)
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> (v2)
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp      | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 5d744aff338..2a1047fcfee 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1527,9 +1527,21 @@ Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
 
    sym->reg.fileIndex = fileIdx;
 
-   if (tgsiFile == TGSI_FILE_MEMORY &&
-       code->memoryFiles[fileIdx].mem_type == TGSI_MEMORY_TYPE_SHARED)
-      sym->setFile(FILE_MEMORY_SHARED);
+   if (tgsiFile == TGSI_FILE_MEMORY) {
+      switch (code->memoryFiles[fileIdx].mem_type) {
+      case TGSI_MEMORY_TYPE_SHARED:
+         sym->setFile(FILE_MEMORY_SHARED);
+         break;
+      case TGSI_MEMORY_TYPE_INPUT:
+         assert(prog->getType() == Program::TYPE_COMPUTE);
+         assert(idx == -1);
+         sym->setFile(FILE_SHADER_INPUT);
+         address += info->prop.cp.inputOffset;
+         break;
+      default:
+         assert(0); /* TODO: Add support for global and private memory */
+      }
+   }
 
    if (idx >= 0) {
       if (sym->reg.file == FILE_SHADER_INPUT)

From c783ad0e24fc4b5c843d29eb85270f5e29b7c4d1 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 15 Mar 2016 13:48:30 +0100
Subject: [PATCH 082/197] nouveau: codegen: Slightly refactor
 Source::scanInstruction() dst handling

Use the dst temp variable which was used in the TGSI_FILE_OUTPUT
case everywhere. This makes the code somewhat easier to read
and helps avoid going over 80 chars with upcoming changes.

This also brings the dst handling more in line with the src
handling.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 2a1047fcfee..762f7ccbb16 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1261,9 +1261,9 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
       info->numBarriers = 1;
 
    if (insn.dstCount()) {
-      if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) {
-         Instruction::DstRegister dst = insn.getDst(0);
+      Instruction::DstRegister dst = insn.getDst(0);
 
+      if (dst.getFile() == TGSI_FILE_OUTPUT) {
          if (dst.isIndirect(0))
             for (unsigned i = 0; i < info->numOutputs; ++i)
                info->out[i].mask = 0xf;
@@ -1280,11 +1280,11 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
          if (isEdgeFlagPassthrough(insn))
             info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
       } else
-      if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
-         if (insn.getDst(0).isIndirect(0))
-            indirectTempArrays.insert(insn.getDst(0).getArrayId());
+      if (dst.getFile() == TGSI_FILE_TEMPORARY) {
+         if (dst.isIndirect(0))
+            indirectTempArrays.insert(dst.getArrayId());
       } else
-      if (insn.getDst(0).getFile() == TGSI_FILE_BUFFER) {
+      if (dst.getFile() == TGSI_FILE_BUFFER) {
          info->io.globalAccess |= 0x2;
       }
    }

From 71e315475c780311fea205ab8c9a898a67da683b Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 15 Mar 2016 14:37:27 +0100
Subject: [PATCH 083/197] nouveau: codegen: gk110: Make emitSTORE offset
 handling identical to emitLOAD

Make the store offset handling in CodeEmitterGK110::emitSTORE identical
to the load offset handling in CodeEmitterGK110::emitLOAD.

This is just a cleanup, it does not cause any functional changes.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 0d7d95e3105..70f3c3f69ff 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -1655,10 +1655,8 @@ CodeEmitterGK110::emitSTORE(const Instruction *i)
       break;
    }
 
-   if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
-      offset &= 0xffffff;
-
    if (code[0] & 0x2) {
+      offset &= 0xffffff;
       emitLoadStoreType(i->dType, 0x33);
       if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
          emitCachingMode(i->cache, 0x2f);

From 86e444036190fb6215e9365fe4126b420a15784a Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Wed, 16 Mar 2016 10:10:47 +0100
Subject: [PATCH 084/197] nouveau: codegen: Disable more old resource handling
 code

Commit c3083c7082 ("nv50/ir: add support for BUFFER accesses") disabled /
commented out some of the old resource handling code, but not all of it.

Effectively all of it is dead already: if we ever enter the old code
paths in handleLOAD / handleSTORE / handleATOM, we will get an exception
due to trying to access the now always zero-sized resources vector.

Disable all the dead code.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> (v2)
---
 .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 762f7ccbb16..dad0e2e6f69 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -856,12 +856,14 @@ public:
    };
    std::vector<TextureView> textureViews;
 
+   /*
    struct Resource {
       uint8_t target; // TGSI_TEXTURE_*
       bool raw;
       uint8_t slot; // $surface index
    };
    std::vector<Resource> resources;
+   */
 
    struct MemoryFile {
       uint8_t mem_type; // TGSI_MEMORY_TYPE_*
@@ -1419,8 +1421,8 @@ private:
    void handleLIT(Value *dst0[4]);
    void handleUserClipPlanes();
 
-   Symbol *getResourceBase(int r);
-   void getResourceCoords(std::vector<Value *>&, int r, int s);
+   // Symbol *getResourceBase(int r);
+   // void getResourceCoords(std::vector<Value *>&, int r, int s);
 
    void handleLOAD(Value *dst0[4]);
    void handleSTORE();
@@ -2149,6 +2151,7 @@ Converter::handleLIT(Value *dst0[4])
    }
 }
 
+/* Keep this around for now as reference when adding img support
 static inline bool
 isResourceSpecial(const int r)
 {
@@ -2245,6 +2248,7 @@ partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask)
    }
    return n + 1;
 }
+*/
 
 // For raw loads, granularity is 4 byte.
 // Usage of the texture read mask on OP_SULDP is not allowed.
@@ -2279,6 +2283,7 @@ Converter::handleLOAD(Value *dst0[4])
       return;
    }
 
+/* Keep this around for now as reference when adding img support
    getResourceCoords(off, r, 1);
 
    if (isResourceRaw(code, r)) {
@@ -2344,6 +2349,7 @@ Converter::handleLOAD(Value *dst0[4])
    FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
       if (dst0[c] != def[c])
          mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]);
+*/
 }
 
 // For formatted stores, the write mask on OP_SUSTP can be used.
@@ -2380,6 +2386,7 @@ Converter::handleSTORE()
       return;
    }
 
+/* Keep this around for now as reference when adding img support
    getResourceCoords(off, r, 0);
    src = off;
    const int s = src.size();
@@ -2427,6 +2434,7 @@ Converter::handleSTORE()
       mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0,
             dummy, src)->tex.mask = tgsi.getDst(0).getMask();
    }
+*/
 }
 
 // XXX: These only work on resources with the single-component u32/s32 formats.
@@ -2473,7 +2481,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
       return;
    }
 
-
+/* Keep this around for now as reference when adding img support
    getResourceCoords(srcv, r, 1);
 
    if (isResourceSpecial(r)) {
@@ -2501,6 +2509,7 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
    for (int c = 0; c < 4; ++c)
       if (dst0[c])
          dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
+*/
 }
 
 void

From 9b4c8f66294d6b8c3a935736b03a6e530b716f95 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 17 Mar 2016 10:00:59 +0100
Subject: [PATCH 085/197] nouveau: codegen: Do not silently fail in handeLOAD /
 handleSTORE / handleATOM

handleLOAD / handleSTORE / handleATOM can only handle TGSI_FILE_BUFFER
and TGSI_FILE_MEMORY. Make things fail explicitly when another
register file is used in these functions.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> (v2)
---
 .../nouveau/codegen/nv50_ir_from_tgsi.cpp     | 27 ++++++++++++-------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index dad0e2e6f69..12f2551ddf4 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -2259,8 +2259,9 @@ Converter::handleLOAD(Value *dst0[4])
    int c;
    std::vector<Value *> off, src, ldv, def;
 
-   if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getSrc(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
       for (c = 0; c < 4; ++c) {
          if (!dst0[c])
             continue;
@@ -2280,7 +2281,9 @@ Converter::handleLOAD(Value *dst0[4])
          if (tgsi.getSrc(0).isIndirect(0))
             ld->setIndirect(0, 1, fetchSrc(tgsi.getSrc(0).getIndirect(0), 0, 0));
       }
-      return;
+      break;
+   default:
+      assert(!"Unsupported srcFile for LOAD");
    }
 
 /* Keep this around for now as reference when adding img support
@@ -2361,8 +2364,9 @@ Converter::handleSTORE()
    int c;
    std::vector<Value *> off, src, dummy;
 
-   if (tgsi.getDst(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getDst(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getDst(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
       for (c = 0; c < 4; ++c) {
          if (!(tgsi.getDst(0).getMask() & (1 << c)))
             continue;
@@ -2383,7 +2387,9 @@ Converter::handleSTORE()
          if (tgsi.getDst(0).isIndirect(0))
             st->setIndirect(0, 1, fetchSrc(tgsi.getDst(0).getIndirect(0), 0, 0));
       }
-      return;
+      break;
+   default:
+      assert(!"Unsupported dstFile for STORE");
    }
 
 /* Keep this around for now as reference when adding img support
@@ -2449,8 +2455,9 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
    std::vector<Value *> defv;
    LValue *dst = getScratch();
 
-   if (tgsi.getSrc(0).getFile() == TGSI_FILE_BUFFER ||
-       tgsi.getSrc(0).getFile() == TGSI_FILE_MEMORY) {
+   switch (tgsi.getSrc(0).getFile()) {
+   case TGSI_FILE_BUFFER:
+   case TGSI_FILE_MEMORY:
       for (int c = 0; c < 4; ++c) {
          if (!dst0[c])
             continue;
@@ -2478,7 +2485,9 @@ Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
       for (int c = 0; c < 4; ++c)
          if (dst0[c])
             dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
-      return;
+      break;
+   default:
+      assert(!"Unsupported srcFile for ATOM");
    }
 
 /* Keep this around for now as reference when adding img support

From dcf8a4d2817c74f44c5a0d16012d656705202ec3 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 17 Mar 2016 10:04:15 +0100
Subject: [PATCH 086/197] gallium: Remove unused TGSI_RESOURCE_ defines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These magic file-index defines were only ever used in the nouveau code,
which no longer uses them.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com> (v2)
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v2)
---
 src/gallium/include/pipe/p_shader_tokens.h | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index e1979dd1ebd..5cc18a293d3 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -237,15 +237,6 @@ struct tgsi_declaration_array {
    unsigned Padding : 22;
 };
 
-/*
- * Special resources that don't need to be declared.  They map to the
- * GLOBAL/LOCAL/PRIVATE/INPUT compute memory spaces.
- */
-#define TGSI_RESOURCE_GLOBAL	0x7fff
-#define TGSI_RESOURCE_LOCAL	0x7ffe
-#define TGSI_RESOURCE_PRIVATE	0x7ffd
-#define TGSI_RESOURCE_INPUT	0x7ffc
-
 #define TGSI_IMM_FLOAT32   0
 #define TGSI_IMM_UINT32    1
 #define TGSI_IMM_INT32     2

From a8b315b8271e867db30650dedb52e53d8dd9667c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Fri, 18 Mar 2016 17:16:39 -0500
Subject: [PATCH 087/197] st/mesa: use the texture view's format for
 render-to-texture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aside from the bug below, it fixes a simplistic test I've written locally,
and I see no regression in Piglit for radeonsi.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94595
Cc: "11.0 11.1 11.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_cb_fbo.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 82ab914503b..ff570e0e444 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -387,6 +387,7 @@ st_update_renderbuffer_surface(struct st_context *st,
 {
    struct pipe_context *pipe = st->pipe;
    struct pipe_resource *resource = strb->texture;
+   struct st_texture_object *stTexObj = NULL;
    unsigned rtt_width = strb->Base.Width;
    unsigned rtt_height = strb->Base.Height;
    unsigned rtt_depth = strb->Base.Depth;
@@ -398,9 +399,18 @@ st_update_renderbuffer_surface(struct st_context *st,
     */
    boolean enable_srgb = (st->ctx->Color.sRGBEnabled &&
          _mesa_get_format_color_encoding(strb->Base.Format) == GL_SRGB);
-   enum pipe_format format = (enable_srgb) ?
-      util_format_srgb(resource->format) :
-      util_format_linear(resource->format);
+   enum pipe_format format = resource->format;
+
+   if (strb->is_rtt) {
+      stTexObj = st_texture_object(strb->Base.TexImage->TexObject);
+      if (stTexObj->surface_based)
+         format = stTexObj->surface_format;
+   }
+
+   format = (enable_srgb) ?
+      util_format_srgb(format) :
+      util_format_linear(format);
+
    unsigned first_layer, last_layer, level;
 
    if (resource->target == PIPE_TEXTURE_1D_ARRAY) {
@@ -431,8 +441,8 @@ st_update_renderbuffer_surface(struct st_context *st,
 
    /* Adjust for texture views */
    if (strb->is_rtt && resource->array_size > 1 &&
-       strb->Base.TexImage->TexObject->Immutable) {
-      struct gl_texture_object *tex = strb->Base.TexImage->TexObject;
+       stTexObj->base.Immutable) {
+      struct gl_texture_object *tex = &stTexObj->base;
       first_layer += tex->MinLayer;
       if (!strb->rtt_layered)
          last_layer += tex->MinLayer;
@@ -492,8 +502,6 @@ st_render_texture(struct gl_context *ctx,
 
    st_update_renderbuffer_surface(st, strb);
 
-   strb->Base.Format = st_pipe_format_to_mesa_format(pt->format);
-
    /* Invalidate buffer state so that the pipe's framebuffer state
     * gets updated.
     * That's where the new renderbuffer (which we just created) gets

From 63e020d734faa224dae576e2883ef39d8827fcad Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 12:11:39 -0600
Subject: [PATCH 088/197] gallium/tgsi: pass TGSI tex target to
 tgsi_transform_tex_inst()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of hard-coded 2D tex target in tgsi_transform_tex_2d_inst()

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/auxiliary/draw/draw_pipe_aaline.c   | 10 +++++-----
 src/gallium/auxiliary/tgsi/tgsi_transform.h     | 17 ++++++++++-------
 src/gallium/auxiliary/util/u_pstipple.c         | 10 +++++-----
 src/mesa/state_tracker/st_cb_bitmap_shader.c    |  8 ++++----
 .../state_tracker/st_cb_drawpixels_shader.c     |  6 +++---
 5 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index e85ae16c1df..cd9ee5434d3 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -264,11 +264,11 @@ aa_transform_epilog(struct tgsi_transform_context *ctx)
    if (aactx->colorOutput != -1) {
       /* insert texture sampling code for antialiasing. */
 
-      /* TEX texTemp, input_coord, sampler */
-      tgsi_transform_tex_2d_inst(ctx,
-                                 TGSI_FILE_TEMPORARY, aactx->texTemp,
-                                 TGSI_FILE_INPUT, aactx->maxInput + 1,
-                                 aactx->freeSampler);
+      /* TEX texTemp, input_coord, sampler, 2D */
+      tgsi_transform_tex_inst(ctx,
+                              TGSI_FILE_TEMPORARY, aactx->texTemp,
+                              TGSI_FILE_INPUT, aactx->maxInput + 1,
+                              TGSI_TEXTURE_2D, aactx->freeSampler);
 
       /* MOV rgb */
       tgsi_transform_op1_inst(ctx, TGSI_OPCODE_MOV,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 4dd7dda25fd..c21ff959cbf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -516,15 +516,18 @@ tgsi_transform_kill_inst(struct tgsi_transform_context *ctx,
 
 
 static inline void
-tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
-                           unsigned dst_file,
-                           unsigned dst_index,
-                           unsigned src_file,
-                           unsigned src_index,
-                           unsigned sampler_index)
+tgsi_transform_tex_inst(struct tgsi_transform_context *ctx,
+                        unsigned dst_file,
+                        unsigned dst_index,
+                        unsigned src_file,
+                        unsigned src_index,
+                        unsigned tex_target,
+                        unsigned sampler_index)
 {
    struct tgsi_full_instruction inst;
 
+   assert(tex_target < TGSI_TEXTURE_COUNT);
+
    inst = tgsi_default_full_instruction();
    inst.Instruction.Opcode = TGSI_OPCODE_TEX;
    inst.Instruction.NumDstRegs = 1;
@@ -532,7 +535,7 @@ tgsi_transform_tex_2d_inst(struct tgsi_transform_context *ctx,
    inst.Dst[0].Register.Index = dst_index;
    inst.Instruction.NumSrcRegs = 2;
    inst.Instruction.Texture = TRUE;
-   inst.Texture.Texture = TGSI_TEXTURE_2D;
+   inst.Texture.Texture = tex_target;
    inst.Src[0].Register.File = src_file;
    inst.Src[0].Register.Index = src_index;
    inst.Src[1].Register.File = TGSI_FILE_SAMPLER;
diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 74e6f99da67..bcbe2a25b25 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -344,11 +344,11 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
                            pctx->wincoordFile, wincoordInput,
                            TGSI_FILE_IMMEDIATE, pctx->numImmed);
 
-   /* TEX texTemp, texTemp, sampler; */
-   tgsi_transform_tex_2d_inst(ctx,
-                              TGSI_FILE_TEMPORARY, texTemp,
-                              TGSI_FILE_TEMPORARY, texTemp,
-                              sampIdx);
+   /* TEX texTemp, texTemp, sampler, 2D; */
+   tgsi_transform_tex_inst(ctx,
+                           TGSI_FILE_TEMPORARY, texTemp,
+                           TGSI_FILE_TEMPORARY, texTemp,
+                           TGSI_TEXTURE_2D, sampIdx);
 
    /* KILL_IF -texTemp;   # if -texTemp < 0, kill fragment */
    tgsi_transform_kill_inst(ctx,
diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c
index 88779bc627d..42aa0337af9 100644
--- a/src/mesa/state_tracker/st_cb_bitmap_shader.c
+++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c
@@ -89,10 +89,10 @@ transform_instr(struct tgsi_transform_context *tctx,
    tgsi_transform_sampler_decl(tctx, ctx->sampler_index);
 
    /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
-   tgsi_transform_tex_2d_inst(tctx,
-                              TGSI_FILE_TEMPORARY, 0,
-                              TGSI_FILE_INPUT, texcoord_index,
-                              ctx->sampler_index);
+   tgsi_transform_tex_inst(tctx,
+                           TGSI_FILE_TEMPORARY, 0,
+                           TGSI_FILE_INPUT, texcoord_index,
+                           TGSI_TEXTURE_2D, ctx->sampler_index);
 
    /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
    inst = tgsi_default_full_instruction();
diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
index 2cf75f8bd77..2170850151d 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
@@ -129,9 +129,9 @@ transform_instr(struct tgsi_transform_context *tctx,
    /* Get initial pixel color from the texture.
     * TEX temp, fragment.texcoord[0], texture[0], 2D;
     */
-   tgsi_transform_tex_2d_inst(tctx, TGSI_FILE_TEMPORARY, ctx->color_temp,
-                              TGSI_FILE_INPUT, texcoord_index,
-                              ctx->drawpix_sampler);
+   tgsi_transform_tex_inst(tctx, TGSI_FILE_TEMPORARY, ctx->color_temp,
+                           TGSI_FILE_INPUT, texcoord_index,
+                           TGSI_TEXTURE_2D, ctx->drawpix_sampler);
 
    /* Apply the scale and bias. */
    if (ctx->scale_and_bias) {

From 83b5b3d66ecb502e69c0f8a09b1673e94dcac1bc Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 12:16:50 -0600
Subject: [PATCH 089/197] st/mesa: use correct TGSI texture target in bitmap
 fragment shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Depending on the driver's support for NPOT textures, we might use
a RECT texture instead of a 2D texture.  We should propagate that info
to the fragment shader's TEX instruction.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_cb_bitmap.h        |  2 +-
 src/mesa/state_tracker/st_cb_bitmap_shader.c | 11 +++++++++--
 src/mesa/state_tracker/st_program.c          |  1 +
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_bitmap.h b/src/mesa/state_tracker/st_cb_bitmap.h
index 4d1ae222b81..323158ea11d 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.h
+++ b/src/mesa/state_tracker/st_cb_bitmap.h
@@ -49,7 +49,7 @@ st_flush_bitmap_cache(struct st_context *st);
 
 extern const struct tgsi_token *
 st_get_bitmap_shader(const struct tgsi_token *tokens,
-                     unsigned sampler_index,
+                     unsigned tex_target, unsigned sampler_index,
                      bool use_texcoord, bool swizzle_xxxx);
 
 #endif /* ST_CB_BITMAP_H */
diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c
index 42aa0337af9..a0b9be33415 100644
--- a/src/mesa/state_tracker/st_cb_bitmap_shader.c
+++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c
@@ -36,6 +36,7 @@ struct tgsi_bitmap_transform {
    struct tgsi_transform_context base;
    struct tgsi_shader_info info;
    unsigned sampler_index;
+   unsigned tex_target;
    bool use_texcoord;
    bool swizzle_xxxx;
    bool first_instruction_emitted;
@@ -53,6 +54,8 @@ transform_instr(struct tgsi_transform_context *tctx,
 {
    struct tgsi_bitmap_transform *ctx = tgsi_bitmap_transform(tctx);
    struct tgsi_full_instruction inst;
+   unsigned tgsi_tex_target = ctx->tex_target == PIPE_TEXTURE_2D
+      ? TGSI_TEXTURE_2D : TGSI_TEXTURE_RECT;
    unsigned i, semantic;
    int texcoord_index = -1;
 
@@ -92,7 +95,7 @@ transform_instr(struct tgsi_transform_context *tctx,
    tgsi_transform_tex_inst(tctx,
                            TGSI_FILE_TEMPORARY, 0,
                            TGSI_FILE_INPUT, texcoord_index,
-                           TGSI_TEXTURE_2D, ctx->sampler_index);
+                           tgsi_tex_target, ctx->sampler_index);
 
    /* KIL if -tmp0 < 0 # texel=0 -> keep / texel=0 -> discard */
    inst = tgsi_default_full_instruction();
@@ -121,15 +124,19 @@ transform_instr(struct tgsi_transform_context *tctx,
 
 const struct tgsi_token *
 st_get_bitmap_shader(const struct tgsi_token *tokens,
-                     unsigned sampler_index,
+                     unsigned tex_target, unsigned sampler_index,
                      bool use_texcoord, bool swizzle_xxxx)
 {
    struct tgsi_bitmap_transform ctx;
    struct tgsi_token *newtoks;
    int newlen;
 
+   assert(tex_target == PIPE_TEXTURE_2D ||
+          tex_target == PIPE_TEXTURE_RECT);
+
    memset(&ctx, 0, sizeof(ctx));
    ctx.base.transform_instruction = transform_instr;
+   ctx.tex_target = tex_target;
    ctx.sampler_index = sampler_index;
    ctx.use_texcoord = use_texcoord;
    ctx.swizzle_xxxx = swizzle_xxxx;
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index c9f390aa9a2..d4ff845a023 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -871,6 +871,7 @@ st_create_fp_variant(struct st_context *st,
       variant->bitmap_sampler = ffs(~stfp->Base.Base.SamplersUsed) - 1;
 
       tokens = st_get_bitmap_shader(tgsi.tokens,
+                                    st->internal_target,
                                     variant->bitmap_sampler,
                                     st->needs_texcoord_semantic,
                                     st->bitmap.tex_format ==

From eda81fa3571dac4b54c52134e9d198655f4561b0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 12:20:10 -0600
Subject: [PATCH 090/197] st/mesa: use correct TGSI texture target in drawpix
 fragment shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_cb_drawpixels.h        |  2 +-
 src/mesa/state_tracker/st_cb_drawpixels_shader.c | 11 +++++++++--
 src/mesa/state_tracker/st_program.c              |  2 +-
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.h b/src/mesa/state_tracker/st_cb_drawpixels.h
index f1fb32dd6cf..24526d55402 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.h
+++ b/src/mesa/state_tracker/st_cb_drawpixels.h
@@ -46,6 +46,6 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
                       bool scale_and_bias, unsigned scale_const,
                       unsigned bias_const, bool pixel_maps,
                       unsigned drawpix_sampler, unsigned pixelmap_sampler,
-                      unsigned texcoord_const);
+                      unsigned texcoord_const, unsigned tex_target);
 
 #endif /* ST_CB_DRAWPIXELS_H */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
index 2170850151d..5a620f73e08 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
@@ -43,6 +43,7 @@ struct tgsi_drawpix_transform {
    unsigned drawpix_sampler;
    unsigned pixelmap_sampler;
    unsigned texcoord_const;
+   unsigned tex_target;
 };
 
 static inline struct tgsi_drawpix_transform *
@@ -72,6 +73,8 @@ transform_instr(struct tgsi_transform_context *tctx,
 		struct tgsi_full_instruction *current_inst)
 {
    struct tgsi_drawpix_transform *ctx = tgsi_drawpix_transform(tctx);
+   const unsigned tgsi_tex_target = ctx->tex_target == PIPE_TEXTURE_2D
+      ? TGSI_TEXTURE_2D : TGSI_TEXTURE_RECT;
    unsigned i, sem_texcoord = ctx->use_texcoord ? TGSI_SEMANTIC_TEXCOORD :
                                                   TGSI_SEMANTIC_GENERIC;
    int texcoord_index = -1;
@@ -131,7 +134,7 @@ transform_instr(struct tgsi_transform_context *tctx,
     */
    tgsi_transform_tex_inst(tctx, TGSI_FILE_TEMPORARY, ctx->color_temp,
                            TGSI_FILE_INPUT, texcoord_index,
-                           TGSI_TEXTURE_2D, ctx->drawpix_sampler);
+                           tgsi_tex_target, ctx->drawpix_sampler);
 
    /* Apply the scale and bias. */
    if (ctx->scale_and_bias) {
@@ -204,12 +207,15 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
                       bool scale_and_bias, unsigned scale_const,
                       unsigned bias_const, bool pixel_maps,
                       unsigned drawpix_sampler, unsigned pixelmap_sampler,
-                      unsigned texcoord_const)
+                      unsigned texcoord_const, unsigned tex_target)
 {
    struct tgsi_drawpix_transform ctx;
    struct tgsi_token *newtoks;
    int newlen;
 
+   assert(tex_target == PIPE_TEXTURE_2D ||
+          tex_target == PIPE_TEXTURE_RECT);
+
    memset(&ctx, 0, sizeof(ctx));
    ctx.base.transform_instruction = transform_instr;
    ctx.use_texcoord = use_texcoord;
@@ -220,6 +226,7 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
    ctx.drawpix_sampler = drawpix_sampler;
    ctx.pixelmap_sampler = pixelmap_sampler;
    ctx.texcoord_const = texcoord_const;
+   ctx.tex_target = tex_target;
    tgsi_scan_shader(tokens, &ctx.info);
 
    newlen = tgsi_num_tokens(tokens) + 30;
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index d4ff845a023..80dcfd82743 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -924,7 +924,7 @@ st_create_fp_variant(struct st_context *st,
                                      bias_const, key->pixelMaps,
                                      variant->drawpix_sampler,
                                      variant->pixelmap_sampler,
-                                     texcoord_const);
+                                     texcoord_const, st->internal_target);
 
       if (tokens) {
          if (tgsi.tokens != stfp->tgsi.tokens)

From 72eb5a3cfe29eabdd0a2386642b36f9648a911fc Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 15:58:37 -0600
Subject: [PATCH 091/197] st/mesa: emit sampler view declarations for ARB
 vert/frag programs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_mesa_to_tgsi.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 8a12ce4c685..7a686b199d5 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -296,6 +296,19 @@ st_translate_texture_target(GLuint textarget, GLboolean shadow)
 }
 
 
+/**
+ * Translate a (1 << TEXTURE_x_INDEX) bit into a TGSI_TEXTURE_x enum.
+ */
+static unsigned
+translate_texture_index(GLbitfield texBit, bool shadow)
+{
+   int index = ffs(texBit);
+   assert(index > 0);
+   assert(index - 1 < NUM_TEXTURE_TARGETS);
+   return st_translate_texture_target(index - 1, shadow);
+}
+
+
 /**
  * Create a TGSI ureg_dst register from a Mesa dest register.
  */
@@ -1147,7 +1160,16 @@ st_translate_mesa_program(
    /* texture samplers */
    for (i = 0; i < ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; i++) {
       if (program->SamplersUsed & (1 << i)) {
+         unsigned target =
+            translate_texture_index(program->TexturesUsed[i],
+                                    !!(program->ShadowSamplers & (1 << i)));
          t->samplers[i] = ureg_DECL_sampler( ureg, i );
+         ureg_DECL_sampler_view(ureg, i, target,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT);
+
       }
    }
 

From 0f0a23d4d874a2b51a07741c11fb81f246e5298c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 18:43:00 -0600
Subject: [PATCH 092/197] st/mesa: emit sampler view declaration in bitmap
 shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In June 2015, Rob Clark started updating the tgsi utility code to emit
SVIEW declarations in various shaders (for polygon stipple, blitting,
etc).  These patches do the same for the Mesa state tracker.

The VMware driver will use this.

v2: support both TGSI_TEXTURE_2D and _RECT

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_cb_bitmap_shader.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/state_tracker/st_cb_bitmap_shader.c b/src/mesa/state_tracker/st_cb_bitmap_shader.c
index a0b9be33415..7ce078d5008 100644
--- a/src/mesa/state_tracker/st_cb_bitmap_shader.c
+++ b/src/mesa/state_tracker/st_cb_bitmap_shader.c
@@ -91,6 +91,10 @@ transform_instr(struct tgsi_transform_context *tctx,
    /* Declare the sampler. */
    tgsi_transform_sampler_decl(tctx, ctx->sampler_index);
 
+   /* Declare the sampler view. */
+   tgsi_transform_sampler_view_decl(tctx, ctx->sampler_index,
+                                    tgsi_tex_target, TGSI_RETURN_TYPE_FLOAT);
+
    /* TEX tmp0, fragment.texcoord[0], texture[0], 2D; */
    tgsi_transform_tex_inst(tctx,
                            TGSI_FILE_TEMPORARY, 0,

From b3daaefadb05cdde439198a2159daf895dc7c475 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 16 Mar 2016 18:43:00 -0600
Subject: [PATCH 093/197] st/mesa: emit sampler view decls in drawpixels code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: support both TGSI_TEXTURE_2D and _RECT

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/mesa/state_tracker/st_cb_drawpixels.c        | 10 ++++++++++
 src/mesa/state_tracker/st_cb_drawpixels_shader.c | 10 +++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 51d4ae51918..09f4d8e00d1 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -142,11 +142,21 @@ get_drawpix_z_stencil_program(struct st_context *st,
       out_color = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
 
       depth_sampler = ureg_DECL_sampler(ureg, 0);
+      ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+                             TGSI_RETURN_TYPE_FLOAT,
+                             TGSI_RETURN_TYPE_FLOAT,
+                             TGSI_RETURN_TYPE_FLOAT,
+                             TGSI_RETURN_TYPE_FLOAT);
       out_depth = ureg_DECL_output(ureg, TGSI_SEMANTIC_POSITION, 0);
    }
 
    if (write_stencil) {
       stencil_sampler = ureg_DECL_sampler(ureg, 1);
+      ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+                             TGSI_RETURN_TYPE_UINT,
+                             TGSI_RETURN_TYPE_UINT,
+                             TGSI_RETURN_TYPE_UINT,
+                             TGSI_RETURN_TYPE_UINT);
       out_stencil = ureg_DECL_output(ureg, TGSI_SEMANTIC_STENCIL, 0);
    }
 
diff --git a/src/mesa/state_tracker/st_cb_drawpixels_shader.c b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
index 5a620f73e08..35a9da0643d 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels_shader.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels_shader.c
@@ -121,12 +121,20 @@ transform_instr(struct tgsi_transform_context *tctx,
    /* Declare the drawpix sampler if it's missing. */
    if (!(ctx->info.samplers_declared & (1 << ctx->drawpix_sampler))) {
       tgsi_transform_sampler_decl(tctx, ctx->drawpix_sampler);
+
+      /* emit sampler view declaration */
+      tgsi_transform_sampler_view_decl(tctx, ctx->drawpix_sampler,
+                                       tgsi_tex_target, TGSI_RETURN_TYPE_FLOAT);
    }
 
    /* Declare the pixel map sampler if it's missing. */
    if (ctx->pixel_maps &&
        !(ctx->info.samplers_declared & (1 << ctx->pixelmap_sampler))) {
       tgsi_transform_sampler_decl(tctx, ctx->pixelmap_sampler);
+
+      /* emit sampler view declaration */
+      tgsi_transform_sampler_view_decl(tctx, ctx->pixelmap_sampler,
+                                       TGSI_TEXTURE_2D, TGSI_RETURN_TYPE_FLOAT);
    }
 
    /* Get initial pixel color from the texture.
@@ -229,7 +237,7 @@ st_get_drawpix_shader(const struct tgsi_token *tokens, bool use_texcoord,
    ctx.tex_target = tex_target;
    tgsi_scan_shader(tokens, &ctx.info);
 
-   newlen = tgsi_num_tokens(tokens) + 30;
+   newlen = tgsi_num_tokens(tokens) + 60;
    newtoks = tgsi_alloc_tokens(newlen);
    if (!newtoks)
       return NULL;

From 5a9f2a2d8957676cdb4843dc9026639381495ae8 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 14:07:47 -0600
Subject: [PATCH 094/197] hud: add sampler view declaration in text fragment
 shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/auxiliary/hud/hud_context.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index fb998349a35..4673458171e 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -1191,6 +1191,7 @@ hud_create(struct pipe_context *pipe, struct cso_context *cso)
          "FRAG\n"
          "DCL IN[0], GENERIC[0], LINEAR\n"
          "DCL SAMP[0]\n"
+         "DCL SVIEW[0], RECT, FLOAT\n"
          "DCL OUT[0], COLOR[0]\n"
          "DCL TEMP[0]\n"
 

From e7b5a844e3b69e709c702cf012f2db5dc1f0ca50 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 14:20:01 -0600
Subject: [PATCH 095/197] postprocess: declare sampler views in shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/auxiliary/postprocess/pp_colors.h | 3 +++
 src/gallium/auxiliary/postprocess/pp_mlaa.h   | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/src/gallium/auxiliary/postprocess/pp_colors.h b/src/gallium/auxiliary/postprocess/pp_colors.h
index a79858ef53c..76c4ab49b5c 100644
--- a/src/gallium/auxiliary/postprocess/pp_colors.h
+++ b/src/gallium/auxiliary/postprocess/pp_colors.h
@@ -33,6 +33,7 @@ static const char nored[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0]\n"
    "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
    "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -46,6 +47,7 @@ static const char nogreen[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0]\n"
    "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
    "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
@@ -59,6 +61,7 @@ static const char noblue[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0]\n"
    "IMM FLT32 {    0.0000,     0.0000,     0.0000,     0.0000}\n"
    "  0: TEX TEMP[0], IN[0].xyyy, SAMP[0], 2D\n"
diff --git a/src/gallium/auxiliary/postprocess/pp_mlaa.h b/src/gallium/auxiliary/postprocess/pp_mlaa.h
index 93a8a8afa90..0b2c363e1c4 100644
--- a/src/gallium/auxiliary/postprocess/pp_mlaa.h
+++ b/src/gallium/auxiliary/postprocess/pp_mlaa.h
@@ -50,6 +50,7 @@ static const char depth1fs[] = "FRAG\n"
    "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0..2]\n"
    "IMM FLT32 {    0.0030,     0.0000,     1.0000,     0.0000}\n"
    "  0: TEX TEMP[0].x, IN[1].xyyy, SAMP[0], 2D\n"
@@ -80,6 +81,7 @@ static const char color1fs[] = "FRAG\n"
    "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL TEMP[0..2]\n"
    "IMM FLT32 {    0.2126,     0.7152,     0.0722,     0.1000}\n"
    "IMM FLT32 {    1.0000,     0.0000,     0.0000,     0.0000}\n"
@@ -112,6 +114,7 @@ static const char neigh3fs[] = "FRAG\n"
    "DCL IN[2], GENERIC[11], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL SAMP[1]\n"
    "DCL TEMP[0..8]\n"
    "IMM FLT32 {    1.0000,     0.00001,     0.0000,     0.0000}\n"
@@ -175,8 +178,11 @@ static const char blend2fs_1[] = "FRAG\n"
    "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
    "DCL OUT[0], COLOR\n"
    "DCL SAMP[0]\n"
+   "DCL SVIEW[0], 2D, FLOAT\n"
    "DCL SAMP[1]\n"
+   "DCL SVIEW[1], 2D, FLOAT\n"
    "DCL SAMP[2]\n"
+   "DCL SVIEW[2], 2D, FLOAT\n"
    "DCL CONST[0]\n"
    "DCL TEMP[0..6]\n"
    "IMM FLT32 {    0.0000,    -0.2500,     0.00609756,     0.5000}\n"

From 38e831ca3dc896aa8f64870ee38f67e554a59c6f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 14:20:27 -0600
Subject: [PATCH 096/197] gallium/util: declare sampler view in
 util_make_fs_blit_msaa_depthstencil()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/auxiliary/util/u_simple_shaders.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index 7ffb2712472..76950a1a9cf 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -646,6 +646,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
          "FRAG\n"
          "DCL IN[0], GENERIC[0], LINEAR\n"
          "DCL SAMP[0..1]\n"
+         "DCL SVIEW[0..1], %s, FLOAT\n"
          "DCL OUT[0], POSITION\n"
          "DCL OUT[1], STENCIL\n"
          "DCL TEMP[0]\n"
@@ -663,7 +664,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
    assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA ||
           tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA);
 
-   sprintf(text, shader_templ, type, type);
+   sprintf(text, shader_templ, type, type, type);
 
    if (!tgsi_text_translate(text, tokens, Elements(tokens))) {
       assert(0);

From b56b853ab3937d6144597f490bb38e2532d0cee2 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 14:20:44 -0600
Subject: [PATCH 097/197] gallium/tests: declare sampler views in shaders
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/tests/graw/quad-tex.c    | 1 +
 src/gallium/tests/graw/tex-srgb.c    | 1 +
 src/gallium/tests/graw/tex-swizzle.c | 1 +
 3 files changed, 3 insertions(+)

diff --git a/src/gallium/tests/graw/quad-tex.c b/src/gallium/tests/graw/quad-tex.c
index 5f90166830f..8a9d1b80f1a 100644
--- a/src/gallium/tests/graw/quad-tex.c
+++ b/src/gallium/tests/graw/quad-tex.c
@@ -92,6 +92,7 @@ static void set_fragment_shader( void )
       "DCL OUT[0], COLOR\n"
       "DCL TEMP[0]\n"
       "DCL SAMP[0]\n"
+      "DCL SVIEW[0], 2D, FLOAT\n"
       "  0: TXP TEMP[0], IN[0], SAMP[0], 2D\n"
       "  1: MOV OUT[0], TEMP[0]\n"
       "  2: END\n";
diff --git a/src/gallium/tests/graw/tex-srgb.c b/src/gallium/tests/graw/tex-srgb.c
index af989d72a2e..3b43bcbf73e 100644
--- a/src/gallium/tests/graw/tex-srgb.c
+++ b/src/gallium/tests/graw/tex-srgb.c
@@ -108,6 +108,7 @@ static void set_fragment_shader( void )
       "DCL OUT[0], COLOR\n"
       "DCL TEMP[0]\n"
       "DCL SAMP[0]\n"
+      "DCL SVIEW[0], 2D, FLOAT\n"
       "  0: TXP TEMP[0], IN[0], SAMP[0], 2D\n"
       "  1: MOV OUT[0], TEMP[0]\n"
       "  2: END\n";
diff --git a/src/gallium/tests/graw/tex-swizzle.c b/src/gallium/tests/graw/tex-swizzle.c
index e45b848b48e..8b472c9364c 100644
--- a/src/gallium/tests/graw/tex-swizzle.c
+++ b/src/gallium/tests/graw/tex-swizzle.c
@@ -89,6 +89,7 @@ static void set_fragment_shader(void)
       "DCL IN[0], GENERIC[0], PERSPECTIVE\n"
       "DCL OUT[0], COLOR\n"
       "DCL SAMP[0]\n"
+      "DCL SVIEW[0], 2D, FLOAT\n"
       "  0: TXP OUT[0], IN[0], SAMP[0], 2D\n"
       "  2: END\n";
 

From dc9ecf58c0c5c8a97cd41362e78c2fcd9f6e3b80 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 18 Mar 2016 09:55:57 -0600
Subject: [PATCH 098/197] svga: use shader sampler view declarations

Previously, we looked at the bound textures (via the pipe_sampler_views)
to determine texture dimensions (1D/2D/3D/etc) and datatype (float vs.
int).  But this could fail in out-of-memory conditions.  If we failed to
allocate a texture and didn't create a pipe_sampler_view, we'd default
to using 0 (PIPE_BUFFER) as the texture type.  This led to device errors
because of inconsistent shader code.

This change relies on all TGSI shaders having an SVIEW declaration for
each SAMP declaration.  The previous patch series does that.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/drivers/svga/svga_shader.c        | 16 ++--
 src/gallium/drivers/svga/svga_shader.h        |  3 +-
 .../drivers/svga/svga_tgsi_decl_sm30.c        | 20 +++--
 src/gallium/drivers/svga/svga_tgsi_emit.h     |  2 +
 src/gallium/drivers/svga/svga_tgsi_insn.c     |  2 +-
 src/gallium/drivers/svga/svga_tgsi_vgpu10.c   | 78 ++++++++++++-------
 6 files changed, 74 insertions(+), 47 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c
index 5c99e16d976..78eb3f65b61 100644
--- a/src/gallium/drivers/svga/svga_shader.c
+++ b/src/gallium/drivers/svga/svga_shader.c
@@ -180,18 +180,18 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
          assert(view->texture);
          assert(view->texture->target < (1 << 4)); /* texture_target:4 */
 
-         key->tex[i].texture_target = view->texture->target;
-
          /* 1D/2D array textures with one slice are treated as non-arrays
           * by the SVGA3D device.  Convert the texture type here so that
           * we emit the right TEX/SAMPLE instruction in the shader.
           */
-         if (view->texture->array_size == 1) {
-            if (view->texture->target == PIPE_TEXTURE_1D_ARRAY) {
-               key->tex[i].texture_target = PIPE_TEXTURE_1D;
+         if (view->texture->target == PIPE_TEXTURE_1D_ARRAY ||
+             view->texture->target == PIPE_TEXTURE_2D_ARRAY) {
+            if (view->texture->array_size == 1) {
+               key->tex[i].is_array = 0;
             }
-            else if (view->texture->target == PIPE_TEXTURE_2D_ARRAY) {
-               key->tex[i].texture_target = PIPE_TEXTURE_2D;
+            else {
+               assert(view->texture->array_size > 1);
+               key->tex[i].is_array = 1;
             }
          }
 
@@ -207,8 +207,6 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
          key->tex[i].swizzle_g = view->swizzle_g;
          key->tex[i].swizzle_b = view->swizzle_b;
          key->tex[i].swizzle_a = view->swizzle_a;
-
-         key->tex[i].return_type = svga_get_texture_datatype(view->format);
       }
    }
    key->num_textures = svga->curr.num_sampler_views[shader];
diff --git a/src/gallium/drivers/svga/svga_shader.h b/src/gallium/drivers/svga/svga_shader.h
index f49fdb46d0e..3f915740b1f 100644
--- a/src/gallium/drivers/svga/svga_shader.h
+++ b/src/gallium/drivers/svga/svga_shader.h
@@ -98,14 +98,13 @@ struct svga_compile_key
       unsigned compare_func:3;
       unsigned unnormalized:1;
       unsigned width_height_idx:5; /**< texture unit */
-      unsigned texture_target:4;   /**< PIPE_TEXTURE_x */
+      unsigned is_array:1;
       unsigned texture_msaa:1;    /**< A multisample texture? */
       unsigned sprite_texgen:1;
       unsigned swizzle_r:3;
       unsigned swizzle_g:3;
       unsigned swizzle_b:3;
       unsigned swizzle_a:3;
-      unsigned return_type:3;  /**< TGSI_RETURN_TYPE_x */
    } tex[PIPE_MAX_SAMPLERS];
    /* Note: svga_compile_keys_equal() depends on the variable-size
     * tex[] array being at the end of this structure.
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index ca4009b9e38..204b814a964 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -517,15 +517,15 @@ vs30_output(struct svga_shader_emitter *emit,
 static ubyte
 svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx)
 {
-   switch (emit->key.tex[idx].texture_target) {
-   case PIPE_TEXTURE_1D:
+   switch (emit->sampler_target[idx]) {
+   case TGSI_TEXTURE_1D:
       return SVGA3DSAMP_2D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_RECT:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
       return SVGA3DSAMP_2D;
-   case PIPE_TEXTURE_3D:
+   case TGSI_TEXTURE_3D:
       return SVGA3DSAMP_VOLUME;
-   case PIPE_TEXTURE_CUBE:
+   case TGSI_TEXTURE_CUBE:
       return SVGA3DSAMP_CUBE;
    }
 
@@ -585,6 +585,14 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit,
             ok = ps30_output( emit, decl->Semantic, idx );
          break;
 
+      case TGSI_FILE_SAMPLER_VIEW:
+         {
+            unsigned unit = decl->Range.First;
+            assert(decl->Range.First == decl->Range.Last);
+            emit->sampler_target[unit] = decl->SamplerView.Resource;
+         }
+         break;
+
       default:
          /* don't need to declare other vars */
          ok = TRUE;
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 83f0c8bd4d0..7a593ba6e9d 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -136,6 +136,8 @@ struct svga_shader_emitter
    int current_arl;
 
    unsigned pstipple_sampler_unit;
+
+   uint8_t sampler_target[PIPE_MAX_SAMPLERS];
 };
 
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 489e68f88e8..3188c411863 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -3849,7 +3849,7 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
 
       if (new_tokens) {
          /* Setup texture state for stipple */
-         emit->key.tex[unit].texture_target = PIPE_TEXTURE_2D;
+         emit->sampler_target[unit] = TGSI_TEXTURE_2D;
          emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X;
          emit->key.tex[unit].swizzle_g = TGSI_SWIZZLE_Y;
          emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z;
diff --git a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
index 0c5afeb4cf9..0d5628251df 100644
--- a/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
+++ b/src/gallium/drivers/svga/svga_tgsi_vgpu10.c
@@ -134,6 +134,8 @@ struct svga_shader_emitter_v10
 
    /* Samplers */
    unsigned num_samplers;
+   ubyte sampler_target[PIPE_MAX_SAMPLERS];  /**< TGSI_TEXTURE_x */
+   ubyte sampler_return_type[PIPE_MAX_SAMPLERS];  /**< TGSI_RETURN_TYPE_x */
 
    /* Address regs (really implemented with temps) */
    unsigned num_address_regs;
@@ -2312,9 +2314,13 @@ emit_vgpu10_declaration(struct svga_shader_emitter_v10 *emit,
       return TRUE;
 
    case TGSI_FILE_SAMPLER_VIEW:
-      /* Not used at this time, but maybe in the future.
-       * See emit_resource_declarations().
-       */
+      {
+         unsigned unit = decl->Range.First;
+         assert(decl->Range.First == decl->Range.Last);
+         emit->sampler_target[unit] = decl->SamplerView.Resource;
+         /* Note: we can ignore YZW return types for now */
+         emit->sampler_return_type[unit] = decl->SamplerView.ReturnTypeX;
+      }
       return TRUE;
 
    default:
@@ -2854,7 +2860,7 @@ emit_constant_declaration(struct svga_shader_emitter_v10 *emit)
 
    /* Texture buffer sizes */
    for (i = 0; i < emit->num_samplers; i++) {
-      if (emit->key.tex[i].texture_target == PIPE_BUFFER) {
+      if (emit->sampler_target[i] == TGSI_TEXTURE_BUFFER) {
          emit->texture_buffer_size_index[i] = total_consts++;
       }
    }
@@ -2918,30 +2924,44 @@ emit_sampler_declarations(struct svga_shader_emitter_v10 *emit)
 
 
 /**
- * Translate PIPE_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x.
+ * Translate TGSI_TEXTURE_x to VGAPU10_RESOURCE_DIMENSION_x.
  */
 static unsigned
-pipe_texture_to_resource_dimension(unsigned target, bool msaa)
+tgsi_texture_to_resource_dimension(unsigned target, boolean is_array)
 {
    switch (target) {
-   case PIPE_BUFFER:
+   case TGSI_TEXTURE_BUFFER:
       return VGPU10_RESOURCE_DIMENSION_BUFFER;
-   case PIPE_TEXTURE_1D:
+   case TGSI_TEXTURE_1D:
       return VGPU10_RESOURCE_DIMENSION_TEXTURE1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_RECT:
-      return msaa ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS
-         : VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
-   case PIPE_TEXTURE_3D:
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
+   case TGSI_TEXTURE_3D:
       return VGPU10_RESOURCE_DIMENSION_TEXTURE3D;
-   case PIPE_TEXTURE_CUBE:
+   case TGSI_TEXTURE_CUBE:
       return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE;
-   case PIPE_TEXTURE_1D_ARRAY:
-      return VGPU10_RESOURCE_DIMENSION_TEXTURE1DARRAY;
-   case PIPE_TEXTURE_2D_ARRAY:
-      return msaa ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY
-         : VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY;
-   case PIPE_TEXTURE_CUBE_ARRAY:
+   case TGSI_TEXTURE_SHADOW1D:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE1D;
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE1DARRAY
+         : VGPU10_RESOURCE_DIMENSION_TEXTURE1D;
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+      return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DARRAY
+         : VGPU10_RESOURCE_DIMENSION_TEXTURE2D;
+   case TGSI_TEXTURE_SHADOWCUBE:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURECUBE;
+   case TGSI_TEXTURE_2D_MSAA:
+      return VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS;
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return is_array ? VGPU10_RESOURCE_DIMENSION_TEXTURE2DMSARRAY
+         : VGPU10_RESOURCE_DIMENSION_TEXTURE2DMS;
+   case TGSI_TEXTURE_CUBE_ARRAY:
       return VGPU10_RESOURCE_DIMENSION_TEXTURECUBEARRAY;
    default:
       assert(!"Unexpected resource type");
@@ -2993,8 +3013,8 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit)
       opcode0.value = 0;
       opcode0.opcodeType = VGPU10_OPCODE_DCL_RESOURCE;
       opcode0.resourceDimension =
-         pipe_texture_to_resource_dimension(emit->key.tex[i].texture_target,
-                                            emit->key.tex[i].texture_msaa);
+         tgsi_texture_to_resource_dimension(emit->sampler_target[i],
+                                            emit->key.tex[i].is_array);
       operand0.value = 0;
       operand0.numComponents = VGPU10_OPERAND_0_COMPONENT;
       operand0.operandType = VGPU10_OPERAND_TYPE_RESOURCE;
@@ -3008,10 +3028,10 @@ emit_resource_declarations(struct svga_shader_emitter_v10 *emit)
       STATIC_ASSERT(VGPU10_RETURN_TYPE_SINT == TGSI_RETURN_TYPE_SINT + 1);
       STATIC_ASSERT(VGPU10_RETURN_TYPE_UINT == TGSI_RETURN_TYPE_UINT + 1);
       STATIC_ASSERT(VGPU10_RETURN_TYPE_FLOAT == TGSI_RETURN_TYPE_FLOAT + 1);
-      assert(emit->key.tex[i].return_type <= TGSI_RETURN_TYPE_FLOAT);
-      rt = emit->key.tex[i].return_type + 1;
+      assert(emit->sampler_return_type[i] <= TGSI_RETURN_TYPE_FLOAT);
+      rt = emit->sampler_return_type[i] + 1;
 #else
-      switch (emit->key.tex[i].return_type) {
+      switch (emit->sampler_return_type[i]) {
          case TGSI_RETURN_TYPE_UNORM: rt = VGPU10_RETURN_TYPE_UNORM; break;
          case TGSI_RETURN_TYPE_SNORM: rt = VGPU10_RETURN_TYPE_SNORM; break;
          case TGSI_RETURN_TYPE_SINT:  rt = VGPU10_RETURN_TYPE_SINT;  break;
@@ -5024,7 +5044,7 @@ end_tex_swizzle(struct svga_shader_emitter_v10 *emit,
       unsigned swz_b = emit->key.tex[swz->unit].swizzle_b;
       unsigned swz_a = emit->key.tex[swz->unit].swizzle_a;
       unsigned writemask_0 = 0, writemask_1 = 0;
-      boolean int_tex = is_integer_type(emit->key.tex[swz->unit].return_type);
+      boolean int_tex = is_integer_type(emit->sampler_return_type[swz->unit]);
 
       /* Swizzle w/out zero/one terms */
       struct tgsi_full_src_register src_swizzled =
@@ -5131,7 +5151,7 @@ is_valid_tex_instruction(struct svga_shader_emitter_v10 *emit,
    boolean valid = TRUE;
 
    if (tgsi_is_shadow_target(target) &&
-       is_integer_type(emit->key.tex[unit].return_type)) {
+       is_integer_type(emit->sampler_return_type[unit])) {
       debug_printf("Invalid SAMPLE_C with an integer texture!\n");
       valid = FALSE;
    }
@@ -5528,7 +5548,7 @@ emit_txq(struct svga_shader_emitter_v10 *emit,
 {
    const uint unit = inst->Src[1].Register.Index;
 
-   if (emit->key.tex[unit].texture_target == PIPE_BUFFER) {
+   if (emit->sampler_target[unit] == TGSI_TEXTURE_BUFFER) {
       /* RESINFO does not support querying texture buffers, so we instead
        * store texture buffer sizes in shader constants, then copy them to
        * implement TXQ instead of emitting RESINFO.
@@ -6617,7 +6637,7 @@ transform_fs_pstipple(struct svga_shader_emitter_v10 *emit,
    emit->fs.pstipple_sampler_unit = unit;
 
    /* Setup texture state for stipple */
-   emit->key.tex[unit].texture_target = PIPE_TEXTURE_2D;
+   emit->sampler_target[unit] = TGSI_TEXTURE_2D;
    emit->key.tex[unit].swizzle_r = TGSI_SWIZZLE_X;
    emit->key.tex[unit].swizzle_g = TGSI_SWIZZLE_Y;
    emit->key.tex[unit].swizzle_b = TGSI_SWIZZLE_Z;

From 79e343b36a729afb8086b99e4bf15d8c444887c1 Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Thu, 10 Mar 2016 10:57:24 -0800
Subject: [PATCH 099/197] svga: add new num-readbacks HUD query

To find out how many image readback commands are issued.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 src/gallium/drivers/svga/svga_context.h          | 16 +++++++++-------
 src/gallium/drivers/svga/svga_pipe_query.c       |  9 +++++++++
 src/gallium/drivers/svga/svga_resource_buffer.c  |  2 ++
 src/gallium/drivers/svga/svga_resource_texture.c |  2 ++
 src/gallium/drivers/svga/svga_screen.c           |  2 ++
 5 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index 1976f98e5c1..e16f62a1aef 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -55,16 +55,17 @@
 #define SVGA_QUERY_COMMAND_BUFFER_SIZE     (PIPE_QUERY_DRIVER_SPECIFIC + 7)
 #define SVGA_QUERY_FLUSH_TIME              (PIPE_QUERY_DRIVER_SPECIFIC + 8)
 #define SVGA_QUERY_SURFACE_WRITE_FLUSHES   (PIPE_QUERY_DRIVER_SPECIFIC + 9)
+#define SVGA_QUERY_NUM_READBACKS           (PIPE_QUERY_DRIVER_SPECIFIC + 10)
 
 /* running total counters */
-#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 10)
-#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 11)
-#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 12)
-#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 13)
-#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 14)
-#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 15)
+#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 13)
+#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 14)
+#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 15)
+#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 16)
 /*SVGA_QUERY_MAX has to be last because it is size of an array*/
-#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 16)
+#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 17)
 
 /**
  * Maximum supported number of constant buffers per shader
@@ -513,6 +514,7 @@ struct svga_context
       uint64_t num_surface_views;    /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
       uint64_t num_bytes_uploaded;   /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
       uint64_t num_generate_mipmap;  /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
+      uint64_t num_readbacks;        /**< SVGA_QUERY_NUM_READBACK */
    } hud;
 
    /** The currently bound stream output targets */
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 845f4ef3a1c..11e69edce82 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -736,6 +736,7 @@ svga_create_query(struct pipe_context *pipe,
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+   case SVGA_QUERY_NUM_READBACKS:
       break;
    default:
       assert(!"unexpected query type in svga_create_query()");
@@ -808,6 +809,7 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_STATE_OBJECTS:
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
+   case SVGA_QUERY_NUM_READBACKS:
       /* nothing */
       break;
    default:
@@ -899,6 +901,9 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
       sq->begin_count = svga->hud.surface_write_flushes;
       break;
+   case SVGA_QUERY_NUM_READBACKS:
+      sq->begin_count = svga->hud.num_readbacks;
+      break;
    case SVGA_QUERY_MEMORY_USED:
    case SVGA_QUERY_NUM_SHADERS:
    case SVGA_QUERY_NUM_RESOURCES:
@@ -1002,6 +1007,9 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
       sq->end_count = svga->hud.surface_write_flushes;
       break;
+   case SVGA_QUERY_NUM_READBACKS:
+      sq->end_count = svga->hud.num_readbacks;
+      break;
    case SVGA_QUERY_MEMORY_USED:
    case SVGA_QUERY_NUM_SHADERS:
    case SVGA_QUERY_NUM_RESOURCES:
@@ -1103,6 +1111,7 @@ svga_get_query_result(struct pipe_context *pipe,
    case SVGA_QUERY_COMMAND_BUFFER_SIZE:
    case SVGA_QUERY_FLUSH_TIME:
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
+   case SVGA_QUERY_NUM_READBACKS:
       vresult->u64 = sq->end_count - sq->begin_count;
       break;
    /* These are running total counters */
diff --git a/src/gallium/drivers/svga/svga_resource_buffer.c b/src/gallium/drivers/svga/svga_resource_buffer.c
index a8ffcc7f680..9ecb97509c2 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer.c
@@ -109,6 +109,8 @@ svga_buffer_transfer_map(struct pipe_context *pipe,
             assert(ret == PIPE_OK);
          }
 
+         svga->hud.num_readbacks++;
+
          svga_context_finish(svga);
 
          sbuf->dirty = FALSE;
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 1edb41dabee..3a3325c65df 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -448,6 +448,8 @@ svga_texture_transfer_map(struct pipe_context *pipe,
             ret = readback_image_vgpu9(svga, surf, st->slice, transfer->level);
          }
 
+         svga->hud.num_readbacks++;
+
          assert(ret == PIPE_OK);
          (void) ret;
 
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index bcc512041f7..fd3cc7db8e3 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -837,6 +837,8 @@ svga_get_driver_query_info(struct pipe_screen *screen,
             PIPE_DRIVER_QUERY_TYPE_MICROSECONDS),
       QUERY("surface-write-flushes", SVGA_QUERY_SURFACE_WRITE_FLUSHES,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-readbacks", SVGA_QUERY_NUM_READBACKS,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
 
       /* running total counters */
       QUERY("memory-used", SVGA_QUERY_MEMORY_USED,

From 0a1d91ef979a2782c722032f2b0a88d0754dd561 Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Fri, 11 Mar 2016 14:33:39 -0800
Subject: [PATCH 100/197] svga: add a few more resource update HUD queries

This patch adds the following HUD queries:
.num-resource-updates  -- number of resource updates. Commands include
                          UPDATE_SUBRESOURCE, UPDATE_GB_IMAGE.
.num-buffer-uploads    -- number of buffer uploads.
.num-const-buf-updates -- number of constant buffer updates.
.num-const-updates     -- number of shader constant updates.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 src/gallium/drivers/svga/svga_context.h       | 52 +++++++++++--------
 src/gallium/drivers/svga/svga_pipe_query.c    | 36 +++++++++++++
 .../svga/svga_resource_buffer_upload.c        |  6 +++
 .../drivers/svga/svga_resource_texture.c      |  2 +
 src/gallium/drivers/svga/svga_screen.c        |  8 +++
 .../drivers/svga/svga_state_constants.c       |  9 ++++
 6 files changed, 91 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h
index e16f62a1aef..ead47c07980 100644
--- a/src/gallium/drivers/svga/svga_context.h
+++ b/src/gallium/drivers/svga/svga_context.h
@@ -56,16 +56,20 @@
 #define SVGA_QUERY_FLUSH_TIME              (PIPE_QUERY_DRIVER_SPECIFIC + 8)
 #define SVGA_QUERY_SURFACE_WRITE_FLUSHES   (PIPE_QUERY_DRIVER_SPECIFIC + 9)
 #define SVGA_QUERY_NUM_READBACKS           (PIPE_QUERY_DRIVER_SPECIFIC + 10)
+#define SVGA_QUERY_NUM_RESOURCE_UPDATES    (PIPE_QUERY_DRIVER_SPECIFIC + 11)
+#define SVGA_QUERY_NUM_BUFFER_UPLOADS      (PIPE_QUERY_DRIVER_SPECIFIC + 12)
+#define SVGA_QUERY_NUM_CONST_BUF_UPDATES   (PIPE_QUERY_DRIVER_SPECIFIC + 13)
+#define SVGA_QUERY_NUM_CONST_UPDATES       (PIPE_QUERY_DRIVER_SPECIFIC + 14)
 
 /* running total counters */
-#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 11)
-#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 12)
-#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 13)
-#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 14)
-#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 15)
-#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 16)
+#define SVGA_QUERY_MEMORY_USED             (PIPE_QUERY_DRIVER_SPECIFIC + 15)
+#define SVGA_QUERY_NUM_SHADERS             (PIPE_QUERY_DRIVER_SPECIFIC + 16)
+#define SVGA_QUERY_NUM_RESOURCES           (PIPE_QUERY_DRIVER_SPECIFIC + 17)
+#define SVGA_QUERY_NUM_STATE_OBJECTS       (PIPE_QUERY_DRIVER_SPECIFIC + 18)
+#define SVGA_QUERY_NUM_SURFACE_VIEWS       (PIPE_QUERY_DRIVER_SPECIFIC + 19)
+#define SVGA_QUERY_NUM_GENERATE_MIPMAP     (PIPE_QUERY_DRIVER_SPECIFIC + 20)
 /*SVGA_QUERY_MAX has to be last because it is size of an array*/
-#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 17)
+#define SVGA_QUERY_MAX                     (PIPE_QUERY_DRIVER_SPECIFIC + 21)
 
 /**
  * Maximum supported number of constant buffers per shader
@@ -500,21 +504,25 @@ struct svga_context
 
    /** performance / info queries for HUD */
    struct {
-      uint64_t num_draw_calls;       /**< SVGA_QUERY_DRAW_CALLS */
-      uint64_t num_fallbacks;        /**< SVGA_QUERY_NUM_FALLBACKS */
-      uint64_t num_flushes;          /**< SVGA_QUERY_NUM_FLUSHES */
-      uint64_t num_validations;      /**< SVGA_QUERY_NUM_VALIDATIONS */
-      uint64_t map_buffer_time;      /**< SVGA_QUERY_MAP_BUFFER_TIME */
-      uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */
-      uint64_t command_buffer_size;  /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */
-      uint64_t flush_time;           /**< SVGA_QUERY_FLUSH_TIME */
-      uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */
-      uint64_t num_shaders;          /**< SVGA_QUERY_NUM_SHADERS */
-      uint64_t num_state_objects;    /**< SVGA_QUERY_NUM_STATE_OBJECTS */
-      uint64_t num_surface_views;    /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
-      uint64_t num_bytes_uploaded;   /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
-      uint64_t num_generate_mipmap;  /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
-      uint64_t num_readbacks;        /**< SVGA_QUERY_NUM_READBACK */
+      uint64_t num_draw_calls;          /**< SVGA_QUERY_DRAW_CALLS */
+      uint64_t num_fallbacks;           /**< SVGA_QUERY_NUM_FALLBACKS */
+      uint64_t num_flushes;             /**< SVGA_QUERY_NUM_FLUSHES */
+      uint64_t num_validations;         /**< SVGA_QUERY_NUM_VALIDATIONS */
+      uint64_t map_buffer_time;         /**< SVGA_QUERY_MAP_BUFFER_TIME */
+      uint64_t num_resources_mapped;    /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */
+      uint64_t command_buffer_size;     /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */
+      uint64_t flush_time;              /**< SVGA_QUERY_FLUSH_TIME */
+      uint64_t surface_write_flushes;   /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */
+      uint64_t num_readbacks;           /**< SVGA_QUERY_NUM_READBACKS */
+      uint64_t num_resource_updates;    /**< SVGA_QUERY_NUM_RESOURCE_UPDATES */
+      uint64_t num_buffer_uploads;      /**< SVGA_QUERY_NUM_BUFFER_UPLOADS */
+      uint64_t num_const_buf_updates;   /**< SVGA_QUERY_NUM_CONST_BUF_UPDATES */
+      uint64_t num_const_updates;       /**< SVGA_QUERY_NUM_CONST_UPDATES */
+      uint64_t num_shaders;             /**< SVGA_QUERY_NUM_SHADERS */
+      uint64_t num_state_objects;       /**< SVGA_QUERY_NUM_STATE_OBJECTS */
+      uint64_t num_surface_views;       /**< SVGA_QUERY_NUM_SURFACE_VIEWS */
+      uint64_t num_bytes_uploaded;      /**< SVGA_QUERY_NUM_BYTES_UPLOADED */
+      uint64_t num_generate_mipmap;     /**< SVGA_QUERY_NUM_GENERATE_MIPMAP */
    } hud;
 
    /** The currently bound stream output targets */
diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 11e69edce82..4e4086021a6 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -737,6 +737,10 @@ svga_create_query(struct pipe_context *pipe,
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
    case SVGA_QUERY_NUM_READBACKS:
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+   case SVGA_QUERY_NUM_CONST_UPDATES:
       break;
    default:
       assert(!"unexpected query type in svga_create_query()");
@@ -810,6 +814,10 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_SURFACE_VIEWS:
    case SVGA_QUERY_NUM_GENERATE_MIPMAP:
    case SVGA_QUERY_NUM_READBACKS:
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+   case SVGA_QUERY_NUM_CONST_UPDATES:
       /* nothing */
       break;
    default:
@@ -904,6 +912,18 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_READBACKS:
       sq->begin_count = svga->hud.num_readbacks;
       break;
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+      sq->begin_count = svga->hud.num_resource_updates;
+      break;
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+      sq->begin_count = svga->hud.num_buffer_uploads;
+      break;
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+      sq->begin_count = svga->hud.num_const_buf_updates;
+      break;
+   case SVGA_QUERY_NUM_CONST_UPDATES:
+      sq->begin_count = svga->hud.num_const_updates;
+      break;
    case SVGA_QUERY_MEMORY_USED:
    case SVGA_QUERY_NUM_SHADERS:
    case SVGA_QUERY_NUM_RESOURCES:
@@ -1010,6 +1030,18 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q)
    case SVGA_QUERY_NUM_READBACKS:
       sq->end_count = svga->hud.num_readbacks;
       break;
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+      sq->end_count = svga->hud.num_resource_updates;
+      break;
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+      sq->end_count = svga->hud.num_buffer_uploads;
+      break;
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+      sq->end_count = svga->hud.num_const_buf_updates;
+      break;
+   case SVGA_QUERY_NUM_CONST_UPDATES:
+      sq->end_count = svga->hud.num_const_updates;
+      break;
    case SVGA_QUERY_MEMORY_USED:
    case SVGA_QUERY_NUM_SHADERS:
    case SVGA_QUERY_NUM_RESOURCES:
@@ -1112,6 +1144,10 @@ svga_get_query_result(struct pipe_context *pipe,
    case SVGA_QUERY_FLUSH_TIME:
    case SVGA_QUERY_SURFACE_WRITE_FLUSHES:
    case SVGA_QUERY_NUM_READBACKS:
+   case SVGA_QUERY_NUM_RESOURCE_UPDATES:
+   case SVGA_QUERY_NUM_BUFFER_UPLOADS:
+   case SVGA_QUERY_NUM_CONST_BUF_UPDATES:
+   case SVGA_QUERY_NUM_CONST_UPDATES:
       vresult->u64 = sq->end_count - sq->begin_count;
       break;
    /* These are running total counters */
diff --git a/src/gallium/drivers/svga/svga_resource_buffer_upload.c b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
index 7f7ceab0aa5..1121b780af1 100644
--- a/src/gallium/drivers/svga/svga_resource_buffer_upload.c
+++ b/src/gallium/drivers/svga/svga_resource_buffer_upload.c
@@ -311,6 +311,8 @@ svga_buffer_upload_gb_command(struct svga_context *svga,
    swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
+   svga->hud.num_resource_updates++;
+
    return PIPE_OK;
 }
 
@@ -385,6 +387,8 @@ svga_buffer_upload_command(struct svga_context *svga,
    swc->hints |= SVGA_HINT_FLAG_CAN_PRE_FLUSH;
    sbuf->dma.flags.discard = FALSE;
 
+   svga->hud.num_buffer_uploads++;
+
    return PIPE_OK;
 }
 
@@ -433,6 +437,7 @@ svga_buffer_upload_flush(struct svga_context *svga,
          assert(box->x + box->w <= sbuf->b.b.width0);
 
          svga->hud.num_bytes_uploaded += box->w;
+         svga->hud.num_buffer_uploads++;
       }
    }
    else {
@@ -460,6 +465,7 @@ svga_buffer_upload_flush(struct svga_context *svga,
          assert(box->x + box->w <= sbuf->b.b.width0);
 
          svga->hud.num_bytes_uploaded += box->w;
+         svga->hud.num_buffer_uploads++;
       }
    }
 
diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c
index 3a3325c65df..db730802c7a 100644
--- a/src/gallium/drivers/svga/svga_resource_texture.c
+++ b/src/gallium/drivers/svga/svga_resource_texture.c
@@ -683,6 +683,8 @@ svga_texture_transfer_unmap(struct pipe_context *pipe,
          ret = update_image_vgpu9(svga, surf, &box, st->slice, transfer->level);
       }
 
+      svga->hud.num_resource_updates++;
+
       assert(ret == PIPE_OK);
       (void) ret;
    }
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index fd3cc7db8e3..c0873c0c65a 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -839,6 +839,14 @@ svga_get_driver_query_info(struct pipe_screen *screen,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
       QUERY("num-readbacks", SVGA_QUERY_NUM_READBACKS,
             PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-resource-updates", SVGA_QUERY_NUM_RESOURCE_UPDATES,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-buffer-uploads", SVGA_QUERY_NUM_BUFFER_UPLOADS,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-const-buf-updates", SVGA_QUERY_NUM_CONST_BUF_UPDATES,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
+      QUERY("num-const-updates", SVGA_QUERY_NUM_CONST_UPDATES,
+            PIPE_DRIVER_QUERY_TYPE_UINT64),
 
       /* running total counters */
       QUERY("memory-used", SVGA_QUERY_MEMORY_USED,
diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index 8ab1693088a..aee1adeee46 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -301,6 +301,8 @@ emit_const(struct svga_context *svga, unsigned shader, unsigned i,
          return ret;
 
       memcpy(svga->state.hw_draw.cb[shader][i], value, 4 * sizeof(float));
+
+      svga->hud.num_const_updates++;
    }
 
    return ret;
@@ -420,6 +422,9 @@ emit_const_range(struct svga_context *svga,
                 (j - i) * 4 * sizeof(float));
 
          i = j + 1;
+
+         svga->hud.num_const_updates++;
+
       } else {
          ++i;
       }
@@ -664,6 +669,8 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
 
    pipe_resource_reference(&dst_buffer, NULL);
 
+   svga->hud.num_const_buf_updates++;
+
    return ret;
 }
 
@@ -732,6 +739,8 @@ emit_consts_vgpu10(struct svga_context *svga, unsigned shader)
                                                   size);
       if (ret != PIPE_OK)
          return ret;
+
+      svga->hud.num_const_buf_updates++;
    }
 
    svga->state.hw_draw.enabled_constbufs[shader] = enabled_constbufs;

From b45b47c5c98052c6c5d190c45843e12f8d0b6af3 Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Fri, 11 Mar 2016 14:54:36 -0800
Subject: [PATCH 101/197] svga: optimize constant buffer uploads

When a constant buffer slot is allocated in the upload buffer,
the allocated slot size is always a multiple of 256, but the actual
buffer size might not be. This leaves a gap between the ending offset
of one slot and the starting offset of the next slot, and the gap
prevents the two slots from being updated in a single update command.
In order to maximize the chance of merging contiguous dirty ranges,
specify a buffer size that is a multiple of 256 whenever a slot is
allocated in the constant upload buffer.

This gives about a 10% performance improvement with Lightsmark2008 and
about 30% with Cinebench R11.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Thomas Hellstrom <thellstrom@vmware.com>
---
 src/gallium/drivers/svga/svga_state_constants.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/svga/svga_state_constants.c b/src/gallium/drivers/svga/svga_state_constants.c
index aee1adeee46..5ae0382cd45 100644
--- a/src/gallium/drivers/svga/svga_state_constants.c
+++ b/src/gallium/drivers/svga/svga_state_constants.c
@@ -554,6 +554,7 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
    void *src_map = NULL, *dst_map;
    unsigned offset;
    const struct svga_shader_variant *variant;
+   unsigned alloc_buf_size;
 
    assert(shader == PIPE_SHADER_VERTEX ||
           shader == PIPE_SHADER_GEOMETRY ||
@@ -618,7 +619,16 @@ emit_constbuf_vgpu10(struct svga_context *svga, unsigned shader)
     */
    new_buf_size = align(new_buf_size, 16);
 
-   u_upload_alloc(svga->const0_upload, 0, new_buf_size,
+   /* Constant buffer size in the upload buffer must be in multiples of 256.
+    * In order to maximize the chance of merging the upload buffer chunks
+    * when svga_buffer_add_range() is called,
+    * the allocate buffer size needs to be in multiples of 256 as well.
+    * Otherwise, since there is gap between each dirty range of the upload buffer,
+    * each dirty range will end up in its own UPDATE_GB_IMAGE command.
+    */
+   alloc_buf_size = align(new_buf_size, CONST0_UPLOAD_ALIGNMENT);
+
+   u_upload_alloc(svga->const0_upload, 0, alloc_buf_size,
                   CONST0_UPLOAD_ALIGNMENT, &offset,
                   &dst_buffer, &dst_map);
    if (!dst_map) {

From 299f8ca0a7f723dfbf757385c8c7c221c3b87683 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 21 Mar 2016 13:23:04 -0600
Subject: [PATCH 102/197] svga: minor formatting fix, comment addition

To sync with our internal tree.

Signed-off-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/svga/svga_pipe_query.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c
index 4e4086021a6..88f41eadc1d 100644
--- a/src/gallium/drivers/svga/svga_pipe_query.c
+++ b/src/gallium/drivers/svga/svga_pipe_query.c
@@ -72,11 +72,14 @@ struct svga_query {
 
 /** cast wrapper */
 static inline struct svga_query *
-svga_query( struct pipe_query *q )
+svga_query(struct pipe_query *q)
 {
    return (struct svga_query *)q;
 }
 
+/**
+ * VGPU9
+ */
 
 static boolean
 svga_get_query_result(struct pipe_context *pipe,

From 47cfc83440c3030999a08c0b5fccae860294608c Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Tue, 19 Jan 2016 20:25:39 -0800
Subject: [PATCH 103/197] svga: rebind index buffer

As with other resources, the current index buffer needs to be
rebound at the first draw of the current command buffer to make
sure the buffer is available for the draw command.

Fixes bug 1587263.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/svga/svga_context.c | 1 +
 src/gallium/drivers/svga/svga_draw.c    | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c
index da4281490ae..896dcdf59d0 100644
--- a/src/gallium/drivers/svga/svga_context.c
+++ b/src/gallium/drivers/svga/svga_context.c
@@ -247,6 +247,7 @@ struct pipe_context *svga_context_create(struct pipe_screen *screen,
           sizeof(svga->state.hw_draw.default_constbuf_size));
    memset(svga->state.hw_draw.enabled_constbufs, 0,
           sizeof(svga->state.hw_draw.enabled_constbufs));
+   svga->state.hw_draw.ib = NULL;
 
    /* Create a no-operation blend state which we will bind whenever the
     * requested blend state is impossible (e.g. due to having an integer
diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index fe6cf71a6e5..3eda09acc7c 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -458,6 +458,14 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
       ret = svga_rebind_shaders(svga);
       if (ret != PIPE_OK)
          return ret;
+
+      /* Rebind index buffer */
+      if (svga->state.hw_draw.ib) {
+         struct svga_winsys_context *swc = svga->swc;
+         ret = swc->resource_rebind(swc, svga->state.hw_draw.ib, NULL, SVGA_RELOC_READ);
+         if (ret != PIPE_OK)
+            return ret;
+      }
    }
 
    ret = validate_sampler_resources(svga);

From 47856e59456361f2218e03d997d2735e1a848230 Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Wed, 20 Jan 2016 10:35:56 -0800
Subject: [PATCH 104/197] svga: rebind stream output targets

To ensure stream output target surfaces are available for the draw commands,
we need to rebind the current stream output targets at the first draw in the
command buffer.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/svga/svga_draw.c          |  5 +++++
 .../drivers/svga/svga_pipe_streamout.c        | 19 +++++++++++++++++++
 src/gallium/drivers/svga/svga_streamout.h     |  3 +++
 3 files changed, 27 insertions(+)

diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 3eda09acc7c..96f82381708 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -459,6 +459,11 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
       if (ret != PIPE_OK)
          return ret;
 
+      /* Rebind stream output targets */
+      ret = svga_rebind_stream_output_targets(svga);
+      if (ret != PIPE_OK)
+         return ret;
+
       /* Rebind index buffer */
       if (svga->state.hw_draw.ib) {
          struct svga_winsys_context *swc = svga->swc;
diff --git a/src/gallium/drivers/svga/svga_pipe_streamout.c b/src/gallium/drivers/svga/svga_pipe_streamout.c
index 3f443c44eee..1318b5565ce 100644
--- a/src/gallium/drivers/svga/svga_pipe_streamout.c
+++ b/src/gallium/drivers/svga/svga_pipe_streamout.c
@@ -311,6 +311,25 @@ svga_set_stream_output_targets(struct pipe_context *pipe,
    svga->num_so_targets = num_targets;
 }
 
+/**
+ * Rebind stream output target surfaces
+ */
+enum pipe_error
+svga_rebind_stream_output_targets(struct svga_context *svga)
+{
+   struct svga_winsys_context *swc = svga->swc;
+   enum pipe_error ret;
+   unsigned i;
+
+   for (i = 0; i < svga->num_so_targets; i++) {
+      ret = swc->resource_rebind(swc, svga->so_surfaces[i], NULL, SVGA_RELOC_WRITE);
+      if (ret != PIPE_OK)
+         return ret;
+   }
+
+   return PIPE_OK;
+}
+
 void
 svga_init_stream_output_functions(struct svga_context *svga)
 {
diff --git a/src/gallium/drivers/svga/svga_streamout.h b/src/gallium/drivers/svga/svga_streamout.h
index da0c4457d2e..1daa1ad5352 100644
--- a/src/gallium/drivers/svga/svga_streamout.h
+++ b/src/gallium/drivers/svga/svga_streamout.h
@@ -47,4 +47,7 @@ void
 svga_delete_stream_output(struct svga_context *svga,
                           struct svga_stream_output *streamout);
 
+enum pipe_error
+svga_rebind_stream_output_targets(struct svga_context *svga);
+
 #endif /* SVGA_STREAMOUT_H */

From f8aaf0094dd23b5f94bbef5fd444861d286ddb36 Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Tue, 26 Jan 2016 11:12:09 -0800
Subject: [PATCH 105/197] svga: Fix the index buffer rebind regression

The index buffer handle saved in the hw_state structure could
be invalid after the index buffer is destroyed. Instead of
rebinding the index buffer using the saved index buffer handle,
we will reset the index buffer handle in the hw_state structure
to force resending of the index buffer.

Fixes bug 1593320

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/svga/svga_draw.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_draw.c b/src/gallium/drivers/svga/svga_draw.c
index 96f82381708..0b9ea889afa 100644
--- a/src/gallium/drivers/svga/svga_draw.c
+++ b/src/gallium/drivers/svga/svga_draw.c
@@ -464,13 +464,8 @@ draw_vgpu10(struct svga_hwtnl *hwtnl,
       if (ret != PIPE_OK)
          return ret;
 
-      /* Rebind index buffer */
-      if (svga->state.hw_draw.ib) {
-         struct svga_winsys_context *swc = svga->swc;
-         ret = swc->resource_rebind(swc, svga->state.hw_draw.ib, NULL, SVGA_RELOC_READ);
-         if (ret != PIPE_OK)
-            return ret;
-      }
+      /* Force rebinding the index buffer when needed */
+      svga->state.hw_draw.ib = NULL;
    }
 
    ret = validate_sampler_resources(svga);

From 86caa67aefab81aed1d358bb0ac6cfa648576297 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 23 Feb 2016 18:02:11 -0700
Subject: [PATCH 106/197] svga: add svga_winsys_context::pipe_debug_callback
 pointer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The svga winsys modules can use this to send debug messages to the
state tracker and Mesa.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
Reviewed-by: José Fonseca <jfonseca@vmware.com>
---
 src/gallium/drivers/svga/svga_pipe_misc.c | 7 +++++--
 src/gallium/drivers/svga/svga_winsys.h    | 4 ++++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_pipe_misc.c b/src/gallium/drivers/svga/svga_pipe_misc.c
index af9356d7c75..a26e577d8f7 100644
--- a/src/gallium/drivers/svga/svga_pipe_misc.c
+++ b/src/gallium/drivers/svga/svga_pipe_misc.c
@@ -254,10 +254,13 @@ svga_set_debug_callback(struct pipe_context *pipe,
 {
    struct svga_context *svga = svga_context(pipe);
 
-   if (cb)
+   if (cb) {
       svga->debug.callback = *cb;
-   else
+      svga->swc->debug_callback = &svga->debug.callback;
+   } else {
       memset(&svga->debug.callback, 0, sizeof(svga->debug.callback));
+      svga->swc->debug_callback = NULL;
+   }
 }
 
 
diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h
index 0ad6b5e6c76..7da2c4e77ca 100644
--- a/src/gallium/drivers/svga/svga_winsys.h
+++ b/src/gallium/drivers/svga/svga_winsys.h
@@ -48,6 +48,7 @@ struct svga_winsys_screen;
 struct svga_winsys_buffer;
 struct pipe_screen;
 struct pipe_context;
+struct pipe_debug_callback;
 struct pipe_fence_handle;
 struct pipe_resource;
 struct svga_region;
@@ -286,6 +287,9 @@ struct svga_winsys_context
                       struct svga_winsys_surface *surface,
                       struct svga_winsys_gb_shader *shader,
                       unsigned flags);
+
+   /** To report perf/conformance/etc issues to the state tracker */
+   struct pipe_debug_callback *debug_callback;
 };
 
 

From 96cd908fd34ef711100c9beaed03c3c8ffd5873d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 13 Mar 2016 11:36:53 -0500
Subject: [PATCH 107/197] gallium: add additional PIPE_BARRIER_* bits
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/pipe/p_defines.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index bdd76ab1f81..90af7a7012c 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -360,6 +360,13 @@ enum pipe_flush_flags
 #define PIPE_BARRIER_MAPPED_BUFFER     (1 << 0)
 #define PIPE_BARRIER_SHADER_BUFFER     (1 << 1)
 #define PIPE_BARRIER_QUERY_BUFFER      (1 << 2)
+#define PIPE_BARRIER_VERTEX_BUFFER     (1 << 3)
+#define PIPE_BARRIER_INDEX_BUFFER      (1 << 4)
+#define PIPE_BARRIER_CONSTANT_BUFFER   (1 << 5)
+#define PIPE_BARRIER_INDIRECT_BUFFER   (1 << 6)
+#define PIPE_BARRIER_TEXTURE           (1 << 7)
+#define PIPE_BARRIER_IMAGE             (1 << 8)
+#define PIPE_BARRIER_FRAMEBUFFER       (1 << 9)
 
 /**
  * Resource binding flags -- state tracker must specify in advance all

From 137954408172a5104f0f5650bae943d2ebf0aa07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 13 Mar 2016 11:37:10 -0500
Subject: [PATCH 108/197] st/mesa: translate additional flags in MemoryBarrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Re-order flags in the order in which they appear in the OpenGL spec in the
description of MemoryBarrier().

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_cb_texturebarrier.c | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_texturebarrier.c b/src/mesa/state_tracker/st_cb_texturebarrier.c
index 2de150ba13a..6319b6258ac 100644
--- a/src/mesa/state_tracker/st_cb_texturebarrier.c
+++ b/src/mesa/state_tracker/st_cb_texturebarrier.c
@@ -63,16 +63,31 @@ st_MemoryBarrier(struct gl_context *ctx, GLbitfield barriers)
    struct pipe_context *pipe = st_context(ctx)->pipe;
    unsigned flags = 0;
 
+   if (barriers & GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT)
+      flags |= PIPE_BARRIER_VERTEX_BUFFER;
+   if (barriers & GL_ELEMENT_ARRAY_BARRIER_BIT)
+      flags |= PIPE_BARRIER_INDEX_BUFFER;
+   if (barriers & GL_UNIFORM_BARRIER_BIT)
+      flags |= PIPE_BARRIER_CONSTANT_BUFFER;
+   if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT)
+      flags |= PIPE_BARRIER_TEXTURE;
+   if (barriers & GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)
+      flags |= PIPE_BARRIER_IMAGE;
+   if (barriers & GL_COMMAND_BARRIER_BIT)
+      flags |= PIPE_BARRIER_INDIRECT_BUFFER;
+   if (barriers & GL_PIXEL_BUFFER_BARRIER_BIT)
+      flags |= PIPE_BARRIER_TEXTURE;
    if (barriers & GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT)
       flags |= PIPE_BARRIER_MAPPED_BUFFER;
+   if (barriers & GL_QUERY_BUFFER_BARRIER_BIT)
+      flags |= PIPE_BARRIER_QUERY_BUFFER;
+   if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
+      flags |= PIPE_BARRIER_FRAMEBUFFER;
    if (barriers & GL_ATOMIC_COUNTER_BARRIER_BIT)
       flags |= PIPE_BARRIER_SHADER_BUFFER;
    if (barriers & GL_SHADER_STORAGE_BARRIER_BIT)
       flags |= PIPE_BARRIER_SHADER_BUFFER;
 
-   if (barriers & GL_QUERY_BUFFER_BARRIER_BIT)
-      flags |= PIPE_BARRIER_QUERY_BUFFER;
-
    if (flags && pipe->memory_barrier)
       pipe->memory_barrier(pipe, flags);
 }

From fa096a14af09ef1ebb459b238e5c600a60e0ef7b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 13 Mar 2016 15:00:40 -0500
Subject: [PATCH 109/197] tgsi/scan: add images_writemask
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 18 ++++++++++++++++--
 src/gallium/auxiliary/tgsi/tgsi_scan.h |  5 +++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 8e24cc626bd..dee6884d14d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -38,6 +38,7 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_prim.h"
+#include "tgsi/tgsi_info.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_scan.h"
@@ -192,8 +193,14 @@ scan_instruction(struct tgsi_shader_info *info,
          }
       }
 
-      if (is_memory_file(src->Register.File))
+      if (is_memory_file(src->Register.File)) {
          is_mem_inst = true;
+
+         if (src->Register.File == TGSI_FILE_IMAGE &&
+             !src->Register.Indirect &&
+             tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store)
+            info->images_writemask |= 1 << src->Register.Index;
+      }
    }
 
    /* check for indirect register writes */
@@ -204,8 +211,15 @@ scan_instruction(struct tgsi_shader_info *info,
          info->indirect_files_written |= (1 << dst->Register.File);
       }
 
-      if (is_memory_file(dst->Register.File))
+      if (is_memory_file(dst->Register.File)) {
+         assert(fullinst->Instruction.Opcode == TGSI_OPCODE_STORE);
+
          is_mem_inst = true;
+
+         if (dst->Register.File == TGSI_FILE_IMAGE &&
+             !dst->Register.Indirect)
+            info->images_writemask |= 1 << dst->Register.Index;
+      }
    }
 
    if (is_mem_inst)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index d65dec71888..f52729ae2d2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -117,6 +117,11 @@ struct tgsi_shader_info
    unsigned culldist_writemask;
    unsigned num_written_culldistance;
    unsigned num_written_clipdistance;
+   /**
+    * Bitmask indicating which images are written to (STORE / ATOM*).
+    * Indirect image accesses are not reflected in this mask.
+    */
+   unsigned images_writemask;
    /**
     * Bitmask indicating which register files are accessed with
     * indirect addressing.  The bits are (1 << TGSI_FILE_x), etc.

From 457f9c6b25240795039b1827876a3af5ffa2155b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 13 Mar 2016 15:06:15 -0500
Subject: [PATCH 110/197] tgsi/scan: track which shader images are really
 buffers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 3 +++
 src/gallium/auxiliary/tgsi/tgsi_scan.h | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index dee6884d14d..65bdab5b1cd 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -427,6 +427,9 @@ scan_declaration(struct tgsi_shader_info *info,
          }
       } else if (file == TGSI_FILE_SAMPLER) {
          info->samplers_declared |= 1 << reg;
+      } else if (file == TGSI_FILE_IMAGE) {
+         if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
+            info->images_buffers |= 1 << reg;
       }
    }
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index f52729ae2d2..d777f23749b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -122,6 +122,10 @@ struct tgsi_shader_info
     * Indirect image accesses are not reflected in this mask.
     */
    unsigned images_writemask;
+   /**
+    * Bitmask indicating which declared image is a buffer.
+    */
+   unsigned images_buffers;
    /**
     * Bitmask indicating which register files are accessed with
     * indirect addressing.  The bits are (1 << TGSI_FILE_x), etc.

From b1b7268f014c78ac46b2f360959e681bad3091d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Fri, 11 Mar 2016 19:39:18 -0500
Subject: [PATCH 111/197] gallium/radeon: make r600_texture_disable_dcc
 externally accessible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We will need it in radeonsi for shader images.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/r600_pipe_common.h | 2 ++
 src/gallium/drivers/radeon/r600_texture.c     | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index a9de71a8734..381ad21a4e3 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -606,6 +606,8 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx,
 				   struct r600_atom *fb_state,
 				   unsigned *buffers, unsigned *dirty_cbufs,
 				   const union pipe_color_union *color);
+void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
+			      struct r600_texture *rtex);
 void r600_init_screen_texture_functions(struct r600_common_screen *rscreen);
 void r600_init_context_texture_functions(struct r600_common_context *rctx);
 
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index c573b438b01..7322f3ee985 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -297,8 +297,8 @@ static void r600_texture_disable_cmask(struct r600_common_screen *rscreen,
 	p_atomic_inc(&rscreen->compressed_colortex_counter);
 }
 
-static void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
-				     struct r600_texture *rtex)
+void r600_texture_disable_dcc(struct r600_common_screen *rscreen,
+			      struct r600_texture *rtex)
 {
 	struct r600_common_context *rctx =
 		(struct r600_common_context *)rscreen->aux_context;

From e85cf35a6516c44e33663fcd9637c6b434bb63ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sat, 6 Feb 2016 18:32:13 -0500
Subject: [PATCH 112/197] radeonsi: implement set_shader_images (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Whether DCC is disabled depends on the access flags with which the image
is bound: image_load supports DCC, but store and atomic don't.

v2: remove an unnecessary masking of images->desc.enabled_mask

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 206 ++++++++++++++++--
 src/gallium/drivers/radeonsi/si_pipe.h        |   7 +
 src/gallium/drivers/radeonsi/si_shader.c      |   8 +-
 src/gallium/drivers/radeonsi/si_shader.h      |   4 +-
 src/gallium/drivers/radeonsi/si_state.c       |  39 +++-
 src/gallium/drivers/radeonsi/si_state.h       |  19 ++
 6 files changed, 254 insertions(+), 29 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index d12b3e6b28a..a931ab24554 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -64,7 +64,8 @@
 #include "util/u_upload_mgr.h"
 
 
-/* NULL image and buffer descriptor.
+/* NULL image and buffer descriptor for textures (alpha = 1) and images
+ * (alpha = 0).
  *
  * For images, all fields must be zero except for the swizzle, which
  * supports arbitrary combinations of 0s and 1s. The texture type must be
@@ -74,7 +75,7 @@
  *
  * This is the only reason why the buffer descriptor must be in words [4:7].
  */
-static uint32_t null_descriptor[8] = {
+static uint32_t null_texture_descriptor[8] = {
 	0,
 	0,
 	0,
@@ -84,10 +85,20 @@ static uint32_t null_descriptor[8] = {
 	 * descriptor */
 };
 
+static uint32_t null_image_descriptor[8] = {
+	0,
+	0,
+	0,
+	S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D)
+	/* the rest must contain zeros, which is also used by the buffer
+	 * descriptor */
+};
+
 static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements)
+				unsigned num_elements,
+				const uint32_t *null_descriptor)
 {
 	int i;
 
@@ -100,10 +111,12 @@ static void si_init_descriptors(struct si_descriptors *desc,
 	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	/* Initialize the array to NULL descriptors if the element size is 8. */
+	/* Fill the array with the given NULL descriptor, if one was passed. */
-	if (element_dw_size % 8 == 0)
+	if (null_descriptor) {
+		assert(element_dw_size % 8 == 0);
 		for (i = 0; i < num_elements * element_dw_size / 8; i++)
-			memcpy(desc->list + i*8, null_descriptor,
-			       sizeof(null_descriptor));
+			memcpy(desc->list + i * 8, null_descriptor,
+			       8 * 4);
+	}
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
@@ -210,7 +223,7 @@ static void si_set_sampler_view(struct si_context *sctx,
 		} else {
 			/* Disable FMASK and bind sampler state in [12:15]. */
 			memcpy(views->desc.list + slot*16 + 8,
-			       null_descriptor, 4*4);
+			       null_texture_descriptor, 4*4);
 
 			if (views->sampler_states[slot])
 				memcpy(views->desc.list + slot*16 + 12,
@@ -220,9 +233,9 @@ static void si_set_sampler_view(struct si_context *sctx,
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		memcpy(views->desc.list + slot*16, null_descriptor, 8*4);
+		memcpy(views->desc.list + slot*16, null_texture_descriptor, 8*4);
 		/* Only clear the lower dwords of FMASK. */
-		memcpy(views->desc.list + slot*16 + 8, null_descriptor, 4*4);
+		memcpy(views->desc.list + slot*16 + 8, null_texture_descriptor, 4*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
@@ -301,6 +314,160 @@ si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers)
 	}
 }
 
+/* IMAGE VIEWS */
+
+static void
+si_release_image_views(struct si_images_info *images)
+{
+	unsigned i;
+
+	for (i = 0; i < SI_NUM_IMAGES; ++i) {
+		struct pipe_image_view *view = &images->views[i];
+
+		pipe_resource_reference(&view->resource, NULL);
+	}
+
+	si_release_descriptors(&images->desc);
+}
+
+static void
+si_image_views_begin_new_cs(struct si_context *sctx, struct si_images_info *images)
+{
+	uint mask = images->desc.enabled_mask;
+
+	/* Add buffers to the CS. */
+	while (mask) {
+		int i = u_bit_scan(&mask);
+		struct pipe_image_view *view = &images->views[i];
+
+		assert(view->resource);
+
+		si_sampler_view_add_buffer(sctx, view->resource);
+	}
+
+	if (images->desc.buffer) {
+		radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
+					  images->desc.buffer,
+					  RADEON_USAGE_READ,
+					  RADEON_PRIO_DESCRIPTORS);
+	}
+}
+
+static void
+si_disable_shader_image(struct si_images_info *images, unsigned slot)
+{
+	if (images->desc.enabled_mask & (1llu << slot)) {
+		pipe_resource_reference(&images->views[slot].resource, NULL);
+		images->compressed_colortex_mask &= ~(1 << slot);
+
+		memcpy(images->desc.list + slot*8, null_image_descriptor, 8*4);
+		images->desc.enabled_mask &= ~(1llu << slot);
+		images->desc.list_dirty = true;
+	}
+}
+
+static void
+si_set_shader_images(struct pipe_context *pipe, unsigned shader,
+		     unsigned start_slot, unsigned count,
+		     struct pipe_image_view *views)
+{
+	struct si_context *ctx = (struct si_context *)pipe;
+	struct si_screen *screen = ctx->screen;
+	struct si_images_info *images = &ctx->images[shader];
+	unsigned i, slot;
+
+	assert(shader < SI_NUM_SHADERS);
+
+	if (!count)
+		return;
+
+	assert(start_slot + count <= SI_NUM_IMAGES);
+
+	for (i = 0, slot = start_slot; i < count; ++i, ++slot) {
+		struct r600_resource *res;
+
+		if (!views || !views[i].resource) {
+			si_disable_shader_image(images, slot);
+			continue;
+		}
+
+		res = (struct r600_resource *)views[i].resource;
+		util_copy_image_view(&images->views[slot], &views[i]);
+
+		si_sampler_view_add_buffer(ctx, &res->b.b);
+
+		if (res->b.b.target == PIPE_BUFFER) {
+			si_make_buffer_descriptor(screen, res,
+						  views[i].format,
+						  views[i].u.buf.first_element,
+						  views[i].u.buf.last_element,
+						  images->desc.list + slot * 8);
+			images->compressed_colortex_mask &= ~(1 << slot);
+		} else {
+			static const unsigned char swizzle[4] = { 0, 1, 2, 3 };
+			struct r600_texture *tex = (struct r600_texture *)res;
+			unsigned level;
+			unsigned width, height, depth;
+
+			assert(!tex->is_depth);
+			assert(tex->fmask.size == 0);
+
+			if (tex->dcc_offset &&
+			    views[i].access & PIPE_IMAGE_ACCESS_WRITE)
+				r600_texture_disable_dcc(&screen->b, tex);
+
+			if (is_compressed_colortex(tex)) {
+				images->compressed_colortex_mask |= 1 << slot;
+			} else {
+				images->compressed_colortex_mask &= ~(1 << slot);
+			}
+
+			/* Always force the base level to the selected level.
+			 *
+			 * This is required for 3D textures, where otherwise
+			 * selecting a single slice for non-layered bindings
+			 * fails. It doesn't hurt the other targets.
+			 */
+			level = views[i].u.tex.level;
+			width = u_minify(res->b.b.width0, level);
+			height = u_minify(res->b.b.height0, level);
+			depth = u_minify(res->b.b.depth0, level);
+
+			si_make_texture_descriptor(screen, tex, false, res->b.b.target,
+						   views[i].format, swizzle,
+						   level, 0, 0,
+						   views[i].u.tex.first_layer, views[i].u.tex.last_layer,
+						   width, height, depth,
+						   images->desc.list + slot * 8,
+						   NULL);
+		}
+
+		images->desc.enabled_mask |= 1llu << slot;
+		images->desc.list_dirty = true;
+	}
+}
+
+static void
+si_images_update_compressed_colortex_mask(struct si_images_info *images)
+{
+	uint64_t mask = images->desc.enabled_mask;
+
+	while (mask) {
+		int i = u_bit_scan64(&mask);
+		struct pipe_resource *res = images->views[i].resource;
+
+		if (res && res->target != PIPE_BUFFER) {
+			struct r600_texture *rtex = (struct r600_texture *)res;
+
+			if (is_compressed_colortex(rtex)) {
+				images->compressed_colortex_mask |= 1 << i;
+			} else {
+				images->compressed_colortex_mask &= ~(1 << i);
+			}
+		}
+	}
+}
+
 /* SAMPLER STATES */
 
 static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader,
@@ -351,7 +518,7 @@ static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
 
 	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
-			    num_buffers);
+			    num_buffers, NULL);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -804,6 +971,7 @@ void si_update_compressed_colortex_masks(struct si_context *sctx)
 {
 	for (int i = 0; i < SI_NUM_SHADERS; ++i) {
 		si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]);
+		si_images_update_compressed_colortex_mask(&sctx->images[i]);
 	}
 }
 
@@ -925,6 +1093,8 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 			}
 		}
 	}
+
+	/* Shader images - update TODO */
 }
 
 /* SHADER USER DATA */
@@ -1055,6 +1225,7 @@ void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom)
 
 		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
 		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->images[i].desc, base, false);
 	}
 	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
@@ -1074,14 +1245,20 @@ void si_init_all_descriptors(struct si_context *sctx)
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_RINGS_STREAMOUT);
 
 		si_init_descriptors(&sctx->samplers[i].views.desc,
-				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS);
+				    SI_SGPR_SAMPLERS, 16, SI_NUM_SAMPLERS,
+				    null_texture_descriptor);
+
+		si_init_descriptors(&sctx->images[i].desc,
+				    SI_SGPR_IMAGES, 8, SI_NUM_IMAGES,
+				    null_image_descriptor);
 	}
 
 	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFERS,
-			    4, SI_NUM_VERTEX_BUFFERS);
+			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.bind_sampler_states = si_bind_sampler_states;
+	sctx->b.b.set_shader_images = si_set_shader_images;
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
 	sctx->b.b.set_sampler_views = si_set_sampler_views;
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
@@ -1105,7 +1282,8 @@ bool si_upload_shader_descriptors(struct si_context *sctx)
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
 		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
-		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc))
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->images[i].desc))
 			return false;
 	}
 	return si_upload_vertex_buffer_descriptors(sctx);
@@ -1119,6 +1297,7 @@ void si_release_all_descriptors(struct si_context *sctx)
 		si_release_buffer_resources(&sctx->const_buffers[i]);
 		si_release_buffer_resources(&sctx->rw_buffers[i]);
 		si_release_sampler_views(&sctx->samplers[i].views);
+		si_release_image_views(&sctx->images[i]);
 	}
 	si_release_descriptors(&sctx->vertex_buffers);
 }
@@ -1131,6 +1310,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_buffer_resources_begin_new_cs(sctx, &sctx->const_buffers[i]);
 		si_buffer_resources_begin_new_cs(sctx, &sctx->rw_buffers[i]);
 		si_sampler_views_begin_new_cs(sctx, &sctx->samplers[i].views);
+		si_image_views_begin_new_cs(sctx, &sctx->images[i]);
 	}
 	si_vertex_buffers_begin_new_cs(sctx);
 	si_shader_userdata_begin_new_cs(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 0fef5f72098..6d0d687fe4c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -141,6 +141,12 @@ struct si_textures_info {
 	uint32_t			compressed_colortex_mask;
 };
 
+struct si_images_info {
+	struct si_descriptors		desc;
+	struct pipe_image_view		views[SI_NUM_IMAGES];
+	uint32_t			compressed_colortex_mask;
+};
+
 struct si_framebuffer {
 	struct r600_atom		atom;
 	struct pipe_framebuffer_state	state;
@@ -251,6 +257,7 @@ struct si_context {
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
 	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
 	struct si_textures_info		samplers[SI_NUM_SHADERS];
+	struct si_images_info		images[SI_NUM_SHADERS];
 
 	/* other shader resources */
 	struct pipe_constant_buffer	null_const_buf; /* used for set_constant_buffer(NULL) on CIK */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 151615eb4e7..c95e2fe2c61 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3865,8 +3865,8 @@ static void create_function(struct si_shader_context *ctx)
 	params[SI_PARAM_RW_BUFFERS] = const_array(ctx->v16i8, SI_NUM_RW_BUFFERS);
 	params[SI_PARAM_CONST_BUFFERS] = const_array(ctx->v16i8, SI_NUM_CONST_BUFFERS);
 	params[SI_PARAM_SAMPLERS] = const_array(ctx->v8i32, SI_NUM_SAMPLERS);
-	params[SI_PARAM_UNUSED] = LLVMPointerType(ctx->i32, CONST_ADDR_SPACE);
-	last_array_pointer = SI_PARAM_UNUSED;
+	params[SI_PARAM_IMAGES] = const_array(ctx->v8i32, SI_NUM_IMAGES);
+	last_array_pointer = SI_PARAM_IMAGES;
 
 	switch (ctx->type) {
 	case TGSI_PROCESSOR_VERTEX:
@@ -5383,7 +5383,7 @@ static bool si_compile_tcs_epilog(struct si_screen *sscreen,
 	last_array_pointer = SI_PARAM_RW_BUFFERS;
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
-	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_IMAGES] = ctx.i64;
 	params[SI_PARAM_TCS_OUT_OFFSETS] = ctx.i32;
 	params[SI_PARAM_TCS_OUT_LAYOUT] = ctx.i32;
 	params[SI_PARAM_TCS_IN_LAYOUT] = ctx.i32;
@@ -5633,7 +5633,7 @@ static bool si_compile_ps_epilog(struct si_screen *sscreen,
 	params[SI_PARAM_RW_BUFFERS] = ctx.i64;
 	params[SI_PARAM_CONST_BUFFERS] = ctx.i64;
 	params[SI_PARAM_SAMPLERS] = ctx.i64;
-	params[SI_PARAM_UNUSED] = ctx.i64;
+	params[SI_PARAM_IMAGES] = ctx.i64;
 	params[SI_PARAM_ALPHA_REF] = ctx.f32;
 	last_array_pointer = -1;
 	last_sgpr = SI_PARAM_ALPHA_REF;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index de23e642fe4..8059edf6395 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -80,7 +80,7 @@ struct radeon_shader_reloc;
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
 #define SI_SGPR_CONST_BUFFERS	2
 #define SI_SGPR_SAMPLERS	4  /* images & sampler states interleaved */
-/* TODO: gap */
+#define SI_SGPR_IMAGES		6
 #define SI_SGPR_VERTEX_BUFFERS	8  /* VS only */
 #define SI_SGPR_BASE_VERTEX	10 /* VS only */
 #define SI_SGPR_START_INSTANCE	11 /* VS only */
@@ -104,7 +104,7 @@ struct radeon_shader_reloc;
 #define SI_PARAM_RW_BUFFERS	0
 #define SI_PARAM_CONST_BUFFERS	1
 #define SI_PARAM_SAMPLERS	2
-#define SI_PARAM_UNUSED		3 /* TODO: use */
+#define SI_PARAM_IMAGES		3
 
 /* VS only parameters */
 #define SI_PARAM_VERTEX_BUFFERS	4
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index f823af188c7..0c3fbdc9b31 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2797,7 +2797,7 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
  * Build the sampler view descriptor for a buffer texture.
  * @param state 256-bit descriptor; only the high 128 bits are filled in
  */
-static void
+void
 si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
 			  enum pipe_format format,
 			  unsigned first_element, unsigned last_element,
@@ -2838,9 +2838,10 @@ si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
 /**
  * Build the sampler view descriptor for a texture.
  */
-static void
+void
 si_make_texture_descriptor(struct si_screen *screen,
 			   struct r600_texture *tex,
+			   bool sampler,
 			   enum pipe_texture_target target,
 			   enum pipe_format pipe_format,
 			   const unsigned char state_swizzle[4],
@@ -2855,7 +2856,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 	const struct util_format_description *desc;
 	unsigned char swizzle[4];
 	int first_non_void;
-	unsigned num_format, data_format;
+	unsigned num_format, data_format, type;
 	uint32_t pitch;
 	uint64_t va;
 
@@ -2973,12 +2974,29 @@ si_make_texture_descriptor(struct si_screen *screen,
 		data_format = 0;
 	}
 
-	if (res->target == PIPE_TEXTURE_1D_ARRAY) {
+	if (!sampler &&
+	    (res->target == PIPE_TEXTURE_CUBE ||
+	     res->target == PIPE_TEXTURE_CUBE_ARRAY ||
+	     res->target == PIPE_TEXTURE_3D)) {
+		/* For the purpose of shader images, treat cube maps and 3D
+		 * textures as 2D arrays. For 3D textures, the address
+		 * calculations for mipmaps are different, so we rely on the
+		 * caller to effectively disable mipmaps.
+		 */
+		type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
+
+		assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
+	} else {
+		type = si_tex_dim(res->target, target, res->nr_samples);
+	}
+
+	if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
 	        height = 1;
 		depth = res->array_size;
-	} else if (res->target == PIPE_TEXTURE_2D_ARRAY) {
-		depth = res->array_size;
-	} else if (res->target == PIPE_TEXTURE_CUBE_ARRAY)
+	} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY) {
+		if (sampler || res->target != PIPE_TEXTURE_3D)
+			depth = res->array_size;
+	} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
 		depth = res->array_size / 6;
 
 	pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format);
@@ -3001,7 +3019,7 @@ si_make_texture_descriptor(struct si_screen *screen,
 					last_level) |
 		    S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level, false)) |
 		    S_008F1C_POW2_PAD(res->last_level > 0) |
-		    S_008F1C_TYPE(si_tex_dim(res->target, target, res->nr_samples)));
+		    S_008F1C_TYPE(type));
 	state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1));
 	state[5] = (S_008F24_BASE_ARRAY(first_layer) |
 		    S_008F24_LAST_ARRAY(last_layer));
@@ -3155,7 +3173,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx,
 	    state->target == PIPE_TEXTURE_CUBE)
 		last_layer = state->u.tex.first_layer;
 
-	si_make_texture_descriptor(sctx->screen, tmp, state->target,
+	si_make_texture_descriptor(sctx->screen, tmp, true, state->target,
 				   state->format, state_swizzle,
 				   base_level, first_level, last_level,
 				   state->u.tex.first_layer, last_layer,
@@ -3637,7 +3655,8 @@ static void si_query_opaque_metadata(struct r600_common_screen *rscreen,
 	/* TILE_MODE_INDEX is ambiguous without a PCI ID. */
 	md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id;
 
-	si_make_texture_descriptor(sscreen, rtex, res->target, res->format,
+	si_make_texture_descriptor(sscreen, rtex, true,
+				   res->target, res->format,
 				   swizzle, 0, 0, res->last_level, 0,
 				   is_array ? res->array_size - 1 : 0,
 				   res->width0, res->height0, res->depth0,
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 60c34f19e55..c4d6b9d9eee 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -158,6 +158,8 @@ struct si_shader_data {
 #define SI_DRIVER_STATE_CONST_BUF	SI_NUM_USER_CONST_BUFFERS
 #define SI_NUM_CONST_BUFFERS		(SI_DRIVER_STATE_CONST_BUF + 1)
 
+#define SI_NUM_IMAGES			16
+
 /* Read-write buffer slots.
  *
  * Ring buffers:        0..1
@@ -272,6 +274,23 @@ unsigned cik_tile_split(unsigned tile_split);
 unsigned si_array_mode(unsigned mode);
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
 unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
+void
+si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf,
+			  enum pipe_format format,
+			  unsigned first_element, unsigned last_element,
+			  uint32_t *state);
+void
+si_make_texture_descriptor(struct si_screen *screen,
+			   struct r600_texture *tex,
+			   bool sampler,
+			   enum pipe_texture_target target,
+			   enum pipe_format pipe_format,
+			   const unsigned char state_swizzle[4],
+			   unsigned base_level, unsigned first_level, unsigned last_level,
+			   unsigned first_layer, unsigned last_layer,
+			   unsigned width, unsigned height, unsigned depth,
+			   uint32_t *state,
+			   uint32_t *fmask_state);
 struct pipe_sampler_view *
 si_create_sampler_view_custom(struct pipe_context *ctx,
 			      struct pipe_resource *texture,

From f61566b77a6164ad0830c4c7d363d91f6859a794 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 15 Mar 2016 22:01:39 -0500
Subject: [PATCH 113/197] radeonsi: update shader image descriptor for
 invalidated buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 22 ++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index a931ab24554..815b87bbd7e 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -1094,7 +1094,27 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 		}
 	}
 
-	/* Shader images - update TODO */
+	/* Shader images */
+	for (shader = 0; shader < SI_NUM_SHADERS; ++shader) {
+		struct si_images_info *images = &sctx->images[shader];
+		unsigned mask = images->desc.enabled_mask;
+
+		while (mask) {
+			unsigned i = u_bit_scan(&mask);
+
+			if (images->views[i].resource == buf) {
+				si_desc_reset_buffer_offset(
+					ctx, images->desc.list + i * 8 + 4,
+					old_va, buf);
+				images->desc.list_dirty = true;
+
+				radeon_add_to_buffer_list(
+					&sctx->b, &sctx->b.gfx, rbuffer,
+					RADEON_USAGE_READWRITE,
+					RADEON_PRIO_SAMPLER_BUFFER);
+			}
+		}
+	}
 }
 
 /* SHADER USER DATA */

From 515fb2c09c51ada05db80a3ee337bc7265edfdd3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 7 Feb 2016 22:30:46 -0500
Subject: [PATCH 114/197] radeonsi: decompress shader images
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_blit.c | 36 +++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index f9a6de48f6b..e0dbec5fb79 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -325,8 +325,8 @@ static void si_blit_decompress_color(struct pipe_context *ctx,
 }
 
 static void
-si_decompress_color_textures(struct si_context *sctx,
-			     struct si_textures_info *textures)
+si_decompress_sampler_color_textures(struct si_context *sctx,
+				     struct si_textures_info *textures)
 {
 	unsigned i;
 	unsigned mask = textures->compressed_colortex_mask;
@@ -350,6 +350,33 @@ si_decompress_color_textures(struct si_context *sctx,
 	}
 }
 
+static void
+si_decompress_image_color_textures(struct si_context *sctx,
+				   struct si_images_info *images)
+{
+	unsigned i;
+	unsigned mask = images->compressed_colortex_mask;
+
+	while (mask) {
+		const struct pipe_image_view *view;
+		struct r600_texture *tex;
+
+		i = u_bit_scan(&mask);
+
+		view = &images->views[i];
+		assert(view->resource->target != PIPE_BUFFER);
+
+		tex = (struct r600_texture *)view->resource;
+		if (!tex->cmask.size && !tex->fmask.size && !tex->dcc_offset)
+			continue;
+
+		si_blit_decompress_color(&sctx->b.b, tex,
+					 view->u.tex.level, view->u.tex.level,
+					 0, util_max_layer(&tex->resource.b.b, view->u.tex.level),
+					 false);
+	}
+}
+
 void si_decompress_textures(struct si_context *sctx)
 {
 	unsigned compressed_colortex_counter;
@@ -370,7 +397,10 @@ void si_decompress_textures(struct si_context *sctx)
 			si_flush_depth_textures(sctx, &sctx->samplers[i]);
 		}
 		if (sctx->samplers[i].compressed_colortex_mask) {
-			si_decompress_color_textures(sctx, &sctx->samplers[i]);
+			si_decompress_sampler_color_textures(sctx, &sctx->samplers[i]);
+		}
+		if (sctx->images[i].compressed_colortex_mask) {
+			si_decompress_image_color_textures(sctx, &sctx->images[i]);
 		}
 	}
 }

From 75539197c7687410228d4fb18b0faa201474eff2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 7 Feb 2016 17:34:57 -0500
Subject: [PATCH 115/197] radeonsi: extract TXQ buffer size computation into
 its own function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This will allow it to be reused for RESQ.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 55 +++++++++++++++---------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index c95e2fe2c61..aefb2e0cbd8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2656,6 +2656,40 @@ static void si_llvm_return_fs_outputs(struct lp_build_tgsi_context *bld_base)
 	ctx->return_value = ret;
 }
 
+/**
+ * Given a v8i32 resource descriptor for a buffer, extract the size of the
+ * buffer in number of elements and return it as an i32.
+ */
+static LLVMValueRef get_buffer_size(
+	struct lp_build_tgsi_context *bld_base,
+	LLVMValueRef descriptor)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef size =
+		LLVMBuildExtractElement(builder, descriptor,
+					lp_build_const_int32(gallivm, 6), "");
+
+	if (ctx->screen->b.chip_class >= VI) {
+		/* On VI, the descriptor contains the size in bytes,
+		 * but TXQ must return the size in elements.
+		 * The stride is always non-zero for resources using TXQ.
+		 */
+		LLVMValueRef stride =
+			LLVMBuildExtractElement(builder, descriptor,
+						lp_build_const_int32(gallivm, 5), "");
+		stride = LLVMBuildLShr(builder, stride,
+				       lp_build_const_int32(gallivm, 16), "");
+		stride = LLVMBuildAnd(builder, stride,
+				      lp_build_const_int32(gallivm, 0x3FFF), "");
+
+		size = LLVMBuildUDiv(builder, size, stride, "");
+	}
+
+	return size;
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data);
@@ -2836,26 +2870,7 @@ static void tex_fetch_args(
 		if (target == TGSI_TEXTURE_BUFFER) {
 			/* Read the size from the buffer descriptor directly. */
 			LLVMValueRef res = LLVMBuildBitCast(builder, res_ptr, ctx->v8i32, "");
-			LLVMValueRef size = LLVMBuildExtractElement(builder, res,
-							lp_build_const_int32(gallivm, 6), "");
-
-			if (ctx->screen->b.chip_class >= VI) {
-				/* On VI, the descriptor contains the size in bytes,
-				 * but TXQ must return the size in elements.
-				 * The stride is always non-zero for resources using TXQ.
-				 */
-				LLVMValueRef stride =
-					LLVMBuildExtractElement(builder, res,
-								lp_build_const_int32(gallivm, 5), "");
-				stride = LLVMBuildLShr(builder, stride,
-						       lp_build_const_int32(gallivm, 16), "");
-				stride = LLVMBuildAnd(builder, stride,
-						      lp_build_const_int32(gallivm, 0x3FFF), "");
-
-				size = LLVMBuildUDiv(builder, size, stride, "");
-			}
-
-			emit_data->args[0] = size;
+			emit_data->args[0] = get_buffer_size(bld_base, res);
 			return;
 		}
 

From 02bd0cd7b108dd903ae40af1f70a36f7553bfa7e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 7 Feb 2016 18:41:09 -0500
Subject: [PATCH 116/197] radeonsi: Lower TGSI_OPCODE_RESQ down to LLVM op
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 129 +++++++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index aefb2e0cbd8..04f4aa98e99 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -99,6 +99,7 @@ struct si_shader_context
 	LLVMValueRef sampler_views[SI_NUM_SAMPLERS];
 	LLVMValueRef sampler_states[SI_NUM_SAMPLERS];
 	LLVMValueRef fmasks[SI_NUM_USER_SAMPLERS];
+	LLVMValueRef images[SI_NUM_IMAGES];
 	LLVMValueRef so_buffers[4];
 	LLVMValueRef esgs_ring;
 	LLVMValueRef gsvs_ring[4];
@@ -2705,6 +2706,109 @@ static bool tgsi_is_array_sampler(unsigned target)
 	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
 }
 
+static bool tgsi_is_array_image(unsigned target)
+{
+	return target == TGSI_TEXTURE_3D ||
+	       target == TGSI_TEXTURE_CUBE ||
+	       target == TGSI_TEXTURE_1D_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY ||
+	       target == TGSI_TEXTURE_CUBE_ARRAY ||
+	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
+}
+
+/**
+ * Load the resource descriptor for \p image.
+ */
+static void
+image_fetch_rsrc(
+	struct lp_build_tgsi_context *bld_base,
+	const struct tgsi_full_src_register *image,
+	LLVMValueRef *rsrc)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
+	assert(image->Register.File == TGSI_FILE_IMAGE);
+
+	if (!image->Register.Indirect) {
+		/* Fast path: use preloaded resources */
+		*rsrc = ctx->images[image->Register.Index];
+	} else {
+		/* Indexing and manual load */
+		LLVMValueRef ind_index;
+		LLVMValueRef rsrc_ptr;
+
+		ind_index = get_indirect_index(ctx, &image->Indirect, image->Register.Index);
+
+		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
+		*rsrc = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
+	}
+}
+
+static void resq_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	const struct tgsi_full_src_register *reg = &inst->Src[0];
+	unsigned tex_target = inst->Memory.Texture;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	if (tex_target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, reg, &emit_data->args[0]);
+		emit_data->arg_count = 1;
+	} else {
+		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
+		image_fetch_rsrc(bld_base, reg, &emit_data->args[1]);
+		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
+		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
+		emit_data->args[5] = tgsi_is_array_image(tex_target) ?
+			bld_base->uint_bld.one : bld_base->uint_bld.zero; /* da */
+		emit_data->args[6] = bld_base->uint_bld.zero; /* glc */
+		emit_data->args[7] = bld_base->uint_bld.zero; /* slc */
+		emit_data->args[8] = bld_base->uint_bld.zero; /* tfe */
+		emit_data->args[9] = bld_base->uint_bld.zero; /* lwe */
+		emit_data->arg_count = 10;
+	}
+}
+
+static void resq_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef out;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		out = get_buffer_size(bld_base, emit_data->args[0]);
+	} else {
+		out = lp_build_intrinsic(
+			builder, "llvm.SI.getresinfo.i32", emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadNoneAttribute | LLVMNoUnwindAttribute);
+
+		/* Divide the number of layers by 6 to get the number of cubes. */
+		if (target == TGSI_TEXTURE_CUBE_ARRAY) {
+			LLVMValueRef imm2 = lp_build_const_int32(gallivm, 2);
+			LLVMValueRef imm6 = lp_build_const_int32(gallivm, 6);
+
+			LLVMValueRef z = LLVMBuildExtractElement(builder, out, imm2, "");
+			z = LLVMBuildBitCast(builder, z, bld_base->uint_bld.elem_type, "");
+			z = LLVMBuildSDiv(builder, z, imm6, "");
+			z = LLVMBuildBitCast(builder, z, bld_base->base.elem_type, "");
+			out = LLVMBuildInsertElement(builder, out, z, imm2, "");
+		}
+	}
+
+	emit_data->output[emit_data->chan] = out;
+}
+
 static void set_tex_fetch_args(struct si_shader_context *ctx,
 			       struct lp_build_emit_data *emit_data,
 			       unsigned opcode, unsigned target,
@@ -4168,6 +4272,27 @@ static void preload_samplers(struct si_shader_context *ctx)
 	}
 }
 
+static void preload_images(struct si_shader_context *ctx)
+{
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
+	LLVMValueRef res_ptr;
+	unsigned i;
+
+	if (num_images == 0)
+		return;
+
+	res_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
+
+	for (i = 0; i < num_images; ++i) {
+		/* Rely on LLVM to shrink the load for buffer resources. */
+		ctx->images[i] =
+			build_indexed_load_const(ctx, res_ptr,
+						 lp_build_const_int32(gallivm, i));
+	}
+}
+
 static void preload_streamout_buffers(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
@@ -4854,6 +4979,9 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
 
+	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
+
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;
@@ -4941,6 +5069,7 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 	create_function(&ctx);
 	preload_constants(&ctx);
 	preload_samplers(&ctx);
+	preload_images(&ctx);
 	preload_streamout_buffers(&ctx);
 	preload_ring_buffers(&ctx);
 

From 136686a51dd5f92c3905253d7abf7ad40f717016 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 9 Feb 2016 15:01:35 -0500
Subject: [PATCH 117/197] radeonsi: extract the LLVM type name construction
 into its own function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 26 +++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 04f4aa98e99..fd43e2ace3d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2691,6 +2691,23 @@ static LLVMValueRef get_buffer_size(
 	return size;
 }
 
+/**
+ * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
+ * intrinsic names).
+ */
+static void build_int_type_name(
+	LLVMTypeRef type,
+	char *buf, unsigned bufsize)
+{
+	assert(bufsize >= 6);
+
+	if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
+		snprintf(buf, bufsize, "v%ui32",
+			 LLVMGetVectorSize(type));
+	else
+		strcpy(buf, "i32");
+}
+
 static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data);
@@ -3355,14 +3372,9 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 		return;
 	}
 
-	if (LLVMGetTypeKind(LLVMTypeOf(emit_data->args[0])) == LLVMVectorTypeKind)
-		sprintf(type, ".v%ui32",
-			LLVMGetVectorSize(LLVMTypeOf(emit_data->args[0])));
-	else
-		strcpy(type, ".i32");
-
 	/* Add the type and suffixes .c, .o if needed. */
-	sprintf(intr_name, "%s%s%s%s%s",
+	build_int_type_name(LLVMTypeOf(emit_data->args[0]), type, sizeof(type));
+	sprintf(intr_name, "%s%s%s%s.%s",
 		name, is_shadow ? ".c" : "", infix,
 		has_offset ? ".o" : "", type);
 

From 1e82dedeca9670012a24b3d5da0832ca2c5c0861 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 9 Feb 2016 10:59:14 -0500
Subject: [PATCH 118/197] radeonsi: Lower TGSI_OPCODE_LOAD down to LLVM op (v3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: new signature style for buffer intrinsics (offsets)
v3: new signature style for llvm.amdgcn.buffer.load.format (overloaded return)

Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v2)
---
 src/gallium/drivers/radeonsi/si_shader.c | 139 +++++++++++++++++++++++
 1 file changed, 139 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index fd43e2ace3d..b73076f29e5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2761,6 +2761,143 @@ image_fetch_rsrc(
 	}
 }
 
+static LLVMValueRef image_fetch_coords(
+		struct lp_build_tgsi_context *bld_base,
+		const struct tgsi_full_instruction *inst)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	unsigned target = inst->Memory.Texture;
+	int sample;
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+	LLVMValueRef coords[4];
+	LLVMValueRef tmp;
+	int chan;
+
+	for (chan = 0; chan < num_coords; ++chan) {
+		tmp = lp_build_emit_fetch(bld_base, inst, 1, chan);
+		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+		coords[chan] = tmp;
+	}
+
+	if (num_coords == 1)
+		return coords[0];
+
+	if (num_coords == 3) {
+		/* LLVM has difficulties lowering 3-element vectors. */
+		coords[3] = bld_base->uint_bld.undef;
+		num_coords = 4;
+	}
+
+	return lp_build_gather_values(gallivm, coords, num_coords);
+}
+
+/**
+ * Append the extra mode bits that are used by image load and store.
+ */
+static void image_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data * emit_data,
+		unsigned target)
+{
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
+
+	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
+	emit_data->args[emit_data->arg_count++] =
+		tgsi_is_array_image(target) ? i1true : i1false; /* da */
+	emit_data->args[emit_data->arg_count++] = i1false; /* glc */
+	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
+}
+
+/**
+ * Append the resource and indexing arguments for buffer intrinsics.
+ *
+ * \param rsrc the 256 bit resource
+ * \param index index into the buffer
+ */
+static void buffer_append_args(
+		struct si_shader_context *ctx,
+		struct lp_build_emit_data *emit_data,
+		LLVMValueRef rsrc,
+		LLVMValueRef index)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
+	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+
+	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
+	rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
+	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, ctx->v4i32, "");
+
+	emit_data->args[emit_data->arg_count++] = rsrc;
+	emit_data->args[emit_data->arg_count++] = index; /* vindex */
+	emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */
+	emit_data->args[emit_data->arg_count++] = i1false; /* glc */
+	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
+}
+
+static void load_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+
+	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
+
+	image_fetch_rsrc(bld_base, &inst->Src[0], &rsrc);
+	coords = image_fetch_coords(bld_base, inst);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		buffer_append_args(ctx, emit_data, rsrc, coords);
+	} else {
+		emit_data->args[0] = coords;
+		emit_data->args[1] = rsrc;
+		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->arg_count = 3;
+
+		image_append_args(ctx, emit_data, target);
+	}
+}
+
+static void load_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[32];
+	char coords_type[8];
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,
+			emit_data->args, emit_data->arg_count,
+			LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	} else {
+		build_int_type_name(LLVMTypeOf(emit_data->args[0]),
+				    coords_type, sizeof(coords_type));
+
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.load.%s", coords_type);
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMReadOnlyAttribute | LLVMNoUnwindAttribute);
+	}
+}
+
 static void resq_fetch_args(
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data)
@@ -4991,6 +5128,8 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_LODQ] = tex_action;
 	bld_base->op_actions[TGSI_OPCODE_TXQS].emit = si_llvm_emit_txqs;
 
+	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
 

From bfcefcb3c77ad734d3deee888b6881b4c20f28a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 9 Feb 2016 11:51:31 -0500
Subject: [PATCH 119/197] radeonsi: Lower TGSI_OPCODE_STORE down to LLVM op
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 83 +++++++++++++++++++++++-
 1 file changed, 80 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b73076f29e5..a5ccb720d34 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -40,6 +40,7 @@
 #include "util/u_memory.h"
 #include "util/u_pstipple.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_dump.h"
 
@@ -2763,7 +2764,8 @@ image_fetch_rsrc(
 
 static LLVMValueRef image_fetch_coords(
 		struct lp_build_tgsi_context *bld_base,
-		const struct tgsi_full_instruction *inst)
+		const struct tgsi_full_instruction *inst,
+		unsigned src)
 {
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
@@ -2775,7 +2777,7 @@ static LLVMValueRef image_fetch_coords(
 	int chan;
 
 	for (chan = 0; chan < num_coords; ++chan) {
-		tmp = lp_build_emit_fetch(bld_base, inst, 1, chan);
+		tmp = lp_build_emit_fetch(bld_base, inst, src, chan);
 		tmp = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
 		coords[chan] = tmp;
 	}
@@ -2852,7 +2854,7 @@ static void load_fetch_args(
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 
 	image_fetch_rsrc(bld_base, &inst->Src[0], &rsrc);
-	coords = image_fetch_coords(bld_base, inst);
+	coords = image_fetch_coords(bld_base, inst, 1);
 
 	if (target == TGSI_TEXTURE_BUFFER) {
 		buffer_append_args(ctx, emit_data, rsrc, coords);
@@ -2898,6 +2900,79 @@ static void load_emit(
 	}
 }
 
+static void store_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	struct tgsi_full_src_register image;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef chans[4];
+	LLVMValueRef data;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+	unsigned chan;
+
+	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
+
+	image = tgsi_full_src_register_from_dst(&inst->Dst[0]);
+	image_fetch_rsrc(bld_base, &image, &rsrc);
+	coords = image_fetch_coords(bld_base, inst, 0);
+
+	for (chan = 0; chan < 4; ++chan) {
+		chans[chan] = lp_build_emit_fetch(bld_base, inst, 1, chan);
+	}
+	data = lp_build_gather_values(gallivm, chans, 4);
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->args[0] = data;
+		emit_data->arg_count = 1;
+
+		buffer_append_args(ctx, emit_data, rsrc, coords);
+	} else {
+		emit_data->args[0] = data;
+		emit_data->args[1] = coords;
+		emit_data->args[2] = rsrc;
+		emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
+		emit_data->arg_count = 4;
+
+		image_append_args(ctx, emit_data, target);
+	}
+}
+
+static void store_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[32];
+	char coords_type[8];
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		emit_data->output[emit_data->chan] = lp_build_intrinsic(
+			builder, "llvm.amdgcn.buffer.store.format.v4f32",
+			emit_data->dst_type, emit_data->args, emit_data->arg_count,
+			LLVMNoUnwindAttribute);
+	} else {
+		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
+				    coords_type, sizeof(coords_type));
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.store.%s", coords_type);
+
+		emit_data->output[emit_data->chan] =
+			lp_build_intrinsic(
+				builder, intrinsic_name, emit_data->dst_type,
+				emit_data->args, emit_data->arg_count,
+				LLVMNoUnwindAttribute);
+	}
+}
+
 static void resq_fetch_args(
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data)
@@ -5130,6 +5205,8 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 
 	bld_base->op_actions[TGSI_OPCODE_LOAD].fetch_args = load_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_LOAD].emit = load_emit;
+	bld_base->op_actions[TGSI_OPCODE_STORE].fetch_args = store_fetch_args;
+	bld_base->op_actions[TGSI_OPCODE_STORE].emit = store_emit;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
 

From f7a85a8a0aae692303601c5359ba8e76d78e1c28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 11 Feb 2016 20:54:25 -0500
Subject: [PATCH 120/197] radeonsi: Lower TGSI_OPCODE_ATOM* down to LLVM op
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 121 +++++++++++++++++++++--
 1 file changed, 113 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a5ccb720d34..b487a3f6d13 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2800,7 +2800,8 @@ static LLVMValueRef image_fetch_coords(
 static void image_append_args(
 		struct si_shader_context *ctx,
 		struct lp_build_emit_data * emit_data,
-		unsigned target)
+		unsigned target,
+		bool atomic)
 {
 	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
 	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
@@ -2808,7 +2809,8 @@ static void image_append_args(
 	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
 	emit_data->args[emit_data->arg_count++] =
 		tgsi_is_array_image(target) ? i1true : i1false; /* da */
-	emit_data->args[emit_data->arg_count++] = i1false; /* glc */
+	if (!atomic)
+		emit_data->args[emit_data->arg_count++] = i1false; /* glc */
 	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
 }
 
@@ -2822,7 +2824,8 @@ static void buffer_append_args(
 		struct si_shader_context *ctx,
 		struct lp_build_emit_data *emit_data,
 		LLVMValueRef rsrc,
-		LLVMValueRef index)
+		LLVMValueRef index,
+		bool atomic)
 {
 	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
@@ -2836,7 +2839,8 @@ static void buffer_append_args(
 	emit_data->args[emit_data->arg_count++] = rsrc;
 	emit_data->args[emit_data->arg_count++] = index; /* vindex */
 	emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */
-	emit_data->args[emit_data->arg_count++] = i1false; /* glc */
+	if (!atomic)
+		emit_data->args[emit_data->arg_count++] = i1false; /* glc */
 	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
 }
 
@@ -2857,14 +2861,14 @@ static void load_fetch_args(
 	coords = image_fetch_coords(bld_base, inst, 1);
 
 	if (target == TGSI_TEXTURE_BUFFER) {
-		buffer_append_args(ctx, emit_data, rsrc, coords);
+		buffer_append_args(ctx, emit_data, rsrc, coords, false);
 	} else {
 		emit_data->args[0] = coords;
 		emit_data->args[1] = rsrc;
 		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
 		emit_data->arg_count = 3;
 
-		image_append_args(ctx, emit_data, target);
+		image_append_args(ctx, emit_data, target, false);
 	}
 }
 
@@ -2930,7 +2934,7 @@ static void store_fetch_args(
 		emit_data->args[0] = data;
 		emit_data->arg_count = 1;
 
-		buffer_append_args(ctx, emit_data, rsrc, coords);
+		buffer_append_args(ctx, emit_data, rsrc, coords, false);
 	} else {
 		emit_data->args[0] = data;
 		emit_data->args[1] = coords;
@@ -2938,7 +2942,7 @@ static void store_fetch_args(
 		emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
 		emit_data->arg_count = 4;
 
-		image_append_args(ctx, emit_data, target);
+		image_append_args(ctx, emit_data, target, false);
 	}
 }
 
@@ -2973,6 +2977,83 @@ static void store_emit(
 	}
 }
 
+static void atomic_fetch_args(
+		struct lp_build_tgsi_context * bld_base,
+		struct lp_build_emit_data * emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	LLVMValueRef data1, data2;
+	LLVMValueRef coords;
+	LLVMValueRef rsrc;
+	LLVMValueRef tmp;
+
+	emit_data->dst_type = bld_base->base.elem_type;
+
+	image_fetch_rsrc(bld_base, &inst->Src[0], &rsrc);
+	coords = image_fetch_coords(bld_base, inst, 1);
+
+	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
+	data1 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+
+	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+		tmp = lp_build_emit_fetch(bld_base, inst, 3, 0);
+		data2 = LLVMBuildBitCast(builder, tmp, bld_base->uint_bld.elem_type, "");
+	}
+
+	/* llvm.amdgcn.image/buffer.atomic.cmpswap reflect the hardware order
+	 * of arguments, which is reversed relative to TGSI (and GLSL)
+	 */
+	if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+		emit_data->args[emit_data->arg_count++] = data2;
+	emit_data->args[emit_data->arg_count++] = data1;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		buffer_append_args(ctx, emit_data, rsrc, coords, true);
+	} else {
+		emit_data->args[emit_data->arg_count++] = coords;
+		emit_data->args[emit_data->arg_count++] = rsrc;
+
+		image_append_args(ctx, emit_data, target, true);
+	}
+}
+
+static void atomic_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	const struct tgsi_full_instruction * inst = emit_data->inst;
+	unsigned target = inst->Memory.Texture;
+	char intrinsic_name[40];
+	LLVMValueRef tmp;
+
+	if (target == TGSI_TEXTURE_BUFFER) {
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.buffer.atomic.%s", action->intr_name);
+	} else {
+		char coords_type[8];
+
+		build_int_type_name(LLVMTypeOf(emit_data->args[1]),
+				    coords_type, sizeof(coords_type));
+		snprintf(intrinsic_name, sizeof(intrinsic_name),
+			 "llvm.amdgcn.image.atomic.%s.%s",
+			 action->intr_name, coords_type);
+	}
+
+	tmp = lp_build_intrinsic(
+		builder, intrinsic_name, bld_base->uint_bld.elem_type,
+		emit_data->args, emit_data->arg_count,
+		LLVMNoUnwindAttribute);
+	emit_data->output[emit_data->chan] =
+		LLVMBuildBitCast(builder, tmp, bld_base->base.elem_type, "");
+}
+
 static void resq_fetch_args(
 		struct lp_build_tgsi_context * bld_base,
 		struct lp_build_emit_data * emit_data)
@@ -5156,6 +5237,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       LLVMTargetMachineRef tm)
 {
 	struct lp_build_tgsi_context *bld_base;
+	struct lp_build_tgsi_action tmpl;
 
 	memset(ctx, 0, sizeof(*ctx));
 	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");
@@ -5210,6 +5292,29 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_RESQ].fetch_args = resq_fetch_args;
 	bld_base->op_actions[TGSI_OPCODE_RESQ].emit = resq_emit;
 
+	tmpl.fetch_args = atomic_fetch_args;
+	tmpl.emit = atomic_emit;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";
+	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMXCHG].intr_name = "swap";
+	bld_base->op_actions[TGSI_OPCODE_ATOMCAS] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMCAS].intr_name = "cmpswap";
+	bld_base->op_actions[TGSI_OPCODE_ATOMAND] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMAND].intr_name = "and";
+	bld_base->op_actions[TGSI_OPCODE_ATOMOR] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMOR].intr_name = "or";
+	bld_base->op_actions[TGSI_OPCODE_ATOMXOR] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMXOR].intr_name = "xor";
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMIN].intr_name = "umin";
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMUMAX].intr_name = "umax";
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMIN].intr_name = "smin";
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
+	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
+
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;

From d6fa650454db5e3308a5c3618e4586a2c8f537cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 10 Mar 2016 18:12:44 -0500
Subject: [PATCH 121/197] radeonsi: Lower TGSI_OPCODE_MEMBAR down to LLVM op
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 31 ++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index b487a3f6d13..ca90178f075 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2713,6 +2713,35 @@ static void build_tex_intrinsic(const struct lp_build_tgsi_action *action,
 				struct lp_build_tgsi_context *bld_base,
 				struct lp_build_emit_data *emit_data);
 
+/* Prevent optimizations (at least of memory accesses) across the current
+ * point in the program by emitting empty inline assembly that is marked as
+ * having side effects.
+ */
+static void emit_optimization_barrier(struct si_shader_context *ctx)
+{
+	LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+	LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
+	LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, "", "", true, false);
+	LLVMBuildCall(builder, inlineasm, NULL, 0, "");
+}
+
+static void membar_emit(
+		const struct lp_build_tgsi_action *action,
+		struct lp_build_tgsi_context *bld_base,
+		struct lp_build_emit_data *emit_data)
+{
+	struct si_shader_context *ctx = si_shader_context(bld_base);
+
+	/* Since memoryBarrier only makes guarantees about atomics and
+	 * coherent image accesses (which bypass TC L1), we do not need to emit
+	 * any special cache handling here.
+	 *
+	 * We do have to prevent LLVM from re-ordering loads across
+	 * the barrier though.
+	 */
+	emit_optimization_barrier(ctx);
+}
+
 static bool tgsi_is_array_sampler(unsigned target)
 {
 	return target == TGSI_TEXTURE_1D_ARRAY ||
@@ -5315,6 +5344,8 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX] = tmpl;
 	bld_base->op_actions[TGSI_OPCODE_ATOMIMAX].intr_name = "smax";
 
+	bld_base->op_actions[TGSI_OPCODE_MEMBAR].emit = membar_emit;
+
 	bld_base->op_actions[TGSI_OPCODE_DDX].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDY].emit = si_llvm_emit_ddxy;
 	bld_base->op_actions[TGSI_OPCODE_DDX_FINE].emit = si_llvm_emit_ddxy;

From 5a61b428f477e7eef9f18f2fd43f661f193ece39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sat, 12 Mar 2016 21:32:34 -0500
Subject: [PATCH 122/197] radeonsi: implement coherent memory access (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: set glc=1 for volatile also on buffers

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index ca90178f075..db0cc5bbf62 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2832,14 +2832,18 @@ static void image_append_args(
 		unsigned target,
 		bool atomic)
 {
+	const struct tgsi_full_instruction *inst = emit_data->inst;
 	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
 	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
 
 	emit_data->args[emit_data->arg_count++] = i1false; /* r128 */
 	emit_data->args[emit_data->arg_count++] =
 		tgsi_is_array_image(target) ? i1true : i1false; /* da */
-	if (!atomic)
-		emit_data->args[emit_data->arg_count++] = i1false; /* glc */
+	if (!atomic) {
+		emit_data->args[emit_data->arg_count++] =
+			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+			i1true : i1false; /* glc */
+	}
 	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
 }
 
@@ -2858,8 +2862,10 @@ static void buffer_append_args(
 {
 	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	const struct tgsi_full_instruction *inst = emit_data->inst;
 	LLVMTypeRef v2i128 = LLVMVectorType(ctx->i128, 2);
 	LLVMValueRef i1false = LLVMConstInt(ctx->i1, 0, 0);
+	LLVMValueRef i1true = LLVMConstInt(ctx->i1, 1, 0);
 
 	rsrc = LLVMBuildBitCast(gallivm->builder, rsrc, v2i128, "");
 	rsrc = LLVMBuildExtractElement(gallivm->builder, rsrc, bld_base->uint_bld.one, "");
@@ -2868,8 +2874,11 @@ static void buffer_append_args(
 	emit_data->args[emit_data->arg_count++] = rsrc;
 	emit_data->args[emit_data->arg_count++] = index; /* vindex */
 	emit_data->args[emit_data->arg_count++] = bld_base->uint_bld.zero; /* voffset */
-	if (!atomic)
-		emit_data->args[emit_data->arg_count++] = i1false; /* glc */
+	if (!atomic) {
+		emit_data->args[emit_data->arg_count++] =
+			inst->Memory.Qualifier & (TGSI_MEMORY_COHERENT | TGSI_MEMORY_VOLATILE) ?
+			i1true : i1false; /* glc */
+	}
 	emit_data->args[emit_data->arg_count++] = i1false; /* slc */
 }
 

From 97352aa50af87b50271bc632abfb971caca46e2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Mon, 14 Mar 2016 10:22:21 -0500
Subject: [PATCH 123/197] radeonsi: implement volatile memory access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Prevent loads from being re-ordered or coalesced.

Atomics don't need special handling by definition, and stores don't need
special handling because LLVM is unable to detect dead image or buffer
stores.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index db0cc5bbf62..0d26957101e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2915,6 +2915,7 @@ static void load_emit(
 		struct lp_build_tgsi_context *bld_base,
 		struct lp_build_emit_data *emit_data)
 {
+	struct si_shader_context *ctx = si_shader_context(bld_base);
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	const struct tgsi_full_instruction * inst = emit_data->inst;
@@ -2922,6 +2923,9 @@ static void load_emit(
 	char intrinsic_name[32];
 	char coords_type[8];
 
+	if (inst->Memory.Qualifier & TGSI_MEMORY_VOLATILE)
+		emit_optimization_barrier(ctx);
+
 	if (target == TGSI_TEXTURE_BUFFER) {
 		emit_data->output[emit_data->chan] = lp_build_intrinsic(
 			builder, "llvm.amdgcn.buffer.load.format.v4f32", emit_data->dst_type,

From 43f5ce1d20dac94d83d6d6c31b88b4227316877d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 13 Mar 2016 11:37:27 -0500
Subject: [PATCH 124/197] radeonsi: implement MemoryBarrier (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: invalidate both constant and VMEM/TC L1 for constant buffers (Marek)

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 37 +++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 0c3fbdc9b31..b9bdd47c496 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3521,6 +3521,42 @@ static void si_texture_barrier(struct pipe_context *ctx)
 			 SI_CONTEXT_FLUSH_AND_INV_CB;
 }
 
+static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+
+	/* Subsequent commands must wait for all shader invocations to
+	 * complete. */
+	sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH;
+
+	if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
+		sctx->b.flags |= SI_CONTEXT_INV_SMEM_L1 |
+				 SI_CONTEXT_INV_VMEM_L1;
+
+	if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
+		     PIPE_BARRIER_SHADER_BUFFER |
+		     PIPE_BARRIER_TEXTURE |
+		     PIPE_BARRIER_IMAGE)) {
+		/* As far as I can tell, L1 contents are written back to L2
+		 * automatically at end of shader, but the contents of other
+		 * L1 caches might still be stale. */
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+	}
+
+	if (flags & PIPE_BARRIER_FRAMEBUFFER)
+		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
+
+	if (flags & (PIPE_BARRIER_MAPPED_BUFFER |
+		     PIPE_BARRIER_FRAMEBUFFER)) {
+		/* Not sure if INV_GLOBAL_L2 is the best thing here.
+		 *
+		 * We need to make sure that TC L1 & L2 are written back to
+		 * memory, because neither CPU accesses nor CB fetches consider
+		 * TC, but there's no need to invalidate any TC cache lines. */
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
+}
+
 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
 {
 	struct pipe_blend_state blend;
@@ -3601,6 +3637,7 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.set_index_buffer = si_set_index_buffer;
 
 	sctx->b.b.texture_barrier = si_texture_barrier;
+	sctx->b.b.memory_barrier = si_memory_barrier;
 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
 	sctx->b.b.set_min_samples = si_set_min_samples;
 	sctx->b.b.set_tess_state = si_set_tess_state;

From e9d935ed0e2839d2f07220a9f10477ab3cc79486 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Sun, 13 Mar 2016 14:44:46 -0500
Subject: [PATCH 125/197] radeonsi: force the DCC enable bit off in image
 descriptors for writing (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This avoids a lockup at least on Tonga.

v2: only force DCC off on VI+ (Marek)

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 57 ++++++++++++++++++++----
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0d26957101e..9ad2290fd4f 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2763,6 +2763,34 @@ static bool tgsi_is_array_image(unsigned target)
 	       target == TGSI_TEXTURE_2D_ARRAY_MSAA;
 }
 
+/**
+ * Given a 256-bit resource descriptor, force the DCC enable bit to off.
+ *
+ * At least on Tonga, executing image stores on images with DCC enabled and
+ * non-trivial can eventually lead to lockups. This can occur when an
+ * application binds an image as read-only but then uses a shader that writes
+ * to it. The OpenGL spec allows almost arbitrarily bad behavior (including
+ * program termination) in this case, but it doesn't cost much to be a bit
+ * nicer: disabling DCC in the shader still leads to undefined results but
+ * avoids the lockup.
+ */
+static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
+				  LLVMValueRef rsrc)
+{
+	if (ctx->screen->b.chip_class <= CIK) {
+		return rsrc;
+	} else {
+		LLVMBuilderRef builder = ctx->radeon_bld.gallivm.builder;
+		LLVMValueRef i32_6 = LLVMConstInt(ctx->i32, 6, 0);
+		LLVMValueRef i32_C = LLVMConstInt(ctx->i32, C_008F28_COMPRESSION_EN, 0);
+		LLVMValueRef tmp;
+
+		tmp = LLVMBuildExtractElement(builder, rsrc, i32_6, "");
+		tmp = LLVMBuildAnd(builder, tmp, i32_C, "");
+		return LLVMBuildInsertElement(builder, rsrc, tmp, i32_6, "");
+	}
+}
+
 /**
  * Load the resource descriptor for \p image.
  */
@@ -2770,6 +2798,7 @@ static void
 image_fetch_rsrc(
 	struct lp_build_tgsi_context *bld_base,
 	const struct tgsi_full_src_register *image,
+	bool dcc_off,
 	LLVMValueRef *rsrc)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
@@ -2783,11 +2812,15 @@ image_fetch_rsrc(
 		/* Indexing and manual load */
 		LLVMValueRef ind_index;
 		LLVMValueRef rsrc_ptr;
+		LLVMValueRef tmp;
 
 		ind_index = get_indirect_index(ctx, &image->Indirect, image->Register.Index);
 
 		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
-		*rsrc = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
+		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);
+		if (dcc_off)
+			tmp = force_dcc_off(ctx, tmp);
+		*rsrc = tmp;
 	}
 }
 
@@ -2895,7 +2928,7 @@ static void load_fetch_args(
 
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 
-	image_fetch_rsrc(bld_base, &inst->Src[0], &rsrc);
+	image_fetch_rsrc(bld_base, &inst->Src[0], false, &rsrc);
 	coords = image_fetch_coords(bld_base, inst, 1);
 
 	if (target == TGSI_TEXTURE_BUFFER) {
@@ -2964,7 +2997,6 @@ static void store_fetch_args(
 	emit_data->dst_type = LLVMVoidTypeInContext(gallivm->context);
 
 	image = tgsi_full_src_register_from_dst(&inst->Dst[0]);
-	image_fetch_rsrc(bld_base, &image, &rsrc);
 	coords = image_fetch_coords(bld_base, inst, 0);
 
 	for (chan = 0; chan < 4; ++chan) {
@@ -2973,6 +3005,7 @@ static void store_fetch_args(
 	data = lp_build_gather_values(gallivm, chans, 4);
 
 	if (target == TGSI_TEXTURE_BUFFER) {
+		image_fetch_rsrc(bld_base, &image, false, &rsrc);
 		emit_data->args[0] = data;
 		emit_data->arg_count = 1;
 
@@ -2980,7 +3013,7 @@ static void store_fetch_args(
 	} else {
 		emit_data->args[0] = data;
 		emit_data->args[1] = coords;
-		emit_data->args[2] = rsrc;
+		image_fetch_rsrc(bld_base, &image, true, &emit_data->args[2]);
 		emit_data->args[3] = lp_build_const_int32(gallivm, 15); /* dmask */
 		emit_data->arg_count = 4;
 
@@ -3035,7 +3068,8 @@ static void atomic_fetch_args(
 
 	emit_data->dst_type = bld_base->base.elem_type;
 
-	image_fetch_rsrc(bld_base, &inst->Src[0], &rsrc);
+	image_fetch_rsrc(bld_base, &inst->Src[0], target != TGSI_TEXTURE_BUFFER,
+			 &rsrc);
 	coords = image_fetch_coords(bld_base, inst, 1);
 
 	tmp = lp_build_emit_fetch(bld_base, inst, 2, 0);
@@ -3108,11 +3142,11 @@ static void resq_fetch_args(
 	emit_data->dst_type = LLVMVectorType(bld_base->base.elem_type, 4);
 
 	if (tex_target == TGSI_TEXTURE_BUFFER) {
-		image_fetch_rsrc(bld_base, reg, &emit_data->args[0]);
+		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[0]);
 		emit_data->arg_count = 1;
 	} else {
 		emit_data->args[0] = bld_base->uint_bld.zero; /* mip level */
-		image_fetch_rsrc(bld_base, reg, &emit_data->args[1]);
+		image_fetch_rsrc(bld_base, reg, false, &emit_data->args[1]);
 		emit_data->args[2] = lp_build_const_int32(gallivm, 15); /* dmask */
 		emit_data->args[3] = bld_base->uint_bld.zero; /* unorm */
 		emit_data->args[4] = bld_base->uint_bld.zero; /* r128 */
@@ -4622,6 +4656,7 @@ static void preload_samplers(struct si_shader_context *ctx)
 static void preload_images(struct si_shader_context *ctx)
 {
 	struct lp_build_tgsi_context *bld_base = &ctx->radeon_bld.soa.bld_base;
+	struct tgsi_shader_info *info = &ctx->shader->selector->info;
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	unsigned num_images = bld_base->info->file_max[TGSI_FILE_IMAGE] + 1;
 	LLVMValueRef res_ptr;
@@ -4634,9 +4669,15 @@ static void preload_images(struct si_shader_context *ctx)
 
 	for (i = 0; i < num_images; ++i) {
 		/* Rely on LLVM to shrink the load for buffer resources. */
-		ctx->images[i] =
+		LLVMValueRef rsrc =
 			build_indexed_load_const(ctx, res_ptr,
 						 lp_build_const_int32(gallivm, i));
+
+		if (info->images_writemask & (1 << i) &&
+		    !(info->images_buffers & (1 << i)))
+			rsrc = force_dcc_off(ctx, rsrc);
+
+		ctx->images[i] = rsrc;
 	}
 }
 

From 79762e877cd9b439d5f7697d3fea8d930ab05646 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 15 Mar 2016 20:54:30 -0500
Subject: [PATCH 126/197] tgsi/scan: add writes_memory to flag presence of
 stores or atomics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 12 ++++++++----
 src/gallium/auxiliary/tgsi/tgsi_scan.h |  1 +
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 65bdab5b1cd..d32c3a14344 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -196,10 +196,13 @@ scan_instruction(struct tgsi_shader_info *info,
       if (is_memory_file(src->Register.File)) {
          is_mem_inst = true;
 
-         if (src->Register.File == TGSI_FILE_IMAGE &&
-             !src->Register.Indirect &&
-             tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store)
-            info->images_writemask |= 1 << src->Register.Index;
+         if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_store) {
+            info->writes_memory = TRUE;
+
+            if (src->Register.File == TGSI_FILE_IMAGE &&
+                !src->Register.Indirect)
+               info->images_writemask |= 1 << src->Register.Index;
+         }
       }
    }
 
@@ -215,6 +218,7 @@ scan_instruction(struct tgsi_shader_info *info,
          assert(fullinst->Instruction.Opcode == TGSI_OPCODE_STORE);
 
          is_mem_inst = true;
+         info->writes_memory = TRUE;
 
          if (dst->Register.File == TGSI_FILE_IMAGE &&
              !dst->Register.Indirect)
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index d777f23749b..76d8925119e 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -111,6 +111,7 @@ struct tgsi_shader_info
    boolean writes_clipvertex;
    boolean writes_viewport_index;
    boolean writes_layer;
+   boolean writes_memory; /**< contains stores or atomics to buffers or images */
    boolean is_msaa_sampler[PIPE_MAX_SAMPLERS];
    boolean uses_doubles; /**< uses any of the double instructions */
    unsigned clipdist_writemask;

From 6f942ac5eedec5b5517618c52434d7c0794163c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 15 Mar 2016 20:58:12 -0500
Subject: [PATCH 127/197] radeonsi: disable early Z if the fragment shader
 writes to memory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Empirically, both the EXEC_ON_* flags and LATE_Z are necessary.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state_shaders.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index d69bb2e317a..02489583423 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -794,9 +794,15 @@ static void si_shader_ps(struct si_shader *shader)
 	 * - the shader uses at least 2 VMEM instructions, or
 	 * - the code size is at least 50 2-dword instructions or 100 1-dword
 	 *   instructions.
+	 *
+	 * Shaders with side effects that must execute independently of the
+	 * depth test require LATE_Z.
 	 */
-	if (info->num_memory_instructions >= 2 ||
-	    shader->binary.code_size > 100*4)
+	if (info->writes_memory &&
+	    !info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
+		shader->z_order = V_02880C_LATE_Z;
+	else if (info->num_memory_instructions >= 2 ||
+	         shader->binary.code_size > 100*4)
 		shader->z_order = V_02880C_EARLY_Z_THEN_RE_Z;
 	else
 		shader->z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
@@ -1182,6 +1188,10 @@ static void *si_create_shader_selector(struct pipe_context *ctx,
 	if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
 		sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1);
 
+	if (sel->info.writes_memory)
+		sel->db_shader_control |= S_02880C_EXEC_ON_HIER_FAIL(1) |
+					  S_02880C_EXEC_ON_NOOP(1);
+
 	/* Compile the main shader part for use with a prolog and/or epilog. */
 	if (sel->type != PIPE_SHADER_GEOMETRY &&
 	    !sscreen->use_monolithic_shaders) {

From 5219eb15e12903a10c0aea22a7460bb6867a958e Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Mon, 11 Jan 2016 00:50:32 +1100
Subject: [PATCH 128/197] radeonsi: Set PIPE_SHADER_CAP_MAX_SHADER_IMAGES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This enables ARB_shader_image_load_store and ARB_shader_image_size.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
[allow the same number of images for all shader stages and require LLVM 3.9]

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 042cfc764fd..dd1103eed06 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -538,8 +538,9 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
 		return 0;
+	case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+		return HAVE_LLVM >= 0x0309 ? SI_NUM_IMAGES : 0;
 	}
 	return 0;
 }

From b74784638df4c6b1d25aa04044946e380ee61c28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 15 Mar 2016 13:08:21 -0500
Subject: [PATCH 129/197] docs: mark GL_ARB_shader_image_load_store/_size as
 done for radeonsi
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 docs/GL3.txt              | 4 ++--
 docs/relnotes/11.3.0.html | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 3058996ad16..89cc6620328 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -154,7 +154,7 @@ GL 4.2, GLSL 4.20:
   GL_ARB_texture_storage                                DONE (all drivers)
   GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_base_instance                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                        DONE (i965)
+  GL_ARB_shader_image_load_store                        DONE (i965, radeonsi)
   GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_packing                       DONE (all drivers)
@@ -178,7 +178,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query                        DONE (all drivers)
   GL_ARB_robust_buffer_access_behavior                  not started
-  GL_ARB_shader_image_size                              DONE (i965)
+  GL_ARB_shader_image_size                              DONE (i965, radeonsi)
   GL_ARB_shader_storage_buffer_object                   DONE (i965, nvc0)
   GL_ARB_stencil_texturing                              DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_buffer_range                           DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index c31296ef9b1..04564a8f680 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -46,6 +46,8 @@ Note: some of the new features are only available with certain drivers.
 <ul>
 <li>GL_ARB_internalformat_query2 on i965</li>
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
+<li>GL_ARB_shader_image_load_store on radeonsi</li>
+<li>GL_ARB_shader_image_size on radeonsi</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>

From eee8a53906f72635423931430e667159c88613bb Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 8 Mar 2016 00:34:14 -0800
Subject: [PATCH 130/197] meta: Make BlitFramebuffer() do sRGB encoding in ES
 3.x.

According to the ES 3.0 and GL 4.4 specifications, glBlitFramebuffer
is supposed to perform sRGB decoding and encoding whenever sRGB formats
are in use.  The ES 3.0 specification is completely clear, and has
always stated this.

However, the GL specification has changed behavior in 4.1, 4.2, and
4.4.  The original behavior stated that no sRGB encoding should occur.
The 4.4 behavior matches ES 3.0's wording.  However, implementing the
new behavior appears to break applications such as Left 4 Dead 2.

This patch changes Meta to apply the ES 3.x rules in ES 3.x, but
leaves OpenGL alone for now, to avoid breaking applications.

Meta implements several other functions in terms of BlitFramebuffer,
and many of those explicitly do not perform sRGB encoding.  So, this
patch explicitly disables sRGB encoding in those other functions,
preserving the existing (correct) behavior.

If you're from the future and are reading this, hi!  Welcome to
the "fun" of debugging sRGB problems!  Best of luck!

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/common/meta_blit.c         | 43 ++++++++++++++++-----
 src/mesa/drivers/common/meta_copy_image.c   |  3 ++
 src/mesa/drivers/common/meta_tex_subimage.c |  6 +++
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index 0066f7f9184..6761238b014 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -597,6 +597,7 @@ blitframebuffer_texture(struct gl_context *ctx,
                         GLenum filter, GLint flipX, GLint flipY,
                         GLboolean glsl_version, GLboolean do_depth)
 {
+   struct save_state *save = &ctx->Meta->Save[ctx->Meta->SaveStackDepth - 1];
    int att_index = do_depth ? BUFFER_DEPTH : readFb->_ColorReadBufferIndex;
    const struct gl_renderbuffer_attachment *readAtt =
       &readFb->Attachment[att_index];
@@ -709,7 +710,7 @@ blitframebuffer_texture(struct gl_context *ctx,
    fb_tex_blit.samp_obj = _mesa_meta_setup_sampler(ctx, texObj, target, filter,
                                                    srcLevel);
 
-   /* Always do our blits with no net sRGB decode or encode.
+   /* For desktop GL, we do our blits with no net sRGB decode or encode.
     *
     * However, if both the src and dst can be srgb decode/encoded, enable them
     * so that we do any blending (from scaling or from MSAA resolves) in the
@@ -723,18 +724,42 @@ blitframebuffer_texture(struct gl_context *ctx,
     *      scissor test."
     *
     * The GL 4.4 specification disagrees and says that the sRGB part of the
-    * fragment pipeline applies, but this was found to break applications.
+    * fragment pipeline applies, but this was found to break applications
+    * (such as Left 4 Dead 2).
+    *
+    * However, for ES 3.0, we follow the specification and perform sRGB
+    * decoding and encoding.  The specification has always been clear in
+    * the ES world, and hasn't changed over time.
     */
    if (ctx->Extensions.EXT_texture_sRGB_decode) {
-      if (_mesa_get_format_color_encoding(rb->Format) == GL_SRGB &&
-          drawFb->Visual.sRGBCapable) {
+      bool src_srgb = _mesa_get_format_color_encoding(rb->Format) == GL_SRGB;
+      if (save->API == API_OPENGLES2 && ctx->Version >= 30) {
+         /* From the ES 3.0.4 specification, page 198:
+          * "When values are taken from the read buffer, if the value of
+          *  FRAMEBUFFER_ATTACHMENT_COLOR_ENCODING for the framebuffer
+          *  attachment corresponding to the read buffer is SRGB (see section
+          *  6.1.13), the red, green, and blue components are converted from
+          *  the non-linear sRGB color space according to equation 3.24.
+          *
+          *  When values are written to the draw buffers, blit operations
+          *  bypass the fragment pipeline. The only fragment operations which
+          *  affect a blit are the pixel ownership test, the scissor test,
+          *  and sRGB conversion (see section 4.1.8)."
+          */
          _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
-                                       GL_DECODE_EXT);
-         _mesa_set_framebuffer_srgb(ctx, GL_TRUE);
+                                       src_srgb ? GL_DECODE_EXT
+                                                : GL_SKIP_DECODE_EXT);
+         _mesa_set_framebuffer_srgb(ctx, drawFb->Visual.sRGBCapable);
       } else {
-         _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
-                                       GL_SKIP_DECODE_EXT);
-         /* set_framebuffer_srgb was set by _mesa_meta_begin(). */
+         if (src_srgb && drawFb->Visual.sRGBCapable) {
+            _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
+                                          GL_DECODE_EXT);
+            _mesa_set_framebuffer_srgb(ctx, GL_TRUE);
+         } else {
+            _mesa_set_sampler_srgb_decode(ctx, fb_tex_blit.samp_obj,
+                                          GL_SKIP_DECODE_EXT);
+            /* set_framebuffer_srgb was set by _mesa_meta_begin(). */
+         }
       }
    }
 
diff --git a/src/mesa/drivers/common/meta_copy_image.c b/src/mesa/drivers/common/meta_copy_image.c
index 18b9681b710..9402a4652eb 100644
--- a/src/mesa/drivers/common/meta_copy_image.c
+++ b/src/mesa/drivers/common/meta_copy_image.c
@@ -269,6 +269,9 @@ _mesa_meta_CopyImageSubData_uncompressed(struct gl_context *ctx,
    if (status != GL_FRAMEBUFFER_COMPLETE)
       goto meta_end;
 
+   /* Explicitly disable sRGB encoding */
+   ctx->DrawBuffer->Visual.sRGBCapable = false;
+
    /* Since we've bound a new draw framebuffer, we need to update its
     * derived state -- _Xmin, etc -- for BlitFramebuffer's clipping to
     * be correct.
diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index dfd3327dd55..62c3fce4249 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -263,6 +263,9 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims,
    if (status != GL_FRAMEBUFFER_COMPLETE)
       goto fail;
 
+   /* Explicitly disable sRGB encoding */
+   ctx->DrawBuffer->Visual.sRGBCapable = false;
+
    _mesa_update_state(ctx);
 
    if (_mesa_meta_BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
@@ -420,6 +423,9 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    if (status != GL_FRAMEBUFFER_COMPLETE)
       goto fail;
 
+   /* Explicitly disable sRGB encoding */
+   ctx->DrawBuffer->Visual.sRGBCapable = false;
+
    _mesa_update_state(ctx);
 
    if (_mesa_meta_BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,

From 8679bb7c9e8bcf25639664fa2bd02cd2a3de9e52 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 16 Mar 2016 20:15:52 -0700
Subject: [PATCH 131/197] i965/blorp: Refactor sRGB encoding/decoding.

Because the rules for sRGB are so insane, we change brw_blorp_miptrees
to take decode_srgb and encode_srgb flags, which control linearization
of the source and destination separately.

This should make it easy to implement whatever crazy combination of
rules people throw at us.  For now, it should be equivalent.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_blorp.cpp       |  7 +++----
 src/mesa/drivers/dri/i965/brw_blorp.h         |  3 ++-
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp  | 18 ++++++++++++++----
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c |  6 ++++--
 4 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp.cpp b/src/mesa/drivers/dri/i965/brw_blorp.cpp
index 4497eab3bf0..38a32361f0b 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp.cpp
@@ -115,12 +115,11 @@ brw_blorp_surface_info::set(struct brw_context *brw,
       this->brw_surfaceformat = BRW_SURFACEFORMAT_R16_UNORM;
       break;
    default: {
-      mesa_format linear_format = _mesa_get_srgb_format_linear(format);
       if (is_render_target) {
-         assert(brw->format_supported_as_render_target[linear_format]);
-         this->brw_surfaceformat = brw->render_target_format[linear_format];
+         assert(brw->format_supported_as_render_target[format]);
+         this->brw_surfaceformat = brw->render_target_format[format];
       } else {
-         this->brw_surfaceformat = brw_format_for_mesa_format(linear_format);
+         this->brw_surfaceformat = brw_format_for_mesa_format(format);
       }
       break;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index a04a1dfa719..f04e1969351 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -46,7 +46,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
                         float src_x1, float src_y1,
                         float dst_x0, float dst_y0,
                         float dst_x1, float dst_y1,
-                        GLenum filter, bool mirror_x, bool mirror_y);
+                        GLenum filter, bool mirror_x, bool mirror_y,
+                        bool decode_srgb, bool encode_srgb);
 
 #ifdef __cplusplus
 } /* end extern "C" */
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 05fff91ed57..df5d7ace775 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -63,7 +63,8 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
                         float src_x1, float src_y1,
                         float dst_x0, float dst_y0,
                         float dst_x1, float dst_y1,
-                        GLenum filter, bool mirror_x, bool mirror_y)
+                        GLenum filter, bool mirror_x, bool mirror_y,
+                        bool decode_srgb, bool encode_srgb)
 {
    /* Get ready to blit.  This includes depth resolving the src and dst
     * buffers if necessary.  Note: it's not necessary to do a color resolve on
@@ -89,6 +90,12 @@ brw_blorp_blit_miptrees(struct brw_context *brw,
        dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
        mirror_x, mirror_y);
 
+   if (!decode_srgb && _mesa_get_format_color_encoding(src_format) == GL_SRGB)
+      src_format = _mesa_get_srgb_format_linear(src_format);
+
+   if (!encode_srgb && _mesa_get_format_color_encoding(dst_format) == GL_SRGB)
+      dst_format = _mesa_get_srgb_format_linear(dst_format);
+
    brw_blorp_blit_params params(brw,
                                 src_mt, src_level, src_layer, src_format,
                                 dst_mt, dst_level, dst_layer, dst_format,
@@ -122,7 +129,8 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
                            dst_format,
                            srcX0, srcY0, srcX1, srcY1,
                            dstX0, dstY0, dstX1, dstY1,
-                           filter, mirror_x, mirror_y);
+                           filter, mirror_x, mirror_y,
+                           false, false);
 
    dst_irb->need_downsample = true;
 }
@@ -289,7 +297,8 @@ brw_blorp_copytexsubimage(struct brw_context *brw,
                            dst_image->TexFormat,
                            srcX0, srcY0, srcX1, srcY1,
                            dstX0, dstY0, dstX1, dstY1,
-                           GL_NEAREST, false, mirror_y);
+                           GL_NEAREST, false, mirror_y,
+                           false, false);
 
    /* If we're copying to a packed depth stencil texture and the source
     * framebuffer has separate stencil, we need to also copy the stencil data
@@ -314,7 +323,8 @@ brw_blorp_copytexsubimage(struct brw_context *brw,
                                  dst_mt->format,
                                  srcX0, srcY0, srcX1, srcY1,
                                  dstX0, dstY0, dstX1, dstY1,
-                                 GL_NEAREST, false, mirror_y);
+                                 GL_NEAREST, false, mirror_y,
+                                 false, false);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 6c233d84df9..9e84abb8d9f 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2172,7 +2172,8 @@ intel_miptree_updownsample(struct brw_context *brw,
                               src->logical_width0, src->logical_height0,
                               0, 0,
                               dst->logical_width0, dst->logical_height0,
-                              GL_NEAREST, false, false /*mirror x, y*/);
+                              GL_NEAREST, false, false /*mirror x, y*/,
+                              false, false);
    } else if (src->format == MESA_FORMAT_S_UINT8) {
       brw_meta_stencil_updownsample(brw, src, dst);
    } else {
@@ -2194,7 +2195,8 @@ intel_miptree_updownsample(struct brw_context *brw,
                               src->logical_width0, src->logical_height0,
                               0, 0,
                               dst->logical_width0, dst->logical_height0,
-                              GL_NEAREST, false, false /*mirror x, y*/);
+                              GL_NEAREST, false, false /*mirror x, y*/,
+                              false, false /* decode/encode srgb */);
    }
 }
 

From 4b0a5b21ae39756919d739015fcc835f2901337f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 16 Mar 2016 20:19:50 -0700
Subject: [PATCH 132/197] i965/blorp: Make BlitFramebuffer() do sRGB encoding
 in ES 3.x.

According to the ES 3.0 and GL 4.4 specifications, glBlitFramebuffer
is supposed to perform sRGB decoding and encoding whenever sRGB formats
are in use.  The ES 3.0 specification is completely clear, and has
always stated this.

However, the GL specification has changed behavior in 4.1, 4.2, and
4.4.  The original behavior stated that no sRGB encoding should occur.
The 4.4 behavior matches ES 3.0's wording.  However, implementing the
new behavior appears to break applications such as Left 4 Dead 2.

This patch changes BLORP to apply the ES 3.x rules in ES 3.x, but
leaves OpenGL alone for now, to avoid breaking applications.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_blorp_blit.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index df5d7ace775..5fd25f1ffe4 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -21,6 +21,7 @@
  * IN THE SOFTWARE.
  */
 
+#include "main/context.h"
 #include "main/teximage.h"
 #include "main/fbobject.h"
 
@@ -121,6 +122,8 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
    struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb);
    struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb);
 
+   const bool es3 = _mesa_is_gles3(&brw->ctx);
+
    /* Do the blit */
    brw_blorp_blit_miptrees(brw,
                            src_mt, src_irb->mt_level, src_irb->mt_layer,
@@ -130,7 +133,7 @@ do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
                            srcX0, srcY0, srcX1, srcY1,
                            dstX0, dstY0, dstX1, dstY1,
                            filter, mirror_x, mirror_y,
-                           false, false);
+                           es3, es3);
 
    dst_irb->need_downsample = true;
 }

From 9efd8b590f716bb7766ae6816bc080e7ef60a010 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 21 Mar 2016 13:15:44 +0100
Subject: [PATCH 133/197] nvc0: make sure to delete samplers used by compute
 shaders

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index 090a0395432..a100fc4c478 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -413,7 +413,7 @@ nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
 {
    unsigned s, i;
 
-   for (s = 0; s < 5; ++s)
+   for (s = 0; s < 6; ++s)
       for (i = 0; i < nvc0_context(pipe)->num_samplers[s]; ++i)
          if (nvc0_context(pipe)->samplers[s][i] == hwcso)
             nvc0_context(pipe)->samplers[s][i] = NULL;

From fce0b55ccbc33d320b9734a53c2a9f7886450c73 Mon Sep 17 00:00:00 2001
From: xavier <xavierb@gmail.com>
Date: Wed, 9 Mar 2016 09:58:48 +0100
Subject: [PATCH 134/197] r600/sb: Do not distribute neg in
 expr_handler::fold_assoc() when folding multiplications.

Previously it was doing this transformation for a Trine 3 shader:
     MUL     R6.x.12,    R13.x.23, 0.5|3f000000
-    MULADD     R4.x.12,    -R6.x.12, 2|40000000, 1|3f800000
+    MULADD     R4.x.12,    -R13.x.23, -1|bf800000, 1|3f800000

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94412
Signed-off-by: Xavier Bouchoux <xavierb@gmail.com>
Cc: "11.0 11.1 11.2" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Glenn Kennard <glenn.kennard@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/sb/sb_expr.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp
index 556a05da395..3dd3a4815ba 100644
--- a/src/gallium/drivers/r600/sb/sb_expr.cpp
+++ b/src/gallium/drivers/r600/sb/sb_expr.cpp
@@ -598,9 +598,13 @@ bool expr_handler::fold_assoc(alu_node *n) {
 
 	unsigned op = n->bc.op;
 	bool allow_neg = false, cur_neg = false;
+	bool distribute_neg = false;
 
 	switch(op) {
 	case ALU_OP2_ADD:
+		distribute_neg = true;
+		allow_neg = true;
+		break;
 	case ALU_OP2_MUL:
 	case ALU_OP2_MUL_IEEE:
 		allow_neg = true;
@@ -632,7 +636,7 @@ bool expr_handler::fold_assoc(alu_node *n) {
 		if (v1->is_const()) {
 			literal arg = v1->get_const_value();
 			apply_alu_src_mod(a->bc, 1, arg);
-			if (cur_neg)
+			if (cur_neg && distribute_neg)
 				arg.f = -arg.f;
 
 			if (a == n)
@@ -660,7 +664,7 @@ bool expr_handler::fold_assoc(alu_node *n) {
 		if (v0->is_const()) {
 			literal arg = v0->get_const_value();
 			apply_alu_src_mod(a->bc, 0, arg);
-			if (cur_neg)
+			if (cur_neg && distribute_neg)
 				arg.f = -arg.f;
 
 			if (last_arg == 0) {

From 65cd2f8443e0226797204be73033a852682c3da3 Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Mon, 14 Mar 2016 10:25:50 -0700
Subject: [PATCH 135/197] swrast: Move assert for 'slice' into
 check_map_teximage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
---
 src/mesa/swrast/s_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/swrast/s_texture.c b/src/mesa/swrast/s_texture.c
index 9ccd0e34702..25918e3e037 100644
--- a/src/mesa/swrast/s_texture.c
+++ b/src/mesa/swrast/s_texture.c
@@ -188,6 +188,7 @@ check_map_teximage(const struct gl_texture_image *texImage,
    assert(y < texImage->Height || texImage->Height == 0);
    assert(x + w <= texImage->Width);
    assert(y + h <= texImage->Height);
+   assert(slice < texture_slices(texImage));
 }
 
 /**
@@ -240,7 +241,6 @@ _swrast_map_teximage(struct gl_context *ctx,
    assert(swImage->Buffer);
    assert(swImage->Buffer == swImage->ImageSlices[0]);
 
-   assert(slice < texture_slices(texImage));
    map = swImage->ImageSlices[slice];
 
    /* apply x/y offset to map address */

From 4ba47f7b2adf71ed100cd390a1a9fbd4434e896a Mon Sep 17 00:00:00 2001
From: Anuj Phogat <anuj.phogat@gmail.com>
Date: Fri, 11 Mar 2016 15:24:36 -0800
Subject: [PATCH 136/197] i965: Fix assert conditions for src/dst x/y offsets

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/intel_copy_image.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c
index 08b7623e63d..ccb82b64d5f 100644
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -140,9 +140,9 @@ copy_image_with_memcpy(struct brw_context *brw,
    _mesa_get_format_block_size(src_mt->format, &src_bw, &src_bh);
 
    assert(src_width % src_bw == 0);
-   assert(src_height % src_bw == 0);
+   assert(src_height % src_bh == 0);
    assert(src_x % src_bw == 0);
-   assert(src_y % src_bw == 0);
+   assert(src_y % src_bh == 0);
 
    /* If we are on the same miptree, same level, and same slice, then
     * intel_miptree_map won't let us map it twice.  We have to do things a
@@ -153,7 +153,7 @@ copy_image_with_memcpy(struct brw_context *brw,
 
    if (same_slice) {
       assert(dst_x % src_bw == 0);
-      assert(dst_y % src_bw == 0);
+      assert(dst_y % src_bh == 0);
 
       map_x1 = MIN2(src_x, dst_x);
       map_y1 = MIN2(src_y, dst_y);

From ee7c8b98047f808baadb0704b1cc7eb02b510b98 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 4 Mar 2016 12:33:46 +1000
Subject: [PATCH 137/197] st/mesa: add support for internalformat query2.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add code to handle GL_INTERNALFORMAT_PREFERRED.
Add code to deal with GL_RENDERBUFFER being passed into ChooseTextureFormat.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                           |  2 +-
 src/mesa/state_tracker/st_extensions.c |  1 +
 src/mesa/state_tracker/st_format.c     | 52 +++++++++++++++++++++-----
 3 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 89cc6620328..88c14c4c67d 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -173,7 +173,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_explicit_uniform_location                      DONE (all drivers that support GLSL)
   GL_ARB_fragment_layer_viewport                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
   GL_ARB_framebuffer_no_attachments                     DONE (i965)
-  GL_ARB_internalformat_query2                          DONE (i965)
+  GL_ARB_internalformat_query2                          DONE (all drivers)
   GL_ARB_invalidate_subdata                             DONE (all drivers)
   GL_ARB_multi_draw_indirect                            DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query                        DONE (all drivers)
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 988e9049a20..2fdaba073a2 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -790,6 +790,7 @@ void st_init_extensions(struct pipe_screen *screen,
    extensions->ARB_fragment_shader = GL_TRUE;
    extensions->ARB_half_float_vertex = GL_TRUE;
    extensions->ARB_internalformat_query = GL_TRUE;
+   extensions->ARB_internalformat_query2 = GL_TRUE;
    extensions->ARB_map_buffer_range = GL_TRUE;
    extensions->ARB_texture_border_clamp = GL_TRUE; /* XXX temp */
    extensions->ARB_texture_cube_map = GL_TRUE;
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 161c7678236..4b5f8199c6c 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -2201,7 +2201,15 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
    enum pipe_format pFormat;
    mesa_format mFormat;
    unsigned bindings;
-   enum pipe_texture_target pTarget = gl_target_to_pipe(target);
+   bool is_renderbuffer = false;
+   enum pipe_texture_target pTarget;
+
+   if (target == GL_RENDERBUFFER) {
+      pTarget = PIPE_TEXTURE_2D;
+      is_renderbuffer = true;
+   } else {
+      pTarget = gl_target_to_pipe(target);
+   }
 
    if (target == GL_TEXTURE_1D || target == GL_TEXTURE_1D_ARRAY) {
       /* We don't do compression for these texture targets because of
@@ -2219,7 +2227,7 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
    bindings = PIPE_BIND_SAMPLER_VIEW;
    if (_mesa_is_depth_or_stencil_format(internalFormat))
       bindings |= PIPE_BIND_DEPTH_STENCIL;
-   else if (internalFormat == 3 || internalFormat == 4 ||
+   else if (is_renderbuffer || internalFormat == 3 || internalFormat == 4 ||
             internalFormat == GL_RGB || internalFormat == GL_RGBA ||
             internalFormat == GL_RGB8 || internalFormat == GL_RGBA8 ||
             internalFormat == GL_BGRA ||
@@ -2252,19 +2260,21 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
          if (pFormat != PIPE_FORMAT_NONE)
             return st_pipe_format_to_mesa_format(pFormat);
 
-         /* try choosing format again, this time without render target bindings */
-         pFormat = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW,
-                                             format, type,
-                                             ctx->Unpack.SwapBytes);
-         if (pFormat != PIPE_FORMAT_NONE)
-            return st_pipe_format_to_mesa_format(pFormat);
+         if (!is_renderbuffer) {
+            /* try choosing format again, this time without render target bindings */
+            pFormat = st_choose_matching_format(st, PIPE_BIND_SAMPLER_VIEW,
+                                                format, type,
+                                                ctx->Unpack.SwapBytes);
+            if (pFormat != PIPE_FORMAT_NONE)
+               return st_pipe_format_to_mesa_format(pFormat);
+         }
       }
    }
 
    pFormat = st_choose_format(st, internalFormat, format, type,
                               pTarget, 0, bindings, ctx->Mesa_DXTn);
 
-   if (pFormat == PIPE_FORMAT_NONE) {
+   if (pFormat == PIPE_FORMAT_NONE && !is_renderbuffer) {
       /* try choosing format again, this time without render target bindings */
       pFormat = st_choose_format(st, internalFormat, format, type,
                                  pTarget, 0, PIPE_BIND_SAMPLER_VIEW,
@@ -2342,6 +2352,7 @@ void
 st_QueryInternalFormat(struct gl_context *ctx, GLenum target,
                        GLenum internalFormat, GLenum pname, GLint *params)
 {
+   struct st_context *st = st_context(ctx);
    /* The API entry-point gives us a temporary params buffer that is non-NULL
     * and guaranteed to have at least 16 elements.
     */
@@ -2359,7 +2370,30 @@ st_QueryInternalFormat(struct gl_context *ctx, GLenum target,
       params[0] = (GLint) num_samples;
       break;
    }
+   case GL_INTERNALFORMAT_PREFERRED: {
+      params[0] = GL_NONE;
 
+      /* We need to resolve an internal format that is compatible with
+       * the passed internal format, and optimal for the driver. For now,
+       * we just validate that the passed internal format is supported by
+       * the driver, and if so return the same internal format, otherwise
+       * return GL_NONE.
+       */
+      uint usage;
+      if (_mesa_is_depth_or_stencil_format(internalFormat))
+         usage = PIPE_BIND_DEPTH_STENCIL;
+      else
+         usage = PIPE_BIND_RENDER_TARGET;
+      enum pipe_format pformat = st_choose_format(st,
+                                                  internalFormat,
+                                                  GL_NONE,
+                                                  GL_NONE,
+                                                  PIPE_TEXTURE_2D, 1,
+                                                  usage, FALSE);
+      if (pformat)
+         params[0] = internalFormat;
+      break;
+   }
    default:
       /* For the rest of the pnames, we call back the Mesa's default
        * function for drivers that don't implement ARB_internalformat_query2.

From 1e8435ce0cce671024ebf9c5465ea8bdcb563b69 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 09:53:47 +1000
Subject: [PATCH 138/197] docs/relnotes: update ARB_internalformat_query2
 status.

Signed-off-by: Dave Airlie <Airlied@redhat.com>
---
 docs/relnotes/11.3.0.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 04564a8f680..acd8e11e3fc 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -44,7 +44,7 @@ Note: some of the new features are only available with certain drivers.
 </p>
 
 <ul>
-<li>GL_ARB_internalformat_query2 on i965</li>
+<li>GL_ARB_internalformat_query2 on all drivers</li>
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
 <li>GL_ARB_shader_image_load_store on radeonsi</li>
 <li>GL_ARB_shader_image_size on radeonsi</li>

From 53afbc980a973bf1e5c4a479aa277e8df8d9698f Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 10:28:44 +1000
Subject: [PATCH 139/197] tgsi: drop unused set_exec/kill_mask interfaces.

These aren't used and, from what I can see in the git history, never
have been, so drop them.

Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/draw/draw_gs.c      |  6 ------
 src/gallium/auxiliary/draw/draw_vs_exec.c |  6 ------
 src/gallium/auxiliary/tgsi/tgsi_exec.h    | 25 -----------------------
 3 files changed, 37 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 6b33341ce6c..fcef31b4ff5 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -206,12 +206,6 @@ static unsigned tgsi_gs_run(struct draw_geometry_shader *shader,
 {
    struct tgsi_exec_machine *machine = shader->machine;
 
-   tgsi_set_exec_mask(machine,
-                      1,
-                      input_primitives > 1,
-                      input_primitives > 2,
-                      input_primitives > 3);
-
    /* run interpreter */
    tgsi_exec_machine_run(machine);
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index abd64f5acd2..3fd8ef3cd2f 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -159,12 +159,6 @@ vs_exec_run_linear( struct draw_vertex_shader *shader,
          input = (const float (*)[4])((const char *)input + input_stride);
       } 
 
-      tgsi_set_exec_mask(machine,
-                         1,
-                         max_vertices > 1,
-                         max_vertices > 2,
-                         max_vertices > 3);
-
       /* run interpreter */
       tgsi_exec_machine_run( machine );
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 12a68759ce5..991c3bfc5db 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -196,10 +196,6 @@ struct tgsi_sampler
 #define TGSI_EXEC_TEMP_HALF_I       (TGSI_EXEC_NUM_TEMPS + 3)
 #define TGSI_EXEC_TEMP_HALF_C       0
 
-/* execution mask, each value is either 0 or ~0 */
-#define TGSI_EXEC_MASK_I            (TGSI_EXEC_NUM_TEMPS + 3)
-#define TGSI_EXEC_MASK_C            1
-
 /* 4 register buffer for various purposes */
 #define TGSI_EXEC_TEMP_R0           (TGSI_EXEC_NUM_TEMPS + 4)
 #define TGSI_EXEC_NUM_TEMP_R        4
@@ -397,27 +393,6 @@ boolean
 tgsi_check_soa_dependencies(const struct tgsi_full_instruction *inst);
 
 
-static inline void
-tgsi_set_kill_mask(struct tgsi_exec_machine *mach, unsigned mask)
-{
-   mach->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0] =
-      mask;
-}
-
-
-/** Set execution mask values prior to executing the shader */
-static inline void
-tgsi_set_exec_mask(struct tgsi_exec_machine *mach,
-                   boolean ch0, boolean ch1, boolean ch2, boolean ch3)
-{
-   int *mask = mach->Temps[TGSI_EXEC_MASK_I].xyzw[TGSI_EXEC_MASK_C].i;
-   mask[0] = ch0 ? ~0 : 0;
-   mask[1] = ch1 ? ~0 : 0;
-   mask[2] = ch2 ? ~0 : 0;
-   mask[3] = ch3 ? ~0 : 0;
-}
-
-
 extern void
 tgsi_exec_set_constant_buffers(struct tgsi_exec_machine *mach,
                                unsigned num_bufs,

From 530593da65c0205539fe4bd7bcf7c01e3eba723d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= <marcandre.lureau@redhat.com>
Date: Fri, 18 Mar 2016 20:01:07 +0100
Subject: [PATCH 140/197] i965: fix invalid memory write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I noticed some heap corruption running virgl tests, and valgrind
helped me to track it down to the following error:

==29272== Invalid write of size 4
==29272==    at 0x90283D4: push_loop_stack (brw_eu_emit.c:1307)
==29272==    by 0x9029A7D: brw_DO (brw_eu_emit.c:1750)
==29272==    by 0x90554B0: fs_generator::generate_code(cfg_t const*, int) (brw_fs_generator.cpp:1999)
==29272==    by 0x904491F: brw_compile_fs (brw_fs.cpp:5685)
==29272==    by 0x8FC5DC5: brw_codegen_wm_prog (brw_wm.c:137)
==29272==    by 0x8FC7663: brw_fs_precompile (brw_wm.c:638)
==29272==    by 0x8FA4040: brw_shader_precompile(gl_context*, gl_shader_program*) (brw_link.cpp:51)
==29272==    by 0x8FA4A9A: brw_link_shader (brw_link.cpp:260)
==29272==    by 0x8DEF751: _mesa_glsl_link_shader (ir_to_mesa.cpp:3006)
==29272==    by 0x8C84325: _mesa_link_program (shaderapi.c:1042)
==29272==    by 0x8C851D7: _mesa_LinkProgram (shaderapi.c:1515)
==29272==    by 0x4E4B8E8: add_shader_program (vrend_renderer.c:880)
==29272==  Address 0xf2f3cb0 is 0 bytes after a block of size 112 alloc'd
==29272==    at 0x4C2AA98: calloc (vg_replace_malloc.c:711)
==29272==    by 0x8ED11F7: ralloc_size (ralloc.c:113)
==29272==    by 0x8ED1282: rzalloc_size (ralloc.c:134)
==29272==    by 0x8ED14C0: rzalloc_array_size (ralloc.c:196)
==29272==    by 0x9019C7B: brw_init_codegen (brw_eu.c:291)
==29272==    by 0x904F565: fs_generator::fs_generator(brw_compiler const*, void*, void*, void const*, brw_stage_prog_data*, unsigned int, bool, gl_shader_stage) (brw_fs_generator.cpp:124)
==29272==    by 0x9044883: brw_compile_fs (brw_fs.cpp:5675)
==29272==    by 0x8FC5DC5: brw_codegen_wm_prog (brw_wm.c:137)
==29272==    by 0x8FC7663: brw_fs_precompile (brw_wm.c:638)
==29272==    by 0x8FA4040: brw_shader_precompile(gl_context*, gl_shader_program*) (brw_link.cpp:51)
==29272==    by 0x8FA4A9A: brw_link_shader (brw_link.cpp:260)
==29272==    by 0x8DEF751: _mesa_glsl_link_shader (ir_to_mesa.cpp:3006)

if_depth_in_loop is an array of size p->loop_stack_array_size, and
push_loop_stack() will access if_depth_in_loop[p->loop_stack_depth+1],
thus the condition to grow the array should be
p->loop_stack_array_size <= (p->loop_stack_depth + 1) (it's currently
off by 2...)

This can be reproduced by running the following test with virgl test
server:
LIBGL_ALWAYS_SOFTWARE=y GALLIUM_DRIVER=virpipe bin/shader_runner
./tests/shaders/glsl-fs-unroll-explosion.shader_test -auto

Signed-off-by: Marc-André Lureau <marcandre.lureau@redhat.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_eu_emit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 0b99356b27d..5b142525752 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1294,7 +1294,7 @@ pop_if_stack(struct brw_codegen *p)
 static void
 push_loop_stack(struct brw_codegen *p, brw_inst *inst)
 {
-   if (p->loop_stack_array_size < p->loop_stack_depth) {
+   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
       p->loop_stack_array_size *= 2;
       p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
 			       p->loop_stack_array_size);

From 18c5fa1122eddc2da0912749760356e570631163 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 22 Mar 2016 08:35:25 -0600
Subject: [PATCH 141/197] swrast: fix discarded const warning in s_texture.c

Signed-off-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/swrast/s_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/swrast/s_texture.c b/src/mesa/swrast/s_texture.c
index 25918e3e037..d35bea96b92 100644
--- a/src/mesa/swrast/s_texture.c
+++ b/src/mesa/swrast/s_texture.c
@@ -60,7 +60,7 @@ _swrast_delete_texture_image(struct gl_context *ctx,
 }
 
 static unsigned int
-texture_slices(struct gl_texture_image *texImage)
+texture_slices(const struct gl_texture_image *texImage)
 {
    if (texImage->TexObject->Target == GL_TEXTURE_1D_ARRAY)
       return texImage->Height;

From 9442db4f89156713bcb241803f6de7c0777262c5 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 7 Mar 2016 10:55:21 -0800
Subject: [PATCH 142/197] i965: Have NIR lower flrp on pre-GEN6 vec4 backend

Previously we were doing the lowering by hand in vec4_visitor::emit_lrp.
By doing it in NIR, we have the opportunity for NIR to do additional
optimization of the expanded code.

This also enables optimizations added by the next commit.

shader-db results:

G4X / Ironlake
total instructions in shared programs: 4024401 -> 4016538 (-0.20%)
instructions in affected programs: 447686 -> 439823 (-1.76%)
helped: 2623
HURT: 0

total cycles in shared programs: 84375846 -> 84328296 (-0.06%)
cycles in affected programs: 16964960 -> 16917410 (-0.28%)
helped: 2556
HURT: 41

Unsurprisingly, no changes on later platforms.

v2: Formatting and comment changes suggested by Matt.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_compiler.c | 28 ++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index 2f05a26e0e0..3da6aac2cbf 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -107,6 +107,26 @@ static const struct nir_shader_compiler_options vector_nir_options = {
     */
    .fdot_replicates = true,
 
+   /* Prior to Gen6, there are no three source operations for SIMD4x2. */
+   .lower_flrp = true,
+
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
+   COMMON_OPTIONS,
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
    .lower_pack_snorm_2x16 = true,
    .lower_pack_unorm_2x16 = true,
    .lower_unpack_snorm_2x16 = true,
@@ -159,8 +179,12 @@ brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
       if (devinfo->gen < 7)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
 
-      compiler->glsl_compiler_options[i].NirOptions =
-         is_scalar ? &scalar_nir_options : &vector_nir_options;
+      if (is_scalar) {
+         compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options;
+      } else {
+         compiler->glsl_compiler_options[i].NirOptions =
+            devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6;
+      }
 
       compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
    }

From a4079f1cb2d7142d798f6d7fcb7ef29f34976f5b Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 7 Mar 2016 13:09:30 -0800
Subject: [PATCH 143/197] nir: Lower flrp with Boolean interpolator to bcsel

On Intel platforms that don't set lower_flrp, using bcsel instead of
flrp seems to be a small amount worse.  On those platforms, the use of
flrp, bcsel, and multiply of b2f is still an active area of research.
In review, Matt suggested this is because bcsel turns into CMP+SEL, and
because of the flag register we can't schedule instructions well.

shader-db results:

G4X / Ironlake
total instructions in shared programs: 4016538 -> 4012279 (-0.11%)
instructions in affected programs: 161556 -> 157297 (-2.64%)
helped: 1077
HURT: 1

total cycles in shared programs: 84328296 -> 84315862 (-0.01%)
cycles in affected programs: 4174570 -> 4162136 (-0.30%)
helped: 926
HURT: 53

Unsurprisingly, no changes on later platforms.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 39be85f639e..8a44a7a0a66 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -83,10 +83,13 @@ optimizations = [
    (('flrp', a, b, 1.0), b),
    (('flrp', a, a, b), a),
    (('flrp', 0.0, a, b), ('fmul', a, b)),
+   (('flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
    (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'),
    (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
-   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c))), ('fmul', b, c)), ('flrp', a, b, c), '!options->lower_flrp'),
-   (('fadd', a, ('fmul', c, ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg',         c ))), ('fmul', b,         c )), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('fadd', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
    (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
    (('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
    # Comparison simplifications

From bf0d60aa115cb8ba83f9b853f1b57c290eaf325b Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 1 Mar 2016 18:59:57 -0800
Subject: [PATCH 144/197] nir: Simplify i2b with negated or abs operand

This enables removing ssa_201 and ssa_202 in sequences like:

                 vec1 ssa_200 = flt ssa_199, ssa_194
                 vec1 ssa_201 = b2i ssa_200
                 vec1 ssa_202 = i2b -ssa_201

shader-db results:

Sandy Bridge
total instructions in shared programs: 8462257 -> 8462180 (-0.00%)
instructions in affected programs: 3846 -> 3769 (-2.00%)
helped: 35
HURT: 0

total cycles in shared programs: 117542934 -> 117542462 (-0.00%)
cycles in affected programs: 20072 -> 19600 (-2.35%)
helped: 20
HURT: 1

Ivy Bridge
total instructions in shared programs: 7775252 -> 7775137 (-0.00%)
instructions in affected programs: 3645 -> 3530 (-3.16%)
helped: 35
HURT: 0

total cycles in shared programs: 65760522 -> 65760068 (-0.00%)
cycles in affected programs: 21082 -> 20628 (-2.15%)
helped: 25
HURT: 2

Haswell
total instructions in shared programs: 7108666 -> 7108589 (-0.00%)
instructions in affected programs: 3253 -> 3176 (-2.37%)
helped: 35
HURT: 0

total cycles in shared programs: 64675726 -> 64675272 (-0.00%)
cycles in affected programs: 21034 -> 20580 (-2.16%)
helped: 26
HURT: 1

Broadwell / Skylake
total instructions in shared programs: 8980912 -> 8980835 (-0.00%)
instructions in affected programs: 3223 -> 3146 (-2.39%)
helped: 35
HURT: 0

total cycles in shared programs: 70077926 -> 70077904 (-0.00%)
cycles in affected programs: 21886 -> 21864 (-0.10%)
helped: 21
HURT: 6

G45 and Ironlake showed no change.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Suggested-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 8a44a7a0a66..5b3694e1933 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -217,6 +217,8 @@ optimizations = [
    (('i2b', ('b2i', a)), a),
    (('f2i', ('ftrunc', a)), ('f2i', a)),
    (('f2u', ('ftrunc', a)), ('f2u', a)),
+   (('i2b', ('ineg', a)), ('i2b', a)),
+   (('i2b', ('iabs', a)), ('i2b', a)),
 
    # Byte extraction
    (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),

From 564a8b8a2645939cd416b62ae2e59c2fbd4b9fc2 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 2 Mar 2016 15:18:34 -0800
Subject: [PATCH 145/197] nir: Simplify 0 >= b2f(a)

This also prevented some regressions with other patches in my local
tree.

Broadwell / Skylake
total instructions in shared programs: 8980835 -> 8980833 (-0.00%)
instructions in affected programs: 45 -> 43 (-4.44%)
helped: 1
HURT: 0

total cycles in shared programs: 70077904 -> 70077900 (-0.00%)
cycles in affected programs: 122 -> 118 (-3.28%)
helped: 1
HURT: 0

No changes on earlier platforms.

v2: Modify the comments to look more like a proof.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 5b3694e1933..facc33b4e0a 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -101,6 +101,13 @@ optimizations = [
    (('inot', ('ige', a, b)), ('ilt', a, b)),
    (('inot', ('ieq', a, b)), ('ine', a, b)),
    (('inot', ('ine', a, b)), ('ieq', a, b)),
+
+   # 0.0 >= b2f(a)
+   # b2f(a) <= 0.0
+   # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
+   # inot(a)
+   (('fge', 0.0, ('b2f', a)), ('inot', a)),
+
    (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
    (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)),
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),

From 348e5a71d8bc1398c73b731d3e6d80fd3b122513 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 2 Mar 2016 15:36:14 -0800
Subject: [PATCH 146/197] nir: Simplify 0 < fabs(a)

Sandy Bridge / Ivy Bridge / Haswell
total instructions in shared programs: 8462180 -> 8462174 (-0.00%)
instructions in affected programs: 564 -> 558 (-1.06%)
helped: 6
HURT: 0

total cycles in shared programs: 117542462 -> 117542276 (-0.00%)
cycles in affected programs: 9768 -> 9582 (-1.90%)
helped: 12
HURT: 0

Broadwell / Skylake
total instructions in shared programs: 8980833 -> 8980826 (-0.00%)
instructions in affected programs: 626 -> 619 (-1.12%)
helped: 7
HURT: 0

total cycles in shared programs: 70077900 -> 70077714 (-0.00%)
cycles in affected programs: 9378 -> 9192 (-1.98%)
helped: 12
HURT: 0

G45 and Ironlake showed no change.

v2: Modify the comments to look more like a proof.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index facc33b4e0a..c013fd78d9e 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -108,6 +108,12 @@ optimizations = [
    # inot(a)
    (('fge', 0.0, ('b2f', a)), ('inot', a)),
 
+   # 0.0 < fabs(a)
+   # fabs(a) > 0.0
+   # fabs(a) != 0.0 because fabs(a) must be >= 0
+   # a != 0.0
+   (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
+
    (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
    (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)),
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),

From 2bb006af68e79657e92cb03a6880a786f31304f9 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 2 Mar 2016 13:46:50 -0800
Subject: [PATCH 147/197] nir: Don't abs the result of b2f or b2i

In the results below, 2 SIMD16 shaders in Trine are lost.

G4X
total instructions in shared programs: 4012279 -> 4011108 (-0.03%)
instructions in affected programs: 116776 -> 115605 (-1.00%)
helped: 339
HURT: 0

total cycles in shared programs: 84315862 -> 84313584 (-0.00%)
cycles in affected programs: 1767232 -> 1764954 (-0.13%)
helped: 274
HURT: 81

Ironlake
total instructions in shared programs: 6399073 -> 6396998 (-0.03%)
instructions in affected programs: 218050 -> 215975 (-0.95%)
helped: 600
HURT: 0

total cycles in shared programs: 128892088 -> 128888810 (-0.00%)
cycles in affected programs: 2867452 -> 2864174 (-0.11%)
helped: 422
HURT: 137

Sandy Bridge
total instructions in shared programs: 8462174 -> 8460759 (-0.02%)
instructions in affected programs: 178529 -> 177114 (-0.79%)
helped: 596
HURT: 0

total cycles in shared programs: 117542276 -> 117534098 (-0.01%)
cycles in affected programs: 1239166 -> 1230988 (-0.66%)
helped: 369
HURT: 150

Ivy Bridge
total instructions in shared programs: 7775131 -> 7773410 (-0.02%)
instructions in affected programs: 162903 -> 161182 (-1.06%)
helped: 590
HURT: 0

total cycles in shared programs: 65759882 -> 65747268 (-0.02%)
cycles in affected programs: 1004354 -> 991740 (-1.26%)
helped: 467
HURT: 141

Haswell
total instructions in shared programs: 7107786 -> 7106327 (-0.02%)
instructions in affected programs: 140954 -> 139495 (-1.04%)
helped: 590
HURT: 0

total cycles in shared programs: 64668028 -> 64655322 (-0.02%)
cycles in affected programs: 967080 -> 954374 (-1.31%)
helped: 452
HURT: 149

LOST:   2
GAINED: 0

Broadwell
total instructions in shared programs: 8980029 -> 8978287 (-0.02%)
instructions in affected programs: 197232 -> 195490 (-0.88%)
helped: 715
HURT: 0

total cycles in shared programs: 70070448 -> 70055970 (-0.02%)
cycles in affected programs: 975724 -> 961246 (-1.48%)
helped: 471
HURT: 111

LOST:   2
GAINED: 0

Skylake
total instructions in shared programs: 9115178 -> 9113436 (-0.02%)
instructions in affected programs: 203012 -> 201270 (-0.86%)
helped: 715
HURT: 0

total cycles in shared programs: 68848660 -> 68834004 (-0.02%)
cycles in affected programs: 993888 -> 979232 (-1.47%)
helped: 473
HURT: 116

LOST:   2
GAINED: 0

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index c013fd78d9e..9cec15c2ef5 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -232,6 +232,8 @@ optimizations = [
    (('f2u', ('ftrunc', a)), ('f2u', a)),
    (('i2b', ('ineg', a)), ('i2b', a)),
    (('i2b', ('iabs', a)), ('i2b', a)),
+   (('fabs', ('b2f', a)), ('b2f', a)),
+   (('iabs', ('b2i', a)), ('b2i', a)),
 
    # Byte extraction
    (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),

From d7a25a9defe5fd42677266c0bcfd10909e5e49a4 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Wed, 2 Mar 2016 13:47:56 -0800
Subject: [PATCH 148/197] nir: Don't abs slt and friends

No shader-db changes, but this is symmetric with the previous commit.

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 9cec15c2ef5..0f2bd18dd69 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -134,6 +134,10 @@ optimizations = [
    (('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
    (('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
    (('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
+   (('fabs', ('slt', a, b)), ('slt', a, b)),
+   (('fabs', ('sge', a, b)), ('sge', a, b)),
+   (('fabs', ('seq', a, b)), ('seq', a, b)),
+   (('fabs', ('sne', a, b)), ('sne', a, b)),
    (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
    (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
    (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),

From 01425c45b32fa7f323515b05697c6cc0d245ad32 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 16 Mar 2016 16:06:10 -0700
Subject: [PATCH 149/197] i965: Remove the RCP+RSQ algebraic optimizations

NIR already has this optimization and it can do much better than the little
peephole in the backend.

No shader-db change on Haswell or Broadwell.

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp   | 11 -----------
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 11 -----------
 2 files changed, 22 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index eaff9535c22..1a6a229e444 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2274,17 +2274,6 @@ fs_visitor::opt_algebraic()
             progress = true;
          }
          break;
-      case SHADER_OPCODE_RCP: {
-         fs_inst *prev = (fs_inst *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(prev->dst)) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0])) {
             inst->opcode = BRW_OPCODE_MOV;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index baf72a25c42..b9cf3f657a1 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -699,17 +699,6 @@ vec4_visitor::opt_algebraic()
             break;
          }
          break;
-      case SHADER_OPCODE_RCP: {
-         vec4_instruction *prev = (vec4_instruction *)inst->prev;
-         if (prev->opcode == SHADER_OPCODE_SQRT) {
-            if (inst->src[0].equals(src_reg(prev->dst))) {
-               inst->opcode = SHADER_OPCODE_RSQ;
-               inst->src[0] = prev->src[0];
-               progress = true;
-            }
-         }
-         break;
-      }
       case SHADER_OPCODE_BROADCAST:
          if (is_uniform(inst->src[0]) ||
              inst->src[1].is_zero()) {

From 9881eab197c70b85346d682b525b8ea9ed241862 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 16 Mar 2016 19:31:02 -0700
Subject: [PATCH 150/197] i965/fs: Don't constant-fold RCP

No shader-db changes on Broadwell

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 .../drivers/dri/i965/brw_fs_copy_propagation.cpp  | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 2616e65fc62..ffab0a8ebd5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -654,21 +654,6 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
          }
          break;
 
-      case SHADER_OPCODE_RCP:
-         /* The hardware doesn't do math on immediate values
-          * (because why are you doing that, seriously?), but
-          * the correct answer is to just constant fold it
-          * anyway.
-          */
-         assert(i == 0);
-         if (inst->src[0].f != 0.0f) {
-            inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0] = val;
-            inst->src[0].f = 1.0f / inst->src[0].f;
-            progress = true;
-         }
-         break;
-
       case SHADER_OPCODE_UNTYPED_ATOMIC:
       case SHADER_OPCODE_UNTYPED_SURFACE_READ:
       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:

From b8ec20551506204bf9aa794efae6f978499c34f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 22 Mar 2016 18:26:53 +0100
Subject: [PATCH 151/197] radeonsi: fix 2D array MSAA failures since image
 support landed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-and-Tested-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index b9bdd47c496..b8fde00c668 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2993,7 +2993,8 @@ si_make_texture_descriptor(struct si_screen *screen,
 	if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
 	        height = 1;
 		depth = res->array_size;
-	} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY) {
+	} else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY ||
+		   type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
 		if (sampler || res->target != PIPE_TEXTURE_3D)
 			depth = res->array_size;
 	} else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)

From b15b1faefdf403b727ec416addc3f4ae16feb5c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 17 Mar 2016 19:49:03 -0500
Subject: [PATCH 152/197] gallium: add PIPE_BARRIER_STREAMOUT_BUFFER
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/pipe/p_defines.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 90af7a7012c..8257b4a7142 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -367,6 +367,7 @@ enum pipe_flush_flags
 #define PIPE_BARRIER_TEXTURE           (1 << 7)
 #define PIPE_BARRIER_IMAGE             (1 << 8)
 #define PIPE_BARRIER_FRAMEBUFFER       (1 << 9)
+#define PIPE_BARRIER_STREAMOUT_BUFFER  (1 << 10)
 
 /**
  * Resource binding flags -- state tracker must specify in advance all

From fc94bc2986e6a46a45c643c2236f3e2ced4a2bf3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Thu, 17 Mar 2016 19:49:26 -0500
Subject: [PATCH 153/197] st/mesa: add missing MemoryBarrier bits and some
 explanations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_cb_texturebarrier.c | 25 ++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_texturebarrier.c b/src/mesa/state_tracker/st_cb_texturebarrier.c
index 6319b6258ac..fecba65a64d 100644
--- a/src/mesa/state_tracker/st_cb_texturebarrier.c
+++ b/src/mesa/state_tracker/st_cb_texturebarrier.c
@@ -75,14 +75,37 @@ st_MemoryBarrier(struct gl_context *ctx, GLbitfield barriers)
       flags |= PIPE_BARRIER_IMAGE;
    if (barriers & GL_COMMAND_BARRIER_BIT)
       flags |= PIPE_BARRIER_INDIRECT_BUFFER;
-   if (barriers & GL_PIXEL_BUFFER_BARRIER_BIT)
+   if (barriers & GL_PIXEL_BUFFER_BARRIER_BIT) {
+      /* The PBO may be
+       *  (1) bound as a texture for PBO uploads, or
+       *  (2) accessed by the CPU via transfer ops.
+       * For case (2), we assume automatic flushing by the driver.
+       */
       flags |= PIPE_BARRIER_TEXTURE;
+   }
+   /* GL_TEXTURE_UPDATE_BARRIER_BIT:
+    * Texture updates translate to:
+    *  (1) texture transfers to/from the CPU,
+    *  (2) texture as blit destination, or
+    *  (3) texture as framebuffer.
+    * In all cases, we assume the driver does the required flushing
+    * automatically.
+    */
+   /* GL_BUFFER_UPDATE_BARRIER_BIT:
+    * Buffer updates translate to
+    *  (1) buffer transfers to/from the CPU,
+    *  (2) resource copies and clears.
+    * In all cases, we assume the driver does the required flushing
+    * automatically.
+    */
    if (barriers & GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT)
       flags |= PIPE_BARRIER_MAPPED_BUFFER;
    if (barriers & GL_QUERY_BUFFER_BARRIER_BIT)
       flags |= PIPE_BARRIER_QUERY_BUFFER;
    if (barriers & GL_FRAMEBUFFER_BARRIER_BIT)
       flags |= PIPE_BARRIER_FRAMEBUFFER;
+   if (barriers & GL_TRANSFORM_FEEDBACK_BARRIER_BIT)
+      flags |= PIPE_BARRIER_STREAMOUT_BUFFER;
    if (barriers & GL_ATOMIC_COUNTER_BARRIER_BIT)
       flags |= PIPE_BARRIER_SHADER_BUFFER;
    if (barriers & GL_SHADER_STORAGE_BARRIER_BIT)

From a8f5d11426af0eeadf6977c3d8f3a76afe8f03c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 16 Mar 2016 20:47:47 -0500
Subject: [PATCH 154/197] radeonsi: cache flush/invalidation for missing
 PIPE_BARRIER_*_BUFFER bits (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes arb_shader_image_load_store-host-mem-barrier.

v2: flush TC L2 for index buffers on <= CIK (Marek)

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index b8fde00c668..1245f56c08a 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3537,18 +3537,28 @@ static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
 	if (flags & (PIPE_BARRIER_VERTEX_BUFFER |
 		     PIPE_BARRIER_SHADER_BUFFER |
 		     PIPE_BARRIER_TEXTURE |
-		     PIPE_BARRIER_IMAGE)) {
+		     PIPE_BARRIER_IMAGE |
+		     PIPE_BARRIER_STREAMOUT_BUFFER)) {
 		/* As far as I can tell, L1 contents are written back to L2
 		 * automatically at end of shader, but the contents of other
 		 * L1 caches might still be stale. */
 		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
 	}
 
+	if (flags & PIPE_BARRIER_INDEX_BUFFER) {
+		sctx->b.flags |= SI_CONTEXT_INV_VMEM_L1;
+
+		/* Indices are read through TC L2 since VI. */
+		if (sctx->screen->b.chip_class <= CIK)
+			sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+	}
+
 	if (flags & PIPE_BARRIER_FRAMEBUFFER)
 		sctx->b.flags |= SI_CONTEXT_FLUSH_AND_INV_FRAMEBUFFER;
 
 	if (flags & (PIPE_BARRIER_MAPPED_BUFFER |
-		     PIPE_BARRIER_FRAMEBUFFER)) {
+		     PIPE_BARRIER_FRAMEBUFFER |
+		     PIPE_BARRIER_INDIRECT_BUFFER)) {
 		/* Not sure if INV_GLOBAL_L2 is the best thing here.
 		 *
 		 * We need to make sure that TC L1 & L2 are written back to

From c4931ae17452cabbc7830cd9d5a356b835fd4c48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Mon, 21 Mar 2016 14:50:50 -0500
Subject: [PATCH 155/197] radeonsi: fix out-of-bounds indexing of shader images
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Results are undefined, but the access must not crash. Without this change,
out-of-bounds indexing can lead to VM faults and GPU hangs.

Constant buffers, samplers, and possibly others will eventually need similar
treatment to support GL_ARB_robust_buffer_access_behavior.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Reviewed-and-Tested-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 44 +++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9ad2290fd4f..1e4bf828ae4 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -531,6 +531,37 @@ static LLVMValueRef get_indirect_index(struct si_shader_context *ctx,
 	return result;
 }
 
+/**
+ * Like get_indirect_index, but restricts the return value to a (possibly
+ * undefined) value inside [0..num).
+ */
+static LLVMValueRef get_bounded_indirect_index(struct si_shader_context *ctx,
+					       const struct tgsi_ind_register *ind,
+					       int rel_index, unsigned num)
+{
+	struct gallivm_state *gallivm = &ctx->radeon_bld.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
+	LLVMValueRef result = get_indirect_index(ctx, ind, rel_index);
+	LLVMValueRef c_max = LLVMConstInt(ctx->i32, num - 1, 0);
+	LLVMValueRef cc;
+
+	if (util_is_power_of_two(num)) {
+		result = LLVMBuildAnd(builder, result, c_max, "");
+	} else {
+		/* In theory, this MAX pattern should result in code that is
+		 * as good as the bit-wise AND above.
+		 *
+		 * In practice, LLVM generates worse code (at the time of
+		 * writing), because its value tracking is not strong enough.
+		 */
+		cc = LLVMBuildICmp(builder, LLVMIntULE, result, c_max, "");
+		result = LLVMBuildSelect(builder, cc, result, c_max, "");
+	}
+
+	return result;
+}
+
+
 /**
  * Calculate a dword address given an input or output register and a stride.
  */
@@ -2814,7 +2845,18 @@ image_fetch_rsrc(
 		LLVMValueRef rsrc_ptr;
 		LLVMValueRef tmp;
 
-		ind_index = get_indirect_index(ctx, &image->Indirect, image->Register.Index);
+		/* From the GL_ARB_shader_image_load_store extension spec:
+		 *
+		 *    If a shader performs an image load, store, or atomic
+		 *    operation using an image variable declared as an array,
+		 *    and if the index used to select an individual element is
+		 *    negative or greater than or equal to the size of the
+		 *    array, the results of the operation are undefined but may
+		 *    not lead to termination.
+		 */
+		ind_index = get_bounded_indirect_index(ctx, &image->Indirect,
+						       image->Register.Index,
+						       SI_NUM_IMAGES);
 
 		rsrc_ptr = LLVMGetParam(ctx->radeon_bld.main_fn, SI_PARAM_IMAGES);
 		tmp = build_indexed_load_const(ctx, rsrc_ptr, ind_index);

From 43c6f3f82f62f28dc97d195750ba25c88051b64e Mon Sep 17 00:00:00 2001
From: Lars Hamre <chemecse@gmail.com>
Date: Wed, 23 Mar 2016 10:14:23 -0400
Subject: [PATCH 156/197] compiler/glsl: allow sequence op as a const expr in
 gles 1.0

Allow the sequence operator to be a constant expression in GLSL ES
versions prior to GLSL ES 3.0.

Fixes the following piglit test:
/all/spec/glsl-es-1.0/compiler/array-sized-by-sequence-in-parenthesis.vert

This is similar to the logic from process_initializer() which performs
the same check for constant variable initialization with sequence
operators.

v2: Fixed regression pointed out by Eduardo Lima Mitev

Signed-off-by: Lars Hamre <chemecse@gmail.com>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 5262bd87655..35def8e3ae0 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -2125,7 +2125,9 @@ process_array_size(exec_node *node,
    }
 
    ir_constant *const size = ir->constant_expression_value();
-   if (size == NULL || array_size->has_sequence_subexpression()) {
+   if (size == NULL ||
+       (state->is_version(120, 300) &&
+        array_size->has_sequence_subexpression())) {
       _mesa_glsl_error(& loc, state, "array size must be a "
                        "constant valued expression");
       return 0;

From 9a6da49371d38cfda782873f73bcd39a0be999ff Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 23 Mar 2016 12:55:45 -0600
Subject: [PATCH 157/197] docs: use latest libDRM version

Signed-off-by: Brian Paul <brianp@vmware.com>
---
 docs/install.html | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/install.html b/docs/install.html
index ae911d5347b..8b349c40b9d 100644
--- a/docs/install.html
+++ b/docs/install.html
@@ -73,8 +73,7 @@ The following are required for DRI-based hardware acceleration with Mesa:
 <ul>
 <li><a href="http://xorg.freedesktop.org/releases/individual/proto/">
 dri2proto</a> version 2.6 or later
-<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a>
-version 2.4.33 or later
+<li><a href="http://dri.freedesktop.org/libdrm/">libDRM</a> latest version
 <li>Xorg server version 1.5 or later
 <li>Linux 2.6.28 or later
 </ul>

From c4c373f156a0b63a5789c0a8a3a1b641a58aa938 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 23 Mar 2016 14:57:57 -0700
Subject: [PATCH 158/197] nir: Fix whitespace

Reviewed-by: Rob Clark <robclark@gmail.com>
---
 src/compiler/nir/nir.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 36f90fc6fb7..75198e806c4 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2049,7 +2049,7 @@ void nir_index_blocks(nir_function_impl *impl);
 void nir_print_shader(nir_shader *shader, FILE *fp);
 void nir_print_instr(const nir_instr *instr, FILE *fp);
 
-nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s);
+nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
 nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
 
 #ifdef DEBUG

From 5fe8959912f617fd0ada71f6b952e66305f48d67 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 30 Dec 2015 18:44:19 -0800
Subject: [PATCH 159/197] nir/clone: Expose nir_constant_clone

Reviewed-by: Rob Clark <robclark@gmail.com>
---
 src/compiler/nir/nir.h       | 1 +
 src/compiler/nir/nir_clone.c | 8 ++++----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 75198e806c4..1e91207509e 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2051,6 +2051,7 @@ void nir_print_instr(const nir_instr *instr, FILE *fp);
 
 nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
 nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
+nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
 
 #ifdef DEBUG
 void nir_validate_shader(nir_shader *shader);
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 7444dfe6e59..0ef0289504e 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -109,8 +109,8 @@ remap_var(clone_state *state, const nir_variable *var)
    return _lookup_ptr(state, var, nir_variable_is_global(var));
 }
 
-static nir_constant *
-clone_constant(clone_state *state, const nir_constant *c, nir_variable *nvar)
+nir_constant *
+nir_constant_clone(const nir_constant *c, nir_variable *nvar)
 {
    nir_constant *nc = ralloc(nvar, nir_constant);
 
@@ -118,7 +118,7 @@ clone_constant(clone_state *state, const nir_constant *c, nir_variable *nvar)
    nc->num_elements = c->num_elements;
    nc->elements = ralloc_array(nvar, nir_constant *, c->num_elements);
    for (unsigned i = 0; i < c->num_elements; i++) {
-      nc->elements[i] = clone_constant(state, c->elements[i], nvar);
+      nc->elements[i] = nir_constant_clone(c->elements[i], nvar);
    }
 
    return nc;
@@ -142,7 +142,7 @@ clone_variable(clone_state *state, const nir_variable *var)
           var->num_state_slots * sizeof(nir_state_slot));
    if (var->constant_initializer) {
       nvar->constant_initializer =
-         clone_constant(state, var->constant_initializer, nvar);
+         nir_constant_clone(var->constant_initializer, nvar);
    }
    nvar->interface_type = var->interface_type;
 

From f849f53990c613a28c32826d09005e127adedeac Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 23 Mar 2016 15:05:55 -0700
Subject: [PATCH 160/197] nir/clone: Export nir_variable_clone

Reviewed-by: Rob Clark <robclark@gmail.com>
---
 src/compiler/nir/nir.h       |  1 +
 src/compiler/nir/nir_clone.c | 16 ++++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1e91207509e..02acbfc9d73 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2052,6 +2052,7 @@ void nir_print_instr(const nir_instr *instr, FILE *fp);
 nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s);
 nir_function_impl *nir_function_impl_clone(const nir_function_impl *fi);
 nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var);
+nir_variable *nir_variable_clone(const nir_variable *c, nir_shader *shader);
 
 #ifdef DEBUG
 void nir_validate_shader(nir_shader *shader);
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index 0ef0289504e..a721552f6a6 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -127,11 +127,10 @@ nir_constant_clone(const nir_constant *c, nir_variable *nvar)
 /* NOTE: for cloning nir_variable's, bypass nir_variable_create to avoid
  * having to deal with locals and globals separately:
  */
-static nir_variable *
-clone_variable(clone_state *state, const nir_variable *var)
+nir_variable *
+nir_variable_clone(const nir_variable *var, nir_shader *shader)
 {
-   nir_variable *nvar = rzalloc(state->ns, nir_variable);
-   add_remap(state, nvar, var);
+   nir_variable *nvar = rzalloc(shader, nir_variable);
 
    nvar->type = var->type;
    nvar->name = ralloc_strdup(nvar, var->name);
@@ -149,6 +148,15 @@ clone_variable(clone_state *state, const nir_variable *var)
    return nvar;
 }
 
+static nir_variable *
+clone_variable(clone_state *state, const nir_variable *var)
+{
+   nir_variable *nvar = nir_variable_clone(var, state->ns);
+   add_remap(state, nvar, var);
+
+   return nvar;
+}
+
 /* clone list of nir_variable: */
 static void
 clone_var_list(clone_state *state, struct exec_list *dst,

From 4ff89377d96e0670bd4f5e149a6dc5955b6bb635 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 10:50:27 -0700
Subject: [PATCH 161/197] nir: Add an "exact" bit to nir_alu_instr

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir.h       | 11 +++++++++++
 src/compiler/nir/nir_clone.c |  1 +
 src/compiler/nir/nir_print.c |  2 ++
 3 files changed, 14 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 02acbfc9d73..76a511c2d4f 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -718,6 +718,17 @@ extern const nir_op_info nir_op_infos[nir_num_opcodes];
 typedef struct nir_alu_instr {
    nir_instr instr;
    nir_op op;
+
+   /** Indicates that this ALU instruction generates an exact value
+    *
+    * This is kind of a mixture of GLSL "precise" and "invariant" and not
+    * really equivalent to either.  This indicates that the value generated by
+    * this operation is high-precision and any code transformations that touch
+    * it must ensure that the resulting value is bit-for-bit identical to the
+    * original.
+    */
+   bool exact;
+
    nir_alu_dest dest;
    nir_alu_src src[];
 } nir_alu_instr;
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
index a721552f6a6..7d2e3835258 100644
--- a/src/compiler/nir/nir_clone.c
+++ b/src/compiler/nir/nir_clone.c
@@ -312,6 +312,7 @@ static nir_alu_instr *
 clone_alu(clone_state *state, const nir_alu_instr *alu)
 {
    nir_alu_instr *nalu = nir_alu_instr_create(state->ns, alu->op);
+   nalu->exact = alu->exact;
 
    __clone_dst(state, &nalu->instr, &nalu->dest.dest, &alu->dest.dest);
    nalu->dest.saturate = alu->dest.saturate;
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
index d3d5b84a024..c295c192c2a 100644
--- a/src/compiler/nir/nir_print.c
+++ b/src/compiler/nir/nir_print.c
@@ -207,6 +207,8 @@ print_alu_instr(nir_alu_instr *instr, print_state *state)
    print_alu_dest(&instr->dest, state);
 
    fprintf(fp, " = %s", nir_op_infos[instr->op].name);
+   if (instr->exact)
+      fprintf(fp, "!");
    if (instr->dest.saturate)
       fprintf(fp, ".sat");
    fprintf(fp, " ");

From ded3133d47cde1c28566f266a8bbe903badbd82b Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 15:54:26 -0700
Subject: [PATCH 162/197] nir/builder: Add a flag for setting exact

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_builder.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index e2000200ea7..64d7b43aa58 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -31,6 +31,9 @@ struct exec_list;
 typedef struct nir_builder {
    nir_cursor cursor;
 
+   /* Whether new ALU instructions will be marked "exact" */
+   bool exact;
+
    nir_shader *shader;
    nir_function_impl *impl;
 } nir_builder;
@@ -39,6 +42,7 @@ static inline void
 nir_builder_init(nir_builder *build, nir_function_impl *impl)
 {
    memset(build, 0, sizeof(*build));
+   build->exact = false;
    build->impl = impl;
    build->shader = impl->function->shader;
 }
@@ -50,6 +54,7 @@ nir_builder_init_simple_shader(nir_builder *build, void *mem_ctx,
 {
    build->shader = nir_shader_create(mem_ctx, stage, options);
    nir_function *func = nir_function_create(build->shader, "main");
+   build->exact = false;
    build->impl = nir_function_impl_create(func);
    build->cursor = nir_after_cf_list(&build->impl->body);
 }
@@ -143,6 +148,8 @@ nir_build_alu(nir_builder *build, nir_op op, nir_ssa_def *src0,
    if (!instr)
       return NULL;
 
+   instr->exact = build->exact;
+
    instr->src[0].src = nir_src_for_ssa(src0);
    if (src1)
       instr->src[1].src = nir_src_for_ssa(src1);
@@ -260,6 +267,7 @@ nir_fmov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
    nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_fmov);
    nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
                      nir_src_bit_size(src.src), NULL);
+   mov->exact = build->exact;
    mov->dest.write_mask = (1 << num_components) - 1;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);
@@ -273,6 +281,7 @@ nir_imov_alu(nir_builder *build, nir_alu_src src, unsigned num_components)
    nir_alu_instr *mov = nir_alu_instr_create(build->shader, nir_op_imov);
    nir_ssa_dest_init(&mov->instr, &mov->dest.dest, num_components,
                      nir_src_bit_size(src.src), NULL);
+   mov->exact = build->exact;
    mov->dest.write_mask = (1 << num_components) - 1;
    mov->src[0] = src;
    nir_builder_instr_insert(build, &mov->instr);

From a6f25fa7d77cbbce113b92690dc43ed2ed9a0211 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 15:20:34 -0700
Subject: [PATCH 163/197] nir/search: Propagate exactness into newly created
 expressions

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_search.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 6df662aa531..6f6a9425c18 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -406,7 +406,7 @@ bitsize_tree_filter_down(bitsize_tree *tree, unsigned size)
 
 static nir_alu_src
 construct_value(const nir_search_value *value,
-                unsigned num_components, bitsize_tree *bitsize,
+                unsigned num_components, bitsize_tree *bitsize, bool exact,
                 struct match_state *state,
                 nir_instr *instr, void *mem_ctx)
 {
@@ -420,6 +420,7 @@ construct_value(const nir_search_value *value,
       nir_alu_instr *alu = nir_alu_instr_create(mem_ctx, expr->opcode);
       nir_ssa_dest_init(&alu->instr, &alu->dest.dest, num_components,
                         bitsize->dest_size, NULL);
+      alu->exact = exact;
       alu->dest.write_mask = (1 << num_components) - 1;
       alu->dest.saturate = false;
 
@@ -431,7 +432,7 @@ construct_value(const nir_search_value *value,
             num_components = nir_op_infos[alu->op].input_sizes[i];
 
          alu->src[i] = construct_value(expr->srcs[i],
-                                       num_components, bitsize->srcs[i],
+                                       num_components, bitsize->srcs[i], exact,
                                        state, instr, mem_ctx);
       }
 
@@ -563,8 +564,8 @@ nir_replace_instr(nir_alu_instr *instr, const nir_search_expression *search,
                      instr->dest.dest.ssa.bit_size, NULL);
 
    mov->src[0] = construct_value(replace,
-                                 instr->dest.dest.ssa.num_components,
-                                 tree, &state, &instr->instr, mem_ctx);
+                                 instr->dest.dest.ssa.num_components, tree,
+                                 instr->exact, &state, &instr->instr, mem_ctx);
    nir_instr_insert_before(&instr->instr, &mov->instr);
 
    nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa,

From 3a7cb6534c3f82482c05f6a6813308cf2cad131f Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 11:04:49 -0700
Subject: [PATCH 164/197] nir/algebraic: Allow for flagging operations as being
 inexact

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_algebraic.py     | 9 ++++++++-
 src/compiler/nir/nir_opt_algebraic.py | 9 ++++++++-
 src/compiler/nir/nir_search.c         | 4 ++++
 src/compiler/nir/nir_search.h         | 6 ++++++
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_algebraic.py b/src/compiler/nir/nir_algebraic.py
index 1818877a216..d05564f779c 100644
--- a/src/compiler/nir/nir_algebraic.py
+++ b/src/compiler/nir/nir_algebraic.py
@@ -69,6 +69,7 @@ static const ${val.c_type} ${val.name} = {
    ${'true' if val.is_constant else 'false'},
    ${val.type() or 'nir_type_invalid' },
 % elif isinstance(val, Expression):
+   ${'true' if val.inexact else 'false'},
    nir_op_${val.opcode},
    { ${', '.join(src.c_ptr for src in val.sources)} },
 % endif
@@ -145,12 +146,18 @@ class Variable(Value):
       elif self.required_type == 'float':
          return "nir_type_float"
 
+_opcode_re = re.compile(r"(?P<inexact>~)?(?P<opcode>\w+)")
+
 class Expression(Value):
    def __init__(self, expr, name_base, varset):
       Value.__init__(self, name_base, "expression")
       assert isinstance(expr, tuple)
 
-      self.opcode = expr[0]
+      m = _opcode_re.match(expr[0])
+      assert m and m.group('opcode') is not None
+
+      self.opcode = m.group('opcode')
+      self.inexact = m.group('inexact') is not None
       self.sources = [ Value.create(src, "{0}_{1}".format(name_base, i), varset)
                        for (i, src) in enumerate(expr[1:]) ]
 
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 0f2bd18dd69..d788b7b1a0c 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -34,10 +34,17 @@ d = 'd'
 
 # Written in the form (<search>, <replace>) where <search> is an expression
 # and <replace> is either an expression or a value.  An expression is
-# defined as a tuple of the form (<op>, <src0>, <src1>, <src2>, <src3>)
+# defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
 # where each source is either an expression or a value.  A value can be
 # either a numeric constant or a string representing a variable name.
 #
+# If the opcode in a search expression is prefixed by a '~' character, this
+# indicates that the operation is inexact.  Such operations will only get
+# applied to SSA values that do not have the exact bit set.  This should be
+# used by any optimizations that are not bit-for-bit exact.  It should not,
+# however, be used for backend-requested lowering operations as those need to
+# happen regardless of precision.
+#
 # Variable names are specified as "[#]name[@type]" where "#" inicates that
 # the given variable will only match constants and the type indicates that
 # the given variable will only match values from ALU instructions with the
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 6f6a9425c18..110ab5e2362 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -238,6 +238,10 @@ match_expression(const nir_search_expression *expr, nir_alu_instr *instr,
    if (instr->op != expr->opcode)
       return false;
 
+   assert(instr->dest.dest.is_ssa);
+   if (expr->inexact && instr->exact)
+      return false;
+
    assert(!instr->dest.saturate);
    assert(nir_op_infos[instr->op].num_inputs > 0);
 
diff --git a/src/compiler/nir/nir_search.h b/src/compiler/nir/nir_search.h
index 321d6d00355..61742f129b1 100644
--- a/src/compiler/nir/nir_search.h
+++ b/src/compiler/nir/nir_search.h
@@ -83,6 +83,12 @@ typedef struct {
 typedef struct {
    nir_search_value value;
 
+   /* When set on a search expression, the expression will only match an SSA
+    * value that does *not* have the exact bit set.  If unset, the exact bit
+    * on the SSA value is ignored.
+    */
+   bool inexact;
+
    nir_op opcode;
    const nir_search_value *srcs[4];
 } nir_search_expression;

From 89545b13141053cf12a10e5357aa82031e8419ed Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 23 Mar 2016 14:25:56 -0700
Subject: [PATCH 165/197] nir/algebraic: Get rid of an invalid fxor optimization

The fxor opcode is required to return 1.0f or 0.0f but the input variable
may not be 1.0f or 0.0f.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_opt_algebraic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index d788b7b1a0c..ec6c6338bb1 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -176,7 +176,6 @@ optimizations = [
    (('ior', a, 0), a),
    (('fxor', a, a), 0.0),
    (('ixor', a, a), 0),
-   (('fxor', a, 0.0), a),
    (('ixor', a, 0), a),
    (('inot', ('inot', a)), a),
    # DeMorgan's Laws

From ed3a029e8088cb17af073c3b5f7444cb7e2f1cfb Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 23 Mar 2016 14:30:29 -0700
Subject: [PATCH 166/197] nir/algebraic: Fix fmin detection to match the spec

The previous transformation got the arguments to fmin backwards.  When NaNs
are involved, the GLSL min/max aren't commutative so it matters.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_opt_algebraic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index ec6c6338bb1..7e3aa5aa798 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -122,7 +122,7 @@ optimizations = [
    (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
 
    (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
-   (('bcsel', ('flt', a, b), a, b), ('fmin', a, b)),
+   (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
    (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
    (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),

From 0dbda153aae548a4087f7364c9013583a076e0e9 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 11:31:48 -0700
Subject: [PATCH 167/197] nir/algebraic: Flag inexact optimizations

Many of our optimizations, while great for cutting shaders down to size,
aren't really precision-safe.  This commit tries to flag all of the
inexact floating-point optimizations so they don't get run on values that
are flagged "exact".  It's a bit conservative and maybe flags some safe
optimizations as unsafe but that's better than missing one.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_opt_algebraic.py | 121 +++++++++++++-------------
 1 file changed, 62 insertions(+), 59 deletions(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 7e3aa5aa798..53633233f2b 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -61,19 +61,19 @@ optimizations = [
    (('fabs', ('fneg', a)), ('fabs', a)),
    (('iabs', ('iabs', a)), ('iabs', a)),
    (('iabs', ('ineg', a)), ('iabs', a)),
-   (('fadd', a, 0.0), a),
+   (('~fadd', a, 0.0), a),
    (('iadd', a, 0), a),
    (('usadd_4x8', a, 0), a),
    (('usadd_4x8', a, ~0), ~0),
-   (('fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
+   (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
    (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
-   (('fadd', ('fneg', a), a), 0.0),
+   (('~fadd', ('fneg', a), a), 0.0),
    (('iadd', ('ineg', a), a), 0),
    (('iadd', ('ineg', a), ('iadd', a, b)), b),
    (('iadd', a, ('iadd', ('ineg', a), b)), b),
-   (('fadd', ('fneg', a), ('fadd', a, b)), b),
-   (('fadd', a, ('fadd', ('fneg', a), b)), b),
-   (('fmul', a, 0.0), 0.0),
+   (('~fadd', ('fneg', a), ('fadd', a, b)), b),
+   (('~fadd', a, ('fadd', ('fneg', a), b)), b),
+   (('~fmul', a, 0.0), 0.0),
    (('imul', a, 0), 0),
    (('umul_unorm_4x8', a, 0), 0),
    (('umul_unorm_4x8', a, ~0), a),
@@ -81,29 +81,29 @@ optimizations = [
    (('imul', a, 1), a),
    (('fmul', a, -1.0), ('fneg', a)),
    (('imul', a, -1), ('ineg', a)),
-   (('ffma', 0.0, a, b), b),
-   (('ffma', a, 0.0, b), b),
-   (('ffma', a, b, 0.0), ('fmul', a, b)),
+   (('~ffma', 0.0, a, b), b),
+   (('~ffma', a, 0.0, b), b),
+   (('~ffma', a, b, 0.0), ('fmul', a, b)),
    (('ffma', a, 1.0, b), ('fadd', a, b)),
    (('ffma', 1.0, a, b), ('fadd', a, b)),
-   (('flrp', a, b, 0.0), a),
-   (('flrp', a, b, 1.0), b),
-   (('flrp', a, a, b), a),
-   (('flrp', 0.0, a, b), ('fmul', a, b)),
-   (('flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~flrp', a, b, 0.0), a),
+   (('~flrp', a, b, 1.0), b),
+   (('~flrp', a, a, b), a),
+   (('~flrp', 0.0, a, b), ('fmul', a, b)),
+   (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
    (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'),
    (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
-   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
-   (('fadd', ('fmul', a, ('fadd', 1.0, ('fneg',         c ))), ('fmul', b,         c )), ('flrp', a, b, c), '!options->lower_flrp'),
-   (('fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
-   (('fadd', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg',         c ))), ('fmul', b,         c )), ('flrp', a, b, c), '!options->lower_flrp'),
+   (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
+   (('~fadd', a, ('fmul',         c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
    (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
-   (('fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
+   (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
    # Comparison simplifications
-   (('inot', ('flt', a, b)), ('fge', a, b)),
-   (('inot', ('fge', a, b)), ('flt', a, b)),
-   (('inot', ('feq', a, b)), ('fne', a, b)),
-   (('inot', ('fne', a, b)), ('feq', a, b)),
+   (('~inot', ('flt', a, b)), ('fge', a, b)),
+   (('~inot', ('fge', a, b)), ('flt', a, b)),
+   (('~inot', ('feq', a, b)), ('fne', a, b)),
+   (('~inot', ('fne', a, b)), ('feq', a, b)),
    (('inot', ('ilt', a, b)), ('ige', a, b)),
    (('inot', ('ige', a, b)), ('ilt', a, b)),
    (('inot', ('ieq', a, b)), ('ine', a, b)),
@@ -132,15 +132,15 @@ optimizations = [
    (('imax', a, a), a),
    (('umin', a, a), a),
    (('umax', a, a), a),
-   (('fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
-   (('fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
+   (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
+   (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
    (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
    (('fsat', ('fsat', a)), ('fsat', a)),
    (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
-   (('ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
-   (('ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
-   (('ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
-   (('ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
+   (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
+   (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
+   (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
+   (('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
    (('fabs', ('slt', a, b)), ('slt', a, b)),
    (('fabs', ('sge', a, b)), ('sge', a, b)),
    (('fabs', ('seq', a, b)), ('seq', a, b)),
@@ -191,35 +191,35 @@ optimizations = [
    (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
    (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
    # Exponential/logarithmic identities
-   (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a
-   (('flog2', ('fexp2', a)), a), # lg2(2^a) = a
+   (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
+   (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
    (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
-   (('fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
-   (('fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
-    ('fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) + d) = a^b * c^d
-   (('fpow', a, 1.0), a),
-   (('fpow', a, 2.0), ('fmul', a, a)),
-   (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
-   (('fpow', 2.0, a), ('fexp2', a)),
-   (('fpow', ('fpow', a, 2.2), 0.454545), a),
-   (('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
-   (('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
-   (('frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
-   (('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
-   (('flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
-   (('flog2', ('frcp', a)), ('fneg', ('flog2', a))),
-   (('flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
-   (('flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
-   (('fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
-   (('fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
-   (('fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
+   (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
+   (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
+    ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
+   (('~fpow', a, 1.0), a),
+   (('~fpow', a, 2.0), ('fmul', a, a)),
+   (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
+   (('~fpow', 2.0, a), ('fexp2', a)),
+   (('~fpow', ('fpow', a, 2.2), 0.454545), a),
+   (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
+   (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
+   (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
+   (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
+   (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
+   (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
+   (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
+   (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
+   (('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
+   (('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
+   (('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
    # Division and reciprocal
-   (('fdiv', 1.0, a), ('frcp', a)),
+   (('~fdiv', 1.0, a), ('frcp', a)),
    (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
-   (('frcp', ('frcp', a)), a),
-   (('frcp', ('fsqrt', a)), ('frsq', a)),
+   (('~frcp', ('frcp', a)), a),
+   (('~frcp', ('fsqrt', a)), ('frsq', a)),
    (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
-   (('frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
+   (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
    # Boolean simplifications
    (('ieq', 'a@bool', True), a),
    (('ine', 'a@bool', True), ('inot', a)),
@@ -256,7 +256,7 @@ optimizations = [
    (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
 
    # Subtracts
-   (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
+   (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
    (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
    (('ussub_4x8', a, 0), a),
    (('ussub_4x8', a, ~0), 0),
@@ -264,7 +264,7 @@ optimizations = [
    (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
    (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
    (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
-   (('fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
+   (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
    (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
@@ -393,10 +393,13 @@ for op in ['flt', 'fge', 'feq', 'fne',
 # they help code generation but do not necessarily produce code that is
 # more easily optimizable.
 late_optimizations = [
+   # Most of these optimizations aren't quite safe when you get infinity or
+   # NaN involved but the first one should be fine.
    (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))),
-   (('fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
-   (('feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
-   (('fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+   (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
+   (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
+   (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
+
    (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
    (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
    (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),

From 5f39e3e16510b840f53ef6172631cca8a9f0eaeb Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 11:38:54 -0700
Subject: [PATCH 168/197] nir/cse: Properly handle nir_ssa_def.exact

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_instr_set.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_instr_set.c b/src/compiler/nir/nir_instr_set.c
index 3f5da496092..e244122e466 100644
--- a/src/compiler/nir/nir_instr_set.c
+++ b/src/compiler/nir/nir_instr_set.c
@@ -52,6 +52,7 @@ hash_alu(uint32_t hash, const nir_alu_instr *instr)
 {
    hash = HASH(hash, instr->op);
    hash = HASH(hash, instr->dest.dest.ssa.num_components);
+   /* We explicitly don't hash instr->exact */
 
    if (nir_op_infos[instr->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
       assert(nir_op_infos[instr->op].num_inputs == 2);
@@ -267,6 +268,8 @@ nir_instrs_equal(const nir_instr *instr1, const nir_instr *instr2)
       if (alu1->dest.dest.ssa.num_components != alu2->dest.dest.ssa.num_components)
          return false;
 
+      /* We explicitly don't compare the exact bits of alu1 and alu2 */
+
       if (nir_op_infos[alu1->op].algebraic_properties & NIR_OP_IS_COMMUTATIVE) {
          assert(nir_op_infos[alu1->op].num_inputs == 2);
          return (nir_alu_srcs_equal(alu1, alu2, 0, 0) &&
@@ -496,8 +499,17 @@ nir_instr_set_add_or_rewrite(struct set *instr_set, nir_instr *instr)
    struct set_entry *entry = _mesa_set_search(instr_set, instr);
    if (entry) {
       nir_ssa_def *def = nir_instr_get_dest_ssa_def(instr);
-      nir_ssa_def *new_def =
-         nir_instr_get_dest_ssa_def((nir_instr *) entry->key);
+      nir_instr *match = (nir_instr *) entry->key;
+      nir_ssa_def *new_def = nir_instr_get_dest_ssa_def(match);
+
+      /* It's safe to replace an exact instruction with an inexact one as
+       * long as we make it exact.  If we got here, the two instructions are
+       * exactly identical in every other way so, once we've set the exact
+       * bit, they are the same.
+       */
+      if (instr->type == nir_instr_type_alu && nir_instr_as_alu(instr)->exact)
+         nir_instr_as_alu(match)->exact = true;
+
       nir_ssa_def_rewrite_uses(def, nir_src_for_ssa(new_def));
       return true;
    }

From 865e83b9ec86c9ccec7100dfae9f80ff2969753c Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 13:39:07 -0700
Subject: [PATCH 169/197] i965/peephole_ffma: Don't fuse exact adds

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
index 49810c22cfa..6e8b1f99505 100644
--- a/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
+++ b/src/mesa/drivers/dri/i965/brw_nir_opt_peephole_ffma.c
@@ -168,7 +168,9 @@ brw_nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
       if (add->op != nir_op_fadd)
          continue;
 
-      /* TODO: Maybe bail if this expression is considered "precise"? */
+      assert(add->dest.dest.is_ssa);
+      if (add->exact)
+         continue;
 
       assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);
 

From 91d6272c2b29faa06f352b55e25526c726a25f82 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 16:13:40 -0700
Subject: [PATCH 170/197] nir/alu_to_scalar: Propagate the "exact" bit

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/nir_lower_alu_to_scalar.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index 5b3281e0a13..e8ba640fe0b 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -82,6 +82,7 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
    assert(instr->dest.write_mask != 0);
 
    b->cursor = nir_before_instr(&instr->instr);
+   b->exact = instr->exact;
 
 #define LOWER_REDUCTION(name, chan, merge) \
    case name##2: \

From 89b604922d2fb50ca1013473b2003227d61507cd Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 13:58:40 -0700
Subject: [PATCH 171/197] glsl: Add a pass to propagate the "invariant" and
 "precise" qualifiers

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/Makefile.sources              |   1 +
 src/compiler/glsl/glsl_parser_extras.cpp   |   1 +
 src/compiler/glsl/ir_optimization.h        |   1 +
 src/compiler/glsl/propagate_invariance.cpp | 125 +++++++++++++++++++++
 4 files changed, 128 insertions(+)
 create mode 100644 src/compiler/glsl/propagate_invariance.cpp

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 9f3bcf0255b..6ab0aa7b896 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -129,6 +129,7 @@ LIBGLSL_FILES = \
 	glsl/opt_tree_grafting.cpp \
 	glsl/opt_vectorize.cpp \
 	glsl/program.h \
+	glsl/propagate_invariance.cpp \
 	glsl/s_expression.cpp \
 	glsl/s_expression.h
 
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 1c6cd43cd68..9fcca211a99 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -1885,6 +1885,7 @@ do_common_optimization(exec_list *ir, bool linked,
       OPT(do_dead_functions, ir);
       OPT(do_structure_splitting, ir);
    }
+   propagate_invariance(ir);
    OPT(do_if_simplification, ir);
    OPT(opt_flatten_nested_if_blocks, ir);
    OPT(opt_conditional_discard, ir);
diff --git a/src/compiler/glsl/ir_optimization.h b/src/compiler/glsl/ir_optimization.h
index 2d773760f90..f9599a39ff5 100644
--- a/src/compiler/glsl/ir_optimization.h
+++ b/src/compiler/glsl/ir_optimization.h
@@ -139,6 +139,7 @@ bool lower_tess_level(gl_shader *shader);
 bool lower_vertex_id(gl_shader *shader);
 
 bool lower_subroutine(exec_list *instructions, struct _mesa_glsl_parse_state *state);
+void propagate_invariance(exec_list *instructions);
 
 ir_rvalue *
 compare_index_block(exec_list *instructions, ir_variable *index,
diff --git a/src/compiler/glsl/propagate_invariance.cpp b/src/compiler/glsl/propagate_invariance.cpp
new file mode 100644
index 00000000000..c137ff3324c
--- /dev/null
+++ b/src/compiler/glsl/propagate_invariance.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file propagate_invariance.cpp
+ * Propagate the "invariant" and "precise" qualifiers to variables used to
+ * compute invariant or precise values.
+ *
+ * The GLSL spec (depending on what version you read) says, among the
+ * conditions for getting bit-for-bit the same values on an invariant output:
+ *
+ *    "All operations in the consuming expressions and any intermediate
+ *    expressions must be the same, with the same order of operands and same
+ *    associativity, to give the same order of evaluation."
+ *
+ * This effectively means that if a variable is used to compute an invariant
+ * value then that variable becomes invariant.  The same should apply to the
+ * "precise" qualifier.
+ */
+
+#include "ir.h"
+#include "ir_visitor.h"
+#include "ir_rvalue_visitor.h"
+#include "ir_optimization.h"
+#include "compiler/glsl_types.h"
+
+namespace {
+
+class ir_invariance_propagation_visitor : public ir_hierarchical_visitor {
+public:
+   ir_invariance_propagation_visitor()
+   {
+      this->progress = false;
+      this->dst_var = NULL;
+   }
+
+   virtual ~ir_invariance_propagation_visitor()
+   {
+      /* empty */
+   }
+
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+   virtual ir_visitor_status visit_leave(ir_assignment *ir);
+   virtual ir_visitor_status visit(ir_dereference_variable *ir);
+
+   ir_variable *dst_var;
+   bool progress;
+};
+
+} /* unnamed namespace */
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit_enter(ir_assignment *ir)
+{
+   assert(this->dst_var == NULL);
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      this->dst_var = var;
+      return visit_continue;
+   } else {
+      return visit_continue_with_parent;
+   }
+}
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit_leave(ir_assignment *ir)
+{
+   this->dst_var = NULL;
+
+   return visit_continue;
+}
+
+ir_visitor_status
+ir_invariance_propagation_visitor::visit(ir_dereference_variable *ir)
+{
+   if (this->dst_var == NULL)
+      return visit_continue;
+
+   if (this->dst_var->data.invariant) {
+      if (!ir->var->data.invariant)
+         this->progress = true;
+
+      ir->var->data.invariant = true;
+   }
+
+   if (this->dst_var->data.precise) {
+      if (!ir->var->data.precise)
+         this->progress = true;
+
+      ir->var->data.precise = true;
+   }
+
+   return visit_continue;
+}
+
+void
+propagate_invariance(exec_list *instructions)
+{
+   ir_invariance_propagation_visitor visitor;
+
+   do {
+      visitor.progress = false;
+      visit_list_elements(&visitor, instructions);
+   } while (visitor.progress);
+}

From b2209b2333e71549f4101d3d1193c7a2df4e1c14 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 14:41:14 -0700
Subject: [PATCH 172/197] glsl/opt_algebraic: Don't handle invariant or precise
 trees

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/glsl/opt_algebraic.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/compiler/glsl/opt_algebraic.cpp b/src/compiler/glsl/opt_algebraic.cpp
index 1e58062cb0d..f5858c83865 100644
--- a/src/compiler/glsl/opt_algebraic.cpp
+++ b/src/compiler/glsl/opt_algebraic.cpp
@@ -58,6 +58,8 @@ public:
    {
    }
 
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
    ir_rvalue *handle_expression(ir_expression *ir);
    void handle_rvalue(ir_rvalue **rvalue);
    bool reassociate_constant(ir_expression *ir1,
@@ -80,6 +82,23 @@ public:
 
 } /* unnamed namespace */
 
+ir_visitor_status
+ir_algebraic_visitor::visit_enter(ir_assignment *ir)
+{
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      /* If we're assigning to an invariant or precise variable, just bail.
+       * Most of the algebraic optimizations aren't precision-safe.
+       *
+       * FINISHME: Find out which optimizations are precision-safe and enable
+       * them only for invariant or precise trees.
+       */
+      return visit_continue_with_parent;
+   } else {
+      return visit_continue;
+   }
+}
+
 static inline bool
 is_vec_zero(ir_constant *ir)
 {

From 028d6ecfe0feecd1e543322d2953bef810f13d23 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 14:44:57 -0700
Subject: [PATCH 173/197] glsl/rebalance_tree: Don't handle invariant or
 precise trees

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/glsl/opt_rebalance_tree.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/compiler/glsl/opt_rebalance_tree.cpp b/src/compiler/glsl/opt_rebalance_tree.cpp
index 095f2d7d2f0..8045d51033d 100644
--- a/src/compiler/glsl/opt_rebalance_tree.cpp
+++ b/src/compiler/glsl/opt_rebalance_tree.cpp
@@ -131,6 +131,8 @@ public:
       progress = false;
    }
 
+   virtual ir_visitor_status visit_enter(ir_assignment *ir);
+
    void handle_rvalue(ir_rvalue **rvalue);
 
    bool progress;
@@ -146,6 +148,20 @@ struct is_reduction_data {
 
 } /* anonymous namespace */
 
+ir_visitor_status
+ir_rebalance_visitor::visit_enter(ir_assignment *ir)
+{
+   ir_variable *var = ir->lhs->variable_referenced();
+   if (var->data.invariant || var->data.precise) {
+      /* If we're assigning to an invariant or precise variable, just bail.
+       * Tree rebalancing (reassociation) isn't precision-safe.
+       */
+      return visit_continue_with_parent;
+   } else {
+      return visit_continue;
+   }
+}
+
 static bool
 is_reduction_operation(ir_expression_operation operation)
 {

From a984e44abde74bd17ae2b0ef0762da3f63d84483 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Thu, 17 Mar 2016 15:20:20 -0700
Subject: [PATCH 174/197] nir/glsl: Propagate invariant into NIR alu ops

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/compiler/nir/glsl_to_nir.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index ee9c05308d6..f6e1a17a916 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -1051,6 +1051,9 @@ nir_visitor::visit(ir_assignment *ir)
 {
    unsigned num_components = ir->lhs->type->vector_elements;
 
+   b.exact = ir->lhs->variable_referenced()->data.invariant ||
+             ir->lhs->variable_referenced()->data.precise;
+
    if ((ir->rhs->as_dereference() || ir->rhs->as_constant()) &&
        (ir->write_mask == (1 << num_components) - 1 || ir->write_mask == 0)) {
       /* We're doing a plain-as-can-be copy, so emit a copy_var */

From 4e060d80ff92b7fcf9b54cdd5ed00f549db3f573 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 23 Mar 2016 21:04:18 -0700
Subject: [PATCH 175/197] glsl: Add propagate_invariance to the other makefile

This fixes the scons build
---
 src/compiler/glsl/Makefile.sources | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/compiler/glsl/Makefile.sources b/src/compiler/glsl/Makefile.sources
index 08b40c5cc8f..538196a79a9 100644
--- a/src/compiler/glsl/Makefile.sources
+++ b/src/compiler/glsl/Makefile.sources
@@ -201,6 +201,7 @@ LIBGLSL_FILES = \
 	opt_tree_grafting.cpp \
 	opt_vectorize.cpp \
 	program.h \
+	propagate_invariance.cpp \
 	s_expression.cpp \
 	s_expression.h
 

From 0bea0e7141a7698118bfd465fdb4adf8e0b21bc8 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 22 Mar 2016 15:02:42 -0400
Subject: [PATCH 176/197] nir: fix dangling ssadef->name ptrs

In many places, the convention is to pass an existing ssadef name ptr
when constructing/initializing a new nir_ssa_def.  But that goes badly
(as noticed by garbage in nir_print output) when the original string
gets freed.

Just use ralloc_strdup() instead, and add ralloc_free() in the two
places that would care (not that the strings wouldn't eventually get
freed anyways).

Also fixup the nir_search code which was directly setting ssadef->name
to use the parent instruction as memctx.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/compiler/nir/nir.c        | 4 +++-
 src/compiler/nir/nir_search.c | 6 +++---
 src/compiler/nir/nir_to_ssa.c | 2 ++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index b11498132a6..20f1a182b77 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -1317,12 +1317,13 @@ nir_instr_rewrite_dest(nir_instr *instr, nir_dest *dest, nir_dest new_dest)
       src_add_all_uses(dest->reg.indirect, instr, NULL);
 }
 
+/* note: does *not* take ownership of 'name' */
 void
 nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
                  unsigned num_components,
                  unsigned bit_size, const char *name)
 {
-   def->name = name;
+   def->name = ralloc_strdup(instr, name);
    def->parent_instr = instr;
    list_inithead(&def->uses);
    list_inithead(&def->if_uses);
@@ -1339,6 +1340,7 @@ nir_ssa_def_init(nir_instr *instr, nir_ssa_def *def,
    }
 }
 
+/* note: does *not* take ownership of 'name' */
 void
 nir_ssa_dest_init(nir_instr *instr, nir_dest *dest,
                  unsigned num_components, unsigned bit_size,
diff --git a/src/compiler/nir/nir_search.c b/src/compiler/nir/nir_search.c
index 110ab5e2362..6e630631453 100644
--- a/src/compiler/nir/nir_search.c
+++ b/src/compiler/nir/nir_search.c
@@ -469,7 +469,7 @@ construct_value(const nir_search_value *value,
 
       switch (c->type) {
       case nir_type_float:
-         load->def.name = ralloc_asprintf(mem_ctx, "%f", c->data.d);
+         load->def.name = ralloc_asprintf(load, "%f", c->data.d);
          switch (bitsize->dest_size) {
          case 32:
             load->value.f32[0] = c->data.d;
@@ -483,7 +483,7 @@ construct_value(const nir_search_value *value,
          break;
 
       case nir_type_int:
-         load->def.name = ralloc_asprintf(mem_ctx, "%ld", c->data.i);
+         load->def.name = ralloc_asprintf(load, "%ld", c->data.i);
          switch (bitsize->dest_size) {
          case 32:
             load->value.i32[0] = c->data.i;
@@ -497,7 +497,7 @@ construct_value(const nir_search_value *value,
          break;
 
       case nir_type_uint:
-         load->def.name = ralloc_asprintf(mem_ctx, "%lu", c->data.u);
+         load->def.name = ralloc_asprintf(load, "%lu", c->data.u);
          switch (bitsize->dest_size) {
          case 32:
             load->value.u32[0] = c->data.u;
diff --git a/src/compiler/nir/nir_to_ssa.c b/src/compiler/nir/nir_to_ssa.c
index 06406071166..d588d7d2df3 100644
--- a/src/compiler/nir/nir_to_ssa.c
+++ b/src/compiler/nir/nir_to_ssa.c
@@ -221,6 +221,7 @@ rewrite_def_forwards(nir_dest *dest, void *_state)
    list_del(&dest->reg.def_link);
    nir_ssa_dest_init(state->parent_instr, dest, reg->num_components,
                      reg->bit_size, name);
+   ralloc_free(name);
 
    /* push our SSA destination on the stack */
    state->states[index].index++;
@@ -274,6 +275,7 @@ rewrite_alu_instr_forward(nir_alu_instr *instr, rewrite_state *state)
       list_del(&instr->dest.dest.reg.def_link);
       nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
                         reg->bit_size, name);
+      ralloc_free(name);
 
       if (nir_op_infos[instr->op].output_size == 0) {
          /*

From f96309753b7f5f4ea5e1942778087b3ace8eda9b Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Thu, 24 Mar 2016 08:30:09 -0600
Subject: [PATCH 177/197] mesa: replace gl_context->Multisample._Enabled with
 _mesa_is_multisample_enabled.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This removes any dependency on driver validation of the number of
framebuffer samples.

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Tested-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/drivers/dri/i965/brw_util.h          |  5 +++--
 src/mesa/drivers/dri/i965/gen6_cc.c           |  6 +++---
 .../drivers/dri/i965/gen6_multisample_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_blend_state.c  |  6 +++---
 src/mesa/drivers/dri/i965/gen8_depth_state.c  |  3 ++-
 src/mesa/drivers/dri/i965/gen8_sf_state.c     |  4 ++--
 src/mesa/main/framebuffer.c                   | 19 +++++++++++++++++++
 src/mesa/main/framebuffer.h                   |  3 +++
 src/mesa/main/mtypes.h                        |  1 -
 src/mesa/main/state.c                         | 17 -----------------
 src/mesa/program/prog_statevars.c             |  2 +-
 src/mesa/state_tracker/st_atom_rasterizer.c   |  5 +++--
 src/mesa/state_tracker/st_atom_shader.c       |  2 +-
 src/mesa/swrast/s_points.c                    |  4 ++--
 14 files changed, 43 insertions(+), 36 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index 1f27e9862a7..3e9a6ee48d2 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -34,6 +34,7 @@
 #define BRW_UTIL_H
 
 #include "brw_context.h"
+#include "main/framebuffer.h"
 
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
@@ -49,13 +50,13 @@ brw_get_line_width(struct brw_context *brw)
     * implementation-dependent maximum non-antialiased line width."
     */
    float line_width =
-      CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag
+      CLAMP(!_mesa_is_multisample_enabled(&brw->ctx) && !brw->ctx.Line.SmoothFlag
             ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
             0.0f, brw->ctx.Const.MaxLineWidth);
    uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
 
    /* Line width of 0 is not allowed when MSAA enabled */
-   if (brw->ctx.Multisample._Enabled) {
+   if (_mesa_is_multisample_enabled(&brw->ctx)) {
       if (line_width_u3_7 == 0)
          line_width_u3_7 = 1;
    } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5f) {
diff --git a/src/mesa/drivers/dri/i965/gen6_cc.c b/src/mesa/drivers/dri/i965/gen6_cc.c
index cee139b7fd4..f5a7d4d0ef6 100644
--- a/src/mesa/drivers/dri/i965/gen6_cc.c
+++ b/src/mesa/drivers/dri/i965/gen6_cc.c
@@ -198,14 +198,14 @@ gen6_upload_blend_state(struct brw_context *brw)
       if(!is_buffer_zero_integer_format) {
          /* _NEW_MULTISAMPLE */
          blend[b].blend1.alpha_to_coverage =
-            ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage;
+            _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage;
 
 	/* From SandyBridge PRM, volume 2 Part 1, section 8.2.3, BLEND_STATE:
 	 * DWord 1, Bit 30 (AlphaToOne Enable):
 	 * "If Dual Source Blending is enabled, this bit must be disabled"
 	 */
          WARN_ONCE(ctx->Color.Blend[b]._UsesDualSrc &&
-                   ctx->Multisample._Enabled &&
+                   _mesa_is_multisample_enabled(ctx) &&
                    ctx->Multisample.SampleAlphaToOne,
                    "HW workaround: disabling alpha to one with dual src "
                    "blending\n");
@@ -213,7 +213,7 @@ gen6_upload_blend_state(struct brw_context *brw)
             blend[b].blend1.alpha_to_one = false;
 	 else
 	    blend[b].blend1.alpha_to_one =
-	       ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToOne;
+	       _mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToOne;
 
          blend[b].blend1.alpha_to_coverage_dither = (brw->gen >= 7);
       }
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index 8eb620de56b..fcd313aece2 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -171,7 +171,7 @@ gen6_determine_sample_mask(struct brw_context *brw)
    /* BRW_NEW_NUM_SAMPLES */
    unsigned num_samples = brw->num_samples;
 
-   if (ctx->Multisample._Enabled) {
+   if (_mesa_is_multisample_enabled(ctx)) {
       if (ctx->Multisample.SampleCoverage) {
          coverage = ctx->Multisample.SampleCoverageValue;
          coverage_invert = ctx->Multisample.SampleCoverageInvert;
diff --git a/src/mesa/drivers/dri/i965/gen8_blend_state.c b/src/mesa/drivers/dri/i965/gen8_blend_state.c
index 786c79ad44d..63186bd4897 100644
--- a/src/mesa/drivers/dri/i965/gen8_blend_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_blend_state.c
@@ -65,7 +65,7 @@ gen8_upload_blend_state(struct brw_context *brw)
 
    if (rb_zero_type != GL_INT && rb_zero_type != GL_UNSIGNED_INT) {
       /* _NEW_MULTISAMPLE */
-      if (ctx->Multisample._Enabled) {
+      if (_mesa_is_multisample_enabled(ctx)) {
          if (ctx->Multisample.SampleAlphaToCoverage) {
             blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_ENABLE;
             blend[0] |= GEN8_BLEND_ALPHA_TO_COVERAGE_DITHER_ENABLE;
@@ -183,7 +183,7 @@ gen8_upload_blend_state(struct brw_context *brw)
       * "If Dual Source Blending is enabled, this bit must be disabled."
       */
       WARN_ONCE(ctx->Color.Blend[i]._UsesDualSrc &&
-                ctx->Multisample._Enabled &&
+                _mesa_is_multisample_enabled(ctx) &&
                 ctx->Multisample.SampleAlphaToOne,
                 "HW workaround: disabling alpha to one with dual src "
                 "blending\n");
@@ -226,7 +226,7 @@ gen8_upload_ps_blend(struct brw_context *brw)
       dw1 |= GEN8_PS_BLEND_ALPHA_TEST_ENABLE;
 
    /* _NEW_MULTISAMPLE */
-   if (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage)
+   if (_mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage)
       dw1 |= GEN8_PS_BLEND_ALPHA_TO_COVERAGE_ENABLE;
 
    /* Used for implementing the following bit of GL_EXT_texture_integer:
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index 93100a0708f..8aaa1a8e449 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -29,6 +29,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 #include "brw_wm.h"
+#include "main/framebuffer.h"
 
 /**
  * Helper function to emit depth related command packets.
@@ -303,7 +304,7 @@ pma_fix_enable(const struct brw_context *brw)
    const bool kill_pixel =
       brw->wm.prog_data->uses_kill ||
       brw->wm.prog_data->uses_omask ||
-      (ctx->Multisample._Enabled && ctx->Multisample.SampleAlphaToCoverage) ||
+      (_mesa_is_multisample_enabled(ctx) && ctx->Multisample.SampleAlphaToCoverage) ||
       ctx->Color.AlphaEnabled;
 
    /* The big formula in CACHE_MODE_1::NP PMA FIX ENABLE. */
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 8b6f31f3be6..2ac21f7c873 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -178,7 +178,7 @@ upload_sf(struct brw_context *brw)
       dw3 |= GEN6_SF_USE_STATE_POINT_WIDTH;
 
    /* _NEW_POINT | _NEW_MULTISAMPLE */
-   if ((ctx->Point.SmoothFlag || ctx->Multisample._Enabled) &&
+   if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
        !ctx->Point.PointSprite) {
       dw3 |= GEN8_SF_SMOOTH_POINT_ENABLE;
    }
@@ -249,7 +249,7 @@ upload_raster(struct brw_context *brw)
    if (ctx->Point.SmoothFlag)
       dw1 |= GEN8_RASTER_SMOOTH_POINT_ENABLE;
 
-   if (ctx->Multisample._Enabled)
+   if (_mesa_is_multisample_enabled(ctx))
       dw1 |= GEN8_RASTER_API_MULTISAMPLE_ENABLE;
 
    if (ctx->Polygon.OffsetFill)
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index d18166d528e..f69dc6cb3e6 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -983,3 +983,22 @@ _mesa_is_front_buffer_drawing(const struct gl_framebuffer *fb)
    return (fb->_NumColorDrawBuffers >= 1 &&
            fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT);
 }
+
+static inline GLuint
+_mesa_geometric_nonvalidated_samples(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Visual.samples :
+      buffer->DefaultGeometry.NumSamples;
+}
+
+bool _mesa_is_multisample_enabled(const struct gl_context *ctx)
+{
+   /* The sample count may not be validated by the driver, but when it is set,
+    * we know that it is in a valid range and no driver should ever validate a
+    * multisampled framebuffer to non-multisampled and vice-versa.
+    */
+   return ctx->Multisample.Enabled &&
+          ctx->DrawBuffer &&
+          _mesa_geometric_nonvalidated_samples(ctx->DrawBuffer) > 1;
+}
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index fa434d447ae..384f7498776 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -146,4 +146,7 @@ _mesa_is_front_buffer_reading(const struct gl_framebuffer *fb);
 extern bool
 _mesa_is_front_buffer_drawing(const struct gl_framebuffer *fb);
 
+extern bool
+_mesa_is_multisample_enabled(const struct gl_context *ctx);
+
 #endif /* FRAMEBUFFER_H */
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5d8bfe4bb09..399f4508415 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -667,7 +667,6 @@ struct gl_list_attrib
 struct gl_multisample_attrib
 {
    GLboolean Enabled;
-   GLboolean _Enabled;   /**< true if Enabled and multisample buffer */
    GLboolean SampleAlphaToCoverage;
    GLboolean SampleAlphaToOne;
    GLboolean SampleCoverage;
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 57f13411fdf..917ae4da023 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -343,20 +343,6 @@ update_frontbit(struct gl_context *ctx)
 }
 
 
-/**
- * Update derived multisample state.
- */
-static void
-update_multisample(struct gl_context *ctx)
-{
-   ctx->Multisample._Enabled = GL_FALSE;
-   if (ctx->Multisample.Enabled &&
-       ctx->DrawBuffer &&
-       _mesa_geometric_samples(ctx->DrawBuffer) > 0)
-      ctx->Multisample._Enabled = GL_TRUE;
-}
-
-
 /**
  * Update the ctx->VertexProgram._TwoSideEnabled flag.
  */
@@ -450,9 +436,6 @@ _mesa_update_state_locked( struct gl_context *ctx )
    if (new_state & _NEW_PIXEL)
       _mesa_update_pixel( ctx, new_state );
 
-   if (new_state & (_NEW_MULTISAMPLE | _NEW_BUFFERS))
-      update_multisample( ctx );
-
    /* ctx->_NeedEyeCoords is now up to date.
     *
     * If the truth value of this variable has changed, update for the
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index db53377d705..03ece6711c2 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -502,7 +502,7 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
                minImplSize = ctx->Const.MinPointSizeAA;
                maxImplSize = ctx->Const.MaxPointSize;
             }
-            else if (ctx->Point.SmoothFlag || ctx->Multisample._Enabled) {
+            else if (ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) {
                minImplSize = ctx->Const.MinPointSizeAA;
                maxImplSize = ctx->Const.MaxPointSizeAA;
             }
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index c20cadf508f..366163e42df 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -31,6 +31,7 @@
   */
  
 #include "main/macros.h"
+#include "main/framebuffer.h"
 #include "st_context.h"
 #include "st_atom.h"
 #include "st_debug.h"
@@ -235,12 +236,12 @@ static void update_raster_state( struct st_context *st )
    raster->line_stipple_factor = ctx->Line.StippleFactor - 1;
 
    /* _NEW_MULTISAMPLE */
-   raster->multisample = ctx->Multisample._Enabled;
+   raster->multisample = _mesa_is_multisample_enabled(ctx);
 
    /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
    raster->force_persample_interp =
          !st->force_persample_in_shader &&
-         ctx->Multisample._Enabled &&
+         _mesa_is_multisample_enabled(ctx) &&
          ctx->Multisample.SampleShading &&
          ctx->Multisample.MinSampleShadingValue *
          ctx->DrawBuffer->Visual.samples > 1;
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index ff90bd61d5b..709f0cbcb91 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -74,7 +74,7 @@ update_fp( struct st_context *st )
    /* _NEW_MULTISAMPLE | _NEW_BUFFERS */
    key.persample_shading =
       st->force_persample_in_shader &&
-      st->ctx->Multisample._Enabled &&
+      _mesa_is_multisample_enabled(st->ctx) &&
       st->ctx->Multisample.SampleShading &&
       st->ctx->Multisample.MinSampleShadingValue *
       _mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index d9aae73302c..3163b0407ea 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -22,7 +22,7 @@
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
-
+#include "main/framebuffer.h"
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "s_context.h"
@@ -257,7 +257,7 @@ smooth_point(struct gl_context *ctx, const SWvertex *vert)
    size = get_size(ctx, vert, GL_TRUE);
 
    /* alpha attenuation / fade factor */
-   if (ctx->Multisample._Enabled) {
+   if (_mesa_is_multisample_enabled(ctx)) {
       if (vert->pointSize >= ctx->Point.Threshold) {
          alphaAtten = 1.0F;
       }

From 7880b81d39e56f1d4b062519f087a053c01ee0e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 23 Mar 2016 11:58:28 -0500
Subject: [PATCH 178/197] radeonsi: silence a coverity warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following Coverity warning

5378     	tmpl.fetch_args = atomic_fetch_args;
5379     	tmpl.emit = atomic_emit;
>>>     CID 1357115:  Uninitialized variables  (UNINIT)
>>>     Using uninitialized value "tmpl". Field "tmpl.intr_name" is uninitialized.
5380     	bld_base->op_actions[TGSI_OPCODE_ATOMUADD] = tmpl;
5381     	bld_base->op_actions[TGSI_OPCODE_ATOMUADD].intr_name = "add";

... is a false positive, but what the hell. This change should "fix" it.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 1e4bf828ae4..9eb531f8d80 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5362,7 +5362,7 @@ static void si_init_shader_ctx(struct si_shader_context *ctx,
 			       LLVMTargetMachineRef tm)
 {
 	struct lp_build_tgsi_context *bld_base;
-	struct lp_build_tgsi_action tmpl;
+	struct lp_build_tgsi_action tmpl = {};
 
 	memset(ctx, 0, sizeof(*ctx));
 	radeon_llvm_context_init(&ctx->radeon_bld, "amdgcn--");

From 6b763c026de0aa4c18bb698ddcfd25d04c73e56e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Wed, 23 Mar 2016 15:22:16 -0500
Subject: [PATCH 179/197] st/mesa: use RGBA instead of BGRA for SRGB_ALPHA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a regression introduced by commit a8eea696 "st/mesa: honour sized
internal formats in st_choose_format (v2)".

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94657
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94671
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_format.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 4b5f8199c6c..9a280fc004b 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -1309,7 +1309,7 @@ static const struct format_mapping format_map[] = {
    },
    {
       { GL_SRGB_ALPHA_EXT, GL_SRGB8_ALPHA8_EXT, 0 },
-      { DEFAULT_SRGBA_FORMATS }
+      { PIPE_FORMAT_R8G8B8A8_SRGB, DEFAULT_SRGBA_FORMATS }
    },
    {
       { GL_COMPRESSED_SRGB_EXT, GL_COMPRESSED_SRGB_S3TC_DXT1_EXT, 0 },

From 412e686da9e64d5b56b0a9c57c2b95624c56ea05 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 23 Mar 2016 21:38:42 -0700
Subject: [PATCH 180/197] mesa: Include null terminator in
 GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH.

From the KHR_debug spec:
"Applications can query the number of messages currently in the log by
 obtaining the value of DEBUG_LOGGED_MESSAGES, and the string length
 (including its null terminator) of the oldest message in the log
 through the value of DEBUG_NEXT_LOGGED_MESSAGE_LENGTH."

Because we weren't including the null terminator, many dEQP tests
called glGetDebugMessageLog with a bufSize parameter that was 1 too
small, and unable to contain the message, so we skipped returning it,
failing many cases.

Fixes 298 dEQP-GLES31.functional.debug.* tests.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Stephane Marchesin <stephane.marchesin@gmail.com>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/mesa/main/debug_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c
index c2b9f053352..74be8554c80 100644
--- a/src/mesa/main/debug_output.c
+++ b/src/mesa/main/debug_output.c
@@ -779,7 +779,7 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname)
       break;
    case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH:
       val = (debug->Log.NumMessages) ?
-         debug->Log.Messages[debug->Log.NextMessage].length : 0;
+         debug->Log.Messages[debug->Log.NextMessage].length + 1 : 0;
       break;
    case GL_DEBUG_GROUP_STACK_DEPTH:
       val = debug->CurrentGroup + 1;

From 028459a00d6faec85ea75ebbaff75fb6f1d91bff Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 23 Mar 2016 23:35:40 -0700
Subject: [PATCH 181/197] mesa: Make glDebugMessageInsert deal with negative
 length for all types.

From the KHR_debug spec, section 5.5.5 (Externally Generated Messages):

   "If <length> is negative, it is implied that <buf> contains a null
    terminated string. The error INVALID_VALUE will be generated if the
    number of characters in <buf>, excluding the null terminator when
    <length> is negative, is not less than the value of
    MAX_DEBUG_MESSAGE_LENGTH."

This indicates that length should be set to strlen for all types, not
just GL_DEBUG_TYPE_MARKER.  We want it to be after validate_length()
so we still generate appropriate errors.

Fixes crashes from uncaught std::string exceptions in many
dEQP-GLES31.functional.debug.error_filters.* tests.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/mesa/main/debug_output.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c
index 74be8554c80..001f63ea94d 100644
--- a/src/mesa/main/debug_output.c
+++ b/src/mesa/main/debug_output.c
@@ -1009,15 +1009,16 @@ _mesa_DebugMessageInsert(GLenum source, GLenum type, GLuint id,
    if (!validate_length(ctx, callerstr, length, buf))
       return; /* GL_INVALID_VALUE */
 
+   /* if length not specified, string will be null terminated: */
+   if (length < 0)
+      length = strlen(buf);
+
    _mesa_log_msg(ctx, gl_enum_to_debug_source(source),
                  gl_enum_to_debug_type(type), id,
                  gl_enum_to_debug_severity(severity),
                  length, buf);
 
    if (type == GL_DEBUG_TYPE_MARKER && ctx->Driver.EmitStringMarker) {
-      /* if length not specified, string will be null terminated: */
-      if (length < 0)
-         length = strlen(buf);
       ctx->Driver.EmitStringMarker(ctx, buf, length);
    }
 }

From d1bb1df87ed8518693730efc80b3a8b9912bb7bf Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 23 Mar 2016 23:46:12 -0700
Subject: [PATCH 182/197] mesa: Handle negative length in glPushDebugGroup().

The KHR_debug spec doesn't actually say we should handle this, but that
is most likely an oversight - it says to check against strlen and
generate errors if length is negative.  It appears they just forgot to
explicitly spell out that we should then proceed to actually handle it.

Fixes crashes from uncaught std::string exceptions in many
dEQP-GLES31.functional.debug.error_filters.* tests.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/mesa/main/debug_output.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c
index 001f63ea94d..85f64bd459f 100644
--- a/src/mesa/main/debug_output.c
+++ b/src/mesa/main/debug_output.c
@@ -1189,6 +1189,9 @@ _mesa_PushDebugGroup(GLenum source, GLuint id, GLsizei length,
    if (!validate_length(ctx, callerstr, length, message))
       return; /* GL_INVALID_VALUE */
 
+   if (length < 0)
+      length = strlen(message);
+
    debug = _mesa_lock_debug_state(ctx);
    if (!debug)
       return;

From b9c70fcdadec9a73db13794e89253ababc063574 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 23 Mar 2016 23:29:20 +0100
Subject: [PATCH 183/197] nv50/ir: silence unhandled TGSI_PROPERTY_NEXT_SHADER
 info

radeonsi uses this property to make the best decision about which
shader to compile, but this is not currently used by our codegen.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 12f2551ddf4..611d5f9c3ed 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1039,6 +1039,9 @@ void Source::scanProperty(const struct tgsi_full_property *prop)
    case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
       info->io.cullDistances = prop->u[0].Data;
       break;
+   case TGSI_PROPERTY_NEXT_SHADER:
+      /* Do not need to know the next shader stage. */
+      break;
    default:
       INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
       break;

From 61c7d20e4f3c2902582acfcd7212f3357034f33b Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Thu, 24 Mar 2016 15:44:35 -0400
Subject: [PATCH 184/197] ttn: remove stray global from header

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/auxiliary/nir/tgsi_to_nir.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.h b/src/gallium/auxiliary/nir/tgsi_to_nir.h
index 0651870ea80..f480009afa4 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.h
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.h
@@ -23,8 +23,6 @@
 
 #include "compiler/nir/nir.h"
 
-struct nir_shader_compiler_options *options;
-
 struct nir_shader *
 tgsi_to_nir(const void *tgsi_tokens,
             const struct nir_shader_compiler_options *options);

From 9a41d947319f2d2999b4b5442ce20443d7c3cf3a Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 16 Jan 2016 16:42:06 -0800
Subject: [PATCH 185/197] util/bitset: Allow iterating over const bitsets

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/util/bitset.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/util/bitset.h b/src/util/bitset.h
index c452819414f..2404ce7f630 100644
--- a/src/util/bitset.h
+++ b/src/util/bitset.h
@@ -98,7 +98,7 @@ __bitset_ffs(const BITSET_WORD *x, int n)
 
 static inline unsigned
 __bitset_next_set(unsigned i, BITSET_WORD *tmp,
-                  BITSET_WORD *set, unsigned size)
+                  const BITSET_WORD *set, unsigned size)
 {
    unsigned bit, word;
 

From e4dc82cfcffd9c3472b962b6bd7328788926452d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 12 Feb 2016 21:41:42 -0800
Subject: [PATCH 186/197] nir: Add a phi node placement helper

Right now, we have phi placement code in two places and there are other
places where it would be nice to be able to do this analysis.  Instead of
repeating it all over the place, this commit adds a helper for placing all
of the needed phi nodes for a value.

v2: Add better documentation

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/Makefile.sources      |   2 +
 src/compiler/nir/Makefile.sources  |   2 +
 src/compiler/nir/nir_phi_builder.c | 295 +++++++++++++++++++++++++++++
 src/compiler/nir/nir_phi_builder.h | 115 +++++++++++
 4 files changed, 414 insertions(+)
 create mode 100644 src/compiler/nir/nir_phi_builder.c
 create mode 100644 src/compiler/nir/nir_phi_builder.h

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 6ab0aa7b896..c38454e0267 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -215,6 +215,8 @@ NIR_FILES = \
 	nir/nir_opt_peephole_select.c \
 	nir/nir_opt_remove_phis.c \
 	nir/nir_opt_undef.c \
+	nir/nir_phi_builder.c \
+	nir/nir_phi_builder.h \
 	nir/nir_print.c \
 	nir/nir_remove_dead_variables.c \
 	nir/nir_search.c \
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index f31547b9aac..db3eeccf858 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -58,6 +58,8 @@ NIR_FILES = \
 	nir_opt_peephole_select.c \
 	nir_opt_remove_phis.c \
 	nir_opt_undef.c \
+	nir_phi_builder.c \
+	nir_phi_builder.h \
 	nir_print.c \
 	nir_remove_dead_variables.c \
 	nir_search.c \
diff --git a/src/compiler/nir/nir_phi_builder.c b/src/compiler/nir/nir_phi_builder.c
new file mode 100644
index 00000000000..a39e3606fd5
--- /dev/null
+++ b/src/compiler/nir/nir_phi_builder.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir_phi_builder.h"
+#include "nir/nir_vla.h"
+
+struct nir_phi_builder {
+   nir_shader *shader;
+   nir_function_impl *impl;
+
+   /* Copied from impl->num_blocks at creation time for easy access. */
+   unsigned num_blocks;
+
+   /* Array of all blocks indexed by block->index. */
+   nir_block **blocks;
+
+   /* Every nir_phi_builder_value registered with this builder. */
+   struct exec_list values;
+
+   /* Phi-placement worklist, reused by each nir_phi_builder_add_value() */
+   unsigned iter_count; /* bumped once per value; current worklist stamp */
+   unsigned *work;      /* per-block stamp: == iter_count if already queued */
+   nir_block **W;       /* queue of blocks still to be processed */
+};
+
+#define NEEDS_PHI ((nir_ssa_def *)(intptr_t)-1)
+
+struct nir_phi_builder_value {
+   struct exec_node node;
+
+   struct nir_phi_builder *builder;
+
+   /* Needed so we can create phis and undefs */
+   unsigned num_components;
+   unsigned bit_size;
+
+   /* The list of phi nodes associated with this value.  Phi nodes are not
+    * added directly.  Instead, they are created, the instr->block pointer
+    * set, and then added to this list.  Later, in nir_phi_builder_finish(),
+    * we set up their sources and add them to the top of their respective
+    * blocks.
+    */
+   struct exec_list phis;
+
+   /* Array of SSA defs, indexed by block->index.  For each block, this array
+    * has one of three types of values:
+    *
+    *  - NULL. Indicates that there is no known definition in this block.  If
+    *    you need to find one, look at the block's immediate dominator.
+    *
+    *  - NEEDS_PHI. Indicates that the block may need a phi node but none has
+    *    been created yet.  If a def is requested for a block, a phi will need
+    *    to be created.
+    *
+    *  - A regular SSA def.  This will be either the result of a phi node or
+    *    one of the defs provided by nir_phi_builder_value_set_block_def().
+    */
+   nir_ssa_def *defs[0];
+};
+
+static bool
+fill_block_array(nir_block *block, void *void_data)
+{
+   /* Per-block callback: file the block under its own index in the array. */
+   ((nir_block **)void_data)[block->index] = block;
+   return true;
+}
+
+struct nir_phi_builder *
+nir_phi_builder_create(nir_function_impl *impl)
+{
+   /* Block indices AND dominance info must both be valid, not just one. */
+   assert((impl->valid_metadata & (nir_metadata_block_index |
+                                   nir_metadata_dominance)) ==
+          (nir_metadata_block_index | nir_metadata_dominance));
+
+   struct nir_phi_builder *pb = ralloc(NULL, struct nir_phi_builder);
+   pb->shader = impl->function->shader;
+   pb->impl = impl;
+   exec_list_make_empty(&pb->values);
+
+   pb->num_blocks = impl->num_blocks;
+   pb->blocks = ralloc_array(pb, nir_block *, pb->num_blocks);
+   nir_foreach_block(impl, fill_block_array, pb->blocks);
+
+   /* work[] is zeroed so every stamp starts below the first iter_count. */
+   pb->iter_count = 0;
+   pb->work = rzalloc_array(pb, unsigned, pb->num_blocks);
+   pb->W = ralloc_array(pb, nir_block *, pb->num_blocks);
+   return pb;
+}
+
+struct nir_phi_builder_value *
+nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
+                          unsigned bit_size, const BITSET_WORD *defs)
+{
+   struct nir_phi_builder_value *val;
+   unsigned i, w_start = 0, w_end = 0;
+
+   val = rzalloc_size(pb, sizeof(*val) + sizeof(val->defs[0]) * pb->num_blocks);
+   val->builder = pb;
+   val->num_components = num_components;
+   val->bit_size = bit_size;
+   exec_list_make_empty(&val->phis);
+   exec_list_push_tail(&pb->values, &val->node);
+   /* New stamp: work[] entries left by earlier values now read as stale. */
+   pb->iter_count++;
+   /* Seed the worklist with every block in which the value is defined. */
+   BITSET_WORD tmp;
+   BITSET_FOREACH_SET(i, tmp, defs, pb->num_blocks) {
+      if (pb->work[i] < pb->iter_count)
+         pb->W[w_end++] = pb->blocks[i];
+      pb->work[i] = pb->iter_count;
+   }
+   /* Drain the worklist, flagging the dominance frontier of each block. */
+   while (w_start != w_end) {
+      nir_block *cur = pb->W[w_start++];
+      struct set_entry *dom_entry;
+      set_foreach(cur->dom_frontier, dom_entry) {
+         nir_block *next = (nir_block *) dom_entry->key;
+
+         /* If there's more than one return statement, then the end block
+          * can be a join point for some definitions. However, there are
+          * no instructions in the end block, so nothing would use those
+          * phi nodes. Of course, we couldn't place those phi nodes
+          * anyways due to the restriction of having no instructions in the
+          * end block...
+          */
+         if (next == pb->impl->end_block)
+            continue;
+
+         if (val->defs[next->index] == NULL) {
+            /* Instead of creating a phi node immediately, we simply set the
+             * value to the magic value NEEDS_PHI.  Later, we create phi nodes
+             * on demand in nir_phi_builder_value_get_block_def().
+             */
+            val->defs[next->index] = NEEDS_PHI;
+
+            if (pb->work[next->index] < pb->iter_count) {
+               pb->work[next->index] = pb->iter_count;
+               pb->W[w_end++] = next;
+            }
+         }
+      }
+   }
+
+   return val;
+}
+
+void
+nir_phi_builder_value_set_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block, nir_ssa_def *def)
+{
+   val->defs[block->index] = def; /* replaces any earlier def or NEEDS_PHI */
+}
+
+nir_ssa_def *
+nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block)
+{
+   /* For each block, we have one of three types of values */
+   if (val->defs[block->index] == NULL) {
+      /* NULL indicates that we have no SSA def for this block. */
+      if (block->imm_dom) {
+         /* Grab it from our immediate dominator.  We'll stash it here for
+          * easy access later.
+          */
+         val->defs[block->index] =
+            nir_phi_builder_value_get_block_def(val, block->imm_dom);
+         return val->defs[block->index];
+      } else {
+         /* No immediate dominator means that this block is either the
+          * start block or unreachable.  In either case, the value is
+          * undefined so we need an SSA undef with the value's bit size.
+          */
+         nir_ssa_undef_instr *undef = nir_ssa_undef_instr_create(
+            val->builder->shader, val->num_components);
+         undef->def.bit_size = val->bit_size;
+         nir_instr_insert(nir_before_cf_list(&val->builder->impl->body),
+                          &undef->instr);
+         val->defs[block->index] = &undef->def;
+         return &undef->def;
+      }
+   } else if (val->defs[block->index] == NEEDS_PHI) {
+      /* The magic value NEEDS_PHI indicates that the block needs a phi node
+       * but none has been created.  We need to create one now so we can
+       * return it to the caller.
+       *
+       * Because a phi node may use SSA defs that it does not dominate (this
+       * happens in loops), we do not yet have enough information to fully
+       * fill out the phi node.  Instead, the phi nodes we create here will be
+       * empty (have no sources) and won't actually be placed in the block's
+       * instruction list yet.  Later, in nir_phi_builder_finish(), we walk
+       * over all of the phi instructions, fill out the sources lists, and
+       * place them at the top of their respective block's instruction list.
+       *
+       * Creating phi nodes on-demand allows us to avoid creating dead phi
+       * nodes that will just get deleted later. While this probably isn't a
+       * big win for a full into-SSA pass, other users may use the phi builder
+       * to make small SSA form repairs where most of the phi nodes will never
+       * be used.
+       */
+      nir_phi_instr *phi = nir_phi_instr_create(val->builder->shader);
+      nir_ssa_dest_init(&phi->instr, &phi->dest, val->num_components,
+                        val->bit_size, NULL);
+      phi->instr.block = block;
+      exec_list_push_tail(&val->phis, &phi->instr.node);
+      val->defs[block->index] = &phi->dest.ssa;
+      return &phi->dest.ssa;
+   } else {
+      /* In this case, we have an actual SSA def.  It's either the result of a
+       * phi node created by the case above or one passed to us through
+       * nir_phi_builder_value_set_block_def().
+       */
+      return val->defs[block->index];
+   }
+}
+
+/* qsort() comparator: orders nir_block pointers by ascending block index. */
+static int
+compare_blocks(const void *_a, const void *_b)
+{
+   const unsigned idx_a = (*(nir_block * const *)_a)->index;
+   const unsigned idx_b = (*(nir_block * const *)_b)->index;
+   return (idx_a > idx_b) - (idx_a < idx_b); /* no wrapping subtraction */
+}
+
+void
+nir_phi_builder_finish(struct nir_phi_builder *pb)
+{
+   const unsigned num_blocks = pb->num_blocks;
+   NIR_VLA(nir_block *, preds, num_blocks);
+
+   foreach_list_typed(struct nir_phi_builder_value, val, node, &pb->values) {
+      /* We treat the linked list of phi nodes like a worklist.  The list is
+       * pre-populated by calls to nir_phi_builder_value_get_block_def() that
+       * create phi nodes.  As we fill in the sources of phi nodes, more may
+       * be created and are added to the end of the list.
+       *
+       * Because we are adding and removing phi nodes from the list as we go,
+       * we can't iterate over it normally.  Instead, we just iterate until
+       * the list is empty.
+       */
+      while (!exec_list_is_empty(&val->phis)) {
+         struct exec_node *head = exec_list_get_head(&val->phis);
+         nir_phi_instr *phi = exec_node_data(nir_phi_instr, head, instr.node);
+         assert(phi->instr.type == nir_instr_type_phi);
+
+         exec_node_remove(&phi->instr.node);
+
+         /* Construct an array of predecessors.  We sort it to ensure
+          * determinism in the phi insertion algorithm.
+          *
+          * XXX: Calling qsort this many times seems expensive.
+          */
+         unsigned num_preds = 0;
+         struct set_entry *entry;
+         set_foreach(phi->instr.block->predecessors, entry)
+            preds[num_preds++] = (nir_block *)entry->key;
+         qsort(preds, num_preds, sizeof(*preds), compare_blocks);
+         /* One source per predecessor; fetching a def may queue new phis. */
+         for (unsigned i = 0; i < num_preds; i++) {
+            nir_phi_src *src = ralloc(phi, nir_phi_src);
+            src->pred = preds[i];
+            src->src = nir_src_for_ssa(
+               nir_phi_builder_value_get_block_def(val, preds[i]));
+            exec_list_push_tail(&phi->srcs, &src->node);
+         }
+         /* The phi is complete now, so it can go at the top of its block. */
+         nir_instr_insert(nir_before_block(phi->instr.block), &phi->instr);
+      }
+   }
+
+   ralloc_free(pb);
+}
diff --git a/src/compiler/nir/nir_phi_builder.h b/src/compiler/nir/nir_phi_builder.h
new file mode 100644
index 00000000000..edc530268c2
--- /dev/null
+++ b/src/compiler/nir/nir_phi_builder.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "nir.h"
+
+/** A helper for placing phi nodes in a NIR shader
+ *
+ * Basic usage goes something like this:
+ *
+ *     each variable, var, has:
+ *         a bitset var.defs of blocks where the variable is defined
+ *         a struct nir_phi_builder_value *pb_val
+ *
+ *     // initialize bitsets
+ *     foreach block:
+ *         foreach def of variable var:
+ *             var.defs[def.block] = true;
+ *
+ *     // initialize phi builder
+ *     pb = nir_phi_builder_create()
+ *     foreach var:
+ *         var.pb_val = nir_phi_builder_add_value(pb, var.defs)
+ *
+ *     // Visit each block.  This needs to visit dominators first;
+ *     // nir_foreach_block() will be ok.
+ *     foreach block:
+ *         foreach instruction:
+ *             foreach use of variable var:
+ *                 replace with nir_phi_builder_value_get_block_def(var.pb_val)
+ *             foreach def of variable var:
+ *                 create ssa def, register it with
+ *                 nir_phi_builder_value_set_block_def(var.pb_val)
+ *
+ *     nir_phi_builder_finish(pb)
+ */
+struct nir_phi_builder;
+
+struct nir_phi_builder_value;
+
+/* Create a new phi builder.
+ *
+ * While this is fairly cheap, it does allocate some memory and walk the list
+ * of blocks so it's recommended that you only call it once and use it to
+ * build phis for several values.
+ */
+struct nir_phi_builder *nir_phi_builder_create(nir_function_impl *impl);
+
+/* Register a value with the builder.  'num_components' and 'bit_size' are
+ * used for the phi nodes the builder creates for it.  'defs' is a bitset,
+ * indexed by block index, of the blocks in which the value is defined; it
+ * determines where phi nodes may be needed.
+ */
+struct nir_phi_builder_value *
+nir_phi_builder_add_value(struct nir_phi_builder *pb, unsigned num_components,
+                          unsigned bit_size, const BITSET_WORD *defs);
+
+/* Register a definition for the given value and block.
+ *
+ * It is safe to call this function as many times as you wish for any given
+ * block/value pair.  However, it always replaces whatever was there
+ * previously even if that definition is from a phi node.  The phi builder
+ * always uses the latest information it has, so you must be careful about the
+ * order in which you register definitions.  The final value at the end of the
+ * block must be the last value registered.
+ */
+void
+nir_phi_builder_value_set_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block, nir_ssa_def *def);
+
+/* Get the definition for the given value in the given block.
+ *
+ * This definition will always be the latest definition known for the given
+ * block.  If no definition is immediately available, it will crawl up the
+ * dominance tree and insert phi nodes as needed until it finds one.  In the
+ * case that no suitable definition is found, it will return the result of a
+ * nir_ssa_undef_instr with the correct number of components.
+ *
+ * Because this function only uses the latest available information for any
+ * given block, you must have already finished registering definitions for any
+ * blocks that dominate the current block in order to get the correct result.
+ */
+nir_ssa_def *
+nir_phi_builder_value_get_block_def(struct nir_phi_builder_value *val,
+                                    nir_block *block);
+
+/* Finish building phi nodes and free the builder.
+ *
+ * This function does far more than just free memory.  Prior to calling
+ * nir_phi_builder_finish, no phi nodes have actually been inserted in the
+ * program.  This function is what finishes setting up phi node sources and
+ * adds the phi nodes to the program.
+ */
+void nir_phi_builder_finish(struct nir_phi_builder *pb);

From 42ddfc611f84297abeadf74be424387b127f7567 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 29 Dec 2015 15:25:43 -0800
Subject: [PATCH 187/197] nir/dominance: Handle unreachable blocks

Previously, nir_dominance.c didn't properly handle unreachable blocks.
This can happen if, for instance, you have something like this:

loop {
   if (...) {
      break;
   } else {
      break;
   }
}

In this case, the block right after the if statement will be unreachable.
This commit makes two changes to handle this.  First, it removes an assert
and allows block->imm_dom to be null if the block is unreachable.  Second,
it properly skips unreachable blocks in calc_dom_frontier_cb.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/compiler/nir/nir_dominance.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_dominance.c b/src/compiler/nir/nir_dominance.c
index b345b85e8a0..d95f3968074 100644
--- a/src/compiler/nir/nir_dominance.c
+++ b/src/compiler/nir/nir_dominance.c
@@ -94,7 +94,6 @@ calc_dominance_cb(nir_block *block, void *_state)
       }
    }
 
-   assert(new_idom);
    if (block->imm_dom != new_idom) {
       block->imm_dom = new_idom;
       state->progress = true;
@@ -112,6 +111,11 @@ calc_dom_frontier_cb(nir_block *block, void *state)
       struct set_entry *entry;
       set_foreach(block->predecessors, entry) {
          nir_block *runner = (nir_block *) entry->key;
+
+         /* Skip unreachable predecessors */
+         if (runner->imm_dom == NULL)
+            continue;
+
          while (runner != block->imm_dom) {
             _mesa_set_add(runner->dom_frontier, block);
             runner = runner->imm_dom;

From ea98d415e42b7a97b8c9f37eb2e0e0f6ad98d14e Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 12 Feb 2016 21:48:26 -0800
Subject: [PATCH 188/197] nir/vars_to_ssa: Use the new nir_phi_builder helper

The efficiency should be approximately the same.  We do a little more work
per phi node because we have to sort the predecessors.  However, we no
longer have to walk the blocks a second time to pop things off the stack.
The bigger advantage, however, is that we can now re-use the phi placement
and per-block SSA value tracking in other passes.

As a side-benefit, the phi builder actually handles unreachable blocks
correctly.  The original vars_to_ssa code, because of the way it iterated
the blocks and added phi sources, didn't add sources corresponding to
predecessors of unreachable blocks.  The new strategy employed by the phi
builder creates a phi source for each predecessor and should correctly
handle unreachable blocks by setting those sources to SSA undefs.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/nir/nir_lower_vars_to_ssa.c | 521 +++++++----------------
 1 file changed, 148 insertions(+), 373 deletions(-)

diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index 2331791d135..9f9e454c198 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -27,6 +27,7 @@
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_phi_builder.h"
 #include "nir_vla.h"
 
 
@@ -47,8 +48,7 @@ struct deref_node {
    struct set *stores;
    struct set *copies;
 
-   nir_ssa_def **def_stack;
-   nir_ssa_def **def_stack_tail;
+   struct nir_phi_builder_value *pb_value;
 
    struct deref_node *wildcard;
    struct deref_node *indirect;
@@ -87,8 +87,7 @@ struct lower_variables_state {
     */
    bool add_to_direct_deref_nodes;
 
-   /* A hash table mapping phi nodes to deref_state data */
-   struct hash_table *phi_table;
+   struct nir_phi_builder *phi_builder;
 };
 
 static struct deref_node *
@@ -473,114 +472,6 @@ lower_copies_to_load_store(struct deref_node *node,
    return true;
 }
 
-/** Pushes an SSA def onto the def stack for the given node
- *
- * Each node is potentially associated with a stack of SSA definitions.
- * This stack is used for determining what SSA definition reaches a given
- * point in the program for variable renaming.  The stack is always kept in
- * dominance-order with at most one SSA def per block.  If the SSA
- * definition on the top of the stack is in the same block as the one being
- * pushed, the top element is replaced.
- */
-static void
-def_stack_push(struct deref_node *node, nir_ssa_def *def,
-               struct lower_variables_state *state)
-{
-   if (node->def_stack == NULL) {
-      node->def_stack = ralloc_array(state->dead_ctx, nir_ssa_def *,
-                                     state->impl->num_blocks);
-      node->def_stack_tail = node->def_stack - 1;
-   }
-
-   if (node->def_stack_tail >= node->def_stack) {
-      nir_ssa_def *top_def = *node->def_stack_tail;
-
-      if (def->parent_instr->block == top_def->parent_instr->block) {
-         /* They're in the same block, just replace the top */
-         *node->def_stack_tail = def;
-         return;
-      }
-   }
-
-   *(++node->def_stack_tail) = def;
-}
-
-/* Pop the top of the def stack if it's in the given block */
-static void
-def_stack_pop_if_in_block(struct deref_node *node, nir_block *block)
-{
-   /* If we're popping, then we have presumably pushed at some time in the
-    * past so this should exist.
-    */
-   assert(node->def_stack != NULL);
-
-   /* The stack is already empty.  Do nothing. */
-   if (node->def_stack_tail < node->def_stack)
-      return;
-
-   nir_ssa_def *def = *node->def_stack_tail;
-   if (def->parent_instr->block == block)
-      node->def_stack_tail--;
-}
-
-/** Retrieves the SSA definition on the top of the stack for the given
- * node, if one exists.  If the stack is empty, then we return the constant
- * initializer (if it exists) or an SSA undef.
- */
-static nir_ssa_def *
-get_ssa_def_for_block(struct deref_node *node, nir_block *block,
-                      struct lower_variables_state *state)
-{
-   /* If we have something on the stack, go ahead and return it.  We're
-    * assuming that the top of the stack dominates the given block.
-    */
-   if (node->def_stack && node->def_stack_tail >= node->def_stack)
-      return *node->def_stack_tail;
-
-   /* If we got here then we don't have a definition that dominates the
-    * given block.  This means that we need to add an undef and use that.
-    */
-   nir_ssa_undef_instr *undef =
-      nir_ssa_undef_instr_create(state->shader,
-                                 glsl_get_vector_elements(node->type));
-   nir_instr_insert_before_cf_list(&state->impl->body, &undef->instr);
-   def_stack_push(node, &undef->def, state);
-   return &undef->def;
-}
-
-/* Given a block and one of its predecessors, this function fills in the
- * souces of the phi nodes to take SSA defs from the given predecessor.
- * This function must be called exactly once per block/predecessor pair.
- */
-static void
-add_phi_sources(nir_block *block, nir_block *pred,
-                struct lower_variables_state *state)
-{
-   nir_foreach_instr(block, instr) {
-      if (instr->type != nir_instr_type_phi)
-         break;
-
-      nir_phi_instr *phi = nir_instr_as_phi(instr);
-
-      struct hash_entry *entry =
-            _mesa_hash_table_search(state->phi_table, phi);
-      if (!entry)
-         continue;
-
-      struct deref_node *node = entry->data;
-
-      nir_phi_src *src = ralloc(phi, nir_phi_src);
-      src->pred = pred;
-      src->src.parent_instr = &phi->instr;
-      src->src.is_ssa = true;
-      src->src.ssa = get_ssa_def_for_block(node, pred, state);
-
-      list_addtail(&src->src.use_link, &src->src.ssa->uses);
-
-      exec_list_push_tail(&phi->srcs, &src->node);
-   }
-}
-
 /* Performs variable renaming by doing a DFS of the dominance tree
  *
  * This algorithm is very similar to the one outlined in "Efficiently
@@ -595,271 +486,130 @@ rename_variables_block(nir_block *block, struct lower_variables_state *state)
    nir_builder_init(&b, state->impl);
 
    nir_foreach_instr_safe(block, instr) {
-      if (instr->type == nir_instr_type_phi) {
-         nir_phi_instr *phi = nir_instr_as_phi(instr);
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
 
-         struct hash_entry *entry =
-            _mesa_hash_table_search(state->phi_table, phi);
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 
-         /* This can happen if we already have phi nodes in the program
-          * that were not created in this pass.
-          */
-         if (!entry)
-            continue;
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_var: {
+         struct deref_node *node =
+            get_deref_node(intrin->variables[0], state);
 
-         struct deref_node *node = entry->data;
+         if (node == NULL) {
+            /* If we hit this path then we are referencing an invalid
+             * value.  Most likely, we unrolled something and are
+             * reading past the end of some array.  In any case, this
+             * should result in an undefined value.
+             */
+            nir_ssa_undef_instr *undef =
+               nir_ssa_undef_instr_create(state->shader,
+                                          intrin->num_components);
+            undef->def.bit_size = intrin->dest.ssa.bit_size;
 
-         def_stack_push(node, &phi->dest.ssa, state);
-      } else if (instr->type == nir_instr_type_intrinsic) {
-         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-         switch (intrin->intrinsic) {
-         case nir_intrinsic_load_var: {
-            struct deref_node *node =
-               get_deref_node(intrin->variables[0], state);
-
-            if (node == NULL) {
-               /* If we hit this path then we are referencing an invalid
-                * value.  Most likely, we unrolled something and are
-                * reading past the end of some array.  In any case, this
-                * should result in an undefined value.
-                */
-               nir_ssa_undef_instr *undef =
-                  nir_ssa_undef_instr_create(state->shader,
-                                             intrin->num_components);
-
-               nir_instr_insert_before(&intrin->instr, &undef->instr);
-               nir_instr_remove(&intrin->instr);
-
-               nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                        nir_src_for_ssa(&undef->def));
-               continue;
-            }
-
-            if (!node->lower_to_ssa)
-               continue;
-
-            nir_alu_instr *mov = nir_alu_instr_create(state->shader,
-                                                      nir_op_imov);
-            mov->src[0].src.is_ssa = true;
-            mov->src[0].src.ssa = get_ssa_def_for_block(node, block, state);
-            for (unsigned i = intrin->num_components; i < 4; i++)
-               mov->src[0].swizzle[i] = 0;
-
-            assert(intrin->dest.is_ssa);
-
-            mov->dest.write_mask = (1 << intrin->num_components) - 1;
-            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                              intrin->num_components,
-                              intrin->dest.ssa.bit_size, NULL);
-
-            nir_instr_insert_before(&intrin->instr, &mov->instr);
+            nir_instr_insert_before(&intrin->instr, &undef->instr);
             nir_instr_remove(&intrin->instr);
 
             nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                     nir_src_for_ssa(&mov->dest.dest.ssa));
-            break;
-         }
-
-         case nir_intrinsic_store_var: {
-            struct deref_node *node =
-               get_deref_node(intrin->variables[0], state);
-
-            if (node == NULL) {
-               /* Probably an out-of-bounds array store.  That should be a
-                * no-op. */
-               nir_instr_remove(&intrin->instr);
-               continue;
-            }
-
-            if (!node->lower_to_ssa)
-               continue;
-
-            assert(intrin->num_components ==
-                   glsl_get_vector_elements(node->type));
-
-            assert(intrin->src[0].is_ssa);
-
-            nir_ssa_def *new_def;
-            b.cursor = nir_before_instr(&intrin->instr);
-
-            unsigned wrmask = nir_intrinsic_write_mask(intrin);
-            if (wrmask == (1 << intrin->num_components) - 1) {
-               /* Whole variable store - just copy the source.  Note that
-                * intrin->num_components and intrin->src[0].ssa->num_components
-                * may differ.
-                */
-               unsigned swiz[4];
-               for (unsigned i = 0; i < 4; i++)
-                  swiz[i] = i < intrin->num_components ? i : 0;
-
-               new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
-                                     intrin->num_components, false);
-            } else {
-               nir_ssa_def *old_def = get_ssa_def_for_block(node, block, state);
-               /* For writemasked store_var intrinsics, we combine the newly
-                * written values with the existing contents of unwritten
-                * channels, creating a new SSA value for the whole vector.
-                */
-               nir_ssa_def *srcs[4];
-               for (unsigned i = 0; i < intrin->num_components; i++) {
-                  if (wrmask & (1 << i)) {
-                     srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
-                  } else {
-                     srcs[i] = nir_channel(&b, old_def, i);
-                  }
-               }
-               new_def = nir_vec(&b, srcs, intrin->num_components);
-            }
-
-            assert(new_def->num_components == intrin->num_components);
-
-            def_stack_push(node, new_def, state);
-
-            /* We'll wait to remove the instruction until the next pass
-             * where we pop the node we just pushed back off the stack.
-             */
-            break;
-         }
-
-         default:
-            break;
-         }
-      }
-   }
-
-   if (block->successors[0])
-      add_phi_sources(block->successors[0], block, state);
-   if (block->successors[1])
-      add_phi_sources(block->successors[1], block, state);
-
-   for (unsigned i = 0; i < block->num_dom_children; ++i)
-      rename_variables_block(block->dom_children[i], state);
-
-   /* Now we iterate over the instructions and pop off any SSA defs that we
-    * pushed in the first loop.
-    */
-   nir_foreach_instr_safe(block, instr) {
-      if (instr->type == nir_instr_type_phi) {
-         nir_phi_instr *phi = nir_instr_as_phi(instr);
-
-         struct hash_entry *entry =
-            _mesa_hash_table_search(state->phi_table, phi);
-
-         /* This can happen if we already have phi nodes in the program
-          * that were not created in this pass.
-          */
-         if (!entry)
-            continue;
-
-         struct deref_node *node = entry->data;
-
-         def_stack_pop_if_in_block(node, block);
-      } else if (instr->type == nir_instr_type_intrinsic) {
-         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-         if (intrin->intrinsic != nir_intrinsic_store_var)
-            continue;
-
-         struct deref_node *node = get_deref_node(intrin->variables[0], state);
-         if (!node)
+                                     nir_src_for_ssa(&undef->def));
             continue;
+         }
 
          if (!node->lower_to_ssa)
             continue;
 
-         def_stack_pop_if_in_block(node, block);
+         nir_alu_instr *mov = nir_alu_instr_create(state->shader,
+                                                   nir_op_imov);
+         mov->src[0].src = nir_src_for_ssa(
+            nir_phi_builder_value_get_block_def(node->pb_value, block));
+         for (unsigned i = intrin->num_components; i < 4; i++)
+            mov->src[0].swizzle[i] = 0;
+
+         assert(intrin->dest.is_ssa);
+
+         mov->dest.write_mask = (1 << intrin->num_components) - 1;
+         nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
+                           intrin->num_components,
+                           intrin->dest.ssa.bit_size, NULL);
+
+         nir_instr_insert_before(&intrin->instr, &mov->instr);
          nir_instr_remove(&intrin->instr);
+
+         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                  nir_src_for_ssa(&mov->dest.dest.ssa));
+         break;
+      }
+
+      case nir_intrinsic_store_var: {
+         struct deref_node *node =
+            get_deref_node(intrin->variables[0], state);
+
+         if (node == NULL) {
+            /* Probably an out-of-bounds array store.  That should be a
+             * no-op. */
+            nir_instr_remove(&intrin->instr);
+            continue;
+         }
+
+         if (!node->lower_to_ssa)
+            continue;
+
+         assert(intrin->num_components ==
+                glsl_get_vector_elements(node->type));
+
+         assert(intrin->src[0].is_ssa);
+
+         nir_ssa_def *new_def;
+         b.cursor = nir_before_instr(&intrin->instr);
+
+         unsigned wrmask = nir_intrinsic_write_mask(intrin);
+         if (wrmask == (1 << intrin->num_components) - 1) {
+            /* Whole variable store - just copy the source.  Note that
+             * intrin->num_components and intrin->src[0].ssa->num_components
+             * may differ.
+             */
+            unsigned swiz[4];
+            for (unsigned i = 0; i < 4; i++)
+               swiz[i] = i < intrin->num_components ? i : 0;
+
+            new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
+                                  intrin->num_components, false);
+         } else {
+            nir_ssa_def *old_def =
+               nir_phi_builder_value_get_block_def(node->pb_value, block);
+            /* For writemasked store_var intrinsics, we combine the newly
+             * written values with the existing contents of unwritten
+             * channels, creating a new SSA value for the whole vector.
+             */
+            nir_ssa_def *srcs[4];
+            for (unsigned i = 0; i < intrin->num_components; i++) {
+               if (wrmask & (1 << i)) {
+                  srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
+               } else {
+                  srcs[i] = nir_channel(&b, old_def, i);
+               }
+            }
+            new_def = nir_vec(&b, srcs, intrin->num_components);
+         }
+
+         assert(new_def->num_components == intrin->num_components);
+
+         nir_phi_builder_value_set_block_def(node->pb_value, block, new_def);
+         nir_instr_remove(&intrin->instr);
+         break;
+      }
+
+      default:
+         break;
       }
    }
 
+   for (unsigned i = 0; i < block->num_dom_children; ++i)
+      rename_variables_block(block->dom_children[i], state);
+
    return true;
 }
 
-/* Inserts phi nodes for all variables marked lower_to_ssa
- *
- * This is the same algorithm as presented in "Efficiently Computing Static
- * Single Assignment Form and the Control Dependence Graph" by Cytron et.
- * al.
- */
-static void
-insert_phi_nodes(struct lower_variables_state *state)
-{
-   NIR_VLA_ZERO(unsigned, work, state->impl->num_blocks);
-   NIR_VLA_ZERO(unsigned, has_already, state->impl->num_blocks);
-
-   /*
-    * Since the work flags already prevent us from inserting a node that has
-    * ever been inserted into W, we don't need to use a set to represent W.
-    * Also, since no block can ever be inserted into W more than once, we know
-    * that the maximum size of W is the number of basic blocks in the
-    * function. So all we need to handle W is an array and a pointer to the
-    * next element to be inserted and the next element to be removed.
-    */
-   NIR_VLA(nir_block *, W, state->impl->num_blocks);
-
-   unsigned w_start, w_end;
-   unsigned iter_count = 0;
-
-   foreach_list_typed(struct deref_node, node, direct_derefs_link,
-                      &state->direct_deref_nodes) {
-      if (node->stores == NULL)
-         continue;
-
-      if (!node->lower_to_ssa)
-         continue;
-
-      unsigned bit_size = glsl_get_bit_size(glsl_get_base_type(node->type));
-
-      w_start = w_end = 0;
-      iter_count++;
-
-      struct set_entry *store_entry;
-      set_foreach(node->stores, store_entry) {
-         nir_intrinsic_instr *store = (nir_intrinsic_instr *)store_entry->key;
-         if (work[store->instr.block->index] < iter_count)
-            W[w_end++] = store->instr.block;
-         work[store->instr.block->index] = iter_count;
-      }
-
-      while (w_start != w_end) {
-         nir_block *cur = W[w_start++];
-         struct set_entry *dom_entry;
-         set_foreach(cur->dom_frontier, dom_entry) {
-            nir_block *next = (nir_block *) dom_entry->key;
-
-            /*
-             * If there's more than one return statement, then the end block
-             * can be a join point for some definitions. However, there are
-             * no instructions in the end block, so nothing would use those
-             * phi nodes. Of course, we couldn't place those phi nodes
-             * anyways due to the restriction of having no instructions in the
-             * end block...
-             */
-            if (next == state->impl->end_block)
-               continue;
-
-            if (has_already[next->index] < iter_count) {
-               nir_phi_instr *phi = nir_phi_instr_create(state->shader);
-               nir_ssa_dest_init(&phi->instr, &phi->dest,
-                                 glsl_get_vector_elements(node->type),
-                                 bit_size, NULL);
-               nir_instr_insert_before_block(next, &phi->instr);
-
-               _mesa_hash_table_insert(state->phi_table, phi, node);
-
-               has_already[next->index] = iter_count;
-               if (work[next->index] < iter_count) {
-                  work[next->index] = iter_count;
-                  W[w_end++] = next;
-               }
-            }
-         }
-      }
-   }
-}
-
-
 /** Implements a pass to lower variable uses to SSA values
  *
  * This path walks the list of instructions and tries to lower as many
@@ -900,9 +650,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
                                                    _mesa_hash_pointer,
                                                    _mesa_key_pointer_equal);
    exec_list_make_empty(&state.direct_deref_nodes);
-   state.phi_table = _mesa_hash_table_create(state.dead_ctx,
-                                             _mesa_hash_pointer,
-                                             _mesa_key_pointer_equal);
 
    /* Build the initial deref structures and direct_deref_nodes table */
    state.add_to_direct_deref_nodes = true;
@@ -932,17 +679,6 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
       node->lower_to_ssa = true;
       progress = true;
 
-      if (deref->var->constant_initializer) {
-         nir_load_const_instr *load =
-            nir_deref_get_const_initializer_load(state.shader, deref);
-         nir_ssa_def_init(&load->instr, &load->def,
-                          glsl_get_vector_elements(node->type),
-                          glsl_get_bit_size(glsl_get_base_type(node->type)),
-                          NULL);
-         nir_instr_insert_before_cf_list(&impl->body, &load->instr);
-         def_stack_push(node, &load->def, &state);
-      }
-
       foreach_deref_node_match(deref, lower_copies_to_load_store, &state);
    }
 
@@ -959,9 +695,48 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
     */
    nir_foreach_block(impl, register_variable_uses_block, &state);
 
-   insert_phi_nodes(&state);
+   state.phi_builder = nir_phi_builder_create(state.impl);
+
+   NIR_VLA(BITSET_WORD, store_blocks, BITSET_WORDS(state.impl->num_blocks));
+   foreach_list_typed(struct deref_node, node, direct_derefs_link,
+                      &state.direct_deref_nodes) {
+      if (!node->lower_to_ssa)
+         continue;
+
+      memset(store_blocks, 0,
+             BITSET_WORDS(state.impl->num_blocks) * sizeof(*store_blocks));
+
+      if (node->stores) {
+         struct set_entry *store_entry;
+         set_foreach(node->stores, store_entry) {
+            nir_intrinsic_instr *store =
+               (nir_intrinsic_instr *)store_entry->key;
+            BITSET_SET(store_blocks, store->instr.block->index);
+         }
+      }
+
+      if (node->deref->var->constant_initializer)
+         BITSET_SET(store_blocks, 0);
+
+      node->pb_value =
+         nir_phi_builder_add_value(state.phi_builder,
+                                   glsl_get_vector_elements(node->type),
+                                   glsl_get_bit_size(glsl_get_base_type(node->type)),
+                                   store_blocks);
+
+      if (node->deref->var->constant_initializer) {
+         nir_load_const_instr *load =
+            nir_deref_get_const_initializer_load(state.shader, node->deref);
+         nir_instr_insert_before_cf_list(&impl->body, &load->instr);
+         nir_phi_builder_value_set_block_def(node->pb_value,
+                                             nir_start_block(impl), &load->def);
+      }
+   }
+
    rename_variables_block(nir_start_block(impl), &state);
 
+   nir_phi_builder_finish(state.phi_builder);
+
    nir_metadata_preserve(impl, nir_metadata_block_index |
                                nir_metadata_dominance);
 

From 364212f1ede4b2ecf4361e27e24e3d84e19aa54d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 12 Feb 2016 21:52:46 -0800
Subject: [PATCH 189/197] nir: Add a pass to repair SSA form

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/Makefile.sources     |   1 +
 src/compiler/nir/Makefile.sources |   1 +
 src/compiler/nir/nir.h            |   3 +
 src/compiler/nir/nir_repair_ssa.c | 158 ++++++++++++++++++++++++++++++
 4 files changed, 163 insertions(+)
 create mode 100644 src/compiler/nir/nir_repair_ssa.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index c38454e0267..796d0044f46 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -218,6 +218,7 @@ NIR_FILES = \
 	nir/nir_phi_builder.c \
 	nir/nir_phi_builder.h \
 	nir/nir_print.c \
+	nir/nir_repair_ssa.c \
 	nir/nir_remove_dead_variables.c \
 	nir/nir_search.c \
 	nir/nir_search.h \
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index db3eeccf858..c1493551192 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -61,6 +61,7 @@ NIR_FILES = \
 	nir_phi_builder.c \
 	nir_phi_builder.h \
 	nir_print.c \
+	nir_repair_ssa.c \
 	nir_remove_dead_variables.c \
 	nir_search.c \
 	nir_search.h \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 76a511c2d4f..a4596096b59 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2235,6 +2235,9 @@ bool nir_ssa_defs_interfere(nir_ssa_def *a, nir_ssa_def *b);
 void nir_convert_to_ssa_impl(nir_function_impl *impl);
 void nir_convert_to_ssa(nir_shader *shader);
 
+bool nir_repair_ssa_impl(nir_function_impl *impl);
+bool nir_repair_ssa(nir_shader *shader);
+
 /* If phi_webs_only is true, only convert SSA values involved in phi nodes to
  * registers.  If false, convert all values (even those not involved in a phi
  * node) to registers.
diff --git a/src/compiler/nir/nir_repair_ssa.c b/src/compiler/nir/nir_repair_ssa.c
new file mode 100644
index 00000000000..96c791cbc6b
--- /dev/null
+++ b/src/compiler/nir/nir_repair_ssa.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_phi_builder.h"
+
+struct repair_ssa_state {
+   nir_function_impl *impl; /* function implementation being repaired */
+
+   BITSET_WORD *def_set; /* block bitset handed to the phi builder; allocated lazily */
+   struct nir_phi_builder *phi_builder; /* NULL until the first repair is needed */
+
+   bool progress; /* set to true whenever a phi is about to be built */
+};
+
+/* Get ready to build a phi and return the (lazily created) phi builder */
+static struct nir_phi_builder *
+prep_build_phi(struct repair_ssa_state *state)
+{
+   const unsigned num_words = BITSET_WORDS(state->impl->num_blocks);
+
+   /* We create the phi builder, and the def set along with it, on-demand. */
+   if (state->phi_builder == NULL) {
+      state->phi_builder = nir_phi_builder_create(state->impl);
+      state->def_set = ralloc_array(NULL, BITSET_WORD, num_words);
+   }
+
+   /* We're going to build a phi.  That's progress. */
+   state->progress = true;
+
+   /* Set the defs set to empty so the caller can mark fresh def blocks */
+   memset(state->def_set, 0, num_words * sizeof(*state->def_set));
+
+   return state->phi_builder;
+}
+/* Returns the block a use belongs to: the pred block for a phi source, else the user's block. */
+static nir_block *
+get_src_block(nir_src *src)
+{
+   if (src->parent_instr->type == nir_instr_type_phi) {
+      return exec_node_data(nir_phi_src, src, src)->pred;
+   } else {
+      return src->parent_instr->block;
+   }
+}
+/* Per-def callback: rewrites uses of def that def's block does not dominate.  Always returns true. */
+static bool
+repair_ssa_def(nir_ssa_def *def, void *void_state)
+{
+   struct repair_ssa_state *state = void_state;
+   /* The def needs no repair if its block dominates the block of every use. */
+   bool is_valid = true;
+   nir_foreach_use(def, src) {
+      if (!nir_block_dominates(def->parent_instr->block, get_src_block(src))) {
+         is_valid = false;
+         break;
+      }
+   }
+
+   if (is_valid)
+      return true;
+   /* Register def as a phi-builder value whose only definition is in def's own block. */
+   struct nir_phi_builder *pb = prep_build_phi(state);
+
+   BITSET_SET(state->def_set, def->parent_instr->block->index);
+
+   struct nir_phi_builder_value *val =
+      nir_phi_builder_add_value(pb, def->num_components, def->bit_size,
+                                state->def_set);
+
+   nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def);
+   /* Point each non-dominated use at the value the phi builder gives for the use's block. */
+   nir_foreach_use_safe(def, src) {
+      nir_block *src_block = get_src_block(src);
+      if (!nir_block_dominates(def->parent_instr->block, src_block)) {
+         nir_instr_rewrite_src(src->parent_instr, src, nir_src_for_ssa(
+            nir_phi_builder_value_get_block_def(val, src_block)));
+      }
+   }
+   /* NOTE(review): only nir_foreach_use uses are handled; confirm no other use kinds need this. */
+   return true;
+}
+/* Per-block callback: runs repair_ssa_def on the SSA defs of every instruction in the block. */
+static bool
+repair_ssa_block(nir_block *block, void *state)
+{
+   nir_foreach_instr_safe(block, instr) {
+      nir_foreach_ssa_def(instr, repair_ssa_def, state);
+   }
+
+   return true;
+}
+/* Repairs SSA form in one function implementation; returns true if any def needed repair. */
+bool
+nir_repair_ssa_impl(nir_function_impl *impl)
+{
+   struct repair_ssa_state state;
+   /* state.def_set is left unset here; prep_build_phi() allocates it with the builder. */
+   state.impl = impl;
+   state.phi_builder = NULL;
+   state.progress = false;
+   /* repair_ssa_def() relies on block indices and on dominance information. */
+   nir_metadata_require(impl, nir_metadata_block_index |
+                              nir_metadata_dominance);
+
+   nir_foreach_block(impl, repair_ssa_block, &state);
+
+   if (state.progress)
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+   /* The builder and def_set only exist if at least one def needed repair. */
+   if (state.phi_builder) {
+      nir_phi_builder_finish(state.phi_builder);
+      ralloc_free(state.def_set);
+   }
+
+   return state.progress;
+}
+
+/** This pass can be used to repair SSA form in a shader.
+ *
+ * Sometimes a transformation (such as return lowering) will have to make
+ * changes to a shader which, while still correct, break some of NIR's SSA
+ * invariants.  This pass will insert ssa_undefs and phi nodes as needed to
+ * get the shader back into valid SSA form.  Returns true on any progress.
+ */
+bool
+nir_repair_ssa(nir_shader *shader)
+{
+   bool progress = false;
+   /* Repair every function that has an implementation, accumulating progress. */
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = nir_repair_ssa_impl(function->impl) || progress;
+   }
+
+   return progress;
+}

From 124f229ece8ffa7d1f8d530771f183f7803d6cdc Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 18 Dec 2015 11:27:00 -0800
Subject: [PATCH 190/197] nir/cf: Handle relinking top-level blocks

This can happen if a function ends in a return instruction and you remove
the return.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/compiler/nir/nir_control_flow.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/compiler/nir/nir_control_flow.c b/src/compiler/nir/nir_control_flow.c
index 96395a41615..ecd9cbd4173 100644
--- a/src/compiler/nir/nir_control_flow.c
+++ b/src/compiler/nir/nir_control_flow.c
@@ -336,8 +336,7 @@ block_add_normal_succs(nir_block *block)
          nir_block *next_block = nir_cf_node_as_block(next);
 
          link_blocks(block, next_block, NULL);
-      } else {
-         assert(parent->type == nir_cf_node_loop);
+      } else if (parent->type == nir_cf_node_loop) {
          nir_loop *loop = nir_cf_node_as_loop(parent);
 
          nir_cf_node *head = nir_loop_first_cf_node(loop);
@@ -346,6 +345,10 @@ block_add_normal_succs(nir_block *block)
 
          link_blocks(block, head_block, NULL);
          insert_phi_undef(head_block, block);
+      } else {
+         assert(parent->type == nir_cf_node_function);
+         nir_function_impl *impl = nir_cf_node_as_function(parent);
+         link_blocks(block, impl->end_block, NULL);
       }
    } else {
       nir_cf_node *next = nir_cf_node_next(&block->cf_node);

From 7022a673cd6b9e4bdd4c55fe1d7c76c04d27d4e6 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 26 Dec 2015 10:32:10 -0800
Subject: [PATCH 191/197] nir: Add a function for comparing cursors

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/nir/nir.c | 56 ++++++++++++++++++++++++++++++++++++++++++
 src/compiler/nir/nir.h |  2 ++
 2 files changed, 58 insertions(+)

diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 20f1a182b77..b67916dc86b 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -724,6 +724,62 @@ nir_cf_node_get_function(nir_cf_node *node)
    return nir_cf_node_as_function(node);
 }
 
+/* Reduces a cursor to a canonical form by converting "before" cursors to
+ * "after" cursors and going up to block granularity when possible.
+ */
+static nir_cursor
+reduce_cursor(nir_cursor cursor)
+{
+   switch (cursor.option) {
+   case nir_cursor_before_block:
+      assert(nir_cf_node_prev(&cursor.block->cf_node) == NULL ||
+             nir_cf_node_prev(&cursor.block->cf_node)->type != nir_cf_node_block);
+      if (exec_list_is_empty(&cursor.block->instr_list)) {
+         /* Empty block.  After is as good as before. */
+         cursor.option = nir_cursor_after_block;
+      }
+      return cursor;
+
+   case nir_cursor_after_block:
+      /* Already in its most reduced form. */
+      return cursor;
+
+   case nir_cursor_before_instr: {
+      nir_instr *prev_instr = nir_instr_prev(cursor.instr);
+      if (prev_instr) {
+         /* Before this instruction is after the previous */
+         cursor.instr = prev_instr;
+         cursor.option = nir_cursor_after_instr;
+      } else {
+         /* No previous instruction.  Switch to before block */
+         cursor.block = cursor.instr->block;
+         cursor.option = nir_cursor_before_block;
+      }
+      /* The rewritten cursor may reduce further (e.g. last instr, empty block) */
+      return reduce_cursor(cursor);
+   }
+
+   case nir_cursor_after_instr:
+      if (nir_instr_next(cursor.instr) == NULL) {
+         /* This is the last instruction, switch to after block */
+         cursor.option = nir_cursor_after_block;
+         cursor.block = cursor.instr->block;
+      }
+      return cursor;
+   /* Any other option value is a programming error. */
+   default:
+      unreachable("Invalid cursor option");
+   }
+}
+/* Returns true if both cursors reduce to the same cursor (see reduce_cursor). */
+bool
+nir_cursors_equal(nir_cursor a, nir_cursor b)
+{
+   /* Reduced cursors should be unique, so compare the reduced forms */
+   a = reduce_cursor(a);
+   b = reduce_cursor(b);
+   /* NOTE(review): only .block is compared; presumably it aliases .instr in a union - confirm. */
+   return a.block == b.block && a.option == b.option;
+}
+
 static bool
 add_use_cb(nir_src *src, void *state)
 {
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index a4596096b59..718d281d7de 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1867,6 +1867,8 @@ typedef struct {
    };
 } nir_cursor;
 
+bool nir_cursors_equal(nir_cursor a, nir_cursor b);
+
 static inline nir_cursor
 nir_before_block(nir_block *block)
 {

From 97b663481c8c83fda06246860708530cff755a05 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 23 Dec 2015 18:10:08 -0800
Subject: [PATCH 192/197] nir/cf: Make extracting or re-inserting nothing a
 no-op

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/compiler/nir/nir_control_flow.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/compiler/nir/nir_control_flow.c b/src/compiler/nir/nir_control_flow.c
index ecd9cbd4173..33b06d0cc84 100644
--- a/src/compiler/nir/nir_control_flow.c
+++ b/src/compiler/nir/nir_control_flow.c
@@ -749,6 +749,12 @@ nir_cf_extract(nir_cf_list *extracted, nir_cursor begin, nir_cursor end)
 {
    nir_block *block_begin, *block_end, *block_before, *block_after;
 
+   if (nir_cursors_equal(begin, end)) {
+      exec_list_make_empty(&extracted->list);
+      extracted->impl = NULL; /* we shouldn't need this */
+      return;
+   }
+
    /* In the case where begin points to an instruction in some basic block and
     * end points to the end of the same basic block, we rely on the fact that
     * splitting on an instruction moves earlier instructions into a new basic
@@ -788,6 +794,9 @@ nir_cf_reinsert(nir_cf_list *cf_list, nir_cursor cursor)
 {
    nir_block *before, *after;
 
+   if (exec_list_is_empty(&cf_list->list))
+      return;
+
    split_block_cursor(cursor, &before, &after);
 
    foreach_list_typed_safe(nir_cf_node, node, node, &cf_list->list) {

From 18b01667493bd61c9c3eafd0848322b23da20efd Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 13 Feb 2016 17:14:27 -0800
Subject: [PATCH 193/197] nir/builder: Add a helper for inserting jump
 instructions

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/nir/nir_builder.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 64d7b43aa58..de5b6cee581 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -393,4 +393,11 @@ nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
    return &load->dest.ssa;
 }
 
+static inline void
+nir_jump(nir_builder *build, nir_jump_type jump_type)
+{
+   nir_jump_instr *jump = nir_jump_instr_create(build->shader, jump_type); /* new jump of the requested type */
+   nir_builder_instr_insert(build, &jump->instr); /* inserted through the builder */
+}
+
 #endif /* NIR_BUILDER_H */

From 8d61d7252433a0470b441c70085391d3dd4c04bb Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sun, 27 Dec 2015 22:50:14 -0800
Subject: [PATCH 194/197] nir: Add a cursor helper for getting a cursor after
 any phi nodes

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/nir/nir.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 718d281d7de..51d2e79f570 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1934,6 +1934,22 @@ nir_after_cf_node(nir_cf_node *node)
    return nir_before_block(nir_cf_node_as_block(nir_cf_node_next(node)));
 }
 
+static inline nir_cursor
+nir_after_cf_node_and_phis(nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_block)
+      return nir_after_block(nir_cf_node_as_block(node));
+   /* For a non-block node, use the block that follows it and skip any leading phis. */
+   nir_block *block = nir_cf_node_as_block(nir_cf_node_next(node));
+   assert(block->cf_node.type == nir_cf_node_block);
+   /* Cursor goes before the first non-phi instruction, or after the block if it has none. */
+   nir_foreach_instr(block, instr) {
+      if (instr->type != nir_instr_type_phi)
+         return nir_before_instr(instr);
+   }
+   return nir_after_block(block);
+}
+
 static inline nir_cursor
 nir_before_cf_list(struct exec_list *cf_list)
 {

From 79dec93ead6e3b95b1240a9d843d617a88ee9179 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 13 Feb 2016 17:08:57 -0800
Subject: [PATCH 195/197] nir: Add return lowering pass

This commit adds a NIR pass for lowering away returns in functions.  If the
return is in a loop, it is lowered to a break.  If it is not in a loop,
it's lowered away by moving/deleting code as needed.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/Makefile.sources        |   1 +
 src/compiler/nir/Makefile.sources    |   1 +
 src/compiler/nir/nir.h               |   3 +
 src/compiler/nir/nir_lower_returns.c | 246 +++++++++++++++++++++++++++
 4 files changed, 251 insertions(+)
 create mode 100644 src/compiler/nir/nir_lower_returns.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 796d0044f46..74636b13e12 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -194,6 +194,7 @@ NIR_FILES = \
 	nir/nir_lower_io.c \
 	nir/nir_lower_outputs_to_temporaries.c \
 	nir/nir_lower_phis_to_scalar.c \
+	nir/nir_lower_returns.c \
 	nir/nir_lower_samplers.c \
 	nir/nir_lower_system_values.c \
 	nir/nir_lower_tex.c \
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index c1493551192..b9506e8a393 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -37,6 +37,7 @@ NIR_FILES = \
 	nir_lower_io.c \
 	nir_lower_outputs_to_temporaries.c \
 	nir_lower_phis_to_scalar.c \
+	nir_lower_returns.c \
 	nir_lower_samplers.c \
 	nir_lower_system_values.c \
 	nir_lower_tex.c \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 51d2e79f570..78bbdc0b78c 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2146,6 +2146,9 @@ int nir_gs_count_vertices(const nir_shader *shader);
 
 bool nir_split_var_copies(nir_shader *shader);
 
+bool nir_lower_returns_impl(nir_function_impl *impl);
+bool nir_lower_returns(nir_shader *shader);
+
 void nir_lower_var_copy_instr(nir_intrinsic_instr *copy, void *mem_ctx);
 void nir_lower_var_copies(nir_shader *shader);
 
diff --git a/src/compiler/nir/nir_lower_returns.c b/src/compiler/nir/nir_lower_returns.c
new file mode 100644
index 00000000000..91bb2f7dfeb
--- /dev/null
+++ b/src/compiler/nir/nir_lower_returns.c
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_control_flow.h"
+
+struct lower_returns_state {
+   nir_builder builder;
+   struct exec_list *cf_list;
+   nir_loop *loop;
+   nir_variable *return_flag;
+};
+
+static bool lower_returns_in_cf_list(struct exec_list *cf_list,
+                                     struct lower_returns_state *state);
+
+static void
+predicate_following(nir_cf_node *node, struct lower_returns_state *state)
+{
+   nir_builder *b = &state->builder;
+   b->cursor = nir_after_cf_node_and_phis(node);
+
+   if (nir_cursors_equal(b->cursor, nir_after_cf_list(state->cf_list)))
+      return; /* Nothing to predicate */
+
+   assert(state->return_flag);
+
+   nir_if *if_stmt = nir_if_create(b->shader);
+   if_stmt->condition = nir_src_for_ssa(nir_load_var(b, state->return_flag));
+   nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
+
+   if (state->loop) {
+      /* If we're inside of a loop, then all we need to do is insert a
+       * conditional break.
+       */
+      nir_jump_instr *brk =
+         nir_jump_instr_create(state->builder.shader, nir_jump_break);
+      nir_instr_insert(nir_before_cf_list(&if_stmt->then_list), &brk->instr);
+   } else {
+      /* Otherwise, we need to actually move everything into the else case
+       * of the if statement.
+       */
+      nir_cf_list list;
+      nir_cf_extract(&list, nir_after_cf_node(&if_stmt->cf_node),
+                            nir_after_cf_list(state->cf_list));
+      assert(!exec_list_is_empty(&list.list));
+      nir_cf_reinsert(&list, nir_before_cf_list(&if_stmt->else_list));
+   }
+}
+
+static bool
+lower_returns_in_loop(nir_loop *loop, struct lower_returns_state *state)
+{
+   nir_loop *parent = state->loop;
+   state->loop = loop;
+   bool progress = lower_returns_in_cf_list(&loop->body, state);
+   state->loop = parent;
+
+   /* If the recursive call made progress, then there were returns inside
+    * of the loop.  These would have been lowered to breaks with the return
+    * flag set to true.  We need to predicate everything following the loop
+    * on the return flag.
+    */
+   if (progress)
+      predicate_following(&loop->cf_node, state);
+
+   return progress;
+}
+
+static bool
+lower_returns_in_if(nir_if *if_stmt, struct lower_returns_state *state)
+{
+   bool progress;
+
+   progress = lower_returns_in_cf_list(&if_stmt->then_list, state);
+   progress = lower_returns_in_cf_list(&if_stmt->else_list, state) || progress;
+
+   /* If either of the recursive calls made progress, then there were
+    * returns inside of the body of the if.  If we're in a loop, then these
+    * were lowered to breaks which automatically skip to the end of the
+    * loop so we don't have to do anything.  If we're not in a loop, then
+    * all we know is that the return flag is set appropriately and that the
+    * recursive calls ensured that nothing gets executed *inside* the if
+    * after a return.  In order to ensure nothing outside gets executed
+    * after a return, we need to predicate everything following on the
+    * return flag.
+    */
+   if (progress && !state->loop)
+      predicate_following(&if_stmt->cf_node, state);
+
+   return progress;
+}
+
+static bool
+lower_returns_in_block(nir_block *block, struct lower_returns_state *state)
+{
+   if (block->predecessors->entries == 0 &&
+       block != nir_start_block(state->builder.impl)) {
+      /* This block is unreachable.  Delete it and everything after it. */
+      nir_cf_list list;
+      nir_cf_extract(&list, nir_before_cf_node(&block->cf_node),
+                            nir_after_cf_list(state->cf_list));
+
+      if (exec_list_is_empty(&list.list)) {
+         /* There's nothing here, which also means there's nothing in this
+          * block so we have nothing to do.
+          */
+         return false;
+      } else {
+         nir_cf_delete(&list);
+         return true;
+      }
+   }
+
+   nir_instr *last_instr = nir_block_last_instr(block);
+   if (last_instr == NULL)
+      return false;
+
+   if (last_instr->type != nir_instr_type_jump)
+      return false;
+
+   nir_jump_instr *jump = nir_instr_as_jump(last_instr);
+   if (jump->type != nir_jump_return)
+      return false;
+
+   nir_instr_remove(&jump->instr);
+
+   nir_builder *b = &state->builder;
+   b->cursor = nir_after_block(block);
+
+   /* Set the return flag */
+   if (state->return_flag == NULL) {
+      state->return_flag =
+         nir_local_variable_create(b->impl, glsl_bool_type(), "return");
+
+      /* Set a default value of false */
+      state->return_flag->constant_initializer =
+         rzalloc(state->return_flag, nir_constant);
+   }
+   nir_store_var(b, state->return_flag, nir_imm_int(b, NIR_TRUE), 1);
+
+   if (state->loop) {
+      /* We're in a loop;  we need to break out of it. */
+      nir_jump(b, nir_jump_break);
+   } else {
+      /* Not in a loop;  we'll deal with predicating later */
+      assert(nir_cf_node_next(&block->cf_node) == NULL);
+   }
+
+   return true;
+}
+
+static bool
+lower_returns_in_cf_list(struct exec_list *cf_list,
+                         struct lower_returns_state *state)
+{
+   bool progress = false;
+
+   struct exec_list *parent_list = state->cf_list;
+   state->cf_list = cf_list;
+
+   /* We iterate over the list backwards because any given lower call may
+    * take everything following the given CF node and predicate it.  In
+    * order to avoid recursion/iteration problems, we want everything after
+    * a given node to already be lowered before this happens.
+    */
+   foreach_list_typed_reverse_safe(nir_cf_node, node, node, cf_list) {
+      switch (node->type) {
+      case nir_cf_node_block:
+         if (lower_returns_in_block(nir_cf_node_as_block(node), state))
+            progress = true;
+         break;
+
+      case nir_cf_node_if:
+         if (lower_returns_in_if(nir_cf_node_as_if(node), state))
+            progress = true;
+         break;
+
+      case nir_cf_node_loop:
+         if (lower_returns_in_loop(nir_cf_node_as_loop(node), state))
+            progress = true;
+         break;
+
+      default:
+         unreachable("Invalid inner CF node type");
+      }
+   }
+
+   state->cf_list = parent_list;
+
+   return progress;
+}
+
+bool
+nir_lower_returns_impl(nir_function_impl *impl)
+{
+   struct lower_returns_state state;
+
+   state.cf_list = &impl->body;
+   state.loop = NULL;
+   state.return_flag = NULL;
+   nir_builder_init(&state.builder, impl);
+
+   bool progress = lower_returns_in_cf_list(&impl->body, &state);
+
+   if (progress) {
+      nir_metadata_preserve(impl, nir_metadata_none);
+      nir_repair_ssa_impl(impl);
+   }
+
+   return progress;
+}
+
+bool
+nir_lower_returns(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = nir_lower_returns_impl(function->impl) || progress;
+   }
+
+   return progress;
+}

From debf23ec6837506ce372f032751dc683e36d8a98 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 26 Dec 2015 10:48:14 -0800
Subject: [PATCH 196/197] nir/builder: Add helpers for easily inserting
 copy_var intrinsics

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/nir/nir_builder.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index de5b6cee581..b245f48c96d 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -381,6 +381,29 @@ nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value,
    nir_builder_instr_insert(build, &store->instr);
 }
 
+static inline void
+nir_copy_deref_var(nir_builder *build, nir_deref_var *dest, nir_deref_var *src)
+{
+   assert(nir_deref_tail(&dest->deref)->type ==
+          nir_deref_tail(&src->deref)->type);
+
+   nir_intrinsic_instr *copy =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_var);
+   copy->variables[0] = nir_deref_as_var(nir_copy_deref(copy, &dest->deref));
+   copy->variables[1] = nir_deref_as_var(nir_copy_deref(copy, &src->deref));
+   nir_builder_instr_insert(build, &copy->instr);
+}
+
+static inline void
+nir_copy_var(nir_builder *build, nir_variable *dest, nir_variable *src)
+{
+   nir_intrinsic_instr *copy =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_copy_var);
+   copy->variables[0] = nir_deref_var_create(copy, dest);
+   copy->variables[1] = nir_deref_var_create(copy, src);
+   nir_builder_instr_insert(build, &copy->instr);
+}
+
 static inline nir_ssa_def *
 nir_load_system_value(nir_builder *build, nir_intrinsic_op op, int index)
 {

From 22b343a8ec75a08dae6a6badbb261eab8437475d Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Sat, 13 Feb 2016 17:31:05 -0800
Subject: [PATCH 197/197] nir: Add a pass to inline functions

This commit adds a new NIR pass that lowers all function calls away by
inlining the functions.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/compiler/Makefile.sources           |   1 +
 src/compiler/nir/Makefile.sources       |   1 +
 src/compiler/nir/nir.h                  |   2 +
 src/compiler/nir/nir_inline_functions.c | 270 ++++++++++++++++++++++++
 4 files changed, 274 insertions(+)
 create mode 100644 src/compiler/nir/nir_inline_functions.c

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 74636b13e12..1f8517282ef 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -177,6 +177,7 @@ NIR_FILES = \
 	nir/nir_dominance.c \
 	nir/nir_from_ssa.c \
 	nir/nir_gs_count_vertices.c \
+	nir/nir_inline_functions.c \
 	nir/nir_intrinsics.c \
 	nir/nir_intrinsics.h \
 	nir/nir_instr_set.c \
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index b9506e8a393..00576f062c5 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -20,6 +20,7 @@ NIR_FILES = \
 	nir_dominance.c \
 	nir_from_ssa.c \
 	nir_gs_count_vertices.c \
+	nir_inline_functions.c \
 	nir_intrinsics.c \
 	nir_intrinsics.h \
 	nir_instr_set.c \
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 78bbdc0b78c..37d2907a82b 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2149,6 +2149,8 @@ bool nir_split_var_copies(nir_shader *shader);
 bool nir_lower_returns_impl(nir_function_impl *impl);
 bool nir_lower_returns(nir_shader *shader);
 
+bool nir_inline_functions(nir_shader *shader);
+
 void nir_lower_var_copy_instr(nir_intrinsic_instr *copy, void *mem_ctx);
 void nir_lower_var_copies(nir_shader *shader);
 
diff --git a/src/compiler/nir/nir_inline_functions.c b/src/compiler/nir/nir_inline_functions.c
new file mode 100644
index 00000000000..4a08dcc96e0
--- /dev/null
+++ b/src/compiler/nir/nir_inline_functions.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+#include "nir_control_flow.h"
+
+struct inline_functions_state {
+   struct set *inlined;
+   nir_builder builder;
+   bool progress;
+};
+
+static bool inline_function_impl(nir_function_impl *impl, struct set *inlined);
+
+static bool
+rewrite_param_derefs_block(nir_block *block, void *void_state)
+{
+   nir_call_instr *call = void_state;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      for (unsigned i = 0;
+           i < nir_intrinsic_infos[intrin->intrinsic].num_variables; i++) {
+         if (intrin->variables[i]->var->data.mode != nir_var_param)
+            continue;
+
+         int param_idx = intrin->variables[i]->var->data.location;
+
+         nir_deref_var *call_deref;
+         if (param_idx >= 0) {
+            assert(param_idx < call->callee->num_params);
+            call_deref = call->params[param_idx];
+         } else {
+            call_deref = call->return_deref;
+         }
+         assert(call_deref);
+
+         nir_deref_var *new_deref = nir_deref_as_var(nir_copy_deref(intrin, &call_deref->deref));
+         nir_deref *new_tail = nir_deref_tail(&new_deref->deref);
+         new_tail->child = intrin->variables[i]->deref.child;
+         ralloc_steal(new_tail, new_tail->child);
+         intrin->variables[i] = new_deref;
+      }
+   }
+
+   return true;
+}
+
+static void
+lower_param_to_local(nir_variable *param, nir_function_impl *impl, bool write)
+{
+   if (param->data.mode != nir_var_param)
+      return;
+
+   nir_parameter_type param_type;
+   if (param->data.location >= 0) {
+      assert(param->data.location < impl->num_params);
+      param_type = impl->function->params[param->data.location].param_type;
+   } else {
+      /* Return variable */
+      param_type = nir_parameter_out;
+   }
+
+   if ((write && param_type == nir_parameter_in) ||
+       (!write && param_type == nir_parameter_out)) {
+      /* In this case, we need a shadow copy.  Turn it into a local */
+      param->data.mode = nir_var_local;
+      exec_list_push_tail(&impl->locals, &param->node);
+   }
+}
+
+static bool
+lower_params_to_locals_block(nir_block *block, void *void_state)
+{
+   nir_function_impl *impl = void_state;
+
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_store_var:
+         lower_param_to_local(intrin->variables[0]->var, impl, true);
+         break;
+
+      case nir_intrinsic_copy_var:
+         lower_param_to_local(intrin->variables[0]->var, impl, true);
+         lower_param_to_local(intrin->variables[1]->var, impl, false);
+         break;
+
+      case nir_intrinsic_load_var:
+         /* All other intrinsics which access variables (image_load_store)
+          * do so in a read-only fashion.
+          */
+         for (unsigned i = 0;
+              i < nir_intrinsic_infos[intrin->intrinsic].num_variables; i++) {
+            lower_param_to_local(intrin->variables[i]->var, impl, false);
+         }
+         break;
+
+      default:
+         continue;
+      }
+   }
+
+   return true;
+}
+
+static bool
+inline_functions_block(nir_block *block, void *void_state)
+{
+   struct inline_functions_state *state = void_state;
+
+   nir_builder *b = &state->builder;
+
+   /* This is tricky.  We're iterating over instructions in a block but, as
+    * we go, the block and its instruction list are being split into
+    * pieces.  However, this *should* be safe since foreach_safe always
+    * stashes the next thing in the iteration.  That next thing will
+    * properly get moved to the next block when it gets split, and we
+    * continue iterating there.
+    */
+   nir_foreach_instr_safe(block, instr) {
+      if (instr->type != nir_instr_type_call)
+         continue;
+
+      state->progress = true;
+
+      nir_call_instr *call = nir_instr_as_call(instr);
+      assert(call->callee->impl);
+
+      inline_function_impl(call->callee->impl, state->inlined);
+
+      nir_function_impl *callee_copy =
+         nir_function_impl_clone(call->callee->impl);
+      callee_copy->function = call->callee;
+
+      /* Add copies of all in parameters */
+      assert(call->num_params == callee_copy->num_params);
+
+      exec_list_append(&b->impl->locals, &callee_copy->locals);
+      exec_list_append(&b->impl->registers, &callee_copy->registers);
+
+      b->cursor = nir_before_instr(&call->instr);
+
+      /* We now need to tie the two functions together using the
+       * parameters.  There are two ways we do this: One is to turn the
+       * parameter into a local variable and do a shadow-copy.  The other
+       * is to treat the parameter as a "proxy" and rewrite derefs to use
+       * the actual variable that comes from the call instruction.  We
+       * implement both schemes.  The first is needed in the case where we
+       * have an in parameter that we write or similar.  The second case is
+       * needed for handling things such as images and uniforms properly.
+       */
+
+      /* Figure out when we need to lower to a shadow local */
+      nir_foreach_block(callee_copy, lower_params_to_locals_block, callee_copy);
+      for (unsigned i = 0; i < callee_copy->num_params; i++) {
+         nir_variable *param = callee_copy->params[i];
+
+         if (param->data.mode == nir_var_local &&
+             call->callee->params[i].param_type != nir_parameter_out) {
+            nir_copy_deref_var(b, nir_deref_var_create(b->shader, param),
+                                  call->params[i]);
+         }
+      }
+
+      nir_foreach_block(callee_copy, rewrite_param_derefs_block, call);
+
+      /* Pluck the body out of the function and place it here */
+      nir_cf_list body;
+      nir_cf_list_extract(&body, &callee_copy->body);
+      nir_cf_reinsert(&body, b->cursor);
+
+      b->cursor = nir_before_instr(&call->instr);
+
+      /* Add copies of all out parameters and the return */
+      assert(call->num_params == callee_copy->num_params);
+      for (unsigned i = 0; i < callee_copy->num_params; i++) {
+         nir_variable *param = callee_copy->params[i];
+
+         if (param->data.mode == nir_var_local &&
+             call->callee->params[i].param_type != nir_parameter_in) {
+            nir_copy_deref_var(b, call->params[i],
+                                  nir_deref_var_create(b->shader, param));
+         }
+      }
+      if (!glsl_type_is_void(call->callee->return_type) &&
+          callee_copy->return_var->data.mode == nir_var_local) {
+         nir_copy_deref_var(b, call->return_deref,
+                               nir_deref_var_create(b->shader,
+                                                    callee_copy->return_var));
+      }
+
+      nir_instr_remove(&call->instr);
+   }
+
+   return true;
+}
+
+static bool
+inline_function_impl(nir_function_impl *impl, struct set *inlined)
+{
+   if (_mesa_set_search(inlined, impl))
+      return false; /* Already inlined */
+
+   struct inline_functions_state state;
+
+   state.inlined = inlined;
+   state.progress = false;
+   nir_builder_init(&state.builder, impl);
+
+   nir_foreach_block(impl, inline_functions_block, &state);
+
+   if (state.progress) {
+      /* SSA and register indices are completely messed up now */
+      nir_index_ssa_defs(impl);
+      nir_index_local_regs(impl);
+
+      nir_metadata_preserve(impl, nir_metadata_none);
+   }
+
+   _mesa_set_add(inlined, impl);
+
+   return state.progress;
+}
+
+bool
+nir_inline_functions(nir_shader *shader)
+{
+   struct set *inlined = _mesa_set_create(NULL, _mesa_hash_pointer,
+                                          _mesa_key_pointer_equal);
+   bool progress = false;
+
+   nir_foreach_function(shader, function) {
+      if (function->impl)
+         progress = inline_function_impl(function->impl, inlined) || progress;
+   }
+
+   _mesa_set_destroy(inlined, NULL);
+
+   return progress;
+}