From eb1f19d7bf194574b984033754a301d1407f24d5 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 25 Sep 2023 17:40:01 -0700
Subject: [PATCH] intel/compiler: Validation for DPAS instructions

v2: s/regiser/register/g in messages. Noticed by Caio. Add more context
to the sub-byte precision error message. Suggested by Caio.

Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25994>
---
 src/intel/compiler/brw_eu_validate.c    | 147 +++++++++
 src/intel/compiler/test_eu_validate.cpp | 393 ++++++++++++++++++++++++
 2 files changed, 540 insertions(+)

diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
index 2d30c7fa37e..87e9f0cd1e3 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -2473,6 +2473,153 @@ instruction_restrictions(const struct brw_isa_info *isa,
       }
    }
 
+   /* DPAS-specific checks: depth, precision, ExecSize, subregisters, types. */
+   if (brw_inst_opcode(isa, inst) == BRW_OPCODE_DPAS) {
+      ERROR_IF(brw_inst_dpas_3src_sdepth(devinfo, inst) != BRW_SYSTOLIC_DEPTH_8,
+               "Systolic depth must be 8.");
+
+      const unsigned sdepth = 8;
+
+      const enum brw_reg_type dst_type =
+         brw_inst_dpas_3src_dst_type(devinfo, inst);
+      const enum brw_reg_type src0_type =
+         brw_inst_dpas_3src_src0_type(devinfo, inst);
+      const enum brw_reg_type src1_type =
+         brw_inst_dpas_3src_src1_type(devinfo, inst);
+      const enum brw_reg_type src2_type =
+         brw_inst_dpas_3src_src2_type(devinfo, inst);
+
+      const enum gfx12_sub_byte_precision src1_sub_byte =
+         brw_inst_dpas_3src_src1_subbyte(devinfo, inst);
+
+      if (src1_type != BRW_REGISTER_TYPE_B && src1_type != BRW_REGISTER_TYPE_UB) {
+         ERROR_IF(src1_sub_byte != BRW_SUB_BYTE_PRECISION_NONE,
+                  "Sub-byte precision must be None for source type larger than Byte.");
+      } else {
+         ERROR_IF(src1_sub_byte != BRW_SUB_BYTE_PRECISION_NONE &&
+                  src1_sub_byte != BRW_SUB_BYTE_PRECISION_4BIT &&
+                  src1_sub_byte != BRW_SUB_BYTE_PRECISION_2BIT,
+                  "Invalid sub-byte precision.");
+      }
+
+      const enum gfx12_sub_byte_precision src2_sub_byte =
+         brw_inst_dpas_3src_src2_subbyte(devinfo, inst);
+
+      if (src2_type != BRW_REGISTER_TYPE_B && src2_type != BRW_REGISTER_TYPE_UB) {
+         ERROR_IF(src2_sub_byte != BRW_SUB_BYTE_PRECISION_NONE,
+                  "Sub-byte precision must be None for source type larger than Byte.");
+      } else {
+         ERROR_IF(src2_sub_byte != BRW_SUB_BYTE_PRECISION_NONE &&
+                  src2_sub_byte != BRW_SUB_BYTE_PRECISION_4BIT &&
+                  src2_sub_byte != BRW_SUB_BYTE_PRECISION_2BIT,
+                  "Invalid sub-byte precision.");
+      }
+
+      const unsigned src1_bits_per_element =
+         (8 * brw_reg_type_to_size(src1_type)) >> src1_sub_byte;
+
+      const unsigned src2_bits_per_element =
+         (8 * brw_reg_type_to_size(src2_type)) >> src2_sub_byte;
+
+      /* The MAX2(1, ...) is just to prevent possible division by 0 later. */
+      const unsigned ops_per_chan =
+         MAX2(1, 32 / MAX2(src1_bits_per_element, src2_bits_per_element));
+
+      ERROR_IF(brw_inst_exec_size(devinfo, inst) != BRW_EXECUTE_8,
+               "DPAS execution size must be 8.");
+
+      const unsigned exec_size = 8;
+
+      const unsigned dst_subnr  = brw_inst_dpas_3src_dst_subreg_nr(devinfo, inst);
+      const unsigned src0_subnr = brw_inst_dpas_3src_src0_subreg_nr(devinfo, inst);
+      const unsigned src1_subnr = brw_inst_dpas_3src_src1_subreg_nr(devinfo, inst);
+      const unsigned src2_subnr = brw_inst_dpas_3src_src2_subreg_nr(devinfo, inst);
+
+      /* Until HF is supported as dst type, this is effectively subnr == 0. */
+      ERROR_IF(dst_subnr % exec_size != 0,
+               "Destination subregister offset must be a multiple of ExecSize.");
+
+      /* Until HF is supported as src0 type, this is effectively subnr == 0. */
+      ERROR_IF(src0_subnr % exec_size != 0,
+               "Src0 subregister offset must be a multiple of ExecSize.");
+
+      ERROR_IF(src1_subnr != 0,
+               "Src1 subregister offsets must be 0.");
+
+      /* In nearly all cases, this effectively requires that src2.subnr be
+       * 0. It is only when src1 is 8 bits and src2 is 2 or 4 bits that the
+       * ops_per_chan value can allow non-zero src2.subnr.
+       */
+      ERROR_IF(src2_subnr % (sdepth * ops_per_chan) != 0,
+               "Src2 subregister offset must be a multiple of SystolicDepth "
+               "times OPS_PER_CHAN.");
+
+      ERROR_IF(dst_subnr * type_sz(dst_type) >= REG_SIZE,
+               "Destination subregister specifies next register.");
+
+      ERROR_IF(src0_subnr * type_sz(src0_type) >= REG_SIZE,
+               "Src0 subregister specifies next register.");
+
+      /* NOTE(review): the src1/src2 checks below scale by both type_sz() and
+       * bits-per-element (itself derived from the type size); confirm intent. */
+      ERROR_IF((src1_subnr * type_sz(src1_type) * src1_bits_per_element) / 8 >= REG_SIZE,
+               "Src1 subregister specifies next register.");
+
+      ERROR_IF((src2_subnr * type_sz(src2_type) * src2_bits_per_element) / 8 >= REG_SIZE,
+               "Src2 subregister specifies next register.");
+
+      if (brw_inst_3src_atomic_control(devinfo, inst)) {
+         /* FINISHME: When we start emitting DPAS with Atomic set, figure out
+          * a way to validate it. Also add a test in test_eu_validate.cpp.
+          */
+         ERROR_IF(true,
+                  "When instruction option Atomic is used it must be followed by a "
+                  "DPAS instruction.");
+      }
+
+      if (brw_inst_dpas_3src_exec_type(devinfo, inst) ==
+          BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT) {
+         ERROR_IF(dst_type != BRW_REGISTER_TYPE_F,
+                  "DPAS destination type must be F.");
+         ERROR_IF(src0_type != BRW_REGISTER_TYPE_F,
+                  "DPAS src0 type must be F.");
+         ERROR_IF(src1_type != BRW_REGISTER_TYPE_HF,
+                  "DPAS src1 type must be HF.");
+         ERROR_IF(src2_type != BRW_REGISTER_TYPE_HF,
+                  "DPAS src2 type must be HF.");
+      } else {
+         ERROR_IF(dst_type != BRW_REGISTER_TYPE_D &&
+                  dst_type != BRW_REGISTER_TYPE_UD,
+                  "DPAS destination type must be D or UD.");
+         ERROR_IF(src0_type != BRW_REGISTER_TYPE_D &&
+                  src0_type != BRW_REGISTER_TYPE_UD,
+                  "DPAS src0 type must be D or UD.");
+         ERROR_IF(src1_type != BRW_REGISTER_TYPE_B &&
+                  src1_type != BRW_REGISTER_TYPE_UB,
+                  "DPAS src1 base type must be B or UB.");
+         ERROR_IF(src2_type != BRW_REGISTER_TYPE_B &&
+                  src2_type != BRW_REGISTER_TYPE_UB,
+                  "DPAS src2 base type must be B or UB.");
+
+         if (brw_reg_type_is_unsigned_integer(dst_type)) {
+            ERROR_IF(!brw_reg_type_is_unsigned_integer(src0_type) ||
+                     !brw_reg_type_is_unsigned_integer(src1_type) ||
+                     !brw_reg_type_is_unsigned_integer(src2_type),
+                     "If any source datatype is signed, destination datatype "
+                     "must be signed.");
+         }
+      }
+
+      /* FINISHME: Additional restrictions mentioned in the Bspec that are not
+       * yet enforced here:
+       *
+       *    - General Accumulator registers access is not supported. This is
+       *      currently enforced in brw_dpas_three_src (brw_eu_emit.c).
+       *    - Given any combination of datatypes in the sources of a DPAS
+       *      instruction, the boundaries of a register should not be crossed.
+       */
+   }
+
    return error_msg;
 }
 
diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
index ceb55502e6f..cd2fce9040f 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -3136,3 +3136,396 @@ TEST_P(validation_test, add3_immediate_types)
       clear_instructions(p);
    }
 }
+
+TEST_P(validation_test, dpas_sdepth)
+{
+   if (devinfo.verx10 < 125)
+      return;
+
+   static const enum gfx12_systolic_depth sdepths[] = {
+      BRW_SYSTOLIC_DEPTH_16,
+      BRW_SYSTOLIC_DEPTH_2,
+      BRW_SYSTOLIC_DEPTH_4,
+      BRW_SYSTOLIC_DEPTH_8,
+   };
+
+   for (unsigned n = 0; n < ARRAY_SIZE(sdepths); n++) {
+      const struct brw_reg dst =
+         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_F);
+      const struct brw_reg src1 =
+         retype(brw_vec8_grf(16, 0), BRW_REGISTER_TYPE_HF);
+      const struct brw_reg src2 =
+         retype(brw_vec8_grf(32, 0), BRW_REGISTER_TYPE_HF);
+      brw_DPAS(p, sdepths[n], 8, dst, null, src1, src2);
+
+      /* Only the depth-8 encoding is expected to validate. */
+      const bool should_pass = sdepths[n] == BRW_SYSTOLIC_DEPTH_8;
+      EXPECT_EQ(should_pass, validate(p)) <<
+         "Encoded systolic depth value is: " << sdepths[n];
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, dpas_exec_size)
+{
+   if (devinfo.verx10 < 125)
+      return;
+
+   static const enum brw_execution_size exec_sizes[] = {
+      BRW_EXECUTE_1,
+      BRW_EXECUTE_2,
+      BRW_EXECUTE_4,
+      BRW_EXECUTE_8,
+      BRW_EXECUTE_16,
+      BRW_EXECUTE_32,
+   };
+
+   for (unsigned n = 0; n < ARRAY_SIZE(exec_sizes); n++) {
+      /* The same DPAS is emitted each time; only the default size varies. */
+      brw_set_default_exec_size(p, exec_sizes[n]);
+      const struct brw_reg dst =
+         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_F);
+      const struct brw_reg src1 =
+         retype(brw_vec8_grf(16, 0), BRW_REGISTER_TYPE_HF);
+      const struct brw_reg src2 =
+         retype(brw_vec8_grf(32, 0), BRW_REGISTER_TYPE_HF);
+      brw_DPAS(p, BRW_SYSTOLIC_DEPTH_8, 8, dst, null, src1, src2);
+
+      const bool should_pass = exec_sizes[n] == BRW_EXECUTE_8;
+      EXPECT_EQ(should_pass, validate(p)) <<
+         "Exec size = " << (1u << exec_sizes[n]);
+
+      clear_instructions(p);
+   }
+
+   /* Leave the default execution size at 8 once the loop is done. */
+   brw_set_default_exec_size(p, BRW_EXECUTE_8);
+}
+
+TEST_P(validation_test, dpas_sub_byte_precision)
+{
+   if (devinfo.verx10 < 125)
+      return;
+   /* HF sources: only precision None is expected to validate (first five). */
+   static const struct {
+      brw_reg_type dst_type;
+      brw_reg_type src0_type;
+      brw_reg_type src1_type;
+      enum gfx12_sub_byte_precision src1_prec;
+      brw_reg_type src2_type;
+      enum gfx12_sub_byte_precision src2_prec;
+      bool expected_result;
+   } test_vectors[] = {
+      {
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_NONE,
+         true,
+      },
+      {
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_4BIT,
+         false,
+      },
+      {
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_2BIT,
+         false,
+      },
+      {
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_4BIT,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_NONE,
+         false,
+      },
+      {
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_F,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_2BIT,
+         BRW_REGISTER_TYPE_HF, BRW_SUB_BYTE_PRECISION_NONE,
+         false,
+      },
+      /* UB sources: None, 4-bit and 2-bit should pass; raw encoding 3 not. */
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         true,
+      },
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_4BIT,
+         true,
+      },
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_2BIT,
+         true,
+      },
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         BRW_REGISTER_TYPE_UB, (enum gfx12_sub_byte_precision) 3,
+         false,
+      },
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_4BIT,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         true,
+      },
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_2BIT,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         true,
+      },
+      {
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UD,
+         BRW_REGISTER_TYPE_UB, (enum gfx12_sub_byte_precision) 3,
+         BRW_REGISTER_TYPE_UB, BRW_SUB_BYTE_PRECISION_NONE,
+         false,
+      },
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(test_vectors); i++) {
+      brw_inst *inst =
+         brw_DPAS(p,
+                  BRW_SYSTOLIC_DEPTH_8,
+                  8,
+                  retype(brw_vec8_grf(0, 0), test_vectors[i].dst_type),
+                  retype(brw_vec8_grf(16, 0), test_vectors[i].src0_type),
+                  retype(brw_vec8_grf(32, 0), test_vectors[i].src1_type),
+                  retype(brw_vec8_grf(48, 0), test_vectors[i].src2_type));
+      /* The precisions are not brw_DPAS() arguments; write the fields here. */
+      brw_inst_set_dpas_3src_src1_subbyte(&devinfo, inst,
+                                          test_vectors[i].src1_prec);
+      brw_inst_set_dpas_3src_src2_subbyte(&devinfo, inst,
+                                          test_vectors[i].src2_prec);
+
+      EXPECT_EQ(test_vectors[i].expected_result, validate(p)) <<
+         "test vector index = " << i;
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, dpas_types)
+{
+   if (devinfo.verx10 < 125)
+      return;
+   /* TV(dst, src0, src1, src2, expected): BRW_REGISTER_TYPE_ name suffixes. */
+#define TV(a, b, c, d, r)                              \
+   { BRW_REGISTER_TYPE_ ## a, BRW_REGISTER_TYPE_ ## b, \
+     BRW_REGISTER_TYPE_ ## c, BRW_REGISTER_TYPE_ ## d, \
+     r }
+
+   static const struct {
+      brw_reg_type dst_type;
+      brw_reg_type src0_type;
+      brw_reg_type src1_type;
+      brw_reg_type src2_type;
+      bool expected_result;
+   } test_vectors[] = {
+      TV( F,  F, HF, HF, true),
+      TV( F, HF, HF, HF, false),
+      TV(HF,  F, HF, HF, false),
+      TV( F,  F,  F, HF, false),
+      TV( F,  F, HF,  F, false),
+      /* Every vector with a DF destination is expected to fail. */
+      TV(DF, DF, DF, DF, false),
+      TV(DF, DF, DF,  F, false),
+      TV(DF, DF,  F, DF, false),
+      TV(DF,  F, DF, DF, false),
+      TV(DF, DF, DF, HF, false),
+      TV(DF, DF, HF, DF, false),
+      TV(DF, HF, DF, DF, false),
+      /* Unsigned destination: only UD, UD, UB, UB is expected to pass. */
+      TV(UD, UD, UB, UB, true),
+      TV(UD, UD, UB, UD, false),
+      TV(UD, UD, UD, UB, false),
+      TV(UD, UD, UB, UW, false),
+      TV(UD, UD, UW, UB, false),
+
+      TV(UD, UB, UB, UB, false),
+      TV(UD, UW, UB, UB, false),
+
+      TV(UQ, UQ, UB, UB, false),
+      TV(UQ, UQ, UB, UQ, false),
+      TV(UQ, UQ, UQ, UB, false),
+      TV(UQ, UQ, UB, UW, false),
+      TV(UQ, UQ, UW, UB, false),
+      /* Signed destination: src0 may be D or UD, src1/src2 may mix B and UB. */
+      TV( D,  D,  B,  B, true),
+      TV( D,  D,  B, UB, true),
+      TV( D,  D, UB,  B, true),
+      TV( D, UD,  B,  B, true),
+
+      TV( D,  D,  B,  D, false),
+      TV( D,  D,  D,  B, false),
+      TV( D,  D,  B,  W, false),
+      TV( D,  D,  W,  B, false),
+
+      TV( D,  B,  B,  B, false),
+      TV( D,  W,  B,  B, false),
+
+      TV( Q,  Q,  B,  B, false),
+      TV( Q,  Q,  B,  Q, false),
+      TV( Q,  Q,  Q,  B, false),
+      TV( Q,  Q,  B,  W, false),
+      TV( Q,  Q,  W,  B, false),
+      /* Unsigned destination with any signed source is expected to fail. */
+      TV(UD, UD, UB,  B, false),
+      TV(UD, UD,  B, UB, false),
+      TV(UD,  D, UB, UB, false),
+   };
+
+#undef TV
+
+   for (unsigned i = 0; i < ARRAY_SIZE(test_vectors); i++) {
+      brw_DPAS(p,
+               BRW_SYSTOLIC_DEPTH_8,
+               8,
+               retype(brw_vec8_grf(0, 0), test_vectors[i].dst_type),
+               retype(brw_vec8_grf(16, 0), test_vectors[i].src0_type),
+               retype(brw_vec8_grf(32, 0), test_vectors[i].src1_type),
+               retype(brw_vec8_grf(48, 0), test_vectors[i].src2_type));
+
+      EXPECT_EQ(test_vectors[i].expected_result, validate(p)) <<
+         "test vector index = " << i;
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, dpas_src_subreg_nr)
+{
+   if (devinfo.verx10 < 125)
+      return;
+   /* TV: dst type+subnr, src0 type+subnr, shared src1/src2 type, src1 subnr, src2 subnr, result. */
+#define TV(dt, od, t0, o0, t1, o1, o2, r) {  \
+      BRW_REGISTER_TYPE_ ## dt, od,          \
+      BRW_REGISTER_TYPE_ ## t0, o0,          \
+      BRW_REGISTER_TYPE_ ## t1, o1, o2,      \
+      r }
+
+   static const struct {
+      brw_reg_type dst_type;
+      unsigned dst_subnr;
+      brw_reg_type src0_type;
+      unsigned src0_subnr;
+      brw_reg_type src1_src2_type;
+      unsigned src1_subnr;
+      unsigned src2_subnr;
+      bool expected_result;
+   } test_vectors[] = {
+      TV( F,  0,  F,  0, HF,  0,  0, true),
+      TV( D,  0,  D,  0,  B,  0,  0, true),
+      TV( D,  0,  D,  0, UB,  0,  0, true),
+      TV( D,  0, UD,  0,  B,  0,  0, true),
+      /* Destination subnr 1..7 is expected to fail. */
+      TV( F,  1,  F,  0, HF,  0,  0, false),
+      TV( F,  2,  F,  0, HF,  0,  0, false),
+      TV( F,  3,  F,  0, HF,  0,  0, false),
+      TV( F,  4,  F,  0, HF,  0,  0, false),
+      TV( F,  5,  F,  0, HF,  0,  0, false),
+      TV( F,  6,  F,  0, HF,  0,  0, false),
+      TV( F,  7,  F,  0, HF,  0,  0, false),
+      /* Src0 subnr 1..7 is expected to fail. */
+      TV( F,  0,  F,  1, HF,  0,  0, false),
+      TV( F,  0,  F,  2, HF,  0,  0, false),
+      TV( F,  0,  F,  3, HF,  0,  0, false),
+      TV( F,  0,  F,  4, HF,  0,  0, false),
+      TV( F,  0,  F,  5, HF,  0,  0, false),
+      TV( F,  0,  F,  6, HF,  0,  0, false),
+      TV( F,  0,  F,  7, HF,  0,  0, false),
+      /* Src1 subnr 1..15 is expected to fail. */
+      TV( F,  0,  F,  0, HF,  1,  0, false),
+      TV( F,  0,  F,  0, HF,  2,  0, false),
+      TV( F,  0,  F,  0, HF,  3,  0, false),
+      TV( F,  0,  F,  0, HF,  4,  0, false),
+      TV( F,  0,  F,  0, HF,  5,  0, false),
+      TV( F,  0,  F,  0, HF,  6,  0, false),
+      TV( F,  0,  F,  0, HF,  7,  0, false),
+      TV( F,  0,  F,  0, HF,  8,  0, false),
+      TV( F,  0,  F,  0, HF,  9,  0, false),
+      TV( F,  0,  F,  0, HF, 10,  0, false),
+      TV( F,  0,  F,  0, HF, 11,  0, false),
+      TV( F,  0,  F,  0, HF, 12,  0, false),
+      TV( F,  0,  F,  0, HF, 13,  0, false),
+      TV( F,  0,  F,  0, HF, 14,  0, false),
+      TV( F,  0,  F,  0, HF, 15,  0, false),
+      /* Src2 subnr 1..15 is expected to fail. */
+      TV( F,  0,  F,  0, HF,  0,  1, false),
+      TV( F,  0,  F,  0, HF,  0,  2, false),
+      TV( F,  0,  F,  0, HF,  0,  3, false),
+      TV( F,  0,  F,  0, HF,  0,  4, false),
+      TV( F,  0,  F,  0, HF,  0,  5, false),
+      TV( F,  0,  F,  0, HF,  0,  6, false),
+      TV( F,  0,  F,  0, HF,  0,  7, false),
+      TV( F,  0,  F,  0, HF,  0,  8, false),
+      TV( F,  0,  F,  0, HF,  0,  9, false),
+      TV( F,  0,  F,  0, HF,  0, 10, false),
+      TV( F,  0,  F,  0, HF,  0, 11, false),
+      TV( F,  0,  F,  0, HF,  0, 12, false),
+      TV( F,  0,  F,  0, HF,  0, 13, false),
+      TV( F,  0,  F,  0, HF,  0, 14, false),
+      TV( F,  0,  F,  0, HF,  0, 15, false),
+
+      /* These meet the requirements, but they specify a subnr that is part of
+       * the next register. It is currently not possible to specify a subnr of
+       * 32 for the B and UB values because brw_reg::subnr is only 5 bits.
+       */
+      TV( F, 16,  F,  0, HF,  0,  0, false),
+      TV( F,  0,  F, 16, HF,  0,  0, false),
+      TV( F,  0,  F,  0, HF,  0, 16, false),
+
+      TV( D, 16,  D,  0,  B,  0,  0, false),
+      TV( D,  0,  D, 16,  B,  0,  0, false),
+   };
+
+#undef TV
+
+   for (unsigned i = 0; i < ARRAY_SIZE(test_vectors); i++) {
+      struct brw_reg dst =
+         retype(brw_vec8_grf( 0, 0), test_vectors[i].dst_type);
+      struct brw_reg src0 =
+         retype(brw_vec8_grf(16, 0), test_vectors[i].src0_type);
+      struct brw_reg src1 =
+         retype(brw_vec8_grf(32, 0), test_vectors[i].src1_src2_type);
+      struct brw_reg src2 =
+         retype(brw_vec8_grf(48, 0), test_vectors[i].src1_src2_type);
+
+      /* subnr for DPAS is in units of datatype precision instead of bytes as
+       * it is for every other instruction. Set the value by hand instead of
+       * using byte_offset() or similar.
+       */
+      dst.subnr = test_vectors[i].dst_subnr;
+      src0.subnr = test_vectors[i].src0_subnr;
+      src1.subnr = test_vectors[i].src1_subnr;
+      src2.subnr = test_vectors[i].src2_subnr;
+
+      brw_DPAS(p, BRW_SYSTOLIC_DEPTH_8, 8, dst, src0, src1, src2);
+
+      EXPECT_EQ(test_vectors[i].expected_result, validate(p)) <<
+         "test vector index = " << i;
+
+      clear_instructions(p);
+   }
+}