nouveau/mme: Add support for multiplication on Fermi

Because of Fermi's extremely tight register file and the fact that we have to modify x and y as we multiply, the only form of these we can support as builder helpers is one which implicitly frees its sources. Also, because we can't afford to just allocate extra stuff, we add 32x32_32, 32x32_64, and 32x64_64 forms. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25167>
2024-07-29 11:41:57 -05:00
parent cac4da4cab
commit d43ed4445b
4 changed files with 179 additions and 2 deletions
--- a/src/nouveau/mme/mme_builder.h
+++ b/src/nouveau/mme/mme_builder.h
@@ -131,6 +131,14 @@ mme_free_reg(struct mme_builder *b, struct mme_value val)
   mme_reg_alloc_free(&b->reg_alloc, val);
 }

+/* Allocates two registers and pairs them up as a 64-bit value.  The low
+ * half is allocated first, then the high half.
+ */
+static inline struct mme_value64
+mme_alloc_reg64(struct mme_builder *b)
+{
+   const struct mme_value lo_reg = mme_alloc_reg(b);
+   const struct mme_value hi_reg = mme_alloc_reg(b);
+
+   return mme_value64(lo_reg, hi_reg);
+}
+
 static inline void
 mme_free_reg64(struct mme_builder *b, struct mme_value64 val)
 {
@@ -295,6 +303,26 @@ mme_sub64(struct mme_builder *b,
   return mme_alu64(b, MME_ALU_OP_SUB, MME_ALU_OP_SUBB, x, y);
 }

+/**
+ * Emits a 32x32 -> 32-bit multiply and returns the product in a newly
+ * allocated register.
+ *
+ * Both x and y must be registers.  They are consumed by this helper: on
+ * every supported path they have been freed by the time it returns, so the
+ * caller must not touch them afterwards.
+ */
+static inline struct mme_value
+mme_mul_32x32_32_free_srcs(struct mme_builder *b,
+                           struct mme_value x, struct mme_value y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value dst = mme_mul(b, x, y);
+      mme_free_reg(b, x);
+      mme_free_reg(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      /* The Fermi multiply is a shift-and-add loop that (as far as its
+       * visible part shows) only accumulates into dst.  A freshly allocated
+       * register holds whatever was left in it, so start from an explicit
+       * zero instead of a bare allocation.
+       */
+      struct mme_value dst = mme_mov(b, mme_zero());
+      /* This call frees x and y for us. */
+      mme_fermi_umul_32x32_32_to_free_srcs(b, dst, x, y);
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
 static inline void
 mme_imul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
                     struct mme_value x, struct mme_value y)
@@ -317,6 +345,7 @@ static inline void
 mme_umul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
                     struct mme_value x, struct mme_value y)
 {
+   assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
   mme_alu64_to(b, dst, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
                mme_value64(x, mme_zero()),
                mme_value64(y, mme_zero()));
@@ -326,11 +355,56 @@ static inline struct mme_value64
 mme_umul_32x32_64(struct mme_builder *b,
                  struct mme_value x, struct mme_value y)
 {
+   assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
   return mme_alu64(b, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
                    mme_value64(x, mme_zero()),
                    mme_value64(y, mme_zero()));
 }

+/**
+ * Emits an unsigned 32x32 -> 64-bit multiply and returns the product in a
+ * newly allocated 64-bit register pair.
+ *
+ * Both x and y must be registers.  They are consumed by this helper: on
+ * every supported path they have been freed by the time it returns, so the
+ * caller must not touch them afterwards.
+ */
+static inline struct mme_value64
+mme_umul_32x32_64_free_srcs(struct mme_builder *b,
+                            struct mme_value x, struct mme_value y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value64 dst = mme_umul_32x32_64(b, x, y);
+      mme_free_reg(b, x);
+      mme_free_reg(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      /* Widen y to 64 bits with a zero high word so the 32x64 helper can be
+       * reused; the helper frees y_hi together with y.
+       */
+      struct mme_value y_hi = mme_mov(b, mme_zero());
+      /* The Fermi helper only ever adds into dst, so both halves of the
+       * accumulator have to start at zero.  A bare register allocation
+       * would leave stale contents in it.
+       */
+      struct mme_value dst_lo = mme_mov(b, mme_zero());
+      struct mme_value dst_hi = mme_mov(b, mme_zero());
+      struct mme_value64 dst = mme_value64(dst_lo, dst_hi);
+      mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, mme_value64(y, y_hi));
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
+/**
+ * Emits an unsigned 32x64 multiply, truncated to 64 bits, and returns the
+ * product in a newly allocated 64-bit register pair.
+ *
+ * x and both halves of y must be registers.  They are consumed by this
+ * helper: on every supported path they have been freed by the time it
+ * returns, so the caller must not touch them afterwards.
+ */
+static inline struct mme_value64
+mme_umul_32x64_64_free_srcs(struct mme_builder *b,
+                            struct mme_value x, struct mme_value64 y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.lo.type == MME_VALUE_TYPE_REG);
+   assert(y.hi.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      /* x * y = x * y.lo + ((x * y.hi) << 32), keeping the low 64 bits. */
+      struct mme_value64 dst = mme_umul_32x32_64(b, x, y.lo);
+      struct mme_value tmp = mme_mul(b, x, y.hi);
+      mme_add64_to(b, dst, dst, mme_value64(mme_zero(), tmp));
+      /* tmp is a scratch register owned by this helper; release it so it
+       * does not stay allocated for the rest of the macro.
+       */
+      mme_free_reg(b, tmp);
+      mme_free_reg(b, x);
+      mme_free_reg64(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      /* The Fermi helper only ever adds into dst, so both halves of the
+       * accumulator have to start at zero.  A bare register allocation
+       * would leave stale contents in it.
+       */
+      struct mme_value dst_lo = mme_mov(b, mme_zero());
+      struct mme_value dst_hi = mme_mov(b, mme_zero());
+      struct mme_value64 dst = mme_value64(dst_lo, dst_hi);
+      /* This call frees x and y for us. */
+      mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, y);
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
 static inline struct mme_value64
 mme_mul64(struct mme_builder *b,
          struct mme_value64 x, struct mme_value64 y)
--- a/src/nouveau/mme/mme_fermi_builder.c
+++ b/src/nouveau/mme/mme_fermi_builder.c
@@ -338,6 +338,28 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
      mme_srl_to(b, x, x, mme_imm(1u));
      mme_sll_to(b, y, y, mme_imm(1u));
   }
+   mme_free_reg(b, x);
+   mme_free_reg(b, y);
+}
+
+/**
+ * Emits a shift-and-add loop that multiplies the 32-bit x by the 64-bit y
+ * and accumulates the result into dst.
+ *
+ * Each iteration tests the lowest bit of x; if it is set, the current y is
+ * added into dst.  x is then shifted right by one and y is doubled.  The
+ * loop is emitted with the `ine` condition against zero, i.e. it presumably
+ * keeps running while x is non-zero.
+ *
+ * dst is only ever added to and is never initialized here, so the caller
+ * has to pass a dst that already holds zero (or the value the product
+ * should be added on top of).
+ *
+ * x and y are modified in place while multiplying and are freed before
+ * returning; the caller must not use them afterwards.
+ */
+void
+mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
+                                     struct mme_value64 dst,
+                                     struct mme_value x,
+                                     struct mme_value64 y)
+{
+   mme_while (b, ine, x, mme_zero()) {
+      /* lsb is a per-iteration scratch register holding bit 0 of x. */
+      struct mme_value lsb = mme_and(b, x, mme_imm(1));
+      mme_if (b, ine, lsb, mme_zero()) {
+         mme_add64_to(b, dst, dst, y);
+      }
+      mme_free_reg(b, lsb);
+      /* Move on to the next bit of x. */
+      mme_srl_to(b, x, x, mme_imm(1u));
+      /* y = y << 1 */
+      mme_add64_to(b, y, y, y);
+   }
+   /* The sources were clobbered by the loop; release their registers. */
+   mme_free_reg(b, x);
+   mme_free_reg64(b, y);
 }

 static struct mme_value
--- a/src/nouveau/mme/mme_fermi_builder.h
+++ b/src/nouveau/mme/mme_fermi_builder.h
@@ -115,6 +115,12 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
                                     struct mme_value x,
                                     struct mme_value y);

+/* Multiplies the 32-bit x by the 64-bit y, accumulating into dst.  dst is
+ * only added to, so it must already hold zero (or the value to add the
+ * product to).  x and y are clobbered and freed by this call.
+ */
+void
+mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
+                                     struct mme_value64 dst,
+                                     struct mme_value x,
+                                     struct mme_value64 y);
+
 void
 mme_fermi_merge_to(struct mme_builder *b, struct mme_value dst,
                   struct mme_value x, struct mme_value y,
--- a/src/nouveau/mme/tests/mme_builder_test.cpp
+++ b/src/nouveau/mme/tests/mme_builder_test.cpp
@@ -205,7 +205,7 @@ static const uint32_t mul_cases[] = {
   0xfffe0000,
 };

-TEST_F(mme_builder_test, mul)
+TEST_F(mme_builder_test, mul_32x32_32)
 {
   for (auto sim : sims) {
      mme_builder b;
@@ -214,7 +214,7 @@ TEST_F(mme_builder_test, mul)
      mme_value x = mme_load(&b);
      mme_value y = mme_load(&b);

-      sim->mme_store_data(&b, 0, mme_mul(&b, x, y));
+      sim->mme_store_data(&b, 0, mme_mul_32x32_32_free_srcs(&b, x, y));

      auto macro = mme_builder_finish_vec(&b);

@@ -231,6 +231,81 @@ TEST_F(mme_builder_test, mul)
   }
 }

+/* Builds a macro that multiplies two loaded 32-bit parameters into a 64-bit
+ * result and checks both halves against the host's 64-bit multiply for
+ * every pair of values in mul_cases.
+ */
+TEST_F(mme_builder_test, umul_32x32_64)
+{
+   for (auto sim : sims) {
+      mme_builder b;
+      mme_builder_init(&b, sim->devinfo);
+
+      mme_value src0 = mme_load(&b);
+      mme_value src1 = mme_load(&b);
+
+      struct mme_value64 prod = mme_umul_32x32_64_free_srcs(&b, src0, src1);
+
+      /* Low word goes to data slot 0, high word to data slot 1. */
+      sim->mme_store_data(&b, 0, prod.lo);
+      sim->mme_store_data(&b, 1, prod.hi);
+
+      auto macro = mme_builder_finish_vec(&b);
+
+      for (const uint32_t lhs : mul_cases) {
+         for (const uint32_t rhs : mul_cases) {
+            std::vector<uint32_t> params = { lhs, rhs };
+
+            sim->run_macro(macro, params);
+
+            const uint64_t expected = (uint64_t)lhs * (uint64_t)rhs;
+            ASSERT_EQ(sim->data[0], (uint32_t)expected);
+            ASSERT_EQ(sim->data[1], (uint32_t)(expected >> 32));
+         }
+      }
+   }
+}
+
+/* Builds a macro that multiplies a loaded 32-bit parameter by a loaded
+ * 64-bit parameter (low word first, then high word) and checks the 64-bit
+ * result against the host's multiply for every triple of mul_cases values.
+ */
+TEST_F(mme_builder_test, umul_32x64_64)
+{
+   for (auto sim : sims) {
+      mme_builder b;
+      mme_builder_init(&b, sim->devinfo);
+
+      mme_value src32 = mme_load(&b);
+      struct mme_value64 src64;
+      src64.lo = mme_load(&b);
+      src64.hi = mme_load(&b);
+
+      struct mme_value64 prod = mme_umul_32x64_64_free_srcs(&b, src32, src64);
+
+      /* Low word goes to data slot 0, high word to data slot 1. */
+      sim->mme_store_data(&b, 0, prod.lo);
+      sim->mme_store_data(&b, 1, prod.hi);
+
+      auto macro = mme_builder_finish_vec(&b);
+
+      for (const uint32_t x_val : mul_cases) {
+         for (const uint32_t y_lo : mul_cases) {
+            for (const uint32_t y_hi : mul_cases) {
+               std::vector<uint32_t> params = { x_val, y_lo, y_hi };
+
+               sim->run_macro(macro, params);
+
+               const uint64_t y_val = ((uint64_t)y_hi << 32) | (uint64_t)y_lo;
+
+               /* Unsigned 64-bit multiply wraps, matching the truncated
+                * 64-bit result the macro is expected to produce.
+                */
+               const uint64_t expected = (uint64_t)x_val * y_val;
+
+               ASSERT_EQ(sim->data[0], (uint32_t)expected);
+               ASSERT_EQ(sim->data[1], (uint32_t)(expected >> 32));
+            }
+         }
+      }
+   }
+}
+
 TEST_F(mme_builder_test, sll_srl)
 {
   static const uint32_t x = 0xac406fe1;