From d43ed4445b59cb8e94f57cb96005d34138d067d8 Mon Sep 17 00:00:00 2001
From: Faith Ekstrand
Date: Mon, 29 Jul 2024 11:41:57 -0500
Subject: [PATCH] nouveau/mme: Add support for multiplication on Fermi

Because of Fermi's extremely tight register file and the fact that we
have to modify x and y as we multiply, the only form of these we can
support as builder helpers is one which implicitly frees its sources.
Also, because we can't afford to just allocate extra stuff, we add
32x32_32, 32x32_64, and 32x64_64 forms.

Part-of:
---
 src/nouveau/mme/mme_builder.h              | 86 ++++++++++++++++++++++
 src/nouveau/mme/mme_fermi_builder.c        | 25 +++++++
 src/nouveau/mme/mme_fermi_builder.h        |  6 ++
 src/nouveau/mme/tests/mme_builder_test.cpp | 79 +++++++++++++++++++-
 4 files changed, 194 insertions(+), 2 deletions(-)

diff --git a/src/nouveau/mme/mme_builder.h b/src/nouveau/mme/mme_builder.h
index 0a5eb321541..2db219e72b4 100644
--- a/src/nouveau/mme/mme_builder.h
+++ b/src/nouveau/mme/mme_builder.h
@@ -131,6 +131,15 @@ mme_free_reg(struct mme_builder *b, struct mme_value val)
    mme_reg_alloc_free(&b->reg_alloc, val);
 }
 
+/* Allocates a lo/hi pair of registers for a 64-bit value */
+static inline struct mme_value64
+mme_alloc_reg64(struct mme_builder *b)
+{
+   struct mme_value lo = mme_alloc_reg(b);
+   struct mme_value hi = mme_alloc_reg(b);
+   return mme_value64(lo, hi);
+}
+
 static inline void
 mme_free_reg64(struct mme_builder *b, struct mme_value64 val)
 {
@@ -295,6 +304,28 @@ mme_sub64(struct mme_builder *b,
    return mme_alu64(b, MME_ALU_OP_SUB, MME_ALU_OP_SUBB, x, y);
 }
 
+/* 32x32 -> 32-bit multiply; frees x and y (both must be registers) */
+static inline struct mme_value
+mme_mul_32x32_32_free_srcs(struct mme_builder *b,
+                           struct mme_value x, struct mme_value y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value dst = mme_mul(b, x, y);
+      mme_free_reg(b, x);
+      mme_free_reg(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      /* The Fermi helper accumulates into dst, so it must start at zero */
+      struct mme_value dst = mme_mov(b, mme_zero());
+      mme_fermi_umul_32x32_32_to_free_srcs(b, dst, x, y);
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
 static inline void
 mme_imul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
                      struct mme_value x, struct mme_value y)
@@ -317,6 +348,7 @@ static inline void
 mme_umul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
                      struct mme_value x, struct mme_value y)
 {
+   assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
    mme_alu64_to(b, dst, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
                 mme_value64(x, mme_zero()),
                 mme_value64(y, mme_zero()));
@@ -326,11 +358,65 @@ static inline struct mme_value64
 mme_umul_32x32_64(struct mme_builder *b,
                   struct mme_value x, struct mme_value y)
 {
+   assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
    return mme_alu64(b, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
                     mme_value64(x, mme_zero()),
                     mme_value64(y, mme_zero()));
 }
 
+/* Unsigned 32x32 -> 64-bit multiply; frees x and y (must be registers) */
+static inline struct mme_value64
+mme_umul_32x32_64_free_srcs(struct mme_builder *b,
+                            struct mme_value x, struct mme_value y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value64 dst = mme_umul_32x32_64(b, x, y);
+      mme_free_reg(b, x);
+      mme_free_reg(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      struct mme_value y_hi = mme_mov(b, mme_zero());
+      /* The Fermi helper accumulates into dst, so it must start at zero */
+      struct mme_value dst_lo = mme_mov(b, mme_zero());
+      struct mme_value dst_hi = mme_mov(b, mme_zero());
+      struct mme_value64 dst = mme_value64(dst_lo, dst_hi);
+      mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, mme_value64(y, y_hi));
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
+/* Unsigned 32x64 -> 64-bit multiply; frees x and y (must be registers) */
+static inline struct mme_value64
+mme_umul_32x64_64_free_srcs(struct mme_builder *b,
+                            struct mme_value x, struct mme_value64 y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.lo.type == MME_VALUE_TYPE_REG);
+   assert(y.hi.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value64 dst = mme_umul_32x32_64(b, x, y.lo);
+      struct mme_value tmp = mme_mul(b, x, y.hi);
+      mme_add64_to(b, dst, dst, mme_value64(mme_zero(), tmp));
+      mme_free_reg(b, tmp);
+      mme_free_reg(b, x);
+      mme_free_reg64(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      /* The Fermi helper accumulates into dst, so it must start at zero */
+      struct mme_value dst_lo = mme_mov(b, mme_zero());
+      struct mme_value dst_hi = mme_mov(b, mme_zero());
+      struct mme_value64 dst = mme_value64(dst_lo, dst_hi);
+      mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, y);
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
 static inline struct mme_value64
 mme_mul64(struct mme_builder *b,
           struct mme_value64 x, struct mme_value64 y)
diff --git a/src/nouveau/mme/mme_fermi_builder.c b/src/nouveau/mme/mme_fermi_builder.c
index 676c0057c6e..2c60785bb8f 100644
--- a/src/nouveau/mme/mme_fermi_builder.c
+++ b/src/nouveau/mme/mme_fermi_builder.c
@@ -338,6 +338,31 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
       mme_srl_to(b, x, x, mme_imm(1u));
       mme_sll_to(b, y, y, mme_imm(1u));
    }
+   mme_free_reg(b, x);
+   mme_free_reg(b, y);
+}
+
+/* Shift-and-add multiply which accumulates x * y into dst, so the caller
+ * must zero dst beforehand.  x and y are modified and freed.
+ */
+void
+mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
+                                     struct mme_value64 dst,
+                                     struct mme_value x,
+                                     struct mme_value64 y)
+{
+   mme_while (b, ine, x, mme_zero()) {
+      struct mme_value lsb = mme_and(b, x, mme_imm(1));
+      mme_if (b, ine, lsb, mme_zero()) {
+         mme_add64_to(b, dst, dst, y);
+      }
+      mme_free_reg(b, lsb);
+      mme_srl_to(b, x, x, mme_imm(1u));
+      /* y = y << 1 */
+      mme_add64_to(b, y, y, y);
+   }
+   mme_free_reg(b, x);
+   mme_free_reg64(b, y);
 }
 
 static struct mme_value
diff --git a/src/nouveau/mme/mme_fermi_builder.h b/src/nouveau/mme/mme_fermi_builder.h
index 0e955e7f090..d0e4ef7e042 100644
--- a/src/nouveau/mme/mme_fermi_builder.h
+++ b/src/nouveau/mme/mme_fermi_builder.h
@@ -115,6 +115,12 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
                                      struct mme_value x,
                                      struct mme_value y);
 
+void
+mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
+                                     struct mme_value64 dst,
+                                     struct mme_value x,
+                                     struct mme_value64 y);
+
 void
 mme_fermi_merge_to(struct mme_builder *b, struct mme_value dst,
                    struct mme_value x, struct mme_value y,
diff --git a/src/nouveau/mme/tests/mme_builder_test.cpp b/src/nouveau/mme/tests/mme_builder_test.cpp
index 0ccc55dd9e8..aa85769d443 100644
--- a/src/nouveau/mme/tests/mme_builder_test.cpp
+++ b/src/nouveau/mme/tests/mme_builder_test.cpp
@@ -205,7 +205,7 @@ static const uint32_t mul_cases[] = {
    0xfffe0000,
 };
 
-TEST_F(mme_builder_test, mul)
+TEST_F(mme_builder_test, mul_32x32_32)
 {
    for (auto sim : sims) {
       mme_builder b;
@@ -214,7 +214,7 @@ TEST_F(mme_builder_test, mul)
       mme_value x = mme_load(&b);
       mme_value y = mme_load(&b);
 
-      sim->mme_store_data(&b, 0, mme_mul(&b, x, y));
+      sim->mme_store_data(&b, 0, mme_mul_32x32_32_free_srcs(&b, x, y));
 
       auto macro = mme_builder_finish_vec(&b);
 
@@ -231,6 +231,81 @@ TEST_F(mme_builder_test, mul)
    }
 }
 
+TEST_F(mme_builder_test, umul_32x32_64)
+{
+   for (auto sim : sims) {
+      mme_builder b;
+      mme_builder_init(&b, sim->devinfo);
+
+      mme_value x = mme_load(&b);
+      mme_value y = mme_load(&b);
+
+      struct mme_value64 d = mme_umul_32x32_64_free_srcs(&b, x, y);
+
+      sim->mme_store_data(&b, 0, d.lo);
+      sim->mme_store_data(&b, 1, d.hi);
+
+      auto macro = mme_builder_finish_vec(&b);
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(mul_cases); i++) {
+         for (uint32_t j = 0; j < ARRAY_SIZE(mul_cases); j++) {
+            std::vector<uint32_t> params;
+            params.push_back(mul_cases[i]);
+            params.push_back(mul_cases[j]);
+
+            sim->run_macro(macro, params);
+
+            uint64_t ref = (uint64_t)mul_cases[i] * (uint64_t)mul_cases[j];
+            ASSERT_EQ(sim->data[0], (uint32_t)ref);
+            ASSERT_EQ(sim->data[1], (uint32_t)(ref >> 32));
+         }
+      }
+   }
+}
+
+TEST_F(mme_builder_test, umul_32x64_64)
+{
+   for (auto sim : sims) {
+      mme_builder b;
+      mme_builder_init(&b, sim->devinfo);
+
+      mme_value x = mme_load(&b);
+      struct mme_value64 y;
+      y.lo = mme_load(&b);
+      y.hi = mme_load(&b);
+
+      struct mme_value64 d = mme_umul_32x64_64_free_srcs(&b, x, y);
+
+      sim->mme_store_data(&b, 0, d.lo);
+      sim->mme_store_data(&b, 1, d.hi);
+
+      auto macro = mme_builder_finish_vec(&b);
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(mul_cases); i++) {
+         for (uint32_t j = 0; j < ARRAY_SIZE(mul_cases); j++) {
+            for (uint32_t k = 0; k < ARRAY_SIZE(mul_cases); k++) {
+               std::vector<uint32_t> params;
+               params.push_back(mul_cases[i]);
+               params.push_back(mul_cases[j]);
+               params.push_back(mul_cases[k]);
+
+               sim->run_macro(macro, params);
+
+               uint32_t x_val = mul_cases[i];
+               uint32_t y_lo = mul_cases[j];
+               uint32_t y_hi = mul_cases[k];
+               uint64_t y_val = ((uint64_t)y_hi << 32) | (uint64_t)y_lo;
+
+               uint64_t ref = (uint64_t)x_val * y_val;
+
+               ASSERT_EQ(sim->data[0], (uint32_t)ref);
+               ASSERT_EQ(sim->data[1], (uint32_t)(ref >> 32));
+            }
+         }
+      }
+   }
+}
+
 TEST_F(mme_builder_test, sll_srl)
 {
    static const uint32_t x = 0xac406fe1;