diff --git a/src/nouveau/mme/mme_builder.h b/src/nouveau/mme/mme_builder.h
index 0a5eb321541..2db219e72b4 100644
--- a/src/nouveau/mme/mme_builder.h
+++ b/src/nouveau/mme/mme_builder.h
@@ -131,6 +131,15 @@ mme_free_reg(struct mme_builder *b, struct mme_value val)
    mme_reg_alloc_free(&b->reg_alloc, val);
 }
 
+/* Allocates a lo/hi pair of 32-bit registers for use as a 64-bit value. */
+static inline struct mme_value64
+mme_alloc_reg64(struct mme_builder *b)
+{
+   struct mme_value lo = mme_alloc_reg(b);
+   struct mme_value hi = mme_alloc_reg(b);
+   return mme_value64(lo, hi);
+}
+
 static inline void
 mme_free_reg64(struct mme_builder *b, struct mme_value64 val)
 {
@@ -295,6 +304,34 @@ mme_sub64(struct mme_builder *b,
    return mme_alu64(b, MME_ALU_OP_SUB, MME_ALU_OP_SUBB, x, y);
 }
 
+/* 32x32 -> 32-bit multiply that consumes (frees) both source registers.
+ *
+ * x and y must be registers.  Turing and later use mme_mul(); older classes
+ * fall back to mme_fermi_umul_32x32_32_to_free_srcs().
+ *
+ * NOTE(review): the pre-Turing path passes a freshly allocated dst to the
+ * Fermi helper; confirm that helper does not rely on dst's initial contents.
+ */
+static inline struct mme_value
+mme_mul_32x32_32_free_srcs(struct mme_builder *b,
+                           struct mme_value x, struct mme_value y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value dst = mme_mul(b, x, y);
+      mme_free_reg(b, x);
+      mme_free_reg(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      struct mme_value dst = mme_alloc_reg(b);
+      mme_fermi_umul_32x32_32_to_free_srcs(b, dst, x, y);
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
 static inline void
 mme_imul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
                      struct mme_value x, struct mme_value y)
@@ -317,6 +354,7 @@ static inline void
 mme_umul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
                      struct mme_value x, struct mme_value y)
 {
+   assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
    mme_alu64_to(b, dst, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
                 mme_value64(x, mme_zero()),
                 mme_value64(y, mme_zero()));
@@ -326,11 +364,69 @@ static inline struct mme_value64
 mme_umul_32x32_64(struct mme_builder *b,
                   struct mme_value x, struct mme_value y)
 {
+   assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
    return mme_alu64(b, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
                     mme_value64(x, mme_zero()),
                     mme_value64(y, mme_zero()));
 }
 
+/* Unsigned 32x32 -> 64-bit multiply that consumes (frees) x and y.
+ *
+ * x and y must be registers.  mme_umul_32x32_64() asserts Turing or later, so
+ * older classes widen y with a zero high half and use the Fermi 32x64 helper.
+ */
+static inline struct mme_value64
+mme_umul_32x32_64_free_srcs(struct mme_builder *b,
+                            struct mme_value x, struct mme_value y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value64 dst = mme_umul_32x32_64(b, x, y);
+      mme_free_reg(b, x);
+      mme_free_reg(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      struct mme_value y_hi = mme_mov(b, mme_zero());
+      struct mme_value64 dst = mme_alloc_reg64(b);
+      mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, mme_value64(y, y_hi));
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
+/* Unsigned 32x64 -> 64-bit multiply that consumes (frees) x and y.
+ *
+ * x, y.lo and y.hi must all be registers.  On Turing and later the result is
+ * the 64-bit product x * y.lo with x * y.hi added into its high half; older
+ * classes use the Fermi 32x64 helper.
+ */
+static inline struct mme_value64
+mme_umul_32x64_64_free_srcs(struct mme_builder *b,
+                            struct mme_value x, struct mme_value64 y)
+{
+   assert(x.type == MME_VALUE_TYPE_REG);
+   assert(y.lo.type == MME_VALUE_TYPE_REG);
+   assert(y.hi.type == MME_VALUE_TYPE_REG);
+   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
+      struct mme_value64 dst = mme_umul_32x32_64(b, x, y.lo);
+      struct mme_value tmp = mme_mul(b, x, y.hi);
+      mme_add64_to(b, dst, dst, mme_value64(mme_zero(), tmp));
+      /* Release the temporary so it is not leaked. */
+      mme_free_reg(b, tmp);
+      mme_free_reg(b, x);
+      mme_free_reg64(b, y);
+      return dst;
+   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
+      struct mme_value64 dst = mme_alloc_reg64(b);
+      mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, y);
+      return dst;
+   } else {
+      unreachable("Unsupported GPU class");
+   }
+}
+
 static inline struct mme_value64
 mme_mul64(struct mme_builder *b,
           struct mme_value64 x, struct mme_value64 y)
diff --git a/src/nouveau/mme/mme_fermi_builder.c b/src/nouveau/mme/mme_fermi_builder.c
index 676c0057c6e..2c60785bb8f 100644
--- a/src/nouveau/mme/mme_fermi_builder.c
+++ b/src/nouveau/mme/mme_fermi_builder.c
@@ -338,6 +338,38 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
       mme_srl_to(b, x, x, mme_imm(1u));
       mme_sll_to(b, y, y, mme_imm(1u));
    }
+   mme_free_reg(b, x);
+   mme_free_reg(b, y);
+}
+
+/* Emits dst = x * y (low 64 bits) as a shift-and-add loop.
+ *
+ * While x is non-zero: if the low bit of x is set, y is added into dst; then
+ * x is shifted right by one and y is doubled.  x and y are clobbered by the
+ * loop and freed once it has been emitted.
+ */
+void
+mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
+                                     struct mme_value64 dst,
+                                     struct mme_value x,
+                                     struct mme_value64 y)
+{
+   /* dst is used as an accumulator below, so it must start out as zero. */
+   mme_mov_to(b, dst.lo, mme_zero());
+   mme_mov_to(b, dst.hi, mme_zero());
+
+   mme_while (b, ine, x, mme_zero()) {
+      struct mme_value lsb = mme_and(b, x, mme_imm(1));
+      mme_if (b, ine, lsb, mme_zero()) {
+         mme_add64_to(b, dst, dst, y);
+      }
+      mme_free_reg(b, lsb);
+      mme_srl_to(b, x, x, mme_imm(1u));
+      /* y = y << 1 */
+      mme_add64_to(b, y, y, y);
+   }
+   mme_free_reg(b, x);
+   mme_free_reg64(b, y);
 }
 
 static struct mme_value
diff --git a/src/nouveau/mme/mme_fermi_builder.h b/src/nouveau/mme/mme_fermi_builder.h
index 0e955e7f090..d0e4ef7e042 100644
--- a/src/nouveau/mme/mme_fermi_builder.h
+++ b/src/nouveau/mme/mme_fermi_builder.h
@@ -115,6 +115,12 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
                                      struct mme_value x,
                                      struct mme_value y);
 
+void
+mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
+                                     struct mme_value64 dst,
+                                     struct mme_value x,
+                                     struct mme_value64 y);
+
 void
 mme_fermi_merge_to(struct mme_builder *b, struct mme_value dst,
                    struct mme_value x, struct mme_value y,
diff --git a/src/nouveau/mme/tests/mme_builder_test.cpp b/src/nouveau/mme/tests/mme_builder_test.cpp
index 0ccc55dd9e8..aa85769d443 100644
--- a/src/nouveau/mme/tests/mme_builder_test.cpp
+++ b/src/nouveau/mme/tests/mme_builder_test.cpp
@@ -205,7 +205,7 @@ static const uint32_t mul_cases[] = {
    0xfffe0000,
 };
 
-TEST_F(mme_builder_test, mul)
+TEST_F(mme_builder_test, mul_32x32_32)
 {
    for (auto sim : sims) {
       mme_builder b;
@@ -214,7 +214,7 @@ TEST_F(mme_builder_test, mul)
       mme_value x = mme_load(&b);
       mme_value y = mme_load(&b);
 
-      sim->mme_store_data(&b, 0, mme_mul(&b, x, y));
+      sim->mme_store_data(&b, 0, mme_mul_32x32_32_free_srcs(&b, x, y));
 
       auto macro = mme_builder_finish_vec(&b);
 
@@ -231,6 +231,83 @@ TEST_F(mme_builder_test, mul)
    }
 }
 
+/* Checks the 32x32 -> 64-bit unsigned multiply against host arithmetic. */
+TEST_F(mme_builder_test, umul_32x32_64)
+{
+   for (auto sim : sims) {
+      mme_builder b;
+      mme_builder_init(&b, sim->devinfo);
+
+      mme_value x = mme_load(&b);
+      mme_value y = mme_load(&b);
+
+      struct mme_value64 d = mme_umul_32x32_64_free_srcs(&b, x, y);
+
+      sim->mme_store_data(&b, 0, d.lo);
+      sim->mme_store_data(&b, 1, d.hi);
+
+      auto macro = mme_builder_finish_vec(&b);
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(mul_cases); i++) {
+         for (uint32_t j = 0; j < ARRAY_SIZE(mul_cases); j++) {
+            std::vector<uint32_t> params;
+            params.push_back(mul_cases[i]);
+            params.push_back(mul_cases[j]);
+
+            sim->run_macro(macro, params);
+
+            uint64_t d = (uint64_t)mul_cases[i] * (uint64_t)mul_cases[j];
+            ASSERT_EQ(sim->data[0], (uint32_t)d);
+            ASSERT_EQ(sim->data[1], (uint32_t)(d >> 32));
+         }
+      }
+   }
+}
+
+/* Checks the 32x64 -> 64-bit unsigned multiply against host arithmetic. */
+TEST_F(mme_builder_test, umul_32x64_64)
+{
+   for (auto sim : sims) {
+      mme_builder b;
+      mme_builder_init(&b, sim->devinfo);
+
+      mme_value x = mme_load(&b);
+      struct mme_value64 y;
+      y.lo = mme_load(&b);
+      y.hi = mme_load(&b);
+
+      struct mme_value64 d = mme_umul_32x64_64_free_srcs(&b, x, y);
+
+      sim->mme_store_data(&b, 0, d.lo);
+      sim->mme_store_data(&b, 1, d.hi);
+
+      auto macro = mme_builder_finish_vec(&b);
+
+      for (uint32_t i = 0; i < ARRAY_SIZE(mul_cases); i++) {
+         for (uint32_t j = 0; j < ARRAY_SIZE(mul_cases); j++) {
+            for (uint32_t k = 0; k < ARRAY_SIZE(mul_cases); k++) {
+               std::vector<uint32_t> params;
+               params.push_back(mul_cases[i]);
+               params.push_back(mul_cases[j]);
+               params.push_back(mul_cases[k]);
+
+               sim->run_macro(macro, params);
+
+               uint32_t x = mul_cases[i];
+               uint32_t y_lo = mul_cases[j];
+               uint32_t y_hi = mul_cases[k];
+               uint64_t y = ((uint64_t)y_hi << 32) | (uint64_t)y_lo;
+
+               uint64_t d = x * y;
+
+               ASSERT_EQ(sim->data[0], (uint32_t)d);
+               ASSERT_EQ(sim->data[1], (uint32_t)(d >> 32));
+            }
+         }
+      }
+   }
+}
+
 TEST_F(mme_builder_test, sll_srl)
 {
    static const uint32_t x = 0xac406fe1;