nouveau/mme: Add support for multiplication on Fermi

Because of Fermi's extremely tight register file and the fact that we
have to modify x and y as we multiply, the only form of these we can
support as builder helpers is one which implicitly frees its sources.
Also, because we can't afford to just allocate extra stuff, we add
32x32_32, 32x32_64, and 32x64_64 forms.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25167>
This commit is contained in:
Faith Ekstrand
2024-07-29 11:41:57 -05:00
committed by Marge Bot
parent cac4da4cab
commit d43ed4445b
4 changed files with 179 additions and 2 deletions

View File

@@ -131,6 +131,14 @@ mme_free_reg(struct mme_builder *b, struct mme_value val)
mme_reg_alloc_free(&b->reg_alloc, val);
}
static inline struct mme_value64
mme_alloc_reg64(struct mme_builder *b)
{
struct mme_value lo = mme_alloc_reg(b);
struct mme_value hi = mme_alloc_reg(b);
return mme_value64(lo, hi);
}
static inline void
mme_free_reg64(struct mme_builder *b, struct mme_value64 val)
{
@@ -295,6 +303,26 @@ mme_sub64(struct mme_builder *b,
return mme_alu64(b, MME_ALU_OP_SUB, MME_ALU_OP_SUBB, x, y);
}
/* Multiplies two 32-bit register values and returns the 32-bit result in a
 * newly allocated register.
 *
 * Both x and y must be registers.  They are released on every supported
 * path (directly on Turing+, by the Fermi helper otherwise), so the caller
 * must not use them after this call.
 */
static inline struct mme_value
mme_mul_32x32_32_free_srcs(struct mme_builder *b,
                           struct mme_value x, struct mme_value y)
{
   assert(x.type == MME_VALUE_TYPE_REG);
   assert(y.type == MME_VALUE_TYPE_REG);

   if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
      struct mme_value dst = mme_mul(b, x, y);
      mme_free_reg(b, x);
      mme_free_reg(b, y);
      return dst;
   } else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
      /* Start the result at zero instead of handing over a freshly
       * allocated register with whatever value it last held.
       *
       * NOTE(review): assumes the 32-bit Fermi helper accumulates into dst
       * the same way the 32x64 helper does (dst += y per set bit of x).  If
       * the helper already clears dst, this mov is redundant but harmless.
       */
      struct mme_value dst = mme_mov(b, mme_zero());
      mme_fermi_umul_32x32_32_to_free_srcs(b, dst, x, y);
      return dst;
   } else {
      unreachable("Unsupported GPU class");
   }
}
static inline void
mme_imul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
struct mme_value x, struct mme_value y)
@@ -317,6 +345,7 @@ static inline void
mme_umul_32x32_64_to(struct mme_builder *b, struct mme_value64 dst,
struct mme_value x, struct mme_value y)
{
assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
mme_alu64_to(b, dst, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
mme_value64(x, mme_zero()),
mme_value64(y, mme_zero()));
@@ -326,11 +355,56 @@ static inline struct mme_value64
mme_umul_32x32_64(struct mme_builder *b,
struct mme_value x, struct mme_value y)
{
/* This form is only accepted when the 3D engine class is Turing or newer. */
assert(b->devinfo->cls_eng3d >= MME_CLS_TURING);
/* Pair each 32-bit operand with a zero high half and issue the MULU/MULH
 * ALU op pair through the 64-bit ALU helper, which returns the 64-bit
 * result.
 */
return mme_alu64(b, MME_ALU_OP_MULU, MME_ALU_OP_MULH,
mme_value64(x, mme_zero()),
mme_value64(y, mme_zero()));
}
static inline struct mme_value64
mme_umul_32x32_64_free_srcs(struct mme_builder *b,
struct mme_value x, struct mme_value y)
{
assert(x.type == MME_VALUE_TYPE_REG);
assert(y.type == MME_VALUE_TYPE_REG);
if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
struct mme_value64 dst = mme_umul_32x32_64(b, x, y);
mme_free_reg(b, x);
mme_free_reg(b, y);
return dst;
} else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
struct mme_value y_hi = mme_mov(b, mme_zero());
struct mme_value64 dst = mme_alloc_reg64(b);
mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, mme_value64(y, y_hi));
return dst;
} else {
unreachable("Unsupported GPU class");
}
}
static inline struct mme_value64
mme_umul_32x64_64_free_srcs(struct mme_builder *b,
struct mme_value x, struct mme_value64 y)
{
assert(x.type == MME_VALUE_TYPE_REG);
assert(y.lo.type == MME_VALUE_TYPE_REG);
assert(y.hi.type == MME_VALUE_TYPE_REG);
if (b->devinfo->cls_eng3d >= MME_CLS_TURING) {
struct mme_value64 dst = mme_umul_32x32_64(b, x, y.lo);
struct mme_value tmp = mme_mul(b, x, y.hi);
mme_add64_to(b, dst, dst, mme_value64(mme_zero(), tmp));
mme_free_reg(b, x);
mme_free_reg64(b, y);
return dst;
} else if (b->devinfo->cls_eng3d >= MME_CLS_FERMI) {
struct mme_value64 dst = mme_alloc_reg64(b);
mme_fermi_umul_32x64_64_to_free_srcs(b, dst, x, y);
return dst;
} else {
unreachable("Unsupported GPU class");
}
}
static inline struct mme_value64
mme_mul64(struct mme_builder *b,
struct mme_value64 x, struct mme_value64 y)

View File

@@ -338,6 +338,28 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
mme_srl_to(b, x, x, mme_imm(1u));
mme_sll_to(b, y, y, mme_imm(1u));
}
mme_free_reg(b, x);
mme_free_reg(b, y);
}
/*
 * Shift-and-add multiply of a 32-bit x by a 64-bit y, accumulated into dst.
 *
 * dst is only ever added to; it is never cleared here.  For dst to end up
 * holding exactly x * y the caller has to pass it in as zero.
 *
 * x and y are modified in place while the loop runs and are both freed
 * before returning, so the caller must not use them afterwards.
 */
void
mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
struct mme_value64 dst,
struct mme_value x,
struct mme_value64 y)
{
/* Keep going while x still has bits left (x != 0). */
mme_while (b, ine, x, mme_zero()) {
/* Test the lowest bit of x; if set, the current y is part of the sum. */
struct mme_value lsb = mme_and(b, x, mme_imm(1));
mme_if (b, ine, lsb, mme_zero()) {
mme_add64_to(b, dst, dst, y);
}
/* Release the temporary right away rather than holding it across the loop. */
mme_free_reg(b, lsb);
/* Move on to the next bit of x. */
mme_srl_to(b, x, x, mme_imm(1u));
/* y = y << 1 */
mme_add64_to(b, y, y, y);
}
/* The sources are consumed: hand their registers back to the allocator. */
mme_free_reg(b, x);
mme_free_reg64(b, y);
}
static struct mme_value

View File

@@ -115,6 +115,12 @@ mme_fermi_umul_32x32_32_to_free_srcs(struct mme_builder *b,
struct mme_value x,
struct mme_value y);
/*
 * Multiplies a 32-bit x by a 64-bit y and adds the product into dst.
 *
 * dst is accumulated into rather than overwritten, so it has to be zero on
 * entry for a plain product.  x and y are clobbered and freed by the call.
 */
void
mme_fermi_umul_32x64_64_to_free_srcs(struct mme_builder *b,
struct mme_value64 dst,
struct mme_value x,
struct mme_value64 y);
void
mme_fermi_merge_to(struct mme_builder *b, struct mme_value dst,
struct mme_value x, struct mme_value y,

View File

@@ -205,7 +205,7 @@ static const uint32_t mul_cases[] = {
0xfffe0000,
};
TEST_F(mme_builder_test, mul)
TEST_F(mme_builder_test, mul_32x32_32)
{
for (auto sim : sims) {
mme_builder b;
@@ -214,7 +214,7 @@ TEST_F(mme_builder_test, mul)
mme_value x = mme_load(&b);
mme_value y = mme_load(&b);
sim->mme_store_data(&b, 0, mme_mul(&b, x, y));
sim->mme_store_data(&b, 0, mme_mul_32x32_32_free_srcs(&b, x, y));
auto macro = mme_builder_finish_vec(&b);
@@ -231,6 +231,81 @@ TEST_F(mme_builder_test, mul)
}
}
/* Builds a macro that multiplies its two 32-bit parameters into a 64-bit
 * result, then checks both result dwords against host arithmetic for every
 * pair of values in mul_cases.
 */
TEST_F(mme_builder_test, umul_32x32_64)
{
   for (auto sim : sims) {
      mme_builder b;
      mme_builder_init(&b, sim->devinfo);

      mme_value x = mme_load(&b);
      mme_value y = mme_load(&b);

      struct mme_value64 product = mme_umul_32x32_64_free_srcs(&b, x, y);
      sim->mme_store_data(&b, 0, product.lo);
      sim->mme_store_data(&b, 1, product.hi);

      auto macro = mme_builder_finish_vec(&b);

      for (uint32_t lhs : mul_cases) {
         for (uint32_t rhs : mul_cases) {
            std::vector<uint32_t> params = { lhs, rhs };

            sim->run_macro(macro, params);

            /* Reference result computed on the host in 64 bits. */
            const uint64_t expected = (uint64_t)lhs * (uint64_t)rhs;
            ASSERT_EQ(sim->data[0], (uint32_t)expected);
            ASSERT_EQ(sim->data[1], (uint32_t)(expected >> 32));
         }
      }
   }
}
/* Builds a macro that multiplies a 32-bit parameter by a 64-bit parameter
 * (passed as low dword, then high dword), then checks both result dwords
 * against host arithmetic for every triple of values in mul_cases.
 */
TEST_F(mme_builder_test, umul_32x64_64)
{
   for (auto sim : sims) {
      mme_builder b;
      mme_builder_init(&b, sim->devinfo);

      mme_value x = mme_load(&b);
      struct mme_value64 y;
      y.lo = mme_load(&b);
      y.hi = mme_load(&b);

      struct mme_value64 product = mme_umul_32x64_64_free_srcs(&b, x, y);
      sim->mme_store_data(&b, 0, product.lo);
      sim->mme_store_data(&b, 1, product.hi);

      auto macro = mme_builder_finish_vec(&b);

      for (uint32_t x_case : mul_cases) {
         for (uint32_t y_lo_case : mul_cases) {
            for (uint32_t y_hi_case : mul_cases) {
               std::vector<uint32_t> params = { x_case, y_lo_case, y_hi_case };

               sim->run_macro(macro, params);

               /* Reference result: the product wraps to 64 bits, matching
                * what the macro is expected to produce.
                */
               const uint64_t y_case =
                  ((uint64_t)y_hi_case << 32) | (uint64_t)y_lo_case;
               const uint64_t expected = x_case * y_case;
               ASSERT_EQ(sim->data[0], (uint32_t)expected);
               ASSERT_EQ(sim->data[1], (uint32_t)(expected >> 32));
            }
         }
      }
   }
}
TEST_F(mme_builder_test, sll_srl)
{
static const uint32_t x = 0xac406fe1;