radeonsi: unroll loops in si_emit_spi_map using 33 C++ template instantiations

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12343>
This commit is contained in:
Marek Olšák
2021-08-11 03:07:03 -04:00
committed by Marge Bot
parent 3264372539
commit dba914de85
5 changed files with 141 additions and 95 deletions
+1
View File
@@ -571,6 +571,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
si_init_state_functions(sctx);
si_init_streamout_functions(sctx);
si_init_viewport_functions(sctx);
si_init_spi_map_functions(sctx);
sctx->blitter = util_blitter_create(&sctx->b);
if (sctx->blitter == NULL)
+1
View File
@@ -1224,6 +1224,7 @@ struct si_context {
pipe_draw_vbo_func draw_vbo[2][2][2];
/* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */
pipe_draw_vbo_func real_draw_vbo;
void (*emit_spi_map[33])(struct si_context *sctx);
/* SQTT */
struct ac_thread_trace_data *thread_trace;
+1
View File
@@ -604,6 +604,7 @@ void si_init_draw_functions_GFX8(struct si_context *sctx);
void si_init_draw_functions_GFX9(struct si_context *sctx);
void si_init_draw_functions_GFX10(struct si_context *sctx);
void si_init_draw_functions_GFX10_3(struct si_context *sctx);
void si_init_spi_map_functions(struct si_context *sctx);
/* si_state_msaa.c */
void si_init_msaa_functions(struct si_context *sctx);
+138 -1
View File
@@ -22,6 +22,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "ac_exp_param.h"
#include "ac_sqtt.h"
#include "si_build_pm4.h"
#include "util/u_index_modify.h"
@@ -47,6 +48,95 @@
/* special primitive types */
#define SI_PRIM_RECTANGLE_LIST PIPE_PRIM_MAX
template<int NUM_INTERP>
static void si_emit_spi_map(struct si_context *sctx)
{
struct si_shader *ps = sctx->shader.ps.current;
struct si_shader *vs;
struct si_shader_info *psinfo = ps ? &ps->selector->info : NULL;
unsigned spi_ps_input_cntl[NUM_INTERP];
STATIC_ASSERT(NUM_INTERP >= 0 && NUM_INTERP <= 32);
if (!NUM_INTERP)
return;
/* With legacy GS, only the GS copy shader contains information about param exports. */
if (sctx->shader.gs.cso && !sctx->ngg)
vs = sctx->shader.gs.cso->gs_copy_shader;
else
vs = si_get_vs(sctx)->current;
struct si_shader_info *vsinfo = &vs->selector->info;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
for (unsigned i = 0; i < NUM_INTERP; i++) {
union si_input_info input = psinfo->input[i];
unsigned ps_input_cntl = 0;
int vs_slot = vsinfo->output_semantic_to_slot[input.semantic];
if (vs_slot >= 0) {
unsigned offset = vs->info.vs_output_param_offset[vs_slot];
if (offset <= AC_EXP_PARAM_OFFSET_31) {
/* The input is loaded from parameter memory. */
ps_input_cntl |= S_028644_OFFSET(offset);
if (input.interpolate == INTERP_MODE_FLAT ||
(input.interpolate == INTERP_MODE_COLOR && rs->flatshade)) {
ps_input_cntl |= S_028644_FLAT_SHADE(1);
}
} else {
/* The input is a DEFAULT_VAL constant. */
assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;
/* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. */
ps_input_cntl = S_028644_OFFSET(0x20) |
S_028644_DEFAULT_VAL(offset);
}
if (input.fp16_lo_hi_valid) {
assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000);
ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) |
S_028644_DEFAULT_VAL_ATTR1(0) |
S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */
S_028644_ATTR1_VALID(!!(input.fp16_lo_hi_valid & 0x2));
}
} else {
/* No corresponding output found, load defaults into input. */
ps_input_cntl = S_028644_OFFSET(0x20) |
/* D3D 9 behaviour for COLOR0. GL is undefined */
S_028644_DEFAULT_VAL(input.semantic == VARYING_SLOT_COL1 ? 3 : 0);
}
if (input.semantic == VARYING_SLOT_PNTC ||
(input.semantic >= VARYING_SLOT_TEX0 && input.semantic <= VARYING_SLOT_TEX7 &&
rs->sprite_coord_enable & (1 << (input.semantic - VARYING_SLOT_TEX0)))) {
/* Overwrite the whole value for sprite coordinates. */
ps_input_cntl = S_028644_OFFSET(0) |
S_028644_PT_SPRITE_TEX(1);
if (input.fp16_lo_hi_valid & 0x1) {
ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
S_028644_ATTR0_VALID(1);
}
}
spi_ps_input_cntl[i] = ps_input_cntl;
}
/* R_028644_SPI_PS_INPUT_CNTL_0 */
/* Dota 2: Only ~16% of SPI map updates set different values. */
/* Talos: Only ~9% of SPI map updates set different values. */
radeon_begin(&sctx->gfx_cs);
radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
sctx->tracked_regs.spi_ps_input_cntl, NUM_INTERP);
radeon_end_update_context_roll(sctx);
}
template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
static bool si_update_shaders(struct si_context *sctx)
{
@@ -188,8 +278,10 @@ static bool si_update_shaders(struct si_context *sctx)
if (si_pm4_state_changed(sctx, ps) ||
(!NGG && si_pm4_state_changed(sctx, vs)) ||
(NGG && si_pm4_state_changed(sctx, gs)))
(NGG && si_pm4_state_changed(sctx, gs))) {
sctx->atoms.s.spi_map.emit = sctx->emit_spi_map[sctx->shader.ps.current->ctx_reg.ps.num_interp];
si_mark_atom_dirty(sctx, &sctx->atoms.s.spi_map);
}
if ((GFX_VERSION >= GFX10_3 || (GFX_VERSION >= GFX9 && sctx->screen->info.rbplus_allowed)) &&
si_pm4_state_changed(sctx, ps) &&
@@ -2413,3 +2505,48 @@ void GFX(si_init_draw_functions_)(struct si_context *sctx)
si_init_ia_multi_vgt_param_table(sctx);
}
#if GFX_VER == 6 /* declare this function only once because it supports all chips. */
/**
 * Populate sctx->emit_spi_map so that entry N points at si_emit_spi_map<N>,
 * for N = 0..32.
 *
 * Having one instantiation per interpolant count unrolls the loops in
 * si_emit_spi_map and inlines memcmp and memcpys.
 * It improves performance for viewperf/snx.
 */
extern "C"
void si_init_spi_map_functions(struct si_context *sctx)
{
   typedef void (*spi_map_emit_func)(struct si_context *sctx);

   /* Indexed by the number of interpolants. */
   static const spi_map_emit_func variants[] = {
      si_emit_spi_map<0>,  si_emit_spi_map<1>,  si_emit_spi_map<2>,  si_emit_spi_map<3>,
      si_emit_spi_map<4>,  si_emit_spi_map<5>,  si_emit_spi_map<6>,  si_emit_spi_map<7>,
      si_emit_spi_map<8>,  si_emit_spi_map<9>,  si_emit_spi_map<10>, si_emit_spi_map<11>,
      si_emit_spi_map<12>, si_emit_spi_map<13>, si_emit_spi_map<14>, si_emit_spi_map<15>,
      si_emit_spi_map<16>, si_emit_spi_map<17>, si_emit_spi_map<18>, si_emit_spi_map<19>,
      si_emit_spi_map<20>, si_emit_spi_map<21>, si_emit_spi_map<22>, si_emit_spi_map<23>,
      si_emit_spi_map<24>, si_emit_spi_map<25>, si_emit_spi_map<26>, si_emit_spi_map<27>,
      si_emit_spi_map<28>, si_emit_spi_map<29>, si_emit_spi_map<30>, si_emit_spi_map<31>,
      si_emit_spi_map<32>,
   };
   const unsigned num_variants = sizeof(variants) / sizeof(variants[0]);

   for (unsigned n = 0; n < num_variants; n++)
      sctx->emit_spi_map[n] = variants[n];
}
#endif
@@ -3525,99 +3525,6 @@ static void si_delete_shader_selector(struct pipe_context *ctx, void *state)
si_shader_selector_reference(sctx, &sel, NULL);
}
/**
 * Build one SPI_PS_INPUT_CNTL value for each of the current pixel shader's
 * interpolated inputs and pass them to radeon_opt_set_context_regn starting
 * at R_028644_SPI_PS_INPUT_CNTL_0.
 *
 * Returns without emitting anything when no pixel shader is bound or when
 * it declares no inputs.
 */
static void si_emit_spi_map(struct si_context *sctx)
{
   struct si_shader *ps = sctx->shader.ps.current;

   if (!ps)
      return;

   struct si_shader_info *psinfo = &ps->selector->info;

   if (!psinfo->num_inputs)
      return;

   /* With legacy GS, only the GS copy shader contains information about param exports. */
   struct si_shader *vs = (sctx->shader.gs.cso && !sctx->ngg)
                             ? sctx->shader.gs.cso->gs_copy_shader
                             : si_get_vs(sctx)->current;

   unsigned num_interp = ps->ctx_reg.ps.num_interp;
   struct si_shader_info *vsinfo = &vs->selector->info;
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   unsigned spi_ps_input_cntl[32];
   unsigned num_written = 0;

   for (unsigned idx = 0; idx < num_interp; idx++) {
      unsigned semantic = psinfo->input[idx].semantic;
      unsigned interpolate = psinfo->input[idx].interpolate;
      ubyte fp16_lo_hi_mask = psinfo->input[idx].fp16_lo_hi_valid;
      unsigned cntl = 0;

      int vs_slot = vsinfo->output_semantic_to_slot[semantic];

      if (vs_slot < 0) {
         /* No corresponding output found, load defaults into input. */
         /* D3D 9 behaviour for COLOR0. GL is undefined */
         unsigned default_val = semantic == VARYING_SLOT_COL1 ? 3 : 0;

         cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(default_val);
      } else {
         unsigned offset = vs->info.vs_output_param_offset[vs_slot];

         if (offset > AC_EXP_PARAM_OFFSET_31) {
            /* The input is a DEFAULT_VAL constant. */
            assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
                   offset <= AC_EXP_PARAM_DEFAULT_VAL_1111);
            offset -= AC_EXP_PARAM_DEFAULT_VAL_0000;

            /* Overwrite the whole value. OFFSET=0x20 means that DEFAULT_VAL is used. */
            cntl = S_028644_OFFSET(0x20) | S_028644_DEFAULT_VAL(offset);
         } else {
            /* The input is loaded from parameter memory. */
            cntl |= S_028644_OFFSET(offset);

            if (interpolate == INTERP_MODE_FLAT ||
                (interpolate == INTERP_MODE_COLOR && rs->flatshade))
               cntl |= S_028644_FLAT_SHADE(1);
         }

         /* Runs after the branches above, so it sees offset as they left it. */
         if (fp16_lo_hi_mask) {
            assert(offset <= AC_EXP_PARAM_OFFSET_31 || offset == AC_EXP_PARAM_DEFAULT_VAL_0000);

            cntl |= S_028644_FP16_INTERP_MODE(1) |
                    S_028644_USE_DEFAULT_ATTR1(offset == AC_EXP_PARAM_DEFAULT_VAL_0000) |
                    S_028644_DEFAULT_VAL_ATTR1(0) |
                    S_028644_ATTR0_VALID(1) | /* this must be set if FP16_INTERP_MODE is set */
                    S_028644_ATTR1_VALID(!!(fp16_lo_hi_mask & 0x2));
         }
      }

      if (semantic == VARYING_SLOT_PNTC ||
          (semantic >= VARYING_SLOT_TEX0 && semantic <= VARYING_SLOT_TEX7 &&
           rs->sprite_coord_enable & (1 << (semantic - VARYING_SLOT_TEX0)))) {
         /* Overwrite the whole value for sprite coordinates. */
         cntl = S_028644_OFFSET(0) | S_028644_PT_SPRITE_TEX(1);

         if (fp16_lo_hi_mask & 0x1)
            cntl |= S_028644_FP16_INTERP_MODE(1) | S_028644_ATTR0_VALID(1);
      }

      spi_ps_input_cntl[num_written] = cntl;
      num_written++;
   }

   assert(num_interp > 0);
   assert(num_interp == num_written);

   /* R_028644_SPI_PS_INPUT_CNTL_0 */
   /* Dota 2: Only ~16% of SPI map updates set different values. */
   /* Talos: Only ~9% of SPI map updates set different values. */
   radeon_begin(&sctx->gfx_cs);
   radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0, spi_ps_input_cntl,
                               sctx->tracked_regs.spi_ps_input_cntl, num_interp);
   radeon_end_update_context_roll(sctx);
}
/**
* Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
*/
@@ -4146,7 +4053,6 @@ void si_init_screen_live_shader_cache(struct si_screen *sscreen)
void si_init_shader_functions(struct si_context *sctx)
{
sctx->atoms.s.spi_map.emit = si_emit_spi_map;
sctx->atoms.s.scratch_state.emit = si_emit_scratch_state;
sctx->b.create_vs_state = si_create_shader;