From 19db6b760aa3dd1ce510e80e5567992d955cd067 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pavel=20Ondra=C4=8Dka?= Date: Fri, 4 Mar 2022 10:27:54 +0100 Subject: [PATCH] r300: set PVS_LAST_VTX_SRC_INST properly to last input read MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From docs: The PVS Instruction which uses the Input Vertex Memory for the last time. This value is used to free up the Input Vertex Slots ASAP. This field must be set to a valid instruction. Right now it is set to the last instruction. When the last read is inside a loop, set it on the outermost ENDLOOP. This could in theory help performance, but none of my usual benchmarks including GLmark, Unigine Sanctuary or Lightsmark show any measurable performance difference. Suggested in: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6045 Signed-off-by: Pavel Ondračka Reviewed-by: Emma Anholt Part-of: --- .../drivers/r300/compiler/r3xx_vertprog.c | 18 +++++++++++++++++- .../drivers/r300/compiler/radeon_code.h | 1 + src/gallium/drivers/r300/r300_emit.c | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index bc65fa4c80b..39db61e1682 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -371,10 +371,12 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {}; unsigned loop_depth = 0; + bool last_input_read_at_loop_end = false; compiler->code->pos_end = 0; /* Not supported yet */ compiler->code->length = 0; compiler->code->num_temporaries = 0; + compiler->code->last_input_read = 0; compiler->SetHwInputOutput(compiler); @@ -448,6 +450,11 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) unsigned int last_addr; unsigned int ret_addr; + if (loop_depth == 1 && 
last_input_read_at_loop_end) { + compiler->code->last_input_read = compiler->code->length / 4; + last_input_read_at_loop_end = false; + } + ret_addr = loops[--loop_depth]; act_addr = ret_addr - 1; last_addr = (compiler->code->length / 4) - 1; @@ -536,10 +543,19 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) vpi->DstReg.Index >= compiler->code->num_temporaries) compiler->code->num_temporaries = vpi->DstReg.Index + 1; - for (unsigned i = 0; i < info->NumSrcRegs; i++) + for (unsigned i = 0; i < info->NumSrcRegs; i++) { if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY && vpi->SrcReg[i].Index >= compiler->code->num_temporaries) compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; + if (vpi->SrcReg[i].File == RC_FILE_INPUT) { + if (loop_depth == 0) + compiler->code->last_input_read = compiler->code->length / 4; + else + last_input_read_at_loop_end = true; + } + + } + if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { rc_error(&compiler->Base, "Too many temporaries.\n"); diff --git a/src/gallium/drivers/r300/compiler/radeon_code.h b/src/gallium/drivers/r300/compiler/radeon_code.h index 0c90a7ccb52..52bfab7f1f5 100644 --- a/src/gallium/drivers/r300/compiler/radeon_code.h +++ b/src/gallium/drivers/r300/compiler/radeon_code.h @@ -270,6 +270,7 @@ struct r300_vertex_program_code { int num_temporaries; /* Number of temp vars used by program */ int inputs[VSF_MAX_INPUTS]; int outputs[VSF_MAX_OUTPUTS]; + unsigned last_input_read; struct rc_constant_list constants; unsigned *constants_remap_table; diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c index 0411b950224..efb1cc792dd 100644 --- a/src/gallium/drivers/r300/r300_emit.c +++ b/src/gallium/drivers/r300/r300_emit.c @@ -1128,7 +1128,7 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state) OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_0, R300_PVS_FIRST_INST(0) | R300_PVS_XYZW_VALID_INST(instruction_count - 1) | 
R300_PVS_LAST_INST(instruction_count - 1)); - OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, instruction_count - 1); + OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, code->last_input_read); OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0); OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->length);