r300: set PVS_LAST_VTX_SRC_INST properly to last input read

From docs:
The PVS Instruction which uses the Input Vertex Memory for the last
time. This value is used to free up the Input Vertex Slots ASAP.
This field must be set to a valid instruction.

Until now it was set to the last instruction. With this change it is set
to the last instruction that reads an input; when that read is inside a
loop, it is set to the outermost ENDLOOP instead. This could in theory
help performance, but none of my usual benchmarks, including GLmark,
Unigine Sanctuary, and Lightsmark, show any measurable performance difference.

Suggested in: https://gitlab.freedesktop.org/mesa/mesa/-/issues/6045

Signed-off-by: Pavel Ondračka <pavel.ondracka@gmail.com>
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15252>
This commit is contained in:
Pavel Ondračka
2022-03-04 10:27:54 +01:00
committed by Marge Bot
parent 43c3f4386b
commit 19db6b760a
3 changed files with 19 additions and 2 deletions
@@ -371,10 +371,12 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
unsigned loops[R500_PVS_MAX_LOOP_DEPTH] = {};
unsigned loop_depth = 0;
bool last_input_read_at_loop_end = false;
compiler->code->pos_end = 0; /* Not supported yet */
compiler->code->length = 0;
compiler->code->num_temporaries = 0;
compiler->code->last_input_read = 0;
compiler->SetHwInputOutput(compiler);
@@ -448,6 +450,11 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
unsigned int last_addr;
unsigned int ret_addr;
if (loop_depth == 1 && last_input_read_at_loop_end) {
compiler->code->last_input_read = compiler->code->length / 4;
last_input_read_at_loop_end = false;
}
ret_addr = loops[--loop_depth];
act_addr = ret_addr - 1;
last_addr = (compiler->code->length / 4) - 1;
@@ -536,10 +543,19 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
vpi->DstReg.Index >= compiler->code->num_temporaries)
compiler->code->num_temporaries = vpi->DstReg.Index + 1;
for (unsigned i = 0; i < info->NumSrcRegs; i++)
for (unsigned i = 0; i < info->NumSrcRegs; i++) {
if (vpi->SrcReg[i].File == RC_FILE_TEMPORARY &&
vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
if (vpi->SrcReg[i].File == RC_FILE_INPUT) {
if (loop_depth == 0)
compiler->code->last_input_read = compiler->code->length / 4;
else
last_input_read_at_loop_end = true;
}
}
if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
rc_error(&compiler->Base, "Too many temporaries.\n");
@@ -270,6 +270,7 @@ struct r300_vertex_program_code {
int num_temporaries; /* Number of temp vars used by program */
int inputs[VSF_MAX_INPUTS];
int outputs[VSF_MAX_OUTPUTS];
unsigned last_input_read;
struct rc_constant_list constants;
unsigned *constants_remap_table;
+1 -1
View File
@@ -1128,7 +1128,7 @@ void r300_emit_vs_state(struct r300_context* r300, unsigned size, void* state)
OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_0, R300_PVS_FIRST_INST(0) |
R300_PVS_XYZW_VALID_INST(instruction_count - 1) |
R300_PVS_LAST_INST(instruction_count - 1));
OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, instruction_count - 1);
OUT_CS_REG(R300_VAP_PVS_CODE_CNTL_1, code->last_input_read);
OUT_CS_REG(R300_VAP_PVS_VECTOR_INDX_REG, 0);
OUT_CS_ONE_REG(R300_VAP_PVS_UPLOAD_DATA, code->length);