diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 19331827ce5..a8f5d0ffc7b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -1034,6 +1034,7 @@ struct brw_context
    bool has_negative_rhw_bug;
    bool has_pln;
    bool no_simd8;
+   bool use_rep_send;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 88f205785bb..248a866377d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -850,6 +850,7 @@ enum opcode {
     */
    FS_OPCODE_FB_WRITE = 128,
    FS_OPCODE_BLORP_FB_WRITE,
+   FS_OPCODE_REP_FB_WRITE,
    SHADER_OPCODE_RCP,
    SHADER_OPCODE_RSQ,
    SHADER_OPCODE_SQRT,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 565189ba3aa..f1d3fb8ec67 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2283,6 +2283,114 @@ fs_visitor::compute_to_mrf()
    return progress;
 }
 
+/**
+ * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
+ * instructions to FS_OPCODE_REP_FB_WRITE.
+ *
+ * A write is only converted when it directly follows four back-to-back MOVs
+ * that fill sequential MRFs of its payload from sequential HW_REG sources;
+ * those four MOVs are then replaced by a single vec4 MOV.
+ */
+void
+fs_visitor::try_rep_send()
+{
+   int count;
+   fs_inst *start = NULL;
+
+   /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
+    * ("Message Descriptor - Render Target Write"):
+    *
+    * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
+    */
+   if (dispatch_width != 16)
+      return;
+
+   /* The constant color write message can't handle anything but the 4 color
+    * values.  We could do MRT, but the loops below would need to understand
+    * handling the header being enabled or disabled on different messages.  It
+    * also requires that the render target be tiled, which might not be the
+    * case for some EGLImage paths or if we some day do rendering to PBOs.
+    */
+   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
+       payload.aa_dest_stencil_reg ||
+       payload.dest_depth_reg ||
+       dual_src_output.file != BAD_FILE)
+      return;
+
+   /* The optimization is implemented as one pass through the instruction
+    * list.  We keep track of the current run of back-to-back MOVs into
+    * sequential MRFs from single, sequential float registers (ie uniforms).
+    * Then when we find an FB_WRITE opcode right after such a run, we see if
+    * the payload registers match the destination registers of the MOVs.
+    *
+    * Any other instruction ends the run.  The rewrite below unlinks the four
+    * instructions that follow the new vec4 MOV, and those are only known to
+    * be the matched MOVs when nothing was interleaved with them.
+    */
+   count = 0;
+   foreach_in_list_safe(fs_inst, inst, &this->instructions) {
+      const bool is_candidate = inst->opcode == BRW_OPCODE_MOV &&
+                                inst->dst.file == MRF &&
+                                inst->src[0].file == HW_REG;
+
+      if (is_candidate && count > 0 && count < 4 &&
+          inst->dst.reg == start->dst.reg + 2 * count &&
+          inst->src[0].reg_offset == start->src[0].reg_offset + count) {
+         /* This MOV extends the current run. */
+         count++;
+         continue;
+      }
+
+      if (inst->opcode == FS_OPCODE_FB_WRITE &&
+          count == 4 &&
+          (inst->base_mrf == start->dst.reg ||
+           (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
+         fs_inst *mov = MOV(start->dst, start->src[0]);
+
+         /* Make a MOV that moves the four floats into the replicated write
+          * payload.  Since we're running at the very end of code generation
+          * we can use hw registers and generate the stride and offsets we
+          * need for this MOV.  We use the first of the eight registers
+          * allocated for the SIMD16 payload for the four floats.
+          */
+         mov->dst.fixed_hw_reg =
+            brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
+                         start->dst.reg, 0);
+         mov->dst.file = HW_REG;
+         mov->dst.type = BRW_REGISTER_TYPE_F;
+
+         mov->src[0].fixed_hw_reg =
+            brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+         mov->src[0].file = HW_REG;
+         mov->src[0].type = mov->src[0].fixed_hw_reg.type;
+         mov->force_writemask_all = true;
+
+         /* Replace the four MOVs with the new vec4 MOV. */
+         start->insert_before(mov);
+         for (int i = 0; i < 4; i++)
+            mov->next->remove();
+
+         /* Finally, adjust the message length and set the opcode to
+          * REP_FB_WRITE for the send, so that the generator will use the
+          * replicated data message type.  Then drop the run so we'll start
+          * looking for a new block in case we're in a MRT shader.
+          */
+         inst->opcode = FS_OPCODE_REP_FB_WRITE;
+         inst->mlen -= 7;
+         count = 0;
+         continue;
+      }
+
+      /* Anything else breaks the run; a candidate MOV starts a new one. */
+      if (is_candidate) {
+         start = inst;
+         count = 1;
+      } else {
+         count = 0;
+      }
+   }
+}
+
 /**
  * Walks through basic blocks, looking for repeated MRF writes and
  * removing the later ones.
@@ -3226,6 +3334,9 @@ fs_visitor::run()
       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
    }
 
+   if (brw->use_rep_send)
+      try_rep_send();
+
    if (dispatch_width == 8)
       prog_data->reg_blocks = brw_register_blocks(grf_used);
    else
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0f8fb2d8c55..9e5b5d7eff3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -355,6 +355,8 @@ public:
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
 
+   void try_rep_send();
+
    void push_force_uncompressed();
    void pop_force_uncompressed();
 
@@ -590,6 +592,7 @@ private:
                          GLuint nr);
    void generate_fb_write(fs_inst *inst);
    void generate_blorp_fb_write(fs_inst *inst);
+   void generate_rep_fb_write(fs_inst *inst);
    void generate_pixel_xy(struct brw_reg dst, bool is_x);
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
                          struct brw_reg *src);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 1cf5a886fa2..a2430034086 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -115,7 +115,9 @@ fs_generator::fire_fb_write(fs_inst *inst,
       brw_pop_insn_state(p);
    }
 
-   if (prog_data->dual_src_blend)
+   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
+   else if (prog_data->dual_src_blend)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
    else if (dispatch_width == 16)
       msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
@@ -1839,6 +1841,7 @@ fs_generator::generate_code(exec_list *instructions)
          generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
          break;
 
+      case FS_OPCODE_REP_FB_WRITE:
       case FS_OPCODE_FB_WRITE:
          generate_fb_write(inst);
          break;