diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 7d3f3909d83..3446d934d3c 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -3167,200 +3167,6 @@ fs_visitor::opt_redundant_halt() return progress; } -/** - * Compute a bitmask with GRF granularity with a bit set for each GRF starting - * from \p r.offset which overlaps the region starting at \p s.offset and - * spanning \p ds bytes. - */ -static inline unsigned -mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds) -{ - const int rel_offset = reg_offset(s) - reg_offset(r); - const int shift = rel_offset / REG_SIZE; - const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE); - assert(reg_space(r) == reg_space(s) && - shift >= 0 && shift < int(8 * sizeof(unsigned))); - return ((1 << n) - 1) << shift; -} - -bool -fs_visitor::compute_to_mrf() -{ - bool progress = false; - int next_ip = 0; - - /* No MRFs on Gen >= 7. */ - if (devinfo->ver >= 7) - return false; - - const fs_live_variables &live = live_analysis.require(); - - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - int ip = next_ip; - next_ip++; - - if (inst->opcode != BRW_OPCODE_MOV || - inst->is_partial_write() || - inst->dst.file != MRF || inst->src[0].file != VGRF || - inst->dst.type != inst->src[0].type || - inst->src[0].abs || inst->src[0].negate || - !inst->src[0].is_contiguous() || - inst->src[0].offset % REG_SIZE != 0) - continue; - - /* Can't compute-to-MRF this GRF if someone else was going to - * read it later. - */ - if (live.vgrf_end[inst->src[0].nr] > ip) - continue; - - /* Found a move of a GRF to a MRF. Let's see if we can go rewrite the - * things that computed the value of all GRFs of the source region. The - * regs_left bitset keeps track of the registers we haven't yet found a - * generating instruction for. - */ - unsigned regs_left = (1 << regs_read(inst, 0)) - 1; - - foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (regions_overlap(scan_inst->dst, scan_inst->size_written, - inst->src[0], inst->size_read(0))) { - /* Found the last thing to write our reg we want to turn - * into a compute-to-MRF. - */ - - /* If this one instruction didn't populate all the - * channels, bail. We might be able to rewrite everything - * that writes that reg, but it would require smarter - * tracking. - */ - if (scan_inst->is_partial_write()) - break; - - /* Handling things not fully contained in the source of the copy - * would need us to understand coalescing out more than one MOV at - * a time. - */ - if (!region_contained_in(scan_inst->dst, scan_inst->size_written, - inst->src[0], inst->size_read(0))) - break; - - /* SEND instructions can't have MRF as a destination. */ - if (scan_inst->mlen) - break; - - if (devinfo->ver == 6) { - /* gfx6 math instructions must have the destination be - * GRF, so no compute-to-MRF for them. - */ - if (scan_inst->is_math()) { - break; - } - } - - /* Clear the bits for any registers this instruction overwrites. */ - regs_left &= ~mask_relative_to( - inst->src[0], scan_inst->dst, scan_inst->size_written); - if (!regs_left) - break; - } - - /* We don't handle control flow here. Most computation of - * values that end up in MRFs are shortly before the MRF - * write anyway. - */ - if (block->start() == scan_inst) - break; - - /* You can't read from an MRF, so if someone else reads our - * MRF's source GRF that we wanted to rewrite, that stops us. - */ - bool interfered = false; - for (int i = 0; i < scan_inst->sources; i++) { - if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i), - inst->src[0], inst->size_read(0))) { - interfered = true; - } - } - if (interfered) - break; - - if (regions_overlap(scan_inst->dst, scan_inst->size_written, - inst->dst, inst->size_written)) { - /* If somebody else writes our MRF here, we can't - * compute-to-MRF before that. - */ - break; - } - - if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 && - regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE, - inst->dst, inst->size_written)) { - /* Found a SEND instruction, which means that there are - * live values in MRFs from base_mrf to base_mrf + - * scan_inst->mlen - 1. Don't go pushing our MRF write up - * above it. - */ - break; - } - } - - if (regs_left) - continue; - - /* Found all generating instructions of our MRF's source value, so it - * should be safe to rewrite them to point to the MRF directly. - */ - regs_left = (1 << regs_read(inst, 0)) - 1; - - foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - if (regions_overlap(scan_inst->dst, scan_inst->size_written, - inst->src[0], inst->size_read(0))) { - /* Clear the bits for any registers this instruction overwrites. */ - regs_left &= ~mask_relative_to( - inst->src[0], scan_inst->dst, scan_inst->size_written); - - const unsigned rel_offset = reg_offset(scan_inst->dst) - - reg_offset(inst->src[0]); - - if (inst->dst.nr & BRW_MRF_COMPR4) { - /* Apply the same address transformation done by the hardware - * for COMPR4 MRF writes. - */ - assert(rel_offset < 2 * REG_SIZE); - scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4; - - /* Clear the COMPR4 bit if the generating instruction is not - * compressed. - */ - if (scan_inst->size_written < 2 * REG_SIZE) - scan_inst->dst.nr &= ~BRW_MRF_COMPR4; - - } else { - /* Calculate the MRF number the result of this instruction is - * ultimately written to. - */ - scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE; - } - - scan_inst->dst.file = MRF; - scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE; - scan_inst->saturate |= inst->saturate; - if (!regs_left) - break; - } - } - - assert(!regs_left); - inst->remove(block); - progress = true; - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - /** * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control * flow. We could probably do better here with some form of divergence @@ -3494,81 +3300,6 @@ fs_visitor::emit_repclear_shader() lower_scoreboard(); } -/** - * Walks through basic blocks, looking for repeated MRF writes and - * removing the later ones. - */ -bool -fs_visitor::remove_duplicate_mrf_writes() -{ - fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)]; - bool progress = false; - - /* Need to update the MRF tracking for compressed instructions. */ - if (dispatch_width >= 16) - return false; - - memset(last_mrf_move, 0, sizeof(last_mrf_move)); - - foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { - if (inst->is_control_flow()) { - memset(last_mrf_move, 0, sizeof(last_mrf_move)); - } - - if (inst->opcode == BRW_OPCODE_MOV && - inst->dst.file == MRF) { - fs_inst *prev_inst = last_mrf_move[inst->dst.nr]; - if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV && - inst->dst.equals(prev_inst->dst) && - inst->src[0].equals(prev_inst->src[0]) && - inst->saturate == prev_inst->saturate && - inst->predicate == prev_inst->predicate && - inst->conditional_mod == prev_inst->conditional_mod && - inst->exec_size == prev_inst->exec_size) { - inst->remove(block); - progress = true; - continue; - } - } - - /* Clear out the last-write records for MRFs that were overwritten. */ - if (inst->dst.file == MRF) { - last_mrf_move[inst->dst.nr] = NULL; - } - - if (inst->mlen > 0 && inst->base_mrf != -1) { - /* Found a SEND instruction, which will include two or fewer - * implied MRF writes. We could do better here. - */ - for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { - last_mrf_move[inst->base_mrf + i] = NULL; - } - } - - /* Clear out any MRF move records whose sources got overwritten. */ - for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) { - if (last_mrf_move[i] && - regions_overlap(inst->dst, inst->size_written, - last_mrf_move[i]->src[0], - last_mrf_move[i]->size_read(0))) { - last_mrf_move[i] = NULL; - } - } - - if (inst->opcode == BRW_OPCODE_MOV && - inst->dst.file == MRF && - inst->src[0].file != ARF && - !inst->is_partial_write()) { - last_mrf_move[inst->dst.nr] = inst; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - /** * Rounding modes for conversion instructions are included for each * conversion, but right now it is a state. So once it is set, @@ -3618,185 +3349,6 @@ fs_visitor::remove_extra_rounding_modes() return progress; } -static void -clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) -{ - /* Clear the flag for registers that actually got read (as expected). */ - for (int i = 0; i < inst->sources; i++) { - int grf; - if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) { - grf = inst->src[i].nr; - } else { - continue; - } - - if (grf >= first_grf && - grf < first_grf + grf_len) { - deps[grf - first_grf] = false; - if (inst->exec_size == 16) - deps[grf - first_grf + 1] = false; - } - } -} - -/** - * Implements this workaround for the original 965: - * - * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not - * check for post destination dependencies on this instruction, software - * must ensure that there is no destination hazard for the case of ‘write - * followed by a posted write’ shown in the following example. - * - * 1. mov r3 0 - * 2. send r3.xy - * 3. mov r2 r3 - * - * Due to no post-destination dependency check on the ‘send’, the above - * code sequence could have two instructions (1 and 2) in flight at the - * same time that both consider ‘r3’ as the target of their final writes. - */ -void -fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block, - fs_inst *inst) -{ - int write_len = regs_written(inst); - int first_write_grf = inst->dst.nr; - bool needs_dep[BRW_MAX_MRF(devinfo->ver)]; - assert(write_len < (int)sizeof(needs_dep) - 1); - - memset(needs_dep, false, sizeof(needs_dep)); - memset(needs_dep, true, write_len); - - clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len); - - /* Walk backwards looking for writes to registers we're writing which - * aren't read since being written. If we hit the start of the program, - * we assume that there are no outstanding dependencies on entry to the - * program. - */ - foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) { - /* If we hit control flow, assume that there *are* outstanding - * dependencies, and force their cleanup before our instruction. - */ - if (block->start() == scan_inst && block->num != 0) { - for (int i = 0; i < write_len; i++) { - if (needs_dep[i]) - DEP_RESOLVE_MOV(fs_builder(this, block, inst), - first_write_grf + i); - } - return; - } - - /* We insert our reads as late as possible on the assumption that any - * instruction but a MOV that might have left us an outstanding - * dependency has more latency than a MOV. - */ - if (scan_inst->dst.file == VGRF) { - for (unsigned i = 0; i < regs_written(scan_inst); i++) { - int reg = scan_inst->dst.nr + i; - - if (reg >= first_write_grf && - reg < first_write_grf + write_len && - needs_dep[reg - first_write_grf]) { - DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg); - needs_dep[reg - first_write_grf] = false; - if (scan_inst->exec_size == 16) - needs_dep[reg - first_write_grf + 1] = false; - } - } - } - - /* Clear the flag for registers that actually got read (as expected). */ - clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); - - /* Continue the loop only if we haven't resolved all the dependencies */ - int i; - for (i = 0; i < write_len; i++) { - if (needs_dep[i]) - break; - } - if (i == write_len) - return; - } -} - -/** - * Implements this workaround for the original 965: - * - * "[DevBW, DevCL] Errata: A destination register from a send can not be - * used as a destination register until after it has been sourced by an - * instruction with a different destination register. - */ -void -fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) -{ - int write_len = regs_written(inst); - unsigned first_write_grf = inst->dst.nr; - bool needs_dep[BRW_MAX_MRF(devinfo->ver)]; - assert(write_len < (int)sizeof(needs_dep) - 1); - - memset(needs_dep, false, sizeof(needs_dep)); - memset(needs_dep, true, write_len); - /* Walk forwards looking for writes to registers we're writing which aren't - * read before being written. - */ - foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) { - /* If we hit control flow, force resolve all remaining dependencies. */ - if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) { - for (int i = 0; i < write_len; i++) { - if (needs_dep[i]) - DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), - first_write_grf + i); - } - return; - } - - /* Clear the flag for registers that actually got read (as expected). */ - clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len); - - /* We insert our reads as late as possible since they're reading the - * result of a SEND, which has massive latency. - */ - if (scan_inst->dst.file == VGRF && - scan_inst->dst.nr >= first_write_grf && - scan_inst->dst.nr < first_write_grf + write_len && - needs_dep[scan_inst->dst.nr - first_write_grf]) { - DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst), - scan_inst->dst.nr); - needs_dep[scan_inst->dst.nr - first_write_grf] = false; - } - - /* Continue the loop only if we haven't resolved all the dependencies */ - int i; - for (i = 0; i < write_len; i++) { - if (needs_dep[i]) - break; - } - if (i == write_len) - return; - } -} - -void -fs_visitor::insert_gfx4_send_dependency_workarounds() -{ - if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X) - return; - - bool progress = false; - - foreach_block_and_inst(block, fs_inst, inst, cfg) { - if (inst->mlen != 0 && inst->dst.file == VGRF) { - insert_gfx4_pre_send_dependency_workarounds(block, inst); - insert_gfx4_post_send_dependency_workarounds(block, inst); - progress = true; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); -} - bool fs_visitor::lower_load_payload() { @@ -4396,44 +3948,6 @@ fs_visitor::lower_integer_multiplication() return progress; } -bool -fs_visitor::lower_minmax() -{ - assert(devinfo->ver < 6); - - bool progress = false; - - foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - const fs_builder ibld(this, block, inst); - - if (inst->opcode == BRW_OPCODE_SEL && - inst->predicate == BRW_PREDICATE_NONE) { - /* If src1 is an immediate value that is not NaN, then it can't be - * NaN. In that case, emit CMP because it is much better for cmod - * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't - * support HF or DF, so it is not necessary to check for those. - */ - if (inst->src[1].type != BRW_REGISTER_TYPE_F || - (inst->src[1].file == IMM && !isnan(inst->src[1].f))) { - ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], - inst->conditional_mod); - } else { - ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1], - inst->conditional_mod); - } - inst->predicate = BRW_PREDICATE_NORMAL; - inst->conditional_mod = BRW_CONDITIONAL_NONE; - - progress = true; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - bool fs_visitor::lower_sub_sat() { @@ -6163,8 +5677,6 @@ fs_visitor::optimize() pass_num = 0; iteration++; - OPT(remove_duplicate_mrf_writes); - OPT(opt_algebraic); OPT(opt_cse); OPT(opt_copy_propagation); @@ -6175,7 +5687,6 @@ fs_visitor::optimize() OPT(dead_control_flow_eliminate, this); OPT(opt_saturate_propagation); OPT(register_coalesce); - OPT(compute_to_mrf); OPT(eliminate_find_live_channel); OPT(compact_virtual_grfs); @@ -6201,10 +5712,8 @@ fs_visitor::optimize() /* Identify trailing zeros LOAD_PAYLOAD of sampler messages. * Do this before splitting SENDs. */ - if (devinfo->ver >= 7) { - if (OPT(opt_zero_samples) && OPT(opt_copy_propagation)) - OPT(opt_algebraic); - } + if (OPT(opt_zero_samples) && OPT(opt_copy_propagation)) + OPT(opt_algebraic); OPT(opt_split_sends); OPT(fixup_nomask_control_flow); @@ -6220,9 +5729,7 @@ fs_visitor::optimize() */ OPT(opt_cse); OPT(register_coalesce); - OPT(compute_to_mrf); OPT(dead_code_eliminate); - OPT(remove_duplicate_mrf_writes); OPT(opt_peephole_sel); } @@ -6237,7 +5744,6 @@ fs_visitor::optimize() OPT(register_coalesce); OPT(lower_simd_width); - OPT(compute_to_mrf); OPT(dead_code_eliminate); } @@ -6251,14 +5757,6 @@ fs_visitor::optimize() } OPT(lower_sub_sat); - if (devinfo->ver <= 5 && OPT(lower_minmax)) { - OPT(opt_cmod_propagation); - OPT(opt_cse); - if (OPT(opt_copy_propagation)) - OPT(opt_algebraic); - OPT(dead_code_eliminate); - } - progress = false; OPT(lower_derivatives); OPT(lower_regioning); @@ -6770,12 +6268,6 @@ fs_visitor::allocate_registers(bool allow_spilling) _mesa_shader_stage_to_string(stage)); } - /* This must come after all optimization and register allocation, since - * it inserts dead code that happens to have side effects, and it does - * so based on the actual physical registers in use. - */ - insert_gfx4_send_dependency_workarounds(); - if (failed) return; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 0ee32403541..0f9a67f8852 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -276,10 +276,8 @@ public: bool opt_bank_conflicts(); bool opt_split_sends(); bool register_coalesce(); - bool compute_to_mrf(); bool eliminate_find_live_channel(); bool dead_code_eliminate(); - bool remove_duplicate_mrf_writes(); bool remove_extra_rounding_modes(); fs_instruction_scheduler *prepare_scheduler(void *mem_ctx); @@ -287,11 +285,6 @@ public: instruction_scheduler_mode mode); void schedule_instructions_post_ra(); - void insert_gfx4_send_dependency_workarounds(); - void insert_gfx4_pre_send_dependency_workarounds(bblock_t *block, - fs_inst *inst); - void insert_gfx4_post_send_dependency_workarounds(bblock_t *block, - fs_inst *inst); void vfail(const char *msg, va_list args); void fail(const char *msg, ...); void limit_dispatch_width(unsigned n, const char *msg); @@ -301,7 +294,6 @@ public: bool lower_regioning(); bool lower_logical_sends(); bool lower_integer_multiplication(); - bool lower_minmax(); bool lower_simd_width(); bool lower_barycentrics(); bool lower_derivatives();