intel/fs: Implement the new load/store_scratch intrinsics
This commit fills in a number of different pieces:
1. We add support to brw_nir_lower_mem_access_bit_sizes to handle the
new intrinsics. This involves simple plumbing work as well as a
tiny bit of extra logic to always scalarize scratch intrinsics.
2. Add code to brw_fs_nir.cpp to turn nir_load/store_scratch intrinsics
into byte/dword scattered read/write messages which use the A32
stateless model.
3. Add code to lower_surface_logical_send to handle dword scattered
messages and the A32 stateless model.
Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
This commit is contained in:
committed by
Jason Ekstrand
parent
e2297699de
commit
53bfcdeecf
@@ -42,6 +42,7 @@ fs_visitor::emit_nir_code()
|
||||
nir_setup_outputs();
|
||||
nir_setup_uniforms();
|
||||
nir_emit_system_values();
|
||||
last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;
|
||||
|
||||
nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
|
||||
}
|
||||
@@ -4023,6 +4024,61 @@ image_intrinsic_coord_components(nir_intrinsic_instr *instr)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The offsets we get from NIR act as if each SIMD channel has its own blob
|
||||
* of contiguous space. However, if we actually place each SIMD channel in
|
||||
* its own space, we end up with terrible cache performance because each SIMD
|
||||
* channel accesses a different cache line even when they're all accessing the
|
||||
* same byte offset. To deal with this problem, we swizzle the address using
|
||||
* a simple algorithm which ensures that any time a SIMD message reads or
|
||||
* writes the same address, it's all in the same cache line. We have to keep
|
||||
* the bottom two bits fixed so that we can read/write up to a dword at a time
|
||||
* and the individual element is contiguous. We do this by splitting the
|
||||
* address as follows:
|
||||
*
|
||||
* 31                             4-6           2          0
|
||||
* +-------------------------------+------------+----------+
|
||||
* |        Hi address bits        | chan index | addr low |
|
||||
* +-------------------------------+------------+----------+
|
||||
*
|
||||
* In other words, the bottom two address bits stay, and the top 30 get
|
||||
* shifted up so that we can stick the SIMD channel index in the middle. This
|
||||
* way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
|
||||
* element at the same logical offset, the scratch read/write instruction acts on
|
||||
* contiguous elements and we get good cache locality.
|
||||
*/
|
||||
fs_reg
fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,
                                     const fs_reg &nir_addr,
                                     bool in_dwords)
{
   /* Index of the lowest set bit of dispatch_width, i.e. the number of
    * address bits the channel index occupies (its log2, assuming
    * dispatch_width is a power of two).
    */
   const unsigned lane_bits = ffs(dispatch_width) - 1;
   const fs_reg &lane_id =
      nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];

   fs_reg swizzled = bld.vgrf(BRW_REGISTER_TYPE_UD);

   if (in_dwords) {
      /* The incoming address is known to be DWORD-aligned and the result
       * is wanted in DWORD units.  Shifting left by (lane_bits - 2) both
       * drops the two zero low bits and opens lane_bits of room at the
       * bottom, which the channel index is then OR'd into.
       */
      bld.SHL(swizzled, nir_addr, brw_imm_ud(lane_bits - 2));
      bld.OR(swizzled, swizzled, lane_id);
      return swizzled;
   }

   /* Byte-granular result: this case is more involved because the two
    * low address bits have to stay where they are.
    *
    * Everything above the low two bits moves up by lane_bits...
    */
   fs_reg upper = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.AND(upper, nir_addr, brw_imm_ud(~0x3u));
   bld.SHL(upper, upper, brw_imm_ud(lane_bits));

   /* ...the channel index lands directly above the low two bits... */
   fs_reg lane_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
   bld.SHL(lane_offset, lane_id, brw_imm_ud(2));

   /* ...and the three pieces are merged into the final address. */
   bld.AND(swizzled, nir_addr, brw_imm_ud(0x3u));
   bld.OR(swizzled, swizzled, upper);
   bld.OR(swizzled, swizzled, lane_offset);

   return swizzled;
}
|
||||
|
||||
void
|
||||
fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
|
||||
{
|
||||
@@ -4682,6 +4738,99 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_scratch: {
|
||||
assert(devinfo->gen >= 7);
|
||||
|
||||
assert(nir_dest_num_components(instr->dest) == 1);
|
||||
const unsigned bit_size = nir_dest_bit_size(instr->dest);
|
||||
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
|
||||
|
||||
if (devinfo->gen >= 8) {
|
||||
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
|
||||
brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
|
||||
} else {
|
||||
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
|
||||
}
|
||||
|
||||
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
|
||||
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
|
||||
const fs_reg nir_addr = get_nir_src(instr->src[0]);
|
||||
|
||||
/* Make dest unsigned because that's what the temporary will be */
|
||||
dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||
|
||||
/* Read the vector */
|
||||
if (nir_intrinsic_align(instr) >= 4) {
|
||||
assert(nir_dest_bit_size(instr->dest) == 32);
|
||||
|
||||
/* The offset for a DWORD scattered message is in dwords. */
|
||||
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
|
||||
swizzle_nir_scratch_addr(bld, nir_addr, true);
|
||||
|
||||
bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
|
||||
dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
|
||||
} else {
|
||||
assert(nir_dest_bit_size(instr->dest) <= 32);
|
||||
|
||||
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
|
||||
swizzle_nir_scratch_addr(bld, nir_addr, false);
|
||||
|
||||
fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
|
||||
read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
|
||||
bld.MOV(dest, read_result);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_store_scratch: {
|
||||
assert(devinfo->gen >= 7);
|
||||
|
||||
assert(nir_src_num_components(instr->src[0]) == 1);
|
||||
const unsigned bit_size = nir_src_bit_size(instr->src[0]);
|
||||
fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
|
||||
|
||||
if (devinfo->gen >= 8) {
|
||||
srcs[SURFACE_LOGICAL_SRC_SURFACE] =
|
||||
brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
|
||||
} else {
|
||||
srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
|
||||
}
|
||||
|
||||
srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
|
||||
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
|
||||
const fs_reg nir_addr = get_nir_src(instr->src[1]);
|
||||
|
||||
fs_reg data = get_nir_src(instr->src[0]);
|
||||
data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
|
||||
|
||||
assert(nir_intrinsic_write_mask(instr) ==
|
||||
(1u << instr->num_components) - 1);
|
||||
if (nir_intrinsic_align(instr) >= 4) {
|
||||
assert(nir_src_bit_size(instr->src[0]) == 32);
|
||||
srcs[SURFACE_LOGICAL_SRC_DATA] = data;
|
||||
|
||||
/* The offset for a DWORD scattered message is in dwords. */
|
||||
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
|
||||
swizzle_nir_scratch_addr(bld, nir_addr, true);
|
||||
|
||||
bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
|
||||
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
|
||||
} else {
|
||||
assert(nir_src_bit_size(instr->src[0]) <= 32);
|
||||
|
||||
srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
|
||||
|
||||
srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
|
||||
swizzle_nir_scratch_addr(bld, nir_addr, false);
|
||||
|
||||
bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
|
||||
fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_subgroup_size:
|
||||
/* This should only happen for fragment shaders because every other case
|
||||
* is lowered in NIR so we can optimize on it.
|
||||
|
||||
Reference in New Issue
Block a user