intel/elk: Use common code in intel/compiler

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27563>
Author: Caio Oliveira
Date: 2024-01-19 13:02:48 -08:00
Committed by: Marge Bot
Parent: dcf29202d4
Commit: 8a57012ff4
21 changed files with 4 additions and 2718 deletions
@@ -36,7 +36,7 @@
#include "brw_eu_defines.h"
#include "brw_eu.h"
#include "brw_shader.h"
#include "intel_gfx_ver_enum.h"
#include "../intel_gfx_ver_enum.h"
#include "dev/intel_debug.h"
#include "util/u_debug.h"
@@ -37,7 +37,7 @@
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "../intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
@@ -21,7 +21,7 @@
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "../intel_nir.h"
#include "brw_nir.h"
#include "brw_shader.h"
#include "dev/intel_debug.h"
@@ -27,7 +27,7 @@
* Tessellation control shader specific code derived from the vec4_visitor class.
*/
#include "intel_nir.h"
#include "../intel_nir.h"
#include "brw_nir.h"
#include "brw_vec4_tcs.h"
#include "brw_fs.h"
@@ -1,72 +0,0 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef INTEL_GFX_VER_ENUM_H
#define INTEL_GFX_VER_ENUM_H
#include "util/macros.h"
#include "dev/intel_device_info.h"
enum gfx_ver {
GFX4 = (1 << 0),
GFX45 = (1 << 1),
GFX5 = (1 << 2),
GFX6 = (1 << 3),
GFX7 = (1 << 4),
GFX75 = (1 << 5),
GFX8 = (1 << 6),
GFX9 = (1 << 7),
GFX10 = (1 << 8),
GFX11 = (1 << 9),
GFX12 = (1 << 10),
GFX125 = (1 << 11),
XE2 = (1 << 12),
GFX_ALL = ~0
};
#define GFX_LT(ver) ((ver) - 1)
#define GFX_GE(ver) (~GFX_LT(ver))
#define GFX_LE(ver) (GFX_LT(ver) | (ver))
static inline enum gfx_ver
gfx_ver_from_devinfo(const struct intel_device_info *devinfo)
{
switch (devinfo->verx10) {
case 40: return GFX4;
case 45: return GFX45;
case 50: return GFX5;
case 60: return GFX6;
case 70: return GFX7;
case 75: return GFX75;
case 80: return GFX8;
case 90: return GFX9;
case 110: return GFX11;
case 120: return GFX12;
case 125: return GFX125;
case 200: return XE2;
default:
unreachable("not reached");
}
}
#endif
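/* A minimal sketch (not part of this header) of how these masks are meant to
 * be consumed: a table entry is tagged with a mask such as GFX_GE(GFX8) and
 * matched against the running platform. The struct and field names below are
 * hypothetical, purely for illustration of the bit test.
 */
#include <stdbool.h>
struct example_entry {
   unsigned opcode;    /* hypothetical payload */
   unsigned gfx_vers;  /* mask built from enum gfx_ver, e.g. GFX_GE(GFX8) */
};
static inline bool
example_entry_matches(const struct example_entry *e,
                      const struct intel_device_info *devinfo)
{
   return (e->gfx_vers & gfx_ver_from_devinfo(devinfo)) != 0;
}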
@@ -1,28 +0,0 @@
/*
* Copyright (c) 2014-2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "intel_nir.h"
bool
intel_nir_pulls_at_sample(nir_shader *shader)
{
nir_foreach_function_impl(impl, shader) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic == nir_intrinsic_load_barycentric_at_sample)
return true;
}
}
}
return false;
}
@@ -1,46 +0,0 @@
/*
* Copyright (c) 2015-2023 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#ifndef INTEL_NIR_H
#define INTEL_NIR_H
#include "nir.h"
#ifdef __cplusplus
extern "C" {
#endif
struct intel_device_info;
void intel_nir_apply_tcs_quads_workaround(nir_shader *nir);
bool intel_nir_blockify_uniform_loads(nir_shader *shader,
const struct intel_device_info *devinfo);
bool intel_nir_clamp_image_1d_2d_array_sizes(nir_shader *shader);
bool intel_nir_clamp_per_vertex_loads(nir_shader *shader);
bool intel_nir_cleanup_resource_intel(nir_shader *shader);
bool intel_nir_lower_conversions(nir_shader *nir);
bool intel_nir_lower_non_uniform_barycentric_at_sample(nir_shader *nir);
bool intel_nir_lower_non_uniform_resource_intel(nir_shader *shader);
bool intel_nir_lower_patch_vertices_in(nir_shader *shader, unsigned input_vertices);
bool intel_nir_lower_shading_rate_output(nir_shader *nir);
bool intel_nir_lower_sparse_intrinsics(nir_shader *nir);
struct intel_nir_lower_texture_opts {
bool combined_lod_and_array_index;
};
bool intel_nir_lower_texture(nir_shader *nir,
const struct intel_nir_lower_texture_opts *opts);
bool intel_nir_opt_peephole_ffma(nir_shader *shader);
bool intel_nir_opt_peephole_imul32x16(nir_shader *shader);
bool intel_nir_pulls_at_sample(nir_shader *shader);
#ifdef __cplusplus
}
#endif
#endif /* INTEL_NIR_H */
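/* A hedged sketch of how a driver might schedule these passes; the wrapper
 * function below is hypothetical, but NIR_PASS is the standard nir.h macro
 * that runs a pass and accumulates whether it made progress.
 */
#include "nir.h"
#include "dev/intel_device_info.h"
#include "intel_nir.h"
static void
example_run_intel_nir_passes(nir_shader *nir,
                             const struct intel_device_info *devinfo)
{
   bool progress = false;
   NIR_PASS(progress, nir, intel_nir_lower_conversions);
   NIR_PASS(progress, nir, intel_nir_clamp_image_1d_2d_array_sizes);
   NIR_PASS(progress, nir, intel_nir_blockify_uniform_loads, devinfo);
   if (progress)
      NIR_PASS(progress, nir, nir_opt_dce); /* clean up after the lowering */
}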
@@ -1,116 +0,0 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "dev/intel_device_info.h"
#include "intel_nir.h"
#include "isl/isl.h"
#include "nir_builder.h"
static bool
intel_nir_blockify_uniform_loads_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
const struct intel_device_info *devinfo = cb_data;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ssbo:
/* BDW PRMs, Volume 7: 3D-Media-GPGPU: OWord Block ReadWrite:
*
* "The surface base address must be OWord-aligned."
*
* We can't make that guarantee with SSBOs where the alignment is
* 4bytes.
*/
if (devinfo->ver < 9)
return false;
if (nir_src_is_divergent(intrin->src[1]))
return false;
if (intrin->def.bit_size != 32)
return false;
/* Without the LSC, we can only do block loads of at least 4dwords (1
* oword).
*/
if (!devinfo->has_lsc && intrin->def.num_components < 4)
return false;
intrin->intrinsic =
intrin->intrinsic == nir_intrinsic_load_ubo ?
nir_intrinsic_load_ubo_uniform_block_intel :
nir_intrinsic_load_ssbo_uniform_block_intel;
return true;
case nir_intrinsic_load_shared:
/* Block loads on shared memory are not supported before the LSC. */
if (!devinfo->has_lsc)
return false;
if (nir_src_is_divergent(intrin->src[0]))
return false;
if (intrin->def.bit_size != 32)
return false;
intrin->intrinsic = nir_intrinsic_load_shared_uniform_block_intel;
return true;
case nir_intrinsic_load_global_constant:
if (nir_src_is_divergent(intrin->src[0]))
return false;
if (intrin->def.bit_size != 32)
return false;
/* Without the LSC, we can only do block loads of at least 4dwords (1
* oword).
*/
if (!devinfo->has_lsc && intrin->def.num_components < 4)
return false;
intrin->intrinsic = nir_intrinsic_load_global_constant_uniform_block_intel;
return true;
default:
return false;
}
}
bool
intel_nir_blockify_uniform_loads(nir_shader *shader,
const struct intel_device_info *devinfo)
{
return nir_shader_instructions_pass(shader,
intel_nir_blockify_uniform_loads_instr,
nir_metadata_block_index |
nir_metadata_dominance |
nir_metadata_live_defs,
(void *) devinfo);
}
@@ -1,144 +0,0 @@
/*
* Copyright © 2020 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "intel_nir.h"
/**
* Wa_1806565034:
*
* Gfx12+ allows RENDER_SURFACE_STATE::SurfaceArray to be set to 1 only if
* array_len > 1. Setting RENDER_SURFACE_STATE::SurfaceArray to 0 results in
* the HW RESINFO message to report an array size of 0 which breaks texture
* array size queries.
*
* This NIR pass works around this by patching the array size with a
* MAX(array_size, 1) for array textures.
*/
static bool
intel_nir_clamp_image_1d_2d_array_sizes_instr(nir_builder *b,
nir_instr *instr,
UNUSED void *cb_data)
{
nir_def *image_size = NULL;
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
switch (intr->intrinsic) {
case nir_intrinsic_image_size:
case nir_intrinsic_bindless_image_size:
if (!nir_intrinsic_image_array(intr))
break;
image_size = &intr->def;
break;
case nir_intrinsic_image_deref_size: {
nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
assert(glsl_type_is_image(deref->type));
if (!glsl_sampler_type_is_array(deref->type))
break;
image_size = &intr->def;
break;
}
default:
break;
}
break;
}
case nir_instr_type_tex: {
nir_tex_instr *tex_instr = nir_instr_as_tex(instr);
if (tex_instr->op != nir_texop_txs)
break;
if (!tex_instr->is_array)
break;
image_size = &tex_instr->def;
break;
}
default:
break;
}
if (!image_size)
return false;
b->cursor = nir_after_instr(instr);
nir_def *components[4];
/* OR all the sizes for all components but the last. */
nir_def *or_components = nir_imm_int(b, 0);
for (int i = 0; i < image_size->num_components; i++) {
if (i == (image_size->num_components - 1)) {
nir_def *null_or_size[2] = {
nir_imm_int(b, 0),
nir_imax(b, nir_channel(b, image_size, i),
nir_imm_int(b, 1)),
};
nir_def *vec2_null_or_size = nir_vec(b, null_or_size, 2);
/* Using the ORed sizes select either the element 0 or 1
* from this vec2. For NULL textures which have a size of
* 0x0x0, we'll select the first element which is 0 and for
* the rest MAX(depth, 1).
*/
components[i] =
nir_vector_extract(b, vec2_null_or_size,
nir_imin(b, or_components,
nir_imm_int(b, 1)));
} else {
components[i] = nir_channel(b, image_size, i);
or_components = nir_ior(b, components[i], or_components);
}
}
nir_def *image_size_replacement =
nir_vec(b, components, image_size->num_components);
b->cursor = nir_after_instr(instr);
nir_def_rewrite_uses_after(image_size,
image_size_replacement,
image_size_replacement->parent_instr);
return true;
}
bool
intel_nir_clamp_image_1d_2d_array_sizes(nir_shader *shader)
{
return nir_shader_instructions_pass(shader,
intel_nir_clamp_image_1d_2d_array_sizes_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
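/* A scalar illustration (hypothetical helper, not emitted by the pass) of the
 * select built above for a 2D array size: NULL surfaces report a 0x0x0 size
 * and must keep reporting 0, while real array surfaces get MAX(array_size, 1).
 */
static inline int
example_clamped_array_size(int width, int height, int array_size)
{
   int or_components = width | height;  /* all sizes but the last, ORed */
   int null_or_size[2] = { 0, array_size > 1 ? array_size : 1 };
   /* Select element 0 for NULL surfaces (all sizes 0), element 1 otherwise. */
   return null_or_size[or_components ? 1 : 0];
}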
@@ -1,108 +0,0 @@
/*
* Copyright (c) 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/*
* Limit per-vertex input accesses. This is useful for the tessellation stages.
* On Gfx12.5+, out-of-bounds accesses generate hangs.
*
* This pass operates on derefs, it must be called before shader inputs are
* lowered.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
#include "compiler/nir/nir_deref.h"
static bool
clamp_per_vertex_loads_instr(nir_builder *b, nir_intrinsic_instr *intrin,
void *cb_data)
{
if (intrin->intrinsic != nir_intrinsic_load_deref)
return false;
nir_deref_instr *deref = nir_instr_as_deref(intrin->src[0].ssa->parent_instr);
nir_variable *var = nir_deref_instr_get_variable(deref);
if (var == NULL || (var->data.mode & nir_var_shader_in) == 0)
return false;
nir_deref_path path;
nir_deref_path_init(&path, deref, cb_data);
bool progress = false;
for (unsigned i = 0; path.path[i]; i++) {
if (path.path[i]->deref_type != nir_deref_type_array)
continue;
b->cursor = nir_before_instr(&path.path[i]->instr);
nir_src_rewrite(&path.path[i]->arr.index,
nir_umin(b, path.path[i]->arr.index.ssa, nir_iadd_imm(b, nir_load_patch_vertices_in(b), -1)));
progress = true;
break;
}
nir_deref_path_finish(&path);
return progress;
}
bool
intel_nir_clamp_per_vertex_loads(nir_shader *shader)
{
void *mem_ctx = ralloc_context(NULL);
bool ret = nir_shader_intrinsics_pass(shader, clamp_per_vertex_loads_instr,
nir_metadata_block_index |
nir_metadata_dominance,
mem_ctx);
ralloc_free(mem_ctx);
return ret;
}
static bool
lower_patch_vertices_instr(nir_builder *b, nir_intrinsic_instr *intrin,
void *cb_data)
{
if (intrin->intrinsic != nir_intrinsic_load_patch_vertices_in)
return false;
unsigned *input_vertices = cb_data;
b->cursor = nir_before_instr(&intrin->instr);
nir_def_rewrite_uses(&intrin->def, nir_imm_int(b, *input_vertices));
return true;
}
bool
intel_nir_lower_patch_vertices_in(nir_shader *shader, unsigned input_vertices)
{
return nir_shader_intrinsics_pass(shader, lower_patch_vertices_instr,
nir_metadata_block_index |
nir_metadata_dominance,
&input_vertices);
}
@@ -1,115 +0,0 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
static void
split_conversion(nir_builder *b, nir_alu_instr *alu, nir_alu_type src_type,
nir_alu_type tmp_type, nir_alu_type dst_type)
{
b->cursor = nir_before_instr(&alu->instr);
nir_def *src = nir_ssa_for_alu_src(b, alu, 0);
nir_def *tmp = nir_type_convert(b, src, src_type, tmp_type, nir_rounding_mode_undef);
nir_def *res = nir_type_convert(b, tmp, tmp_type, dst_type, nir_rounding_mode_undef);
nir_def_rewrite_uses(&alu->def, res);
nir_instr_remove(&alu->instr);
}
static bool
lower_alu_instr(nir_builder *b, nir_alu_instr *alu)
{
unsigned src_bit_size = nir_src_bit_size(alu->src[0].src);
nir_alu_type src_type = nir_op_infos[alu->op].input_types[0];
nir_alu_type src_full_type = (nir_alu_type) (src_type | src_bit_size);
unsigned dst_bit_size = alu->def.bit_size;
nir_alu_type dst_full_type = nir_op_infos[alu->op].output_type;
nir_alu_type dst_type = nir_alu_type_get_base_type(dst_full_type);
/* BDW PRM, vol02, Command Reference Instructions, mov - MOVE:
*
* "There is no direct conversion from HF to DF or DF to HF.
* Use two instructions and F (Float) as an intermediate type.
*
* There is no direct conversion from HF to Q/UQ or Q/UQ to HF.
* Use two instructions and F (Float) or a word integer type
* or a DWord integer type as an intermediate type."
*
* It is important that the intermediate conversion happens through a
* 32-bit float type so we don't lose range when we convert from
* a 64-bit integer.
*/
unsigned int64_types = nir_type_int64 | nir_type_uint64;
if ((src_full_type == nir_type_float16 && (dst_full_type & int64_types)) ||
((src_full_type & int64_types) && dst_full_type == nir_type_float16)) {
split_conversion(b, alu, src_type, nir_type_float | 32,
dst_type | dst_bit_size);
return true;
}
/* SKL PRM, vol 02a, Command Reference: Instructions, Move:
*
* "There is no direct conversion from B/UB to DF or DF to B/UB. Use
* two instructions and a word or DWord intermediate type."
*
* "There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
* Use two instructions and a word or DWord intermediate integer
* type."
*
* It is important that we use a 32-bit integer matching the sign of the
* destination as the intermediate type so we avoid any chance of rtne
* rounding happening before the conversion to integer (which is expected
* to round towards zero) in double to byte conversions.
*/
if ((src_bit_size == 8 && dst_bit_size == 64) ||
(src_bit_size == 64 && dst_bit_size == 8)) {
split_conversion(b, alu, src_type, dst_type | 32, dst_type | dst_bit_size);
return true;
}
return false;
}
static bool
lower_instr(nir_builder *b, nir_instr *instr, UNUSED void *cb_data)
{
if (instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *alu = nir_instr_as_alu(instr);
if (!nir_op_infos[alu->op].is_conversion)
return false;
return lower_alu_instr(b, alu);
}
bool
intel_nir_lower_conversions(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, lower_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
@@ -1,307 +0,0 @@
/*
* Copyright (c) 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
struct lower_intrinsics_state {
nir_shader *nir;
nir_function_impl *impl;
bool progress;
nir_builder builder;
};
static void
compute_local_index_id(nir_builder *b,
nir_shader *nir,
nir_def **local_index,
nir_def **local_id)
{
nir_def *subgroup_id = nir_load_subgroup_id(b);
nir_def *thread_local_id =
nir_imul(b, subgroup_id, nir_load_simd_width_intel(b));
nir_def *channel = nir_load_subgroup_invocation(b);
nir_def *linear = nir_iadd(b, channel, thread_local_id);
nir_def *size_x;
nir_def *size_y;
if (nir->info.workgroup_size_variable) {
nir_def *size_xyz = nir_load_workgroup_size(b);
size_x = nir_channel(b, size_xyz, 0);
size_y = nir_channel(b, size_xyz, 1);
} else {
size_x = nir_imm_int(b, nir->info.workgroup_size[0]);
size_y = nir_imm_int(b, nir->info.workgroup_size[1]);
}
nir_def *size_xy = nir_imul(b, size_x, size_y);
/* The local invocation index and ID must respect the following
*
* gl_LocalInvocationID.x =
* gl_LocalInvocationIndex % gl_WorkGroupSize.x;
* gl_LocalInvocationID.y =
* (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
* gl_WorkGroupSize.y;
* gl_LocalInvocationID.z =
* (gl_LocalInvocationIndex /
* (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
* gl_WorkGroupSize.z;
*
* However, the final % gl_WorkGroupSize.z does nothing unless we
* accidentally end up with a gl_LocalInvocationIndex that is too
* large so it can safely be omitted.
*/
nir_def *id_x, *id_y, *id_z;
switch (nir->info.cs.derivative_group) {
case DERIVATIVE_GROUP_NONE:
if (nir->info.num_images == 0 &&
nir->info.num_textures == 0) {
/* X-major lid order. Optimal for linear accesses only,
* which are usually buffers. X,Y ordering will look like:
* (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
*/
id_x = nir_umod(b, linear, size_x);
id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
*local_index = linear;
} else if (!nir->info.workgroup_size_variable &&
nir->info.workgroup_size[1] % 4 == 0) {
/* 1x4 block X-major lid order. Same as X-major except increments in
* blocks of width=1 height=4. Always optimal for tileY and usually
* optimal for linear accesses.
* x = (linear / 4) % size_x
* y = ((linear % 4) + (linear / 4 / size_x) * 4) % size_y
* X,Y ordering will look like: (0,0) (0,1) (0,2) (0,3) (1,0) (1,1)
* (1,2) (1,3) (2,0) ... (size_x-1,3) (0,4) (0,5) (0,6) (0,7) (1,4) ...
*/
const unsigned height = 4;
nir_def *block = nir_udiv_imm(b, linear, height);
id_x = nir_umod(b, block, size_x);
id_y = nir_umod(b,
nir_iadd(b,
nir_umod_imm(b, linear, height),
nir_imul_imm(b,
nir_udiv(b, block, size_x),
height)),
size_y);
} else {
/* Y-major lid order. Optimal for tileY accesses only,
* which are usually images. X,Y ordering will look like:
* (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
*/
id_y = nir_umod(b, linear, size_y);
id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
}
id_z = nir_udiv(b, linear, size_xy);
*local_id = nir_vec3(b, id_x, id_y, id_z);
if (!*local_index) {
*local_index = nir_iadd(b, nir_iadd(b, id_x,
nir_imul(b, id_y, size_x)),
nir_imul(b, id_z, size_xy));
}
break;
case DERIVATIVE_GROUP_LINEAR:
/* For linear, just set the local invocation index linearly,
* and calculate local invocation ID from that.
*/
id_x = nir_umod(b, linear, size_x);
id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
id_z = nir_udiv(b, linear, size_xy);
*local_id = nir_vec3(b, id_x, id_y, id_z);
*local_index = linear;
break;
case DERIVATIVE_GROUP_QUADS: {
/* For quads, first we figure out the 2x2 grid the invocation
* belongs to -- treating extra Z layers as just more rows.
* Then map that into local invocation ID (trivial) and local
* invocation index. Skipping Z simplifies the index calculation.
*/
nir_def *one = nir_imm_int(b, 1);
nir_def *double_size_x = nir_ishl(b, size_x, one);
/* ID within a pair of rows, where each group of 4 is 2x2 quad. */
nir_def *row_pair_id = nir_umod(b, linear, double_size_x);
nir_def *y_row_pairs = nir_udiv(b, linear, double_size_x);
nir_def *x =
nir_ior(b,
nir_iand(b, row_pair_id, one),
nir_iand(b, nir_ishr(b, row_pair_id, one),
nir_imm_int(b, 0xfffffffe)));
nir_def *y =
nir_ior(b,
nir_ishl(b, y_row_pairs, one),
nir_iand(b, nir_ishr(b, row_pair_id, one), one));
*local_id = nir_vec3(b, x,
nir_umod(b, y, size_y),
nir_udiv(b, y, size_y));
*local_index = nir_iadd(b, x, nir_imul(b, y, size_x));
break;
}
default:
unreachable("invalid derivative group");
}
}
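/* A scalar illustration (hypothetical, not emitted by the pass) of the plain
 * X-major mapping used above for DERIVATIVE_GROUP_NONE without images or
 * textures: the linear index walks X fastest, then Y, then Z.
 */
static inline void
example_xmajor_local_id(unsigned linear, unsigned size_x, unsigned size_y,
                        unsigned id[3])
{
   id[0] = linear % size_x;
   id[1] = (linear / size_x) % size_y;
   id[2] = linear / (size_x * size_y);
}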
static bool
lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
nir_block *block)
{
bool progress = false;
nir_builder *b = &state->builder;
nir_shader *nir = state->nir;
/* Reuse calculated values inside the block. */
nir_def *local_index = NULL;
nir_def *local_id = NULL;
nir_foreach_instr_safe(instr, block) {
if (instr->type != nir_instr_type_intrinsic)
continue;
nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
b->cursor = nir_after_instr(&intrinsic->instr);
nir_def *sysval;
switch (intrinsic->intrinsic) {
case nir_intrinsic_load_local_invocation_index:
case nir_intrinsic_load_local_invocation_id: {
if (!local_index && !nir->info.workgroup_size_variable) {
const uint16_t *ws = nir->info.workgroup_size;
if (ws[0] * ws[1] * ws[2] == 1) {
nir_def *zero = nir_imm_int(b, 0);
local_index = zero;
local_id = nir_replicate(b, zero, 3);
}
}
if (!local_index) {
if (nir->info.stage == MESA_SHADER_TASK ||
nir->info.stage == MESA_SHADER_MESH) {
/* Will be lowered by nir_emit_task_mesh_intrinsic() using
* information from the payload.
*/
continue;
}
/* First time we are using those, so let's calculate them. */
assert(!local_id);
compute_local_index_id(b, nir, &local_index, &local_id);
}
assert(local_id);
assert(local_index);
if (intrinsic->intrinsic == nir_intrinsic_load_local_invocation_id)
sysval = local_id;
else
sysval = local_index;
break;
}
case nir_intrinsic_load_num_subgroups: {
nir_def *size;
if (state->nir->info.workgroup_size_variable) {
nir_def *size_xyz = nir_load_workgroup_size(b);
nir_def *size_x = nir_channel(b, size_xyz, 0);
nir_def *size_y = nir_channel(b, size_xyz, 1);
nir_def *size_z = nir_channel(b, size_xyz, 2);
size = nir_imul(b, nir_imul(b, size_x, size_y), size_z);
} else {
size = nir_imm_int(b, nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2]);
}
/* Calculate the equivalent of DIV_ROUND_UP. */
nir_def *simd_width = nir_load_simd_width_intel(b);
sysval =
nir_udiv(b, nir_iadd_imm(b, nir_iadd(b, size, simd_width), -1),
simd_width);
break;
}
default:
continue;
}
if (intrinsic->def.bit_size == 64)
sysval = nir_u2u64(b, sysval);
nir_def_rewrite_uses(&intrinsic->def, sysval);
nir_instr_remove(&intrinsic->instr);
state->progress = true;
}
return progress;
}
static void
lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
{
state->builder = nir_builder_create(state->impl);
nir_foreach_block(block, state->impl) {
lower_cs_intrinsics_convert_block(state, block);
}
nir_metadata_preserve(state->impl,
nir_metadata_block_index | nir_metadata_dominance);
}
bool
intel_nir_lower_cs_intrinsics(nir_shader *nir)
{
assert(gl_shader_stage_uses_workgroup(nir->info.stage));
struct lower_intrinsics_state state = {
.nir = nir,
};
/* Constraints from NV_compute_shader_derivatives. */
if (gl_shader_stage_is_compute(nir->info.stage) &&
!nir->info.workgroup_size_variable) {
if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_QUADS) {
assert(nir->info.workgroup_size[0] % 2 == 0);
assert(nir->info.workgroup_size[1] % 2 == 0);
} else if (nir->info.cs.derivative_group == DERIVATIVE_GROUP_LINEAR) {
ASSERTED unsigned workgroup_size =
nir->info.workgroup_size[0] *
nir->info.workgroup_size[1] *
nir->info.workgroup_size[2];
assert(workgroup_size % 4 == 0);
}
}
nir_foreach_function_impl(impl, nir) {
state.impl = impl;
lower_cs_intrinsics_convert_impl(&state);
}
return state.progress;
}
@@ -1,80 +0,0 @@
/*
* Copyright © 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/*
* Lower non-uniform at-sample messages to the interpolator.
*
* This is pretty much identical to what nir_lower_non_uniform_access() does.
* We do it here because otherwise GCM would undo this optimization. Also we
* can assume divergence analysis here.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
static bool
intel_nir_lower_non_uniform_barycentric_at_sample_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
if (instr->type != nir_instr_type_intrinsic)
return false;
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
if (intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
return false;
if (nir_src_is_always_uniform(intrin->src[0]) ||
!nir_src_is_divergent(intrin->src[0]))
return false;
nir_def *sample_id = intrin->src[0].ssa;
b->cursor = nir_instr_remove(&intrin->instr);
nir_push_loop(b);
{
nir_def *first_sample_id = nir_read_first_invocation(b, sample_id);
nir_push_if(b, nir_ieq(b, sample_id, first_sample_id));
{
nir_builder_instr_insert(b, &intrin->instr);
nir_src_rewrite(&intrin->src[0], first_sample_id);
nir_jump(b, nir_jump_break);
}
}
return true;
}
bool
intel_nir_lower_non_uniform_barycentric_at_sample(nir_shader *nir)
{
return nir_shader_instructions_pass(
nir,
intel_nir_lower_non_uniform_barycentric_at_sample_instr,
nir_metadata_none,
NULL);
}
@@ -1,319 +0,0 @@
/*
* Copyright © 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "util/u_dynarray.h"
#include "intel_nir.h"
static bool
nir_instr_is_resource_intel(nir_instr *instr)
{
return instr->type == nir_instr_type_intrinsic &&
nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_resource_intel;
}
static bool
add_src_instr(nir_src *src, void *state)
{
struct util_dynarray *inst_array = state;
util_dynarray_foreach(inst_array, nir_instr *, instr_ptr) {
if (*instr_ptr == src->ssa->parent_instr)
return true;
}
util_dynarray_append(inst_array, nir_instr *, src->ssa->parent_instr);
return true;
}
static nir_intrinsic_instr *
find_resource_intel(struct util_dynarray *inst_array,
nir_def *def)
{
/* If resource_intel is already directly in front of the instruction, there
* is nothing to do.
*/
if (nir_instr_is_resource_intel(def->parent_instr))
return NULL;
util_dynarray_append(inst_array, nir_instr *, def->parent_instr);
unsigned idx = 0, scan_index = 0;
while (idx < util_dynarray_num_elements(inst_array, nir_instr *)) {
nir_instr *instr = *util_dynarray_element(inst_array, nir_instr *, idx++);
for (; scan_index < util_dynarray_num_elements(inst_array, nir_instr *); scan_index++) {
nir_instr *scan_instr = *util_dynarray_element(inst_array, nir_instr *, scan_index);
if (nir_instr_is_resource_intel(scan_instr))
return nir_instr_as_intrinsic(scan_instr);
}
nir_foreach_src(instr, add_src_instr, inst_array);
}
return NULL;
}
static bool
intel_nir_lower_non_uniform_intrinsic(nir_builder *b,
nir_intrinsic_instr *intrin,
struct util_dynarray *inst_array)
{
unsigned source;
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_get_ssbo_size:
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
case nir_intrinsic_load_ssbo_block_intel:
case nir_intrinsic_store_ssbo_block_intel:
case nir_intrinsic_load_ubo_uniform_block_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_image_load_raw_intel:
case nir_intrinsic_image_store_raw_intel:
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_bindless_image_atomic:
case nir_intrinsic_bindless_image_atomic_swap:
case nir_intrinsic_image_size:
case nir_intrinsic_bindless_image_size:
source = 0;
break;
case nir_intrinsic_store_ssbo:
source = 1;
break;
default:
return false;
}
b->cursor = nir_before_instr(&intrin->instr);
util_dynarray_clear(inst_array);
nir_intrinsic_instr *old_resource_intel =
find_resource_intel(inst_array, intrin->src[source].ssa);
if (old_resource_intel == NULL)
return false;
nir_instr *new_instr =
nir_instr_clone(b->shader, &old_resource_intel->instr);
nir_instr_insert(b->cursor, new_instr);
nir_intrinsic_instr *new_resource_intel =
nir_instr_as_intrinsic(new_instr);
nir_src_rewrite(&new_resource_intel->src[1], intrin->src[source].ssa);
nir_src_rewrite(&intrin->src[source], &new_resource_intel->def);
return true;
}
static bool
intel_nir_lower_non_uniform_tex(nir_builder *b,
nir_tex_instr *tex,
struct util_dynarray *inst_array)
{
b->cursor = nir_before_instr(&tex->instr);
bool progress = false;
for (unsigned s = 0; s < tex->num_srcs; s++) {
if (tex->src[s].src_type != nir_tex_src_texture_handle &&
tex->src[s].src_type != nir_tex_src_sampler_handle)
continue;
util_dynarray_clear(inst_array);
nir_intrinsic_instr *old_resource_intel =
find_resource_intel(inst_array, tex->src[s].src.ssa);
if (old_resource_intel == NULL)
continue;
nir_instr *new_instr =
nir_instr_clone(b->shader, &old_resource_intel->instr);
nir_instr_insert(b->cursor, new_instr);
nir_intrinsic_instr *new_resource_intel =
nir_instr_as_intrinsic(new_instr);
nir_src_rewrite(&new_resource_intel->src[1], tex->src[s].src.ssa);
nir_src_rewrite(&tex->src[s].src, &new_resource_intel->def);
progress = true;
}
return progress;
}
static bool
intel_nir_lower_non_uniform_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
struct util_dynarray *inst_array = cb_data;
switch (instr->type) {
case nir_instr_type_intrinsic:
return intel_nir_lower_non_uniform_intrinsic(b,
nir_instr_as_intrinsic(instr),
inst_array);
case nir_instr_type_tex:
return intel_nir_lower_non_uniform_tex(b,
nir_instr_as_tex(instr),
inst_array);
default:
return false;
}
}
/** This pass rematerializes resource_intel intrinsics closer to their use.
*
* For example, it will turn this:
* ssa_1 = iadd ...
* ssa_2 = resource_intel ..., ssa_1, ...
* ssa_3 = read_first_invocation ssa_2
* ssa_4 = load_ssbo ssa_3, ...
*
* into this:
* ssa_1 = iadd ...
* ssa_3 = read_first_invocation ssa_1
* ssa_5 = resource_intel ..., ssa_3, ...
* ssa_4 = load_ssbo ssa_5, ...
*
* The goal is to have the resource_intel immediately before its use so that
* the backend compiler can know how the load_ssbo should be compiled (binding
* table or bindless access, etc...).
*/
bool
intel_nir_lower_non_uniform_resource_intel(nir_shader *shader)
{
void *mem_ctx = ralloc_context(NULL);
struct util_dynarray inst_array;
util_dynarray_init(&inst_array, mem_ctx);
bool ret = nir_shader_instructions_pass(shader,
intel_nir_lower_non_uniform_instr,
nir_metadata_block_index |
nir_metadata_dominance,
&inst_array);
ralloc_free(mem_ctx);
return ret;
}
static bool
skip_resource_intel_cleanup(nir_instr *instr)
{
switch (instr->type) {
case nir_instr_type_tex:
return true;
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin =
nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_load_ubo:
case nir_intrinsic_load_ssbo:
case nir_intrinsic_store_ssbo:
case nir_intrinsic_get_ssbo_size:
case nir_intrinsic_ssbo_atomic:
case nir_intrinsic_ssbo_atomic_swap:
case nir_intrinsic_load_ssbo_block_intel:
case nir_intrinsic_store_ssbo_block_intel:
case nir_intrinsic_load_ssbo_uniform_block_intel:
case nir_intrinsic_image_load_raw_intel:
case nir_intrinsic_image_store_raw_intel:
case nir_intrinsic_image_load:
case nir_intrinsic_image_store:
case nir_intrinsic_image_atomic:
case nir_intrinsic_image_atomic_swap:
case nir_intrinsic_bindless_image_load:
case nir_intrinsic_bindless_image_store:
case nir_intrinsic_bindless_image_atomic:
case nir_intrinsic_bindless_image_atomic_swap:
case nir_intrinsic_image_size:
case nir_intrinsic_bindless_image_size:
return true;
default:
return false;
}
}
default:
return false;
}
}
static bool
intel_nir_cleanup_resource_intel_instr(nir_builder *b,
nir_intrinsic_instr *intrin,
void *cb_data)
{
if (intrin->intrinsic != nir_intrinsic_resource_intel)
return false;
bool progress = false;
nir_foreach_use_safe(src, &intrin->def) {
if (!nir_src_is_if(src) && skip_resource_intel_cleanup(nir_src_parent_instr(src)))
continue;
progress = true;
nir_src_rewrite(src, intrin->src[1].ssa);
}
return progress;
}
/** This pass removes unnecessary resource_intel intrinsics
*
* This pass must not be run before intel_nir_lower_non_uniform_resource_intel.
*/
bool
intel_nir_cleanup_resource_intel(nir_shader *shader)
{
void *mem_ctx = ralloc_context(NULL);
bool ret = nir_shader_intrinsics_pass(shader,
intel_nir_cleanup_resource_intel_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
ralloc_free(mem_ctx);
return ret;
}
@@ -1,108 +0,0 @@
/*
* Copyright (c) 2021 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/*
* Lower the shading rate output from the bit field format described in the
* SPIRV spec :
*
* bit | name | description
* 0 | Vertical2Pixels | Fragment invocation covers 2 pixels vertically
* 1 | Vertical4Pixels | Fragment invocation covers 4 pixels vertically
* 2 | Horizontal2Pixels | Fragment invocation covers 2 pixels horizontally
* 3 | Horizontal4Pixels | Fragment invocation covers 4 pixels horizontally
*
* into a single dword composed of 2 fp16 to be stored in the dword 0 of the
* VUE header.
*
* When no horizontal/vertical bits are set, the pixel size in that
* dimension is assumed to be 1.
*
* According to the specification, the shading rate output can be read &
* written. A read after a write should report a different value if the
* implementation decides on different primitive shading rate for some reason.
* This is never the case in our implementation.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
static bool
lower_shading_rate_output_instr(nir_builder *b, nir_intrinsic_instr *intrin,
UNUSED void *_state)
{
nir_intrinsic_op op = intrin->intrinsic;
if (op != nir_intrinsic_load_output &&
op != nir_intrinsic_store_output &&
op != nir_intrinsic_load_per_primitive_output &&
op != nir_intrinsic_store_per_primitive_output)
return false;
struct nir_io_semantics io = nir_intrinsic_io_semantics(intrin);
if (io.location != VARYING_SLOT_PRIMITIVE_SHADING_RATE)
return false;
bool is_store = op == nir_intrinsic_store_output ||
op == nir_intrinsic_store_per_primitive_output;
b->cursor = is_store ? nir_before_instr(&intrin->instr) : nir_after_instr(&intrin->instr);
if (is_store) {
nir_def *bit_field = intrin->src[0].ssa;
nir_def *fp16_x =
nir_i2f16(b,
nir_ishl(b, nir_imm_int(b, 1),
nir_ishr_imm(b, bit_field, 2)));
nir_def *fp16_y =
nir_i2f16(b,
nir_ishl(b, nir_imm_int(b, 1),
nir_iand_imm(b, bit_field, 0x3)));
nir_def *packed_fp16_xy = nir_pack_32_2x16_split(b, fp16_x, fp16_y);
nir_src_rewrite(&intrin->src[0], packed_fp16_xy);
} else {
nir_def *packed_fp16_xy = &intrin->def;
nir_def *u32_x =
nir_i2i32(b, nir_unpack_32_2x16_split_x(b, packed_fp16_xy));
nir_def *u32_y =
nir_i2i32(b, nir_unpack_32_2x16_split_y(b, packed_fp16_xy));
nir_def *bit_field =
nir_ior(b, nir_ishl_imm(b, nir_ushr_imm(b, u32_x, 1), 2),
nir_ushr_imm(b, u32_y, 1));
nir_def_rewrite_uses_after(packed_fp16_xy, bit_field,
bit_field->parent_instr);
}
return true;
}
bool
intel_nir_lower_shading_rate_output(nir_shader *nir)
{
return nir_shader_intrinsics_pass(nir, lower_shading_rate_output_instr,
nir_metadata_block_index |
nir_metadata_dominance, NULL);
}
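/* A scalar illustration (hypothetical helper, not emitted by the pass) of the
 * store path above: the SPIR-V bit field is turned into a fragment width and
 * height in pixels, which the pass then encodes as two fp16 values packed in
 * one dword (the fp16 conversion is omitted here).
 */
static inline void
example_shading_rate_to_pixels(unsigned bit_field,
                               unsigned *width, unsigned *height)
{
   *width  = 1u << (bit_field >> 2);  /* Horizontal2Pixels/Horizontal4Pixels bits */
   *height = 1u << (bit_field & 0x3); /* Vertical2Pixels/Vertical4Pixels bits */
}
/* e.g. Horizontal4Pixels | Vertical2Pixels (0x9) gives a 4x2 pixel fragment. */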
@@ -1,247 +0,0 @@
/*
* Copyright (c) 2023 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
/*
* This pass lowers a few of the sparse instructions to something HW can
* handle.
*
* The image_*_sparse_load intrinsics are lowered into 2 instructions, a
* regular image_*_load intrinsic and a sparse texture txf operation and
* reconstructs the sparse vector of the original intrinsic using the 2 new
* values. We need to do this because our backend implements image load/store
* using the dataport and the dataport unit doesn't provide residency
* information. We need to use the sampler for residency.
*
* The is_sparse_texels_resident intrinsic is lowered to a bit checking
* operation as the data reported by the sampler is a single bit per lane in
* the first component.
*
* The tex_* instructions with a compare value need to be lower into 2
* instructions due to a HW limitation :
*
* SKL PRMs, Volume 7: 3D-Media-GPGPU, Messages, SIMD Payloads :
*
* "The Pixel Null Mask field, when enabled via the Pixel Null Mask Enable
* will be incorect for sample_c when applied to a surface with 64-bit per
* texel format such as R16G16BA16_UNORM. Pixel Null mask Enable may
* incorrectly report pixels as referencing a Null surface."
*/
static void
lower_is_sparse_texels_resident(nir_builder *b, nir_intrinsic_instr *intrin)
{
b->cursor = nir_instr_remove(&intrin->instr);
nir_def_rewrite_uses(
&intrin->def,
nir_i2b(b, nir_iand(b, intrin->src[0].ssa,
nir_ishl(b, nir_imm_int(b, 1),
nir_load_subgroup_invocation(b)))));
}
static void
lower_sparse_residency_code_and(nir_builder *b, nir_intrinsic_instr *intrin)
{
b->cursor = nir_instr_remove(&intrin->instr);
nir_def_rewrite_uses(
&intrin->def,
nir_iand(b, intrin->src[0].ssa, intrin->src[1].ssa));
}
static void
lower_sparse_image_load(nir_builder *b, nir_intrinsic_instr *intrin)
{
b->cursor = nir_instr_remove(&intrin->instr);
nir_def *img_load;
nir_intrinsic_instr *new_intrin;
if (intrin->intrinsic == nir_intrinsic_image_sparse_load) {
img_load = nir_image_load(b,
intrin->num_components - 1,
intrin->def.bit_size,
intrin->src[0].ssa,
intrin->src[1].ssa,
intrin->src[2].ssa,
intrin->src[3].ssa);
new_intrin = nir_instr_as_intrinsic(img_load->parent_instr);
nir_intrinsic_set_range_base(new_intrin, nir_intrinsic_range_base(intrin));
} else {
img_load = nir_bindless_image_load(b,
intrin->num_components - 1,
intrin->def.bit_size,
intrin->src[0].ssa,
intrin->src[1].ssa,
intrin->src[2].ssa,
intrin->src[3].ssa);
new_intrin = nir_instr_as_intrinsic(img_load->parent_instr);
}
nir_intrinsic_set_image_array(new_intrin, nir_intrinsic_image_array(intrin));
nir_intrinsic_set_image_dim(new_intrin, nir_intrinsic_image_dim(intrin));
nir_intrinsic_set_format(new_intrin, nir_intrinsic_format(intrin));
nir_intrinsic_set_access(new_intrin, nir_intrinsic_access(intrin));
nir_intrinsic_set_dest_type(new_intrin, nir_intrinsic_dest_type(intrin));
nir_def *dests[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < intrin->num_components - 1; i++) {
dests[i] = nir_channel(b, img_load, i);
}
/* Use texture instruction to compute residency */
nir_tex_instr *tex = nir_tex_instr_create(b->shader, 3);
tex->op = nir_texop_txf;
/* We don't care about the dest type since we're not using any of that
* data.
*/
tex->dest_type = nir_type_float32;
tex->is_array = nir_intrinsic_image_array(intrin);
tex->is_shadow = false;
tex->sampler_index = 0;
tex->is_sparse = true;
tex->src[0].src_type = intrin->intrinsic == nir_intrinsic_image_sparse_load ?
nir_tex_src_texture_offset :
nir_tex_src_texture_handle;
tex->src[0].src = nir_src_for_ssa(intrin->src[0].ssa);
tex->coord_components = nir_image_intrinsic_coord_components(intrin);
nir_def *coord;
if (nir_intrinsic_image_dim(intrin) == GLSL_SAMPLER_DIM_CUBE &&
nir_intrinsic_image_array(intrin)) {
tex->coord_components++;
nir_def *img_layer = nir_channel(b, intrin->src[1].ssa, 2);
nir_def *tex_slice = nir_idiv(b, img_layer, nir_imm_int(b, 6));
nir_def *tex_face =
nir_iadd(b, img_layer, nir_ineg(b, nir_imul_imm(b, tex_slice, 6)));
nir_def *comps[4] = {
nir_channel(b, intrin->src[1].ssa, 0),
nir_channel(b, intrin->src[1].ssa, 1),
tex_face,
tex_slice
};
coord = nir_vec(b, comps, 4);
} else {
coord = nir_channels(b, intrin->src[1].ssa,
nir_component_mask(tex->coord_components));
}
tex->src[1].src_type = nir_tex_src_coord;
tex->src[1].src = nir_src_for_ssa(coord);
tex->src[2].src_type = nir_tex_src_lod;
tex->src[2].src = nir_src_for_ssa(nir_imm_int(b, 0));
nir_def_init(&tex->instr, &tex->def, 5,
intrin->def.bit_size);
nir_builder_instr_insert(b, &tex->instr);
dests[intrin->num_components - 1] = nir_channel(b, &tex->def, 4);
nir_def_rewrite_uses(
&intrin->def,
nir_vec(b, dests, intrin->num_components));
}
static void
lower_tex_compare(nir_builder *b, nir_tex_instr *tex, int compare_idx)
{
b->cursor = nir_after_instr(&tex->instr);
/* Clone the original instruction */
nir_tex_instr *sparse_tex = nir_instr_as_tex(nir_instr_clone(b->shader, &tex->instr));
nir_def_init(&sparse_tex->instr, &sparse_tex->def,
tex->def.num_components, tex->def.bit_size);
nir_builder_instr_insert(b, &sparse_tex->instr);
/* Drop the compare source on the cloned instruction */
nir_tex_instr_remove_src(sparse_tex, compare_idx);
/* Drop the residency query on the original tex instruction */
tex->is_sparse = false;
tex->def.num_components = tex->def.num_components - 1;
nir_def *new_comps[NIR_MAX_VEC_COMPONENTS];
for (unsigned i = 0; i < tex->def.num_components; i++)
new_comps[i] = nir_channel(b, &tex->def, i);
new_comps[tex->def.num_components] =
nir_channel(b, &sparse_tex->def, tex->def.num_components);
nir_def *new_vec = nir_vec(b, new_comps, sparse_tex->def.num_components);
nir_def_rewrite_uses_after(&tex->def, new_vec, new_vec->parent_instr);
}
static bool
lower_sparse_intrinsics(nir_builder *b, nir_instr *instr, void *cb_data)
{
switch (instr->type) {
case nir_instr_type_intrinsic: {
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
switch (intrin->intrinsic) {
case nir_intrinsic_image_sparse_load:
case nir_intrinsic_bindless_image_sparse_load:
lower_sparse_image_load(b, intrin);
return true;
case nir_intrinsic_is_sparse_texels_resident:
lower_is_sparse_texels_resident(b, intrin);
return true;
case nir_intrinsic_sparse_residency_code_and:
lower_sparse_residency_code_and(b, intrin);
return true;
default:
return false;
}
}
case nir_instr_type_tex: {
nir_tex_instr *tex = nir_instr_as_tex(instr);
int comp_idx = nir_tex_instr_src_index(tex, nir_tex_src_comparator);
if (comp_idx != -1 && tex->is_sparse) {
lower_tex_compare(b, tex, comp_idx);
return true;
}
return false;
}
default:
return false;
}
}
bool
intel_nir_lower_sparse_intrinsics(nir_shader *nir)
{
return nir_shader_instructions_pass(nir, lower_sparse_intrinsics,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
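/* A scalar illustration (hypothetical, not emitted by the pass) of the
 * is_sparse_texels_resident lowering above: the residency code returned by
 * the sampler carries one bit per lane in its first component, so a lane is
 * resident if its own bit is set.
 */
#include <stdbool.h>
#include <stdint.h>
static inline bool
example_texels_resident(uint32_t residency_code, unsigned lane)
{
   return (residency_code & (1u << lane)) != 0;
}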
@@ -1,133 +0,0 @@
/*
* Copyright © 2024 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "intel_nir.h"
/**
* Pack either the explicit LOD or LOD bias and the array index together.
*/
static bool
pack_lod_and_array_index(nir_builder *b, nir_tex_instr *tex)
{
/* If 32-bit texture coordinates are used, pack either the explicit LOD or
* LOD bias and the array index into a single (32-bit) value.
*/
int lod_index = nir_tex_instr_src_index(tex, nir_tex_src_lod);
if (lod_index < 0) {
lod_index = nir_tex_instr_src_index(tex, nir_tex_src_bias);
/* The explicit LOD or LOD bias may not be found if this lowering has
* already occurred. The explicit LOD may also not be found in some
* cases where it is zero.
*/
if (lod_index < 0)
return false;
}
assert(nir_tex_instr_src_type(tex, lod_index) == nir_type_float);
/* Also do not perform this packing if the explicit LOD is zero. */
if (tex->op == nir_texop_txl &&
nir_src_is_const(tex->src[lod_index].src) &&
nir_src_as_float(tex->src[lod_index].src) == 0.0) {
return false;
}
const int coord_index = nir_tex_instr_src_index(tex, nir_tex_src_coord);
assert(coord_index >= 0);
nir_def *lod = tex->src[lod_index].src.ssa;
nir_def *coord = tex->src[coord_index].src.ssa;
assert(nir_tex_instr_src_type(tex, coord_index) == nir_type_float);
if (coord->bit_size < 32)
return false;
b->cursor = nir_before_instr(&tex->instr);
/* First, combine the two values. The packing format is a little weird.
* The explicit LOD / LOD bias is stored as float, as normal. However, the
* array index is converted to an integer and smashed into the low 9 bits.
*/
const unsigned array_index = tex->coord_components - 1;
nir_def *clamped_ai =
nir_umin(b,
nir_f2u32(b, nir_fround_even(b, nir_channel(b, coord,
array_index))),
nir_imm_int(b, 511));
nir_def *lod_ai = nir_ior(b, nir_iand_imm(b, lod, 0xfffffe00), clamped_ai);
/* Second, replace the coordinate with a new value that has one fewer
* component (i.e., drop the array index).
*/
nir_def *reduced_coord = nir_trim_vector(b, coord, 2);
tex->coord_components--;
/* Finally, remove the old sources and add the new. */
nir_src_rewrite(&tex->src[coord_index].src, reduced_coord);
nir_tex_instr_remove_src(tex, lod_index);
nir_tex_instr_add_src(tex, nir_tex_src_backend1, lod_ai);
return true;
}
static bool
intel_nir_lower_texture_instr(nir_builder *b, nir_instr *instr, void *cb_data)
{
if (instr->type != nir_instr_type_tex)
return false;
const struct intel_nir_lower_texture_opts *opts = cb_data;
nir_tex_instr *tex = nir_instr_as_tex(instr);
switch (tex->op) {
case nir_texop_txl:
case nir_texop_txb:
if (tex->is_array &&
tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
opts->combined_lod_and_array_index) {
return pack_lod_and_array_index(b, tex);
}
return false;
default:
/* Nothing to do */
return false;
}
return false;
}
bool
intel_nir_lower_texture(nir_shader *shader,
const struct intel_nir_lower_texture_opts *opts)
{
return nir_shader_instructions_pass(shader,
intel_nir_lower_texture_instr,
nir_metadata_none,
(void *)opts);
}
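/* A scalar sketch (hypothetical helper, not emitted by the pass) of the
 * packing performed by pack_lod_and_array_index(): keep the high bits of the
 * 32-bit float LOD / LOD bias and smash the rounded, clamped array index into
 * the low 9 bits.
 */
#include <math.h>
#include <stdint.h>
#include <string.h>
static inline uint32_t
example_pack_lod_and_array_index(float lod, float array_coord)
{
   uint32_t lod_bits;
   memcpy(&lod_bits, &lod, sizeof(lod_bits));
   /* Round to nearest (matching fround_even under the default rounding mode)
    * and clamp to the 9-bit range.
    */
   uint32_t ai = (uint32_t)rintf(array_coord);
   if (ai > 511)
      ai = 511;
   return (lod_bits & 0xfffffe00) | ai;
}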
@@ -1,253 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
/*
* Implements a small peephole optimization that looks for a multiply that
* is only ever used in an add and replaces both with an fma.
*/
static inline bool
are_all_uses_fadd(nir_def *def)
{
nir_foreach_use_including_if(use_src, def) {
if (nir_src_is_if(use_src))
return false;
nir_instr *use_instr = nir_src_parent_instr(use_src);
if (use_instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *use_alu = nir_instr_as_alu(use_instr);
switch (use_alu->op) {
case nir_op_fadd:
break; /* This one's ok */
case nir_op_mov:
case nir_op_fneg:
case nir_op_fabs:
if (!are_all_uses_fadd(&use_alu->def))
return false;
break;
default:
return false;
}
}
return true;
}
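/* Chases src through mov/fneg/fabs chains looking for the fmul that feeds it,
 * accumulating the swizzle and the negate/abs flags applied along the way.
 * Returns NULL if no fusable multiply is found.
 */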
static nir_alu_instr *
get_mul_for_src(nir_alu_src *src, unsigned num_components,
uint8_t *swizzle, bool *negate, bool *abs)
{
uint8_t swizzle_tmp[NIR_MAX_VEC_COMPONENTS];
nir_instr *instr = src->src.ssa->parent_instr;
if (instr->type != nir_instr_type_alu)
return NULL;
nir_alu_instr *alu = nir_instr_as_alu(instr);
/* We want to bail if any of the other ALU operations involved is labeled
 * exact. One reason is that, even though the value being changed is the
 * result of the add rather than the multiply, a user who marks the multiply
 * as exact wants *that* value preserved; it is the add they do not care
 * about. Another reason is that SPIR-V explicitly requires this behaviour.
*/
if (alu->exact)
return NULL;
switch (alu->op) {
case nir_op_mov:
alu = get_mul_for_src(&alu->src[0], alu->def.num_components,
swizzle, negate, abs);
break;
case nir_op_fneg:
alu = get_mul_for_src(&alu->src[0], alu->def.num_components,
swizzle, negate, abs);
*negate = !*negate;
break;
case nir_op_fabs:
alu = get_mul_for_src(&alu->src[0], alu->def.num_components,
swizzle, negate, abs);
*negate = false;
*abs = true;
break;
case nir_op_fmul:
/* Only absorb an fmul into an ffma if the fmul is only used in fadd
* operations. This prevents us from being too aggressive with our
* fusing which can actually lead to more instructions.
*/
if (!are_all_uses_fadd(&alu->def))
return NULL;
break;
default:
return NULL;
}
if (!alu)
return NULL;
/* Copy swizzle data before overwriting it to avoid setting a wrong swizzle.
*
* Example:
* Former swizzle[] = xyzw
* src->swizzle[] = zyxx
*
* Expected output swizzle = zyxx
* If we reuse swizzle in the loop, then output swizzle would be zyzz.
*/
memcpy(swizzle_tmp, swizzle, NIR_MAX_VEC_COMPONENTS*sizeof(uint8_t));
for (int i = 0; i < num_components; i++)
swizzle[i] = swizzle_tmp[src->swizzle[i]];
return alu;
}
/**
 * Given a list of (at least two) nir_alu_src's, tells whether any of them is
 * a constant value that is used only once.
*/
static bool
any_alu_src_is_a_constant(nir_alu_src srcs[])
{
for (unsigned i = 0; i < 2; i++) {
if (srcs[i].src.ssa->parent_instr->type == nir_instr_type_load_const) {
nir_load_const_instr *load_const =
nir_instr_as_load_const (srcs[i].src.ssa->parent_instr);
if (list_is_singular(&load_const->def.uses))
return true;
}
}
return false;
}
static bool
intel_nir_opt_peephole_ffma_instr(nir_builder *b,
nir_instr *instr,
UNUSED void *cb_data)
{
if (instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *add = nir_instr_as_alu(instr);
if (add->op != nir_op_fadd)
return false;
if (add->exact)
return false;
/* This is the case a + a. We would rather handle this with an
* algebraic reduction than fuse it. Also, we want to only fuse
* things where the multiply is used only once and, in this case,
* it would be used twice by the same instruction.
*/
if (add->src[0].src.ssa == add->src[1].src.ssa)
return false;
nir_alu_instr *mul;
uint8_t add_mul_src, swizzle[NIR_MAX_VEC_COMPONENTS];
bool negate, abs;
for (add_mul_src = 0; add_mul_src < 2; add_mul_src++) {
for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++)
swizzle[i] = i;
negate = false;
abs = false;
mul = get_mul_for_src(&add->src[add_mul_src],
add->def.num_components,
swizzle, &negate, &abs);
if (mul != NULL)
break;
}
if (mul == NULL)
return false;
unsigned bit_size = add->def.bit_size;
nir_def *mul_src[2];
mul_src[0] = mul->src[0].src.ssa;
mul_src[1] = mul->src[1].src.ssa;
/* If any operand of the fmul and any operand of the fadd is a constant, we
 * skip the fusion: keeping the multiply and add separate is more efficient,
 * since the constants can be propagated as immediate operands, potentially
 * saving two load_const instructions.
*/
if (any_alu_src_is_a_constant(mul->src) &&
any_alu_src_is_a_constant(add->src)) {
return false;
}
b->cursor = nir_before_instr(&add->instr);
if (abs) {
for (unsigned i = 0; i < 2; i++)
mul_src[i] = nir_fabs(b, mul_src[i]);
}
if (negate)
mul_src[0] = nir_fneg(b, mul_src[0]);
nir_alu_instr *ffma = nir_alu_instr_create(b->shader, nir_op_ffma);
for (unsigned i = 0; i < 2; i++) {
ffma->src[i].src = nir_src_for_ssa(mul_src[i]);
for (unsigned j = 0; j < add->def.num_components; j++)
ffma->src[i].swizzle[j] = mul->src[i].swizzle[swizzle[j]];
}
nir_alu_src_copy(&ffma->src[2], &add->src[1 - add_mul_src]);
nir_def_init(&ffma->instr, &ffma->def,
add->def.num_components, bit_size);
nir_def_rewrite_uses(&add->def, &ffma->def);
nir_builder_instr_insert(b, &ffma->instr);
assert(list_is_empty(&add->def.uses));
nir_instr_remove(&add->instr);
return true;
}
bool
intel_nir_opt_peephole_ffma(nir_shader *shader)
{
return nir_shader_instructions_pass(shader, intel_nir_opt_peephole_ffma_instr,
nir_metadata_block_index |
nir_metadata_dominance,
NULL);
}
@@ -1,319 +0,0 @@
/*
* Copyright © 2022 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "intel_nir.h"
#include "compiler/nir/nir_builder.h"
/**
* Implement a peephole pass to convert integer multiplications to imul32x16.
*/
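/* Sketch of the transformation (operand names are illustrative):
 *
 *    r = imul a, 7        =>    r = imul_32x16 a, 7
 *
 * The signed variant is chosen when the narrow source is known to lie in
 * [INT16_MIN, INT16_MAX], and umul_32x16 when it lies in [0, UINT16_MAX],
 * mirroring the range checks performed below.
 */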
struct pass_data {
struct hash_table *range_ht;
};
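/* Rewrites a 32-bit imul as the given 32x16 opcode, placing the wide source
 * in src[0] and the narrow source (selected by small_val) in src[1].
 */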
static void
replace_imul_instr(nir_builder *b, nir_alu_instr *imul, unsigned small_val,
nir_op new_opcode)
{
assert(small_val == 0 || small_val == 1);
b->cursor = nir_before_instr(&imul->instr);
nir_alu_instr *imul_32x16 = nir_alu_instr_create(b->shader, new_opcode);
nir_alu_src_copy(&imul_32x16->src[0], &imul->src[1 - small_val]);
nir_alu_src_copy(&imul_32x16->src[1], &imul->src[small_val]);
nir_def_init(&imul_32x16->instr, &imul_32x16->def,
imul->def.num_components, 32);
nir_def_rewrite_uses(&imul->def,
&imul_32x16->def);
nir_builder_instr_insert(b, &imul_32x16->instr);
nir_instr_remove(&imul->instr);
nir_instr_free(&imul->instr);
}
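/* Tracks which unary operations (ineg / iabs) wrap the value whose range was
 * analyzed. When both multiply sources would fit in 16 bits, the pass prefers
 * the one without such a wrapper; see the copy-propagation note in
 * intel_nir_opt_peephole_imul32x16_instr below.
 */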
enum root_operation {
non_unary = 0,
integer_neg = 1 << 0,
integer_abs = 1 << 1,
integer_neg_abs = integer_neg | integer_abs,
invalid_root = 255
};
static enum root_operation
signed_integer_range_analysis(nir_shader *shader, struct hash_table *range_ht,
nir_scalar scalar, int *lo, int *hi)
{
if (nir_scalar_is_const(scalar)) {
*lo = nir_scalar_as_int(scalar);
*hi = *lo;
return non_unary;
}
if (nir_scalar_is_alu(scalar)) {
switch (nir_scalar_alu_op(scalar)) {
case nir_op_iabs:
signed_integer_range_analysis(shader, range_ht,
nir_scalar_chase_alu_src(scalar, 0),
lo, hi);
if (*lo == INT32_MIN) {
*hi = INT32_MAX;
} else {
const int32_t a = abs(*lo);
const int32_t b = abs(*hi);
*lo = MIN2(a, b);
*hi = MAX2(a, b);
}
/* Absolute value wipes out any inner negations, and it is redundant
* with any inner absolute values.
*/
return integer_abs;
case nir_op_ineg: {
const enum root_operation root =
signed_integer_range_analysis(shader, range_ht,
nir_scalar_chase_alu_src(scalar, 0),
lo, hi);
if (*lo == INT32_MIN) {
*hi = INT32_MAX;
} else {
const int32_t a = -(*lo);
const int32_t b = -(*hi);
*lo = MIN2(a, b);
*hi = MAX2(a, b);
}
/* Negation of a negation cancels out, but negation of absolute value
* must preserve the integer_abs bit.
*/
return root ^ integer_neg;
}
case nir_op_imax: {
int src0_lo, src0_hi;
int src1_lo, src1_hi;
signed_integer_range_analysis(shader, range_ht,
nir_scalar_chase_alu_src(scalar, 0),
&src0_lo, &src0_hi);
signed_integer_range_analysis(shader, range_ht,
nir_scalar_chase_alu_src(scalar, 1),
&src1_lo, &src1_hi);
*lo = MAX2(src0_lo, src1_lo);
*hi = MAX2(src0_hi, src1_hi);
return non_unary;
}
case nir_op_imin: {
int src0_lo, src0_hi;
int src1_lo, src1_hi;
signed_integer_range_analysis(shader, range_ht,
nir_scalar_chase_alu_src(scalar, 0),
&src0_lo, &src0_hi);
signed_integer_range_analysis(shader, range_ht,
nir_scalar_chase_alu_src(scalar, 1),
&src1_lo, &src1_hi);
*lo = MIN2(src0_lo, src1_lo);
*hi = MIN2(src0_hi, src1_hi);
return non_unary;
}
default:
break;
}
}
/* Any value with the sign-bit set is problematic. Consider the case when
* bound is 0x80000000. As an unsigned value, this means the value must be
* in the range [0, 0x80000000]. As a signed value, it means the value must
* be in the range [0, INT_MAX] or it must be INT_MIN.
*
* If bound is -2, it means the value is either in the range [INT_MIN, -2]
* or it is in the range [0, INT_MAX].
*
* This function only returns a single, contiguous range. The union of the
* two ranges for any value of bound with the sign-bit set is [INT_MIN,
* INT_MAX].
*/
const int32_t bound = nir_unsigned_upper_bound(shader, range_ht,
scalar, NULL);
if (bound < 0) {
*lo = INT32_MIN;
*hi = INT32_MAX;
} else {
*lo = 0;
*hi = bound;
}
return non_unary;
}
static bool
intel_nir_opt_peephole_imul32x16_instr(nir_builder *b,
nir_instr *instr,
void *cb_data)
{
struct pass_data *d = (struct pass_data *) cb_data;
struct hash_table *range_ht = d->range_ht;
if (instr->type != nir_instr_type_alu)
return false;
nir_alu_instr *imul = nir_instr_as_alu(instr);
if (imul->op != nir_op_imul)
return false;
if (imul->def.bit_size != 32)
return false;
nir_op new_opcode = nir_num_opcodes;
unsigned i;
for (i = 0; i < 2; i++) {
if (!nir_src_is_const(imul->src[i].src))
continue;
int64_t lo = INT64_MAX;
int64_t hi = INT64_MIN;
for (unsigned comp = 0; comp < imul->def.num_components; comp++) {
int64_t v = nir_src_comp_as_int(imul->src[i].src, comp);
if (v < lo)
lo = v;
if (v > hi)
hi = v;
}
if (lo >= INT16_MIN && hi <= INT16_MAX) {
new_opcode = nir_op_imul_32x16;
break;
} else if (lo >= 0 && hi <= UINT16_MAX) {
new_opcode = nir_op_umul_32x16;
break;
}
}
if (new_opcode != nir_num_opcodes) {
replace_imul_instr(b, imul, i, new_opcode);
return true;
}
if (imul->def.num_components > 1)
return false;
const nir_scalar imul_scalar = { &imul->def, 0 };
int idx = -1;
enum root_operation prev_root = invalid_root;
for (i = 0; i < 2; i++) {
/* All constants were previously processed. There is nothing more to
* learn from a constant here.
*/
if (imul->src[i].src.ssa->parent_instr->type == nir_instr_type_load_const)
continue;
nir_scalar scalar = nir_scalar_chase_alu_src(imul_scalar, i);
int lo = INT32_MIN;
int hi = INT32_MAX;
const enum root_operation root =
signed_integer_range_analysis(b->shader, range_ht, scalar, &lo, &hi);
/* Copy propagation (in the backend) has trouble handling cases like
*
* mov(8) g60<1>D -g59<8,8,1>D
* mul(8) g61<1>D g63<8,8,1>D g60<16,8,2>W
*
* If g59 had absolute value instead of negation, even improved copy
* propagation would not be able to make progress.
*
* In cases where both sources to the integer multiplication can fit in
* 16-bits, choose the source that does not have a source modifier.
*/
if (root < prev_root) {
if (lo >= INT16_MIN && hi <= INT16_MAX) {
new_opcode = nir_op_imul_32x16;
idx = i;
prev_root = root;
if (root == non_unary)
break;
} else if (lo >= 0 && hi <= UINT16_MAX) {
new_opcode = nir_op_umul_32x16;
idx = i;
prev_root = root;
if (root == non_unary)
break;
}
}
}
if (new_opcode == nir_num_opcodes) {
assert(idx == -1);
assert(prev_root == invalid_root);
return false;
}
assert(idx != -1);
assert(prev_root != invalid_root);
replace_imul_instr(b, imul, idx, new_opcode);
return true;
}
bool
intel_nir_opt_peephole_imul32x16(nir_shader *shader)
{
struct pass_data cb_data;
cb_data.range_ht = _mesa_pointer_hash_table_create(NULL);
bool progress = nir_shader_instructions_pass(shader,
intel_nir_opt_peephole_imul32x16_instr,
nir_metadata_block_index |
nir_metadata_dominance,
&cb_data);
_mesa_hash_table_destroy(cb_data.range_ht, NULL);
return progress;
}
@@ -1,134 +0,0 @@
/*
* Copyright © 2016 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "compiler/nir/nir_builder.h"
#include "intel_nir.h"
/**
* Implements the WaPreventHSTessLevelsInterference workaround (for Gfx7-8).
*
* From the Broadwell PRM, Volume 7 (3D-Media-GPGPU), Page 494 (below the
* definition of the patch header layouts):
*
* "HW Bug: The Tessellation stage will incorrectly add domain points
* along patch edges under the following conditions, which may result
* in conformance failures and/or cracking artifacts:
*
* * QUAD domain
* * INTEGER partitioning
* * All three TessFactors in a given U or V direction (e.g., V
* direction: UEQ0, InsideV, UEQ1) are all exactly 1.0
* * All three TessFactors in the other direction are > 1.0 and all
 * round up to the same integer value (e.g., U direction:
* VEQ0 = 3.1, InsideU = 3.7, VEQ1 = 3.4)
*
* The suggested workaround (to be implemented as part of the postamble
* to the HS shader in the HS kernel) is:
*
* if (
* (TF[UEQ0] > 1.0) ||
* (TF[VEQ0] > 1.0) ||
* (TF[UEQ1] > 1.0) ||
* (TF[VEQ1] > 1.0) ||
* (TF[INSIDE_U] > 1.0) ||
* (TF[INSIDE_V] > 1.0) )
* {
* TF[INSIDE_U] = (TF[INSIDE_U] == 1.0) ? 2.0 : TF[INSIDE_U];
* TF[INSIDE_V] = (TF[INSIDE_V] == 1.0) ? 2.0 : TF[INSIDE_V];
* }"
*
* There's a subtlety here. Intel internal HSD-ES bug 1208668495 notes
* that the above workaround fails to fix certain GL/ES CTS tests which
* have inside tessellation factors of -1.0. This can be explained by
* a quote from the ARB_tessellation_shader specification:
*
* "If "equal_spacing" is used, the floating-point tessellation level is
* first clamped to the range [1,<max>], where <max> is implementation-
* dependent maximum tessellation level (MAX_TESS_GEN_LEVEL)."
*
* In other words, the actual inner tessellation factor used is
* clamp(TF[INSIDE_*], 1.0, 64.0). So we want to compare the clamped
* value against 1.0. To accomplish this, we change the comparison from
* (TF[INSIDE_*] == 1.0) to (TF[INSIDE_*] <= 1.0).
*/
static inline nir_def *
load_output(nir_builder *b, int num_components, int offset, int component)
{
return nir_load_output(b, num_components, 32, nir_imm_int(b, 0),
.base = offset,
.component = component);
}
static void
emit_quads_workaround(nir_builder *b, nir_block *block)
{
b->cursor = nir_after_block_before_jump(block);
nir_def *inner = load_output(b, 2, 0, 2);
nir_def *outer = load_output(b, 4, 1, 0);
nir_def *any_greater_than_1 =
nir_ior(b, nir_bany(b, nir_fgt_imm(b, outer, 1.0f)),
nir_bany(b, nir_fgt_imm(b, inner, 1.0f)));
nir_push_if(b, any_greater_than_1);
inner = nir_bcsel(b, nir_fle_imm(b, inner, 1.0f),
nir_imm_float(b, 2.0f), inner);
nir_store_output(b, inner, nir_imm_int(b, 0),
.component = 2,
.write_mask = WRITEMASK_XY);
nir_pop_if(b, NULL);
}
void
intel_nir_apply_tcs_quads_workaround(nir_shader *nir)
{
assert(nir->info.stage == MESA_SHADER_TESS_CTRL);
nir_function_impl *impl = nir_shader_get_entrypoint(nir);
nir_builder b = nir_builder_create(impl);
/* emit_quads_workaround() inserts an if statement into each block,
* which splits it in two. This changes the set of predecessors of
* the end block. We want to process the original set, so to be safe,
* save it off to an array first.
*/
const unsigned num_end_preds = impl->end_block->predecessors->entries;
nir_block *end_preds[num_end_preds];
unsigned i = 0;
set_foreach(impl->end_block->predecessors, entry) {
end_preds[i++] = (nir_block *) entry->key;
}
for (i = 0; i < num_end_preds; i++) {
emit_quads_workaround(&b, end_preds[i]);
}
nir_metadata_preserve(impl, nir_metadata_none);
}
-185
View File
@@ -1,185 +0,0 @@
/*
* Copyright 2024 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#ifndef INTEL_SHADER_ENUMS_H
#define INTEL_SHADER_ENUMS_H
#include <stdint.h>
#include "compiler/shader_enums.h"
#include "util/enum_operators.h"
#ifdef __cplusplus
extern "C" {
#endif
enum intel_msaa_flags {
/** Must be set whenever any dynamic MSAA is used
*
* This flag mostly exists to let us assert that the driver understands
* dynamic MSAA so we don't run into trouble with drivers that don't.
*/
INTEL_MSAA_FLAG_ENABLE_DYNAMIC = (1 << 0),
/** True if the framebuffer is multisampled */
INTEL_MSAA_FLAG_MULTISAMPLE_FBO = (1 << 1),
/** True if this shader has been dispatched per-sample */
INTEL_MSAA_FLAG_PERSAMPLE_DISPATCH = (1 << 2),
/** True if inputs should be interpolated per-sample by default */
INTEL_MSAA_FLAG_PERSAMPLE_INTERP = (1 << 3),
/** True if this shader has been dispatched with alpha-to-coverage */
INTEL_MSAA_FLAG_ALPHA_TO_COVERAGE = (1 << 4),
/** True if this shader has been dispatched coarse
*
 * This is intentionally chosen to be bit 15 to correspond to the coarse bit
* in the pixel interpolator messages.
*/
INTEL_MSAA_FLAG_COARSE_PI_MSG = (1 << 15),
/** True if this shader has been dispatched coarse
*
 * This is intentionally chosen to be bit 18 to correspond to the coarse bit
* in the render target messages.
*/
INTEL_MSAA_FLAG_COARSE_RT_WRITES = (1 << 18),
};
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(intel_msaa_flags)
/**
* @defgroup Tessellator parameter enumerations.
*
* These correspond to the hardware values in 3DSTATE_TE, and are provided
* as part of the tessellation evaluation shader.
*
* @{
*/
enum intel_tess_partitioning {
INTEL_TESS_PARTITIONING_INTEGER = 0,
INTEL_TESS_PARTITIONING_ODD_FRACTIONAL = 1,
INTEL_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
};
enum intel_tess_output_topology {
INTEL_TESS_OUTPUT_TOPOLOGY_POINT = 0,
INTEL_TESS_OUTPUT_TOPOLOGY_LINE = 1,
INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CW = 2,
INTEL_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
};
enum intel_tess_domain {
INTEL_TESS_DOMAIN_QUAD = 0,
INTEL_TESS_DOMAIN_TRI = 1,
INTEL_TESS_DOMAIN_ISOLINE = 2,
};
/** @} */
enum intel_shader_dispatch_mode {
INTEL_DISPATCH_MODE_4X1_SINGLE = 0,
INTEL_DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
INTEL_DISPATCH_MODE_SIMD8 = 3,
INTEL_DISPATCH_MODE_TCS_SINGLE_PATCH = 0,
INTEL_DISPATCH_MODE_TCS_MULTI_PATCH = 2,
};
/**
* Data structure recording the relationship between the gl_varying_slot enum
* and "slots" within the vertex URB entry (VUE). A "slot" is defined as a
* single octaword within the VUE (128 bits).
*
* Note that each BRW register contains 256 bits (2 octawords), so when
* accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
* consecutive VUE slots. When accessing the VUE in URB_INTERLEAVED mode (as
* in a vertex shader), each register corresponds to a single VUE slot, since
* it contains data for two separate vertices.
*/
struct intel_vue_map {
/**
* Bitfield representing all varying slots that are (a) stored in this VUE
* map, and (b) actually written by the shader. Does not include any of
* the additional varying slots defined in brw_varying_slot.
*/
uint64_t slots_valid;
/**
* Is this VUE map for a separate shader pipeline?
*
* Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
* without the linker having a chance to dead code eliminate unused varyings.
*
* This means that we have to use a fixed slot layout, based on the output's
* location field, rather than assigning slots in a compact contiguous block.
*/
bool separate;
/**
* Map from gl_varying_slot value to VUE slot. For gl_varying_slots that are
* not stored in a slot (because they are not written, or because
* additional processing is applied before storing them in the VUE), the
* value is -1.
*/
signed char varying_to_slot[VARYING_SLOT_TESS_MAX];
/**
* Map from VUE slot to gl_varying_slot value. For slots that do not
* directly correspond to a gl_varying_slot, the value comes from
* brw_varying_slot.
*
* For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
*/
signed char slot_to_varying[VARYING_SLOT_TESS_MAX];
/**
* Total number of VUE slots in use
*/
int num_slots;
/**
* Number of position VUE slots. If num_pos_slots > 1, primitive
* replication is being used.
*/
int num_pos_slots;
/**
* Number of per-patch VUE slots. Only valid for tessellation control
* shader outputs and tessellation evaluation shader inputs.
*/
int num_per_patch_slots;
/**
* Number of per-vertex VUE slots. Only valid for tessellation control
* shader outputs and tessellation evaluation shader inputs.
*/
int num_per_vertex_slots;
};
struct intel_cs_dispatch_info {
uint32_t group_size;
uint32_t simd_size;
uint32_t threads;
/* RightExecutionMask field used in GPGPU_WALKER. */
uint32_t right_mask;
};
enum PACKED intel_compute_walk_order {
INTEL_WALK_ORDER_XYZ = 0,
INTEL_WALK_ORDER_XZY = 1,
INTEL_WALK_ORDER_YXZ = 2,
INTEL_WALK_ORDER_YZX = 3,
INTEL_WALK_ORDER_ZXY = 4,
INTEL_WALK_ORDER_ZYX = 5,
};
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* INTEL_SHADER_ENUMS_H */