diff --git a/src/nouveau/compiler/latencies/lat_rs_gen.py b/src/nouveau/compiler/latencies/lat_rs_gen.py
new file mode 100644
index 00000000000..b537ca16c34
--- /dev/null
+++ b/src/nouveau/compiler/latencies/lat_rs_gen.py
@@ -0,0 +1,221 @@
+#! /usr/bin/env python3
+#
+# Copyright © 2024 Collabora Ltd. and Red Hat Inc.
+# SPDX-License-Identifier: MIT
+
+# This script takes a list of latency CSV files, each named
+# <reg_file>_<latcat>.csv, and generates a Rust module containing one latency
+# category enum per register file and one lookup function per CSV file.
+
+import argparse
+import csv
+import os
+import sys
+
+from mako import template
+
+# has_none is reset at the top of every outer match arm so that the
+# `_ => panic!(...)` fallback is only emitted for arms that really skipped a
+# "none" cell; without the reset it would stay set for all later arms.
+TEMPLATE_RS = template.Template(text="""\
+// Copyright 2024 Red Hat Inc.
+// SPDX-License-Identifier: MIT
+
+// This file is generated by lat_rs_gen.py. DO NOT EDIT!
+#![allow(unused_variables)]
+
+const fn pred(has_pred: bool, a: u32, b: u32) -> u32 {
+    if has_pred {
+        a + b
+    } else {
+        b
+    }
+}
+
+% for reg_file, cats in file_cats.items():
+<% enum_name = to_camel(reg_file) + 'Latency' + sm.upper() %>
+#[derive(PartialEq)]
+pub enum ${enum_name} {
+% for category in cats[0].header.cats:
+    ${to_camel(category)},
+% endfor
+}
+
+impl ${to_camel(reg_file)}Latency${sm.upper()} {
+% for bigcat in cats:
+    pub fn ${bigcat.header.latcat}(
+        ${bigcat.header.cat0}: ${enum_name},
+        ${bigcat.header.cat1}: ${enum_name},
+        has_pred: bool
+    ) -> u32 {
+        use ${enum_name}::*;
+        match ${bigcat.header.cat1} {
+% for cat in bigcat.header.cats:
+            ${to_camel(cat)} => match ${bigcat.header.cat0} {
+                <% has_none = False %>
+% for cat2 in bigcat.header.cats:
+    % if bigcat.fields[loop.parent.index].flds[cat2].pred == True:
+                ${to_camel(cat2)} => pred(
+                    has_pred,
+                    ${bigcat.fields[loop.parent.index].flds[cat2].value},
+                    ${bigcat.fields[loop.parent.index].flds[cat2].pred_val}
+                ),
+    % elif bigcat.fields[loop.parent.index].flds[cat2].value != "none":
+                ${to_camel(cat2)} => ${bigcat.fields[loop.parent.index].flds[cat2].value},
+    % else:
+                <% has_none = True %>
+    % endif
+% endfor
+    % if has_none:
+                _ => panic!("Illegal ${bigcat.header.cat0} value in ${bigcat.header.latcat} for ${to_camel(cat)}"),
+    % endif
+            }
+% endfor
+        }
+    }
+% endfor
+}
+
+% endfor
+""")
+
+## Convert snake_case to CamelCase. A result starting with a digit is prefixed
+## with "_".
+def to_camel(snake_str):
+    result = ''.join(word.title() for word in snake_str.split('_'))
+    # result[:1] instead of result[0] so that an empty string passes through.
+    return '_' + result if result[:1].isdigit() else result
+
+def reader(csvfile):
+    """Wrapper around csv.reader that skips comments and blanks."""
+    # This is a generator and csv.reader pulls rows from the file lazily, so
+    # the file is held open until all of the lines are read. newline='' is
+    # what the csv module asks for when it is given a file object.
+    with open(csvfile, 'r', newline='') as f:
+        for line in csv.reader(f):
+            if line and not line[0].startswith('#'):
+                yield line
+
+class Fld(object):
+    """One cell of a latency table.
+
+    A cell is "none", a plain value, "<value>+<pred_val>" (rendered by the
+    template as pred(has_pred, value, pred_val)) or "<value> & sb".
+    """
+    def __init__(self, line):
+        self.valid = "none" not in line
+        # Every cell exposes the same attributes, whatever form it has.
+        self.pred = False
+        self.pred_val = None
+        self.scoreboard = False
+        if "+" in line:
+            self.pred = True
+            part = line.split("+")
+            self.value = part[0]
+            self.pred_val = part[1]
+        elif " & sb" in line:
+            self.scoreboard = True
+            self.value = line.removesuffix(" & sb")
+        else:
+            self.value = line.strip()
+
+class Header(object):
+    """First row of a CSV: the table type followed by the category names."""
+    # Names of the two generated function parameters for each table type.
+    CAT_NAMES = {
+        "raw": ("writer", "reader"),
+        "war": ("reader", "writer"),
+        "waw": ("writer1", "writer2"),
+    }
+
+    def __init__(self, line):
+        self.latcat = line[0].strip()
+        self.cats = line[1:]
+
+        # Fail here instead of with an AttributeError while rendering.
+        if self.latcat not in self.CAT_NAMES:
+            raise ValueError("Unknown latency table type: " + self.latcat)
+        self.cat0, self.cat1 = self.CAT_NAMES[self.latcat]
+
+
+class Fields(object):
+    """One data row of a CSV: a Fld per header category, keyed by category."""
+    def __init__(self, header, line):
+        self.fldcat = line[0].strip()
+        self.flds = {}
+        for index, cat in enumerate(header.cats):
+            self.flds[cat] = Fld(line[index + 1])
+
+class Category(object):
+    """A parsed CSV file: its Header and the list of its Fields rows."""
+    def __init__(self, header, fields):
+        self.header = header
+        self.fields = fields
+
+lattypes = ["reg", "ureg", "pred", "upred"]
+
+# NOTE: emit_cats() and emit_sm() are not called from main(), and emit_cats()
+# renders TEMPLATE_RS without the file_cats and to_camel values it uses.
+def emit_cats(dirname, f, sm, lat):
+    cats = []
+    for cat in ["raw", "war", "waw"]:
+        rows = reader(dirname + "sm" + sm + "/" + lat + "_" + cat + ".csv")
+        header = Header(next(rows))
+        fields = [Fields(header, l) for l in rows]
+        cats.append(Category(header, fields))
+
+    try:
+        f.write(TEMPLATE_RS.render(lat=lat, sm=sm, cats=cats))
+    except Exception:
+        # In the event there's an error, this imports some helpers from mako
+        # to print a useful stack trace and prints it, then exits with
+        # status 1.
+        from mako import exceptions
+        print(exceptions.text_error_template().render(), file=sys.stderr)
+        sys.exit(1)
+
+def emit_sm(dirname, f, sm):
+    for lat in lattypes:
+        emit_cats(dirname, f, sm, lat)
+
+def main():
+    """Read every CSV given on the command line and write the Rust module."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--import-path', required=True)
+    parser.add_argument('--out-rs', required=True, help='Output Rust file.')
+    parser.add_argument('--sm', help='SM', required=True)
+    parser.add_argument('csv_files', metavar='FILE', nargs='*',
+                        action='append',
+                        help='Input CSV filename')
+
+    args = parser.parse_args()
+
+    sys.path.insert(0, args.import_path)
+    import util
+
+    file_cats = {}
+    for csv_file in args.csv_files[0]:
+        split = os.path.basename(csv_file).removesuffix('.csv').split('_')
+        # Not an assert: asserts are stripped when python runs with -O.
+        if len(split) != 2:
+            parser.error('Expected <reg_file>_<latcat>.csv, got ' + csv_file)
+        # Only the register file comes from the file name; the table type is
+        # taken from the CSV header row.
+        reg_file = split[0]
+
+        r = reader(csv_file)
+        header = Header(next(r))
+        fields = [Fields(header, l) for l in r]
+
+        cat = Category(header, fields)
+        file_cats.setdefault(reg_file, []).append(cat)
+
+    environment = dict(
+        sm=args.sm,
+        file_cats=file_cats,
+        to_camel=to_camel,
+    )
+    util.write_template_rs(args.out_rs, TEMPLATE_RS, environment)
+
+if __name__ == '__main__':
+    main()
diff --git a/src/nouveau/compiler/latencies/lib_rs_gen.py b/src/nouveau/compiler/latencies/lib_rs_gen.py
new file mode 100644
index 00000000000..c9d1c3f2e1c
--- /dev/null
+++ b/src/nouveau/compiler/latencies/lib_rs_gen.py
@@ -0,0 +1,40 @@
+#! /usr/bin/env python3
+#
+# Copyright © 2024 Collabora Ltd. and Red Hat Inc.
+# SPDX-License-Identifier: MIT
+
+# This script takes a list of Rust module names and constructs a lib.rs
+# which declares each of them as a public submodule.
+
+import argparse
+import sys
+
+from mako.template import Template
+
+TEMPLATE_RS = Template("""\
+// Copyright © 2024 Collabora Ltd. and Red Hat Inc.
+// SPDX-License-Identifier: MIT
+
+// This file is generated by lib_rs_gen.py. DO NOT EDIT!
+
+% for mod in mods:
+pub mod ${mod};
+% endfor
+""")
+
+def main():
+    """Write a lib.rs with one `pub mod` line per MOD argument."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--import-path', required=True)
+    parser.add_argument('--out-rs', required=True, help='Output Rust file.')
+    parser.add_argument('mods', metavar='MOD', nargs='*',
+                        action='append', help='Submodule')
+    args = parser.parse_args()
+
+    sys.path.insert(0, args.import_path)
+    import util
+
+    util.write_template_rs(args.out_rs, TEMPLATE_RS, dict(mods=args.mods[0]))
+
+if __name__ == '__main__':
+    main()
diff --git a/src/nouveau/compiler/latencies/meson.build b/src/nouveau/compiler/latencies/meson.build
new file mode 100644
index 00000000000..bf10b879a06
--- /dev/null
+++ b/src/nouveau/compiler/latencies/meson.build
@@ -0,0 +1,59 @@
+# Copyright © 2025 Collabora, Ltd.
+# SPDX-License-Identifier: MIT + +_nak_lat_sm100_files = [ + 'pred_raw.csv', + 'pred_war.csv', + 'pred_waw.csv', + 'reg_raw.csv', + 'reg_war.csv', + 'reg_waw.csv', + 'upred_raw.csv', + 'upred_war.csv', + 'upred_waw.csv', + 'ureg_raw.csv', + 'ureg_war.csv', + 'ureg_waw.csv', +] + +_nak_lat_sms = { + 'sm100': _nak_lat_sm100_files, +} + +_lat_rs_gen = files('lat_rs_gen.py') + +_lat_rs_generated = [] +foreach sm, csvs : _nak_lat_sms + csv_files = [] + foreach i : range(csvs.length()) + csv_files += files(sm + '/' + csvs[i]) + endforeach + + _lat_rs_generated += custom_target( + sm+'.rs', + input : [_lat_rs_gen, csv_files], + output : [sm+'.rs'], + command : [ + prog_python, '@INPUT@', '-p', nouveau_util_py_path, + '--out-rs', '@OUTPUT0@', '--sm', sm, + ], + depend_files : nouveau_util_py, + ) +endforeach + +_nak_latencies_lib_rs = custom_target( + 'lib.rs', + input : ['lib_rs_gen.py', _lat_rs_generated, nouveau_util_py], + output : ['lib.rs'], + command : [ + prog_python, '@INPUT0@', '-p', nouveau_util_py_path, + '--out-rs', '@OUTPUT0@', _nak_lat_sms.keys() + ], +) + +libnak_latencies_rs = static_library( + 'nak_latencies', + _nak_latencies_lib_rs, + gnu_symbol_visibility : 'hidden', + rust_abi : 'rust', +) diff --git a/src/nouveau/compiler/latencies/sm100/pred_raw.csv b/src/nouveau/compiler/latencies/sm100/pred_raw.csv new file mode 100644 index 00000000000..c30676a6e3f --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/pred_raw.csv @@ -0,0 +1,13 @@ +raw,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard +disp_alu,13,13,13,13,13,13,13,13,14,14,1 & sb,none +disp_dual_alu,13,13,13,13,13,13,13,13,14,14,1 & sb,none +coupled,4,4,4,4,5,8,5,5,6,6,1 & sb,none +dualalu,4,4,4,4,5,8,5,5,6,6,1 & sb,none +r2p,4,4,4,4,5,8,5,5,6,6,1 & sb,none +r2ur,none,none,none,none,none,none,none,none,none,none,none,none +fma,5,5,5,5,5,13,4,5,6,6,1 & sb,none +fp16,13,13,13,13,13,13,13,5,14,14,1 & sb,none 
+hfma2_mma,13,13,13,13,13,13,13,13,6,6,1 & sb,none +redirected_fp64,13,13,13,13,13,13,13,13,6,6,1 & sb,none +decoupled,13,13,13,13,13,13,13,13,14,14,1 & sb,none +guard,13,13,13,13,13,13,13,13,14,14,1 & sb,none diff --git a/src/nouveau/compiler/latencies/sm100/pred_war.csv b/src/nouveau/compiler/latencies/sm100/pred_war.csv new file mode 100644 index 00000000000..daec288338f --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/pred_war.csv @@ -0,0 +1,14 @@ +war,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard +disp_alu,1,1,1,1,1,none,1,1,1,1,1,1 +disp_dual_alu,1,1,1,1,1,none,1,1,1,1,1,1 +coupled,1,1,1,1,1,none,1,1,1,1,1,1 +dualalu,1,1,1,1,1,none,1,1,1,1,1,1 +r2p,1,1,1,1,1,none,1,1,1,1,1,1 +r2ur,1,1,1,1,1,none,1,1,1,1,1,1 +fma,1,1,1,1,1,none,1,1,1,1,1,1 +fp16,1,1,1,1,1,none,1,1,1,1,1,1 +hfma2_mma,1,1,1,1,1,none,1,1,1,1,1,1 +redirected_fp64,1,1,1,1,1,none,1,1,1,1,1,1 +decoupled,1,1,1,1,1,none,1,1,1,1,1,1 +guard,none,none,none,none,none,none,none,none,none,none,none,none + diff --git a/src/nouveau/compiler/latencies/sm100/pred_waw.csv b/src/nouveau/compiler/latencies/sm100/pred_waw.csv new file mode 100644 index 00000000000..045a10d13bb --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/pred_waw.csv @@ -0,0 +1,13 @@ +waw,disp_alu,disp_dual_alu,coupled,dualalu,r2p,r2ur,fma,fp16,hfma2_mma,redirected_fp64,decoupled,guard +disp_alu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none +disp_dual_alu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none +coupled,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none +dualalu,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none +r2p,1,1,1,1,1,1+7,1,1,2,2,1 & sb,none +r2ur,1,1,1,1,1,1,1,1,2,2,1 & sb,none +fma,1,1,1,1,1,1+8,1,1,2,2,1 & sb,none +fp16,2+6,2+6,2+6,2+6,2+6,2+6,2+6,1,2+7,2+7,1 & sb,none +hfma2_mma,2+5,2+5,2+5,2+5,2+5,2+5,2+5,2+5,1,1,1 & sb,none +redirected_fp64,2+5,2+5,2+5,2+5,2+5,2+5,2+5,2+5,1,1,1 & sb,none +decoupled,2+10,2+10,2+10,2+10,2+10,2+10,2+10,2+10,1+12,1+12,1 & sb,none 
+guard,none,none,none,none,none,none,none,none,none,none,none,none diff --git a/src/nouveau/compiler/latencies/sm100/reg_raw.csv b/src/nouveau/compiler/latencies/sm100/reg_raw.csv new file mode 100644 index 00000000000..2ef3bc7d52e --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/reg_raw.csv @@ -0,0 +1,23 @@ +raw,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu +alu,4,4,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +dualalu,4,4,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +disp_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +fma,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +fma_alu,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_read_ab,5,5,6,4,4,none,none,none,4,6,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_read_cl,5,5,6,4,4,none,none,none,2,4,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_read_ch,3,3,4,2,2,none,none,none,2,2,3,3,3,8,8,19,19,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_write_dl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +imad_wide_write_dh,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +fp16,5,5,6,5,5,none,none,none,3,5,4,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +fp16_alu,5,5,6,5,5,none,none,none,3,5,5,4,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +fp16_f32,5,5,6,5,5,none,none,none,3,5,5,5,5,10,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +hfma2_mma,6,6,6,6,6,none,none,none,6,6,6,6,6,8,10,19,19,1 & sb,1 & sb,1 & sb,1 & sb +redirected_fp64,6,6,6,6,6,none,none,none,6,6,6,6,6,8,8,19,19,1 & sb,1 & sb,1 & sb,1 & sb +imma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & 
sb,1 & sb,1 & sb,1 & sb +hmma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb +dmma,7,7,7,7,7,none,none,none,7,7,7,7,7,11,11,20,20,1 & sb,1 & sb,1 & sb,1 & sb +branch,4,4,4,4,4,none,none,none,4,4,4,4,4,6,6,19,19,1 & sb,1 & sb,1 & sb,1 & sb +decoupled,4,4,4,4,4,none,none,none,4,4,4,4,4,6,6,19,19,1 & sb,1 & sb,1 & sb,1 & sb +decoupled_agu,5,5,5,5,5,none,none,none,5,5,5,5,5,7,7,19,19,1 & sb,1 & sb,1 & sb,1 & sb + diff --git a/src/nouveau/compiler/latencies/sm100/reg_war.csv b/src/nouveau/compiler/latencies/sm100/reg_war.csv new file mode 100644 index 00000000000..f0cf594240a --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/reg_war.csv @@ -0,0 +1,23 @@ +war,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu +alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +dualalu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +disp_64,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +fma,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +fma_alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_read_ab,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +imad_wide_read_cl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +imad_wide_read_ch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +imad_wide_write_dl,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_write_dh,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +fp16,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +fp16_alu,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb 
+fp16_f32,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +hfma2_mma,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +redirected_fp64,1,1,1,1,1,1,1,1,none,none,1,1,1,1,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +imma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +hmma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +dmma,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +branch,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +decoupled,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb +decoupled_agu,2,2,2,2,2,2,2,2,none,none,2,2,2,2,1,1,1,1 & sb,1 & sb,1 & sb,1 & sb + diff --git a/src/nouveau/compiler/latencies/sm100/reg_waw.csv b/src/nouveau/compiler/latencies/sm100/reg_waw.csv new file mode 100644 index 00000000000..36d904c0a32 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/reg_waw.csv @@ -0,0 +1,22 @@ +waw,alu,dualalu,disp_64,fma,fma_alu,imad_wide_read_ab,imad_wide_read_cl,imad_wide_read_ch,imad_wide_write_dl,imad_wide_write_dh,fp16,fp16_alu,fp16_f32,hfma2_mma,redirected_fp64,imma,hmma,dmma,branch,decoupled,decoupled_agu +alu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb +dualalu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb +disp_64,1,1,1,1,1,none,none,none,1,1,1,1,1,3+1,3+1,14+1,14+1,1 & sb,1 & sb,1 & sb,1 & sb +fma,1,1,1+1,1,1,none,none,none,1,1+1,1,1,1,3+3,3+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb +fma_alu,1,1,1+1,1,1,none,none,none,1,1+1,1,1,1,3+3,3+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_read_ab,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +imad_wide_read_cl,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none 
+imad_wide_read_ch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +imad_wide_write_dl,1+2,1+2,1+3,1+1,1+1,none,none,none,1,1+1,1+2,1+2,1+2,5+3,5+3,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb +imad_wide_write_dh,1,1,1+1,1,1,none,none,none,1,1,1,1,1,5+1,5+1,14+3,14+3,1 & sb,1 & sb,1 & sb,1 & sb +fp16,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb +fp16_alu,1,1,1+1,1,1,none,none,none,1,1,1,1,1,3+3,3+3,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb +fp16_f32,1,1,1,1,1,none,none,none,1,1,1,1,1,3+2,3+2,14+2,14+2,1 & sb,1 & sb,1 & sb,1 & sb +hfma2_mma,1,1,1,1,1,none,none,none,1,1,1,1,1,1,3,14,14,1 & sb,1 & sb,1 & sb,1 & sb +redirected_fp64,1,1,1,1,1,none,none,none,1,1,1,1,1,2,1,13,13,1 & sb,1 & sb,1 & sb,1 & sb +imma,2,2,2,2,2,none,none,none,2,2,2,2,2,2,2,1,1,1 & sb,1 & sb,1 & sb,1 & sb +hmma,2,2,2,2,2,none,none,none,2,2,2,2,2,2,2,1,1,1 & sb,1 & sb,1 & sb,1 & sb +dmma,2+4,2+4,2+4,2+4,2+4,none,none,none,2+4,2+4,2+4,2+4,2+4,2+8,2+8,10+9,10+9,1 & sb,1 & sb,1 & sb,1 & sb +branch,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb +decoupled,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb +decoupled_agu,1+5,1+5,1+5,1+5,1+5,none,none,none,1+5,1+5,1+5,1+5,1+5,1+9,1+9,13+6,13+6,1 & sb,1 & sb,1 & sb,1 & sb diff --git a/src/nouveau/compiler/latencies/sm100/upred_raw.csv b/src/nouveau/compiler/latencies/sm100/upred_raw.csv new file mode 100644 index 00000000000..204d42ec148 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/upred_raw.csv @@ -0,0 +1,8 @@ +raw,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg +coupled,none,6,1,none,none,none,1 & sb +udp,none,4,1,none,none,none,1 & sb +voteu,none,none,none,none,none,none,none +u_guard,none,11,5,none,none,none,1 & sb +bra_jmp,none,9,2,none,none,none,1 & sb +uldc_mma,none,11,5,none,none,none,1 & sb +usetmaxreg,none,none,none,none,none,none,none diff 
--git a/src/nouveau/compiler/latencies/sm100/upred_war.csv b/src/nouveau/compiler/latencies/sm100/upred_war.csv new file mode 100644 index 00000000000..44a7a709db1 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/upred_war.csv @@ -0,0 +1,9 @@ +war,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg +coupled,none,none,none,none,none,none,none +udp,1,1,none,1,1,1,none +voteu,2,2,none,1,1,1,none +u_guard,none,none,none,none,none,none,none +bra_jmp,none,none,none,none,none,none,none +uldc_mma,none,none,none,none,none,none,none +usetmaxreg,1,1,none,1,1,1,none + diff --git a/src/nouveau/compiler/latencies/sm100/upred_waw.csv b/src/nouveau/compiler/latencies/sm100/upred_waw.csv new file mode 100644 index 00000000000..7dff740fa69 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/upred_waw.csv @@ -0,0 +1,8 @@ +waw,coupled,udp,voteu,u_guard,bra_jmp,uldc_mma,usetmaxreg +coupled,none,none,none,none,none,none,none +udp,none,1,1,none,none,none,1 & sb +voteu,none,7,1,none,none,none,1 & sb +u_guard,none,none,none,none,none,none,none +bra_jmp,none,none,none,none,none,none,none +uldc_mma,none,none,none,none,none,none,none +usetmaxreg,none,8+2,8,none,none,none,1 & sb diff --git a/src/nouveau/compiler/latencies/sm100/ureg_raw.csv b/src/nouveau/compiler/latencies/sm100/ureg_raw.csv new file mode 100644 index 00000000000..e9db8cb7e64 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/ureg_raw.csv @@ -0,0 +1,18 @@ +raw,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu +coupled,none,none,none,none,none,none,none,1 & sb,none,none,none,6,2,2,2,13,2 +coupled_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,6,2,2,2,13,2 +decoupled,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2 +branch,none,none,none,none,none,none,none,1 & sb,none,none,none,10,3,3,3,13,3 +coupled_bindless,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5 
+decoupled_bindless,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5 +hfma2_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2 +to_ur,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,13,5 +tex,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2 +tma,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2 +rpcmov_64,none,none,none,none,none,none,none,1 & sb,none,none,none,9,2,2,2,13,2 +udp,none,none,none,none,none,none,none,1 & sb,none,none,none,4,2,2,2,13,2 +uldc,none,none,none,none,none,none,none,1 & sb,none,none,none,12,5,5,5,15,5 +umov,none,none,none,none,none,none,none,1 & sb,none,none,none,7,2,2,2,13,2 +elect,none,none,none,none,none,none,none,1 & sb,none,none,none,8,2,2,2,13,2 +r2ur,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +voteu,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none diff --git a/src/nouveau/compiler/latencies/sm100/ureg_war.csv b/src/nouveau/compiler/latencies/sm100/ureg_war.csv new file mode 100644 index 00000000000..2fe5550ad44 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/ureg_war.csv @@ -0,0 +1,18 @@ +war,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu +coupled,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +coupled_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +decoupled,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +branch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +coupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +decoupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none 
+hfma2_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +to_ur,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none +tex,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +tma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +rpcmov_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +udp,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none +uldc,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none +umov,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none +elect,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none +r2ur,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,1,1,1,1,none,none +voteu,1,1,1,1,1,1,1,1 & sb,1,1 & sb,1,3,1,1,1,none,none diff --git a/src/nouveau/compiler/latencies/sm100/ureg_waw.csv b/src/nouveau/compiler/latencies/sm100/ureg_waw.csv new file mode 100644 index 00000000000..dfc152f0cd8 --- /dev/null +++ b/src/nouveau/compiler/latencies/sm100/ureg_waw.csv @@ -0,0 +1,19 @@ +waw,coupled,coupled_mma,decoupled,branch,coupled_bindless,decoupled_bindless,hfma2_mma,to_ur,tex,tma,rpcmov_64,udp,uldc,umov,elect,r2ur,voteu +coupled,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none +coupled_mma,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none +decoupled,none,none,none,none,none,none,none,1 & sb,none,none,none,none,none,none,none,none,none +branch,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +coupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +decoupled_bindless,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +hfma2_mma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +to_ur,none,none,none,none,none,none,none,1 & sb,none,none,none,4+7,4,4,4,4+10,4 
+tex,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +tma,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +rpcmov_64,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none,none +udp,none,none,none,none,none,none,none,1 & sb,none,none,none,1,1,1,1,4+5,1 +uldc,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1 +umov,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1 +elect,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1 +r2ur,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,1,1 +voteu,none,none,none,none,none,none,none,1 & sb,none,none,none,7,1,1,1,10+1,1 + diff --git a/src/nouveau/compiler/meson.build b/src/nouveau/compiler/meson.build index f5c882bb250..1d6ddc699b3 100644 --- a/src/nouveau/compiler/meson.build +++ b/src/nouveau/compiler/meson.build @@ -1,6 +1,8 @@ # Copyright © 2022 Collabora, Ltd. # SPDX-License-Identifier: MIT +subdir('latencies') + dep_paste = dependency('paste', version : '>= 1.0.14', fallback : ['paste', 'dep_paste'], @@ -114,6 +116,7 @@ _libnak_rs = static_library( link_with : [ _libnak_bindings_rs, _libnak_ir_proc_rs, + libnak_latencies_rs, ], ) diff --git a/src/nouveau/compiler/nak/lib.rs b/src/nouveau/compiler/nak/lib.rs index fb6517bb061..66a2e8e0bb7 100644 --- a/src/nouveau/compiler/nak/lib.rs +++ b/src/nouveau/compiler/nak/lib.rs @@ -27,6 +27,7 @@ mod opt_uniform_instrs; mod qmd; mod reg_tracker; mod repair_ssa; +mod sm120_instr_latencies; mod sm20; mod sm30_instr_latencies; mod sm32; diff --git a/src/nouveau/compiler/nak/sm120_instr_latencies.rs b/src/nouveau/compiler/nak/sm120_instr_latencies.rs new file mode 100644 index 00000000000..3ab2bc42aca --- /dev/null +++ b/src/nouveau/compiler/nak/sm120_instr_latencies.rs @@ -0,0 +1,552 @@ +// Copyright © 2025 Red Hat. 
+// SPDX-License-Identifier: MIT +#![allow(non_camel_case_types)] + +use crate::ir::*; + +use nak_latencies::sm100::*; + +// This contains the register scheduling information provided by NVIDIA. This +// file is for Blackwell only. +// +// These latencies come from B100 (SM100) and not the consumer RTX chips +// (SM120). We have to add some padding to get everything passing on the RTX +// chips so that's done in this file while using the sm100 CSVs. + +// Coupled instructions are ones with fixed latencies, they need delays but not +// scoreboards. Decoupled instructions are ones with variable latencies, need +// scoreboards but not delays. There are also redirected instructions which +// depending on the SM, can be coupled or Decoupled so both delays and +// scoreboards needs to be provided. + +fn op_reg_latency(op: &Op, reader: bool, op_reg_idx: usize) -> RegLatencySM100 { + use RegLatencySM100::*; + match op { + // this will need updating if imad grows support for input predicates + Op::IMad(_) | Op::IMul(_) => Fma, + Op::IMad64(_) => { + if reader { + match op_reg_idx { + 0 | 1 => ImadWideReadAb, + 2 => ImadWideReadCl, // vs upper C operand - work it out + _ => { + panic!("Illegal field in imadwide") + } + } + } else { + ImadWideWriteDh // as above this needs more work + } + } + + Op::PopC(_) => Decoupled, + Op::IAdd3(_) | Op::IAdd3X(_) => Alu, + + Op::BMsk(_) => Alu, + // Sgxt => Alu, + Op::Lop3(_) => Alu, + Op::Flo(_) => Decoupled, + Op::ISetP(_) => Dualalu, + Op::IAbs(_) => Alu, + Op::Lea(_) => Alu, + Op::LeaX(_) => Alu, + Op::IMnMx(_) => Dualalu, + Op::I2I(_) => Alu, + // I2IP => alu + Op::Shf(_) => Alu, + + Op::F2FP(_) => Alu, + Op::FFma(_) => Fma, + Op::FAdd(_) => Fma, + Op::FMul(_) => Fma, + Op::FMnMx(_) => Dualalu, + Op::FSwzAdd(_) => Fma, + Op::FSet(_) => Dualalu, + // FSel => Alu, + Op::FSetP(_) => Dualalu, + // FChk => Decoupled, + Op::DAdd(_) | Op::DFma(_) | Op::DMul(_) | Op::DSetP(_) => { + RedirectedFp64 + } + + Op::DMnMx(_) => RedirectedFp64, // 
not in docs + + Op::HAdd2(hadd2) => { + if hadd2.f32 { + Fp16F32 + } else { + Fp16 + } + } + Op::HFma2(_) | Op::HMul2(_) => Fp16, + + Op::HSet2(_) | Op::HSetP2(_) | Op::HMnMx2(_) => Fp16Alu, + Op::Hmma(_) => Hmma, + Op::Ipa(_) => DecoupledAgu, + Op::MuFu(_) => Decoupled, + + // Conversion functions all Decoupled + Op::F2F(_) => Decoupled, + Op::F2I(_) => Decoupled, + Op::I2F(_) => Decoupled, + Op::FRnd(_) => Decoupled, + Op::AL2P(_) => Decoupled, + + Op::Mov(_) => Dualalu, + Op::Sel(_) => Dualalu, + Op::BRev(_) => Decoupled, + // P2R => Alu, + // R2P => Alu, + Op::PLop3(_) => Alu, + Op::Prmt(_) => Alu, + Op::Nop(_) => Disp64, + Op::Vote(_) => Dualalu, + Op::Match(_) => Decoupled, + Op::S2R(_) => DecoupledAgu, + Op::R2UR(_) => Alu, + Op::Redux(_) => { + if reader { + Decoupled + } else { + panic!("Illegal R2UR"); + } + } + Op::CS2R(cs2r) => { + if cs2r.dst.as_reg().unwrap().comps() == 2 { + Disp64 + } else { + Dualalu + } + } + // B2R => DecoupledAgu, + // LEPC => Disp64 + Op::BMov(bmov) => match bmov.dst { + Dst::Reg(_) => Branch, + _ => Branch, + }, + // RPCMOV.32 => Alu, + // RPCMOV.64 => Disp64 + // PMTRIG => Disp64 + // CSMTEST => Alu, + Op::Bar(_) => DecoupledAgu, + Op::Imma(_) => Imma, + Op::IDp4(_) => Fma, + Op::BClear(_) => Decoupled, + Op::Bra(_) => Decoupled, + Op::BSSy(_) => Decoupled, + Op::Kill(_) => Decoupled, + Op::Exit(_) => Decoupled, + Op::BSync(_) => Decoupled, + Op::Tex(_) => Decoupled, + Op::Tld(_) => Decoupled, + Op::Tld4(_) => Decoupled, + Op::Tmml(_) => Decoupled, + Op::Txd(_) => Decoupled, + Op::Txq(_) => Decoupled, + Op::Ldc(_) => Decoupled, + Op::ALd(_) => DecoupledAgu, + Op::ASt(_) => DecoupledAgu, + Op::Out(_) => DecoupledAgu, + Op::OutFinal(_) => DecoupledAgu, + Op::Ld(_) => DecoupledAgu, + Op::St(_) => DecoupledAgu, + Op::Atom(_) => DecoupledAgu, + //CCtl.i,c are coupled + Op::CCtl(_) => DecoupledAgu, + Op::MemBar(_) => Decoupled, + Op::SuLd(_) => Decoupled, + Op::SuSt(_) => Decoupled, + Op::SuAtom(_) => Decoupled, + Op::PixLd(_) => 
DecoupledAgu,
        Op::Isberd(_) => DecoupledAgu,
        Op::LdTram(_) => DecoupledAgu,
        Op::Shfl(_) => DecoupledAgu,
        //Op::LdSm(_) => DecoupledAgu
        x => {
            panic!("Illegal instuction in reg category {}", x);
        }
    }
}

/// Maps `op` to its latency class in the predicate register table.
///
/// Arms are grouped by resulting class; every pattern is a distinct `Op`
/// variant, so arm order does not affect the result.
///
/// # Panics
///
/// Panics if `op` has no arm below.
fn op_pred_latency(op: &Op) -> PredLatencySM100 {
    use PredLatencySM100::*;
    match op {
        Op::DSetP(_) => RedirectedFp64,
        Op::HFma2(_) | Op::HMnMx2(_) | Op::HSetP2(_) => Fp16,
        Op::IMad(_) | Op::IMad64(_) | Op::IMul(_) => Fma,
        Op::FMnMx(_)
        | Op::FSetP(_)
        | Op::IMnMx(_)
        | Op::ISetP(_)
        | Op::Sel(_) => Dualalu,
        Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::PLop3(_)
        | Op::PSetP(_) => Coupled,
        Op::R2UR(_) => R2Ur,
        Op::Vote(_) => DispDualAlu,
        Op::Atom(_)
        | Op::Ipa(_)
        | Op::Ld(_)
        | Op::PixLd(_)
        | Op::Shfl(_)
        | Op::SuLd(_)
        | Op::SuSt(_)
        | Op::Tex(_)
        | Op::Tld(_)
        | Op::Tld4(_)
        | Op::Tmml(_)
        | Op::Txd(_)
        | Op::Txq(_)
        | Op::Match(_) => Decoupled,
        _ => panic!("Illegal op in sm120 pred latency {}", op),
    }
}

/// Maps `op` to its latency class in the uniform register table.
///
/// * `reader` - `true` when `op` is being classified as the reader of the
///   register, `false` when it is the writer.
/// * `op_reg_idx` - index into `op.srcs_as_slice()`; it is only looked at
///   when `reader` is `true`.
///
/// # Panics
///
/// Panics if `op` has no arm below, or if `R2UR`/`Redux` is classified as a
/// reader.
fn op_ureg_latency(
    op: &Op,
    reader: bool,
    op_reg_idx: usize,
) -> UregLatencySM100 {
    use UregLatencySM100::*;

    // A reader whose source is a bindless cbuf uses the *Bindless classes.
    // `reader &&` short-circuits, so writers never index the source slice.
    let bindless = reader && op.srcs_as_slice()[op_reg_idx].is_bindless_cbuf();

    // A ureg can be accessed by a U* instruction or by a regular one.
    // Uniform ops override both base classes with Udp.
    let uniform_op = op.is_uniform();

    let coupled = if uniform_op {
        Udp
    } else if bindless {
        CoupledBindless
    } else {
        Coupled
    };
    let decoupled = if uniform_op {
        Udp
    } else if bindless {
        DecoupledBindless
    } else {
        Decoupled
    };

    // Notes carried over for uniform ops that have no arm yet:
    //   uclea?
    //   mov32i => uldc
    //   p2ur => udp
    //   UR2UP
    //   SGXT
    match op {
        Op::BMsk(_)
        | Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::IAbs(_)
        | Op::IDp4(_)
        | Op::IMnMx(_)
        | Op::IMad(_)
        | Op::IMad64(_)
        | Op::ISetP(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::Lop2(_)
        | Op::Lop3(_)
        | Op::PLop3(_)
        | Op::Prmt(_)
        | Op::PSetP(_)
        | Op::Sel(_)
        | Op::Shf(_) => coupled,

        Op::F2FP(_)
        | Op::FAdd(_)
        | Op::FMul(_)
        | Op::FFma(_)
        | Op::FSet(_)
        | Op::FSetP(_)
        | Op::FMnMx(_)
        | Op::HAdd2(_)
        | Op::HMul2(_)
        | Op::HSet2(_)
        | Op::HFma2(_)
        | Op::HMnMx2(_)
        | Op::HSetP2(_) => coupled,

        Op::BRev(_)
        | Op::Flo(_)
        | Op::MuFu(_)
        | Op::Shfl(_)
        | Op::I2F(_)
        | Op::F2I(_)
        | Op::F2F(_)
        | Op::FRnd(_) => decoupled,

        Op::DMul(_) | Op::DFma(_) | Op::DAdd(_) | Op::DSetP(_) => decoupled,

        Op::Ldc(_) => {
            if uniform_op {
                ToUr
            } else {
                decoupled
            }
        }
        Op::Mov(_) => {
            if uniform_op {
                Umov
            } else {
                coupled
            }
        }
        Op::PopC(_) => {
            if uniform_op {
                coupled
            } else {
                decoupled
            }
        }

        // R2UR and Redux are only valid here as writers.
        Op::R2UR(_) => {
            if reader {
                panic!("Illegal R2UR in ureg");
            }
            R2Ur
        }
        Op::Redux(_) => {
            if reader {
                panic!("Illegal Redux in ureg");
            }
            ToUr
        }

        Op::Vote(_) => Voteu,
        Op::S2R(_) => ToUr,
        Op::Tex(_) | Op::Tld(_) | Op::Tld4(_) | Op::Txq(_) => Tex,

        _ => panic!("Illegal instruction in ureg category {}", op),
    }
}

/// Maps `op` to its latency class in the uniform predicate table.
///
/// # Panics
///
/// Panics if `op` has no arm below, or if a non-uniform `Vote` is passed.
fn op_upred_latency(op: &Op) -> UpredLatencySM100 {
    use UpredLatencySM100::*;
    let uniform_op = op.is_uniform();
    match op {
        Op::BMsk(_)
        | Op::BRev(_)
        | Op::Flo(_)
        | Op::IAdd3(_)
        | Op::IAdd3X(_)
        | Op::IMad(_)
        | Op::ISetP(_)
        | Op::Lea(_)
        | Op::LeaX(_)
        | Op::Lop3(_)
        | Op::Mov(_) => Udp,
        Op::Ldc(_) => UldcMma,
        // These three pick their class purely on whether the op is uniform.
        Op::PLop3(_) | Op::PSetP(_) | Op::Sel(_) => {
            if uniform_op {
                Udp
            } else {
                Coupled
            }
        }
        Op::Vote(_) => {
            if !uniform_op {
                panic!("Illegal Vote in upred");
            }
            Voteu
        }
        _ => panic!("Illegal instruction in upred category {}", op),
    }
}

/// Returns the register file of destination `dst_idx` of `op`, or `None`
/// when that destination is `Dst::None`.
///
/// Panics (via `unwrap`) if an SSA destination reports no file, and on an
/// out-of-range `dst_idx`.
fn dst_reg_file(op: &Op, dst_idx: usize) -> Option<RegFile> {
    match &op.dsts_as_slice()[dst_idx] {
        Dst::None => None,
        Dst::SSA(vec) => Some(vec.file().unwrap()),
        Dst::Reg(reg) => Some(reg.file()),
    }
}

/// Latency queries for SM120, built on the generated `*LatencySM100`
/// tables plus the fixed adjustments applied below.
pub struct SM120Latency {}

impl SM120Latency {
    /// Returns `true` when the class of `op` as a writer is one of the
    /// classes listed below.
    ///
    /// Uniform ops are classified with the ureg table, all others with the
    /// GPR table; both lookups use destination index 0.
    pub fn needs_scoreboards(op: &Op) -> bool {
        if op.is_uniform() {
            matches!(
                op_ureg_latency(op, false, 0),
                UregLatencySM100::Uldc
                    | UregLatencySM100::ToUr
                    | UregLatencySM100::Tex
            )
        } else {
            matches!(
                op_reg_latency(op, false, 0),
                RegLatencySM100::Dmma
                    | RegLatencySM100::Hmma
                    | RegLatencySM100::RedirectedFp64
                    | RegLatencySM100::Branch
                    | RegLatencySM100::Decoupled
                    | RegLatencySM100::DecoupledAgu
            )
        }
    }

    /// Read-after-write latency between destination `dst_idx` of `write`
    /// and source `src_idx` of `read`.
    ///
    /// When `read` is `None`, a fixed reader class is substituted per
    /// register file (see each branch). Returns 0 for a `Dst::None`
    /// destination and for barrier registers.
    ///
    /// # Panics
    ///
    /// Panics with "Not a register" for any other register file, and
    /// whenever one of the classification helpers rejects an op.
    pub fn raw(
        write: &Op,
        dst_idx: usize,
        read: Option<&Op>,
        src_idx: usize,
    ) -> u32 {
        let dst_file = match dst_reg_file(write, dst_idx) {
            Some(file) => file,
            None => return 0,
        };

        match dst_file {
            RegFile::GPR => {
                let write_latency = op_reg_latency(write, false, dst_idx);
                let read_latency = match read {
                    Some(op) => op_reg_latency(op, true, src_idx),
                    None => RegLatencySM100::RedirectedFp64,
                };
                // The tables come from the SM100 docs, but some chips need
                // larger values, so the result is padded here: more when
                // either side is HMMA, a little otherwise.
                let hmma = write_latency == RegLatencySM100::Hmma
                    || read_latency == RegLatencySM100::Hmma;
                let extra = if hmma { 9 } else { 1 };
                RegLatencySM100::raw(write_latency, read_latency, false)
                    + extra
            }
            RegFile::UGPR => {
                let write_latency = op_ureg_latency(write, false, dst_idx);
                let read_latency = match read {
                    Some(op) => op_ureg_latency(op, true, src_idx),
                    None => UregLatencySM100::Uldc,
                };
                UregLatencySM100::raw(write_latency, read_latency, false) + 1
            }
            RegFile::Pred => {
                let write_latency = op_pred_latency(write);
                let read_latency = match read {
                    Some(op) => op_pred_latency(op),
                    None => PredLatencySM100::RedirectedFp64,
                };
                PredLatencySM100::raw(write_latency, read_latency, false) + 1
            }
            RegFile::UPred => {
                let write_latency = op_upred_latency(write);
                let read_latency = match read {
                    Some(op) => op_upred_latency(op),
                    None => UpredLatencySM100::UGuard,
                };
                UpredLatencySM100::raw(write_latency, read_latency, false)
                    + 1
            }
            RegFile::Bar => 0, // Barriers have a HW scoreboard
            _ => panic!("Not a register"),
        }
    }

    /// Write-after-read latency between source `src_idx` of `read` and
    /// destination `dst_idx` of `write`.
    ///
    /// Returns 0 for a `Dst::None` destination.
    ///
    /// # Panics
    ///
    /// Panics with "Not a register" for any file other than GPR, UGPR,
    /// Pred and UPred, and whenever a classification helper rejects an op.
    // NOTE(review): unlike `raw`, there is no `RegFile::Bar` arm here, so a
    // barrier destination panics. Confirm callers never pass one.
    pub fn war(read: &Op, src_idx: usize, write: &Op, dst_idx: usize) -> u32 {
        let dst_file = match dst_reg_file(write, dst_idx) {
            Some(file) => file,
            None => return 0,
        };

        match dst_file {
            RegFile::GPR => {
                let write_latency = op_reg_latency(write, false, dst_idx);
                let read_latency = op_reg_latency(read, true, src_idx);

                // HMMA on either side gets extra padding over the table.
                let hmma = write_latency == RegLatencySM100::Hmma
                    || read_latency == RegLatencySM100::Hmma;
                let extra = if hmma { 7 } else { 0 };
                RegLatencySM100::war(read_latency, write_latency, false)
                    + extra
            }
            RegFile::UGPR => {
                let write_latency = op_ureg_latency(write, false, dst_idx);
                let read_latency = op_ureg_latency(read, true, src_idx);
                UregLatencySM100::war(read_latency, write_latency, false)
            }
            RegFile::Pred => {
                let write_latency = op_pred_latency(write);
                let read_latency = op_pred_latency(read);
                PredLatencySM100::war(read_latency, write_latency, false)
            }
            RegFile::UPred => {
                let write_latency = op_upred_latency(write);
                let read_latency = op_upred_latency(read);
                UpredLatencySM100::war(read_latency, write_latency, false)
            }
            _ => panic!("Not a register"),
        }
    }

    /// Write-after-write latency between destination `a_dst_idx` of `a`
    /// and destination `b_dst_idx` of `b`.
    ///
    /// The register file is taken from `a`'s destination. `a_op_pred` is
    /// forwarded to the generated table as its `has_pred` argument.
    /// Returns 0 when `a`'s destination is `Dst::None`.
    ///
    /// # Panics
    ///
    /// Panics with "Not a register" for any file other than GPR, UGPR,
    /// Pred and UPred, and whenever a classification helper rejects an op.
    pub fn waw(
        a: &Op,
        a_dst_idx: usize,
        b: &Op,
        b_dst_idx: usize,
        a_op_pred: bool,
    ) -> u32 {
        let dst_file = match dst_reg_file(a, a_dst_idx) {
            Some(file) => file,
            None => return 0,
        };

        match dst_file {
            RegFile::GPR => {
                let write1_latency = op_reg_latency(a, false, a_dst_idx);
                let write2_latency = op_reg_latency(b, false, b_dst_idx);

                // HMMA on either side gets extra padding over the table.
                let hmma = write1_latency == RegLatencySM100::Hmma
                    || write2_latency == RegLatencySM100::Hmma;
                let extra = if hmma { 7 } else { 0 };
                RegLatencySM100::waw(
                    write1_latency,
                    write2_latency,
                    a_op_pred,
                ) + extra
            }
            RegFile::UGPR => {
                let write1_latency = op_ureg_latency(a, false, a_dst_idx);
                let write2_latency = op_ureg_latency(b, false, b_dst_idx);
                UregLatencySM100::waw(write1_latency, write2_latency, a_op_pred)
            }
            RegFile::Pred => {
                let write1_latency = op_pred_latency(a);
                let write2_latency = op_pred_latency(b);
                PredLatencySM100::waw(write1_latency, write2_latency, a_op_pred)
            }
            RegFile::UPred => {
                let write1_latency = op_upred_latency(a);
                let write2_latency = op_upred_latency(b);
                // NOTE(review): this passes a literal `false` where the
                // other files forward `a_op_pred`. Left as-is; confirm it
                // is intentional.
                UpredLatencySM100::waw(write1_latency, write2_latency, false)
            }
            _ => panic!("Not a register"),
        }
    }
}
diff --git a/src/nouveau/compiler/nak/sm70.rs b/src/nouveau/compiler/nak/sm70.rs
index 2fbf7a05706..165735222fa 100644
--- a/src/nouveau/compiler/nak/sm70.rs
+++ b/src/nouveau/compiler/nak/sm70.rs
@@ -3,10 +3,10 @@
 use crate::ir::*;
 use crate::legalize::LegalizeBuilder;
+use crate::sm120_instr_latencies::SM120Latency;
 use
crate::sm70_encode::*; use crate::sm75_instr_latencies::SM75Latency; use crate::sm80_instr_latencies::SM80Latency; - pub struct ShaderModel70 { sm: u8, } @@ -154,7 +154,9 @@ impl ShaderModel for ShaderModel70 { return false; } - if self.is_ampere() || self.is_ada() { + if self.is_blackwell() { + SM120Latency::needs_scoreboards(op) + } else if self.is_ampere() || self.is_ada() { SM80Latency::needs_scoreboards(op) } else if self.is_turing() { SM75Latency::needs_scoreboards(op) @@ -188,7 +190,9 @@ impl ShaderModel for ShaderModel70 { read: &Op, src_idx: usize, ) -> u32 { - if self.is_ampere() || self.is_ada() { + if self.is_blackwell() { + SM120Latency::raw(write, dst_idx, Some(read), src_idx) + } else if self.is_ampere() || self.is_ada() { SM80Latency::raw(write, dst_idx, Some(read), src_idx) } else if self.is_turing() { SM75Latency::raw(write, dst_idx, Some(read), src_idx) @@ -204,7 +208,9 @@ impl ShaderModel for ShaderModel70 { write: &Op, dst_idx: usize, ) -> u32 { - if self.is_ampere() || self.is_ada() { + if self.is_blackwell() { + SM120Latency::war(read, src_idx, write, dst_idx) + } else if self.is_ampere() || self.is_ada() { SM80Latency::war(read, src_idx, write, dst_idx) } else if self.is_turing() { SM75Latency::war(read, src_idx, write, dst_idx) @@ -223,7 +229,9 @@ impl ShaderModel for ShaderModel70 { b: &Op, b_dst_idx: usize, ) -> u32 { - if self.is_ampere() || self.is_ada() { + if self.is_blackwell() { + SM120Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred) + } else if self.is_ampere() || self.is_ada() { SM80Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred) } else if self.is_turing() { SM75Latency::waw(a, a_dst_idx, b, b_dst_idx, a_has_pred) @@ -235,7 +243,9 @@ impl ShaderModel for ShaderModel70 { } fn paw_latency(&self, write: &Op, dst_idx: usize) -> u32 { - if self.is_ampere() || self.is_ada() { + if self.is_blackwell() { + SM120Latency::raw(write, dst_idx, None, 0) + } else if self.is_ampere() || self.is_ada() { SM80Latency::raw(write, dst_idx, 
None, 0) } else if self.is_turing() { SM75Latency::raw(write, dst_idx, None, 0) @@ -250,7 +260,9 @@ impl ShaderModel for ShaderModel70 { } fn worst_latency(&self, write: &Op, dst_idx: usize) -> u32 { - if self.is_ampere() || self.is_ada() { + if self.is_blackwell() { + SM120Latency::raw(write, dst_idx, None, 0) + } else if self.is_ampere() || self.is_ada() { SM80Latency::raw(write, dst_idx, None, 0) } else if self.is_turing() { SM75Latency::raw(write, dst_idx, None, 0) diff --git a/src/nouveau/headers/meson.build b/src/nouveau/headers/meson.build index 74b21b7df35..9469b8917c7 100644 --- a/src/nouveau/headers/meson.build +++ b/src/nouveau/headers/meson.build @@ -77,6 +77,9 @@ executable( install : with_tools.contains('nouveau'), ) +nouveau_util_py = files('util.py') +nouveau_util_py_path = meson.current_source_dir() + # Only generate Rust bindings for NVK if with_nouveau_vk cl_rs_generated = []