i965: Add a debug flag for counting cycles spent in each compiled shader.
This can be used for two purposes: Using hand-coded shaders to determine per-instruction timings, or figuring out which shader to optimize in a whole application. Note that this doesn't cover the instructions that set up the message to the URB/FB write -- we'd need to convert the MRF usage in these instructions to GRFs so that our offsets/times don't overwrite our shader outputs. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> (v1) v2: Check the timestamp reset flag in the VS, which is apparently getting set fairly regularly in the range we watch, resulting in negative numbers getting added to our 32-bit counter, and thus large values added to our uint64_t. v3: Rebase on reladdr changes, removing a new safety check that proved impossible to satisfy. Add a comment to the AOP defs from Ken's review, and put them in a slightly more sensible spot. v4: Check timestamp reset in the FS as well.
This commit is contained in:
@@ -383,6 +383,9 @@ brwCreateContext(int api,
|
||||
|
||||
brw_fs_alloc_reg_sets(brw);
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
||||
brw_init_shader_time(brw);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -559,14 +559,15 @@ struct brw_vs_prog_data {
|
||||
#define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1)
|
||||
#define SURF_INDEX_TEXTURE(t) (BRW_MAX_DRAW_BUFFERS + 2 + (t))
|
||||
#define SURF_INDEX_WM_UBO(u) (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + u)
|
||||
|
||||
#define SURF_INDEX_WM_SHADER_TIME (SURF_INDEX_WM_UBO(12))
|
||||
/** Maximum size of the binding table. */
|
||||
#define BRW_MAX_WM_SURFACES SURF_INDEX_WM_UBO(BRW_MAX_WM_UBOS)
|
||||
#define BRW_MAX_WM_SURFACES (SURF_INDEX_WM_SHADER_TIME + 1)
|
||||
|
||||
#define SURF_INDEX_VERT_CONST_BUFFER (0)
|
||||
#define SURF_INDEX_VS_TEXTURE(t) (SURF_INDEX_VERT_CONST_BUFFER + 1 + (t))
|
||||
#define SURF_INDEX_VS_UBO(u) (SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT) + u)
|
||||
#define BRW_MAX_VS_SURFACES SURF_INDEX_VS_UBO(BRW_MAX_VS_UBOS)
|
||||
#define SURF_INDEX_VS_SHADER_TIME (SURF_INDEX_VS_UBO(12))
|
||||
#define BRW_MAX_VS_SURFACES (SURF_INDEX_VS_SHADER_TIME + 1)
|
||||
|
||||
#define SURF_INDEX_SOL_BINDING(t) ((t))
|
||||
#define BRW_MAX_GS_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
|
||||
@@ -651,6 +652,13 @@ struct brw_tracked_state {
|
||||
void (*emit)( struct brw_context *brw );
|
||||
};
|
||||
|
||||
/**
 * Tags a slot of the shader_time tracking arrays with the kind of compiled
 * shader that owns it.
 *
 * ST_NONE is deliberately zero so that a zero-initialized slot reads as
 * "no shader"; the report prints such slots as "other".
 */
enum shader_time_shader_type {
   ST_NONE = 0, /* slot not assigned to a known shader kind */
   ST_VS   = 1, /* vertex shader */
   ST_FS8  = 2, /* fragment shader compiled with dispatch width 8 */
   ST_FS16 = 3, /* fragment shader compiled with dispatch width 16 */
};
|
||||
|
||||
/* Flags for brw->state.cache.
|
||||
*/
|
||||
#define CACHE_NEW_BLEND_STATE (1<<BRW_BLEND_STATE)
|
||||
@@ -1089,6 +1097,16 @@ struct brw_context
|
||||
|
||||
uint32_t num_instances;
|
||||
int basevertex;
|
||||
|
||||
struct {
|
||||
drm_intel_bo *bo;
|
||||
struct gl_shader_program **programs;
|
||||
enum shader_time_shader_type *types;
|
||||
uint64_t *cumulative;
|
||||
int num_entries;
|
||||
int max_entries;
|
||||
double report_time;
|
||||
} shader_time;
|
||||
};
|
||||
|
||||
/*======================================================================
|
||||
@@ -1144,7 +1162,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
|
||||
int brw_get_scratch_size(int size);
|
||||
void brw_get_scratch_bo(struct intel_context *intel,
|
||||
drm_intel_bo **scratch_bo, int size);
|
||||
|
||||
void brw_init_shader_time(struct brw_context *brw);
|
||||
void brw_collect_and_report_shader_time(struct brw_context *brw);
|
||||
void brw_destroy_shader_time(struct brw_context *brw);
|
||||
|
||||
/* brw_urb.c
|
||||
*/
|
||||
|
||||
@@ -665,6 +665,8 @@ enum opcode {
|
||||
SHADER_OPCODE_TXS,
|
||||
FS_OPCODE_TXB,
|
||||
|
||||
SHADER_OPCODE_SHADER_TIME_ADD,
|
||||
|
||||
FS_OPCODE_DDX,
|
||||
FS_OPCODE_DDY,
|
||||
FS_OPCODE_PIXEL_X,
|
||||
@@ -731,6 +733,8 @@ enum opcode {
|
||||
#define BRW_ARF_CONTROL 0x80
|
||||
#define BRW_ARF_NOTIFICATION_COUNT 0x90
|
||||
#define BRW_ARF_IP 0xA0
|
||||
#define BRW_ARF_TDR 0xB0
|
||||
#define BRW_ARF_TIMESTAMP 0xC0
|
||||
|
||||
#define BRW_MRF_COMPR4 (1 << 7)
|
||||
|
||||
@@ -913,6 +917,23 @@ enum brw_message_target {
|
||||
#define GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 10
|
||||
#define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ 3
|
||||
|
||||
/* dataport atomic operations. */
|
||||
#define BRW_AOP_AND 1
|
||||
#define BRW_AOP_OR 2
|
||||
#define BRW_AOP_XOR 3
|
||||
#define BRW_AOP_MOV 4
|
||||
#define BRW_AOP_INC 5
|
||||
#define BRW_AOP_DEC 6
|
||||
#define BRW_AOP_ADD 7
|
||||
#define BRW_AOP_SUB 8
|
||||
#define BRW_AOP_REVSUB 9
|
||||
#define BRW_AOP_IMAX 10
|
||||
#define BRW_AOP_IMIN 11
|
||||
#define BRW_AOP_UMAX 12
|
||||
#define BRW_AOP_UMIN 13
|
||||
#define BRW_AOP_CMPWR 14
|
||||
#define BRW_AOP_PREDEC 15
|
||||
|
||||
#define BRW_MATH_FUNCTION_INV 1
|
||||
#define BRW_MATH_FUNCTION_LOG 2
|
||||
#define BRW_MATH_FUNCTION_EXP 3
|
||||
@@ -960,8 +981,6 @@ enum brw_message_target {
|
||||
#define BRW_SCRATCH_SPACE_SIZE_2M 11
|
||||
|
||||
|
||||
|
||||
|
||||
#define CMD_URB_FENCE 0x6000
|
||||
#define CMD_CS_URB_STATE 0x6001
|
||||
#define CMD_CONST_BUFFER 0x6002
|
||||
|
||||
@@ -200,7 +200,7 @@ static INLINE struct brw_reg brw_reg( GLuint file,
|
||||
else if (file == BRW_MESSAGE_REGISTER_FILE)
|
||||
assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
|
||||
else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
|
||||
assert(nr <= BRW_ARF_IP);
|
||||
assert(nr <= BRW_ARF_TIMESTAMP);
|
||||
|
||||
reg.type = type;
|
||||
reg.file = file;
|
||||
@@ -1006,6 +1006,10 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
|
||||
int num_regs,
|
||||
GLuint offset);
|
||||
|
||||
void brw_shader_time_add(struct brw_compile *p,
|
||||
int mrf,
|
||||
uint32_t surf_index);
|
||||
|
||||
/* If/else/endif. Works by manipulating the execution flags on each
|
||||
* channel.
|
||||
*/
|
||||
|
||||
@@ -253,7 +253,6 @@ brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
|
||||
assert(!reg.negate);
|
||||
assert(!reg.abs);
|
||||
assert(reg.address_mode == BRW_ADDRESS_DIRECT);
|
||||
assert(reg.vstride != BRW_VERTICAL_STRIDE_0);
|
||||
}
|
||||
|
||||
validate_reg(insn, reg);
|
||||
@@ -332,7 +331,8 @@ void brw_set_src1(struct brw_compile *p,
|
||||
{
|
||||
assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
|
||||
|
||||
assert(reg.nr < 128);
|
||||
if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
|
||||
assert(reg.nr < 128);
|
||||
|
||||
gen7_convert_mrf_to_grf(p, &reg);
|
||||
|
||||
@@ -2448,3 +2448,55 @@ brw_svb_write(struct brw_compile *p,
|
||||
0, /* end_of_thread */
|
||||
send_commit_msg); /* send_commit_msg */
|
||||
}
|
||||
|
||||
/**
|
||||
* This instruction is generated as a single-channel align1 instruction by
|
||||
* both the VS and FS stages when using INTEL_DEBUG=shader_time.
|
||||
*
|
||||
* We can't use the typed atomic op in the FS because that has the execution
|
||||
* mask ANDed with the pixel mask, but we just want to write the one dword for
|
||||
* all the pixels.
|
||||
*
|
||||
* We don't use the SIMD4x2 atomic ops in the VS because want to just write
|
||||
* one u32. So we use the same untyped atomic write message as the pixel
|
||||
* shader.
|
||||
*
|
||||
* The untyped atomic operation requires a BUFFER surface type with RAW
|
||||
* format, and is only accessible through the legacy DATA_CACHE dataport
|
||||
* messages.
|
||||
*/
|
||||
void brw_shader_time_add(struct brw_compile *p,
|
||||
int base_mrf,
|
||||
uint32_t surf_index)
|
||||
{
|
||||
struct intel_context *intel = &p->brw->intel;
|
||||
assert(intel->gen >= 7);
|
||||
|
||||
brw_push_insn_state(p);
|
||||
brw_set_access_mode(p, BRW_ALIGN_1);
|
||||
brw_set_mask_control(p, BRW_MASK_DISABLE);
|
||||
struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
|
||||
brw_pop_insn_state(p);
|
||||
|
||||
/* We use brw_vec1_reg and unmasked because we want to increment the given
|
||||
* offset only once.
|
||||
*/
|
||||
brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
|
||||
BRW_ARF_NULL, 0));
|
||||
brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
|
||||
base_mrf, 0));
|
||||
|
||||
bool header_present = false;
|
||||
bool eot = false;
|
||||
uint32_t mlen = 2; /* offset, value */
|
||||
uint32_t rlen = 0;
|
||||
brw_set_message_descriptor(p, send,
|
||||
GEN7_SFID_DATAPORT_DATA_CACHE,
|
||||
mlen, rlen, header_present, eot);
|
||||
|
||||
send->bits3.ud |= 6 << 14; /* untyped atomic op */
|
||||
send->bits3.ud |= 0 << 13; /* no return data */
|
||||
send->bits3.ud |= 1 << 12; /* SIMD8 mode */
|
||||
send->bits3.ud |= BRW_AOP_ADD << 8;
|
||||
send->bits3.ud |= surf_index << 0;
|
||||
}
|
||||
|
||||
@@ -459,6 +459,118 @@ fs_visitor::type_size(const struct glsl_type *type)
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Emits a MOV that copies the architecture timestamp register into a fresh
 * virtual register and returns that register, smeared to field 0.
 *
 * Only valid on gen7+ (asserted).
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg timestamp(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                        BRW_ARF_TIMESTAMP,
                                        0),
                           BRW_REGISTER_TYPE_UD));

   fs_reg result(this, glsl_type::uint_type);

   /* The copy must capture the 3 fields we care about (mostly field 0, but
    * also 2) regardless of which channels the dispatch enabled, hence the
    * forced writemask and the uncompressed execution.
    */
   fs_inst *copy = emit(MOV(result, timestamp));
   copy->force_writemask_all = true;
   copy->force_uncompressed = true;

   /* Hand back field 0, the low 32 bits of the timestamp.  Since it's
    * running at the GPU clock rate of ~1.2ghz, it will roll over every ~3
    * seconds, which is plenty of time for our purposes.  It is identical
    * across the EUs, but since it's tracking GPU core speed it will
    * increment at a varying rate as render P-states change.
    *
    * A caller can also detect that render P-states changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking
    * whether that field is != 0.
    */
   result.smear = 0;

   return result;
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_shader_time_begin()
|
||||
{
|
||||
current_annotation = "shader time start";
|
||||
shader_start_time = get_timestamp();
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_shader_time_end()
|
||||
{
|
||||
current_annotation = "shader time end";
|
||||
|
||||
enum shader_time_shader_type type;
|
||||
if (dispatch_width == 8) {
|
||||
type = ST_FS8;
|
||||
} else {
|
||||
assert(dispatch_width == 16);
|
||||
type = ST_FS16;
|
||||
}
|
||||
|
||||
emit_shader_time_write(type, shader_start_time, get_timestamp());
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
|
||||
fs_reg start, fs_reg end)
|
||||
{
|
||||
/* Choose an index in the buffer and set up tracking information for our
|
||||
* printouts.
|
||||
*/
|
||||
int shader_time_index = brw->shader_time.num_entries++;
|
||||
assert(shader_time_index <= brw->shader_time.max_entries);
|
||||
brw->shader_time.types[shader_time_index] = type;
|
||||
if (prog) {
|
||||
_mesa_reference_shader_program(ctx,
|
||||
&brw->shader_time.programs[shader_time_index],
|
||||
prog);
|
||||
}
|
||||
|
||||
/* Check that there weren't any timestamp reset events (assuming these
|
||||
* were the only two timestamp reads that happened).
|
||||
*/
|
||||
fs_reg reset = end;
|
||||
reset.smear = 2;
|
||||
fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
|
||||
test->conditional_mod = BRW_CONDITIONAL_Z;
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
|
||||
push_force_uncompressed();
|
||||
start.negate = true;
|
||||
fs_reg diff = fs_reg(this, glsl_type::uint_type);
|
||||
emit(ADD(diff, start, end));
|
||||
|
||||
/* If there were no instructions between the two timestamp gets, the diff
|
||||
* is 2 cycles. Remove that overhead, so I can forget about that when
|
||||
* trying to determine the time taken for single instructions.
|
||||
*/
|
||||
emit(ADD(diff, diff, fs_reg(-2u)));
|
||||
|
||||
int base_mrf = 6;
|
||||
|
||||
fs_reg offset_mrf = fs_reg(MRF, base_mrf);
|
||||
offset_mrf.type = BRW_REGISTER_TYPE_UD;
|
||||
emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
|
||||
|
||||
fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
|
||||
time_mrf.type = BRW_REGISTER_TYPE_UD;
|
||||
emit(MOV(time_mrf, diff));
|
||||
|
||||
fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = 2;
|
||||
|
||||
pop_force_uncompressed();
|
||||
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
void
|
||||
fs_visitor::fail(const char *format, ...)
|
||||
{
|
||||
@@ -571,6 +683,8 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
|
||||
case SHADER_OPCODE_TXL:
|
||||
case SHADER_OPCODE_TXS:
|
||||
return 1;
|
||||
case SHADER_OPCODE_SHADER_TIME_ADD:
|
||||
return 0;
|
||||
case FS_OPCODE_FB_WRITE:
|
||||
return 2;
|
||||
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
||||
@@ -2295,6 +2409,9 @@ fs_visitor::run()
|
||||
if (0) {
|
||||
emit_dummy_fs();
|
||||
} else {
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
||||
emit_shader_time_begin();
|
||||
|
||||
calculate_urb_setup();
|
||||
if (intel->gen < 6)
|
||||
emit_interpolation_setup_gen4();
|
||||
@@ -2318,6 +2435,9 @@ fs_visitor::run()
|
||||
if (failed)
|
||||
return false;
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
||||
emit_shader_time_end();
|
||||
|
||||
emit_fb_writes();
|
||||
|
||||
split_virtual_grfs();
|
||||
|
||||
@@ -363,6 +363,12 @@ public:
|
||||
|
||||
void emit_color_write(int target, int index, int first_color_mrf);
|
||||
void emit_fb_writes();
|
||||
|
||||
void emit_shader_time_begin();
|
||||
void emit_shader_time_end();
|
||||
void emit_shader_time_write(enum shader_time_shader_type type,
|
||||
fs_reg start, fs_reg end);
|
||||
|
||||
bool try_rewrite_rhs_to_dst(ir_assignment *ir,
|
||||
fs_reg dst,
|
||||
fs_reg src,
|
||||
@@ -373,6 +379,8 @@ public:
|
||||
void resolve_ud_negate(fs_reg *reg);
|
||||
void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
|
||||
|
||||
fs_reg get_timestamp();
|
||||
|
||||
struct brw_reg interp_reg(int location, int channel);
|
||||
int setup_uniform_values(int loc, const glsl_type *type);
|
||||
void setup_builtin_uniform_values(ir_variable *ir);
|
||||
@@ -435,6 +443,7 @@ public:
|
||||
fs_reg pixel_w;
|
||||
fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
|
||||
fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
|
||||
fs_reg shader_start_time;
|
||||
|
||||
int grf_used;
|
||||
|
||||
|
||||
@@ -1124,6 +1124,10 @@ fs_generator::generate_code(exec_list *instructions)
|
||||
generate_mov_dispatch_to_flags();
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_SHADER_TIME_ADD:
|
||||
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
|
||||
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",
|
||||
|
||||
@@ -189,3 +189,130 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
|
||||
functions->LinkShader = brw_link_shader;
|
||||
}
|
||||
|
||||
void
|
||||
brw_init_shader_time(struct brw_context *brw)
|
||||
{
|
||||
struct intel_context *intel = &brw->intel;
|
||||
|
||||
const int max_entries = 4096;
|
||||
brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time",
|
||||
max_entries * 4, 4096);
|
||||
brw->shader_time.programs = rzalloc_array(brw, struct gl_shader_program *,
|
||||
max_entries);
|
||||
brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
|
||||
max_entries);
|
||||
brw->shader_time.cumulative = rzalloc_array(brw, uint64_t,
|
||||
max_entries);
|
||||
brw->shader_time.max_entries = max_entries;
|
||||
}
|
||||
|
||||
/**
 * qsort() comparator for an array of pointers to uint64_t counters.
 *
 * Orders ascending by the pointed-to value and returns -1, 0 or 1.  The
 * values are compared rather than subtracted because a 64-bit difference
 * does not fit in the int return value.
 */
static int
compare_time(const void *a, const void *b)
{
   const uint64_t lhs = **(uint64_t * const *) a;
   const uint64_t rhs = **(uint64_t * const *) b;

   return (lhs > rhs) - (lhs < rhs);
}
|
||||
|
||||
static void
|
||||
brw_report_shader_time(struct brw_context *brw)
|
||||
{
|
||||
if (!brw->shader_time.bo || !brw->shader_time.num_entries)
|
||||
return;
|
||||
|
||||
uint64_t *sorted[brw->shader_time.num_entries];
|
||||
double total = 0;
|
||||
for (int i = 0; i < brw->shader_time.num_entries; i++) {
|
||||
sorted[i] = &brw->shader_time.cumulative[i];
|
||||
total += brw->shader_time.cumulative[i];
|
||||
}
|
||||
|
||||
if (total == 0) {
|
||||
printf("No shader time collected yet\n");
|
||||
return;
|
||||
}
|
||||
|
||||
qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
|
||||
|
||||
printf("\n");
|
||||
printf("type ID cycles spent %% of total\n");
|
||||
for (int s = 0; s < brw->shader_time.num_entries; s++) {
|
||||
/* Work back from the sorted pointers times to a time to print. */
|
||||
int i = sorted[s] - brw->shader_time.cumulative;
|
||||
|
||||
int shader_num = -1;
|
||||
if (brw->shader_time.programs[i]) {
|
||||
shader_num = brw->shader_time.programs[i]->Name;
|
||||
}
|
||||
|
||||
switch (brw->shader_time.types[i]) {
|
||||
case ST_VS:
|
||||
printf("vs %4d: ", shader_num);
|
||||
break;
|
||||
case ST_FS8:
|
||||
printf("fs8 %4d: ", shader_num);
|
||||
break;
|
||||
case ST_FS16:
|
||||
printf("fs16 %4d: ", shader_num);
|
||||
break;
|
||||
default:
|
||||
printf("other: ");
|
||||
break;
|
||||
}
|
||||
|
||||
printf("%16lld (%7.2f Gcycles) %4.1f%%\n",
|
||||
(long long)brw->shader_time.cumulative[i],
|
||||
(double)brw->shader_time.cumulative[i] / 1000000000.0,
|
||||
(double)brw->shader_time.cumulative[i] / total * 100.0);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
brw_collect_shader_time(struct brw_context *brw)
|
||||
{
|
||||
if (!brw->shader_time.bo)
|
||||
return;
|
||||
|
||||
/* This probably stalls on the last rendering. We could fix that by
|
||||
* delaying reading the reports, but it doesn't look like it's a big
|
||||
* overhead compared to the cost of tracking the time in the first place.
|
||||
*/
|
||||
drm_intel_bo_map(brw->shader_time.bo, true);
|
||||
|
||||
uint32_t *times = brw->shader_time.bo->virtual;
|
||||
|
||||
for (int i = 0; i < brw->shader_time.num_entries; i++) {
|
||||
brw->shader_time.cumulative[i] += times[i];
|
||||
}
|
||||
|
||||
/* Zero the BO out to clear it out for our next collection.
|
||||
*/
|
||||
memset(times, 0, brw->shader_time.bo->size);
|
||||
drm_intel_bo_unmap(brw->shader_time.bo);
|
||||
}
|
||||
|
||||
void
|
||||
brw_collect_and_report_shader_time(struct brw_context *brw)
|
||||
{
|
||||
brw_collect_shader_time(brw);
|
||||
|
||||
if (brw->shader_time.report_time == 0 ||
|
||||
get_time() - brw->shader_time.report_time >= 1.0) {
|
||||
brw_report_shader_time(brw);
|
||||
brw->shader_time.report_time = get_time();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
brw_destroy_shader_time(struct brw_context *brw)
|
||||
{
|
||||
drm_intel_bo_unreference(brw->shader_time.bo);
|
||||
brw->shader_time.bo = NULL;
|
||||
}
|
||||
|
||||
@@ -26,6 +26,7 @@
|
||||
|
||||
extern "C" {
|
||||
#include "main/macros.h"
|
||||
#include "main/shaderobj.h"
|
||||
#include "program/prog_print.h"
|
||||
#include "program/prog_parameter.h"
|
||||
}
|
||||
@@ -248,6 +249,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
|
||||
return 2;
|
||||
case VS_OPCODE_SCRATCH_WRITE:
|
||||
return 3;
|
||||
case SHADER_OPCODE_SHADER_TIME_ADD:
|
||||
return 0;
|
||||
default:
|
||||
assert(!"not reached");
|
||||
return inst->mlen;
|
||||
@@ -1039,9 +1042,109 @@ vec4_visitor::setup_payload(void)
|
||||
this->first_non_payload_grf = reg;
|
||||
}
|
||||
|
||||
/**
 * Emits a MOV that copies the architecture timestamp register into a fresh
 * uvec4 virtual register and returns that register as a source.
 *
 * Only valid on gen7+ (asserted).
 */
src_reg
vec4_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   src_reg timestamp(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                             BRW_ARF_TIMESTAMP,
                             0,
                             BRW_REGISTER_TYPE_UD,
                             BRW_VERTICAL_STRIDE_0,
                             BRW_WIDTH_4,
                             BRW_HORIZONTAL_STRIDE_4,
                             BRW_SWIZZLE_XYZW,
                             WRITEMASK_XYZW));

   dst_reg result(this, glsl_type::uvec4_type);

   /* The copy must capture the 3 fields we care about (mostly field 0, but
    * also 2) even for channels the dispatch did not enable.
    */
   vec4_instruction *copy = emit(MOV(result, timestamp));
   copy->force_writemask_all = true;

   return src_reg(result);
}
|
||||
|
||||
void
|
||||
vec4_visitor::emit_shader_time_begin()
|
||||
{
|
||||
current_annotation = "shader time start";
|
||||
shader_start_time = get_timestamp();
|
||||
}
|
||||
|
||||
void
|
||||
vec4_visitor::emit_shader_time_end()
|
||||
{
|
||||
current_annotation = "shader time end";
|
||||
src_reg shader_end_time = get_timestamp();
|
||||
|
||||
emit_shader_time_write(ST_VS, shader_start_time, shader_end_time);
|
||||
}
|
||||
|
||||
void
|
||||
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
|
||||
src_reg start, src_reg end)
|
||||
{
|
||||
/* Choose an index in the buffer and set up tracking information for our
|
||||
* printouts.
|
||||
*/
|
||||
int shader_time_index = brw->shader_time.num_entries++;
|
||||
assert(shader_time_index <= brw->shader_time.max_entries);
|
||||
brw->shader_time.types[shader_time_index] = type;
|
||||
if (prog) {
|
||||
_mesa_reference_shader_program(ctx,
|
||||
&brw->shader_time.programs[shader_time_index],
|
||||
prog);
|
||||
}
|
||||
|
||||
/* Check that there weren't any timestamp reset events (assuming these
|
||||
* were the only two timestamp reads that happened).
|
||||
*/
|
||||
src_reg reset_end = end;
|
||||
reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
|
||||
vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
|
||||
test->conditional_mod = BRW_CONDITIONAL_Z;
|
||||
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
|
||||
/* Take the current timestamp and get the delta. */
|
||||
start.negate = true;
|
||||
dst_reg diff = dst_reg(this, glsl_type::uint_type);
|
||||
emit(ADD(diff, start, end));
|
||||
|
||||
/* If there were no instructions between the two timestamp gets, the diff
|
||||
* is 2 cycles. Remove that overhead, so I can forget about that when
|
||||
* trying to determine the time taken for single instructions.
|
||||
*/
|
||||
emit(ADD(diff, src_reg(diff), src_reg(-2u)));
|
||||
|
||||
int base_mrf = 6;
|
||||
|
||||
dst_reg offset_mrf = dst_reg(MRF, base_mrf);
|
||||
offset_mrf.type = BRW_REGISTER_TYPE_UD;
|
||||
emit(MOV(offset_mrf, src_reg(shader_time_index * 4)));
|
||||
|
||||
dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
|
||||
time_mrf.type = BRW_REGISTER_TYPE_UD;
|
||||
emit(MOV(time_mrf, src_reg(diff)));
|
||||
|
||||
vec4_instruction *inst;
|
||||
inst = emit(SHADER_OPCODE_SHADER_TIME_ADD);
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = 2;
|
||||
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::run()
|
||||
{
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
||||
emit_shader_time_begin();
|
||||
|
||||
emit_attribute_fixups();
|
||||
|
||||
/* Generate VS IR for main(). (the visitor only descends into
|
||||
@@ -1057,6 +1160,9 @@ vec4_visitor::run()
|
||||
if (c->key.userclip_active && !c->key.uses_clip_distance)
|
||||
setup_uniform_clipplane_values();
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
||||
emit_shader_time_end();
|
||||
|
||||
emit_urb_writes();
|
||||
|
||||
/* Before any optimization, push array accesses out to scratch
|
||||
|
||||
@@ -302,6 +302,8 @@ public:
|
||||
int uniform_vector_size[MAX_UNIFORMS];
|
||||
int uniforms;
|
||||
|
||||
src_reg shader_start_time;
|
||||
|
||||
struct hash_table *variable_ht;
|
||||
|
||||
bool run(void);
|
||||
@@ -434,6 +436,11 @@ public:
|
||||
void emit_urb_slot(int mrf, int vert_result);
|
||||
void emit_urb_writes(void);
|
||||
|
||||
void emit_shader_time_begin();
|
||||
void emit_shader_time_end();
|
||||
void emit_shader_time_write(enum shader_time_shader_type type,
|
||||
src_reg start, src_reg end);
|
||||
|
||||
src_reg get_scratch_offset(vec4_instruction *inst,
|
||||
src_reg *reladdr, int reg_offset);
|
||||
src_reg get_pull_constant_offset(vec4_instruction *inst,
|
||||
@@ -452,6 +459,8 @@ public:
|
||||
bool try_emit_sat(ir_expression *ir);
|
||||
void resolve_ud_negate(src_reg *reg);
|
||||
|
||||
src_reg get_timestamp();
|
||||
|
||||
bool process_move_condition(ir_rvalue *ir);
|
||||
|
||||
void dump_instruction(vec4_instruction *inst);
|
||||
|
||||
@@ -660,6 +660,10 @@ vec4_generator::generate_vs_instruction(vec4_instruction *instruction,
|
||||
generate_pull_constant_load(inst, dst, src[0], src[1]);
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_SHADER_TIME_ADD:
|
||||
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_VS_SHADER_TIME);
|
||||
break;
|
||||
|
||||
default:
|
||||
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
|
||||
_mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
|
||||
|
||||
@@ -138,9 +138,19 @@ const struct brw_tracked_state brw_vs_ubo_surfaces = {
|
||||
static void
|
||||
brw_vs_upload_binding_table(struct brw_context *brw)
|
||||
{
|
||||
struct intel_context *intel = &brw->intel;
|
||||
uint32_t *bind;
|
||||
int i;
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
|
||||
intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
|
||||
brw->shader_time.bo->size,
|
||||
&brw->vs.surf_offset[SURF_INDEX_VS_SHADER_TIME]);
|
||||
|
||||
assert(brw->vs.prog_data->num_surfaces <= SURF_INDEX_VS_SHADER_TIME);
|
||||
brw->vs.prog_data->num_surfaces = SURF_INDEX_VS_SHADER_TIME;
|
||||
}
|
||||
|
||||
/* CACHE_NEW_VS_PROG: Skip making a binding table if we don't use textures or
|
||||
* pull constants.
|
||||
*/
|
||||
|
||||
@@ -43,6 +43,7 @@
|
||||
#include "intel_fbo.h"
|
||||
|
||||
#include "brw_context.h"
|
||||
#include "brw_program.h"
|
||||
#include "brw_defines.h"
|
||||
#include "brw_state.h"
|
||||
#include "brw_draw.h"
|
||||
@@ -69,6 +70,11 @@ static void brw_destroy_context( struct intel_context *intel )
|
||||
{
|
||||
struct brw_context *brw = brw_context(&intel->ctx);
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
|
||||
brw_collect_and_report_shader_time(brw);
|
||||
brw_destroy_shader_time(brw);
|
||||
}
|
||||
|
||||
brw_destroy_state(brw);
|
||||
brw_draw_destroy( brw );
|
||||
|
||||
@@ -201,6 +207,14 @@ static void brw_new_batch( struct intel_context *intel )
|
||||
* next batch.
|
||||
*/
|
||||
brw->cache.bo_used_by_gpu = true;
|
||||
|
||||
/* We need to periodically reap the shader time results, because rollover
|
||||
* happens every few seconds. We also want to see results every once in a
|
||||
* while, because many programs won't cleanly destroy our context, so the
|
||||
* end-of-run printout may not happen.
|
||||
*/
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
||||
brw_collect_and_report_shader_time(brw);
|
||||
}
|
||||
|
||||
static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
|
||||
|
||||
@@ -1405,9 +1405,16 @@ const struct brw_tracked_state brw_wm_ubo_surfaces = {
|
||||
static void
|
||||
brw_upload_wm_binding_table(struct brw_context *brw)
|
||||
{
|
||||
struct intel_context *intel = &brw->intel;
|
||||
uint32_t *bind;
|
||||
int i;
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
|
||||
intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
|
||||
brw->shader_time.bo->size,
|
||||
&brw->wm.surf_offset[SURF_INDEX_WM_SHADER_TIME]);
|
||||
}
|
||||
|
||||
/* Might want to calculate nr_surfaces first, to avoid taking up so much
|
||||
* space for the binding table.
|
||||
*/
|
||||
|
||||
@@ -492,6 +492,7 @@ static const struct dri_debug_control debug_control[] = {
|
||||
{ "vs", DEBUG_VS },
|
||||
{ "clip", DEBUG_CLIP },
|
||||
{ "aub", DEBUG_AUB },
|
||||
{ "shader_time", DEBUG_SHADER_TIME },
|
||||
{ NULL, 0 }
|
||||
};
|
||||
|
||||
@@ -747,6 +748,11 @@ intelInitContext(struct intel_context *intel,
|
||||
INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
|
||||
if (INTEL_DEBUG & DEBUG_BUFMGR)
|
||||
dri_bufmgr_set_debug(intel->bufmgr, true);
|
||||
if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && intel->gen < 7) {
|
||||
fprintf(stderr,
|
||||
"shader_time debugging requires gen7 (Ivybridge) or better.\n");
|
||||
INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
|
||||
}
|
||||
|
||||
if (INTEL_DEBUG & DEBUG_AUB)
|
||||
drm_intel_bufmgr_gem_set_aub_dump(intel->bufmgr, true);
|
||||
|
||||
@@ -456,6 +456,7 @@ extern int INTEL_DEBUG;
|
||||
#define DEBUG_VS 0x1000000
|
||||
#define DEBUG_CLIP 0x2000000
|
||||
#define DEBUG_AUB 0x4000000
|
||||
#define DEBUG_SHADER_TIME 0x8000000
|
||||
|
||||
#ifdef HAVE_ANDROID_PLATFORM
|
||||
#define LOG_TAG "INTEL-MESA"
|
||||
|
||||
Reference in New Issue
Block a user