i965: Add a debug flag for counting cycles spent in each compiled shader.

This can be used for two purposes: Using hand-coded shaders to determine
per-instruction timings, or figuring out which shader to optimize in a
whole application.

Note that this doesn't cover the instructions that set up the message to
the URB/FB write -- we'd need to convert the MRF usage in these
instructions to GRFs so that our offsets/times don't overwrite our
shader outputs.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> (v1)

v2: Check the timestamp reset flag in the VS, which is apparently
    getting set fairly regularly in the range we watch, resulting in
    negative numbers getting added to our 32-bit counter, and thus large
    values added to our uint64_t.
v3: Rebase on reladdr changes, removing a new safety check that proved
    impossible to satisfy.  Add a comment to the AOP defs from Ken's
    review, and put them in a slightly more sensible spot.
v4: Check timestamp reset in the FS as well.
This commit is contained in:
Eric Anholt
2012-11-27 14:10:52 -08:00
parent ef2fbf67d4
commit 71f06344a0
17 changed files with 524 additions and 9 deletions
+3
View File
@@ -383,6 +383,9 @@ brwCreateContext(int api,
brw_fs_alloc_reg_sets(brw);
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
brw_init_shader_time(brw);
return true;
}
+24 -4
View File
@@ -559,14 +559,15 @@ struct brw_vs_prog_data {
#define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1)
#define SURF_INDEX_TEXTURE(t) (BRW_MAX_DRAW_BUFFERS + 2 + (t))
#define SURF_INDEX_WM_UBO(u) (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + u)
#define SURF_INDEX_WM_SHADER_TIME (SURF_INDEX_WM_UBO(12))
/** Maximum size of the binding table. */
#define BRW_MAX_WM_SURFACES SURF_INDEX_WM_UBO(BRW_MAX_WM_UBOS)
#define BRW_MAX_WM_SURFACES (SURF_INDEX_WM_SHADER_TIME + 1)
#define SURF_INDEX_VERT_CONST_BUFFER (0)
#define SURF_INDEX_VS_TEXTURE(t) (SURF_INDEX_VERT_CONST_BUFFER + 1 + (t))
#define SURF_INDEX_VS_UBO(u) (SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT) + u)
#define BRW_MAX_VS_SURFACES SURF_INDEX_VS_UBO(BRW_MAX_VS_UBOS)
#define SURF_INDEX_VS_SHADER_TIME (SURF_INDEX_VS_UBO(12))
#define BRW_MAX_VS_SURFACES (SURF_INDEX_VS_SHADER_TIME + 1)
#define SURF_INDEX_SOL_BINDING(t) ((t))
#define BRW_MAX_GS_SURFACES SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
@@ -651,6 +652,13 @@ struct brw_tracked_state {
void (*emit)( struct brw_context *brw );
};
enum shader_time_shader_type {
ST_NONE,
ST_VS,
ST_FS8,
ST_FS16,
};
/* Flags for brw->state.cache.
*/
#define CACHE_NEW_BLEND_STATE (1<<BRW_BLEND_STATE)
@@ -1089,6 +1097,16 @@ struct brw_context
uint32_t num_instances;
int basevertex;
struct {
drm_intel_bo *bo;
struct gl_shader_program **programs;
enum shader_time_shader_type *types;
uint64_t *cumulative;
int num_entries;
int max_entries;
double report_time;
} shader_time;
};
/*======================================================================
@@ -1144,7 +1162,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
int brw_get_scratch_size(int size);
void brw_get_scratch_bo(struct intel_context *intel,
drm_intel_bo **scratch_bo, int size);
void brw_init_shader_time(struct brw_context *brw);
void brw_collect_and_report_shader_time(struct brw_context *brw);
void brw_destroy_shader_time(struct brw_context *brw);
/* brw_urb.c
*/
+21 -2
View File
@@ -665,6 +665,8 @@ enum opcode {
SHADER_OPCODE_TXS,
FS_OPCODE_TXB,
SHADER_OPCODE_SHADER_TIME_ADD,
FS_OPCODE_DDX,
FS_OPCODE_DDY,
FS_OPCODE_PIXEL_X,
@@ -731,6 +733,8 @@ enum opcode {
#define BRW_ARF_CONTROL 0x80
#define BRW_ARF_NOTIFICATION_COUNT 0x90
#define BRW_ARF_IP 0xA0
#define BRW_ARF_TDR 0xB0
#define BRW_ARF_TIMESTAMP 0xC0
#define BRW_MRF_COMPR4 (1 << 7)
@@ -913,6 +917,23 @@ enum brw_message_target {
#define GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE 10
#define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ 3
/* dataport atomic operations. */
#define BRW_AOP_AND 1
#define BRW_AOP_OR 2
#define BRW_AOP_XOR 3
#define BRW_AOP_MOV 4
#define BRW_AOP_INC 5
#define BRW_AOP_DEC 6
#define BRW_AOP_ADD 7
#define BRW_AOP_SUB 8
#define BRW_AOP_REVSUB 9
#define BRW_AOP_IMAX 10
#define BRW_AOP_IMIN 11
#define BRW_AOP_UMAX 12
#define BRW_AOP_UMIN 13
#define BRW_AOP_CMPWR 14
#define BRW_AOP_PREDEC 15
#define BRW_MATH_FUNCTION_INV 1
#define BRW_MATH_FUNCTION_LOG 2
#define BRW_MATH_FUNCTION_EXP 3
@@ -960,8 +981,6 @@ enum brw_message_target {
#define BRW_SCRATCH_SPACE_SIZE_2M 11
#define CMD_URB_FENCE 0x6000
#define CMD_CS_URB_STATE 0x6001
#define CMD_CONST_BUFFER 0x6002
+5 -1
View File
@@ -200,7 +200,7 @@ static INLINE struct brw_reg brw_reg( GLuint file,
else if (file == BRW_MESSAGE_REGISTER_FILE)
assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
assert(nr <= BRW_ARF_IP);
assert(nr <= BRW_ARF_TIMESTAMP);
reg.type = type;
reg.file = file;
@@ -1006,6 +1006,10 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
int num_regs,
GLuint offset);
void brw_shader_time_add(struct brw_compile *p,
int mrf,
uint32_t surf_index);
/* If/else/endif. Works by manipulating the execution flags on each
* channel.
*/
+54 -2
View File
@@ -253,7 +253,6 @@ brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
assert(!reg.negate);
assert(!reg.abs);
assert(reg.address_mode == BRW_ADDRESS_DIRECT);
assert(reg.vstride != BRW_VERTICAL_STRIDE_0);
}
validate_reg(insn, reg);
@@ -332,7 +331,8 @@ void brw_set_src1(struct brw_compile *p,
{
assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
assert(reg.nr < 128);
if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
assert(reg.nr < 128);
gen7_convert_mrf_to_grf(p, &reg);
@@ -2448,3 +2448,55 @@ brw_svb_write(struct brw_compile *p,
0, /* end_of_thread */
send_commit_msg); /* send_commit_msg */
}
/**
 * Emits the SEND instruction that adds a value into the shader-time buffer
 * using an untyped atomic ADD on the data cache dataport.
 *
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword for
 * all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
 * one u32.  So we use the same untyped atomic write message as the pixel
 * shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 *
 * \param p           compile state the instruction is appended to
 * \param base_mrf    first of the two message registers (offset, then value)
 * \param surf_index  binding table index of the shader-time surface
 */
void brw_shader_time_add(struct brw_compile *p,
int base_mrf,
uint32_t surf_index)
{
struct intel_context *intel = &p->brw->intel;
/* The shader_time debug flag is cleared at context init on gen < 7. */
assert(intel->gen >= 7);
/* Allocate the SEND with align1 access and the execution mask disabled,
 * then restore the previous default state.
 * NOTE(review): presumably brw_next_insn() copies the current default
 * state into the new instruction, which is why the pop can come
 * immediately after it -- confirm.
 */
brw_push_insn_state(p);
brw_set_access_mode(p, BRW_ALIGN_1);
brw_set_mask_control(p, BRW_MASK_DISABLE);
struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
brw_pop_insn_state(p);
/* We use brw_vec1_reg and unmasked because we want to increment the given
* offset only once.
*/
brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
BRW_ARF_NULL, 0));
brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
base_mrf, 0));
/* Headerless two-register message with no response and no end-of-thread. */
bool header_present = false;
bool eot = false;
uint32_t mlen = 2; /* offset, value */
uint32_t rlen = 0;
brw_set_message_descriptor(p, send,
GEN7_SFID_DATAPORT_DATA_CACHE,
mlen, rlen, header_present, eot);
/* Fill in the remaining message descriptor fields by hand. */
send->bits3.ud |= 6 << 14; /* untyped atomic op */
send->bits3.ud |= 0 << 13; /* no return data */
send->bits3.ud |= 1 << 12; /* SIMD8 mode */
send->bits3.ud |= BRW_AOP_ADD << 8;
send->bits3.ud |= surf_index << 0;
}
+120
View File
@@ -459,6 +459,118 @@ fs_visitor::type_size(const struct glsl_type *type)
}
}
/**
 * Emits a MOV that copies the timestamp architecture register into a new
 * uint temporary, and returns that temporary with component 0 selected.
 *
 * The low 32 bits of the timestamp are what the caller gets.  Since the
 * counter runs at the GPU clock rate of ~1.2GHz, it rolls over every ~3
 * seconds, which is plenty of time for our purposes.  It is identical across
 * the EUs, but since it tracks GPU core speed it increments at a varying
 * rate as render P-states change.
 *
 * A caller can also check whether render P-states changed (or anything else
 * disrupted timing) by setting smear to 2 on the result and testing whether
 * that field is != 0.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   const struct brw_reg timestamp_arf =
      brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_TIMESTAMP, 0);
   fs_reg ts = fs_reg(retype(timestamp_arf, BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   /* The copy must happen even for channels that are not enabled in the
    * dispatch, so that the fields we care about (mostly field 0, but also 2)
    * are always read.
    */
   fs_inst *mov = emit(MOV(dst, ts));
   mov->force_uncompressed = true;
   mov->force_writemask_all = true;

   dst.smear = 0;
   return dst;
}
void
fs_visitor::emit_shader_time_begin()
{
current_annotation = "shader time start";
shader_start_time = get_timestamp();
}
void
fs_visitor::emit_shader_time_end()
{
current_annotation = "shader time end";
enum shader_time_shader_type type;
if (dispatch_width == 8) {
type = ST_FS8;
} else {
assert(dispatch_width == 16);
type = ST_FS16;
}
emit_shader_time_write(type, shader_start_time, get_timestamp());
}
/**
 * Emits the code that adds (end - start - 2) cycles to a newly allocated
 * slot of the shader-time buffer.
 *
 * A slot index is taken from brw->shader_time.num_entries, and the shader
 * type and (if present) the shader program are recorded against it for the
 * report.  The atomic add is skipped at run time when the timestamp's reset
 * field (component 2 of \p end) has bit 0 set.
 *
 * \param type   shader kind recorded for the slot
 * \param start  timestamp read at shader start
 * \param end    timestamp read at shader end
 */
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg start, fs_reg end)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   /* Valid slots are 0 .. max_entries - 1; an index equal to max_entries
    * would already be past the end of the tracking arrays and the buffer.
    */
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = end;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();

   /* diff = end - start */
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, end));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   /* Message payload: byte offset of the slot, then the value to add. */
   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, diff));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;

   pop_force_uncompressed();

   emit(BRW_OPCODE_ENDIF);
}
void
fs_visitor::fail(const char *format, ...)
{
@@ -571,6 +683,8 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXS:
return 1;
case SHADER_OPCODE_SHADER_TIME_ADD:
return 0;
case FS_OPCODE_FB_WRITE:
return 2;
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
@@ -2295,6 +2409,9 @@ fs_visitor::run()
if (0) {
emit_dummy_fs();
} else {
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
emit_shader_time_begin();
calculate_urb_setup();
if (intel->gen < 6)
emit_interpolation_setup_gen4();
@@ -2318,6 +2435,9 @@ fs_visitor::run()
if (failed)
return false;
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
emit_shader_time_end();
emit_fb_writes();
split_virtual_grfs();
+9
View File
@@ -363,6 +363,12 @@ public:
void emit_color_write(int target, int index, int first_color_mrf);
void emit_fb_writes();
void emit_shader_time_begin();
void emit_shader_time_end();
void emit_shader_time_write(enum shader_time_shader_type type,
fs_reg start, fs_reg end);
bool try_rewrite_rhs_to_dst(ir_assignment *ir,
fs_reg dst,
fs_reg src,
@@ -373,6 +379,8 @@ public:
void resolve_ud_negate(fs_reg *reg);
void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
fs_reg get_timestamp();
struct brw_reg interp_reg(int location, int channel);
int setup_uniform_values(int loc, const glsl_type *type);
void setup_builtin_uniform_values(ir_variable *ir);
@@ -435,6 +443,7 @@ public:
fs_reg pixel_w;
fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
fs_reg shader_start_time;
int grf_used;
@@ -1124,6 +1124,10 @@ fs_generator::generate_code(exec_list *instructions)
generate_mov_dispatch_to_flags();
break;
case SHADER_OPCODE_SHADER_TIME_ADD:
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
break;
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode `%s' in FS",
+127
View File
@@ -189,3 +189,130 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
functions->LinkShader = brw_link_shader;
}
/**
 * Allocates the shader-time buffer object and the per-slot tracking arrays
 * (program, shader type, cumulative cycles), each sized for 4096 slots.
 */
void
brw_init_shader_time(struct brw_context *brw)
{
   const int max_entries = 4096;
   struct intel_context *intel = &brw->intel;

   brw->shader_time.max_entries = max_entries;

   /* Tracking arrays, zero-filled and parented to the context. */
   brw->shader_time.cumulative = rzalloc_array(brw, uint64_t, max_entries);
   brw->shader_time.types =
      rzalloc_array(brw, enum shader_time_shader_type, max_entries);
   brw->shader_time.programs =
      rzalloc_array(brw, struct gl_shader_program *, max_entries);

   /* One 32-bit counter per slot. */
   brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time",
                                            max_entries * 4, 4096);
}
/**
 * qsort() comparator for an array of pointers to uint64_t.
 *
 * Orders by the pointed-to value, ascending.  Returns -1, 0 or 1; the values
 * are compared rather than subtracted because a 64-bit difference would not
 * fit the int return value.
 */
static int
compare_time(const void *a, const void *b)
{
   const uint64_t lhs = **(uint64_t * const *)a;
   const uint64_t rhs = **(uint64_t * const *)b;

   return (lhs > rhs) - (lhs < rhs);
}
static void
brw_report_shader_time(struct brw_context *brw)
{
if (!brw->shader_time.bo || !brw->shader_time.num_entries)
return;
uint64_t *sorted[brw->shader_time.num_entries];
double total = 0;
for (int i = 0; i < brw->shader_time.num_entries; i++) {
sorted[i] = &brw->shader_time.cumulative[i];
total += brw->shader_time.cumulative[i];
}
if (total == 0) {
printf("No shader time collected yet\n");
return;
}
qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
printf("\n");
printf("type ID cycles spent %% of total\n");
for (int s = 0; s < brw->shader_time.num_entries; s++) {
/* Work back from the sorted pointers times to a time to print. */
int i = sorted[s] - brw->shader_time.cumulative;
int shader_num = -1;
if (brw->shader_time.programs[i]) {
shader_num = brw->shader_time.programs[i]->Name;
}
switch (brw->shader_time.types[i]) {
case ST_VS:
printf("vs %4d: ", shader_num);
break;
case ST_FS8:
printf("fs8 %4d: ", shader_num);
break;
case ST_FS16:
printf("fs16 %4d: ", shader_num);
break;
default:
printf("other: ");
break;
}
printf("%16lld (%7.2f Gcycles) %4.1f%%\n",
(long long)brw->shader_time.cumulative[i],
(double)brw->shader_time.cumulative[i] / 1000000000.0,
(double)brw->shader_time.cumulative[i] / total * 100.0);
}
}
/**
 * Adds the 32-bit per-slot counters from the shader-time buffer into the
 * 64-bit cumulative totals, then zeroes the buffer for the next collection.
 *
 * Does nothing when no buffer exists.  If the buffer cannot be mapped the
 * collection is skipped (with a message on stderr) rather than reading
 * through an invalid mapping.
 */
static void
brw_collect_shader_time(struct brw_context *brw)
{
   if (!brw->shader_time.bo)
      return;

   /* This probably stalls on the last rendering.  We could fix that by
    * delaying reading the reports, but it doesn't look like it's a big
    * overhead compared to the cost of tracking the time in the first place.
    */
   if (drm_intel_bo_map(brw->shader_time.bo, true) != 0) {
      fprintf(stderr, "shader_time: failed to map the shader time buffer\n");
      return;
   }

   uint32_t *times = brw->shader_time.bo->virtual;

   for (int i = 0; i < brw->shader_time.num_entries; i++) {
      brw->shader_time.cumulative[i] += times[i];
   }

   /* Zero the BO out to clear it out for our next collection.
    */
   memset(times, 0, brw->shader_time.bo->size);
   drm_intel_bo_unmap(brw->shader_time.bo);
}
/**
 * Collects the shader-time counters, and prints the report if none has been
 * printed yet or at least one second has passed since the last one.
 */
void
brw_collect_and_report_shader_time(struct brw_context *brw)
{
   brw_collect_shader_time(brw);

   /* Rate-limit the printout; a report_time of 0 means "never reported". */
   if (brw->shader_time.report_time != 0 &&
       !(get_time() - brw->shader_time.report_time >= 1.0))
      return;

   brw_report_shader_time(brw);
   brw->shader_time.report_time = get_time();
}
void
brw_destroy_shader_time(struct brw_context *brw)
{
drm_intel_bo_unreference(brw->shader_time.bo);
brw->shader_time.bo = NULL;
}
+106
View File
@@ -26,6 +26,7 @@
extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}
@@ -248,6 +249,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
return 2;
case VS_OPCODE_SCRATCH_WRITE:
return 3;
case SHADER_OPCODE_SHADER_TIME_ADD:
return 0;
default:
assert(!"not reached");
return inst->mlen;
@@ -1039,9 +1042,109 @@ vec4_visitor::setup_payload(void)
this->first_non_payload_grf = reg;
}
/**
 * Emits a MOV that copies the timestamp architecture register into a new
 * uvec4 temporary and returns that temporary as a source register.
 */
src_reg
vec4_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   const struct brw_reg timestamp_arf = brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                                BRW_ARF_TIMESTAMP,
                                                0,
                                                BRW_REGISTER_TYPE_UD,
                                                BRW_VERTICAL_STRIDE_0,
                                                BRW_WIDTH_4,
                                                BRW_HORIZONTAL_STRIDE_4,
                                                BRW_SWIZZLE_XYZW,
                                                WRITEMASK_XYZW);
   src_reg ts = src_reg(timestamp_arf);

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   /* The copy must happen even for channels that are not enabled in the
    * dispatch, so that the fields we care about (mostly field 0, but also 2)
    * are always read.
    */
   vec4_instruction *mov = emit(MOV(dst, ts));
   mov->force_writemask_all = true;

   return src_reg(dst);
}
void
vec4_visitor::emit_shader_time_begin()
{
current_annotation = "shader time start";
shader_start_time = get_timestamp();
}
void
vec4_visitor::emit_shader_time_end()
{
current_annotation = "shader time end";
src_reg shader_end_time = get_timestamp();
emit_shader_time_write(ST_VS, shader_start_time, shader_end_time);
}
/**
 * Emits the code that adds (end - start - 2) cycles to a newly allocated
 * slot of the shader-time buffer.
 *
 * A slot index is taken from brw->shader_time.num_entries, and the shader
 * type and (if present) the shader program are recorded against it for the
 * report.  The atomic add is skipped at run time when the timestamp's reset
 * field (the Z component of \p end) has bit 0 set.
 *
 * \param type   shader kind recorded for the slot
 * \param start  timestamp read at shader start
 * \param end    timestamp read at shader end
 */
void
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                     src_reg start, src_reg end)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   /* Valid slots are 0 .. max_entries - 1; an index equal to max_entries
    * would already be past the end of the tracking arrays and the buffer.
    */
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = end;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   start.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, end));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   /* Message payload: byte offset of the slot, then the value to add. */
   int base_mrf = 6;

   dst_reg offset_mrf = dst_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, src_reg(shader_time_index * 4)));

   dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, src_reg(diff)));

   vec4_instruction *inst;
   inst = emit(SHADER_OPCODE_SHADER_TIME_ADD);
   inst->base_mrf = base_mrf;
   inst->mlen = 2;

   emit(BRW_OPCODE_ENDIF);
}
bool
vec4_visitor::run()
{
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
emit_shader_time_begin();
emit_attribute_fixups();
/* Generate VS IR for main(). (the visitor only descends into
@@ -1057,6 +1160,9 @@ vec4_visitor::run()
if (c->key.userclip_active && !c->key.uses_clip_distance)
setup_uniform_clipplane_values();
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
emit_shader_time_end();
emit_urb_writes();
/* Before any optimization, push array accesses out to scratch
+9
View File
@@ -302,6 +302,8 @@ public:
int uniform_vector_size[MAX_UNIFORMS];
int uniforms;
src_reg shader_start_time;
struct hash_table *variable_ht;
bool run(void);
@@ -434,6 +436,11 @@ public:
void emit_urb_slot(int mrf, int vert_result);
void emit_urb_writes(void);
void emit_shader_time_begin();
void emit_shader_time_end();
void emit_shader_time_write(enum shader_time_shader_type type,
src_reg start, src_reg end);
src_reg get_scratch_offset(vec4_instruction *inst,
src_reg *reladdr, int reg_offset);
src_reg get_pull_constant_offset(vec4_instruction *inst,
@@ -452,6 +459,8 @@ public:
bool try_emit_sat(ir_expression *ir);
void resolve_ud_negate(src_reg *reg);
src_reg get_timestamp();
bool process_move_condition(ir_rvalue *ir);
void dump_instruction(vec4_instruction *inst);
@@ -660,6 +660,10 @@ vec4_generator::generate_vs_instruction(vec4_instruction *instruction,
generate_pull_constant_load(inst, dst, src[0], src[1]);
break;
case SHADER_OPCODE_SHADER_TIME_ADD:
brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_VS_SHADER_TIME);
break;
default:
if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
_mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
@@ -138,9 +138,19 @@ const struct brw_tracked_state brw_vs_ubo_surfaces = {
static void
brw_vs_upload_binding_table(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
uint32_t *bind;
int i;
if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
brw->shader_time.bo->size,
&brw->vs.surf_offset[SURF_INDEX_VS_SHADER_TIME]);
assert(brw->vs.prog_data->num_surfaces <= SURF_INDEX_VS_SHADER_TIME);
brw->vs.prog_data->num_surfaces = SURF_INDEX_VS_SHADER_TIME;
}
/* CACHE_NEW_VS_PROG: Skip making a binding table if we don't use textures or
* pull constants.
*/
+14
View File
@@ -43,6 +43,7 @@
#include "intel_fbo.h"
#include "brw_context.h"
#include "brw_program.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "brw_draw.h"
@@ -69,6 +70,11 @@ static void brw_destroy_context( struct intel_context *intel )
{
struct brw_context *brw = brw_context(&intel->ctx);
if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
brw_collect_and_report_shader_time(brw);
brw_destroy_shader_time(brw);
}
brw_destroy_state(brw);
brw_draw_destroy( brw );
@@ -201,6 +207,14 @@ static void brw_new_batch( struct intel_context *intel )
* next batch.
*/
brw->cache.bo_used_by_gpu = true;
/* We need to periodically reap the shader time results, because rollover
* happens every few seconds. We also want to see results every once in a
* while, because many programs won't cleanly destroy our context, so the
* end-of-run printout may not happen.
*/
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
brw_collect_and_report_shader_time(brw);
}
static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
@@ -1405,9 +1405,16 @@ const struct brw_tracked_state brw_wm_ubo_surfaces = {
static void
brw_upload_wm_binding_table(struct brw_context *brw)
{
struct intel_context *intel = &brw->intel;
uint32_t *bind;
int i;
if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
brw->shader_time.bo->size,
&brw->wm.surf_offset[SURF_INDEX_WM_SHADER_TIME]);
}
/* Might want to calculate nr_surfaces first, to avoid taking up so much
* space for the binding table.
*/
@@ -492,6 +492,7 @@ static const struct dri_debug_control debug_control[] = {
{ "vs", DEBUG_VS },
{ "clip", DEBUG_CLIP },
{ "aub", DEBUG_AUB },
{ "shader_time", DEBUG_SHADER_TIME },
{ NULL, 0 }
};
@@ -747,6 +748,11 @@ intelInitContext(struct intel_context *intel,
INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
if (INTEL_DEBUG & DEBUG_BUFMGR)
dri_bufmgr_set_debug(intel->bufmgr, true);
if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && intel->gen < 7) {
fprintf(stderr,
"shader_time debugging requires gen7 (Ivybridge) or better.\n");
INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
}
if (INTEL_DEBUG & DEBUG_AUB)
drm_intel_bufmgr_gem_set_aub_dump(intel->bufmgr, true);
@@ -456,6 +456,7 @@ extern int INTEL_DEBUG;
#define DEBUG_VS 0x1000000
#define DEBUG_CLIP 0x2000000
#define DEBUG_AUB 0x4000000
#define DEBUG_SHADER_TIME 0x8000000
#ifdef HAVE_ANDROID_PLATFORM
#define LOG_TAG "INTEL-MESA"