i965: Add a debug flag for counting cycles spent in each compiled shader.

This can be used for two purposes: Using hand-coded shaders to determine per-instruction timings, or figuring out which shader to optimize in a whole application. Note that this doesn't cover the instructions that set up the message to the URB/FB write -- we'd need to convert the MRF usage in these instructions to GRFs so that our offsets/times don't overwrite our shader outputs. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> (v1) v2: Check the timestamp reset flag in the VS, which is apparently getting set fairly regularly in the range we watch, resulting in negative numbers getting added to our 32-bit counter, and thus large values added to our uint64_t. v3: Rebase on reladdr changes, removing a new safety check that proved impossible to satisfy. Add a comment to the AOP defs from Ken's review, and put them in a slightly more sensible spot. v4: Check timestamp reset in the FS as well.
2012-11-27 14:10:52 -08:00
parent ef2fbf67d4
commit 71f06344a0
17 changed files with 524 additions and 9 deletions
@@ -383,6 +383,9 @@ brwCreateContext(int api,

   brw_fs_alloc_reg_sets(brw);

+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      brw_init_shader_time(brw);
+
   return true;
 }

@@ -559,14 +559,15 @@ struct brw_vs_prog_data {
 #define SURF_INDEX_FRAG_CONST_BUFFER (BRW_MAX_DRAW_BUFFERS + 1)
 #define SURF_INDEX_TEXTURE(t)        (BRW_MAX_DRAW_BUFFERS + 2 + (t))
 #define SURF_INDEX_WM_UBO(u)         (SURF_INDEX_TEXTURE(BRW_MAX_TEX_UNIT) + u)
-
+#define SURF_INDEX_WM_SHADER_TIME    (SURF_INDEX_WM_UBO(12))
 /** Maximum size of the binding table. */
-#define BRW_MAX_WM_SURFACES          SURF_INDEX_WM_UBO(BRW_MAX_WM_UBOS)
+#define BRW_MAX_WM_SURFACES          (SURF_INDEX_WM_SHADER_TIME + 1)

 #define SURF_INDEX_VERT_CONST_BUFFER (0)
 #define SURF_INDEX_VS_TEXTURE(t)     (SURF_INDEX_VERT_CONST_BUFFER + 1 + (t))
 #define SURF_INDEX_VS_UBO(u)         (SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT) + u)
-#define BRW_MAX_VS_SURFACES          SURF_INDEX_VS_UBO(BRW_MAX_VS_UBOS)
+#define SURF_INDEX_VS_SHADER_TIME    (SURF_INDEX_VS_UBO(12))
+#define BRW_MAX_VS_SURFACES          (SURF_INDEX_VS_SHADER_TIME + 1)

 #define SURF_INDEX_SOL_BINDING(t)    ((t))
 #define BRW_MAX_GS_SURFACES          SURF_INDEX_SOL_BINDING(BRW_MAX_SOL_BINDINGS)
@@ -651,6 +652,13 @@ struct brw_tracked_state {
   void (*emit)( struct brw_context *brw );
 };

+enum shader_time_shader_type {
+   ST_NONE,
+   ST_VS,
+   ST_FS8,
+   ST_FS16,
+};
+
 /* Flags for brw->state.cache.
 */
 #define CACHE_NEW_BLEND_STATE            (1<<BRW_BLEND_STATE)
@@ -1089,6 +1097,16 @@ struct brw_context

   uint32_t num_instances;
   int basevertex;
+
+   struct {
+      drm_intel_bo *bo;
+      struct gl_shader_program **programs;
+      enum shader_time_shader_type *types;
+      uint64_t *cumulative;
+      int num_entries;
+      int max_entries;
+      double report_time;
+   } shader_time;
 };

 /*======================================================================
@@ -1144,7 +1162,9 @@ void brwInitFragProgFuncs( struct dd_function_table *functions );
 int brw_get_scratch_size(int size);
 void brw_get_scratch_bo(struct intel_context *intel,
 			drm_intel_bo **scratch_bo, int size);
-
+void brw_init_shader_time(struct brw_context *brw);
+void brw_collect_and_report_shader_time(struct brw_context *brw);
+void brw_destroy_shader_time(struct brw_context *brw);

 /* brw_urb.c
 */
@@ -665,6 +665,8 @@ enum opcode {
   SHADER_OPCODE_TXS,
   FS_OPCODE_TXB,

+   SHADER_OPCODE_SHADER_TIME_ADD,
+
   FS_OPCODE_DDX,
   FS_OPCODE_DDY,
   FS_OPCODE_PIXEL_X,
@@ -731,6 +733,8 @@ enum opcode {
 #define BRW_ARF_CONTROL               0x80
 #define BRW_ARF_NOTIFICATION_COUNT    0x90
 #define BRW_ARF_IP                    0xA0
+#define BRW_ARF_TDR                   0xB0
+#define BRW_ARF_TIMESTAMP             0xC0

 #define BRW_MRF_COMPR4			(1 << 7)

@@ -913,6 +917,23 @@ enum brw_message_target {
 #define GEN7_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE          10
 #define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ                       3

+/* dataport atomic operations. */
+#define BRW_AOP_AND                   1
+#define BRW_AOP_OR                    2
+#define BRW_AOP_XOR                   3
+#define BRW_AOP_MOV                   4
+#define BRW_AOP_INC                   5
+#define BRW_AOP_DEC                   6
+#define BRW_AOP_ADD                   7
+#define BRW_AOP_SUB                   8
+#define BRW_AOP_REVSUB                9
+#define BRW_AOP_IMAX                  10
+#define BRW_AOP_IMIN                  11
+#define BRW_AOP_UMAX                  12
+#define BRW_AOP_UMIN                  13
+#define BRW_AOP_CMPWR                 14
+#define BRW_AOP_PREDEC                15
+
 #define BRW_MATH_FUNCTION_INV                              1
 #define BRW_MATH_FUNCTION_LOG                              2
 #define BRW_MATH_FUNCTION_EXP                              3
@@ -960,8 +981,6 @@ enum brw_message_target {
 #define BRW_SCRATCH_SPACE_SIZE_2M     11


-
-
 #define CMD_URB_FENCE                 0x6000
 #define CMD_CS_URB_STATE              0x6001
 #define CMD_CONST_BUFFER              0x6002
@@ -200,7 +200,7 @@ static INLINE struct brw_reg brw_reg( GLuint file,
   else if (file == BRW_MESSAGE_REGISTER_FILE)
      assert((nr & ~(1 << 7)) < BRW_MAX_MRF);
   else if (file == BRW_ARCHITECTURE_REGISTER_FILE)
-      assert(nr <= BRW_ARF_IP);
+      assert(nr <= BRW_ARF_TIMESTAMP);

   reg.type = type;
   reg.file = file;
@@ -1006,6 +1006,10 @@ void brw_oword_block_write_scratch(struct brw_compile *p,
 				   int num_regs,
 				   GLuint offset);

+void brw_shader_time_add(struct brw_compile *p,
+                         int mrf,
+                         uint32_t surf_index);
+
 /* If/else/endif.  Works by manipulating the execution flags on each
 * channel.
 */
@@ -253,7 +253,6 @@ brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
-      assert(reg.vstride != BRW_VERTICAL_STRIDE_0);
   }

   validate_reg(insn, reg);
@@ -332,7 +331,8 @@ void brw_set_src1(struct brw_compile *p,
 {
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

-   assert(reg.nr < 128);
+   if (reg.type != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

@@ -2448,3 +2448,55 @@ brw_svb_write(struct brw_compile *p,
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
 }
+
+/**
+ * This instruction is generated as a single-channel align1 instruction by
+ * both the VS and FS stages when using INTEL_DEBUG=shader_time.
+ *
+ * We can't use the typed atomic op in the FS because that has the execution
+ * mask ANDed with the pixel mask, but we just want to write the one dword for
+ * all the pixels.
+ *
+ * We don't use the SIMD4x2 atomic ops in the VS because want to just write
+ * one u32.  So we use the same untyped atomic write message as the pixel
+ * shader.
+ *
+ * The untyped atomic operation requires a BUFFER surface type with RAW
+ * format, and is only accessible through the legacy DATA_CACHE dataport
+ * messages.
+ */
+void brw_shader_time_add(struct brw_compile *p,
+                         int base_mrf,
+                         uint32_t surf_index)
+{
+   struct intel_context *intel = &p->brw->intel;
+   assert(intel->gen >= 7);
+
+   brw_push_insn_state(p);
+   brw_set_access_mode(p, BRW_ALIGN_1);
+   brw_set_mask_control(p, BRW_MASK_DISABLE);
+   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_pop_insn_state(p);
+
+   /* We use brw_vec1_reg and unmasked because we want to increment the given
+    * offset only once.
+    */
+   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                      BRW_ARF_NULL, 0));
+   brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+                                      base_mrf, 0));
+
+   bool header_present = false;
+   bool eot = false;
+   uint32_t mlen = 2; /* offset, value */
+   uint32_t rlen = 0;
+   brw_set_message_descriptor(p, send,
+                              GEN7_SFID_DATAPORT_DATA_CACHE,
+                              mlen, rlen, header_present, eot);
+
+   send->bits3.ud |= 6 << 14; /* untyped atomic op */
+   send->bits3.ud |= 0 << 13; /* no return data */
+   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
+   send->bits3.ud |= BRW_AOP_ADD << 8;
+   send->bits3.ud |= surf_index << 0;
+}
@@ -459,6 +459,118 @@ fs_visitor::type_size(const struct glsl_type *type)
   }
 }

+fs_reg
+fs_visitor::get_timestamp()
+{
+   assert(intel->gen >= 7);
+
+   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                          BRW_ARF_TIMESTAMP,
+                                          0),
+                             BRW_REGISTER_TYPE_UD));
+
+   fs_reg dst = fs_reg(this, glsl_type::uint_type);
+
+   fs_inst *mov = emit(MOV(dst, ts));
+   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
+    * even if it's not enabled in the dispatch.
+    */
+   mov->force_writemask_all = true;
+   mov->force_uncompressed = true;
+
+   /* The caller wants the low 32 bits of the timestamp.  Since it's running
+    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
+    * which is plenty of time for our purposes.  It is identical across the
+    * EUs, but since it's tracking GPU core speed it will increment at a
+    * varying rate as render P-states change.
+    *
+    * The caller could also check if render P-states have changed (or anything
+    * else that might disrupt timing) by setting smear to 2 and checking if
+    * that field is != 0.
+    */
+   dst.smear = 0;
+
+   return dst;
+}
+
+void
+fs_visitor::emit_shader_time_begin()
+{
+   current_annotation = "shader time start";
+   shader_start_time = get_timestamp();
+}
+
+void
+fs_visitor::emit_shader_time_end()
+{
+   current_annotation = "shader time end";
+
+   enum shader_time_shader_type type;
+   if (dispatch_width == 8) {
+      type = ST_FS8;
+   } else {
+      assert(dispatch_width == 16);
+      type = ST_FS16;
+   }
+
+   emit_shader_time_write(type, shader_start_time, get_timestamp());
+}
+
+void
+fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
+                                   fs_reg start, fs_reg end)
+{
+   /* Choose an index in the buffer and set up tracking information for our
+    * printouts.
+    */
+   int shader_time_index = brw->shader_time.num_entries++;
+   assert(shader_time_index <= brw->shader_time.max_entries);
+   brw->shader_time.types[shader_time_index] = type;
+   if (prog) {
+      _mesa_reference_shader_program(ctx,
+                                     &brw->shader_time.programs[shader_time_index],
+                                     prog);
+   }
+
+   /* Check that there weren't any timestamp reset events (assuming these
+    * were the only two timestamp reads that happened).
+    */
+   fs_reg reset = end;
+   reset.smear = 2;
+   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
+   test->conditional_mod = BRW_CONDITIONAL_Z;
+   emit(IF(BRW_PREDICATE_NORMAL));
+
+   push_force_uncompressed();
+   start.negate = true;
+   fs_reg diff = fs_reg(this, glsl_type::uint_type);
+   emit(ADD(diff, start, end));
+
+   /* If there were no instructions between the two timestamp gets, the diff
+    * is 2 cycles.  Remove that overhead, so I can forget about that when
+    * trying to determine the time taken for single instructions.
+    */
+   emit(ADD(diff, diff, fs_reg(-2u)));
+
+   int base_mrf = 6;
+
+   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
+   offset_mrf.type = BRW_REGISTER_TYPE_UD;
+   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
+
+   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
+   time_mrf.type = BRW_REGISTER_TYPE_UD;
+   emit(MOV(time_mrf, diff));
+
+   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
+   inst->base_mrf = base_mrf;
+   inst->mlen = 2;
+
+   pop_force_uncompressed();
+
+   emit(BRW_OPCODE_ENDIF);
+}
+
 void
 fs_visitor::fail(const char *format, ...)
 {
@@ -571,6 +683,8 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
@@ -2295,6 +2409,9 @@ fs_visitor::run()
   if (0) {
      emit_dummy_fs();
   } else {
+      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+         emit_shader_time_begin();
+
      calculate_urb_setup();
      if (intel->gen < 6)
 	 emit_interpolation_setup_gen4();
@@ -2318,6 +2435,9 @@ fs_visitor::run()
      if (failed)
 	 return false;

+      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+         emit_shader_time_end();
+
      emit_fb_writes();

      split_virtual_grfs();
@@ -363,6 +363,12 @@ public:

   void emit_color_write(int target, int index, int first_color_mrf);
   void emit_fb_writes();
+
+   void emit_shader_time_begin();
+   void emit_shader_time_end();
+   void emit_shader_time_write(enum shader_time_shader_type type,
+                               fs_reg start, fs_reg end);
+
   bool try_rewrite_rhs_to_dst(ir_assignment *ir,
 			       fs_reg dst,
 			       fs_reg src,
@@ -373,6 +379,8 @@ public:
   void resolve_ud_negate(fs_reg *reg);
   void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);

+   fs_reg get_timestamp();
+
   struct brw_reg interp_reg(int location, int channel);
   int setup_uniform_values(int loc, const glsl_type *type);
   void setup_builtin_uniform_values(ir_variable *ir);
@@ -435,6 +443,7 @@ public:
   fs_reg pixel_w;
   fs_reg delta_x[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
   fs_reg delta_y[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT];
+   fs_reg shader_start_time;

   int grf_used;

@@ -1124,6 +1124,10 @@ fs_generator::generate_code(exec_list *instructions)
         generate_mov_dispatch_to_flags();
         break;

+      case SHADER_OPCODE_SHADER_TIME_ADD:
+         brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_WM_SHADER_TIME);
+         break;
+
      default:
 	 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
 	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
@@ -189,3 +189,130 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
   functions->LinkShader = brw_link_shader;
 }

+void
+brw_init_shader_time(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+
+   const int max_entries = 4096;
+   brw->shader_time.bo = drm_intel_bo_alloc(intel->bufmgr, "shader time",
+                                            max_entries * 4, 4096);
+   brw->shader_time.programs = rzalloc_array(brw, struct gl_shader_program *,
+                                             max_entries);
+   brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
+                                          max_entries);
+   brw->shader_time.cumulative = rzalloc_array(brw, uint64_t,
+                                               max_entries);
+   brw->shader_time.max_entries = max_entries;
+}
+
+static int
+compare_time(const void *a, const void *b)
+{
+   uint64_t * const *a_val = a;
+   uint64_t * const *b_val = b;
+
+   /* We don't just subtract because we're turning the value to an int. */
+   if (**a_val < **b_val)
+      return -1;
+   else if (**a_val == **b_val)
+      return 0;
+   else
+      return 1;
+}
+
+static void
+brw_report_shader_time(struct brw_context *brw)
+{
+   if (!brw->shader_time.bo || !brw->shader_time.num_entries)
+      return;
+
+   uint64_t *sorted[brw->shader_time.num_entries];
+   double total = 0;
+   for (int i = 0; i < brw->shader_time.num_entries; i++) {
+      sorted[i] = &brw->shader_time.cumulative[i];
+      total += brw->shader_time.cumulative[i];
+   }
+
+   if (total == 0) {
+      printf("No shader time collected yet\n");
+      return;
+   }
+
+   qsort(sorted, brw->shader_time.num_entries, sizeof(sorted[0]), compare_time);
+
+   printf("\n");
+   printf("type   ID      cycles spent                   %% of total\n");
+   for (int s = 0; s < brw->shader_time.num_entries; s++) {
+      /* Work back from the sorted pointers times to a time to print. */
+      int i = sorted[s] - brw->shader_time.cumulative;
+
+      int shader_num = -1;
+      if (brw->shader_time.programs[i]) {
+         shader_num = brw->shader_time.programs[i]->Name;
+      }
+
+      switch (brw->shader_time.types[i]) {
+      case ST_VS:
+         printf("vs   %4d: ", shader_num);
+         break;
+      case ST_FS8:
+         printf("fs8  %4d: ", shader_num);
+         break;
+      case ST_FS16:
+         printf("fs16 %4d: ", shader_num);
+         break;
+      default:
+         printf("other:     ");
+         break;
+      }
+
+      printf("%16lld (%7.2f Gcycles)      %4.1f%%\n",
+             (long long)brw->shader_time.cumulative[i],
+             (double)brw->shader_time.cumulative[i] / 1000000000.0,
+             (double)brw->shader_time.cumulative[i] / total * 100.0);
+   }
+}
+
+static void
+brw_collect_shader_time(struct brw_context *brw)
+{
+   if (!brw->shader_time.bo)
+      return;
+
+   /* This probably stalls on the last rendering.  We could fix that by
+    * delaying reading the reports, but it doesn't look like it's a big
+    * overhead compared to the cost of tracking the time in the first place.
+    */
+   drm_intel_bo_map(brw->shader_time.bo, true);
+
+   uint32_t *times = brw->shader_time.bo->virtual;
+
+   for (int i = 0; i < brw->shader_time.num_entries; i++) {
+      brw->shader_time.cumulative[i] += times[i];
+   }
+
+   /* Zero the BO out to clear it out for our next collection.
+    */
+   memset(times, 0, brw->shader_time.bo->size);
+   drm_intel_bo_unmap(brw->shader_time.bo);
+}
+
+void
+brw_collect_and_report_shader_time(struct brw_context *brw)
+{
+   brw_collect_shader_time(brw);
+
+   if (brw->shader_time.report_time == 0 ||
+       get_time() - brw->shader_time.report_time >= 1.0) {
+      brw_report_shader_time(brw);
+      brw->shader_time.report_time = get_time();
+   }
+}
+
+void
+brw_destroy_shader_time(struct brw_context *brw)
+{
+   drm_intel_bo_unreference(brw->shader_time.bo);
+   brw->shader_time.bo = NULL;
+}
@@ -26,6 +26,7 @@

 extern "C" {
 #include "main/macros.h"
+#include "main/shaderobj.h"
 #include "program/prog_print.h"
 #include "program/prog_parameter.h"
 }
@@ -248,6 +249,8 @@ vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
      return 2;
   case VS_OPCODE_SCRATCH_WRITE:
      return 3;
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
@@ -1039,9 +1042,109 @@ vec4_visitor::setup_payload(void)
   this->first_non_payload_grf = reg;
 }

+src_reg
+vec4_visitor::get_timestamp()
+{
+   assert(intel->gen >= 7);
+
+   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                BRW_ARF_TIMESTAMP,
+                                0,
+                                BRW_REGISTER_TYPE_UD,
+                                BRW_VERTICAL_STRIDE_0,
+                                BRW_WIDTH_4,
+                                BRW_HORIZONTAL_STRIDE_4,
+                                BRW_SWIZZLE_XYZW,
+                                WRITEMASK_XYZW));
+
+   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);
+
+   vec4_instruction *mov = emit(MOV(dst, ts));
+   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
+    * even if it's not enabled in the dispatch.
+    */
+   mov->force_writemask_all = true;
+
+   return src_reg(dst);
+}
+
+void
+vec4_visitor::emit_shader_time_begin()
+{
+   current_annotation = "shader time start";
+   shader_start_time = get_timestamp();
+}
+
+void
+vec4_visitor::emit_shader_time_end()
+{
+   current_annotation = "shader time end";
+   src_reg shader_end_time = get_timestamp();
+
+   emit_shader_time_write(ST_VS, shader_start_time, shader_end_time);
+}
+
+void
+vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
+                                     src_reg start, src_reg end)
+{
+   /* Choose an index in the buffer and set up tracking information for our
+    * printouts.
+    */
+   int shader_time_index = brw->shader_time.num_entries++;
+   assert(shader_time_index <= brw->shader_time.max_entries);
+   brw->shader_time.types[shader_time_index] = type;
+   if (prog) {
+      _mesa_reference_shader_program(ctx,
+                                     &brw->shader_time.programs[shader_time_index],
+                                     prog);
+   }
+
+   /* Check that there weren't any timestamp reset events (assuming these
+    * were the only two timestamp reads that happened).
+    */
+   src_reg reset_end = end;
+   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
+   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
+   test->conditional_mod = BRW_CONDITIONAL_Z;
+
+   emit(IF(BRW_PREDICATE_NORMAL));
+
+   /* Take the current timestamp and get the delta. */
+   start.negate = true;
+   dst_reg diff = dst_reg(this, glsl_type::uint_type);
+   emit(ADD(diff, start, end));
+
+   /* If there were no instructions between the two timestamp gets, the diff
+    * is 2 cycles.  Remove that overhead, so I can forget about that when
+    * trying to determine the time taken for single instructions.
+    */
+   emit(ADD(diff, src_reg(diff), src_reg(-2u)));
+
+   int base_mrf = 6;
+
+   dst_reg offset_mrf = dst_reg(MRF, base_mrf);
+   offset_mrf.type = BRW_REGISTER_TYPE_UD;
+   emit(MOV(offset_mrf, src_reg(shader_time_index * 4)));
+
+   dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
+   time_mrf.type = BRW_REGISTER_TYPE_UD;
+   emit(MOV(time_mrf, src_reg(diff)));
+
+   vec4_instruction *inst;
+   inst = emit(SHADER_OPCODE_SHADER_TIME_ADD);
+   inst->base_mrf = base_mrf;
+   inst->mlen = 2;
+
+   emit(BRW_OPCODE_ENDIF);
+}
+
 bool
 vec4_visitor::run()
 {
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      emit_shader_time_begin();
+
   emit_attribute_fixups();

   /* Generate VS IR for main().  (the visitor only descends into
@@ -1057,6 +1160,9 @@ vec4_visitor::run()
   if (c->key.userclip_active && !c->key.uses_clip_distance)
      setup_uniform_clipplane_values();

+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      emit_shader_time_end();
+
   emit_urb_writes();

   /* Before any optimization, push array accesses out to scratch
@@ -302,6 +302,8 @@ public:
   int uniform_vector_size[MAX_UNIFORMS];
   int uniforms;

+   src_reg shader_start_time;
+
   struct hash_table *variable_ht;

   bool run(void);
@@ -434,6 +436,11 @@ public:
   void emit_urb_slot(int mrf, int vert_result);
   void emit_urb_writes(void);

+   void emit_shader_time_begin();
+   void emit_shader_time_end();
+   void emit_shader_time_write(enum shader_time_shader_type type,
+                               src_reg start, src_reg end);
+
   src_reg get_scratch_offset(vec4_instruction *inst,
 			      src_reg *reladdr, int reg_offset);
   src_reg get_pull_constant_offset(vec4_instruction *inst,
@@ -452,6 +459,8 @@ public:
   bool try_emit_sat(ir_expression *ir);
   void resolve_ud_negate(src_reg *reg);

+   src_reg get_timestamp();
+
   bool process_move_condition(ir_rvalue *ir);

   void dump_instruction(vec4_instruction *inst);
@@ -660,6 +660,10 @@ vec4_generator::generate_vs_instruction(vec4_instruction *instruction,
      generate_pull_constant_load(inst, dst, src[0], src[1]);
      break;

+   case SHADER_OPCODE_SHADER_TIME_ADD:
+      brw_shader_time_add(p, inst->base_mrf, SURF_INDEX_VS_SHADER_TIME);
+      break;
+
   default:
      if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
         _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
@@ -138,9 +138,19 @@ const struct brw_tracked_state brw_vs_ubo_surfaces = {
 static void
 brw_vs_upload_binding_table(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
   uint32_t *bind;
   int i;

+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
+                                          brw->shader_time.bo->size,
+                                          &brw->vs.surf_offset[SURF_INDEX_VS_SHADER_TIME]);
+
+      assert(brw->vs.prog_data->num_surfaces <= SURF_INDEX_VS_SHADER_TIME);
+      brw->vs.prog_data->num_surfaces = SURF_INDEX_VS_SHADER_TIME;
+   }
+
   /* CACHE_NEW_VS_PROG: Skip making a binding table if we don't use textures or
    * pull constants.
    */
@@ -43,6 +43,7 @@
 #include "intel_fbo.h"

 #include "brw_context.h"
+#include "brw_program.h"
 #include "brw_defines.h"
 #include "brw_state.h"
 #include "brw_draw.h"
@@ -69,6 +70,11 @@ static void brw_destroy_context( struct intel_context *intel )
 {
   struct brw_context *brw = brw_context(&intel->ctx);

+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      brw_collect_and_report_shader_time(brw);
+      brw_destroy_shader_time(brw);
+   }
+
   brw_destroy_state(brw);
   brw_draw_destroy( brw );

@@ -201,6 +207,14 @@ static void brw_new_batch( struct intel_context *intel )
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
+
+   /* We need to periodically reap the shader time results, because rollover
+    * happens every few seconds.  We also want to see results every once in a
+    * while, because many programs won't cleanly destroy our context, so the
+    * end-of-run printout may not happen.
+    */
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      brw_collect_and_report_shader_time(brw);
 }

 static void brw_invalidate_state( struct intel_context *intel, GLuint new_state )
@@ -1405,9 +1405,16 @@ const struct brw_tracked_state brw_wm_ubo_surfaces = {
 static void
 brw_upload_wm_binding_table(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
   uint32_t *bind;
   int i;

+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      intel->vtbl.create_constant_surface(brw, brw->shader_time.bo, 0,
+                                          brw->shader_time.bo->size,
+                                          &brw->wm.surf_offset[SURF_INDEX_WM_SHADER_TIME]);
+   }
+
   /* Might want to calculate nr_surfaces first, to avoid taking up so much
    * space for the binding table.
    */
@@ -492,6 +492,7 @@ static const struct dri_debug_control debug_control[] = {
   { "vs",    DEBUG_VS },
   { "clip",  DEBUG_CLIP },
   { "aub",   DEBUG_AUB },
+   { "shader_time", DEBUG_SHADER_TIME },
   { NULL,    0 }
 };

@@ -747,6 +748,11 @@ intelInitContext(struct intel_context *intel,
   INTEL_DEBUG = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
   if (INTEL_DEBUG & DEBUG_BUFMGR)
      dri_bufmgr_set_debug(intel->bufmgr, true);
+   if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && intel->gen < 7) {
+      fprintf(stderr,
+              "shader_time debugging requires gen7 (Ivybridge) or better.\n");
+      INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
+   }

   if (INTEL_DEBUG & DEBUG_AUB)
      drm_intel_bufmgr_gem_set_aub_dump(intel->bufmgr, true);
@@ -456,6 +456,7 @@ extern int INTEL_DEBUG;
 #define DEBUG_VS        0x1000000
 #define DEBUG_CLIP      0x2000000
 #define DEBUG_AUB       0x4000000
+#define DEBUG_SHADER_TIME 0x8000000

 #ifdef HAVE_ANDROID_PLATFORM
 #define LOG_TAG "INTEL-MESA"