etnaviv/nn: Implement zero run length encoding of weights

Check how much smaller the weight+bias buffers can be with different numbers of bits used to encode runs of zeroes, and choose the smallest one.

This reduces the bandwidth considerably, which is at present the bottleneck with useful models.

On a Libre Computer Alta AML-A311D-CC, I see these improvements:

MobileNetV1: 15.650ms -> 9.991ms
SSDLite MobileDet: 56.149ms -> 32.692ms

Acked-by: Christian Gmeiner <cgmeiner@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27513>
2024-02-03 15:54:39 +01:00
parent 1e78d9aaca
commit 0c0d62ba70
5 changed files with 300 additions and 106 deletions
@@ -88,6 +88,7 @@ struct etna_core_npu_info {
   unsigned tp_core_count;             /* number of TP cores */
   unsigned on_chip_sram_size;         /* Size of on-chip SRAM */
   unsigned axi_sram_size;             /* Size of SRAM behind AXI */
+   unsigned nn_zrl_bits;               /* Number of bits for zero run-length compression */
 };

 struct etna_core_info {
@@ -107,6 +107,7 @@ etna_query_feature_db(struct etna_core_info *info)
      info->npu.tp_core_count = db->TPEngine_CoreCount;
      info->npu.on_chip_sram_size = db->VIP_SRAM_SIZE;
      info->npu.axi_sram_size = db->AXI_SRAM_SIZE;
+      info->npu.nn_zrl_bits = db->NN_ZRL_BITS;
   }

   return true;
@@ -153,6 +153,8 @@ struct etna_specs {
   unsigned on_chip_sram_size;
   /* Size of SRAM behind AXI */
   unsigned axi_sram_size;
+   /* Number of bits for zero run-length compression */
+   unsigned nn_zrl_bits;
 };

 /* Compiled Gallium state. All the different compiled state atoms are woven
@@ -517,7 +517,8 @@ static unsigned
 calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
 {
   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
-   unsigned kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count);
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
   unsigned foo = (ACCUM_BUFFER_DEPTH * interleave_mode) / tile_y;

   if (operation->weight_width == 1)
@@ -526,16 +527,14 @@ calc_superblocks(struct etna_context *ctx, const struct etna_operation *operatio
   foo = MIN2(foo, kernels_per_core);
   foo = MIN2(foo, 127);

-   kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count * foo);
-   unsigned num_kernels = DIV_ROUND_UP(operation->output_channels, kernels_per_core * nn_core_count);
-   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, nn_core_count), num_kernels);
+   kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
+   unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
+   unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);

   /* TODO: Remove this once we support superblocks that don't divide output_channels in the compressed buffer */
-   while(operation->output_channels % superblocks)
+   while(output_channels % superblocks)
      superblocks++;

-   ML_DBG("superblocks %d\n", superblocks);
-
   return superblocks;
 }

@@ -619,16 +618,13 @@ calculate_tiling(struct etna_context *ctx, const struct etna_operation *operatio
   interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);

   tile_height = INPUT_BUFFER_DEPTH * interleave_mode - operation->weight_height + 1;
-   ML_DBG("INPUT_BUFFER_DEPTH %d interleave_mode %d operation->weight_height %d tile_height %d input_width %d output_width %d\n", INPUT_BUFFER_DEPTH, interleave_mode, operation->weight_height, tile_height, operation->input_width, output_width);
   tile_height = MIN2(tile_height, interleave_mode * ACCUM_BUFFER_DEPTH);
-   //tile_height = MIN2(tile_height, operation->input_width);
   tile_height = MIN2(tile_height, output_height);

   if (operation->stride > 1 && tile_height % 2 > 0)
      tile_height -= 1;

   superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
-   ML_DBG("tiling x %d y %d sb %d\n", tile_width, tile_height, superblocks);

   if (tile_width_out)
      *tile_width_out = tile_width;
@@ -789,9 +785,6 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation

   map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);

-   /* Should be max accumBufferDepth (64) / zdpNum (3) */
-   //assert(map->kernels_per_core <= (64 / 3));
-
   /* The header doesn't get cached */
   coefficients_size -= 64;

@@ -876,20 +869,102 @@ static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_op
   return correction;
 }

+
+/*
+ * Append `size` bits of `value` to a little-endian bit stream.
+ *
+ * `*buffer` is a 64-bit accumulator currently holding `*bits_in_buffer`
+ * pending bits. Once 32 or more bits are pending, the low 32-bit word is
+ * stored at `**dest` (only when `do_write` is true) and `*dest` is advanced
+ * by one word in either case, so a caller passing do_write == false can
+ * measure the stream length without touching memory.
+ *
+ * `value` is not masked to `size` bits, so callers must pass values that fit.
+ * At most one word is flushed per call.
+ */
 static void
-write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
+append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write)
+{
+   /* New bits go above the ones already pending. */
+   *buffer |= (uint64_t)value << *bits_in_buffer;
+   *bits_in_buffer += size;
+   if (*bits_in_buffer >= 32) {
+      if (do_write)
+         **dest = *buffer & 0xffffffff;
+      /* The pointer advances even in dry-run mode so sizes can be derived from it. */
+      *dest += 1;
+      *buffer >>= 32;
+      *bits_in_buffer -= 32;
+   }
+}
+
+/* State of a zero-run-length encoded weight stream built on top of append_bits(). */
+struct wb_stream {
+   /* Weight value that counts as "zero"; it is what wb_stream_write() accumulates into runs. */
+   unsigned zero_point;
+   /* Width in bits of each run-length field; 0 means values are emitted as plain bytes. */
+   unsigned zrl_bits;
+   /* The following four are forwarded unchanged to append_bits(). */
+   unsigned *bits_in_buffer;
+   uint64_t *buffer;
+   uint32_t **map;
+   bool do_write;
+
+   /* Number of zero-point values seen but not yet emitted to the stream. */
+   unsigned accum_zeroes;
+};
+
+/*
+ * Emit any pending run of zero-point values so that raw (non run-length
+ * encoded) data can follow in the stream. Does nothing when no zeroes are
+ * pending.
+ */
+static void
+wb_stream_flush_zeroes(struct wb_stream *wb_stream)
+{
+   if (wb_stream->accum_zeroes == 0)
+      return;
+
+   /* A run of N zeroes is written as a run length of N - 1 followed by one explicit zero-point byte. */
+   append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+   wb_stream->accum_zeroes = 0;
+   append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+}
+
+/*
+ * Add one 8-bit weight to the stream.
+ *
+ * With zrl_bits == 0 the value is appended as a plain byte. Otherwise the
+ * stream is a sequence of (run length, value) pairs, where the run length
+ * is zrl_bits wide and counts the zero-point values that precede the 8-bit
+ * value. Zero-point values are only counted here; they reach the stream
+ * with the next emitted pair or through wb_stream_flush_zeroes().
+ */
+static void
+wb_stream_write(struct wb_stream *wb_stream, unsigned value)
+{
+   /* Largest run length representable in zrl_bits bits. */
+   unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;
+
+   if (wb_stream->zrl_bits == 0) {
+      append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+      return;
+   }
+
+   /* The run counter is saturated: emit it together with this value, even if the value is itself a zero. */
+   if (wb_stream->accum_zeroes == max_zeroes) {
+      append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+      wb_stream->accum_zeroes = 0;
+      append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+      return;
+   }
+
+   /* Another zero: just extend the pending run. */
+   if (value == wb_stream->zero_point) {
+      wb_stream->accum_zeroes++;
+      return;
+   }
+
+   /* Non-zero value: emit the pending run length (possibly 0) followed by the value. */
+   append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+   wb_stream->accum_zeroes = 0;
+   append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
+}
+
+static unsigned
+write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
-   unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
+   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
-   unsigned stride = MIN2(operation->input_channels, 6);
+   unsigned stride = MIN2(input_channels, 6);
   unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
   uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
+   uint32_t *initial_ptr = map;
+   bool do_write = initial_ptr != NULL;
+   uint64_t buffer = 0;
+   unsigned bits_in_buffer = 0;
+   struct wb_stream wb_stream = {
+      .zero_point = operation->weight_zero_point,
+      .zrl_bits = zrl_bits,
+      .bits_in_buffer = &bits_in_buffer,
+      .buffer = &buffer,
+      .map = &map,
+      .do_write = do_write,
+   };

-   ML_DBG("%s\n", __func__);
+   ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits);
+
+   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
+   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

@@ -898,53 +973,77 @@ write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned
         kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
-         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
-         weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * operation->input_channels;
+         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
+         weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * input_channels;
      }

-      for (unsigned block = 0; block < DIV_ROUND_UP(operation->input_channels, stride); block++) {
+      for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
-            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
+            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

            if (block == 0) {
-               *map++ = weights_maps[kernel][0];
+               wb_stream_write(&wb_stream, weights_maps[kernel][0]);

               uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
-               //fprintf(stderr, "core %d sb %d b %d kernel %d out_channel %d bias %x first %02x\n", core, superblock, block, kernel, out_channel, biases[out_channel] - corr, weights_maps[kernel][0]);
-               *((uint32_t *)map) = biases[out_channel] - corr;
-               map += sizeof(uint32_t);
+               wb_stream_flush_zeroes(&wb_stream);
+               append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);

               for (int i = 1; i < stride; i++) {
-                  *map++ = weights_maps[kernel][i];
+                  wb_stream_write(&wb_stream, weights_maps[kernel][i]);
               }
            } else {
               for (int i = 0; i < stride; i++) {
-                  if (i + block * stride < operation->input_channels)
-                     *map++ = weights_maps[kernel][i + block * stride];
+                  if (i + block * stride < input_channels)
+                     wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]);
               }
            }
-            if (block == DIV_ROUND_UP(operation->input_channels, stride) - 1) {
-               *((uint32_t*)map) = out_values_per_channel * out_channel;
-               map += sizeof(uint32_t);
+            if (block == DIV_ROUND_UP(input_channels, stride) - 1) {
+               wb_stream_flush_zeroes(&wb_stream);
+               append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
            }
         }
      }
   }
+
+   wb_stream_flush_zeroes(&wb_stream);
+
+   if (bits_in_buffer > 0)
+      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
+
+   return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
 }

-static void
-write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
+static unsigned
+write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
-   unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
+   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
-   uint8_t (*weights_map)[operation->input_channels][operation->weight_width][operation->weight_height] = (void *)input;
+   uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
+   uint32_t *initial_ptr = map;
+   bool do_write = initial_ptr != NULL;
+   uint64_t buffer = 0;
+   unsigned bits_in_buffer = 0;
+   struct wb_stream wb_stream = {
+      .zero_point = operation->weight_zero_point,
+      .zrl_bits = zrl_bits,
+      .bits_in_buffer = &bits_in_buffer,
+      .buffer = &buffer,
+      .map = &map,
+      .do_write = do_write,
+   };

-   ML_DBG("%s core %d\n", __func__, core);
+   ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, map);
+
+   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
+   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

@@ -952,15 +1051,9 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
      if (superblock == superblocks - 1)
         kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;

-      for (unsigned z = 0; z < operation->input_channels; z++) {
+      for (unsigned z = 0; z < input_channels; z++) {
         for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
-            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
-
-#if 0
-            if (z == 0)
-               fprintf(stderr, "core %d DIV_ROUND_UP(kernels_per_core, superblocks) %d kernel %d superblock * (operation->output_channels / superblocks) %u out_channel %d\n",
-                       core, DIV_ROUND_UP(kernels_per_core, superblocks), kernel, superblock * (operation->output_channels / superblocks + 4), out_channel);
-#endif
+            unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

            for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
               unsigned stride = operation->weight_height;
@@ -970,13 +1063,11 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = 0; y < stride; y++) {
-                     //fprintf(stderr, "oc %d x %d y %d z %d: %02x\n", out_channel, x, y, z, weights_map[out_channel][z][x][y]);
-                     *map++ = weights_map[out_channel][z][x][y];
+                     wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
                     if (x == 0 && y == 0 && z == 0) {
                        uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
-                        //fprintf(stderr, "core %d sb %d ic %d out_channel %d kernel %d bias %x first %02x\n", core, superblock, z, out_channel, kernel, biases[out_channel] - corr, weights_map[out_channel][z][x][y]);
-                        *((uint32_t *)map) = biases[out_channel] - corr;
-                        map += sizeof(uint32_t);
+                        wb_stream_flush_zeroes(&wb_stream);
+                        append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
                     }
                  }
               }
@@ -985,34 +1076,59 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
                     if (x >= operation->weight_width)
                        break;
                     for (unsigned y = stride; y < operation->weight_width; y++) {
-                        //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[out_channel][z][x][y]);
-                        *map++ = weights_map[out_channel][z][x][y];
+                        wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
                     }
                  }
               }
            }

-            if (z == operation->input_channels - 1) {
-               *((uint32_t*)map) = out_values_per_channel * out_channel;
-               map += sizeof(uint32_t);
+            if (z == input_channels - 1) {
+               wb_stream_flush_zeroes(&wb_stream);
+               append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
            }
         }
+         if (superblock == superblocks - 1)
+            wb_stream_flush_zeroes(&wb_stream);
      }
   }
+
+   wb_stream_flush_zeroes(&wb_stream);
+
+   if (bits_in_buffer > 0)
+      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
+
+   return (uint8_t *)map - (uint8_t *)initial_ptr;
 }

-static void
-write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
+static unsigned
+write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
 {
   struct pipe_context *pctx = subgraph->base.context;
   unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
-   unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
   uint8_t *input = map_resource(operation->weight_tensor);
   uint32_t *biases = map_resource(operation->bias_tensor);
   unsigned out_values_per_channel = operation->output_width * operation->output_height;
   unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
+   uint32_t *initial_ptr = map;
+   bool do_write = initial_ptr != NULL;
+   uint64_t buffer = 0;
+   unsigned bits_in_buffer = 0;
+   struct wb_stream wb_stream = {
+      .zero_point = operation->weight_zero_point,
+      .zrl_bits = zrl_bits,
+      .bits_in_buffer = &bits_in_buffer,
+      .buffer = &buffer,
+      .map = &map,
+      .do_write = do_write,
+   };

-   ML_DBG("%s: superblocks %d channels %d\n", __func__, superblocks, operation->output_channels);
+   ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks);
+
+   append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
+   append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);

   for (unsigned superblock = 0; superblock < superblocks; superblock++) {

@@ -1021,7 +1137,7 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
         kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;

      for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
-         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
+         unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;

         uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height;

@@ -1034,13 +1150,12 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
               if (x >= operation->weight_width)
                  break;
               for (unsigned y = 0; y < stride; y++) {
-                  //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);

-                  *map++ = weights_map[x][y];
+                  wb_stream_write(&wb_stream, weights_map[x][y]);
                  if (x == 0 && y == 0) {
                     uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
-                     *((uint32_t *)map) = biases[out_channel] - corr;
-                     map += sizeof(uint32_t);
+                     wb_stream_flush_zeroes(&wb_stream);
+                     append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
                  }
               }
            }
@@ -1050,44 +1165,128 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
                  if (x >= operation->weight_width)
                     break;
                  for (unsigned y = stride; y < operation->weight_width; y++) {
-                     //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);
-                     *map++ = weights_map[x][y];
+                     wb_stream_write(&wb_stream, weights_map[x][y]);
                  }
               }
            }
         }
-         if (operation->addition) {
-            *((uint32_t*)map) = operation->addition_offset;
-         } else
-            *((uint32_t*)map) = out_values_per_channel * out_channel;
-         map += sizeof(uint32_t);
+         wb_stream_flush_zeroes(&wb_stream);
+         if (operation->addition)
+            append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write);
+         else
+            append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
      }
   }
+
+   wb_stream_flush_zeroes(&wb_stream);
+
+   if (bits_in_buffer > 0)
+      append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
+
+   return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
+}
+
+/*
+ * Return the size in bytes to allocate for the weights+bias buffer of
+ * `operation`.
+ *
+ * The size is derived from the uncompressed layout: a header of one 32-bit
+ * word per NN core (aligned to 64 bytes), then for each used core a 1-byte
+ * zrl_bits field, a 2-byte kernel count and, for every kernel, its weights
+ * plus two 32-bit words, with each core's section aligned to 64 bytes.
+ *
+ * NOTE(review): with zrl_bits > 0 every non-zero weight takes 8 + zrl_bits
+ * bits in the stream (see wb_stream_write()), so a mostly non-zero tensor
+ * may encode larger than this estimate -- confirm the callers never pick a
+ * zrl_bits value for which that happens.
+ */
+static unsigned
+calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
+   unsigned header_size = ALIGN(nn_core_count * 4, 64);
+   /* Addition operations are sized as a single input and output channel. */
+   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
+   unsigned weights_size;
+   unsigned core_size;
+   unsigned core_size_aligned;
+   unsigned compressed_size_aligned;
+
+   weights_size = operation->weight_width * operation->weight_height * input_channels;
+   /* 1 byte zrl_bits + 2 bytes kernel count, then per kernel: weights + two 32-bit words. */
+   core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
+   core_size_aligned = ALIGN(core_size, 64);
+   compressed_size_aligned = header_size + core_size_aligned * cores_used;
+
+   return compressed_size_aligned;
+}
+
+/*
+ * Choose the run-length field width (in bits) used to zero-run-length
+ * encode the weights of `operation`.
+ *
+ * Depthwise and strided convolutions always get the hardware maximum;
+ * addition and pointwise operations always get 0 (no compression). For
+ * everything else, the stream is encoded in dry-run mode (NULL destination)
+ * for each candidate width from the maximum downwards, and the search stops
+ * as soon as a narrower field produces a larger buffer.
+ *
+ * Returns a value in [0, specs.nn_zrl_bits].
+ */
+static unsigned
+calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
+{
+   struct pipe_context *context = subgraph->base.context;
+   struct etna_context *ctx = etna_context(context);
+   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
+   unsigned max_zrl_bits = ctx->screen->specs.nn_zrl_bits;
+   unsigned header_size = ALIGN(nn_core_count * 4, 64);
+   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
+   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
+   unsigned cores_used = MIN2(output_channels, nn_core_count);
+   unsigned best_compressed_size;
+   unsigned best_zrl_bits;
+
+   /* On HW that doesn't natively support depthwise and strided convolutions,
+    * we have to lower them and pad with lots of zeroes. We can be pretty certain
+    * that max bits of compression will help these jobs.
+    */
+   if (operation->depthwise ||
+       operation->stride > 1) {
+
+      return max_zrl_bits;
+   }
+
+   /* These are very unlikely to have enough zeroes for compression to be useful. */
+   if (operation->addition ||
+       operation->pointwise) {
+
+      return 0;
+   }
+
+   /* This calculation can be really slow. Start from max_zrl_bits as big
+    * buffers will benefit the most from high zero compression.
+    *
+    * The counter is unsigned, so a "zrl_bits >= 0" loop condition would always
+    * be true and the counter would wrap past zero; the loop therefore stops
+    * explicitly after evaluating zrl_bits == 0.
+    */
+   best_compressed_size = UINT_MAX;
+   best_zrl_bits = 0;
+   for (unsigned zrl_bits = max_zrl_bits; ; zrl_bits--) {
+
+      unsigned compressed_size = header_size;
+      for (unsigned core = 0; core < cores_used; core++) {
+
+         /* Same format selection as when the buffer is actually written;
+          * a NULL destination makes the writers only measure the size.
+          */
+         unsigned actual_size;
+         if (operation->pointwise && output_channels > 8)
+            actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits);
+         else if (input_channels > 1)
+            actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits);
+         else
+            actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits);
+
+         compressed_size += actual_size;
+      }
+
+      /* If more bits don't compress further, then stop */
+      if (compressed_size <= best_compressed_size) {
+         best_compressed_size = compressed_size;
+         best_zrl_bits = zrl_bits;
+      } else
+         break;
+
+      /* 0 bits (no compression) is the last candidate. */
+      if (zrl_bits == 0)
+         break;
+   }
+
+   return best_zrl_bits;
+}

 static struct etna_bo *
 create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size)
 {
-   /* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */
   struct pipe_context *context = subgraph->base.context;
   struct etna_context *ctx = etna_context(context);
   unsigned nn_core_count = ctx->screen->specs.nn_core_count;
   unsigned header_size = ALIGN(nn_core_count * 4, 64);
-   unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */
-   unsigned input_channels;
+   unsigned input_channels = operation->addition ? 1 : operation->input_channels;
   unsigned output_channels = operation->addition ? 1 : operation->output_channels;
   unsigned cores_used = MIN2(output_channels, nn_core_count);
-   unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
-   uint8_t zero_length_encoding = false;
-   unsigned weights_size;
-   unsigned core_size;
-   unsigned core_size_aligned;
+   unsigned zrl_bits;

-   input_channels = operation->addition ? 1 : operation->input_channels;
-   weights_size = operation->weight_width * operation->weight_height * input_channels * weight_item_size;
-   core_size = 3 + (weights_size + 4 + 4) * kernels_per_core;
-   core_size_aligned = ALIGN(core_size, 64);
-   *size = header_size + core_size_aligned * cores_used;
+   *size = calculate_weight_bo_size(subgraph, operation);
+   zrl_bits = calculate_zrl_bits(subgraph, operation);

   struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
                                            *size,
@@ -1095,37 +1294,27 @@ create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_oper

   etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);

-   uint8_t *map = etna_bo_map(compressed);
-   uint32_t *header = (uint32_t *)map;
-
+   uint32_t *map = etna_bo_map(compressed);
   memset(map, 0, *size);

-   for (unsigned core = 0; core < cores_used; core++)
-      header[core] = core_size_aligned;
-
-   map += header_size;
-
-#if 0
-   uint8_t *input = map_resource(operation->weight_tensor);
-   for (int i = 0; i < operation->output_channels * operation->input_channels * operation->weight_width * operation->weight_height; i++)
-      fprintf(stderr, "i %d: %02x\n", i, input[i]);
-#endif
+   uint32_t *header = map;
+   map += header_size / 4;

   for (unsigned core = 0; core < cores_used; core++) {

-      *map++ = zero_length_encoding;
-
-      *((uint16_t *)map) = kernels_per_core;
-      map += sizeof(uint16_t);
-
-      if (operation->pointwise && input_channels >= 1 && output_channels > 8)
-         write_6_weight_format(subgraph, map, kernels_per_core, core, operation);
+      unsigned actual_size;
+      if (operation->pointwise && output_channels > 8)
+         actual_size = write_core_6(subgraph, map, core, operation, zrl_bits);
      else if (input_channels > 1)
-         write_interleaved_weight_format(subgraph, map, kernels_per_core, core, operation);
+         actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits);
      else
-         write_sequential_weight_format(subgraph, map, kernels_per_core, core, operation);
+         actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);

-      map += core_size_aligned - 3;
+      actual_size = ALIGN(actual_size, 64);
+
+      header[core] = actual_size;
+
+      map += actual_size / 4;
   }

   etna_bo_cpu_fini(compressed);
@@ -858,6 +858,7 @@ etna_get_specs(struct etna_screen *screen)
      screen->specs.tp_core_count = info->npu.tp_core_count;
      screen->specs.on_chip_sram_size = info->npu.on_chip_sram_size;
      screen->specs.axi_sram_size = info->npu.axi_sram_size;
+      screen->specs.nn_zrl_bits = info->npu.nn_zrl_bits;
   }

   /* Figure out gross GPU architecture. See rnndb/common.xml for a specific