diff --git a/src/etnaviv/common/etna_core_info.h b/src/etnaviv/common/etna_core_info.h index 75401cd06b0..1dda8b456c5 100644 --- a/src/etnaviv/common/etna_core_info.h +++ b/src/etnaviv/common/etna_core_info.h @@ -88,6 +88,7 @@ struct etna_core_npu_info { unsigned tp_core_count; /* number of TP cores */ unsigned on_chip_sram_size; /* Size of on-chip SRAM */ unsigned axi_sram_size; /* Size of SRAM behind AXI */ + unsigned nn_zrl_bits; /* Number of bits for zero run-length compression */ }; struct etna_core_info { diff --git a/src/etnaviv/hwdb/etna_hwdb.c b/src/etnaviv/hwdb/etna_hwdb.c index acacd5e4721..d58cb47dc0f 100644 --- a/src/etnaviv/hwdb/etna_hwdb.c +++ b/src/etnaviv/hwdb/etna_hwdb.c @@ -107,6 +107,7 @@ etna_query_feature_db(struct etna_core_info *info) info->npu.tp_core_count = db->TPEngine_CoreCount; info->npu.on_chip_sram_size = db->VIP_SRAM_SIZE; info->npu.axi_sram_size = db->AXI_SRAM_SIZE; + info->npu.nn_zrl_bits = db->NN_ZRL_BITS; } return true; diff --git a/src/gallium/drivers/etnaviv/etnaviv_internal.h b/src/gallium/drivers/etnaviv/etnaviv_internal.h index 2dac4d633a3..8ef921cca0f 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_internal.h +++ b/src/gallium/drivers/etnaviv/etnaviv_internal.h @@ -153,6 +153,8 @@ struct etna_specs { unsigned on_chip_sram_size; /* Size of SRAM behind AXI */ unsigned axi_sram_size; + /* Number of bits for zero run-length compression */ + unsigned nn_zrl_bits; }; /* Compiled Gallium state. 
All the different compiled state atoms are woven diff --git a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c index 7f0b8696842..73bb3d1349a 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c +++ b/src/gallium/drivers/etnaviv/etnaviv_ml_nn.c @@ -517,7 +517,8 @@ static unsigned calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode) { unsigned nn_core_count = ctx->screen->specs.nn_core_count; - unsigned kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count); + unsigned output_channels = operation->addition ? 1 : operation->output_channels; + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count); unsigned foo = (ACCUM_BUFFER_DEPTH * interleave_mode) / tile_y; if (operation->weight_width == 1) @@ -526,16 +527,14 @@ calc_superblocks(struct etna_context *ctx, const struct etna_operation *operatio foo = MIN2(foo, kernels_per_core); foo = MIN2(foo, 127); - kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count * foo); - unsigned num_kernels = DIV_ROUND_UP(operation->output_channels, kernels_per_core * nn_core_count); - unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, nn_core_count), num_kernels); + kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo); + unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count); + unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels); /* TODO: Remove this once we support superblocks that don't divide output_channels in the compressed buffer */ - while(operation->output_channels % superblocks) + while(output_channels % superblocks) superblocks++; - ML_DBG("superblocks %d\n", superblocks); - return superblocks; } @@ -619,16 +618,13 @@ calculate_tiling(struct etna_context *ctx, const struct etna_operation *operatio interleave_mode = 
calc_interleave_mode(tile_width, operation->weight_height); tile_height = INPUT_BUFFER_DEPTH * interleave_mode - operation->weight_height + 1; - ML_DBG("INPUT_BUFFER_DEPTH %d interleave_mode %d operation->weight_height %d tile_height %d input_width %d output_width %d\n", INPUT_BUFFER_DEPTH, interleave_mode, operation->weight_height, tile_height, operation->input_width, output_width); tile_height = MIN2(tile_height, interleave_mode * ACCUM_BUFFER_DEPTH); - //tile_height = MIN2(tile_height, operation->input_width); tile_height = MIN2(tile_height, output_height); if (operation->stride > 1 && tile_height % 2 > 0) tile_height -= 1; superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode); - ML_DBG("tiling x %d y %d sb %d\n", tile_width, tile_height, superblocks); if (tile_width_out) *tile_width_out = tile_width; @@ -789,9 +785,6 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks); - /* Should be max accumBufferDepth (64) / zdpNum (3) */ - //assert(map->kernels_per_core <= (64 / 3)); - /* The header doesn't get cached */ coefficients_size -= 64; @@ -876,20 +869,102 @@ static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_op return correction; } + static void -write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation) +append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write) +{ + *buffer |= (uint64_t)value << *bits_in_buffer; + *bits_in_buffer += size; + if (*bits_in_buffer >= 32) { + if (do_write) + **dest = *buffer & 0xffffffff; + *dest += 1; + *buffer >>= 32; + *bits_in_buffer -= 32; + } +} + +struct wb_stream { + unsigned zero_point; + unsigned zrl_bits; + unsigned *bits_in_buffer; + uint64_t *buffer; + uint32_t **map; + bool do_write; + + unsigned 
accum_zeroes; +}; + +static void +wb_stream_flush_zeroes(struct wb_stream *wb_stream) +{ + if (wb_stream->accum_zeroes == 0) + return; + + append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); + wb_stream->accum_zeroes = 0; + append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); +} + +static void +wb_stream_write(struct wb_stream *wb_stream, unsigned value) +{ + unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1; + + if (wb_stream->zrl_bits == 0) { + append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); + return; + } + + if (wb_stream->accum_zeroes == max_zeroes) { + append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); + wb_stream->accum_zeroes = 0; + append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); + return; + } + + if (value == wb_stream->zero_point) { + wb_stream->accum_zeroes++; + return; + } + + append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); + wb_stream->accum_zeroes = 0; + append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write); +} + +static unsigned +write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits) { struct pipe_context *pctx = subgraph->base.context; unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count; - unsigned cores_used = MIN2(operation->output_channels, nn_core_count); + unsigned input_channels = operation->addition ? 1 : operation->input_channels; + unsigned output_channels = operation->addition ? 
1 : operation->output_channels; + unsigned cores_used = MIN2(output_channels, nn_core_count); + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used); uint8_t *input = map_resource(operation->weight_tensor); uint32_t *biases = map_resource(operation->bias_tensor); unsigned out_values_per_channel = operation->output_width * operation->output_height; - unsigned stride = MIN2(operation->input_channels, 6); + unsigned stride = MIN2(input_channels, 6); unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL); uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)]; + uint32_t *initial_ptr = map; + bool do_write = initial_ptr != NULL; + uint64_t buffer = 0; + unsigned bits_in_buffer = 0; + struct wb_stream wb_stream = { + .zero_point = operation->weight_zero_point, + .zrl_bits = zrl_bits, + .bits_in_buffer = &bits_in_buffer, + .buffer = &buffer, + .map = &map, + .do_write = do_write, + }; - ML_DBG("%s\n", __func__); + ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits); + + append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write); + append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write); for (unsigned superblock = 0; superblock < superblocks; superblock++) { @@ -898,53 +973,77 @@ write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks; for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { - unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; - weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * operation->input_channels; + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used; + weights_maps[kernel] = 
input + out_channel * operation->weight_width * operation->weight_height * input_channels; } - for (unsigned block = 0; block < DIV_ROUND_UP(operation->input_channels, stride); block++) { + for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) { for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { - unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used; if (block == 0) { - *map++ = weights_maps[kernel][0]; + wb_stream_write(&wb_stream, weights_maps[kernel][0]); uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation); - //fprintf(stderr, "core %d sb %d b %d kernel %d out_channel %d bias %x first %02x\n", core, superblock, block, kernel, out_channel, biases[out_channel] - corr, weights_maps[kernel][0]); - *((uint32_t *)map) = biases[out_channel] - corr; - map += sizeof(uint32_t); + wb_stream_flush_zeroes(&wb_stream); + append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write); for (int i = 1; i < stride; i++) { - *map++ = weights_maps[kernel][i]; + wb_stream_write(&wb_stream, weights_maps[kernel][i]); } } else { for (int i = 0; i < stride; i++) { - if (i + block * stride < operation->input_channels) - *map++ = weights_maps[kernel][i + block * stride]; + if (i + block * stride < input_channels) + wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]); } } - if (block == DIV_ROUND_UP(operation->input_channels, stride) - 1) { - *((uint32_t*)map) = out_values_per_channel * out_channel; - map += sizeof(uint32_t); + if (block == DIV_ROUND_UP(input_channels, stride) - 1) { + wb_stream_flush_zeroes(&wb_stream); + append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write); } } } } + + 
wb_stream_flush_zeroes(&wb_stream); + + if (bits_in_buffer > 0) + append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write); + + return (uint8_t *)map - (uint8_t *)initial_ptr - 1; } -static void -write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation) +static unsigned +write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits) { struct pipe_context *pctx = subgraph->base.context; unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count; - unsigned cores_used = MIN2(operation->output_channels, nn_core_count); + unsigned input_channels = operation->addition ? 1 : operation->input_channels; + unsigned output_channels = operation->addition ? 1 : operation->output_channels; + unsigned cores_used = MIN2(output_channels, nn_core_count); + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used); uint8_t *input = map_resource(operation->weight_tensor); uint32_t *biases = map_resource(operation->bias_tensor); unsigned out_values_per_channel = operation->output_width * operation->output_height; unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL); - uint8_t (*weights_map)[operation->input_channels][operation->weight_width][operation->weight_height] = (void *)input; + uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input; + uint32_t *initial_ptr = map; + bool do_write = initial_ptr != NULL; + uint64_t buffer = 0; + unsigned bits_in_buffer = 0; + struct wb_stream wb_stream = { + .zero_point = operation->weight_zero_point, + .zrl_bits = zrl_bits, + .bits_in_buffer = &bits_in_buffer, + .buffer = &buffer, + .map = &map, + .do_write = do_write, + }; - ML_DBG("%s core %d\n", __func__, core); + ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, 
map); + + append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write); + append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write); for (unsigned superblock = 0; superblock < superblocks; superblock++) { @@ -952,15 +1051,9 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, if (superblock == superblocks - 1) kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks; - for (unsigned z = 0; z < operation->input_channels; z++) { + for (unsigned z = 0; z < input_channels; z++) { for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { - unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; - -#if 0 - if (z == 0) - fprintf(stderr, "core %d DIV_ROUND_UP(kernels_per_core, superblocks) %d kernel %d superblock * (operation->output_channels / superblocks) %u out_channel %d\n", - core, DIV_ROUND_UP(kernels_per_core, superblocks), kernel, superblock * (operation->output_channels / superblocks + 4), out_channel); -#endif + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used; for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) { unsigned stride = operation->weight_height; @@ -970,13 +1063,11 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, if (x >= operation->weight_width) break; for (unsigned y = 0; y < stride; y++) { - //fprintf(stderr, "oc %d x %d y %d z %d: %02x\n", out_channel, x, y, z, weights_map[out_channel][z][x][y]); - *map++ = weights_map[out_channel][z][x][y]; + wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]); if (x == 0 && y == 0 && z == 0) { uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation); - //fprintf(stderr, "core %d sb %d ic %d 
out_channel %d kernel %d bias %x first %02x\n", core, superblock, z, out_channel, kernel, biases[out_channel] - corr, weights_map[out_channel][z][x][y]); - *((uint32_t *)map) = biases[out_channel] - corr; - map += sizeof(uint32_t); + wb_stream_flush_zeroes(&wb_stream); + append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write); } } } @@ -985,34 +1076,59 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, if (x >= operation->weight_width) break; for (unsigned y = stride; y < operation->weight_width; y++) { - //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[out_channel][z][x][y]); - *map++ = weights_map[out_channel][z][x][y]; + wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]); } } } } - if (z == operation->input_channels - 1) { - *((uint32_t*)map) = out_values_per_channel * out_channel; - map += sizeof(uint32_t); + if (z == input_channels - 1) { + wb_stream_flush_zeroes(&wb_stream); + append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write); } } + if (superblock == superblocks - 1) + wb_stream_flush_zeroes(&wb_stream); } } + + wb_stream_flush_zeroes(&wb_stream); + + if (bits_in_buffer > 0) + append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write); + + return (uint8_t *)map - (uint8_t *)initial_ptr; } -static void -write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation) +static unsigned +write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits) { struct pipe_context *pctx = subgraph->base.context; unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count; - unsigned cores_used = MIN2(operation->output_channels, nn_core_count); + unsigned output_channels = operation->addition ? 
1 : operation->output_channels; + unsigned cores_used = MIN2(output_channels, nn_core_count); + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used); uint8_t *input = map_resource(operation->weight_tensor); uint32_t *biases = map_resource(operation->bias_tensor); unsigned out_values_per_channel = operation->output_width * operation->output_height; unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL); + uint32_t *initial_ptr = map; + bool do_write = initial_ptr != NULL; + uint64_t buffer = 0; + unsigned bits_in_buffer = 0; + struct wb_stream wb_stream = { + .zero_point = operation->weight_zero_point, + .zrl_bits = zrl_bits, + .bits_in_buffer = &bits_in_buffer, + .buffer = &buffer, + .map = &map, + .do_write = do_write, + }; - ML_DBG("%s: superblocks %d channels %d\n", __func__, superblocks, operation->output_channels); + ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks); + + append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write); + append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write); for (unsigned superblock = 0; superblock < superblocks; superblock++) { @@ -1021,7 +1137,7 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks; for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) { - unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used; + unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used; uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height; @@ -1034,13 +1150,12 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, 
uint8_t *map, if (x >= operation->weight_width) break; for (unsigned y = 0; y < stride; y++) { - //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]); - *map++ = weights_map[x][y]; + wb_stream_write(&wb_stream, weights_map[x][y]); if (x == 0 && y == 0) { uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation); - *((uint32_t *)map) = biases[out_channel] - corr; - map += sizeof(uint32_t); + wb_stream_flush_zeroes(&wb_stream); + append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write); } } } @@ -1050,44 +1165,128 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, if (x >= operation->weight_width) break; for (unsigned y = stride; y < operation->weight_width; y++) { - //fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]); - *map++ = weights_map[x][y]; + wb_stream_write(&wb_stream, weights_map[x][y]); } } } } - if (operation->addition) { - *((uint32_t*)map) = operation->addition_offset; - } else - *((uint32_t*)map) = out_values_per_channel * out_channel; - map += sizeof(uint32_t); + wb_stream_flush_zeroes(&wb_stream); + if (operation->addition) + append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write); + else + append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write); } } + + wb_stream_flush_zeroes(&wb_stream); + + if (bits_in_buffer > 0) + append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write); + + return (uint8_t *)map - (uint8_t *)initial_ptr - 1; +} + +static unsigned +calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + struct etna_context *ctx = etna_context(context); + unsigned nn_core_count = ctx->screen->specs.nn_core_count; + unsigned header_size = ALIGN(nn_core_count * 4, 64); + unsigned input_channels = operation->addition ? 
1 : operation->input_channels; + unsigned output_channels = operation->addition ? 1 : operation->output_channels; + unsigned cores_used = MIN2(output_channels, nn_core_count); + unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used); + unsigned weights_size; + unsigned core_size; + unsigned core_size_aligned; + unsigned compressed_size_aligned; + + weights_size = operation->weight_width * operation->weight_height * input_channels; + core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core; + core_size_aligned = ALIGN(core_size, 64); + compressed_size_aligned = header_size + core_size_aligned * cores_used; + + return compressed_size_aligned; +} + +static unsigned +calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation) +{ + struct pipe_context *context = subgraph->base.context; + struct etna_context *ctx = etna_context(context); + unsigned nn_core_count = ctx->screen->specs.nn_core_count; + unsigned max_zrl_bits = ctx->screen->specs.nn_zrl_bits; + unsigned header_size = ALIGN(nn_core_count * 4, 64); + unsigned input_channels = operation->addition ? 1 : operation->input_channels; + unsigned output_channels = operation->addition ? 1 : operation->output_channels; + unsigned cores_used = MIN2(output_channels, nn_core_count); + unsigned best_compressed_size; + unsigned best_zrl_bits; + + /* On HW that doesn't natively support depthwise and strided convolutions, + * we have to lower them and pad with lots of zeroes. We can be pretty certain + * that max bits of compression will help these jobs. + */ + if (operation->depthwise || + operation->stride > 1) { + + return max_zrl_bits; + } + + /* These are very unlikely to have enough zeroes for compression to be useful. */ + if (operation->addition || + operation->pointwise) { + + return 0; + } + + /* This calculation can be really slow. Start from max_zrl_bits as big + * buffers will benefit the most from high zero compression. The loop counter is signed so that the search also stops once zrl_bits == 0 has been tried, instead of wrapping around. 
+ */ + best_compressed_size = UINT_MAX; + best_zrl_bits = 0; + for (int zrl_bits = max_zrl_bits; zrl_bits >= 0; zrl_bits--) { + + unsigned compressed_size = header_size; + for (unsigned core = 0; core < cores_used; core++) { + + unsigned actual_size; + if (operation->pointwise && output_channels > 8) + actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits); + else if (input_channels > 1) + actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits); + else + actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits); + + compressed_size += actual_size; + } + + /* If more bits don't compress further, then stop */ + if (compressed_size <= best_compressed_size) { + best_compressed_size = compressed_size; + best_zrl_bits = zrl_bits; + } else + break; + } + + return best_zrl_bits; } static struct etna_bo * create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size) { - /* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */ struct pipe_context *context = subgraph->base.context; struct etna_context *ctx = etna_context(context); unsigned nn_core_count = ctx->screen->specs.nn_core_count; unsigned header_size = ALIGN(nn_core_count * 4, 64); - unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */ - unsigned input_channels; + unsigned input_channels = operation->addition ? 1 : operation->input_channels; unsigned output_channels = operation->addition ? 1 : operation->output_channels; unsigned cores_used = MIN2(output_channels, nn_core_count); - unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used); - uint8_t zero_length_encoding = false; - unsigned weights_size; - unsigned core_size; - unsigned core_size_aligned; + unsigned zrl_bits; - input_channels = operation->addition ? 
1 : operation->input_channels; - weights_size = operation->weight_width * operation->weight_height * input_channels * weight_item_size; - core_size = 3 + (weights_size + 4 + 4) * kernels_per_core; - core_size_aligned = ALIGN(core_size, 64); - *size = header_size + core_size_aligned * cores_used; + *size = calculate_weight_bo_size(subgraph, operation); + zrl_bits = calculate_zrl_bits(subgraph, operation); struct etna_bo *compressed = etna_bo_new(ctx->screen->dev, *size, @@ -1095,37 +1294,27 @@ create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_oper etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE); - uint8_t *map = etna_bo_map(compressed); - uint32_t *header = (uint32_t *)map; - + uint32_t *map = etna_bo_map(compressed); memset(map, 0, *size); - for (unsigned core = 0; core < cores_used; core++) - header[core] = core_size_aligned; - - map += header_size; - -#if 0 - uint8_t *input = map_resource(operation->weight_tensor); - for (int i = 0; i < operation->output_channels * operation->input_channels * operation->weight_width * operation->weight_height; i++) - fprintf(stderr, "i %d: %02x\n", i, input[i]); -#endif + uint32_t *header = map; + map += header_size / 4; for (unsigned core = 0; core < cores_used; core++) { - *map++ = zero_length_encoding; - - *((uint16_t *)map) = kernels_per_core; - map += sizeof(uint16_t); - - if (operation->pointwise && input_channels >= 1 && output_channels > 8) - write_6_weight_format(subgraph, map, kernels_per_core, core, operation); + unsigned actual_size; + if (operation->pointwise && output_channels > 8) + actual_size = write_core_6(subgraph, map, core, operation, zrl_bits); else if (input_channels > 1) - write_interleaved_weight_format(subgraph, map, kernels_per_core, core, operation); + actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits); else - write_sequential_weight_format(subgraph, map, kernels_per_core, core, operation); + actual_size = write_core_sequential(subgraph, map, 
core, operation, zrl_bits); - map += core_size_aligned - 3; + actual_size = ALIGN(actual_size, 64); + + header[core] = actual_size; + + map += actual_size / 4; } etna_bo_cpu_fini(compressed); diff --git a/src/gallium/drivers/etnaviv/etnaviv_screen.c b/src/gallium/drivers/etnaviv/etnaviv_screen.c index 06007029080..98dfb37aeec 100644 --- a/src/gallium/drivers/etnaviv/etnaviv_screen.c +++ b/src/gallium/drivers/etnaviv/etnaviv_screen.c @@ -858,6 +858,7 @@ etna_get_specs(struct etna_screen *screen) screen->specs.tp_core_count = info->npu.tp_core_count; screen->specs.on_chip_sram_size = info->npu.on_chip_sram_size; screen->specs.axi_sram_size = info->npu.axi_sram_size; + screen->specs.nn_zrl_bits = info->npu.nn_zrl_bits; } /* Figure out gross GPU architecture. See rnndb/common.xml for a specific