|
|
|
@@ -517,7 +517,8 @@ static unsigned
|
|
|
|
|
calc_superblocks(struct etna_context *ctx, const struct etna_operation *operation, unsigned tile_y, unsigned interleave_mode)
|
|
|
|
|
{
|
|
|
|
|
unsigned nn_core_count = ctx->screen->specs.nn_core_count;
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count);
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count);
|
|
|
|
|
unsigned foo = (ACCUM_BUFFER_DEPTH * interleave_mode) / tile_y;
|
|
|
|
|
|
|
|
|
|
if (operation->weight_width == 1)
|
|
|
|
@@ -526,16 +527,14 @@ calc_superblocks(struct etna_context *ctx, const struct etna_operation *operatio
|
|
|
|
|
foo = MIN2(foo, kernels_per_core);
|
|
|
|
|
foo = MIN2(foo, 127);
|
|
|
|
|
|
|
|
|
|
kernels_per_core = DIV_ROUND_UP(operation->output_channels, nn_core_count * foo);
|
|
|
|
|
unsigned num_kernels = DIV_ROUND_UP(operation->output_channels, kernels_per_core * nn_core_count);
|
|
|
|
|
unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, nn_core_count), num_kernels);
|
|
|
|
|
kernels_per_core = DIV_ROUND_UP(output_channels, nn_core_count * foo);
|
|
|
|
|
unsigned num_kernels = DIV_ROUND_UP(output_channels, kernels_per_core * nn_core_count);
|
|
|
|
|
unsigned superblocks = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), num_kernels);
|
|
|
|
|
|
|
|
|
|
/* TODO: Remove this once we support superblocks that don't divide output_channels in the compressed buffer */
|
|
|
|
|
while(operation->output_channels % superblocks)
|
|
|
|
|
while(output_channels % superblocks)
|
|
|
|
|
superblocks++;
|
|
|
|
|
|
|
|
|
|
ML_DBG("superblocks %d\n", superblocks);
|
|
|
|
|
|
|
|
|
|
return superblocks;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@@ -619,16 +618,13 @@ calculate_tiling(struct etna_context *ctx, const struct etna_operation *operatio
|
|
|
|
|
interleave_mode = calc_interleave_mode(tile_width, operation->weight_height);
|
|
|
|
|
|
|
|
|
|
tile_height = INPUT_BUFFER_DEPTH * interleave_mode - operation->weight_height + 1;
|
|
|
|
|
ML_DBG("INPUT_BUFFER_DEPTH %d interleave_mode %d operation->weight_height %d tile_height %d input_width %d output_width %d\n", INPUT_BUFFER_DEPTH, interleave_mode, operation->weight_height, tile_height, operation->input_width, output_width);
|
|
|
|
|
tile_height = MIN2(tile_height, interleave_mode * ACCUM_BUFFER_DEPTH);
|
|
|
|
|
//tile_height = MIN2(tile_height, operation->input_width);
|
|
|
|
|
tile_height = MIN2(tile_height, output_height);
|
|
|
|
|
|
|
|
|
|
if (operation->stride > 1 && tile_height % 2 > 0)
|
|
|
|
|
tile_height -= 1;
|
|
|
|
|
|
|
|
|
|
superblocks = calc_superblocks(ctx, operation, tile_height, interleave_mode);
|
|
|
|
|
ML_DBG("tiling x %d y %d sb %d\n", tile_width, tile_height, superblocks);
|
|
|
|
|
|
|
|
|
|
if (tile_width_out)
|
|
|
|
|
*tile_width_out = tile_width;
|
|
|
|
@@ -789,9 +785,6 @@ create_nn_config(struct etna_ml_subgraph *subgraph, const struct etna_operation
|
|
|
|
|
|
|
|
|
|
map->kernels_per_core = DIV_ROUND_UP(DIV_ROUND_UP(output_channels, nn_core_count), superblocks);
|
|
|
|
|
|
|
|
|
|
/* Should be max accumBufferDepth (64) / zdpNum (3) */
|
|
|
|
|
//assert(map->kernels_per_core <= (64 / 3));
|
|
|
|
|
|
|
|
|
|
/* The header doesn't get cached */
|
|
|
|
|
coefficients_size -= 64;
|
|
|
|
|
|
|
|
|
@@ -876,20 +869,102 @@ static uint32_t calculate_bias_correction(uint8_t *weights, const struct etna_op
|
|
|
|
|
return correction;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Append the low `size` bits of `value` to a little-endian bit stream.
 *
 * Bits are collected in the 64-bit accumulator `*buffer`; `*bits_in_buffer`
 * is the number of valid bits currently held there. Whenever at least 32 bits
 * are available, the low 32-bit word is emitted and the output cursor `*dest`
 * is advanced by one word.
 *
 * @param value          bits to append; anything above bit `size - 1` is ignored
 * @param size           number of bits to append (callers in this file use <= 32)
 * @param bits_in_buffer in/out count of pending bits in `*buffer`
 * @param buffer         in/out bit accumulator
 * @param dest           in/out output cursor, advanced once per completed word
 * @param do_write       when false, nothing is stored through `*dest`, but the
 *                       cursor still advances so callers can measure the
 *                       encoded size
 */
static void
append_bits(uint32_t value, size_t size, unsigned *bits_in_buffer, uint64_t *buffer, uint32_t **dest, bool do_write)
{
   /* Keep only the requested field width so a too-wide value cannot bleed
    * into the bits of whatever gets appended next. A full 32-bit field needs
    * no mask (and shifting a 32-bit value by 32 would be undefined).
    */
   if (size < 32)
      value &= ((uint32_t)1 << size) - 1;

   *buffer |= (uint64_t)value << *bits_in_buffer;
   *bits_in_buffer += size;

   if (*bits_in_buffer >= 32) {
      if (do_write)
         **dest = *buffer & 0xffffffff;

      /* The cursor moves even in dry-run mode: it is what the size
       * computation of the callers is based on.
       */
      *dest += 1;
      *buffer >>= 32;
      *bits_in_buffer -= 32;
   }
}
|
|
|
|
|
|
|
|
|
|
/**
 * State of a zero-run-length encoder layered on top of append_bits().
 *
 * The bit accumulator, its fill level and the output cursor are held by
 * pointer so that the owner can interleave its own append_bits() calls with
 * calls that go through the stream.
 */
struct wb_stream {
   /* Value that counts as "zero" when accumulating runs. */
   unsigned zero_point;
   /* Width in bits of each run-length field; 0 turns run-length coding off. */
   unsigned zrl_bits;
   /* Number of pending bits in *buffer. */
   unsigned *bits_in_buffer;
   /* Bit accumulator shared with the owner. */
   uint64_t *buffer;
   /* Output cursor shared with the owner. */
   uint32_t **map;
   /* False for a dry run in which nothing is stored to the output. */
   bool do_write;

   /* Zero-point values seen since the last emitted (run, value) pair. */
   unsigned accum_zeroes;
};
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
wb_stream_flush_zeroes(struct wb_stream *wb_stream)
|
|
|
|
|
{
|
|
|
|
|
if (wb_stream->accum_zeroes == 0)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
append_bits(wb_stream->accum_zeroes - 1, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
wb_stream->accum_zeroes = 0;
|
|
|
|
|
append_bits(wb_stream->zero_point, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
wb_stream_write(struct wb_stream *wb_stream, unsigned value)
|
|
|
|
|
{
|
|
|
|
|
unsigned max_zeroes = (1 << wb_stream->zrl_bits) - 1;
|
|
|
|
|
|
|
|
|
|
if (wb_stream->zrl_bits == 0) {
|
|
|
|
|
append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (wb_stream->accum_zeroes == max_zeroes) {
|
|
|
|
|
append_bits(max_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
wb_stream->accum_zeroes = 0;
|
|
|
|
|
append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (value == wb_stream->zero_point) {
|
|
|
|
|
wb_stream->accum_zeroes++;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
append_bits(wb_stream->accum_zeroes, wb_stream->zrl_bits, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
wb_stream->accum_zeroes = 0;
|
|
|
|
|
append_bits(value, 8, wb_stream->bits_in_buffer, wb_stream->buffer, wb_stream->map, wb_stream->do_write);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static unsigned
|
|
|
|
|
write_core_6(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
|
|
|
|
|
{
|
|
|
|
|
struct pipe_context *pctx = subgraph->base.context;
|
|
|
|
|
unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
|
|
|
|
|
unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
|
|
|
|
|
unsigned input_channels = operation->addition ? 1 : operation->input_channels;
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
|
|
|
|
|
uint8_t *input = map_resource(operation->weight_tensor);
|
|
|
|
|
uint32_t *biases = map_resource(operation->bias_tensor);
|
|
|
|
|
unsigned out_values_per_channel = operation->output_width * operation->output_height;
|
|
|
|
|
unsigned stride = MIN2(operation->input_channels, 6);
|
|
|
|
|
unsigned stride = MIN2(input_channels, 6);
|
|
|
|
|
unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
|
|
|
|
|
uint8_t *weights_maps[DIV_ROUND_UP(kernels_per_core, superblocks)];
|
|
|
|
|
uint32_t *initial_ptr = map;
|
|
|
|
|
bool do_write = initial_ptr != NULL;
|
|
|
|
|
uint64_t buffer = 0;
|
|
|
|
|
unsigned bits_in_buffer = 0;
|
|
|
|
|
struct wb_stream wb_stream = {
|
|
|
|
|
.zero_point = operation->weight_zero_point,
|
|
|
|
|
.zrl_bits = zrl_bits,
|
|
|
|
|
.bits_in_buffer = &bits_in_buffer,
|
|
|
|
|
.buffer = &buffer,
|
|
|
|
|
.map = &map,
|
|
|
|
|
.do_write = do_write,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
ML_DBG("%s\n", __func__);
|
|
|
|
|
ML_DBG("%s core %d zrl_bits %d\n", __func__, core, zrl_bits);
|
|
|
|
|
|
|
|
|
|
append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
for (unsigned superblock = 0; superblock < superblocks; superblock++) {
|
|
|
|
|
|
|
|
|
@@ -898,53 +973,77 @@ write_6_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned
|
|
|
|
|
kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;
|
|
|
|
|
|
|
|
|
|
for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
|
|
|
|
|
weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * operation->input_channels;
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
|
|
|
|
|
weights_maps[kernel] = input + out_channel * operation->weight_width * operation->weight_height * input_channels;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (unsigned block = 0; block < DIV_ROUND_UP(operation->input_channels, stride); block++) {
|
|
|
|
|
for (unsigned block = 0; block < DIV_ROUND_UP(input_channels, stride); block++) {
|
|
|
|
|
for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
|
|
|
|
|
|
|
|
|
|
if (block == 0) {
|
|
|
|
|
*map++ = weights_maps[kernel][0];
|
|
|
|
|
wb_stream_write(&wb_stream, weights_maps[kernel][0]);
|
|
|
|
|
|
|
|
|
|
uint32_t corr = calculate_bias_correction(weights_maps[kernel], operation);
|
|
|
|
|
//fprintf(stderr, "core %d sb %d b %d kernel %d out_channel %d bias %x first %02x\n", core, superblock, block, kernel, out_channel, biases[out_channel] - corr, weights_maps[kernel][0]);
|
|
|
|
|
*((uint32_t *)map) = biases[out_channel] - corr;
|
|
|
|
|
map += sizeof(uint32_t);
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
for (int i = 1; i < stride; i++) {
|
|
|
|
|
*map++ = weights_maps[kernel][i];
|
|
|
|
|
wb_stream_write(&wb_stream, weights_maps[kernel][i]);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
for (int i = 0; i < stride; i++) {
|
|
|
|
|
if (i + block * stride < operation->input_channels)
|
|
|
|
|
*map++ = weights_maps[kernel][i + block * stride];
|
|
|
|
|
if (i + block * stride < input_channels)
|
|
|
|
|
wb_stream_write(&wb_stream, weights_maps[kernel][i + block * stride]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (block == DIV_ROUND_UP(operation->input_channels, stride) - 1) {
|
|
|
|
|
*((uint32_t*)map) = out_values_per_channel * out_channel;
|
|
|
|
|
map += sizeof(uint32_t);
|
|
|
|
|
if (block == DIV_ROUND_UP(input_channels, stride) - 1) {
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
|
|
|
|
|
if (bits_in_buffer > 0)
|
|
|
|
|
append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
|
|
|
|
|
static unsigned
|
|
|
|
|
write_core_interleaved(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
|
|
|
|
|
{
|
|
|
|
|
struct pipe_context *pctx = subgraph->base.context;
|
|
|
|
|
unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
|
|
|
|
|
unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
|
|
|
|
|
unsigned input_channels = operation->addition ? 1 : operation->input_channels;
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
|
|
|
|
|
uint8_t *input = map_resource(operation->weight_tensor);
|
|
|
|
|
uint32_t *biases = map_resource(operation->bias_tensor);
|
|
|
|
|
unsigned out_values_per_channel = operation->output_width * operation->output_height;
|
|
|
|
|
unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
|
|
|
|
|
uint8_t (*weights_map)[operation->input_channels][operation->weight_width][operation->weight_height] = (void *)input;
|
|
|
|
|
uint8_t (*weights_map)[input_channels][operation->weight_width][operation->weight_height] = (void *)input;
|
|
|
|
|
uint32_t *initial_ptr = map;
|
|
|
|
|
bool do_write = initial_ptr != NULL;
|
|
|
|
|
uint64_t buffer = 0;
|
|
|
|
|
unsigned bits_in_buffer = 0;
|
|
|
|
|
struct wb_stream wb_stream = {
|
|
|
|
|
.zero_point = operation->weight_zero_point,
|
|
|
|
|
.zrl_bits = zrl_bits,
|
|
|
|
|
.bits_in_buffer = &bits_in_buffer,
|
|
|
|
|
.buffer = &buffer,
|
|
|
|
|
.map = &map,
|
|
|
|
|
.do_write = do_write,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
ML_DBG("%s core %d\n", __func__, core);
|
|
|
|
|
ML_DBG("%s core %d zrl_bits %d map %p\n", __func__, core, zrl_bits, map);
|
|
|
|
|
|
|
|
|
|
append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
for (unsigned superblock = 0; superblock < superblocks; superblock++) {
|
|
|
|
|
|
|
|
|
@@ -952,15 +1051,9 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
|
|
|
|
|
if (superblock == superblocks - 1)
|
|
|
|
|
kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;
|
|
|
|
|
|
|
|
|
|
for (unsigned z = 0; z < operation->input_channels; z++) {
|
|
|
|
|
for (unsigned z = 0; z < input_channels; z++) {
|
|
|
|
|
for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
|
|
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
|
if (z == 0)
|
|
|
|
|
fprintf(stderr, "core %d DIV_ROUND_UP(kernels_per_core, superblocks) %d kernel %d superblock * (operation->output_channels / superblocks) %u out_channel %d\n",
|
|
|
|
|
core, DIV_ROUND_UP(kernels_per_core, superblocks), kernel, superblock * (operation->output_channels / superblocks + 4), out_channel);
|
|
|
|
|
#endif
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
|
|
|
|
|
|
|
|
|
|
for (unsigned block = 0; block < DIV_ROUND_UP(operation->weight_width, 2); block++) {
|
|
|
|
|
unsigned stride = operation->weight_height;
|
|
|
|
@@ -970,13 +1063,11 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
|
|
|
|
|
if (x >= operation->weight_width)
|
|
|
|
|
break;
|
|
|
|
|
for (unsigned y = 0; y < stride; y++) {
|
|
|
|
|
//fprintf(stderr, "oc %d x %d y %d z %d: %02x\n", out_channel, x, y, z, weights_map[out_channel][z][x][y]);
|
|
|
|
|
*map++ = weights_map[out_channel][z][x][y];
|
|
|
|
|
wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
|
|
|
|
|
if (x == 0 && y == 0 && z == 0) {
|
|
|
|
|
uint32_t corr = calculate_bias_correction((uint8_t *)weights_map[out_channel], operation);
|
|
|
|
|
//fprintf(stderr, "core %d sb %d ic %d out_channel %d kernel %d bias %x first %02x\n", core, superblock, z, out_channel, kernel, biases[out_channel] - corr, weights_map[out_channel][z][x][y]);
|
|
|
|
|
*((uint32_t *)map) = biases[out_channel] - corr;
|
|
|
|
|
map += sizeof(uint32_t);
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@@ -985,34 +1076,59 @@ write_interleaved_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
|
|
|
|
|
if (x >= operation->weight_width)
|
|
|
|
|
break;
|
|
|
|
|
for (unsigned y = stride; y < operation->weight_width; y++) {
|
|
|
|
|
//fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[out_channel][z][x][y]);
|
|
|
|
|
*map++ = weights_map[out_channel][z][x][y];
|
|
|
|
|
wb_stream_write(&wb_stream, weights_map[out_channel][z][x][y]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (z == operation->input_channels - 1) {
|
|
|
|
|
*((uint32_t*)map) = out_values_per_channel * out_channel;
|
|
|
|
|
map += sizeof(uint32_t);
|
|
|
|
|
if (z == input_channels - 1) {
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (superblock == superblocks - 1)
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
|
|
|
|
|
if (bits_in_buffer > 0)
|
|
|
|
|
append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
return (uint8_t *)map - (uint8_t *)initial_ptr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map, unsigned kernels_per_core, unsigned core, const struct etna_operation *operation)
|
|
|
|
|
static unsigned
|
|
|
|
|
write_core_sequential(struct etna_ml_subgraph *subgraph, uint32_t *map, unsigned core, const struct etna_operation *operation, unsigned zrl_bits)
|
|
|
|
|
{
|
|
|
|
|
struct pipe_context *pctx = subgraph->base.context;
|
|
|
|
|
unsigned nn_core_count = etna_context(pctx)->screen->specs.nn_core_count;
|
|
|
|
|
unsigned cores_used = MIN2(operation->output_channels, nn_core_count);
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
|
|
|
|
|
uint8_t *input = map_resource(operation->weight_tensor);
|
|
|
|
|
uint32_t *biases = map_resource(operation->bias_tensor);
|
|
|
|
|
unsigned out_values_per_channel = operation->output_width * operation->output_height;
|
|
|
|
|
unsigned superblocks = calculate_tiling(etna_context(pctx), operation, NULL, NULL);
|
|
|
|
|
uint32_t *initial_ptr = map;
|
|
|
|
|
bool do_write = initial_ptr != NULL;
|
|
|
|
|
uint64_t buffer = 0;
|
|
|
|
|
unsigned bits_in_buffer = 0;
|
|
|
|
|
struct wb_stream wb_stream = {
|
|
|
|
|
.zero_point = operation->weight_zero_point,
|
|
|
|
|
.zrl_bits = zrl_bits,
|
|
|
|
|
.bits_in_buffer = &bits_in_buffer,
|
|
|
|
|
.buffer = &buffer,
|
|
|
|
|
.map = &map,
|
|
|
|
|
.do_write = do_write,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
ML_DBG("%s: superblocks %d channels %d\n", __func__, superblocks, operation->output_channels);
|
|
|
|
|
ML_DBG("%s core %d zrl_bits %d superblocks %d\n", __func__, core, zrl_bits, superblocks);
|
|
|
|
|
|
|
|
|
|
append_bits(zrl_bits, 8, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
append_bits(kernels_per_core, 16, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
for (unsigned superblock = 0; superblock < superblocks; superblock++) {
|
|
|
|
|
|
|
|
|
@@ -1021,7 +1137,7 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
|
|
|
|
|
kernels_in_superblock = DIV_ROUND_UP(kernels_per_core, superblocks) - kernels_per_core % superblocks;
|
|
|
|
|
|
|
|
|
|
for (unsigned kernel = 0; kernel < kernels_in_superblock; kernel++) {
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(DIV_ROUND_UP(operation->output_channels, cores_used), superblocks) * cores_used;
|
|
|
|
|
unsigned out_channel = core * kernels_in_superblock + kernel + superblock * DIV_ROUND_UP(kernels_per_core, superblocks) * cores_used;
|
|
|
|
|
|
|
|
|
|
uint8_t (*weights_map)[operation->weight_height] = (void*) input + out_channel * operation->weight_width * operation->weight_height;
|
|
|
|
|
|
|
|
|
@@ -1034,13 +1150,12 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
|
|
|
|
|
if (x >= operation->weight_width)
|
|
|
|
|
break;
|
|
|
|
|
for (unsigned y = 0; y < stride; y++) {
|
|
|
|
|
//fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);
|
|
|
|
|
|
|
|
|
|
*map++ = weights_map[x][y];
|
|
|
|
|
wb_stream_write(&wb_stream, weights_map[x][y]);
|
|
|
|
|
if (x == 0 && y == 0) {
|
|
|
|
|
uint32_t corr = calculate_bias_correction((uint8_t *)weights_map, operation);
|
|
|
|
|
*((uint32_t *)map) = biases[out_channel] - corr;
|
|
|
|
|
map += sizeof(uint32_t);
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
append_bits(biases[out_channel] - corr, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
@@ -1050,44 +1165,128 @@ write_sequential_weight_format(struct etna_ml_subgraph *subgraph, uint8_t *map,
|
|
|
|
|
if (x >= operation->weight_width)
|
|
|
|
|
break;
|
|
|
|
|
for (unsigned y = stride; y < operation->weight_width; y++) {
|
|
|
|
|
//fprintf(stderr, "x %d y %d: %02x\n", x, y, weights_map[x][y]);
|
|
|
|
|
*map++ = weights_map[x][y];
|
|
|
|
|
wb_stream_write(&wb_stream, weights_map[x][y]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (operation->addition) {
|
|
|
|
|
*((uint32_t*)map) = operation->addition_offset;
|
|
|
|
|
} else
|
|
|
|
|
*((uint32_t*)map) = out_values_per_channel * out_channel;
|
|
|
|
|
map += sizeof(uint32_t);
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
if (operation->addition)
|
|
|
|
|
append_bits(operation->addition_offset, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
else
|
|
|
|
|
append_bits(out_values_per_channel * out_channel, 32, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
wb_stream_flush_zeroes(&wb_stream);
|
|
|
|
|
|
|
|
|
|
if (bits_in_buffer > 0)
|
|
|
|
|
append_bits(0, 32 - bits_in_buffer, &bits_in_buffer, &buffer, &map, do_write);
|
|
|
|
|
|
|
|
|
|
return (uint8_t *)map - (uint8_t *)initial_ptr - 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static unsigned
|
|
|
|
|
calculate_weight_bo_size(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
|
|
|
|
|
{
|
|
|
|
|
struct pipe_context *context = subgraph->base.context;
|
|
|
|
|
struct etna_context *ctx = etna_context(context);
|
|
|
|
|
unsigned nn_core_count = ctx->screen->specs.nn_core_count;
|
|
|
|
|
unsigned header_size = ALIGN(nn_core_count * 4, 64);
|
|
|
|
|
unsigned input_channels = operation->addition ? 1 : operation->input_channels;
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
|
|
|
|
|
unsigned weights_size;
|
|
|
|
|
unsigned core_size;
|
|
|
|
|
unsigned core_size_aligned;
|
|
|
|
|
unsigned compressed_size_aligned;
|
|
|
|
|
|
|
|
|
|
weights_size = operation->weight_width * operation->weight_height * input_channels;
|
|
|
|
|
core_size = 1 + 2 + (weights_size + 4 + 4) * kernels_per_core;
|
|
|
|
|
core_size_aligned = ALIGN(core_size, 64);
|
|
|
|
|
compressed_size_aligned = header_size + core_size_aligned * cores_used;
|
|
|
|
|
|
|
|
|
|
return compressed_size_aligned;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static unsigned
|
|
|
|
|
calculate_zrl_bits(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation)
|
|
|
|
|
{
|
|
|
|
|
struct pipe_context *context = subgraph->base.context;
|
|
|
|
|
struct etna_context *ctx = etna_context(context);
|
|
|
|
|
unsigned nn_core_count = ctx->screen->specs.nn_core_count;
|
|
|
|
|
unsigned max_zrl_bits = ctx->screen->specs.nn_zrl_bits;
|
|
|
|
|
unsigned header_size = ALIGN(nn_core_count * 4, 64);
|
|
|
|
|
unsigned input_channels = operation->addition ? 1 : operation->input_channels;
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
|
|
|
|
unsigned best_compressed_size;
|
|
|
|
|
unsigned best_zrl_bits;
|
|
|
|
|
|
|
|
|
|
/* On HW that doesn't natively support depthwise and strided convolutions,
|
|
|
|
|
* we have to lower them and pad with lots of zeroes. We can be pretty certain
|
|
|
|
|
* that max bits of compression will help these jobs.
|
|
|
|
|
*/
|
|
|
|
|
if (operation->depthwise ||
|
|
|
|
|
operation->stride > 1) {
|
|
|
|
|
|
|
|
|
|
return max_zrl_bits;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* These are very unlikely to have enough zeroes for compression to be useful. */
|
|
|
|
|
if (operation->addition ||
|
|
|
|
|
operation->pointwise) {
|
|
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* This calculation can be really slow. Start from max_zrl_bits as big
|
|
|
|
|
* buffers will benefit the most from high zero compression.
|
|
|
|
|
*/
|
|
|
|
|
best_compressed_size = UINT_MAX;
|
|
|
|
|
best_zrl_bits = 0;
|
|
|
|
|
for (unsigned zrl_bits = max_zrl_bits; zrl_bits >= 0; zrl_bits--) {
|
|
|
|
|
|
|
|
|
|
unsigned compressed_size = header_size;
|
|
|
|
|
for (unsigned core = 0; core < cores_used; core++) {
|
|
|
|
|
|
|
|
|
|
unsigned actual_size;
|
|
|
|
|
if (operation->pointwise && output_channels > 8)
|
|
|
|
|
actual_size = write_core_6(subgraph, NULL, core, operation, zrl_bits);
|
|
|
|
|
else if (input_channels > 1)
|
|
|
|
|
actual_size = write_core_interleaved(subgraph, NULL, core, operation, zrl_bits);
|
|
|
|
|
else
|
|
|
|
|
actual_size = write_core_sequential(subgraph, NULL, core, operation, zrl_bits);
|
|
|
|
|
|
|
|
|
|
compressed_size += actual_size;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If more bits don't compress further, then stop */
|
|
|
|
|
if (compressed_size <= best_compressed_size) {
|
|
|
|
|
best_compressed_size = compressed_size;
|
|
|
|
|
best_zrl_bits = zrl_bits;
|
|
|
|
|
} else
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return best_zrl_bits;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static struct etna_bo *
|
|
|
|
|
create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_operation *operation, unsigned *size)
|
|
|
|
|
{
|
|
|
|
|
/* TODO: Implement zero-length encoding of weights and biases for bandwidth savings */
|
|
|
|
|
struct pipe_context *context = subgraph->base.context;
|
|
|
|
|
struct etna_context *ctx = etna_context(context);
|
|
|
|
|
unsigned nn_core_count = ctx->screen->specs.nn_core_count;
|
|
|
|
|
unsigned header_size = ALIGN(nn_core_count * 4, 64);
|
|
|
|
|
unsigned weight_item_size = 1; /* TODO: Support types other than (u)int8 */
|
|
|
|
|
unsigned input_channels;
|
|
|
|
|
unsigned input_channels = operation->addition ? 1 : operation->input_channels;
|
|
|
|
|
unsigned output_channels = operation->addition ? 1 : operation->output_channels;
|
|
|
|
|
unsigned cores_used = MIN2(output_channels, nn_core_count);
|
|
|
|
|
unsigned kernels_per_core = DIV_ROUND_UP(output_channels, cores_used);
|
|
|
|
|
uint8_t zero_length_encoding = false;
|
|
|
|
|
unsigned weights_size;
|
|
|
|
|
unsigned core_size;
|
|
|
|
|
unsigned core_size_aligned;
|
|
|
|
|
unsigned zrl_bits;
|
|
|
|
|
|
|
|
|
|
input_channels = operation->addition ? 1 : operation->input_channels;
|
|
|
|
|
weights_size = operation->weight_width * operation->weight_height * input_channels * weight_item_size;
|
|
|
|
|
core_size = 3 + (weights_size + 4 + 4) * kernels_per_core;
|
|
|
|
|
core_size_aligned = ALIGN(core_size, 64);
|
|
|
|
|
*size = header_size + core_size_aligned * cores_used;
|
|
|
|
|
*size = calculate_weight_bo_size(subgraph, operation);
|
|
|
|
|
zrl_bits = calculate_zrl_bits(subgraph, operation);
|
|
|
|
|
|
|
|
|
|
struct etna_bo *compressed = etna_bo_new(ctx->screen->dev,
|
|
|
|
|
*size,
|
|
|
|
@@ -1095,37 +1294,27 @@ create_coefficients_bo(struct etna_ml_subgraph *subgraph, const struct etna_oper
|
|
|
|
|
|
|
|
|
|
etna_bo_cpu_prep(compressed, DRM_ETNA_PREP_WRITE);
|
|
|
|
|
|
|
|
|
|
uint8_t *map = etna_bo_map(compressed);
|
|
|
|
|
uint32_t *header = (uint32_t *)map;
|
|
|
|
|
|
|
|
|
|
uint32_t *map = etna_bo_map(compressed);
|
|
|
|
|
memset(map, 0, *size);
|
|
|
|
|
|
|
|
|
|
for (unsigned core = 0; core < cores_used; core++)
|
|
|
|
|
header[core] = core_size_aligned;
|
|
|
|
|
|
|
|
|
|
map += header_size;
|
|
|
|
|
|
|
|
|
|
#if 0
|
|
|
|
|
uint8_t *input = map_resource(operation->weight_tensor);
|
|
|
|
|
for (int i = 0; i < operation->output_channels * operation->input_channels * operation->weight_width * operation->weight_height; i++)
|
|
|
|
|
fprintf(stderr, "i %d: %02x\n", i, input[i]);
|
|
|
|
|
#endif
|
|
|
|
|
uint32_t *header = map;
|
|
|
|
|
map += header_size / 4;
|
|
|
|
|
|
|
|
|
|
for (unsigned core = 0; core < cores_used; core++) {
|
|
|
|
|
|
|
|
|
|
*map++ = zero_length_encoding;
|
|
|
|
|
|
|
|
|
|
*((uint16_t *)map) = kernels_per_core;
|
|
|
|
|
map += sizeof(uint16_t);
|
|
|
|
|
|
|
|
|
|
if (operation->pointwise && input_channels >= 1 && output_channels > 8)
|
|
|
|
|
write_6_weight_format(subgraph, map, kernels_per_core, core, operation);
|
|
|
|
|
unsigned actual_size;
|
|
|
|
|
if (operation->pointwise && output_channels > 8)
|
|
|
|
|
actual_size = write_core_6(subgraph, map, core, operation, zrl_bits);
|
|
|
|
|
else if (input_channels > 1)
|
|
|
|
|
write_interleaved_weight_format(subgraph, map, kernels_per_core, core, operation);
|
|
|
|
|
actual_size = write_core_interleaved(subgraph, map, core, operation, zrl_bits);
|
|
|
|
|
else
|
|
|
|
|
write_sequential_weight_format(subgraph, map, kernels_per_core, core, operation);
|
|
|
|
|
actual_size = write_core_sequential(subgraph, map, core, operation, zrl_bits);
|
|
|
|
|
|
|
|
|
|
map += core_size_aligned - 3;
|
|
|
|
|
actual_size = ALIGN(actual_size, 64);
|
|
|
|
|
|
|
|
|
|
header[core] = actual_size;
|
|
|
|
|
|
|
|
|
|
map += actual_size / 4;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
etna_bo_cpu_fini(compressed);
|
|
|
|
|