diff --git a/src/intel/isl/isl_tiled_memcpy.c b/src/intel/isl/isl_tiled_memcpy.c
index 33022da3f3d..da155574252 100644
--- a/src/intel/isl/isl_tiled_memcpy.c
+++ b/src/intel/isl/isl_tiled_memcpy.c
@@ -57,6 +57,9 @@ static const uint32_t xtile_span = 64;
 static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
+static const uint32_t wtile_width = 64;
+static const uint32_t wtile_height = 64;
+static const uint32_t wtile_span = 2;
 
 static inline uint32_t
 ror(uint32_t n, uint32_t d)
@@ -100,6 +103,198 @@ rgba8_copy(void *dst, const void *src, size_t bytes)
    return dst;
 }
 
+#define wtile_block_id(x, y) \
+   (((((x) >> 3) & 0x7) << 3) | \
+    (((y) >> 3) & 0x7))
+
+#define wtile_block_offset(x, y) \
+   ((((y) & 4) << 3) + \
+    (((y) & 2) << 2) + \
+    (((y) & 1) << 1) + \
+    (((x) & 4) << 2) + \
+    (((x) & 2) << 1) + \
+    (((x) & 1) << 0))
+
+/**
+ * Copy the [x0, x1) x [y0, y1) byte rectangle from linear into a W tile block.
+ *
+ * @dst is a pointer to the W tile (the 64B block is selected from x0/y0),
+ * @src is the linear data, indexed as src[y * src_pitch + x].
+ */
+static inline void
+wtile_block_copy_from_linear(void *dst, const void *src,
+                             unsigned x0, unsigned x1,
+                             unsigned y0, unsigned y1,
+                             unsigned src_pitch)
+{
+   uint8_t *dst_data = dst + wtile_block_id(x0, y0) * 64;
+   const uint8_t *src_data = src;
+
+   for (unsigned y = y0; y < y1; y++)
+      for (unsigned x = x0; x < x1; x++)
+         dst_data[wtile_block_offset(x, y)] = src_data[y * src_pitch + x];
+}
+
+/**
+ * Copy a full 8x8 byte block from linear into a W tile block.
+ *
+ * @dst is a pointer to the W tile (the 64B block is selected from x0/y0),
+ * @src is the linear data; the block is written as 32 two-byte elements.
+ */
+static inline void
+wtile_block_full_copy_from_linear(void *dst, const void *src,
+                                  unsigned x0, unsigned y0,
+                                  unsigned src_pitch)
+{
+   uint16_t *dst_data = dst + wtile_block_id(x0, y0) * 64;
+   const uint8_t *src_data = src;
+
+   /*
+    * The layout of a block is a series of 2 consecutive byte elements.
+    * _________________________________
+    * |B00|B01|B04|B05|B16|B17|B20|B21|
+    * |B02|B03|B06|B07|B18|B19|B22|B23|
+    * |B08|B09|B12|B13|B24|B25|B28|B29|
+    * |B10|B11|B14|B15|B26|B27|B30|B31|
+    * |B32|B33|B36|B37|B48|B49|B52|B53|
+    * |B34|B35|B38|B39|B50|B51|B54|B55|
+    * |B40|B41|B44|B45|B56|B57|B60|B61|
+    * |B42|B43|B46|B47|B58|B59|B62|B63|
+    * ---------------------------------
+    */
+
+#define src_lin(bx, by) \
+   (*((const uint16_t *)(src_data + (y0 + (by)) * src_pitch + x0 + (bx) * 2)))
+
+   dst_data[0] = src_lin(0, 0);
+   dst_data[1] = src_lin(0, 1);
+   dst_data[2] = src_lin(1, 0);
+   dst_data[3] = src_lin(1, 1);
+   dst_data[4] = src_lin(0, 2);
+   dst_data[5] = src_lin(0, 3);
+   dst_data[6] = src_lin(1, 2);
+   dst_data[7] = src_lin(1, 3);
+
+   dst_data[8] = src_lin(2, 0);
+   dst_data[9] = src_lin(2, 1);
+   dst_data[10] = src_lin(3, 0);
+   dst_data[11] = src_lin(3, 1);
+   dst_data[12] = src_lin(2, 2);
+   dst_data[13] = src_lin(2, 3);
+   dst_data[14] = src_lin(3, 2);
+   dst_data[15] = src_lin(3, 3);
+
+   dst_data[16] = src_lin(0, 4);
+   dst_data[17] = src_lin(0, 5);
+   dst_data[18] = src_lin(1, 4);
+   dst_data[19] = src_lin(1, 5);
+   dst_data[20] = src_lin(0, 6);
+   dst_data[21] = src_lin(0, 7);
+   dst_data[22] = src_lin(1, 6);
+   dst_data[23] = src_lin(1, 7);
+
+   dst_data[24] = src_lin(2, 4);
+   dst_data[25] = src_lin(2, 5);
+   dst_data[26] = src_lin(3, 4);
+   dst_data[27] = src_lin(3, 5);
+   dst_data[28] = src_lin(2, 6);
+   dst_data[29] = src_lin(2, 7);
+   dst_data[30] = src_lin(3, 6);
+   dst_data[31] = src_lin(3, 7);
+
+#undef src_lin
+}
+
+/**
+ * Copy the [x0, x1) x [y0, y1) byte rectangle from a W tile block into linear.
+ *
+ * @dst is the linear data, indexed as dst[y * dst_pitch + x], @src is a
+ * pointer to the W tile (the 64B block is selected from x0/y0).
+ */
+static inline void
+wtile_block_copy_to_linear(void *dst, const void *src,
+                           unsigned x0, unsigned x1,
+                           unsigned y0, unsigned y1,
+                           unsigned dst_pitch)
+{
+   uint8_t *dst_data = dst;
+   const uint8_t *src_data = src + wtile_block_id(x0, y0) * 64;
+
+   for (unsigned y = y0; y < y1; y++)
+      for (unsigned x = x0; x < x1; x++)
+         dst_data[y * dst_pitch + x] = src_data[wtile_block_offset(x, y)];
+}
+
+/**
+ * Copy a full 8x8 byte W tile block into linear.
+ *
+ * @dst is the linear data, @src is a pointer to the W tile (the 64B block is
+ * selected from x0/y0); the block is read as 32 two-byte elements.
+ */
+static inline void
+wtile_block_full_copy_to_linear(void *dst, const void *src,
+                                unsigned x0, unsigned y0,
+                                unsigned dst_pitch)
+{
+   uint8_t *dst_data = dst;
+   const uint16_t *src_data = src + wtile_block_id(x0, y0) * 64;
+
+   /*
+    * The layout of a block is a series of 2 consecutive byte elements.
+    * _________________________________
+    * |B00|B01|B04|B05|B16|B17|B20|B21|
+    * |B02|B03|B06|B07|B18|B19|B22|B23|
+    * |B08|B09|B12|B13|B24|B25|B28|B29|
+    * |B10|B11|B14|B15|B26|B27|B30|B31|
+    * |B32|B33|B36|B37|B48|B49|B52|B53|
+    * |B34|B35|B38|B39|B50|B51|B54|B55|
+    * |B40|B41|B44|B45|B56|B57|B60|B61|
+    * |B42|B43|B46|B47|B58|B59|B62|B63|
+    * ---------------------------------
+    */
+
+#define dst_lin(bx, by) \
+   (*((uint16_t *)(dst_data + (y0 + (by)) * dst_pitch + x0 + (bx) * 2)))
+
+   dst_lin(0, 0) = src_data[0];
+   dst_lin(0, 1) = src_data[1];
+   dst_lin(1, 0) = src_data[2];
+   dst_lin(1, 1) = src_data[3];
+   dst_lin(0, 2) = src_data[4];
+   dst_lin(0, 3) = src_data[5];
+   dst_lin(1, 2) = src_data[6];
+   dst_lin(1, 3) = src_data[7];
+
+   dst_lin(2, 0) = src_data[8];
+   dst_lin(2, 1) = src_data[9];
+   dst_lin(3, 0) = src_data[10];
+   dst_lin(3, 1) = src_data[11];
+   dst_lin(2, 2) = src_data[12];
+   dst_lin(2, 3) = src_data[13];
+   dst_lin(3, 2) = src_data[14];
+   dst_lin(3, 3) = src_data[15];
+
+   dst_lin(0, 4) = src_data[16];
+   dst_lin(0, 5) = src_data[17];
+   dst_lin(1, 4) = src_data[18];
+   dst_lin(1, 5) = src_data[19];
+   dst_lin(0, 6) = src_data[20];
+   dst_lin(0, 7) = src_data[21];
+   dst_lin(1, 6) = src_data[22];
+   dst_lin(1, 7) = src_data[23];
+
+   dst_lin(2, 4) = src_data[24];
+   dst_lin(2, 5) = src_data[25];
+   dst_lin(3, 4) = src_data[26];
+   dst_lin(3, 5) = src_data[27];
+   dst_lin(2, 6) = src_data[28];
+   dst_lin(2, 7) = src_data[29];
+   dst_lin(3, 6) = src_data[30];
+   dst_lin(3, 7) = src_data[31];
+
+#undef dst_lin
+}
+
 #ifdef __SSSE3__
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
@@ -602,6 +797,79 @@ linear_to_tile4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 }
 
+/**
+ * Copy texture data from linear to W tile layout.
+ *
+ * \copydoc tile_copy_fn
+ */
+static inline void
+linear_to_wtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+                 uint32_t y0, uint32_t y3,
+                 char *dst, const char *src, int32_t src_pitch)
+{
+   /*
+    * The layout is a series of blocks of 64B each.
+    * _________________________________________________
+    * |blk00|blk08|blk16|blk24|blk32|blk40|blk48|blk56|
+    * |blk01|blk09|blk17|blk25|blk33|blk41|blk49|blk57|
+    * |blk02|blk10|blk18|blk26|blk34|blk42|blk50|blk58|
+    * |blk03|blk11|blk19|blk27|blk35|blk43|blk51|blk59|
+    * |blk04|blk12|blk20|blk28|blk36|blk44|blk52|blk60|
+    * |blk05|blk13|blk21|blk29|blk37|blk45|blk53|blk61|
+    * |blk06|blk14|blk22|blk30|blk38|blk46|blk54|blk62|
+    * |blk07|blk15|blk23|blk31|blk39|blk47|blk55|blk63|
+    * -------------------------------------------------
+    */
+
+   /* Find intermediate Y offsets that are aligned to a 64B element (8 rows).
+    */
+   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 8));
+   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 8));
+
+   uint32_t xo, yo;
+
+   /* If the y0 coordinate is not aligned to a block, do partial copies into
+    * the first row of blocks touched (0, 8, 16, 24, 32, 40, 48 & 56 if y0 < 8).
+    */
+   if (y0 != y1) {
+      if (x0 != x1)
+         wtile_block_copy_from_linear(dst, src, x0, x1, y0, y1, src_pitch);
+      for (xo = x1; xo < x2; xo += 8)
+         wtile_block_copy_from_linear(dst, src, xo, xo + 8, y0, y1, src_pitch);
+      if (x2 != x3)
+         wtile_block_copy_from_linear(dst, src, x2, x3, y0, y1, src_pitch);
+   }
+
+   for (yo = y1; yo < y2; yo += 8) {
+      /* Do a partial copy into the left-edge block if x0 is not aligned. */
+      if (x0 != x1) {
+         wtile_block_copy_from_linear(dst, src,
+                                      x0, x1, yo, yo + 8, src_pitch);
+      }
+      /* Full block copies on the inside. */
+      for (xo = x1; xo < x2; xo += 8)
+         wtile_block_full_copy_from_linear(dst, src, xo, yo, src_pitch);
+      /* Do a partial copy into the right-edge block if x3 is not aligned.
+       */
+      if (x2 != x3) {
+         wtile_block_copy_from_linear(dst, src,
+                                      x2, x3, yo, yo + 8, src_pitch);
+      }
+   }
+
+   /* If the y3 coordinate is not aligned to a block, do partial copies into
+    * the last row of blocks touched (7, 15, 23, 31, 39, 47, 55 & 63 if y2 == 56).
+    */
+   if (y2 != y3) {
+      if (x0 != x1)
+         wtile_block_copy_from_linear(dst, src, x0, x1, y2, y3, src_pitch);
+      for (xo = x1; xo < x2; xo += 8)
+         wtile_block_copy_from_linear(dst, src, xo, xo + 8, y2, y3, src_pitch);
+      if (x2 != x3)
+         wtile_block_copy_from_linear(dst, src, x2, x3, y2, y3, src_pitch);
+   }
+}
+
 /**
  * Copy texture data from X tile layout to linear.
  *
@@ -961,6 +1229,78 @@ tile4_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 }
 
+/**
+ * Copy texture data from W tile layout to linear.
+ *
+ * \copydoc tile_copy_fn
+ */
+static inline void
+wtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+                 uint32_t y0, uint32_t y3,
+                 char *dst, const char *src,
+                 int32_t dst_pitch)
+{
+   /*
+    * The layout is a series of blocks of 64B each.
+    * _________________________________________________
+    * |blk00|blk08|blk16|blk24|blk32|blk40|blk48|blk56|
+    * |blk01|blk09|blk17|blk25|blk33|blk41|blk49|blk57|
+    * |blk02|blk10|blk18|blk26|blk34|blk42|blk50|blk58|
+    * |blk03|blk11|blk19|blk27|blk35|blk43|blk51|blk59|
+    * |blk04|blk12|blk20|blk28|blk36|blk44|blk52|blk60|
+    * |blk05|blk13|blk21|blk29|blk37|blk45|blk53|blk61|
+    * |blk06|blk14|blk22|blk30|blk38|blk46|blk54|blk62|
+    * |blk07|blk15|blk23|blk31|blk39|blk47|blk55|blk63|
+    * -------------------------------------------------
+    */
+
+   /* Find intermediate Y offsets that are aligned to a 64B element (8 rows).
+    */
+   uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 8));
+   uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 8));
+
+   uint32_t xo, yo;
+
+   /* If the y0 coordinate is not aligned to a block, do partial copies from
+    * the first row of blocks touched (0, 8, 16, 24, 32, 40, 48 & 56 if y0 < 8).
+    */
+   if (y0 != y1) {
+      if (x0 != x1)
+         wtile_block_copy_to_linear(dst, src, x0, x1, y0, y1, dst_pitch);
+      for (xo = x1; xo < x2; xo += 8)
+         wtile_block_copy_to_linear(dst, src, xo, xo + 8, y0, y1, dst_pitch);
+      if (x2 != x3)
+         wtile_block_copy_to_linear(dst, src, x2, x3, y0, y1, dst_pitch);
+   }
+
+   for (yo = y1; yo < y2; yo += 8) {
+      /* Do a partial copy from the left-edge block if x0 is not aligned. */
+      if (x0 != x1)
+         wtile_block_copy_to_linear(dst, src, x0, x1, yo, yo + 8, dst_pitch);
+      /* Full block copies on the inside. */
+      for (xo = x1; xo < x2; xo += 8)
+         wtile_block_full_copy_to_linear(dst, src, xo, yo, dst_pitch);
+      /* Do a partial copy from the right-edge block if x3 is not aligned.
+       */
+      if (x2 != x3)
+         wtile_block_copy_to_linear(dst, src, x2, x3, yo, yo + 8, dst_pitch);
+   }
+
+   /* If the y3 coordinate is not aligned to a block, do partial copies from
+    * the last row of blocks touched (7, 15, 23, 31, 39, 47, 55 & 63 if y2 == 56).
+    */
+   if (y2 != y3) {
+      if (x0 != x1)
+         wtile_block_copy_to_linear(dst, src, x0, x1, y2, y3, dst_pitch);
+      for (xo = x1; xo < x2; xo += 8) {
+         wtile_block_copy_to_linear(dst, src,
+                                    xo, MIN2(xo + 8, x3), y2, y3, dst_pitch);
+      }
+      if (x2 != x3)
+         wtile_block_copy_to_linear(dst, src, x2, x3, y2, y3, dst_pitch);
+   }
+}
+
 #if defined(INLINE_SSE41)
 static ALWAYS_INLINE void *
 _memcpy_streaming_load(void *dest, const void *src, size_t count)
@@ -1135,6 +1475,35 @@ linear_to_tile4_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 }
 
+/**
+ * Copy texture data from linear to tile W layout, faster.
+ *
+ * Same as \ref linear_to_wtiled but faster, because it passes constant
+ * parameters for common cases, allowing the compiler to inline code
+ * optimized for those cases.
+ *
+ * \copydoc tile_copy_fn
+ */
+static FLATTEN void
+linear_to_wtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+                        uint32_t y0, uint32_t y1,
+                        char *dst, const char *src,
+                        int32_t src_pitch,
+                        uint32_t swizzle_bit,
+                        isl_memcpy_type copy_type)
+{
+   assert(swizzle_bit == 0);
+   if (x0 == 0 && x3 == wtile_width && y0 == 0 && y1 == wtile_height) {
+      return linear_to_wtiled(0, 0,
+                              wtile_width, wtile_width,
+                              0, wtile_height,
+                              dst, src, src_pitch);
+   } else {
+      return linear_to_wtiled(x0, x1, x2, x3, y0, y1,
+                              dst, src, src_pitch);
+   }
+}
+
 /**
  * Copy texture data from X tile layout to linear, faster.
  *
@@ -1298,6 +1667,36 @@ tile4_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
    }
 }
 
+/**
+ * Copy texture data from tile W layout to linear, faster.
+ *
+ * Same as \ref wtiled_to_linear but faster, because it passes constant
+ * parameters for common cases, allowing the compiler to inline code
+ * optimized for those cases.
+ *
+ * \copydoc tile_copy_fn
+ */
+static FLATTEN void
+wtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
+                        uint32_t y0, uint32_t y1,
+                        char *dst, const char *src,
+                        int32_t dst_pitch,
+                        uint32_t swizzle_bit,
+                        isl_memcpy_type copy_type)
+{
+   assert(swizzle_bit == 0);
+
+   if (x0 == 0 && x3 == wtile_width && y0 == 0 && y1 == wtile_height) {
+      return wtiled_to_linear(0, 0,
+                              wtile_width, wtile_width,
+                              0, wtile_height,
+                              dst, src, dst_pitch);
+   } else {
+      return wtiled_to_linear(x0, x1, x2, x3, y0, y1,
+                              dst, src, dst_pitch);
+   }
+}
+
 /**
  * Copy from linear to tiled texture.
  *
@@ -1340,6 +1739,19 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2,
       th = ytile_height;
       xt_sub_range_alignment = ytile_span;
       tile_copy = linear_to_tile4_faster;
+   } else if (tiling == ISL_TILING_W) {
+      tw = wtile_width;
+      th = wtile_height;
+      /* The copy function prioritizes W-Tile blocks. The width of a W-Tile
+       * block is four W-Tile spans.
+       */
+      xt_sub_range_alignment = wtile_span * 4;
+      tile_copy = linear_to_wtiled_faster;
+      /* TileW is a special case with doubled physical tile width due to HW
+       * programming requirements (see isl_tiling_get_info() in
+       * src/intel/isl/isl.c)
+       */
+      dst_pitch /= 2;
    } else {
       unreachable("unsupported tiling");
    }
@@ -1437,6 +1849,19 @@ tiled_to_linear(uint32_t xt1, uint32_t xt2,
       th = ytile_height;
       xt_sub_range_alignment = ytile_span;
       tile_copy = tile4_to_linear_faster;
+   } else if (tiling == ISL_TILING_W) {
+      tw = wtile_width;
+      th = wtile_height;
+      /* The copy function prioritizes W-Tile blocks. The width of a W-Tile
+       * block is four W-Tile spans.
+       */
+      xt_sub_range_alignment = wtile_span * 4;
+      tile_copy = wtiled_to_linear_faster;
+      /* TileW is a special case with doubled physical tile width due to HW
+       * programming requirements (see isl_tiling_get_info() in
+       * src/intel/isl/isl.c)
+       */
+      src_pitch /= 2;
    } else {
       unreachable("unsupported tiling");
    }
diff --git a/src/intel/isl/tests/isl_tilememcpy_tiled_unittest.cpp b/src/intel/isl/tests/isl_tilememcpy_tiled_unittest.cpp
index b923776635c..24286710e8f 100644
--- a/src/intel/isl/tests/isl_tilememcpy_tiled_unittest.cpp
+++ b/src/intel/isl/tests/isl_tilememcpy_tiled_unittest.cpp
@@ -60,6 +60,11 @@ typedef uint8_t *(*swizzle_func_t)(const uint8_t *base_addr, uint32_t pitch, uin
    std::make_tuple( 0, 16, 0, 32), \
    std::make_tuple( 0, 16, 0, 64)
 
+#define FULL_TILEW_COORDINATES \
+   std::make_tuple( 0, 64, 0, 64), \
+   std::make_tuple( 0, 128, 0, 64), \
+   std::make_tuple( 0, 128, 0,128)
+
 struct tile_swizzle_ops {
    enum isl_tiling tiling;
    swizzle_func_t linear_to_tile_swizzle;
@@ -138,10 +143,42 @@ uint8_t *linear_to_tileX_swizzle(const uint8_t * base_addr, uint32_t pitch, uint
    return (uint8_t *) (base_addr + tiled_off);
 }
 
+uint8_t *linear_to_tileW_swizzle(const uint8_t *base_addr, uint32_t pitch, uint32_t x_B, uint32_t y_px)
+{
+   /* TileW is a special case with doubled physical tile width due to HW
+    * programming requirements (see isl_tiling_get_info() in
+    * src/intel/isl/isl.c)
+    */
+   pitch /= 2;
+
+   const uint32_t cu = 6, cv = 6;
+   const uint32_t tile_id = (y_px >> cv) * (pitch >> cu) + (x_B >> cu);
+
+   /* The table below represents the mapping from coordinate (x_B, y_px) to
+    * byte offset in a 64x64px 1Bpp image:
+    *
+    * Bit ind : 11 10  9  8  7  6  5  4  3  2  1  0
+    * Tile-W  : u5 u4 u3 v5 v4 v3 v2 u2 v1 u1 v0 u0
+    */
+   uint32_t tiled_off;
+
+   tiled_off = tile_id * 4096 |
+               swizzle_bitops(x_B, 1, 0, 0) |
+               swizzle_bitops(y_px, 1, 0, 1) |
+               swizzle_bitops(x_B, 1, 1, 2) |
+               swizzle_bitops(y_px, 1, 1, 3) |
+               swizzle_bitops(x_B, 1, 2, 4) |
+               swizzle_bitops(y_px, 4, 2, 5) |
+               swizzle_bitops(x_B, 3, 3, 9);
+
+   return (uint8_t *) (base_addr + tiled_off);
+}
+
 struct tile_swizzle_ops swizzle_opers[] = {
    {ISL_TILING_Y0, linear_to_tileY_swizzle},
    {ISL_TILING_4, linear_to_tile4_swizzle},
    {ISL_TILING_X, linear_to_tileX_swizzle},
+   {ISL_TILING_W, linear_to_tileW_swizzle},
 };
 
 class tileTFixture: public ::testing::Test {
@@ -194,6 +231,11 @@ class tileXFixture : public tileTFixture,
                                                                      int, int>>
 {};
 
+class tileWFixture : public tileTFixture,
+                     public ::testing::WithParamInterface<std::tuple<int, int,
+                                                                     int, int>>
+{};
+
 void tileTFixture::test_setup(TILE_CONV convert,
                               enum isl_tiling tiling_fmt,
                               enum isl_format format,
@@ -401,6 +443,24 @@ TEST_P(tileXFixture, tiletolin)
    run_test(x1, x2, y1, y2);
 }
 
+TEST_P(tileWFixture, lintotile)
+{
+   auto [x1, x2, y1, y2] = GetParam();
+   test_setup(LIN_TO_TILE, ISL_TILING_W, TILEW_IMAGE_FORMAT, x2, y2);
+   if (print_results)
+      printf("Coordinates: x1=%d x2=%d y1=%d y2=%d \n", x1, x2, y1, y2);
+   run_test(x1, x2, y1, y2);
+}
+
+TEST_P(tileWFixture, tiletolin)
+{
+   auto [x1, x2, y1, y2] = GetParam();
+   test_setup(TILE_TO_LIN, ISL_TILING_W, TILEW_IMAGE_FORMAT, x2, y2);
+   if (print_results)
+      printf("Coordinates: x1=%d x2=%d y1=%d y2=%d \n", x1, x2, y1, y2);
+   run_test(x1, x2, y1, y2);
+}
+
 INSTANTIATE_TEST_SUITE_P(tileY, tileYFixture, testing::Values(TILE_COORDINATES,
                                                               FULL_TILEY_COORDINATES));
 
@@ -408,3 +468,5 @@ INSTANTIATE_TEST_SUITE_P(tile4, tile4Fixture, testing::Values(TILE_COORDINATES,
                                                               FULL_TILEY_COORDINATES));
 INSTANTIATE_TEST_SUITE_P(tileX, tileXFixture, testing::Values(TILE_COORDINATES,
                                                               FULL_TILEX_COORDINATES));
+INSTANTIATE_TEST_SUITE_P(tileW, tileWFixture, testing::Values(TILE_COORDINATES,
+                                                              FULL_TILEW_COORDINATES));