llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8

This patch converts the SSE-optimized build_mask_32() and
build_mask_linear_32() to VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

                      FPS/Score
  Name            Before     After    Delta
------------------------------------------------
glmark2 (score)   139.8      142.7    2.07%

openarena and xonotic didn't show a significant (more than 1%)
difference.

v2: Make sure code is build only on POWER8 LE machine

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
This commit is contained in:
Oded Gabbay
2015-12-29 18:12:34 +02:00
parent 3bbe16ea79
commit 925c46cfc4
+110 -40
View File
@@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
lp_rast_triangle_4(task, arg2);
}
#if !defined(PIPE_ARCH_SSE)
#if defined(PIPE_ARCH_SSE)
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
union lp_rast_cmd_arg arg2;
arg2.triangle.tri = arg.triangle.tri;
arg2.triangle.plane_mask = (1<<3)-1;
lp_rast_triangle_32_3(task, arg2);
}
void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
union lp_rast_cmd_arg arg2;
arg2.triangle.tri = arg.triangle.tri;
arg2.triangle.plane_mask = (1<<4)-1;
lp_rast_triangle_32_4(task, arg2);
}
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
lp_rast_triangle_32_3_16(task, arg);
}
#else
#include <emmintrin.h>
#include "util/u_sse.h"
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
#define NR_PLANES 3
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
0xffff & ~out[i].mask);
}
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
@@ -471,6 +433,114 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
}
#undef NR_PLANES
#else
#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
#include <altivec.h>
#include "util/u_pwr8.h"
static inline void
build_masks_32(int c,
int cdiff,
int dcdx,
int dcdy,
unsigned *outmask,
unsigned *partmask)
{
__m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = (__m128i) vec_splats(dcdy);
/* Get values across the quad
*/
__m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
__m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
__m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
{
__m128i cstep01, cstep23, result;
cstep01 = vec_packs_epi32(cstep0, cstep1);
cstep23 = vec_packs_epi32(cstep2, cstep3);
result = vec_packs_epi16(cstep01, cstep23);
*outmask |= vec_movemask_epi8(result);
}
{
__m128i cio4 = (__m128i) vec_splats(cdiff);
__m128i cstep01, cstep23, result;
cstep0 = vec_add_epi32(cstep0, cio4);
cstep1 = vec_add_epi32(cstep1, cio4);
cstep2 = vec_add_epi32(cstep2, cio4);
cstep3 = vec_add_epi32(cstep3, cio4);
cstep01 = vec_packs_epi32(cstep0, cstep1);
cstep23 = vec_packs_epi32(cstep2, cstep3);
result = vec_packs_epi16(cstep01, cstep23);
*partmask |= vec_movemask_epi8(result);
}
}
static inline unsigned
build_mask_linear_32(int c, int dcdx, int dcdy)
{
__m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
__m128i xdcdy = (__m128i) vec_splats(dcdy);
/* Get values across the quad
*/
__m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
__m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
__m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
/* pack pairs of results into epi16
*/
__m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
__m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
/* pack into epi8, preserving sign bits
*/
__m128i result = vec_packs_epi16(cstep01, cstep23);
/* extract sign bits to create mask
*/
return vec_movemask_epi8(result);
}
#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
union lp_rast_cmd_arg arg2;
arg2.triangle.tri = arg.triangle.tri;
arg2.triangle.plane_mask = (1<<3)-1;
lp_rast_triangle_32_3(task, arg2);
}
void
lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
union lp_rast_cmd_arg arg2;
arg2.triangle.tri = arg.triangle.tri;
arg2.triangle.plane_mask = (1<<4)-1;
lp_rast_triangle_32_4(task, arg2);
}
void
lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
const union lp_rast_cmd_arg arg)
{
lp_rast_triangle_32_3_16(task, arg);
}
#endif
@@ -512,7 +582,7 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
#ifdef PIPE_ARCH_SSE
#if defined(PIPE_ARCH_SSE) || (defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN))
#undef BUILD_MASKS
#undef BUILD_MASK_LINEAR
#define BUILD_MASKS(c, cdiff, dcdx, dcdy, omask, pmask) build_masks_32((int)c, (int)cdiff, dcdx, dcdy, omask, pmask)