diff --git a/src/util/meson.build b/src/util/meson.build index 23b70c92d25..db2292df7da 100644 --- a/src/util/meson.build +++ b/src/util/meson.build @@ -279,8 +279,8 @@ endif u_trace_py = files('perf/u_trace.py') -libmesa_util_sse41 = static_library( - 'mesa_util_sse41', +libmesa_util_simd = static_library( + 'mesa_util_simd', files('streaming-load-memcpy.c'), c_args : [c_msvc_compat_args, sse41_args], include_directories : [inc_util], @@ -293,7 +293,7 @@ _libmesa_util = static_library( [files_mesa_util, files_debug_stack, format_srgb], include_directories : [inc_util, include_directories('format')], dependencies : deps_for_libmesa_util, - link_with: [libmesa_util_sse41], + link_with: [libmesa_util_simd], c_args : [c_msvc_compat_args], gnu_symbol_visibility : 'hidden', build_by_default : false diff --git a/src/util/streaming-load-memcpy.c b/src/util/streaming-load-memcpy.c index e770bd17080..3757b1dc6ff 100644 --- a/src/util/streaming-load-memcpy.c +++ b/src/util/streaming-load-memcpy.c @@ -33,8 +33,8 @@ #include #endif -/* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming - * read performance from uncached memory. +/* Copies memory from src to dst, using non-temporal load instructions to get + * streaming read performance from uncached memory. */ void util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len) @@ -42,9 +42,14 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len) char *restrict d = dst; char *restrict s = src; -#ifdef USE_SSE41 - /* If dst and src are not co-aligned, or if SSE4.1 is not present, fallback to memcpy(). */ - if (((uintptr_t)d & 15) != ((uintptr_t)s & 15) || !util_get_cpu_caps()->has_sse4_1) { +#if defined(USE_SSE41) || defined(USE_AARCH64_ASM) + /* If dst and src are not co-aligned, or if non-temporal load instructions + * are not present, fallback to memcpy(). */ + if (((uintptr_t)d & 15) != ((uintptr_t)s & 15) +#if defined(USE_SSE41) + || !util_get_cpu_caps()->has_sse4_1 +#endif + ) { memcpy(d, s, len); return; } @@ -63,6 +68,7 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len) len -= MIN2(bytes_before_alignment_boundary, len); } +#if defined(USE_SSE41) if (len >= 64) _mm_mfence(); @@ -84,6 +90,37 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len) s += 64; len -= 64; } + +#elif defined(USE_AARCH64_ASM) + if (len >= 64) { + __asm__ volatile( + /* Memory barrier for loads completion in the non-shareable domain: + * https://developer.arm.com/documentation/102336/0100/Limiting-the-scope-of-memory-barriers */ + " dmb nshld\n" + + /* Allow branching on negative flag using subs. */ + " sub %[len], %[len], #64\n" + + /* Based on ARM optimized routines, using non-temporal loads: + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy-sve.S */ + "loop64:\n" + " ldnp q0, q1, [%[s]]\n" + " stp q0, q1, [%[d]]\n" + " ldnp q0, q1, [%[s], #32]\n" + " stp q0, q1, [%[d], #32]\n" + " add %[s], %[s], #64\n" + " add %[d], %[d], #64\n" + " subs %[len], %[len], #64\n" + " b.pl loop64\n" + + /* Restore . */ + " add %[len], %[len], #64\n" + + : [d]"+r"(d), [s]"+r"(s), [len]"+r"(len) : + : "v0", "v1", "cc", "memory"); + } +#endif + #endif /* memcpy() the tail. */ if (len) { diff --git a/src/util/streaming-load-memcpy.h b/src/util/streaming-load-memcpy.h index ee277b0860b..8136ec06e76 100644 --- a/src/util/streaming-load-memcpy.h +++ b/src/util/streaming-load-memcpy.h @@ -26,8 +26,8 @@ * */ -/* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming - * read performance from uncached memory. +/* Copies memory from src to dst, using using non-temporal load instructions + * to get streaming read performance from uncached memory. */ #ifndef STREAMING_LOAD_MEMCPY_H