util: Add AArch64 support to util_streaming_load_memcpy()

AArch64 supports non-temporal (streaming) loads and writes. util_streaming_load_memcpy() is extended to support AArch64 non-temporal loads using inline assembly. The mesa_util_sse41 name is updated to mesa_util_simd to reflect support for non-x86 architectures. This makes copies from non-cacheable to cacheable memory about 20% faster on a Rock 5B. Signed-off-by: Loïc Molinari <loic.molinari@collabora.com> Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/34606>
2025-04-17 14:36:33 +02:00
parent d3544aebd7
commit 293cdbce75
3 changed files with 47 additions and 10 deletions
@@ -279,8 +279,8 @@ endif

 u_trace_py = files('perf/u_trace.py')

-libmesa_util_sse41 = static_library(
-  'mesa_util_sse41',
+libmesa_util_simd = static_library(
+  'mesa_util_simd',
  files('streaming-load-memcpy.c'),
  c_args : [c_msvc_compat_args, sse41_args],
  include_directories : [inc_util],
@@ -293,7 +293,7 @@ _libmesa_util = static_library(
  [files_mesa_util, files_debug_stack, format_srgb],
  include_directories : [inc_util, include_directories('format')],
  dependencies : deps_for_libmesa_util,
-  link_with: [libmesa_util_sse41],
+  link_with: [libmesa_util_simd],
  c_args : [c_msvc_compat_args],
  gnu_symbol_visibility : 'hidden',
  build_by_default : false
@@ -33,8 +33,8 @@
 #include <smmintrin.h>
 #endif

-/* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming
- * read performance from uncached memory.
+/* Copies memory from src to dst, using non-temporal load instructions to get
+ * streaming read performance from uncached memory.
 */
 void
 util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
@@ -42,9 +42,14 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
   char *restrict d = dst;
   char *restrict s = src;

-#ifdef USE_SSE41
-   /* If dst and src are not co-aligned, or if SSE4.1 is not present, fallback to memcpy(). */
-   if (((uintptr_t)d & 15) != ((uintptr_t)s & 15) || !util_get_cpu_caps()->has_sse4_1) {
+#if defined(USE_SSE41) || defined(USE_AARCH64_ASM)
+   /* If dst and src are not co-aligned, or if non-temporal load instructions
+    * are not present, fallback to memcpy(). */
+   if (((uintptr_t)d & 15) != ((uintptr_t)s & 15)
+#if defined(USE_SSE41)
+       || !util_get_cpu_caps()->has_sse4_1
+#endif
+       ) {
      memcpy(d, s, len);
      return;
   }
@@ -63,6 +68,7 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
      len -= MIN2(bytes_before_alignment_boundary, len);
   }

+#if defined(USE_SSE41)
   if (len >= 64)
      _mm_mfence();

@@ -84,6 +90,37 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
      s += 64;
      len -= 64;
   }
+
+#elif defined(USE_AARCH64_ASM)
+   if (len >= 64) {
+      __asm__ volatile(
+         /* Memory barrier for loads completion in the non-shareable domain:
+          * https://developer.arm.com/documentation/102336/0100/Limiting-the-scope-of-memory-barriers */
+         "  dmb nshld\n"
+
+         /* Allow branching on negative flag using subs. */
+         "  sub %[len], %[len], #64\n"
+
+         /* Based on ARM optimized routines, using non-temporal loads:
+          * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy-sve.S */
+         "loop64:\n"
+         "  ldnp q0, q1, [%[s]]\n"
+         "  stp q0, q1, [%[d]]\n"
+         "  ldnp q0, q1, [%[s], #32]\n"
+         "  stp q0, q1, [%[d], #32]\n"
+         "  add %[s], %[s], #64\n"
+         "  add %[d], %[d], #64\n"
+         "  subs %[len], %[len], #64\n"
+         "  b.pl loop64\n"
+
+         /* Restore <len>. */
+         "  add %[len], %[len], #64\n"
+
+         : [d]"+r"(d), [s]"+r"(s), [len]"+r"(len) :
+         : "v0", "v1", "cc", "memory");
+   }
+#endif
+
 #endif
   /* memcpy() the tail. */
   if (len) {
@@ -26,8 +26,8 @@
 *
 */

-/* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming
- * read performance from uncached memory.
+/* Copies memory from src to dst, using using non-temporal load instructions
+ * to get streaming read performance from uncached memory.
 */

 #ifndef STREAMING_LOAD_MEMCPY_H