diff --git a/src/util/meson.build b/src/util/meson.build
index 23b70c92d25..db2292df7da 100644
--- a/src/util/meson.build
+++ b/src/util/meson.build
@@ -279,8 +279,8 @@ endif
 
 u_trace_py = files('perf/u_trace.py')
 
-libmesa_util_sse41 = static_library(
-  'mesa_util_sse41',
+libmesa_util_simd = static_library(
+  'mesa_util_simd',
   files('streaming-load-memcpy.c'),
   c_args : [c_msvc_compat_args, sse41_args],
   include_directories : [inc_util],
@@ -293,7 +293,7 @@ _libmesa_util = static_library(
   [files_mesa_util, files_debug_stack, format_srgb],
   include_directories : [inc_util, include_directories('format')],
   dependencies : deps_for_libmesa_util,
-  link_with: [libmesa_util_sse41],
+  link_with: [libmesa_util_simd],
   c_args : [c_msvc_compat_args],
   gnu_symbol_visibility : 'hidden',
   build_by_default : false
diff --git a/src/util/streaming-load-memcpy.c b/src/util/streaming-load-memcpy.c
index e770bd17080..3757b1dc6ff 100644
--- a/src/util/streaming-load-memcpy.c
+++ b/src/util/streaming-load-memcpy.c
@@ -33,8 +33,8 @@
 #include <smmintrin.h>
 #endif
 
-/* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming
- * read performance from uncached memory.
+/* Copies memory from src to dst, using non-temporal load instructions to get
+ * streaming read performance from uncached memory.
  */
 void
 util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
@@ -42,9 +42,14 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
    char *restrict d = dst;
    char *restrict s = src;
 
-#ifdef USE_SSE41
-   /* If dst and src are not co-aligned, or if SSE4.1 is not present, fallback to memcpy(). */
-   if (((uintptr_t)d & 15) != ((uintptr_t)s & 15) || !util_get_cpu_caps()->has_sse4_1) {
+#if defined(USE_SSE41) || defined(USE_AARCH64_ASM)
+   /* If dst and src are not co-aligned, or if non-temporal load instructions
+    * are not present, fall back to memcpy(). */
+   if (((uintptr_t)d & 15) != ((uintptr_t)s & 15)
+#if defined(USE_SSE41)
+       || !util_get_cpu_caps()->has_sse4_1
+#endif
+       ) {
       memcpy(d, s, len);
       return;
    }
@@ -63,6 +68,7 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
       len -= MIN2(bytes_before_alignment_boundary, len);
    }
 
+#if defined(USE_SSE41)
    if (len >= 64)
       _mm_mfence();
 
@@ -84,6 +90,37 @@ util_streaming_load_memcpy(void *restrict dst, void *restrict src, size_t len)
       s += 64;
       len -= 64;
    }
+
+#elif defined(USE_AARCH64_ASM)
+   if (len >= 64) {
+      __asm__ volatile(
+         /* Memory barrier for loads completion in the non-shareable domain:
+          * https://developer.arm.com/documentation/102336/0100/Limiting-the-scope-of-memory-barriers */
+         "  dmb nshld\n"
+
+         /* Pre-bias len by -64 so the loop's subs can exit on the negative flag. */
+         "  sub %[len], %[len], #64\n"
+
+         /* Based on ARM optimized routines, using non-temporal loads:
+          * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy-sve.S */
+         "loop64:\n"
+         "  ldnp q0, q1, [%[s]]\n"
+         "  stp q0, q1, [%[d]]\n"
+         "  ldnp q0, q1, [%[s], #32]\n"
+         "  stp q0, q1, [%[d], #32]\n"
+         "  add %[s], %[s], #64\n"
+         "  add %[d], %[d], #64\n"
+         "  subs %[len], %[len], #64\n"
+         "  b.pl loop64\n"
+
+         /* Restore <len>. */
+         "  add %[len], %[len], #64\n"
+
+         : [d]"+r"(d), [s]"+r"(s), [len]"+r"(len) :
+         : "v0", "v1", "cc", "memory");
+   }
+#endif
+
 #endif
    /* memcpy() the tail. */
    if (len) {
diff --git a/src/util/streaming-load-memcpy.h b/src/util/streaming-load-memcpy.h
index ee277b0860b..8136ec06e76 100644
--- a/src/util/streaming-load-memcpy.h
+++ b/src/util/streaming-load-memcpy.h
@@ -26,8 +26,8 @@
  *
  */
 
-/* Copies memory from src to dst, using SSE 4.1's MOVNTDQA to get streaming
- * read performance from uncached memory.
+/* Copies memory from src to dst, using non-temporal load instructions
+ * to get streaming read performance from uncached memory.
  */
 
 #ifndef STREAMING_LOAD_MEMCPY_H