diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c
index 6882ab9d0fb..ea31fa3b4cd 100644
--- a/src/asahi/compiler/agx_compile.c
+++ b/src/asahi/compiler/agx_compile.c
@@ -612,10 +612,18 @@ agx_emit_store_preamble(agx_builder *b, nir_intrinsic_instr *instr)
    agx_index vec = agx_src_index(&instr->src[0]);
    unsigned base = nir_intrinsic_base(instr);
    unsigned stride = agx_size_align_16(vec.size);
+   unsigned nr = nir_src_num_components(instr->src[0]);
 
-   for (unsigned i = 0; i < nir_src_num_components(instr->src[0]); ++i) {
-      agx_uniform_store(b, agx_extract_nir_src(b, instr->src[0], i),
-                        agx_immediate(base + i * stride));
+   for (unsigned i = 0; i < nr; i += (4 / stride)) {
+      agx_index data[4] = {0};
+      unsigned count = MIN2(4 / stride, nr - i);
+
+      for (unsigned c = 0; c < count; ++c) {
+         data[c] = agx_extract_nir_src(b, instr->src[0], i + c);
+      }
+
+      agx_uniform_store(b, agx_emit_collect(b, count, data),
+                        agx_immediate(base + i * stride), BITFIELD_MASK(count));
    }
 
    return NULL;
diff --git a/src/asahi/compiler/agx_opcodes.py b/src/asahi/compiler/agx_opcodes.py
index 3dd027a1597..a08b63689d7 100644
--- a/src/asahi/compiler/agx_opcodes.py
+++ b/src/asahi/compiler/agx_opcodes.py
@@ -322,7 +322,7 @@ op("local_store",
 # TODO: Consider permitting the short form
 op("uniform_store",
       encoding_32 = ((0b111 << 27) | 0b1000101 | (1 << 47), 0, 8, _),
-      dests = 0, srcs = 2, can_eliminate = False)
+      dests = 0, srcs = 2, imms = [MASK], can_eliminate = False)
 
 # sources are value, base, index
 op("atomic",
diff --git a/src/asahi/compiler/agx_pack.c b/src/asahi/compiler/agx_pack.c
index a276cb1c8d8..b7884a2cfbd 100644
--- a/src/asahi/compiler/agx_pack.c
+++ b/src/asahi/compiler/agx_pack.c
@@ -674,16 +674,25 @@ agx_pack_instr(struct util_dynarray *emission, struct util_dynarray *fixups,
       bool is_store = is_device_store || is_uniform_store;
       bool has_base = !is_uniform_store;
 
-      /* Uniform stores internally packed as 16-bit. Fix up the format, mask,
-       * and size so we can use scalar 32-bit values in the IR and avoid
-       * special casing earlier in the compiler.
+      /* Uniform stores are required to be 16-bit. The encoding that should be
+       * 32-bit annoyingly doesn't work. Fix up the format, mask, and size so
+       * we can use scalar 32-bit values in the IR and avoid special casing
+       * earlier in the compiler.
        */
       enum agx_format format = is_uniform_store ? AGX_FORMAT_I16 : I->format;
       agx_index reg = is_store ? I->src[0] : I->dest[0];
       unsigned mask = I->mask;
 
-      if (is_uniform_store) {
-         mask = BITFIELD_MASK(agx_size_align_16(reg.size));
+      if (is_uniform_store && reg.size != AGX_SIZE_16) {
+         if (reg.size == AGX_SIZE_64) {
+            assert(mask == 1);
+            mask = BITFIELD_MASK(4);
+         } else {
+            assert(reg.size == AGX_SIZE_32);
+            assert(mask == 1 || mask == 3);
+            mask = BITFIELD_MASK(mask == 3 ? 4 : 2);
+         }
+
          reg.size = AGX_SIZE_16;
       }
 
diff --git a/src/asahi/compiler/agx_validate.c b/src/asahi/compiler/agx_validate.c
index defa090e40c..cf539fdb5b0 100644
--- a/src/asahi/compiler/agx_validate.c
+++ b/src/asahi/compiler/agx_validate.c
@@ -231,6 +231,12 @@ agx_read_registers(const agx_instr *I, unsigned s)
    case AGX_OPCODE_SPLIT:
       return I->nr_dests * agx_size_align_16(agx_split_width(I));
 
+   case AGX_OPCODE_UNIFORM_STORE:
+      if (s == 0)
+         return util_bitcount(I->mask) * size;
+      else
+         return size;
+
    case AGX_OPCODE_DEVICE_STORE:
    case AGX_OPCODE_LOCAL_STORE:
    case AGX_OPCODE_STACK_STORE: