diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 026f715edca..33012d4bb01 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -866,6 +866,14 @@ load("output_u8_as_fp16_pan", 0, [], [CAN_ELIMINATE, CAN_REORDER])
 # src[] = { sampler_index }
 load("sampler_lod_parameters_pan", 1, [CAN_ELIMINATE, CAN_REORDER])
 
+# R600 specific intrinsics
+#
+# R600 can only fetch 16 byte aligned data from a UBO, and the actual offset
+# is given in vec4 units, so we have to fetch a vec4 and get the component
+# later
+# src[] = { buffer_index, offset }.
+load("ubo_r600", 2, [ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE, CAN_REORDER])
+
 # V3D-specific instrinc for tile buffer color reads.
 #
 # The hardware requires that we read the samples and components of a pixel
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
index 38e32e4ce3d..e0da743144a 100644
--- a/src/compiler/nir/nir_validate.c
+++ b/src/compiler/nir/nir_validate.c
@@ -531,6 +531,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 
    case nir_intrinsic_load_deref: {
       nir_deref_instr *src = nir_src_as_deref(instr->src[0]);
+      assert(src);
       validate_assert(state, glsl_type_is_vector_or_scalar(src->type) ||
                       (src->mode == nir_var_uniform &&
                        glsl_get_base_type(src->type) == GLSL_TYPE_SUBROUTINE));
@@ -545,6 +546,7 @@ validate_intrinsic_instr(nir_intrinsic_instr *instr, validate_state *state)
 
    case nir_intrinsic_store_deref: {
       nir_deref_instr *dst = nir_src_as_deref(instr->src[0]);
+      assert(dst);
       validate_assert(state, glsl_type_is_vector_or_scalar(dst->type));
       validate_assert(state, instr->num_components ==
                              glsl_get_vector_elements(dst->type));
diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
index 206435528a6..ff7bee26b4f 100644
--- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp
@@ -386,12 +386,112 @@ bool r600_lower_scratch_addresses(nir_shader *shader)
    return progress;
 }
 
+/* Lowering callback: emit a load_ubo_r600 in place of a load_ubo and, when the
+ * address is not constant and fewer than four components are requested, pick
+ * them out of the fetched vec4. The result is returned to the lowering helper. */
+static nir_ssa_def *
+r600_lower_ubo_to_align16_impl(nir_builder *b, nir_instr *instr, void *_options)
+{
+   b->cursor = nir_before_instr(instr);
+
+   nir_intrinsic_instr *op = nir_instr_as_intrinsic(instr);
+   assert(op->intrinsic == nir_intrinsic_load_ubo);
+
+   bool const_address = (nir_src_is_const(op->src[1]) && nir_src_is_const(op->src[0]));
+   nir_ssa_def *offset = op->src[1].ssa;
+
+   /* This is ugly: With const addressing we can actually set a proper fetch target mask,
+    * but for this we need the component encoded, so we don't shift and do the decoding in
+    * the backend. Otherwise we shift by four and resolve the component here.
+    * (TODO: encode the start component in the intrinsic when the offset base is non-constant
+    * but a multiple of 16.) */
+   nir_ssa_def *new_offset = offset;
+   if (!const_address)
+      new_offset = nir_ishr(b, offset, nir_imm_int(b, 4));
+
+   nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_ubo_r600);
+   load->num_components = const_address ? op->num_components : 4;
+   load->src[0] = op->src[0];
+   load->src[1] = nir_src_for_ssa(new_offset);
+   nir_intrinsic_set_align(load, nir_intrinsic_align(op), nir_intrinsic_align_offset(op));
+
+   nir_ssa_dest_init(&load->instr, &load->dest, load->num_components, 32, NULL);
+   nir_builder_instr_insert(b, &load->instr);
+
+   /* When four components are loaded or both the offset and the location
+    * are constant, then the backend can deal with it better. */
+   if (op->num_components == 4 || const_address)
+      return &load->dest.ssa;
+
+   /* What comes below is a performance disaster when the offset is not constant
+    * because then we have to assume that any component can be the first one and we
+    * have to pick the result manually: first_comp = (byte offset / 4) & 3. */
+   nir_ssa_def *first_comp = nir_iand(b, nir_ishr(b, offset, nir_imm_int(b, 2)),
+                                     nir_imm_int(b, 3));
+
+   const unsigned swz_000[4] = {0, 0, 0, 0};
+   nir_ssa_def *component_select = nir_ieq(b, nir_imm_ivec4(b, 0, 1, 2, 3),
+                                           nir_swizzle(b, first_comp, swz_000, 4));
+
+   const unsigned szw_0[1] = {0};
+   const unsigned szw_1[1] = {1};
+   const unsigned szw_2[1] = {2};
+
+   if (op->num_components == 1) {
+      const unsigned szw_3[1] = {3};
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_3, 1));
+      nir_ssa_def *check1 = nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_1, 1),
+                                      check0);
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_2, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_2, 1),
+                       check1);
+   } else if (op->num_components == 2) {
+      const unsigned szw_01[2] = {0, 1};
+      const unsigned szw_12[2] = {1, 2};
+      const unsigned szw_23[2] = {2, 3};
+      nir_ssa_def *check0 = nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_01, 2),
+                                      nir_swizzle(b, &load->dest.ssa, szw_23, 2));
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_1, 1),
+                                      nir_swizzle(b, &load->dest.ssa, szw_12, 2),
+                                      check0);
+   } else {
+      const unsigned szw_012[3] = {0, 1, 2};
+      const unsigned szw_123[3] = {1, 2, 3};
+      return nir_bcsel(b, nir_swizzle(b, component_select, szw_0, 1),
+                       nir_swizzle(b, &load->dest.ssa, szw_012, 3),
+                       nir_swizzle(b, &load->dest.ssa, szw_123, 3));
+   }
+}
+
+/* Filter for the align16 UBO lowering: accepts only load_ubo intrinsics. */
+bool r600_lower_ubo_to_align16_filter(const nir_instr *instr, const void *_options)
+{
+   const bool is_intrinsic = instr->type == nir_instr_type_intrinsic;
+
+   /* Short-circuit so the cast is only done on intrinsic instructions. */
+   return is_intrinsic && nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_ubo;
+}
+
+
+/* Run the align16 UBO lowering (filter + impl callbacks) over the shader. */
+bool r600_lower_ubo_to_align16(nir_shader *shader)
+{
+   const bool progress = nir_shader_lower_instructions(
+      shader, r600_lower_ubo_to_align16_filter, r600_lower_ubo_to_align16_impl, nullptr);
+   return progress;
+}
+
 }
 
 using r600::r600_nir_lower_int_tg4;
 using r600::r600_nir_lower_pack_unpack_2x16;
 using r600::r600_lower_scratch_addresses;
 using r600::r600_lower_fs_out_to_vector;
+using r600::r600_lower_ubo_to_align16;
 
 int
 r600_glsl_type_size(const struct glsl_type *type, bool is_bindless)
@@ -512,7 +612,10 @@ int r600_shader_from_nir(struct r600_context *rctx,
    const nir_function *func = reinterpret_cast<const nir_function *>(exec_list_get_head_const(&sel->nir->functions));
    bool optimize = func->impl->registers.length() == 0 && !has_saturate(func);
 
-
+   if (optimize) {
+      optimize_once(sel->nir);
+      NIR_PASS_V(sel->nir, r600_lower_ubo_to_align16);
+   }
    /* It seems the output of this optimization is cached somewhere, and
     * when there are registers, then we can no longer copy propagate, so
     * skip the optimization then. (There is probably a better way, but yeah)
diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
index 96c8e804e2b..0ee6bbd6313 100644
--- a/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
+++ b/src/gallium/drivers/r600/sfn/sfn_shader_base.cpp
@@ -447,7 +447,7 @@ bool ShaderFromNirProcessor::emit_intrinsic_instruction(nir_intrinsic_instr* ins
    case nir_intrinsic_discard:
    case nir_intrinsic_discard_if:
       return emit_discard_if(instr);
-   case nir_intrinsic_load_ubo:
+   case nir_intrinsic_load_ubo_r600:
       return emit_load_ubo(instr);
    case nir_intrinsic_copy_deref:
    case nir_intrinsic_load_constant: