diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 0cff9fae04e..4945e7a968d 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -107,6 +107,99 @@ collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
 	}
 }
 
+static bool /* true if this variant should use the doubled threadsize */
+should_double_threadsize(struct ir3_shader_variant *v,
+						 unsigned regs_count) /* in vec4 units, like reg_size_vec4 */
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	switch (v->type) {
+	case MESA_SHADER_COMPUTE: {
+		unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+
+		/* For a5xx, if the workgroup size is greater than the maximum number
+		 * of threads per core with 32 threads per wave (512) then we have to
+		 * use the doubled threadsize because otherwise the workgroup wouldn't
+		 * fit; a variable workgroup size is treated the same way. For smaller
+		 * sizes we follow the blob. This path never checks regs_count.
+		 */
+		if (compiler->gpu_id < 600) {
+			return v->local_size_variable || threads_per_wg >
+				compiler->threadsize_base * compiler->max_waves;
+		}
+
+		/* On a6xx, we prefer the larger threadsize unless the workgroup is
+		 * known to fit in a single base-size wave, where doubling would be
+		 * useless. Because threadsize_base is bumped to 64, we don't have to
+		 * worry about the workgroup fitting, unlike the a5xx case.
+		 */
+		if (!v->local_size_variable) {
+			if (threads_per_wg <= compiler->threadsize_base)
+				return false;
+		}
+	}
+	/* fallthrough: only reached with gpu_id >= 600; apply the regfile check too */
+	case MESA_SHADER_FRAGMENT: {
+		/* Check that doubling the threadsize wouldn't exceed the regfile size */
+		return regs_count * 2 <= compiler->reg_size_vec4;
+	}
+
+	default:
+		/* On a6xx+, it's impossible to use a doubled wavesize in the geometry
+		 * stages - the bit doesn't exist. The blob never used it for the VS
+		 * on earlier gens anyway.
+		 */
+		return false;
+	}
+}
+
+/* Get the maximum number of waves that could be used even if this shader didn't
+ * use any registers: compiler->max_waves capped by shared size and branchstack.
+ */
+static unsigned
+get_reg_independent_max_waves(struct ir3_shader_variant *v, bool double_threadsize)
+{
+	const struct ir3_compiler *compiler = v->shader->compiler;
+	unsigned max_waves = compiler->max_waves;
+
+	/* If this is a compute shader, compute the limit based on shared size */
+	if (v->type == MESA_SHADER_COMPUTE) {
+		/* Shared is allocated in chunks of 1k */
+		unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
+		if (shared_per_wg > 0 && !v->local_size_variable) {
+			unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
+			unsigned threads_per_wg = v->local_size[0] * v->local_size[1] * v->local_size[2];
+			unsigned waves_per_wg =
+				DIV_ROUND_UP(threads_per_wg,
+					compiler->threadsize_base *
+					(double_threadsize ? 2 : 1) * compiler->wave_granularity);
+			max_waves =
+				MIN2(max_waves, waves_per_wg * wgs_per_core * compiler->wave_granularity);
+		}
+	}
+
+	/* Compute the limit based on branchstack (the guard also avoids a /0) */
+	if (v->branchstack > 0) {
+		unsigned branchstack_max_waves =
+			compiler->branchstack_size / v->branchstack *
+			compiler->wave_granularity;
+		max_waves = MIN2(max_waves, branchstack_max_waves);
+	}
+
+	return max_waves;
+}
+
+/* Get the maximum number of waves that could be launched limited by reg size;
+ * a reg_count of 0 yields compiler->max_waves (and avoids dividing by zero). */
+static unsigned
+get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
+							unsigned reg_count, bool double_threadsize)
+{
+	return reg_count ?
+		(compiler->reg_size_vec4 / (reg_count * (double_threadsize ? 2 : 1)) *
+		 compiler->wave_granularity) :
+		compiler->max_waves;
+}
+
 void
 ir3_collect_info(struct ir3_shader_variant *v)
 {
@@ -200,6 +293,20 @@ ir3_collect_info(struct ir3_shader_variant *v)
 			}
 		}
 	}
+
+	/* TODO: for a5xx and below, is there a separate regfile for
+	 * half-registers?
+	 */
+	unsigned regs_count =
+		info->max_reg + 1 + (compiler->gpu_id >= 600 ? ((info->max_half_reg + 2) / 2) : 0);
+
+	info->double_threadsize = should_double_threadsize(v, regs_count);
+	unsigned reg_independent_max_waves =
+		get_reg_independent_max_waves(v, info->double_threadsize);
+	unsigned reg_dependent_max_waves =
+		get_reg_dependent_max_waves(compiler, regs_count, info->double_threadsize);
+	info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
+	assert(info->max_waves <= v->shader->compiler->max_waves);
 }
 
 static struct ir3_register * reg_create(struct ir3 *shader,
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index c869844fa38..bc11f89b94b 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -64,6 +64,11 @@ struct ir3_info {
 	int8_t   max_reg;   /* highest GPR # used by shader */
 	int8_t   max_half_reg;
 	int16_t  max_const;
+	/* This is the maximum # of waves that can be executed at once in one core,
+	 * assuming that they are all executing this shader.
+	 */
+	int8_t   max_waves;
+	bool     double_threadsize;
 	bool     multi_dword_ldp_stp;
 
 	/* number of sync bits: */
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index c27e8bcedfe..ed8b43364c5 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -79,6 +79,13 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 	compiler->gpu_id = gpu_id;
 	compiler->set = ir3_ra_alloc_reg_set(compiler, false);
 
+	/* All known GPUs have 32k local memory (aka shared) */
+	compiler->local_mem_size = 32 * 1024;
+	/* TODO see if older GPUs were different here */
+	compiler->branchstack_size = 64;
+	compiler->wave_granularity = 2;
+	compiler->max_waves = 16;
+
 	if (compiler->gpu_id >= 600) {
 		compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true);
 		compiler->samgq_workaround = true;
@@ -123,6 +130,34 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id)
 		compiler->max_const_safe = 256;
 	}
 
+	if (compiler->gpu_id == 650) {
+		/* This changed mid-generation for a650, so that using r32.x and above
+		 * requires using the smallest threadsize.
+		 */
+		compiler->reg_size_vec4 = 64;
+	} else if (compiler->gpu_id >= 600) {
+		compiler->reg_size_vec4 = 96;
+	} else if (compiler->gpu_id >= 400) {
+		/* On a4xx-a5xx, using r24.x and above requires using the smallest
+		 * threadsize.
+		 */
+		compiler->reg_size_vec4 = 48;
+	} else {
+		/* TODO: confirm this */
+		compiler->reg_size_vec4 = 96;
+	}
+
+	if (compiler->gpu_id >= 600) {
+		compiler->threadsize_base = 64;
+	} else if (compiler->gpu_id >= 400) {
+		/* TODO: Confirm this for a4xx. For a5xx this is based on the Vulkan
+		 * 1.1 subgroupSize which is 32.
+		 */
+		compiler->threadsize_base = 32;
+	} else {
+		compiler->threadsize_base = 8;
+	}
+
 	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
 		compiler->flat_bypass = true;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 54a78f37726..6f7058f37e5 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -107,6 +107,44 @@ struct ir3_compiler {
 	 */
 	uint32_t const_upload_unit;
 
+	/* The base number of threads per wave. Some stages may be able to double
+	 * this.
+	 */
+	uint32_t threadsize_base;
+
+	/* On at least a6xx, waves are always launched in pairs. In calculations
+	 * about occupancy, we pretend that each wave pair is actually one wave,
+	 * which simplifies many of the calculations, but means we have to
+	 * multiply threadsize_base by this number.
+	 */
+	uint32_t wave_granularity;
+
+	/* The maximum number of simultaneous waves per core. */
+	uint32_t max_waves;
+
+	/* This is the theoretical maximum number of vec4 registers that one wave of
+	 * the base threadsize could use. To get the actual size of the register
+	 * file in bytes one would need to compute:
+	 *
+	 * reg_size_vec4 * threadsize_base * wave_granularity * 16 (bytes per vec4)
+	 *
+	 * However this number is more often what we actually need. For example, a
+	 * max_reg more than half of this will result in a doubled threadsize
+	 * being impossible (because double-sized waves take up twice as many
+	 * registers). Also, the formula for the occupancy given a particular
+	 * register footprint is simpler.
+	 *
+	 * It is in vec4 units because the register file is allocated
+	 * with vec4 granularity, so it's in the same units as max_reg.
+	 */
+	uint32_t reg_size_vec4;
+
+	/* The size of local memory in bytes */
+	uint32_t local_mem_size;
+
+	/* The number of total branch stack entries, divided by wave_granularity. */
+	uint32_t branchstack_size;
+
 	/* Whether clip+cull distances are supported */
 	bool has_clip_cull;