diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 751921af2c6..ff1a09babfc 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -10,16 +10,14 @@ /* Set this if you want the ME to wait until CP DMA is done. * It should be set on the last CP DMA packet. */ -#define CP_DMA_SYNC (1 << 0) +#define CP_DMA_SYNC (1 << 0) /* Set this if the source data was used as a destination in a previous CP DMA * packet. It's for preventing a read-after-write (RAW) hazard between two * CP DMA packets. */ #define CP_DMA_RAW_WAIT (1 << 1) -#define CP_DMA_DST_IS_GDS (1 << 2) -#define CP_DMA_CLEAR (1 << 3) -#define CP_DMA_PFP_SYNC_ME (1 << 4) -#define CP_DMA_SRC_IS_GDS (1 << 5) +#define CP_DMA_CLEAR (1 << 2) +#define CP_DMA_PFP_SYNC_ME (1 << 3) static bool cp_dma_use_L2(struct si_context *sctx) { @@ -40,10 +38,7 @@ static inline unsigned cp_dma_max_byte_count(struct si_context *sctx) /* should cp dma skip the hole in sparse bo */ static inline bool cp_dma_sparse_wa(struct si_context *sctx, struct si_resource *sdst) { - if ((sctx->gfx_level == GFX9) && sdst && (sdst->flags & RADEON_FLAG_SPARSE)) - return true; - - return false; + return sctx->gfx_level == GFX9 && sdst->flags & RADEON_FLAG_SPARSE; } /* Emit a CP DMA packet to do a copy from one buffer to another, or to clear @@ -71,22 +66,11 @@ static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf *cs, ui command |= S_415_RAW_WAIT(1); /* Src and dst flags. */ - if (sctx->gfx_level >= GFX9 && !(flags & CP_DMA_CLEAR) && src_va == dst_va) { - header |= S_411_DST_SEL(V_411_NOWHERE); /* prefetch only */ - } else if (flags & CP_DMA_DST_IS_GDS) { - header |= S_411_DST_SEL(V_411_GDS); - /* GDS increments the address, not CP. */ - command |= S_415_DAS(V_415_REGISTER) | S_415_DAIC(V_415_NO_INCREMENT); - } else if (cp_dma_use_L2(sctx)) { + if (cp_dma_use_L2(sctx)) header |= S_501_DST_SEL(V_501_DST_ADDR_TC_L2); - } if (flags & CP_DMA_CLEAR) { header |= S_411_SRC_SEL(V_411_DATA); - } else if (flags & CP_DMA_SRC_IS_GDS) { - header |= S_411_SRC_SEL(V_411_GDS); - /* Both of these are required for GDS. It does increment the address. */ - command |= S_415_SAS(V_415_REGISTER) | S_415_SAIC(V_415_NO_INCREMENT); } else if (cp_dma_use_L2(sctx)) { header |= S_501_SRC_SEL(V_501_SRC_ADDR_TC_L2); } @@ -138,13 +122,11 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst uint64_t remaining_size, unsigned user_flags, bool *is_first, unsigned *packet_flags) { - if (!(user_flags & SI_OP_CPDMA_SKIP_CHECK_CS_SPACE)) - si_need_gfx_cs_space(sctx, 0); + si_need_gfx_cs_space(sctx, 0); /* This must be done after need_cs_space. */ - if (dst) - radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst), - RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA); + radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(dst), + RADEON_USAGE_WRITE | RADEON_PRIO_CP_DMA); if (src) radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(src), RADEON_USAGE_READ | RADEON_PRIO_CP_DMA); @@ -172,7 +154,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, unsigned value, unsigned user_flags) { struct si_resource *sdst = si_resource(dst); - uint64_t va = (sdst ? sdst->gpu_address : 0) + offset; + uint64_t va = sdst->gpu_address + offset; bool is_first = true; assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope); @@ -190,13 +172,11 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ - if (sdst) { - util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); + util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size); - if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) { - sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | - (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2); - } + if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) { + sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | + (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2); } if (sctx->flags) @@ -204,9 +184,9 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, while (size) { unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = CP_DMA_CLEAR | (sdst ? 0 : CP_DMA_DST_IS_GDS); + unsigned dma_flags = CP_DMA_CLEAR; - if (cp_dma_sparse_wa(sctx,sdst)) { + if (cp_dma_sparse_wa(sctx, sdst)) { unsigned skip_count = sctx->ws->buffer_find_next_committed_memory(sdst->buf, va - sdst->gpu_address, &byte_count); @@ -226,7 +206,7 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, va += byte_count; } - if (sdst && cp_dma_use_L2(sctx)) + if (cp_dma_use_L2(sctx)) sdst->TC_L2_dirty = true; sctx->num_cp_dma_calls++; @@ -271,7 +251,6 @@ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, uns /** * Do memcpy between buffers using CP DMA. - * If src or dst is NULL, it means read or write GDS, respectively. * * \param user_flags bitmask of SI_CPDMA_* */ @@ -279,28 +258,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned user_flags) { - uint64_t main_dst_offset, main_src_offset; - unsigned skipped_size = 0; - unsigned realign_size = 0; - unsigned gds_flags = (dst ? 0 : CP_DMA_DST_IS_GDS) | (src ? 0 : CP_DMA_SRC_IS_GDS); - bool is_first = true; - assert(!sctx->screen->info.cp_sdma_ge_use_system_memory_scope); assert(size); + assert(dst && src); - if (dst) { - /* Skip this for the L2 prefetch. */ - if (dst != src || dst_offset != src_offset) { - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); - } + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. + */ + util_range_add(dst, &si_resource(dst)->valid_buffer_range, dst_offset, dst_offset + size); - dst_offset += si_resource(dst)->gpu_address; - } - if (src) - src_offset += si_resource(src)->gpu_address; + dst_offset += si_resource(dst)->gpu_address; + src_offset += si_resource(src)->gpu_address; + + unsigned skipped_size = 0; + unsigned realign_size = 0; /* The workarounds aren't needed on Fiji and beyond. */ if (sctx->family <= CHIP_CARRIZO || sctx->family == CHIP_STONEY) { @@ -314,10 +286,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, /* If the copy begins unaligned, we must start copying from the next * aligned block and the skipped part should be copied after everything * else has been copied. Only the src alignment matters, not dst. - * - * GDS doesn't need the source address to be aligned. */ - if (src && src_offset % SI_CPDMA_ALIGNMENT) { + if (src_offset % SI_CPDMA_ALIGNMENT) { skipped_size = SI_CPDMA_ALIGNMENT - (src_offset % SI_CPDMA_ALIGNMENT); /* The main part will be skipped if the size is too small. */ skipped_size = MIN2(skipped_size, size); @@ -327,8 +297,8 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, /* TMZ handling */ if (unlikely(radeon_uses_secure_bos(sctx->ws))) { - bool secure = src && (si_resource(src)->flags & RADEON_FLAG_ENCRYPTED); - assert(!secure || (!dst || (si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED))); + bool secure = si_resource(src)->flags & RADEON_FLAG_ENCRYPTED; + assert(!secure || si_resource(dst)->flags & RADEON_FLAG_ENCRYPTED); if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) { si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW | RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL); @@ -344,7 +314,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, if (user_flags & SI_OP_SYNC_PS_BEFORE) sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH; - if ((dst || src) && !(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) { + if (!(user_flags & SI_OP_SKIP_CACHE_INV_BEFORE)) { sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE | (cp_dma_use_L2(sctx) ? 0 : SI_CONTEXT_INV_L2); } @@ -353,12 +323,13 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, si_mark_atom_dirty(sctx, &sctx->atoms.s.cache_flush); /* This is the main part doing the copying. Src is always aligned. */ - main_dst_offset = dst_offset + skipped_size; - main_src_offset = src_offset + skipped_size; + uint64_t main_dst_offset = dst_offset + skipped_size; + uint64_t main_src_offset = src_offset + skipped_size; + bool is_first = true; while (size) { unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = gds_flags; + unsigned dma_flags = 0; if (cp_dma_sparse_wa(sctx, si_resource(dst))) { unsigned skip_count = @@ -393,7 +364,7 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, /* Copy the part we skipped because src wasn't aligned. */ if (skipped_size) { - unsigned dma_flags = gds_flags; + unsigned dma_flags = 0; si_cp_dma_prepare(sctx, dst, src, skipped_size, skipped_size + realign_size, user_flags, &is_first, &dma_flags); @@ -405,12 +376,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, if (realign_size) si_cp_dma_realign_engine(sctx, realign_size, user_flags, &is_first); - if (dst && cp_dma_use_L2(sctx)) + if (cp_dma_use_L2(sctx)) si_resource(dst)->TC_L2_dirty = true; - /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) - sctx->num_cp_dma_calls++; + sctx->num_cp_dma_calls++; } void si_cp_write_data(struct si_context *sctx, struct si_resource *buf, unsigned offset, diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index d8efac11b11..859f9ffd4f3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1468,7 +1468,7 @@ void si_destroy_compute(struct si_compute *program); #define SI_OP_SKIP_CACHE_INV_BEFORE (1 << 4) /* don't invalidate caches */ #define SI_OP_CS_IMAGE (1 << 5) #define SI_OP_CS_RENDER_COND_ENABLE (1 << 6) -#define SI_OP_CPDMA_SKIP_CHECK_CS_SPACE (1 << 7) /* don't call need_cs_space */ +/* gap */ #define SI_OP_SYNC_GE_BEFORE (1 << 8) /* only sync VS, TCS, TES, GS */ /* Only for si_compute_blit: */ #define SI_OP_FAIL_IF_SLOW (1 << 9)