mediafoundation: Refactor frame, multi slice and combine slice IMFSample emission to make it simpler

Reviewed-by: Pohsiang (John) Hsu <pohhsu@microsoft.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37982>
2025-10-16 16:47:50 -04:00
parent f4f619e26e
commit fd546c1cde
3 changed files with 227 additions and 220 deletions
@@ -74,6 +74,13 @@ typedef class DX12EncodeContext
      struct pipe_h265_enc_picture_desc h265enc;
      struct pipe_av1_enc_picture_desc av1enc;
   } encoderPicInfo = {};
+
+   // Returns true when m_Codec is H264 or HEVC and the matching picture descriptor
+   // (h264enc / h265enc) has slice_mode set to PIPE_VIDEO_SLICE_MODE_AUTO.
+   // Any other codec yields false.
+   bool IsSliceAutoModeEnabled()
+   {
+      switch( m_Codec )
+      {
+         case D3D12_VIDEO_ENCODER_CODEC_H264:
+            return encoderPicInfo.h264enc.slice_mode == PIPE_VIDEO_SLICE_MODE_AUTO;
+         case D3D12_VIDEO_ENCODER_CODEC_HEVC:
+            return encoderPicInfo.h265enc.slice_mode == PIPE_VIDEO_SLICE_MODE_AUTO;
+         default:
+            return false;
+      }
+   }
+
   const D3D12_VIDEO_ENCODER_CODEC m_Codec = D3D12_VIDEO_ENCODER_CODEC_H264;
   UINT32 GetPictureType()
   {
@@ -620,6 +620,19 @@ class __declspec( uuid( HMFT_GUID ) ) CDX12EncHMFT : CMFD3DManager,
                                                              ComPtr<ID3D12Fence> &pResolveStatsCompletionFence,
                                                              UINT64 ResolveStatsCompletionFenceValue,
                                                              ID3D12CommandQueue *pSyncObjectQueue );
+   // Queries the codec unit (NAL) locations of slice `slice_idx`, maps that slice's output
+   // resource and copies every codec unit into lpBuffer starting at output_buffer_offset.
+   // Each copied unit is appended to mfsample_codec_unit_metadata with its offset rewritten
+   // to its position in lpBuffer, and output_buffer_offset is advanced by the bytes copied.
+   void ProcessSliceBitstreamData( LPDX12EncodeContext pDX12EncodeContext,
+                                   uint32_t slice_idx,
+                                   LPBYTE lpBuffer,
+                                   std::vector<struct codec_unit_location_t> &mfsample_codec_unit_metadata,
+                                   uint64_t &output_buffer_offset );
+   // Adds spMemoryBuffer to spOutputSample, configures the bitstream attributes (from the
+   // given codec unit metadata) and the async stats attributes on the sample, then queues a
+   // METransformHaveOutput event and, if that succeeds, pushes the sample to the output queue.
+   void FinalizeAndEmitOutputSample( LPDX12EncodeContext pDX12EncodeContext,
+                                     ComPtr<IMFMediaBuffer> &spMemoryBuffer,
+                                     ComPtr<IMFSample> &spOutputSample,
+                                     struct codec_unit_location_t *pCodecUnitMetadata,
+                                     unsigned CodecUnitMetadataCount,
+                                     DWORD dwReceivedInput,
+                                     BOOL bIsLastSlice,
+                                     uint64_t ResolveStatsCompletionFenceValue );
   HRESULT UpdateAvailableInputType();
   HRESULT InternalCheckInputType( IMFMediaType *pType );
   HRESULT InternalCheckOutputType( IMFMediaType *pType );
@@ -1259,6 +1259,128 @@ done:
   return hr;
 }

+// Copies the bitstream of one slice into the caller's output buffer.
+//
+// pDX12EncodeContext           - encode context providing pAsyncCookie and the per-slice pOutputBitRes resources
+// slice_idx                    - index of the slice to read back
+// lpBuffer                     - destination memory; written starting at output_buffer_offset
+// mfsample_codec_unit_metadata - receives one entry per copied codec unit, with its offset rewritten
+//                                to the unit's position inside lpBuffer
+// output_buffer_offset         - in/out write cursor into lpBuffer, advanced by the number of bytes copied
+//
+// The callers in xThreadProc wait on the slice fence before calling this function.
+// If mapping the slice resource fails, nothing is copied and both outputs are left untouched.
+//
+// NOTE(review): the capacity of lpBuffer is not passed in, so the copies below are not bounds-checked
+// here; presumably the caller's buffer (m_uiMaxOutputBitstreamSize) is large enough - confirm.
+void
+CDX12EncHMFT::ProcessSliceBitstreamData( LPDX12EncodeContext pDX12EncodeContext,
+                                          uint32_t slice_idx,
+                                          LPBYTE lpBuffer,
+                                          std::vector<struct codec_unit_location_t> &mfsample_codec_unit_metadata,
+                                          uint64_t &output_buffer_offset )
+{
+   HMFT_ETW_EVENT_START( "GPUIndividualSliceStatsRead", this );
+   // First call passes a NULL destination to obtain only the number of codec units in the slice
+   unsigned codec_unit_metadata_count = 0u;
+   m_pPipeVideoCodec->get_slice_bitstream_data( m_pPipeVideoCodec,
+                                                 pDX12EncodeContext->pAsyncCookie,
+                                                 slice_idx,
+                                                 NULL /*get size*/,
+                                                 &codec_unit_metadata_count );
+   assert( codec_unit_metadata_count > 0 );
+   // Second call fills in the location (offset/size) of every codec unit
+   std::vector<struct codec_unit_location_t> codec_unit_metadata;
+   codec_unit_metadata.resize( codec_unit_metadata_count, {} );
+   m_pPipeVideoCodec->get_slice_bitstream_data( m_pPipeVideoCodec,
+                                                 pDX12EncodeContext->pAsyncCookie,
+                                                 slice_idx,
+                                                 codec_unit_metadata.data(),
+                                                 &codec_unit_metadata_count );
+
+   HMFT_ETW_EVENT_STOP( "GPUIndividualSliceStatsRead", this );
+
+   //
+   // Copy all the NALs produced in this slice
+   //
+   HMFT_ETW_EVENT_START( "GPUIndividualSliceBitstreamRead", this );
+   // The mapped width is the sum of all codec unit sizes of this slice.
+   // NOTE(review): the copy below reads at nal.offset, so this assumes the units are packed from
+   // offset 0 of the slice resource - confirm against the driver's get_slice_bitstream_data.
+   struct pipe_box box = { 0 };
+   box.width = 0;
+   for( auto &nal : codec_unit_metadata )
+      box.width += static_cast<int32_t>( nal.size );
+   box.height = pDX12EncodeContext->pOutputBitRes[slice_idx]->height0;
+   box.depth = pDX12EncodeContext->pOutputBitRes[slice_idx]->depth0;
+   struct pipe_transfer *transfer_data = NULL;
+   HMFT_ETW_EVENT_START( "GPUIndividualSliceBufferMap", this );
+   uint8_t *pMappedBuffer =
+      (uint8_t *) m_pPipeContext->buffer_map( m_pPipeContext,
+                                              pDX12EncodeContext->pOutputBitRes[slice_idx],
+                                              0,
+                                              PIPE_MAP_READ,
+                                              &box,
+                                              &transfer_data );
+   HMFT_ETW_EVENT_STOP( "GPUIndividualSliceBufferMap", this );
+   assert( pMappedBuffer );
+   if( pMappedBuffer )
+   {
+      for( auto &nal : codec_unit_metadata )
+      {
+         // Add NAL with adjusted offset for accumulated buffer position
+         struct codec_unit_location_t accumulated_nal = nal;
+         accumulated_nal.offset = output_buffer_offset;
+         mfsample_codec_unit_metadata.push_back( accumulated_nal );
+
+         memcpy( lpBuffer + static_cast<size_t>( output_buffer_offset ),
+                  pMappedBuffer + static_cast<size_t>( nal.offset ),
+                  static_cast<size_t>( nal.size ) );
+         output_buffer_offset += nal.size;
+      }
+      // Unmap only when the map succeeded: on failure transfer_data is still NULL
+      // and must not be handed to pipe_buffer_unmap.
+      HMFT_ETW_EVENT_START( "GPUIndividualSliceBufferUnmap", this );
+      pipe_buffer_unmap( m_pPipeContext, transfer_data );
+      HMFT_ETW_EVENT_STOP( "GPUIndividualSliceBufferUnmap", this );
+   }
+   HMFT_ETW_EVENT_STOP( "GPUIndividualSliceBitstreamRead", this );
+}
+
+// Completes an output IMFSample and hands it to the output queue.
+//
+// pDX12EncodeContext               - encode context providing the stats resources, spAsyncFence and pSyncObjectQueue
+// spMemoryBuffer                   - media buffer holding the bitstream; added to the sample
+// spOutputSample                   - sample to finalize; detached into m_OutputQueue when the event is queued
+// pCodecUnitMetadata / Count       - codec unit locations forwarded to ConfigureBitstreamOutputSampleAttributes
+// dwReceivedInput, bIsLastSlice    - forwarded to ConfigureBitstreamOutputSampleAttributes
+// ResolveStatsCompletionFenceValue - fence value forwarded to ConfigureAsyncStatsMetadataOutputSampleAttributes
+//
+// Failures of the individual steps are logged and the remaining steps still run. If queueing the
+// METransformHaveOutput event fails, the sample is not pushed to the output queue.
+void
+CDX12EncHMFT::FinalizeAndEmitOutputSample( LPDX12EncodeContext pDX12EncodeContext,
+                                           ComPtr<IMFMediaBuffer> &spMemoryBuffer,
+                                           ComPtr<IMFSample> &spOutputSample,
+                                           struct codec_unit_location_t *pCodecUnitMetadata,
+                                           unsigned CodecUnitMetadataCount,
+                                           DWORD dwReceivedInput,
+                                           BOOL bIsLastSlice,
+                                           uint64_t ResolveStatsCompletionFenceValue )
+{
+   HRESULT hr = spOutputSample->AddBuffer( spMemoryBuffer.Get() );
+   if( FAILED( hr ) )
+   {
+      MFE_ERROR( "[dx12 hmft 0x%p] AddBuffer failed - hr=0x%08x", this, hr );
+   }
+
+   hr = ConfigureBitstreamOutputSampleAttributes( spOutputSample.Get(),
+                                                  pDX12EncodeContext,
+                                                  dwReceivedInput,
+                                                  bIsLastSlice,
+                                                  pCodecUnitMetadata,
+                                                  CodecUnitMetadataCount );
+   if( FAILED( hr ) )
+   {
+      MFE_ERROR( "[dx12 hmft 0x%p] ConfigureBitstreamOutputSampleAttributes failed - hr=0x%08x", this, hr );
+   }
+
+   // Attach the async stats DXGIBuffers to the MFSample output gated by pAsyncFence completion
+   {
+      // Set stats metadata buffers to the sample here. As we are returning the dxgi buffers gated by the completion fence
+      // for the resolved stats we do not need to wait for the pAsyncFence completion on the CPU.
+      if( FAILED( ConfigureAsyncStatsMetadataOutputSampleAttributes( spOutputSample.Get(),
+                                                                     pDX12EncodeContext->pPipeResourcePSNRStats,
+                                                                     pDX12EncodeContext->pPipeResourceQPMapStats,
+                                                                     pDX12EncodeContext->pPipeResourceRCBitAllocMapStats,
+                                                                     pDX12EncodeContext->pPipeResourceSATDMapStats,
+                                                                     pDX12EncodeContext->spAsyncFence,
+                                                                     ResolveStatsCompletionFenceValue,
+                                                                     pDX12EncodeContext->pSyncObjectQueue ) ) )
+      {
+         MFE_ERROR( "[dx12 hmft 0x%p] ConfigureAsyncStatsMetadataOutputSampleAttributes failed", this );
+      }
+   }
+
+   // Issue a new METransformHaveOutput event for this MFSample.
+   // This helper is shared by the per-slice, combined-slices and full-frame paths;
+   // the async stats attached above are gated by the fence value, so no CPU wait is done here.
+   {
+      std::lock_guard<std::mutex> lock( m_OutputQueueLock );
+      HMFT_ETW_EVENT_INFO( "METransformHaveOutput", this );
+      hr = QueueEvent( METransformHaveOutput, GUID_NULL, S_OK, nullptr );
+      if( SUCCEEDED( hr ) )
+      {
+         // Ownership of the sample reference moves to the output queue
+         m_OutputQueue.push( spOutputSample.Detach() );
+         m_dwHaveOutputCount++;
+      }
+      else
+      {
+         // The sample stays owned by spOutputSample and is never delivered; make the drop visible
+         MFE_ERROR( "[dx12 hmft 0x%p] QueueEvent METransformHaveOutput failed, dropping output sample - hr=0x%08x", this, hr );
+      }
+   }
+}
+
 // internal thread function to handle encoding and output
 void WINAPI
 CDX12EncHMFT::xThreadProc( void *pCtx )
@@ -1304,25 +1426,12 @@ CDX12EncHMFT::xThreadProc( void *pCtx )
         HMFT_ETW_EVENT_START( "TimeToEmitMFSampleOutput", pThis );
         pipe_enc_feedback_metadata metadata = {};
         unsigned int encoded_bitstream_bytes = 0u;
-         std::vector<ComPtr<IMFSample>> spOutputSamples;
-         std::vector<ComPtr<IMFMediaBuffer>> spMemoryBuffers;
-
-         unsigned int num_output_samples_emitted = 1u; // Assume D3D12_VIDEO_ENCODER_COMPRESSED_BITSTREAM_NOTIFICATION_MODE_FULL_FRAME
-         if (pDX12EncodeContext->sliceNotificationMode == D3D12_VIDEO_ENCODER_COMPRESSED_BITSTREAM_NOTIFICATION_MODE_SUBREGIONS)
-         {
-            num_output_samples_emitted = (pThis->m_bSliceGenerationModeSet && (pThis->m_uiSliceGenerationMode == 1)) ?
-               static_cast<uint32_t>( pDX12EncodeContext->pSliceFences.size() ) :
-               1u;
-         }
-         spOutputSamples.resize( num_output_samples_emitted );
-         spMemoryBuffers.resize( num_output_samples_emitted );
-         HMFT_ETW_EVENT_START( "CreateOutputSamples", pThis );
-         for ( unsigned int sample_idx = 0; sample_idx < num_output_samples_emitted; sample_idx++ )
-         {
-            MFCreateSample( &spOutputSamples[sample_idx] );
-            MFCreateMemoryBuffer( pThis->m_uiMaxOutputBitstreamSize, &spMemoryBuffers[sample_idx] );
-         }
-         HMFT_ETW_EVENT_STOP( "CreateOutputSamples", pThis );
+         uint64_t ResolveStatsCompletionFenceValue = 0;
+         HANDLE fence_handle = (HANDLE) pThis->m_pPipeContext->screen->fence_get_win32_handle( pThis->m_pPipeContext->screen,
+                                                                                                pDX12EncodeContext->pAsyncFence,
+                                                                                                &ResolveStatsCompletionFenceValue );
+         if( fence_handle )
+            CloseHandle( fence_handle );

         {
            std::lock_guard<std::mutex> lock( pThis->m_encoderLock );
@@ -1334,178 +1443,93 @@ CDX12EncHMFT::xThreadProc( void *pCtx )
            // Otherwise, let's copy all the sliced together here after full frame completion (see below)
            if ( !pThis->m_bFlushing && ( pDX12EncodeContext->sliceNotificationMode == D3D12_VIDEO_ENCODER_COMPRESSED_BITSTREAM_NOTIFICATION_MODE_SUBREGIONS ))
            {
-               // Obtain fence value from pipe_fence_handle
-               uint64_t ResolveStatsCompletionFenceValue = 0;
-               HANDLE fence_handle = (HANDLE) pThis->m_pPipeContext->screen->fence_get_win32_handle( pThis->m_pPipeContext->screen,
-                                                                                                      pDX12EncodeContext->pAsyncFence,
-                                                                                                      &ResolveStatsCompletionFenceValue );
-               if( fence_handle )
-                  CloseHandle( fence_handle );
-
               //
               // Wait for each slice fence and resolve offset/size as each slice is ready
               //
-               uint64_t output_buffer_offset = 0u;
-
               uint32_t num_slice_buffers = static_cast<uint32_t>( pDX12EncodeContext->pSliceFences.size() );
-               std::vector<struct codec_unit_location_t> codec_unit_metadata;
+               uint64_t output_buffer_offset = 0u;
               std::vector<struct codec_unit_location_t> mfsample_codec_unit_metadata;
-               const size_t max_default_init_alloc_count_nals = 64u;
-               codec_unit_metadata.reserve( max_default_init_alloc_count_nals );
               mfsample_codec_unit_metadata.reserve( MAX_NALU_LENGTH_INFO_ENTRIES );
+               
+               auto WaitForFence = [&]( pipe_fence_handle *pFence, uint64_t timeout ) -> bool {
+                  assert( pFence );
+                  HMFT_ETW_EVENT_START( "GPUIndividualSliceCompletionWait", pThis );
+                  bool result = pThis->m_pPipeVideoCodec->fence_wait( pThis->m_pPipeVideoCodec, pFence, timeout ) != 0;
+                  HMFT_ETW_EVENT_STOP( "GPUIndividualSliceCompletionWait", pThis );
+                  assert( result );
+                  if( !result )
+                  {
+                     MFE_ERROR( "[dx12 hmft 0x%p] Fence wait failed", pThis );
+                  }
+                  return result;
+               };
+               
               LPBYTE lpBuffer = NULL;
-               for( uint32_t slice_idx = 0; slice_idx < num_slice_buffers; slice_idx++ )
+               // If slice generation mode is explicitly set to 1 (1 slice per output sample) and auto mode is off
+               // emit multiple output samples, one per slice
+               if (pThis->m_bSliceGenerationModeSet &&
+                  (pThis->m_uiSliceGenerationMode == 1) &&
+                  (!pDX12EncodeContext->IsSliceAutoModeEnabled())) // We cannot know if the last slice is actually the last one on time to set the last MFSample properties
               {
-                  auto cur_output_sample_emitted_idx = ( num_output_samples_emitted == 1 ) ? 0 : slice_idx;
-                  
-                  // Reset offset and clear accumulated NALs for per-slice mode (each slice goes to separate buffer)
-                  if( num_output_samples_emitted > 1 )
+                  for( uint32_t slice_idx = 0; slice_idx < num_slice_buffers; slice_idx++ )
                  {
                     output_buffer_offset = 0u;
                     mfsample_codec_unit_metadata.clear();
-                  }
-
-                  if ((num_output_samples_emitted > 1) || // If multiple output samples, we do this for every slice
-                     // Or if single output sample, we do this only for the first slice
-                     (num_output_samples_emitted == 1) && (slice_idx == 0))
-                  {
-                     spMemoryBuffers[cur_output_sample_emitted_idx]->Lock( &lpBuffer, NULL, NULL );
-                  }
-                  
-                  assert( pDX12EncodeContext->pSliceFences[slice_idx] );
-
-                  HMFT_ETW_EVENT_START( "GPUIndividualSliceCompletionWait", pThis );
-                  bool fenceWaitResult = pThis->m_pPipeVideoCodec->fence_wait( pThis->m_pPipeVideoCodec,
-                                                                               pDX12EncodeContext->pSliceFences[slice_idx],
-                                                                               OS_TIMEOUT_INFINITE ) != 0;
-                  HMFT_ETW_EVENT_STOP( "GPUIndividualSliceCompletionWait", pThis );
-                  assert( fenceWaitResult );
-                  if( fenceWaitResult )
-                  {
-                     HMFT_ETW_EVENT_START( "GPUIndividualSliceStatsRead", pThis );
-                     unsigned codec_unit_metadata_count = 0u;
-                     pThis->m_pPipeVideoCodec->get_slice_bitstream_data( pThis->m_pPipeVideoCodec,
-                                                                         pDX12EncodeContext->pAsyncCookie,
-                                                                         slice_idx,
-                                                                         NULL /*get size*/,
-                                                                         &codec_unit_metadata_count );
-                     assert( codec_unit_metadata_count > 0 );
-                     codec_unit_metadata.clear();
-                     codec_unit_metadata.resize( codec_unit_metadata_count, {} );
-                     pThis->m_pPipeVideoCodec->get_slice_bitstream_data( pThis->m_pPipeVideoCodec,
-                                                                         pDX12EncodeContext->pAsyncCookie,
-                                                                         slice_idx,
-                                                                         codec_unit_metadata.data(),
-                                                                         &codec_unit_metadata_count );
-
-                     HMFT_ETW_EVENT_STOP( "GPUIndividualSliceStatsRead", pThis );
-
-                     //
-                     // Copy all the NALs produced in this slice and add a new buffer to the MFSample
-                     //
-                     HMFT_ETW_EVENT_START( "GPUIndividualSliceBitstreamRead", pThis );
-                     struct pipe_box box = { 0 };
-                     box.width = 0;
-                     for( auto &nal : codec_unit_metadata )
-                        box.width += static_cast<int32_t>( nal.size );
-                     box.height = pDX12EncodeContext->pOutputBitRes[slice_idx]->height0;
-                     box.depth = pDX12EncodeContext->pOutputBitRes[slice_idx]->depth0;
-                     struct pipe_transfer *transfer_data = NULL;
-                     HMFT_ETW_EVENT_START( "GPUIndividualSliceBufferMap", pThis );
-                     uint8_t *pMappedBuffer =
-                        (uint8_t *) pThis->m_pPipeContext->buffer_map( pThis->m_pPipeContext,
-                                                                       pDX12EncodeContext->pOutputBitRes[slice_idx],
-                                                                       0,
-                                                                       PIPE_MAP_READ,
-                                                                       &box,
-                                                                       &transfer_data );
-                     HMFT_ETW_EVENT_STOP( "GPUIndividualSliceBufferMap", pThis );
-                     assert( pMappedBuffer );
-                     if( pMappedBuffer )
+                     
+                     if( WaitForFence( pDX12EncodeContext->pSliceFences[slice_idx], OS_TIMEOUT_INFINITE ) )
                     {
-                        for( auto &nal : codec_unit_metadata )
-                        {
-                           // Add NAL with adjusted offset for accumulated buffer position
-                           struct codec_unit_location_t accumulated_nal = nal;
-                           accumulated_nal.offset = output_buffer_offset;
-                           mfsample_codec_unit_metadata.push_back( accumulated_nal );
-
-                           memcpy( lpBuffer + static_cast<size_t>( output_buffer_offset ),
-                                   pMappedBuffer + static_cast<size_t>( nal.offset ),
-                                   static_cast<size_t>( nal.size ) );
-                           output_buffer_offset += nal.size;
-                        }
-                        HMFT_ETW_EVENT_START( "GPUIndividualSliceBufferUnmap", pThis );
-                        pipe_buffer_unmap( pThis->m_pPipeContext, transfer_data );
-                        HMFT_ETW_EVENT_STOP( "GPUIndividualSliceBufferUnmap", pThis );
-                     }
-                     HMFT_ETW_EVENT_STOP( "GPUIndividualSliceBitstreamRead", pThis );
-                  }
-                  else
-                  {
-                     MFE_ERROR( "[dx12 hmft 0x%p] Slice fence wait failed", pThis );
-                  }
-
-                  if ((num_output_samples_emitted > 1) || // If multiple output samples, we do this for every slice
-                      // Or if single output sample, we do this only for the last slice
-                      (num_output_samples_emitted == 1) && (slice_idx == (num_slice_buffers - 1)))
-                  {
-
-                     spMemoryBuffers[cur_output_sample_emitted_idx]->Unlock();
-                     spMemoryBuffers[cur_output_sample_emitted_idx]->SetCurrentLength( static_cast<DWORD>( output_buffer_offset ) );
-                     spOutputSamples[cur_output_sample_emitted_idx]->AddBuffer( spMemoryBuffers[cur_output_sample_emitted_idx].Get() );
-
-                     HRESULT hr = pThis->ConfigureBitstreamOutputSampleAttributes( spOutputSamples[cur_output_sample_emitted_idx].Get(),
-                                                            pDX12EncodeContext,
-                                                            dwReceivedInput,
-                                                            (slice_idx == (num_slice_buffers - 1)) /* bIsLastSlice */,
-                                                            mfsample_codec_unit_metadata.data(),
-                                                            static_cast<unsigned>( mfsample_codec_unit_metadata.size() ) );
-                     if( FAILED( hr ) )
-                     {
-                        MFE_ERROR( "[dx12 hmft 0x%p] ConfigureBitstreamOutputSampleAttributes failed - hr=0x%08x", pThis, hr );
-                     }
-
-                     // Attach the async stats DXGIBuffers to the MFSample output gated by pAsyncFence completion
-                     {
-                        // Set stats metadata buffers to the sample here. As we are returning the dxgi buffers gated by the completion fence
-                        // for the resolved stats we do not need to wait for the pAsyncFence completion on the CPU.
-                        if( FAILED( pThis->ConfigureAsyncStatsMetadataOutputSampleAttributes(spOutputSamples[cur_output_sample_emitted_idx].Get(),
-                                                                                             pDX12EncodeContext->pPipeResourcePSNRStats,
-                                                                                             pDX12EncodeContext->pPipeResourceQPMapStats,
-                                                                                             pDX12EncodeContext->pPipeResourceRCBitAllocMapStats,
-                                                                                             pDX12EncodeContext->pPipeResourceSATDMapStats,
-                                                                                             pDX12EncodeContext->spAsyncFence,
-                                                                                             ResolveStatsCompletionFenceValue,
-                                                                                             pDX12EncodeContext->pSyncObjectQueue )))
-                        {
-                           MFE_ERROR( "[dx12 hmft 0x%p] ConfigureAsyncStatsMetadataOutputSampleAttributes failed", pThis );
-                        }
-                     }
-
-                     // Issue a new METransformHaveOutput event for the async slices mode
-                     // with the combined MFSample with all slices
-                     // This is done before pAsyncFence is waited on below
-                     // as we already have all the slice info and the async stats
-                     // are attached gated by the pAsyncFence completion
-                     {
-                        std::lock_guard<std::mutex> lock( pThis->m_OutputQueueLock );
-                        HMFT_ETW_EVENT_INFO( "METransformHaveOutput", pThis );
-                        if( SUCCEEDED( pThis->QueueEvent( METransformHaveOutput, GUID_NULL, S_OK, nullptr ) ) )
-                        {
-                           pThis->m_OutputQueue.push( spOutputSamples[cur_output_sample_emitted_idx].Detach() );
-                           pThis->m_dwHaveOutputCount++;
-                        }
+                        ComPtr<IMFSample> spOutputSample;
+                        ComPtr<IMFMediaBuffer> spMemoryBuffer;
+                        MFCreateSample( &spOutputSample );
+                        MFCreateMemoryBuffer( pThis->m_uiMaxOutputBitstreamSize, &spMemoryBuffer );
+                        
+                        spMemoryBuffer->Lock( &lpBuffer, NULL, NULL );
+                        pThis->ProcessSliceBitstreamData( pDX12EncodeContext, slice_idx, lpBuffer,
+                                                          mfsample_codec_unit_metadata, output_buffer_offset );
+                        spMemoryBuffer->Unlock();
+                        spMemoryBuffer->SetCurrentLength( static_cast<DWORD>( output_buffer_offset ) );
+                        pThis->FinalizeAndEmitOutputSample( pDX12EncodeContext, spMemoryBuffer,
+                                                            spOutputSample, mfsample_codec_unit_metadata.data(),
+                                                            static_cast<unsigned>( mfsample_codec_unit_metadata.size() ),
+                                                            dwReceivedInput, (slice_idx == (num_slice_buffers - 1)), ResolveStatsCompletionFenceValue );
                        HMFT_ETW_EVENT_STOP( "TimeToEmitMFSampleOutput", pThis );
                     }
                  }
               }
+               else
+               {
+                  ComPtr<IMFSample> spOutputSample;
+                  ComPtr<IMFMediaBuffer> spMemoryBuffer;
+                  MFCreateSample( &spOutputSample );
+                  MFCreateMemoryBuffer( pThis->m_uiMaxOutputBitstreamSize, &spMemoryBuffer );
+                  
+                  spMemoryBuffer->Lock( &lpBuffer, NULL, NULL );
+                  for( uint32_t slice_idx = 0; slice_idx < num_slice_buffers; slice_idx++ )
+                  {
+                     if( WaitForFence( pDX12EncodeContext->pSliceFences[slice_idx], OS_TIMEOUT_INFINITE ) )
+                     {
+                        pThis->ProcessSliceBitstreamData( pDX12EncodeContext, slice_idx, lpBuffer,
+                                                          mfsample_codec_unit_metadata, output_buffer_offset );
+                     }
+                  }
+                  
+                  spMemoryBuffer->Unlock();
+                  spMemoryBuffer->SetCurrentLength( static_cast<DWORD>( output_buffer_offset ) );
+                  pThis->FinalizeAndEmitOutputSample( pDX12EncodeContext, spMemoryBuffer, spOutputSample,
+                                                      mfsample_codec_unit_metadata.data(),
+                                                      static_cast<unsigned>( mfsample_codec_unit_metadata.size() ),
+                                                      dwReceivedInput, TRUE,
+                                                      ResolveStatsCompletionFenceValue );
+                  HMFT_ETW_EVENT_STOP( "TimeToEmitMFSampleOutput", pThis );
+               }

               // Cleanup fences
               for (unsigned slice_idx = 0; slice_idx < pDX12EncodeContext->pSliceFences.size(); slice_idx++)
               {
                  if (pDX12EncodeContext->pSliceFences[slice_idx])
+                  {
                     pThis->m_pPipeVideoCodec->destroy_fence( pThis->m_pPipeVideoCodec, pDX12EncodeContext->pSliceFences[slice_idx] );
+                  }
               }
               if (pDX12EncodeContext->pLastSliceFence)
               {
@@ -1665,46 +1689,17 @@ CDX12EncHMFT::xThreadProc( void *pCtx )
             encoded_bitstream_bytes && ( pDX12EncodeContext->sliceNotificationMode ==
                  D3D12_VIDEO_ENCODER_COMPRESSED_BITSTREAM_NOTIFICATION_MODE_FULL_FRAME ))
         {
-            HRESULT hr = pThis->ConfigureBitstreamOutputSampleAttributes( spOutputSamples[0].Get(),
-                                                                        pDX12EncodeContext,
-                                                                        dwReceivedInput,
-                                                                        TRUE /* bIsLastSlice */,
-                                                                        &metadata.codec_unit_metadata[0],
-                                                                        metadata.codec_unit_metadata_count );
-            if( FAILED( hr ) )
-            {
-               MFE_ERROR( "[dx12 hmft 0x%p] ConfigureBitstreamOutputSampleAttributes failed - hr=0x%08x", pThis, hr );
-            }
-
-            // Attach the async stats DXGIBuffers to the MFSample output gated by pAsyncFence completion
-            {
-               // Obtain fence value from pipe_fence_handle
-               uint64_t ResolveStatsCompletionFenceValue = 0;
-               HANDLE fence_handle = (HANDLE) pThis->m_pPipeContext->screen->fence_get_win32_handle( pThis->m_pPipeContext->screen,
-                                                                                                     pDX12EncodeContext->pAsyncFence,
-                                                                                                     &ResolveStatsCompletionFenceValue );
-               if( fence_handle )
-                  CloseHandle( fence_handle );
-
-               if( FAILED( pThis->ConfigureAsyncStatsMetadataOutputSampleAttributes(spOutputSamples[0].Get(),
-                                                                                    pDX12EncodeContext->pPipeResourcePSNRStats,
-                                                                                    pDX12EncodeContext->pPipeResourceQPMapStats,
-                                                                                    pDX12EncodeContext->pPipeResourceRCBitAllocMapStats,
-                                                                                    pDX12EncodeContext->pPipeResourceSATDMapStats,
-                                                                                    pDX12EncodeContext->spAsyncFence,
-                                                                                    ResolveStatsCompletionFenceValue,
-                                                                                    pDX12EncodeContext->pSyncObjectQueue )))
-               {
-                  MFE_ERROR( "[dx12 hmft 0x%p] ConfigureAsyncStatsMetadataOutputSampleAttributes failed", pThis );
-               }
-            }
+            ComPtr<IMFSample> spOutputSample;
+            ComPtr<IMFMediaBuffer> spMemoryBuffer;
+            MFCreateSample( &spOutputSample );
+            MFCreateMemoryBuffer( pThis->m_uiMaxOutputBitstreamSize, &spMemoryBuffer );

            if( metadata.encode_result & PIPE_VIDEO_FEEDBACK_METADATA_ENCODE_FLAG_MAX_FRAME_SIZE_OVERFLOW )
               debug_printf( "[dx12 hmft 0x%p] PIPE_VIDEO_FEEDBACK_METADATA_ENCODE_FLAG_MAX_FRAME_SIZE_OVERFLOW set\n", pThis );

            // Set encoding quality metrics (only available after get_feedback on full frame encode)
            debug_printf( "[dx12 hmft 0x%p] Frame AverageQP: %d\n", pThis, metadata.average_frame_qp );
-            if( FAILED( spOutputSamples[0]->SetUINT64( MFSampleExtension_VideoEncodeQP, (UINT64) metadata.average_frame_qp ) ) )
+            if( FAILED( spOutputSample->SetUINT64( MFSampleExtension_VideoEncodeQP, (UINT64) metadata.average_frame_qp ) ) )
            {
               debug_printf( "[dx12 hmft 0x%p] WARNING: could not set MFSampleExtension_VideoEncodeQP\n", pThis );
            }
@@ -1728,7 +1723,7 @@ CDX12EncHMFT::xThreadProc( void *pCtx )
            if( pMappedBuffer )
            {
               LPBYTE lpBuffer;
-               spMemoryBuffers[0]->Lock( &lpBuffer, NULL, NULL );
+               spMemoryBuffer->Lock( &lpBuffer, NULL, NULL );
               size_t copied_bytes = 0;
               for( unsigned i = 0; i < metadata.codec_unit_metadata_count; i++ )
               {
@@ -1737,28 +1732,20 @@ CDX12EncHMFT::xThreadProc( void *pCtx )
                           static_cast<size_t>( metadata.codec_unit_metadata[i].size ) );
                  copied_bytes += static_cast<size_t>( metadata.codec_unit_metadata[i].size );
               }
-               spMemoryBuffers[0]->Unlock();
-               spMemoryBuffers[0]->SetCurrentLength( static_cast<DWORD>( copied_bytes ) );
+               spMemoryBuffer->Unlock();
+               spMemoryBuffer->SetCurrentLength( static_cast<DWORD>( copied_bytes ) );
               HMFT_ETW_EVENT_START( "GPUFrameEncodeGPUBufferUnmap", pThis );
               pipe_buffer_unmap( pThis->m_pPipeContext, transfer_data );
               HMFT_ETW_EVENT_STOP( "GPUFrameEncodeGPUBufferUnmap", pThis );
-               spOutputSamples[0]->AddBuffer( spMemoryBuffers[0].Get() );
            }
            HMFT_ETW_EVENT_STOP( "GPUFrameEncodeBitstreamRead", pThis );

-            // Issue a new METransformHaveOutput event for the full frame
-            // as we only output one MFSample per frame
-            // This is done after pAsyncFence was waited on above
-            // and get_feedback was called to get the post resolve metadata
-            {
-               std::lock_guard<std::mutex> lock( pThis->m_OutputQueueLock );
-               HMFT_ETW_EVENT_INFO( "METransformHaveOutput", pThis );
-               if( SUCCEEDED( pThis->QueueEvent( METransformHaveOutput, GUID_NULL, S_OK, nullptr ) ) )
-               {
-                  pThis->m_OutputQueue.push( spOutputSamples[0].Detach() );
-                  pThis->m_dwHaveOutputCount++;
-               }
-            }
+            // Use FinalizeAndEmitOutputSample to configure attributes and emit output
+            pThis->FinalizeAndEmitOutputSample( pDX12EncodeContext, spMemoryBuffer, spOutputSample,
+                                                &metadata.codec_unit_metadata[0],
+                                                metadata.codec_unit_metadata_count,
+                                                dwReceivedInput, TRUE,
+                                                ResolveStatsCompletionFenceValue );
            HMFT_ETW_EVENT_STOP( "TimeToEmitMFSampleOutput", pThis );
         }