[gnome-remote-desktop] hwaccel-nvidia: Use mapped CUDA pointer for AVC420 encodings



commit d3bdb34ca00d34a970ccf83e55074c64e5910a84
Author: Pascal Nowack <Pascal Nowack gmx de>
Date:   Fri Jan 7 12:48:34 2022 +0100

    hwaccel-nvidia: Use mapped CUDA pointer for AVC420 encodings
    
    Since the frame data is now already available on the GPU, just use the
    mapped CUDA pointer.
    This eliminates the transfer of the frame data from the CPU to the GPU,
    with the effect that the graphics thread is now blocked less, thus
    increasing the maximum framerate.

 src/grd-hwaccel-nvidia.c        | 67 ++++++++---------------------------------
 src/grd-hwaccel-nvidia.h        |  5 +--
 src/grd-rdp-graphics-pipeline.c |  5 +--
 3 files changed, 18 insertions(+), 59 deletions(-)
---
diff --git a/src/grd-hwaccel-nvidia.c b/src/grd-hwaccel-nvidia.c
index 7659a781..c3f0c705 100644
--- a/src/grd-hwaccel-nvidia.c
+++ b/src/grd-hwaccel-nvidia.c
@@ -356,23 +356,23 @@ grd_hwaccel_nvidia_free_nvenc_session (GrdHwAccelNvidia *hwaccel_nvidia,
 gboolean
 grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia  *hwaccel_nvidia,
                                              uint32_t           encode_session_id,
-                                             uint8_t           *src_data,
+                                             CUdeviceptr        src_data,
                                              uint16_t           src_width,
                                              uint16_t           src_height,
                                              uint16_t           aligned_width,
                                              uint16_t           aligned_height,
                                              uint8_t          **bitstream,
-                                             uint32_t          *bitstream_size)
+                                             uint32_t          *bitstream_size,
+                                             CUstream           cuda_stream)
 {
   NvEncEncodeSession *encode_session;
-  CUDA_MEMCPY2D cu_memcpy_2d = {0};
   NV_ENC_REGISTER_RESOURCE register_res = {0};
   NV_ENC_MAP_INPUT_RESOURCE map_input_res = {0};
   NV_ENC_PIC_PARAMS pic_params = {0};
   NV_ENC_LOCK_BITSTREAM lock_bitstream = {0};
-  CUstream cu_stream = NULL;
-  CUdeviceptr bgrx_buffer = 0, nv12_buffer = 0;
-  size_t bgrx_pitch = 0, nv12_pitch = 0;
+  uint16_t src_stride;
+  CUdeviceptr nv12_buffer = 0;
+  size_t nv12_pitch = 0;
   unsigned int grid_dim_x, grid_dim_y, grid_dim_z;
   unsigned int block_dim_x, block_dim_y, block_dim_z;
   void *args[8];
@@ -385,51 +385,16 @@ grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia  *hwaccel_nvidia,
   g_assert (encode_session->enc_width == aligned_width);
   g_assert (encode_session->enc_height == aligned_height);
 
-  if (hwaccel_nvidia->cuda_funcs->cuStreamCreate (&cu_stream, 0) != CUDA_SUCCESS)
-    {
-      g_warning ("[HWAccel.CUDA] Failed to create stream");
-      return FALSE;
-    }
-
-  if (hwaccel_nvidia->cuda_funcs->cuMemAllocPitch (
-        &bgrx_buffer, &bgrx_pitch, src_width * 4, src_height, 4) != CUDA_SUCCESS)
-    {
-      g_warning ("[HWAccel.CUDA] Failed to allocate BGRX buffer");
-      hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cu_stream);
-      return FALSE;
-    }
-
-  cu_memcpy_2d.srcMemoryType = CU_MEMORYTYPE_HOST;
-  cu_memcpy_2d.srcHost = src_data;
-  cu_memcpy_2d.srcPitch = src_width * 4;
-
-  cu_memcpy_2d.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-  cu_memcpy_2d.dstDevice = bgrx_buffer;
-  cu_memcpy_2d.dstPitch = bgrx_pitch;
-
-  cu_memcpy_2d.WidthInBytes = src_width * 4;
-  cu_memcpy_2d.Height = src_height;
-
-  if (hwaccel_nvidia->cuda_funcs->cuMemcpy2DAsync (
-        &cu_memcpy_2d, cu_stream) != CUDA_SUCCESS)
-    {
-      g_warning ("[HWAccel.CUDA] Failed to initiate H2D copy");
-      hwaccel_nvidia->cuda_funcs->cuMemFree (bgrx_buffer);
-      hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cu_stream);
-      return FALSE;
-    }
-
   if (hwaccel_nvidia->cuda_funcs->cuMemAllocPitch (
         &nv12_buffer, &nv12_pitch,
         aligned_width, aligned_height + aligned_height / 2, 4) != CUDA_SUCCESS)
     {
       g_warning ("[HWAccel.CUDA] Failed to allocate NV12 buffer");
-      hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cu_stream);
-      hwaccel_nvidia->cuda_funcs->cuMemFree (bgrx_buffer);
-      hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cu_stream);
       return FALSE;
     }
 
+  src_stride = src_width * 4;
+
   /* Threads per blocks */
   block_dim_x = 32;
   block_dim_y = 8;
@@ -442,38 +407,30 @@ grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia  *hwaccel_nvidia,
   grid_dim_z = 1;
 
   args[0] = &nv12_buffer;
-  args[1] = &bgrx_buffer;
+  args[1] = &src_data;
   args[2] = &src_width;
   args[3] = &src_height;
-  args[4] = &bgrx_pitch;
+  args[4] = &src_stride;
   args[5] = &aligned_width;
   args[6] = &aligned_height;
   args[7] = &aligned_width;
 
   if (hwaccel_nvidia->cuda_funcs->cuLaunchKernel (
         hwaccel_nvidia->cu_bgrx_to_yuv420, grid_dim_x, grid_dim_y, grid_dim_z,
-        block_dim_x, block_dim_y, block_dim_z, 0, cu_stream, args, NULL) != CUDA_SUCCESS)
+        block_dim_x, block_dim_y, block_dim_z, 0, cuda_stream, args, NULL) != CUDA_SUCCESS)
     {
       g_warning ("[HWAccel.CUDA] Failed to launch BGRX_TO_YUV420 kernel");
-      hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cu_stream);
       hwaccel_nvidia->cuda_funcs->cuMemFree (nv12_buffer);
-      hwaccel_nvidia->cuda_funcs->cuMemFree (bgrx_buffer);
-      hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cu_stream);
       return FALSE;
     }
 
-  if (hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cu_stream) != CUDA_SUCCESS)
+  if (hwaccel_nvidia->cuda_funcs->cuStreamSynchronize (cuda_stream) != CUDA_SUCCESS)
     {
       g_warning ("[HWAccel.CUDA] Failed to synchronize stream");
       hwaccel_nvidia->cuda_funcs->cuMemFree (nv12_buffer);
-      hwaccel_nvidia->cuda_funcs->cuMemFree (bgrx_buffer);
-      hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cu_stream);
       return FALSE;
     }
 
-  hwaccel_nvidia->cuda_funcs->cuStreamDestroy (cu_stream);
-  hwaccel_nvidia->cuda_funcs->cuMemFree (bgrx_buffer);
-
   register_res.version = NV_ENC_REGISTER_RESOURCE_VER;
   register_res.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
   register_res.width = aligned_width;
diff --git a/src/grd-hwaccel-nvidia.h b/src/grd-hwaccel-nvidia.h
index 04d526ea..18c3e7d4 100644
--- a/src/grd-hwaccel-nvidia.h
+++ b/src/grd-hwaccel-nvidia.h
@@ -80,12 +80,13 @@ void grd_hwaccel_nvidia_free_nvenc_session (GrdHwAccelNvidia *hwaccel_nvidia,
 
 gboolean grd_hwaccel_nvidia_avc420_encode_bgrx_frame (GrdHwAccelNvidia  *hwaccel_nvidia,
                                                       uint32_t           encode_session_id,
-                                                      uint8_t           *src_data,
+                                                      CUdeviceptr        src_data,
                                                       uint16_t           src_width,
                                                       uint16_t           src_height,
                                                       uint16_t           aligned_width,
                                                       uint16_t           aligned_height,
                                                       uint8_t          **bitstream,
-                                                      uint32_t          *bitstream_size);
+                                                      uint32_t          *bitstream_size,
+                                                      CUstream           cuda_stream);
 
 #endif /* GRD_HWACCEL_NVIDIA_H */
diff --git a/src/grd-rdp-graphics-pipeline.c b/src/grd-rdp-graphics-pipeline.c
index 9cb02c84..20361044 100644
--- a/src/grd-rdp-graphics-pipeline.c
+++ b/src/grd-rdp-graphics-pipeline.c
@@ -418,10 +418,11 @@ refresh_gfx_surface_avc420 (GrdRdpGraphicsPipeline *graphics_pipeline,
 
   if (!grd_hwaccel_nvidia_avc420_encode_bgrx_frame (graphics_pipeline->hwaccel_nvidia,
                                                     hwaccel_context->encode_session_id,
-                                                    buffer->local_data,
+                                                    buffer->mapped_cuda_pointer,
                                                     surface_width, surface_height,
                                                     aligned_width, aligned_height,
-                                                    &avc420.data, &avc420.length))
+                                                    &avc420.data, &avc420.length,
+                                                    rdp_surface->cuda_stream))
     {
       g_warning ("[RDP.RDPGFX] Failed to encode YUV420 frame");
       return FALSE;


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]