[gnome-remote-desktop] hwaccel-nvidia: Add CUDA kernels for damage region detection
- From: Jonas Ådahl <jadahl src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-remote-desktop] hwaccel-nvidia: Add CUDA kernels for damage region detection
- Date: Thu, 3 Mar 2022 14:23:08 +0000 (UTC)
commit 9c16b9dc5d7a479ae5d55f317a79a77e790f6af0
Author: Pascal Nowack <Pascal Nowack gmx de>
Date: Fri Dec 24 21:31:23 2021 +0100
hwaccel-nvidia: Add CUDA kernels for damage region detection
As already mentioned in previous commits, damage region detection on
the GPU is much faster than damage region detection on the CPU.
Although the CPU has a higher clock rate, the GPU has many more cores,
which makes it more suitable for parallel tasks.
The damage detection on the GPU is done in four parts.
Part 1, the actual damage detection:
The check_damaged_pixel CUDA kernel checks for each pixel whether it
is damaged or not.
To simplify the implementation and to reduce global memory accesses,
the pixel values are read as 32-bit unsigned integers.
Before this kernel is run, the global region_is_damaged value is set to
0.
After running this kernel, gnome-remote-desktop knows whether the
region is damaged or not.
Since NVENC does not make use of damage regions, only the
region_is_damaged value is retrieved when using NVENC.
The rest of the region will still be retrieved, as clients like
xfreerdp still use it for optimization purposes.
This retrieval will run later, when NVENC encodes the data, as CUDA
kernels and data transfers between the GPU and CPU can still run while
NVENC is encoding, thus saving time.
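
As a rough illustration (not part of this commit; the real code goes
through the dynamically loaded CudaFunctions table and its own stream
handling), fetching only the single region_is_damaged byte could look
like this:

  #include <cuda.h>
  #include <stdint.h>

  /*
   * Hypothetical sketch: only the 1-byte region_is_damaged flag is
   * downloaded here; the (simplified) damage array itself can be
   * transferred later, while NVENC is busy encoding the frame.
   */
  static int
  fetch_region_is_damaged (CUstream    stream,
                           CUdeviceptr region_is_damaged_dev,
                           uint8_t    *region_is_damaged_host)
  {
    /* Queue the 1-byte download on the stream. */
    if (cuMemcpyDtoHAsync (region_is_damaged_host, region_is_damaged_dev,
                           sizeof (uint8_t), stream) != CUDA_SUCCESS)
      return 0;

    /* Wait only for the work queued on this stream. */
    if (cuStreamSynchronize (stream) != CUDA_SUCCESS)
      return 0;

    return *region_is_damaged_host != 0;
  }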
Part 2, Combining the columns:
As gnome-remote-desktop uses damage tiles of size 64x64, especially
for RFX (Progressive), the damage values need to be combined.
For this, the combine_damage_array_cols kernel runs multiple times.
First, all neighbours are combined, i.e. the damage values of pixels 0
and 1 are combined into pixel 0, and the damage values of pixels 2 and
3 into pixel 2.
In the next round, the combine_shift is increased, so the damage
values of pixels 0 and 2 are combined into pixel 0 and the damage
values of pixels 4 and 6 into pixel 4.
The same happens for the remaining pixels.
The combine_damage_array_cols kernel runs 6 times, increasing the
combine_shift value with each run.
After this, every 64th value combines the damage values of up to 64
pixels in each row.
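
Illustrated as a plain CPU loop (this is not part of the commit, just
the same combining pattern replayed on one 64-pixel row):

  #include <stdint.h>
  #include <stdio.h>

  /*
   * After the six passes, row[0] holds the combined damage value of
   * all 64 pixels, just like every 64th value in the GPU damage array.
   */
  int
  main (void)
  {
    uint8_t row[64] = { 0 };
    uint32_t combine_shift;
    uint32_t x;

    row[37] = 1; /* pretend pixel 37 changed */

    for (combine_shift = 0; combine_shift < 6; ++combine_shift)
      {
        uint32_t neighbour_offset = 1u << combine_shift;

        /* Each pass combines pairs that are neighbour_offset apart. */
        for (x = 0; x + neighbour_offset < 64; x += neighbour_offset << 1)
          {
            if (row[x + neighbour_offset])
              row[x] = 1;
          }
      }

    printf ("row damaged: %u\n", row[0]);
    return 0;
  }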
Part 3, Combining the rows:
This is pretty much the same as part 2, except that this kernel now
runs on the rows instead of the columns.
Part 4, Simplifying the damage array:
Now that every 64x64th value combines all damage values for its tile,
simplify the resulting damage array.
In case the data needs to be transferred to the CPU, this drastically
increases the speed, as the simplified damage array is very small.
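
A consumer on the CPU side can then walk the simplified array with a
small nested loop; names and strides in the following sketch are
hypothetical and not taken from this commit:

  #include <stdint.h>
  #include <stdio.h>

  /*
   * The simplified damage array holds one byte per 64x64 tile, so
   * mapping it back to pixel-space damage rectangles is trivial.
   */
  static void
  print_damaged_tiles (const uint8_t *simplified_damage_array,
                       uint32_t       tiles_per_row,
                       uint32_t       tiles_per_col,
                       uint32_t       array_stride)
  {
    uint32_t tx, ty;

    for (ty = 0; ty < tiles_per_col; ++ty)
      {
        for (tx = 0; tx < tiles_per_row; ++tx)
          {
            if (!simplified_damage_array[ty * array_stride + tx])
              continue;

            /* Tile (tx, ty) covers the 64x64 pixel area at (tx*64, ty*64). */
            printf ("damaged tile: x=%u y=%u w=64 h=64\n",
                    tx * 64, ty * 64);
          }
      }
  }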
These CUDA kernels will later be used by the CUDA damage detection
class.
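A rough sketch of the expected launch order follows; the grid and
block sizes, strides and the direct driver API calls are assumptions,
as the actual damage detection class is not part of this commit and
goes through the dynamically loaded CudaFunctions table instead:

  #include <cuda.h>
  #include <stdint.h>

  /*
   * Hypothetical launch order: Part 1 once, Parts 2 and 3 six passes
   * each, Part 4 once.  The grids are intentionally over-sized;
   * surplus threads bail on the bounds checks inside the kernels.
   */
  static CUresult
  detect_damage (CUfunction  chk_dmg_pxl,
                 CUfunction  cmb_dmg_arr_cols,
                 CUfunction  cmb_dmg_arr_rows,
                 CUfunction  simplify_dmg_arr,
                 CUstream    stream,
                 CUdeviceptr damage_array,
                 CUdeviceptr simplified_damage_array,
                 CUdeviceptr region_is_damaged,
                 CUdeviceptr current_data,
                 CUdeviceptr previous_data,
                 uint32_t    width,
                 uint32_t    height,
                 uint32_t    data_stride)
  {
    uint32_t dmg_stride = width;
    uint32_t tiles_w = (width + 63) / 64;
    uint32_t tiles_h = (height + 63) / 64;
    uint32_t combine_shift;
    CUresult result;

    /* Part 1: compare every pixel of the current and previous frame. */
    void *chk_args[] = { &damage_array, &region_is_damaged,
                         &current_data, &previous_data,
                         &dmg_stride, &width, &height, &data_stride };
    result = cuLaunchKernel (chk_dmg_pxl,
                             (width + 31) / 32, (height + 7) / 8, 1,
                             32, 8, 1, 0, stream, chk_args, NULL);
    if (result != CUDA_SUCCESS)
      return result;

    /* Part 2: fold 64 columns together with six pairwise passes. */
    for (combine_shift = 0; combine_shift < 6; ++combine_shift)
      {
        void *col_args[] = { &damage_array, &width, &height,
                             &dmg_stride, &combine_shift };

        result = cuLaunchKernel (cmb_dmg_arr_cols,
                                 (width + 31) / 32, (height + 7) / 8, 1,
                                 32, 8, 1, 0, stream, col_args, NULL);
        if (result != CUDA_SUCCESS)
          return result;
      }

    /* Part 3: the same, but folding 64 rows together. */
    for (combine_shift = 0; combine_shift < 6; ++combine_shift)
      {
        void *row_args[] = { &damage_array, &width, &height,
                             &dmg_stride, &combine_shift };

        result = cuLaunchKernel (cmb_dmg_arr_rows,
                                 (width + 31) / 32, (height + 7) / 8, 1,
                                 32, 8, 1, 0, stream, row_args, NULL);
        if (result != CUDA_SUCCESS)
          return result;
      }

    /* Part 4: copy every 64x64th value into the small simplified array. */
    void *simplify_args[] = { &simplified_damage_array, &damage_array,
                              &tiles_w, &width, &height, &dmg_stride };
    return cuLaunchKernel (simplify_dmg_arr,
                           (tiles_w + 31) / 32, (tiles_h + 7) / 8, 1,
                           32, 8, 1, 0, stream, simplify_args, NULL);
  }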
data/grd-cuda-damage-utils_30.ptx | 251 ++++++++++++++++++++++++++++++++++++++
data/meson.build | 3 +
src/grd-cuda-damage-utils.cu | 138 +++++++++++++++++++++
src/grd-hwaccel-nvidia.c | 82 ++++++++++---
4 files changed, 460 insertions(+), 14 deletions(-)
---
diff --git a/data/grd-cuda-damage-utils_30.ptx b/data/grd-cuda-damage-utils_30.ptx
new file mode 100644
index 00000000..d8b8e1e0
--- /dev/null
+++ b/data/grd-cuda-damage-utils_30.ptx
@@ -0,0 +1,251 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-27506705
+// Cuda compilation tools, release 10.2, V10.2.89
+// Based on LLVM 3.4svn
+//
+
+.version 6.5
+.target sm_30
+.address_size 64
+
+ // .globl check_damaged_pixel
+
+.visible .entry check_damaged_pixel(
+ .param .u64 check_damaged_pixel_param_0,
+ .param .u64 check_damaged_pixel_param_1,
+ .param .u64 check_damaged_pixel_param_2,
+ .param .u64 check_damaged_pixel_param_3,
+ .param .u32 check_damaged_pixel_param_4,
+ .param .u32 check_damaged_pixel_param_5,
+ .param .u32 check_damaged_pixel_param_6,
+ .param .u32 check_damaged_pixel_param_7
+)
+{
+ .reg .pred %p<5>;
+ .reg .b16 %rs<5>;
+ .reg .b32 %r<17>;
+ .reg .b64 %rd<14>;
+
+
+ ld.param.u64 %rd1, [check_damaged_pixel_param_0];
+ ld.param.u64 %rd2, [check_damaged_pixel_param_1];
+ ld.param.u64 %rd3, [check_damaged_pixel_param_2];
+ ld.param.u64 %rd4, [check_damaged_pixel_param_3];
+ ld.param.u32 %r3, [check_damaged_pixel_param_4];
+ ld.param.u32 %r5, [check_damaged_pixel_param_5];
+ ld.param.u32 %r6, [check_damaged_pixel_param_6];
+ ld.param.u32 %r4, [check_damaged_pixel_param_7];
+ mov.u32 %r7, %ntid.x;
+ mov.u32 %r8, %ctaid.x;
+ mov.u32 %r9, %tid.x;
+ mad.lo.s32 %r1, %r7, %r8, %r9;
+ mov.u32 %r10, %ntid.y;
+ mov.u32 %r11, %ctaid.y;
+ mov.u32 %r12, %tid.y;
+ mad.lo.s32 %r2, %r10, %r11, %r12;
+ setp.ge.u32 %p1, %r2, %r6;
+ setp.ge.u32 %p2, %r1, %r5;
+ or.pred %p3, %p1, %p2;
+ @%p3 bra BB0_4;
+
+ cvta.to.global.u64 %rd5, %rd4;
+ mad.lo.s32 %r13, %r2, %r4, %r1;
+ mul.wide.u32 %rd6, %r13, 4;
+ add.s64 %rd7, %rd5, %rd6;
+ cvta.to.global.u64 %rd8, %rd3;
+ add.s64 %rd9, %rd8, %rd6;
+ ld.global.u32 %r14, [%rd9];
+ ld.global.u32 %r15, [%rd7];
+ setp.eq.s32 %p4, %r15, %r14;
+ mov.u16 %rs4, 0;
+ @%p4 bra BB0_3;
+
+ cvta.to.global.u64 %rd10, %rd2;
+ mov.u16 %rs4, 1;
+ st.global.u8 [%rd10], %rs4;
+
+BB0_3:
+ mad.lo.s32 %r16, %r2, %r3, %r1;
+ cvt.u64.u32 %rd11, %r16;
+ cvta.to.global.u64 %rd12, %rd1;
+ add.s64 %rd13, %rd12, %rd11;
+ st.global.u8 [%rd13], %rs4;
+
+BB0_4:
+ ret;
+}
+
+ // .globl combine_damage_array_cols
+.visible .entry combine_damage_array_cols(
+ .param .u64 combine_damage_array_cols_param_0,
+ .param .u32 combine_damage_array_cols_param_1,
+ .param .u32 combine_damage_array_cols_param_2,
+ .param .u32 combine_damage_array_cols_param_3,
+ .param .u32 combine_damage_array_cols_param_4
+)
+{
+ .reg .pred %p<6>;
+ .reg .b16 %rs<3>;
+ .reg .b32 %r<20>;
+ .reg .b64 %rd<7>;
+
+
+ ld.param.u64 %rd2, [combine_damage_array_cols_param_0];
+ ld.param.u32 %r5, [combine_damage_array_cols_param_1];
+ ld.param.u32 %r8, [combine_damage_array_cols_param_2];
+ ld.param.u32 %r6, [combine_damage_array_cols_param_3];
+ ld.param.u32 %r7, [combine_damage_array_cols_param_4];
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.u32 %r9, %ntid.x;
+ mov.u32 %r10, %ctaid.x;
+ mov.u32 %r11, %tid.x;
+ mad.lo.s32 %r12, %r9, %r10, %r11;
+ mov.u32 %r13, %ntid.y;
+ mov.u32 %r14, %ctaid.y;
+ mov.u32 %r15, %tid.y;
+ mad.lo.s32 %r1, %r13, %r14, %r15;
+ add.s32 %r16, %r7, 1;
+ shl.b32 %r2, %r12, %r16;
+ setp.ge.u32 %p1, %r1, %r8;
+ setp.ge.u32 %p2, %r2, %r5;
+ or.pred %p3, %p1, %p2;
+ @%p3 bra BB1_4;
+
+ mov.u32 %r17, 1;
+ shl.b32 %r3, %r17, %r7;
+ add.s32 %r18, %r2, %r3;
+ setp.ge.u32 %p4, %r18, %r5;
+ @%p4 bra BB1_4;
+
+ mad.lo.s32 %r4, %r1, %r6, %r2;
+ add.s32 %r19, %r4, %r3;
+ cvt.u64.u32 %rd3, %r19;
+ add.s64 %rd4, %rd1, %rd3;
+ ld.global.u8 %rs1, [%rd4];
+ setp.eq.s16 %p5, %rs1, 0;
+ @%p5 bra BB1_4;
+
+ cvt.u64.u32 %rd5, %r4;
+ add.s64 %rd6, %rd1, %rd5;
+ mov.u16 %rs2, 1;
+ st.global.u8 [%rd6], %rs2;
+
+BB1_4:
+ ret;
+}
+
+ // .globl combine_damage_array_rows
+.visible .entry combine_damage_array_rows(
+ .param .u64 combine_damage_array_rows_param_0,
+ .param .u32 combine_damage_array_rows_param_1,
+ .param .u32 combine_damage_array_rows_param_2,
+ .param .u32 combine_damage_array_rows_param_3,
+ .param .u32 combine_damage_array_rows_param_4
+)
+{
+ .reg .pred %p<6>;
+ .reg .b16 %rs<3>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<7>;
+
+
+ ld.param.u64 %rd2, [combine_damage_array_rows_param_0];
+ ld.param.u32 %r7, [combine_damage_array_rows_param_1];
+ ld.param.u32 %r4, [combine_damage_array_rows_param_2];
+ ld.param.u32 %r5, [combine_damage_array_rows_param_3];
+ ld.param.u32 %r6, [combine_damage_array_rows_param_4];
+ cvta.to.global.u64 %rd1, %rd2;
+ mov.u32 %r8, %ntid.x;
+ mov.u32 %r9, %ctaid.x;
+ mov.u32 %r10, %tid.x;
+ mad.lo.s32 %r1, %r8, %r9, %r10;
+ mov.u32 %r11, %ntid.y;
+ mov.u32 %r12, %ctaid.y;
+ mov.u32 %r13, %tid.y;
+ mad.lo.s32 %r14, %r11, %r12, %r13;
+ add.s32 %r15, %r6, 1;
+ shl.b32 %r2, %r14, %r15;
+ setp.ge.u32 %p1, %r2, %r4;
+ setp.ge.u32 %p2, %r1, %r7;
+ or.pred %p3, %p1, %p2;
+ @%p3 bra BB2_4;
+
+ mov.u32 %r16, 1;
+ shl.b32 %r17, %r16, %r6;
+ add.s32 %r18, %r2, %r17;
+ setp.ge.u32 %p4, %r18, %r4;
+ @%p4 bra BB2_4;
+
+ mad.lo.s32 %r3, %r2, %r5, %r1;
+ shl.b32 %r19, %r5, %r6;
+ add.s32 %r20, %r3, %r19;
+ cvt.u64.u32 %rd3, %r20;
+ add.s64 %rd4, %rd1, %rd3;
+ ld.global.u8 %rs1, [%rd4];
+ setp.eq.s16 %p5, %rs1, 0;
+ @%p5 bra BB2_4;
+
+ cvt.u64.u32 %rd5, %r3;
+ add.s64 %rd6, %rd1, %rd5;
+ mov.u16 %rs2, 1;
+ st.global.u8 [%rd6], %rs2;
+
+BB2_4:
+ ret;
+}
+
+ // .globl simplify_damage_array
+.visible .entry simplify_damage_array(
+ .param .u64 simplify_damage_array_param_0,
+ .param .u64 simplify_damage_array_param_1,
+ .param .u32 simplify_damage_array_param_2,
+ .param .u32 simplify_damage_array_param_3,
+ .param .u32 simplify_damage_array_param_4,
+ .param .u32 simplify_damage_array_param_5
+)
+{
+ .reg .pred %p<4>;
+ .reg .b16 %rs<2>;
+ .reg .b32 %r<17>;
+ .reg .b64 %rd<9>;
+
+
+ ld.param.u64 %rd1, [simplify_damage_array_param_0];
+ ld.param.u64 %rd2, [simplify_damage_array_param_1];
+ ld.param.u32 %r5, [simplify_damage_array_param_2];
+ ld.param.u32 %r7, [simplify_damage_array_param_3];
+ ld.param.u32 %r8, [simplify_damage_array_param_4];
+ ld.param.u32 %r6, [simplify_damage_array_param_5];
+ mov.u32 %r9, %ctaid.x;
+ mov.u32 %r10, %ntid.x;
+ mov.u32 %r11, %tid.x;
+ mad.lo.s32 %r1, %r10, %r9, %r11;
+ mov.u32 %r12, %ntid.y;
+ mov.u32 %r13, %ctaid.y;
+ mov.u32 %r14, %tid.y;
+ mad.lo.s32 %r2, %r12, %r13, %r14;
+ shl.b32 %r3, %r1, 6;
+ shl.b32 %r4, %r2, 6;
+ setp.ge.u32 %p1, %r4, %r8;
+ setp.ge.u32 %p2, %r3, %r7;
+ or.pred %p3, %p1, %p2;
+ @%p3 bra BB3_2;
+
+ cvta.to.global.u64 %rd3, %rd2;
+ mad.lo.s32 %r15, %r4, %r6, %r3;
+ mad.lo.s32 %r16, %r2, %r5, %r1;
+ cvt.u64.u32 %rd4, %r15;
+ add.s64 %rd5, %rd3, %rd4;
+ ld.global.u8 %rs1, [%rd5];
+ cvt.u64.u32 %rd6, %r16;
+ cvta.to.global.u64 %rd7, %rd1;
+ add.s64 %rd8, %rd7, %rd6;
+ st.global.u8 [%rd8], %rs1;
+
+BB3_2:
+ ret;
+}
+
+
diff --git a/data/meson.build b/data/meson.build
index ac276bf4..4077c091 100644
--- a/data/meson.build
+++ b/data/meson.build
@@ -1,4 +1,7 @@
if have_rdp
+ install_data(['grd-cuda-damage-utils_30.ptx'],
+ install_dir: grd_datadir,
+ )
install_data(['grd-cuda-avc-utils_30.ptx'],
install_dir: grd_datadir,
)
diff --git a/src/grd-cuda-damage-utils.cu b/src/grd-cuda-damage-utils.cu
new file mode 100644
index 00000000..d5913f09
--- /dev/null
+++ b/src/grd-cuda-damage-utils.cu
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2021 Pascal Nowack
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ */
+
+#include <stdint.h>
+
+extern "C"
+{
+ __global__ void
+ check_damaged_pixel (uint8_t *damage_array,
+ uint8_t *region_is_damaged,
+ uint32_t *current_data,
+ uint32_t *previous_data,
+ uint32_t damage_array_stride,
+ uint32_t data_width,
+ uint32_t data_height,
+ uint32_t data_stride)
+ {
+ uint32_t data_pos;
+ uint8_t damaged = 0;
+ uint32_t x, y;
+
+ x = blockIdx.x * blockDim.x + threadIdx.x;
+ y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ if (x >= data_width || y >= data_height)
+ return;
+
+ data_pos = y * data_stride + x;
+ if (previous_data[data_pos] != current_data[data_pos])
+ {
+ damaged = 1;
+ *region_is_damaged = 1;
+ }
+
+ damage_array[y * damage_array_stride + x] = damaged;
+ }
+
+ __global__ void
+ combine_damage_array_cols (uint8_t *damage_array,
+ uint32_t damage_array_width,
+ uint32_t damage_array_height,
+ uint32_t damage_array_stride,
+ uint32_t combine_shift)
+ {
+ uint32_t data_pos;
+ uint32_t neighbour_offset;
+ uint32_t x, y;
+ uint32_t sx;
+
+ sx = blockIdx.x * blockDim.x + threadIdx.x;
+ y = blockIdx.y * blockDim.y + threadIdx.y;
+
+ x = sx << combine_shift + 1;
+
+ if (x >= damage_array_width || y >= damage_array_height)
+ return;
+
+ neighbour_offset = 1 << combine_shift;
+ if (x + neighbour_offset >= damage_array_width)
+ return;
+
+ data_pos = y * damage_array_stride + x;
+ if (damage_array[data_pos + neighbour_offset])
+ damage_array[data_pos] = 1;
+ }
+
+ __global__ void
+ combine_damage_array_rows (uint8_t *damage_array,
+ uint32_t damage_array_width,
+ uint32_t damage_array_height,
+ uint32_t damage_array_stride,
+ uint32_t combine_shift)
+ {
+ uint32_t data_pos;
+ uint32_t neighbour_offset;
+ uint32_t x, y;
+ uint32_t sy;
+
+ x = blockIdx.x * blockDim.x + threadIdx.x;
+ sy = blockIdx.y * blockDim.y + threadIdx.y;
+
+ y = sy << combine_shift + 1;
+
+ if (x >= damage_array_width || y >= damage_array_height)
+ return;
+
+ neighbour_offset = 1 << combine_shift;
+ if (y + neighbour_offset >= damage_array_height)
+ return;
+
+ data_pos = y * damage_array_stride + x;
+ if (damage_array[data_pos + neighbour_offset * damage_array_stride])
+ damage_array[data_pos] = 1;
+ }
+
+ __global__ void
+ simplify_damage_array (uint8_t *dst_damage_array,
+ uint8_t *src_damage_array,
+ uint32_t dst_damage_array_stride,
+ uint32_t src_damage_array_width,
+ uint32_t src_damage_array_height,
+ uint32_t src_damage_array_stride)
+ {
+ uint32_t src_data_pos, dst_data_pos;
+ uint32_t sx, sy;
+ uint32_t x, y;
+
+ sx = blockIdx.x * blockDim.x + threadIdx.x;
+ sy = blockIdx.y * blockDim.y + threadIdx.y;
+
+ x = sx << 6;
+ y = sy << 6;
+
+ if (x >= src_damage_array_width || y >= src_damage_array_height)
+ return;
+
+ src_data_pos = y * src_damage_array_stride + x;
+ dst_data_pos = sy * dst_damage_array_stride + sx;
+
+ dst_damage_array[dst_data_pos] = src_damage_array[src_data_pos];
+ }
+}
diff --git a/src/grd-hwaccel-nvidia.c b/src/grd-hwaccel-nvidia.c
index 44862523..43dafd18 100644
--- a/src/grd-hwaccel-nvidia.c
+++ b/src/grd-hwaccel-nvidia.c
@@ -77,6 +77,12 @@ struct _GrdHwAccelNvidia
CUcontext cu_context;
gboolean initialized;
+ CUmodule cu_module_dmg_utils;
+ CUfunction cu_chk_dmg_pxl;
+ CUfunction cu_cmb_dmg_arr_cols;
+ CUfunction cu_cmb_dmg_arr_rows;
+ CUfunction cu_simplify_dmg_arr;
+
CUmodule cu_module_avc_utils;
CUfunction cu_bgrx_to_yuv420;
@@ -651,6 +657,40 @@ run_function_in_egl_thread (GrdHwAccelNvidia *hwaccel_nvidia,
grd_sync_point_clear (&sync_point);
}
+static gboolean
+load_cuda_module (GrdHwAccelNvidia *hwaccel_nvidia,
+ CUmodule *module,
+ const char *name,
+ const char *ptx_instructions)
+{
+ CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
+
+ if (cuda_funcs->cuModuleLoadData (module, ptx_instructions) != CUDA_SUCCESS)
+ {
+ g_warning ("[HWAccel.CUDA] Failed to load %s module", name);
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static gboolean
+load_cuda_function (GrdHwAccelNvidia *hwaccel_nvidia,
+ CUfunction *function,
+ CUmodule module,
+ const char *name)
+{
+ CudaFunctions *cuda_funcs = hwaccel_nvidia->cuda_funcs;
+
+ if (cuda_funcs->cuModuleGetFunction (function, module, name) != CUDA_SUCCESS)
+ {
+ g_warning ("[HWAccel.CUDA] Failed to get kernel %s", name);
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
GrdHwAccelNvidia *
grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
{
@@ -661,6 +701,8 @@ grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
unsigned int cu_device_count = 0;
CudaFunctions *cuda_funcs;
NvencFunctions *nvenc_funcs;
+ g_autofree char *dmg_ptx_path = NULL;
+ g_autofree char *dmg_ptx_instructions = NULL;
g_autofree char *avc_ptx_path = NULL;
g_autofree char *avc_ptx_instructions = NULL;
g_autoptr (GError) error = NULL;
@@ -749,24 +791,34 @@ grd_hwaccel_nvidia_new (GrdEglThread *egl_thread)
hwaccel_nvidia->initialized = TRUE;
+ dmg_ptx_path = g_strdup_printf ("%s/grd-cuda-damage-utils_30.ptx", GRD_DATA_DIR);
avc_ptx_path = g_strdup_printf ("%s/grd-cuda-avc-utils_30.ptx", GRD_DATA_DIR);
- if (!g_file_get_contents (avc_ptx_path, &avc_ptx_instructions, NULL, &error))
+
+ if (!g_file_get_contents (dmg_ptx_path, &dmg_ptx_instructions, NULL, &error) ||
+ !g_file_get_contents (avc_ptx_path, &avc_ptx_instructions, NULL, &error))
g_error ("[HWAccel.CUDA] Failed to read PTX instructions: %s", error->message);
- if (cuda_funcs->cuModuleLoadData (&hwaccel_nvidia->cu_module_avc_utils,
- avc_ptx_instructions) != CUDA_SUCCESS)
- {
- g_warning ("[HWAccel.CUDA] Failed to load CUDA module");
- return NULL;
- }
+ if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_dmg_utils,
+ "damage utils", dmg_ptx_instructions))
+ return NULL;
- if (cuda_funcs->cuModuleGetFunction (&hwaccel_nvidia->cu_bgrx_to_yuv420,
- hwaccel_nvidia->cu_module_avc_utils,
- "convert_2x2_bgrx_area_to_yuv420_nv12") != CUDA_SUCCESS)
- {
- g_warning ("[HWAccel.CUDA] Failed to get AVC CUDA kernel");
- return NULL;
- }
+ if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_chk_dmg_pxl,
+ hwaccel_nvidia->cu_module_dmg_utils, "check_damaged_pixel") ||
+ !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_cols,
+ hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_cols") ||
+ !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_cmb_dmg_arr_rows,
+ hwaccel_nvidia->cu_module_dmg_utils, "combine_damage_array_rows") ||
+ !load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_simplify_dmg_arr,
+ hwaccel_nvidia->cu_module_dmg_utils, "simplify_damage_array"))
+ return NULL;
+
+ if (!load_cuda_module (hwaccel_nvidia, &hwaccel_nvidia->cu_module_avc_utils,
+ "AVC utils", avc_ptx_instructions))
+ return NULL;
+
+ if (!load_cuda_function (hwaccel_nvidia, &hwaccel_nvidia->cu_bgrx_to_yuv420,
+ hwaccel_nvidia->cu_module_avc_utils, "convert_2x2_bgrx_area_to_yuv420_nv12"))
+ return NULL;
return g_steal_pointer (&hwaccel_nvidia);
}
@@ -788,6 +840,8 @@ grd_hwaccel_nvidia_dispose (GObject *object)
g_clear_pointer (&hwaccel_nvidia->cu_module_avc_utils,
hwaccel_nvidia->cuda_funcs->cuModuleUnload);
+ g_clear_pointer (&hwaccel_nvidia->cu_module_dmg_utils,
+ hwaccel_nvidia->cuda_funcs->cuModuleUnload);
g_clear_pointer (&hwaccel_nvidia->cuda_lib, dlclose);
g_clear_pointer (&hwaccel_nvidia->extra_cuda_funcs, g_free);