[gnome-remote-desktop] cuda: Fix out-of-bounds source read in BGRX_TO_YUV420 kernel
- From: Jonas Ådahl <jadahl src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gnome-remote-desktop] cuda: Fix out-of-bounds source read in BGRX_TO_YUV420 kernel
- Date: Thu, 10 Feb 2022 14:02:54 +0000 (UTC)
commit cc4ea662c834e7c00c6aca52795f82e9847cc6cb
Author: Pascal Nowack <Pascal Nowack gmx de>
Date: Fri Feb 4 13:17:24 2022 +0100
cuda: Fix out-of-bounds source read in BGRX_TO_YUV420 kernel
Compute operations on the GPU run in blocks. Each block consists of a
certain number of threads.
Since these blocks are always a multiple of the warp size, additional
out-of-bounds checks are required when reading or writing memory on the
GPU.
For the NV12 buffer, this is done in all cases. The same is true for the
source data, except in one case: when reading the pixel at x = 0, y = 1
of a 2x2 rectangle.
In that case, an invalid memory access can happen and all subsequent
CUDA operations will fail.
Fix this by adding the missing out-of-bounds check.
data/grd-cuda-avc-utils_30.ptx | 172 +++++++++++++++++++++--------------------
src/grd-cuda-avc-utils.cu | 2 +-
2 files changed, 88 insertions(+), 86 deletions(-)
---
diff --git a/data/grd-cuda-avc-utils_30.ptx b/data/grd-cuda-avc-utils_30.ptx
index 5fb44696..17266d4c 100644
--- a/data/grd-cuda-avc-utils_30.ptx
+++ b/data/grd-cuda-avc-utils_30.ptx
@@ -23,9 +23,9 @@
.param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_7
)
{
- .reg .pred %p<15>;
+ .reg .pred %p<17>;
.reg .b16 %rs<23>;
- .reg .b32 %r<127>;
+ .reg .b32 %r<129>;
.reg .b64 %rd<36>;
@@ -62,32 +62,32 @@
and.b32 %r6, %r44, 65534;
mov.u32 %r45, 1;
shl.b32 %r46, %r2, 1;
- and.b32 %r47, %r46, 65534;
- mul.lo.s32 %r48, %r47, %r42;
- cvt.u64.u32 %rd9, %r48;
- shl.b32 %r49, %r43, 3;
- cvt.u64.u32 %rd10, %r49;
+ and.b32 %r7, %r46, 65534;
+ mul.lo.s32 %r47, %r7, %r42;
+ cvt.u64.u32 %rd9, %r47;
+ shl.b32 %r48, %r43, 3;
+ cvt.u64.u32 %rd10, %r48;
add.s64 %rd1, %rd9, %rd10;
cvta.to.global.u64 %rd11, %rd7;
add.s64 %rd2, %rd11, %rd1;
- setp.lt.u32 %p4, %r47, %r5;
- shl.b32 %r50, %r2, 2;
- sub.s32 %r51, %r45, %r4;
- selp.b32 %r52, 0, %r51, %p4;
- mov.u32 %r124, 0;
- add.s32 %r53, %r52, %r50;
- cvt.u64.u32 %rd12, %r53;
+ setp.lt.u32 %p4, %r7, %r5;
+ shl.b32 %r49, %r2, 2;
+ sub.s32 %r50, %r45, %r4;
+ selp.b32 %r51, 0, %r50, %p4;
+ mov.u32 %r126, 0;
+ add.s32 %r52, %r51, %r49;
+ cvt.u64.u32 %rd12, %r52;
and.b64 %rd13, %rd12, 65535;
cvt.u64.u16 %rd14, %rs11;
mul.lo.s64 %rd15, %rd13, %rd14;
cvt.u64.u32 %rd16, %r6;
add.s64 %rd17, %rd15, %rd16;
add.s64 %rd3, %rd8, %rd17;
- add.s32 %r7, %r47, 1;
- and.b32 %r54, %r7, 65535;
+ add.s32 %r53, %r7, 1;
+ and.b32 %r54, %r53, 65535;
setp.lt.u32 %p5, %r54, %r5;
- shl.b32 %r55, %r7, 1;
- selp.b32 %r56, 0, %r51, %p5;
+ shl.b32 %r55, %r53, 1;
+ selp.b32 %r56, 0, %r50, %p5;
add.s32 %r57, %r56, %r55;
cvt.u64.u32 %rd18, %r57;
and.b64 %rd19, %rd18, 65535;
@@ -110,51 +110,53 @@
cvt.u32.u16 %r63, %rs7;
setp.ge.u32 %p7, %r6, %r63;
cvt.u32.u16 %r64, %rs8;
- setp.ge.u32 %p8, %r47, %r64;
+ setp.ge.u32 %p8, %r7, %r64;
mov.u16 %rs21, 0;
or.pred %p9, %p7, %p8;
mov.u16 %rs20, %rs21;
- mov.u32 %r125, %r124;
- mov.u32 %r126, %r124;
+ mov.u32 %r127, %r126;
+ mov.u32 %r128, %r126;
@%p9 bra BB0_3;
ld.global.u8 %rs13, [%rd2];
- cvt.u32.u16 %r126, %rs13;
- ld.global.u8 %r125, [%rd2+1];
- ld.global.u8 %r124, [%rd2+2];
+ cvt.u32.u16 %r128, %rs13;
+ ld.global.u8 %r127, [%rd2+1];
+ ld.global.u8 %r126, [%rd2+2];
mul.wide.u16 %r65, %rs13, 18;
- mad.lo.s32 %r66, %r125, 183, %r65;
- mad.lo.s32 %r67, %r124, 54, %r66;
+ mad.lo.s32 %r66, %r127, 183, %r65;
+ mad.lo.s32 %r67, %r126, 54, %r66;
shr.u32 %r68, %r67, 8;
cvt.u16.u32 %rs20, %r68;
BB0_3:
- and.b32 %r73, %r2, 32767;
- shl.b32 %r74, %r73, 1;
- setp.ge.u32 %p10, %r74, %r64;
st.global.u8 [%rd3], %rs20;
add.s32 %r14, %r6, 1;
setp.ge.u32 %p11, %r14, %r63;
- or.pred %p12, %p11, %p10;
+ or.pred %p12, %p11, %p8;
@%p12 bra BB0_5;
ld.global.u8 %rs15, [%rd2+4];
- cvt.u32.u16 %r77, %rs15;
- add.s32 %r126, %r77, %r126;
- ld.global.u8 %r78, [%rd2+5];
- add.s32 %r125, %r78, %r125;
- ld.global.u8 %r79, [%rd2+6];
- add.s32 %r124, %r79, %r124;
- mul.wide.u16 %r80, %rs15, 18;
- mad.lo.s32 %r81, %r78, 183, %r80;
- mad.lo.s32 %r82, %r79, 54, %r81;
- shr.u32 %r83, %r82, 8;
- cvt.u16.u32 %rs21, %r83;
+ cvt.u32.u16 %r71, %rs15;
+ add.s32 %r128, %r71, %r128;
+ ld.global.u8 %r72, [%rd2+5];
+ add.s32 %r127, %r72, %r127;
+ ld.global.u8 %r73, [%rd2+6];
+ add.s32 %r126, %r73, %r126;
+ mul.wide.u16 %r74, %rs15, 18;
+ mad.lo.s32 %r75, %r72, 183, %r74;
+ mad.lo.s32 %r76, %r73, 54, %r75;
+ shr.u32 %r77, %r76, 8;
+ cvt.u16.u32 %rs21, %r77;
BB0_5:
+ setp.lt.u32 %p13, %r6, %r63;
st.global.u8 [%rd3+1], %rs21;
- setp.lt.u32 %p13, %r7, %r64;
- @%p13 bra BB0_7;
+ and.b32 %r83, %r2, 32767;
+ shl.b32 %r84, %r83, 1;
+ add.s32 %r85, %r84, 1;
+ setp.lt.u32 %p14, %r85, %r64;
+ and.pred %p15, %p13, %p14;
+ @%p15 bra BB0_7;
bra.uni BB0_6;
BB0_7:
@@ -162,37 +164,37 @@ BB0_7:
add.s64 %rd29, %rd1, %rd28;
add.s64 %rd31, %rd11, %rd29;
ld.global.u8 %rs18, [%rd31];
- cvt.u32.u16 %r86, %rs18;
- add.s32 %r126, %r86, %r126;
- ld.global.u8 %r87, [%rd31+1];
- add.s32 %r125, %r87, %r125;
- ld.global.u8 %r88, [%rd31+2];
- add.s32 %r124, %r88, %r124;
- mul.wide.u16 %r89, %rs18, 18;
- mad.lo.s32 %r90, %r87, 183, %r89;
- mad.lo.s32 %r91, %r88, 54, %r90;
- shr.u32 %r92, %r91, 8;
- st.global.u8 [%rd4], %r92;
+ cvt.u32.u16 %r88, %rs18;
+ add.s32 %r128, %r88, %r128;
+ ld.global.u8 %r89, [%rd31+1];
+ add.s32 %r127, %r89, %r127;
+ ld.global.u8 %r90, [%rd31+2];
+ add.s32 %r126, %r90, %r126;
+ mul.wide.u16 %r91, %rs18, 18;
+ mad.lo.s32 %r92, %r89, 183, %r91;
+ mad.lo.s32 %r93, %r90, 54, %r92;
+ shr.u32 %r94, %r93, 8;
+ st.global.u8 [%rd4], %r94;
mov.u16 %rs22, 0;
@%p11 bra BB0_9;
- add.s32 %r94, %r42, 4;
- and.b32 %r95, %r94, 65535;
- cvt.u64.u32 %rd32, %r95;
+ add.s32 %r96, %r42, 4;
+ and.b32 %r97, %r96, 65535;
+ cvt.u64.u32 %rd32, %r97;
add.s64 %rd33, %rd1, %rd32;
add.s64 %rd35, %rd11, %rd33;
ld.global.u8 %rs19, [%rd35];
- cvt.u32.u16 %r96, %rs19;
- add.s32 %r126, %r96, %r126;
- ld.global.u8 %r97, [%rd35+1];
- add.s32 %r125, %r97, %r125;
- ld.global.u8 %r98, [%rd35+2];
- add.s32 %r124, %r98, %r124;
- mul.wide.u16 %r99, %rs19, 18;
- mad.lo.s32 %r100, %r97, 183, %r99;
- mad.lo.s32 %r101, %r98, 54, %r100;
- shr.u32 %r102, %r101, 8;
- cvt.u16.u32 %rs22, %r102;
+ cvt.u32.u16 %r98, %rs19;
+ add.s32 %r128, %r98, %r128;
+ ld.global.u8 %r99, [%rd35+1];
+ add.s32 %r127, %r99, %r127;
+ ld.global.u8 %r100, [%rd35+2];
+ add.s32 %r126, %r100, %r126;
+ mul.wide.u16 %r101, %rs19, 18;
+ mad.lo.s32 %r102, %r99, 183, %r101;
+ mad.lo.s32 %r103, %r100, 54, %r102;
+ shr.u32 %r104, %r103, 8;
+ cvt.u16.u32 %rs22, %r104;
bra.uni BB0_9;
BB0_6:
@@ -201,23 +203,23 @@ BB0_6:
BB0_9:
st.global.u8 [%rd4+1], %rs22;
- bfe.u32 %r103, %r124, 2, 8;
- mul.lo.s32 %r104, %r103, -29;
- bfe.u32 %r105, %r125, 2, 8;
- mad.lo.s32 %r106, %r105, -99, %r104;
- bfe.u32 %r107, %r126, 2, 8;
- shl.b32 %r108, %r107, 7;
- add.s32 %r109, %r106, %r108;
- shr.u32 %r110, %r109, 8;
- add.s32 %r111, %r110, 128;
- st.global.u8 [%rd5], %r111;
- shl.b32 %r112, %r124, 5;
- and.b32 %r113, %r112, 32640;
- mad.lo.s32 %r114, %r105, -116, %r113;
- mad.lo.s32 %r115, %r107, -12, %r114;
- shr.u32 %r116, %r115, 8;
- add.s32 %r117, %r116, 128;
- st.global.u8 [%rd5+1], %r117;
+ bfe.u32 %r105, %r126, 2, 8;
+ mul.lo.s32 %r106, %r105, -29;
+ bfe.u32 %r107, %r127, 2, 8;
+ mad.lo.s32 %r108, %r107, -99, %r106;
+ bfe.u32 %r109, %r128, 2, 8;
+ shl.b32 %r110, %r109, 7;
+ add.s32 %r111, %r108, %r110;
+ shr.u32 %r112, %r111, 8;
+ add.s32 %r113, %r112, 128;
+ st.global.u8 [%rd5], %r113;
+ shl.b32 %r114, %r126, 5;
+ and.b32 %r115, %r114, 32640;
+ mad.lo.s32 %r116, %r107, -116, %r115;
+ mad.lo.s32 %r117, %r109, -12, %r116;
+ shr.u32 %r118, %r117, 8;
+ add.s32 %r119, %r118, 128;
+ st.global.u8 [%rd5+1], %r119;
BB0_10:
ret;
diff --git a/src/grd-cuda-avc-utils.cu b/src/grd-cuda-avc-utils.cu
index 82d87f2e..9bca1ec1 100644
--- a/src/grd-cuda-avc-utils.cu
+++ b/src/grd-cuda-avc-utils.cu
@@ -174,7 +174,7 @@ extern "C"
*dst_y1 = 0;
}
- if (y_1x1 + 1 < src_height)
+ if (x_1x1 < src_width && y_1x1 + 1 < src_height)
{
/* d_2 */
b_a += b = src[s2 + 0];
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]