[gnome-remote-desktop] cuda: Fix out-of-bounds source read in BGRX_TO_YUV420 kernel



commit cc4ea662c834e7c00c6aca52795f82e9847cc6cb
Author: Pascal Nowack <Pascal Nowack gmx de>
Date:   Fri Feb 4 13:17:24 2022 +0100

    cuda: Fix out-of-bounds source read in BGRX_TO_YUV420 kernel
    
    Compute operations on the GPU run in blocks. Each block consists of a
    certain number of threads.
    Since these blocks are always a multiple of the warp size, additional
    out-of-bounds checks are required when reading or writing memory on the
    GPU.
    For the NV12 buffer this is done in all cases. For the source data it
    is done too, except in one case: when reading the pixel x = 0, y = 1
    in a 2x2 rectangle.
    In that case, an invalid memory access can happen and all subsequent
    CUDA operations will fail.
    
    Fix this by adding the missing out-of-bounds check.

 data/grd-cuda-avc-utils_30.ptx | 172 +++++++++++++++++++++--------------------
 src/grd-cuda-avc-utils.cu      |   2 +-
 2 files changed, 88 insertions(+), 86 deletions(-)
---
diff --git a/data/grd-cuda-avc-utils_30.ptx b/data/grd-cuda-avc-utils_30.ptx
index 5fb44696..17266d4c 100644
--- a/data/grd-cuda-avc-utils_30.ptx
+++ b/data/grd-cuda-avc-utils_30.ptx
@@ -23,9 +23,9 @@
        .param .u16 convert_2x2_bgrx_area_to_yuv420_nv12_param_7
 )
 {
-       .reg .pred      %p<15>;
+       .reg .pred      %p<17>;
        .reg .b16       %rs<23>;
-       .reg .b32       %r<127>;
+       .reg .b32       %r<129>;
        .reg .b64       %rd<36>;
 
 
@@ -62,32 +62,32 @@
        and.b32         %r6, %r44, 65534;
        mov.u32         %r45, 1;
        shl.b32         %r46, %r2, 1;
-       and.b32         %r47, %r46, 65534;
-       mul.lo.s32      %r48, %r47, %r42;
-       cvt.u64.u32     %rd9, %r48;
-       shl.b32         %r49, %r43, 3;
-       cvt.u64.u32     %rd10, %r49;
+       and.b32         %r7, %r46, 65534;
+       mul.lo.s32      %r47, %r7, %r42;
+       cvt.u64.u32     %rd9, %r47;
+       shl.b32         %r48, %r43, 3;
+       cvt.u64.u32     %rd10, %r48;
        add.s64         %rd1, %rd9, %rd10;
        cvta.to.global.u64      %rd11, %rd7;
        add.s64         %rd2, %rd11, %rd1;
-       setp.lt.u32     %p4, %r47, %r5;
-       shl.b32         %r50, %r2, 2;
-       sub.s32         %r51, %r45, %r4;
-       selp.b32        %r52, 0, %r51, %p4;
-       mov.u32         %r124, 0;
-       add.s32         %r53, %r52, %r50;
-       cvt.u64.u32     %rd12, %r53;
+       setp.lt.u32     %p4, %r7, %r5;
+       shl.b32         %r49, %r2, 2;
+       sub.s32         %r50, %r45, %r4;
+       selp.b32        %r51, 0, %r50, %p4;
+       mov.u32         %r126, 0;
+       add.s32         %r52, %r51, %r49;
+       cvt.u64.u32     %rd12, %r52;
        and.b64         %rd13, %rd12, 65535;
        cvt.u64.u16     %rd14, %rs11;
        mul.lo.s64      %rd15, %rd13, %rd14;
        cvt.u64.u32     %rd16, %r6;
        add.s64         %rd17, %rd15, %rd16;
        add.s64         %rd3, %rd8, %rd17;
-       add.s32         %r7, %r47, 1;
-       and.b32         %r54, %r7, 65535;
+       add.s32         %r53, %r7, 1;
+       and.b32         %r54, %r53, 65535;
        setp.lt.u32     %p5, %r54, %r5;
-       shl.b32         %r55, %r7, 1;
-       selp.b32        %r56, 0, %r51, %p5;
+       shl.b32         %r55, %r53, 1;
+       selp.b32        %r56, 0, %r50, %p5;
        add.s32         %r57, %r56, %r55;
        cvt.u64.u32     %rd18, %r57;
        and.b64         %rd19, %rd18, 65535;
@@ -110,51 +110,53 @@
        cvt.u32.u16     %r63, %rs7;
        setp.ge.u32     %p7, %r6, %r63;
        cvt.u32.u16     %r64, %rs8;
-       setp.ge.u32     %p8, %r47, %r64;
+       setp.ge.u32     %p8, %r7, %r64;
        mov.u16         %rs21, 0;
        or.pred         %p9, %p7, %p8;
        mov.u16         %rs20, %rs21;
-       mov.u32         %r125, %r124;
-       mov.u32         %r126, %r124;
+       mov.u32         %r127, %r126;
+       mov.u32         %r128, %r126;
        @%p9 bra        BB0_3;
 
        ld.global.u8    %rs13, [%rd2];
-       cvt.u32.u16     %r126, %rs13;
-       ld.global.u8    %r125, [%rd2+1];
-       ld.global.u8    %r124, [%rd2+2];
+       cvt.u32.u16     %r128, %rs13;
+       ld.global.u8    %r127, [%rd2+1];
+       ld.global.u8    %r126, [%rd2+2];
        mul.wide.u16    %r65, %rs13, 18;
-       mad.lo.s32      %r66, %r125, 183, %r65;
-       mad.lo.s32      %r67, %r124, 54, %r66;
+       mad.lo.s32      %r66, %r127, 183, %r65;
+       mad.lo.s32      %r67, %r126, 54, %r66;
        shr.u32         %r68, %r67, 8;
        cvt.u16.u32     %rs20, %r68;
 
 BB0_3:
-       and.b32         %r73, %r2, 32767;
-       shl.b32         %r74, %r73, 1;
-       setp.ge.u32     %p10, %r74, %r64;
        st.global.u8    [%rd3], %rs20;
        add.s32         %r14, %r6, 1;
        setp.ge.u32     %p11, %r14, %r63;
-       or.pred         %p12, %p11, %p10;
+       or.pred         %p12, %p11, %p8;
        @%p12 bra       BB0_5;
 
        ld.global.u8    %rs15, [%rd2+4];
-       cvt.u32.u16     %r77, %rs15;
-       add.s32         %r126, %r77, %r126;
-       ld.global.u8    %r78, [%rd2+5];
-       add.s32         %r125, %r78, %r125;
-       ld.global.u8    %r79, [%rd2+6];
-       add.s32         %r124, %r79, %r124;
-       mul.wide.u16    %r80, %rs15, 18;
-       mad.lo.s32      %r81, %r78, 183, %r80;
-       mad.lo.s32      %r82, %r79, 54, %r81;
-       shr.u32         %r83, %r82, 8;
-       cvt.u16.u32     %rs21, %r83;
+       cvt.u32.u16     %r71, %rs15;
+       add.s32         %r128, %r71, %r128;
+       ld.global.u8    %r72, [%rd2+5];
+       add.s32         %r127, %r72, %r127;
+       ld.global.u8    %r73, [%rd2+6];
+       add.s32         %r126, %r73, %r126;
+       mul.wide.u16    %r74, %rs15, 18;
+       mad.lo.s32      %r75, %r72, 183, %r74;
+       mad.lo.s32      %r76, %r73, 54, %r75;
+       shr.u32         %r77, %r76, 8;
+       cvt.u16.u32     %rs21, %r77;
 
 BB0_5:
+       setp.lt.u32     %p13, %r6, %r63;
        st.global.u8    [%rd3+1], %rs21;
-       setp.lt.u32     %p13, %r7, %r64;
-       @%p13 bra       BB0_7;
+       and.b32         %r83, %r2, 32767;
+       shl.b32         %r84, %r83, 1;
+       add.s32         %r85, %r84, 1;
+       setp.lt.u32     %p14, %r85, %r64;
+       and.pred        %p15, %p13, %p14;
+       @%p15 bra       BB0_7;
        bra.uni         BB0_6;
 
 BB0_7:
@@ -162,37 +164,37 @@ BB0_7:
        add.s64         %rd29, %rd1, %rd28;
        add.s64         %rd31, %rd11, %rd29;
        ld.global.u8    %rs18, [%rd31];
-       cvt.u32.u16     %r86, %rs18;
-       add.s32         %r126, %r86, %r126;
-       ld.global.u8    %r87, [%rd31+1];
-       add.s32         %r125, %r87, %r125;
-       ld.global.u8    %r88, [%rd31+2];
-       add.s32         %r124, %r88, %r124;
-       mul.wide.u16    %r89, %rs18, 18;
-       mad.lo.s32      %r90, %r87, 183, %r89;
-       mad.lo.s32      %r91, %r88, 54, %r90;
-       shr.u32         %r92, %r91, 8;
-       st.global.u8    [%rd4], %r92;
+       cvt.u32.u16     %r88, %rs18;
+       add.s32         %r128, %r88, %r128;
+       ld.global.u8    %r89, [%rd31+1];
+       add.s32         %r127, %r89, %r127;
+       ld.global.u8    %r90, [%rd31+2];
+       add.s32         %r126, %r90, %r126;
+       mul.wide.u16    %r91, %rs18, 18;
+       mad.lo.s32      %r92, %r89, 183, %r91;
+       mad.lo.s32      %r93, %r90, 54, %r92;
+       shr.u32         %r94, %r93, 8;
+       st.global.u8    [%rd4], %r94;
        mov.u16         %rs22, 0;
        @%p11 bra       BB0_9;
 
-       add.s32         %r94, %r42, 4;
-       and.b32         %r95, %r94, 65535;
-       cvt.u64.u32     %rd32, %r95;
+       add.s32         %r96, %r42, 4;
+       and.b32         %r97, %r96, 65535;
+       cvt.u64.u32     %rd32, %r97;
        add.s64         %rd33, %rd1, %rd32;
        add.s64         %rd35, %rd11, %rd33;
        ld.global.u8    %rs19, [%rd35];
-       cvt.u32.u16     %r96, %rs19;
-       add.s32         %r126, %r96, %r126;
-       ld.global.u8    %r97, [%rd35+1];
-       add.s32         %r125, %r97, %r125;
-       ld.global.u8    %r98, [%rd35+2];
-       add.s32         %r124, %r98, %r124;
-       mul.wide.u16    %r99, %rs19, 18;
-       mad.lo.s32      %r100, %r97, 183, %r99;
-       mad.lo.s32      %r101, %r98, 54, %r100;
-       shr.u32         %r102, %r101, 8;
-       cvt.u16.u32     %rs22, %r102;
+       cvt.u32.u16     %r98, %rs19;
+       add.s32         %r128, %r98, %r128;
+       ld.global.u8    %r99, [%rd35+1];
+       add.s32         %r127, %r99, %r127;
+       ld.global.u8    %r100, [%rd35+2];
+       add.s32         %r126, %r100, %r126;
+       mul.wide.u16    %r101, %rs19, 18;
+       mad.lo.s32      %r102, %r99, 183, %r101;
+       mad.lo.s32      %r103, %r100, 54, %r102;
+       shr.u32         %r104, %r103, 8;
+       cvt.u16.u32     %rs22, %r104;
        bra.uni         BB0_9;
 
 BB0_6:
@@ -201,23 +203,23 @@ BB0_6:
 
 BB0_9:
        st.global.u8    [%rd4+1], %rs22;
-       bfe.u32         %r103, %r124, 2, 8;
-       mul.lo.s32      %r104, %r103, -29;
-       bfe.u32         %r105, %r125, 2, 8;
-       mad.lo.s32      %r106, %r105, -99, %r104;
-       bfe.u32         %r107, %r126, 2, 8;
-       shl.b32         %r108, %r107, 7;
-       add.s32         %r109, %r106, %r108;
-       shr.u32         %r110, %r109, 8;
-       add.s32         %r111, %r110, 128;
-       st.global.u8    [%rd5], %r111;
-       shl.b32         %r112, %r124, 5;
-       and.b32         %r113, %r112, 32640;
-       mad.lo.s32      %r114, %r105, -116, %r113;
-       mad.lo.s32      %r115, %r107, -12, %r114;
-       shr.u32         %r116, %r115, 8;
-       add.s32         %r117, %r116, 128;
-       st.global.u8    [%rd5+1], %r117;
+       bfe.u32         %r105, %r126, 2, 8;
+       mul.lo.s32      %r106, %r105, -29;
+       bfe.u32         %r107, %r127, 2, 8;
+       mad.lo.s32      %r108, %r107, -99, %r106;
+       bfe.u32         %r109, %r128, 2, 8;
+       shl.b32         %r110, %r109, 7;
+       add.s32         %r111, %r108, %r110;
+       shr.u32         %r112, %r111, 8;
+       add.s32         %r113, %r112, 128;
+       st.global.u8    [%rd5], %r113;
+       shl.b32         %r114, %r126, 5;
+       and.b32         %r115, %r114, 32640;
+       mad.lo.s32      %r116, %r107, -116, %r115;
+       mad.lo.s32      %r117, %r109, -12, %r116;
+       shr.u32         %r118, %r117, 8;
+       add.s32         %r119, %r118, 128;
+       st.global.u8    [%rd5+1], %r119;
 
 BB0_10:
        ret;
diff --git a/src/grd-cuda-avc-utils.cu b/src/grd-cuda-avc-utils.cu
index 82d87f2e..9bca1ec1 100644
--- a/src/grd-cuda-avc-utils.cu
+++ b/src/grd-cuda-avc-utils.cu
@@ -174,7 +174,7 @@ extern "C"
         *dst_y1 = 0;
       }
 
-    if (y_1x1 + 1 < src_height)
+    if (x_1x1 < src_width && y_1x1 + 1 < src_height)
       {
         /* d_2 */
         b_a += b = src[s2 + 0];


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]