Skip to content

[cuclang 20.1.0] Using __reduce_max_sync in a cuda kernel fails with an illegal instruction was encountered #131415

@AustinSchuh

Description

@AustinSchuh

Using clang 20.1.0. I've got a CUDA kernel which boils down to the following:

__global__ void testcode(const float* data, unsigned *max_value) {
    unsigned r = static_cast<unsigned>(data[threadIdx.x]);

    const unsigned mask = __ballot_sync(0xFFFFFFFF, true);

    unsigned mx = __reduce_max_sync(mask, r);
    atomicMax(max_value, mx);
}

When I run this with nvcc, it works, and with clang, I get an an illegal instruction was encountered message.

Compiler explorer confirms that they generate slightly different ptx.

nvcc

        ld.param.u64    %rd1, [testcode(float const*, unsigned int*)_param_0];
        ld.param.u64    %rd2, [testcode(float const*, unsigned int*)_param_1];
        cvta.to.global.u64      %rd3, %rd2;
        cvta.to.global.u64      %rd4, %rd1;
        mov.u32         %r1, %tid.x;
        mul.wide.u32    %rd5, %r1, 4;
        add.s64         %rd6, %rd4, %rd5;
        ld.global.f32   %f1, [%rd6];
        cvt.rzi.u32.f32         %r2, %f1;
        mov.pred        %p1, -1;
        mov.u32         %r3, -1;
        vote.sync.ballot.b32    %r4, %p1, %r3;
        redux.sync.max.u32 %r5, %r2, %r4;
        atom.global.max.u32     %r6, [%rd3], %r5;
        ret;

clang

        ld.param.u64    %rd1, [testcode(float const*, unsigned int*)_param_0];
        ld.param.u64    %rd2, [testcode(float const*, unsigned int*)_param_1];
        cvta.to.global.u64      %rd3, %rd2;
        cvta.to.global.u64      %rd4, %rd1;
        mov.u32         %r1, %tid.x;
        mul.wide.u32    %rd5, %r1, 4;
        add.s64         %rd6, %rd4, %rd5;
        ld.global.f32   %f1, [%rd6];
        cvt.rzi.u32.f32         %r2, %f1;
        mov.pred        %p1, -1;
        vote.sync.ballot.b32    %r3, %p1, -1;
        redux.sync.max.u32 %r4, %r3, %r2;
        atom.global.max.u32     %r5, [%rd3], %r4;
        ret;

I hacked around and got it to work with:

__global__ void testcode2(const float* data, unsigned int* max_value) {
    unsigned int r = static_cast<unsigned int>(data[threadIdx.x]);
    unsigned int mask = __ballot_sync(0xFFFFFFFF, true);
    unsigned int mx;                                           
    asm volatile(                               
        "        redux.sync.max.u32 %0, %2, %1;"
        : "=r"(mx) // Output operand (mask)     
        : "r"(mask), "r"(r)// No input operands          
        : "cc" // Clobbered registers and condition codes
    );                                                   
      
    atomicMax(max_value, mx);         
}      

The fix being to swap the mask and input on the redux.sync.

Metadata

Metadata

Assignees

No one assigned

    Type

    Projects

    Status

    Done

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions