Numba CUDA: for vs. while loop performance difference in a kernel

Hello,

There is something I don't understand about my code (see below). I am writing a very basic matrix multiplication CUDA kernel in Python using Numba. I now have two versions: one uses a for-in-range loop, the other a while loop. They do exactly the same thing; the only difference is the loop construct.

The for-loop version is more than twice as slow as the while-loop version:

Matmul time using for:    40896.496 ms
Matmul time using while:  17172.275 ms

I also wrote kernels with the same behaviour in C++ and Fortran, and their timings roughly correspond to the Python while-loop version (C++ 15.6 s, Fortran 17.7 s).

Could you please explain what is wrong with the version that uses the for loop? I suspected that range actually allocates memory, or that there are problems with type conversions (the val variable was already a problem), but I would like a definitive answer.
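In case it helps with diagnosing this, the inferred types and generated PTX of both kernels can be dumped and compared; this is just a sketch using what I believe are Numba's standard introspection methods (to be run after both kernels have been launched at least once):

# print the Numba-inferred type of every variable in each kernel
matmul_for.inspect_types()
matmul_while.inspect_types()

# PTX per compiled signature; the loop bodies can be diffed by hand
for sig, ptx in matmul_for.inspect_asm().items():
    print(sig)
    print(ptx)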

The timings are in the tens of seconds, so kernel launch overhead or insufficiently precise timing should not be a concern here. Also, swapping the order of the kernel launches does not make any difference. I use an NVIDIA GTX 1650 Ti GPU with CUDA 11.4, and run everything under WSL2 on Windows 11 (latest Insider build 22543), with Numba 0.53.1 and Python 3.6.9.
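To be extra safe, JIT compilation time could also be excluded entirely by launching each kernel once before the timed runs; a minimal warm-up sketch, using the variables from the listing below:

# warm-up launches so compilation happens outside the timed region
matmul_for[ bpg, tpb ](d_M, d_N, d_P, size_m, size_n, size_k)
matmul_while[ bpg, tpb ](d_M, d_N, d_P, size_m, size_n, size_k)
cuda.synchronize()

Either way, a difference of more than 20 seconds is far too large to be launch or compilation overhead.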

Thanks for any help.

And here is the source code:

#! /usr/bin/env python3

import numpy as np
from numba import cuda



@cuda.jit
def matrix_init(matrix, width, height, a_r, a_c):
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    if row < height and col < width:
        matrix[row, col] = a_r * (row + 1) + a_c * (col + 1)



@cuda.jit
def matmul_for(M, N, P, size_m, size_n, size_k):
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    if col < size_n and row < size_m:
        val = np.float32(0)
        for k in range(size_k):
            val += M[row, k] * N[k, col]
        P[row, col] = val



@cuda.jit
def matmul_while(M, N, P, size_m, size_n, size_k):
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    if col < size_n and row < size_m:
        val = np.float32(0)
        k = 0
        while k < size_k:
            val += M[row, k] * N[k, col]
            k += 1
        P[row, col] = val





# size of matrices
size_m = 15394
size_n = 12287
size_k = 14480

# memory allocation
d_M = cuda.device_array((size_m, size_k), dtype=np.float32)
d_N = cuda.device_array((size_k, size_n), dtype=np.float32)
d_P = cuda.device_array((size_m, size_n), dtype=np.float32)

# events
event_start = cuda.event(timing=True)
event_end = cuda.event(timing=True)

# initialization of matrices M and N
tpb = (32, 32)
bpg = ((size_k - 1) // tpb[0] + 1, (size_m - 1) // tpb[1] + 1)
matrix_init[ bpg, tpb ](d_M, size_k, size_m, 1, 0)
tpb = (32, 32)
bpg = ((size_n - 1) // tpb[0] + 1, (size_k - 1) // tpb[1] + 1)
matrix_init[ bpg, tpb ](d_N, size_n, size_k, 0, 1)
cuda.synchronize()


tpb = (32, 32)
bpg = ((size_n - 1) // tpb[0] + 1, (size_m - 1) // tpb[1] + 1)

# matmul_for kernel timing
event_start.record()
matmul_for[ bpg, tpb ](d_M, d_N, d_P, size_m, size_n, size_k)
event_end.record()
cuda.synchronize()
time_for = cuda.event_elapsed_time(event_start, event_end)

# matmul_while kernel timing
event_start.record()
matmul_while[ bpg, tpb ](d_M, d_N, d_P, size_m, size_n, size_k)
event_end.record()
cuda.synchronize()
time_while = cuda.event_elapsed_time(event_start, event_end)



# print timing results
print()
print("Matmul time using for:   %10.3f ms" % time_for)
print("Matmul time using while: %10.3f ms" % time_while)

I think this will be a similar typing issue to the one described here: help diagnosing a GPU performance issue · Issue #4647 · numba/numba · GitHub
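If it is the same typing problem, one quick way to test that hypothesis (just a sketch; matmul_for_i32 is an illustrative name) would be to give the kernel an explicit signature, so that the size arguments, and therefore the range induction variable, are typed as int32 instead of the default int64:

import numpy as np
from numba import cuda

# sketch: same kernel as matmul_for, but with an explicit signature so
# the size arguments (and hence the loop variable k) are typed as int32
@cuda.jit("void(float32[:,:], float32[:,:], float32[:,:], int32, int32, int32)")
def matmul_for_i32(M, N, P, size_m, size_n, size_k):
    col = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    row = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y

    if col < size_n and row < size_m:
        val = np.float32(0)
        for k in range(size_k):
            val += M[row, k] * N[k, col]
        P[row, col] = val

If this version matches the while-loop timing, the 64-bit induction variable would be the likely culprit.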