Some trouble of using CUDA kernel function to calculate heap sort

Some trouble of using CUDA kernel function to calculate heap sort

hey guys, I createdPreformatted text some cuda kernel -funtion for heap-sort.
When I use CPU to simulate calculation, everything is normal, but random errors occurred when calling kernel functions .Help please
Here are the codes,

This is a heap sort algorithm that supports multiple branches

---------test-------------------------------

from time import time
import math
import numpy as np
from numba import cuda

RelativePixel = np.dtype({'names': ['X',
                                    'Y',
                                    'proportion'],
                          'formats': [np.int32,
                                      np.int32,
                                      np.float64]
                          })

def doTest4():
    Rdata = np.random.randint(0, 1000, 1000)
    data1_host = np.zeros(1000, dtype=RelativePixel)
    data2_host = np.zeros(1000, dtype=RelativePixel)
    swapBuffer_host = np.zeros(1, dtype=RelativePixel)
    for i in range(0, 1000, 1):
        data1_host['X'][i] = 0
        data1_host['Y'][i] = i

        data2_host['X'][i] = 0
        data2_host['Y'][i] = i
        if i % 1 == 0:
            # data1_host['proportion'][i] = Rdata[i]
            # data2_host['proportion'][i] = Rdata[i]
            data1_host['proportion'][i] = i
            data2_host['proportion'][i] = i
        else:
            data1_host['proportion'][i] = -1
            data2_host['proportion'][i] = -1

    data1 = cuda.to_device(data1_host)
    swapBuff = cuda.to_device(swapBuffer_host)
    bastBranch, powerSeries_host = resources.getFavourBranch(1000, 3)
    powerSeries = cuda.to_device(powerSeries_host)
    multiTreeHeartSort_GPU[1, 1](data1, swapBuff, 1, bastBranch, powerSeries)
    cuda.synchronize()
    resources.multiTreeHeartSort(data2_host, swapBuffer_host, 'proportion', 1, bastBranch, powerSeries)

    data1.copy_to_host(data1_host)
    print(data1_host)
    print('-------------------------------------------------------------------')
    print(data2_host)


@cuda.jit(max_registers=64)
def multiTreeHeartSort_GPU(data, swapBuff, order, branch, powerSeries):
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    pos = tx + bx * bw
    if pos < swapBuff.shape[0]:
        resources.multiTreeHeartSort_GPU(data, swapBuff, 'proportion', order, branch, powerSeries)


if __name__ == '__main__':
    doTest4()

—resources-------------------------------

from numba import cuda, njit
import numpy as np
import math


def getFavourBranch(N, Branch):
    level = -1
    fx = 0
    while fx < N:
        level += 1
        fx += math.pow(Branch, level)

    powerSeries = np.zeros(level + 1, dtype=np.int64)
    fx = 0
    for i in range(level + 1):
        fx += math.pow(Branch, i)
        powerSeries[i] = fx
    return Branch, powerSeries


def multiTreeHeartSort(data, swapBuff, keyValueName, order, branch, powerSeries):
    """
    :param data:
    :param swapBuff:
    :param keyValueName:
    :param order: 1 is acs  -1 is decs
    :param branch: branch of multi-tree
    :param powerSeries: sum of from top to each level
    :return:
    """
    level = powerSeries.shape[0] - 1
    lastPoint = data.shape[0] - 1
    AO = 0
    for i in range(level - 1, -1, -1):
        for j in range(0, int(math.pow(branch, i))):
            AO += buildHead(data, i, j, lastPoint, level, swapBuff, keyValueName, order, branch, powerSeries)
            # buildHead(data, i, j, lastPoint, level, swapBuff, keyValueName, order, branch, powerSeries)
    AO = 0
    for i in range(data.shape[0] - 1, -1, -1):
        swap(data, 0, i, swapBuff)
        AO += buildHead(data, 0, 0, i - 1, level, swapBuff, keyValueName, order, branch, powerSeries)
        # buildHead(data, 0, 0, i - 1, level, swapBuff, keyValueName, order, branch, powerSeries)
    return data


def swap(data, A, B, swapBuff):
    swapBuff[0] = data[A]
    data[A] = data[B]
    data[B] = swapBuff[0]


def buildHead(data, x, y, lastPoint, level, swapBuff, keyValueName, order, branch, powerSeries):
    """
    :param data:
    :param x:
    :param y: 
    :param lastPoint:
    :param level:
    :param swapBuff:
    :param keyValueName:
    :param order: 1 is acs  -1 is decs
    :param branch:
    :param powerSeries:
    :return:
    """
    O = 0
    if x < 0 or x > level - 1:
        return O
        # return
    while x <= level - 1:
        extreme = 0
        extremePoint = -1
        mainNode = 0 if x == 0 else powerSeries[x - 1] + y
        y1 = -1
        O += 1
        upperNodes = powerSeries[ ] + y * branch
        for i in range(0, branch, 1):
            subNode = upperNodes + i
            O += 1
            if subNode <= lastPoint:
                gap = data[keyValueName][mainNode] - data[keyValueName][subNode]
                if order == -1:
                    if i == 0:
                        extreme = gap
                        if gap > 0:
                            extremePoint = subNode
                            y1 = i
                    elif gap > extreme and gap > 0:
                        extreme = gap
                        extremePoint = subNode
                        y1 = i
                elif order == 1:
                    if i == 0:
                        extreme = gap
                        if gap < 0:
                            extremePoint = subNode
                            y1 = i
                    elif gap < extreme and gap < 0:
                        extreme = gap
                        extremePoint = subNode
                        y1 = i
            else:
                break
        if extremePoint > -1:
            swap(data, mainNode, extremePoint, swapBuff)
            x += 1
            y = y * branch + y1
        else:
            return O
            # return
    return O
    # return


@cuda.jit(device=True)
def multiTreeHeartSort_GPU(data, swapBuff, keyValueName, order, branch, powerSeries):
    """
    :param data:
    :param swapBuff:
    :param keyValueName:
    :param order: 1 is acs  -1 is decs
    :param branch: branch of multi-tree
    :param powerSeries: sum of from top to each level
    :return:
    """
    level = powerSeries.shape[0] - 1
    lastPoint = data.shape[0] - 1
    for i in range(level - 1, -1, -1):
        for j in range(0, int(math.pow(branch, i))):
            buildHead_GPU(data, i, j, lastPoint, level, swapBuff, keyValueName, order, branch, powerSeries)
    for i in range(data.shape[0] - 1, -1, -1):
        swap_GPU(data, 0, i, swapBuff)
        buildHead_GPU(data, 0, 0, i - 1, level, swapBuff, keyValueName, order, branch, powerSeries)


@cuda.jit(device=True)
def swap_GPU(data, A, B, swapBuff):
    swapBuff[0] = data[A]
    data[A] = data[B]
    data[B] = swapBuff[0]


@cuda.jit(device=True)
def buildHead_GPU(data, x, y, lastPoint, level, swapBuff, keyValueName, order, branch, powerSeries):
    """
    :param data:
    :param x: 
    :param y:
    :param lastPoint: 
    :param level:
    :param swapBuff:
    :param keyValueName:
    :param order: 1 is acs  -1 is decs
    :param branch:
    :param powerSeries:
    :return:
    """
    if x < 0 or x > level - 1:
        return
    while x <= level - 1:
        extreme = 0
        extremePoint = -1
        mainNode = 0 if x == 0 else powerSeries[x - 1] + y
        y1 = -1
        upperNodes = powerSeries[x] + y * branch
        for i in range(0, branch, 1):
            subNode = upperNodes + i
            if subNode <= lastPoint:
                gap = data[mainNode][keyValueName] - data[subNode][keyValueName]
                if order == -1:
                    if i == 0:
                        extreme = gap
                        if gap > 0:
                            extremePoint = subNode
                            y1 = i
                    elif gap > extreme and gap > 0:
                        extreme = gap
                        extremePoint = subNode
                        y1 = i
                elif order == 1:
                    if i == 0:
                        extreme = gap
                        if gap < 0:
                            extremePoint = subNode
                            y1 = i
                    elif gap < extreme and gap < 0:
                        extreme = gap
                        extremePoint = subNode
                        y1 = i
            else:
                break
        if extremePoint > -1:
            swap_GPU(data, mainNode, extremePoint, swapBuff)
            x += 1
            y = y * branch + y1
        else:
            return
    return

The result of GPU is not completely sorted,random errors in sorting

–a-piece-of-data-sorted–by-GPU-------------------------------
[(0, 0, 0.) (0, 1, 1.) (0, 2, 2.) (0, 3, 3.)
(0, 4, 4.) (0, 5, 5.) (0, 6, 6.) (0, 7, 7.)
(0, 8, 8.) (0, 930, 930.) (0, 999, 999.) (0, 363, 363.)
(0, 9, 9.) (0, 10, 10.) (0, 11, 11.) (0, 12, 12.)
(0, 13, 13.) (0, 14, 14.) (0, 15, 15.) (0, 16, 16.)
(0, 17, 17.) (0, 18, 18.) (0, 19, 19.) (0, 20, 20.)
(0, 21, 21.) (0, 22, 22.) (0, 23, 23.) (0, 24, 24.)
(0, 25, 25.) (0, 26, 26.) (0, 876, 876.) (0, 903, 903.)
(0, 929, 929.) (0, 957, 957.) (0, 984, 984.) (0, 998, 998.)
(0, 345, 345.) (0, 354, 354.)
(0, 362, 362.) (0, 27, 27.)
(0, 28, 28.) (0, 29, 29.) (0, 30, 30.) (0, 31, 31.)
(0, 32, 32.) (0, 33, 33.) (0, 34, 34.) (0, 35, 35.)
(0, 36, 36.) (0, 37, 37.) (0, 38, 38.) (0, 39, 39.)
(0, 40, 40.) (0, 41, 41.) (0, 42, 42.) (0, 43, 43.)
(0, 44, 44.) (0, 45, 45.) (0, 46, 46.) (0, 47, 47.)
(0, 48, 48.) (0, 49, 49.) (0, 50, 50.) (0, 51, 51.)
(0, 52, 52.) (0, 53, 53.) (0, 54, 54.) (0, 55, 55.)
(0, 56, 56.) (0, 57, 57.) (0, 58, 58.) (0, 59, 59.)
(0, 60, 60.) (0, 61, 61.) (0, 62, 62.) (0, 63, 63.)
(0, 64, 64.) (0, 65, 65.) (0, 66, 66.) (0, 67, 67.)
(0, 68, 68.) (0, 69, 69.) (0, 70, 70.) (0, 71, 71.)
(0, 72, 72.) (0, 73, 73.) (0, 74, 74.) (0, 75, 75.)
(0, 76, 76.) (0, 77, 77.) (0, 78, 78.) (0, 79, 79.)
(0, 80, 80.) (0, 858, 858.) (0, 867, 867.) (0, 875, 875.)
(0, 885, 885.) (0, 894, 894.) (0, 902, 902.) (0, 912, 912.)
(0, 921, 921.) (0, 928, 928.) (0, 939, 939.) (0, 948, 948.)
(0, 956, 956.) (0, 966, 966.) (0, 975, 975.) (0, 983, 983.)
(0, 993, 993.) (0, 997, 997.) (0, 336, 336.) (0, 339, 339.)
(0, 342, 342.) (0, 344, 344.) (0, 348, 348.) (0, 351, 351.)
(0, 353, 353.) (0, 357, 357.) (0, 360, 360.) (0, 361, 361.)

–a-piece-of-data-sorted–by-CPU-------------------------------
[(0, 0, 0.) (0, 1, 1.) (0, 2, 2.) (0, 3, 3.)
(0, 4, 4.) (0, 5, 5.) (0, 6, 6.) (0, 7, 7.)
(0, 8, 8.) (0, 9, 9.) (0, 10, 10.) (0, 11, 11.)
(0, 12, 12.) (0, 13, 13.) (0, 14, 14.) (0, 15, 15.)
(0, 16, 16.) (0, 17, 17.) (0, 18, 18.) (0, 19, 19.)
(0, 20, 20.) (0, 21, 21.) (0, 22, 22.) (0, 23, 23.)
(0, 24, 24.) (0, 25, 25.) (0, 26, 26.) (0, 27, 27.)
(0, 28, 28.) (0, 29, 29.) (0, 30, 30.) (0, 31, 31.)
(0, 32, 32.) (0, 33, 33.) (0, 34, 34.) (0, 35, 35.)
(0, 36, 36.) (0, 37, 37.) (0, 38, 38.) (0, 39, 39.)
(0, 40, 40.) (0, 41, 41.) (0, 42, 42.) (0, 43, 43.)
(0, 44, 44.) (0, 45, 45.) (0, 46, 46.) (0, 47, 47.)
(0, 48, 48.) (0, 49, 49.) (0, 50, 50.) (0, 51, 51.)
(0, 52, 52.) (0, 53, 53.) (0, 54, 54.) (0, 55, 55.)
(0, 56, 56.) (0, 57, 57.) (0, 58, 58.) (0, 59, 59.)
(0, 60, 60.) (0, 61, 61.) (0, 62, 62.) (0, 63, 63.)
(0, 64, 64.) (0, 65, 65.) (0, 66, 66.) (0, 67, 67.)
(0, 68, 68.) (0, 69, 69.) (0, 70, 70.) (0, 71, 71.)
(0, 72, 72.) (0, 73, 73.) (0, 74, 74.) (0, 75, 75.)
(0, 76, 76.) (0, 77, 77.) (0, 78, 78.) (0, 79, 79.)
(0, 80, 80.) (0, 81, 81.) (0, 82, 82.) (0, 83, 83.)
(0, 84, 84.) (0, 85, 85.) (0, 86, 86.) (0, 87, 87.)
(0, 88, 88.) (0, 89, 89.) (0, 90, 90.) (0, 91, 91.)
(0, 92, 92.) (0, 93, 93.) (0, 94, 94.) (0, 95, 95.)
(0, 96, 96.) (0, 97, 97.) (0, 98, 98.) (0, 99, 99.)

I found the cause of the problem. Because this codefor j in range(0, int(math.pow(branch, i))):
the compiling of int() function in numba is floor and unlike python, it is approximate.cause the loop range error.I have replaced the code with for j in range(0,branch ** i): and the problem is solved.

the compiling of int() function in numba is floor and unlike python, it is approximate

Many thanks for posting the solution here. I think the problem is that math.pow is implemented differently in CUDA, rather than int() being different - int is the floor function in general:

In [1]: int(2.999999999999999)
Out[1]: 2

The math.pow implementation in CUDA results in math.pow(3, 1) being just slightly less than 3 though, so it ends up rounded down to 2. The following example demonstrates the issue in isolation:

from numba import cuda, njit
import numpy as np
import math

def inner(x, y):
    return int(math.pow(x, y))

cpu_jitted = njit(inner)

def cuda_jitted(x, y):
    inner_device = cuda.jit(device=True)(inner)

    @cuda.jit
    def outer(r, x, y):
        r[()] = inner_device(x[()], y[()])

    r_arr = np.array(0, dtype=np.int64)
    x_arr = np.array(0, dtype=np.int64)
    y_arr = np.array(0, dtype=np.int64)
    x_arr[()] = x
    y_arr[()] = y
    outer[1, 1](r_arr, x_arr, y_arr)

    return r_arr[()]

x = 3
y = 1

print(inner(x, y))
print(cpu_jitted(x, y))
print(cuda_jitted(x, y))

which produces:

3
3
2

This is not ideal - I’ve created CUDA: Numerical differences between CPython / CPU target and the CUDA target in `math.pow` · Issue #8218 · numba/numba · GitHub to keep track of it.

1 Like

Thank you for your attention

1 Like