import numpy as np
import math
import numba
from numba import cuda
# basic blelloch
@cuda.jit
def sum_reduction_kernel(array, niter):
k = cuda.grid(1)
n = array.shape[0]
if k < n:
for d in range(niter):
l = k + 2**d # neighbor offset is 1,2,4,8,16,...
if (k % 2**(d+1))== 0: # sum only if k itself is modulo 2,4,8,16,...
if l < n: # neighbor might not exist
array[k] = array[k] + array[l]
else:
break
cuda.syncthreads()
def sum_reduction(array):
n = len(array)
xcu = cuda.to_device(array)
threads_per_block = 128
blocks_per_grid = (n + (threads_per_block - 1)) // threads_per_block
niter = n.bit_length()
sum_reduction_kernel[blocks_per_grid, threads_per_block](xcu, niter)
xcpu = xcu.copy_to_host()
print(2**niter)
print(xcpu[::2**niter])
return xcu[0]
cpu_sum = lambda x:np.sum(x)
verbose = False
n = 100000
print(n.bit_length(), int(math.ceil(math.log(n,2))))
nchecks = 1
for i in range(nchecks):
# x = np.random.randn(n)
x = np.ones((n), dtype=np.float32)
y = sum_reduction(x)
y_cpu = cpu_sum(x)
diff = abs(y_cpu-y)
if diff > 1e-6:
print(f'{i}/100 ERROR: ', y, ' ', y_cpu)
elif verbose:
print(f'{i}/100 SUCCESS: ', y, ' ', y_cpu)
print('done')
my code works for small arrays (<20000) but not bigger… i do not understand why… i am trying to reproduce the basic sum parallel algorithm