Basic sum scan algorithm

import numpy as np
import math
import numba
from numba import cuda


# basic blelloch
@cuda.jit
def sum_reduction_kernel(array, niter):
    k = cuda.grid(1) 
    n = array.shape[0]
    if k < n:
        for d in range(niter):
            l = k + 2**d # neighbor offset is 1,2,4,8,16,...
            if (k % 2**(d+1))== 0: # sum only if k itself is modulo 2,4,8,16,...
                if l < n: # neighbor might not exist
                    array[k] = array[k] + array[l]
            else:
                break
        cuda.syncthreads()

def sum_reduction(array):
    n = len(array)
    xcu = cuda.to_device(array)

    threads_per_block = 128
    blocks_per_grid = (n + (threads_per_block - 1)) // threads_per_block
    
    niter = n.bit_length()
    sum_reduction_kernel[blocks_per_grid, threads_per_block](xcu, niter)
    xcpu = xcu.copy_to_host()
    print(2**niter)
    print(xcpu[::2**niter])
    return xcu[0]

cpu_sum = lambda x:np.sum(x)


verbose = False
n = 100000
print(n.bit_length(), int(math.ceil(math.log(n,2))))
nchecks = 1
for i in range(nchecks):
#     x = np.random.randn(n)
    x = np.ones((n), dtype=np.float32)
    y = sum_reduction(x)
    y_cpu = cpu_sum(x)
    diff = abs(y_cpu-y)
    if diff > 1e-6:
        print(f'{i}/100 ERROR: ', y, ' ', y_cpu)
    elif verbose:
        print(f'{i}/100 SUCCESS: ', y, ' ', y_cpu)
print('done')

my code works for small arrays (<20000) but not bigger… i do not understand why… i am trying to reproduce the basic sum parallel algorithm