This example in the documentation (at …/examples.html#shared-memory-reduction — I can’t add a link) seems to assume a single block.
import numpy as np
from numba import cuda
from numba.types import int32
# Generate the input: the integers 0..1023, copied to the device.
nelem = 1024
a = cuda.to_device(np.arange(nelem))
@cuda.jit
def array_sum(data):
    """Sum ``data`` with a shared-memory tree reduction; thread 0 writes
    the result back into ``data[0]``.

    NOTE(review): as the surrounding discussion shows, this is only
    correct for a single-block launch — with several blocks each one
    races to write the same output slot.
    """
    t = cuda.threadIdx.x
    if t < len(data):
        g = cuda.grid(1)  # global index; equals t when there is one block

        # Per-block scratch space. The size must be a compile-time
        # constant, hence the module-level `nelem`.
        buf = cuda.shared.array(nelem, int32)
        buf[t] = data[g]

        # Every thread's load must land in shared memory before
        # anyone starts reducing.
        cuda.syncthreads()

        # Pairwise tree reduction: each active thread adds a partner
        # `stride` slots away, then the stride doubles.
        stride = 1
        while stride < cuda.blockDim.x:
            if t % (2 * stride) == 0:
                buf[t] += buf[t + stride]
            stride *= 2
            cuda.syncthreads()

        # buf[0] now holds the block-local sum.
        if t == 0:
            data[t] = buf[t]
# Launch: 2 blocks of 512 threads, then compare with a host-side sum.
array_sum[2, 512](a)
print(a[0])                  # the docs claim 523776
print(sum(np.arange(1024)))  # 523776
The actual output is:
130816
523776
With `array_sum[2, 1024](a)`, the result is 0.
I am confused by this part:
tid = cuda.threadIdx.x
size = len(data)
if tid < size:
i = cuda.grid(1)
If there is one block, then `i` simply equals `tid`.
Here is an equivalent implementation that also works when the number of elements is not a power of 2.
import numpy as np
from numba import cuda, int32
# Host data, a device copy, and the (compile-time) element count.
n = 1_000
a = np.arange(n)
a_gpu = cuda.to_device(a)
@cuda.jit
def sum_reduce_gpu(a):
    """Single-block shared-memory sum of ``a``; the total lands in ``a[0]``.

    Unlike the docs example, the ``tid + s < n`` bounds check makes this
    correct for lengths that are not a power of two.
    """
    tid = cuda.threadIdx.x  # this thread's position within the 1-D block

    # Shared scratch buffer. The shape must be a "simple constant" at
    # compile time, so the module-level `n` is used (len(a) would not work).
    sm = cuda.shared.array(n, int32)

    # Each thread copies one element, then a barrier guarantees every
    # slot is filled before the reduction begins.
    sm[tid] = a[tid]
    cuda.syncthreads()

    s = 1
    while s < n:
        # Only add a partner slot that actually exists — this is what
        # handles non-power-of-two lengths.
        if tid % (2 * s) == 0 and tid + s < n:
            sm[tid] += sm[tid + s]
        s *= 2
        # Barrier: the next pass reads the slots written in this one.
        cuda.syncthreads()

    # A single thread publishes the final total.
    if tid == 0:
        a[tid] = sm[0]
# One block of n threads; compare the GPU result with NumPy's own sum.
sum_reduce_gpu[1, n](a_gpu)
print(a.sum(), a_gpu[0])
If I wanted to use multiple blocks, would I need a two-step process? Each block would be responsible for one part of the array, and then at the end a single thread (or a second kernel) would combine the per-block partial results.