In the following example, I have a simple CPU function:
import numpy as np
from numba import njit, cuda

@njit
def cpu_func(a, b, c, d):
    # For each row i, run a backward recurrence that repeatedly folds
    # column j + 1 into column j; the result ends up in column 0.
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]
and an equivalent GPU implementation of that function:
@cuda.jit
def _gpu_func(a, b, c, d, out):
    # One thread per row; the guard handles any excess threads in the grid.
    i = cuda.grid(1)
    if i < len(a):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
        out[i] = a[i, 0]
def gpu_func(a, b, c, d):
    # Copy the inputs to the device, launch the kernel, copy the result back.
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)
    d_d = cuda.to_device(d)
    d_out = cuda.device_array(len(a))
    threads_per_block = 64
    blocks_per_grid = 128
    _gpu_func[blocks_per_grid, threads_per_block](d_a, d_b, d_c, d_d, d_out)
    out = d_out.copy_to_host()
    return out
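(As an aside, the fixed 128 × 64 launch is far larger than my length-1 input, though the i < len(a) guard keeps it correct. The more common pattern, shown here only as a sketch and not what I ran above, sizes the grid from the input:)

    # Sketch of the usual launch-sizing idiom (not the launch used above):
    threads_per_block = 64
    blocks_per_grid = (len(a) + threads_per_block - 1) // threads_per_block  # ceil division
    _gpu_func[blocks_per_grid, threads_per_block](d_a, d_b, d_c, d_d, d_out)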
However, when I call both functions with the same inputs, I get very different results:
a = np.array([[
1.150962188573305234e+00, 1.135924188549360281e+00, 1.121074496043255930e+00, 1.106410753047196494e+00, 1.091930631080626046e+00,
1.077631830820479752e+00, 1.063512081736074144e+00, 1.049569141728566635e+00, 1.035800796774924315e+00, 1.022204860576360286e+00,
1.008779174211164253e+00, 9.955216057918859773e-01, 9.824300501268078412e-01, 9.695024283856580327e-01, 9.567366877695103744e-01,
9.441308011848159598e-01, 9.316827669215179686e-01, 9.193906083351972569e-01, 9.072523735331967654e-01, 8.952661350646772265e-01,
8.834299896145549891e-01, 8.717420577012704452e-01, 8.602004833783417626e-01, 8.488034339396569594e-01, 8.375490996284553624e-01,
8.264356933499517055e-01, 8.154614503875622367e-01, 8.046246281226814290e-01, 7.939235057579688837e-01, 7.833563840441006842e-01,
7.729215850099430130e-01, 7.626174516961046201e-01, 7.524423478918242925e-01, 7.423946578751554615e-01, 7.324727861564024334e-01,
7.226751572247696043e-01, 7.130002152981841368e-01, 7.034464240762497989e-01, 6.940122664962961041e-01, 6.846962444924807878e-01,
6.754968787579095357e-01, 6.664127085097342196e-01, 6.574422912571935562e-01, 6.485842025725561122e-01, 6.398370358649341227e-01,
6.311994021569281577e-01, 6.226699298640693270e-01, 6.142472645770222783e-01, 6.059300688465173446e-01, 5.977170219709739829e-01,
0.000000000000000000e+00
]])
b = np.array([1.533940813369776279e+00])
c = np.array([1.018336794718317062e+00])
d = np.array([50], dtype=np.int64)
cpu_func(a.copy(), b, c, d) # Produces array([0.74204214])
gpu_func(a.copy(), b, c, d) # Produces array([0.67937252])
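The gap is enormous relative to double-precision rounding for any single operation. A quick comparison I added for illustration:

cpu_out = cpu_func(a.copy(), b, c, d)
gpu_out = gpu_func(a.copy(), b, c, d)
print(np.abs(cpu_out - gpu_out))      # ~6.3e-02
print(np.allclose(cpu_out, gpu_out))  # False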
I understand that precision can be an issue and that each compiler may optimize the code differently (e.g., fused multiply-add contraction). But setting that aside for now: is there some way to ensure/force the CPU code and the GPU code to both produce the SAME output?
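For context, here is a sensitivity check I sketched (my own addition, using np.nextafter to perturb one input element by a single ULP). If the recurrence amplifies rounding error, the two CPU results should diverge noticeably, which would suggest that one differently-rounded operation on the GPU (such as an FMA) could account for the whole gap:

# Sensitivity sketch (illustrative only): nudge one element of `a` by 1 ULP
# and rerun the CPU version to see how much the output moves.
a_perturbed = a.copy()
a_perturbed[0, 0] = np.nextafter(a_perturbed[0, 0], np.inf)  # +1 ULP
print(cpu_func(a.copy(), b, c, d))
print(cpu_func(a_perturbed, b, c, d))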