cuda.jit and njit giving different results

In the following example, I have a simple CPU function:

import numpy as np
from numba import njit, cuda

@njit
def cpu_func(a, b, c, d):
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]

    return a[:, 0]

and what is intended to be an equivalent GPU implementation:

@cuda.jit
def _gpu_func(a, b, c, d, out):
    i = cuda.grid(1)
    if i < len(a):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
        out[i] = a[i, 0]

def gpu_func(a, b, c, d):
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)
    d_d = cuda.to_device(d)
    d_out = cuda.device_array(len(a))

    threads_per_block = 64
    blocks_per_grid = 128  # far more threads than rows; the kernel's bounds check guards the excess
    _gpu_func[blocks_per_grid, threads_per_block](d_a, d_b, d_c, d_d, d_out)
    out = d_out.copy_to_host()

    return out

However, when I call BOTH functions with the same inputs, I am getting very different results:

a = np.array([[
    1.150962188573305234e+00, 1.135924188549360281e+00, 1.121074496043255930e+00, 1.106410753047196494e+00, 1.091930631080626046e+00,
    1.077631830820479752e+00, 1.063512081736074144e+00, 1.049569141728566635e+00, 1.035800796774924315e+00, 1.022204860576360286e+00,
    1.008779174211164253e+00, 9.955216057918859773e-01, 9.824300501268078412e-01, 9.695024283856580327e-01, 9.567366877695103744e-01,
    9.441308011848159598e-01, 9.316827669215179686e-01, 9.193906083351972569e-01, 9.072523735331967654e-01, 8.952661350646772265e-01,
    8.834299896145549891e-01, 8.717420577012704452e-01, 8.602004833783417626e-01, 8.488034339396569594e-01, 8.375490996284553624e-01,
    8.264356933499517055e-01, 8.154614503875622367e-01, 8.046246281226814290e-01, 7.939235057579688837e-01, 7.833563840441006842e-01,
    7.729215850099430130e-01, 7.626174516961046201e-01, 7.524423478918242925e-01, 7.423946578751554615e-01, 7.324727861564024334e-01,
    7.226751572247696043e-01, 7.130002152981841368e-01, 7.034464240762497989e-01, 6.940122664962961041e-01, 6.846962444924807878e-01,
    6.754968787579095357e-01, 6.664127085097342196e-01, 6.574422912571935562e-01, 6.485842025725561122e-01, 6.398370358649341227e-01,
    6.311994021569281577e-01, 6.226699298640693270e-01, 6.142472645770222783e-01, 6.059300688465173446e-01, 5.977170219709739829e-01,
    0.000000000000000000e+00
]])
b = np.array([1.533940813369776279e+00])
c = np.array([1.018336794718317062e+00])
d = np.array([50], dtype=np.int64)

cpu_func(a.copy(), b, c, d)  # Produces array([0.74204214])
gpu_func(a.copy(), b, c, d)  # Produces array([0.67937252])

I understand that precision can be an issue and that each compiler may optimize the code differently (e.g., contracting a multiply and an add into a fused multiply-add), but, setting that aside for now, is there some way to ensure/force the CPU code and the GPU code to both produce the SAME output?
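One knob I know of on the GPU side is to make the fused multiply-add explicit instead of leaving the contraction of b[i] * a[i, j] + ... to the compiler. This is a sketch (assuming a reasonably recent Numba, whose CUDA target documents a cuda.fma() function); it pins down the kernel's rounding behavior, and a matching true FMA is then needed on the CPU side (see the update below):

@cuda.jit
def _gpu_func_explicit_fma(a, b, c, d, out):
    i = cuda.grid(1)
    if i < len(a):
        for l in range(d[i], 0, -1):
            for j in range(l):
                # One correctly rounded fused multiply-add per update,
                # instead of an implicit, compiler-chosen contraction.
                a[i, j] = cuda.fma(b[i], a[i, j], (1.0 - b[i]) * a[i, j + 1]) / c[i]
        out[i] = a[i, 0]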

For completeness, I also attempted the same computation using the mpmath package, which provides arbitrary-precision arithmetic:

import mpmath
mpmath.mp.dps = 100  # Work with 100 significant decimal digits

def mpmath_cpu_func(a, b, c, d):
    for i in range(a.rows):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]

# Convert the inputs to high-precision mpmath values (parsed from each double's decimal string)
mp_a = mpmath.matrix(a.copy())
for i in range(a.shape[0]):
    for j in range(a.shape[1]):
        mp_a[i, j] = mpmath.mpf(a[i, j].astype(str))
mp_b = [mpmath.mpf(b[0].astype(str))]
mp_c = [mpmath.mpf(c[0].astype(str))]

mpmath_cpu_func(mp_a, mp_b, mp_c, d)  # Produces 0.6449015342958763156413719663014237700252472529553791866336058829452386808983049961922292904843391669

Note that the high-precision result differs from both the CPU and GPU results above.
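The spread between all three results is consistent with the recurrence itself being ill-conditioned: each update mixes a[i, j] and a[i, j + 1] with coefficients b[i] ≈ 1.53 and 1 - b[i] ≈ -0.53, whose magnitudes sum to about 2.07, so rounding errors can grow by a factor approaching (2.07 / 1.02)^50 ≈ 10^15 over the 50 sweeps. A quick way to check this empirically (a sketch reusing cpu_func from above; the exact magnitude will vary) is to bump one input by a single ulp and watch the output move:

# Perturb one input element by the smallest representable amount.
a_pert = a.copy()
a_pert[0, 25] = np.nextafter(a_pert[0, 25], np.inf)

base = cpu_func(a.copy(), b, c, d)[0]
pert = cpu_func(a_pert, b, c, d)[0]
# If this difference is many orders of magnitude larger than ~1e-16,
# the recurrence amplifies rounding error, and no compiler flag will
# make double precision agree with the true value.
print(abs(pert - base))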

Update

I understand that numerical precision can be an issue (here it is rounding error and its amplification rather than overflow/underflow) and that each compiler may contract operations differently (e.g., fused multiply-add), but, setting that aside for now, is there some way to ensure/force the CPU code and the GPU code to both produce the SAME output (up to 4-5 decimal places)? Or maybe there is some clever way to improve the precision?
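On the "improve the precision" front, one low-effort experiment is to carry the recurrence in np.longdouble. A sketch, with caveats: np.longdouble is 80-bit extended precision on most x86 Linux/macOS builds but plain double on Windows, and Numba does not support it, so this runs as ordinary Python:

def longdouble_cpu_func(a, b, c, d):
    # Same recurrence, carried in extended precision where the platform provides it.
    a = a.astype(np.longdouble)
    b = b.astype(np.longdouble)
    c = c.astype(np.longdouble)
    one = np.longdouble(1)
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (one - b[i]) * a[i, j + 1]) / c[i]
    # Cast back to double for comparison with the other implementations.
    return a[:, 0].astype(np.float64)

The result can be compared against the mpmath reference above to see how much of the error the extra bits recover.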

I was able to use the pyfma package in the CPU code (without numba's njit!) to produce a result that matches the GPU code:

import numpy as np
import pyfma
import _pyfma
from numpy.typing import ArrayLike

def monkey_patch_fma(a: ArrayLike, b: ArrayLike, c: ArrayLike) -> np.ndarray:
    """
    This is a simple monkey patch to make `pyfma` compatible with NumPy v2.0

    Based on this `pyfma` PR - https://github.com/nschloe/pyfma/pull/17/files
    """
    a = np.asarray(a)
    b = np.asarray(b)
    c = np.asarray(c)
    # np.find_common_type was removed in NumPy 2.0; promote_types is its replacement
    dtype = np.promote_types(np.promote_types(a.dtype, b.dtype), c.dtype)
    a = a.astype(dtype)
    b = b.astype(dtype)
    c = c.astype(dtype)

    if dtype == np.single:
        return _pyfma.fmaf(a, b, c)
    elif dtype == np.double:
        return _pyfma.fma(a, b, c)

    assert dtype == np.longdouble
    return _pyfma.fmal(a, b, c)

pyfma.fma = monkey_patch_fma

def fma_cpu_func(a, b, c, d):
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = pyfma.fma(b[i], a[i, j], (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]

# Same a, b, c, d inputs as defined above.

fma_cpu_func(a.copy(), b, c, d)  # Produces array([0.67937252])

So the FMA-based CPU output and the GPU output are now identical, and both are “closer” to the high-precision reference (0.644901534295876315641…) than the original njit result, but they are still inaccurate.
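Finally, if you can require Python 3.13+, the standard library gained math.fma(), which should make the pyfma dependency and the monkey patch unnecessary for this scalar loop (a sketch I have not benchmarked):

import math

def stdlib_fma_cpu_func(a, b, c, d):
    # Same loop as fma_cpu_func, using the stdlib's correctly rounded FMA.
    for i in range(len(a)):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = math.fma(b[i], a[i, j], (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]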