Cuda.jit and njit giving different results

For completeness, I attempted to perform the same computation using the mpmath package which claims to provide fixed precision capabilities:

import mpmath
mpmath.mp.dps = 100  # Precision

def mpmath_cpu_func(a, b, c, d):
    for i in range(a.rows):
        for l in range(d[i], 0, -1):
            for j in range(l):
                a[i, j] = (b[i] * a[i, j] + (1.0 - b[i]) * a[i, j + 1]) / c[i]
    return a[:, 0]

# Convert inputs to mpmath types with high precision
mp_a = mpmath.matrix(a.copy())
for i in range(a.shape[0]):
    for j in range(a.shape[1]):
        mp_a[i, j] = mpmath.mpf(a[i, j].astype(str))
mp_b = [mpmath.mpf(b[0].astype(str))]
mp_c = [mpmath.mpf(c[0].astype(str))]

mpmath_cpu_func(mp_a, mp_b, mp_c, d)  # Produces 0.6449015342958763156413719663014237700252472529553791866336058829452386808983049961922292904843391669

Note that the high precision result is also different from the CPU and GPU results above.