 # Numerical difference in matrix multiplication

I am seeing numerical differences (though small) between custom numba.cuda matrix multiplication and that of numpy.matmul and torch.matmul, and even a naive matmul function. I am seeing this using torch as well as numpy. Here is an example:

``````from __future__ import division
from numba import cuda, float32
import numpy
import math

# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 4

@cuda.jit
def fast_matmul(A, B, C):
"""
Perform matrix multiplication of C = A * B
Each thread computes one element of the result matrix C
"""

# Define an array in the shared memory
# The size and type of the arrays must be known at compile time
sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

x, y = cuda.grid(2)

tx = cuda.threadIdx.x
ty = cuda.threadIdx.y

if x >= C.shape and y >= C.shape:
# Quit if (x, y) is outside of valid C boundary
return

# Each thread computes one element in the result matrix.
# The dot product is chunked into dot products of TPB-long vectors.
tmp = 0.
for i in range(int(A.shape / TPB)):
# Preload data into shared memory
sA[tx, ty] = A[x, ty + i * TPB]
sB[tx, ty] = B[tx + i * TPB, y]

# Wait until all threads finish preloading
cuda.syncthreads()

# Computes partial product on the shared memory
for j in range(TPB):
tmp += sA[tx, j] * sB[j, ty]

# Wait until all threads finish computing
cuda.syncthreads()

C[x, y] = tmp

# The data array
#A = numpy.full((TPB*2, TPB*3), 3, numpy.float) # [32 x 48] matrix containing all 3's
#B = numpy.full((TPB*3, TPB*1), 4, numpy.float) # [48 x 16] matrix containing all 4's
A = numpy.random.rand(TPB*2, TPB*3).astype(float)
B = numpy.random.rand(TPB*3, TPB*1).astype(float)

A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
C_global_mem = cuda.device_array((TPB*2, TPB*1)) # [32 x 16] matrix result

# Configure the blocks
threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(A.shape / threadsperblock))
blockspergrid_y = int(math.ceil(B.shape / threadsperblock))
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Start the kernel
fast_matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
res = C_global_mem.copy_to_host()

print('A.shape',A.shape)
print('B.shape',B.shape)
print('A',A)
print('B',B)
print('res.shape',res.shape)
#print('res',res)

print('numpy.abs(numpy.matmul(A,B) - res).sum()',numpy.abs(numpy.matmul(A,B) - res).sum())

``````

These numerical differences aren’t unexpected, because floating point arithmetic isn’t associative. With a slightly modified example that seeds the RNG for reproducibility:

``````from __future__ import division
from numba import cuda, float32
import numpy
import math

# Controls threads per block and shared memory usage.
# The computation will be done on blocks of TPBxTPB elements.
TPB = 4

@cuda.jit
def fast_matmul(A, B, C):
"""
Perform matrix multiplication of C = A * B
Each thread computes one element of the result matrix C
"""

# Define an array in the shared memory
# The size and type of the arrays must be known at compile time
sA = cuda.shared.array(shape=(TPB, TPB), dtype=float32)
sB = cuda.shared.array(shape=(TPB, TPB), dtype=float32)

x, y = cuda.grid(2)

tx = cuda.threadIdx.x
ty = cuda.threadIdx.y

if x >= C.shape and y >= C.shape:
# Quit if (x, y) is outside of valid C boundary
return

# Each thread computes one element in the result matrix.
# The dot product is chunked into dot products of TPB-long vectors.
tmp = 0.
for i in range(int(A.shape / TPB)):
# Preload data into shared memory
sA[tx, ty] = A[x, ty + i * TPB]
sB[tx, ty] = B[tx + i * TPB, y]

# Wait until all threads finish preloading
cuda.syncthreads()

# Computes partial product on the shared memory
for j in range(TPB):
tmp += sA[tx, j] * sB[j, ty]

# Wait until all threads finish computing
cuda.syncthreads()

C[x, y] = tmp

# Seed RNG for reproducibility
numpy.random.seed(1729)

# The data array
#A = numpy.full((TPB*2, TPB*3), 3, numpy.float) # [32 x 48] matrix containing all 3's
#B = numpy.full((TPB*3, TPB*1), 4, numpy.float) # [48 x 16] matrix containing all 4's
A = numpy.random.rand(TPB*2, TPB*3).astype(float)
B = numpy.random.rand(TPB*3, TPB*1).astype(float)

A_global_mem = cuda.to_device(A)
B_global_mem = cuda.to_device(B)
C_global_mem = cuda.device_array((TPB*2, TPB*1)) # [32 x 16] matrix result

# Configure the blocks
threadsperblock = (TPB, TPB)
blockspergrid_x = int(math.ceil(A.shape / threadsperblock))
blockspergrid_y = int(math.ceil(B.shape / threadsperblock))
blockspergrid = (blockspergrid_x, blockspergrid_y)

# Start the kernel
fast_matmul[blockspergrid, threadsperblock](A_global_mem, B_global_mem, C_global_mem)
res = C_global_mem.copy_to_host()

print('A.shape',A.shape)
print('B.shape',B.shape)
print('A',A)
print('B',B)
print('res.shape',res.shape)
#print('res',res)

print('numpy.abs(numpy.matmul(A,B) - res).sum()',numpy.abs(numpy.matmul(A,B) - res).sum())
``````

I get

``````numpy.abs(numpy.matmul(A,B) - res).sum() 1.204777436258908e-06
``````

which seems reasonable at a glance for matrix multiplication using single precision arithmetic.