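I am very surprised that np.dot is 10x slower than an explicit loop in numba. Does anyone have an idea why? In my example below, go_fast computes each dot product with an explicit loop, while go_fast_np calls the array dot product directly. go_fast_np is 10x slower than go_fast. The timings below are the output of the script (five calls to go_fast, then five calls to go_fast_np):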
0.653007984161377
0.012694835662841797
0.0072171688079833984
0.0067157745361328125
0.007045269012451172
0.543813943862915
0.041455984115600586
0.029541015625
0.03484296798706055
0.03132915496826172
import numpy as np
from numba import njit
import time
@njit
def go_fast(a, b):
    """Return the matrix of dot products between the rows of ``a`` and ``b``.

    Entry ``[i, j]`` of the result is ``sum(a[i, k] * b[j, k])`` over ``k``,
    accumulated with explicit loops into a float array of shape
    ``(a.shape[0], b.shape[0])``.

    Args:
        a: 2-D array.
        b: 2-D array with the same number of columns as ``a``.

    Returns:
        A new ``np.zeros``-initialised array holding the accumulated products.

    Raises:
        ValueError: if ``a`` and ``b`` have a different number of columns.
    """
    # Function is compiled and runs in machine code
    m = a.shape[0]
    n = b.shape[0]
    f = a.shape[1]
    # Numba does not bounds-check array indexing by default, so a mismatched
    # inner dimension would silently read past the end of a row (or ignore
    # trailing columns) instead of failing. Reject it explicitly.
    if b.shape[1] != f:
        raise ValueError("a and b must have the same number of columns")
    trace = np.zeros((m, n))
    for i in range(m):
        for j in range(n):
            for k in range(f):
                trace[i, j] += a[i, k] * b[j, k]
    return trace
@njit
def go_fast_np(a, b):
    """Return the matrix of row-by-row dot products, one ``dot`` call per entry.

    Entry ``[i, j]`` of the result is the dot product of row ``i`` of ``a``
    with row ``j`` of ``b``; the result has shape
    ``(a.shape[0], b.shape[0])``.
    """
    # Open question from the author: why is this 10x slower than go_fast?
    # NOTE(review): presumably the fixed cost of slicing two rows and
    # dispatching one library dot call per (i, j) pair dominates when the rows
    # are short; confirm by profiling before drawing conclusions.
    rows_a = a.shape[0]
    rows_b = b.shape[0]
    out = np.zeros((rows_a, rows_b))
    for i in range(rows_a):
        for j in range(rows_b):
            out[i, j] = np.dot(a[i, :], b[j, :])
    return out
if __name__ == '__main__':
    # Benchmark driver: time five calls of each implementation on the same
    # inputs and print the elapsed seconds of every call.
    x = np.arange(20000).astype(float).reshape(1000, 20)
    y = np.arange(10000).astype(float).reshape(500, 20)

    # The first call of each jitted function also pays for JIT compilation,
    # so the first printed number of each group is not representative.
    # perf_counter() is used because it is monotonic and high-resolution,
    # unlike time.time().
    for _ in range(5):
        start = time.perf_counter()
        s = go_fast(x, y)
        print(time.perf_counter() - start)

    print('=' * 60)

    for _ in range(5):
        start = time.perf_counter()
        s1 = go_fast_np(x, y)
        print(time.perf_counter() - start)