This bug first occurs in version 0.54 (0.49-0.53 are OK). This results in a perf…ormance decrease of a factor of 4 in a bit optimized summation and a factor of 2 in a naive implementation.
**Example**
```python
import numpy as np
import numba as nb
import time
f = np.random.rand(32,33,2000,64)
b = np.random.rand(32,64)
@nb.njit(fastmath=True)
def test_numba_1(f,b):
fnew = np.zeros((f.shape[1],f.shape[2]))
for i in range(f.shape[0]):
for j in range(f.shape[1]):
for k in range(f.shape[2]):
for l in range(f.shape[3]):
fnew[j,k] += f[i,j,k,l] * b[i,l]
return fnew
@nb.njit(fastmath=True)
def test_numba_2(f,b):
fnew = np.zeros((f.shape[1],f.shape[2]))
for i in range(f.shape[0]):
for j in range(f.shape[1]):
for k in range(f.shape[2]):
acc=fnew[j,k]
for l in range(f.shape[3]):
acc += f[i,j,k,l] * b[i,l]
fnew[j,k]=acc
return fnew
%timeit test_numba_1(f,b)
%timeit test_numba_2(f,b)
#'0.53.1' Last version without bug
#108 ms ± 1.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#53.9 ms ± 809 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
#'0.54.0' First version with bug
#216 ms ± 4.68 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#200 ms ± 2.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#'0.55.0'
#218 ms ± 5.36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#201 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
```
Another question would be if, a rewrite pass is possible in Numba to optimize `test_numba_1` automatically to `test_numba_2`. Clang shows exactly the same behavior as Numba before version 0.54, therefore this likely has to be fixed before generating LLVM-IR.
```c
//clang v11
//clang -shared -O3 -march=native -ffast-math -o summation.dll summation.c
__declspec(dllexport) void eq_nb_1(size_t s0,size_t s1,size_t s2,size_t s3,double f[][s1][s2][s3],double b[][s3],double fnew[][s2])
{
for (size_t i=0;i<s0;i++){
for (size_t j=0;j<s1;j++){
for (size_t k=0;k<s2;k++){
for (size_t l=0;l<s3;l++){
fnew[j][k]+=f[i][j][k][l]*b[i][l];
}
}
}
}
}
__declspec(dllexport) void eq_nb_2(size_t s0,size_t s1,size_t s2,size_t s3,double f[][s1][s2][s3],double b[][s3],double fnew[][s2])
{
for (size_t i=0;i<s0;i++){
for (size_t j=0;j<s1;j++){
for (size_t k=0;k<s2;k++){
double acc=fnew[j][k];
for (size_t l=0;l<s3;l++){
acc+=f[i][j][k][l]*b[i][l];
}
fnew[j][k]=acc;
}
}
}
}
```
```python
import numpy as np
import ctypes
lib = ctypes.cdll.LoadLibrary("summation.dll")
def eq_nb_1(f,b):
fnew = np.zeros((f.shape[1],f.shape[2]))
s_0=ctypes.c_size_t(f.shape[0])
s_1=ctypes.c_size_t(f.shape[1])
s_2=ctypes.c_size_t(f.shape[2])
s_3=ctypes.c_size_t(f.shape[3])
f_p =f.ctypes
b_p =b.ctypes
fnew_p=fnew.ctypes
lib.eq_nb_1(s_0,s_1,s_2,s_3,f_p,b_p,fnew_p)
return fnew
def eq_nb_2(f,b):
fnew = np.zeros((f.shape[1],f.shape[2]))
s_0=ctypes.c_size_t(f.shape[0])
s_1=ctypes.c_size_t(f.shape[1])
s_2=ctypes.c_size_t(f.shape[2])
s_3=ctypes.c_size_t(f.shape[3])
f_p =f.ctypes
b_p =b.ctypes
fnew_p=fnew.ctypes
lib.eq_nb_2(s_0,s_1,s_2,s_3,f_p,b_p,fnew_p)
return fnew
%timeit eq_nb_1(f,b)
%timeit eq_nb_2(f,b)
#100 ms ± 311 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
#50.5 ms ± 1.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
```