I was wondering whether declaring the argument as a contiguous array would help (it helps a bit) and whether a C implementation with AVX instructions would be faster (it is — roughly twice as fast as NumPy). The AVX implementation might serve as a stronger baseline for Numba to target, since it runs approximately at the memory-bandwidth limit. There is not much reason why such a simple function should be slower than memcpy, since hardly any computation is being done here.
| Implementation | Time (Microseconds) |
|---|---|
| Numba | 1014.748 ± 160.301 |
| Numba, contiguous | 792.670 ± 47.724 |
| Numpy | 429.551 ± 42.410 |
| C with AVX | 235.201 ± 31.141 |
import os, time, ctypes, numba, numpy as np
# Emit the C source for an AVX row-max kernel to mylib.c.
# NOTE(review): the kernel assumes a square n x n, C-contiguous matrix of
# doubles whose side length is a multiple of 4 (both loops run to the same
# n, and i/j advance in steps of 4 with no remainder handling). Rows are
# processed four at a time: each row keeps a 4-wide running vector max,
# which is then reduced horizontally with fmax. `res[k][0]`-style lane
# indexing on __m256d is a GCC/Clang vector extension, not portable ISO C.
with open("mylib.c", "w") as f:
    f.write("""
#include <immintrin.h>
#include <math.h>
// Note: Only works for array dimensions which are multiples of 4
void reduce(double *result, double *x, int n){
    for (int i = 0; i < n; i += 4){
        __m256d res[4];
        for (int k = 0; k < 4; k++){
            res[k] = _mm256_set1_pd(-INFINITY);
        }
        for (int j = 0; j < n; j += 4){
            for (int k = 0; k < 4; k++){
                res[k] = _mm256_max_pd(res[k], _mm256_loadu_pd(x + (i + k) * n + j));
            }
        }
        for (int k = 0; k < 4; k++){
            result[i + k] = fmax(fmax(res[k][0], res[k][1]), fmax(res[k][2], res[k][3]));
        }
    }
}
""")
# Compile the C kernel into a shared library and load it via ctypes.
# -fPIC is required for shared objects on most x86-64 toolchains.
compile_status = os.system("gcc -mavx -O3 -fPIC -shared mylib.c -o mylib.so")
if compile_status != 0:
    # Fail early with a clear message instead of a confusing CDLL error later.
    raise RuntimeError(f"gcc failed with exit status {compile_status}")
lib = ctypes.CDLL("./mylib.so")
_reduce = lib.reduce
c_double_p = ctypes.POINTER(ctypes.c_double)
# C signature: void reduce(double *result, double *x, int n)
_reduce.argtypes = [c_double_p, c_double_p, ctypes.c_int]
_reduce.restype = None  # default would be c_int; the function returns void
def c_avx_max_reduce_axis_1(x):
    """Row-wise maximum (axis=1) of a square float64 matrix via the C/AVX kernel.

    The C kernel iterates both dimensions with the single size argument and
    steps by 4, so it only works for square matrices whose side length is a
    multiple of 4. Validate that here instead of silently returning garbage,
    and force a contiguous float64 buffer before handing raw pointers to C.
    """
    n_rows, n_cols = x.shape
    if n_rows != n_cols or n_rows % 4 != 0:
        raise ValueError(
            "c_avx_max_reduce_axis_1 requires a square matrix whose side is a multiple of 4"
        )
    # reshape(-1) on a contiguous array is a zero-copy view.
    flat = np.ascontiguousarray(x, dtype=np.float64).reshape(-1)
    result = np.empty(n_rows)  # kernel writes every element; no need to zero
    _reduce(np.ctypeslib.as_ctypes(result), np.ctypeslib.as_ctypes(flat), n_rows)
    return result
@numba.njit("f8[:](f8[:, :])")
def numba_max_reduce_axis_1(x):
    """Row-wise maximum (axis=1) of a 2-D float64 array, any memory layout.

    Eagerly compiled for an arbitrary-strided input via the signature string.
    """
    n_rows, n_cols = x.shape
    out = np.full(n_rows, -np.inf, dtype=x.dtype)
    for row in range(n_rows):
        best = out[row]
        for col in range(n_cols):
            value = x[row, col]
            if value > best:
                best = value
        out[row] = best
    return out
@numba.njit("f8[:](f8[:, ::1])")
def numba_c_order_max_reduce_axis_1(x):
    """Row-wise maximum (axis=1), specialized for C-contiguous float64 input.

    Identical logic to the generic variant; the `::1` in the signature tells
    Numba the last axis is contiguous, enabling better code generation.
    """
    n_rows, n_cols = x.shape
    out = np.full(n_rows, -np.inf, dtype=x.dtype)
    for row in range(n_rows):
        best = out[row]
        for col in range(n_cols):
            value = x[row, col]
            if value > best:
                best = value
        out[row] = best
    return out
def np_max_reduce_axis_1(x):
    """Reference implementation: NumPy's built-in reduction along axis 1."""
    return np.amax(x, axis=1)
# Benchmark harness: verify all implementations agree, then time each one.
test_data = np.random.normal(size=(1024, 1024))
functions = [
    numba_max_reduce_axis_1,
    numba_c_order_max_reduce_axis_1,
    np_max_reduce_axis_1,
    c_avx_max_reduce_axis_1,
]
# Sanity check: every implementation must match the first one.
outputs = [fn(test_data) for fn in functions]
for out in outputs:
    assert np.allclose(outputs[0], out)
# Time 1000 calls per implementation and report mean +- std in microseconds.
for fn in functions:
    samples = []
    for _ in range(1000):
        start = time.perf_counter()
        fn(test_data)
        samples.append((time.perf_counter() - start) * 1e6)
    print(f"| {fn.__name__:31} | {np.mean(samples):9.3f} +- {np.std(samples):8.3f} us |")