Numba performance doesn't scale as well as NumPy in vectorized max function

I was wondering whether declaring the argument as a contiguous array would help (it helps a bit) and whether a C implementation using AVX intrinsics would be faster (it is, roughly twice as fast as NumPy). The AVX version might serve as a stronger baseline for Numba to target, since it runs approximately at the memory bandwidth limit; see the back-of-envelope estimate below the table. There is little reason why such a simple function should be slower than memcpy, since hardly any computation is being done here.

| Implementation | Time (µs) |
| --- | --- |
| Numba | 1014.748 ± 160.301 |
| Numba, contiguous | 792.670 ± 47.724 |
| NumPy | 429.551 ± 42.410 |
| C with AVX | 235.201 ± 31.141 |
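
As a sanity check on the bandwidth claim: the kernel reads the 8 MiB input exactly once, so the measured time gives a lower bound on the sustained read bandwidth, which lands in the ballpark of typical desktop DRAM bandwidth (the exact figure depends on the machine, of course):

```python
# Back-of-envelope bandwidth estimate from the C/AVX timing above.
n_bytes = 1024 * 1024 * 8   # 1024 x 1024 float64 input, read once
t_avx = 235.201e-6          # measured mean runtime in seconds
print(f"{n_bytes / t_avx / 1e9:.1f} GB/s")  # ~35.7 GB/s
```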
```python
import os
import time
import ctypes

import numba
import numpy as np

with open("mylib.c", "w") as f:
    f.write("""
#include <immintrin.h>
#include <math.h>

// Note: assumes a square n-by-n array with n a multiple of 4
void reduce(double *result, double *x, int n){
    for (int i = 0; i < n; i += 4){
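        // One 256-bit running-max accumulator per row of this 4-row block.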
        __m256d res[4];

        for (int k = 0; k < 4; k++){
            res[k] = _mm256_set1_pd(-INFINITY);
        }

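        // Sweep the columns, updating the four row accumulators in lockstep.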
        for (int j = 0; j < n; j += 4){
            for (int k = 0; k < 4; k++){
                res[k] = _mm256_max_pd(res[k], _mm256_loadu_pd(x + (i + k) * n + j));
            }
        }

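        // Horizontal max: reduce each 4-lane vector to a scalar
        // (res[k][i] lane access is a GCC extension).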
        for (int k = 0; k < 4; k++){
            result[i + k] = fmax(fmax(res[k][0], res[k][1]), fmax(res[k][2], res[k][3]));
        }
    }
}
""")

# Compile the C kernel into a shared library and load it with ctypes
os.system("gcc -mavx -O3 -fPIC -shared mylib.c -o mylib.so")
lib = ctypes.CDLL("./mylib.so")
_reduce = lib.reduce
c_double_p = ctypes.POINTER(ctypes.c_double)
_reduce.argtypes = [c_double_p, c_double_p, ctypes.c_int]
_reduce.restype = None  # the C function returns void
def c_avx_max_reduce_axis_1(x):
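    # Assumes a square, C-contiguous float64 array whose side is a multiple of 4.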
    result = np.zeros(len(x))
    _reduce(np.ctypeslib.as_ctypes(result), np.ctypeslib.as_ctypes(x.ravel()), len(x))
    return result

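# Eagerly compiled Numba kernel; the f8[:, :] signature accepts arbitrary strides.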
@numba.njit("f8[:](f8[:, :])")
def numba_max_reduce_axis_1(x):
    res = np.full((x.shape[0]), -np.inf, dtype=x.dtype)

    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if res[i] < x[i, j]:
                res[i] = x[i, j]

    return res

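# Identical kernel, but f8[:, ::1] promises Numba a C-contiguous input.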
@numba.njit("f8[:](f8[:, ::1])")
def numba_c_order_max_reduce_axis_1(x):
    res = np.full((x.shape[0]), -np.inf, dtype=x.dtype)

    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if res[i] < x[i, j]:
                res[i] = x[i, j]

    return res

def np_max_reduce_axis_1(x):
    return x.max(axis=1)

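# 1024 x 1024 is square with side a multiple of 4, as the C kernel requires.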
test_data = np.random.normal(size=(1024, 1024))

functions = [
    numba_max_reduce_axis_1,
    numba_c_order_max_reduce_axis_1,
    np_max_reduce_axis_1,
    c_avx_max_reduce_axis_1,
]

results = [max_reduce_axis_1(test_data) for max_reduce_axis_1 in functions]

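# Check that all implementations agree before timing them.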
for res in results:
    assert np.allclose(results[0], res)

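# Time each implementation over 1000 runs and report mean +- std.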
for max_reduce_axis_1 in functions:
    timings = []
    for _ in range(1000):
        t = time.perf_counter()

        max_reduce_axis_1(test_data)

        timings.append((time.perf_counter() - t) * 1e6)

    print(f"| {max_reduce_axis_1.__name__:31} | {np.mean(timings):9.3f} +- {np.std(timings):8.3f} us |")
```
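
Part of the remaining gap may be that LLVM declines to vectorize the floating-point max reduction under strict IEEE semantics (NaN propagation gets in the way), leaving the Numba loop scalar. A hedged sketch of one thing worth trying, using Numba's fastmath option (untested here; it relaxes NaN handling, so use with care):

```python
# Sketch: same kernel, but fastmath=True lets LLVM reorder and vectorize
# the floating-point max reduction. fastmath relaxes IEEE semantics (NaNs
# in particular), so only use it when that is acceptable for your data.
@numba.njit("f8[:](f8[:, ::1])", fastmath=True)
def numba_fastmath_max_reduce_axis_1(x):
    res = np.full((x.shape[0]), -np.inf, dtype=x.dtype)
    for i in range(x.shape[0]):
        for j in range(x.shape[1]):
            if res[i] < x[i, j]:
                res[i] = x[i, j]
    return res
```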
