Numba performance doesn't scale as well as NumPy in vectorized max function

@brandonwillard, I’m thinking about the way loops are represented in Numba in general, and the ufunc loop nests referenced above are quite involved. The control-flow graphs of the generated LLVM IR can be inspected with:

import numpy as np
import numba

@numba.vectorize(["float64(float64, float64)"], identity="reorderable")
def custom_op_fn(x, y):
    # Elementwise maximum of two float64 scalars, compiled as a ufunc.
    # The signature declares identity="reorderable" (see the decorator above).
    return x if x > y else y


@numba.njit(debug=True)
def max_reduce_axis_1(x):
    """Reduce a 2-D array along axis 1 by repeatedly applying the
    vectorized binary max (`custom_op_fn`) column by column.

    Returns a 1-D array of length x.shape[0] holding each row's maximum,
    starting the accumulator at -inf.
    """
    out = np.full((x.shape[0],), -np.inf, dtype=x.dtype)
    xt = np.transpose(x)
    # Each xt[col] is one column of x; fold it into the accumulator in place.
    for col in range(x.shape[1]):
        custom_op_fn(out, xt[col], out)
    return out


# Benchmark input: 5000x5000 draws from the standard normal distribution
# (loc=0.0, scale=1.0 are the np.random.normal defaults, spelled out here).
X = np.random.normal(0.0, 1.0, size=(5000, 5000))
# Run (and thereby JIT-compile) the ufunc-based reduction.
res_1 = max_reduce_axis_1(X)


@numba.njit(debug=True, error_model='numpy')
def max_reduce_axis_2(x):
    """Reduce a 2-D array along axis 1 with an explicit nested loop.

    Same result as `max_reduce_axis_1`: a 1-D array of per-row maxima,
    accumulated in the output array starting from -inf.
    """
    out = np.empty((x.shape[0],), dtype=x.dtype)
    for row in range(x.shape[0]):
        # Accumulate the row maximum directly in the output slot.
        out[row] = -np.inf
        for col in range(x.shape[1]):
            val = x[row, col]
            if out[row] < val:
                out[row] = val
    return out


res_2 = max_reduce_axis_2(X)
# Both hand-written reductions must agree with each other and with NumPy's
# built-in row-wise max.
for reference in (res_2, np.max(X, axis=1)):
    assert np.array_equal(res_1, reference)

def show_llvm_cfg(func):
    """Display the LLVM-IR control-flow graph of *func*'s first compiled
    signature (presumably a Numba dispatcher — it must expose
    `signatures` and `inspect_cfg`)."""
    first_sig = func.signatures[0]
    cfg = func.inspect_cfg(first_sig)
    cfg.pretty_printer(interleave=True, view=True)

# Render the CFG of each compiled reduction, in the same order as defined.
for compiled_fn in (max_reduce_axis_1, max_reduce_axis_2):
    show_llvm_cfg(compiled_fn)
1 Like