Hey @lesshaste ,
If disk space and compilation time are not critical, you can compile the same core algorithm twice, once with parallel=False and once with parallel=True.
You can use a dispatcher to choose between them at runtime. This lets you switch modes within the same Python session without changing thread settings.
Here is an example:
import numpy as np
from numba import njit, prange
NBCONFIG = {'fastmath': True, 'cache': False}
@njit(**NBCONFIG)
def lower_tri_matvec(L, x):
    """Baseline sequential lower-triangular matrix-vector product.

    Computes y[i] = sum_{j <= i} L[i, j] * x[j] for every row i; used as
    the reference timing against the dispatcher versions below.
    """
    n = L.shape[0]
    out = np.zeros(n, dtype=L.dtype)
    for row in range(n):
        acc = 0.0
        for col in range(row + 1):
            acc += L[row, col] * x[col]
        out[row] = acc
    return out
@njit(**NBCONFIG, inline='always')
def lower_tri_matvec_core(L, x):
    """Core algorithm, inlined into both wrappers below.

    `prange` only actually parallelizes when the *enclosing* jitted
    function was compiled with parallel=True; under parallel=False it
    behaves like a plain `range`.
    """
    n = L.shape[0]
    # Every y[i] is assigned in the loop, so skip np.zeros' zero-fill.
    y = np.empty(n, dtype=L.dtype)
    for i in prange(n):
        # Scalar accumulator; numba casts it to L.dtype on the store into
        # y[i] (float64 accumulation is slightly more accurate for
        # float32 inputs).
        s = 0.0
        for j in range(i + 1):
            s += L[i, j] * x[j]
        y[i] = s
    return y
@njit(**NBCONFIG, parallel=False)
def lower_tri_matvec_seq(L, x):
    """Sequential build: the inlined core's prange degrades to range."""
    result = lower_tri_matvec_core(L, x)
    return result
@njit(**NBCONFIG, parallel=True)
def lower_tri_matvec_par(L, x):
    """Parallel build: the inlined core's prange runs multithreaded."""
    result = lower_tri_matvec_core(L, x)
    return result
@njit(**NBCONFIG)
def lower_tri_matvec_dispatch(L, x, parallel=False):
    """Select the parallel or sequential compilation at runtime."""
    if parallel:
        return lower_tri_matvec_par(L, x)
    return lower_tri_matvec_seq(L, x)
# Benchmark setup: single-precision lower-triangular matrix and vector.
N = 2000
L = np.tril(np.random.rand(N, N).astype(np.float32))
x = np.random.rand(N).astype(np.float32)
# warm-up
# (first calls trigger JIT compilation; compile both dispatch paths up front
# so the timings below measure execution only)
lower_tri_matvec(L, x)
lower_tri_matvec_dispatch(L, x)
lower_tri_matvec_dispatch(L, x, parallel=True)
# IPython magics: time the baseline and both dispatch modes.
%timeit -n 300 -r 100 lower_tri_matvec(L, x)
%timeit -n 300 -r 100 lower_tri_matvec_dispatch(L, x, parallel=False)
%timeit -n 300 -r 100 lower_tri_matvec_dispatch(L, x, parallel=True)
# Output of three repetitions of the three %timeit lines above
# (baseline, dispatch-sequential, dispatch-parallel):
# 811 μs ± 80 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 793 μs ± 35.9 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 670 μs ± 29.8 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 794 μs ± 40 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 788 μs ± 22.3 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 673 μs ± 39.4 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 788 μs ± 24.6 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 794 μs ± 38 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)
# 705 μs ± 112 μs per loop (mean ± std. dev. of 100 runs, 300 loops each)