I can’t seem to produce a small example that exhibits the problem blatantly. So excuse the large amount of code (should be self-contained and run fine anywhere though). You should see a difference in execution time around 25%. With real life data I get:
CPU times: user 5.72 s, sys: 21 µs, total: 5.72 s
Wall time: 5.72 s
CPU times: user 8.46 s, sys: 16 µs, total: 8.46 s
Wall time: 8.47 s
Which is close to 50%.
Code:
import numba as nb
import numpy as np
from numba.extending import intrinsic
from numba.types import ir
@intrinsic
def addmod_tuples(tyctx, tpl1, tpl2, shape):
"""Returns tuple((t1+t2)%s for t1, t2, s in zip(tpl1, tpl2, shape))"""
ret = tpl1
count = tpl1.count
dtype = ret.dtype
typ = ir.types.IntType(dtype.bitwidth)
zero = ir.Constant(typ, 0)
def codegen(cgctx, builder, sig, args):
val_tpl1, val_tpl2, val_shape = args
tup = cgctx.get_constant_undef(ret)
for i in range(count):
# Extract scalars
t1 = builder.extract_value(val_tpl1, i)
t2 = builder.extract_value(val_tpl2, i)
s = builder.extract_value(val_shape, i)
# Modulo
bb_main = builder.block
val_ = builder.srem(builder.add(t1, t2), s)
is_neg = builder.icmp_signed("<", val_, zero, name='')
with builder.if_then(is_neg, likely=False):
pos_val = builder.add(s, val_)
bb_neg = builder.block
# Phi node: val = (t1+t2)%s
val = builder.phi(typ)
val.add_incoming(val_, bb_main)
val.add_incoming(pos_val, bb_neg)
# Assign to tuple
tup = builder.insert_value(tup, val, i)
return tup
sig = ret(tpl1, tpl2, shape)
return sig, codegen
@nb.njit(parallel=True)
def process(arr, stencils, weights):
shape = arr.shape
direction = np.empty(arr.shape, dtype=np.int8)
for index in nb.pndindex(shape):
old_g = -np.inf
cdir = -1 # No gradient, default
g_c = arr[index]
for i, (stencil, w) in enumerate(zip(stencils, weights)):
nindex = addmod_tuples(index, stencil, shape)
g_n = arr[nindex]
new_g = (g_n - g_c) * w
if old_g < new_g:
old_g = new_g
if new_g > 0.0:
# Saves i (as in 'stencils[i]') as main direction
cdir = i
if (new_g < 0.0) and (cdir == -1):
# Local maximum
cdir = -2
direction[index] = cdir
return direction
# Generate main data
from scipy.ndimage import gaussian_filter
shape = (259, 259, 259)
ary = np.zeros(shape, dtype=np.float64)
for _ in range(3):
ary += (np.random.rand(*shape) > 0.95).astype(np.float64)
ary = gaussian_filter(ary, 10.0, mode="wrap")
# Generate stencils
r = (-1, 0, 1)
stencils_refl = [(i, j, k) for i in r for j in r for k in r if (i, j, k) != (0, 0, 0)]
stencils_typed = nb.typed.List()
for s in stencils_refl:
stencils_typed.append(s)
# Generate weights
weights = 1.0 / np.linalg.norm(stencils_refl, axis=1)
# JIT warm-up
nb.set_num_threads(2)
process(ary, stencils_refl, weights)
process(ary, stencils_typed, weights)
# Benchmark
dir1 = %time process(ary, stencils_refl, weights)
dir2 = %time process(ary, stencils_typed, weights)
assert np.all(dir1 == dir2)