These two single-threaded functions have unexpected performance: f1 is faster than f0. 1.6x faster on x86-64 CPU, 2.4x faster on aarch64 CPU.
mp.omp_set_num_threads(1)
@nb.njit(fastmath=True)
def f0(img, w, gMin=16):
peaks = np.zeros((img.size,), dtype='i2')
for i in range(2*w+2, img.size-2*w-2):
gh1 = img[i+1]-img[i] + img[i+1+w]-img[i+w]
gv1 = img[i+w]-img[i] + img[i+1+w]-img[i+1]
if abs(gh1) > abs(gv1):
gh0 = img[i]-img[i-1] + img[i+w]-img[i-1+w]
gh2 = img[i+2]-img[i+1] + img[i+2+w]-img[i+1+w]
if (gMin <= gh1 and gh0 < gh1 and gh1 >= gh2) or (-gMin >= gh1 and gh0 > gh1 and gh1 <= gh2):
peaks[i] = gh1 + max(0, gh0) + max(0, gh2)
else:
gv0 = img[i]-img[i-w] + img[i+1]-img[i+1-w]
gv2 = img[i+2*w]-img[i+w] + img[i+1+2*w]-img[i+1+w]
if (gMin <= gv1 and gv0 < gv1 and gv1 >= gv2) or (-gMin >= gv1 and gv0 > gv1 and gv1 <= gv2):
peaks[i] = -(gh1 + max(0, gh0) + max(0, gh2))
return peaks
@nb.njit(fastmath=True)
def f1(img, w, gMin=16):
peaks = np.zeros((img.size,), dtype='i2')
with openmp("parallel for"):
for i in range(2*w+2, img.size-2*w-2):
gh1 = img[i+1]-img[i] + img[i+1+w]-img[i+w]
gv1 = img[i+w]-img[i] + img[i+1+w]-img[i+1]
if abs(gh1) > abs(gv1):
gh0 = img[i]-img[i-1] + img[i+w]-img[i-1+w]
gh2 = img[i+2]-img[i+1] + img[i+2+w]-img[i+1+w]
if (gMin <= gh1 and gh0 < gh1 and gh1 >= gh2) or (-gMin >= gh1 and gh0 > gh1 and gh1 <= gh2):
peaks[i] = gh1 + max(0, gh0) + max(0, gh2)
else:
gv0 = img[i]-img[i-w] + img[i+1]-img[i+1-w]
gv2 = img[i+2*w]-img[i+w] + img[i+1+2*w]-img[i+1+w]
if (gMin <= gv1 and gv0 < gv1 and gv1 >= gv2) or (-gMin >= gv1 and gv0 > gv1 and gv1 <= gv2):
peaks[i] = -(gh1 + max(0, gh0) + max(0, gh2))
return peaks
Full code on python-benchmarks/so-6.py at main · pauljurczak/python-benchmarks · GitHub . Benchmark is run with taskset -c 5 python ./so-6.py.
f1 uses more SIMD registers on x86-64 CPU than f0: 12 x XMM and 10 x YMM vs. 6 x XMM and 6 x YMM. Surprisingly, no difference on aarch64.
Surprisingly, with openmp("parallel for") directive works inside @nb.njit function.