Disclaimer: Resolving this example is of very limited practical use to me, since I can simply pick the faster of the two versions. However, I thought it might be helpful for numba core development: njit is supposed to make things faster, so a case where it makes things slower seemed worth reporting. If not, feel free to delete.
Here are the results. Notice that the njitted version (get_d_2) is faster than the plain-Python version (get_d_1), but its parent function (partials_2) is significantly slower than the corresponding parent (partials_1).
$ time python3 partials.py
using numpy
0.005436 0.758 partials_2
0.001049 0.146 -|- get_d_2
0.000590 0.082 -|--|- intk1d3o
0.000154 0.022 -|--|- intk1d1o
0.000286 0.040 -|--|- intk1d2o
0.001445 0.201 partials_1
0.001322 0.184 -|- get_d_1
0.000590 0.082 -|--|- intk1d3o
0.000154 0.022 -|--|- intk1d1o
0.000286 0.040 -|--|- intk1d2o
real 0m6.967s
user 0m6.895s
sys 0m0.227s
Here’s all of the relevant code.
#partials.py
from cupy_or_numpy import xp
from decorators import get_jit_decorator, get_decorator, reset_globals, profile_results
from intk import intk1d1o, intk1d2o, intk1d3o
@get_decorator()
def get_d_1(K, x, dx, xt):
    """Accumulate the kernel terms for a kernel set packed in one sequence.

    ``K[0]`` seeds the total; ``K[1]``, ``K[2]`` and ``K[3]`` are handed to
    ``intk1d1o``, ``intk1d2o`` and ``intk1d3o`` respectively. The second- and
    third-order kernels are evaluated once per argument slot, with ``dx`` in
    that slot and ``x`` in the remaining ones. Kernels of size 0 are skipped.

    Returns the accumulated total, or 1 when that total equals 0.
    """
    total = K[0]

    first_order = K[1]
    if first_order.size > 0:
        total += intk1d1o(first_order, dx, xt)

    second_order = K[2]
    if second_order.size > 0:
        total += intk1d2o(second_order, x, dx, xt)
        total += intk1d2o(second_order, dx, x, xt)

    third_order = K[3]
    if third_order.size > 0:
        total += intk1d3o(third_order, dx, x, x, xt)
        total += intk1d3o(third_order, x, dx, x, xt)
        total += intk1d3o(third_order, x, x, dx, xt)

    # A zero total is replaced by 1.
    if total != 0:
        return total
    return 1
@get_jit_decorator()
def get_d_2(K0, K1, K2, K3, x, dx, xt):
    """Accumulate the kernel terms, taking each kernel as its own argument.

    ``K0`` seeds the total; ``K1``, ``K2`` and ``K3`` are handed to
    ``intk1d1o``, ``intk1d2o`` and ``intk1d3o`` respectively. The second- and
    third-order kernels are evaluated once per argument slot, with ``dx`` in
    that slot and ``x`` in the remaining ones. Kernels of size 0 are skipped.

    Returns the accumulated total, or 1 when that total equals 0.
    """
    acc = K0

    if K1.size > 0:
        acc += intk1d1o(K1, dx, xt)

    if K2.size > 0:
        acc += intk1d2o(K2, x, dx, xt)
        acc += intk1d2o(K2, dx, x, xt)

    if K3.size > 0:
        acc += intk1d3o(K3, dx, x, x, xt)
        acc += intk1d3o(K3, x, dx, x, xt)
        acc += intk1d3o(K3, x, x, dx, xt)

    # A zero total is replaced by 1.
    if acc != 0:
        return acc
    return 1
@get_decorator()
def partials_1(x, dx, K, st):
    """Call ``get_d_1`` at the index derived from the last entry of ``st``.

    The index is ``int(st[-1] + 1)``. The value computed by ``get_d_1`` is
    discarded and nothing is returned.
    """
    last_index = int(st[-1] + 1)
    get_d_1(K, x, dx, last_index)
@get_decorator()
def partials_2(x, dx, K, st):
    """Call ``get_d_2`` at the index derived from the last entry of ``st``.

    The index is ``int(st[-1] + 1)``. The first four entries of ``K`` are
    passed to ``get_d_2`` as separate arguments. The value computed by
    ``get_d_2`` is discarded and nothing is returned.
    """
    last_index = int(st[-1] + 1)
    get_d_2(K[0], K[1], K[2], K[3], x, dx, last_index)
@get_decorator()
def main(n):
    """Run ``partials_1`` and ``partials_2`` on growing prefixes of ``st``.

    Builds a 400-sample uniform random signal, its first differences (with a
    leading zero), a kernel list ``K`` (the scalar 1 followed by random 1-D,
    2-D and 3-D kernels of side 10) and ``n`` sample values ``st``. Both
    partials functions are then called on ``st[:1]``, ``st[:2]``, ...,
    ``st[:n]``.
    """
    signal = xp.random.uniform(-1, 1, 400)
    deltas = xp.zeros(signal.size)
    deltas[1:] = xp.diff(signal)

    side = 10
    # One random kernel per order 1..3, drawn in increasing order.
    K = [1, *(xp.random.uniform(-1, 1, (side,) * order) for order in (1, 2, 3))]

    st = xp.arange(n) * 15 + xp.random.uniform(0, 1, n) * 10

    for stop in range(1, n + 1):
        partials_1(signal, deltas, K, st[:stop])
        partials_2(signal, deltas, K, st[:stop])
if __name__ == '__main__':
    # NOTE(review): the timings of this first run are thrown away by the
    # reset_globals() call below; presumably it is a warm-up so that numba
    # compilation is excluded from the reported profile -- confirm.
    main(10)
    reset_globals()
    # Only the calls made from here on end up in the printed profile.
    main(20)
    profile_results()
####################################
####################################
#intk.py
from decorators import get_jit_decorator
@get_jit_decorator()
def intk1d1o(k, x, tl):
    """Return the sum over ``i`` of ``k[i] * x[tl - i]``.

    ``i`` runs over ``range(k.size)``, so ``x`` is read backwards starting
    at index ``tl``.
    """
    total = 0
    for lag in range(k.size):
        weight = k[lag]
        total += weight * x[tl - lag]
    return total
@get_jit_decorator()
def intk1d2o(k, x1, x2, tl):
    """Return the sum over ``i, j`` of ``k[i, j] * x1[tl - i] * x2[tl - j]``.

    ``i`` and ``j`` run over the first and second axis of ``k``; both inputs
    are read backwards starting at index ``tl``.
    """
    total = 0
    for lag1 in range(k.shape[0]):
        for lag2 in range(k.shape[1]):
            # Same left-to-right product order as a single chained expression.
            partial = k[lag1, lag2] * x1[tl - lag1]
            total += partial * x2[tl - lag2]
    return total
@get_jit_decorator()
def intk1d3o(k, x1, x2, x3, tl):
    """Return the sum of ``k[i, j, l] * x1[tl - i] * x2[tl - j] * x3[tl - l]``.

    ``i``, ``j`` and ``l`` run over the three axes of ``k``; all inputs are
    read backwards starting at index ``tl``.
    """
    total = 0
    for lag1 in range(k.shape[0]):
        for lag2 in range(k.shape[1]):
            for lag3 in range(k.shape[2]):
                # Same left-to-right product order as a single chained
                # expression.
                partial = k[lag1, lag2, lag3] * x1[tl - lag1]
                partial = partial * x2[tl - lag2]
                total += partial * x3[tl - lag3]
    return total
####################################
####################################
#decorators.py
import functools
import time

from numba import njit, jit, objmode

from cupy_or_numpy import xp
# When true, get_jit_decorator() / get_decorator() hand out the timing
# decorators (jit_timer / timer); otherwise they return njit / timer_none.
USE_TIMER = True
# Function name -> list of measured run times, one entry appended per call
# that reaches wrapper_objm_end.
results = {}
# Call-tree bookkeeping: 'stack' lists the names of the calls currently in
# progress (rooted at 'main'); every other key maps a function name to the
# set of function names recorded as its callees.
tree = {'stack':['main'], 'main':set()}
def wrapper_objm_start(f):
    """Record entry into ``f`` in the call tree and start its timer.

    Registers ``f.__name__`` as a callee of the name on top of
    ``tree['stack']``, pushes it onto that stack and makes sure it has a
    callee set of its own.

    Returns an opaque timestamp that must be passed back to
    ``wrapper_objm_end`` as ``start``; it is not a wall-clock time.
    """
    # perf_counter() is monotonic and high resolution; time.time() can jump
    # when the system clock is adjusted, which corrupts short intervals.
    start = time.perf_counter()
    name = f.__name__
    parent = tree['stack'][-1]
    # Never register a function as its own callee: the root entry is called
    # 'main', so a timed function of that name would otherwise end up inside
    # tree['main'] and print_tree() would recurse on it without end.
    if parent != name:
        tree[parent].add(name)
    tree['stack'].append(name)
    # Create the callee set once and keep it afterwards, so callees recorded
    # before the first call has finished are not discarded.
    tree.setdefault(name, set())
    return start


def wrapper_objm_end(f, start):
    """Record the run time of a call to ``f`` and leave it in the call tree.

    ``start`` is the value returned by the matching ``wrapper_objm_start``.
    The elapsed time is appended to ``results[f.__name__]`` and the top entry
    of ``tree['stack']`` is removed.
    """
    run_time = time.perf_counter() - start
    results.setdefault(f.__name__, []).append(run_time)
    tree['stack'].pop()
def timer(f):
    """Wrap ``f`` so that every call is timed and recorded in the call tree.

    The returned wrapper brackets the call with ``wrapper_objm_start`` and
    ``wrapper_objm_end``, forwards positional and keyword arguments to ``f``
    and returns whatever ``f`` returns. Exceptions raised by ``f`` propagate
    unchanged.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        start = wrapper_objm_start(f)
        try:
            # Keyword arguments are forwarded along with positional ones.
            return f(*args, **kwargs)
        finally:
            # Always close the measurement so the call stack stays balanced
            # even when f raises.
            wrapper_objm_end(f, start)
    return wrapper
def timer_none(f):
    """Return a pass-through wrapper around ``f`` that records nothing.

    The wrapper forwards positional and keyword arguments to ``f`` and
    returns its result; ``f``'s name and docstring are carried over.
    """
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        return f(*args, **kwargs)
    return wrapper
def jit_timer(f):
    """Return an njit-compiled wrapper around ``f`` that records its timing.

    ``f`` is compiled with ``njit``; the returned wrapper is njit-compiled as
    well and brackets the compiled call with two ``objmode`` blocks that run
    ``wrapper_objm_start`` and ``wrapper_objm_end``. The wrapper accepts
    positional arguments only and returns the result of the compiled ``f``.
    """
    jf = njit(f)
    # NOTE(review): the two objmode blocks execute Python code around every
    # call, so the recorded times presumably include that mode-switching
    # overhead -- confirm before comparing jitted and non-jitted timings.
    @njit(cache=False)
    def wrapper(*args):
        # Object-mode block: register the call and fetch the start timestamp,
        # which is handed back to compiled code typed as float64.
        with objmode(start='float64'):
            start = wrapper_objm_start(f)
        g = jf(*args)
        # g = f(*args)
        # Object-mode block: store the elapsed time for this call.
        with objmode():
            wrapper_objm_end(f, start)
        return g
    return wrapper
def get_jit_decorator():
    """Pick the decorator for functions that should be numba-compiled.

    Returns ``jit_timer`` when ``USE_TIMER`` is truthy, plain ``njit``
    otherwise.
    """
    return jit_timer if USE_TIMER else njit
def get_decorator():
    """Pick the decorator for plain-Python functions.

    Returns ``timer`` when ``USE_TIMER`` is truthy, the pass-through
    ``timer_none`` otherwise.
    """
    return timer if USE_TIMER else timer_none
def reset_globals():
    """Discard all recorded timings and restart the call tree at 'main'.

    Rebinds the module-level ``results`` and ``tree`` to fresh objects.
    """
    global results, tree
    results, tree = {}, {'stack': ['main'], 'main': set()}
def print_tree(node, layer):
    """Print total and relative run time for each name in ``node``, recursively.

    node: iterable of function names that have entries in ``results`` and
        ``tree``.
    layer: nesting depth; rendered as ``layer`` repetitions of ``'-|-'``.

    Each line shows the summed run time of the function, its ratio to the
    summed run time of ``'main'``, the depth marker and the function name,
    followed by the lines of its recorded callees one level deeper.
    """
    # The callee collections are sets, whose iteration order for strings can
    # differ from run to run; sorting keeps the report order reproducible.
    for name in sorted(node):
        total = xp.sum(results[name])
        share = total / xp.sum(results['main'])
        print('{0:>9.6f} {1:.03f}'.format(total, share), '-|-' * layer, name)
        print_tree(tree[name], layer + 1)
def profile_results():
    """Print the recorded call tree with per-function run times.

    Starts at the callees recorded for ``'main'`` and delegates the printing
    to ``print_tree`` at depth 0.
    """
    # The flat, sorted per-function summary that used to be built here was
    # never printed or returned, so only the tree report remains.
    print_tree(tree['main'], 0)
####################################
####################################
#cupy_or_numpy.py
# Expose the array module as ``xp``: cupy when it can be imported, numpy
# otherwise. Only ImportError triggers the fallback, so an interrupt or any
# unrelated failure during import is not silently swallowed.
try:
    import cupy as xp
    print("using cupy")
except ImportError:
    import numpy as xp
    print("using numpy")