Hi — I have a comparison of unrolling a loop via code generation vs. via `literal_unroll`. The second option takes longer to compile (understandable to a certain extent), but it is also much slower to execute (roughly 1500x slower). I would prefer to avoid code generation, but it is proving to be the best option so far.
I was wondering if there is anything I could do here to improve the compilation speed of either option, and/or the execution speed of the `literal_unroll` option. The latter seems to suffer from reference-counting overhead (based on a quick look at the LLVM IR).
import time
import warnings
from typing import Tuple, Callable, NamedTuple, Union
import numpy as np
from numba import njit, literal_unroll, NumbaExperimentalFeatureWarning
@njit
def id_fc(x):
    """Compiled identity: return the argument unchanged."""
    result = x
    return result
class COOFunctions(NamedTuple):
    """COO-style sparse layout of timing callables: ``data[i]`` belongs to
    the matrix entry at ``(rows[i], cols[i])``."""
    # One callable per non-zero entry; maps a time step to an index (or a
    # tuple of indices) into the matching data vector.
    data: Tuple[Callable[[int], Union[int, Tuple[int, ...]]], ...]
    # Row coordinate of each entry, parallel to `data`.
    rows: Tuple[int, ...]
    # Column coordinate of each entry, parallel to `data`.
    cols: Tuple[int, ...]
class COOVecs(NamedTuple):
    """COO-style sparse layout of value vectors: ``data[i]`` is the array
    belonging to the matrix entry at ``(rows[i], cols[i])``."""
    # One array per non-zero entry, parallel to `rows`/`cols`.
    data: Tuple[np.ndarray, ...]
    # Row coordinate of each entry, parallel to `data`.
    rows: Tuple[int, ...]
    # Column coordinate of each entry, parallel to `data`.
    cols: Tuple[int, ...]
def make_looper(coo_fs):
    """Build an njit-compiled ``get_values(out, t, vecs)`` whose loop over
    the (index, callable) pairs is unrolled via numba's `literal_unroll`.

    For each entry i it writes ``vecs.data[i][fc(t)]`` into
    ``out[coo_fs.rows[i], coo_fs.cols[i]]`` and returns ``out``.
    """
    indexed_fs = tuple(enumerate(coo_fs.data))
    rows = coo_fs.rows
    cols = coo_fs.cols

    @njit
    def get_values(out, t, vecs) -> np.ndarray:
        # literal_unroll lets numba specialize each iteration on the
        # heterogeneously-typed tuple element.
        for pair in literal_unroll(indexed_fs):
            i, fc = pair
            out[rows[i], cols[i]] = vecs.data[i][fc(t)]
        return out

    return get_values
@njit
def foo(t):
    """Demo timing function: return the index pair ``(0, 2*t)``."""
    doubled = 2 * t
    return 0, doubled
def make_looper_txt(timing_fs, sparse_pos):
    """Build an njit-compiled ``get_values(out, t, vecs)`` via source-code
    generation: one assignment statement is emitted per (callable, position)
    pair, so the compiled body is a fully unrolled loop.

    Parameters
    ----------
    timing_fs : nested sequence of callables, one inner sequence per row.
    sparse_pos : nested sequence of ``(row, col)`` pairs, parallel to
        ``timing_fs``.

    Returns
    -------
    njit-compiled function writing ``vecs[row][dec_idx][fc(t)]`` into
    ``out[row, col]`` for every registered entry and returning ``out``.
    """
    namespace = {'np': np}

    def add_fc(fc, fc_idx, namespace):
        # Register the callable under a generated name so the generated
        # source can reference it. A plain dict assignment suffices; no
        # exec() is needed for this.
        name = f"fc_{fc_idx}"
        namespace[name] = fc
        return name

    # BUG FIX: fc_idx was initialized once and never incremented, so every
    # callable was registered as fc_0, each overwriting the previous one.
    # The demo masked this because all callables were the same `foo`;
    # distinct callables would all have resolved to the last one added.
    fc_idx = 0
    lines = ["def get_values(out, t, vecs):"]
    for row_time_fc, row_pos in zip(timing_fs, sparse_pos):
        if len(row_time_fc) > 0:
            for dec_idx, (time_fc, pos) in enumerate(zip(row_time_fc, row_pos)):
                time_fc_name = add_fc(time_fc, fc_idx, namespace)
                fc_idx += 1
                in_, out_ = pos
                lines.append(f"    out[{in_}, {out_}] = vecs[{in_}][{dec_idx}][{time_fc_name}(t)]")
    lines.append("    return out")
    exec("\n".join(lines), namespace, namespace)
    return njit(namespace['get_values'])
class Timer:
    """Context manager that prints the wall-clock duration of its block.

    The ``text`` template receives the elapsed seconds via ``str.format``.
    """

    def __init__(self, text="Elapsed time: {:0.10f} seconds"):
        self.text = text
        self._start_time: float = 0

    def __enter__(self):
        # Record the start just before the managed block runs.
        self._start_time = time.perf_counter()
        return self

    def __exit__(self, *exc_info):
        # Report on exit; exceptions are not suppressed (implicit None).
        elapsed = time.perf_counter() - self._start_time
        print(self.text.format(elapsed))
if __name__ == '__main__':
    # --- fixtures for the literal_unroll variant (367 flat entries) ---
    timing_funs = tuple(foo for _ in range(367))
    coo_fs = COOFunctions(timing_funs, tuple(range(367)), tuple(np.repeat(0, 367)))
    vecs_data = tuple(np.zeros((1, 1441)) for _ in range(367))
    vecs = COOVecs(vecs_data, tuple(range(367)), tuple(np.repeat(0, 367)))

    # --- fixtures for the code-generation variant (125 rows x 3 entries) ---
    fs = tuple((foo, foo, foo) for _ in range(125))
    sparse_pos = tuple(((1, 0), (1, 123), (1, 124)) for _ in range(125))
    vecs_2 = np.zeros((125, 125, 1, 10000))

    out = np.zeros((1000, 1000), np.float64)
    # literal_unroll triggers an experimental-feature warning; silence it.
    warnings.filterwarnings("ignore", category=NumbaExperimentalFeatureWarning)

    # Time each variant twice: first call includes JIT compilation,
    # second call measures pure execution.
    with Timer("Text: {:0.10f} seconds"):
        looper_txt = make_looper_txt(fs, sparse_pos)
        res1 = looper_txt(out, 0, vecs_2)
    with Timer("Text without compilation: {:0.10f} seconds"):
        res2 = looper_txt(out, 0, vecs_2)
    with Timer("Literal unroll: {:0.10f} seconds"):
        looper = make_looper(coo_fs)
        res3 = looper(out, 0, vecs)
    with Timer("Literal unroll without compilation: {:0.10f} seconds"):
        res4 = looper(out, 0, vecs)