Hi all, I’m trying to use numba to speed up random array generation. I’m generating a random dataset for labels, and I need an array with mostly NaN values, with a specific percentage of entries randomly set to 0 or 1. Here are my implementations in numpy and numba:
from numba import jit
import numpy as np
def numba_random_label_gen(
    seed: int, size: int, num_labels_per_batch: int
) -> tuple[np.ndarray, np.ndarray]:
    """Build a mostly-NaN float64 label array with a random labeled subset.

    Args:
        seed: Seed for the legacy global ``np.random`` state.
        size: Total length of the output array.
        num_labels_per_batch: Number of positions to label with 0/1.

    Returns:
        Tuple of (label_data, label_indexes): the full array (NaN except at
        the chosen indexes) and the chosen indexes themselves.
    """
    # NOTE(review): despite the `jit` import, no @jit decorator is applied
    # here — as posted this runs as plain Python. Confirm intent.
    np.random.seed(seed)
    # Start with every position unlabeled (NaN).
    label_data = np.full(size, np.nan, dtype=np.float64)
    # Draw distinct random positions to receive labels.
    label_indexes = np.random.choice(size, num_labels_per_batch, replace=False)
    # BUG FIX: randint's upper bound is exclusive, so randint(0, 1, n)
    # always yields 0. Use (0, 2) to actually draw from {0, 1}.
    label_data[label_indexes] = np.random.randint(0, 2, num_labels_per_batch)
    return label_data, label_indexes
def np_random_label_gen(
    rng: np.random.Generator, size: int, num_labels_per_batch: int
) -> tuple[np.ndarray, np.ndarray]:
    """Build a mostly-NaN float64 label array with a random labeled subset.

    Args:
        rng: A ``np.random.Generator`` instance (e.g. from ``default_rng``).
        size: Total length of the output array.
        num_labels_per_batch: Number of positions to label with 0/1.

    Returns:
        Tuple of (label_data, label_indexes): the full array (NaN except at
        the chosen indexes) and the chosen indexes themselves.
    """
    # Start with every position unlabeled (NaN).
    label_data = np.full(size, np.nan, dtype=np.float64)
    # Draw distinct random positions to receive labels.
    label_indexes = rng.choice(size, num_labels_per_batch, replace=False)
    # integers(2) draws uniformly from {0, 1} (upper bound exclusive).
    label_data[label_indexes] = rng.integers(2, size=num_labels_per_batch)
    return label_data, label_indexes
When running the code for large inputs, and also when measuring with timeit
(after calling the numba function once first so it is compiled), my numba implementation is 5x slower than the pure numpy one.
import timeit
numba_labels = numba_random_label_gen(42, size=10000, num_labels_per_batch=100)
timeit.timeit("numba_random_label_gen(42, size=10000, num_labels_per_batch=100)", number=100000)
#11.433971279999241 sec
np_setup = """
import numpy as np
rng = np.random.default_rng(seed=42)
"""
timeit.timeit("np_random_label_gen(rng, size=10000, num_labels_per_batch=100)", number=100000, setup=np_setup)
# 2.6677552140317857 sec
Any ideas on how I could speed up the numba implementation? Could the difference be due to using the legacy np.random API rather than np.random.Generator? My versions are:
>>> np.__version__
'1.26.3'
>>> numba.__version__
'0.59.0'