Faster for loop - help pls

I'm looking to speed up the following function:

import numpy as np

def get_groups(values: list, threshold: int) -> np.ndarray:
    x: int = 0
    count: int = 1
    group = []
    for i in values:
        x += i
        group.append(count)
        if x >= threshold:
            count += 1
            x = 0
    return np.array(group)

values = list(np.random.randint(0, 100, 10000))
get_groups(values, 1000)

Using the @njit() decorator gives me a deprecation warning, and the function is even slower than in pure Python. Any pointers on how I can speed this up would be really appreciated.

Hi @steff, if you change values to be an array instead of a list, i.e. values = np.random.randint(0, 100, 10000), you can get a 100x improvement.
As a general rule, with Numba it’s almost always faster to use arrays instead of lists.

Hope it helps!

Hi @steff,

As @luk-f-a suggests, passing arrays instead of lists really helps. When data is passed into a Numba function it has to be “unboxed” into a native form; the cost of this depends on the data type, and it is quite expensive for lists but cheap for arrays. It also looks like group is appended to on every iteration of the loop over values, so its size is known before the loop runs; this can be exploited and the use of a list avoided entirely. Here are a few variants to get you started:

from numba import njit
import numpy as np
import time

iterations = 50

# --- pure Numpy/Python
def get_groups(values, threshold):
    x = 0
    count = 1
    group = []
    for i in values:
        x += i
        group.append(count)
        if x >= threshold:
            count += 1
            x = 0
    return np.array(group)

np.random.seed(0)
values = list(np.random.randint(0, 100, 10000))
r0 = get_groups(values, 1000)

ts = time.time()
for x in range(iterations):
    get_groups(values, 1000)
e0 = e = time.time() - ts  # e0 is the pure-Python baseline used for all speed-up ratios
print("Elapsed:", e, "speed up:", e0 / e)

# --- original

@njit
def get_groups0(values, threshold):
    x = 0
    count = 1
    group = []
    for i in values:
        x += i
        group.append(count)
        if x >= threshold:
            count += 1
            x = 0
    return np.array(group)

np.random.seed(0)
values = list(np.random.randint(0, 100, 10000))
r = get_groups0(values, 1000)
np.testing.assert_allclose(r, r0) # check result


ts = time.time()
for x in range(iterations):
    get_groups0(values, 1000)
e = time.time() - ts
print("Elapsed:", e, "speed up:", e0 / e)

# --- array input instead of list

@njit
def get_groups1(values, threshold):
    x = 0
    count = 1
    group = []
    for i in values:
        x += i
        group.append(count)
        if x >= threshold:
            count += 1
            x = 0
    return np.array(group)

np.random.seed(0)
values = np.random.randint(0, 100, 10000)
r = get_groups1(values, 1000)
np.testing.assert_allclose(r, r0) # check result

ts = time.time()
for x in range(iterations):
    get_groups1(values, 1000)
e = time.time() - ts
print("Elapsed:", e, "speed up:", e0 / e)

# --- array buffer instead of list

@njit
def get_groups2(values, threshold):
    x = 0
    count = 1
    group = np.empty_like(values)
    for ix, i in enumerate(values):
        x += i
        group[ix] = count
        if x >= threshold:
            count += 1
            x = 0
    return group

np.random.seed(0)
values = np.random.randint(0, 100, 10000)
r = get_groups2(values, 1000)
np.testing.assert_allclose(r, r0) # check result

ts = time.time()
for x in range(iterations):
    get_groups2(values, 1000)
e = time.time() - ts
print("Elapsed:", e, "speed up:", e0 / e)

# --- simplify loop

@njit
def get_groups3(values, threshold):
    x = 0
    count = 1
    group = np.empty_like(values)
    ix = 0
    while ix < len(values):
        x += values[ix]
        group[ix] = count
        if x >= threshold:
            count += 1
            x = 0
        ix += 1
    return group

np.random.seed(0)
values = np.random.randint(0, 100, 10000)
r = get_groups3(values, 1000)
np.testing.assert_allclose(r, r0) # check result

ts = time.time()
for x in range(iterations):
    get_groups3(values, 1000)
e = time.time() - ts
print("Elapsed:", e, "speed up:", e0 / e)

gives me:

Elapsed: 0.2797975540161133 speed up: 1.0
Elapsed: 1.9415757656097412 speed up: 0.14410849114005314
Elapsed: 0.02059769630432129 speed up: 13.583924623522739
Elapsed: 0.001268148422241211 speed up: 220.63470577176162
Elapsed: 0.0008413791656494141 speed up: 332.54633040521395

The last version is a load faster than the original. Hope this helps?
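
As an aside, the deprecation warning you saw is Numba telling you that plain Python lists (“reflected lists”) passed into njit functions are deprecated in favour of numba.typed.List. Here is a minimal sketch of that route, assuming a reasonably recent Numba version; as the numbers above show, arrays are the faster option anyway:

from numba import njit
from numba.typed import List
import numpy as np

@njit
def total(vals):
    # iterating a typed list compiles cleanly, with no reflected-list warning
    s = 0
    for v in vals:
        s += v
    return s

typed_vals = List()
for v in np.random.randint(0, 100, 10000):
    typed_vals.append(v)

total(typed_vals)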

Thanks a lot Stuart. Much appreciated. Pure Numba magic!

It did help indeed. Thank you for the valuable hint!

@steff No problem, glad it worked ok!

Hi guys, I have another question if you don’t mind, and it’s another for loop I would like to make faaaaster. This is what I have come up with, but it seems np.pad is not supported by Numba (yet):

# @njit(cache=True)
def clean_ticks_np(values: np.ndarray, window: int = 3, granularity: float = 0.02) -> np.ndarray:
    keep = np.empty_like(values)
    # left-pad with the edge value so the first samples have a full window behind them
    values = np.pad(values, pad_width=(window, 0), mode='edge')
    x = window
    ix = 0
    while x < len(values):
        w = values[x - window:x]
        keep[ix] = np.abs(values[x] - np.mean(w)) < 3 * np.std(w, ddof=1) + granularity
        x += 1
        ix += 1
    return keep

to create data:

from math import sqrt
from scipy.stats import norm
import numpy as np

def brownian(x0, n, dt, delta, out=None, normalise=True):
    x0 = np.asarray(x0)
    r = norm.rvs(size=x0.shape + (n,), scale=delta * sqrt(dt))
    if out is None:
        out = np.empty(r.shape)
    np.cumsum(r, axis=-1, out=out)
    out += np.expand_dims(x0, axis=-1)
    if normalise:
        out = (out - out.min()) / (out.max() - out.min())
    return out

n = 1000000
values = brownian(0.5, n, 10, 1000) * 10

Is there some other way to speed this up?
Or do I need to code a pure python replacement for np.pad?

np.concatenate((np.repeat(values[0], window), values)) instead of np.pad does the trick. Sorry to have bothered you.
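
For anyone finding this later, here is a minimal njit-compatible sketch of the whole function built on that workaround. np.full stands in for np.repeat on a scalar, and the sample standard deviation (ddof=1) is computed by hand in case np.std's ddof argument isn't available in nopython mode; treat it as a starting point rather than a verified drop-in replacement:

from numba import njit
import numpy as np

@njit(cache=True)
def clean_ticks_nb(values, window=3, granularity=0.02):
    keep = np.empty(len(values), dtype=np.bool_)
    # left-pad with the first value, mimicking np.pad(..., mode='edge')
    padded = np.concatenate((np.full(window, values[0]), values))
    for ix in range(len(values)):
        w = padded[ix:ix + window]
        m = np.mean(w)
        # sample standard deviation (ddof=1), computed by hand
        s = np.sqrt(np.sum((w - m) ** 2) / (window - 1))
        keep[ix] = np.abs(padded[ix + window] - m) < 3.0 * s + granularity
    return keep

keep = clean_ticks_nb(values)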