As mentioned in Trying to use a set with string values causes AssertionError(), I’m running into a performance issue with typed dicts and lists. Here is some code that illustrates the issue:
import random
import string
import pandas as pd
import numba
from time import time
def seen_name(names):
seen = {}
for i in range(len(names)):
if names[i] not in seen: seen[names[i]] = True
ntypes = numba.core.types
ndict = numba.typed.Dict
@numba.njit
def seen_name_numba(names):
seen = ndict.empty(key_type=ntypes.unicode_type,
value_type=ntypes.boolean)
for i in range(len(names)):
if names[i] not in seen: seen[names[i]] = True
allnames = []
# Generate a long list of random 10-letter strings
for i in range(1000000):
allnames.append(''.join(
random.choices(string.ascii_letters, k=10)))
# The real-world scenario stores the data in a dataframe
df = pd.DataFrame(allnames, columns=['name'])
start = time()
seen_name(df.name.tolist());
print(f'\nseen_name => {time() - start:.4} seconds\n')
start = time()
seen_name_numba(df.name.tolist());
print(f'\nseen_name_numba tolist => {time() - start:.4} seconds')
start = time()
seen_name_numba(numba.typed.List(df.name))
print(f'\nseen_name_numba typed.List => {time() - start:.4} seconds')
Here are the results from a sample run:
seen_name => 0.2247 seconds
/tmp/test/lib64/python3.8/site-packages/numba/core/ir_utils.py:2031: NumbaPendingDeprecationWarning:
Encountered the use of a type that is scheduled for deprecation: type 'reflected list' found for argument 'names' of function 'seen_name_numba'.
For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-reflection-for-list-and-set-types
File "numba_dict.py", line 23:
@numba.njit
def seen_name_numba(names):
^
warnings.warn(NumbaPendingDeprecationWarning(msg, loc=loc))
seen_name_numba tolist => 2.71 seconds
seen_name_numba typed.List => 3.321 seconds
The non-numba version was the fastest, at 0.22 seconds. Using the deprecated tolist() approach took 2.71 seconds, and using typed.List took 3.32 seconds.