Weird parallel prange behaviour

Hi, I’m using numba.prange to speed up some calculations on a dataset. Each row can be processed independently, so the problem is easy to parallelize. However, I’m experiencing some weird performance outcomes.

Here is an example:

# example A
import numba
import numpy as np
import math
import time

num_rows = 10000

def print_time(the_func):
    f_j = numba.njit(the_func)
    # run f to compile it
    f_j(1)
    # Time it
    print("Using parallel=False")
    %timeit f_j(1)
    f_j = numba.njit(the_func, parallel=True)
    # run f to compile it
    f_j(2)
    # Time it
    for nt in [1, 2, 4]:
        print("Using parallel=True and {:.0f} threads".format(nt))
        %timeit f_j(nt)


def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        tmp = np.empty(5)
        for i_row in range(i_row_begin, i_row_end):
            # Do some calculation on a single row
            tmp[0] = i_row
            for index_s in range(out.shape[1]):
                tmp[1] = index_s
                for index_t in range(out.shape[2]):
                    tmp[2] = index_t + index_s
                    tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                    tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                    out[i_row, index_s, index_t] = np.sum(tmp / (1 + np.sum(tmp)))
    return out

print_time(f)

And the output is:

Using parallel=False
1.14 s ± 46.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Using parallel=True and 1 threads
140 ms ± 2.18 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Using parallel=True and 2 threads
88.4 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Using parallel=True and 4 threads
76.7 ms ± 7.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Weirdly enough, just enabling parallel=True makes the function much faster, but the speed barely depends on the prange argument. Even when using prange(1), I can see that about 80% of my CPU (46 cores) is used, meaning many threads (roughly 36) are active.
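
By the way, one way to check what Numba actually parallelized is the parallel diagnostics output of the compiled dispatcher. This is only a sketch of how I would inspect it (it requires the function to be compiled with parallel=True):

# Sketch: print Numba's parallel-transform diagnostics for f
f_par = numba.njit(f, parallel=True)
f_par(1)                              # call it once so it gets compiled
f_par.parallel_diagnostics(level=4)   # level goes from 1 to 4, higher = more detail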

Now let’s make a change and move the inner loops into a separate function that works on a bunch of rows:

# example B
def calc_bunch_rows(i_row_begin, i_row_end, out):
    tmp = np.empty(5)
    for i_row in range(i_row_begin, i_row_end):
        tmp[0] = i_row
        for index_s in range(out.shape[1]):
            tmp[1] = index_s
            for index_t in range(out.shape[2]):
                tmp[2] = index_t + index_s
                tmp[3] = index_t / (1.2 + i_row / (index_s+1))
                tmp[4] = index_t / (1.8 + i_row / (index_s+1))
                out[i_row, index_s, index_t] = np.sum(tmp / (1 + np.sum(tmp)))
calc_bunch_rows = numba.njit(calc_bunch_rows, nogil=True)

def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        calc_bunch_rows(i_row_begin, i_row_end, out)
print_time(f)

The output now is:

Using parallel=False
1.16 s ± 64.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Using parallel=True and 1 threads
1.13 s ± 5.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Using parallel=True and 2 threads
2.29 s ± 521 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Using parallel=True and 4 threads
2.16 s ± 210 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Parallelization is now not applied even when enabled; in fact, it makes the code slower.

Finally, I remembered that in some cases allocating NumPy arrays inside a parallel loop can create performance issues inside pranges (see here and here). So I removed the np.sum calls and replaced them with the C-style equivalent:

# example C
def calc_bunch_rows(i_row_begin, i_row_end, out):
    tmp = np.empty(5)
    tmp2 = np.empty(5)
    for i_row in range(i_row_begin, i_row_end):
        tmp[0] = i_row
        for index_s in range(out.shape[1]):
            tmp[1] = index_s
            for index_t in range(out.shape[2]):
                tmp[2] = index_t + index_s
                tmp[3] = index_t / (1.2 + i_row / (index_s+1))
                tmp[4] = index_t / (1.8 + i_row / (index_s+1))
                tmp_sum = 0
                for tmp_i in range(tmp.shape[0]):
                    tmp_sum == tmp[tmp_i]
                for tmp_i in range(tmp.shape[0]):
                    tmp2[tmp_i] == tmp[tmp_i] / 1 + tmp_sum
                tmp_sum = 0
                for tmp_i in range(tmp.shape[0]):
                    tmp_sum == tmp2[tmp_i]
                out[i_row, index_s, index_t] = tmp_sum
calc_bunch_rows = numba.njit(calc_bunch_rows, nogil=True)

def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        calc_bunch_rows(i_row_begin, i_row_end, out)
print_time(f)

The output is:

Using parallel=False
69.3 ms ± 422 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

Using parallel=True and 1 threads
99.2 ms ± 5.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Using parallel=True and 2 threads
83.5 ms ± 6.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Using parallel=True and 4 threads
78.8 ms ± 13.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Now parallelization always seems to kick in: I can see that many cores are used even when parallel is set to False.

Hi and welcome to the forum.

I definitely cannot answer all your questions (some things seem a bit strange indeed), but maybe I can help to shed some light on the situation.

  1. parallel=True introduces general parallel semantics, which the compiler/Numba can apply to your code even if you do not add them explicitly (see https://numba.pydata.org/numba-doc/latest/user/parallel.html). So there may be parallel optimisations going on and using your CPU cores even if you don’t expect that to happen because you use prange(1). In your first example that seems to offer a significant speed up - so good :slight_smile:

  2. You do not really set the number of threads through prange. The number of threads used by Numba is set through an environment variable (see https://numba.pydata.org/numba-doc/latest/reference/envvars.html?highlight=environment#threading-control); see the sketch below. prange tells Numba that the iterations of the loop can be run in separate threads. So if your prange count is smaller than the number of available threads, I guess you can limit the number of threads for that loop in this way, but I don’t think that is really the intended way to do it, since other optimisations may still make use of the vast number of cores of your CPU (core team and people who know more about Numba, please scrutinise me on this :-P)
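
For instance, something along these lines (just a sketch; the exact knobs are described in the docs linked above):

# Limit the maximum number of threads before Python starts, e.g.
#   NUMBA_NUM_THREADS=4 python my_script.py
# or, with recent Numba versions, adjust it at runtime:
import numba
numba.set_num_threads(2)        # cannot exceed NUMBA_NUM_THREADS
print(numba.get_num_threads())  # the number of threads Numba will actually use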


Hi, thanks for your insight! I didn’t know Numba was able to parallelize even loops inside the prange loop; this indeed explains the behavior in example A.
Yet I still cannot explain the behavior of examples B and C. Why is parallelization not applied in B? And why is parallelization applied in C, even with parallel=False?

After some further tests I’ve got a clearer idea of what is going on. First of all, I modified the above examples by adding some more number-crunching work inside the inner loop.

Example A now is as follows:

# Example A
def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        tmp = np.empty(5)
        for i_row in range(i_row_begin, i_row_end):
            # Do some calculation on a single row
            tmp[0] = i_row
            for index_s in range(out.shape[1]):
                tmp[1] = index_s
                for index_t in range(out.shape[2]):
                    tmp[2] = index_t + index_s
                    tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                    tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                    tmp[3] = tmp[4] * math.sin(1.3 + tmp[3])
                    tmp[4] = tmp[3] * math.cos(tmp[3])
                    tmp[3] = tmp[4] + math.sin(1.3 + tmp[3])
                    tmp[4] = tmp[3] + math.cos(tmp[3])
                    tmp[2] = math.cos(tmp[1]) * math.sin(tmp[4] - tmp[2])
                    tmp[3] = math.exp(tmp[2] * tmp[4]) + math.cos(tmp[3])
                    if index_t % 2 == 1:
                        tmp[3] = tmp[3] - tmp[2] * out[i_row, index_s, index_t-1]
                    out[i_row, index_s, index_t] = np.sum(tmp / (1 + np.sum(tmp)))
    return out
print()
print("Example A")
print_time(f)

Leading to:

Example A
Using parallel=False
2.33 s ± 208 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(1)
1.18 s ± 7.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(2)
645 ms ± 48.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(4)
321 ms ± 7.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(8)
171 ms ± 7.85 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(16)
90.2 ms ± 2.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(32)
82.2 ms ± 4.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(46)
74.6 ms ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(64)
70.5 ms ± 2.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

So now there is a clear scaling with the prange loop. Still, parallel=False takes almost twice as long as parallel=True with prange(1), even though in the latter case the code still uses only one core.

Then, the outcome of example B is exactly as before, i.e. no parallelization is used:

# Example B
def calc_bunch_rows(i_row_begin, i_row_end, out):
    tmp = np.empty(5)
    for i_row in range(i_row_begin, i_row_end):
        tmp[0] = i_row
        for index_s in range(out.shape[1]):
            tmp[1] = index_s
            for index_t in range(out.shape[2]):
                tmp[2] = index_t + index_s
                tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                tmp[3] = tmp[4] * math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] * math.cos(tmp[3])
                tmp[3] = tmp[4] + math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] + math.cos(tmp[3])
                tmp[2] = math.cos(tmp[1]) * math.sin(tmp[4] - tmp[2])
                tmp[3] = math.exp(tmp[2] * tmp[4]) + math.cos(tmp[3])
                if index_t % 2 == 1:
                    tmp[3] = tmp[3] - tmp[2] * out[i_row, index_s, index_t-1]
                out[i_row, index_s, index_t] = np.sum(tmp / (1 + np.sum(tmp)))
calc_bunch_rows = numba.njit(calc_bunch_rows, nogil=True)

def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        calc_bunch_rows(i_row_begin, i_row_end, out)
print()
print("Example B")
print_time(f)

Leading to:

Example B
Using parallel=False
2.25 s ± 29.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(1)
2.23 s ± 13.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(2)
3.13 s ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(4)
2.86 s ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(8)
2.63 s ± 153 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(16)
2.47 s ± 112 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(32)
3.4 s ± 50.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(46)
3.85 s ± 42.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(64)
3.88 s ± 35.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

Finally, example C has now exactly the same performance as example A:

# Example C
def calc_bunch_rows(i_row_begin, i_row_end, out):
    tmp = np.empty(5)
    tmp2 = np.empty(5)
    for i_row in range(i_row_begin, i_row_end):
        tmp[0] = i_row
        for index_s in range(out.shape[1]):
            tmp[1] = index_s
            for index_t in range(out.shape[2]):
                tmp[2] = index_t + index_s
                tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                tmp[3] = tmp[4] * math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] * math.cos(tmp[3])
                tmp[3] = tmp[4] + math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] + math.cos(tmp[3])
                tmp[2] = math.cos(tmp[1]) * math.sin(tmp[4] - tmp[2])
                tmp[3] = math.exp(tmp[2] * tmp[4]) + math.cos(tmp[3])
                if index_t % 2 == 1:
                    tmp[3] = tmp[3] - tmp[2] * out[i_row, index_s, index_t-1]
                tmp_sum = 0
                for tmp_i in range(tmp.shape[0]):
                    tmp_sum += tmp[tmp_i]
                for tmp_i in range(tmp.shape[0]):
                    tmp2[tmp_i] += tmp[tmp_i] / 1 + tmp_sum
                tmp_sum = 0
                for tmp_i in range(tmp.shape[0]):
                    tmp_sum += tmp2[tmp_i]
                out[i_row, index_s, index_t] = tmp_sum
calc_bunch_rows = numba.njit(calc_bunch_rows, nogil=True)

def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        calc_bunch_rows(i_row_begin, i_row_end, out)
print()
print("Example C")
print_time(f)

Leading to:

Example C
Using parallel=False
1.18 s ± 697 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(1)
1.18 s ± 4.14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(2)
612 ms ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(4)
309 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(8)
172 ms ± 8.69 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(16)
127 ms ± 33.8 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(32)
83.6 ms ± 3.33 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(46)
75.4 ms ± 4.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(64)
70.4 ms ± 3.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Summarizing, in examples A and C, n cores are used with the statement prange(n) (checked by looking at the CPU load).
However, when the number-crunching work is made simpler, only in example A is Numba able to perform some extra optimization and parallelize more than just the prange loop. E.g. removing the following two lines

if index_t % 2 == 1:
    tmp[3] = tmp[3] - tmp[2] * out[i_row, index_s, index_t-1]

seems to let Numba parallelize even the loops over index_t and index_s. Look here:

# Example A2 (modification of example A)
def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        tmp = np.empty(5)
        for i_row in range(i_row_begin, i_row_end):
            # Do some calculation on a single row
            tmp[0] = i_row
            for index_s in range(out.shape[1]):
                tmp[1] = index_s
                for index_t in range(out.shape[2]):
                    tmp[2] = index_t + index_s
                    tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                    tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                    tmp[3] = tmp[4] * math.sin(1.3 + tmp[3])
                    tmp[4] = tmp[3] * math.cos(tmp[3])
                    tmp[3] = tmp[4] + math.sin(1.3 + tmp[3])
                    tmp[4] = tmp[3] + math.cos(tmp[3])
                    tmp[2] = math.cos(tmp[1]) * math.sin(tmp[4] - tmp[2])
                    tmp[3] = math.exp(tmp[2] * tmp[4]) + math.cos(tmp[3])
                    out[i_row, index_s, index_t] = np.sum(tmp / (1 + np.sum(tmp)))
    return out
print()
print("Example A2")
print_time(f)

Leading to:

Example A2
Using parallel=False
2.22 s ± 9.89 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(1)
502 ms ± 19.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(2)
293 ms ± 30.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(4)
165 ms ± 5.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(8)
123 ms ± 3.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(16)
99.5 ms ± 8.96 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(32)
80.7 ms ± 6.32 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(46)
84.2 ms ± 9.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(64)
81.6 ms ± 9.61 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

Indeed, inspecting the CPU load reveals that more than n cores are used with the statement prange(n).
Finally, as one might expect, Numba does not parallelize the inner loops if they are moved to an external function, as follows (modification of example C):

# Example C2 (modification of C)
def calc_bunch_rows(i_row_begin, i_row_end, out):
    tmp = np.empty(5)
    tmp2 = np.empty(5)
    for i_row in range(i_row_begin, i_row_end):
        tmp[0] = i_row
        for index_s in range(out.shape[1]):
            tmp[1] = index_s
            for index_t in range(out.shape[2]):
                tmp[2] = index_t + index_s
                tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                tmp[3] = tmp[4] * math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] * math.cos(tmp[3])
                tmp[3] = tmp[4] + math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] + math.cos(tmp[3])
                tmp[2] = math.cos(tmp[1]) * math.sin(tmp[4] - tmp[2])
                tmp[3] = math.exp(tmp[2] * tmp[4]) + math.cos(tmp[3])
                tmp_sum = 0
                for tmp_i in range(tmp.shape[0]):
                    tmp_sum += tmp[tmp_i]
                for tmp_i in range(tmp.shape[0]):
                    tmp2[tmp_i] += tmp[tmp_i] / 1 + tmp_sum
                tmp_sum = 0
                for tmp_i in range(tmp.shape[0]):
                    tmp_sum += tmp2[tmp_i]
                out[i_row, index_s, index_t] = tmp_sum
calc_bunch_rows = numba.njit(calc_bunch_rows, nogil=True)

def f(num_threads):

    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        calc_bunch_rows(i_row_begin, i_row_end, out)
print()
print("Example C2")
print_time(f)

Leading to:

Example C2
Using parallel=False
1.17 s ± 2.24 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(1)
1.19 s ± 31.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(2)
598 ms ± 6.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(4)
307 ms ± 6.53 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Using parallel=True and prange(8)
175 ms ± 7.98 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(16)
140 ms ± 4.31 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(32)
116 ms ± 7.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(46)
109 ms ± 9.06 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Using parallel=True and prange(64)
88 ms ± 6.42 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

So now things are much clearer to me. However, I’m still asking: how can I get any parallelization out of example B, and why can’t Numba parallelize it? Is it a bug to be reported or intended behavior?

The lack of parallelization in example B is really frustrating me. I currently need to parallelize an algorithm that involves multiplying many medium-sized matrices (say 100x100). The multiplications are independent of each other, so they could be parallelized very easily. However, Numba does not support numpy.dot when the ‘out’ argument is specified, which means NumPy arrays are created dynamically (more or less like with numpy.sum in example B), and that keeps Numba from parallelizing the prange. Also, the matrices are large enough that if I move away from BLAS and write my own matrix multiplication in Numba, things will be very much slower.
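
To give an idea, this is roughly the pattern I am after (just a sketch with made-up names and shapes; multiply_all, mats_a, mats_b and results are all hypothetical):

# Sketch: many independent medium-sized matrix products in a prange loop.
# np.dot allocates its result inside the loop because the 'out' argument
# is not supported, and that allocation is exactly what worries me here.
@numba.njit(parallel=True)
def multiply_all(mats_a, mats_b, results):
    for i in numba.prange(mats_a.shape[0]):
        results[i, :, :] = np.dot(mats_a[i], mats_b[i])

a = np.random.rand(1000, 100, 100)   # e.g. 1000 independent 100x100 products
b = np.random.rand(1000, 100, 100)
res = np.empty_like(a)
multiply_all(a, b, res)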

Hi :slight_smile:

Okay, this is getting a bit intricate and it’s not all immediately obvious to me (maybe someone else can chip in as well), but I have a few ideas:

  1. Concerning example C: I think you have a small mistake in there, which may be related to the better performance. Where you are trying to compute tmp_sum (I suppose that is what you are trying to do), you used == instead of +=.
  2. I am not sure if this makes a difference, but have you tried adding parallel=True to the decorator of calc_bunch_rows? Maybe the real performance gain is hidden in parallelising that and not in the prange statement alone. In example A all the code is in f, so I guess by activating parallel on that you also benefit from those other optimisations I mentioned before.
  3. Instead of manually managing the “thread count” through prange and splitting your data into chunks, I think it would be a good idea to just iterate over every individual row with prange and see if that offers improvements (see the sketch after this list). It would also be advantageous to set the number of threads in the intended way, as explained here https://numba.pydata.org/numba-doc/latest/user/threading-layer.html#setting-the-number-of-threads, rather than by tweaking prange; it is not really made for that purpose as far as I understand. (You can reduce the number of threads for Numba during runtime, but not increase it beyond the number of threads that were spun up during Numba’s initialisation, which is set through the environment variable.)
  4. There are ways to set the LLVM compiler optimisation level through environment variables, afaik. Maybe that would offer a way to knock out optimisations beyond prange and make it easier to see where the actual performance gain is hiding (but I am not certain about that).
  5. If you are not scared of staring into the abyss, you could dump the IR that Numba generates (or even look at the LLVM assembly code). The docs contain information on how to do that as well. But if you are like me and scared of machine instructions, it may not be the most helpful thing in the world :smiley:
  6. I hazard a guess that prange is not happy with you writing to different locations of out from different threads, but that may just be a wrong assumption on my part.
  7. I have also been scratching my head as to why Numba does not support the out argument for many NumPy functions. I am not sure if there is a good reason for that beyond nobody having implemented it yet. Maybe it would not be difficult for you to extend the existing implementations of e.g. np.dot to offer the out argument as well.
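
As a rough sketch of what I mean in point 3 (untested; the loop body is just a placeholder for your per-row work):

import numba
import numpy as np

@numba.njit(parallel=True)
def process_rows(out):
    # one prange iteration per row; Numba distributes the rows over its threads
    for i_row in numba.prange(out.shape[0]):
        # ... put the per-row computation from calc_bunch_rows here ...
        out[i_row, :, :] = i_row  # placeholder

numba.set_num_threads(4)  # optionally limit the threads at runtime
out = np.empty((10000, 3, 300))
process_rows(out)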

Does numba give you any warnings that it was unable to introduce parallelism despite your wish to do so? I think usually a warning pops up when no improvements can be made through numba.

Hi! Regarding your previous message:

  1. Indeed there were some typos, thanks for noticing. After fixing them the output didn’t change significantly.
  2. I tried that and, surprisingly enough, it gets stuck compiling the functions.
  3. Yes, it’s clear that I can/should control the number of threads in that way, and probably I’ll do that. Yet examples A to D are a nice exercise to assess the parallelization capabilities of Numba.
  4. I’ll have a look at that, but I don’t have high hopes.
  5. and 7. I never went to that level with Numba, and I think it will take a while to get started. In particular, 7 seems much more doable, yet it would still take me some time just to get started. I rather think that, if it is possible, it should be something very quick for someone who already has some experience.
  6. I think this is not an issue. I noticed that when calling external functions from prange, the compiler simply assumes that there are no race conditions (which actually is perfectly fine for me). Indeed, examples C and C2 get a significant speedup with prange.

And there are no warnings at all about being unable to apply parallelization.

Regarding 2. Can you clarify what you mean by getting stuck? Does it crash or just take an awfully long time?

It takes an awfully long time; I didn’t even manage to wait until the end.

Btw, I now see that recent versions of Numba support the ‘out’ argument in numpy.dot, and that’s exactly what I needed!

Is there a reproducer for this somewhere please, I’d like to take a look. Thanks.

Hi, actually I was wrong. It doesn’t take a long time to compile, but to execute. If you are still interested in this issue, I can reproduce it with the following code:

import math
import numba
import numpy as np

num_rows = 10000


def calc_bunch_rows(i_row_begin, i_row_end, out):
    tmp = np.empty(5)
    for i_row in range(i_row_begin, i_row_end):
        tmp[0] = i_row
        for index_s in range(out.shape[1]):
            tmp[1] = index_s
            for index_t in range(out.shape[2]):
                tmp[2] = index_t + index_s
                tmp[3] = index_t / (1.2 + i_row / (index_s + 1))
                tmp[4] = index_t / (1.8 + i_row / (index_s + 1))
                tmp[3] = tmp[4] * math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] * math.cos(tmp[3])
                tmp[3] = tmp[4] + math.sin(1.3 + tmp[3])
                tmp[4] = tmp[3] + math.cos(tmp[3])
                tmp[2] = math.cos(tmp[1]) * math.sin(tmp[4] - tmp[2])
                tmp[3] = math.exp(tmp[2] * tmp[4]) + math.cos(tmp[3])
                out[i_row, index_s, index_t] = np.sum(tmp / (1 + np.sum(tmp)))
                
# Setting parallel=False fixes the issue
calc_bunch_rows = numba.njit(calc_bunch_rows, parallel=True)


def f(num_threads):
    print("Started")
    out = np.empty((num_rows, 3, 300))

    # calculate number of rows per thread
    num_rows_per_thread = int(math.ceil(num_rows / num_threads))

    for index_thread in numba.prange(num_threads):
        # Loop over loan parts
        i_row_begin = index_thread * num_rows_per_thread
        i_row_end = min(num_rows, (index_thread + 1) * num_rows_per_thread)

        calc_bunch_rows(i_row_begin, i_row_end, out)
        
    print("Ended")

# There is no need to compile f, the issue appears anyway
# f = numba.njit(f, parallel=True)

f(2)
# The function keeps using 100% of my cpu (46 cores) but takes a very long 
# time to execute (more than 5 minutes, didn't wait until the end)

I was running it on Python 3.7.7 [MSC v.1916 64 bit (AMD64)], numba 0.50.1, Windows Server 2012R2 (don’t blame me, I didn’t choose it).

If you don’t compile f, then prange just acts as range and the calls happen sequentially. The 100% CPU load must be caused by calc_bunch_rows using parallel features. Maybe something makes this terribly inefficient due to the threading overhead?
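
A quick way to see this (just a sketch):

# Outside of jitted code, numba.prange simply falls back to the built-in range,
# so the plain-Python f runs its "parallel" loop sequentially:
import numba
print(numba.prange(3))        # range(0, 3)
print(list(numba.prange(3)))  # [0, 1, 2]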

@stavoltafunzia Do you have TBB installed? If so I can guess at what the problem may be.

This could indeed be a nested parallelism issue. That said, calc_bunch_rows seems pretty compute-intensive; we’ll have to see whether TBB is installed to give a hint.

Indeed, I don’t have TBB installed.