How can I make this arithmetic run faster?

MRE
import numpy as np
import numba as nb
from numba import jit, f8

def function( A1, A2, A3, B1, B2, B3, const1, const2, const3 ):
    x = ((A1-B1)/const1)**2  +  ((A2-B2)/const2)**2  + ((A3-B3)/const3)**2
    return x

@jit(f8[:,:]( f8[:,:], f8[:,:], f8[:,:], f8[:,:], f8[:,:], f8[:,:], f8, f8, f8 ), nopython=True, parallel=False)
def function_jit( A1, A2, A3, B1, B2, B3, const1, const2, const3 ):
    x = ((A1-B1)/const1)**2  +  ((A2-B2)/const2)**2  + ((A3-B3)/const3)**2
    return x

@jit(f8[:,:]( f8[:,:], f8[:,:], f8[:,:], f8[:,:], f8[:,:], f8[:,:], f8, f8, f8 ), nopython=True, fastmath=True, parallel=False)
def function_jit_loop_fastmath( A1, A2, A3, B1, B2, B3, const1, const2, const3 ):
    x = np.empty_like(A1)
    for i in nb.prange(A1.shape[0]):
        for j in range(A1.shape[1]):
            x[i,j] = ((A1[i,j]-B1[i,j])/const1)**2  +  ((A2[i,j]-B2[i,j])/const2)**2  + ((A3[i,j]-B3[i,j])/const3)**2
    return x

n = 200

A1 = np.arange(n**2, dtype=np.float64).reshape(n, n)
A2 = A1 * np.random.uniform()
A3 = A1 * np.random.uniform()
B1 = A1.T
B2 = A2.T
B3 = A3.T

const1 = np.random.uniform()*10
const2 = np.random.uniform()*10
const3 = np.random.uniform()*100

%timeit function( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
%timeit function_jit( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
%timeit function_jit_loop_fastmath( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
print(np.allclose( function( A1, A2, A3, B1, B2, B3, const1, const2, const3 ), function_jit( A1, A2, A3, B1, B2, B3, const1, const2, const3 )))
print(np.allclose( function( A1, A2, A3, B1, B2, B3, const1, const2, const3 ), function_jit_loop_fastmath( A1, A2, A3, B1, B2, B3, const1, const2, const3 )))


416 µs ± 2.29 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
624 µs ± 3.11 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
261 µs ± 4.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
True
True

After reading this answer I replaced the vectorized expression with an explicit loop, and fastmath=True also seems to help. Together these cut 38% off the original runtime. Is it possible to cut more?

Hi @ofk123

There are a few things to note here:

  • In most cases it is better not to give the jit decorator a signature and to let Numba infer the types. In your case you tell Numba that the arrays can be any 2-dimensional float64 array, but Numba can do better if you allow it to compile a more specific function for your particular array layout, because the layout is part of the type (a short check after the benchmark code below illustrates this). This is why your vectorized jitted function runs slower than the plain NumPy version.
  • There is no real need to traverse your data in two dimensions. It may be better to pretend the array is one dimensional. An example of this will be shown later.
  • Make sure your tests are actually representative. The A arrays have C layout, while the B arrays have F layout. Also, the B arrays share data with the A arrays. Such things have a big impact on performance since the memory access patterns are different. How big the effect is depends on your hardware and the size of the array. You can run the code below to see what I mean.
see code here
import numpy as np
import numba as nb

opts = dict(
    fastmath=True,
    parallel=False,
    error_model="numpy",
)

def function(A1, A2, A3, B1, B2, B3, const1, const2, const3):
    return (
          ((A1 - B1) / const1) ** 2  
        + ((A2 - B2) / const2) ** 2  
        + ((A3 - B3) / const3) ** 2
    )

@nb.njit(**opts)
def function_jit1(A1, A2, A3, B1, B2, B3, const1, const2, const3):
    return (
          ((A1 - B1) / const1) ** 2  
        + ((A2 - B2) / const2) ** 2  
        + ((A3 - B3) / const3) ** 2
    )

@nb.njit(**opts)
def function_jit2(A1, A2, A3, B1, B2, B3, const1, const2, const3):
    x = np.empty_like(A1)
    for i in nb.prange(x.shape[0]):
        for j in range(x.shape[1]):
            x[i, j] = (
                  ((A1[i, j] - B1[i, j]) / const1) ** 2  
                + ((A2[i, j] - B2[i, j]) / const2) ** 2  
                + ((A3[i, j] - B3[i, j]) / const3) ** 2
            )
    return x

@nb.njit(**opts)
def function_jit3(A1, A2, A3, B1, B2, B3, const1, const2, const3):
    x = np.empty_like(A1)
    for i in nb.prange(x.size):
        x.flat[i] = (
              ((A1.flat[i] - B1.flat[i]) / const1) ** 2  
            + ((A2.flat[i] - B2.flat[i]) / const2) ** 2  
            + ((A3.flat[i] - B3.flat[i]) / const3) ** 2
        )
    return x

for n in (50, 200, 600):
    print(f"\nArray size: {n} by {n}")
    A1 = np.random.rand(n, n)
    A2 = np.random.rand(n, n)
    A3 = np.random.rand(n, n)

    c1 = np.random.rand()
    c2 = np.random.rand()
    c3 = np.random.rand()

    for B1, B2, B3, case in (
        (A1.copy(),   A2.copy(),   A3.copy(),   "C order, data not shared"),
        (A1.copy().T, A2.copy().T, A3.copy().T, "\nF order, data not shared"),
        (A1,          A2,          A3,          "\nC order, shared data"),
        (A1.T,        A2.T,        A3.T,        "\nF order, shared data"),
    ):
        print(case)
        
        assert np.allclose(function(A1, A2, A3, B1, B2, B3, c1, c2, c3), function_jit1(A1, A2, A3, B1, B2, B3, c1, c2, c3))
        assert np.allclose(function(A1, A2, A3, B1, B2, B3, c1, c2, c3), function_jit2(A1, A2, A3, B1, B2, B3, c1, c2, c3))
        assert np.allclose(function(A1, A2, A3, B1, B2, B3, c1, c2, c3), function_jit3(A1, A2, A3, B1, B2, B3, c1, c2, c3))

        %timeit function(A1, A2, A3, B1, B2, B3, c1, c2, c3)
        %timeit function_jit1(A1, A2, A3, B1, B2, B3, c1, c2, c3)
        %timeit function_jit2(A1, A2, A3, B1, B2, B3, c1, c2, c3)
        %timeit function_jit3(A1, A2, A3, B1, B2, B3, c1, c2, c3)
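
To make the layout point concrete: nb.typeof shows that the layout is encoded in the type, and a lazily compiled function records one specialization per input type. A minimal check (a sketch to run alongside the benchmark, not part of it):

import numpy as np
import numba as nb

A = np.random.rand(4, 4)
print(nb.typeof(A))          # array(float64, 2d, C)
print(nb.typeof(A.T))        # array(float64, 2d, F)
print(nb.typeof(A[:, ::2]))  # array(float64, 2d, A) -- non-contiguous

@nb.njit
def add_one(x):
    return x + 1.0

add_one(A)    # compiles a C-layout specialization
add_one(A.T)  # compiles a second specialization for F layout
print(add_one.signatures)  # one signature per layout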

Hi @sschaer, I ran your code and it looks very interesting!
function_jit3 with C order is definitely the fastest combination. However, my input is always A1, A2, A3, A1.T, A2.T, A3.T, const1, const2, const3, so I will not be able to get the runtime reduction seen in function_jit3.

The timeit results tell me I should use function_jit2, which traverses the array in two dimensions and is equivalent to the loop function in my original post. (One variation still worth measuring is sketched after the timings below.)

timeit-results
Array size: 50 by 50
C order, data not shared
24.1 µs ± 123 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
24.4 µs ± 176 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
14.6 µs ± 57.7 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
4.06 µs ± 43.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

F order, data not shared
31.7 µs ± 235 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
23.9 µs ± 155 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
17.5 µs ± 55.8 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
52.3 µs ± 102 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

C order, shared data
23 µs ± 77.6 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
24.2 µs ± 183 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
14.7 µs ± 75.3 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
3.63 µs ± 24.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

F order, shared data
31.5 µs ± 169 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
23.7 µs ± 71.9 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
17.6 µs ± 97.4 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
52.7 µs ± 250 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

Array size: 200 by 200
C order, data not shared
291 µs ± 3.73 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
360 µs ± 2.54 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
206 µs ± 731 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
56.7 µs ± 826 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

F order, data not shared
381 µs ± 2.36 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
484 µs ± 2.41 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
319 µs ± 2.03 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
902 µs ± 6.75 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

C order, shared data
256 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
359 µs ± 1.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
206 µs ± 1.7 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
37.3 µs ± 448 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

F order, shared data
361 µs ± 2.19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
449 µs ± 6.94 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
292 µs ± 1.18 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
846 µs ± 2.54 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

Array size: 600 by 600
C order, data not shared
7.21 ms ± 91 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.3 ms ± 6.72 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.91 ms ± 19 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
1.07 ms ± 41.8 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

F order, data not shared
9.73 ms ± 171 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
13.9 ms ± 107 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
12 ms ± 98.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
18.2 ms ± 154 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

C order, shared data
6.28 ms ± 54.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.28 ms ± 5.03 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.85 ms ± 15.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
410 µs ± 6.34 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

F order, shared data
9.12 ms ± 213 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
13.6 ms ± 237 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
11.6 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
17.8 ms ± 220 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
CPU times: user 4min 33s, sys: 8.92 s, total: 4min 42s
Wall time: 4min 42s
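
The variation mentioned above, before settling on function_jit2 (a hedged sketch: function_jit3_with_copies is a made-up name, and it assumes function_jit3 and the arrays from the code above): copy the transposed inputs to C order with np.ascontiguousarray so that function_jit3 gets its fastest specialization, then check whether the copy cost is amortized.

import numpy as np

def function_jit3_with_copies(A1, A2, A3, B1, B2, B3, c1, c2, c3):
    # Materialize C-ordered copies of the transposed inputs. The copies
    # cost time and memory, so only a benchmark can tell if they pay off.
    B1c = np.ascontiguousarray(B1)
    B2c = np.ascontiguousarray(B2)
    B3c = np.ascontiguousarray(B3)
    return function_jit3(A1, A2, A3, B1c, B2c, B3c, c1, c2, c3)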

Thanks a lot for the feedback, though! The bullet points and code were very helpful. Are there any other methods that can help with this arithmetic?

Since the input 2D arrays actually have some of their data repeated, I stopped passing the 2D arrays as arguments. Instead I reduced them to 1D arrays containing only the unique values and loop over those as if they were 2D arrays. This cuts the runtime in half, which is great, but I would love to improve it further.

For each array I need to calculate the difference between every element and all the other elements.
I edited the example a bit and added the function I use today, named function_1darrays().

Summary
import numpy as np
from numba import njit

def function( A1, A2, A3, B1, B2, B3, const1, const2, const3 ):
    x = ((A1-B1)/const1)**2  +  ((A2-B2)/const2)**2  + ((A3-B3)/const3)**2
    return x

@njit(parallel=False)
def function_jit( A1, A2, A3, B1, B2, B3, const1, const2, const3 ):
    x = ((A1-B1)/const1)**2  +  ((A2-B2)/const2)**2  + ((A3-B3)/const3)**2
    return x

@njit(fastmath=True, parallel=False)
def function_jit_loop_fastmath(n, A1, A2, A3, B1, B2, B3, const1, const2, const3 ):
    x = np.zeros_like(A1)
    for i in range(n):
        for j in range(n):
            x[i,j] = ((A1[i,j]-B1[i,j])/const1)**2  +  ((A2[i,j]-B2[i,j])/const2)**2  + ((A3[i,j]-B3[i,j])/const3)**2
    return x

@njit( fastmath = True, parallel=False )
def function_1darrays( n, a1, a2, a3, const1, const2, const3 ) -> np.ndarray:
    x = np.empty((n,n))
    for i in range(n):
        a1i = a1[i]
        a2i = a2[i]
        a3i = a3[i]
        for j in range(n):
            x[i,j] = ((a1i - a1[j])/const1)**2  +  ((a2i - a2[j])/const2)**2  + ((a3i - a3[j])/const3)**2
    return x

n = 200

a1 = np.arange(1,      n+1, dtype=np.float64)
a2 = np.arange(n+1,  2*n+1, dtype=np.float64)
a3 = np.arange(2*n+1,3*n+1, dtype=np.float64)
A1 = np.tile(a1, (n, 1))
A2 = np.tile(a2, (n, 1))
A3 = np.tile(a3, (n, 1))
B1 = A1.T
B2 = A2.T
B3 = A3.T

const1 = 1.0
const2 = 1.0
const3 = 1.0

K1 = function( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
K2 = function_jit( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
K3 = function_jit_loop_fastmath( n, A1, A2, A3, B1, B2, B3, const1, const2, const3 )
K4 = function_1darrays( n, a1, a2, a3, const1, const2, const3 )

%timeit function( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
%timeit function_jit( A1, A2, A3, B1, B2, B3, const1, const2, const3 )
%timeit function_jit_loop_fastmath( n, A1, A2, A3, B1, B2, B3, const1, const2, const3 )
%timeit function_1darrays( n, a1, a2, a3, const1, const2, const3 )

print(np.array_equal( K1, K2))
print(np.array_equal( K1, K3))
print(np.array_equal( K1, K4))
print(np.array_equal( K2, K3))
print(np.array_equal( K2, K4))
print(np.array_equal( K3, K4))

Out:

376 µs ± 2.25 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
488 µs ± 2.06 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
270 µs ± 1.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
125 µs ± 709 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
True
True
True
True
True
True
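
Since x[i, j] equals x[j, i] by construction (the pairwise differences only change sign before they are squared), another possible reduction is to compute only the upper triangle and mirror it. A minimal, unbenchmarked sketch (function_1darrays_sym is a made-up name, it assumes the function_1darrays setup from the Summary above, and whether the strided writes to x[j, i] cost more than the arithmetic they save would need measuring):

import numpy as np
from numba import njit

@njit(fastmath=True)
def function_1darrays_sym(n, a1, a2, a3, const1, const2, const3):
    # x is symmetric with a zero diagonal, so fill the upper triangle
    # and mirror each value instead of recomputing it.
    x = np.empty((n, n))
    for i in range(n):
        x[i, i] = 0.0
        for j in range(i + 1, n):
            v = (
                  ((a1[i] - a1[j]) / const1) ** 2
                + ((a2[i] - a2[j]) / const2) ** 2
                + ((a3[i] - a3[j]) / const3) ** 2
            )
            x[i, j] = v
            x[j, i] = v
    return x

print(np.allclose(function_1darrays(n, a1, a2, a3, const1, const2, const3),
                  function_1darrays_sym(n, a1, a2, a3, const1, const2, const3)))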