Accelerating calculations involving 3D array

Here is my function accelerated by numba. The function has a parameter of 3D array. I wonder if there is any way to make it faster.

a = np.ones((256,256), dtype='float32')
b = np.ones((256,256), dtype='float32') * 3
c = np.random.rand(4, 256, 256) + 1j*np.random.rand(4, 256, 256)
c = c.astype('complex64')

sig = ['void(float32[:,:],float32[:,:],complex64[:,:,:])',
        'void(float32[:,:],float32[:,:],complex128[:,:,:])',
        'void(float64[:,:],float64[:,:],complex64[:,:,:])',
        'void(float64[:,:],float64[:,:],complex128[:,:,:])']
@njit(sig, parallel=True, boundscheck=False)
def f(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in nb.prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)

@njit(parallel=True, boundscheck=False)
def f1(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in nb.prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)

%%time
f(a,b,c)
Wall time: 5.95 ms

%%time
f1(a,b,c)
Wall time: 5.95 ms

hi @shz, interestingly the signature did not make a difference. In many cases it will, but here it does not. This shows the importance of measuring rather than guessing.

I made some modifications to explore different possibilities. I was able to achieve a 4.5x speedup. The most important change is to avoid calculating amplitude[j, k] / (np.sqrt(denom[j, k]) + eps) for each i since that factor is constant on i. To do that I had to change the order of the loop. Since I had to change the order to the loop, in f3 I went ahead and changed the input array order to column-major (Fortran) order.

import numpy as np
from numba import njit, prange


a = np.ones((256,256), dtype='float32')
b = np.ones((256,256), dtype='float32') * 3
c = np.random.rand(4, 256, 256) + 1j*np.random.rand(4, 256, 256)
c = c.astype('complex64')

paral = True

sig = ['void(float32[:,:],float32[:,:],complex64[:,:,:])',
        'void(float32[:,:],float32[:,:],complex128[:,:,:])',
        'void(float64[:,:],float64[:,:],complex64[:,:,:])',
        'void(float64[:,:],float64[:,:],complex128[:,:,:])']
@njit(sig, parallel=paral, boundscheck=False)
def f(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)

                
@njit(parallel=paral, boundscheck=False)
def f1(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)
                

@njit(parallel=paral, boundscheck=False)
def f2(amplitude, denom, wave):
    eps = 2.2204e-16
    for j in prange(wave.shape[1]):
        for k in range(wave.shape[2]):
            factor = amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)
            for i in range(wave.shape[0]):
                wave[i, j, k] = wave[i, j, k] * factor
                
                
@njit(parallel=paral, boundscheck=False)
def f3(amplitude, denom, wave):
    eps = 2.2204e-16
    for k in prange(wave.shape[2]):
        for j in range(wave.shape[1]):
            factor = amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)
            for i in range(wave.shape[0]):
                wave[i, j, k] = wave[i, j, k] * factor

1 Like

OK, I got it. Thanks for your kind reply and detailed suggestion.

It’s probably worth noting that expressions like:

float32[:,:]

evaluate as “Any” ordered arrays, and as a result Numba cannot optimise quite so well as it doesn’t know about the memory layout. This is how to be specific and generate types that are of a specific order:

In [3]: from numba import types

In [4]: types.float32[:,:] # This is "any" order, same as above
Out[4]: array(float32, 2d, A)

In [5]: types.float32[:,::1] # This is "C" order
Out[5]: array(float32, 2d, C)

In [6]: types.float32[::1,:] # This is "Fortran" order
Out[6]: array(float32, 2d, F)

hope this helps.

1 Like

Thanks for your explanations. I modified the signature but it still helps little.

a = np.ones((256,256), dtype='float32')
b = np.ones((256,256), dtype='float32') * 3
c = np.random.rand(4, 256, 256) + 1j*np.random.rand(4, 256, 256)
c = c.astype('complex64')

sig = ['void(float32[:,:],float32[:,:],complex64[:,:,:])',
        'void(float32[:,:],float32[:,:],complex128[:,:,:])',
        'void(float64[:,:],float64[:,:],complex64[:,:,:])',
        'void(float64[:,:],float64[:,:],complex128[:,:,:])']
@njit(sig, parallel=True, boundscheck=False)
def f(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in nb.prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)

sig1 = ['void(float32[:,::1],float32[:,::1],complex64[:,:,::1])',
        'void(float32[:,::1],float32[:,::1],complex128[:,:,::1])',
        'void(float64[:,::1],float64[:,::1],complex64[:,:,::1])',
        'void(float64[:,::1],float64[:,::1],complex128[:,:,::1])']
@njit(sig1, parallel=True, boundscheck=False)
def f1(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in nb.prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)

@njit(parallel=True, boundscheck=False)
def f2(amplitude, denom, wave):
    eps = 2.2204e-16
    for i in nb.prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * amplitude[j, k] / (np.sqrt(denom[j, k]) + eps)

image

If a division is involved error_model=“numpy” in combination with contiguous arrays usually helps, but not in this case.

If you don’t want to change the array order as @luk-f-a showed, you could also use a temporary array.

@nb.njit(parallel=True,error_model="numpy",fastmath=True)
def f(amplitude, denom, wave):
    eps = 2.2204e-16
    TMP=np.empty_like(denom)
    for j in nb.prange(wave.shape[1]):
        for k in range(wave.shape[2]):
            TMP[j,k]=amplitude[j, k]/(np.sqrt(denom[j, k]) + eps)
            
    for i in nb.prange(wave.shape[0]):
        for j in range(wave.shape[1]):
            for k in range(wave.shape[2]):
                wave[i, j, k] = wave[i, j, k] * TMP[j,k]

%timeit f(a, b, c)
#866 µs ± 37.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)

This really helps me out. In my calculation, denom.shape[0] will be big and I think this method will save much time.