Floyd warshall's in numba vs c++ openmp

Hi,
I am trying to implement/benchmark a parallel Floyd Warshall algorithm’s performance on Numba vs c++.
I would like to know if I can do anything to improve the performance of numba version and bring it closer to the c++ version with openmp and -O2 flag.

I tried three methods with slight variations

  1. min - using regular python min inside the loops
  2. np.minimum in loop - using numpy minimum within the parallel loop
  3. np.minimum in function - using numpy minimum within the loop but on a different function

For some reason moving the numpy minimum outside of the parallel function performs better as the size of the matrix increases.
image

import numpy as np
from timeit import default_timer
from tqdm import trange
from numba import njit, prange, set_num_threads


@njit(cache=True,fastmath=True)
def npmin_infunc(i_array,k_array,k):
    np.minimum(i_array,k_array+i_array[k],i_array)


@njit(parallel=True,cache=True,fastmath=True)
def apsp_kernel_min(sp):
    n = sp.shape[0]
    for k in range(n):
        for i in prange(n):
            for j in range(n):
                sp[i,j] = min(sp[i,j],sp[i,k]+sp[k,j])
    return sp


@njit(parallel=True,cache=True,fastmath=True)
def apsp_kernel_npmin_inloop(sp):
    n = sp.shape[0]
    for k in range(n):
        for i in prange(n):
            np.minimum(sp[i],sp[k]+sp[i,k],sp[i])
    return sp


@njit(parallel=True,cache=True,fastmath=True)
def apsp_kernel_npmin_infunc(sp):
    n = sp.shape[0]
    for k in range(n):
        for i in prange(n):
            npmin_infunc(sp[i],sp[k],k)
    return sp


if __name__ == '__main__':
    N=[500,1000,1500,2000]
    repeat=5
    num_threads=4

    methods = {"min":apsp_kernel_min,"npmin_inloop":apsp_kernel_npmin_inloop,"npmin_infunc":apsp_kernel_npmin_infunc}
    run_time = {k:[np.inf]*len(N) for k in methods.keys()} 

    print(f"Running APSP for |N| = {len(N)}, repeat = {repeat}, |methods| =  {len(methods)}, threads = {num_threads}")

    set_num_threads(num_threads)
    for i,n in enumerate(N):
        for _ in trange(repeat,desc=str(n)+"x"+str(n)):
            adj_matrix = np.random.randint(1, 100, (n, n))
            np.fill_diagonal(adj_matrix,0)
            
            for k,v in methods.items():
                new_matrix = adj_matrix.copy()
                start = default_timer()
                
                v(new_matrix)
                
                end = default_timer()
                run_time[k][i] = min(end - start,run_time[k][i])

    print("Results:")
    print(f"N,{','.join(run_time.keys())}")
    for i,n in enumerate(N):
        print(f"{n},{','.join([str(v[i]) for v in run_time.values()])}")

I also installed SVML and have the following environmental variables set

export NUMBA_OPT=3
export NUMBA_LOOP_VECTORIZE=1
export NUMBA_THREADING_LAYER="omp"
export NUMBA_ENABLE_AVX=1

The C++ version is with omp parallel for on the 2nd loop with a fixed chunk size of 25.

int apsp(vector< vector<int> >& paths){
	for(int k=0;k<paths.size();++k){
#pragma omp parallel for num_threads(n_threads) schedule(static,25)
		for(int i=0;i<paths.size();++i){
			for(int j=0;j<paths[i].size();++j){
				paths[i][j] = min(paths[i][j],paths[i][k]+paths[k][j]);
			}
		}	
	}
	return (0);
}