How can I improve the runtime of this linear system solve?

I am trying out different ways to solve a linear system using numba. As of now I am using scipy.linalg.cho_factor and cho_solve due to it being faster than the njit-ed version of np.linalg.solve. I have had success using numba on a bunch of other parts of my code, and would love to improve the solve as well.

(Excuse me if this has been discussed elsewhere in the discoure, I could not find a good post on it after some searches.)

How can I improve the solve-examples below, using numba?

Context

Want to solve(K,data) fast (approximately 2e9 times, where K.shape = (n,n), data.shape=(n,), and n vary between 1 and 400. In every run K and data have different values. I have a cluster of remote workers available, each w/ 1 core per worker, that I distribute the calculations to, and with scipy.linalg this takes more than 2 hours. Would love to improve the calculation.

Example
import numpy as np
from numba import njit
from scipy.linalg import cho_factor, cho_solve

# Usually I use the options below, and cache functions on the remote worker-side, 
# ahead of the computation. 

def options():
    return dict(
        parallel = False, # Since only 1 core per worker
        fastmath = True,
        cache = True,
        nogil = True,
        error_model = "numpy"
    )


@njit(**options())
def numpy_solve(K, data):
    return np.linalg.solve(K, data)

@njit(**options())
def solve_cholesky(K, data):
    LNumba = cholesky_numba(K)
    y = np.linalg.solve(LNumba, data)
    return np.linalg.solve(LNumba.T.conj(), y)

@njit(**options())
def cholesky_numba(K):
    n = K.shape[0]
    L = np.zeros_like(K)
    for i in range(n):
        for j in range(i+1):
            s = 0
            for k in range(j):
                s += L[i][k] * L[j][k]
            if (i == j):
                L[i][j] = (K[i][i] - s) ** 0.5
            else:
                L[i][j] = (1.0 / L[j][j] * (K[i][j] - s))
    return L

def scipysolve(K, data):
    c_and_lower = cho_factor( K, lower=True, overwrite_a=True, check_finite=False )
    return cho_solve(c_and_lower, data, check_finite = False)



%timeit f0 = numpy_solve(K, data)
%timeit f1 = solve_cholesky(K, data)
%timeit f2 = scipysolve(K, data)
23.2 µs ± 3.07 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
49.6 µs ± 2.98 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
10.1 µs ± 111 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


Arrays used, for reproducability
# Just copy-pasting an example in here, to make it reproducible.
data = np.array([-1.83533745e-02, -2.62924084e-02, -4.46773166e-03,  2.48877247e-02,
        3.36381269e-02, -1.34949865e-02, -2.57439008e-02,  1.41652167e-02,
        3.41128634e-03, -2.29770657e-02, -1.70633180e-02, -3.16427878e-02,
       -1.29482122e-02,  4.46694804e-02, -1.40248402e-06, -1.10435528e-03,
        1.49894427e-02,  2.48672250e-02])
K = np.array([[ 1.88679408e-03,  1.59684967e-03,  1.33447234e-03,
         1.15446881e-03,  1.07275824e-03,  1.70223626e-04,
         1.61994314e-04,  1.51883781e-04,  1.44565016e-04,
         0.00000000e+00,  1.12002441e-05,  3.66292098e-05,
         4.10665050e-05,  4.95428828e-06,  5.06540388e-05,
         4.52545932e-05,  3.00509432e-05,  2.60088220e-05],
       [ 1.59684967e-03,  1.87990962e-03,  1.56633850e-03,
         1.34747409e-03,  1.23710933e-03,  1.91337105e-04,
         1.83811650e-04,  1.77607485e-04,  1.69435316e-04,
         1.11976194e-05,  0.00000000e+00,  3.14312422e-05,
         3.15812479e-05, -8.26493526e-07,  5.02506560e-05,
         4.57856347e-05,  3.38848683e-05,  2.92687498e-05],
       [ 1.33447234e-03,  1.56633850e-03,  1.85988039e-03,
         1.58418356e-03,  1.38911730e-03,  2.23152978e-04,
         2.15480031e-04,  2.08326339e-04,  1.99154811e-04,
         3.65952793e-05,  3.14094873e-05,  0.00000000e+00,
         9.65482459e-07,  8.13753464e-06,  4.64415663e-05,
         4.35303679e-05,  3.39330403e-05,  2.90651199e-05],
       [ 1.15446881e-03,  1.34747409e-03,  1.58418356e-03,
         1.83985842e-03,  1.51937560e-03,  2.57253488e-04,
         2.49319809e-04,  2.28152631e-04,  2.19356268e-04,
         4.09994186e-05,  3.15370472e-05,  9.64798959e-07,
         0.00000000e+00,  4.92499208e-05,  5.45643572e-05,
         5.08831811e-05,  3.11473273e-05,  2.65390476e-05],
       [ 1.07275824e-03,  1.23710933e-03,  1.38911730e-03,
         1.51937560e-03,  1.86880224e-03,  2.41340451e-04,
         2.40780764e-04,  2.24390938e-04,  2.18382847e-04,
         4.95123547e-06, -8.26177854e-07,  8.14006066e-06,
         4.93001100e-05,  0.00000000e+00,  8.97243250e-05,
         8.14007318e-05,  4.37606403e-05,  3.86739545e-05],
       [ 1.70223626e-04,  1.91337105e-04,  2.23152978e-04,
         2.57253488e-04,  2.41340451e-04,  1.53351813e-03,
         1.32960595e-03,  3.00014346e-04,  3.05442260e-04,
         4.99018110e-05,  4.95160225e-05,  4.57943154e-05,
         5.38420166e-05,  8.84463921e-05,  0.00000000e+00,
        -1.08927502e-05, -1.86346107e-04, -1.94558012e-04],
       [ 1.61994314e-04,  1.83811650e-04,  2.15480031e-04,
         2.49319809e-04,  2.40780764e-04,  1.32960595e-03,
         1.54340857e-03,  3.19948671e-04,  3.29223469e-04,
         4.46057170e-05,  4.51397226e-05,  4.29459961e-05,
         5.02356654e-05,  8.02830498e-05, -1.08984108e-05,
         0.00000000e+00, -1.85542909e-04, -1.97926982e-04],
       [ 1.51883781e-04,  1.77607485e-04,  2.08326339e-04,
         2.28152631e-04,  2.24390938e-04,  3.00014346e-04,
         3.19948671e-04,  1.75840993e-03,  1.53005684e-03,
         2.99102281e-05,  3.37341060e-05,  3.38054618e-05,
         3.10522053e-05,  4.35825842e-05, -1.88269389e-04,
        -1.87360536e-04,  0.00000000e+00,  5.74482718e-06],
       [ 1.44565016e-04,  1.69435316e-04,  1.99154811e-04,
         2.19356268e-04,  2.18382847e-04,  3.05442260e-04,
         3.29223469e-04,  1.53005684e-03,  1.76364514e-03,
         2.58923593e-05,  2.91445196e-05,  2.89617996e-05,
         2.64634414e-05,  3.85245183e-05, -1.96606482e-04,
        -1.99907038e-04,  5.74600888e-06,  0.00000000e+00],
       [ 0.00000000e+00,  1.11976194e-05,  3.65952793e-05,
         4.09994186e-05,  4.95123547e-06,  4.99018110e-05,
         4.46057170e-05,  2.99102281e-05,  2.58923593e-05,
         1.79289224e-03,  1.44846528e-03,  1.16324154e-03,
         1.04458060e-03,  1.01210499e-03,  2.25202867e-04,
         1.98037880e-04, -5.25430505e-05, -3.61875293e-05],
       [ 1.12002441e-05,  0.00000000e+00,  3.14094873e-05,
         3.15370472e-05, -8.26177854e-07,  4.95160225e-05,
         4.51397226e-05,  3.37341060e-05,  2.91445196e-05,
         1.44846528e-03,  1.78718795e-03,  1.43743698e-03,
         1.27650446e-03,  1.17742592e-03,  2.69368408e-04,
         2.37871575e-04, -5.39067389e-05, -3.51288038e-05],
       [ 3.66292098e-05,  3.14312422e-05,  0.00000000e+00,
         9.64798959e-07,  8.14006066e-06,  4.57943154e-05,
         4.29459961e-05,  3.38054618e-05,  2.89617996e-05,
         1.16324154e-03,  1.43743698e-03,  1.77059676e-03,
         1.52637970e-03,  1.27899609e-03,  3.22252555e-04,
         2.85694021e-04, -5.48020161e-05, -3.31806425e-05],
       [ 4.10665050e-05,  3.15812479e-05,  9.65482459e-07,
         0.00000000e+00,  4.93001100e-05,  5.38420166e-05,
         5.02356654e-05,  3.10522053e-05,  2.64634414e-05,
         1.04458060e-03,  1.27650446e-03,  1.52637970e-03,
         1.75401853e-03,  1.39040946e-03,  3.66558052e-04,
         3.25109958e-04, -7.11544451e-05, -4.70177726e-05],
       [ 4.95428828e-06, -8.26493526e-07,  8.13753464e-06,
         4.92499208e-05,  0.00000000e+00,  8.84463921e-05,
         8.02830498e-05,  4.35825842e-05,  3.85245183e-05,
         1.01210499e-03,  1.17742592e-03,  1.27899609e-03,
         1.39040946e-03,  1.77798632e-03,  3.70092071e-04,
         3.28740223e-04, -1.05561515e-04, -8.11476293e-05],
       [ 5.06540388e-05,  5.02506560e-05,  4.64415663e-05,
         5.45643572e-05,  8.97243250e-05,  0.00000000e+00,
        -1.08984108e-05, -1.88269389e-04, -1.96606482e-04,
         2.25202867e-04,  2.69368408e-04,  3.22252555e-04,
         3.66558052e-04,  3.70092071e-04,  1.50146129e-03,
         1.28998516e-03, -9.96000586e-05, -8.10580453e-05],
       [ 4.52545932e-05,  4.57856347e-05,  4.35303679e-05,
         5.08831811e-05,  8.14007318e-05, -1.08927502e-05,
         0.00000000e+00, -1.87360536e-04, -1.99907038e-04,
         1.98037880e-04,  2.37871575e-04,  2.85694021e-04,
         3.25109958e-04,  3.28740223e-04,  1.28998516e-03,
         1.50957561e-03, -8.80310616e-05, -7.29915038e-05],
       [ 3.00509432e-05,  3.38848683e-05,  3.39330403e-05,
         3.11473273e-05,  4.37606403e-05, -1.86346107e-04,
        -1.85542909e-04,  0.00000000e+00,  5.74600888e-06,
        -5.25430505e-05, -5.39067389e-05, -5.48020161e-05,
        -7.11544451e-05, -1.05561515e-04, -9.96000586e-05,
        -8.80310616e-05,  1.68665623e-03,  1.44540841e-03],
       [ 2.60088220e-05,  2.92687498e-05,  2.90651199e-05,
         2.65390476e-05,  3.86739545e-05, -1.94558012e-04,
        -1.97926982e-04,  5.74482718e-06,  0.00000000e+00,
        -3.61875293e-05, -3.51288038e-05, -3.31806425e-05,
        -4.70177726e-05, -8.11476293e-05, -8.10580453e-05,
        -7.29915038e-05,  1.44540841e-03,  1.69098207e-03]])

Hey @ofk123 ,

have you tried to unpack your solver algorithm?
Below is a naive implementation which unfortunately becomes less efficient with increasing data size.
Numpy and Scipy Cholesky Algorithms might be slower but are probably more stable.

@njit(**options())
def cholesky_decomp(A):
    """Perform Cholesky decomposition on symmetric, positive definite matrix."""
    n = A.shape[0]
    L = np.zeros_like(A)

    # Perform the Cholesky decomposition
    for row in range(n):
        for col in range(row+1):
            tmp_sum = 0.0
            for j in range(col):
                tmp_sum += L[row, j] * L[col, j]
            if (row == col):
                # diag elts.
                L[row, col] = math.sqrt(A[row, row] - tmp_sum)
            else:
                L[row, col] = (1.0 / L[col, col] * (A[row, col] - tmp_sum))
    return L

@njit(**options())
def cholesky_solver_nb(L, b):
    """Solve system using forward substition and then backward substition."""
    U = L.T
    n = L.shape[0]
    y = np.zeros(n)
    x = np.zeros(n)

    # Forward substitution
    for i in range(n):
        sumj = 0
        for j in range(i):
            sumj += L[i, j] * y[j]
        y[i] = (b[i] - sumj) / L[i, i]

    # Backward substitution
    for i in range(n - 1, -1, -1):
        sumj = 0
        for j in range(i + 1, n):
            sumj += U[i, j] * x[j]
        x[i] = (y[i] - sumj) / U[i, i]
    return x

@njit(**options())
def cholesky_nb(K, data):
    """Decompose and solve."""
    L = cholesky_decomp(K)
    return cholesky_solver_nb(L, data)

Hi @Oyibo ,
Thanks for reaching out, and for the suggestion.
It seems unpacking the algorithm in your way results in a ~50% reduction in solve-runtime, though only when n < ~40…
Attached is a plot, with a log-scale y-axis (seconds).

Would you happen to have any other ideas?

code to produce plot
# Here, 'Ks' & 'datas' are two lists of different K- and data-arrays. 
# I am not sure how to best share my test-data with you on here... But I can share if you want!

import numpy as np
from numba import njit
from scipy.linalg import cho_factor, cho_solve

def options():
    return dict(
        parallel = False, # Since only 1 core per worker
        fastmath = True,
        cache = True,
        nogil = True,
        error_model = "numpy"
    )


@njit(**options())
def numpy_solve(K, data):
    return np.linalg.solve(K, data)


def scipysolve(K, data):
    c_and_lower = cho_factor( K, lower=True, overwrite_a=True, check_finite=False )
    return cho_solve(c_and_lower, data, check_finite = False)


@njit(**options())
def cholesky_decomp(A):
    """Perform Cholesky decomposition on symmetric, positive definite matrix."""
    n = A.shape[0]
    L = np.zeros_like(A)

    # Perform the Cholesky decomposition
    for row in range(n):
        for col in range(row+1):
            tmp_sum = 0.0
            for j in range(col):
                tmp_sum += L[row, j] * L[col, j]
            if (row == col):
                # diag elts.
                L[row, col] = math.sqrt(A[row, row] - tmp_sum)
            else:
                L[row, col] = (1.0 / L[col, col] * (A[row, col] - tmp_sum))
    return L

@njit(**options())
def cholesky_solver_nb(L, b):
    """Solve system using forward substition and then backward substition."""
    U = L.T
    n = L.shape[0]
    y = np.zeros(n)
    x = np.zeros(n)

    # Forward substitution
    for i in range(n):
        sumj = 0
        for j in range(i):
            sumj += L[i, j] * y[j]
        y[i] = (b[i] - sumj) / L[i, i]

    # Backward substitution
    for i in range(n - 1, -1, -1):
        sumj = 0
        for j in range(i + 1, n):
            sumj += U[i, j] * x[j]
        x[i] = (y[i] - sumj) / U[i, i]
    return x

@njit(**options())
def cholesky_nb(K, data):
    """Decompose and solve."""
    L = cholesky_decomp(K)
    return cholesky_solver_nb(L, data)


sizes = []
for i in range(len(datas)):
    sizes.append(datas[i].size)
sizesorted_i = np.argsort(np.array(sizes))
seconds_per_solve = np.full((len(Ks),3), np.nan)
for k, i in enumerate(sizesorted_i):
    K0 = Ks[i]
    d0 = datas[i]
    t0 = Time()
    fitcoef = numpy_solve(K0, d0)
    seconds_per_solve[k,0] = Time() - t0
    t0 = Time()
    fitcoef = scipysolve(K0, d0)
    seconds_per_solve[k,1] = Time() - t0
    t0 = Time()
    fitcoef = cholesky_nb(K0, d0)
    seconds_per_solve[k,2] = Time() - t0
    del fitcoef, t0


import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1,figsize=(12,7))
ax.scatter(np.array(sizes)[sizesorted_i],seconds_per_solve[:,0], s=3, label='numpy_solve(K, data)' )
ax.scatter(np.array(sizes)[sizesorted_i],seconds_per_solve[:,1], s=3, label='scipysolve(K, data)' )
ax.scatter(np.array(sizes)[sizesorted_i],seconds_per_solve[:,2], s=3, label='cholesky_nb(K, data)')
ax.set_yscale('log');ax.grid()
ax.legend(loc='upper right', bbox_to_anchor=(1.01, 1.00), fontsize=7.75)
ax.set_ylabel('seconds per solve');ax.set_xlabel('n, where K.shape = (n,n), data.shape=(n,) ');fig.show()
fig.savefig('runtime_per_solve.png', dpi=360)

Hey @ofk123 ,

theoretically it might be possible to attempt optimizing the Cholesky decomposition using Numba.
However, SciPy’s implementation already leverages LAPACK’s POTRF algorithm, which is optimized and efficient.
Numba helps reducing Python overhead when applied to lower-level operations, especially for smaller data sizes where Python overhead dominates. You could try to call the Lapack function on a C-Level.
However, for larger datasets, the overhead reduction might not provide significant performance gains compared to SciPy’s LAPACK implementation.
Sorry ;-(

2 Likes

Okay, I see,
so I take it scipy.linalg will for these matrix-sizes be better then… That is unfortunate, but also what I found. Thanks for looking into it @Oyibo.

Where can I see an example where a Lapack function on a C-level is called to solve a linear system?

I previously tried the Cython API on a distributed cluster, but ran into this error. So I am not sure I can make it work for solve either…

Hey @ofk123 ,

Have you tried Numba’s “get_cython_function_address”?

The Lapack functions should be:

Decomposer:
void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)

Solver:
void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)


from numba.extending import get_cython_function_address

addr_decom = get_cython_function_address('scipy.linalg.cython_lapack','dpotrf')
addr_solve = get_cython_function_address('scipy.linalg.cython_lapack','dpotrs')

# TODO: 
# Define c-types, c-function
# Retrieve function address

Here are discussions that might help:

Thanks, but I think that would work locally, and not remote then, correct?

  1. Not sure if you saw the link above or not, but I use get_cython_function_address there to collect a scipy.special.cython_special-address, but fail to run it on remote workers.

  2. I tried creating the functypes, but a bit confused about why dpotrf takes char, int, and double as input arguments. If I remember correct, the first type is the output-type, and the rest are function-argument-types. So I assume decom-types would be something like (2D Array{Float64}, 2DArray{Float64})., since it takes an array and returns an array. What would be the correct ctype-specification?

Attempt
from numba.extending import get_cython_function_address
addr_decom = get_cython_function_address('scipy.linalg.cython_lapack','dpotrf')
_PTR, _dble, _char, _int  = ctypes.POINTER, ctypes.c_double, ctypes.c_char, ctypes.c_int
_ptr_dble = _PTR(_dble)
_ptr_char = _PTR(_char)
_ptr_int  = _PTR(_int)
functype_decom = ctypes.CFUNCTYPE(_ptr_char, _ptr_int, _ptr_dble, _ptr_int, _ptr_int)

decom = functype_decom(addr_decom)
decom(K)



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[59], line 10
      7 functype_decom = ctypes.CFUNCTYPE(_ptr_char, _ptr_int, _ptr_dble, _ptr_int, _ptr_int)
      9 decom = functype_decom(addr_decom)
---> 10 decom(K)

TypeError: this function takes at least 4 arguments (1 given)
​

Hey @ofk123 ,

  1. I’m not sure.

  2. Implementing the Numba wrappers unfortunately turned out to be as challenging as anticipated. While I was able to replicate the functionality of scipy.linalg.cho_factor with numba_dpotrf, I encountered difficulties replicating results for sp.linalg.cho_solve with numba_dpotrs. It seems to produce correct solutions for right-hand sides with a single column, but for multiple columns, the results are incorrect. I suspect there might be an issue with the CFUNCTYPE definition. Maybe you can spot the problem.

EDIT:
Unfortunately, this API has no feature to pass an additional argument to give hints about the memory layout. Since the algorithm is written in Fortran, it prefers F-order. That’s why the RHS matrix has to be in column major order.

"""
Example how to call LAPACK function using Numba.

https://www.netlib.org/lapack/lapacke.html#_array_arguments
https://www.netlib.org/lapack/lug/node116.html
https://www.netlib.org/lapack/lug/node121.html#secstorage

https://netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
https://netlib.org/lapack/explore-html/d8/d2f/dpotrs_8f.html
https://github.com/Reference-LAPACK/lapack/blob/master/LAPACKE/include/lapacke.h
"""
import timeit
import ctypes
import numpy as np
import scipy as sp
from numba import njit
from numba.extending import get_cython_function_address

# Unfortunately, it is not possible to pass an argument defining the memory layout
# https://github.com/Reference-LAPACK/lapack/blob/master/LAPACKE/include/lapacke.h
# LAPACK_ROW_MAJOR = 101
# LAPACK_COL_MAJOR = 102


# Define necessary datatypes and pointers
_ptr_dbl = ctypes.POINTER(ctypes.c_double)
_ptr_int = ctypes.POINTER(ctypes.c_int)
# _ptr_char = ctypes.POINTER(ctypes.c_char)     # This does not work
_ptr_char = ctypes.POINTER(ctypes.c_int)        # Must be integer representing Unicode

# Get addresses of dpotrf and dpotrs functions
dpotrf_addr = get_cython_function_address('scipy.linalg.cython_lapack', 'dpotrf')
dpotrs_addr = get_cython_function_address('scipy.linalg.cython_lapack', 'dpotrs')
# print(sp.linalg.cython_lapack.__pyx_capi__['dpotrf'])
# print(sp.linalg.cython_lapack.__pyx_capi__['dpotrs'])
# <capsule object "void (char *, int *, __pyx_t_5scipy_6linalg_13cython_lapack_d *, int *, int *)" at 0x7fb0104a9350>
# <capsule object "void (char *, int *, int *, __pyx_t_5scipy_6linalg_13cython_lapack_d *, int *, __pyx_t_5scipy_6linalg_13cython_lapack_d *, int *, int *)" at 0x7fb0104a93b0>

# Define function types

# void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
dpotrf_functype = ctypes.CFUNCTYPE(
    None,       # void
    _ptr_char,  # UPLO
    _ptr_int,   # N
    _ptr_dbl,   # A
    _ptr_int,   # LDA
    _ptr_int,   # INFO
)

# void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
dpotrs_functype = ctypes.CFUNCTYPE(
    None,       # void
    _ptr_char,  # UPLO
    _ptr_int,   # N
    _ptr_int,   # NRHS
    _ptr_dbl,   # A
    _ptr_int,   # LDA
    _ptr_dbl,   # B
    _ptr_int,   # LDB
    _ptr_int,   # INFO
)

# Create function objects
dpotrf_fn = dpotrf_functype(dpotrf_addr)
dpotrs_fn = dpotrs_functype(dpotrs_addr)


@njit
def numba_dpotrf(A, lower=True):
    """
    DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.

    void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
    lapack_int LAPACKE_dpotrf( int matrix_layout, char uplo, lapack_int n, double* a,
                               lapack_int lda )
    """
    # Data type must be integer representing Unicode!!!
    UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
    size = A.shape[0]
    N = np.array(size, dtype=np.int32)
    LDA = np.array(size, dtype=np.int32)
    INFO = np.array(0, dtype=np.int32)
    dpotrf_fn(
        UPLO.ctypes,
        N.ctypes,
        A.ctypes,       # out
        LDA.ctypes,
        INFO.ctypes,    # out
    )
    if INFO:
        raise Exception(f'Oh no, something went wrong. INFO: {INFO}')
    return A  # , INFO


@njit
def numba_dpotrs(A, B, lower=True):
    """
    DPOTRS solves a system of linear equations A*X = B.

    System is solved using a symmetric positive definite matrix A using the
    Cholesky factorization.
    A = U**T*U or A = L*L**T computed by DPOTRF.

    void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
    lapack_int LAPACKE_dpotrs( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, const double* a, lapack_int lda,
                           double* b, lapack_int ldb )
    """
    # Data type must be integer representing Unicode!!!
    UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
    INFO = np.array(0, dtype=np.int32)
    size = A.shape[0]
    N = np.array(size, dtype=np.int32)
    NRHS = np.array(B.shape[0], dtype=np.int32)
    LDA = np.array(size, dtype=np.int32)
    LDB = np.array(B.shape[1], dtype=np.int32)
    dpotrs_fn(
        UPLO.ctypes,
        N.ctypes,
        NRHS.ctypes,
        A.ctypes,
        LDA.ctypes,
        B.ctypes,       # out
        LDB.ctypes,
        INFO.ctypes,    # out
    )
    if INFO:
        raise Exception(f'Oh no, something went wrong. INFO: {INFO}')
    return B


def generate_random_spd_matrix(size=4, rhs=2, seed=None):
    """Generate a random symmetric positive definite matrix."""
    rng = np.random.default_rng(seed)
    A = rng.random((size, size))    # Generate a random square matrix
    A = 0.5 * (A + A.T)             # Make it symmetric
    A = np.dot(A, A.T)              # Ensure it is positive definite
    B = rng.random((size, rhs))     # Generate a random square matrix for RHS
    return A, B


if __name__ == "__main__":
    # Example:
    # https://support.nag.com/numeric/fl/nagdoc_fl26/pdf/f07/f07fef.pdf
    A = np.array([
        [4.16, -3.12, 0.56, -0.10],
        [-3.12, 5.03, -0.83, 1.18],
        [0.56, -0.83, 0.76, 0.34],
        [-0.10, 1.18, 0.34, 1.18]], dtype=np.float64)
    B = np.array([
        [8.70, 8.30],
        [-13.35, 2.13],
        [1.89, 1.61],
        [-4.14, 5.00]], dtype=np.float64)
    B_cmo = B.T.copy()  # B must be in column mayor format
    sep = '\n\n****************************************************\n'
    print('Example:', '\nA\n', A, '\nB\n', B, sep)
    UPPER = False
    LOWER = True
    L_sp, _ = sp.linalg.cho_factor(A, LOWER)
    X_sp = sp.linalg.cho_solve((L_sp, LOWER), B)
    print('Scipy:', '\nL\n', L_sp, '\nX\n', X_sp, sep)

    # potrf & potrs are inplace operations (in other context .copy can be removed)
    L_nb = numba_dpotrf(A.copy(), LOWER)
    X_nb = numba_dpotrs(L_nb, B_cmo, LOWER).T
    print('Numba:', '\nL\n', L_nb, '\nX\n', X_nb, '\n')

    equal_L = np.allclose(L_sp, L_nb)
    equal_X = np.allclose(X_sp, X_nb)
    print('Is L equal?:', equal_L)
    print('Is X equal?:', equal_X)
    print(sep)

    # Define setup code
    number = 1
    SIZE = 1000
    RHS = 50
    setup = f'''
import numpy as np
import scipy as sp
A, B = generate_random_spd_matrix(size={SIZE}, rhs={RHS}, seed=None)
B_cmo = B.T.copy()
LOWER = True
    '''

    # Define Scipy version
    scipy_code = '''
L_sp, _ = sp.linalg.cho_factor(A, LOWER)
X_sp = sp.linalg.cho_solve((L_sp, LOWER), B)
    '''

    # Define Numba version
    numba_code = '''
L_nb = numba_dpotrf(A, LOWER)
X_nb = numba_dpotrs(L_nb, B_cmo, LOWER)
    '''

    # Time the Scipy version
    print(f'Timer for random system\nA: {SIZE}x{SIZE}, B: {SIZE}x{RHS}, runs: {number}\n')
    scipy_time = timeit.timeit(scipy_code, setup=setup,
                               number=number, globals=globals()) * 1e3
    print(f'Scipy time: {scipy_time:.3f} ms')

    # Time the Numba version
    numba_time = timeit.timeit(numba_code, setup=setup,
                               number=number, globals=globals()) * 1e3
    print(f'Numba time: {numba_time:.3f} ms')

# Example:
# A
#  [[ 4.16 -3.12  0.56 -0.1 ]
#  [-3.12  5.03 -0.83  1.18]
#  [ 0.56 -0.83  0.76  0.34]
#  [-0.1   1.18  0.34  1.18]]
# B
#  [[  8.7    8.3 ]
#  [-13.35   2.13]
#  [  1.89   1.61]
#  [ -4.14   5.  ]]

# ****************************************************

# Scipy:
# L
#  [[ 2.03960781 -3.12        0.56       -0.1       ]
#  [-1.52970585  1.64012195 -0.83        1.18      ]
#  [ 0.27456259 -0.24998141  0.78874881  0.34      ]
#  [-0.04902903  0.67373039  0.66165756  0.53468943]]
# X
#  [[ 1.  4.]
#  [-1.  3.]
#  [ 2.  2.]
#  [-3.  1.]]

# ****************************************************

# Numba:
# L
#  [[ 2.03960781 -3.12        0.56       -0.1       ]
#  [-1.52970585  1.64012195 -0.83        1.18      ]
#  [ 0.27456259 -0.24998141  0.78874881  0.34      ]
#  [-0.04902903  0.67373039  0.66165756  0.53468943]]
# X
#  [[ 1.  4.]
#  [-1.  3.]
#  [ 2.  2.]
#  [-3.  1.]]

# Is L equal?: True
# Is X equal?: True


# ****************************************************

# Timer for random system
# A: 1000x1000, B: 1000x50, runs: 1

# Scipy time: 15.035 ms
# Numba time: 7.636 ms

# ****************************************************

# Python: 3.12.0
# Numpy: 1.26.3
# Numba: 0.59.0

1 Like

Hi @Oyibo

Pretty sure the memory layout of the matrices causes issues. Did you try column major order arrays since this is the default in Fortran?

1 Like

Hey @sschaer ,

Unfortunately, there seems to be no way to pass an additional argument to give hints about the memory layout using this API.
I have transposed the RHS matrix and adjusted the arguments in the solver function accordingly.

Thanks.

Maybe a better way to deal with it is to make the argument F-contiguous if it is not already. You can check this at compile time. If you make a copy you either warn the user that you return a copy or you write the result back to the original array.

Here is a dummy example:

import warnings
import numba as nb 
from numba.core.errors import NumbaPerformanceWarning


def foo(x):
    ...

@nb.extending.overload(foo) 
def foo_impl(x):
    if x.layout == "F":
        return lambda x: x
    
    warnings.warn(
        "foo is faster on f-contiguous arrays", 
        NumbaPerformanceWarning
    )

    # make F order copy and write to x since your function
    # operates inplace (or warn that you will return a copy)
    def impl(x):
        x[:] = np.asfortranarray(x)
        return x 
    
    return impl

@nb.njit 
def test(x):
    return foo(x)

a = np.ones((42, 42))
b = np.ones((42, 42), order="F")

print(test(a))
print(test(b))

@sschaer absolutely!

I have not much experience using the LAPACK interface on a low level and the Scipy interface isn’t well documented, too. Nevertheless, here is what I learned:

When utilizing the LAPACK interface, it’s essential to understand at least the basic conventions.
The LAPACKE C-interface to LAPACK introduces special arguments in its functions (LAPACK_ROW_MAJOR or LAPACK_COL_MAJOR).
These arguments specify the memory layout of matrices. From my perspective this allows a safer and more precise way of interacting with LAPACK.

However, the Cython-LAPACK implementation seems to rely on the FORTRAN interface.
In this case, the FORTRAN interface assumes and prefers LAPACK_COL_MAJOR layout or F-order.
Therefore, providing F-contiguous arrays to this interface is probably a good idea and might be beneficial from a performance perspective.

For operations like Cholesky decomposition, the result remains consistent regardless of memory layout. The matrix is square and symmetric.
The performance on the other hand can vary significantly.
Empirical testing with a 1000x1000 matrix demonstrates that the F-order layout seems to be more efficient.
The choice of triangle storage (lower or upper) does not significantly impact performance.
Here is a test result:

Cholesky decomposition of a matrix 1000 x 1000, runs: 10, reps: 100
order:C, triangle storage: lower, time: 10.412 ms
order:C, triangle storage: upper, time: 10.553 ms
order:F, triangle storage: lower, time: 8.776 ms
order:F, triangle storage: upper, time: 8.705 ms

When using the Cython-LAPACK interface on C-order matrices, passing a transposed reference could make sense to avoid unnecessary copies.
A transposed C-order matrix has F-order layout.
If you know your matrix structures and sizes you should benchmark the algorithm and find out what process pipeline works best.

One potential challenge arises if the user is not aware of what happens under the hood and when providing both C- and F-order matrices to LAPACK functions.
In such cases, it’s crucial to provide the correct dimensions, especially for rectangular matrices (m-by-n).

For the Cholesky solver specifically, it’s advisable to determine the possible cases:

  • If A is an n-by-n square matrix and B is an m-by-n rectangular matrix, n represents the system size, and m is the number of right-hand sides. => You can determine the mayor order and set the matrix dimensions passed to the LAPACK function accordingly.
  • If B is a 1D vector, n still represents the system size, with 1 indicating the number of right-hand sides. => Again, you can determine the mayor order and set the matrix dimensions passed to the LAPACK function.
  • If A and B are both n-by-n matrices, the direction of the solution may not be clear => warn the user and assume a layout.

In my provided example I did not make this determination, so the code will fail if you use an other matrix layout.
Compared to the more convenient Scipy functions, I would assume that the performance gains using the low level LAPACK functions should decrease with increasing system dimensions.
When interacting with LAPACK functions (without LAPACKE C-interface) I would strictly use FORTRAN memory layouts. Just to be on the safe side.

1 Like

Much appreciate both your replies, @sschaer, @Oyibo I am eager to test this out more in detail. Though it will only make sense for me when I have learned how to cache it on a remote worker.

Might it be possible to call the C function directly bypassing the cython function?

Hey @nelson2005 , it’s quite likely that there are additional methods, such as a direct C-call to the underlying LAPACK functions, which could further improve performance. The crucial point is whether the performance gains justify the additional effort.
The Numba-Scipy wrapper shows awesome speed. Even for small linear systems, this function outperforms my naive Numba solution. However, at a matrix size of around 1000x1000, the overhead of the Scipy function becomes negligible. Both the Scipy function and the direct Numba-LAPACK approach show similar performance beyond this point, while the naive version deteriorates more and more.

scipy_vs_numba

1 Like
  1. Below an updated figure including numba_dpotrf & numba_dpotrs runtimes on my test-data,(called cythonlapackCholesky in figure). The test was done locally in an ipython-session. However, when distributing the calculations to remote workers, I unfortunately do not get the improvement seen locally, so I think I would benefit from caching it on the workerside. What would be a good next step for me to try out, in order to get closer to being able to caching numba_dpotrf and numba_dpotrs?

  1. I see that all my K-arrays are initially C_CONTIGUOUS. I do not really see a difference in performance when K being F-order, as compared with the C-order it is already in. Are you sure we are supposed to see a difference? K is always symmetric, so maybe this means the order does not matter in my case?
F-order vs. C-order argument
def not_cache():
    return dict(
        parallel = False,
        fastmath = True,
        cache = False,
        nogil = True,
        error_model = "numpy"
    )

_ptr_dbl = ctypes.POINTER(ctypes.c_double)
_ptr_int = ctypes.POINTER(ctypes.c_int)
_ptr_char = ctypes.POINTER(ctypes.c_int)        # Must be integer representing Unicode

# Get addresses of dpotrf and dpotrs functions
dpotrf_addr = get_cython_function_address('scipy.linalg.cython_lapack', 'dpotrf')
dpotrs_addr = get_cython_function_address('scipy.linalg.cython_lapack', 'dpotrs')


# Define function types

# void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
dpotrf_functype = ctypes.CFUNCTYPE(
    None,       # void
    _ptr_char,  # UPLO
    _ptr_int,   # N
    _ptr_dbl,   # A
    _ptr_int,   # LDA
    _ptr_int,   # INFO
)

# void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
dpotrs_functype = ctypes.CFUNCTYPE(
    None,       # void
    _ptr_char,  # UPLO
    _ptr_int,   # N
    _ptr_int,   # NRHS
    _ptr_dbl,   # A
    _ptr_int,   # LDA
    _ptr_dbl,   # B
    _ptr_int,   # LDB
    _ptr_int,   # INFO
)

# Create function objects
dpotrf_fn = dpotrf_functype(dpotrf_addr)
dpotrs_fn = dpotrs_functype(dpotrs_addr)


@njit(**not_cache())
def numba_dpotrf(A, lower=True):
    """
    DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.

    void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
    lapack_int LAPACKE_dpotrf( int matrix_layout, char uplo, lapack_int n, double* a,
                               lapack_int lda )
    """
    # Data type must be integer representing Unicode!!!
    UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
    size = A.shape[0]
    N = np.array(size, dtype=np.int32)
    LDA = np.array(size, dtype=np.int32)
    INFO = np.array(0, dtype=np.int32)
    dpotrf_fn(
        UPLO.ctypes,
        N.ctypes,
        A.ctypes,       # out
        LDA.ctypes,
        INFO.ctypes,    # out
    )
    #if INFO > 0:
    #    raise Exception('INFO > 0. The leading principal minor of order i is not positive, and the factorization could not be completed.')
    #if INFO < 0:
    #    raise Exception('INFO < 0. The i-th argument had an illegal value')
        
    return A #, INFO


@njit(**not_cache())
def numba_dpotrs(A, B, lower=True):
    """
    DPOTRS solves a system of linear equations A*X = B.

    System is solved using a symmetric positive definite matrix A using the
    Cholesky factorization.
    A = U**T*U or A = L*L**T computed by DPOTRF.

    void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
    lapack_int LAPACKE_dpotrs( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, const double* a, lapack_int lda,
                           double* b, lapack_int ldb )
    """
    # Data type must be integer representing Unicode!!!
    UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
    INFO = np.array(0, dtype=np.int32)
    size = A.shape[0]
    N = np.array(size, dtype=np.int32)
    NRHS = np.array(B.shape[0], dtype=np.int32)
    LDA = np.array(size, dtype=np.int32)
    LDB = np.array(B.shape[1], dtype=np.int32)
    dpotrs_fn(
        UPLO.ctypes,
        N.ctypes,
        NRHS.ctypes,
        A.ctypes,
        LDA.ctypes,
        B.ctypes,       # out
        LDB.ctypes,
        INFO.ctypes,    # out
    )
    #if INFO > 0:
    #    raise Exception('INFO > 0. The leading principal minor of order i is not positive, and the factorization could not be completed.')
    #if INFO < 0:
    #    raise Exception('INFO < 0. The i-th argument had an illegal value')
        
    return B#, INFO

def scipyCythonLapackCholesky_nb(K, data):
    L_nb = numba_dpotrf(K, True)
    return numba_dpotrs(L_nb, data, True)


# K used below has K.shape(n,n,) data.shape=(n,), where n = 1420, just to test a large array. 
# I also tested with n=100. No notable runtime-difference in either case.
# I do not know a good way to share my real test-data, but can share it if you want.
Kcopy = K.copy()
n = K.shape[0]
KcopyF = np.asfortranarray(Kcopy)
datacopy = data.copy().reshape(1,n)
datacopy2 = datacopy.copy()
print(Kcopy.flags)
print(KcopyF.flags)
%timeit XnbC = scipyCythonLapackCholesky_nb(Kcopy, datacopy)
%timeit XnbF = scipyCythonLapackCholesky_nb(KcopyF, datacopy2)

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

  C_CONTIGUOUS : False
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False

3.34 ms ± 30.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.34 ms ± 32.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
  1. My data-array will always be 1D, so during runtime I add an extra axis (data.reshape(1,n)) before each cythonlapackCholesky-run. @Oyibo Is there any performance-related changes you would do to the function ( f.ex. when defining the data-types), now that data always is 1D, or would you leave the function as it is?

Should probably close this post as the “main question” has been answered, and continue on “caching” in my other post. But I wanted to ask these questions first!

1 Like

Thank you for crosschecking @ofk123 .

  1. In the given example we have used mutable global variables to pass the c-functions to the jitted functions. You have to change this approach to be able to cache. Can you provide a minimum working example of what you want to accomplish using Scipy and the version that fails (in your other thread)? This will increase the likelihood to get help from other users.

  2. If you can’t measure a difference in performance between using F-order and C-order, then it’s likely that the memory layout doesn’t significantly affect performance in your case. Since the matrices are square and symmetric, the computation will produce the same results regardless of the memory layout.

  3. Since your data-array is always 1D and you don’t have multiple RHS, you can simplify the process by providing a vector instead of a matrix to the LAPACK function dpotrs. You will receive a vector as a result, too. This change should be sufficient for your specific case.

@njit
def numba_dpotrs(A, B, lower=True):
    """DPOTRS solves a system of linear equations."""
    UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
    INFO = np.array(0, dtype=np.int32)
    size = A.shape[0]
    N = np.array(size, dtype=np.int32)
    LDA = np.array(size, dtype=np.int32)
    if B.ndim == 1:
        NRHS = np.array(1, dtype=np.int32)
        LDB = np.array(B.size, dtype=np.int32)
    else:
        NRHS = np.array(B.shape[0], dtype=np.int32)
        LDB = np.array(B.shape[1], dtype=np.int32)
    dpotrs_fn(
        UPLO.ctypes,
        N.ctypes,
        NRHS.ctypes,
        A.ctypes,
        LDA.ctypes,
        B.ctypes,       # out
        LDB.ctypes,
        INFO.ctypes,    # out
    )
    if INFO:
        raise Exception(f'Oh no, something went wrong. INFO: {INFO}')
    return B
1 Like
  1. Yep, will provide an MRE in that post.
  2. I am a bit confused why on your end you get different runtime-results using “FARRAY” and “CARRAY”, and not on my end. When time I will dig into that further.
  3. Great, thanks alot!
    In my case, N, LDA, (and LDB in dpotrs) end up equal.
N = np.array(size, dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
LDB = np.array(size, dtype=np.int32)

Is there anything in the way for me to disregards those definitions, and reuse the N.ctypes argument, like:

dpotrs_fn(
        UPLO.ctypes,
        N.ctypes,
        NRHS.ctypes,
        A.ctypes,
        N.ctypes,
        B.ctypes,       # out
        N.ctypes,
        INFO.ctypes,    # out
    )

I am asking because I want to be sure reusing N.ctypes does not disturb anything internally in dpotrs_fn.

(Edit: I guess you meant ord('L') if lower else ord('U') instead of ord('U') if lower else ord('L'). But I dont think this should matter for the results. )

Just adding onto here that I am considering changing to float32, especially when seeing the potential gain on sizes ~1e2, which would be very relevant in our own case. Adding the single-precision-runtimes to the plot, using spotrf and spotrs. Not directly comparable since it is float32, but could perhaps be interesting for others who end up here.

plot code
# Here, 'Ks' & 'datas' was two lists of different K- and data-arrays. 
# I am not sure how to best share test-data on here, but can share if you want.

import numpy as np
from numba import njit
from scipy.linalg import cho_factor, cho_solve
from time import time as Time

def options():
    return dict(
        parallel = False, # Since only 1 core per worker
        fastmath = True,
        cache = True,
        nogil = True,
        error_model = "numpy"
    )


@njit(**not_cache())
def scipyCythonLapackCholesky_f8(A, b, k):
    Ai = A.copy()
    bi = b.copy()
    L_nb = solve.dpotrf(Ai, k)
    x_nb = solve.dpotrs(L_nb, bi, k)
    return np.sum(x_nb)

@njit(**not_cache())
def scipyCythonLapackCholesky_f4(A, b, k):
    Ai = A.copy()
    bi = b.copy()
    L_nb = solve.spotrf(Ai, k)
    x_nb = solve.spotrs(L_nb, bi, k)
    return np.sum(x_nb)


@njit(**options())
def numpy_solve(K, data):
    return np.linalg.solve(K, data)


def scipysolve(K, data):
    c_and_lower = cho_factor( K, lower=True, overwrite_a=True, check_finite=False )
    return cho_solve(c_and_lower, data, check_finite = False)


@njit(**options())
def cholesky_decomp(A):
    """Perform Cholesky decomposition on symmetric, positive definite matrix."""
    n = A.shape[0]
    L = np.zeros_like(A)

    # Perform the Cholesky decomposition
    for row in range(n):
        for col in range(row+1):
            tmp_sum = 0.0
            for j in range(col):
                tmp_sum += L[row, j] * L[col, j]
            if (row == col):
                # diag elts.
                L[row, col] = math.sqrt(A[row, row] - tmp_sum)
            else:
                L[row, col] = (1.0 / L[col, col] * (A[row, col] - tmp_sum))
    return L

@njit(**options())
def cholesky_solver_nb(L, b):
    """Solve system using forward substition and then backward substition."""
    U = L.T
    n = L.shape[0]
    y = np.zeros(n)
    x = np.zeros(n)

    # Forward substitution
    for i in range(n):
        sumj = 0
        for j in range(i):
            sumj += L[i, j] * y[j]
        y[i] = (b[i] - sumj) / L[i, i]

    # Backward substitution
    for i in range(n - 1, -1, -1):
        sumj = 0
        for j in range(i + 1, n):
            sumj += U[i, j] * x[j]
        x[i] = (y[i] - sumj) / U[i, i]
    return x

@njit(**options())
def cholesky_nb(K, data):
    """Decompose and solve."""
    L = cholesky_decomp(K)
    return cholesky_solver_nb(L, data)


@njit(**not_cache())
def scipyCythonLapackCholesky_f8(A, b, k):
    #Ai = A.copy()
    #bi = b.copy()
    L_nb = numba_dpotrf(A, k)
    x_nb = numba_dpotrs(L_nb, b, k)
    return x_nb

@njit(**not_cache())
def scipyCythonLapackCholesky_f4(A, b, k):
    #Ai = A.copy()
    #bi = b.copy()
    L_nb = numba_spotrf(A, k)
    x_nb = numba_spotrs(L_nb, b, k)
    return x_nb

sizes = []
for i in range(len(datas)):
    sizes.append(datas[i].size)
sizesorted_i = np.argsort(np.array(sizes))
seconds_per_solve = np.full((len(Ks),5), np.nan)
for k, i in enumerate(sizesorted_i):
    K0 = Ks[i]
    d0 = datas[i]
    n = d0.size
    K32 = Ks[i].astype('float32')
    d32 = datas[i].astype('float32')
    t0 = Time()
    fitcoef = numpy_solve(K0, d0)
    seconds_per_solve[k,0] = Time() - t0
    t0 = Time()
    fitcoef = scipysolve(K0, d0)
    seconds_per_solve[k,1] = Time() - t0
    t0 = Time()
    fitcoef = cholesky_nb(K0, d0)
    seconds_per_solve[k,2] = Time() - t0
    t0 = Time()
    fitcoef = scipyCythonLapackCholesky_f8(K0, d0, n)
    seconds_per_solve[k,3] = Time() - t0
    t0 = Time()
    fitcoef = scipyCythonLapackCholesky_f4(K32, d32, n)
    seconds_per_solve[k,4] = Time() - t0
    del fitcoef, t0


sortedsizes = np.array(sizes)[sizesorted_i]
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1,figsize=(12,7))
ax.scatter(sortedsizes,seconds_per_solve[:,0], s=1.2, label='numpy_solve(K, data)' )
ax.scatter(sortedsizes,seconds_per_solve[:,1], s=1.2, label='scipysolve(K, data)' )
ax.scatter(sortedsizes,seconds_per_solve[:,2], s=1.2, label='cholesky_nb(K, data)')
ax.scatter(sortedsizes,seconds_per_solve[:,3], s=1.2, label='numba_dpotrs(K, data) (float64)' )
ax.scatter(sortedsizes,seconds_per_solve[:,4], s=1.2, label='numba_spotrs(K, data) (float32)' )
ax.set_yscale('log');ax.set_xscale('log');ax.grid()
ax.legend(loc='upper right', bbox_to_anchor=(1.01, 1.00), fontsize=7.75)
ax.set_ylabel('seconds per solve');ax.set_xlabel('n, where K.shape = (n,n), data.shape=(n,) ');fig.show()
fig.savefig('runtime_per_solve_loglog.png', dpi=360)