How can I improve the runtime of this linear system solve?

I am trying out different ways to solve a linear system using numba. As of now I am using `scipy.linalg.cho_factor` and `cho_solve` due to it being faster than the `njit`-ed version of `np.linalg.solve`. I have had success using numba on a bunch of other parts of my code, and would love to improve the solve as well.

(Excuse me if this has been discussed elsewhere in the discoure, I could not find a good post on it after some searches.)

How can I improve the solve-examples below, using numba?

Context

Want to `solve(K,data)` fast (approximately 2e9 times, where K.shape = (n,n), data.shape=(n,), and n vary between 1 and 400. In every run K and data have different values. I have a cluster of remote workers available, each w/ 1 core per worker, that I distribute the calculations to, and with `scipy.linalg` this takes more than 2 hours. Would love to improve the calculation.

Example
``````import numpy as np
from numba import njit
from scipy.linalg import cho_factor, cho_solve

# Usually I use the options below, and cache functions on the remote worker-side,

def options():
return dict(
parallel = False, # Since only 1 core per worker
fastmath = True,
cache = True,
nogil = True,
error_model = "numpy"
)

@njit(**options())
def numpy_solve(K, data):
return np.linalg.solve(K, data)

@njit(**options())
def solve_cholesky(K, data):
LNumba = cholesky_numba(K)
y = np.linalg.solve(LNumba, data)
return np.linalg.solve(LNumba.T.conj(), y)

@njit(**options())
def cholesky_numba(K):
n = K.shape[0]
L = np.zeros_like(K)
for i in range(n):
for j in range(i+1):
s = 0
for k in range(j):
s += L[i][k] * L[j][k]
if (i == j):
L[i][j] = (K[i][i] - s) ** 0.5
else:
L[i][j] = (1.0 / L[j][j] * (K[i][j] - s))
return L

def scipysolve(K, data):
c_and_lower = cho_factor( K, lower=True, overwrite_a=True, check_finite=False )
return cho_solve(c_and_lower, data, check_finite = False)

%timeit f0 = numpy_solve(K, data)
%timeit f1 = solve_cholesky(K, data)
%timeit f2 = scipysolve(K, data)
23.2 Âµs Â± 3.07 Âµs per loop (mean Â± std. dev. of 7 runs, 10,000 loops each)
49.6 Âµs Â± 2.98 Âµs per loop (mean Â± std. dev. of 7 runs, 10,000 loops each)
10.1 Âµs Â± 111 ns per loop (mean Â± std. dev. of 7 runs, 100,000 loops each)

``````
Arrays used, for reproducability
``````# Just copy-pasting an example in here, to make it reproducible.
data = np.array([-1.83533745e-02, -2.62924084e-02, -4.46773166e-03,  2.48877247e-02,
3.36381269e-02, -1.34949865e-02, -2.57439008e-02,  1.41652167e-02,
3.41128634e-03, -2.29770657e-02, -1.70633180e-02, -3.16427878e-02,
-1.29482122e-02,  4.46694804e-02, -1.40248402e-06, -1.10435528e-03,
1.49894427e-02,  2.48672250e-02])
K = np.array([[ 1.88679408e-03,  1.59684967e-03,  1.33447234e-03,
1.15446881e-03,  1.07275824e-03,  1.70223626e-04,
1.61994314e-04,  1.51883781e-04,  1.44565016e-04,
0.00000000e+00,  1.12002441e-05,  3.66292098e-05,
4.10665050e-05,  4.95428828e-06,  5.06540388e-05,
4.52545932e-05,  3.00509432e-05,  2.60088220e-05],
[ 1.59684967e-03,  1.87990962e-03,  1.56633850e-03,
1.34747409e-03,  1.23710933e-03,  1.91337105e-04,
1.83811650e-04,  1.77607485e-04,  1.69435316e-04,
1.11976194e-05,  0.00000000e+00,  3.14312422e-05,
3.15812479e-05, -8.26493526e-07,  5.02506560e-05,
4.57856347e-05,  3.38848683e-05,  2.92687498e-05],
[ 1.33447234e-03,  1.56633850e-03,  1.85988039e-03,
1.58418356e-03,  1.38911730e-03,  2.23152978e-04,
2.15480031e-04,  2.08326339e-04,  1.99154811e-04,
3.65952793e-05,  3.14094873e-05,  0.00000000e+00,
9.65482459e-07,  8.13753464e-06,  4.64415663e-05,
4.35303679e-05,  3.39330403e-05,  2.90651199e-05],
[ 1.15446881e-03,  1.34747409e-03,  1.58418356e-03,
1.83985842e-03,  1.51937560e-03,  2.57253488e-04,
2.49319809e-04,  2.28152631e-04,  2.19356268e-04,
4.09994186e-05,  3.15370472e-05,  9.64798959e-07,
0.00000000e+00,  4.92499208e-05,  5.45643572e-05,
5.08831811e-05,  3.11473273e-05,  2.65390476e-05],
[ 1.07275824e-03,  1.23710933e-03,  1.38911730e-03,
1.51937560e-03,  1.86880224e-03,  2.41340451e-04,
2.40780764e-04,  2.24390938e-04,  2.18382847e-04,
4.95123547e-06, -8.26177854e-07,  8.14006066e-06,
4.93001100e-05,  0.00000000e+00,  8.97243250e-05,
8.14007318e-05,  4.37606403e-05,  3.86739545e-05],
[ 1.70223626e-04,  1.91337105e-04,  2.23152978e-04,
2.57253488e-04,  2.41340451e-04,  1.53351813e-03,
1.32960595e-03,  3.00014346e-04,  3.05442260e-04,
4.99018110e-05,  4.95160225e-05,  4.57943154e-05,
5.38420166e-05,  8.84463921e-05,  0.00000000e+00,
-1.08927502e-05, -1.86346107e-04, -1.94558012e-04],
[ 1.61994314e-04,  1.83811650e-04,  2.15480031e-04,
2.49319809e-04,  2.40780764e-04,  1.32960595e-03,
1.54340857e-03,  3.19948671e-04,  3.29223469e-04,
4.46057170e-05,  4.51397226e-05,  4.29459961e-05,
5.02356654e-05,  8.02830498e-05, -1.08984108e-05,
0.00000000e+00, -1.85542909e-04, -1.97926982e-04],
[ 1.51883781e-04,  1.77607485e-04,  2.08326339e-04,
2.28152631e-04,  2.24390938e-04,  3.00014346e-04,
3.19948671e-04,  1.75840993e-03,  1.53005684e-03,
2.99102281e-05,  3.37341060e-05,  3.38054618e-05,
3.10522053e-05,  4.35825842e-05, -1.88269389e-04,
-1.87360536e-04,  0.00000000e+00,  5.74482718e-06],
[ 1.44565016e-04,  1.69435316e-04,  1.99154811e-04,
2.19356268e-04,  2.18382847e-04,  3.05442260e-04,
3.29223469e-04,  1.53005684e-03,  1.76364514e-03,
2.58923593e-05,  2.91445196e-05,  2.89617996e-05,
2.64634414e-05,  3.85245183e-05, -1.96606482e-04,
-1.99907038e-04,  5.74600888e-06,  0.00000000e+00],
[ 0.00000000e+00,  1.11976194e-05,  3.65952793e-05,
4.09994186e-05,  4.95123547e-06,  4.99018110e-05,
4.46057170e-05,  2.99102281e-05,  2.58923593e-05,
1.79289224e-03,  1.44846528e-03,  1.16324154e-03,
1.04458060e-03,  1.01210499e-03,  2.25202867e-04,
1.98037880e-04, -5.25430505e-05, -3.61875293e-05],
[ 1.12002441e-05,  0.00000000e+00,  3.14094873e-05,
3.15370472e-05, -8.26177854e-07,  4.95160225e-05,
4.51397226e-05,  3.37341060e-05,  2.91445196e-05,
1.44846528e-03,  1.78718795e-03,  1.43743698e-03,
1.27650446e-03,  1.17742592e-03,  2.69368408e-04,
2.37871575e-04, -5.39067389e-05, -3.51288038e-05],
[ 3.66292098e-05,  3.14312422e-05,  0.00000000e+00,
9.64798959e-07,  8.14006066e-06,  4.57943154e-05,
4.29459961e-05,  3.38054618e-05,  2.89617996e-05,
1.16324154e-03,  1.43743698e-03,  1.77059676e-03,
1.52637970e-03,  1.27899609e-03,  3.22252555e-04,
2.85694021e-04, -5.48020161e-05, -3.31806425e-05],
[ 4.10665050e-05,  3.15812479e-05,  9.65482459e-07,
0.00000000e+00,  4.93001100e-05,  5.38420166e-05,
5.02356654e-05,  3.10522053e-05,  2.64634414e-05,
1.04458060e-03,  1.27650446e-03,  1.52637970e-03,
1.75401853e-03,  1.39040946e-03,  3.66558052e-04,
3.25109958e-04, -7.11544451e-05, -4.70177726e-05],
[ 4.95428828e-06, -8.26493526e-07,  8.13753464e-06,
4.92499208e-05,  0.00000000e+00,  8.84463921e-05,
8.02830498e-05,  4.35825842e-05,  3.85245183e-05,
1.01210499e-03,  1.17742592e-03,  1.27899609e-03,
1.39040946e-03,  1.77798632e-03,  3.70092071e-04,
3.28740223e-04, -1.05561515e-04, -8.11476293e-05],
[ 5.06540388e-05,  5.02506560e-05,  4.64415663e-05,
5.45643572e-05,  8.97243250e-05,  0.00000000e+00,
-1.08984108e-05, -1.88269389e-04, -1.96606482e-04,
2.25202867e-04,  2.69368408e-04,  3.22252555e-04,
3.66558052e-04,  3.70092071e-04,  1.50146129e-03,
1.28998516e-03, -9.96000586e-05, -8.10580453e-05],
[ 4.52545932e-05,  4.57856347e-05,  4.35303679e-05,
5.08831811e-05,  8.14007318e-05, -1.08927502e-05,
0.00000000e+00, -1.87360536e-04, -1.99907038e-04,
1.98037880e-04,  2.37871575e-04,  2.85694021e-04,
3.25109958e-04,  3.28740223e-04,  1.28998516e-03,
1.50957561e-03, -8.80310616e-05, -7.29915038e-05],
[ 3.00509432e-05,  3.38848683e-05,  3.39330403e-05,
3.11473273e-05,  4.37606403e-05, -1.86346107e-04,
-1.85542909e-04,  0.00000000e+00,  5.74600888e-06,
-5.25430505e-05, -5.39067389e-05, -5.48020161e-05,
-7.11544451e-05, -1.05561515e-04, -9.96000586e-05,
-8.80310616e-05,  1.68665623e-03,  1.44540841e-03],
[ 2.60088220e-05,  2.92687498e-05,  2.90651199e-05,
2.65390476e-05,  3.86739545e-05, -1.94558012e-04,
-1.97926982e-04,  5.74482718e-06,  0.00000000e+00,
-3.61875293e-05, -3.51288038e-05, -3.31806425e-05,
-4.70177726e-05, -8.11476293e-05, -8.10580453e-05,
-7.29915038e-05,  1.44540841e-03,  1.69098207e-03]])

``````

Hey @ofk123 ,

have you tried to unpack your solver algorithm?
Below is a naive implementation which unfortunately becomes less efficient with increasing data size.
Numpy and Scipy Cholesky Algorithms might be slower but are probably more stable.

``````@njit(**options())
def cholesky_decomp(A):
"""Perform Cholesky decomposition on symmetric, positive definite matrix."""
n = A.shape[0]
L = np.zeros_like(A)

# Perform the Cholesky decomposition
for row in range(n):
for col in range(row+1):
tmp_sum = 0.0
for j in range(col):
tmp_sum += L[row, j] * L[col, j]
if (row == col):
# diag elts.
L[row, col] = math.sqrt(A[row, row] - tmp_sum)
else:
L[row, col] = (1.0 / L[col, col] * (A[row, col] - tmp_sum))
return L

@njit(**options())
def cholesky_solver_nb(L, b):
"""Solve system using forward substition and then backward substition."""
U = L.T
n = L.shape[0]
y = np.zeros(n)
x = np.zeros(n)

# Forward substitution
for i in range(n):
sumj = 0
for j in range(i):
sumj += L[i, j] * y[j]
y[i] = (b[i] - sumj) / L[i, i]

# Backward substitution
for i in range(n - 1, -1, -1):
sumj = 0
for j in range(i + 1, n):
sumj += U[i, j] * x[j]
x[i] = (y[i] - sumj) / U[i, i]
return x

@njit(**options())
def cholesky_nb(K, data):
"""Decompose and solve."""
L = cholesky_decomp(K)
return cholesky_solver_nb(L, data)
``````

Hi @Oyibo ,
Thanks for reaching out, and for the suggestion.
It seems unpacking the algorithm in your way results in a ~50% reduction in solve-runtime, though only when n < ~40â€¦
Attached is a plot, with a log-scale y-axis (seconds).

Would you happen to have any other ideas?

code to produce plot
``````# Here, 'Ks' & 'datas' are two lists of different K- and data-arrays.
# I am not sure how to best share my test-data with you on here... But I can share if you want!

import numpy as np
from numba import njit
from scipy.linalg import cho_factor, cho_solve

def options():
return dict(
parallel = False, # Since only 1 core per worker
fastmath = True,
cache = True,
nogil = True,
error_model = "numpy"
)

@njit(**options())
def numpy_solve(K, data):
return np.linalg.solve(K, data)

def scipysolve(K, data):
c_and_lower = cho_factor( K, lower=True, overwrite_a=True, check_finite=False )
return cho_solve(c_and_lower, data, check_finite = False)

@njit(**options())
def cholesky_decomp(A):
"""Perform Cholesky decomposition on symmetric, positive definite matrix."""
n = A.shape[0]
L = np.zeros_like(A)

# Perform the Cholesky decomposition
for row in range(n):
for col in range(row+1):
tmp_sum = 0.0
for j in range(col):
tmp_sum += L[row, j] * L[col, j]
if (row == col):
# diag elts.
L[row, col] = math.sqrt(A[row, row] - tmp_sum)
else:
L[row, col] = (1.0 / L[col, col] * (A[row, col] - tmp_sum))
return L

@njit(**options())
def cholesky_solver_nb(L, b):
"""Solve system using forward substition and then backward substition."""
U = L.T
n = L.shape[0]
y = np.zeros(n)
x = np.zeros(n)

# Forward substitution
for i in range(n):
sumj = 0
for j in range(i):
sumj += L[i, j] * y[j]
y[i] = (b[i] - sumj) / L[i, i]

# Backward substitution
for i in range(n - 1, -1, -1):
sumj = 0
for j in range(i + 1, n):
sumj += U[i, j] * x[j]
x[i] = (y[i] - sumj) / U[i, i]
return x

@njit(**options())
def cholesky_nb(K, data):
"""Decompose and solve."""
L = cholesky_decomp(K)
return cholesky_solver_nb(L, data)

sizes = []
for i in range(len(datas)):
sizes.append(datas[i].size)
sizesorted_i = np.argsort(np.array(sizes))
seconds_per_solve = np.full((len(Ks),3), np.nan)
for k, i in enumerate(sizesorted_i):
K0 = Ks[i]
d0 = datas[i]
t0 = Time()
fitcoef = numpy_solve(K0, d0)
seconds_per_solve[k,0] = Time() - t0
t0 = Time()
fitcoef = scipysolve(K0, d0)
seconds_per_solve[k,1] = Time() - t0
t0 = Time()
fitcoef = cholesky_nb(K0, d0)
seconds_per_solve[k,2] = Time() - t0
del fitcoef, t0

import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1,figsize=(12,7))
ax.scatter(np.array(sizes)[sizesorted_i],seconds_per_solve[:,0], s=3, label='numpy_solve(K, data)' )
ax.scatter(np.array(sizes)[sizesorted_i],seconds_per_solve[:,1], s=3, label='scipysolve(K, data)' )
ax.scatter(np.array(sizes)[sizesorted_i],seconds_per_solve[:,2], s=3, label='cholesky_nb(K, data)')
ax.set_yscale('log');ax.grid()
ax.legend(loc='upper right', bbox_to_anchor=(1.01, 1.00), fontsize=7.75)
ax.set_ylabel('seconds per solve');ax.set_xlabel('n, where K.shape = (n,n), data.shape=(n,) ');fig.show()
fig.savefig('runtime_per_solve.png', dpi=360)
``````

Hey @ofk123 ,

theoretically it might be possible to attempt optimizing the Cholesky decomposition using Numba.
However, SciPyâ€™s implementation already leverages LAPACKâ€™s POTRF algorithm, which is optimized and efficient.
Numba helps reducing Python overhead when applied to lower-level operations, especially for smaller data sizes where Python overhead dominates. You could try to call the Lapack function on a C-Level.
However, for larger datasets, the overhead reduction might not provide significant performance gains compared to SciPyâ€™s LAPACK implementation.
Sorry ;-(

2 Likes

Okay, I see,
so I take it `scipy.linalg` will for these matrix-sizes be better thenâ€¦ That is unfortunate, but also what I found. Thanks for looking into it @Oyibo.

Where can I see an example where a Lapack function on a C-level is called to solve a linear system?

I previously tried the Cython API on a distributed cluster, but ran into this error. So I am not sure I can make it work for solve eitherâ€¦

Hey @ofk123 ,

The Lapack functions should be:

Decomposer:
void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)

Solver:
void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)

``````

# TODO:
# Define c-types, c-function
``````

Here are discussions that might help:

Thanks, but I think that would work locally, and not remote then, correct?

1. Not sure if you saw the link above or not, but I use `get_cython_function_address` there to collect a `scipy.special.cython_special`-address, but fail to run it on remote workers.

2. I tried creating the functypes, but a bit confused about why `dpotrf` takes char, int, and double as input arguments. If I remember correct, the first type is the output-type, and the rest are function-argument-types. So I assume `decom`-types would be something like `(2D Array{Float64}, 2DArray{Float64})`., since it takes an array and returns an array. What would be the correct ctype-specification?

Attempt
``````from numba.extending import get_cython_function_address
_PTR, _dble, _char, _int  = ctypes.POINTER, ctypes.c_double, ctypes.c_char, ctypes.c_int
_ptr_dble = _PTR(_dble)
_ptr_char = _PTR(_char)
_ptr_int  = _PTR(_int)
functype_decom = ctypes.CFUNCTYPE(_ptr_char, _ptr_int, _ptr_dble, _ptr_int, _ptr_int)

decom(K)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[59], line 10
7 functype_decom = ctypes.CFUNCTYPE(_ptr_char, _ptr_int, _ptr_dble, _ptr_int, _ptr_int)
---> 10 decom(K)

TypeError: this function takes at least 4 arguments (1 given)
â€‹
``````

Hey @ofk123 ,

1. Iâ€™m not sure.

2. Implementing the Numba wrappers unfortunately turned out to be as challenging as anticipated. While I was able to replicate the functionality of `scipy.linalg.cho_factor` with `numba_dpotrf`, I encountered difficulties replicating results for sp.linalg.cho_solve with `numba_dpotrs`. It seems to produce correct solutions for right-hand sides with a single column, but for multiple columns, the results are incorrect. I suspect there might be an issue with the CFUNCTYPE definition. Maybe you can spot the problem.

EDIT:
Unfortunately, this API has no feature to pass an additional argument to give hints about the memory layout. Since the algorithm is written in Fortran, it prefers F-order. Thatâ€™s why the RHS matrix has to be in column major order.

``````"""
Example how to call LAPACK function using Numba.

https://www.netlib.org/lapack/lapacke.html#_array_arguments
https://www.netlib.org/lapack/lug/node116.html
https://www.netlib.org/lapack/lug/node121.html#secstorage

https://netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html
https://netlib.org/lapack/explore-html/d8/d2f/dpotrs_8f.html
https://github.com/Reference-LAPACK/lapack/blob/master/LAPACKE/include/lapacke.h
"""
import timeit
import ctypes
import numpy as np
import scipy as sp
from numba import njit

# Unfortunately, it is not possible to pass an argument defining the memory layout
# https://github.com/Reference-LAPACK/lapack/blob/master/LAPACKE/include/lapacke.h
# LAPACK_ROW_MAJOR = 101
# LAPACK_COL_MAJOR = 102

# Define necessary datatypes and pointers
_ptr_dbl = ctypes.POINTER(ctypes.c_double)
_ptr_int = ctypes.POINTER(ctypes.c_int)
# _ptr_char = ctypes.POINTER(ctypes.c_char)     # This does not work
_ptr_char = ctypes.POINTER(ctypes.c_int)        # Must be integer representing Unicode

# Get addresses of dpotrf and dpotrs functions
# print(sp.linalg.cython_lapack.__pyx_capi__['dpotrf'])
# print(sp.linalg.cython_lapack.__pyx_capi__['dpotrs'])
# <capsule object "void (char *, int *, __pyx_t_5scipy_6linalg_13cython_lapack_d *, int *, int *)" at 0x7fb0104a9350>
# <capsule object "void (char *, int *, int *, __pyx_t_5scipy_6linalg_13cython_lapack_d *, int *, __pyx_t_5scipy_6linalg_13cython_lapack_d *, int *, int *)" at 0x7fb0104a93b0>

# Define function types

# void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
dpotrf_functype = ctypes.CFUNCTYPE(
None,       # void
_ptr_char,  # UPLO
_ptr_int,   # N
_ptr_dbl,   # A
_ptr_int,   # LDA
_ptr_int,   # INFO
)

# void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
dpotrs_functype = ctypes.CFUNCTYPE(
None,       # void
_ptr_char,  # UPLO
_ptr_int,   # N
_ptr_int,   # NRHS
_ptr_dbl,   # A
_ptr_int,   # LDA
_ptr_dbl,   # B
_ptr_int,   # LDB
_ptr_int,   # INFO
)

# Create function objects

@njit
def numba_dpotrf(A, lower=True):
"""
DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.

void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
lapack_int LAPACKE_dpotrf( int matrix_layout, char uplo, lapack_int n, double* a,
lapack_int lda )
"""
# Data type must be integer representing Unicode!!!
UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
size = A.shape[0]
N = np.array(size, dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
INFO = np.array(0, dtype=np.int32)
dpotrf_fn(
UPLO.ctypes,
N.ctypes,
A.ctypes,       # out
LDA.ctypes,
INFO.ctypes,    # out
)
if INFO:
raise Exception(f'Oh no, something went wrong. INFO: {INFO}')
return A  # , INFO

@njit
def numba_dpotrs(A, B, lower=True):
"""
DPOTRS solves a system of linear equations A*X = B.

System is solved using a symmetric positive definite matrix A using the
Cholesky factorization.
A = U**T*U or A = L*L**T computed by DPOTRF.

void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
lapack_int LAPACKE_dpotrs( int matrix_layout, char uplo, lapack_int n,
lapack_int nrhs, const double* a, lapack_int lda,
double* b, lapack_int ldb )
"""
# Data type must be integer representing Unicode!!!
UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
INFO = np.array(0, dtype=np.int32)
size = A.shape[0]
N = np.array(size, dtype=np.int32)
NRHS = np.array(B.shape[0], dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
LDB = np.array(B.shape[1], dtype=np.int32)
dpotrs_fn(
UPLO.ctypes,
N.ctypes,
NRHS.ctypes,
A.ctypes,
LDA.ctypes,
B.ctypes,       # out
LDB.ctypes,
INFO.ctypes,    # out
)
if INFO:
raise Exception(f'Oh no, something went wrong. INFO: {INFO}')
return B

def generate_random_spd_matrix(size=4, rhs=2, seed=None):
"""Generate a random symmetric positive definite matrix."""
rng = np.random.default_rng(seed)
A = rng.random((size, size))    # Generate a random square matrix
A = 0.5 * (A + A.T)             # Make it symmetric
A = np.dot(A, A.T)              # Ensure it is positive definite
B = rng.random((size, rhs))     # Generate a random square matrix for RHS
return A, B

if __name__ == "__main__":
# Example:
# https://support.nag.com/numeric/fl/nagdoc_fl26/pdf/f07/f07fef.pdf
A = np.array([
[4.16, -3.12, 0.56, -0.10],
[-3.12, 5.03, -0.83, 1.18],
[0.56, -0.83, 0.76, 0.34],
[-0.10, 1.18, 0.34, 1.18]], dtype=np.float64)
B = np.array([
[8.70, 8.30],
[-13.35, 2.13],
[1.89, 1.61],
[-4.14, 5.00]], dtype=np.float64)
B_cmo = B.T.copy()  # B must be in column mayor format
sep = '\n\n****************************************************\n'
print('Example:', '\nA\n', A, '\nB\n', B, sep)
UPPER = False
LOWER = True
L_sp, _ = sp.linalg.cho_factor(A, LOWER)
X_sp = sp.linalg.cho_solve((L_sp, LOWER), B)
print('Scipy:', '\nL\n', L_sp, '\nX\n', X_sp, sep)

# potrf & potrs are inplace operations (in other context .copy can be removed)
L_nb = numba_dpotrf(A.copy(), LOWER)
X_nb = numba_dpotrs(L_nb, B_cmo, LOWER).T
print('Numba:', '\nL\n', L_nb, '\nX\n', X_nb, '\n')

equal_L = np.allclose(L_sp, L_nb)
equal_X = np.allclose(X_sp, X_nb)
print('Is L equal?:', equal_L)
print('Is X equal?:', equal_X)
print(sep)

# Define setup code
number = 1
SIZE = 1000
RHS = 50
setup = f'''
import numpy as np
import scipy as sp
A, B = generate_random_spd_matrix(size={SIZE}, rhs={RHS}, seed=None)
B_cmo = B.T.copy()
LOWER = True
'''

# Define Scipy version
scipy_code = '''
L_sp, _ = sp.linalg.cho_factor(A, LOWER)
X_sp = sp.linalg.cho_solve((L_sp, LOWER), B)
'''

# Define Numba version
numba_code = '''
L_nb = numba_dpotrf(A, LOWER)
X_nb = numba_dpotrs(L_nb, B_cmo, LOWER)
'''

# Time the Scipy version
print(f'Timer for random system\nA: {SIZE}x{SIZE}, B: {SIZE}x{RHS}, runs: {number}\n')
scipy_time = timeit.timeit(scipy_code, setup=setup,
number=number, globals=globals()) * 1e3
print(f'Scipy time: {scipy_time:.3f} ms')

# Time the Numba version
numba_time = timeit.timeit(numba_code, setup=setup,
number=number, globals=globals()) * 1e3
print(f'Numba time: {numba_time:.3f} ms')

# Example:
# A
#  [[ 4.16 -3.12  0.56 -0.1 ]
#  [-3.12  5.03 -0.83  1.18]
#  [ 0.56 -0.83  0.76  0.34]
#  [-0.1   1.18  0.34  1.18]]
# B
#  [[  8.7    8.3 ]
#  [-13.35   2.13]
#  [  1.89   1.61]
#  [ -4.14   5.  ]]

# ****************************************************

# Scipy:
# L
#  [[ 2.03960781 -3.12        0.56       -0.1       ]
#  [-1.52970585  1.64012195 -0.83        1.18      ]
#  [ 0.27456259 -0.24998141  0.78874881  0.34      ]
#  [-0.04902903  0.67373039  0.66165756  0.53468943]]
# X
#  [[ 1.  4.]
#  [-1.  3.]
#  [ 2.  2.]
#  [-3.  1.]]

# ****************************************************

# Numba:
# L
#  [[ 2.03960781 -3.12        0.56       -0.1       ]
#  [-1.52970585  1.64012195 -0.83        1.18      ]
#  [ 0.27456259 -0.24998141  0.78874881  0.34      ]
#  [-0.04902903  0.67373039  0.66165756  0.53468943]]
# X
#  [[ 1.  4.]
#  [-1.  3.]
#  [ 2.  2.]
#  [-3.  1.]]

# Is L equal?: True
# Is X equal?: True

# ****************************************************

# Timer for random system
# A: 1000x1000, B: 1000x50, runs: 1

# Scipy time: 15.035 ms
# Numba time: 7.636 ms

# ****************************************************

# Python: 3.12.0
# Numpy: 1.26.3
# Numba: 0.59.0

``````
1 Like

Hi @Oyibo

Pretty sure the memory layout of the matrices causes issues. Did you try column major order arrays since this is the default in Fortran?

1 Like

Hey @sschaer ,

Unfortunately, there seems to be no way to pass an additional argument to give hints about the memory layout using this API.
I have transposed the RHS matrix and adjusted the arguments in the solver function accordingly.

Thanks.

Maybe a better way to deal with it is to make the argument F-contiguous if it is not already. You can check this at compile time. If you make a copy you either warn the user that you return a copy or you write the result back to the original array.

Here is a dummy example:

``````import warnings
import numba as nb
from numba.core.errors import NumbaPerformanceWarning

def foo(x):
...

def foo_impl(x):
if x.layout == "F":
return lambda x: x

warnings.warn(
"foo is faster on f-contiguous arrays",
NumbaPerformanceWarning
)

# make F order copy and write to x since your function
# operates inplace (or warn that you will return a copy)
def impl(x):
x[:] = np.asfortranarray(x)
return x

return impl

@nb.njit
def test(x):
return foo(x)

a = np.ones((42, 42))
b = np.ones((42, 42), order="F")

print(test(a))
print(test(b))
``````

@sschaer absolutely!

I have not much experience using the LAPACK interface on a low level and the Scipy interface isnâ€™t well documented, too. Nevertheless, here is what I learned:

When utilizing the LAPACK interface, itâ€™s essential to understand at least the basic conventions.
The LAPACKE C-interface to LAPACK introduces special arguments in its functions (LAPACK_ROW_MAJOR or LAPACK_COL_MAJOR).
These arguments specify the memory layout of matrices. From my perspective this allows a safer and more precise way of interacting with LAPACK.

However, the Cython-LAPACK implementation seems to rely on the FORTRAN interface.
In this case, the FORTRAN interface assumes and prefers LAPACK_COL_MAJOR layout or F-order.
Therefore, providing F-contiguous arrays to this interface is probably a good idea and might be beneficial from a performance perspective.

For operations like Cholesky decomposition, the result remains consistent regardless of memory layout. The matrix is square and symmetric.
The performance on the other hand can vary significantly.
Empirical testing with a 1000x1000 matrix demonstrates that the F-order layout seems to be more efficient.
The choice of triangle storage (lower or upper) does not significantly impact performance.
Here is a test result:

Cholesky decomposition of a matrix 1000 x 1000, runs: 10, reps: 100
order:C, triangle storage: lower, time: 10.412 ms
order:C, triangle storage: upper, time: 10.553 ms
order:F, triangle storage: lower, time: 8.776 ms
order:F, triangle storage: upper, time: 8.705 ms

When using the Cython-LAPACK interface on C-order matrices, passing a transposed reference could make sense to avoid unnecessary copies.
A transposed C-order matrix has F-order layout.
If you know your matrix structures and sizes you should benchmark the algorithm and find out what process pipeline works best.

One potential challenge arises if the user is not aware of what happens under the hood and when providing both C- and F-order matrices to LAPACK functions.
In such cases, itâ€™s crucial to provide the correct dimensions, especially for rectangular matrices (m-by-n).

For the Cholesky solver specifically, itâ€™s advisable to determine the possible cases:

• If A is an n-by-n square matrix and B is an m-by-n rectangular matrix, n represents the system size, and m is the number of right-hand sides. => You can determine the mayor order and set the matrix dimensions passed to the LAPACK function accordingly.
• If B is a 1D vector, n still represents the system size, with 1 indicating the number of right-hand sides. => Again, you can determine the mayor order and set the matrix dimensions passed to the LAPACK function.
• If A and B are both n-by-n matrices, the direction of the solution may not be clear => warn the user and assume a layout.

In my provided example I did not make this determination, so the code will fail if you use an other matrix layout.
Compared to the more convenient Scipy functions, I would assume that the performance gains using the low level LAPACK functions should decrease with increasing system dimensions.
When interacting with LAPACK functions (without LAPACKE C-interface) I would strictly use FORTRAN memory layouts. Just to be on the safe side.

1 Like

Much appreciate both your replies, @sschaer, @Oyibo I am eager to test this out more in detail. Though it will only make sense for me when I have learned how to cache it on a remote worker.

Might it be possible to call the C function directly bypassing the cython function?

Hey @nelson2005 , itâ€™s quite likely that there are additional methods, such as a direct C-call to the underlying LAPACK functions, which could further improve performance. The crucial point is whether the performance gains justify the additional effort.
The Numba-Scipy wrapper shows awesome speed. Even for small linear systems, this function outperforms my naive Numba solution. However, at a matrix size of around 1000x1000, the overhead of the Scipy function becomes negligible. Both the Scipy function and the direct Numba-LAPACK approach show similar performance beyond this point, while the naive version deteriorates more and more.

1 Like
1. Below an updated figure including `numba_dpotrf` & `numba_dpotrs` runtimes on my test-data,(called `cythonlapackCholesky` in figure). The test was done locally in an ipython-session. However, when distributing the calculations to remote workers, I unfortunately do not get the improvement seen locally, so I think I would benefit from caching it on the workerside. What would be a good next step for me to try out, in order to get closer to being able to caching `numba_dpotrf` and `numba_dpotrs`?

1. I see that all my K-arrays are initially C_CONTIGUOUS. I do not really see a difference in performance when `K` being F-order, as compared with the C-order it is already in. Are you sure we are supposed to see a difference? `K` is always symmetric, so maybe this means the order does not matter in my case?
F-order vs. C-order argument
``````def not_cache():
return dict(
parallel = False,
fastmath = True,
cache = False,
nogil = True,
error_model = "numpy"
)

_ptr_dbl = ctypes.POINTER(ctypes.c_double)
_ptr_int = ctypes.POINTER(ctypes.c_int)
_ptr_char = ctypes.POINTER(ctypes.c_int)        # Must be integer representing Unicode

# Get addresses of dpotrf and dpotrs functions

# Define function types

# void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
dpotrf_functype = ctypes.CFUNCTYPE(
None,       # void
_ptr_char,  # UPLO
_ptr_int,   # N
_ptr_dbl,   # A
_ptr_int,   # LDA
_ptr_int,   # INFO
)

# void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
dpotrs_functype = ctypes.CFUNCTYPE(
None,       # void
_ptr_char,  # UPLO
_ptr_int,   # N
_ptr_int,   # NRHS
_ptr_dbl,   # A
_ptr_int,   # LDA
_ptr_dbl,   # B
_ptr_int,   # LDB
_ptr_int,   # INFO
)

# Create function objects

@njit(**not_cache())
def numba_dpotrf(A, lower=True):
"""
DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A.

void dpotrf(char *uplo, int *n, d *a, int *lda, int *info)
lapack_int LAPACKE_dpotrf( int matrix_layout, char uplo, lapack_int n, double* a,
lapack_int lda )
"""
# Data type must be integer representing Unicode!!!
UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
size = A.shape[0]
N = np.array(size, dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
INFO = np.array(0, dtype=np.int32)
dpotrf_fn(
UPLO.ctypes,
N.ctypes,
A.ctypes,       # out
LDA.ctypes,
INFO.ctypes,    # out
)
#if INFO > 0:
#    raise Exception('INFO > 0. The leading principal minor of order i is not positive, and the factorization could not be completed.')
#if INFO < 0:
#    raise Exception('INFO < 0. The i-th argument had an illegal value')

return A #, INFO

@njit(**not_cache())
def numba_dpotrs(A, B, lower=True):
"""
DPOTRS solves a system of linear equations A*X = B.

System is solved using a symmetric positive definite matrix A using the
Cholesky factorization.
A = U**T*U or A = L*L**T computed by DPOTRF.

void dpotrs(char *uplo, int *n, int *nrhs, d *a, int *lda, d *b, int *ldb, int *info)
lapack_int LAPACKE_dpotrs( int matrix_layout, char uplo, lapack_int n,
lapack_int nrhs, const double* a, lapack_int lda,
double* b, lapack_int ldb )
"""
# Data type must be integer representing Unicode!!!
UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
INFO = np.array(0, dtype=np.int32)
size = A.shape[0]
N = np.array(size, dtype=np.int32)
NRHS = np.array(B.shape[0], dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
LDB = np.array(B.shape[1], dtype=np.int32)
dpotrs_fn(
UPLO.ctypes,
N.ctypes,
NRHS.ctypes,
A.ctypes,
LDA.ctypes,
B.ctypes,       # out
LDB.ctypes,
INFO.ctypes,    # out
)
#if INFO > 0:
#    raise Exception('INFO > 0. The leading principal minor of order i is not positive, and the factorization could not be completed.')
#if INFO < 0:
#    raise Exception('INFO < 0. The i-th argument had an illegal value')

return B#, INFO

def scipyCythonLapackCholesky_nb(K, data):
L_nb = numba_dpotrf(K, True)
return numba_dpotrs(L_nb, data, True)

# K used below has K.shape(n,n,) data.shape=(n,), where n = 1420, just to test a large array.
# I also tested with n=100. No notable runtime-difference in either case.
# I do not know a good way to share my real test-data, but can share it if you want.
Kcopy = K.copy()
n = K.shape[0]
KcopyF = np.asfortranarray(Kcopy)
datacopy = data.copy().reshape(1,n)
datacopy2 = datacopy.copy()
print(Kcopy.flags)
print(KcopyF.flags)
%timeit XnbC = scipyCythonLapackCholesky_nb(Kcopy, datacopy)
%timeit XnbF = scipyCythonLapackCholesky_nb(KcopyF, datacopy2)

C_CONTIGUOUS : True
F_CONTIGUOUS : False
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False

C_CONTIGUOUS : False
F_CONTIGUOUS : True
OWNDATA : True
WRITEABLE : True
ALIGNED : True
WRITEBACKIFCOPY : False

3.34 ms Â± 30.6 Âµs per loop (mean Â± std. dev. of 7 runs, 100 loops each)
3.34 ms Â± 32.4 Âµs per loop (mean Â± std. dev. of 7 runs, 100 loops each)
``````
1. My `data`-array will always be 1D, so during runtime I add an extra axis (`data.reshape(1,n)`) before each `cythonlapackCholesky`-run. @Oyibo Is there any performance-related changes you would do to the function ( f.ex. when defining the data-types), now that `data` always is 1D, or would you leave the function as it is?

Should probably close this post as the â€śmain questionâ€ť has been answered, and continue on â€ścachingâ€ť in my other post. But I wanted to ask these questions first!

1 Like

Thank you for crosschecking @ofk123 .

1. In the given example we have used mutable global variables to pass the c-functions to the jitted functions. You have to change this approach to be able to cache. Can you provide a minimum working example of what you want to accomplish using Scipy and the version that fails (in your other thread)? This will increase the likelihood to get help from other users.

2. If you canâ€™t measure a difference in performance between using F-order and C-order, then itâ€™s likely that the memory layout doesnâ€™t significantly affect performance in your case. Since the matrices are square and symmetric, the computation will produce the same results regardless of the memory layout.

3. Since your data-array is always 1D and you donâ€™t have multiple RHS, you can simplify the process by providing a vector instead of a matrix to the LAPACK function `dpotrs`. You will receive a vector as a result, too. This change should be sufficient for your specific case.

``````@njit
def numba_dpotrs(A, B, lower=True):
"""DPOTRS solves a system of linear equations."""
UPLO = np.array(ord('U') if lower else ord('L'), np.int32)
INFO = np.array(0, dtype=np.int32)
size = A.shape[0]
N = np.array(size, dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
if B.ndim == 1:
NRHS = np.array(1, dtype=np.int32)
LDB = np.array(B.size, dtype=np.int32)
else:
NRHS = np.array(B.shape[0], dtype=np.int32)
LDB = np.array(B.shape[1], dtype=np.int32)
dpotrs_fn(
UPLO.ctypes,
N.ctypes,
NRHS.ctypes,
A.ctypes,
LDA.ctypes,
B.ctypes,       # out
LDB.ctypes,
INFO.ctypes,    # out
)
if INFO:
raise Exception(f'Oh no, something went wrong. INFO: {INFO}')
return B
``````
1 Like
1. Yep, will provide an MRE in that post.
2. I am a bit confused why on your end you get different runtime-results using â€śFARRAYâ€ť and â€śCARRAYâ€ť, and not on my end. When time I will dig into that further.
3. Great, thanks alot!
In my case, N, LDA, (and LDB in dpotrs) end up equal.
``````N = np.array(size, dtype=np.int32)
LDA = np.array(size, dtype=np.int32)
LDB = np.array(size, dtype=np.int32)
``````

Is there anything in the way for me to disregards those definitions, and reuse the `N.ctypes` argument, like:

``````dpotrs_fn(
UPLO.ctypes,
N.ctypes,
NRHS.ctypes,
A.ctypes,
N.ctypes,
B.ctypes,       # out
N.ctypes,
INFO.ctypes,    # out
)
``````

I am asking because I want to be sure reusing `N.ctypes` does not disturb anything internally in `dpotrs_fn`.

(Edit: I guess you meant `ord('L') if lower else ord('U')` instead of `ord('U') if lower else ord('L')`. But I dont think this should matter for the results. )

Just adding onto here that I am considering changing to float32, especially when seeing the potential gain on sizes ~1e2, which would be very relevant in our own case. Adding the single-precision-runtimes to the plot, using `spotrf` and `spotrs`. Not directly comparable since it is float32, but could perhaps be interesting for others who end up here.

plot code
``````# Here, 'Ks' & 'datas' was two lists of different K- and data-arrays.
# I am not sure how to best share test-data on here, but can share if you want.

import numpy as np
from numba import njit
from scipy.linalg import cho_factor, cho_solve
from time import time as Time

def options():
return dict(
parallel = False, # Since only 1 core per worker
fastmath = True,
cache = True,
nogil = True,
error_model = "numpy"
)

@njit(**not_cache())
def scipyCythonLapackCholesky_f8(A, b, k):
Ai = A.copy()
bi = b.copy()
L_nb = solve.dpotrf(Ai, k)
x_nb = solve.dpotrs(L_nb, bi, k)
return np.sum(x_nb)

@njit(**not_cache())
def scipyCythonLapackCholesky_f4(A, b, k):
Ai = A.copy()
bi = b.copy()
L_nb = solve.spotrf(Ai, k)
x_nb = solve.spotrs(L_nb, bi, k)
return np.sum(x_nb)

@njit(**options())
def numpy_solve(K, data):
return np.linalg.solve(K, data)

def scipysolve(K, data):
c_and_lower = cho_factor( K, lower=True, overwrite_a=True, check_finite=False )
return cho_solve(c_and_lower, data, check_finite = False)

@njit(**options())
def cholesky_decomp(A):
"""Perform Cholesky decomposition on symmetric, positive definite matrix."""
n = A.shape[0]
L = np.zeros_like(A)

# Perform the Cholesky decomposition
for row in range(n):
for col in range(row+1):
tmp_sum = 0.0
for j in range(col):
tmp_sum += L[row, j] * L[col, j]
if (row == col):
# diag elts.
L[row, col] = math.sqrt(A[row, row] - tmp_sum)
else:
L[row, col] = (1.0 / L[col, col] * (A[row, col] - tmp_sum))
return L

@njit(**options())
def cholesky_solver_nb(L, b):
"""Solve system using forward substition and then backward substition."""
U = L.T
n = L.shape[0]
y = np.zeros(n)
x = np.zeros(n)

# Forward substitution
for i in range(n):
sumj = 0
for j in range(i):
sumj += L[i, j] * y[j]
y[i] = (b[i] - sumj) / L[i, i]

# Backward substitution
for i in range(n - 1, -1, -1):
sumj = 0
for j in range(i + 1, n):
sumj += U[i, j] * x[j]
x[i] = (y[i] - sumj) / U[i, i]
return x

@njit(**options())
def cholesky_nb(K, data):
"""Decompose and solve."""
L = cholesky_decomp(K)
return cholesky_solver_nb(L, data)

@njit(**not_cache())
def scipyCythonLapackCholesky_f8(A, b, k):
#Ai = A.copy()
#bi = b.copy()
L_nb = numba_dpotrf(A, k)
x_nb = numba_dpotrs(L_nb, b, k)
return x_nb

@njit(**not_cache())
def scipyCythonLapackCholesky_f4(A, b, k):
#Ai = A.copy()
#bi = b.copy()
L_nb = numba_spotrf(A, k)
x_nb = numba_spotrs(L_nb, b, k)
return x_nb

sizes = []
for i in range(len(datas)):
sizes.append(datas[i].size)
sizesorted_i = np.argsort(np.array(sizes))
seconds_per_solve = np.full((len(Ks),5), np.nan)
for k, i in enumerate(sizesorted_i):
K0 = Ks[i]
d0 = datas[i]
n = d0.size
K32 = Ks[i].astype('float32')
d32 = datas[i].astype('float32')
t0 = Time()
fitcoef = numpy_solve(K0, d0)
seconds_per_solve[k,0] = Time() - t0
t0 = Time()
fitcoef = scipysolve(K0, d0)
seconds_per_solve[k,1] = Time() - t0
t0 = Time()
fitcoef = cholesky_nb(K0, d0)
seconds_per_solve[k,2] = Time() - t0
t0 = Time()
fitcoef = scipyCythonLapackCholesky_f8(K0, d0, n)
seconds_per_solve[k,3] = Time() - t0
t0 = Time()
fitcoef = scipyCythonLapackCholesky_f4(K32, d32, n)
seconds_per_solve[k,4] = Time() - t0
del fitcoef, t0

sortedsizes = np.array(sizes)[sizesorted_i]
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1,1,figsize=(12,7))
ax.scatter(sortedsizes,seconds_per_solve[:,0], s=1.2, label='numpy_solve(K, data)' )
ax.scatter(sortedsizes,seconds_per_solve[:,1], s=1.2, label='scipysolve(K, data)' )
ax.scatter(sortedsizes,seconds_per_solve[:,2], s=1.2, label='cholesky_nb(K, data)')
ax.scatter(sortedsizes,seconds_per_solve[:,3], s=1.2, label='numba_dpotrs(K, data) (float64)' )
ax.scatter(sortedsizes,seconds_per_solve[:,4], s=1.2, label='numba_spotrs(K, data) (float32)' )
ax.set_yscale('log');ax.set_xscale('log');ax.grid()
ax.legend(loc='upper right', bbox_to_anchor=(1.01, 1.00), fontsize=7.75)
ax.set_ylabel('seconds per solve');ax.set_xlabel('n, where K.shape = (n,n), data.shape=(n,) ');fig.show()
fig.savefig('runtime_per_solve_loglog.png', dpi=360)
``````