I’m trying to replace a few C extensions with numba, but I’m having some issues with performance.
The problem is simple: rotate an xyz point by a series of angles, using rotation matrices.
I have an old C extension that does the job like this
void rotation(double *xyz, double lon, double lat, double inc, double phase, double *xiyizi)
{
double g2 = --phase * (2 * M_PI);
double i = inc * M_PI / 180.;
double b = lat * M_PI / 180.;
double g = -lon * M_PI / 180.;
double b2 = -(M_PI / 2. - i);
double R[3][3] = {{(1 - cos(g2)) * cos(i) * cos(i) + cos(g2), sin(g2) * sin(i), (1 - cos(g2)) * cos(i) * sin(i)},
{-sin(g2) * sin(i), cos(g2), sin(g2) * cos(i)},
{(1 - cos(g2)) * sin(i) * cos(i), -sin(g2) * cos(i), (1 - cos(g2)) * sin(i) * sin(i) + cos(g2)}};
double R2[3][3] = {{cos(b) * cos(g) * cos(b2) - sin(b2) * sin(b), -sin(g) * cos(b), cos(b) * cos(g) * sin(b2) + sin(b) * cos(b2)},
{sin(g) * cos(b2), cos(g), sin(g) * sin(b2)},
{-sin(b) * cos(g) * cos(b2) - cos(b) * sin(b2), sin(b) * sin(g), -sin(b) * cos(g) * sin(b2) + cos(b) * cos(b2)}};
double R3[3][3] = {{R2[0][0] * R[0][0] + R2[0][1] * R[1][0] + R2[0][2] * R[2][0],
R2[0][0] * R[0][1] + R2[0][1] * R[1][1] + R2[0][2] * R[2][1],
R2[0][0] * R[0][2] + R2[0][1] * R[1][2] + R2[0][2] * R[2][2]},
{R2[1][0] * R[0][0] + R2[1][1] * R[1][0] + R2[1][2] * R[2][0],
R2[1][0] * R[0][1] + R2[1][1] * R[1][1] + R2[1][2] * R[2][1],
R2[1][0] * R[0][2] + R2[1][1] * R[1][2] + R2[1][2] * R[2][2]},
{R2[2][0] * R[0][0] + R2[2][1] * R[1][0] + R2[2][2] * R[2][0],
R2[2][0] * R[0][1] + R2[2][1] * R[1][1] + R2[2][2] * R[2][1],
R2[2][0] * R[0][2] + R2[2][1] * R[1][2] + R2[2][2] * R[2][2]}};
xiyizi[0] = R3[0][0] * xyz[0] + R3[0][1] * xyz[1] + R3[0][2] * xyz[2];
xiyizi[1] = R3[1][0] * xyz[0] + R3[1][1] * xyz[1] + R3[1][2] * xyz[2];
xiyizi[2] = R3[2][0] * xyz[0] + R3[2][1] * xyz[1] + R3[2][2] * xyz[2];
}
which I tried to replace with the following numba function, returning a numpy array
from numpy import pi, sin, cos, zeros, ndarray
@numba.njit()
def rotation_numba(xyz: ndarray, lon: float, lat: float, inc: float, phase: float):
g2 = (-1.0 + phase) * (2.0 * pi)
i = inc * pi / 180.
b = lat * pi / 180.
g = -lon * pi / 180.
b2 = -(pi / 2.0 - i)
R = [
[(1 - cos(g2)) * cos(i) * cos(i) + cos(g2), sin(g2) * sin(i), (1 - cos(g2)) * cos(i) * sin(i)],
[-sin(g2) * sin(i), cos(g2), sin(g2) * cos(i)],
[(1 - cos(g2)) * sin(i) * cos(i), -sin(g2) * cos(i), (1 - cos(g2)) * sin(i) * sin(i) + cos(g2)]
]
R2 = [
[cos(b) * cos(g) * cos(b2) - sin(b2) * sin(b), -sin(g) * cos(b), cos(b) * cos(g) * sin(b2) + sin(b) * cos(b2)],
[sin(g) * cos(b2), cos(g), sin(g) * sin(b2)],
[-sin(b) * cos(g) * cos(b2) - cos(b) * sin(b2), sin(b) * sin(g), -sin(b) * cos(g) * sin(b2) + cos(b) * cos(b2)]
]
R3 = [
[R2[0][0] * R[0][0] + R2[0][1] * R[1][0] + R2[0][2] * R[2][0],
R2[0][0] * R[0][1] + R2[0][1] * R[1][1] + R2[0][2] * R[2][1],
R2[0][0] * R[0][2] + R2[0][1] * R[1][2] + R2[0][2] * R[2][2]
],
[R2[1][0] * R[0][0] + R2[1][1] * R[1][0] + R2[1][2] * R[2][0],
R2[1][0] * R[0][1] + R2[1][1] * R[1][1] + R2[1][2] * R[2][1],
R2[1][0] * R[0][2] + R2[1][1] * R[1][2] + R2[1][2] * R[2][2],
],
[R2[2][0] * R[0][0] + R2[2][1] * R[1][0] + R2[2][2] * R[2][0],
R2[2][0] * R[0][1] + R2[2][1] * R[1][1] + R2[2][2] * R[2][1],
R2[2][0] * R[0][2] + R2[2][1] * R[1][2] + R2[2][2] * R[2][2]
]
]
xyz_out = zeros(3)
xyz_out[0] = R3[0][0] * xyz[0] + R3[0][1] * xyz[1] + R3[0][2] * xyz[2]
xyz_out[1] = R3[1][0] * xyz[0] + R3[1][1] * xyz[1] + R3[1][2] * xyz[2]
xyz_out[2] = R3[2][0] * xyz[0] + R3[2][1] * xyz[1] + R3[2][2] * xyz[2]
return xyz_out
However, I’m not able to achieve the same performance with the two functions:
xyz = np.array([-0.8, 0.1, 0.5])
%timeit rotation_c(xyz, 180.0, 30.0, 90.0, 0.1)
# 931 ns ± 5.49 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%timeit rotation_numba(xyz, 180.0, 30.0, 90.0, 0.1)
# 2.58 µs ± 45.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Is there something I’m doing wrong, or a different implementation in numba that would give me similar performance to the C extension?