I’ve been having some fun benchmarking some Numba, Julia, C++, and Fortran implementations of a simple clipping algorithm – and I’ve very happy to report that I can get very close to ifort (and beating gfortran). Additionally, the numba.prange makes parallellisation trivial to do:
However, somewhat to my surprise, by manually setting inline="always" for all functions I got a factor ~3 speedup. Code below, and full code for the above figures here: https://github.com/Huite/clipping-benchmarks
Anyway, some questions:
- In general, how much faith should I put in Numba’s (or is it LLVM’s) inlining heuristics? Or is it good practice to generally test this? (I reckon ifort and msvc do the inlining in this case as well.)
- Are their any downsides in this case to setting everything to
inline="always"? - I also copied the way of getting a stack allocated array: https://github.com/numba/numba/issues/5084 In almost all cases, my polygons will have only a few sides so I’m not too worried about running out of memory. But are there heuristics at which size it’s better to switch to ordinary arrays? (For the case of Julia’s StaticArrays, the size used to be roughly a hundred elements, but I don’t believe that’s true anymore).
- Any other ways this can come back to bite me in the behind? I’m using a closure to define the allocation function based on size, so it’s easy to check for size. But is there anything else I should be aware of?
(I understand if most answers are somewhere along the line of “it depends”
)
from typing import NamedTuple, Sequence, Tuple
import numba as nb
import numpy as np
from numba import types
from numba.extending import intrinsic
from numba.core import cgutils
@intrinsic
def stack_empty(typingctx, size, dtype):
def impl(context, builder, signature, args):
ty = context.get_value_type(dtype.dtype)
ptr = cgutils.alloca_once(builder, ty, size=args[0])
return ptr
sig = types.CPointer(dtype.dtype)(types.int64,dtype)
return sig, impl
FLOAT_TYPE = np.float64
class Point(NamedTuple):
x: float
y: float
class Vector(NamedTuple):
x: float
y: float
@nb.njit(inline="always")
def cross_product(u: Vector, v: Vector) -> float:
return u.x * v.y - u.y * v.x
@nb.njit(inline="always")
def dot_product(u: Vector, v: Vector) -> float:
return u.x * v.x + u.y * v.y
@nb.njit(inline="always")
def _push(array: np.ndarray, n: int, value: Vector) -> int:
array[n] = value
return n + 1
@nb.njit(inline="always")
def _copy(src, dst, n) -> None:
for i in range(n):
dst[i] = src[i]
@nb.njit(inline="always")
def _inside(p: Point, r: Point, U: Vector):
# U: a -> b direction vector
# p is point r or s
return U.x * (p.y - r.y) > U.y * (p.x - r.x)
@nb.njit(inline="always")
def _intersection(a: Point, V: Vector, r: Point, N: Vector) -> Tuple[bool, Point]:
W = Vector(r.x - a.x, r.y - a.y)
nw = dot_product(N, W)
nv = dot_product(N, V)
if nv != 0:
t = nw / nv
return True, Point(a.x + t * V.x, a.y + t * V.y)
else:
return False, Point(0.0, 0.0)
@nb.njit(inline="always")
def _polygon_area(polygon: Sequence, length: Sequence) -> float:
area = 0.0
a = Point(polygon[0][0], polygon[0][1])
b = Point(polygon[1][0], polygon[1][1])
U = Vector(b.x - a.x, b.y - a.y)
for i in range(2, length):
c = Point(polygon[i][0], polygon[i][1])
V = Vector(a.x - c.x, a.y - c.y)
area += abs(cross_product(U, V))
b = c
U = V
return 0.5 * area
def make_allocate(nvertex, ndim):
size = nvertex * ndim
@nb.njit(inline="always")
def allocate_empty():
arr_ptr = stack_empty(size, np.float64)
arr = nb.carray(arr_ptr, (nvertex, ndim))
return arr
return allocate_empty
@nb.njit(inline="always")
def clip_polygons(polygon: Sequence, clipper: Sequence) -> float:
n_output = len(polygon)
n_clip = len(clipper)
n_max = n_output + n_clip
# Create a view on the allocated memory and do something
subject = allocate_empty()
output = allocate_empty()
# Copy polygon into output
_copy(polygon, output, n_output)
# Grab last point
r = Point(clipper[n_clip - 1][0], clipper[n_clip - 1][1])
for i in range(n_clip):
s = Point(clipper[i][0], clipper[i][1])
U = Vector(s.x - r.x, s.y - r.y)
N = Vector(-U.y, U.x)
if U.x == 0 and U.y == 0:
continue
# Copy output into subject
length = n_output
_copy(output, subject, length)
# Reset
n_output = 0
# Grab last point
a = Point(subject[length - 1][0], subject[length - 1][1])
a_inside = _inside(a, r, U)
for j in range(length):
b = Point(subject[j][0], subject[j][1])
V = Vector(b.x - a.x, b.y - a.y)
if V.x == 0 and V.y == 0:
continue
b_inside = _inside(b, r, U)
if b_inside:
if not a_inside: # out, or on the edge
succes, point = _intersection(a, V, r, N)
if succes:
n_output = _push(output, n_output, point)
n_output = _push(output, n_output, b)
elif a_inside:
succes, point = _intersection(a, V, r, N)
if succes:
n_output = _push(output, n_output, point)
else: # Floating point failure
b_inside = True # flip it for consistency, will be set as a
n_output = _push(output, n_output, b) # push b instead
# Advance to next polygon edge
a = b
a_inside = b_inside
# Exit early in case not enough vertices are left.
if n_output < 3:
return 0.0
# Advance to next clipping edge
r = s
area = _polygon_area(output, n_output)
return area
def _area_of_intersection(a, b):
n = len(a)
out = np.zeros(n)
for i in nb.prange(n):
t0 = a[i]
t1 = b[i]
out[i] = clip_polygons(t0, t1)
return out
Generate a bunch of counter-clockwise triangles
@nb.njit
def ccw(a):
for i in range(len(a)):
t = a[i]
normal = (t[1][0] - t[0][0])*(t[2][1]-t[0][1])-(t[1][1]-t[0][1])*(t[2][0]-t[0][0])
if normal < 0:
a[i] = t[::-1]
a = np.random.rand(10_000_000, 3, 2)
b = np.random.rand(10_000_000, 3, 2)
ccw(a)
ccw(b)
allocate_empty = make_allocate(nvertex=2 * a.shape[1], ndim=a.shape[2])
area_of_intersection = nb.njit(_area_of_intersection, parallel=True)
Profiling it:
%timeit area_of_intersection(a, b)

