Dear experts,
I am coding my first program in numba for Cuda and I am having some trouble to paralelize a function. I am solving the N-body problem, defining each body as an object using:
spec = [
(‘dim’, int32), # a simple scalar field
(‘x’, float32[:]), # an array field
(‘y’, float32[:]),
(‘z’, float32[:]),
(‘vx’, float32[:]),
(‘vy’, float32[:]),
(‘vz’, float32[:]),
(‘force’, int32),
]
@jitclass(spec)
class Body(object):
def init(self,n,x,y,z,vx,vy,vz):
#coordinates and velocity of the body
#n is the number of elements to keep (info)
#FUTURE PROBLEM: when we deal with 10^3 objects, it’s going to be impossible to keep their trajectories, how can we do it?
self.dim = n
self.x = np.zeros(n,dtype=np.float32)
self.x[0] = x
self.y = np.zeros(n,dtype=np.float32)
self.y[0] = y
self.z =np.zeros(n,dtype=np.float32)
self.z[0] = z
self.vx = np.zeros(n,dtype=np.float32)
self.vx[0] = vx
self.vy = np.zeros(n,dtype=np.float32)
self.vy[0] = vy
self.vz = np.zeros(n,dtype=np.float32)
self.vz[0] = vz
#maybe this is useful, total force over the body, but only in one time step, i don’t think we need
#to keep these values for all integration time
self.force = np.zeros(3,dtype=np.float32)
This seems to work when defining bodies as:
b1 = Body(1000,1,0,10,1,1,1)
b2 = Body(1000,0,3,5,1,1,1)
bodies = [b1,b2]
Then, I want to compute the force of all bodies over one of them with the function:
@cuda.jit
def bodyForce(p,n):
global SOFTENING
#p is the array or struct of bodies in the problem
#n is the number of bodies
#dt timestep
#Thread ID
idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
#Compute force for each body i
for i in range(n):
Fx = 0.0
Fy = 0.0
Fz = 0.0
#Summing over all elements. Note here I don’t need to discard the
#i=j because of the use of the SOFTERNING quantity. Check next reference
if(idx<n):
dx = p[idx].x - p[i].x
dy = p[idx].y - p[i].y
dz = p[idx].z - p[i].z
#Compute the square of distance. Softerning is a parameter that is inclded to avoid
#infinite forces when objects are very close (collisions)
distSqr = dxdx + dydy + dz*dz + SOFTENING
#sqrt of this guy
invDist = np.sqrt(distSqr)
#and now this to the power of 3
invDist3 = invDist * invDist * invDist
#Forces in the 3 cartesian directions
Fx += dx * invDist3
Fy += dy * invDist3
Fz += dz * invDist3
p[i].totalF = [Fx,Fy,Fz]
Calling it as:
SOFTENING = 0.1
bodyForce[1,2] (bodies,2)
I get the attached error message, which I do not manage to solve. Apologies for my few knowledge in the field, I hope that someone could give me a little hint!
Error:
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py:488: NumbaPerformanceWarning: Grid size 1 will likely result in GPU under-utilization due to low occupancy.
warn(NumbaPerformanceWarning(msg))
---------------------------------------------------------------------------
TypingError Traceback (most recent call last)
<ipython-input-5-9988521531ab> in <module>
1 SOFTENING = 0.1
2
----> 3 bodyForce[1,2](bodies,2)
20 frames
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in __call__(self, *args)
489
490 def __call__(self, *args):
--> 491 return self.dispatcher.call(args, self.griddim, self.blockdim,
492 self.stream, self.sharedmem)
493
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in call(self, args, griddim, blockdim, stream, sharedmem)
623 kernel = next(iter(self.overloads.values()))
624 else:
--> 625 kernel = _dispatcher.Dispatcher._cuda_call(self, *args)
626
627 kernel.launch(args, griddim, blockdim, stream, sharedmem)
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in _compile_for_args(self, *args, **kws)
631 assert not kws
632 argtypes = [self.typeof_pyval(a) for a in args]
--> 633 return self.compile(tuple(argtypes))
634
635 def typeof_pyval(self, val):
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in compile(self, sig)
792 raise RuntimeError("Compilation disabled")
793
--> 794 kernel = _Kernel(self.py_func, argtypes, **self.targetoptions)
795 # We call bind to force codegen, so that there is a cubin to cache
796 kernel.bind()
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
33 def _acquire_compile_lock(*args, **kwargs):
34 with self:
---> 35 return func(*args, **kwargs)
36 return _acquire_compile_lock
37
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in __init__(self, py_func, argtypes, link, debug, lineinfo, inline, fastmath, extensions, max_registers, opt, device)
73 }
74
---> 75 cres = compile_cuda(self.py_func, types.void, self.argtypes,
76 debug=self.debug,
77 lineinfo=self.lineinfo,
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
33 def _acquire_compile_lock(*args, **kwargs):
34 with self:
---> 35 return func(*args, **kwargs)
36 return _acquire_compile_lock
37
/usr/local/lib/python3.8/dist-packages/numba/cuda/compiler.py in compile_cuda(pyfunc, return_type, args, debug, lineinfo, inline, fastmath, nvvm_options)
210 from numba.core.target_extension import target_override
211 with target_override('cuda'):
--> 212 cres = compiler.compile_extra(typingctx=typingctx,
213 targetctx=targetctx,
214 func=pyfunc,
/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
714 pipeline = pipeline_class(typingctx, targetctx, library,
715 args, return_type, flags, locals)
--> 716 return pipeline.compile_extra(func)
717
718
/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in compile_extra(self, func)
450 self.state.lifted = ()
451 self.state.lifted_from = None
--> 452 return self._compile_bytecode()
453
454 def compile_ir(self, func_ir, lifted=(), lifted_from=None):
/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in _compile_bytecode(self)
518 """
519 assert self.state.func_ir is None
--> 520 return self._compile_core()
521
522 def _compile_ir(self):
/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in _compile_core(self)
497 self.state.status.fail_reason = e
498 if is_final_pipeline:
--> 499 raise e
500 else:
501 raise CompilerError("All available pipelines exhausted")
/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in _compile_core(self)
484 res = None
485 try:
--> 486 pm.run(self.state)
487 if self.state.cr is not None:
488 break
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in run(self, state)
366 (self.pipeline_name, pass_desc)
367 patched_exception = self._patch_error(msg, e)
--> 368 raise patched_exception
369
370 def dependency_analysis(self):
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in run(self, state)
354 pass_inst = _pass_registry.get(pss).pass_inst
355 if isinstance(pass_inst, CompilerPass):
--> 356 self._runPass(idx, pass_inst, state)
357 else:
358 raise BaseException("Legacy pass in use")
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
33 def _acquire_compile_lock(*args, **kwargs):
34 with self:
---> 35 return func(*args, **kwargs)
36 return _acquire_compile_lock
37
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in _runPass(self, index, pss, internal_state)
309 mutated |= check(pss.run_initialization, internal_state)
310 with SimpleTimer() as pass_time:
--> 311 mutated |= check(pss.run_pass, internal_state)
312 with SimpleTimer() as finalize_time:
313 mutated |= check(pss.run_finalizer, internal_state)
/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in check(func, compiler_state)
271
272 def check(func, compiler_state):
--> 273 mangled = func(compiler_state)
274 if mangled not in (True, False):
275 msg = ("CompilerPass implementations should return True/False. "
/usr/local/lib/python3.8/dist-packages/numba/core/typed_passes.py in run_pass(self, state)
103 % (state.func_id.func_name,)):
104 # Type inference
--> 105 typemap, return_type, calltypes, errs = type_inference_stage(
106 state.typingctx,
107 state.targetctx,
/usr/local/lib/python3.8/dist-packages/numba/core/typed_passes.py in type_inference_stage(typingctx, targetctx, interp, args, return_type, locals, raise_errors)
81 infer.build_constraint()
82 # return errors in case of partial typing
---> 83 errs = infer.propagate(raise_errors=raise_errors)
84 typemap, restype, calltypes = infer.unify(raise_errors=raise_errors)
85
/usr/local/lib/python3.8/dist-packages/numba/core/typeinfer.py in propagate(self, raise_errors)
1084 if isinstance(e, ForceLiteralArg)]
1085 if not force_lit_args:
-> 1086 raise errors[0]
1087 else:
1088 raise reduce(operator.or_, force_lit_args)
TypingError: Failed in cuda mode pipeline (step: nopython frontend)
No implementation of function Function(<built-in function sub>) found for signature:
>>> sub(array(float32, 1d, A), array(float32, 1d, A))
There are 10 candidate implementations:
- Of which 2 did not match due to:
Operator Overload in function 'sub': File: unknown: Line unknown.
With argument(s): '(array(float32, 1d, A), array(float32, 1d, A))':
No match for registered cases:
* (int64, int64) -> int64
* (int64, uint64) -> int64
* (uint64, int64) -> int64
* (uint64, uint64) -> uint64
* (float32, float32) -> float32
* (float64, float64) -> float64
* (complex64, complex64) -> complex64
* (complex128, complex128) -> complex128
- Of which 8 did not match due to:
Overload of function 'sub': File: <numerous>: Line N/A.
With argument(s): '(array(float32, 1d, A), array(float32, 1d, A))':
No match.
During: typing of intrinsic-call at <ipython-input-4-363332bbfd52> (25)
File "<ipython-input-4-363332bbfd52>", line 25:
def bodyForce(p,n):
<source elided>
dx = p[idx].x - p[i].x
Thanks in advance.
Best,
Diego