Paralization of function

Dear experts,

I am coding my first program in numba for Cuda and I am having some trouble to paralelize a function. I am solving the N-body problem, defining each body as an object using:

spec = [
(‘dim’, int32), # a simple scalar field
(‘x’, float32[:]), # an array field
(‘y’, float32[:]),
(‘z’, float32[:]),
(‘vx’, float32[:]),
(‘vy’, float32[:]),
(‘vz’, float32[:]),
(‘force’, int32),
]
@jitclass(spec)
class Body(object):
def init(self,n,x,y,z,vx,vy,vz):
#coordinates and velocity of the body
#n is the number of elements to keep (info)
#FUTURE PROBLEM: when we deal with 10^3 objects, it’s going to be impossible to keep their trajectories, how can we do it?
self.dim = n
self.x = np.zeros(n,dtype=np.float32)
self.x[0] = x
self.y = np.zeros(n,dtype=np.float32)
self.y[0] = y
self.z =np.zeros(n,dtype=np.float32)
self.z[0] = z
self.vx = np.zeros(n,dtype=np.float32)
self.vx[0] = vx
self.vy = np.zeros(n,dtype=np.float32)
self.vy[0] = vy
self.vz = np.zeros(n,dtype=np.float32)
self.vz[0] = vz
#maybe this is useful, total force over the body, but only in one time step, i don’t think we need
#to keep these values for all integration time
self.force = np.zeros(3,dtype=np.float32)

This seems to work when defining bodies as:

b1 = Body(1000,1,0,10,1,1,1)
b2 = Body(1000,0,3,5,1,1,1)
bodies = [b1,b2]

Then, I want to compute the force of all bodies over one of them with the function:

@cuda.jit
def bodyForce(p,n):
global SOFTENING
#p is the array or struct of bodies in the problem
#n is the number of bodies
#dt timestep
#Thread ID
idx = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
#Compute force for each body i
for i in range(n):
Fx = 0.0
Fy = 0.0
Fz = 0.0
#Summing over all elements. Note here I don’t need to discard the
#i=j because of the use of the SOFTERNING quantity. Check next reference
if(idx<n):
dx = p[idx].x - p[i].x
dy = p[idx].y - p[i].y
dz = p[idx].z - p[i].z
#Compute the square of distance. Softerning is a parameter that is inclded to avoid
#infinite forces when objects are very close (collisions)
distSqr = dxdx + dydy + dz*dz + SOFTENING
#sqrt of this guy
invDist = np.sqrt(distSqr)
#and now this to the power of 3
invDist3 = invDist * invDist * invDist
#Forces in the 3 cartesian directions
Fx += dx * invDist3
Fy += dy * invDist3
Fz += dz * invDist3
p[i].totalF = [Fx,Fy,Fz]

Calling it as:

SOFTENING = 0.1
bodyForce[1,2] (bodies,2)

I get the attached error message, which I do not manage to solve. Apologies for my few knowledge in the field, I hope that someone could give me a little hint!
Error:

/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py:488: NumbaPerformanceWarning: Grid size 1 will likely result in GPU under-utilization due to low occupancy.
  warn(NumbaPerformanceWarning(msg))
---------------------------------------------------------------------------
TypingError                               Traceback (most recent call last)
<ipython-input-5-9988521531ab> in <module>
      1 SOFTENING = 0.1
      2 
----> 3 bodyForce[1,2](bodies,2)

20 frames
/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in __call__(self, *args)
    489 
    490     def __call__(self, *args):
--> 491         return self.dispatcher.call(args, self.griddim, self.blockdim,
    492                                     self.stream, self.sharedmem)
    493 

/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in call(self, args, griddim, blockdim, stream, sharedmem)
    623             kernel = next(iter(self.overloads.values()))
    624         else:
--> 625             kernel = _dispatcher.Dispatcher._cuda_call(self, *args)
    626 
    627         kernel.launch(args, griddim, blockdim, stream, sharedmem)

/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in _compile_for_args(self, *args, **kws)
    631         assert not kws
    632         argtypes = [self.typeof_pyval(a) for a in args]
--> 633         return self.compile(tuple(argtypes))
    634 
    635     def typeof_pyval(self, val):

/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in compile(self, sig)
    792                 raise RuntimeError("Compilation disabled")
    793 
--> 794             kernel = _Kernel(self.py_func, argtypes, **self.targetoptions)
    795             # We call bind to force codegen, so that there is a cubin to cache
    796             kernel.bind()

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
     33         def _acquire_compile_lock(*args, **kwargs):
     34             with self:
---> 35                 return func(*args, **kwargs)
     36         return _acquire_compile_lock
     37 

/usr/local/lib/python3.8/dist-packages/numba/cuda/dispatcher.py in __init__(self, py_func, argtypes, link, debug, lineinfo, inline, fastmath, extensions, max_registers, opt, device)
     73         }
     74 
---> 75         cres = compile_cuda(self.py_func, types.void, self.argtypes,
     76                             debug=self.debug,
     77                             lineinfo=self.lineinfo,

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
     33         def _acquire_compile_lock(*args, **kwargs):
     34             with self:
---> 35                 return func(*args, **kwargs)
     36         return _acquire_compile_lock
     37 

/usr/local/lib/python3.8/dist-packages/numba/cuda/compiler.py in compile_cuda(pyfunc, return_type, args, debug, lineinfo, inline, fastmath, nvvm_options)
    210     from numba.core.target_extension import target_override
    211     with target_override('cuda'):
--> 212         cres = compiler.compile_extra(typingctx=typingctx,
    213                                       targetctx=targetctx,
    214                                       func=pyfunc,

/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
    714     pipeline = pipeline_class(typingctx, targetctx, library,
    715                               args, return_type, flags, locals)
--> 716     return pipeline.compile_extra(func)
    717 
    718 

/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in compile_extra(self, func)
    450         self.state.lifted = ()
    451         self.state.lifted_from = None
--> 452         return self._compile_bytecode()
    453 
    454     def compile_ir(self, func_ir, lifted=(), lifted_from=None):

/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in _compile_bytecode(self)
    518         """
    519         assert self.state.func_ir is None
--> 520         return self._compile_core()
    521 
    522     def _compile_ir(self):

/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in _compile_core(self)
    497                     self.state.status.fail_reason = e
    498                     if is_final_pipeline:
--> 499                         raise e
    500             else:
    501                 raise CompilerError("All available pipelines exhausted")

/usr/local/lib/python3.8/dist-packages/numba/core/compiler.py in _compile_core(self)
    484                 res = None
    485                 try:
--> 486                     pm.run(self.state)
    487                     if self.state.cr is not None:
    488                         break

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in run(self, state)
    366                     (self.pipeline_name, pass_desc)
    367                 patched_exception = self._patch_error(msg, e)
--> 368                 raise patched_exception
    369 
    370     def dependency_analysis(self):

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in run(self, state)
    354                 pass_inst = _pass_registry.get(pss).pass_inst
    355                 if isinstance(pass_inst, CompilerPass):
--> 356                     self._runPass(idx, pass_inst, state)
    357                 else:
    358                     raise BaseException("Legacy pass in use")

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_lock.py in _acquire_compile_lock(*args, **kwargs)
     33         def _acquire_compile_lock(*args, **kwargs):
     34             with self:
---> 35                 return func(*args, **kwargs)
     36         return _acquire_compile_lock
     37 

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in _runPass(self, index, pss, internal_state)
    309                 mutated |= check(pss.run_initialization, internal_state)
    310             with SimpleTimer() as pass_time:
--> 311                 mutated |= check(pss.run_pass, internal_state)
    312             with SimpleTimer() as finalize_time:
    313                 mutated |= check(pss.run_finalizer, internal_state)

/usr/local/lib/python3.8/dist-packages/numba/core/compiler_machinery.py in check(func, compiler_state)
    271 
    272         def check(func, compiler_state):
--> 273             mangled = func(compiler_state)
    274             if mangled not in (True, False):
    275                 msg = ("CompilerPass implementations should return True/False. "

/usr/local/lib/python3.8/dist-packages/numba/core/typed_passes.py in run_pass(self, state)
    103                               % (state.func_id.func_name,)):
    104             # Type inference
--> 105             typemap, return_type, calltypes, errs = type_inference_stage(
    106                 state.typingctx,
    107                 state.targetctx,

/usr/local/lib/python3.8/dist-packages/numba/core/typed_passes.py in type_inference_stage(typingctx, targetctx, interp, args, return_type, locals, raise_errors)
     81         infer.build_constraint()
     82         # return errors in case of partial typing
---> 83         errs = infer.propagate(raise_errors=raise_errors)
     84         typemap, restype, calltypes = infer.unify(raise_errors=raise_errors)
     85 

/usr/local/lib/python3.8/dist-packages/numba/core/typeinfer.py in propagate(self, raise_errors)
   1084                                   if isinstance(e, ForceLiteralArg)]
   1085                 if not force_lit_args:
-> 1086                     raise errors[0]
   1087                 else:
   1088                     raise reduce(operator.or_, force_lit_args)

TypingError: Failed in cuda mode pipeline (step: nopython frontend)
No implementation of function Function(<built-in function sub>) found for signature:
 
 >>> sub(array(float32, 1d, A), array(float32, 1d, A))
 
There are 10 candidate implementations:
    - Of which 2 did not match due to:
    Operator Overload in function 'sub': File: unknown: Line unknown.
      With argument(s): '(array(float32, 1d, A), array(float32, 1d, A))':
     No match for registered cases:
      * (int64, int64) -> int64
      * (int64, uint64) -> int64
      * (uint64, int64) -> int64
      * (uint64, uint64) -> uint64
      * (float32, float32) -> float32
      * (float64, float64) -> float64
      * (complex64, complex64) -> complex64
      * (complex128, complex128) -> complex128
    - Of which 8 did not match due to:
    Overload of function 'sub': File: <numerous>: Line N/A.
      With argument(s): '(array(float32, 1d, A), array(float32, 1d, A))':
     No match.

During: typing of intrinsic-call at <ipython-input-4-363332bbfd52> (25)

File "<ipython-input-4-363332bbfd52>", line 25:
def bodyForce(p,n):
    <source elided>

      dx = p[idx].x - p[i].x

Thanks in advance.

Best,

Diego