I would like to optimize a 3D binary dilation on the GPU. I suspect the following code is not optimal:

```
@cuda.jit("void(uint8[:,:,:],uint8[:,:,:],uint8[:,:,:])")
def cuda_binary_dilate_u8(vol, out, kern):
    """Compute one output voxel of a 3-D binary dilation per CUDA thread.

    out[z,y,x] = 1 if any voxel of `vol` under the structuring element
    `kern` (centered at (z,y,x)) is nonzero where `kern` is nonzero,
    else 0. Out-of-bounds neighbors are treated as 0.

    Args (all device arrays, uint8):
        vol:  input volume, shape (d, h, w)
        out:  output volume, same shape as vol (fully overwritten)
        kern: structuring element, shape (a, b, c); anchored at
              (a//2, b//2, c//2)
    """
    z, y, x = cuda.grid(3)
    d, h, w = vol.shape
    a, b, c = kern.shape
    pa, pb, pc = a // 2, b // 2, c // 2
    # NOTE(review): the thread x-dimension (fastest varying) maps to the
    # slowest-varying array axis here, so global loads are not coalesced.
    # Fixing that requires swapping the axis mapping in BOTH this kernel
    # and the launcher's grid computation — confirm before changing.
    # cuda.grid() indices are never negative, so only upper bounds matter.
    if z < d and y < h and x < w:
        out[z, y, x] = 0
        for i in range(a):
            # Hoist the per-axis bounds checks out of the inner loops:
            # one test per slice/row instead of one per voxel.
            zz = z + i - pa
            if zz < 0 or zz >= d:
                continue
            for j in range(b):
                yy = y + j - pb
                if yy < 0 or yy >= h:
                    continue
                for k in range(c):
                    xx = x + k - pc
                    if xx < 0 or xx >= w:
                        continue
                    if kern[i, j, k] and vol[zz, yy, xx]:
                        out[z, y, x] = 1
                        # Early exit: one hit decides the output, so don't
                        # scan the rest of the structuring element.
                        return
def binary_dilate_u8_cuda(vol, kern, iterations=1):
    """Run `iterations` rounds of binary dilation of `vol` by `kern` on the GPU.

    Args:
        vol:        host uint8 ndarray, shape (d, h, w)
        kern:       host uint8 structuring element, shape (a, b, c)
        iterations: number of successive dilations (default 1)

    Returns:
        A device array (uint8, same shape as vol) holding the result of the
        last dilation. With iterations == 0 it is an unmodified device copy
        of the input. Copy back with `.copy_to_host()` if needed.
    """
    xcu = cuda.to_device(vol)
    kcu = cuda.to_device(kern)
    # 8x8x8 = 512 threads/block; the original 4x4x4 = 64 threads/block
    # leaves occupancy low. Cubic blocks keep the axis mapping unchanged.
    block_dim = (8, 8, 8)
    grid_dim = tuple(int(np.ceil(s / t)) for s, t in zip(vol.shape, block_dim))
    # Scratch buffer for ping-ponging; every in-bounds voxel is written by
    # the kernel, so it does not need to be zero-initialized.
    ycu = cuda.device_array(shape=vol.shape, dtype=np.uint8)
    src, dst = ycu, xcu
    for _ in range(iterations):
        # Swap first so that on the first pass src is the input copy (xcu).
        src, dst = dst, src
        cuda_binary_dilate_u8[grid_dim, block_dim](src, dst, kcu)
    # dst always holds the most recent kernel output (or the input copy
    # when iterations == 0).
    return dst
```