CUDA complex powers

Migrating my question from github

I’m trying to find the complex powers using numba cuda, and it seems that it has been supported, but for some reason, my code doesn’t run. I’ve implemented my algorithm in CUDA and CPU, Ideally the plots should be the same. I’ve tried this with real numbers and things do work. Open to any suggestions

import numpy as np
import matplotlib.pyplot as plt
import time, math
from matplotlib.colors import hsv_to_rgb
from numba import cuda, jit, prange

@cuda.jit
def get_grid_cuda(Z):
    i, j = cuda.grid(2)
    if i >=0 and i < Z.shape[0] and j >=0 and j < Z.shape[1]:     
        for i in range(200):
            fx=Z[i,j]**3+Z[i,j]**2+Z[i,j]+1
            dfx=3*Z[i,j]**2+2*Z[i,j]+1
            div=fx/dfx
            Z[i,j]=Z[i,j]-div
            if(abs(div)<1e-3):
                break

@jit(nopython=True)
def NewtonR(x):
    for i in range(100):
        fx=x**3+x**2+x+1
        dfx=3*x**2+2*x+1
        div=fx/dfx
        x=x-div
        if(abs(div)<1e-3):
            break
    return x

@jit(nopython=True, parallel=True)
def get_grid(Z):
    for i in prange(len(Z)):
        for j in prange(len(Z[0])):
            Z[i,j]=NewtonR(Z[i,j])
    return Z

x=np.linspace(-1,1,512,dtype=complex)
xj=np.linspace(-1j,1j,512,dtype=complex)
xx,yy=np.meshgrid(x,xj)
X=xx+yy

Z=np.copy(X)
Z_cpu=np.copy(X)

threadsperblock = (16, 16)
blockspergrid_x = math.ceil(Z.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(Z.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

get_grid_cuda[blockspergrid, threadsperblock](Z)

Z_cpu=get_grid(Z_cpu)

plt.contourf(X.real,X.imag,Z,levels=30)
plt.show()
plt.close()

plt.contourf(X.real,X.imag,Z_cpu,levels=30)
plt.show()
plt.close()

Hi @joej7

I think the issue in the above code is that the induction variable (i) in the for i in range(200) loop in the CUDA implementation overwrites the CUDA indexing variable i, j = cuda.grid(2), such that data ends up being read from and written to unexpected locations. This seems to work ok:

import numpy as np
import time, math
from numba import cuda, jit, prange

@cuda.jit
def get_grid_cuda(Z):
    i, j = cuda.grid(2)
    if i >=0 and i < Z.shape[0] and j >=0 and j < Z.shape[1]:
        # NOTE THE USE OF _ AS THE INDUCTION VARIABLE
        for _ in range(200):
            fx=Z[i,j]**3+Z[i,j]**2+Z[i,j]+1
            dfx=3*Z[i,j]**2+2*Z[i,j]+1
            div=fx/dfx
            Z[i,j]=Z[i,j]-div
            if(abs(div)<1e-3):
                break

@jit(nopython=True)
def NewtonR(x):
    for i in range(100):
        fx=x**3+x**2+x+1
        dfx=3*x**2+2*x+1
        div=fx/dfx
        x=x-div
        if(abs(div)<1e-3):
            break
    return x

@jit(nopython=True, parallel=True)
def get_grid(Z):
    for i in prange(len(Z)):
        for j in prange(len(Z[0])):
            Z[i,j]=NewtonR(Z[i,j])
    return Z

x=np.linspace(-1,1,512,dtype=complex)
xj=np.linspace(-1j,1j,512,dtype=complex)
xx,yy=np.meshgrid(x,xj)
X=xx+yy

Z=np.copy(X)
Z_cpu=np.copy(X)

threadsperblock = (16, 16)
blockspergrid_x = math.ceil(Z.shape[0] / threadsperblock[0])
blockspergrid_y = math.ceil(Z.shape[1] / threadsperblock[1])
blockspergrid = (blockspergrid_x, blockspergrid_y)

get_grid_cuda[blockspergrid, threadsperblock](Z)

Z_cpu=get_grid(Z_cpu)

np.testing.assert_allclose(Z_cpu, Z)

Hope this helps?

1 Like

OMG, I’m stupid, lol. Thank you so much!