I am trying to reproduce results similar to those shown in another post (Numba #801, topic: cuda-shared-memory-on-1d-arrays), but I have not succeeded.
My process:
ncu -o profile python xxx.py
ncu-ui profile.ncu-rep
However, my results look different: there is nothing close to a one-to-one mapping from the lines of the Python file to the SASS code.
By the way, I am using CUDA Toolkit 11.3.
Any advice?
My code (modified from tutorial):
from __future__ import print_function, division, absolute_import
from timeit import default_timer as timer
from matplotlib.pylab import imshow, show
import numpy as np
from numba import cuda
# NOTE(review): debug=True is kept from the original. It may disable
# optimization and change how source lines map to SASS -- confirm against the
# Numba documentation whether lineinfo=True alone is what the profiler needs.
@cuda.jit(debug=True, lineinfo=True)
def create_fractal(min_x, max_x, min_y, max_y, image, iters):
    """Fill ``image`` with Mandelbrot escape-iteration counts, one pixel per thread.

    Each thread maps its 2-D grid position to a point in the complex plane
    inside the window [min_x, max_x] x [min_y, max_y], iterates
    ``z = z*z + c`` up to ``iters`` times, and stores the iteration index at
    which ``|z|^2 >= 4`` (or 255 if that never happens) into ``image[y, x]``.

    Parameters
    ----------
    min_x, max_x : bounds of the window along the real axis.
    min_y, max_y : bounds of the window along the imaginary axis.
    image : 2-D array indexed as ``image[row, column]``; written in place.
    iters : maximum number of iterations per pixel.
    """
    height = image.shape[0]
    width = image.shape[1]

    # Size of one pixel in complex-plane units along each axis.
    pixel_size_x = (max_x - min_x) / width
    pixel_size_y = (max_y - min_y) / height

    x, y = cuda.grid(2)

    # The launch grid may be larger than the image; ignore threads outside it.
    if x < width and y < height:
        real = min_x + x * pixel_size_x
        imag = min_y + y * pixel_size_y

        # Use the mapped plane coordinates, not the raw pixel indices, so the
        # requested window is actually what gets rendered.
        c = complex(real, imag)
        z = 0.0j
        color = 255  # value stored when the point never escapes
        for i in range(iters):
            z = z * z + c
            if (z.real * z.real + z.imag * z.imag) >= 4:
                color = i
                break
        image[y, x] = color
# Output image: one uint8 cell per pixel, rows = height, columns = width.
width = 15000
height = 10000
pixels = width * height
image = np.zeros((height, width), dtype=np.uint8)

# Launch configuration: square thread blocks, with enough blocks along each
# axis to cover the whole image.
nthreads = 32
nblocksx = (width // nthreads) + 1
nblocksy = (height // nthreads) + 1
griddim = (nblocksx, nblocksy)
blockdim = (nthreads, nthreads)

# Time one kernel call over the window [-2, 1] x [-1, 1] with 20 iterations.
s = timer()
create_fractal[griddim, blockdim](-2.0, 1.0, -1.0, 1.0, image, 20)
e = timer()
print("Execution time: %f seconds" % (e - s))