CUDA + OpenGL interop

A working example of CUDA–OpenGL interop. This is a follow-up to my earlier question on the same topic.

The OpenGL bits are provided by the awesome moderngl and moderngl-window libraries.

Giant thanks to Graham @gmakrall!

import ctypes

import numpy as np
import numba.cuda as cuda
import cuda.cudart as curt
from cuda.cudart import cudaGraphicsRegisterFlags as GLFlags

import moderngl as mgl
import moderngl_window as mglw

from moderngl_window.conf import settings


###################################################
# wrap the raw pointer into DeviceNDArray
def GL_NDArray(ptr, buff_size, dtype=np.float32, strides=None, order="C"):
    """Wrap a raw device pointer in a one-dimensional Numba ``DeviceNDArray``.

    Args:
        ptr: Integer device address; it is converted to ``ctypes.c_uint64``.
        buff_size: Size of the buffer in bytes. The element count is
            ``buff_size // itemsize`` of ``dtype``.
        dtype: Element type of the resulting array (default ``np.float32``).
        strides: Forwarded to Numba's shape/strides preparation helper.
        order: Memory order forwarded to the same helper (default ``"C"``).

    Returns:
        A ``DeviceNDArray`` whose ``gpu_data`` is a ``MemoryPointer`` built on
        the current CUDA context. No finalizer is passed to the pointer.
    """
    n_items = buff_size // np.dtype(dtype).itemsize
    shape, strides, dtype = cuda.api.prepare_shape_strides_dtype(
        n_items, strides, dtype, order
    )
    n_bytes = cuda.driver.memory_size_from_info(shape, strides, dtype.itemsize)

    device_mem = cuda.driver.MemoryPointer(
        cuda.current_context(), ctypes.c_uint64(ptr), n_bytes
    )
    return cuda.cudadrv.devicearray.DeviceNDArray(
        shape, strides, dtype, gpu_data=device_mem
    )


##################################################################
# Pass-through GLSL 3.30 shaders: they forward the vertex data along the
# pipeline without modifying it.

# Vertex shader: places each 2D input vertex at z = 0.0, w = 1.0 and hands the
# per-vertex color on to the fragment stage.
vert_code = """
#version 330

in vec2 in_vert;
in vec3 in_color;

out vec3 color;

void main() {
    gl_Position = vec4(in_vert, 0.0, 1.0);
    color = in_color;
}
"""

# Fragment shader: writes the incoming color with alpha fixed at 1.0.
frag_code = """
#version 330

in vec3 color;

out vec4 fragColor;

void main(){
    fragColor = vec4(color, 1.0);
}
"""

########################################################################################
# window config
# Select the GLFW window backend and the OpenGL version to request.
settings.WINDOW.update(
    {
        "class": "moderngl_window.context.glfw.Window",
        "gl_version": (3, 3),
    }
)

########################################################################################
# Init Vertices numpy array with random y data
N = 300  # number of vertices

# Position: x evenly spaced over [-1, 1], y uniformly random in [-1, 1)
x = np.linspace(-1.0, 1.0, num=N)
y = 2 * (np.random.rand(N) - 0.5)

# Color: every vertex is pure red
r = np.full(N, 1.0)
g = np.zeros_like(r)
b = np.zeros_like(r)

# Interleave as (x, y, r, g, b) per vertex, then serialize as float32
vertices = np.dstack((x, y, r, g, b))
vert_data = vertices.astype(np.float32).tobytes()


########################################################################################
# Silly CUDA kernel that scales Y coordinate by a factor
@cuda.jit
def shift(x):
    """Scale the Y component of every vertex in the flat buffer ``x`` in place.

    ``x`` is indexed as a flat array in which each group of 5 consecutive
    values is one vertex and the value at offset 1 of the group is Y. One
    thread handles one element; threads that land on any other component do
    nothing.

    Args:
        x: One-dimensional device array; modified in place.
    """
    idx = cuda.grid(1)
    # The launch grid may hold more threads than there are elements. Valid
    # indices are 0 .. x.size - 1, so idx == x.size must be rejected as well
    # (a plain `>` would let that thread index one past the end).
    if idx >= x.size:
        return

    ty = idx % 5
    if ty == 1:  # _ Y _ _ _
        x[idx] = x[idx] * 0.997

########################################################################################
########################################################################################
# create window. everything else is on window context

# The window owns the OpenGL context; every GL object below is created on it.
window = mglw.create_window_from_settings()
ctx = window.ctx

# Shader program compiled from the vertex / fragment sources
prog = ctx.program(
    vertex_shader=vert_code,
    fragment_shader=frag_code,
)

# VBO holding the float32 vertex bytes, and a VAO binding it to the
# "in_vert" / "in_color" shader inputs
vbo = ctx.buffer(data=vert_data)
vao = ctx.simple_vertex_array(prog, vbo, "in_vert", "in_color")

########################################################################################
# Register the VBO with CUDA


def _check_cuda(err, call_name):
    """Raise RuntimeError unless a CUDA runtime call returned cudaSuccess."""
    if err != curt.cudaError_t.cudaSuccess:
        raise RuntimeError(f"{call_name} failed: {err!r}")


THREADS_PER_BLOCK = 1024

err, gl_resource = curt.cudaGraphicsGLRegisterBuffer(
    vbo.glo, GLFlags.cudaGraphicsRegisterFlagsNone
)
_check_cuda(err, "cudaGraphicsGLRegisterBuffer")

########################################################################################
# render loop
try:
    while not window.is_closing:
        window.clear()

        # map → kernel → sync → unmap → vao render
        # The CUDA runtime documents graphics-API access to a resource that is
        # still mapped by CUDA as undefined, so the buffer is mapped only
        # around the kernel and unmapped again before OpenGL draws from it.
        (err,) = curt.cudaGraphicsMapResources(1, gl_resource, 0)
        _check_cuda(err, "cudaGraphicsMapResources")
        try:
            (err, dev_ptr, dev_buff_size) = curt.cudaGraphicsResourceGetMappedPointer(
                gl_resource
            )
            _check_cuda(err, "cudaGraphicsResourceGetMappedPointer")

            # The mapped address may change from one mapping to the next, so
            # the DeviceNDArray wrapper is rebuilt for every frame.
            vbo_arr = GL_NDArray(dev_ptr, dev_buff_size)
            shift[vbo_arr.size // THREADS_PER_BLOCK + 1, THREADS_PER_BLOCK](vbo_arr)
            cuda.synchronize()
        finally:
            # Always hand the buffer back to OpenGL, even if the kernel failed.
            (err,) = curt.cudaGraphicsUnmapResources(1, gl_resource, 0)
        _check_cuda(err, "cudaGraphicsUnmapResources")

        vao.render(mgl.LINE_STRIP)

        window.swap_buffers()
finally:
    # CUDA cleanup — runs even when the loop exits through an exception
    (err,) = curt.cudaGraphicsUnregisterResource(gl_resource)
    window.destroy()
_check_cuda(err, "cudaGraphicsUnregisterResource")

########################################################################################

1 Like