CUDA + OpenGL interop

A working example of CUDA–OpenGL interop. This is a follow-up to my earlier question on the same topic.

The OpenGL bits are provided by the awesome moderngl and moderngl-window libraries.

Giant thanks to Graham @gmakrall!

import ctypes

import numpy as np
import numba.cuda as cuda
import cuda.cudart as curt
from cuda.cudart import cudaGraphicsRegisterFlags as GLFlags

import moderngl as mgl
import moderngl_window as mglw

from moderngl_window.conf import settings


###################################################
# wrap the raw pointer into DeviceNDArray
def GL_NDArray(ptr, buff_size, dtype=np.float32, strides=None, order="C"):
    """Wrap a raw device pointer in a numba ``DeviceNDArray`` (no copy).

    Args:
        ptr: Integer device address of the buffer.
        buff_size: Size of the buffer in bytes.
        dtype: Element type of the resulting array.
        strides: Optional strides, forwarded to numba's shape/stride helper.
        order: Memory order, forwarded to numba's shape/stride helper.

    Returns:
        A ``DeviceNDArray`` whose element count is
        ``buff_size // itemsize(dtype)`` and whose storage is the memory
        at ``ptr``.
    """
    # Whole elements that fit in the buffer; a trailing partial element is dropped.
    n_items = buff_size // np.dtype(dtype).itemsize

    dims, steps, np_dtype = cuda.api.prepare_shape_strides_dtype(
        n_items, strides, dtype, order
    )
    n_bytes = cuda.driver.memory_size_from_info(dims, steps, np_dtype.itemsize)

    # Describe the existing memory to numba instead of allocating new memory.
    device_mem = cuda.driver.MemoryPointer(
        cuda.current_context(), ctypes.c_uint64(ptr), n_bytes
    )
    return cuda.cudadrv.devicearray.DeviceNDArray(
        dims, steps, np_dtype, gpu_data=device_mem
    )


##################################################################
# Pass-through shaders: the vertex shader forwards the 2D position and
# the per-vertex color unchanged; the fragment shader writes that color
# with alpha fixed to 1.0.

# Vertex shader source (GLSL 330). Inputs: in_vert (vec2), in_color (vec3).
vert_code = """
#version 330

in vec2 in_vert;
in vec3 in_color;

out vec3 color;

void main() {
    gl_Position = vec4(in_vert, 0.0, 1.0);
    color = in_color;
}
"""

# Fragment shader source (GLSL 330). Output: fragColor (vec4).
frag_code = """
#version 330

in vec3 color;

out vec4 fragColor;

void main(){
    fragColor = vec4(color, 1.0);
}
"""

########################################################################################
# Window config: select moderngl_window's GLFW window class and request
# OpenGL 3.3, which matches the `#version 330` shaders used in this script.
settings.WINDOW["class"] = "moderngl_window.context.glfw.Window"
settings.WINDOW["gl_version"] = (3, 3)

########################################################################################
# Init Vertices numpy array with random y data
# Number of vertices in the line strip.
N = 300

# Per-vertex attribute columns: evenly spaced x in [-1, 1], uniform-random
# y in [-1, 1), and a constant color (r=1, g=0, b=0).
x = np.linspace(-1.0, 1.0, num=N)
y = 2 * (np.random.rand(N) - 0.5)
r, g, b = np.ones(N), np.zeros(N), np.zeros(N)

# dstack interleaves the columns into shape (1, N, 5), i.e. one
# (x, y, r, g, b) record per vertex; the float32 bytes of that array are
# the initial contents of the vertex buffer.
vertices = np.dstack((x, y, r, g, b))
vert_data = vertices.astype("f4").tobytes()


########################################################################################
# Silly CUDA kernel that scales Y coordinate by a factor
@cuda.jit
def shift(x):
    """Scale the Y coordinate of every vertex in a flat vertex array by 0.997.

    ``x`` is indexed as a flat 1D array made of 5-element records; only
    the element at offset 1 of each record (the Y coordinate in this
    script's ``x, y, r, g, b`` layout) is modified, in place.
    """
    idx = cuda.grid(1)
    # Must be `>=`: valid indices are 0 .. x.size - 1. The launch rounds the
    # grid up, so a thread with idx == x.size exists, and with `>` it would
    # fall through and could write one element past the end of the buffer.
    if idx >= x.size:
        return

    ty = idx % 5
    if ty == 1:  # _ Y _ _ _
        x[idx] = x[idx] * 0.997


########################################################################################
########################################################################################
# create window. everything else is on window context

window = mglw.create_window_from_settings()
ctx = window.ctx

# Program: compile and link the vertex + fragment shaders.
prog = ctx.program(vertex_shader=vert_code, fragment_shader=frag_code)

# Init → VBO and VAO (the VAO binds the VBO to the two shader inputs).
vbo = ctx.buffer(vert_data)  # f4 float32
vao = ctx.simple_vertex_array(prog, vbo, "in_vert", "in_color")


def _cuda_check(result):
    """Return the payload of a cuda.cudart result tuple, raising on failure.

    cuda.cudart calls return ``(err, *values)``; raises RuntimeError when
    ``err`` is not ``cudaSuccess`` instead of silently continuing.
    """
    err, *payload = result
    if err != curt.cudaError_t.cudaSuccess:
        raise RuntimeError(f"CUDA runtime call failed: {err!r}")
    return payload


########################################################################################
# Register the VBO with CUDA once; it is mapped/unmapped every frame below.

(gl_resource,) = _cuda_check(
    curt.cudaGraphicsGLRegisterBuffer(vbo.glo, GLFlags.cudaGraphicsRegisterFlagsNone)
)

########################################################################################
# render loop
try:
    while not window.is_closing:
        window.clear()

        # map → kernel → sync → unmap → vao render
        # OpenGL must not touch the buffer while it is mapped by CUDA, so the
        # mapping is held only for the duration of the kernel.
        _cuda_check(curt.cudaGraphicsMapResources(1, gl_resource, 0))
        try:
            # The device address is only valid for the current mapping, so
            # fetch it (and rebuild the array wrapper) on every frame.
            dev_ptr, dev_buff_size = _cuda_check(
                curt.cudaGraphicsResourceGetMappedPointer(gl_resource)
            )
            vbo_arr = GL_NDArray(dev_ptr, dev_buff_size)

            shift[vbo_arr.size // 1024 + 1, 1024](vbo_arr)
            cuda.synchronize()
        finally:
            _cuda_check(curt.cudaGraphicsUnmapResources(1, gl_resource, 0))

        vao.render(mgl.LINE_STRIP)

        window.swap_buffers()
finally:
    # CUDA cleanup runs even if the loop raised; the window (and its GL
    # context) is destroyed only after the resource is unregistered.
    try:
        _cuda_check(curt.cudaGraphicsUnregisterResource(gl_resource))
    finally:
        window.destroy()

########################################################################################

1 Like

Here is a slightly larger demo based on this (forum does not allow me to post a proper link):

  • htttttttp://gist.github.com/stgatilov/0bb58bf5296c3dfabd2eecd8dbf42237

It wraps the code listed above a bit:

import ctypes
import numpy as np
import numba.cuda as cuda
import cuda.cudart as curt

# source: https://numba.discourse.group/t/cuda-opengl-interop/1898
class OpenglBufferInCuda:
    """A mapping of OpenGL buffer into CUDA device memory.

    The buffer is registered with CUDA and mapped in ``__init__`` and stays
    mapped until ``shutdown()`` is called. Instances can also be used as a
    context manager, which calls ``shutdown()`` on exit.
    """

    def __init__(self, glbuf):
        """Establishes mapping for given GL buffer object.

        Args:
            glbuf: OpenGL buffer object name to register with CUDA.

        Raises:
            RuntimeError: if registering, mapping, or querying the mapped
                pointer fails. Anything acquired so far is released first.
        """
        self._glbuf = glbuf
        # Initialised up front so shutdown() is safe on a half-built object.
        self._cuglresource = None
        self._cuptr = None
        self._cusize = 0

        err, resource = curt.cudaGraphicsGLRegisterBuffer(
            self._glbuf,
            curt.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone,
        )
        self._check(err, "cudaGraphicsGLRegisterBuffer")
        self._cuglresource = resource

        try:
            (err,) = curt.cudaGraphicsMapResources(1, self._cuglresource, 0)
            self._check(err, "cudaGraphicsMapResources")

            (err, self._cuptr, self._cusize) = curt.cudaGraphicsResourceGetMappedPointer(
                self._cuglresource
            )
            self._check(err, "cudaGraphicsResourceGetMappedPointer")
        except Exception:
            # Do not leak the registration (or mapping) when a later step fails.
            self.shutdown()
            raise

    @staticmethod
    def _check(err, what):
        """Raise RuntimeError if a CUDA runtime call did not return cudaSuccess."""
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if err != curt.cudaError_t.cudaSuccess:
            raise RuntimeError(f"{what} failed: {err!r}")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.shutdown()
        return False

    def shutdown(self):
        """Breaks mapping.

        Safe to call more than once. Error codes from the unmap/unregister
        calls are ignored: this is best-effort teardown.
        """
        if self._cuglresource is None:
            return
        resource = self._cuglresource
        # Clear the handle first so the object is marked closed even if one
        # of the calls below raises.
        self._cuglresource = None
        self._cuptr = None
        self._cusize = 0
        try:
            curt.cudaGraphicsUnmapResources(1, resource, 0)
        finally:
            curt.cudaGraphicsUnregisterResource(resource)

    def asarray(self, dtype=np.float32, shape=None, *, strides=None, order="C"):
        """
        Returns DeviceNDArray view on mapped buffer compatible with numba.cuda.
        Unless shape is specified, 1D array of maximum size is returned.

        Raises:
            RuntimeError: if the mapping has already been shut down.
            ValueError: if the requested shape/strides/dtype need more bytes
                than the mapped buffer holds.
        """
        if self._cuglresource is None:
            # The device pointer is only valid while the buffer is mapped.
            raise RuntimeError("buffer is not mapped (shutdown() was already called)")

        if shape is None:
            shape = self._cusize // np.dtype(dtype).itemsize
        shape, strides, dtype = cuda.api.prepare_shape_strides_dtype(shape, strides, dtype, order)
        datasize = cuda.driver.memory_size_from_info(shape, strides, dtype.itemsize)
        if datasize > self._cusize:
            raise ValueError(
                f"requested view needs {datasize} bytes but the mapped buffer has {self._cusize}"
            )

        ctx = cuda.current_context()
        cptr = ctypes.c_uint64(self._cuptr)
        mem = cuda.driver.MemoryPointer(ctx, cptr, datasize)
        return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype, gpu_data=mem)

It also showcases a way to write the screen image interactively with numba: just write a kernel that fills the image buffer, and the result is displayed in real time.

from OpenGL.GL import *
import numpy as np

class OpenglScreenBuffer:
    """Represents OpenGL buffer object with pixel data that can be blit to default framebuffer.

    Pixels live in a tightly packed pixel-unpack buffer (``self.buffer``).
    ``blittoscreen`` copies that buffer into a texture and blits the texture's
    framebuffer onto the currently bound draw framebuffer.

    Supported formats are 3 or 4 channels of ``uint8`` or ``float32``.
    """

    def __init__(self, imgsize, *, channels=3, dtype=np.uint8):
        """Creates the buffer, texture and framebuffer objects.

        Args:
            imgsize: ``(width, height)`` of the image in pixels.
            channels: Number of channels per pixel.
            dtype: Channel type; anything ``np.dtype`` accepts.

        Raises:
            KeyError: if the (channels, dtype) combination is not supported.
        """
        self.size = imgsize
        self.channels = channels
        # Normalise to the NumPy scalar class so np.uint8, "uint8" and
        # np.dtype("uint8") all match the lookup tables' keys.
        self.dtype = np.dtype(dtype).type
        self.buffer = None

        # Resolve the texture format before creating any GL object, so an
        # unsupported combination fails without leaving objects behind.
        formatmode = {
            (3, np.uint8): GL_RGB8,
            (4, np.uint8): GL_RGBA8,
            (3, np.float32): GL_RGB32F,
            (4, np.float32): GL_RGBA32F,
        }[(self.channels, self.dtype)]

        # numba.cuda does not support images, so I'd better use buffers for interop
        # since I'm lazy to write shaders, PBO is the only option
        nbytes = self.size[0] * self.size[1] * channels * np.dtype(self.dtype).itemsize
        self.buffer = glGenBuffers(1)
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
        glBufferStorage(GL_PIXEL_UNPACK_BUFFER, nbytes, None, 0)
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)

        # PBO cannot be bound to framebuffer, so we have to copy it to a texture
        self._tex = glGenTextures(1)
        glBindTexture(GL_TEXTURE_2D, self._tex)
        glTexStorage2D(GL_TEXTURE_2D, 1, formatmode, self.size[0], self.size[1])
        glBindTexture(GL_TEXTURE_2D, 0)

        # texture is attached to framebuffer, and framebuffer can be blit with scaling
        self._fbo = glGenFramebuffers(1)
        glBindFramebuffer(GL_FRAMEBUFFER, self._fbo)
        glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, self._tex, 0)
        glBindFramebuffer(GL_FRAMEBUFFER, 0)

    def blittoscreen(self, screenSize):
        """Copies the buffer into the texture and blits it, scaled, to ``screenSize``.

        Args:
            screenSize: ``(width, height)`` of the destination rectangle.
        """
        colormode = [GL_RED, GL_RG, GL_RGB, GL_RGBA][self.channels - 1]
        typemode = {np.uint8: GL_UNSIGNED_BYTE, np.float32: GL_FLOAT}[self.dtype]

        # copy PBO content to texture
        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer)
        glBindTexture(GL_TEXTURE_2D, self._tex)
        # The buffer is tightly packed, but GL's default unpack alignment is 4
        # bytes per row. For RGB8 with width * 3 not a multiple of 4 that would
        # skew rows and read past the end of the buffer, so force alignment 1
        # for the upload and restore the previous value afterwards.
        prev_alignment = int(glGetIntegerv(GL_UNPACK_ALIGNMENT))
        glPixelStorei(GL_UNPACK_ALIGNMENT, 1)
        try:
            # With a pixel-unpack buffer bound, the last argument is an offset
            # into that buffer rather than a client pointer.
            glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.size[0], self.size[1], colormode, typemode, None)
        finally:
            glPixelStorei(GL_UNPACK_ALIGNMENT, prev_alignment)
            glBindTexture(GL_TEXTURE_2D, 0)
            glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0)

        # and blit framebuffer with texture into screen/default framebuffer
        glBindFramebuffer(GL_READ_FRAMEBUFFER, self._fbo)
        glBlitFramebuffer(0, 0, self.size[0], self.size[1], 0, 0, screenSize[0], screenSize[1], GL_COLOR_BUFFER_BIT, GL_LINEAR)
        glBindFramebuffer(GL_READ_FRAMEBUFFER, 0)

    def shutdown(self):
        """Deletes the framebuffer, texture and buffer objects. Safe to call twice."""
        if self.buffer is not None:
            glDeleteFramebuffers(1, [self._fbo])
            glDeleteTextures(1, [self._tex])
            glDeleteBuffers(1, [self.buffer])
            self.buffer = None