Implementing pandas DataFrame type via numba extension types

Hi,
I’ve been trying to implement a pandas DataFrame type via numba extension types (for use in pandas internals where we integrate with numba), and I’ve been struggling to get the code to work properly.

So far, I’ve gotten most of it to work, I think.

I’ve been getting segfaults kind of randomly though (e.g. addition segfaults and so does division, but subtraction works fine), and have been struggling with compiler errors too.

My best guess is that I’m not parametrizing the type by dtype correctly. The goal is for the StructModel to be able to hold any numpy dtype that a pandas DataFrame can hold. The tricky part is probably that different calls to the DataFrame constructor can produce different dtypes depending on the values passed in.

My code is below. Any help is greatly appreciated!

import operator

import pandas as pd
import numpy as np
import numba
from numba import types
from numba.core import cgutils, boxing
from numba.extending import typeof_impl, type_callable, models, register_model, make_attribute_wrapper, lower_builtin, unbox, box, NativeValue, overload_method, overload


class DataFrameType(types.Type):
    """Numba type describing a ``pandas.DataFrame``.

    The type is parametrized by the numba types of its two members.

    Parameters
    ----------
    vals_dtype
        Numba type of the ``values`` member. Despite the attribute name,
        the data model and the unboxing code in this file use it as the
        type of the whole member (an array type), not as an element dtype.
    index_dtype
        Numba type of the ``index`` member, used the same way.
    """

    def __init__(self, vals_dtype, index_dtype):
        self.vals_dtype = vals_dtype
        self.index_dtype = index_dtype
        # Numba identifies a type by its name (the default ``key``), so the
        # name has to encode the parameters. With a constant name, frames
        # with different member types would compare equal and share one
        # data model and one set of compiled specializations.
        super().__init__(name=f"DataFrame({vals_dtype}, {index_dtype})")

@typeof_impl.register(pd.DataFrame)
def typeof_df(val, c):
    """Infer the numba type of a ``pandas.DataFrame`` argument.

    The frame's ``values`` array and the ``values`` array of its index are
    typed individually, and the two resulting numba types parametrize the
    returned ``DataFrameType``.
    """
    values_type = typeof_impl(val.values, c)
    index_type = typeof_impl(val.index.values, c)
    return DataFrameType(values_type, index_type)

@type_callable(pd.DataFrame)
def type_df(context):
    """Type the ``pd.DataFrame(values, index)`` constructor in jitted code.

    Returns a typer that yields a ``DataFrameType`` when both arguments are
    numba arrays, and ``None`` (no match) otherwise.
    """
    def typer(values, index):
        if isinstance(values, types.Array) and isinstance(index, types.Array):
            # Pass the full array types, not their element dtypes. The data
            # model uses these parameters directly as the struct member
            # types, and ``typeof_df`` builds the type from array types too;
            # passing ``values.dtype`` would describe a struct of scalars
            # while the lowering stores arrays into it.
            return DataFrameType(values, index)
        return None
    return typer

@register_model(DataFrameType)
class DataFrameModel(models.StructModel):
    """Native layout of ``DataFrameType``: a struct with two members.

    The member types are taken from the front-end type's ``vals_dtype`` and
    ``index_dtype`` attributes.
    """

    def __init__(self, dmm, fe_type):
        fields = [
            ('values', fe_type.vals_dtype),
            ('index', fe_type.index_dtype),
        ]
        super().__init__(dmm, fe_type, fields)

# Expose the struct members as read-only attributes (``df.values`` and
# ``df.index``) inside jitted code.
make_attribute_wrapper(typeclass=DataFrameType, struct_attr='values', python_attr='values')
make_attribute_wrapper(typeclass=DataFrameType, struct_attr='index', python_attr='index')

@lower_builtin(pd.DataFrame, types.Array, types.Array)
def impl_df(context, builder, sig, args):
    """Lower ``pd.DataFrame(values, index)`` to a native DataFrame struct.

    Parameters
    ----------
    context, builder
        Numba lowering context and LLVM IR builder.
    sig
        Signature of the call; ``sig.return_type`` is the ``DataFrameType``.
    args
        Native ``(values, index)`` array values.

    Returns
    -------
    The LLVM value of the populated struct.
    """
    # Imported locally so this block does not depend on a file-level import.
    from numba.core.imputils import impl_ret_borrowed

    typ = sig.return_type
    values, index = args
    df = cgutils.create_struct_proxy(typ)(context, builder)
    df.values = values
    df.index = index
    # The struct only borrows the argument arrays. Without taking a
    # reference, a temporary such as ``df.values + 2`` can be released while
    # the returned frame still points at it. ``impl_ret_borrowed`` increfs
    # the returned value so the frame holds its own references.
    return impl_ret_borrowed(context, builder, typ, df._getvalue())


@unbox(DataFrameType)
def unbox_df(typ, obj, c):
    """Convert a Python ``pandas.DataFrame`` object to its native struct.

    Reads ``obj.values`` and ``obj.index.values``, unboxes both as arrays
    and stores them in the struct's ``values`` and ``index`` members. The
    result is flagged as an error if a Python exception is pending.
    """
    # xref https://stackoverflow.com/questions/58565368/python-numba-trouble-creating-custom-type-using-numba-extension-api
    pyapi = c.pyapi
    py_values = pyapi.object_getattr_string(obj, "values")
    py_index = pyapi.object_getattr_string(obj, "index")
    # The Index object itself is not unboxed, only its underlying values.
    py_index_values = pyapi.object_getattr_string(py_index, "values")

    native = cgutils.create_struct_proxy(typ)(c.context, c.builder)
    native.values = boxing.unbox_array(typ.vals_dtype, py_values, c).value
    native.index = boxing.unbox_array(typ.index_dtype, py_index_values, c).value

    # Release the attribute objects fetched above.
    for fetched in (py_values, py_index, py_index_values):
        pyapi.decref(fetched)

    failed = cgutils.is_not_null(c.builder, pyapi.err_occurred())
    return NativeValue(native._getvalue(), is_error=failed)

@box(DataFrameType)
def box_df(typ, val, c):
    """Convert a native DataFrame struct back to a ``pandas.DataFrame``.

    Both members are boxed as arrays, and the ``pd.DataFrame`` class is
    called with them positionally as ``(values, index)``.
    """
    pyapi = c.pyapi
    native = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
    py_index = boxing.box_array(typ.index_dtype, native.index, c)
    py_values = boxing.box_array(typ.vals_dtype, native.values, c)
    # Obtain the pd.DataFrame class object inside the compiled code.
    frame_cls = pyapi.unserialize(pyapi.serialize_object(pd.DataFrame))
    boxed = pyapi.call_function_objargs(frame_cls, (py_values, py_index))
    # Drop the temporaries that were only needed for the constructor call.
    for temp in (py_values, py_index, frame_cls):
        pyapi.decref(temp)
    return boxed

@overload(operator.add)
def df_add(self_df, other):
    """Overload ``self_df + other`` for a DataFrame left operand.

    When ``other`` is also a DataFrame the two ``values`` arrays are added;
    otherwise ``other`` is added to ``self_df.values`` directly. The result
    keeps ``self_df``'s index. Returns ``None`` (no implementation) when
    ``self_df`` is not a ``DataFrameType``.
    """
    # Decline unless the left operand is a DataFrame, so this overload does
    # not offer an implementation for unrelated operand types.
    if not isinstance(self_df, DataFrameType):
        return None

    if isinstance(other, DataFrameType):
        # TODO: Different Index?
        def impl(self_df, other):
            return pd.DataFrame(self_df.values + other.values, index=self_df.index)
    else:
        def impl(self_df, other):
            return pd.DataFrame(self_df.values + other, index=self_df.index)
    return impl

@overload(operator.sub)
def df_sub(self_df, other):
    """Overload ``self_df - other`` for a DataFrame left operand.

    When ``other`` is also a DataFrame its ``values`` array is subtracted
    from ``self_df.values``; otherwise ``other`` is subtracted directly. The
    result keeps ``self_df``'s index. Returns ``None`` (no implementation)
    when ``self_df`` is not a ``DataFrameType``.
    """
    # Decline unless the left operand is a DataFrame, so this overload does
    # not offer an implementation for unrelated operand types.
    if not isinstance(self_df, DataFrameType):
        return None

    if isinstance(other, DataFrameType):
        # TODO: Different Index?
        def impl(self_df, other):
            return pd.DataFrame(self_df.values - other.values, index=self_df.index)
    else:
        def impl(self_df, other):
            return pd.DataFrame(self_df.values - other, index=self_df.index)
    return impl

@overload(operator.truediv)
def df_truediv(self_df, other):
    """Overload ``self_df / other`` for a DataFrame left operand.

    When ``other`` is also a DataFrame, ``self_df.values`` is divided by
    ``other.values``; otherwise it is divided by ``other`` directly. The
    result keeps ``self_df``'s index. Returns ``None`` (no implementation)
    when ``self_df`` is not a ``DataFrameType``.
    """
    # Decline unless the left operand is a DataFrame, so this overload does
    # not offer an implementation for unrelated operand types.
    if not isinstance(self_df, DataFrameType):
        return None

    if isinstance(other, DataFrameType):
        # TODO: Different Index?
        def impl(self_df, other):
            return pd.DataFrame(self_df.values / other.values, index=self_df.index)
    else:
        def impl(self_df, other):
            return pd.DataFrame(self_df.values / other, index=self_df.index)
    return impl


@numba.njit
def test1(df):
    # Reproducer: subtracts 2 from the frame inside jitted code.
    # Behaviour reported for the alternatives that were tried here:
    #   return pd.DataFrame(values=df.values, index=df.index)  -> segfaults
    #   return df / 2                                          -> segfaults
    #   return df + 2                                          -> segfaults
    #   return df                                              -> works
    #   return df - 2                                          -> works
    shifted = df - 2
    return shifted
# Small all-float64 frame (two columns, float64 index) used to drive the
# reproducer above.
df = pd.DataFrame(
    dict(
        a=np.array([4, 5, 6], dtype=np.float64),
        b=np.array([7, 8, 9], dtype=np.float64),
    ),
    index=np.array([1, 2, 3], dtype=np.float64),
)

result = test1(df)
print(result)
1 Like

This is not a direct response to the original question, but if you’re attempting to implement support for Pandas dataframes I think you might be able to draw a lot of inspiration from our implementation for UDFs in cuDF. We used a Record type to represent a row / dataframe rather than a struct - I can’t recall the reason right now, but I think it made the implementation simpler in some way (with some thinking / exploration I should be able to remember). The Numba extension code mostly lives in https://github.com/rapidsai/cudf/tree/branch-23.08/python/cudf/cudf/core/udf - there’s a lot in there these days, but I’m happy to spend some time discussing the implementation and how you can make progress with the implementation in pandas - feel free to reach out and let me know if this would be helpful.