Hi,
I’ve been trying to implement a pandas DataFrame type via numba extension types (for use in pandas internals where we integrate with numba) and I’ve been struggling with getting the code to work properly.
So far, I’ve gotten most of it to work, I think.
I’ve been getting segfaults kind of randomly though (e.g. addition segfaults and so does division, but subtraction works fine), and have been struggling with compiler errors too.
My best guess is that I’m not parametrizing the dtype correctly. The goal is to have the structmodel be able to hold any numpy dtype a pandas DataFrame can hold. The tricky part is probably that different calls to the DataFrame constructor can have different dtypes depending on the values passed in.
My code is below. Any help is greatly appreciated!
import operator
import pandas as pd
import numpy as np
import numba
from numba import types
from numba.core import cgutils, boxing
from numba.extending import typeof_impl, type_callable, models, register_model, make_attribute_wrapper, lower_builtin, unbox, box, NativeValue, overload_method, overload
class DataFrameType(types.Type):
def __init__(self, vals_dtype, index_dtype):
self.vals_dtype = vals_dtype
self.index_dtype = index_dtype
super(DataFrameType, self).__init__(name=f"DataFrame")
@typeof_impl.register(pd.DataFrame)
def typeof_df(val, c):
return DataFrameType(typeof_impl(val.values, c), typeof_impl(val.index.values, c))
@type_callable(pd.DataFrame)
def type_df(context):
def typer(values, index):
if isinstance(values, types.Array) and isinstance(index, types.Array):
return DataFrameType(values.dtype, index.dtype)
return typer
@register_model(DataFrameType)
class DataFrameModel(models.StructModel):
def __init__(self, dmm, fe_type):
members = [
('values', fe_type.vals_dtype),
('index', fe_type.index_dtype)
]
models.StructModel.__init__(self, dmm, fe_type, members)
make_attribute_wrapper(DataFrameType, 'values', 'values')
make_attribute_wrapper(DataFrameType, 'index', 'index')
@lower_builtin(pd.DataFrame, types.Array, types.Array)
def impl_df(context, builder, sig, args):
typ = sig.return_type
values, index = args
df = cgutils.create_struct_proxy(typ)(context, builder)
df.values = values
df.index = index
return df._getvalue()
@unbox(DataFrameType)
def unbox_df(typ, obj, c):
# xref https://stackoverflow.com/questions/58565368/python-numba-trouble-creating-custom-type-using-numba-extension-api
values_obj = c.pyapi.object_getattr_string(obj, "values")
index_obj = c.pyapi.object_getattr_string(obj, "index")
# Get values from the Index object
index_values_obj = c.pyapi.object_getattr_string(index_obj, "values")
df = cgutils.create_struct_proxy(typ)(c.context, c.builder)
df.values = boxing.unbox_array(typ.vals_dtype, values_obj, c).value
df.index = boxing.unbox_array(typ.index_dtype, index_values_obj, c).value
c.pyapi.decref(values_obj)
c.pyapi.decref(index_obj)
c.pyapi.decref(index_values_obj)
is_error = cgutils.is_not_null(c.builder, c.pyapi.err_occurred())
return NativeValue(df._getvalue(), is_error=is_error)
@box(DataFrameType)
def box_df(typ, val, c):
df = cgutils.create_struct_proxy(typ)(c.context, c.builder, value=val)
index_obj = boxing.box_array(typ.index_dtype, df.index, c)
values_obj = boxing.box_array(typ.vals_dtype, df.values, c)
df_obj = c.pyapi.unserialize(c.pyapi.serialize_object(pd.DataFrame))
res = c.pyapi.call_function_objargs(df_obj, (values_obj, index_obj))
c.pyapi.decref(values_obj)
c.pyapi.decref(index_obj)
c.pyapi.decref(df_obj)
return res
@overload(operator.add)
def df_add(self_df, other):
if isinstance(other, DataFrameType):
# TODO: Different Index?
return lambda self_df, other: pd.DataFrame(self_df.values + other.values, index=self_df.index)
else:
return lambda self_df, other: pd.DataFrame(self_df.values + other, index=self_df.index)
@overload(operator.sub)
def df_sub(self_df, other):
if isinstance(other, DataFrameType):
# TODO: Different Index?
return lambda self_df, other: pd.DataFrame(self_df.values - other.values, index=self_df.index)
else:
return lambda self_df, other: pd.DataFrame(self_df.values - other, index=self_df.index)
@overload(operator.truediv)
def df_truediv(self_df, other):
if isinstance(other, DataFrameType):
# TODO: Different Index?
return lambda self_df, other: pd.DataFrame(self_df.values / other.values, index=self_df.index)
else:
return lambda self_df, other: pd.DataFrame(self_df.values / other, index=self_df.index)
@numba.njit
def test1(df):
#return pd.DataFrame(values=df.values, index=df.index) # Segfaults
#return df / 2 # Segfaults
#return df + 2 # Segfaults
#return df # Works
return df - 2 # Works
df = pd.DataFrame({"a": np.array([4,5,6], dtype=np.float64),
"b": np.array([7,8,9], dtype=np.float64)},
index=np.array([1,2,3], dtype=np.float64))
result = test1(df)
print(result)