feat: bump narwhals and adapt to support pyarrow (#694)
* refactor all **kwargs into kwargs

* refactor tests

* rerun meta-models

* print

* codebase

* tests

* bump min version

* fix with narwhals branch

* bump narwhals

* list -> numpy constant creation

---------

Co-authored-by: vincent d warmerdam <vincentwarmerdam@gmail.com>
FBruzzesi and koaning committed Aug 21, 2024
1 parent 192d406 commit 236f491
Showing 11 changed files with 71 additions and 54 deletions.
5 changes: 2 additions & 3 deletions pyproject.toml
@@ -20,7 +20,7 @@ maintainers = [
 ]
 
 dependencies = [
-    "narwhals>=1.0.0",
+    "narwhals>=1.2.0",
     "pandas>=1.1.5",
     "scikit-learn>=1.0",
     "importlib-metadata >= 1.0; python_version < '3.8'",
@@ -62,8 +62,7 @@ docs = [
 ]
 
 test = [
-    "narwhals[polars]",
-    "pyarrow",
+    "narwhals[polars,pyarrow]",
     "pytest>=6.2.5",
     "pytest-xdist>=1.34.0",
     "pytest-cov>=2.6.1",
1 change: 1 addition & 0 deletions setup.py
@@ -2,6 +2,7 @@
 """
 setup.py is used to distribute scikit-lego as a package using setuptools and twine
 """
+
 from setuptools import setup
 
 if __name__ == "__main__":
7 changes: 4 additions & 3 deletions sklego/meta/_grouped_utils.py
@@ -33,10 +33,11 @@ def parse_X_y(X, y, groups, check_X=True, **kwargs) -> nw.DataFrame:
 
     # Convert y and assign it to the frame
     n_samples = X.shape[0]
-    native_space = nw.get_native_namespace(X)
+    y_series = nw.from_dict(
+        data={"tmp": [None] * n_samples if y is None else y}, native_namespace=nw.get_native_namespace(X)
+    )["tmp"]
 
-    y_native = native_space.Series([None] * n_samples) if y is None else native_space.Series(y)
-    return X.with_columns(__sklego_target__=nw.from_native(y_native, allow_series=True))
+    return X.with_columns(__sklego_target__=y_series)
 
 
 def _validate_groups_values(X: nw.DataFrame, groups: List[int] | List[str]) -> None:
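
Note: the change above swaps backend-specific Series construction (which pyarrow does not offer the way pandas and polars do) for nw.from_dict plus with_columns. A minimal, self-contained sketch of that pattern — the frame contents are made up, and pandas stands in for any narwhals-supported backend:

import narwhals.stable.v1 as nw
import pandas as pd

X = nw.from_native(pd.DataFrame({"time": [1, 2, 3]}))
n_samples = X.shape[0]
y = None  # pretend no target was passed

# Build the target column through narwhals instead of the native namespace
y_series = nw.from_dict(
    data={"tmp": [None] * n_samples if y is None else y},
    native_namespace=nw.get_native_namespace(X),
)["tmp"]

print(nw.to_native(X.with_columns(__sklego_target__=y_series)))
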
15 changes: 8 additions & 7 deletions sklego/meta/grouped_predictor.py
@@ -123,12 +123,13 @@ def __fit_grouped_estimator(
         if columns is None:
             columns = self._groups
 
+        to_drop = list(set(["__sklego_target__", *columns, *as_list(self.groups)]))
         grouped_estimators = {
             # Fit a clone of the estimators to each group
             (group_name[0] if len(group_name) == 1 else group_name): self.__fit_single_group(
                 group=(group_name[0] if len(group_name) == 1 else group_name),
-                X=nw.to_native(X_grp.drop(["__sklego_target__", *columns, *as_list(self.groups)])),
-                y=(nw.to_native(X_grp.select("__sklego_target__")).to_numpy().reshape(-1) if y is not None else None),
+                X=nw.to_native(X_grp.drop(to_drop)),
+                y=(X_grp.select("__sklego_target__").to_numpy().reshape(-1) if y is not None else None),
             )
             for group_name, X_grp in frame.group_by(columns)
         }
@@ -149,12 +150,12 @@ def __add_shrinkage_column(self, frame, groups=None):
 
         if self.shrinkage is not None and self.use_global_model:
             n_samples = frame.shape[0]
-            native_space = nw.get_native_namespace(frame)
 
             frame = frame.select(
-                nw.from_native(native_space.Series([self._global_col_value] * n_samples), allow_series=True).alias(
-                    self._global_col_name
-                ),
+                nw.from_dict(
+                    data={self._global_col_name: np.full(shape=n_samples, fill_value=self._global_col_value)},
+                    native_namespace=nw.get_native_namespace(frame),
+                )[self._global_col_name],
                 nw.all(),
             )
         groups = [self._global_col_name] if groups is None else [self._global_col_name, *groups]
@@ -285,7 +286,7 @@ def __predict_groups(self, frame: nw.DataFrame, method="predict", groups=None):
                     (group_value[0] if len(group_value) == 1 else group_value),
                     nw.to_native(X_grp.drop(["__sklego_index__", *groups, *as_list(self.groups)])),
                     method=method,
-                ).set_index(nw.to_native(X_grp["__sklego_index__"]).to_numpy().reshape(-1).astype(int))
+                ).set_index(X_grp["__sklego_index__"].to_numpy().reshape(-1).astype(int))
                 for group_value, X_grp in frame.group_by(groups)
             ],
             axis=0,
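
The shrinkage-column change above follows the same recipe: the constant column is built from a numpy array via nw.from_dict and prepended with select(..., nw.all()), and .to_numpy() is now called on the narwhals objects themselves rather than after a nw.to_native round trip. A rough sketch, with a made-up column name and polars standing in for any supported backend:

import narwhals.stable.v1 as nw
import numpy as np
import polars as pl

frame = nw.from_native(pl.DataFrame({"x": [1.0, 2.0, 3.0]}))
n_samples = frame.shape[0]

# Prepend a constant column while keeping all existing columns via nw.all()
frame = frame.select(
    nw.from_dict(
        data={"__global__": np.full(shape=n_samples, fill_value=1)},
        native_namespace=nw.get_native_namespace(frame),
    )["__global__"],
    nw.all(),
)

print(frame.to_numpy())  # works directly on the narwhals frame, no nw.to_native(...) needed
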
2 changes: 1 addition & 1 deletion sklego/meta/grouped_transformer.py
@@ -164,7 +164,7 @@ def __transform_groups(self, frame: nw.DataFrame):
 
         results = [
             (
-                nw.to_native(X_grp.select("__sklego_index__")).to_numpy().squeeze().astype(int),
+                X_grp.select("__sklego_index__").to_numpy().squeeze().astype(int),
                 self.__transform_single_group(
                     group_name, nw.to_native(X_grp.drop(["__sklego_index__", *self.groups_]))
                 ),
20 changes: 13 additions & 7 deletions sklego/meta/hierarchical_predictor.py
@@ -281,12 +281,15 @@ def fit(self, X, y=None):
             msg = "Found 0 features, while a minimum of 1 if required."
             raise ValueError(msg)
 
-        native_space = nw.get_native_namespace(X)
-
+        native_namespace = nw.get_native_namespace(X)
+        target_series = nw.from_dict({self._TARGET_NAME: y}, native_namespace=native_namespace)[self._TARGET_NAME]
+        global_series = nw.from_dict({self._GLOBAL_NAME: np.ones(n_samples)}, native_namespace=native_namespace)[
+            self._GLOBAL_NAME
+        ]
         frame = X.with_columns(
             **{
-                self._TARGET_NAME: nw.from_native(native_space.Series(y), allow_series=True),
-                self._GLOBAL_NAME: nw.from_native(native_space.Series([1] * n_samples), allow_series=True),
+                self._TARGET_NAME: target_series,
+                self._GLOBAL_NAME: global_series,
             }
         ).pipe(self.__validate_frame)
 
@@ -318,11 +321,14 @@ def _predict_estimators(self, X, method_name):
             X = nw.from_native(pd.DataFrame(X))
 
         n_samples = X.shape[0]
-        native_space = nw.get_native_namespace(X)
+        native_namespace = nw.get_native_namespace(X)
+        global_series = nw.from_dict({self._GLOBAL_NAME: np.ones(n_samples)}, native_namespace=native_namespace)[
+            self._GLOBAL_NAME
+        ]
 
         frame = X.with_columns(
             **{
-                self._GLOBAL_NAME: nw.from_native(native_space.Series([1] * n_samples), allow_series=True),
+                self._GLOBAL_NAME: global_series,
                 self._INDEX_NAME: np.arange(n_samples),
             }
         ).pipe(self.__validate_frame)
@@ -340,7 +346,7 @@ def _predict_estimators(self, X, method_name):
 
         for level_idx, grp_names in enumerate(self.fitted_levels_):
             for grp_values, grp_frame in frame.group_by(grp_names):
-                grp_idx = nw.to_native(grp_frame.select(self._INDEX_NAME)).to_numpy().reshape(-1)
+                grp_idx = grp_frame.select(self._INDEX_NAME).to_numpy().reshape(-1)
 
                 _estimator, _level = _get_estimator(
                     estimators=self.estimators_,
8 changes: 5 additions & 3 deletions sklego/model_selection.py
@@ -263,11 +263,13 @@ def update_split_info(indices, j, part, summary):
 
         j = 0
         for i in self.split(nw.to_native(X)):
-            update_split_info(native_namespace.Series(i[0]), j, "train", summary)
-            update_split_info(native_namespace.Series(i[1]), j, "valid", summary)
+            train_info = nw.to_native(nw.from_dict({"tmp": i[0]}, native_namespace=native_namespace)["tmp"])
+            valid_info = nw.to_native(nw.from_dict({"tmp": i[1]}, native_namespace=native_namespace)["tmp"])
+            update_split_info(train_info, j, "train", summary)
+            update_split_info(valid_info, j, "valid", summary)
             j = j + 1
 
-        result = nw.from_native(native_namespace.DataFrame(summary))
+        result = nw.from_dict(summary, native_namespace=native_namespace)
         result = nw.maybe_set_index(result, ["fold", "part"])
         return nw.to_native(result)
 
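
The fold summary is likewise assembled with nw.from_dict instead of the native DataFrame constructor, and nw.maybe_set_index only sets an index on backends that have one (pandas), leaving polars and pyarrow untouched. A small sketch of the idea — the summary values below are invented:

import narwhals.stable.v1 as nw
import pandas as pd

summary = {
    "fold": [0, 0, 1, 1],
    "part": ["train", "valid", "train", "valid"],
    "n_obs": [80, 20, 80, 20],
}

result = nw.from_dict(summary, native_namespace=pd)
# No-op for index-free backends such as polars or pyarrow
result = nw.maybe_set_index(result, ["fold", "part"])

print(nw.to_native(result))
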
22 changes: 12 additions & 10 deletions tests/test_meta/test_grouped_predictor.py
@@ -1,8 +1,10 @@
 from contextlib import nullcontext as does_not_raise
 
+import narwhals.stable.v1 as nw
 import numpy as np
 import pandas as pd
 import polars as pl
+import pyarrow as pa
 import pytest
 from sklearn.dummy import DummyRegressor
 from sklearn.impute import SimpleImputer
@@ -60,32 +62,32 @@ def random_xy_grouped_clf_different_classes(request):
 
 
 @pytest.mark.parametrize("groups, expected", [("diet", {1, 2, 3, 4}), ("chick", set(range(1, 50 + 1)))])
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_chickweight_keys(groups, expected, frame_func):
-    df = frame_func(load_chicken(as_frame=True))
+    df = nw.from_native(frame_func(load_chicken(as_frame=True).to_dict(orient="list")))
     mod = GroupedPredictor(estimator=LinearRegression(), groups=groups)
-    mod.fit(df[["time", groups]], df["weight"])
+    mod.fit(nw.to_native(df.select("time", groups)), nw.to_native(df["weight"]))
     assert set(mod.estimators_.keys()) == expected
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_chickweight_can_do_fallback(frame_func):
-    df = frame_func(load_chicken(as_frame=True))
+    df = nw.from_native(frame_func(load_chicken(as_frame=True).to_dict(orient="list")))
     mod = GroupedPredictor(estimator=LinearRegression(), groups="diet")
-    mod.fit(df[["time", "diet"]], df["weight"])
+    mod.fit(nw.to_native(df.select("time", "diet")), nw.to_native(df["weight"]))
     assert set(mod.estimators_.keys()) == {1, 2, 3, 4}
     to_predict = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
     assert mod.predict(to_predict).shape == (2,)
     assert mod.predict(to_predict)[0] == mod.predict(to_predict)[1]
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_chickweight_can_do_fallback_proba(frame_func):
-    df = frame_func(load_chicken(as_frame=True))
+    df = nw.from_native(frame_func(load_chicken(as_frame=True).to_dict(orient="list")))
 
-    y = np.where(df["weight"] > df["weight"].mean(), 1, 0)
+    y = nw.to_native((df["weight"] > df["weight"].mean()).cast(nw.Int32))
     mod = GroupedPredictor(estimator=LogisticRegression(), groups="diet")
-    mod.fit(df[["time", "diet"]], y)
+    mod.fit(nw.to_native(df.select("time", "diet")), y)
     assert set(mod.estimators_.keys()) == {1, 2, 3, 4}
 
     to_predict = pd.DataFrame({"time": [21, 21], "diet": [5, 6]})
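
The fixtures above now feed a dict of lists into the frame constructor because pd.DataFrame, pl.DataFrame and pa.table all accept that shape of input, which lets a single parametrized fixture cover every backend. Roughly (the test name and assertion are illustrative only):

import pandas as pd
import polars as pl
import pyarrow as pa
import pytest

data = {"time": [21, 21], "diet": [5, 6]}

@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
def test_constructors_take_dict_of_lists(frame_func):
    df = frame_func(data)
    # pandas, polars and pyarrow all expose .shape on the resulting object
    assert df.shape == (2, 2)
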
25 changes: 14 additions & 11 deletions tests/test_meta/test_grouped_transformer.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
 import polars as pl
+import pyarrow as pa
 import pytest
 import sklearn
 from sklearn import clone
@@ -264,9 +265,9 @@ def test_array_with_strings():
     transformer.fit_transform(X)
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_df(penguins_df, frame_func):
-    penguins_df = frame_func(penguins_df)
+    penguins_df = frame_func(penguins_df.to_dict(orient="list"))
     meta = GroupedTransformer(StandardScaler(), groups=["island", "sex"])
 
     transformed = meta.fit_transform(penguins_df)
@@ -275,14 +276,14 @@ def test_df(penguins_df, frame_func):
     assert transformed.shape == (penguins_df.shape[0], penguins_df.shape[1] - 2)
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_df_missing_group(penguins_df, frame_func):
     meta = GroupedTransformer(StandardScaler(), groups=["island", "sex"])
 
     # Otherwise the fixture is changed
     X = penguins_df.copy()
     X.loc[0, "island"] = None
-    X = frame_func(X)
+    X = frame_func(X.to_dict(orient="list"))
     with pytest.raises(ValueError):
         meta.fit_transform(X)
 
@@ -309,29 +310,31 @@ def test_grouping_column_not_in_array(penguins):
         meta.fit_transform(X[:, :3])
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_grouping_column_not_in_df(penguins_df, frame_func):
     meta = GroupedTransformer(StandardScaler(), groups=["island", "unexisting_column"])
 
     # This should raise ValueError
     with pytest.raises(ValueError):
-        meta.fit_transform(frame_func(penguins_df))
+        meta.fit_transform(frame_func(penguins_df.to_dict(orient="list")))
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_no_grouping(penguins_df, frame_func):
-    penguins_numeric = frame_func(penguins_df[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]])
+    penguins_numeric = frame_func(
+        penguins_df[["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]].to_dict(orient="list")
+    )
 
     meta = GroupedTransformer(StandardScaler(), groups=None)
     nonmeta = StandardScaler()
 
     assert (meta.fit_transform(penguins_numeric) == nonmeta.fit_transform(penguins_numeric)).all()
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_with_y(penguins_df, frame_func):
-    X = frame_func(penguins_df.drop(columns=["sex"]))
-    y = penguins_df["sex"]
+    X = frame_func(penguins_df.drop(columns=["sex"]).to_dict(orient="list"))
+    y = penguins_df["sex"].to_numpy()
 
     meta = GroupedTransformer(StandardScaler(), groups="island")
 
15 changes: 8 additions & 7 deletions tests/test_meta/test_hierarchical_predictor.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 import polars as pl
+import pyarrow as pa
 import pytest
 from sklearn import clone
 from sklearn.datasets import make_classification, make_regression
@@ -17,7 +18,7 @@
 
 from sklego.meta import HierarchicalClassifier, HierarchicalRegressor
 
-frame_funcs = [pd.DataFrame, pl.DataFrame]
+frame_funcs = [pd.DataFrame, pl.DataFrame, pa.table]
 
 
 @parametrize_with_checks([HierarchicalRegressor(estimator=LinearRegression(), groups=0)])
@@ -60,14 +61,14 @@ def make_hierarchical_dataset(task, frame_func=pd.DataFrame):
     else:
         raise ValueError("Invalid task")
 
-    X_ = (
+    X_ = frame_func(
         pd.DataFrame(X, columns=[f"x_{i}" for i in range(X.shape[1])])
         .assign(
             g_0=1,
             g_1=["A"] * (n_samples // 2) + ["B"] * (n_samples // 2),
             g_2=["X"] * (n_samples // 4) + ["Y"] * (n_samples // 2) + ["Z"] * (n_samples // 4),
         )
-        .pipe(frame_func)
+        .to_dict(orient="list")
     )
     groups = ["g_0", "g_1", "g_2"]
 
@@ -129,7 +130,7 @@ def test_fit_predict(meta_cls, base_estimator, task, fallback_method, shrinkage,
     """Tests that the model can be fit and predict with different configurations of fallback and shrinkage methods if
     X to predict contains same groups as X used to fit.
     """
-    X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 1)])
+    X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 2)])
 
     meta_model = meta_cls(
         estimator=base_estimator,
@@ -159,10 +160,10 @@ def test_fallback(meta_cls, base_estimator, task, fallback_method, context):
     """Tests that the model fails or not when predicting with different fallback methods if X to predict contains
     unseen group values.
     """
-    X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 1)])
+    X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 2)])
 
     meta_model = meta_cls(estimator=base_estimator, groups=groups, fallback_method=fallback_method).fit(X, y)
-    X[groups] = np.ones((X.shape[0], len(groups))) * -1  # Shortcut assignment that works both in pandas and polars
+    X = nw.to_native(nw.from_native(X).with_columns(**{g: nw.lit(-1) for g in groups}))
 
     with context:
         meta_model.predict(X)
@@ -190,7 +191,7 @@ def test_shrinkage(meta_cls, base_estimator, task, metric, shrinkage, kwargs):
     """Tests that the model performance is better than the base estimator when predicting with different shrinkage
     methods.
     """
-    X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 1)])
+    X, y, groups = make_hierarchical_dataset(task, frame_func=frame_funcs[randint(0, 2)])
 
     X_ = nw.from_native(X).drop(groups).pipe(nw.to_native)
     meta_model = meta_cls(
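
The unseen-group test no longer mutates X with pandas-style indexing; it wraps the frame in narwhals and overwrites every group column with nw.lit(-1), which also works when the native object is a pyarrow table. A minimal sketch with made-up column names:

import narwhals.stable.v1 as nw
import pyarrow as pa

X = pa.table({"x_0": [0.1, 0.2], "g_1": ["A", "B"], "g_2": ["X", "Y"]})
groups = ["g_1", "g_2"]

# Replace the group columns with a constant, without touching backend-specific APIs
X = nw.to_native(nw.from_native(X).with_columns(**{g: nw.lit(-1) for g in groups}))

print(X)
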
5 changes: 3 additions & 2 deletions tests/test_meta/test_regression_outlier.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 import polars as pl
+import pyarrow as pa
 import pytest
 from sklearn.linear_model import LinearRegression, LogisticRegression
 from sklearn.utils.estimator_checks import parametrize_with_checks
@@ -37,7 +38,7 @@ def test_obvious_example():
     assert preds[i] == -1
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_obvious_example_dataframe(frame_func):
     # generate random data for illustrative example
     np.random.seed(42)
@@ -54,7 +55,7 @@ def test_obvious_example_dataframe(frame_func):
     assert preds[i] == -1
 
 
-@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame])
+@pytest.mark.parametrize("frame_func", [pd.DataFrame, pl.DataFrame, pa.table])
 def test_raises_error(frame_func):
     # generate random data for illustrative example
     np.random.seed(42)
