koaning · MBrouns · Sep 20, 2023 · Jul 13, 2023 · Jul 18, 2023 · Jul 18, 2023
diff --git a/doc/index.rst b/doc/index.rst
@@ -80,7 +80,7 @@ Usage
    meta.ipynb
    fairness.ipynb
    outliers.ipynb
-   timegapsplit.ipynb
+   crossvalidation.ipynb
    preprocessing.ipynb
    debug_pipeline.ipynb
    pandas_pipeline.ipynb

diff --git a/sklego/model_selection.py b/sklego/model_selection.py
@@ -16,37 +16,44 @@
 class TimeGapSplit:
     """
     Provides train/test indices to split time series data samples.
-    This cross-validation object is a variation of TimeSeriesSplit with the following differences:
+    This cross-validation object is a variation of TimeSeriesSplit with the following
+    differences:
+
     - The splits are made based on datetime duration, instead of number of rows.
-    - The user specifies the validation durations and either training_duration or n_splits
-    - The user can specify a 'gap' duration that is added
-      after the training split and before the validation split
-    The 3 duration parameters can be used to really replicate how the model
-    is going to be used in production in batch learning.
-    Each validation fold doesn't overlap. The entire 'window' moves by 1 valid_duration until there is not enough data.
-    If this would lead to more splits then specified with n_splits, the 'window' moves by
-    the validation_duration times the fraction of possible splits and requested splits
-     -- n_possible_splits = (total_length-train_duration-gap_duration)//valid_duration
-     -- time_shift = valid_duration n_possible_splits/n_slits
+    - The user specifies the validation durations and either training_duration or n_splits.
+    - The user can specify a 'gap' duration that is added after the training split and before the validation split.
+
+    The 3 duration parameters can be used to really replicate how the model is going to
+    be used in production in batch learning.
+
+    Each validation fold doesn't overlap. The entire 'window' moves by 1 'valid_duration'
+    until there is not enough data.
+
+    If this would lead to more splits than specified with 'n_splits', the 'window' moves by
+    the 'valid_duration' times the fraction of possible splits and requested splits:
+
+    - n_possible_splits = (total_length - train_duration - gap_duration) // valid_duration
+    - time_shift = valid_duration * n_possible_splits / n_splits
     so the CV spans the whole dataset.
-    If train_duration is not passed but n_split is,
-    the training duration is increased to
-     -- train_duration = total_length-(self.gap_duration + self.valid_duration * self.n_splits)
-     such that the shifting the entire window by one validation duration spans the whole training set
 
-    :param pandas.Series date_serie: Series with the date, that should have all the indices of X used in split()
+    If 'train_duration' is not passed but 'n_splits' is, the training duration is increased to:
+
+        train_duration = total_length - (gap_duration + valid_duration * n_splits)
+
+    such that shifting the entire window by one validation duration spans the whole training set.
+
+    :param pandas.Series date_serie: series with the date, that should have all the indices of X used in the split() method.
     :param datetime.timedelta train_duration: historical training data.
     :param datetime.timedelta valid_duration: retraining period.
     :param datetime.timedelta gap_duration: forward looking window of the target.
         The period of the forward looking window necessary to create your target variable.
         This period is dropped at the end of your training folds due to lack of recent data.
         In production you would have not been able to create the target for that period, and you would have drop it from
         the training data.
-    :param int n_splits: number of splits
-    :param string window:
-         'rolling' window has fixed size and is shifted entirely
-         'expanding' left side of window is fixed, right border increases each fold
-
+    :param int n_splits: number of splits.
+    :param string window: either 'rolling' or 'expanding'.
+        'rolling' window has fixed size and is shifted entirely.
+        'expanding' left side of window is fixed, right border increases each fold.
     """
 
     def __init__(
@@ -59,9 +66,7 @@ def __init__(
         window="rolling",
     ):
         if (train_duration is None) and (n_splits is None):
-            raise ValueError(
-                "Either train_duration or n_splits have to be defined"
-            )
+            raise ValueError("Either train_duration or n_splits have to be defined")
 
         if (train_duration is not None) and (train_duration <= gap_duration):
             raise ValueError(
@@ -82,20 +87,20 @@ def __init__(
     def _join_date_and_x(self, X):
         """
         Make a DataFrame indexed by the pandas index (the same as date_series) with date column joined with that index
-        and with the 'numpy index' column (i.e. just a range) that is required for the output and the rest of sklearn
-        :param pandas.DataFrame X:
+        and with the 'numpy index' column (i.e. just a range) that is required for the output and the rest of sklearn.
+
+        :param pandas.DataFrame X: Dataframe with the data to split
         """
-        X_index_df = pd.DataFrame(
-            range(len(X)), columns=["np_index"], index=X.index
-        )
+        X_index_df = pd.DataFrame(range(len(X)), columns=["np_index"], index=X.index)
         X_index_df = X_index_df.join(self.date_serie)
 
         return X_index_df
 
     def split(self, X, y=None, groups=None):
         """
         Generate indices to split data into training and test set.
-        :param pandas.DataFrame X:
+
+        :param pandas.DataFrame X: Dataframe with the data to split
         :param y: Always ignored, exists for compatibility
         :param groups: Always ignored, exists for compatibility
         """
@@ -111,9 +116,7 @@ def split(self, X, y=None, groups=None):
 
         date_min = X_index_df["__date__"].min()
         date_max = X_index_df["__date__"].max()
-        date_length = (
-            X_index_df["__date__"].max() - X_index_df["__date__"].min()
-        )
+        date_length = X_index_df["__date__"].max() - X_index_df["__date__"].min()
 
         if (self.train_duration is None) and (self.n_splits is not None):
             self.train_duration = date_length - (
@@ -148,13 +151,9 @@ def split(self, X, y=None, groups=None):
             time_shift = self.valid_duration * n_split_max / self.n_splits
         else:
             time_shift = self.valid_duration
-
         while True:
             if (
-                current_date
-                + self.train_duration
-                + time_shift
-                + self.gap_duration
+                current_date + self.train_duration + time_shift + self.gap_duration
                 > date_max
             ):
                 break
@@ -186,13 +185,17 @@ def split(self, X, y=None, groups=None):
             )
 
     def get_n_splits(self, X=None, y=None, groups=None):
+        """Gets the number of splits
 
+        :return: amount of n_splits
+        :rtype: int
+        """
         return sum(1 for x in self.split(X, y, groups))
 
     def summary(self, X):
-        """
-        Describe all folds
-        :param pandas.DataFrame X:
+        """Describes all folds
+
+        :param pandas.DataFrame X: Dataframe with the data to split
         :returns: ``pd.DataFrame`` summary of all folds
         """
         summary = []
@@ -291,17 +294,16 @@ class GroupTimeSeriesSplit(_BaseKFold):
     Create n_splits folds with an as equally possible size through a smart variant of a brute
     force search. Groups parameter in .split() should be filled with the time groups (e.g. years)
 
-    :param n_splits: the amount of train-test combinations.
-    :type n_splits: int
+    If n_splits is 3 ("*" = train, "x" = test)::
 
-    with n_splits at 3
-    * = train
-    x = test
     |-----------------------|
     | * * * x x x - - - - - |
     | - - - * * * x x x - - |
     | - - - - - - * * * x x |
     |-----------------------|
+
+    :param n_splits: the amount of train-test combinations.
+    :type n_splits: int
     """
 
     # table above inspired by sktime
@@ -350,16 +352,15 @@ def summary(self):
             )
         except AttributeError:
             raise AttributeError(
-                ".summary() only works after having ran"
-                " .split(X, y, groups)."
+                ".summary() only works after having ran" " .split(X, y, groups)."
             )
 
     def split(self, X=None, y=None, groups=None):
         """Returns the train-test splits of all the folds
 
         :param X: array-like of shape (n_samples, n_features)
             Training data, where `n_samples` is the number of samples
-            and `n_features` is the number of features., defaults to None
+            and `n_features` is the number of features, defaults to None
         :type X: np.array, optional
         :param y: array-like of shape (n_samples,)
             The target variable for supervised learning problems, defaults to None
@@ -484,13 +485,8 @@ def _calc_first_and_last_split_index(self, X=None, y=None, groups=None):
 
         # initialize the index of the first split, to reduce the amount of possible index split options
         first_split_index = (
-            self._grouped_df.assign(
-                cumsum_obs=lambda df: df["observations"].cumsum()
-            )
-            .assign(
-                group_id=lambda df: (df["cumsum_obs"] - 1)
-                // init_ideal_group_size
-            )
+            self._grouped_df.assign(cumsum_obs=lambda df: df["observations"].cumsum())
+            .assign(group_id=lambda df: (df["cumsum_obs"] - 1) // init_ideal_group_size)
             .reset_index()
             .loc[lambda df: df["group_id"] != 0]
             .iloc[0]
@@ -503,10 +499,7 @@ def _calc_first_and_last_split_index(self, X=None, y=None, groups=None):
                 cumsum_obs=lambda df: df["observations"].cumsum(),
             )
             .reset_index()
-            .assign(
-                group_id=lambda df: (df["cumsum_obs"] - 1)
-                // init_ideal_group_size
-            )
+            .assign(group_id=lambda df: (df["cumsum_obs"] - 1) // init_ideal_group_size)
             .loc[lambda df: df["group_id"] != 0]
             .iloc[0]
             .name
@@ -553,8 +546,7 @@ def _get_split_indices(self):
         for split in sliding_window(first_splits, window_size=2, step_size=1):
             try:
                 diff_from_ideal_list += [
-                    sum(observations[split[0] : split[1]])
-                    - self._ideal_group_size
+                    sum(observations[split[0] : split[1]]) - self._ideal_group_size
                 ]
             except IndexError:
                 diff_from_ideal_list += [
@@ -567,9 +559,7 @@ def _get_split_indices(self):
 
         # loop through all possible split points and check whether a new split
         # has a less total difference from all groups to the ideal group size
-        for prev_splits, new_splits in zip(
-            splits_generator, splits_generator_shifted
-        ):
+        for prev_splits, new_splits in zip(splits_generator, splits_generator_shifted):
             diff_from_ideal_list = self._calc_new_diffs(
                 observations, diff_from_ideal_list, prev_splits, new_splits
             )
@@ -643,9 +633,7 @@ def _regroup(self, groups):
         df = self._grouped_df.copy().reset_index()
         # set each unique group to the right group_id to group them into folds
         df.loc[: self._best_splits[0], "group"] = 0
-        for group_id, splits in enumerate(
-            sliding_window(self._best_splits, 2, 1)
-        ):
+        for group_id, splits in enumerate(sliding_window(self._best_splits, 2, 1)):
             try:
                 df.loc[splits[0] : splits[1], "group"] = group_id + 1
             except IndexError: