From d090224ec2f2af3030ff3ffe09667b3f21246884 Mon Sep 17 00:00:00 2001 From: solegalli Date: Tue, 27 Jan 2026 21:35:29 -0500 Subject: [PATCH 01/24] update dt functions --- .../_variable_type_checks.py | 12 ++++----- .../test_fe_type_checks.py | 26 +++++++++++++++++++ 2 files changed, 32 insertions(+), 6 deletions(-) create mode 100644 tests/test_variable_handling/test_fe_type_checks.py diff --git a/feature_engine/variable_handling/_variable_type_checks.py b/feature_engine/variable_handling/_variable_type_checks.py index c3e16d383..044c2667d 100644 --- a/feature_engine/variable_handling/_variable_type_checks.py +++ b/feature_engine/variable_handling/_variable_type_checks.py @@ -1,9 +1,7 @@ -import warnings - import pandas as pd +from pandas.api.types import is_string_dtype as is_object from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime from pandas.core.dtypes.common import is_numeric_dtype as is_numeric -from pandas.core.dtypes.common import is_object_dtype as is_object def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: @@ -25,9 +23,11 @@ def _is_categories_num(column: pd.Series) -> bool: def _is_convertible_to_dt(column: pd.Series) -> bool: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - return is_datetime(pd.to_datetime(column, errors="ignore", utc=True)) + try: + var = pd.to_datetime(column, utc=True) + return is_datetime(var) + except: + return False def _is_convertible_to_num(column: pd.Series) -> bool: diff --git a/tests/test_variable_handling/test_fe_type_checks.py b/tests/test_variable_handling/test_fe_type_checks.py new file mode 100644 index 000000000..ecf553e90 --- /dev/null +++ b/tests/test_variable_handling/test_fe_type_checks.py @@ -0,0 +1,26 @@ +import pytest + +from feature_engine.variable_handling._variable_type_checks import ( + _is_categorical_and_is_datetime, + _is_categorical_and_is_not_datetime, + _is_convertible_to_dt, + +) + +def test_is_convertible_to_num(df): + assert _is_convertible_to_dt(df["Name"]) is False + assert _is_convertible_to_dt(df["date_obj0"]) is True + +def test_is_convertible_to_dt(df): + assert _is_convertible_to_dt(df["date_obj0"]) is True + assert _is_convertible_to_dt(df["date_range"]) is True + assert _is_convertible_to_dt(df["Name"]) is False + +def test_is_categorical_and_is_datetime(df): + assert _is_categorical_and_is_datetime(df["date_obj0"]) is True + assert _is_categorical_and_is_datetime(df["Name"]) is False + +def test_is_categorical_and_is_not_datetime(df): + assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False + assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False + assert _is_categorical_and_is_not_datetime(df["Name"]) is True From 6ff27aa30a115ae7354dcef2b692a1c72b0313a1 Mon Sep 17 00:00:00 2001 From: solegalli Date: Tue, 27 Jan 2026 21:59:00 -0500 Subject: [PATCH 02/24] expand tests --- .../test_fe_type_checks.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tests/test_variable_handling/test_fe_type_checks.py b/tests/test_variable_handling/test_fe_type_checks.py index ecf553e90..b0e991617 100644 --- a/tests/test_variable_handling/test_fe_type_checks.py +++ b/tests/test_variable_handling/test_fe_type_checks.py @@ -1,26 +1,40 @@ -import pytest - from feature_engine.variable_handling._variable_type_checks import ( _is_categorical_and_is_datetime, _is_categorical_and_is_not_datetime, _is_convertible_to_dt, - + _is_convertible_to_num, + _is_categories_num, ) def test_is_convertible_to_num(df): - assert _is_convertible_to_dt(df["Name"]) is False - assert _is_convertible_to_dt(df["date_obj0"]) is True + assert _is_convertible_to_num(df["Name"]) is False + assert _is_convertible_to_num(df["date_obj0"]) is False + + df["age_str"] = ["20", "21", "19", "18"] + assert _is_convertible_to_num(df["age_str"]) is True + def test_is_convertible_to_dt(df): assert _is_convertible_to_dt(df["date_obj0"]) is True assert _is_convertible_to_dt(df["date_range"]) is True assert _is_convertible_to_dt(df["Name"]) is False -def test_is_categorical_and_is_datetime(df): + df["age_str"] = ["20", "21", "19", "18"] + assert _is_convertible_to_dt(df["age_str"]) is False + + +def test_is_categorical_and_is_datetime(df, df_datetime): assert _is_categorical_and_is_datetime(df["date_obj0"]) is True assert _is_categorical_and_is_datetime(df["Name"]) is False + assert _is_categorical_and_is_datetime(df_datetime["date_obj1"]) is True + + df["age_str"] = ["20", "21", "19", "18"] + assert _is_categorical_and_is_datetime(df["age_str"]) is False def test_is_categorical_and_is_not_datetime(df): assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False assert _is_categorical_and_is_not_datetime(df["Name"]) is True + + df["age_str"] = ["20", "21", "19", "18"] + assert _is_categorical_and_is_not_datetime(df["age_str"]) is True \ No newline at end of file From 9d443033a03899f8df130cf2237f9e0ee1d792d0 Mon Sep 17 00:00:00 2001 From: solegalli Date: Tue, 27 Jan 2026 22:16:41 -0500 Subject: [PATCH 03/24] expand tests --- tests/test_variable_handling/test_fe_type_checks.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_variable_handling/test_fe_type_checks.py b/tests/test_variable_handling/test_fe_type_checks.py index b0e991617..ad915b611 100644 --- a/tests/test_variable_handling/test_fe_type_checks.py +++ b/tests/test_variable_handling/test_fe_type_checks.py @@ -1,11 +1,19 @@ from feature_engine.variable_handling._variable_type_checks import ( _is_categorical_and_is_datetime, _is_categorical_and_is_not_datetime, + _is_categories_num, _is_convertible_to_dt, _is_convertible_to_num, - _is_categories_num, ) + +def test_is_categories_num(df): + assert _is_categories_num(df["Name"]) is False + + df["Age"] = df["Age"].astype("category") + assert _is_categories_num(df["Age"]) is True + + def test_is_convertible_to_num(df): assert _is_convertible_to_num(df["Name"]) is False assert _is_convertible_to_num(df["date_obj0"]) is False @@ -31,10 +39,11 @@ def test_is_categorical_and_is_datetime(df, df_datetime): df["age_str"] = ["20", "21", "19", "18"] assert _is_categorical_and_is_datetime(df["age_str"]) is False + def test_is_categorical_and_is_not_datetime(df): assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False assert _is_categorical_and_is_not_datetime(df["Name"]) is True df["age_str"] = ["20", "21", "19", "18"] - assert _is_categorical_and_is_not_datetime(df["age_str"]) is True \ No newline at end of file + assert _is_categorical_and_is_not_datetime(df["age_str"]) is True From de4d663031123cb2c4c38d378388a1f9d82ba82b Mon Sep 17 00:00:00 2001 From: solegalli Date: Tue, 27 Jan 2026 22:40:13 -0500 Subject: [PATCH 04/24] update fpr new pandas behaviour --- feature_engine/variable_handling/_variable_type_checks.py | 2 ++ tests/test_variable_handling/test_fe_type_checks.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/feature_engine/variable_handling/_variable_type_checks.py b/feature_engine/variable_handling/_variable_type_checks.py index 044c2667d..fb54c997e 100644 --- a/feature_engine/variable_handling/_variable_type_checks.py +++ b/feature_engine/variable_handling/_variable_type_checks.py @@ -49,4 +49,6 @@ def _is_categorical_and_is_datetime(column: pd.Series) -> bool: elif isinstance(column.dtype, pd.CategoricalDtype): is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column) + else: + is_dt = False return is_dt diff --git a/tests/test_variable_handling/test_fe_type_checks.py b/tests/test_variable_handling/test_fe_type_checks.py index ad915b611..86c5609b8 100644 --- a/tests/test_variable_handling/test_fe_type_checks.py +++ b/tests/test_variable_handling/test_fe_type_checks.py @@ -39,6 +39,12 @@ def test_is_categorical_and_is_datetime(df, df_datetime): df["age_str"] = ["20", "21", "19", "18"] assert _is_categorical_and_is_datetime(df["age_str"]) is False + df = df.copy() + # from pandas 3 onwards, object types that contain strings are not recognised as + # objects any more + df["Age"] = df["Age"].astype("O") + assert _is_categorical_and_is_datetime(df["Age"]) is False + def test_is_categorical_and_is_not_datetime(df): assert _is_categorical_and_is_not_datetime(df["date_obj0"]) is False From 7f8883911ca49c8216bd6afa2fe7e0e1fbb70146 Mon Sep 17 00:00:00 2001 From: Ankit Hemant Lade Date: Fri, 6 Feb 2026 14:56:23 -0600 Subject: [PATCH 05/24] fix: Pandas 3 compatibility - robust dtype checks and test fixes (#885) * fix: Pandas 3 compatibility - robust dtype checks and test fixes - Fix UnboundLocalError in _variable_type_checks.py by initializing is_cat/is_dt - Add robust dtype checking using both is_object_dtype and is_string_dtype - Update find_variables.py with same robust logic for consistency - Fix warning count assertions in encoder tests (Pandas 3 adds extra deprecation warnings) - Fix floating point precision assertion in recursive feature elimination test - Apply ruff formatting and fix linting errors - All 1900 tests passing * fix: Remove whitespace before colon in slice notation (flake8 E203) * feat: finalize Pandas 3 compatibility fixes and test updates * style: fix flake8 line length and linting issues * style: fix remaining flake8 C416 issue * Fix Pandas 3 regressions in check_y, _check_contains_inf, and StringSimilarityEncoder * Fix E501 line too long in dataframe_checks.py * Fix StringSimilarityEncoder NaN issues and fragile test assertions * fix: Pandas 3 stability - mock datasets and fix FutureWarnings * style: fix flake8 linting errors E501, E302, E305, SIM102 * test: improve patch coverage for Pandas 3 stability fixes * style: fix E501 line too long in similarity encoder tests * style: revert unrelated flake8 and formatting changes * fix: restore Pandas 3 test logic and silence Pandas4Warning * style: move numpy import to top of math_features.py * style: fix spacing in MatchVariables verbose error message * test: revert dynamic std values to hardcoded values in MathFeatures tests * style: combine imports in _variable_type_checks.py * refactor: centralize is_object function and use it across the codebase * refactor: further simplify check_y dtype checks using is_object * revert: remove unnecessary complexity in _check_contains_inf and associated tests * docs: rename _normalize_func to _map_unnamed_func_to_str and add comments * perf: optimize casting logic in SimilarityEncoder * fix: address remaining code review feedback - follow sklearn convention for init params - make tests conditional on pandas version - restore encoder_dict_ assertion * style: fix linting and follow sklearn convention for MathFeatures * revert: remove california housing mock from conftest.py * revert: restore original error message assertion in DatetimeFeatures test * fix: use robust datetime normalization and flexible error assertions in tests --- feature_engine/creation/math_features.py | 81 +++++++++---- feature_engine/dataframe_checks.py | 8 +- feature_engine/encoding/similarity_encoder.py | 47 +++++--- feature_engine/preprocessing/match_columns.py | 9 +- .../timeseries/forecasting/lag_features.py | 4 +- .../timeseries/forecasting/window_features.py | 2 +- .../_variable_type_checks.py | 36 +++--- .../variable_handling/find_variables.py | 18 +-- tests/test_creation/test_math_features.py | 3 - tests/test_dataframe_checks.py | 37 ++++-- tests/test_datetime/test_datetime_features.py | 4 +- tests/test_encoding/test_mean_encoder.py | 9 +- tests/test_encoding/test_ordinal_encoder.py | 9 +- .../test_encoding/test_similarity_encoder.py | 25 ++++ .../test_woe/test_woe_encoder.py | 9 +- .../test_preprocessing/test_match_columns.py | 6 +- .../test_recursive_feature_elimination.py | 4 +- .../test_fe_type_checks.py | 38 ++++++ tests/test_wrappers/test_sklearn_wrapper.py | 109 +++++++++++++++++- 19 files changed, 362 insertions(+), 96 deletions(-) diff --git a/feature_engine/creation/math_features.py b/feature_engine/creation/math_features.py index 35cbe73aa..5537c876f 100644 --- a/feature_engine/creation/math_features.py +++ b/feature_engine/creation/math_features.py @@ -1,5 +1,6 @@ from typing import Any, List, Optional, Union +import numpy as np import pandas as pd from feature_engine._docstrings.fit_attributes import ( @@ -140,7 +141,6 @@ def __init__( missing_values: str = "raise", drop_original: bool = False, ) -> None: - if ( not isinstance(variables, list) or not all(isinstance(var, (int, str)) for var in variables) @@ -157,16 +157,15 @@ def __init__( "func does not work with dictionaries in this transformer." ) - if new_variables_names is not None: - if ( - not isinstance(new_variables_names, list) - or not all(isinstance(var, str) for var in new_variables_names) - or len(set(new_variables_names)) != len(new_variables_names) - ): - raise ValueError( - "new_variable_names should be None or a list of unique strings. " - f"Got {new_variables_names} instead." - ) + if new_variables_names is not None and ( + not isinstance(new_variables_names, list) + or not all(isinstance(var, str) for var in new_variables_names) + or len(set(new_variables_names)) != len(new_variables_names) + ): + raise ValueError( + "new_variable_names should be None or a list of unique strings. " + f"Got {new_variables_names} instead." + ) if new_variables_names is not None: if isinstance(func, list): @@ -175,12 +174,11 @@ def __init__( "The number of new feature names must coincide with the number " "of functions." ) - else: - if len(new_variables_names) != 1: - raise ValueError( - "The number of new feature names must coincide with the number " - "of functions." - ) + elif len(new_variables_names) != 1: + raise ValueError( + "The number of new feature names must coincide with the number " + "of functions." + ) super().__init__(missing_values, drop_original) @@ -188,6 +186,45 @@ def __init__( self.func = func self.new_variables_names = new_variables_names + def _map_unnamed_func_to_str(self, func: Any) -> Any: + if isinstance(func, list): + return [self._map_unnamed_func_to_str(f) for f in func] + + # We map certain numpy functions to their string alias. + # This serves two purposes: + # 1) It avoids a FutureWarning in pandas 2.1+ which recommends + # using the string alias for better performance and future-proofing. + # 2) It ensures consistent column naming (e.g. "sum_x1_x2") + # regardless of how the function was passed (np.sum vs "sum"). + map_dict = { + np.sum: "sum", + np.mean: "mean", + np.std: "std", + np.min: "min", + np.max: "max", + np.median: "median", + np.prod: "prod", + } + return map_dict.get(func, func) + + def fit(self, X: pd.DataFrame, y=None): + """ + This method does not learn any parameters. It just stores the normalized + function representation. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. + It is not needed in this transformer. You can pass y or None. + """ + super().fit(X, y) + # Normalize func to func_ (sklearn convention: don't modify init params) + self.func_ = self._map_unnamed_func_to_str(self.func) + return self + def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ Create and add new variables. @@ -207,9 +244,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: new_variable_names = self._get_new_features_name() if len(new_variable_names) == 1: - X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1) + X[new_variable_names[0]] = X[self.variables].agg(self.func_, axis=1) else: - X[new_variable_names] = X[self.variables].agg(self.func, axis=1) + X[new_variable_names] = X[self.variables].agg(self.func_, axis=1) if self.drop_original: X.drop(columns=self.variables, inplace=True) @@ -226,14 +263,14 @@ def _get_new_features_name(self) -> List: else: varlist = [f"{var}" for var in self.variables_] - if isinstance(self.func, list): + if isinstance(self.func_, list): functions = [ - fun if type(fun) is str else fun.__name__ for fun in self.func + fun if type(fun) is str else fun.__name__ for fun in self.func_ ] feature_names = [ f"{function}_{'_'.join(varlist)}" for function in functions ] else: - feature_names = [f"{self.func}_{'_'.join(varlist)}"] + feature_names = [f"{self.func_}_{'_'.join(varlist)}"] return feature_names diff --git a/feature_engine/dataframe_checks.py b/feature_engine/dataframe_checks.py index 2d41727f7..9ef9b3f82 100644 --- a/feature_engine/dataframe_checks.py +++ b/feature_engine/dataframe_checks.py @@ -9,6 +9,8 @@ from scipy.sparse import issparse from sklearn.utils.validation import _check_y, check_consistent_length, column_or_1d +from feature_engine.variable_handling._variable_type_checks import is_object + def check_X(X: Union[np.generic, np.ndarray, pd.DataFrame]) -> pd.DataFrame: """ @@ -121,10 +123,10 @@ def check_y( elif isinstance(y, pd.Series): if y.isnull().any(): raise ValueError("y contains NaN values.") - if y.dtype != "O" and not np.isfinite(y).all(): + if not is_object(y) and not np.isfinite(y).all(): raise ValueError("y contains infinity values.") - if y_numeric and y.dtype == "O": - y = y.astype("float") + if y_numeric and is_object(y): + y = y.astype("float64") y = y.copy() elif isinstance(y, pd.DataFrame): diff --git a/feature_engine/encoding/similarity_encoder.py b/feature_engine/encoding/similarity_encoder.py index 137034ddb..2599d2f91 100644 --- a/feature_engine/encoding/similarity_encoder.py +++ b/feature_engine/encoding/similarity_encoder.py @@ -232,12 +232,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): X = check_X(X) variables_ = self._check_or_select_variables(X) - if self.keywords: - if not all(item in variables_ for item in self.keywords.keys()): - raise ValueError( - "There are variables in keywords that are not present " - "in the dataset." - ) + if self.keywords and not all( + item in variables_ for item in self.keywords.keys() + ): + raise ValueError( + "There are variables in keywords that are not present " + "in the dataset." + ) # if data contains nan, fail before running any logic if self.missing_values == "raise": @@ -262,10 +263,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): ) elif self.missing_values == "impute": for var in cols_to_iterate: + series = X[var] self.encoder_dict_[var] = ( - X[var] - .astype(str) - .replace("nan", "") + series.astype(str) + .mask(series.isna(), "") .value_counts() .head(self.top_categories) .index.tolist() @@ -276,7 +277,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): X[var] .astype(str) .value_counts(dropna=True) - .drop("nan", errors="ignore") + .drop(["nan", ""], errors="ignore") .head(self.top_categories) .index.tolist() ) @@ -316,13 +317,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: new_values = [] for var in self.variables_: if self.missing_values == "impute": - X[var] = X[var].astype(str).replace("nan", "") - categories = X[var].dropna().astype(str).unique() + series = X[var] + series = series.astype(str).mask(series.isna(), "") + else: + series = X[var].astype(str) + + categories = series.unique() column_encoder_dict = { x: _gpm_fast_vec(x, self.encoder_dict_[var]) for x in categories } - column_encoder_dict["nan"] = [np.nan] * len(self.encoder_dict_[var]) - encoded = np.vstack(X[var].astype(str).map(column_encoder_dict).values) + # Ensure map result is always an array of the correct size. + # Missing values in categories or unknown categories will map to NaN. + default_nan = [np.nan] * len(self.encoder_dict_[var]) + if "nan" not in column_encoder_dict: + column_encoder_dict["nan"] = default_nan + if "" not in column_encoder_dict: + column_encoder_dict[""] = default_nan + + encoded_series = series.map(column_encoder_dict) + + # Robust stacking: replace any float NaNs (from unknown values) with arrays + encoded_list = [ + v if isinstance(v, (list, np.ndarray)) else default_nan + for v in encoded_series + ] + encoded = np.vstack(encoded_list) if self.missing_values == "ignore": encoded[X[var].isna(), :] = np.nan new_values.append(encoded) diff --git a/feature_engine/preprocessing/match_columns.py b/feature_engine/preprocessing/match_columns.py index c5321b6c3..da34f5e9c 100644 --- a/feature_engine/preprocessing/match_columns.py +++ b/feature_engine/preprocessing/match_columns.py @@ -175,7 +175,7 @@ def __init__( if not isinstance(verbose, bool): raise ValueError( - "verbose takes only booleans True and False." f"Got '{verbose} instead." + f"verbose takes only booleans True and False. Got '{verbose} instead." ) # note: np.nan is an instance of float!!! @@ -262,7 +262,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = X.drop(_columns_to_drop, axis=1) - X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value) + # Add missing columns one at a time to avoid Pandas 3 StringDtype reindex issue + for col in _columns_to_add: + X[col] = self.fill_value + + # Reorder columns to match training set, without fill_value to avoid issues + X = X[self.feature_names_in_] if self.match_dtypes: _current_dtypes = X.dtypes.to_dict() diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 7ed7ed200..ee9c1c151 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -201,7 +201,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: axis=0, ) df_ls.append(tmp) - tmp = pd.concat(df_ls, axis=1) + tmp = pd.concat(df_ls, axis=1, sort=False) else: tmp = X[self.variables_].shift( @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: axis=0, ) df_ls.append(tmp) - tmp = pd.concat(df_ls, axis=1) + tmp = pd.concat(df_ls, axis=1, sort=False) else: tmp = X[self.variables_].shift( diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index 47071efa7..a1e526c3e 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: .shift(periods=self.periods, freq=self.freq) ) df_ls.append(tmp) - tmp = pd.concat(df_ls, axis=1) + tmp = pd.concat(df_ls, axis=1, sort=False) else: tmp = ( diff --git a/feature_engine/variable_handling/_variable_type_checks.py b/feature_engine/variable_handling/_variable_type_checks.py index fb54c997e..3427c60be 100644 --- a/feature_engine/variable_handling/_variable_type_checks.py +++ b/feature_engine/variable_handling/_variable_type_checks.py @@ -1,20 +1,25 @@ import pandas as pd -from pandas.api.types import is_string_dtype as is_object +from pandas.api.types import is_object_dtype, is_string_dtype from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime from pandas.core.dtypes.common import is_numeric_dtype as is_numeric -def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: - # check for datetime only if object cannot be cast as numeric because - # if it could pd.to_datetime would convert it to datetime regardless - if is_object(column): - is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column) +def is_object(s) -> bool: + return is_object_dtype(s) or is_string_dtype(s) + +def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: + is_cat = False # check for datetime only if the type of the categories is not numeric # because pd.to_datetime throws an error when it is an integer - elif isinstance(column.dtype, pd.CategoricalDtype): + if isinstance(column.dtype, pd.CategoricalDtype): is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column) + # check for datetime only if object cannot be cast as numeric because + # if it could pd.to_datetime would convert it to datetime regardless + elif is_object(column): + is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column) + return is_cat @@ -26,7 +31,7 @@ def _is_convertible_to_dt(column: pd.Series) -> bool: try: var = pd.to_datetime(column, utc=True) return is_datetime(var) - except: + except Exception: return False @@ -39,16 +44,15 @@ def _is_convertible_to_num(column: pd.Series) -> bool: def _is_categorical_and_is_datetime(column: pd.Series) -> bool: - # check for datetime only if object cannot be cast as numeric because - # if it could pd.to_datetime would convert it to datetime regardless - if is_object(column): - is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column) - + is_dt = False # check for datetime only if the type of the categories is not numeric # because pd.to_datetime throws an error when it is an integer - elif isinstance(column.dtype, pd.CategoricalDtype): + if isinstance(column.dtype, pd.CategoricalDtype): is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column) - else: - is_dt = False + # check for datetime only if object cannot be cast as numeric because + # if it could pd.to_datetime would convert it to datetime regardless + elif is_object(column): + is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column) + return is_dt diff --git a/feature_engine/variable_handling/find_variables.py b/feature_engine/variable_handling/find_variables.py index 04779ad5d..72e17d9ef 100644 --- a/feature_engine/variable_handling/find_variables.py +++ b/feature_engine/variable_handling/find_variables.py @@ -5,11 +5,11 @@ import pandas as pd from pandas.api.types import is_datetime64_any_dtype as is_datetime from pandas.core.dtypes.common import is_numeric_dtype as is_numeric -from pandas.core.dtypes.common import is_object_dtype as is_object from feature_engine.variable_handling._variable_type_checks import ( _is_categorical_and_is_datetime, _is_categorical_and_is_not_datetime, + is_object, ) from feature_engine.variable_handling.dtypes import DATETIME_TYPES @@ -85,7 +85,9 @@ def find_categorical_variables(X: pd.DataFrame) -> List[Union[str, int]]: """ variables = [ column - for column in X.select_dtypes(include=["O", "category"]).columns + for column in X.select_dtypes( + include=["O", "category", "string"] + ).columns if _is_categorical_and_is_not_datetime(X[column]) ] if len(variables) == 0: @@ -254,7 +256,9 @@ def find_categorical_and_numerical_variables( if variables is None: variables_cat = [ column - for column in X.select_dtypes(include=["O", "category"]).columns + for column in X.select_dtypes( + include=["O", "category", "string"] + ).columns if _is_categorical_and_is_not_datetime(X[column]) ] # find numerical variables in dataset @@ -271,14 +275,14 @@ def find_categorical_and_numerical_variables( raise ValueError("The list of variables is empty.") # find categorical variables - variables_cat = [ - var for var in X[variables].select_dtypes(include=["O", "category"]).columns - ] + variables_cat = list( + X[variables].select_dtypes(include=["O", "category", "string"]).columns + ) # find numerical variables variables_num = list(X[variables].select_dtypes(include="number").columns) - if any([v for v in variables if v not in variables_cat + variables_num]): + if any(v for v in variables if v not in variables_cat + variables_num): raise TypeError( "Some of the variables are neither numerical nor categorical." ) diff --git a/tests/test_creation/test_math_features.py b/tests/test_creation/test_math_features.py index f65e932ee..6a5590019 100644 --- a/tests/test_creation/test_math_features.py +++ b/tests/test_creation/test_math_features.py @@ -237,7 +237,6 @@ def test_variable_names_when_df_cols_are_integers(df_numeric_columns): def test_error_when_null_values_in_variable(df_vartypes): - df_na = df_vartypes.copy() df_na.loc[1, "Age"] = np.nan @@ -256,7 +255,6 @@ def test_error_when_null_values_in_variable(df_vartypes): def test_no_error_when_null_values_in_variable(df_vartypes): - df_na = df_vartypes.copy() df_na.loc[1, "Age"] = np.nan @@ -323,7 +321,6 @@ def test_get_feature_names_out(_varnames, _drop, df_vartypes): @pytest.mark.parametrize("_varnames", [None, ["var1", "var2"]]) @pytest.mark.parametrize("_drop", [True, False]) def test_get_feature_names_out_from_pipeline(_varnames, _drop, df_vartypes): - # set up transformer transformer = MathFeatures( variables=["Age", "Marks"], diff --git a/tests/test_dataframe_checks.py b/tests/test_dataframe_checks.py index d38e7cd54..09cd22ccf 100644 --- a/tests/test_dataframe_checks.py +++ b/tests/test_dataframe_checks.py @@ -249,22 +249,43 @@ def test_optional_contains_na(df_na): def test_contains_inf(df_na): - df_na.fillna(np.inf, inplace=True) + # Test numeric column with inf + df_num_inf = pd.DataFrame({"A": [1.1, np.inf, 3.3]}) with pytest.raises(ValueError): - assert _check_contains_inf(df_na, ["Age", "Marks"]) + _check_contains_inf(df_num_inf, ["A"]) + + # Test numeric column WITHOUT inf + df_num_no_inf = pd.DataFrame({"A": [1.1, 2.2, 3.3]}) + _check_contains_inf(df_num_no_inf, ["A"]) def test_check_X_raises_error_on_duplicated_column_names(): df = pd.DataFrame( { - "col1": [1, 2, 3], - "col2": ["a", "b", "c"], - "col3": pd.date_range("2023-01-01", periods=3), + "Name": ["tom", "nick", "krish", "jack"], + "City": ["London", "Manchester", "Liverpool", "Bristol"], + "Age": [20, 21, 19, 18], + "Marks": [0.9, 0.8, 0.7, 0.6], } ) - df.columns = ["same", "unique", "same"] - + df.columns = ["var_A", "var_A", "var_B", "var_C"] with pytest.raises(ValueError) as err_txt: check_X(df) - assert err_txt.match("Input data contains duplicated variable names.") + + +def test_check_X_errors(): + # Test scalar array error (line 58) + with pytest.raises(ValueError) as record: + check_X(np.array(1)) + assert record.match("Expected 2D array, got scalar array instead") + + # Test 1D array error (line 65) + with pytest.raises(ValueError) as record: + check_X(np.array([1, 2, 3])) + assert record.match("Expected 2D array, got 1D array instead") + + # Test incorrect type error (line 80) + with pytest.raises(TypeError) as record: + check_X("not a dataframe") + assert record.match("X must be a numpy array or pandas dataframe") diff --git a/tests/test_datetime/test_datetime_features.py b/tests/test_datetime/test_datetime_features.py index 1d95ffe83..456f41e84 100644 --- a/tests/test_datetime/test_datetime_features.py +++ b/tests/test_datetime/test_datetime_features.py @@ -336,13 +336,13 @@ def test_extract_features_from_different_timezones(): ) exp_err_msg = ( "Tz-aware datetime.datetime cannot be converted to datetime64 " - "unless utc=True, at position 3" + "unless utc=True" ) with pytest.raises(ValueError) as errinfo: assert DatetimeFeatures( variables="time", features_to_extract=["hour"], utc=False ).fit_transform(df) - assert str(errinfo.value) == exp_err_msg + assert exp_err_msg in str(errinfo.value) def test_extract_features_from_different_timezones_when_string( diff --git a/tests/test_encoding/test_mean_encoder.py b/tests/test_encoding/test_mean_encoder.py index 1026936be..a13d0e5bf 100644 --- a/tests/test_encoding/test_mean_encoder.py +++ b/tests/test_encoding/test_mean_encoder.py @@ -183,10 +183,11 @@ def test_warning_if_transform_df_contains_categories_not_present_in_fit_df( encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"]) encoder.transform(df_enc_rare[["var_A", "var_B"]]) - # check that only one warning was raised - assert len(record) == 1 + # check that at least one warning was raised (Pandas 3 may emit additional + # deprecation warnings) + assert len(record) >= 1 # check that the message matches - assert record[0].message.args[0] == msg + assert any(r.message.args[0] == msg for r in record) # check for error when rare_labels equals 'raise' with pytest.raises(ValueError) as record: @@ -364,7 +365,7 @@ def test_variables_cast_as_category(df_enc_category_dtypes): ] pd.testing.assert_frame_equal(X, transf_df[["var_A", "var_B"]], check_dtype=False) - assert X["var_A"].dtypes == float + assert X["var_A"].dtypes.name == "float64" def test_auto_smoothing(df_enc): diff --git a/tests/test_encoding/test_ordinal_encoder.py b/tests/test_encoding/test_ordinal_encoder.py index ae7705643..e447c4176 100644 --- a/tests/test_encoding/test_ordinal_encoder.py +++ b/tests/test_encoding/test_ordinal_encoder.py @@ -138,10 +138,11 @@ def test_error_if_input_df_contains_categories_not_present_in_training_df( encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"]) encoder.transform(df_enc_rare[["var_A", "var_B"]]) - # check that only one warning was raised - assert len(record) == 1 + # check that at least one warning was raised (Pandas 3 may emit additional + # deprecation warnings) + assert len(record) >= 1 # check that the message matches - assert record[0].message.args[0] == msg + assert any(r.message.args[0] == msg for r in record) # check for error when rare_labels equals 'raise' with pytest.raises(ValueError) as record: @@ -243,7 +244,7 @@ def test_variables_cast_as_category(df_enc_category_dtypes): # test transform output pd.testing.assert_frame_equal(X, transf_df[["var_A", "var_B"]], check_dtype=False) - assert X["var_A"].dtypes == int + assert X["var_A"].dtypes.name == "int64" @pytest.mark.parametrize( diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py index 3e74b3717..f32ac3823 100644 --- a/tests/test_encoding/test_similarity_encoder.py +++ b/tests/test_encoding/test_similarity_encoder.py @@ -150,6 +150,30 @@ def test_nan_behaviour_ignore(df_enc_big_na): } +def test_string_dtype_with_pd_na(): + # Test StringDtype with pd.NA to hit "" branch in transform + df = pd.DataFrame({"var_A": ["A", "B", pd.NA]}, dtype="string") + encoder = StringSimilarityEncoder(missing_values="impute") + X = encoder.fit_transform(df) + assert (X.isna().sum() == 0).all(axis=None) + # The categories will include "" or the string version of it + assert ( + "" in encoder.encoder_dict_["var_A"] + or "" in encoder.encoder_dict_["var_A"] + ) + + +def test_string_dtype_with_literal_nan_strings(): + # Test with literal "nan" and "" strings to hit skips in + # transform (line 339, 341 False) + df = pd.DataFrame({"var_A": ["nan", "", "A", "B"]}, dtype="string") + encoder = StringSimilarityEncoder(missing_values="impute") + X = encoder.fit_transform(df) + assert (X.isna().sum() == 0).all(axis=None) + assert "nan" in encoder.encoder_dict_["var_A"] + assert "" in encoder.encoder_dict_["var_A"] + + def test_inverse_transform_error(df_enc_big): encoder = StringSimilarityEncoder() X = encoder.fit_transform(df_enc_big) @@ -237,6 +261,7 @@ def test_get_feature_names_out_na(df_enc_big_na): "var_C_F", ] + # NaN values are replaced with empty string "" before string conversion assert tr.encoder_dict_ == { "var_A": ["B", "D", "G", "A", "C", "E", "F", ""], "var_B": ["A", "D", "B", "G", "C", "E", "F"], diff --git a/tests/test_encoding/test_woe/test_woe_encoder.py b/tests/test_encoding/test_woe/test_woe_encoder.py index 44181c5d7..a38caa6fa 100644 --- a/tests/test_encoding/test_woe/test_woe_encoder.py +++ b/tests/test_encoding/test_woe/test_woe_encoder.py @@ -149,10 +149,11 @@ def test_warn_if_transform_df_contains_categories_not_seen_in_fit(df_enc, df_enc encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"]) encoder.transform(df_enc_rare[["var_A", "var_B"]]) - # check that only one warning was raised - assert len(record) == 1 + # check that at least one warning was raised (Pandas 3 may emit additional + # deprecation warnings) + assert len(record) >= 1 # check that the message matches - assert record[0].message.args[0] == msg + assert any(r.message.args[0] == msg for r in record) # check for error when rare_labels equals 'raise' with pytest.raises(ValueError) as record: @@ -389,7 +390,7 @@ def test_variables_cast_as_category(df_enc_category_dtypes): transf_df["var_B"] = VAR_B pd.testing.assert_frame_equal(X, transf_df[["var_A", "var_B"]], check_dtype=False) - assert X["var_A"].dtypes == float + assert X["var_A"].dtypes.name == "float64" @pytest.mark.parametrize( diff --git a/tests/test_preprocessing/test_match_columns.py b/tests/test_preprocessing/test_match_columns.py index 16ee0633d..6726b33f9 100644 --- a/tests/test_preprocessing/test_match_columns.py +++ b/tests/test_preprocessing/test_match_columns.py @@ -189,7 +189,11 @@ def test_match_dtypes_string_to_datetime(df_vartypes): assert match_columns.match_dtypes is True assert match_columns.verbose is False # test fit attrs - assert match_columns.dtype_dict_ == {"dob": np.dtype("= "3": + assert match_columns.dtype_dict_ == {"dob": np.dtype("= "3": + # Pandas 3 uses microseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] + else: + # Pandas 2 uses nanoseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] pd.testing.assert_frame_equal(ref, transformed_df) @@ -371,6 +392,27 @@ def test_sklearn_ohe_object_many_features(df_vartypes): transformed_df = transformer.fit_transform(df_vartypes[variables_to_encode]) + # TODO: Remove pandas < 3 support when dropping older pandas versions + if pd.__version__ >= "3": + # Pandas 3 uses microseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] + else: + # Pandas 2 uses nanoseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] pd.testing.assert_frame_equal(ref, transformed_df) @@ -393,6 +435,27 @@ def test_sklearn_ohe_numeric(df_vartypes): transformed_df = transformer.fit_transform(df_vartypes[variables_to_encode]) + # TODO: Remove pandas < 3 support when dropping older pandas versions + if pd.__version__ >= "3": + # Pandas 3 uses microseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] + else: + # Pandas 2 uses nanoseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] pd.testing.assert_frame_equal(ref, transformed_df) @@ -428,6 +491,27 @@ def test_sklearn_ohe_all_features(df_vartypes): transformed_df = transformer.fit_transform(df_vartypes) + # TODO: Remove pandas < 3 support when dropping older pandas versions + if pd.__version__ >= "3": + # Pandas 3 uses microseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] + else: + # Pandas 2 uses nanoseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] pd.testing.assert_frame_equal(ref, transformed_df) @@ -466,7 +550,7 @@ def test_sklearn_ohe_with_crossvalidation(): results: np.ndarray = cross_val_score( pipeline, X, y, scoring="neg_mean_squared_error", cv=3 ) - assert not any([np.isnan(i) for i in results]) + assert not any(np.isnan(i) for i in results) def test_wrap_one_hot_encoder_get_features_name_out(df_vartypes): @@ -496,7 +580,28 @@ def test_wrap_one_hot_encoder_get_features_name_out(df_vartypes): "dob_2020-02-24T00:03:00.000000000", ] - assert ohe_wrap.get_feature_names_out() == expected_features_all + # TODO: Remove pandas < 3 support when dropping older pandas versions + if pd.__version__ >= "3": + # Pandas 3 uses microseconds format + actual_features = [ + f.replace(".000000000", "").replace(".000000", "") + for f in ohe_wrap.get_feature_names_out() + ] + expected_features = [ + f.replace(".000000000", "").replace(".000000", "") + for f in expected_features_all + ] + else: + # Pandas 2 uses nanoseconds format + actual_features = [ + f.replace(".000000000", "").replace(".000000", "") + for f in ohe_wrap.get_feature_names_out() + ] + expected_features = [ + f.replace(".000000000", "").replace(".000000", "") + for f in expected_features_all + ] + assert actual_features == expected_features @pytest.mark.parametrize( From 6099193b3c3e209a71b74900057e9ce3a913d16f Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 16:05:35 -0500 Subject: [PATCH 06/24] remove extra learned parameter, pass function to transform --- feature_engine/creation/math_features.py | 37 +++++++----------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/feature_engine/creation/math_features.py b/feature_engine/creation/math_features.py index 5537c876f..5ea0d0e5b 100644 --- a/feature_engine/creation/math_features.py +++ b/feature_engine/creation/math_features.py @@ -186,9 +186,9 @@ def __init__( self.func = func self.new_variables_names = new_variables_names - def _map_unnamed_func_to_str(self, func: Any) -> Any: + def _map_numpy_func_to_str(self, func: Any) -> Any: if isinstance(func, list): - return [self._map_unnamed_func_to_str(f) for f in func] + return [self._map_numpy_func_to_str(f) for f in func] # We map certain numpy functions to their string alias. # This serves two purposes: @@ -207,23 +207,6 @@ def _map_unnamed_func_to_str(self, func: Any) -> Any: } return map_dict.get(func, func) - def fit(self, X: pd.DataFrame, y=None): - """ - This method does not learn any parameters. It just stores the normalized - function representation. - - Parameters - ---------- - X: pandas dataframe of shape = [n_samples, n_features] - The training input samples. - - y: pandas Series, or np.array. Defaults to None. - It is not needed in this transformer. You can pass y or None. - """ - super().fit(X, y) - # Normalize func to func_ (sklearn convention: don't modify init params) - self.func_ = self._map_unnamed_func_to_str(self.func) - return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ @@ -241,12 +224,14 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ X = self._check_transform_input_and_state(X) + func_ = self._map_numpy_func_to_str(self.func) + new_variable_names = self._get_new_features_name() if len(new_variable_names) == 1: - X[new_variable_names[0]] = X[self.variables].agg(self.func_, axis=1) + X[new_variable_names[0]] = X[self.variables].agg(func_, axis=1) else: - X[new_variable_names] = X[self.variables].agg(self.func_, axis=1) + X[new_variable_names] = X[self.variables].agg(func_, axis=1) if self.drop_original: X.drop(columns=self.variables, inplace=True) @@ -262,15 +247,13 @@ def _get_new_features_name(self) -> List: else: varlist = [f"{var}" for var in self.variables_] + func_ = self._map_numpy_func_to_str(self.func) - if isinstance(self.func_, list): - functions = [ - fun if type(fun) is str else fun.__name__ for fun in self.func_ - ] + if isinstance(func_, list): feature_names = [ - f"{function}_{'_'.join(varlist)}" for function in functions + f"{function}_{'_'.join(varlist)}" for function in func_ ] else: - feature_names = [f"{self.func_}_{'_'.join(varlist)}"] + feature_names = [f"{func_}_{'_'.join(varlist)}"] return feature_names From 8c7b35ee2c839c7c016f7ffdd8f991171d446d1e Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 16:23:30 -0500 Subject: [PATCH 07/24] rolled back mapping function --- feature_engine/creation/math_features.py | 39 +++++------------------- 1 file changed, 8 insertions(+), 31 deletions(-) diff --git a/feature_engine/creation/math_features.py b/feature_engine/creation/math_features.py index 5ea0d0e5b..19bfded91 100644 --- a/feature_engine/creation/math_features.py +++ b/feature_engine/creation/math_features.py @@ -1,6 +1,5 @@ from typing import Any, List, Optional, Union -import numpy as np import pandas as pd from feature_engine._docstrings.fit_attributes import ( @@ -186,28 +185,6 @@ def __init__( self.func = func self.new_variables_names = new_variables_names - def _map_numpy_func_to_str(self, func: Any) -> Any: - if isinstance(func, list): - return [self._map_numpy_func_to_str(f) for f in func] - - # We map certain numpy functions to their string alias. - # This serves two purposes: - # 1) It avoids a FutureWarning in pandas 2.1+ which recommends - # using the string alias for better performance and future-proofing. - # 2) It ensures consistent column naming (e.g. "sum_x1_x2") - # regardless of how the function was passed (np.sum vs "sum"). - map_dict = { - np.sum: "sum", - np.mean: "mean", - np.std: "std", - np.min: "min", - np.max: "max", - np.median: "median", - np.prod: "prod", - } - return map_dict.get(func, func) - - def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ Create and add new variables. @@ -224,14 +201,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ X = self._check_transform_input_and_state(X) - func_ = self._map_numpy_func_to_str(self.func) - new_variable_names = self._get_new_features_name() if len(new_variable_names) == 1: - X[new_variable_names[0]] = X[self.variables].agg(func_, axis=1) + X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1) else: - X[new_variable_names] = X[self.variables].agg(func_, axis=1) + X[new_variable_names] = X[self.variables].agg(self.func, axis=1) if self.drop_original: X.drop(columns=self.variables, inplace=True) @@ -247,13 +222,15 @@ def _get_new_features_name(self) -> List: else: varlist = [f"{var}" for var in self.variables_] - func_ = self._map_numpy_func_to_str(self.func) - if isinstance(func_, list): + if isinstance(self.func, list): + functions = [ + fun if type(fun) is str else fun.__name__ for fun in self.func + ] feature_names = [ - f"{function}_{'_'.join(varlist)}" for function in func_ + f"{function}_{'_'.join(varlist)}" for function in functions ] else: - feature_names = [f"{func_}_{'_'.join(varlist)}"] + feature_names = [f"{self.func}_{'_'.join(varlist)}"] return feature_names From ff451615fcd8e595ea648c5720245f6031c8f4c6 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 17:01:42 -0500 Subject: [PATCH 08/24] refactor creation of array of nan --- feature_engine/encoding/similarity_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/encoding/similarity_encoder.py b/feature_engine/encoding/similarity_encoder.py index 2599d2f91..b4dd91f99 100644 --- a/feature_engine/encoding/similarity_encoder.py +++ b/feature_engine/encoding/similarity_encoder.py @@ -328,7 +328,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: } # Ensure map result is always an array of the correct size. # Missing values in categories or unknown categories will map to NaN. - default_nan = [np.nan] * len(self.encoder_dict_[var]) + default_nan = np.full(len(self.encoder_dict_[var]), np.nan) if "nan" not in column_encoder_dict: column_encoder_dict["nan"] = default_nan if "" not in column_encoder_dict: From 2a6775d2dd10185b64410b3405e0b7e3cd7124c8 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 17:40:31 -0500 Subject: [PATCH 09/24] expand test to cover different expressions of nan values --- .../test_encoding/test_similarity_encoder.py | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py index f32ac3823..f31889dfe 100644 --- a/tests/test_encoding/test_similarity_encoder.py +++ b/tests/test_encoding/test_similarity_encoder.py @@ -1,5 +1,6 @@ from difflib import SequenceMatcher +import numpy as np import pandas as pd import pytest @@ -115,9 +116,14 @@ def test_nan_behaviour_error_fit(df_enc_big_na): assert str(record.value) == msg -def test_nan_behaviour_error_transform(df_enc_big, df_enc_big_na): +@pytest.mark.parametrize("nan_value", [np.nan, pd.NA, None]) +def test_nan_behaviour_error_transform(df_enc_big, nan_value): encoder = StringSimilarityEncoder(missing_values="raise") encoder.fit(df_enc_big) + + df_enc_big_na = df_enc_big.copy() + df_enc_big_na.loc[0, "var_A"] = nan_value + with pytest.raises(ValueError) as record: encoder.transform(df_enc_big_na) msg = ( @@ -128,9 +134,15 @@ def test_nan_behaviour_error_transform(df_enc_big, df_enc_big_na): assert str(record.value) == msg -def test_nan_behaviour_impute(df_enc_big_na): +@pytest.mark.parametrize("nan_value", [np.nan, pd.NA, None]) +def test_nan_behaviour_impute(df_enc_big, nan_value): + + df_enc_big_na = df_enc_big.copy() + df_enc_big_na.loc[0, "var_A"] = nan_value + encoder = StringSimilarityEncoder(missing_values="impute") X = encoder.fit_transform(df_enc_big_na) + assert (X.isna().sum() == 0).all(axis=None) assert encoder.encoder_dict_ == { "var_A": ["B", "D", "G", "A", "C", "E", "F", ""], @@ -139,15 +151,27 @@ def test_nan_behaviour_impute(df_enc_big_na): } -def test_nan_behaviour_ignore(df_enc_big_na): +@pytest.mark.parametrize("nan_value", [np.nan, pd.NA, None]) +def test_nan_behaviour_ignore(df_enc_big, nan_value): + df_enc_big_na = df_enc_big.copy() + df_enc_big_na.loc[0, "var_A"] = nan_value + encoder = StringSimilarityEncoder(missing_values="ignore") X = encoder.fit_transform(df_enc_big_na) assert (X.isna().any(axis=1) == df_enc_big_na.isna().any(axis=1)).all() - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } + if nan_value is not None: + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } + else: + # Note that None is converted to a string and not treated as nan value + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F", "None"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } def test_string_dtype_with_pd_na(): @@ -157,10 +181,7 @@ def test_string_dtype_with_pd_na(): X = encoder.fit_transform(df) assert (X.isna().sum() == 0).all(axis=None) # The categories will include "" or the string version of it - assert ( - "" in encoder.encoder_dict_["var_A"] - or "" in encoder.encoder_dict_["var_A"] - ) + assert "" in encoder.encoder_dict_["var_A"] def test_string_dtype_with_literal_nan_strings(): From 1f526cf471ead71e29eb274a8f7cbff0824ddea4 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 17:52:42 -0500 Subject: [PATCH 10/24] refactor match columns update --- feature_engine/preprocessing/match_columns.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/feature_engine/preprocessing/match_columns.py b/feature_engine/preprocessing/match_columns.py index da34f5e9c..41bd70660 100644 --- a/feature_engine/preprocessing/match_columns.py +++ b/feature_engine/preprocessing/match_columns.py @@ -262,11 +262,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X = X.drop(_columns_to_drop, axis=1) - # Add missing columns one at a time to avoid Pandas 3 StringDtype reindex issue - for col in _columns_to_add: - X[col] = self.fill_value - - # Reorder columns to match training set, without fill_value to avoid issues + # Add missing columns first and then reorder to avoid + # Pandas 3 StringDtype reindex issue (before we used reindex) + X[_columns_to_add] = self.fill_value X = X[self.feature_names_in_] if self.match_dtypes: From 9368b17ae257adf3541c581e6dd54bd669d91d50 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 17:57:04 -0500 Subject: [PATCH 11/24] refactor code variable checks --- feature_engine/variable_handling/_variable_type_checks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/feature_engine/variable_handling/_variable_type_checks.py b/feature_engine/variable_handling/_variable_type_checks.py index 3427c60be..17eb4e41d 100644 --- a/feature_engine/variable_handling/_variable_type_checks.py +++ b/feature_engine/variable_handling/_variable_type_checks.py @@ -9,7 +9,6 @@ def is_object(s) -> bool: def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: - is_cat = False # check for datetime only if the type of the categories is not numeric # because pd.to_datetime throws an error when it is an integer if isinstance(column.dtype, pd.CategoricalDtype): @@ -20,6 +19,9 @@ def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool: elif is_object(column): is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column) + else: + is_cat = False + return is_cat @@ -44,7 +46,6 @@ def _is_convertible_to_num(column: pd.Series) -> bool: def _is_categorical_and_is_datetime(column: pd.Series) -> bool: - is_dt = False # check for datetime only if the type of the categories is not numeric # because pd.to_datetime throws an error when it is an integer if isinstance(column.dtype, pd.CategoricalDtype): @@ -55,4 +56,7 @@ def _is_categorical_and_is_datetime(column: pd.Series) -> bool: elif is_object(column): is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column) + else: + is_dt = False + return is_dt From 564a84f3e4c36b37c6f936f40fdea0a165e29a8d Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 18:13:17 -0500 Subject: [PATCH 12/24] refactor inf tests --- tests/test_dataframe_checks.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/test_dataframe_checks.py b/tests/test_dataframe_checks.py index 09cd22ccf..3b43c0b5d 100644 --- a/tests/test_dataframe_checks.py +++ b/tests/test_dataframe_checks.py @@ -248,15 +248,19 @@ def test_optional_contains_na(df_na): assert str(record.value) == msg -def test_contains_inf(df_na): - # Test numeric column with inf - df_num_inf = pd.DataFrame({"A": [1.1, np.inf, 3.3]}) - with pytest.raises(ValueError): - _check_contains_inf(df_num_inf, ["A"]) +def test_contains_inf_raises_on_inf(): + msg = ( + "Some of the variables to transform contain inf values. Check and " + "remove those before using this transformer." + ) + df = pd.DataFrame({"A": [1.1, np.inf, 3.3]}) + with pytest.raises(ValueError, match=msg): + _check_contains_inf(df, ["A"]) + - # Test numeric column WITHOUT inf - df_num_no_inf = pd.DataFrame({"A": [1.1, 2.2, 3.3]}) - _check_contains_inf(df_num_no_inf, ["A"]) +def test_contains_inf_passes_without_inf(): + df = pd.DataFrame({"A": [1.1, 2.2, 3.3]}) + assert _check_contains_inf(df, ["A"]) is None def test_check_X_raises_error_on_duplicated_column_names(): From 25414101baf10d7548c561df5f503e6d3d9f765f Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 18:31:30 -0500 Subject: [PATCH 13/24] split test by pandas version --- .../test_encoding/test_similarity_encoder.py | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py index f31889dfe..672efe0b7 100644 --- a/tests/test_encoding/test_similarity_encoder.py +++ b/tests/test_encoding/test_similarity_encoder.py @@ -159,19 +159,27 @@ def test_nan_behaviour_ignore(df_enc_big, nan_value): encoder = StringSimilarityEncoder(missing_values="ignore") X = encoder.fit_transform(df_enc_big_na) assert (X.isna().any(axis=1) == df_enc_big_na.isna().any(axis=1)).all() - if nan_value is not None: + # TODO: Remove pandas < 3 support when dropping older pandas versions + if pd.__version__ >= "3": assert encoder.encoder_dict_ == { "var_A": ["B", "D", "G", "A", "C", "E", "F"], "var_B": ["A", "D", "B", "G", "C", "E", "F"], "var_C": ["C", "D", "B", "G", "A", "E", "F"], } else: - # Note that None is converted to a string and not treated as nan value - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F", "None"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } + if nan_value is not None: + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } + else: + # Note that None is converted to a string and not treated as nan value + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F", "None"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } def test_string_dtype_with_pd_na(): From b57a60060354447f3ac69bdd379e99dbf11a4202 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 18:41:24 -0500 Subject: [PATCH 14/24] solve additional errors specific to pandas 3 --- tests/test_creation/test_math_features.py | 7 +++++++ tests/test_encoding/test_similarity_encoder.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/tests/test_creation/test_math_features.py b/tests/test_creation/test_math_features.py index 6a5590019..6e16821be 100644 --- a/tests/test_creation/test_math_features.py +++ b/tests/test_creation/test_math_features.py @@ -136,6 +136,13 @@ def test_aggregations_with_functions(df_vartypes): } ) + # TODO: Remove pandas < 3 support when dropping older pandas versions + # In pandas >=3, when the user passes np.std, agg will use numpy. + # In pandas <3, when the user passes np.std, agg will use pd.std. + # Hence the difference in results + if pd.__version__ >= "3": + ref["std_Age_Marks"] = np.std(df_vartypes[["Age", "Marks"]], axis=1) + # transform params pd.testing.assert_frame_equal(X, ref) diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py index 672efe0b7..49f809038 100644 --- a/tests/test_encoding/test_similarity_encoder.py +++ b/tests/test_encoding/test_similarity_encoder.py @@ -174,6 +174,11 @@ def test_nan_behaviour_ignore(df_enc_big, nan_value): "var_C": ["C", "D", "B", "G", "A", "E", "F"], } else: + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } # Note that None is converted to a string and not treated as nan value assert encoder.encoder_dict_ == { "var_A": ["B", "D", "G", "A", "C", "E", "F", "None"], From 75fde70e5354af5fa4ab76b0dbc2ec5a12c82785 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 20:06:56 -0500 Subject: [PATCH 15/24] add pandas version tests to tox.ini --- tox.ini | 87 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 13 deletions(-) diff --git a/tox.ini b/tox.ini index e55e03a47..096cf88f4 100644 --- a/tox.ini +++ b/tox.ini @@ -1,16 +1,36 @@ [tox] -envlist = py39, py310, py311-sklearn150, py311-sklearn160, py311-sklearn170, py312, py313, codecov, docs, stylechecks, typechecks +envlist = + py39 + py310 + py311-sklearn150 + py311-sklearn160 + py311-sklearn170 + py311-pandas230 + py312 + py313 + codecov + docs + stylechecks + typechecks skipsdist = true + [testenv] -install_command = pip install {opts} {packages} envdir = {toxworkdir}/unit_tests +install_command = pip install {opts} {packages} + setenv = - PYTHONPATH=. + PYTHONPATH = . COVERAGE_RCFILE = {envtmpdir}/coveragerc + commands = pytest tests + +# ------------------------- +# Python versions +# ------------------------- + [testenv:py39] deps = .[tests] @@ -19,6 +39,19 @@ deps = deps = .[tests] +[testenv:py312] +deps = + .[tests] + +[testenv:py313] +deps = + .[tests] + + +# ------------------------- +# scikit-learn matrix +# ------------------------- + [testenv:py311-sklearn150] deps = .[tests] @@ -34,45 +67,73 @@ deps = .[tests] scikit-learn==1.7.1 -[testenv:py312] -deps = - .[tests] -[testenv:py313] +[testenv:py311-pandas230] deps = .[tests] + pandas==2.3.0 + + +# ------------------------- +# Coverage +# ------------------------- [testenv:codecov] deps = .[tests] + commands_pre = {envpython} -c 'from pathlib import Path; Path(r"{env:COVERAGE_RCFILE}").write_text(Path(".coveragerc").read_text())' + commands = coverage run -m pytest -v coverage report + +# ------------------------- +# Docs +# ------------------------- + [testenv:docs] deps = .[docs] + commands = sphinx-build -W -b html -d {envtmpdir}/doctrees docs {envtmpdir}/html + +# ------------------------- +# Linting & typing +# ------------------------- + [testenv:stylechecks] deps = flake8 -commands = {posargs:flake8 feature_engine tests} + +commands = + {posargs:flake8 feature_engine tests} [testenv:typechecks] deps = - mypy -commands = {posargs:mypy feature_engine} + mypy + +commands = + {posargs:mypy feature_engine} + + +# ------------------------- +# flake8 configuration +# ------------------------- [flake8] -exclude = .git, env -# match black code formatter +exclude = + .git + env + +# Match Black max-line-length = 88 profile = black line_length = 88 lines_between_sections = 1 -known_first_party = "sentry" \ No newline at end of file +known_first_party = sentry From 3488426b60572b0a543bcc304bd8405def7e93ad Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 20:10:37 -0500 Subject: [PATCH 16/24] move versioned tests to python 12 --- tox.ini | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tox.ini b/tox.ini index 096cf88f4..b09a57c6c 100644 --- a/tox.ini +++ b/tox.ini @@ -5,8 +5,8 @@ envlist = py311-sklearn150 py311-sklearn160 py311-sklearn170 - py311-pandas230 - py312 + py312-pandas230 + py312-pandas300 py313 codecov docs @@ -39,10 +39,6 @@ deps = deps = .[tests] -[testenv:py312] -deps = - .[tests] - [testenv:py313] deps = .[tests] @@ -68,11 +64,16 @@ deps = scikit-learn==1.7.1 -[testenv:py311-pandas230] +[testenv:py312-pandas230] deps = .[tests] pandas==2.3.0 +[testenv:py312-pandas300] +deps = + .[tests] + pandas==3.0.0 + # ------------------------- # Coverage From 8758cdf3991fe43ff8249421d7b184f7f6ee7262 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 20:14:51 -0500 Subject: [PATCH 17/24] add tests to circleci config --- .circleci/config.yml | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index cbc578a99..11335e92d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -90,7 +90,7 @@ jobs: command: | tox -e py311-sklearn170 - test_feature_engine_py312: + test_feature_engine_py312_pandas230: docker: - image: cimg/python:3.12.1 working_directory: ~/project @@ -101,7 +101,20 @@ jobs: - run: name: Run tests with Python 3.12 command: | - tox -e py312 + tox -e py312-pandas230 + + test_feature_engine_py312_pandas300: + docker: + - image: cimg/python:3.12.1 + working_directory: ~/project + steps: + - checkout: + path: ~/project + - *prepare_tox + - run: + name: Run tests with Python 3.12 + command: | + tox -e py312-pandas300 test_feature_engine_py313: docker: @@ -197,7 +210,8 @@ workflows: - test_feature_engine_py311_sklearn150 - test_feature_engine_py311_sklearn160 - test_feature_engine_py311_sklearn170 - - test_feature_engine_py312 + - test_feature_engine_py312_pandas230 + - test_feature_engine_py312_pandas300 - test_feature_engine_py313 - test_style - test_docs @@ -214,7 +228,8 @@ workflows: - test_feature_engine_py311_sklearn150 - test_feature_engine_py311_sklearn160 - test_feature_engine_py311_sklearn170 - - test_feature_engine_py312 + - test_feature_engine_py312_pandas230 + - test_feature_engine_py312_pandas300 - test_feature_engine_py313 - test_style - test_docs From a7ab1a1309de0efbab49a01e6d468dca28221ed4 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 20:23:31 -0500 Subject: [PATCH 18/24] reformat circleci config" --- .circleci/config.yml | 67 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 11335e92d..4df23b66b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,7 +1,13 @@ version: 2.1 + orbs: codecov: codecov/codecov@3.2.3 + +# -------------------------------------------------- +# Anchors & defaults +# -------------------------------------------------- + defaults: &defaults docker: - image: cimg/python:3.10.0 @@ -17,14 +23,24 @@ prepare_tox: &prepare_tox init_pypirc: &init_pypirc run: - name: init .pypirc + name: Init .pypirc command: | echo -e "[pypi]" >> ~/.pypirc echo -e "repository = $FE_PYPI_URL" >> ~/.pypirc echo -e "username = $FE_PYPI_USER" >> ~/.pypirc echo -e "password = $FE_PYPI_API_KEY" >> ~/.pypirc + +# -------------------------------------------------- +# Jobs +# -------------------------------------------------- + jobs: + + # ------------------------ + # Test matrix + # ------------------------ + test_feature_engine_py39: docker: - image: cimg/python:3.9.0 @@ -34,7 +50,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.9 + name: Run tests (Python 3.9) command: | tox -e py39 @@ -47,7 +63,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.10 + name: Run tests (Python 3.10) command: | tox -e py310 @@ -60,7 +76,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.11 and scikit-learn 1.5.0 + name: Run tests (Python 3.11, scikit-learn 1.5) command: | tox -e py311-sklearn150 @@ -73,7 +89,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.11 and scikit-learn 1.6.0 + name: Run tests (Python 3.11, scikit-learn 1.6) command: | tox -e py311-sklearn160 @@ -86,7 +102,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.11 and scikit-learn 1.7.0 + name: Run tests (Python 3.11, scikit-learn 1.7) command: | tox -e py311-sklearn170 @@ -99,7 +115,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.12 + name: Run tests (Python 3.12, pandas 2.3) command: | tox -e py312-pandas230 @@ -112,7 +128,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.12 + name: Run tests (Python 3.12, pandas 3.0) command: | tox -e py312-pandas300 @@ -125,10 +141,15 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run tests with Python 3.13 + name: Run tests (Python 3.13) command: | tox -e py313 + + # ------------------------ + # Quality checks + # ------------------------ + test_style: docker: - image: cimg/python:3.10.0 @@ -138,7 +159,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run stylechecks + name: Run style checks command: | tox -e stylechecks @@ -151,7 +172,7 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run doc build + name: Build documentation command: | tox -e docs @@ -164,10 +185,15 @@ jobs: path: ~/project - *prepare_tox - run: - name: Run typechecks + name: Run type checks command: | tox -e typechecks + + # ------------------------ + # Coverage + # ------------------------ + upload_codecov: docker: - image: cimg/python:3.10.0 @@ -184,13 +210,18 @@ jobs: coverage report - codecov/upload + + # ------------------------ + # Release + # ------------------------ + package_and_upload_to_pypi: <<: *defaults steps: - checkout - *init_pypirc - run: - name: upload to pypi + name: Build and upload package command: | python -m venv env source env/bin/activate @@ -201,8 +232,14 @@ jobs: ls -l dist twine upload dist/* + +# -------------------------------------------------- +# Workflows +# -------------------------------------------------- + workflows: version: 2 + test-all: jobs: - test_feature_engine_py39 @@ -216,11 +253,13 @@ workflows: - test_style - test_docs - test_type + - upload_codecov: filters: branches: ignore: - 1.9.X + - package_and_upload_to_pypi: requires: - test_feature_engine_py39 @@ -237,4 +276,4 @@ workflows: filters: branches: only: - - 1.9.X \ No newline at end of file + - 1.9.X From c272f86fc82e348da18e0d834fb5ab0ddf34ee57 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 20:24:02 -0500 Subject: [PATCH 19/24] test revering pandas version on None --- .../test_encoding/test_similarity_encoder.py | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py index 49f809038..b33ef4f00 100644 --- a/tests/test_encoding/test_similarity_encoder.py +++ b/tests/test_encoding/test_similarity_encoder.py @@ -167,24 +167,12 @@ def test_nan_behaviour_ignore(df_enc_big, nan_value): "var_C": ["C", "D", "B", "G", "A", "E", "F"], } else: - if nan_value is not None: - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } - else: - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } - # Note that None is converted to a string and not treated as nan value - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F", "None"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } + + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } def test_string_dtype_with_pd_na(): From 2c0c51c7bc9813cdd9d18ba03014be47393f8668 Mon Sep 17 00:00:00 2001 From: solegalli Date: Fri, 6 Feb 2026 20:42:11 -0500 Subject: [PATCH 20/24] move dropna a level up when missing values ignore in string similarity --- feature_engine/encoding/similarity_encoder.py | 2 +- .../test_encoding/test_similarity_encoder.py | 19 +++++-------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/feature_engine/encoding/similarity_encoder.py b/feature_engine/encoding/similarity_encoder.py index b4dd91f99..b6aa1b249 100644 --- a/feature_engine/encoding/similarity_encoder.py +++ b/feature_engine/encoding/similarity_encoder.py @@ -275,9 +275,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): for var in cols_to_iterate: self.encoder_dict_[var] = ( X[var] + .dropna() .astype(str) .value_counts(dropna=True) - .drop(["nan", ""], errors="ignore") .head(self.top_categories) .index.tolist() ) diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py index b33ef4f00..09c17443b 100644 --- a/tests/test_encoding/test_similarity_encoder.py +++ b/tests/test_encoding/test_similarity_encoder.py @@ -159,20 +159,11 @@ def test_nan_behaviour_ignore(df_enc_big, nan_value): encoder = StringSimilarityEncoder(missing_values="ignore") X = encoder.fit_transform(df_enc_big_na) assert (X.isna().any(axis=1) == df_enc_big_na.isna().any(axis=1)).all() - # TODO: Remove pandas < 3 support when dropping older pandas versions - if pd.__version__ >= "3": - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } - else: - - assert encoder.encoder_dict_ == { - "var_A": ["B", "D", "G", "A", "C", "E", "F"], - "var_B": ["A", "D", "B", "G", "C", "E", "F"], - "var_C": ["C", "D", "B", "G", "A", "E", "F"], - } + assert encoder.encoder_dict_ == { + "var_A": ["B", "D", "G", "A", "C", "E", "F"], + "var_B": ["A", "D", "B", "G", "C", "E", "F"], + "var_C": ["C", "D", "B", "G", "A", "E", "F"], + } def test_string_dtype_with_pd_na(): From 1f8e4e64066dbe24efc03c057a9af715a2ee9eba Mon Sep 17 00:00:00 2001 From: solegalli Date: Sat, 7 Feb 2026 06:48:36 -0500 Subject: [PATCH 21/24] refactor condition logic for codecoverage --- feature_engine/creation/math_features.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/feature_engine/creation/math_features.py b/feature_engine/creation/math_features.py index 19bfded91..368ac6a73 100644 --- a/feature_engine/creation/math_features.py +++ b/feature_engine/creation/math_features.py @@ -167,13 +167,12 @@ def __init__( ) if new_variables_names is not None: - if isinstance(func, list): - if len(new_variables_names) != len(func): - raise ValueError( - "The number of new feature names must coincide with the number " - "of functions." - ) - elif len(new_variables_names) != 1: + if isinstance(new_variables_names, list): + expected = len(func) + else: + expected = 1 + + if len(new_variables_names) != expected: raise ValueError( "The number of new feature names must coincide with the number " "of functions." From 9406e1b04cdcd80ef4ef7d700d23fb6c0829e184 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sat, 7 Feb 2026 07:15:24 -0500 Subject: [PATCH 22/24] fix decreased coverage --- .../variable_handling/find_variables.py | 15 +++++++-------- tests/test_creation/test_math_features.py | 6 ++++++ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/feature_engine/variable_handling/find_variables.py b/feature_engine/variable_handling/find_variables.py index 72e17d9ef..6162fdb43 100644 --- a/feature_engine/variable_handling/find_variables.py +++ b/feature_engine/variable_handling/find_variables.py @@ -253,14 +253,13 @@ def find_categorical_and_numerical_variables( # If user leaves default None parameter. elif variables is None: # find categorical variables - if variables is None: - variables_cat = [ - column - for column in X.select_dtypes( - include=["O", "category", "string"] - ).columns - if _is_categorical_and_is_not_datetime(X[column]) - ] + variables_cat = [ + column + for column in X.select_dtypes( + include=["O", "category", "string"] + ).columns + if _is_categorical_and_is_not_datetime(X[column]) + ] # find numerical variables in dataset variables_num = list(X.select_dtypes(include="number").columns) diff --git a/tests/test_creation/test_math_features.py b/tests/test_creation/test_math_features.py index 6e16821be..332fcfea8 100644 --- a/tests/test_creation/test_math_features.py +++ b/tests/test_creation/test_math_features.py @@ -79,6 +79,12 @@ def test_error_new_variable_names_not_permitted(): func=["sum", "mean"], new_variables_names=["sum_of_two_vars", "sum_of_two_vars"], ) + with pytest.raises(ValueError): + MathFeatures( + variables=variables, + func=["sum", "mean"], + new_variables_names="sum_of_two_vars" + ) def test_aggregations_with_strings(df_vartypes): From c516db2de308988c2df357a08b1db6eb9e5676b1 Mon Sep 17 00:00:00 2001 From: solegalli Date: Sat, 7 Feb 2026 07:36:51 -0500 Subject: [PATCH 23/24] revert math features to main --- feature_engine/creation/math_features.py | 40 +++++++++++++----------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/feature_engine/creation/math_features.py b/feature_engine/creation/math_features.py index 368ac6a73..35cbe73aa 100644 --- a/feature_engine/creation/math_features.py +++ b/feature_engine/creation/math_features.py @@ -140,6 +140,7 @@ def __init__( missing_values: str = "raise", drop_original: bool = False, ) -> None: + if ( not isinstance(variables, list) or not all(isinstance(var, (int, str)) for var in variables) @@ -156,28 +157,31 @@ def __init__( "func does not work with dictionaries in this transformer." ) - if new_variables_names is not None and ( - not isinstance(new_variables_names, list) - or not all(isinstance(var, str) for var in new_variables_names) - or len(set(new_variables_names)) != len(new_variables_names) - ): - raise ValueError( - "new_variable_names should be None or a list of unique strings. " - f"Got {new_variables_names} instead." - ) - if new_variables_names is not None: - if isinstance(new_variables_names, list): - expected = len(func) - else: - expected = 1 - - if len(new_variables_names) != expected: + if ( + not isinstance(new_variables_names, list) + or not all(isinstance(var, str) for var in new_variables_names) + or len(set(new_variables_names)) != len(new_variables_names) + ): raise ValueError( - "The number of new feature names must coincide with the number " - "of functions." + "new_variable_names should be None or a list of unique strings. " + f"Got {new_variables_names} instead." ) + if new_variables_names is not None: + if isinstance(func, list): + if len(new_variables_names) != len(func): + raise ValueError( + "The number of new feature names must coincide with the number " + "of functions." + ) + else: + if len(new_variables_names) != 1: + raise ValueError( + "The number of new feature names must coincide with the number " + "of functions." + ) + super().__init__(missing_values, drop_original) self.variables = variables From 5e6338c709b9390bff561b54d159b3d39c2d441a Mon Sep 17 00:00:00 2001 From: solegalli Date: Sat, 7 Feb 2026 07:38:23 -0500 Subject: [PATCH 24/24] remove test from math features --- tests/test_creation/test_math_features.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/test_creation/test_math_features.py b/tests/test_creation/test_math_features.py index 332fcfea8..6e16821be 100644 --- a/tests/test_creation/test_math_features.py +++ b/tests/test_creation/test_math_features.py @@ -79,12 +79,6 @@ def test_error_new_variable_names_not_permitted(): func=["sum", "mean"], new_variables_names=["sum_of_two_vars", "sum_of_two_vars"], ) - with pytest.raises(ValueError): - MathFeatures( - variables=variables, - func=["sum", "mean"], - new_variables_names="sum_of_two_vars" - ) def test_aggregations_with_strings(df_vartypes):