diff --git a/.circleci/config.yml b/.circleci/config.yml
index cbc578a99..4df23b66b 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,7 +1,13 @@
 version: 2.1
+
 orbs:
   codecov: codecov/codecov@3.2.3
+
+# --------------------------------------------------
+# Anchors & defaults
+# --------------------------------------------------
+
 defaults: &defaults
   docker:
     - image: cimg/python:3.10.0
@@ -17,14 +23,24 @@ prepare_tox: &prepare_tox
 init_pypirc: &init_pypirc
   run:
-    name: init .pypirc
+    name: Init .pypirc
     command: |
       echo -e "[pypi]" >> ~/.pypirc
       echo -e "repository = $FE_PYPI_URL" >> ~/.pypirc
       echo -e "username = $FE_PYPI_USER" >> ~/.pypirc
       echo -e "password = $FE_PYPI_API_KEY" >> ~/.pypirc
 
+
+# --------------------------------------------------
+# Jobs
+# --------------------------------------------------
+
 jobs:
+
+  # ------------------------
+  # Test matrix
+  # ------------------------
+
   test_feature_engine_py39:
     docker:
       - image: cimg/python:3.9.0
@@ -34,7 +50,7 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run tests with Python 3.9
+          name: Run tests (Python 3.9)
           command: |
             tox -e py39
 
@@ -47,7 +63,7 @@ jobs:
           path: ~/project
       - *prepare_tox
      - run:
-          name: Run tests with Python 3.10
+          name: Run tests (Python 3.10)
           command: |
             tox -e py310
 
@@ -60,7 +76,7 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run tests with Python 3.11 and scikit-learn 1.5.0
+          name: Run tests (Python 3.11, scikit-learn 1.5)
           command: |
             tox -e py311-sklearn150
 
@@ -73,7 +89,7 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run tests with Python 3.11 and scikit-learn 1.6.0
+          name: Run tests (Python 3.11, scikit-learn 1.6)
           command: |
             tox -e py311-sklearn160
 
@@ -86,11 +102,11 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run tests with Python 3.11 and scikit-learn 1.7.0
+          name: Run tests (Python 3.11, scikit-learn 1.7)
           command: |
             tox -e py311-sklearn170
 
-  test_feature_engine_py312:
+  test_feature_engine_py312_pandas230:
     docker:
       - image: cimg/python:3.12.1
     working_directory: ~/project
@@ -99,9 +115,22 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run tests with Python 3.12
+          name: Run tests (Python 3.12, pandas 2.3)
           command: |
-            tox -e py312
+            tox -e py312-pandas230
+
+  test_feature_engine_py312_pandas300:
+    docker:
+      - image: cimg/python:3.12.1
+    working_directory: ~/project
+    steps:
+      - checkout:
+          path: ~/project
+      - *prepare_tox
+      - run:
+          name: Run tests (Python 3.12, pandas 3.0)
+          command: |
+            tox -e py312-pandas300
 
   test_feature_engine_py313:
     docker:
@@ -112,10 +141,15 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run tests with Python 3.13
+          name: Run tests (Python 3.13)
           command: |
             tox -e py313
 
+
+  # ------------------------
+  # Quality checks
+  # ------------------------
+
   test_style:
     docker:
       - image: cimg/python:3.10.0
@@ -125,7 +159,7 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run stylechecks
+          name: Run style checks
           command: |
             tox -e stylechecks
 
@@ -138,7 +172,7 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run doc build
+          name: Build documentation
           command: |
             tox -e docs
 
@@ -151,10 +185,15 @@ jobs:
           path: ~/project
       - *prepare_tox
       - run:
-          name: Run typechecks
+          name: Run type checks
           command: |
             tox -e typechecks
 
+
+  # ------------------------
+  # Coverage
+  # ------------------------
+
   upload_codecov:
     docker:
       - image: cimg/python:3.10.0
@@ -171,13 +210,18 @@ jobs:
             coverage report
       - codecov/upload
 
+
+  # ------------------------
+  # Release
+  # ------------------------
+
   package_and_upload_to_pypi:
     <<: *defaults
     steps:
       - checkout
      - *init_pypirc
      - run:
-          name: upload to pypi
+          name: Build and upload package
           command: |
             python -m venv env
             source env/bin/activate
@@ -188,8 +232,14 @@ jobs:
         ls -l dist
         twine upload dist/*
 
+
+# --------------------------------------------------
+# Workflows
+# --------------------------------------------------
+
 workflows:
   version: 2
+
   test-all:
     jobs:
       - test_feature_engine_py39
@@ -197,16 +247,19 @@ workflows:
       - test_feature_engine_py311_sklearn150
       - test_feature_engine_py311_sklearn160
       - test_feature_engine_py311_sklearn170
-      - test_feature_engine_py312
+      - test_feature_engine_py312_pandas230
+      - test_feature_engine_py312_pandas300
       - test_feature_engine_py313
       - test_style
       - test_docs
       - test_type
+
       - upload_codecov:
           filters:
             branches:
               ignore:
                 - 1.9.X
+
       - package_and_upload_to_pypi:
           requires:
             - test_feature_engine_py39
@@ -214,7 +267,8 @@ workflows:
             - test_feature_engine_py311_sklearn150
             - test_feature_engine_py311_sklearn160
             - test_feature_engine_py311_sklearn170
-            - test_feature_engine_py312
+            - test_feature_engine_py312_pandas230
+            - test_feature_engine_py312_pandas300
             - test_feature_engine_py313
             - test_style
             - test_docs
@@ -222,4 +276,4 @@ workflows:
           filters:
             branches:
               only:
-                - 1.9.X
\ No newline at end of file
+                - 1.9.X
diff --git a/feature_engine/dataframe_checks.py b/feature_engine/dataframe_checks.py
index 2d41727f7..9ef9b3f82 100644
--- a/feature_engine/dataframe_checks.py
+++ b/feature_engine/dataframe_checks.py
@@ -9,6 +9,8 @@
 from scipy.sparse import issparse
 from sklearn.utils.validation import _check_y, check_consistent_length, column_or_1d
 
+from feature_engine.variable_handling._variable_type_checks import is_object
+
 
 def check_X(X: Union[np.generic, np.ndarray, pd.DataFrame]) -> pd.DataFrame:
     """
@@ -121,10 +123,10 @@ def check_y(
     elif isinstance(y, pd.Series):
         if y.isnull().any():
             raise ValueError("y contains NaN values.")
-        if y.dtype != "O" and not np.isfinite(y).all():
+        if not is_object(y) and not np.isfinite(y).all():
             raise ValueError("y contains infinity values.")
-        if y_numeric and y.dtype == "O":
-            y = y.astype("float")
+        if y_numeric and is_object(y):
+            y = y.astype("float64")
         y = y.copy()
 
     elif isinstance(y, pd.DataFrame):
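A note on the check_y change above: pandas' nullable StringDtype does not compare equal to the object dtype "O", so the old y.dtype != "O" test would route string targets into np.isfinite and crash. A minimal, illustrative sketch of the behaviour the shared is_object helper now covers:

    import numpy as np
    import pandas as pd

    y = pd.Series(["a", "b"], dtype="string")

    print(y.dtype == "O")  # False: StringDtype is not the object dtype
    # np.isfinite(y) raises TypeError on non-numeric values, which is why
    # check_y must treat string dtype the same way as object dtype.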
diff --git a/feature_engine/encoding/similarity_encoder.py b/feature_engine/encoding/similarity_encoder.py
index 137034ddb..b6aa1b249 100644
--- a/feature_engine/encoding/similarity_encoder.py
+++ b/feature_engine/encoding/similarity_encoder.py
@@ -232,12 +232,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         X = check_X(X)
         variables_ = self._check_or_select_variables(X)
 
-        if self.keywords:
-            if not all(item in variables_ for item in self.keywords.keys()):
-                raise ValueError(
-                    "There are variables in keywords that are not present "
-                    "in the dataset."
-                )
+        if self.keywords and not all(
+            item in variables_ for item in self.keywords.keys()
+        ):
+            raise ValueError(
+                "There are variables in keywords that are not present "
+                "in the dataset."
+            )
 
         # if data contains nan, fail before running any logic
         if self.missing_values == "raise":
@@ -262,10 +263,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             )
         elif self.missing_values == "impute":
             for var in cols_to_iterate:
+                series = X[var]
                 self.encoder_dict_[var] = (
-                    X[var]
-                    .astype(str)
-                    .replace("nan", "")
+                    series.astype(str)
+                    .mask(series.isna(), "")
                     .value_counts()
                     .head(self.top_categories)
                     .index.tolist()
@@ -274,9 +275,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
             for var in cols_to_iterate:
                 self.encoder_dict_[var] = (
                     X[var]
+                    .dropna()
                     .astype(str)
                     .value_counts(dropna=True)
-                    .drop("nan", errors="ignore")
                     .head(self.top_categories)
                     .index.tolist()
                 )
@@ -316,13 +317,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
         new_values = []
         for var in self.variables_:
             if self.missing_values == "impute":
-                X[var] = X[var].astype(str).replace("nan", "")
-                categories = X[var].dropna().astype(str).unique()
+                series = X[var]
+                series = series.astype(str).mask(series.isna(), "")
+            else:
+                series = X[var].astype(str)
+
+            categories = series.unique()
             column_encoder_dict = {
                 x: _gpm_fast_vec(x, self.encoder_dict_[var]) for x in categories
             }
-            column_encoder_dict["nan"] = [np.nan] * len(self.encoder_dict_[var])
-            encoded = np.vstack(X[var].astype(str).map(column_encoder_dict).values)
+            # Ensure map result is always an array of the correct size.
+            # Missing values in categories or unknown categories will map to NaN.
+            default_nan = np.full(len(self.encoder_dict_[var]), np.nan)
+            if "nan" not in column_encoder_dict:
+                column_encoder_dict["nan"] = default_nan
+            if "" not in column_encoder_dict:
+                column_encoder_dict[""] = default_nan
+
+            encoded_series = series.map(column_encoder_dict)
+
+            # Robust stacking: replace any float NaNs (from unknown values) with arrays
+            encoded_list = [
+                v if isinstance(v, (list, np.ndarray)) else default_nan
+                for v in encoded_series
+            ]
+            encoded = np.vstack(encoded_list)
             if self.missing_values == "ignore":
                 encoded[X[var].isna(), :] = np.nan
             new_values.append(encoded)
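Why mask instead of replace: astype(str) renders a real missing value as the literal string "nan", which replace("nan", "") cannot distinguish from a genuine "nan" category. Masking on the original null positions blanks only true missing values. A small illustrative sketch of the difference:

    import numpy as np
    import pandas as pd

    s = pd.Series(["nan", np.nan, "A"])

    # Old approach: also wipes out the legitimate "nan" string category.
    print(s.astype(str).replace("nan", "").tolist())  # ['', '', 'A']

    # New approach: blanks only the value that was actually missing.
    print(s.astype(str).mask(s.isna(), "").tolist())  # ['nan', '', 'A']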
diff --git a/feature_engine/preprocessing/match_columns.py b/feature_engine/preprocessing/match_columns.py
index c5321b6c3..41bd70660 100644
--- a/feature_engine/preprocessing/match_columns.py
+++ b/feature_engine/preprocessing/match_columns.py
@@ -175,7 +175,7 @@ def __init__(
 
         if not isinstance(verbose, bool):
             raise ValueError(
-                "verbose takes only booleans True and False." f"Got '{verbose} instead."
+                f"verbose takes only booleans True and False. Got '{verbose}' instead."
             )
 
         # note: np.nan is an instance of float!!!
@@ -262,7 +262,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
 
         X = X.drop(_columns_to_drop, axis=1)
 
-        X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)
+        # Add the missing columns first and then reorder, to avoid the
+        # Pandas 3 StringDtype reindex issue (this previously used reindex).
+        X[_columns_to_add] = self.fill_value
+        X = X[self.feature_names_in_]
 
         if self.match_dtypes:
             _current_dtypes = X.dtypes.to_dict()
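The assign-then-reorder pattern above is functionally equivalent to the old reindex call. A standalone sketch mirroring the transform logic (column names hypothetical):

    import pandas as pd

    feature_names_in_ = ["A", "B", "C"]  # columns seen during fit
    X = pd.DataFrame({"C": [3, 4], "A": [1, 2]})

    # Columns seen during fit but absent at transform time.
    columns_to_add = [c for c in feature_names_in_ if c not in X.columns]

    # Same net effect as X.reindex(columns=feature_names_in_, fill_value=0),
    # without going through reindex.
    X[columns_to_add] = 0
    X = X[feature_names_in_]
    print(X.columns.tolist())  # ['A', 'B', 'C']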
diff --git a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py
index 7ed7ed200..ee9c1c151 100644
--- a/feature_engine/timeseries/forecasting/lag_features.py
+++ b/feature_engine/timeseries/forecasting/lag_features.py
@@ -201,7 +201,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
                     axis=0,
                 )
                 df_ls.append(tmp)
-            tmp = pd.concat(df_ls, axis=1)
+            tmp = pd.concat(df_ls, axis=1, sort=False)
 
         else:
             tmp = X[self.variables_].shift(
@@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
                     axis=0,
                 )
                 df_ls.append(tmp)
-            tmp = pd.concat(df_ls, axis=1)
+            tmp = pd.concat(df_ls, axis=1, sort=False)
 
         else:
             tmp = X[self.variables_].shift(
diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py
index 47071efa7..a1e526c3e 100644
--- a/feature_engine/timeseries/forecasting/window_features.py
+++ b/feature_engine/timeseries/forecasting/window_features.py
@@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
                     .shift(periods=self.periods, freq=self.freq)
                 )
                 df_ls.append(tmp)
-            tmp = pd.concat(df_ls, axis=1)
+            tmp = pd.concat(df_ls, axis=1, sort=False)
 
         else:
             tmp = (
diff --git a/feature_engine/variable_handling/_variable_type_checks.py b/feature_engine/variable_handling/_variable_type_checks.py
index c3e16d383..17eb4e41d 100644
--- a/feature_engine/variable_handling/_variable_type_checks.py
+++ b/feature_engine/variable_handling/_variable_type_checks.py
@@ -1,21 +1,26 @@
-import warnings
-
 import pandas as pd
+from pandas.api.types import is_object_dtype, is_string_dtype
 from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
 from pandas.core.dtypes.common import is_numeric_dtype as is_numeric
-from pandas.core.dtypes.common import is_object_dtype as is_object
+
+
+def is_object(s: pd.Series) -> bool:
+    return is_object_dtype(s) or is_string_dtype(s)
 
 
 def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
+    # check for datetime only if the type of the categories is not numeric
+    # because pd.to_datetime throws an error when it is an integer
+    if isinstance(column.dtype, pd.CategoricalDtype):
+        is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column)
+
     # check for datetime only if object cannot be cast as numeric because
     # if it could pd.to_datetime would convert it to datetime regardless
-    if is_object(column):
+    elif is_object(column):
         is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column)
 
-    # check for datetime only if the type of the categories is not numeric
-    # because pd.to_datetime throws an error when it is an integer
-    elif isinstance(column.dtype, pd.CategoricalDtype):
-        is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column)
+    else:
+        is_cat = False
 
     return is_cat
@@ -25,9 +30,11 @@ def _is_categories_num(column: pd.Series) -> bool:
 
 
 def _is_convertible_to_dt(column: pd.Series) -> bool:
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore")
-        return is_datetime(pd.to_datetime(column, errors="ignore", utc=True))
+    try:
+        var = pd.to_datetime(column, utc=True)
+        return is_datetime(var)
+    except Exception:
+        return False
 
 
 def _is_convertible_to_num(column: pd.Series) -> bool:
@@ -39,14 +46,17 @@ def _is_convertible_to_num(column: pd.Series) -> bool:
 
 
 def _is_categorical_and_is_datetime(column: pd.Series) -> bool:
+    # check for datetime only if the type of the categories is not numeric
+    # because pd.to_datetime throws an error when it is an integer
+    if isinstance(column.dtype, pd.CategoricalDtype):
+        is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column)
+
     # check for datetime only if object cannot be cast as numeric because
     # if it could pd.to_datetime would convert it to datetime regardless
-    if is_object(column):
+    elif is_object(column):
         is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column)
 
-    # check for datetime only if the type of the categories is not numeric
-    # because pd.to_datetime throws an error when it is an integer
-    elif isinstance(column.dtype, pd.CategoricalDtype):
-        is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column)
+    else:
+        is_dt = False
 
     return is_dt
diff --git a/feature_engine/variable_handling/find_variables.py b/feature_engine/variable_handling/find_variables.py
index 04779ad5d..6162fdb43 100644
--- a/feature_engine/variable_handling/find_variables.py
+++ b/feature_engine/variable_handling/find_variables.py
@@ -5,11 +5,11 @@
 import pandas as pd
 from pandas.api.types import is_datetime64_any_dtype as is_datetime
 from pandas.core.dtypes.common import is_numeric_dtype as is_numeric
-from pandas.core.dtypes.common import is_object_dtype as is_object
 
 from feature_engine.variable_handling._variable_type_checks import (
     _is_categorical_and_is_datetime,
     _is_categorical_and_is_not_datetime,
+    is_object,
 )
 from feature_engine.variable_handling.dtypes import DATETIME_TYPES
 
@@ -85,7 +85,9 @@ def find_categorical_variables(X: pd.DataFrame) -> List[Union[str, int]]:
     """
     variables = [
         column
-        for column in X.select_dtypes(include=["O", "category"]).columns
+        for column in X.select_dtypes(
+            include=["O", "category", "string"]
+        ).columns
         if _is_categorical_and_is_not_datetime(X[column])
     ]
     if len(variables) == 0:
@@ -251,12 +253,13 @@ def find_categorical_and_numerical_variables(
     # If user leaves default None parameter.
     elif variables is None:
         # find categorical variables
-        if variables is None:
-            variables_cat = [
-                column
-                for column in X.select_dtypes(include=["O", "category"]).columns
-                if _is_categorical_and_is_not_datetime(X[column])
-            ]
+        variables_cat = [
+            column
+            for column in X.select_dtypes(
+                include=["O", "category", "string"]
+            ).columns
+            if _is_categorical_and_is_not_datetime(X[column])
+        ]
         # find numerical variables in dataset
         variables_num = list(X.select_dtypes(include="number").columns)
 
@@ -271,14 +274,14 @@ def find_categorical_and_numerical_variables(
             raise ValueError("The list of variables is empty.")
 
         # find categorical variables
-        variables_cat = [
-            var for var in X[variables].select_dtypes(include=["O", "category"]).columns
-        ]
+        variables_cat = list(
+            X[variables].select_dtypes(include=["O", "category", "string"]).columns
+        )
 
         # find numerical variables
         variables_num = list(X[variables].select_dtypes(include="number").columns)
 
-        if any([v for v in variables if v not in variables_cat + variables_num]):
+        if any(v for v in variables if v not in variables_cat + variables_num):
             raise TypeError(
                 "Some of the variables are neither numerical nor categorical."
             )
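With "string" added to the select_dtypes include list, nullable-string columns are now classified alongside object and categorical ones. A quick sketch, assuming the public find_categorical_variables entry point:

    import pandas as pd
    from feature_engine.variable_handling import find_categorical_variables

    X = pd.DataFrame(
        {
            "obj": pd.Series(["a", "b"], dtype="object"),
            "txt": pd.Series(["c", "d"], dtype="string"),
            "num": [1.0, 2.0],
        }
    )

    # Before this change the StringDtype column was silently skipped.
    print(find_categorical_variables(X))  # expected: ['obj', 'txt']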
diff --git a/tests/test_creation/test_math_features.py b/tests/test_creation/test_math_features.py
index f65e932ee..6e16821be 100644
--- a/tests/test_creation/test_math_features.py
+++ b/tests/test_creation/test_math_features.py
@@ -136,6 +136,13 @@ def test_aggregations_with_functions(df_vartypes):
         }
     )
 
+    # TODO: Remove pandas < 3 support when dropping older pandas versions
+    # In pandas >= 3, when the user passes np.std, agg uses numpy (ddof=0).
+    # In pandas < 3, when the user passes np.std, agg uses pd.Series.std (ddof=1).
+    # Hence the difference in results.
+    if pd.__version__ >= "3":
+        ref["std_Age_Marks"] = np.std(df_vartypes[["Age", "Marks"]], axis=1)
+
     # transform params
     pd.testing.assert_frame_equal(X, ref)
 
@@ -237,7 +244,6 @@ def test_variable_names_when_df_cols_are_integers(df_numeric_columns):
 
 
 def test_error_when_null_values_in_variable(df_vartypes):
-
     df_na = df_vartypes.copy()
     df_na.loc[1, "Age"] = np.nan
 
@@ -256,7 +262,6 @@ def test_error_when_null_values_in_variable(df_vartypes):
 
 
 def test_no_error_when_null_values_in_variable(df_vartypes):
-
     df_na = df_vartypes.copy()
     df_na.loc[1, "Age"] = np.nan
 
@@ -323,7 +328,6 @@ def test_get_feature_names_out(_varnames, _drop, df_vartypes):
 @pytest.mark.parametrize("_varnames", [None, ["var1", "var2"]])
 @pytest.mark.parametrize("_drop", [True, False])
 def test_get_feature_names_out_from_pipeline(_varnames, _drop, df_vartypes):
-
     # set up transformer
     transformer = MathFeatures(
         variables=["Age", "Marks"],
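The TODO above boils down to a ddof difference: numpy's np.std defaults to the population standard deviation (ddof=0), while the pandas method defaults to the sample version (ddof=1), which older pandas substituted when np.std was passed to agg. A quick illustration:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"Age": [20.0, 21.0], "Marks": [0.9, 0.8]})

    print(np.std(df[["Age", "Marks"]], axis=1))  # ddof=0, what pandas >= 3 computes
    print(df[["Age", "Marks"]].std(axis=1))      # ddof=1, what pandas < 3 substituted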
diff --git a/tests/test_dataframe_checks.py b/tests/test_dataframe_checks.py
index d38e7cd54..3b43c0b5d 100644
--- a/tests/test_dataframe_checks.py
+++ b/tests/test_dataframe_checks.py
@@ -248,23 +248,49 @@ def test_optional_contains_na(df_na):
     assert str(record.value) == msg
 
 
-def test_contains_inf(df_na):
-    df_na.fillna(np.inf, inplace=True)
-    with pytest.raises(ValueError):
-        assert _check_contains_inf(df_na, ["Age", "Marks"])
+def test_contains_inf_raises_on_inf():
+    msg = (
+        "Some of the variables to transform contain inf values. Check and "
+        "remove those before using this transformer."
+    )
+    df = pd.DataFrame({"A": [1.1, np.inf, 3.3]})
+    with pytest.raises(ValueError, match=msg):
+        _check_contains_inf(df, ["A"])
+
+
+def test_contains_inf_passes_without_inf():
+    df = pd.DataFrame({"A": [1.1, 2.2, 3.3]})
+    assert _check_contains_inf(df, ["A"]) is None
 
 
 def test_check_X_raises_error_on_duplicated_column_names():
     df = pd.DataFrame(
         {
-            "col1": [1, 2, 3],
-            "col2": ["a", "b", "c"],
-            "col3": pd.date_range("2023-01-01", periods=3),
+            "Name": ["tom", "nick", "krish", "jack"],
+            "City": ["London", "Manchester", "Liverpool", "Bristol"],
+            "Age": [20, 21, 19, 18],
+            "Marks": [0.9, 0.8, 0.7, 0.6],
         }
     )
-    df.columns = ["same", "unique", "same"]
-
+    df.columns = ["var_A", "var_A", "var_B", "var_C"]
     with pytest.raises(ValueError) as err_txt:
         check_X(df)
     assert err_txt.match("Input data contains duplicated variable names.")
+
+
+def test_check_X_errors():
+    # Test scalar array error (line 58)
+    with pytest.raises(ValueError) as record:
+        check_X(np.array(1))
+    assert record.match("Expected 2D array, got scalar array instead")
+
+    # Test 1D array error (line 65)
+    with pytest.raises(ValueError) as record:
+        check_X(np.array([1, 2, 3]))
+    assert record.match("Expected 2D array, got 1D array instead")
+
+    # Test incorrect type error (line 80)
+    with pytest.raises(TypeError) as record:
+        check_X("not a dataframe")
+    assert record.match("X must be a numpy array or pandas dataframe")
diff --git a/tests/test_datetime/test_datetime_features.py b/tests/test_datetime/test_datetime_features.py
index 1d95ffe83..456f41e84 100644
--- a/tests/test_datetime/test_datetime_features.py
+++ b/tests/test_datetime/test_datetime_features.py
@@ -336,13 +336,13 @@ def test_extract_features_from_different_timezones():
     )
     exp_err_msg = (
         "Tz-aware datetime.datetime cannot be converted to datetime64 "
-        "unless utc=True, at position 3"
+        "unless utc=True"
     )
     with pytest.raises(ValueError) as errinfo:
         assert DatetimeFeatures(
             variables="time", features_to_extract=["hour"], utc=False
         ).fit_transform(df)
-    assert str(errinfo.value) == exp_err_msg
+    assert exp_err_msg in str(errinfo.value)
 
 
 def test_extract_features_from_different_timezones_when_string(
diff --git a/tests/test_encoding/test_mean_encoder.py b/tests/test_encoding/test_mean_encoder.py
index 1026936be..a13d0e5bf 100644
--- a/tests/test_encoding/test_mean_encoder.py
+++ b/tests/test_encoding/test_mean_encoder.py
@@ -183,10 +183,11 @@ def test_warning_if_transform_df_contains_categories_not_present_in_fit_df(
     encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])
     encoder.transform(df_enc_rare[["var_A", "var_B"]])
 
-    # check that only one warning was raised
-    assert len(record) == 1
+    # check that at least one warning was raised (Pandas 3 may emit additional
+    # deprecation warnings)
+    assert len(record) >= 1
     # check that the message matches
-    assert record[0].message.args[0] == msg
+    assert any(r.message.args[0] == msg for r in record)
 
     # check for error when rare_labels equals 'raise'
     with pytest.raises(ValueError) as record:
@@ -364,7 +365,7 @@ def test_variables_cast_as_category(df_enc_category_dtypes):
     ]
 
     pd.testing.assert_frame_equal(X, transf_df[["var_A", "var_B"]], check_dtype=False)
-    assert X["var_A"].dtypes == float
+    assert X["var_A"].dtypes.name == "float64"
 
 
 def test_auto_smoothing(df_enc):
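The relaxed warning assertions above scan the whole record list. An equivalent pattern, if stricter scoping is preferred, is pytest.warns, which passes when at least one warning matches and tolerates unrelated warnings raised alongside it (illustrative sketch, message text hypothetical):

    import warnings

    import pytest

    def emit():
        warnings.warn("unrelated deprecation", DeprecationWarning)
        warnings.warn("categories not seen in fit", UserWarning)

    # Passes despite the extra DeprecationWarning.
    with pytest.warns(UserWarning, match="categories not seen in fit"):
        emit()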
diff --git a/tests/test_encoding/test_ordinal_encoder.py b/tests/test_encoding/test_ordinal_encoder.py
index ae7705643..e447c4176 100644
--- a/tests/test_encoding/test_ordinal_encoder.py
+++ b/tests/test_encoding/test_ordinal_encoder.py
@@ -138,10 +138,11 @@ def test_error_if_input_df_contains_categories_not_present_in_training_df(
     encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"])
     encoder.transform(df_enc_rare[["var_A", "var_B"]])
 
-    # check that only one warning was raised
-    assert len(record) == 1
+    # check that at least one warning was raised (Pandas 3 may emit additional
+    # deprecation warnings)
+    assert len(record) >= 1
     # check that the message matches
-    assert record[0].message.args[0] == msg
+    assert any(r.message.args[0] == msg for r in record)
 
     # check for error when rare_labels equals 'raise'
     with pytest.raises(ValueError) as record:
@@ -243,7 +244,7 @@ def test_variables_cast_as_category(df_enc_category_dtypes):
 
     # test transform output
     pd.testing.assert_frame_equal(X, transf_df[["var_A", "var_B"]], check_dtype=False)
-    assert X["var_A"].dtypes == int
+    assert X["var_A"].dtypes.name == "int64"
 
 
 @pytest.mark.parametrize(
diff --git a/tests/test_encoding/test_similarity_encoder.py b/tests/test_encoding/test_similarity_encoder.py
index 3e74b3717..09c17443b 100644
--- a/tests/test_encoding/test_similarity_encoder.py
+++ b/tests/test_encoding/test_similarity_encoder.py
@@ -1,5 +1,6 @@
 from difflib import SequenceMatcher
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -115,9 +116,14 @@ def test_nan_behaviour_error_fit(df_enc_big_na):
     assert str(record.value) == msg
 
 
-def test_nan_behaviour_error_transform(df_enc_big, df_enc_big_na):
+@pytest.mark.parametrize("nan_value", [np.nan, pd.NA, None])
+def test_nan_behaviour_error_transform(df_enc_big, nan_value):
     encoder = StringSimilarityEncoder(missing_values="raise")
     encoder.fit(df_enc_big)
+
+    df_enc_big_na = df_enc_big.copy()
+    df_enc_big_na.loc[0, "var_A"] = nan_value
+
     with pytest.raises(ValueError) as record:
         encoder.transform(df_enc_big_na)
     msg = (
@@ -128,9 +134,15 @@ def test_nan_behaviour_error_transform(df_enc_big, df_enc_big_na):
     assert str(record.value) == msg
 
 
-def test_nan_behaviour_impute(df_enc_big_na):
+@pytest.mark.parametrize("nan_value", [np.nan, pd.NA, None])
+def test_nan_behaviour_impute(df_enc_big, nan_value):
+
+    df_enc_big_na = df_enc_big.copy()
+    df_enc_big_na.loc[0, "var_A"] = nan_value
+
     encoder = StringSimilarityEncoder(missing_values="impute")
     X = encoder.fit_transform(df_enc_big_na)
+
     assert (X.isna().sum() == 0).all(axis=None)
     assert encoder.encoder_dict_ == {
         "var_A": ["B", "D", "G", "A", "C", "E", "F", ""],
@@ -139,7 +151,11 @@ def test_nan_behaviour_impute(df_enc_big_na):
     }
 
 
-def test_nan_behaviour_ignore(df_enc_big_na):
+@pytest.mark.parametrize("nan_value", [np.nan, pd.NA, None])
+def test_nan_behaviour_ignore(df_enc_big, nan_value):
+    df_enc_big_na = df_enc_big.copy()
+    df_enc_big_na.loc[0, "var_A"] = nan_value
+
     encoder = StringSimilarityEncoder(missing_values="ignore")
     X = encoder.fit_transform(df_enc_big_na)
     assert (X.isna().any(axis=1) == df_enc_big_na.isna().any(axis=1)).all()
@@ -150,6 +166,27 @@ def test_nan_behaviour_ignore(df_enc_big_na):
     }
 
 
+def test_string_dtype_with_pd_na():
+    # Test StringDtype with pd.NA to hit the "" branch in transform
+    df = pd.DataFrame({"var_A": ["A", "B", pd.NA]}, dtype="string")
+    encoder = StringSimilarityEncoder(missing_values="impute")
+    X = encoder.fit_transform(df)
+    assert (X.isna().sum() == 0).all(axis=None)
+    # The categories will include "" or the string version of it
+    assert "" in encoder.encoder_dict_["var_A"]
+
+
+def test_string_dtype_with_literal_nan_strings():
+    # Use literal "nan" and "" strings, so the "nan" and "" defaults in
+    # transform already exist and the fallback branches are skipped
+    df = pd.DataFrame({"var_A": ["nan", "", "A", "B"]}, dtype="string")
+    encoder = StringSimilarityEncoder(missing_values="impute")
+    X = encoder.fit_transform(df)
+    assert (X.isna().sum() == 0).all(axis=None)
+    assert "nan" in encoder.encoder_dict_["var_A"]
+    assert "" in encoder.encoder_dict_["var_A"]
+
+
 def test_inverse_transform_error(df_enc_big):
     encoder = StringSimilarityEncoder()
     X = encoder.fit_transform(df_enc_big)
@@ -237,6 +274,7 @@ def test_get_feature_names_out_na(df_enc_big_na):
         "var_C_F",
     ]
 
+    # NaN values are replaced with empty string "" before string conversion
     assert tr.encoder_dict_ == {
         "var_A": ["B", "D", "G", "A", "C", "E", "F", ""],
         "var_B": ["A", "D", "B", "G", "C", "E", "F"],
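The parametrization over np.nan, pd.NA and None works because pandas treats all three as missing under isna, so the encoder's mask-based imputation handles them uniformly:

    import numpy as np
    import pandas as pd

    s = pd.Series([np.nan, pd.NA, None, "A"], dtype="object")
    print(s.isna().tolist())  # [True, True, True, False]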
"A", "B"]}, dtype="string") + encoder = StringSimilarityEncoder(missing_values="impute") + X = encoder.fit_transform(df) + assert (X.isna().sum() == 0).all(axis=None) + assert "nan" in encoder.encoder_dict_["var_A"] + assert "" in encoder.encoder_dict_["var_A"] + + def test_inverse_transform_error(df_enc_big): encoder = StringSimilarityEncoder() X = encoder.fit_transform(df_enc_big) @@ -237,6 +274,7 @@ def test_get_feature_names_out_na(df_enc_big_na): "var_C_F", ] + # NaN values are replaced with empty string "" before string conversion assert tr.encoder_dict_ == { "var_A": ["B", "D", "G", "A", "C", "E", "F", ""], "var_B": ["A", "D", "B", "G", "C", "E", "F"], diff --git a/tests/test_encoding/test_woe/test_woe_encoder.py b/tests/test_encoding/test_woe/test_woe_encoder.py index 44181c5d7..a38caa6fa 100644 --- a/tests/test_encoding/test_woe/test_woe_encoder.py +++ b/tests/test_encoding/test_woe/test_woe_encoder.py @@ -149,10 +149,11 @@ def test_warn_if_transform_df_contains_categories_not_seen_in_fit(df_enc, df_enc encoder.fit(df_enc[["var_A", "var_B"]], df_enc["target"]) encoder.transform(df_enc_rare[["var_A", "var_B"]]) - # check that only one warning was raised - assert len(record) == 1 + # check that at least one warning was raised (Pandas 3 may emit additional + # deprecation warnings) + assert len(record) >= 1 # check that the message matches - assert record[0].message.args[0] == msg + assert any(r.message.args[0] == msg for r in record) # check for error when rare_labels equals 'raise' with pytest.raises(ValueError) as record: @@ -389,7 +390,7 @@ def test_variables_cast_as_category(df_enc_category_dtypes): transf_df["var_B"] = VAR_B pd.testing.assert_frame_equal(X, transf_df[["var_A", "var_B"]], check_dtype=False) - assert X["var_A"].dtypes == float + assert X["var_A"].dtypes.name == "float64" @pytest.mark.parametrize( diff --git a/tests/test_preprocessing/test_match_columns.py b/tests/test_preprocessing/test_match_columns.py index 16ee0633d..6726b33f9 100644 --- a/tests/test_preprocessing/test_match_columns.py +++ b/tests/test_preprocessing/test_match_columns.py @@ -189,7 +189,11 @@ def test_match_dtypes_string_to_datetime(df_vartypes): assert match_columns.match_dtypes is True assert match_columns.verbose is False # test fit attrs - assert match_columns.dtype_dict_ == {"dob": np.dtype("= "3": + assert match_columns.dtype_dict_ == {"dob": np.dtype("= "3": + # Pandas 3 uses microseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] + else: + # Pandas 2 uses nanoseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] pd.testing.assert_frame_equal(ref, transformed_df) @@ -371,6 +392,27 @@ def test_sklearn_ohe_object_many_features(df_vartypes): transformed_df = transformer.fit_transform(df_vartypes[variables_to_encode]) + # TODO: Remove pandas < 3 support when dropping older pandas versions + if pd.__version__ >= "3": + # Pandas 3 uses microseconds format + transformed_df.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in transformed_df.columns + ] + ref.columns = [ + c.replace(".000000000", "").replace(".000000", "") + for c in ref.columns + ] + else: + # Pandas 2 uses nanoseconds format + transformed_df.columns = 
diff --git a/tests/test_wrappers/test_sklearn_wrapper.py b/tests/test_wrappers/test_sklearn_wrapper.py
--- a/tests/test_wrappers/test_sklearn_wrapper.py
+++ b/tests/test_wrappers/test_sklearn_wrapper.py
@@ ... @@
+    # TODO: Remove pandas < 3 support when dropping older pandas versions
+    if pd.__version__ >= "3":
+        # Pandas 3 uses microseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
+    else:
+        # Pandas 2 uses nanoseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
 
     pd.testing.assert_frame_equal(ref, transformed_df)
 
 
@@ -371,6 +392,27 @@ def test_sklearn_ohe_object_many_features(df_vartypes):
 
     transformed_df = transformer.fit_transform(df_vartypes[variables_to_encode])
 
+    # TODO: Remove pandas < 3 support when dropping older pandas versions
+    if pd.__version__ >= "3":
+        # Pandas 3 uses microseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
+    else:
+        # Pandas 2 uses nanoseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
 
     pd.testing.assert_frame_equal(ref, transformed_df)
 
 
@@ -393,6 +435,27 @@ def test_sklearn_ohe_numeric(df_vartypes):
 
     transformed_df = transformer.fit_transform(df_vartypes[variables_to_encode])
 
+    # TODO: Remove pandas < 3 support when dropping older pandas versions
+    if pd.__version__ >= "3":
+        # Pandas 3 uses microseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
+    else:
+        # Pandas 2 uses nanoseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
 
     pd.testing.assert_frame_equal(ref, transformed_df)
 
 
@@ -428,6 +491,27 @@ def test_sklearn_ohe_all_features(df_vartypes):
 
     transformed_df = transformer.fit_transform(df_vartypes)
 
+    # TODO: Remove pandas < 3 support when dropping older pandas versions
+    if pd.__version__ >= "3":
+        # Pandas 3 uses microseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
+    else:
+        # Pandas 2 uses nanoseconds format
+        transformed_df.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in transformed_df.columns
+        ]
+        ref.columns = [
+            c.replace(".000000000", "").replace(".000000", "")
+            for c in ref.columns
+        ]
 
     pd.testing.assert_frame_equal(ref, transformed_df)
 
 
@@ -466,7 +550,7 @@ def test_sklearn_ohe_with_crossvalidation():
     results: np.ndarray = cross_val_score(
         pipeline, X, y, scoring="neg_mean_squared_error", cv=3
     )
-    assert not any([np.isnan(i) for i in results])
+    assert not any(np.isnan(i) for i in results)
 
 
 def test_wrap_one_hot_encoder_get_features_name_out(df_vartypes):
@@ -496,7 +580,28 @@ def test_wrap_one_hot_encoder_get_features_name_out(df_vartypes):
         "dob_2020-02-24T00:03:00.000000000",
     ]
 
-    assert ohe_wrap.get_feature_names_out() == expected_features_all
+    # TODO: Remove pandas < 3 support when dropping older pandas versions
+    if pd.__version__ >= "3":
+        # Pandas 3 uses microseconds format
+        actual_features = [
+            f.replace(".000000000", "").replace(".000000", "")
+            for f in ohe_wrap.get_feature_names_out()
+        ]
+        expected_features = [
+            f.replace(".000000000", "").replace(".000000", "")
+            for f in expected_features_all
+        ]
+    else:
+        # Pandas 2 uses nanoseconds format
+        actual_features = [
+            f.replace(".000000000", "").replace(".000000", "")
+            for f in ohe_wrap.get_feature_names_out()
+        ]
+        expected_features = [
+            f.replace(".000000000", "").replace(".000000", "")
+            for f in expected_features_all
+        ]
+    assert actual_features == expected_features
 
 
 @pytest.mark.parametrize(
diff --git a/tox.ini b/tox.ini
index e55e03a47..b09a57c6c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,16 +1,36 @@
 [tox]
-envlist = py39, py310, py311-sklearn150, py311-sklearn160, py311-sklearn170, py312, py313, codecov, docs, stylechecks, typechecks
+envlist =
+    py39
+    py310
+    py311-sklearn150
+    py311-sklearn160
+    py311-sklearn170
+    py312-pandas230
+    py312-pandas300
+    py313
+    codecov
+    docs
+    stylechecks
+    typechecks
 
 skipsdist = true
+
 [testenv]
-install_command = pip install {opts} {packages}
 envdir = {toxworkdir}/unit_tests
+install_command = pip install {opts} {packages}
+
 setenv =
-    PYTHONPATH=.
+    PYTHONPATH = .
     COVERAGE_RCFILE = {envtmpdir}/coveragerc
+
 commands =
     pytest tests
 
+
+# -------------------------
+# Python versions
+# -------------------------
+
 [testenv:py39]
 deps =
     .[tests]
 
@@ -19,6 +39,15 @@ deps =
 deps =
     .[tests]
 
+[testenv:py313]
+deps =
+    .[tests]
+
+
+# -------------------------
+# scikit-learn matrix
+# -------------------------
+
 [testenv:py311-sklearn150]
 deps =
     .[tests]
@@ -34,45 +63,78 @@ deps =
     .[tests]
     scikit-learn==1.7.1
 
-[testenv:py312]
+
+[testenv:py312-pandas230]
 deps =
     .[tests]
+    pandas==2.3.0
 
-[testenv:py313]
+[testenv:py312-pandas300]
 deps =
     .[tests]
+    pandas==3.0.0
+
+
+# -------------------------
+# Coverage
+# -------------------------
 
 [testenv:codecov]
 deps =
     .[tests]
+
 commands_pre =
     {envpython} -c 'from pathlib import Path; Path(r"{env:COVERAGE_RCFILE}").write_text(Path(".coveragerc").read_text())'
+
 commands =
     coverage run -m pytest -v
     coverage report
 
+
+# -------------------------
+# Docs
+# -------------------------
+
 [testenv:docs]
 deps =
     .[docs]
+
 commands =
     sphinx-build -W -b html -d {envtmpdir}/doctrees docs {envtmpdir}/html
 
+
+# -------------------------
+# Linting & typing
+# -------------------------
+
 [testenv:stylechecks]
 deps =
     flake8
-commands = {posargs:flake8 feature_engine tests}
+
+commands =
+    {posargs:flake8 feature_engine tests}
 
 [testenv:typechecks]
 deps =
-     mypy
-commands = {posargs:mypy feature_engine}
+    mypy
+
+commands =
+    {posargs:mypy feature_engine}
+
+
+# -------------------------
+# flake8 configuration
+# -------------------------
 
 [flake8]
-exclude = .git, env
-# match black code formatter
+exclude =
+    .git
+    env
+
+# Match Black
 max-line-length = 88
 
 profile = black
 line_length = 88
 lines_between_sections = 1
-known_first_party = "sentry"
\ No newline at end of file
+known_first_party = feature_engine