Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
d090224
update dt functions
solegalli Jan 28, 2026
6ff27aa
expand tests
solegalli Jan 28, 2026
9d44303
expand tests
solegalli Jan 28, 2026
de4d663
update for new pandas behaviour
solegalli Jan 28, 2026
7f88839
fix: Pandas 3 compatibility - robust dtype checks and test fixes (#885)
ankitlade12 Feb 6, 2026
6099193
remove extra learned parameter, pass function to transform
solegalli Feb 6, 2026
8c7b35e
rolled back mapping function
solegalli Feb 6, 2026
ff45161
refactor creation of array of nan
solegalli Feb 6, 2026
2a6775d
expand test to cover different expressions of nan values
solegalli Feb 6, 2026
1f526cf
refactor match columns update
solegalli Feb 6, 2026
9368b17
refactor code variable checks
solegalli Feb 6, 2026
564a84f
refactor inf tests
solegalli Feb 6, 2026
2541410
split test by pandas version
solegalli Feb 6, 2026
b57a600
solve additional errors specific to pandas 3
solegalli Feb 6, 2026
75fde70
add pandas version tests to tox.ini
solegalli Feb 7, 2026
3488426
move versioned tests to Python 3.12
solegalli Feb 7, 2026
8758cdf
add tests to circleci config
solegalli Feb 7, 2026
a7ab1a1
reformat circleci config
solegalli Feb 7, 2026
c272f86
test reverting pandas version on None
solegalli Feb 7, 2026
2c0c51c
move dropna a level up when missing values are ignored in string similarity
solegalli Feb 7, 2026
1f8e4e6
refactor condition logic for codecoverage
solegalli Feb 7, 2026
9406e1b
fix decreased coverage
solegalli Feb 7, 2026
c516db2
revert math features to main
solegalli Feb 7, 2026
5e6338c
remove test from math features
solegalli Feb 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 71 additions & 17 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
version: 2.1

orbs:
codecov: codecov/codecov@3.2.3


# --------------------------------------------------
# Anchors & defaults
# --------------------------------------------------

defaults: &defaults
docker:
- image: cimg/python:3.10.0
Expand All @@ -17,14 +23,24 @@ prepare_tox: &prepare_tox

init_pypirc: &init_pypirc
run:
name: init .pypirc
name: Init .pypirc
command: |
echo -e "[pypi]" >> ~/.pypirc
echo -e "repository = $FE_PYPI_URL" >> ~/.pypirc
echo -e "username = $FE_PYPI_USER" >> ~/.pypirc
echo -e "password = $FE_PYPI_API_KEY" >> ~/.pypirc


# --------------------------------------------------
# Jobs
# --------------------------------------------------

jobs:

# ------------------------
# Test matrix
# ------------------------

test_feature_engine_py39:
docker:
- image: cimg/python:3.9.0
Expand All @@ -34,7 +50,7 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.9
name: Run tests (Python 3.9)
command: |
tox -e py39

Expand All @@ -47,7 +63,7 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.10
name: Run tests (Python 3.10)
command: |
tox -e py310

Expand All @@ -60,7 +76,7 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.11 and scikit-learn 1.5.0
name: Run tests (Python 3.11, scikit-learn 1.5)
command: |
tox -e py311-sklearn150

Expand All @@ -73,7 +89,7 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.11 and scikit-learn 1.6.0
name: Run tests (Python 3.11, scikit-learn 1.6)
command: |
tox -e py311-sklearn160

Expand All @@ -86,11 +102,11 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.11 and scikit-learn 1.7.0
name: Run tests (Python 3.11, scikit-learn 1.7)
command: |
tox -e py311-sklearn170

test_feature_engine_py312:
test_feature_engine_py312_pandas230:
docker:
- image: cimg/python:3.12.1
working_directory: ~/project
Expand All @@ -99,9 +115,22 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.12
name: Run tests (Python 3.12, pandas 2.3)
command: |
tox -e py312
tox -e py312-pandas230

# Test job for Python 3.12 with pandas 3.0: checks out the repository into
# ~/project and runs the `py312-pandas300` tox environment on the
# cimg/python:3.12.1 image.
test_feature_engine_py312_pandas300:
docker:
- image: cimg/python:3.12.1
working_directory: ~/project
steps:
- checkout:
path: ~/project
- *prepare_tox
- run:
name: Run tests (Python 3.12, pandas 3.0)
command: |
tox -e py312-pandas300

test_feature_engine_py313:
docker:
Expand All @@ -112,10 +141,15 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run tests with Python 3.13
name: Run tests (Python 3.13)
command: |
tox -e py313


# ------------------------
# Quality checks
# ------------------------

test_style:
docker:
- image: cimg/python:3.10.0
Expand All @@ -125,7 +159,7 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run stylechecks
name: Run style checks
command: |
tox -e stylechecks

Expand All @@ -138,7 +172,7 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run doc build
name: Build documentation
command: |
tox -e docs

Expand All @@ -151,10 +185,15 @@ jobs:
path: ~/project
- *prepare_tox
- run:
name: Run typechecks
name: Run type checks
command: |
tox -e typechecks


# ------------------------
# Coverage
# ------------------------

upload_codecov:
docker:
- image: cimg/python:3.10.0
Expand All @@ -171,13 +210,18 @@ jobs:
coverage report
- codecov/upload


# ------------------------
# Release
# ------------------------

package_and_upload_to_pypi:
<<: *defaults
steps:
- checkout
- *init_pypirc
- run:
name: upload to pypi
name: Build and upload package
command: |
python -m venv env
source env/bin/activate
Expand All @@ -188,38 +232,48 @@ jobs:
ls -l dist
twine upload dist/*


# --------------------------------------------------
# Workflows
# --------------------------------------------------

workflows:
version: 2

test-all:
jobs:
- test_feature_engine_py39
- test_feature_engine_py310
- test_feature_engine_py311_sklearn150
- test_feature_engine_py311_sklearn160
- test_feature_engine_py311_sklearn170
- test_feature_engine_py312
- test_feature_engine_py312_pandas230
- test_feature_engine_py312_pandas300
- test_feature_engine_py313
- test_style
- test_docs
- test_type

- upload_codecov:
filters:
branches:
ignore:
- 1.9.X

- package_and_upload_to_pypi:
requires:
- test_feature_engine_py39
- test_feature_engine_py310
- test_feature_engine_py311_sklearn150
- test_feature_engine_py311_sklearn160
- test_feature_engine_py311_sklearn170
- test_feature_engine_py312
- test_feature_engine_py312_pandas230
- test_feature_engine_py312_pandas300
- test_feature_engine_py313
- test_style
- test_docs
- test_type
filters:
branches:
only:
- 1.9.X
- 1.9.X
8 changes: 5 additions & 3 deletions feature_engine/dataframe_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from scipy.sparse import issparse
from sklearn.utils.validation import _check_y, check_consistent_length, column_or_1d

from feature_engine.variable_handling._variable_type_checks import is_object


def check_X(X: Union[np.generic, np.ndarray, pd.DataFrame]) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -121,10 +123,10 @@ def check_y(
elif isinstance(y, pd.Series):
if y.isnull().any():
raise ValueError("y contains NaN values.")
if y.dtype != "O" and not np.isfinite(y).all():
if not is_object(y) and not np.isfinite(y).all():
raise ValueError("y contains infinity values.")
if y_numeric and y.dtype == "O":
y = y.astype("float")
if y_numeric and is_object(y):
y = y.astype("float64")
y = y.copy()

elif isinstance(y, pd.DataFrame):
Expand Down
47 changes: 33 additions & 14 deletions feature_engine/encoding/similarity_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
X = check_X(X)
variables_ = self._check_or_select_variables(X)

if self.keywords:
if not all(item in variables_ for item in self.keywords.keys()):
raise ValueError(
"There are variables in keywords that are not present "
"in the dataset."
)
if self.keywords and not all(
item in variables_ for item in self.keywords.keys()
):
raise ValueError(
"There are variables in keywords that are not present "
"in the dataset."
)

# if data contains nan, fail before running any logic
if self.missing_values == "raise":
Expand All @@ -262,10 +263,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
)
elif self.missing_values == "impute":
for var in cols_to_iterate:
series = X[var]
self.encoder_dict_[var] = (
X[var]
.astype(str)
.replace("nan", "")
series.astype(str)
.mask(series.isna(), "")
.value_counts()
.head(self.top_categories)
.index.tolist()
Expand All @@ -274,9 +275,9 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
for var in cols_to_iterate:
self.encoder_dict_[var] = (
X[var]
.dropna()
.astype(str)
.value_counts(dropna=True)
.drop("nan", errors="ignore")
.head(self.top_categories)
.index.tolist()
)
Expand Down Expand Up @@ -316,13 +317,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
new_values = []
for var in self.variables_:
if self.missing_values == "impute":
X[var] = X[var].astype(str).replace("nan", "")
categories = X[var].dropna().astype(str).unique()
series = X[var]
series = series.astype(str).mask(series.isna(), "")
else:
series = X[var].astype(str)

categories = series.unique()
column_encoder_dict = {
x: _gpm_fast_vec(x, self.encoder_dict_[var]) for x in categories
}
column_encoder_dict["nan"] = [np.nan] * len(self.encoder_dict_[var])
encoded = np.vstack(X[var].astype(str).map(column_encoder_dict).values)
# Ensure map result is always an array of the correct size.
# Missing values in categories or unknown categories will map to NaN.
default_nan = np.full(len(self.encoder_dict_[var]), np.nan)
if "nan" not in column_encoder_dict:
column_encoder_dict["nan"] = default_nan
if "<NA>" not in column_encoder_dict:
column_encoder_dict["<NA>"] = default_nan

encoded_series = series.map(column_encoder_dict)

# Robust stacking: replace any float NaNs (from unknown values) with arrays
encoded_list = [
v if isinstance(v, (list, np.ndarray)) else default_nan
for v in encoded_series
]
encoded = np.vstack(encoded_list)
if self.missing_values == "ignore":
encoded[X[var].isna(), :] = np.nan
new_values.append(encoded)
Expand Down
7 changes: 5 additions & 2 deletions feature_engine/preprocessing/match_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def __init__(

if not isinstance(verbose, bool):
raise ValueError(
"verbose takes only booleans True and False." f"Got '{verbose} instead."
f"verbose takes only booleans True and False. Got '{verbose} instead."
)

# note: np.nan is an instance of float!!!
Expand Down Expand Up @@ -262,7 +262,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

X = X.drop(_columns_to_drop, axis=1)

X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)
# Add missing columns first and then reorder to avoid
# Pandas 3 StringDtype reindex issue (before we used reindex)
X[_columns_to_add] = self.fill_value
X = X[self.feature_names_in_]

if self.match_dtypes:
_current_dtypes = X.dtypes.to_dict()
Expand Down
4 changes: 2 additions & 2 deletions feature_engine/timeseries/forecasting/lag_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = X[self.variables_].shift(
Expand All @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = X[self.variables_].shift(
Expand Down
2 changes: 1 addition & 1 deletion feature_engine/timeseries/forecasting/window_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
.shift(periods=self.periods, freq=self.freq)
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = (
Expand Down
Loading