Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ concurrency:

jobs:
test:
name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }})
name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
runs-on: ${{ matrix.os }}

strategy:
Expand Down Expand Up @@ -64,6 +64,14 @@ jobs:
sklearn-only: "false"
code-cov: true

# Pandas 2 run
- os: ubuntu-latest
python-version: "3.12"
scikit-learn: "1.5.*"
sklearn-only: "false"
pandas-version: "2.*"
code-cov: false

steps:
- uses: actions/checkout@v6
with:
Expand All @@ -74,10 +82,16 @@ jobs:
with:
python-version: ${{ matrix.python-version }}

- name: Install test dependencies and scikit-learn
- name: Install test dependencies, scikit-learn, and optional pandas
shell: bash
run: |
python -m pip install --upgrade pip
pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}

if [ "${{ matrix.pandas-version }}" != "" ]; then
echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
pip install "pandas==${{ matrix.pandas-version }}"
fi

- name: Store repository status
id: status-before
Expand Down
2 changes: 1 addition & 1 deletion openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def _parse_data_from_arff( # noqa: C901, PLR0912, PLR0915
try:
# checks if the strings which should be the class labels
# can be encoded into integers
pd.factorize(type_)[0]
pd.factorize(np.array(type_))[0]
except ValueError as e:
raise ValueError(
"Categorical data needs to be numeric when using sparse ARFF."
Expand Down
15 changes: 9 additions & 6 deletions tests/test_datasets/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,24 @@ def test_get_data_pandas(self):
assert isinstance(data, pd.DataFrame)
assert data.shape[1] == len(self.titanic.features)
assert data.shape[0] == 1309
# Dynamically detect what this version of Pandas calls string columns.
str_dtype = data["name"].dtype.name

col_dtype = {
"pclass": "uint8",
"survived": "category",
"name": "object",
"name": str_dtype,
"sex": "category",
"age": "float64",
"sibsp": "uint8",
"parch": "uint8",
"ticket": "object",
"ticket": str_dtype,
"fare": "float64",
"cabin": "object",
"cabin": str_dtype,
"embarked": "category",
"boat": "object",
"boat": str_dtype,
"body": "float64",
"home.dest": "object",
"home.dest": str_dtype,
}
for col_name in data.columns:
assert data[col_name].dtype.name == col_dtype[col_name]
Expand Down Expand Up @@ -357,7 +360,7 @@ def setUp(self):
def test_get_sparse_dataset_dataframe_with_target(self):
X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
assert isinstance(X, pd.DataFrame)
assert isinstance(X.dtypes[0], pd.SparseDtype)
assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
assert X.shape == (600, 20000)

assert isinstance(y, pd.Series)
Expand Down
3 changes: 2 additions & 1 deletion tests/test_flows/test_flow_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@ def _check_flow(self, flow):
assert isinstance(flow["full_name"], str)
assert isinstance(flow["version"], str)
# There are some runs on openml.org that can have an empty external version
ext_version = flow["external_version"]
ext_version_str_or_none = (
isinstance(flow["external_version"], str) or flow["external_version"] is None
isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
)
assert ext_version_str_or_none

Expand Down