openml · fkiraly · Jan 26, 2026 · Jan 22, 2026 · Jan 22, 2026 · Jan 22, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -22,7 +22,7 @@ concurrency:
 
 jobs:
   test:
-    name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }},sk-only:${{ matrix.sklearn-only }})
+    name: (${{ matrix.os }},Py${{ matrix.python-version }},sk${{ matrix.scikit-learn }}${{ matrix.pandas-version != '' && format(',pd:{0}', matrix.pandas-version) || '' }},sk-only:${{ matrix.sklearn-only }})
     runs-on: ${{ matrix.os }}
 
     strategy:
@@ -64,6 +64,14 @@ jobs:
             sklearn-only: "false"
             code-cov: true
 
+          # Pandas 2.x run: pins pandas via the optional pandas-version matrix key
+          - os: ubuntu-latest
+            python-version: "3.12"
+            scikit-learn: "1.5.*"
+            sklearn-only: "false"
+            pandas-version: "2.*"
+            code-cov: false
+
     steps:
     - uses: actions/checkout@v6
       with:
@@ -74,10 +82,16 @@ jobs:
       with:
         python-version: ${{ matrix.python-version }}
 
-    - name: Install test dependencies and scikit-learn
+    - name: Install test dependencies, scikit-learn, and optional pandas
+      shell: bash
       run: |
         python -m pip install --upgrade pip
         pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }}
+
+        if [ "${{ matrix.pandas-version }}" != "" ]; then
+          echo "Installing specific pandas version: ${{ matrix.pandas-version }}"
+          pip install "pandas==${{ matrix.pandas-version }}"
+        fi
 
     - name: Store repository status
       id: status-before

diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -488,7 +488,7 @@ def _parse_data_from_arff(  # noqa: C901, PLR0912, PLR0915
                 try:
                     # checks if the strings which should be the class labels
                     # can be encoded into integers
-                    pd.factorize(type_)[0]
+                    pd.factorize(np.array(type_))[0]
                 except ValueError as e:
                     raise ValueError(
                         "Categorical data needs to be numeric when using sparse ARFF."

diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
@@ -102,21 +102,24 @@ def test_get_data_pandas(self):
         assert isinstance(data, pd.DataFrame)
         assert data.shape[1] == len(self.titanic.features)
         assert data.shape[0] == 1309
+        # String columns have a version-dependent pandas dtype name; read it from the data.
+        str_dtype = data["name"].dtype.name
+
         col_dtype = {
             "pclass": "uint8",
             "survived": "category",
-            "name": "object",
+            "name": str_dtype,
             "sex": "category",
             "age": "float64",
             "sibsp": "uint8",
             "parch": "uint8",
-            "ticket": "object",
+            "ticket": str_dtype,
             "fare": "float64",
-            "cabin": "object",
+            "cabin": str_dtype,
             "embarked": "category",
-            "boat": "object",
+            "boat": str_dtype,
             "body": "float64",
-            "home.dest": "object",
+            "home.dest": str_dtype,
         }
         for col_name in data.columns:
             assert data[col_name].dtype.name == col_dtype[col_name]
@@ -357,7 +360,7 @@ def setUp(self):
     def test_get_sparse_dataset_dataframe_with_target(self):
         X, y, _, attribute_names = self.sparse_dataset.get_data(target="class")
         assert isinstance(X, pd.DataFrame)
-        assert isinstance(X.dtypes[0], pd.SparseDtype)
+        assert isinstance(X.dtypes.iloc[0], pd.SparseDtype)
         assert X.shape == (600, 20000)
 
         assert isinstance(y, pd.Series)

diff --git a/tests/test_flows/test_flow_functions.py b/tests/test_flows/test_flow_functions.py
@@ -41,8 +41,9 @@ def _check_flow(self, flow):
         assert isinstance(flow["full_name"], str)
         assert isinstance(flow["version"], str)
         # There are some runs on openml.org that can have an empty external version
+        ext_version = flow["external_version"]
         ext_version_str_or_none = (
-            isinstance(flow["external_version"], str) or flow["external_version"] is None
+            isinstance(ext_version, str) or ext_version is None or pd.isna(ext_version)
         )
         assert ext_version_str_or_none