From fd062f7840be5d0fd3a3d3e26cd4fdc14aa82dba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 16:55:05 +0100 Subject: [PATCH 01/18] Update test.yml --- .github/workflows/test.yml | 93 ++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 49 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 31cdff602..b3108ff59 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,7 +14,6 @@ on: branches: - main - develop - concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true @@ -23,70 +22,48 @@ jobs: test: name: (${{ matrix.os }}, Py${{ matrix.python-version }}, sk${{ matrix.scikit-learn }}, sk-only:${{ matrix.sklearn-only }}) runs-on: ${{ matrix.os }} + strategy: + fail-fast: false matrix: - python-version: ["3.9"] - scikit-learn: ["1.0.*", "1.1.*", "1.2.*", "1.3.*", "1.4.*", "1.5.*"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] + scikit-learn: ["1.2.*", "1.3.*", "1.4.*", "1.5.*"] os: [ubuntu-latest] sklearn-only: ["true"] + include: - - os: ubuntu-latest - python-version: "3.8" # no scikit-learn 0.23 release for Python 3.9 - scikit-learn: "0.23.1" - sklearn-only: "true" - # scikit-learn 0.24 relies on scipy defaults, so we need to fix the version - # c.f. https://github.com/openml/openml-python/pull/1267 - - os: ubuntu-latest - python-version: "3.9" - scikit-learn: "0.24" - scipy: "1.10.0" - sklearn-only: "true" - # Do a Windows and Ubuntu test for _all_ openml functionality - # I am not sure why these are on 3.8 and older scikit-learn + # Full test run on Windows - os: windows-latest - python-version: "3.8" - scikit-learn: 0.24.* - scipy: "1.10.0" - sklearn-only: 'false' - # Include a code cov version + python-version: "3.12" + scikit-learn: "1.5.*" + sklearn-only: "false" + + # Coverage run - os: ubuntu-latest + python-version: "3.12" + scikit-learn: "1.5.*" + sklearn-only: "false" code-cov: true - python-version: "3.8" - scikit-learn: 0.23.1 - sklearn-only: 'false' - fail-fast: false steps: - uses: actions/checkout@v4 with: fetch-depth: 2 + - name: Setup Python ${{ matrix.python-version }} - if: matrix.os != 'windows-latest' # windows-latest only uses preinstalled Python (3.9.13) uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install test dependencies run: | python -m pip install --upgrade pip pip install -e .[test] + - name: Install scikit-learn ${{ matrix.scikit-learn }} run: | pip install scikit-learn==${{ matrix.scikit-learn }} - - name: Install numpy for Python 3.8 - # Python 3.8 & scikit-learn<0.24 requires numpy<=1.23.5 - if: ${{ matrix.python-version == '3.8' && matrix.scikit-learn == '0.23.1' }} - run: | - pip install numpy==1.23.5 - - name: "Install NumPy 1.x and SciPy <1.11 for scikit-learn < 1.4" - if: ${{ contains(fromJSON('["1.0.*", "1.1.*", "1.2.*", "1.3.*"]'), matrix.scikit-learn) }} - run: | - # scipy has a change to the 'mode' behavior which breaks scikit-learn < 1.4 - # numpy 2.0 has several breaking changes - pip install "numpy<2.0" "scipy<1.11" - - name: Install scipy ${{ matrix.scipy }} - if: ${{ matrix.scipy }} - run: | - pip install scipy==${{ matrix.scipy }} + - name: Store repository status id: status-before if: matrix.os != 'windows-latest' @@ -94,28 +71,45 @@ jobs: git_status=$(git status --porcelain -b) echo "BEFORE=$git_status" >> $GITHUB_ENV echo "Repository status before tests: $git_status" + - name: Show installed 
dependencies run: python -m pip list + - name: Run tests on Ubuntu Test if: matrix.os == 'ubuntu-latest' run: | - if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi - # Most of the time, running only the scikit-learn tests is sufficient - if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and not production'; else marks='not production'; fi - echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" + if [ "${{ matrix.code-cov }}" = "true" ]; then + codecov="--cov=openml --long --cov-report=xml" + fi + + if [ "${{ matrix.sklearn-only }}" = "true" ]; then + marks="sklearn and not production" + else + marks="not production" + fi + pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" + - name: Run tests on Ubuntu Production if: matrix.os == 'ubuntu-latest' run: | - if [ ${{ matrix.code-cov }} ]; then codecov='--cov=openml --long --cov-report=xml'; fi - # Most of the time, running only the scikit-learn tests is sufficient - if [ ${{ matrix.sklearn-only }} = 'true' ]; then marks='sklearn and production'; else marks='production'; fi - echo pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" + if [ "${{ matrix.code-cov }}" = "true" ]; then + codecov="--cov=openml --long --cov-report=xml" + fi + + if [ "${{ matrix.sklearn-only }}" = "true" ]; then + marks="sklearn and production" + else + marks="production" + fi + pytest -n 4 --durations=20 --dist load -sv $codecov -o log_cli=true -m "$marks" + - name: Run tests on Windows if: matrix.os == 'windows-latest' run: | # we need a separate step because of the bash-specific if-statement in the previous one. pytest -n 4 --durations=20 --dist load -sv --reruns 5 --reruns-delay 1 + - name: Check for files left behind by test if: matrix.os != 'windows-latest' && always() run: | @@ -127,6 +121,7 @@ jobs: echo "Not all generated files have been deleted!" 
exit 1 fi + - name: Upload coverage if: matrix.code-cov && always() uses: codecov/codecov-action@v4 From ac4c670375dbf6e4e79c6142ee5853a3e53e327e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 16:55:26 +0100 Subject: [PATCH 02/18] identifiers --- pyproject.toml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2bf762b09..ede204ca0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,12 +50,11 @@ classifiers = [ "Operating System :: Unix", "Operating System :: MacOS", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", ] license = { file = "LICENSE" } From 04c473afbda501b308f0cacae13a53a793c2b063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 17:07:48 +0100 Subject: [PATCH 03/18] Update test.yml --- .github/workflows/test.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b3108ff59..0df538fa9 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,14 +55,10 @@ jobs: with: python-version: ${{ matrix.python-version }} - - name: Install test dependencies + - name: Install test dependencies and scikit-learn run: | python -m pip install --upgrade pip - pip install -e .[test] - - - name: Install scikit-learn ${{ matrix.scikit-learn }} - run: | - pip install scikit-learn==${{ matrix.scikit-learn }} + pip install -e .[test] scikit-learn==${{ matrix.scikit-learn }} - name: Store repository status id: status-before From af041915b27f8023b8956630ecdc05d3572d5626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 17:15:31 +0100 Subject: [PATCH 04/18] Update test.yml --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0df538fa9..9d41436d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -26,8 +26,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] - scikit-learn: ["1.2.*", "1.3.*", "1.4.*", "1.5.*"] + python-version: ["3.10", "3.11", "3.12", "3.13"] + scikit-learn: ["1.3.*", "1.4.*", "1.5.*", "1.6.*", "1.7.*"] os: [ubuntu-latest] sklearn-only: ["true"] From 16309a68fa27712d0cb7d0b86a1720a3e46b93fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 21:59:10 +0100 Subject: [PATCH 05/18] Update test.yml --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 54fbc7a72..32d3602e1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,6 +31,11 @@ jobs: os: [ubuntu-latest] sklearn-only: ["true"] + exclude: + # incompatible version combinations + - python-version: "3.13" + scikit-learn: ["1.3.*", "1.4.*"] + include: # Full test run on Windows - os: windows-latest From 62c0651e1409bfda07fd2c3668fcf10af88cd49f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 22:18:29 +0100 Subject: [PATCH 06/18] Update test.yml --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 32d3602e1..5ffe08fae 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,7 +34,9 @@ jobs: exclude: # incompatible version combinations - python-version: "3.13" - scikit-learn: ["1.3.*", "1.4.*"] + scikit-learn: "1.3.*" + - python-version: "3.13" + scikit-learn: "1.4.*" include: # Full test run on Windows From e902465e9bfb598d5ee3b66f2d531c23004e13aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Mon, 29 Dec 2025 23:30:09 +0100 Subject: [PATCH 07/18] Update test.yml --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5ffe08fae..3d7998f24 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -43,14 +43,14 @@ jobs: - os: windows-latest python-version: "3.12" scikit-learn: "1.5.*" - sklearn-only: "false" + sklearn-only: "true" # Coverage run - os: ubuntu-latest python-version: "3.12" scikit-learn: "1.5.*" - sklearn-only: "false" - code-cov: true + sklearn-only: "true" + # code-cov: true steps: - uses: actions/checkout@v4 From 00f1b29d71d27e1106416f76a1239587b95e773c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Dec 2025 00:04:53 +0100 Subject: [PATCH 08/18] Update test.yml --- .github/workflows/test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3d7998f24..7940f4a7e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,7 +40,8 @@ jobs: include: # Full test run on Windows - - os: windows-latest + - os: ubuntu-latest + # - os: windows-latest python-version: "3.12" scikit-learn: "1.5.*" sklearn-only: "true" From 406205a714ebbd2178eef88e5d022989b4624e70 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Tue, 30 Dec 2025 14:43:14 +0530 Subject: [PATCH 09/18] mark xfail --- tests/test_datasets/test_dataset_functions.py | 16 ++++++++++++++++ tests/test_runs/test_run_functions.py | 4 ++++ tests/test_setups/test_setup_functions.py | 1 + tests/test_study/test_study_functions.py | 2 ++ tests/test_tasks/test_classification_task.py | 3 +++ tests/test_tasks/test_learning_curve_task.py | 4 ++++ tests/test_tasks/test_regression_task.py | 2 ++ tests/test_tasks/test_task_functions.py | 5 +++++ 8 files changed, 37 insertions(+) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 266a6f6f7..f63bd1534 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -244,6 +244,7 @@ def test_get_datasets(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset @@ -262,6 +263,7 @@ def test_get_dataset_download_all_files(self): # test_get_dataset_lazy raise NotImplementedError + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) assert type(dataset) == OpenMLDataset @@ -280,6 +282,7 @@ def test_dataset_by_name_cannot_access_private_data(self): self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") + @pytest.mark.xfail(reason="failures_issue_1544") def 
test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -309,6 +312,7 @@ def ensure_absence_of_real_data(): assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data() @@ -327,6 +331,7 @@ def test__get_dataset_description(self): description_xml_path = os.path.join(self.workdir, "description.xml") assert os.path.exists(description_xml_path) + @pytest.mark.xfail(reason="failures_issue_1544") def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) @@ -430,12 +435,14 @@ def test__getarff_md5_issue(self): openml.config.connection_n_retries = n + @pytest.mark.xfail(reason="failures_issue_1544") def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) assert isinstance(features_file, Path) features_xml_path = self.workdir / "features.xml" assert features_xml_path.exists() + @pytest.mark.xfail(reason="failures_issue_1544") def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) assert isinstance(qualities, Path) @@ -853,6 +860,7 @@ def test_create_invalid_dataset(self): param["data"] = data[0] self.assertRaises(ValueError, create_dataset, **param) + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. @@ -1332,6 +1340,7 @@ def test_list_qualities(self): assert isinstance(qualities, list) is True assert all(isinstance(q, str) for q in qualities) is True + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() @@ -1347,6 +1356,7 @@ def test_get_dataset_cache_format_pickle(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded # from minio. However, there is a mismatch between OpenML test server and minio IDs. 
@@ -1523,6 +1533,7 @@ def test_list_datasets_with_high_size_parameter(self): (None, None, ["wrong", "sunny"]), ], ) +@pytest.mark.xfail(reason="failures_issue_1544") def test_invalid_attribute_validations( default_target_attribute, row_id_attribute, @@ -1584,6 +1595,7 @@ def test_invalid_attribute_validations( (None, None, ["outlook", "windy"]), ], ) +@pytest.mark.xfail(reason="failures_issue_1544") def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1802,6 +1814,7 @@ def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(small_datasets) +@pytest.mark.xfail(reason="failures_issue_1544") def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) @@ -1814,12 +1827,14 @@ def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(five_class_datasets) +@pytest.mark.xfail(reason="failures_issue_1544") def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) +@pytest.mark.xfail(reason="failures_issue_1544") def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): combined_filter_datasets = openml.datasets.list_datasets( tag="study_14", @@ -1892,6 +1907,7 @@ def isolate_for_test(): ("with_data", "with_qualities", "with_features"), itertools.product([True, False], repeat=3), ) +@pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_lazy_behavior( isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool ): diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 3bb4b0a0c..c4b1e4cc7 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1695,6 +1695,7 @@ def test_format_prediction_non_supervised(self): ): format_prediction(clustering, *ignored_input) + @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1704,6 +1705,7 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) + @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1714,6 +1716,7 @@ def test_format_prediction_classification_incomplete_probabilities(self): with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) + @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1724,6 +1727,7 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): 
format_prediction(classification, *ignored_input, proba={}) + @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} diff --git a/tests/test_setups/test_setup_functions.py b/tests/test_setups/test_setup_functions.py index 18d7f5cc6..9ffd7b9c1 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -166,6 +166,7 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 + @pytest.mark.xfail(reason="failures_issue_1544") def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 40026592f..837feb5bb 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -148,6 +148,7 @@ def test_publish_empty_study_implicit(self): self._test_publish_empty_study_is_allowed(explicit=False) @pytest.mark.flaky() + @pytest.mark.xfail(reason="failures_issue_1544") def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -217,6 +218,7 @@ def test_publish_study(self): res = openml.study.delete_study(study.id) assert res + @pytest.mark.xfail(reason="failures_issue_1544") def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) assert len(run_list) == 10 diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index d4f2ed9d7..70c3115e5 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -18,6 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 + @pytest.mark.xfail(reason="failures_issue_1544") def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -25,12 +26,14 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure + @pytest.mark.xfail(reason="failures_issue_1544") def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] @pytest.mark.server() +@pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(): task = get_task(119) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index 885f80a27..f8ed876cc 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -2,6 +2,7 @@ from __future__ import annotations import pandas as pd +import pytest from openml.tasks import TaskType, get_task @@ -17,6 +18,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -25,12 +27,14 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) + @pytest.mark.xfail(reason="failures_issue_1544") def test_download_task(self): task = super().test_download_task() assert task.task_id 
== self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 + @pytest.mark.xfail(reason="failures_issue_1544") def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 14ed59470..5c3e79061 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -4,6 +4,7 @@ import ast import pandas as pd +import pytest import openml from openml.exceptions import OpenMLServerException @@ -48,6 +49,7 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 5f1d577c0..0e9b5fdce 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -100,6 +100,7 @@ def test_list_tasks(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) + @pytest.mark.xfail(reason="failures_issue_1544") def test_list_tasks_paginate(self): size = 10 max = 100 @@ -139,6 +140,7 @@ def test__get_task_live(self): # https://github.com/openml/openml-python/issues/378 openml.tasks.get_task(34536) + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -152,6 +154,7 @@ def test_get_task(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -191,6 +194,7 @@ def assert_and_raise(*args, **kwargs): # Now the file should no longer exist assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) @@ -206,6 +210,7 @@ def test_get_task_different_types(self): # Issue 538, get_task failing with clustering task. 
openml.tasks.functions.get_task(126033) + @pytest.mark.xfail(reason="failures_issue_1544") def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() From f38efad97c3757472e3a8d0e8eb2f9d215965c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Dec 2025 16:20:11 +0100 Subject: [PATCH 10/18] Update test.yml --- .github/workflows/test.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 875429dd4..2865264bb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,8 +40,7 @@ jobs: include: # Full test run on Windows - - os: ubuntu-latest - # - os: windows-latest + - os: windows-latest python-version: "3.12" scikit-learn: "1.5.*" sklearn-only: "true" @@ -51,7 +50,7 @@ jobs: python-version: "3.12" scikit-learn: "1.5.*" sklearn-only: "true" - # code-cov: true + code-cov: true steps: - uses: actions/checkout@v6 From 9220d369d840b43d732366fd11f4e7c27671eae4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Dec 2025 16:21:50 +0100 Subject: [PATCH 11/18] Update test.yml --- .github/workflows/test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2865264bb..2ba950e8d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -14,6 +14,7 @@ on: branches: - main - develop + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true From f5a13bbfd0d9aaf5298dfcf7c2bc46cc90c96b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Tue, 30 Dec 2025 16:26:52 +0100 Subject: [PATCH 12/18] Update test.yml --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2ba950e8d..b77cfd38c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -44,13 +44,13 @@ jobs: - os: windows-latest python-version: "3.12" scikit-learn: "1.5.*" - sklearn-only: "true" + sklearn-only: "false" # Coverage run - os: ubuntu-latest python-version: "3.12" scikit-learn: "1.5.*" - sklearn-only: "true" + sklearn-only: "false" code-cov: true steps: From b7401017c193a7581e889daecd73280b419da4bd Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Wed, 31 Dec 2025 11:10:44 +0530 Subject: [PATCH 13/18] More xfail skips --- tests/test_datasets/test_dataset.py | 2 ++ tests/test_datasets/test_dataset_functions.py | 8 ++++++-- tests/test_runs/test_run.py | 1 + tests/test_tasks/test_supervised_task.py | 2 ++ tests/test_tasks/test_task_functions.py | 2 +- tests/test_tasks/test_task_methods.py | 2 ++ 6 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index 86a4d3f57..58eea1f05 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -302,6 +302,7 @@ def test_get_feature_with_ontology_data_id_11(): assert len(dataset.features[2].ontologies) >= 1 assert len(dataset.features[3].ontologies) >= 1 +@pytest.mark.xfail(reason="failures_issue_1544") def test_add_remove_ontology_to_dataset(): did = 1 feature_index = 1 @@ -309,6 +310,7 @@ def test_add_remove_ontology_to_dataset(): openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) 
+@pytest.mark.xfail(reason="failures_issue_1544") def test_add_same_ontology_multiple_features(): did = 1 ontology = "https://www.openml.org/unittest/" + str(time()) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index f63bd1534..1c2058d21 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -513,6 +513,7 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 + @pytest.mark.xfail(reason="failures_issue_1544") def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. openml.datasets.get_dataset(3, download_data=True) @@ -1389,6 +1390,7 @@ def test_get_dataset_cache_format_feather(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] + @pytest.mark.xfail(reason="failures_issue_1544") def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets @@ -1410,6 +1412,7 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc + @pytest.mark.xfail(reason="failures_issue_1544") def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets @@ -1458,6 +1461,7 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) + @pytest.mark.xfail(reason="failures_issue_1544") def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user @@ -1533,7 +1537,7 @@ def test_list_datasets_with_high_size_parameter(self): (None, None, ["wrong", "sunny"]), ], ) -@pytest.mark.xfail(reason="failures_issue_1544") +@pytest.mark.xfail(reason="failures_issue_1544",strict=False) def test_invalid_attribute_validations( default_target_attribute, row_id_attribute, @@ -1595,7 +1599,7 @@ def test_invalid_attribute_validations( (None, None, ["outlook", "windy"]), ], ) -@pytest.mark.xfail(reason="failures_issue_1544") +@pytest.mark.xfail(reason="failures_issue_1544",strict=False) def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 088856450..7d60e35bd 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -25,6 +25,7 @@ class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take # less than 1 seconds + @pytest.mark.xfail(reason="failures_issue_1544") def test_tagging(self): runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 9c90b7e03..48e036d3e 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -6,6 +6,7 @@ import pandas as pd from openml.tasks import get_task +import pytest from .test_task import OpenMLTaskTest @@ -27,6 +28,7 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() + @pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = 
get_task(self.task_id) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 0e9b5fdce..25dab05ea 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -100,7 +100,6 @@ def test_list_tasks(self): for task in tasks.to_dict(orient="index").values(): self._check_task(task) - @pytest.mark.xfail(reason="failures_issue_1544") def test_list_tasks_paginate(self): size = 10 max = 100 @@ -177,6 +176,7 @@ def test_get_task_lazy(self): ) @mock.patch("openml.tasks.functions.get_dataset") + @pytest.mark.xfail(reason="failures_issue_1544") def test_removal_upon_download_failure(self, get_dataset): class WeirdException(Exception): pass diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 4480c2cbc..65c4ac37c 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -5,6 +5,7 @@ import openml from openml.testing import TestBase +import pytest # Common methods between tasks @@ -15,6 +16,7 @@ def setUp(self): def tearDown(self): super().tearDown() + @pytest.mark.xfail(reason="failures_issue_1544") def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # tags can be at most 64 alphanumeric (+ underscore) chars From 7179d4b9e63b5949006693eb0bf6c68d2cb7dd7e Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Wed, 31 Dec 2025 11:13:06 +0530 Subject: [PATCH 14/18] Only remove newly added --- openml/tasks/functions.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openml/tasks/functions.py b/openml/tasks/functions.py index d2bf5e946..e9b879ae4 100644 --- a/openml/tasks/functions.py +++ b/openml/tasks/functions.py @@ -415,8 +415,9 @@ def get_task( if not isinstance(task_id, int): raise TypeError(f"Task id should be integer, is {type(task_id)}") - tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) - + cache_key_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id) + tid_cache_dir = cache_key_dir / str(task_id) + tid_cache_dir_existed = tid_cache_dir.exists() try: task = _get_task_description(task_id) dataset = get_dataset(task.dataset_id, **get_dataset_kwargs) @@ -430,7 +431,8 @@ def get_task( if download_splits and isinstance(task, OpenMLSupervisedTask): task.download_split() except Exception as e: - openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) + if not tid_cache_dir_existed: + openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir) raise e return task From 9889bbf23b171e9395c4c24e95009e223e591db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Wed, 31 Dec 2025 10:26:14 +0100 Subject: [PATCH 15/18] Update test_run_functions.py --- tests/test_runs/test_run_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 3bb4b0a0c..37f478fa7 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -822,7 +822,7 @@ def test_run_and_upload_gridsearch(self): assert len(run.trace.trace_iterations) == 9 @pytest.mark.sklearn() - @pytest.mark.xfail(reason="failures_issue_1544") + @pytest.mark.skip(reason="failures_issue_1544") def test_run_and_upload_randomsearch(self): randomsearch = RandomizedSearchCV( RandomForestClassifier(n_estimators=5), From ed894c1fbd05a33b71369c7a5da18aecca65761c Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Wed, 31 Dec 2025 10:52:47 +0100 Subject: [PATCH 16/18] Update test_run_functions.py --- tests/test_runs/test_run_functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 3988b0037..144abb6a7 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -792,6 +792,7 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock): call_count += 1 assert call_count == 3 + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() def test_run_and_upload_gridsearch(self): estimator_name = ( @@ -847,6 +848,7 @@ def test_run_and_upload_randomsearch(self): trace = openml.runs.get_run_trace(run.run_id) assert len(trace.trace_iterations) == 5 + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() def test_run_and_upload_maskedarrays(self): # This testcase is important for 2 reasons: From 725cbce95d42c5dd946171e24c7091315a9256e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Franz=20Kir=C3=A1ly?= Date: Wed, 31 Dec 2025 11:06:17 +0100 Subject: [PATCH 17/18] Update test_run_functions.py --- tests/test_runs/test_run_functions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 144abb6a7..645e008db 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -625,6 +625,7 @@ def _run_and_upload_regression( sentinel=sentinel, ) + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() def test_run_and_upload_logistic_regression(self): lr = LogisticRegression(solver="lbfgs", max_iter=1000) @@ -633,6 +634,7 @@ def test_run_and_upload_logistic_regression(self): n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(lr, task_id, n_missing_vals, n_test_obs, "62501") + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() def test_run_and_upload_linear_regression(self): lr = LinearRegression() @@ -663,6 +665,7 @@ def test_run_and_upload_linear_regression(self): n_test_obs = self.TEST_SERVER_TASK_REGRESSION["n_test_obs"] self._run_and_upload_regression(lr, task_id, n_missing_vals, n_test_obs, "62501") + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() def test_run_and_upload_pipeline_dummy_pipeline(self): pipeline1 = Pipeline( @@ -676,6 +679,7 @@ def test_run_and_upload_pipeline_dummy_pipeline(self): n_test_obs = self.TEST_SERVER_TASK_SIMPLE["n_test_obs"] self._run_and_upload_classification(pipeline1, task_id, n_missing_vals, n_test_obs, "62501") + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() @unittest.skipIf( Version(sklearn.__version__) < Version("0.20"), @@ -740,6 +744,7 @@ def get_ct_cf(nominal_indices, numeric_indices): sentinel=sentinel, ) + @pytest.mark.skip(reason="failures_issue_1544") @pytest.mark.sklearn() @unittest.skip("https://github.com/openml/OpenML/issues/1180") @unittest.skipIf( From b1e06ecc0a49f853af851f0dbc1da652549d1112 Mon Sep 17 00:00:00 2001 From: JATAYU000 Date: Wed, 31 Dec 2025 16:19:33 +0530 Subject: [PATCH 18/18] Revert all xfails which passed --- tests/test_datasets/test_dataset.py | 2 -- tests/test_datasets/test_dataset_functions.py | 20 ------------------- tests/test_runs/test_run.py | 1 - tests/test_runs/test_run_functions.py | 4 ---- tests/test_setups/test_setup_functions.py | 1 - tests/test_study/test_study_functions.py | 2 -- 
tests/test_tasks/test_classification_task.py | 3 --- tests/test_tasks/test_learning_curve_task.py | 3 --- tests/test_tasks/test_regression_task.py | 1 - tests/test_tasks/test_supervised_task.py | 1 - tests/test_tasks/test_task_functions.py | 4 ---- tests/test_tasks/test_task_methods.py | 1 - 12 files changed, 43 deletions(-) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index bf570cd3c..66e9b8554 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -303,7 +303,6 @@ def test_get_feature_with_ontology_data_id_11(): assert len(dataset.features[2].ontologies) >= 1 assert len(dataset.features[3].ontologies) >= 1 -@pytest.mark.xfail(reason="failures_issue_1544") def test_add_remove_ontology_to_dataset(): did = 1 feature_index = 1 @@ -311,7 +310,6 @@ def test_add_remove_ontology_to_dataset(): openml.datasets.functions.data_feature_add_ontology(did, feature_index, ontology) openml.datasets.functions.data_feature_remove_ontology(did, feature_index, ontology) -@pytest.mark.xfail(reason="failures_issue_1544") def test_add_same_ontology_multiple_features(): did = 1 ontology = "https://www.openml.org/unittest/" + str(time()) diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py index 1c2058d21..266a6f6f7 100644 --- a/tests/test_datasets/test_dataset_functions.py +++ b/tests/test_datasets/test_dataset_functions.py @@ -244,7 +244,6 @@ def test_get_datasets(self): assert len(datasets) == 2 _assert_datasets_retrieved_successfully([1, 2]) - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_by_name(self): dataset = openml.datasets.get_dataset("anneal") assert type(dataset) == OpenMLDataset @@ -263,7 +262,6 @@ def test_get_dataset_download_all_files(self): # test_get_dataset_lazy raise NotImplementedError - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_uint8_dtype(self): dataset = openml.datasets.get_dataset(1) assert type(dataset) == OpenMLDataset @@ -282,7 +280,6 @@ def test_dataset_by_name_cannot_access_private_data(self): self.use_production_server() self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE") - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_lazy_all_functions(self): """Test that all expected functionality is available without downloading the dataset.""" dataset = openml.datasets.get_dataset(1) @@ -312,7 +309,6 @@ def ensure_absence_of_real_data(): assert classes == ["1", "2", "3", "4", "5", "U"] ensure_absence_of_real_data() - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_sparse(self): dataset = openml.datasets.get_dataset(102) X, *_ = dataset.get_data() @@ -331,7 +327,6 @@ def test__get_dataset_description(self): description_xml_path = os.path.join(self.workdir, "description.xml") assert os.path.exists(description_xml_path) - @pytest.mark.xfail(reason="failures_issue_1544") def test__getarff_path_dataset_arff(self): openml.config.set_root_cache_directory(self.static_cache_dir) description = _get_dataset_description(self.workdir, 2) @@ -435,14 +430,12 @@ def test__getarff_md5_issue(self): openml.config.connection_n_retries = n - @pytest.mark.xfail(reason="failures_issue_1544") def test__get_dataset_features(self): features_file = _get_dataset_features_file(self.workdir, 2) assert isinstance(features_file, Path) features_xml_path = self.workdir / "features.xml" assert features_xml_path.exists() - 
@pytest.mark.xfail(reason="failures_issue_1544") def test__get_dataset_qualities(self): qualities = _get_dataset_qualities_file(self.workdir, 2) assert isinstance(qualities, Path) @@ -513,7 +506,6 @@ def test_deletion_of_cache_dir_faulty_download(self, patch): datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets") assert len(os.listdir(datasets_cache_dir)) == 0 - @pytest.mark.xfail(reason="failures_issue_1544") def test_publish_dataset(self): # lazy loading not possible as we need the arff-file. openml.datasets.get_dataset(3, download_data=True) @@ -861,7 +853,6 @@ def test_create_invalid_dataset(self): param["data"] = data[0] self.assertRaises(ValueError, create_dataset, **param) - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_online_dataset_arff(self): dataset_id = 100 # Australian # lazy loading not used as arff file is checked. @@ -1341,7 +1332,6 @@ def test_list_qualities(self): assert isinstance(qualities, list) is True assert all(isinstance(q, str) for q in qualities) is True - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_cache_format_pickle(self): dataset = openml.datasets.get_dataset(1) dataset.get_data() @@ -1357,7 +1347,6 @@ def test_get_dataset_cache_format_pickle(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_cache_format_feather(self): # This test crashed due to using the parquet file by default, which is downloaded # from minio. However, there is a mismatch between OpenML test server and minio IDs. @@ -1390,7 +1379,6 @@ def test_get_dataset_cache_format_feather(self): assert len(categorical) == X.shape[1] assert len(attribute_names) == X.shape[1] - @pytest.mark.xfail(reason="failures_issue_1544") def test_data_edit_non_critical_field(self): # Case 1 # All users can edit non-critical fields of datasets @@ -1412,7 +1400,6 @@ def test_data_edit_non_critical_field(self): edited_dataset = openml.datasets.get_dataset(did) assert edited_dataset.description == desc - @pytest.mark.xfail(reason="failures_issue_1544") def test_data_edit_critical_field(self): # Case 2 # only owners (or admin) can edit all critical fields of datasets @@ -1461,7 +1448,6 @@ def test_data_edit_requires_valid_dataset(self): description="xor operation dataset", ) - @pytest.mark.xfail(reason="failures_issue_1544") def test_data_edit_cannot_edit_critical_field_if_dataset_has_task(self): # Need to own a dataset to be able to edit meta-data # Will be creating a forked version of an existing dataset to allow the unit test user @@ -1537,7 +1523,6 @@ def test_list_datasets_with_high_size_parameter(self): (None, None, ["wrong", "sunny"]), ], ) -@pytest.mark.xfail(reason="failures_issue_1544",strict=False) def test_invalid_attribute_validations( default_target_attribute, row_id_attribute, @@ -1599,7 +1584,6 @@ def test_invalid_attribute_validations( (None, None, ["outlook", "windy"]), ], ) -@pytest.mark.xfail(reason="failures_issue_1544",strict=False) def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute): data = [ ["a", "sunny", 85.0, 85.0, "FALSE", "no"], @@ -1818,7 +1802,6 @@ def test_list_datasets_by_number_instances(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(small_datasets) -@pytest.mark.xfail(reason="failures_issue_1544") def test_list_datasets_by_number_features(all_datasets: pd.DataFrame): wide_datasets = 
openml.datasets.list_datasets(number_features="50..100") assert 8 <= len(wide_datasets) < len(all_datasets) @@ -1831,14 +1814,12 @@ def test_list_datasets_by_number_classes(all_datasets: pd.DataFrame): _assert_datasets_have_id_and_valid_status(five_class_datasets) -@pytest.mark.xfail(reason="failures_issue_1544") def test_list_datasets_by_number_missing_values(all_datasets: pd.DataFrame): na_datasets = openml.datasets.list_datasets(number_missing_values="5..100") assert 5 <= len(na_datasets) < len(all_datasets) _assert_datasets_have_id_and_valid_status(na_datasets) -@pytest.mark.xfail(reason="failures_issue_1544") def test_list_datasets_combined_filters(all_datasets: pd.DataFrame): combined_filter_datasets = openml.datasets.list_datasets( tag="study_14", @@ -1911,7 +1892,6 @@ def isolate_for_test(): ("with_data", "with_qualities", "with_features"), itertools.product([True, False], repeat=3), ) -@pytest.mark.xfail(reason="failures_issue_1544") def test_get_dataset_lazy_behavior( isolate_for_test, with_data: bool, with_qualities: bool, with_features: bool ): diff --git a/tests/test_runs/test_run.py b/tests/test_runs/test_run.py index 6fbe166ff..034b731aa 100644 --- a/tests/test_runs/test_run.py +++ b/tests/test_runs/test_run.py @@ -25,7 +25,6 @@ class TestRun(TestBase): # Splitting not helpful, these test's don't rely on the server and take # less than 1 seconds - @pytest.mark.xfail(reason="failures_issue_1544") def test_tagging(self): runs = openml.runs.list_runs(size=1) assert not runs.empty, "Test server state is incorrect" diff --git a/tests/test_runs/test_run_functions.py b/tests/test_runs/test_run_functions.py index 645e008db..e4cec56ab 100644 --- a/tests/test_runs/test_run_functions.py +++ b/tests/test_runs/test_run_functions.py @@ -1678,7 +1678,6 @@ def test_format_prediction_non_supervised(self): ): format_prediction(clustering, *ignored_input) - @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_classification_no_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1688,7 +1687,6 @@ def test_format_prediction_classification_no_probabilities(self): with pytest.raises(ValueError, match="`proba` is required for classification task"): format_prediction(classification, *ignored_input, proba=None) - @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_classification_incomplete_probabilities(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1699,7 +1697,6 @@ def test_format_prediction_classification_incomplete_probabilities(self): with pytest.raises(ValueError, match="Each class should have a predicted probability"): format_prediction(classification, *ignored_input, proba=incomplete_probabilities) - @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_task_without_classlabels_set(self): classification = openml.tasks.get_task( self.TEST_SERVER_TASK_SIMPLE["task_id"], @@ -1710,7 +1707,6 @@ def test_format_prediction_task_without_classlabels_set(self): with pytest.raises(ValueError, match="The classification task must have class labels set"): format_prediction(classification, *ignored_input, proba={}) - @pytest.mark.xfail(reason="failures_issue_1544") def test_format_prediction_task_learning_curve_sample_not_set(self): learning_curve = openml.tasks.get_task(801, download_data=False) # diabetes;crossvalidation probabilities = {c: 0.2 for c in learning_curve.class_labels} diff --git a/tests/test_setups/test_setup_functions.py 
b/tests/test_setups/test_setup_functions.py index d371f6588..42af5362b 100644 --- a/tests/test_setups/test_setup_functions.py +++ b/tests/test_setups/test_setup_functions.py @@ -163,7 +163,6 @@ def test_list_setups_output_format(self): assert isinstance(setups, pd.DataFrame) assert len(setups) == 10 - @pytest.mark.xfail(reason="failures_issue_1544") def test_setuplist_offset(self): size = 10 setups = openml.setups.list_setups(offset=0, size=size) diff --git a/tests/test_study/test_study_functions.py b/tests/test_study/test_study_functions.py index 837feb5bb..40026592f 100644 --- a/tests/test_study/test_study_functions.py +++ b/tests/test_study/test_study_functions.py @@ -148,7 +148,6 @@ def test_publish_empty_study_implicit(self): self._test_publish_empty_study_is_allowed(explicit=False) @pytest.mark.flaky() - @pytest.mark.xfail(reason="failures_issue_1544") def test_publish_study(self): # get some random runs to attach run_list = openml.evaluations.list_evaluations("predictive_accuracy", size=10) @@ -218,7 +217,6 @@ def test_publish_study(self): res = openml.study.delete_study(study.id) assert res - @pytest.mark.xfail(reason="failures_issue_1544") def test_study_attach_illegal(self): run_list = openml.runs.list_runs(size=10) assert len(run_list) == 10 diff --git a/tests/test_tasks/test_classification_task.py b/tests/test_tasks/test_classification_task.py index 70c3115e5..d4f2ed9d7 100644 --- a/tests/test_tasks/test_classification_task.py +++ b/tests/test_tasks/test_classification_task.py @@ -18,7 +18,6 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_CLASSIFICATION self.estimation_procedure = 5 - @pytest.mark.xfail(reason="failures_issue_1544") def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id @@ -26,14 +25,12 @@ def test_download_task(self): assert task.dataset_id == 20 assert task.estimation_procedure_id == self.estimation_procedure - @pytest.mark.xfail(reason="failures_issue_1544") def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] @pytest.mark.server() -@pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(): task = get_task(119) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_learning_curve_task.py b/tests/test_tasks/test_learning_curve_task.py index f8ed876cc..4a3dede4e 100644 --- a/tests/test_tasks/test_learning_curve_task.py +++ b/tests/test_tasks/test_learning_curve_task.py @@ -18,7 +18,6 @@ def setUp(self, n_levels: int = 1): self.task_type = TaskType.LEARNING_CURVE self.estimation_procedure = 13 - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (768, 8) @@ -27,14 +26,12 @@ def test_get_X_and_Y(self): assert isinstance(Y, pd.Series) assert pd.api.types.is_categorical_dtype(Y) - @pytest.mark.xfail(reason="failures_issue_1544") def test_download_task(self): task = super().test_download_task() assert task.task_id == self.task_id assert task.task_type_id == TaskType.LEARNING_CURVE assert task.dataset_id == 20 - @pytest.mark.xfail(reason="failures_issue_1544") def test_class_labels(self): task = get_task(self.task_id) assert task.class_labels == ["tested_negative", "tested_positive"] diff --git a/tests/test_tasks/test_regression_task.py b/tests/test_tasks/test_regression_task.py index 5c3e79061..3e324c4f8 100644 --- a/tests/test_tasks/test_regression_task.py +++ b/tests/test_tasks/test_regression_task.py @@ -49,7 +49,6 @@ 
def setUp(self, n_levels: int = 1): self.task_type = TaskType.SUPERVISED_REGRESSION - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(self): X, Y = super().test_get_X_and_Y() assert X.shape == (194, 32) diff --git a/tests/test_tasks/test_supervised_task.py b/tests/test_tasks/test_supervised_task.py index 48e036d3e..e5a17a72b 100644 --- a/tests/test_tasks/test_supervised_task.py +++ b/tests/test_tasks/test_supervised_task.py @@ -28,7 +28,6 @@ def setUpClass(cls): def setUp(self, n_levels: int = 1): super().setUp() - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_X_and_Y(self) -> tuple[pd.DataFrame, pd.Series]: task = get_task(self.task_id) X, Y = task.get_X_and_y() diff --git a/tests/test_tasks/test_task_functions.py b/tests/test_tasks/test_task_functions.py index 25dab05ea..0aa2dcc9b 100644 --- a/tests/test_tasks/test_task_functions.py +++ b/tests/test_tasks/test_task_functions.py @@ -139,7 +139,6 @@ def test__get_task_live(self): # https://github.com/openml/openml-python/issues/378 openml.tasks.get_task(34536) - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_task(self): task = openml.tasks.get_task(1, download_data=True) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -153,7 +152,6 @@ def test_get_task(self): os.path.join(self.workdir, "org", "openml", "test", "datasets", "1", "dataset.arff") ) - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_task_lazy(self): task = openml.tasks.get_task(2, download_data=False) # anneal; crossvalidation assert isinstance(task, OpenMLTask) @@ -194,7 +192,6 @@ def assert_and_raise(*args, **kwargs): # Now the file should no longer exist assert not os.path.exists(os.path.join(os.getcwd(), "tasks", "1", "tasks.xml")) - @pytest.mark.xfail(reason="failures_issue_1544") def test_get_task_with_cache(self): openml.config.set_root_cache_directory(self.static_cache_dir) task = openml.tasks.get_task(1) @@ -210,7 +207,6 @@ def test_get_task_different_types(self): # Issue 538, get_task failing with clustering task. openml.tasks.functions.get_task(126033) - @pytest.mark.xfail(reason="failures_issue_1544") def test_download_split(self): task = openml.tasks.get_task(1) # anneal; crossvalidation split = task.download_split() diff --git a/tests/test_tasks/test_task_methods.py b/tests/test_tasks/test_task_methods.py index 65c4ac37c..540c43de0 100644 --- a/tests/test_tasks/test_task_methods.py +++ b/tests/test_tasks/test_task_methods.py @@ -16,7 +16,6 @@ def setUp(self): def tearDown(self): super().tearDown() - @pytest.mark.xfail(reason="failures_issue_1544") def test_tagging(self): task = openml.tasks.get_task(1) # anneal; crossvalidation # tags can be at most 64 alphanumeric (+ underscore) chars
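
Note (illustrative, not part of the patches above): the series leans on the difference between pytest's xfail and skip markers for the issue-1544 failures. xfail still executes the test, so a real failure is reported as XFAIL and an unexpected pass as XPASS, which is what allows the bulk revert of passing tests in PATCH 18; skip (used in PATCH 15-17) never runs the test at all. A minimal, self-contained sketch of these semantics, with hypothetical test names:

    import pytest

    @pytest.mark.xfail(reason="failures_issue_1544")
    def test_known_failure_reported_as_xfail():
        # Executed; the failing assert is recorded as XFAIL, not as a test error.
        assert 1 == 2

    @pytest.mark.xfail(reason="failures_issue_1544", strict=False)
    def test_unexpected_pass_reported_as_xpass():
        # strict=False keeps an unexpected pass (XPASS) from failing the run,
        # even when xfail_strict = true is set in the pytest configuration.
        assert 1 == 1

    @pytest.mark.skip(reason="failures_issue_1544")
    def test_never_executed():
        # Never runs, so it can neither fail nor surface an unexpected pass.
        raise RuntimeError("unreachable")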