From 64c7f6783b4a6e3965fb3c37620653084fd6e6e0 Mon Sep 17 00:00:00 2001 From: jonas2612 Date: Thu, 30 Oct 2025 07:59:10 +0100 Subject: [PATCH 1/4] fix categories import --- src/spatialdata/models/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index 60f4ee20..e8bc0815 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -885,7 +885,8 @@ def _add_metadata_and_validate( # It also just changes the state of the series, so it is not a big deal. if isinstance(data[c].dtype, CategoricalDtype) and not data[c].cat.known: try: - data[c] = data[c].cat.set_categories(data[c].head(1).cat.categories) + data[c] = data[c].cat.as_known() + data[c] = data[c].cat.set_categories(data[c]._meta.cat.categories) except ValueError: logger.info(f"Column `{c}` contains unknown categories. Consider casting it.") From 8425f3497135c7741125d319bbb1d2f65fff7af5 Mon Sep 17 00:00:00 2001 From: jonas2612 Date: Mon, 3 Nov 2025 16:17:46 +0100 Subject: [PATCH 2/4] ensure reproducibility --- src/spatialdata/models/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index e8bc0815..a4efd60e 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -885,8 +885,7 @@ def _add_metadata_and_validate( # It also just changes the state of the series, so it is not a big deal. if isinstance(data[c].dtype, CategoricalDtype) and not data[c].cat.known: try: - data[c] = data[c].cat.as_known() - data[c] = data[c].cat.set_categories(data[c]._meta.cat.categories) + data[c] = data[c].cat.set_categories(data[c].compute().cat.categories) except ValueError: logger.info(f"Column `{c}` contains unknown categories. 
Consider casting it.") From 040021bb2ab152ee262063363735eaf144df3bbe Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sat, 3 Jan 2026 23:24:53 +0100 Subject: [PATCH 3/4] add test for missing categories bug in PointsModel --- src/spatialdata/models/models.py | 1 + tests/models/test_models.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index 92acbec4..e9fe0f32 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -891,6 +891,7 @@ def _add_metadata_and_validate( # It also just changes the state of the series, so it is not a big deal. if isinstance(data[c].dtype, CategoricalDtype) and not data[c].cat.known: try: + # data[c] = data[c].cat.set_categories(data[c].head(1).cat.categories) data[c] = data[c].cat.set_categories(data[c].compute().cat.categories) except ValueError: logger.info(f"Column `{c}` contains unknown categories. Consider casting it.") diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 124933f4..1e82b698 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -830,3 +830,31 @@ def test_warning_on_large_chunks(): assert len(w) == 1, "Warning should be raised for large chunk size" assert issubclass(w[-1].category, UserWarning) assert "Detected chunks larger than:" in str(w[-1].message) + + +def test_categories_on_partitioned_dataframe(sdata_blobs: SpatialData): + df = sdata_blobs["blobs_points"].compute() + df["genes"] = RNG.choice([f"gene_{i}" for i in range(200)], len(df)) + N_PARTITIONS = 200 + ddf = dd.from_pandas(df, npartitions=N_PARTITIONS) + ddf["genes"] = ddf["genes"].astype("category") + + df["genes"] = df["genes"].astype("category") + df_parsed = PointsModel.parse(df, npartitions=N_PARTITIONS) + ddf_parsed = PointsModel.parse(ddf, npartitions=N_PARTITIONS) + + assert df["genes"].equals(df_parsed["genes"].compute()) + assert 
df["genes"].cat.categories.equals(df_parsed["genes"].compute().cat.categories) + + assert np.array_equal(df["genes"].to_numpy(), ddf_parsed["genes"].compute().to_numpy()) + assert set(df["genes"].cat.categories.tolist()) == set(ddf_parsed["genes"].compute().cat.categories.tolist()) + + # two behaviors to investigate later/report to dask (they originate in dask) + # TODO: df['genes'].cat.categories has dtype 'object', while ddf_parsed['genes'].compute().cat.categories has dtype + # 'string' + # this problem should disappear after pandas 3.0 is released + assert df["genes"].cat.categories.dtype == "object" + assert ddf_parsed["genes"].compute().cat.categories.dtype == "string" + + # TODO: the list of categories does not preserve the order + assert df["genes"].cat.categories.tolist() != ddf_parsed["genes"].compute().cat.categories.tolist() From 5103dbed23e6f4e4785812add51d714b182a89ff Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sat, 3 Jan 2026 23:42:17 +0100 Subject: [PATCH 4/4] remove comment --- src/spatialdata/models/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index e9fe0f32..92acbec4 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -891,7 +891,6 @@ def _add_metadata_and_validate( # It also just changes the state of the series, so it is not a big deal. if isinstance(data[c].dtype, CategoricalDtype) and not data[c].cat.known: try: - # data[c] = data[c].cat.set_categories(data[c].head(1).cat.categories) data[c] = data[c].cat.set_categories(data[c].compute().cat.categories) except ValueError: logger.info(f"Column `{c}` contains unknown categories. Consider casting it.")