Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions python/pyspark/pandas/data_type_ops/string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pandas as pd
from pandas.api.types import CategoricalDtype

from pyspark.loose_version import LooseVersion
from pyspark.sql import functions as F
from pyspark.sql.types import IntegralType, StringType
from pyspark.sql.utils import pyspark_column_op
Expand Down Expand Up @@ -140,6 +141,13 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
else:
return _as_other_type(index_ops, dtype, spark_type)

def restore(self, col: pd.Series) -> pd.Series:
    """Restore column when to_pandas."""
    # pandas >= 3 infers string data as the new default `str` dtype, so
    # cast the restored column back to the dtype tracked by this ops
    # instance; on older pandas the parent behavior is already correct.
    if LooseVersion(pd.__version__) >= "3.0.0":
        return col.astype(self.dtype)
    return super().restore(col)


class StringExtensionOps(StringOps):
"""
Expand Down
15 changes: 14 additions & 1 deletion python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,13 @@ def _reduce_for_stat_function(
internal = InternalFrame(
spark_frame=sdf,
index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
index_fields=(
None
if LooseVersion(pd.__version__) < "3.0.0"
# Explicitly specify the dtype as "object" to avoid converting to `nan`
# due to pandas 3's default string type (`str`) behavior.
else [InternalField(np.dtype("object"))]
),
column_labels=new_column_labels,
column_label_names=self._internal.column_label_names,
)
Expand Down Expand Up @@ -4994,7 +5001,13 @@ def nunique(
spark_frame=sdf,
index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
index_names=[None],
index_fields=[None],
index_fields=(
[None]
if LooseVersion(pd.__version__) < "3.0.0"
# Explicitly specify the dtype as "object" to avoid converting to `nan`
# due to pandas 3's default string type (`str`) behavior.
else [InternalField(np.dtype("object"))]
),
data_spark_columns=[
scol_for(sdf, col) for col in self._internal.data_spark_column_names
],
Expand Down
19 changes: 19 additions & 0 deletions python/pyspark/pandas/tests/computation/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,25 @@ def test_nunique(self):
psdf.nunique(axis=1, dropna=False).tolist(), pdf.nunique(axis=1, dropna=False).tolist()
)

def test_nunique_with_string_column_and_missing_values(self):
    """nunique over string columns holding None matches pandas, including
    the dtype of the resulting (column-label) index, both lazily and after
    converting back to pandas."""
    pdf = pd.DataFrame({"A": ["x", None, "x"], "B": ["y", "z", None]})
    psdf = ps.from_pandas(pdf)

    for dropna in (True, False):
        pser = pdf.nunique(dropna=dropna)
        psser = psdf.nunique(dropna=dropna)

        self.assert_eq(psser, pser)
        # The index dtype must survive both the on-Spark representation
        # and a full round-trip back to pandas.
        self.assertEqual(psser.index.dtype, pser.index.dtype)
        self.assertEqual(psser.to_pandas().index.dtype, pser.index.dtype)

def test_quantile(self):
pdf, psdf = self.df_pair

Expand Down
10 changes: 10 additions & 0 deletions python/pyspark/pandas/tests/computation/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@


class FrameCumulativeMixin:
def test_cumulative_reduction_preserves_none_name(self):
    """cumsum followed by sum agrees with pandas and keeps the unnamed
    result series unnamed after converting back to pandas."""
    pdf = pd.DataFrame({"A": [2.0, 5.0, 1.0], "B": [1.0, None, 0.0]})
    psdf = ps.from_pandas(pdf)

    pser = pdf.cumsum().sum()
    psser = psdf.cumsum().sum()

    self.assert_eq(psser, pser)
    # The reduced series carries no name; the round-trip must not invent one.
    self.assertEqual(psser._to_pandas().name, pser.name)

def _test_cummin(self, pdf, psdf):
self.assert_eq(pdf.cummin(), psdf.cummin())
self.assert_eq(pdf.cummin(skipna=False), psdf.cummin(skipna=False))
Expand Down
8 changes: 8 additions & 0 deletions python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ def test_from_to_pandas(self):
self.assert_eq(pser, psser._to_pandas())
self.assert_eq(ps.from_pandas(pser), psser)

def test_from_to_pandas_with_missing_values(self):
    """Round-tripping a string series containing None preserves both the
    values and the original dtype."""
    pser = pd.Series(["x", None, "z"])
    psser = ps.from_pandas(pser)
    restored = psser.to_pandas()

    self.assert_eq(restored, pser)
    # Both the on-Spark series and its pandas round-trip keep the dtype.
    self.assertEqual(psser.dtype, pser.dtype)
    self.assertEqual(restored.dtype, pser.dtype)

def test_isnull(self):
self.assert_eq(self.pdf["string"].isnull(), self.psdf["string"].isnull())

Expand Down
13 changes: 13 additions & 0 deletions python/pyspark/pandas/tests/frame/test_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ def test_all_null_dataframe(self):
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf, pdf)

def test_to_pandas_with_nullable_string_column(self):
    """to_pandas keeps the values and dtype of a string column holding
    None, with and without the Arrow conversion path."""
    pdf = pd.DataFrame({"a": ["x", None, "z"]})

    def check_roundtrip():
        result = ps.from_pandas(pdf).to_pandas()
        self.assert_eq(result, pdf)
        self.assertEqual(result["a"].dtype, pdf["a"].dtype)

    # Default (Arrow-enabled) conversion path.
    check_roundtrip()
    # The non-Arrow fallback path must behave identically.
    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        check_roundtrip()

def test_nullable_object(self):
pdf = pd.DataFrame(
{
Expand Down