Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions python/pyspark/pandas/data_type_ops/string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import pandas as pd
from pandas.api.types import CategoricalDtype

from pyspark.loose_version import LooseVersion
from pyspark.sql import functions as F
from pyspark.sql.types import IntegralType, StringType
from pyspark.sql.utils import pyspark_column_op
Expand Down Expand Up @@ -140,6 +141,13 @@ def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> Ind
else:
return _as_other_type(index_ops, dtype, spark_type)

def restore(self, col: pd.Series) -> pd.Series:
    """Restore column when to_pandas."""
    # pandas >= 3 infers string data as the new default `str` dtype, so
    # cast the restored column back to the dtype tracked by this ops
    # instance; on older pandas the parent behavior is already correct.
    if LooseVersion(pd.__version__) >= "3.0.0":
        return col.astype(self.dtype)
    return super().restore(col)


class StringExtensionOps(StringOps):
"""
Expand Down
15 changes: 14 additions & 1 deletion python/pyspark/pandas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,13 @@ def _reduce_for_stat_function(
internal = InternalFrame(
spark_frame=sdf,
index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
index_fields=(
None
if LooseVersion(pd.__version__) < "3.0.0"
# Explicitly specify the dtype as "object" to avoid converting to `nan`
# due to pandas 3's default string type (`str`) behavior.
else [InternalField(np.dtype("object"))]
),
column_labels=new_column_labels,
column_label_names=self._internal.column_label_names,
)
Expand Down Expand Up @@ -4994,7 +5001,13 @@ def nunique(
spark_frame=sdf,
index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
index_names=[None],
index_fields=[None],
index_fields=(
[None]
if LooseVersion(pd.__version__) < "3.0.0"
# Explicitly specify the dtype as "object" to avoid converting to `nan`
# due to pandas 3's default string type (`str`) behavior.
else [InternalField(np.dtype("object"))]
),
data_spark_columns=[
scol_for(sdf, col) for col in self._internal.data_spark_column_names
],
Expand Down
19 changes: 19 additions & 0 deletions python/pyspark/pandas/tests/computation/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,25 @@ def test_nunique(self):
psdf.nunique(axis=1, dropna=False).tolist(), pdf.nunique(axis=1, dropna=False).tolist()
)

def test_nunique_with_string_column_and_missing_values(self):
    """nunique over string columns holding None matches pandas, including
    the dtype of the resulting (column-label) index, both lazily and after
    converting back to pandas."""
    pdf = pd.DataFrame({"A": ["x", None, "x"], "B": ["y", "z", None]})
    psdf = ps.from_pandas(pdf)

    for dropna in (True, False):
        pser = pdf.nunique(dropna=dropna)
        psser = psdf.nunique(dropna=dropna)

        self.assert_eq(psser, pser)
        # The index dtype must survive both the on-Spark representation
        # and a full round-trip back to pandas.
        self.assertEqual(psser.index.dtype, pser.index.dtype)
        self.assertEqual(psser.to_pandas().index.dtype, pser.index.dtype)

def test_quantile(self):
pdf, psdf = self.df_pair

Expand Down
10 changes: 10 additions & 0 deletions python/pyspark/pandas/tests/computation/test_cumulative.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,16 @@


class FrameCumulativeMixin:
def test_cumulative_reduction_preserves_none_name(self):
    """cumsum followed by sum agrees with pandas and keeps the unnamed
    result series unnamed after converting back to pandas."""
    pdf = pd.DataFrame({"A": [2.0, 5.0, 1.0], "B": [1.0, None, 0.0]})
    psdf = ps.from_pandas(pdf)

    pser = pdf.cumsum().sum()
    psser = psdf.cumsum().sum()

    self.assert_eq(psser, pser)
    # The reduced series carries no name; the round-trip must not invent one.
    self.assertEqual(psser._to_pandas().name, pser.name)

def _test_cummin(self, pdf, psdf):
self.assert_eq(pdf.cummin(), psdf.cummin())
self.assert_eq(pdf.cummin(skipna=False), psdf.cummin(skipna=False))
Expand Down
8 changes: 8 additions & 0 deletions python/pyspark/pandas/tests/data_type_ops/test_string_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,14 @@ def test_from_to_pandas(self):
self.assert_eq(pser, psser._to_pandas())
self.assert_eq(ps.from_pandas(pser), psser)

def test_from_to_pandas_with_missing_values(self):
    """Round-tripping a string series containing None preserves both the
    values and the original dtype."""
    pser = pd.Series(["x", None, "z"])
    psser = ps.from_pandas(pser)
    restored = psser.to_pandas()

    self.assert_eq(restored, pser)
    # Both the on-Spark series and its pandas round-trip keep the dtype.
    self.assertEqual(psser.dtype, pser.dtype)
    self.assertEqual(restored.dtype, pser.dtype)

def test_isnull(self):
self.assert_eq(self.pdf["string"].isnull(), self.psdf["string"].isnull())

Expand Down
13 changes: 13 additions & 0 deletions python/pyspark/pandas/tests/frame/test_spark.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ def test_all_null_dataframe(self):
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf, pdf)

def test_to_pandas_with_nullable_string_column(self):
    """to_pandas keeps the values and dtype of a string column holding
    None, with and without the Arrow conversion path."""
    pdf = pd.DataFrame({"a": ["x", None, "z"]})

    def check_roundtrip():
        result = ps.from_pandas(pdf).to_pandas()
        self.assert_eq(result, pdf)
        self.assertEqual(result["a"].dtype, pdf["a"].dtype)

    # Default (Arrow-enabled) conversion path.
    check_roundtrip()
    # The non-Arrow fallback path must behave identically.
    with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
        check_roundtrip()

def test_nullable_object(self):
pdf = pd.DataFrame(
{
Expand Down