From 3feba1136303923a10df9b2e8aa261de58b318c8 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 9 Feb 2026 11:59:28 -0800 Subject: [PATCH 01/14] reformat script --- .../models/mixins/table_components.py | 86 ++++++++++++++++++- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/synapseclient/models/mixins/table_components.py b/synapseclient/models/mixins/table_components.py index dc583975b..5acb9e3db 100644 --- a/synapseclient/models/mixins/table_components.py +++ b/synapseclient/models/mixins/table_components.py @@ -138,9 +138,23 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]: ) +# class EllipsisJSONEncoder(json.JSONEncoder): +# """Custom JSON encoder that handles Ellipsis and pandas NA objects by converting them to strings.""" +# def default(self, obj): +# if obj is ...: +# return "..." +# # Handle pandas NA types +# import pandas as pd +# if obj is pd.NA or (hasattr(obj, '__class__') and obj.__class__.__name__ == 'NAType'): +# return None +# return super().default(obj) + + def convert_dtypes_to_json_serializable(df): """ Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types. + Convert the list and dict columns to JSON strings which are JSON serializable types. + Replace both Ellipsis and pandas NA within nested structures which are not JSON serializable types. Also, convert the ROW_ID, ROW_VERSION, and ROW_ID.1 columns to int columns which are JSON serializable types. Arguments: df: The dataframe to convert the dtypes of. 
@@ -163,6 +177,29 @@ def convert_dtypes_to_json_serializable(df): "datetime_list_col": [[datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], [datetime(2021, 1, 4), datetime(2021, 1, 5), datetime(2021, 1, 6)], None, [datetime(2021, 1, 7), datetime(2021, 1, 8), datetime(2021, 1, 9)]], "entityid_list_col": [["syn123", "syn456", None], ["syn101", "syn102", "syn103"], None, ["syn104", "syn105", "syn106"]], "userid_list_col": [["user1", "user2", "user3"], ["user4", "user5", None], None, ["user7", "user8", "user9"]], + "json_col_with_quotes": [ + { + "id": 1, + "description": 'Text with "quotes" in the description field', + "references": [] + }, + { + "id": 2, + "description": 'Another description with "quoted text" here',` + "references": ["ref1", "ref2"] + }, + { + "id": 3, + "description": 'Description containing "multiple" quoted "words"', + "references": [...] + } + { + "id": 4, + "description": 'Description containing apostrophes sage\'s', + "references": [...] + } + + ], }).convert_dtypes() df = convert_dtypes_to_json_serializable(df) print(df) @@ -170,9 +207,6 @@ def convert_dtypes_to_json_serializable(df): import pandas as pd for col in df.columns: - df[col] = ( - df[col].replace({pd.NA: None}).astype(object) - ) # this will convert the int64 and float64 columns to object columns # Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION) if col in [ "ROW_ID", @@ -180,6 +214,49 @@ def convert_dtypes_to_json_serializable(df): "ROW_ID.1", ]: # ROW_ID.1 is the temporary row id to constrct row to upsert df[col] = df[col].astype(int) + + # Check if any values in the column are lists, dicts, or JSON strings, and serialize them to JSON + if df[col].notna().any(): + sample_values = df[col].dropna() + if len(sample_values): + + def _serialize_json_value(x): + if x is None: + return None + # Serialize lists and dicts to JSON using custom encoder for Ellipsis handling + if isinstance(x, (list, dict)): + # Replace both Ellipsis and pd.NA within 
nested structures + def _reformat_special_values(obj): + if obj is ...: + return "..." + # Handle pandas NA - check type name to avoid array ambiguity + if obj is pd.NA: + return None + if isinstance(obj, dict): + return { + k: _reformat_special_values(v) + for k, v in obj.items() + } + if isinstance(obj, list): + return [_reformat_special_values(item) for item in obj] + return obj + + cleaned_x = _reformat_special_values(x) + return json.dumps(cleaned_x, ensure_ascii=False).replace( + '\\"', "\\'" + ) + + # Handle standalone ellipsis + if x is ...: + return "..." + return x + + df[col] = df[col].apply(lambda x: _serialize_json_value(x)) + + # restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype + df[col] = df[col].convert_dtypes() + # convert the int64 and float64 columns to object columns which are JSON serializable types + df[col] = df[col].replace({pd.NA: None}).astype(object) return df @@ -4031,8 +4108,9 @@ async def _chunk_and_upload_df( to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv` function when writing the data to a CSV file. 
""" + # Serializes dict/list values to JSON strings + df = convert_dtypes_to_json_serializable(df) # Loop over the rows of the DF to determine the size/boundries we'll be uploading - chunks_to_upload = [] size_of_chunk = 0 buffer = BytesIO() From 1c68dace8a23605c09d67b0ccfc63ff06ad0f3a8 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 9 Feb 2026 16:23:23 -0800 Subject: [PATCH 02/14] reorganize code to ensure row columns remain int --- synapseclient/models/mixins/table_components.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/synapseclient/models/mixins/table_components.py b/synapseclient/models/mixins/table_components.py index 5acb9e3db..ca7a03717 100644 --- a/synapseclient/models/mixins/table_components.py +++ b/synapseclient/models/mixins/table_components.py @@ -207,14 +207,6 @@ def convert_dtypes_to_json_serializable(df): import pandas as pd for col in df.columns: - # Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION) - if col in [ - "ROW_ID", - "ROW_VERSION", - "ROW_ID.1", - ]: # ROW_ID.1 is the temporary row id to constrct row to upsert - df[col] = df[col].astype(int) - # Check if any values in the column are lists, dicts, or JSON strings, and serialize them to JSON if df[col].notna().any(): sample_values = df[col].dropna() @@ -257,6 +249,14 @@ def _reformat_special_values(obj): df[col] = df[col].convert_dtypes() # convert the int64 and float64 columns to object columns which are JSON serializable types df[col] = df[col].replace({pd.NA: None}).astype(object) + + # Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION) + if col in [ + "ROW_ID", + "ROW_VERSION", + "ROW_ID.1", + ]: # ROW_ID.1 is the temporary row id to constrct row to upsert + df[col] = df[col].astype(int) return df From 4a29a1674440dc2f3ab3e64430081782ea9fa36e Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 9 Feb 2026 16:23:59 -0800 Subject: [PATCH 03/14] add unit test for convert_dtypes_to_json_serializable --- 
.../mixins/unit_test_table_components.py | 213 ++++++++++++++++++ 1 file changed, 213 insertions(+) diff --git a/tests/unit/synapseclient/mixins/unit_test_table_components.py b/tests/unit/synapseclient/mixins/unit_test_table_components.py index 5f8b91566..72b22098f 100644 --- a/tests/unit/synapseclient/mixins/unit_test_table_components.py +++ b/tests/unit/synapseclient/mixins/unit_test_table_components.py @@ -9,6 +9,7 @@ import numpy as np import pandas as pd import pytest +from pandas.api.types import is_float_dtype, is_integer_dtype, is_object_dtype from synapseclient import Synapse from synapseclient.api import ViewEntityType, ViewTypeMask @@ -37,6 +38,7 @@ _query_table_csv, _query_table_next_page, _query_table_row_set, + convert_dtypes_to_json_serializable, csv_to_pandas_df, ) from synapseclient.models.table_components import ( @@ -3968,3 +3970,214 @@ def test_csv_pandas_df_with_row_id_and_version_etag_in_index( ).convert_dtypes() # resolve datatype issue such as StringDtype vs object # THEN assert the dataframe is equal to the expected dataframe pd.testing.assert_frame_equal(df, expected_df) + + +class TestConvertDtypesToJsonSerializable: + """Tests for convert_dtypes_to_json_serializable function""" + + def test_no_conversion_when_no_na_in_column(self): + """Test that int64 and float64 columns are not converted to object dtype when no NA is present""" + df = pd.DataFrame({"int_col": [1, 2, 3, 4], "float_col": [1.1, 2.2, 3.3, 4.4]}) + assert df["int_col"].dtype == "int64" + assert df["float_col"].dtype == "float64" + + result = convert_dtypes_to_json_serializable(df) + assert is_object_dtype(result.int_col) + assert is_object_dtype(result.float_col) + assert list(result["int_col"]) == [1, 2, 3, 4] + assert list(result["float_col"]) == [1.1, 2.2, 3.3, 4.4] + + def test_convert_na_and_columns_to_object(self): + """Test that pd.NA values are converted to None for int64 and float64 columns by _serialize_json_value""" + df = pd.DataFrame( + { + "int_col": 
pd.array([1, 2, pd.NA, 4], dtype="Int64"), + "float_col": pd.array([1.1, 2.2, pd.NA, 4.4], dtype="Float64"), + } + ) + result = convert_dtypes_to_json_serializable(df) + assert is_object_dtype(result.int_col) + assert is_object_dtype(result.float_col) + assert list(result["int_col"]) == [1, 2, None, 4] + assert list(result["float_col"]) == [1.1, 2.2, None, 4.4] + + def test_row_columns_remain_int(self): + """Test that ROW_ID, ROW_VERSION, and ROW_ID.1 columns remain as int while other columns become object""" + # GIVEN a dataframe with special columns (ROW_ID, ROW_VERSION, ROW_ID.1) and a regular column + df = pd.DataFrame( + { + "ROW_ID": pd.array([1, 2, 3, 4], dtype="Int64"), + "ROW_VERSION": pd.array([5, 6, 7, 8], dtype="Int64"), + "ROW_ID.1": pd.array([9, 10, 11, 12], dtype="Int64"), + "other_col": [10, 20, 30, 40], # Use regular list without pd.NA + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN all special columns should remain as int while other_col should become object + assert is_integer_dtype(result.ROW_ID), "ROW_ID should remain integer dtype" + assert is_integer_dtype( + result.ROW_VERSION + ), "ROW_VERSION should remain int64 dtype" + assert is_integer_dtype( + result["ROW_ID.1"] + ), "ROW_ID.1 should remain int64 dtype" + assert is_object_dtype(result.other_col), "other_col should become object dtype" + + def test_ellipsis_handling_in_list(self): + """Test that Ellipsis (...) objects in lists are converted to '...' strings""" + # GIVEN a dataframe with Ellipsis in a list + df = pd.DataFrame({"list_with_ellipsis": [[1, 2, ...], [4, ..., 6]]}) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted to "..." 
in JSON string + assert result["list_with_ellipsis"].iloc[0] == '[1, 2, "..."]' + assert result["list_with_ellipsis"].iloc[1] == '[4, "...", 6]' + assert is_object_dtype(result.list_with_ellipsis) + + def test_ellipsis_handling_in_dict(self): + """Test that Ellipsis (...) objects in dicts are converted to '...' strings""" + # GIVEN a dataframe with Ellipsis in a dict + df = pd.DataFrame( + { + "dict_with_ellipsis": [ + {"id": 1, "data": ...}, + {"id": 2, "items": [1, ...]}, + ] + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted to "..." in JSON string + assert result.dict_with_ellipsis.iloc[0] == '{"id": 1, "data": "..."}' + assert result.dict_with_ellipsis.iloc[1] == '{"id": 2, "items": [1, "..."]}' + assert is_object_dtype(result.dict_with_ellipsis) + + def test_standalone_ellipsis(self): + """Test that standalone Ellipsis objects are converted to '...' strings""" + # GIVEN a dataframe with standalone Ellipsis + df = pd.DataFrame({"ellipsis_col": [1, ..., 3]}) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted to "..." + assert result["ellipsis_col"].iloc[0] == 1 + assert result["ellipsis_col"].iloc[1] == "..." 
+ assert result["ellipsis_col"].iloc[2] == 3 + + def test_none_in_list_serialized_to_empty_list(self): + """Test that None values in list columns are serialized to '[]'""" + # GIVEN a dataframe with None in list column + df = pd.DataFrame({"list_col": [[1, 2, 3], pd.NA, [7, 8, 9]]}) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN None should be converted to "[]" + assert result["list_col"].iloc[0] == "[1, 2, 3]" + assert result["list_col"].iloc[1] == None + assert result["list_col"].iloc[2] == "[7, 8, 9]" + + def test_dict_with_quotes_in_values(self): + """Test that dicts with quotes in string values are properly handled""" + # GIVEN a dataframe with dict containing quotes + df = pd.DataFrame( + { + "dict_col": [ + {"description": 'Text with "quotes" here'}, + {"description": 'Another "quoted" text'}, + ] + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN the JSON string should be properly formatted + assert ( + result["dict_col"].iloc[0] + == '{"description": "Text with \\\'quotes\\\' here"}' + ) + assert ( + result["dict_col"].iloc[1] + == '{"description": "Another \\\'quoted\\\' text"}' + ) + assert is_object_dtype(result.dict_col) + + def test_empty_dataframe(self): + """Test that empty dataframe is handled correctly""" + # GIVEN an empty dataframe + df = pd.DataFrame() + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN it should return an empty dataframe + assert len(result) == 0 + assert len(result.columns) == 0 + + def test_mixed_column_types(self): + """Test that multiple column types are handled correctly together""" + # GIVEN a dataframe with mixed column types + df = pd.DataFrame( + { + "ROW_ID": pd.array([1, 2, 3], dtype="Int64"), + "ROW_VERSION": pd.array([1, 1, 1], dtype="Int64"), + "int_col": [10, 20, 30], # Use regular list without pd.NA 
+ "float_col": [1.1, 2.2, 3.3], + "string_col": ["a", "b", "c"], + "list_col": [[1, 2], [3, 4], None], + "dict_col": [{"id": 1}, {"id": 2}, {"id": 3}], + "bool_col": [True, False, True], + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN verify all conversions + pd.testing.assert_frame_equal(result, df) + + def test_nested_dict_with_ellipsis(self): + """Test that nested dicts with Ellipsis are properly handled""" + # GIVEN a dataframe with nested dict containing Ellipsis + df = pd.DataFrame( + { + "nested_dict": [ + {"outer": {"inner": ...}}, + {"data": {"list": [1, 2, ...]}}, + ] + } + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN Ellipsis should be converted in nested structures + assert result["nested_dict"].iloc[0] == '{"outer": {"inner": "..."}}' + assert result["nested_dict"].iloc[1] == '{"data": {"list": [1, 2, "..."]}}' + + def test_nullable_int64_with_pd_na(self): + """Test that Int64 columns with pd.NA get pd.NA converted to '[]' by _serialize_json_value""" + # GIVEN a dataframe with nullable Int64 column containing pd.NA + df = pd.DataFrame( + {"nullable_int_col": pd.array([1, 2, pd.NA, 4, pd.NA], dtype="Int64")} + ) + + # WHEN convert_dtypes_to_json_serializable is called + result = convert_dtypes_to_json_serializable(df) + + # THEN the column should be object type and pd.NA should be converted to "[]" + assert is_object_dtype(result.nullable_int_col) + expected_result = pd.DataFrame( + {"nullable_int_col": [1, 2, None, 4, None]} + ).convert_dtypes() + pd.testing.assert_frame_equal(result, expected_result, check_dtype=False) + assert is_object_dtype(result.nullable_int_col) From 3ecb6ece7454f46392332fcb0dbff2e2345b5877 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 9 Feb 2026 16:44:00 -0800 Subject: [PATCH 04/14] correct unit for datetime64 --- 
tests/unit/synapseclient/mixins/unit_test_table_components.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/synapseclient/mixins/unit_test_table_components.py b/tests/unit/synapseclient/mixins/unit_test_table_components.py index 72b22098f..1d2463a50 100644 --- a/tests/unit/synapseclient/mixins/unit_test_table_components.py +++ b/tests/unit/synapseclient/mixins/unit_test_table_components.py @@ -3726,7 +3726,7 @@ def test_csv_to_pandas_df_with_date_columns(self, csv_with_date_columns): date_columns=["created_date"], ) # THEN assert the date column is converted to datetime - assert str(df["created_date"].dtype) == "datetime64[ns, UTC]" + assert str(df["created_date"].dtype) == "datetime64[ms, UTC]" expected_dates = pd.to_datetime( [1609459200000, 1609545600000, 1609632000000], unit="ms", utc=True From af989c0dbcc1d77dc83e0a31b1c0bddcf70fd243 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 9 Feb 2026 18:08:39 -0800 Subject: [PATCH 05/14] remove the unwanted code --- .../models/mixins/table_components.py | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/synapseclient/models/mixins/table_components.py b/synapseclient/models/mixins/table_components.py index ca7a03717..12baa2871 100644 --- a/synapseclient/models/mixins/table_components.py +++ b/synapseclient/models/mixins/table_components.py @@ -138,18 +138,6 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]: ) -# class EllipsisJSONEncoder(json.JSONEncoder): -# """Custom JSON encoder that handles Ellipsis and pandas NA objects by converting them to strings.""" -# def default(self, obj): -# if obj is ...: -# return "..." 
-# # Handle pandas NA types -# import pandas as pd -# if obj is pd.NA or (hasattr(obj, '__class__') and obj.__class__.__name__ == 'NAType'): -# return None -# return super().default(obj) - - def convert_dtypes_to_json_serializable(df): """ Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types. @@ -207,7 +195,6 @@ def convert_dtypes_to_json_serializable(df): import pandas as pd for col in df.columns: - # Check if any values in the column are lists, dicts, or JSON strings, and serialize them to JSON if df[col].notna().any(): sample_values = df[col].dropna() if len(sample_values): @@ -215,9 +202,8 @@ def convert_dtypes_to_json_serializable(df): def _serialize_json_value(x): if x is None: return None - # Serialize lists and dicts to JSON using custom encoder for Ellipsis handling if isinstance(x, (list, dict)): - # Replace both Ellipsis and pd.NA within nested structures + def _reformat_special_values(obj): if obj is ...: return "..." @@ -237,7 +223,6 @@ def _reformat_special_values(obj): return json.dumps(cleaned_x, ensure_ascii=False).replace( '\\"', "\\'" ) - # Handle standalone ellipsis if x is ...: return "..." 
@@ -247,7 +232,6 @@ def _reformat_special_values(obj): # restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype df[col] = df[col].convert_dtypes() - # convert the int64 and float64 columns to object columns which are JSON serializable types df[col] = df[col].replace({pd.NA: None}).astype(object) # Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION) From 4d06d3a205995617fcb8590eff88d7d969f45b6c Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 10 Feb 2026 12:08:44 -0800 Subject: [PATCH 06/14] revert changes in test_csv_to_pandas_df_with_date_columns --- tests/unit/synapseclient/mixins/unit_test_table_components.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/synapseclient/mixins/unit_test_table_components.py b/tests/unit/synapseclient/mixins/unit_test_table_components.py index 1d2463a50..72b22098f 100644 --- a/tests/unit/synapseclient/mixins/unit_test_table_components.py +++ b/tests/unit/synapseclient/mixins/unit_test_table_components.py @@ -3726,7 +3726,7 @@ def test_csv_to_pandas_df_with_date_columns(self, csv_with_date_columns): date_columns=["created_date"], ) # THEN assert the date column is converted to datetime - assert str(df["created_date"].dtype) == "datetime64[ms, UTC]" + assert str(df["created_date"].dtype) == "datetime64[ns, UTC]" expected_dates = pd.to_datetime( [1609459200000, 1609545600000, 1609632000000], unit="ms", utc=True From e1b20dca5ba8a5c8f2c8a23d2ad73549dde05d75 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Wed, 11 Feb 2026 09:49:20 -0800 Subject: [PATCH 07/14] update doctrings --- .../unit/synapseclient/mixins/unit_test_table_components.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/synapseclient/mixins/unit_test_table_components.py b/tests/unit/synapseclient/mixins/unit_test_table_components.py index 72b22098f..5d148fe71 100644 --- 
a/tests/unit/synapseclient/mixins/unit_test_table_components.py +++ b/tests/unit/synapseclient/mixins/unit_test_table_components.py @@ -4123,7 +4123,7 @@ def test_empty_dataframe(self): assert len(result) == 0 assert len(result.columns) == 0 - def test_mixed_column_types(self): + def test_mixed_column_types_no_conversion_needed(self): """Test that multiple column types are handled correctly together""" # GIVEN a dataframe with mixed column types df = pd.DataFrame( @@ -4165,7 +4165,7 @@ def test_nested_dict_with_ellipsis(self): assert result["nested_dict"].iloc[1] == '{"data": {"list": [1, 2, "..."]}}' def test_nullable_int64_with_pd_na(self): - """Test that Int64 columns with pd.NA get pd.NA converted to '[]' by _serialize_json_value""" + """Test that Int64 columns with pd.NA get pd.NA converted to None by _serialize_json_value""" # GIVEN a dataframe with nullable Int64 column containing pd.NA df = pd.DataFrame( {"nullable_int_col": pd.array([1, 2, pd.NA, 4, pd.NA], dtype="Int64")} @@ -4174,7 +4174,7 @@ def test_nullable_int64_with_pd_na(self): # WHEN convert_dtypes_to_json_serializable is called result = convert_dtypes_to_json_serializable(df) - # THEN the column should be object type and pd.NA should be converted to "[]" + # THEN the column should be object type and pd.NA should be converted to None assert is_object_dtype(result.nullable_int_col) expected_result = pd.DataFrame( {"nullable_int_col": [1, 2, None, 4, None]} From 7ef71107e80e37fdf5b2f81661d3a3a51f9c02e2 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Thu, 12 Feb 2026 11:35:41 -0800 Subject: [PATCH 08/14] add integration test for store_rows --- .../models/async/test_table_async.py | 405 ++++++++++++++++ .../models/synchronous/test_table.py | 453 +++++++++++++++++- 2 files changed, 857 insertions(+), 1 deletion(-) diff --git a/tests/integration/synapseclient/models/async/test_table_async.py b/tests/integration/synapseclient/models/async/test_table_async.py index b2d49e2ca..6ab334514 100644 --- 
a/tests/integration/synapseclient/models/async/test_table_async.py +++ b/tests/integration/synapseclient/models/async/test_table_async.py @@ -10,6 +10,7 @@ import pandas as pd import pytest +from pandas.api.types import is_object_dtype from pytest_mock import MockerFixture import synapseclient.models.mixins.asynchronous_job as asynchronous_job_module @@ -351,6 +352,8 @@ async def test_store_rows_from_csv_infer_columns( "float_string": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -512,6 +515,8 @@ async def test_store_rows_from_manually_defined_columns( "float_column": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -977,6 +982,404 @@ async def test_store_rows_as_large_df_being_split_and_uploaded( # AND The spy should have been called in multiple batches assert spy_send_job.call_count == 1 + async def test_store_rows_with_json_dict_columns_and_quotes( + self, project_model: Project + ) -> None: + """Test that dict columns with quotes in values are properly stored and retrieved""" + # GIVEN a table with a JSON column + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with quotes in JSON values + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + 
{"description": 'Text with "quotes" here', "value": 100}, + {"description": 'Another "quoted" text', "value": 200}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + ], + } + ) + + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and retrieve the data correctly + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + # AND the JSON data should be properly preserved with quotes + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": "Text with 'quotes' here", "value": 100}, + {"description": "Another 'quoted' text", "value": 200}, + {"description": "Multiple 'quoted' 'words' here", "value": 300}, + ], + } + ) + assert is_object_dtype(results.json_data) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["json_data"]) + assert ( + retrieved_data["description"] + == expected_result.loc[idx, "json_data"]["description"] + ) + assert ( + retrieved_data["value"] + == expected_result.loc[idx, "json_data"]["value"] + ) + + async def test_store_rows_with_ellipsis_in_list_columns( + self, project_model: Project + ) -> None: + """Test that Ellipsis (...) in list columns are properly converted to '...' 
strings""" + # GIVEN a table with list columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="string_list", column_type=ColumnType.STRING_LIST), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with Ellipsis in lists + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "string_list": [["a", "b", ...], ["d", ..., "f"], ["g", "h", "i"]], + } + ) + + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and retrieve the data with Ellipsis converted + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + # AND Ellipsis should be converted to "..." string in lists + assert len(results) == 3 + # Note: Synapse returns list columns as JSON strings, need to parse them + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "string_list": [["a", "b", "..."], ["d", "...", "f"], ["g", "h", "i"]], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["string_list"]) + assert retrieved_data == expected_result.loc[idx, "string_list"] + + async def test_store_rows_with_ellipsis_in_json_columns( + self, project_model: Project + ) -> None: + """Test that Ellipsis (...) 
in JSON dict columns are properly converted""" + # GIVEN a table with JSON columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with Ellipsis in dicts + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"id": 1, "data": ...}, + {"id": 2, "items": [1, 2, ...]}, + {"id": 3, "nested": {"inner": ...}}, + ], + } + ) + + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and Ellipsis should be converted to "..." + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + # AND Ellipsis should be converted to "..." string in JSON + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"id": 1, "data": "..."}, + {"id": 2, "items": [1, 2, "..."]}, + {"id": 3, "nested": {"inner": "..."}}, + ], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["json_data"]) + assert retrieved_data == expected_result.loc[idx, "json_data"] + assert is_object_dtype(results.json_data) + + async def test_store_rows_with_standalone_ellipsis( + self, project_model: Project + ) -> None: + """Test that standalone Ellipsis values are converted to '...' 
strings""" + # GIVEN a table with mixed column types + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="string_col", column_type=ColumnType.STRING), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with standalone Ellipsis + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "string_col": ["value1", ..., "value3"], + } + ) + + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and Ellipsis should be converted to "..." + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + # AND standalone Ellipsis should be converted to "..." string + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "string_col": ["value1", "...", "value3"], + } + ) + for idx, row in results.iterrows(): + assert row["string_col"] == expected_result.loc[idx, "string_col"] + assert is_object_dtype(results.string_col) + + async def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> None: + """Test that pd.NA values in list columns are properly handled""" + # GIVEN a table with list columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="int_list", column_type=ColumnType.INTEGER_LIST), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with pd.NA in lists + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "int_list": [[1, 2, 3], pd.NA, [7, 8, 9], None], + } + ) + + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + 
results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + assert len(results) == 4 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "int_list": [[1, 2, 3], [], [7, 8, 9], []], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["int_list"]) + assert retrieved_data == expected_result.loc[idx, "int_list"] + assert is_object_dtype(results.int_list) + + async def test_store_rows_with_nullable_integer_columns( + self, project_model: Project + ) -> None: + """Test that nullable integer columns with pd.NA are properly stored""" + # GIVEN a table with integer columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="nullable_int", column_type=ColumnType.INTEGER), + Column(name="nullable_float", column_type=ColumnType.DOUBLE), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with pd.NA in nullable integer columns + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "nullable_int": pd.array([10, pd.NA, 30, pd.NA]), + "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA]), + } + ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and pd.NA should be converted to None + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + # AND pd.NA should be represented as None/NaN + assert len(results) == 4 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "nullable_int": pd.array([10, None, 30, None]), + "nullable_float": pd.array([1.1, None, 3.3, None]), + } + ) + expected_result = 
expected_result.convert_dtypes() + expected_result = expected_result.replace({pd.NA: None}) + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) + assert is_object_dtype(results.nullable_int) + assert is_object_dtype(results.nullable_float) + + async def test_store_rows_with_json_containing_apostrophes( + self, project_model: Project + ) -> None: + """Test that JSON data with apostrophes is properly stored and retrieved""" + # GIVEN a table with a JSON column + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + ], + ) + table = await table.store_async(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with apostrophes in JSON values + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": "Sage's work", "author": "O'Brien"}, + {"description": "It's a test", "author": "D'Angelo"}, + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, + ], + } + ) + + # WHEN I store the rows + await table.store_rows_async( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and retrieve the data with apostrophes correctly + results = await query_async( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + timeout=QUERY_TIMEOUT_SEC, + ) + + # AND the JSON data should preserve apostrophes + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": "Sage's work", "author": "O'Brien"}, + {"description": "It's a test", "author": "D'Angelo"}, + {"description": "Multiple's apostrophe's", "author": "McDonald's"}, + ], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["json_data"]) + assert retrieved_data == expected_result.loc[idx, 
"json_data"] + assert is_object_dtype(results.json_data) + class TestUpsertRows: @pytest.fixture(autouse=True, scope="function") @@ -1552,6 +1955,7 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None: pd.testing.assert_frame_equal( results_after_insert, expected_results, check_dtype=False ) + # Create a second test file to update references path2 = utils.make_bogus_data_file() self.schedule_for_cleanup(path2) @@ -1734,6 +2138,7 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None: } ) pd.testing.assert_frame_equal(results, expected_results, check_dtype=False) + # WHEN I upsert with multiple primary keys and null values multi_key_data = pd.DataFrame( { diff --git a/tests/integration/synapseclient/models/synchronous/test_table.py b/tests/integration/synapseclient/models/synchronous/test_table.py index d0629b75d..fcbc9230a 100644 --- a/tests/integration/synapseclient/models/synchronous/test_table.py +++ b/tests/integration/synapseclient/models/synchronous/test_table.py @@ -9,6 +9,7 @@ import pandas as pd import pytest +from pandas.api.types import is_object_dtype from pytest_mock import MockerFixture import synapseclient.models.mixins.asynchronous_job as asynchronous_job_module @@ -325,6 +326,8 @@ def test_store_rows_from_csv_infer_columns( "float_string": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, float_format="%.12g") @@ -474,6 +477,8 @@ def test_store_rows_from_manually_defined_columns( "float_column": [1.1, 2.2, 3.3, None], } ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv" self.schedule_for_cleanup(filepath) data_for_table.to_csv(filepath, index=False, 
float_format="%.12g") @@ -927,6 +932,395 @@ def test_store_rows_as_large_df_being_split_and_uploaded( # AND The spy should have been called in multiple batches assert spy_send_job.call_count == 1 + def test_store_rows_with_json_dict_columns_and_quotes( + self, project_model: Project + ) -> None: + """Test that dict columns with quotes in values are properly stored and retrieved""" + # GIVEN a table with a JSON column + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with quotes in JSON values + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": 'Text with "quotes" here', "value": 100}, + {"description": 'Another "quoted" text', "value": 200}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + ], + } + ) + + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and retrieve the data correctly + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + # AND the JSON data should be properly preserved with quotes + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": "Text with 'quotes' here", "value": 100}, + {"description": "Another 'quoted' text", "value": 200}, + {"description": "Multiple 'quoted' 'words' here", "value": 300}, + ], + } + ) + assert is_object_dtype(results.json_data) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["json_data"]) + assert ( + retrieved_data["description"] + == expected_result.loc[idx, "json_data"]["description"] + ) + assert ( + retrieved_data["value"] + == expected_result.loc[idx, "json_data"]["value"] + ) 
+ + def test_store_rows_with_ellipsis_in_list_columns( + self, project_model: Project + ) -> None: + """Test that Ellipsis (...) in list columns are properly converted to '...' strings""" + # GIVEN a table with list columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="string_list", column_type=ColumnType.STRING_LIST), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with Ellipsis in lists + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "string_list": [["a", "b", ...], ["d", ..., "f"], ["g", "h", "i"]], + } + ) + + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and retrieve the data with Ellipsis converted + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + # AND Ellipsis should be converted to "..." string in lists + assert len(results) == 3 + # Note: Synapse returns list columns as JSON strings, need to parse them + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "string_list": [["a", "b", "..."], ["d", "...", "f"], ["g", "h", "i"]], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["string_list"]) + assert retrieved_data == expected_result.loc[idx, "string_list"] + + def test_store_rows_with_ellipsis_in_json_columns( + self, project_model: Project + ) -> None: + """Test that Ellipsis (...) 
in JSON dict columns are properly converted""" + # GIVEN a table with JSON columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with Ellipsis in dicts + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"id": 1, "data": ...}, + {"id": 2, "items": [1, 2, ...]}, + {"id": 3, "nested": {"inner": ...}}, + ], + } + ) + + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and Ellipsis should be converted to "..." + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + # AND Ellipsis should be converted to "..." string in JSON + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"id": 1, "data": "..."}, + {"id": 2, "items": [1, 2, "..."]}, + {"id": 3, "nested": {"inner": "..."}}, + ], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["json_data"]) + assert retrieved_data == expected_result.loc[idx, "json_data"] + assert is_object_dtype(results.json_data) + + def test_store_rows_with_standalone_ellipsis(self, project_model: Project) -> None: + """Test that standalone Ellipsis values are converted to '...' 
strings""" + # GIVEN a table with mixed column types + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="string_col", column_type=ColumnType.STRING), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with standalone Ellipsis + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "string_col": ["value1", ..., "value3"], + } + ) + + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and Ellipsis should be converted to "..." + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + # AND standalone Ellipsis should be converted to "..." string + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "string_col": ["value1", "...", "value3"], + } + ) + for idx, row in results.iterrows(): + assert row["string_col"] == expected_result.loc[idx, "string_col"] + assert is_object_dtype(results.string_col) + + def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> None: + """Test that pd.NA values in list columns are properly handled""" + # GIVEN a table with list columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="int_list", column_type=ColumnType.INTEGER_LIST), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with pd.NA in lists + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "int_list": [[1, 2, 3], pd.NA, [7, 8, 9], None], + } + ) + + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + assert 
len(results) == 4 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "int_list": [[1, 2, 3], [], [7, 8, 9], []], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["int_list"]) + assert retrieved_data == expected_result.loc[idx, "int_list"] + assert is_object_dtype(results.int_list) + + def test_store_rows_with_nullable_integer_columns( + self, project_model: Project + ) -> None: + """Test that nullable integer columns with pd.NA are properly stored""" + # GIVEN a table with integer columns + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="nullable_int", column_type=ColumnType.INTEGER), + Column(name="nullable_float", column_type=ColumnType.DOUBLE), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with pd.NA in nullable integer columns + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "nullable_int": pd.array([10, pd.NA, 30, pd.NA]), + "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA]), + } + ) + data_for_table = data_for_table.convert_dtypes() + data_for_table = data_for_table.replace({pd.NA: None}) + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and pd.NA should be converted to None + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + # AND pd.NA should be represented as None/NaN + assert len(results) == 4 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3, 4], + "nullable_int": pd.array([10, None, 30, None]), + "nullable_float": pd.array([1.1, None, 3.3, None]), + } + ) + expected_result = expected_result.convert_dtypes() + expected_result = expected_result.replace({pd.NA: None}) + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + 
) + assert is_object_dtype(results.nullable_int) + assert is_object_dtype(results.nullable_float) + + def test_store_rows_with_json_containing_apostrophes( + self, project_model: Project + ) -> None: + """Test that JSON data with apostrophes is properly stored and retrieved""" + # GIVEN a table with a JSON column + table_name = str(uuid.uuid4()) + table = Table( + name=table_name, + parent_id=project_model.id, + columns=[ + Column(name="id", column_type=ColumnType.INTEGER), + Column(name="json_data", column_type=ColumnType.JSON), + ], + ) + table = table.store(synapse_client=self.syn) + self.schedule_for_cleanup(table.id) + + # AND data with apostrophes in JSON values + data_for_table = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": "Sage's work", "author": "O'Brien"}, + {"description": "It's a test", "author": "D'Angelo"}, + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, + ], + } + ) + + # WHEN I store the rows + table.store_rows( + values=data_for_table, + synapse_client=self.syn, + ) + + # THEN I can query the table and retrieve the data with apostrophes correctly + results = query( + f"SELECT * FROM {table.id}", + synapse_client=self.syn, + ) + + # AND the JSON data should preserve apostrophes + assert len(results) == 3 + expected_result = pd.DataFrame( + { + "id": [1, 2, 3], + "json_data": [ + {"description": "Sage's work", "author": "O'Brien"}, + {"description": "It's a test", "author": "D'Angelo"}, + {"description": "Multiple's apostrophe's", "author": "McDonald's"}, + ], + } + ) + for idx, row in results.iterrows(): + retrieved_data = json.loads(row["json_data"]) + assert retrieved_data == expected_result.loc[idx, "json_data"] + assert is_object_dtype(results.json_data) + class TestUpsertRows: @pytest.fixture(autouse=True, scope="function") @@ -1488,9 +1882,66 @@ def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + expected_results = expected_results.convert_dtypes() + 
expected_results = expected_results.replace({pd.NA: None}) pd.testing.assert_frame_equal( - results_after_insert, expected_results, check_dtype=False + results_after_insert.drop( + columns=[ + "column_string_LIST", + "column_integer_LIST", + "column_boolean_LIST", + "column_date_LIST", + "column_entity_id_list", + "column_user_id_list", + "column_json", + ] + ), + expected_results.drop( + columns=[ + "column_string_LIST", + "column_integer_LIST", + "column_boolean_LIST", + "column_date_LIST", + "column_entity_id_list", + "column_user_id_list", + "column_json", + ] + ), + check_dtype=False, ) + for idx, row in results_after_insert.iterrows(): + retrieved_data_string_list = json.loads(row["column_string_LIST"]) + assert ( + retrieved_data_string_list + == expected_results.loc[idx, "column_string_LIST"] + ) + retrieved_data_integer_list = json.loads(row["column_integer_LIST"]) + assert ( + retrieved_data_integer_list + == expected_results.loc[idx, "column_integer_LIST"] + ) + retrieved_data_boolean_list = json.loads(row["column_boolean_LIST"]) + assert ( + retrieved_data_boolean_list + == expected_results.loc[idx, "column_boolean_LIST"] + ) + retrieved_data_date_list = json.loads(row["column_date_LIST"]) + assert ( + retrieved_data_date_list + == expected_results.loc[idx, "column_date_LIST"] + ) + retrieved_data_entity_id_list = json.loads(row["column_entity_id_list"]) + assert ( + retrieved_data_entity_id_list + == expected_results.loc[idx, "column_entity_id_list"] + ) + retrieved_data_user_id_list = json.loads(row["column_user_id_list"]) + assert ( + retrieved_data_user_id_list + == expected_results.loc[idx, "column_user_id_list"] + ) + retrieved_data_json = json.loads(row["column_json"]) + assert retrieved_data_json == expected_results.loc[idx, "column_json"] # Create a second test file to update references path2 = utils.make_bogus_data_file() From a4913a6e01d5145fdd48cb59c9d8a09eb6f0c419 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 16 Feb 2026 14:00:30 -0800 
Subject: [PATCH 09/14] add to_csv kwargs to ensure double quotes and apostrophes are formatted correctly when uploading data from a dataframe
From a0af1b6c85d572844541d2bac6de67dbdaa7fe14 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 16 Feb 2026 17:24:03 -0800 Subject: [PATCH 11/14] update unit test since the convert_dtypes_to_json_serializable no longer output json string --- .../mixins/unit_test_table_components.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/tests/unit/synapseclient/mixins/unit_test_table_components.py b/tests/unit/synapseclient/mixins/unit_test_table_components.py index 5d148fe71..0db8ecddd 100644 --- a/tests/unit/synapseclient/mixins/unit_test_table_components.py +++ b/tests/unit/synapseclient/mixins/unit_test_table_components.py @@ -1,3 +1,4 @@ +import json import os import re from collections import OrderedDict @@ -4035,8 +4036,8 @@ def test_ellipsis_handling_in_list(self): result = convert_dtypes_to_json_serializable(df) # THEN Ellipsis should be converted to "..." in JSON string - assert result["list_with_ellipsis"].iloc[0] == '[1, 2, "..."]' - assert result["list_with_ellipsis"].iloc[1] == '[4, "...", 6]' + assert result["list_with_ellipsis"].iloc[0] == [1, 2, "..."] + assert result["list_with_ellipsis"].iloc[1] == [4, "...", 6] assert is_object_dtype(result.list_with_ellipsis) def test_ellipsis_handling_in_dict(self): @@ -4055,8 +4056,8 @@ def test_ellipsis_handling_in_dict(self): result = convert_dtypes_to_json_serializable(df) # THEN Ellipsis should be converted to "..." 
in JSON string - assert result.dict_with_ellipsis.iloc[0] == '{"id": 1, "data": "..."}' - assert result.dict_with_ellipsis.iloc[1] == '{"id": 2, "items": [1, "..."]}' + assert result.dict_with_ellipsis.iloc[0] == {"id": 1, "data": "..."} + assert result.dict_with_ellipsis.iloc[1] == {"id": 2, "items": [1, "..."]} assert is_object_dtype(result.dict_with_ellipsis) def test_standalone_ellipsis(self): @@ -4081,9 +4082,9 @@ def test_none_in_list_serialized_to_empty_list(self): result = convert_dtypes_to_json_serializable(df) # THEN None should be converted to "[]" - assert result["list_col"].iloc[0] == "[1, 2, 3]" + assert result["list_col"].iloc[0] == [1, 2, 3] assert result["list_col"].iloc[1] == None - assert result["list_col"].iloc[2] == "[7, 8, 9]" + assert result["list_col"].iloc[2] == [7, 8, 9] def test_dict_with_quotes_in_values(self): """Test that dicts with quotes in string values are properly handled""" @@ -4101,14 +4102,8 @@ def test_dict_with_quotes_in_values(self): result = convert_dtypes_to_json_serializable(df) # THEN the JSON string should be properly formatted - assert ( - result["dict_col"].iloc[0] - == '{"description": "Text with \\\'quotes\\\' here"}' - ) - assert ( - result["dict_col"].iloc[1] - == '{"description": "Another \\\'quoted\\\' text"}' - ) + assert result["dict_col"].iloc[0] == {"description": 'Text with "quotes" here'} + assert result["dict_col"].iloc[1] == {"description": 'Another "quoted" text'} assert is_object_dtype(result.dict_col) def test_empty_dataframe(self): @@ -4161,8 +4156,8 @@ def test_nested_dict_with_ellipsis(self): result = convert_dtypes_to_json_serializable(df) # THEN Ellipsis should be converted in nested structures - assert result["nested_dict"].iloc[0] == '{"outer": {"inner": "..."}}' - assert result["nested_dict"].iloc[1] == '{"data": {"list": [1, 2, "..."]}}' + assert result["nested_dict"].iloc[0] == {"outer": {"inner": "..."}} + assert result["nested_dict"].iloc[1] == {"data": {"list": [1, 2, "..."]}} def 
test_nullable_int64_with_pd_na(self): """Test that Int64 columns with pd.NA get pd.NA converted to None by _serialize_json_value""" From 5002bd620529259765ba0b431efdfa049f3d91ba Mon Sep 17 00:00:00 2001 From: danlu1 Date: Tue, 17 Feb 2026 23:00:24 -0800 Subject: [PATCH 12/14] update integration test as no json string need to be generated --- .../models/async/test_table_async.py | 66 +++++----- .../models/synchronous/test_table.py | 113 +++++------------- 2 files changed, 71 insertions(+), 108 deletions(-) diff --git a/tests/integration/synapseclient/models/async/test_table_async.py b/tests/integration/synapseclient/models/async/test_table_async.py index 6ab334514..0fc6d87a8 100644 --- a/tests/integration/synapseclient/models/async/test_table_async.py +++ b/tests/integration/synapseclient/models/async/test_table_async.py @@ -1033,23 +1033,18 @@ async def test_store_rows_with_json_dict_columns_and_quotes( { "id": [1, 2, 3], "json_data": [ - {"description": "Text with 'quotes' here", "value": 100}, - {"description": "Another 'quoted' text", "value": 200}, - {"description": "Multiple 'quoted' 'words' here", "value": 300}, + {"description": 'Text with "quotes" here', "value": 100}, + {"description": 'Another "quoted" text', "value": 200}, + {"description": 'Multiple "quoted" "words" here', "value": 300}, ], } ) assert is_object_dtype(results.json_data) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["json_data"]) - assert ( - retrieved_data["description"] - == expected_result.loc[idx, "json_data"]["description"] - ) - assert ( - retrieved_data["value"] - == expected_result.loc[idx, "json_data"]["value"] - ) + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) async def test_store_rows_with_ellipsis_in_list_columns( self, project_model: Project @@ -1098,9 +1093,11 @@ async def test_store_rows_with_ellipsis_in_list_columns( "string_list": [["a", "b", "..."], ["d", "...", 
"f"], ["g", "h", "i"]], } ) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["string_list"]) - assert retrieved_data == expected_result.loc[idx, "string_list"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) async def test_store_rows_with_ellipsis_in_json_columns( self, project_model: Project @@ -1156,9 +1153,11 @@ async def test_store_rows_with_ellipsis_in_json_columns( ], } ) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["json_data"]) - assert retrieved_data == expected_result.loc[idx, "json_data"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.json_data) async def test_store_rows_with_standalone_ellipsis( @@ -1207,8 +1206,11 @@ async def test_store_rows_with_standalone_ellipsis( "string_col": ["value1", "...", "value3"], } ) - for idx, row in results.iterrows(): - assert row["string_col"] == expected_result.loc[idx, "string_col"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.string_col) async def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> None: @@ -1253,9 +1255,11 @@ async def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> N "int_list": [[1, 2, 3], [], [7, 8, 9], []], } ) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["int_list"]) - assert retrieved_data == expected_result.loc[idx, "int_list"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.int_list) async def test_store_rows_with_nullable_integer_columns( @@ -1375,9 +1379,11 @@ async def test_store_rows_with_json_containing_apostrophes( ], } ) - for idx, row in results.iterrows(): - 
retrieved_data = json.loads(row["json_data"]) - assert retrieved_data == expected_result.loc[idx, "json_data"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.json_data) @@ -1952,6 +1958,10 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) + # import pdb; pdb.set_trace() pd.testing.assert_frame_equal( results_after_insert, expected_results, check_dtype=False ) @@ -2137,6 +2147,8 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) pd.testing.assert_frame_equal(results, expected_results, check_dtype=False) # WHEN I upsert with multiple primary keys and null values diff --git a/tests/integration/synapseclient/models/synchronous/test_table.py b/tests/integration/synapseclient/models/synchronous/test_table.py index fcbc9230a..e7526e800 100644 --- a/tests/integration/synapseclient/models/synchronous/test_table.py +++ b/tests/integration/synapseclient/models/synchronous/test_table.py @@ -982,23 +982,18 @@ def test_store_rows_with_json_dict_columns_and_quotes( { "id": [1, 2, 3], "json_data": [ - {"description": "Text with 'quotes' here", "value": 100}, - {"description": "Another 'quoted' text", "value": 200}, - {"description": "Multiple 'quoted' 'words' here", "value": 300}, + {"description": 'Text with "quotes" here', "value": 100}, + {"description": 'Another "quoted" text', "value": 200}, + {"description": 'Multiple "quoted" "words" here', "value": 300}, ], } ) assert is_object_dtype(results.json_data) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["json_data"]) - assert ( - retrieved_data["description"] - == expected_result.loc[idx, 
"json_data"]["description"] - ) - assert ( - retrieved_data["value"] - == expected_result.loc[idx, "json_data"]["value"] - ) + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) def test_store_rows_with_ellipsis_in_list_columns( self, project_model: Project @@ -1046,9 +1041,11 @@ def test_store_rows_with_ellipsis_in_list_columns( "string_list": [["a", "b", "..."], ["d", "...", "f"], ["g", "h", "i"]], } ) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["string_list"]) - assert retrieved_data == expected_result.loc[idx, "string_list"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) def test_store_rows_with_ellipsis_in_json_columns( self, project_model: Project @@ -1103,9 +1100,11 @@ def test_store_rows_with_ellipsis_in_json_columns( ], } ) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["json_data"]) - assert retrieved_data == expected_result.loc[idx, "json_data"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.json_data) def test_store_rows_with_standalone_ellipsis(self, project_model: Project) -> None: @@ -1196,9 +1195,11 @@ def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> None: "int_list": [[1, 2, 3], [], [7, 8, 9], []], } ) - for idx, row in results.iterrows(): - retrieved_data = json.loads(row["int_list"]) - assert retrieved_data == expected_result.loc[idx, "int_list"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.int_list) def test_store_rows_with_nullable_integer_columns( @@ -1316,9 +1317,11 @@ def test_store_rows_with_json_containing_apostrophes( ], } ) - for idx, row in results.iterrows(): - retrieved_data = 
json.loads(row["json_data"]) - assert retrieved_data == expected_result.loc[idx, "json_data"] + pd.testing.assert_frame_equal( + results.drop(columns=["ROW_ID", "ROW_VERSION"]), + expected_result, + check_dtype=False, + ) assert is_object_dtype(results.json_data) @@ -1884,64 +1887,10 @@ def test_upsert_all_data_types(self, project_model: Project) -> None: ) expected_results = expected_results.convert_dtypes() expected_results = expected_results.replace({pd.NA: None}) + # import pdb; pdb.set_trace() pd.testing.assert_frame_equal( - results_after_insert.drop( - columns=[ - "column_string_LIST", - "column_integer_LIST", - "column_boolean_LIST", - "column_date_LIST", - "column_entity_id_list", - "column_user_id_list", - "column_json", - ] - ), - expected_results.drop( - columns=[ - "column_string_LIST", - "column_integer_LIST", - "column_boolean_LIST", - "column_date_LIST", - "column_entity_id_list", - "column_user_id_list", - "column_json", - ] - ), - check_dtype=False, + results_after_insert, expected_results, check_dtype=False ) - for idx, row in results_after_insert.iterrows(): - retrieved_data_string_list = json.loads(row["column_string_LIST"]) - assert ( - retrieved_data_string_list - == expected_results.loc[idx, "column_string_LIST"] - ) - retrieved_data_integer_list = json.loads(row["column_integer_LIST"]) - assert ( - retrieved_data_integer_list - == expected_results.loc[idx, "column_integer_LIST"] - ) - retrieved_data_boolean_list = json.loads(row["column_boolean_LIST"]) - assert ( - retrieved_data_boolean_list - == expected_results.loc[idx, "column_boolean_LIST"] - ) - retrieved_data_date_list = json.loads(row["column_date_LIST"]) - assert ( - retrieved_data_date_list - == expected_results.loc[idx, "column_date_LIST"] - ) - retrieved_data_entity_id_list = json.loads(row["column_entity_id_list"]) - assert ( - retrieved_data_entity_id_list - == expected_results.loc[idx, "column_entity_id_list"] - ) - retrieved_data_user_id_list = 
json.loads(row["column_user_id_list"]) - assert ( - retrieved_data_user_id_list - == expected_results.loc[idx, "column_user_id_list"] - ) - retrieved_data_json = json.loads(row["column_json"]) - assert retrieved_data_json == expected_results.loc[idx, "column_json"] # Create a second test file to update references path2 = utils.make_bogus_data_file() @@ -2126,6 +2075,8 @@ def test_upsert_all_data_types(self, project_model: Project) -> None: ], } ) + expected_results = expected_results.convert_dtypes() + expected_results = expected_results.replace({pd.NA: None}) pd.testing.assert_frame_equal(results, expected_results, check_dtype=False) # WHEN I upsert with multiple primary keys and null values From c874fe46689f32b819e5823e3fe216a920659480 Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 23 Feb 2026 16:09:30 -0800 Subject: [PATCH 13/14] remvoe unwanted code --- .../models/mixins/table_components.py | 70 ++++++++----------- 1 file changed, 31 insertions(+), 39 deletions(-) diff --git a/synapseclient/models/mixins/table_components.py b/synapseclient/models/mixins/table_components.py index 5f18a6cba..8e5083005 100644 --- a/synapseclient/models/mixins/table_components.py +++ b/synapseclient/models/mixins/table_components.py @@ -141,7 +141,6 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]: def convert_dtypes_to_json_serializable(df): """ Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types. - Convert the list and dict columns to JSON strings which are JSON serializable types. Replace both Ellipsis and pandas NA within nested structures which are not JSON serializable types. Also, convert the ROW_ID, ROW_VERSION, and ROW_ID.1 columns to int columns which are JSON serializable types. 
Arguments: @@ -195,43 +194,37 @@ def convert_dtypes_to_json_serializable(df): import pandas as pd for col in df.columns: - if df[col].notna().any(): - sample_values = df[col].dropna() - if len(sample_values): - - def _serialize_json_value(x): - if x is None: - return None - if isinstance(x, (list, dict)): - - def _reformat_special_values(obj): - if obj is ...: - return "..." - # Handle pandas NA - check type name to avoid array ambiguity - if obj is pd.NA: - return None - if isinstance(obj, dict): - return { - k: _reformat_special_values(v) - for k, v in obj.items() - } - if isinstance(obj, list): - return [_reformat_special_values(item) for item in obj] - return obj - - cleaned_x = _reformat_special_values(x) - # return json.dumps(cleaned_x, ensure_ascii=False) - return cleaned_x - # Handle standalone ellipsis - if x is ...: - return "..." - return x - - df[col] = df[col].apply(lambda x: _serialize_json_value(x)) - - # restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype - df[col] = df[col].convert_dtypes() - df[col] = df[col].replace({pd.NA: None}).astype(object) + sample_values = df[col].dropna() + if len(sample_values): + + def _serialize_json_value(x): + if isinstance(x, (list, dict)): + + def _reformat_special_values(obj): + if obj is ...: + return "..." + if obj is pd.NA: + return None + if isinstance(obj, dict): + return { + k: _reformat_special_values(v) for k, v in obj.items() + } + if isinstance(obj, list): + return [_reformat_special_values(item) for item in obj] + return obj + + cleaned_x = _reformat_special_values(x) + return cleaned_x + # Handle standalone ellipsis + if x is ...: + return "..." 
+ return x + + df[col] = df[col].apply(lambda x: _serialize_json_value(x)) + + # restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype + df[col] = df[col].convert_dtypes() + df[col] = df[col].replace({pd.NA: None}).astype(object) # Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION) if col in [ @@ -2869,7 +2862,6 @@ async def main(): timeout=timeout, synapse_client=synapse_client, ) - if download_location: return csv_path From dab80f06bf7c899b9c3acb320e5cb605aeb5d3df Mon Sep 17 00:00:00 2001 From: danlu1 Date: Mon, 23 Feb 2026 16:10:16 -0800 Subject: [PATCH 14/14] simplify test cases --- .../models/async/test_table_async.py | 461 +++++------------- .../models/synchronous/test_table.py | 450 +++++------------ 2 files changed, 230 insertions(+), 681 deletions(-) diff --git a/tests/integration/synapseclient/models/async/test_table_async.py b/tests/integration/synapseclient/models/async/test_table_async.py index 0fc6d87a8..432d8e9bf 100644 --- a/tests/integration/synapseclient/models/async/test_table_async.py +++ b/tests/integration/synapseclient/models/async/test_table_async.py @@ -982,10 +982,10 @@ async def test_store_rows_as_large_df_being_split_and_uploaded( # AND The spy should have been called in multiple batches assert spy_send_job.call_count == 1 - async def test_store_rows_with_json_dict_columns_and_quotes( + async def test_store_rows_with_quotes_and_apostrophes_ellipses( self, project_model: Project ) -> None: - """Test that dict columns with quotes in values are properly stored and retrieved""" + """Test columns with quotes, apostrophes, and ellipses (in lists, dicts, and standalone) in values are properly stored and retrieved in the tables""" # GIVEN a table with a JSON column table_name = str(uuid.uuid4()) table = Table( @@ -994,6 +994,13 @@ async def test_store_rows_with_json_dict_columns_and_quotes( columns=[ Column(name="id", column_type=ColumnType.INTEGER), 
Column(name="json_data", column_type=ColumnType.JSON), + Column( + name="string_list_with_ellipses", column_type=ColumnType.STRING_LIST + ), + Column(name="string_col_with_ellipses", column_type=ColumnType.STRING), + Column(name="int_list_with_pa_na", column_type=ColumnType.INTEGER_LIST), + Column(name="nullable_int", column_type=ColumnType.INTEGER), + Column(name="nullable_float", column_type=ColumnType.DOUBLE), ], ) table = await table.store_async(synapse_client=self.syn) @@ -1002,389 +1009,151 @@ async def test_store_rows_with_json_dict_columns_and_quotes( # AND data with quotes in JSON values data_for_table = pd.DataFrame( { - "id": [1, 2, 3], + "id": [1, 2, 3, 4, 5, 6, 7], "json_data": [ {"description": 'Text with "quotes" here', "value": 100}, - {"description": 'Another "quoted" text', "value": 200}, { "description": 'Multiple "quoted" "words" here', "value": 300, }, + { + "description": ..., + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, ...], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": ...}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value + ], + "string_list_with_ellipses": [ + ["a", "b", ...], + ["d", ..., "f"], + ["g", "h", "i"], + [...], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], ], + "string_col_with_ellipses": [ + "value1", + ..., + "value3", + ..., + "value6", + ..., + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + pd.NA, + [7, 8, 9], + pd.NA, + [11, 12, 13], + pd.NA, + [15, 16, 17], + ], + "nullable_int": pd.array([10, pd.NA, 30, pd.NA, 31, pd.NA, 32]), + "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA, 3.4, pd.NA, 3.5]), } ) - # WHEN I store the rows await table.store_rows_async( 
values=data_for_table, synapse_client=self.syn, ) - # THEN I can query the table and retrieve the data correctly results = await query_async( f"SELECT * FROM {table.id}", synapse_client=self.syn, timeout=QUERY_TIMEOUT_SEC, ) - # AND the JSON data should be properly preserved with quotes - assert len(results) == 3 + assert len(results) == 7 expected_result = pd.DataFrame( { - "id": [1, 2, 3], + "id": [1, 2, 3, 4, 5, 6, 7], "json_data": [ {"description": 'Text with "quotes" here', "value": 100}, - {"description": 'Another "quoted" text', "value": 200}, - {"description": 'Multiple "quoted" "words" here', "value": 300}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + { + "description": "...", + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, "..."], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": "..."}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value ], - } - ) - assert is_object_dtype(results.json_data) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - - async def test_store_rows_with_ellipsis_in_list_columns( - self, project_model: Project - ) -> None: - """Test that Ellipsis (...) in list columns are properly converted to '...' 
strings""" - # GIVEN a table with list columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="string_list", column_type=ColumnType.STRING_LIST), - ], - ) - table = await table.store_async(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with Ellipsis in lists - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "string_list": [["a", "b", ...], ["d", ..., "f"], ["g", "h", "i"]], - } - ) - - # WHEN I store the rows - await table.store_rows_async( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and retrieve the data with Ellipsis converted - results = await query_async( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - timeout=QUERY_TIMEOUT_SEC, - ) - - # AND Ellipsis should be converted to "..." string in lists - assert len(results) == 3 - # Note: Synapse returns list columns as JSON strings, need to parse them - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "string_list": [["a", "b", "..."], ["d", "...", "f"], ["g", "h", "i"]], - } - ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - - async def test_store_rows_with_ellipsis_in_json_columns( - self, project_model: Project - ) -> None: - """Test that Ellipsis (...) 
in JSON dict columns are properly converted""" - # GIVEN a table with JSON columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="json_data", column_type=ColumnType.JSON), - ], - ) - table = await table.store_async(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with Ellipsis in dicts - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"id": 1, "data": ...}, - {"id": 2, "items": [1, 2, ...]}, - {"id": 3, "nested": {"inner": ...}}, + "string_list_with_ellipses": [ + ["a", "b", "..."], + ["d", "...", "f"], + ["g", "h", "i"], + ["..."], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], ], - } - ) - - # WHEN I store the rows - await table.store_rows_async( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and Ellipsis should be converted to "..." - results = await query_async( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - timeout=QUERY_TIMEOUT_SEC, - ) - - # AND Ellipsis should be converted to "..." 
string in JSON - assert len(results) == 3 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"id": 1, "data": "..."}, - {"id": 2, "items": [1, 2, "..."]}, - {"id": 3, "nested": {"inner": "..."}}, + "string_col_with_ellipses": [ + "value1", + "...", + "value3", + "...", + "value6", + "...", + "value8", ], + "int_list_with_pa_na": [ + [1, 2, 3], + [], + [7, 8, 9], + [], + [11, 12, 13], + [], + [15, 16, 17], + ], + "nullable_int": pd.array([10, None, 30, None, 31, None, 32]), + "nullable_float": pd.array([1.1, None, 3.3, None, 3.4, None, 3.5]), } ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) assert is_object_dtype(results.json_data) - - async def test_store_rows_with_standalone_ellipsis( - self, project_model: Project - ) -> None: - """Test that standalone Ellipsis values are converted to '...' strings""" - # GIVEN a table with mixed column types - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="string_col", column_type=ColumnType.STRING), - ], - ) - table = await table.store_async(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with standalone Ellipsis - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "string_col": ["value1", ..., "value3"], - } - ) - - # WHEN I store the rows - await table.store_rows_async( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and Ellipsis should be converted to "..." - results = await query_async( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - timeout=QUERY_TIMEOUT_SEC, - ) - - # AND standalone Ellipsis should be converted to "..." 
string - assert len(results) == 3 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "string_col": ["value1", "...", "value3"], - } - ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - assert is_object_dtype(results.string_col) - - async def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> None: - """Test that pd.NA values in list columns are properly handled""" - # GIVEN a table with list columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="int_list", column_type=ColumnType.INTEGER_LIST), - ], - ) - table = await table.store_async(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with pd.NA in lists - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "int_list": [[1, 2, 3], pd.NA, [7, 8, 9], None], - } - ) - - # WHEN I store the rows - await table.store_rows_async( - values=data_for_table, - synapse_client=self.syn, - ) - - results = await query_async( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - timeout=QUERY_TIMEOUT_SEC, - ) - - assert len(results) == 4 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "int_list": [[1, 2, 3], [], [7, 8, 9], []], - } - ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - assert is_object_dtype(results.int_list) - - async def test_store_rows_with_nullable_integer_columns( - self, project_model: Project - ) -> None: - """Test that nullable integer columns with pd.NA are properly stored""" - # GIVEN a table with integer columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="nullable_int", column_type=ColumnType.INTEGER), - 
Column(name="nullable_float", column_type=ColumnType.DOUBLE), - ], - ) - table = await table.store_async(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with pd.NA in nullable integer columns - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "nullable_int": pd.array([10, pd.NA, 30, pd.NA]), - "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA]), - } - ) - data_for_table = data_for_table.convert_dtypes() - data_for_table = data_for_table.replace({pd.NA: None}) - # WHEN I store the rows - await table.store_rows_async( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and pd.NA should be converted to None - results = await query_async( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - timeout=QUERY_TIMEOUT_SEC, - ) - - # AND pd.NA should be represented as None/NaN - assert len(results) == 4 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "nullable_int": pd.array([10, None, 30, None]), - "nullable_float": pd.array([1.1, None, 3.3, None]), - } - ) - expected_result = expected_result.convert_dtypes() - expected_result = expected_result.replace({pd.NA: None}) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) + assert is_object_dtype(results.int_list_with_pa_na) assert is_object_dtype(results.nullable_int) assert is_object_dtype(results.nullable_float) - async def test_store_rows_with_json_containing_apostrophes( - self, project_model: Project - ) -> None: - """Test that JSON data with apostrophes is properly stored and retrieved""" - # GIVEN a table with a JSON column - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="json_data", column_type=ColumnType.JSON), - ], - ) - table = await table.store_async(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - 
# AND data with apostrophes in JSON values - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"description": "Sage's work", "author": "O'Brien"}, - {"description": "It's a test", "author": "D'Angelo"}, - { - "description": "Multiple's apostrophe's", - "author": "McDonald's", - }, - ], - } - ) - - # WHEN I store the rows - await table.store_rows_async( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and retrieve the data with apostrophes correctly - results = await query_async( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - timeout=QUERY_TIMEOUT_SEC, - ) - - # AND the JSON data should preserve apostrophes - assert len(results) == 3 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"description": "Sage's work", "author": "O'Brien"}, - {"description": "It's a test", "author": "D'Angelo"}, - {"description": "Multiple's apostrophe's", "author": "McDonald's"}, - ], - } - ) + expected_result = expected_result.convert_dtypes() + expected_result = expected_result.replace({pd.NA: None}) pd.testing.assert_frame_equal( results.drop(columns=["ROW_ID", "ROW_VERSION"]), expected_result, check_dtype=False, ) - assert is_object_dtype(results.json_data) class TestUpsertRows: diff --git a/tests/integration/synapseclient/models/synchronous/test_table.py b/tests/integration/synapseclient/models/synchronous/test_table.py index e7526e800..6178de523 100644 --- a/tests/integration/synapseclient/models/synchronous/test_table.py +++ b/tests/integration/synapseclient/models/synchronous/test_table.py @@ -932,10 +932,10 @@ def test_store_rows_as_large_df_being_split_and_uploaded( # AND The spy should have been called in multiple batches assert spy_send_job.call_count == 1 - def test_store_rows_with_json_dict_columns_and_quotes( + def test_store_rows_with_quotes_and_apostrophes_ellipses( self, project_model: Project ) -> None: - """Test that dict columns with quotes in values are properly 
stored and retrieved""" + """Test columns with quotes, apostrophes, and ellipses (in lists, dicts, and standalone) in values are properly stored and retrieved in the tables""" # GIVEN a table with a JSON column table_name = str(uuid.uuid4()) table = Table( @@ -944,6 +944,13 @@ def test_store_rows_with_json_dict_columns_and_quotes( columns=[ Column(name="id", column_type=ColumnType.INTEGER), Column(name="json_data", column_type=ColumnType.JSON), + Column( + name="string_list_with_ellipses", column_type=ColumnType.STRING_LIST + ), + Column(name="string_col_with_ellipses", column_type=ColumnType.STRING), + Column(name="int_list_with_pa_na", column_type=ColumnType.INTEGER_LIST), + Column(name="nullable_int", column_type=ColumnType.INTEGER), + Column(name="nullable_float", column_type=ColumnType.DOUBLE), ], ) table = table.store(synapse_client=self.syn) @@ -952,377 +959,150 @@ def test_store_rows_with_json_dict_columns_and_quotes( # AND data with quotes in JSON values data_for_table = pd.DataFrame( { - "id": [1, 2, 3], + "id": [1, 2, 3, 4, 5, 6, 7], "json_data": [ {"description": 'Text with "quotes" here', "value": 100}, - {"description": 'Another "quoted" text', "value": 200}, { "description": 'Multiple "quoted" "words" here', "value": 300, }, + { + "description": ..., + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, ...], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": ...}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value + ], + "string_list_with_ellipses": [ + ["a", "b", ...], + ["d", ..., "f"], + ["g", "h", "i"], + [...], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], ], + "string_col_with_ellipses": [ + "value1", + ..., + "value3", 
+ ..., + "value6", + ..., + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + pd.NA, + [7, 8, 9], + pd.NA, + [11, 12, 13], + pd.NA, + [15, 16, 17], + ], + "nullable_int": pd.array([10, pd.NA, 30, pd.NA, 31, pd.NA, 32]), + "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA, 3.4, pd.NA, 3.5]), } ) - # WHEN I store the rows table.store_rows( values=data_for_table, synapse_client=self.syn, ) - # THEN I can query the table and retrieve the data correctly results = query( f"SELECT * FROM {table.id}", synapse_client=self.syn, ) - # AND the JSON data should be properly preserved with quotes - assert len(results) == 3 + assert len(results) == 7 expected_result = pd.DataFrame( { - "id": [1, 2, 3], + "id": [1, 2, 3, 4, 5, 6, 7], "json_data": [ {"description": 'Text with "quotes" here', "value": 100}, - {"description": 'Another "quoted" text', "value": 200}, - {"description": 'Multiple "quoted" "words" here', "value": 300}, + { + "description": 'Multiple "quoted" "words" here', + "value": 300, + }, + { + "description": "...", + "value": 200, + }, # standalone ellipses in the json value + { + "description": [1, 2, "..."], + "value": 400, + }, # list with ellipses in the json value + { + "description": {"inner": "..."}, + "value": 500, + }, # dict with ellipses in the json value + { + "description": "single apostrophe's", + "author": "D'Angelo", + }, # single apostrophe in the json value + { + "description": "Multiple's apostrophe's", + "author": "McDonald's", + }, # multiple apostrophe's in the json value ], - } - ) - assert is_object_dtype(results.json_data) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - - def test_store_rows_with_ellipsis_in_list_columns( - self, project_model: Project - ) -> None: - """Test that Ellipsis (...) in list columns are properly converted to '...' 
strings""" - # GIVEN a table with list columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="string_list", column_type=ColumnType.STRING_LIST), - ], - ) - table = table.store(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with Ellipsis in lists - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "string_list": [["a", "b", ...], ["d", ..., "f"], ["g", "h", "i"]], - } - ) - - # WHEN I store the rows - table.store_rows( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and retrieve the data with Ellipsis converted - results = query( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - ) - - # AND Ellipsis should be converted to "..." string in lists - assert len(results) == 3 - # Note: Synapse returns list columns as JSON strings, need to parse them - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "string_list": [["a", "b", "..."], ["d", "...", "f"], ["g", "h", "i"]], - } - ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - - def test_store_rows_with_ellipsis_in_json_columns( - self, project_model: Project - ) -> None: - """Test that Ellipsis (...) 
in JSON dict columns are properly converted""" - # GIVEN a table with JSON columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="json_data", column_type=ColumnType.JSON), - ], - ) - table = table.store(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with Ellipsis in dicts - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"id": 1, "data": ...}, - {"id": 2, "items": [1, 2, ...]}, - {"id": 3, "nested": {"inner": ...}}, + "string_list_with_ellipses": [ + ["a", "b", "..."], + ["d", "...", "f"], + ["g", "h", "i"], + ["..."], + ["m", "n", "..."], + ["p", "q", "r"], + ["s", "t", "u"], ], - } - ) - - # WHEN I store the rows - table.store_rows( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and Ellipsis should be converted to "..." - results = query( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - ) - - # AND Ellipsis should be converted to "..." string in JSON - assert len(results) == 3 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"id": 1, "data": "..."}, - {"id": 2, "items": [1, 2, "..."]}, - {"id": 3, "nested": {"inner": "..."}}, + "string_col_with_ellipses": [ + "value1", + "...", + "value3", + "...", + "value6", + "...", + "value8", + ], + "int_list_with_pa_na": [ + [1, 2, 3], + [], + [7, 8, 9], + [], + [11, 12, 13], + [], + [15, 16, 17], ], + "nullable_int": pd.array([10, None, 30, None, 31, None, 32]), + "nullable_float": pd.array([1.1, None, 3.3, None, 3.4, None, 3.5]), } ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) assert is_object_dtype(results.json_data) - - def test_store_rows_with_standalone_ellipsis(self, project_model: Project) -> None: - """Test that standalone Ellipsis values are converted to '...' 
strings""" - # GIVEN a table with mixed column types - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="string_col", column_type=ColumnType.STRING), - ], - ) - table = table.store(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with standalone Ellipsis - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "string_col": ["value1", ..., "value3"], - } - ) - - # WHEN I store the rows - table.store_rows( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and Ellipsis should be converted to "..." - results = query( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - ) - - # AND standalone Ellipsis should be converted to "..." string - assert len(results) == 3 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "string_col": ["value1", "...", "value3"], - } - ) - for idx, row in results.iterrows(): - assert row["string_col"] == expected_result.loc[idx, "string_col"] - assert is_object_dtype(results.string_col) - - def test_store_rows_with_pd_na_in_lists(self, project_model: Project) -> None: - """Test that pd.NA values in list columns are properly handled""" - # GIVEN a table with list columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="int_list", column_type=ColumnType.INTEGER_LIST), - ], - ) - table = table.store(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with pd.NA in lists - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "int_list": [[1, 2, 3], pd.NA, [7, 8, 9], None], - } - ) - - # WHEN I store the rows - table.store_rows( - values=data_for_table, - synapse_client=self.syn, - ) - - results = query( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - ) - - assert 
len(results) == 4 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "int_list": [[1, 2, 3], [], [7, 8, 9], []], - } - ) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) - assert is_object_dtype(results.int_list) - - def test_store_rows_with_nullable_integer_columns( - self, project_model: Project - ) -> None: - """Test that nullable integer columns with pd.NA are properly stored""" - # GIVEN a table with integer columns - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="nullable_int", column_type=ColumnType.INTEGER), - Column(name="nullable_float", column_type=ColumnType.DOUBLE), - ], - ) - table = table.store(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with pd.NA in nullable integer columns - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "nullable_int": pd.array([10, pd.NA, 30, pd.NA]), - "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA]), - } - ) - data_for_table = data_for_table.convert_dtypes() - data_for_table = data_for_table.replace({pd.NA: None}) - # WHEN I store the rows - table.store_rows( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and pd.NA should be converted to None - results = query( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - ) - - # AND pd.NA should be represented as None/NaN - assert len(results) == 4 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3, 4], - "nullable_int": pd.array([10, None, 30, None]), - "nullable_float": pd.array([1.1, None, 3.3, None]), - } - ) - expected_result = expected_result.convert_dtypes() - expected_result = expected_result.replace({pd.NA: None}) - pd.testing.assert_frame_equal( - results.drop(columns=["ROW_ID", "ROW_VERSION"]), - expected_result, - check_dtype=False, - ) + assert 
is_object_dtype(results.int_list_with_pa_na) assert is_object_dtype(results.nullable_int) assert is_object_dtype(results.nullable_float) - def test_store_rows_with_json_containing_apostrophes( - self, project_model: Project - ) -> None: - """Test that JSON data with apostrophes is properly stored and retrieved""" - # GIVEN a table with a JSON column - table_name = str(uuid.uuid4()) - table = Table( - name=table_name, - parent_id=project_model.id, - columns=[ - Column(name="id", column_type=ColumnType.INTEGER), - Column(name="json_data", column_type=ColumnType.JSON), - ], - ) - table = table.store(synapse_client=self.syn) - self.schedule_for_cleanup(table.id) - - # AND data with apostrophes in JSON values - data_for_table = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"description": "Sage's work", "author": "O'Brien"}, - {"description": "It's a test", "author": "D'Angelo"}, - { - "description": "Multiple's apostrophe's", - "author": "McDonald's", - }, - ], - } - ) - - # WHEN I store the rows - table.store_rows( - values=data_for_table, - synapse_client=self.syn, - ) - - # THEN I can query the table and retrieve the data with apostrophes correctly - results = query( - f"SELECT * FROM {table.id}", - synapse_client=self.syn, - ) - - # AND the JSON data should preserve apostrophes - assert len(results) == 3 - expected_result = pd.DataFrame( - { - "id": [1, 2, 3], - "json_data": [ - {"description": "Sage's work", "author": "O'Brien"}, - {"description": "It's a test", "author": "D'Angelo"}, - {"description": "Multiple's apostrophe's", "author": "McDonald's"}, - ], - } - ) + expected_result = expected_result.convert_dtypes() + expected_result = expected_result.replace({pd.NA: None}) pd.testing.assert_frame_equal( results.drop(columns=["ROW_ID", "ROW_VERSION"]), expected_result, check_dtype=False, ) - assert is_object_dtype(results.json_data) class TestUpsertRows: