Skip to content

Commit 595046a

Browse files
authored
fix(sql_execution): Fix is large number check to use 2**53 as cutoff (#73)
* fix(sql_execution): Fix is large number check to use 2**53 as cutoff
* feat(pandas): Add functionality to cast large numbers to strings for JSON compatibility
  - Introduced `cast_large_numbers_to_string` function to convert numeric values exceeding the float64 safe integer range (2**53) to strings, preserving precision for JSON serialization.
  - Updated `PandasImplementation.to_json` method to utilize the new function.
  - Added unit tests to ensure correct behavior for large numbers in dataframes.
* refactor(pandas): Improve readability of large number checks in utils.py
  - Reformatted the `is_large_number` and `cast_large_numbers_to_string` functions for better readability by using multi-line expressions.
  - Updated unit tests to call the `is_large_number` function directly instead of through a different module, ensuring consistency and clarity in test cases.
* Remove unused import
* Add type hints to cast_large_numbers_to_string
  Add explicit pd.DataFrame input and return type annotations to cast_large_numbers_to_string so callers and static type checkers (mypy) recognise the typed signature.
* Add type hint Any to is_large_number parameter
  Import typing.Any and annotate the x parameter of is_large_number so static type checkers (mypy) accept the function signature.
1 parent 2df9d55 commit 595046a

6 files changed

Lines changed: 211 additions & 26 deletions

File tree

deepnote_toolkit/ocelots/pandas/implementation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from .analyze import analyze_columns
1515
from .utils import (
16+
cast_large_numbers_to_string,
1617
cast_objects_to_string,
1718
deduplicate_columns,
1819
fill_nat,
@@ -303,6 +304,7 @@ def to_records(self, mode: Literal["json", "python"]) -> List[Dict[str, Any]]:
303304
if mode == "json":
304305
fill_nat(df_copy, "NaT")
305306
cast_objects_to_string(df_copy)
307+
cast_large_numbers_to_string(df_copy)
306308
return df_copy.to_dict("records")
307309

308310
def to_csv(self, path_or_buf: Union[str, TextIO]) -> None:

deepnote_toolkit/ocelots/pandas/utils.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
from decimal import Decimal
2+
from typing import Any
3+
14
import numpy as np
25
import pandas as pd
36
from packaging.requirements import Requirement
@@ -104,6 +107,42 @@ def to_string_truncated(elem):
104107
return df
105108

106109

110+
# Largest magnitude up to which float64 represents every integer exactly.
MAX_SAFE_FLOAT64_INTEGER = 2**53


def is_large_number(x: Any) -> bool:
    """Return True if *x* is a numeric value that would lose precision as float64.

    float64 can represent integers exactly only up to 2**53, so any
    numeric value whose absolute value exceeds that threshold is
    considered "large" and should be converted to a string.

    Args:
        x: Any object. Only ``int``, ``float``, ``Decimal``, ``np.integer``
            and ``np.floating`` instances can be reported as large.

    Returns:
        ``True`` when the magnitude of *x* is strictly greater than
        ``MAX_SAFE_FLOAT64_INTEGER``; ``False`` for everything else,
        including NaN values and non-numeric objects. The result is always
        a plain Python ``bool``, never ``np.bool_``.
    """
    if not isinstance(x, (int, float, Decimal, np.integer, np.floating)):
        return False
    try:
        # Use two one-sided comparisons instead of ``abs(x) > limit``:
        # ``abs`` of the most negative fixed-width NumPy integer (e.g.
        # ``np.int64(-2**63)``) overflows back to a negative number, which
        # would wrongly classify that value as "not large".
        # The positive side is checked first so that large unsigned values
        # are recognised before the negative bound is ever compared.
        return bool(
            x > MAX_SAFE_FLOAT64_INTEGER or x < -MAX_SAFE_FLOAT64_INTEGER
        )
    except (TypeError, OverflowError, ArithmeticError):
        # Decimal NaN / sNaN raise InvalidOperation (an ArithmeticError) on
        # ordering comparisons; such values are not large numbers.
        return False
127+
128+
129+
def cast_large_numbers_to_string(df: pd.DataFrame) -> pd.DataFrame:
    """Stringify every numeric column that holds a value beyond the float64-safe range.

    JavaScript's JSON.parse() reads all numbers as float64, which can only
    represent integers exactly up to 2**53. When any element of a column
    is flagged by ``is_large_number``, the whole column is replaced by the
    element-wise result of ``safe_convert_to_string`` so the exact value
    is preserved.

    The dataframe is modified in place; the same object is returned.
    """
    for column in df:
        series = df[column]
        # Only columns whose dtype passes is_pure_numeric are candidates.
        if not is_pure_numeric(series.dtype):
            continue
        has_large_value = series.apply(is_large_number).any()
        if has_large_value:
            df[column] = series.apply(safe_convert_to_string)
    return df
144+
145+
107146
def is_type_datetime_or_timedelta(series_or_dtype):
108147
"""
109148
Returns True if the series or dtype is datetime or timedelta, False otherwise.

deepnote_toolkit/sql/sql_execution.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import uuid
66
import warnings
77
import weakref
8-
from decimal import Decimal
98
from typing import TYPE_CHECKING, Any, Optional
109
from urllib.parse import quote
1110

@@ -31,7 +30,7 @@
3130
)
3231
from deepnote_toolkit.ipython_utils import output_sql_metadata
3332
from deepnote_toolkit.logging import LoggerManager
34-
from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns
33+
from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns, is_large_number
3534
from deepnote_toolkit.sql.duckdb_sql import execute_duckdb_sql
3635
from deepnote_toolkit.sql.jinjasql_utils import render_jinja_sql_template
3736
from deepnote_toolkit.sql.query_preview import DeepnoteQueryPreview
@@ -687,14 +686,6 @@ class BigQueryCredentialsError(Exception):
687686
return {"connect_args": {"client": client}}
688687

689688

690-
def _is_large_number(x: Any) -> bool:
691-
"""Return True if *x* is a numeric value that exceeds the int64 range"""
692-
try:
693-
return isinstance(x, (int, float, Decimal)) and abs(x) > 2**63 - 1
694-
except (TypeError, OverflowError, ArithmeticError):
695-
return False
696-
697-
698689
def _sanitize_dataframe_for_parquet(dataframe):
699690
"""Sanitizes the dataframe so that we can safely call .to_parquet on it"""
700691

@@ -714,9 +705,11 @@ def _sanitize_dataframe_for_parquet(dataframe):
714705
):
715706
dataframe[column] = dataframe[column].astype(str)
716707

717-
# Convert columns with large numbers to strings
708+
# Convert columns with large numbers to strings to preserve precision.
709+
# float64 can only represent integers exactly up to 2**53; values
710+
# above that threshold are converted to strings.
718711
for column in dataframe.columns:
719-
if dataframe[column].apply(_is_large_number).any():
712+
if dataframe[column].apply(is_large_number).any():
720713
dataframe[column] = dataframe[column].astype(str)
721714

722715

tests/unit/helpers/testing_dataframes.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,17 @@ def create_dataframe_with_duplicate_column_names():
113113
"col1": [2**53],
114114
}
115115
),
116+
"large_numbers_above_threshold": pd.DataFrame(
117+
data={
118+
"col1": [2**53 + 1, 42, 2**53 + 100],
119+
}
120+
),
121+
"large_numbers_mixed_columns": pd.DataFrame(
122+
data={
123+
"safe_col": [1, 2, 3],
124+
"large_col": [2**53 + 1, 2**53 + 2, 2**53 + 3],
125+
}
126+
),
116127
"infinity": pd.DataFrame(
117128
data={
118129
"col1": [0, np.inf, -np.inf],

tests/unit/test_dataframe_utils.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import unittest
33
from unittest.mock import MagicMock
44

5+
import numpy as np
56
from ipykernel.jsonutil import json_clean
67

78
from deepnote_toolkit.dataframe_utils import _describe_dataframe, add_formatters
@@ -188,6 +189,34 @@ def test_large_numbers(self):
188189
self.assertEqual(result["columns"][0]["stats"]["min"], str(2**53))
189190
self.assertEqual(result["columns"][0]["stats"]["max"], str(2**53))
190191

192+
def test_large_numbers_above_threshold_are_strings_in_rows(self):
    """Every row value of a column containing integers above 2**53 is emitted as a string."""
    frame = testing_dataframes["large_numbers_above_threshold"]
    described = describe_and_json_clean(frame)
    self.assertEqual(described["row_count"], 3)

    rows = described["rows"]
    for entry in rows:
        self.assertIsInstance(entry["col1"], str)

    # The whole column is stringified, including the small value 42.
    expected_values = [str(2**53 + 1), "42", str(2**53 + 100)]
    for position, expected in enumerate(expected_values):
        self.assertEqual(rows[position]["col1"], expected)
202+
203+
def test_large_numbers_mixed_columns_only_affects_large_column(self):
    """A column with values above 2**53 becomes strings while a safe sibling column stays int."""
    described = describe_and_json_clean(
        testing_dataframes["large_numbers_mixed_columns"]
    )
    self.assertEqual(described["row_count"], 3)

    rows = described["rows"]
    for entry in rows:
        self.assertIsInstance(entry["safe_col"], int)
        self.assertIsInstance(entry["large_col"], str)
    self.assertEqual(rows[0]["large_col"], str(2**53 + 1))
212+
213+
def test_large_numbers_at_boundary_stay_numeric(self):
    """A value of exactly 2**53 is still exact in float64 and must remain an integer."""
    integer_types = (int, np.integer)
    rows = describe_and_json_clean(testing_dataframes["large_numbers"])["rows"]
    for entry in rows:
        self.assertIsInstance(entry["col1"], integer_types)
219+
191220
def test_infinity(self):
192221
df = testing_dataframes["infinity"]
193222
result = describe_and_json_clean(df)

tests/unit/test_sql_execution_internal.py

Lines changed: 125 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def test_sanitize_dataframe_for_parquet_decimal_large_numbers():
218218

219219

220220
def test_sanitize_dataframe_for_parquet_decimal_small_numbers():
221-
"""Decimal values within int64 range should not be converted."""
221+
"""Decimal values within float64 exact range should not be converted."""
222222
from decimal import Decimal
223223

224224
data = pd.DataFrame(
@@ -246,19 +246,130 @@ def test_sanitize_dataframe_for_parquet_decimal_nan():
246246
def test_is_large_number():
247247
from decimal import Decimal
248248

249-
assert se._is_large_number(2**63) is True
250-
assert se._is_large_number(-(2**63) - 1) is True
251-
assert se._is_large_number(2**63 - 1) is False
252-
assert se._is_large_number(42) is False
253-
assert se._is_large_number(float("inf")) is True
254-
assert se._is_large_number(float("nan")) is False
255-
assert se._is_large_number(Decimal("1e40")) is True
256-
assert se._is_large_number(Decimal("100")) is False
257-
assert se._is_large_number(Decimal("NaN")) is False
258-
assert se._is_large_number(Decimal("sNaN")) is False
259-
assert se._is_large_number(Decimal("Infinity")) is True
260-
assert se._is_large_number("not a number") is False
261-
assert se._is_large_number(None) is False
249+
from deepnote_toolkit.ocelots.pandas.utils import is_large_number
250+
251+
# 2**53 boundary: float64 can represent integers exactly up to 2**53
252+
assert is_large_number(2**53) is False
253+
assert is_large_number(2**53 + 1) is True
254+
assert is_large_number(-(2**53)) is False
255+
assert is_large_number(-(2**53) - 1) is True
256+
257+
# Small integers should not trigger
258+
assert is_large_number(0) is False
259+
assert is_large_number(1) is False
260+
assert is_large_number(-1) is False
261+
assert is_large_number(42) is False
262+
263+
# Large ints well beyond 2**53 should trigger
264+
assert is_large_number(2**63 - 1) is True
265+
assert is_large_number(2**63) is True
266+
assert is_large_number(10**18) is True
267+
268+
# Floats
269+
assert is_large_number(float("inf")) is True
270+
assert is_large_number(float("nan")) is False
271+
assert is_large_number(1.0) is False
272+
273+
# Decimals
274+
assert is_large_number(Decimal("1e40")) is True
275+
assert is_large_number(Decimal("9007199254740994")) is True
276+
assert is_large_number(Decimal("100")) is False
277+
assert is_large_number(Decimal("NaN")) is False
278+
assert is_large_number(Decimal("sNaN")) is False
279+
assert is_large_number(Decimal("Infinity")) is True
280+
281+
# Non-numeric types should not trigger
282+
assert is_large_number("not a number") is False
283+
assert is_large_number(None) is False
284+
285+
286+
def test_sanitize_dataframe_for_parquet_large_int_precision_loss():
    """A column holding an integer above 2**53 is stringified; a small-int column is untouched."""
    exact_limit = 2**53  # 9007199254740992
    beyond_limit = 2**53 + 1  # 9007199254740993

    frame = pd.DataFrame(
        {
            "lossy": [beyond_limit, exact_limit],
            "safe": [42, 100],
        }
    )
    se._sanitize_dataframe_for_parquet(frame)

    lossy = frame["lossy"]
    assert lossy.dtype == object
    assert lossy.iloc[0] == str(beyond_limit)
    # The boundary value shares the column, so it is stringified as well.
    assert lossy.iloc[1] == str(exact_limit)
    assert pd.api.types.is_integer_dtype(frame["safe"])
302+
303+
304+
def test_sanitize_dataframe_for_parquet_large_int_negative():
    """Integers below -2**53 trigger the string conversion just like positive ones."""
    below_limit = -(2**53) - 1
    frame = pd.DataFrame({"neg": [below_limit, 0]})

    se._sanitize_dataframe_for_parquet(frame)

    column = frame["neg"]
    assert column.dtype == object
    assert column.iloc[0] == str(below_limit)
314+
315+
316+
def test_sanitize_dataframe_for_parquet_int_at_boundary():
    """Values of exactly +/-2**53 are still exact in float64, so the column stays integer."""
    limit = 2**53
    frame = pd.DataFrame({"boundary": [limit, -limit]})

    se._sanitize_dataframe_for_parquet(frame)

    assert pd.api.types.is_integer_dtype(frame["boundary"])
325+
326+
327+
def test_sanitize_dataframe_for_parquet_mixed_int_with_none():
    """An object column mixing None with an integer above 2**53 is converted to strings."""
    values = pd.array([2**53 + 1, None, 42], dtype=object)
    frame = pd.DataFrame({"mixed": values})

    se._sanitize_dataframe_for_parquet(frame)

    column = frame["mixed"]
    assert column.dtype == object
    assert column.iloc[0] == str(2**53 + 1)
337+
338+
339+
def test_sanitize_dataframe_for_parquet_decimal_int_precision_loss():
    """A Decimal column with an integer-valued entry above 2**53 is converted to strings."""
    from decimal import Decimal

    big = Decimal("9007199254740993")
    frame = pd.DataFrame({"d": [big, Decimal("42")]})

    se._sanitize_dataframe_for_parquet(frame)

    column = frame["d"]
    assert column.dtype == object
    assert column.iloc[0] == str(big)
351+
352+
353+
def test_sanitize_dataframe_for_parquet_precision_loss_preserves_value():
    """The stringified value keeps the exact digits that float64 would round away."""
    exact = 9007199254740993
    # Sanity check: as float64, 2**53 + 1 collapses onto 2**53.
    assert float(exact) == float(9007199254740992)

    frame = pd.DataFrame({"x": [exact]})
    se._sanitize_dataframe_for_parquet(frame)

    # The string form still carries the trailing ...993.
    assert frame["x"].iloc[0] == "9007199254740993"
361+
362+
363+
def test_sanitize_dataframe_for_parquet_very_large_int():
    """Integers far past the threshold (here 2**64) are converted to strings too."""
    huge_value = 2**64
    frame = pd.DataFrame({"huge": [huge_value, 42]})

    se._sanitize_dataframe_for_parquet(frame)

    column = frame["huge"]
    assert column.dtype == object
    assert column.iloc[0] == str(huge_value)
262373

263374

264375
def test_create_sql_ssh_uri_no_ssh():

0 commit comments

Comments
 (0)