fix(sql_execution): Fix is large number check to use 2**53 as cutoff (#73)

tkislan · web-flow · commit 595046a235c3 · 2026-03-11T11:10:01.000+01:00
* fix(sql_execution): Fix large-number check to use 2**53 as cutoff

* feat(pandas): Add functionality to cast large numbers to strings for JSON compatibility

- Introduced `cast_large_numbers_to_string` function to convert numeric columns containing values beyond the float64 safe integer range (magnitude above 2**53) to strings, preserving precision for JSON serialization.
- Updated `PandasImplementation.to_records` method (in `json` mode) to utilize the new function.
- Added unit tests to ensure correct behavior for large numbers in dataframes.

* refactor(pandas): Improve readability of large number checks in utils.py

- Reformatted the `is_large_number` and `cast_large_numbers_to_string` functions for better readability by using multi-line expressions.
- Updated unit tests to call the `is_large_number` function directly instead of through a different module, ensuring consistency and clarity in test cases.

* Remove unused import

* Add type hints to cast_large_numbers_to_string

Add explicit pd.DataFrame input and return type annotations to
cast_large_numbers_to_string so callers and static type checkers
(mypy) recognise the typed signature.

* Add type hint Any to is_large_number parameter

Import typing.Any and annotate the x parameter of is_large_number
so static type checkers (mypy) accept the function signature.
diff --git a/deepnote_toolkit/ocelots/pandas/implementation.py b/deepnote_toolkit/ocelots/pandas/implementation.py
@@ -13,6 +13,7 @@
 
 from .analyze import analyze_columns
 from .utils import (
+    cast_large_numbers_to_string,
     cast_objects_to_string,
     deduplicate_columns,
     fill_nat,
@@ -303,6 +304,7 @@ def to_records(self, mode: Literal["json", "python"]) -> List[Dict[str, Any]]:
         if mode == "json":
             fill_nat(df_copy, "NaT")
             cast_objects_to_string(df_copy)
+            cast_large_numbers_to_string(df_copy)
         return df_copy.to_dict("records")
 
     def to_csv(self, path_or_buf: Union[str, TextIO]) -> None:
diff --git a/deepnote_toolkit/ocelots/pandas/utils.py b/deepnote_toolkit/ocelots/pandas/utils.py
@@ -1,3 +1,6 @@
+from decimal import Decimal
+from typing import Any
+
 import numpy as np
 import pandas as pd
 from packaging.requirements import Requirement
@@ -104,6 +107,42 @@ def to_string_truncated(elem):
     return df
 
 
+MAX_SAFE_FLOAT64_INTEGER = 2**53
+
+
+def is_large_number(x: Any) -> bool:
+    """Return True if *x* is a number whose magnitude is strictly above 2**53.
+
+    float64 represents integers exactly only up to 2**53, so such values are
+    "large" and should become strings. NaN and non-numeric values give False.
+    """
+    try:
+        return (
+            isinstance(x, (int, float, Decimal, np.integer, np.floating))
+            and abs(x) > MAX_SAFE_FLOAT64_INTEGER
+        )
+    except (TypeError, OverflowError, ArithmeticError):
+        # e.g. ordering comparisons on a Decimal NaN raise InvalidOperation
+        return False
+
+
+def cast_large_numbers_to_string(df: pd.DataFrame) -> pd.DataFrame:
+    """Convert, in place, numeric columns holding values beyond 2**53 to strings.
+
+    JavaScript's JSON.parse() reads all numbers as float64, which can only
+    represent integers exactly up to 2**53. When any value in a column is
+    above that threshold in magnitude, the entire column is converted to
+    strings to preserve exact values. The same (mutated) *df* is returned.
+    """
+    for column in df:
+        if (
+            is_pure_numeric(df[column].dtype)
+            and df[column].apply(is_large_number).any()
+        ):
+            df[column] = df[column].apply(safe_convert_to_string)
+    return df
+
+
 def is_type_datetime_or_timedelta(series_or_dtype):
     """
     Returns True if the series or dtype is datetime or timedelta, False otherwise.
diff --git a/deepnote_toolkit/sql/sql_execution.py b/deepnote_toolkit/sql/sql_execution.py
@@ -5,7 +5,6 @@
 import uuid
 import warnings
 import weakref
-from decimal import Decimal
 from typing import TYPE_CHECKING, Any, Optional
 from urllib.parse import quote
 
@@ -31,7 +30,7 @@
 )
 from deepnote_toolkit.ipython_utils import output_sql_metadata
 from deepnote_toolkit.logging import LoggerManager
-from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns
+from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns, is_large_number
 from deepnote_toolkit.sql.duckdb_sql import execute_duckdb_sql
 from deepnote_toolkit.sql.jinjasql_utils import render_jinja_sql_template
 from deepnote_toolkit.sql.query_preview import DeepnoteQueryPreview
@@ -687,14 +686,6 @@ class BigQueryCredentialsError(Exception):
     return {"connect_args": {"client": client}}
 
 
-def _is_large_number(x: Any) -> bool:
-    """Return True if *x* is a numeric value that exceeds the int64 range"""
-    try:
-        return isinstance(x, (int, float, Decimal)) and abs(x) > 2**63 - 1
-    except (TypeError, OverflowError, ArithmeticError):
-        return False
-
-
 def _sanitize_dataframe_for_parquet(dataframe):
     """Sanitizes the dataframe so that we can safely call .to_parquet on it"""
 
@@ -714,9 +705,11 @@ def _sanitize_dataframe_for_parquet(dataframe):
         ):
             dataframe[column] = dataframe[column].astype(str)
 
-    # Convert columns with large numbers to strings
+    # Convert columns with large numbers to strings to preserve precision.
+    # float64 can only represent integers exactly up to 2**53; any column
+    # containing a value above that threshold is converted to strings.
     for column in dataframe.columns:
-        if dataframe[column].apply(_is_large_number).any():
+        if dataframe[column].apply(is_large_number).any():
             dataframe[column] = dataframe[column].astype(str)
 
 
diff --git a/tests/unit/helpers/testing_dataframes.py b/tests/unit/helpers/testing_dataframes.py
@@ -113,6 +113,17 @@ def create_dataframe_with_duplicate_column_names():
             "col1": [2**53],
         }
     ),
+    "large_numbers_above_threshold": pd.DataFrame(
+        data={
+            "col1": [2**53 + 1, 42, 2**53 + 100],
+        }
+    ),
+    "large_numbers_mixed_columns": pd.DataFrame(
+        data={
+            "safe_col": [1, 2, 3],
+            "large_col": [2**53 + 1, 2**53 + 2, 2**53 + 3],
+        }
+    ),
     "infinity": pd.DataFrame(
         data={
             "col1": [0, np.inf, -np.inf],
diff --git a/tests/unit/test_dataframe_utils.py b/tests/unit/test_dataframe_utils.py
@@ -2,6 +2,7 @@
 import unittest
 from unittest.mock import MagicMock
 
+import numpy as np
 from ipykernel.jsonutil import json_clean
 
 from deepnote_toolkit.dataframe_utils import _describe_dataframe, add_formatters
@@ -188,6 +189,34 @@ def test_large_numbers(self):
         self.assertEqual(result["columns"][0]["stats"]["min"], str(2**53))
         self.assertEqual(result["columns"][0]["stats"]["max"], str(2**53))
 
+    def test_large_numbers_above_threshold_are_strings_in_rows(self):
+        """Integers above 2**53 must be converted to strings in rows to preserve precision."""
+        df = testing_dataframes["large_numbers_above_threshold"]
+        result = describe_and_json_clean(df)
+        self.assertEqual(result["row_count"], 3)
+        for row in result["rows"]:
+            self.assertIsInstance(row["col1"], str)
+        self.assertEqual(result["rows"][0]["col1"], str(2**53 + 1))
+        self.assertEqual(result["rows"][1]["col1"], "42")
+        self.assertEqual(result["rows"][2]["col1"], str(2**53 + 100))
+
+    def test_large_numbers_mixed_columns_only_affects_large_column(self):
+        """Only columns containing values above 2**53 should be converted; safe columns stay numeric."""
+        df = testing_dataframes["large_numbers_mixed_columns"]
+        result = describe_and_json_clean(df)
+        self.assertEqual(result["row_count"], 3)
+        for row in result["rows"]:
+            self.assertIsInstance(row["safe_col"], int)
+            self.assertIsInstance(row["large_col"], str)
+        self.assertEqual(result["rows"][0]["large_col"], str(2**53 + 1))
+
+    def test_large_numbers_at_boundary_stay_numeric(self):
+        """Integers exactly at 2**53 should remain as numbers (still exact in float64)."""
+        df = testing_dataframes["large_numbers"]
+        result = describe_and_json_clean(df)
+        for row in result["rows"]:
+            self.assertIsInstance(row["col1"], (int, np.integer))
+
     def test_infinity(self):
         df = testing_dataframes["infinity"]
         result = describe_and_json_clean(df)
diff --git a/tests/unit/test_sql_execution_internal.py b/tests/unit/test_sql_execution_internal.py
@@ -218,7 +218,7 @@ def test_sanitize_dataframe_for_parquet_decimal_large_numbers():
 
 
 def test_sanitize_dataframe_for_parquet_decimal_small_numbers():
-    """Decimal values within int64 range should not be converted."""
+    """Decimal values within float64 exact range should not be converted."""
     from decimal import Decimal
 
     data = pd.DataFrame(
@@ -246,19 +246,130 @@ def test_sanitize_dataframe_for_parquet_decimal_nan():
 def test_is_large_number():
     from decimal import Decimal
 
-    assert se._is_large_number(2**63) is True
-    assert se._is_large_number(-(2**63) - 1) is True
-    assert se._is_large_number(2**63 - 1) is False
-    assert se._is_large_number(42) is False
-    assert se._is_large_number(float("inf")) is True
-    assert se._is_large_number(float("nan")) is False
-    assert se._is_large_number(Decimal("1e40")) is True
-    assert se._is_large_number(Decimal("100")) is False
-    assert se._is_large_number(Decimal("NaN")) is False
-    assert se._is_large_number(Decimal("sNaN")) is False
-    assert se._is_large_number(Decimal("Infinity")) is True
-    assert se._is_large_number("not a number") is False
-    assert se._is_large_number(None) is False
+    from deepnote_toolkit.ocelots.pandas.utils import is_large_number
+
+    # 2**53 boundary: float64 can represent integers exactly up to 2**53
+    assert is_large_number(2**53) is False
+    assert is_large_number(2**53 + 1) is True
+    assert is_large_number(-(2**53)) is False
+    assert is_large_number(-(2**53) - 1) is True
+
+    # Small integers should not trigger
+    assert is_large_number(0) is False
+    assert is_large_number(1) is False
+    assert is_large_number(-1) is False
+    assert is_large_number(42) is False
+
+    # Large ints well beyond 2**53 should trigger
+    assert is_large_number(2**63 - 1) is True
+    assert is_large_number(2**63) is True
+    assert is_large_number(10**18) is True
+
+    # Floats
+    assert is_large_number(float("inf")) is True
+    assert is_large_number(float("nan")) is False
+    assert is_large_number(1.0) is False
+
+    # Decimals
+    assert is_large_number(Decimal("1e40")) is True
+    assert is_large_number(Decimal("9007199254740994")) is True
+    assert is_large_number(Decimal("100")) is False
+    assert is_large_number(Decimal("NaN")) is False
+    assert is_large_number(Decimal("sNaN")) is False
+    assert is_large_number(Decimal("Infinity")) is True
+
+    # Non-numeric types should not trigger
+    assert is_large_number("not a number") is False
+    assert is_large_number(None) is False
+
+
+def test_sanitize_dataframe_for_parquet_large_int_precision_loss():
+    """Integers above 2**53 must be converted to strings to preserve precision."""
+    val_above = 2**53 + 1  # 9007199254740993
+    val_exact = 2**53  # 9007199254740992
+
+    data = pd.DataFrame(
+        {
+            "lossy": [val_above, val_exact],
+            "safe": [42, 100],
+        }
+    )
+    se._sanitize_dataframe_for_parquet(data)
+    assert data["lossy"].dtype == object
+    assert data["lossy"].iloc[0] == str(val_above)
+    assert data["lossy"].iloc[1] == str(val_exact)
+    assert pd.api.types.is_integer_dtype(data["safe"])
+
+
+def test_sanitize_dataframe_for_parquet_large_int_negative():
+    """Negative integers beyond -2**53 must also be converted."""
+    data = pd.DataFrame(
+        {
+            "neg": [-(2**53) - 1, 0],
+        }
+    )
+    se._sanitize_dataframe_for_parquet(data)
+    assert data["neg"].dtype == object
+    assert data["neg"].iloc[0] == str(-(2**53) - 1)
+
+
+def test_sanitize_dataframe_for_parquet_int_at_boundary():
+    """Integers exactly at 2**53 should not be converted (still exact in float64)."""
+    data = pd.DataFrame(
+        {
+            "boundary": [2**53, -(2**53)],
+        }
+    )
+    se._sanitize_dataframe_for_parquet(data)
+    assert pd.api.types.is_integer_dtype(data["boundary"])
+
+
+def test_sanitize_dataframe_for_parquet_mixed_int_with_none():
+    """Mixed object column with None and large int should convert to strings."""
+    data = pd.DataFrame(
+        {
+            "mixed": pd.array([2**53 + 1, None, 42], dtype=object),
+        }
+    )
+    se._sanitize_dataframe_for_parquet(data)
+    assert data["mixed"].dtype == object
+    assert data["mixed"].iloc[0] == str(2**53 + 1)
+
+
+def test_sanitize_dataframe_for_parquet_decimal_int_precision_loss():
+    """Integer-valued Decimals above 2**53 should be converted to strings."""
+    from decimal import Decimal
+
+    data = pd.DataFrame(
+        {
+            "d": [Decimal("9007199254740993"), Decimal("42")],
+        }
+    )
+    se._sanitize_dataframe_for_parquet(data)
+    assert data["d"].dtype == object
+    assert data["d"].iloc[0] == str(Decimal("9007199254740993"))
+
+
+def test_sanitize_dataframe_for_parquet_precision_loss_preserves_value():
+    """Verify the string conversion preserves the exact integer value."""
+    val = 9007199254740993
+    assert float(val) == float(9007199254740992)  # proves precision loss in float64
+
+    data = pd.DataFrame({"x": [val]})
+    se._sanitize_dataframe_for_parquet(data)
+    assert data["x"].iloc[0] == "9007199254740993"  # exact value preserved
+
+
+def test_sanitize_dataframe_for_parquet_very_large_int():
+    """Integers far beyond 2**53 (e.g. 2**64) must also be converted."""
+    data = pd.DataFrame(
+        {
+            "huge": [2**64, 42],
+        }
+    )
+    se._sanitize_dataframe_for_parquet(data)
+    assert data["huge"].dtype == object
+    assert data["huge"].iloc[0] == str(2**64)
 
 
 def test_create_sql_ssh_uri_no_ssh():