From 357ccca44951e87f48320a9b2f9971848fb506e6 Mon Sep 17 00:00:00 2001
From: Liu Haixin
Date: Fri, 27 Feb 2026 15:12:58 -0500
Subject: [PATCH 1/3] Add Pandera data validation

---
Review notes (free-form text below the three-dash line; it is not part of
the commit message or of any hunk):

* requirements.txt: the old last line `matplotlib` had no trailing newline,
  and this hunk replaces it with the single token `matplotlibpandera`
  instead of two separate requirements. PATCH 2/3 splits it into
  `matplotlib` and `pandera`.
* tests/test_validation.py: three tests use `pa.errors.SchemaError`, but the
  file as added here imports only pandas, pytest and validate_mta_data.
  PATCH 3/3 adds `import pandera as pa`.
* validation.py: `mta_schema` is built with coerce=True. `date` is a
  non-nullable pa.DateTime column checked against "2020-03-01" as a lower
  bound. The five ridership/traffic total columns are nullable floats with
  a lower bound of 0. The five `*_of_comparable_pre_pandemic_day` columns
  are nullable floats bounded to the range 0 to 2.0 inclusive.
  `validate_mta_data(df)` returns `mta_schema.validate(df)`.

 requirements.txt         |  2 +-
 tests/test_validation.py | 88 +++++++++++++++++++++++++++++++++++++
 validation.py            | 94 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_validation.py
 create mode 100644 validation.py

diff --git a/requirements.txt b/requirements.txt
index e35eafc..50a7580 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,4 @@ plotly
 requests
 ruff
 pytest
-matplotlib
\ No newline at end of file
+matplotlibpandera
diff --git a/tests/test_validation.py b/tests/test_validation.py
new file mode 100644
index 0000000..5b7a965
--- /dev/null
+++ b/tests/test_validation.py
@@ -0,0 +1,88 @@
+import pandas as pd
+import pytest
+
+from validation import validate_mta_data
+
+
+def test_valid_data():
+    """Test that valid data passes validation."""
+    df = pd.DataFrame(
+        {
+            "date": ["2020-03-01", "2020-03-02"],
+            "subways_total_estimated_ridership": [1000000.0, 1100000.0],
+            "subways_of_comparable_pre_pandemic_day": [0.5, 0.6],
+            "buses_total_estimated_ridership": [500000.0, 550000.0],
+            "buses_of_comparable_pre_pandemic_day": [0.6, 0.65],
+            "lirr_total_estimated_ridership": [100000.0, 110000.0],
+            "lirr_of_comparable_pre_pandemic_day": [0.4, 0.45],
+            "metro_north_total_estimated_ridership": [80000.0, 85000.0],
+            "metro_north_of_comparable_pre_pandemic_day": [0.35, 0.4],
+            "bridges_and_tunnels_total_traffic": [700000.0, 720000.0],
+            "bridges_and_tunnels_of_comparable_pre_pandemic_day": [0.9, 0.92],
+        }
+    )
+    result = validate_mta_data(df)
+    assert len(result) == 2
+
+
+def test_negative_ridership_fails():
+    """Test that negative ridership values fail validation."""
+    df = pd.DataFrame(
+        {
+            "date": ["2020-03-01"],
+            "subways_total_estimated_ridership": [-100.0],
+            "subways_of_comparable_pre_pandemic_day": [0.5],
+            "buses_total_estimated_ridership": [500000.0],
+            "buses_of_comparable_pre_pandemic_day": [0.6],
+            "lirr_total_estimated_ridership": [100000.0],
+            "lirr_of_comparable_pre_pandemic_day": [0.4],
+            "metro_north_total_estimated_ridership": [80000.0],
+            "metro_north_of_comparable_pre_pandemic_day": [0.35],
+            "bridges_and_tunnels_total_traffic": [700000.0],
+            "bridges_and_tunnels_of_comparable_pre_pandemic_day": [0.9],
+        }
+    )
+    with pytest.raises(pa.errors.SchemaError):
+        validate_mta_data(df)
+
+
+def test_ratio_exceeds_max_fails():
+    """Test that pre-pandemic ratio > 2.0 fails validation."""
+    df = pd.DataFrame(
+        {
+            "date": ["2020-03-01"],
+            "subways_total_estimated_ridership": [1000000.0],
+            "subways_of_comparable_pre_pandemic_day": [3.0],
+            "buses_total_estimated_ridership": [500000.0],
+            "buses_of_comparable_pre_pandemic_day": [0.6],
+            "lirr_total_estimated_ridership": [100000.0],
+            "lirr_of_comparable_pre_pandemic_day": [0.4],
+            "metro_north_total_estimated_ridership": [80000.0],
+            "metro_north_of_comparable_pre_pandemic_day": [0.35],
+            "bridges_and_tunnels_total_traffic": [700000.0],
+            "bridges_and_tunnels_of_comparable_pre_pandemic_day": [0.9],
+        }
+    )
+    with pytest.raises(pa.errors.SchemaError):
+        validate_mta_data(df)
+
+
+def test_missing_date_fails():
+    """Test that null dates fail validation."""
+    df = pd.DataFrame(
+        {
+            "date": [None],
+            "subways_total_estimated_ridership": [1000000.0],
+            "subways_of_comparable_pre_pandemic_day": [0.5],
+            "buses_total_estimated_ridership": [500000.0],
+            "buses_of_comparable_pre_pandemic_day": [0.6],
+            "lirr_total_estimated_ridership": [100000.0],
+            "lirr_of_comparable_pre_pandemic_day": [0.4],
+            "metro_north_total_estimated_ridership": [80000.0],
+            "metro_north_of_comparable_pre_pandemic_day": [0.35],
+            "bridges_and_tunnels_total_traffic": [700000.0],
+            "bridges_and_tunnels_of_comparable_pre_pandemic_day": [0.9],
+        }
+    )
+    with pytest.raises(pa.errors.SchemaError):
+        validate_mta_data(df)
diff --git a/validation.py b/validation.py
new file mode 100644
index 0000000..90cb567
--- /dev/null
+++ b/validation.py
@@ -0,0 +1,94 @@
+import pandera as pa
+
+# Schema for MTA Daily Ridership Data
+mta_schema = pa.DataFrameSchema(
+    {
+        "date": pa.Column(
+            pa.DateTime,
+            nullable=False,
+            checks=pa.Check.greater_than_or_equal_to("2020-03-01"),
+            description="Date of ridership record, starting from March 2020",
+        ),
+        "subways_total_estimated_ridership": pa.Column(
+            float,
+            nullable=True,
+            checks=pa.Check.greater_than_or_equal_to(0),
+            description="Total estimated subway ridership",
+        ),
+        "subways_of_comparable_pre_pandemic_day": pa.Column(
+            float,
+            nullable=True,
+            checks=[
+                pa.Check.greater_than_or_equal_to(0),
+                pa.Check.less_than_or_equal_to(2.0),
+            ],
+            description="Subway ridership as ratio of pre-pandemic levels (0 to 2.0)",
+        ),
+        "buses_total_estimated_ridership": pa.Column(
+            float,
+            nullable=True,
+            checks=pa.Check.greater_than_or_equal_to(0),
+            description="Total estimated bus ridership",
+        ),
+        "buses_of_comparable_pre_pandemic_day": pa.Column(
+            float,
+            nullable=True,
+            checks=[
+                pa.Check.greater_than_or_equal_to(0),
+                pa.Check.less_than_or_equal_to(2.0),
+            ],
+            description="Bus ridership as ratio of pre-pandemic levels",
+        ),
+        "lirr_total_estimated_ridership": pa.Column(
+            float,
+            nullable=True,
+            checks=pa.Check.greater_than_or_equal_to(0),
+            description="Total estimated LIRR ridership",
+        ),
+        "lirr_of_comparable_pre_pandemic_day": pa.Column(
+            float,
+            nullable=True,
+            checks=[
+                pa.Check.greater_than_or_equal_to(0),
+                pa.Check.less_than_or_equal_to(2.0),
+            ],
+            description="LIRR ridership as ratio of pre-pandemic levels",
+        ),
+        "metro_north_total_estimated_ridership": pa.Column(
+            float,
+            nullable=True,
+            checks=pa.Check.greater_than_or_equal_to(0),
+            description="Total estimated Metro-North ridership",
+        ),
+        "metro_north_of_comparable_pre_pandemic_day": pa.Column(
+            float,
+            nullable=True,
+            checks=[
+                pa.Check.greater_than_or_equal_to(0),
+                pa.Check.less_than_or_equal_to(2.0),
+            ],
+            description="Metro-North ridership as ratio of pre-pandemic levels",
+        ),
+        "bridges_and_tunnels_total_traffic": pa.Column(
+            float,
+            nullable=True,
+            checks=pa.Check.greater_than_or_equal_to(0),
+            description="Total bridges and tunnels traffic",
+        ),
+        "bridges_and_tunnels_of_comparable_pre_pandemic_day": pa.Column(
+            float,
+            nullable=True,
+            checks=[
+                pa.Check.greater_than_or_equal_to(0),
+                pa.Check.less_than_or_equal_to(2.0),
+            ],
+            description="B&T traffic as ratio of pre-pandemic levels",
+        ),
+    },
+    coerce=True,
+)
+
+
+def validate_mta_data(df):
+    """Validate MTA ridership dataframe against schema."""
+    return mta_schema.validate(df)

From 13afa2fcadef47983df466b3f2c6eb03587052e0 Mon Sep 17 00:00:00 2001
From: Liu Haixin
Date: Fri, 27 Feb 2026 15:16:02 -0500
Subject: [PATCH 2/3] fixed requirements

---
Review notes (free-form text below the three-dash line):

* Replaces the merged `matplotlibpandera` line introduced by PATCH 1/3 with
  separate `matplotlib` and `pandera` lines. After this hunk the file again
  ends without a trailing newline (see the marker after `+pandera`).

 requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 50a7580..a91dd02 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ plotly
 requests
 ruff
 pytest
-matplotlibpandera
+matplotlib
+pandera
\ No newline at end of file

From d4718a296353da3caa62b3235f3afd975998eff7 Mon Sep 17 00:00:00 2001
From: Liu Haixin
Date: Fri, 27 Feb 2026 15:27:55 -0500
Subject: [PATCH 3/3] Add missing pandera import in test

---
Review notes (free-form text below the three-dash line):

* Adds `import pandera as pa` to tests/test_validation.py. This is the name
  used by the three `pytest.raises(pa.errors.SchemaError)` checks that
  PATCH 1/3 added without importing it.

 tests/test_validation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_validation.py b/tests/test_validation.py
index 5b7a965..1dba220 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import pandera as pa
 import pytest
 
 from validation import validate_mta_data