Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ plotly
requests
ruff
pytest
matplotlib
matplotlib
pandera
89 changes: 89 additions & 0 deletions tests/test_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pandas as pd
import pandera as pa
import pytest

from validation import validate_mta_data


def test_valid_data():
    """A well-formed two-row frame is accepted and both rows come back."""
    # One entry per schema column; every column carries two records.
    columns = {
        "date": ["2020-03-01", "2020-03-02"],
        "subways_total_estimated_ridership": [1000000.0, 1100000.0],
        "subways_of_comparable_pre_pandemic_day": [0.5, 0.6],
        "buses_total_estimated_ridership": [500000.0, 550000.0],
        "buses_of_comparable_pre_pandemic_day": [0.6, 0.65],
        "lirr_total_estimated_ridership": [100000.0, 110000.0],
        "lirr_of_comparable_pre_pandemic_day": [0.4, 0.45],
        "metro_north_total_estimated_ridership": [80000.0, 85000.0],
        "metro_north_of_comparable_pre_pandemic_day": [0.35, 0.4],
        "bridges_and_tunnels_total_traffic": [700000.0, 720000.0],
        "bridges_and_tunnels_of_comparable_pre_pandemic_day": [0.9, 0.92],
    }
    frame = pd.DataFrame(columns)

    validated = validate_mta_data(frame)

    row_count = len(validated)
    assert row_count == 2


def test_negative_ridership_fails():
    """A negative subway ridership count is expected to raise SchemaError."""
    # Single record written as scalars; only the subway count is out of range.
    record = {
        "date": "2020-03-01",
        "subways_total_estimated_ridership": -100.0,
        "subways_of_comparable_pre_pandemic_day": 0.5,
        "buses_total_estimated_ridership": 500000.0,
        "buses_of_comparable_pre_pandemic_day": 0.6,
        "lirr_total_estimated_ridership": 100000.0,
        "lirr_of_comparable_pre_pandemic_day": 0.4,
        "metro_north_total_estimated_ridership": 80000.0,
        "metro_north_of_comparable_pre_pandemic_day": 0.35,
        "bridges_and_tunnels_total_traffic": 700000.0,
        "bridges_and_tunnels_of_comparable_pre_pandemic_day": 0.9,
    }
    # Wrap each scalar in a one-element list to get a one-row frame.
    frame = pd.DataFrame({column: [value] for column, value in record.items()})

    with pytest.raises(pa.errors.SchemaError):
        validate_mta_data(frame)


def test_ratio_exceeds_max_fails():
    """A subway pre-pandemic ratio of 3.0 is expected to raise SchemaError."""
    # Single record written as scalars; only the subway ratio is out of range.
    record = {
        "date": "2020-03-01",
        "subways_total_estimated_ridership": 1000000.0,
        "subways_of_comparable_pre_pandemic_day": 3.0,
        "buses_total_estimated_ridership": 500000.0,
        "buses_of_comparable_pre_pandemic_day": 0.6,
        "lirr_total_estimated_ridership": 100000.0,
        "lirr_of_comparable_pre_pandemic_day": 0.4,
        "metro_north_total_estimated_ridership": 80000.0,
        "metro_north_of_comparable_pre_pandemic_day": 0.35,
        "bridges_and_tunnels_total_traffic": 700000.0,
        "bridges_and_tunnels_of_comparable_pre_pandemic_day": 0.9,
    }
    # Wrap each scalar in a one-element list to get a one-row frame.
    frame = pd.DataFrame({column: [value] for column, value in record.items()})

    with pytest.raises(pa.errors.SchemaError):
        validate_mta_data(frame)


def test_missing_date_fails():
    """A record whose date is None is expected to raise SchemaError."""
    # Single record written as scalars; only the date is missing.
    record = {
        "date": None,
        "subways_total_estimated_ridership": 1000000.0,
        "subways_of_comparable_pre_pandemic_day": 0.5,
        "buses_total_estimated_ridership": 500000.0,
        "buses_of_comparable_pre_pandemic_day": 0.6,
        "lirr_total_estimated_ridership": 100000.0,
        "lirr_of_comparable_pre_pandemic_day": 0.4,
        "metro_north_total_estimated_ridership": 80000.0,
        "metro_north_of_comparable_pre_pandemic_day": 0.35,
        "bridges_and_tunnels_total_traffic": 700000.0,
        "bridges_and_tunnels_of_comparable_pre_pandemic_day": 0.9,
    }
    # Wrap each scalar in a one-element list to get a one-row frame.
    frame = pd.DataFrame({column: [value] for column, value in record.items()})

    with pytest.raises(pa.errors.SchemaError):
        validate_mta_data(frame)
94 changes: 94 additions & 0 deletions validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import pandera as pa

# Schema for MTA Daily Ridership Data
#
# Every transit mode contributes two columns: an absolute count and a ratio
# against the comparable pre-pandemic day. The two private builders below keep
# the bounds for each kind of column defined in exactly one place.

# Upper bound accepted for every "*_of_comparable_pre_pandemic_day" column.
_MAX_PRE_PANDEMIC_RATIO = 2.0


def _count_column(description):
    """Return a nullable float column whose values must be >= 0.

    Args:
        description: Human-readable text stored on the column.
    """
    return pa.Column(
        float,
        nullable=True,
        checks=pa.Check.greater_than_or_equal_to(0),
        description=description,
    )


def _ratio_column(description):
    """Return a nullable float column limited to 0.._MAX_PRE_PANDEMIC_RATIO.

    Args:
        description: Human-readable text stored on the column.
    """
    return pa.Column(
        float,
        nullable=True,
        checks=[
            pa.Check.greater_than_or_equal_to(0),
            pa.Check.less_than_or_equal_to(_MAX_PRE_PANDEMIC_RATIO),
        ],
        description=description,
    )


mta_schema = pa.DataFrameSchema(
    {
        # The date is the only non-nullable column and has its own lower bound.
        "date": pa.Column(
            pa.DateTime,
            nullable=False,
            checks=pa.Check.greater_than_or_equal_to("2020-03-01"),
            description="Date of ridership record, starting from March 2020",
        ),
        "subways_total_estimated_ridership": _count_column(
            "Total estimated subway ridership"
        ),
        "subways_of_comparable_pre_pandemic_day": _ratio_column(
            "Subway ridership as ratio of pre-pandemic levels (0 to 2.0)"
        ),
        "buses_total_estimated_ridership": _count_column(
            "Total estimated bus ridership"
        ),
        "buses_of_comparable_pre_pandemic_day": _ratio_column(
            "Bus ridership as ratio of pre-pandemic levels"
        ),
        "lirr_total_estimated_ridership": _count_column(
            "Total estimated LIRR ridership"
        ),
        "lirr_of_comparable_pre_pandemic_day": _ratio_column(
            "LIRR ridership as ratio of pre-pandemic levels"
        ),
        "metro_north_total_estimated_ridership": _count_column(
            "Total estimated Metro-North ridership"
        ),
        "metro_north_of_comparable_pre_pandemic_day": _ratio_column(
            "Metro-North ridership as ratio of pre-pandemic levels"
        ),
        "bridges_and_tunnels_total_traffic": _count_column(
            "Total bridges and tunnels traffic"
        ),
        "bridges_and_tunnels_of_comparable_pre_pandemic_day": _ratio_column(
            "B&T traffic as ratio of pre-pandemic levels"
        ),
    },
    # Ask pandera to cast columns to the declared dtypes during validation.
    coerce=True,
)


def validate_mta_data(df):
    """Check *df* against ``mta_schema`` and return the validation result.

    Args:
        df: The ridership dataframe to validate.

    Returns:
        Whatever ``mta_schema.validate`` returns for *df*.

    Any exception raised by ``mta_schema.validate`` propagates unchanged.
    """
    validated = mta_schema.validate(df)
    return validated