From f4c4b56c4b119addf45c6704314a3775ef0fd831 Mon Sep 17 00:00:00 2001
From: Liu Haixin <hl3968hue@example.com>
Date: Wed, 25 Mar 2026 20:19:43 -0400
Subject: [PATCH 1/3] Add BigQuery integration for MTA ridership data

- Add load_data_to_bq.py script to load MTA data into BigQuery
- Modify utils.py to read data from BigQuery using service account
- Add pandas-gbq, google-cloud-bigquery, db-dtypes to requirements
- Add .streamlit/secrets.toml to .gitignore
- Fix plotly holiday vline compatibility with BigQuery date types

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .gitignore         |  5 +++-
 load_data_to_bq.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt   |  5 +++-
 streamlit_app.py   | 23 +++++++++++-----
 utils.py           | 22 ++++++++++++---
 5 files changed, 110 insertions(+), 12 deletions(-)
 create mode 100644 load_data_to_bq.py

diff --git a/.gitignore b/.gitignore
index b508df9..cff8e75 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,7 @@
 
 __pycache__/
 *.pyc
-.pytest_cache/
\ No newline at end of file
+.pytest_cache/
+
+# Streamlit secrets
+.streamlit/secrets.toml
\ No newline at end of file
diff --git a/load_data_to_bq.py b/load_data_to_bq.py
new file mode 100644
index 0000000..d9284fe
--- /dev/null
+++ b/load_data_to_bq.py
@@ -0,0 +1,67 @@
+"""Load MTA ridership data from NYC Open Data API into BigQuery."""
+
+import sys
+
+import pandas as pd
+import pydata_google_auth
+
+import pandas_gbq
+
+PROJECT_ID = "sipa-adv-c-bouncing-penguin"
+DATASET_TABLE = "mta_data.daily_ridership"
+
+SCOPES = [
+    "https://www.googleapis.com/auth/bigquery",
+]
+
+
+def get_credentials():
+    """Get Google credentials with browser-based auth flow."""
+    print("Authenticating with Google... A browser window should open.")
+    print("If it doesn't, copy the URL shown below and open it manually.")
+    credentials = pydata_google_auth.get_user_credentials(
+        SCOPES,
+        auth_local_webserver=False,
+    )
+    print("Authentication successful!")
+    return credentials
+
+
+def fetch_mta_data() -> pd.DataFrame:
+    """Pull MTA ridership data from NYC Open Data API."""
+    print("Fetching MTA data from NYC Open Data API...")
+    sys.stdout.flush()
+    url = "https://data.ny.gov/resource/vxuj-8kew.csv?$limit=50000"
+    df = pd.read_csv(url)
+    df["date"] = pd.to_datetime(df["date"])
+    print(f"Fetched {len(df)} rows (from {df['date'].min().date()} to {df['date'].max().date()})")
+    return df
+
+
+def main():
+    # Step 1: Authenticate
+    credentials = get_credentials()
+
+    # Step 2: Fetch data
+    df = fetch_mta_data()
+
+    # Step 3: Upload to BigQuery
+    print(f"Uploading to BigQuery: {PROJECT_ID}.{DATASET_TABLE} ...")
+    sys.stdout.flush()
+    pandas_gbq.to_gbq(
+        df,
+        destination_table=DATASET_TABLE,
+        project_id=PROJECT_ID,
+        if_exists="replace",
+        credentials=credentials,
+    )
+    print("Done! Data loaded to BigQuery successfully.")
+
+    # Step 4: Verify
+    query = f"SELECT COUNT(*) as row_count FROM `{PROJECT_ID}.{DATASET_TABLE}`"
+    result = pandas_gbq.read_gbq(query, project_id=PROJECT_ID, credentials=credentials)
+    print(f"Verification: {result['row_count'].iloc[0]} rows in BigQuery table.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
index a91dd02..b106761 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,4 +5,7 @@ requests
 ruff
 pytest
 matplotlib
-pandera
\ No newline at end of file
+pandera
+pandas-gbq
+google-cloud-bigquery
+db-dtypes
\ No newline at end of file
diff --git a/streamlit_app.py b/streamlit_app.py
index bfc5a1c..58e52bd 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -276,14 +276,23 @@ def fetch_data():
             dates = sel_holidays[sel_holidays["holiday"] == holiday]["date"]
             color = colors[i % len(colors)]
             for j, d in enumerate(dates):
+                d = pd.Timestamp(d)
                 if filtered["date"].min() <= d <= filtered["date"].max():
-                    fig_holiday.add_vline(
-                        x=d,
-                        line_dash="dot",
-                        line_color=color,
-                        annotation_text=holiday if j == 0 else None,
-                        annotation_position="top left",
+                    d_str = d.strftime("%Y-%m-%d")
+                    fig_holiday.add_shape(
+                        type="line",
+                        x0=d_str, x1=d_str,
+                        y0=0, y1=1,
+                        yref="paper",
+                        line=dict(dash="dot", color=color),
                     )
+                    if j == 0:
+                        fig_holiday.add_annotation(
+                            x=d_str, y=1, yref="paper",
+                            text=holiday,
+                            showarrow=False,
+                            xanchor="left",
+                        )
 
         fig_holiday.update_layout(
             yaxis_title="Subway Recovery (% of Pre-Pandemic)",
@@ -298,7 +307,7 @@ def fetch_data():
         st.markdown("**Average Subway Ridership Around Holidays**")
         impact_rows = []
         for _, row in sel_holidays.iterrows():
-            h_date = row["date"]
+            h_date = pd.Timestamp(row["date"])
             # 3-day window around the holiday
             window = filtered[
                 (filtered["date"] >= h_date - pd.Timedelta(days=1))
diff --git a/utils.py b/utils.py
index 5e7a5d6..e7e6026 100644
--- a/utils.py
+++ b/utils.py
@@ -1,11 +1,27 @@
 import matplotlib.pyplot as plt
 import pandas as pd
+from google.cloud import bigquery
+from google.oauth2 import service_account
+
+PROJECT_ID = "sipa-adv-c-bouncing-penguin"
+DATASET_TABLE = "mta_data.daily_ridership"
 
 
 def load_mta_data() -> pd.DataFrame:
-    """Load MTA ridership data from NYC Open Data API."""
-    url = "https://data.ny.gov/resource/vxuj-8kew.csv?$limit=50000"
-    df = pd.read_csv(url)
+    """Load MTA ridership data from BigQuery."""
+    try:
+        import streamlit as st
+
+        credentials = service_account.Credentials.from_service_account_info(
+            st.secrets["gcp_service_account"]
+        )
+        client = bigquery.Client(credentials=credentials, project=PROJECT_ID)
+    except Exception:
+        # Fallback: use default credentials (e.g. local gcloud auth)
+        client = bigquery.Client(project=PROJECT_ID)
+
+    query = f"SELECT * FROM `{PROJECT_ID}.{DATASET_TABLE}`"
+    df = client.query(query).to_dataframe()
     df = clean_mta_df(df)
     return df
 

From 650644cd8fe56f0ed4b6c664e0650e85bf6181bc Mon Sep 17 00:00:00 2001
From: Liu Haixin <hl3968hue@example.com>
Date: Wed, 25 Mar 2026 20:34:08 -0400
Subject: [PATCH 2/3] update test

---
 test_auth.py | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 test_auth.py

diff --git a/test_auth.py b/test_auth.py
new file mode 100644
index 0000000..e79374e
--- /dev/null
+++ b/test_auth.py
@@ -0,0 +1,8 @@
+import pydata_google_auth
+
+print("Starting auth...")
+creds = pydata_google_auth.get_user_credentials(
+    ["https://www.googleapis.com/auth/bigquery"],
+    auth_local_webserver=False,
+)
+print("Auth done!")

From 87e671bd2170ce29897fa4aed84177b67c2ff90e Mon Sep 17 00:00:00 2001
From: Liu Haixin <hl3968hue@example.com>
Date: Wed, 25 Mar 2026 20:35:42 -0400
Subject: [PATCH 3/3] Remove test_auth.py temporary file

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 test_auth.py | 8 --------
 1 file changed, 8 deletions(-)
 delete mode 100644 test_auth.py

diff --git a/test_auth.py b/test_auth.py
deleted file mode 100644
index e79374e..0000000
--- a/test_auth.py
+++ /dev/null
@@ -1,8 +0,0 @@
-import pydata_google_auth
-
-print("Starting auth...")
-creds = pydata_google_auth.get_user_credentials(
-    ["https://www.googleapis.com/auth/bigquery"],
-    auth_local_webserver=False,
-)
-print("Auth done!")