PGH Transit Atlas - EDA Notebook¶
Rizaldy Utomo | Public Policy, Analytics, AI Management @ CMU | rutomo@andrew.cmu.edu
🚌 🚴 Interactive Dashboard: rzrizaldy.github.io/pgh-transit-atlas | 📂 Code Repository: github.com/rzrizaldy/pgh-transit-atlas
This notebook performs full ETL + EDA on Pittsburgh's POGOH bikeshare and PRT bus data.
Data Sources:
- POGOH trip data (Nov 2024 - Oct 2025) - 12 Excel files
- POGOH station locations
- PRT bus stop usage
In [1]:
# Import libraries
import pandas as pd
import numpy as np
import json
from pathlib import Path
from datetime import datetime
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
import warnings
warnings.filterwarnings('ignore')
# Config for inline display
%matplotlib inline
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
print("✓ Libraries imported")
✓ Libraries imported
In [2]:
# Create output directory for static visualizations
output_dir = Path('./static_viz')
output_dir.mkdir(exist_ok=True)
print(f"✓ Output directory: {output_dir.absolute()}")
✓ Output directory: /Users/rzrizaldy/CodeFolder/pgh_transit_atlas/static_viz
1. Data Ingestion¶
In [3]:
# Load station metadata
stations = pd.read_excel('dataset/pogoh-station-locations-october-2025.xlsx')
print(f"Loaded {len(stations)} stations")
stations.head()
Loaded 60 stations
Out[3]:
| | Id | Name | Total Docks | Latitude | Longitude |
|---|---|---|---|---|---|
| 0 | 1 | Pierce St & Summerlea St | 19 | 40.456507 | -79.932331 |
| 1 | 2 | Eliza Furnace Trail & Swineburne St | 15 | 40.425800 | -79.953400 |
| 2 | 3 | Centre Ave & Addison St | 15 | 40.444600 | -79.978300 |
| 3 | 4 | Burns White Center at 3 Crossings | 15 | 40.456400 | -79.980000 |
| 4 | 5 | Allegheny Station | 19 | 40.448301 | -80.018007 |
In [4]:
# Load 12 months of trip data
trip_files = [
'november-2024.xlsx', 'december-2024.xlsx', 'january-2025.xlsx',
'february-2025.xlsx', 'march-2025.xlsx', 'april-2025.xlsx',
'may-2025.xlsx', 'june-2025.xlsx', 'july-2025.xlsx',
'august-2025.xlsx', 'september-2025.xlsx', 'pogoh-october-2025.xlsx'
]
trips_list = []
for file in trip_files:
df = pd.read_excel(f'dataset/{file}')
trips_list.append(df)
print(f"{file}: {len(df):,} trips")
trips = pd.concat(trips_list, ignore_index=True)
print(f"\nTotal: {len(trips):,} trips")
november-2024.xlsx: 43,436 trips
december-2024.xlsx: 19,739 trips
january-2025.xlsx: 15,940 trips
february-2025.xlsx: 23,571 trips
march-2025.xlsx: 35,940 trips
april-2025.xlsx: 47,523 trips
may-2025.xlsx: 32,922 trips
june-2025.xlsx: 36,345 trips
july-2025.xlsx: 43,958 trips
august-2025.xlsx: 65,153 trips
september-2025.xlsx: 101,198 trips
pogoh-october-2025.xlsx: 91,817 trips

Total: 557,542 trips
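Since pd.concat aligns on column names silently, a renamed column in any one monthly export would surface only as NaNs downstream. A quick schema check across the twelve files (a sketch, not run above) catches that early:

expected_cols = None
for file in trip_files:
    cols = list(pd.read_excel(f'dataset/{file}', nrows=0).columns)
    if expected_cols is None:
        expected_cols = cols
    elif cols != expected_cols:
        print(f"⚠ Schema mismatch in {file}: {cols}")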
In [5]:
# Check data structure
trips.head(10)
Out[5]:
| | Closed Status | Duration | Start Station Id | Start Date | Start Station Name | End Date | End Station Id | End Station Name | Rider Type |
|---|---|---|---|---|---|---|---|---|---|
| 0 | NORMAL | 253 | 28 | 2024-11-30 23:51:33 | Fifth Ave & S Bouquet St | 2024-11-30 23:55:46 | 20.0 | Boulevard of the Allies & Parkview Ave | MEMBER |
| 1 | NORMAL | 1368 | 59 | 2024-11-30 23:49:22 | Forbes Ave at TCS Hall (CMU Campus) | 2024-12-01 00:12:10 | 59.0 | Forbes Ave at TCS Hall (CMU Campus) | MEMBER |
| 2 | NORMAL | 1394 | 59 | 2024-11-30 23:49:12 | Forbes Ave at TCS Hall (CMU Campus) | 2024-12-01 00:12:26 | 59.0 | Forbes Ave at TCS Hall (CMU Campus) | MEMBER |
| 3 | NORMAL | 198 | 12 | 2024-11-30 23:47:49 | O'Hara St and University Place | 2024-11-30 23:51:07 | 34.0 | N Dithridge St & Centre Ave | MEMBER |
| 4 | NORMAL | 210 | 28 | 2024-11-30 23:45:32 | Fifth Ave & S Bouquet St | 2024-11-30 23:49:02 | 20.0 | Boulevard of the Allies & Parkview Ave | MEMBER |
| 5 | NORMAL | 281 | 51 | 2024-11-30 23:38:04 | Coltart Ave & Forbes Ave | 2024-11-30 23:42:45 | 20.0 | Boulevard of the Allies & Parkview Ave | MEMBER |
| 6 | NORMAL | 323 | 37 | 2024-11-30 23:31:23 | S Negley Ave & Centre Ave | 2024-11-30 23:36:46 | 30.0 | Shady Ave & Ellsworth Ave | MEMBER |
| 7 | NORMAL | 1372 | 28 | 2024-11-30 23:25:31 | Fifth Ave & S Bouquet St | 2024-11-30 23:48:23 | 59.0 | Forbes Ave at TCS Hall (CMU Campus) | MEMBER |
| 8 | NORMAL | 1323 | 28 | 2024-11-30 23:25:19 | Fifth Ave & S Bouquet St | 2024-11-30 23:47:22 | 59.0 | Forbes Ave at TCS Hall (CMU Campus) | MEMBER |
| 9 | NORMAL | 201 | 13 | 2024-11-30 23:21:02 | S Bouquet Ave & Sennott St | 2024-11-30 23:24:23 | 28.0 | Fifth Ave & S Bouquet St | MEMBER |
In [6]:
trips.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557542 entries, 0 to 557541
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype
---  ------              --------------   -----
 0   Closed Status       557542 non-null  object
 1   Duration            557542 non-null  int64
 2   Start Station Id    557542 non-null  int64
 3   Start Date          557542 non-null  datetime64[ns]
 4   Start Station Name  557542 non-null  object
 5   End Date            557542 non-null  datetime64[ns]
 6   End Station Id      557440 non-null  float64
 7   End Station Name    557440 non-null  object
 8   Rider Type          557542 non-null  object
dtypes: datetime64[ns](2), float64(1), int64(2), object(4)
memory usage: 38.3+ MB
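The four object columns drive most of that ~38 MB footprint (the '+' marks it as a lower bound). If memory became a constraint, converting the low-cardinality strings to category dtype would shrink it considerably; a sketch, not applied to the analysis:

for col in ['Closed Status', 'Start Station Name', 'End Station Name', 'Rider Type']:
    trips[col] = trips[col].astype('category')
print(f"{trips.memory_usage(deep=True).sum() / 1e6:.1f} MB with categorical strings")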
2. Data Cleaning & Feature Engineering¶
In [7]:
# Parse dates and filter outliers
trips['Start Date'] = pd.to_datetime(trips['Start Date'])
trips['End Date'] = pd.to_datetime(trips['End Date'])
initial = len(trips)
trips = trips[trips['Duration'] <= 14400] # Filter > 4 hours
print(f"Filtered {initial - len(trips):,} outliers (>4hrs)")
print(f"Remaining: {len(trips):,} trips")
Filtered 1,027 outliers (>4hrs)
Remaining: 556,515 trips
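The duration filter only trims the long tail; the short end is worth a look too, since sub-minute trips that start and end at the same station are usually false starts or re-docks rather than real rides. A diagnostic sketch (counting only, not dropping):

false_starts = trips[(trips['Duration'] < 60) &
                     (trips['Start Station Id'] == trips['End Station Id'])]
print(f"Possible false starts (<60s, same station): {len(false_starts):,}")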
In [8]:
# Add temporal features
trips['hour'] = trips['Start Date'].dt.hour
trips['day_of_week'] = trips['Start Date'].dt.dayofweek
trips['day_name'] = trips['Start Date'].dt.day_name()
trips['date'] = trips['Start Date'].dt.date
trips['month'] = trips['Start Date'].dt.month
SEASON_ORDER = ['Winter', 'Spring', 'Summer', 'Fall']
season_map = {
12: 'Winter', 1: 'Winter', 2: 'Winter',
3: 'Spring', 4: 'Spring', 5: 'Spring',
6: 'Summer', 7: 'Summer', 8: 'Summer',
9: 'Fall', 10: 'Fall', 11: 'Fall'
}
trips['season'] = trips['month'].map(season_map)
trips['season'] = pd.Categorical(trips['season'], categories=SEASON_ORDER, ordered=True)
trips[['Start Date', 'hour', 'day_name', 'season']].head(10)
Out[8]:
| | Start Date | hour | day_name | season |
|---|---|---|---|---|
| 0 | 2024-11-30 23:51:33 | 23 | Saturday | Fall |
| 1 | 2024-11-30 23:49:22 | 23 | Saturday | Fall |
| 2 | 2024-11-30 23:49:12 | 23 | Saturday | Fall |
| 3 | 2024-11-30 23:47:49 | 23 | Saturday | Fall |
| 4 | 2024-11-30 23:45:32 | 23 | Saturday | Fall |
| 5 | 2024-11-30 23:38:04 | 23 | Saturday | Fall |
| 6 | 2024-11-30 23:31:23 | 23 | Saturday | Fall |
| 7 | 2024-11-30 23:25:31 | 23 | Saturday | Fall |
| 8 | 2024-11-30 23:25:19 | 23 | Saturday | Fall |
| 9 | 2024-11-30 23:21:02 | 23 | Saturday | Fall |
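Later cells test day_of_week >= 5 inline; naming that flag once (a sketch) makes the intent explicit and reusable:

trips['is_weekend'] = trips['day_of_week'] >= 5  # Saturday=5, Sunday=6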
In [9]:
# Merge coordinates
trips = trips.merge(
stations[['Id', 'Latitude', 'Longitude']],
left_on='Start Station Id', right_on='Id', how='left'
).rename(columns={'Latitude': 'start_lat', 'Longitude': 'start_lon'})
trips = trips.merge(
stations[['Id', 'Latitude', 'Longitude']],
left_on='End Station Id', right_on='Id', how='left', suffixes=('', '_end')
).rename(columns={'Latitude': 'end_lat', 'Longitude': 'end_lon'})
print("✓ Merged station coordinates")
trips[['Start Station Name', 'start_lat', 'start_lon']].head()
✓ Merged station coordinates
Out[9]:
| | Start Station Name | start_lat | start_lon |
|---|---|---|---|
| 0 | Fifth Ave & S Bouquet St | 40.442200 | -79.957600 |
| 1 | Forbes Ave at TCS Hall (CMU Campus) | 40.444654 | -79.947535 |
| 2 | Forbes Ave at TCS Hall (CMU Campus) | 40.444654 | -79.947535 |
| 3 | O'Hara St and University Place | 40.445117 | -79.957118 |
| 4 | Fifth Ave & S Bouquet St | 40.442200 | -79.957600 |
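One housekeeping note: each merge also carries over the station key, leaving redundant Id and Id_end columns on trips. A one-line sketch to drop them:

trips = trips.drop(columns=['Id', 'Id_end'], errors='ignore')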
In [10]:
# Calculate Haversine displacement
def haversine_distance(lat1, lon1, lat2, lon2):
R = 6371000
lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
dlat = lat2 - lat1
dlon = lon2 - lon1
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arcsin(np.sqrt(a))
return R * c
trips['displacement'] = haversine_distance(
trips['start_lat'], trips['start_lon'],
trips['end_lat'], trips['end_lon']
)
print("✓ Calculated displacement")
trips[['Duration', 'displacement']].describe()
✓ Calculated displacement
Out[10]:
| | Duration | displacement |
|---|---|---|
| count | 556515.000000 | 556437.000000 |
| mean | 705.091241 | 1252.059636 |
| std | 953.957185 | 1131.984984 |
| min | 0.000000 | 0.000000 |
| 25% | 229.000000 | 516.520501 |
| 50% | 397.000000 | 868.728209 |
| 75% | 824.000000 | 1655.615969 |
| max | 14389.000000 | 10776.503161 |
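As a sanity check on the haversine helper, the distance between two stations from the metadata table should land in a plausible range; a sketch using the Pierce St and Allegheny Station coordinates shown earlier (roughly 7.3 km apart as the crow flies):

d = haversine_distance(40.456507, -79.932331, 40.448301, -80.018007)
print(f"Pierce St ↔ Allegheny Station: {d/1000:.2f} km")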
In [11]:
# Flag Campus Corridor trips
CAMPUS_LAT_MIN, CAMPUS_LAT_MAX = 40.435, 40.450
CAMPUS_LON_MIN, CAMPUS_LON_MAX = -79.970, -79.940
trips['is_campus'] = (
((trips['start_lat'] >= CAMPUS_LAT_MIN) & (trips['start_lat'] <= CAMPUS_LAT_MAX) &
(trips['start_lon'] >= CAMPUS_LON_MIN) & (trips['start_lon'] <= CAMPUS_LON_MAX)) |
((trips['end_lat'] >= CAMPUS_LAT_MIN) & (trips['end_lat'] <= CAMPUS_LAT_MAX) &
(trips['end_lon'] >= CAMPUS_LON_MIN) & (trips['end_lon'] <= CAMPUS_LON_MAX))
)
campus_pct = trips['is_campus'].sum() / len(trips) * 100
print(f"Campus Corridor: {trips['is_campus'].sum():,} ({campus_pct:.1f}%)")
print(f"City-wide: {(~trips['is_campus']).sum():,} ({100-campus_pct:.1f}%)")
Campus Corridor: 378,811 (68.1%)
City-wide: 177,704 (31.9%)
FIG 1: Daily Timeseries¶
Visualizing how ridership tracks the academic calendar.
In [12]:
# Daily timeseries
daily_pogoh = trips.groupby('date').size().reset_index(name='trips')
daily_campus = trips[trips['is_campus']].groupby('date').size().reset_index(name='campus_trips')
daily_city = trips[~trips['is_campus']].groupby('date').size().reset_index(name='city_trips')
daily_full = daily_pogoh.merge(daily_campus, on='date', how='left').merge(daily_city, on='date', how='left').fillna(0)
daily_full['date'] = pd.to_datetime(daily_full['date'])
print(f"{len(daily_full)} days")
daily_full.head(10)
365 days
Out[12]:
| | date | trips | campus_trips | city_trips |
|---|---|---|---|---|
| 0 | 2024-11-01 | 2391 | 1798 | 593 |
| 1 | 2024-11-02 | 1851 | 1216 | 635 |
| 2 | 2024-11-03 | 1778 | 1251 | 527 |
| 3 | 2024-11-04 | 2206 | 1801 | 405 |
| 4 | 2024-11-05 | 2219 | 1621 | 598 |
| 5 | 2024-11-06 | 2240 | 1780 | 460 |
| 6 | 2024-11-07 | 2457 | 1950 | 507 |
| 7 | 2024-11-08 | 2440 | 1903 | 537 |
| 8 | 2024-11-09 | 1778 | 1134 | 644 |
| 9 | 2024-11-10 | 719 | 565 | 154 |
In [13]:
# Peak/trough
peak = daily_full.loc[daily_full['trips'].idxmax()]
trough = daily_full.loc[daily_full['trips'].idxmin()]
print(f"Peak: {peak['date'].strftime('%b %d')} with {int(peak['trips']):,} trips")
print(f"Trough: {trough['date'].strftime('%b %d')} with {int(trough['trips']):,} trips")
print(f"Variance: {peak['trips']/trough['trips']:.1f}x")
Peak: Sep 19 with 3,892 trips
Trough: Dec 22 with 98 trips
Peak-to-trough ratio: 39.7x
In [14]:
# Plot Daily Timeseries
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(daily_full['date'], daily_full['trips'], color='#2B4CFF', linewidth=2, label='Total Trips')
# Highlight Campus vs City if available
if 'campus_trips' in daily_full.columns:
ax.plot(daily_full['date'], daily_full['campus_trips'], color='#FF2B8C', linewidth=1.5, alpha=0.7, label='Campus Corridor')
ax.plot(daily_full['date'], daily_full['city_trips'], color='#00B884', linewidth=1.5, alpha=0.7, label='City Network')
ax.set_title('FIG 1: Daily Trip Volume (Nov 2024 - Oct 2025)', fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Trips', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)
# Annotate Peak
peak = daily_full.loc[daily_full['trips'].idxmax()]
ax.annotate(f"Peak: {int(peak['trips']):,}\n{peak['date'].strftime('%b %d')}",
xy=(peak['date'], peak['trips']),
xytext=(peak['date'], peak['trips']+500),
arrowprops=dict(arrowstyle='->', color='black'),
fontsize=10, fontweight='bold', ha='center')
plt.tight_layout()
plt.savefig(output_dir / 'fig1_daily_timeseries.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved fig1_daily_timeseries.png")
✓ Saved fig1_daily_timeseries.png
Trip Archetypes¶
In [15]:
# Prepare features
trips_clean = trips.dropna(subset=['displacement', 'Duration', 'hour']).copy()  # copy so cluster labels can be added safely
features = trips_clean[['Duration', 'displacement', 'hour']].values
print(f"Feature matrix: {features.shape}")
# Normalize
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)
print(f"Duration: mean={scaler.mean_[0]:.1f}s, std={scaler.scale_[0]:.1f}")
print(f"Displacement: mean={scaler.mean_[1]:.1f}m")
Feature matrix: (556437, 3)
Duration: mean=705.0s, std=953.7
Displacement: mean=1252.1m
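One modeling caveat: hour of day is cyclical (23:00 is adjacent to 00:00), but a linear hour feature places those trips at opposite ends of the scale. A sin/cos encoding (a sketch, not used in the clustering below) removes that distortion:

hour_angle = 2 * np.pi * trips_clean['hour'] / 24
trips_clean['hour_sin'] = np.sin(hour_angle)
trips_clean['hour_cos'] = np.cos(hour_angle)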
In [16]:
# K-Means (k=4)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
trips_clean['archetype'] = kmeans.fit_predict(features_scaled)
print(f"✓ K-Means inertia: {kmeans.inertia_:.1f}")
print("\nCluster sizes:")
print(trips_clean['archetype'].value_counts().sort_index())
✓ K-Means inertia: 647166.6

Cluster sizes:
archetype
0    266603
1     87128
2    182663
3     20043
Name: count, dtype: int64
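k=4 is fixed a priori here; an elbow check (a sketch, not executed in this run) would confirm it is a reasonable choice:

inertias = {}
for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(features_scaled)
    inertias[k] = km.inertia_
plt.plot(list(inertias.keys()), list(inertias.values()), marker='o')
plt.xlabel('k'); plt.ylabel('Inertia'); plt.title('Elbow check for K-Means')
plt.show()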
In [17]:
# Analyze centroids and label
archetype_labels = []
for i in range(4):
cluster = trips_clean[trips_clean['archetype'] == i]
avg_dur = cluster['Duration'].mean()
avg_dist = cluster['displacement'].mean()
avg_hour = cluster['hour'].mean()
weekend_pct = (cluster['day_of_week'] >= 5).mean() * 100
# Label logic
if (avg_hour >= 7 and avg_hour <= 9) or (avg_hour >= 17 and avg_hour <= 19):
label = "Commuter"
elif weekend_pct > 40:
label = "Leisure"
elif avg_dur < 600:
label = "Last-Mile"
else:
label = "Errand"
archetype_labels.append({
'cluster': i, 'label': label, 'count': len(cluster),
'avg_duration': avg_dur, 'avg_displacement': avg_dist,
'avg_hour': avg_hour, 'weekend_pct': weekend_pct
})
print(f"Cluster {i} → {label}")
print(f" {len(cluster):,} trips ({len(cluster)/len(trips_clean)*100:.1f}%)")
print(f" Dur: {avg_dur/60:.1f}min, Dist: {avg_dist:.0f}m, Hour: {avg_hour:.1f}\n")
Cluster 0 → Commuter
  266,603 trips (47.9%)
  Dur: 7.7min, Dist: 836m, Hour: 17.7

Cluster 1 → Errand
  87,128 trips (15.7%)
  Dur: 20.1min, Dist: 3359m, Hour: 14.8

Cluster 2 → Last-Mile
  182,663 trips (32.8%)
  Dur: 7.0min, Dist: 910m, Hour: 9.3

Cluster 3 → Leisure
  20,043 trips (3.6%)
  Dur: 73.2min, Dist: 737m, Hour: 14.4
In [18]:
# Map labels
archetype_map = {a['cluster']: a['label'] for a in archetype_labels}
trips_clean['archetype_label'] = trips_clean['archetype'].map(archetype_map)
trips_clean[['Start Station Name', 'Duration', 'displacement', 'archetype_label']].head(10)
Out[18]:
| | Start Station Name | Duration | displacement | archetype_label |
|---|---|---|---|---|
| 0 | Fifth Ave & S Bouquet St | 253 | 992.447135 | Commuter |
| 1 | Forbes Ave at TCS Hall (CMU Campus) | 1368 | 0.000000 | Commuter |
| 2 | Forbes Ave at TCS Hall (CMU Campus) | 1394 | 0.000000 | Commuter |
| 3 | O'Hara St and University Place | 198 | 769.589720 | Commuter |
| 4 | Fifth Ave & S Bouquet St | 210 | 992.447135 | Commuter |
| 5 | Coltart Ave & Forbes Ave | 281 | 876.021030 | Commuter |
| 6 | S Negley Ave & Centre Ave | 323 | 1063.997498 | Commuter |
| 7 | Fifth Ave & S Bouquet St | 1372 | 894.352617 | Commuter |
| 8 | Fifth Ave & S Bouquet St | 1323 | 894.352617 | Commuter |
| 9 | S Bouquet Ave & Sennott St | 201 | 167.470362 | Commuter |
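A quick cross-check of the heuristic labels (a sketch): the weekend share per archetype should line up with the rules above, with Leisure the highest:

weekend_share = pd.crosstab(trips_clean['archetype_label'],
                            trips_clean['day_of_week'] >= 5,
                            normalize='index')[True] * 100
print(weekend_share.round(1))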
FIG 2: Hourly Pattern¶
In [19]:
hourly = trips.groupby('hour').size().reset_index(name='trips')
seasonal_hourly = (
trips.groupby(['season', 'hour']).size()
.reset_index(name='trips')
.pivot(index='hour', columns='season', values='trips')
.reindex(columns=SEASON_ORDER)
)
seasonal_hourly.head(10)
Out[19]:
| season | Winter | Spring | Summer | Fall |
|---|---|---|---|---|
| hour | ||||
| 0 | 860 | 1945 | 2622 | 4211 |
| 1 | 639 | 1425 | 1820 | 3016 |
| 2 | 393 | 897 | 1201 | 1864 |
| 3 | 287 | 404 | 585 | 902 |
| 4 | 161 | 278 | 482 | 398 |
| 5 | 184 | 338 | 780 | 808 |
| 6 | 647 | 1129 | 1863 | 1978 |
| 7 | 1651 | 2931 | 4266 | 5861 |
| 8 | 2283 | 4159 | 5615 | 7664 |
| 9 | 3258 | 4569 | 5755 | 10164 |
In [20]:
fig, ax = plt.subplots(figsize=(14, 5))
season_palette = {
'Winter': '#2B4CFF',
'Spring': '#00B884',
'Summer': '#FF4D00',
'Fall': '#FF2B8C'
}
for season, color in season_palette.items():
series = seasonal_hourly[season].fillna(0)
ax.plot(series.index, series.values, linewidth=2.5, color=color, alpha=0.9, marker='o', markersize=5, label=season)
ax.set_title('FIG 2: Hourly Trip Distribution by Season', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Hour of Day', fontsize=12, fontweight='bold')
ax.set_ylabel('Total Trips', fontsize=12, fontweight='bold')
ax.set_xticks(range(0, 24, 2))
ax.grid(alpha=0.3, linestyle='--')
ax.legend(title='Season')
plt.tight_layout()
plt.savefig(output_dir / 'fig2_hourly.png', dpi=150, bbox_inches='tight')
plt.show()
print('✓ Saved fig2_hourly.png')
✓ Saved fig2_hourly.png
Finding: Bimodal commute pattern (peaks at 8-9 AM and 5 PM)
In [21]:
# Calculate Heatmap Data
heatmap_data = trips.groupby(['day_name', 'hour']).size().reset_index(name='count')
days_ordered = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_pivot = heatmap_data.pivot_table(index='hour', columns='day_name', values='count', fill_value=0)
heatmap_pivot = heatmap_pivot.reindex(columns=days_ordered, fill_value=0)
print(f"Heatmap shape: {heatmap_pivot.shape}")
heatmap_pivot.head()
Heatmap shape: (24, 7)
Out[21]:
| day_name | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday |
|---|---|---|---|---|---|---|---|
| hour | |||||||
| 0 | 904.0 | 734.0 | 872.0 | 916.0 | 1544.0 | 2266.0 | 2402.0 |
| 1 | 562.0 | 404.0 | 474.0 | 507.0 | 1028.0 | 1980.0 | 1945.0 |
| 2 | 305.0 | 289.0 | 275.0 | 254.0 | 613.0 | 1288.0 | 1331.0 |
| 3 | 178.0 | 196.0 | 143.0 | 185.0 | 272.0 | 586.0 | 618.0 |
| 4 | 168.0 | 178.0 | 143.0 | 156.0 | 176.0 | 235.0 | 263.0 |
FIG 3: Heatmap¶
In [22]:
fig, ax = plt.subplots(figsize=(14, 6))
sns.heatmap(heatmap_pivot, cmap='YlOrRd', annot=False, cbar_kws={'label': 'Trips'}, linewidths=0.5, ax=ax)
ax.set_title('FIG 3: Day × Hour Heatmap', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Day', fontsize=12, fontweight='bold')
ax.set_ylabel('Hour', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(output_dir / 'fig3_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved fig3_heatmap.png")
✓ Saved fig3_heatmap.png
Finding: Weekend cooling (Sat/Sun volumes run 30-40% below weekdays)
3. Demographics¶
In [23]:
# Top 10 stations
top_stations = trips.groupby('Start Station Name').size().nlargest(10).index
demo_data = trips[trips['Start Station Name'].isin(top_stations)].groupby(['Start Station Name', 'Rider Type']).size().unstack(fill_value=0)
demo_data
Out[23]:
| Rider Type | CASUAL | MEMBER |
|---|---|---|
| Start Station Name | ||
| Allequippa St & Darragh St | 137 | 16607 |
| Atwood St & Bates St | 303 | 33692 |
| Boulevard of the Allies & Parkview Ave | 310 | 29228 |
| Coltart Ave & Forbes Ave | 407 | 26554 |
| Fifth Ave & S Bouquet St | 413 | 24998 |
| N Dithridge St & Centre Ave | 374 | 32472 |
| O'Hara St & University Place | 287 | 27018 |
| S Bouquet Ave & Sennott St | 630 | 56314 |
| Schenley Dr & Schenley Dr Ext | 915 | 18722 |
| Zulema St & Coltart Ave | 277 | 25803 |
In [24]:
# Overall rider type
rider_totals = trips['Rider Type'].value_counts()
rider_pct = trips['Rider Type'].value_counts(normalize=True) * 100
for r in rider_totals.index:
print(f"{r}: {rider_totals[r]:,} ({rider_pct[r]:.1f}%)")
MEMBER: 517,200 (92.9%)
CASUAL: 39,315 (7.1%)
FIG 4: Rider Type¶
In [25]:
df_demo = pd.DataFrame({'Type': rider_totals.index, 'Trips': rider_totals.values, 'Pct': rider_pct.values})
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(df_demo['Type'], df_demo['Trips'], color=['#2B4CFF', '#FF9500'], edgecolor='black', linewidth=2, width=0.6)  # MEMBER blue, CASUAL orange to match FIG 5
ax.set_title('FIG 4: Rider Type', fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Trips', fontsize=12, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
for i, (_, row) in enumerate(df_demo.iterrows()):
ax.text(i, row['Trips'] + 5000, f"{int(row['Trips']):,}\n({row['Pct']:.1f}%)", ha='center', va='bottom', fontsize=11, fontweight='bold')
plt.tight_layout()
plt.savefig(output_dir / 'fig4_demographics.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved fig4_demographics.png")
✓ Saved fig4_demographics.png
Finding: 92.9% MEMBER (utilitarian system)
In [26]:
df_stations = demo_data.reset_index()
df_stations['Total'] = df_stations['CASUAL'] + df_stations['MEMBER']
df_stations = df_stations.sort_values('Total', ascending=True)
df_stations['Short'] = df_stations['Start Station Name'].apply(lambda x: x[:30]+'...' if len(x)>30 else x)
fig, ax = plt.subplots(figsize=(12, 8))
x = np.arange(len(df_stations))
width = 0.35
ax.barh(x - width/2, df_stations['CASUAL'], width, label='Casual', color='#FF9500', edgecolor='black')
ax.barh(x + width/2, df_stations['MEMBER'], width, label='Member', color='#2B4CFF', edgecolor='black')
ax.set_title('FIG 5: Top 10 Stations by Rider Type', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Trips', fontsize=12, fontweight='bold')
ax.set_yticks(x)
ax.set_yticklabels(df_stations['Short'], fontsize=9)
ax.legend(loc='lower right', fontsize=11)
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.savefig(output_dir / 'fig5_top_stations.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved fig5_top_stations.png")
✓ Saved fig5_top_stations.png
Finding: Member share is 98%+ at nine of the ten top stations; Schenley Dr & Schenley Dr Ext is the lone exception at ~95%
4. Station Behavioral Profiling¶
In [27]:
# Station × archetype percentages
station_totals = trips_clean.groupby('Start Station Name').size().reset_index(name='total_trips')
station_arch = trips_clean.groupby(['Start Station Name', 'archetype_label']).size().reset_index(name='arch_trips')
station_arch_pct = station_arch.merge(station_totals, on='Start Station Name')
station_arch_pct['pct'] = (station_arch_pct['arch_trips'] / station_arch_pct['total_trips']) * 100
station_arch_pct = station_arch_pct[station_arch_pct['total_trips'] >= 50]
station_arch_pct.head(15)
Out[27]:
| | Start Station Name | archetype_label | arch_trips | total_trips | pct |
|---|---|---|---|---|---|
| 0 | 10th St & Penn Ave | Commuter | 2732 | 6659 | 41.027181 |
| 1 | 10th St & Penn Ave | Errand | 1541 | 6659 | 23.141613 |
| 2 | 10th St & Penn Ave | Last-Mile | 1682 | 6659 | 25.259048 |
| 3 | 10th St & Penn Ave | Leisure | 704 | 6659 | 10.572158 |
| 4 | 17th St & Penn Ave | Commuter | 2271 | 5567 | 40.793964 |
| 5 | 17th St & Penn Ave | Errand | 1795 | 5567 | 32.243578 |
| 6 | 17th St & Penn Ave | Last-Mile | 951 | 5567 | 17.082809 |
| 7 | 17th St & Penn Ave | Leisure | 550 | 5567 | 9.879648 |
| 8 | 21st St & Penn Ave | Commuter | 1896 | 6095 | 31.107465 |
| 9 | 21st St & Penn Ave | Errand | 2120 | 6095 | 34.782609 |
| 10 | 21st St & Penn Ave | Last-Mile | 1492 | 6095 | 24.479081 |
| 11 | 21st St & Penn Ave | Leisure | 587 | 6095 | 9.630845 |
| 12 | 42nd St & Butler St | Commuter | 1121 | 4419 | 25.367730 |
| 13 | 42nd St & Butler St | Errand | 2139 | 4419 | 48.404616 |
| 14 | 42nd St & Butler St | Last-Mile | 783 | 4419 | 17.718941 |
In [28]:
# Top 3 per archetype
station_archetype_top = {}
for archetype in ['Commuter', 'Last-Mile', 'Errand', 'Leisure']:
top_3 = station_arch_pct[station_arch_pct['archetype_label'] == archetype].nlargest(3, 'pct')
station_archetype_top[archetype] = {
'stations': top_3['Start Station Name'].tolist(),
'percentages': top_3['pct'].round(1).tolist(),
'trip_counts': top_3['arch_trips'].astype(int).tolist(),
'total_trips': top_3['total_trips'].astype(int).tolist()
}
print(f"\n{archetype}:")
for i, station in enumerate(top_3['Start Station Name']):
print(f" {i+1}. {station}: {top_3.iloc[i]['pct']:.1f}%")
Commuter:
  1. Schenley Dr & Schenley Dr Ext: 64.8%
  2. Forbes Ave & Schenley Dr: 63.3%
  3. Forbes Ave at TCS Hall (CMU Campus): 61.2%

Last-Mile:
  1. Boulevard of the Allies & Parkview Ave: 46.9%
  2. S Millvale Ave & Centre Ave: 43.4%
  3. Allequippa St & Darragh St: 42.0%

Errand:
  1. Wilkinsburg Park & Ride: 68.4%
  2. Second Ave & Tecumseh St: 61.4%
  3. Shady Ave & Ellsworth Ave: 56.5%

Leisure:
  1. South Side Trail & S 4th St: 19.0%
  2. Liberty Ave & Stanwix St: 18.9%
  3. W Ohio St & Brighton Rd: 18.3%
In [29]:
df_arch = pd.DataFrame(archetype_labels)
df_arch['pct'] = (df_arch['count'] / df_arch['count'].sum()) * 100
df_arch = df_arch.sort_values('count', ascending=True)
fig, ax = plt.subplots(figsize=(10, 5))
colors = ['#2B4CFF', '#34C759', '#FF9500', '#FF2B8C']
ax.barh(df_arch['label'], df_arch['count'], color=colors, edgecolor='black', linewidth=1.5)
ax.set_title('FIG 6: Trip Archetypes (K-Means)', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Trips', fontsize=12, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
for i, (_, row) in enumerate(df_arch.iterrows()):
ax.text(row['count'] + 5000, i, f"{row['pct']:.1f}%", va='center', fontsize=11, fontweight='bold')
plt.tight_layout()
plt.savefig(output_dir / 'fig6_archetypes.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved fig6_archetypes.png")
✓ Saved fig6_archetypes.png
Finding: Commuter (47.9%), Last-Mile (32.8%), Errand (15.7%), Leisure (3.6%)
FIG 6b: Archetype Comparison¶
In [30]:
fig, ax = plt.subplots(figsize=(10, 6))
# Use the same data as before
colors = ['#2B4CFF', '#34C759', '#FF9500', '#FF2B8C']
bars = ax.bar(df_arch['label'], df_arch['count'], color=colors, edgecolor='black', linewidth=2, width=0.6)
ax.set_title('FIG 6b: Trip Archetype Distribution', fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Trip Archetype', fontsize=12, fontweight='bold')
ax.set_ylabel('Number of Trips', fontsize=12, fontweight='bold')
ax.grid(axis='y', alpha=0.3, linestyle='--')
# Add value labels on top of bars
for i, (_, row) in enumerate(df_arch.iterrows()):
ax.text(i, row['count'] + 5000, f"{row['count']:,}\n({row['pct']:.1f}%)",
ha='center', va='bottom', fontsize=10, fontweight='bold')
plt.tight_layout()
plt.savefig(output_dir / 'fig6b_archetypes.png', dpi=150, bbox_inches='tight')  # distinct filename so FIG 6 is not overwritten
plt.show()
print("✓ Saved fig6_archetypes.png")
✓ Saved fig6_archetypes.png
FIG 7: Station Behavioral Hotspots¶
In [31]:
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
fig.suptitle('FIG 7: Behavioral Hotspots', fontsize=18, fontweight='bold', y=0.995)
configs = [
('Commuter', '🚴 COMMUTER', '#2B4CFF', axes[0,0]),
('Last-Mile', '🔗 LAST-MILE', '#FF9500', axes[0,1]),
('Errand', '🛒 ERRAND', '#34C759', axes[1,0]),
('Leisure', '🎨 LEISURE', '#FF2B8C', axes[1,1])
]
for arch, title, color, ax in configs:
data = station_archetype_top[arch]
labels = [s[:35]+'...' if len(s)>35 else s for s in data['stations']]
ax.barh(labels, data['percentages'], color=color, edgecolor='black', linewidth=2)
ax.set_title(title, fontsize=13, fontweight='bold', pad=10)
ax.set_xlabel('% of Trips', fontsize=11, fontweight='bold')
ax.set_xlim(0, 75)
ax.grid(axis='x', alpha=0.3)
ax.invert_yaxis()
for i, (pct, t_count, total) in enumerate(zip(data['percentages'], data['trip_counts'], data['total_trips'])):
ax.text(pct+1, i, f"{pct:.1f}% ({t_count:,}/{total:,})", va='center', fontsize=9, fontweight='bold')
plt.tight_layout()
plt.savefig(output_dir / 'fig7_station_archetypes.png', dpi=150, bbox_inches='tight')
plt.show()
print("✓ Saved fig7_station_archetypes.png")
✓ Saved fig7_station_archetypes.png
Finding: Schenley Dr (64.8% commuter), Wilkinsburg P&R (68.4% errand)
5. Geographic Visualization¶
Interactive map showing station locations and trip volumes.
In [32]:
# Load bus stop data
bus_stops = pd.read_csv('dataset/PRT_Bus_Stop_Usage_Unweighted_-5413281589865626035.csv', encoding='utf-8-sig')
# Clean duplicates as in ETL
bus_stops = bus_stops.drop_duplicates(subset='stop_id', keep='first')
print(f"Loaded {len(bus_stops)} unique bus stops")
# Calculate trip volumes per bike station
station_volumes = trips.groupby('Start Station Name').size().reset_index(name='total_trips')
station_data = stations.merge(station_volumes, left_on='Name', right_on='Start Station Name', how='left')
station_data['total_trips'] = station_data['total_trips'].fillna(0).astype(int)
print(f"Bike stations with trip data: {len(station_data[station_data['total_trips'] > 0])}")
station_data[station_data['total_trips'] > 0].head()
Loaded 7075 unique bus stops
Bike stations with trip data: 60
Out[32]:
| | Id | Name | Total Docks | Latitude | Longitude | Start Station Name | total_trips |
|---|---|---|---|---|---|---|---|
| 0 | 1 | Pierce St & Summerlea St | 19 | 40.456507 | -79.932331 | Pierce St & Summerlea St | 4766 |
| 1 | 2 | Eliza Furnace Trail & Swineburne St | 15 | 40.425800 | -79.953400 | Eliza Furnace Trail & Swineburne St | 4516 |
| 2 | 3 | Centre Ave & Addison St | 15 | 40.444600 | -79.978300 | Centre Ave & Addison St | 1238 |
| 3 | 4 | Burns White Center at 3 Crossings | 15 | 40.456400 | -79.980000 | Burns White Center at 3 Crossings | 5077 |
| 4 | 5 | Allegheny Station | 19 | 40.448301 | -80.018007 | Allegheny Station | 2849 |
In [33]:
# Create interactive Folium map
pgh_center = [40.4406, -79.9959]
m = folium.Map(location=pgh_center, zoom_start=12, tiles='CartoDB positron')  # zoom 12 gives a system-wide view
# Reuse the haversine helper from Section 2 instead of redefining it
calculate_distance = haversine_distance
# Get active bike stations with coordinates
active_bike_stations = station_data[station_data['total_trips'] > 0][['Latitude', 'Longitude']].values
# Check which bus stops are within 400m of any bike station
def is_near_bike_station(bus_lat, bus_lon, threshold=400):
for bike_lat, bike_lon in active_bike_stations:
distance = calculate_distance(bus_lat, bus_lon, bike_lat, bike_lon)
if distance <= threshold:
return True
return False
# Count integrated bus stops
integrated_count = 0
total_bus_stops = 0
# Add bike stations as green circle markers (CircleMarkers approximate the
# green station style of the interactive dashboard)
for _, row in station_data.iterrows():
    if row['total_trips'] > 0:
        folium.CircleMarker(
            location=[row['Latitude'], row['Longitude']],
            radius=6,
            popup=f"<b>🚴 {row['Name']}</b><br>{row['total_trips']:,} trips",
            tooltip=row['Name'],
            color='#000000',      # black border
            fill=True,
            fillColor='#00B884',  # dashboard green
            fillOpacity=1.0,
            weight=2
        ).add_to(m)
# Add bus stops (Red if within 400m, Gray otherwise)
for _, row in bus_stops.iterrows():
if pd.notna(row['latitude']) and pd.notna(row['longitude']):
total_bus_stops += 1
near_bike = is_near_bike_station(row['latitude'], row['longitude'])
if near_bike:
integrated_count += 1
color = '#FF2B4C' # Red - Integrated
fill_color = '#FF2B4C'
opacity = 0.8
fill_opacity = 0.8
label = 'Integrated (within 400m)'
radius = 2 # Small dot
else:
color = '#999999' # Gray - Not integrated
fill_color = '#999999'
opacity = 0.5
fill_opacity = 0.5
label = 'Not integrated'
radius = 2 # Small dot
folium.CircleMarker(
location=[row['latitude'], row['longitude']],
radius=radius,
popup=f"<b>🚌 {row['stop_name']}</b><br>Bus Stop<br><i>{label}</i>",
tooltip=row['stop_name'],
color=color,
fill=True,
fillColor=fill_color,
fillOpacity=fill_opacity,
weight=0
).add_to(m)
# Add legend
legend_html = f'''
<div style="position: fixed; bottom: 50px; right: 50px; width: 250px; background-color: white;
border:2px solid black; z-index:9999; font-size:12px; padding: 15px; font-family: monospace;">
<p style="font-weight:bold; border-bottom:1px solid #ccc; margin-top:0;">MAP KEY</p>
<p><span style="color:#00B884; font-size:14px;">●</span> POGOH Station</p>
<p><span style="color:#FF2B4C; font-size:14px;">●</span> Bus Stop (Nearby POGOH)</p>
<p><span style="color:#999999; font-size:14px;">●</span> Bus Stop (Not Nearby)</p>
<hr>
<p style="font-size:11px; margin-bottom:0;">
<b>System Integration:</b><br>
{integrated_count:,} / {total_bus_stops:,} stops<br>
({integrated_count/total_bus_stops*100:.1f}% within 400m)
</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))
# Save and display
m.save(output_dir / 'fig8_station_map.html')
print(f"✓ Saved fig8_station_map.html")
print(f" • {len(station_data[station_data['total_trips'] > 0])} bike stations")
print(f" • {total_bus_stops:,} bus stops")
print(f" • {integrated_count:,} integrated bus stops (within 400m)")
print(f" • Integration rate: {integrated_count/total_bus_stops*100:.1f}%")
# Display in notebook
m
✓ Saved fig8_station_map.html
  • 60 bike stations
  • 7,075 bus stops
  • 817 integrated bus stops (within 400m)
  • Integration rate: 11.5%
Out[33]:
(Interactive Folium map renders here when the notebook is trusted: File → Trust Notebook.)
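A note on the 400 m integration query: the pairwise Python loop above scales as O(bus stops × bike stations). For a larger network, scikit-learn's BallTree with the haversine metric (a sketch reusing the arrays built above; the metric works in radians, hence r = 400 / Earth radius) answers the same radius query vectorized:

from sklearn.neighbors import BallTree

EARTH_R = 6_371_000  # meters
tree = BallTree(np.radians(active_bike_stations), metric='haversine')
bus_coords = bus_stops[['latitude', 'longitude']].dropna().values
near = tree.query_radius(np.radians(bus_coords), r=400 / EARTH_R, count_only=True)
print(f"Integrated stops (vectorized): {(near > 0).sum():,} / {len(bus_coords):,}")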