"""
km_cleanup.py
Cleans up Kinder_Morgan_Network_Definition_v1.csv:
  - Drops rows where B_Call_Sign is not a valid FCC call sign (e.g. "NEW")
  - Flags zero-distance paths
  - Renumbers path_num sequentially after drops and rebuilds Path_ID from the A/B call signs
  - Saves cleaned file (overwrites v1)
"""
import re, os
import pandas as pd

# Path of the network definition file; it is read here and written back to
# the same location at the end of the script.
CSV = r"D:\KinderMorgan\Kinder_Morgan_Network_Definition_v1.csv"

# dtype=str loads every column as text.
df = pd.read_csv(filepath_or_buffer=CSV, dtype=str)
print(f"Loaded: {len(df)} rows")

# ── Drop rows where B_Call_Sign is not a valid FCC call sign ─────────────────
# Accepted shape: a leading K or W followed by 2-8 uppercase letters or digits.
valid_cs = re.compile(r'^[KW][A-Z0-9]{2,8}$')


def is_valid_cs(cs):
    """Return True when *cs* is a string that matches the call-sign pattern.

    Leading and trailing whitespace is ignored before matching. Any
    non-string value (for example None or a float NaN) yields False.
    """
    if isinstance(cs, str):
        return valid_cs.match(cs.strip()) is not None
    return False

# Per-row verdict on the B-end call sign, and its inverse for the rows to drop.
b_is_valid = df["B_Call_Sign"].apply(is_valid_cs)
bad_b = ~b_is_valid
if bad_b.any():
    # Show what is about to be removed before removing it.
    rejected = df.loc[bad_b, ["path_num", "A_Call_Sign", "B_Call_Sign"]]
    print(f"\nDropping {bad_b.sum()} rows with invalid B_Call_Sign:")
    print(rejected.to_string())
    df = df.loc[b_is_valid].copy()

# ── Flag zero-distance paths (don't drop -- just report) ─────────────────────
# Parse the lengths into a separate Series instead of assigning them back to
# df: df is written over the source file below, and storing the coerced
# values would blank every length that fails to parse and reformat the rest.
path_len_miles = pd.to_numeric(df["Path_Length_Miles"], errors="coerce")

# Values that were present but could not be parsed are reported, not erased.
unparsed = path_len_miles.isna() & df["Path_Length_Miles"].notna()
if unparsed.any():
    print(f"\nNon-numeric Path_Length_Miles ({unparsed.sum()}) -- left unchanged:")
    print(df.loc[unparsed, ["path_num", "A_Call_Sign", "B_Call_Sign",
                            "Path_Length_Miles"]].to_string())

zero_dist = df[path_len_miles == 0.0]
if not zero_dist.empty:
    print(f"\nZero-distance paths ({len(zero_dist)}) -- same site or bad coordinates:")
    print(zero_dist[["path_num","A_Call_Sign","B_Call_Sign",
                      "A_Latitude","A_Longitude","B_Latitude","B_Longitude"]].to_string())

# ── Renumber ──────────────────────────────────────────────────────────────────
df = df.reset_index(drop=True)
df["path_num"] = [f"KMI_{n:04d}" for n in range(1, len(df) + 1)]
df["Path_ID"]  = df["A_Call_Sign"] + "_" + df["B_Call_Sign"]

# Write to a sibling temp file and then swap it into place, so a write that
# fails part-way cannot leave the only copy of the data truncated.
tmp_csv = CSV + ".tmp"
df.to_csv(tmp_csv, index=False)
os.replace(tmp_csv, CSV)
print(f"\nSaved: {len(df)} paths -> {CSV}")

print(f"\nState breakdown:")
print(df["a_state"].value_counts().to_string())
print(f"\nPath length distribution:")
# Summarise the parsed lengths; the column in df itself is still text.
print(path_len_miles.describe())
