"""
km_diagnose2.py - figure out why HD usids don't match EN usids
"""
import csv
import os

import pandas as pd

ULS_DIR = r"D:\FCC_ULS"

# ── Raw peek at HD.dat ────────────────────────────────────────────────────────
# Print the first five records as they sit on disk, before pandas parses
# anything, so the column positions used further down can be checked by eye.
print("=== HD.dat first 5 rows raw ===")
with open(os.path.join(ULS_DIR, "HD.dat"), "r", encoding="latin-1") as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        fields = line.rstrip("\n").split("|")
        # A blank or truncated record has fewer than 7 fields. Pad a display
        # copy with empty strings so the peek reports the row instead of
        # raising IndexError; the field count printed is the real one.
        padded = fields + [""] * max(0, 7 - len(fields))
        print(f"  Row {i}: {len(fields)} fields | col[0]='{padded[0]}' col[1]='{padded[1]}' col[4]='{padded[4]}' col[5]='{padded[5]}' col[6]='{padded[6]}'")

# ── Load HD (every column, all values as text) ────────────────────────────────
print("\n=== Loading HD.dat ===")
hd = pd.read_csv(
    os.path.join(ULS_DIR, "HD.dat"),
    sep="|",
    header=None,
    # Read everything as str so identifiers are compared as text, the same way
    # the EN identifiers are read.
    dtype=str,
    # NOTE: rows with too many fields are dropped silently by this option, so
    # the row count printed below can be lower than the file's line count.
    on_bad_lines="skip",
    encoding="latin-1",
    # Treat '"' as an ordinary character. With the default CSV quoting a stray
    # double quote inside a field makes the parser swallow the '|' delimiters
    # and newlines that follow it, merging or misaligning records.
    # (Assumes the ULS dumps are unquoted pipe-delimited text — confirm against
    # the FCC file-format notes.)
    quoting=csv.QUOTE_NONE,
)
print(f"  Rows: {len(hd)}  Cols: {len(hd.columns)}")
# Frequency tables for the columns the rest of the script relies on; if the
# values shown do not look like the labels say, the column offsets are wrong.
print(f"  Col[0] unique values (record types): {hd[0].value_counts().head(5).to_dict()}")
print(f"  Col[5] sample (should be license_status A/C/T): {hd[5].value_counts().head(8).to_dict()}")
print(f"  Col[6] sample (should be radio_service_code): {hd[6].value_counts().head(10).to_dict()}")

# ── Load EN and get KM usids ──────────────────────────────────────────────────
print("\n=== Loading EN.dat ===")
en = pd.read_csv(
    os.path.join(ULS_DIR, "EN.dat"),
    sep="|",
    header=None,
    dtype=str,
    # NOTE: rows with too many fields are dropped silently by this option.
    on_bad_lines="skip",
    encoding="latin-1",
    # Treat '"' as an ordinary character. Column 7 is free text, and with the
    # default CSV quoting a single unbalanced double quote in it makes the
    # parser swallow following '|' delimiters and newlines, corrupting or
    # losing the very rows this script is trying to match.
    # (Assumes the ULS dumps are unquoted pipe-delimited text — confirm against
    # the FCC file-format notes.)
    quoting=csv.QUOTE_NONE,
)
print(f"  Rows: {len(en)}  Cols: {len(en.columns)}")

# Normalised copy of column 7 (trimmed, lower-cased, missing -> "") used only
# for the case-insensitive substring filter below.
en_name_col = en[7].str.strip().str.lower().fillna("")
# regex=False: the needle is a literal phrase, not a pattern.
km_en = en[en_name_col.str.contains("kinder morgan", na=False, regex=False)]

# Distinct trimmed values of columns 1 and 4 for the matching rows; these two
# sets are what the later sections compare against HD.
km_usids = set(km_en[1].str.strip().dropna().unique())
km_callsigns = set(km_en[4].str.strip().dropna().unique())
print(f"  KM usids from EN: {len(km_usids)}")
print(f"  KM callsigns from EN: {len(km_callsigns)}")
# Values are strings, so the samples are in lexicographic (not numeric) order.
print(f"  Sample KM usids: {sorted(km_usids)[:10]}")
print(f"  Sample KM callsigns: {sorted(km_callsigns)[:10]}")

# ── Try matching HD by usid (col[1]) ─────────────────────────────────────────
# Distinct, whitespace-trimmed values of HD column 1 with missing entries
# removed; building the set takes care of de-duplication.
hd_usids = set(hd[1].str.strip().dropna())
hd_usid_sample = sorted(hd_usids)[:10]
print(f"\n  HD usid count: {len(hd_usids)}")
print(f"  Sample HD usids: {hd_usid_sample}")

# Identifiers present on both sides.
overlap_usid = hd_usids.intersection(km_usids)
print(f"  KM usids found in HD: {len(overlap_usid)}")

# ── Try matching HD by callsign (col[4]) ─────────────────────────────────────
# Same comparison as the usid check, but keyed on column 4 of both files.
hd_callsign_col = hd[4].str.strip()
hd_callsigns = set(hd_callsign_col.dropna())
overlap_cs = km_callsigns.intersection(hd_callsigns)
print(f"  KM callsigns found in HD: {len(overlap_cs)}")
if overlap_cs:
    callsign_sample = sorted(overlap_cs)[:10]
    print(f"  Sample matches: {callsign_sample}")
    # Pull the HD rows behind those matches and show only the columns that
    # were inspected earlier in the script.
    matches = hd.loc[hd_callsign_col.isin(km_callsigns)]
    shown_cols = [0, 1, 4, 5, 6]
    print("\n  HD rows for KM callsigns:")
    print(matches[shown_cols].head(10).to_string())

# ── Check if EN usids exist anywhere in HD ───────────────────────────────────
# Spot-check the first 20 KM identifiers (string sort order) one at a time
# against the HD identifier set.
print("\n=== Checking EN usids vs HD usids ===")
sample_km_usids = sorted(km_usids)[:20]
for candidate in sample_km_usids:
    verdict = "IN HD" if candidate in hd_usids else "NOT IN HD"
    print(f"  EN usid {candidate}: {verdict}")
