"""
km_diagnose.py
Run this to figure out why HD.dat returns 0 KM records.
Place in D:\KinderMorgan\ and run from geo_env.
"""
import os
import pandas as pd

ULS_DIR = r"D:\FCC_ULS"

# ── Load EN.dat -- get KM usids AND call signs ────────────────────────────────
print("Loading EN.dat...")
en_cols = [
    "record_type","unique_system_identifier","uls_file_number","ebf_number",
    "call_sign","entity_type","licensee_id","entity_name","first_name","mi",
    "last_name","suffix","phone","fax","email","street_address","city",
    "state","zip_code","po_box","attention_line","sgin","fcc_registration_number",
    "applicant_type_code","applicant_type_code_other","status_code","status_date",
    "ghz_licensed","linked_unique_system_identifier","linked_call_sign"
]
en = pd.read_csv(os.path.join(ULS_DIR, "EN.dat"), sep="|", header=None,
                 names=en_cols[:30], dtype=str, on_bad_lines="skip", encoding="latin-1")
en["entity_name_lc"] = en["entity_name"].str.strip().str.lower().fillna("")

km_en = en[en["entity_name_lc"].str.contains("kinder morgan", na=False)].copy()
km_usids     = set(km_en["unique_system_identifier"].dropna().str.strip().unique())
km_callsigns = set(km_en["call_sign"].dropna().str.strip().unique())

print(f"  KM usids from EN: {len(km_usids)}")
print(f"  KM call signs from EN: {len(km_callsigns)}")
print(f"  Sample usids: {sorted(list(km_usids))[:10]}")
print(f"  Sample call signs: {sorted(list(km_callsigns))[:10]}")

# ── Load HD.dat -- check what's there ────────────────────────────────────────
print("\nLoading HD.dat...")
hd_cols = [
    "record_type","unique_system_identifier","uls_file_number","ebf_number",
    "call_sign","license_status","radio_service_code","grant_date","expired_date",
    "cancellation_date","eligibility_rule_num","reserved","alien","alien_government",
    "alien_corporation","alien_officer","alien_control","revoked","convicted",
    "adjudged","reserved2","common_carrier","non_common_carrier","private_comm",
    "fixed","mobile","radiolocation","satellite","developmental_or_sta","interconnected_voip",
    "certifier_first_name","certifier_mi","certifier_last_name","certifier_suffix",
    "certifier_title","female","minority","hispanic","native_american","veteran",
    "licensee_name_change","whitespace_ind","additional_cert_choice","additional_cert_answer",
    "discontinuation_ind","regulatory_status","conflict_with_non_usa","nontable_emission",
    "broadcast_ind","section_22_ship","call_sign_change"
]
hd = pd.read_csv(os.path.join(ULS_DIR, "HD.dat"), sep="|", header=None,
                 names=hd_cols[:50], dtype=str, on_bad_lines="skip", encoding="latin-1")
hd["unique_system_identifier"] = hd["unique_system_identifier"].str.strip()
hd["call_sign"] = hd["call_sign"].str.strip()

print(f"  Total HD rows: {len(hd)}")
print(f"  Sample HD usids: {sorted(hd['unique_system_identifier'].dropna().unique())[:5]}")
print(f"  Sample HD call signs: {sorted(hd['call_sign'].dropna().unique())[:5]}")

# ── Try matching by usid ──────────────────────────────────────────────────────
hd_by_usid = hd[hd["unique_system_identifier"].isin(km_usids)]
print(f"\n  HD rows matching KM usids: {len(hd_by_usid)}")

# ── Try matching by call sign ─────────────────────────────────────────────────
hd_by_cs = hd[hd["call_sign"].isin(km_callsigns)]
print(f"  HD rows matching KM call signs: {len(hd_by_cs)}")

if not hd_by_cs.empty:
    print("\n  Service code breakdown (by call sign match):")
    print(hd_by_cs["radio_service_code"].value_counts().to_string())
    print("\n  License status breakdown:")
    print(hd_by_cs["license_status"].value_counts().to_string())
    print("\n  Sample rows:")
    print(hd_by_cs[["call_sign","unique_system_identifier","radio_service_code",
                     "license_status"]].head(20).to_string())

# ── Check for usid type mismatch (int vs string) ──────────────────────────────
print("\n  Checking usid dtypes...")
sample_en_usid = sorted(list(km_usids))[0]
sample_hd_usid = hd["unique_system_identifier"].dropna().iloc[0]
print(f"  EN sample usid: '{sample_en_usid}' (type: {type(sample_en_usid).__name__})")
print(f"  HD sample usid: '{sample_hd_usid}' (type: {type(sample_hd_usid).__name__})")

# ── Check if any KM call signs appear in HD at all ───────────────────────────
sample_cs = sorted(list(km_callsigns))[:5]
for cs in sample_cs:
    rows = hd[hd["call_sign"] == cs]
    print(f"  HD lookup for '{cs}': {len(rows)} rows")
    if not rows.empty:
        print(f"    -> service={rows.iloc[0]['radio_service_code']} status={rows.iloc[0]['license_status']} usid={rows.iloc[0]['unique_system_identifier']}")

# ── Check what microwave service codes look like in HD generally ──────────────
print("\n  Top 20 service codes in all of HD.dat:")
print(hd["radio_service_code"].value_counts().head(20).to_string())