r"""
extract_oncor_txu_uls.py
------------------------
Extracts all microwave license data for Oncor and TXU from FCC ULS bulk data files.

Input:  D:\FCC_ULS\  (pipe-delimited .dat files)
Output: D:\oncor\ULS_output\  (full extract + 5925-7125 MHz filtered extract)

DAT files used:
  EN.dat  - Entity names (licensee identification)
  HD.dat  - License header (call sign, radio service, license status, dates)
  FR.dat  - Frequency assignments
  PA.dat  - Path data (azimuths, distances, passive repeaters)
  LO.dat  - Location data (site coordinates, elevation, structure type)
  AN.dat  - Antenna data (make, model, gain, polarization, HAAT)
  MW.dat  - Microwave-specific data (type of operation, station class)

Join key: unique_system_identifier (col index 1 in every DAT file)
"""

import os
import sys
import pandas as pd
from pathlib import Path

# ── Configuration ─────────────────────────────────────────────────────────────

ULS_DIR   = Path(r"D:\FCC_ULS")
OUTPUT_DIR = Path(r"D:\oncor\ULS_output")
OUTPUT_FILE = OUTPUT_DIR / "Oncor_TXU_Microwave_Licenses.csv"

# Entity name search terms (case-insensitive, partial match)
ENTITY_SEARCH_TERMS = ["oncor", "txu"]

# Microwave radio service codes in ULS
FREQ_MIN_MHZ = 5925.0    # Lower bound of frequency filter (MHz)
FREQ_MAX_MHZ = 7125.0    # Upper bound of frequency filter (MHz)

MICROWAVE_SERVICES = {
    "MG",   # Microwave Industrial/Business Pool
    "MW",   # Microwave Common Carrier
    "CF",   # Common Carrier Fixed Point-to-Point Microwave
    "TP",   # 24 GHz Service
    "TI",   # 39 GHz Service
    "WU",   # 70/80/90 GHz
    "YD",   # Broadcast Auxiliary Remote Pickup
    "GB",   # Paging and Radiotelephone
}

# ── Column definitions ─────────────────────────────────────────────────────────
# Positions are 1-based per FCC spec; converted to 0-based indices below.

EN_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "entity_type",
    6:  "licensee_id",
    7:  "entity_name",
    8:  "first_name",
    9:  "mi",
    10: "last_name",
    11: "suffix",
    12: "phone",
    13: "fax",
    14: "email",
    15: "street_address",
    16: "city",
    17: "state",
    18: "zip_code",
    19: "po_box",
    20: "attention_line",
    21: "sgin",
    22: "frn",
    23: "applicant_type_code",
    24: "applicant_type_other",
    25: "status_code",
    26: "status_date",
}

HD_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "license_status",
    6:  "radio_service_code",
    7:  "grant_date",
    8:  "expired_date",
    9:  "cancellation_date",
    10: "eligibility_rule_num",
    11: "applicant_type_code_reserved",
    12: "alien",
    13: "alien_government",
    14: "alien_corporation",
    15: "alien_officer",
    16: "alien_control",
    17: "revoked",
    18: "convicted",
    19: "adjudged",
    20: "involved_reserved",
    21: "common_carrier",
    22: "non_common_carrier",
    23: "private_comm",
    24: "fixed",
    25: "mobile",
    26: "radiolocation",
    27: "satellite",
    28: "developmental_or_sta",
    29: "interconnected",
    30: "certifier_first_name",
    31: "certifier_mi",
    32: "certifier_last_name",
    33: "certifier_suffix",
    34: "certifier_title",
    35: "gender",
    36: "african_american",
    37: "native_american",
    38: "hawaiian",
    39: "asian",
    40: "white",
    41: "hispanic",
    42: "effective_date",
    43: "last_action_date",
    44: "auction_id",
    45: "reg_stat_broad_serv",
    46: "band_manager",
    47: "type_serv_broad_serv",
    48: "alien_ruling",
    49: "licensee_name_change",
}

FR_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "frequency_action_performed",
    6:  "location_number",
    7:  "antenna_number",
    8:  "class_station_code",
    9:  "op_altitude_code",
    10: "frequency_assigned",
    11: "frequency_upper_band",
    12: "frequency_offset",
    13: "frequency_channel_block",
    14: "emission_code",
    15: "digital_modulation_code",
    16: "frequency_number",
    17: "status_code",
    18: "status_date",
}

PA_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "location_action_performed",
    6:  "location_number",
    7:  "path_number",
    8:  "transmit_location_number",
    9:  "transmit_antenna_number",
    10: "receiver_location_number",
    11: "receiver_antenna_number",
    12: "mas_dms_ind",
    13: "path_azimuth",
    14: "elevation_angle",
    15: "receiver_height_above_ground",
    16: "receiver_call_sign",
    17: "path_loss",
    18: "receiver_threshold_criteria",
    19: "receiver_threshold_value",
    20: "passive_receiver_indicator",
    21: "passive_coordinates_text",
    22: "status_code",
    23: "status_date",
    24: "path_loss_approach",
    25: "path_number_system_id",
}

LO_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "location_action_performed",
    6:  "location_type_code",
    7:  "location_description",
    8:  "location_number",
    9:  "site_status_code",
    10: "corresponding_fixed_location",
    11: "location_address",
    12: "location_city",
    13: "location_county",
    14: "location_state",
    15: "radius_of_operation",
    16: "area_of_operation_code",
    17: "clearance_indicator",
    18: "ground_elevation",
    19: "lat_degrees",
    20: "lat_minutes",
    21: "lat_seconds",
    22: "lat_direction",
    23: "lon_degrees",
    24: "lon_minutes",
    25: "lon_seconds",
    26: "lon_direction",
    27: "max_latitudinal_deg",
    28: "max_longitudinal_deg",
    29: "nepa",
    30: "quiet_zone_notification_date",
    31: "tower_registration_number",
    32: "height_of_support_structure",
    33: "overall_height_above_ground",
    34: "structure_type",
    35: "airport_id",
    36: "location_name",
    37: "units_hand_held",
    38: "units_mobile",
    39: "units_temp_fixed",
    40: "units_aircraft",
    41: "units_itinerant",
    42: "status_code",
    43: "status_date",
}

AN_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "antenna_action_performed",
    6:  "location_number",
    7:  "antenna_number",
    8:  "licensee_antenna_number",
    9:  "antenna_type",
    10: "height_to_tip",
    11: "height_center_raat",
    12: "antenna_make",
    13: "antenna_model",
    14: "tilt",
    15: "polarization_code",
    16: "beamwidth",
    17: "gain",
    18: "azimuth",
    19: "height_above_avg_terrain",
    20: "diversity_height",
    21: "diversity_gain",
    22: "diversity_beam",
    23: "reflector",
    24: "reflector_height",
    25: "reflector_width",
    26: "reflector_angle",
    27: "reflector_rotation",
    28: "back_to_back_tx_dish_gain",
    29: "back_to_back_rx_dish_gain",
    30: "location_name",
    31: "passive_repeater_indicator",
    32: "back_to_back_passive_indicator",
    33: "status_code",
    34: "status_date",
}

MW_COLS = {
    0:  "record_type",
    1:  "unique_system_identifier",
    2:  "uls_file_number",
    3:  "ebf_number",
    4:  "call_sign",
    5:  "pack_indicator",
    6:  "pack_registration_number",
    7:  "pack_name",
    8:  "type_of_operation",
    9:  "smsa_code",
    10: "station_class",
    11: "cummulative_effect_major",
    12: "status_code",
    13: "status_date",
}

# ── Helpers ────────────────────────────────────────────────────────────────────

def load_dat(filepath: Path, col_map: dict, label: str) -> pd.DataFrame:
    """Load a pipe-delimited DAT file with named columns."""
    if not filepath.exists():
        print(f"  [WARN] {label} not found: {filepath}")
        return pd.DataFrame()

    print(f"  Loading {label} ({filepath.name})...")
    try:
        df = pd.read_csv(
            filepath,
            sep="|",
            header=None,
            dtype=str,
            encoding="latin-1",
            on_bad_lines="skip",
            low_memory=False,
        )
        # Rename only the columns that exist
        rename = {k: v for k, v in col_map.items() if k < len(df.columns)}
        df.rename(columns=rename, inplace=True)
        print(f"    → {len(df):,} rows, {len(df.columns)} columns")
        return df
    except Exception as e:
        print(f"  [ERROR] Failed to load {label}: {e}")
        return pd.DataFrame()


def decimal_degrees(deg, mins, secs, direction) -> float | None:
    """Convert DMS columns to decimal degrees."""
    try:
        dd = float(deg) + float(mins) / 60 + float(secs) / 3600
        if str(direction).strip().upper() in ("S", "W"):
            dd = -dd
        return round(dd, 6)
    except Exception:
        return None


# ── Main ───────────────────────────────────────────────────────────────────────

def main():
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"\n{'='*60}")
    print("  FCC ULS Extractor — Oncor / TXU Microwave Licenses")
    print(f"{'='*60}\n")

    # ── Step 1: Load EN.dat and find target licensees ──────────────────────────
    print("Step 1: Searching EN.dat for Oncor / TXU entities...")
    en = load_dat(ULS_DIR / "EN.dat", EN_COLS, "EN (Entity)")
    if en.empty:
        sys.exit("ERROR: EN.dat is required and could not be loaded.")

    pattern = "|".join(ENTITY_SEARCH_TERMS)
    mask = (
        en["entity_name"].str.contains(pattern, case=False, na=False) &
        ~en["entity_name"].str.contains("concord", case=False, na=False)
    )
    en_target = en[mask].copy()

    print(f"\n  Found {len(en_target):,} entity records matching Oncor/TXU:")
    for name, count in en_target["entity_name"].value_counts().head(30).items():
        print(f"    {count:4d}  {name}")

    target_ids = set(en_target["unique_system_identifier"].dropna().unique())
    print(f"\n  → {len(target_ids):,} unique system identifiers to extract\n")

    if not target_ids:
        sys.exit("ERROR: No matching entities found. Check entity name search terms.")

    # ── Step 2: Load HD.dat and filter to microwave services ──────────────────
    print("Step 2: Loading HD.dat (license headers)...")
    hd = load_dat(ULS_DIR / "HD.dat", HD_COLS, "HD (License Header)")

    hd_target = hd[
        hd["unique_system_identifier"].isin(target_ids) &
        hd["radio_service_code"].isin(MICROWAVE_SERVICES)
    ].copy()

    print(f"  → {len(hd_target):,} microwave license records for Oncor/TXU")
    if hd_target.empty:
        print("  [WARN] No microwave licenses found. Check radio service codes.")

    # Refine target IDs to only microwave licenses
    mw_ids = set(hd_target["unique_system_identifier"].dropna().unique())
    print(f"  → {len(mw_ids):,} unique microwave system identifiers\n")

    # ── Step 3: Load and filter remaining DAT files ────────────────────────────
    print("Step 3: Loading supplemental DAT files...")

    fr = load_dat(ULS_DIR / "FR.dat", FR_COLS, "FR (Frequencies)")
    pa = load_dat(ULS_DIR / "PA.dat", PA_COLS, "PA (Paths)")
    lo = load_dat(ULS_DIR / "LO.dat", LO_COLS, "LO (Locations)")
    an = load_dat(ULS_DIR / "AN.dat", AN_COLS, "AN (Antennas)")
    mw = load_dat(ULS_DIR / "MW.dat", MW_COLS, "MW (Microwave Detail)")

    def filter_ids(df, ids):
        if df.empty or "unique_system_identifier" not in df.columns:
            return df
        return df[df["unique_system_identifier"].isin(ids)].copy()

    fr_t = filter_ids(fr, mw_ids)
    pa_t = filter_ids(pa, mw_ids)
    lo_t = filter_ids(lo, mw_ids)
    an_t = filter_ids(an, mw_ids)
    mw_t = filter_ids(mw, mw_ids)

    print(f"\n  Filtered rows:")
    print(f"    FR (frequencies):  {len(fr_t):,}")
    print(f"    PA (paths):        {len(pa_t):,}")
    print(f"    LO (locations):    {len(lo_t):,}")
    print(f"    AN (antennas):     {len(an_t):,}")
    print(f"    MW (microwave):    {len(mw_t):,}\n")

    # ── Step 3b: Filter FR to 5925–7125 MHz band ────────────────────────────────
    print("Step 3b: Filtering frequencies to {:.0f}–{:.0f} MHz band...".format(FREQ_MIN_MHZ, FREQ_MAX_MHZ))
    if not fr_t.empty and "frequency_assigned" in fr_t.columns:
        fr_t["frequency_assigned_mhz"] = pd.to_numeric(fr_t["frequency_assigned"], errors="coerce")
        fr_band = fr_t[
            (fr_t["frequency_assigned_mhz"] >= FREQ_MIN_MHZ) &
            (fr_t["frequency_assigned_mhz"] <= FREQ_MAX_MHZ)
        ].copy()
        band_ids = set(fr_band["unique_system_identifier"].dropna().unique())
        print(f"  → {len(fr_band):,} frequency records in band")
        print(f"  → {len(band_ids):,} unique licenses have at least one frequency in band\n")
    else:
        fr_band = pd.DataFrame()
        band_ids = set()
        print("  [WARN] FR data unavailable for frequency filtering\n")

    # ── Step 4: Compute decimal lat/lon in LO ─────────────────────────────────
    print("Step 4: Computing decimal coordinates from LO.dat...")
    if not lo_t.empty:
        coord_cols = ["lat_degrees", "lat_minutes", "lat_seconds", "lat_direction",
                      "lon_degrees", "lon_minutes", "lon_seconds", "lon_direction"]
        if all(c in lo_t.columns for c in coord_cols):
            lo_t["latitude"] = lo_t.apply(
                lambda r: decimal_degrees(r["lat_degrees"], r["lat_minutes"],
                                          r["lat_seconds"], r["lat_direction"]), axis=1)
            lo_t["longitude"] = lo_t.apply(
                lambda r: decimal_degrees(r["lon_degrees"], r["lon_minutes"],
                                          r["lon_seconds"], r["lon_direction"]), axis=1)
            print(f"  → Coordinates computed for {lo_t['latitude'].notna().sum():,} sites\n")

    # ── Step 5: Assemble master output ────────────────────────────────────────
    print("Step 5: Joining tables into master output...")

    # Start with HD (one row per license)
    out = hd_target.copy()

    # Join entity info (licensee name, address)
    en_cols_keep = ["unique_system_identifier", "entity_name", "entity_type",
                    "street_address", "city", "state", "zip_code", "frn",
                    "applicant_type_code"]
    en_merge = en_target[
        [c for c in en_cols_keep if c in en_target.columns]
    ].drop_duplicates(subset=["unique_system_identifier"])
    out = out.merge(en_merge, on="unique_system_identifier", how="left", suffixes=("", "_en"))

    # Join MW detail (type of operation, station class)
    if not mw_t.empty:
        mw_cols_keep = ["unique_system_identifier", "type_of_operation", "station_class", "pack_name"]
        mw_merge = mw_t[[c for c in mw_cols_keep if c in mw_t.columns]].drop_duplicates(
            subset=["unique_system_identifier"])
        out = out.merge(mw_merge, on="unique_system_identifier", how="left")

    print(f"  → Master (HD + EN + MW): {len(out):,} rows\n")

    # ── Step 6: Write supplemental files ─────────────────────────────────────
    print("Step 6: Writing output files...")

    # Master license summary
    out.to_csv(OUTPUT_FILE, index=False)
    print(f"  ✓ Master CSV:      {OUTPUT_FILE}")

    # Frequencies
    if not fr_t.empty:
        fr_out = OUTPUT_DIR / "Oncor_TXU_Frequencies.csv"
        fr_t.to_csv(fr_out, index=False)
        print(f"  ✓ Frequencies:     {fr_out}")

    # Frequency-band filtered outputs (5925–7125 MHz)
    if not fr_band.empty:
        fr_band_out = OUTPUT_DIR / "Oncor_TXU_Frequencies_5925_7125MHz.csv"
        fr_band.to_csv(fr_band_out, index=False)
        print(f"  ✓ Freq 5925-7125 MHz: {fr_band_out}")

        # Band-filtered master licenses
        if band_ids:
            out_band = out[out["unique_system_identifier"].isin(band_ids)].copy()
            band_lic_out = OUTPUT_DIR / "Oncor_TXU_Licenses_5925_7125MHz.csv"
            out_band.to_csv(band_lic_out, index=False)
            print(f"  ✓ Licenses 5925-7125 MHz: {band_lic_out} ({len(out_band):,} rows)")

        # Band-filtered locations
        if not lo_t.empty and band_ids:
            lo_band_out = OUTPUT_DIR / "Oncor_TXU_Locations_5925_7125MHz.csv"
            lo_t[lo_t["unique_system_identifier"].isin(band_ids)].to_csv(lo_band_out, index=False)
            print(f"  ✓ Locations 5925-7125 MHz: {lo_band_out}")

        # Band-filtered paths
        if not pa_t.empty and band_ids:
            pa_band_out = OUTPUT_DIR / "Oncor_TXU_Paths_5925_7125MHz.csv"
            pa_t[pa_t["unique_system_identifier"].isin(band_ids)].to_csv(pa_band_out, index=False)
            print(f"  ✓ Paths 5925-7125 MHz:    {pa_band_out}")

        # Band-filtered antennas
        if not an_t.empty and band_ids:
            an_band_out = OUTPUT_DIR / "Oncor_TXU_Antennas_5925_7125MHz.csv"
            an_t[an_t["unique_system_identifier"].isin(band_ids)].to_csv(an_band_out, index=False)
            print(f"  ✓ Antennas 5925-7125 MHz: {an_band_out}")

    # Paths
    if not pa_t.empty:
        pa_out = OUTPUT_DIR / "Oncor_TXU_Paths.csv"
        pa_t.to_csv(pa_out, index=False)
        print(f"  ✓ Paths:           {pa_out}")

    # Locations (with decimal coords)
    if not lo_t.empty:
        lo_out = OUTPUT_DIR / "Oncor_TXU_Locations.csv"
        lo_t.to_csv(lo_out, index=False)
        print(f"  ✓ Locations:       {lo_out}")

    # Antennas
    if not an_t.empty:
        an_out = OUTPUT_DIR / "Oncor_TXU_Antennas.csv"
        an_t.to_csv(an_out, index=False)
        print(f"  ✓ Antennas:        {an_out}")

    # ── Step 7: Summary report ────────────────────────────────────────────────
    print(f"\n{'='*60}")
    print("  SUMMARY")
    print(f"{'='*60}")
    print(f"  Total microwave licenses:   {len(out):,}")
    if "radio_service_code" in out.columns:
        print(f"\n  By radio service:")
        for svc, cnt in out["radio_service_code"].value_counts().items():
            print(f"    {svc}:  {cnt:,}")
    if "license_status" in out.columns:
        print(f"\n  By license status:")
        for st, cnt in out["license_status"].value_counts().items():
            print(f"    {st}:  {cnt:,}")
    if "entity_name" in out.columns:
        print(f"\n  By entity name (top 20):")
        for name, cnt in out["entity_name"].value_counts().head(20).items():
            print(f"    {cnt:4d}  {name}")
    if "state" in out.columns:
        print(f"\n  By state:")
        for st, cnt in out["state"].value_counts().items():
            print(f"    {st}:  {cnt:,}")
    if band_ids:
        print(f"\n  Licenses in 5925-7125 MHz band: {len(band_ids):,}")
    print(f"\n  Output directory: {OUTPUT_DIR}")
    print(f"{'='*60}\n")


if __name__ == "__main__":
    main()
