"""
VEN Pipeline - Step 1  v3
=========================
Generates {pathid}_ven_1.csv files for each path ID.
Each file is saved to D:\\ATT\\complete\\{pathid}\\{pathid}_ven_1.csv
Summary log is saved to D:\\ATT\\complete\\ven_pipeline_summary_{timestamp}.csv
Failed paths are logged to D:\\ATT\\complete\\ven_pipeline_errors_{timestamp}.txt

Data hierarchy:
  1. Overture Maps   — primary building footprints + heights
  2. Overture Address theme — address fields (spatial join)
  3. KNN (200m radius) — height interpolation from neighbors
  4. Median fallback — guarantees zero null heights
  5. Floor rule (1m) — corrects sub-1m artifacts last

Requirements:
    pip install pandas geopandas shapely overturemaps lxml pyproj
                scikit-learn requests
"""

import os
import re
import sys
import math
import uuid
import logging
import threading
import subprocess
import json
import numpy as np
import pandas as pd
import geopandas as gpd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from shapely.geometry import shape, Point, Polygon
from shapely.ops import transform
from lxml import etree
import pyproj
from sklearn.neighbors import BallTree
from datetime import datetime, timedelta
import time

# ─── CONFIG ───────────────────────────────────────────────────────────────────
# Run-wide settings. Paths are Windows-specific raw strings.
# CSV whose first column lists the path IDs to process.
PATHIDS_CSV      = r"C:\Users\Public\pathids.csv"
# Folder holding one "{path_id}.kml" rectangle per path ID.
KML_DIR          = r"C:\Users\Public\rectangles"
COMPLETE_DIR     = r"D:\ATT\complete"     # Root output folder
# Stop-request flag file; its consumer is not in this part of the file.
STOP_FLAG        = r"D:\ATT\complete\ven_pipeline_stop.flag"
MIN_HEIGHT_M     = 1.0        # Floor rule: correct sub-1m artifacts only
KNN_RADIUS_M     = 200.0      # KNN search radius in metres
# Default only: main() overrides this from a "resume=<bool>" command-line argument.
RESUME           = False      # True = skip already-completed paths
PARALLEL_WORKERS = 12         # Number of concurrent path processors
# ──────────────────────────────────────────────────────────────────────────────

# Root logger configuration: INFO level, "timestamp  LEVEL  message" lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)s  %(message)s"
)
log = logging.getLogger(__name__)
# Path of the persistent per-run error log; log_error() is a no-op while None.
_error_log_path = None  # set in main() once COMPLETE_DIR is known
# Serialises appends to the error log across worker threads.
_error_lock = threading.Lock()


def log_error(path_id: str, error: str):
    """Append a timestamped FAILED record for *path_id* to the run's error log.

    Does nothing while ``_error_log_path`` is unset. The append is performed
    under ``_error_lock`` so concurrent callers do not interleave records.
    """
    target = _error_log_path
    if not target:
        return
    with _error_lock, open(target, "a", encoding="utf-8") as handle:
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        handle.write(f"{stamp}  FAILED  {path_id}\n{error}\n\n")


# Serialises appends to the summary CSV (see write_summary_log).
_summary_lock = threading.Lock()

# Output schema: the 44 columns, in order, of every {pathid}_ven_1.csv.
# Row builders start from {col: None} over this list, so columns that no
# source populates are written as empty.
OUTPUT_COLUMNS = [
    "gid", "str_uuid", "bld_uuid", "area_sqft", "address", "address_no",
    "street", "subaddress", "city", "stpostal", "zip", "state", "county",
    "ctfips", "add_conf", "latitude", "longitude", "maxheight", "meanheight",
    "maxobjecth", "lag", "hag", "meanelev", "meanslope", "maxslope",
    "modeslope", "ll_uuid", "lbcs_activ", "lbcs_act_1", "lbcs_funct",
    "lbcs_fun_1", "lbcs_struc", "lbcs_str_1", "lbcs_site", "lbcs_site_",
    "lbcs_owner", "lbcs_own_1", "sourcedate", "source", "stories",
    "grossarea", "volume", "largest", "geom"
]

# Two-letter postal abbreviation -> full name, for the 50 US states plus DC.
# Used to derive the "state" column from the abbreviation in "stpostal".
STATE_ABBREV = {
    'AL':'Alabama','AK':'Alaska','AZ':'Arizona','AR':'Arkansas','CA':'California',
    'CO':'Colorado','CT':'Connecticut','DE':'Delaware','FL':'Florida','GA':'Georgia',
    'HI':'Hawaii','ID':'Idaho','IL':'Illinois','IN':'Indiana','IA':'Iowa',
    'KS':'Kansas','KY':'Kentucky','LA':'Louisiana','ME':'Maine','MD':'Maryland',
    'MA':'Massachusetts','MI':'Michigan','MN':'Minnesota','MS':'Mississippi',
    'MO':'Missouri','MT':'Montana','NE':'Nebraska','NV':'Nevada','NH':'New Hampshire',
    'NJ':'New Jersey','NM':'New Mexico','NY':'New York','NC':'North Carolina',
    'ND':'North Dakota','OH':'Ohio','OK':'Oklahoma','OR':'Oregon','PA':'Pennsylvania',
    'RI':'Rhode Island','SC':'South Carolina','SD':'South Dakota','TN':'Tennessee',
    'TX':'Texas','UT':'Utah','VT':'Vermont','VA':'Virginia','WA':'Washington',
    'WV':'West Virginia','WI':'Wisconsin','WY':'Wyoming','DC':'District of Columbia',
}

# ArcGIS FeatureServer query endpoint for the FEMA USA Structures layer.
FEMA_URL         = ("https://services2.arcgis.com/FiaPA4ga0iQKduv3/arcgis/rest"
                    "/services/USA_Structures_View/FeatureServer/0/query")
# Page size requested per FEMA query; also the step used to advance resultOffset.
FEMA_MAX_RECORDS = 4000


# ─── KML PARSING ──────────────────────────────────────────────────────────────

def parse_kml_polygon(kml_path: str) -> Polygon:
    """Build a polygon from the first ``<coordinates>`` element of a KML file.

    The element is looked up first without a namespace and then with the
    root element's namespace. Each whitespace-separated tuple is read as
    ``lon,lat[,alt]``; only the first two values are used.

    Args:
        kml_path: Path of the KML file to read.

    Returns:
        A polygon whose exterior ring is the parsed (lon, lat) sequence.

    Raises:
        ValueError: If no coordinates element is found, the element is empty,
            or a coordinate tuple is malformed.
    """
    root = etree.parse(kml_path).getroot()
    ns_match = re.match(r'\{.*\}', root.tag)
    ns = ns_match.group(0) if ns_match else ''

    coords_text = None
    for tag in ('coordinates', f'{ns}coordinates'):
        els = root.findall(f'.//{tag}')
        if els:
            # .text is None for an empty <coordinates/> element; treat that as
            # "no coordinates" rather than crashing with AttributeError.
            coords_text = (els[0].text or '').strip()
            break
    if not coords_text:
        raise ValueError(f"No coordinates found in {kml_path}")

    coords = []
    for triplet in coords_text.split():
        parts = triplet.split(',')
        if len(parts) < 2:
            raise ValueError(
                f"Malformed coordinate tuple {triplet!r} in {kml_path}")
        coords.append((float(parts[0]), float(parts[1])))
    return Polygon(coords)


def polygon_envelope(poly: Polygon):
    """Return the polygon's bounding box as ``(minx, miny, maxx, maxy)``."""
    west, south, east, north = poly.bounds
    return (west, south, east, north)


# ─── OVERTURE BUILDINGS ───────────────────────────────────────────────────────

def _run_overture(bbox_str: str, theme: str, output_path: str) -> bool:
    """Run the ``overturemaps download`` CLI for one theme and bounding box.

    Args:
        bbox_str: Bounding box passed verbatim to ``--bbox``.
        theme: Overture type passed to ``--type`` (e.g. ``"building"``).
        output_path: Destination GeoJSON file passed to ``-o``.

    Returns:
        True when the CLI exits with status 0, False otherwise. On failure the
        tail of the CLI's captured output is logged so the cause can be found.
    """
    # Force UTF-8 in the child Python process so its output decodes cleanly.
    env = os.environ.copy()
    env["PYTHONIOENCODING"] = "utf-8"
    env["PYTHONUTF8"]       = "1"
    command = [
        "overturemaps", "download",
        "--bbox", bbox_str, "-f", "geojson",
        "--type", theme, "-o", output_path,
    ]
    # errors="replace": a stray undecodable byte in the CLI output must not
    # turn an otherwise successful download into a UnicodeDecodeError.
    result = subprocess.run(
        command,
        capture_output=True, text=True, encoding="utf-8", errors="replace",
        env=env
    )
    # Remove state file if created
    state_file = output_path + ".state"
    if os.path.exists(state_file):
        os.remove(state_file)

    if result.returncode != 0:
        detail = (result.stderr or result.stdout or "").strip()
        log.warning("  overturemaps '%s' download exited with code %s: %s",
                    theme, result.returncode, detail[-500:] or "<no output>")
        return False
    return True


def download_overture_buildings(bbox, output_path: str) -> gpd.GeoDataFrame:
    """Download Overture building footprints for *bbox* and load them.

    Args:
        bbox: ``(min_lon, min_lat, max_lon, max_lat)``.
        output_path: GeoJSON file the download is written to and read from.

    Raises:
        RuntimeError: If the Overture download does not succeed.
    """
    west, south, east, north = bbox
    bbox_str = f"{west},{south},{east},{north}"
    log.info(f"  Downloading Overture buildings for bbox {bbox_str}")

    succeeded = _run_overture(bbox_str, "building", output_path)
    if not succeeded:
        raise RuntimeError("Overture building download failed")

    buildings = gpd.read_file(output_path)
    log.info(f"  Overture returned {len(buildings)} buildings (pre-filter)")
    return buildings


def download_overture_addresses(bbox, output_path: str) -> gpd.GeoDataFrame:
    """Download Overture address points for *bbox* and load them.

    Unlike the building download, a failed address download is not fatal:
    a warning is logged and an empty GeoDataFrame is returned.

    Args:
        bbox: ``(min_lon, min_lat, max_lon, max_lat)``.
        output_path: GeoJSON file the download is written to and read from.
    """
    west, south, east, north = bbox
    bbox_str = f"{west},{south},{east},{north}"
    log.info(f"  Downloading Overture addresses for bbox {bbox_str}")

    if _run_overture(bbox_str, "address", output_path):
        addresses = gpd.read_file(output_path)
        log.info(f"  Overture returned {len(addresses)} address points")
        return addresses

    log.warning("  Overture address download failed")
    return gpd.GeoDataFrame()


def filter_to_polygon(gdf: gpd.GeoDataFrame, poly: Polygon) -> gpd.GeoDataFrame:
    """Keep only the features whose geometry intersects *poly*.

    The frame is reprojected to EPSG:4326 first when it declares a different
    CRS, and the polygon's exterior ring is reversed when it is not
    counter-clockwise. A copy of the matching rows is returned.
    """
    declared_crs = gdf.crs
    if declared_crs and declared_crs.to_epsg() != 4326:
        gdf = gdf.to_crs(epsg=4326)

    ring = poly.exterior
    if not ring.is_ccw:
        poly = Polygon(list(reversed(list(ring.coords))))

    hits = gdf.geometry.intersects(poly)
    filtered = gdf.loc[hits].copy()
    log.info(f"  {len(filtered)} buildings after polygon filter")
    return filtered


def _area_sqft(geom) -> float:
    """Return the footprint area of *geom* in square feet, or None on failure.

    The geometry is projected from EPSG:4326 to EPSG:6933 and its area is
    multiplied by 10.7639, then rounded to 4 decimals. Any exception raised
    along the way yields None.
    """
    try:
        to_equal_area = pyproj.Transformer.from_crs(
            "EPSG:4326", "EPSG:6933", always_xy=True
        ).transform
        projected = transform(to_equal_area, geom)
        square_feet = projected.area * 10.7639
        return round(square_feet, 4)
    except Exception:
        return None


def extract_overture_fields(gdf: gpd.GeoDataFrame) -> pd.DataFrame:
    """Map Overture building features onto the OUTPUT_COLUMNS schema.

    For every feature this derives a representative point (latitude /
    longitude), footprint area, height, storey count, gross area and volume,
    and stores the geometry as hex WKB. Heights that are missing, NaN,
    non-finite, non-numeric or <= 0 are stored as null and leave ``source``
    null so later gap-fill stages can label them.

    Args:
        gdf: Building features (one row per building).

    Returns:
        A DataFrame with exactly OUTPUT_COLUMNS; unpopulated columns are None.
    """
    def is_missing(value) -> bool:
        # Absent properties may arrive as None, NaN or pd.NA depending on the
        # column dtype; all of them mean "no value".
        if value is None or value is pd.NA:
            return True
        return isinstance(value, (float, np.floating)) and math.isnan(value)

    def first_present(props, *keys, default=None):
        """Return the first non-missing value among *keys* in *props*."""
        for key in keys:
            if key in props and not is_missing(props[key]):
                return props[key]
        return default

    # Same stamp for every row of this extract.
    source_date = pd.Timestamp.today().strftime("%m/%d/%Y")

    rows = []
    for _, row in gdf.iterrows():
        props = row.to_dict()
        geom  = row.geometry

        try:
            rep_pt = geom.representative_point()
            lat, lon = rep_pt.y, rep_pt.x
        except Exception:
            # Missing or unusable geometry: keep the row without coordinates.
            lat, lon = None, None

        area = _area_sqft(geom) if geom else None

        # Height — treat 0, negative, NaN or non-finite as null
        height_m = first_present(props, 'height', 'building:height', 'roof_height')
        try:
            height_m = float(height_m) if height_m is not None else None
        except (ValueError, TypeError):
            height_m = None
        if height_m is not None and (not math.isfinite(height_m) or height_m <= 0):
            height_m = None

        stories = first_present(props, 'num_floors', 'building:levels', 'levels')
        try:
            stories = int(float(stories)) if stories is not None else None
        except (ValueError, TypeError, OverflowError):
            stories = None

        gross_area = (area * stories) if (area and stories) else None
        # NOTE(review): volume multiplies the floor-summed gross area by the
        # full height (converted to feet) — confirm this is the intended formula.
        volume     = (gross_area * height_m * 3.28084) if (gross_area and height_m) else None

        r = {col: None for col in OUTPUT_COLUMNS}
        r.update({
            "str_uuid":   str(uuid.uuid4()),
            "bld_uuid":   first_present(props, 'id', 'building_id'),
            "area_sqft":  area,
            # "is not None" so a coordinate of exactly 0.0 is kept.
            "latitude":   round(lat, 6) if lat is not None else None,
            "longitude":  round(lon, 6) if lon is not None else None,
            "maxheight":  height_m,
            "meanheight": height_m,
            "stories":    stories,
            "grossarea":  round(gross_area, 4) if gross_area else None,
            "volume":     round(volume, 2)     if volume     else None,
            "source":     "Overture" if height_m is not None else None,
            "sourcedate": source_date,
            "geom":       geom.wkb_hex if geom else None,
        })
        rows.append(r)

    df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    log.info(f"  Heights from Overture: {df['maxheight'].notna().sum()} / {len(df)}")
    return df


# ─── ADDRESS JOIN ─────────────────────────────────────────────────────────────

def join_addresses_to_buildings(df: pd.DataFrame,
                                 addr_gdf: gpd.GeoDataFrame,
                                 match_radius_m: float = 25.0) -> pd.DataFrame:
    """Attach the nearest Overture address point to each building.

    For every building with coordinates, the single nearest address point
    (haversine distance) is looked up; when it lies within *match_radius_m*
    its fields are written into the building's address columns and
    ``add_conf`` is set to ``'High'``. Buildings with no address point in
    range are left untouched.

    Args:
        df: Buildings in the OUTPUT_COLUMNS schema. Modified in place.
        addr_gdf: Overture address points; may be None or empty.
        match_radius_m: Maximum building-to-address distance in metres.

    Returns:
        The same *df* object, with address columns filled where matched.
    """
    if addr_gdf is None or addr_gdf.empty:
        log.warning("  No address data to join")
        return df

    def clean(value):
        # NaN is truthy, so without this a missing part would be rendered as
        # the literal text "nan" inside the joined address string.
        if value is None or value is pd.NA:
            return None
        if isinstance(value, (float, np.floating)) and math.isnan(value):
            return None
        return value

    def get_state(levels):
        """Return the 'value' of the first address level, or None."""
        if isinstance(levels, str):
            # Nested properties may come back from the GeoJSON reader as a
            # JSON-encoded string rather than a list.
            try:
                levels = json.loads(levels)
            except ValueError:
                return None
        if isinstance(levels, np.ndarray):
            levels = levels.tolist()
        if isinstance(levels, (list, tuple)) and levels:
            first = levels[0]
            if isinstance(first, dict):
                return first.get('value')
        return None

    addr_gdf          = addr_gdf.copy()
    addr_gdf['_slat'] = addr_gdf.geometry.y
    addr_gdf['_slon'] = addr_gdf.geometry.x
    if 'address_levels' in addr_gdf.columns:
        addr_gdf['_state_abbrev'] = addr_gdf['address_levels'].apply(get_state)
    else:
        addr_gdf['_state_abbrev'] = None
    addr_valid = addr_gdf.dropna(subset=['_slat', '_slon'])
    if addr_valid.empty:
        log.warning("  No valid address coordinates")
        return df

    bld_valid = df.dropna(subset=['latitude', 'longitude'])
    if bld_valid.empty:
        return df

    earth_r  = 6371000.0
    tree     = BallTree(np.radians(addr_valid[['_slat', '_slon']].values),
                        metric='haversine')
    bld_rad  = np.radians(bld_valid[['latitude', 'longitude']].values)
    distances, indices = tree.query(bld_rad, k=1)

    # Haversine distances come back in radians; convert to metres to compare.
    in_range = np.flatnonzero(distances[:, 0] * earth_r <= match_radius_m)

    filled = 0
    for i in in_range:
        bld_idx    = bld_valid.index[i]
        addr_row   = addr_valid.iloc[indices[i][0]]
        number     = clean(addr_row.get('number'))
        street     = clean(addr_row.get('street'))
        unit       = clean(addr_row.get('unit'))
        postcode   = clean(addr_row.get('postcode'))
        city       = clean(addr_row.get('postal_city'))
        state_abbr = clean(addr_row.get('_state_abbrev'))
        state_full = STATE_ABBREV.get(state_abbr, state_abbr) if state_abbr else None
        parts      = [str(x) for x in [number, street, city, state_abbr, postcode] if x]
        full_addr  = ', '.join(parts) if parts else None
        df.at[bld_idx, 'address']    = full_addr
        df.at[bld_idx, 'address_no'] = number
        df.at[bld_idx, 'street']     = street
        df.at[bld_idx, 'subaddress'] = unit
        df.at[bld_idx, 'city']       = city
        df.at[bld_idx, 'stpostal']   = state_abbr
        df.at[bld_idx, 'zip']        = postcode
        df.at[bld_idx, 'state']      = state_full
        df.at[bld_idx, 'add_conf']   = 'High'
        filled += 1

    log.info(f"  Address join: {filled}/{len(bld_valid)} buildings matched "
             f"(within {match_radius_m}m)")
    return df


# ─── FEMA USA STRUCTURES ──────────────────────────────────────────────────────

def query_fema_buildings(bbox, poly: Polygon) -> pd.DataFrame:
    """Query the FEMA USA Structures REST API for buildings in *bbox*.

    Results are fetched page by page (FEMA_MAX_RECORDS per request) for as
    long as the service reports that the transfer limit was exceeded.
    Features whose geometry does not intersect *poly* are dropped. HEIGHT is
    multiplied by 0.3048 before being stored (i.e. treated as feet).

    Args:
        bbox: ``(min_lon, min_lat, max_lon, max_lat)`` query envelope.
        poly: Polygon used to filter the returned features.

    Returns:
        A DataFrame in the OUTPUT_COLUMNS schema (empty when nothing is
        returned). An API error stops paging with a warning; pages already
        fetched are still used.
    """
    import requests
    min_lon, min_lat, max_lon, max_lat = bbox
    all_features = []
    offset = 0
    while True:
        params = {
            "f":                 "geojson",
            "where":             "1=1",
            "geometry":          f"{min_lon},{min_lat},{max_lon},{max_lat}",
            "geometryType":      "esriGeometryEnvelope",
            "inSR":              "4326",
            "outSR":             "4326",
            "spatialRel":        "esriSpatialRelIntersects",
            "outFields":         ("UUID,PROP_ADDR,PROP_CITY,PROP_ST,PROP_ZIP,"
                                  "HEIGHT,SQFEET,OCC_CLS,LATITUDE,LONGITUDE,CENSUSCODE"),
            "returnGeometry":    "true",
            "resultOffset":      offset,
            "resultRecordCount": FEMA_MAX_RECORDS,
        }
        try:
            resp = requests.get(FEMA_URL, params=params, timeout=60)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            log.warning(f"  FEMA API error at offset {offset}: {e}")
            break
        features = data.get("features", [])
        if not features:
            break
        all_features.extend(features)
        # GeoJSON responses carry the paging flag under "properties"; Esri
        # JSON puts it at the top level. Honour both so paging continues.
        exceeded = bool(
            data.get("exceededTransferLimit")
            or (data.get("properties") or {}).get("exceededTransferLimit")
        )
        log.info(f"  FEMA: fetched {len(all_features)} records "
                 f"(exceededTransferLimit={exceeded})...")
        if not exceeded:
            break
        offset += FEMA_MAX_RECORDS

    if not all_features:
        log.warning("  FEMA returned no buildings")
        return pd.DataFrame(columns=OUTPUT_COLUMNS)

    log.info(f"  FEMA: {len(all_features)} total (pre-filter)")
    source_date = pd.Timestamp.today().strftime("%m/%d/%Y")
    rows = []
    for feat in all_features:
        # "properties" may be present but null in GeoJSON.
        props = feat.get("properties") or {}
        geom  = feat.get("geometry")
        try:
            shp = shape(geom) if geom else None
            if shp and not poly.intersects(shp):
                continue
        except Exception:
            # Unparseable geometry: skip the feature.
            continue

        lat = props.get("LATITUDE")  or (shp.centroid.y if shp else None)
        lon = props.get("LONGITUDE") or (shp.centroid.x if shp else None)

        height_m = None
        raw_h = props.get("HEIGHT")
        if raw_h:
            try:
                h = float(raw_h)
                height_m = round(h * 0.3048, 3) if h > 0 else None
            except (ValueError, TypeError):
                pass

        addr    = props.get("PROP_ADDR")
        city    = props.get("PROP_CITY")
        state   = props.get("PROP_ST")
        zipcode = props.get("PROP_ZIP")
        parts   = [str(x) for x in [addr, city, state, zipcode] if x]
        full_addr = ", ".join(parts) if parts else None
        censuscode = props.get("CENSUSCODE", "")
        # str(): tolerate a numeric code instead of failing on the slice.
        ctfips     = str(censuscode)[:5] if censuscode else None

        r = {col: None for col in OUTPUT_COLUMNS}
        r.update({
            "str_uuid":   str(uuid.uuid4()),
            "bld_uuid":   props.get("UUID"),
            "area_sqft":  props.get("SQFEET"),
            "address":    full_addr,
            "street":     addr,
            "city":       city,
            "stpostal":   state,
            "zip":        zipcode,
            "state":      STATE_ABBREV.get(state, state) if state else None,
            "ctfips":     ctfips,
            "latitude":   round(float(lat), 6) if lat else None,
            "longitude":  round(float(lon), 6) if lon else None,
            "maxheight":  height_m,
            "meanheight": height_m,
            "source":     "FEMA" if height_m else None,
            "sourcedate": source_date,
            "geom":       shp.wkb_hex if shp else None,
        })
        rows.append(r)

    df = pd.DataFrame(rows, columns=OUTPUT_COLUMNS)
    log.info(f"  FEMA: {len(df)} buildings after polygon filter")
    return df


# ─── MERGE HELPERS ────────────────────────────────────────────────────────────

def _merge_new_buildings(df_base: pd.DataFrame,
                          df_new: pd.DataFrame,
                          match_radius_m: float,
                          source_label: str,
                          addr_backfill: bool = False) -> pd.DataFrame:
    """
    Generic merge: match new buildings to base by nearest centroid.

    Each row of *df_new* with coordinates is compared with its nearest base
    row (haversine distance on latitude/longitude):
    - Unmatched (nearest base row farther than match_radius_m) → added as new rows
    - Matched + addr_backfill=True → fill null address fields from new source

    Args:
        df_base: Existing buildings. Backfilled in place when addr_backfill
            is True.
        df_new: Candidate buildings from another source; may be None/empty.
        match_radius_m: Maximum distance in metres for two rows to count as
            the same building.
        source_label: Name of the new source, used only in log messages.
        addr_backfill: Whether matched rows donate missing address fields.

    Returns:
        df_base itself when nothing is appended, otherwise a new concatenated
        frame with a fresh integer index.
    """
    if df_new is None or df_new.empty:
        return df_base

    base_valid = df_base.dropna(subset=['latitude', 'longitude'])
    if base_valid.empty:
        log.info(f"  {source_label} merge: adding all {len(df_new)} as new buildings")
        return pd.concat([df_base, df_new], ignore_index=True)

    new_valid = df_new.dropna(subset=['latitude', 'longitude'])
    if new_valid.empty:
        # NOTE(review): rows without coordinates are dropped here, whereas
        # below they are appended as "unmatched" — confirm which is intended.
        return df_base

    earth_r  = 6371000.0
    tree     = BallTree(np.radians(base_valid[['latitude', 'longitude']].values),
                        metric='haversine')
    new_rad  = np.radians(new_valid[['latitude', 'longitude']].values)
    distances, indices = tree.query(new_rad, k=1)

    addr_cols    = ['address', 'address_no', 'street', 'city',
                    'stpostal', 'zip', 'state', 'ctfips']
    matched      = set()
    backfilled   = 0

    for i, (dist_arr, idx_arr) in enumerate(zip(distances, indices)):
        # Distances are in radians; scale by the Earth radius to get metres.
        if dist_arr[0] * earth_r > match_radius_m:
            continue
        new_idx = new_valid.index[i]
        matched.add(new_idx)
        if not addr_backfill:
            continue
        base_idx = base_valid.index[idx_arr[0]]
        for col in addr_cols:
            if (pd.isna(df_base.at[base_idx, col]) and
                    not pd.isna(df_new.at[new_idx, col])):
                df_base.at[base_idx, col] = df_new.at[new_idx, col]
                backfilled += 1

    unmatched = df_new[~df_new.index.isin(matched)]
    log.info(f"  {source_label} merge: {len(unmatched)} new buildings added"
             + (f", {backfilled} address fields backfilled" if addr_backfill else ""))

    if not unmatched.empty:
        return pd.concat([df_base, unmatched], ignore_index=True)
    return df_base


# ─── KNN GAP-FILL ─────────────────────────────────────────────────────────────

def _median_height_fallback(df: pd.DataFrame) -> pd.DataFrame:
    """Fill every remaining null height with the median known height."""
    still_null_mask = df['maxheight'].isna()
    still_null = int(still_null_mask.sum())
    if still_null == 0:
        return df

    # to_numeric: the column can be object dtype when it holds only None.
    median_h = pd.to_numeric(df['maxheight'], errors='coerce').median()
    # With no known height at all the median is NaN; use the floor instead.
    median_h = max(float(median_h) if not pd.isna(median_h) else MIN_HEIGHT_M,
                   MIN_HEIGHT_M)
    df.loc[still_null_mask, 'maxheight']          = median_h
    df.loc[df['meanheight'].isna(), 'meanheight'] = median_h
    # Label exactly the rows that received the fallback value.
    df.loc[still_null_mask, 'source']             = 'Median Fallback'
    log.info(f"  Median fallback: {still_null} buildings set to {median_h:.2f}m")
    return df


def knn_gap_fill(df: pd.DataFrame, radius_m: float = KNN_RADIUS_M) -> pd.DataFrame:
    """Inverse-distance weighted KNN height interpolation with median fallback.

    Buildings with a null ``maxheight`` take the inverse-distance weighted
    average height of all known-height buildings within *radius_m*
    (haversine distance) and are labelled ``source='KNN'``. Whatever is still
    null afterwards — no neighbour in range, no coordinates, or no known
    heights at all — is set to the median height (never below MIN_HEIGHT_M)
    and labelled ``'Median Fallback'``, so no null height survives this step.

    Args:
        df: Buildings in the OUTPUT_COLUMNS schema. Modified in place.
        radius_m: Neighbour search radius in metres.

    Returns:
        The same *df* object with ``maxheight`` / ``meanheight`` filled.
    """
    missing_mask  = df['maxheight'].isna()
    missing_count = missing_mask.sum()
    if missing_count == 0:
        log.info("  No height gaps — KNN not needed")
        return df

    log.info(f"  KNN gap-fill: {missing_count} buildings remaining")

    has_coords = df['latitude'].notna() & df['longitude'].notna()
    known   = df.loc[~missing_mask & has_coords,
                     ['latitude', 'longitude', 'maxheight']].copy()
    unknown = df.loc[missing_mask & has_coords].copy()

    if known.empty:
        # Previously this returned early and left every height null.
        log.warning("  Not enough known heights for KNN")
    elif not unknown.empty:
        earth_r    = 6371000.0
        tree       = BallTree(np.radians(known[['latitude', 'longitude']].values),
                              metric='haversine')
        unk_rad    = np.radians(unknown[['latitude', 'longitude']].values)
        indices, distances = tree.query_radius(
            unk_rad, r=radius_m / earth_r, return_distance=True, sort_results=True
        )
        known_heights = known['maxheight'].to_numpy(dtype=float)

        filled = 0
        for orig, idx_arr, dist_arr in zip(unknown.index, indices, distances):
            if len(idx_arr) == 0:
                continue
            dists = dist_arr * earth_r
            # Coincident points would give an infinite weight; clamp to 1 mm.
            dists = np.where(dists == 0, 0.001, dists)
            h = round(float(np.average(known_heights[idx_arr],
                                       weights=1.0 / dists)), 2)
            df.at[orig, 'maxheight']  = h
            df.at[orig, 'meanheight'] = h
            df.at[orig, 'source']     = 'KNN'
            filled += 1

        log.info(f"  KNN filled {filled} buildings")

    # Median fallback for anything still null — always runs once gaps exist.
    return _median_height_fallback(df)


# ─── FLOOR RULE ───────────────────────────────────────────────────────────────

def apply_floor_rule(df: pd.DataFrame, min_h: float = MIN_HEIGHT_M) -> pd.DataFrame:
    """Raise every non-null ``maxheight`` below *min_h* up to *min_h*.

    ``meanheight`` is set to *min_h* on the same rows. The frame is modified
    in place and returned.
    """
    heights = df['maxheight']
    too_low = heights.notna() & (heights < min_h)
    for column in ('maxheight', 'meanheight'):
        df.loc[too_low, column] = min_h
    log.info(f"  Floor rule: {too_low.sum()} buildings set to {min_h}m")
    return df


# ─── HEIGHT REPORT ────────────────────────────────────────────────────────────

def print_height_report(df: pd.DataFrame, path_id: str):
    """Log a height summary for one path: sources, distribution, percentiles.

    Args:
        df: Buildings with ``maxheight`` and ``source`` columns.
        path_id: Path ID shown in the report header.

    An empty frame produces a short "no buildings" report instead of
    dividing by zero in the percentage columns.
    """
    total = len(df)
    h     = df['maxheight']
    rule  = '─' * 50
    log.info(f"\n  {rule}")
    log.info(f"  HEIGHT REPORT: {path_id}  ({total:,} buildings)")
    log.info(f"  {rule}")
    if total == 0:
        log.info("  No buildings to report")
        log.info(f"  {rule}\n")
        return

    log.info("  Source breakdown:")
    for src, n in df['source'].value_counts(dropna=False).items():
        log.info(f"    {str(src):<25} {n:>8,}  ({n/total*100:.1f}%)")

    log.info("  Height distribution:")
    bins = [
        ("== 0.0",              h == 0.0),
        ("> 0.0 and < 1.0",    (h > 0)   & (h < 1)),
        (">= 1.0 and < 2.0",   (h >= 1)  & (h < 2)),
        (">= 2.0 and < 3.0",   (h >= 2)  & (h < 3)),
        (">= 3.0 and < 4.0",   (h >= 3)  & (h < 4)),
        (">= 4.0 and < 5.0",   (h >= 4)  & (h < 5)),
        ("== 5.0",              h == 5.0),
        ("> 5.0 and < 10.0",   (h > 5)   & (h < 10)),
        (">= 10.0 and < 20.0", (h >= 10) & (h < 20)),
        (">= 20.0 and < 50.0", (h >= 20) & (h < 50)),
        (">= 50.0",             h >= 50),
    ]
    for label, mask in bins:
        n = mask.sum()
        log.info(f"    {label:<25} {n:>8,}  ({n/total*100:.1f}%)")

    pcts = h.quantile([0.10, 0.25, 0.50, 0.75, 0.90, 0.99])
    log.info(f"  Percentiles (m):  p10={pcts[0.10]:.1f}  p25={pcts[0.25]:.1f}  "
             f"p50={pcts[0.50]:.1f}  p75={pcts[0.75]:.1f}  "
             f"p90={pcts[0.90]:.1f}  p99={pcts[0.99]:.1f}")
    log.info(f"  Min={h.min():.1f}m  Mean={h.mean():.1f}m  "
             f"Max={h.max():.1f}m  Nulls={h.isna().sum()}")
    log.info(f"  {rule}\n")


# ─── SUMMARY LOG ──────────────────────────────────────────────────────────────

def write_summary_log(stats: dict):
    """Append one row of per-path statistics to the summary CSV.

    The header row is written only when the file does not exist yet. The
    existence check and the append both happen under ``_summary_lock``.
    """
    record = pd.DataFrame([stats])
    with _summary_lock:
        needs_header = not os.path.exists(SUMMARY_LOG)
        record.to_csv(
            SUMMARY_LOG,
            mode='a',
            index=False,
            header=needs_header,
            encoding='utf-8-sig',
        )


# ─── PROCESS PATH ─────────────────────────────────────────────────────────────

def _write_empty_ven_csv(path_dir: str, path_id: str) -> None:
    """Write a header-only ``{path_id}_ven_1.csv`` into *path_dir*."""
    pd.DataFrame(columns=OUTPUT_COLUMNS).to_csv(
        os.path.join(path_dir, f"{path_id}_ven_1.csv"),
        index=False, encoding="utf-8-sig"
    )


def _remove_path_artifacts(paths) -> None:
    """Best-effort removal of each path and its ``.state`` sidecar file."""
    for target in paths:
        for candidate in (target, target + ".state"):
            try:
                if os.path.exists(candidate):
                    os.remove(candidate)
            except OSError as exc:
                # Cleanup must never fail the path; just record the leftover.
                log.warning(f"  Could not remove {candidate}: {exc}")


def process_path(path_id: str, kml_dir: str, complete_dir: str):
    """Run the full pipeline for one path ID and write its output CSV.

    Steps: parse the path's KML polygon, download Overture buildings and
    addresses for its bounding box, filter buildings to the polygon, map them
    to the output schema, join addresses, fill height gaps (KNN then median),
    number the rows, flag the largest footprint, apply the height floor,
    log a height report, write ``{path_id}_ven_1.csv`` and append a summary
    row.

    When the KML file is missing the path is skipped. When Overture returns
    no buildings, or none remain after the polygon filter, a header-only CSV
    is written and the condition is recorded in the error log.

    Args:
        path_id: Identifier of the path; also the KML file stem and the name
            of the output sub-folder.
        kml_dir: Folder containing ``{path_id}.kml``.
        complete_dir: Root output folder.

    Raises:
        Any exception from the processing steps propagates to the caller.
        The in-progress status file and the temporary downloads are removed
        on every exit, including early returns and exceptions.
    """
    path_start = datetime.now()
    log.info(f"{'='*60}")
    log.info(f"Processing path ID: {path_id}")

    # Write in-progress status for GUI
    status_file = os.path.join(complete_dir, f"inprogress_{path_id}.txt")
    with open(status_file, "w") as f:
        f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Temp files written to path folder, deleted after use
    path_dir = os.path.join(complete_dir, path_id)
    tmp_bld  = os.path.join(path_dir, f"{path_id}_tmp_bld.geojson")
    tmp_addr = os.path.join(path_dir, f"{path_id}_tmp_addr.geojson")

    try:
        kml_path = os.path.join(kml_dir, f"{path_id}.kml")
        if not os.path.exists(kml_path):
            log.error(f"  KML not found: {kml_path} — skipping")
            return

        # Create per-path output folder
        os.makedirs(path_dir, exist_ok=True)

        # 1. Parse KML polygon
        poly = parse_kml_polygon(kml_path)
        bbox = polygon_envelope(poly)
        log.info(f"  Polygon bounds: {bbox}")

        # 2. Download Overture buildings + addresses
        gdf      = download_overture_buildings(bbox, tmp_bld)
        addr_gdf = download_overture_addresses(bbox, tmp_addr)

        if gdf.empty:
            msg = f"No buildings returned by Overture for {path_id} — writing empty CSV"
            log.warning(f"  {msg}")
            log_error(path_id, msg)
            _write_empty_ven_csv(path_dir, path_id)
            return

        # 3. Filter buildings to rotated polygon
        gdf = filter_to_polygon(gdf, poly)

        if gdf.empty:
            msg = f"No buildings remain after polygon filter for {path_id} — writing empty CSV"
            log.warning(f"  {msg}")
            log_error(path_id, msg)
            _write_empty_ven_csv(path_dir, path_id)
            return

        # 4. Map Overture fields to output schema
        df = extract_overture_fields(gdf)

        # 5. Join Overture addresses to buildings
        df = join_addresses_to_buildings(df, addr_gdf)

        # 6. KNN height gap-fill + median fallback
        df = knn_gap_fill(df)

        # 7. Sequential gid
        df['gid'] = range(1, len(df) + 1)

        # 8. Flag largest building by area
        if df['area_sqft'].notna().any():
            df['largest'] = 0
            df.at[df['area_sqft'].idxmax(), 'largest'] = 1

        # 9. Ensure all 44 columns in order
        for col in OUTPUT_COLUMNS:
            if col not in df.columns:
                df[col] = None
        # .copy(): the floor rule assigns into this frame, so it must not be
        # a view-like selection of the previous one.
        df = df[OUTPUT_COLUMNS].copy()

        # 10. Floor rule — last
        df = apply_floor_rule(df)

        # 11. Height report
        print_height_report(df, path_id)

        # 12. Write output to per-path folder
        out_path = os.path.join(path_dir, f"{path_id}_ven_1.csv")
        df.to_csv(out_path, index=False, encoding="utf-8-sig")

        elapsed = datetime.now() - path_start
        elapsed_text = str(timedelta(seconds=int(elapsed.total_seconds())))
        log.info(f"  Written: {out_path}  ({len(df):,} buildings)  "
                 f"[{elapsed_text}]")

        # 13. Summary log
        h = df['maxheight']
        write_summary_log({
            'path_id':          path_id,
            'timestamp':        pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
            'elapsed_time':     elapsed_text,
            'total_buildings':  len(df),
            'overture_count':   (df['source'] == 'Overture').sum(),
            'knn_count':        (df['source'] == 'KNN').sum(),
            'median_fallback':  (df['source'] == 'Median Fallback').sum(),
            'null_heights':     h.isna().sum(),
            'address_pct':      round(df['address'].notna().mean() * 100, 1),
            'height_min':       round(h.min(), 2),
            'height_mean':      round(h.mean(), 2),
            'height_median':    round(h.median(), 2),
            'height_max':       round(h.max(), 2),
            'floor_rule_count': (h == MIN_HEIGHT_M).sum(),
        })
    finally:
        # 14. Delete temp files and status file — on success, early return
        # and failure alike, so no stale "inprogress_" marker is left behind.
        _remove_path_artifacts([tmp_bld, tmp_addr, status_file])


# ─── MAIN ─────────────────────────────────────────────────────────────────────

def main():
    """Run the pipeline over every path ID in PATHIDS_CSV using a thread pool.

    Honours a ``resume=true|false`` command-line argument that overrides the
    module-level RESUME flag.  While running it maintains the marker files
    the monitor GUI polls under COMPLETE_DIR:

      * run_start.txt        - timestamp written at start-up
      * current_summary.txt  - path of this run's timestamped summary CSV
      * pipeline.pid         - PID of this process (removed when the run ends)
      * run_total.txt        - number of path IDs read from PATHIDS_CSV
      * run_done.txt         - timestamp written when the run ends

    A failure in one path is logged (console + error log) and does not stop
    the other paths.  If STOP_FLAG appears, futures that have not started
    yet are cancelled and the flag file is removed at the end of the run.
    """
    global SUMMARY_LOG, RESUME, _error_log_path
    # Imported locally, once, instead of inside the failure loop; used only to
    # format tracebacks for the error log.
    import traceback

    # Command line override for RESUME (usage: resume=true or resume=false)
    for arg in sys.argv[1:]:
        if arg.lower().startswith("resume="):
            RESUME = arg.split("=", 1)[1].strip().lower() in ("true", "1", "yes")

    run_start = datetime.now()
    os.makedirs(COMPLETE_DIR, exist_ok=True)
    log.info(f"RESUME = {RESUME}")

    # Set timestamped log filenames
    ts = run_start.strftime("%Y%m%d_%H%M%S")
    SUMMARY_LOG     = os.path.join(COMPLETE_DIR, f"ven_pipeline_summary_{ts}.csv")
    _error_log_path = os.path.join(COMPLETE_DIR, f"ven_pipeline_errors_{ts}.txt")
    log.info(f"Summary log: {SUMMARY_LOG}")
    log.info(f"Error log:   {_error_log_path}")

    done_file = os.path.join(COMPLETE_DIR, "run_done.txt")
    pid_file  = os.path.join(COMPLETE_DIR, "pipeline.pid")

    def finish_run():
        """Tell the GUI the run is over: drop the stale PID file, write the done marker."""
        # Once this process exits its PID may be reused by an unrelated
        # process, so the file must not outlive the run.
        try:
            os.remove(pid_file)
        except OSError:
            pass
        with open(done_file, "w") as f:
            f.write(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

    # Write run start time for GUI
    start_file = os.path.join(COMPLETE_DIR, "run_start.txt")
    run_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(start_file, "w") as f:
        f.write(run_ts)

    # Write summary log path so GUI can find the timestamped file
    log_ptr = os.path.join(COMPLETE_DIR, "current_summary.txt")
    with open(log_ptr, "w") as f:
        f.write(SUMMARY_LOG)

    # Clear done marker from previous run
    if os.path.exists(done_file):
        os.remove(done_file)

    # Write PID so GUI can kill this process
    with open(pid_file, "w") as f:
        f.write(str(os.getpid()))

    pathids_df = pd.read_csv(PATHIDS_CSV)
    path_ids   = pathids_df.iloc[:, 0].astype(str).tolist()
    log.info(f"Found {len(path_ids)} path IDs — running with {PARALLEL_WORKERS} workers")

    # Write total count for GUI
    with open(os.path.join(COMPLETE_DIR, "run_total.txt"), "w") as f:
        f.write(str(len(path_ids)))

    if RESUME:
        # A path counts as done when COMPLETE_DIR/<id>/<id>_ven_1.csv exists.
        existing = {p.name for p in Path(COMPLETE_DIR).iterdir()
                    if p.is_dir() and (p / f"{p.name}_ven_1.csv").exists()}
        pending  = [p for p in path_ids if p not in existing]
        skipped  = len(path_ids) - len(pending)
        log.info(f"RESUME=True: {skipped} already done, {len(pending)} to process")
    else:
        pending = path_ids

    if not pending:
        log.info("All paths already processed. Done.")
        # The done marker was cleared above; rewrite it so the monitor does
        # not wait forever for a run that had nothing to do.
        finish_run()
        return

    completed, failed = 0, 0

    with ThreadPoolExecutor(max_workers=PARALLEL_WORKERS) as executor:
        futures = {
            executor.submit(process_path, pid, KML_DIR, COMPLETE_DIR): pid
            for pid in pending
        }
        for future in as_completed(futures):
            pid = futures[future]

            # Record the outcome of the path that just finished BEFORE looking
            # at the stop flag, so its result (or its error) is never dropped.
            try:
                future.result()
                completed += 1
            except Exception as e:
                failed += 1
                err_msg = traceback.format_exc()
                log.error(f"Failed on {pid}: {e}", exc_info=True)
                log_error(pid, err_msg)

            elapsed = datetime.now() - run_start
            log.info(f"Progress: {completed + failed}/{len(pending)} "
                     f"({completed} ok, {failed} failed)  "
                     f"Total elapsed: {str(timedelta(seconds=int(elapsed.total_seconds())))}")

            if os.path.exists(STOP_FLAG):
                log.info("Stop flag detected — cancelling remaining paths...")
                # cancel() only affects futures that have not started; paths
                # already running finish when the executor shuts down.
                for queued in futures:
                    queued.cancel()
                break

    total_elapsed = datetime.now() - run_start
    log.info(f"{'='*60}")
    log.info(f"COMPLETE — {completed} succeeded, {failed} failed")
    log.info(f"Total run time: {str(timedelta(seconds=int(total_elapsed.total_seconds())))}")
    log.info(f"Summary log:    {SUMMARY_LOG}")
    log.info(f"{'='*60}")

    if os.path.exists(STOP_FLAG):
        os.remove(STOP_FLAG)
        log.info("Stop flag removed.")

    # Write done marker for GUI
    finish_run()


# ─── ENTRY POINT ──────────────────────────────────────────────────────────────

# The script runs in one of two roles, selected by the "--monitor" argument:
# a Tkinter monitor window, or (without the argument) the pipeline itself.
if __name__ == "__main__":
    import sys  # already imported at module level; harmless repeat

    if "--monitor" in sys.argv:
        # ── Monitor process: simple GUI ───────────────────────────────────────
        # The monitor only reads marker files under COMPLETE_DIR; it never
        # calls into the pipeline code directly.
        import tkinter as tk
        from tkinter import messagebox

        # One-element lists are used as mutable cells so the nested callbacks
        # defined below can update them without `nonlocal`/`global`.
        RUN_START    = [datetime.now()]  # list so inner function can mutate it
        START_FILE   = os.path.join(COMPLETE_DIR, "run_start.txt")
        LOG_PTR_FILE = os.path.join(COMPLETE_DIR, "current_summary.txt")
        DONE_FILE    = os.path.join(COMPLETE_DIR, "run_done.txt")
        pipeline_done = [False]

        # Widgets are packed top-to-bottom in creation order, so the order of
        # the statements below defines the window layout.
        root = tk.Tk()
        root.title("VEN Pipeline")
        root.configure(bg="#1a1a2e")
        root.geometry("520x480")
        root.resizable(True, True)

        # Total elapsed header
        tk.Label(root, text="TOTAL ELAPSED", bg="#1a1a2e", fg="#888888",
                 font=("Courier New", 9)).pack(pady=(16, 0))
        lbl_total = tk.Label(root, text="00:00:00", bg="#1a1a2e", fg="#4f9cf9",
                              font=("Courier New", 32, "bold"))
        lbl_total.pack()

        # Counts row
        lbl_counts = tk.Label(root, text="", bg="#1a1a2e", fg="#888888",
                               font=("Courier New", 10))
        lbl_counts.pack(pady=(4, 4))

        # In-progress files label
        lbl_inprogress = tk.Label(root, text="", bg="#1a1a2e", fg="#f5a623",
                                   font=("Courier New", 9), justify="left",
                                   wraplength=480)
        lbl_inprogress.pack(pady=(0, 8))

        # Divider
        tk.Frame(root, bg="#2a2d3a", height=1).pack(fill="x", padx=20)

        # Scrollable list of completed files
        tk.Label(root, text="COMPLETED FILES", bg="#1a1a2e", fg="#888888",
                 font=("Courier New", 9)).pack(pady=(10, 4))

        frame = tk.Frame(root, bg="#1a1a2e")
        frame.pack(fill="both", expand=True, padx=20, pady=(0, 16))

        scrollbar = tk.Scrollbar(frame)
        scrollbar.pack(side="right", fill="y")

        listbox = tk.Listbox(frame, bg="#0f1117", fg="#e2e8f0",
                              font=("Courier New", 10),
                              selectbackground="#2a2d3a",
                              activestyle="none",
                              borderwidth=0, highlightthickness=0,
                              yscrollcommand=scrollbar.set)
        listbox.pack(side="left", fill="both", expand=True)
        # Two-way link: the listbox reports its position to the scrollbar
        # (yscrollcommand above) and the scrollbar drives the listbox view.
        scrollbar.config(command=listbox.yview)

        # Path IDs already shown in the listbox, so each is inserted only once.
        seen_paths = set()
        # File holding the PID of the pipeline process (target of the stop button).
        pipeline_pid_file = os.path.join(COMPLETE_DIR, "pipeline.pid")

        def stop_pipeline():
            """Ask for confirmation, then kill the pipeline process and its children.

            The target PID is read from ``pipeline_pid_file``.  Nothing is
            raised to the Tk event loop: a missing PID file is reported with a
            warning dialog and any other failure with an error dialog.
            """
            if not tk.messagebox.askyesno("Stop Pipeline", "Kill the pipeline immediately?"):
                return

            if not os.path.exists(pipeline_pid_file):
                # Previously a silent no-op; tell the user nothing was killed.
                tk.messagebox.showwarning(
                    "Stop Pipeline",
                    "No pipeline PID file found — nothing to kill.")
                return

            try:
                with open(pipeline_pid_file) as f:
                    pid = int(f.read().strip())
                # Imported lazily so the monitor window still opens when
                # psutil is not installed; the ImportError is reported below.
                import psutil
                parent = psutil.Process(pid)
                for child in parent.children(recursive=True):
                    try:
                        child.kill()
                    except psutil.NoSuchProcess:
                        # Child exited between enumeration and kill; keep
                        # going so the parent is still terminated.
                        pass
                parent.kill()
                listbox.insert(0, ">>> PIPELINE KILLED <<<")
                btn_stop.config(state="disabled", text="Stopped", bg="#555555")
            except Exception as e:
                tk.messagebox.showerror("Error", f"Could not kill pipeline:\n{e}")

        # Stop button: its command is stop_pipeline, which asks for
        # confirmation before killing anything. Packed after the list frame.
        btn_stop = tk.Button(root, text="⏹  STOP NOW",
                              bg="#8b0000", fg="white",
                              font=("Courier New", 11, "bold"),
                              relief="flat", padx=16, pady=8,
                              cursor="hand2",
                              command=stop_pipeline)
        btn_stop.pack(pady=(0, 8))

        def refresh():
            """Poll the pipeline's marker files and update the GUI.

            Reads run_done.txt, run_total.txt, the summary CSV named in
            current_summary.txt and the inprogress_*.txt files under
            COMPLETE_DIR, then updates the counts label, the completed-files
            list and the in-progress label.  Reschedules itself every 2 s.
            All file reads are best-effort: the pipeline may be writing the
            files while they are being read here.
            """
            # Check if pipeline finished
            if os.path.exists(DONE_FILE):
                pipeline_done[0] = True

            # Read total path count
            total = 0
            total_file = os.path.join(COMPLETE_DIR, "run_total.txt")
            if os.path.exists(total_file):
                try:
                    with open(total_file) as f:
                        total = int(f.read().strip())
                except Exception:
                    pass

            # Read summary log for completed count
            current_log = None
            if os.path.exists(LOG_PTR_FILE):
                try:
                    with open(LOG_PTR_FILE) as f:
                        current_log = f.read().strip()
                except Exception:
                    pass

            done = 0
            if current_log and os.path.exists(current_log):
                try:
                    df = pd.read_csv(current_log, on_bad_lines='skip')
                    if not df.empty:
                        done = len(df)
                        for _, row in df.iterrows():
                            pid = str(row.get('path_id', ''))
                            if not pid or pid in seen_paths:
                                continue
                            try:
                                bld = int(row.get('total_buildings', 0))
                            except (TypeError, ValueError, OverflowError):
                                # Row is probably still being written (count
                                # missing/NaN). Leave it unmarked so a later
                                # refresh can list it, and keep processing the
                                # remaining rows.
                                continue
                            t = str(row.get('elapsed_time', ''))
                            entry = f"{pid:<30}  {t:>8}  {bld:>8,} bldgs"
                            listbox.insert(0, entry)
                            # Mark as seen only once the entry is displayed.
                            seen_paths.add(pid)
                except Exception:
                    pass

            # Update counts label
            if total > 0:
                lbl_counts.config(text=f"Processing {done} of {total} files")
            elif done > 0:
                lbl_counts.config(text=f"{done} completed")
            else:
                lbl_counts.config(text="Waiting for first completion...")

            # Show in-progress files with elapsed time
            if not pipeline_done[0]:
                inprogress = []
                for f in Path(COMPLETE_DIR).glob("inprogress_*.txt"):
                    pid = f.stem.replace("inprogress_", "")
                    try:
                        with open(f) as fh:
                            start = datetime.strptime(fh.read().strip(), "%Y-%m-%d %H:%M:%S")
                        elapsed = datetime.now() - start
                        h, rem = divmod(int(elapsed.total_seconds()), 3600)
                        m, s   = divmod(rem, 60)
                        inprogress.append(f"{pid}  {h:02d}:{m:02d}:{s:02d}")
                    except Exception:
                        # Unreadable/partial status file: show the ID without a timer.
                        inprogress.append(pid)
                if inprogress:
                    lbl_inprogress.config(text="In progress:\n" + "\n".join(inprogress))
                else:
                    lbl_inprogress.config(text="")
            else:
                # Finished: clear the in-progress list and show the final tally.
                lbl_inprogress.config(text="")
                lbl_counts.config(text=f"{done} of {total} completed  ✓ DONE")

            root.after(2000, refresh)  # data refresh every 2 seconds

        def tick():
            """Redraw the total-elapsed clock; runs once a second, independently of refresh()."""
            stamp_format = "%Y-%m-%d %H:%M:%S"

            # Adopt the start time recorded on disk whenever it can be read.
            if os.path.exists(START_FILE):
                try:
                    with open(START_FILE) as fh:
                        RUN_START[0] = datetime.strptime(fh.read().strip(), stamp_format)
                except Exception:
                    pass

            finished = pipeline_done[0]

            # While running, the clock ends "now"; once finished it freezes at
            # the time stored in the done marker (falling back to "now" if
            # that file cannot be read or parsed).
            end_point = None
            if finished:
                try:
                    with open(DONE_FILE) as fh:
                        end_point = datetime.strptime(fh.read().strip(), stamp_format)
                except Exception:
                    end_point = None
            if end_point is None:
                end_point = datetime.now()
            whole_seconds = int((end_point - RUN_START[0]).total_seconds())

            hours, leftover = divmod(whole_seconds, 3600)
            minutes, seconds = divmod(leftover, 60)
            lbl_total.config(text=f"{hours:02d}:{minutes:02d}:{seconds:02d}",
                             fg="#3ecf8e" if finished else "#4f9cf9")

            if not finished:
                root.after(1000, tick)  # keep ticking only while running
                return

            # Final tick: make sure the counts label carries the done badge.
            current = lbl_counts.cget("text")
            if "✓ DONE" not in current:
                lbl_counts.config(text=current + "  ✓ DONE")

        # Schedule the first run of each callback, then hand control to Tk.
        # mainloop() blocks until the monitor window is closed.
        root.after(500, tick)    # start clock
        root.after(1000, refresh)  # start data refresh
        root.mainloop()

    else:
        # ── Main process: launch monitor + run pipeline ───────────────────────
        # Re-invoke this same script with "--monitor" as a separate process so
        # the GUI runs independently of the pipeline. On Windows it gets its
        # own console window; elsewhere no creation flags are passed. The
        # Popen handle is not kept or waited on.
        subprocess.Popen(
            [sys.executable, __file__, "--monitor"],
            creationflags=subprocess.CREATE_NEW_CONSOLE
            if sys.platform == "win32" else 0
        )
        main()