import os
import pandas as pd
import numpy as np
from datetime import datetime
import json
from tqdm import tqdm
from Dust_Storm_Modules import MERRA2AODProcessor
[docs]
class AirportDustFlagger:
def __init__(self, metar_csv_path, config_path, airport_csv_path):
self.metar_csv_path = metar_csv_path
self.config_path = config_path
self.airport_csv_path = airport_csv_path
self._load_config()
def _load_config(self):
with open(self.config_path, "r") as f:
config = json.load(f)
self.project_name = config["project_name"]
self.region_bounds = tuple(config["region_bounds"])
self.start_date = config["start_date"]
self.end_date = config["end_date"]
self.output_dir = config["modis_output_dir"]
self.csv_output_path = config["csv_output_path"]
def _vectorized_haversine(self, lat1_array, lon1_array, lat2, lon2):
lat1 = np.radians(lat1_array)
lon1 = np.radians(lon1_array)
lat2 = np.radians(lat2)
lon2 = np.radians(lon2)
dlat = lat2 - lat1
dlon = lon2 - lon1
a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
c = 2 * np.arcsin(np.sqrt(a))
r = 6371
return r * c
[docs]
def run(self):
print("\n[1] Checking and processing MERRA-2 data...")
processor = MERRA2AODProcessor(
self.start_date, self.end_date, self.region_bounds, self.output_dir
)
processor.download_files()
processor.convert_to_csv()
print("\n[2] Loading METAR and airport data...")
metar_df = pd.read_csv(self.metar_csv_path)
metar_df["time"] = pd.to_datetime(metar_df["time"], errors="coerce")
metar_df.dropna(subset=["time"], inplace=True)
airport_df = pd.read_csv(self.airport_csv_path)
airport_df = airport_df[airport_df["type"] == "large_airport"]
unique_stations = metar_df["station"].unique()
airport_df = airport_df[airport_df["icao_code"].isin(unique_stations)]
results = []
debug_logs = []
print("\n[3] Matching MERRA-2 spikes with METAR reports...")
merra_files = sorted(
[
f
for f in os.listdir(self.output_dir)
if f.endswith(".csv") and f.startswith("MERRA2_400")
]
)
all_merra_df = []
for f in merra_files:
df = pd.read_csv(os.path.join(self.output_dir, f))
df["time"] = pd.to_datetime(df["time"], format="%d/%m/%Y %H:%M")
df["hour"] = df["time"].dt.hour
all_merra_df.append(df)
if not all_merra_df:
print("No MERRA CSVs found. Aborting.")
return
merra_full = pd.concat(all_merra_df, ignore_index=True)
metar_times = set(metar_df["time"])
merra_by_hour = {hour: df for hour, df in merra_full.groupby("hour")}
for hour, hour_df in tqdm(merra_by_hour.items(), desc="[MERRA-led analysis]"):
for _, airport in airport_df.iterrows():
airport_lat = airport["latitude_deg"]
airport_lon = airport["longitude_deg"]
icao_code = airport["icao_code"]
airport_name = airport["name"]
hour_df = hour_df.copy()
hour_df["dist"] = self._vectorized_haversine(
hour_df["latitude"].values,
hour_df["longitude"].values,
airport_lat,
airport_lon,
)
candidate_df = hour_df[hour_df["dist"] <= 50]
threshold_df = hour_df[hour_df["dist"] <= 100]
if threshold_df.empty:
continue
monthly_mean = threshold_df["dust_mass"].mean()
std_dev = threshold_df["dust_mass"].std()
threshold_val = monthly_mean + 2 * std_dev
filtered_df = candidate_df[candidate_df["dust_mass"] > threshold_val]
for _, row in filtered_df.iterrows():
timestamp = row["time"]
metar_agrees = any(
abs((timestamp - mt).total_seconds()) <= 1800
for mt in metar_times
)
source = 3 if metar_agrees else 2
result = {
"datetime": timestamp,
"airport_name": airport_name,
"icao_code": icao_code,
"latitude": row["latitude"],
"longitude": row["longitude"],
"distance_km": row["dist"],
"dust_mass": row["dust_mass"],
"monthly_mean": monthly_mean,
"threshold": threshold_val,
"above_monthly_avg": True,
"Source": source,
}
results.append(result)
for _, metar_row in tqdm(
metar_df.iterrows(), total=len(metar_df), desc="[METAR-led analysis]"
):
metar_time = metar_row["time"]
station = metar_row["station"]
airport_info = airport_df[airport_df["icao_code"] == station]
if airport_info.empty:
continue
airport = airport_info.iloc[0]
airport_lat = airport["latitude_deg"]
airport_lon = airport["longitude_deg"]
airport_name = airport["name"]
time_window_start = metar_time - pd.Timedelta(minutes=30)
time_window_end = metar_time + pd.Timedelta(minutes=30)
merra_candidates = merra_full[
(merra_full["time"] >= time_window_start)
& (merra_full["time"] <= time_window_end)
].copy()
merra_candidates["dist"] = self._vectorized_haversine(
merra_candidates["latitude"].values,
merra_candidates["longitude"].values,
airport_lat,
airport_lon,
)
candidate_df = merra_candidates[merra_candidates["dist"] <= 50]
threshold_df = merra_candidates[merra_candidates["dist"] <= 100]
if threshold_df.empty:
continue
monthly_mean = threshold_df["dust_mass"].mean()
std_dev = threshold_df["dust_mass"].std()
threshold_val = monthly_mean + 2 * std_dev
match = candidate_df[candidate_df["dust_mass"] > threshold_val]
source_code = 3 if not match.empty else 1
for _, match_row in match.iterrows():
result = {
"datetime": metar_time,
"airport_name": airport_name,
"icao_code": station,
"latitude": match_row["latitude"],
"longitude": match_row["longitude"],
"distance_km": match_row["dist"],
"dust_mass": match_row["dust_mass"],
"monthly_mean": monthly_mean,
"threshold": threshold_val,
"above_monthly_avg": True,
"Source": source_code,
}
results.append(result)
results_df = pd.DataFrame(results)
output_file = os.path.join(
self.csv_output_path, f"{self.project_name}_metar_dust_crosscheck.csv"
)
results_df.to_csv(output_file, index=False)
agreement_summary = results_df.copy()
agreement_summary["agreement"] = agreement_summary["Source"].isin([1, 2, 3])
summary = agreement_summary.groupby(
["airport_name", agreement_summary["datetime"].dt.hour]
)["agreement"].agg(["count", "sum"])
summary["percentage_agreement"] = 100 * summary["sum"] / summary["count"]
print("\nSummary of METAR agreement by airport and hour:")
print(summary[["percentage_agreement"]].round(1))
print(f"\n Hourly crosscheck complete. Results saved to: {output_file}")
return results_df
from Dust_Storm_Modules import MERRA2AODANAProcessor
[docs]
class AirportAODFlagger:
def __init__(self, metar_csv_path, config_path, airport_csv_path):
self.metar_csv_path = metar_csv_path
self.config_path = config_path
self.airport_csv_path = airport_csv_path
self._load_config()
def _load_config(self):
with open(self.config_path, "r") as f:
config = json.load(f)
self.project_name = config["project_name"]
self.region_bounds = tuple(config["region_bounds"])
self.start_date = config["start_date"]
self.end_date = config["end_date"]
self.output_dir = config["AOD_output_dir"]
self.csv_output_path = config["csv_output_path"]
def _vectorized_haversine(self, lat1_array, lon1_array, lat2, lon2):
lat1 = np.radians(lat1_array)
lon1 = np.radians(lon1_array)
lat2 = np.radians(lat2)
lon2 = np.radians(lon2)
dlat = lat2 - lat1
dlon = lon2 - lon1
a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
c = 2 * np.arcsin(np.sqrt(a))
r = 6371
return r * c
[docs]
def run(self):
print("\n[1] Checking and processing MERRA-2 AODANA data...")
processor = MERRA2AODANAProcessor(
self.start_date, self.end_date, self.region_bounds, self.output_dir
)
processor.download_files()
processor.process_files()
print("\n[2] Loading METAR and airport data...")
metar_df = pd.read_csv(self.metar_csv_path)
metar_df["time"] = pd.to_datetime(metar_df["time"], errors="coerce")
metar_df.dropna(subset=["time"], inplace=True)
airport_df = pd.read_csv(self.airport_csv_path)
airport_df = airport_df[airport_df["type"] == "large_airport"]
unique_stations = metar_df["station"].unique()
airport_df = airport_df[airport_df["icao_code"].isin(unique_stations)]
print("\n[3] Matching MERRA-2 AODANA spikes with METAR reports...")
aod_files = sorted(
[
f
for f in os.listdir(self.output_dir)
if f.endswith(".csv") and f.startswith("MERRA2_400")
]
)
all_aod_df = []
for f in aod_files:
df = pd.read_csv(os.path.join(self.output_dir, f))
df["time"] = pd.to_datetime(df["time"], format="%d/%m/%Y %H:%M")
df["hour"] = df["time"].dt.hour
all_aod_df.append(df)
print(f"Found {len(all_aod_df)} AODANA files.")
if not all_aod_df:
print("No AODANA CSVs found. Aborting.")
return
aod_full = pd.concat(all_aod_df, ignore_index=True)
metar_times = set(metar_df["time"])
aod_by_hour = {hour: df for hour, df in aod_full.groupby("hour")}
results = []
for hour, hour_df in tqdm(aod_by_hour.items(), desc="[AODANA-led analysis]"):
for _, airport in airport_df.iterrows():
airport_lat = airport["latitude_deg"]
airport_lon = airport["longitude_deg"]
icao_code = airport["icao_code"]
airport_name = airport["name"]
hour_df = hour_df.copy()
hour_df["dist"] = self._vectorized_haversine(
hour_df["latitude"].values,
hour_df["longitude"].values,
airport_lat,
airport_lon,
)
candidate_df = hour_df[hour_df["dist"] <= 50]
threshold_df = hour_df[hour_df["dist"] <= 100]
if threshold_df.empty:
continue
monthly_mean = threshold_df["AODANA"].mean()
std_dev = threshold_df["AODANA"].std()
threshold_val = monthly_mean + 2 * std_dev
filtered_df = candidate_df[candidate_df["AODANA"] > threshold_val]
for _, row in filtered_df.iterrows():
timestamp = row["time"]
metar_agrees = any(
abs((timestamp - mt).total_seconds()) <= 1800
for mt in metar_times
)
result = {
"datetime": timestamp,
"airport_name": airport_name,
"icao_code": icao_code,
"latitude": row["latitude"],
"longitude": row["longitude"],
"distance_km": row["dist"],
"AODANA": row["AODANA"],
"monthly_mean_AOD": monthly_mean,
"threshold_AOD": threshold_val,
"above_monthly_avg_AOD": True,
"METAR_agrees": metar_agrees,
}
results.append(result)
print(airport_df["icao_code"].unique())
results_df = pd.DataFrame(results)
output_file = os.path.join(
self.csv_output_path, f"{self.project_name}_metar_AOD_crosscheck.csv"
)
results_df.to_csv(output_file, index=False)
print(f"\n✅ AODANA crosscheck complete. Results saved to: {output_file}")
return results_df