# -*- coding: utf-8 -*-
"""
Created on Wed Jul 28 13:09:28 2021

@author: Mark B
"""

import urllib.request
import pandas as pd
import glob
import matplotlib.pyplot as plt
import numpy as np
from climateUtils import runAvg

baseUrl = r"https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/{:d}/"
#baseFile = r"CRNH0203-{:d}-CO_Dinosaur_2_E.txt"
#baseFile = r"CRNH0203-{:d}-AK_Utqiagvik_formerly_Barrow_4_ENE.txt"
#baseFile = r"CRNH0203-{:d}-FL_Everglades_City_5_NE.txt"
#baseFile = r"CRNH0203-{:d}-NY_Ithaca_13_E.txt"
baseFile = r"CRNH0203-{:d}-AZ_Yuma_27_ENE.txt"
#baseFile = r"CRNH0203-{:d}-CA_Fallbrook_5_NE.txt"
#localDir = r"C:/Users/brugger/Downloads/dinosaur/"
#localDir = r"/home/mark/Downloads/diCRNH0203-2021-AK_Utqiagvik_formerly_Barrow_4_ENE.txtnosaur/"
#localDir = r"/home/mark/Downloads/utqiagvik/"
#localDir = r"/home/mark/Downloads/everglades/"
localDir = r"/home/mark/Downloads/{:s}/".format(baseFile.split("_")[1])
avgDays = [30,15,-14]

colNames = ["WBANNO", "UTC_DATE", "UTC_TIME", "LST_DATE", "LST_TIME", "CRX_VN",
            "LONGITUDE", "LATITUDE", "T_CALC", "T_HR_AVG", "T_MAX", "T_MIN", 
            "P_CALC", "SOLARAD", "SOLARAD_FLAG", "SOLARAD_MAX", 
            "SOLARAD_MAX_FLAG", "SOLARAD_MIN", "SOLARAD_MIN_FLAG", 
            "SUR_TEMP_TYPE", "SUR_TEMP", "SUR_TEMP_FLAG", "SUR_TEMP_MAX", 
            "SUR_TEMP_MAX_FLAG", "SUR_TEMP_MIN", "SUR_TEMP_MIN_FLAG", 
            "RH_HR_AVG", "RH_HR_AVG_FLAG", "SOIL_MOISTURE_5", 
            "SOIL_MOISTURE_10", "SOIL_MOISTURE_20", "SOIL_MOISTURE_50", 
            "SOIL_MOISTURE_100", "SOIL_TEMP_5", "SOIL_TEMP_10", 
            "SOIL_TEMP_20", "SOIL_TEMP_50", "SOIL_TEMP_100"]

useCols =["UTC_DATE", "UTC_TIME", "LST_DATE", "LST_TIME", "T_CALC", 
          "T_HR_AVG", "T_MAX", "T_MIN"]

dtype_dic = {"UTC_DATE":str, "UTC_TIME":str, "LST_DATE":str, "LST_TIME":str}

# Retrieve files from http site, this takes a long time
def getDinosaur():
    for year in range(2022, 1999, -1):
        try:
            urllib.request.urlretrieve(baseUrl.format(year) + baseFile.format(year),
                                       localDir + baseFile.format(year))
            print(baseFile.format(year) + " : retrieved")
        except:
            print(baseFile.format(year) + " : not retrieved")

def plotDino():
    print("hi!")
    
def getUSCRN(localDir=localDir, baseFile=baseFile):
    files = getFileNameList(localDir=localDir)
    df = pd.DataFrame({})
    for file in files:
        print(file)
        #df = pd.read_csv(file, sep="\s+", names=colNames, na_values=["-9999.0"])
        dfY = pd.read_csv(file, sep="\s+", names=colNames, usecols=[1,2,3,4,6,7,8,9,10,11],
                         na_values=["-9999.0"], dtype=dtype_dic)
        df = pd.concat([df, dfY])
    return df
    
def getFileNameList(localDir=localDir):
    files = glob.glob(localDir + "*.txt")
    return files
  
# Get station information as pandas dataframe
# Data retrieved from https://www.ncei.noaa.gov/pub/data/uscrn/products/stations.tsv
def getStations():
    file = r"/home/mark/Downloads/stations.tsv"
    df = pd.read_csv(file, sep="\t")
    return df

if __name__ == "__main__":
    df = getUSCRN(localDir=localDir, baseFile=baseFile)
    
    df["DT_Local"] = pd.to_datetime(df.LST_DATE, format='%Y%m%d') + ( 
            pd.to_datetime(df.LST_TIME, format='%H%M') 
            - pd.to_datetime('1900', format='%Y') )
    
    dayList = list(set(df.LST_DATE))
    pd.to_datetime(df.LST_DATE, format='%Y%m%d')
    dayList.sort()
    L = len(dayList)
    tMaxArray = np.empty(L) * np.nan
    tMinArray = np.empty(L) * np.nan
    tMeanArray = np.empty(L) * np.nan
    tAvgArray = np.empty(L) * np.nan
    tMedianArray = np.empty(L) * np.nan
    
    for k, date in enumerate(dayList):
        mask = (df.LST_DATE == date) & (df.T_MAX == df.T_MAX)
        if sum(mask) > 0:
            tMaxArray[k] = max(df.T_MAX[mask])
            tMinArray[k] = min(df.T_MIN[mask])
            tMeanArray[k] = (tMaxArray[k] + tMinArray[k]) / 2
            tAvgArray[k] = np.mean(df.T_HR_AVG[mask])
            tMedianArray[k] = np.median(df.T_HR_AVG[mask])
            
    monthList = list([d[4:-2] for d in dayList])
        
    dfDaily = pd.DataFrame( {"day":dayList, "tMax":tMaxArray, "tMin":tMinArray,
                             "tMean":tMeanArray, "tAvg":tAvgArray, 
                             "tMed":tMedianArray, 
                             "date":pd.to_datetime(dayList, format='%Y%m%d'),
                             "month":monthList, "mAvg":np.empty(L), 
                             "mMean":np.empty(L), "mMed":np.empty(L), 
                             "aAvg":np.empty(L), "aMean":np.empty(L), 
                             "aMed":np.empty(L)} )
    
    for month in set(dfDaily.month):
        monMask = month == dfDaily.month
        dfDaily.mAvg[monMask] = np.mean(dfDaily.tAvg[monMask])
        dfDaily.mMean[monMask] = np.mean(dfDaily.tMean[monMask])
        dfDaily.mMed[monMask] = np.mean(dfDaily.tMed[monMask])
        
    dfDaily.aAvg = dfDaily.tAvg - dfDaily.mAvg
    dfDaily.aMean = dfDaily.tMean - dfDaily.mMean
    dfDaily.aMed = dfDaily.tMed - dfDaily.mMed
        
    maskAvg = dfDaily.aAvg == dfDaily.aAvg
    maskMean = dfDaily.aMean == dfDaily.aMean
    maskMed = dfDaily.aMed == dfDaily.aMed
    
    days = np.arange(0,L)
    pAvg = np.polyfit(days[maskAvg], dfDaily.aAvg[maskAvg], 1)
    pMean = np.polyfit(days[maskMean], dfDaily.aMean[maskMean], 1)
    pMed = np.polyfit(days[maskMed], dfDaily.aMed[maskMed], 1)
    
    plt.clf()
    plt.subplot(2,1,1)
    plt.plot(dfDaily.date[14:-15], runAvg(dfDaily.aMean, 30), '-c',
             dfDaily.date[14:-15], runAvg(dfDaily.aMed, 30), '-g',
             dfDaily.date[14:-15], runAvg(dfDaily.aAvg, 30), '-b',
             dfDaily.date[maskMean], np.polyval(pMean, days[maskMean]), '-c',
             dfDaily.date[maskMed], np.polyval(pMed, days[maskMed]), '--g',
             dfDaily.date[maskAvg], np.polyval(pAvg, days[maskAvg]), ':b')
    
    plt.legend(["Mean Anomaly (Delta T={0:4.2f})".format(np.mean(dfDaily.tMean - dfDaily.tAvg)), 
                "Median Anomaly (Delta T={0:4.2f})".format(np.mean(dfDaily.tMed - dfDaily.tAvg)), 
                "Average Anomaly",
                "Mean Anomaly Trend: {0:4.2f}/decade".format(pMean[0]*365.2425*10),
                "Median Anomaly Trend: {0:4.2f}/decade".format(pMed[0]*365.2425*10),
                "Average Anomaly Trend: {0:4.2f}/decade".format(pAvg[0]*365.2425*10)])
    
    plt.grid(axis="both")    
    plt.xlabel("Year")
    plt.ylabel(u"Anomaly (\u00B0C)")
    plt.title("USCRN {:s} Station Anomalies (30 day average)".format(baseFile.split("_")[1]))
    
    plt.subplot(2,1,2)
    plt.plot(dfDaily.date[14:-15], runAvg(dfDaily.aAvg-dfDaily.aMean, 30), '-c',
             dfDaily.date[14:-15], runAvg(dfDaily.aAvg-dfDaily.aMed, 30), '-g')
    
    plt.legend(["Residual (Avg - Mean) Anomaly", 
                "Residual (Avg - Median) Anomaly"])
    
    plt.grid(axis="both")    
    plt.xlabel("Year")
    plt.ylabel(u"Residual Anomaly (\u00B0C)")