import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn.model_selection import train_test_split  # Import train_test_split function
from sklearn import metrics  # Import scikit-learn metrics module for accuracy calculation
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz


# using DT regression

# Column holding the 14-day cumulative case rate, before and after renaming
# (the hyphen in "COVID-19" is replaced by an underscore).
_RAW_CASES_COLUMN = "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000"
_CASES_COLUMN = "Cumulative_number_for_14_days_of_COVID_19_cases_per_100000"

# Raw ``Cat_Meas`` labels paired with the level each one is collapsed into.
# Rows that end up as 'Deleted' are dropped before any further processing.
_MEASURE_GROUPS = (
    ('Deleted', [
        'Governance and socio-economic measures.Military deployment',
        'Humanitarian exemption.Humanitarian exemptions',
        'Lockdown.Lockdown of refugee/idp camps or other minorities',
        'Movement restrictions.Checkpoints within the country',
        'Public health measures.Amendments to funeral and burial regulations',
        'Social distancing.Changes in prison-related policies',
    ]),
    ('No Lockdown', [
        'Governance and socio-economic measures.Limit product imports/exports',
        'Governance and socio-economic measures.Economic measures',
        'Movement restrictions.Border checks',
        'Movement restrictions.Visa restrictions',
        'Movement restrictions.Additional health/documents requirements upon arrival',
        'Public health measures.Health screenings in airports and border crossings',
        'Public health measures.Awareness campaigns',
        'Public health measures.Strengthening the public health system',
    ]),
    ('Social Distancing', [
        'Public health measures.Other public health measures enforced',
        'Public health measures.General recommendations',
        'Public health measures.Requirement to wear protective gear in public',
        'Public health measures.Testing policy',
        'Public health measures.Psychological assistance and medical social work',
        'Public health measures.Obligatory medical tests not related to COVID-19',
    ]),
    ('Soft Lockdown', [
        'Lockdown.Partial lockdown',
        'Movement restrictions.International flights suspension',
        'Public health measures.Isolation and quarantine policies',
        'Public health measures.Mass population testing',
        'Social distancing.Limit public gatherings',
    ]),
    ('Hard Lockdown', [
        'Governance and socio-economic measures.Emergency administrative structures activated or established',
        'Governance and socio-economic measures.State of emergency declared',
        'Lockdown.Full lockdown',
        'Movement restrictions.Border closure',
        'Movement restrictions.Surveillance and monitoring',
        'Movement restrictions.Domestic travel restrictions',
        'Movement restrictions.Curfews',
        'Movement restrictions.Complete border closure',
        'Social distancing.Schools closure',
        'Social distancing.Closure of businesses and public services',
    ]),
)

# (level name, column prefix used in the accuracy tables, weight per measure).
# There are 8 No Lockdown, 6 Social, 5 Soft and 10 Hard measures, so the
# weights are roughly 1/8, 1/6, 1/5 and 1/10.
_LEVEL_SPECS = (
    ('No Lockdown', 'yno', 0.125),
    ('Social Distancing', 'ysoc', 0.17),
    ('Soft Lockdown', 'ysof', 0.2),
    ('Hard Lockdown', 'yhar', 0.1),
)
_LEVELS = [spec[0] for spec in _LEVEL_SPECS]

# Edges of the two-week bins applied to the scaled DATE_IMPLEMENTED_df values.
_BIWEEKLY_EDGES = [
    20200100, 20200114, 20200128, 20200211, 20200225, 20200310, 20200324,
    20200407, 20200421, 20200505, 20200519, 20200602, 20200616, 20200630,
    20200714, 20200728, 20200811, 20200825, 20200908, 20200922, 20201006,
    20201020, 20201103, 20201117, 20201201, 20201215,
]


def _collapse_measures(cat_meas):
    """Return *cat_meas* with every raw label replaced by its level name."""
    for level, raw_labels in _MEASURE_GROUPS:
        cat_meas = cat_meas.replace(raw_labels, level)
    return cat_meas


def _add_level_weights(df):
    """Return a copy of *df* with one weighted indicator column per level.

    Each row gets the level's weight in the column matching its ``Cat_Meas``
    and 0 in the other level columns.  Rows whose ``Cat_Meas`` is missing or
    is not one of the four levels get 0 everywhere.
    """
    out = df.copy()
    for level, _prefix, weight in _LEVEL_SPECS:
        # Boolean mask times weight yields a float column directly, instead of
        # replacing strings by numbers inside an object-dtype column.
        out[level] = out['Cat_Meas'].eq(level) * weight
    return out


def _sum_levels_by(df, keys, other_aggs):
    """Group *df* by *keys*, summing the level columns.

    *other_aggs* maps the remaining columns to keep to their aggregation.
    """
    aggs = dict(other_aggs)
    aggs.update(dict.fromkeys(_LEVELS, 'sum'))
    return df.groupby(keys, as_index=False).agg(aggs)


def _snap_levels(df):
    """Return a copy of *df* with the level columns snapped to 0, 0.5 or 1.

    Per column: values above 0.6 become 1; the listed near-half values
    (0.51 for Social Distancing, 0.4 and 0.6 for Soft and Hard Lockdown)
    become 0.5; values below 0.4 become 0.  Anything else is left unchanged.
    """
    out = df.copy()
    # The columns are sums of floats, so e.g. 0.2 + 0.2 + 0.2 may not compare
    # equal to 0.6.  Rounding first makes the equality tests below reliable.
    out[_LEVELS] = out[_LEVELS].round(3)

    for level in _LEVELS:
        out.loc[out[level] > 0.6, level] = 1

    out.loc[out['Social Distancing'] == 0.51, 'Social Distancing'] = 0.5
    for level in ('Soft Lockdown', 'Hard Lockdown'):
        out.loc[out[level].isin([0.4, 0.6]), level] = 0.5

    for level in _LEVELS:
        out.loc[out[level] < 0.4, level] = 0
    return out


def _accuracy_frame(y_true, y_pred):
    """Build the actual / predicted / absolute-error table for one data split.

    *y_true* is a DataFrame with the four level columns; *y_pred* is the 2-D
    prediction array whose columns are in the same order as ``_LEVELS``.
    The result has ``<prefix>_ac``, ``<prefix>_pr`` and ``<prefix>_sub``
    columns and keeps the index of *y_true*.
    """
    frame = pd.DataFrame(index=y_true.index)
    for level, prefix, _weight in _LEVEL_SPECS:
        frame[prefix + '_ac'] = y_true[level]
    for position, (_level, prefix, _weight) in enumerate(_LEVEL_SPECS):
        frame[prefix + '_pr'] = y_pred[:, position]
    for _level, prefix, _weight in _LEVEL_SPECS:
        frame[prefix + '_sub'] = (frame[prefix + '_ac'] - frame[prefix + '_pr']).abs()
    return frame


def _report_accuracy(frame, frame_label, split, tolerance, ratio_suffix='', level_suffix=''):
    """Print the error summary for one accuracy table.

    Prints the table itself, the total absolute error relative to the total
    of the actual values, the row count, the number of exact No Lockdown
    matches and, per level, the share of rows whose absolute error is below
    *tolerance*.
    """
    print(frame_label + ":", frame)

    prefixes = [spec[1] for spec in _LEVEL_SPECS]
    total_error = sum(frame[prefix + '_sub'].sum() for prefix in prefixes)
    total_actual = sum(frame[prefix + '_ac'].sum() for prefix in prefixes)
    print("sum abs errors " + split + " per sum actuals" + ratio_suffix + ":",
          total_error / total_actual)

    n_rows = frame['ysoc_ac'].count()
    print(n_rows)
    print(frame['yno_ac'].loc[frame.yno_ac == frame.yno_pr].count())

    for level, prefix, _weight in _LEVEL_SPECS:
        # Always filter on the absolute error column; filtering on the actual
        # value would count rows with a small target, not good predictions.
        within_tolerance = frame[prefix + '_sub'] < tolerance
        hits = frame.loc[within_tolerance, prefix + '_sub'].count()
        print("Number of correct (on {}) prediction with accuracy of {} for {}{}:".format(
            split, tolerance, level, level_suffix), hits / n_rows)


class DTRegression():
    """Decision-tree regression of lockdown levels from COVID-19 case data.

    The whole pipeline runs once, when this class body is executed: it reads
    the measures spreadsheet, collapses ``Cat_Meas`` into four levels,
    aggregates per country and day and then per country and two-week bin,
    snaps the level scores to 0 / 0.5 / 1, fits decision-tree regressors and
    prints accuracy summaries.  Intermediate frames are written to CSV and
    the depth-limited tree is exported as a Graphviz ``.dot`` file.

    Intermediate results (``df``, ``X_train``, ``regressor``,
    ``regressor_opt``, ``df_accuracy`` and so on) remain available as class
    attributes.
    """

    # manually change df_with_cluster_cases_20210223 csv file to excel
    df = pd.read_excel(
        "C:/CovidData/DataForTrainingDT/DataNew_20210222/df_with_cluster_cases_20210223.xlsx")

    # pre-processing
    df = df.rename(columns={_RAW_CASES_COLUMN: _CASES_COLUMN})
    df[_CASES_COLUMN] = df[_CASES_COLUMN].fillna(0)
    df['Cat_Meas'] = _collapse_measures(df['Cat_Meas'])

    # Copy so the column assignments below act on an independent frame rather
    # than on a slice of the frame that was read from disk.
    df = df.loc[df.Cat_Meas != 'Deleted'].copy()

    # in order to be in proper yyyymmdd format
    df['DATE_IMPLEMENTED_df'] = df['DATE_IMPLEMENTED_df'] / 1000
    df[_CASES_COLUMN] = df[_CASES_COLUMN] / 1000

    min_df = df['DATE_IMPLEMENTED_df'].min()
    print(min_df)

    # change time to 2 weeks
    # NOTE(review): dates outside the bin edges presumably get no bin, which
    # would make the astype(int) further down fail -- confirm the data range.
    df["week_bin"] = pd.cut(x=df.DATE_IMPLEMENTED_df,
                            bins=_BIWEEKLY_EDGES,
                            labels=list(range(1, len(_BIWEEKLY_EDGES))))

    df = _add_level_weights(df)

    # merge in one day first
    df = _sum_levels_by(df, ["COUNTRY", "DATE_IMPLEMENTED_df"],
                        {'week_bin': 'first', _CASES_COLUMN: 'first', 'REGION': 'first'})
    df = df.reset_index()

    df.to_csv('C:/CovidData/DataForTrainingDT/DataNew_20210222/df20210410g0.csv',
              float_format='%.3f')

    print(df.describe())
    print("ff", df['week_bin'].dtypes)

    df['week_bin'] = df['week_bin'].astype(int)
    df[_LEVELS] = df[_LEVELS].astype(float)

    # NOTE(review): this path has a different root than every other output
    # path in this class -- confirm it is intended.
    df.to_csv('C:/Users/maleebrahimdeh/Desktop/CovidData/DataForTrainingDT/DataNew_20210222/df20210410g.csv',
              float_format='%.3f')

    # then merge the days of each two-week bin
    df = _sum_levels_by(df, ["COUNTRY", "week_bin"],
                        {'DATE_IMPLEMENTED_df': 'first', _CASES_COLUMN: 'mean', 'REGION': 'first'})
    df = df.sort_values(by='DATE_IMPLEMENTED_df', ascending=True)

    # Written before snapping; the same file is overwritten with the snapped
    # values a few lines below.
    df.to_csv('C:/CovidData/DataForTrainingDT/DataNew_20210222/df20210410br.csv',
              float_format='%.3f')

    # v2 (test is goed)
    df = _snap_levels(df)

    df.to_csv('C:/CovidData/DataForTrainingDT/DataNew_20210222/df20210410br.csv',
              float_format='%.3f')

    # , 'COUNTRY', , 'deaths per population' (accuracy with death is better)
    feature_cols_X = ['week_bin', 'REGION', _CASES_COLUMN]
    feature_cols_y = list(_LEVELS)
    X = df[feature_cols_X]  # Features
    y = df[feature_cols_y]

    one_hot_data_X = pd.get_dummies(X)

    # 80% training and 20% test
    X_train, X_test, y_train, y_test = train_test_split(one_hot_data_X, y, test_size=0.2,
                                                        random_state=1)
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)

    ##accuracy of train set
    y_pred_train = regressor.predict(X_train)
    df_accuracy_train = _accuracy_frame(y_train, y_pred_train)
    _report_accuracy(df_accuracy_train, "df_accuracy_train", "train", 0.1)

    ##test set prediction
    y_pred = regressor.predict(X_test)
    print("y_pred", y_pred)

    ##accuracy of test set
    df_accuracy = _accuracy_frame(y_test, y_pred)
    _report_accuracy(df_accuracy, "df_accuracy", "test", 0.1)

    # optimizing: depth-limited tree.  It must be fitted itself; fitting
    # `regressor` again here would leave the depth limit without any effect.
    regressor_opt = DecisionTreeRegressor(max_depth=7)
    regressor_opt.fit(X_train, y_train)
    y_pred_opt = regressor_opt.predict(X_test)

    ##accuracy of test set
    df_accuracy_opt = _accuracy_frame(y_test, y_pred_opt)
    _report_accuracy(df_accuracy_opt, "df_accuracy_opt", "test", 0.001,
                     ratio_suffix=" (optimal solution)", level_suffix=" (optimal)")

    export_graphviz(regressor_opt,
                    out_file='C:/CovidData/DataForTrainingDT/DataNew_20210222/tree20210410.dot',
                    feature_names=one_hot_data_X.columns)

    # Hand-written feature vectors in the column order of one_hot_data_X.
    # NOTE(review): presumably week_bin, case rate, then six one-hot REGION
    # columns -- confirm against one_hot_data_X.columns.
    for input_values in ([9, 1.705, 0, 0, 1, 0, 0, 0],
                         [9, 62, 0, 0, 1, 0, 0, 0]):
        prediction = regressor.predict([input_values])
        prediction = [round(x, 2) for x in prediction[0]]
        print("input value prediction:", prediction)
