import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy import stats
import csv
from sklearn.preprocessing import StandardScaler      #for PCA/standardizeing
from sklearn.datasets import make_blobs  #clustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from datetime import datetime
from datetime import datetime

# Scaler instance created up front; nothing in the visible part of the
# script uses it.
sc = StandardScaler()

# Measures workbook. Watch out for rows whose date is wrongly formatted.
df = pd.read_excel(
    "C:/CovidData/DataForTrainingDT/DataNew_20210222/acaps_catmeas_20210223.xlsx"
)

# Only the categories are considered. The cluster labels are based on
# Covid_Clustering, but they are not used in this work.
df_cluster = pd.read_excel(
    "C:/CovidData/DataForTrainingDT/DataNew_20210222/df_cluster.xlsx"
)

# Case counts workbook (joined later on country code and report date).
df_cases = pd.read_excel(
    "C:/CovidData/DataForTrainingDT/DataNew_20210222/COVID-19-geographic-disbtribution-worldwide.xlsx"
)

# Population table; filtered further down by Time and Variant.
df_population = pd.read_csv(
    "C:/CovidData/DataForTrainingDT/DataNew_20210222/WPP2019_TotalPopulationBySex.csv"
)


# Keep only the log entries that introduce or extend a measure. The explicit
# .copy() makes df an independent frame, so the column assignment below does
# not write into a slice of the loaded data (avoids SettingWithCopyWarning).
df = df.loc[df.LOG_TYPE == 'Introduction / extension of measures'].copy()

# Parse the implementation date defensively: values that cannot be read as a
# timestamp become NaT instead of making the .dt accessor fail. A column that
# is already datetime64 passes through unchanged.
# NOTE(review): rows with an unparseable date are dropped silently together
# with the null ones - confirm that this is the desired handling.
implemented = pd.to_datetime(df['DATE_IMPLEMENTED'], errors='coerce')
has_date = implemented.notna()
df = df.loc[has_date].copy()  # remove rows with null / unusable date

# Turn the numeric cluster ids 0..4 into the labels 'cluster0'..'cluster4'.
cluster_labels = {number: f'cluster{number}' for number in range(5)}
df_cluster['cluster'] = df_cluster['cluster'].replace(cluster_labels)

# Population rows for the year 2020, 'Medium' variant only.
df_population = df_population.loc[
    (df_population.Time == 2020) & (df_population.Variant == 'Medium')
]

# Date formatting: both date columns are rewritten as YYYYMMDD numbers stored
# as float, so the two frames can later be joined on equal values.
df['DATE_IMPLEMENTED'] = implemented[has_date].dt.strftime('%Y%m%d').astype(float)
# dateRep must already be a datetime column for the .dt accessor to work.
df_cases['dateRep'] = df_cases['dateRep'].dt.strftime('%Y%m%d').astype(float)

# Snapshot of the filtered measures with the converted dates.
df.to_csv('C:/CovidData/DataForTrainingDT/df_temp20201119.csv')

# Attach the cluster information to every measure row via the country code.
# Columns present in both frames (other than the key) get the suffixes below.
df_with_cluster = pd.merge(
    df,
    df_cluster,
    on='ISO',
    how='inner',
    suffixes=('_df', '_df_cluster'),
)

# The date column only carries the '_df' suffix when df_cluster has a
# DATE_IMPLEMENTED column as well; otherwise it keeps its plain name. Pick
# whichever is present so the next merge does not fail with a KeyError.
date_key = 'DATE_IMPLEMENTED_df'
if date_key not in df_with_cluster.columns:
    date_key = 'DATE_IMPLEMENTED'

# Merge the result with the case counts: a row is kept only when the country
# code and the YYYYMMDD date value match on both sides.
df_with_cluster_cases = pd.merge(
    df_cases,
    df_with_cluster,
    left_on=['countryterritoryCode', 'dateRep'],
    right_on=['ISO', date_key],
    how='inner',
    suffixes=('_df_cases', '_df_with_cluster'),
)

# Float columns (including the numeric date keys) are written with three
# decimals.
df_with_cluster_cases.to_csv(
    'C:/CovidData/DataForTrainingDT/DataNew_20210222/df_with_cluster_cases_20210223.csv',
    header=True,
    float_format='%.3f',
)




