import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy import stats
import csv
from sklearn.preprocessing import StandardScaler      #for PCA/standardizeing
from sklearn.datasets import make_blobs  #clustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from datetime import datetime

# Standardizer used to z-score the single clustering feature.
sc = StandardScaler()

# ACAPS measures export in which the category and measure columns were
# manually concatenated into Cat_Meas before loading.
df = pd.read_excel("C:/CovidData/DataForTrainingDT/DataNew_20210222/acaps_catmeas_20210223.xlsx")

# Keep only the rows that introduce or extend a measure.
df = df.loc[df.LOG_TYPE == 'Introduction / extension of measures']

# Some rows may carry badly formatted dates. Coerce anything unparseable to
# NaT so it is removed together with the genuinely missing dates; assign()
# returns a new frame, which avoids writing into the filtered slice above.
df = df.assign(DATE_IMPLEMENTED=pd.to_datetime(df['DATE_IMPLEMENTED'], errors='coerce'))
df = df.dropna(subset=['DATE_IMPLEMENTED'])

# Earliest implementation date per country. ISO codes are used as the key so
# that country naming is consistent. The string "min" selects the groupby
# reduction explicitly (passing the builtin `min` is deprecated in pandas).
df_group_country = df.groupby(["ISO"], as_index=False).agg({'DATE_IMPLEMENTED': 'min'})

# One-column frame holding the per-country date used for clustering.
data_for_clustering = df_group_country[['DATE_IMPLEMENTED']].copy()

# StandardScaler needs numeric input: express each date as (fractional) days
# since the Unix epoch instead of relying on an implicit datetime -> float
# cast. Standardization is invariant to the unit, so the z-scores are the same.
days_since_epoch = (
    data_for_clustering['DATE_IMPLEMENTED'] - pd.Timestamp("1970-01-01")
) / pd.Timedelta(days=1)
data_scaled = sc.fit_transform(days_since_epoch.to_frame(name='DATE_IMPLEMENTED'))
data_scaled_df = pd.DataFrame(data_scaled)

# Elbow analysis: fit k-means for several values of k and record the inertia
# (within-cluster sum of squared distances) to help choose the best k.
# KMeans raises when n_clusters exceeds the number of samples, so the upper
# bound (19, as before) is capped at the sample count. The range is defined
# once so the fitted models and the plotted x-axis cannot drift apart.
cluster_counts = range(1, min(19, len(data_scaled)) + 1)

SSE = []
for k in cluster_counts:
    # n_init is pinned because its default changed across scikit-learn
    # releases; random_state makes the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=0)
    kmeans.fit(data_scaled)
    SSE.append(kmeans.inertia_)

# Collect the results in a dataframe and plot inertia against k.
frame = pd.DataFrame({'Cluster': cluster_counts, 'SSE': SSE})
plt.figure(figsize=(12, 6))
plt.plot(frame['Cluster'], frame['SSE'], marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

# Final model with 5 clusters (capped at the sample count, since KMeans cannot
# form more clusters than there are samples). n_init and random_state are set
# explicitly so the labels written to the CSV below are reproducible and do
# not depend on the scikit-learn version's defaults.
kmeans = KMeans(n_clusters=min(5, len(data_scaled_df)), n_init=10, random_state=0)
kmeans.fit(data_scaled_df)
pred = kmeans.predict(data_scaled_df)

# Number of points assigned to each cluster. Work on a copy so that adding
# the label column does not touch data_scaled_df.
frame = data_scaled_df.copy()
frame['cluster'] = pred
print(frame['cluster'].value_counts())
centers = kmeans.cluster_centers_
print("centers:", centers)

data_labels = kmeans.labels_

# Attach the cluster label to each country. `pred` is in the same row order
# as df_group_country (the scaled data was built from it), so it is assigned
# positionally rather than through index alignment.
df_group_country['cluster'] = pred
df_group_country.to_csv(r'C:/CovidData/DataForTrainingDT/DataNew_20210222/df_cluster.csv', header=True)

# Bring the cluster label of each country onto every measure row. Both frames
# have a DATE_IMPLEMENTED column, so pandas suffixes them: _x comes from
# df_group_country and _y from the individual rows of df.
df_inner_cluster = pd.merge(df_group_country, df, how='inner', left_on=['ISO'], right_on=['ISO'])

# Earliest date on which each country implemented each measure. "min" is used
# instead of "first": "first" returns whichever row happens to come first in
# the file, which is only the earliest date if the input is already sorted.
df_group = df_inner_cluster.groupby(["Cat_Meas", "ISO", "cluster"], as_index=False).agg({'DATE_IMPLEMENTED_y': "min"})

# Order by date; a stable sort keeps ties in a deterministic order.
df_group = df_group.sort_values(by='DATE_IMPLEMENTED_y', ascending=True, kind='mergesort')

# Report the five earliest countries for each measure exactly once.
# Iterating over the raw Cat_Meas column would repeat the same report for
# every row of a measure; grouping with sort=False visits each measure once,
# in order of first appearance, and keeps the date ordering within it.
for rule, temp in df_group.groupby('Cat_Meas', sort=False):
    print("5 first leaders countries of", rule, temp['ISO'].head(5))