import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

Marseille=pd.read_csv("../stage/final_marseille.csv")
Marseille.head()

def signal_process(table,event_var='PM25_Marseille_Longchamp',event_threshold = 40, duration = 6,var_name_list='air pollution'):
    """collect time segments when pm2.5 hit above cetrain threshold non-overlap in a single day  
    param:
    event_var: str
        critical variable we would like to observe and select time segment based on it
    event_threshold: int
        critical value when event_var go above it
    duration: int 
        time length, lag of window
    var_name_list: list
        variable list that we would like to watch
    """
    
    assert(hasattr(table,"datetime"))
    table["date"] = list(map(lambda date: datetime.strptime(date,"%Y-%m-%d %H:%M:%S").date(),table.datetime))
    
    index_list=np.where(table[event_var]>event_threshold)[0]
    
    # add first element
    shrink_index_list = [index_list[0]]
    date_set ={table.date[index_list[0]]}
    
 
    for cnt in range(1,len(index_list)):
# #         exclude for the same day
#         if table.date[index_list[cnt]] in date_set:
#             pass
#         else:
            shrink_index_list.append(index_list[cnt].copy())
            date_set.add(table.date[index_list[cnt]])
    
    bind_list = []
    for start_index in shrink_index_list:
        this_sequence = table.loc[range(start_index-2,start_index+duration),
                                  var_name_list]
        start_date = table.datetime[start_index]
        tuples = list(zip([start_date]*(duration+2),range(-2,duration)))

        this_sequence.index = pd.MultiIndex.from_tuples(tuples,names=['event_date_index','hour_index'])
    #         this_sequence = (this_sequence- this_sequence.mean())/this_sequence.std()

        bind_list.append(this_sequence)
    new_table=pd.concat(bind_list)     
    
    return new_table

watch_list =["PM25_Marseille_Longchamp","asthme","dyspnée","respiration","banane"]

Marseille_post=signal_process(Marseille,event_var="PM25_Marseille_Longchamp",event_threshold=35,duration=12,var_name_list=watch_list)

Marseille_post.head(15)

Marseille_post.groupby("hour_index").mean().plot()
plt.show()

Marseille.to_csv("../output/marceille_with_overlap.csv")

	Unnamed: 0	datetime	PM25_Marseille_Longchamp	asthme	dyspnée	respiration	banane	banane_topic	la diarrhée	la diarrhée_topic	pollution de l’air	pollution de l’air_topic
0	0	2017-08-31 00:00:00	8.0	13	0	0	46	54	NaN	NaN	NaN	NaN
1	1	2017-08-31 01:00:00	9.0	0	0	79	23	35	NaN	NaN	NaN	NaN
2	2	2017-08-31 02:00:00	8.0	0	23	46	27	41	NaN	NaN	NaN	NaN
3	3	2017-08-31 03:00:00	7.0	0	0	68	10	20	NaN	NaN	NaN	NaN
4	4	2017-08-31 04:00:00	8.0	0	0	26	42	52	NaN	NaN	NaN	NaN

		PM25_Marseille_Longchamp	asthme	dyspnée	respiration	banane
event_date_index	hour_index
2017-09-30 23:00:00	-2	30.0	6	3	28	32
	-1	22.0	0	0	10	32
	0	39.0	44	0	17	38
	1	25.0	0	0	29	8
	2	31.0	0	0	46	37
	3	25.0	29	0	0	0
	4	8.0	52	26	26	14
	5	6.0	0	0	30	41
	6	3.0	0	7	0	52
	7	4.0	17	0	10	63
	8	6.0	7	5	9	58
	9	29.0	8	0	10	63
	10	15.0	6	2	21	62
	11	15.0	10	0	21	57
2017-10-31 20:00:00	-2	16.0	2	2	6	30