def signal_process(table, event_var='PM25_Marseille_Longchamp', event_threshold=40,
                   duration=6, var_name_list='air pollution'):
    """Collect a fixed-length time window around every threshold exceedance.

    Every row where ``table[event_var] > event_threshold`` is treated as an
    event. For each event, the rows from two steps before the event up to
    (but excluding) ``duration`` steps after it are extracted, giving
    ``duration + 2`` rows per event. All windows are stacked into one object.

    Note: events are NOT de-duplicated per day; consecutive exceedances each
    produce their own (overlapping) window.

    Side effect: a ``"date"`` column (``datetime.date`` parsed from the
    ``datetime`` column with format ``%Y-%m-%d %H:%M:%S``) is added to
    ``table`` in place.

    param:
        table: DataFrame
            must expose a ``datetime`` column of ``"%Y-%m-%d %H:%M:%S"`` strings
        event_var: str
            column that is compared against the threshold to detect events
        event_threshold: int
            an event is any row whose ``event_var`` is strictly above this value
        duration: int
            number of rows kept from the event row onwards (the window also
            includes the two rows preceding the event)
        var_name_list: list or str
            column(s) to extract; a list yields a DataFrame, a single column
            name yields a Series

    return:
        The stacked windows, indexed by a two-level MultiIndex
        ``(event_date_index, hour_index)`` where ``event_date_index`` is the
        event row's ``datetime`` value and ``hour_index`` runs from ``-2`` to
        ``duration - 1``. Rows of a window that fall outside the table are
        filled with NaN. If no event is found, an empty object with the same
        columns and index names is returned.

    raise:
        AssertionError: if ``table`` has no ``datetime`` attribute.
    """
    index_names = ['event_date_index', 'hour_index']

    # Explicit raise instead of a bare ``assert`` so the check survives
    # ``python -O`` while keeping the exception type callers may expect.
    if not hasattr(table, "datetime"):
        raise AssertionError("table must provide a 'datetime' column")

    # Kept for backward compatibility: callers may rely on this column.
    table["date"] = [
        datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").date()
        for stamp in table.datetime
    ]

    # Positions (not index labels) of the rows that exceed the threshold.
    exceed_mask = (table[event_var] > event_threshold).to_numpy()
    event_positions = np.flatnonzero(exceed_mask)

    # Re-label rows 0..n-1 so that positional arithmetic on the window bounds
    # is valid whatever index the caller's table carries.
    watched = table[var_name_list].reset_index(drop=True)

    if len(event_positions) == 0:
        # Nothing exceeded the threshold: return an empty, well-formed result
        # rather than failing on an empty selection.
        empty = watched.iloc[:0].copy()
        empty.index = pd.MultiIndex.from_arrays([[], []], names=index_names)
        return empty

    offsets = range(-2, duration)
    windows = []
    for start in map(int, event_positions):
        # ``reindex`` pads rows that fall before the first or after the last
        # row with NaN, so every window has exactly ``duration + 2`` rows.
        window = watched.reindex(index=pd.RangeIndex(start - 2, start + duration))
        event_stamp = table.datetime.iloc[start]
        window.index = pd.MultiIndex.from_product(
            [[event_stamp], offsets], names=index_names
        )
        windows.append(window)

    return pd.concat(windows)