Historical Map  1.0
Plugin for automatic extraction of old forest from historical map
 All Classes Namespaces Files Functions Variables
gmm_ridge.py
Go to the documentation of this file.
1 """!@brief Script by Mathieu Fauvel which performs Gaussian Mixture Model
2 """# -*- coding: utf-8 -*-
3 import scipy as sp
4 from scipy import linalg
5 import multiprocessing as mp
6 
7 ## Temporary predict function
8 def predict(tau,model,xT,yT):
9  err = sp.zeros(tau.size)
10  for j,t in enumerate(tau):
11  yp = model.predict(xT,tau=t)[0]
12  eq = sp.where(yp.ravel()==yT.ravel())[0]
13  err[j] = eq.size*100.0/yT.size
14  return err
15 
16 
17 class CV:
18  '''
19  This class implements the generation of several folds to be used in the cross validation
20  '''
21  def __init__(self):
22  self.it=[]
23  self.iT=[]
24 
25  def split_data(self,n,v=5):
26  ''' The function split the data into v folds. Whatever the number of sample per class
27  Input:
28  n : the number of samples
29  v : the number of folds
30  Output: None
31  '''
32  step = n //v # Compute the number of samples in each fold
33  sp.random.seed(1) # Set the random generator to the same initial state
34  t = sp.random.permutation(n) # Generate random sampling of the indices
35 
36  indices=[]
37  for i in range(v-1): # group in v fold
38  indices.append(t[i*step:(i+1)*step])
39  indices.append(t[(v-1)*step:n])
40 
41  for i in range(v):
42  self.iT.append(sp.asarray(indices[i]))
43  l = range(v)
44  l.remove(i)
45  temp = sp.empty(0,dtype=sp.int64)
46  for j in l:
47  temp = sp.concatenate((temp,sp.asarray(indices[j])))
48  self.it.append(temp)
49 
50  def split_data_class(self,y,v=5):
51  ''' The function split the data into v folds. The samples of each class are split approximatly in v folds
52  Input:
53  n : the number of samples
54  v : the number of folds
55  Output: None
56  '''
57  # Get parameters
58  n = y.size
59  C = y.max().astype('int')
60 
61  # Get the step for each class
62  tc = []
63  for j in range(v):
64  tempit = []
65  tempiT = []
66  for i in range(C):
67  # Get all samples for each class
68  t = sp.where(y==(i+1))[0]
69  nc = t.size
70  stepc = nc // v # Step size for each class
71  if stepc == 0:
72  print "Not enough sample to build "+ str(v) +" folds in class " + str(i)
73  sp.random.seed(i) # Set the random generator to the same initial state
74  tc = t[sp.random.permutation(nc)] # Random sampling of indices of samples for class i
75 
76  # Set testing and training samples
77  if j < (v-1):
78  start,end = j*stepc,(j+1)*stepc
79  else:
80  start,end = j*stepc,nc
81  tempiT.extend(sp.asarray(tc[start:end])) #Testing
82  k = range(v)
83  k.remove(j)
84  for l in k:
85  if l < (v-1):
86  start,end = l*stepc,(l+1)*stepc
87  else:
88  start,end = l*stepc,nc
89  tempit.extend(sp.asarray(tc[start:end])) #Training
90 
91  self.it.append(tempit)
92  self.iT.append(tempiT)
93 
94 
95 class GMMR:
96  def __init__(self):
97  self.ni = []
98  self.prop = []
99  self.mean = []
100  self.cov =[]
101  self.Q = []
102  self.L = []
103  self.tau = 0.0
104 
105  def learn(self,x,y):
106  '''
107  Function that learns the GMM with ridge regularizationb from training samples
108  Input:
109  x : the training samples
110  y : the labels
111  Output:
112  the mean, covariance and proportion of each class, as well as the spectral decomposition of the covariance matrix
113  '''
114 
115  ## Get information from the data
116  C = int(y.max(0)) # Number of classes
117  n = x.shape[0] # Number of samples
118  d = x.shape[1] # Number of variables
119  eps = sp.finfo(sp.float64).eps
120 
121  ## Initialization
122  self.ni = sp.empty((C,1)) # Vector of number of samples for each class
123  self.prop = sp.empty((C,1)) # Vector of proportion
124  self.mean = sp.empty((C,d)) # Vector of means
125  self.cov = sp.empty((C,d,d)) # Matrix of covariance
126  self.Q = sp.empty((C,d,d)) # Matrix of eigenvectors
127  self.L = sp.empty((C,d)) # Vector of eigenvalues
128 
129  ## Learn the parameter of the model for each class
130  for c in range(C):
131  j = sp.where(y==(c+1))[0]
132  self.ni[c] = float(j.size)
133  self.prop[c] = self.ni[c]/n
134  self.mean[c,:] = sp.mean(x[j,:],axis=0)
135  self.cov[c,:,:] = sp.cov(x[j,:],bias=1,rowvar=0) # Normalize by ni to be consistent with the update formulae
136 
137  # Spectral decomposition
138  L,Q = linalg.eigh(self.cov[c,:,:])
139  idx = L.argsort()[::-1]
140  self.L[c,:] = L[idx]
141  self.Q[c,:,:]=Q[:,idx]
142 
143  def predict(self,xt,tau=None,proba=None):
144  '''
145  Function that predict the label for sample xt using the learned model
146  Inputs:
147  xt: the samples to be classified
148  Outputs:
149  y: the class
150  K: the decision value for each class
151  '''
152  ## Get information from the data
153  nt = xt.shape[0] # Number of testing samples
154  C = self.ni.shape[0] # Number of classes
155 
156  ## Initialization
157  K = sp.empty((nt,C))
158  if tau is None:
159  TAU=self.tau
160  else:
161  TAU=tau
162 
163  for c in range(C):
164  invCov,logdet = self.compute_inverse_logdet(c,TAU)
165  cst = logdet - 2*sp.log(self.prop[c]) # Pre compute the constant
166 
167  xtc = xt-self.mean[c,:]
168  temp = sp.dot(invCov,xtc.T).T
169  K[:,c] = sp.sum(xtc*temp,axis=1)+cst
170  del temp,xtc
171 
172  ## Assign the label to the minimum value of K
173  yp = sp.argmin(K,1)+1
174  if proba is None:
175  return yp
176  else:
177  return yp,K
178 
179  def compute_inverse_logdet(self,c,tau):
180  Lr = self.L[c,:]+tau # Regularized eigenvalues
181  temp = self.Q[c,:,:]*(1/Lr)
182  invCov = sp.dot(temp,self.Q[c,:,:].T) # Pre compute the inverse
183  logdet = sp.sum(sp.log(Lr)) # Compute the log determinant
184  return invCov,logdet
185 
186  def BIC(self,x,y,tau=None):
187  '''
188  Computes the Bayesian Information Criterion of the model
189  '''
190  ## Get information from the data
191  C,d = self.mean.shape
192  n = x.shape[0]
193 
194  ## Initialization
195  if tau is None:
196  TAU=self.tau
197  else:
198  TAU=tau
199 
200  ## Penalization
201  P = C*(d*(d+3)/2) + (C-1)
202  P *= sp.log(n)
203 
204  ## Compute the log-likelihood
205  L = 0
206  for c in range(C):
207  j = sp.where(y==(c+1))[0]
208  xi = x[j,:]
209  invCov,logdet = self.compute_inverse_logdet(c,TAU)
210  cst = logdet - 2*sp.log(self.prop[c]) # Pre compute the constant
211  xi -= self.mean[c,:]
212  temp = sp.dot(invCov,xi.T).T
213  K = sp.sum(xi*temp,axis=1)+cst
214  L +=sp.sum(K)
215  del K,xi
216 
217  return L + P
218 
219  def cross_validation(self,x,y,tau,v=5):
220  '''
221  Function that computes the cross validation accuracy for the value tau of the regularization
222  Input:
223  x : the training samples
224  y : the labels
225  tau : a range of values to be tested
226  v : the number of fold
227  Output:
228  err : the estimated error with cross validation for all tau's value
229  '''
230  ## Initialization
231  ns = x.shape[0] # Number of samples
232  np = tau.size # Number of parameters to test
233  cv = CV() # Initialization of the indices for the cross validation
234  cv.split_data_class(y)
235  err = sp.zeros(np) # Initialization of the errors
236 
237  ## Create GMM model for each fold
238  model_cv = []
239  for i in range(v):
240  model_cv.append(GMMR())
241  model_cv[i].learn(x[cv.it[i],:], y[cv.it[i]])
242 
243  ## Initialization of the pool of processes
244  pool = mp.Pool()
245  processes = [pool.apply_async(predict,args=(tau,model_cv[i],x[cv.iT[i],:],y[cv.iT[i]])) for i in range(v)]
246  pool.close()
247  pool.join()
248  for p in processes:
249  err += p.get()
250  err /= v
251 
252  ## Free memory
253  for model in model_cv:
254  del model
255 
256  del processes,pool,model_cv
257 
258  return tau[err.argmax()],err
def predict
Temporary predict function.
Definition: gmm_ridge.py:8
ni
Get information from the data.
Definition: gmm_ridge.py:97