data_generator.py
"""
This is the generator module: it streams batches of image data and their labels to Keras.
"""

__version__ = '1.0'
__author__ = 'Saul Alonso-Monsalve'
__email__ = 'saul.alonso.monsalve@cern.ch'

import zlib

import numpy as np

class DataGenerator(object):
    """Generates data for Keras."""

    def __init__(self, cells=500, planes=500, views=3, batch_size=32, branches=True,
                 outputs=7, standardize=True, images_path='/', shuffle=True, test_values=None):
        """Initialization function of the class."""
        self.cells = cells
        self.planes = planes
        self.views = views
        self.batch_size = batch_size
        self.branches = branches
        self.outputs = outputs
        self.images_path = images_path
        self.standardize = standardize
        self.shuffle = shuffle
        # Avoid a mutable default argument: use a fresh list unless one is provided.
        self.test_values = test_values if test_values is not None else []

    def generate(self, labels, list_IDs, yield_labels=True):
        """Goes through the dataset and yields one batch of samples at a time."""

        # Infinite loop
        while True:
            # Generate a random order of exploration of the dataset (to make each epoch different)
            indexes = self.__get_exploration_order(list_IDs)

            # Generate batches
            imax = len(indexes) // self.batch_size  # number of batches

            for i in range(imax):
                # Find the list of IDs for one batch
                list_IDs_temp = [list_IDs[k] for k in indexes[i*self.batch_size:(i+1)*self.batch_size]]

                # Generate data
                if yield_labels:
                    # Train, validation
                    X, y = self.__data_generation(labels, list_IDs_temp, yield_labels)
                    yield X, y
                else:
                    # Test, predictions
                    X = self.__data_generation(labels, list_IDs_temp, yield_labels)
                    yield X

    def __get_exploration_order(self, list_IDs):
        """Generates a random order of exploration for a given list of IDs.

        If shuffling is enabled, the order in which the examples are fed to the
        classifier is randomized so that batches between epochs do not look alike,
        which makes the model more robust.
        """
        # Find exploration order
        indexes = np.arange(len(list_IDs))

        if self.shuffle:
            np.random.shuffle(indexes)

        return indexes

    def __data_generation(self, labels, list_IDs_temp, yield_labels):
        """Generates data of batch_size samples.

        Only needs to know the list of IDs included in the batch and their
        corresponding labels. X has shape (n_samples, planes, cells, n_channels).
        """
        # Initialization
        if self.branches:
            # X should be a list of length == views (one input branch per view)
            X = [None] * self.views

            for view in range(self.views):
                X[view] = np.empty((self.batch_size, self.planes, self.cells, 1))
        else:
            # X shouldn't be a list because there is only one branch
            X = np.empty((self.batch_size, self.planes, self.cells, self.views))

        if yield_labels:
            # Only include the labels when requested (train, validation)
            if self.outputs == 1:
                y = np.empty(self.batch_size, dtype=int)
            else:
                y = np.empty((self.batch_size, self.outputs), dtype=int)

        # Generate data
        for i, ID in enumerate(list_IDs_temp):
            # Decompress image into a pixel NumPy tensor
            with open(self.images_path + '/' + ID.split('.')[0].lstrip('a') + '/images/' + ID + '.gz', 'rb') as image_file:
                pixels = np.frombuffer(zlib.decompress(image_file.read()), dtype=np.uint8).reshape(self.views, self.planes, self.cells)
            # pixels = np.load(self.images_path + '/' + labels[ID] + '/' + ID + '.npy')

            if self.standardize:
                # Standardize the image
                pixels = pixels.astype('float32')  # 32-bit precision floating-point pixel image
                pixels /= 255.                     # pixel range from 0 to 1

            # Store volume
            if self.branches:
                for view in range(self.views):
                    X[view][i, :, :, :] = pixels[view, :, :].reshape(self.planes, self.cells, 1)
            else:
                pixels = np.rollaxis(pixels, 0, 3)  # from 'channels_first' to 'channels_last'
                X[i, :, :, :] = pixels

            # Get y value
            y_value = labels[ID]

            if yield_labels:
                # Store class/label (train, validation)
                y[i] = y_value
            else:
                # Store the actual label and energy values (used for the confusion matrix and normalization);
                # open in text mode so the values can be parsed with float()
                with open(self.images_path + '/' + ID.split('.')[0].lstrip('a') + '/info/' + ID + '.info', 'r') as info_file:
                    energy_values = info_file.readlines()
                    self.test_values.append({'y_value': y_value,
                                             'fNuEnergy': float(energy_values[1]),
                                             'fLepEnergy': float(energy_values[2]),
                                             'fRecoNueEnergy': float(energy_values[3]),
                                             'fRecoNumuEnergy': float(energy_values[4]),
                                             'fEventWeight': float(energy_values[5])})

        if yield_labels:
            # Return X and y (train, validation)
            if self.outputs == 1:
                return X, self.sparsify1(y)
            if self.outputs == 5:
                return X, self.sparsify5(y)
            return X, self.sparsify7(y)

        # Return X only (test, predictions)
        return X

    # Please note that Keras only accepts labels written in a binary (one-hot) form:
    # in a 6-label problem, the third label is written [0 0 1 0 0 0]. The sparsify
    # functions below perform this conversion when y is a list of numerical values.

    def sparsify1(self, y):
        """Returns labels as a binary NumPy array."""
        return np.array([[1 if y[i] == j else 1 if y[i]-1 == j and j == 12 else 0 for j in range(13)]
                         for i in range(y.shape[0])])

    def sparsify2(self, y):
        """Returns labels as a list of binary NumPy arrays."""
        res = [None] * 2
        res[0] = np.zeros((y.shape[0], 4), dtype=int)
        res[1] = np.zeros((y.shape[0], 4), dtype=int)

        for i in range(y.shape[0]):
            res[0][i][(y[i] // 4)] = 1  # CC Numu, CC Nue, CC Nutau
            res[1][i][(y[i] % 4)] = 1   # CC QE, CC Res, CC DIS, CC Other

            if y[i] == 12:
                res[1][i] = [-1, -1, -1, -1]

        return res

    def sparsify3(self, y):
        """Returns labels as a list of binary NumPy arrays."""
        res = [None] * 3
        res[0] = np.zeros((y.shape[0], 1), dtype=int)
        res[1] = np.zeros((y.shape[0], 4), dtype=int)
        res[2] = np.zeros((y.shape[0], 4), dtype=int)

        for i in range(y.shape[0]):
            quotient = y[i] // 13

            if quotient > 0:
                y[i] %= 13        # from 0 to 12
                res[0][i][0] = 1  # antineutrino

            res[1][i][(y[i] // 4)] = 1  # CC Numu, CC Nue, CC Nutau
            res[2][i][(y[i] % 4)] = 1   # CC QE, CC Res, CC DIS, CC Other

            if y[i] == 12:
                res[0][i] = [-1]
                res[2][i] = [-1, -1, -1, -1]

        return res

    def normalize(self, value, obj):
        """One-hot encodes value into obj; fills obj with value when value is -1 or obj has a single entry."""
        if value == -1 or obj.size == 1:
            obj.fill(value)
        else:
            obj[value] = 1

    def sparsify5(self, y):
        """Returns labels as a list of binary NumPy arrays."""
        res = [None] * self.outputs

        for i in range(len(res)):  # flavour, fNProton, fNPion, fNPizero, fNNeutron
            res[i] = np.zeros((y.shape[0], 4), dtype=int)

        for i in range(y.shape[0]):
            for j in range(len(res)):
                self.normalize(y[i][j], res[j][i])

        return res

    def sparsify7(self, y):
        """Returns labels as a list of binary NumPy arrays."""
        res = [None] * self.outputs
        res[0] = np.zeros((y.shape[0], 1), dtype=int)  # fNuPDG

        for i in range(1, len(res)):  # flavour, interaction, fNProton, fNPion, fNPizero, fNNeutron
            res[i] = np.zeros((y.shape[0], 4), dtype=int)

        for i in range(y.shape[0]):
            for j in range(len(res)):
                self.normalize(y[i][j], res[j][i])

        return res
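
A minimal usage sketch, not part of the original file: it shows how the 7-output label encoding of sparsify7() behaves and, in the trailing comment, how generate() would typically be handed to Keras. The example label values, and the names model, labels and partition in the comment, are illustrative assumptions about the surrounding training script; only the output names (fNuPDG, flavour, interaction, fNProton, fNPion, fNPizero, fNNeutron) come from the comments in the listing above.

# Sketch (not part of data_generator.py): illustrates the 7-output label encoding.
# The label values below are made up; the column meanings follow the comments in sparsify7().
import numpy as np

from data_generator import DataGenerator

# Two events, each labelled with
# [fNuPDG, flavour, interaction, fNProton, fNPion, fNPizero, fNNeutron].
y = np.array([[ 1, 0,  2, 1, 0, 0, 3],
              [-1, 2, -1, 0, 1, 2, 0]])

gen = DataGenerator(outputs=7)
encoded = gen.sparsify7(y)

# 'encoded' is a list of 7 arrays: a (2, 1) array for fNuPDG and six (2, 4)
# one-hot arrays; a value of -1 fills the corresponding row with -1.
for name, arr in zip(['fNuPDG', 'flavour', 'interaction', 'fNProton',
                      'fNPion', 'fNPizero', 'fNNeutron'], encoded):
    print(name, arr.tolist())

# For training, the generator itself would typically be wired into Keras roughly as
# follows, assuming the surrounding script provides a compiled 'model', a 'labels'
# dict (ID -> label vector) and 'partition' ID lists (these names are assumptions,
# not part of this module):
#
#   training_gen = DataGenerator(images_path='/path/to/dataset',
#                                outputs=7).generate(labels, partition['train'])
#   model.fit_generator(training_gen,
#                       steps_per_epoch=len(partition['train']) // 32,
#                       epochs=10)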