"""
Dataset generator module.

Walks the image directories, assigns each event to the train/validation/test
partitions, and serializes the partition and label dictionaries.
"""

__author__ = 'Saul Alonso-Monsalve'
__email__ = "saul.alonso.monsalve@cern.ch"

# NOTE(review): this copy of the file lost most of its import block
# (original lines 4-20); the stdlib imports below are restored because the
# script visibly uses them (sys, logging, configparser, time, glob, random,
# zlib, pickle, ast, numpy).
import ast
import configparser
import glob
import logging
import pickle
import random
import sys
import time
import zlib

import numpy as np
from sklearn.utils import class_weight
from collections import Counter

'''
****************************************
************** PARAMETERS **************
****************************************
'''

# Log everything from DEBUG up to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# Read every tunable parameter from the INI configuration file.
config = configparser.ConfigParser()
config.read('config/config.ini')

# --- random ---

SEED = int(config['random']['seed'])

# NOTE(review): the branch selecting a time-based seed (original lines
# 38-39) was lost in this copy; -1 is assumed to mean "no fixed seed" —
# TODO confirm against the config file.
if SEED == -1:
    SEED = int(time.time())

# NOTE(review): seeding calls (original lines 41-45) were lost in this
# copy; restored so SEED actually takes effect for random.shuffle and
# np.random.uniform used below.
random.seed(SEED)
np.random.seed(SEED)

# --- images ---

IMAGES_PATH = config['images']['path']
VIEWS = int(config['images']['views'])    # number of detector views per event
PLANES = int(config['images']['planes'])  # image height
CELLS = int(config['images']['cells'])    # image width

# --- dataset ---

DATASET_PATH = config['dataset']['path']
PARTITION_PREFIX = config['dataset']['partition_prefix']
LABELS_PREFIX = config['dataset']['labels_prefix']
UNIFORM = ast.literal_eval(config['dataset']['uniform'])

# --- model ---

OUTPUTS = int(config['model']['outputs'])

# --- train ---

TRAIN_FRACTION = float(config['train']['fraction'])
WEIGHTED_LOSS_FUNCTION = ast.literal_eval(config['train']['weighted_loss_function'])
CLASS_WEIGHTS_PREFIX = config['train']['class_weights_prefix']

# --- validation ---

VALIDATION_FRACTION = float(config['validation']['fraction'])

# --- test ---

TEST_FRACTION = float(config['test']['fraction'])

# The three fractions partition the dataset; refuse to run on bad config.
if (TRAIN_FRACTION + VALIDATION_FRACTION + TEST_FRACTION) > 1:
    logging.error('(TRAIN_FRACTION + VALIDATION_FRACTION + TEST_FRACTION) must be <= 1')
    # NOTE(review): the abort after the error (original line 78) was lost in
    # this copy; restored — continuing with inconsistent fractions would
    # silently mis-split the dataset.
    sys.exit(-1)
# Per-class event counters for the training split.
# NOTE(review): the initialization of count_flavour (original line ~92) was
# lost in this copy; 4 flavour classes assumed — TODO confirm.
count_flavour = [0] * 4
count_category = [0] * 14  # one slot per interaction category (fInt)

'''
****************************************
*************** DATASETS ***************
****************************************
'''

# Event IDs per split.
partition = {'train': [], 'validation': [], 'test': []}

# NOTE(review): the initialization of the labels dict (lost in this copy)
# is restored; it is filled inside the main loop (ID -> label vector).
labels = {}

y1_class_weights = []
y2_class_weights = []

# Sample directories whose events go exclusively into the training set.
only_train = ['nutau2', 'nutau3']

logging.info('Filling datasets...')

# Global bookkeeping counters accumulated over all events.
# NOTE(review): count_neutrinos (used by the summary below) was lost in this
# copy; restored here.
count_neutrinos = 0
count_antineutrinos = 0
count_empty_views = 0
count_empty_events = 0
count_less_10nonzero_views = 0
count_less_10nonzero_events = 0
# Walk one directory per sample under IMAGES_PATH, assign each event image
# to a split by a uniform draw, and record its label vector.
for images_path in glob.iglob(IMAGES_PATH + '/*'):
    count_train, count_val, count_test = (0, 0, 0)

    # NOTE(review): the body of this branch (original line ~130) was lost in
    # this copy; 'nutau2'/'nutau3' samples are meant to be train-only (see
    # `only_train`), so presumably the split fractions were overridden here.
    if 'nutau2' in images_path or 'nutau3' in images_path:
        pass  # TODO: restore the lost train-only handling

    files = list(glob.iglob(images_path + "/images/*"))
    random.shuffle(files)  # shuffle so the split draw is order-independent

    for imagefile in files:
        # Event ID: file name without its 3-character extension.
        ID = imagefile.split("/")[-1][:-3]
        infofile = images_path + '/info/' + ID + '.info'

        # Read the event metadata.  The original used
        # open(infofile, 'r').readlines(), which leaks the file handle.
        with open(infofile, 'r') as info_file:
            info = info_file.readlines()

        fInt = int(info[0].strip())      # interaction category index
        interaction = fInt % 4           # interaction type within a flavour
        fRecoNueEnergy = float(info[3].strip())
        fRecoNumuEnergy = float(info[4].strip())
        fEventWeight = float(info[5].strip())

        # NOTE(review): the derivations of flavour, fNuPDG, fNuEnergy,
        # fLepEnergy, fNProton, fNPion, fNPizero and fNNeutron (original
        # lines ~139-167) were lost in this copy; the names are used below.

        # NOTE(review): the guarding condition was lost in this copy;
        # presumably a negative PDG code marks an antineutrino — TODO confirm.
        if fNuPDG < 0:
            count_antineutrinos += 1
        else:
            count_neutrinos += 1

        # Debug dump of the event metadata (the original used Python 2
        # print statements; routed through logging for Python 3).
        logging.debug('flavour: %s', flavour)
        logging.debug('interaction: %s', interaction)
        logging.debug('fNuEnergy: %s', fNuEnergy)
        logging.debug('fLepEnergy: %s', fLepEnergy)
        logging.debug('fRecoNueEnergy: %s', fRecoNueEnergy)
        logging.debug('fRecoNumuEnergy: %s', fRecoNumuEnergy)
        logging.debug('fEventWeight: %s', fEventWeight)
        logging.debug('fNuPDG: %s', fNuPDG)
        logging.debug('fNProton: %s', fNProton)
        logging.debug('fNPion: %s', fNPion)
        logging.debug('fNPizero: %s', fNPizero)
        logging.debug('fNNeutron: %s', fNNeutron)

        # Draw the split assignment for this event.
        random_value = np.random.uniform(0, 1)

        # Decompress the stored image into a (VIEWS, PLANES, CELLS) array.
        # np.frombuffer replaces the deprecated np.fromstring(..., sep='').
        with open(imagefile, 'rb') as image_file:
            pixels = np.frombuffer(
                zlib.decompress(image_file.read()), dtype=np.uint8
            ).reshape(VIEWS, PLANES, CELLS)

        non_empty_view = [0, 0, 0]

        # NOTE(review): the initializations of `views` and `count_empty`
        # (original lines ~198-201) were lost in this copy; restored.
        views = [None] * VIEWS
        count_empty = 0

        count_less_10nonzero = 0
        for i in range(len(views)):
            views[i] = pixels[i, :, :].reshape(PLANES, CELLS, 1)
            maxi = np.max(views[i])
            mini = np.min(views[i])
            nonzero = np.count_nonzero(views[i])
            total = np.sum(views[i])
            avg = np.mean(views[i])
            # NOTE(review): the emptiness checks (original lines ~210-213,
            # incrementing count_empty and count_empty_views) were lost in
            # this copy; only the <10-non-zero branch survives — the
            # reconstructed condition below is an assumption. TODO confirm.
            if nonzero < 10:
                count_less_10nonzero += 1
                count_less_10nonzero_views += 1
        if count_empty == len(views):
            count_empty_events += 1
        if count_less_10nonzero > 0:
            count_less_10nonzero_events += 1

        # Assign the event to train/validation/test by the drawn value.
        if random_value < TRAIN_FRACTION:
            count_flavour[flavour] += 1
            count_category[fInt] += 1
            partition['train'].append(ID)
            count_train += 1   # NOTE(review): increment lost in copy; restored
        elif random_value < (TRAIN_FRACTION + VALIDATION_FRACTION):
            partition['validation'].append(ID)
            count_val += 1     # NOTE(review): increment lost in copy; restored
        elif random_value < (TRAIN_FRACTION + VALIDATION_FRACTION + TEST_FRACTION):
            partition['test'].append(ID)
            count_test += 1    # NOTE(review): increment lost in copy; restored

        # NOTE(review): the branch selecting between the two label layouts
        # (original lines ~242-244) was lost; OUTPUTS is assumed to pick it.
        if OUTPUTS == 5:
            labels[ID] = [flavour, fNProton, fNPion, fNPizero, fNNeutron]
        else:
            labels[ID] = [fNuPDG, flavour, interaction, fNProton, fNPion,
                          fNPizero, fNNeutron]

    # Per-sample-directory split statistics.
    logging.debug('%d train images', count_train)
    logging.debug('%d val images', count_val)
    logging.debug('%d test images', count_test)
    logging.debug('%d total images', count_train + count_val + count_test)
# Final summary: event-quality counters followed by the split sizes.
summary_counts = (
    ('Number of neutrino events: %d', count_neutrinos),
    ('Number of antineutrino events: %d', count_antineutrinos),
    ('Number of empty views: %d', count_empty_views),
    ('Number of views with <10 non-zero pixels: %d', count_less_10nonzero_views),
    ('Number of empty events: %d', count_empty_events),
    ('Number of events with at least one view with <10 non-zero pixels: %d',
     count_less_10nonzero_events),
    ('Number of training examples: %d', len(partition['train'])),
    ('Number of validation examples: %d', len(partition['validation'])),
    ('Number of test examples: %d', len(partition['test'])),
)
for message, value in summary_counts:
    logging.info(message, value)
logging.info('Serializing datasets...')

# Pickle data is binary: the files must be opened in 'wb' mode.  The
# original opened them in text mode ('w'), which raises TypeError under
# Python 3 and corrupts binary pickle protocols under Python 2.
with open(DATASET_PATH + PARTITION_PREFIX + '.p', 'wb') as partition_file:
    pickle.dump(partition, partition_file)

with open(DATASET_PATH + LABELS_PREFIX + '.p', 'wb') as labels_file:
    pickle.dump(labels, labels_file)
# NOTE(review): trailing residue from an unrelated C `open()` prototype and
# its description was fused onto the end of this file during extraction; it
# is not part of this Python module.