cvnCreateDB.cc
Go to the documentation of this file.
1 // std library
2 #include <iostream>
3 #include <sys/stat.h>
4 #include <fstream>
5 #include <sstream>
6 #include <algorithm>
7 
8 // Boost, for program options
9 #include "boost/program_options/options_description.hpp"
10 #include "boost/program_options/variables_map.hpp"
11 #include "boost/program_options/parsers.hpp"
12 #include "boost/algorithm/string/predicate.hpp"
13 
14 // ART/fcl stuff
15 #include "cetlib/filepath_maker.h"
18 #include "fhiclcpp/ParameterSet.h"
19 
20 // ROOT stuff
21 #include "TSystem.h"
22 #include "TFile.h"
23 #include "TTree.h"
24 #include "TChain.h"
25 #include "TCanvas.h"
26 #include "TGraph.h"
27 
28 // CVN stuff
30 //#include "CVN/art/CaffeNetHandler.h"
31 
32 // Caffe stuff
33 #include <leveldb/db.h>
34 #include <leveldb/write_batch.h>
35 #include <lmdb.h>
36 #define CPU_ONLY
37 // Suppress warnings originating in Caffe that we can't do anything about
38 #pragma GCC diagnostic push
39 #pragma GCC diagnostic ignored "-Wsign-compare"
40 #include "caffe/caffe.hpp"
41 #pragma GCC diagnostic pop
42 
43 #include "H5Cpp.h"
44 
45 
46 
47 namespace po = boost::program_options;
48 
49 
/// Strategy used to turn the truth information of an example into the
/// integer label stored with it.  Config maps the "Labeling" string from the
/// configuration onto one of these values.
enum LabelingMode
{
  kAll,    ///< Label all interaction types separately
  kNumu,   ///< Label numu:1, else 0
  kNue,    ///< Label nue:1, else 0
  kNC,     ///< Label NC:1, else 0
  kEnergy  ///< Label is conversion of fNuEnergy to int
};
58 
59 
60 class Config
61 {
62 public:
64  fOutputFormat (pset.get<std::string>("OutputFormat")),
65  fTreeName (pset.get<std::string>("TreeName")),
66  fTrainingBranchObjectName (pset.get<std::string>("TrainingDataBranchName")),
67  fTestOutputDir (pset.get<std::string>("TestOutputDir")),
68  fTrainOutputDir (pset.get<std::string>("TrainOutputDir")),
69  fNTrainPerTest (pset.get<unsigned int>("NTrainPerTest")),
70  fProgressInterval (pset.get<unsigned int>("ProgressInterval")),
71  fErrorIfExists (pset.get<bool>("ErrorIfExists")),
72  fSetLog (pset.get<bool>("SetLog")),
73  fCreateIfMissing (pset.get<bool>("CreateIfMissing")),
74  fWriteSync (pset.get<bool>("WriteSync")),
75  fMaxKeyLength (pset.get<unsigned int>("MaxKeyLength")),
76  fWriteBufferSize (pset.get<unsigned int>("WriteBufferSize")),
77  fLabeling (pset.get<std::string>("Labeling")),
78  fUseGeV (pset.get<bool>("UseGeV")),
79  fWriteRegressionHDF5 (pset.get<bool>("WriteRegressionHDF5")),
80  fRegressionHDF5NameTrain (pset.get<std::string>("RegressionHDF5NameTrain")),
81  fRegressionHDF5NameTest (pset.get<std::string>("RegressionHDF5NameTest")),
82  fMaxEnergyForLabel (pset.get<float>("MaxEnergyForLabel")),
83  fNEvents (pset.get<unsigned int>("NEvents")),
84  fPlaneLimit (pset.get<unsigned int>("PlaneLimit")),
85  fTDCLimit (pset.get<unsigned int>("TDCLimit")),
86  fReverseViews(pset.get<std::vector<bool> >("ReverseViews"))
87  {
88  if(!fLabeling.compare("all")) fLabelingMode = kAll;
89  if(!fLabeling.compare("numu")) fLabelingMode = kNumu;
90  if(!fLabeling.compare("nue")) fLabelingMode = kNue;
91  if(!fLabeling.compare("nc")) fLabelingMode = kNC;
92  if(!fLabeling.compare("energy")) fLabelingMode = kEnergy;
93  };
94 
95 
101 
102  /// Number of training examples for each test sample, e.g. 4 for 80/20 split
103  unsigned int fNTrainPerTest;
104  /// Number of examples in between progress updates (% complete)
105  unsigned int fProgressInterval;
106 
108  bool fSetLog;
111  unsigned int fMaxKeyLength;
112  unsigned int fWriteBufferSize;
113 
115  unsigned int fLabelingMode;
116 
117  bool fUseGeV;
118 
119  /// Flag to control whether or not we write HDF5 regression features
124  /// Limit the number of entries in the tree to consider
125  unsigned int fNEvents;
126  /// Limit the number of wires in the output image
128  /// Limit the number of TDCs in the output image
130  /// Views to reverse
131  std::vector<bool> fReverseViews;
132 };
133 
134 class OutputDB {
135 public:
136  OutputDB(std::string sample, const Config& config);
137  ~OutputDB();
138 
139  void Put(std::string &serializeKey, std::string &serializeString);
140 
141 private:
142  leveldb::DB* fLevelDB;
143  leveldb::WriteOptions fWriteOptions;
144 
145  MDB_env *mdb_env;
146  MDB_dbi mdb_dbi;
147  MDB_val mdb_key, mdb_data;
148  MDB_txn *mdb_txn;
149 
150 };
151 
153  fLevelDB(0), mdb_env(0), mdb_txn(0) {
154 
155  std::string outputDir;
156  if (sample=="test")
157  outputDir=config.fTestOutputDir;
158  else
159  outputDir=config.fTrainOutputDir;
160 
161  if (config.fOutputFormat=="LevelDB"){
162  leveldb::Options fileOptions;
163  fileOptions.error_if_exists = config.fErrorIfExists;
164  fileOptions.create_if_missing = config.fCreateIfMissing;
165  fileOptions.write_buffer_size = config.fWriteBufferSize;
166 
167  fWriteOptions.sync = config.fWriteSync;
168 
169  if(!leveldb::DB::Open(fileOptions, outputDir, &fLevelDB).ok()) {
170  std::cout << "Problem opening the database: "
171  << outputDir << std::endl;
172  exit(1);
173  }
174  }
175 
176  else if (config.fOutputFormat=="LMDB") {
177  mkdir(outputDir.c_str(),0777);
178  mdb_env_create(&mdb_env);
179  mdb_env_set_mapsize(mdb_env, 10737418240);
180  mdb_env_open(mdb_env, outputDir.c_str(), 0, 0777);
181  mdb_txn_begin(mdb_env, NULL, 0, &mdb_txn);
182  mdb_dbi_open(mdb_txn,NULL, 0, &mdb_dbi);
183  }
184 
185  else {
186  std::cout << "Unrecognized output format " << config.fOutputFormat << std::endl;
187  exit(1);
188  }
189 
190 }
191 
193  if (mdb_txn) mdb_txn_commit(mdb_txn);
194 }
195 
196 void OutputDB::Put(std::string &serializeKey, std::string &serializeString) {
197  if (fLevelDB) {
198  fLevelDB->Put(fWriteOptions, serializeKey, serializeString);
199  } //end if LevelDB
200  else {//it must be LMDB
201  mdb_data.mv_size=serializeString.size();
202  mdb_data.mv_data=reinterpret_cast<void*>(&serializeString[0]);
203  mdb_key.mv_size=serializeKey.size();
204  mdb_key.mv_data=reinterpret_cast<void*>(&serializeKey[0]);
205  if ( mdb_put(mdb_txn,mdb_dbi,&mdb_key,&mdb_data,0)!= MDB_SUCCESS){
206  std::cout<< "ERROR: Events not loaded correctly" <<std::endl;
207  }//end if put fails
208  }//end if LMDB
209 }//end OutputDB::Put
210 
212 {
213 
214  TChain chain(config.fTreeName.c_str());
215 
216  if (boost::ends_with(input,".list")) {
217  std::ifstream list_file(input.c_str());
218  if (!list_file.is_open()) {
219  std::cout << "Could not open " << input << std::endl;
220  exit(1);
221  }
222 
223  std::string ifname;
224  while (list_file>>ifname)
225  chain.Add(ifname.c_str());
226 
227  }//end if list file
228 
229  else if (boost::ends_with(input,".root")) {
230  chain.Add(input.c_str());
231  }//end if root file
232 
233  chain.SetMakeClass(1);
234 
235  int fInt;
236  UInt_t fPMap_fNWire;
237  UInt_t fPMap_fNTdc;
238  std::vector<float> fPMap_fPEX;
239  std::vector<float> fPMap_fPEY;
240  std::vector<float> fPMap_fPEZ;
241 
242  chain.SetBranchAddress("fInt", &fInt);
243  chain.SetBranchAddress("fPMap.fNWire", &fPMap_fNWire);
244  chain.SetBranchAddress("fPMap.fNTdc", &fPMap_fNTdc);
245  chain.SetBranchAddress("fPMap.fPEX", &fPMap_fPEX);
246  chain.SetBranchAddress("fPMap.fPEY", &fPMap_fPEY);
247  chain.SetBranchAddress("fPMap.fPEZ", &fPMap_fPEZ);
248 
249  unsigned int entries = chain.GetEntries();
250  if(config.fNEvents < entries){
251  entries = config.fNEvents;
252  }
253  if(entries <= 0){
254  std::cout << "Error: Input tree has no entries." << std::endl;
255  exit(4);
256  }
257 
258  std::cout << "- Will process " << entries << " from the input tree." << std::endl;
259 
260  OutputDB TrainDB("train",config);
261  OutputDB TestDB( "test",config);
262 
263  char* key = new char[config.fMaxKeyLength];
264  std::string serializeString;
265 
266  //need to shuffle entries...
267  std::srand ( unsigned ( std::time(0) ) );
268  std::vector<unsigned int> shuffled;
269  for (unsigned int i = 0; i < entries; ++i)
270  {
271  shuffled.push_back(i);
272  }
273 
274  std::random_shuffle( shuffled.begin(), shuffled.end() );
275 
276  // Figure out the size of the train and test samples
277  // Call a 'block' a particular set of one test and nTrainPerTest train
278  unsigned int blockSize = config.fNTrainPerTest + 1;
279  // number of test is the number of blocks, using integer division
280  unsigned int nTest = 1 + entries / blockSize;
281  // number of training samples is number of blocks times train per test
282  unsigned int nTrain = entries / blockSize * config.fNTrainPerTest;
283  // Add on the entries from the last, potentially partial block, minus test
284  if (entries % blockSize) nTrain += entries % blockSize - 1;
285 
286  // Create an array to hold regression features.
287  const unsigned int nRegressionFeatures = 2; // Currently nuEnergy, lepEnergy
288 // float regressionDataTest[nTest][nRegressionFeatures];
289 // float regressionDataTrain[nTrain][nRegressionFeatures];
290 
291  int** regressionDataTest = new int*[nTest];
292  for(unsigned int i = 0; i < nTest; ++i) {regressionDataTest[i] = new int[nRegressionFeatures];}
293  int** regressionDataTrain = new int*[nTrain];
294  for(unsigned int i = 0; i < nTrain; ++i) {regressionDataTrain[i] = new int[nRegressionFeatures];}
295 
296  int iTrain = 0;
297  int iTest = 0;
298 
299  ////hdf5////
300 
301  const char saveFilePath[] = "test.h5";
302  const hsize_t ndims = 2;
303  const hsize_t ncols = 3;
304 
305  hid_t file = H5Fcreate(saveFilePath, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
306  std::cout << "- File created" << std::endl;
307 
308  hsize_t dims[ndims] = {0, ncols};
309  hsize_t max_dims[ndims] = {H5S_UNLIMITED, ncols};
310  hid_t file_space = H5Screate_simple(ndims, dims, max_dims);
311  std::cout << "- Dataspace created" << std::endl;
312 
313  hid_t plist = H5Pcreate(H5P_DATASET_CREATE);
314  H5Pset_layout(plist, H5D_CHUNKED);
315  hsize_t chunk_dims[ndims] = {2, ncols};
316  H5Pset_chunk(plist, ndims, chunk_dims);
317  std::cout << "- Property list created" << std::endl;
318 
319  //hid_t dset = H5Dcreate(file, "dset1", H5T_NATIVE_FLOAT, file_space, H5P_DEFAULT, plist, H5P_DEFAULT);
320  H5Dcreate(file, "dset1", H5T_NATIVE_FLOAT, file_space, H5P_DEFAULT, plist, H5P_DEFAULT);
321  std::cout << "- Dataset 'dset1' created" << std::endl;
322 
323  H5Pclose(plist);
324  H5Sclose(file_space);
325 
326  if(entries > chain.GetEntries()){
327  entries = chain.GetEntries();
328  }
329 
330  for(unsigned int iEntry = 0; iEntry < entries; ++iEntry)
331  {
332  unsigned int entry = shuffled[iEntry];
333  chain.GetEntry(entry);
334 
335  unsigned int nViews = 3;
336 
337  // Create a CVNImageUtils object and use it to produce the pixels. The arguments
338  // define how large we want the output image to be
339  cvn::CVNImageUtils imageUtils(config.fPlaneLimit,config.fTDCLimit,nViews);
340  // Since we don't have a PixelMap object, we need to tell it how big it is
341  imageUtils.SetPixelMapSize(fPMap_fNWire,fPMap_fNTdc);
342 
343  std::vector<unsigned char> pixelArray(nViews * config.fPlaneLimit * config.fTDCLimit,0);
344 
345  imageUtils.SetLogScale(config.fSetLog);
346  imageUtils.SetViewReversal(config.fReverseViews);
347  imageUtils.ConvertChargeVectorsToPixelArray(fPMap_fPEX, fPMap_fPEY, fPMap_fPEZ, pixelArray);
348 
349  caffe::Datum datum;
350  datum.set_height(config.fPlaneLimit);
351  datum.set_width(config.fTDCLimit);
352 
353  datum.set_data(pixelArray.data(), nViews * config.fPlaneLimit * config.fTDCLimit);
354 
355  datum.set_label(fInt);
356 
357  datum.SerializeToString(&serializeString);
358 
359  if(iEntry % (blockSize))
360  {
361  snprintf(key, config.fMaxKeyLength, "%08lld", (long long int)iTrain);
362  std::string serializeKey(key);
363 
364  TrainDB.Put(serializeKey,serializeString);
365 
366  regressionDataTrain[iTrain][0] = 1.;
367  regressionDataTrain[iTrain][1] = 1.;
368  iTrain += 1;
369 
370  ////hdf5////
371  hsize_t nlines = 1;
372  float *buffer = new float[nlines * ncols];
373  float **b = new float*[nlines];
374  for (hsize_t i = 0; i < nlines; ++i){
375  b[i] = &buffer[i * ncols];
376  }
377 
378  b[0][0] = 0.1;
379  b[0][1] = 0.2;
380  b[0][2] = 0.3;
381 
382  }
383  else
384  {
385  snprintf(key, config.fMaxKeyLength, "%08lld", (long long int)iTest);
386  std::string serializeKey(key);
387 
388  TestDB.Put(serializeKey,serializeString);
389 
390  regressionDataTest[iTest][0] = 1.;
391  regressionDataTest[iTest][1] = 1.;
392  iTest += 1;
393  }
394  if(not (iEntry % config.fProgressInterval))
395  std::cout << "Fraction complete: "
396  << iEntry / (float)entries << std::endl;
397 
398  }
399 
400  if (config.fWriteRegressionHDF5)
401  {
402 
403  H5::FloatType type(H5::PredType::IEEE_F32LE);
404  std::cout << "Writing HDF5 regression output : "
405  << config.fRegressionHDF5NameTest << std::endl;
406  H5::H5File h5FileTest(config.fRegressionHDF5NameTest, H5F_ACC_TRUNC );
407  hsize_t shape[2];
408  shape[0] = nTest;
409  shape[1] = nRegressionFeatures;
410  H5::DataSpace spaceTest(2, shape);
411 
412  H5::DataSet datasetTest = h5FileTest.createDataSet("regression",
413  type,
414  spaceTest);
415 
416  datasetTest.write(regressionDataTest, type);
417 
418  std::cout << "Writing HDF5 regression output : "
419  << config.fRegressionHDF5NameTrain << std::endl;
420  H5::H5File h5FileTrain(config.fRegressionHDF5NameTrain, H5F_ACC_TRUNC );
421  shape[0] = nTrain;
422  H5::DataSpace spaceTrain(2, shape);
423 
424  H5::DataSet datasetTrain = h5FileTrain.createDataSet("regression",
425  type,
426  spaceTrain);
427 
428  datasetTrain.write(regressionDataTrain, type);
429 
430  }
431 
432  // Clear up
433  delete key;
434  for(unsigned int i = 0; i < nTest; ++i) {
435  delete [] regressionDataTest[i];
436  }
437  delete [] regressionDataTest;
438  for(unsigned int i = 0; i < nTrain; ++i) {
439  delete [] regressionDataTrain[i];
440  }
441  delete [] regressionDataTrain;
442 
443 }
444 
445 
446 
447 po::variables_map getOptions(int argc, char* argv[], std::string& config,
449 {
450 
451  // Declare the supported options.
452  po::options_description desc("Allowed options");
453  desc.add_options()
454  ("help", "produce help message")
455  ("config,c", po::value<std::string>(&config)->required(),
456  "configuration file")
457  ("input,i", po::value<std::string>(&input)->required(),
458  "Input data in ROOT file.");
459  po::variables_map vm;
460 
461  try
462  {
463  po::store(po::parse_command_line(argc, argv, desc), vm);
464  po::notify(vm);
465 
466  }
467  catch(po::error& e)
468  {
469  std::cout << "ERROR: " << e.what() << std::endl;
470  exit(1);
471  }
472 
473 
474  if (vm.count("help")) {
475  std::cout << desc << "\n";
476  exit(1);
477  }
478 
479  return vm;
480 }
481 
482 
483 
485 {
486 
487  cet::filepath_first_absolute_or_lookup_with_dot policy("FHICL_FILE_PATH");
488 
489  // parse a configuration file; obtain intermediate form
491  fhicl::parse_document(configPath, policy, tbl);
492 
493  // convert to ParameterSet
494  fhicl::ParameterSet pset;
495  fhicl::make_ParameterSet(tbl, pset);
496 
497  return pset;
498 
499 }
500 
501 int main(int argc, char* argv[])
502 {
503 
504  std::string configPath, inputPath, outputPath, logPath;
505  po::variables_map vm = getOptions(argc, argv, configPath, inputPath);
506 
507  Config config(getPSet(configPath));
508 
509 
510  fill(config, inputPath);
511 
512  return 0;
513 
514 }
515 
516 
517 
fhicl::ParameterSet getPSet(std::string configPath)
Definition: cvnCreateDB.cc:484
MDB_dbi mdb_dbi
Definition: cvnCreateDB.cc:146
Label is conversion of fNuEnergy to int.
Definition: cvnCreateDB.cc:56
Config(const fhicl::ParameterSet &pset)
Definition: cvnCreateDB.cc:63
void fill(const Config &config, std::string input)
Definition: cvnCreateDB.cc:211
QList< Entry > entry
bool fSetLog
Definition: cvnCreateDB.cc:108
MDB_val mdb_data
Definition: cvnCreateDB.cc:147
def mkdir(path, mode=0o777)
std::string string
Definition: nybbler.cc:12
void ConvertChargeVectorsToPixelArray(std::vector< float > &v0pe, std::vector< float > &v1pe, std::vector< float > &v2pe, std::vector< unsigned char > &pix)
int main(int argc, char *argv[])
Definition: cvnCreateDB.cc:501
std::string fRegressionHDF5NameTest
Definition: cvnCreateDB.cc:122
void SetViewReversal(bool reverseX, bool reverseY, bool reverseZ)
Function to set any views that need reversing.
bool fUseGeV
Definition: cvnCreateDB.cc:117
void SetLogScale(bool setLog)
Set the log scale for charge.
error
Definition: include.cc:26
STL namespace.
float fMaxEnergyForLabel
Definition: cvnCreateDB.cc:123
void make_ParameterSet(intermediate_table const &tbl, ParameterSet &ps)
MDB_val mdb_key
Definition: cvnCreateDB.cc:147
unsigned int fMaxKeyLength
Definition: cvnCreateDB.cc:111
bool fWriteSync
Definition: cvnCreateDB.cc:110
std::string fRegressionHDF5NameTrain
Definition: cvnCreateDB.cc:121
Label all interaction types separately.
Definition: cvnCreateDB.cc:52
MDB_txn * mdb_txn
Definition: cvnCreateDB.cc:148
Label NC:1, else 0.
Definition: cvnCreateDB.cc:55
unsigned int fLabelingMode
Definition: cvnCreateDB.cc:115
typename config_impl< T >::type Config
Definition: ModuleMacros.h:52
const double e
static int input(void)
Definition: code.cpp:15695
MDB_env * mdb_env
Definition: cvnCreateDB.cc:145
bool fWriteRegressionHDF5
Flag to control whether or not we write HDF5 regression features.
Definition: cvnCreateDB.cc:120
unsigned int fNTrainPerTest
Number of training examples for each test sample, e.g. 4 for 80/20 split.
Definition: cvnCreateDB.cc:103
Label numu:1, else 0.
Definition: cvnCreateDB.cc:53
def key(type, name=None)
Definition: graph.py:13
Utilities for producing images for the CVN.
static Config * config
Definition: config.cpp:1054
leveldb::DB * fLevelDB
Definition: cvnCreateDB.cc:142
std::string fLabeling
Definition: cvnCreateDB.cc:114
int fTDCLimit
Limit the number of TDCs in the output image.
Definition: cvnCreateDB.cc:129
void Put(std::string &serializeKey, std::string &serializeString)
Definition: cvnCreateDB.cc:196
bool fErrorIfExists
Definition: cvnCreateDB.cc:107
bool fCreateIfMissing
Definition: cvnCreateDB.cc:109
Class containing some utility functions for all things CVN.
Definition: CVNImageUtils.h:24
unsigned int fNEvents
Limit the number of entries in the tree to consider.
Definition: cvnCreateDB.cc:125
std::string fOutputFormat
Definition: cvnCreateDB.cc:93
std::string fTrainOutputDir
Definition: cvnCreateDB.cc:100
po::variables_map getOptions(int argc, char *argv[], std::string &config, std::string &input)
Definition: cvnCreateDB.cc:447
OutputDB(std::string sample, const Config &config)
Definition: cvnCreateDB.cc:152
std::vector< bool > fReverseViews
Views to reverse.
Definition: cvnCreateDB.cc:131
int fPlaneLimit
Limit the number of wires in the output image.
Definition: cvnCreateDB.cc:127
std::string fTrainingBranchObjectName
Definition: cvnCreateDB.cc:98
leveldb::WriteOptions fWriteOptions
Definition: cvnCreateDB.cc:143
intermediate_table parse_document(std::string const &filename, cet::filepath_maker &maker)
Definition: parse.cc:720
unsigned int fProgressInterval
Number of examples in between progress updates (% complete)
Definition: cvnCreateDB.cc:105
std::string fTestOutputDir
Definition: cvnCreateDB.cc:99
static QCString type
Definition: declinfo.cpp:672
LabelingMode
Definition: cvnCreateDB.cc:50
Label nue:1, else 0.
Definition: cvnCreateDB.cc:54
static bool * b
Definition: config.cpp:1043
auto const & get(AssnsNode< L, R, D > const &r)
Definition: AssnsNode.h:115
int bool
Definition: qglobal.h:345
QTextStream & endl(QTextStream &s)
unsigned int fWriteBufferSize
Definition: cvnCreateDB.cc:112
void SetPixelMapSize(unsigned int nWires, unsigned int nTDCs)
Set the input pixel map size.
std::string fTreeName
Definition: cvnCreateDB.cc:97