cnpy.cxx
Go to the documentation of this file.
1 //Copyright (C) 2011 Carl Rogers
2 //Released under MIT License
3 //license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
4 // "vendored" from https://github.com/rogersce/cnpy/commit/4e8810b1a8637695171ed346ce68f6984e585ef4
5 
6 #include"WireCellUtil/cnpy.h"
7 #include<complex>
8 #include<cstdlib>
9 #include<algorithm>
10 #include<cstring>
11 #include<iomanip>
12 #include<stdint.h>
13 #include<stdexcept>
14 #include <regex>
15 
17  int x = 1;
18  return (((char *)&x)[0]) ? '<' : '>';
19 }
20 
21 char cnpy::map_type(const std::type_info& t)
22 {
23  if(t == typeid(float) ) return 'f';
24  if(t == typeid(double) ) return 'f';
25  if(t == typeid(long double) ) return 'f';
26 
27  if(t == typeid(int) ) return 'i';
28  if(t == typeid(char) ) return 'i';
29  if(t == typeid(short) ) return 'i';
30  if(t == typeid(long) ) return 'i';
31  if(t == typeid(long long) ) return 'i';
32 
33  if(t == typeid(unsigned char) ) return 'u';
34  if(t == typeid(unsigned short) ) return 'u';
35  if(t == typeid(unsigned long) ) return 'u';
36  if(t == typeid(unsigned long long) ) return 'u';
37  if(t == typeid(unsigned int) ) return 'u';
38 
39  if(t == typeid(bool) ) return 'b';
40 
41  if(t == typeid(std::complex<float>) ) return 'c';
42  if(t == typeid(std::complex<double>) ) return 'c';
43  if(t == typeid(std::complex<long double>) ) return 'c';
44 
45  else return '?';
46 }
47 
48 template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs) {
49  lhs.insert(lhs.end(),rhs.begin(),rhs.end());
50  return lhs;
51 }
52 
53 template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs) {
54  //write in little endian
55  size_t len = strlen(rhs);
56  lhs.reserve(len);
57  for(size_t byte = 0; byte < len; byte++) {
58  lhs.push_back(rhs[byte]);
59  }
60  return lhs;
61 }
62 
63 void cnpy::parse_npy_header(unsigned char* buffer,size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
64  //std::string magic_string(buffer,6);
65  //uint8_t major_version = *reinterpret_cast<uint8_t*>(buffer+6);
66  //uint8_t minor_version = *reinterpret_cast<uint8_t*>(buffer+7);
67  uint16_t header_len = *reinterpret_cast<uint16_t*>(buffer+8);
68  std::string header(reinterpret_cast<char*>(buffer+9),header_len);
69 
70  size_t loc1, loc2;
71 
72  //fortran order
73  loc1 = header.find("fortran_order")+16;
74  fortran_order = (header.substr(loc1,4) == "True" ? true : false);
75 
76  //shape
77  loc1 = header.find("(");
78  loc2 = header.find(")");
79 
80  std::regex num_regex("[0-9][0-9]*");
81  std::smatch sm;
82  shape.clear();
83 
84  std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
85  while(std::regex_search(str_shape, sm, num_regex)) {
86  shape.push_back(std::stoi(sm[0].str()));
87  str_shape = sm.suffix().str();
88  }
89 
90  //endian, word size, data type
91  //byte order code | stands for not applicable.
92  //not sure when this applies except for byte array
93  loc1 = header.find("descr")+9;
94  bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
95  if (!littleEndian) {
96  throw std::runtime_error("parse_npy_header: header is not little endian");
97  }
98 
99  //char type = header[loc1+1];
100  //assert(type == map_type(T));
101 
102  std::string str_ws = header.substr(loc1+2);
103  loc2 = str_ws.find("'");
104  word_size = atoi(str_ws.substr(0,loc2).c_str());
105 }
106 
107 void cnpy::parse_npy_header(FILE* fp, size_t& word_size, std::vector<size_t>& shape, bool& fortran_order) {
108  char buffer[256];
109  size_t res = fread(buffer,sizeof(char),11,fp);
110  if(res != 11)
111  throw std::runtime_error("parse_npy_header: failed fread");
112  std::string header = fgets(buffer,256,fp);
113  if (header[header.size()-1] != '\n') {
114  throw std::runtime_error("parse_npy_header: header is not newline terminated");
115  }
116 
117  size_t loc1, loc2;
118 
119  //fortran order
120  loc1 = header.find("fortran_order");
121  if (loc1 == std::string::npos)
122  throw std::runtime_error("parse_npy_header: failed to find header keyword: 'fortran_order'");
123  loc1 += 16;
124  fortran_order = (header.substr(loc1,4) == "True" ? true : false);
125 
126  //shape
127  loc1 = header.find("(");
128  loc2 = header.find(")");
129  if (loc1 == std::string::npos || loc2 == std::string::npos)
130  throw std::runtime_error("parse_npy_header: failed to find header keyword: '(' or ')'");
131 
132  std::regex num_regex("[0-9][0-9]*");
133  std::smatch sm;
134  shape.clear();
135 
136  std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
137  while(std::regex_search(str_shape, sm, num_regex)) {
138  shape.push_back(std::stoi(sm[0].str()));
139  str_shape = sm.suffix().str();
140  }
141 
142  //endian, word size, data type
143  //byte order code | stands for not applicable.
144  //not sure when this applies except for byte array
145  loc1 = header.find("descr");
146  if (loc1 == std::string::npos)
147  throw std::runtime_error("parse_npy_header: failed to find header keyword: 'descr'");
148  loc1 += 9;
149  bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
150  if (!littleEndian) {
151  throw std::runtime_error("parse_npy_header: header is not little endian");
152  }
153 
154  //char type = header[loc1+1];
155  //assert(type == map_type(T));
156 
157  std::string str_ws = header.substr(loc1+2);
158  loc2 = str_ws.find("'");
159  word_size = atoi(str_ws.substr(0,loc2).c_str());
160 }
161 
162 void cnpy::parse_zip_footer(FILE* fp, uint16_t& nrecs, size_t& global_header_size, size_t& global_header_offset)
163 {
164  std::vector<char> footer(22);
165  fseek(fp,-22,SEEK_END);
166  size_t res = fread(&footer[0],sizeof(char),22,fp);
167  if(res != 22)
168  throw std::runtime_error("parse_zip_footer: failed fread");
169 
170  uint16_t disk_no, disk_start, nrecs_on_disk, comment_len;
171  disk_no = *(uint16_t*) &footer[4];
172  disk_start = *(uint16_t*) &footer[6];
173  nrecs_on_disk = *(uint16_t*) &footer[8];
174  nrecs = *(uint16_t*) &footer[10];
175  global_header_size = *(uint32_t*) &footer[12];
176  global_header_offset = *(uint32_t*) &footer[16];
177  comment_len = *(uint16_t*) &footer[20];
178 
179  if (disk_no != 0) {
180  throw std::runtime_error("parse_zip_footer: non-zero disk number");
181  }
182 
183  if (disk_start != 0) {
184  throw std::runtime_error("parse_zip_footer: non-zero disk start");
185  }
186 
187  if (nrecs_on_disk != nrecs) {
188  throw std::runtime_error("parse_zip_footer: number of recrods do not match");
189  }
190  if (comment_len != 0) {
191  throw std::runtime_error("parse_zip_footer: unexepcted comment");
192  }
193 }
194 
196  std::vector<size_t> shape;
197  size_t word_size;
198  bool fortran_order;
199  cnpy::parse_npy_header(fp,word_size,shape,fortran_order);
200 
201  cnpy::NpyArray arr(shape, word_size, fortran_order);
202  size_t nread = fread(arr.data<char>(),1,arr.num_bytes(),fp);
203  if(nread != arr.num_bytes())
204  throw std::runtime_error("load_the_npy_file: failed fread");
205  return arr;
206 }
207 
208 cnpy::NpyArray load_the_npz_array(FILE* fp, uint32_t compr_bytes, uint32_t uncompr_bytes) {
209 
210  std::vector<unsigned char> buffer_compr(compr_bytes);
211  std::vector<unsigned char> buffer_uncompr(uncompr_bytes);
212  size_t nread = fread(&buffer_compr[0],1,compr_bytes,fp);
213  if(nread != compr_bytes)
214  throw std::runtime_error("load_the_npy_file: failed fread");
215 
216  z_stream d_stream;
217 
218  d_stream.zalloc = Z_NULL;
219  d_stream.zfree = Z_NULL;
220  d_stream.opaque = Z_NULL;
221  d_stream.avail_in = 0;
222  d_stream.next_in = Z_NULL;
223  inflateInit2(&d_stream, -MAX_WBITS);
224 
225  d_stream.avail_in = compr_bytes;
226  d_stream.next_in = &buffer_compr[0];
227  d_stream.avail_out = uncompr_bytes;
228  d_stream.next_out = &buffer_uncompr[0];
229 
230  inflate(&d_stream, Z_FINISH);
231  inflateEnd(&d_stream);
232 
233  std::vector<size_t> shape;
234  size_t word_size;
235  bool fortran_order;
236  cnpy::parse_npy_header(&buffer_uncompr[0],word_size,shape,fortran_order);
237 
238  cnpy::NpyArray array(shape, word_size, fortran_order);
239 
240  size_t offset = uncompr_bytes - array.num_bytes();
241  memcpy(array.data<unsigned char>(),&buffer_uncompr[0]+offset,array.num_bytes());
242 
243  return array;
244 }
245 
247  FILE* fp = fopen(fname.c_str(),"rb");
248 
249  if(!fp) {
250  throw std::runtime_error("npz_load: Error! Unable to open file "+fname+"!");
251  }
252 
253  cnpy::npz_t arrays;
254 
255  while(1) {
256  std::vector<char> local_header(30);
257  size_t headerres = fread(&local_header[0],sizeof(char),30,fp);
258  if(headerres != 30)
259  throw std::runtime_error("npz_load: failed fread");
260 
261  //if we've reached the global header, stop reading
262  if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
263 
264  //read in the variable name
265  uint16_t name_len = *(uint16_t*) &local_header[26];
266  std::string varname(name_len,' ');
267  size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp);
268  if(vname_res != name_len)
269  throw std::runtime_error("npz_load: failed fread");
270 
271  //erase the lagging .npy
272  varname.erase(varname.end()-4,varname.end());
273 
274  //read in the extra field
275  uint16_t extra_field_len = *(uint16_t*) &local_header[28];
276  if(extra_field_len > 0) {
277  std::vector<char> buff(extra_field_len);
278  size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
279  if(efield_res != extra_field_len)
280  throw std::runtime_error("npz_load: failed fread");
281  }
282 
283  uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
284  uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
285  uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);
286 
287  if(compr_method == 0) {arrays[varname] = load_the_npy_file(fp);}
288  else {arrays[varname] = load_the_npz_array(fp,compr_bytes,uncompr_bytes);}
289  }
290 
291  fclose(fp);
292  return arrays;
293 }
294 
296  FILE* fp = fopen(fname.c_str(),"rb");
297 
298  if(!fp) throw std::runtime_error("npz_load: Unable to open file "+fname);
299 
300  while(1) {
301  std::vector<char> local_header(30);
302  size_t header_res = fread(&local_header[0],sizeof(char),30,fp);
303  if(header_res != 30)
304  throw std::runtime_error("npz_load: failed fread");
305 
306  //if we've reached the global header, stop reading
307  if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
308 
309  //read in the variable name
310  uint16_t name_len = *(uint16_t*) &local_header[26];
311  std::string vname(name_len,' ');
312  size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp);
313  if(vname_res != name_len)
314  throw std::runtime_error("npz_load: failed fread");
315  vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy
316 
317  //read in the extra field
318  uint16_t extra_field_len = *(uint16_t*) &local_header[28];
319  fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field
320 
321  uint16_t compr_method = *reinterpret_cast<uint16_t*>(&local_header[0]+8);
322  uint32_t compr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+18);
323  uint32_t uncompr_bytes = *reinterpret_cast<uint32_t*>(&local_header[0]+22);
324 
325  if(vname == varname) {
326  NpyArray array = (compr_method == 0) ? load_the_npy_file(fp) : load_the_npz_array(fp,compr_bytes,uncompr_bytes);
327  fclose(fp);
328  return array;
329  }
330  else {
331  //skip past the data
332  uint32_t size = *(uint32_t*) &local_header[22];
333  fseek(fp,size,SEEK_CUR);
334  }
335  }
336 
337  fclose(fp);
338 
339  //if we get here, we haven't found the variable in the file
340  throw std::runtime_error("npz_load: Variable name "+varname+" not found in "+fname);
341 }
342 
344 
345  FILE* fp = fopen(fname.c_str(), "rb");
346 
347  if(!fp) throw std::runtime_error("npy_load: Unable to open file "+fname);
348 
349  NpyArray arr = load_the_npy_file(fp);
350 
351  fclose(fp);
352  return arr;
353 }
354 
355 
356 
std::string string
Definition: nybbler.cc:12
char map_type(const std::type_info &t)
Definition: cnpy.cxx:21
cnpy::NpyArray load_the_npy_file(FILE *fp)
Definition: cnpy.cxx:195
void parse_zip_footer(FILE *fp, uint16_t &nrecs, size_t &global_header_size, size_t &global_header_offset)
Definition: cnpy.cxx:162
npz_t npz_load(std::string fname)
Definition: cnpy.cxx:246
cnpy::NpyArray load_the_npz_array(FILE *fp, uint32_t compr_bytes, uint32_t uncompr_bytes)
Definition: cnpy.cxx:208
unsigned short uint16_t
Definition: stdint.h:125
NpyArray npy_load(std::string fname)
Definition: cnpy.cxx:343
std::map< std::string, NpyArray > npz_t
Definition: cnpy.h:63
decltype(auto) constexpr size(T &&obj)
ADL-aware version of std::size.
Definition: StdUtils.h:87
auto array(Array const &a)
Returns a manipulator which will print the specified array.
Definition: DumpUtils.h:228
unsigned int uint32_t
Definition: stdint.h:126
char BigEndianTest()
Definition: cnpy.cxx:16
std::string str(const std::pair< Type, Type > &tt)
Definition: test_pimpos.cxx:12
void parse_npy_header(FILE *fp, size_t &word_size, std::vector< size_t > &shape, bool &fortran_order)
Definition: cnpy.cxx:107
byte_as<> byte
Type of data size stored in bytes, in long long precision.
Definition: datasize.h:98
std::vector< char > & operator+=(std::vector< char > &lhs, const T rhs)
Definition: cnpy.h:75
list x
Definition: train.py:276
T * data()
Definition: cnpy.h:37
size_t num_bytes() const
Definition: cnpy.h:52