doxyindexer.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  *
3  * Copyright (C) 1997-2015 by Dimitri van Heesch.
4  *
5  * Permission to use, copy, modify, and distribute this software and its
6  * documentation under the terms of the GNU General Public License is hereby
7  * granted. No representations are made about the suitability of this software
8  * for any purpose. It is provided "as is" without express or implied warranty.
9  * See the GNU General Public License for more details.
10  *
11  * Documents produced by Doxygen are derivative works derived from the
12  * input used in their production; they are not affected by this license.
13  *
14  */
15 
16 // STL includes
17 #include <cstdio>
18 #include <cstdlib>
19 #include <iostream>
20 #include <string>
21 #include <algorithm>
22 #include <sstream>
23 
24 // Qtools includes
25 #include <qregexp.h>
26 #include <qxml.h>
27 #include <qfile.h>
28 #include <qfileinfo.h>
29 
30 // Xapian include
31 #include <xapian.h>
32 
33 #define MAX_TERM_LENGTH 245
34 
35 #if defined(_WIN32) && !defined(__CYGWIN__)
36 static char pathSep = '\\';
37 #else
38 static char pathSep = '/';
39 #endif
40 
41 static void safeAddTerm(const std::string &term,Xapian::Document &doc,int wfd)
42 {
43  if (term.length()<=MAX_TERM_LENGTH) doc.add_term(term,wfd);
44 }
45 
/** Trims \a whitespace characters from the start and end of string \a str.
 *
 *  @param str         the string to trim.
 *  @param whitespace  the set of characters that count as whitespace.
 *  @return a copy of \a str without leading and trailing whitespace
 *          characters; the empty string if \a str holds nothing else.
 */
static std::string trim(const std::string& str,
                        const std::string& whitespace = " \t")
{
  const size_t strBegin = str.find_first_not_of(whitespace);
  if (strBegin == std::string::npos)
    return ""; // no content

  // At least one non-whitespace character exists, so this search succeeds.
  const size_t strEnd = str.find_last_not_of(whitespace);

  // Keep the length as size_t; storing it in an int would narrow it.
  const size_t strRange = strEnd - strBegin + 1;

  return str.substr(strBegin, strRange);
}
59 
60 /** trims \a whitespace from start and end and replace occurrences of
61  * \a whitespace with \a fill.
62  */
64  const std::string& fill = " ",
65  const std::string& whitespace = " \t")
66 {
67  // trim first
68  std::string result = trim(str, whitespace);
69 
70  // replace sub ranges
71  size_t beginSpace = result.find_first_of(whitespace);
72  while (beginSpace != std::string::npos)
73  {
74  size_t endSpace = result.find_first_not_of(whitespace, beginSpace);
75  int range = endSpace - beginSpace;
76 
77  result.replace(beginSpace, range, fill);
78 
79  size_t newStart = beginSpace + fill.length();
80  beginSpace = result.find_first_of(whitespace, newStart);
81  }
82 
83  return result;
84 }
85 
86 /** Adds all words in \a s to document \a doc with weight \a wfd */
87 static void addWords(const std::string &s,Xapian::Document &doc,int wfd)
88 {
89  std::istringstream iss(s);
90  std::istream_iterator<std::string> begin(iss),end,it;
91  for (it=begin;it!=end;++it)
92  {
93  std::string word = *it;
94  std::string lword = word;
95  std::transform(lword.begin(), lword.end(), lword.begin(), ::tolower);
96  safeAddTerm(word,doc,wfd);
97  if (lword!=word)
98  {
99  safeAddTerm(lword,doc,wfd);
100  }
101  }
102 }
103 
104 /** Adds all identifiers in \a s to document \a doc with weight \a wfd */
105 static void addIdentifiers(const std::string &s,Xapian::Document &doc,int wfd)
106 {
107  QRegExp re("[A-Z_a-z][A-Z_a-z0-9]*");
108  int i,l,p=0;
109  QCString qs = s.c_str();
110  while ((i=re.match(qs,p,&l))!=-1)
111  {
112  safeAddTerm(qs.mid(p,i-p).data(),doc,wfd);
113  p=i+l;
114  }
115 }
116 
/** Replaces all occurrences of \a old with \a repl in string \a str.
 *
 *  The replacement is done in place and the search resumes after the inserted
 *  text, so occurrences of \a old inside \a repl are not replaced again.
 *  An empty \a old leaves \a str untouched.
 */
static void replace_all(std::string& str, const std::string& old, const std::string& repl)
{
  // An empty pattern matches at every position, so the loop below would
  // never terminate.
  if (old.empty()) return;

  size_t pos = 0;
  while ((pos = str.find(old, pos)) != std::string::npos)
  {
    str.replace(pos, old.length(), repl);
    pos += repl.length();
  }
}
127 
128 /** Replaces all XML entities in \a s with their unescaped representation */
130 {
132  replace_all(result,"&gt;",">");
133  replace_all(result,"&lt;","<");
134  replace_all(result,"&apos;","'");
135  replace_all(result,"&quot;","\"");
136  replace_all(result,"&amp;","&");
137  return result;
138 }
139 
140 /** This class is a wrapper around SAX style XML parser, which
141  * parses the file without first building a DOM tree in memory.
142  */
144 {
145  public:
146  /** Handler for parsing XML data */
148  : m_db((path+"doxysearch.db").utf8().data(),Xapian::DB_CREATE_OR_OVERWRITE),
149  m_stemmer("english")
150  {
152  m_indexer.set_stemmer(m_stemmer);
153  m_indexer.set_document(m_doc);
154  }
155 
156  /** Free data handler */
158  {
159  m_db.commit();
160  }
161 
162  private:
164  {
169  TagField = 4,
170  UrlField = 5,
173  };
174 
175  /** Handler for a start tag. Called for <doc> and <field> tags */
176  bool startElement(const QString &, const QString &,
177  const QString &name, const QXmlAttributes &attrib)
178  {
179  m_data="";
180  if (name=="field")
181  {
182  QString fieldName = attrib.value("name");
183  if (fieldName=="type") m_curFieldName=TypeField;
184  else if (fieldName=="name") m_curFieldName=NameField;
185  else if (fieldName=="args") m_curFieldName=ArgsField;
186  else if (fieldName=="tag") m_curFieldName=TagField;
187  else if (fieldName=="url") m_curFieldName=UrlField;
188  else if (fieldName=="keywords") m_curFieldName=KeywordField;
189  else if (fieldName=="text") m_curFieldName=TextField;
191  }
192  return TRUE;
193  }
194 
195  /** Handler for an end tag. Called for </doc> and </field> tags */
196  bool endElement(const QString &, const QString &, const QString &name)
197  {
198  if (name=="doc") // </doc>
199  {
200  std::string term = m_doc.get_value(NameField);
201  std::string partTerm;
202  size_t pos = term.rfind("::");
203  if (pos!=std::string::npos)
204  {
205  partTerm = term.substr(pos+2);
206  }
207  if (m_doc.get_value(TypeField)=="class" ||
208  m_doc.get_value(TypeField)=="file" ||
209  m_doc.get_value(TypeField)=="namespace") // containers get highest prio
210  {
211  safeAddTerm(term,m_doc,1000);
212  if (!partTerm.empty())
213  {
214  safeAddTerm(partTerm,m_doc,500);
215  }
216  }
217  else // members and others get lower prio
218  {
219  safeAddTerm(m_doc.get_value(NameField),m_doc,100);
220  if (!partTerm.empty())
221  {
222  safeAddTerm(partTerm,m_doc,50);
223  }
224  }
225  m_db.add_document(m_doc);
226  m_doc.clear_values();
227  m_doc.clear_terms();
228  }
229  else if (name=="field" && m_curFieldName!=UnknownField) // </field>
230  {
231  // strip whitespace from m_data
232  m_data = reduce(m_data);
233  // replace XML entities
235  // add data to the document
236  m_doc.add_value(m_curFieldName,m_data);
237  switch (m_curFieldName)
238  {
239  case TypeField:
240  case NameField:
241  case TagField:
242  case UrlField:
243  // meta data that is not searchable
244  break;
245  case KeywordField:
246  addWords(m_data,m_doc,50);
247  break;
248  case ArgsField:
250  break;
251  case TextField:
252  addWords(m_data,m_doc,2);
253  break;
254  default:
255  break;
256  }
257  m_data="";
259  }
260  // reset m_data
261  return TRUE;
262  }
263 
264  /** Handler for inline text */
265  bool characters(const QString& ch)
266  {
267  m_data += ch.utf8();
268  return TRUE;
269  }
270 
271  // internal state
272  Xapian::WritableDatabase m_db;
273  Xapian::Document m_doc;
274  Xapian::TermGenerator m_indexer;
275  Xapian::Stem m_stemmer;
278 };
279 
280 /** Class for handling error during XML parsing */
282 {
283  public:
284  virtual ~XMLErrorHandler() {}
285  bool warning( const QXmlParseException & )
286  {
287  return FALSE;
288  }
289  bool error( const QXmlParseException & )
290  {
291  return FALSE;
292  }
294  {
295  std::cerr << "Fatal error at line " << exception.lineNumber()
296  << " column " << exception.columnNumber() << ": "
297  << exception.message().utf8() << std::endl;
298  return FALSE;
299  }
300  QString errorString() { return ""; }
301 
302  private:
304 };
305 
/** Writes the command line synopsis for program \a name to standard error
 *  and terminates the process with exit code 1.
 */
static void usage(const char *name)
{
  std::cerr << "Usage: ";
  std::cerr << name;
  std::cerr << " [-o output_dir] searchdata.xml [searchdata2.xml ...]";
  std::cerr << std::endl;
  exit(1);
}
311 
312 /** main function to index data */
313 int main(int argc,const char **argv)
314 {
315  if (argc<2)
316  {
317  usage(argv[0]);
318  }
319  QString outputDir;
320  for (int i=1;i<argc;i++)
321  {
322  if (std::string(argv[i])=="-o")
323  {
324  if (i>=argc-1)
325  {
326  std::cerr << "Error: missing parameter for -o option" << std::endl;
327  usage(argv[0]);
328  }
329  else
330  {
331  i++;
332  outputDir=argv[i];
333  QFileInfo fi(outputDir);
334  if (!fi.exists() || !fi.isDir())
335  {
336  std::cerr << "Error: specified output directory does not exist!" << std::endl;
337  usage(argv[0]);
338  }
339  }
340  }
341  else if (std::string(argv[i])=="-h" || std::string(argv[i])=="--help")
342  {
343  usage(argv[0]);
344  }
345  }
346 
347  try
348  {
349  if (!outputDir.isEmpty() && outputDir.at(outputDir.length()-1)!=pathSep)
350  {
351  outputDir+=pathSep;
352  }
353  XMLContentHandler handler(outputDir);
354  XMLErrorHandler errorHandler;
355  for (int i=1;i<argc;i++)
356  {
357  if (std::string(argv[i])=="-o")
358  {
359  i++;
360  }
361  else
362  {
363  QString xmlFileName = argv[i];
364  std::cout << "Processing " << xmlFileName.utf8() << "..." << std::endl;
365  QFile xmlFile(xmlFileName);
366  QXmlInputSource source(xmlFile);
367  QXmlSimpleReader reader;
368  reader.setContentHandler(&handler);
369  reader.setErrorHandler(&errorHandler);
370  reader.parse(source);
371  }
372  }
373  }
374  catch(const Xapian::Error &e)
375  {
376  std::cerr << "Caught exception: " << e.get_description() << std::endl;
377  }
378  catch(...)
379  {
380  std::cerr << "Caught an unknown exception" << std::endl;
381  }
382 
383  return 0;
384 }
static QCString name
Definition: declinfo.cpp:673
end
while True: pbar.update(maxval-len(onlies[E][S])) #print iS, "/", len(onlies[E][S]) found = False for...
static std::string reduce(const std::string &str, const std::string &fill=" ", const std::string &whitespace=" \t")
Definition: doxyindexer.cpp:63
bool characters(const QString &ch)
static void replace_all(std::string &str, const std::string &old, const std::string &repl)
bool isEmpty() const
Definition: qstring.h:682
static std::string trim(const std::string &str, const std::string &whitespace=" \t")
Definition: doxyindexer.cpp:47
static QCString result
The QRegExp class provides pattern matching using regular expressions or wildcards.
Definition: qregexp.h:46
std::string string
Definition: nybbler.cc:12
static void addIdentifiers(const std::string &s, Xapian::Document &doc, int wfd)
XMLContentHandler(const QString &path)
bool parse(const QXmlInputSource &input)
Definition: qxml.cpp:2077
const bool FALSE
Definition: qglobal.h:370
int columnNumber() const
Definition: qxml.cpp:231
QString errorString()
int main(int argc, const char **argv)
The QString class provides an abstraction of Unicode text and the classic C null-terminated char arra...
Definition: qstring.h:350
void setErrorHandler(QXmlErrorHandler *handler)
Definition: qxml.cpp:2051
The QXmlErrorHandler class provides an interface to report errors in XML data.
Definition: qxml.h:420
QString message() const
Definition: qxml.cpp:224
bool startElement(const QString &, const QString &, const QString &name, const QXmlAttributes &attrib)
QString value(int index) const
Definition: qxml.cpp:664
Xapian::Stem m_stemmer
static QStrList * l
Definition: config.cpp:1044
Xapian::TermGenerator m_indexer
The QXmlSimpleReader class provides an implementation of a simple XML reader (i.e. parser).
Definition: qxml.h:238
#define MAX_TERM_LENGTH
Definition: doxyindexer.cpp:33
void setContentHandler(QXmlContentHandler *handler)
Definition: qxml.cpp:2043
const double e
QChar at(uint i) const
Definition: qstring.h:492
bool warning(const QXmlParseException &)
nvidia::inferenceserver::client::Error Error
Definition: triton_utils.h:15
uint length() const
Definition: qstring.h:679
Xapian::WritableDatabase m_db
Xapian::Document m_doc
The QXmlDefaultHandler class provides a default implementation of all XML handler classes...
Definition: qxml.h:472
p
Definition: test.py:223
const char * data() const
Definition: qcstring.h:207
std::string m_data
QCString mid(uint index, uint len=0xffffffff) const
Definition: qcstring.cpp:246
int match(const QCString &str, int index=0, int *len=0, bool indexIsStart=TRUE) const
Definition: qregexp.cpp:649
The QFile class is an I/O device that operates on files.
Definition: qfile.h:50
static void usage(const char *name)
static void safeAddTerm(const std::string &term, Xapian::Document &doc, int wfd)
Definition: doxyindexer.cpp:41
bool error(const QXmlParseException &)
def fill(s)
Definition: translator.py:93
virtual ~XMLErrorHandler()
QCString doc
The QXmlAttributes class provides XML attributes.
Definition: qxml.h:128
static msg_handler handler
Definition: qglobal.cpp:234
decltype(auto) constexpr begin(T &&obj)
ADL-aware version of std::begin.
Definition: StdUtils.h:72
static void addWords(const std::string &s, Xapian::Document &doc, int wfd)
Definition: doxyindexer.cpp:87
FieldNames m_curFieldName
The QXmlInputSource class is the source where XML data is read from.
Definition: qxml.h:162
bool fatalError(const QXmlParseException &exception)
The QFileInfo class provides system-independent file information.
Definition: qfileinfo.h:51
bool endElement(const QString &, const QString &, const QString &name)
QCString utf8() const
Definition: qstring.cpp:14507
int lineNumber() const
Definition: qxml.cpp:238
static QCString * s
Definition: config.cpp:1042
bool isDir() const
union ptb::content::word::word word
const bool TRUE
Definition: qglobal.h:371
static QCString str
cet::coded_exception< error, detail::translate > exception
Definition: exception.h:33
QTextStream & endl(QTextStream &s)
The QXmlParseException class is used to report errors with the QXmlErrorHandler interface.
Definition: qxml.h:185
bool exists() const
Definition: qfileinfo.cpp:265
static char pathSep
Definition: doxyindexer.cpp:38
static std::string unescapeXmlEntities(const std::string &s)