LibOFX
ofx_preproc.cpp
Go to the documentation of this file.
1 /***************************************************************************
2  ofx_preproc.cpp
3  -------------------
4  copyright : (C) 2002 by Benoit Grégoire
5  email : benoitg@coeus.ca
6 ***************************************************************************/
12 /***************************************************************************
13  * *
14  * This program is free software; you can redistribute it and/or modify *
15  * it under the terms of the GNU General Public License as published by *
16  * the Free Software Foundation; either version 2 of the License, or *
17  * (at your option) any later version. *
18  * *
19  ***************************************************************************/
#include "../config.h"
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
#include <sstream>
#include <string>
#include <vector>
#ifdef _WIN32
# include <io.h>     // close()
#else
# include <unistd.h> // close()
#endif
#include "ParserEventGeneratorKit.h"
#include "libofx.h"
#include "messages.hh"
#include "ofx_sgml.hh"
#include "ofc_sgml.hh"
#include "ofx_preproc.hh"
#include "ofx_utilities.hh"
#ifdef HAVE_ICONV
#include <iconv.h>
#endif
37 
38 #ifdef _WIN32
39 # define DIRSEP "\\"
40 #else
41 # define DIRSEP "/"
42 #endif
43 
44 #ifdef _WIN32
45 # include "win32.hh"
46 # include <windows.h> // for GetModuleFileName()
47 # undef ERROR
48 # undef DELETE
49 #endif
50 
51 #define LIBOFX_DEFAULT_INPUT_ENCODING "CP1252"
52 #define LIBOFX_DEFAULT_OUTPUT_ENCODING "UTF-8"
53 
/**
 * \brief The number of different paths to search for DTDs.
 *
 * Must match the number of entries in DTD_SEARCH_PATH below: one more entry
 * is present when the build defines MAKEFILE_DTD_PATH.
 */
#ifdef MAKEFILE_DTD_PATH
const int DTD_SEARCH_PATH_NUM = 4;
#else
const int DTD_SEARCH_PATH_NUM = 3;
#endif

/**
 * \brief The list of paths to search for the DTDs.
 *
 * Entries are tried in order by find_dtd(). Each entry is used verbatim as a
 * directory name; in particular the "~" entry is not tilde-expanded by this
 * file.
 */
const char *DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM] =
{
#ifdef MAKEFILE_DTD_PATH
  MAKEFILE_DTD_PATH,
#endif
  "/usr/local/share/libofx/dtd",
  "/usr/share/libofx/dtd",
  "~"
};
75 
80 int ofx_proc_file(LibofxContextPtr ctx, const char * p_filename)
81 {
82  LibofxContext *libofx_context;
83  bool ofx_start = false;
84  bool ofx_end = false;
85  bool file_is_xml = false;
86  bool used_iconv = false;
87  std::ifstream input_file;
88  std::ofstream tmp_file;
89  char *filenames[3];
90  char tmp_filename[256];
91  int tmp_file_fd;
92 #ifdef HAVE_ICONV
93  iconv_t conversion_descriptor;
94 #endif
95  libofx_context = (LibofxContext*)ctx;
96 
97  if (p_filename != NULL && strcmp(p_filename, "") != 0)
98  {
99  message_out(DEBUG, std::string("ofx_proc_file():Opening file: ") + p_filename);
100 
101  input_file.open(p_filename);
102  if (!input_file)
103  {
104  message_out(ERROR, "ofx_proc_file():Unable to open the input file " + std::string(p_filename));
105  }
106 
107  mkTempFileName("libofxtmpXXXXXX", tmp_filename, sizeof(tmp_filename));
108 
109  message_out(DEBUG, "ofx_proc_file(): Creating temp file: " + std::string(tmp_filename));
110 #ifdef _WIN32
111  tmp_file_fd = mkstemp_win32(tmp_filename);
112 #else
113  tmp_file_fd = mkstemp(tmp_filename);
114 #endif
115  if (tmp_file_fd)
116  {
117  tmp_file.open(tmp_filename);
118  if (!tmp_file)
119  {
120  message_out(ERROR, "ofx_proc_file():Unable to open the created temp file " + std::string(tmp_filename));
121  return -1;
122  }
123  }
124  else
125  {
126  message_out(ERROR, "ofx_proc_file():Unable to create a temp file at " + std::string(tmp_filename));
127  return -1;
128  }
129 
130  if (input_file && tmp_file)
131  {
132  std::size_t header_separator_idx;
133  std::string header_name;
134  std::string header_value;
135  std::string ofx_encoding;
136  std::string ofx_charset;
137  do
138  {
139  std::stringbuf buffer;
140  std::string s_buffer;
141  input_file.get(buffer, '\n');
142  //cout<< "got: \"" << buffer<<"\"\n";
143  s_buffer = buffer.str();
144 
145  // Watch out: If input_file is in eof(), any subsequent read or
146  // peek() will fail and we must exit this loop.
147  if (!input_file.eof())
148  {
149  //cout<<"input_file.gcount(): "<<input_file.gcount()<< " s_buffer.size=" << s_buffer.size()<<" sizeof(buffer): "<<sizeof(buffer) << " peek=\"" << int(input_file.peek()) << "\"" <<endl;
150  if (input_file.fail()) // If no characters were extracted above, the failbit is set.
151  {
152  // No characters extracted means that we've reached the newline
153  // delimiter (because we already checked for EOF). We will check
154  // for and remove that newline in the next if-clause, but must
155  // remove the failbit so that peek() will work again.
156  input_file.clear();
157  }
158 
159  // Is the next character really the newline?
160  if (input_file.peek() == '\n')
161  {
162  // Yes. Then discard that newline character from the stream
163  input_file.get();
164  }
165  }
166 
167  if (ofx_start == false && (s_buffer.find("<?xml") != std::string::npos))
168  {
169  message_out(DEBUG, "ofx_proc_file(): File is an actual XML file, iconv conversion will be skipped.");
170  file_is_xml = true;
171  }
172 
173  std::size_t ofx_start_idx;
174  if (ofx_start == false)
175  {
176  if (
177  (libofx_context->currentFileType() == OFX &&
178  ((ofx_start_idx = s_buffer.find("<OFX>")) != std::string::npos ||
179  (ofx_start_idx = s_buffer.find("<ofx>")) != std::string::npos))
180  ||
181  (libofx_context->currentFileType() == OFC &&
182  ((ofx_start_idx = s_buffer.find("<OFC>")) != std::string::npos ||
183  (ofx_start_idx = s_buffer.find("<ofc>")) != std::string::npos))
184  )
185  {
186  ofx_start = true;
187  if (file_is_xml == false)
188  {
189  s_buffer.erase(0, ofx_start_idx); //Fix for really broken files that don't have a newline after the header.
190  }
191  message_out(DEBUG, "ofx_proc_file():<OFX> or <OFC> has been found");
192 
193  static char sp_charset_fixed[] = "SP_CHARSET_FIXED=1";
194  if (putenv(sp_charset_fixed) != 0)
195  {
196  message_out(ERROR, "ofx_proc_file(): putenv failed");
197  }
198 #define OPENSP_UTF8_WARNING_TEXT "ofx_proc_file(): OpenSP cannot process an UTF-8 XML file without garbling it. Furthermore, on windows the support for UTF-8 encode SGML files is broken. This is worked around by forcing a single byte encoding. If the file is indeed UTF-8, it should pass through unmolested, but you will likely get 'non SGML character number' errors, even though the output is correct."
199  if (file_is_xml == true)
200  {
201  /* Normally the following would be "SP_ENCODING=xml".
202  * Unfortunately, opensp's generic api will garble UTF-8 if this
203  * is set to xml. So we set a single byte encoding that uses most
204  * values to avoid messing up the UTF-8.
205  * Unfortunately this means that non-UTF-8 files will not
206  * get properly translated. We'd need to manually detect the
207  * encoding in the XML header and convert the xml with iconv like
208  * we do for SGML to work around the problem. Most unfortunate. */
209  message_out(WARNING, OPENSP_UTF8_WARNING_TEXT);
210  static char sp_encoding[] = "SP_ENCODING=ms-dos";
211  if (putenv(sp_encoding) != 0)
212  {
213  message_out(ERROR, "ofx_proc_file(): putenv failed");
214  }
215  }
216  else
217  {
218  static char sp_encoding[] = "SP_ENCODING=ms-dos"; // Like the above, force a single byte encoding in every case, we don't want opensp messing up UTF-8
219  if (putenv(sp_encoding) != 0)
220  {
221  message_out(ERROR, "ofx_proc_file(): putenv failed");
222  }
223 #ifdef HAVE_ICONV
224  std::string fromcode;
225  std::string tocode;
226  if (ofx_encoding.compare("USASCII") == 0)
227  {
228  if (ofx_charset.compare("ISO-8859-1") == 0 || ofx_charset.compare("8859-1") == 0)
229  {
230  //Only "ISO-8859-1" is actually a legal value, but since the banks follows the spec SO well...
231  fromcode = "ISO-8859-1";
232  }
233  else if (ofx_charset.compare("1252") == 0 || ofx_charset.compare("CP1252") == 0)
234  {
235  //Only "1252" is actually a legal value, but since the banks follows the spec SO well...
236  fromcode = "CP1252";
237  }
238  else if (ofx_charset.compare("NONE") == 0)
239  {
240  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
241  }
242  else
243  {
244  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
245  }
246  }
247  else if (ofx_encoding.compare("UTF-8") == 0 || ofx_encoding.compare("UNICODE") == 0)
248  {
249  //While "UNICODE" isn't a legal value, some cyrilic files do specify it as such...
250  fromcode = "UTF-8";
251  message_out(WARNING, OPENSP_UTF8_WARNING_TEXT);
252  }
253  else
254  {
255  fromcode = LIBOFX_DEFAULT_INPUT_ENCODING;
256  }
257  tocode = LIBOFX_DEFAULT_OUTPUT_ENCODING;
258  message_out(DEBUG, "ofx_proc_file(): Setting up iconv for fromcode: " + fromcode + ", tocode: " + tocode);
259  conversion_descriptor = iconv_open (tocode.c_str(), fromcode.c_str());
260  used_iconv = true;
261 #endif
262  }
263  }
264  else
265  {
266  //We are still in the headers
267  if ((header_separator_idx = s_buffer.find(':')) != std::string::npos)
268  {
269  //Header processing
270  header_name.assign(s_buffer.substr(0, header_separator_idx));
271  header_value.assign(s_buffer.substr(header_separator_idx + 1));
272  while ( header_value.length() > 0 &&
273  ( header_value[header_value.length() - 1 ] == '\n' ||
274  header_value[header_value.length() - 1 ] == '\r' ))
275  header_value.erase(header_value.length() - 1);
276  message_out(DEBUG, "ofx_proc_file():Header: " + header_name + " with value: " + header_value + " has been found");
277  if (header_name.compare("ENCODING") == 0)
278  {
279  ofx_encoding.assign(header_value);
280  }
281  if (header_name.compare("CHARSET") == 0)
282  {
283  ofx_charset.assign(header_value);
284  }
285  }
286  }
287  }
288 
289  if (file_is_xml == true || (ofx_start == true && ofx_end == false))
290  {
291  if (ofx_start == true)
292  {
293  /* The above test won't help us if the <OFX> tag is on the same line
294  * as the xml header, but as opensp can't be used to parse it anyway
295  * this isn't a great loss for now.
296  */
297  s_buffer = sanitize_proprietary_tags(s_buffer);
298  if (s_buffer.empty())
299  continue;
300  }
301  //cout<< s_buffer<<"\n";
302  if (file_is_xml == false)
303  {
304 #ifdef HAVE_ICONV
305  size_t inbytesleft = s_buffer.size();
306  size_t outbytesleft = inbytesleft * 2 - 1;
307  char * iconv_buffer = (char*) malloc (inbytesleft * 2);
308  memset(iconv_buffer, 0, inbytesleft * 2);
309  const char* inchar = s_buffer.c_str();
310  char * outchar = iconv_buffer;
311  int iconv_retval = iconv (conversion_descriptor,
312 #ifdef HAVE_ICONV_CONST
313  &inchar,
314 #else
315  const_cast<char**>(&inchar),
316 #endif
317  &inbytesleft, &outchar, &outbytesleft);
318  if (iconv_retval == -1)
319  {
320  message_out(ERROR, "ofx_proc_file(): Iconv conversion error");
321  }
322  // All validly converted bytes will be copied to the
323  // original buffer
324  s_buffer = std::string(iconv_buffer, outchar - iconv_buffer);
325  free (iconv_buffer);
326 #endif
327  }
328  //cout << s_buffer << "\n";
329  tmp_file << s_buffer << std::endl;
330  }
331 
332  if (ofx_start == true &&
333  (
334  (libofx_context->currentFileType() == OFX &&
335  ((ofx_start_idx = s_buffer.find("</OFX>")) != std::string::npos ||
336  (ofx_start_idx = s_buffer.find("</ofx>")) != std::string::npos))
337  || (libofx_context->currentFileType() == OFC &&
338  ((ofx_start_idx = s_buffer.find("</OFC>")) != std::string::npos ||
339  (ofx_start_idx = s_buffer.find("</ofc>")) != std::string::npos))
340  )
341  )
342  {
343  ofx_end = true;
344  message_out(DEBUG, "ofx_proc_file():</OFX> or </OFC> has been found");
345  }
346 
347  }
348  while (!input_file.eof() && !input_file.bad());
349  }
350  input_file.close();
351  tmp_file.close();
352 #ifdef HAVE_ICONV
353  if (used_iconv == true)
354  {
355  iconv_close(conversion_descriptor);
356  }
357 #endif
358  char filename_openspdtd[255];
359  char filename_dtd[255];
360  char filename_ofx[255];
361  STRNCPY(filename_openspdtd, find_dtd(ctx, OPENSPDCL_FILENAME)); //The opensp sgml dtd file
362  if (libofx_context->currentFileType() == OFX)
363  {
364  STRNCPY(filename_dtd, find_dtd(ctx, OFX160DTD_FILENAME)); //The ofx dtd file
365  }
366  else if (libofx_context->currentFileType() == OFC)
367  {
368  STRNCPY(filename_dtd, find_dtd(ctx, OFCDTD_FILENAME)); //The ofc dtd file
369  }
370  else
371  {
372  message_out(ERROR, std::string("ofx_proc_file(): Error unknown file format for the OFX parser"));
373  }
374 
375  if ((std::string)filename_dtd != "" && (std::string)filename_openspdtd != "")
376  {
377  strncpy(filename_ofx, tmp_filename, 255); //The processed ofx file
378  filenames[0] = filename_openspdtd;
379  filenames[1] = filename_dtd;
380  filenames[2] = filename_ofx;
381  int rv;
382  if (libofx_context->currentFileType() == OFX)
383  {
384  rv = ofx_proc_sgml(libofx_context, 3, filenames);
385  }
386  else if (libofx_context->currentFileType() == OFC)
387  {
388  rv = ofc_proc_sgml(libofx_context, 3, filenames);
389  }
390  else
391  {
392  message_out(ERROR, std::string("ofx_proc_file(): Error unknown file format for the OFX parser"));
393  rv = -1;
394  }
395  if (remove(tmp_filename) != 0)
396  {
397  message_out(ERROR, "ofx_proc_file(): Error deleting temporary file " + std::string(tmp_filename));
398  }
399  return rv;
400  }
401  else
402  {
403  message_out(ERROR, "ofx_proc_file(): FATAL: Missing DTD, aborting");
404  return -1;
405  }
406  }
407  else
408  {
409  message_out(ERROR, "ofx_proc_file():No input file specified");
410  return -1;
411  }
412  return 0;
413 }
414 
415 /* Searches input string for an opening or closing tag starting from pos_start.
416  * If found will return the tag_name and pos_start will be set to the string
417  * of the starting <, pos_end to the position after the closing '>'
418  * If the tag doesn't have a closing '>', pos_end will be set to string::npos.
419  */
/* Looks for the next '<' in input_string at or after pos_start.
 *
 * When a '<' is found, pos_start is moved onto it and the text between the
 * '<' and the following '>' is returned (without the brackets); pos_end is
 * set to the index just past that '>'. If there is no '>' after the '<', the
 * whole remainder of the string after the '<' is returned and pos_end is set
 * to string::npos.
 *
 * When no '<' is found, both pos_start and pos_end are set to string::npos
 * and an empty string is returned.
 */
static std::string find_tag_open (std::string& input_string, size_t& pos_start, size_t& pos_end)
{
  pos_start = input_string.find('<', pos_start);
  if (pos_start == std::string::npos)
  {
    pos_end = std::string::npos;
    return std::string();
  }

  const size_t name_begin = pos_start + 1;
  const size_t closing_bracket = input_string.find('>', name_begin);
  if (closing_bracket == std::string::npos)
  {
    // Unterminated tag: everything after the '<' counts as the name.
    pos_end = std::string::npos;
    return input_string.substr(name_begin);
  }

  pos_end = closing_bracket + 1;
  return input_string.substr(name_begin, closing_bracket - name_begin);
}
436 
437 /* Searches input string for a closing tag matching tag_name starting at pos.
438  * If found pos will be set to the position right after of the closing '>'
439  * If no matching closing tag is found pos will be set to the start of the next
440  * opening or closing tag found.
441  */
442 static void find_tag_close (std::string& input_string, std::string& tag_name, size_t& pos)
443 {
444  size_t start_idx = input_string.find ("</" + tag_name + ">", pos);
445 
446  if (start_idx == std::string::npos)
447  {
448  start_idx = pos;
449  size_t end_idx;
450  std::string new_tag_name = find_tag_open (input_string, start_idx, end_idx);
451  if (!new_tag_name.empty())
452  {
453  message_out(DEBUG, "find_tag_close() fell back to next open tag: " + new_tag_name);
454  // find_tag_open returns the *end* of an opening tag, but in this
455  // case we want its start, so we need to rewind a bit..
456  pos = start_idx;
457  //printf("find_tag_close() returning pos after fallback: %d\n",pos);
458  }
459  else
460  {
461  pos = input_string.length();
462  }
463  }
464  else
465  {
466  pos = start_idx + tag_name.length() + 3;
467  }
468  return;
469 }
470 
471 
483 std::string sanitize_proprietary_tags(std::string input_string)
484 {
485  size_t last_known_good_pos = 0;
486  size_t open_tag_start_pos = last_known_good_pos;
487  size_t open_tag_end_pos;
488  size_t close_tag_end_pos;
489 
490  std::string tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
491  while (!tag_name.empty())
492  {
493  // Determine whether the current tag is proprietary.
494  if ((tag_name.find('.') != std::string::npos) || // tag has a . in the name
495  (tag_name == "CATEGORY")) // Chase bank started setting these in 2017
496  {
497  close_tag_end_pos = open_tag_end_pos;
498  find_tag_close (input_string, tag_name, close_tag_end_pos);
499  size_t tag_size = close_tag_end_pos - open_tag_start_pos;
500  std::string prop_tag = input_string.substr(open_tag_start_pos, tag_size);
501  message_out(INFO, "sanitize_proprietary_tags() removed: " + prop_tag);
502  input_string.erase(open_tag_start_pos, tag_size);
503  last_known_good_pos = open_tag_start_pos;
504  }
505  else
506  {
507  last_known_good_pos = open_tag_end_pos;
508  }
509  tag_name.clear();
510  open_tag_start_pos = last_known_good_pos;
511  if (last_known_good_pos != std::string::npos)
512  tag_name = find_tag_open(input_string, open_tag_start_pos, open_tag_end_pos);
513  }
514  return input_string;
515 }
516 
517 
#ifdef _WIN32
/**
 * \brief Builds the DTD directory path relative to the running executable.
 *
 * Partial implementation of
 * http://developer.gnome.org/doc/API/2.0/glib/glib-Windows-Compatibility-Functions.html#g-win32-get-package-installation-directory
 *
 * Takes the module file name of the current process, removes the file name
 * component, removes a trailing "bin" or "lib" directory (compared without
 * regard to case) if present, and appends "\share\libofx\dtd".
 *
 * \return the resulting path, or an empty string if the module file name
 *         could not be retrieved or did not fit in MAX_PATH characters.
 */
static std::string get_dtd_installation_directory()
{
  char ch_fn[MAX_PATH];

  // Use the ANSI entry point explicitly: the buffer is a char array, so the
  // TCHAR macro would not compile in a UNICODE build.
  const DWORD len = GetModuleFileNameA(NULL, ch_fn, MAX_PATH);
  // 0 signals failure; a result equal to the buffer size means the path was
  // truncated (and possibly not NUL-terminated), so it cannot be trusted.
  if (len == 0 || len >= MAX_PATH)
    return std::string();

  // Drop the executable's file name.
  char *p = strrchr(ch_fn, '\\');
  if (p != NULL)
    *p = '\0';

  // Drop a trailing "bin" or "lib" directory.
  p = strrchr(ch_fn, '\\');
  if (p != NULL && (_stricmp(p + 1, "bin") == 0 ||
                    _stricmp(p + 1, "lib") == 0))
    *p = '\0';

  std::string str_fn(ch_fn);
  str_fn += "\\share\\libofx\\dtd";
  return str_fn;
}
#endif
542 
543 
557 std::string find_dtd(LibofxContextPtr ctx, const std::string& dtd_filename)
558 {
559  std::string dtd_path_filename;
560  char *env_dtd_path;
561 
562  dtd_path_filename = reinterpret_cast<const LibofxContext*>(ctx)->dtdDir();
563  if (!dtd_path_filename.empty())
564  {
565  dtd_path_filename.append(dtd_filename);
566  std::ifstream dtd_file(dtd_path_filename.c_str());
567  if (dtd_file)
568  {
569  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
570  return dtd_path_filename;
571  }
572  }
573 
574 #ifdef _WIN32
575  dtd_path_filename = get_dtd_installation_directory();
576  if (!dtd_path_filename.empty())
577  {
578  dtd_path_filename.append(DIRSEP);
579  dtd_path_filename.append(dtd_filename);
580  std::ifstream dtd_file(dtd_path_filename.c_str());
581  if (dtd_file)
582  {
583  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
584  return dtd_path_filename;
585  }
586  }
587 #endif
588  /* Search in environment variable OFX_DTD_PATH */
589  env_dtd_path = getenv("OFX_DTD_PATH");
590  if (env_dtd_path)
591  {
592  dtd_path_filename = env_dtd_path;
593  dtd_path_filename.append(DIRSEP);
594  dtd_path_filename.append(dtd_filename);
595  std::ifstream dtd_file(dtd_path_filename.c_str());
596  if (!dtd_file)
597  {
598  message_out(STATUS, "find_dtd():OFX_DTD_PATH env variable was was present, but unable to open the file " + dtd_path_filename);
599  }
600  else
601  {
602  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
603  return dtd_path_filename;
604  }
605  }
606 
607  for (int i = 0; i < DTD_SEARCH_PATH_NUM; i++)
608  {
609  dtd_path_filename = DTD_SEARCH_PATH[i];
610  dtd_path_filename.append(DIRSEP);
611  dtd_path_filename.append(dtd_filename);
612  std::ifstream dtd_file(dtd_path_filename.c_str());
613  if (!dtd_file)
614  {
615  message_out(DEBUG, "find_dtd():Unable to open the file " + dtd_path_filename);
616  }
617  else
618  {
619  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
620  return dtd_path_filename;
621  }
622  }
623 
624  /* Last resort, look in source tree relative path (useful for development) */
625  dtd_path_filename = "";
626  dtd_path_filename.append("..");
627  dtd_path_filename.append(DIRSEP);
628  dtd_path_filename.append("dtd");
629  dtd_path_filename.append(DIRSEP);
630  dtd_path_filename.append(dtd_filename);
631  std::ifstream dtd_file(dtd_path_filename.c_str());
632  if (!dtd_file)
633  {
634  message_out(DEBUG, "find_dtd(): Unable to open the file " + dtd_path_filename + ", most likely we are not in the source tree.");
635  }
636  else
637  {
638  message_out(STATUS, "find_dtd():DTD found: " + dtd_path_filename);
639  return dtd_path_filename;
640  }
641 
642 
643  message_out(ERROR, "find_dtd():Unable to find the DTD named " + dtd_filename);
644  return "";
645 }
Definition: messages.hh:32
int ofx_proc_file(LibofxContextPtr ctx, const char *p_filename)
File pre-processing of OFX AND for OFC files.
Definition: ofx_preproc.cpp:80
int message_out(OfxMsgType error_type, const std::string message)
Message output function.
Definition: messages.cpp:67
const int DTD_SEARCH_PATH_NUM
The number of different paths to search for DTDs.
Definition: ofx_preproc.cpp:60
OFX/SGML parsing functionality.
const char * DTD_SEARCH_PATH[DTD_SEARCH_PATH_NUM]
The list of paths to search for the DTDs.
Definition: ofx_preproc.cpp:66
int ofc_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofc_sgml.cpp:346
Various simple functions for type conversion & al.
int ofx_proc_sgml(LibofxContext *libofx_context, int argc, char *const *argv)
Parses a DTD and OFX file(s)
Definition: ofx_sgml.cpp:444
void STRNCPY(T &dest, const std::string &src)
OFX/SGML parsing functionality.
Message IO functionality.
Preprocessing of the OFX files before parsing.
std::string find_dtd(LibofxContextPtr ctx, const std::string &dtd_filename)
Find the appropriate DTD for the file version.
std::string sanitize_proprietary_tags(std::string input_string)
Removes proprietary tags and comments.