libzypp  17.37.5
metalinkparser.cc
Go to the documentation of this file.
1 /*---------------------------------------------------------------------\
2 | ____ _ __ __ ___ |
3 | |__ / \ / / . \ . \ |
4 | / / \ V /| _/ _/ |
5 | / /__ | | | | | | |
6 | /_____||_| |_| |_| |
7 | |
8 \---------------------------------------------------------------------*/
13 #include "metalinkparser.h"
15 #include <zypp-core/base/Logger.h>
16 #include <zypp-core/ByteArray.h>
17 #include <zypp-core/AutoDispose.h>
18 
19 #include <stack>
20 #include <vector>
21 #include <algorithm>
22 
23 #include <libxml/SAX2.h>
24 
25 using namespace zypp::base;
26 
27 namespace zypp::env
28 {
30  inline bool ZYPP_METALINK_DEBUG()
31  {
32  static bool val = [](){
33  const char * env = getenv("ZYPP_METALINK_DEBUG");
34  return( env && zypp::str::strToBool( env, true ) );
35  }();
36  return val;
37  }
38 }
39 
40 namespace zypp::media {
41  enum ParserState {
60  };
61 
62  struct transition {
63  std::string elementName; //< Name of the element for the transition to trigger
64  ParserState transitionTo; //< The state we go into when the element name in \a elementName is encountered
65  int docontent; //< Store the content of the element in the \a content member
66  };
67 
73  const std::unordered_map<ParserState, std::vector<transition> > & transitions () {
74  static std::unordered_map<ParserState, std::vector<transition> > map {
75  { STATE_START, {
76  { "metalink", STATE_METALINK, 0},
77  }
78  },
79  { STATE_METALINK, {
80  { "files", STATE_FILES, 0 },
81  { "file", STATE_M4FILE, 0 },
82  }
83  },
84  { STATE_FILES, {
85  { "file", STATE_FILE, 0},
86  }
87  },
88  { STATE_FILE, {
89  { "size", STATE_SIZE, 1 },
90  { "verification", STATE_VERIFICATION, 0 },
91  { "resources", STATE_RESOURCES, 0 },
92  }
93  },
95  { "hash", STATE_HASH, 1 },
96  { "pieces", STATE_PIECES, 0 },
97  }
98  },
99  { STATE_PIECES, {
100  { "hash", STATE_PHASH, 1 },
101  }
102  },
103  { STATE_RESOURCES, {
104  { "url", STATE_URL, 1 },
105  }
106  },
107  { STATE_M4FILE, {
108  { "size", STATE_M4SIZE, 1 },
109  { "hash", STATE_M4HASH, 1},
110  { "url", STATE_M4URL, 1},
111  { "pieces", STATE_M4PIECES, 0},
112  }
113  },
114  { STATE_M4PIECES, {
115  { "hash", STATE_M4PHASH, 1 },
116  }
117  },
118  };
119 
120  return map;
121  }
122 
123 static void XMLCALL startElement(void *userData, const xmlChar *name, const xmlChar **atts);
124 static void XMLCALL endElement(void *userData, const xmlChar *name);
125 static void XMLCALL characterData(void *userData, const xmlChar *s, int len);
126 static void XMLCALL parseError(void *userData, const xmlError *error);
127 
130  : parser( nullptr )
131  , state( STATE_START )
132  , depth( 0 )
133  , statedepth( 0 )
134  , docontent( 0 )
135  , gotfile( 0 )
136  , size( -1 )
137  , blksize( 0 )
138  , piecel( 0 )
139  , chksuml( 0 )
140  {
141  content.reserve( 256 );
142 
143  xmlSAXHandler sax;
144  memset(&sax, 0, sizeof(sax));
145  sax.startElement = startElement;
146  sax.endElement = endElement;
147  sax.characters = characterData;
148 
149  //internally creates a copy of xmlSaxHandler, so having it as local variable is save
150  parser = AutoDispose<xmlParserCtxtPtr>( xmlCreatePushParserCtxt(&sax, this, NULL, 0, NULL), xmlFreeParserCtxt );
151 #ifdef HAVE_LIBXML2_XMLCTXTSETERRORHANDLER
152  xmlCtxtSetErrorHandler ( parser, parseError, this );
153 #else
154  xmlSetStructuredErrorFunc ( this, (xmlStructuredErrorFunc)parseError );
155 #endif
156  }
157 
159 #ifndef HAVE_LIBXML2_XMLCTXTSETERRORHANDLER
160  xmlSetStructuredErrorFunc ( nullptr, nullptr );
161 #endif
162  }
163 
164  void doTransition ( const transition &t ) {
165  parentStates.push( state );
166  state = t.transitionTo;
167  docontent = t.docontent;
168  statedepth = depth;
169  content.clear();
170  }
171 
172  void popState () {
173  state = parentStates.top();
174  statedepth--;
175  parentStates.pop();
176 
177  }
178 
180 
181  ParserState state; //< current state as defined in \ref stateswitch
182  std::stack<ParserState> parentStates;
183 
184  int depth; //< current element depth of traversing the document elements
185 
192 
193  std::string content; //< content of the current element
194  int docontent; //< should the content of the current elem be parsed
195 
196  int gotfile;
197  off_t size;
198  std::vector<MetalinkMirror> urls;
199  size_t blksize;
200 
201  std::vector<UByteArray> piece;
202  int piecel;
203 
204  std::vector<UByteArray> sha1;
205  std::vector<UByteArray> zsync;
206 
208  int chksuml;
209 
210  std::optional<filesystem::Pathname> _filename; // if the filename is known, we can find it here
211  std::exception_ptr _lastError; // if a error was encountered during XML parsing we remember it here
212 };
213 
218 static const char *
219 find_attr(const char *txt, const xmlChar **atts)
220 {
221  if(!atts) {
222  return nullptr;
223  }
224 
225  for (; *atts; atts += 2)
226  {
227  if (!strcmp(reinterpret_cast<const char*>(*atts), txt))
228  return reinterpret_cast<const char*>(atts[1]);
229  }
230  return nullptr;
231 }
232 
233 static void XMLCALL
234 startElement(void *userData, const xmlChar *name, const xmlChar **atts)
235 {
236  struct ml_parsedata *pd = reinterpret_cast<struct ml_parsedata *>(userData);
237 
238  // if the current element depth does not match the expected depth for the current state we
239  // ignore the element and just increase the depth
240  if (pd->depth != pd->statedepth) {
241  pd->depth++;
242  return;
243  }
244  pd->depth++;
245 
246  const auto &trMap = transitions();
247  const auto currStateTrs = trMap.find( pd->state );
248  if ( currStateTrs == trMap.end() )
249  return;
250 
251  // check if the current element name is part of our transitions
252  auto foundTr = std::find_if( currStateTrs->second.begin(), currStateTrs->second.end(), [name]( const auto &tr ){
253  return tr.elementName == reinterpret_cast<const char *>(name);
254  });
255 
256  if ( foundTr == currStateTrs->second.end() ) {
257  // we found no possible transition, ignore
258  return;
259  }
260 
261  if ( ( foundTr->transitionTo == STATE_FILE || foundTr->transitionTo == STATE_M4FILE ) && pd->gotfile++)
262  return; /* ignore all but the first file */
263 
264  // advance the state machine and prepare variables for the new state
265  pd->doTransition( *foundTr );
266 
267  switch(pd->state)
268  {
269  case STATE_URL:
270  case STATE_M4URL:
271  {
272  const char *priority = find_attr("priority", atts);
273  const char *preference = find_attr("preference", atts);
274  const char *maxconnections = find_attr("maxconnections", atts);
275  int prio = 0;
276  auto &mirr = pd->urls.emplace_back();
277  if (priority)
278  prio = str::strtonum<int>(priority);
279  else if (preference)
280  prio = 101 - str::strtonum<int>(preference);
281  else
282  prio = 999999;
283  mirr.priority = prio;
284 
285  if ( maxconnections )
286  mirr.maxConnections = str::strtonum<int>( maxconnections );
287 
288  break;
289  }
290  case STATE_PIECES:
291  case STATE_M4PIECES:
292  {
293  const char *type = find_attr("type", atts);
294  const char *length = find_attr("length", atts);
295  size_t blksize = 0;
296 
297  if (!type || !length)
298  {
299  pd->popState();
300  break;
301  }
302  blksize = str::strtonum<unsigned long>(length);
303  if (!blksize || (pd->blksize && pd->blksize != blksize))
304  {
305  pd->popState();
306  break;
307  }
308  pd->blksize = blksize;
309  pd->piece.clear();
310  if (!strcmp(type, "sha1") || !strcmp(type, "sha-1"))
311  pd->piecel = 20;
312  else if (!strcmp(type, "zsync"))
313  pd->piecel = 4;
314  else
315  {
316  pd->popState();
317  break;
318  }
319  break;
320  }
321  case STATE_HASH:
322  case STATE_M4HASH:
323  {
324  const char *type = find_attr("type", atts);
325  if (!type)
326  type = "?";
327  if ((!strcmp(type, "sha1") || !strcmp(type, "sha-1")) && pd->chksuml < 20)
328  pd->chksuml = 20;
329  else if (!strcmp(type, "sha256") || !strcmp(type, "sha-256"))
330  pd->chksuml = 32;
331  else
332  {
333  pd->popState();
334  pd->docontent = 0;
335  }
336  break;
337  }
338  case STATE_PHASH:
339  case STATE_M4PHASH:
340  {
341  const char *piece = find_attr("piece", atts);
342  if ( pd->state == STATE_PHASH && (!piece || str::strtonum<uint>(piece) != pd->piece.size()) )
343  {
344  pd->popState();
345  }
346  break;
347  }
348  default:
349  break;
350  }
351 }
352 
353 UByteArray hexstr2bytes( const std::string& str )
354 {
355  return Digest::hexStringToUByteArray( str );
356 }
357 
358 static void XMLCALL
359 endElement(void *userData, const xmlChar *)
360 {
361  struct ml_parsedata *pd = reinterpret_cast<struct ml_parsedata *>(userData);
362  //printf("end depth %d-%d name %s\n", pd->depth, pd->statedepth, name);
363  if (pd->depth != pd->statedepth)
364  {
365  pd->depth--;
366  return;
367  }
368  switch (pd->state)
369  {
370  case STATE_SIZE:
371  case STATE_M4SIZE:
372  pd->size = (off_t)str::strtonum<off_t>(pd->content); //strtoull(pd->content, 0, 10);
373  break;
374  case STATE_HASH:
375  case STATE_M4HASH:
376  pd->chksum.clear();
377  pd->chksum = hexstr2bytes( pd->content );
378  if ( pd->content.length() != size_t(pd->chksuml) * 2 || !pd->chksum.size() )
379  {
380  pd->chksum.clear();
381  pd->chksuml = 0;
382  }
383  break;
384  case STATE_PHASH:
385  case STATE_M4PHASH: {
386  if ( pd->content.length() != size_t(pd->piecel) * 2 )
387  break;
388  UByteArray pieceHash = hexstr2bytes( pd->content );
389  if ( !pieceHash.size() )
390  pieceHash.resize( pd->piecel, 0 );
391  pd->piece.push_back( pieceHash );
392  break;
393  }
394  case STATE_PIECES:
395  case STATE_M4PIECES:
396  if (pd->piecel == 4)
397  pd->zsync = pd->piece;
398  else
399  pd->sha1 = pd->piece;
400 
401  pd->piecel = 0;
402  pd->piece.clear();
403  break;
404  case STATE_URL:
405  case STATE_M4URL:
406  if ( pd->content.length() )
407  pd->urls.back().url = std::string(pd->content);
408  else
409  // without a actual URL the mirror is useless
410  pd->urls.pop_back();
411  break;
412  default:
413  break;
414  }
415 
416  pd->depth--;
417  pd->popState();
418  pd->docontent = 0;
419 }
420 
421 static void XMLCALL
422 characterData(void *userData, const xmlChar *s, int len)
423 {
424  struct ml_parsedata *pd = reinterpret_cast<struct ml_parsedata *>(userData);
425  if (!pd->docontent)
426  return;
427 
428  if ( pd->content.length() + len + 1 > pd->content.capacity() )
429  pd->content.reserve( pd->content.capacity() + 256 );
430  pd->content.append( s, s+len );
431 }
432 
433 static void XMLCALL parseError(void *userData, const xmlError *error)
434 {
435  struct ml_parsedata *pd = reinterpret_cast<struct ml_parsedata *>(userData);
436  if (!pd)
437  return;
438 
439  ERR << "Parse error in " << (pd->_filename ? pd->_filename->asString() : std::string("unknown filename")) << " : " << error->message << std::endl;
440 
441  auto ex = parser::ParseException( str::Str() << "Parse error in " << (pd->_filename ? pd->_filename->asString() : std::string("unknown filename")) << " : " << error->message ) ;
442  if ( pd->_lastError )
443  ex.remember (pd->_lastError);
444  pd->_lastError = std::make_exception_ptr (ex);
445 }
446 
447 
448 MetaLinkParser::MetaLinkParser()
449  : pd( new ml_parsedata )
450 {}
451 
453 {
454  delete pd;
455 }
456 
457 void
459 {
460  pd->_filename = filename;
461  zypp_defer {
462  pd->_filename.reset();
463  };
464  parse(InputStream(filename));
465 }
466 
467 void
469 {
470  char buf[4096];
471  if (!is.stream())
472  ZYPP_THROW(parser::ParseException("MetaLinkParser: no such file"));
473 
474  pd->_lastError = {};
475  zypp_defer {
476  // clear the error when we leave this function
477  pd->_lastError = {};
478  };
479 
480  while (is.stream().good())
481  {
482  is.stream().read(buf, sizeof(buf));
483  parseBytes(buf, is.stream().gcount());
484  }
485  parseEnd();
486  MIL << "Parsed " << pd->urls.size() << " mirrors from " << is.path() << std::endl;
487  if ( env::ZYPP_METALINK_DEBUG() ) {
488  for ( const auto &mirr : pd->urls )
489  DBG << "- " << mirr.priority << " " << mirr.url << std::endl;
490  }
491 }
492 
493 void
494 MetaLinkParser::parseBytes(const char *buf, size_t len)
495 {
496  if (!len)
497  return;
498 
499  if (xmlParseChunk(pd->parser, buf, len, 0)) {
500  if ( pd->_lastError )
502  else
503  ZYPP_THROW(parser::ParseException("Parse Error"));
504  }
505 }
506 
507 void
509 {
510  if (xmlParseChunk(pd->parser, NULL, 0, 1)) {
511  if ( pd->_lastError )
513  else
514  ZYPP_THROW(parser::ParseException("Parse Error"));
515  }
516  if (pd->urls.size() ) {
517  stable_sort(pd->urls.begin(), pd->urls.end(), []( const auto &a, const auto &b ){
518  return a.priority < b.priority;
519  });
520  }
521 }
522 
523 std::vector<Url>
525 {
526  std::vector<Url> urls;
527  urls.reserve(pd->urls.size());
528  for ( const auto &mirr : pd->urls )
529  urls.push_back( mirr.url );
530  return urls;
531 }
532 
533 const std::vector<MetalinkMirror> &MetaLinkParser::getMirrors() const
534 {
535  return pd->urls;
536 }
537 
539 {
540  MediaBlockList bl(pd->size);
541  if (pd->chksuml == 20)
542  bl.setFileChecksum("SHA1", pd->chksuml, pd->chksum.data() );
543  else if (pd->chksuml == 32)
544  bl.setFileChecksum("SHA256", pd->chksuml, pd->chksum.data());
545  if (pd->size != off_t(-1) && pd->blksize)
546  {
547  size_t nb = (pd->size + pd->blksize - 1) / pd->blksize;
548  off_t off = 0;
549  size_t size = pd->blksize;
550  for ( size_t i = 0; i < nb; i++ )
551  {
552  if (i == nb - 1)
553  {
554  size = pd->size % pd->blksize;
555  if (!size)
556  size = pd->blksize;
557  }
558  size_t blkno = bl.addBlock(off, size);
559  if ( i < pd->sha1.size())
560  {
561  bl.setChecksum(blkno, "SHA1", 20, pd->sha1[i].data());
562  if ( i < pd->zsync.size())
563  {
564  unsigned char *p = pd->zsync[i].data();
565  bl.setRsum(blkno, 4, p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24, pd->blksize);
566  }
567  }
568  off += pd->blksize;
569  }
570  }
571  return bl;
572 }
573 
574 const std::vector<UByteArray> &MetaLinkParser::getZsyncBlockHashes() const
575 {
576  return pd->zsync;
577 }
578 
579 const std::vector<UByteArray> &MetaLinkParser::getSHA1BlockHashes() const
580 {
581  return pd->sha1;
582 }
583 
584 } // namespace zypp::media
std::optional< filesystem::Pathname > _filename
size_t addBlock(off_t off, size_t size)
add a block with offset off and size size to the block list.
MediaBlockList getBlockList() const
return the block list from the parsed metalink data
#define MIL
Definition: Logger.h:100
const std::vector< UByteArray > & getZsyncBlockHashes() const
Namespace intended to collect all environment variables we use.
Definition: Env.h:24
bool ZYPP_METALINK_DEBUG()
Hack to circumvent the currently poor --root support.
static void XMLCALL characterData(void *userData, const xmlChar *s, int len)
#define ZYPP_THROW(EXCPT)
Drops a logline and throws the Exception.
Definition: Exception.h:459
unsigned short b
std::vector< UByteArray > sha1
static void XMLCALL endElement(void *userData, const xmlChar *name)
void parseBytes(const char *bytes, size_t len)
parse a chunk of a file consisting of metalink xml data.
String related utilities and Regular expression matching.
Helper to create and pass std::istream.
Definition: inputstream.h:56
void parse(const Pathname &filename)
parse a file consisting of metalink xml data
#define zypp_defer
Definition: AutoDispose.h:293
static void XMLCALL parseError(void *userData, const xmlError *error)
#define ERR
Definition: Logger.h:102
const std::unordered_map< ParserState, std::vector< transition > > & transitions()
void remember(const Exception &old_r)
Store another Exception as history.
Definition: Exception.cc:154
void parseEnd()
tells the parser that all chunks are now processed
UByteArray hexstr2bytes(const std::string &str)
#define ZYPP_RETHROW(EXCPT)
Drops a logline and rethrows, updating the CodeLocation.
Definition: Exception.h:479
Convenient building of std::string via std::ostringstream Basically a std::ostringstream autoconverti...
Definition: String.h:212
struct ml_parsedata * pd
void setRsum(size_t blkno, int rsl, unsigned int rs, size_t rspad=0)
set / verify the (weak) rolling checksum over a single block
const std::vector< MetalinkMirror > & getMirrors() const
return the mirrors from the parsed metalink data
const Pathname & path() const
Path to the input file or empty if no file.
Definition: inputstream.h:111
std::exception_ptr _lastError
AutoDispose< xmlParserCtxtPtr > parser
static void XMLCALL startElement(void *userData, const xmlChar *name, const xmlChar **atts)
const std::vector< UByteArray > & getSHA1BlockHashes() const
void setChecksum(size_t blkno, const std::string &cstype, int csl, unsigned char *cs, size_t cspad=0)
set / verify the (strong) checksum over a single block
static const char * find_attr(const char *txt, const xmlChar **atts)
Look up a xml attribute in the passed array atts.
std::istream & stream() const
The std::istream.
Definition: inputstream.h:93
std::vector< UByteArray > zsync
std::stack< ParserState > parentStates
bool strToBool(const C_Str &str, bool default_r)
Parse str into a bool depending on the default value.
Definition: String.h:500
unsigned short a
std::vector< UByteArray > piece
std::vector< MetalinkMirror > urls
std::vector< Url > getUrls() const
return the download urls from the parsed metalink data
void setFileChecksum(std::string ctype, int cl, unsigned char *c)
set / verify the checksum over the whole file
void doTransition(const transition &t)
#define DBG
Definition: Logger.h:99
boost::noncopyable NonCopyable
Ensure derived classes cannot be copied.
Definition: NonCopyable.h:26