Class | REXML::Parsers::BaseParser |
In: |
lib/rexml/parsers/baseparser.rb
|
Parent: | Object |
This API is experimental, and subject to change.
  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
  while parser.has_next?
    res = parser.next
    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
  end
See the PullEvent class for information on the content of the results. The data is identical to the arguments passed for the various events to the StreamListener API.
Notice that:
  parser = PullParser.new( "<a>BAD DOCUMENT" )
  while parser.has_next?
    res = parser.next
    raise res[1] if res.error?
  end
Nat Price gave me some good ideas for the API.
NCNAME_STR | = | '[\w:][\-\w\d.]*' | ||
NAME_STR | = | "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" | ||
UNAME_STR | = | "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" | ||
NAMECHAR | = | '[\-\w\d\.:]' | ||
NAME | = | "([\\w:]#{NAMECHAR}*)" | ||
NMTOKEN | = | "(?:#{NAMECHAR})+" | ||
NMTOKENS | = | "#{NMTOKEN}(\\s+#{NMTOKEN})*" | ||
REFERENCE | = | "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" | ||
REFERENCE_RE | = | /#{REFERENCE}/ | ||
DOCTYPE_START | = | /\A\s*<!DOCTYPE\s/um | ||
DOCTYPE_PATTERN | = | /\s*<!DOCTYPE\s+(.*?)(\[|>)/um | ||
ATTRIBUTE_PATTERN | = | /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um | ||
COMMENT_START | = | /\A<!--/u | ||
COMMENT_PATTERN | = | /<!--(.*?)-->/um | ||
CDATA_START | = | /\A<!\[CDATA\[/u | ||
CDATA_END | = | /^\s*\]\s*>/um | ||
CDATA_PATTERN | = | /<!\[CDATA\[(.*?)\]\]>/um | ||
XMLDECL_START | = | /\A<\?xml\s/u; | ||
XMLDECL_PATTERN | = | /<\?xml\s+(.*?)\?>/um | ||
INSTRUCTION_START | = | /\A<\?/u | ||
INSTRUCTION_PATTERN | = | /<\?(.*?)(\s+.*?)?\?>/um | ||
TAG_MATCH | = | /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um | ||
CLOSE_MATCH | = | /^\s*<\/(#{NAME_STR})\s*>/um | ||
VERSION | = | /\bversion\s*=\s*["'](.*?)['"]/um | ||
ENCODING | = | /\bencoding\s*=\s*["'](.*?)['"]/um | ||
STANDALONE | = | /\bstandalone\s*=\s["'](.*?)['"]/um | ||
ENTITY_START | = | /^\s*<!ENTITY/ | ||
IDENTITY | = | /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u | ||
ELEMENTDECL_START | = | /^\s*<!ELEMENT/um | ||
ELEMENTDECL_PATTERN | = | /^\s*(<!ELEMENT.*?)>/um | ||
SYSTEMENTITY | = | /^\s*(%.*?;)\s*$/um | ||
ENUMERATION | = | "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" | ||
NOTATIONTYPE | = | "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" | ||
ENUMERATEDTYPE | = | "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" | ||
ATTTYPE | = | "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" | ||
ATTVALUE | = | "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" | ||
DEFAULTDECL | = | "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" | ||
ATTDEF | = | "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" | ||
ATTDEF_RE | = | /#{ATTDEF}/ | ||
ATTLISTDECL_START | = | /^\s*<!ATTLIST/um | ||
ATTLISTDECL_PATTERN | = | /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um | ||
NOTATIONDECL_START | = | /^\s*<!NOTATION/um | ||
PUBLIC | = | /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um | ||
SYSTEM | = | /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um | ||
TEXT_PATTERN | = | /\A([^<]*)/um | ||
PUBIDCHAR | = | "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" | Entity constants | |
SYSTEMLITERAL | = | %Q{((?:"[^"]*")|(?:'[^']*'))} | ||
PUBIDLITERAL | = | %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} | ||
EXTERNALID | = | "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" | ||
NDATADECL | = | "\\s+NDATA\\s+#{NAME}" | ||
PEREFERENCE | = | "%#{NAME};" | ||
ENTITYVALUE | = | %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} | ||
PEDEF | = | "(?:#{ENTITYVALUE}|#{EXTERNALID})" | ||
ENTITYDEF | = | "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" | ||
PEDECL | = | "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" | ||
GEDECL | = | "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" | ||
ENTITYDECL | = | /\s*(?:#{GEDECL})|(?:#{PEDECL})/um | ||
EREFERENCE | = | /&(?!#{NAME};)/ | ||
DEFAULT_ENTITIES | = | { 'gt' => [/&gt;/, '&gt;', '>', />/], 'lt' => [/&lt;/, '&lt;', '<', /</], 'quot' => [/&quot;/, '&quot;', '"', /"/], "apos" => [/&apos;/, "&apos;", "'", /'/] } | ||
MISSING_ATTRIBUTE_QUOTES | = | /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um | These are patterns to identify common markup errors, to make the error messages more informative. |
source | [R] |
# File lib/rexml/parsers/baseparser.rb, line 109
# Creates a parser for +source+. All set-up is delegated to #stream=,
# which also resets the parser state.
def initialize(source)
  self.stream = source
end
# File lib/rexml/parsers/baseparser.rb, line 113
# Registers +listener+ so that it is handed every event produced by #pull.
#
# The first time a listener is added (no truthy @listeners yet), the
# current #pull is aliased to +_old_pull+ on this instance and #pull is
# redefined to call +_old_pull+, pass the resulting event to each
# listener's +receive+ method, and then return the event.
def add_listener(listener)
  already_hooked = instance_variable_defined?(:@listeners) && @listeners
  unless already_hooked
    @listeners = []
    instance_eval "alias :_old_pull :pull\ndef pull\nevent = _old_pull\n@listeners.each do |listener|\nlistener.receive event\nend\nevent\nend\n"
  end
  @listeners << listener
end
Returns true if there are no more events
# File lib/rexml/parsers/baseparser.rb, line 153
# Returns true when there is nothing left to read from @source and no
# events are waiting in @stack; false otherwise.
def empty?
  @source.empty? && @stack.empty?
end
# File lib/rexml/parsers/baseparser.rb, line 424
# Resolves the entity named +reference+.
#
# +entities+ (may be nil) is consulted first; if it yields nothing, the
# unescaped form stored at index 2 of the matching DEFAULT_ENTITIES entry
# is used. The value found is passed through #unnormalize before being
# returned. Returns nil when the reference is unknown.
def entity(reference, entities)
  value = entities && entities[reference]
  unless value
    default = DEFAULT_ENTITIES[reference]
    value = default[2] if default
  end
  unnormalize(value, entities) if value
end
Escapes all possible entities
# File lib/rexml/parsers/baseparser.rb, line 435
# Escapes +input+ and returns the escaped copy; +input+ is not modified.
#
# input::         the text to escape
# entities::      optional Hash of entity name => replacement value; every
#                 occurrence of a value is replaced by <tt>&name;</tt>
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted
#
# Ampersands that do not start a reference (EREFERENCE) become
# <tt>&amp;amp;</tt>, then each DEFAULT_ENTITIES raw character (index 3)
# is replaced by its escaped form (index 1).
def normalize(input, entities = nil, entity_filter = nil)
  copy = input.clone
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!(EREFERENCE, '&amp;')
  if entities
    entities.each do |key, value|
      # The filter holds entity names, so it is checked against the key.
      next if entity_filter && entity_filter.include?(key)
      copy.gsub!(value, "&#{key};")
    end
  end
  # Entity values substituted above may themselves contain bare ampersands.
  copy.gsub!(EREFERENCE, '&amp;')
  DEFAULT_ENTITIES.each_value do |forms|
    copy.gsub!(forms[3], forms[1])
  end
  copy
end
Peek at the event at position +depth+ in the stack. The first element on the stack is at depth 0. If +depth+ is -1, the parser will parse to the end of the input stream and return the last event, which is always :end_document. Be aware that this causes the stream to be parsed up to the event at +depth+, so you can effectively pre-parse the entire document (pull the entire thing into memory) using this method.
# File lib/rexml/parsers/baseparser.rb, line 174
# Returns the event at index +depth+ of the pending-event stack without
# removing it, pulling further events from the source as needed and
# storing them in @stack.
#
# depth:: 0-based index of the event to look at (default 0). -1 pulls
#         events until #empty? is true and returns the last one.
#
# Raises a RuntimeError for any +depth+ below -1.
def peek(depth = 0)
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled.push(pull) until empty?
  else
    wanted = depth + 1
    # #pull may itself shift events off @stack, so the stack size is
    # re-read on every iteration.
    pulled.push(pull) while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
# File lib/rexml/parsers/baseparser.rb, line 143
# Returns @source.position when the source responds to +position+;
# otherwise returns 0.
def position
  # FIXME
  return 0 unless @source.respond_to?(:position)

  @source.position
end
Returns the next event. This is a PullEvent object.
# File lib/rexml/parsers/baseparser.rb, line 189
# Returns the next parse event as an Array whose first element is a Symbol
# naming the event and whose remaining elements carry the event data.
# Events produced here: :end_element, :end_document, :comment, :xmldecl,
# :processing_instruction, :start_doctype, :end_doctype, :externalentity,
# :elementdecl, :entitydecl, :attlistdecl, :notationdecl, :cdata,
# :start_element and :text.
#
# Events already queued in @stack are returned before anything new is
# read from @source.
#
# Raises REXML::ParseException for malformed input and
# UndefinedNamespaceException for an undeclared prefix; any other
# exception raised while parsing element content is wrapped in a
# REXML::ParseException.
def pull
  # @closed holds the name of a self-closing tag whose :start_element was
  # returned by the previous call (it is set below when TAG_MATCH captures
  # the trailing "/"); emit the matching :end_element now.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  #STDERR.puts @source.encoding
  @source.read if @source.buffer.size<2
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
  # @document_status is nil until the prolog has been dealt with; after
  # that it is :in_doctype or :after_doctype.
  if @document_status == nil
    #@source.consume( /^\s*/um )
    # Look at (without consuming) the leading whitespace or the first tag.
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    #STDERR.puts "WORD = #{word.inspect}"
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      #STDERR.puts "XMLDECL"
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      # Hand the declared encoding (nil if none was declared) to the source.
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      @nsstack.unshift(curr_ns=Set.new)
      identity = md[1]
      # md[2] is "[" when an internal subset follows, ">" when the
      # declaration ends immediately (see DOCTYPE_PATTERN).
      close = md[2]
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $4.nil? ? nil : $4.strip
      uri = $6.nil? ? nil : $6.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        md = @source.match(/^\s*/um, true)
        # Queue the matching :end_doctype so the next call returns it.
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace only: @document_status is left unchanged and
      # control falls through to the content handling below.
    else
      # Anything else (including no match at all) ends the prolog.
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      md = @source.match(/\s*/um, true)
    end
  end
  if @document_status == :in_doctype
    # Peek at the next declaration (up to its first ">") to pick a branch.
    # NOTE(review): md is not checked for nil before md[1] — confirm that
    # @source.match cannot return nil here.
    md = @source.match(/\s*(.*?>)/um)
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      match = @source.match( ENTITYDECL, true ).to_a.compact
      match[0] = :entitydecl
      ref = false
      # A "%" captured right after <!ENTITY is removed here and re-appended
      # at the end of the event below.
      if match[1] == '%'
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entity, name, PUBLIC, pubid, href ]
      else
        # Internal entity: strip the surrounding quotes from the value.
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entity, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      # Collect attribute name => default value for every attribute
      # definition that is not #IMPLIED.
      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          # An xmlns:prefix attribute declares that prefix in the current
          # namespace scope.
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      md = nil
      # Test each pattern without consuming, then consume with the one
      # that matched.
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # Despite its name, CDATA_END matches "]" followed by ">", which
      # here closes the doctype's internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        # Closing tag: it must match the most recently opened tag.
        @nsstack.shift
        last_tag = @tags.pop
        #md = @source.match_to_consume( '>', CLOSE_MATCH)
        md = @source.match( CLOSE_MATCH, true )
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
        raise REXML::ParseException.new("Malformed node", @source) unless md
        # "<!-" starts a comment; any other "<!" is tried as CDATA.
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attributes = {}
        # Prefixes used by this element and its attributes; each must be
        # declared somewhere in @nsstack.
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        if md[4].size > 0
          attrs = md[4].scan( ATTRIBUTE_PATTERN )
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          # Per ATTRIBUTE_PATTERN: a = full name, b = prefix, c = local
          # name, d = quote character, e = value.
          # NOTE(review): d is the quote character, yet it is compared with
          # the XML namespace URI below — confirm whether e was intended.
          attrs.each { |a,b,c,d,e|
            if b == "xmlns"
              if c == "xml"
                if d != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif c == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << c
            elsif b
              prefixes << b unless b == "xml"
            end
            attributes[a] = e
          }
        end

        # Verify that all of the prefixes have been defined
        for prefix in prefixes
          unless @nsstack.find{|k| k.member?(prefix)}
            raise UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if md[6]
          # Self-closing tag: remember its name so the next call returns
          # the matching :end_element, and drop its namespace scope.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      # Not at a "<": consume text up to the next "<".
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
      #return [ :text, "" ] if md[0].length == 0
      # unnormalized = Text::unnormalize( md[1], self )
      # return PullEvent.new( :text, md[1], unnormalized )
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue Exception, NameError => error
    # Wrap every other failure so callers only see ParseException.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  # NOTE(review): every branch of the begin block above appears to return
  # or raise, so this line looks unreachable — confirm.
  return [ :dummy ]
end
# File lib/rexml/parsers/baseparser.rb, line 133
# Points the parser at a new +source+ and resets all parsing state.
#
# +source+ is wrapped via SourceFactory.create_from; the pending
# self-closing tag and document status are cleared, and the tag, event,
# entity and namespace stacks are replaced with fresh empty arrays.
def stream=(source)
  @source = SourceFactory.create_from(source)
  @closed = @document_status = nil
  @tags, @stack, @entities = [], [], []
  @nsstack = []
end
Unescapes all possible entities
# File lib/rexml/parsers/baseparser.rb, line 451
# Unescapes +string+ and returns the result; +string+ is not modified.
#
# string::   the text to unescape
# entities:: optional Hash of entity name => value, passed on to #entity
# filter::   optional collection of entity names that must be left as-is
#
# Steps, in order: line endings ("\r\n" or "\r") become "\n"; decimal and
# hexadecimal character references become the referenced character; named
# references are replaced by the value #entity returns for them; the
# DEFAULT_ENTITIES escaped forms (index 0) are replaced by their raw
# character (index 2); finally <tt>&amp;amp;</tt> becomes "&".
def unnormalize(string, entities = nil, filter = nil)
  rv = string.clone
  rv.gsub!(/\r\n?/, "\n")
  matches = rv.scan(REFERENCE_RE)
  return rv if matches.size == 0

  # Numeric character references; leading zeros are skipped by the pattern
  # so Integer() never sees an octal-looking literal.
  rv.gsub!(/&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/) do
    code = $1
    code = "0#{code}" if code[0] == ?x # "x41" -> "0x41" for Integer()
    [Integer(code)].pack('U*')
  end

  # REFERENCE_RE captures a name only for named references; numeric ones
  # yield nil and are dropped here.
  names = matches.collect { |captures| captures[0] }.compact
  unless names.empty?
    names.each do |entity_reference|
      next if filter && filter.include?(entity_reference)
      entity_value = entity(entity_reference, entities)
      next unless entity_value
      # Escape the name (it may contain "." or "-") and use the block form
      # so backslashes in the value are inserted literally.
      rv.gsub!(/&#{Regexp.escape(entity_reference)};/) { entity_value }
    end
    names.each do |entity_reference|
      next if filter && filter.include?(entity_reference)
      er = DEFAULT_ENTITIES[entity_reference]
      rv.gsub!(er[0], er[2]) if er
    end
    # Done last so an unescaped "&" cannot form a new reference above.
    rv.gsub!(/&amp;/, '&')
  end
  rv
end