class REXML::Parsers::BaseParser
使用 Pull 解析器
此 API 是实验性的,可能会发生更改。
parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
while parser.has_next?
  res = parser.next
  puts res[1]['att'] if res.start_tag? and res[0] == 'b'
end
有关结果内容的信息,请参阅 PullEvent 类。数据与传递给 StreamListener API 的各种事件的参数相同。
请注意:
parser = PullParser.new( "<a>BAD DOCUMENT" )
while parser.has_next?
  res = parser.next
  raise res[1] if res.error?
end
Nat Price 为我提供了关于 API 的一些好主意。
常量
- ATTDEF
- ATTDEF_RE
- ATTLISTDECL_PATTERN
- ATTLISTDECL_START
- ATTRIBUTE_PATTERN
- ATTTYPE
- ATTVALUE
- CDATA_END
- CDATA_PATTERN
- CDATA_START
- CLOSE_MATCH
- COMBININGCHAR
- COMMENT_PATTERN
- COMMENT_START
- DEFAULTDECL
- DEFAULT_ENTITIES
- DIGIT
- DOCTYPE_END
- DOCTYPE_START
- ELEMENTDECL_PATTERN
- ELEMENTDECL_START
- ENCODING
- ENTITYDECL
- ENTITYDEF
- ENTITYVALUE
- ENTITY_START
- ENUMERATEDTYPE
- ENUMERATION
- EREFERENCE
- EXTENDER
- EXTERNALID
- EXTERNAL_ID_PUBLIC
- EXTERNAL_ID_SYSTEM
- GEDECL
- INSTRUCTION_PATTERN
- INSTRUCTION_START
- LETTER
- NAME
- NAMECHAR
- NCNAME_STR
- NDATADECL
- NMTOKEN
- NMTOKENS
- NOTATIONDECL_START
- NOTATIONTYPE
- PEDECL
- PEDEF
- PEREFERENCE
- PUBIDCHAR
  Entity 常量
- PUBIDLITERAL
- PUBLIC_ID
- QNAME
- QNAME_STR
- REFERENCE
- REFERENCE_RE
- STANDALONE
- SYSTEMENTITY
- SYSTEMLITERAL
- TAG_MATCH
- TEXT_PATTERN
- UNAME_STR
  仅用于向后兼容。例如,kramdown 使用它。它在 REXML 中未使用。
- VERSION
- XMLDECL_PATTERN
- XMLDECL_START
属性
entity_expansion_count[R]
entity_expansion_limit[W]
entity_expansion_text_limit[W]
source[R]
公共类方法
new( source ) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 163
# Builds a parser for +source+.
#
# Assigning through +stream=+ wraps the source and resets the parse state;
# afterwards the listener list, the prefix set and the entity-expansion
# counters/limits (read from Security) are initialised and the source is
# asked to make sure its buffer is filled.
def initialize(source)
  self.stream = source
  @listeners = []
  @prefixes = Set.new
  @entity_expansion_count = 0
  @entity_expansion_limit, @entity_expansion_text_limit =
    Security.entity_expansion_limit, Security.entity_expansion_text_limit
  @source.ensure_buffer
end
公共实例方法
add_listener( listener ) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 173
# Registers +listener+; every event returned by +pull+ is also handed to
# each registered listener through its +receive+ method.
# Returns the listener list.
def add_listener(listener)
  @listeners.push(listener)
end
empty?() 点击以切换源代码
如果没有更多事件,则返回 true
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 208
# Returns true if there are no more events: both the source and the stack
# of pending events must be empty.
def empty?
  @source.empty? && @stack.empty?
end
entity( reference, entities ) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 540
# Looks up +reference+ in +entities+ and returns its unnormalized value.
#
# Returns nil when +entities+ is not given or has no value for
# +reference+. A successful lookup is counted as one entity expansion
# before the value is passed to +unnormalize+.
def entity(reference, entities)
  return nil if !entities

  replacement = entities[reference]
  if replacement.nil?
    nil
  else
    record_entity_expansion
    unnormalize(replacement, entities)
  end
end
has_next?() 点击以切换源代码
如果还有更多事件,则返回 true。与 !empty? 同义
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 213
# Returns true if there are more events, i.e. when the source or the stack
# of pending events still holds something. Synonymous with !empty?
def has_next?
  !@source.empty? || !@stack.empty?
end
normalize( input, entities=nil, entity_filter=nil ) 点击以切换源代码
转义所有可能的实体
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 551
# Escapes all possible entities in +input+ and returns the escaped copy;
# +input+ itself is not modified.
#
# input::         the text to escape
# entities::      optional mapping of entity name => replacement text; each
#                 occurrence of a replacement text is turned back into
#                 "&name;"
# entity_filter:: optional collection of entity names that must NOT be
#                 substituted from +entities+
def normalize( input, entities=nil, entity_filter=nil )
  # dup (unlike clone) drops the frozen flag, so frozen input can be escaped.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  # Escape every "&" that does not already start an entity reference.
  copy.gsub!( EREFERENCE, '&amp;' )
  entities.each do |key, value|
    next if entity_filter && entity_filter.include?(key)
    copy.gsub!( value, "&#{key};" )
  end if entities
  # Second pass over bare ampersands, after the substitutions above.
  copy.gsub!( EREFERENCE, '&amp;' )
  # value[3] is the pattern for the raw character, value[1] its escaped form.
  DEFAULT_ENTITIES.each do |_key, value|
    copy.gsub!( value[3], value[1] )
  end
  copy
end
peek(depth=0) 点击以切换源代码
查看堆栈中第 depth 个事件。堆栈中的第一个元素深度为 0。如果 depth 为 -1,则会解析到输入流的末尾并返回最后一个事件,该事件始终为 :end_document。请注意,这会导致解析流直到第 depth 个事件,因此您可以使用此方法有效地预解析整个文档(将整个内容拉入内存)。
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 229
# Returns the event at position +depth+ of the pending-event stack without
# removing it; depth 0 is the first element.
#
# Events are pulled until the stack is deep enough. With a depth of -1
# events are pulled until +empty?+ is true and the last stacked event is
# returned. Raises a RuntimeError for any depth below -1.
def peek(depth = 0)
  raise %Q[Illegal argument "#{depth}"] if depth < -1

  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    needed = depth + 1
    pulled << pull while @stack.size + pulled.size < needed
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
position() 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 198
# Returns the source's position when the source exposes one, otherwise 0.
def position
  # FIXME
  return 0 unless @source.respond_to?(:position)

  @source.position
end
pull() 点击以切换源代码
返回下一个事件。这是一个 PullEvent 对象。
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 244
# Returns the next event.
#
# The source is first told to drop its already-parsed content, then the
# event produced by +pull_event+ is passed to every registered listener's
# +receive+ before being returned.
def pull
  @source.drop_parsed_content
  event = pull_event
  @listeners.each { |listener| listener.receive(event) }
  event
end
reset() 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 187
# Puts the parser back into its initial state: no pending close tag, no
# root seen, no document status, empty tag/event/entity lists, and a
# namespace table that only binds the "xml" prefix.
def reset
  # Scalar parse state.
  @closed = @document_status = nil
  @have_root = false

  # Collections filled while parsing.
  @tags = []
  @stack = []
  @entities = []
  @namespaces = { "xml" => Private::XML_PREFIXED_NAMESPACE }
  @namespaces_restore_stack = []
end
stream=( source ) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 182
# Replaces the input: +source+ is wrapped by SourceFactory.create_from and
# the parse state is reset afterwards.
def stream=(source)
  @source = SourceFactory.create_from(source)
  reset
end
unnormalize( string, entities=nil, filter=nil ) 点击以切换源代码
取消转义所有可能的实体
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 567 def unnormalize( string, entities=nil, filter=nil ) if string.include?("\r") rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) else rv = string.dup end matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 rv.gsub!( Private::CHARACTER_REFERENCES ) { m=$1 if m.start_with?("x") code_point = Integer(m[1..-1], 16) else code_point = Integer(m, 10) end [code_point].pack('U*') } matches.collect!{|x|x[0]}.compact! if filter matches.reject! do |entity_reference| filter.include?(entity_reference) end end if matches.size > 0 matches.tally.each do |entity_reference, n| entity_expansion_count_before = @entity_expansion_count entity_value = entity( entity_reference, entities ) if entity_value if n > 1 entity_expansion_count_delta = @entity_expansion_count - entity_expansion_count_before record_entity_expansion(entity_expansion_count_delta * (n - 1)) end re = Private::DEFAULT_ENTITIES_PATTERNS[entity_reference] || /&#{entity_reference};/ rv.gsub!( re, entity_value ) if rv.bytesize > @entity_expansion_text_limit raise "entity expansion has grown too large" end else er = DEFAULT_ENTITIES[entity_reference] rv.gsub!( er[0], er[2] ) if er end end rv.gsub!( Private::DEFAULT_ENTITIES_PATTERNS['amp'], '&' ) end rv end
unshift(token) 点击以切换源代码
将事件推回流的头部。此方法(理论上)具有无限深度。
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 219
# Pushes an event back on the head of the stream. This method has
# (theoretically) infinite depth. Returns the pending-event stack.
def unshift(token)
  @stack.insert(0, token)
end
私有实例方法
add_namespace(prefix, uri) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 616
# Binds +prefix+ to +uri+ in the current namespace table, or removes the
# binding when +uri+ is nil.
#
# The previous binding (nil if there was none) is first remembered in the
# top frame of @namespaces_restore_stack.
def add_namespace(prefix, uri)
  restore_frame = @namespaces_restore_stack.last
  restore_frame[prefix] = @namespaces[prefix]
  return @namespaces.delete(prefix) if uri.nil?

  @namespaces[prefix] = uri
end
need_source_encoding_update?(xml_declaration_encoding) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 649
# Tells whether the encoding named in the XML declaration has to be applied
# to the source: false when no encoding was declared or when it is exactly
# "UTF-16" (case-insensitive), true for anything else.
def need_source_encoding_update?(xml_declaration_encoding)
  if xml_declaration_encoding.nil? || /\AUTF-16\z/i =~ xml_declaration_encoding
    false
  else
    true
  end
end
parse_attributes(prefixes) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 769
# Parses the attributes of a start tag up to and including its closing
# ">" or "/>".
#
# prefixes:: collection that receives every attribute prefix other than
#            "xmlns" and "xml"
#
# Returns +attributes+ (qualified name => raw value) and +closed+, which is
# true when the tag ended with "/>".
#
# Raises REXML::ParseException for a missing "=", missing quotes, an
# illegal xml/xmlns namespace declaration, a duplicate attribute, two
# attributes that expand to the same [namespace URI, local name] pair, or
# an invalid attribute name.
def parse_attributes(prefixes)
  attributes = {}
  # [namespace URI, local part] => prefix of the attribute that used it.
  expanded_names = {}
  closed = false
  while true
    if @source.match?(">", true)
      return attributes, closed
    elsif @source.match?("/>", true)
      closed = true
      return attributes, closed
    elsif match = @source.match(QNAME, true)
      name = match[1]
      prefix = match[2]
      local_part = match[3]

      unless @source.match?(/\s*=\s*/um, true)
        message = "Missing attribute equal: <#{name}>"
        raise REXML::ParseException.new(message, @source)
      end
      unless match = @source.match(/(['"])/, true)
        message = "Missing attribute value start quote: <#{name}>"
        raise REXML::ParseException.new(message, @source)
      end
      quote = match[1]
      start_position = @source.position
      value = @source.read_until(quote)
      # chomp! returns nil when the value does not end with the quote.
      unless value.chomp!(quote)
        # Rewind to where the value started before reporting the error.
        @source.position = start_position
        message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
        raise REXML::ParseException.new(message, @source)
      end
      @source.match?(/\s*/um, true)
      if prefix == "xmlns"
        # Namespace declaration: reject illegal bindings of the reserved
        # "xml" and "xmlns" prefixes, then record the binding.
        if local_part == "xml"
          if value != Private::XML_PREFIXED_NAMESPACE
            msg = "The 'xml' prefix must not be bound to any other namespace "+
              "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
            raise REXML::ParseException.new( msg, @source, self )
          end
        elsif local_part == "xmlns"
          msg = "The 'xmlns' prefix must not be declared "+
            "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
          raise REXML::ParseException.new( msg, @source, self)
        end
        add_namespace(local_part, value)
      elsif prefix
        prefixes << prefix unless prefix == "xml"
      end

      if attributes[name]
        msg = "Duplicate attribute #{name.inspect}"
        raise REXML::ParseException.new(msg, @source, self)
      end

      unless prefix == "xmlns"
        # Detect two attributes that differ only in prefix but resolve to
        # the same namespace URI and local name.
        uri = @namespaces[prefix]
        expanded_name = [uri, local_part]
        existing_prefix = expanded_names[expanded_name]
        if existing_prefix
          message = "Namespace conflict in adding attribute " +
                    "\"#{local_part}\": " +
                    "Prefix \"#{existing_prefix}\" = \"#{uri}\" and " +
                    "prefix \"#{prefix}\" = \"#{uri}\""
          raise REXML::ParseException.new(message,
                                          @source, self)
        end
        expanded_names[expanded_name] = prefix
      end

      attributes[name] = value
    else
      message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
      raise REXML::ParseException.new(message, @source)
    end
  end
end
parse_id(base_error_message, accept_external_id:, accept_public_id:) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 668
# Parses an external or public identifier from the source.
#
# base_error_message:: prefix for the error message
# accept_external_id:: whether EXTERNAL_ID_PUBLIC / EXTERNAL_ID_SYSTEM forms
#                      are tried
# accept_public_id::   whether the PUBLIC_ID form is tried
#
# Returns a three element array: ["PUBLIC", pubid, system],
# ["PUBLIC", pubid, nil] or ["SYSTEM", nil, system]; literals are returned
# without their surrounding quotes.
#
# Raises REXML::ParseException, with details from
# +parse_id_invalid_details+, when none of the accepted forms matches.
def parse_id(base_error_message, accept_external_id:, accept_public_id:)
  # Remove quote
  unquote = ->(literal) { literal[1..-2] if literal }

  if accept_external_id && (md = @source.match(EXTERNAL_ID_PUBLIC, true))
    return ["PUBLIC", unquote.call(md[1]), unquote.call(md[2])]
  end
  if accept_public_id && (md = @source.match(PUBLIC_ID, true))
    return ["PUBLIC", unquote.call(md[1]), nil]
  end
  if accept_external_id && (md = @source.match(EXTERNAL_ID_SYSTEM, true))
    return ["SYSTEM", nil, unquote.call(md[1])]
  end

  details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                     accept_public_id: accept_public_id)
  raise REXML::ParseException.new("#{base_error_message}: #{details}", @source)
end
parse_id_invalid_details(accept_external_id:, accept_public_id:) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 696 def parse_id_invalid_details(accept_external_id:, accept_public_id:) public = /\A\s*PUBLIC/um system = /\A\s*SYSTEM/um if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um) if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um) return "public ID literal is missing" end unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um) return "invalid public ID literal" end if accept_public_id if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) return "system ID literal is missing" end unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) return "invalid system literal" end "garbage after system literal" else "garbage after public ID literal" end elsif accept_external_id and @source.match?(/#{system}/um) if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um) return "system literal is missing" end unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um) return "invalid system literal" end "garbage after system literal" else unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um) return "invalid ID type" end "ID type is missing" end end
parse_name(base_error_message) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 655
# Consumes a name (Private::NAME_PATTERN) from the source and returns the
# matched text.
#
# Raises REXML::ParseException prefixed with +base_error_message+:
# "invalid name" when some non-space content is present, "name is missing"
# otherwise.
def parse_name(base_error_message)
  name_match = @source.match(Private::NAME_PATTERN, true)
  return name_match[0] if name_match

  problem = @source.match?(/\S/um) ? "invalid name" : "name is missing"
  raise REXML::ParseException.new("#{base_error_message}: #{problem}", @source)
end
pop_namespaces_restore() 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 631
# Pops the top frame of @namespaces_restore_stack and puts every binding
# recorded in it back into @namespaces; a recorded nil means the prefix was
# unbound before, so it is removed again. Returns the popped frame.
def pop_namespaces_restore
  @namespaces_restore_stack.pop.each do |prefix, previous_uri|
    next @namespaces.delete(prefix) if previous_uri.nil?

    @namespaces[prefix] = previous_uri
  end
end
process_instruction() 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 734
# Parses the remainder of a processing instruction; the callers in this
# file invoke it right after matching "<?" (or "?" following "<").
#
# Returns [:xmldecl, version, encoding, standalone] for the "xml" target
# and [:processing_instruction, name, content] for any other target.
#
# Raises ParseException when the instruction is not closed by "?>" or when
# an XML declaration appears after @document_status has been set.
def process_instruction
  name = parse_name("Malformed XML: Invalid processing instruction node")
  if @source.match?(/\s+/um, true)
    # Whitespace after the target: everything up to "?>" is the content.
    match_data = @source.match(/(.*?)\?>/um, true)
    unless match_data
      raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
    end
    content = match_data[1]
  else
    # No whitespace after the target: "?>" must follow immediately.
    content = nil
    unless @source.match?("?>", true)
      raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
    end
  end
  if name == "xml"
    if @document_status
      raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
    end
    version = VERSION.match(content)
    version = version[1] unless version.nil?
    encoding = ENCODING.match(content)
    encoding = encoding[1] unless encoding.nil?
    if need_source_encoding_update?(encoding)
      @source.encoding = encoding
    end
    # No declared encoding but the source reports UTF-16BE/LE: report
    # "UTF-16" in the event.
    if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
      encoding = "UTF-16"
    end
    standalone = STANDALONE.match(content)
    standalone = standalone[1] unless standalone.nil?
    return [ :xmldecl, version, encoding, standalone ]
  end
  [:processing_instruction, name, content]
end
pull_event() 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 254
# Produces the next parse event as an array whose first element is the
# event type (:start_element, :end_element, :text, :comment, :cdata,
# :start_doctype, :end_doctype, :elementdecl, :entitydecl, :attlistdecl,
# :notationdecl, :externalentity, :end_document, or whatever
# +process_instruction+ returns).
#
# Order of work, as visible below:
# 1. a pending end tag of a self-closed element (@closed),
# 2. end of input (with checks for an unclosed DOCTYPE / open tags),
# 3. events previously pushed onto @stack,
# 4. prolog handling while @document_status is nil,
# 5. internal-subset declarations while @document_status is :in_doctype,
# 6. element content: end tags, comments, CDATA, processing instructions,
#    start tags and text.
#
# Raises ParseException / REXML::ParseException on malformed input and
# UndefinedNamespaceException for an undeclared prefix; any other
# StandardError raised in step 6 is wrapped in a REXML::ParseException.
def pull_event
  # A self-closing tag was returned last time: emit its end tag now.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  if empty?
    if @document_status == :in_doctype
      raise ParseException.new("Malformed DOCTYPE: unclosed", @source)
    end
    unless @tags.empty?
      path = "/" + @tags.join("/")
      raise ParseException.new("Missing end tag for '#{path}'", @source)
    end
    return [ :end_document ]
  end
  # Events pushed back or queued earlier are served first.
  return @stack.shift if @stack.size > 0
  #STDERR.puts @source.encoding
  #STDERR.puts "BUFFER = #{@source.buffer.inspect}"

  @source.ensure_buffer
  # No document status yet: look for a processing instruction, a comment
  # or a DOCTYPE before regular content.
  if @document_status == nil
    start_position = @source.position
    if @source.match?("<?", true)
      return process_instruction
    elsif @source.match?("<!", true)
      if @source.match?("--", true)
        md = @source.match(/(.*?)-->/um, true)
        if md.nil?
          raise REXML::ParseException.new("Unclosed comment", @source)
        end
        # Comment text containing "--" or ending in "-" is rejected.
        if /--|-\z/.match?(md[1])
          raise REXML::ParseException.new("Malformed comment", @source)
        end
        return [ :comment, md[1] ]
      elsif @source.match?("DOCTYPE", true)
        base_error_message = "Malformed DOCTYPE"
        unless @source.match?(/\s+/um, true)
          if @source.match?(">")
            message = "#{base_error_message}: name is missing"
          else
            message = "#{base_error_message}: invalid name"
          end
          @source.position = start_position
          raise REXML::ParseException.new(message, @source)
        end
        name = parse_name(base_error_message)
        if @source.match?(/\s*\[/um, true)
          # "[" opens an internal subset.
          id = [nil, nil, nil]
          @document_status = :in_doctype
        elsif @source.match?(/\s*>/um, true)
          id = [nil, nil, nil]
          @document_status = :after_doctype
          @source.ensure_buffer
        else
          id = parse_id(base_error_message,
                        accept_external_id: true,
                        accept_public_id: false)
          if id[0] == "SYSTEM"
            # For backward compatibility
            id[1], id[2] = id[2], nil
          end
          if @source.match?(/\s*\[/um, true)
            @document_status = :in_doctype
          elsif @source.match?(/\s*>/um, true)
            @document_status = :after_doctype
            @source.ensure_buffer
          else
            message = "#{base_error_message}: garbage after external ID"
            raise REXML::ParseException.new(message, @source)
          end
        end
        args = [:start_doctype, name, *id]
        # DOCTYPE without internal subset: queue the matching :end_doctype.
        if @document_status == :after_doctype
          @source.match?(/\s*/um, true)
          @stack << [ :end_doctype ]
        end
        return args
      else
        message = "Invalid XML"
        raise REXML::ParseException.new(message, @source)
      end
    end
  end
  # Inside the DOCTYPE internal subset: parse one declaration per call.
  if @document_status == :in_doctype
    @source.match?(/\s*/um, true) # skip spaces
    start_position = @source.position
    if @source.match?("<!", true)
      if @source.match?("ELEMENT", true)
        md = @source.match(/(.*?)>/um, true)
        raise REXML::ParseException.new( "Bad ELEMENT declaration!", @source ) if md.nil?
        return [ :elementdecl, "<!ELEMENT" + md[1] ]
      elsif @source.match?("ENTITY", true)
        match_data = @source.match(Private::ENTITYDECL_PATTERN, true)
        unless match_data
          raise REXML::ParseException.new("Malformed entity declaration", @source)
        end
        match = [:entitydecl, *match_data.captures.compact]
        ref = false
        # A leading '%' is removed here and re-appended at the end below.
        if match[1] == '%'
          ref = true
          match.delete_at 1
        end
        # Now we have to sort out what kind of entity reference this is
        if match[2] == 'SYSTEM'
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match.delete_at(4) if match.size > 4 # Chop out NDATA decl
          # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
        elsif match[2] == 'PUBLIC'
          # External reference
          match[3] = match[3][1..-2] # PUBID
          match[4] = match[4][1..-2] # HREF
          match.delete_at(5) if match.size > 5 # Chop out NDATA decl
          # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ]
        elsif Private::PEREFERENCE_PATTERN.match?(match[2])
          raise REXML::ParseException.new("Parameter entity references forbidden in internal subset: #{match[2]}", @source)
        else
          match[2] = match[2][1..-2]
          match.pop if match.size == 4
          # match is [ :entity, name, value ]
        end
        match << '%' if ref
        return match
      elsif @source.match?("ATTLIST", true)
        md = @source.match(Private::ATTLISTDECL_END, true)
        raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
        element = md[1]
        contents = md[0]

        # Collect attribute name => value pairs, skipping #IMPLIED ones.
        pairs = {}
        values = md[0].strip.scan( ATTDEF_RE )
        values.each do |attdef|
          unless attdef[3] == "#IMPLIED"
            attdef.compact!
            val = attdef[3]
            val = attdef[4] if val == "#FIXED "
            pairs[attdef[0]] = val
            # An xmlns:prefix attribute also binds the prefix in @namespaces.
            if attdef[0] =~ /^xmlns:(.*)/
              @namespaces[$1] = val
            end
          end
        end
        return [ :attlistdecl, element, pairs, contents ]
      elsif @source.match?("NOTATION", true)
        base_error_message = "Malformed notation declaration"
        unless @source.match?(/\s+/um, true)
          if @source.match?(">")
            message = "#{base_error_message}: name is missing"
          else
            message = "#{base_error_message}: invalid name"
          end
          @source.position = start_position
          raise REXML::ParseException.new(message, @source)
        end
        name = parse_name(base_error_message)
        id = parse_id(base_error_message,
                      accept_external_id: true,
                      accept_public_id: true)
        unless @source.match?(/\s*>/um, true)
          message = "#{base_error_message}: garbage before end >"
          raise REXML::ParseException.new(message, @source)
        end
        return [:notationdecl, name, *id]
      elsif md = @source.match(/--(.*?)-->/um, true)
        case md[1]
        when /--/, /-\z/
          raise REXML::ParseException.new("Malformed comment", @source)
        end
        return [ :comment, md[1] ] if md
      end
    elsif match = @source.match(/(%.*?;)\s*/um, true)
      return [ :externalentity, match[1] ]
    elsif @source.match?(/\]\s*>/um, true)
      @document_status = :after_doctype
      return [ :end_doctype ]
    end
    # Nothing above returned and the subset is still open.
    if @document_status == :in_doctype
      raise ParseException.new("Malformed DOCTYPE: invalid declaration", @source)
    end
  end
  if @document_status == :after_doctype
    @source.match?(/\s*/um, true)
  end
  begin
    start_position = @source.position
    if @source.match?("<", true)
      # :text's read_until may remain only "<" in buffer. In the
      # case, buffer is empty here. So we need to fill buffer
      # here explicitly.
      @source.ensure_buffer
      if @source.match?("/", true)
        # End tag: it must match the most recently opened tag.
        @namespaces_restore_stack.pop
        last_tag = @tags.pop
        md = @source.match(Private::CLOSE_PATTERN, true)
        if md and !last_tag
          message = "Unexpected top-level end tag (got '#{md[1]}')"
          raise REXML::ParseException.new(message, @source)
        end
        if md.nil? or last_tag != md[1]
          message = "Missing end tag for '#{last_tag}'"
          message += " (got '#{md[1]}')" if md
          @source.position = start_position if md.nil?
          raise REXML::ParseException.new(message, @source)
        end
        return [ :end_element, last_tag ]
      elsif @source.match?("!", true)
        # Look ahead up to the next ">" to tell comment from CDATA.
        md = @source.match(/([^>]*>)/um)
        #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][0] == ?-
          md = @source.match(/--(.*?)-->/um, true)

          if md.nil? || /--|-\z/.match?(md[1])
            raise REXML::ParseException.new("Malformed comment", @source)
          end

          return [ :comment, md[1] ]
        else
          md = @source.match(/\[CDATA\[(.*?)\]\]>/um, true)
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.match?("?", true)
        return process_instruction
      else
        # Get the next tag
        md = @source.match(Private::TAG_PATTERN, true)
        unless md
          @source.position = start_position
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        tag = md[1]
        @document_status = :in_element
        @prefixes.clear
        @prefixes << md[2] if md[2]
        push_namespaces_restore
        attributes, closed = parse_attributes(@prefixes)
        # Verify that all of the prefixes have been defined
        for prefix in @prefixes
          unless @namespaces.key?(prefix)
            raise UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if closed
          # Self-closing tag: remember it so the next call emits
          # :end_element, and undo its namespace bindings right away.
          @closed = tag
          pop_namespaces_restore
        else
          if @tags.empty? and @have_root
            raise ParseException.new("Malformed XML: Extra tag at the end of the document (got '<#{tag}')", @source)
          end
          @tags.push( tag )
        end
        @have_root = true
        return [ :start_element, tag, attributes ]
      end
    else
      text = @source.read_until("<")
      # Give the "<" back to the source so the next call sees it.
      if text.chomp!("<")
        @source.position -= "<".bytesize
      end
      if @tags.empty?
        # Outside any element only whitespace is allowed.
        unless /\A\s*\z/.match?(text)
          if @have_root
            raise ParseException.new("Malformed XML: Extra content at the end of the document (got '#{text}')", @source)
          else
            raise ParseException.new("Malformed XML: Content at the start of the document (got '#{text}')", @source)
          end
        end
        # Whitespace after the root element is skipped, not reported.
        return pull_event if @have_root
      end
      return [ :text, text ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue => error
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, (error ? error : $!) )
  end
  return [ :dummy ]
end
push_namespaces_restore() 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 625
# Opens a new, empty frame on @namespaces_restore_stack and returns it.
# +add_namespace+ records overwritten bindings in the top frame.
def push_namespaces_restore
  {}.tap { |frame| @namespaces_restore_stack.push(frame) }
end
record_entity_expansion(delta=1) 点击以切换源代码
# File rexml-3.4.0/lib/rexml/parsers/baseparser.rb, line 642
# Adds +delta+ (default 1) to the entity expansion counter and raises a
# RuntimeError once the counter exceeds @entity_expansion_limit.
# Returns nil otherwise.
def record_entity_expansion(delta = 1)
  @entity_expansion_count += delta
  return unless @entity_expansion_count > @entity_expansion_limit

  raise "number of entity expansions exceeded, processing aborted."
end