class XMLScan::HTMLScanner

Public Instance Methods

get_cdata_content() click to toggle source

This method should be called only from on_stag_end.

    # File lib/xmlscan/htmlscan.rb
# Gathers text from the source up to, but not including, the next chunk
# that begins with `</'.
#
# This method should be called only from on_stag_end.
#
# Returns the gathered string.  An empty string is returned when @src.test
# yields nothing or when the very next chunk already begins with `</'.
def get_cdata_content
  # True when there is no upcoming chunk or the upcoming chunk starts `</'.
  at_end = lambda do
    upcoming = @src.test
    !upcoming || (upcoming[0] == ?< && upcoming[1] == ?/)
  end

  return '' if at_end.call

  content = @src.get
  content << @src.get_plain until at_end.call
  content
end

Private Instance Methods

found_invalid_pubsys(pubsys) click to toggle source
Calls superclass method
    # File lib/xmlscan/htmlscan.rb
# Accepts the keywords PUBLIC and SYSTEM regardless of letter case.
#
# pubsys:: the keyword as it was written in the document.
#
# Returns the upper-cased keyword when it is `PUBLIC' or `SYSTEM';
# any other value is delegated to the superclass implementation.
def found_invalid_pubsys(pubsys)
  keyword = pubsys.upcase
  case keyword
  when 'PUBLIC', 'SYSTEM' then keyword
  else super
  end
end
on_stag_end_empty(name, *a) click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_stag_end_empty(name, *a)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl() click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_xmldecl
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_encoding(str) click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_xmldecl_encoding(str)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_end() click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_xmldecl_end
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_other(name, value) click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_xmldecl_other(name, value)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_standalone(str) click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_xmldecl_standalone(str)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_version(str) click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Always raises RuntimeError: per its message this handler must never be
# invoked, so reaching it indicates a bug.
def on_xmldecl_version(str)
  raise RuntimeError, "[BUG] this method must be never called"
end
scan_bang_tag(s) click to toggle source
    # File lib/xmlscan/htmlscan.rb
# Handles a chunk +s+ that begins with `<!'.
#
# A bare `<!' whose tag is closed (i.e. `<!>') is reported as an empty
# comment and the result of on_comment is returned.  Anything else is
# reported with parse_error, after which chunks are read with
# @src.get_plain and discarded until @src.close_tag succeeds or no chunk
# is left; nil is returned in that case.
def scan_bang_tag(s)
  return on_comment('') if s == '<!' && @src.close_tag   # <!>

  parse_error "parse error at `<!'"
  # Skip everything up to the closing of this declaration.
  s = @src.get_plain while s && !@src.close_tag
  nil
end
scan_comment(s) click to toggle source
    # File lib/xmlscan/htmlscan.rb
 # Scans a comment whose first chunk +s+ begins with `<!--' and passes the
 # collected text to on_comment.
 #
 # +s+ is modified in place (its first four characters are removed).  More
 # input is pulled with @src.get_plain whenever the current chunk holds no
 # `--'; if that yields nil/false, "unterminated comment meets EOF" is
 # reported and the text gathered so far is delivered.  Text that follows a
 # `--' which does not end the comment is reported through parse_error and
 # kept as part of the comment text.
 #
 # Returns the result of on_comment.
 57 def scan_comment(s)
 58   s[0,4] = ''  # remove `<!--'
 59   comm = ''
      # Accumulate chunks until one of them contains `--'.
 60   until /--/n =~ s
 61     comm << s
 62     s = @src.get_plain
 63     unless s then
 64       parse_error "unterminated comment meets EOF"
 65       return on_comment(comm)
 66     end
 67   end
      # $` is the text before the matched `--', $' the text after it.
 68   comm << $`
 69   s = $'
      # `||' binds tighter than `and': the loop ends once the remainder is
      # empty or blank AND @src.close_tag succeeds.
 70   until s.empty? || s.strip.empty? and @src.close_tag   # --> or -- >
        # The `--' just consumed did not end the comment, so keep it as text.
 71     comm << '--'
 72     if /\A\s*--/n =~ s then   # <!--hoge-- --
 73       comm << $&
 74       s = $'
 75       if s.empty? and @src.close_tag then   # <!--hoge-- -->
 76         parse_error "`-->' is found but comment must not end here"
            # Take back the `--' that was appended from $& just above.
 77         comm.chop!.chop!
 78         break
 79       end
 80     else                     # <!--hoge-- fuga
 81       parse_error "only whitespace can appear between two comments"
 82     end
 83     if /\A-\s*\z/n =~ s and @src.close_tag then  # <!--hoge--->
 84       parse_error "`-->' is found but comment must not end here"
          # Drop the final `-' appended above.
 85       comm.chop!
 86       break
 87     end
        # Same accumulation loop as at the top of the method, for the next `--'.
 88     until /--/n =~ s      # copy & paste for performance
 89       comm << s
 90       s = @src.get_plain
 91       unless s then
 92         parse_error "unterminated comment meets EOF"
 93         return on_comment(comm)
 94       end
 95     end
 96     comm << $`
 97     s = $'
 98   end
 99   on_comment comm
100 end
scan_internal_dtd(s) click to toggle source
    # File lib/xmlscan/htmlscan.rb
# Reports an internal DTD subset as a parse error and then skips over it.
#
# s:: the chunk handed on unchanged to skip_internal_dtd.
#
# Returns whatever skip_internal_dtd returns.
def scan_internal_dtd(s)
  parse_error("DTD subset is found but it is not permitted in HTML")
  skip_internal_dtd(s)
end
scan_pi(s) click to toggle source
    # File lib/xmlscan/htmlscan.rb
# Scans a processing instruction of the form `<?PI >' (the SGML default).
#
# s:: the chunk beginning with `<?'.  It is modified in place: the leading
#     two characters are removed and every further chunk read with
#     @src.get_plain is appended to it.
#
# Reading continues until @src.close_tag succeeds.  If no further chunk is
# available first, "unterminated PI meets EOF" is reported.  In both cases
# the gathered text is passed to on_pi with an empty target, and the result
# of on_pi is returned.
def scan_pi(s)   # <?PI >  this is default in SGML.
  s.slice!(0, 2)   # remove `<?'
  body = s
  until @src.close_tag
    chunk = @src.get_plain
    if chunk
      body << chunk
    else
      parse_error "unterminated PI meets EOF"
      break
    end
  end
  on_pi '', body
end
Also aliased as: scan_xml_pi
scan_prolog(s) click to toggle source
    # File lib/xmlscan/htmlscan.rb
# Scans the leading part of the document, starting with chunk +s+ and
# reading further chunks with @src.get.
#
# Each chunk is dispatched as follows:
# * `<!--...'                       -> scan_comment
# * `<!doctype' + whitespace (any case) -> scan_doctype with the text after
#   the keyword; a second or later declaration is also reported through
#   parse_error
# * `<?...'                         -> scan_pi
# * empty or whitespace-only text   -> on_prolog_space
#
# The first chunk matching none of these ends the loop and is handed to
# scan_content.  When the chunks run out instead, scan_content receives the
# result of one more @src.get.  Returns the result of scan_content.
def scan_prolog(s)
  doctype_count = 0
  while s
    if s[0] != ?<
      break unless s.strip.empty?
      on_prolog_space s
    elsif s[1] == ??
      scan_pi s
    elsif s[1] != ?!
      break
    elsif s[2] == ?- && s[3] == ?-
      scan_comment s
    elsif /\A<!doctype(?=\s)/in =~ s
      rest = $'
      doctype_count += 1
      parse_error "another document type declaration is found" if doctype_count > 1
      scan_doctype rest
    else
      break
    end
    s = @src.get
  end
  scan_content(s || @src.get)
end
scan_stag(s) click to toggle source
    # File lib/xmlscan/htmlscan.rb
# Scans a start tag held in chunk +s+ (which begins with `<') and reports
# it through on_stag, the attribute callbacks and on_stag_end.
#
# Two shapes are distinguished:
# * the chunk contains no `/', whitespace, `=' or quote character: all of
#   it after `<' is the tag name and there are no attributes;
# * otherwise the text before the first such character is the tag name and
#   the remainder is scanned for attributes.
#
# A tag without a name is reported with parse_error and handed to
# on_chardata as literal text (or to found_empty_stag for `<>').  A tag
# that @src.close_tag does not confirm as closed is reported through
# found_unclosed_stag before on_stag_end is called.
121 def scan_stag(s)
122   unless /(?=[\/\s='"])/n =~ s then
        # No delimiter at all: name only, no attributes.
123     name = s
124     name[0,1] = ''        # remove `<'
125     if name.empty? then   # <> or <<
126       if @src.close_tag then
127         return found_empty_stag
128       else
129         parse_error "parse error at `<'"
130         return on_chardata '<'
131       end
132     end
133     on_stag name
134     found_unclosed_stag name unless @src.close_tag
135     on_stag_end name
136   else
        # $` is the text before the first delimiter, $' starts at the delimiter.
137     name = $`
138     s = $'
139     name[0,1] = ''        # remove `<'
140     if name.empty? then   # `< tag' or `<=`
141       parse_error "parse error at `<'"
          # Put the closing `>' back so the text is passed on as written.
142       if @src.close_tag then
143         s << '>'
144       end
145       return on_chardata '<'+s
146     end
147     on_stag name
        # Each pass scans one stretch of attribute text.  Another pass is
        # made when a quoted value ran past the current chunk (see below).
148     begin
149       continue = false
          # Captures: key, quoted value (quotes included, closing quote
          # optional), unquoted value, or a single stray non-space character.
150       s.scan(
151      /([^\s=\/'"]+)(?:\s*=\s*(?:('[^']*'?|"[^"]*"?)|([^\s='"]+)))?|(\S)/n
152              ) { |key,val,val2,error|
153         if key then
154           if val then                # key="value"
155             on_attribute key
                # qmark is the opening quote; val keeps what followed it.
156             qmark = val.slice!(0,1)
157             if val[-1] == qmark[0] then
                  # Closing quote is present in this chunk.
158               val.chop!
159               scan_attr_value val unless val.empty?
160             else
                  # No closing quote here: keep reading chunks with @src.get
                  # until one of them contains the quote character.
161               scan_attr_value val unless val.empty?
162               begin
163                 s = @src.get
164                 unless s then
165                   parse_error "unterminated attribute `#{key}' meets EOF"
166                   break
167                 end
168                 c = s[0]
                    # val: text up to the quote; s: text after it, or nil
                    # when this chunk holds no quote.
169                 val, s = s.split(qmark, 2)
                    # NOTE(review): presumably restores the `>' that separated
                    # this chunk from the previous one -- confirm against the
                    # chunking done by @src.
170                 scan_attr_value '>' unless c == ?< or c == ?>
171                 scan_attr_value val if c
172               end until s
                  # Text after the closing quote is scanned by another pass of
                  # the outer loop; nil (EOF) ends the outer loop.
173               continue = s
174             end
175             on_attribute_end key
176           elsif val2 then            # key=value
177             on_attribute key
178             on_attr_value val2
179             on_attribute_end key
180           else                       # value
                # A bare word is reported as a value of an attribute whose
                # name is nil.
181             on_attribute nil
182             on_attr_value key
183             on_attribute_end nil
184           end
185         else
186           parse_error "parse error at `#{error}'"
187         end
188       }
189     end while continue
190     found_unclosed_stag name unless @src.close_tag
191     on_stag_end name
192   end
193 end
scan_xml_pi(s)
Alias for: scan_pi
wellformed_error(msg) click to toggle source
   # File lib/xmlscan/htmlscan.rb
# Intentionally empty: +msg+ is discarded.
19 def wellformed_error(msg)
20   # All well-formedness errors raised by XMLScanner are ignored.
21   # XMLScanner only raises a well-formedness error in scan_stag, which is
22   # a method completely overridden by HTMLScanner, so this method is
23   # never called in fact.
24 end