class XMLScan::HTMLScanner
Public Instance Methods
get_cdata_content()
click to toggle source
This method should be called only from on_stag_end.
# File lib/xmlscan/htmlscan.rb, line 197
# Gathers consecutive chunks from @src until the next pending chunk begins
# with `</' or @src.test reports nothing more.
#
# The first chunk is taken with @src.get, every following one with
# @src.get_plain, and they are appended to the first chunk in place.
#
# Returns the accumulated String, or '' when nothing is pending or the
# pending chunk already begins with `</'.
#
# This method should be called only from on_stag_end.
def get_cdata_content
  upcoming = @src.test
  return '' if !upcoming || (upcoming[0] == ?< && upcoming[1] == ?/)

  content = @src.get
  loop do
    upcoming = @src.test
    break if !upcoming || (upcoming[0] == ?< && upcoming[1] == ?/)
    content << @src.get_plain
  end
  content
end
Private Instance Methods
found_invalid_pubsys(pubsys)
click to toggle source
Calls superclass method
# File lib/xmlscan/htmlscan.rb, line 229
# Accepts the PUBLIC / SYSTEM keywords regardless of letter case.
#
# pubsys:: the keyword text as it was found.
#
# Returns the upcased keyword when it is 'PUBLIC' or 'SYSTEM'; any other
# value is handed to the superclass implementation.
def found_invalid_pubsys(pubsys)
  keyword = pubsys.upcase
  if keyword == 'PUBLIC' || keyword == 'SYSTEM'
    keyword
  else
    super
  end
end
on_stag_end_empty(name, *a)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 50
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
#
# name:: ignored.
# a::    ignored.
def on_stag_end_empty(name, *a)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl()
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 26
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
def on_xmldecl
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_encoding(str)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 34
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
#
# str:: ignored.
def on_xmldecl_encoding(str)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_end()
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 46
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
def on_xmldecl_end
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_other(name, value)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 42
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
#
# name::  ignored.
# value:: ignored.
def on_xmldecl_other(name, value)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_standalone(str)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 38
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
#
# str:: ignored.
def on_xmldecl_standalone(str)
  raise RuntimeError, "[BUG] this method must be never called"
end
on_xmldecl_version(str)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 30
# Guard hook: always raises RuntimeError with a "[BUG]" message, marking
# this callback as one that is expected never to be invoked.
#
# str:: ignored.
def on_xmldecl_version(str)
  raise RuntimeError, "[BUG] this method must be never called"
end
scan_bang_tag(s)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 211
# Handles a `<!' construct that is neither a comment nor a doctype.
#
# s:: the chunk beginning with `<!'.
#
# A bare `<!' that is closed immediately (`<!>') is reported as an empty
# comment through on_comment. Anything else is reported through
# parse_error, after which chunks are read with @src.get_plain and
# discarded until @src.close_tag succeeds or no chunk is left.
def scan_bang_tag(s)
  return on_comment('') if s == '<!' && @src.close_tag   # <!>

  parse_error "parse error at `<!'"
  # Skip the entire malformed declaration.
  until !s || @src.close_tag
    s = @src.get_plain
  end
end
scan_comment(s)
click to toggle source
# File lib/xmlscan/htmlscan.rb 57 def scan_comment(s) 58 s[0,4] = '' # remove `<!--' 59 comm = '' 60 until /--/n =~ s 61 comm << s 62 s = @src.get_plain 63 unless s then 64 parse_error "unterminated comment meets EOF" 65 return on_comment(comm) 66 end 67 end 68 comm << $` 69 s = $' 70 until s.empty? || s.strip.empty? and @src.close_tag # --> or -- > 71 comm << '--' 72 if /\A\s*--/n =~ s then # <!--hoge-- -- 73 comm << $& 74 s = $' 75 if s.empty? and @src.close_tag then # <!--hoge-- --> 76 parse_error "`-->' is found but comment must not end here" 77 comm.chop!.chop! 78 break 79 end 80 else # <!--hoge-- fuga 81 parse_error "only whitespace can appear between two comments" 82 end 83 if /\A-\s*\z/n =~ s and @src.close_tag then # <!--hoge---> 84 parse_error "`-->' is found but comment must not end here" 85 comm.chop! 86 break 87 end 88 until /--/n =~ s # copy & paste for performance 89 comm << s 90 s = @src.get_plain 91 unless s then 92 parse_error "unterminated comment meets EOF" 93 return on_comment(comm) 94 end 95 end 96 comm << $` 97 s = $' 98 end 99 on_comment comm 100 end
scan_internal_dtd(s)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 223
# Reports an internal DTD subset as a parse error and then skips over it.
#
# s:: passed unchanged to skip_internal_dtd.
#
# Returns whatever skip_internal_dtd returns.
def scan_internal_dtd(s)
  parse_error("DTD subset is found but it is not permitted in HTML")
  skip_internal_dtd(s)
end
scan_pi(s)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 106
# Scans a processing instruction of the form `<?PI >' (this is the default
# form in SGML) and reports it through on_pi with an empty target.
#
# s:: the chunk beginning with `<?'. It is modified in place: the leading
#     two characters are removed and every following chunk is appended.
#
# Chunks are read with @src.get_plain until @src.close_tag succeeds. If the
# chunks run out first, "unterminated PI meets EOF" is reported through
# parse_error and what was collected is still delivered.
#
# Returns the result of on_pi.
def scan_pi(s)
  s[0, 2] = ''   # remove `<?'
  body = s
  loop do
    break if @src.close_tag
    chunk = @src.get_plain
    unless chunk
      parse_error "unterminated PI meets EOF"
      break
    end
    body << chunk
  end
  on_pi '', body
end
Also aliased as: scan_xml_pi
scan_prolog(s)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 236
# Consumes the document prolog and then hands over to scan_content.
#
# s:: the first chunk, or nil to start with the next chunk from @src.
#
# Each chunk is dispatched by its leading characters:
# * `<!--'                      -> scan_comment
# * `<!doctype' + whitespace    -> scan_doctype (case-insensitive); every
#   declaration after the first is also reported through parse_error
# * `<?'                        -> scan_pi
# * whitespace only             -> on_prolog_space
# Any other chunk ends the prolog and is passed to scan_content. When the
# chunks run out instead, scan_content receives one more @src.get.
def scan_prolog(s)
  doctype_count = 0
  while s
    if s[0] != ?<
      break unless s.strip.empty?
      on_prolog_space s
    elsif s[1] == ??
      scan_pi s
    elsif s[1] != ?!
      break
    elsif s[2] == ?- && s[3] == ?-
      scan_comment s
    elsif (decl = /\A<!doctype(?=\s)/in.match(s))
      doctype_count += 1
      if doctype_count > 1
        parse_error "another document type declaration is found"
      end
      scan_doctype decl.post_match
    else
      break
    end
    s = @src.get
  end
  scan_content(s || @src.get)
end
scan_stag(s)
click to toggle source
# File lib/xmlscan/htmlscan.rb 121 def scan_stag(s) 122 unless /(?=[\/\s='"])/n =~ s then 123 name = s 124 name[0,1] = '' # remove `<' 125 if name.empty? then # <> or << 126 if @src.close_tag then 127 return found_empty_stag 128 else 129 parse_error "parse error at `<'" 130 return on_chardata '<' 131 end 132 end 133 on_stag name 134 found_unclosed_stag name unless @src.close_tag 135 on_stag_end name 136 else 137 name = $` 138 s = $' 139 name[0,1] = '' # remove `<' 140 if name.empty? then # `< tag' or `<=` 141 parse_error "parse error at `<'" 142 if @src.close_tag then 143 s << '>' 144 end 145 return on_chardata '<'+s 146 end 147 on_stag name 148 begin 149 continue = false 150 s.scan( 151 /([^\s=\/'"]+)(?:\s*=\s*(?:('[^']*'?|"[^"]*"?)|([^\s='"]+)))?|(\S)/n 152 ) { |key,val,val2,error| 153 if key then 154 if val then # key="value" 155 on_attribute key 156 qmark = val.slice!(0,1) 157 if val[-1] == qmark[0] then 158 val.chop! 159 scan_attr_value val unless val.empty? 160 else 161 scan_attr_value val unless val.empty? 162 begin 163 s = @src.get 164 unless s then 165 parse_error "unterminated attribute `#{key}' meets EOF" 166 break 167 end 168 c = s[0] 169 val, s = s.split(qmark, 2) 170 scan_attr_value '>' unless c == ?< or c == ?> 171 scan_attr_value val if c 172 end until s 173 continue = s 174 end 175 on_attribute_end key 176 elsif val2 then # key=value 177 on_attribute key 178 on_attr_value val2 179 on_attribute_end key 180 else # value 181 on_attribute nil 182 on_attr_value key 183 on_attribute_end nil 184 end 185 else 186 parse_error "parse error at `#{error}'" 187 end 188 } 189 end while continue 190 found_unclosed_stag name unless @src.close_tag 191 on_stag_end name 192 end 193 end
wellformed_error(msg)
click to toggle source
# File lib/xmlscan/htmlscan.rb, line 19
# Deliberately does nothing and returns nil.
#
# msg:: ignored.
def wellformed_error(msg)
  # All well-formedness errors raised by XMLScanner are ignored.
  # XMLScanner only raises a well-formedness error in scan_stag, which is
  # a method completely overridden by HTMLScanner, so this method is
  # never called in fact.
  nil
end