class Doko
Public Class Methods
deep(str,base_uri=nil)
click to toggle source
# File lib/doko.rb, line 11
# Extracts addresses from +str+, following linked pages when +str+ itself
# yields none.
#
# str      - whatever Doko.parse accepts.
# base_uri - forwarded unchanged to Doko.links.
#
# Returns the addresses parsed from +str+ when there are any; otherwise the
# concatenated addresses parsed from every link returned by Doko.links.
def self.deep(str, base_uri = nil)
  direct_hits = parse(str)
  return direct_hits unless direct_hits.empty?

  links(str, base_uri).flat_map { |link| parse(link) }
end
links(str,base_uri=nil)
click to toggle source
# File lib/doko.rb, line 21
# Collects the absolute URIs of relative "access" links found in a document.
#
# str      - a URI object or a String that is exactly one URL (the page is
#            fetched through the URI's #open, which relies on open-uri being
#            loaded elsewhere in the file), or a String of HTML.
# base_uri - String URL used to resolve relative hrefs when +str+ is HTML.
#            Ignored when +str+ is itself a URL.
#
# Only anchors whose href contains "access" and does not contain "http" are
# kept; each one is resolved against the document's URI.
#
# Returns an Array of unique URI objects. Returns [] when +str+ is HTML and
# no +base_uri+ is given, because relative hrefs cannot be resolved then.
def self.links(str, base_uri = nil)
  if str.kind_of?(URI)
    uri = str
    doc = Nokogiri::HTML(uri.open.read)
  elsif str.match(/\A#{URI.regexp}\z/)
    # \A...\z instead of ^...$: the whole string must be a URL. With the
    # per-line anchors any HTML/text containing a URL-looking line was
    # mistaken for a URL.
    uri = URI.parse(str)
    # uri.open instead of Kernel#open(uri): the latter stopped opening URLs
    # in Ruby 3.0.
    doc = Nokogiri::HTML(uri.open.read)
  else
    return [] if base_uri.nil?

    uri = URI.parse(base_uri)
    doc = Nokogiri::HTML(str)
  end

  hrefs = doc.search("a").map { |anchor| anchor[:href] }.compact
  relative_access = hrefs.select { |href| href.match(/access/) && !href.match(/http/) }
  relative_access.map { |href| uri + href }.uniq
end
new(str)
click to toggle source
# File lib/doko.rb, line 38
# Builds a parser over the text of a document.
#
# str - a URI object or a String that is exactly one URL (the page is fetched
#       through the URI's #open, which relies on open-uri being loaded
#       elsewhere in the file), or a String of HTML / plain text.
#
# When the content contains "<html" (case-insensitive) only the text of its
# <body> is kept; otherwise the content is kept verbatim.
def initialize(str)
  if str.kind_of?(URI)
    str = str.open.read
  elsif str.match(/\A#{URI.regexp}\z/)
    # \A...\z instead of ^...$: only a string that is a URL as a whole is
    # fetched. With per-line anchors, plain text containing a line such as
    # "TEL:03-1234-5678" was handed to open.
    # uri.open instead of Kernel#open(str): the latter stopped opening URLs
    # in Ruby 3.0 and treats its argument as a file path.
    str = URI.parse(str).open.read
  end

  # Keep a private copy so the instance never aliases the caller's
  # (possibly frozen) string.
  @text = str.match(/<html/i) ? (Nokogiri::HTML(str) / "body").text : str.dup
end
parse(str)
click to toggle source
# File lib/doko.rb, line 7
# Convenience wrapper: builds an instance from +str+ and returns the result
# of its #parse.
def self.parse(str)
  doko = new(str)
  doko.parse
end
Public Instance Methods
parse()
click to toggle source
# File lib/doko.rb, line 51
# Extracts Japanese postal addresses from the text given to the constructor.
#
# The text is first normalised: full-width digits, parentheses, period and
# colon, the ideographic comma and the ideographic space are mapped to their
# ASCII counterparts, because the patterns below (and #clean) only know the
# ASCII forms. NOTE(review): the ideographic space (U+3000) is assumed to be
# the intended source of the space mapping - confirm.
#
# Two patterns are tried in order:
# 1. prefecture (都/道/府/県) + municipality (市/区/町/村) + the rest of the
#    run up to a comma, parenthesis or newline;
# 2. only when 1 finds nothing: city/ward (市/区) + district (区/町/村) +
#    a short run ending in a digit.
#
# Returns an Array of Strings, each passed through #clean; candidates that
# contain "を" are dropped.
def parse
  # tr (not tr!) builds a new string, so @text is never modified and frozen
  # input is accepted.
  body = @text
         .tr("\uFF10-\uFF19", "0-9")
         .tr("\uFF08\uFF09", "()")
         .tr("\u3001", ",")
         .tr("\u3000", " ")
         .tr("\uFF0E", ".")
         .tr("\uFF1A", ":")

  candidates = body.scan(/\b[^\s,()]{2,3}[都道府県][^\s,()]{1,8}[市区町村][^,()\n]+/)
  if candidates.empty?
    candidates = body.scan(/[^\s]{1,6}[市区].{1,8}[区町村][^\s,()]{2,10}\d/)
  end

  candidates.map { |addr| clean(addr) }.reject { |addr| addr.include?("を") }
end
Private Instance Methods
clean(line)
click to toggle source
# File lib/doko.rb, line 77
# Strips labels and trailing noise from one address candidate.
#
# line - String candidate; it is NOT modified (non-bang gsub/sub are used so
#        frozen or shared strings are safe).
#
# Removed, in order: the "住所" label (plus one following whitespace), a
# "〒NNN-NNNN" postal code, trailing whitespace, everything from a "電話:"
# label to the end of the line, the literal "[MAP]", everything from a
# "TEL:"/"FAX:" phone number onwards, one remaining trailing whitespace, and
# anything following a run of three or more whitespace characters.
# "ー" between single digits is rewritten to "-".
#
# Returns the cleaned String.
def clean(line)
  line
    .gsub(/住所\s?/, "")
    .gsub(/〒\d{3}-\d{4} ?/, "")
    .gsub(/\s+$/, "")
    # .* (not .+): the phone label is dropped even when nothing follows it.
    .gsub(/\s?電話:.*$/, "")
    .gsub("[MAP]", "")
    # .* (not .+): a number is dropped even when it ends the line with a
    # two-digit last group.
    .gsub(/(TEL|FAX):\d{2,4}-\d{2,4}-\d{2,4}.*/, "")
    .gsub(/(\dー)*\d/) { |digits| digits.tr("ー", "-") }
    .sub(/\s$/, "")
    .gsub(/\s{3,}+.+$/, "")
end