class WebStat::Fetch

Attributes

header[RW]
html[RW]
nokogiri[RW]
status[RW]
url[RW]
userdic[RW]

Public Instance Methods

content() click to toggle source

Get main section

# File lib/web_stat/fetch.rb, line 36
# Extract the main text of the fetched page.
#
# When @url matches the configured "youtube" id-extraction pattern the result
# of youtube_decscription is returned; otherwise the document's <body> is
# passed through Readability and the result through Sanitize.clean.
#
# @return [String] main content of the page
def content
  youtube_match = @url&.match(WebStat::Configure.get["id_extraction_regexs"]["youtube"])
  return youtube_decscription if youtube_match

  body_markup = @nokogiri.at('body').to_s
  readable = Readability::Document.new(body_markup)
  Sanitize.clean(readable.content)
end
eyecatch_image_path() click to toggle source

Get temporary path of image

# File lib/web_stat/fetch.rb, line 57
# Work out the URL of the page's eyecatch (thumbnail) image.
#
# Resolution order:
# 1. a provider thumbnail rule, when @url matches one of the configured
#    "id_extraction_regexs" (the match is rewritten with "thumbnail_regex");
# 2. the value of the first node found by the configured "eyecatch_image_xpaths";
# 3. the src of the first <img> in the Readability-extracted content;
# 4. the src of the first <img> anywhere in the document.
#
# Paths starting with "/" are made absolute against @url: protocol-relative
# paths ("//host/...") only receive the scheme, root-relative paths receive
# scheme, host and (when non-default) port.
#
# @return [String, nil] image URL; nil (or an empty string) when none was found
def eyecatch_image_path
  settings = WebStat::Configure.get

  # A provider rule overrides anything found in the markup, so check it first
  # and skip the document scan entirely when it applies.
  settings["id_extraction_regexs"].each do |provider, regex_string|
    next unless @url.match(regex_string)
    return @url.gsub(%r{#{regex_string}.*$}, settings["thumbnail_regex"][provider])
  end

  path = nil
  settings["eyecatch_image_xpaths"].each do |xpath|
    node = @nokogiri.xpath(xpath).first
    next unless node.respond_to?(:value)
    path = node.value
    break
  end

  # Only pay for the Readability pass when the xpath rules found nothing.
  if path.nil? || path.empty?
    readable = ::Nokogiri::HTML(Readability::Document.new(@nokogiri.at('body').to_s).content)
    [readable, @nokogiri].each do |document|
      image = document.xpath('//img').first
      path = image.attr('src') if image
      break unless path.nil? || path.empty?
    end
  end

  return path if path.nil? || !path.start_with?("/")

  base = URI.parse(@url)
  # "//cdn.example.com/a.png" already names its host; it only lacks a scheme.
  return "#{base.scheme}:#{path}" if path.start_with?("//")

  origin = "#{base.scheme}://#{base.host}"
  # Keep an explicit port so pages served on e.g. :8080 resolve correctly.
  origin = "#{origin}:#{base.port}" if base.port && base.port != base.default_port
  "#{origin}#{path}"
end
get_last_modified() click to toggle source

Return the later of the Date and Last-Modified response headers. @return [DateTime]

# File lib/web_stat/fetch.rb, line 140
# Return the later of the "date" and "last-modified" response headers.
#
# Each header is parsed once. A header that is absent, nil or cannot be
# parsed as a date is ignored rather than raising, so one malformed header
# from the remote server does not discard the other one.
# Initialises @header to an empty Hash when it is nil.
#
# @return [DateTime, nil] the most recent timestamp, or nil when neither
#   header yields a usable date
def get_last_modified
  @header ||= {}
  timestamps = %w[date last-modified].map do |name|
    raw = @header[name]
    next if raw.nil?

    begin
      DateTime.parse(raw.to_s)
    rescue ArgumentError
      # Date::Error is an ArgumentError; treat an unparsable header as missing.
      nil
    end
  end
  # "date" is listed first, so it wins when both timestamps are equal.
  timestamps.compact.max
end
get_url(url) click to toggle source

Fetch the body of a url. @param [String] url @return [String] body

# File lib/web_stat/fetch.rb, line 108
# Fetch +url+ and return its body, recording response metadata on the way.
#
# Sets @header from the Mechanize response and @status from either the
# web driver path (200) or the Mechanize response code. robots.txt is
# honoured: a disallowed URL raises Mechanize::RobotsDisallowedError, which
# is not rescued here.
#
# @param [String] url
# @return [String, nil] body of the page
def get_url(url)
  agent = Mechanize.new do |client|
    client.user_agent = WebStat::Configure.get["user_agent"]
  end
  # Enable to read Robots.txt
  agent.robots = true

  html = nil
  begin
    raise Mechanize::RobotsDisallowedError.new(url) if agent.agent.robots_disallowed?(url)

    page = agent.get(url, [], nil, { 'Accept-Language' => 'ja'})
    @header = page.header
    begin
      # Raising here routes the "driver disabled" case through the same
      # fallback as a web driver failure.
      raise 'not_use_chromedirver' unless WebStat::Configure.get["use_chromedirver"]
      html = WebStat::WebDriverHelper.get_source(url)
      @status = 200
    rescue
      # Plain Mechanize::File bodies are used as-is; anything else is
      # transcoded to UTF-8 from the encoding reported by the response.
      html = page.class == Mechanize::File ? page.body : page.body.encode('UTF-8', page.encoding)
      @status = page.code
    end
  rescue Mechanize::ResponseCodeError => error
    # Error responses still carry a page; keep its body and status code.
    failed_page = error.page
    html = failed_page.body
    @status = failed_page.code
  end
  html
end
save_local_path(url) click to toggle source

Get local path to save url @param [String] url

# File lib/web_stat/fetch.rb, line 88
# Download +url+ and store it under /tmp, named by the SHA1 of the URL.
#
# @param [String] url absolute http(s) URL of the resource to save
# @return [String, nil, false] path of the saved file; nil when +url+ is nil
#   or does not start with "http"; false when anything raises while
#   downloading or writing
def save_local_path(url)
  return nil if url.nil? || !url.match(%r{^http})

  tmp_file = "/tmp/#{Digest::SHA1.hexdigest(url)}"
  agent = Mechanize.new { |client| client.user_agent = WebStat::Configure.get["user_agent"] }
  image = agent.get(url)
  File.open(tmp_file, "w+b") do |file|
    # Use write, not puts: puts appends a newline whenever the payload does
    # not already end in one, which corrupts binary image data.
    if image.class == Mechanize::File
      file.write(image.body)
    elsif image.respond_to?(:body_io)
      file.write(image.body_io.read)
    end
  end
  tmp_file
rescue
  # Best effort: any failure is reported to the caller as false.
  false
end
site_name() click to toggle source

Get name of domain

# File lib/web_stat/fetch.rb, line 23
# Derive the site name from the document title.
#
# The title is split in two on the configured "regex_to_sprit_title"
# pattern and the last part is used. If that raises, the raw title is used
# instead.
#
# @return [String] stripped site name, or "No Sitename" when none is available
def site_name
  name =
    begin
      separator = /#{WebStat::Configure.get["regex_to_sprit_title"]}/
      @nokogiri.title.split(separator, 2).last
    rescue
      @nokogiri.title
    end
  name.nil? ? "No Sitename" : name.strip
end
stat(userdics: nil) click to toggle source

Get information about @url. @param [Hash] userdics Specify a dictionary for each language code. Example: {“ja”: /***/***.dic, “other”: /***/***.dic}

# File lib/web_stat/fetch.rb, line 157
# Collect the information about @url into a single Hash.
#
# The page content and title are computed once and reused, because
# extracting the content can be expensive. The tagging dictionary is chosen
# in this order: the entry of +userdics+ for the detected language code,
# then its "other" entry, then the configured "userdic"; an entry is only
# used when its file exists.
#
# @param [Hash, nil] userdics dictionary path per language code, looked up
#   with the code returned by CLD and with the string key "other"
# @return [Hash] with keys :title, :site_name, :content, :language_code,
#   :status, :url, :last_modified_at, :eyecatch_image_path and :tags
def stat(userdics: nil)
  # to_s guards against a nil content, which would otherwise raise on scrub.
  raw_content = content.to_s
  page_title = title
  clean_content = raw_content.scrub('').gsub(/[\n\t\r ]/, "").gsub(/\s{2,}/, "\s").gsub(URI.regexp, "")
  language_code = CLD.detect_language(clean_content)[:code]

  userdic = nil
  if userdics
    # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
    userdic = [language_code, "other"]
      .map { |key| userdics[key] }
      .find { |candidate| candidate && File.exist?(candidate) }
  end
  userdic ||= WebStat::Configure.get["userdic"]
  tag = WebStat::Tag.new("#{page_title} #{raw_content}", userdic: userdic)

  {
    title: page_title,
    site_name: site_name,
    content: clean_content,
    language_code: language_code,
    status: @status,
    url: @url,
    last_modified_at: get_last_modified,
    eyecatch_image_path: save_local_path(eyecatch_image_path),
    tags: tag.nouns
  }
end
title() click to toggle source

Get title @return [String] title

# File lib/web_stat/fetch.rb, line 6
# Derive the page title.
#
# The document title is split in two on the configured
# "regex_to_sprit_title" pattern and the first part is used. When that part
# is shorter than "min_length_of_meta_title", the content of the first <h1>
# is used instead. If any of this raises, the raw document title is used.
#
# @return [String] stripped title, or "No Title" when none is available
def title
  heading =
    begin
      separator = /#{WebStat::Configure.get["regex_to_sprit_title"]}/
      candidate = @nokogiri.title.split(separator, 2).first
      if candidate.length < WebStat::Configure.get["min_length_of_meta_title"]
        @nokogiri.css("h1").first.content
      else
        candidate
      end
    rescue
      @nokogiri.title
    end
  heading.nil? ? "No Title" : heading.strip
end
youtube_decscription() click to toggle source

Get the description of a YouTube video.

# File lib/web_stat/fetch.rb, line 45
# Fetch the description of the YouTube video that @url points to.
#
# The video id is the first capture group of the configured "youtube"
# id-extraction pattern. It is read from the match itself, so the pattern
# does not have to be anchored at the start of the URL.
#
# @return [String, nil] the snippet description; nil when @url does not
#   match the pattern or the API response contains no item for the id
def youtube_decscription
  regex_string = WebStat::Configure.get["id_extraction_regexs"]["youtube"]
  matched = @url.match(regex_string)
  return nil unless matched

  youtube = Google::Apis::YoutubeV3::YouTubeService.new
  youtube.key = WebStat::Configure.get["api_keys"]["youtube"]
  response = youtube.list_videos(:snippet, id: matched[1])
  # An empty items list would otherwise raise NoMethodError on nil.
  video = response.items&.first
  video&.snippet&.description
end

Private Instance Methods

original_url(url) click to toggle source

Get original url @param [String] url

# File lib/web_stat/fetch.rb, line 184
# Resolve +url+ to the URL reported by WebStat::FinalRedirectUrl.
#
# @param [String] url
# @return [String] the resolved URL, or +url+ itself when the resolved value
#   is nil or empty once invalid byte sequences are scrubbed out
def original_url(url)
  resolved = WebStat::FinalRedirectUrl.final_redirect_url(url)
  return url if resolved.nil?

  resolved.scrub('').empty? ? url : resolved
end