class NHKore::ArticleScraper

@author Jonathan Bradley Whited @since 0.2.0

Attributes

cleaners[R]
datetime[RW]
dict[RW]
kargs[R]
missingno[RW]
polishers[R]
splitter[RW]
variators[R]
year[RW]

Public Class Methods

new(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil, polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true, variators: [BestVariator.new],year: nil,**kargs) click to toggle source

@param dict [Dict,:scrape,nil] the {Dict} (dictionary) to use for {Word#defn} (definitions)

[+:scrape+] auto-scrape it using {DictScraper}
[+nil+]     don't scrape/use it

@param missingno [Missingno] data to use as a fallback for Ruby words without kana/kanji,

instead of raising an error

@param strict [true,false]

Calls superclass method
# File lib/nhkore/article_scraper.rb, line 53
def initialize(url,cleaners: [BestCleaner.new],datetime: nil,dict: :scrape,missingno: nil,
    polishers: [BestPolisher.new],splitter: BestSplitter.new,strict: true,
    variators: [BestVariator.new],year: nil,**kargs)
  super(url,**kargs)

  @cleaners = Array(cleaners)
  @datetime = datetime.nil? ? nil : Util.jst_time(datetime)
  @dict = dict
  @kargs = kargs
  @missingno = missingno
  @polishers = Array(polishers)
  @splitter = splitter
  @strict = strict
  @variators = Array(variators)
  @year = year
end

Public Instance Methods

add_words(article,words,text) click to toggle source
# File lib/nhkore/article_scraper.rb, line 70
def add_words(article,words,text)
  words.each do |word|
    # Words should have already been cleaned.
    # If we don't check this, Word.new() could raise an error in polish().
    next if polish(word.word).empty?

    article.add_word(polish(word))

    variate(word.word).each do |v|
      v = polish(clean(v))

      next if v.empty?

      # Do not pass in "word: word". We only want defn & eng.
      # If we pass in kanji/kana & unknown, it will raise an error.
      article.add_word(Word.new(
        defn: word.defn,
        eng: word.eng,
        unknown: v
      ))
    end
  end

  split(text).each do |t|
    t = polish(clean(t))

    next if t.empty?

    article.add_word(Word.new(unknown: t))

    variate(t).each do |v|
      v = polish(clean(v))

      next if v.empty?

      article.add_word(Word.new(unknown: v))
    end
  end
end
clean(obj) click to toggle source
# File lib/nhkore/article_scraper.rb, line 110
def clean(obj)
  return Cleaner.clean_any(obj,@cleaners)
end
fix_bad_html() click to toggle source
# File lib/nhkore/article_scraper.rb, line 114
def fix_bad_html
  # Fixes:
  # - '<「<' without escaping '<' as '&lt;'
  #   - https://www3.nhk.or.jp/news/easy/k10012118911000/k10012118911000.html
  #   - '</p><br><「<ruby>台風<rt>たいふう</rt></ruby>'

  read

  # To add a new one, simply add '|(...)' on a newline and test Regexp.last_match().
  @str_or_io = @str_or_io.gsub(/
    (?<cane><「<)
  /x) do |match|
    if !Regexp.last_match(:cane).nil?
      match = match.sub('<','&lt;')
    end

    match
  end
end
parse_datetime(str,year) click to toggle source
# File lib/nhkore/article_scraper.rb, line 134
def parse_datetime(str,year)
  str = str.gsub(/[\[\][[:space:]]]+/,'') # Remove: [ ] \s
  str = "#{year}年 #{str} #{Util::JST_OFFSET}"

  return Time.strptime(str,'%Y年 %m月%d日%H時%M分 %:z')
end
parse_dicwin_id(str) click to toggle source
# File lib/nhkore/article_scraper.rb, line 141
def parse_dicwin_id(str)
  str = str.gsub(/\D+/,'')

  return nil if str.empty?
  return str
end
polish(obj) click to toggle source
# File lib/nhkore/article_scraper.rb, line 148
def polish(obj)
  return Polisher.polish_any(obj,@polishers)
end
scrape() click to toggle source
# File lib/nhkore/article_scraper.rb, line 152
def scrape
  scrape_dict
  fix_bad_html

  article = Article.new
  doc = html_doc

  article.futsuurl = scrape_futsuurl(doc)

  article.datetime = scrape_datetime(doc,article.futsuurl)
  article.sha256 = scrape_content(doc,article)
  article.title = scrape_title(doc,article)
  article.url = @url

  return article
end
scrape_and_add_words(tag,article,result: ScrapeWordsResult.new) click to toggle source
# File lib/nhkore/article_scraper.rb, line 169
def scrape_and_add_words(tag,article,result: ScrapeWordsResult.new)
  result = scrape_words(tag,result: result)
  result.polish!

  add_words(article,result.words,result.text)

  return result
end
scrape_content(doc,article) click to toggle source
# File lib/nhkore/article_scraper.rb, line 178
def scrape_content(doc,article)
  tag = doc.css('div#js-article-body')
  tag = doc.css('div.article-main__body') if tag.length < 1
  tag = doc.css('div.article-body') if tag.length < 1

  # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
  tag = doc.css('div#main') if tag.length < 1 && !@strict

  if tag.length > 0
    text = Util.unspace_web_str(tag.text.to_s)

    if !text.empty?
      hexdigest = Digest::SHA256.hexdigest(text)

      return hexdigest if article.nil? # For scrape_sha256_only()

      result = scrape_and_add_words(tag,article)

      return hexdigest if result.words?
    end
  end

  raise ScrapeError,"could not scrape content at URL[#{@url}]"
end
scrape_datetime(doc,futsuurl=nil) click to toggle source
# File lib/nhkore/article_scraper.rb, line 203
def scrape_datetime(doc,futsuurl=nil)
  year = scrape_year(doc,futsuurl)

  # First, try with the id.
  tag_name = 'p#js-article-date'
  tag = doc.css(tag_name)

  if tag.length > 0
    tag_text = tag[0].text

    begin
      datetime = parse_datetime(tag_text,year)

      return datetime
    rescue ArgumentError => e
      # Ignore; try again below.
      Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
    end
  end

  # Second, try with the class.
  tag_name = 'p.article-main__date'
  tag = doc.css(tag_name)

  if tag.length > 0
    tag_text = tag[0].text

    begin
      datetime = parse_datetime(tag_text,year)

      return datetime
    rescue ArgumentError => e
      # Ignore; try again below.
      Util.warn("could not parse date time[#{tag_text}] from tag[#{tag_name}] at URL[#{@url}]: #{e}")
    end

    return datetime
  end

  # Third, try body's id.
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  # - 'news20170331_k10010922481000'
  tag = doc.css('body')

  if tag.length > 0
    tag_id = tag[0]['id'].to_s.split('_',2)

    if tag_id.length > 0
      tag_id = tag_id[0].gsub(/[^[[:digit:]]]+/,'')

      if tag_id.length == 8
        datetime = Time.strptime(tag_id,'%Y%m%d')

        return datetime
      end
    end
  end

  # As a last resort, use our user-defined fallback (if specified).
  return @datetime unless @datetime.nil?

  raise ScrapeError,"could not scrape date time at URL[#{@url}]"
end
scrape_dict() click to toggle source
# File lib/nhkore/article_scraper.rb, line 267
def scrape_dict
  return if @dict != :scrape

  dict_url = DictScraper.parse_url(@url)
  retries = 0

  begin
    scraper = DictScraper.new(dict_url,missingno: @missingno,parse_url: false,**@kargs)
  rescue OpenURI::HTTPError => e
    if retries == 0 && e.to_s.include?('404')
      read

      scraper = ArticleScraper.new(@url,str_or_io: @str_or_io,**@kargs)

      dict_url = scraper.scrape_dict_url_only
      retries += 1

      retry
    else
      raise e.exception("could not scrape dictionary URL[#{dict_url}] at URL[#{@url}]: #{e}")
    end
  end

  @dict = scraper.scrape
end
scrape_dict_url_only() click to toggle source
# File lib/nhkore/article_scraper.rb, line 293
def scrape_dict_url_only
  doc = html_doc

  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  # - 'news20170331_k10010922481000'
  tag = doc.css('body')

  if tag.length > 0
    tag_id = tag[0]['id'].to_s.split('_',2)

    if tag_id.length == 2
      dict_url = Util.strip_web_str(tag_id[1])

      if !dict_url.empty?
        return DictScraper.parse_url(@url,basename: dict_url)
      end
    end
  end

  raise ScrapeError,"could not scrape dictionary URL at URL[#{@url}]"
end
scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new) click to toggle source
# File lib/nhkore/article_scraper.rb, line 315
def scrape_dicwin_word(tag,id,result: ScrapeWordsResult.new)
  dicwin_result = scrape_words(tag,dicwin: true)

  return nil unless dicwin_result.words?

  kana = ''.dup
  kanji = ''.dup

  dicwin_result.words.each do |word|
    kana << word.kana unless word.kana.nil?

    if kanji.empty?
      kanji << word.kanji unless word.kanji.nil?
    else
      kanji << word.word # Add trailing kana (or kanji) to kanji
    end
  end

  entry = nil
  kana = clean(kana)
  kanji = clean(kanji)

  raise ScrapeError,"empty dicWin word at URL[#{@url}] in tag[#{tag}]" if kana.empty? && kanji.empty?

  if !@dict.nil?
    entry = @dict[id]

    raise ScrapeError,"no dicWin ID[#{id}] at URL[#{@url}] in dictionary[#{@dict}]" if entry.nil?

    entry = entry.to_s
  end

  word = Word.new(
    defn: entry,
    kana: kana,
    kanji: kanji
  )

  result.add_text(dicwin_result.text) # Don't call dicwin_result.polish!()
  result.add_word(word)

  return word
end
scrape_futsuurl(doc) click to toggle source
# File lib/nhkore/article_scraper.rb, line 359
def scrape_futsuurl(doc)
  # First, try with the id.
  tag = doc.css('div#js-regular-news-wrapper')

  if tag.length > 0
    link = scrape_link(tag[0])

    return link unless link.nil?
  end

  # Second, try with the class.
  tag = doc.css('div.link-to-normal')

  if tag.length > 0
    link = scrape_link(tag[0])

    return link unless link.nil?
  end

  # Some sites don't have a futsuurl and need a lenient mode.
  # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
  warn_or_error(ScrapeError,"could not scrape futsuurl at URL[#{@url}]")

  return nil
end
scrape_ruby_word(word,result: ScrapeWordsResult.new) click to toggle source
# File lib/nhkore/article_scraper.rb, line 411
def scrape_ruby_word(word,result: ScrapeWordsResult.new)
  # No cleaning; raw text.
  # Do not add kana to the text.
  result.add_text(word.kanji)

  kanji = clean(word.kanji)
  kana = clean(word.kana)

  # Even though Word.scrape_ruby_tag() also does this,
  #   check it again after cleaning above.
  if !@missingno.nil?
    # Check kana first, since this is the typical scenario.
    # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
    # - '窓' in '(8)窓を開けて外の空気を入れましょう'
    if kana.empty?
      kana = @missingno.kana_from_kanji(kanji)
      kana = kana.nil? ? '' : clean(kana)

      if !kana.empty?
        Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
      end
    elsif kanji.empty?
      kanji = @missingno.kanji_from_kana(kana)
      kanji = kanji.nil? ? '' : clean(kanji)

      if !kanji.empty?
        Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
      end
    end
  end

  raise ScrapeError,"empty kanji at URL[#{@url}] in tag[#{tag}]" if kanji.empty?
  raise ScrapeError,"empty kana at URL[#{@url}] in tag[#{tag}]" if kana.empty?

  word = Word.new(
    kana: kana,
    kanji: kanji,
    word: word
  )

  return word
end
scrape_ruby_words(tag,result: ScrapeWordsResult.new) click to toggle source

@since 0.3.8 @see www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html

# File lib/nhkore/article_scraper.rb, line 398
def scrape_ruby_words(tag,result: ScrapeWordsResult.new)
  words = Word.scrape_ruby_tag(tag,missingno: @missingno,url: @url)
  final_words = []

  return final_words if words.nil?

  words.each do |word|
    final_words << scrape_ruby_word(word,result: result)
  end

  return final_words
end
scrape_sha256_only() click to toggle source
# File lib/nhkore/article_scraper.rb, line 454
def scrape_sha256_only
  doc = html_doc

  sha256 = scrape_content(doc,nil)

  return sha256
end
scrape_text_word(tag,result: ScrapeWordsResult.new) click to toggle source
# File lib/nhkore/article_scraper.rb, line 462
def scrape_text_word(tag,result: ScrapeWordsResult.new)
  word = Word.scrape_text_node(tag,url: @url)

  if word.nil?
    result.add_text(tag.text.to_s) # Raw spaces for output

    return nil
  end

  # Kanji only for:
  # - https://www3.nhk.or.jp/news/easy/k10012639271000/k10012639271000.html
  #   - '第3のビール'
  text = word.word # Should usually be kana only

  result.add_text(text) # No cleaning; raw text

  text = clean(text)

  return nil if text.empty? # No error; empty text is fine here

  word = Word.new(
    kana: clean(word.kana),
    kanji: clean(word.kanji),
    word: word,
  )

  return word
end
scrape_title(doc,article) click to toggle source
# File lib/nhkore/article_scraper.rb, line 491
def scrape_title(doc,article)
  tag = doc.css('h1.article-main__title')
  tag_name = nil

  if tag.length < 1
    # - https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_illust.html
    tag_name = 'h1.article-eq__title'
    tag = doc.css(tag_name)
  end

  if tag.length < 1 && !@strict
    # This shouldn't be used except for select sites.
    # - https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
    tag_name = 'div#main h2'
    tag = doc.css(tag_name)
  end

  if tag.length > 0
    Util.warn("using [#{tag_name}] for title at URL[#{@url}]") unless tag_name.nil?

    result = scrape_and_add_words(tag,article)
    title = result.text

    return title unless title.empty?
  end

  raise ScrapeError,"could not scrape title at URL[#{@url}]"
end
scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new) click to toggle source
# File lib/nhkore/article_scraper.rb, line 520
def scrape_words(tag,dicwin: false,result: ScrapeWordsResult.new)
  children = tag.children.to_a.reverse # A faster stack?

  while !children.empty?
    child = children.pop
    name = nil
    words = []

    name = Util.unspace_web_str(child.name.to_s).downcase if child.respond_to?(:name)

    if name == 'ruby'
      # Returns an array.
      words = scrape_ruby_words(child,result: result)
    elsif child.text?
      words << scrape_text_word(child,result: result)
    elsif name == 'rt'
      raise ScrapeError,"invalid rt tag[#{child}] without a ruby tag at URL[#{@url}]"
    else
      dicwin_id = nil

      if name == 'a'
        id = parse_dicwin_id(child['id'].to_s)
        klass = Util.unspace_web_str(child['class'].to_s).downcase

        if klass == 'dicwin' && !id.nil?
          if dicwin
            raise ScrapeError,"invalid dicWin class[#{child}] nested inside another dicWin class at" \
              " URL[#{@url}]"
          end

          dicwin_id = id
        end
      end

      if dicwin_id.nil?
        # I originally didn't use a stack-like Array and did a constant insert,
        #   but I think this is slower (moving all elements down every time).
        # However, if it's using C-like code for moving memory, then maybe it
        #   is faster?
        # Old code:
        #   children.insert(i + 1,*child.children.to_a())
        grand_children = child.children.to_a

        (grand_children.length - 1).downto(0).each do |i|
          children.push(grand_children[i])
        end
      else
        words << scrape_dicwin_word(child,dicwin_id,result: result)
      end
    end

    words&.each do |word|
      # All word-scraping methods can return nil.
      result.add_word(word) unless word.nil?
    end
  end

  return result
end
scrape_year(doc,futsuurl=nil) click to toggle source
# File lib/nhkore/article_scraper.rb, line 580
def scrape_year(doc,futsuurl=nil)
  # First, try body's id.
  tag = doc.css('body')

  if tag.length > 0
    tag_id = tag[0]['id'].to_s.gsub(/[^[[:digit:]]]+/,'')

    if tag_id.length >= 4
      year = tag_id[0..3].to_i

      return year if Util.sane_year?(year)
    end
  end

  # Second, try futsuurl.
  if !futsuurl.nil?
    m = futsuurl.match(/([[:digit:]]{4,})/)

    if !m.nil? && (m = m[0].to_s).length >= 4
      year = m[0..3].to_i

      return year if Util.sane_year?(year)
    end
  end

  # As a last resort, use our user-defined fallbacks (if specified).
  return @year.to_i unless @year.nil?
  return @datetime.year if !@datetime.nil? && Util.sane_year?(@datetime.year)

  raise ScrapeError,"could not scrape year at URL[#{@url}]"
end
split(str) click to toggle source
# File lib/nhkore/article_scraper.rb, line 612
def split(str)
  return @splitter.split(str)
end
variate(str) click to toggle source
# File lib/nhkore/article_scraper.rb, line 616
def variate(str)
  variations = []

  @variators.each do |variator|
    variations.push(*variator.variate(str))
  end

  return variations
end
warn_or_error(klass,msg) click to toggle source
# File lib/nhkore/article_scraper.rb, line 626
def warn_or_error(klass,msg)
  if @strict
    raise klass,msg
  else
    Util.warn(msg)
  end
end