module NHKore::CLI::NewsCmd
@author Jonathan Bradley Whited
@since 0.2.0
Constants
- DEFAULT_NEWS_SCRAPE
Public Instance Methods
build_news_cmd()
# File lib/nhkore/cli/news_cmd.rb, line 31
def build_news_cmd
  app = self

  @news_cmd = @app_cmd.define_command do
    name    'news'
    usage   'news [OPTIONS] [COMMAND]...'
    aliases :n
    summary "Scrape NHK News Web (Easy) articles (aliases: #{app.color_alias('n')})"

    description <<-DESC
      Scrape NHK News Web (Easy) articles & save to folder: #{News::DEFAULT_DIR}
    DESC

    option :d,:datetime,<<-DESC,argument: :required,transform: lambda { |value|
      date time to use as a fallback in cases when an article doesn't have one;
      format: YYYY-mm-dd H:M; example: 2020-03-30 15:30
    DESC
      value = Time.strptime(value,'%Y-%m-%d %H:%M',&DatetimeParser.method(:guess_year))
      value = Util.jst_time(value)
      value
    }
    option :i,:in,<<-DESC,argument: :required,transform: lambda { |value|
      HTML file of article to read instead of URL
      (for offline testing and/or slow internet; see '--no-dict' option)
    DESC
      app.check_empty_opt(:in,value)
    }
    flag :L,:lenient,<<-DESC
      leniently (not strict) scrape articles:
      body & title content without the proper HTML/CSS classes/IDs and no futsuurl;
      example URLs that need this flag:
      -https://www3.nhk.or.jp/news/easy/article/disaster_earthquake_02.html
      -https://www3.nhk.or.jp/news/easy/tsunamikeihou/index.html
    DESC
    option :k,:like,<<-DESC,argument: :required,transform: lambda { |value|
      text to fuzzy search links for; for example, "--like '00123'" will only scrape links
      containing text '00123' -- like '*00123*'
    DESC
      value = Util.strip_web_str(value).downcase
      value
    }
    option :l,:links,<<-DESC,argument: :required,transform: lambda { |value|
      'directory/file' of article links to scrape (see '#{App::NAME} search';
      defaults: #{SearchLinks::DEFAULT_YASASHII_FILE}, #{SearchLinks::DEFAULT_FUTSUU_FILE})
    DESC
      app.check_empty_opt(:links,value)
    }
    flag :M,:missingno,<<-DESC
      very rarely an article will not have kana or kanji for a Ruby tag;
      to not raise an error, this will use previously scraped data to fill it in;
      example URL:
      -https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
    DESC
    flag :D,:'no-dict',<<-DESC
      do not try to parse the dictionary files for the articles;
      useful in case of errors trying to load the dictionaries (or for offline testing)
    DESC
    flag :H,'no-sha256',<<-DESC
      do not check the SHA-256 of the content to see if an article has already been scraped;
      for example, 2 URLs with the same content, but 1 with 'http' & 1 with 'https',
      will both be scraped; this is useful if 2 articles have the same SHA-256,
      but different content (unlikely)
    DESC
    option :o,:out,<<-DESC,argument: :required,transform: lambda { |value|
      'directory/file' to save words to; if you only specify a directory or a file,
      it will attach the appropriate default directory/file name
      (defaults: #{YasashiiNews::DEFAULT_FILE}, #{FutsuuNews::DEFAULT_FILE})
    DESC
      app.check_empty_opt(:out,value)
    }
    flag :r,:redo,'scrape article links even if they have already been scraped'
    option :s,:scrape,'number of unscraped article links to scrape',argument: :required,
      default: DEFAULT_NEWS_SCRAPE,transform: lambda { |value|
      value = value.to_i
      value = 1 if value < 1
      value
    }
    option nil,:'show-dict',<<-DESC
      show dictionary URL and contents for the first article and exit;
      useful for debugging dictionary errors (see '--no-dict' option);
      implies '--dry-run' option
    DESC
    option :u,:url,<<-DESC,argument: :required,transform: lambda { |value|
      URL of article to scrape, instead of article links file (see '--links' option)
    DESC
      app.check_empty_opt(:url,value)
    }

    run do |opts,args,cmd|
      puts cmd.help
    end
  end

  @news_easy_cmd = @news_cmd.define_command do
    name    'easy'
    usage   'easy [OPTIONS] [COMMAND]...'
    aliases :e,:ez
    summary "Scrape NHK News Web Easy (Yasashii) articles (aliases: #{app.color_alias('e ez')})"

    description <<-DESC
      Search for NHK News Web Easy (Yasashii) links & save to file: #{YasashiiNews::DEFAULT_FILE}
    DESC

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_news_cmd(:yasashii)
    end
  end

  @news_regular_cmd = @news_cmd.define_command do
    name    'regular'
    usage   'regular [OPTIONS] [COMMAND]...'
    aliases :r,:reg
    summary "Scrape NHK News Web Regular (Futsuu) articles (aliases: #{app.color_alias('r reg')})"

    description <<-DESC
      Search for NHK News Web Regular (Futsuu) links & save to file: #{FutsuuNews::DEFAULT_FILE}
    DESC

    run do |opts,args,cmd|
      app.refresh_cmd(opts,args,cmd)
      app.run_news_cmd(:futsuu)
    end
  end
end
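The block-based DSL above (define_command, option with transform:, flag, run) appears to match the cri gem's command interface. As a rough, self-contained sketch of that pattern, assuming a cri dependency (the command name and output below are illustrative only, not part of NHKore):

require 'cri'

demo = Cri::Command.define do
  name    'demo'
  usage   'demo [OPTIONS]'
  summary 'illustrates an option with a transform lambda'

  option :s, :scrape, 'number of links to scrape', argument: :required,
    transform: lambda { |value|
      value = value.to_i
      value = 1 if value < 1 # clamp to at least 1, like the :scrape option above
      value
    }

  run do |opts, _args, _cmd|
    puts "scrape=#{opts[:scrape].inspect}"
  end
end

demo.run(%w[--scrape 0]) # prints "scrape=1" because of the transform

The transform lambda runs at parse time, so the run block only ever sees a sanitized value, which is the same design the :datetime, :in, :like, :links, :out, :scrape, and :url options rely on.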
run_news_cmd(type)
# File lib/nhkore/cli/news_cmd.rb, line 159
def run_news_cmd(type)
  @cmd_opts[:dry_run] = true if @cmd_opts[:show_dict]

  news_name = nil

  build_in_file(:in)

  case type
  when :futsuu
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_FUTSUU_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: FutsuuNews::DEFAULT_FILENAME)

    news_name = 'Regular'
  when :yasashii
    build_in_file(:links,default_dir: SearchLinks::DEFAULT_DIR,
      default_filename: SearchLinks::DEFAULT_YASASHII_FILENAME)
    build_out_file(:out,default_dir: News::DEFAULT_DIR,default_filename: YasashiiNews::DEFAULT_FILENAME)

    news_name = 'Easy'
  else
    raise ArgumentError,"invalid type[#{type}]"
  end

  return unless check_in_file(:in,empty_ok: true)
  return unless check_out_file(:out)

  datetime = @cmd_opts[:datetime]
  dict = @cmd_opts[:no_dict] ? nil : :scrape
  dry_run = @cmd_opts[:dry_run]
  in_file = @cmd_opts[:in]
  lenient = @cmd_opts[:lenient]
  like = @cmd_opts[:like]
  links_file = @cmd_opts[:links]
  max_scrapes = @cmd_opts[:scrape]
  max_scrapes = DEFAULT_NEWS_SCRAPE if max_scrapes.nil?
  missingno = @cmd_opts[:missingno]
  no_sha256 = @cmd_opts[:no_sha256]
  out_file = @cmd_opts[:out]
  redo_scrapes = @cmd_opts[:redo]
  show_dict = @cmd_opts[:show_dict]

  # Favor in_file option over url option.
  url = in_file.nil? ? Util.strip_web_str(@cmd_opts[:url].to_s) : in_file
  url = nil if url.empty?

  if url.nil?
    # Then we must have a links file that exists.
    return unless check_in_file(:links,empty_ok: false)
  end

  start_spin("Scraping NHK News Web #{news_name} articles")

  is_file = !in_file.nil?
  link_count = -1
  links = File.exist?(links_file) ? SearchLinks.load_file(links_file) : SearchLinks.new
  new_articles = [] # For --dry-run
  news = nil
  scrape_count = 0

  if File.exist?(out_file)
    news = (type == :yasashii) ?
      YasashiiNews.load_file(out_file,overwrite: no_sha256) :
      FutsuuNews.load_file(out_file,overwrite: no_sha256)
  else
    news = (type == :yasashii) ? YasashiiNews.new : FutsuuNews.new
  end

  @news_article_scraper_kargs = @scraper_kargs.merge({
    datetime: datetime,
    dict: dict,
    is_file: is_file,
    missingno: missingno ? Missingno.new(news) : nil,
    strict: !lenient,
  })
  @news_dict_scraper_kargs = @scraper_kargs.merge({
    is_file: is_file,
  })

  if url.nil?
    # Why store each() and do `links_len` instead of `links_len - 1`?
    #
    # If links contains 5 entries and you scrape all 5, then the output of
    # update_spin_detail() will end on 4, so all of this complexity is so
    # that update_spin_detail() only needs to be written/updated on one line.
    links_each = links.links.values.each
    links_len = links.length

    0.upto(links_len) do |i|
      update_spin_detail(" (scraped=#{scrape_count}, considered=#{link_count += 1})")

      break if i >= links_len || scrape_count >= max_scrapes

      link = links_each.next

      next if !like.nil? && !link.url.to_s.downcase.include?(like)
      next if !redo_scrapes && scraped_news_article?(news,link)

      url = link.url

      if (new_url = scrape_news_article(url,link: link,new_articles: new_articles,news: news))
        # --show-dict
        url = new_url
        scrape_count = max_scrapes - 1 # Break on next iteration for update_spin_detail()
      end

      # Break on next iteration for update_spin_detail().
      next if (scrape_count += 1) >= max_scrapes

      sleep_scraper
    end
  else
    link = links[url]

    if link.nil?
      link = SearchLink.new(url)
      links.add_link(link)
    end

    scrape_news_article(url,link: link,new_articles: new_articles,news: news)

    scrape_count += 1
  end

  stop_spin
  puts

  if scrape_count <= 0
    puts 'Nothing scraped!'

    if !dry_run && !show_dict
      puts

      start_spin('Saving updated links to file')
      links.save_file(links_file)
      stop_spin

      puts "> #{links_file}"
    end
  else
    puts 'Last URL scraped:'
    puts "> #{url}"
    puts

    if show_dict
      puts @cmd_opts[:show_dict] # Updated in scrape_news_article()
    elsif dry_run
      if new_articles.length < 1
        raise CLIError,"scrape_count[#{scrape_count}] != new_articles[#{new_articles.length}];" \
          ' internal code is broken'
      elsif new_articles.length == 1
        puts new_articles.first
      else
        # Don't show the words (mini), too verbose for more than 1.
        new_articles.each do |article|
          puts article.to_s(mini: true)
        end
      end
    else
      start_spin('Saving scraped data to files')

      links.save_file(links_file)
      news.save_file(out_file)

      stop_spin

      puts "> #{out_file}"
      puts "> #{links_file}"
    end
  end
end
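The counting in the url.nil? branch is the subtlest part of this method: the loop runs 0.upto(links_len), one pass more than there are links, purely so the spinner detail is printed one last time with the final totals before breaking. A minimal standalone sketch of that pattern, with plain puts standing in for update_spin_detail() and no real scraping:

links = %w[link-a link-b link-c]
max_scrapes = 2
scrape_count = 0
considered = -1
links_each = links.each

0.upto(links.length) do |i|
  puts "(scraped=#{scrape_count}, considered=#{considered += 1})"
  break if i >= links.length || scrape_count >= max_scrapes

  link = links_each.next
  # ...scrape `link` here...
  scrape_count += 1
end
# Last line printed: "(scraped=2, considered=2)" -- the totals after the final scrape.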
scrape_news_article(url,link:,new_articles:,news:)
# File lib/nhkore/cli/news_cmd.rb, line 330
def scrape_news_article(url,link:,new_articles:,news:)
  show_dict = @cmd_opts[:show_dict]

  if show_dict
    scraper = DictScraper.new(url,**@news_dict_scraper_kargs)

    @cmd_opts[:show_dict] = scraper.scrape.to_s

    return scraper.url
  end

  scraper = ArticleScraper.new(url,**@news_article_scraper_kargs)
  article = scraper.scrape

  # run_news_cmd() handles overwriting with --redo or not
  # using scraped_news_article?().
  news.add_article(article,overwrite: true)

  news.update_article(article,link.url) # Favors https
  link.update_from_article(article)

  new_articles << article

  return false # No --show-dict
end
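For reference, the scraper calls above can also be made outside the CLI. A hedged sketch, assuming NHKore's top-level ArticleScraper class and the keyword arguments shown in @news_article_scraper_kargs (the URL below is only the example URL from the '--missingno' help text above):

require 'nhkore'

url = 'https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html'

scraper = NHKore::ArticleScraper.new(url, dict: :scrape, strict: true)
article = scraper.scrape

puts article.to_s(mini: true) # one-line form, as used by --dry-run in run_news_cmd()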
scraped_news_article?(news,link)
# File lib/nhkore/cli/news_cmd.rb, line 356
def scraped_news_article?(news,link)
  return true if link.scraped?

  no_sha256 = @cmd_opts[:no_sha256]

  article = news.article(link.url)

  if !no_sha256 && article.nil?
    if !Util.empty_web_str?(link.sha256) && news.sha256?(link.sha256)
      article = news.article_with_sha256(link.sha256)
    end

    if article.nil?
      scraper = ArticleScraper.new(link.url,**@news_article_scraper_kargs)
      sha256 = scraper.scrape_sha256_only

      article = news.article_with_sha256(sha256) if news.sha256?(sha256)
    end
  end

  if article
    news.update_article(article,link.url) # Favors https
    link.update_from_article(article)

    return true
  end

  return false
end
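Putting the checks together: a link counts as already scraped if it is flagged as scraped, if its URL is already in news, or (unless '--no-sha256' was given) if its content hash matches an existing article. An illustrative sketch of the http/https case mentioned in the '--no-sha256' help, written as if called from within the CLI app context and using the same methods as above; the URL is hypothetical:

# Assume an https version of this article was scraped earlier and is stored in `news`.
http_link = SearchLink.new('http://www3.nhk.or.jp/news/easy/k10000000000001/k10000000000001.html')

if scraped_news_article?(news, http_link)
  # Reached because the SHA-256 of the http page matches the stored https article:
  # news.article_with_sha256() found it, and news.update_article() plus
  # link.update_from_article() recorded the match, so the link is skipped
  # instead of being scraped a second time.
end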