class NHKore::Word
@author Jonathan Bradley Whited @since 0.1.0
Attributes
defn[RW]
eng[RW]
freq[RW]
kana[R]
kanji[R]
key[R]
Public Class Methods
load_data(key,hash)
click to toggle source
# File lib/nhkore/word.rb, line 85
##
# Rebuilds a Word from one entry of previously stored data.
#
# @param key [#to_s] the key the entry was stored under (may be a Symbol)
# @param hash [Hash] the stored fields, read via +:defn+, +:eng+, +:kana+,
#   +:kanji+, and +:freq+
# @return [Word] the rebuilt word
# @raise [ArgumentError] if +key+ differs from the key generated by the new word
def self.load_data(key,hash)
  stored_key = key.to_s # The key may arrive as a Symbol.

  loaded = Word.new(
    defn: hash[:defn],
    eng: hash[:eng],
    kana: hash[:kana],
    kanji: hash[:kanji]
  )

  unless stored_key == loaded.key
    raise ArgumentError,"the key from the hash[#{stored_key}] does not match the generated key[#{loaded.key}]"
  end

  # A missing freq becomes 0 (nil.to_i), in which case the word keeps
  # the freq it was constructed with.
  stored_freq = hash[:freq].to_i
  loaded.freq = stored_freq if stored_freq > 0

  loaded
end
new(defn: nil,eng: nil,freq: 1,kana: nil,kanji: nil,unknown: nil,word: nil,**kargs)
click to toggle source
Calls superclass method
# File lib/nhkore/word.rb, line 31
##
# Builds a word from kanji and/or kana, optionally using another word as
# a template for any field that is passed as nil.
#
# @param defn [Object,nil] the definition; stored as-is
# @param eng [Object,nil] the English text; stored as-is
# @param freq [Integer,nil] the frequency, at least 1; nil means "take it
#   from +word+", or 1 if that is also unavailable
# @param kana [String,nil] the kana reading
# @param kanji [String,nil] the kanji spelling
# @param unknown [String,nil] text of unknown type; it is stored as kanji if
#   +Util.kanji?+ says it contains kanji, else as kana
# @param word [Word,nil] a template whose fields fill in nil arguments
# @param kargs [Hash] extra keyword arguments; accepted and ignored here
# @raise [ArgumentError] if +freq+ is < 1, if +unknown+ would overwrite a
#   non-empty kanji/kana, or if both kanji and kana end up empty
def initialize(defn: nil,eng: nil,freq: 1,kana: nil,kanji: nil,unknown: nil,word: nil,**kargs)
  super()

  if !word.nil?
    defn = word.defn if defn.nil?
    eng = word.eng if eng.nil?
    freq = word.freq if freq.nil?
    kana = word.kana if kana.nil?
    kanji = word.kanji if kanji.nil?
  end

  # An explicit +freq: nil+ that could not be filled in from +word+ would
  # otherwise crash with NoMethodError on the comparison below, so fall
  # back to the same default as the signature.
  freq = 1 if freq.nil?

  raise ArgumentError,"freq[#{freq}] cannot be < 1" if freq < 1

  if !unknown.nil?
    # kanji?() only tests if it contains kanji, so don't use kana?().
    if Util.kanji?(unknown)
      if !Util.empty_web_str?(kanji)
        raise ArgumentError,"unknown[#{unknown}] will overwrite kanji[#{kanji}]"
      end

      kanji = unknown
    else
      if !Util.empty_web_str?(kana)
        raise ArgumentError,"unknown[#{unknown}] will overwrite kana[#{kana}]"
      end

      kana = unknown
    end
  end

  # Normalize "empty" values (as judged by Util.empty_web_str?) to nil so
  # that the key below is built consistently.
  kana = nil if Util.empty_web_str?(kana)
  kanji = nil if Util.empty_web_str?(kanji)

  raise ArgumentError,'kanji and kana cannot both be empty' if kana.nil? && kanji.nil?

  @defn = defn
  @eng = eng
  @freq = freq
  @kana = kana
  @kanji = kanji
  @key = "#{kanji}=#{kana}" # nil.to_s() is ''
end
scrape_ruby_tag(tag,missingno: nil,url: nil)
click to toggle source
Do not clean and/or strip spaces, as the raw text is important for Defn and ArticleScraper.
This originally scraped only 1 word, but support for multiple words was added after seeing this link for 産業能率大学, which is valid HTML:
https://www3.nhk.or.jp/news/easy/k10012759201000/k10012759201000.html
@return [Array<Word>] the scraped {Word}(s)
# File lib/nhkore/word.rb, line 113
##
# Scrapes the kanji/kana pair(s) out of a <ruby> tag.
#
# The raw text is used as-is (no cleaning or stripping).
#
# @param tag [Object] the parsed tag; must respond to +css+, +search+,
#   and +to_s+ (presumably a Nokogiri node -- confirm against callers)
# @param missingno [Object,nil] optional fallback used to look up a missing
#   kana from its kanji, or a missing kanji from its kana
# @param url [String,nil] only used in error messages
# @return [Array<Word>] the scraped {Word}(s)
# @raise [ScrapeError] if no kanji or kana are found, if their counts
#   differ, or if any kanji/kana is still empty after the fallback
def self.scrape_ruby_tag(tag,missingno: nil,url: nil)
  # Kanji sources, tried in order until one yields something:
  #   1. <rb> tags
  #   2. text nodes
  #   3. non-<rt> tags, in case of being surrounded by <span>, <b>, etc.
  kanji_nodes = tag.css('rb')
  kanji_nodes = tag.search('./text()') if kanji_nodes.length < 1
  kanji_nodes = tag.search("./*[not(name()='rt')]") if kanji_nodes.length < 1

  kana_nodes = tag.css('rt')

  raise ScrapeError,"no kanji at URL[#{url}] in tag[#{tag}]" if kanji_nodes.length < 1
  raise ScrapeError,"no kana at URL[#{url}] in tag[#{tag}]" if kana_nodes.length < 1

  unless kanji_nodes.length == kana_nodes.length
    raise ScrapeError,"number of kanji & kana mismatch at URL[#{url}] in tag[#{tag}]"
  end

  scraped = []

  kanji_nodes.length.times do |index|
    kanji = kanji_nodes[index].text
    kana = kana_nodes[index].text

    # Uncomment for debugging; really need a logger.
    #puts "Word[#{index}]: #{kanji} => #{kana}"

    unless missingno.nil?
      # Check kana first, since this is the typical scenario.
      # - https://www3.nhk.or.jp/news/easy/k10012331311000/k10012331311000.html
      # - '窓' in '(8)窓を開けて外の空気を入れましょう'
      if Util.empty_web_str?(kana)
        kana = missingno.kana_from_kanji(kanji)

        unless Util.empty_web_str?(kana)
          Util.warn("using missingno for kana[#{kana}] from kanji[#{kanji}]")
        end
      elsif Util.empty_web_str?(kanji)
        kanji = missingno.kanji_from_kana(kana)

        unless Util.empty_web_str?(kanji)
          Util.warn("using missingno for kanji[#{kanji}] from kana[#{kana}]")
        end
      end
    end

    raise ScrapeError,"empty kanji at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kanji)
    raise ScrapeError,"empty kana at URL[#{url}] in tag[#{tag}]" if Util.empty_web_str?(kana)

    scraped << Word.new(kanji: kanji,kana: kana)
  end

  scraped
end
scrape_text_node(tag,url: nil)
click to toggle source
Do not clean and/or strip spaces, as the raw text is important for Defn and ArticleScraper.
# File lib/nhkore/word.rb, line 169
##
# Turns the text of a tag into a Word of unknown type (kanji or kana).
#
# The raw text is used as-is (no cleaning or stripping).
#
# @param tag [#text] the tag to read the text from
# @param url [String,nil] not used by this method
# @return [Word,nil] the word, or nil if the text is empty according to
#   +Util.empty_web_str?+
def self.scrape_text_node(tag,url: nil)
  raw_text = tag.text

  # Empty text is not an error here (not strictly kanji/kana only),
  # so it simply produces no word.
  Util.empty_web_str?(raw_text) ? nil : Word.new(unknown: raw_text)
end
Public Instance Methods
encode_with(coder)
click to toggle source
# File lib/nhkore/word.rb, line 74
##
# Writes this word's fields into +coder+ for serialization.
#
# +@key+ is deliberately left out, because it will be the key in the
# YAML/Hash that holds this word.
#
# @param coder [#[]=] the object that receives the fields
def encode_with(coder)
  # Order matters; a Hash literal keeps its insertion order.
  ordered_fields = {
    kanji: @kanji,
    kana: @kana,
    freq: @freq,
    defn: @defn,
    eng: @eng,
  }

  ordered_fields.each_pair { |field,value| coder[field] = value }

  # The method's value is the last value stored.
  @eng
end
kanji?()
click to toggle source
# File lib/nhkore/word.rb, line 180
##
# @return [true,false] true if this word's kanji is not empty according to
#   +Util.empty_web_str?+
def kanji?
  kanji_missing = Util.empty_web_str?(@kanji)

  !kanji_missing
end
to_s()
click to toggle source
# File lib/nhkore/word.rb, line 188
##
# Formats the key and all fields on a single line.
#
# Newlines inside the definition are shown as a literal backslash + 'n'
# so that the result never spans multiple lines because of it.
#
# @return [String] a new, unfrozen string
def to_s
  one_line_defn = @defn.to_s.gsub("\n",'\\n')

  fields = [
    "kanji=>'#{@kanji}'",
    "kana=>'#{@kana}'",
    "freq=>#{@freq}",
    "defn=>'#{one_line_defn}'",
    "eng=>'#{@eng}'",
  ]

  # String#+ always builds a fresh (mutable) string.
  "'#{@key}': { " + fields.join(', ') + ' }'
end
word()
click to toggle source
# File lib/nhkore/word.rb, line 184
##
# @return [String,nil] the kanji if {#kanji?} is true, else the kana
def word
  if kanji?
    @kanji
  else
    @kana
  end
end