module MovieCrawler

Constants

ATMOVIES_MAIN_URL
MOVIE_BASE_URL
REFLECTION_BASE
REFLECTION_CLASS
REFLECTION_FS
REFLECTION_NAME
REFLECTION_SAID
REFLECTION_SATITLE
SEARCH_URL
TRAILER_URL
URL_LIST

get the info from atmovies

VERSION
WHOLE_MOVIEWS_CODES
WHOLE_MOVIEWS_DATES
WHOLE_MOVIEWS_STORIES
WHOLE_MOVIEWS_TITLES

Public Class Methods

dvd_rank() click to toggle source
# File lib/movie_crawler/crawler.rb, line 94
# Returns the YAML text for the rank table that get_table builds
# when asked for rank id '3'.
def self.dvd_rank
  to_yaml(get_table('3'))
end
encode_zh(text) click to toggle source
# File lib/movie_crawler/crawler.rb, line 136
# Converts +text+ from UTF-8 to Big5, URI-encodes the converted bytes and
# appends the result to REFLECTION_SATITLE.
#
# NOTE(review): Iconv and URI.encode are not available in current Ruby
# releases - confirm the Ruby version this library targets.
#
# @param text [String] UTF-8 text to encode
# @return [String] REFLECTION_SATITLE followed by the encoded text
def self.encode_zh(text)
  converter = Iconv.new('big5', 'utf-8')
  query = URI.encode(converter.iconv(text)).to_s
  REFLECTION_SATITLE + query
end
get_codes(doc) click to toggle source

get the code of movies

# File lib/movie_crawler/crawler.rb, line 191
# Collects one code per node matched by WHOLE_MOVIEWS_CODES: the third
# '/'-separated segment (index 2) of each node's value.
#
# @param doc [#xpath] parsed document
# @return [Array] one entry per matched node (nil when the value has
#   fewer than three segments)
def self.get_codes(doc)
  doc.xpath(WHOLE_MOVIEWS_CODES).map do |attribute|
    segments = attribute.value.split('/')
    segments[2]
  end
end
get_dates(doc) click to toggle source

get the release date

# File lib/movie_crawler/crawler.rb, line 179
# Extracts, for every pair returned by split_day_and_time, the first
# "digits/digits/digits" sequence found in the second element.
#
# @param doc [#xpath] parsed document
# @return [Array<String>] matched date text, or '' where nothing matched
def self.get_dates(doc)
  split_day_and_time(doc).map do |_runtime_text, release_text|
    release_text[%r{\d+/\d+/\d+}].to_s
  end
end
get_movie_info(movie_name) click to toggle source

combine the workflow for user to call for movie info

# File lib/movie_crawler/crawler.rb, line 76
# Runs the whole lookup for one film: obtains the search form from
# start_mechan, follows it to the film page with trace_page, parses that
# page with parse_movie_info and serializes the result with to_yaml.
#
# @param movie_name [String] search term
# @return [String] YAML text
def self.get_movie_info(movie_name)
  film_page = trace_page(start_mechan, movie_name)
  to_yaml(parse_movie_info(film_page))
end
get_one_movie_name(doc) click to toggle source
# File lib/movie_crawler/crawler.rb, line 141
# Returns the text of the node(s) matched by REFLECTION_NAME with every
# tab, carriage return and newline removed.
#
# The non-destructive String#delete is used on purpose: gsub! returns nil
# when it performs no substitution, which made this method return nil for
# a name that was already clean.
#
# @param doc [#xpath] parsed document
# @return [String] the cleaned name
def self.get_one_movie_name(doc)
  doc.xpath(REFLECTION_NAME).text.delete("\t\r\n")
end
get_reflection(doc) click to toggle source
# File lib/movie_crawler/crawler.rb, line 146
# Returns the whitespace-separated words of the text matched by
# REFLECTION_CLASS, after removing every tab, carriage return and newline.
#
# The non-destructive gsub is required here: gsub! returns nil when the
# text contains none of those characters, and the chained split then
# raised NoMethodError.
#
# @param doc [#xpath] parsed document
# @return [Array<String>] the words of the matched text
def self.get_reflection(doc)
  doc.xpath(REFLECTION_CLASS).text.gsub(/[\t\r\n]/, '').split
end
get_runtime(doc) click to toggle source

get the runtime of movie

# File lib/movie_crawler/crawler.rb, line 173
# Extracts, for every pair returned by split_day_and_time, the first run
# of digits found in the first element.
#
# @param doc [#xpath] parsed document
# @return [Array<String>] digit strings, or '' where nothing matched
def self.get_runtime(doc)
  split_day_and_time(doc).map do |runtime_text, _release_text|
    runtime_text[/\d+/].to_s
  end
end
get_stories(doc) click to toggle source

get the storyline of movie

# File lib/movie_crawler/crawler.rb, line 167
# Returns the text of every node matched by WHOLE_MOVIEWS_STORIES.
#
# @param doc [#xpath] parsed document
# @return [Array<String>] one text per matched node
def self.get_stories(doc)
  doc.xpath(WHOLE_MOVIEWS_STORIES).map { |story_node| story_node.text }
end
get_table(rankid) click to toggle source

parse the ranktable info

# File lib/movie_crawler/crawler.rb, line 100
# Reads one rank table from the page at ATMOVIES_MAIN_URL and hands its
# entries to rankmix.
#
# The table text is taken from the rankid-th <div> under the element whose
# id is 'ranklist'. All spaces are removed, the text is split on the
# remaining whitespace, and the last entry is dropped before ranking.
#
# Changes from the previous version:
# * rankid is interpolated, so an Integer works as well as a String.
# * the `each { gsub }` pass was removed: it discarded its result, and
#   split already leaves no tab/CR/LF inside the entries.
# * the ' : ' => ':' substitution was removed: deleting every space
#   afterwards yields the same text.
#
# @param rankid [String, Integer] 1-based position of the table
# @return [Object] whatever rankmix returns for the entries
def self.get_table(rankid)
  doc = open_html(ATMOVIES_MAIN_URL)
  text = doc.xpath("//*[@id = 'ranklist']/div[#{rankid}]").text
  entries = text.delete(' ').split
  entries.pop # the final entry is not part of the ranking
  rankmix(entries)
end
get_titles(doc) click to toggle source

get the movie name

# File lib/movie_crawler/crawler.rb, line 161
# Returns the text of every node matched by WHOLE_MOVIEWS_TITLES with
# tabs, newlines and carriage returns removed.
#
# @param doc [#xpath] parsed document
# @return [Array<String>] cleaned titles
def self.get_titles(doc)
  doc.xpath(WHOLE_MOVIEWS_TITLES).map do |title_node|
    raw_title = title_node.text
    raw_title.gsub(/[\t\n\r]/, '')
  end
end
get_trailer(doc) click to toggle source

get the trailer link of the movies

# File lib/movie_crawler/crawler.rb, line 197
# Builds one trailer link per code returned by get_codes by prefixing the
# code with TRAILER_URL.
#
# @param doc [#xpath] parsed document
# @return [Array<String>] trailer links
def self.get_trailer(doc)
  get_codes(doc).map { |movie_code| TRAILER_URL + movie_code }
end
mix(t, s, d, ti, tr) click to toggle source

build the hash for yaml output

# File lib/movie_crawler/crawler.rb, line 203
# Combines five parallel lists into one hash per entry of +t+.
# Lists shorter than +t+ contribute nil for the missing positions.
#
# @param t  [Array] titles
# @param s  [Array] stories
# @param d  [Array] dates
# @param ti [Array] runtimes
# @param tr [Array] trailer links
# @return [Array<Hash>] hashes keyed 'title', 'story', 'date',
#   'runtime(minutes)' and 'trailer'
def self.mix(t, s, d, ti, tr)
  (0...t.size).map do |row|
    {
      'title' => t[row],
      'story' => s[row],
      'date' => d[row],
      'runtime(minutes)' => ti[row],
      'trailer' => tr[row]
    }
  end
end
movie_details(code) click to toggle source

get the details of movie

# File lib/movie_crawler/crawler.rb, line 151
# Opens the page whose address is MOVIE_BASE_URL, then +code+, then a
# trailing slash.
#
# @param code [String] movie code
# @return [Object] whatever open_html returns for that address
def self.movie_details(code)
  page_url = MOVIE_BASE_URL + code + '/'
  open_html(page_url)
end
movies(category = 'LATEST') click to toggle source

list the movies for the given category (default 'LATEST'), serialized as YAML

# File lib/movie_crawler/crawler.rb, line 119
# Returns the YAML text for the movie list that movies_parser builds for
# +category+.
#
# @param category [String] key into the category list (default 'LATEST')
# @return [String] YAML text
def self.movies(category = 'LATEST')
  to_yaml(movies_parser(category))
end
movies_parser(category) click to toggle source

parse the movie list for the given category

# File lib/movie_crawler/crawler.rb, line 125
# Opens the page registered in URL_LIST under the upper-cased +category+
# and combines its titles, stories, dates, runtimes and trailer links
# through mix.
#
# @param category [String] category name; matched case-insensitively
#   because it is upper-cased before the lookup
# @return [Object] whatever mix returns
def self.movies_parser(category)
  page = open_html(URL_LIST[category.upcase])

  # Extract each column from the same page, in the original order.
  title_list = get_titles(page)
  story_list = get_stories(page)
  date_list = get_dates(page)
  trailer_list = get_trailer(page)
  runtime_list = get_runtime(page)

  # mix expects runtimes before trailers.
  mix(title_list, story_list, date_list, runtime_list, trailer_list)
end
open_html(url) click to toggle source

open the destination url

# File lib/movie_crawler/crawler.rb, line 156
# Opens +url+ and parses what it yields with Nokogiri::HTML.
#
# The block form of open is used so the underlying handle is closed as
# soon as parsing finishes instead of being left for the garbage
# collector.
#
# NOTE(review): this relies on open-uri extending Kernel#open to accept
# URLs; recent Ruby versions require URI.open for that - confirm the
# targeted Ruby version.
#
# @param url [String] address of the page to load
# @return [Object] the document returned by Nokogiri::HTML
def self.open_html(url)
  open(url) { |io| Nokogiri::HTML(io) }
end
parse_comment(film_page) click to toggle source
# File lib/movie_crawler/crawler.rb, line 47
# Splits the text of the '.comment_data' element into pieces.
#
# Every run of tabs, carriage returns and newlines is collapsed into one
# tab, the text is stripped, and the result is split on tabs.
#
# @param film_page [#at] page to read from
# @return [Array<String>, String] the pieces, or the string 'not exised'
#   when any StandardError is raised while reading (for example when
#   +at+ yields nil)
def self.parse_comment(film_page)
  raw_comment = film_page.at('.comment_data').text
  raw_comment.gsub(/[\t\r\n]+/, "\t").strip.split("\t")
rescue StandardError
  'not exised'
end
parse_crew(film_page) click to toggle source
# File lib/movie_crawler/crawler.rb, line 57
# Splits the text of the '.crew_row' element into trimmed, non-empty
# entries.
#
# Every run of tabs, carriage returns and newlines is collapsed into one
# tab and the text is split on tabs. The previous pattern /[\t+]/ was a
# character class, so it also split entries on a literal '+' character;
# /\t+/ (one or more tabs) is what was intended.
#
# @param film_page [#at] page to read from
# @return [Array<String>] crew entries without surrounding whitespace
# @raise [NoMethodError] when the page has no '.crew_row' element and
#   +at+ returns nil
def self.parse_crew(film_page)
  crew_text = film_page.at('.crew_row').text
  crew_text.gsub(/[\t\r\n]+/, "\t")
           .strip
           .split(/\t+/)
           .map(&:strip)
           .reject(&:empty?)
end
parse_movie_info(film_page) click to toggle source

parse the film info in the target web page

# File lib/movie_crawler/crawler.rb, line 64
# Gathers the fields of one film page into a hash.
#
# The name, schedule and content are the stripped text of the elements
# selected by '.name .at21b', '#movie_info01 b' and '.sub_content'; the
# crew and comment come from parse_crew and parse_comment.
#
# @param film_page [#at] page to read from
# @return [Hash] keys 'name', 'schedule', 'crew_info', 'comment', 'content'
def self.parse_movie_info(film_page)
  stripped_text = ->(selector) { film_page.at(selector).text.strip }

  title = stripped_text.call('.name .at21b')
  showtimes = stripped_text.call('#movie_info01 b')
  synopsis = stripped_text.call('.sub_content')
  crew = parse_crew(film_page)
  comment = parse_comment(film_page)

  {
    'name' => title,
    'schedule' => showtimes,
    'crew_info' => crew,
    'comment' => comment,
    'content' => synopsis
  }
end
rankmix(t) click to toggle source

mix the rank info

# File lib/movie_crawler/crawler.rb, line 110
# Turns a list into one single-pair hash per entry, keyed by the entry's
# 1-based position and holding the entry converted with to_s.
#
# @param t [Array] entries in rank order
# @return [Array<Hash>] e.g. [{ 1 => 'a' }, { 2 => 'b' }]
def self.rankmix(t)
  t.each_with_index.map do |entry, position|
    { position + 1 => entry.to_s }
  end
end
split_day_and_time(doc) click to toggle source
# File lib/movie_crawler/crawler.rb, line 184
# Splits the text of every node matched by WHOLE_MOVIEWS_DATES on a fixed
# separator: newline, four tabs, four spaces, one tab.
#
# @param doc [#xpath] parsed document
# @return [Array<Array<String>>] the pieces of each node's text
def self.split_day_and_time(doc)
  separator = "\n\t\t\t\t\s\s\s\s\t"
  doc.xpath(WHOLE_MOVIEWS_DATES).map do |date_node|
    date_node.text.split(separator)
  end
end
start_mechan() click to toggle source

start a Mechanize agent for crawling from web to web

# File lib/movie_crawler/crawler.rb, line 31
# Creates a Mechanize agent using the 'Mac Safari' user-agent alias, posts
# to SEARCH_URL and returns the form whose action is 'search.cfm' from
# the page that comes back.
#
# NOTE(review): SEARCH_URL is posted twice and the first response is
# discarded. The duplicate is kept as-is because it may be intentional
# (e.g. session setup) - confirm before removing.
#
# @return [Object] the result of form_with on the second response
def self.start_mechan
  browser = Mechanize.new
  browser.user_agent_alias = 'Mac Safari'
  browser.post(SEARCH_URL)
  browser.post(SEARCH_URL).form_with(action: 'search.cfm')
end
taipei_weekend() click to toggle source
# File lib/movie_crawler/crawler.rb, line 89
# Returns the YAML text for the rank table that get_table builds
# when asked for rank id '2'.
def self.taipei_weekend
  ranking = get_table('2')
  to_yaml(ranking)
end
to_yaml(mix) click to toggle source

convert the schedules to yaml format

# File lib/movie_crawler/crawler.rb, line 213
# Serializes +mix+ to YAML text.
#
# @param mix [Object] data to serialize (the callers in this file pass
#   arrays and hashes)
# @return [String] YAML document
def self.to_yaml(mix)
  YAML.dump(mix)
end
trace_page(form, movie_name) click to toggle source

finish a post request to search for specific film

# File lib/movie_crawler/crawler.rb, line 40
# Submits the search form for +movie_name+ and follows the first result
# link whose href matches /F&d=/.
#
# @param form [Object] search form; its search_term is overwritten
# @param movie_name [String] term to search for
# @return [Object] whatever clicking the matched link returns
# @raise [NoMethodError] when no link matches and link_with returns nil
def self.trace_page(form, movie_name)
  form.search_term = movie_name
  search_button = form.button_with(name: 'search')
  results_page = form.submit(search_button)
  results_page.link_with(href: /F&d=/).click
end
us_weekend() click to toggle source

one of the three rank-table parsers (us_weekend, taipei_weekend, dvd_rank)

# File lib/movie_crawler/crawler.rb, line 84
# Returns the YAML text for the rank table that get_table builds
# when asked for rank id '1'.
def self.us_weekend
  to_yaml(get_table('1'))
end