module MovieCrawler
Constants
- ATMOVIES_MAIN_URL
- MOVIE_BASE_URL
- REFLECTION_BASE
- REFLECTION_CLASS
- REFLECTION_FS
- REFLECTION_NAME
- REFLECTION_SAID
- REFLECTION_SATITLE
- SEARCH_URL
- TRAILER_URL
- URL_LIST
get the info from atmovies
- VERSION
- WHOLE_MOVIEWS_CODES
- WHOLE_MOVIEWS_DATES
- WHOLE_MOVIEWS_STORIES
- WHOLE_MOVIEWS_TITLES
Public Class Methods
dvd_rank()
click to toggle source
# File lib/movie_crawler/crawler.rb, line 94
# Fetch rank table '3' through get_table and return it serialized by to_yaml.
# (Presumably the DVD ranking, going by the method name.)
def self.dvd_rank
  to_yaml(get_table('3'))
end
encode_zh(text)
click to toggle source
# File lib/movie_crawler/crawler.rb, line 136
# Build a URL by appending the Big5, percent-encoded form of +text+ to
# REFLECTION_SATITLE.
#
# String#encode and URI.encode_www_form_component replace Iconv and
# URI.encode, neither of which ships with modern Ruby. Unlike URI.encode,
# reserved characters such as '&' or '?' inside the title are escaped too.
#
# text - title String whose bytes are UTF-8.
#
# Returns the combined String.
# Raises Encoding::UndefinedConversionError when a character has no Big5
# mapping.
def self.encode_zh(text)
  big5 = text.encode('Big5', 'UTF-8')
  # encode_www_form_component writes a space as '+'. A literal '+' is already
  # escaped to %2B at this point, so every remaining '+' stands for a space.
  encoded = URI.encode_www_form_component(big5).gsub('+', '%20')
  REFLECTION_SATITLE + encoded
end
get_codes(doc)
click to toggle source
get the code of movies
# File lib/movie_crawler/crawler.rb, line 191
# Collect one code per node matched by WHOLE_MOVIEWS_CODES: the third
# '/'-separated segment of each node's value.
def self.get_codes(doc)
  doc.xpath(WHOLE_MOVIEWS_CODES).map do |attribute|
    attribute.value.split('/')[2]
  end
end
get_dates(doc)
click to toggle source
get the release date
# File lib/movie_crawler/crawler.rb, line 179
# Pull the release date out of the second piece of every entry returned by
# split_day_and_time. An entry without a digits/digits/digits match yields ''.
def self.get_dates(doc)
  split_day_and_time(doc).map do |pieces|
    pieces[1].match(%r{\d+/\d+/\d+}).to_s # mm/dd/yy
  end
end
get_movie_info(movie_name)
click to toggle source
combine the workflow for user to call for movie info
# File lib/movie_crawler/crawler.rb, line 76
# One-call workflow: obtain the search form, follow the search for
# +movie_name+ to the film page, parse it, and return the result via to_yaml.
def self.get_movie_info(movie_name)
  film_page = trace_page(start_mechan, movie_name)
  to_yaml(parse_movie_info(film_page))
end
get_one_movie_name(doc)
click to toggle source
# File lib/movie_crawler/crawler.rb, line 141
# Return the text matched by REFLECTION_NAME with tabs, carriage returns and
# newlines removed.
#
# Uses the non-destructive gsub: gsub! returns nil when nothing is replaced,
# which made this method return nil for a name that was already clean.
def self.get_one_movie_name(doc)
  doc.xpath(REFLECTION_NAME).text.gsub(/[\t\r\n]/, '')
end
get_reflection(doc)
click to toggle source
# File lib/movie_crawler/crawler.rb, line 146
# Return the text matched by REFLECTION_CLASS as an Array of tokens: tabs,
# carriage returns and newlines are deleted first, then the remainder is
# split on whitespace.
#
# Uses the non-destructive gsub: gsub! returns nil when nothing is replaced,
# so the chained split raised NoMethodError for text without those characters.
def self.get_reflection(doc)
  doc.xpath(REFLECTION_CLASS).text.gsub(/[\t\r\n]/, '').split
end
get_runtime(doc)
click to toggle source
get the runtime of movie
# File lib/movie_crawler/crawler.rb, line 173
# Pull the runtime out of the first piece of every entry returned by
# split_day_and_time: the first run of digits, or '' when there is none.
def self.get_runtime(doc)
  split_day_and_time(doc).map do |pieces|
    pieces[0].match(/\d+/).to_s
  end
end
get_stories(doc)
click to toggle source
get the storyline of movie
# File lib/movie_crawler/crawler.rb, line 167
# Return the text of every node matched by WHOLE_MOVIEWS_STORIES.
def self.get_stories(doc)
  doc.xpath(WHOLE_MOVIEWS_STORIES).map(&:text)
end
get_table(rankid)
click to toggle source
parse the ranktable info
# File lib/movie_crawler/crawler.rb, line 100
# Read one rank table from the page at ATMOVIES_MAIN_URL and hand its entries
# to rankmix.
#
# rankid - 1-based position of the div inside the element with id 'ranklist';
#          a String or an Integer (interpolation accepts both).
#
# Returns whatever rankmix returns for the entries.
def self.get_table(rankid)
  doc = open_html(ATMOVIES_MAIN_URL)
  text = doc.xpath("//*[@id = 'ranklist']/div[#{rankid}]").text
  # NOTE(review): if the second pattern is a plain space the first gsub is
  # redundant; it may have been a non-breaking space before extraction - confirm.
  entries = text.gsub(' : ', ':').gsub(' ', '').split
  # The former `each { |item| item.gsub(...) }` step was dropped: it discarded
  # its result, and split already removes tabs, carriage returns and newlines.
  entries.pop # the last token is discarded, as before
  rankmix(entries)
end
get_titles(doc)
click to toggle source
get the movie name
# File lib/movie_crawler/crawler.rb, line 161
# Return the text of every node matched by WHOLE_MOVIEWS_TITLES with tabs,
# newlines and carriage returns removed.
def self.get_titles(doc)
  doc.xpath(WHOLE_MOVIEWS_TITLES).map do |node|
    node.text.gsub(/[\t\n\r]/, '')
  end
end
get_trailer(doc)
click to toggle source
get the trailer link of the movies
# File lib/movie_crawler/crawler.rb, line 197
# Build one trailer link per movie by prefixing each code from get_codes with
# TRAILER_URL.
def self.get_trailer(doc)
  get_codes(doc).map { |code| TRAILER_URL + code }
end
mix(t, s, d, ti, tr)
click to toggle source
build the hash for yaml output
# File lib/movie_crawler/crawler.rb, line 203
# Combine five parallel arrays into one Hash per movie, ready for YAML output.
#
# t  - titles; its length decides how many hashes are produced
# s  - stories
# d  - dates
# ti - runtimes
# tr - trailer links
#
# A missing entry in any of s, d, ti or tr becomes nil in that movie's hash.
# Returns an Array of Hashes with the keys 'title', 'story', 'date',
# 'runtime(minutes)' and 'trailer'.
def self.mix(t, s, d, ti, tr)
  t.zip(s, d, ti, tr).map do |title, story, date, runtime, trailer|
    { 'title' => title, 'story' => story,
      'date' => date, 'runtime(minutes)' => runtime,
      'trailer' => trailer }
  end
end
movie_details(code)
click to toggle source
get the details of movie
# File lib/movie_crawler/crawler.rb, line 151
# Open the detail page for one movie: MOVIE_BASE_URL followed by +code+ and a
# trailing slash, loaded through open_html.
def self.movie_details(code)
  detail_url = MOVIE_BASE_URL + code + '/'
  open_html(detail_url)
end
movies(category = 'LATEST')
click to toggle source
switch to different url accordingly
# File lib/movie_crawler/crawler.rb, line 119
# Return the movies of +category+ (default 'LATEST') as produced by
# movies_parser and serialized by to_yaml.
def self.movies(category = 'LATEST')
  to_yaml(movies_parser(category))
end
movies_parser(category)
click to toggle source
parse the movies accordingly
# File lib/movie_crawler/crawler.rb, line 125
# Load the page registered in URL_LIST under the upcased +category+ and
# assemble titles, stories, dates, runtimes and trailers with mix.
def self.movies_parser(category)
  page = open_html(URL_LIST[category.upcase])

  titles = get_titles(page)
  stories = get_stories(page)
  dates = get_dates(page)
  trailers = get_trailer(page)
  runtimes = get_runtime(page)

  # mix takes runtimes before trailers.
  mix(titles, stories, dates, runtimes, trailers)
end
open_html(url)
click to toggle source
open the destination url
# File lib/movie_crawler/crawler.rb, line 156
# Fetch +url+ and parse the response with Nokogiri::HTML.
#
# The URL is opened through the URI object rather than Kernel#open:
# Kernel#open no longer fetches URLs on Ruby 3.0 and later, and it runs a
# command when given a string that starts with '|'. The block form also
# closes the stream once parsing is done.
#
# url - String with a scheme open-uri can open (http, https, ftp).
#
# Returns the parsed document.
# Raises URI::InvalidURIError when +url+ cannot be parsed as a URI.
def self.open_html(url)
  URI.parse(url).open { |io| Nokogiri::HTML(io) }
end
parse_comment(film_page)
click to toggle source
# File lib/movie_crawler/crawler.rb, line 47
# Extract the comment lines from the '.comment_data' element of a film page.
#
# The missing-element case is tested explicitly instead of through a bare
# rescue, so unrelated errors are no longer swallowed.
#
# Returns an Array of Strings (the element text split on runs of tabs, carriage
# returns and newlines), or the String 'not exised' when the page has no such
# element. The fallback text is left as-is because it appears in the output.
def self.parse_comment(film_page)
  node = film_page.at('.comment_data')
  return 'not exised' if node.nil?

  node.text.gsub(/[\t\r\n]+/, "\t").strip.split("\t")
end
parse_crew(film_page)
click to toggle source
# File lib/movie_crawler/crawler.rb, line 57
# Extract the crew entries from the '.crew_row' element of a film page.
#
# Runs of tabs, carriage returns and newlines are collapsed to one tab and the
# text is split on that tab. The previous pattern /[\t+]/ was a character class
# that also split on a literal '+', cutting entries that contain a plus sign.
#
# Returns an Array of stripped, non-empty Strings.
def self.parse_crew(film_page)
  raw = film_page.at('.crew_row').text
  fields = raw.gsub(/[\t\r\n]+/, "\t").strip.split("\t")
  fields.map(&:strip).reject(&:empty?)
end
parse_movie_info(film_page)
click to toggle source
parse the film info in the target web page
# File lib/movie_crawler/crawler.rb, line 64
# Gather the fields of a film page into a Hash with the keys 'name',
# 'schedule', 'crew_info', 'comment' and 'content'.
def self.parse_movie_info(film_page)
  stripped_text = ->(selector) { film_page.at(selector).text.strip }

  title = stripped_text.call('.name .at21b')
  showing = stripped_text.call('#movie_info01 b')
  summary = stripped_text.call('.sub_content')
  crew = parse_crew(film_page)
  comments = parse_comment(film_page)

  {
    'name' => title,
    'schedule' => showing,
    'crew_info' => crew,
    'comment' => comments,
    'content' => summary
  }
end
rankmix(t)
click to toggle source
mix the rank info
# File lib/movie_crawler/crawler.rb, line 110
# Turn a list of entries into rank hashes.
#
# t - Array of entries in rank order.
#
# Returns an Array with one single-pair Hash per entry, mapping the 1-based
# rank to the entry converted with to_s.
def self.rankmix(t)
  t.each.with_index(1).map { |entry, rank| { rank => entry.to_s } }
end
split_day_and_time(doc)
click to toggle source
# File lib/movie_crawler/crawler.rb, line 184
# Split the text of every node matched by WHOLE_MOVIEWS_DATES on a fixed
# whitespace separator. get_runtime reads element 0 of each result and
# get_dates reads element 1.
def self.split_day_and_time(doc)
  # Newline, four tabs, four spaces, one tab.
  separator = "\n\t\t\t\t\s\s\s\s\t"
  doc.xpath(WHOLE_MOVIEWS_DATES).map { |node| node.text.split(separator) }
end
start_mechan()
click to toggle source
start a Mechanize agent for crawling from web to web
# File lib/movie_crawler/crawler.rb, line 31
# Create a Mechanize agent with the 'Mac Safari' user agent alias, post to
# SEARCH_URL and return the form whose action is 'search.cfm' from the response.
def self.start_mechan
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  # NOTE(review): SEARCH_URL is posted twice and the first response is
  # discarded - confirm whether the extra request is needed.
  agent.post(SEARCH_URL)
  agent.post(SEARCH_URL).form_with(action: 'search.cfm')
end
taipei_weekend()
click to toggle source
# File lib/movie_crawler/crawler.rb, line 89
# Fetch rank table '2' through get_table and return it serialized by to_yaml.
# (Presumably the Taipei weekend ranking, going by the method name.)
def self.taipei_weekend
  to_yaml(get_table('2'))
end
to_yaml(mix)
click to toggle source
convert the schedules to yaml format
# File lib/movie_crawler/crawler.rb, line 213 def self.to_yaml(mix) mix.to_yaml end
trace_page(form, movie_name)
click to toggle source
finish a post request to search for specific film
# File lib/movie_crawler/crawler.rb, line 40
# Submit the search form for +movie_name+ and follow the first result link
# whose href matches /F&d=/.
#
# Returns whatever clicking that link returns.
def self.trace_page(form, movie_name)
  form.search_term = movie_name
  submit_button = form.button_with(name: 'search')
  results = form.submit(submit_button)
  results.link_with(href: /F&d=/).click
end
us_weekend()
click to toggle source
one of three rank parsers (see also taipei_weekend and dvd_rank)
# File lib/movie_crawler/crawler.rb, line 84
# Fetch rank table '1' through get_table and return it serialized by to_yaml.
# (Presumably the US weekend ranking, going by the method name.)
def self.us_weekend
  to_yaml(get_table('1'))
end