class TildeScraper::Scraper

Public Class Methods

scrape_comments(url) click to toggle source
# File lib/tilde_scraper/scraper.rb, line 53
# Collects the comment tree of the page at +url+.
#
# The page is loaded with open_url, whatever matches "#comments" is selected,
# and that selection is handed to scrape_children together with +url+. The
# result of scrape_children is returned unchanged.
def self.scrape_comments(url)
  comment_root = open_url(url).css("#comments")
  scrape_children(comment_root, url)
end
scrape_groups(url) click to toggle source
# File lib/tilde_scraper/scraper.rb, line 41
# Lists the groups found on the page at +url+.
#
# Returns an array with one hash per "tr.group-level-0" row:
# * :name        - text of the row's "a" elements
# * :description - text of the row's "p" elements
# * :subs        - first whitespace-separated token of the
#                  "span.group-subscription-count" text (nil when that text
#                  contains no tokens)
def self.scrape_groups(url)
  open_url(url).css("tr.group-level-0").map do |row|
    group_name = row.css("a").text
    blurb = row.css("p").text
    subscription_label = row.css("span.group-subscription-count").text

    { name: group_name, description: blurb, subs: subscription_label.split(" ").first }
  end
end
scrape_page(url) click to toggle source

Returns an array with two elements: the first is a hash containing general page info, and the second is an array of hashes containing topic info.

# File lib/tilde_scraper/scraper.rb, line 5
# Scrapes a topic listing page.
#
# Returns a two-element array:
# * a hash of page info: :url plus, for every "a.page-item" link, a
#   "<label>_link" key (label is the link text, downcased) holding its href;
# * an array with one hash per "article.topic" carrying :title,
#   :comment_count, :comment_link, :group, :word_count, :age and :votes, plus
#   either :topic_text (when the excerpt has children other than <summary>)
#   or :link (the href of the title link) otherwise.
def self.scrape_page(url)
  doc = open_url(url)

  # Page-level info: the url itself plus one entry per pagination button.
  page_info = { url: url }
  doc.css("a.page-item").each do |button|
    key = "#{button.text.downcase}_link".to_sym
    page_info[key] = button.attribute("href").value
  end

  topic_list = doc.css("article.topic").map do |topic|
    title_link = topic.css("h1.topic-title a")
    meta = topic.css("div.topic-metadata")
    entry = {
      title: title_link.text,
      comment_count: topic.css("div.topic-info-comments").text.strip,
      # Only the first whitespace-separated token of the href is kept.
      comment_link: "https://tildes.net" + topic.css("div.topic-info-comments a").attribute("href").value.split(" ").first,
      group: meta.css("span.topic-group").text,
      word_count: meta.css("span.topic-content-metadata").text.split(" ").first,
      age: topic.css("time.time-responsive").attribute("data-abbreviated").value,
      votes: topic.css("div.topic-voting span.topic-voting-votes").text
    }

    # The <summary> child is dropped; whatever is left is the excerpt body.
    excerpt_parts = topic.css(".topic-text-excerpt").children.reject { |node| node.name == "summary" }
    if excerpt_parts.empty?
      entry[:link] = title_link.attribute("href").value
    else
      entry[:topic_text] = excerpt_parts.map(&:text).join.strip
    end
    entry
  end

  [page_info, topic_list]
end

Private Class Methods

open_url(url) click to toggle source
# File lib/tilde_scraper/scraper.rb, line 76
# Fetches +url+ and parses what it returns as an HTML document.
#
# The URL is opened through URI.open rather than a bare +open+: since Ruby 3.0
# open-uri no longer hooks Kernel#open, so the bare call would treat the URL
# as a local file path. The block form closes the downloaded stream as soon as
# Nokogiri has consumed it instead of leaving the handle open.
#
# Returns the value produced by Nokogiri::HTML for the opened stream.
def self.open_url(url)
  URI.open(url) { |io| Nokogiri::HTML(io) }
end
scrape_children(top_comment, url, level = 0) click to toggle source
# File lib/tilde_scraper/scraper.rb, line 61
# Recursively converts a comment list into nested hashes.
#
# For every "> li > article" under +top_comment+ a hash is built from the
# article's first "> div.comment-itself" element:
# * :text     - stripped text of "div.comment-text"
# * :author   - text of "a.link-user"
# * :votes    - first whitespace-separated token of "div.comment-votes"
# * :level    - the +level+ this call was made with
# * :url      - +url+, passed through untouched
# * :children - the same scrape applied to the article's
#               "> ol.comment-tree-replies", with +level+ increased by one
def self.scrape_children(top_comment, url, level = 0)
  top_comment.css("> li > article").map do |article|
    body = article.css("> div.comment-itself").first
    replies = article.css("> ol.comment-tree-replies")

    {
      text: body.css("div.comment-text").text.strip,
      author: body.css("a.link-user").text,
      votes: body.css("div.comment-votes").text.split(" ").first,
      level: level,
      url: url,
      children: scrape_children(replies, url, level + 1)
    }
  end
end