module Twitter::TwitterText::Extractor
A module for including Tweet parsing in a class. This module provides functions for the extraction and processing of usernames, lists, URLs and hashtags.
Constants
- MAX_TCO_SLUG_LENGTH
The maximum t.co path length that the Twitter backend supports.
- MAX_URL_LENGTH
Maximum URL length as defined by Twitter's backend.
- URL_PROTOCOL_LENGTH
Public Instance Methods
# File lib/twitter-text/extractor.rb, line 347
#
# Extracts every emoji in +text+ together with the character indices at which
# it was found.
#
# Returns an array of hashes shaped like
# <tt>{ :emoji => emoji_text, :indices => [start, end] }</tt>. The array is
# empty when +text+ is nil or no emoji is matched.
#
# If a block is given, it is called with each emoji, its start index and its
# end index, as announced by the <tt>:yields:</tt> directive.
def extract_emoji_with_indices(text) # :yields: emoji, start, end
  # Mirror the other extractors: nil text means "no entities", not a
  # NoMethodError from calling scan on nil.
  return [] unless text

  emoji = []
  text.to_s.scan(Twitter::TwitterText::Regex[:valid_emoji]) do |emoji_text|
    match_data = Regexp.last_match
    # NOTE(review): char_begin/char_end are not core MatchData methods;
    # presumably they are character-offset helpers added elsewhere in the
    # library -- confirm.
    emoji << {
      :emoji => emoji_text,
      :indices => [match_data.char_begin(0), match_data.char_end(0)]
    }
  end

  if block_given?
    emoji.each do |entity|
      yield entity[:emoji], entity[:indices].first, entity[:indices].last
    end
  end
  emoji
end
Extracts all usernames, lists, hashtags and URLs in the Tweet text along with the indices where each entity occurred. If the text is nil or contains no entities, an empty array will be returned.
If a block is given then it will be called for each entity.
# File lib/twitter-text/extractor.rb, line 83
#
# Extracts URLs, hashtags, mentions/lists, cashtags and (when the
# configuration enables emoji parsing) emoji from +text+, then drops entities
# that overlap an earlier one.
#
# +options+ is forwarded unchanged to extract_urls_with_indices, so
# protocol-less URLs are only extracted when the caller passes
# <tt>:extract_url_without_protocol => true</tt>. <tt>options[:config]</tt>
# may supply a configuration object; otherwise
# Twitter::TwitterText::Configuration.default_configuration is used.
#
# Returns the remaining entity hashes, or an empty array when +text+ is nil
# or nothing was found. If a block is given it is called with each entity.
def extract_entities_with_indices(text, options = {}, &block)
  # Honour the documented contract for nil text up front; the emoji extractor
  # calls scan on the text directly and would otherwise raise.
  return [] unless text

  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration

  # extract all entities
  entities = extract_urls_with_indices(text, options) +
             extract_hashtags_with_indices(text, :check_url_overlap => false) +
             extract_mentions_or_lists_with_indices(text) +
             extract_cashtags_with_indices(text)
  entities += extract_emoji_with_indices(text) if config.emoji_parsing_enabled

  return [] if entities.empty?

  entities = remove_overlapping_entities(entities)

  entities.each(&block) if block_given?
  entities
end
Extracts a list of all usernames mentioned in the Tweet text. If the text is nil or contains no username mentions, an empty array will be returned.
If a block is given then it will be called for each username.
# File lib/twitter-text/extractor.rb, line 106
#
# Returns the screen names mentioned in +text+, without their indices.
# The work is delegated to extract_mentioned_screen_names_with_indices and
# only the <tt>:screen_name</tt> value of each result is kept.
#
# If a block is given it is called once per screen name.
def extract_mentioned_screen_names(text, &block) # :yields: username
  mentions = extract_mentioned_screen_names_with_indices(text)
  screen_names = mentions.map { |mention| mention[:screen_name] }
  return screen_names unless block_given?

  screen_names.each(&block)
  screen_names
end
Extracts a list of all usernames mentioned in the Tweet text along with the indices where each mention occurred. If the text is nil or contains no username mentions, an empty array will be returned.
If a block is given, then it will be called with each username, the start index, and the end index in the text.
# File lib/twitter-text/extractor.rb, line 119
#
# Collects username mentions (list mentions are skipped) from +text+.
#
# Returns an array of hashes shaped like
# <tt>{ :screen_name => name, :indices => [start, end] }</tt>, or an empty
# array when +text+ is nil or has no username mentions. If a block is given
# it is called with each screen name, start index and end index.
def extract_mentioned_screen_names_with_indices(text) # :yields: username, start, end
  return [] unless text

  mentions = []
  extract_mentions_or_lists_with_indices(text) do |name, slug, first_index, last_index|
    # A non-empty slug marks a list mention, which is not a plain username.
    next unless slug.empty?

    mentions << { :screen_name => name, :indices => [first_index, last_index] }
  end

  return mentions unless block_given?

  mentions.each do |entry|
    first_index = entry[:indices].first
    last_index = entry[:indices].last
    yield entry[:screen_name], first_index, last_index
  end
  mentions
end
Extracts a list of all usernames or lists mentioned in the Tweet text along with the indices where each mention occurred. If the text is nil or contains no username or list mentions, an empty array will be returned.
If a block is given, then it will be called with each username, list slug, the start index, and the end index in the text. The list_slug will be an empty string if this is a username mention.
# File lib/twitter-text/extractor.rb, line 148
#
# Extracts username and list mentions from +text+.
#
# Returns an array of hashes shaped like
# <tt>{ :screen_name => name, :list_slug => slug, :indices => [start, end] }</tt>
# where <tt>:list_slug</tt> is "" for a plain username mention. The array is
# empty when +text+ is nil or contains no at sign. If a block is given it is
# called with each screen name, list slug, start index and end index.
def extract_mentions_or_lists_with_indices(text) # :yields: username, list_slug, start, end
  # Cheap pre-check before running the full mention regex. The class used to
  # list "@" twice; the second member is restored as the fullwidth at sign
  # (U+FF20), written as an escape so it cannot be folded back to ASCII.
  # NOTE(review): assumes Regex[:valid_mention_or_list] accepts U+FF20 as the
  # at sign -- confirm against the Regex module.
  return [] unless text =~ /[@\uFF20]/

  possible_entries = []
  text.to_s.scan(Twitter::TwitterText::Regex[:valid_mention_or_list]) do |_before, _at, screen_name, list_slug|
    match_data = Regexp.last_match
    # Skip candidates whose trailing text matches the end-of-mention
    # exclusion pattern.
    next if match_data.post_match =~ Twitter::TwitterText::Regex[:end_mention_match]

    # The entity starts one character before the screen-name capture
    # (presumably the at sign) and ends after the list slug when one matched.
    start_position = match_data.char_begin(3) - 1
    end_position = match_data.char_end(list_slug.nil? ? 3 : 4)
    possible_entries << {
      :screen_name => screen_name,
      :list_slug => list_slug || "",
      :indices => [start_position, end_position]
    }
  end

  if block_given?
    possible_entries.each do |mention|
      yield mention[:screen_name], mention[:list_slug], mention[:indices].first, mention[:indices].last
    end
  end
  possible_entries
end
Extracts the username replied to in the Tweet text. If the text is nil or is not a reply, nil will be returned.
If a block is given then it will be called with the username replied to (if any).
# File lib/twitter-text/extractor.rb, line 179
#
# Returns the screen name that +text+ replies to, or nil when +text+ is nil,
# does not match the reply pattern, or the text following the match fits the
# end-of-mention exclusion pattern.
#
# If a block is given it is called with the screen name before it is returned.
def extract_reply_screen_name(text) # :yields: username
  return nil unless text

  reply_match = text.match(Twitter::TwitterText::Regex[:valid_reply])
  return unless reply_match.respond_to?(:captures)

  trailing_text = reply_match.post_match
  return if trailing_text =~ Twitter::TwitterText::Regex[:end_mention_match]

  replied_to = reply_match.captures.first
  yield replied_to if block_given?
  replied_to
end
Extracts a list of all URLs included in the Tweet text. If the text is nil or contains no URLs, an empty array will be returned.
If a block is given then it will be called for each URL.
# File lib/twitter-text/extractor.rb, line 195
#
# Returns the URLs found in +text+, without their indices. The work is
# delegated to extract_urls_with_indices (called with its default options)
# and only the <tt>:url</tt> value of each result is kept.
#
# If a block is given it is called once per URL.
def extract_urls(text, &block) # :yields: url
  url_entities = extract_urls_with_indices(text)
  urls = url_entities.map { |entity| entity[:url] }
  return urls unless block_given?

  urls.each(&block)
  urls
end
Extracts a list of all URLs included in the Tweet text along with the indices. If the text is nil or contains no URLs, an empty array will be returned.
If a block is given then it will be called for each URL.
# File lib/twitter-text/extractor.rb, line 206
#
# Extracts URLs from +text+ together with their character indices.
#
# Returns an array of hashes shaped like
# <tt>{ :url => url, :indices => [start, end] }</tt>, or an empty array when
# +text+ is nil or cannot contain a URL. URLs without a protocol are only
# considered when <tt>options[:extract_url_without_protocol]</tt> is truthy
# (the default). If a block is given it is called with each URL, start index
# and end index.
def extract_urls_with_indices(text, options = {:extract_url_without_protocol => true}) # :yields: url, start, end
  return [] unless text

  # Cheap pre-check: a protocol-less URL needs a ".", a URL with a protocol
  # needs a ":".
  bare_domains_allowed = options[:extract_url_without_protocol]
  return [] unless bare_domains_allowed ? text.index(".") : text.index(":")

  found = []

  text.to_s.scan(Twitter::TwitterText::Regex[:valid_url]) do |_all, before, url, protocol, domain, _port, path, _query|
    url_match = Regexp.last_match
    start_position = url_match.char_begin(3)
    end_position = url_match.char_end(3)

    if protocol
      # In the case of t.co URLs, don't allow additional path characters
      if url =~ Twitter::TwitterText::Regex[:valid_tco_url]
        slug = Regexp.last_match(1)
        next if slug && slug.length > MAX_TCO_SLUG_LENGTH

        url = Regexp.last_match(0)
        end_position = start_position + url.codepoint_length
      end

      next unless is_valid_domain(url.length, domain, protocol)

      found << { :url => url, :indices => [start_position, end_position] }
    else
      # Protocol is missing: skip unless bare domains are wanted and the
      # preceding characters permit one.
      next unless options[:extract_url_without_protocol]
      next if before =~ Twitter::TwitterText::Regex[:invalid_url_without_protocol_preceding_chars]

      # The domain may contain non-ASCII characters, so extract only its
      # ASCII-only domains.
      last_url = nil
      domain.scan(Twitter::TwitterText::Regex[:valid_ascii_domain]) do |ascii_domain|
        domain_match = Regexp.last_match
        next unless is_valid_domain(url.length, ascii_domain, protocol)

        last_url = {
          :url => ascii_domain,
          :indices => [
            start_position + domain_match.char_begin(0),
            start_position + domain_match.char_end(0)
          ]
        }
        found << last_url
      end

      # no ASCII-only domain found. Skip the entire URL
      next unless last_url

      # last_url holds only the domain so far. It is the same hash that was
      # appended above, so extending it here also updates the stored entity
      # with the rest of the URL.
      if path
        last_url[:url] = url.sub(domain, last_url[:url])
        last_url[:indices][1] = end_position
      end
    end
  end

  if block_given?
    found.each { |entity| yield entity[:url], entity[:indices].first, entity[:indices].last }
  end
  found
end
# File lib/twitter-text/extractor.rb, line 372
#
# Checks whether a URL of +url_length+ characters stays within
# MAX_URL_LENGTH once its +domain+ has been converted with
# SimpleIDN.to_ascii.
#
# * +url_length+ - length of the URL as extracted from the text.
# * +domain+     - domain part of that URL; nil is never valid.
# * +protocol+   - protocol part of the URL, or nil when it was omitted, in
#   which case URL_PROTOCOL_LENGTH is added to the length.
#
# Returns true or false. Ordinary errors (for example a failed conversion)
# make the domain invalid; signals and exit requests are no longer swallowed.
def is_valid_domain(url_length, domain, protocol)
  return false unless domain

  original_domain_length = domain.length
  encoded_domain = SimpleIDN.to_ascii(domain)
  updated_domain_length = encoded_domain.length

  # Only growth counts: a conversion that shortens the domain does not buy
  # extra room.
  if updated_domain_length > original_domain_length
    url_length += updated_domain_length - original_domain_length
  end
  url_length += URL_PROTOCOL_LENGTH unless protocol

  url_length <= MAX_URL_LENGTH
rescue StandardError
  # On error don't consider this a valid domain.
  false
end
# File lib/twitter-text/extractor.rb, line 361
#
# Returns true when +text+ consists of exactly one emoji, i.e. when
# extract_emoji_with_indices finds a single entity whose <tt>:emoji</tt>
# value equals +text+. Returns false for nil, for any other text, and when
# extraction raises an ordinary error; signals and exit requests are no
# longer swallowed.
def is_valid_emoji(text)
  return false unless text

  entities = extract_emoji_with_indices(text)
  entities.count == 1 && entities[0][:emoji] == text
rescue StandardError
  # On error don't consider this a valid emoji.
  false
end
Remove overlapping entities. This returns a new array with no overlapping entities.
# File lib/twitter-text/extractor.rb, line 67
#
# Returns a new array holding the given entities ordered by start index, with
# every entity removed that starts before the previously kept entity ends.
# The input array is not modified.
#
# Entities that share a start index keep their input order, so the one listed
# first by the caller is the one that survives.
def remove_overlapping_entities(entities)
  # sort_by alone is not stable, so the input position is used as tie-break.
  ordered = entities.each_with_index.sort_by do |entity, position|
    [entity[:indices].first, position]
  end

  kept = []
  ordered.each do |entity, _position|
    previous = kept.last
    # An entity may start exactly where the previous one ends; anything
    # earlier overlaps and is dropped.
    next if previous && previous[:indices].last > entity[:indices].first

    kept << entity
  end
  kept
end