module Twitter::TwitterText::Validation

Constants

DEFAULT_TCO_URL_LENGTHS
MAX_LENGTH_LEGACY
VALID_LIST_RE

Public Instance Methods

contains_invalid?(text) click to toggle source
# File lib/twitter-text/validation.rb, line 127
def contains_invalid?(text)
  return false if !text || text.empty?
  begin
    return true if Twitter::TwitterText::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
  rescue ArgumentError
    # non-Unicode value.
    return true
  end
  return false
end
parse_tweet(text, options = {}) click to toggle source

Parse input text and return hash with descriptive parameters populated.

# File lib/twitter-text/validation.rb, line 37
def parse_tweet(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)
  config = options[:config] || Twitter::TwitterText::Configuration.default_configuration
  normalized_text = text.to_nfc
  unless (normalized_text.length > 0)
    ParseResults.empty()
  end

  scale = config.scale
  max_weighted_tweet_length = config.max_weighted_tweet_length
  scaled_max_weighted_tweet_length = max_weighted_tweet_length * scale
  transformed_url_length = config.transformed_url_length * scale
  ranges = config.ranges

  url_entities = Twitter::TwitterText::Extractor.extract_urls_with_indices(normalized_text)
  emoji_entities = config.emoji_parsing_enabled ? Twitter::TwitterText::Extractor.extract_emoji_with_indices(normalized_text) : []

  has_invalid_chars = false
  weighted_count = 0
  offset = 0
  display_offset = 0
  valid_offset = 0

  while offset < normalized_text.codepoint_length
    # Reset the default char weight each pass through the loop
    char_weight = config.default_weight
    entity_length = 0

    url_entities.each do |url_entity|
      if url_entity[:indices].first == offset
        entity_length = url_entity[:indices].last - url_entity[:indices].first
        weighted_count += transformed_url_length
        offset += entity_length
        display_offset += entity_length
        if weighted_count <= scaled_max_weighted_tweet_length
          valid_offset += entity_length
        end
        # Finding a match breaks the loop
        break
      end
    end

    emoji_entities.each do |emoji_entity|
      if emoji_entity[:indices].first == offset
        entity_length = emoji_entity[:indices].last - emoji_entity[:indices].first
        weighted_count += char_weight # the default weight
        offset += entity_length
        display_offset += entity_length
        if weighted_count <= scaled_max_weighted_tweet_length
          valid_offset += entity_length
        end
        # Finding a match breaks the loop
        break
      end
    end

    next if entity_length > 0

    if offset < normalized_text.codepoint_length
      code_point = normalized_text[offset]

      ranges.each do |range|
        if range.contains?(code_point.unpack("U").first)
          char_weight = range.weight
          break
        end
      end

      weighted_count += char_weight

      has_invalid_chars = contains_invalid?(code_point) unless has_invalid_chars
      codepoint_length = code_point.codepoint_length
      offset += codepoint_length
      display_offset += codepoint_length
      #          index += codepoint_length

      if !has_invalid_chars && (weighted_count <= scaled_max_weighted_tweet_length)
        valid_offset += codepoint_length
      end
    end
  end

  normalized_text_offset = text.codepoint_length - normalized_text.codepoint_length
  scaled_weighted_length = weighted_count / scale
  is_valid = !has_invalid_chars && (scaled_weighted_length <= max_weighted_tweet_length)
  permillage = scaled_weighted_length * 1000 / max_weighted_tweet_length

  return ParseResults.new(weighted_length: scaled_weighted_length, permillage: permillage, valid: is_valid, display_range_start: 0, display_range_end: (display_offset + normalized_text_offset - 1), valid_range_start: 0, valid_range_end: (valid_offset + normalized_text_offset - 1))
end
tweet_invalid?(text) click to toggle source

DEPRECATED: Please use parse_text instead.

Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation will allow quicker feedback.

Returns false if this text is valid. Otherwise one of the following Symbols will be returned:

<tt>:too_long</tt>:: if the <tt>text</tt> is too long
<tt>:empty</tt>:: if the <tt>text</tt> is nil or empty
<tt>:invalid_characters</tt>:: if the <tt>text</tt> contains non-Unicode or any of the disallowed Unicode characters
# File lib/twitter-text/validation.rb, line 223
def tweet_invalid?(text)
  return :empty if !text || text.empty?
  begin
    return :too_long if tweet_length(text) > MAX_LENGTH_LEGACY
    return :invalid_characters if Twitter::TwitterText::Regex::INVALID_CHARACTERS.any?{|invalid_char| text.include?(invalid_char) }
  rescue ArgumentError
    # non-Unicode value.
    return :invalid_characters
  end

  return false
end
tweet_length(text, options = {}) click to toggle source

DEPRECATED: Please use parse_text instead.

Returns the length of the string as it would be displayed. This is equivilent to the length of the Unicode NFC (See: www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a string no matter which actual form was transmitted. For example:

U+0065  Latin Small Letter E

+ U+0301 Combining Acute Accent


2 bytes, 2 characters, displayed as é (1 visual glyph)

… The NFC of {U+0065, U+0301} is {U+00E9}, which is a single chracter and a +display_length+ of 1

The string could also contain U+00E9 already, in which case the canonicalization will not change the value.

# File lib/twitter-text/validation.rb, line 198
def tweet_length(text, options = {})
  options = DEFAULT_TCO_URL_LENGTHS.merge(options)

  length = text.to_nfc.unpack("U*").length

  Twitter::TwitterText::Extractor.extract_urls_with_indices(text) do |url, start_position, end_position|
    length += start_position - end_position
    length += options[:short_url_length] if url.length > 0
  end

  length
end
valid_hashtag?(hashtag) click to toggle source
# File lib/twitter-text/validation.rb, line 153
def valid_hashtag?(hashtag)
  return false if !hashtag || hashtag.empty?

  extracted = Twitter::TwitterText::Extractor.extract_hashtags(hashtag)
  # Should extract the hashtag minus the # sign, hence the [1..-1]
  extracted.size == 1 && extracted.first == hashtag[1..-1]
end
valid_list?(username_list) click to toggle source
# File lib/twitter-text/validation.rb, line 147
def valid_list?(username_list)
  match = username_list.match(VALID_LIST_RE)
  # Must have matched and had nothing before or after
  !!(match && match[1] == "" && match[4] && !match[4].empty?)
end
valid_tweet_text?(text) click to toggle source
# File lib/twitter-text/validation.rb, line 237
def valid_tweet_text?(text)
  !tweet_invalid?(text)
end
valid_url?(url, unicode_domains=true, require_protocol=true) click to toggle source
# File lib/twitter-text/validation.rb, line 161
def valid_url?(url, unicode_domains=true, require_protocol=true)
  return false if !url || url.empty?

  url_parts = url.match(Twitter::TwitterText::Regex[:validate_url_unencoded])
  return false unless (url_parts && url_parts.to_s == url)

  scheme, authority, path, query, fragment = url_parts.captures

  return false unless ((!require_protocol ||
                        (valid_match?(scheme, Twitter::TwitterText::Regex[:validate_url_scheme]) && scheme.match(/\Ahttps?\Z/i))) &&
                       valid_match?(path, Twitter::TwitterText::Regex[:validate_url_path]) &&
                       valid_match?(query, Twitter::TwitterText::Regex[:validate_url_query], true) &&
                       valid_match?(fragment, Twitter::TwitterText::Regex[:validate_url_fragment], true))

  return (unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_unicode_authority])) ||
         (!unicode_domains && valid_match?(authority, Twitter::TwitterText::Regex[:validate_url_authority]))
end
valid_username?(username) click to toggle source
# File lib/twitter-text/validation.rb, line 138
def valid_username?(username)
  return false if !username || username.empty?

  extracted = Twitter::TwitterText::Extractor.extract_mentioned_screen_names(username)
  # Should extract the username minus the @ sign, hence the [1..-1]
  extracted.size == 1 && extracted.first == username[1..-1]
end

Private Instance Methods

valid_match?(string, regex, optional=false) click to toggle source
# File lib/twitter-text/validation.rb, line 244
def valid_match?(string, regex, optional=false)
  return (string && string.match(regex) && $~.to_s == string) unless optional

  !(string && (!string.match(regex) || $~.to_s != string))
end