class Tokenizer

Attributes

Tags[RW]
html[RW]
type[RW]

Public Class Methods

new(html) click to toggle source
# File lib/rLexer/tokenizer.rb, line 6
def initialize(html)
  # Normalize double quotes to single quotes (attribute parsing later
  # splits on "' "), then trim surrounding whitespace.
  @html = html.gsub('"', "'").strip
  @type = :EOF
  @tokens = []
end

Public Instance Methods

close_tag?(char) click to toggle source
# File lib/rLexer/tokenizer.rb, line 150
def close_tag?(char)
  # True when +char+ is the tag terminator defined by Tags::CLOSE_TAG.
  Tags::CLOSE_TAG == char
end
comment_end(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 115
def comment_end(idx)
  # Only meaningful while inside a comment: once the comment terminator
  # starts at +idx+, re-classify the current token type.
  return unless @type == :COMMENT
  set_type(idx) if comment_end?(idx)
end
comment_end?(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 122
def comment_end?(position)
  # Does the END_COMMENT marker begin at +position+ in the source?
  suitable?(position, Tags::END_COMMENT)
end
comment_start?(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 138
def comment_start?(position)
  # Does the START_COMMENT marker begin at +position+ in the source?
  suitable?(position, Tags::START_COMMENT)
end
consume(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 28
def consume(idx)
  # Dispatch on the current token type and delegate to the matching
  # consumer starting at +idx+.
  case @type
  when :COMMENT
    consume_comment(idx)
  when :OPEN, :CLOSE
    consume_tag(idx)
  when :DOCTYPE
    # DOCTYPE consumption is not implemented yet.
    # consume_doctype(idx)
  when :DATA
    consume_data(idx)
  end
end
consume_attributes() click to toggle source
# File lib/rLexer/tokenizer.rb, line 77
def consume_attributes
  # Post-processing pass over @tokens:
  #  * for every :OPEN token that carries attributes after the tag name,
  #    record an [index, attribute-strings] pair;
  #  * shrink every non-:COMMENT/:DATA token value down to the tag name;
  #  * splice an [:ATTRIBUTES, [...]] token right after each
  #    attribute-bearing :OPEN token.
  # NOTE: attributes are split on "' " (quote + space), which relies on
  # #initialize having normalized double quotes to single quotes.
  pending = []
  @tokens.each_with_index do |token, i|
    atts = token[1].split(' ')[1..-1]
    # Guard: token[1] may be empty (e.g. the input "<>"), in which case
    # split(' ')[1..-1] is nil and the original code crashed on atts[0].
    if token[0] == :OPEN && !atts.nil? && !atts[0].nil?
      pending.push([i, atts.join(' ').split("' ")])
    end
    unless token[0] == :COMMENT || token[0] == :DATA
      @tokens[i][1] = @tokens[i][1].split(' ')[0]
    end
  end
  # Each insertion shifts all later recorded indices by one,
  # so keep a running offset.
  offset = 1
  pending.each do |index, attributes|
    @tokens.insert(index + offset, [:ATTRIBUTES, attributes])
    offset += 1
  end
end
consume_comment(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 58
def consume_comment(idx)
  # Extract the comment body between the START_COMMENT marker at +idx+
  # and the END_COMMENT marker, and record it as a token.
  remainder = @html[idx..-1]
  body = remainder[Tags::START_COMMENT.length..end_comment_index(remainder)]
  set_token(body)
end
consume_data(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 93
def consume_data(idx)
  # Skip when the very next character already starts a tag (or we are at
  # the end of the input) — there is no text content to capture.
  return if next_char?(idx)

  rest = @html[idx..-1]
  # Text runs from just past the closing '>' up to the next '<'
  # (or to the end of the input when no tag follows).
  stop = rest.index(Tags::OPEN_TAG) || rest.length
  text = rest[Tags::CLOSE_TAG.length..stop]

  set_token(text) unless text == ''
end
consume_tag(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 64
def consume_tag(idx)
  # Capture the tag name (plus any raw attribute text) starting at +idx+,
  # stopping just before the closing '>' when one exists.
  rest = @html[idx..-1]

  close_at = rest.index(Tags::CLOSE_TAG)
  last = if close_at.nil?
           -1
         else
           close_at - 1
         end

  set_token(rest[tag_index(rest)..last])
end
current_char(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 102
def current_char(idx)
  # Character of the normalized source at +idx+ (nil past the end).
  @html.slice(idx)
end
doctype?(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 134
# Stub: DOCTYPE detection is not implemented, so the :DOCTYPE branch in
# #set_type/#consume is never taken.
# NOTE(review): presumably this was meant to match "<!DOCTYPE" at +idx+
# via #suitable? — confirm intent before implementing.
def doctype?(idx)
  false
end
end_comment_index(html) click to toggle source
# File lib/rLexer/tokenizer.rb, line 106
def end_comment_index(html)
  # Index of the last comment-body character relative to +html+, i.e. one
  # position before the END_COMMENT marker; -1 when no marker is present.
  pos = html.index(Tags::END_COMMENT)
  return -1 if pos.nil?
  (pos + 2) - Tags::END_COMMENT.length
end
end_tag?(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 130
def end_tag?(position)
  # Does a closing-tag marker (e.g. "</") begin at +position+?
  suitable?(position, Tags::CLOSING_TAG)
end
next_char?(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 126
def next_char?(idx)
  # True when the character after +idx+ opens a tag or the input ends —
  # either way there is no data text to consume.
  following = @html[idx + 1]
  following.nil? || following == Tags::OPEN_TAG
end
open_tag?(char) click to toggle source
# File lib/rLexer/tokenizer.rb, line 146
def open_tag?(char)
  # True when +char+ is the tag opener defined by Tags::OPEN_TAG.
  Tags::OPEN_TAG == char
end
process(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 24
def process(idx)
  # Classify the character at +idx+, then emit the matching token.
  set_type(idx)
  consume(idx)
end
set_token(slice) click to toggle source
# File lib/rLexer/tokenizer.rb, line 54
def set_token(slice)
  # Append a [type, text] pair tagged with the current token type.
  @tokens << [@type, slice]
end
set_type(idx) click to toggle source
# File lib/rLexer/tokenizer.rb, line 40
def set_type(idx)
  # Decide the token type at +idx+. Order matters: comment start wins
  # over a plain open tag, an end tag ("</") wins over an open tag, and
  # a closing '>' (or a comment terminator) begins a :DATA run.
  # Leaves @type untouched when nothing matches.
  char = current_char(idx)
  if comment_start?(idx)
    @type = :COMMENT
  elsif end_tag?(idx)
    @type = :CLOSE
  elsif doctype?(idx)
    @type = :DOCTYPE
  elsif close_tag?(char) || comment_end?(idx)
    @type = :DATA
  elsif open_tag?(char)
    @type = :OPEN
  end
end
suitable?(idx, tag) click to toggle source
# File lib/rLexer/tokenizer.rb, line 142
def suitable?(idx, tag)
  # Do the bytes of @html starting at +idx+ spell out +tag+?
  @html.byteslice(idx, tag.length) == tag
end
tag_index(html) click to toggle source
# File lib/rLexer/tokenizer.rb, line 111
def tag_index(html)
  # Offset past the tag marker: "<" for an :OPEN token, "</" otherwise.
  # NOTE(review): the +html+ parameter is unused; kept for caller compat.
  if @type == :OPEN
    Tags::OPEN_TAG.length
  else
    Tags::CLOSING_TAG.length
  end
end
tokenize() click to toggle source
# File lib/rLexer/tokenizer.rb, line 13
def tokenize
  # Single pass over the source: leave comment mode when its terminator
  # appears, skip everything else inside a comment, and process every
  # '<' / '>' boundary. A final pass splits attributes into their own
  # tokens.
  @html.chars.each_with_index do |char, idx|
    comment_end(idx)
    next if @type == :COMMENT
    process(idx) if open_tag?(char) || close_tag?(char)
  end
  consume_attributes
end
tokens() click to toggle source
# File lib/rLexer/tokenizer.rb, line 154
def tokens
  @tokens
end