class Bayeux

Attributes

block_forest[R]

Accessors

Public Class Methods

json_create(json_hash) click to toggle source
# File lib/bayeux/bayeux.rb, line 322
# Rebuild an instance from the hash form produced by #to_json.
#
# json_hash:: Hash whose "block_forest" entry is handed to +new+.
#
# Returns whatever +new+ returns. If the JSON library cannot be loaded a
# warning is printed and the result of +warn+ (nil) is returned instead.
def self.json_create(json_hash)
  # Loaded lazily so the class remains usable without the JSON library.
  require 'json'

  new(json_hash["block_forest"])
rescue LoadError
  warn "The JSON gem couldn't be loaded, and so the JSON representation could not be generated"
end

Public Instance Methods

content_string(first, last) click to toggle source

Helper functions

# File lib/bayeux/bayeux.rb, line 344
# Slice the saved document text (@content) between two indices.
#
# first:: index of the first character to include, or nil.
# last::  index one past the final character to include, or nil.
#
# Returns the substring covering first...last, or an empty string when
# either bound is nil.
def content_string(first, last)
  return "" if first.nil? || last.nil?

  span = last - first
  @content[first, span]
end
parse(content) click to toggle source

Start parsing

# File lib/bayeux/bayeux.rb, line 11
# Split +content+ into paragraph-level blocks and build one tree per block.
#
# The text is scanned from start to end. At each step the next ' ', ']' and
# '[' are located to decide whether the text at the cursor is a plain
# paragraph or a "[type ...]" block. A ParaBlock is created for it, the end
# of the block is found according to its type, and a Tree::TreeNode wrapping
# the block is appended to @block_forest. Block types whose text may contain
# inline markup are additionally expanded with para_subtree.
#
# content:: the full document text (also stored in @content).
#
# Side effects: (re)initialises @parse_log, @content, @block_forest and
# @block_index.
#
# NOTE(review): as written the method appears to always return nil (either
# an explicit +return nil+ on error, or the value of the +while+ loop), so
# callers presumably read the result from @block_forest -- confirm.
def parse(content)
  
  # Set-up the logger
  @parse_log = Logger.new('bayex_parser')
  @parse_log.outputters = Outputter.stdout
  
  # Save the content for later
  @content = content
  
  # Final string to return to the caller
  # NOTE(review): pandoc_str is never used again in this method.
  pandoc_str = String.new
  
  # Forest containing the paragraph level block trees
  @block_forest = Array.new
  
  # Index of the paragraph blocks
  @block_index = 0
      
  # Indices into the content, denoting the current limits of
  # parsing
  from_index = 0
  to_index = content.length
  
  ## Procedure for adding a node to the tree. Each call advances
  ## @block_index by 100 and uses the new value as the node name.
  new_node = ->(node_type = :none, node_contents = ""){

    @block_index += 100
    @parse_log.info {"creating node #{@block_index}"}
    return Tree::TreeNode.new(@block_index, ParaBlock.new(node_type, node_contents))
    
  }
  
  # Chunk the content into paragraph blocks
  while from_index < to_index do
    
    # Look for the next space, the next ']' and the next '[' character,
    # which together should tell us what type of paragraph we are dealing with.
    # NOTE(review): despite its name, left_sb_index holds the position of
    # the next ']' (the closing bracket).
    space_index = content.index(' ', from_index)
    left_sb_index = content.index(']', from_index) 
    block_start = content.index('[', from_index)
    
    if block_start.nil? then
      # No '[' remains: it's a paragraph, provided there is still a space
      unless space_index.nil? then
        block = ParaBlock.new(:paragraph, "")
        block_start = from_index
      else
        # No more blocks
        break
      end
    # NOTE(review): String#index returns nil when nothing is found, so the
    # comparisons below raise NoMethodError if the remaining text has a '['
    # but no ']' or no ' ' -- confirm whether that input can occur.
    elsif (left_sb_index < space_index) and (left_sb_index > block_start) then
      # This should be a block start marker: the ']' comes before the next
      # space, so the type is the text between '[' and ']'
      orig_type_length = (left_sb_index - 1) - block_start
      block = ParaBlock.from_s(content[block_start + 1, orig_type_length])
      
    elsif space_index > block_start then
      # This should be an annotated block: the type is the text between
      # '[' and the following space
      orig_type_length = space_index - (block_start + 1)  
      block = ParaBlock.from_s(content[block_start + 1, orig_type_length])
      
    elsif space_index < block_start then
      # A space occurs before the next '[': it's a paragraph starting at
      # the cursor
      # NOTE(review): space_index cannot be nil here (the comparison above
      # would already have raised), so the else branch looks unreachable.
      unless space_index.nil? then
        block = ParaBlock.new(:paragraph, "")
        block_start = from_index
      else
        @parse_log.error {"Cannot find a valid block (syntax error in the start of block characters)"}
        return nil
      end
    end
            
    # Now we know the type of block, and where it starts, work out
    # where the end of the block is
    case block.type
      when :h1,:h2,:h3,:h4,:h5,:h6,:bib
        # The block ends at the ']' found above
        block_end = left_sb_index
        
        # Add the node, and paragraph sub_tree, to the block forest
        # NOTE(review): the fixed offset of 4 presumably skips a marker such
        # as "[h1 "; for :bib the marker would be longer -- confirm.
        block.content = content_string(block_start + 4, block_end)
        para_node = Tree::TreeNode.new(@block_index, block)
        para_subtree(para_node)
        @block_forest << para_node
        
      when :paragraph
        # Look for the next line that begins with a whitespace character
        # (this includes an empty line)
        # NOTE(review): if no such line exists block_end is nil, the node
        # gets empty content, and the method returns nil further down.
        block_end = content.index(/^\s/, block_start)
        
        # Add a node to the block forest
        block.content = content_string(block_start, block_end)
        para_node = Tree::TreeNode.new(@block_index, block)
        para_subtree(para_node)
        @block_forest << para_node
        
      when :medskip
        # Self-terminating
        # (9 is presumably the length of the "[medskip]" marker)
        block_end = block_start + 9
        
        # Add a node to the block forest
        para_node = Tree::TreeNode.new(@block_index, block)
        @block_forest << para_node

      when :command, :output, :file
        # Look for a terminating '[end]'
        block_end = content.index('[end]', block_start)
        # Content begins after the type marker (type length plus 2)
        content_index = block_start + (block.orig_type_length + 2)
        
        # Add a node to the block forest, and do no further processing
        block.content = content_string(content_index, block_end)
        para_node = Tree::TreeNode.new(@block_index, block)
        @block_forest << para_node
        
        # Skip the 'end'
        # NOTE(review): raises NoMethodError if no '[end]' was found
        # (block_end is nil); the same applies to the branches below.
        block_end += 5
        
      when :code
        # Look for a terminating '[end]'
        block_end = content.index('[end]', block_start)
        content_index = block_start + (block.orig_type_length + 2)
        
        # Look for arguments to the block
        # NOTE(review): this tests the character right after from_index,
        # not the one after the block type -- confirm intent.
        unless content[from_index + 1] == ']' then
                      
          # Find the end of the arguments (a ']' followed by whitespace).
          # The offset of 6 presumably skips "[code " -- confirm.
          arg_end = content.index(/]\s/, from_index)
          arguments = content[from_index + 6, (arg_end - (from_index + 6))] 
          
          # Create the parent node for this content
          code_start = content.index(/^/, arg_end)
          block.content = content_string(code_start, block_end)
          para_node = Tree::TreeNode.new(@block_index, block)
                
          # Split the arguments on '|'
          items = arguments.split('|')

          # First argument: the code language (empty if absent)
          unless items[0].nil? then
            para_node << new_node.call(:code_language, items[0].strip)
          else
            para_node << new_node.call(:code_language, "")
          end

          # Second argument: the starting line number ("0" if absent)
          unless items[1].nil? then
            para_node << new_node.call(:code_start_number, items[1].strip)
          else
            para_node << new_node.call(:code_start_number, "0")
          end
          
        else
          
          # Create the parent node for this content
          block.content = content_string(content_index, block_end)
          para_node = Tree::TreeNode.new(@block_index, block)
          
          # No arguments given: attach the default language and start number
          para_node << new_node.call(:code_language, "")            
          para_node << new_node.call(:code_start_number, "0")
          
        end
        
        # Add a node to the block forest, and do no further processing
        @block_forest << para_node
        
        # Skip the 'end'
        block_end += 5
        
      when :block_quote, :figure, :note, :quote
        # Look for a terminating '[end]'
        block_end = content.index('[end]', block_start)
        content_index = block_start + (block.orig_type_length + 2)
        
        # Add a node to the block forest
        block.content = content_string(content_index, block_end)
        para_node = Tree::TreeNode.new(@block_index, block)
        para_subtree(para_node)
        @block_forest << para_node
        
        # Skip the 'end'
        block_end += 5
        
      when :dl
        # Look for a terminating '[end]'
        block_end = content.index('[end]', block_start)
        
        ## Description List blocks are really two blocks, so should be
        ## parsed as such. First we look for (and parse) the description
        ## header, then we look for (and parse) the description body. We
        ## keep doing this until we run out of things to do.
        
        # Create the master node for the list. This node is built with
        # new_node (which advances @block_index); the parsed `block` itself
        # is not attached to the tree in this branch.
        dl_node = new_node.call(:dl)
        
        # Get the list contents
        content_index = block_start + (block.orig_type_length + 2)
        list_content = content_string(content_index, block_end)
        
        # First break the list into components, separated by empty lines
        list_items = list_content.split(/^\n/)
        
        # Process each list item
        list_items.each{|list_item|
          
          # Break the list item into a description header and
          # description body: the header runs from after '[item' up to a
          # ':' at the end of a line, the body from there to the last ']'
          # NOTE(review): any of these three lookups may return nil for a
          # malformed item, which would raise in the arithmetic below.
          break_at = list_item.index(/:$/)
          item_index = list_item.index('[item')
          item_end = list_item.rindex(']')
          
          description_header = list_item[item_index + 5, break_at - 5]
          description_body = list_item[break_at + 1, (item_end - break_at - 1)]
          
          # Parse the description header
          header_node = new_node.call(:dl_header, description_header)
          para_subtree(header_node)
          dl_node << header_node
          
          # Parse the description body
          body_node = new_node.call(:dl_text, description_body)
          para_subtree(body_node)
          dl_node << body_node
          
        }
        
        # Add the node to the block forest
        @block_forest << dl_node
        
        # Skip the 'end'
        block_end += 5
        
      when :ol, :ul
        # Lists are special because they can be nested. Hence we
        # have to be a bit careful about the end marker: only an '[end]'
        # followed by an empty line, or at the very end of the content, counts.
        block_end = content.index(/\[end\]((\n\n)|($\z))/, block_start)
        
        if block_end.nil? then
          @parse_log.error {"Error: Invalid end to a list"}
          @parse_log.error {"Backtrace: #{to_s}"}
          return nil
        end
        
        # Add a node to the block forest
        block.content = content_string(block_start + 4, block_end)
        para_node = Tree::TreeNode.new(@block_index, block)
        para_subtree(para_node)
        @block_forest << para_node
        
        # Skip the 'end'
        block_end += 5
        
      else
        @parse_log.error {"Error: Unknown block type '#{block.type}'"}
        @parse_log.error {"Backtrace: #{to_s}"}
        return nil
        
    end
    
    # Reserve the next hundred node names for the next top-level block
    @block_index += 100
    
    # Move the cursor past the block, then skip any newline characters
    unless block_end.nil? then
      from_index = block_end + 1
      until not (content[from_index] =~ /\n/) do
        from_index += 1
      end
    else
      # The end of the block could not be located
      return nil
    end
    
  end
  
end
to_json(*a) click to toggle source
# File lib/bayeux/bayeux.rb, line 306
# Serialise this object to a JSON string.
#
# The output is an object with two members: "block_forest" (the parsed
# forest held in @block_forest) and the JSON create-id key mapped to this
# object's class name.
#
# a:: accepted for compatibility with the usual to_json signature.
#     NOTE(review): these arguments are not forwarded to the nested
#     to_json call.
#
# Returns the JSON string, or the result of +warn+ (nil) if the JSON
# library cannot be loaded.
def to_json(*a)
  # Loaded lazily so the class remains usable without the JSON library.
  require 'json'

  payload = {}
  payload["block_forest"] = @block_forest
  payload[JSON.create_id] = self.class.name

  payload.to_json
rescue LoadError
  warn "The JSON gem couldn't be loaded, and so the JSON representation could not be generated"
end
to_s() click to toggle source

Generators: Convenience functions taking an AST (Abstract Syntax Tree),

and returning a document of the specified type
# File lib/bayeux/bayeux.rb, line 288
# Render a human-readable dump of the parsed forest.
#
# For every tree in @block_forest a "Paragraph Type" heading is written,
# followed by the name, type, depth and wrapped, indented contents of each
# node yielded by the tree's +each+.
#
# Relies on String#line_wrap and String#indent, which are defined outside
# this method.
#
# Returns the accumulated report as a String.
def to_s
  @block_forest.each_with_object(String.new) do |tree, report|
    report << "Paragraph Type: #{tree.content.type}\n" << "\n"

    tree.each do |node|
      report << "  Node Name:     #{node.name}\n"
      report << "  Node Type:     #{node.content.type}\n"
      report << "  Node Depth:    #{node.node_depth}\n"

      # The contents are wrapped and indented by 17 columns; the first 17
      # characters are then dropped so the opening line sits beside the label.
      wrapped = node.content.content.inspect.line_wrap(70,2).indent(17)
      visible = wrapped[17, wrapped.length]
      report << "  Node Contents: |#{visible}|\n\n"
    end
  end
end

Private Instance Methods

para_subtree(tree_root) click to toggle source

Return the sub-tree of paragraph blocks, given a suitably initialised root of a tree

# File lib/bayeux/bayeux.rb, line 354
# Expand the text held by +tree_root+ into a sub-tree of inline nodes.
#
# The root's content is read as a string and walked one codepoint at a time.
# Ordinary characters accumulate in a "register" node; the codepoints listed
# in +stop_at+ trigger structural actions (opening or closing a nested block,
# emitting dash/ellipsis nodes, collapsing whitespace). Afterwards empty
# :none nodes are pruned, node types are patched in a breadth-first pass,
# and the root's own content is cleared so that consumers walk its children.
#
# tree_root:: a tree node whose +content+ holds the text to parse and whose
#             +name+ is used (via to_i) as the base for child node names.
#
# The tree is modified in place. The method's value is whatever
# +tree_root.content.clear+ returns; the calls visible in #parse ignore it.
def para_subtree(tree_root)
      
  # Set containing the codepoints that need special handling:
  # 10 LF, 12 FF, 13 CR, 32 space, 34 '"', 45 '-', 46 '.', 91 '[', 93 ']'
  stop_at = Set.new [10,12,13,32,34,45,46,91,93]
  
  # Value of the current block; child nodes are named by incrementing this
  block_index = tree_root.name.to_i
      
  # Flag indicating the start of a new block
  block_start = false
  
  # String holding the list of characters to parse.
  # Note: this local shadows the content_string method within this method.
  content_string = tree_root.content.to_s
  
  # Massage the content string to replace pseudo-blocks: text in single
  # quotes preceded by whitespace becomes a [single_quote ...] block
  content_string.gsub!(/\s'([^']+)'/, ' [single_quote \1]')
  
  # Index into the content string
  string_index = 0
  
  ## Procedure for adding a node to the tree
  new_node = ->(node_type = :none, node_contents = ""){

    block_index += 1
    @parse_log.debug {"creating node #{block_index}"}
    return Tree::TreeNode.new(block_index, ParaBlock.new(node_type, node_contents))
    
  }
  
  # Register holding the current paragraph string being built
  register = new_node.call
  
  # Add new nodes to here
  add_to = tree_root
  
  ## Procedure for attaching the tree node held in the register to the current attachment
  ## point of the growing tree
  attach_register = -> {
        
    begin
      @parse_log.debug {" > attaching node #{register.name} as a child of #{add_to.name} (#{register.content.to_debug_s})"}
      add_to << register
    rescue
      # Dump the tree built so far to help diagnose the failure, then re-raise
      tree_root.print_tree
      raise
    end
      
  }
  
  ## Procedure for adding the register to the tree, at the current level. This also
  ## creates a new node at the same level, and sets the register to point to that
  ## node
  add_current_node = -> {
    
    # Save a pointer to the current node
    attach_register.call      
    
    # Add a new node at the same level as the current_node
    register = new_node.call
    
  }
  
  ## Procedure for moving down the tree. This involves saving the current contents of the
  ## register as a new node at the current level, then creating a new node as a child
  move_down_tree = -> {
    @parse_log.debug {"-> going down #{register.name} (#{register.content.to_debug_s})"}
    
    # Save the current node, pointed to by the register
    attach_register.call
      
    # Add future nodes as the child of this node
    add_to = register
    
    # Create a new node as the first child
    register = new_node.call

    # Flag the start of a new block
    block_start = true            
  }
  
  ## Procedure for moving up the tree, from the current node
  ## (identified by the register), to the next level up
  move_up_tree = -> {
    @parse_log.debug {"<- going up #{register.name} (#{register.content.to_debug_s})"}
  
    # Save the current node, pointed to by the register
    attach_register.call
    
    # Move up the tree
    # NOTE(review): if add_to is already the root, add_to.parent is
    # presumably nil and a later attach would fail -- confirm that
    # unbalanced ']' cannot reach here.
    add_to = add_to.parent
    register = new_node.call

    # Remove any 'empty' children (sub-tree nodes that we might
    # have just added, but which in fact could be trimmed)
    #clean_children.call
      
  }
  
  ###
  ### Parse the content
  ###
  
  # NOTE(review): several branches below read content_string[string_index - 1];
  # at string_index 0 that is index -1, i.e. the LAST character of the string.
  content_string.each_codepoint{|codepoint|
    
    # Check if we need to stop, in order to process the next block
    if stop_at.include?(codepoint) then
      
      # If a stop has been requested, undertake the action
      # indicated by the current codepoint
      case codepoint
          
        # Check if this is a new block (we may be dealing with a block
        # terminated by a new-line, instead of a space)
        when 10,12,13            
          
          if block_start then
            
            # Record the block type, and re-set the register content
            register.content.content_to_type!
            
            # Unset the start of a new block flag
            block_start = false
          
          elsif not ((register.content.empty?) or (content_string[string_index - 1] =~ /^\s/)) then
            # Otherwise treat the new-line as a space, but suppress multiple spaces
            register.content << " "
          end
        
        # On a space character, first check if the flag indicating the start
        # of a block. If set, this should indicate that this space follows the
        # name of the block
        when 32
          
          if block_start then
            
            # Record the block type if not already set
            if register.content.type == :none then 
              register.content.content_to_type!
            else
              register.content << " "
            end
            
            # Unset the start of a new block flag
            block_start = false
          
          else

            # If this is not the start of a block, pass on the space (but suppress multiple spaces)
            unless content_string[string_index - 1] =~ /^\s/ then
              register.content << " "
            end

          end
          
        # On a double quote, we need to check if this opens or closes a
        # double quote block, or is something else
        when 34

          # If the last character was whitespace or a non-word character,
          # this should be the start of a double quote block
          if content_string[string_index - 1] =~ /\s|\W/ then

            # Create a new sub-block of the current block
            move_down_tree.call

            # Set the block type
            register.content.type = :double_quote
                    
          # If the next character is not a word character, then this should
          # be the end of a double quote block
          elsif not content_string[string_index + 1] =~ /\w/ then

            # Move back to the parent of the quote block
            move_up_tree.call

          # No significance, so pass it on
          # NOTE(review): the character handled here is '"' (34) but a
          # single quote is appended -- confirm whether that is intended.
          else
            register.content << "'"
          end
          
        # Count the minus signs: this could be a hyphen, an em-dash or an en-dash
        when 45
          
          # Only act on the last '-' of a run (one followed by whitespace or a
          # word character); any other '-' is dropped from the output here.
          if content_string[string_index + 1] =~ /\s|\w/ and not content_string[string_index + 1] == '-' then
            
            # Three dashes in a row: em-dash
            if ((content_string[string_index - 1] == '-') and (content_string[string_index - 2] == '-')) then
              
              # Add the current content, and create a new node at the same level
              add_current_node.call
                
              # Change the type of the register, and add it to the tree (which also
              # creates a new node at the same level for the following content)
              register.content.type = :em_dash
              add_current_node.call
              
            # Two dashes in a row: en-dash
            elsif content_string[string_index - 1] == '-' 

              # Add the current content, and create a new node at the same level
              add_current_node.call
                
              # Change the type of the register, and add it to the tree (which also
              # creates a new node at the same level for the following content)
              register.content.type = :en_dash
              add_current_node.call

            # A single dash: an ordinary hyphen
            else
              register.content << "-"
            end
            
          end
          
          # Count the dots: this could be an ellipsis
        when 46
          register.content << "."

          # NOTE(review): because of the `$` alternative this pattern appears
          # to match any non-nil next character -- confirm intent.
          if content_string[string_index + 1] =~ /\s|\w|$/ then
            
            # Work out how many dots there are: split[2] is whatever follows
            # the last whitespace/word character in the register
            split = register.content.content.rpartition(/\s|\w/)
            
            unless split[0].empty? then
                              
              case split[2].length
                
                # Three dots: strip them from the text, then emit an :elipses node
                when 3
                  register.content.content = split[0] + split[1]
                  add_current_node.call

                  register.content.type = :elipses
                  add_current_node.call
                  
                # Four dots: emit an :elipses_stop node
                # NOTE(review): unlike the three-dot case, the register is
                # attached BEFORE its content is trimmed, and the trimmed
                # text is then stored in the new :elipses_stop node -- confirm.
                when 4
                  add_current_node.call
                  register.content.content = split[0] + split[1]

                  register.content.type = :elipses_stop
                  add_current_node.call                      
              end
            end
          end

          
        # On the start of a new block, first record the old block
        # then initialize the new
        when 91            
          
          # Move down the tree
          move_down_tree.call

          # Flag the start of a new block (move_down_tree has already set this)
          block_start = true

        # At the termination of a block, transform the block text to the
        # literal equivalent
        when 93

          ## Do any special processing ##

          # Suppress block characters from lists that are
          # dealt with elsewhere
          if register.content.type == :none then

            case register.content.to_s
              when "end", "ol", "ul"
                register.content.clear
            end

          end
          
          # Split items in links
          if register.content.type == :link then
            # Split the item on '|' into text and target
            items = register.content.to_s.split('|')

            # Create a node for the link title
            unless items[0].nil? then
              title = new_node.call(:link_text, items[0].strip)
              register << title
            end

            # Create a node for the link target
            unless items[1].nil? then
              target = new_node.call(:link_target, items[1].strip)
              register << target
            end
          end
          
          # Add the register to the tree, and move back up to
          # the previous level
          move_up_tree.call
       
      end
    else
      # If we are not stopping, just add the codepoint to the register
      # (the Integer codepoint is passed as-is to the content's << method)
      register.content << codepoint
    end
    
    # Increment the string index
    string_index += 1
          
  }
  
  # Add the last node to the tree
  attach_register.call
      
  ## Prune any null nodes: this makes the tree smaller, and means we can identify
  ## places where the branches need to be re-ordered
  
  # NOTE(review): nodes are removed while the tree is being iterated --
  # confirm the tree library tolerates this.
  tree_root.each{|node|
    if node.content.type == :none and node.content.empty? then
      @parse_log.debug {"--- pruning node #{node.name}"}
      node.remove_from_parent!
    end
  }
  
  ## Now we need to walk the AST, patching up the types so that enclosing
  ## types work properly. This requires finding all the :none nodes, and
  ## working out the correct type from the found siblings. To make sure
  ## we infer the correct type, we need to do a breadth first walk

  tree_root.breadth_each{|node|
          
    # First look 'across' the tree, looking for cases where enclosing node
    # have not been properly identified
    if node.is_first_sibling? then
      
      # Check the type of the last sibling: if this is :none, then
      # we have something like a list, where the paragraph text needs to
      # be separated from the element type
      if node.first_sibling.content.type != :none and node.last_sibling.content.type == :none then
        @parse_log.info {"examining node #{node.name}"}
        
        # Patch the type hierarchy. The easiest way to do this is to
        # look for a parent whose type is also :none: if found we can
        # 'borrow' the node as the item type
        if node.parent.content.type == :none and node.parent.content.empty? then
          @parse_log.info {"changing parent type"}
          node.parent.content.type = node.content.type
          node.content.type = :none
          
        # Otherwise we have to work a bit harder, separating us and
        # our siblings, creating a new node of the correct type, and
        # binding everything back together
        else
          @parse_log.info {"re-ordering siblings"}           
          siblings = node.siblings
          
          new_root = new_node.call(node.content.type)
          node.parent << new_root
          
          node.remove_from_parent!
          new_root << node
          # Note: the block parameter below shadows the outer `node`
          siblings.each{|node|
            @parse_log.info {"removing node #{node.name}"}
            @parse_log.info {"attaching to #{new_root.name}"}
            node.remove_from_parent!
            new_root << node
          }

        end
              
      end
    end
  } 
    
  # We need to preserve the children, so delete the content from
  # the parent to tell users of the AST to walk our children
  tree_root.content.clear
    
end