require "correspondence-markup/types"

# Grammar for a markup language which can be compiled into the HTML format
# required by correspondence.js

# General note on bracketing of sequences: different components are enclosed
# by different types of bracket, in particular:
#
# * item: "[]"
# * item-group: "[]"
# * structure: "{}"
# * structure group: "()"
#
# However, in anticipation of a UI where the user may choose the granularity
# at which to edit components of particular content, the parsing of brackets
# is handled by the parent component, e.g. the "{}" bracketing of structures
# is specified in the grammar rule for structure_group.
# The motivation for this is: if a user is editing a structure definition in a UI text area,
# there should be no necessity for the user to enter the enclosing "{}" brackets,
# because the UI text area implicitly encloses the definition that the user is editing.
# So the software would want to parse the definition of a structure without the {}.
# (And if the user was editing each item group in a separate text field, then
# similarly the user would not want to include the outer "[]" brackets in each
# item group definition.)

grammar CorrespondenceMarkupLanguage

# Include the Module containing Ruby classes representing the AST nodes
include CorrespondenceMarkup

# Top-level rule: a sequence of structure groups (intended to be displayed
# together on one web page). Each group is written inside "(" and ")", and
# whitespace is allowed before, between and after the groups.
# The grammar treats the groups as independent of each other, although a
# typical document is a sequence of groups whose structures use the same
# sequence of languages, e.g. three groups of two structures each with
# languages spanish/english, spanish/english, spanish/english, representing
# the translations of three verses of a song.
rule structure_groups
  s groups:("(" structure_group ")" s)*
  {
    # Returns an array holding the value of each parenthesised structure
    # group, in source order (empty when there are no groups).
    def value
      groups.elements.map do |bracketed_group|
        bracketed_group.structure_group.value
      end
    end
  }
end

# A structure group is a set of structures, each of which presents the same
# information in a different "language".
# Syntax: an optional description, followed by zero or more structures, each
# written inside "{" and "}", with optional whitespace around every part.
rule structure_group
  s description:structure_group_description? s structures:("{" structure "}" s)*
  {
    # Returns a StructureGroup built from the values of the braced
    # structures, in source order.
    # NOTE(review): the optional description is parsed by the rule but is not
    # passed on to the StructureGroup here - confirm whether that is intended.
    def value
      structure_values = structures.elements.map do |braced_structure|
        braced_structure.structure.value
      end
      CorrespondenceMarkup::StructureGroup.new(structure_values)
    end
  }
end

# Optional lengthy description of a particular structure group.
# (Intended to be displayed as the title of the structure group
# describing the information presented in the structure group.)
#
# Syntax: a "#" character, optional whitespace, then any run of characters
# other than "{" and newline, terminated by a mandatory newline.
# Because "{" is excluded from the description text and a newline must follow
# immediately, a description line cannot contain a "{" character.
# NOTE(review): the whitespace rule "s" after the "#" also accepts newlines,
# so an empty description line lets the following line be read as the
# description - confirm this is acceptable.
rule structure_group_description
  "#" s [^{\n]* "\n"
end

# A structure is one member of a structure group. A group normally holds two
# or more structures, but the grammar imposes no count restriction, in
# anticipation of application users editing and saving incomplete content.
# A structure consists of:
# * an annotation giving its "type" (a short language description intended to
#   map to a CSS class) and its "description" (a longer but still concise
#   language description for display to the reader), and
# * a sequence of item groups, each written inside "[" and "]".
rule structure
  structure_annotation s itemGroups:("[" item_group "]" s)*
  {
    # Returns a Structure built from the annotation's class name and
    # description plus the values of the bracketed item groups, in source order.
    def value
      class_name, description = structure_annotation.value
      item_group_values = itemGroups.elements.map do |bracketed_group|
        bracketed_group.item_group.value
      end
      CorrespondenceMarkup::Structure.new(class_name, description, item_group_values)
    end
  }
end

# Structure class (for the structure's "type"), with rules similar to those of a CSS class identifier:
# one ASCII letter followed by any number of ASCII letters, digits, underscores or hyphens.
# The whole pattern is optional, so this rule also matches the empty string
# (i.e. a structure may have no class).
rule structure_class
  ([a-zA-Z] [a-zA-Z0-9_-]*)?
end

# Structure annotation: the structure's "type" (its class) optionally followed
# by a description section. Both parts are optional.
# The description section is a ":" followed by optional whitespace, the
# description text (everything up to the end of the line) and a newline.
# NOTE(review): the whitespace rule "s" after the ":" also accepts newlines,
# so an empty description followed by a newline lets the next line be read as
# the description - confirm this is acceptable.
rule structure_annotation
  structure_class description_section:(":" s description:[^\n]* "\n")?
  {
    # Returns a two-element array: the class name (possibly an empty string)
    # and the description text, or nil when no description section is present.
    def value
      # An unmatched optional section has no child elements.
      description =
        if description_section.elements
          description_section.description.text_value
        end
      [structure_class.text_value, description]
    end
  }
end

# An item group is a sub-structure of a structure, holding a sequence of items
# and "non-items".
# It may start with an upper-case alphabetic ID followed by ":". The ID should
# be unique within a structure, and should match the ID of an item group in
# the first structure of the structure group, but the grammar enforces neither
# of these rules.
# The item group ID is the default prefix for any item ID that does not start
# with alphabetic characters (so a full item ID is always alphabetic followed
# by something else).
rule item_group
  optional_id:(id:[A-Z]* ":")? components:(item / non_item)*
  {
    # Returns an ItemGroup holding the group ID (an empty string when the ID
    # section is absent) and the values of its components, in source order.
    def value
      group_id = ""
      # When the ID section matched, its first child element is the ID text.
      group_id = optional_id.elements.first.text_value if optional_id.elements
      component_values = components.elements.map do |component|
        component.value(group_id)
      end
      CorrespondenceMarkup::ItemGroup.new(group_id, component_values)
    end
  }
end

# A "non-item" is textual content in an item group that does not belong to any item.
# It is text that either has no translation among the other structures of the
# same structure group, or whose translation is not considered worth identifying.
# An example of the second case is punctuation in sentences: its translation is
# reasonably obvious, and only the translations of the actual words are to be
# highlighted.
rule non_item
  text:text
  {
    # Returns a NonItem holding the de-quoted text.
    # The group_id argument is accepted so that items and non-items can be
    # called the same way; it is not used here.
    def value(group_id = "")
      plain_text = text.value
      CorrespondenceMarkup::NonItem.new(plain_text)
    end
  }
end

# An item is textual content with an ID; items in the same structure group
# that share an ID are considered to be related to each other.
# Typically, items with the same ID in the same structure are parts of the
# "same item", and items with the same ID in different structures are
# translations of each other.
# An item ID is an upper-case alphabetic prefix followed by a numeric ID.
# An ID written without the alphabetic prefix gets the ID of the containing
# item group as its prefix.
# (This reflects the assumption that an item usually relates to items in item
# groups with the same item group ID in other structures, but occasionally
# relates to an item in some other item group in another structure.)
# Syntax: "[", the comma-separated IDs, mandatory whitespace, the text, "]".
rule item
  "[" id:item_ids S text:text "]"
  {
    # Given the item group ID (the default prefix for item IDs), returns an
    # Item holding the comma-joined full IDs and the de-quoted text.
    def value(group_id = "")
      full_ids = id.text_value.split(",").map do |raw_id|
        # An ID containing an upper-case letter already has its own prefix.
        raw_id =~ /[A-Z]/ ? raw_id : group_id + raw_id
      end
      CorrespondenceMarkup::Item.new(full_ids.join(","), text.value)
    end
  }
end

# Text is the textual component of both items and non-items.
# Text is delimited by "]", "[" and (at the beginning of items) whitespace.
# Text can include backslash-quoted characters, for example to include any of the delimiter characters.
# Syntax: one or more units, each either a backslash followed by any single
# character (including a newline), or any single character other than
# "[", "]" and backslash.
rule text
  (("\\" .) / (![\[\]\\] .))+
  {
    # Returns the matched text with each backslash-quoted character replaced
    # by the character itself.
    def value
      # The /m flag lets "." match a newline, mirroring the grammar's "."
      # which accepts any character after a backslash. Without it a
      # backslash-quoted newline would keep its backslash.
      text_value.gsub(/\\(.)/m, '\1')
    end
  }
end

# Items can have multiple IDs, in which case they are separated by commas
# (and no whitespace). If there are multiple IDs, the convention of applying the
# item group ID as a default prefix is applied individually to each ID.
# So, for example, "2,A2,3" in item group B would be expanded to "B2,A2,B3".
# Syntax: one item ID followed by zero or more ","-prefixed item IDs.
rule item_ids
  item_id ("," item_id)*
end

# An item ID - optional upper-case alphabetic prefix, followed by a numeric ID.
# Syntax: zero or more upper-case ASCII letters, then one or more decimal digits.
rule item_id
  [A-Z]* [0-9]+
end

# Rule for optional whitespace: zero or more whitespace characters, so it also
# matches the empty string. Newlines are accepted as whitespace.
# NOTE(review): "\s" presumably already covers "\n", "\r" and "\t", which would
# make the explicit entries redundant - confirm before simplifying.
rule s
  [\s\n\r\t]*
end

# Rule for mandatory whitespace: one or more whitespace characters.
# Newlines are accepted as whitespace.
# NOTE(review): "\s" presumably already covers "\n", "\r" and "\t", which would
# make the explicit entries redundant - confirm before simplifying.
rule S
  [\s\n\r\t]+
end

end