Module RTFDE.utils
Expand source code
# -*- coding: utf-8 -*-
#
# This file is part of package name, a package description short.
# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
import difflib
import sys
import re
from typing import Union, AnyStr, Any
# from Python 3.9 typing.Generator is deprecated in favour of collections.abc.Generator
from collections.abc import Generator
from lark.lexer import Token
from lark.tree import Tree
from lark import Lark
import logging
log = logging.getLogger("RTFDE")
def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
    """Returns the hex encoded value of a .rtf control parameter.

    Args:
        control_parameter: (int/str) Int or a string which represents an int.
            ASCII-digit ``bytes`` are accepted as well.

    Returns:
        The value formatted with ``#06x``: a ``0x`` prefix followed by the
        hexadecimal digits, zero padded to 6 characters in total
        (e.g. ``16`` -> ``"0x0010"``).

    Raises:
        ValueError: If ``control_parameter`` cannot be parsed as an integer.
    """
    # int() leaves ints untouched and parses str *and* bytes digits, so a single
    # code path covers every accepted input type. The previous try/except only
    # recovered from the ValueError raised for str; bytes raised TypeError.
    return f"{int(control_parameter):#06x}"
def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
    """Dumps an object to a file for quick debugging.

    Warning: Not for normal use. Only use when debugging.

    Args:
        data (bytes|str): Data to write to path. ``bytes``/``bytearray`` are
            written verbatim in binary mode; anything else is printed in text
            mode (so a trailing newline is appended, as ``print`` does).
        path (str): The file path to write data to. An existing file is
            truncated.
    """
    # Write straight to the file handle instead of temporarily swapping
    # sys.stdout: print() cannot write str into a binary-mode file, and the
    # resulting exception used to leave sys.stdout pointing at a closed file.
    if isinstance(data, (bytes, bytearray)):
        with open(path, 'wb+') as fp:
            fp.write(data)
    else:
        with open(path, 'w+') as fp:
            print(data, file=fp)
def encode_escaped_control_chars(raw_text: bytes) -> bytes:
    """Replaces escaped control chars within the text with their RTF encoded versions \\'HH.

    The three escapes handled are an escaped backslash, an escaped opening
    brace and an escaped closing brace.

    Args:
        raw_text (bytes): Raw RTF bytes which need their escape characters encoded.

    Returns:
        A copy of ``raw_text`` with the escaped control chars hex encoded.
    """
    # Order matters: escaped backslashes are consumed first so that the second
    # backslash of a pair is never mistaken for the start of an escaped brace.
    substitutions = (
        (b'\\\\', b"\\'5c"),
        (b'\\{', b"\\'7b"),
        (b'\\}', b"\\'7d"),
    )
    encoded = raw_text
    for escaped, hex_form in substitutions:
        encoded = encoded.replace(escaped, hex_form)
    return encoded
def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
    """Checks if a Token is a codeword with a numeric argument.

    The token's ``value`` is stripped of surrounding whitespace, must start
    with ``codeword`` and everything after the codeword must consist of digits
    only (so a bare codeword or a signed argument does not match).

    Args:
        token: Object expected to expose a ``value`` attribute.
        codeword: The control word prefix to look for.

    Returns:
        True if a Token is a codeword with a numeric argument. False if not,
        including when ``token`` lacks the attributes/methods used here.
    """
    try:
        stripped = token.value.strip()
        if not stripped.startswith(codeword):
            return False
        argument = stripped[len(codeword):]
        return argument.isdigit()
    except AttributeError:
        # Not token-like (no .value, or a value without str/bytes methods).
        return False
def print_lark_parser_evaluated_grammar(parser):
    """Prints the final evaluated grammar.

    Can be useful for debugging possible errors in grammar evaluation.

    Args:
        parser (Lark obj): Lark object to extract grammar from.

    Raises:
        ValueError: If ``parser`` is not a Lark instance.
    """
    if not isinstance(parser, Lark):
        raise ValueError("Requires a Lark object.")
    eq = " " + "=" * 15 + " "
    sections = (
        ("RULES", parser.rules),
        ("TERMINALS", parser.terminals),
        ("IGNORED TOKENS", parser.ignore_tokens),
    )
    for title, entries in sections:
        print(eq + title + eq + "\n")
        for entry in entries:
            # Format rather than concatenate: entries are not guaranteed to be
            # plain strings (NOTE(review): Lark appears to expose rule/terminal
            # definition objects here - confirm), and `" " + obj` raises
            # TypeError for anything that is not a str.
            print(f" {entry}")
def log_validators(data):
    """Log validator logging only if RTFDE.validation_logger set to debug.

    The check is against the logger's own level, which must be exactly DEBUG;
    a level inherited from a parent logger does not count.

    Args:
        data: The message (or object) to emit as a debug record.
    """
    validation_log = logging.getLogger("RTFDE.validation_logger")
    if validation_log.level != logging.DEBUG:
        return
    validation_log.debug(data)
def log_transformations(data):
    """Log transform logging only if RTFDE.transform_logger set to debug.

    The check is against the logger's own level, which must be exactly DEBUG;
    a level inherited from a parent logger does not count.

    Args:
        data: The message (or object) to emit as a debug record.
    """
    transform_log = logging.getLogger("RTFDE.transform_logger")
    if transform_log.level != logging.DEBUG:
        return
    transform_log.debug(data)
def is_logger_on(logger_name, level=logging.DEBUG):
    """Check whether a logger's own level is exactly ``level``.

    Args:
        logger_name: Name of the logger to look up.
        level: The logging level to compare against. Defaults to DEBUG.

    Returns:
        True if the logger's explicitly set level equals ``level``, otherwise
        False. Levels inherited from parent loggers are not considered.
    """
    return logging.getLogger(logger_name).level == level
def log_text_extraction(data):
    """Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug.

    The check is against the logger's own level, which must be exactly DEBUG;
    a level inherited from a parent logger does not count.

    Args:
        data: The message (or object) to emit as a debug record.
    """
    extraction_log = logging.getLogger("RTFDE.text_extraction")
    if extraction_log.level != logging.DEBUG:
        return
    extraction_log.debug(data)
def log_htmlrtf_stripping(data: Token):
    """Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug.

    When that logger's own level is exactly DEBUG, a record describing the
    token's value, line span and position span is emitted. Otherwise the call
    does nothing (and the argument is not inspected at all).

    Args:
        data: The token that was removed.

    Raises:
        AttributeError: Will occur if you pass this something that is not a token.
    """
    stripping_log = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger")
    if stripping_log.level != logging.DEBUG:
        return
    if not isinstance(data, Token):
        raise AttributeError("HTMLRTF Stripping logger only logs Tokens")
    template = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}"
    reported = ("value", "line", "end_line", "start_pos", "end_pos")
    fields = {name: getattr(data, name) for name in reported}
    stripping_log.debug(template.format(**fields))
def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Log diff of two strings. Defaults to splitting by newlines and keeping the ends.

    Logs the result in the main RTFDE logger as a debug log. Warning: Only use
    when debugging as this is too verbose to be used in regular logging.

    Args:
        original: The original string
        revised: The changed version of the string
        sep (bytes): A pattern to split the string by. Uses re.split under the
            hood. NOTE: Deletes all empty strings before diffing to make the
            diff more concise.
    """
    # Building the diff is the expensive part; skip it entirely when the debug
    # record would be discarded anyway.
    if not log.isEnabledFor(logging.DEBUG):
        return
    log.debug(get_string_diff(original, revised, sep))
def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.

    Args:
        original: The original string
        revised: The changed version of the string
        sep (bytes): A pattern to split the string by. Uses re.split under the
            hood. NOTE: Newlines are removed before splitting and all empty
            pieces are dropped before diffing to make the diff more concise.

    Returns:
        A string object representing the context diff of the two strings
        provided (empty when there is no difference).
    """
    # This is a debugging aid, so never fail on bytes that are not valid UTF-8:
    # render them as \xNN escapes instead of raising UnicodeDecodeError.
    undecodable = "backslashreplace"
    if sep is None:
        orig_split = original.decode(errors=undecodable).splitlines(keepends=True)
        revised_split = revised.decode(errors=undecodable).splitlines(keepends=True)
    else:
        original = original.replace(b'\n', b'')
        revised = revised.replace(b'\n', b'')
        # `if piece` drops empty pieces and also the None entries re.split
        # produces for capture groups that did not take part in a match.
        orig_split = [piece.decode(errors=undecodable)
                      for piece in re.split(sep, original) if piece]
        revised_split = [piece.decode(errors=undecodable)
                         for piece in re.split(sep, revised) if piece]
    return "\n".join(difflib.context_diff(orig_split, revised_split))
def get_tree_diff(original: Tree, revised: Tree):
    """Get the diff of two trees.

    Both trees are flattened with ``flatten_tree`` and the resulting lists of
    strings are compared with a context diff.

    Args:
        original (lark Tree): A lark tree before transformation
        revised (lark Tree): A lark tree after transformation

    Returns:
        A string object representing the diff of the two Trees provided.

    Example:
        rtf_obj = DeEncapsulator(raw_rtf)
        rtf_obj.deencapsulate()
        transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
        get_tree_diff(rtf_obj.full_tree, transformed_tree)
    """
    # context_diff needs indexable sequences, so materialise both generators.
    flat_original = list(flatten_tree(original))
    flat_revised = list(flatten_tree(revised))
    return "\n".join(difflib.context_diff(flat_original, flat_revised))
def flatten_tree(tree: Tree) -> Generator:
    """Flatten a lark Tree into a stream of repr-like strings.

    Yields a ``Tree('<data>')`` marker for the tree itself, then walks its
    children in order: nested Trees are flattened recursively, while Tokens and
    any other child objects are yielded as their ``repr``.

    Args:
        tree (lark Tree): A lark tree
    """
    yield f"Tree('{tree.data}')"
    for node in tree.children:
        if isinstance(node, Token) or not isinstance(node, Tree):
            # Leaves (tokens and anything else) are rendered via repr().
            yield repr(node)
        else:
            yield from flatten_tree(node)
def flatten_tree_to_string_array(tree: Tree) -> Generator:
    """Flatten a lark Tree into a stream of its leaf values.

    Children are visited in order. Nested Trees are flattened recursively,
    Tokens contribute their ``value`` and any other child is yielded unchanged.
    Unlike ``flatten_tree`` no marker is emitted for the Tree nodes themselves.

    Args:
        tree (lark Tree): A lark tree
    """
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree_to_string_array(node)
            continue
        yield node.value if isinstance(node, Token) else node
def make_token_replacement(ttype, value, example):
    """Create a new Token which borrows its position info from an existing node.

    Args:
        ttype: The type to give the new Token.
        value: The value to give the new Token.
        example (lark Token|Tree): The node whose position (start_pos, end_pos,
            line, end_line, column, end_column) is copied onto the new Token.
            For a Tree the position is read from ``example.meta``.

    Returns:
        A new Token of type ``ttype`` holding ``value``.

    Raises:
        TypeError: If ``example`` is neither a Token nor a Tree.
    """
    if isinstance(example, Token):
        position = example
    elif isinstance(example, Tree):
        # Trees keep their position info on the attached meta object.
        position = example.meta
    else:
        # Previously this fell through and failed with an UnboundLocalError.
        raise TypeError("example must be a lark Token or Tree, "
                        f"not {type(example).__name__}")
    return Token(ttype,
                 value,
                 start_pos=position.start_pos,
                 end_pos=position.end_pos,
                 line=position.line,
                 end_line=position.end_line,
                 column=position.column,
                 end_column=position.end_column)
def embed():
    """Drop into an interactive console inside the caller's scope.

    Debugging helper. Prints the current call stack, then starts a
    ``code.interact`` session whose namespace is a snapshot of the calling
    frame's globals and locals, with tab completion enabled. The readline
    history is loaded from (if present) and written back to
    ``~/.python_history``.

    Note: this imports the ``readline`` module, so it cannot be used where
    that module is unavailable.
    """
    import os
    import readline
    import rlcompleter
    import code
    import inspect
    import traceback
    history = os.path.join(os.path.expanduser('~'), '.python_history')
    if os.path.isfile(history):
        readline.read_history_file(history)
    # The frame of whoever called embed().
    frame = inspect.currentframe().f_back
    # Globals first, then locals on top, so that a local variable shadows a
    # global of the same name just as it does in the calling code.
    namespace = dict(frame.f_globals)
    namespace.update(frame.f_locals)
    readline.set_completer(rlcompleter.Completer(namespace).complete)
    readline.parse_and_bind("tab: complete")
    file = frame.f_code.co_filename
    line = frame.f_lineno
    function = frame.f_code.co_name
    # Drop the last stack entry (this function) so the trace ends at the caller.
    stack = ''.join(traceback.format_stack()[:-1])
    print(stack)
    banner = f" [ {os.path.basename(file)}:{line} in {function}() ]"
    banner += "\n Entering interactive mode (Ctrl-D to exit) ..."
    try:
        code.interact(banner=banner, local=namespace)
    finally:
        # Persist history even if the session ends with an exception.
        readline.write_history_file(history)
Functions
def embed()
-
Expand source code
def embed(): import os import readline import rlcompleter import code import inspect import traceback history = os.path.join(os.path.expanduser('~'), '.python_history') if os.path.isfile(history): readline.read_history_file(history) frame = inspect.currentframe().f_back namespace = frame.f_locals.copy() namespace.update(frame.f_globals) readline.set_completer(rlcompleter.Completer(namespace).complete) readline.parse_and_bind("tab: complete") file = frame.f_code.co_filename line = frame.f_lineno function = frame.f_code.co_name stack = ''.join(traceback.format_stack()[:-1]) print(stack) banner = f" [ {os.path.basename(file)}:{line} in {function}() ]" banner += "\n Entering interactive mode (Ctrl-D to exit) ..." try: code.interact(banner=banner, local=namespace) finally: readline.write_history_file(history)
def encode_escaped_control_chars(raw_text: bytes) ‑> bytes
-
Replaces escaped control chars within the text (an escaped backslash, opening brace or closing brace) with their RTF hex encoded versions \'HH.
Args
raw_text
:str
- string which needs escape characters encoded
Returns
A string with escaped control chars
Expand source code
def encode_escaped_control_chars(raw_text: bytes) -> bytes: """Replaces escaped control chars within the text with their RTF encoded versions \\'HH. Args: raw_text (str): string which needs escape characters encoded Returns: A string with escaped control chars """ cleaned = raw_text.replace(b'\\\\', b"\\'5c") cleaned = cleaned.replace(b'\\{', b"\\'7b") cleaned = cleaned.replace(b'\\}', b"\\'7d") return cleaned
def flatten_tree(tree: lark.tree.Tree) ‑> collections.abc.Generator
-
Flatten a lark Tree into a list of repr's of tree objects.
Args
tree
:lark Tree
- A lark tree
Expand source code
def flatten_tree(tree: Tree) -> Generator: """Flatten a lark Tree into a list of repr's of tree objects. Args: tree (lark Tree): A lark tree """ yield f"Tree('{tree.data}')" for child in tree.children: if isinstance(child, Token): yield repr(child) elif isinstance(child, Tree): for i in flatten_tree(child): yield i else: yield repr(child)
def flatten_tree_to_string_array(tree: lark.tree.Tree) ‑> collections.abc.Generator
-
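Flatten a lark Tree into a stream of its leaf values: nested Trees are flattened recursively, Tokens contribute their value, and any other child is yielded unchanged.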
Args
tree
:lark Tree
- A lark tree
Expand source code
def flatten_tree_to_string_array(tree: Tree) -> Generator: """Flatten a lark Tree into a list of repr's of tree objects. Args: tree (lark Tree): A lark tree """ for child in tree.children: if isinstance(child, Tree): for i in flatten_tree_to_string_array(child): yield i elif isinstance(child, Token): yield child.value else: yield child
def get_control_parameter_as_hex_strings(control_parameter: Union[str, int]) ‑> str
-
Returns the hex encoded value of a .rtf control parameter.
Args
control_parameter
- (int/str) Int or a string which represents an int.
Returns
A hexadecimal string with a 0x prefix, zero padded to 6 characters in total (e.g. 0x0010).
Expand source code
def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str: """Returns the hex encoded value of a .rtf control parameter. Args: control_parameter: (int/str) Int or a string which represents an int. Returns: Zero padded 6 char long hexedecimal string. """ try: return f"{control_parameter:#06x}" except ValueError: # If passed as string convert first control_parameter = int(control_parameter) return f"{control_parameter:#06x}"
def get_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None)
-
Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.
Args
original
- The original string
revised
- The changed version of the string
sep
:string
- A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
Returns
A string object representing the diff of the two strings provided.
Expand source code
def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None): """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends. Args: original: The original string revised: The changed version of the string sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise. Returns: A string object representing the diff of the two strings provided. """ if sep is None: orig_split = original.decode().splitlines(keepends=True) revised_split = revised.decode().splitlines(keepends=True) else: original = original.replace(b'\n',b'') revised = revised.replace(b'\n',b'') orig_split = [i.decode() for i in re.split(sep, original) if i != b''] revised_split = [i.decode() for i in re.split(sep, revised) if i != b''] return "\n".join(list(difflib.context_diff(orig_split, revised_split)))
def get_tree_diff(original: lark.tree.Tree, revised: lark.tree.Tree)
-
Get the diff of two trees.
Args
original
:lark Tree
- A lark tree before transformation
revised
:lark Tree
- A lark tree after transformation
Returns
A string object representing the diff of the two Trees provided.
Example
rtf_obj = DeEncapsulator(raw_rtf) rtf_obj.deencapsulate() transformed_tree = SomeTransformer.transform(rtf_obj.full_tree) get_tree_diff(rtf_obj.full_tree, transformed_tree)
Expand source code
def get_tree_diff(original: Tree, revised: Tree): """Get the diff of two trees. Args: original (lark Tree): A lark tree before transformation revised (lark Tree): A lark tree after transformation Returns: A string object representing the diff of the two Trees provided. Example: rtf_obj = DeEncapsulator(raw_rtf) rtf_obj.deencapsulate() transformed_tree = SomeTransformer.transform(rtf_obj.full_tree) get_tree_diff(rtf_obj.full_tree, transformed_tree) """ log = logging.getLogger("RTFDE") flat_original = list(flatten_tree(original)) flat_revised = list(flatten_tree(revised)) return "\n".join(list(difflib.context_diff(flat_original, flat_revised)))
def is_codeword_with_numeric_arg(token: Union[lark.lexer.Token, Any], codeword: bytes) ‑> bool
-
Checks if a Token is a codeword with a numeric argument.
Returns
True if a Token is a codeword with a numeric argument. False if not.
Expand source code
def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool: """Checks if a Token is a codeword with a numeric argument. Returns: True if a Token is a codeword with a numeric argument. False if not. """ try: val = token.value.strip() # print(val, codeword) if (val.startswith(codeword) and val[len(codeword):].isdigit()): return True except AttributeError: return False return False
def is_logger_on(logger_name, level=10)
-
Check whether the named logger's own level is exactly the given level (DEBUG by default).
Expand source code
def is_logger_on(logger_name, level=logging.DEBUG): """Check if a logger is enabled and on debug. """ logger = logging.getLogger(logger_name) if logger.level == level: return True return False
def log_htmlrtf_stripping(data: lark.lexer.Token)
-
Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug.
Raises
AttributeError
- Will occur if you pass this something that is not a token.
Expand source code
def log_htmlrtf_stripping(data: Token): """Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug. Raises: AttributeError: Will occur if you pass this something that is not a token. """ logger = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger") if logger.level == logging.DEBUG: if not isinstance(data, Token): raise AttributeError("HTMLRTF Stripping logger only logs Tokens") tok_desc = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}" log_msg = tok_desc.format(value=data.value, line=data.line, end_line=data.end_line, start_pos=data.start_pos, end_pos = data.end_pos) logger.debug(log_msg)
def log_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None)
-
Log diff of two strings. Defaults to splitting by newlines and keeping the ends.
Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging.
Args
original
- The original string
revised
- The changed version of the string
sep
:string
- A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
Expand source code
def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None): """Log diff of two strings. Defaults to splitting by newlines and keeping the ends. Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging. Args: original: The original string revised: The changed version of the string sep (string): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise. """ log.debug(get_string_diff(original, revised, sep))
def log_text_extraction(data)
-
Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug.
Expand source code
def log_text_extraction(data): """Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug. """ logger = logging.getLogger("RTFDE.text_extraction") if logger.level == logging.DEBUG: logger.debug(data)
def log_transformations(data)
-
Log transform logging only if RTFDE.transform_logger set to debug.
Expand source code
def log_transformations(data): """Log transform logging only if RTFDE.transform_logger set to debug. """ logger = logging.getLogger("RTFDE.transform_logger") if logger.level == logging.DEBUG: logger.debug(data)
def log_validators(data)
-
Log validator logging only if RTFDE.validation_logger set to debug.
Expand source code
def log_validators(data): """Log validator logging only if RTFDE.validation_logger set to debug. """ logger = logging.getLogger("RTFDE.validation_logger") if logger.level == logging.DEBUG: logger.debug(data)
def make_token_replacement(ttype, value, example)
-
Expand source code
def make_token_replacement(ttype, value, example): if isinstance(example, Token): fake_tok = Token(ttype, value, start_pos=example.start_pos, end_pos=example.end_pos, line=example.line, end_line=example.end_line, column=example.column, end_column=example.end_column) elif isinstance(example, Tree): fake_tok = Token(ttype, value, start_pos=example.meta.start_pos, end_pos=example.meta.end_pos, line=example.meta.line, end_line=example.meta.end_line, column=example.meta.column, end_column=example.meta.end_column) return fake_tok
def print_lark_parser_evaluated_grammar(parser)
-
Prints the final evaluated grammar.
Can be useful for debugging possible errors in grammar evaluation.
Args
parser
:Lark obj
- Lark object to extract grammar from.
Expand source code
def print_lark_parser_evaluated_grammar(parser): """Prints the final evaluated grammar. Can be useful for debugging possible errors in grammar evaluation. Args: parser (Lark obj): Lark object to extract grammar from. """ if not isinstance(parser, Lark): raise ValueError("Requires a Lark object.") eq = "="*15 eq = " " + eq + " " print(eq + "RULES" + eq + "\n") for i in parser.rules: print(" " + i) print(eq + "TERMINALS" + eq + "\n") for i in parser.terminals: print(" " + i) print(eq + "IGNORED TOKENS" + eq + "\n") for i in parser.ignore_tokens: print(" " + i)
def print_to_tmp_file(data: Union[~AnyStr, bytes, bytearray], path: str)
-
Prints binary object to a dump file for quick debugging.
Warning: Not for normal use. Only use when debugging.
Args
- data (bytes|str): Data to write to path
path
:str
- The file path to write data to
Expand source code
def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str): """Prints binary object to a dump file for quick debugging. Warning: Not for normal use. Only use when debugging. Args: data (bytes|str): Data to write to path path (str): The file path to write data to """ # Be able to print binary objects easily if isinstance(data, (bytes, bytearray)) is True: open_as = 'wb+' else: open_as = 'w+' with open(path, open_as) as fp: original_stdout = sys.stdout sys.stdout = fp print(data) sys.stdout = original_stdout