base_markdown_node
trestle.core.markdown.base_markdown_node
¤
A base markdown node.
logger
¤
Classes¤
BaseMarkdownNode
¤
Markdown is read into a tree of nodes.
Source code in trestle/core/markdown/base_markdown_node.py
class BaseMarkdownNode:
    """
    Node of a tree that markdown is read into.

    A node stores the header line it was created from (``key``), the content
    gathered for it, the line it starts on and its child nodes (``subnodes``).
    The key ``'root'`` is treated specially: it is never rewritten as a header.
    """

    def __init__(self, key: str, content: BaseSectionContent, starting_line: int):
        """
        Initialize markdown node.

        Args:
            key: Header line identifying this node, or 'root'
            content: Content object exposing ``raw_text`` and ``subnodes_keys``
            starting_line: Line index at which this node's text starts
        """
        self.subnodes: List[BaseMarkdownNode] = []
        self.key = key
        self.content = content
        self.starting_line = starting_line

    @classmethod
    def build_tree_from_markdown(cls, lines: List[str]) -> BaseMarkdownNode:
        """Construct a tree out of the given markdown lines and return its root node."""
        # __new__ bypasses __init__: this instance is only used to reach the helper methods
        ob = cls.__new__(cls)
        start_level = ob._get_max_header_lvl(lines)
        ob, _ = ob._build_tree(lines, 'root', 0, start_level)
        return ob

    def get_all_headers_for_level(self, level: int) -> Iterable[str]:
        """Return an iterator over all recorded headers whose header level equals ``level``."""
        return iter([
            header for header in self.content.subnodes_keys if self._get_header_level_if_valid(header) == level
        ])

    def get_node_for_key(self, key: str, strict_matching: bool = True) -> Optional[BaseMarkdownNode]:
        """
        Return the first node for the given key; the comparison is case insensitive.

        Args:
            key: Header to look for
            strict_matching: Require the whole header to equal the key; otherwise a substring match is enough

        Returns: The first matching node, or None when no header recorded in this node's content matches
        """
        wanted = key.lower()
        if strict_matching:
            hits = [el for el in self.content.subnodes_keys if el.lower() == wanted]
        else:
            hits = [el for el in self.content.subnodes_keys if wanted in el.lower()]

        if not hits:
            return None
        if len(hits) > 1:
            logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')

        return self._rec_traverse(self, key, strict_matching)

    def get_all_nodes_for_keys(
        self,
        keys: List[str],
        strict_matching: bool = True,
        stop_recurse_on_first_match: bool = False
    ) -> List[BaseMarkdownNode]:
        """
        Return all nodes for the given keys, substring matching is supported.

        Unlike get_node_for_key, the comparison here is case sensitive.

        Args:
            keys: List of strings for the headers being collected
            strict_matching: Force exact match of key with header vs. simple substring match
            stop_recurse_on_first_match: Return first match of any of the keys and don't search subnodes

        Returns: List of found markdown nodes
        """
        # cheap pre-check against the recorded headers before walking the tree
        if not strict_matching:
            if not any(key in el for el in self.content.subnodes_keys for key in keys):
                return []
        elif not set(keys).intersection(self.content.subnodes_keys):
            return []

        return self._rec_traverse_all(self, keys, strict_matching, stop_recurse_on_first_match)

    def get_all_headers_for_key(self, key: str, strict_matching: bool = True) -> Iterable[str]:
        """Return an iterator over the recorded headers that equal the key (or contain it if not strict)."""
        if strict_matching:
            return iter([header for header in self.content.subnodes_keys if key == header])
        return iter([header for header in self.content.subnodes_keys if key in header])

    def get_node_header_lvl(self) -> Optional[int]:
        """Return current node header level, or None if the node key is not a valid header."""
        return self._get_header_level_if_valid(self.key)

    def change_header_level_by(self, delta_level: int) -> None:
        """
        Change all headers in the tree by specified level up or down.

        All children nodes will be modified by specified level as well.

        Args:
            delta_level: each header will be modified by this number, can be negative.
        """
        # construct a map from every current header to its shifted form
        header_map = {}
        if self.key != 'root':
            header_map[self.key] = self._modify_header_level(self.key, delta_level)
        for key in self.content.subnodes_keys:
            header_map[key] = self._modify_header_level(key, delta_level)

        # go through all contents and modify headers
        self._rec_traverse_header_update(self, header_map)

    def delete_nodes_text(self, keys: List[str], strict_matching: bool = True) -> List[str]:
        """
        Remove text from this node that is found in matching subnodes.

        Returns: The lines of this node's raw text after the matching nodes' line ranges were removed
        """
        text_lines = self.content.raw_text.split('\n')
        matching_nodes = self.get_all_nodes_for_keys(keys, strict_matching, True)
        # need to delete from end and proceed backwards so earlier indices stay valid
        sorted_nodes = sorted(matching_nodes, key=lambda node: node.starting_line, reverse=True)
        for node in sorted_nodes:
            last_line = node.starting_line + len(node.content.raw_text.split('\n'))
            delete_list_from_list(text_lines, list(range(node.starting_line, last_line)))
        return text_lines

    @abstractmethod
    def _build_tree(self, lines: List[str], root_key: str, starting_line: int,
                    level: int) -> Tuple[BaseMarkdownNode, int]:
        """Build a tree from the markdown recursively."""
        pass

    def _modify_header_level(self, header: str, delta_level: int) -> str:
        """
        Modify header level by specified level.

        Only the first run of ``#`` is rewritten, so ``#`` characters later in the
        header text (for example in ``# C# guide``) are left untouched.

        Returns: The header with the new number of ``#``, stripped of surrounding spaces
        """
        if delta_level == 0:
            logger.debug('Nothing to modify in header, level 0 is given.')
            return header

        # a line that is not a valid header is treated as level 0
        current_level = self._get_header_level_if_valid(header) or 0
        if current_level + delta_level < 0:
            logger.warning(
                f'Cannot substract {delta_level} as level of {header} is {current_level}. All `#` will be removed.'
            )
            delta_level = -current_level

        replacement = '#' * (current_level + delta_level)
        # count=1: an unbounded replace would rewrite every '#' run in the header and,
        # for level 0 (empty search string), insert the replacement between all characters
        header = header.replace('#' * current_level, replacement, 1)

        return header.strip(' ')

    def _get_header_level_if_valid(self, line: str) -> Optional[int]:
        """
        Return a level of the header if the given line is indeed a header.

        The level is the end offset of the md_const.HEADER_REGEX match at the start of the line.
        """
        header_symbols = re.match(md_const.HEADER_REGEX, line)
        # Header is valid only if the line starts with the header
        if header_symbols is not None and header_symbols.start() == 0:
            return header_symbols.end()
        return None

    def _does_start_with(self, line: str, start_chars: str) -> bool:
        """Determine whether the line starts with given characters."""
        return line.startswith(start_chars)

    def _does_contain(self, line: str, reg: str) -> bool:
        """Determine if the regex is found anywhere in the line; an empty line only matches the empty regex."""
        if len(line) == 0 and reg != r'':
            return False
        return re.compile(reg).search(line) is not None

    def _read_code_lines(self, lines: List[str], line: str, i: int) -> Tuple[List[str], int]:
        """
        Read code block.

        Collects ``line`` plus the lines from index ``i`` up to and including the first one
        matching md_const.CODEBLOCK_DEF, and returns them with the index following that line.

        Raises: TrestleError if the lines run out before a closing line is found
        """
        code_lines = [line]
        while True:
            if i >= len(lines):
                raise TrestleError(f'Code block is not closed: {code_lines}')

            line = lines[i]
            code_lines.append(line)
            i += 1
            if self._does_contain(line, md_const.CODEBLOCK_DEF):
                break
        return code_lines, i

    def _read_html_block(self, lines: List[str], line: str, i: int, ending_regex: str) -> Tuple[List[str], int]:
        """
        Read html block.

        A first line containing a ``<br>`` tag or already matching ``ending_regex`` is returned on its own.
        Otherwise lines from index ``i`` are collected up to and including the first match of ``ending_regex``.

        Raises: TrestleError if the lines run out before the block is closed
        """
        html_block = [line]
        if self._does_contain(line, r'<br[ /]*>') or self._does_contain(line, ending_regex):
            return html_block, i

        while True:
            if i >= len(lines):
                raise TrestleError(f'HTML block is not closed: {html_block}')

            line = lines[i]
            html_block.append(line)
            i += 1
            if self._does_contain(line, ending_regex):
                break
        return html_block, i

    def _read_table_block(self, lines: List[str], line: str, i: int) -> Tuple[List[str], int]:
        """
        Read table.

        Lines from index ``i`` are collected while they match md_const.TABLE_REGEX. The first
        non-matching line is still appended to the block, but the returned index is not advanced past it.
        """
        table_block = [line]
        while i < len(lines):
            line = lines[i]
            table_block.append(line)
            if not self._does_contain(line, md_const.TABLE_REGEX):
                break
            i += 1
        return table_block, i

    def _rec_traverse(self, node: BaseMarkdownNode, key: str, strict_matching: bool) -> Optional[BaseMarkdownNode]:
        """
        Recursively traverse the tree and search for the given key (case insensitive).

        If strict matching is turned off, node will be matched if key is a substring of the node's header.
        """
        wanted = key.lower()
        node_key = node.key.lower()
        if wanted == node_key or (not strict_matching and wanted in node_key):
            return node

        # descend only if one of the headers recorded for this node can match
        sub_keys = [el.lower() for el in node.content.subnodes_keys]
        if strict_matching:
            reachable = wanted in sub_keys
        else:
            reachable = any(wanted in el for el in sub_keys)

        if reachable:
            for subnode in node.subnodes:
                matched_node = self._rec_traverse(subnode, key, strict_matching)
                if matched_node is not None:
                    return matched_node

        return None

    def _rec_traverse_all(
        self, node: BaseMarkdownNode, keys: List[str], strict_matching: bool, stop_recurse_on_first_match: bool
    ) -> List[BaseMarkdownNode]:
        """
        Recursively traverse the tree and find all nodes matching the keys.

        If strict matching is turned off, nodes will be matched if key is a substring of the node's header.
        stop_recurse_on_first_match will return only the highest level key match and not any subnodes.
        A node is reported once even when several keys match it.
        """
        found_nodes: List[BaseMarkdownNode] = []
        # any() instead of one append per key: overlapping or repeated keys must not duplicate the node
        if any(key == node.key or (not strict_matching and key in node.key) for key in keys):
            found_nodes.append(node)
            if stop_recurse_on_first_match:
                return found_nodes

        for subnode in node.subnodes:
            found_nodes.extend(
                self._rec_traverse_all(subnode, keys, strict_matching, stop_recurse_on_first_match)
            )

        return found_nodes

    def _rec_traverse_header_update(self, node: BaseMarkdownNode, header_map: Dict[str, str]) -> None:
        """
        Recursively traverse tree and update the contents.

        The node key, the header lines inside the raw text and the recorded subnode keys
        are replaced according to ``header_map`` (old header -> new header).
        """
        if not node:
            return

        if node.key != 'root':
            node.key = header_map[node.key]

        # update text
        lines = node.content.raw_text.split('\n')
        for i, line in enumerate(lines):
            stripped = line.strip(' ')
            if line in header_map:
                lines[i] = header_map[line]
            elif stripped in header_map:
                # keep spaces if any
                lines[i] = line.replace(stripped, header_map[stripped])
        node.content.raw_text = '\n'.join(lines)

        # update subnodes keys in place
        for i, subnode_key in enumerate(node.content.subnodes_keys):
            if subnode_key in header_map:
                node.content.subnodes_keys[i] = header_map[subnode_key]

        for subnode in node.subnodes:
            self._rec_traverse_header_update(subnode, header_map)

    def _get_max_header_lvl(self, lines: List[str]) -> int:
        """
        Go through all lines to determine highest header level. Less # means higher.

        Returns: The smallest header level found minus one; ``math.inf`` when no line is a header
        """
        levels = (self._get_header_level_if_valid(line.strip(' ')) for line in lines)
        min_lvl = min((lvl for lvl in levels if lvl is not None), default=math.inf)
        return min_lvl - 1
Methods¤
__init__(self, key, content, starting_line)
special
¤
Initialize markdown node.
Source code in trestle/core/markdown/base_markdown_node.py
def __init__(self, key: str, content: BaseSectionContent, starting_line: int):
    """Set up a node for the given key, content and starting line, with no subnodes yet."""
    self.key = key
    self.content = content
    self.starting_line = starting_line
    # children are attached later; every node starts as a leaf
    self.subnodes: List[BaseMarkdownNode] = []
build_tree_from_markdown(lines)
classmethod
¤
Construct a tree out of the given markdown.
Source code in trestle/core/markdown/base_markdown_node.py
@classmethod
def build_tree_from_markdown(cls, lines: List[str]) -> BaseMarkdownNode:
    """Build the markdown tree for the given lines and return its root node."""
    # bare instance (no __init__ call) used only to reach the helper methods
    helper = cls.__new__(cls)
    top_level = helper._get_max_header_lvl(lines)
    built = helper._build_tree(lines, 'root', 0, top_level)
    root, _ = built
    return root
change_header_level_by(self, delta_level)
¤
Change all headers in the tree by specified level up or down.
All children nodes will be modified by specified level as well.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
delta_level |
int |
each header will be modified by this number, can be negative. |
required |
Source code in trestle/core/markdown/base_markdown_node.py
def change_header_level_by(self, delta_level: int) -> None:
    """
    Shift every header in the tree up or down by the given number of levels.

    The change is applied to this node and to all of its children.

    Args:
        delta_level: amount added to each header level, may be negative.
    """
    # headers to rewrite: this node's own key (unless it is the root) followed by all recorded subnode keys
    affected = list(self.content.subnodes_keys)
    if self.key != 'root':
        affected.insert(0, self.key)

    header_map = {old: self._modify_header_level(old, delta_level) for old in affected}

    # walk the whole tree and apply the mapping
    self._rec_traverse_header_update(self, header_map)
delete_nodes_text(self, keys, strict_matching=True)
¤
Remove text from this node that is found in matching subnodes.
Source code in trestle/core/markdown/base_markdown_node.py
def delete_nodes_text(self, keys: List[str], strict_matching: bool = True) -> List[str]:
    """Return this node's text lines with the line ranges of all matching subnodes removed."""
    text_lines = self.content.raw_text.split('\n')
    matches = self.get_all_nodes_for_keys(keys, strict_matching, True)
    # work from the last node towards the first so earlier line indices stay valid
    for node in sorted(matches, key=lambda n: n.starting_line, reverse=True):
        first = node.starting_line
        line_count = len(node.content.raw_text.split('\n'))
        delete_list_from_list(text_lines, list(range(first, first + line_count)))
    return text_lines
get_all_headers_for_key(self, key, strict_matching=True)
¤
Return all headers contained in the node with a given key.
Source code in trestle/core/markdown/base_markdown_node.py
def get_all_headers_for_key(self, key: str, strict_matching: bool = True) -> Iterable[str]:
    """Iterate over the recorded headers equal to the key, or containing it when matching is not strict."""
    headers = self.content.subnodes_keys
    if strict_matching:
        selected = [header for header in headers if key == header]
    else:
        selected = [header for header in headers if key in header]
    return iter(selected)
get_all_headers_for_level(self, level)
¤
Return all headers per specified level of hierarchy.
Source code in trestle/core/markdown/base_markdown_node.py
def get_all_headers_for_level(self, level: int) -> Iterable[str]:
    """Iterate over the recorded headers whose header level equals the requested level."""
    on_level = [
        header for header in self.content.subnodes_keys if self._get_header_level_if_valid(header) == level
    ]
    return iter(on_level)
get_all_nodes_for_keys(self, keys, strict_matching=True, stop_recurse_on_first_match=False)
¤
Return all nodes for the given keys, substring matching is supported.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
keys |
List[str] |
List of strings for the headers being collected |
required |
strict_matching |
bool |
Force exact match of key with header vs. simple substring match |
True |
stop_recurse_on_first_match |
bool |
Return first match of any of the keys and don't search subnodes |
False |
Returns: List of found markdown nodes
Source code in trestle/core/markdown/base_markdown_node.py
def get_all_nodes_for_keys(
    self,
    keys: List[str],
    strict_matching: bool = True,
    stop_recurse_on_first_match: bool = False
) -> List[BaseMarkdownNode]:
    """
    Collect every node whose header matches one of the keys.

    Args:
        keys: List of strings for the headers being collected
        strict_matching: Force exact match of key with header vs. simple substring match
        stop_recurse_on_first_match: Return first match of any of the keys and don't search subnodes

    Returns: List of found markdown nodes
    """
    headers = self.content.subnodes_keys
    # quick check on the recorded headers: skip the tree walk when nothing can match
    if strict_matching:
        has_candidate = bool(set(keys).intersection(headers))
    else:
        has_candidate = any(key in el for el in headers for key in keys)

    if not has_candidate:
        return []
    return self._rec_traverse_all(self, keys, strict_matching, stop_recurse_on_first_match)
get_node_for_key(self, key, strict_matching=True)
¤
Return the first node for the given key; substring matching is supported. The method is case insensitive.
Source code in trestle/core/markdown/base_markdown_node.py
def get_node_for_key(self, key: str, strict_matching: bool = True) -> Optional[BaseMarkdownNode]:
    """Find the first node for the key (case insensitive); substring matching is used when not strict."""
    wanted = key.lower()
    headers = self.content.subnodes_keys

    if strict_matching:

        def is_match(el: str) -> bool:
            return el.lower() == wanted
    else:

        def is_match(el: str) -> bool:
            return wanted in el.lower()

    # nothing recorded under this node can match: no need to walk the tree
    if not any(is_match(el) for el in headers):
        return None
    if len(as_filtered_list(headers, is_match)) > 1:
        logger.warning(f'Multiple nodes for {key} were found, only the first one will be returned.')

    return self._rec_traverse(self, key, strict_matching)
get_node_header_lvl(self)
¤
Return current node header level.
Source code in trestle/core/markdown/base_markdown_node.py
def get_node_header_lvl(self) -> Optional[int]:
    """Return the header level of this node's key, as computed by _get_header_level_if_valid."""
    level = self._get_header_level_if_valid(self.key)
    return level
BaseSectionContent
¤
The content of a node.
Source code in trestle/core/markdown/base_markdown_node.py
class BaseSectionContent:
    """Content held by a markdown node: its raw text and the keys of the nodes below it."""

    def __init__(self) -> None:
        """Start with no recorded subnode keys and empty text."""
        self.subnodes_keys: List[str] = []
        self.raw_text = ''

    def union(self, node: BaseMarkdownNode) -> None:
        """Record the given node's key, followed by all keys that node has recorded itself."""
        # in-place extension: the existing list object is kept
        self.subnodes_keys += [node.key, *node.content.subnodes_keys]
Methods¤
__init__(self)
special
¤
Initialize section content.
Source code in trestle/core/markdown/base_markdown_node.py
def __init__(self) -> None:
    """Start with no recorded subnode keys and empty text."""
    self.subnodes_keys: List[str] = []
    self.raw_text = ''
union(self, node)
¤
Merges the given node's key and its recorded subnode keys into this content.
Source code in trestle/core/markdown/base_markdown_node.py
def union(self, node: BaseMarkdownNode) -> None:
    """Record the given node's key, followed by all keys that node has recorded itself."""
    # in-place extension: the existing list object is kept
    self.subnodes_keys += [node.key, *node.content.subnodes_keys]
handler: python