class ObsidianLists(Extension):
    """An extension to process Obsidian lists, including making nested lists.

    The work is split into two stages:

    1. ``ObsidianListFinder`` (a preprocessor) brackets every run of list lines with the ``LIST_START`` and
       ``LIST_END`` marker lines.
    2. ``ObsidianListBlock`` (a block processor) picks up the bracketed text and builds the ``ul``/``ol``/``li``
       elements, nesting sublists according to their indentation.
    """
    UL_RE = re.compile(r'^([ ]*)[-+*][ ]+(.*)')
    OL_RE = re.compile(r'^([ ]*)([1-9][0-9]*)\.[ ]+(.*)')
    # A thematic break such as "- - -" or "* * *" also satisfies UL_RE, so it has to be ruled out explicitly.
    HR_RE = re.compile(r'^[ ]{0,3}([-*])(?:[ ]*\1){2,}[ ]*$')
    LIST_START = STX + "erbosoft:lstart" + ETX
    LIST_END = STX + "erbosoft:lend" + ETX

    def _find_listhead(self, line: str) -> tuple[str, int, int]:
        """
        Find if the specified line is a list element head.

        A line that forms a thematic break (e.g. ``- - -`` or ``* * *``) is never reported as a list head, so it
        is left alone for whatever handles horizontal rules.

        Args:
            line (str): The line to be checked.

        Returns:
            str: The list type, "ol" or "ul", or "" if this line is not a list head.
            int: The indent level of the list, or -1 if this line is not a list head.
            int: The start index of the list, or -1 if this line is not a list head for an ordered list.
        """
        if self.HR_RE.match(line):
            return '', -1, -1
        m = self.UL_RE.match(line)
        if m:
            return 'ul', len(m.group(1)), -1
        m = self.OL_RE.match(line)
        if m:
            return 'ol', len(m.group(1)), int(m.group(2))
        return '', -1, -1

    class ObsidianListFinder(Preprocessor):
        """The preprocessor that brackets each run of list lines with marker lines."""
        def __init__(self, extref: Any) -> None:
            """
            Create the preprocessor.

            Args:
                extref (Any): The owning ``ObsidianLists`` instance, used for its markers and patterns.
            """
            super().__init__()
            self._extref = extref

        def run(self, lines: list[str]) -> list[str]:
            """
            Insert ``LIST_START`` before each list and ``LIST_END`` after it.

            A list starts at the first list head seen outside a list. It ends just before the next blank line or
            thematic break, or at the end of the input if neither occurs.

            Args:
                lines (list[str]): The lines of the document. Updated in place.

            Returns:
                list[str]: The same list object, now containing the marker lines.
            """
            ext = self._extref
            marked: list[str] = []
            in_list = False
            for line in lines:
                if in_list:
                    if not line.strip() or ext.HR_RE.match(line):
                        marked.append(ext.LIST_END)
                        in_list = False
                elif ext._find_listhead(line)[0]:
                    marked.append(ext.LIST_START)
                    in_list = True
                marked.append(line)
            if in_list:
                # Input ended inside a list: close it so the block processor always finds its end marker.
                marked.append(ext.LIST_END)
            # Built in one pass (no repeated mid-list inserts), then written back to keep in-place semantics.
            lines[:] = marked
            return lines

    class ObsidianListBlock(BlockProcessor):
        """The actual block processor that generates lists."""
        def __init__(self, parser: BlockParser, extref: Any) -> None:
            """
            Create the block processor.

            Args:
                parser (BlockParser): The block parser this processor is attached to.
                extref (Any): The owning ``ObsidianLists`` instance, used for its markers and patterns.
            """
            super().__init__(parser)
            self._extref = extref

        @staticmethod
        def _extract(blocks: list[str]) -> list[str]:
            """
            Extract all lines up to the next blank line, which terminates the list.

            The consumed lines are removed from ``blocks``; the blank line and whatever follows it in the same
            block are pushed back onto the front of ``blocks``. Not called by the other methods of this class.

            Args:
                blocks (list[str]): List of blocks to be parsed.

            Returns:
                list[str]: The block of lines to be parsed for a list.
            """
            out_lines: list[str] = []
            while blocks:
                cur_lines = blocks.pop(0).split('\n')
                while cur_lines:
                    line = cur_lines.pop(0)
                    if line.strip() == '':
                        cur_lines.insert(0, line)
                        blocks.insert(0, '\n'.join(cur_lines))
                        return out_lines
                    out_lines.append(line)
            return out_lines

        def _build_element(self, parent: etree.Element, listtype: str, indent: int, lines: list[str]) -> None:
            """
            Builds a list element and adds it to the specified parent list.

            The first line is the element's head; its marker is stripped and the rest becomes the start of the
            element text. Following lines are either plain text (appended to the text) or heads of sublists,
            which are built recursively as children of the new ``li``.

            Args:
                parent (etree.element): The parent list element to insert "li" items under.
                listtype (str): The list type, either "ol" or "ul".
                indent (int): The indent level of the list, which is greater than or equal to 0.
                lines (list[str]): The lines comprising the list element.
            """
            textdata: list[str] = []

            # Parse the header line to get the first bit of text.
            if listtype == 'ol':
                m = self._extref.OL_RE.match(lines[0])
                assert m is not None and len(m.group(1)) == indent
                textdata.append(m.group(3))
            elif listtype == 'ul':
                m = self._extref.UL_RE.match(lines[0])
                assert m is not None and len(m.group(1)) == indent
                textdata.append(m.group(2))

            # Build the list element
            listelement = etree.SubElement(parent, 'li')
            my_lines = list(lines)
            i = 1
            last_sublist: etree.Element | None = None
            while i < len(my_lines):
                subtype, subindent, _ = self._extref._find_listhead(my_lines[i])
                if not subtype:
                    # ordinary text line - append it to the text data
                    textdata.append(my_lines[i])
                    i += 1
                else:
                    # start of new sublist; _build_list only hands us heads that are indented deeper
                    assert subindent > indent
                    # flush text data: before the first sublist it is the element text, afterwards it trails
                    # the most recent sublist
                    if last_sublist is not None:
                        last_sublist.tail = '\n'.join(textdata)
                    else:
                        listelement.text = '\n'.join(textdata)
                    textdata = []
                    # parse the sublist and reset to parse the "excess"; the sublist always consumes at least
                    # its own head line, so this loop makes progress
                    last_sublist, my_lines = self._build_list(listelement, my_lines[i:])
                    i = 0

            # flush the text data one more time
            if textdata:
                if last_sublist is not None:
                    last_sublist.tail = '\n'.join(textdata)
                else:
                    listelement.text = '\n'.join(textdata)

        def _build_list(self, parent: etree.Element, lines: list[str]) -> tuple[etree.Element, list[str]]:
            """
            Build a new list attached to the specified parent element.

            The type and indent of the list are taken from the first line. The list ends at the first head that
            is indented less, or that has the same indent but the other list type.

            Args:
                parent (etree.Element): The parent element to build the list under.
                lines (list[str]): The lines of text comprising the list.

            Returns:
                etree.Element: The new list element.
                list[str]: The excess lines that can't be parsed as part of this list.

            Raises:
                ValueError: If ``lines`` is empty or its first line is not a list head.
            """
            # Establish the start of the list.
            if not lines:
                raise ValueError("cannot build a list from no lines")
            listtype, indent_level, start_index = self._extref._find_listhead(lines[0])
            if not listtype:
                raise ValueError(f"not a list head: {lines[0]!r}")
            list_top = etree.SubElement(parent, listtype)
            if listtype == 'ol' and start_index > 1:
                list_top.attrib['start'] = str(start_index)

            # The start index is set to -1 so, the first time around, we'll pick it up and set the start point.
            st = -1
            for i, line in enumerate(lines):
                new_type, new_indent, _ = self._extref._find_listhead(line)
                if not new_type or new_indent > indent_level:
                    # body text or a deeper head: part of the element in progress
                    continue
                # a head at this level or shallower is the end of the element in progress
                if st >= 0:
                    self._build_element(list_top, listtype, indent_level, lines[st:i])
                if new_indent < indent_level or new_type != listtype:
                    # lesser indent, or same indent with the other list type: also the end of this list
                    return list_top, lines[i:]
                st = i  # start of next element
            if st >= 0:  # end the final element
                self._build_element(list_top, listtype, indent_level, lines[st:])
            return list_top, []

        def test(self, parent: etree.Element, block: str) -> bool:
            """
            Tests to see whether the current block can be handled by this processor.

            Args:
                parent (etree.Element): The current parent element.
                block (str): The current block to be tested.

            Returns:
                bool: ``True`` if this processor can handle the block, ``False`` if not.
            """
            if self._extref.LIST_START in block:
                logger.debug("DETECT")
                return True
            return False

        def run(self, parent: etree.Element, blocks: list[str]) -> bool:
            """
            Processes the text in the current block and adds elements to the parent element.

            Text before the start marker is parsed as ordinary Markdown first. The lines between the markers
            are built into a list. Lines the list builder could not use, and text after the end marker, are
            pushed back onto ``blocks`` in that order for further parsing.

            Args:
                parent (etree.Element): Parent element to add new subelements to.
                blocks (list[str]): Blocks of text to be processed. The first block in this list is the one for which
                    the ``test`` method returned ``True``. This method should remove any parsed text from the
                    ``block`` list before it returns.

            Returns:
                bool: ``True`` if text was parsed by this method, ``False`` if not.
            """
            ext = self._extref
            before, _, chunk = blocks.pop(0).partition(ext.LIST_START)
            if before.strip():
                self.parser.parseChunk(parent, before)
            if chunk.startswith('\n'):
                chunk = chunk[1:]  # the marker sits on a line of its own

            # Pull in further blocks until the end marker shows up. If it never does, the list simply runs to
            # the end of the available text instead of failing on an empty block list.
            body, found, rest = chunk.partition(ext.LIST_END)
            while not found and blocks:
                chunk = chunk + '\n' + blocks.pop(0)
                body, found, rest = chunk.partition(ext.LIST_END)
            if rest.startswith('\n'):
                rest = rest[1:]
            if rest:
                blocks.insert(0, rest)

            list_lines = body.rstrip().split('\n')
            logger.debug("*** Found list: %s", list_lines)
            _, excess = self._build_list(parent, list_lines)
            if excess:
                # Goes in front of ``rest`` so that document order is kept.
                blocks.insert(0, '\n'.join(excess))
            return True

    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802
        """
        Registers the list block processor with the Markdown parser.

        Args:
            md (markdown.Markdown): The Markdown parser to register the patterns with.
        """
        # NOTE(review): the markers are built from STX/ETX, which Python-Markdown's whitespace normalization
        # strips from the source, so the finder has to run after that step - confirm priority 18 against the
        # installed Python-Markdown version.
        md.preprocessors.register(ObsidianLists.ObsidianListFinder(self), 'obsidian-list-finder', 18)
        md.parser.blockprocessors.register(ObsidianLists.ObsidianListBlock(md.parser, self),
                                           'obsidian-lists', PRIO_BASE + 50)