Updated Obsidian-style list processing to improve formatting, and implemented nested lists properly

2024-08-10 02:46:42 -06:00 · 2024-08-10 02:46:42 -06:00 · 1a8a3132f4
commit 1a8a3132f4
parent dc25acd5da
1 changed files with 242 additions and 1 deletions
--- a/src/dragonglass/mparse.py
+++ b/src/dragonglass/mparse.py
@ -3,6 +3,7 @@
 """The Markdown parser and its extensions."""

 import hashlib
+import logging
 import re
 import xml.etree.ElementTree as etree  # noqa: N813
 from typing import Any
@ -11,6 +12,8 @@ from urllib.parse import quote as urlquote
 from urllib.parse import urlparse

 import markdown
+from markdown import Markdown
+from markdown.blockprocessors import BlockProcessor, BlockParser
 from markdown.extensions import Extension
 from markdown.extensions.footnotes import (FootnoteExtension, FootnoteBlockProcessor, FootnoteInlineProcessor,
                                           FootnoteTreeprocessor, FootnotePostTreeprocessor, FootnotePostprocessor)
@ -20,6 +23,8 @@ from markdown.util import STX, ETX

 from .config import Context

+# Logger for this module, named via __name__.
+logger = logging.getLogger(__name__)
+
 PRIO_BASE = 10000  # priority base for our extensions

 # Patterns for ObsidianImages
@ -383,6 +388,240 @@ class ObsidianLinks(Extension):
                                   'obsidian_generic_links', PRIO_BASE + 100)


+class ObsidianLists(Extension):
+    """An extension to process Obsidian lists, including making nested lists."""
+    UL_RE = re.compile(r'^([ ]*)[-+*][ ]+(.*)')
+    OL_RE = re.compile(r'^([ ]*)([1-9][0-9]*)\.[ ]+(.*)')
+    LIST_START = STX + "erbosoft:lstart" + ETX
+    LIST_END = STX + "erbosoft:lend" + ETX
+
+    def _find_listhead(self, line) -> tuple[str, int, int]:
+        """
+        Find if the specified line is a list element head.
+
+        Args:
+            line (str): The line to be checked.
+
+        Returns:
+            str: The list type, "ol" or "ul", or "" if this line is not a list head.
+            int: The indent level of the list, or -1 if this line is not a list head.
+            int: The start index of the list, or -1 if this line is not a list head for an ordered list.
+        """
+        m = self.UL_RE.match(line)
+        if m:
+            return 'ul', len(m.group(1)), -1
+        m = self.OL_RE.match(line)
+        if m:
+            return 'ol', len(m.group(1)), int(m.group(2))
+        return '', -1, -1
+
+    class ObsidianListFinder(Preprocessor):
+        def __init__(self, extref: Any) -> None:
+            super(ObsidianLists.ObsidianListFinder, self).__init__()
+            self._extref = extref
+
+        def run(self, lines: list[str]) -> list[str]:
+            i = 0
+            in_list = False
+            while i < len(lines):
+                if in_list:
+                    if len(lines[i].strip()) == 0:
+                        in_list = False
+                        lines.insert(i, self._extref.LIST_END)
+                        i += 1
+                else:
+                    listtype, _, _ = self._extref._find_listhead(lines[i])
+                    if len(listtype) > 0:
+                        in_list = True
+                        lines.insert(i, self._extref.LIST_START)
+                        i += 1
+                i += 1
+            return lines
+
+    class ObsidianListBlock(BlockProcessor):
+        """The actual block processor that generates lists."""
+        def __init__(self, parser: BlockParser, extref: Any) -> None:
+            super(ObsidianLists.ObsidianListBlock, self).__init__(parser)
+            self._extref = extref
+
+        @staticmethod
+        def _extract(blocks: list[str]) -> list[str]:
+            """
+            Extract all lines up to the next blank line, which terminates the list.
+
+            Args:
+                blocks (list[str]): List of blocks to be parsed.
+
+            Returns:
+                list[str]: The block of lines to be parsed for a list.
+            """
+            out_lines: list[str] = []
+            while len(blocks) > 0:
+                curblk = blocks.pop(0)
+                cur_lines = curblk.split('\n')
+                while len(cur_lines) > 0:
+                    line = cur_lines.pop(0)
+                    if line.strip() == '':
+                        cur_lines.insert(0, line)
+                        blocks.insert(0, '\n'.join(cur_lines))
+                        return out_lines
+                    out_lines.append(line)
+            return out_lines
+
+        def _build_element(self, parent: etree.Element, listtype: str, indent: int, lines: list[str]) -> None:
+            """
+            Builds a list element and adds it to the specified parent list.
+
+            Args:
+                parent (etree.element): The parent list element to insert "li" items under.
+                listtype (str): The list type, either "ol" or "ul".
+                indent (int): The indent level of the list, which is greater than or equal to 0.
+                lines (list[str]): The lines comprising the list element.
+            """
+            textdata: list[str] = []
+
+            # Parse the header line to get the first bit of text.
+            if listtype == 'ol':
+                m = self._extref.OL_RE.match(lines[0])
+                assert len(m.group(1)) == indent
+                textdata.append(m.group(3))
+            elif listtype == 'ul':
+                m = self._extref.UL_RE.match(lines[0])
+                assert len(m.group(1)) == indent
+                textdata.append(m.group(2))
+
+            # Build the list element
+            listelement = etree.SubElement(parent, 'li')
+            my_lines = list(lines)
+            i = 1
+            last_sublist: etree.Element | None = None
+            while i < len(my_lines):
+                subtype, subindent, _ = self._extref._find_listhead(my_lines[i])
+                if len(subtype) == 0:
+                    # ordinary text line - append it to the text data
+                    textdata.append(my_lines[i])
+                    i += 1
+                else:
+                    # start of new sublist
+                    assert subindent > indent
+                    # flush text data
+                    if last_sublist is not None:
+                        last_sublist.tail = '\n'.join(textdata)
+                    else:
+                        listelement.text = '\n'.join(textdata)
+                    textdata = []
+                    # parse the sublist and reset to parse the "excess"
+                    last_sublist, my_lines = self._build_list(listelement, my_lines[i:])
+                    i = 0
+
+            # flush the text data one more time
+            if len(textdata) > 0:
+                if last_sublist is not None:
+                    last_sublist.tail = '\n'.join(textdata)
+                else:
+                    listelement.text = '\n'.join(textdata)
+
+        def _build_list(self, parent: etree.Element, lines: list[str]) -> tuple[etree.Element, list[str]]:
+            """
+            Build a new list attached to the specified parent element.
+
+            Args:
+                parent (etree.Element): The parent element to build the list under.
+                lines (list[str]): The lines of text comprising the list.
+
+            Returns:
+                etree.Element: The new list element.
+                list[str]: The excess lines that can't be parsed as part of this list.
+            """
+
+            # Establish the start of the list.
+            listtype, indent_level, start_index = self._extref._find_listhead(lines[0])
+            assert indent_level >= 0
+            list_top = etree.SubElement(parent, listtype)
+            if listtype == 'ol' and start_index > 1:
+                list_top.attrib['start'] = str(start_index)
+
+            # The start index is set to -1 so, the first time around, we'll pick it up and set the start point.
+            st = -1
+            for i, line in enumerate(lines):
+                new_type, new_indent, _ = self._extref._find_listhead(lines[i])
+                if len(new_type) > 0:
+                    if new_indent < indent_level:
+                        # if the list head is lesser indented, this is the end of this element and the list
+                        if st >= 0:
+                            self._build_element(list_top, listtype, indent_level, lines[st:i])
+                        return list_top, lines[i:]
+                    if new_indent == indent_level:
+                        # this is the end of this element
+                        if st >= 0:
+                            self._build_element(list_top, listtype, indent_level, lines[st:i])
+                        if new_type != listtype:
+                            # this is also the end of the list
+                            return list_top, lines[i:]
+                        st = i  # start of next element
+            if st >= 0:  # end the final element
+                self._build_element(list_top, listtype, indent_level, lines[st:])
+            return list_top, []
+
+        def test(self, parent: etree.Element, block: str) -> bool:
+            """
+            Tests to see whether the current block can be handled by this processor.
+
+            Args:
+                parent (etree.Element): The current parent element.
+                block (str): The current block to be tested.
+
+            Returns:
+                bool: ``True`` if this processor can handle the block, ``False`` if not.
+            """
+            if self._extref.LIST_START in block:
+                logger.debug("DETECT")
+                return True
+            return False
+
+        def run(self, parent: etree.Element, blocks: list[str]) -> bool:
+            """
+            Processes the text in the current block and adds elements to the parent element.
+
+            Args:
+                parent (etree.Element): Parent element to add new subelements to.
+                blocks (list[str]): Blocks of text to be processed. The first block in this list is the one for which
+                    the ``test`` method returned ``True``.  This method should remove any parsed text from the
+                    ``block`` list before it returns.
+
+            Returns:
+                bool: ``True`` if text was parsed by this method, ``False`` if not.
+            """
+            chunk = blocks.pop(0)
+            p = chunk.find(self._extref.LIST_START)
+            self.parser.parseChunk(parent, chunk[:p])
+            chunk = chunk[p + len(self._extref.LIST_START) + 1:]
+            p = chunk.find(self._extref.LIST_END)
+            while p < 0:
+                chunk += '\n'
+                chunk += blocks.pop(0)
+                p = chunk.find(self._extref.LIST_END)
+            blocks.insert(0, chunk[p + len(self._extref.LIST_END) + 1:])
+            list_lines = chunk[:p].rstrip().split('\n')
+            assert len(list_lines) > 0
+            logger.debug(f"*** Found list: {list_lines}")
+            _, excess = self._build_list(parent, list_lines)
+            if len(excess) > 0:
+                blocks.insert(0, '\n'.join(excess))
+            return True
+
+    def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802
+        """
+        Registers the list block processor with the Markdown parser.
+
+        Args:
+            md (markdown.Markdown): The Markdown parser to register the patterns with.
+        """
+        md.preprocessors.register(ObsidianLists.ObsidianListFinder(self), 'obsidian-list-finder', 18)
+        md.parser.blockprocessors.register(ObsidianLists.ObsidianListBlock(md.parser, self),
+                                           'obsidian-lists', PRIO_BASE + 50)
+
+
 class ObsidianInlines(Extension):
    """An extension that handles the special Obsidian markdown format sequences."""
    def extendMarkdown(self, md: markdown.Markdown) -> None:  # noqa: N802
@ -529,7 +768,7 @@ class ObsidianStyleFootnotes(FootnoteExtension):
            )
            return sup, m.start(0), etxpoint + 1

-    def extendMarkdown(self, md):
+    def extendMarkdown(self, md) -> None:
        """
        Registers the footnote processor with the Markdown parser.

@ -583,11 +822,13 @@ def create_markdown_parser(context: Context) -> markdown.Markdown:
    }
    return markdown.Markdown(extensions=['fenced_code',
                                         'codehilite',
+                                         'sane_lists',
                                         'tables',
                                         MetaStripper(),
                                         ObsidianComments(),
                                         ObsidianStyleFootnotes(SUPERSCRIPT_TEXT='[{}]', SEPARATOR='-'),
                                         ObsidianImages(context),
                                         ObsidianLinks(context),
+                                         ObsidianLists(),
                                         ObsidianInlines()],
                             extension_configs=extconfig)