Updated Obsidian-style list processing to improve formatting; also implemented nested lists properly

This commit is contained in:
Amy G. Bowersox 2024-08-10 02:46:42 -06:00
parent dc25acd5da
commit 1a8a3132f4

View File

@ -3,6 +3,7 @@
"""The Markdown parser and its extensions."""
import hashlib
import logging
import re
import xml.etree.ElementTree as etree # noqa: N813
from typing import Any
@ -11,6 +12,8 @@ from urllib.parse import quote as urlquote
from urllib.parse import urlparse
import markdown
from markdown import Markdown
from markdown.blockprocessors import BlockProcessor, BlockParser
from markdown.extensions import Extension
from markdown.extensions.footnotes import (FootnoteExtension, FootnoteBlockProcessor, FootnoteInlineProcessor,
FootnoteTreeprocessor, FootnotePostTreeprocessor, FootnotePostprocessor)
@ -20,6 +23,8 @@ from markdown.util import STX, ETX
from .config import Context
logger = logging.getLogger(__name__)
PRIO_BASE = 10000 # priority base for our extensions
# Patterns for ObsidianImages
@ -383,6 +388,240 @@ class ObsidianLinks(Extension):
'obsidian_generic_links', PRIO_BASE + 100)
class ObsidianLists(Extension):
"""
An extension to process Obsidian lists, including making nested lists.

A preprocessor brackets each run of list lines with the LIST_START / LIST_END marker lines, and a block
processor turns the bracketed lines into nested "ol"/"ul" element trees.
"""
# Unordered list head: group 1 = leading spaces (the indent), then one of "-", "+" or "*",
# at least one space, and group 2 = the item text.
UL_RE = re.compile(r'^([ ]*)[-+*][ ]+(.*)')
# Ordered list head: group 1 = leading spaces (the indent), group 2 = the item number (no leading zero),
# then ".", at least one space, and group 3 = the item text.
OL_RE = re.compile(r'^([ ]*)([1-9][0-9]*)\.[ ]+(.*)')
# Marker line placed in front of the first line of a list; wrapped in STX/ETX.
LIST_START = STX + "erbosoft:lstart" + ETX
# Marker line placed after the last line of a list; wrapped in STX/ETX.
LIST_END = STX + "erbosoft:lend" + ETX
def _find_listhead(self, line) -> tuple[str, int, int]:
"""
Find if the specified line is a list element head.
Args:
line (str): The line to be checked.
Returns:
str: The list type, "ol" or "ul", or "" if this line is not a list head.
int: The indent level of the list, or -1 if this line is not a list head.
int: The start index of the list, or -1 if this line is not a list head for an ordered list.
"""
m = self.UL_RE.match(line)
if m:
return 'ul', len(m.group(1)), -1
m = self.OL_RE.match(line)
if m:
return 'ol', len(m.group(1)), int(m.group(2))
return '', -1, -1
class ObsidianListFinder(Preprocessor):
    """Preprocessor that brackets every run of list lines with start and end marker lines."""

    def __init__(self, extref: Any) -> None:
        """
        Create the preprocessor.

        Args:
            extref (Any): The owning extension; supplies ``LIST_START``, ``LIST_END`` and ``_find_listhead``.
        """
        super().__init__()
        self._extref = extref

    def run(self, lines: list[str]) -> list[str]:
        """
        Insert a ``LIST_START`` line before each list and a ``LIST_END`` line after it.

        A list begins at the first line recognized as a list head and ends at the next blank
        (empty or whitespace-only) line, or at the end of the input.

        Args:
            lines (list[str]): The document lines. The list is updated in place.

        Returns:
            list[str]: The same list object, now including the marker lines.
        """
        marked: list[str] = []
        in_list = False
        for line in lines:
            if in_list:
                if not line.strip():
                    # A blank line closes the list; the marker goes in front of it.
                    marked.append(self._extref.LIST_END)
                    in_list = False
            elif self._extref._find_listhead(line)[0]:
                marked.append(self._extref.LIST_START)
                in_list = True
            marked.append(line)
        if in_list:
            # The input ended inside a list: close it so the block processor always finds an end marker.
            marked.append(self._extref.LIST_END)
        # Building a fresh list avoids repeated mid-list inserts; copy back to keep in-place semantics.
        lines[:] = marked
        return lines
class ObsidianListBlock(BlockProcessor):
    """Block processor that builds list elements from the lines found between the list markers."""

    def __init__(self, parser: BlockParser, extref: Any) -> None:
        """
        Create the block processor.

        Args:
            parser (BlockParser): The block parser this processor is attached to.
            extref (Any): The owning extension, kept for its patterns, markers and list-head lookup.
        """
        super().__init__(parser)
        self._extref = extref
@staticmethod
def _extract(blocks: list[str]) -> list[str]:
"""
Extract all lines up to the next blank line, which terminates the list.
Args:
blocks (list[str]): List of blocks to be parsed.
Returns:
list[str]: The block of lines to be parsed for a list.
"""
out_lines: list[str] = []
while len(blocks) > 0:
curblk = blocks.pop(0)
cur_lines = curblk.split('\n')
while len(cur_lines) > 0:
line = cur_lines.pop(0)
if line.strip() == '':
cur_lines.insert(0, line)
blocks.insert(0, '\n'.join(cur_lines))
return out_lines
out_lines.append(line)
return out_lines
def _build_element(self, parent: etree.Element, listtype: str, indent: int, lines: list[str]) -> None:
"""
Builds a list element and adds it to the specified parent list.
Args:
parent (etree.element): The parent list element to insert "li" items under.
listtype (str): The list type, either "ol" or "ul".
indent (int): The indent level of the list, which is greater than or equal to 0.
lines (list[str]): The lines comprising the list element.
"""
textdata: list[str] = []
# Parse the header line to get the first bit of text.
if listtype == 'ol':
m = self._extref.OL_RE.match(lines[0])
assert len(m.group(1)) == indent
textdata.append(m.group(3))
elif listtype == 'ul':
m = self._extref.UL_RE.match(lines[0])
assert len(m.group(1)) == indent
textdata.append(m.group(2))
# Build the list element
listelement = etree.SubElement(parent, 'li')
my_lines = list(lines)
i = 1
last_sublist: etree.Element | None = None
while i < len(my_lines):
subtype, subindent, _ = self._extref._find_listhead(my_lines[i])
if len(subtype) == 0:
# ordinary text line - append it to the text data
textdata.append(my_lines[i])
i += 1
else:
# start of new sublist
assert subindent > indent
# flush text data
if last_sublist is not None:
last_sublist.tail = '\n'.join(textdata)
else:
listelement.text = '\n'.join(textdata)
textdata = []
# parse the sublist and reset to parse the "excess"
last_sublist, my_lines = self._build_list(listelement, my_lines[i:])
i = 0
# flush the text data one more time
if len(textdata) > 0:
if last_sublist is not None:
last_sublist.tail = '\n'.join(textdata)
else:
listelement.text = '\n'.join(textdata)
def _build_list(self, parent: etree.Element, lines: list[str]) -> tuple[etree.Element, list[str]]:
"""
Build a new list attached to the specified parent element.
Args:
parent (etree.Element): The parent element to build the list under.
lines (list[str]): The lines of text comprising the list.
Returns:
etree.Element: The new list element.
list[str]: The excess lines that can't be parsed as part of this list.
"""
# Establish the start of the list.
listtype, indent_level, start_index = self._extref._find_listhead(lines[0])
assert indent_level >= 0
list_top = etree.SubElement(parent, listtype)
if listtype == 'ol' and start_index > 1:
list_top.attrib['start'] = str(start_index)
# The start index is set to -1 so, the first time around, we'll pick it up and set the start point.
st = -1
for i, line in enumerate(lines):
new_type, new_indent, _ = self._extref._find_listhead(lines[i])
if len(new_type) > 0:
if new_indent < indent_level:
# if the list head is lesser indented, this is the end of this element and the list
if st >= 0:
self._build_element(list_top, listtype, indent_level, lines[st:i])
return list_top, lines[i:]
if new_indent == indent_level:
# this is the end of this element
if st >= 0:
self._build_element(list_top, listtype, indent_level, lines[st:i])
if new_type != listtype:
# this is also the end of the list
return list_top, lines[i:]
st = i # start of next element
if st >= 0: # end the final element
self._build_element(list_top, listtype, indent_level, lines[st:])
return list_top, []
def test(self, parent: etree.Element, block: str) -> bool:
    """
    Decide whether this processor should handle the given block.

    Args:
        parent (etree.Element): The current parent element (not consulted).
        block (str): The block of text under consideration.

    Returns:
        bool: ``True`` when the block contains the list start marker, ``False`` otherwise.
    """
    if self._extref.LIST_START not in block:
        return False
    logger.debug("DETECT")
    return True
def run(self, parent: etree.Element, blocks: list[str]) -> bool:
    """
    Build the list found in the current block and attach it to the parent element.

    Text in front of the start marker is parsed as ordinary content first. Text behind the end
    marker, and any list lines the list builder could not use, are put back at the front of
    ``blocks`` for later processing.

    Args:
        parent (etree.Element): Parent element to add new subelements to.
        blocks (list[str]): Blocks of text to be processed. The first block is the one for which
            ``test`` returned ``True``; it is removed from the list here.

    Returns:
        bool: Always ``True``, since the block has been consumed.
    """
    start_marker = self._extref.LIST_START
    end_marker = self._extref.LIST_END

    chunk = blocks.pop(0)
    p = chunk.find(start_marker)
    if p > 0:
        # Only parse leading text when there is some: an empty chunk would be handed on as an
        # empty block, which the stock empty-block processor can use to pad a preceding code block.
        self.parser.parseChunk(parent, chunk[:p])
    # Skip the marker and the newline that ends the marker line.
    chunk = chunk[p + len(start_marker) + 1:]

    p = chunk.find(end_marker)
    while p < 0 and blocks:
        # The end marker is not in this block; pull in the following blocks until it shows up.
        chunk += '\n' + blocks.pop(0)
        p = chunk.find(end_marker)
    if p < 0:
        # No end marker anywhere: treat everything that is left as the list instead of
        # failing on an empty block queue.
        p = len(chunk)

    # Whatever follows the end marker line goes back on the queue (an empty string if nothing follows).
    blocks.insert(0, chunk[p + len(end_marker) + 1:])
    list_lines = chunk[:p].rstrip().split('\n')
    logger.debug("*** Found list: %s", list_lines)

    _, excess = self._build_list(parent, list_lines)
    if excess:
        # Lines the builder could not place are re-queued without markers for the other processors.
        blocks.insert(0, '\n'.join(excess))
    return True
def extendMarkdown(self, md: Markdown) -> None:  # noqa: N802
    """
    Register the list-finding preprocessor and the list block processor with the Markdown parser.

    Args:
        md (markdown.Markdown): The Markdown parser to register the processors with.
    """
    # NOTE(review): priority 18 presumably places the finder after the stock preprocessors - confirm.
    finder = ObsidianLists.ObsidianListFinder(self)
    md.preprocessors.register(finder, 'obsidian-list-finder', 18)

    builder = ObsidianLists.ObsidianListBlock(md.parser, self)
    md.parser.blockprocessors.register(builder, 'obsidian-lists', PRIO_BASE + 50)
class ObsidianInlines(Extension):
"""An extension that handles the special Obsidian markdown format sequences."""
def extendMarkdown(self, md: markdown.Markdown) -> None: # noqa: N802
@ -529,7 +768,7 @@ class ObsidianStyleFootnotes(FootnoteExtension):
)
return sup, m.start(0), etxpoint + 1
def extendMarkdown(self, md):
def extendMarkdown(self, md) -> None:
"""
Registers the footnote processor with the Markdown parser.
@ -583,11 +822,13 @@ def create_markdown_parser(context: Context) -> markdown.Markdown:
}
return markdown.Markdown(extensions=['fenced_code',
'codehilite',
'sane_lists',
'tables',
MetaStripper(),
ObsidianComments(),
ObsidianStyleFootnotes(SUPERSCRIPT_TEXT='[{}]', SEPARATOR='-'),
ObsidianImages(context),
ObsidianLinks(context),
ObsidianLists(),
ObsidianInlines()],
extension_configs=extconfig)