%PDF- %PDF-
Direktori : /lib/python3/dist-packages/markdown/ |
Current File : //lib/python3/dist-packages/markdown/inlinepatterns.py |
# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)

# License: BSD (see LICENSE.md for details).

"""
In version 3.0, a new, more flexible inline processor was added, [`markdown.inlinepatterns.InlineProcessor`][]. The
original inline patterns, which inherit from [`markdown.inlinepatterns.Pattern`][] or one of its children are still
supported, though users are encouraged to migrate.

The new `InlineProcessor` provides two major enhancements to `Patterns`:

1. Inline Processors no longer need to match the entire block, so regular expressions no longer need to start with
   `r'^(.*?)'` and end with `r'(.*)$'`. This runs faster. The returned [`Match`][re.Match] object will only contain
   what is explicitly matched in the pattern, and extension pattern groups now start with `m.group(1)`.

2. The `handleMatch` method now takes an additional input called `data`, which is the entire block under analysis,
   not just what is matched with the specified pattern. The method now returns the element *and* the indexes relative
   to `data` that the return element is replacing (usually `m.start(0)` and `m.end(0)`). If the boundaries are
   returned as `None`, it is assumed that the match did not take place, and nothing will be altered in `data`.

   This allows handling of more complex constructs than regular expressions can handle, e.g., matching nested
   brackets, and explicit control of the span "consumed" by the processor.

"""

from __future__ import annotations

from . import util
from typing import TYPE_CHECKING, Any, Collection, NamedTuple
import re
import xml.etree.ElementTree as etree
from html import entities

if TYPE_CHECKING:  # pragma: no cover
    from markdown import Markdown


def build_inlinepatterns(md: Markdown, **kwargs: Any) -> util.Registry[InlineProcessor]:
    """
    Build the default set of inline patterns for Markdown.

    The order in which processors and/or patterns are applied is very important - e.g. if we first replace
    `http://.../` links with `<a>` tags and _then_ try to replace inline HTML, we would end up with a mess. So, we
    apply the expressions in the following order:

    * backticks and escaped characters have to be handled before everything else so that we can preempt any markdown
      patterns by escaping them;

    * then we handle the various types of links (auto-links must be handled before inline HTML);

    * then we handle inline HTML. At this point we will simply replace all inline HTML strings with a placeholder
      and add the actual HTML to a stash;

    * finally we apply strong, emphasis, etc.

    Arguments:
        md: The `Markdown` instance handed to every processor that needs one.
        **kwargs: Accepted for call compatibility; not used by this function.

    Returns:
        A registry holding the default processors, each registered under its name and priority.

    """
    inlinePatterns = util.Registry()
    inlinePatterns.register(BacktickInlineProcessor(BACKTICK_RE), 'backtick', 190)
    inlinePatterns.register(EscapeInlineProcessor(ESCAPE_RE, md), 'escape', 180)
    inlinePatterns.register(ReferenceInlineProcessor(REFERENCE_RE, md), 'reference', 170)
    inlinePatterns.register(LinkInlineProcessor(LINK_RE, md), 'link', 160)
    inlinePatterns.register(ImageInlineProcessor(IMAGE_LINK_RE, md), 'image_link', 150)
    inlinePatterns.register(
        ImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'image_reference', 140
    )
    inlinePatterns.register(
        ShortReferenceInlineProcessor(REFERENCE_RE, md), 'short_reference', 130
    )
    inlinePatterns.register(
        ShortImageReferenceInlineProcessor(IMAGE_REFERENCE_RE, md), 'short_image_ref', 125
    )
    inlinePatterns.register(AutolinkInlineProcessor(AUTOLINK_RE, md), 'autolink', 120)
    inlinePatterns.register(AutomailInlineProcessor(AUTOMAIL_RE, md), 'automail', 110)
    inlinePatterns.register(SubstituteTagInlineProcessor(LINE_BREAK_RE, 'br'), 'linebreak', 100)
    inlinePatterns.register(HtmlInlineProcessor(HTML_RE, md), 'html', 90)
    inlinePatterns.register(HtmlInlineProcessor(ENTITY_RE, md), 'entity', 80)
    inlinePatterns.register(SimpleTextInlineProcessor(NOT_STRONG_RE), 'not_strong', 70)
    inlinePatterns.register(AsteriskProcessor(r'\*'), 'em_strong', 60)
    inlinePatterns.register(UnderscoreProcessor(r'_'), 'em_strong2', 50)
    return inlinePatterns


# The actual regular expressions for patterns
# -----------------------------------------------------------------------------

NOIMG = r'(?<!\!)'
""" Match not an image. Partial regular expression which matches if not preceded by `!`. """

BACKTICK_RE = r'(?:(?<!\\)((?:\\{2})+)(?=`+)|(?<!\\)(`+)(.+?)(?<!`)\2(?!`))'
""" Match backtick quoted string (`` `e=f()` `` or ``` ``e=f("`")`` ```). """

ESCAPE_RE = r'\\(.)'
""" Match a backslash escaped character (`\\<` or `\\*`). """

EMPHASIS_RE = r'(\*)([^\*]+)\1'
""" Match emphasis with an asterisk (`*emphasis*`). """

STRONG_RE = r'(\*{2})(.+?)\1'
""" Match strong with an asterisk (`**strong**`). """

SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match strong with underscore while ignoring middle word underscores (`__smart__strong__`). """

SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
""" Match emphasis with underscore while ignoring middle word underscores (`_smart_emphasis_`). """

SMART_STRONG_EM_RE = r'(?<!\w)(\_)\1(?!\1)(.+?)(?<!\w)\1(?!\1)(.+?)\1{3}(?!\w)'
""" Match strong emphasis with underscores (`__strong _em__`). """

EM_STRONG_RE = r'(\*)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with asterisk (`***strongem***` or `***em*strong**`). """

EM_STRONG2_RE = r'(_)\1{2}(.+?)\1(.*?)\1{2}'
""" Match emphasis strong with underscores (`___emstrong___` or `___em_strong__`). """

STRONG_EM_RE = r'(\*)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with asterisk (`***strong**em*`). """

STRONG_EM2_RE = r'(_)\1{2}(.+?)\1{2}(.*?)\1'
""" Match strong emphasis with underscores (`___strong__em_`). """

STRONG_EM3_RE = r'(\*)\1(?!\1)([^*]+?)\1(?!\1)(.+?)\1{3}'
""" Match strong emphasis with asterisk (`**strong*em***`). """

LINK_RE = NOIMG + r'\['
""" Match start of in-line link (`[text](url)` or `[text](<url>)` or `[text](url "title")`). """

IMAGE_LINK_RE = r'\!\['
""" Match start of in-line image link (`![alttxt](url)` or `![alttxt](<url>)`). """

REFERENCE_RE = LINK_RE
""" Match start of reference link (`[Label][3]`). """

IMAGE_REFERENCE_RE = IMAGE_LINK_RE
""" Match start of image reference (`![alt text][2]`). """

NOT_STRONG_RE = r'((^|(?<=\s))(\*{1,3}|_{1,3})(?=\s|$))'
""" Match a stand-alone `*` or `_`. """

AUTOLINK_RE = r'<((?:[Ff]|[Hh][Tt])[Tt][Pp][Ss]?://[^<>]*)>'
""" Match an automatic link (`<http://www.example.com>`). """

AUTOMAIL_RE = r'<([^<> !]+@[^@<> ]+)>'
""" Match an automatic email link (`<me@example.com>`). """

HTML_RE = r'(<(\/?[a-zA-Z][^<>@ ]*( [^<>]*)?|!--(?:(?!<!--|-->).)*--)>)'
""" Match an HTML tag (`<...>`). """

ENTITY_RE = r'(&(?:\#[0-9]+|\#x[0-9a-fA-F]+|[a-zA-Z0-9]+);)'
""" Match an HTML entity (`&#38;` (decimal) or `&#x26;` (hex) or `&amp;` (named)). """

# NOTE: the pattern must contain exactly two literal spaces before the newline.
LINE_BREAK_RE = r'  \n'
""" Match two spaces at end of line. """


def dequote(string: str) -> str:
    """Remove quotes from around a string.

    Only a matching pair of double quotes or a matching pair of single quotes is stripped; any other
    string is returned unchanged.
    """
    if ((string.startswith('"') and string.endswith('"')) or
            (string.startswith("'") and string.endswith("'"))):
        return string[1:-1]
    else:
        return string


class EmStrongItem(NamedTuple):
    """Emphasis/strong pattern item."""
    pattern: re.Pattern[str]
    builder: str
    tags: str


# The pattern classes
# -----------------------------------------------------------------------------


class Pattern:  # pragma: no cover
    """
    Base class that inline patterns subclass.

    Inline patterns are handled by means of `Pattern` subclasses, one per regular expression.
    Each pattern object uses a single regular expression and must support the following methods:
    [`getCompiledRegExp`][markdown.inlinepatterns.Pattern.getCompiledRegExp] and
    [`handleMatch`][markdown.inlinepatterns.Pattern.handleMatch].

    All the regular expressions used by `Pattern` subclasses must capture the whole block. For this
    reason, they all start with `^(.*?)` and end with `(.*)$`. When passing a regular expression on
    class initialization, the `^(.*?)` and `(.*)$` are added automatically and the regular expression
    is pre-compiled.

    It is strongly suggested that the newer style [`markdown.inlinepatterns.InlineProcessor`][] that
    use a more efficient and flexible search approach be used instead. However, the older style
    `Pattern` remains for backward compatibility with many existing third-party extensions.

    """

    ANCESTOR_EXCLUDES: Collection[str] = tuple()
    """
    A collection of elements which are undesirable ancestors. The processor will be skipped if it
    would cause the content to be a descendant of one of the listed tag names.
    """

    compiled_re: re.Pattern[str]
    md: Markdown | None

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        # Wrap the user pattern so the compiled expression captures the whole block.
        self.compiled_re = re.compile(r"^(.*?)%s(.*)$" % pattern,
                                      re.DOTALL | re.UNICODE)

        self.md = md

    def getCompiledRegExp(self) -> re.Pattern:
        """ Return a compiled regular expression. """
        return self.compiled_re

    def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
        """Return a ElementTree element from the given match.

        Subclasses should override this method.

        Arguments:
            m: A match object containing a match of the pattern.

        Returns: An ElementTree Element object.

        """
        pass  # pragma: no cover

    def type(self) -> str:
        """ Return class name, to define pattern type """
        return self.__class__.__name__

    def unescape(self, text: str) -> str:
        """ Return unescaped text given text with an inline placeholder.

        Each placeholder found in `text` is looked up in the stash of the `inline` tree processor.
        A stashed string is substituted as is; any other stashed value is replaced by its
        concatenated text content.

        Arguments:
            text: Text which may contain inline placeholders.

        Returns:
            `text` with placeholders substituted. `text` is returned unchanged when no `Markdown`
            instance is attached or when no `inline` tree processor is registered.
        """
        # `md` is optional (several processors are constructed without it); with no
        # Markdown instance there is no stash, so there is nothing to substitute.
        if self.md is None:
            return text
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m):
            placeholder_id = m.group(1)
            if placeholder_id in stash:
                value = stash.get(placeholder_id)
                if isinstance(value, str):
                    return value
                else:
                    # An `etree` Element - return text content only
                    return ''.join(value.itertext())
        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class InlineProcessor(Pattern):
    """
    Base class that inline processors subclass.

    This is the newer style inline processor that uses a more
    efficient and flexible search approach.

    """

    def __init__(self, pattern: str, md: Markdown | None = None):
        """
        Create an instance of an inline processor.

        Unlike [`Pattern`][markdown.inlinepatterns.Pattern], the regular expression is compiled
        exactly as given; nothing is added around it.

        Arguments:
            pattern: A regular expression that matches a pattern.
            md: An optional pointer to the instance of `markdown.Markdown` and is available as
                `self.md` on the class instance.

        """
        self.pattern = pattern
        self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)

        # API for Markdown to pass `safe_mode` into instance
        self.safe_mode = False
        self.md = md

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str | None, int | None, int | None]:
        """Return a ElementTree element from the given match and the
        start and end index of the matched text.

        If `start` and/or `end` are returned as `None`, it will be
        assumed that the processor did not find a valid region of text.

        Subclasses should override this method.

        Arguments:
            m: A re match object containing a match of the pattern.
            data: The buffer currently under analysis.

        Returns:
            el: The ElementTree element, text or None.
            start: The start of the region that has been matched or None.
            end: The end of the region that has been matched or None.

        """
        pass  # pragma: no cover


class SimpleTextPattern(Pattern):  # pragma: no cover
    """ Return a simple text of `group(2)` of a Pattern. """
    def handleMatch(self, m: re.Match[str]) -> str:
        """ Return string content of `group(2)` of a matching pattern. """
        return m.group(2)


class SimpleTextInlineProcessor(InlineProcessor):
    """ Return a simple text of `group(1)` of a Pattern. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Return string content of `group(1)` of a matching pattern. """
        return m.group(1), m.start(0), m.end(0)


class EscapeInlineProcessor(InlineProcessor):
    """ Return an escaped character. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str | None, int, int]:
        """
        If the character matched by `group(1)` of a pattern is in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS]
        then return the integer representing the character's Unicode code point (as returned by [`ord`][]) wrapped
        in [`util.STX`][markdown.util.STX] and [`util.ETX`][markdown.util.ETX].

        If the matched character is not in [`ESCAPED_CHARS`][markdown.Markdown.ESCAPED_CHARS], then return `None`.
        In both cases the span of the whole match is returned alongside.
        """

        char = m.group(1)
        if char in self.md.ESCAPED_CHARS:
            return '{}{}{}'.format(util.STX, ord(char), util.ETX), m.start(0), m.end(0)
        else:
            return None, m.start(0), m.end(0)


class SimpleTagPattern(Pattern):  # pragma: no cover
    """
    Return element of type `tag` with a text attribute of `group(3)`
    of a Pattern.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag pattern.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        Pattern.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(3)` of a
        matching pattern as the Element's text.
        """
        el = etree.Element(self.tag)
        el.text = m.group(3)
        return el


class SimpleTagInlineProcessor(InlineProcessor):
    """
    Return element of type `tag` with a text attribute of `group(2)`
    of a Pattern.

    """
    def __init__(self, pattern: str, tag: str):
        """
        Create an instance of a simple tag processor.

        Arguments:
            pattern: A regular expression that matches a pattern.
            tag: Tag of element.

        """
        InlineProcessor.__init__(self, pattern)
        self.tag = tag
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return [`Element`][xml.etree.ElementTree.Element] of type `tag` with the string in `group(2)` of a
        matching pattern as the Element's text.
        """
        el = etree.Element(self.tag)
        el.text = m.group(2)
        return el, m.start(0), m.end(0)


class SubstituteTagPattern(SimpleTagPattern):  # pragma: no cover
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        return etree.Element(self.tag)


class SubstituteTagInlineProcessor(SimpleTagInlineProcessor):
    """ Return an element of type `tag` with no children. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return empty [`Element`][xml.etree.ElementTree.Element] of type `tag`. """
        return etree.Element(self.tag), m.start(0), m.end(0)


class BacktickInlineProcessor(InlineProcessor):
    """ Return a `<code>` element containing the escaped matching text. """
    def __init__(self, pattern: str):
        """
        Create an instance of a backtick processor.

        Arguments:
            pattern: A regular expression that matches a pattern.

        """
        InlineProcessor.__init__(self, pattern)
        # Replacement used for an escaped backslash pair found in front of backticks.
        self.ESCAPED_BSLASH = '{}{}{}'.format(util.STX, ord('\\'), util.ETX)
        self.tag = 'code'
        """ The tag of the rendered element. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
        """
        If the match contains `group(3)` of a pattern, then return a `code`
        [`Element`][xml.etree.ElementTree.Element] which contains HTML escaped text (with
        [`code_escape`][markdown.util.code_escape]) as an [`AtomicString`][markdown.util.AtomicString].

        If the match does not contain `group(3)` then return the text of `group(1)` backslash escaped.

        """
        if m.group(3):
            el = etree.Element(self.tag)
            el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
            return el, m.start(0), m.end(0)
        else:
            return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)


class DoubleTagPattern(SimpleTagPattern):  # pragma: no cover
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str]) -> etree.Element:
        """
        Return [`Element`][xml.etree.ElementTree.Element] in following format:
        `<tag1><tag2>group(3)</tag2>group(4)</tag1>` where `group(4)` is optional.

        """
        tag1, tag2 = self.tag.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.SubElement(el1, tag2)
        el2.text = m.group(3)
        # The wrapped expression has 5 groups only when the pattern itself supplies a trailing group.
        if len(m.groups()) == 5:
            el2.tail = m.group(4)
        return el1


class DoubleTagInlineProcessor(SimpleTagInlineProcessor):
    """Return a ElementTree element nested in tag2 nested in tag1.

    Useful for strong emphasis etc.

    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:  # pragma: no cover
        """
        Return [`Element`][xml.etree.ElementTree.Element] in following format:
        `<tag1><tag2>group(2)</tag2>group(3)</tag1>` where `group(3)` is optional.

        """
        tag1, tag2 = self.tag.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.SubElement(el1, tag2)
        el2.text = m.group(2)
        if len(m.groups()) == 3:
            el2.tail = m.group(3)
        return el1, m.start(0), m.end(0)


class HtmlInlineProcessor(InlineProcessor):
    """ Store raw inline html and return a placeholder. """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
        """ Store the text of `group(1)` of a pattern and return a placeholder string. """
        rawhtml = self.backslash_unescape(self.unescape(m.group(1)))
        place_holder = self.md.htmlStash.store(rawhtml)
        return place_holder, m.start(0), m.end(0)

    def unescape(self, text: str) -> str:
        """ Return unescaped text given text with an inline placeholder.

        Stashed values are passed through `self.md.serializer`; if that raises, the value is
        formatted into the text with a leading backslash instead.
        """
        try:
            stash = self.md.treeprocessors['inline'].stashed_nodes
        except KeyError:  # pragma: no cover
            return text

        def get_stash(m: re.Match[str]) -> str:
            placeholder_id = m.group(1)
            value = stash.get(placeholder_id)
            if value is not None:
                try:
                    return self.md.serializer(value)
                except Exception:
                    return r'\%s' % value

        return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)

    def backslash_unescape(self, text: str) -> str:
        """ Return text with backslash escapes undone (backslashes are restored).

        Uses the `RE` of the `unescape` tree processor; `text` is returned unchanged when that
        processor is not registered.
        """
        try:
            RE = self.md.treeprocessors['unescape'].RE
        except KeyError:  # pragma: no cover
            return text

        def _unescape(m: re.Match[str]) -> str:
            # `group(1)` holds the code point that was stored for the escaped character.
            return chr(int(m.group(1)))

        return RE.sub(_unescape, text)


class AsteriskProcessor(InlineProcessor):
    """Emphasis processor for handling strong and em matches inside asterisks."""

    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(STRONG_EM3_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """

    def build_single(self, m: re.Match[str], tag: str, idx: int) -> etree.Element:
        """Return single tag.

        The content of `group(2)` is parsed for nested patterns and attached to the new element.
        """
        el1 = etree.Element(tag)
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        return el1

    def build_double(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tag.

        `tags` is a comma separated pair `outer,inner`. `group(2)` becomes the content of the inner
        element; when the match has three groups, `group(3)` is parsed into the outer element after
        the inner one.
        """

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        text = m.group(2)
        self.parse_sub_patterns(text, el2, None, idx)
        el1.append(el2)
        if len(m.groups()) == 3:
            text = m.group(3)
            self.parse_sub_patterns(text, el1, el2, idx)
        return el1

    def build_double2(self, m: re.Match[str], tags: str, idx: int) -> etree.Element:
        """Return double tags (variant 2): `<strong>text <em>text</em></strong>`."""

        tag1, tag2 = tags.split(",")
        el1 = etree.Element(tag1)
        el2 = etree.Element(tag2)
        # `group(2)` is the leading content of the outer element ...
        text = m.group(2)
        self.parse_sub_patterns(text, el1, None, idx)
        # ... and `group(3)` is the content of the inner element appended after it.
        text = m.group(3)
        el1.append(el2)
        self.parse_sub_patterns(text, el2, None, idx)
        return el1

    def parse_sub_patterns(
        self, data: str, parent: etree.Element, last: etree.Element | None, idx: int
    ) -> None:
        """
        Parses sub patterns.

        `data`: text to evaluate.

        `parent`: Parent to attach text and sub elements to.

        `last`: Last appended child to parent. Can also be None if parent has no children.

        `idx`: Current pattern index that was used to evaluate the parent.
        """

        offset = 0
        pos = 0

        length = len(data)
        while pos < length:
            # Find the start of potential emphasis or strong tokens
            if self.compiled_re.match(data, pos):
                matched = False
                # See if we can match an emphasis/strong pattern
                for index, item in enumerate(self.PATTERNS):
                    # Only evaluate patterns that are after what was used on the parent
                    if index <= idx:
                        continue
                    m = item.pattern.match(data, pos)
                    if m:
                        # Append child nodes to parent
                        # Text nodes should be appended to the last
                        # child if present, and if not, it should
                        # be added as the parent's text node.
                        text = data[offset:m.start(0)]
                        if text:
                            if last is not None:
                                last.tail = text
                            else:
                                parent.text = text
                        el = self.build_element(m, item.builder, item.tags, index)
                        parent.append(el)
                        last = el
                        # Move our position past the matched hunk
                        offset = pos = m.end(0)
                        matched = True
                if not matched:
                    # We matched nothing, move on to the next character
                    pos += 1
            else:
                # Increment position as no potential emphasis start was found.
                pos += 1

        # Append any leftover text as a text node.
        text = data[offset:]
        if text:
            if last is not None:
                last.tail = text
            else:
                parent.text = text

    def build_element(self, m: re.Match[str], builder: str, tags: str, index: int) -> etree.Element:
        """Element builder.

        Dispatch on `builder`: `'double2'` and `'double'` select the matching double builders, any
        other value selects the single tag builder.
        """

        if builder == 'double2':
            return self.build_double2(m, tags, index)
        elif builder == 'double':
            return self.build_double(m, tags, index)
        else:
            return self.build_single(m, tags, index)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """Parse patterns.

        Try each entry of `PATTERNS` in order at the position where `m` starts and build an element
        from the first one that matches.

        Returns:
            The built element with the start and end of the matching pattern, or `(None, None, None)`
            if none of the patterns match.
        """

        el = None
        start = None
        end = None

        for index, item in enumerate(self.PATTERNS):
            m1 = item.pattern.match(data, m.start(0))
            if m1:
                start = m1.start(0)
                end = m1.end(0)
                el = self.build_element(m1, item.builder, item.tags, index)
                break
        return el, start, end


class UnderscoreProcessor(AsteriskProcessor):
    """Emphasis processor for handling strong and em matches inside underscores."""

    PATTERNS = [
        EmStrongItem(re.compile(EM_STRONG2_RE, re.DOTALL | re.UNICODE), 'double', 'strong,em'),
        EmStrongItem(re.compile(STRONG_EM2_RE, re.DOTALL | re.UNICODE), 'double', 'em,strong'),
        EmStrongItem(re.compile(SMART_STRONG_EM_RE, re.DOTALL | re.UNICODE), 'double2', 'strong,em'),
        EmStrongItem(re.compile(SMART_STRONG_RE, re.DOTALL | re.UNICODE), 'single', 'strong'),
        EmStrongItem(re.compile(SMART_EMPHASIS_RE, re.DOTALL | re.UNICODE), 'single', 'em')
    ]
    """ The various strong and emphasis patterns handled by this processor. """


class LinkInlineProcessor(InlineProcessor):
    """ Return a link element from the given match. """
    RE_LINK = re.compile(r'''\(\s*(?:(<[^<>]*>)\s*(?:('[^']*'|"[^"]*")\s*)?\))?''', re.DOTALL | re.UNICODE)
    RE_TITLE_CLEAN = re.compile(r'\s')

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`. """
        text, index, handled = self.getText(data, m.end(0))

        if not handled:
            return None, None, None

        href, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("a")
        el.text = text

        el.set("href", href)

        if title is not None:
            el.set("title", title)

        return el, m.start(0), index

    def getLink(self, data: str, index: int) -> tuple[str, str | None, int, bool]:
        """Parse data between `()` of `[Text]()` allowing recursive `()`.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` at which the opening `(` is expected.

        Returns:
            href: The link target (unescaped and stripped).
            title: The title, or `None` if no title was found.
            index: The position in `data` after the parsed link.
            handled: `True` if a complete link was parsed.
        """

        href = ''
        title: str | None = None
        handled = False

        m = self.RE_LINK.match(data, pos=index)
        if m and m.group(1):
            # Matches [Text](<link> "title")
            href = m.group(1)[1:-1].strip()
            if m.group(2):
                title = m.group(2)[1:-1]
            index = m.end(0)
            handled = True
        elif m:
            # Track bracket nesting and index in string
            bracket_count = 1
            backtrack_count = 1
            start_index = m.end()
            index = start_index
            last_bracket = -1

            # Primary (first found) quote tracking.
            quote: str | None = None
            start_quote = -1
            exit_quote = -1
            ignore_matches = False

            # Secondary (second found) quote tracking.
            alt_quote = None
            start_alt_quote = -1
            exit_alt_quote = -1

            # Track last character
            last = ''

            for pos in range(index, len(data)):
                c = data[pos]
                if c == '(':
                    # Count nested (
                    # Don't increment the bracket count if we are sure we're in a title.
                    if not ignore_matches:
                        bracket_count += 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                elif c == ')':
                    # Match nested ) to (
                    # Don't decrement if we are sure we are in a title that is unclosed.
                    if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)):
                        bracket_count = 0
                    elif not ignore_matches:
                        bracket_count -= 1
                    elif backtrack_count > 0:
                        backtrack_count -= 1
                        # We've found our backup end location if the title doesn't resolve.
                        if backtrack_count == 0:
                            last_bracket = index + 1

                elif c in ("'", '"'):
                    # Quote has started
                    if not quote:
                        # We'll assume we are now in a title.
                        # Brackets are quoted, so no need to match them (except for the final one).
                        ignore_matches = True
                        backtrack_count = bracket_count
                        bracket_count = 1
                        start_quote = index + 1
                        quote = c
                    # Secondary quote (in case the first doesn't resolve): [text](link'"title")
                    elif c != quote and not alt_quote:
                        start_alt_quote = index + 1
                        alt_quote = c
                    # Update primary quote match
                    elif c == quote:
                        exit_quote = index + 1
                    # Update secondary quote match
                    elif alt_quote and c == alt_quote:
                        exit_alt_quote = index + 1

                index += 1

                # Link is closed, so let's break out of the loop
                if bracket_count == 0:
                    # Get the title if we closed a title string right before link closed
                    if exit_quote >= 0 and quote == last:
                        href = data[start_index:start_quote - 1]
                        title = ''.join(data[start_quote:exit_quote - 1])
                    elif exit_alt_quote >= 0 and alt_quote == last:
                        href = data[start_index:start_alt_quote - 1]
                        title = ''.join(data[start_alt_quote:exit_alt_quote - 1])
                    else:
                        href = data[start_index:index - 1]
                    break

                if c != ' ':
                    last = c

            # We have a scenario: `[test](link"notitle)`
            # When we enter a string, we stop tracking bracket resolution in the main counter,
            # but we do keep a backup counter up until we discover where we might resolve all brackets
            # if the title string fails to resolve.
            if bracket_count != 0 and backtrack_count == 0:
                href = data[start_index:last_bracket - 1]
                index = last_bracket
                bracket_count = 0

            handled = bracket_count == 0

        if title is not None:
            title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip())))

        href = self.unescape(href).strip()

        return href, title, index, handled

    def getText(self, data: str, index: int) -> tuple[str, int, bool]:
        """Parse the content between `[]` of the start of an image or link
        resolving nested square brackets.

        Arguments:
            data: The buffer currently under analysis.
            index: Position in `data` just after the opening `[`.

        Returns:
            text: The characters collected before the closing bracket.
            index: The position in `data` after the last character consumed.
            handled: `True` if the closing bracket was found.
        """
        bracket_count = 1
        text = []
        for pos in range(index, len(data)):
            c = data[pos]
            if c == ']':
                bracket_count -= 1
            elif c == '[':
                bracket_count += 1
            index += 1
            if bracket_count == 0:
                break
            text.append(c)
        return ''.join(text), index, bracket_count == 0


class ImageInlineProcessor(LinkInlineProcessor):
    """ Return a `img` element from the given match. """

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """ Return an `img` [`Element`][xml.etree.ElementTree.Element] or `(None, None, None)`. """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        src, title, index, handled = self.getLink(data, index)
        if not handled:
            return None, None, None

        el = etree.Element("img")

        el.set("src", src)

        if title is not None:
            el.set("title", title)

        el.set('alt', self.unescape(text))
        return el, m.start(0), index


class ReferenceInlineProcessor(LinkInlineProcessor):
    """ Match to a stored reference and return link element. """
    NEWLINE_CLEANUP_RE = re.compile(r'\s+', re.MULTILINE)

    RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE)

    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | None, int | None, int | None]:
        """
        Return [`Element`][xml.etree.ElementTree.Element] returned by `makeTag` method or `(None, None, None)`.

        For an id which is not in `self.md.references`, `None` is returned as the element together
        with the span of the reference.

        """
        text, index, handled = self.getText(data, m.end(0))
        if not handled:
            return None, None, None

        ref_id, end, handled = self.evalId(data, index, text)
        if not handled:
            return None, None, None

        # Clean up line breaks in id
        ref_id = self.NEWLINE_CLEANUP_RE.sub(' ', ref_id)
        if ref_id not in self.md.references:  # ignore undefined refs
            return None, m.start(0), end

        href, title = self.md.references[ref_id]

        return self.makeTag(href, title, text), m.start(0), end

    def evalId(self, data: str, index: int, text: str) -> tuple[str | None, int, bool]:
        """
        Evaluate the id portion of `[ref][id]`.

        If `[ref][]` use `[ref]`.

        Returns:
            The lower-cased id, the position after the id portion, and whether an id portion was found.
        """
        m = self.RE_LINK.match(data, pos=index)
        if not m:
            return None, index, False
        else:
            ref_id = m.group(1).lower()
            end = m.end(0)
            if not ref_id:
                ref_id = text.lower()
        return ref_id, end, True

    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element]. """
        el = etree.Element('a')

        el.set('href', href)
        if title:
            el.set('title', title)

        el.text = text
        return el


class ShortReferenceInlineProcessor(ReferenceInlineProcessor):
    """Short form of reference: `[google]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Evaluate the id of `[ref]`. """

        return text.lower(), index, True


class ImageReferenceInlineProcessor(ReferenceInlineProcessor):
    """ Match to a stored reference and return `img` element. """
    def makeTag(self, href: str, title: str, text: str) -> etree.Element:
        """ Return an `img` [`Element`][xml.etree.ElementTree.Element]. """
        el = etree.Element("img")
        el.set("src", href)
        if title:
            el.set("title", title)
        el.set("alt", self.unescape(text))
        return el


class ShortImageReferenceInlineProcessor(ImageReferenceInlineProcessor):
    """ Short form of image reference: `![ref]`. """
    def evalId(self, data: str, index: int, text: str) -> tuple[str, int, bool]:
        """Evaluate the id of `[ref]`. """

        return text.lower(), index, True


class AutolinkInlineProcessor(InlineProcessor):
    """ Return a link Element given an auto-link (`<http://example/com>`). """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return an `a` [`Element`][xml.etree.ElementTree.Element] of `group(1)`. """
        el = etree.Element("a")
        el.set('href', self.unescape(m.group(1)))
        el.text = util.AtomicString(m.group(1))
        return el, m.start(0), m.end(0)


class AutomailInlineProcessor(InlineProcessor):
    """
    Return a `mailto` link Element given an auto-mail link (`<foo@example.com>`).
    """
    def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
        """ Return an [`Element`][xml.etree.ElementTree.Element] containing a `mailto` link of `group(1)`.

        Every character of the visible address and of the `href` is written as a character
        reference prefixed with `util.AMP_SUBSTITUTE`.
        """
        el = etree.Element('a')
        email = self.unescape(m.group(1))
        if email.startswith("mailto:"):
            email = email[len("mailto:"):]

        def codepoint2name(code: int) -> str:
            """Return entity definition by code, or the code if not defined."""
            entity = entities.codepoint2name.get(code)
            if entity:
                return "{}{};".format(util.AMP_SUBSTITUTE, entity)
            else:
                return "%s#%d;" % (util.AMP_SUBSTITUTE, code)

        letters = [codepoint2name(ord(letter)) for letter in email]
        el.text = util.AtomicString(''.join(letters))

        mailto = "mailto:" + email
        mailto = "".join([util.AMP_SUBSTITUTE +
                          '#%d;' % ord(letter) for letter in mailto])
        el.set('href', mailto)
        return el, m.start(0), m.end(0)