diff --git a/lib/markdown2.py b/lib/markdown2.py
index dc698970..2921c22a 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -1128,7 +1128,19 @@ def _strict_tag_block_sub(
 
     def _tag_is_closed(self, tag_name: str, text: str) -> bool:
         # check if number of open tags == number of close tags
-        if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != text.count('</%s>' % tag_name):
+        open_count = 0
+        pos = 0
+        open_tag = '<%s' % tag_name
+        while True:
+            pos = text.find(open_tag, pos)
+            if pos == -1:
+                break
+            if text.find('>', pos + len(open_tag)) == -1:
+                return False
+            open_count += 1
+            pos += len(open_tag)
+
+        if open_count != text.count('</%s>' % tag_name):
             return False
 
         # check that close tag position is AFTER open tag
@@ -1303,6 +1315,73 @@ def _run_span_gamut(self, text: str) -> str:
     # it does this by matching pairs of `\` chars and checking that they're NOT followed by another `\`
     _is_unescaped_re = re.compile(r'^((?:\\\\)*(?!\\))')
 
+    def _sorta_html_tokenize(self, text: str):
+        """Yield ``(is_html_markup, token)`` pairs covering *text* in order.
+
+        This replaces the previous ``_sorta_html_tokenize_re.split(text)``
+        approach, which was vulnerable to catastrophic backtracking (ReDoS)
+        on malformed HTML fragments.  Each ``<`` is located with a plain
+        ``str.find`` scan and the candidate token is bounded at its closing
+        delimiter: ``-->`` after ``<!--``, ``?>`` after ``<?``, otherwise the
+        next ``>``.  The existing regex is then applied with ``.match()`` to
+        that bounded span only, so its work is limited by the length of one
+        candidate token rather than by the length of the whole input.
+
+        Backslashes directly before a ``<`` stay attached to the candidate,
+        so a markup token may begin with backslashes.  A ``<`` that does not
+        start a complete match is yielded as plain text and scanning resumes
+        right after it.  Empty tokens are never yielded.
+        """
+        pos = 0
+        # Closing delimiter -> index where it was last found (-1: none left),
+        # so a run of candidates sharing one closer does not rescan the text.
+        closer_at = {}
+        while pos < len(text):
+            start = text.find('<', pos)
+            if start == -1:
+                yield False, text[pos:]
+                return
+
+            # Back up over backslashes directly before this '<' so they stay
+            # attached to the candidate token.
+            slash_start = start
+            while slash_start > pos and text[slash_start - 1] == '\\':
+                slash_start -= 1
+            if slash_start > pos:
+                yield False, text[pos:slash_start]
+
+            # Determine the extent of the candidate token.
+            if text.startswith('<!--', start):
+                opener, closer = '<!--', '-->'
+            elif text.startswith('<?', start):
+                opener, closer = '<?', '?>'
+            else:
+                opener, closer = '<', '>'
+            search_from = start + len(opener)
+            found = closer_at.get(closer)
+            if found is None or 0 <= found < search_from:
+                found = closer_at[closer] = text.find(closer, search_from)
+
+            if found == -1:
+                if closer == '>':
+                    # No '>' remains, so no later candidate can close either.
+                    yield False, text[slash_start:]
+                    return
+                # Unterminated comment / processing instruction: only this
+                # '<' is literal; tags later in the text must still be found.
+                yield False, text[slash_start:start + 1]
+                pos = start + 1
+                continue
+
+            end = found + len(closer)
+            token_match = self._sorta_html_tokenize_re.match(text, slash_start, end)
+            if token_match and token_match.end() == end:
+                yield True, token_match.group(0)
+                pos = end
+            else:
+                yield False, text[slash_start:start + 1]
+                pos = start + 1
+
     @mark_stage(Stage.ESCAPE_SPECIAL)
     def _escape_special_chars(self, text: str) -> str:
         # Python markdown note: the HTML tokenization here differs from
@@ -1312,8 +1391,7 @@ def _escape_special_chars(self, text: str) -> str:
         # Note, however, that '>' is not allowed in an auto-link URL
         # here.
         escaped = []
-        is_html_markup = False
-        for token in self._sorta_html_tokenize_re.split(text):
+        for is_html_markup, token in self._sorta_html_tokenize(text):
             # check token is preceded by 0 or more PAIRS of escapes, because escape pairs
             # escape themselves and don't affect the token
             if is_html_markup and self._is_unescaped_re.match(token):
@@ -1335,7 +1413,6 @@ def _escape_special_chars(self, text: str) -> str:
                         token.replace('\\<', '&lt;').replace('\\>', '&gt;')
                     )
                 )
-            is_html_markup = not is_html_markup
         return ''.join(escaped)
 
     def _is_auto_link(self, text):