文章总结: 本文详细分析了Bleach库清洗后markdown2SafeMode模式下Alt属性XSS漏洞的完整攻击链路。通过追踪代码执行流程,揭示了双哈希逃逸技术如何绕过安全过滤,实现Markdown语法与JavaScript的关联执行。文章提供了完整的漏洞分析和技术细节,对WEB安全防护具有重要参考价值。 综合评分: 82 文章分类: WEB安全,漏洞分析,安全开发,代码审计,应用安全
Markdown 双哈希逃逸 (Bleach 清洗后 markdown2 SafeMode 的 Alt 属性 XSS 完整链路)
原创
YMsora YMsora
YMs0ra的安全漫路
2026年4月30日 21:40 浙江
在小说阅读器读本章
去阅读
就按照闲谈学习去完成这个吧
毋庸置疑的点只有两个,就是需要让markdown语法和js进行联系
以及让bot的无头浏览器执行我们的js
我们看代码片段
safe_md = bleach.clean( md, tags=[], attributes={}, protocols=[], strip=True, strip_comments=True, )
直接进行追溯
这个函数传的参数很多都是默认的
def clean( text, tags=ALLOWED_TAGS,#[] attributes=ALLOWED_ATTRIBUTES,#{} protocols=ALLOWED_PROTOCOLS,#[] strip=False, strip_comments=True, css_sanitizer=None,):
cleaner = Cleaner( tags=tags, attributes=attributes, protocols=protocols, strip=strip, strip_comments=strip_comments, css_sanitizer=css_sanitizer, ) return cleaner.clean(text)
继续跟
def clean(self, text): if not isinstance(text, str): message = ( f"argument cannot be of {text.__class__.__name__!r} type, " + "must be of text type" ) raise TypeError(message)
if not text: return ""
dom = self.parser.parseFragment(text)#text是的 filtered = BleachSanitizerFilter( source=self.walker(dom), allowed_tags=self.tags, attributes=self.attributes, strip_disallowed_tags=self.strip, strip_html_comments=self.strip_comments, css_sanitizer=self.css_sanitizer, allowed_protocols=self.protocols, )
# Apply any filters after the BleachSanitizerFilter for filter_class in self.filters: filtered = filter_class(source=filtered)
return self.serializer.render(filtered)
其中parseFragment(text)是将其解析为良好的树形结构,暂时不看
看看BleachSanitizerFilter
def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping.
Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
Here callable is a function with two arguments of attribute name and value. It should return true of false.
Also gives the option to strip tags instead of encoding.
:arg dict token: token to sanitize
:returns: token or list of tokens
""" token_type = token["type"] if token_type in ["StartTag", "EndTag", "EmptyTag"]: if token["name"] in self.allowed_tags: return self.allow_token(token)
elif self.strip_disallowed_tags: return None
else: return self.disallowed_token(token)
elif token_type == "Comment": if not self.strip_html_comments: # call lxml.sax.saxutils to escape &, <, and > in addition to " and ' token["data"] = html5lib_shim.escape( token["data"], entities={'"': "&quot;", "'": "&#x27;"} ) return token else: return None
elif token_type == "Characters": return self.sanitize_characters(token)
else: return token
其实就是将html标签转为不支持的格式
然后直接转markdown,看看markdown2开启safe_mode时的过滤
html = Markup(markdown2.markdown(safe_md, safe_mode="escape"))
def _sanitize_html(self, s: str) -> str: if self.safe_mode == "replace": return self.html_removed_text elif self.safe_mode == "escape": replacements = [ ('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'), ] for before, after in replacements: s = s.replace(before, after) return s else: raise MarkdownError("invalid value for 'safe_mode': %r (must be " "'escape' or 'replace')" % self.safe_mode)
_inline_link_title = re.compile(r''' ( # \1 [ \t]+ (['"]) # quote char = \2 (?P<title>.*?) \2 )? # title is optional \)$ ''', re.X | re.S) _tail_of_reference_link_re = re.compile(r''' # Match tail of: [text][id] [ ]? # one optional space (?:\n[ ]*)? # one optional newline followed by spaces \[ (?P<id>[^\[\]]*?) \] ''', re.X | re.S)
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')
貌似核心不在这,我们回去跟text
在text最开始进markdown主函数的时候调用了convert
def convert(self, text: str) -> 'UnicodeWithAttrs': """Convert the given text.""" # Main function. The order in which other subs are called here is # essential. Link and image substitutions need to happen before # _EscapeSpecialChars(), so that any *'s or _'s in the <a> # and <img> tags get encoded.
# Clear the global hashes. If we don't clear these, you get conflicts # from other articles when generating a page which contains more than # one article (e.g. an index page that shows the N most recent # articles): self.reset()
if not isinstance(text, str): # TODO: perhaps shouldn't presume UTF-8 for string input? text = str(text, 'utf-8')
if self.use_file_vars: # Look for emacs-style file variable hints. text = self._emacs_oneliner_vars_pat.sub(self._emacs_vars_oneliner_sub, text) emacs_vars = self._get_emacs_vars(text) if "markdown-extras" in emacs_vars: splitter = re.compile("[ ,]+") for e in splitter.split(emacs_vars["markdown-extras"]): if '=' in e: ename, earg = e.split('=', 1) try: earg = int(earg) except ValueError: pass else: ename, earg = e, None self.extras[ename] = earg
self._setup_extras()
# Standardize line endings: text = text.replace("\r\n", "\n") text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines: text += "\n\n"
# Convert all tabs to spaces. text = self._detab(text)
# Strip any lines consisting only of spaces and tabs. # This makes subsequent regexen easier to write, because we can # match consecutive blank lines with /\n+/ instead of something # contorted like /[ \t]*\n+/ . text = self._ws_only_line_re.sub("", text)
# strip metadata from head and extract if "metadata" in self.extras: text = self._extract_metadata(text)
text = self.preprocess(text)
if self.safe_mode: text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True)
# Strip link definitions, store in hashes. if "footnotes" in self.extras: # Must do footnotes first because an unlucky footnote defn # looks like a link defn: # [^4]: this "looks like a link defn" text = self._strip_footnote_definitions(text) text = self._strip_link_definitions(text)
text = self._run_block_gamut(text)
if "footnotes" in self.extras: text = self._do_footnote_marker(text) text = self._add_footnotes(text)
text = self.postprocess(text)
text = self._unescape_special_chars(text)
text = self._unhash_html_spans(text) if self.safe_mode: # return the removed text warning to its markdown.py compatible form text = text.replace(self.html_removed_text, self.html_removed_text_compat)
do_target_blank_links = "target-blank-links" in self.extras do_nofollow_links = "nofollow" in self.extras
if do_target_blank_links and do_nofollow_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow noopener" target="_blank"\2', text) elif do_target_blank_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="noopener" target="_blank"\2', text) elif do_nofollow_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
if "toc" in self.extras and self._toc: if self.extras['header-ids'].get('mixed'): # TOC will only be out of order if mixed headers is enabled def toc_sort(entry): '''Sort the TOC by order of appearance in text''' match = re.search( # header tag, any attrs, the ID, any attrs, the text, close tag r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])), text, re.M ) return match.start() if match else 0
self._toc.sort(key=toc_sort) self._toc_html = calculate_toc_html(self._toc)
# Prepend toc html to output if self.cli or (self.extras['toc'] is not None and self.extras['toc'].get('prepend', False)): text = f'{self._toc_html}\n{text}'
text += "\n"
# Attach attrs to output rv = UnicodeWithAttrs(text)
if "toc" in self.extras and self._toc: rv.toc_html = self._toc_html
if "metadata" in self.extras: rv.metadata = self.metadata return rv
这一段是没有校验其他字段的
if self.safe_mode: text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True)
# Strip link definitions, store in hashes.
text = self._strip_link_definitions(text)
text = self._run_block_gamut(text)
text = self.postprocess(text)
text = self._unescape_special_chars(text)
text = self._unhash_html_spans(text)
先看看_hash_html_spans
因为比较长,只截回调那一部分,也就是非函数而是调用的部分
code_hashes = {} text = self._code_span_re.sub( lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes), text )
因为md是reset的新状态,那么当_code_span_re这个正则被匹配的时候就会进行hash_span回调,
继续追溯
_code_span_re = re.compile(r''' (?<!\\) (`+) # \1 = Opening run of ` (?!`) # See Note A test/tm-cases/escapes.text (.+?) # \2 = The code block (?<!`) \1 # Matching closer (?!`) ''', re.X | re.S)
def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str: ''' Wrapper around `_hash_text` that also adds the hash to `self.hash_spans`, meaning it will be automatically unhashed during conversion.
Args: text: the text to hash hash_table: the dict to insert the hash into. If omitted will default to `self.html_spans`
Returns: The hashed text ''' key = _hash_text(text) if hash_table is not None: hash_table[key] = text else: self.html_spans[key] = text return key
跟hash
def _hash_text(s: str) -> str: return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]
# Table of hash values for escaped characters:
g_escape_table = {ch: _hash_text(ch) for ch in '\\`*_{}[]()>#+-.!'}
# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
# http://bumppo.net/projects/amputator/
_AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'
_AMPERSAND_RE = re.compile(r'&(?!%s)' % _AMPERSAND_BODY_RE)
_ESCAPED_AMPERSAND_RE = re.compile(r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE)
这里转hash,然后就是正常的图片转img标签。然后就是_unescape_special_chars
def _unescape_special_chars(self, text: str) -> str: # Swap back in all the special characters we've hidden. hashmap = tuple(self._escape_table.items()) + tuple(self._code_table.items()) # html_blocks table is in format {hash: item} compared to usual {item: hash} hashmap += tuple(tuple(reversed(i)) for i in self.html_blocks.items()) while True: orig_text = text for ch, hash in hashmap: text = text.replace(hash, ch) if text == orig_text: break return text
它用元组将hash换了回来
也就是一个md5对应的原本代码
在这里需要先明确
md的图片语法是 ![x](y),也就是这里的x是alt属性,y是src
但是有一点,它转hash转回来的时候只换了src,并没有换alt属性的东西,
所以alt的md5就会被直接泄露出来
result = ( f'<img src="..."' f' alt="{self.md._hash_span(_xml_escape_attr(link_text))}"' # ← 这里! ...)
并且因为clean的缘故没法插入html标签
所以执行这个分两步
极其巧妙的截断
完结
免责声明:
本文所载程序、技术方法仅面向合法合规的安全研究与教学场景,旨在提升网络安全防护能力,具有明确的技术研究属性。
任何单位或个人未经授权,将本文内容用于攻击、破坏等非法用途的,由此引发的全部法律责任、民事赔偿及连带责任,均由行为人独立承担,本站不承担任何连带责任。
本站内容均为技术交流与知识分享目的发布,若存在版权侵权或其他异议,请通过邮件联系处理,具体联系方式可点击页面上方的联系我。
本文转载自:YMs0ra的安全漫路 YMsora YMsora《Markdown 双哈希逃逸 (Bleach 清洗后 markdown2 SafeMode 的 Alt 属性 XSS 完整链路)》
版权声明
本站仅做备份收录,仅供研究与教学参考之用。
读者将信息用于其他用途的,全部法律及连带责任由读者自行承担,本站不承担任何责任。









评论