sanitize.py
1 """HTML sanitizer for Gruyere, a web application with holes.
2
3 Copyright 2017 Google Inc. All rights reserved.
4
5 This code is licensed under the https://creativecommons.org/licenses/by-nd/3.0/us/
6 Creative Commons Attribution-No Derivative Works 3.0 United States license.
7
8 DO NOT COPY THIS CODE!
9
10 This application is a small self-contained web application with numerous
11 security holes. It is provided for use with the Web Application Exploits and
12 Defenses codelab. You may modify the code for your own use while doing the
13 codelab but you may not distribute the modified code. Brief excerpts of this
14 code may be used for educational or instructional purposes provided this
15 notice is kept intact. By using Gruyere you agree to the Terms of Service
16 https://www.google.com/intl/en/policies/terms/
17 """
18
19 __author__ = 'Bruce Leban'
20
21 # system modules
22 import re
23
24
def SanitizeHtml(s):
  """Makes html safe for embedding in a document.

  Scans the input left to right. Text outside of tags is copied through
  unchanged; every '<'...'>' span (or an unterminated '<'... running to
  the end of the input) is handed to _SanitizeTag and replaced by
  whatever that returns.

  Args:
    s: some html to sanitize. A falsy value (empty string, None) yields ''.

  Returns:
    The html with every tag passed through _SanitizeTag.
  """
  if not s:
    return ''

  # Collect the output pieces and join them once, and walk the input by
  # offset instead of re-slicing the remainder after every tag; both keep
  # the scan linear in the size of the input.
  pieces = []
  pos = 0
  length = len(s)
  while pos < length:
    start = s.find('<', pos)
    if start < 0:
      # No more tags: the rest is plain text.
      pieces.append(s[pos:])
      break

    pieces.append(s[pos:start])
    end = s.find('>', start)
    if end < 0:
      # Unterminated tag: everything from '<' to the end is the tag.
      pieces.append(_SanitizeTag(s[start:]))
      break

    pieces.append(_SanitizeTag(s[start:end + 1]))
    pos = end + 1
  return ''.join(pieces)
58
59
60 TAG_RE = re.compile(r'<(.*?)(\s|>)') # matches the start of an html tag
61
62
63 def _SanitizeTag(t):
64 """Sanitizes a single html tag.
65
66 This does both a 'whitelist' for
67 the allowed tags and a 'blacklist' for the disallowed attributes.
68
69 Args:
70 t: a tag to sanitize.
71
72 Returns:
73 a safe tag.
74 """
75 allowed_tags = [
76 'a', 'b', 'big', 'br', 'center', 'code', 'em', 'h1', 'h2', 'h3',
77 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'li', 'ol', 'p', 's', 'small',
78 'span', 'strong', 'table', 'td', 'tr', 'u', 'ul',
79 ]
80 disallowed_attributes = [
81 'onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus',
82 'onkeydown', 'onkeypress', 'onkeyup', 'onload', 'onmousedown',
83 'onmousemove', 'onmouseout', 'onmouseup', 'onreset',
84 'onselect', 'onsubmit', 'onunload'
85 ]
86
87 # Extract the tag name and make sure it's allowed.
88 if t.startswith('</'):
89 return t
90 m = TAG_RE.match(t)
91 if m is None:
92 return t
93 tag_name = m.group(1)
94 if tag_name not in allowed_tags:
95 t = t[:m.start(1)] + 'blocked' + t[m.end(1):]
96
97 # This is a bit heavy handed but we want to be sure we don't
98 # allow any to get through.
99 for a in disallowed_attributes:
100 t = t.replace(a, 'blocked')
101 return t