sanitize.py
  1  """HTML sanitizer for Gruyere, a web application with holes.
  2  
  3  Copyright 2010 Google Inc. All rights reserved.
  4  
  5  This code is licensed under the http://creativecommons.org/licenses/by-nd/3.0/us
  6  Creative Commons Attribution-No Derivative Works 3.0 United States license.
  7  
  8  DO NOT COPY THIS CODE!
  9  
 10  This application is a small self-contained web application with numerous
 11  security holes. It is provided for use with the Web Application Exploits and
 12  Defenses codelab. You may modify the code for your own use while doing the
 13  codelab but you may not distribute the modified code. Brief excerpts of this
 14  code may be used for educational or instructional purposes provided this
 15  notice is kept intact. By using Gruyere you agree to the Terms of Service
 16  http://code.google.com/terms.html
 17  """
 18  
 19  __author__ = 'Bruce Leban'
 20  
 21  # system modules
 22  import re
 23  
 24  
 25  def SanitizeHtml(s):
 26    """Makes html safe for embedding in a document.
 27  
 28    Filters the html to exclude all but a small subset of html by
 29    removing script tags/attributes.
 30  
 31    Args:
 32      s: some html to sanitize.
 33  
 34    Returns:
 35      The html with all unsafe html removed.
 36    """
 37    processed = ''
 38    while s:
 39      start = s.find('<')
 40      if start >= 0:
 41        end = s.find('>', start)
 42        if end >= 0:
 43          before = s[:start]
 44          tag = s[start:end+1]
 45          after = s[end+1:]
 46        else:
 47          before = s[:start]
 48          tag = s[start:]
 49          after = ''
 50      else:
 51        before = s
 52        tag = ''
 53        after = ''
 54  
 55      processed += before + _SanitizeTag(tag)
 56      s = after
 57    return processed
 58  
 59  
 60  TAG_RE = re.compile(r'<(.*?)(\s|>)')  # matches the start of an html tag
 61  
 62  
 63  def _SanitizeTag(t):
 64    """Sanitizes a single html tag.
 65  
 66    This does both a 'whitelist' for
 67    the allowed tags and a 'blacklist' for the disallowed attributes.
 68  
 69    Args:
 70      t: a tag to sanitize.
 71  
 72    Returns:
 73      a safe tag.
 74    """
 75    allowed_tags = [
 76        'a', 'b', 'big', 'br', 'center', 'code', 'em', 'h1', 'h2', 'h3',
 77        'h4', 'h5', 'h6', 'hr', 'i', 'img', 'li', 'ol', 'p', 's', 'small',
 78        'span', 'strong', 'table', 'td', 'tr', 'u', 'ul',
 79    ]
 80    disallowed_attributes = [
 81        'onblur', 'onchange', 'onclick', 'ondblclick', 'onfocus',
 82        'onkeydown', 'onkeypress', 'onkeyup', 'onload', 'onmousedown',
 83        'onmousemove', 'onmouseout', 'onmouseup', 'onreset',
 84        'onselect', 'onsubmit', 'onunload'
 85    ]
 86  
 87    # Extract the tag name and make sure it's allowed.
 88    if t.startswith('</'):
 89      return t
 90    m = TAG_RE.match(t)
 91    if m is None:
 92      return t
 93    tag_name = m.group(1)
 94    if tag_name not in allowed_tags:
 95      t = t[:m.start(1)] + 'blocked' + t[m.end(1):]
 96  
 97    # This is a bit heavy handed but we want to be sure we don't
 98    # allow any to get through.
 99    for a in disallowed_attributes:
100      t = t.replace(a, 'blocked')
101    return t