Ruby on Rails | Screencasts | Download | Documentation | Weblog | Community | Source

root/trunk/actionpack/lib/action_controller/vendor/html-scanner/html/sanitizer.rb

Revision 8269, 6.8 kB (checked in by david, 2 years ago)

Removed some of the tags that do not make sense to allow by default in the whitelist

Line 
1 module HTML
2   class Sanitizer
3     def sanitize(text, options = {})
4       return text unless sanitizeable?(text)
5       tokenize(text, options).join
6     end
7    
8     def sanitizeable?(text)
9       !(text.nil? || text.empty? || !text.index("<"))
10     end
11    
12   protected
13     def tokenize(text, options)
14       tokenizer = HTML::Tokenizer.new(text)
15       result = []
16       while token = tokenizer.next
17         node = Node.parse(nil, 0, 0, token, false)
18         process_node node, result, options
19       end
20       result
21     end
22    
23     def process_node(node, result, options)
24       result << node.to_s
25     end
26   end
27  
28   class FullSanitizer < Sanitizer
29     def sanitize(text, options = {})
30       result = super
31       # strip any comments, and if they have a newline at the end (ie. line with
32       # only a comment) strip that too
33       result.gsub!(/<!--(.*?)-->[\n]?/m, "") if result
34       # Recurse - handle all dirty nested tags
35       result == text ? result : sanitize(result, options)
36     end
37    
38     def process_node(node, result, options)
39       result << node.to_s if node.class == HTML::Text
40     end
41   end
42  
43   class LinkSanitizer < FullSanitizer
44     cattr_accessor :included_tags, :instance_writer => false
45     self.included_tags = Set.new(%w(a href))
46
47     def sanitizeable?(text)
48       !(text.nil? || text.empty? || !((text.index("<a") || text.index("<href")) && text.index(">")))
49     end
50    
51   protected
52     def process_node(node, result, options)
53       result << node.to_s unless node.is_a?(HTML::Tag) && included_tags.include?(node.name)
54     end
55   end
56  
57   class WhiteListSanitizer < Sanitizer
58     [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
59      :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
60       class_inheritable_accessor attr, :instance_writer => false
61     end
62
63     # A regular expression of the valid characters used to separate protocols like
64     # the ':' in 'http://foo.com'
65     self.protocol_separator     = /:|(&#0*58)|(&#x70)|(%|&#37;)3A/
66    
67     # Specifies a Set of HTML attributes that can have URIs.
68     self.uri_attributes         = Set.new(%w(href src cite action longdesc xlink:href lowsrc))
69
70     # Specifies a Set of 'bad' tags that the #sanitize helper will remove completely, as opposed
71     # to just escaping harmless tags like &lt;font&gt;
72     self.bad_tags               = Set.new(%w(script))
73    
74     # Specifies the default Set of tags that the #sanitize helper will allow unscathed.
75     self.allowed_tags           = Set.new(%w(strong em b i p code pre tt samp kbd var sub
76       sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr
77       acronym a img blockquote del ins))
78
79     # Specifies the default Set of html attributes that the #sanitize helper will leave
80     # in the allowed tag.
81     self.allowed_attributes     = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))
82    
83     # Specifies the default Set of acceptable css properties that #sanitize and #sanitize_css will accept.
84     self.allowed_protocols      = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
85       feed svn urn aim rsync tag ssh sftp rtsp afs))
86    
87     # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
88     self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
89       border-color border-left-color border-right-color border-top-color clear color cursor direction display
90       elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
91       overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
92       speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
93       width))
94  
95     # Specifies the default Set of acceptable css keywords that #sanitize and #sanitize_css will accept.
96     self.allowed_css_keywords   = Set.new(%w(auto aqua black block blue bold both bottom brown center
97       collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
98       nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))
99
100     # Specifies the default Set of allowed shorthand css properties for the #sanitize and #sanitize_css helpers.
101     self.shorthand_css_properties = Set.new(%w(background border margin padding))
102
103     # Sanitizes a block of css code.  Used by #sanitize when it comes across a style attribute
104     def sanitize_css(style)
105       # disallow urls
106       style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')
107
108       # gauntlet
109       if style !~ /^([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*$/ ||
110           style !~ /^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$/
111         return ''
112       end
113
114       clean = []
115       style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop,val|
116         if allowed_css_properties.include?(prop.downcase)
117           clean <<  prop + ': ' + val + ';'
118         elsif shorthand_css_properties.include?(prop.split('-')[0].downcase)
119           unless val.split().any? do |keyword|
120             !allowed_css_keywords.include?(keyword) &&
121               keyword !~ /^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$/
122           end
123             clean << prop + ': ' + val + ';'
124           end
125         end
126       end
127       clean.join(' ')
128     end
129
130   protected
131     def tokenize(text, options)
132       options[:parent] = []
133       options[:attributes] ||= allowed_attributes
134       options[:tags]       ||= allowed_tags
135       super
136     end
137
138     def process_node(node, result, options)
139       result << case node
140         when HTML::Tag
141           if node.closing == :close
142             options[:parent].shift
143           else
144             options[:parent].unshift node.name
145           end
146          
147           process_attributes_for node, options
148
149           options[:tags].include?(node.name) ? node : nil
150         else
151           bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
152       end
153     end
154    
155     def process_attributes_for(node, options)
156       return unless node.attributes
157       node.attributes.keys.each do |attr_name|
158         value = node.attributes[attr_name].to_s
159
160         if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
161           node.attributes.delete(attr_name)
162         else
163           node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(value)
164         end
165       end
166     end
167
168     def contains_bad_protocols?(attr_name, value)
169       uri_attributes.include?(attr_name) &&
170       (value =~ /(^[^\/:]*):|(&#0*58)|(&#x70)|(%|&#37;)3A/ && !allowed_protocols.include?(value.split(protocol_separator).first))
171     end
172   end
173 end
Note: See TracBrowser for help on using the browser.