| 1 |
module HTML |
|---|
| 2 |
# Base sanitizer. Splits the input into tokens, turns each token into a node
# and passes it to #process_node, which decides what is appended to the
# output. This base class appends every node unchanged; subclasses override
# #process_node (and optionally #sanitizeable?) to filter.
class Sanitizer
  # Sanitizes +text+.
  #
  # text    - the markup to process.
  # options - hash forwarded untouched to #tokenize and #process_node.
  #
  # Returns +text+ itself (the same object) when #sanitizeable? rejects it,
  # otherwise the joined output collected by #tokenize.
  def sanitize(text, options = {})
    return text unless sanitizeable?(text)

    tokenize(text, options).join
  end

  # Cheap pre-check that avoids tokenizing input that cannot contain a tag.
  #
  # Returns true only when +text+ is non-nil, non-empty and contains a "<".
  def sanitizeable?(text)
    return false if text.nil? || text.empty?

    text.index("<") ? true : false
  end

  protected

  # Runs the tokenizer over +text+ and feeds each parsed node to
  # #process_node together with the shared +result+ array.
  #
  # Returns the array of fragments accumulated by #process_node.
  def tokenize(text, options)
    tokenizer = HTML::Tokenizer.new(text)
    result = []
    while (token = tokenizer.next)
      node = Node.parse(nil, 0, 0, token, false)
      process_node(node, result, options)
    end
    result
  end

  # Hook for subclasses: the base implementation keeps every node as-is.
  def process_node(node, result, options)
    result << node.to_s
  end
end
|---|
| 27 |
|
|---|
| 28 |
# Sanitizer that keeps only text nodes, dropping every tag, and also removes
# HTML comments from the result.
class FullSanitizer < Sanitizer
  # Sanitizes +text+, strips comments, and repeats until the output stops
  # changing so that markup revealed by one pass is handled by the next.
  #
  # text    - the markup to process.
  # options - hash forwarded to the parent implementation.
  #
  # Returns the sanitized string (or +text+ itself when nothing was changed).
  def sanitize(text, options = {})
    comment = /<!--(.*?)-->[\n]?/m
    result = super

    # +super+ returns +text+ itself when #sanitizeable? is false, so a
    # destructive gsub! here would edit the caller's string in place (and
    # raise on a frozen one). Build a new string instead, and only when a
    # comment (plus an optional trailing newline) is actually present.
    result = result.gsub(comment, "") if result && result =~ comment

    # Recurse until a pass makes no further change.
    result == text ? result : sanitize(result, options)
  end

  # Appends the node only when its class is exactly HTML::Text.
  def process_node(node, result, options)
    result << node.to_s if node.class == HTML::Text
  end
end
|---|
| 42 |
|
|---|
| 43 |
# Sanitizer that removes the tags named in +included_tags+ and keeps every
# other node.
class LinkSanitizer < FullSanitizer
  cattr_accessor :included_tags, :instance_writer => false
  self.included_tags = Set.new(%w(a href))

  # Cheap pre-check before tokenizing.
  #
  # Returns true only when +text+ is non-nil, non-empty, contains "<a" or
  # "<href", and also contains a ">".
  def sanitizeable?(text)
    return false if text.nil? || text.empty?

    opener = text.index("<a") || text.index("<href")
    opener && text.index(">") ? true : false
  end

  protected

  # Skips HTML::Tag nodes whose name is in +included_tags+; every other node
  # is appended to +result+ as a string.
  def process_node(node, result, options)
    return if node.is_a?(HTML::Tag) && included_tags.include?(node.name)

    result << node.to_s
  end
end
|---|
| 56 |
|
|---|
| 57 |
# Sanitizer driven by allow-lists: only tags in +allowed_tags+ and attributes
# in +allowed_attributes+ are kept, URI-valued attributes must use a protocol
# from +allowed_protocols+, and text nested inside a +bad_tags+ element is
# dropped.
class WhiteListSanitizer < Sanitizer
  [:protocol_separator, :uri_attributes, :allowed_attributes, :allowed_tags, :allowed_protocols, :bad_tags,
   :allowed_css_properties, :allowed_css_keywords, :shorthand_css_properties].each do |attr|
    class_inheritable_accessor attr, :instance_writer => false
  end

  # Pattern used to split the scheme off a URI value.
  # NOTE(review): everything after "/:|(&" was lost from this line. The
  # alternatives below are a reconstruction covering a literal colon, its
  # decimal and hex character references, and a percent-encoded colon;
  # confirm against the project's canonical source.
  self.protocol_separator = /:|(&#0*58)|(&#x0*3a)|(%|&#37;)3A/i

  # Attributes whose values are treated as URIs and protocol-checked.
  self.uri_attributes = Set.new(%w(href src cite action longdesc xlink:href lowsrc))

  # Elements whose text content is discarded (see #process_node).
  self.bad_tags = Set.new(%w(script))

  # Default set of tags that survive sanitizing.
  self.allowed_tags = Set.new(%w(strong em b i p code pre tt samp kbd var sub
    sup dfn cite big small address hr br div span h1 h2 h3 h4 h5 h6 ul ol li dt dd abbr
    acronym a img blockquote del ins))

  # Default set of attributes that survive sanitizing.
  self.allowed_attributes = Set.new(%w(href src width height alt cite datetime title class name xml:lang abbr))

  # Protocols accepted in URI-valued attributes.
  self.allowed_protocols = Set.new(%w(ed2k ftp http https irc mailto news gopher nntp telnet webcal xmpp callto
    feed svn urn aim rsync tag ssh sftp rtsp afs))

  # CSS properties accepted by #sanitize_css.
  self.allowed_css_properties = Set.new(%w(azimuth background-color border-bottom-color border-collapse
    border-color border-left-color border-right-color border-top-color clear color cursor direction display
    elevation float font font-family font-size font-style font-variant font-weight height letter-spacing line-height
    overflow pause pause-after pause-before pitch pitch-range richness speak speak-header speak-numeral speak-punctuation
    speech-rate stress text-align text-decoration text-indent unicode-bidi vertical-align voice-family volume white-space
    width))

  # Keywords accepted as values of shorthand CSS properties.
  self.allowed_css_keywords = Set.new(%w(auto aqua black block blue bold both bottom brown center
    collapse dashed dotted fuchsia gray green !important italic left lime maroon medium none navy normal
    nowrap olive pointer purple red right solid silver teal top transparent underline white yellow))

  # Shorthand CSS properties; matched against the part of the property name
  # before the first "-".
  self.shorthand_css_properties = Set.new(%w(background border margin padding))

  # Filters a CSS declaration string down to allowed properties.
  #
  # style - anything responding to #to_s.
  #
  # Returns the kept declarations joined by a space, or '' when the whole
  # string fails the character/shape checks.
  def sanitize_css(style)
    # Remove url(...) expressions outright.
    style = style.to_s.gsub(/url\s*\(\s*[^\s)]+?\s*\)\s*/, ' ')

    # Both checks are anchored with \A/\z: ^ and $ match per line, which
    # would let a multi-line value pass by containing one harmless line.
    # NOTE(review): the first pattern was truncated after "/^([:,;" in this
    # file and is reconstructed; confirm against the canonical source.
    if style !~ /\A([:,;#%.\sa-zA-Z0-9!]|\w-\w|\'[\s\w]+\'|\"[\s\w]+\"|\([\d,\s]+\))*\z/ ||
       style !~ /\A(\s*[-\w]+\s*:\s*[^:;]*(;|$)\s*)*\z/
      return ''
    end

    clean = []
    style.scan(/([-\w]+)\s*:\s*([^:;]*)/) do |prop, val|
      if allowed_css_properties.include?(prop.downcase)
        clean << prop + ': ' + val + ';'
      elsif shorthand_css_properties.include?(prop.split('-').first.downcase)
        # A shorthand value is kept only when every word is an allowed
        # keyword or looks like a colour / length literal.
        # NOTE(review): the literal pattern was truncated after "/^(" in this
        # file and is reconstructed; confirm against the canonical source.
        acceptable = val.split.all? do |keyword|
          allowed_css_keywords.include?(keyword) ||
            keyword =~ /\A(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)\z/
        end
        clean << prop + ': ' + val + ';' if acceptable
      end
    end
    clean.join(' ')
  end

  protected

  # Prepares per-run state before delegating to the parent tokenizer.
  # Mutates +options+: resets :parent (the open-tag stack, innermost first)
  # and fills in :attributes / :tags from the class defaults when absent.
  def tokenize(text, options)
    options[:parent] = []
    options[:attributes] ||= allowed_attributes
    options[:tags] ||= allowed_tags
    super
  end

  # Appends the sanitized form of +node+ to +result+ (nil when the node is
  # dropped; nil entries vanish when the caller joins the array).
  def process_node(node, result, options)
    result << case node
              when HTML::Tag
                # Track nesting so text nodes know their enclosing tag.
                if node.closing == :close
                  options[:parent].shift
                else
                  options[:parent].unshift node.name
                end

                process_attributes_for node, options

                options[:tags].include?(node.name) ? node : nil
              else
                # Text inside a bad tag is dropped; elsewhere any stray "<"
                # is escaped so it cannot open a tag in the output.
                bad_tags.include?(options[:parent].first) ? nil : node.to_s.gsub(/</, "&lt;")
              end
  end

  # Removes attributes that are not allowed or carry a bad protocol, and
  # escapes (or CSS-sanitizes, for "style") the ones that remain.
  def process_attributes_for(node, options)
    return unless node.attributes

    node.attributes.keys.each do |attr_name|
      value = node.attributes[attr_name].to_s

      if !options[:attributes].include?(attr_name) || contains_bad_protocols?(attr_name, value)
        node.attributes.delete(attr_name)
      else
        node.attributes[attr_name] = attr_name == 'style' ? sanitize_css(value) : CGI::escapeHTML(value)
      end
    end
  end

  # True when +attr_name+ is a URI attribute and +value+ names a protocol
  # that is not in +allowed_protocols+.
  # NOTE(review): this expression was truncated after "/(^[^\/:]*):|(&" in
  # this file. The pattern tail and the allowed_protocols lookup are
  # reconstructed; confirm against the canonical source.
  def contains_bad_protocols?(attr_name, value)
    return false unless uri_attributes.include?(attr_name)
    return false unless value =~ /(\A[^\/:]*):|(&#0*58)|(&#x0*3a)|(%|&#37;)3A/i

    # to_s guards the case where the value is nothing but separators.
    scheme = value.split(protocol_separator).first.to_s.downcase.strip
    !allowed_protocols.include?(scheme)
  end
end
|---|
| 173 |
end |
|---|