// Copyright 2014 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview
 * An HTML sanitizer that takes untrusted HTML snippets and produces
 * safe HTML by filtering/rewriting tags and attributes that contain
 * high-privilege instructions.
 */


goog.provide('goog.labs.html.Sanitizer');

goog.require('goog.asserts');
goog.require('goog.html.SafeUrl');
goog.require('goog.labs.html.attributeRewriterPresubmitWorkaround');
goog.require('goog.labs.html.scrubber');
goog.require('goog.object');
goog.require('goog.string');



/**
 * A sanitizer that converts untrusted, messy HTML into more regular HTML
 * that cannot abuse high-authority constructs like the ability to execute
 * arbitrary JavaScript.
 * @constructor
 */
goog.labs.html.Sanitizer = function() {
  /**
   * Maps the lower-case names of allowed elements to attribute white-lists.
   * An attribute white-list maps lower-case attribute names to functions
   * from values to values or undefined to disallow.
   *
   * The special element name {@code "*"} contains a white-list of attributes
   * allowed on any tag, which is useful for attributes like {@code title} and
   * {@code id} which are widely available with element-agnostic meanings.
   * It should not be used for attributes like {@code type} whose meaning
   * differs based on the element on which it appears:
   * e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
   *
   * @type {!Object.<string, !Object.<string, goog.labs.html.AttributeRewriter>>}
   * @private
   */
  this.whitelist_ = goog.labs.html.Sanitizer.createBlankObject_();
  this.whitelist_['*'] = goog.labs.html.Sanitizer.createBlankObject_();

  // To use the sanitizer, we build inputs for the scrubber.
  // These inputs are invalidated by changes to the policy, so we (re)build them
  // lazily.

  /**
   * Maps element names to {@code true} so the scrubber does not have to do
   * own property checks for every tag filtered.
   *
   * Built lazily and invalidated when the white-list is modified.
   *
   * @type {Object.<string, boolean>}
   * @private
   */
  this.allowedElementSet_ = null;
};


// TODO(user): Should the return type be goog.html.SafeHtml?
// If we receive a safe HTML string as input, should we simply rebalance
// tags?
/**
 * Yields a string of safe HTML that contains all and only the safe
 * text-nodes and elements in the input.
 *
 * <p>
 * For the purposes of this function, "safe" is defined thus:
 * <ul>
 *   <li>Contains only elements explicitly allowed via {@code this.allow*}.
 *   <li>Contains only attributes explicitly allowed via {@code this.allow*}
 *       and having had all relevant transformations applied.
 *   <li>Contains an end tag for all and only non-void open tags.
 *   <li>Tags nest per XHTML rules.
 *   <li>Tags do not nest beyond a finite but fairly large level.
 * </ul>
 *
 * @param {!string} unsafeHtml A string of HTML which need not originate with
 *    a trusted source.
 * @return {!string} A string of HTML that contains only tags and attributes
 *    explicitly allowed by this sanitizer, and with end tags for all and only
 *    non-void elements.
 */
goog.labs.html.Sanitizer.prototype.sanitize = function(unsafeHtml) {
  // Coerce to a primitive string so the scrubber always receives a string.
  var unsafeHtmlString = '' + unsafeHtml;

  /**
   * @type {!Object.<string, !Object.<string, goog.labs.html.AttributeRewriter>>}
   */
  var whitelist = this.whitelist_;
  if (!this.allowedElementSet_) {
    // Rebuild the lazily-computed element set; allowElements() resets it to
    // null whenever the policy changes.
    this.allowedElementSet_ = goog.object.createSet(
        // This can lead to '*' in the allowed element set, but the scrubber
        // will not parse "<*" as a tag beginning.
        goog.object.getKeys(whitelist));
  }

  return goog.labs.html.scrubber.scrub(
      this.allowedElementSet_, whitelist, unsafeHtmlString);
};


/**
 * Adds the element names to the white-list of elements that are allowed
 * in the safe HTML output.
 * <p>
 * Allowing elements does not, by itself, allow any attributes on
 * those elements.
 *
 * @param {...!string} var_args element names that should be allowed in the
 *     safe HTML output.
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 */
goog.labs.html.Sanitizer.prototype.allowElements = function(var_args) {
  this.allowedElementSet_ = null;  // Invalidate.
  var whitelist = this.whitelist_;
  for (var i = 0; i < arguments.length; ++i) {
    var elementName = arguments[i].toLowerCase();

    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName), elementName);

    // Keep any attribute white-list that an earlier call already built for
    // this element instead of replacing it with a blank one.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      whitelist[elementName] = goog.labs.html.Sanitizer.createBlankObject_();
    }
  }
  return this;
};


/**
 * Allows in the sanitized output
 * <tt><<i>element</i> <i>attr</i>="..."></tt>
 * when <i>element</i> is in {@code elementNames} and
 * <i>attr</i> is in {@code attrNames}.
 *
 * If specified, {@code opt_rewriteValue} is a function that takes the
 * HTML-entity-decoded attribute value, and can choose to disallow the
 * attribute by returning {@code null} or substitute a new value
 * by returning a string with the new value.
 *
 * @param {!Array.<string>|string} elementNames names (or name) on which the
 *     attributes are allowed.
 *
 *     Element names should be allowed via {@code allowElements(...)} prior
 *     to white-listing attributes.
 *
 *     The special element name {@code "*"} has the same meaning as in CSS
 *     selectors: it can be used to white-list attributes like {@code title}
 *     and {@code id} which are widely available with element-agnostic
 *     meanings.
 *
 *     It should not be used for attributes like {@code type} whose meaning
 *     differs based on the element on which it appears:
 *     e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
 *
 * @param {!Array.<string>|string} attrNames names (or name) of the attribute
 *     that should be allowed.
 *
 * @param {goog.labs.html.AttributeRewriter=} opt_rewriteValue A function
 *     that receives the HTML-entity-decoded attribute value and can return
 *     {@code null} to disallow the attribute entirely or the value for the
 *     attribute as a string.
 *     <p>
 *     The default is the identity function ({@code function(x){return x}}),
 *     and the value rewriter is composed with an attribute specific handler:
 *     <table>
 *      <tr>
 *        <th>href, src</th>
 *        <td>Requires that the value be accepted by
 *        {@code goog.html.SafeUrl.sanitize}.</td>
 *      </tr>
 *      <tr>
 *        <th>style, on*</th>
 *        <td>Always disallowed.</td>
 *      </tr>
 *     </table>
 *
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 * @throws {Error} if an element name has not previously been allowed via
 *     {@code allowElements} (the special name {@code "*"} is always present).
 */
goog.labs.html.Sanitizer.prototype.allowAttributes =
    function(elementNames, attrNames, opt_rewriteValue) {
  if (!goog.isArray(elementNames)) {
    elementNames = [elementNames];
  }
  if (!goog.isArray(attrNames)) {
    attrNames = [attrNames];
  }
  goog.asserts.assert(
      !opt_rewriteValue || 'function' === typeof opt_rewriteValue,
      'opt_rewriteValue should be a function');

  var whitelist = this.whitelist_;
  for (var ei = 0; ei < elementNames.length; ++ei) {
    var elementName = elementNames[ei].toLowerCase();
    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName) ||
        '*' === elementName,
        elementName);
    // If the element has not been white-listed then panic.
    // TODO(user): allow allow{Elements,Attributes} to be called in any
    // order if someone needs it.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      throw new Error(
          'Element "' + elementName + '" must be allowed via ' +
          'allowElements() before attributes can be allowed on it');
    }
    var attrWhitelist = whitelist[elementName];
    for (var ai = 0, an = attrNames.length; ai < an; ++ai) {
      var attrName = attrNames[ai].toLowerCase();
      goog.asserts.assert(
          goog.labs.html.Sanitizer.isValidHtmlName_(attrName), attrName);

      // If the value has already been allowed, then chain the rewriters
      // so that both white-listers' concerns are met.
      // We do not use the default rewriter here since it should have
      // been introduced by the call that created the initial white-list
      // entry.
      attrWhitelist[attrName] = goog.labs.html.Sanitizer.chain_(
          opt_rewriteValue || goog.labs.html.Sanitizer.valueIdentity_,
          Object.prototype.hasOwnProperty.call(attrWhitelist, attrName) ?
              attrWhitelist[attrName] :
              goog.labs.html.Sanitizer.defaultRewriterForAttr_(attrName));
    }
  }
  return this;
};


/**
 * A new object that is as blank as possible.
 *
 * Using {@code Object.create} to create an object with
 * no prototype speeds up whitelist access since there's fewer prototypes
 * to fall-back to for a common case where an element is not in the
 * white-list, and reduces the chance of confusing a member of
 * {@code Object.prototype} with a whitelist entry.
 *
 * @return {!Object.<string, ?>} a reference to a newly allocated object that
 *    does not alias any reference that existed prior.
 * @private
 */
goog.labs.html.Sanitizer.createBlankObject_ = function() {
  // Where Object.create is unavailable, Object(null) yields an ordinary
  // empty object instead of a prototype-less one.
  return (Object.create || Object)(null);
};


/**
 * HTML element and attribute names may be almost arbitrary strings, but the
 * sanitizer is more restrictive as to what can be white-listed.
 *
 * Since HTML is case-insensitive, only lower-case identifiers composed of
 * ASCII letters, digits, and select punctuation are allowed.
 *
 * @param {string} name
 * @return {boolean} true iff name is a valid white-list key.
 * @private
 */
goog.labs.html.Sanitizer.isValidHtmlName_ = function(name) {
  return 'string' === typeof name &&  // Names must be strings.
      // Names must be lower-case and ASCII identifier chars only.
      /^[a-z][a-z0-9\-:]*$/.test(name);
};


/**
 * The identity rewriter: allows the attribute value unchanged.
 *
 * @param {goog.labs.html.AttributeValue} x
 * @return {goog.labs.html.AttributeValue}
 * @private
 */
goog.labs.html.Sanitizer.valueIdentity_ = function(x) {
  return x;
};


/**
 * A rewriter that rejects every value by returning {@code null}.
 *
 * @param {goog.labs.html.AttributeValue} x
 * @return {null}
 * @private
 */
goog.labs.html.Sanitizer.disallow_ = function(x) {
  return null;
};


/**
 * Chains attribute rewriters.
 *
 * @param {goog.labs.html.AttributeRewriter} f
 * @param {goog.labs.html.AttributeRewriter} g
 * @return {goog.labs.html.AttributeRewriter}
 *      a function that returns g(f(x)) or null if f(x) is null.
 * @private
 */
goog.labs.html.Sanitizer.chain_ = function(f, g) {
  // Sometimes white-listing code ends up allowing things multiple times.
  if (f === goog.labs.html.Sanitizer.valueIdentity_) {
    return g;
  }
  if (g === goog.labs.html.Sanitizer.valueIdentity_) {
    return f;
  }
  // If someone tries to white-list a really problematic value, we reject
  // it by returning disallow_.  Disallow it quickly.
  if (f === goog.labs.html.Sanitizer.disallow_) {
    return f;
  }
  if (g === goog.labs.html.Sanitizer.disallow_) {
    return g;
  }
  return (
      /**
       * @param {goog.labs.html.AttributeValue} x
       * @return {goog.labs.html.AttributeValue}
       */
      function(x) {
        var y = f(x);
        // Loose comparison on purpose: both null and undefined from f
        // short-circuit the chain.
        return y != null ? g(y) : null;
      });
};


/**
 * Given an attribute name, returns a value rewriter that enforces some
 * minimal safety properties.
 *
 * <p>
 * For the URL attributes {@code href} and {@code src}, it checks that any
 * protocol is on a safe set that doesn't allow script execution.
 * <p>
 * It also blanket disallows CSS and event handler attributes.
 *
 * @param {string} attrName lower-cased attribute name.
 * @return {goog.labs.html.AttributeRewriter}
 * @private
 */
goog.labs.html.Sanitizer.defaultRewriterForAttr_ = function(attrName) {
  if ('href' === attrName || 'src' === attrName) {
    return goog.labs.html.Sanitizer.checkUrl_;
  } else if ('style' === attrName || 'on' === attrName.slice(0, 2)) {
    // TODO(user): delegate to a CSS sanitizer if one is available.
    return goog.labs.html.Sanitizer.disallow_;
  }
  return goog.labs.html.Sanitizer.valueIdentity_;
};


/**
 * Applied automatically to URL attributes to check that they are safe as per
 * {@link SafeUrl}.
 *
 * @param {goog.labs.html.AttributeValue} attrValue a decoded attribute value.
 * @return {goog.html.SafeUrl | null} a URL that is equivalent to the
 *     input or {@code null} if the input is not a safe URL.
 * @private
 */
goog.labs.html.Sanitizer.checkUrl_ = function(attrValue) {
  if (attrValue == null) {
    return null;
  }
  /** @type {!goog.html.SafeUrl} */
  var safeUrl;
  if (attrValue instanceof goog.html.SafeUrl) {
    safeUrl = /** @type {!goog.html.SafeUrl} */ (attrValue);
  } else {
    if (typeof attrValue === 'string') {
      // Whitespace at the ends of URL-valued attributes in HTML is ignored.
      attrValue = goog.string.trim(/** @type {string} */ (attrValue));
    }
    safeUrl = goog.html.SafeUrl.sanitize(
        /** @type {!goog.string.TypedString | string} */ (attrValue));
  }
  // A result equal to INNOCUOUS_STRING is treated as "not a safe URL", so the
  // attribute is dropped rather than emitted with the placeholder value.
  if (goog.html.SafeUrl.unwrap(safeUrl) ===
      goog.html.SafeUrl.INNOCUOUS_STRING) {
    return null;
  } else {
    return safeUrl;
  }
};


goog.labs.html.attributeRewriterPresubmitWorkaround();