// htmlprettyprinter.js

// Copyright 2008 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview Provides functions to parse and pretty-print HTML strings.
 *
 */

goog.provide('goog.format.HtmlPrettyPrinter');
goog.provide('goog.format.HtmlPrettyPrinter.Buffer');

goog.require('goog.object');
goog.require('goog.string.StringBuffer');



/**
 * Formatter that makes HTML more human-readable by inserting line breaks
 * around selected tags.
 * TODO(user): Add hierarchical indentation.
 * @param {number=} opt_timeOutMillis Upper bound, in milliseconds, on the time
 *     a single #format call may spend. Once exceeded, the rest of the input is
 *     returned unformatted. Omitted, 0 or negative means no timeout.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
  // Anything that is missing or not strictly positive disables the timeout.
  var timeOutMillis = 0;
  if (opt_timeOutMillis && opt_timeOutMillis > 0) {
    timeOutMillis = opt_timeOutMillis;
  }

  /**
   * Time budget for #format in milliseconds; 0 means unlimited.
   * @type {number}
   * @private
   */
  this.timeOutMillis_ = timeOutMillis;
};


/**
 * Shared printer instance. Starts out null and is filled in by
 * goog.format.HtmlPrettyPrinter.getInstance_ the first time it is called.
 * @type {goog.format.HtmlPrettyPrinter?}
 * @private
 */
goog.format.HtmlPrettyPrinter.instance_ = null;


/**
 * Returns the shared printer, constructing it (with no timeout argument) on
 * first use and caching it in goog.format.HtmlPrettyPrinter.instance_.
 * @return {!goog.format.HtmlPrettyPrinter} The shared instance.
 * @private
 */
goog.format.HtmlPrettyPrinter.getInstance_ = function() {
  var printer = goog.format.HtmlPrettyPrinter.instance_;
  if (!printer) {
    printer = new goog.format.HtmlPrettyPrinter();
    goog.format.HtmlPrettyPrinter.instance_ = printer;
  }
  return printer;
};


/**
 * Static convenience wrapper: pretty prints using the shared instance. See the
 * prototype #format method for details.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 */
goog.format.HtmlPrettyPrinter.format = function(html) {
  var printer = goog.format.HtmlPrettyPrinter.getInstance_();
  return printer.format(html);
};


/**
 * Global regular expression used to tokenize HTML for pretty printing. Every
 * match is one token; the alternatives are tried in this order:
 *   1. an HTML comment,
 *   2. any other construct that starts with a less-than sign followed by an
 *      exclamation mark (e.g. a doctype),
 *   3. a start or end tag, capturing the optional leading slash in group 1 and
 *      the tag name in group 2,
 *   4. a run of text that contains no less-than sign,
 *   5. a single less-than sign that none of the above consumed.
 * Groups 1 and 2 only participate when alternative 3 matched.
 * Because of the g flag, the regex carries its position in lastIndex between
 * exec calls.
 * NOTE(review): '.' does not match line terminators, so a comment spanning
 * several lines is not consumed by alternatives 1 or 2 -- confirm this is
 * intended.
 * @type {RegExp}
 * @private
 */
goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
    /(?:<!--.*?-->|<!.*?>|<(\/?)(\w+)[^>]*>|[^<]+|<)/g;


/**
 * Set of tag names (lower case) whose contents are passed through without any
 * added line breaks.
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ =
    goog.object.createSet('script', 'style', 'pre', 'xmp');


/**
 * 'Block' tags (lower case names). During pretty printing a line break is
 * requested before the start tag and after the end tag of each of these; tags
 * that are also in #goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ get both. Tags
 * drawn mostly from HTML4 definitions for block and other non-inline tags,
 * excepting the ones in
 * #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
 *
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
    'address',
    'applet',
    'area',
    'base',
    'basefont',
    'blockquote',
    'body',
    'caption',
    'center',
    'col',
    'colgroup',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'frame',
    'frameset',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'head',
    'hr',
    'html',
    'iframe',
    'isindex',
    'legend',
    'link',
    'menu',
    'meta',
    'noframes',
    'noscript',
    'ol',
    'optgroup',
    'option',
    'p',
    'param',
    'table',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'title',
    'tr',
    'ul');


/**
 * Non-block tags that break flow (lower case names). A line break is requested
 * after the end tag of each of these (or after the tag itself when it is also
 * in #goog.format.HtmlPrettyPrinter.EMPTY_TAGS_), but never before. Tags drawn
 * from HTML4 definitions.
 *
 * Note: 'noframes' is also listed in #goog.format.HtmlPrettyPrinter.BLOCK_TAGS_,
 * which #format consults first, so that entry has no effect on formatting; it
 * is kept so the set's contents stay as they were.
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ =
    goog.object.createSet('br', 'dd', 'dt', 'li', 'noframes');


/**
 * Set of empty (content-less) tag names, lower case. Such a tag counts as both
 * a start tag and an end tag when deciding where line breaks go.
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ =
    goog.object.createSet('br', 'hr', 'isindex');


/**
 * Breaks up HTML so it's easily readable by the user. The input is tokenized
 * with goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ and every token is copied to
 * the output unchanged; the only modification is the insertion of line breaks
 * around block and flow-breaking tags (and trimming of leading/trailing
 * whitespace). Contents of non-pretty-printed tags are left untouched.
 *
 * If this printer was constructed with a timeout and it is exceeded, the
 * not-yet-processed remainder of the input is appended as-is.
 *
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 * @throws {Error} Regex error, data loss, or endless loop detected.
 */
goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
  // Trim leading whitespace, but preserve first indent; in other words, keep
  // any spaces immediately before the first non-whitespace character (that's
  // what $1 is), but remove all other leading whitespace. This adjustment
  // historically had been made in Docs. The motivation is that some
  // browsers prepend several line breaks in designMode.
  html = html.replace(/^\s*?( *\S)/, '$1');

  // Trim trailing whitespace.
  html = html.replace(/\s+$/, '');

  // Keep track of how much time we've used.
  var timeOutMillis = this.timeOutMillis_;
  var startMillis = timeOutMillis ? goog.now() : 0;

  // Handles concatenation of the result and required line breaks.
  var buffer = new goog.format.HtmlPrettyPrinter.Buffer();

  // Declare these for efficiency since we access them in a loop.
  var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
  var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
  var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
  var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
  var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;

  // The tokenizer is a shared global-flag regex, so its position survives
  // between calls. Always start from the beginning of this input, even if an
  // earlier call was aborted by an exception before it could reset the regex.
  tokenRegex.lastIndex = 0;

  // Used to verify we're making progress through our regex tokenization.
  var lastIndex = 0;

  // Use this to track non-pretty-printed tags and children.
  var nonPpTagStack = [];

  // Loop through each matched token.
  var match;
  while (match = tokenRegex.exec(html)) {
    // Get token.
    var token = match[0];

    // The match array always has one slot per capture group, whether or not
    // the group took part in the match, so its length cannot tell tags from
    // other tokens. The tag name capture is only filled in for tags.
    var tagName = match[2];

    if (tagName) {
      tagName = tagName.toLowerCase();
      var isEndTag = match[1] == '/';

      // Non-pretty-printed tags?
      if (nonPpTags.hasOwnProperty(tagName)) {
        if (isEndTag) {
          // Do we have a matching start tag?
          var stackSize = nonPpTagStack.length;
          var startTagName = stackSize ? nonPpTagStack[stackSize - 1] : null;
          if (startTagName == tagName) {
            // End of non-pretty-printed block. Line break after, but only once
            // the outermost such block has been closed.
            nonPpTagStack.pop();
            buffer.pushToken(false, token, !nonPpTagStack.length);
          } else {
            // Malformed HTML. No line breaks.
            buffer.pushToken(false, token, false);
          }
        } else {
          // Start of non-pretty-printed block. Line break before, unless we
          // are already inside another such block.
          buffer.pushToken(!nonPpTagStack.length, token, false);
          nonPpTagStack.push(tagName);
        }
      } else if (nonPpTagStack.length) {
        // Inside non-pretty-printed block, no new line breaks.
        buffer.pushToken(false, token, false);
      } else if (blockTags.hasOwnProperty(tagName)) {
        // Put line break before start block and after end block tags. Empty
        // tags count as both.
        var isEmpty = emptyTags.hasOwnProperty(tagName);
        buffer.pushToken(isEmpty || !isEndTag, token, isEmpty || isEndTag);
      } else if (breaksFlowTags.hasOwnProperty(tagName)) {
        // Put line break after end flow-breaking tags (and after empty ones).
        buffer.pushToken(
            false, token, isEndTag || emptyTags.hasOwnProperty(tagName));
      } else {
        // All other tags, no line break.
        buffer.pushToken(false, token, false);
      }
    } else {
      // Non-tags (text, comments, declarations, stray '<'), no line break.
      buffer.pushToken(false, token, false);
    }

    // Double check that we're making progress.
    var newLastIndex = tokenRegex.lastIndex;
    if (!token || newLastIndex <= lastIndex) {
      throw Error('Regex failed to make progress through source html.');
    }
    lastIndex = newLastIndex;

    // Out of time?
    if (timeOutMillis && goog.now() - startMillis > timeOutMillis) {
      // Push unprocessed data as one big token and reset regex object. An
      // empty remainder is skipped: pushing it would make the buffer forget
      // that it may already be at a line break and yield a doubled trailing
      // line break.
      var remainder = html.substring(tokenRegex.lastIndex);
      if (remainder) {
        buffer.pushToken(false, remainder, false);
      }
      tokenRegex.lastIndex = 0;
      break;
    }
  }

  // Ensure we end in a line break.
  buffer.lineBreak();

  // Construct result string.
  var result = String(buffer);

  // Length should be original length plus # line breaks added.
  var expectedLength = html.length + buffer.breakCount;
  if (result.length != expectedLength) {
    throw Error('Lost data pretty printing html.');
  }

  return result;
};



/**
 * This class is a buffer to which we push our output. It tracks line breaks to
 * make sure we don't add unnecessary ones, and counts the ones it does add.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter.Buffer = function() {
  /**
   * Accumulates the pushed tokens and inserted line breaks, in order; its
   * contents are what #toString returns.
   * @type {goog.string.StringBuffer}
   * @private
   */
  this.out_ = new goog.string.StringBuffer();
};


/**
 * Number of line breaks this buffer has inserted through #lineBreak. Line
 * breaks that were already part of a pushed token are not counted.
 * @type {number}
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;


/**
 * Tracks if we are at the start of a new line. Starts out true because nothing
 * has been written yet; #pushToken updates it to whether the pushed token ended
 * with a line break. #lineBreak only appends a break while this is false.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;


/**
 * Tracks if we need a new line before the next token. Set by #pushToken when
 * the pushed token asked for a line break after it and did not itself end with
 * one; consulted at the start of the following #pushToken call.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;


/**
 * Adds token and necessary line breaks to output buffer. The token itself is
 * always appended unchanged.
 * @param {boolean} breakBefore If true, add line break before token if
 *     necessary.
 * @param {string} token Token to push.
 * @param {boolean} breakAfter If true, add line break after token if
 *     necessary. The break is deferred until the next token is pushed.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
    breakBefore, token, breakAfter) {
  // A break is wanted if the previous token asked for one after it, or this
  // token asks for one before it.
  var wantsBreak = this.needsNewLine_ || breakBefore;

  // Skip the break when the token already starts with a line break. Due to a
  // FF3.0 bug with lists, we also don't insert a \n right before </ul>; see
  // bug 1520665. The check is anchored to a closing ul tag so that tokens which
  // merely contain "/ul" (URLs, text) still get their line break.
  if (wantsBreak &&
      !/^\r?\n/.test(token) &&
      !/^<\/ul\b/i.test(token)) {
    this.lineBreak();
  }

  // Token.
  this.out_.append(token);

  // Remember if this string ended with a line break so we know we don't have to
  // insert another one before the next token.
  this.isBeginningOfNewLine_ = /\r?\n$/.test(token);

  // Remember if this token requires a line break after it. We don't insert it
  // here because we might not have to if the next token starts with a line
  // break.
  this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
};


/**
 * Append line break if we need one, i.e. unless the buffer is already at the
 * beginning of a new line. Each appended break is counted in #breakCount.
 * Calling this repeatedly without pushing a token in between appends at most
 * one line break.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
  if (this.isBeginningOfNewLine_) {
    return;
  }

  this.out_.append('\n');
  ++this.breakCount;

  // The output now ends with a line break, so record that; otherwise a second
  // call (or a pending break request) would append a redundant one.
  this.isBeginningOfNewLine_ = true;
  this.needsNewLine_ = false;
};


/**
 * @return {string} Everything pushed so far, including inserted line breaks,
 *     as a single string.
 * @override
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
  var text = this.out_.toString();
  return text;
};