annotate.js

// Copyright 2006 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/**
 * @fileoverview Methods for annotating occurrences of query terms in text or
 *   in a DOM tree. Adapted from Gmail code.
 *
 */

goog.provide('goog.dom.annotate');
goog.provide('goog.dom.annotate.AnnotateFn');

goog.require('goog.array');
goog.require('goog.asserts');
goog.require('goog.dom');
goog.require('goog.dom.NodeType');
goog.require('goog.dom.safe');
goog.require('goog.html.SafeHtml');


/**
 * A function that takes:
 *   (1) the number of the term that is "hit",
 *   (2) the HTML (search term) to be annotated,
 * and returns the annotated term as an HTML.
 * @typedef {function(number, !goog.html.SafeHtml): !goog.html.SafeHtml}
 */
goog.dom.annotate.AnnotateFn;


/**
 * Calls {@code annotateFn} for each occurrence of a search term in text nodes
 * under {@code node}. Returns the number of hits.
 *
 * @param {Node} node  A DOM node.
 * @param {Array} terms  An array of [searchTerm, matchWholeWordOnly] tuples.
 *   The matchWholeWordOnly value is a per-term attribute because some terms
 *   may be CJK, while others are not. (For correctness, matchWholeWordOnly
 *   should always be false for CJK terms.).
 * @param {goog.dom.annotate.AnnotateFn} annotateFn
 * @param {*=} opt_ignoreCase  Whether to ignore the case of the query
 *   terms when looking for matches.
 * @param {Array.<string>=} opt_classesToSkip  Nodes with one of these CSS class
 *   names (and its descendants) will be skipped.
 * @param {number=} opt_maxMs  Number of milliseconds after which this function,
 *   if still annotating, should stop and return.
 *
 * @return {boolean} Whether any terms were annotated.
 */
goog.dom.annotate.annotateTerms = function(node, terms, annotateFn,
                                           opt_ignoreCase,
                                           opt_classesToSkip,
                                           opt_maxMs) {
  if (opt_ignoreCase) {
    terms = goog.dom.annotate.lowercaseTerms_(terms);
  }
  var stopTime = opt_maxMs > 0 ? goog.now() + opt_maxMs : 0;

  return goog.dom.annotate.annotateTermsInNode_(
      node, terms, annotateFn, opt_ignoreCase, opt_classesToSkip || [],
      stopTime, 0);
};


/**
 * The maximum recursion depth allowed. Any DOM nodes deeper than this are
 * ignored.
 * @type {number}
 * @private
 */
goog.dom.annotate.MAX_RECURSION_ = 200;


/**
 * The node types whose descendants should not be affected by annotation.
 * @type {Array}
 * @private
 */
goog.dom.annotate.NODES_TO_SKIP_ = ['SCRIPT', 'STYLE', 'TEXTAREA'];


/**
 * Recursive helper function.
 *
 * @param {Node} node  A DOM node.
 * @param {Array} terms  An array of [searchTerm, matchWholeWordOnly] tuples.
 *     The matchWholeWordOnly value is a per-term attribute because some terms
 *     may be CJK, while others are not. (For correctness, matchWholeWordOnly
 *     should always be false for CJK terms.).
 * @param {goog.dom.annotate.AnnotateFn} annotateFn
 * @param {*} ignoreCase  Whether to ignore the case of the query terms
 *     when looking for matches.
 * @param {Array.<string>} classesToSkip  Nodes with one of these CSS class
 *     names will be skipped (as will their descendants).
 * @param {number} stopTime  Deadline for annotation operation (ignored if 0).
 * @param {number} recursionLevel  How deep this recursive call is; pass the
 *     value 0 in the initial call.
 * @return {boolean} Whether any terms were annotated.
 * @private
 */
goog.dom.annotate.annotateTermsInNode_ =
    function(node, terms, annotateFn, ignoreCase, classesToSkip,
             stopTime, recursionLevel) {
  if ((stopTime > 0 && goog.now() >= stopTime) ||
      recursionLevel > goog.dom.annotate.MAX_RECURSION_) {
    return false;
  }

  var annotated = false;

  if (node.nodeType == goog.dom.NodeType.TEXT) {
    var html = goog.dom.annotate.helpAnnotateText_(node.nodeValue, terms,
                                                   annotateFn, ignoreCase);
    if (html != null) {
      // Replace the text with the annotated html. First we put the html into
      // a temporary node, to get its DOM structure. To avoid adding a wrapper
      // element as a side effect, we'll only actually use the temporary node's
      // children.
      var tempNode = goog.dom.getOwnerDocument(node).createElement('SPAN');
      goog.dom.safe.setInnerHtml(tempNode, html);

      var parentNode = node.parentNode;
      var nodeToInsert;
      while ((nodeToInsert = tempNode.firstChild) != null) {
        // Each parentNode.insertBefore call removes the inserted node from
        // tempNode's list of children.
        parentNode.insertBefore(nodeToInsert, node);
      }

      parentNode.removeChild(node);
      annotated = true;
    }
  } else if (node.hasChildNodes() &&
             !goog.array.contains(goog.dom.annotate.NODES_TO_SKIP_,
                 node.tagName)) {
    var classes = node.className.split(/\s+/);
    var skip = goog.array.some(classes, function(className) {
      return goog.array.contains(classesToSkip, className);
    });

    if (!skip) {
      ++recursionLevel;
      var curNode = node.firstChild;
      var numTermsAnnotated = 0;
      while (curNode) {
        var nextNode = curNode.nextSibling;
        var curNodeAnnotated = goog.dom.annotate.annotateTermsInNode_(
            curNode, terms, annotateFn, ignoreCase, classesToSkip,
            stopTime, recursionLevel);
        annotated = annotated || curNodeAnnotated;
        curNode = nextNode;
      }
    }
  }

  return annotated;
};


/**
 * Regular expression that matches non-word characters.
 *
 * Performance note: Testing a one-character string using this regex is as fast
 * as the equivalent string test ("a-zA-Z0-9_".indexOf(c) < 0), give or take a
 * few percent. (The regex is about 5% faster in IE 6 and about 4% slower in
 * Firefox 1.5.) If performance becomes critical, it may be better to convert
 * the character to a numerical char code and check whether it falls in the
 * word character ranges. A quick test suggests that could be 33% faster.
 *
 * @type {RegExp}
 * @private
 */
goog.dom.annotate.NONWORD_RE_ = /\W/;


/**
 * Annotates occurrences of query terms in plain text. This process consists of
 * identifying all occurrences of all query terms, calling a provided function
 * to get the appropriate replacement HTML for each occurrence, and
 * HTML-escaping all the text.
 *
 * @param {string} text  The plain text to be searched.
 * @param {Array} terms  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 *   The matchWholeWordOnly value is a per-term attribute because some terms
 *   may be CJK, while others are not. (For correctness, matchWholeWordOnly
 *   should always be false for CJK terms.).
 * @param {goog.dom.annotate.AnnotateFn} annotateFn
 * @param {*=} opt_ignoreCase  Whether to ignore the case of the query
 *   terms when looking for matches.
 * @return {goog.html.SafeHtml} The HTML equivalent of {@code text} with terms
 *   annotated, or null if the text did not contain any of the terms.
 */
goog.dom.annotate.annotateText = function(text, terms, annotateFn,
                                          opt_ignoreCase) {
  if (opt_ignoreCase) {
    terms = goog.dom.annotate.lowercaseTerms_(terms);
  }
  return goog.dom.annotate.helpAnnotateText_(text, terms, annotateFn,
                                             opt_ignoreCase);
};


/**
 * Annotates occurrences of query terms in plain text. This process consists of
 * identifying all occurrences of all query terms, calling a provided function
 * to get the appropriate replacement HTML for each occurrence, and
 * HTML-escaping all the text.
 *
 * @param {string} text  The plain text to be searched.
 * @param {Array} terms  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 *   If {@code ignoreCase} is true, each search term must already be lowercase.
 *   The matchWholeWordOnly value is a per-term attribute because some terms
 *   may be CJK, while others are not. (For correctness, matchWholeWordOnly
 *   should always be false for CJK terms.).
 * @param {goog.dom.annotate.AnnotateFn} annotateFn
 * @param {*} ignoreCase  Whether to ignore the case of the query terms
 *   when looking for matches.
 * @return {goog.html.SafeHtml} The HTML equivalent of {@code text} with terms
 *   annotated, or null if the text did not contain any of the terms.
 * @private
 */
goog.dom.annotate.helpAnnotateText_ = function(text, terms, annotateFn,
                                               ignoreCase) {
  var hit = false;
  var resultHtml = null;
  var textToSearch = ignoreCase ? text.toLowerCase() : text;
  var textLen = textToSearch.length;
  var numTerms = terms.length;

  // Each element will be an array of hit positions for the term.
  var termHits = new Array(numTerms);

  // First collect all the hits into allHits.
  for (var i = 0; i < numTerms; i++) {
    var term = terms[i];
    var hits = [];
    var termText = term[0];
    if (termText != '') {
      var matchWholeWordOnly = term[1];
      var termLen = termText.length;
      var pos = 0;
      // Find each hit for term t and append to termHits.
      while (pos < textLen) {
        var hitPos = textToSearch.indexOf(termText, pos);
        if (hitPos == -1) {
          break;
        } else {
          var prevCharPos = hitPos - 1;
          var nextCharPos = hitPos + termLen;
          if (!matchWholeWordOnly ||
              ((prevCharPos < 0 ||
                goog.dom.annotate.NONWORD_RE_.test(
                    textToSearch.charAt(prevCharPos))) &&
               (nextCharPos >= textLen ||
                goog.dom.annotate.NONWORD_RE_.test(
                    textToSearch.charAt(nextCharPos))))) {
            hits.push(hitPos);
            hit = true;
          }
          pos = hitPos + termLen;
        }
      }
    }
    termHits[i] = hits;
  }

  if (hit) {
    var html = [];
    var pos = 0;

    while (true) {
      // First determine which of the n terms is the next hit.
      var termIndexOfNextHit;
      var posOfNextHit = -1;

      for (var i = 0; i < numTerms; i++) {
        var hits = termHits[i];
        // pull off the position of the next hit of term t
        // (it's always the first in the array because we're shifting
        // hits off the front of the array as we process them)
        // this is the next candidate to consider for the next overall hit
        if (!goog.array.isEmpty(hits)) {
          var hitPos = hits[0];

          // Discard any hits embedded in the previous hit.
          while (hitPos >= 0 && hitPos < pos) {
            hits.shift();
            hitPos = goog.array.isEmpty(hits) ? -1 : hits[0];
          }

          if (hitPos >= 0 && (posOfNextHit < 0 || hitPos < posOfNextHit)) {
            termIndexOfNextHit = i;
            posOfNextHit = hitPos;
          }
        }
      }

      // Quit if there are no more hits.
      if (posOfNextHit < 0) break;

      // Remove the next hit from our hit list.
      termHits[termIndexOfNextHit].shift();

      // Append everything from the end of the last hit up to this one.
      html.push(text.substr(pos, posOfNextHit - pos));

      // Append the annotated term.
      var termLen = terms[termIndexOfNextHit][0].length;
      var termHtml = goog.html.SafeHtml.htmlEscape(
          text.substr(posOfNextHit, termLen));
      html.push(
          annotateFn(goog.asserts.assertNumber(termIndexOfNextHit), termHtml));

      pos = posOfNextHit + termLen;
    }

    // Append everything after the last hit.
    html.push(text.substr(pos));
    return goog.html.SafeHtml.concat(html);
  } else {
    return null;
  }
};


/**
 * Converts terms to lowercase.
 *
 * @param {Array} terms  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 * @return {!Array}  An array of
 *   [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
 * @private
 */
goog.dom.annotate.lowercaseTerms_ = function(terms) {
  var lowercaseTerms = [];
  for (var i = 0; i < terms.length; ++i) {
    var term = terms[i];
    lowercaseTerms[i] = [term[0].toLowerCase(), term[1]];
  }
  return lowercaseTerms;
};