首页 > 代码库 > Parse HTML Safely

Parse HTML Safely


jQuery.parseHTML

给定一段HTML代码, 如何将其转化为DOM树以便处理?

如果使用jQuery, 可以使用其$.parseHTML 方法将HTML代码转化为DOM树.

var markup = '<p>' +
        '<img src=http://www.mamicode.com/"http://note.youdao.com/styles/images/icon-1.png?96" data-media-type="image">' +>

翻看jQuery的源代码, 其parse HTML的原理与以下代码一致:

    /**
     * Turn an HTML string into an array of DOM nodes by assigning it as the
     * innerHTML of a freshly created `div` and copying out the div's children.
     *
     * @param {String} markup HTML source to parse
     * @param {Document} [context] document used to create the wrapper element;
     *     the global `document` is used when this is omitted
     * @return {Array} the wrapper's child nodes, in document order
     */
var parseHTMLWithDiv = function (markup, context) {
        var ownerDocument = context || document,
            holder = ownerDocument.createElement('div'),
            collected = [],
            children,
            total,
            position = 0;

        // Let the browser's own HTML parser build the nodes.
        holder.innerHTML = markup;

        // Copy the array-like child list into a plain array.
        children = holder.childNodes;
        total = children.length;
        while (position < total) {
            collected[position] = children[position];
            position += 1;
        }

        return collected;
    };


亦可以将HTML内容放置在隐藏的iframe中进行parse.

/**
 * @param {String} markup
 * @param {Document} [context]
 * @return {Array}
 */
var parseHTMLWithIframe = function (markup, context) {
    context = context || document;
    var iframe = context.createElement('iframe'),
        body,
        index,
        len,
        domArray = [];
    iframe.src = http://www.mamicode.com/'';>

在parse HTML过程中, 如果仔细观察, 可以发现以下几点:

  1. HTML代码中的script不会被执行
    • 保证安全
  2. 浏览器会自动发出图片src的请求, 可以预加载图片
    • 使用div时, 图片会预加载
    • 使用iframe时, 图片加载请求会发出, 但会被取消

DOMParser

如果被parse过后的HTML代码并不需要注入到页面上, parse HTML过程中浏览器自动发出图片src请求就会占用网络请求等资源, 这是不完美的.

该如何做才不会让浏览器自动发出图片src请求呢?

jQuery中有一个与$.parseHTML()类似的方法, 叫做$.parseXML(), 用于parse xml. 查看源码:

/**
 * Cross-browser xml parsing.
 *
 * Parses `data` with `DOMParser` using the "text/xml" type. When parsing
 * throws, yields a falsy result, or the result contains a `parsererror`
 * element, `jQuery.error` is called with "Invalid XML: " followed by `data`.
 *
 * @param {String} data XML source text
 * @return {Document|null|undefined} `null` when `data` is empty or not a
 *     string; otherwise whatever the parse attempt produced (reached only
 *     if `jQuery.error` returns instead of throwing)
 */
jQuery.parseXML = function( data ) {
    var doc,
        unusable;

    // Only non-empty strings are worth handing to the parser.
    if ( typeof data !== "string" || data === "" ) {
        return null;
    }

    // Support: IE9
    // A thrown parse error is folded into the same reporting path as a
    // parsererror document below.
    try {
        doc = new DOMParser().parseFromString( data, "text/xml" );
    } catch ( err ) {
        doc = undefined;
    }

    unusable = !doc || doc.getElementsByTagName( "parsererror" ).length;
    if ( unusable ) {
        jQuery.error( "Invalid XML: " + data );
    }

    return doc;
};


jQuery 使用了 DOMParser 对XML文档进行parse. DOMParser 不但能够parse XML文档, 还能parse HTML文档.

// String values allowed for a SupportedType; this enum is the type of the
// `type` parameter of DOMParser.parseFromString.
enum SupportedType {
  "text/html",
  "text/xml",
  "application/xml",
  "application/xhtml+xml",
  "image/svg+xml"
};

// DOMParser interface; the [Constructor] extended attribute declares that it
// can be instantiated with `new DOMParser()`.
[Constructor]
interface DOMParser {
  // Takes the source string and one of the SupportedType values and returns
  // a Document.
  Document parseFromString(DOMString str, SupportedType type);
};


使用 DOMParser 分析HTML文档时, 浏览器不会自动发出图片src的请求. 在不支持DOMParser的浏览器中, 有一个替代方案: DOMImplementation.createHTMLDocument

/**
 * Parse an HTML snippet into a detached Document.
 *
 * There are two ways to parse an html snippet:
 * 1. parse html in a virtual Document/DOMParser object.
 * 2. create a `div` element as wrapper and set html as its innerHTML.
 *
 * This function implements the 1st way, which can prevent loading the images
 * contained in the html and is safer.
 *
 * NOTE: This function does not support IE8 and below.
 *
 * @param {String} markup the html string that can be set as the innerHTML
 *     of <body/>
 * @param {Document} [context] document whose `implementation` (or whose
 *     window's `DOMParser`) is used; defaults to the global `document`
 * @return {Document|null} a document whose body contains the parsed markup,
 *     or null when neither mechanism is available; in that case the caller
 *     can fall back to the 2nd way.
 */
function parseHTML(markup, context) {
    var doc,
        parser,
        win;

    context = context || document;

    if (context.implementation &&
            context.implementation.createHTMLDocument) {
        // Pass an explicit empty title: the title argument was not always
        // optional, and some older browsers throw when it is omitted.
        doc = context.implementation.createHTMLDocument('');
        doc.body.innerHTML = markup;
        return doc;
    }

    win = context.defaultView || window;
    if (win.DOMParser) {
        parser = new win.DOMParser();
        try {
            // Start from an empty HTML document so the markup always lands
            // in <body/>, the same as in the createHTMLDocument branch.
            doc = parser.parseFromString('', 'text/html');
        } catch (ex) {
            // Deliberate best-effort: a parser that throws for 'text/html'
            // is treated as unsupported and reported through the null return.
            doc = null;
        }
        if (doc) {
            doc.body.innerHTML = markup;
            return doc;
        }
    }

    // Neither mechanism worked; return the documented null explicitly.
    return null;
}


Reference

  1. https://code.google.com/p/google-caja/issues/detail?id=1823
  2. http://api.jquery.com/jquery.parsehtml/
  3. https://developer.mozilla.org/en-US/docs/Web/API/DOMParser
  4. http://domparsing.spec.whatwg.org/
  5. https://developer.mozilla.org/en-US/docs/Web/API/DOMImplementation.createHTMLDocument