可以将整个HTML文档加载到Internet Explorer中的文档片段中吗? [英] Can I load an entire HTML document into a document fragment in Internet Explorer?

查看:87
本文介绍了可以将整个HTML文档加载到Internet Explorer中的文档片段中吗?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

这是我遇到的一些困难。我有一个本地的客户端脚本,需要允许用户获取一个远程网页,并搜索结果页面的表单。为了这样做(没有正则表达式),我需要将文档解析为一个完全可遍历的DOM对象。



我想强调的一些限制:




  • 我不想使用库(如jQuery)。

  • 在任何情况下,远程页面的脚本都不会被执行(出于安全考虑)。

  • DOM API,如 getElementsByTagName ,需要可用。

  • 它只需要在Internet Explorer中工作,但在7至少。

  • 假设我无法访问服务器。我这样做,但我不能使用它。



我尝试过的



假设我在变量 html 中有一个完整的HTML文档字符串(包括DOCTYPE声明),这是我迄今为止所尝试的:

  var frag = document.createDocumentFragment(),
div = frag.appendChild(document.createElement(div));

div.outerHTML = html;
// - >导致一个空的片段

div.insertAdjacentHTML(afterEnd,html);
// - > HTML不添加到片段

div.innerHTML = html;
// - >错误(预期,但我尝试了)

var doc = new ActiveXObject(htmlfile);
doc.write(html);
doc.close();
// - > JavaScript执行

我也尝试过从HTML中提取 <head> 和 <body> 节点,并将它们添加到片段内的 <HTML> 元素中,但仍然没有成功。

有没有人有任何想法?

解决方案

小提琴 http://jsfiddle.net/JFSKe/6/



DocumentFragment 不实现DOM方法。将 document.createElement 与 innerHTML 结合使用会删除 <head> 和 <body> 标签(即使创建的元素是根元素 <html>)。因此,应该在其他地方寻求解决方案。我创建了一个跨浏览器的"字符串转DOM"函数,它使用了一个不可见的内联框架(iframe)。



所有外部资源和脚本将被禁用。



代码



  / * 
@param String html已转换为DOM对象的HTML字符串
@param func callback(可选)回调(HTMLDocument doc,功能销毁)$ b $如果回调存在,b @returns undefined否则:Object
从参数:html
中获取HTMLDocument doc DOM删除HTMLDocument文档。 * /
函数string2dom(html,callback){
/ *消除字符串* /
html = sanitiseHTML(html); / *定义在答案的底部* /

/ *创建一个IFrame * /
var iframe = document.createElement(iframe);
iframe.style.display =none;
document.body.appendChild(iframe);

var doc = iframe.contentDocument || iframe.contentWindow.document;
doc.open();
doc.write(html);
doc.close();

函数destroy(){
iframe.parentNode.removeChild(iframe);
}
if(callback)回调(doc,destroy);
else return {doc:doc,destroy:destroy};
}

/ * @name sanitiseHTML
@param String html代表HTML代码的字符串
@return String一个新的字符串,完全剥离了外部资源。
所有外部属性(href,src)以data- * /

函数为前缀加上sanitiseHTML(html){
/ *添加<! - \ ' - >在每个匹配的标签之前,以便未终止的引号
不阻止浏览器分割标签测试用例:
'< input style =foo; b:url(0 );>< input onclick =< input type = button onclick =too()href =;>>'* /
var prefix =& - >中;
/ *属性不应以这些字符为前缀。此列表不是
完成,但对于此功能将是足够的。
(参见http://www.w3.org/TR/REC-xml/#NT-NameChar)* /
var att =[^ -a-z0-9:._] ;
var tag =< [a-z];
var any =(?:[^ ^\'] *(?: \[^ \] * \|'[^'] *'))* ?[^<>] *;
var etag =(?:> |(?=<));

/ *
@name ae
@description转换
原始输入和HTML实体的序列中的给定字符串
@param String string字符串转换
* /
var entityEnd =(?:; |(?!\\d));
var ents = {:(?: \\s |& nbsp;?&#0 * 32+ entityEnd +|&#x0 * 20+ entityEnd +) ,
(:(?: \\(|&#0 * 40+ entityEnd +|&#x0 * 28+ entityEnd +),
) :(?: \\)|&#0 * 41+ entityEnd +|&#x0 * 29+ entityEnd +),
\\ |&安培;#0 * 46 + entityEnd + |&安培;#X0 * 2E + entityEnd +)};
/ *占位符避免棘手的过滤器规避方法* /
var charMap = {};
var s = ents [] +*; / *短手空间* /
/ *重要提示:必须由<和>。 RE匹配整个标签! * /
函数ae(string){
var all_chars_lowercase = string.toLowerCase();
if(ents [string])return ents [string];
var all_chars_uppercase = string.toUpperCase();
var RE_res =; (var i = 0; i< string.length; i ++)
{
var char_lowercase = all_chars_lowercase.charAt(i);
if(charMap [char_lowercase]){
RE_res + = charMap [char_lowercase];
继续;
}
var char_uppercase = all_chars_uppercase.charAt(i);
var RE_sub = [char_lowercase];
RE_sub.push(&#0 *+ char_lowercase.charCodeAt(0)+ entityEnd);
RE_sub.push(&#x0 *+ char_lowercase.charCodeAt(0).toString(16)+ entityEnd);
if(char_lowercase!= char_uppercase){
RE_sub.push(&#0 *+ char_uppercase.charCodeAt(0)+ entityEnd);
RE_sub.push(&#x0 *+ char_uppercase.charCodeAt(0).toString(16)+ entityEnd);
}
RE_sub =(?:+ RE_sub.join(|)+);
RE_res + =(charMap [char_lowercase] = RE_sub);
}
return(ents [string] = RE_res);
}
/ *
@name由
@description替换函数的第二个参数。
* /
函数(match,group1,group2){
/ *在每个外部指针之前添加数据前缀* /
返回group1 +data-+ group2
}
/ *
@name cr
@description选择一个HTML元素并执行
搜索和替换属性
@param字符串选择器HTML子字符串匹配
@param String属性RegExp-escaped; HTML元素属性匹配
@param String marker可选RegExp-escaped;标记前缀
@param String delimiter可选RegExp转义;非引号分隔符
@param String end可选RegExp转义;在< end>发生之前强制匹配
结束当
引号丢失
* /
函数cr(selector,attribute,marker,delimiter,end){
if(typeof selector ==string)selector = new RegExp (选择器,gi);
marker = typeof marker ==string?标记:\\s * =;
delimiter = typeof delimiter ==string?分隔符:;
end = typeof end ==string?结束 : ;
var is_end = end&& ?;
var re1 = new RegExp((+ att +)(+ attribute + marker +(?:\\s * \[^ \+ delimiter +] * \ |+++++++++++++) b $ b html = html.replace(selector,function(match){
return prefix + match.replace(re1,by);
});
}
/ *
@name cri
@description选择HTML元素的属性,
对某些值执行搜索替换
@param String selector HTML元素以匹配
@param String属性RegExp-escaped; HTML元素属性匹配
@param String front RegExp-escaped;属性值,匹配前缀
@param String flags可选的RegExp标志,默认gi
@param String delimiter可选的RegExp-escaped;非引号分隔符
@param String end可选RegExp-escaped; f在< end>发生之前,将匹配到
结束。当
引号丢失
* /
函数cri(selector,attribute,front,flags,delimiter,end){
if(typeof selector ==string)selector =新的RegExp(选择器,gi);
flags = typeof flags ==string?标志:gi;
var re1 = new RegExp((+ att + attribute +\\s * =)((?: \\s * \[^ \* * \\s *'[^ \\s>] +)),gi);

end = typeof end ==string ?);
var at1 = new RegExp('()('+ front +'[^] +)',flags);
var at2 = new RegExp((')(+ front +[^'] +'),flags);
var at3 = new RegExp(()(+ front +(?:[^] +| \'[^ \'] + \'|(?:(? +分隔符+'))+)'+ end,flags);

var handleAttr = function(match,g1,g2){
if(g2.charAt(0)==' ')return g1 + g2.replace(at1,by);
if(g2.charAt(0)==')return g1 + g2.replace(at2,by);
return g1 + g2.replace(at3,by);
};
html = html.replace(selector,function(match){
return prefix + match.replace(re1,handleAttr);
});
}

/ *< meta http-equiv = refresh content =; url => * /
html = html.replace (new RegExp(< meta+ any + att +http-equiv\\\s * = \\s *(?: \+ ae(refresh)+\ + any + etag +|+ ae(refresh)+'+ any + etag +|+ ae(refresh)+(?:+ ae()+ any + etag + |+ etag +)),gi),<! - meta http-equiv = refresh stripped - >);

/ *剥离所有脚本* /
html = html.re place(new RegExp(< script+ any +> \\s * // \\s *< \\ [CDATA\\ [[\\S\ \s] *?]]> \\s *< / script [^>] *>,gi),<! - CDATA script - >);
html = html.replace(/< script [\S\s] +?< \ / script\s *> / gi,<! - 非CDATA脚本 - - >中);在[-a-z0-9:_。] + =+ any + etag上,在[-a-z0-9:_。] +)上的
cr(tag + any + att + / *事件监听器* /

cr(标记+ any + att +href\\\s * =+ any + etag,href); / *链接元素* /
cr(tag + any + att +src\\s * =+ any + etag,src); / *嵌入元素* /

cr(< object+ any + att +data\\\s * =+ any + etag,data); / *< object data => * /
cr(< applet+ any + att +codebase\\\s * =+ any + etag,codebase); / *< applet codebase => * /

/ *< param name = movie value => * /
cr(¶m+ any + att +name \\s * = \ \s *(?:\ + AE( 电影)+ \ +任何+ ETAG + | ' + AE( 电影 )+' +任何+ ETAG + | + ae(movie)+(?:+ ae()+ any + etag +|+ etag +)),value);

/ *< style>和& style => url()* /
cr(/< style [^>] *>(?:[^'] *(?:[^] * )*?[^'] *(?:< \ / style | $)/ gi,url,\\s * \\(\\s *, ,\\s * \\);
cri(tag + any + att +style \\s * =+ any + etag,style,ae url)+ s + ae(()+ s,0,s + ae()),ae()));

/ * IE7- CSS expression * /
cr(/< style [^>] *>(?:[^'] *(?:[^] *|'[^'] *'))* ?[^'] *(?:< \ / style | $)/ gi,expression,\\s * \\(\\\s *, \\s * \\));
cri(tag + any + att +style \\s * =+ any + etag,style,ae(expression)+ s + ae(()+ s, s + ae()),ae()));
return html.replace(new RegExp((?:+ prefix +)+,g),前缀);
}



代码说明



sanitiseHTML 函数基于我的 replace_all_rel_by_abs 函数(请参阅这个答案)。 sanitiseHTML 功能是完全重写的,以达到最大的效率和可靠性。



另外,添加了RegExps以删除所有脚本和事件处理程序(包括CSS expression(),IE7-)。要确保所有标签都按预期解析,调整后的标签前缀为<! - ' - > 。此前缀对于正确解析嵌套事件处理程序结合未终止的引号:< a id =>< input onclick =< div onmousemove = evil()>>



这些RegExps是使用内部函数 cr / cri(Create Replace [Inline])动态创建的。这些函数接受参数列表,并创建并执行高级RE替换。为了确保HTML实体不会破坏RegExp(<meta http-equiv=refresh> 中的 refresh 可以以各种方式编写),动态创建的RegExps部分由函数 ae(Any Entity)构造。

实际的替换是由函数 by(replace by)完成的。在这个实现中,by 会在所有匹配的属性之前添加 data- 前缀。


  1. 所有 <script>//<[CDATA[ .. //]]></script> 都会被剥离。这个步骤是必要的,因为 CDATA 部分允许代码中出现 </script> 字符串。执行此替换后,可以安全地进行下一次替换:

  2. 剩余的< script> ...< / script> 标签被删除

  3. <meta http-equiv=refresh ..> 标签被删除。
  4. 如前所述,所有事件侦听器和外部指针/属性( href 、 src 、 url() )都会加上 data- 前缀。


  5. 创建 IFrame 对象。 IFrames不太可能泄漏内存(与htmlfile ActiveXObject相反)。 IFrame变得不可见,并附加到文档中,以便可以访问DOM。 document.write()用于将HTML写入IFrame。 document.open() document.close()用于清空文档的以前内容,以便生成的文档是给定的 html 字符串的精确副本。


  6. 如果已指定回调函数,函数将被调用两个参数。 第一个参数是对生成的文档对象的引用。 第二个参数是一个函数,在调用时会破坏生成的DOM树。当您不再需要该树时,应该调用此函数。
    如果未指定回调函数,该函数将返回一个由两个属性组成的对象( doc destroy ),其行为与前面提到的参数相同。



附加说明




  • 将 designMode 属性设置为 "On" 会阻止框架执行脚本(Chrome中不支持)。如果由于特定原因必须保留 <script> 标签,则可以使用 iframe.designMode = "On" 来代替脚本剥离功能。

  • 我无法找到可靠的 htmlfile activeXObject 的源代码。根据这个来源 htmlfile 比IFrames慢,更容易发生内存泄漏。


  • 所有受影响的属性( href src ,...)前缀为 data - data-href 显示获取/更改这些属性的示例:
    elem.getAttribute(data-href) elem.setAttribute(data-href,...)
    elem.dataset.href elem.dataset.href =...

  • 外部资源已被禁用。因此,该页面可能看起来完全不同: <link rel="stylesheet" href="main.css" /> 没有外部样式
    < script> document.body.bgColor =red;< / script> 没有脚本样式
    < img src =128x128.png/> 否图片:元素的大小可能是完全不同的。



示例



sanitiseHTML(html)

将此书签粘贴到该位置的栏中。它将提供一个注入textarea的选项,显示消毒的HTML字符串。

  javascript:void(function(){var s =使用document.createElement( 脚本); s.src = http://rob.lekensteyn.nl/html-sanitizer.js; document.body.appendChild(S)})(); 

代码示例 - string2dom(html)

  string2dom(< html>< head>< title> Test< / title> ;< / head>< / html>,function(doc,destroy){
alert(doc.title); / * Alert:Test* /
destroy();
});

var test = string2dom(< div id ='secret'>< / div>);
alert(test.doc.getElementById(secret)。tagName); / *提醒:DIV* /
test.destroy();



值得注意的引用




Here's something I've been having a little bit of difficulty with. I have a local client-side script that needs to allow a user to fetch a remote web page and search that resulting page for forms. In order to do this (without regex), I need to parse the document into a fully traversable DOM object.

Some limitations I'd like to stress:

  • I don't want to use libraries (like jQuery). There's too much bloat for what I need to do here.
  • Under no circumstances should scripts from the remote page be executed (for security reasons).
  • DOM APIs, such as getElementsByTagName, need to be available.
  • It only needs to work in Internet Explorer, but in 7 at the very least.
  • Let's pretend I don't have access to a server. I do, but I can't use it for this.

What I've tried

Assuming I have a complete HTML document string (including DOCTYPE declaration) in the variable html, here's what I've tried so far:

// Attempts listed by the asker; `html` holds the complete HTML document string
// (DOCTYPE included). Each result comment is the outcome the asker reported.
var frag = document.createDocumentFragment(),
div  = frag.appendChild(document.createElement("div"));

// Attempt 1: replace the placeholder <div> by assigning the document to outerHTML.
div.outerHTML = html;
//-> results in an empty fragment

// Attempt 2: insert the document as a sibling after the placeholder <div>.
div.insertAdjacentHTML("afterEnd", html);
//-> HTML is not added to the fragment

// Attempt 3: assign the document to the placeholder's innerHTML.
div.innerHTML = html;
//-> Error (expected, but I tried it anyway)

// Attempt 4: write the document into an "htmlfile" ActiveX document.
var doc = new ActiveXObject("htmlfile");
doc.write(html);
doc.close();
//-> JavaScript executes

I've also tried extracting the <head> and <body> nodes from the HTML and adding them to a <HTML> element inside the fragment, still no luck.

Does anyone have any ideas?

解决方案

Fiddle: http://jsfiddle.net/JFSKe/6/

DocumentFragment doesn't implement DOM methods. Using document.createElement in conjunction with innerHTML removes the <head> and <body> tags (even when the created element is a root element, <html>). Therefore, the solution should be sought elsewhere. I have created a cross-browser string-to-DOM function, which makes use of an invisible inline-frame.

All external resources and scripts will be disabled. See Explanation of the code for more information.

Code

/*
 Builds a DOM document from an HTML string by writing the sanitised markup
 into a hidden IFrame that is appended to the current page.

 @param String html    The HTML string which has to be converted to a DOM object
 @param func callback  (optional) Callback(HTMLDocument doc, function destroy)
 @returns              undefined if a callback is given, else an Object with:
                        HTMLDocument doc  Document parsed from Parameter:html
                        function destroy  Removes the IFrame hosting doc.  */
function string2dom(html, callback){
    /* sanitiseHTML is defined at the bottom of the answer */
    var safeMarkup = sanitiseHTML(html);

    /* The frame is attached to the page before its document is requested */
    var frame = document.createElement("iframe");
    frame.style.display = "none";
    document.body.appendChild(frame);

    /* Fall back to contentWindow.document when contentDocument is unavailable */
    var frameDoc = frame.contentDocument || frame.contentWindow.document;
    frameDoc.open();
    frameDoc.write(safeMarkup);
    frameDoc.close();

    /* Detaches the hidden frame from the page */
    function destroy(){
        frame.parentNode.removeChild(frame);
    }

    if(!callback){
        return {"doc": frameDoc, "destroy": destroy};
    }
    callback(frameDoc, destroy);
}

/* @name sanitiseHTML
   @description        Regex-based neutraliser for an HTML string: strips
                       <script> blocks and <meta http-equiv=refresh> tags, and
                       renames event handlers and resource-loading attributes /
                       CSS functions by prefixing them with "data-".
   @param String html  A string representing HTML code
   @return String      A new string, fully stripped of external resources.
                       All "external" attributes (href, src) are prefixed by data-

   Changes relative to the original implementation:
    - Event-handler attributes written with whitespace before the equals sign
      (e.g. <a onclick ="...">) are now selected; the selector previously
      required "=" to follow the attribute name immediately, unlike the
      href/src selectors, so such handlers were left intact.
    - The CDATA script pass now also matches the real CDATA opener
      "<![CDATA[" (the "!" is optional, so "<[CDATA[" still matches). */

function sanitiseHTML(html){
    /* Adds a <!-\"'--> before every matched tag, so that unterminated quotes
        aren't preventing the browser from splitting a tag. Test case:
       '<input style="foo;b:url(0);><input onclick="<input type=button onclick="too() href=;>">' */
    var prefix = "<!--\"'-->";
    /*Attributes should not be prefixed by these characters. This list is not
     complete, but will be sufficient for this function.
      (see http://www.w3.org/TR/REC-xml/#NT-NameChar) */
    var att = "[^-a-z0-9:._]";
    var tag = "<[a-z]";
    /* Tag interior: optional quoted runs, then anything except < and > */
    var any = "(?:[^<>\"']*(?:\"[^\"]*\"|'[^']*'))*?[^<>]*";
    /* End of tag: either ">" or the start of the next "<" (unterminated tag) */
    var etag = "(?:>|(?=<))";

    /*
      @name ae
      @description          Converts a given string in a sequence of the
                             original input and the HTML entity
      @param String string  String to convert
      */
    /* A numeric entity ends at ";" or where no further digit follows */
    var entityEnd = "(?:;|(?!\\d))";
    /* Pre-built patterns for characters that are special in a RegExp.
       ae() also caches whole-string results in this object. */
    var ents = {" ":"(?:\\s|&nbsp;?|&#0*32"+entityEnd+"|&#x0*20"+entityEnd+")",
                "(":"(?:\\(|&#0*40"+entityEnd+"|&#x0*28"+entityEnd+")",
                ")":"(?:\\)|&#0*41"+entityEnd+"|&#x0*29"+entityEnd+")",
                ".":"(?:\\.|&#0*46"+entityEnd+"|&#x0*2e"+entityEnd+")"};
                /*Placeholder to avoid tricky filter-circumventing methods*/
    /* Per-character cache of the alternation built by ae() */
    var charMap = {};
    var s = ents[" "]+"*"; /* Short-hand space */
    /* Important: Must be pre- and postfixed by < and >. RE matches a whole tag! */
    function ae(string){
        var all_chars_lowercase = string.toLowerCase();
        if(ents[string]) return ents[string];
        var all_chars_uppercase = string.toUpperCase();
        var RE_res = "";
        for(var i=0; i<string.length; i++){
            var char_lowercase = all_chars_lowercase.charAt(i);
            if(charMap[char_lowercase]){
                RE_res += charMap[char_lowercase];
                continue;
            }
            var char_uppercase = all_chars_uppercase.charAt(i);
            /* Literal character plus its decimal and hexadecimal entities */
            var RE_sub = [char_lowercase];
            RE_sub.push("&#0*" + char_lowercase.charCodeAt(0) + entityEnd);
            RE_sub.push("&#x0*" + char_lowercase.charCodeAt(0).toString(16) + entityEnd);
            if(char_lowercase !== char_uppercase){
                /* Entities of the upper-case variant as well */
                RE_sub.push("&#0*" + char_uppercase.charCodeAt(0) + entityEnd);
                RE_sub.push("&#x0*" + char_uppercase.charCodeAt(0).toString(16) + entityEnd);
            }
            RE_sub = "(?:" + RE_sub.join("|") + ")";
            RE_res += (charMap[char_lowercase] = RE_sub);
        }
        return(ents[string] = RE_res);
    }
    /*
      @name by
      @description  second argument for the replace function.
      */
    function by(match, group1, group2){
        /* Adds a data-prefix before every external pointer */
        return group1 + "data-" + group2;
    }
    /*
      @name cr
      @description            Selects a HTML element and performs a
                                  search-and-replace on attributes
      @param String selector  HTML substring to match (String or RegExp)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String marker    Optional RegExp-escaped; marks the prefix
                                  (default: optional whitespace followed by "=")
      @param String delimiter Optional RegExp escaped; non-quote delimiters
      @param String end       Optional RegExp-escaped; forces the match to
                                  end before an occurrence of <end> when
                                  quotes are missing
     */
    function cr(selector, attribute, marker, delimiter, end){
        if(typeof selector === "string") selector = new RegExp(selector, "gi");
        marker = typeof marker === "string" ? marker : "\\s*=";
        delimiter = typeof delimiter === "string" ? delimiter : "";
        end = typeof end === "string" ? end : "";
        /* With an explicit end, the unquoted value is matched lazily */
        var is_end = end && "?";
        var re1 = new RegExp("("+att+")("+attribute+marker+"(?:\\s*\"[^\""+delimiter+"]*\"|\\s*'[^'"+delimiter+"]*'|[^\\s"+delimiter+"]+"+is_end+")"+end+")", "gi");
        html = html.replace(selector, function(match){
            return prefix + match.replace(re1, by);
        });
    }
    /*
      @name cri
      @description            Selects an attribute of a HTML element, and
                               performs a search-and-replace on certain values
      @param String selector  HTML element to match (String or RegExp)
      @param String attribute RegExp-escaped; HTML element attribute to match
      @param String front     RegExp-escaped; attribute value, prefix to match
      @param String flags     Optional RegExp flags, default "gi" (any
                                  non-string value selects the default)
      @param String delimiter RegExp-escaped; non-quote delimiters. Has no
                                  default: it is concatenated into the pattern
                                  as-is, so callers must pass a string.
      @param String end       Optional RegExp-escaped; forces the match to
                                  end before an occurrence of <end> when
                                  quotes are missing
     */
    function cri(selector, attribute, front, flags, delimiter, end){
        if(typeof selector === "string") selector = new RegExp(selector, "gi");
        flags = typeof flags === "string" ? flags : "gi";
         var re1 = new RegExp("("+att+attribute+"\\s*=)((?:\\s*\"[^\"]*\"|\\s*'[^']*'|[^\\s>]+))", "gi");

        /* The trailing ")" closes the second capture group of at3 */
        end = typeof end === "string" ? end + ")" : ")";
        var at1 = new RegExp('(")('+front+'[^"]+")', flags);
        var at2 = new RegExp("(')("+front+"[^']+')", flags);
        var at3 = new RegExp("()("+front+'(?:"[^"]+"|\'[^\']+\'|(?:(?!'+delimiter+').)+)'+end, flags);

        /* Picks the pattern that fits the quoting style of the attribute value */
        var handleAttr = function(match, g1, g2){
            if(g2.charAt(0) === '"') return g1+g2.replace(at1, by);
            if(g2.charAt(0) === "'") return g1+g2.replace(at2, by);
            return g1+g2.replace(at3, by);
        };
        html = html.replace(selector, function(match){
             return prefix + match.replace(re1, handleAttr);
        });
    }

    /* <meta http-equiv=refresh content="  ; url= " > */
    html = html.replace(new RegExp("<meta"+any+att+"http-equiv\\s*=\\s*(?:\""+ae("refresh")+"\""+any+etag+"|'"+ae("refresh")+"'"+any+etag+"|"+ae("refresh")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "gi"), "<!-- meta http-equiv=refresh stripped-->");

    /* Stripping all scripts */
    /* CDATA-wrapped scripts go first, because they may contain "</script>".
       The "!" is optional so both "<![CDATA[" and "<[CDATA[" are matched. */
    html = html.replace(new RegExp("<script"+any+">\\s*//\\s*<!?\\[CDATA\\[[\\S\\s]*?]]>\\s*</script[^>]*>", "gi"), "<!--CDATA script-->");
    html = html.replace(/<script[\S\s]+?<\/script\s*>/gi, "<!--Non-CDATA script-->");
    /* Whitespace is allowed before "=", consistent with the href/src selectors */
    cr(tag+any+att+"on[-a-z0-9:_.]+\\s*="+any+etag, "on[-a-z0-9:_.]+"); /* Event listeners */

    cr(tag+any+att+"href\\s*="+any+etag, "href"); /* Linked elements */
    cr(tag+any+att+"src\\s*="+any+etag, "src"); /* Embedded elements */

    cr("<object"+any+att+"data\\s*="+any+etag, "data"); /* <object data= > */
    cr("<applet"+any+att+"codebase\\s*="+any+etag, "codebase"); /* <applet codebase= > */

    /* <param name=movie value= >*/
    cr("<param"+any+att+"name\\s*=\\s*(?:\""+ae("movie")+"\""+any+etag+"|'"+ae("movie")+"'"+any+etag+"|"+ae("movie")+"(?:"+ae(" ")+any+etag+"|"+etag+"))", "value");

    /* <style> and < style=  > url()*/
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "url", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("url")+s+ae("(")+s, 0, s+ae(")"), ae(")"));

    /* IE7- CSS expression() */
    cr(/<style[^>]*>(?:[^"']*(?:"[^"]*"|'[^']*'))*?[^'"]*(?:<\/style|$)/gi, "expression", "\\s*\\(\\s*", "", "\\s*\\)");
    cri(tag+any+att+"style\\s*="+any+etag, "style", ae("expression")+s+ae("(")+s, 0, s+ae(")"), ae(")"));
    /* Several passes may have prefixed the same tag; collapse repeated prefixes */
    return html.replace(new RegExp("(?:"+prefix+")+", "g"), prefix);
}

Explanation of the code

The sanitiseHTML function is based on my replace_all_rel_by_abs function (see this answer). The sanitiseHTML function is completely rewritten though, in order to achieve maximum efficiency and reliability.

Additionally, a new set of RegExps are added to remove all scripts and event handlers (including CSS expression(), IE7-). To make sure that all tags are parsed as expected, the adjusted tags are prefixed by <!--'"-->. This prefix is necessary to correctly parse nested "event handlers" in conjunction with unterminated quotes: <a id="><input onclick="<div onmousemove=evil()>">.

These RegExps are dynamically created using an internal function cr/cri (Create Replace [Inline]). These functions accept a list of arguments, and create and execute an advanced RE replacement. To make sure that HTML entities aren't breaking a RegExp (refresh in <meta http-equiv=refresh> could be written in various ways), the dynamically created RegExps are partially constructed by function ae (Any Entity).
The actual replacements are done by function by (replace by). In this implementation, by adds data- before all matched attributes.

  1. All <script>//<[CDATA[ .. //]]></script> occurrences are stripped. This step is necessary, because CDATA sections allow </script> strings inside the code. After this replacement has been executed, it's safe to go to the next replacement:
  2. The remaining <script>...</script> tags are removed.
  3. The <meta http-equiv=refresh .. > tag is removed
  4. All event listeners and external pointers/attributes (href, src, url()) are prefixed by data-, as described previously.

  5. An IFrame object is created. IFrames are less likely to leak memory (contrary to the htmlfile ActiveXObject). The IFrame becomes invisible, and is appended to the document, so that the DOM can be accessed. document.write() is used to write HTML to the IFrame. document.open() and document.close() are used to empty the previous contents of the document, so that the generated document is an exact copy of the given html string.

  6. If a callback function has been specified, the function will be called with two arguments. The first argument is a reference to the generated document object. The second argument is a function, which destroys the generated DOM tree when called. This function should be called when you don't need the tree any more.
    If the callback function isn't specified, the function returns an object consisting of two properties (doc and destroy), which behave the same as the previously mentioned arguments.

Additional notes

  • Setting the designMode property to "On" will stop a frame from executing scripts (not supported in Chrome). If you have to preserve the <script> tags for a specific reason, you can use iframe.designMode = "On" instead of the script stripping feature.
  • I wasn't able to find a reliable source for the htmlfile activeXObject. According to this source, htmlfile is slower than IFrames, and more susceptible to memory leaks.

  • All affected attributes (href, src, ...) are prefixed by data-. An example of getting/changing these attributes is shown for data-href:
    elem.getAttribute("data-href") and elem.setAttribute("data-href", "...")
    elem.dataset.href and elem.dataset.href = "...".
  • External resources have been disabled. As a result, the page may look completely different:
    <link rel="stylesheet" href="main.css" /> No external styles
    <script>document.body.bgColor="red";</script> No scripted styles
    <img src="128x128.png" /> No images: the size of the element may be completely different.

Examples

sanitiseHTML(html)
Paste this bookmarklet in the location's bar. It will offer an option to inject a textarea, showing the sanitised HTML string.

javascript:void(function(){var s=document.createElement("script");s.src="http://rob.lekensteyn.nl/html-sanitizer.js";document.body.appendChild(s)})();

Code examples - string2dom(html):

/* Callback form: the parsed document and its clean-up function are passed in */
string2dom("<html><head><title>Test</title></head></html>", function(parsedDoc, release){
    var pageTitle = parsedDoc.title;
    alert(pageTitle); /* Alert: "Test" */
    release();
});

/* Return-value form: the result exposes the same two members as properties */
var test = string2dom("<div id='secret'></div>");
alert(test.doc.getElementById("secret")["tagName"]); /* Alert: "DIV" */
test.destroy();

Notable references

这篇关于可以将整个HTML文档加载到Internet Explorer中的文档片段中吗?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆