Last month's issue of IRW covered Web Scraping as a Web Mining activity. An app of scrapers, the Web Mining Studio, was unveiled. One of these, the Scripts Scrapers, allows anyone to build a library of scripts from all over the Web. This month's issue of the newsletter, which will be a bit delayed, covers how we grabbed scripts used by search engines. Here is a sample from http://news.google.com.
Scripts Report
7 results found.
| n | HTML |
| 1. | <script type=”text/javascript”> setupjsflags(); </script> |
| 2. | <script type=”text/javascript”> news_logerrors = true </script> |
| 3. | <script type=”text/javascript”> try { window["jstiming"]["load"].tick(”bol”); } catch (e) { news_logerror(e, ”csi:bol”); } </script> |
| 4. | <script type=”text/javascript”>function setupjsflags() { news_flags = {}; news_flag_xhrpathprefix = 0; news_flags[news_flag_xhrpathprefix] = ”/news/xhr”; news_flag_usejsimagefetchtracking = 1; news_flags[news_flag_usejsimagefetchtracking] = false; news_flag_enableemail = 2; news_flags[news_flag_enableemail] = true; news_flag_experiments = 3; news_flags[news_flag_experiments] = ””; news_flag_pingcsi = 4; news_flags[news_flag_pingcsi] = true; news_flag_prefetchcitylist = 5; news_flags[news_flag_prefetchcitylist] = false; news_flag_maxcreatepagetitlelength = 7; news_flags[news_flag_maxcreatepagetitlelength] = 25; news_flag_enablestarring = 8; news_flags[news_flag_enablestarring] = true; news_flag_enable_create_page_suggestions = 9; news_flags[news_flag_enable_create_page_suggestions] = true; news_flag_enable_js_debug = 10; news_flags[news_flag_enable_js_debug] = false } function news_logerror(e, extramessage) { var url = ”/news/xhr/log_error?ned=” + ”us” + ”&error=” + encodeuricomponent(e.name + ”: ” + e.message) + ”&useragent=” + encodeuricomponent(navigator.useragent) + ”&url=” + encodeuricomponent(window.location) + ”&experiments=” + encodeuricomponent(””) + ”&stack=” + encodeuricomponent(e.stack) + ”&errorlocation=” + encodeuricomponent(extramessage); new image().src = url; } function grabjsbundle(jsurl) { var scriptel = document.createelement(”script”); scriptel.src = jsurl; scriptel.onerror = function() { if (window['news_beforeonloadfired']) { return; } news_logerror(new error(”deferred js error”), ”error in download of deferred js: ” + jsurl); }; var head = document.getelementsbytagname(’head’)[0]; head.appendchild(scriptel); }</script> |
| 5. | <script type=”text/javascript”>var a=window,b=”substring”;if(a.location.hash==”#changed”){var c=a.location.href;c=c.substr(0,c.indexof(”#”));var d=[];if(c.indexof(”?”)>-1)for(var e=c[b](c.indexof(”?”)+1).split(”&”),f=0;f<e.length;f++)e[f][b](0,3)!=”zx=”&&e[f][b](0,3)!=”pz=”&&e[f][b](0,5)!=”shid=”&&d.push(e[f]);d.push(”pz=1”);d.push(”zx=”+math.random());a.location=a.location.pathname+”?”+d.join(”&”)}; </script> |
| 6. | <script type=”text/javascript”>var global_window=window;function timer(b){this.t={};this.tick=function(c,d,a){a=a?a:(new date).gettime();this.t[c]=[a,d]};this.tick(”start”,null,b)}var loadtimer=new timer;global_window.jstiming={timer:timer,load:loadtimer};try{global_window.jstiming.pt=global_window.gtbexternal&&global_window.gtbexternal.paget()||global_window.external&&global_window.external.paget}catch(e){}; </script> |
| 7. | <script type=”text/javascript”>window.gbar={};(function(){function g(a,b,c){var d=”on”+b;if(a.addeventlistener)a.addeventlistener(b,c,false);else if(a.attachevent)a.attachevent(d,c);else{var h=a[d];a[d]=function(){var f=h.apply(this,arguments),e=c.apply(this,arguments);return f==undefined?e:e==undefined?f:e&&f}}};var i=window.gbar,k,l;function m(a){var b=window.encodeuricomponent&&(document.forms[0].q||””).value;if(b)a.href=a.href.replace(/([?&])q=[^&]*|$/,function(c,d){return(d||”&”)+”q=”+encodeuricomponent(b)})}i.qs=m;function n(a,b,c,d,h,f){var e=document.getelementbyid(a),j=e.style;if(e){j.left=d?”auto”:b+”px”;j.right=d?b+”px”:”auto”;j.top=c+”px”;j.visibility=l?”hidden”:”visible”;if(h&&f){j.width=h+”px”;j.height=f+”px”}else{n(k,b,c,d,e.offsetwidth,e.offsetheight);l=l?””:a}}}i.tg=function(a){a=a||window.event;var b=a.target||a.srcelement;a.cancelbubble=true;if(k!=null)o(b);else{a=document.createelement(array.every||window.createpopup?”iframe”:”div”);a.frameborder=”0”;a.src=”javascript:’’”;k=b.parentnode.appendchild(a).id=”gbs”;g(document,”click”,i.close);o(b);i.alld&&i.alld(function(){var c=document.getelementbyid(”gbli”);if(c){var d=c.parentnode;d.removechild(c);p(d)}})}};function q(a){var b,c=document.defaultview;if(c&&c.getcomputedstyle){if(a=c.getcomputedstyle(a,””))b=a.direction}else b=a.currentstyle?a.currentstyle.direction:a.style.direction;return b==”rtl”}function o(a){var b=0;if(a.classname!=”gb3”)a=a.parentnode;var c=a.getattribute(”aria-owns”)||”gbi”,d=a.offsetwidth,h=a.offsettop>20?46:24,f=false;do b+=a.offsetleft||0;while(a=a.offsetparent);a=(document.documentelement.clientwidth||document.body.clientwidth)-b-d;d=q(document.body);if(c==”gbi”){var e=document.getelementbyid(”gbi”);i.alli&&i.alli(e);p(e);if(d){b=a;f=true}}else if(!d){b=a;f=true}l!=c&&i.close();n(c,b,h,f)}i.close=function(){l&&n(l,0,0)};function r(a,b){var c=a.firstchild?a.firstchild.classname:”gb2”;a.insertbefore(b,a.firstchild).classname=c}function p(a){for(var 
b,c=window.navextra;c&&(b=c.pop());)r(a,b)}})();</script> |