Last month's issue of IRW covered Web Scraping as a Web Mining activity. An app of scrapers, the Web Mining Studio, was unveiled. One of these, the Scripts Scrapers, allows anyone to build a library of scripts from all over the Web. This month's issue of the newsletter, which will be a bit delayed, covers how we grabbed scripts used by search engines. Here is a sample from http://news.google.com.
Scripts Report
7 results found.
n | HTML |
1. | <script type=”text/javascript”> setupjsflags(); </script> |
2. | <script type=”text/javascript”> news_logerrors = true </script> |
3. | <script type=”text/javascript”> try { window[”jstiming”][”load”].tick(”bol”); } catch (e) { news_logerror(e, ”csi:bol”); } </script> |
4. | <script type=”text/javascript”>function setupjsflags() { news_flags = {}; news_flag_xhrpathprefix = 0; news_flags[news_flag_xhrpathprefix] = ”/news/xhr”; news_flag_usejsimagefetchtracking = 1; news_flags[news_flag_usejsimagefetchtracking] = false; news_flag_enableemail = 2; news_flags[news_flag_enableemail] = true; news_flag_experiments = 3; news_flags[news_flag_experiments] = ””; news_flag_pingcsi = 4; news_flags[news_flag_pingcsi] = true; news_flag_prefetchcitylist = 5; news_flags[news_flag_prefetchcitylist] = false; news_flag_maxcreatepagetitlelength = 7; news_flags[news_flag_maxcreatepagetitlelength] = 25; news_flag_enablestarring = 8; news_flags[news_flag_enablestarring] = true; news_flag_enable_create_page_suggestions = 9; news_flags[news_flag_enable_create_page_suggestions] = true; news_flag_enable_js_debug = 10; news_flags[news_flag_enable_js_debug] = false } function news_logerror(e, extramessage) { var url = ”/news/xhr/log_error?ned=” + ”us” + ”&error=” + encodeuricomponent(e.name + ”: ” + e.message) + ”&useragent=” + encodeuricomponent(navigator.useragent) + ”&url=” + encodeuricomponent(window.location) + ”&experiments=” + encodeuricomponent(””) + ”&stack=” + encodeuricomponent(e.stack) + ”&errorlocation=” + encodeuricomponent(extramessage); new image().src = url; } function grabjsbundle(jsurl) { var scriptel = document.createelement(”script”); scriptel.src = jsurl; scriptel.onerror = function() { if (window[’news_beforeonloadfired’]) { return; } news_logerror(new error(”deferred js error”), ”error in download of deferred js: ” + jsurl); }; var head = document.getelementsbytagname(’head’)[0]; head.appendchild(scriptel); }</script> |
5. | <script type=”text/javascript”>var a=window,b=”substring”;if(a.location.hash==”#changed”){var c=a.location.href;c=c.substr(0,c.indexof(”#”));var d=[];if(c.indexof(”?”)>-1)for(var e=c[b](c.indexof(”?”)+1).split(”&”),f=0;f<e.length;f++)e[f][b](0,3)!=”zx=”&&e[f][b](0,3)!=”pz=”&&e[f][b](0,5)!=”shid=”&&d.push(e[f]);d.push(”pz=1”);d.push(”zx=”+math.random());a.location=a.location.pathname+”?”+d.join(”&”)}; </script> |
6. | <script type=”text/javascript”>var global_window=window;function timer(b){this.t={};this.tick=function(c,d,a){a=a?a:(new date).gettime();this.t[c]=[a,d]};this.tick(”start”,null,b)}var loadtimer=new timer;global_window.jstiming={timer:timer,load:loadtimer};try{global_window.jstiming.pt=global_window.gtbexternal&&global_window.gtbexternal.paget()||global_window.external&&global_window.external.paget}catch(e){}; </script> |
7. | <script type=”text/javascript”>window.gbar={};(function(){function g(a,b,c){var d=”on”+b;if(a.addeventlistener)a.addeventlistener(b,c,false);else if(a.attachevent)a.attachevent(d,c);else{var h=a[d];a[d]=function(){var f=h.apply(this,arguments),e=c.apply(this,arguments);return f==undefined?e:e==undefined?f:e&&f}}};var i=window.gbar,k,l;function m(a){var b=window.encodeuricomponent&&(document.forms[0].q||””).value;if(b)a.href=a.href.replace(/([?&])q=[^&]*|$/,function(c,d){return(d||”&”)+”q=”+encodeuricomponent(b)})}i.qs=m;function n(a,b,c,d,h,f){var e=document.getelementbyid(a),j=e.style;if(e){j.left=d?”auto”:b+”px”;j.right=d?b+”px”:”auto”;j.top=c+”px”;j.visibility=l?”hidden”:”visible”;if(h&&f){j.width=h+”px”;j.height=f+”px”}else{n(k,b,c,d,e.offsetwidth,e.offsetheight);l=l?””:a}}}i.tg=function(a){a=a||window.event;var b=a.target||a.srcelement;a.cancelbubble=true;if(k!=null)o(b);else{a=document.createelement(array.every||window.createpopup?”iframe”:”div”);a.frameborder=”0”;a.src=”javascript:’’”;k=b.parentnode.appendchild(a).id=”gbs”;g(document,”click”,i.close);o(b);i.alld&&i.alld(function(){var c=document.getelementbyid(”gbli”);if(c){var d=c.parentnode;d.removechild(c);p(d)}})}};function q(a){var b,c=document.defaultview;if(c&&c.getcomputedstyle){if(a=c.getcomputedstyle(a,””))b=a.direction}else b=a.currentstyle?a.currentstyle.direction:a.style.direction;return b==”rtl”}function o(a){var b=0;if(a.classname!=”gb3”)a=a.parentnode;var c=a.getattribute(”aria-owns”)||”gbi”,d=a.offsetwidth,h=a.offsettop>20?46:24,f=false;do b+=a.offsetleft||0;while(a=a.offsetparent);a=(document.documentelement.clientwidth||document.body.clientwidth)-b-d;d=q(document.body);if(c==”gbi”){var e=document.getelementbyid(”gbi”);i.alli&&i.alli(e);p(e);if(d){b=a;f=true}}else if(!d){b=a;f=true}l!=c&&i.close();n(c,b,h,f)}i.close=function(){l&&n(l,0,0)};function r(a,b){var c=a.firstchild?a.firstchild.classname:”gb2”;a.insertbefore(b,a.firstchild).classname=c}function p(a){for(var 
b,c=window.navextra;c&&(b=c.pop());)r(a,b)}})();</script> |
Hi E. Garcia,
I recommend you check out Scrapy (http://scrapy.org), an open source framework for web scraping. It is written entirely in Python and powered by Twisted (http://twistedmatrix.com) and other open source libraries.
Actually, Scrapy is growing very fast and seems to be used in the project data.gov.uk (Check this tweet http://twitter.com/bfirsh/status/8025368963).
I think you can find a lot of useful stuff in Scrapy.
Kind regards,
Andres
Hi, Andres:
Thank you for stopping by and for the tip. We have heard about Scrapy before.
I think I’ll stick to the Web Mining Studio platform, which does all that Scrapy does, plus term weight analysis and a few other IR scoring tasks. With the upcoming integration of the Fractal CSS Design Studio into the WMS, it will do very fancy self-similar Web design stuff from extracted data. We are putting together a white paper on the FCDS, and it will be available soon.