web crawler |
Ketan Jetty
enthusiasm for technology
|
||
tags
functions
scopes
|
|||
|
Web crawler: how to use the web crawler
How to use the web crawler / spider in your code
<!--- Example: crawl a site starting from its root URL and dump every link found. --->
<cfset domainURL = "http://ketanjetty.com" />
<!--- init() records the domain that discovered links must contain. --->
<cfset siteSpider = CreateObject("component", "spider").init(domainURL) />
<!--- spider() returns a struct keyed by URL. --->
<cfset spideredLinks = siteSpider.spider(domainURL) />
<cfdump var="#spideredLinks#">
spider.cfc :: web crawler component
<cfcomponent>
<cfset variables.domain = "" />
<cfset variables.allLinks = StructNew() />
<!---
    Initialises the spider for one domain.
    domain: URL text that a link must contain to be followed; it is also
            registered as the first (not yet processed) entry in the link store.
    Returns this component instance so the call can be chained after CreateObject().
--->
<cffunction name="init" access="public" returntype="any">
    <cfargument name="domain" required="true" type="string">
    <cfset variables.domain = arguments.domain />
    <!---
        Start from an empty link store on every call. The previous StructInsert()
        without allowoverwrite raised an error when init() was invoked a second
        time with the same domain on the same instance.
    --->
    <cfset variables.allLinks = StructNew() />
    <cfset variables.allLinks[variables.domain] = "" />
    <cfreturn this />
</cffunction>
<!---
    Fetches a page over HTTP GET and returns its body.
    pageURL: address to fetch; surrounding whitespace is trimmed first.
    Returns the response body converted to lower case, so every link later
    extracted from it is lower case as well.
--->
<cffunction name="getPageContent" access="private" returntype="string">
    <cfargument name="pageURL" required="true" type="string">
    <cfset var retVal = "" />
    <!--- Keep the response in a function-local variable instead of the shared, unscoped "cfhttp" variable. --->
    <cfset var httpResult = "" />
    <cfhttp url="#trim(arguments.pageURL)#" method="get" result="httpResult"></cfhttp>
    <cfset retVal = LCase(httpResult.fileContent) />
    <cfreturn retVal />
</cffunction>
<!---
    Splits page content on every occurrence of href= .
    pageContent: the page text to split.
    Returns the pieces as an array; each piece after the first begins with the
    text that followed an href= attribute name.
--->
<cffunction name="getPageLinksArray" access="private" returntype="array">
    <cfargument name="pageContent" required="true" type="string">
    <!--- presumably the underlying Java String.split(), which treats its argument as a regular expression - confirm --->
    <cfreturn arguments.pageContent.split("href=") />
</cffunction>
<!---
    Turns the raw pieces produced by splitting a page on href= into a set of
    crawlable links.
    hrefArray: array of text fragments; only fragments containing ">" are examined.
    Returns a struct whose keys are the cleaned links (values are empty strings).
    A link is kept only when it contains none of the fragments in filesToAvoid
    and does contain variables.domain.
    Any error raised while processing is dumped and the links collected so far
    are returned.
--->
<cffunction name="getProcessedLinks" access="private" returntype="struct">
    <cfargument name="hrefArray" required="true" type="array">
    <cfset var temp = "" />
    <!--- Loop variables and the flag are var-scoped so they no longer leak into the component's variables scope. --->
    <cfset var i = 0 />
    <cfset var key = "" />
    <cfset var keepLink = true />
    <cfset var pageLinksStruct = StructNew() />
    <!--- Parsed once up front instead of on every loop iteration. --->
    <cfset var filesToAvoid = DeSerializeJson('{ "!":"", ".css":"", ".js":"", ".jpeg":"", ".jpg":"", ".bmp":"", ".gif":"", ".png":"", ".cfc":"", ".swf":"", ".pdf":"", ".ico":"", ".xml":"", ".xls":"", ".doc":"", ".exe":"", ".jar":"", ".tar":"", ".mp3":"" }') />
    <cftry>
        <cfloop from="1" to="#ArrayLen(arguments.hrefArray)#" index="i">
            <cfif Find(">",arguments.hrefArray[i])>
                <!--- Take everything up to the first ">" and then cut at the first space, leaving just the attribute value. --->
                <cfset temp = Left(arguments.hrefArray[i],Find(">",arguments.hrefArray[i])) & " " />
                <cfset temp = Left(temp,Find(" ",temp)) />
                <!--- Strip tag delimiters and both kinds of quotes. --->
                <cfset temp = Replace(temp,">"," ","all") />
                <cfset temp = Replace(temp,"<","","all") />
                <cfset temp = Replace(temp,"'","","all") />
                <cfset temp = Replace(temp,'"','','all') />
                <cfset temp = trim(removeSlash(trim(temp))) />
                <!--- Reject the link as soon as one excluded fragment is found. --->
                <cfset keepLink = true />
                <cfloop collection="#filesToAvoid#" item="key">
                    <cfif FindNoCase(key,temp)>
                        <cfset keepLink = false />
                        <cfbreak />
                    </cfif>
                </cfloop>
                <cfif keepLink AND FindNoCase(variables.domain, temp)>
                    <cfif NOT StructKeyExists(pageLinksStruct, temp)>
                        <cfset StructInsert(pageLinksStruct, temp, "" )>
                    </cfif>
                </cfif>
            </cfif>
        </cfloop>
        <cfcatch type="any">
            <cfdump var="#cfcatch#">
        </cfcatch>
    </cftry>
    <cfreturn pageLinksStruct />
</cffunction>
<!---
    Removes a single trailing "/" from a URL.
    pageURL: the URL text to normalise.
    Returns the text without its final slash; text that does not end in "/"
    is returned unchanged, and a lone "/" becomes an empty string.
--->
<cffunction name="removeSlash" access="private" returntype="string">
    <cfargument name="pageURL" required="true" type="string">
    <cfset var page = arguments.pageURL />
    <cfif Len(page) GT 1 AND Right(page,1) EQ "/">
        <cfset page = Left(page, Len(page) - 1) />
    <cfelseif page EQ "/">
        <!--- Left(page, 0) raises an error, so the single-character case is handled explicitly. --->
        <cfset page = "" />
    </cfif>
    <cfreturn page />
</cffunction>
<!---
    Merges newly discovered links into the component-wide link store.
    pageLinks: struct whose keys are the links to add.
    Links already present in variables.allLinks are left as they are, so their
    current status value is preserved; new links are stored with an empty value.
--->
<cffunction name="addLinks" access="private" returntype="void">
    <cfargument name="pageLinks" required="true" type="struct">
    <cfset var linkUrl = "" />
    <cfloop collection="#arguments.pageLinks#" item="linkUrl">
        <cfif StructKeyExists(variables.allLinks, linkUrl) EQ false>
            <cfset variables.allLinks[linkUrl] = "" />
        </cfif>
    </cfloop>
</cffunction>
<!---
    Crawls the site starting at the given page and keeps going until every
    link in variables.allLinks has been marked "processed".
    _page: URL of the first page to crawl.
    Returns variables.allLinks, a struct keyed by URL.

    The crawl is an iterative loop rather than one recursive call per page, so
    stack depth no longer grows with the number of pages. The stored key is
    marked as processed as well as its slash-stripped form; previously a key
    ending in "/" never got marked and was selected again on every pass.
--->
<cffunction name="spider" access="public" output="true" returntype="any">
    <cfargument name="_page" required="true" type="string">
    <cfset var pending = arguments._page />
    <cfset var page = "" />
    <cfset var pageLinks = "" />
    <cfset var key = "" />
    <cfset var morePages = true />
    <cfloop condition="morePages">
        <cfset page = removeSlash(pending) />
        <cfset pageLinks = getProcessedLinks(getPageLinksArray(getPageContent(page))) />
        <cfset addLinks(pageLinks) />
        <cfset variables.allLinks[page] = "processed" />
        <!--- Also mark the key exactly as it was stored, in case it differs from the slash-stripped form. --->
        <cfif StructKeyExists(variables.allLinks, pending)>
            <cfset variables.allLinks[pending] = "processed" />
        </cfif>
        <!--- Pick the next link that has not been crawled yet, if any. --->
        <cfset morePages = false />
        <cfloop collection="#variables.allLinks#" item="key">
            <cfif variables.allLinks[key] NEQ "processed">
                <cfset pending = key />
                <cfset morePages = true />
                <cfbreak />
            </cfif>
        </cfloop>
    </cfloop>
    <cfreturn variables.allLinks>
</cffunction>
</cfcomponent>
Web crawler code in C#
using System;
using System.Text;
using System.Net;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
namespace KJSpider
{
class Program
{
static string websitename = string.Empty;
static ArrayList alUrls = new ArrayList();
static ArrayList alTempUrls = new ArrayList();
static string timeStart = string.Empty;
static string timeEnd = string.Empty;
// Entry point. Usage: KJSpider <websiteurl> [iterations]
//   args[0] - URL of the site to crawl (required)
//   args[1] - number of crawl passes (optional, defaults to 3)
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        Console.WriteLine("no arguments passed");
        return;
    }

    string websiteurl = args[0];
    int iterations = 3;

    // A non-numeric second argument used to abort the program with an
    // unhandled FormatException; fall back to the default instead.
    if (args.Length > 1 && !int.TryParse(args[1], out iterations))
    {
        Console.WriteLine("invalid iterations value '" + args[1] + "', using 3");
        iterations = 3;
    }

    Console.WriteLine("websiteurl " + websiteurl);
    Console.Write(" | iterations " + iterations);
    Console.WriteLine("");
    startSpider(websiteurl, iterations);
}
// Crawls the given site using the default number of passes.
public static void startSpider(string url)
{
    const int defaultIterations = 3;
    startSpider(url, defaultIterations);
}
// Crawls a site for a fixed number of passes and writes the result to the log.
//   url        - address of the first page; also used to derive the site name
//                that a link must contain to be followed
//   iterations - number of passes; each pass visits every URL known at the
//                start of that pass
public static void startSpider(string url, int iterations)
{
    // Derive the bare site name. Only a leading scheme and a leading "www."
    // are removed (the old Replace calls removed "www" anywhere in the URL and
    // left a leading dot behind), https is handled, and a trailing slash is
    // dropped so links without one still match.
    string site = url.Trim().ToLower();
    if (site.StartsWith("http://", StringComparison.Ordinal))
    {
        site = site.Substring("http://".Length);
    }
    else if (site.StartsWith("https://", StringComparison.Ordinal))
    {
        site = site.Substring("https://".Length);
    }
    if (site.StartsWith("www.", StringComparison.Ordinal))
    {
        site = site.Substring("www.".Length);
    }
    websitename = site.TrimEnd('/');

    // add the url to the repository
    alUrls.Add(url);

    // process spidering
    timeStart = DateTime.Now.ToString();
    for (int i = 0; i < iterations; i++)
    {
        Console.WriteLine("starting iteration: " + i);
        Console.WriteLine("");

        // Work from a snapshot of the central repository: spidering appends
        // to alUrls, which must not happen while it is being enumerated.
        alTempUrls = (ArrayList)alUrls.Clone();
        spiderRecursive();
        Console.WriteLine("");
    }
    timeEnd = DateTime.Now.ToString();

    // write the results to the log
    writeToLog();
}
// Runs one crawl pass: visits every URL held in the temporary snapshot.
static void spiderRecursive()
{
    foreach (object entry in alTempUrls)
    {
        string candidate = (string)entry;
        string trimmed = candidate.Trim();
        spiderURL(trimmed);
    }
}
// Fragments that mark a link as something other than a crawlable page.
static readonly string[] skippedLinkFragments =
{
    ".pdf", ".jpg", ".gif", ".png", ".swf", ".js", ".css", ".xml", ".xls", ".doc"
};

// Downloads one page and adds every new same-site link it contains to alUrls.
//   strUrl - the URL to fetch; entries already carrying the "*spidered::"
//            marker are skipped.
// The URL's entry in alUrls is marked as spidered before the request is made.
// A failed request or read is reported on the console and the page is skipped.
static void spiderURL(string strUrl)
{
    const string spideredMarker = "*spidered::";

    if (strUrl.ToLower().Contains(spideredMarker))
    {
        return;
    }

    int knownIndex = alUrls.IndexOf(strUrl);
    if (knownIndex > -1)
    {
        alUrls[knownIndex] = spideredMarker + alUrls[knownIndex].ToString();
    }

    Console.WriteLine("Spidering... " + strUrl);

    // Fetch the page once. The earlier do/while retried for as long as the URL
    // variable was non-null, which never ended when WebRequest.Create threw
    // before the variable was cleared. The using blocks release the response,
    // stream and reader on failure too.
    string pageData;
    try
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(strUrl);
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        using (Stream istrm = resp.GetResponseStream())
        using (StreamReader rdr = new StreamReader(istrm))
        {
            // Read in the entire page.
            pageData = rdr.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error: req: " + ex.Message);
        return;
    }

    int curloc = 0; // current search position within pageData
    string link;
    while ((link = FindLink(pageData, ref curloc)) != null)
    {
        link = link.Trim();

        bool skipped = false;
        foreach (string fragment in skippedLinkFragments)
        {
            if (link.IndexOf(fragment, StringComparison.OrdinalIgnoreCase) >= 0)
            {
                skipped = true;
                break;
            }
        }
        if (skipped)
        {
            continue;
        }

        // websitename is stored lower-cased, so compare without regard to case.
        if (link.IndexOf(websitename, StringComparison.OrdinalIgnoreCase) < 0)
        {
            continue;
        }

        // A link is already known if it is stored with or without a trailing
        // slash, in either its pending or its spidered form.
        bool alreadyKnown =
            alUrls.Contains(link) ||
            alUrls.Contains(link + "/") ||
            alUrls.Contains(spideredMarker + link) ||
            alUrls.Contains(spideredMarker + link + "/");

        if (!alreadyKnown)
        {
            // add links to the arraylist repository
            alUrls.Add(link);
        }
    }
}
// Find a link in a content string.
//   htmlstr  - the page content to search
//   startloc - position to start searching from; on success it is advanced to
//              the closing quote of the link that was found
// Returns the text between the quotes of the next href="http... attribute, or
// null when there is no further link or the attribute has no closing quote.
static string FindLink(string htmlstr, ref int startloc)
{
    // todo :: use href for robust spidering
    // Case-insensitive search on the original text: avoids lower-casing the
    // whole page on every call and keeps the indexes tied to htmlstr itself.
    int i = htmlstr.IndexOf("href=\"http", startloc, StringComparison.OrdinalIgnoreCase);
    if (i == -1)
    {
        return null;
    }

    int start = htmlstr.IndexOf('"', i) + 1;
    int end = htmlstr.IndexOf('"', start);
    if (end == -1)
    {
        // Unterminated attribute: previously Substring was called with a
        // negative length and threw ArgumentOutOfRangeException.
        return null;
    }

    startloc = end;
    return htmlstr.Substring(start, end - start);
}
// write to a log
// Sorts the collected URLs and writes them to c:\KJSpiderReport.txt, preceded
// by the crawl start time and followed by the end time. A failure to write is
// reported on the console rather than thrown.
static void writeToLog()
{
    alUrls.Sort();
    try
    {
        // using ensures the file is flushed and closed even if a write fails
        using (TextWriter tws = new StreamWriter("c:\\KJSpiderReport.txt"))
        {
            tws.WriteLine(timeStart);
            foreach (string str2 in alUrls)
            {
                tws.WriteLine(str2);
            }
            tws.WriteLine(timeEnd);
        }
    }
    catch (Exception ex)
    {
        // WriteLine so the error does not run into the completion message
        Console.WriteLine("Error: log: " + ex.Message);
    }
    Console.WriteLine("Spidering complete.");
}
}
}
|
|
||
| Ketan Jetty @ 2010. All Rights Reserved. | |||