Changes

PhD Masterclass - How to Build a Web Crawler (view source)

Revision as of 19:54, 24 February 2011

4,455 bytes added , 19:54, 24 February 2011

no edit summary

my $incidentelement=$tree->look_down("id","primecontent");

#Or use HTML::Element methods to look_down the tree for a tag with some properties

==An Example Webcrawler==

I wrote the following simple webcrawler for a fellow PhD student:

#!/usr/bin/perl -w

use strict;

use LWP::UserAgent;

#Use the LWP::UserAgent modules

use HTML::TreeBuilder;

#Use the HTML::TreeBuilder modules

my $ua = LWP::UserAgent->new;

#Create a new UserAgent

my @Pkids;

open (PKIDS,"Pkidfile.txt") || die "Can't open the PKID file to read $!";

#Open the Pkid file to read - this file has a Pkid on each line. You can get some from here: http://myaccount.sdar.com/RealtorSrch.asp

while (<PKIDS>) {

#Read the pkid file line by line

chomp $_;

#Remove the \n (newline symbol) from each line

push(@Pkids,$_);

#Add the PKID to an array

}

open (RESULTS,">Results.txt") || die "Can't write the Results.txt file $!";

#Open the Results file to write

my $headerflag=0;

#Set a flag to indicate whether we wrote the header line to the output file

foreach my $Pkid (sort (@Pkids)) {

#Go through the PKIDs in order

my $url="http://myaccount.sdar.com/RealtorSrchDetail.asp?PKID=".$Pkid;

#Set up a string containing a URL

my $response = $ua->get($url);

#Use the UA 'get' method to retrieve the webpage. This returns an HTTP Response object

my $content=$response->decoded_content;

#Get the response as one long text string, so we can work with it...

my $tree = HTML::TreeBuilder->new; # empty tree

#Create a new tree object

$tree->parse($content);

#Load up the tree from the content string (that we got using UA)

my $name=$tree->look_down("width","520");

#Find an element in the HTML that has width=520 (this is where names are stored)

my $nametext=$name->as_text;

#Convert it to text

$nametext=~s/^\s{1,}//;

#Remove leading spaces

$nametext=~s/\s{1,}$//;

#Remove trailing spaces

$nametext=~s/^\s{2,}/ /g;

#Replace double spaces with a single space, globally

my @fieldstext;

#Declare an array

my @fields=$tree->look_down("class","field_labels");

#Find all of the field elements

foreach my $field (@fields) {

#Go through them

my $fieldparent=$field->parent;

#Go to their parent

my $fieldparenttext=$fieldparent->as_text;

#Turn the parent into text

$fieldparenttext=~s/^\s{1,}//; $fieldparenttext=~s/\s{1,}$//; $fieldparenttext=~s/^\s{2,}/ /g;

#Deal with spaces again

push @fieldstext,$fieldparenttext;

#Add the fields to a list

}

&writeoutput($Pkid,$nametext,@fieldstext);

#Call the write output subroutine

$content=undef; $tree=undef; $name=undef; undef @fields;

#Set a bunch of variables to undefined - this frees up memory

sleep(2);

#Pause for a second or two...

}

close (RESULTS);

#Close the Results filehandle - this flushes the write buffer

sub writeoutput {

#Declare the writeoutput subroutine

my $data={};

#Set up an anonymous hash

$data->{"A Pkid"}=shift @_;

#Set the A PKID field to the first parameter passed to the subroutine

$data->{"A Name"}=shift @_;

#Set the A PKID field to the second parameter passed to the subroutine (the first has now gone)

push(my @fields,@_);

#Add the remaining parameters to an array

foreach my $field (@fields) {

#Go through the array

my @fieldparts=split(":",$field);

#Split the fields on semicolon

my $key=shift(@fieldparts);

#Set the key

$data->{$key}=join(":",@fieldparts);

#Write the hash entry

}

if (!$headerflag) {

#If the headflag is 0 then do this

foreach my $key (sort {$a cmp $b} (keys %{$data})) {

#Go through the keys

print RESULTS $key."\t";

#Write the key followed by a tab

}

print RESULTS "\n";

#Print a newline

$headerflag=1;

#Set the headflag to 1

}

foreach my $key (sort {$a cmp $b} (keys %{$data})) {

#Go through the keys again

print RESULTS $data->{$key}."\t";

#This time print the data followed by tabs

}

print RESULTS "\n";

#print a newline

}

print "Thanks to Ed";

#Thank Ed.

Anonymous user

imported>Ed

Changes

PhD Masterclass - How to Build a Web Crawler (view source)

Revision as of 19:54, 24 February 2011

Navigation menu

Personal tools

Namespaces

Variants

Views

More

Search

Navigation

Sites

Sections

Organizations

Help

Tools