use strict;
use warnings;

use LWP::UserAgent;    # also loads HTTP::Request, which is used below
use URI::Heuristic;

# Parser.pl - Simply put, it takes a Dice web page that was done in brief
# mode and saved as dice.html in the same directory.  Every listing whose
# area code is on the watch list is downloaded to a local file, and a
# one-line, tab-separated summary of it is appended to summaryfile.tab.
#
# Since it's for my own personal use there isn't really a lot of fault
# tolerance, as I'll already know if there's a problem with Dice or my
# network.
#
# Usage: perl Parser.pl STATE

# Get 2-character STATE from the command line.  The value is only
# validated here; nothing else in this file reads it.
my $state = lc(defined $ARGV[0] ? $ARGV[0] : '');
if (length($state) != 2) {
    die "State '$state' MUST be 2-digits\n";
}

# Start MAIN program
get_acode_line_data();

# Scans dice.html line by line for job listings.  For each listing whose
# area code is on the watch list it prints the captured fields, downloads
# the linked page into a local file named after the last path component
# of the link, and hands that file to makereport().
# Takes no arguments and returns nothing; dies if dice.html or the
# download file cannot be opened.
sub get_acode_line_data {
    # Watch list of area codes, stored as a lookup set.
    my %codes = map { $_ => 1 }
        qw(714 949 909 760 213 323 310 626 818 562 858 619);

    # NOTE(review): this pattern is reproduced as found (the line break it
    # contained is spelled \n here).  It has only three capture groups,
    # yet the code consumes five fields (link, job, area code, city,
    # detail), and it requires a newline in the middle of the text while
    # being applied to one line at a time.  It looks as if the HTML
    # markup inside the pattern was lost at some point.  Restore it from
    # an unmangled copy of the script or from a real dice.html before
    # relying on the output.
    my $listing_re = qr/\S+\n\w+-(\w+)-(.*?)-(.*?)<\/font>/;

    $| = 1;    # unbuffered STDOUT so progress shows up during downloads

    open(my $dice, '<', 'dice.html')
        or die "Can't open dice.html: $!\n";

    while (my $line = <$dice>) {
        # Only use the captures when the match actually succeeded.
        my @fields = ($line =~ $listing_re);
        next unless @fields;

        # Groups that are missing or did not participate become ''.
        my ($link, $job, $areacode, $city, $detail) =
            map { defined $_ ? $_ : '' } @fields[0 .. 4];

        next unless $codes{$areacode};

        print join(',', $link, $job, $areacode, $city, $detail), "\n";

        my $html = _local_name_for($link);
        print $html, ',';

        _fetch_to_file($link, $html);
        makereport($html);
    }

    close($dice);
    return;
}

# Returns the local file name for a listing link: the last '/'-separated
# component of whatever follows ".com".  When the link has no ".com" the
# whole link is used instead of slicing at a bogus offset.  Returns ''
# when no component is left (the caller's open then fails and dies).
sub _local_name_for {
    my ($link) = @_;

    my $com_at = index($link, '.com');
    my $path   = $com_at >= 0 ? substr($link, $com_at + 4) : $link;
    my @parts  = split(/\//, $path);

    return @parts ? $parts[-1] : '';
}

# Downloads $link and stores the response body, plus a trailing newline,
# in $file.  On an HTTP error the status line is printed and $file is
# left empty.  Dies if $file cannot be opened or closed.
sub _fetch_to_file {
    my ($link, $file) = @_;

    # Three-argument open: $file comes from remote HTML, so it must never
    # be interpreted as part of the open mode.
    open(my $out, '>', $file) or die "1-Can't open $file: $!\n";

    my $url = URI::Heuristic::uf_urlstr($link);
    printf "%s =>\n\t", $url;

    my $ua = LWP::UserAgent->new;
    $ua->agent("Netscape/v5.0c Navigator");

    my $req = HTTP::Request->new(GET => $url);
    # NOTE(review): this referer is not a well-formed URL ('' where //
    # would be expected).  Kept byte-for-byte; confirm whether intended.
    $req->referer("http:''dde.cia.gov");

    my $response = $ua->request($req);
    if ($response->is_error()) {
        printf " %s\n", $response->status_line;
    }
    else {
        my $content = $response->content();
        print {$out} "$content\n";
    }

    close($out) or die "1-Can't close $file: $!\n";
    return;
}

# Appends one record for the downloaded page $outfile to summaryfile.tab.
# Every line of the page is tested against each label below, in order;
# each label found emits the extracted value followed by its terminator.
# All fields end in a tab except "Tel:", which ends the record with a
# newline.  Dies if either file cannot be opened.
sub makereport {
    my ($outfile) = @_;
    my $summaryfile = "summaryfile.tab";

    # [ label searched for, extractor, text written after the value ]
    my @rules = (
        [ 'Title:',       \&bold,    "\t" ],
        [ 'Skills:',      \&top,     "\t" ],
        [ 'Date Posted:', \&top,     "\t" ],
        [ 'Location:',    \&top,     "\t" ],
        [ 'Area code:',   \&top,     "\t" ],
        [ 'Tax Term:',    \&left,    "\t" ],
        [ 'Pay:',         \&top,     "\t" ],
        [ 'Length:',      \&top,     "\t" ],
        [ 'Email:',       \&weblink, "\t" ],
        [ 'Web:',         \&weblink, "\t" ],
        [ 'Position ID:', \&top,     "\t" ],
        [ 'Tel:',         \&tel,     "\n" ],
    );

    open(my $summary, '>>', $summaryfile)
        or die "2-Can't open $summaryfile: $!\n";
    open(my $page, '<', $outfile)
        or die "2-Can't open $outfile: $!\n";

    while (my $line = <$page>) {
        # The rules are independent: one line may satisfy several.
        for my $rule (@rules) {
            my ($label, $extract, $terminator) = @{$rule};
            next if index($line, $label) < 0;
            print {$summary} $extract->($line, $label), $terminator;
        }
    }

    close($page);
    close($summary) or die "2-Can't close $summaryfile: $!\n";
    return;
}

# Returns the part of $text that precedes the first occurrence of $stop.
# When $stop does not occur, returns $text without its trailing newline.
sub _up_to {
    my ($text, $stop) = @_;

    my $stop_at = index($text, $stop);
    return substr($text, 0, $stop_at) if $stop_at >= 0;

    chomp $text;
    return $text;
}

# Shared worker for bold(), left() and top(): finds $label in $line, then
# the first $marker at or after it, and returns the text between the end
# of that marker and the next '<'.  The marker is matched without regard
# to case.  Returns '' when the label or the marker is missing.
sub _text_after {
    my ($line, $label, $marker) = @_;

    my $label_at = index($line, $label);
    return '' if $label_at < 0;

    my $marker_at = index(lc $line, lc $marker, $label_at);
    return '' if $marker_at < 0;

    return _up_to(substr($line, $marker_at + length $marker), '<');
}

# Returns the text that follows the first "<b>" after $string in $line,
# up to the next '<'.
# NOTE(review): the marker was an empty string in the copy this was
# restored from; "<b>" is presumed from the sub name and the +3 offset
# the old code added - confirm against a real listing page.
sub bold {
    my ($line, $string) = @_;
    return _text_after($line, $string, '<b>');
}

# Returns the text that follows the first "left>" after $string in
# $line, up to the next '<'.
sub left {
    my ($line, $string) = @_;
    return _text_after($line, $string, 'left>');
}

# Returns the text between the first '?' in $line and the next '&'.
# Returns '' when $line has no '?'.
sub weblink {
    my ($line) = @_;

    my $query_at = index($line, '?');
    return '' if $query_at < 0;

    return _up_to(substr($line, $query_at + 1), '&');
}

# Returns the rest of $line starting one character past "Tel:", without
# the trailing newline.  Returns '' when "Tel:" is missing or nothing
# follows it.  Any further arguments are ignored.
sub tel {
    my ($line) = @_;

    my $label_at = index($line, 'Tel:');
    return '' if $label_at < 0;

    my $start = $label_at + 5;    # length of "Tel:" plus one separator
    return '' if $start > length $line;

    my $number = substr($line, $start);
    chomp $number;
    return $number;
}

# Returns the text that follows the first "top>" after $string in $line,
# up to the next '<'.
sub top {
    my ($line, $string) = @_;
    return _text_after($line, $string, 'top>');
}