# WEBSITE PARSING PROGRAM
#
# NOTE: ALL CONTENT ON CREATED WEBPAGE IS OWNED BY THE CORRESPONDING
#       WEBSITES.  CLICK ON LINK TO SEE ORIGINAL CONTENT
#
#Created by : Justin Bischoff
#
#Latest Revision : 4/6/03
#  -Bug Fixes and more comics
#
#DO NOT USE OR DISTRIBUTE THIS CODE
#
#There is no warranty, explicit or implied, on this code.
#
#Copyright 2003 Justin Bischoff
#
#Send me ideas for changes or improvements
#
#webpage content is owned by corresponding webpage owner,
# see individual pages for details.
#
#publicly providing the content of the generated web page
# without permission of the content owners could be
# breaking the law
#############################################
# Description
#
# This is a perl script that uses the LWP module to
# grab specific webpages off of the internet.
#
# The second half of the script goes through the grabbed
# webpages and turns them into a summary page
#############################################
# Usage
#  - use perl to run.  C:\perl l33t_grabber.pl
#
# Command Line Parameters - examples (in any order)
#  "force 3"            - forces up to 3 attempts at a page (clamped to 1..5)
#  "proxy my.proxy:69"  - uses my.proxy at port 69
#  "timeout 12"         - sets the timeout to 12 seconds
#  "comicsoff"          - only news sites grabbed - much faster
#  "logfile"            - also write progress messages to logfile_<date>.txt
#
# ie. "C:\perl l33t_grabber.pl proxy proxy.fm.intel.com:911 force 2 comicsoff"
#   this line will use folsom's proxy, make two attempts at a webpage and it
#   has the comics turned off.

#I hate use strict, if you don't initialize your vars, then you're dumb.
# (NOTE: the rest of the script relies on undeclared package globals, so
#  strict is deliberately left off here.)
use LWP::UserAgent;
use POSIX qw/strftime/;

###############################
#things to add:
# - Save a text file w/date of source html, only look through web if file dne
# - improve and add to current events page
# - handle more failure cases, or handle them better.
# - Create RSS compliant generic parser.
#
# - automatically spawn the webpage after the script is run. (Command line parameter?)
# - more stock?, e2, circuit?
# - Current Movie Listings or Showtimes?
# - Current Events - concerts, shows, plays, lectures, etc.
#
# - Horoscope? (theOnion?)

###########################################################
#USER VARIABLES - change these per personal preference
#
#for news sites, this is the number of headlines per
# category to display.  this only kinda works,
$Number_of_headlines = 3;

#What is the html result title going to be?
$Page_title = "Nerd's Web Summary";

#What is the filename going to be?
$Output_file = "nerd_summary.html";

#width of news columns must be forced
$column_width = 33;

#name and port of your proxy (can use command line instead)
#my $my_proxy = 'http://proxy.sc.intel.com:911';
my $my_proxy = 'http://proxy.fm.intel.com:911';

#timeout before we quit trying to load a page.
$timeout = 13;   #in LWP timeouts don't really work

#By default generate logfiles?
$logfiles = 0;

###########################################################
#ONLY 1337 P3rl Ninjas venture below this line!

# Take ONE snapshot of the local time and derive everything from it, so the
# date string and the $hour/$min used later in the page title cannot
# disagree if the script happens to start right around midnight.
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);

#get time in "Month-Day-Year"
$now = strftime( "%B-%d-%Y", $sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst );
#print "$mon-$mday-$year : $isdst\n";

# only referenced by the (currently commented-out) summer Kirkwood section
my $daysleft = 318 - $yday;

###########################################################
# process user arguments
#
# @ARGV is walked by index ($this_param).  Options that take a value
# ("force", "timeout", "proxy") consume the following argument when it is
# valid, so the value is never re-interpreted as an option name.  (The old
# foreach loop looked at every argument, which made the documented example
# "proxy proxy.fm.intel.com:911 force 2" report a bogus proxy error because
# the proxy host name itself starts with "proxy".)
#
# As before, "logfile", "force" and "timeout" are recognised anywhere inside
# an argument, while "proxy" and "comicsoff" must be at its start.  Each
# argument is treated as exactly one option.

#DO NOT change these default values.
$this_param = 0;
$comics_on  = 1;
$force      = 1;
$proxy      = "none";

while ($this_param < @ARGV) {
    my $cl_arg   = $ARGV[$this_param];
    my $next_arg = $ARGV[$this_param + 1];
    $next_arg = "" unless defined $next_arg;   # option given as last argument
    my $took_value = 0;                        # 1 when $next_arg was consumed

    #logfile - enables logfiling daily - creates lots of files.
    if ($cl_arg =~ /logfile/) {
        $logfiles = 1;
        # LOGFILE stays a bareword handle: code outside this section is
        # presumably writing to it by that name.
        open(LOGFILE, '>', "logfile_$now.txt") || die "Couldn't open logfile_$now.txt\n" . $!;
        &print_error( "HTML PARSER LOGFILE\n\nDate: $now\n\n");
    }

    #force - forces a read; must be followed by a number, "force 2" will
    #        try 2 times on each website.  Values are clamped to 1..5.
    elsif ($cl_arg =~ /force/) {
        # whole-string match: "my.proxy:69" or "3abc" are not numbers.
        # A leading minus is still accepted so it reaches the "<= 0" scolding.
        if ($next_arg =~ /^-?\d+$/) {
            $force      = $next_arg;
            $took_value = 1;
            if ($force > 5) {
                &print_error( "FOOL! Do you want the script to run forever?\n");
                &print_error( "I AM COMPUTER, SMARTER THAN THOU, USE A LOWER FORCE VALUE!\n");
                $force = 5;
            }
            elsif ($force <= 0) {
                &print_error( "WHAT ARE YOU TRYING TO DO?\nBAD USER, BAD!\n");
                $force = 1;
            }
            &print_error( "Using a force of $force cycles\n");
        }
        else {
            $force = 1;
            &print_error( "ERROR PARSING COMMAND LINE PARAMETERS!\n");
            &print_error( "\"force\" must be followed by a number.\nForce not used\n");
        }
    }

    #timeout - sets timeout in number of seconds.  Include a number after the
    #          word timeout, ie "timeout 10"
    elsif ($cl_arg =~ /timeout/) {
        if ($next_arg =~ /^\d+$/) {
            $timeout    = $next_arg;
            $took_value = 1;
            &print_error( "Using a timeout of $timeout\n");
        }
        else {
            &print_error( "ERROR PARSING COMMAND LINE PARAMETERS!\n");
            &print_error( "\"timeout\" must be followed by a number.\nTimeout of $timeout sec. used\n");
        }
    }

    #proxy - defines your proxy.  It must be followed by a name colon
    #        and port number "proxy.fm.intel.com:911"
    #        current default is Intel's.
    elsif ($cl_arg =~ /^proxy/) {
        if ($next_arg =~ /\w.*?\:\d+/) {
            # $proxy is only updated for a usable value; it keeps its
            # "none" default when the argument is missing or malformed.
            $proxy      = $next_arg;
            $took_value = 1;
            # add the scheme only when the value does not already start with it
            if ($proxy =~ m{^http://}) {
                $my_proxy = $proxy;
            }
            else {
                $my_proxy = "http://" . $proxy;
            }
        }
        else {
            &print_error( "ERROR PARSING COMMAND LINE PARAMETERS!\n");
            &print_error( "\"proxy\" must be followed by a valid proxy.\nProxy $my_proxy used\n");
        }
    }

    #comicsoff - script will grab only news pages, saves you time for the script
    #            running, and time for the webpage to load up.
    elsif ($cl_arg =~ /^comicsoff/) {
        $comics_on = 0;
        &print_error( "COMICS WILL BE OFF!\n");
    }

    # step past the option and, when it was used, its value
    $this_param += 1 + $took_value;
}
################ FINISHED PROCESSING COMMAND LINE PARAMETERS #####################

#can try Yahoo RSS http://rss.news.yahoo.com/rss/topstories
#Reuters Photos match "class="photoLink"" at http://www.reuters.com/newsPhotoGallery.jhtml?type=topNews
#http://www.kuro5hin.org/backend.rdf

#URLs to get - dont change these unless target url changes
# The index of each entry is what the parsers further down use to find
# their page in @all_data, so do not renumber.
$url[0]="http://news.google.com/news/gnmainlite.html";
#$url[0]="http://news.google.com/";
$url[1]="http://slashdot.org/slashdot.xml";
#http://www.theinquirer.net/inquirer.rss
$url[2]="http://www.theinquirer.net/index.html";
if($comics_on){
    $url[3]="http://www.dilbert.com/comics/dilbert/index.html";
    $url[4]="http://www.penny-arcade.com/view.php3";
    $url[5]="http://pvponline.com/";
    $url[6]="http://www.dieselsweeties.com/";
    $url[21]="http://www.megatokyo.com/";
    $url[20]="http://www.little-gamers.com/";
    $url[22]="http://www.ctrlaltdel-online.com/index.php?t=archives&date=last";
}
$url[7]="http://finance.yahoo.com/q?s=intc&d=v1";
#$url[8]="http://my.yahoo.com/?myHome";
$url[8]="http://www.sacbee.com/content/news/";
$url[9]="http://www.kirkwood.com/conditions.asp";
$url[10]="http://www.aceshardware.com/";
# (an earlier assignment of "http://www.cnn.com/" to slot 11 was immediately
#  overwritten by the line below, so it is kept only as this note)
$url[11]="http://edition.cnn.com/";
$url[12]="http://dictionary.reference.com/wordoftheday/";
$url[13]="http://www.qotd.org/";
#UPCOMING EVENTS
$url[14]="http://www.mondaviarts.org/index.lasso";
$url[15]="http://www.berkeley.edu/calendar/";
$url[16]="http://sfgate.com/eguide/epicks/";
#$url[17]="http://www.weather.com/weather/print/95816";
#sacramento
$url[17]="http://wwwa.accuweather.com/adcbin/public/local_index.asp?zipcode=95816&partner=accuweather";
#kirkwood
#$url[17]="http://wwwa.accuweather.com/adcbin/public/local_index.asp?zipcode=95646&partner=accuweather";
$url[18]="http://news.yahoo.com/news?tmpl=index2&cid=716";
$url[19]="http://www.reuters.com/newsPhotoGallery.jhtml?type=topNews";

#create a 
UserAgent my $agent_name = "PPMOD-Agent/LWP"; my $ua = LWP::UserAgent->new($agent_name); #enable cookies $ua->cookie_jar( {} ); #set temporary timeout for test $ua->timeout(3); $ua->agent('MSIE/6.0'); #uncomment this line when line 142 is fixed : #142: if(!($test_page[0] =~ /HTTP\/1\.1 200 OK/)){ #ie. don't test for proxy when it is specified in command line. #if ($proxy == "none"){} #test connection to see if proxy is in place. &print_error( "Testing connection...\n"); my $test_request = HTTP::Request->new(GET => "http://perl.com"); @test_page = split(/\n/,$ua->request($test_request)->as_string); # #set real timeout value $ua->timeout($timeout); #do we need to use the proxy? if(!($test_page[0] =~ /HTTP\/1\.1 200 OK/)){ $ua->proxy(['http'], $my_proxy); &print_error( "\nUsing proxy $my_proxy\n"); } else{ &print_error( "\nNot using a proxy\n"); } #initialize value $current_page=0; #loop through each url in above list foreach $webpage (@url) { &print_error( "Requesting $webpage...\n"); #Continue to try if http request fails until we have tried $force number of times. for ($f_count=0;$f_count < $force ;$f_count++) { my $request = HTTP::Request->new(GET => $webpage); #if we are supposed to read this page, issue HTTP request. if( $comics_on || $current_page<3 || $current_page>6 ){ #get each page and split it into an array. $request->header(Accept => 'text/html'); @this_page = split(/\n/,$ua->request($request)->as_string); } #page not found. if( ($this_page[0] =~ /\(Internal Server Error\)/) || ($this_page[0] =~/404 Not Found/)){ &print_error( "ERROR! 
Could not connect to : $webpage\n"); $this_page[0] = "\\(Internal Server Error)\\n"; $this_page[1] = "Error connecting to webpage.\\n"; $this_page[2] = "\Click Here to visit your webpage.\<\/a\>\<\/small\>\\n"; if($f_count<$force-1){ &print_error( "Trying again...\n"); } } else{ #exit for loop because we have the http data $f_count=$force; &print_error( "Data received successfully.\n"); } }# for loop forcing re-reads #create an array of these references for each html page. $all_data[$current_page] = [ @this_page ]; $current_page++; } ######## DONE GETTING HTML ################# # Create Webpage Summary Now, parse pages. # # If you don't know regexps, this code will be gibberish. &print_error( "\nCreating output file...\n"); #create output file open(NEW, ">$Output_file") || die $!; #always start html file with this junk. if ($hour>12) { $american_time=$hour-12; print NEW "\n\n$Page_title : $now $american_time:$min"."pm\n"; }else{ print NEW "\n\n$Page_title : $now $hour:$min"."am\n"; } print NEW "\n"; print NEW "\n\n\n\n"; &print_error( "\nParsing results into new html...\n"); $headline_limit = 0; $counter = 0; #create outer table for rightbar print NEW "\n"; print NEW "
\n\n"; #next three lines create border around news section print NEW " \n"; print NEW "
\n"; print NEW " \n"; print NEW "
\n\n"; #The number here (colspan) corresponds to how many news columns the title bar spans. print NEW " \n \n "; &end_border(); print NEW "\n\n\n"; print NEW " \n \n"; #END GOOGLE NEWS PARSER: $headline_limit = 0; $counter = 0; $title = ""; print NEW "\n\n\n"; print NEW "
\n"; #create border around header &html_border(); print NEW " \n"; print NEW " \n"; #weather - 10 day header print NEW "\n\n\n"; print NEW " \n"; print NEW "\n\n\n"; print NEW " \n"; print NEW " \n"; #create column for rightside of top header print NEW "\n\n\n"; print NEW " \n"; #row inside header to display WOTD print NEW "\n\n\n"; print NEW " \n \n
\n"; $data_flag=0; $counter=0; #table for forecast only print NEW "\n \n"; print NEW " \n"; foreach $line (@{$all_data[17]}) { #DEBUG PRINT ENTIRE FILE #print NEW "\n"; if($line =~ /\(Internal Server Error\)/){ print NEW $line; } if($line =~ /BEGIN FIRST 7/){ $data_flag=1; } if($counter>=7){ $data_flag=0; } #Look for images if (($line =~ /common\/i/) && ($data_flag==1)){ print NEW " \n"; $counter++; } } } $data_flag=0; print NEW "
\n"; $line =~ s/\<\/?font.*?\>//g; $line =~ s/\<\/?b\>//g; $line =~ s/\<\/?a.*?\>//gi; #make picture smaller $line =~ s/\"31\"/\"24\"/g; #remove leading whitespace $line =~ s/^\s+//; print NEW " " . $line; print NEW "\n \n"; } if (($line =~ /sevendaynew/) && ($data_flag==1)){ #remove some unwanted text $line =~ s/\s\d?\d\/\d\d?//g; $line =~ s/\/\/; $line =~ s/High//g; $line =~ s/Low//g; $line =~ s/ F//g; $line =~ s/\;\s*?\//\//; $line =~ s/\<\/?font.*?\>//g; $line =~ s/\<\/?b\>//g; $line =~ s/\<\/?a.*?\>//gi; $line =~ s/^\s+//; print NEW " "; print NEW "".$line."\n"; if($line =~ /deg/){ print NEW "
\n\n"; print NEW "
\n"; #Daily Snowfall $data_flag=0; foreach $line (@{$all_data[9]}) { if($line =~ /\(Internal Server Error\)/){ print NEW $line; } if(($data_flag) && ($line=~/font/)){ $line =~ s/.*?\//; $line =~ s/\<.*//; print NEW " \n "; print NEW "Kirkwood "; print NEW "snowfall: $line\n \n"; $data_flag=0; } if($line=~ /New Snow in the/) { $data_flag=1; } } $data_flag=0; #temporary summer kirkwood section #print NEW " \n "; #print NEW "Days until next season : "; #print NEW "$daysleft\n \n"; #create column for middle of top header print NEW " \n"; print NEW "
\n"; print NEW " Justin's Website Parser
\n"; print NEW "
\n"; print NEW "
\n"; #Stock Ticker foreach $line (@{$all_data[7]}) { if($line =~ /\(Internal Server Error\)/){ print NEW $line; } if($line =~ /l\>Reuters/){ #remove unwanted html tags $line =~ s/\<\/?td.*?\>//g; $line =~ s/\<\/?big.*?\>//g; $line =~ s/\<\/?small.*?\>//g; #remove leading whitespace $line =~ s/^\s+//; #remove everything after "Reuters". $line =~ s/Reuters.*/Reuters/; #make Reuters comment smaller $line =~ s/Reuters/\Reuters\<\/small\>/; #grab the change in stock price #$line =~ /\>([+-][^<]*)/; #$temp_1 = $1; #if($temp_1=~/\+/){ # $temp_1=~s/(.*)/\$1\<\/font\>/; #} #else{ # $temp_1=~s/(.*)/\$1\<\/font\>/; #} #add finance.yahoo.com to url #$line =~ s/href\=\"\//href\=\"http\:\/\/finance\.yahoo\.com\//; #print NEW " \n $line : $temp_1 \n \n
\n"; print NEW " "; print NEW " $line
\n"; } } print NEW "
\n"; ################## WORD OF THE DAY $getNextLine=0; foreach $line (@{$all_data[12]}) { #handle html failures. - don't want this for WOTD! #if($line =~ /\(Internal Server Error\)/){ # print NEW $line; #} if($line=~/span style/){ #add tags $line =~ s/\/\/; $line =~ s/\<\/span.*?\>/\<\/b\>/; #remove all span tags $line =~ s/\<\/?span.*?\>//g; #remove all br tags $line =~ s/\<\/?br.*?\>//g; #remove excessive whitespace $line =~ s/\s+/ /g; print NEW "
\n \n "; print NEW "\n $line"; } if ($getNextLine==1) { print NEW " $line\n \n <\small>\n
\n"; $getNextLine=0; } if ($line =~ /\<\!\-\- WOTD/) { $getNextLine=1; } } ################## end WORD OF THE DAY print NEW "
\n"; #end WOTD Row #right column might go here print NEW "
\n"; #GOOGLE NEWS PARSER: &html_border(); print NEW "Google News
\n"; &end_border(); $errorOccured=0; foreach $line (@{$all_data[0]}) { @temp = split(/\/,$line); foreach $line2 (@temp) { &html_error($line2); #print NEW "\n"; #get categories if( ($line2 =~ /\Top/) || ($line2 =~ /\World/) || ($line2 =~ /\U\.S/) || ($line2 =~ /\Business/) || ($line2 =~ /\Sci/) || ($line2 =~ /\Sports/)){# || # ($line2 =~ /\Enter/) || ($line2 =~ /\Health/)){ #$line2=~ /\([a-zA-Z ]+)\<\/b\>/; #$result = $1; #print $1."\n"; #remove HTML tags. $line2 =~ s/\<\/?IMG.*?\>//g; $line2 =~ s/\<\/?p.*?\>//g; #remove table tags $line2 =~ s/\<\/?tr.*?\>//g; $line2 =~ s/\<\/?td.*?\>//g; $line2 =~ s/\<\/?table.*?\>//g; #remove links $line2 =~ s/\//g; #remove extra characters $line2 =~ s/\|//g; #remove whitespace $line2 =~ s/\s+//g; #Remove JUMP TO feature $line2 =~ s/\ first time if($headline_limit >= 1){ print NEW "\n"; } print NEW $line2, "
\n"; print NEW "\n"; $headline_limit = $Number_of_headlines + 1; } #get headlines and put into file if(($line2 =~ /.*class\=y/) && ($headline_limit > 1)){ #remove table tags $line2 =~ s/\<\/?tr.*?\>//g; $line2 =~ s/\<\/?td.*?\>//g; $line2 =~ s/\<\/?table.*?\>//g; #/url?ntc=0L0A0&q= $line2 =~ s/\/url\?.*?q\=//g; #$line2 =~ s/\<.*?\>//; #removes html $line2 =~ s/class\=.//; #add a linebreak after a
tag $line2 =~ s/\/\\n/g; #change %3F into actual characters $line2 =~ s/\%3F/\?/g; $line2 =~ s/\%3D/\=/g; $line2 =~ s/\%26/\&/g; #shorten links that are too long # $line2 = &shorten_line($line2); print NEW "*$line2
\n"; &print_error( $line2); $headline_limit--; } } } print NEW "
\n"; #SLASHDOT NEWS PARSER: &html_border(); print NEW "Slashdot\n"; &end_border(); print NEW "

\n\n"; $errorOccured=0; foreach $line (@{$all_data[1]}) { &html_error($line); #print NEW " $line
\n"; if($line =~ /Your Headline Reader Has Been Banned/){ print NEW "Slashdot is being a little b*!@#.
\n"; print NEW "Whoever wrote their crummy xml/rss
server "; print NEW "needs a 133tness class
from yours truly."; } #bug fix to remove extra "amp;" due to xml weirdness $line =~ s/amp\;amp\;/amp\;/g; #parse XML for headlines if(($line =~ /\//g; $line =~ s/^\s+//g; $title = $line; $original_title = $title; if(length($title)>$column_width+3){ $title=substr($title,0,$column_width); $title=$title."..."; } } if(($line =~ /\<url\>(.*?)\</) && ($counter <= ($Number_of_headlines*4))){ $headline_url = $1; # print NEW "<div id=\"sdlink$counter\" onMouseDown=\"sdtext$counter.style.display=\'block\';\">\n"; print NEW "<div id=\"sdlink$counter\" onMouseDown=\"ToggleSlash(sdtext$counter);\">\n"; print NEW "<font color=\"blue\"><u>$title</u></font>\n" . "</div>\n"; # print NEW "<a title=\"$original_title\" href=\"$headline_url\">$title</a><br>\n\n"; &print_error( "<a title=\"$original_title\" href=\"$headline_url\">$title</a><br>\n"); # GRAB CONTENT FROM LINK my $slash_req = HTTP::Request->new(GET => $headline_url); @slash_page = split(/\n/,$ua->request($slash_req)->as_string); $div_header = "\t<div id=\"sdtext$counter\" style=\"display:none;\"\">\n";#onMouseDown=\"sdtext$counter.style.display ='none'; $div_header = $div_header . "\t <TABLE border=\"0\" width=\"100%\"><tr><td bgcolor=\"#999999\">\n"; $div_header = $div_header . "\t <TABLE border=\"0\" width=\"100%\"><tr><td bgcolor=\"#eeeeee\" align=\"left\" width=\"190\">\n"; $div_header = $div_header . "\t <small>\n" . "<b><a href=\"$headline_url\">$original_title</a></b><br>\n\t"; $div_tailer = " \t</small>\n\t </td></tr></table>\n\t </td></tr></table>\n\t</div>"; $gotcha=0; foreach $slashline (@slash_page) { if($slashline =~ /\<i\>(.*?)\<\/i\>/i){ print NEW "\n $div_header $1 $div_tailer\n\n"; $gotcha=1; last; } } if($gotcha==0){ print NEW "\n $div_header Couldn't find the story. 
$div_tailer\n\n"; } $counter++; } # # #parse XML for images # if( ($line =~ /\<image\>(\S+)\</) && ($counter<=($Number_of_headlines*4))){ # $image_name = $1; # #http://images.slashdot.org/topics/ # $align="right"; # if($counter % 2 == 0){ # $align="left"; # } # print NEW "<img align=\"$align\" height=\"32\" width=\"40\" src=\"http://images.slashdot.org/topics/"; # print NEW $image_name . "\">\n"; # print NEW $saved_title; # } } print NEW "</small>\n</P>\n"; #END SLASHDOT NEWS PARSER: print NEW "\n\n<!-------- INQUIRER NEWS SECTION -------------->\n"; $valid_data = 0; #INQUIRER NEWS PARSER: &html_border(); print NEW "<A STYLE=\"text-decoration:none\" href=\"http://www.theinquirer.net/\"><FONT size=\"5\" color=\"purple\"><b>The Inquirer</b></FONT></A>\n"; &end_border(); $errorOccured=0; foreach $line (@{$all_data[2]}) { &html_error($line); #parse HTML for headlines if($line =~ /td_mainbody/){ $valid_data=1; } if(($valid_data) && ($line =~ /div class\=\"ht\"/)){ #remove all the following HTML tags. $line =~ s/\<\/?td.*?\>//g; $line =~ s/\<\/?tr.*?\>//g; $line =~ s/\<\/?img.*?\>//g; $line =~ s/\<\/?b.*?\>//g; $line =~ s/\<\/?embed.*?\>//g; $line =~ s/\<\/?hr.*?\>//g; $line =~ s/\<\/?s.*?\>//g; #replace everything after a link with a <br>tag. 
$line =~ s/\/a\>/\/a\>\<br\>/g; #insert theinquirer.net into a link url $line =~ s/\"\?/\"http\:\/\/www\.theinquirer\.net\/\?/g; print NEW "</small>\n<P>\n<small>\n"; @temp = split(/\<\/div\>\<\/div\>\<\/div\>/,$line); $count = 0; foreach $newline (@temp) { if(($count < ($Number_of_headlines*4)) && !($newline=~/2002 Breakthrough Publishing Ltd/) && !($newline=~/Advert/)) { $newline =~ s/\<\/?div.*?\>//g; #remove font tags and everything inside them $newline =~ s/\<font.*?\/font\>//g; $newline =~ s/\s+$//; $newline =~ s/\<br\>.*/\<br\>\n/; #remove everything between <a></a> links $newline =~ s/\<\/a.*?\>.*?\<a/\<\/a\>\<br\>\n\<a/g; #shorten links that are too long $newline = &shorten_line($newline); #remove leading whitespace $newline =~ s/^\s+//; print NEW $newline; &print_error( $newline); } $count++; } if($newline=~/\w+/){ print NEW "</small>\n</P>\n"; print NEW "<BR>\n"; } } } print NEW " </TD>\n <TD valign=\"top\">\n"; #END INQUIRER NEWS PARSER: #!@#$%^&*() print NEW "\n\n<!-------- SACBEE NEWS SECTION -------------->\n"; #SacBee NEWS & INFO PARSER &html_border(); print NEW "<A STYLE=\"text-decoration:none\" href=\"http://sacbee.com/\"><FONT size=\"5\" color=\"orange\"><b>SacBee</b></FONT></A>"; &end_border(); print NEW "<BR>\n<small>\n"; $data_flag=$Number_of_headlines * 7; $errorOccured=0; foreach $line (@{$all_data[8]}) { &html_error($line); #grab and print actual links to articles if( (($line =~ /nheadLineSS/) || ($line =~ /ntopStory/)) && ($data_flag>=0)){ $line =~ s/\<br \/\>/\<br\>/g; #add html to link $line =~ s/href\=\"/href\=\"http\:\/\/www\.sacbee\.com/; $line = &shorten_line($line); print NEW "$line\n"; &print_error( "$line\n"); $data_flag--; } } print NEW"</small>\n</P>\n"; #END SACBEE NEWS & INFO PARSER #print NEW "\n\n<!-------- YAHOO/REUTERS NEWS SECTION -------------->\n"; ##YAHOO NEWS & INFO PARSER #&html_border(); #print NEW "<A STYLE=\"text-decoration:none\" href=\"http://my.yahoo.com/\"><FONT size=\"5\" color=\"orange\"><b>My 
Yahoo!</b></FONT></A><BR>\n<small>\n"; #&end_border(); # #$data_flag=0; $errorOccured=0; #foreach $line (@{$all_data[8]}) { # &html_error($line); # # # #grab and print actual links to articles # if(($data_flag) && ($line =~ /story\.news\.yahoo\.com/)){ # $line =~ s/\<\/?li.*?\>/\n/g; # $line =~ s/\<\/?td.*?\>//g; # $line =~ s/\<\/?tr.*?\>//g; # $line =~ s/\<\/?font.*?\>//g; # $line =~ s/\<\/?ul.*?\>//g; # $line =~ s/\/a\>/\/a\>\<br\>/g; # $line =~ s/\<i.*?\/i\>//g; # if(!($line =~ /\<b\>/)){ # # #do a split here. # @three_headlines = split(/\n/,$line); # foreach $line (@three_headlines) { # # #shorten links that are too long # $line = &shorten_line($line); # # print NEW "$line\n"; # &print_error( "$line\n"); # # $data_flag--; # } # } # } # #grab and print headlines # if (($line =~ /Top Stories from Reuters/) || ($line =~ /World News from Reuters/) || ($line =~ /Business News from Reuters/) || ($line=~/Politics News from Reuters/)){ # #$line =~ s/\<[^a].*?\<\/.*?\>//g; # $line =~ s/.*\<b\>/\<b\>/g; # $line =~ s/\<\/?td.*?\>//g; # $line =~ s/\<\/?tr.*?\>//g; # $line =~ s/\<\/?br.*?\>//g; # $line =~ s/\<\/?font.*?\>//g; # $line =~ s/\<\/?table.*?\>//g; # $line =~ s/\<i.*?\/i\>//g; # $line =~ s/\<\!.*?\/\-\-\>//g; # # #may want to remove these # $line =~ s/from\s+Reuters//g; # $line =~ s/\<\/?a.*?\>//g; # print NEW "</small>\n$line\n<br>\n<small>"; # $data_flag=2; # } # #} #print NEW"</small>\n</P>\n"; ##END YAHOO NEWS & INFO PARSER print NEW "\n\n<!-------- ACESHARDWARE NEWS SECTION -------------->\n"; $valid_data = 0; $second_line=0; ############ ACESHARDWARE NEWS PARSER: ################ print NEW "<br>\n"; &html_border(); print NEW "<A STYLE=\"text-decoration:none\" href=\"$url[10]\"><FONT size=\"5\" color=\"blue\"><b>Ace's Hardware</b></FONT></A>\n"; &end_border(); print NEW "<P>\n<small>\n"; $errorOccured=0; $link_var = ""; foreach $line (@{$all_data[10]}) { &html_error($line); #PRINTS ARTICLE TITLE if ($second_line) { $line =~ s/\<\/?font.*?\>//g; $line =~ 
s/\<\/?b\>//g; #remove extra whitespace $line =~ s/\s*$//; $line =~ s/^\s*//; $line_new = $link_var . $line; $line_new = shorten_line($line_new); print NEW $line_new."\n"; &print_error( "$line_new\n"); } $second_line=0; #Finds LINK if(($valid_data) && ($line=~/\<a/)){ #print NEW "\n<!--$line-->\n"; $line =~ s/href\=\"\#/href=\"http\:\/\/www\.aceshardware\.com\/\#/g; $line =~ s/\<\/?font.*?\>//g; $line =~ s/\<\/?b\>//g; #insert breaks after <br> tags $line =~ s/\<br\>/\<br\>\n/g; #remove extra whitespace $line =~ s/\s*$//; $line =~ s/^\s*//; #insert title #$line =~s /\<a\s+h/\<a title\=\"$temp_line\" h/; $second_line=1; #print NEW $line; #&print_error( "$line"); $link_var = $line; } if ($line =~ /Latest News\<\/b\>/){ $valid_data = 1; } if ($line =~ /\<\/td/) { $valid_data = 0; } } print NEW "</small>\n</P>\n </TD>\n <TD valign=\"top\" width=\"180\">"; ######################################## #Yahoo Top Images # print NEW " <Table>\n <TR><TD valign=\"top\" width=\"180\">\n"; print NEW "\n <!-------- News Images from Yahoo-------------->\n"; &html_border(); print NEW "<A STYLE=\"text-decoration:none\""; print NEW "href=\"http://news.yahoo.com/news?tmpl=index2&cid=716/\">"; print NEW "<FONT size=\"5\" color=\"red\"><b>Yahoo News</b></FONT></A><BR>\n"; &end_border(); print NEW " <P>\n"; $valid_data=0; $alternate=0; $errorOccured=0; $picture_size = 35; foreach $line (@{$all_data[18]}) { &html_error($line); if($line =~ /class\=topstory/){ $line =~ s/.*?\>//; print NEW " <small>$line</small></a><br clear=all>\n"; } if(($line =~ /alt\=Photo/) or ($line =~ /alt\=Slideshow/)){ #only one leading space $line =~ s/\s+/ /g; #trim size $line =~ s/width\=(\d{2,3})/width\=50/; $width = $1; $line =~ s/height\=(\d{2,3})/height\=50/; $height = $1; #keep original aspect ratio $ratio = $width / $height; if($width > $height){ $height = $picture_size; $width = $picture_size * $ratio; }else{ $width = $picture_size; $height = $picture_size / $ratio; } $line =~ 
s/height\=(\d{2,3})/height\=$height/; $line =~ s/width\=(\d{2,3})/width\=$width/; #remove </a> tags $line =~ s/\<\/a\>//; #edit <a> tags for no underline, <img> tags no border # $line =~ s/\<a/\<a STYLE\=\"text\-decoration\:none\"/; $line =~ s/\<img/\<img border\=\"0\"/; #remove comments $line =~ s/\<\!\-\-.*?\-\-\>//g; #align alternating $line =~ s/align\=left//; $line =~ s/align\=right//; if($alternate==0){ $line =~ s/\<img/\<img align\=\"left\"/; $alternate = 1; }else{ $line =~ s/\<img/\<img align\=\"right\"/; $alternate = 0; } print NEW " $line\n"; $valid_data = 1; } } #border for news section &end_border(); #end news section print NEW " </TD>\n </TR>\n</TABLE>\n<BR>\n"; #start 4th Column print NEW " </TD>\n <TD rowspan=\"9\" valign=\"top\">\n<!--START Righthand COLUMN 4-->\n"; ####### SPECIAL RIGHT HAND COLUMN DATA if (open(RB, "rightbar.txt")) { while (<RB>){ print NEW $_; } } else{ print NEW "\n\t<!-- CANNOT FIND rightbar.txt with HTML DATA-->\n"; } print NEW "\t<!-- END OF RIGHTHAND COLUMN-->\n"; close(RB); ####### END SPECIAL RIGHT HAND COLUMN DATA #create outer table for rightbar print NEW "\t </TD></TR></TABLE>\n\n"; #END OF NEWS TABLE, START COMICS TABLE print NEW "\n\n\n<!-------- END NEWS : START COMICS -------------->\n\n\n"; print NEW "<TABLE border=\"0\">\n <TR>\n <TD colspan=\"8\" valign=\"top\">\n"; if (!($comics_on)) { print NEW "\n\n\n<!--------- COMICS ARE OFF ------------------->\n\n\n"; print NEW "<h3>Comics are off</h3>\n"; } else{ print NEW "\n <!-------- COMICS:Dilbert SECTION -------------->\n"; $comic = ""; #DILBDERT URL PARSER: print NEW " <hr><FONT size=\"5\" color=\"blue\">Comics</FONT>\n <P>\n"; $errorOccured=0; foreach $line (@{$all_data[3]}) { &html_error($line); if($line =~ /Today\'s Dilbert Comic/){ $line =~ s/\<\/?TD.*?\>//g; $line =~ s/\<\/?BR.*?\>//g; $line =~ s/\<\/?a.*?\>//gi; $line =~ s/\<\!\-\-.*?\-\-\>//g; #<IMG SRC="/images/ffffff_dot.gif" $line =~ s/\<IMG SRC\=\"\/images\/fffff.*?\>//g; #$line =~ s/\<IMG 
SRC=\".*?mystery_artist.*?\>//; #<IMG SRC="/comics/dilbert/ $line =~ s/\<IMG\s*SRC\=\"\/comics\/dilbert//g; $line =~ s/gif\".*/gif\"/; $line =~ s/^\s+//g; $comic = $line; print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.dilbert.com\"><IMG border=\"0\" SRC=\"http://www.dilbert.com/comics/dilbert$comic></A>\n"; } } print NEW " </P>\n"; #END DILBERT URL PARSER: ######################################## #PENNY ARCADE URL PARSER: print NEW "\n <!-------- COMICS:Penny Arcade SECTION -------------->\n"; $comic = ""; print NEW " <P>\n"; $errorOccured=0; foreach $line (@{$all_data[4]}) { &html_error($line); #Ack! Y2K10 bug!!!! if($line =~ /images\/200/){ $line =~ s/\<\/?td.*?\>//g; $line =~ s/ALT\=\"\"//; $line =~ s/c\=\"/c\=\"http\:\/\/www\.penny\-arcade\.com\//; $line =~ s/\s+/ /g; $line =~ s/\<img/\<img border\=\"0\"/; print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.penny-arcade.com\">"; print NEW $line; print NEW "</A>\n"; } } print NEW " </P>\n"; #END PENNY ARCADE URL PARSER: ######################################## #PVP SECTION print NEW "\n <!-------- COMICS:PVP Online SECTION -------------->\n"; print NEW " <P>\n"; $errorOccured=0; foreach $line (@{$all_data[5]}) { &html_error($line); if($line =~ /IMG.*?archive\//){ $line =~ s/C\=\"/C\=\"http\:\/\/www\.pvponline\.com\//; $line =~ s/\<\/?center.*?\>//gi; $line =~ s/\<\/?br.*?\>//g; $line =~ s/\<\/?td.*?\>//g; #remove extra <img> tag $line =~ s/\<img.*?images.*?\>//; $line =~ s/\<IMG/\<IMG align\=\"left\" border\=\"0\"/; #fix: remove align="left" $line =~ s/align\=\"left\"//g; print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.pvponline.com/\">"; print NEW $line; print NEW "</A>\n"; } } print NEW " </P>\n"; #END PVP URL PARSER ######################################## #Diesel Sweeties print NEW "\n <!-------- COMICS:Diesel Sweeties SECTION -------------->\n"; print NEW " <P>\n"; $errorOccured=0; $linecount=0; foreach $line (@{$all_data[6]}) { &html_error($line); $linecount++; 
# Tail of the Diesel Sweeties comic loop.  The enclosing foreach and the
# comics-enabled if block are opened earlier in the file.
# Picks the line tagged "***newest", trims it down to its <img> tag, makes the
# image src absolute and emits it wrapped in a link to the site.
if($line =~ /\*\*\*newest/){
    $line =~ s/.*?\*\*\*newest//;
    $line =~ s/.*?\<img/\<img/i;
    $line =~ s/\>.*/\>/;
    $line =~ s/^\s*//;
    $line =~ s/src\=\"/src\=\"http\:\/\/www\.dieselsweeties\.com/;
    # $line =~ s/\<img/\<img align\=\"right\"/i;
    print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.dieselsweeties.com/\">";
    print NEW $line ."\n";
    print NEW " </A>\n";
}
} # end of the Diesel Sweeties foreach (opened above this chunk)
print NEW " </P>\n";

########################################
#Little Gamers SECTION
# Scans $all_data[20] for the line containing "show_strip", makes the image
# src absolute, strips table row/cell tags and emits the image as a link.
print NEW "\n <!-------- COMICS:Little Gamers Online SECTION -------------->\n";
print NEW " <P>\n";
$errorOccured=0;
foreach $line (@{$all_data[20]}) {
    &html_error($line);
    if($line =~ /show_strip/){
        $line =~ s/src\=\'/src\=\'http\:\/\/www\.little\-gamers\.com/;
        $line =~ s/\' border\=0/\'/i;
        $line =~ s/\<\/?tr.*?\>//gi;
        $line =~ s/\<\/?td.*?\>//gi;
        $line =~ s/\<img/\<img align\=\"left\" border\=\"0\"/i;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.little-gamers.com/\">";
        print NEW $line;
        print NEW "</A>\n";
    }
}
print NEW " </P>\n";
#END LittleGamers URL PARSER

########################################
#MegaTokyo Gamers SECTION
# Scans $all_data[21] for lines referencing "/strips", reduces them to the
# strip's <img> tag with an absolute src and emits it as a link.
print NEW "\n <!-------- COMICS:MegaTokyo Online SECTION -------------->\n";
print NEW " <br clear=\"all\">\n";
print NEW " <P>\n";
$errorOccured=0;
foreach $line (@{$all_data[21]}) {
    &html_error($line);
    if($line =~ /\/strips/){
        $line =~ s/\<\/?tr.*?\>//gi;
        $line =~ s/\<\/?td.*?\>//gi;
        $line =~ s/.*?\<img src\=\"\/strips/\<img src\=\"\/strips/i;
        $line =~ s/src\=\"/src\=\"http\:\/\/www\.megatokyo\.com/i;
        $line =~ s/\.gif\".*/\.gif\"\>/i;
        $line =~ s/\<img/\<img align\=\"left\" border\=\"0\"/i;
        print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.megatokyo.com/\">";
        print NEW $line;
        print NEW "</A>\n";
    }
}
print NEW " </P>\n";
#END MEGATOKYO URL PARSER

########################################
#CTRL+ALT+DEL Gamers SECTION
# Scans $all_data[22] for lines referencing "/comics", cuts everything after
# the .jpg file name, makes the src absolute and emits the image as a link.
print NEW "\n <!-------- COMICS:CTRL+ALT+DEL Online SECTION -------------->\n";
print NEW " <br clear=\"all\">\n";
print NEW " <P>\n";
$errorOccured=0;
foreach $line (@{$all_data[22]}) {
    &html_error($line);
    if($line =~ /\/comics/){
        $line =~ s/\.jpg\".*/\.jpg\"\>/i;
        $line =~ s/src\=\"/src\=\"http\:\/\/www\.ctrlaltdel\-online\.com/i;
        $line =~ s/\<img/\<img align\=\"left\" border\=\"0\"/i;
        print NEW " <A STYLE=\"text-decoration:none\" href=\"http://www.ctrlaltdel-online.com/\">";
        print NEW $line;
        print NEW "</A>\n";
    }
}
print NEW " </P>\n";
#END CTRL+ALT+DEL URL PARSER
print NEW "\n\n";
}#comicsoff if statement.
#

########################################
#Reuters Top Images
# Emits a bordered "Reuters News" heading, then for each line of
# $all_data[19]: thumbnail lines (marked "smPicBorder") are stripped of
# class/mouse-over attributes and their javascript popup is turned into a
# plain reuters.com link; caption lines (containing "</b><br>") are printed
# in small type.
print NEW "\n <!-------- News Images from Reuters-------------->\n";
print NEW "<tr><td width=\"240\">\n<br clear=\"all\">\n";
&html_border();
# The trailing space keeps the STYLE and href attributes separated; without
# it the two prints ran together as ...none"href="...
print NEW "<A STYLE=\"text-decoration:none\" ";
print NEW "href=\"http://www.reuters.com/newsPhotoGallery.jhtml?type=topNews\">";
print NEW "<FONT size=\"5\" color=\"orange\"><b>Reuters News</b></FONT></A><BR>\n";
&end_border();
print NEW " <P>\n";
$valid_data=0;
$errorOccured=0;
foreach $line (@{$all_data[19]}) {
    &html_error($line);
    if($line =~ /smPicBorder/){
        $line =~ s/class\=\".*?\"//gi;
        $line =~ s/onMouseOver\=\".*?\"//gi;
        $line =~ s/onMouseOut\=\".*?\"//gi;
        $line =~ s/javascript\:commonPopup\(\'/http\:\/\/www\.reuters\.com\//i;
        $line =~ s/\'.*?\>/\"\>/;
        $line =~ s/align\=\"\"/align\=\"left\"/i;
        print NEW "\t$line\n";
    }
    if($line =~ /\<\/b\>\<br\>/){
        $line =~ s/^\s+//;
        print NEW "\t<small><small>$line</small></small>\n";
    }
}
# Close the paragraph opened above.  This used to be "<\p>", which Perl
# prints as "<p>" because \p is not a string escape.
print NEW "</p>\n";
print NEW "</td></tr>\n\n";

print NEW " </TD>\n </TR></TABLE>\n";

&print_error( "\n\nFinished. Closing $Output_file with the results.\n");

print NEW " </TD>\n </TR>\n</Table>\n";

########################################
#QOTD SECTION
# Emits a hidden, absolutely positioned <div id="qotd"> holding a bordered
# box.  The quote is the line that follows the one containing
# "randomly selected quote" in $all_data[13]; it is printed as a link to
# $url[13] after its first <A>...</A> element is removed.
#create row for qotd
print NEW "\n\n<!-------- Quote of the Day SECTION -------------->\n";
print NEW "<div id=\"qotd\" background=\"#555555\" style=\"visibility:hidden; position:absolute; left:270;\">\n";
# Hand-written border tables (fixed widths); closed by end_border() below.
print NEW "\t<TABLE border=\"0\" width=\"356\"><tr><td bgcolor=\"#999999\">\n";
print NEW "\t<TABLE border=\"0\" width=\"350\"><tr><td bgcolor=\"#eeeeee\" align=\"center\">\n";
print NEW "\t <font size=\"3\"><b>Quote of the Day</b></font><br>\n";
#QOTD
$data_flag=0;
foreach $line (@{$all_data[13]}) {
    if($line =~ /\(Internal Server Error\)/ ){
        print NEW "\t <small> $line </small>\n";
    }
    if ($data_flag==1){
        $line =~ s/\<A.*?\<\/A\>//;
        print NEW "\t <small>\n\t <a href=\"$url[13]\" style=\"text-decoration:none\"><font color=\"black\">";
        print NEW $line;
        # Close in reverse order of opening (<a><font> ... </font></a>).
        print NEW "</font></a>\n\t </small>\n";
        $data_flag=0;
    }
    # Checked last so that the flag applies to the NEXT line, not this one.
    if($line=~/randomly selected quote/){
        $data_flag=1;
    }
}
&end_border();
print NEW "</div>\n";
#END QOTD SECTION

########################################
#Upcoming Events SECTION
# Emits a hidden <div id="UpcomingEvents"> containing a three-column table:
# Mondavi Center, UC Berkeley and SF Gate listings.
print NEW "\n\n<!-------- EVENTS SECTION -------------->\n\n";
print NEW "<div id=\"UpcomingEvents\" background=\"white\" style=\"visibility:hidden; position:absolute; top:100; left:30;\">\n";
&html_border();
#events table
print NEW "<table><tr>\n";

print NEW "\n\n<!-------- Mondavi Arts Center -------------->\n";
print NEW "<td align=\"left\" valign=\"top\" width=\"30%\">\n";
#MONDAVI ARTS CENTER
# The listing arrives as a single line of $all_data[14]; it is split on
# "</a>" and each piece is either printed as an event link (pieces marked
# "whitelink2") or as plain detail text with all markup removed.
#   $data_flag - 1 while inside the featured-events listing
#   $more_info - 1 between a "blackcopysmall" piece and a "membership/" piece
print NEW "<small><b>Mondavi Center</b><br>\n";
$errorOccured=0;
$data_flag=0;
$more_info=0;
foreach $line (@{$all_data[14]}) {
    $line_counter++;   # only referenced by the commented-out debug print below
    &html_error($line);
    #look for good data indicator:
    #if($line=~/Begin lasso content/){
    if($line=~/Featured Events\"/){
        $data_flag=1;
        #remove everything before our indicator.
        # NOTE(review): this pattern looks for 'Featured Event"' (singular)
        # while the test above matches 'Featured Events"' - confirm whether
        # the substitution is ever meant to fire.
        $line =~ s/.*?\<img.*?Featured Event\"/\<img Featured Events\"/;
        #print NEW "\n<!--\n$line_counter:$line\n-->\n";
        #data is all one line for some reason.
        @lasso_data = split(/\<\/a>/,$line);
        foreach $line2 (@lasso_data) {
            if (($data_flag==1) || ($more_info==1)){
                # End-of-listing markers.
                if (($line2=~/announcements\.gif/) || ($line2 =~ /Post-Performance/)){
                    $data_flag=0;
                }
                if ($line2=~/membership\//) {
                    $more_info=0;
                }
                if ($line2 =~ /blackcopysmall/) {
                    $more_info=1;
                }
                if (($data_flag==1) && ($line2 =~ /whitelink2/)){
                    # Event title: keep the <a> tag, drop all other markup.
                    $line2 =~ s/\<\!\-\-.*?\-\-\>//g;
                    $line2 =~ s/\&.*?\;//g;
                    $line2 =~ s/\<\/?td.*?\>//g;
                    $line2 =~ s/\<\/?tr.*?\>//g;
                    $line2 =~ s/\<\/?span.*?\>//g;
                    $line2 =~ s/\<\/?blockquote.*?\>//g;
                    $line2 =~ s/\<\/?br.*?\>//g;
                    $line2 =~ s/\<\/?p.*?\>//g;
                    $line2 =~ s/\<\/?img.*?\>//g;
                    $line2 =~ s/\<\/?div.*?\>//g; #########
                    # NOTE(review): the run of '#' above is ambiguous in the
                    # collapsed source; the table strip is treated as live
                    # code - confirm it was not meant to be commented out.
                    $line2 =~ s/\<\/?table.*?\>//g;
                    $line2 =~ s/\<a href\=\"/\<a href\=\"http\:\/\/www\.mondaviarts\.org\//;
                    $line2 =~ s/\s+/ /g;
                    $line2 =~ s/^\s+//;
                    # split() consumed the closing </a>; put it back.
                    print NEW "$line2</a><br> ";
                }
                elsif (($data_flag==1) || ($more_info==1)){
                    # Detail text: drop all markup, including links.
                    $line2 =~ s/\<\!\-\-.*?\-\-\>//g;
                    #get rid of extra <a link
                    $line2 =~ s/\<\/?a.*?\>//g;
                    $line2 =~ s/\&.*?\;//g;
                    $line2 =~ s/\<\/?td.*?\>//g;
                    $line2 =~ s/\<\/?blockquote.*?\>//g;
                    $line2 =~ s/\<\/?br.*?\>//g;
                    $line2 =~ s/\<\/?p.*?\>//g;
                    $line2 =~ s/\<\/?tr.*?\>//g;
                    $line2 =~ s/\<\/?td.*?\>//g;
                    $line2 =~ s/\<\/?span.*?\>//g;
                    $line2 =~ s/\<\/?img.*?\>//g;
                    $line2 =~ s/\<\/?div.*?\>//g; #########
                    # NOTE(review): same ambiguity as in the branch above.
                    $line2 =~ s/\<\/?table.*?\>//g;
                    $line2 =~ s/\s+/ /g;
                    $line2 =~ s/Post-Performance.*//;
                    # Skip pieces that are empty after stripping.
                    if ($line2 =~ /[a-zA-Z<]/) {
                        print NEW "$line2<BR>\n";
                    }
                }
            }
        }
    }
}
print NEW "</small>\n";
#end mondavi column
print NEW "</td>\n";

#UC Berkley
# Prints at most $limit cleaned-up lines of $all_data[15], starting at the
# first </tr> after "Events this week" and stopping at a line containing
# "#top".  Page-local "#anchor" links are made absolute.
print NEW "\n\n<!-------- UC Berkley -------------->\n";
print NEW "<td align=\"left\" valign=\"top\" width=\"30%\">\n";
print NEW "<small><b>UC Berkley</b><br>\n";
$errorOccured=0;
$limit=34;
$pre_data_flag = 0;
$data_flag=0;
$a_tag_start=0;
foreach $line (@{$all_data[15]}) {
    &html_error($line);
    if ($line =~ /\#top/) {
        #get rid of everything after #top
        $line =~ s/\#top.*/\#top/;
        $data_flag=0;
        $pre_data_flag=0;
    }
    if ($line =~ /Events this week/) {
        $pre_data_flag=1;
        #print NEW "--$line--\n";
    }
    if (($pre_data_flag==1) && ($line =~ /\<\/tr\>/)) {
        $data_flag=1;
    }
    if (($data_flag==1) && ($limit > 0)){
        $line =~ s/\<\/?td.*?\>//g;
        $line =~ s/\<\/?img.*?\>//g;
        $line =~ s/\<\/?tr.*?\>//g;
        $line =~ s/\<\/?table.*?\>//g;
        $line =~ s/\s+/ /g;
        $line =~ s/^\s+//g;
        $line =~ s/\<\/a\>/\<\/a\>\<br\>\n/g;
        $line =~ s/\<a href\=\"\#/\<a href\=\"http\:\/\/www\.berkeley\.edu\/calendar\/\#/;
        print NEW "$line";
        $limit--;
    }
}
print NEW "</small>\n";
print NEW "</td>\n";
#end Berkely column

#EPICKS
# Prints the non-blank lines of $all_data[16] from the line containing
# "Movies:" up to the next line matching the TABLE pattern, with
# TD/TR/FONT/P tags removed and "#anchor" links made absolute.
print NEW "\n\n<!-------- SF E-Picks -------------->\n";
print NEW "<td align=\"left\" valign=\"top\" width=\"30%\">\n";
print NEW "<small><b>SF Gate</b><br>\n";
$errorOccured=0;
$data_flag=0;
foreach $line (@{$all_data[16]}) {
    &html_error($line);
    if ($line =~ /Movies:/) {
        $data_flag=1;
    }
    # NOTE(review): \T is not a regex escape, so this presumably matches the
    # literal text "TABLE"; confirm whether "/TABLE" (the closing tag) was
    # intended.
    if ($line =~ /\TABLE/) {
        $data_flag=0;
    }
    if ($data_flag==1){
        $line =~ s/\<\/?TD.*?\>//g;
        $line =~ s/\<\/?TR.*?\>//g;
        $line =~ s/\<\/?FONT.*?\>//g;
        $line =~ s/\<\/?P.*?\>//g;
        $line =~ s/HREF\=\"\#/href\=\"http\:\/\/sfgate\.com\/eguide\/epicks\/\#/;
        $line =~ s/^\s+//g;
        if ($line =~ /\S+/) {
            print NEW "$line\n";
        }
    }
}
print NEW "</small>\n";
print NEW "</td>\n";
#EPICKS

print NEW " </tr>\n";
print NEW "</table>\n";
&end_border();
print NEW "</div>\n\n";
#END Events SECTION

print NEW " </TD></TR>\n";

########################################
#COPYRIGHT WARNING
# Page footer; also closes the document and the output/log file handles.
print NEW "\n<p>\n <center>\n <small>\n";
print NEW " <!----COPYRIGHT WARNING---->\n <b>COPYRIGHT WARNING</b><br>\n";
print NEW " Webpage content is owned by corresponding webpage owner,";
print NEW " see individual pages for details.\n";
print NEW " </small>\n </center>\n</p>\n\n";
print NEW "</BODY>\n</HTML>\n";
print NEW "<!--END OF FILE-->\n";

close(NEW);
if($logfiles){
    close(LOGFILE);
}

##########################################
#FUNCTIONS

# print_error(MESSAGE)
#   Writes MESSAGE to LOGFILE and to STDOUT, but only when $logfiles is
#   true; otherwise nothing is printed at all.
#   NOTE(review): because the STDOUT print sits inside the $logfiles check,
#   status and error messages never reach the console unless logging is
#   enabled - confirm that this is intended.
sub print_error{
    my ($message) = @_;
    if ($logfiles){
        print LOGFILE $message;
        print $message;
    }
}

# shorten_line(HTML)
#   Truncates an over-long run of text in HTML so that it fits a news
#   column, and stores the untruncated text in a title="" attribute on the
#   first <a tag.
#
#   HTML     - a fragment such as '<a href="...">headline</a>'
#   Returns  - the fragment, modified only if some text run following a '>'
#              is at least as long as the limit.
#
#   The limit is $column_width + 3, or just $column_width when the fragment
#   contains seven or more upper-case letters.  A shortened run is exactly
#   as long as the limit: limit-3 characters followed by "...".
sub shorten_line{
    my ($long_string) = @_;

    # Lexical temporaries: the previous version wrote the package global
    # $line2, which is also the loop variable of the Mondavi parser.
    my $max_len;
    #look for too many uppercase letters
    if ($long_string =~ /[A-Z].*?[A-Z].*?[A-Z].*?[A-Z].*?[A-Z].*?[A-Z].*?[A-Z]/) {
        $max_len = $column_width;
    }
    else{
        #set full length of string.
        $max_len = $column_width+3;
    }

    if ($long_string =~ /\>([^<]{$max_len,})/ ) {
        my $full_text  = $1;
        my $short_text = substr($full_text, 0, $max_len-3) . "...";

        # Double quotes would end the title="" attribute early, so encode
        # them as an HTML entity.
        $short_text =~ s/\"/&quot;/g;
        $full_text  =~ s/\"/&quot;/g;

        # Shorten the same text run that was measured above.  Matching just
        # the first '>' would hit an earlier tag in input like '<b><a ...>'.
        # This is done before the title is added so the attribute text
        # cannot be matched by mistake.
        $long_string =~ s/\>[^<]{$max_len,}/\>$short_text/;
        $long_string =~ s/\<a/\<a title\=\"$full_text\"/;
    }
    return $long_string;
}

# html_border()
#   Opens the two nested tables (grey outer, light inner, both 100% wide)
#   that draw a border around a block.  Pair with end_border().
sub html_border{
    print NEW "\n\t<!--BORDER-2 START-->\n";
    print NEW "\t<TABLE border=\"0\" width=\"100%\"><tr><td bgcolor=\"#999999\">\n";
    print NEW "\t<TABLE border=\"0\" width=\"100%\"><tr><td bgcolor=\"#eeeeee\" align=\"center\">\n";
    print NEW "\t<!--BORDER-2 START-->\n\n";
}

# end_border()
#   Closes the two nested tables opened by html_border().
sub end_border{
    print NEW "\n\t<!--BORDER-2 END-->\n";
    print NEW "\t</td></tr></table>\n";
    print NEW "\t</td></tr></table>\n";
    print NEW "\t<!--BORDER-2 END-->\n\n";
}

# html_error(LINE)
#   Copies a failed-fetch report into the page: prints LINE when it contains
#   "(Internal Server Error)", and also prints the next two lines passed in
#   afterwards.  Progress is tracked in the global $errorOccured, which each
#   section resets to 0 before its loop.
sub html_error{
    my ($error_line) = @_;
    #handle failed html lookups
    if( ($error_line =~ /\(Internal Server Error\)/) || ($errorOccured>0 && $errorOccured<3)){
        print NEW $error_line;
        $errorOccured++;
    }
}

#EOF#

###############OLD CNN NEWS SECTION ##############
# NOTE(review): this section is still live code.  Sub definitions do not stop
# execution, so it runs after close(NEW) above and its prints to NEW go to a
# closed handle.  Confirm whether it should be disabled (e.g. placed after
# __END__) or moved above the close.
#
# For each category heading found in $all_data[11] it prints the section
# name, followed by up to $Number_of_headlines bulleted headlines with
# absolute cnn.com links, shortened by shorten_line().
print NEW "\n\n<!-------- CNN NEWS SECTION -------------->\n";
&html_border();
print NEW "<A STYLE=\"text-decoration:none\" href=\"$url[11]\"><FONT size=\"5\" color=\"red\"><b>CNN News</b></FONT></A><BR>\n";
&end_border();
$errorOccured=0;
foreach $line (@{$all_data[11]}) {
    &html_error($line);
    #get rid of <td> tags
    $line =~ s/\<td.*?\>//g;
    #get categories
    if( ($line=~ /cnnMainSections/) &&
        ( ($line =~ /Education/) ||
          ($line =~ /World/) ||
          ($line =~ /U\.S/) ||
          ($line =~ /Business/) ||
          ($line =~ /Scien/) ||
          ($line =~ /Sports/) ||
          ($line =~ /Entert/) ||
          ($line =~ /Health/) ||
          ($line =~ /Travel/) ||
          ($line =~ /Asia/) ||
          ($line =~ /Europe/) ||
          ($line =~ /Technol/) ) ){
        $line =~ s/<div.*?\>//g;
        #get name of section
        $line =~ s/.*?\"\s+alt\=\"/\<b\>/;
        $line =~ s/\s*\w+\:\s*\".*/\<\/b\>/;
        #dont print </small> first time
        if($headline_limit >= 1){
            print NEW "</small>\n";
        }
        print NEW $line, "<BR>\n";
        print NEW "<small>\n";
        # One more than the quota because headlines print only while the
        # counter is greater than 1.
        $headline_limit = $Number_of_headlines + 1;
    }
    #get headlines and put into file
    if( (($line =~ /\&\#8226\;\&/) || ($line =~ /\&\#149\;\&/)) &&
        ($headline_limit > 1) &&
        !($line=~ /target\=\"new\"/) ){
        #$line =~ s/\<.*?\>//; #removes html
        $line =~ s/<img.*?\>//g;
        $line =~ s/<br.*?\>//g;
        $line =~ s/<span.*?\/span\>//g;
        $line =~ s/<div.*?\>//g;
        $line =~ s/\&.*?\;//g;
        #add <br> tags after link
        $line =~ s/\/a\>/\/a\>\<br\>\n/g;
        #add cnn.com to links
        $line =~ s/\<a href\=\"/\<a href\=\"http\:\/\/www\.cnn\.com/g;
        #shorten links that are too long
        $line = &shorten_line($line);
        #remove leading whitespace
        $line =~ s/^\s+//;
        print NEW $line;
        &print_error( $line);
        $headline_limit--;
    }
}
############### END CNN SECTION ###############