#!/bin/perl
#
# Bradley Huffaker (07/31/98)
# To create a data file based on the connections between html documents
#
# The script crawls outward from $start_file, following <a href> and
# <img src> links up to $depth hops, and then prints the resulting graph
# (nodes and links) to STDOUT.
#
use strict;
use warnings;
use Socket;

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
our $start_file = "http://caida.org/Tools/Otter/index.html";
our $depth      = 2;

# ---------------------------------------------------------------------------
# Crawl state (package globals, shared by ProcessURL and PrintFVL)
# ---------------------------------------------------------------------------
our @urls;          # every node URL, in discovery order
our %urls;          # URL => 1 for everything already in @urls
our %reached;       # URL => 1 once ProcessURL has been entered for it
our %broken;        # URL => 1 when no connection could be opened
our %links;         # "from\0to" => number of times the link was seen
our %mapping;       # redirected URL => URL it redirects to
our %url2server;    # URL => first token of the Server: response header
our %url2format;    # URL => first token of the Content-Type: header
our %link2type;     # absolute link => tag it came from ("a" or "img")
our %node2index;    # node URL => index assigned by PrintFVL

# HTTP status codes that are treated as redirects.
my %is_redirect = map { $_ => 1 } (301, 302, 303, 307, 308);

# Matches <a ... href=VALUE> and <img ... src=VALUE>.  VALUE may be double
# quoted ($3), single quoted ($4) or bare ($5); the tag name is $1 or $2.
my $LINK_RE = qr{
    < \s*
    (?: (a)   \b [^>]*? \s href
      | (img) \b [^>]*? \s src )
    \s* = \s*
    (?: "([^"]*)" | '([^']*)' | ([^\s"'>]+) )
}xi;

# parse_url($url)
#   Splits "scheme://host[:port][path]" into (scheme, host, port, path).
#   The scheme is lower-cased, the port defaults to 80 and the path to "/".
#   Returns the empty list when $url does not have that shape.
sub parse_url {
    my ($url) = @_;
    return unless defined $url;

    my ($scheme, $host, $port, $path) =
        $url =~ m{\A([^:/]+)://([^:/?#]+)(?::(\d*))?(.*)\z}s;
    return unless defined $scheme;

    $port = 80 unless $port;                        # missing, empty or 0
    $path = "/" if $path eq "";                     # "http://host"
    $path = "/" . $path unless $path =~ m{\A/};     # "http://host?query"
    return (lc $scheme, $host, $port, $path);
}

# collapse_dots($path)
#   Removes "." and ".." segments (and empty segments) from an absolute
#   path.  Anything from the first "?" onward is left untouched.
sub collapse_dots {
    my ($path) = @_;

    my $query = "";
    $query = $1 if $path =~ s{([?].*)\z}{}s;

    # A path ending in "/", "/." or "/.." still names a directory.
    my $is_dir = $path =~ m{/(?:\.\.?)?\z} ? 1 : 0;

    my @kept;
    for my $segment (split m{/}, $path) {
        next if $segment eq "" || $segment eq ".";
        if ($segment eq "..") {
            pop @kept;          # popping an empty list is a harmless no-op
            next;
        }
        push @kept, $segment;
    }

    my $clean = "/" . join("/", @kept);
    $clean .= "/" if $is_dir && @kept;
    return $clean . $query;
}

# resolve_link($link, $url, $server, $port, $path)
#   Turns a link found in the document $url (whose parsed parts are
#   $server, $port and $path) into an absolute URL.  None of the arguments
#   is modified.
#     - "#fragment" is dropped; an empty link refers to $url itself
#     - links that already carry a scheme are kept as they are
#     - "//host/..." links get the "http:" scheme
#     - other links are resolved against $path's directory
#   As everywhere else in this script, a result ending in "/" gets
#   "index.html" appended.
sub resolve_link {
    my ($link, $url, $server, $port, $path) = @_;

    $link =~ s/#.*\z//s;
    return $url if $link eq "";

    my $absolute;
    if ($link =~ m{\A[A-Za-z][A-Za-z0-9+.\-]*:}) {
        $absolute = $link;
    }
    elsif ($link =~ m{\A//}) {
        $absolute = "http:" . $link;
    }
    else {
        my $target = $link;
        if ($target !~ m{\A/}) {
            # Directory of the current document: drop the query string,
            # then everything after the last "/".
            (my $dir = $path) =~ s{[?].*\z}{}s;
            $dir =~ s{[^/]*\z}{};
            $dir = "/" if $dir eq "";
            $target = $dir . $target;
        }
        my $origin = "http://" . $server . ($port == 80 ? "" : ":$port");
        $absolute = $origin . collapse_dots($target);
    }

    $absolute .= "index.html" if $absolute =~ m{/\z};
    return $absolute;
}

# extract_links($html)
#   Returns a list of [tag, value] pairs, one for every <a href> and
#   <img src> attribute found in $html; tag is "a" or "img".
sub extract_links {
    my ($html) = @_;

    my @found;
    while ($html =~ /$LINK_RE/g) {
        my $tag = lc(defined $1 ? $1 : $2);
        my ($value) = grep { defined } ($3, $4, $5);
        push @found, [$tag, $value];
    }
    return @found;
}

# register_node($url)
#   Adds $url to @urls/%urls unless it is already there.
sub register_node {
    my ($url) = @_;
    return if $urls{$url};
    push @urls, $url;
    $urls{$url} = 1;
    return;
}

# resolve_mapping($url)
#   Follows %mapping (redirects) from $url to the final target.  A URL is
#   never followed twice, so a redirect cycle cannot loop forever.
sub resolve_mapping {
    my ($url) = @_;
    my %seen;
    while ($mapping{$url} && !$seen{$url}++) {
        $url = $mapping{$url};
    }
    return $url;
}

# ProcessURL($url, $depth)
#   Fetches $url over HTTP, records it as a node, records every link found
#   in it, and recurses into links not seen before with $depth - 1.
#   With $depth == 0 the page is fetched and recorded but its links are
#   not read; with $depth < 0 nothing happens.
#
#   Redirect responses are stored in %mapping and the target is processed
#   with the same $depth.  The target is the Location: header if present,
#   otherwise the last <a href> in the response body.
#
#   Only "http" URLs are fetched; anything else is silently skipped.
sub ProcessURL {
    my ($url, $depth) = @_;

    $url .= "index.html" if $url =~ m{/$};
    return if $reached{$url} || $depth < 0;
    $reached{$url} = 1;

    my ($type, $server, $port, $path) = parse_url($url);
    return unless defined $type && $type eq "http";

    # Opens a socket to the server and sets SOCK to the handle
    unless (open_socket($port, $server)) {
        $broken{$url} = 1;
        return;
    }

    my $host_header = $port == 80 ? $server : "$server:$port";
    print SOCK "GET $path HTTP/1.0\r\n";
    print SOCK "Host: $host_header\r\n";
    print SOCK "User-Agent: Mozilla/3.01Gold (X11; U; SunOS 5.5.1 sun4u)\r\n";
    print SOCK "\r\n";

    # Response headers run up to the first blank line.
    my $status = 0;
    my $location;
    while (defined(my $line = <SOCK>)) {
        last if $line =~ /^\r?$/;
        if ($line =~ m{^HTTP/\S+\s+(\d+)}) {
            $status = $1;
        }
        elsif ($line =~ /^Server:\s*(\S+)/i) {
            $url2server{$url} = $1;
        }
        elsif ($line =~ /^Content-Type:\s*(\S+)/i) {
            $url2format{$url} = $1;
        }
        elsif ($line =~ /^Location:\s*(\S+)/i) {
            $location = $1;
        }
    }

    # Captures the redirects
    if ($is_redirect{$status}) {
        my $target = $location;
        unless (defined $target) {
            while (my $line = <SOCK>) {
                for my $found (extract_links($line)) {
                    $target = $found->[1] if $found->[0] eq "a";
                }
            }
        }
        # SOCK is a single global handle; release it before recursing.
        close SOCK;

        if (defined $target && length $target) {
            $target = resolve_link($target, $url, $server, $port, $path);
            if ($target ne $url) {      # a self-redirect must not hide the node
                $mapping{$url} = $target;
                ProcessURL($target, $depth);
                return;
            }
        }

        # No usable target: keep the URL as an ordinary leaf node.
        register_node($url);
        return;
    }

    register_node($url);

    if ($depth == 0) {
        close SOCK;
        return;
    }

    # Collect all links first; recursion re-uses SOCK, so it has to wait
    # until this document has been read completely.
    my @next_hop;
    while (my $line = <SOCK>) {
        for my $found (extract_links($line)) {
            my ($tag, $raw) = @$found;
            next unless length $raw;

            my $link = resolve_link($raw, $url, $server, $port, $path);
            $link2type{$link} = $tag;
            $links{"$url\0$link"}++;

            next if $urls{$link} || $reached{$link};
            push @urls,     $link;
            push @next_hop, $link;
            $urls{$link} = 1;
        }
    }
    close SOCK;

    foreach my $next (@next_hop) {
        ProcessURL($next, $depth - 1);
    }
    return;
}

# PrintFVL()
#   Prints the crawl result to STDOUT:
#       t <number of nodes>
#       T <number of links>
#       ? <node index> <url>[ r]          one line per node
#       l <link index> <from> <to>        one line per link
#   " r" marks the node of $start_file (after following redirects).
#   Redirected URLs are not nodes; links to or from them are attached to
#   the redirect target.  Links with an endpoint that is not a node are
#   omitted.  Fills %node2index as a side effect.
#   NOTE(review): presumably the input format of the Otter tool named in
#   $start_file -- confirm against that tool's documentation.
sub PrintFVL {
    my @nodes = grep { !$mapping{$_} } @urls;

    %node2index = ();
    my $node_index = 0;
    foreach my $node (@nodes) {
        $node2index{$node} = $node_index++;
    }

    # Resolve every link up front so that the "T" line is accurate.
    my @edges;
    foreach my $key (sort keys %links) {
        my ($from, $to) = split /\0/, $key, 2;
        next unless defined $from && defined $to;
        my $from_index = $node2index{ resolve_mapping($from) };
        my $to_index   = $node2index{ resolve_mapping($to) };
        next unless defined $from_index && defined $to_index;
        push @edges, [$from_index, $to_index];
    }

    # ProcessURL appends "index.html" to directory URLs; do the same here
    # so the root marker still matches.
    my $root = $start_file;
    $root .= "index.html" if $root =~ m{/$};
    $root = resolve_mapping($root);

    print STDOUT "t ", scalar(@nodes), "\n";
    print STDOUT "T ", scalar(@edges), "\n";

    foreach my $node (@nodes) {
        print STDOUT "? $node2index{$node} $node";
        print STDOUT " r" if $root eq $node;
        print STDOUT "\n";
    }

    my $link_index = 0;
    foreach my $edge (@edges) {
        my ($from, $to) = @$edge;
        print STDOUT "l $link_index $from $to\n";
        $link_index++;
    }
    return;
}

# open_socket($port, $hostname)
#   Opens a TCP connection to $hostname:$port on the global handle SOCK
#   and makes SOCK unbuffered.  $port may be a number or a service name.
#   Returns 1 on success and 0 on any failure (unknown service, host that
#   does not resolve, socket or connect error).
sub open_socket {
    my ($port, $hostname) = @_;

    return 0 unless defined $hostname && length $hostname;
    return 0 unless defined $port;

    if ($port =~ /\D/) {
        $port = getservbyname($port, 'tcp');
    }
    return 0 unless $port;

    my $iaddr = inet_aton($hostname);
    return 0 unless defined $iaddr;

    my $paddr = sockaddr_in($port, $iaddr);
    my $proto = getprotobyname('tcp');

    socket(SOCK, PF_INET, SOCK_STREAM, $proto) || return 0;
    unless (connect(SOCK, $paddr)) {
        close SOCK;
        return 0;
    }

    # Turn on autoflush for SOCK only, then restore the default handle.
    my $previous = select(SOCK);
    $| = 1;
    select($previous);

    return 1;
}

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
ProcessURL($start_file, $depth);
PrintFVL();