#!/usr/local/bin/perl
#
# lmc.pl --- Last-Modified Checker
# Taro Zzz
# taro@hauN.org
#
# 2001/Mar/20 The first version written.
# 2001/Jul/13 Implement timeout of w3m.
# 2004/Dec/10 Support RDF dc:date.
#             Use wget instead of w3m.
#
# Overview:
#   1. Read "URL timestamp bytes" records from the cache-in file.
#   2. For each URL, refresh the timestamp with one of three probes,
#      selected by the bytes field:
#        -1     read the Last-Modified header (wget --spider -S)
#        -2     read a <dc:date> element from an RDF feed
#        other  fetch the body and compare its size with the cached size
#   3. Print the sites, newest first, and write the refreshed cache files.
#
# NOTE(review): the copy this file was restored from had lost every piece
# of angle-bracketed text.  The file-handle reads and the opening <dc:date>
# tag of the regexes were restored from the surrounding code.  The printed
# strings are reproduced exactly as they survived; they presumably carried
# HTML markup (see $bgcolor below) -- confirm against the deployed version.
#
# NOTE(review): no "use utf8" on purpose; the Japanese literals are handed
# to jcode.pl, which presumably works on raw bytes -- confirm.

use strict;
use warnings;
use POSIX ();
use Time::Local;    # provides timegm/timelocal (timelocal.pl only wrapped it)
require "/home/taro/my/jcode.pl";

my $diarycache = "/home/taro/www/my/Diary.cache";    # for the diary circle
my $diarydat   = "/home/taro/www/my/Diary.dat";      # web site data
my $wgetpath   = "/usr/local/bin/wget";

### Files holding "URL timestamp bytes" records.
my $cacheinfile  = "/home/taro/my/diary.cachein";
my $cacheoutfile = "/home/taro/my/diary.cacheout";

my $getsourcemax = 100000;        # upper limit of bytes fetched by a GET
my $cooltime     = 7 * 86400;     # 7 days: section "cooled pathos"
my $coldtime     = 90 * 86400;    # 90 days: section "really cold pathos"
my $timeout      = 60;            # seconds allowed for one GET or HEAD

$| = 1;                           # do not buffer output

my %month = (
    Jan => 0, Feb => 1, Mar => 2, Apr => 3, May => 4,  Jun => 5,
    Jul => 6, Aug => 7, Sep => 8, Oct => 9, Nov => 10, Dec => 11,
);

my %dayofweek = (
    0 => '(日)', 1 => '(月)', 2 => '(火)', 3 => '(水)',
    4 => '(木)', 5 => '(金)', 6 => '(土)',
);

# jcode::convert rewrites a package variable in place through its
# typeglob, so conversions go through this one package-level buffer.
our $jis_buffer;

# Return $text converted to JIS by jcode.pl.
sub to_jis {
    my ($text) = @_;
    $jis_buffer = $text;
    jcode::convert(*jis_buffer, 'jis');
    return $jis_buffer;
}

# Convert broken-down time fields to an epoch value.
#   $converter  \&timegm or \&timelocal
#   @fields     (sec, min, hour, mday, month index, four-digit year)
# Returns the epoch, or undef when a field is missing, non-numeric or out
# of range.  A bad date therefore never aborts the survey.
sub to_epoch {
    my ($converter, @fields) = @_;
    return unless @fields;
    return if grep { !defined $_ || $_ !~ /\A\s*-?\d+\s*\z/ } @fields;
    my @numbers = map { $_ + 0 } @fields;
    my $epoch = eval { $converter->(@numbers) };
    return $epoch;
}

# Run wget without a shell and hand its output handle to $reader, bounded
# by $timeout seconds.
#   $url              only used in the failure message
#   $merge_stderr     true to read wget's stderr too (headers go there)
#   $wget_args        array ref of wget arguments
#   $timeout_message  text printed when the time limit is hit
#   $reader           code ref called with the read handle
# Returns 1 on success and 0 when wget could not be started or timed out.
# Any other exception raised by $reader is re-thrown after killing wget.
sub with_wget_output {
    my ($url, $merge_stderr, $wget_args, $timeout_message, $reader) = @_;

    # Forking open: the child's STDOUT becomes our read handle and the
    # returned pid is wget itself, so the kill below reaches it.
    my $pid = open(my $out, '-|');
    if (!defined $pid) {
        print "\n\nCan't open $url, ret = $!\n\n\n";
        return 0;
    }
    if ($pid == 0) {
        # Child.  Leave with _exit so nothing of the parent's runs twice.
        if ($merge_stderr) {
            open(STDERR, ">&STDOUT") or POSIX::_exit(126);
        }
        exec { $wgetpath } $wgetpath, @{$wget_args} or POSIX::_exit(127);
    }

    my $error = '';
    {
        local $SIG{ALRM} = sub { die "timeout\n" };
        my $finished = eval {
            alarm($timeout);
            $reader->($out);
            alarm(0);
            1;
        };
        $error = ($@ || 'unknown error') unless $finished;
        alarm(0);
    }

    my $succeeded = 1;
    if ($error ne '') {
        kill 9 => $pid;
        die $error unless $error =~ /timeout/;
        print $timeout_message;
        $succeeded = 0;
    }

    # Kill the process if it is still alive, then reap it.
    kill 9 => $pid if kill 0 => $pid;
    close $out;
    return $succeeded;
}

# Probe used when bytes == -1: take Last-Modified from the response headers.
# Returns (timestamp, openfailed flag).
sub survey_http_header {
    my ($url, $lastmod) = @_;
    my @stamp;
    my $ok = with_wget_output(
        $url, 1, ['-O', '-', '-S', '--spider', $url],
        "\n\ntimeout of HEAD request: $url\n\n\n",
        sub {
            my ($out) = @_;
            while (my $line = <$out>) {
                next unless $line =~ /.+Last\-[Mm]odified\: (...), (..) (...) (....) (..):(..):(..) GMT/;
                @stamp = ($7, $6, $5, $2, $month{$3}, $4);
                last;
            }
        },
    );
    # Converted outside the time-limited section so that a bad date cannot
    # be confused with, or swallow, the timeout exception.
    my $found = to_epoch(\&timegm, @stamp);
    $lastmod = $found if defined $found;
    return ($lastmod, $ok ? 0 : 1);
}

# Probe used when bytes == -2: take the first <dc:date> of an RDF feed.
# Returns (timestamp, openfailed flag).
# NOTE(review): the UTC offset in dc:date is matched but ignored; the
# value is read as local time, as the original did.  Confirm the feeds
# use the server's time zone.
sub survey_rdf_date {
    my ($url, $lastmod) = @_;
    my @stamp;
    my $ok = with_wget_output(
        $url, 0, [$url, '-O', '-', '-q'],
        "\n\ntimeout of GET request: $url\n\n\n",
        sub {
            my ($out) = @_;
            while (my $line = <$out>) {
                if ($line =~ /\<dc\:date\>(....)\-(..)\-(..)T(..)\:(..)\:(..)[+-](..)\:(..)\<\/dc\:date\>/) {
                    @stamp = ($6, $5, $4, $3, $2 - 1, $1);
                }
                elsif ($line =~ /\<dc\:date\>(....)\-(..)\-(..)T(..)\:(..)[+-](..)\:(..)\<\/dc\:date\>/) {
                    @stamp = (0, $5, $4, $3, $2 - 1, $1);
                }
                elsif ($line =~ /\<dc\:date\>(....)\-(..)\-(..)\<\/dc\:date\>/) {
                    @stamp = (0, 0, 0, $3, $2 - 1, $1);
                }
                last if @stamp;
            }
        },
    );
    my $found = to_epoch(\&timelocal, @stamp);
    $lastmod = $found if defined $found;
    return ($lastmod, $ok ? 0 : 1);
}

# Probe for sites without a usable Last-Modified: fetch up to
# $getsourcemax bytes and treat a changed size as "modified now".
# Returns (timestamp, bytes, openfailed flag).
sub survey_body_size {
    my ($url, $lastmod, $bytes, $nowtime) = @_;
    my $newbytes = 0;
    my $ok = with_wget_output(
        $url, 0, ['-O', '-', '-q', $url],
        "\n\ntimeout of GET request: $url.\n\n\n",
        sub {
            my ($out) = @_;
            my $getsource;
            $newbytes = read($out, $getsource, $getsourcemax);
        },
    );
    $newbytes = 0 unless defined $newbytes;    # read error
    if ($newbytes != 0 && $newbytes != $bytes) {    # the size has changed
        $bytes   = $newbytes;
        $lastmod = $nowtime;
    }
    return ($lastmod, $bytes, $ok ? 0 : 1);
}

### Table title strings.
my $koushin    = to_jis("更新改竄時刻");
my $hotpathos  = to_jis("熱いパトス");
my $yabai      = to_jis("やばいライン");
my $coolpathos = to_jis("冷めたパトス");
my $majiyabai  = to_jis("マジでやばいライン");
my $coldpathos = to_jis("ほんまに冷めたパトス");

### Current time.
my $nowtime  = time;
my $localnow = localtime($nowtime);

### Survey every site listed in the cache-in file.
open(my $cachein, '<', $cacheinfile) or die "can't open $cacheinfile\n";

my @records;
while (my $line = <$cachein>) {
    chomp $line;
    my ($url, $lastmod, $bytes) = split(' ', $line, 3);
    next unless defined $url;    # blank line
    $lastmod = 0 unless defined $lastmod;
    $bytes   = 0 unless defined $bytes;
    $bytes =~ s/\s+\z//;

    my $openfailed;
    if ($bytes == -1) {
        ($lastmod, $openfailed) = survey_http_header($url, $lastmod);
    }
    elsif ($bytes == -2) {
        ($lastmod, $openfailed) = survey_rdf_date($url, $lastmod);
    }
    else {
        ($lastmod, $bytes, $openfailed)
            = survey_body_size($url, $lastmod, $bytes, $nowtime);
    }

    push @records, {
        lastmod    => $lastmod,
        url        => $url,
        bytes      => $bytes,
        openfailed => $openfailed,
    };
}
close $cachein;

### Time spent on the survey.
my $elapsetime = time - $nowtime;

### Read diary.dat into a "site URL -> 'name anchor'" hash.
### Entries are paragraphs of three lines: site, author, anchor.
my %sitehash;
if (open(my $dat, '<', $diarydat)) {
    local $/ = "";    # paragraph mode, restored at the end of this block
    while (my $entry = <$dat>) {
        chomp $entry;    # in paragraph mode this drops all trailing newlines
        my ($site, $author, $anchor) = split(/\n/, $entry, 3);
        next unless defined $site;
        $author = '' unless defined $author;
        $anchor = '' unless defined $anchor;
        $sitehash{$site} = "$author" . " $anchor";
    }
    close $dat;
}
else {
    print "can't open $diarydat\n";
}

### Newest first.
my @sorted = sort { $b->{lastmod} <=> $a->{lastmod} } @records;

### Output.
my $sites      = @sorted;    # number of sites
my $nowtimestr = to_jis("このページの Last-Modified: $localnow");
print "\n\n$nowtimestr\nElapsed time for survey: $elapsetime sec.\nNumber of sites: $sites\n\n\n";
print "\n";
print "$koushin$hotpathos\n";

my ($cacheout, $diarycacheout);
my $cacheout_ok = 0;
if (open($cacheout, '>', $cacheoutfile)) {
    $cacheout_ok = 1;
}
else {
    print "can't open $cacheoutfile\n";
    undef $cacheout;
}
unless (open($diarycacheout, '>', $diarycache)) {
    print "can't open $diarycache\n";
    undef $diarycacheout;
}

my $lmcl_exact  = to_jis(" ");     # timestamp reported by the site
my $lmcl_approx = to_jis("頃");    # timestamp is only when we saw a change

my $pathosflag = 0;    # 0: hot section, 1: cool divider printed, 2: cold
for my $record (@sorted) {
    my ($lastmod, $url, $bytes, $openfailed)
        = @{$record}{qw(lastmod url bytes openfailed)};
    my $pasttime = $nowtime - $lastmod;

    if ($pasttime > $cooltime && $pasttime < $coldtime && $pathosflag == 0) {
        print "$yabai$coolpathos";
        $pathosflag = 1;
    }
    elsif ($pasttime >= $coldtime && $pathosflag < 2) {
        # "< 2" rather than "== 1": the cold divider must also appear
        # when no site fell into the cool range.
        print "$majiyabai$coldpathos";
        $pathosflag = 2;
    }

    # New data for both cache files.
    if ($cacheout) {
        print {$cacheout} "$url $lastmod $bytes\n" or $cacheout_ok = 0;
    }
    print {$diarycacheout} "$url $lastmod\n" if $diarycacheout;

    my $lmcl = ($bytes == -1 || $bytes == -2) ? $lmcl_exact : $lmcl_approx;

    # NOTE(review): not referenced by any surviving output string;
    # presumably the row markup used it to shade failed sites -- confirm.
    my $bgcolor = $openfailed == 1 ? "#666666" : "#333333";

    next unless exists $sitehash{$url};    # only sites listed in diary.dat

    my ($sec, $min, $hour, $day, $mon, $year, $week) = localtime($lastmod);
    my $lastmodstr = to_jis(
        sprintf("%4d%s%2d%s%2d%s%s%2d%s%2d%s%2d%s",
            $year + 1900, '年', $mon + 1, '月', $day, '日',
            $dayofweek{$week}, $hour, '時', $min, '分', $sec, '秒')
    );
    print "$lastmodstr$lmcl$sitehash{$url}\n";
}
print "\n";

if ($cacheout) {
    close($cacheout) or $cacheout_ok = 0;
}
if ($diarycacheout) {
    close($diarycacheout) or warn "can't write $diarycache: $!\n";
}

### Make the fresh cache the next run's input.  Skipped unless it was
### written completely, so the old cache is never replaced by a stale or
### truncated file.
if ($cacheout_ok) {
    rename($cacheoutfile, $cacheinfile)
        or warn "can't rename $cacheoutfile to $cacheinfile: $!\n";
}
else {
    warn "$cacheoutfile was not written; keeping $cacheinfile\n";
}

# Kept from the original; every child has already been reaped by close.
$SIG{CHLD} = 'IGNORE';