#!/usr/local/bin/perl
#
# goo8.pl
#   - get search engine keywords from httpd referer_log file.
#
# original script written by kazunori@kazu.nori.org.
# modified by Taro Nakano (taro@hauN.org).
#
# 2001/Nov/04  use Jcode.pm instead of jcode.pl in order to decode
#              f*cking UTF-8 string used by google.
#
# 2001/May/04  support asearch.cab.infoweb.ne.jp.
#              support google.co.jp.
#              add naver.co.jp.
#              add allabout.co.jp.
#
# 2001/Feb/03  some search engine added.
#              support "FI_*" marker of excite.
#
# 2001/Jan/02  the first version written.
#
#--------------------------------------------------------------
# Usage: goo8.pl < log_file
#        output code is EUC.
#
# NOTICE: You should keep kanji code of this script itself EUC.
#         (The output brackets are written as byte escapes below, so
#         only comments in this file still depend on its encoding.)
#
# We assume the format of input log_file like:
#
# b130222.ppp.dion.ne.jp - - [22/Dec/2000:22:01:13 +0900] "GET /~nakano/log/200002.html HTTP/1.1" 200 65536 "http://www.google.com/search?q=%8C%DC%8C%8E%82%DD%82%C7%82%E8%81@jpg&btnG=Google+%8C%9F%8D%F5&hl=ja&lr=" "Mozilla/4.0 (compatible; MSIE 5.0; Mac_PowerPC)"
#
# A referer field written as R="http://..." is accepted as well.
#
# so you should modify this script to be suitable for your
# log_file format.
#--------------------------------------------------------------

use strict;
use warnings;

# If you install Jcode.pm not to Perl system directory
# but to your home directory,
# you need to describe where you put Jcode.pm.
use lib '/home/taro/lib/perl5';

# You can get the latest package of Jcode.pm
# at http://openlab.ring.gr.jp/Jcode/index-j.html.
use Jcode;

# Package globals kept for backward compatibility: WhichSearchEngine()
# reads them when it is called without arguments.
our ($domain, $referer);

# Ordered table of known search engines.  Each entry is
#   [ engine name, pattern matched against the referer's host/path, rule ]
# where the rule is either the fixed name of the query parameter that
# carries the keywords, or a code ref called as $rule->($domain, $referer)
# that returns (marker, markerpattern).
#
# The patterns are deliberately unanchored (as in the original script) and
# the FIRST matching entry wins, so the order matters: e.g. hotbot.lycos.com
# must stay ahead of lycos.com.  Host names are matched case-insensitively.
my @SEARCH_ENGINES = (
    [ 'goo', qr/goo\.ne\.jp/i, 'MT' ],

    [ 'google', qr/google\.(?:com|co\.jp|de|it)/i,
      sub {
          my ($dom, $ref) = @_;
          return $ref =~ /as_q=/ ? 'as_q' : 'q';
      } ],

    [ 'excite', qr/excite\.(?:co\.jp|com)/i,
      sub {
          my ($dom, $ref) = @_;
          return ('search', '') if $ref =~ /search=/;
          return ('s',      '') if $ref =~ /[?&]s=/;
          # No fixed parameter name: keywords arrive in FI_<digit> keys.
          return ('', 'FI_[0-9]');
      } ],

    [ 'nifty', qr/nifty\.com/i, 'Text' ],

    [ 'infoweb', qr/infoweb\.(?:ne|or)\.jp/i,
      sub {
          my ($dom, $ref) = @_;
          # The parameter-name checks are case-sensitive on purpose:
          # "QueryString", "Querystring" and "QUERYSTRING" are distinct.
          return 'OLDQUERYDISPLAY'
              if $ref =~ /QUERYSTRING&OLDQUERYDISPLAY=/
              && $ref !~ /QueryString=/;
          return 'Querystring' if $ref =~ /Querystring=/;
          return 'url'         if $ref =~ /url=/;
          return 'q'           if $dom =~ /asearch\.cab\.infoweb\.ne\.jp/i;
          return 'QueryString';
      } ],

    [ 'yahoo', qr/yahoo\.(?:co\.jp|com)/i,
      sub {
          my ($dom, $ref) = @_;
          return $ref =~ /p2=/ ? 'p2' : 'p';
      } ],

    [ 'infoseek', qr/infoseek\.(?:co\.jp|com)/i,                 'qt' ],
    [ 'hotbot',   qr/hotbot\.lycos\.com/i,                       'MT' ],
    [ 'lycos',    qr/lycos\.(?:co\.jp|ne\.jp|com|de|fr|it)/i,    'q'  ],

    [ 'msn', qr/msn\.(?:co\.jp|com|co\.uk)/i,
      sub {
          my ($dom, $ref) = @_;
          return $ref =~ /[?&]q=/ ? 'q' : 'MT';
      } ],

    [ 'fresheye',     qr/fresheye\.com/i,           'kw'          ],
    [ 'altavista',    qr/altavista\.com/i,          'q'           ],
    [ 'odin',         qr/odin\.ingrid\.org/i,       'key'         ],
    [ 'biglobe',      qr/search\.biglobe\.ne\.jp/i, 'q'           ],
    [ 'kensaku',      qr/kensaku\.org/i,            'key'         ],
    [ 'bestoftheweb', qr/bestoftheweb\.com/i,       'q'           ],
    [ 'infofreako',   qr/infofreako\.com/i,         'string'      ],
    [ 'odn',          qr/search\.odn\.ne\.jp/i,     'QueryString' ],
    [ 'ocn',          qr/navi\.ocn\.ne\.jp/i,       'kw'          ],
    [ 'aol',          qr/aolsearch\.aol\.com/i,     'query'       ],
    [ 'isize',        qr/isize\.com/i,              'QueryString' ],
    [ 'mycom',        qr/mycom\.co\.jp/i,           'a1'          ],
    [ 'goto',         qr/goto\.com/i,               'Keywords'    ],
    [ 'alltheweb',    qr/alltheweb\.com/i,          'query'       ],
    [ 'coree',        qr/coree\.ne\.jp/i,           'KEYWORD'     ],
    [ 'iwon',         qr/iwon\.com/i,               'searchfor'   ],
    [ 'dogpile',      qr/dogpile\.com/i,            'q'           ],
    [ 'anzwers',      qr/anzwers\.com\.au/i,        'query'       ],
    [ 'naver',        qr/naver\.co\.jp/i,           'query'       ],
    [ 'allabout',     qr/allabout\.co\.jp/i,        'qs'          ],
    [ 'freebit',      qr/freebit\.net/i,            'QueryString' ],
    [ 'c4',           qr/c4\.com/i,                 'SearchText'  ],
);

# WhichSearchEngine([$domain [, $referer]])
#
# Identify the search engine a referer came from.
#
#   $domain  - host (plus any leading path) taken from the referer URL.
#              Defaults to the package global $domain.
#   $referer - the raw referer field from the log line; some engines use
#              it to decide which query parameter holds the keywords.
#              Defaults to the package global $referer.
#
# Returns the list (engine, marker, markerpattern):
#   engine        - short engine name, '' when the domain is not recognised
#   marker        - exact name of the query parameter holding the keywords,
#                   or '' when there is none
#   markerpattern - regex source matching keyword parameter names for
#                   engines without a fixed name (excite's FI_<digit>),
#                   otherwise ''
#
# Every element is always defined, so a result can never carry over a
# value from a previously processed log line.
sub WhichSearchEngine {
    my ($dom, $ref) = @_;
    $dom = $domain  unless defined $dom;
    $ref = $referer unless defined $ref;
    $dom = ''       unless defined $dom;
    $ref = ''       unless defined $ref;

    for my $entry (@SEARCH_ENGINES) {
        my ($engine, $domain_re, $rule) = @$entry;
        next unless $dom =~ $domain_re;

        my ($marker, $markerpattern)
            = ref($rule) ? $rule->($dom, $ref) : ($rule, '');
        $marker        = '' unless defined $marker;
        $markerpattern = '' unless defined $markerpattern;
        return ($engine, $marker, $markerpattern);
    }

    return ('', '', '');
}

# Main loop: one log line in, at most one "keywords (host; engine)" line out.
while (my $line = <>) {
    chomp $line;

    # Whitespace-split the line; with the format shown in the header the
    # remote host is field 0 and the quoted referer is field 10.
    my @fields     = split ' ', $line, 13;
    my $remotehost = $fields[0];
    $referer       = $fields[10];
    next unless defined $referer;    # blank or truncated line

    # Only referers that carry a query string are of interest.  The pattern
    # is unanchored, so both "http://..." and R="http://..." are accepted.
    # $1 = host (plus any leading path), $2 = last path component + query.
    next unless $referer =~ m{"https?://([^?]*\.[^?]*)/([^?]*\?.*)"};
    $domain = $1;
    my $request = $2;

    my ($engine, $marker, $markerpattern)
        = WhichSearchEngine($domain, $referer);

    # Everything after the FIRST '?' is the query string (a later '?'
    # inside a value must not truncate it).
    my $query = substr($request, index($request, '?') + 1);

    my @keywords;    # keyword strings, in the order they appear in the URL
    my %seen;        # de-duplication of @keywords

    for my $param (split /[&;]/, $query) {
        (my $pair = $param) =~ tr/+/ /;
        # Disabled in the original; the pattern bytes are not ASCII
        # (presumably a full-width space -- unverified):
        # $in[$i] =~ s/Ž¡Ž¡/ /g;
        my ($key, $val) = split /=/, $pair, 2;
        next unless defined $key && defined $val;

        # An empty marker or pattern must never match: in particular an
        # empty regex would make Perl silently reuse the last successfully
        # matched pattern instead of matching nothing.
        my $is_keyword_param
            =  ($marker ne ''        && $key eq $marker)
            || ($markerpattern ne '' && $key =~ /$markerpattern/);
        next unless $is_keyword_param;

        # Use while-loop for multi-encoding of query string.
        # Each pass shortens $val, so the loop always terminates.
        while ($val =~ /%[a-fA-F0-9]{2}/) {
            $val =~ s/%([a-fA-F0-9]{2})/pack('C', hex($1))/ge;
        }

        next if $val =~ /^\s*$/;
        push @keywords, $val unless $seen{$val}++;
    }

    my $words = join ' ', @keywords;
    Jcode::convert(\$words, 'euc');

    # \xA1\xD6 and \xA1\xD7 are the bytes of the EUC-JP corner brackets the
    # original literal held when the file is stored in EUC as the NOTICE
    # requires; spelling them out keeps the output independent of this
    # file's own encoding.
    print "\xA1\xD6${words}\xA1\xD7($remotehost; $engine)\n";
}