搜索北大未名全站

曾经 N 兄想知道某人(这事王晨, 沈建泽, 王彬都知道)的信息, 托我做一个搜索未名全站的程序. 我用SHELL/Perl/Java分别写个—给他的是Java版本. 我现在只剩下Perl版还在.

后来当作练习, 用Perl写了个多线程的, 第二天, 我的 IP 就被禁了, 我还以为未名挂了.

前几天, 还有不认识的人发邮件给我说, 这个有BUG, 让修正, 结果只是线程数设置过高.

现在, 未名有 Google 全站搜索, 这个也没什么用了, 放出来做为一个纪念.

#!/usr/bin/perl -w

# Copyright (C) 2007 Changsheng Jiang
# Time-stamp: <Changsheng Jiang 2008-06-11 01:07:24>
# Author: Changsheng Jiang<jiangzuoyan@gmail.com>
# Date: 2007/10/18
# Version: 2007/10/18

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.

# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.

# search articles by author or title from bdwm, use the original www
# interface.

# any change, please do change the email in the help message!


use strict;
use IO::Handle;
use Text::Iconv;
use LWP::UserAgent;
use threads;
use threads::shared;
use Getopt::Long;
use URI;

my $author:shared="";
my $title:shared="";
my $otitle:shared="";
my $savefile:shared="";
my $verbose:shared = 4;
my $savefilelock:shared = 0;
my @bbsboards:shared;
my $searchtype:shared="";
my $help=0;
my $nths=1;

sbbs_help(1) unless GetOptions(
   "help|h"=>\$help,
   "verbose=i"=>\$verbose,
   "savefile=s"=>\$savefile,
   "author=s"=>\$author,
   "title=s"=>\$title,
   "otitle=s"=>\$otitle,
   "nths=i" =>\$nths
   );

sub sbbs_help
{
   print STDERR "searchbbs options, ...\n",
   " search articles by author or title at bdwm\n",
   "options:\n",
   " --author author: name to search\n",
   " --title  title: search by title\n",
   " --otitle otitle: search use go=O\n",
   " --savefile filename: save file name\n",
   " --nths num: # of threads, 1-40, default $nths\n",
   "No warranty!\n",
   "Contact: Sheos <jiangzuoyan\@gmail.com>, -:)\n";
   exit $_[0] if $_[0] > 0;
}

if ($help) {
   sbbs_help(1);
}

if(length($savefile)){
   open(SAVEFILE, ">".$savefile) or die "cann't open file $savefile\n";
   select(SAVEFILE);
   SAVEFILE->autoflush(1);
}

# bs_content(board, conent)
sub bs_content
{
   my $board = shift;
   my $cont = shift;
   if ($cont =~ m{(<tr><td class=body1>.*</a>)</table>}s) {
       my $cnt = $1;
       $cnt =~ s/[\t\r]//sg;
       $cnt =~ s/\n\n+/\n/sg;
       print STDERR "==found $board:", length($cnt), "\n" unless $verbose < 2;
       {
           lock($savefilelock);
           print "<tr bgcolor=lightgreen><td colspan=5>Board ",
           "<a href=\"http://bbs.pku.edu.cn/bbs/bbsdoc.php?board=$board\">$board</a>\n",
           $cnt,
           "\n";
       }
   }
}

sub getsearch
{
   my $c = shift;
   my $board;
   my $ua = LWP::UserAgent->new;
   my $url = URI->new("http://162.105.204.150/bbs/bbssearch.php");
   my $go =  $searchtype eq 'author' ? 'W' :
       $searchtype eq 'title' ? 'T' : 'O';
   my $to = "$author$title$otitle";
   while (1) {
       {
           lock(@bbsboards);
           $board = shift @bbsboards;
       }
       return unless $board;
       $url->query_form('board'=>$board, "go"=>$go, "to"=>$to);
       my $req=HTTP::Request->new('GET', $url);
       my $res=$ua->request($req);
       bs_content($board, $res->content) if $res->is_success;
       # select(undef, undef, undef, 0.001);# sleep
       print STDERR "[$c]$board\n" unless $verbose < 4;
   }
 
}

sub get_boards
{
   my $ua = LWP::UserAgent->new;
   my $req = HTTP::Request->new(GET => "http://162.105.204.150/bbs/bbsall.php");
   my $res = $ua->request($req);
   die $res->status_line unless $res->is_success;
   @bbsboards = $res->content=~ m{href="/bbs/bbsdoc.php\?board=([^"]+)"}g;
   @bbsboards = sort(@bbsboards);
}

## main
$searchtype = 'author' if length($author);
$searchtype = 'title' if length($title);
$searchtype = 'otitle' if length($otitle);
my $LANG=$ENV{'LANG'};
$LANG =~ s/^.*\.//;
if ($LANG) {
   my $cnv = Text::Iconv->new($LANG, "gbk");
   $title = $cnv->convert($title);
   $otitle = $cnv->convert($otitle);
}
sbbs_help(1) unless $searchtype;
print <<EOF;
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=gbk">
<title>Search all $searchtype:$author$title$otitle</title>
<base href="http://bbs.pku.edu.cn/" />
<link rel="stylesheet" type="text/css" href="/bbstyle.php">
</head><body bgproperties="fixed">
<center>All $searchtype:$author$title$otitle<hr />
<table class=body>
<tr><th class=body>\#<th class=body>S<th class=body>Author<th class=body>Date<th class=body>Title
EOF
get_boards;
my @sthrs;
$nths = 1 if $nths < 1;
$nths = 40 if $nths > 40;
for (my $i=0; $i<$nths; ++$i) {
   push @sthrs, threads->new(\&getsearch, $i);
}
foreach my $thr(@sthrs) {
   $thr->join;
}
print <<EOF;
</table>
</center><hr />style, :-)
</body></html>
EOF

No comments: