#
# extract-reviews.pl
# Dan Wallach <dwallach@cs.rice.edu>
#
# As input, this program reads a Berkeley mail style folder and then
# outputs a directory full of files, one per review, named first
# with the paper ID then the reviewer's e-mail address.  If there's
# more than one review for a given paper and reviewer, a later review
# either replaces the earlier file (when $overwrite is set) or gets a
# numeric suffix on the file name (e.g., 034-jms.txt, 034-jms-1.txt,
# and so forth).
#   
# There are a couple hacks of note in here.  For starters, you'll want
# to tweak the fixup variable to do something appropriate for you, probably
# tweaking all the referee e-mails to be the same as whatever names
# you used when you assigned the papers in the first place (see
# print-assignments.pl, below).  Also, I told my referees to send
# me reviews, one per mail message, without any MIME garbage, and
# with the subject line "UsenixSec2001: paper review", which I had
# rules to refile appropriately.  This is the file I fed into
# this program.  If you search for "XXX", you'll see a place where
# it checks for this subject line and deals with some other cruftiness.
# Tweak this line appropriately.
#   
# Note that the "Reviewer" line is ignored.  You may choose to do
# some fixup by hand here.  For subsequent passes that read these
# files, the file names are assumed to be the reviewer names that
# you really want.


# ---- run-time switches ----

# 1 => print every extracted review on stdout instead of creating files
$debug           = 0;

# 1 => a later review for the same paper and reviewer replaces the
#      earlier file
# 0 => every review gets its own numbered file, to be sorted out by hand
$overwrite       = 1;

# directory that receives the per-review files
$outputDirectory = "reviews";

# true while a "From " separator line is already sitting in $_ and must
# not be replaced by a fresh read
$fromJustSeen    = 0;

# Sender-name normalisation table.
#
# Key:   the sender's e-mail address with the "@" and everything after
#        it stripped off.
# Value: the handle for that person from committee.txt, used in the
#        output file name in place of the raw address.
#
# Reviewers who are not on the PC may be listed here as well.  A sender
# with no entry is simply left unchanged -- no warning is generated,
# which might be a bug.
%fixup = (
    pgut001  => 'gutmann',
    cme      => 'ellison',
    waa      => 'whalen',
    tara     => 'whalen',
    jtrostle => 'trostle',
    tlunt    => 'lunt',
    ian      => 'iang',
    daw      => 'wagner',
    vern     => 'paxson',
    antonye  => 'jaeger',
    pdmcdan  => 'mcdaniel',
);
	  
# Reviewers whose files are never replaced, even when $overwrite is 1:
# some people forward unsolicited reviews written by other folks, and
# this keeps those from nuking the reviewer's own file.  The extra
# files still have to be renamed by hand.
%dont_overwrite = map { ($_ => 1) } qw(smb paxson);

# Main extraction loop.
#
# Reads the mail folder supplied through <> one message at a time.  For
# each message it
#   1. scans the headers for the sender ($email) and for a subject that
#      marks the message as a review ($review),
#   2. skips ahead to the first line starting with "#",
#   3. collects the review header up to the ratings line, whose first
#      field is the paper number ($paperNumber),
#   4. collects the rest of the message body, and
#   5. writes the collected lines (@output) to
#      "$outputDirectory/$paperNumber-$email[-N].txt", or to stdout when
#      $debug is set.
#
# Every inner loop watches for a "From " separator line so that it can
# resynchronise on the next message; $fromJustSeen tells the top of the
# loop that such a line is already in $_.
#
# <> returns undef only once at end of input; a further read would start
# over on STDIN.  Each stage therefore stops the whole loop as soon as it
# runs out of input instead of reading again.
$fromJustSeen = 0;
topLevel: for(;;) {
    $_ = <> unless $fromJustSeen;

    # out of input while looking for the next message: we're done
    last topLevel unless defined $_;

    chomp;

    # anything before the first "From " separator is ignored
    next topLevel unless /^From /;

    # a new message starts here -- reset the per-message state
    $fromJustSeen = 0;
    $review = 0;
    @output = ();
    $email = "";
    $headersEnded = 0;

    # read the mail headers, up to the first blank line
    while(<>) {
        if(/^From /) {      # resynchronize
            $fromJustSeen = 1;
            next topLevel;
        }

        chomp;

        if(/^$/) {
            $headersEnded = 1;
            last;
        }

        if(/^From:/) {
            if(/<([^>]*)>/) {
                $_ = $1;
            } else {
                s/^From:\s*//;      # nuke leading whitespace
                s/\s*\([^()]*\)//;  # nuke "real name" field
            }
            # simplify the e-mail further by nuking everything
            # after the @ sign -- no uniqueness issues for my
            # PC but may be an issue for somebody else later
            s/@.*//;

            $email = $_;
            $email = $fixup{$email} if $fixup{$email};
        }

        # XXX: normal flag: I asked people to use this subject line
        # but Teresa Lunt used a different subject line so there's
        # a hack here to deal with it.
        $review = $review ||
            (/^Subject:/ && /UsenixSec2001: paper review/) ||
            ($email eq "lunt" && /review of paper/i);
    }

    if(! $email) {
        warn "E-mail found without From: line?";
    }

    if(!$headersEnded) {
        # the input ended in the middle of the headers, so there is
        # no body to look at
        warn "Skipping a review?  Bad, " if $review;
        last topLevel;
    }

    # if we get here, that means we've finished reading the
    # mail headers, $email and $review are defined as the
    # sender's (simplified) e-mail address and as a boolean
    # indicating whether we think the message body is a review
    # (but we're not 100% sure, yet).
    #
    # At this point, there may be some garbage, MIME headers,
    # or heaven knows what, then the review header, which is
    # left aligned and has various fields.  We'll read ahead
    # until we hit a comment line, as expected at the beginning
    # of a review.  This loop also has to deal with starting
    # over when it hits a new message or EOF.
    $hashSeen = 0;
    while(<>) {
        if(/^From /) {      # resynchronize
            warn "Skipping a review?" if $review;
            $fromJustSeen = 1;
            next topLevel;
        }

        chomp;

        # we break when we see a line beginning with a hash mark
        # which indicates this message is likely to be a review
        if(/^\#/) {
            $hashSeen = 1;
            last;
        }
    }
    if (!$hashSeen && $review) {
        warn "Skipping a review?  Bad, ";
    }

    last topLevel if !$hashSeen;  # eof

    push(@output, $_);
    $paperNumberSeen = 0;

    # okay, if we get here, that means we've seen something that
    # resembles a review.  Time to grab the review header and
    # the rankings.  We still have to worry about EOF and new
    # mail messages.
    while(<>) {
        if(/^From /) {      # resynchronize
            warn "Skipping a review?" if $review;
            $fromJustSeen = 1;
            next topLevel;
        }

        chomp;
        push(@output, $_);
        if(/^\#/) {
            next;
        } else {
            # the ratings line: paper number followed by five
            # numeric fields
            if(/^(\d+)\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+/) {
                $paperNumberSeen = 1;
                $paperNumber = $1;
                last;
            } else {
                # it's not a paper review -- bail!
                # except keep going if we saw the magic subject line
                next topLevel unless $review;
            }
        }
    }
    if(!$paperNumberSeen) {
        # The loop above can only finish without a paper number by
        # running out of input.  $paperNumber would be stale (or
        # undefined), so writing a file now could clobber some other
        # review; give up instead.
        warn "Suspicious.  Didn't get a paper number, ";
        last topLevel;
    }

    # if we get here, $paperNumber and $email are well-defined,
    # @output has some useful lines in it, and we've got a bunch
    # of review lines following
    while(<>) {
        chomp;
        if(/^From /) {
            $fromJustSeen=1;
            last;
        }
        push(@output, $_);
    }

    # $num is how many reviews we've already seen for this paper and
    # reviewer; $reviewsWritten counts distinct papers per reviewer
    $num = $fileNameUsed{$paperNumber,$email}++;
    $reviewsWritten{$email}++ if !$num;

    # NOTE(review): $email comes straight from the From: header, so an
    # address containing "/" would become part of the path -- confirm
    # that this is acceptable for your mail folder.
    my $did_overwrite = 0;
    if($num && (!$overwrite || $dont_overwrite{$email})) {
        $fileName = "$outputDirectory/$paperNumber-$email-$num.txt";
    } else {
        $did_overwrite = $num;
        $fileName = "$outputDirectory/$paperNumber-$email.txt";
    }
    if($debug) {
        open(OUT, ">&STDOUT") || die "Can't dup stdout: $!\n";
        printf OUT "=======================\nFile: %s\n=======================\n", $fileName;
    } else {
        # three-argument open: the name is taken literally, whatever
        # characters the sender's address contains
        open(OUT, ">", $fileName) || die "Can't write $fileName: $!\n";

        # print, not printf: the file name must not be interpreted
        # as a format string (addresses may contain "%")
        if($did_overwrite) {
            print STDOUT "==> Overwriting $fileName\n";
        } else {
            print STDOUT "Writing $fileName\n";
        }
    }

    foreach $i (@output) {
        printf OUT "%s\n", $i;
    }

    # buffered write errors only show up when the handle is closed
    close(OUT) || warn "Error closing output for $fileName: $!\n";

    # The body loop stopped either on the next message's "From " line,
    # in which case we go around again, or at EOF, in which case
    # we're done.
    last topLevel unless $fromJustSeen;
}

# Final report: one line per reviewer with the number of distinct
# papers for which a review file was written, busiest reviewer first.
print STDOUT "Papers submitted\n================\n";

my @reviewers = sort { $reviewsWritten{$b} <=> $reviewsWritten{$a} }
                keys %reviewsWritten;

for my $reviewer (@reviewers) {
    my $count = $reviewsWritten{$reviewer};
    print STDOUT sprintf("%10s: %d\n", $reviewer, $count);
}
