#!/usr/bin/perl -w

use strict;


# Author: Mihai Surdeanu (msurdeanu@email.arizona.edu) 
# (based on the check_kbp_slot-filling.pl script, created by Hoa Dang)
# 
# Date: August 20, 2013
# Version 1.1
# Now stripping '\r' correctly from submission files
#
# Date: July 11, 2013
# Version 1.0
# Initial version

# Check a TAC 2013 KBP Track temporal-slot-filling task submission for various common errors, including:
#		* invalid run tag (should be a concatenation of your TAC 2013 team ID and the run number (1-5))
#		* multiple run tags
#		* invalid query id or slot name
#		* missing value for a query (at least one response required per query)
#		* answer from an invalid document
#		* offsets (Columns 6,7,8, 10) that appear to be invalid
#		* invalidly formatted dates
# Messages regarding submission are printed to an error log

# Usage is:
#    check_kbp_temporal-slot-filling.pl doclist_file queries_file results_file
#  where results_file is the name of the results submission file to be checked

# The doclist file lists all the valid document IDS for the task, one ID per line
#
# The queries file is a tab-separated file with the following columns:
#	1: query id
#	2: slot name
#	3: entity name
#	4: doc supporting the (entity, filler, slot name) relation
#	5: filler 
#	6: filler offsets
#	7: entity offsets
#	8: justification offsets
#	9: confidence score (always 1.0)
#  10: entity KB id
#  11: filler KB id
#
# The results file has at least one response per query.
#
# A response is a line with the following tab-separated columns:
# Column 1: query id
# Column 2: slot name
# Column 3: a unique run id for the submission
# Column 4: NIL, if the system believes no temporal information is learnable 
#   for this slot; or one of T1, T2, T3, or T4.
# Column 5: a document id
# Column 6: start-end offsets for representative mentions used to extract/normalize filler
# Column 7: start-end offsets for representative mentions used to extract/normalize query entity
# Column 8: start-end offsets of clause(s)/sentence(s) in justification
# Column 9: a normalized date
# Column 10: start-end offsets for provenance of the temporal information
#
# If Column 4 is NIL, then Columns 5-10 must be empty.
# The date in Column 9 must be formatted as YYYY-MM-DD.
#   "X"s instead of digits are accepted if the full date is not specified
#   in the source document.


# Directory in which the error log is created; edit this value to move the log.
my $errlog_dir = ".";

# Error budget: once the running error count exceeds this value, error()
# writes a final message to the log and terminates the run.
my $MAX_ERRORS = 25;

# Query id => slot name, filled in while reading the queries file.
my %queries = ();

# Slot Filler Type as defined in LDC publication "TAC KBP Slots"; Version 2.3; June 11, 2012
# Slot names as defined in "Proposed Task Description for Knowledge-Base Population at TAC 2013 Version 1.1 of May 24th, 2013"
#
# Layout: entity type ('PER' / 'ORG') => slot name => { quantity, content }
#   quantity is 'single' or 'list'
#   content  is 'name', 'value' or 'string'
my %slots = (
    'PER' => {
        # name-valued slots
        'per:alternate_names'                => { quantity => 'list',   content => 'name' },
        'per:children'                       => { quantity => 'list',   content => 'name' },
        'per:cities_of_residence'            => { quantity => 'list',   content => 'name' },
        'per:city_of_birth'                  => { quantity => 'single', content => 'name' },
        'per:city_of_death'                  => { quantity => 'single', content => 'name' },
        'per:countries_of_residence'         => { quantity => 'list',   content => 'name' },
        'per:country_of_birth'               => { quantity => 'single', content => 'name' },
        'per:country_of_death'               => { quantity => 'single', content => 'name' },
        'per:employee_or_member_of'          => { quantity => 'list',   content => 'name' },
        'per:origin'                         => { quantity => 'list',   content => 'name' },
        'per:other_family'                   => { quantity => 'list',   content => 'name' },
        'per:parents'                        => { quantity => 'list',   content => 'name' },
        'per:schools_attended'               => { quantity => 'list',   content => 'name' },
        'per:siblings'                       => { quantity => 'list',   content => 'name' },
        'per:spouse'                         => { quantity => 'list',   content => 'name' },
        'per:stateorprovince_of_birth'       => { quantity => 'single', content => 'name' },
        'per:stateorprovince_of_death'       => { quantity => 'single', content => 'name' },
        'per:statesorprovinces_of_residence' => { quantity => 'list',   content => 'name' },
        # value-valued slots
        'per:age'                            => { quantity => 'single', content => 'value' },
        'per:date_of_birth'                  => { quantity => 'single', content => 'value' },
        'per:date_of_death'                  => { quantity => 'single', content => 'value' },
        # string-valued slots
        'per:cause_of_death'                 => { quantity => 'single', content => 'string' },
        'per:charges'                        => { quantity => 'list',   content => 'string' },
        'per:religion'                       => { quantity => 'single', content => 'string' },
        'per:title'                          => { quantity => 'list',   content => 'string' },
    },

    'ORG' => {
        # name-valued slots
        'org:alternate_names'                 => { quantity => 'list',   content => 'name' },
        'org:city_of_headquarters'            => { quantity => 'single', content => 'name' },
        'org:country_of_headquarters'         => { quantity => 'single', content => 'name' },
        'org:founded_by'                      => { quantity => 'list',   content => 'name' },
        'org:member_of'                       => { quantity => 'list',   content => 'name' },
        'org:members'                         => { quantity => 'list',   content => 'name' },
        'org:parents'                         => { quantity => 'list',   content => 'name' },
        'org:political_religious_affiliation' => { quantity => 'list',   content => 'name' },
        'org:shareholders'                    => { quantity => 'list',   content => 'name' },
        'org:stateorprovince_of_headquarters' => { quantity => 'single', content => 'name' },
        'org:subsidiaries'                    => { quantity => 'list',   content => 'name' },
        'org:top_members_employees'           => { quantity => 'list',   content => 'name' },
        # value-valued slots
        'org:date_dissolved'                  => { quantity => 'single', content => 'value' },
        'org:date_founded'                    => { quantity => 'single', content => 'value' },
        'org:number_of_employees_members'     => { quantity => 'single', content => 'value' },
        # string-valued slots
        'org:website'                         => { quantity => 'single', content => 'string' },
    },
);


# File-level state shared by the main script and the reporting subs below.
my $doclist_file;               # list of valid docids
my $queries_file;               # list of eval queries
my $results_file;				# submission file to check/validate
my %docids;                     # list of valid docids
my %qids = ();					# query id => column-4 value => number of responses seen
my %nils;                       # number of NIL answers returned for question
my ($errlog,$num_errors,$line_num,$num_warnings);
my ($run_id, $tag);
my ($q, $slot_name, $docid, $otype, $entity_name, $answer, $fillerOffsets, $entityOffsets, $justOffsets, $dateSubmitted, $dateOffsets, $conf, $entity_kbid, $filler_kbid);
my ($i, $last_i, $line);        # scratch variables

# Exactly three command-line arguments are required.
if (@ARGV != 3) {
    print STDERR "Usage: $0 doclist_file query_file resultsfile\n";
    die "\n";
}

($doclist_file, $queries_file, $results_file) = @ARGV;
$num_errors = 0;
$num_warnings = 0;

# set up output files
# The error log is named after the results file with any directory part
# removed.  rindex returns -1 when the path contains no "/", so adding 1
# makes substr start at offset 0 and keep the whole name.
$errlog = $errlog_dir . "/" . substr($results_file, rindex($results_file, "/") + 1) . ".errlog";

# Low-precedence `or` applies to the result of open itself.  With `||` the
# die was attached to the (always true) file-name string and could never
# fire, so a failed open went unnoticed.
open(ERRLOG, '>', $errlog) or
	die "Cannot open error log $errlog for writing: $!\n";


# read in doclist file: one valid document ID per line
# Three-argument open keeps characters in the file name from being taken as
# an open mode, and low-precedence `or` makes the die reachable when the
# open fails (with `||` it was bound to the file-name string instead).
open(my $doclist_fh, '<', $doclist_file) or
    die "Unable to open document list file $doclist_file: $!";
while ($line = <$doclist_fh>) {
    chomp $line;
    $line =~ s/\r+//g;              # tolerate CRLF line endings
    # A blank line must not register the empty string as a valid docid.
    next if ($line =~ /^\s*$/);
    $docids{$line} = 1;
}
close($doclist_fh);

# read in queries file
# only do limited error checking: the sole check is for duplicate query ids.
# Each non-blank line is split on tabs into at most 11 columns; only the
# query id (column 1) and slot name (column 2) are stored, in %queries.
# Low-precedence `or` makes the die reachable when the open fails (with `||`
# it was bound to the file-name string and never evaluated).
open(my $queries_fh, '<', $queries_file) or
    die "Unable to open queries file $queries_file: $!";
$line_num = 0;
while ($line = <$queries_fh>) {
    chomp $line;
    $line =~ s/\r+//g;              # tolerate CRLF line endings
    $line_num++;
    next if ($line =~ /^\s*$/);

    # The list assignment overwrites every variable on each line; columns
    # that are absent from the line come back as undef.
    ($q, $slot_name, $entity_name, $docid, $answer, $fillerOffsets, $entityOffsets, $justOffsets, $conf, $entity_kbid, $filler_kbid) = split /\t/, $line, 11;

    if (exists $queries{$q}) {
        &error("duplicate query ids in queries file");
    } else {
        $queries{$q} = $slot_name;
    }
}
close($queries_fh);
# while (my ($k,$v)=each %queries){print "$k $v\n"}


# process the submission file; check for all errors
# Every non-blank line is split into the 10 tab-separated response columns
# and validated step by step.  The first failed check logs an error and
# abandons the line with `next`, so at most one error is logged per line.
# Low-precedence `or` makes the die reachable when the open fails (with `||`
# it was bound to the file-name string and never evaluated).
open(my $results_fh, '<', $results_file) or
    die "Unable to open results file $results_file: $!";
$line_num = 0;
$run_id = "";
while ($line = <$results_fh>) {
    chomp $line;
    $line =~ s/[\r]+//g;
    $line_num++;

    next if ($line =~ /^\s*$/);

    # The list assignment overwrites every variable on each line; columns
    # that are absent from the line come back as undef.
    ($q, $slot_name, $tag, $otype, $docid, $fillerOffsets, $entityOffsets, $justOffsets, $dateSubmitted, $dateOffsets) = split /\t/, $line, 10;

    # Columns 1-4 are required on every line, NIL or not.
    if (!defined $otype || length($otype) == 0 || !defined $tag || !defined $slot_name) {
        &error("Wrong number of fields -- missing fields");
        next;
    }

    # make sure runtag is ok
    if (! $run_id) {    # very first line --- remember tag
        $run_id = $tag;
        if ($run_id !~ /^[A-Za-z0-9._]{1,12}[1-5]$/) {
            &error("Run tag `$run_id' is malformed)");
            next;
        }
    }
    else {              # otherwise just make sure one tag used
        if ($tag ne $run_id) {
            &error("Run tag inconsistent (`$tag' and `$run_id')");
            next;
        }
    }

    # make sure query id exists in input
    if (!defined $queries{$q}) {
        &error("Invalid query id ($q)");
        next;
    }
    # make sure slot name is the same as the input
    if ($queries{$q} ne $slot_name) {
        &error("Slot name $slot_name different than slot name in input query");
        next;
    }
    # make sure column 4 has a valid value
    if ($otype ne "NIL" and $otype ne "T1" and $otype ne "T2" and $otype ne "T3" and $otype ne "T4") {
        &error("Invalid value $otype in column 4. Should be one of: NIL, T1, T2, T3, T4");
        next;
    }

    # Count the response now, so that a line rejected by one of the checks
    # below still counts as a response for this query and column-4 value.
    $qids{$q}{$otype}++;

    if ($otype ne "NIL") {
        # split fills columns left to right, so a defined column 10 means
        # columns 5-9 are defined as well (possibly as empty strings).
        if (!defined($dateOffsets)) {
            &error("Wrong number of fields -- missing fields");
            next;
        }

        # make sure docid valid
        if ($docid =~ /(\.sgm)/) {
            &error("Unknown document ID `$docid' (looks like a file name instead of a document ID)");
            next;
        } elsif (!$docids{$docid}) {
            &error("Unknown document `$docid'");
            next;
        }

        # make sure answer exists
        if (!defined $dateSubmitted || length($dateSubmitted) == 0) {
            &error("Missing date string for query $q");
            next;
        }
        # make sure answer is a valid date (YYYY-MM-DD, "X" allowed for unknown digits)
        if ($dateSubmitted !~ /^[012Xx][\dXx][\dXx][\dXx]\-[01Xx][\dXx]\-[0123Xx][\dXx]$/) {
            &error("Invalid date $dateSubmitted");
            next;
        }

        # check filler, entity, justification, and date offsets
        # The justification offsets are validated first because the two
        # inclusion checks below compare against them.
        my $checkMsg = &checkOffsets($justOffsets, 2);
        if (length($checkMsg) > 0) {
            &error("invalid justification offsets: `$justOffsets' ($checkMsg)");
            next;
        }

        $checkMsg = &checkOffsets($fillerOffsets, 2);
        if (length($checkMsg) > 0) {
            &error("invalid filler offsets: `$fillerOffsets' ($checkMsg)");
            next;
        }
        # The leading & is required here: a plain warn(...) would call
        # Perl's builtin warn instead of the sub defined in this file.
        if (&offsetInclusion($justOffsets, $fillerOffsets) == 0) {
            &warn("Filler offsets $fillerOffsets not included in justification offsets $justOffsets. This is not necessarily an error but it is not common.");
        }

        $checkMsg = &checkOffsets($entityOffsets, 2);
        if (length($checkMsg) > 0) {
            &error("invalid entity offsets: `$entityOffsets' ($checkMsg)");
            next;
        }
        if (&offsetInclusion($justOffsets, $entityOffsets) == 0) {
            &warn("Entity offsets $entityOffsets not included in justification offsets $justOffsets. This is not necessarily an error but it is not common.");
        }

        $checkMsg = &checkOffsets($dateOffsets, 2);
        if (length($checkMsg) > 0) {
            &error("invalid date offsets: `$dateOffsets' ($checkMsg)");
            next;
        }

    } else {
        # For a NIL response, Columns 5-10 must all be empty.  Testing only
        # the last column would let a NIL line carrying a docid, offsets or
        # a date slip through, and would also miss a column 10 of "0".
        my @filled = grep { defined($_) && length($_) > 0 }
            ($docid, $fillerOffsets, $entityOffsets, $justOffsets, $dateSubmitted, $dateOffsets);
        if (@filled) {
            &error("Columns 5-10 must be empty when Column 4 value is NIL");
            next;
        }
    }
}
close($results_fh);

# Do global checks:
# - At least one answer for each query
# - Can't have duplicated answers for a query and a given value in column 4
# Queries are visited in sorted order so that the log contents (and the set
# of errors reported before the error budget runs out) are reproducible.
my @otypes = ("NIL", "T1", "T2", "T3", "T4");
foreach my $query_id (sort keys %queries) {
    # Use an empty hash for queries without responses, so the lookups
    # below do not create entries in %qids.
    my $counts = $qids{$query_id} || {};

    my $answered = grep { defined($counts->{$_}) } @otypes;
    if ($answered == 0) {
        &error("No response given for query $query_id");
    }
    foreach my $value (@otypes) {
        if (defined($counts->{$value}) and $counts->{$value} > 1) {
            &error("Multiple answers given for query $query_id and value $value");
        }
    }
}

print ERRLOG "Finished processing $results_file\n";
# Low-precedence `or` so that a failed close is actually reported; with `||`
# the die was attached to the handle name and never evaluated.
close(ERRLOG) or die "Close failed for error log $errlog: $!\n";

# Exit status: 255 when at least one error was logged, 0 otherwise.
if ($num_errors) { exit 255; }
exit 0;


# checks offsets for filler, entity, justification, or date provenance
# Arguments (in order):
#	offset string: comma-separated list of "start-end" pairs
#	N: maximum number of pairs accepted
# Returns the empty string when the offsets are acceptable, otherwise a
# message describing the first problem found.
# it makes sure that:
#	at least one and no more than N pairs are given
#	each offset is an integer value
#	each end offset is not smaller than the corresponding start
sub checkOffsets {
	my $N = pop(@_);
	my $offsetString = pop(@_);

	# A missing column arrives as undef; treat it like an empty column so
	# it is reported as "no pair present" instead of raising a warning.
	$offsetString = "" if (!defined $offsetString);

	# note: split drops trailing empty fields, so "1-2," yields one pair
	my @tokens = split(',', $offsetString);

	# at least an offset pair must be present
	if (@tokens == 0) {
		return "At least 1 offset pair must be present!";
	}

	# at most $N pairs accepted
	if (@tokens > $N) {
		return "At most $N offset pairs accepted!";
	}

	# check each offset pair given
	foreach my $pair (@tokens) {
		my ($start, $end) = split('\-', $pair, 2);

		# An empty pair, or a pair without "-", leaves one or both halves
		# undefined.  Map them to "" so the checks below reject them with a
		# regular message rather than "uninitialized value" warnings.
		$start = "" if (!defined $start);
		$end = "" if (!defined $end);

		if ($start !~ /^\d+$/) {
			return "Start offset $start must be a positive integer value!";
		}
		if ($end !~ /^\d+$/) {
			return "End offset $end must be a positive integer value!";
		}
		if ($end < $start) {
			return "End offset $end is smaller than start offset $start!";
		}
	}

	return "";
}

# Reports whether a mention lies inside the justification text.
# Arguments (in order):
#	justification offset string ("start-end" pairs separated by commas)
#	mention offset string, in the same format
# Returns 1 when at least one mention pair falls entirely within at least
# one justification pair, or when either string yields no offsets at all;
# returns 0 otherwise.
sub offsetInclusion {
	my ($justSpec, $mentionSpec) = @_;

	# both lists are flat: (start, end, start, end, ...)
	my @mention = extractOffsets($mentionSpec);
	my @just = extractOffsets($justSpec);

	# some of these offsets may not be present. this is ok
	return 1 if (!@mention || !@just);

	my $m = 0;
	while ($m < $#mention) {
		my ($mStart, $mEnd) = @mention[$m, $m + 1];

		my $j = 0;
		while ($j <= $#just) {
			my ($jStart, $jEnd) = @just[$j, $j + 1];
			# contained: the justification span covers the whole mention
			return 1 if ($mStart >= $jStart && $mEnd <= $jEnd);
			$j += 2;
		}

		$m += 2;
	}

	return 0;
}

# Flattens an offset string into a list of numbers.
# Argument: comma-separated list of "start-end" pairs.
# Returns the list (start1, end1, start2, end2, ...).  Every pair always
# contributes two elements; a half that is absent from the text is undef.
sub extractOffsets {
	my ($spec) = pop(@_);

	my @flat = map {
		# split on the first "-" only; the rest stays in the end half
		my ($from, $to) = split('\-', $_, 2);
		($from, $to);
	} split(',', $spec);

	return @flat;
}

 # print error message, keeping track of total number of errors
sub error {
    # Writes one error line (program name, results file, current line
    # number, message) to the error log and increments the error count.
    # Argument: the message text.
    # Once the count exceeds $MAX_ERRORS, a final notice is logged, the log
    # is closed and the program exits with status 255.
    my $msg_string = pop(@_);

    print ERRLOG
        "$0 of $results_file: Error on line $line_num --- $msg_string\n";

    $num_errors++;
    if ($num_errors > $MAX_ERRORS) {
        print ERRLOG "$0 of $results_file: Quit. Too many errors!\n";
        # Low-precedence `or` so that a failed close is actually reported;
        # with `||` the die was attached to the handle name and never ran.
        close(ERRLOG) or
            die "Close failed for error log $errlog: $!\n";
        exit 255;
    }
}

# Writes one warning line (program name, results file, current line number,
# message) to the error log and increments the warning count.
# Argument: the message text.
# This sub shares its name with Perl's builtin warn, so it has to be
# invoked with a leading ampersand: &warn(...).
sub warn {
    my $text = pop(@_);

    my $entry = "$0 of $results_file: Warning on line $line_num --- " . $text . "\n";
    print ERRLOG $entry;

    # Warnings are only counted; they never stop the run.  A limit on the
    # number of warnings could be enforced at this point.
    $num_warnings += 1;
}