#!/usr/bin/perl -w
use strict;

# Author: Hoa Dang (hoa.dang@nist.gov)
# Author: Mihai Surdeanu (msurdeanu@email.arizona.edu)
#
# Date: July 24, 2014
# Version 3.1
# Added check to ensure all documents in filler justification are also in relation justification
#
# Date: June 17, 2014
# Version 3.0
# Adapted to the SF 2014 format
#
# Date: July 12, 2013
# Version 2.2
# Bug fix in offset inclusion checks
#
# Date: July 8, 2013
# Version 2.1
# Added offset warnings, if filler + entity offsets are not included in justification
# Removed UTF-8 checking. That code is not robust enough.
#
# Date: June 19, 2013
# Version: 2.0
# Adapted to the 2013 slot filling format
#
# Date: July 24, 2012
# Version: 1.1
# Increased maximum run number to 5 (up from 3)
#
# Date: July 2, 2012
# Version: 1.0
# Initial version

# Check a TAC 2014 KBP Track slot-filling task submission for various
# common errors, including:
#   * invalid run tag (should be a concatenation of your TAC 2014 team ID
#     and the run number (1-5))
#   * multiple run tags
#   * invalid query id or slot name
#   * missing value for a question (at least one response required per question)
#   * answer from an invalid document
#   * offsets (Columns 4,6) that appear to be invalid
#   * the confidence score is a double value less or equal to 1
# [NB: "question" refers to a (query id, slot name) pair]
# NOTE(review): the code below does not report questions that have no
# response at all; confirm whether that check is still expected.
#
# Messages regarding submission are printed to an error log.
#
# Usage is:
#   check_kbp_slot-filling.pl doclist_file queries_file results_file
# where results_file is the name of the results submission file to be checked.
#
# The doclist file lists all the valid document IDs for the task, one ID per line.
#
# The queries file is an XML file.
# NOTE(review): the DTD that used to be listed here is missing from this copy
# of the script. The parser below only looks at lines of the form
# <query id="...">, <enttype>TYPE</enttype> and </query>.
#
# The results file has exactly one response per single-valued slot and
# at least one response for each list-valued slot, for each query.
#
# A response is a line with the following tab-separated columns:
#   Column 1: query id
#   Column 2: slot name
#   Column 3: a unique run id for the submission
#   Column 4: NIL, if the system believes no information is learnable for
#             this slot; or provenance for the relation otherwise,
#             consisting of up to 4 triples docid:start-end
#   Column 5: a slot filler
#   Column 6: up to 2 docid:start-end triples for representative mentions
#             used to extract/normalize filler
#   Column 7: confidence score
#
# If Column 4 is NIL, then Columns 5-7 must be empty.
# The slot filler (Column 5) must not contain any embedded tab characters.
#
# This script also creates a new file, "input", in the current directory
# that has a more standard format (there will not be leading whitespace;
# and columns will be separated by one tab).

# Change this variable to the directory where the error log should be put
my $errlog_dir = ".";

# The run is aborted once more than this many errors have been logged.
my $MAX_ERRORS = 25;

# query id -> { type => entity type } as read from the queries file
my %queries;

# Slot Filler Type as defined in LDC publication "TAC KBP Slots"; Version 2.3; June 11, 2012
# Slot names as defined in "Proposed Task Description for Knowledge-Base
# Population at TAC 2013 Version 1.1 of May 24th, 2013"
my %slots = (
    'PER' => {
        'per:alternate_names'                 => { quantity => 'list',   content => 'name' },
        'per:children'                        => { quantity => 'list',   content => 'name' },
        'per:cities_of_residence'             => { quantity => 'list',   content => 'name' },
        'per:city_of_birth'                   => { quantity => 'single', content => 'name' },
        'per:city_of_death'                   => { quantity => 'single', content => 'name' },
        'per:countries_of_residence'          => { quantity => 'list',   content => 'name' },
        'per:country_of_birth'                => { quantity => 'single', content => 'name' },
        'per:country_of_death'                => { quantity => 'single', content => 'name' },
        'per:employee_or_member_of'           => { quantity => 'list',   content => 'name' },
        'per:origin'                          => { quantity => 'list',   content => 'name' },
        'per:other_family'                    => { quantity => 'list',   content => 'name' },
        'per:parents'                         => { quantity => 'list',   content => 'name' },
        'per:schools_attended'                => { quantity => 'list',   content => 'name' },
        'per:siblings'                        => { quantity => 'list',   content => 'name' },
        'per:spouse'                          => { quantity => 'list',   content => 'name' },
        'per:stateorprovince_of_birth'        => { quantity => 'single', content => 'name' },
        'per:stateorprovince_of_death'        => { quantity => 'single', content => 'name' },
        'per:statesorprovinces_of_residence'  => { quantity => 'list',   content => 'name' },
        'per:age'                             => { quantity => 'single', content => 'value' },
        'per:date_of_birth'                   => { quantity => 'single', content => 'value' },
        'per:date_of_death'                   => { quantity => 'single', content => 'value' },
        'per:cause_of_death'                  => { quantity => 'single', content => 'string' },
        'per:charges'                         => { quantity => 'list',   content => 'string' },
        'per:religion'                        => { quantity => 'single', content => 'string' },
        'per:title'                           => { quantity => 'list',   content => 'string' },
    },
    'ORG' => {
        'org:alternate_names'                 => { quantity => 'list',   content => 'name' },
        'org:city_of_headquarters'            => { quantity => 'single', content => 'name' },
        'org:country_of_headquarters'         => { quantity => 'single', content => 'name' },
        'org:founded_by'                      => { quantity => 'list',   content => 'name' },
        'org:member_of'                       => { quantity => 'list',   content => 'name' },
        'org:members'                         => { quantity => 'list',   content => 'name' },
        'org:parents'                         => { quantity => 'list',   content => 'name' },
        'org:political_religious_affiliation' => { quantity => 'list',   content => 'name' },
        'org:shareholders'                    => { quantity => 'list',   content => 'name' },
        'org:stateorprovince_of_headquarters' => { quantity => 'single', content => 'name' },
        'org:subsidiaries'                    => { quantity => 'list',   content => 'name' },
        'org:top_members_employees'           => { quantity => 'list',   content => 'name' },
        'org:date_dissolved'                  => { quantity => 'single', content => 'value' },
        'org:date_founded'                    => { quantity => 'single', content => 'value' },
        'org:number_of_employees_members'     => { quantity => 'single', content => 'value' },
        'org:website'                         => { quantity => 'single', content => 'string' },
    },
);

my %docids;    # set of valid docids (docid -> 1)
my %qids;      # number of answers returned for question
my %nils;      # number of NIL answers returned for question

my $num_errors   = 0;
my $num_warnings = 0;
my $line_num     = 0;     # current line of the results file (0 before it is read)
my $run_id       = "";    # run tag seen on the first response line

if (@ARGV != 3) {
    print STDERR "Usage: $0 doclist_file query_file resultsfile\n";
    die "\n";
}
my ($doclist_file, $queries_file, $results_file) = @ARGV;

# set up output files
# The error log is named after the basename of the results file;
# rindex returns -1 when there is no "/", so the +1 yields offset 0.
my $errlog = $errlog_dir . "/"
    . substr($results_file, rindex($results_file, "/") + 1) . ".errlog";

# NB: `or', not `||' -- `||' would bind to the file name argument and the
# die would never run.
open(my $errlog_fh, '>', $errlog)
    or die "Cannot open error log $errlog for writing: $!\n";
open(my $input_fh, '>', "input")
    or die "Cannot create `input' file: $!\n";

# read in doclist file
open(my $doclist_fh, '<', $doclist_file)
    or die "Unable to open document list file $doclist_file: $!";
while (my $line = <$doclist_fh>) {
    chomp $line;
    $docids{$line} = 1;
}
close $doclist_fh;

# read in queries file
# only do limited error checking
open(my $queries_fh, '<', $queries_file)
    or die "Unable to open queries file $queries_file: $!";
{
    my ($query_id, $etype);
    while (my $line = <$queries_fh>) {
        chomp $line;
        next if ($line =~ /^\s*$/);
        $line =~ s/^\s*(.*\S)\s*$/$1/g;    # remove leading and trailing whitespace

        # NOTE(review): the opening-tag parts of the two patterns below were
        # reconstructed from the closing-tag patterns; verify against a real
        # queries file.
        if ($line =~ /^<query id="(.*)">$/) {
            $query_id = $1;
            if (defined $queries{$query_id}) {
                error("duplicate query ids in queries file");
                next;
            }
        }
        elsif ($line =~ /^<\/query>$/) {
            if (!defined $etype) {
                error("undefined enttype for query $query_id");
                next;
            }
            $queries{$query_id}{'type'} = $etype;
            undef $query_id;
            undef $etype;
        }
        elsif ($line =~ /^<enttype>([A-Z]+)<\/enttype>$/) {
            if (defined $etype) {
                error("entity type $etype already defined");
                next;
            }
            $etype = $1;
        }
    }
}
close $queries_fh;

# read and validate the results file, one response per line
open(my $results_fh, '<', $results_file)
    or die "Unable to open results file $results_file: $!";
while (my $line = <$results_fh>) {
    chomp $line;
    $line_num++;
    next if ($line =~ /^\s*$/);

    # Disabled UTF8 checking. This code is not robust!
    # if (invalid_UTF8($line)) {
    #     error("invalid character (non-UTF8)");
    #     next;
    # }

    my ($q, $slot_name, $tag, $rel_just, $answer, $filler_just, $conf)
        = split /\t/, $line, 7;

    if (   !defined $rel_just
        || length($rel_just) == 0
        || !defined $tag
        || !defined $slot_name)
    {
        error("Wrong number of fields -- missing fields");
        next;
    }

    # make sure runtag is ok
    if (!$run_id) {
        # very first line --- remember tag
        $run_id = $tag;
        if ($run_id !~ /^[A-Za-z0-9._]{1,12}[1-5]$/) {
            error("Run tag `${run_id}' is malformed");
            next;
        }
    }
    elsif ($tag ne $run_id) {
        # otherwise just make sure one tag used
        error("Run tag inconsistent (`${tag}' and `${run_id}')");
        next;
    }

    # get query id
    if (!defined $queries{$q}) {
        error("Invalid query id ($q)");
        next;
    }

    # get slot name
    if (!defined $slots{ $queries{$q}{'type'} }{$slot_name}) {
        error("Invalid slot name $slot_name for query $q with entity type $queries{$q}{'type'}");
        next;
    }

    $qids{$q}{$slot_name}++;

    if ($rel_just ne "NIL") {
        # split() fills columns left to right, so a defined confidence
        # implies the answer and filler justification are defined too
        if (!defined($conf)) {
            error("Wrong number of fields -- missing fields");
            next;
        }

        # make sure answer exists
        if (!defined $answer || length($answer) == 0) {
            error("Missing answer-string for slot $slot_name for query $q");
            next;
        }

        # check filler offsets and justification offsets
        my $checkMsg = check_justification($rel_just, 4, $slot_name);
        if (length($checkMsg) > 0) {
            error("invalid relation justification: `${rel_just}' ($checkMsg)");
            next;
        }
        $checkMsg = check_justification($filler_just, 2, "");
        if (length($checkMsg) > 0) {
            error("invalid filler justification: `${filler_just}' ($checkMsg)");
            next;
        }
        elsif (docInclusion($rel_just, $filler_just) == 0) {
            # reported, but the remaining checks still run for this line
            error("invalid filler justification `${filler_just}' contains document not included in relation justification `${rel_just}'");
        }

        # make sure the confidence is a double with a valid value
        if (!($conf =~ /^\d+\.\d+$/ && $conf <= 1.0)) {
            error("invalid confidence value `${conf}'");
            next;
        }

        print {$input_fh} "$q\t$slot_name\t$tag\t$rel_just\t$answer\t$filler_just\t$conf\n";
    }
    else {
        # Test definedness/length rather than truth so that a filler of
        # "0" on a NIL line is also reported.
        if (defined $answer && length($answer) > 0) {
            error("Answer string given when relation provenance is NIL");
            next;
        }
        $nils{$q}{$slot_name}++;
        print {$input_fh} "$q\t$slot_name\t$tag\t$rel_just\n";
    }
}
close $results_fh;

# Do global checks:
#   error if single-valued question has more than one response given for it
#   error if more than one response is given for a question where NIL has
#   been given as an answer
# Keys are sorted so that the error log is in a deterministic order.
foreach my $q (sort keys %queries) {
    my $type = $queries{$q}{'type'};
    foreach my $slot_name (sort keys %{ $slots{$type} }) {
        my $count = $qids{$q}{$slot_name};
        next unless defined $count && $count > 1;

        if ($slots{$type}{$slot_name}{'quantity'} eq "single") {
            error("More than one response given for single-valued slot $slot_name for query $q");
        }
        if (defined $nils{$q}{$slot_name}) {
            error("More than one response given for slot $slot_name for query $q, where NIL has been given as an answer");
        }
    }
}

print {$errlog_fh} "Finished processing $results_file\n";
close $input_fh  or die "Close failed for `input' file: $!\n";
close $errlog_fh or die "Close failed for error log $errlog: $!\n";

if ($num_errors) {
    exit 255;
}
exit 0;


# checks offsets for relation and filler
# it makes sure that:
#   no more than N pairs are given
#   each docid is valid
#   each offset is an integer value
#   each end offset is not smaller than the corresponding start
# Arguments: the comma-separated justification string, the maximum number
# of docid:start-end entries N, and the slot name (currently unused).
# Returns "" when the justification is acceptable, otherwise a message
# describing the first problem found.
sub check_justification {
    my ($offsetString, $N, $slotName) = @_;
    $offsetString = "" unless defined $offsetString;

    my @tokens = split(/,/, $offsetString);

    # at least an offset pair must be present
    # this is now mandatory for alternate_names too!
    if (@tokens == 0) {
        return "At least 1 offset pair must be present!";
    }

    # at most $N tokens accepted
    if (@tokens > $N) {
        return "At most $N offset pairs accepted!";
    }

    # check each offset pair given
    foreach my $tuple (@tokens) {
        my ($docid, $start_end) = split(/:/, $tuple, 2);
        # Missing pieces are treated as empty strings so that a malformed
        # tuple yields one of the messages below rather than
        # uninitialized-value warnings.
        $docid     = "" unless defined $docid;
        $start_end = "" unless defined $start_end;

        # make sure docid is valid
        if ($docid =~ /(\.sgm)/) {
            return "Unknown document ID `${docid}' (looks like a file name instead of a document ID)";
        }
        if (!$docids{$docid}) {
            return "Unknown document `${docid}'";
        }

        my ($start, $end) = split(/-/, $start_end, 2);
        $start = "" unless defined $start;
        $end   = "" unless defined $end;

        if ($start !~ /^\d+$/) {
            return "Start offset $start must be a positive integer value!";
        }
        if ($end !~ /^\d+$/) {
            return "End offset $end must be a positive integer value!";
        }
        if ($end < $start) {
            return "End offset $end is smaller than start offset $start!";
        }
    }

    return "";
}

# return 1 if every document in entityJustification is also a document in
# relationJustification, otherwise, return 0
# Arguments: relation justification string, then entity (filler)
# justification string.
sub docInclusion {
    my ($relationJustification, $entityJustification) = @_;

    my %in_relation = map { $_ => 1 } extractDocs($relationJustification);
    foreach my $docid (extractDocs($entityJustification)) {
        return 0 unless $in_relation{$docid};
    }
    return 1;
}

# Returns the list of docids (the part before the first ':') of every
# comma-separated entry in a justification string, in order.
sub extractDocs {
    my ($offsetString) = @_;

    my @docs;
    foreach my $tuple (split(/,/, $offsetString)) {
        my ($docid) = split(/:/, $tuple, 2);
        push(@docs, $docid);
    }
    return @docs;
}

# print error message, keeping track of total number of errors
# Once more than $MAX_ERRORS errors have been logged the error log is
# closed and the script exits with status 255.
sub error {
    my ($msg_string) = @_;

    print {$errlog_fh} "$0 of $results_file: Error on line $line_num --- $msg_string\n";
    $num_errors++;
    if ($num_errors > $MAX_ERRORS) {
        print {$errlog_fh} "$0 of $results_file: Quit. Too many errors!\n";
        close $errlog_fh or die "Close failed for error log $errlog: $!\n";
        exit 255;
    }
}

# print warning message, keeping track of total number of warnings
# NB: this shares its name with Perl's built-in warn; it must be invoked
# as &warn(...), since a plain warn(...) resolves to the built-in.
sub warn {
    my ($msg_string) = @_;

    print {$errlog_fh} "$0 of $results_file: Warning on line $line_num --- $msg_string\n";
    $num_warnings++;
    # You can add stopping after N warnings here.
    # Currently, we do not stop because of warnings.
}

# Returns index of string element in array if present, else -1
# Arguments: the string to look for and a reference to the array.
sub is_member {
    my ($element, $arrayref) = @_;

    foreach my $idx (0 .. $#{$arrayref}) {
        return $idx if $element eq $arrayref->[$idx];
    }
    return -1;
}

# Return 0 iff line is valid UTF-8; else return 1.
# Regular expression from: http://www.w3.org/International/questions/qa-forms-utf-8
sub invalid_UTF8 {
    my ($line) = @_;

    if ($line =~ m/\A(
          [\x09\x0A\x0D\x20-\x7E]            # ASCII
        | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
        | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
        | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
        | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
        | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
        | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
        | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
        )*\z/x)
    {
        return 0;
    }
    return 1;
}