eval 'exec perl -S $0 ${1+"$@"}' if 0; # run perl script without knowing where perl is installed # # ID: $Id: $ # # COMPONENT_NAME: Documentation Infrastructure Processor # # DESCRIPTION OF PERL SCRIPT: # # This PERL script verifies a given Trace Id SGML file. It reports any errors # and alerts that are encountered and, when possible, fixes problems in # the file. An error file is created named .err. A new Trace Id SGML # file is also created, named .out. # # PERL SCRIPT PROGRAMMER: # # Bruce McLaren # # # HISTORY # April, 1999 - Written by Bruce McLaren # # # # Current directory pushed onto the include list. # push(@INC,$1) if ($0 =~ /(.*)\//); # location of this script; # # Push the directory containing translateTraceId into the INC # array. Note that this pathname must be changed to # 'afs/tr/kansas/latest/dest/etc' # for more general usage. For some reason this path was not # working on the AIX 4.3.2. machine biglou, so I used this # explicit path name. Since the path may change, the more # general path should be used, if possible. # push(@INC, '/afs/transarc.com/kansas/r1599/rs_aix43/dest/etc'); # # Utilities used by this script (e.g., &GetOptions,&PrintUsageAndQuit) # require 'utilities.pl'; # # Source the environment variables found in trace-vars. These # variables are necessary for running translateTraceId. # exec 'source trace-vars'; # # Parse the command-line options # $0 =~ s%.*/%%; # $0 = basename $0 $Usage = qq@Usage: $0 [-h] [file] \t-h requests this help message This PERL script verifies a given Trace Id SGML file. It reports errors and alerts that are encountered during processing and, when possible, fixes errors in the file. A new Trace Id SGML file is created, named .out. In addition, an error file is created named .err. The argument must be a Trace Id SGML file in the appropriate format. @; &GetOptions("h") || &PrintUsageAndExit(1); &PrintUsageAndExit(0) if ($opt_h); # # If an argument representing the Trace Id SGML file is provided, proceed # with processing the file. # if (@ARGV) { # # Open the Trace Id SGML file; quit on error. # open(INPUT, "<@ARGV") || die "\n\nCould not read '@ARGV': $!\n"; # # Open output file; quit on error # open(OUTPUT, ">@ARGV.out") || die "\n\nCould not write to '@ARGV.out': $!\n"; # # Open error file; quit on error # open(STDERR,">@ARGV.err") || die "\n\nCould not write to '@ARGV.err': $!\n"; print STDERR "*" x 70; print STDERR ("\n* ERROR REPORT for file '@ARGV'"); print STDERR "\n", "*" x 70; # # Define and initialize local scalar variables. # local($saveIt,$currentId,$currentIdShort,$currentMessage,$currentComponent, $translateMessage, $translateComponent, $translateType, $readingCurrentMessage, $currentIdText, $gotAMessage, $errorCount, $alertCount,$fixCount, $idCount,$gotOneMessage,$foundCurrentId, $replaceComponent, $replaceType, $saveCount, $fileTerminatingText, $replaceId, $replacedId, $errorFlag); $currentIdText = ""; $gotAMessage = 0; $idCount = 0; $errorCount = 0; $alertCount = 0; $fixCount = 0; # # Define assorted array variables. These are used to perform # various post-processing error & alert checking. # local(@saveId); local(%saveCurrentIds) ={}; local(%saveMultipleIds) ={}; local(%saveXRefIds) ={}; local(%saveIdText) = {}; local(@saveIds) = (); local(@sortedIds) = (); # # Iterate on lines of the input file # while () { # # Start by checking for a new Trace Id. If one is encountered, # set up for a new block of Trace Id text. # if (/^\\/) { # # Initialize for new Trace Id # # Save all of the text associated with the previous id in a # hash table and reassign $currentIdText to the new input. # if ($currentId ne "") { $saveIdText{$currentId} = $currentIdText; $saveIds[$idCount-1] = $currentId; } $currentIdText = $_; # # Grab the new Id, save it in $saveCurrentIds, increment # counter, and reset $gotOneMessage flag. # $currentIdShort = $1; $currentId = "0". $1; $saveCurrentIds{$currentId} = $currentId; $idCount++; $gotOneMessage = 0; print ("\n\n*** id = '$currentId'"); # # Verify that the current id has 10 characters. # $lengthOfCode = length($currentId); if ($lengthOfCode != 10) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId'"); print STDERR ("\n >>> Trace id is not 10 characters in length."); } # # Call translateTraceId and pipe output into this script to # extract the associated component and message fields of # the current Trace Id. # open (TRANSLATE, "translateTraceId $currentId |") || die "\n\ntranslateTraceId failed: $!\n"; $readingCurrentMessage = 0; while () { if (/component\s*\:\s*(.+)/) { $translateComponent = $1; } elsif (/type\s*\:\s*(.+)/) { $translateType = $1; } elsif (/format\s*\:\s*(.+)/) { $translateMessage = $1; $readingCurrentMessage = 1; } elsif ($readingCurrentMessage) { $translateMessage = $translateMessage . $_; } } # # If $readingCurrentMessage flag is not set after the loop, # there is a problem with translateTraceId. Print message # and exit. # if (!$readingCurrentMessage) { die "\n\ntranslateTraceId failed for '$currentId'.\n"; } next; # Go to next iteration of file } # # If no Trace id has yet been encountered, write the current line of # text to the output file. Otherwise, add the current line of # text to current Trace Id text variable, which is used to cache # all of the lines of text associated with the current id. # if ($idCount == 0) { print OUTPUT ("$_"); } else { $currentIdText = $currentIdText . $_; } # # If the component in file differs from the component retrieved # from translateTraceId, replace it and tally an error and fix. # if ((/Component\:\s*\<\/ph\>\s*(.*)\<\/li\>/) && ($1 ne $translateComponent)) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId'"); print STDERR ("\n >>> Invalid component encountered"); $replaceComponent = $1; $replaceComponent = backSlashString($replaceComponent); $translateComponent = backSlashString($translateComponent); $currentIdText =~ s/$replaceComponent/$translateComponent/; print STDERR ("\n *** Fix: Replaced '$replaceComponent' with '$translateComponent'"); $fixCount++; } # # If the type in file differs from the type retrieved from # translateTraceId, replace it and tally an error and fix. # if ((/Type\:\s*\<\/ph\>\s*(.*)\<\/li\>/) && ($1 ne $translateType)) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId'"); print STDERR ("\n >>> Invalid type encountered"); $replaceType = $1; $replaceType = backSlashString($replaceType); $translateType = backSlashString($translateType); $currentIdText =~ s/$replaceType/$translateType/; print STDERR ("\n *** Fix: Replaced '$replaceType' with '$translateType'"); $fixCount++; } # # If there is a '???' in the Explanation field, this # trace id is missing information and should be flagged # as an error. # if (/Explanation\:\s*\<\/ph\>\s*(\?\?\?)\<\/li\>/) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId'"); print STDERR ("\n >>> Explanation, User Response, and Variable fields are undefined."); } # # If this is a cross-reference to another id, save it in # an associative array, so we can later verify that the # cross-reference exists. # elsif (/\s*See\s*\\s*\/) && $gotOneMessage) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId'"); print STDERR ("\n >>> Multiple messages associated with this id."); } # # Check for Message string. Start by looking for the case in # which the Message is fully contained on one line. # elsif (/Message\:\s*\<\/ph\>\s*\(.*)\<\/xph\>/) { $currentMessage = $1; $gotAMessage = 1;} # Check for first line of a multi-line Message. # elsif (/Message\:\s*\<\/ph\>\s*\(.*)/) { $currentMessage = spaceAtEndOfString($1); } # Check for last line of a multi-line Message, one that contains # a non-blank leading character. # elsif ((/(\S.*)\<\/xph\>/) && !$gotOneMessage) { $currentMessage = $currentMessage . $1; $gotAMessage = 1; } # Check for last line of a multi-line Message, one that contains no # leading character. # elsif ((/\<\/xph\>/) && !$gotOneMessage) { $currentMessage = substr($currentMessage,0,length($currentMessage) - 1); $gotAMessage = 1; } # Check for intermediate line of a multi-line Message # (Note: Do not move this condition check; this check must # come after check for last line of Message!) # elsif (($currentMessage ne "") && (/(\S.*)/) && !$gotOneMessage){ $currentMessage = spaceAtEndOfString($currentMessage . $1); } # # After a full Message has been retrieved, verify it. # if ($gotAMessage) { $gotAMessage = 0; $gotOneMessage = 1; # # Start by calling function to clean up the Message, i.e., # delete tags, deal with special characters, etc. # $currentMessage = cleanUpString($currentMessage); # # Display trace information to terminal window # print ("\n*** message = '$currentMessage'"); # # SPECIAL CASE: If the $currentMessage is a simple, common # string and equals $translateMessage, do not process it. # if (($currentMessage eq "%s") && ($currentMessage eq $translateMessage)) { $currentMessage = ""; next; } # # Call findTraceId with the message. Pipe results back # into the program so that we can query the Trace Id. # open (FIND, "findTraceId \'$currentMessage\' |") || die "\n\nfindTraceId failed: $!\n"; $findTraceIdCount = 0; @saveId = (); while () { print ("\n$_"); $findTraceIdCount++; # # Save Trace Id by placing in array. # push (@saveId,$1) if (/id=(\S*)/); } ########## Check for Error & Alert Conditions ############# # # If there were no trace ids returned from translateTraceId, # we have either: # (1) a minor formatting mismatch between the messages (an alert) # (2) a mismatch between the trace id and the message (an error). # if ($findTraceIdCount == 0) { # # If messages are 'virtually identical', report an alert. # if (virtuallyIdentical($currentMessage,$translateMessage)) { $alertCount++; print STDERR ("\n\n*** Alert # $alertCount"); print STDERR ("\nid: '$currentId' message: '$currentMessage'"); print STDERR ("\n >>> Message does not match any id."); print STDERR ("\n >>> translateTraceId message: '$translateMessage'"); print STDERR ("\n >>> However, messages are virtually identical (> 90% similar) so there is no error.") # # If messages are not virtually identical, report an error. # } else { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId' message: '$currentMessage'"); print STDERR ("\n >>> Message does not match any id."); print STDERR ("\n >>> translateTraceId message: '$translateMessage'"); } # # If trace id matches a different message, report the error # and fix it in the output file by replacing the trace id. # } elsif (($findTraceIdCount == 1) && ($saveId[0] ne $currentId)) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId' message: '$currentMessage'"); print STDERR ("\n >>> Message matches on different id: '$saveId[0]'"); $saveCurrentIds{$saveId[0]} = $saveId[0]; $saveCurrentIds{$currentId} = $saveId[0]; $replaceId = substr($saveId[0],1); $currentIdText =~ s/$currentIdShort/$replaceId/g; print STDERR ("\n *** Fix: Replaced id '$currentId' with '$saveId[0]'"); $fixCount++; # # If there were multiple trace ids returned, report as # an alert and save data for further error checking. # } elsif ($findTraceIdCount > 1) { $alertCount++; print STDERR ("\n\n*** Alert # $alertCount"); print STDERR ("\nid: '$currentId' message: '$currentMessage'"); print STDERR ("\n >>> Message matches on multiple ids: \n"); $foundCurrentId = 0; foreach $saveId (@saveId) { $foundCurrentId = 1 if ($saveId eq $currentId); $saveMultipleIds{$saveId} = $currentId; print STDERR ("\n'$saveId' "); # # Run translateTraceId against each of the multiple # trace ids. Write output to STDERR. # open (TRANSLATE, "translateTraceId $saveId |") || die "\n\ntranslateTraceId failed: $!\n"; print STDERR ("\n"); while () {print STDERR ("$_");} } # # If the current Trace Id wasn't in the list of multiple ids, # report an error. # if (!$foundCurrentId) { $errorCount++; print STDERR ("\n\n*** Error # $errorCount"); print STDERR ("\nid: '$currentId'"); print STDERR ("\n >>> None of the multiple trace ids in Alert # $alertCount (See previous alert) are '$currentId' "); } } $currentMessage = ""; } } # # Verify that all of the trace ids that were cross-referenced also # appear as regular trace id references in the trace file. # print STDERR "\n\n", "*" x 70; print STDERR ("\n >>> The following trace ids appeared as X-refs (in 'See' field)"); print STDERR ("\n >>> but do not appear elsewhere in the SGML file:\n"); $errorFlag = 0; $saveCount = $errorCount; while (($id, $value) = each(%saveXRefIds)) { if (not (exists($saveCurrentIds{$id}))) { $errorCount++; $errorFlag = 1; print STDERR ("\n *** Error # $errorCount: '$id' (X-Ref from '$saveXRefIds{$id}')"); } elsif ($saveCurrentIds{$id} ne $id) { $errorFlag = 1; $replacedId = substr($id,1); $replaceId = substr($saveCurrentIds{$id},1); # # If the trace id to replace leads to a circular reference, # delete the current id entry from the hash table. # if ($saveIdText{$saveCurrentIds{$id} =~ /$replaceId/g) { print STDERR ("\n *** Circular reference! Deleting the 0$replaceId XRef"); delete $saveCurrentIds{$id}; } # # Otherwise, replace the XRef. # else { print STDERR ("\n\nReplacing '$replacedId' with '$replaceId'"); print STDERR ("\nBEFORE replacement text is \n$saveIdText{$saveCurrentIds{$id}}"); $saveIdText{$saveCurrentIds{$id}} =~ s/$replacedId/$replaceId/g; print STDERR ("\nAFTER replacement text is \n$saveIdText{$saveCurrentIds{$id}}"); print STDERR ("\n *** Replaced Xref to 0$replacedId with 0$replaceId"); } } } if (not($errorFlag)) {print STDERR ("\n NONE"); } # # Verify that all of the trace ids that appeared as multiple ids also # appear somewhere as ids in the trace file. # print STDERR "\n\n", "*" x 70; print STDERR ("\n >>> The following trace ids appeared as multiple ids"); print STDERR ("\n >>> but do not appear elsewhere in the SGML file:\n"); $saveCount = $alertCount; while (($id, $value) = each(%saveMultipleIds)) { if ((not (exists($saveCurrentIds{$id}))) || ($saveCurrentIds{$id} ne $id)) { $alertCount++; print STDERR ("\n *** Alert # $alertCount: '$id' (From message of '$saveMultipleIds{$id}')"); # # Run translateTraceId against each missing trace id. # open (TRANSLATE, "translateTraceId $id |") || die "\n\ntranslateTraceId failed: $!\n"; print STDERR ("\n"); while () {print STDERR ("$_");} } } if ($saveCount == $alertCount) {print STDERR ("\n NONE"); } # # Print error and alert totals # printTotals($idCount,$errorCount,$fixCount,$alertCount); # # The last trace id contains the file terminating text. Separate # the text associated with the last id from terminating text. # $currentIdText =~ /(.+)(\<\/dlentry\>.)(.+)/s; $currentIdText = $1 . $2; $fileTerminatingText = $3; # # Save all of the text and the id associated with the last id. # $saveIdText{$currentId} = $currentIdText; $saveIds[$idCount-1] = $currentId; # # Sort saved ids # print ("\n\n*** Sorting the trace ids..."); print STDERR ("\n\n*** Sorting the trace ids..."); @sortedIds = sort(@saveIds); # # Write text associated with ids to the new trace id file, based # on the order of sorted ids. # print ("\n\n*** Writing the trace ids to @ARGV.out\n"); print STDERR ("\n\n*** Writing the trace ids to @ARGV.out\n"); for ($i = 0; $i < $idCount; $i++) { print OUTPUT ("$saveIdText{$sortedIds[$i]}"); } # # Write file terminating text # print OUTPUT ("$fileTerminatingText"); close(INPUT); close(OUTPUT); close(STDERR); } ###################################################################### # This sub checks whether message1 is "virtually identical" to # message2. If the messages are 90% similar they are considered # virtually identical. The information retrieval metrics recall # and precision are used to quantify similarity. If the strings # are found to be virtually identical, a 1 is returned. Otherwise, # a 0 is returned. ###################################################################### sub virtuallyIdentical { local($message1,$message2) = @_; local(%message1CharCount,%message2CharCount); local($char,$charCount,$message1TotalCharCount,$message2TotalCharCount); local($additional,$overlap,$missing,$recall,$precision,$average); # # If message2 contains the "no format found" message, return # immediately with failure (0). # if ($message2 =~ /no format found for/) { return(0); # # Otherwise, compute recall and precision between two strings. # } else { $message1 = stripSpecialChars($message1); # # Count chars in each message and store in associative array. # %message1CharCount = countChars($message1); %message2CharCount = countChars($message2); # # Initialize the components of the recall/precision calculation. # $additional = 0; $overlap = 0; $missing = 0; # # Tally the 'additional' characters, i.e., those found in $message1 # but not in $message2. Also, tally the 'overlap' characters, i.e., # those found in both strings. # while (($char, $charCount) = each(message1CharCount)) { if (exists($message2CharCount{$char})) { if ($message2CharCount{$char} < $charCount) { $additional += $charCount - $message2CharCount{$char}; $overlap += $message2CharCount{$char}; } else { $overlap += $charCount; } } else { $additional += $charCount; } } # # Tally the 'missing' characters, i.e., those found in $message2 but # not in $message1. # while (($char, $charCount) = each(message2CharCount)) { if (exists($message2CharCount{$char})) { if ($message1CharCount{$char} < $charCount) { $missing += $charCount - $message1CharCount{$char}; } } else { $missing += $charCount; } } } # # Calculate precision and recall and the average of the two. # (Note: Averaging recall and precision is unconventional, but # it should suffice as a rough measure of similarity for the # purpose of this script.) # $precision = $overlap / ($overlap + $additional); $recall = $overlap / ($overlap + $missing); $average = ($recall + $precision) / 2; #Debugging statements; leave them here for now. # print STDERR ("\n\nAre messages virtually identical?"); # print STDERR ("\nIn Message: '$message1'"); # print STDERR ("\nTran Message: '$message2'"); # print STDERR ("\n\toverlap = $overlap"); # print STDERR ("\n\tadditional = $additional"); # print STDERR ("\n\tmissing = $missing"); # print STDERR ("\n\tprecision = $precision"); # print STDERR ("\n\trecall = $recall"); # print STDERR ("\n\taverage = $average"); # # If the $average >= 90%, return 1. Otherwise, return 0. # if ($average >= .90) { return(1); } else { return(0); } } ###################################################################### # This subroutine (1) counts the characters in a given string, # (2) places the results in an associative array of characters, and # (3) returns the associative array. ###################################################################### sub countChars { local($message) = @_; local(%charCount,$messageCount,$currentChar); local($messageLength) = length($message); $messageCount = 0; while ($messageCount < $messageLength) { $currentChar = substr($message,$messageCount,1); if (exists($charCount{$currentChar})) { $charCount{$currentChar}++; } else { $charCount{$currentChar} = 1; } $messageCount++; } return(%charCount); } ###################################################################### # Subroutine to print error and alert totals. ###################################################################### sub printTotals { local($idCount,$errorCount,$fixCount,$alertCount) = @_; local($remainToBeAnalyzed); $remainToBeAnalyzed = $errorCount - $fixCount; # # Print to Error file # print STDERR "\n\n", "*" x 70; print STDERR ("\n\n$idCount Trace Ids were processed."); print STDERR ("\n$errorCount ERROR(s) were encountered."); print STDERR ("\n$fixCount ERROR(s) were fixed in the output file."); print STDERR ("\n$remainToBeAnalyzed ERROR(s) remain to be analyzed."); print STDERR ("\n$alertCount ALERT(S) were encountered.\n"); # # Print to display # print STDERR "\n\n", "*" x 70; print ("\n\n$idCount Trace Ids were processed."); print ("\n$errorCount ERROR(s) were encountered."); print ("\n$fixCount ERROR(s) were fixed in the output file."); print ("\n$remainToBeAnalyzed ERROR(s) remain to be analyzed."); print ("\n$alertCount ALERT(S) were encountered.\n\n"); } ###################################################################### # Subroutine to clean up string by replacing various special chars. ###################################################################### sub cleanUpString { local($string) = @_; # # Delete and tags within message # $string =~ s/\//g; $string =~ s/\<\/pv\>//g; # # Replace left paren with backslash/left paren. # Replace right paren with backslash/right paren. # $string =~ s/\(/\\\(/g; $string =~ s/\)/\\\)/g; # # Replace left bracket with backslash/left bracket. # Replace right bracket with backslash/right bracket. # $string =~ s/\[/\\\[/g; $string =~ s/\]/\\\]/g; # # Replace ' with '\''. This is done so that single # quotes are recognized in findTraceId. # $string =~ s/'/'\\\''/g; # # Replace " with \\\". This is done so that double # quotes are recognized in findTraceId. # $string =~ s/"/\\\\\"/g; # # Replace < with < # $string =~ s/\<\;/' at the end of the string. ###################################################################### sub spaceAtEndOfString { local($string) = @_; if ((substr($string,length($string) - 1, 1) ne " ") && (substr($string,length($string) - 1, 1) ne ">")) {$string = $string . " ";} return($string); } ###################################################################### # Sub to put backslash in front of special characters so that # substitution works properly. ###################################################################### sub backSlashString { local($string) = @_; $string = backSlashItem ($string,"+"); $string = backSlashItem ($string,"/"); $string = backSlashItem ($string,"."); $string = backSlashItem ($string,"("); $string = backSlashItem ($string,")"); return($string); } ###################################################################### # Sub to put backslash in front of given char in given string. ###################################################################### sub backSlashItem { local($string,$char) = @_; local($position); $position = -1; while (($position = index($string,$char,$position + 1)) > 0) { $string = substr($string,0,$position) . "\\" . substr($string,$position); $position = $position + 1; } return($string); } ###################################################################### # Subroutine to modify a string by stripping special characters. ###################################################################### sub stripSpecialChars { local($string) = @_; # # Delete back slashes from string # $string =~ s/\\//g; # # Change pairs of single quotes to a single quote. # (Note: This is done twice because it seems to miss the # case in which three single quotes are grouped together.) # $string =~ s/''/'/g; $string =~ s/''/'/g; return($string); } ###################################################################### # Subroutine to retrieve command line options. # This routine was copied from Tim Burt's burtBase.pl file. ###################################################################### sub GetOptions { local($argstring) = shift; local($errors) = 0; local($index); while (@ARGV && (($_ = @ARGV[0]) =~ /^-(.)(.*)/)) { if (($index = index($argstring,$1)) >= 0) { # options which require an argument if (substr($argstring,$index+1,1) eq ':') { shift(@ARGV); $arg = $2 || shift(@ARGV); warn("Option $1 requires an argument\n"), $errors++ unless $arg; eval "\$opt_$1 = \$arg" if $arg; next; } # boolean options eval "\$opt_$1 = 1"; } else { warn("Invalid option (-$1)\n"), $errors++; } # Increment for invalid/boolean options if ($2 ne '') { $ARGV[0] = "-$2"; } else { shift(@ARGV); } } $errors == 0; } exit 0; #Local Variables: #mode: perl #End: