223 lines
7.7 KiB
C
223 lines
7.7 KiB
C
/*
 * subst.c -- Repair substitution tables
 *
 * Copyright (C) 1997 Pretty Good Privacy, Inc.
 *
 * Written by Colin Plumb
 *
 * $Id: subst.c,v 1.14 1997/11/03 22:12:00 colin Exp $
 *
 * IT IS EXPECTED that users of this program will play with these tables
 * and the cost values in the subst.h header.  (Some day, they'll all
 * get moved to an external config file.)
 *
 * NOTE: Other costs are hiding in the Filter functions in repair.c.
 * Remember to keep them all on the same scale.
 */
|
|
|
|
/*
 * The repair program copies its input to its output, making various
 * substitutions, until it manages to produce a version that satisfies
 * the parser.  This includes having a correct CRC for each line.
 * Each substitution has a cost, and the combinations are tried in order
 * of increasing cost.  NOTE that even translating "A"->"A" counts as
 * a substitution, although it may have zero cost.
 *
 * The intention is to correct transcription errors, where the
 * errors have a distinctly non-uniform distribution.  Slight
 * differences in cost produce a preference in trying some errors
 * first.  If an error costs half as much as another, combinations
 * of two of that error will be compared to one of the more expensive.
 * Too many cheap substitutions will result in repair spending
 * a very long time searching before considering the more expensive
 * substitutions.
 *
 * The following parameters and the raw substitution tables are expected
 * to be edited by the user based on experience.  Eventually, this
 * will be moved into an external config file, but for now it's a matter
 * of recompiling.
 */
|
|
|
|
#include "subst.h"
|
|
#include "util.h"
|
|
|
|
/* What the OCR software reports for "unrecognizable" */
|
|
/*
 * Two-character string: '~' followed by the byte with octal value 274.
 * It is used as (part of) an input pattern in the substMultiples table.
 */
#define UNRECOG_STRING "~\274"
|
|
|
|
/*
|
|
* The input substitutions to make (one-to-one). These are listed in
|
|
* the order of correction. i.e. uncorrected input first, then corrected
|
|
* output. Substitutions are one-way; to get two-way, list it twice.
|
|
*/
|
|
|
|
struct RawSubst const substSingles[] = {
|
|
/* Identity substitutions - note that period (.) is excluded */
|
|
{ "!\"#$%&'()*+,-./0123456789:;<=>?" SPACE_STRING,
|
|
"!\"#$%&'()*+,-./0123456789:;<=>?" SPACE_STRING, 0, 0, NULL },
|
|
{ "@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_\t" TAB_STRING,
|
|
"@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_\t" TAB_STRING, 0, 0, NULL },
|
|
{ "`abcdefghijklmnopqrstuvwxyz{|}~\f" FORMFEED_STRING,
|
|
"`abcdefghijklmnopqrstuvwxyz{|}~\f" FORMFEED_STRING, 0, 0, NULL },
|
|
#if (TAB_PAD_CHAR & 128) /* Not already included? */
|
|
{ TAB_PAD_STRING, TAB_PAD_STRING, 0, NULL },
|
|
#endif
|
|
{ "\r\n" CONTIN_STRING, "\n\n" CONTIN_STRING, 0, 0, NULL },
|
|
|
|
/* Occasionally these just get inserted as glitches */
|
|
{ ".,'`", NULL, 5, 10, FilterNearBlanks },
|
|
/* This is now pretty infrequent */
|
|
{ "-_", "_-", 0, 10, FilterAfterRepeat },
|
|
|
|
/*
|
|
* Capitalization errors are common in some cases
|
|
* c/C, s/S, u/U are fucked up all the time.
|
|
* Also o/O, v/V and w/W. x, y and z also give some problems.
|
|
*/
|
|
{ "cilmopsuvwxyz", "CILMOPSUVWXYZ", 7, 13, FilterNearLower },
|
|
{ "CILMOPSUVWXYZ", "cilmopsuvwxyz", 7, 13, FilterNearUpper },
|
|
/* Other errors */
|
|
{ "g9aaiji;xX00Si", "9gg2ji;i%%oO3f", 10, 0, NULL },
|
|
/* This seems to happen a lot */
|
|
{ "c", "r", 9, 0, NULL },
|
|
|
|
{ "j", ";", 9, 0, NULL },
|
|
{ "' ", "``", 10, 0, NULL },
|
|
|
|
/* Uncommon errors */
|
|
|
|
/* Wierd stuff that's happened in the checksum part */
|
|
/* A highish weight is okay here */
|
|
{ "sSEdJl", "554437", 15, 0, NULL },
|
|
{ "LESsPZ", "bb8a22", 15, 0, NULL },
|
|
|
|
/* Wierd stuff that has happened */
|
|
{ "BasAeaeRoooo", "3334a@QQpqbd", 5, 15, FilterIsBinary },
|
|
{ "oooo", "pqbd", 0, 15, FilterIsBinary },
|
|
{ "ttTCCflO", "iff{[lfG", 12, 0, NULL },
|
|
#if 0
|
|
/* If the line-breaks get screwed up, use these */
|
|
{ " ", "\n", 10, COST_INFINITY, FilterChecksumFollows },
|
|
{ "\n", " ", COST_INFINITY, 10, FilterChecksumFollows },
|
|
{ "\n", NULL, COST_INFINITY , 11, FilterChecksumFollows },
|
|
#endif
|
|
|
|
{ NULL, NULL, 0, 0, NULL }
|
|
};
|
|
|
|
/* The many-to-many substitutions */
|
|
struct RawSubst const substMultiples[] = {
|
|
{ "''", "\"", 2, 0, NULL },
|
|
{ "``", "\"", 2, 0, NULL },
|
|
{ ",'", "\"", 2, 0, NULL },
|
|
{ "',", "\"", 2, 0, NULL },
|
|
{ ",,", "\"", 2, 0, NULL },
|
|
/* Extra inserted spaces are common */
|
|
{ " ", " ", COST_INFINITY, 0, FilterFollowsSpace },
|
|
{ " ", "", 0, 15, FilterFollowsSpace },
|
|
{ "\t", " ", COST_INFINITY, 0, FilterFollowsSpace },
|
|
{ "\t", "", 0, 10, FilterFollowsSpace },
|
|
/* Convert between SPACE_CHAR dots and periods */
|
|
{ ".", SPACE_STRING, 1, COST_INFINITY, FilterFollowsSpace },
|
|
{ ".", " "SPACE_STRING, COST_INFINITY, 10, FilterFollowsSpace },
|
|
{ SPACE_STRING, ".", 15, 5, FilterFollowsSpace },
|
|
{ SPACE_STRING, " "SPACE_STRING, COST_INFINITY, 5, FilterFollowsSpace },
|
|
|
|
/* Replace "unknown" by zero - it often is */
|
|
{ UNRECOG_STRING, "0", 1, 0, NULL },
|
|
{ UNRECOG_STRING, "_", 2, 0, NULL },
|
|
{ UNRECOG_STRING, ")", 3, 0, NULL },
|
|
{ UNRECOG_STRING, "^", 4, 0, NULL },
|
|
/* Except that these glitches are common */
|
|
{ UNRECOG_STRING"'", "\\\"", 0, 0, NULL },
|
|
{ UNRECOG_STRING"'", "\"", 1, 0, NULL },
|
|
{ "'"UNRECOG_STRING, "\"", 0, 0, NULL },
|
|
{ UNRECOG_STRING UNRECOG_STRING , "\"", 0, 0, NULL },
|
|
/* Something else that has been seen */
|
|
{ "V'", "\\\"", 5, 0, NULL },
|
|
|
|
/* A common transposition */
|
|
{ "\"'", "'\"", 5, 0, NULL },
|
|
{ "'\"", "\"'", 5, 0, NULL },
|
|
/* These also happen fairly often */
|
|
{ " \"", "''", 5, 0, NULL },
|
|
{ "\" ", "''", 5, 0, NULL },
|
|
|
|
/* Common glitches */
|
|
{ "\t.\n", "\n", 5, 0, NULL },
|
|
{ "\t,\n", "\n", 5, 0, NULL },
|
|
{ "\t-\n", "\n", 5, 0, NULL },
|
|
{ "\t_\n", "\n", 5, 0, NULL },
|
|
{ "\t'\n", "\n", 5, 0, NULL },
|
|
{ "\t`\n", "\n", 5, 0, NULL },
|
|
{ "\t~\n", "\n", 5, 0, NULL },
|
|
{ "\t:\n", "\n", 5, 0, NULL },
|
|
{ "\t"SPACE_STRING"\n", "\n", 5, 0, NULL },
|
|
|
|
/* Less common */
|
|
{ " .\n", "\n", 10, 0, NULL },
|
|
{ " ,\n", "\n", 10, 0, NULL },
|
|
{ " -\n", "\n", 10, 0, NULL },
|
|
{ " _\n", "\n", 10, 0, NULL },
|
|
{ " '\n", "\n", 10, 0, NULL },
|
|
{ " `\n", "\n", 10, 0, NULL },
|
|
{ " ~\n", "\n", 10, 0, NULL },
|
|
{ " :\n", "\n", 10, 0, NULL },
|
|
{ " "SPACE_STRING"\n", "\n", 10, 0, NULL },
|
|
|
|
/* Even less common */
|
|
{ ".\n", "\n", 15, 0, NULL },
|
|
{ ",\n", "\n", 15, 0, NULL },
|
|
{ "-\n", "\n", 15, 0, NULL },
|
|
{ "_\n", "\n", 15, 0, NULL },
|
|
{ "'\n", "\n", 15, 0, NULL },
|
|
{ "`\n", "\n", 15, 0, NULL },
|
|
{ "~\n", "\n", 15, 0, NULL },
|
|
{ ":\n", "\n", 15, 0, NULL },
|
|
{ SPACE_STRING"\n", "\n", 15, 0, NULL },
|
|
|
|
/* Wierd stuff that has happened */
|
|
{ "lJ", "U", 10, 0, NULL },
|
|
{ "ll", "U", 10, 0, NULL },
|
|
{ "l1", "U", 10, 0, NULL },
|
|
{ "il", "U", 10, 0, NULL }, /* Fairly common, actually */
|
|
{ "li", "U", 10, 0, NULL },
|
|
{ "l)", "U", 10, 0, NULL },
|
|
{ "Ll", "U", 10, 0, NULL },
|
|
{ "LI", "U", 10, 0, NULL },
|
|
{ "L1", "U", 10, 0, NULL },
|
|
|
|
{ "lo", "b", 10, 0, NULL },
|
|
{ "cl", "d", 10, 0, NULL },
|
|
{ "cliff", "diff", 2, 0, NULL },
|
|
{ "*\n", "*/\n", 10, 0, NULL },
|
|
|
|
/* That big black block has odd things happen to it */
|
|
{ "d", CONTIN_STRING, 10, 0, NULL },
|
|
{ "d\n", CONTIN_STRING"\n", 3, 0, NULL },
|
|
{ "S", CONTIN_STRING, 10, 0, NULL },
|
|
{ "S\n", CONTIN_STRING"\n", 3, 0, NULL },
|
|
|
|
/* Tab-stop wonders */
|
|
{ TAB_STRING, TAB_STRING"", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
{ TAB_STRING, TAB_STRING" ", 0, 0, TabFilter },
|
|
/* Some scan errors */
|
|
{ "D ", TAB_STRING"", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
{ "D ", TAB_STRING" ", 1, 5, TabFilter },
|
|
#if TAB_PAD_CHAR != ' '
|
|
#error Fix those tab patterns!
|
|
#endif
|
|
{ NULL, NULL, 0, 0, NULL }
|
|
};
|