1852 lines
50 KiB
C
1852 lines
50 KiB
C
/*
|
|
* repair.c -- Program which reconstructs scanned source, locates errors,
|
|
* and tries to fix most of them automatically. If it
|
|
* can't, it drops you into an editor on the appropriate
|
|
* line for manual correction.
|
|
*
|
|
* Given a file "foo", this appends corrected output to "foo.out"
|
|
* and copies remaining uncorrected input in "foo.in". If "foo.in"
|
|
* exists initially, "foo" is ignored and only "foo.in" is processed.
|
|
* Thus, re-running it repeatedly, possibly with other correction
|
|
* techniques in between, will result in correct output in "foo.out"
|
|
* and an empty "foo.in" file.
|
|
*
|
|
* This can automatically invoke an editor for you on the .in file
|
|
* and re-run itself. The editor is chosen in the first available way:
|
|
* - The -e command-line argument takes a printf() format string to
|
|
* format the editor invocation command line with the line number and
|
|
* filename. E.g. "emacs +%u %s". %u and %s must appear, in that order.
|
|
* - Failing that, the default is "$VISUAL +%u %s"
|
|
* - Failing that, the default is "$EDITOR +%u %s"
|
|
* - Failing that, the program prints the error location and exits.
|
|
* Specifying -e- forces this behaviour.
|
|
*
|
|
* Copyright (C) 1997 Pretty Good Privacy, Inc.
|
|
*
|
|
* Designed by Colin Plumb, Mark H. Weaver, and Philip R. Zimmermann
|
|
* Written by Colin Plumb
|
|
*
|
|
* $Id: repair.c,v 1.37 1997/11/14 08:39:40 mhw Exp $
|
|
*/
|
|
|
|
#include <assert.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
#include <errno.h>
|
|
#include <signal.h>
|
|
|
|
#include "util.h"
|
|
#include "heap.h"
|
|
#include "mempool.h"
|
|
#include "subst.h"
|
|
|
|
/*
|
|
* The internal form of a substitution. These are stored on
|
|
* lists indexed by the first character of the input substitution.
|
|
*/
|
|
typedef struct Substitution {
|
|
struct Substitution *next;
|
|
char const *input, *output;
|
|
size_t inlen, outlen;
|
|
HeapCost cost, cost2;
|
|
FilterFunc *filter;
|
|
unsigned int index; /* Consecutive serial numbers */
|
|
} Substitution;
|
|
|
|
struct Substitution const substNull = { NULL, "", "", 0, 0, 0, 0, 0 };
|
|
|
|
/*
|
|
* This might get increased later to support multiple classes of
|
|
* substitutions, for different contexts. Currently, only one
|
|
* is used.
|
|
*/
|
|
#define SUBST_CLASSES 1
|
|
|
|
/* List of substitutions, indexed by first character, plus a catch-all */
|
|
Substitution *substitutions[SUBST_CLASSES][0x101];
|
|
|
|
/*
|
|
* The pool of Substitution structures. Remains alive for the entire
|
|
* execution of the program.
|
|
*/
|
|
static MemPool substPool;
|
|
static Substitution *substFree;
|
|
static unsigned int substCount = 1; /* Preallcoate 0 to substNull */
|
|
static unsigned int substFirstDynamic;
|
|
#define SubstIsDynamic(s) ((s)->index >= substFirstDynamic)
|
|
/* Adjust the substitution based on noccurrences this page */
|
|
#define SubstAdjust(s,n) ((s)->cost = (s)->cost2)
|
|
/* Is this a nasty-line substitution? */
|
|
#define SubstIsNasty(s) ((s)->cost2 == COST_INFINITY)
|
|
|
|
/* Every possible single-character string */
|
|
static char substChars[512];
|
|
#define SubstString(c) (substChars+2*((c)&255))
|
|
|
|
/* Set the list of substitutions to empty */
|
|
static void
|
|
SubstInit(void)
|
|
{
|
|
unsigned int i, j;
|
|
|
|
memPoolInit(&substPool);
|
|
substFree = 0;
|
|
substCount = 1; /* Number zero is reserved for uncounted substitutions */
|
|
for (i = 0; i < elemsof(substitutions); i++)
|
|
for (j = 0; j < elemsof(*substitutions); j++)
|
|
substitutions[i][j] = NULL;
|
|
|
|
for (i = 0; i < 256; i++) {
|
|
substChars[2*i] = (char)i;
|
|
substChars[2*i+1] = 0;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* For dynamically allocated substitutions, we maintain a free list.
|
|
* Each substitution has a unique serial number. These are retained
|
|
* if a substitution goes on the free list, to keep substCount from
|
|
* ratcheting upwards indefinitely while still guaranteeing uniqueness.
|
|
*/
|
|
static Substitution *
|
|
SubstAlloc(void)
|
|
{
|
|
struct Substitution *subst = substFree;
|
|
|
|
if (subst) {
|
|
substFree = subst->next;
|
|
} else {
|
|
subst = memPoolNew(&substPool, Substitution);
|
|
subst->index = substCount++;
|
|
}
|
|
return subst;
|
|
}
|
|
|
|
static void
|
|
SubstFree(Substitution *subst)
|
|
{
|
|
subst->next = substFree;
|
|
substFree = subst;
|
|
}
|
|
|
|
static Substitution *
|
|
MakeSubst(char const *input, char const *output, HeapCost cost, HeapCost cost2,
|
|
FilterFunc *filter, int class)
|
|
{
|
|
struct Substitution *subst, **head;
|
|
|
|
subst = SubstAlloc();
|
|
subst->input = input;
|
|
subst->output = output;
|
|
subst->inlen = strlen(input);
|
|
subst->outlen = strlen(output);
|
|
subst->cost = cost;
|
|
subst->cost2 = cost2;
|
|
subst->filter = filter;
|
|
|
|
/*
|
|
* Ignore certain substitutions when printing stats.
|
|
* Identity substitutions, and the tab/space tweaking.
|
|
*/
|
|
if (strcmp(input, output) == 0 || strcmp(input, TAB_STRING) == 0 ||
|
|
(input[0] == ' ' && input[1] == 0 && output[0] == 0)) {
|
|
if (subst->index == substCount-1)
|
|
substCount--;
|
|
subst->index = 0; /* Evil hack */
|
|
}
|
|
|
|
head = &substitutions[class][input[class] & 255];
|
|
subst->next = *head;
|
|
*head = subst;
|
|
return subst;
|
|
}
|
|
|
|
/*
|
|
* For each entry in the raw array, turn { "abc", "def", 5" }
|
|
* into cost-5 mappings of "a"->"d", "b"->"e" and "c"->"f".
|
|
* If the output string is NULL, the characters are deleted.
|
|
* An input string of NULL is the end of table delimiter.
|
|
*/
|
|
static void
|
|
SubstSingle(struct RawSubst const *raw, int class)
|
|
{
|
|
char const *input, *output;
|
|
int i, o;
|
|
|
|
while (raw->input) {
|
|
input = raw->input;
|
|
output = raw->output;
|
|
assert(!output || strlen(input) == strlen(output));
|
|
|
|
while (*input) {
|
|
i = *input++;
|
|
o = output ? *output++ : 0;
|
|
(void)MakeSubst(SubstString(i), SubstString(o),
|
|
raw->cost, raw->cost2, raw->filter, class);
|
|
}
|
|
raw++;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* For each entry in the raw array, turn { "abc", "def", 5" }
|
|
* into a cost-5 mappings of "abc"->"def".
|
|
* An input string of NULL is the end of table delimiter.
|
|
*/
|
|
static void
|
|
SubstMultiple(struct RawSubst const *raw, int class)
|
|
{
|
|
while (raw->input) {
|
|
(void)MakeSubst(raw->input, raw->output, raw->cost, raw->cost2,
|
|
raw->filter, class);
|
|
raw++;
|
|
}
|
|
}
|
|
|
|
/* Build the substitutions table */
|
|
static void
|
|
SubstBuild(void)
|
|
{
|
|
SubstInit();
|
|
SubstSingle(substSingles, 0);
|
|
SubstMultiple(substMultiples, 0);
|
|
substFirstDynamic = substCount;
|
|
}
|
|
|
|
/*
|
|
* See if the desired substitution already exists
|
|
*/
|
|
static Substitution const *
|
|
SubstSearch(char const *in, size_t inlen, char const *out, size_t outlen,
|
|
int class)
|
|
{
|
|
Substitution *subst = substitutions[class][in[0] & 255];
|
|
|
|
for (; subst; subst = subst->next) {
|
|
if (subst->inlen == inlen && subst->outlen == outlen &&
|
|
memcmp(subst->input, in, inlen) == 0 &&
|
|
memcmp(subst->output, out, outlen) == 0)
|
|
return subst; /* Already exists */
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
|
|
/*
|
|
* Create a new dynamic substitution. First search to make
|
|
* sure it doesn't already esist.
|
|
*/
|
|
static Substitution const *
|
|
SubstDynamic(char const *in, char const *out, int class)
|
|
{
|
|
Substitution const *subst;
|
|
|
|
subst = SubstSearch(in, strlen(in), out, strlen(out), class);
|
|
return subst ? subst : MakeSubst(in, out, COST_INFINITY,
|
|
DYNAMIC_COST_LEARNED, NULL, class);
|
|
}
|
|
|
|
/*
|
|
* Search for the substitution, allocating one if not found.
|
|
* the input string is not null-terminated and needs to be copied to
|
|
* an allocated buffer. The output string can just be pointer-copied.
|
|
*/
|
|
static Substitution const *
|
|
SubstNasty(char const *in, size_t inlen, char const *out, int class)
|
|
{
|
|
Substitution const *subst;
|
|
char *string;
|
|
|
|
if ((subst = SubstSearch(in, inlen, out, strlen(out), class)) != NULL)
|
|
return subst;
|
|
|
|
if (!(string = malloc(inlen+1))) {
|
|
fputs("Out of memory!\n", stderr);
|
|
exit(1);
|
|
}
|
|
memcpy(string, in, inlen);
|
|
string[inlen] = 0;
|
|
return MakeSubst(string, out, COST_INFINITY, COST_INFINITY, NULL, class);
|
|
}
|
|
|
|
/*
|
|
* The state of the parser.
|
|
* Note that this is updated when a ParseNode is *removed* from the heap;
|
|
* ParseNodes that are in the heap have ParseStates that reflect the
|
|
* state before the substitution has been parsed; this is a copy of the
|
|
* parents' state, which is after the parsing.
|
|
*/
|
|
typedef struct ParseState {
|
|
CRC page_crc; /* Computed per-page CRC */
|
|
word16 flags; /* Flags; see below */
|
|
unsigned char pos; /* Position on the line */
|
|
} ParseState; /* 7 bytes, rounded to 8 */
|
|
|
|
/* Flags values */
|
|
#define PS_MASK_PAGENUM 0xC000 /* Digits in header page number (1..3) */
|
|
#define PS_SHIFT_PAGENUM 14 /* Shift for the above */
|
|
#define PS_FLAG_EOL 512 /* Expect \n next */
|
|
#define PS_FLAG_SPACE 256 /* Was last char a space? */
|
|
#define PS_FLAG_TAB 128 /* Tabbing over a column */
|
|
#define PS_FLAG_INHEADER 64 /* Current line is a header */
|
|
#define PS_FLAG_PASTHEADER 32 /* A previous line was a header */
|
|
#define PS_FLAG_BINWS 16 /* In whitespace after binary data */
|
|
#define PS_FLAG_BINEND 8 /* End of binary data */
|
|
#define PS_FLAG_DYNAMIC 4 /* Have used ECC this line */
|
|
#define PS_MASK_FORMAT 3 /* The encoding format (max of 3, for now) */
|
|
#define PS_SHIFT_FORMAT 0 /* Shift for the above */
|
|
|
|
/* Have we started on a second page? Used to force flushing of the first. */
|
|
#define InSecondHeader(ps) \
|
|
((~(ps)->flags & (PS_FLAG_INHEADER | PS_FLAG_PASTHEADER)) == 0)
|
|
|
|
#define PageNumDigits(pn) (((pn)->ps.flags & PS_MASK_PAGENUM) >> PS_SHIFT_PAGENUM)
|
|
#define PageNumDigitsIncrement(pn) ((pn)->ps.flags += 1<<PS_SHIFT_PAGENUM)
|
|
|
|
EncodeFormat const *registeredFormats[4];
|
|
|
|
/* Returns a small integer index */
|
|
static int
|
|
registerFormat(EncodeFormat const *format)
|
|
{
|
|
int i;
|
|
for (i = 0; i < (int)elemsof(registeredFormats); i++) {
|
|
if (registeredFormats[i] == format)
|
|
return i;
|
|
if (!registeredFormats[i]) {
|
|
registeredFormats[i] = format;
|
|
return i;
|
|
}
|
|
}
|
|
fputs("Registered formats table overflow!\n", stderr);
|
|
exit(1);
|
|
}
|
|
|
|
#define psFormat(ps) registeredFormats[((ps)->flags & PS_MASK_FORMAT)>>PS_SHIFT_FORMAT]
|
|
#define pnFormat(pn) psFormat(&(pn)->ps)
|
|
#define psSetFormat(ps, i) \
|
|
((ps)->flags = ((ps)->flags & ~PS_MASK_FORMAT) | i << PS_SHIFT_FORMAT)
|
|
|
|
typedef struct ParseNode {
|
|
HeapCost cost;
|
|
unsigned int refcnt;
|
|
struct ParseNode *parent;
|
|
char const *input;
|
|
struct Substitution const *subst;
|
|
struct ParseState ps;
|
|
} ParseNode; /* 32 bytes */
|
|
|
|
/* A handle for walking backwards through the output stream */
|
|
typedef struct OutputHandle {
|
|
ParseNode const *node;
|
|
char const *output;
|
|
unsigned int pos;
|
|
} OutputHandle;
|
|
|
|
/* Initialize the handle to point to a node (optionally, a position therein) */
|
|
static void
|
|
OutputInit(OutputHandle *oh, ParseNode const *node, char const *p)
|
|
{
|
|
oh->node = node;
|
|
oh->output = p ? p : node->subst->output + node->subst->outlen;
|
|
oh->pos = 0;
|
|
}
|
|
|
|
/* Get the *previous* byte */
|
|
static int
|
|
OutputGetPrev(OutputHandle *oh)
|
|
{
|
|
if (!oh->node)
|
|
return -1;
|
|
for (;;) {
|
|
if (oh->output != oh->node->subst->output) {
|
|
oh->pos++;
|
|
return *--oh->output & 255;
|
|
}
|
|
oh->node = oh->node->parent;
|
|
if (!oh->node)
|
|
break;
|
|
oh->output = oh->node->subst->output + oh->node->subst->outlen;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/* Return the character just before the node - trivial handy wrapper */
|
|
static int
|
|
OutputPrevChar(ParseNode const *node)
|
|
{
|
|
OutputHandle oh;
|
|
|
|
OutputInit(&oh, node, NULL);
|
|
return OutputGetPrev(&oh);
|
|
}
|
|
|
|
/*
|
|
* Unget the last retrieved character (and return it), or
|
|
* -1 if that is impossible. At least one character is
|
|
* always ungettable, but after that you're on your own.
|
|
*/
|
|
static int
|
|
OutputUnget(OutputHandle *oh)
|
|
{
|
|
if (oh->node && *oh->output) {
|
|
oh->pos--;
|
|
return *oh->output++ & 255;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/* The position is useful for comparing two OutputHandles. */
|
|
#define OutputPos(oh) ((oh)->pos)
|
|
|
|
/*
|
|
* Fill backwards from bufend until you hit the given char.
|
|
* Use -1 to get the whole buffer.
|
|
*/
|
|
static char *
|
|
OutputGetUntil(OutputHandle oh, char *bufend, int end)
|
|
{
|
|
int c;
|
|
|
|
while ((c = OutputGetPrev(&oh)) != -1 && c != end)
|
|
*--bufend = (char)c;
|
|
return bufend;
|
|
}
|
|
|
|
/*
|
|
* The per-page structure. This is actually global, but describes
|
|
* the values kept for each page processed.
|
|
*/
|
|
typedef struct PerPage {
|
|
CRC page_check;
|
|
char const *maxpos, *minpos;
|
|
unsigned int tabsize; /* Zero means this is a binary page */
|
|
unsigned int lines;
|
|
unsigned int retries; /* How many retires since last progress? */
|
|
unsigned int max_retries; /* Maximum number of retries needed. */
|
|
} PerPage;
|
|
|
|
PerPage perpage; /* The global */
|
|
|
|
static void
|
|
PerPageInit(char const *buf)
|
|
{
|
|
perpage.maxpos = perpage.minpos = buf;
|
|
perpage.page_check = 0;
|
|
perpage.tabsize = 4; /* The default */
|
|
perpage.lines = perpage.retries = perpage.max_retries = 0;
|
|
}
|
|
|
|
/*
|
|
* Is the tab substitution being looked at acceptable?
|
|
* It is if the length needed to make the tab width come out
|
|
* right, it is. Otherwise, it's junk.
|
|
*/
|
|
HeapCost
|
|
TabFilter(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c, tabpos;
|
|
OutputHandle oh;
|
|
|
|
(void)limit;
|
|
if (!perpage.tabsize)
|
|
return COST_INFINITY; /* No interest */
|
|
|
|
/* How wide should the tab be? */
|
|
tabpos = (int)((parent->ps.pos-PREFIX_LENGTH) % perpage.tabsize);
|
|
if ((int)subst->outlen != (int)perpage.tabsize - tabpos)
|
|
return COST_INFINITY;
|
|
/* The right number - cost if likely, cost2 if unlikely */
|
|
if (subst->cost == subst->cost2)
|
|
return subst->cost;
|
|
OutputInit(&oh, parent, NULL);
|
|
do {
|
|
c = OutputGetPrev(&oh);
|
|
} while (c == ' ');
|
|
return (c == TAB_CHAR) ? subst->cost : subst->cost2;
|
|
}
|
|
|
|
/*
|
|
* Return cost if near blanks (including end-of-line), cost2 if not, and
|
|
* the average of there is a blank on one side. There are additional
|
|
* versions for upper- and lower-case. _ is considered upper-case,
|
|
* as it's oftne used in acro identifiers.
|
|
*/
|
|
HeapCost
|
|
FilterNearBlanks(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c = OutputPrevChar(parent), score = (isspace(c) != 0);
|
|
char const *p = parent->input + parent->subst->inlen;
|
|
|
|
score += p == limit || isspace((unsigned char)*p) != 0;
|
|
return (subst->cost*score + subst->cost2*(2-score))/2;
|
|
}
|
|
|
|
HeapCost
|
|
FilterNearUpper(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c = OutputPrevChar(parent), score = (isupper(c) != 0 || c == '_');
|
|
char const *p = parent->input + subst->inlen;
|
|
|
|
score += p != limit && (isupper((unsigned char)*p) != 0 || *p == '_');
|
|
return (subst->cost*score + subst->cost2*(2-score))/2;
|
|
}
|
|
|
|
HeapCost
|
|
FilterNearXDigit(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c = OutputPrevChar(parent), score = (isxdigit(c) != 0);
|
|
char const *p = parent->input + subst->inlen;
|
|
|
|
score += p != limit && (isxdigit((unsigned char)*p) != 0);
|
|
return (subst->cost*score + subst->cost2*(2-score))/2;
|
|
}
|
|
|
|
HeapCost
|
|
FilterNearLower(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c = OutputPrevChar(parent), score = (islower(c) != 0);
|
|
char const *p = parent->input + subst->inlen;
|
|
|
|
score += p != limit && (islower((unsigned char)*p) != 0);
|
|
return (subst->cost*score + subst->cost2*(2-score))/2;
|
|
}
|
|
|
|
/*
|
|
* cost2 unless previous character was a space (' ' or SPACE_CHAR).
|
|
* Note the & 255, necessary since chars might be signed and SPACE_CHAR
|
|
* is in the high (negative) half, but c is an int in the range -1..255.
|
|
*/
|
|
HeapCost
|
|
FilterFollowsSpace(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c = OutputPrevChar(parent);
|
|
(void)limit;
|
|
return (c == ' ' || c == (SPACE_CHAR & 255)) ? subst->cost : subst->cost2;
|
|
}
|
|
|
|
/* cost2 unless previous character was duplicate of this one */
|
|
HeapCost
|
|
FilterAfterRepeat(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int c = OutputPrevChar(parent);
|
|
(void)limit;
|
|
return (c == subst->output[0]) ? subst->cost : subst->cost2;
|
|
}
|
|
|
|
/* cost2 unless probably the closing quote in a char constant */
|
|
HeapCost
|
|
FilterCharConst(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
OutputHandle oh;
|
|
int c;
|
|
|
|
(void)limit;
|
|
OutputInit(&oh, parent, NULL);
|
|
c = OutputGetPrev(&oh);
|
|
c = OutputGetPrev(&oh);
|
|
if (c == '\\')
|
|
c = OutputGetPrev(&oh);
|
|
return (c == '\'') ? subst->cost : subst->cost2;
|
|
}
|
|
|
|
/*
|
|
* If the identifier leading up to the current position contains
|
|
* an underscore, then it's likely the current position is an underscore
|
|
* as well; return cost. If it does not, it's less likely; return cost2.
|
|
*/
|
|
HeapCost
|
|
FilterLikelyUnderscore(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
OutputHandle oh;
|
|
int c;
|
|
|
|
(void)limit;
|
|
OutputInit(&oh, parent, NULL);
|
|
for (;;) {
|
|
c = OutputGetPrev(&oh);
|
|
if (c == '_')
|
|
return subst->cost;
|
|
if (!isalnum(c))
|
|
return subst->cost2;
|
|
}
|
|
}
|
|
|
|
/* cost2 unless the following chars seem to be a checksum */
|
|
HeapCost
|
|
FilterChecksumFollows(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
int i, score = 0;
|
|
char const *p = parent->input + subst->inlen;
|
|
|
|
if (limit - p < PREFIX_LENGTH)
|
|
return subst->cost2;
|
|
if (!isspace((unsigned char)p[PREFIX_LENGTH-1]))
|
|
return subst->cost2;
|
|
for (i = 0; i < PREFIX_LENGTH-1; i++)
|
|
score += (p[i] >= '0' && p[i] <= '9') + (p[i] >= 'a' && p[i] <= 'f');
|
|
i = (score >= PREFIX_LENGTH-2 ? subst->cost : subst->cost2);
|
|
/* Magic, since this function is perfect on binary files */
|
|
if (i < COST_INFINITY && perpage.tabsize == 0)
|
|
i = 0;
|
|
return i;
|
|
}
|
|
|
|
/* Manage a *big* pool of ParseNodes */
|
|
|
|
struct MemPool nodePool;
|
|
struct ParseNode *nodeFreeList = 0;
|
|
|
|
/* Prepare for node allocations */
|
|
static void
|
|
NodePoolInit(void)
|
|
{
|
|
memPoolInit(&nodePool);
|
|
nodeFreeList = NULL;
|
|
}
|
|
|
|
/* Free all nodes in one swell foop */
|
|
static void
|
|
NodePoolCleanup(void)
|
|
{
|
|
nodeFreeList = NULL;
|
|
memPoolEmpty(&nodePool);
|
|
}
|
|
|
|
/* Allcoate a new (uninitialized) node */
|
|
static struct ParseNode *
|
|
NodeAlloc(void)
|
|
{
|
|
struct ParseNode *node;
|
|
|
|
node = nodeFreeList;
|
|
if (node) {
|
|
nodeFreeList = node->parent;
|
|
return node;
|
|
}
|
|
return memPoolNew(&nodePool, ParseNode);
|
|
}
|
|
|
|
/* Free a node for reallocation */
|
|
static void
|
|
NodeFree(struct ParseNode *node)
|
|
{
|
|
node->parent = nodeFreeList;
|
|
nodeFreeList = node;
|
|
}
|
|
|
|
/*
|
|
* Decrement a node's reference count, freeing it and
|
|
* recursively decrementing its parent's if the count
|
|
* goes to zero.
|
|
*/
|
|
static void
|
|
NodeRelease(struct ParseNode *node)
|
|
{
|
|
struct ParseNode *parent;
|
|
assert(node->refcnt);
|
|
|
|
while (!--node->refcnt) {
|
|
parent = node->parent;
|
|
NodeFree(node);
|
|
if (!parent)
|
|
break;
|
|
node = parent;
|
|
}
|
|
}
|
|
|
|
/* Add nodes to the substitution tree */
|
|
|
|
/* Create a child of the given node, with the given properties. */
|
|
static ParseNode *
|
|
AddChild(ParseNode *parent, Substitution const *subst, HeapCost cost)
|
|
{
|
|
ParseNode *child;
|
|
|
|
if (cost == COST_INFINITY)
|
|
return 0;
|
|
|
|
cost += parent->cost;
|
|
child = NodeAlloc();
|
|
*child = *parent;
|
|
/* Child is just like parent, except... */
|
|
child->cost = cost;
|
|
child->refcnt = 1; /* The heap */
|
|
child->input += subst->inlen;
|
|
child->subst = subst;
|
|
child->parent = parent;
|
|
parent->refcnt++;
|
|
return child;
|
|
}
|
|
|
|
/* Hash table of nasty lines, indexed by per-line CRC */
|
|
struct NastyLine {
|
|
struct NastyLine *next;
|
|
char const *line;
|
|
CRC crc;
|
|
};
|
|
|
|
#define NASTY_HASH_SIZE 256
|
|
static struct NastyLine *nastyHash[NASTY_HASH_SIZE]; /* All zero */
|
|
|
|
struct MemPool nastyStrings, nastyStructs;
|
|
static CRCPoly const *nastyPoly = &crcCCITTPoly;
|
|
/*
|
|
* Create a new NastyString entry if it doesn't already exist.
|
|
* Note that this expects the string passed to end in a newline which
|
|
* IS hashed but NOT stored
|
|
*/
|
|
static struct NastyLine *
|
|
AddNasty(char const *string)
|
|
{
|
|
size_t len = strlen(string); /* Including newline */
|
|
CRC crc = CalculateCRC(nastyPoly, 0, (byte const *)string, len);
|
|
struct NastyLine *nasty, **nastyp = nastyHash + (crc % NASTY_HASH_SIZE);
|
|
char *line;
|
|
|
|
/* Search for an existing copy */
|
|
while ((nasty = *nastyp) != NULL) {
|
|
if (nasty->crc == crc &&
|
|
memcmp(nasty->line, string, len-1) == 0 &&
|
|
nasty->line[len-1] == 0)
|
|
return nasty;
|
|
nastyp = &nasty->next;
|
|
}
|
|
/* Create a new structure */
|
|
*nastyp = nasty = memPoolNew(&nastyStructs, struct NastyLine);
|
|
nasty->next = NULL;
|
|
nasty->line = line = memPoolAlloc(&nastyStrings, len, 1);
|
|
nasty->crc = crc;
|
|
memcpy(line, string, len-1);
|
|
line[len-1] = 0;
|
|
return nasty;
|
|
}
|
|
|
|
static void
|
|
RehashNasties(CRCPoly const *poly)
|
|
{
|
|
struct NastyLine *cur, *head;
|
|
CRC crc;
|
|
int i;
|
|
size_t len;
|
|
|
|
/* Put everything into one list and clear the hash table */
|
|
head = NULL;
|
|
for (i = 0; i < (int)elemsof(nastyHash); i++) {
|
|
while ((cur = nastyHash[i]) != NULL) {
|
|
nastyHash[i] = cur->next;
|
|
cur->next = head;
|
|
head = cur;
|
|
}
|
|
}
|
|
/* Recompute CRCs for the list and redistribute them among the buckets */
|
|
while (head) {
|
|
cur = head;
|
|
head = head->next;
|
|
len = strlen(cur->line);
|
|
crc = CalculateCRC(poly, 0, (byte const *)cur->line, len);
|
|
crc = AdvanceCRC(poly, crc, '\n');
|
|
cur->crc = crc;
|
|
cur->next = nastyHash[crc % NASTY_HASH_SIZE];
|
|
nastyHash[crc % NASTY_HASH_SIZE] = cur;
|
|
}
|
|
nastyPoly = poly;
|
|
}
|
|
|
|
/* Read in the nastylines file */
|
|
static void
|
|
ReadNasties(FILE *f)
|
|
{
|
|
char buf[128];
|
|
|
|
while (fgets(buf, sizeof(buf)-1, f))
|
|
AddNasty(buf);
|
|
}
|
|
|
|
/*
|
|
* Convert an encoded string to binary.
|
|
* No error checking is performed.
|
|
*/
|
|
static word32
|
|
GetWord32(EncodeFormat const *format, char const *buf, int len)
|
|
{
|
|
word32 w = 0;
|
|
|
|
while (len--)
|
|
w = (w<<format->bitsPerDigit) + DecodeDigit(format, *buf++);
|
|
return w;
|
|
}
|
|
|
|
/* Attempt nasty line substitutions */
|
|
static void
|
|
TryNasty(struct ParseNode *parent, Heap *heap, char const *limit)
|
|
{
|
|
struct NastyLine const *nasty;
|
|
struct Substitution const *subst;
|
|
struct ParseNode *child;
|
|
char const *end;
|
|
EncodeFormat const *format = pnFormat(parent);
|
|
OutputHandle oh;
|
|
char buf[4];
|
|
CRC check;
|
|
int i;
|
|
|
|
/* Make sure the lines are hashed properly */
|
|
if (nastyPoly != format->lineCRC)
|
|
RehashNasties(format->lineCRC);
|
|
|
|
/* Get the line to be replaced */
|
|
assert(parent->ps.pos == PREFIX_LENGTH);
|
|
end = memchr(parent->input, '\n', limit - parent->input);
|
|
if (!end)
|
|
end = limit;
|
|
/* Get the line's check value */
|
|
OutputInit(&oh, parent, NULL);
|
|
(void)OutputGetPrev(&oh);
|
|
i = 4;
|
|
while (--i)
|
|
buf[i] = OutputGetPrev(&oh);
|
|
check = GetWord32(format, buf, 4);
|
|
/* Find the matches */
|
|
nasty = nastyHash[check % NASTY_HASH_SIZE];
|
|
for (; nasty; nasty = nasty->next) {
|
|
if (nasty->crc == check) {
|
|
subst = SubstNasty(parent->input, end-parent->input,
|
|
nasty->line, 0);
|
|
if (subst) {
|
|
child = AddChild(parent, subst, NASTY_COST);
|
|
if (child) {
|
|
child->ps.flags |= PS_FLAG_DYNAMIC;
|
|
HeapInsert(heap, &child->cost);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Form all of a ParseNode's children and add them to the heap.
|
|
* Limit is the limit of allowable lookahead.
|
|
*/
|
|
static void
|
|
AddChildren(ParseNode *parent, Heap *heap, char const *limit)
|
|
{
|
|
char c = parent->input[0];
|
|
Substitution *subst = substitutions[0][c & 255];
|
|
ParseNode *child;
|
|
HeapCost cost;
|
|
|
|
/* If you want to make pure insertion substitutions, do that here */
|
|
|
|
assert(parent->input < limit); /* We always have at least one char */
|
|
|
|
while (subst) {
|
|
if (subst->inlen == 1 || /* Easy case */
|
|
((size_t)(limit-parent->input) >= subst->inlen &&
|
|
memcmp(subst->input, parent->input, subst->inlen) == 0))
|
|
{
|
|
cost = subst->cost;
|
|
if (subst->filter)
|
|
cost = subst->filter(parent, limit, subst);
|
|
child = AddChild(parent, subst, cost);
|
|
if (child)
|
|
HeapInsert(heap, &child->cost);
|
|
}
|
|
subst = subst->next;
|
|
}
|
|
|
|
/* Whole-line substitutions */
|
|
if (parent->ps.pos == PREFIX_LENGTH)
|
|
TryNasty(parent, heap, limit);
|
|
}
|
|
|
|
|
|
/* cost if this line has a dynamic substitution, otherwise cost2 */
|
|
HeapCost
|
|
FilterIsDynamic(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
(void)limit;
|
|
return (parent->ps.flags & PS_FLAG_DYNAMIC) ? subst->cost : subst->cost2;
|
|
}
|
|
|
|
/* cost if the current page is binary mode, else cost2 */
|
|
HeapCost
|
|
FilterIsBinary(struct ParseNode *parent, char const *limit,
|
|
struct Substitution const *subst)
|
|
{
|
|
(void)parent; (void)limit;
|
|
return perpage.tabsize ? subst->cost2 : subst->cost;
|
|
}
|
|
|
|
/* Debugging utility */
|
|
#define DEBUG 1 /* Set to 1 to print every line considered */
|
|
|
|
static size_t lastlen = 0;
|
|
|
|
static void
|
|
OverstrikeLine(char const *line, size_t len)
|
|
{
|
|
static size_t lastlen = 0;
|
|
int blanklen;
|
|
|
|
if (!line) {
|
|
if (lastlen)
|
|
putchar('\n');
|
|
lastlen = 0;
|
|
} else if (len || lastlen) {
|
|
if (len > 79)
|
|
len = 79;
|
|
blanklen = (lastlen > len) ? (int)lastlen - len : 0;
|
|
printf("%.*s%*s\r", (int)len, line, blanklen, "");
|
|
fflush(stdout);
|
|
lastlen = len;
|
|
}
|
|
}
|
|
|
|
/* Print everything, for debugging */
|
|
static void
|
|
PrintLine(char const *line, size_t len)
|
|
{
|
|
if (line) {
|
|
printf("%.*s\n", (int)len, line);
|
|
lastlen = 0;
|
|
}
|
|
}
|
|
|
|
static HeapCost ParseAdvanceString(Heap *heap, ParseNode *pn);
|
|
|
|
/*
|
|
* Copy the parsechain from tail up to root, and hang it off of
|
|
* newroot, adjusting the costs and parse state accordingly. Returns
|
|
* NULL if it is unable to (invalid parse, too expensive, etc.)
|
|
* Note that as per the convention, ParseAdvanceString is *not* called
|
|
* on the new tail node (but is called on all its parents).
|
|
*/
|
|
static ParseNode *
|
|
CopyParse(ParseNode const *tail, ParseNode const *root, ParseNode *newroot)
|
|
{
|
|
ParseNode *newtail, *parent;
|
|
|
|
if (tail == root)
|
|
return newroot;
|
|
parent = CopyParse(tail->parent, root, newroot);
|
|
if (!parent)
|
|
return NULL;
|
|
newtail = AddChild(parent, tail->subst, ParseAdvanceString(NULL, parent));
|
|
NodeRelease(parent);
|
|
return newtail;
|
|
}
|
|
|
|
/*
|
|
* Replace oldnode with a dynamic substitution for newchar, if possible,
|
|
* and fill in the chain down to "tail" just like before, but with no branches.
|
|
* Add the resultant ParseNode to the heap.
|
|
*/
|
|
static void
|
|
AddDynamic(Heap *heap, ParseNode const *oldnode, ParseNode const *tail,
|
|
int newchar)
|
|
{
|
|
Substitution const *subst = oldnode->subst;
|
|
ParseNode *newnode;
|
|
|
|
/* Only replace one-character substitutions */
|
|
if (subst->outlen != 1)
|
|
return;
|
|
|
|
subst = SubstDynamic(oldnode->subst->input, SubstString(newchar), 0);
|
|
newnode = AddChild(oldnode->parent, subst, -1); /* Try it immediately */
|
|
if (newnode) {
|
|
newnode->ps.flags |= PS_FLAG_DYNAMIC;
|
|
newnode = CopyParse(tail, oldnode, newnode);
|
|
if (newnode)
|
|
HeapInsert(heap, &newnode->cost);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Do the same, at a given (1-based) position on the line. Owing to
|
|
* a minor glitch, we must never count the tail node, as this has not
|
|
* been parsed yet, so its oldnode->ps.pos field is inaccurate.
|
|
*/
|
|
static void
|
|
AddDynamicAt(Heap *heap, int position, ParseNode const *tail, int newchar)
|
|
{
|
|
ParseNode const *oldnode = tail;
|
|
|
|
do {
|
|
oldnode = oldnode->parent;
|
|
} while (oldnode->ps.pos > position);
|
|
|
|
if (oldnode->ps.pos == position)
|
|
AddDynamic(heap, oldnode, tail, newchar);
|
|
}
|
|
|
|
/*
|
|
* Given the computed and input check fields, correct the header field
|
|
* that *ends* at the given pos. This can be used for both the line and
|
|
* page CRC errors by jyst changing the pos. (It relies on the fact
|
|
* that the page CRC fragment fits into the LineCRC type.)
|
|
* It also relies on the fact that the CRC is at most 4 digits.
|
|
*/
|
|
static void
|
|
ErrorCorrectHeader(Heap *heap, ParseNode const *tail, int pos,
|
|
EncodeFormat const *format, CRC crc, CRC check)
|
|
{
|
|
CRC syndrome = crc ^ check;
|
|
|
|
/* Find the position and the crc digit at that position */
|
|
while (syndrome >= (CRC)format->radix) {
|
|
if (syndrome & (CRC)(format->radix - 1))
|
|
return; /* uncorrectable */
|
|
pos--;
|
|
crc >>= format->bitsPerDigit;
|
|
syndrome >>= format->bitsPerDigit;
|
|
}
|
|
/* Paste in the correct digit */
|
|
AddDynamicAt(heap, pos, tail, EncodeDigit(format, crc & (format->radix-1)));
|
|
}
|
|
|
|
/*
|
|
* This function walks back through the line, and if the line CRC could be
|
|
* made correct by changing a character to another legal character,
|
|
* the change is added (on probation) to the substitution table.
|
|
*/
|
|
static void
|
|
ErrorCorrect(Heap *heap, OutputHandle oh, EncodeFormat const *format,
|
|
CRC syndrome)
|
|
{
|
|
ParseNode const *tail = oh.node;
|
|
int c;
|
|
|
|
syndrome = ReverseCRC(format->lineCRC, syndrome, 0);
|
|
while (oh.node->ps.pos > PREFIX_LENGTH) {
|
|
c = OutputGetPrev(&oh);
|
|
if (c == '\n' || c == -1) { /* Can't happen */
|
|
printf("Line ended at pos %d\n", oh.node->ps.pos);
|
|
return;
|
|
}
|
|
syndrome = ReverseCRC(format->lineCRC, syndrome, 0);
|
|
if (syndrome >= 0x100 || !substitutions[0][c^syndrome] ||
|
|
oh.node->subst->outlen != 1)
|
|
continue;
|
|
AddDynamic(heap, oh.node, tail, c^syndrome);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Parsing operations. This is a rather ugly and ad-hoc parser that
|
|
* knows a lot about the fixed-field format produced by the munge
|
|
* utility. The main state variable is the position in
|
|
* the line, which controls the expected header, the position of
|
|
* tab stops, and the maximum permissible line length.
|
|
*/
|
|
#define OCCASIONALLY 100
|
|
|
|
/* Set up a ParseState to top-of-page */
|
|
static void
|
|
ParseStateInit(ParseState *ps)
|
|
{
|
|
static struct ParseState const parseNull = { 0, 0, 0 };
|
|
*ps = parseNull;
|
|
}
|
|
|
|
/*
|
|
* Try to accept a newline, checking CRCs and even doing error-correction
|
|
* as appropriate.
|
|
*/
|
|
static int
|
|
ParseNewline(Heap *heap, ParseNode *pn, char const *string)
|
|
{
|
|
OutputHandle oh;
|
|
int c;
|
|
char debugbuf[PREFIX_LENGTH+LINE_LENGTH+10];
|
|
char *header, *body, *end;
|
|
int pos, width;
|
|
CRC crc, check;
|
|
ParseNode *temp;
|
|
static int occasionally = OCCASIONALLY;
|
|
EncodeFormat const *format = pnFormat(pn);
|
|
EncodeFormat const *headerFormat = &hexFormat;
|
|
|
|
/* Get the line into a buffer for analysis */
|
|
OutputInit(&oh, pn, string);
|
|
end = debugbuf + sizeof(debugbuf)-1;
|
|
header = OutputGetUntil(oh, end, '\n');
|
|
/* Strip leading and trailing whitespace */
|
|
while (header < end && isspace((unsigned char)header[0]))
|
|
header++;
|
|
while (header < end && isspace((unsigned char)end[-1]))
|
|
end--;
|
|
*end++ = '\n';
|
|
|
|
/* Start of checksummed area */
|
|
body = header + PREFIX_LENGTH;
|
|
/* Blank lines are missing the trainign space from the prefix */
|
|
if (body >= end)
|
|
body = end-1;
|
|
|
|
crc = CalculateCRC(format->lineCRC, 0, body, end-body);
|
|
check = GetWord32(format, header+2, 4);
|
|
if (crc != check) {
|
|
if (!--occasionally) {
|
|
OverstrikeLine(header, end-header-1);
|
|
occasionally = OCCASIONALLY;
|
|
}
|
|
/* Try ECC on the line */
|
|
/* If we haven't already tried ECC on the line... */
|
|
if (!(pn->ps.flags & PS_FLAG_DYNAMIC)) {
|
|
ErrorCorrectHeader(heap, pn, PREFIX_LENGTH-1, format, crc, check);
|
|
ErrorCorrect(heap, oh, format, crc ^ check);
|
|
}
|
|
return COST_INFINITY;
|
|
}
|
|
/* Good enough that we always print it */
|
|
OverstrikeLine(header, end-header-1);
|
|
|
|
/* Okay, now there are two cases - header line or running CRC */
|
|
if (pn->ps.flags & PS_FLAG_INHEADER) {
|
|
/* Do things for first header */
|
|
if (!(pn->ps.flags & PS_FLAG_PASTHEADER)) {
|
|
/* Check version number */
|
|
width = EncodedLength(headerFormat, HDR_VERSION_BITS);
|
|
c = (int)GetWord32(&hexFormat, body, width);
|
|
if (c != 0) {
|
|
fputs("Fatal: you need a newer version of repair"
|
|
" to process this file\n", stderr);
|
|
exit(1);
|
|
}
|
|
/* Suck in page CRC, after version & flags */
|
|
pos = width + EncodedLength(headerFormat, HDR_FLAG_BITS);
|
|
width = EncodedLength(headerFormat, format->pageCRC->bits);
|
|
perpage.page_check = GetWord32(&hexFormat, body+pos, width);
|
|
/* Get tab size */
|
|
pos += width;
|
|
width = EncodedLength(headerFormat, HDR_TABWIDTH_BITS);
|
|
perpage.tabsize = GetWord32(&hexFormat, body+pos, width);
|
|
|
|
/* Once we have the header, don't reconsider */
|
|
if (!(pn->ps.flags & PS_FLAG_PASTHEADER))
|
|
while ((temp = (ParseNode *)HeapGetMin(heap)) != NULL)
|
|
NodeRelease(temp);
|
|
pn->ps.page_crc = 0; /* Clear for top of page */
|
|
}
|
|
} else {
|
|
/* Check the CRC-32 */
|
|
crc = CalculateCRC(format->pageCRC, pn->ps.page_crc, body, end-body);
|
|
pn->ps.page_crc = crc;
|
|
crc = RunningCRCFromPageCRC(format, crc);
|
|
check = GetWord32(format, header, 2);
|
|
if (crc != check) {
|
|
if (!(pn->ps.flags & PS_FLAG_DYNAMIC))
|
|
ErrorCorrectHeader(heap, pn, 2, format, crc, check);
|
|
return COST_INFINITY;
|
|
}
|
|
}
|
|
|
|
/* Hey, it's correct! */
|
|
PrintLine(header, end-header-1);
|
|
|
|
/* Start next line */
|
|
pn->ps.pos = 0;
|
|
/* Clear most other flags, but we *have* got a header */
|
|
c = pn->ps.flags & PS_FLAG_DYNAMIC;
|
|
pn->ps.flags &= PS_FLAG_BINEND | PS_MASK_FORMAT;
|
|
pn->ps.flags |= PS_FLAG_PASTHEADER;
|
|
/*
|
|
* Give a bonus to the next line for having completed this one,
|
|
* less if it was dynamically fixed.
|
|
*/
|
|
return c ? COST_LINE : COST_LINE*2/3;
|
|
}
|
|
|
|
/*
|
|
* Advance the parse state with pointed-to character. Returns
|
|
* COST_INFINITY if an impossible state is reached, otherwise returns a
|
|
* cost value. (Normally 0, this can be increased to penalize unlikely
|
|
* output combinations to nudge the correction in a certain direction.)
|
|
*/
|
|
static HeapCost
|
|
ParseAdvance(Heap *heap, ParseNode *pn, char const *string)
|
|
{
|
|
int i, retval = 0;
|
|
char c = *string;
|
|
EncodeFormat const *format = pnFormat(pn);
|
|
|
|
/*
|
|
* Insist on spaces being correctly converted to SPACE_CHAR.
|
|
* There's a little irregularity just before EOL.
|
|
* Line contiunation and formfeed are also only legal at EOL.
|
|
*/
|
|
if (c == ' ') {
|
|
if (pn->ps.flags & PS_FLAG_SPACE && !(pn->ps.flags & PS_FLAG_TAB))
|
|
pn->ps.flags |= PS_FLAG_EOL;
|
|
pn->ps.flags |= PS_FLAG_SPACE;
|
|
} else if (pn->ps.flags & PS_FLAG_EOL) {
|
|
if (c != '\n')
|
|
return COST_INFINITY;
|
|
} else if (c == SPACE_CHAR) {
|
|
if (!(pn->ps.flags & PS_FLAG_SPACE))
|
|
pn->ps.flags |= PS_FLAG_EOL;
|
|
} else if (c == CONTIN_CHAR || c == FORMFEED_CHAR) {
|
|
pn->ps.flags |= PS_FLAG_EOL;
|
|
} else {
|
|
pn->ps.flags &= ~PS_FLAG_SPACE;
|
|
}
|
|
|
|
switch (pn->ps.pos) {
|
|
case 0:
|
|
if (c == ' ' || c == '\n') {
|
|
break; /* Ignore ws and blank lines completely */
|
|
} else if (c == '\f' || c == HDR_PREFIX_CHAR) {
|
|
/* Start of a new page */
|
|
pn->ps.flags |= PS_FLAG_INHEADER; /* Expect header next */
|
|
if (c == '\f')
|
|
break;
|
|
/* And fall through to increment pos */
|
|
} else if (pn->ps.flags & PS_FLAG_INHEADER ||
|
|
pn->ps.flags & PS_FLAG_BINEND ||
|
|
!(pn->ps.flags & PS_FLAG_PASTHEADER) ||
|
|
DecodeDigit(format, c) < 0) {
|
|
return COST_INFINITY; /* Various illegal cases */
|
|
}
|
|
pn->ps.pos++;
|
|
break;
|
|
case 1:
|
|
if ((pn->ps.flags & PS_FLAG_INHEADER)) {
|
|
format = FindFormat(c); /* Second char of header */
|
|
if (!format)
|
|
return COST_INFINITY;
|
|
i = registerFormat(format);
|
|
psSetFormat(&pn->ps, i);
|
|
pn->ps.pos++;
|
|
break;
|
|
}
|
|
if (DecodeDigit(format, c) < 0)
|
|
return COST_INFINITY; /* Illegal */
|
|
pn->ps.pos++;
|
|
break;
|
|
case 2:
|
|
case 3:
|
|
case 4:
|
|
#if PREFIX_LENGTH != 7
|
|
#error fix this code
|
|
#endif
|
|
case PREFIX_LENGTH-2:
|
|
if (DecodeDigit(format, c) < 0)
|
|
return COST_INFINITY; /* Illegal */
|
|
pn->ps.pos++;
|
|
break;
|
|
case PREFIX_LENGTH-1:
|
|
if (c == ' ') {
|
|
pn->ps.pos++;
|
|
break;
|
|
} else if (c != '\n') {
|
|
return COST_INFINITY;
|
|
}
|
|
/* Blank lines may be missing this space char */
|
|
/*FALLTHROUGH*/
|
|
/* The normal line starts here, at position 7 */
|
|
default:
|
|
if (pn->ps.flags & PS_FLAG_INHEADER) { /* Header line */
|
|
/* Format is "--abcd 0123456789abcdef012 Page %u of %s" */
|
|
int off = pn->ps.pos - (PREFIX_LENGTH+HDR_ENC_LENGTH);
|
|
/* Offset relative to end of hex header */
|
|
if (off < 0) {
|
|
if (HexDigitValue(c & 255) < 0)
|
|
return COST_INFINITY;
|
|
} else if (off < 6) {
|
|
if (c != " Page "[off]) /* Yes, this is legal C */
|
|
return COST_INFINITY;
|
|
} else if (off == 6) {
|
|
if (c < '1' || c > '9') /* First digit of page no. */
|
|
return COST_INFINITY;
|
|
} else {
|
|
/* Re-base to end of scanned part of page number */
|
|
off -= 7 + PageNumDigits(pn);
|
|
if (off == 0) {
|
|
if (c >= '0' && c <= '9' && PageNumDigits(pn) < 3)
|
|
PageNumDigitsIncrement(pn);
|
|
else if (c != ' ')
|
|
return COST_INFINITY;
|
|
} else if (off < 4) {
|
|
if (c != " of "[off])
|
|
return COST_INFINITY;
|
|
} else if (off == 4) {
|
|
if (!isgraph(c))
|
|
return COST_INFINITY;
|
|
} else if (c < ' ' || (c & 255) > '~') {
|
|
if (c != '\n')
|
|
return COST_INFINITY;
|
|
return ParseNewline(heap, pn, string);
|
|
}
|
|
}
|
|
} else if (!perpage.tabsize) { /* Radix-64 line */
|
|
/* Line is "RlNFVF9UQU== \n" */
|
|
if (isspace(c & 255)) {
|
|
if (!(pn->ps.flags & PS_FLAG_BINWS)) {
|
|
if ((pn->ps.pos - PREFIX_LENGTH) % 4 != 0)
|
|
return COST_INFINITY;
|
|
pn->ps.flags |= PS_FLAG_BINWS;
|
|
if (pn->ps.pos - PREFIX_LENGTH < BYTES_PER_LINE*4/3)
|
|
pn->ps.flags |= PS_FLAG_BINEND;
|
|
}
|
|
if (c == '\n')
|
|
return ParseNewline(heap, pn, string);
|
|
} else if (pn->ps.flags & PS_FLAG_BINWS) {
|
|
return COST_INFINITY;
|
|
} else if (c == RADIX64_END_CHAR) {
|
|
if ((pn->ps.pos - PREFIX_LENGTH) % 4 < 2)
|
|
return COST_INFINITY;
|
|
pn->ps.flags |= PS_FLAG_BINEND;
|
|
} else if (pn->ps.flags & PS_FLAG_BINEND) {
|
|
return COST_INFINITY;
|
|
} else if (Radix64DigitValue(c) < 0) {
|
|
return COST_INFINITY;
|
|
}
|
|
} else { /* Normal line */
|
|
/* Make sure tab stops come out right */
|
|
if (pn->ps.flags & PS_FLAG_TAB) {
|
|
if (((pn->ps.pos - PREFIX_LENGTH) % perpage.tabsize) == 0)
|
|
pn->ps.flags &= ~PS_FLAG_TAB;
|
|
else if (c != TAB_PAD_CHAR && c != '\n') {
|
|
return COST_INFINITY; /* Illegal */
|
|
}
|
|
}
|
|
/*
|
|
* Yes, this code has hard-coded ASCII assumptions
|
|
* It knows that the acceptable range of '\n', ' '..'~',
|
|
* TAB_CHAR, FORMFEED_CHAR is in that order.
|
|
* Signed char machines have it backwards, to be confusing.
|
|
*/
|
|
if ((c & 255) < ' ') {
|
|
/* Newline! (Or something illegal) */
|
|
if (c != '\n')
|
|
return COST_INFINITY;
|
|
return ParseNewline(heap, pn, string);
|
|
}
|
|
/* A normal character */
|
|
if ((c & 255) > '~') {
|
|
if (pn->ps.flags & PS_FLAG_INHEADER)
|
|
return COST_INFINITY; /* Illegal */
|
|
if (c == TAB_CHAR)
|
|
pn->ps.flags |= PS_FLAG_TAB;
|
|
else if (c != FORMFEED_CHAR && c != SPACE_CHAR &&
|
|
c != CONTIN_CHAR)
|
|
return COST_INFINITY; /* Illegal */
|
|
}
|
|
}
|
|
if (++pn->ps.pos > PREFIX_LENGTH + LINE_LENGTH)
|
|
return COST_INFINITY;
|
|
break;
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
/*
|
|
* Run the parser over the string in a ParseNode (using repeated calls
|
|
* to ParseAdvance). Return the penalty cost, or COST_INFINITY if
|
|
* it's impossible
|
|
*/
|
|
static HeapCost
|
|
ParseAdvanceString(Heap *heap, ParseNode *pn)
|
|
{
|
|
HeapCost cost, total = 0;
|
|
char const *string = pn->subst->output;
|
|
|
|
while (*string) {
|
|
cost = ParseAdvance(heap, pn, string++);
|
|
if (cost == COST_INFINITY)
|
|
return cost;
|
|
total += cost;
|
|
}
|
|
return total;
|
|
}
|
|
|
|
static unsigned int *globalStats = NULL;
|
|
static unsigned globalSize = 0;
|
|
static unsigned globalEdits = 0;
|
|
|
|
/*
|
|
* This walks the list of substitutions, performing two tasks with
|
|
* the statistics gathered.
|
|
*
|
|
* First, although not essential, it prints any interesting changes
|
|
* (non-identity substitutions) made, and a count of the total number
|
|
* of substitutions (including identity) as an approximate character count.
|
|
*
|
|
* Second, it does maintenance on dynamic (learned during program
|
|
* execution) substitutions. It discards any substitutions that end
|
|
* up unused, and computes nice costs for the others, based on the
|
|
* global (per-file) statistics.
|
|
*
|
|
* (This function is also called at the end to print the per-file stats,
|
|
* which does redundant weight adjustment, but it's harmless.)
|
|
*/
|
|
static void
|
|
UseStats(unsigned int *stats, FILE *log)
|
|
{
|
|
unsigned int i, j, n, changes = 0;
|
|
unsigned long grand = 0;
|
|
Substitution *s, **sp;
|
|
|
|
if (!stats)
|
|
return;
|
|
|
|
/* Yes, this loop is permuted on purpose */
|
|
for (j = 0; j < elemsof(*substitutions); j++) {
|
|
for (i = 0; i < elemsof(substitutions); i++) {
|
|
sp = &substitutions[i][j];
|
|
while ((s = *sp) != 0) {
|
|
grand += n = stats[s->index];
|
|
/* Retain or purge dynamic substitutions, depending. */
|
|
if (SubstIsDynamic(s)) {
|
|
if (n) {
|
|
SubstAdjust(s, n);
|
|
} else if (!globalStats[s->index]) {
|
|
/* Forget unused dynamic substitutions */
|
|
*sp = s->next;
|
|
if (SubstIsNasty(s))
|
|
free((char *)s->input); /* Dynamically allocated */
|
|
SubstFree(s);
|
|
continue;
|
|
}
|
|
}
|
|
sp = &s->next;
|
|
/*
|
|
* Print interesting substitutions. Some boring substitutions,
|
|
* flagged with an index value of zero, are not printed.
|
|
*/
|
|
if (!s->index || !n)
|
|
continue;
|
|
changes += n;
|
|
fprintf(log, "\t%2ux \"%.*s\"%*s-> \"%.*s\"%*s(cost ",
|
|
stats[s->index], (int)s->inlen, s->input,
|
|
s->inlen>3 ? 0 : 3-(int)s->inlen, "",
|
|
(int)s->outlen, s->output,
|
|
s->outlen>3 ? 0 : 3-(int)s->outlen, "");
|
|
fprintf(log, s->cost == COST_INFINITY ? "-" : "%d", s->cost);
|
|
if (s->filter)
|
|
fprintf(log, s->cost2 == COST_INFINITY ? "/-" : "/%d",
|
|
s->cost2);
|
|
fputs(SubstIsDynamic(s) ? ") ** LEARNED **\n" : ")\n", log);
|
|
}
|
|
}
|
|
}
|
|
fprintf(log, "\tTotal: %u changes (out of %lu)\n", changes, grand);
|
|
}
|
|
|
|
static void
|
|
DoStats(ParseNode const *node, unsigned int page, FILE *log)
|
|
{
|
|
unsigned int *stats;
|
|
unsigned int n;
|
|
|
|
/* Enlarge global stats if needed */
|
|
if (globalSize < substCount) {
|
|
stats = realloc(globalStats, substCount * sizeof(*stats));
|
|
if (!stats) {
|
|
fputs("Fatal error: out of memory for stats!\n", stderr);
|
|
exit(1);
|
|
}
|
|
for (n = globalSize; n < substCount; n++)
|
|
stats[n] = 0;
|
|
globalStats = stats;
|
|
globalSize = substCount;
|
|
}
|
|
|
|
/* Allocate per-page stats */
|
|
stats = calloc(substCount, sizeof(*stats));
|
|
if (!stats) {
|
|
fputs("Fatal error: out of memory for stats!\n", stderr);
|
|
exit(1);
|
|
}
|
|
/* Cheat and assume that calloc() initializes unsigned ints to zero */
|
|
while (node) {
|
|
stats[node->subst->index]++;
|
|
node = node->parent;
|
|
}
|
|
|
|
/* Keep the global counts accurate */
|
|
for (n = 0; n < substCount; n++)
|
|
globalStats[n] += stats[n];
|
|
|
|
fprintf(log, "Page %u substitutions:\n", page);
|
|
UseStats(stats, log);
|
|
|
|
free(stats);
|
|
}
|
|
|
|
/* Spit out a page of data (needs work). Returns number of lines */
|
|
static unsigned
|
|
PrintPage(OutputHandle oh, FILE *out)
|
|
{
|
|
char pagebuf[PAGE_BUFFER_SIZE];
|
|
char *p1; /* Beginning of current line */
|
|
char *p2; /* End of current line (WS stripped) */
|
|
char *p3; /* End of current line (newline) */
|
|
char *p4; /* End of all output */
|
|
unsigned lines = 0;
|
|
|
|
p4 = pagebuf + sizeof(pagebuf);
|
|
p1 = OutputGetUntil(oh, p4, -1);
|
|
|
|
/* Output the lines without leading & trailing whitespace */
|
|
while (p1 < p4) {
|
|
/* Identify the line */
|
|
p3 = memchr(p1, '\n', p4-p1);
|
|
if (!p3)
|
|
p3 = p4;
|
|
/* Delete leading whitespacee */
|
|
while (isspace((unsigned char)*p1) && p1 < p3)
|
|
p1++;
|
|
/* Delete trailing whitepace */
|
|
p2 = p3;
|
|
while (isspace((unsigned char)p2[-1]) && p1 < p2)
|
|
p2--;
|
|
/* Spit out this line */
|
|
fwrite(p1, 1, (size_t)(p2-p1), out);
|
|
putc('\n', out);
|
|
/* Advance p1 past the newline */
|
|
p1 = p3 + 1;
|
|
lines++;
|
|
}
|
|
return lines;
|
|
}
|
|
|
|
static volatile int interrupt = 0;
|
|
static void (* volatile oldhandler)(int) = SIG_DFL;
|
|
|
|
static void inthandler(int sig)
|
|
{
|
|
if (++interrupt > 2)
|
|
(void)signal(sig, oldhandler);
|
|
}
|
|
|
|
/*
|
|
* Given a buffer, process a page from it and try to write a corrected page to
|
|
* the out file. Return the number of bytes accessed. (0 if it was unable
|
|
* to make any corrections.)
|
|
*/
|
|
static size_t
|
|
DoPage(char const *buf, size_t len, FILE *out, unsigned int page, FILE *log)
|
|
{
|
|
ParseNode *node;
|
|
Heap heap;
|
|
HeapCost cost;
|
|
OutputHandle oh;
|
|
void (*sighandler)(int);
|
|
|
|
HeapInit(&heap, 1000);
|
|
NodePoolInit();
|
|
PerPageInit(buf);
|
|
|
|
/* Initialize signal handling */
|
|
interrupt = 0;
|
|
sighandler = signal(SIGINT, inthandler);
|
|
if (sighandler != inthandler)
|
|
oldhandler = sighandler;
|
|
|
|
/* Make a root node */
|
|
node = NodeAlloc();
|
|
node->cost = 0;
|
|
node->refcnt = 1;
|
|
node->input = buf;
|
|
node->subst = &substNull;
|
|
ParseStateInit(&node->ps);
|
|
node->parent = NULL;
|
|
|
|
HeapInsert(&heap, &node->cost);
|
|
|
|
/* The main loop: try to extend the current parse. */
|
|
while ((node = (ParseNode *)HeapGetMin(&heap)) != NULL) {
|
|
cost = ParseAdvanceString(&heap, node);
|
|
if (cost != COST_INFINITY) {
|
|
/* End of the file, or hit a second header line? */
|
|
if (node->input == buf+len || InSecondHeader(&node->ps)) {
|
|
/* Try to wrap up page, if page CRC works */
|
|
if (node->ps.page_crc == perpage.page_check) {
|
|
/* Success! */
|
|
HeapDestroy(&heap);
|
|
OutputInit(&oh, node, NULL);
|
|
OverstrikeLine("", 0);
|
|
|
|
if (InSecondHeader(&node->ps)) {
|
|
/* Back up to last newline */
|
|
OutputInit(&oh, node, NULL);
|
|
while (OutputGetPrev(&oh) != '\n')
|
|
;
|
|
OutputUnget(&oh);
|
|
}
|
|
/* oh points to node that emitted last char on page */
|
|
len = oh.node->input - buf; /* Chars eaten this page */
|
|
perpage.lines = PrintPage(oh, out);
|
|
DoStats(oh.node, page, log);
|
|
NodePoolCleanup();
|
|
return len;
|
|
}
|
|
} else {
|
|
/* Keep working on the page */
|
|
node->cost = cost += node->cost;
|
|
if (node->input > perpage.maxpos) {
|
|
perpage.maxpos = perpage.minpos = node->input;
|
|
if (perpage.max_retries < perpage.retries)
|
|
perpage.max_retries = perpage.retries;
|
|
perpage.retries = 0; /* Made progress */
|
|
} else if (node->input < perpage.minpos) {
|
|
perpage.minpos = node->input; /* Furthest backtrack */
|
|
}
|
|
++perpage.retries;
|
|
if (heap.numElems > MAX_HEAP || interrupt)
|
|
HeapDestroy(&heap);
|
|
else
|
|
AddChildren(node, &heap, buf+len);
|
|
}
|
|
}
|
|
NodeRelease(node);
|
|
}
|
|
/* Failed! */
|
|
OverstrikeLine(NULL, 0);
|
|
puts("Stopping for manual edit.");
|
|
|
|
NodePoolCleanup();
|
|
/* Get rid of the dynamic substitutions */
|
|
DoStats(NULL, page, log);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* The magic file-shuffling routine. */
|
|
static int
|
|
RepairFile(char const *name, char const *editor, char const *nastylines)
|
|
{
|
|
char buf[PAGE_BUFFER_SIZE];
|
|
char *filename;
|
|
char const *p;
|
|
size_t namelen;
|
|
FILE *in = 0, *out = 0, *dump = 0, *log = 0;
|
|
size_t inbytes; /* Bytes in input buffer */
|
|
size_t outbytes; /* Bytes taken from input buffer */
|
|
unsigned int pages = 0; /* # of pages processed */
|
|
unsigned int lines = 0; /* # of lines processed (until trouble) */
|
|
unsigned int minline, maxline; /* Where is the error? */
|
|
int giveup; /* Have we had to abort corrections? */
|
|
int err; /* Copy of errno for returns */
|
|
|
|
globalSize = 0; /* Reset global stats */
|
|
|
|
namelen = strlen(name);
|
|
if (!(filename = malloc(namelen+10))) {
|
|
p = "Unable to allocate memory\n";
|
|
goto error;
|
|
}
|
|
|
|
memcpy(filename, name, namelen);
|
|
strcpy(filename+namelen, ".log");
|
|
puts(filename);
|
|
if (!(log = fopen(filename, "at"))) {
|
|
p = "Unable to open log file \"%s\"\n";
|
|
goto error;
|
|
}
|
|
|
|
strcpy(filename+namelen, ".out");
|
|
puts(filename);
|
|
if (!(out = fopen(filename, "at"))) {
|
|
p = "Unable to open output file \"%s\"\n";
|
|
goto error;
|
|
}
|
|
|
|
retry:
|
|
/* Read in any new nasty lines */
|
|
if (!(in = fopen(nastylines, "rt"))) {
|
|
fprintf(stderr, "Unable to open nasty lines file \"%s\"\n", nastylines);
|
|
} else {
|
|
ReadNasties(in);
|
|
fclose(in);
|
|
}
|
|
/* Try to open input file - .in or original */
|
|
p = filename;
|
|
strcpy(filename+namelen, ".in");
|
|
if (!(in = fopen(filename, "rt"))) {
|
|
if (!(in = fopen(name, "rt"))) {
|
|
filename[namelen] = 0;
|
|
p = "Unable to open input file \"%s\"\n";
|
|
goto error;
|
|
}
|
|
p = name;
|
|
}
|
|
printf("Repairing from %s\n", p);
|
|
strcpy(filename+namelen, ".dmp");
|
|
if (!(dump = fopen(filename, "wt"))) {
|
|
p = "Unable to open output file \"%s\"\n";
|
|
goto error;
|
|
}
|
|
|
|
giveup = 0;
|
|
inbytes = 0; /* Bytes already at the front of the buffer */
|
|
/* Append more data from the file */
|
|
while ((inbytes += fread(buf+inbytes, 1, sizeof(buf)-inbytes, in)) != 0) {
|
|
if (giveup) {
|
|
/* Giving up mode - just copy through */
|
|
outbytes = fwrite(buf, 1, inbytes, dump);
|
|
if (!outbytes) {
|
|
p = "Error writing dump file!\n";
|
|
goto error;
|
|
}
|
|
} else {
|
|
outbytes = DoPage(buf, inbytes, out, pages+1, log);
|
|
NodePoolCleanup();
|
|
if (outbytes) {
|
|
pages++;
|
|
lines += perpage.lines;
|
|
} else { /* Failed */
|
|
/* Find range of backtracking for error location */
|
|
minline = 1;
|
|
for (p = buf; p < perpage.minpos; p++)
|
|
minline += (*p == '\n');
|
|
for (maxline = minline; p < perpage.maxpos; p++)
|
|
maxline += (*p == '\n');
|
|
giveup = 1;
|
|
}
|
|
}
|
|
/* Fewer bytes now in the buffer */
|
|
inbytes -= outbytes;
|
|
/* Move those bytes to the front again */
|
|
memmove(buf, buf+outbytes, inbytes);
|
|
}
|
|
|
|
fclose(in);
|
|
in = 0;
|
|
fclose(dump);
|
|
dump = 0;
|
|
|
|
/* Okay, let's get tricky */
|
|
memcpy(buf, name, namelen);
|
|
strcpy(buf+namelen, ".dmp");
|
|
strcpy(filename+namelen, ".in");
|
|
|
|
/* teun: MS Visual C doesn't rename on top of existing file; remove it */
|
|
if (remove(filename) != 0) {
|
|
err = errno;
|
|
fprintf(stderr, "Warning deleting %s\n", filename);
|
|
}
|
|
|
|
if (rename(buf, filename) != 0) {
|
|
err = errno;
|
|
fclose(out);
|
|
fclose(log);
|
|
/* teun: corrected buf, filename order. This cost me an hour */
|
|
fprintf(stderr, "Error renaming %s -> %s\n", buf, filename);
|
|
return err;
|
|
}
|
|
|
|
/* This code is spaghetti - is there a cleaner way? */
|
|
if (giveup) {
|
|
printf("Error in %s, lines %u-%u\n", filename, minline, maxline);
|
|
fprintf(log, "Error in %s, lines %u-%u\n", filename, minline, maxline);
|
|
if (interrupt > 1)
|
|
goto manual;
|
|
if (editor) {
|
|
if (strcmp(editor, "-") == 0)
|
|
goto manual;
|
|
sprintf(buf, editor, maxline, filename);
|
|
} else {
|
|
p = getenv("VISUAL");
|
|
if (!p)
|
|
p = getenv("EDITOR");
|
|
if (!p)
|
|
goto manual;
|
|
sprintf(buf, "%s +%u %s\n", p, maxline, filename);
|
|
}
|
|
printf("Executing %s\n", buf);
|
|
globalEdits++;
|
|
if (system(buf) == 0)
|
|
goto retry;
|
|
fputs("Edit failed - aborting\n", stderr);
|
|
manual:
|
|
puts("Please fix the error by hand and re-run repair.");
|
|
}
|
|
|
|
fclose(out);
|
|
free(filename);
|
|
|
|
fprintf(log, "\n%u lines successfully processed.\n", lines);
|
|
fprintf(log, "Overall substitutions (%u pages):\n", pages);
|
|
UseStats(globalStats, log);
|
|
printf("%u manual edits required\n", globalEdits);
|
|
fclose(log);
|
|
|
|
return 0;
|
|
|
|
error:
|
|
err = errno;
|
|
if (log) fclose(log);
|
|
if (dump) fclose(dump);
|
|
if (out) fclose(out);
|
|
if (in) fclose(in);
|
|
fprintf(stderr, p, filename);
|
|
free(filename);
|
|
return err;
|
|
}
|
|
|
|
/* Process the command line, calling RepairFile as needed. */
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
int result = 0;
|
|
int i;
|
|
char const *editor = NULL;
|
|
char const *nastylines = "nastylines";
|
|
|
|
InitUtil();
|
|
SubstBuild();
|
|
memPoolInit(&nastyStructs);
|
|
memPoolInit(&nastyStrings);
|
|
|
|
/* Process leading flags */
|
|
for (i = 1; i < argc && argv[i][0] == '-'; i++) {
|
|
if (argv[i][1] == '-' && argv[i][2] == 0) {
|
|
i++;
|
|
break;
|
|
} else if (argv[i][1] == 'e') {
|
|
editor = argv[i][2] ? argv[i]+2 : argv[++i];
|
|
} else if (argv[i][1] == 'l') {
|
|
nastylines = argv[i][2] ? argv[i]+2 : argv[++i];
|
|
} else {
|
|
editor = argv[i][2] ? argv[i]+2 : argv[++i];
|
|
fprintf(stderr, "ERROR: Unrecognized option %s\n", argv[i]);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
/* Process files */
|
|
for (; i < argc; i++) {
|
|
result = RepairFile(argv[i], editor, nastylines);
|
|
if (result != 0) {
|
|
fprintf(stderr, "Fatal error: %s\n", strerror(result));
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Local Variables:
|
|
* tab-width: 4
|
|
* End:
|
|
* vi: ts=4 sw=4
|
|
* vim: si
|
|
*/
|