ocr-tools/tools/repair.c

/*
 * repair.c -- Program which reconstructs scanned source, locates errors,
 *			   and tries to fix most of them automatically.  If it
 *             can't, it drops you into an editor on the appropriate
 *             line for manual correction.
 *
 * Given a file "foo", this appends corrected output to "foo.out"
 * and copies remaining uncorrected input in "foo.in".  If "foo.in"
 * exists initially, "foo" is ignored and only "foo.in" is processed.
 * Thus, re-running it repeatedly, possibly with other correction
 * techniques in between, will result in correct output in "foo.out"
 * and an empty "foo.in" file.
 *
 * This can automatically invoke an editor for you on the .in file
 * and re-run itself.  The editor is chosen in the first available way:
 * - The -e command-line argument takes a printf() format string to
 *   format the editor invocation command line with the line number and
 *   filename.  E.g. "emacs +%u %s".  %u and %s must appear, in that order.
 * - Failing that, the default is "$VISUAL +%u %s"
 * - Failing that, the default is "$EDITOR +%u %s"
 * - Failing that, the program prints the error location and exits.
 *   Specifying -e- forces this behaviour.
 *
 * Copyright (C) 1997 Pretty Good Privacy, Inc.
 *
 * Designed by Colin Plumb, Mark H. Weaver, and Philip R. Zimmermann
 * Written by Colin Plumb
 *
 * $Id: repair.c,v 1.37 1997/11/14 08:39:40 mhw Exp $
 */

#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <signal.h>

#include "util.h"
#include "heap.h"
#include "mempool.h"
#include "subst.h"

/*
 * The internal form of a substitution.  These are stored on
 * lists indexed by the first character of the input substitution.
 */
typedef struct Substitution {
	struct Substitution *next;	/* Next entry on the same list (or on the free list) */
	char const *input, *output;	/* Text to be matched and the text that replaces it */
	size_t inlen, outlen;		/* strlen() of input and of output */
	HeapCost cost, cost2;		/* Two costs; a filter, if any, picks or blends them */
	FilterFunc *filter;		/* Optional context-sensitive cost function, or NULL */
	unsigned int index;	/* Consecutive serial numbers */
} Substitution;

/* The empty substitution: "" -> "", zero costs, no filter, serial number 0 */
struct Substitution const substNull = { NULL, "", "", 0, 0, 0, 0, 0 };

/*
 * This might get increased later to support multiple classes of
 * substitutions, for different contexts.  Currently, only one
 * is used.
 */
#define SUBST_CLASSES 1

/*
 * List of substitutions, indexed by first character, plus a catch-all.
 * The first subscript is the substitution class, the second is the
 * first input character masked with 255 (slot 0x100 is the catch-all).
 */
Substitution *substitutions[SUBST_CLASSES][0x101];

/*
 * The pool of Substitution structures.  Remains alive for the entire
 * execution of the program.
 */
static MemPool substPool;
static Substitution *substFree;	/* Free list, linked through ->next */
static unsigned int substCount = 1;	/* Preallocate 0 to substNull */
static unsigned int substFirstDynamic;	/* Set by SubstBuild() once the static tables are loaded */
#define SubstIsDynamic(s) ((s)->index >= substFirstDynamic)
/*
 * Adjust the substitution based on n occurrences this page.
 * (As written, n is ignored and cost is simply replaced by cost2.)
 */
#define SubstAdjust(s,n) ((s)->cost = (s)->cost2)
/* Is this a nasty-line substitution? */
#define SubstIsNasty(s) ((s)->cost2 == COST_INFINITY)

/* Every possible single-character string: entry c is { c, '\0' } */
static char substChars[512];
#define SubstString(c) (substChars+2*((c)&255))

/*
 * Reset the substitution machinery: initialize the pool, empty the free
 * list and every list head, restart the serial numbers, and build the
 * table of one-character strings that SubstString() points into.
 */
static void
SubstInit(void)
{
	unsigned int cls, slot, ch;

	memPoolInit(&substPool);
	substFree = 0;
	substCount = 1;	/* Number zero is reserved for uncounted substitutions */

	for (cls = 0; cls < elemsof(substitutions); cls++) {
		for (slot = 0; slot < elemsof(*substitutions); slot++)
			substitutions[cls][slot] = NULL;
	}

	/* Each character gets a two-byte, NUL-terminated string of its own */
	for (ch = 0; ch < 256; ch++) {
		char *s = substChars + 2*ch;

		s[0] = (char)ch;
		s[1] = 0;
	}
}

/*
 * Get a Substitution structure.  A structure taken from the free list
 * keeps the serial number it already had; a brand-new one from the pool
 * is given the next serial number.  Reusing numbers this way keeps
 * substCount from ratcheting upwards indefinitely while still
 * guaranteeing uniqueness.
 */
static Substitution *
SubstAlloc(void)
{
	struct Substitution *fresh;

	if (substFree) {
		/* Recycle: pop the head of the free list */
		fresh = substFree;
		substFree = fresh->next;
		return fresh;
	}

	fresh = memPoolNew(&substPool, Substitution);
	fresh->index = substCount++;
	return fresh;
}

/* Push a substitution on the front of the free list for SubstAlloc() to reuse */
static void
SubstFree(Substitution *subst)
{
	Substitution *oldhead = substFree;

	substFree = subst;
	subst->next = oldhead;
}

/*
 * Create a substitution mapping "input" to "output" and push it on the
 * front of the list for the given class and first input character.
 * The strings are stored by pointer, not copied, so they must outlive
 * the substitution.  cost, cost2 and filter are stored as given.
 * Returns the new entry.
 */
static Substitution *
MakeSubst(char const *input, char const *output, HeapCost cost, HeapCost cost2,
	FilterFunc *filter, int class)
{
	struct Substitution *subst, **head;

	subst = SubstAlloc();
	subst->input = input;
	subst->output = output;
	subst->inlen = strlen(input);
	subst->outlen = strlen(output);
	subst->cost = cost;
	subst->cost2 = cost2;
	subst->filter = filter;

	/*
	 * Ignore certain substitutions when printing stats.
	 * Identity substitutions, and the tab/space tweaking.
	 */
	if (strcmp(input, output) == 0 || strcmp(input, TAB_STRING) == 0 ||
		(input[0] == ' ' && input[1] == 0 && output[0] == 0)) {
			/* Give the serial number back if it was the newest one */
			if (subst->index == substCount-1)
				substCount--;
			subst->index = 0;	/* Evil hack */
	}

	/*
	 * The lists are indexed by the FIRST input character, whatever the
	 * class; SubstSearch() and AddChildren() look entries up that way.
	 * (Indexing the string by "class" only worked while class was 0.)
	 */
	head = &substitutions[class][input[0] & 255];
	subst->next = *head;
	*head = subst;
	return subst;
}

/*
 * For each entry in the raw array, turn { "abc", "def", 5 }
 * into cost-5 mappings of "a"->"d", "b"->"e" and "c"->"f".
 * If the output string is NULL, the characters are deleted
 * (each maps to the empty string).
 * An input string of NULL is the end of table delimiter.
 */
static void
SubstSingle(struct RawSubst const *raw, int class)
{
	for (; raw->input; raw++) {
		char const *in = raw->input;
		char const *out = raw->output;
		int inch, outch;

		/* Paired strings must pair up character for character */
		assert(!out || strlen(in) == strlen(out));

		for (; *in; in++) {
			inch = *in;
			outch = 0;
			if (out)
				outch = *out++;
			(void)MakeSubst(SubstString(inch), SubstString(outch),
							raw->cost, raw->cost2, raw->filter, class);
		}
	}
}

/*
 * For each entry in the raw array, turn { "abc", "def", 5 }
 * into a single cost-5 mapping of "abc"->"def".
 * An input string of NULL is the end of table delimiter.
 */
static void
SubstMultiple(struct RawSubst const *raw, int class)
{
	struct RawSubst const *entry;

	for (entry = raw; entry->input; entry++)
		(void)MakeSubst(entry->input, entry->output, entry->cost,
					    entry->cost2, entry->filter, class);
}

/*
 * Build the substitutions table: reset everything, load the static
 * single-character and multi-character tables into class 0, then
 * record the next serial number so that SubstIsDynamic() can tell
 * later, run-time additions from these built-in entries.
 */
static void
SubstBuild(void)
{
	SubstInit();
	SubstSingle(substSingles, 0);
	SubstMultiple(substMultiples, 0);
	substFirstDynamic = substCount;
}

/*
 * See if the desired substitution already exists.
 * "in" and "out" are given with explicit lengths and need not be
 * NUL-terminated.  Only the list for in[0] in the given class is
 * searched.  Returns the matching entry, or NULL if there is none.
 */
static Substitution const *
SubstSearch(char const *in, size_t inlen, char const *out, size_t outlen,
	int class)
{
	Substitution *cand;

	for (cand = substitutions[class][in[0] & 255]; cand; cand = cand->next) {
		if (cand->inlen != inlen || cand->outlen != outlen)
			continue;
		if (memcmp(cand->input, in, inlen) != 0)
			continue;
		if (memcmp(cand->output, out, outlen) != 0)
			continue;
		return cand;	/* Already exists */
	}
	return NULL;
}


/*
 * Create a new dynamic substitution.  First search to make
 * sure it doesn't already esist.
 */
static Substitution const *
SubstDynamic(char const *in, char const *out, int class)
{
	Substitution const *subst;

	subst = SubstSearch(in, strlen(in), out, strlen(out), class);
	return subst ? subst : MakeSubst(in, out, COST_INFINITY,
									 DYNAMIC_COST_LEARNED, NULL, class);
}

/*
 * Search for the substitution, allocating one if not found.
 * the input string is not null-terminated and needs to be copied to
 * an allocated buffer.  The output string can just be pointer-copied.
 */
static Substitution const *
SubstNasty(char const *in, size_t inlen, char const *out, int class)
{
	Substitution const *subst;
	char *string;

	if ((subst = SubstSearch(in, inlen, out, strlen(out), class)) != NULL)
		return subst;

	if (!(string = malloc(inlen+1))) {
		fputs("Out of memory!\n", stderr);
		exit(1);
	}
	memcpy(string, in, inlen);
	string[inlen] = 0;
	return MakeSubst(string, out, COST_INFINITY, COST_INFINITY, NULL, class);
}

/*
 * The state of the parser.
 * Note that this is updated when a ParseNode is *removed* from the heap;
 * ParseNodes that are in the heap have ParseStates that reflect the
 * state before the substitution has been parsed; this is a copy of the
 * parents' state, which is after the parsing.
 */
typedef struct ParseState {
	CRC page_crc;			/* Computed per-page CRC */
	word16 flags;			/* Flags; see below (PS_FLAG_*, PS_MASK_*) */
	unsigned char pos;		/* Position on the line; reset to 0 after each accepted newline */
} ParseState;	/* 7 bytes, rounded to 8 */

/* Flags values (bits of ParseState.flags) */
#define PS_MASK_PAGENUM	 0xC000 /* Digits in header page number (1..3) */
#define PS_SHIFT_PAGENUM	 14	/* Shift for the above */
#define PS_FLAG_EOL			512	/* Expect \n next */
#define PS_FLAG_SPACE		256	/* Was last char a space? */
#define PS_FLAG_TAB			128	/* Tabbing over a column */
#define PS_FLAG_INHEADER	 64	/* Current line is a header */
#define PS_FLAG_PASTHEADER	 32	/* A previous line was a header */
#define PS_FLAG_BINWS		 16	/* In whitespace after binary data */
#define PS_FLAG_BINEND		  8	/* End of binary data */
#define PS_FLAG_DYNAMIC		  4	/* Have used ECC this line */
#define PS_MASK_FORMAT	 	  3	/* The encoding format (max of 3, for now) */
#define PS_SHIFT_FORMAT		  0	/* Shift for the above */

/*
 * Have we started on a second page?  Used to force flushing of the first.
 * True when both INHEADER and PASTHEADER are set.
 */
#define InSecondHeader(ps) \
	((~(ps)->flags & (PS_FLAG_INHEADER | PS_FLAG_PASTHEADER)) == 0)

/* Read, and add one to, the page-number digit count kept in a node's flags */
#define PageNumDigits(pn) (((pn)->ps.flags & PS_MASK_PAGENUM) >> PS_SHIFT_PAGENUM)
#define PageNumDigitsIncrement(pn) ((pn)->ps.flags += 1<<PS_SHIFT_PAGENUM)

/* Formats seen so far; slots are filled from index 0 upwards */
EncodeFormat const *registeredFormats[4];

/*
 * Returns a small integer index for the format: the slot that already
 * holds it, or else the first empty slot, which is then filled in.
 * Prints a message and exits if the table is full.
 */
static int
registerFormat(EncodeFormat const *format)
{
	int slot = 0;

	while (slot < (int)elemsof(registeredFormats)) {
		if (registeredFormats[slot] == format)
			return slot;
		if (registeredFormats[slot] == NULL) {
			registeredFormats[slot] = format;
			return slot;
		}
		slot++;
	}
	fputs("Registered formats table overflow!\n", stderr);
	exit(1);
}

/* The EncodeFormat selected by the format bits of a ParseState */
#define psFormat(ps) registeredFormats[((ps)->flags & PS_MASK_FORMAT)>>PS_SHIFT_FORMAT]
/* The same, for the ParseState embedded in a ParseNode */
#define pnFormat(pn) psFormat(&(pn)->ps)
/*
 * Store format index i (as returned by registerFormat) in the format
 * bits.  The argument is parenthesized so that an expression such as
 * "a ? 1 : 2" or "x | y" is shifted as a whole.
 */
#define psSetFormat(ps, i) \
	((ps)->flags = ((ps)->flags & ~PS_MASK_FORMAT) | ((i) << PS_SHIFT_FORMAT))

typedef struct ParseNode {
	HeapCost cost;			/* Total cost so far; this is the field handed to the heap */
	unsigned int refcnt;		/* References held (the heap, plus one per child) */
	struct ParseNode *parent;	/* Previous node in the parse; doubles as the free-list link */
	char const *input;		/* Input position just past this node's substitution */
	struct Substitution const *subst;	/* The substitution this node applied */
	struct ParseState ps;		/* Parser state; see the ParseState comment */
} ParseNode;	/* 32 bytes */

/* A handle for walking backwards through the output stream */
typedef struct OutputHandle {
	ParseNode const *node;	/* Node whose output is being read; NULL once exhausted */
	char const *output;	/* Just past the next byte to return, inside node->subst->output */
	unsigned int pos;	/* Bytes returned so far (minus any that were ungotten) */
} OutputHandle;

/*
 * Initialize the handle to point to a node.  If p is non-NULL it is the
 * starting position within the node's output; otherwise reading starts
 * at the end of the node's output.  The byte count starts at zero.
 */
static void
OutputInit(OutputHandle *oh, ParseNode const *node, char const *p)
{
	if (!p)
		p = node->subst->output + node->subst->outlen;

	oh->node = node;
	oh->output = p;
	oh->pos = 0;
}

/*
 * Get the *previous* byte of output, as a value 0..255, stepping back
 * into parent nodes as each node's output is used up.  Returns -1
 * (leaving oh->node NULL) when there is nothing further back.
 */
static int
OutputGetPrev(OutputHandle *oh)
{
	while (oh->node) {
		if (oh->output != oh->node->subst->output) {
			/* Still something left in this node's output */
			oh->pos++;
			oh->output--;
			return *oh->output & 255;
		}
		/* This node is used up; continue at the end of its parent */
		oh->node = oh->node->parent;
		if (oh->node)
			oh->output = oh->node->subst->output + oh->node->subst->outlen;
	}
	return -1;
}

/*
 * Return the last output character up to and including the given node
 * (0..255), or -1 if there is none - trivial handy wrapper.
 */
static int
OutputPrevChar(ParseNode const *node)
{
	OutputHandle cursor;

	/* Start at the end of the node's own output */
	cursor.node = node;
	cursor.output = node->subst->output + node->subst->outlen;
	cursor.pos = 0;
	return OutputGetPrev(&cursor);
}

/*
 * Unget the last retrieved character (and return it), or
 * -1 if that is impossible (no current node, or the handle
 * is sitting on a NUL byte).  At least one character is
 * always ungettable, but after that you're on your own.
 */
static int
OutputUnget(OutputHandle *oh)
{
	int c;

	if (!oh->node || !*oh->output)
		return -1;

	c = *oh->output & 255;
	oh->output++;
	oh->pos--;
	return c;
}

/* The position is useful for comparing two OutputHandles. */
#define OutputPos(oh) ((oh)->pos)

/*
 * Fill backwards from bufend until you hit the given char (which is
 * not stored) or run out of output.  Use -1 to get the whole buffer.
 * Returns a pointer to the first byte stored.  The handle is passed by
 * value, so the caller's copy is not advanced.  No bound is checked
 * here: the caller must supply enough room below bufend.
 */
static char *
OutputGetUntil(OutputHandle oh, char *bufend, int end)
{
	for (;;) {
		int c = OutputGetPrev(&oh);

		if (c == -1 || c == end)
			break;
		*--bufend = (char)c;
	}
	return bufend;
}

/*
 * The per-page structure.  This is actually global, but describes
 * the values kept for each page processed.
 */
typedef struct PerPage {
	CRC page_check;		/* Page CRC read from the page header */
	char const *maxpos, *minpos;	/* Input positions; both start at the buffer passed to PerPageInit */
	unsigned int tabsize;	/* Zero means this is a binary page */
	unsigned int lines;
	unsigned int retries;	/* How many retries since last progress? */
	unsigned int max_retries;	/* Maximum number of retries needed. */
} PerPage;

PerPage perpage;	/* The global */

/*
 * Reset the per-page globals for a page whose input starts at buf:
 * both position marks point at buf, the page check value and all
 * counters are zero, and the tab size is the default of 4.
 */
static void
PerPageInit(char const *buf)
{
	perpage.minpos = buf;
	perpage.maxpos = buf;
	perpage.page_check = 0;
	perpage.tabsize = 4;	/* The default */
	perpage.max_retries = 0;
	perpage.retries = 0;
	perpage.lines = 0;
}

/*
 * Is the tab substitution being looked at acceptable?
 * It is only if its output length is exactly what is needed to reach
 * the next tab stop from the parent's position; anything else is
 * junk (COST_INFINITY), as is any tab on a binary page (tabsize 0).
 * When the two costs differ, cost is returned if the output before
 * this point, ignoring trailing spaces, ends in TAB_CHAR, and cost2
 * otherwise.
 */
HeapCost
TabFilter(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	OutputHandle oh;
	int column, prev;

	(void)limit;
	if (!perpage.tabsize)
		return COST_INFINITY;	/* No interest */

	/* How wide should the tab be? */
	column = (int)((parent->ps.pos-PREFIX_LENGTH) % perpage.tabsize);
	if ((int)subst->outlen != (int)perpage.tabsize - column)
		return COST_INFINITY;

	/* The right number - cost if likely, cost2 if unlikely */
	if (subst->cost == subst->cost2)
		return subst->cost;

	/* Skip back over spaces and see what came before them */
	OutputInit(&oh, parent, NULL);
	while ((prev = OutputGetPrev(&oh)) == ' ')
		;
	if (prev == TAB_CHAR)
		return subst->cost;
	return subst->cost2;
}

/*
 * Return cost if near blanks (including end-of-line), cost2 if not, and
 * the average if there is a blank on one side.  There are additional
 * versions for upper- and lower-case.  _ is considered upper-case,
 * as it's often used in macro identifiers.
 *
 * parent is the node the substitution would be attached to, so
 * parent->input is where the substitution's own input starts and the
 * character following it is at parent->input + subst->inlen.  limit is
 * the end of the available input; reaching it counts as a blank.
 */
HeapCost
FilterNearBlanks(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	int c = OutputPrevChar(parent), score = (isspace(c) != 0);
	/* Skip THIS substitution's input, not the parent's */
	char const *p = parent->input + subst->inlen;

	score += p == limit || isspace((unsigned char)*p) != 0;
	return (subst->cost*score + subst->cost2*(2-score))/2;
}

/*
 * Like FilterNearBlanks, but scores neighbours that are upper-case
 * letters or '_'.  End of input does not count as a match.
 */
HeapCost
FilterNearUpper(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	char const *next = parent->input + subst->inlen;
	int prev = OutputPrevChar(parent);
	int score = 0;

	if (isupper(prev) != 0 || prev == '_')
		score++;
	if (next != limit && (isupper((unsigned char)*next) != 0 || *next == '_'))
		score++;
	return (subst->cost*score + subst->cost2*(2-score))/2;
}

/*
 * Like FilterNearBlanks, but scores neighbours that are hexadecimal
 * digits.  End of input does not count as a match.
 */
HeapCost
FilterNearXDigit(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	char const *next = parent->input + subst->inlen;
	int prev = OutputPrevChar(parent);
	int score = 0;

	if (isxdigit(prev) != 0)
		score++;
	if (next != limit && isxdigit((unsigned char)*next) != 0)
		score++;
	return (subst->cost*score + subst->cost2*(2-score))/2;
}

/*
 * Like FilterNearBlanks, but scores neighbours that are lower-case
 * letters.  End of input does not count as a match.
 */
HeapCost
FilterNearLower(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	char const *next = parent->input + subst->inlen;
	int prev = OutputPrevChar(parent);
	int score = 0;

	if (islower(prev) != 0)
		score++;
	if (next != limit && islower((unsigned char)*next) != 0)
		score++;
	return (subst->cost*score + subst->cost2*(2-score))/2;
}

/*
 * cost2 unless previous character was a space (' ' or SPACE_CHAR).
 * Note the & 255, necessary since chars might be signed and SPACE_CHAR
 * is in the high (negative) half, but the previous character is an int
 * in the range -1..255.
 */
HeapCost
FilterFollowsSpace(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	int prev;

	(void)limit;
	prev = OutputPrevChar(parent);
	if (prev == ' ' || prev == (SPACE_CHAR & 255))
		return subst->cost;
	return subst->cost2;
}

/*
 * cost2 unless previous character was duplicate of this one, i.e. the
 * same as the first character of the substitution's output.
 * The previous character is an int in the range -1..255 while
 * output[0] is a plain (possibly signed) char, so the latter is masked
 * with 255 before comparing; otherwise characters in the high half
 * could never match where char is signed.
 */
HeapCost
FilterAfterRepeat(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	int c = OutputPrevChar(parent);
	(void)limit;
	return (c == (subst->output[0] & 255)) ? subst->cost : subst->cost2;
}

/*
 * cost2 unless probably the closing quote in a char constant: that is,
 * the output two characters back (three, if the one two back is a
 * backslash) is a single quote.
 */
HeapCost
FilterCharConst(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	OutputHandle oh;
	int opener;

	(void)limit;
	OutputInit(&oh, parent, NULL);
	(void)OutputGetPrev(&oh);	/* The character inside the quotes */
	opener = OutputGetPrev(&oh);
	if (opener == '\\')		/* Escape sequence: look one further */
		opener = OutputGetPrev(&oh);
	if (opener == '\'')
		return subst->cost;
	return subst->cost2;
}

/*
 * If the identifier leading up to the current position contains
 * an underscore, then it's likely the current position is an underscore
 * as well; return cost.  If it does not (a non-alphanumeric character
 * or the start of the output is reached first), it's less likely;
 * return cost2.
 */
HeapCost
FilterLikelyUnderscore(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	OutputHandle oh;
	int prev;

	(void)limit;
	OutputInit(&oh, parent, NULL);
	do {
		prev = OutputGetPrev(&oh);
		if (prev == '_')
			return subst->cost;
	} while (isalnum(prev));
	return subst->cost2;
}

/*
 * cost2 unless the following chars seem to be a checksum: at least
 * PREFIX_LENGTH characters remain, the last of those is whitespace,
 * and all but at most one of the others are digits or a-f.
 * On binary pages (tabsize 0) any finite result becomes 0.
 */
HeapCost
FilterChecksumFollows(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	char const *p = parent->input + subst->inlen;
	int k, hexish = 0, result;

	if (limit - p < PREFIX_LENGTH ||
		!isspace((unsigned char)p[PREFIX_LENGTH-1]))
			return subst->cost2;

	for (k = 0; k < PREFIX_LENGTH-1; k++) {
		if (p[k] >= '0' && p[k] <= '9')
			hexish++;
		if (p[k] >= 'a' && p[k] <= 'f')
			hexish++;
	}
	result = (hexish >= PREFIX_LENGTH-2) ? subst->cost : subst->cost2;
	/* Magic, since this function is perfect on binary files */
	if (result < COST_INFINITY && perpage.tabsize == 0)
		result = 0;
	return result;
}

/* Manage a *big* pool of ParseNodes */

struct MemPool nodePool;
struct ParseNode *nodeFreeList = 0;	/* Linked through ->parent */

/* Prepare for node allocations: empty free list, freshly initialized pool */
static void
NodePoolInit(void)
{
	nodeFreeList = NULL;
	memPoolInit(&nodePool);
}

/*
 * Free all nodes in one swell foop: forget the free list (its nodes
 * live in the pool) and empty the pool itself.
 */
static void
NodePoolCleanup(void)
{
	memPoolEmpty(&nodePool);
	nodeFreeList = NULL;
}

/*
 * Allocate a new (uninitialized) node, preferring one from the free
 * list over fresh memory from the pool.
 */
static struct ParseNode *
NodeAlloc(void)
{
	struct ParseNode *recycled = nodeFreeList;

	if (!recycled)
		return memPoolNew(&nodePool, ParseNode);

	nodeFreeList = recycled->parent;
	return recycled;
}

/*
 * Free a node for reallocation by pushing it on the free list.
 * The node's parent pointer is overwritten with the list link.
 */
static void
NodeFree(struct ParseNode *node)
{
	struct ParseNode *oldhead = nodeFreeList;

	nodeFreeList = node;
	node->parent = oldhead;
}

/*
 * Decrement a node's reference count, freeing it and
 * then doing the same to its parent (and so on up the
 * chain) each time a count goes to zero.
 */
static void
NodeRelease(struct ParseNode *node)
{
	assert(node->refcnt);

	while (node && --node->refcnt == 0) {
		/* Fetch the parent first: NodeFree() reuses that field */
		struct ParseNode *up = node->parent;

		NodeFree(node);
		node = up;
	}
}

/* Add nodes to the substitution tree */

/*
 * Create a child of the given node for the given substitution.
 * The child starts as a copy of the parent (parse state included),
 * with its cost raised by "cost", its input advanced past the
 * substitution's input, and one reference (for the heap).  The parent
 * gains a reference.  Returns 0, allocating nothing, if cost is
 * COST_INFINITY.
 */
static ParseNode *
AddChild(ParseNode *parent, Substitution const *subst, HeapCost cost)
{
	ParseNode *child;

	if (cost == COST_INFINITY)
		return 0;

	child = NodeAlloc();
	*child = *parent;
	/* Child is just like parent, except... */
	child->parent = parent;
	child->subst = subst;
	child->input = parent->input + subst->inlen;
	child->refcnt = 1;	/* The heap */
	child->cost = parent->cost + cost;
	parent->refcnt++;
	return child;
}

/* Hash table of nasty lines, indexed by per-line CRC */
struct NastyLine {
	struct NastyLine *next;	/* Next entry in the same hash bucket */
	char const *line;	/* The line's text, without its newline */
	CRC crc;		/* CRC of the text plus newline, under nastyPoly */
};

#define NASTY_HASH_SIZE 256
static struct NastyLine *nastyHash[NASTY_HASH_SIZE];	/* All zero */

struct MemPool nastyStrings, nastyStructs;
/* The polynomial the stored CRCs were computed with; see RehashNasties() */
static CRCPoly const *nastyPoly = &crcCCITTPoly;
/*
 * Create a new NastyLine entry if it doesn't already exist, and return
 * the (new or existing) entry.
 * Note that this expects the string passed to end in a newline which
 * IS hashed but NOT stored: the last character of the string is always
 * dropped from the stored copy.
 */
static struct NastyLine *
AddNasty(char const *string)
{
	size_t len = strlen(string);	/* Including newline */
	size_t textlen = len-1;		/* Excluding it */
	CRC crc = CalculateCRC(nastyPoly, 0, (byte const *)string, len);
	struct NastyLine **link = &nastyHash[crc % NASTY_HASH_SIZE];
	struct NastyLine *entry;
	char *copy;

	/* Search for an existing copy, ending at the bucket's tail link */
	for (; (entry = *link) != NULL; link = &entry->next) {
		if (entry->crc != crc)
			continue;
		if (memcmp(entry->line, string, textlen) != 0)
			continue;
		if (entry->line[textlen] == 0)
			return entry;
	}

	/* Create a new structure and hang it on the end of the bucket */
	entry = memPoolNew(&nastyStructs, struct NastyLine);
	*link = entry;
	copy = memPoolAlloc(&nastyStrings, len, 1);
	entry->next = NULL;
	entry->line = copy;
	entry->crc = crc;
	memcpy(copy, string, textlen);
	copy[textlen] = 0;
	return entry;
}

/*
 * Recompute every nasty line's CRC with a different polynomial and
 * rebuild the hash table accordingly.  Afterwards nastyPoly is poly.
 */
static void
RehashNasties(CRCPoly const *poly)
{
	struct NastyLine *entry, *pending = NULL;
	struct NastyLine **bucket;
	CRC crc;
	int i;

	/* Put everything into one list and clear the hash table */
	for (i = 0; i < (int)elemsof(nastyHash); i++) {
		for (entry = nastyHash[i]; entry != NULL; entry = nastyHash[i]) {
			nastyHash[i] = entry->next;
			entry->next = pending;
			pending = entry;
		}
	}

	/* Recompute CRCs for the list and redistribute them among the buckets */
	while (pending) {
		entry = pending;
		pending = entry->next;

		/* The stored text lacks its newline, so add that back in */
		crc = CalculateCRC(poly, 0, (byte const *)entry->line,
						   strlen(entry->line));
		crc = AdvanceCRC(poly, crc, '\n');

		entry->crc = crc;
		bucket = &nastyHash[crc % NASTY_HASH_SIZE];
		entry->next = *bucket;
		*bucket = entry;
	}
	nastyPoly = poly;
}

/*
 * Read in the nastylines file: every line read from f is handed to
 * AddNasty().  Lines are read in pieces of at most sizeof(buf)-2
 * characters, so longer lines arrive as several pieces.
 */
static void
ReadNasties(FILE *f)
{
	char line[128];

	for (;;) {
		if (!fgets(line, sizeof(line)-1, f))
			break;
		AddNasty(line);
	}
}

/*
 * Convert an encoded string of len digits to binary, most significant
 * digit first, format->bitsPerDigit bits per digit.
 * No error checking is performed.
 */
static word32
GetWord32(EncodeFormat const *format, char const *buf, int len)
{
	word32 value = 0;

	for (; len != 0; len--, buf++)
		value = (value<<format->bitsPerDigit) + DecodeDigit(format, *buf);
	return value;
}

/*
 * Attempt nasty line substitutions.
 *
 * Called with the parent positioned just after a line prefix
 * (ps.pos == PREFIX_LENGTH).  The line's four check digits are read
 * back out of the output already produced, and every known nasty line
 * whose CRC equals that check value is offered as a replacement for the
 * rest of the input line (up to the newline, or to limit if there is
 * none), at cost NASTY_COST.  Children created are marked
 * PS_FLAG_DYNAMIC and inserted in the heap.
 */
static void
TryNasty(struct ParseNode *parent, Heap *heap, char const *limit)
{
	struct NastyLine const *nasty;
	struct Substitution const *subst;
	struct ParseNode *child;
	char const *end;
	EncodeFormat const *format = pnFormat(parent);
	OutputHandle oh;
	char buf[4];
	CRC check;
	int i;

	/* Make sure the lines are hashed properly */
	if (nastyPoly != format->lineCRC)
		RehashNasties(format->lineCRC);

	/* Get the line to be replaced */
	assert(parent->ps.pos == PREFIX_LENGTH);
	end = memchr(parent->input, '\n', limit - parent->input);
	if (!end)
		end = limit;
	/* Get the line's check value */
	OutputInit(&oh, parent, NULL);
	(void)OutputGetPrev(&oh);	/* Skip the last prefix character */
	/*
	 * Read all four digits, last one first.  (The loop must run for
	 * i = 3, 2, 1 AND 0; stopping at 1 left buf[0] uninitialized.)
	 */
	for (i = 4; i-- > 0; )
		buf[i] = (char)OutputGetPrev(&oh);
	check = GetWord32(format, buf, 4);
	/* Find the matches */
	nasty = nastyHash[check % NASTY_HASH_SIZE];
	for (; nasty; nasty = nasty->next) {
		if (nasty->crc == check) {
			subst = SubstNasty(parent->input, end-parent->input,
							   nasty->line, 0);
			if (subst) {
				child = AddChild(parent, subst, NASTY_COST);
				if (child) {
					child->ps.flags |= PS_FLAG_DYNAMIC;
					HeapInsert(heap, &child->cost);
				}
			}
		}
	}
}

/*
 * Form all of a ParseNode's children and add them to the heap.
 * Every class-0 substitution listed under the next input character
 * whose input matches what follows is tried; its cost comes from its
 * filter if it has one, else from its cost field.  At the start of a
 * line body, whole-line (nasty) substitutions are tried as well.
 * Limit is the limit of allowable lookahead.
 */
static void
AddChildren(ParseNode *parent, Heap *heap, char const *limit)
{
	Substitution *subst;
	ParseNode *child;
	HeapCost cost;

/* If you want to make pure insertion substitutions, do that here */

	assert(parent->input < limit);	/* We always have at least one char */

	for (subst = substitutions[0][parent->input[0] & 255]; subst;
		 subst = subst->next) {
		/* One-character inputs match by construction of the lists */
		if (subst->inlen != 1) {
			if ((size_t)(limit-parent->input) < subst->inlen)
				continue;
			if (memcmp(subst->input, parent->input, subst->inlen) != 0)
				continue;
		}
		cost = subst->filter ? subst->filter(parent, limit, subst)
							 : subst->cost;
		child = AddChild(parent, subst, cost);
		if (child)
			HeapInsert(heap, &child->cost);
	}

	/* Whole-line substitutions */
	if (parent->ps.pos == PREFIX_LENGTH)
		TryNasty(parent, heap, limit);
}


/*
 * cost if this line has a dynamic substitution (PS_FLAG_DYNAMIC is set
 * in the parent's state), otherwise cost2
 */
HeapCost
FilterIsDynamic(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	(void)limit;
	if (parent->ps.flags & PS_FLAG_DYNAMIC)
		return subst->cost;
	return subst->cost2;
}

/* cost if the current page is binary mode (tab size zero), else cost2 */
HeapCost
FilterIsBinary(struct ParseNode *parent, char const *limit,
	struct Substitution const *subst)
{
	(void)limit;
	(void)parent;
	if (perpage.tabsize == 0)
		return subst->cost;
	return subst->cost2;
}

/* Debugging utility */
#define DEBUG 1	/* Set to 1 to print every line considered */

/*
 * Length of the text OverstrikeLine() last left on the current
 * terminal line.  PrintLine() resets it after printing a full line.
 */
static size_t lastlen = 0;

/*
 * Show a line of progress on stdout, overwriting the previous one.
 * The text (at most 79 characters of it) is followed by enough blanks
 * to cover what was shown before and then a carriage return, so the
 * next call overwrites it.  Calling with line == NULL finishes the
 * display: a newline is printed if anything is currently shown.
 *
 * This uses the file-scope lastlen.  A function-local static of the
 * same name used to shadow it, so the reset done by PrintLine() never
 * reached this function.
 */
static void
OverstrikeLine(char const *line, size_t len)
{
	int blanklen;

	if (!line) {
		if (lastlen)
			putchar('\n');
		lastlen = 0;
	} else if (len || lastlen) {
		if (len > 79)
			len = 79;
		/* Blanks needed to erase the tail of a longer previous line */
		blanklen = (lastlen > len) ? (int)(lastlen - len) : 0;
		printf("%.*s%*s\r", (int)len, line, blanklen, "");
		fflush(stdout);
		lastlen = len;
	}
}

/*
 * Print everything, for debugging: the first len characters of line,
 * followed by a newline.  Nothing happens if line is NULL.  Since the
 * cursor is then on a fresh line, lastlen is reset to zero.
 */
static void
PrintLine(char const *line, size_t len)
{
	if (!line)
		return;

	printf("%.*s\n", (int)len, line);
	lastlen = 0;
}

static HeapCost ParseAdvanceString(Heap *heap, ParseNode *pn);

/*
 * Copy the parse chain from tail up to (but not including) root, and
 * hang the copy off of newroot, adjusting the costs and parse state
 * accordingly.  Returns the copy of tail, or NULL if it is unable to
 * (invalid parse, too expensive, etc.).  If tail is root, newroot
 * itself is returned.
 * Note that as per the convention, ParseAdvanceString is *not* called
 * on the new tail node (but is called, with a NULL heap, on all its
 * parents, newroot included).
 */
static ParseNode *
CopyParse(ParseNode const *tail, ParseNode const *root, ParseNode *newroot)
{
	ParseNode *above, *copy;
	HeapCost step;

	if (tail == root)
		return newroot;

	/* Copy everything above tail first */
	above = CopyParse(tail->parent, root, newroot);
	if (above == NULL)
		return NULL;

	step = ParseAdvanceString(NULL, above);
	copy = AddChild(above, tail->subst, step);
	/* Drop our own reference; if a child was made, it holds one */
	NodeRelease(above);
	return copy;
}

/*
 * Replace oldnode with a dynamic substitution for newchar, if possible,
 * and fill in the chain down to "tail" just like before, but with no
 * branches.  Add the resultant ParseNode to the heap.
 * Nothing is done unless oldnode's substitution outputs exactly one
 * character.
 */
static void
AddDynamic(Heap *heap, ParseNode const *oldnode, ParseNode const *tail,
	int newchar)
{
	Substitution const *fix;
	ParseNode *replacement;

	/* Only replace one-character substitutions */
	if (oldnode->subst->outlen != 1)
		return;

	fix = SubstDynamic(oldnode->subst->input, SubstString(newchar), 0);
	replacement = AddChild(oldnode->parent, fix, -1); /* Try it immediately */
	if (!replacement)
		return;

	replacement->ps.flags |= PS_FLAG_DYNAMIC;
	replacement = CopyParse(tail, oldnode, replacement);
	if (replacement)
		HeapInsert(heap, &replacement->cost);
}

/*
 * Do the same, at a given (1-based) position on the line.  Owing to
 * a minor glitch, we must never count the tail node, as this has not
 * been parsed yet, so its ps.pos field is inaccurate; the search
 * therefore starts at tail's parent and walks up while the position
 * is still too large.  Nothing is done if no node lands exactly on
 * the requested position.
 */
static void
AddDynamicAt(Heap *heap, int position, ParseNode const *tail, int newchar)
{
	ParseNode const *candidate = tail->parent;

	while (candidate->ps.pos > position)
		candidate = candidate->parent;

	if (candidate->ps.pos == position)
		AddDynamic(heap, candidate, tail, newchar);
}

/*
 * Given the computed and input check fields, correct the header field
 * that *ends* at the given pos.  This can be used for both the line and
 * page CRC errors by just changing the pos.  (It relies on the fact
 * that the page CRC fragment fits into the LineCRC type.)
 * It also relies on the fact that the CRC is at most 4 digits.
 *
 * Only an error confined to one digit is handled: if the difference
 * between crc and check spans more than one digit, nothing is done.
 */
static void
ErrorCorrectHeader(Heap *heap, ParseNode const *tail, int pos,
	EncodeFormat const *format, CRC crc, CRC check)
{
	CRC diff = crc ^ check;

	/* Find the position and the crc digit at that position */
	for (; diff >= (CRC)format->radix; pos--) {
		/* Low digit differs as well as a higher one */
		if (diff & (CRC)(format->radix - 1))
			return;	/* uncorrectable */
		crc >>= format->bitsPerDigit;
		diff >>= format->bitsPerDigit;
	}
	/* Paste in the correct digit */
	AddDynamicAt(heap, pos, tail, EncodeDigit(format, crc & (format->radix-1)));
}

/*
 * This function walks back through the line, and if the line CRC could be
 * made correct by changing a character to another legal character,
 * the change is added (on probation) to the substitution table.
 *
 * oh is a by-value handle positioned at the end of the line; syndrome
 * is the XOR of the computed and expected line CRCs.  The walk stops
 * once the handle's node is at or before the end of the line prefix.
 */
static void
ErrorCorrect(Heap *heap, OutputHandle oh, EncodeFormat const *format,
	CRC syndrome)
{
	ParseNode const *tail = oh.node;
	int c;

	/* One step back before the loop (presumably for the newline -- TODO confirm) */
	syndrome = ReverseCRC(format->lineCRC, syndrome, 0);
	while (oh.node->ps.pos > PREFIX_LENGTH) {
		c = OutputGetPrev(&oh);
		/*
		 * NOTE(review): OutputGetPrev() only returns -1 after setting
		 * oh.node to NULL, so in that case the printf below would
		 * dereference a null pointer.
		 */
		if (c == '\n' || c == -1) {	/* Can't happen */
			printf("Line ended at pos %d\n", oh.node->ps.pos);
			return;
		}
		syndrome = ReverseCRC(format->lineCRC, syndrome, 0);
		/*
		 * A fix is possible here only if the backed-up syndrome fits in
		 * one byte, some substitution list exists for the corrected
		 * character c^syndrome, and this node produced exactly one
		 * output character.
		 */
		if (syndrome >= 0x100 || !substitutions[0][c^syndrome] ||
			oh.node->subst->outlen != 1)
				continue;
		AddDynamic(heap, oh.node, tail, c^syndrome);
	}
}

/*
 * Parsing operations.  This is a rather ugly and ad-hoc parser that
 * knows a lot about the fixed-field format produced by the munge
 * utility.  The main state variable is the position in
 * the line, which controls the expected header, the position of
 * tab stops, and the maximum permissible line length.
 */
/* ParseNewline() displays one in this many lines that fail the line CRC */
#define OCCASIONALLY 100

/*
 * Set up a ParseState to top-of-page: page CRC, flags and line
 * position all zero (copied from a static all-zero structure).
 */
static void
ParseStateInit(ParseState *ps)
{
	static struct ParseState const parseNull = { 0, 0, 0 };
	*ps = parseNull;
}

/*
 * Try to accept a newline, checking CRCs and even doing error-correction
 * as appropriate.
 *
 * heap   - the search heap; error-correction candidates are added to it,
 *          and it is emptied once the first page header is accepted.
 * pn     - the node being parsed; its parse state is updated on success.
 * string - position within pn's output at which the line ends.
 *
 * The output line is rebuilt in a local buffer, stripped of leading and
 * trailing whitespace, and its line CRC is checked; then either the
 * header fields are read (header line) or the running page CRC is
 * checked (any other line).  Returns COST_INFINITY if a check fails,
 * after queueing single-character corrections unless this line has
 * already been corrected dynamically.  Otherwise returns the cost to
 * charge for the newline.
 *
 * NOTE(review): the line is copied backwards into debugbuf without a
 * bounds check; this presumably relies on the parser limiting line
 * length elsewhere -- confirm.
 */
static int
ParseNewline(Heap *heap, ParseNode *pn, char const *string)
{
	OutputHandle oh;
	int c;
	char debugbuf[PREFIX_LENGTH+LINE_LENGTH+10];
	char *header, *body, *end;
	int pos, width;
	CRC crc, check;
	ParseNode *temp;
	static int occasionally = OCCASIONALLY;
	EncodeFormat const *format = pnFormat(pn);
	EncodeFormat const *headerFormat = &hexFormat;

	/* Get the line into a buffer for analysis */
	OutputInit(&oh, pn, string);
	end = debugbuf + sizeof(debugbuf)-1;
	header = OutputGetUntil(oh, end, '\n');
	/* Strip leading and trailing whitespace */
	while (header < end && isspace((unsigned char)header[0]))
		header++;
	while (header < end && isspace((unsigned char)end[-1]))
		end--;
	*end++ = '\n';

	/* Start of checksummed area */
	body = header + PREFIX_LENGTH;
	/* Blank lines are missing the trailing space from the prefix */
	if (body >= end)
		body = end-1;

	crc = CalculateCRC(format->lineCRC, 0, body, end-body);
	check = GetWord32(format, header+2, 4);
	if (crc != check) {
		/* Show a failing line now and then, as a sign of life */
		if (!--occasionally) {
			OverstrikeLine(header, end-header-1);
			occasionally = OCCASIONALLY;
		}
		/* Try ECC on the line */
		/* If we haven't already tried ECC on the line... */
		if (!(pn->ps.flags & PS_FLAG_DYNAMIC)) {
			ErrorCorrectHeader(heap, pn, PREFIX_LENGTH-1, format, crc, check);
			ErrorCorrect(heap, oh, format, crc ^ check);
		}
		return COST_INFINITY;
	}
	/* Good enough that we always print it */
	OverstrikeLine(header, end-header-1);

	/* Okay, now there are two cases - header line or running CRC */
	if (pn->ps.flags & PS_FLAG_INHEADER) {
		/* Do things for first header */
		if (!(pn->ps.flags & PS_FLAG_PASTHEADER)) {
			/* Check version number */
			width = EncodedLength(headerFormat, HDR_VERSION_BITS);
			c = (int)GetWord32(&hexFormat, body, width);
			if (c != 0) {
				fputs("Fatal: you need a newer version of repair"
				      " to process this file\n", stderr);
				exit(1);
			}
			/* Suck in page CRC, after version & flags */
			pos = width + EncodedLength(headerFormat, HDR_FLAG_BITS);
			width = EncodedLength(headerFormat, format->pageCRC->bits);
			perpage.page_check = GetWord32(&hexFormat, body+pos, width);
			/* Get tab size */
			pos += width;
			width = EncodedLength(headerFormat, HDR_TABWIDTH_BITS);
			perpage.tabsize = GetWord32(&hexFormat, body+pos, width);

			/*
			 * Once we have the header, don't reconsider: throw away
			 * every other candidate in the heap.  (PASTHEADER is known
			 * to be clear here, so no further test is needed.)
			 */
			while ((temp = (ParseNode *)HeapGetMin(heap)) != NULL)
				NodeRelease(temp);
			pn->ps.page_crc = 0;	/* Clear for top of page */
		}
	} else {
		/* Check the CRC-32 */
		crc = CalculateCRC(format->pageCRC, pn->ps.page_crc, body, end-body);
		pn->ps.page_crc = crc;
		crc = RunningCRCFromPageCRC(format, crc);
		check = GetWord32(format, header, 2);
		if (crc != check) {
			if (!(pn->ps.flags & PS_FLAG_DYNAMIC))
				ErrorCorrectHeader(heap, pn, 2, format, crc, check);
			return COST_INFINITY;
		}
	}

	/* Hey, it's correct! */
	PrintLine(header, end-header-1);

	/* Start next line */
	pn->ps.pos = 0;
	/* Clear most other flags, but we *have* got a header */
	c = pn->ps.flags & PS_FLAG_DYNAMIC;
	pn->ps.flags &= PS_FLAG_BINEND | PS_MASK_FORMAT;
	pn->ps.flags |= PS_FLAG_PASTHEADER;
	/*
	 * Give a bonus to the next line for having completed this one,
	 * less if it was dynamically fixed: a dynamically fixed line gets
	 * two thirds of COST_LINE, any other line gets all of it.
	 */
	return c ? COST_LINE*2/3 : COST_LINE;
}

/*
 * Advance the parse state in pn by the single character *string.
 *
 * heap   - search heap; only passed through to ParseNewline() when a
 *          line is completed.
 * pn     - node whose parse state (pn->ps: pos, flags, format, ...) is
 *          updated in place.
 * string - points at the character to consume.  Only *string is examined
 *          here; the pointer itself is handed on to ParseNewline().
 *
 * Returns COST_INFINITY if the character is impossible in the current
 * state.  On a newline that ends a line body, returns whatever
 * ParseNewline() returns.  Otherwise returns retval, which is currently
 * always 0 (it exists so unlikely output combinations could be penalized
 * to nudge the correction in a certain direction).
 */
static HeapCost
ParseAdvance(Heap *heap, ParseNode *pn, char const *string)
{
	int i, retval = 0;
	char c = *string;
	EncodeFormat const *format = pnFormat(pn);

	/*
	 * Insist on spaces being correctly converted to SPACE_CHAR.
	 * There's a little irregularity just before EOL.
	 * Line continuation and formfeed are also only legal at EOL.
	 *
	 * Once PS_FLAG_EOL is set, the only non-space character accepted
	 * is '\n'.
	 */
	if (c == ' ') {
		if (pn->ps.flags & PS_FLAG_SPACE && !(pn->ps.flags & PS_FLAG_TAB))
			pn->ps.flags |= PS_FLAG_EOL;
		pn->ps.flags |= PS_FLAG_SPACE;
	} else if (pn->ps.flags & PS_FLAG_EOL) {
		if (c != '\n')
			return COST_INFINITY;
	} else if (c == SPACE_CHAR) {
		if (!(pn->ps.flags & PS_FLAG_SPACE))
			pn->ps.flags |= PS_FLAG_EOL;
	} else if (c == CONTIN_CHAR || c == FORMFEED_CHAR) {
		pn->ps.flags |= PS_FLAG_EOL;
	} else {
		pn->ps.flags &= ~PS_FLAG_SPACE;
	}

	/* Dispatch on the column within the current line */
	switch (pn->ps.pos) {
		case 0:
			if (c == ' ' || c == '\n') {
				break;		/* Ignore ws and blank lines completely */
			} else if (c == '\f' || c == HDR_PREFIX_CHAR) {
				/* Start of a new page */
				pn->ps.flags |= PS_FLAG_INHEADER;	/* Expect header next */
				if (c == '\f')
					break;	/* The formfeed itself does not occupy a column */
				/* And fall through to increment pos */
			} else if (pn->ps.flags & PS_FLAG_INHEADER ||
					   pn->ps.flags & PS_FLAG_BINEND ||
					   !(pn->ps.flags & PS_FLAG_PASTHEADER) ||
					   DecodeDigit(format, c) < 0) {
				return COST_INFINITY;	/* Various illegal cases */
			}
			pn->ps.pos++;
			break;
		case 1:
			if ((pn->ps.flags & PS_FLAG_INHEADER)) {
				/* Second char of header selects the encoding format */
				format = FindFormat(c);
				if (!format)
					return COST_INFINITY;
				i = registerFormat(format);
				psSetFormat(&pn->ps, i);
				pn->ps.pos++;
				break;
			}
			if (DecodeDigit(format, c) < 0)
				return COST_INFINITY;	/* Illegal */
			pn->ps.pos++;
			break;
		case 2:
		case 3:
		case 4:
#if PREFIX_LENGTH != 7
#error fix this code
#endif
		case PREFIX_LENGTH-2:
			/* Remaining prefix columns must be digits of the line's format */
			if (DecodeDigit(format, c) < 0)
				return COST_INFINITY;	/* Illegal */
			pn->ps.pos++;
			break;
		case PREFIX_LENGTH-1:
			/* Separator between the prefix and the line body */
			if (c == ' ') {
				pn->ps.pos++;
				break;
			} else if (c != '\n') {
				return COST_INFINITY;
			}
			/* Blank lines may be missing this space char */
			/*FALLTHROUGH*/
		/* The normal line starts here, at position 7 */
		default:
			if (pn->ps.flags & PS_FLAG_INHEADER) {	/* Header line */
				/* Format is "--abcd 0123456789abcdef012 Page %u of %s" */
				int off = pn->ps.pos - (PREFIX_LENGTH+HDR_ENC_LENGTH);
				/* Offset relative to end of hex header */
				if (off < 0) {
					if (HexDigitValue(c & 255) < 0)
						return COST_INFINITY;
				} else if (off < 6) {
					if (c != " Page "[off])	/* Yes, this is legal C */
						return COST_INFINITY;
				} else if (off == 6) {
					if (c < '1' || c > '9')	/* First digit of page no. */
						return COST_INFINITY;
				} else {
					/* Re-base to end of scanned part of page number */
					off -= 7 + PageNumDigits(pn);
					if (off == 0) {
						/* Either one more page-number digit, or the space after it */
						if (c >= '0' && c <= '9' && PageNumDigits(pn) < 3)
							PageNumDigitsIncrement(pn);
						else if (c != ' ')
							return COST_INFINITY;
					} else if (off < 4) {
						if (c != " of "[off])
							return COST_INFINITY;
					} else if (off == 4) {
						/*
						 * First character of the trailing name.  c is a
						 * plain char and may be negative; <ctype.h>
						 * functions require an unsigned char value, so
						 * mask it as the rest of this function does.
						 */
						if (!isgraph(c & 255))
							return COST_INFINITY;
					} else if (c < ' ' || (c & 255) > '~') {
						/* Outside printable ASCII: only newline is allowed */
						if (c != '\n')
							return COST_INFINITY;
						return ParseNewline(heap, pn, string);
					}
				}
			} else if (!perpage.tabsize) {	/* Radix-64 line */
				/* Line is "RlNFVF9UQU==   \n" */
				if (isspace(c & 255)) {
					if (!(pn->ps.flags & PS_FLAG_BINWS)) {
						/* First whitespace: data must end on a 4-char group */
						if ((pn->ps.pos - PREFIX_LENGTH) % 4 != 0)
							return COST_INFINITY;
						pn->ps.flags |= PS_FLAG_BINWS;
						/* A short line marks the end of the binary data */
						if (pn->ps.pos - PREFIX_LENGTH < BYTES_PER_LINE*4/3)
							pn->ps.flags |= PS_FLAG_BINEND;
					}
					if (c == '\n')
						return ParseNewline(heap, pn, string);
				} else if (pn->ps.flags & PS_FLAG_BINWS) {
					/* Nothing but whitespace may follow whitespace */
					return COST_INFINITY;
				} else if (c == RADIX64_END_CHAR) {
					/* Padding is only legal in the last two slots of a group */
					if ((pn->ps.pos - PREFIX_LENGTH) % 4 < 2)
						return COST_INFINITY;
					pn->ps.flags |= PS_FLAG_BINEND;
				} else if (pn->ps.flags & PS_FLAG_BINEND) {
					/* No data digits after padding has been seen */
					return COST_INFINITY;
				} else if (Radix64DigitValue(c) < 0) {
					return COST_INFINITY;
				}
			} else {	/* Normal line */
				/* Make sure tab stops come out right */
				if (pn->ps.flags & PS_FLAG_TAB) {
					if (((pn->ps.pos - PREFIX_LENGTH) % perpage.tabsize) == 0)
						pn->ps.flags &= ~PS_FLAG_TAB;
					else if (c != TAB_PAD_CHAR && c != '\n') {
						return COST_INFINITY;	/* Illegal */
					}
				}
				/*
				 * Yes, this code has hard-coded ASCII assumptions
				 * It knows that the acceptable range of '\n', ' '..'~',
				 * TAB_CHAR, FORMFEED_CHAR is in that order.
				 * Signed char machines have it backwards, to be confusing.
				 */
				if ((c & 255) < ' ') {
					/* Newline! (Or something illegal) */
					if (c != '\n')
						return COST_INFINITY;
					return ParseNewline(heap, pn, string);
				}
				/* A normal character */
				if ((c & 255) > '~') {
					/*
					 * Defensive only: PS_FLAG_INHEADER is clear on this
					 * branch (the header case was handled above).
					 */
					if (pn->ps.flags & PS_FLAG_INHEADER)
						return COST_INFINITY;	/* Illegal */
					if (c == TAB_CHAR)
						pn->ps.flags |= PS_FLAG_TAB;
					else if (c != FORMFEED_CHAR && c != SPACE_CHAR &&
												   c != CONTIN_CHAR)
						return COST_INFINITY;	/* Illegal */
				}
			}
			/* Consume the column; over-long lines are impossible */
			if (++pn->ps.pos > PREFIX_LENGTH + LINE_LENGTH)
				return COST_INFINITY;
			break;
	}
	return retval;
}

/*
 * Feed every character of the node's substitution output through
 * ParseAdvance(), in order.  Returns the sum of the per-character
 * penalties, or COST_INFINITY as soon as any character is rejected
 * (the remaining characters are then not examined).
 */
static HeapCost
ParseAdvanceString(Heap *heap, ParseNode *pn)
{
	char const *p;
	HeapCost step, sum = 0;

	for (p = pn->subst->output; *p != '\0'; p++) {
		step = ParseAdvance(heap, pn, p);
		if (step == COST_INFINITY)
			return COST_INFINITY;
		sum += step;
	}
	return sum;
}

/*
 * Per-file substitution use counts, indexed by Substitution.index.
 * Grown and accumulated in DoStats(), consulted in UseStats().
 */
static unsigned int *globalStats = NULL;
/* Number of entries of globalStats currently considered valid */
static unsigned globalSize = 0;
/* Number of times RepairFile() has launched the editor */
static unsigned globalEdits = 0;

/*
 * Walk every substitution list and act on the use counts in stats[]
 * (indexed by Substitution.index).  Does nothing if stats is NULL.
 *
 * Reporting: each substitution with a non-zero index that was used at
 * least once is printed to log with its count and cost(s), followed by
 * a total line giving the number of such changes and the grand total
 * of all substitutions counted (identity ones included), which serves
 * as an approximate character count.
 *
 * Maintenance of dynamic (learned at run time) substitutions: a used
 * one has its cost adjusted via SubstAdjust(); one that is unused both
 * in stats[] and in globalStats[] is unlinked and freed.
 *
 * (This is also called at the end with the per-file stats, which
 * repeats the cost adjustment; that is harmless.)
 */
static void
UseStats(unsigned int *stats, FILE *log)
{
	unsigned int row, col, uses, changes = 0;
	unsigned long grand = 0;
	Substitution *cur, **link;
	int inPad, outPad;

	if (stats == NULL)
		return;

	/* Column-major traversal of the table: this order is deliberate */
	for (col = 0; col < elemsof(*substitutions); col++) {
		for (row = 0; row < elemsof(substitutions); row++) {
			link = &substitutions[row][col];
			for (;;) {
				cur = *link;
				if (cur == 0)
					break;

				uses = stats[cur->index];
				grand += uses;

				/* Dynamic entries are either re-costed or dropped */
				if (SubstIsDynamic(cur)) {
					if (uses) {
						SubstAdjust(cur, uses);
					} else if (!globalStats[cur->index]) {
						/* Never used anywhere: unlink and release it */
						*link = cur->next;
						if (SubstIsNasty(cur))
							free((char *)cur->input);	/* Dynamically allocated */
						SubstFree(cur);
						continue;	/* *link already names the next entry */
					}
				}
				link = &cur->next;

				/* Index zero marks boring entries; unused ones are skipped too */
				if (!cur->index || !uses)
					continue;
				changes += uses;

				/* Pad short input/output strings out to three columns */
				inPad = cur->inlen>3 ? 0 : 3-(int)cur->inlen;
				outPad = cur->outlen>3 ? 0 : 3-(int)cur->outlen;
				fprintf(log, "\t%2ux \"%.*s\"%*s-> \"%.*s\"%*s(cost ",
					   stats[cur->index], (int)cur->inlen, cur->input,
					   inPad, "",
					   (int)cur->outlen, cur->output,
					   outPad, "");

				if (cur->cost == COST_INFINITY)
					fputs("-", log);
				else
					fprintf(log, "%d", cur->cost);

				/* A second cost is shown only when a filter is attached */
				if (cur->filter) {
					if (cur->cost2 == COST_INFINITY)
						fputs("/-", log);
					else
						fprintf(log, "/%d", cur->cost2);
				}

				if (SubstIsDynamic(cur))
					fputs(") ** LEARNED **\n", log);
				else
					fputs(")\n", log);
			}
		}
	}
	fprintf(log, "\tTotal: %u changes (out of %lu)\n", changes, grand);
}

/*
 * Tally the substitutions used along the chain from node back to the
 * root (following ->parent), add them to the per-file totals in
 * globalStats, and report them to log via UseStats() under a
 * "Page <page> substitutions:" heading.
 *
 * node may be NULL, in which case every per-page count is zero.
 * Exits the program if memory for the counters cannot be obtained.
 */
static void
DoStats(ParseNode const *node, unsigned int page, FILE *log)
{
	unsigned int *pageStats, *grown;
	unsigned int k;
	ParseNode const *walk;

	/* Make sure the per-file array covers every substitution index */
	if (substCount > globalSize) {
		grown = realloc(globalStats, substCount * sizeof(*grown));
		if (grown == NULL)  {
			fputs("Fatal error: out of memory for stats!\n", stderr);
			exit(1);
		}
		/* Newly added slots start out at zero */
		for (k = globalSize; k < substCount; k++)
			grown[k] = 0;
		globalSize = substCount;
		globalStats = grown;
	}

	/* Fresh, zeroed counters for this page (relies on calloc's zero fill) */
	pageStats = calloc(substCount, sizeof(*pageStats));
	if (pageStats == NULL) {
		fputs("Fatal error: out of memory for stats!\n", stderr);
		exit(1);
	}

	/* One count per node on the path back to the root */
	for (walk = node; walk != NULL; walk = walk->parent)
		pageStats[walk->subst->index]++;

	/* Fold the page counts into the per-file counts */
	for (k = 0; k < substCount; k++)
		globalStats[k] += pageStats[k];

	fprintf(log, "Page %u substitutions:\n", page);
	UseStats(pageStats, log);

	free(pageStats);
}

/*
 * Spit out a page of data (needs work).
 *
 * The page text is fetched with OutputGetUntil(oh, p4, -1), whose
 * return value is treated as the start of the text and p4 (the end of
 * the local buffer) as its end.  NOTE(review): this assumes
 * OutputGetUntil fills the buffer backwards from p4 -- confirm.
 *
 * Each line is written to out with leading and trailing whitespace
 * removed and a single '\n' appended; a final line without a newline
 * is still written.  Returns the number of lines written.
 */
static unsigned
PrintPage(OutputHandle oh, FILE *out)
{
	char pagebuf[PAGE_BUFFER_SIZE];
	char *p1;	/* Beginning of current line */
	char *p2;	/* End of current line (WS stripped) */
	char *p3;	/* End of current line (newline) */
	char *p4;	/* End of all output */
	unsigned lines = 0;

	p4 = pagebuf + sizeof(pagebuf);
	p1 = OutputGetUntil(oh, p4, -1);

	/* Output the lines without leading & trailing whitespace */
	while (p1 < p4) {
		/* Identify the line */
		p3 = memchr(p1, '\n', (size_t)(p4-p1));
		if (!p3)
			p3 = p4;
		/*
		 * Delete leading whitespace.  The bounds test comes first so
		 * that *p1 is never read at p3 == p4, one past the buffer.
		 */
		while (p1 < p3 && isspace((unsigned char)*p1))
			p1++;
		/*
		 * Delete trailing whitespace.  Again test bounds first, so
		 * p2[-1] never refers to a byte before the current line.
		 */
		p2 = p3;
		while (p1 < p2 && isspace((unsigned char)p2[-1]))
			p2--;
		/* Spit out this line */
		fwrite(p1, 1, (size_t)(p2-p1), out);
		putc('\n', out);
		lines++;
		/* An unterminated final line ends the page; don't form p4+1 */
		if (p3 == p4)
			break;
		/* Advance p1 past the newline */
		p1 = p3 + 1;
	}
	return lines;
}

/*
 * Number of SIGINTs seen since DoPage() last reset it.  Written from
 * the signal handler, so it is a volatile sig_atomic_t.  DoPage() stops
 * its search once this is non-zero, and RepairFile() skips launching
 * the editor when it exceeds 1.
 */
static volatile sig_atomic_t interrupt = 0;
/* Handler that was installed before inthandler; saved by DoPage() */
static void (* volatile oldhandler)(int) = SIG_DFL;

/*
 * SIGINT handler: count the interrupt.  On the third one, put the
 * previous handler back so that a further signal gets the original
 * behaviour.
 */
static void inthandler(int sig)
{
	if (++interrupt > 2)
		(void)signal(sig, oldhandler);
}

/*
 * Given a buffer, process a page from it and try to write a corrected page to
 * the out file.  Return the number of bytes accessed.  (0 if it was unable
 * to make any corrections.)
 *
 * buf, len - input text to repair.
 * out      - stream the corrected page is written to (via PrintPage).
 * page     - page number, used only for the stats heading in the log.
 * log      - stream that receives the substitution statistics.
 *
 * Side effects visible here: installs inthandler for SIGINT (it is not
 * uninstalled on return), resets the interrupt counter, updates the
 * perpage progress fields (maxpos, minpos, retries, max_retries, lines),
 * and prints "Stopping for manual edit." on failure.
 */
static size_t
DoPage(char const *buf, size_t len, FILE *out, unsigned int page, FILE *log)
{
	ParseNode *node;
	Heap heap;
	HeapCost cost;
	OutputHandle oh;
	void (*sighandler)(int);

	HeapInit(&heap, 1000);
	NodePoolInit();
	PerPageInit(buf);

	/* Initialize signal handling */
	interrupt = 0;
	sighandler = signal(SIGINT, inthandler);
	/*
	 * Remember whatever was installed before us.  On later calls the
	 * previous handler is inthandler itself, which must not be saved.
	 */
	if (sighandler != inthandler)
		oldhandler = sighandler;

	/* Make a root node */
	node = NodeAlloc();
	node->cost = 0;
	node->refcnt = 1;
	node->input = buf;
	node->subst = &substNull;
	ParseStateInit(&node->ps);
	node->parent = NULL;

	HeapInsert(&heap, &node->cost);

	/* The main loop: try to extend the current parse. */
	while ((node = (ParseNode *)HeapGetMin(&heap)) != NULL) {
		/* Run this node's substitution output through the parser */
		cost = ParseAdvanceString(&heap, node);
		if (cost != COST_INFINITY) {
			/* End of the file, or hit a second header line? */
			if (node->input == buf+len || InSecondHeader(&node->ps)) {
				/* Try to wrap up page, if page CRC works */
				if (node->ps.page_crc == perpage.page_check) {
					/* Success! */
					HeapDestroy(&heap);
					OutputInit(&oh, node, NULL);
					OverstrikeLine("", 0);

					if (InSecondHeader(&node->ps)) {
						/*
						 * Back up to last newline, so that the text of
						 * the following header is not part of this page.
						 */
						OutputInit(&oh, node, NULL);
						while (OutputGetPrev(&oh) != '\n')
							;
						OutputUnget(&oh);
					}
					/* oh points to node that emitted last char on page */
					len = oh.node->input - buf; /* Chars eaten this page */
					perpage.lines = PrintPage(oh, out);
					DoStats(oh.node, page, log);
					NodePoolCleanup();
					return len;
				}
				/* CRC mismatch: this candidate is dropped (released below) */
			} else {
				/* Keep working on the page */
				node->cost = cost += node->cost;
				/* Track the furthest progress and furthest backtrack */
				if (node->input > perpage.maxpos) {
					perpage.maxpos = perpage.minpos = node->input;
					if (perpage.max_retries < perpage.retries)
						perpage.max_retries = perpage.retries;
					perpage.retries = 0;	/* Made progress */
				} else if (node->input < perpage.minpos) {
					perpage.minpos = node->input;	/* Furthest backtrack */
				}
				++perpage.retries;
				/*
				 * Give up when the heap has grown too large or the user
				 * hit interrupt.  NOTE(review): presumably HeapDestroy
				 * leaves the heap empty so the loop ends on the next
				 * HeapGetMin -- confirm against heap.c.
				 */
				if (heap.numElems > MAX_HEAP || interrupt)
					HeapDestroy(&heap);
				else
					AddChildren(node, &heap, buf+len);
			}
		}
		NodeRelease(node);
	}
	/* Failed! */
	OverstrikeLine(NULL, 0);
	puts("Stopping for manual edit.");

	NodePoolCleanup();
	/*
	 * Get rid of the dynamic substitutions.  With a NULL node every
	 * per-page count is zero, so UseStats purges the dynamic entries
	 * that have no per-file uses either.
	 */
	DoStats(NULL, page, log);

	return 0;
}

/*
 * The magic file-shuffling routine.
 *
 * name       - base file name.  Uses "<name>.log" and "<name>.out" (both
 *              opened for append), "<name>.in" (preferred input; falls
 *              back to <name> itself) and "<name>.dmp" (scratch file for
 *              the not-yet-corrected remainder, renamed to "<name>.in").
 * editor     - printf-style command format taking the line number (%u)
 *              and the file name (%s), "-" to never launch an editor, or
 *              NULL to use $VISUAL, then $EDITOR.
 * nastylines - name of the nasty-lines file, re-read before each pass.
 *
 * Pages are corrected one at a time with DoPage().  When a page cannot
 * be corrected the rest of the input is copied through unchanged, the
 * error location is reported, and (if an editor is available and the
 * user has not interrupted more than once) the editor is run on the
 * .in file and the whole pass is retried.
 *
 * Returns 0 on completion, including when a manual fix is still needed;
 * otherwise the errno value captured when a file operation failed.
 */
static int
RepairFile(char const *name, char const *editor, char const *nastylines)
{
	char buf[PAGE_BUFFER_SIZE];
	char *filename;
	char const *p;
	size_t namelen;
	FILE *in = 0, *out = 0, *dump = 0, *log = 0;
	size_t inbytes;		/* Bytes in input buffer */
	size_t outbytes;	/* Bytes taken from input buffer */
	unsigned int pages = 0;	/* # of pages processed */
	unsigned int lines = 0;	/* # of lines processed (until trouble) */
	unsigned int minline = 0, maxline = 0;	/* Where is the error? */
	int giveup;			/* Have we had to abort corrections? */
	int err;			/* Copy of errno for returns */

	globalSize = 0;	/* Reset global stats */

	/* Room for the name plus the longest suffix used below */
	namelen = strlen(name);
	if (!(filename = malloc(namelen+10))) {
		p = "Unable to allocate memory\n";
		goto error;
	}

	memcpy(filename, name, namelen);
	strcpy(filename+namelen, ".log");
	puts(filename);
	if (!(log = fopen(filename, "at"))) {
		p = "Unable to open log file \"%s\"\n";
		goto error;
	}

	strcpy(filename+namelen, ".out");
	puts(filename);
	if (!(out = fopen(filename, "at"))) {
		p = "Unable to open output file \"%s\"\n";
		goto error;
	}

retry:
	/* Read in any new nasty lines; a missing file is only a warning */
	if (!(in = fopen(nastylines, "rt"))) {
		fprintf(stderr, "Unable to open nasty lines file \"%s\"\n", nastylines);
	} else {
		ReadNasties(in);
		fclose(in);
	}
	/* Try to open input file - .in or original */
	p = filename;
	strcpy(filename+namelen, ".in");
	if (!(in = fopen(filename, "rt"))) {
		if (!(in = fopen(name, "rt"))) {
			filename[namelen] = 0;	/* Report the bare name */
			p = "Unable to open input file \"%s\"\n";
			goto error;
		}
		p = name;
	}
	printf("Repairing from %s\n", p);
	strcpy(filename+namelen, ".dmp");
	if (!(dump = fopen(filename, "wt"))) {
		p = "Unable to open output file \"%s\"\n";
		goto error;
	}

	giveup = 0;
	inbytes = 0;	/* Bytes already at the front of the buffer */
	/* Append more data from the file; stop when the buffer stays empty */
	while ((inbytes += fread(buf+inbytes, 1, sizeof(buf)-inbytes, in)) != 0) {
		if (giveup) {
			/* Giving up mode - just copy through */
			outbytes = fwrite(buf, 1, inbytes, dump);
			if (!outbytes) {
				p = "Error writing dump file!\n";
				goto error;
			}
		} else {
			outbytes = DoPage(buf, inbytes, out, pages+1, log);
			NodePoolCleanup();
			if (outbytes) {
				pages++;
				lines += perpage.lines;
			} else {	/* Failed */
				/* Find range of backtracking for error location */
				minline = 1;
				for (p = buf;  p < perpage.minpos; p++)
					minline += (*p == '\n');
				for (maxline = minline; p < perpage.maxpos; p++)
					maxline += (*p == '\n');
				giveup = 1;
			}
		}
		/* Fewer bytes now in the buffer */
		inbytes -= outbytes;
		/* Move those bytes to the front again */
		memmove(buf, buf+outbytes, inbytes);
	}

	fclose(in);
	in = 0;
	fclose(dump);
	dump = 0;

	/* Okay, let's get tricky: buf is reused to hold the .dmp name */
	memcpy(buf, name, namelen);
	strcpy(buf+namelen, ".dmp");
	strcpy(filename+namelen, ".in");

	/* teun: MS Visual C doesn't rename on top of existing file; remove it */
	if (remove(filename) != 0) {
		fprintf(stderr, "Warning deleting %s\n", filename);
	}

	if (rename(buf, filename) != 0) {
		err = errno;
		fclose(out);
		fclose(log);
		/* teun: corrected buf, filename order. This cost me an hour */
		fprintf(stderr, "Error renaming %s -> %s\n", buf, filename);
		free(filename);	/* Don't leak the name buffer on this exit */
		return err;
	}

	/* This code is spaghetti - is there a cleaner way? */
	if (giveup) {
		printf("Error in %s, lines %u-%u\n", filename, minline, maxline);
		fprintf(log, "Error in %s, lines %u-%u\n", filename, minline, maxline);
		/* Repeated interrupts mean the user wants out, not an editor */
		if (interrupt > 1)
			goto manual;
		if (editor) {
			if (strcmp(editor, "-") == 0)
				goto manual;
			/* editor is a caller-supplied format: "%u" then "%s" */
			sprintf(buf, editor, maxline, filename);
		} else {
			p = getenv("VISUAL");
			if (!p)
				p = getenv("EDITOR");
			if (!p)
				goto manual;
			sprintf(buf, "%s +%u %s\n", p, maxline, filename);
		}
		printf("Executing %s\n", buf);
		globalEdits++;
		if (system(buf) == 0)
			goto retry;
		fputs("Edit failed - aborting\n", stderr);
manual:
		puts("Please fix the error by hand and re-run repair.");
	}

	fclose(out);
	free(filename);

	fprintf(log, "\n%u lines successfully processed.\n", lines);
	fprintf(log, "Overall substitutions (%u pages):\n", pages);
	UseStats(globalStats, log);
	printf("%u manual edits required\n", globalEdits);
	fclose(log);

	return 0;

error:
	/* Capture errno before the fclose calls can disturb it */
	err = errno;
	if (log) fclose(log);
	if (dump) fclose(dump);
	if (out) fclose(out);
	if (in) fclose(in);
	/* p is one of the literal messages above, with at most one %s */
	fprintf(stderr, p, filename);
	free(filename);
	return err;
}

/*
 * Process the command line, calling RepairFile as needed.
 *
 * Leading options:
 *   -e<fmt> / -e <fmt>    editor command format passed to RepairFile
 *   -l<file> / -l <file>  nasty-lines file (default "nastylines")
 *   --                    end of options
 * Any other option is reported and the program returns 1.  Remaining
 * arguments are repaired in order; the first failure is reported with
 * strerror() and the program returns 1.  Returns 0 otherwise.
 */
int
main(int argc, char *argv[])
{
	int		result = 0;
	int		i;
	char const *editor = NULL;
	char const *nastylines = "nastylines";

	InitUtil();
	SubstBuild();
	memPoolInit(&nastyStructs);
	memPoolInit(&nastyStrings);

	/* Process leading flags */
	for (i = 1; i < argc && argv[i][0] == '-'; i++) {
		if (argv[i][1] == '-' && argv[i][2] == 0) {
			i++;
			break;
		} else if (argv[i][1] == 'e') {
			/* Value is either attached or the next argument */
			editor = argv[i][2] ? argv[i]+2 : argv[++i];
		} else if (argv[i][1] == 'l') {
			nastylines = argv[i][2] ? argv[i]+2 : argv[++i];
		} else {
			/*
			 * Report the offending argument itself.  Nothing may touch
			 * i or look past the option letter here: argv[i] can be a
			 * bare "-", and argv[i+1] can be NULL.
			 */
			fprintf(stderr, "ERROR: Unrecognized option %s\n", argv[i]);
			return 1;
		}
	}

	/* Process files */
	for (; i < argc; i++) {
		result = RepairFile(argv[i], editor, nastylines);
		if (result != 0) {
			fprintf(stderr, "Fatal error: %s\n", strerror(result));
			return 1;
		}
	}

	return 0;
}

/*
 * Local Variables:
 * tab-width: 4
 * End:
 * vi: ts=4 sw=4
 * vim: si
 */