ocr-tools/tools/munge.c
2019-05-15 16:55:03 +02:00

544 lines
13 KiB
C

/*
* munge.c -- Program to convert a text file into "munged" form,
* suitable for reconstruction from printed form. Tabs are
* made visible and checksums are added to each line and each
* page to protect against transcription errors.
*
* Copyright (C) 1997 Pretty Good Privacy, Inc.
*
* Designed by Colin Plumb, Mark H. Weaver, and Philip R. Zimmermann
* Written by Mark H. Weaver
*
* $Id: munge.c,v 1.32 1997/11/12 23:28:53 mhw Exp $
*/
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include "util.h"
/*
* The file is divided into pages, and the format of each page is
*
--f414 000b2dc79af40010002 Page 1 of munge.c
bc38e5 /*
40a838 * munge.c -- Program to convert a text file into munged form
647222 *
193f28 * Copyright (C) 1997 Pretty Good Privacy, Inc.
827222 *
699025 * Designed by Colin Plumb, Mark H. Weaver, and Philip R. Zimmermann
0d050c * Written by Mark H. Weaver
*
* Where the first 2 columns are the high 8 bits (in hex) of a running
* CRC-32 of the page (the string "--", unlikely to be confused with
* any digits, indicates a page header line) and the next 4 columns
* are a CRC-16 of the rest of the line. Then a space (not counted in
* the CRC), and the line of text. Tabs are printed as the currency
* symbol (ISO Latin 1 character 164) followed by the appropriate number
* of spaces, and any form feeds are printed as a yen symbol (Latin 1 165).
* The CRC is computed on the transformed line, including the trailing
* newline. No trailing whitespace is permitted.
*
* The header line contains a (hex) number of the form 0ffcccccccctpppnnnn,
* where the digit 0 is a version number, ff are flags, ccccccc is the CRC-32
* of the page, t is the tab size (usually 4 or 8; 0 for binary files that
* are sent in radix-64), ppp is the product number (usually 1, different
* for different books), and nnnn is the file number (sequential from 1).
*
* This is followed by " Page %u of " and the file name.
*/
typedef struct MungeState
{
EncodeFormat const * fmt;
EncodeFormat const * hFmt;
int binaryMode, tabWidth;
long origLineNumber;
long productNumber, fileNumber, pageNumber, lineNumber;
unsigned long fileOffset;
CRC pageCRC;
char const * fileName;
char const * fileNameTail;
char * pageBuffer; /* Buffer large enough to hold one page */
char * pagePos; /* Current position in pageBuffer */
word16 hdrFlags;
FILE * file;
FILE * out;
} MungeState;
void ChecksumLine(EncodeFormat const *fmt, char const *line, size_t length,
char *prefix, CRC *pageCRC)
{
CRC lineCRC;
CRC runCRCPart = 0;
lineCRC = CalculateCRC(fmt->lineCRC, 0, (byte const *)line, length);
if (pageCRC != NULL)
{
*pageCRC = CalculateCRC(fmt->pageCRC, *pageCRC,
(byte const *)line, length);
runCRCPart = RunningCRCFromPageCRC(fmt, *pageCRC);
}
prefix += EncodeCheckDigits(fmt, runCRCPart, fmt->runningCRCBits, prefix);
prefix += EncodeCheckDigits(fmt, lineCRC, fmt->lineCRC->bits, prefix);
*prefix++ = ' '; /* Write a space over the null byte */
}
/* Returns 1 for convenience */
int PrintFileError(MungeState *state, char const *message)
{
fprintf(stderr, "%s in %s %s %lu\n", message, state->fileName,
state->binaryMode ? "offset" : "line",
state->binaryMode ? state->fileOffset : state->origLineNumber);
return 1;
}
int MungeLine(MungeState *state, char *buffer, int length,
char *line, int *bufferUsed)
{
int i = 0, j = 0, jOld = 0;
char ch;
for (i = 0; i < length && j < LINE_LENGTH; i++)
{
jOld = j;
ch = buffer[i];
if (ch == '\t')
{
line[j++] = TAB_CHAR;
if (state->tabWidth < 1)
return PrintFileError(state,
"ERROR: Tab found in radix64 stream");
else
while (j % state->tabWidth && j < LINE_LENGTH)
line[j++] = TAB_PAD_CHAR;
}
else if (ch == '\n')
{
if (i + 1 < length)
return PrintFileError(state,
"UNEXPECTED ERROR: fgets read past newline!?");
break;
}
else if (ch == '\f')
{
break;
}
else if (ch == ' ' && (j <= 0 || line[j-1] == ' ' ||
line[j-1] == SPACE_CHAR ||
i+1 >= length || buffer[i+1] == '\n'))
{
line[j++] = SPACE_CHAR;
}
else if (ch >= ' ' && ch <= '~')
line[j++] = ch;
else
return PrintFileError(state, "ERROR: Non-ASCII char");
}
if (i < length && buffer[i] == '\n')
{
i++;
state->origLineNumber++;
}
else if (i < length && buffer[i] == '\f' && j < LINE_LENGTH)
{
i++;
line[j++] = FORMFEED_CHAR;
}
else
{
/* If there's no newline, we need to add the continuation marker */
if (i > 0 && j >= LINE_LENGTH)
{
/* Remove the last character if we're out of room */
i--;
j = jOld;
}
line[j++] = CONTIN_CHAR;
}
/* Strip trailing spaces */
while (j > 0 && isspace((unsigned char)line[j - 1]))
j--;
if (j > LINE_LENGTH) /* This should never happen */
return PrintFileError(state, "ERROR: Internal error, line too long");
/* Add trailing newline and NULL */
line[j++] = '\n';
line[j++] = '\0';
/* Return number of chars used from buffer */
*bufferUsed = i;
return 0;
}
static void
Encode3(byte const src[3], char dest[4])
{
dest[0] = radix64Digits[ (src[0]>>2 & 0x3f)];
dest[1] = radix64Digits[(src[0]<<4 & 0x30) | (src[1]>>4 & 0x0f)];
dest[2] = radix64Digits[(src[1]<<2 & 0x3c) | (src[2]>>6 & 0x03)];
dest[3] = radix64Digits[(src[2] & 0x3f)];
}
static int
EncodeLine(byte const *src, int srcLen, char *dest)
{
char * destp = dest;
byte tempSrc[3];
for (; srcLen >= 3; srcLen -= 3)
{
Encode3(src, destp);
src += 3; destp += 4;
}
if (srcLen > 0)
{
memset(tempSrc, 0, sizeof(tempSrc));
memcpy(tempSrc, src, srcLen);
Encode3(src, destp);
src += 3; destp += 4; srcLen -= 3;
while (srcLen < 0)
destp[srcLen++] = RADIX64_END_CHAR;
}
return destp - dest;
}
static int
MungeBinaryLine(MungeState *state, byte const *buffer, int length, char *line)
{
char binLine[128];
int binLength; /* Destination length */
int used;
binLength = EncodeLine(buffer, length, binLine);
/* Append newline */
binLine[binLength++] = '\n';
binLine[binLength] = '\0';
return MungeLine(state, binLine, binLength, line, &used);
}
int MaybePageBreak(MungeState *state)
{
EncodeFormat const * fmt = state->fmt;
EncodeFormat const * hFmt = state->hFmt;
if (state->lineNumber >= LINES_PER_PAGE)
{
char line[512];
char * lineData = line + PREFIX_LENGTH;
char * p = lineData;
p += EncodeCheckDigits(hFmt, 0, HDR_VERSION_BITS, p);
p += EncodeCheckDigits(hFmt, state->hdrFlags, HDR_FLAG_BITS, p);
p += EncodeCheckDigits(hFmt, state->pageCRC, fmt->pageCRC->bits, p);
p += EncodeCheckDigits(hFmt, state->tabWidth, HDR_TABWIDTH_BITS, p);
p += EncodeCheckDigits(hFmt, state->productNumber, HDR_PRODNUM_BITS, p);
p += EncodeCheckDigits(hFmt, state->fileNumber, HDR_FILENUM_BITS, p);
sprintf(p, " Page %ld of %s\n", state->pageNumber + 1,
state->fileNameTail);
if (strlen(lineData) > LINE_LENGTH + 1)
{
PrintFileError(state, "ERROR: Header line too long");
fprintf(stderr, "> %s", lineData);
return -1;
}
/* Compute checksums and prefix them to line */
ChecksumLine(fmt, lineData, strlen(lineData), line, NULL);
fprintf(state->out, "%c%c%s\n%s\f", HDR_PREFIX_CHAR,
fmt->headerTypeChar, line + 2, state->pageBuffer);
state->pageNumber++;
state->lineNumber = 0;
state->pageCRC = 0;
state->pagePos = state->pageBuffer; /* Clear page buffer */
}
return 0;
}
/*
* Search for Emacs "tab-width: " maker in file.
* Emacs is stricter about the format, but this will do.
*/
int FindTabWidth(MungeState *state)
{
char const * const tabWidthMarker = " tab-width: ";
char buffer[512];
char * p;
int length;
int tabWidth = 0;
fseek(state->file, -(sizeof(buffer) - 1), SEEK_END);
length = fread(buffer, 1, sizeof(buffer) - 1, state->file);
buffer[length] = '\0';
p = strstr(buffer, tabWidthMarker);
if (p != NULL)
{
p += strlen(tabWidthMarker);
while (*p != '\0' && *p != '\n' && isspace(*p))
p++;
tabWidth = strtol(p, &p, 10);
while (*p != '\0' && *p != '\n' && isspace(*p))
p++;
if (*p != '\n' || tabWidth < 2)
tabWidth = 0;
else if (tabWidth > 16)
fprintf(stderr, "WARNING: Weird tab-width (%d), %s\n",
tabWidth, state->fileName);
}
return tabWidth;
}
/*
* Open the given source file and send the munged output to the
* FILE *, with the given options.
*/
int MungeFile(char const *fileName, FILE *out, EncodeFormat const *fmt,
int binaryMode, int defaultTabWidth,
long productNumber, long fileNumber)
{
MungeState * state;
int length, used;
char line[PREFIX_LENGTH + LINE_LENGTH + 10];
char * lineData = line + PREFIX_LENGTH;
char buffer[128];
int result = 0;
state = (MungeState *)calloc(1, sizeof(*state));
state->fmt = fmt;
state->hFmt = &hexFormat;
state->origLineNumber = 1;
state->fileName = fileName;
state->pageCRC = 0;
state->productNumber = productNumber;
state->fileNumber = fileNumber;
state->pageNumber = 0;
state->lineNumber = 0;
state->fileOffset = 0;
state->binaryMode = binaryMode;
state->pageBuffer = malloc(PAGE_BUFFER_SIZE);
state->pageBuffer[0] = '\0';
state->pagePos = state->pageBuffer;
state->hdrFlags = 0;
state->out = out;
state->fileNameTail = strrchr(state->fileName, '/');
if (state->fileNameTail == NULL)
state->fileNameTail = state->fileName;
else
state->fileNameTail++;
state->file = fopen(state->fileName, binaryMode ? "rb" : "r");
if (state->file == NULL)
{
result = errno;
fprintf(stderr, "ERROR opening %s: %s\n",
state->fileName, strerror(result));
goto error;
}
if (state->binaryMode)
{
state->tabWidth = 0;
}
else
{
state->tabWidth = FindTabWidth(state);
if (state->tabWidth == 0)
state->tabWidth = defaultTabWidth;
rewind(state->file);
}
while (!feof(state->file))
{
if (state->binaryMode)
{
length = fread(buffer, 1, BYTES_PER_LINE, state->file);
if (length < 1)
{
if (feof(state->file))
break;
goto fileError;
}
if ((result = MaybePageBreak(state)))
goto error;
if ((result = MungeBinaryLine(state, buffer, length, lineData)))
goto error;
state->fileOffset += length;
}
else
{
if (fgets(buffer, sizeof(buffer), state->file) == NULL)
{
if (feof(state->file))
break;
goto fileError;
}
length = strlen(buffer);
if ((result = MaybePageBreak(state)))
goto error;
if ((result = MungeLine(state, buffer, length, lineData, &used)))
goto error;
if (used < length)
if (fseek(state->file, used - length, SEEK_CUR))
goto fileError;
}
/* Compute checksums and prefix them to the line */
ChecksumLine(fmt, lineData, strlen(lineData), line, &state->pageCRC);
strcpy(state->pagePos, line);
length = strlen(state->pagePos);
/* Suppress trailing whitespace on blank lines */
if (length == PREFIX_LENGTH+1 && state->pagePos[length-1] == '\n') {
state->pagePos[--length-1] = '\n';
state->pagePos[length] = '\0';
}
state->pagePos += length;
state->lineNumber++;
}
if (state->lineNumber > 0)
{
/* Force a final page break */
state->lineNumber = LINES_PER_PAGE;
state->hdrFlags |= HDR_FLAG_LASTPAGE;
if ((result = MaybePageBreak(state)))
goto error;
}
result = 0;
goto done;
fileError:
result = ferror(state->file);
error:
done:
if (state != NULL)
{
if (state->file != NULL)
fclose(state->file);
free(state);
}
return result;
}
int main(int argc, char *argv[])
{
int result = 0;
int i, j;
int defaultTabWidth = 4;
int binaryMode = 0;
long productNumber = 1;
long fileNumber = 1;
char * endOfNumber;
EncodeFormat const * fmt = NULL;
InitUtil();
for (i = 1; i < argc && argv[i][0] == '-'; i++)
{
if (0 == strcmp(argv[i], "--"))
{
i++;
break;
}
for (j = 1; argv[i][j] != '\0'; j++)
{
if (isdigit(argv[i][j]))
{
defaultTabWidth = argv[i][j] - '0';
if (defaultTabWidth < 2 || defaultTabWidth > 9)
fprintf(stderr, "WARNING: Weird default tab-width (%d)\n",
defaultTabWidth);
}
else if (argv[i][j] == 'b')
{
binaryMode = 1;
}
else if (argv[i][j] == 'F')
{
fmt = FindFormat(argv[i][j+1]);
if (!fmt || argv[i][j+2] != '\0')
{
fprintf(stderr, "ERROR: Invalid format char\n");
exit(1);
}
break;
}
else if (argv[i][j] == 'p')
{
productNumber = strtol(&argv[i][j+1], &endOfNumber, 10);
if (*endOfNumber != '\0')
{
fprintf(stderr, "ERROR: Invalid product number\n");
exit(1);
}
break;
}
else if (argv[i][j] == 'f')
{
fileNumber = strtol(&argv[i][j+1], &endOfNumber, 10);
if (*endOfNumber != '\0')
{
fprintf(stderr, "ERROR: Invalid file number\n");
exit(1);
}
break;
}
else
{
fprintf(stderr, "ERROR: Unrecognized option -%c\n", argv[i][j]);
exit(1);
}
}
}
if (!fmt)
fmt = binaryMode ? &radix64Format : &hexFormat;
for (; i < argc; i++)
{
if ((result = MungeFile(argv[i], stdout, fmt, binaryMode,
defaultTabWidth, productNumber,
fileNumber)) != 0)
{
/* If result > 0, message should have already been printed */
if (result < 0)
fprintf(stderr, "ERROR: %s\n", strerror(result));
exit(1);
}
fileNumber++;
}
return 0;
}
/*
* Local Variables:
* tab-width: 4
* End:
* vi: ts=4 sw=4
* vim: si
*/