/*
 * Magic Rescue, text extraction
 * Copyright (C) 2004 Jonas Jensen
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

#include "config.h"

/*
 * NOTE(review): the system header names below were reconstructed from the
 * identifiers this file uses (malloc, fprintf, strcmp, errno, UCHAR_MAX,
 * off_t, open, read/write/lseek/getopt) -- confirm against the build.
 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>

#include "util.h"

/* Tunables, overridable from the command line (see usage()). */
static int max_score   = 9;     /* -s: stop when the running score exceeds this */
static int block_score = 5;     /* -b: minimum score applied at a 512-byte boundary */
static int max_line    = 5*80;  /* -l: longest accepted line, in bytes */
static int max_reverse = 0;     /* -r: how many bytes to scan backwards */
static long max_bytes  = 0;     /* -M: byte limit; values <= 0 disable the check */

static size_t bufsize = 8192;   /* size of buf; grown in main() to fit max_reverse */
static unsigned char *buf;      /* shared I/O buffer, allocated in main() */

/* Per-byte lookup tables filled by make_luts(): score delta and repeat limit. */
static short scorelut[UCHAR_MAX+1], replut[UCHAR_MAX+1];

/*
 * One scoring rule: every byte covered by `ranges` gets `score` and `rep`.
 * `ranges` is a string of single bytes and "a-b" byte ranges.
 */
struct rule {
    short score, rep;
    const char *ranges;
};

enum direction { DIR_FORWARD, DIR_REVERSE };
static enum direction direction = DIR_FORWARD;

/* Running state of one scan, updated by inspect_char(). */
struct scores {
    unsigned char *last_letter;  /* last inspected byte whose score was <= 0 */
    unsigned char repeated_char; /* the previously inspected byte */
    int sum_score;               /* running score, never below 0 */
    int sum_repeats;             /* consecutive repeats of repeated_char */
    int cur_line;                /* bytes seen since the last CR or LF */
    off_t offset;                /* file offset being tracked, or -1 if unknown */
    long bytes_processed;        /* bytes counted towards max_bytes */
};

/**
 * Initializes s to default values. Sets s->offset to offset, which may be -1
 * for unknown.
 */
static void scores_init(struct scores *s, off_t offset)
{
    s->last_letter     = NULL;
    s->repeated_char   = '\0';
    s->sum_score       = 0;
    s->sum_repeats     = 0;
    s->cur_line        = 0;
    s->offset          = offset;
    s->bytes_processed = 0;
}

/**
 * Inspects a single char on the address p.
 *
 * Returns 0 for "keep going".
 * Returns 1 for "EOF found", in which case the EOF will be after
 * s->last_letter.
 */
static int inspect_char(struct scores *s, unsigned char *p)
{
    const unsigned char c = *p;
    int score, max_repeats;

    /* Handle block offset score: raise the score to at least block_score
     * whenever the tracked offset sits on a 512-byte boundary. */
    if (s->offset >= 0 && block_score > 0) {
        if ((s->offset & 511) == 0 && s->sum_score < block_score)
            s->sum_score = block_score;

        s->offset += (direction == DIR_FORWARD ? 1 : -1);
    }

    /* Handle character score */
    score = scorelut[c];
    s->sum_score += score;
    if (s->sum_score < 0)
        s->sum_score = 0;

    if (score <= 0)
        s->last_letter = p;

    if (score > 0) {
        fprintf(stderr, "score +%d for 0x%02X\n", score, c);
    }

    if (s->sum_score > max_score) {
        fprintf(stderr, "Score too high at %lld\n", (long long)s->offset);
        fprintf(stderr, "%d > %d\n", s->sum_score, max_score);
        return 1;
    }

    /* Handle repeat; a repeat limit of 0 in replut means "no limit" */
    if (s->repeated_char == c && (max_repeats = replut[c])) {
        s->sum_repeats++;
        if (s->sum_repeats > max_repeats) {
            fprintf(stderr, "Too many repeats of '%c' (0x%02X)\n", c, c);
            fprintf(stderr, "%d > %d at %lld\n", s->sum_repeats, max_repeats,
                    (long long)s->offset);
            return 1;
        }
    } else {
        s->sum_repeats = 0;
    }
    s->repeated_char = c;

    /* Handle line length */
    if (max_line > 0) {
        if (c == '\r' || c == '\n') {
            s->cur_line = 0;
        } else if (++s->cur_line > max_line) {
            fprintf(stderr, "Line too long at %lld\n", (long long)s->offset);
            return 1;
        }
    }

    /* handle max bytes */
    if (max_bytes > 0 && ++s->bytes_processed > max_bytes) {
        fprintf(stderr, "Wrote max bytes\n");
        return 1;
    }

    return 0;
}

/**
 * Fills scorelut[] and replut[] from the built-in rule table.  Rules are
 * applied in order, so a later rule overrides an earlier one for the bytes
 * it covers.
 */
static void make_luts(void)
{
    /* see http://www.bbsinc.com/iso8859.html */
    static const struct rule rules[] = {
        /* default values */
        {  0, 120, "\x01-\xFF"},

        /* never used control characters */
        {  4, 0, "\x01-\x1F" },

        /* 8-bit characters */
        {  1, 8, "\x80-\xFF" },

        /* characters not in ISO 8859-1 */
        {  2, 0, "\x7F-\x9F" },

        /* characters in Windows latin-1 */
        {  1, 0, "\x82-\x8C\x91-\x9C\x9F" },

        /* rarely used control chars: EOF, bell, backspace, form feed, ESC */
        {  2, 0, "\x04\x07\x08\x0C\x1B" },

        /* the NUL character (the empty string's terminator is the byte) */
        { 10, 0, "" },

        /* 0xFF */
        {  3, 0, "\xFF" },

        /* whitespace */
        { -1, 180, " \t\r\n" },

        /* English letters and numbers */
        { -2, 80, "a-zA-Z0-9" },

        { 0, 0, NULL }
    };
    const struct rule *rule;

    for (rule = rules; rule->ranges != NULL; rule++) {
        /* Read the spec as unsigned bytes so values >= 0x80 index correctly. */
        const unsigned char *range = (const unsigned char *)rule->ranges;
        unsigned char a, b;
        int i;

        do {
            a = b = *(range++);
            /* "a-b" form: take the upper bound and skip past "-b" */
            if (a && *range == '-' && (b = range[1])) {
                range += 2;
            }

            for (i = a; i <= b; i++) {
                scorelut[i] = rule->score;
                replut[i]   = rule->rep;
            }
        } while (a && *range);
    }
}

/**
 * Writes exactly count bytes from ptr to fd, looping over short writes and
 * retrying when write() is interrupted by a signal.
 *
 * Returns count on success, -1 on error (errno is left as set by write()).
 */
static ssize_t write_all(int fd, const void *ptr, size_t count)
{
    const char *data = ptr;
    size_t written = 0;

    while (written < count) {
        ssize_t rv = write(fd, data + written, count - written);
        if (rv < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        written += (size_t)rv;
    }
    return (ssize_t)written;
}

/** Prints the command line synopsis and current defaults to stderr. */
static void usage(void)
{
    fprintf(stderr,
"Usage: textextract [-r MAX_REVERSE] [-M MAX_BYTES] [-s MAX_SCORE]\n"
" [-b BLOCK_SCORE] OUTPUT_FILE|-\n"
"\n"
"Tries to recognize human-readable text among binary junk.\n"
"Expects a file, preferably seekable, on standard intput. Writes to \n"
"OUTPUT_FILE, or stdout if it's \"-\".\n"
"\n"
" -r Read backwards to find beginning of file up to MAX_REVERSE bytes.\n"
" -M Set the max number of bytes to output. Default unlimited.\n"
" -s Set max score before quitting. [%d]\n"
" -l Set max line length, in bytes. [%d]\n"
" -b Assign this value to the score when crossing a block boundary. [%d]\n"
        , max_score, max_line, block_score);
}

/**
 * Scans backwards from the current position of stdin (fd 0), at most
 * min(bufsize, s->offset, max_reverse) bytes, and writes the accepted tail
 * of that region to outfd.  The seek-back followed by the read leaves the
 * file position where it started.
 *
 * Returns 0 on success (including "nothing to do" when the offset is
 * unknown or zero), -1 on seek, read or write failure.
 */
static int read_backward(struct scores *s, int outfd)
{
    ssize_t read_count, i;

    if (s->offset <= 0)
        return 0;

    read_count = (ssize_t)bufsize;
    if (s->offset < (off_t)bufsize)
        read_count = (ssize_t)s->offset;
    if (max_reverse < read_count)
        read_count = max_reverse;

    errno = 0;
    if (lseek(0, -read_count, SEEK_CUR) < 0 ||
            read(0, buf, read_count) != read_count) {
        perror("Reading backwards");
        return -1;
    }

    /* Walk from the last byte to the first; indexing avoids ever forming a
     * pointer before the start of buf. */
    for (i = read_count; i-- > 0; ) {
        if (inspect_char(s, buf + i)) {
            /* Stopped: emit from the earliest accepted byte to the end. */
            if (s->last_letter) {
                size_t skip = (size_t)(s->last_letter - buf);
                if (write_all(outfd, s->last_letter,
                            (size_t)read_count - skip) < 0) {
                    perror("Write error");
                    return -1;
                }
            }
            return 0;
        }
    }

    /* Never stopped: the whole region is accepted. */
    if (write_all(outfd, buf, (size_t)read_count) < 0) {
        perror("Write error");
        return -1;
    }
    return 0;
}

/**
 * Reads stdin (fd 0) forward into buf, inspecting every byte until
 * inspect_char() signals the end or input runs out.  Whenever buf fills up,
 * everything up to and including s->last_letter is flushed to outfd and the
 * not-yet-accepted remainder is moved to the front of buf.
 *
 * On return 0 the caller still has to write buf .. s->last_letter
 * (inclusive) if s->last_letter is non-NULL.  Returns -1 on write failure or
 * if the buffer filled without any accepted byte.
 */
static int read_forward(struct scores *s, int outfd)
{
    ssize_t read_count;
    unsigned char *p, *bufpos;

    bufpos = buf;
    while ((read_count = read(0, bufpos, bufsize - (bufpos-buf)) ) > 0) {
        for (p = bufpos; p-bufpos < read_count; p++) {
            if (inspect_char(s, p)) {
                return 0;
            }
        }
        bufpos = p;

        if (bufsize == (size_t)(bufpos-buf)) {
            unsigned char *rest;
            size_t pending;

            /* buffer full, flush to stdout */
            if (!s->last_letter) {
                /* buffer not big enough, this shouldn't happen */
                fprintf(stderr, "textextract: internal error\n");
                return -1;
            }

            /* Flush through last_letter inclusive.  Stopping one byte short
             * and then forgetting last_letter would drop that byte if no
             * further byte gets accepted before the scan ends. */
            rest    = s->last_letter + 1;
            pending = (size_t)(bufpos - rest);
            if (write_all(outfd, buf, (size_t)(rest - buf)) < 0) {
                perror("Write error");
                return -1;
            }
            memmove(buf, rest, pending);
            bufpos = buf + pending;
            s->last_letter = NULL;
        }
    }
    return 0;
}

/**
 * Runs the optional backward pass followed by the forward pass, writing the
 * recognized text to outfd.  Errors are reported on stderr.
 */
static void do_textextract(int outfd)
{
    struct scores s;
    /* lseek() yields -1 on a non-seekable stdin, i.e. "offset unknown". */
    const off_t start = lseek(0, 0, SEEK_CUR);

    scores_init(&s, start);

    if (direction == DIR_REVERSE) {
        if (read_backward(&s, outfd) != 0)
            return;
    }

    direction = DIR_FORWARD;
    /* The backward pass decrements s.offset while the file position ends up
     * back at `start`, so the forward pass must restart from `start`. */
    scores_init(&s, start);
    if (read_forward(&s, outfd) == 0 && s.last_letter) {
        if (write_all(outfd, buf, (size_t)(s.last_letter+1 - buf)) < 0)
            perror("Write error");
    }
}

int main(int argc, char **argv)
{
    /* TODO:
     * "stopstring" option for reverse operation, e.g. "#!/"
     * option for print-debug-info
     * option to add a different ruleset?
     */
    int c, outfd;

    if (argc < 2 || strcmp(argv[1], "--help") == 0) {
        usage();
        return 1;
    }

    while ((c = getopt(argc, argv, "M:s:b:l:r:")) >= 0) {
        switch (c) {
        case 'M':
            max_bytes = atol_calc(optarg);
            break;

        case 's':
            max_score = atoi(optarg);
            break;

        case 'b':
            block_score = atoi(optarg);
            break;

        case 'l':
            max_line = atoi(optarg);
            break;

        case 'r':
            max_reverse = atoi(optarg);
            if (max_reverse <= 0) {
                fprintf(stderr, "Invalid argument to -r\n");
                return 1;
            }
            direction = DIR_REVERSE;
            break;

        default:
            fprintf(stderr, "Error parsing options.\n");
            usage();
            return 1;
        }
    }

    /* Options without an output operand leave argv[optind] == NULL. */
    if (optind >= argc) {
        fprintf(stderr, "Error parsing options.\n");
        usage();
        return 1;
    }

    if (strcmp(argv[optind], "-") == 0) {
        outfd = 1;
    } else if ((outfd =
                open(argv[optind], O_WRONLY|O_CREAT|O_TRUNC, 0666)) < 0) {

        fprintf(stderr, "textextract: opening %s: %s\n",
                argv[optind], strerror(errno));
        return 1;
    }

    /* The backward pass reads max_reverse bytes in one go. */
    if (bufsize < (size_t)max_reverse)
        bufsize = max_reverse;

    buf = malloc(bufsize);
    if (!buf) {
        fprintf(stderr, "Failed to allocate %zu bytes of memory\n", bufsize);
        return 1;
    }

    make_luts();

    do_textextract(outfd);

    close(outfd);
    free(buf);
    return 0;
}