/**********************************************************************
 * Rick,
 *
 * I made some more changes to fix another portability problem. It seems
 * that SOME compilers will pad a structure to a DWORD boundary when you
 * use the sizeof operator. In particular, for the Solaris compiler, the
 * 78 byte tDocHeader structure is reported as having 80 bytes. This shifts
 * EVERYTHING by two bytes and wreaks havoc in the generated .prc file.
 * I fixed this (look at the comments in struct tDocHeader and the DOCHEADSZ
 * definition) in the two places it occurred.
 *
 * I also fixed a spelling error in an error message.
 *
 * I also changed the usage message to say this is version 0.7a (rather than
 * 0.6).
 *
 * I also changed the return type of main() to be int and added various
 * calls to exit() as needed. Needed for portability and correctness.
 *
 * -- Harold Bamford
 **********************************************************************/

// MakeDoc
// version 0.7a
//
// Compresses text files into a format that is ready to export to a Pilot
// and work with Rick Bram's PilotDOC reader.
//
// Freeware
//
// ver 0.6   enforce 31 char limit on database names
// ver 0.7   change header and record0 to structs
// ver 0.7a  minor misspellings and portability issues

#ifdef sparc
#  ifndef UNIX
#    define UNIX 1
#  endif
#endif

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//template<class A> A max(const A& a, const A& b) {return (a<b) ? b : a;}
// Arguments are fully parenthesised so that operands containing low-precedence
// operators (e.g. max(x & y, z)) are compared and returned correctly.
// Like any function-style macro, it still evaluates one argument twice.
#define max(a,b) (((a) > (b)) ? (a) : (b))

// Fixed-width types: the on-disk structures below require WORD to be exactly
// 2 bytes and DWORD exactly 4 bytes.  "unsigned long" is 8 bytes on LP64
// platforms, which would silently corrupt the file layout.
typedef unsigned char byte;
typedef uint32_t      DWORD;
typedef uint16_t      WORD;

#define DISP_BITS  11   // bits used for the back-reference distance
#define COUNT_BITS 3    // bits used for the back-reference length

// all numbers in these structs are big-endian, MAC format
// (the trailing number on each field is the offset of the END of that field)
struct tDocHeader {
    char  sName[32];        // 32 bytes
    DWORD dwUnknown1;       // 36
    DWORD dwTime1;          // 40
    DWORD dwTime2;          // 44
    DWORD dwTime3;          // 48
    DWORD dwLastSync;       // 52
    DWORD ofsSort;          // 56
    DWORD ofsCatagories;    // 60
    DWORD dwCreator;        // 64
    DWORD dwType;           // 68
    DWORD dwUnknown2;       // 72
    DWORD dwUnknown3;       // 76
    WORD  wNumRecs;         // 78
};

// Some compilers pad structures out to DWORD boundaries so using sizeof()
// doesn't give the intended result.
#define DOCHEADSZ 78

struct tDocRecord0 {
    WORD  wVersion;     // 1=plain text, 2=compressed
    WORD  wSpare;
    DWORD dwStoryLen;   // in bytes, when decompressed
    WORD  wNumRecs;     // text records only; equals tDocHeader.wNumRecs-1
    WORD  wRecSize;     // usually 0x1000
    DWORD dwSpare2;
};

// Compile-time layout checks (negative array size == compile error).  The
// header is written with DOCHEADSZ bytes straight out of the struct, so the
// last field must end exactly there; record 0 must occupy 16 bytes.
typedef char tDocHeaderLayoutCheck[
    (offsetof(tDocHeader, wNumRecs) + sizeof(WORD) == DOCHEADSZ) ? 1 : -1];
typedef char tDocRecord0LayoutCheck[(sizeof(tDocRecord0) == 16) ? 1 : -1];

////////////// utilities //////////////////////////////////////

// Reverse the two bytes of a 16-bit value (little-endian host <-> big-endian file).
WORD SwapWord21(WORD r)
{
    return (WORD)(((r >> 8) & 0xFF) | ((r & 0xFF) << 8));
}

// Identity: the host already stores words in file (big-endian) order.
WORD SwapWord12(WORD r)
{
    return r;
}

// Reverse the four bytes of a 32-bit value.  Every term is masked so that
// no bits can leak above bit 31 regardless of how wide the arithmetic is.
DWORD SwapLong4321(DWORD r)
{
    return (DWORD)(((r >> 24) & 0x000000FFUL) |
                   ((r >>  8) & 0x0000FF00UL) |
                   ((r <<  8) & 0x00FF0000UL) |
                   ((r << 24) & 0xFF000000UL));
}

// Identity: the host already stores longs in file (big-endian) order.
DWORD SwapLong1234(DWORD r)
{
    return r;
}

// Selected at run time by SwapChoose(); NULL until then.
WORD  (*SwapWord)(WORD r)  = NULL;
DWORD (*SwapLong)(DWORD r) = NULL;

// copy bytes into a word and double word and see how they fall,
// then choose the appropriate swappers to make things come out
// in the right order.
int SwapChoose() { union { char b[2]; WORD w; } w; union { char b[4]; DWORD d; } d; strncpy(w.b, "\1\2", 2); strncpy(d.b, "\1\2\3\4", 4); if (w.w == 0x0201) SwapWord = SwapWord21; else if (w.w == 0x0102) SwapWord = SwapWord12; else return 0; if (d.d == 0x04030201) SwapLong = SwapLong4321; else if (d.d == 0x01020304) SwapLong = SwapLong1234; else return 0; return 1; } // replacement for strstr() which deals with 0's in the data byte* memfind(byte* t, int t_len, byte* m, int m_len) { int i; for (i = t_len - m_len + 1 ; i>0; i--, t++) if (t[0]==m[0] && memcmp(t,m,m_len)==0) return t; return 0; } ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// ///////////////////// ////////////////////// ///////////////////// tBuf class ////////////////////// ///////////////////// ////////////////////// ///////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////// struct tBuf { byte* buf; unsigned len; tBuf() {buf = new byte[len = 6000];}; ~tBuf() { if (buf) delete[] buf; } unsigned Len() const { return len; } unsigned RemoveBinary(); unsigned Decompress(); unsigned Compress(); unsigned Issue(byte src, int& bSpace); unsigned DuplicateCR(); void Clear() {delete[] buf; buf = new byte[len = 6000]; } void Dump() {printf("\nbuffer len=%d",len);} }; // // Issue() // // action: handle the details of writing a single // character to the compressed stream // unsigned tBuf::Issue(byte src, int& bSpace) { int iDest = len; byte* dest = buf; // if there is an outstanding space char, see if // we can squeeze it in with an ASCII char if (bSpace) { if (src>=0x40 && src<=0x7F) dest[iDest++] = src ^ 0x80; else { // couldn't squeeze it in, so issue the space char by itself // most chars go out simple, except the range 1...8,0x80...0xFF dest[iDest++] = ' '; if (src<0x80 && (src==0 || src>8) ) dest[iDest++] 
= src; else dest[iDest++] = 1, dest[iDest++] = src; } // knock down the space flag bSpace = 0; } else { // check for a space char if (src==' ') bSpace = 1; else { if (src<0x80 && (src==0 || src>8)) dest[iDest++] = src; else dest[iDest++] = 1, dest[iDest++] = src; } } len = iDest; return iDest; } // // Compress // // params: none // // action: takes the given buffer, // and compresses // the original data down into a second buffer // // comment: This version make heavy use of walking pointers. // unsigned tBuf::Compress() { int i,j; int bSpace = 0; // run through the input buffer byte* pBuffer; // points to the input buffer byte* pHit; // points to a walking test hit; works upwards on successive matches byte* pPrevHit; // previous value of pHit byte* pTestHead; // current test string byte* pTestTail; // current walking pointer; one past the current test buffer byte* pEnd; // 1 past the end of the input buffer pHit = pPrevHit = pTestHead = pBuffer = buf; pTestTail = pTestHead+1; pEnd = buf + len; //printf("pointers %x %x",pTestTail, pEnd); //printf("\nstart compression buf len=%d",len); // make a dest buffer and reassign the local buffer buf = new byte[6000]; len = 0; // used to walk through the output buffer // loop, absorbing one more char from the input buffer on each pass for (; pTestHead != pEnd; pTestTail++) { //printf("\npointers pTestHead %x pTestTail %x pTestHead[]=%x %x",pTestHead, pTestTail, pTestHead[0], pTestHead[1]); // establish where the scan can begin if (pTestHead - pPrevHit > ((1<<DISP_BITS)-1)) pPrevHit = pTestHead - ((1<<DISP_BITS)-1); // scan in the previous data for a match pHit = memfind(pPrevHit, pTestTail - pPrevHit, pTestHead, pTestTail - pTestHead); if (pHit==0) printf("!! 
bug source %x%x%x, dest %x%x%x, %d bytes", pPrevHit[0], pPrevHit[1],pPrevHit[2],pTestHead[0], pTestHead[1], pTestHead[2], pTestTail-pTestHead); // on a mismatch or end of buffer, issued codes if (pHit==0 || pHit==pTestHead || pTestTail-pTestHead>(1<<COUNT_BITS)+2 || pTestTail==pEnd) { // issued the codes // first, check for short runs if (pTestTail-pTestHead < 4) { //printf("\nissue a char %x",pTestHead[0]); Issue(pTestHead[0], bSpace); pTestHead++; } // for longer runs, issue a run-code else { // issue space char if required if (bSpace) buf[len++] = ' ', bSpace = 0; unsigned int dist = pTestHead - pPrevHit; unsigned int compound = (dist << COUNT_BITS) + pTestTail-pTestHead - 4; if (dist>=(1<<DISP_BITS)) printf("\n!! error dist overflow"); if (pTestTail-pTestHead-4>7) printf("\n!! error dist overflow"); buf[len++] = 0x80 + (compound>>8); buf[len++] = compound & 0xFF; //printf("\nissuing code for sequence len %d <%c%c%c>",pTestTail-pTestHead-1,pTestHead[0],pTestHead[1],pTestHead[2]); //printf("\n <%x%x>",pOut[-2],pOut[-1]); // and start again pTestHead = pTestTail-1; } // start the search again pPrevHit = pBuffer; } // got a match else { pPrevHit = pHit; } //printf("pointers %x %x %x",pTestHead, pTestTail, pPrevHit); // when we get to the end of the buffer, don't inc past the end // this forces the residue chars out one at a time if (pTestTail==pEnd) pTestTail--; } // clean up any dangling spaces if (bSpace) buf[len++] = ' '; // final scan to merge consecutive high chars together int k; for (i=k=0; i<len; i++,k++) { buf[k] = buf[i]; // skip the run-length codes if (buf[k]>=0x80 && buf[k]<0xC0) buf[++k] = buf[++i]; // if we hit a high char marker, look ahead for another else if (buf[k]==1) { buf[k+1] = buf[i+1]; while (i+2<len && buf[i+2]==1 && buf[k]<8) { buf[k]++; buf[k+buf[k]] = buf[i+3]; i+=2; } k += buf[k]; i++; } } // delete original buffer delete[] pBuffer; len = k; return k; } /* Decompress params: none action: make a new buffer run through the source data 
check the 4 cases: 0,9...7F represent self 1...8 escape n chars 80...bf reference earlier run c0...ff space+ASCII */ unsigned tBuf::Decompress() { // we "know" that all decompresses fit within 4096, right? byte* pOut = new byte[6000]; byte* in_buf = buf; byte* out_buf = pOut; int i,j; for (j=i=0; j<len; ) { unsigned int c; // take a char from the input buffer c = in_buf[j++]; // separate the char into zones: 0, 1...8, 9...0x7F, 0x80...0xBF, 0xC0...0xFF // codes 1...8 mean copy that many bytes; for accented chars & binary if (c>0 && c<9) while(c--) out_buf[i++] = in_buf[j++]; // codes 0, 9...0x7F represent themselves else if (c<0x80) out_buf[i++] = c; // codes 0xC0...0xFF represent "space + ascii char" else if (c>=0xC0) out_buf[i++] = ' ', out_buf[i++] = c ^ 0x80; // codes 0x80...0xBf represent sequences else { int m,n; c <<= 8; c += in_buf[j++]; m = (c & 0x3FFF) >> COUNT_BITS; n = c & ((1<<COUNT_BITS) - 1); n += 3; while (n--) { out_buf[i] = out_buf[i-m]; i++; } } } delete[] buf; buf = pOut; len = i; return i; } unsigned tBuf::DuplicateCR() { byte* pBuf = new byte[2*len]; int k,j; for (j=k=0; j<len; j++, k++) { pBuf[k] = buf[j]; if (pBuf[k]==0x0A) pBuf[k++] = 0x0D, pBuf[k] = 0x0A; } delete[] buf; buf = pBuf; len = k; return k; } void Decomp(char* src, char* dest, int bBinary) { FILE* fin; FILE* fout; fin = fopen(src,"rb"); if (fin==0) { printf("problem opening source file %s", src); exit(2); } // just holds the first few bytes of the file byte buf[0x100]; tDocHeader head; fread(&head, 1, DOCHEADSZ, fin); if (strncmp((char *)&head.dwType, "REAd", 4) != 0 || strncmp((char *)&head.dwCreator, "TEXt", 4) != 0) { //printf("file contains %.4s, %.4s", (char *)&head.dwCreator, (char *)&head.dwType); printf(".prc file is not the correct format"); exit(3); } WORD bCompressed; DWORD dwPos; tDocRecord0 rec0; // point to start of index fseek(fin, 0x4E, SEEK_SET); // read the location of the first record fread(&dwPos, 4, 1, fin); dwPos = SwapLong(dwPos); fseek(fin, dwPos, 
SEEK_SET); fread(&rec0, sizeof(rec0), 1, fin); bCompressed = SwapWord(rec0.wVersion); if (bCompressed!=1 && bCompressed!=2) printf("\nWARNING: unknown file compression type:%d",bCompressed); bCompressed--; fout = fopen(dest,"wb"); if (fout==0) { printf("problem opening output file %s",dest); exit(2); } DWORD dwLen; fseek(fin,0,SEEK_END); dwLen = ftell(fin); WORD nRecs; nRecs = SwapWord(head.wNumRecs) - 1; // this is the main record buffer // it knows how to stretch to accomodate the decompress tBuf t; DWORD dwRecLen; for (int i=0; i<nRecs; i++) { // read the record offset fseek(fin, 0x56 + 8*i, SEEK_SET); fread(&dwPos, 4, 1, fin); dwPos = SwapLong(dwPos); // read start of next record fseek(fin, 0x5E + 8*i, SEEK_SET); fread(&dwRecLen, 4, 1, fin); dwRecLen = SwapLong(dwRecLen); // for the last, use the file len if (i==nRecs-1) dwRecLen = dwLen; dwRecLen -= dwPos; fseek(fin,dwPos,SEEK_SET); int n = fread(t.buf, 1, dwRecLen, fin); t.len = n; if(bCompressed) t.Decompress(); // check for CR insert if (!bBinary) t.DuplicateCR(); printf("\rreconverting %s: record %d of %d",head.sName,i,nRecs); fwrite(t.buf, 1, t.Len(), fout); } fclose(fin); fclose(fout); } // this nasty little beast removes really low ASCII and 0's // and handles the CR problem // // if a cr appears before a lf, then remove the cr // if a cr appears in isolation, change to a lf unsigned tBuf::RemoveBinary() { byte* in_buf = buf; byte* out_buf = new byte[len]; int k,j; for (j=k=0; j<len; j++,k++) { // copy each byte out_buf[k] = in_buf[j]; // throw away really low ASCII if ((out_buf[k]>=0 && out_buf[k]<9)) k--; // for CR if (out_buf[k]==0x0D) { // if next is LF, then drop it if (j<len-1 && in_buf[j+1]==0x0A) k--; else // turn it into a LF out_buf[k] = 0x0A; } } delete[] buf; buf = out_buf; len = k; return k; } void out_word(short w, FILE* fout) { short m = SwapWord(w); fwrite(&m,2,1,fout); } void out_long(long d, FILE* fout) { long d1 = SwapLong(d); fwrite(&d1,4,1,fout); } int main(int argc, char** argv) { 
printf("MakeDoc ver 0.7a\n"); if (argc<4) { printf("\nsyntax makedoc [-n] [-b] <text-file> <prc-file> <story-name>"); printf("\n convert text files to .PRC format"); printf("\n makedoc -d [-b] <prc-file> <text-file>"); printf("\n decodes the PRC back into the txt file"); printf("\n -n builds the .prc file without compression"); printf("\n -b option compresses/decompresses binary"); #if UNIX printf("\n"); #endif exit(1); } int iArg = 1; int bDecomp = 0; int bBinary = 0; int bReport = 0; int bCompress = 1; if ( ! SwapChoose()) { printf("\nfailed to select proper byte swapping algorithm"); #if UNIX printf("\n"); #endif exit(1); } while (argv[iArg][0]=='-' || argv[iArg][0]=='\\') { if (argv[iArg][1]=='d') bDecomp = 1; if (argv[iArg][1]=='b') bBinary = 1; if (argv[iArg][1]=='r') bReport = 1; if (argv[iArg][1]=='n') bCompress = 0; iArg++; } if (bDecomp) Decomp(argv[iArg], argv[iArg+1], bBinary); else { FILE* fin; FILE* fout; tDocHeader head1; fin = fopen(argv[iArg],"rb"); fout = fopen(argv[iArg+1],"wb"); if (fin==0 || fout==0) { printf("problem opening files"); exit(2); } fseek(fin,0,SEEK_END); DWORD storySize = ftell(fin); fseek(fin,0,SEEK_SET); DWORD x; WORD w; long recSize = 4096; DWORD z,numRecs; sprintf(head1.sName,"%.31s",argv[iArg+2]); head1.sName[31] = 0; printf("saving to %s as <%s>,%s%s compressed",argv[iArg+1],argv[iArg+2], bBinary ? " binary mode," : "", bCompress ? 
"" : " not"); /*LocalWrite just writes to the new file the number of bytes starting at the passed pointer*/ head1.dwUnknown1 = 0; strncpy((char *)&head1.dwTime1, "\x06\xD1\x44\xAE", 4); strncpy((char *)&head1.dwTime2, "\x06\xD1\x44\xAE", 4); head1.dwTime3 = 0; head1.dwLastSync = 0; head1.ofsSort = 0; head1.ofsCatagories = 0; strncpy((char *)&head1.dwCreator, "TEXt", 4); // database creator strncpy((char *)&head1.dwType, "REAd", 4); // database type head1.dwUnknown2 = 0; head1.dwUnknown3 = 0; z = (int) (storySize/(long) recSize); if (((long) z * recSize) < storySize) z ++; numRecs = z; z ++; head1.wNumRecs = SwapWord(z); // the number of records to follow fwrite(&head1,1,DOCHEADSZ,fout); unsigned long index; index = 0x406F8000; // the pattern for attributes=dirty + unique_id=0x6f8000 x = 0x50L + (long) z * 8; out_long(x,fout); // start writing the record offsets out_long(index,fout); x += 0x0010L; index++; z--; while(z--) { out_long(x,fout); //more record offsets out_long(index++,fout); // the attributes + ID's x += 0x1000L; } // one more word..... out_word(0,fout); tDocRecord0 rec0; rec0.wVersion = SwapWord(bCompress ? 2 : 1); rec0.wSpare = 0; rec0.dwStoryLen = SwapLong(storySize); rec0.wNumRecs = SwapWord(SwapWord(head1.wNumRecs) - 1); rec0.wRecSize = SwapWord(recSize); rec0.dwSpare2 = 0; fwrite(&rec0,1,sizeof(rec0),fout); int n = recSize; // dump the whole story into the new file int recNum = 0; printf("\n"); tBuf buf; while(recNum < numRecs) { long pos; pos = ftell(fout); fseek(fout, 0x56 + 8*recNum, SEEK_SET); if (recNum!=numRecs) out_long(pos,fout); fseek(fout, pos, SEEK_SET); int nOrg; buf.Clear(); nOrg = n = fread(buf.buf,1,4096,fin); buf.len = n; if (n==0) break; if (!bBinary) buf.RemoveBinary(); if (bCompress) buf.Compress(); n = fwrite(buf.buf,1,buf.Len(),fout); printf("\rconverting record %d of %d",recNum+1,numRecs); if (bReport && n && bCompress) printf("\noriginal %d bytes, compressed to %d bytes, ratio: %f5.1\n", nOrg, n, 100. 
* n / nOrg); recNum++; } fclose(fin); fclose(fout); } exit(0); }