/* * Extraordinarily brute force Lempel & Ziv-like * compressor. The input file must fit in memory * during compression, and the output file will * be reconstructed in memory during decompression. * We search for large common sequences and use a * greedy algorithm to choose which sequence gets * compressed first. * * Files begin with "BLZ\n" and a 32-bit uncompressed file length. * * Output format is a series of blocks followed by * a raw data section. Each block begins with a 32-bit big-endian * number. The top bit is type and the next 31 bits * are uncompressed size. Type is one of * 0 - use raw data for this length * 1 - a 32-bit offset follows * After the blocks come the raw data. (The end of the blocks can be * noted by summing block lengths until you reach the file length.) */ #include #include #include #define malloc sbrk int minrun = 16; int win = 16; ulong outn; int verbose; int mindist; enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */ enum { NOFF = 3 }; Biobuf bout; ulong length; uchar *data; ulong sum32(ulong, void*, long); uchar *odat; int nodat; int nraw; int rawstart; int acct; int zlength; int maxchain; int maxrle[256]; int nnew; typedef struct Node Node; struct Node { Node *link; ulong key; ulong offset[NOFF]; }; Node *nodepool; int nnodepool; Node **hash; uint nhash; uint maxlen; uint maxsame; uint replacesame = 8*1024*1024; Node *freelist, **freeend; uint nalloc; Node* allocnode(void) { int i; Node *n; if(nnodepool == 0){ nnodepool = 256*1024; nodepool = malloc(sizeof(Node)*nnodepool); } if(freelist){ n = freelist; freelist = n->link; return n; } assert(nnodepool > 0); nalloc++; n = &nodepool[--nnodepool]; for(i=0; ioffset[i] = -1; return n; } void freenode(Node *n) { if(freelist == nil) freelist = n; else *freeend = n; freeend = &n->link; n->link = nil; } Node** llookup(ulong key) { uint c; Node **l, **top, *n; if(nhash == 0){ uint x; x = length/8; for(nhash=1; nhashlink){ c++; if((*l)->key == key){ /* move to front */ n = *l; *l = n->link; n->link = *top; *top = n; return top; } } if(c > maxlen) maxlen = c; return l; } Node* lookup(ulong key) { return *llookup(key); } void insertnode(ulong key, ulong offset) { int i; Node *n, **l; l = llookup(key); if(*l == nil){ if(l==&hash[key&(nhash-1)]) nnew++; *l = allocnode(); (*l)->key = key; } n = *l; /* add or replace last */ for(i=0; ioffset[i]!=-1; i++) ; n->offset[i] = offset; } void Bputint(Biobufhdr *b, int n) { uchar p[4]; p[0] = n>>24; p[1] = n>>16; p[2] = n>>8; p[3] = n; Bwrite(b, p, 4); } void flushraw(void) { if(nraw){ if(verbose) fprint(2, "Raw %d+%d\n", rawstart, nraw); zlength += 4+nraw; Bputint(&bout, (1<<31)|nraw); memmove(odat+nodat, data+rawstart, nraw); nodat += nraw; nraw = 0; } } int rawbyte(int i) { assert(acct == i); if(nraw == 0) rawstart = i; acct++; nraw++; return 1; } int refblock(int i, int len, int off) { assert(acct == i); acct += len; if(nraw) flushraw(); if(verbose) fprint(2, "Copy %d+%d from %d\n", i, len, off); Bputint(&bout, len); Bputint(&bout, off); zlength += 4+4; return len; } int cmprun(uchar *a, uchar *b, int len) { int i; if(a==b) return 0; for(i=0; ioffset[o] == -1) break; run = cmprun(data+i, data+n->offset[o], length-i); if(run > maxrun && n->offset[o]+mindist < i){ maxrun = run; maxoff = n->offset[o]; best = o; } } if(best > 0){ o = n->offset[best]; for(j=best; j>0; j--) n->offset[j] = n->offset[j-1]; n->offset[0] = o; } } if(maxrun >= minrun) j = i+refblock(i, maxrun, maxoff); else j = i+rawbyte(i); for(; imaxrle[data[i]]){ maxrle[data[i]] = rle; insertnode(sum, i); } sum = (sum*256+data[i+win]) % Prime; sum = (sum + data[i]*outn) % Prime; } } /* could do better here */ for(; i 0){ data = realloc(data, length+n); if(data == nil) sysfatal("realloc: %r"); memmove(data+length, buf, n); length += n; if(n < sizeof buf) break; } odat = malloc(length); if(odat == nil) sysfatal("malloc: %r"); Binit(&bout, 1, OWRITE); Bprint(&bout, "BLZ\n"); Bputint(&bout, length); outn = 1; for(i=0; i