#include #include #include /* * S.O.F.A. = Small_Or / Fast_And * Routines to manipulate compressed sparse bit-arrays. * * Copyright (c) 2008 James Dow Allen * This code is made available, without warrantee, * under General License subject to the provision that * it not be used in a product to be sold for profit. * Contact the author (jamesdowallen at gmail.com) if * you need a stronger License. * * Compile with * cc -O -c sofa.c * Or, to run some test routines, * cc -O -DTESTING -o sofatest sofa.c -lm ; sofatest * * Comments and Caveats: * 1) This codec is inappropriate unless data really * is sparse, say 8% 1 bits or less. * 2) Module does no file access; implicit assumption * is that core memory has room for at least * one uncompressed bit array. * 3) Size of uncompressed bit-arrays must be multiple of 8 * (i.e., whole number of bytes). (The codec * postulates a 9th bit (always 1) in the very * last byte, but this is transparent to you.) * 4) The compbits struct contains redundant and unused * fields: redesign to suit. * 5) nxt_run in compbits struct isn't really part of that * "object", but rather of a processing object. * Strictly we might need, instead: * struct compbits_processor { * struct compbits *p; * size_t nxt_run; * }; * Make such a change *IF* your system will have two (or more) * processes operating on the same compbits struct * *concurrently*. But that can't happen in this module, * as written, since each process (compress, bits_and, etc.) * runs to completion. * 6) It is convenient to do (at least some of) * memory allocation for compressed data in * these low-level routines, and we do so, * making the interface trivial; e.g. * struct foo *p1 = 0, *p2 = 0, *p3 = 0; * * bits_comp(&p1, (char *)rawdata1, leng); * bits_comp(&p2, (char *)rawdata2, leng); * bits_and(p1, p2, &p3); * bits_deco(p3, (char *)rawdata3, leng); * with *NO* other knowledge needed of internals! * But you may prefer to manage your allocations; * in that case, start by undefining SOFA_ALLOC : */ #define SOFA_ALLOC struct compbits { size_t unc_size; /* uncompressed size in bytes */ size_t com_size; /* compressed size in bytes */ size_t com_alloc; /* bytes allocated for compressed data */ size_t nxt_run; /* index of next run */ u_char com_data[4]; /* "4" is actually (p->com_alloc) */ }; /* Here are definitions of (non-static) functions */ /* Compress, Decmpress, AND bits, OR bits, Allocate_or_resize, Free */ void bits_comp(struct compbits **out, u_char *in, size_t leng); void bits_deco(struct compbits *p, u_char *out, size_t leng); void bits_and(struct compbits *p1, struct compbits *p2, struct compbits **out); void bits_or(struct compbits *p1, struct compbits *p2, struct compbits **out); struct compbits *realloc_combits(struct compbits *in, size_t numb); void free_combits(struct compbits *p); /* Shouldn't need to fiddle with these definitions: */ /* Largest number supported is > (255-GIPAD)*16.7 million */ #define GIPAD 100 /* Largest number that fits in 1 byte */ #define MAX_1 150 /* Largest number that fits in 2 bytes */ #define MAX_2 ((252 - MAX_1) * 256 + 255 + MAX_1 + 1) /* Largest number that fits in 3 bytes */ #define MAX_3 (255 * 256 + 255 + MAX_2 + 1) /* Largest number that fits in 4 bytes */ #define MAX_4 (GIPAD * 256L*256 + 257*255 + MAX_3 + 1) static void putexp(struct compbits **pp, long x) { struct compbits *p = *pp; size_t ix = p->nxt_run; /* Before putting we insist 6 bytes be available */ if (ix + 5 >= p->com_alloc) { #ifdef SOFA_ALLOC p = realloc_combits(p, /* Next is arbitrary growth heuristic */ p->com_alloc + 10 + (p->com_alloc >> 1)); *pp = p; #else fprintf(stderr, "Compbits at %p allocated %uld", (void *)p, (unsigned long)(p->com_alloc)); fprintf(stderr, " ... not enough\n"); exit(1); #endif } #define PUTT(y) p->com_data[ix++] = (y) /* Can eliminate (coalesce) nine of PUTT's following! * ... by using goto's (and tougher +='s) :-( */ if (x <= MAX_1) { PUTT(x); } else if (x <= MAX_2) { x -= MAX_1 + 1; PUTT((x >> 8) + MAX_1 + 1); PUTT(x); } else if (x <= MAX_3) { x -= MAX_2 + 1; PUTT(254); PUTT(x >> 8); PUTT(x); } else if (x <= MAX_4) { x -= MAX_3 + 1; PUTT(255); PUTT(x >> 16); PUTT(x >> 8); PUTT(x); } else { x -= MAX_3 + 1; PUTT(255); PUTT((x >> 24) + GIPAD); PUTT(x >> 16); PUTT(x >> 8); PUTT(x); } p->nxt_run = ix; return; } static long getexp(struct compbits *p) { size_t leng, ix = p->nxt_run; if (ix >= p->com_size) return -1; leng = p->com_data[ix++]; if (leng <= MAX_1) { /* 1-byte code */ } else if (leng < 254) { /* 2-byte code */ leng -= MAX_1 + 1; leng <<= 8; leng += p->com_data[ix++] + MAX_1 + 1; } else if (leng == 254) { /* 3-byte code */ leng = p->com_data[ix++]; leng <<= 8; leng += p->com_data[ix++] + MAX_2 + 1; } else if ((leng = p->com_data[ix++]) < GIPAD) { /* 4-byte code */ leng <<= 8; leng += p->com_data[ix++]; leng <<= 8; leng += p->com_data[ix++] + MAX_3 + 1; } else { /* 5-byte code */ leng -= GIPAD; leng <<= 8; leng += p->com_data[ix++]; leng <<= 8; leng += p->com_data[ix++]; leng <<= 8; leng += p->com_data[ix++] + MAX_3 + 1; } p->nxt_run = ix; return leng; } /* Compress in[0] ... in[leng-1] and place in **out */ void bits_comp(struct compbits **out, u_char *in, size_t leng) { size_t ix; long runleng; u_char thisd; int tv; #ifdef SOFA_ALLOC if (*out == 0) *out = realloc_combits(0, leng / 20); #endif (*out)->nxt_run = 0; (*out)->unc_size = leng; for (runleng = ix = 0; ix < leng; ix++) { if ((thisd = *in++) == 0) { runleng += 8; continue; } for (tv = 0x80; tv; tv >>= 1) { if (tv & thisd) { putexp(out, runleng); runleng = 0; } else { runleng += 1; } } } putexp(out, runleng); (*out)->com_size = (*out)->nxt_run; #ifdef SOFA_ALLOC (*out) = realloc_combits(*out, (*out)->com_size); #endif } /* Decompress *p and place in out[0] ... out[leng-1] */ void bits_deco(struct compbits *p, u_char *out, size_t leng) { size_t bytep; long thisx, ix = -1; memset(out, 0, leng); p->nxt_run = 0; while ((thisx = getexp(p)) >= 0) { ix += 1 + thisx; bytep = ix / 8; if (bytep >= leng) { if (bytep > leng) break; return; } out[bytep] |= 0x80 >> ix % 8; } fprintf(stderr, "Compbits at %p inconsistent\n", (void *)p); exit(1); } /* Set **out := *p1 AND *p2 */ void bits_and(struct compbits *p1, struct compbits *p2, struct compbits **out) { long pos1, pos2, pos3, totnumbit; if (p1 == p2 || p1 == (*out) || p2 == (*out)) { /* Next is because single-usage of compbits * is dictated by p->nxt_run implementation. * See comment above re need for processing object. */ fprintf(stderr, "Inputs to bits_and must be distinct\n"); exit(1); } if (p1->unc_size != p2->unc_size) { fprintf(stderr, "Inputs to bits_and must have same size\n"); exit(1); } #ifdef SOFA_ALLOC if (*out == 0) *out = realloc_combits(0, p1->com_alloc); #endif (*out)->unc_size = p1->unc_size; totnumbit = p1->unc_size * 8; p1->nxt_run = 0; p2->nxt_run = 0; (*out)->nxt_run = 0; pos1 = getexp(p1); pos2 = getexp(p2); pos3 = 0; while (1) { while (pos1 < pos2) pos1 += getexp(p1) + 1; while (pos2 < pos1) pos2 += getexp(p2) + 1; if (pos1 == pos2) { putexp(out, pos1 - pos3); if (pos1 >= totnumbit) break; pos3 = pos1 + 1; pos1 += getexp(p1) + 1; pos2 += getexp(p2) + 1; } } (*out)->com_size = (*out)->nxt_run; #ifdef SOFA_ALLOC (*out) = realloc_combits(*out, (*out)->com_size); #endif } /* Set **out := *p1 OR *p2 */ void bits_or(struct compbits *p1, struct compbits *p2, struct compbits **out) { long pos1, pos2, pos3, totnumbit; if (p1 == p2 || p1 == (*out) || p2 == (*out)) { /* Next is because single-usage of compbits * is dictated by p->nxt_run implementation. * Sorry if this is a problem. */ fprintf(stderr, "Inputs to bits_or must be distinct\n"); exit(1); } if (p1->unc_size != p2->unc_size) { fprintf(stderr, "Inputs to bits_or must have same size\n"); exit(1); } #ifdef SOFA_ALLOC if (*out == 0) *out = realloc_combits(0, p1->com_alloc + p2->com_alloc); #endif (*out)->unc_size = p1->unc_size; totnumbit = p1->unc_size * 8; p1->nxt_run = 0; p2->nxt_run = 0; (*out)->nxt_run = 0; pos1 = getexp(p1); pos2 = getexp(p2); pos3 = 0; while (1) { while (pos1 < pos2) { putexp(out, pos1 - pos3); pos3 = pos1 + 1; pos1 += getexp(p1) + 1; } while (pos2 < pos1) { putexp(out, pos2 - pos3); pos3 = pos2 + 1; pos2 += getexp(p2) + 1; } if (pos1 == pos2) { putexp(out, pos1 - pos3); if (pos1 >= totnumbit) break; pos3 = pos1 + 1; pos1 += getexp(p1) + 1; pos2 += getexp(p2) + 1; } } (*out)->com_size = (*out)->nxt_run; #ifdef SOFA_ALLOC (*out) = realloc_combits(*out, (*out)->com_size); #endif } /* Allocate or Resize compbits struct */ struct compbits *realloc_combits(struct compbits *in, size_t numb) { struct compbits *p; #if 0 fprintf(stderr, "realloc_combits (%ld): Called\n", (long)numb); #endif p = realloc(in, sizeof (struct compbits) + numb - 4); if (p == 0) { free(in); fprintf(stderr, "realloc_combits (%ld): Allocation failed\n", (long)numb); exit(1); } p->com_alloc = numb; return p; } /* Next is trivial cover for aesthetics */ void free_combits(struct compbits *p) { free(p); } #ifdef TESTING #include static void show_raw(u_char *in, size_t leng) { fprintf(stderr, "Raw: "); while (leng--) fprintf(stderr, "%02x", *in++); fprintf(stderr, "\n"); } static void show_comp(struct compbits *p) { size_t i; fprintf(stderr, "Compressed: (%d %d)", p->com_alloc, p->com_size); for (i = 0; i < p->com_size; i++) fprintf(stderr, " %d", p->com_data[i]); fprintf(stderr, "\n"); } /* Put putexp/getexp through their paces */ void test1() { u_long siz, tsiz; struct compbits *cp; for (siz = 0; siz < (255-GIPAD)*(u_long)16700000; siz += 19 + siz/2) { cp = realloc_combits(0, 1); cp->nxt_run = 0; putexp(&cp, siz); putexp(&cp, 19); putexp(&cp, 19); cp->com_size = cp->nxt_run; cp->nxt_run = 0; tsiz = getexp(cp); if (tsiz != siz) { fprintf(stderr, "Coded %ld Decoded %ld\n", siz, tsiz); cp->com_size -= 2; show_comp(cp); } free_combits(cp); } } #define TBSIZE 2000000 static u_char Buff1[TBSIZE], Buff2[TBSIZE], Buff3[TBSIZE]; /* Return a 1 with probability p */ static int rbit(double p) { return (((random() & 0xffffff) + 0.5) / 0x1000000) < p; } static void randfill(u_char *bufp, size_t siz, double prob) { size_t i, j; for (i = 0; i < siz; i++) { bufp[i] = 0; for (j = 0; j < 8; j++) if (rbit(prob)) bufp[i] |= 0x80 >> j; } } static void bufcompare(u_char *bp1, u_char *bp2, size_t siz, char *msg) { while (siz--) if (*bp1++ != *bp2++) { fprintf(stderr, "Miscompare during %s\n", msg); exit(1); } } void test2() { int i; struct compbits *c1p, *c2p, *c3p; c1p = realloc_combits(0, 1); c2p = realloc_combits(0, 1); c3p = realloc_combits(0, 1); randfill(Buff1, TBSIZE, 0.03); randfill(Buff2, TBSIZE, 0.02); bits_comp(&c1p, Buff1, TBSIZE); bits_deco(c1p, Buff3, TBSIZE); /* show_raw(Buff1, 20); */ /* show_raw(Buff3, 20); */ bufcompare(Buff1, Buff3, TBSIZE, "Compress-Decompress"); /* show_raw(Buff2, 20); */ bits_comp(&c2p, Buff2, TBSIZE); bits_and(c1p, c2p, &c3p); bits_deco(c3p, Buff3, TBSIZE); /* show_raw(Buff3, 20); */ for (i = 0; i < TBSIZE; i++) Buff1[i] &= Buff2[i]; bufcompare(Buff1, Buff3, TBSIZE, "And_bits"); randfill(Buff1, TBSIZE, 0.01); bits_comp(&c1p, Buff1, TBSIZE); bits_or(c1p, c2p, &c3p); bits_deco(c3p, Buff3, TBSIZE); for (i = 0; i < TBSIZE; i++) Buff1[i] |= Buff2[i]; bufcompare(Buff1, Buff3, TBSIZE, "Or_bits"); free_combits(c1p); free_combits(c2p); free_combits(c3p); } /* show code rates */ void test3() { double p; struct compbits *cp; for (p = 0.1; p > 0.0000002; p /= 2.154434691) { cp = realloc_combits(0, TBSIZE / 10); randfill(Buff1, TBSIZE, p); bits_comp(&cp, Buff1, TBSIZE); fprintf(stderr, " %9.7f 1's --- Bitrate %f (Optimal %f)\n", p, cp->com_size / (double)TBSIZE, (-p * log(p) - (1-p) * log(1-p)) / log(2.0)); free_combits(cp); } } int main(int argc, char **argv) { test1(); test2(); test3(); exit(0); } #endif