cmcompress.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. /*
  2. * Copyright (c) 1985, 1986 The Regents of the University of California.
  3. * All rights reserved.
  4. *
  5. * This code is derived from software contributed to Berkeley by
  6. * James A. Woods, derived from original work by Spencer Thomas
  7. * and Joseph Orost.
  8. *
  9. * Redistribution and use in source and binary forms, with or without
  10. * modification, are permitted provided that the following conditions
  11. * are met:
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. * 2. Redistributions in binary form must reproduce the above copyright
  15. * notice, this list of conditions and the following disclaimer in the
  16. * documentation and/or other materials provided with the distribution.
  17. * 3. All advertising materials mentioning features or use of this software
  18. * must display the following acknowledgement:
  19. * This product includes software developed by the University of
  20. * California, Berkeley and its contributors.
  21. * 4. Neither the name of the University nor the names of its contributors
  22. * may be used to endorse or promote products derived from this software
  23. * without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35. * SUCH DAMAGE.
  36. */
  37. #include "cmcompress.h"
  38. #include <errno.h>
  39. #include <string.h>
  40. static const char_type magic_header[] = { "\037\235" }; /* 1F 9D */
  41. /* Defines for third byte of header */
  42. #define BIT_MASK 0x1f
  43. #define BLOCK_MASK 0x80
  44. #define CHECK_GAP 10000 /* ratio check interval */
  45. /* Masks 0x40 and 0x20 are free. I think 0x20 should mean that there is
  46. a fourth header byte (for expansion).
  47. */
  48. #define INIT_BITS 9 /* initial number of bits/code */
  49. #ifdef COMPATIBLE /* But wrong! */
  50. # define MAXCODE(n_bits) (1 << (n_bits) - 1)
  51. #else
  52. # define MAXCODE(n_bits) ((1 << (n_bits)) - 1)
  53. #endif /* COMPATIBLE */
  54. #define htabof(i) cdata->htab[i]
  55. #define codetabof(i) cdata->codetab[i]
  56. /*
  57. * the next two codes should not be changed lightly, as they must not
  58. * lie within the contiguous general code space.
  59. */
  60. #define FIRST 257 /* first free entry */
  61. #define CLEAR 256 /* table clear output code */
  62. #ifdef DEBUG
  63. static void prratio( FILE *stream, long int num, long int den);
  64. #endif
  65. int cmcompress_compress_initialize(struct cmcompress_stream* cdata)
  66. {
  67. cdata->maxbits = BITS; /* user settable max # bits/code */
  68. cdata->maxmaxcode = 1 << BITS; /* should NEVER generate this code */
  69. cdata->hsize = HSIZE; /* for dynamic table sizing */
  70. cdata->free_ent = 0; /* first unused entry */
  71. cdata->nomagic = 0; /* Use a 3-byte magic number header, unless old file */
  72. cdata->block_compress = BLOCK_MASK;
  73. cdata->clear_flg = 0;
  74. cdata->ratio = 0;
  75. cdata->checkpoint = CHECK_GAP;
  76. cdata->input_stream = 0;
  77. cdata->output_stream = 0;
  78. cdata->client_data = 0;
  79. return 1;
  80. }
  81. static void cl_hash(struct cmcompress_stream* cdata, count_int hsize) /* reset code table */
  82. {
  83. register count_int *htab_p = cdata->htab+hsize;
  84. register long i;
  85. register long m1 = -1;
  86. i = hsize - 16;
  87. do
  88. { /* might use Sys V memset(3) here */
  89. *(htab_p-16) = m1;
  90. *(htab_p-15) = m1;
  91. *(htab_p-14) = m1;
  92. *(htab_p-13) = m1;
  93. *(htab_p-12) = m1;
  94. *(htab_p-11) = m1;
  95. *(htab_p-10) = m1;
  96. *(htab_p-9) = m1;
  97. *(htab_p-8) = m1;
  98. *(htab_p-7) = m1;
  99. *(htab_p-6) = m1;
  100. *(htab_p-5) = m1;
  101. *(htab_p-4) = m1;
  102. *(htab_p-3) = m1;
  103. *(htab_p-2) = m1;
  104. *(htab_p-1) = m1;
  105. htab_p -= 16;
  106. }
  107. while ((i -= 16) >= 0);
  108. for ( i += 16; i > 0; i-- )
  109. {
  110. *--htab_p = m1;
  111. }
  112. }
  113. /*-
  114. * Output the given code.
  115. * Inputs:
  116. * code: A n_bits-bit integer. If == -1, then EOF. This assumes
  117. * that n_bits =< (long)wordsize - 1.
  118. * Outputs:
  119. * Outputs code to the file.
  120. * Assumptions:
  121. * Chars are 8 bits long.
  122. * Algorithm:
  123. * Maintain a BITS character long buffer (so that 8 codes will
  124. * fit in it exactly). Use the VAX insv instruction to insert each
  125. * code in turn. When the buffer fills up empty it and start over.
  126. */
  127. static char buf[BITS];
  128. #ifndef vax
  129. char_type lmask[9] = {0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00};
  130. char_type rmask[9] = {0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
  131. #endif /* vax */
  132. static int output(struct cmcompress_stream* cdata, code_int code)
  133. {
  134. #ifdef DEBUG
  135. static int col = 0;
  136. #endif /* DEBUG */
  137. /*
  138. * On the VAX, it is important to have the register declarations
  139. * in exactly the order given, or the asm will break.
  140. */
  141. register int r_off = cdata->offset, bits= cdata->n_bits;
  142. register char * bp = buf;
  143. #ifdef DEBUG
  144. if ( verbose )
  145. {
  146. fprintf( stderr, "%5d%c", code,
  147. (col+=6) >= 74 ? (col = 0, '\n') : ' ' );
  148. }
  149. #endif /* DEBUG */
  150. if ( code >= 0 )
  151. {
  152. #if defined(vax) && !defined(__GNUC__)
  153. /*
  154. * VAX and PCC DEPENDENT!! Implementation on other machines is
  155. * below.
  156. *
  157. * Translation: Insert BITS bits from the argument starting at
  158. * cdata->offset bits from the beginning of buf.
  159. */
  160. 0; /* Work around for pcc -O bug with asm and if stmt */
  161. asm( "insv 4(ap),r11,r10,(r9)" );
  162. #else
  163. /*
  164. * byte/bit numbering on the VAX is simulated by the following code
  165. */
  166. /*
  167. * Get to the first byte.
  168. */
  169. bp += (r_off >> 3);
  170. r_off &= 7;
  171. /*
  172. * Since code is always >= 8 bits, only need to mask the first
  173. * hunk on the left.
  174. */
  175. *bp = (char)((*bp & rmask[r_off]) | ((code << r_off) & lmask[r_off]));
  176. bp++;
  177. bits -= (8 - r_off);
  178. code >>= 8 - r_off;
  179. /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
  180. if ( bits >= 8 )
  181. {
  182. *bp++ = (char)(code);
  183. code >>= 8;
  184. bits -= 8;
  185. }
  186. /* Last bits. */
  187. if(bits)
  188. {
  189. *bp = (char)(code);
  190. }
  191. #endif /* vax */
  192. cdata->offset += cdata->n_bits;
  193. if ( cdata->offset == (cdata->n_bits << 3) )
  194. {
  195. bp = buf;
  196. bits = cdata->n_bits;
  197. cdata->bytes_out += bits;
  198. do
  199. {
  200. if ( cdata->output_stream(cdata, bp, 1) != 1 )
  201. {
  202. return 0;
  203. }
  204. bp++;
  205. }
  206. while(--bits);
  207. cdata->offset = 0;
  208. }
  209. /*
  210. * If the next entry is going to be too big for the code size,
  211. * then increase it, if possible.
  212. */
  213. if ( cdata->free_ent > cdata->maxcode || (cdata->clear_flg > 0))
  214. {
  215. /*
  216. * Write the whole buffer, because the input side won't
  217. * discover the size increase until after it has read it.
  218. */
  219. if ( cdata->offset > 0 )
  220. {
  221. if ( cdata->output_stream(cdata, buf, cdata->n_bits) != cdata->n_bits )
  222. {
  223. return 0;
  224. }
  225. cdata->bytes_out += cdata->n_bits;
  226. }
  227. cdata->offset = 0;
  228. if ( cdata->clear_flg )
  229. {
  230. cdata->maxcode = MAXCODE (cdata->n_bits = INIT_BITS);
  231. cdata->clear_flg = 0;
  232. }
  233. else
  234. {
  235. cdata->n_bits++;
  236. if ( cdata->n_bits == cdata->maxbits )
  237. {
  238. cdata->maxcode = cdata->maxmaxcode;
  239. }
  240. else
  241. {
  242. cdata->maxcode = MAXCODE(cdata->n_bits);
  243. }
  244. }
  245. #ifdef DEBUG
  246. if ( debug )
  247. {
  248. fprintf( stderr, "\nChange to %d bits\n", cdata->n_bits );
  249. col = 0;
  250. }
  251. #endif /* DEBUG */
  252. }
  253. }
  254. else
  255. {
  256. /*
  257. * At EOF, write the rest of the buffer.
  258. */
  259. if ( cdata->offset > 0 )
  260. {
  261. cdata->offset = (cdata->offset + 7) / 8;
  262. if ( cdata->output_stream(cdata, buf, cdata->offset ) != cdata->offset )
  263. {
  264. return 0;
  265. }
  266. cdata->bytes_out += cdata->offset;
  267. }
  268. cdata->offset = 0;
  269. (void)fflush( stdout );
  270. if( ferror( stdout ) )
  271. {
  272. return 0;
  273. }
  274. #ifdef DEBUG
  275. if ( verbose )
  276. {
  277. fprintf( stderr, "\n" );
  278. }
  279. #endif
  280. }
  281. return 1;
  282. }
  283. /*
  284. * compress stdin to stdout
  285. *
  286. * Algorithm: use open addressing double hashing (no chaining) on the
  287. * prefix code / next character combination. We do a variant of Knuth's
  288. * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
  289. * secondary probe. Here, the modular division first probe is gives way
  290. * to a faster exclusive-or manipulation. Also do block compression with
  291. * an adaptive reset, whereby the code table is cleared when the compression
  292. * ratio decreases, but after the table fills. The variable-length output
  293. * codes are re-sized at this point, and a special CLEAR code is generated
  294. * for the decompressor. Late addition: construct the table according to
  295. * file size for noticeable speed improvement on small files. Please direct
  296. * questions about this implementation to ames!jaw.
  297. */
  298. int cmcompress_compress_start(struct cmcompress_stream* cdata)
  299. {
  300. #ifndef COMPATIBLE
  301. if (cdata->nomagic == 0)
  302. {
  303. char headLast = (char)(cdata->maxbits | cdata->block_compress);
  304. cdata->output_stream(cdata, (const char*)magic_header, 2);
  305. cdata->output_stream(cdata, &headLast, 1);
  306. if(ferror(stdout))
  307. {
  308. printf("Error...\n");
  309. }
  310. }
  311. #endif /* COMPATIBLE */
  312. cdata->offset = 0;
  313. cdata->bytes_out = 3; /* includes 3-byte header mojo */
  314. cdata->out_count = 0;
  315. cdata->clear_flg = 0;
  316. cdata->ratio = 0;
  317. cdata->in_count = 1;
  318. cdata->checkpoint = CHECK_GAP;
  319. cdata->maxcode = MAXCODE(cdata->n_bits = INIT_BITS);
  320. cdata->free_ent = ((cdata->block_compress) ? FIRST : 256 );
  321. cdata->first_pass = 1;
  322. cdata->hshift = 0;
  323. for ( cdata->fcode = (long) cdata->hsize; cdata->fcode < 65536L; cdata->fcode *= 2L )
  324. {
  325. cdata->hshift++;
  326. }
  327. cdata->hshift = 8 - cdata->hshift; /* set hash code range bound */
  328. cdata->hsize_reg = cdata->hsize;
  329. cl_hash(cdata, (count_int) cdata->hsize_reg); /* clear hash table */
  330. return 1;
  331. }
  332. static int cl_block (struct cmcompress_stream* cdata) /* table clear for block compress */
  333. {
  334. register long int rat;
  335. cdata->checkpoint = cdata->in_count + CHECK_GAP;
  336. #ifdef DEBUG
  337. if ( cdata->debug )
  338. {
  339. fprintf ( stderr, "count: %ld, ratio: ", cdata->in_count );
  340. prratio ( stderr, cdata->in_count, cdata->bytes_out );
  341. fprintf ( stderr, "\n");
  342. }
  343. #endif /* DEBUG */
  344. if(cdata->in_count > 0x007fffff)
  345. { /* shift will overflow */
  346. rat = cdata->bytes_out >> 8;
  347. if(rat == 0)
  348. { /* Don't divide by zero */
  349. rat = 0x7fffffff;
  350. }
  351. else
  352. {
  353. rat = cdata->in_count / rat;
  354. }
  355. }
  356. else
  357. {
  358. rat = (cdata->in_count << 8) / cdata->bytes_out; /* 8 fractional bits */
  359. }
  360. if ( rat > cdata->ratio )
  361. {
  362. cdata->ratio = rat;
  363. }
  364. else
  365. {
  366. cdata->ratio = 0;
  367. #ifdef DEBUG
  368. if(cdata->verbose)
  369. {
  370. dump_tab(); /* dump string table */
  371. }
  372. #endif
  373. cl_hash (cdata, (count_int) cdata->hsize );
  374. cdata->free_ent = FIRST;
  375. cdata->clear_flg = 1;
  376. if ( !output (cdata, (code_int) CLEAR ) )
  377. {
  378. return 0;
  379. }
  380. #ifdef DEBUG
  381. if(cdata->debug)
  382. {
  383. fprintf ( stderr, "clear\n" );
  384. }
  385. #endif /* DEBUG */
  386. }
  387. return 1;
  388. }
  389. int cmcompress_compress(struct cmcompress_stream* cdata, void* buff, size_t n)
  390. {
  391. register code_int i;
  392. register int c;
  393. register int disp;
  394. unsigned char* input_buffer = (unsigned char*)buff;
  395. size_t cc;
  396. /*printf("cmcompress_compress(%p, %p, %d)\n", cdata, buff, n);*/
  397. if ( cdata->first_pass )
  398. {
  399. cdata->ent = input_buffer[0];
  400. ++ input_buffer;
  401. -- n;
  402. cdata->first_pass = 0;
  403. }
  404. for ( cc = 0; cc < n; ++ cc )
  405. {
  406. c = input_buffer[cc];
  407. cdata->in_count++;
  408. cdata->fcode = (long) (((long) c << cdata->maxbits) + cdata->ent);
  409. i = ((c << cdata->hshift) ^ cdata->ent); /* xor hashing */
  410. if ( htabof (i) == cdata->fcode )
  411. {
  412. cdata->ent = codetabof (i);
  413. continue;
  414. }
  415. else if ( (long)htabof (i) < 0 ) /* empty slot */
  416. {
  417. goto nomatch;
  418. }
  419. disp = (int)(cdata->hsize_reg - i); /* secondary hash (after G. Knott) */
  420. if ( i == 0 )
  421. {
  422. disp = 1;
  423. }
  424. probe:
  425. if ( (i -= disp) < 0 )
  426. {
  427. i += cdata->hsize_reg;
  428. }
  429. if ( htabof (i) == cdata->fcode )
  430. {
  431. cdata->ent = codetabof (i);
  432. continue;
  433. }
  434. if ( (long)htabof (i) > 0 )
  435. {
  436. goto probe;
  437. }
  438. nomatch:
  439. if ( !output(cdata, (code_int) cdata->ent ) )
  440. {
  441. return 0;
  442. }
  443. cdata->out_count++;
  444. cdata->ent = c;
  445. if (
  446. #ifdef SIGNED_COMPARE_SLOW
  447. (unsigned) cdata->free_ent < (unsigned) cdata->maxmaxcode
  448. #else
  449. cdata->free_ent < cdata->maxmaxcode
  450. #endif
  451. )
  452. {
  453. codetabof (i) = (unsigned short)(cdata->free_ent++); /* code -> hashtable */
  454. htabof (i) = cdata->fcode;
  455. }
  456. else if ( (count_int)cdata->in_count >= cdata->checkpoint && cdata->block_compress )
  457. {
  458. if ( !cl_block (cdata) )
  459. {
  460. return 0;
  461. }
  462. }
  463. }
  464. return 1;
  465. }
  466. int cmcompress_compress_finalize(struct cmcompress_stream* cdata)
  467. {
  468. /*
  469. * Put out the final code.
  470. */
  471. if ( !output(cdata, (code_int)cdata->ent ) )
  472. {
  473. return 0;
  474. }
  475. cdata->out_count++;
  476. if ( !output(cdata, (code_int)-1 ) )
  477. {
  478. return 0;
  479. }
  480. if(cdata->bytes_out > cdata->in_count) /* exit(2) if no savings */
  481. {
  482. return 0;
  483. }
  484. return 1;
  485. }
  486. #if defined(DEBUG)
  487. static void prratio(FILE *stream, long int num, long int den)
  488. {
  489. register int q; /* Doesn't need to be long */
  490. if(num > 214748L)
  491. { /* 2147483647/10000 */
  492. q = num / (den / 10000L);
  493. }
  494. else
  495. {
  496. q = 10000L * num / den; /* Long calculations, though */
  497. }
  498. if (q < 0)
  499. {
  500. putc('-', stream);
  501. q = -q;
  502. }
  503. fprintf(stream, "%d.%02d%%", q / 100, q % 100);
  504. }
  505. #endif