/* Copyright (c) 1990-2007 Info-ZIP. All rights reserved. See the accompanying file LICENSE, version 2007-Mar-04 or later (the contents of which are also included in unzip.h) for terms of use. If, for some reason, all these files are missing, the Info-ZIP license also may be found at: ftp://ftp.info-zip.org/pub/infozip/license.html */ /* explode.c -- by Mark Adler version c17d, 01 December 2007 */ /* Copyright history: - Starting with UnZip 5.41 of 16-April-2000, this source file is covered by the Info-Zip LICENSE cited above. - Prior versions of this source file, found in UnZip source packages up to UnZip 5.40, were put in the public domain. The original copyright note by Mark Adler was: "You can do whatever you like with this source file, though I would prefer that if you modify it and redistribute it that you include comments to that effect with your name and the date. Thank you." History: vers date who what ---- --------- -------------- ------------------------------------ c1 30 Mar 92 M. Adler explode that uses huft_build from inflate (this gives over a 70% speed improvement over the original unimplode.c, which decoded a bit at a time) c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. c3 10 Apr 92 M. Adler added a little memory tracking if DEBUG c4 11 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy() c5 21 Apr 92 M. Adler added the WSIZE #define to allow reducing the 32K window size for specialized applications. c6 31 May 92 M. Adler added typecasts to eliminate some warnings c7 27 Jun 92 G. Roelofs added more typecasts. c8 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch. c9 19 Jul 93 J. Bush added more typecasts (to return values); made l[256] array static for Amiga. c10 8 Oct 93 G. Roelofs added used_csize for diagnostics; added buf and unshrink arguments to flush(); undef'd various macros at end for Turbo C; removed NEXTBYTE macro (now in unzip.h) and bytebuf variable (not used); changed memset() to memzero(). c11 9 Jan 94 M. Adler fixed incorrect used_csize calculation. c12 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines to avoid bug in Encore compiler. c13 25 Aug 94 M. Adler fixed distance-length comment (orig c9 fix) c14 22 Nov 95 S. Maxwell removed unnecessary "static" on auto array c15 6 Jul 96 W. Haidinger added ulg typecasts to flush() calls. c16 8 Feb 98 C. Spieler added ZCONST modifiers to const tables and #ifdef DEBUG around debugging code. c16b 25 Mar 98 C. Spieler modified DLL code for slide redirection. c16d 05 Jul 99 C. Spieler take care of flush() return values and stop processing in case of errors c17 04 Feb 01 C. Spieler reorganized code to reduce repetitions of large code parts; adapted huft decoding to the changes in inflate's huft_build() due to support of deflate64; fixed memory leaks (huft tables were not free'd when get_tree() failed). c17b 16 Feb 02 C. Spieler changed type of the "extra lengths" array "extra" from ush into uch (to save space) c17c 10 Aug 04 NN file sizes use zoff_t. c17d 01 Dec 07 C. Spieler type for file sizes changed from zoff_t into zusz_t. */ /* Explode imploded (PKZIP method 6 compressed) data. This compression method searches for as much of the current string of bytes (up to a length of ~320) in the previous 4K or 8K bytes. If it doesn't find any matches (of at least length 2 or 3), it codes the next byte. Otherwise, it codes the length of the matched string and its distance backwards from the current position. Single bytes ("literals") are preceded by a one (a single bit) and are either uncoded (the eight bits go directly into the compressed stream for a total of nine bits) or Huffman coded with a supplied literal code tree. If literals are coded, then the minimum match length is three, otherwise it is two. There are therefore four kinds of imploded streams: 8K search with coded literals (min match = 3), 4K search with coded literals (min match = 3), 8K with uncoded literals (min match = 2), and 4K with uncoded literals (min match = 2). The kind of stream is identified in two bits of a general purpose bit flag that is outside of the compressed stream. Distance-length pairs for matched strings are preceded by a zero bit (to distinguish them from literals) and are always coded. The distance comes first and is either the low six (4K) or low seven (8K) bits of the distance (uncoded), followed by the high six bits of the distance coded. Then the length is six bits coded (0..63 + min match length), and if the maximum such length is coded, then it's followed by another eight bits (uncoded) to be added to the coded length. This gives a match length range of 2..320 or 3..321 bytes. The literal, length, and distance codes are all represented in a slightly compressed form themselves. What is sent are the lengths of the codes for each value, which is sufficient to construct the codes. Each byte of the code representation is the code length (the low four bits representing 1..16), and the number of values sequentially with that length (the high four bits also representing 1..16). There are 256 literal code values (if literals are coded), 64 length code values, and 64 distance code values, in that order at the beginning of the compressed stream. Each set of code values is preceded (redundantly) with a byte indicating how many bytes are in the code description that follows, in the range 1..256. The codes themselves are decoded using tables made by huft_build() from the bit lengths. That routine and its comments are in the inflate.c module. */ #define __EXPLODE_C /* identifies this source module */ #define UNZIP_INTERNAL #include "unzip.h" /* must supply slide[] (uch) array and NEXTBYTE macro */ #ifndef WSIZE # define WSIZE 0x8000 /* window size--must be a power of two, and */ #endif /* at least 8K for zip's implode method */ #if (defined(DLL) && !defined(NO_SLIDE_REDIR)) # define wszimpl (unsigned)(G._wsize) #else # if defined(USE_DEFLATE64) && defined(INT_16BIT) # define wszimpl (unsigned)(WSIZE>>1) # else /* !(USE_DEFLATE64 && INT_16BIT) */ # define wszimpl WSIZE # endif /* !(USE_DEFLATE64 && INT_16BIT) */ #endif /* routines here */ static int get_tree OF((__GPRO__ unsigned *l, unsigned n)); static int explode_lit OF((__GPRO__ struct huft *tb, struct huft *tl, struct huft *td, unsigned bb, unsigned bl, unsigned bd, unsigned bdl)); static int explode_nolit OF((__GPRO__ struct huft *tl, struct huft *td, unsigned bl, unsigned bd, unsigned bdl)); int explode OF((__GPRO)); /* The implode algorithm uses a sliding 4K or 8K byte window on the uncompressed stream to find repeated byte strings. This is implemented here as a circular buffer. The index is updated simply by incrementing and then and'ing with 0x0fff (4K-1) or 0x1fff (8K-1). Here, the 32K buffer of inflate is used, and it works just as well to always have a 32K circular buffer, so the index is anded with 0x7fff. This is done to allow the window to also be used as the output buffer. */ /* This must be supplied in an external module useable like "uch slide[8192];" or "uch *slide;", where the latter would be malloc'ed. In unzip, slide[] is actually a 32K area for use by inflate, which uses a 32K sliding window. */ #define INVALID_CODE 99 #define IS_INVALID_CODE(c) ((c) == INVALID_CODE) /* Tables for length and distance */ static ZCONST ush cplen2[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65}; static ZCONST ush cplen3[] = {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66}; static ZCONST uch extra[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8}; static ZCONST ush cpdist4[] = {1, 65, 129, 193, 257, 321, 385, 449, 513, 577, 641, 705, 769, 833, 897, 961, 1025, 1089, 1153, 1217, 1281, 1345, 1409, 1473, 1537, 1601, 1665, 1729, 1793, 1857, 1921, 1985, 2049, 2113, 2177, 2241, 2305, 2369, 2433, 2497, 2561, 2625, 2689, 2753, 2817, 2881, 2945, 3009, 3073, 3137, 3201, 3265, 3329, 3393, 3457, 3521, 3585, 3649, 3713, 3777, 3841, 3905, 3969, 4033}; static ZCONST ush cpdist8[] = {1, 129, 257, 385, 513, 641, 769, 897, 1025, 1153, 1281, 1409, 1537, 1665, 1793, 1921, 2049, 2177, 2305, 2433, 2561, 2689, 2817, 2945, 3073, 3201, 3329, 3457, 3585, 3713, 3841, 3969, 4097, 4225, 4353, 4481, 4609, 4737, 4865, 4993, 5121, 5249, 5377, 5505, 5633, 5761, 5889, 6017, 6145, 6273, 6401, 6529, 6657, 6785, 6913, 7041, 7169, 7297, 7425, 7553, 7681, 7809, 7937, 8065}; /* Macros for inflate() bit peeking and grabbing. The usage is: NEEDBITS(j) x = b & mask_bits[j]; DUMPBITS(j) where NEEDBITS makes sure that b has at least j bits in it, and DUMPBITS removes the bits from b. The macros use the variable k for the number of bits in b. Normally, b and k are register variables for speed. */ #define NEEDBITS(n) {while(k<(n)){b|=((ulg)NEXTBYTE)<>=(n);k-=(n);} #define DECODEHUFT(htab, bits, mask) {\ NEEDBITS((unsigned)(bits))\ t = (htab) + ((~(unsigned)b)&(mask));\ while (1) {\ DUMPBITS(t->b)\ if ((e=t->e) <= 32) break;\ if (IS_INVALID_CODE(e)) return 1;\ e &= 31;\ NEEDBITS(e)\ t = t->v.t + ((~(unsigned)b)&mask_bits[e]);\ }\ } static int get_tree(__G__ l, n) __GDEF unsigned *l; /* bit lengths */ unsigned n; /* number expected */ /* Get the bit lengths for a code representation from the compressed stream. If get_tree() returns 4, then there is an error in the data. Otherwise zero is returned. */ { unsigned i; /* bytes remaining in list */ unsigned k; /* lengths entered */ unsigned j; /* number of codes */ unsigned b; /* bit length for those codes */ /* get bit lengths */ i = NEXTBYTE + 1; /* length/count pairs to read */ k = 0; /* next code */ do { b = ((j = NEXTBYTE) & 0xf) + 1; /* bits in code (1..16) */ j = ((j & 0xf0) >> 4) + 1; /* codes with those bits (1..16) */ if (k + j > n) return 4; /* don't overflow l[] */ do { l[k++] = b; } while (--j); } while (--i); return k != n ? 4 : 0; /* should have read n of them */ } static int explode_lit(__G__ tb, tl, td, bb, bl, bd, bdl) __GDEF struct huft *tb, *tl, *td; /* literal, length, and distance tables */ unsigned bb, bl, bd; /* number of bits decoded by those */ unsigned bdl; /* number of distance low bits */ /* Decompress the imploded data using coded literals and a sliding window (of size 2^(6+bdl) bytes). */ { zusz_t s; /* bytes to decompress */ register unsigned e; /* table entry flag/number of extra bits */ unsigned n, d; /* length and index for copy */ unsigned w; /* current window position */ struct huft *t; /* pointer to table entry */ unsigned mb, ml, md; /* masks for bb, bl, and bd bits */ unsigned mdl; /* mask for bdl (distance lower) bits */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ unsigned u; /* true if unflushed */ int retval = 0; /* error code returned: initialized to "no error" */ /* explode the coded data */ b = k = w = 0; /* initialize bit buffer, window */ u = 1; /* buffer unflushed */ mb = mask_bits[bb]; /* precompute masks for speed */ ml = mask_bits[bl]; md = mask_bits[bd]; mdl = mask_bits[bdl]; s = G.lrec.ucsize; while (s > 0) /* do until ucsize bytes uncompressed */ { NEEDBITS(1) if (b & 1) /* then literal--decode it */ { DUMPBITS(1) s--; DECODEHUFT(tb, bb, mb) /* get coded literal */ redirSlide[w++] = (uch)t->v.n; if (w == wszimpl) { if ((retval = flush(__G__ redirSlide, (ulg)w, 0)) != 0) return retval; w = u = 0; } } else /* else distance/length */ { DUMPBITS(1) NEEDBITS(bdl) /* get distance low bits */ d = (unsigned)b & mdl; DUMPBITS(bdl) DECODEHUFT(td, bd, md) /* get coded distance high bits */ d = w - d - t->v.n; /* construct offset */ DECODEHUFT(tl, bl, ml) /* get coded length */ n = t->v.n; if (e) /* get length extra bits */ { NEEDBITS(8) n += (unsigned)b & 0xff; DUMPBITS(8) } /* do the copy */ s = (s > (zusz_t)n ? s - (zusz_t)n : 0); do { #if (defined(DLL) && !defined(NO_SLIDE_REDIR)) if (G.redirect_slide) { /* &= w/ wszimpl not needed and wrong if redirect */ if (d >= wszimpl) return 1; e = wszimpl - (d > w ? d : w); } else #endif e = wszimpl - ((d &= wszimpl-1) > w ? d : w); if (e > n) e = n; n -= e; if (u && w <= d) { memzero(redirSlide + w, e); w += e; d += e; } else #ifndef NOMEMCPY if (w - d >= e) /* (this test assumes unsigned comparison) */ { memcpy(redirSlide + w, redirSlide + d, e); w += e; d += e; } else /* do it slow to avoid memcpy() overlap */ #endif /* !NOMEMCPY */ do { redirSlide[w++] = redirSlide[d++]; } while (--e); if (w == wszimpl) { if ((retval = flush(__G__ redirSlide, (ulg)w, 0)) != 0) return retval; w = u = 0; } } while (n); } } /* flush out redirSlide */ if ((retval = flush(__G__ redirSlide, (ulg)w, 0)) != 0) return retval; if (G.csize + G.incnt + (k >> 3)) /* should have read csize bytes, but */ { /* sometimes read one too many: k>>3 compensates */ G.used_csize = G.lrec.csize - G.csize - G.incnt - (k >> 3); return 5; } return 0; } static int explode_nolit(__G__ tl, td, bl, bd, bdl) __GDEF struct huft *tl, *td; /* length and distance decoder tables */ unsigned bl, bd; /* number of bits decoded by tl[] and td[] */ unsigned bdl; /* number of distance low bits */ /* Decompress the imploded data using uncoded literals and a sliding window (of size 2^(6+bdl) bytes). */ { zusz_t s; /* bytes to decompress */ register unsigned e; /* table entry flag/number of extra bits */ unsigned n, d; /* length and index for copy */ unsigned w; /* current window position */ struct huft *t; /* pointer to table entry */ unsigned ml, md; /* masks for bl and bd bits */ unsigned mdl; /* mask for bdl (distance lower) bits */ register ulg b; /* bit buffer */ register unsigned k; /* number of bits in bit buffer */ unsigned u; /* true if unflushed */ int retval = 0; /* error code returned: initialized to "no error" */ /* explode the coded data */ b = k = w = 0; /* initialize bit buffer, window */ u = 1; /* buffer unflushed */ ml = mask_bits[bl]; /* precompute masks for speed */ md = mask_bits[bd]; mdl = mask_bits[bdl]; s = G.lrec.ucsize; while (s > 0) /* do until ucsize bytes uncompressed */ { NEEDBITS(1) if (b & 1) /* then literal--get eight bits */ { DUMPBITS(1) s--; NEEDBITS(8) redirSlide[w++] = (uch)b; if (w == wszimpl) { if ((retval = flush(__G__ redirSlide, (ulg)w, 0)) != 0) return retval; w = u = 0; } DUMPBITS(8) } else /* else distance/length */ { DUMPBITS(1) NEEDBITS(bdl) /* get distance low bits */ d = (unsigned)b & mdl; DUMPBITS(bdl) DECODEHUFT(td, bd, md) /* get coded distance high bits */ d = w - d - t->v.n; /* construct offset */ DECODEHUFT(tl, bl, ml) /* get coded length */ n = t->v.n; if (e) /* get length extra bits */ { NEEDBITS(8) n += (unsigned)b & 0xff; DUMPBITS(8) } /* do the copy */ s = (s > (zusz_t)n ? s - (zusz_t)n : 0); do { #if (defined(DLL) && !defined(NO_SLIDE_REDIR)) if (G.redirect_slide) { /* &= w/ wszimpl not needed and wrong if redirect */ if (d >= wszimpl) return 1; e = wszimpl - (d > w ? d : w); } else #endif e = wszimpl - ((d &= wszimpl-1) > w ? d : w); if (e > n) e = n; n -= e; if (u && w <= d) { memzero(redirSlide + w, e); w += e; d += e; } else #ifndef NOMEMCPY if (w - d >= e) /* (this test assumes unsigned comparison) */ { memcpy(redirSlide + w, redirSlide + d, e); w += e; d += e; } else /* do it slow to avoid memcpy() overlap */ #endif /* !NOMEMCPY */ do { redirSlide[w++] = redirSlide[d++]; } while (--e); if (w == wszimpl) { if ((retval = flush(__G__ redirSlide, (ulg)w, 0)) != 0) return retval; w = u = 0; } } while (n); } } /* flush out redirSlide */ if ((retval = flush(__G__ redirSlide, (ulg)w, 0)) != 0) return retval; if (G.csize + G.incnt + (k >> 3)) /* should have read csize bytes, but */ { /* sometimes read one too many: k>>3 compensates */ G.used_csize = G.lrec.csize - G.csize - G.incnt - (k >> 3); return 5; } return 0; } int explode(__G) __GDEF /* Explode an imploded compressed stream. Based on the general purpose bit flag, decide on coded or uncoded literals, and an 8K or 4K sliding window. Construct the literal (if any), length, and distance codes and the tables needed to decode them (using huft_build() from inflate.c), and call the appropriate routine for the type of data in the remainder of the stream. The four routines are nearly identical, differing only in whether the literal is decoded or simply read in, and in how many bits are read in, uncoded, for the low distance bits. */ { unsigned r; /* return codes */ struct huft *tb; /* literal code table */ struct huft *tl; /* length code table */ struct huft *td; /* distance code table */ unsigned bb; /* bits for tb */ unsigned bl; /* bits for tl */ unsigned bd; /* bits for td */ unsigned bdl; /* number of uncoded lower distance bits */ unsigned l[256]; /* bit lengths for codes */ #if (defined(DLL) && !defined(NO_SLIDE_REDIR)) if (G.redirect_slide) /* For 16-bit systems, it has already been checked at DLL entrance that * the buffer size in G.redirect_size does not exceed unsigned range. */ G._wsize = G.redirect_size, redirSlide = G.redirect_buffer; else #if defined(USE_DEFLATE64) && defined(INT_16BIT) /* For systems using 16-bit ints, reduce the used buffer size below * the limit of "unsigned int" numbers range. */ G._wsize = WSIZE>>1, redirSlide = slide; #else /* !(USE_DEFLATE64 && INT_16BIT) */ G._wsize = WSIZE, redirSlide = slide; #endif /* !(USE_DEFLATE64 && INT_16BIT) */ #endif /* DLL && !NO_SLIDE_REDIR */ /* Tune base table sizes. Note: I thought that to truly optimize speed, I would have to select different bl, bd, and bb values for different compressed file sizes. I was surprised to find out that the values of 7, 7, and 9 worked best over a very wide range of sizes, except that bd = 8 worked marginally better for large compressed sizes. */ bl = 7; bd = (G.csize + G.incnt) > 200000L ? 8 : 7; #ifdef DEBUG G.hufts = 0; /* initialize huft's malloc'ed */ #endif if (G.lrec.general_purpose_bit_flag & 4) /* With literal tree--minimum match length is 3 */ { bb = 9; /* base table size for literals */ if ((r = get_tree(__G__ l, 256)) != 0) return (int)r; if ((r = huft_build(__G__ l, 256, 256, NULL, NULL, &tb, &bb)) != 0) { if (r == 1) huft_free(tb); return (int)r; } if ((r = get_tree(__G__ l, 64)) != 0) { huft_free(tb); return (int)r; } if ((r = huft_build(__G__ l, 64, 0, cplen3, extra, &tl, &bl)) != 0) { if (r == 1) huft_free(tl); huft_free(tb); return (int)r; } } else /* No literal tree--minimum match length is 2 */ { tb = (struct huft *)NULL; if ((r = get_tree(__G__ l, 64)) != 0) return (int)r; if ((r = huft_build(__G__ l, 64, 0, cplen2, extra, &tl, &bl)) != 0) { if (r == 1) huft_free(tl); return (int)r; } } if ((r = get_tree(__G__ l, 64)) != 0) { huft_free(tl); if (tb != (struct huft *)NULL) huft_free(tb); return (int)r; } if (G.lrec.general_purpose_bit_flag & 2) /* true if 8K */ { bdl = 7; r = huft_build(__G__ l, 64, 0, cpdist8, extra, &td, &bd); } else /* else 4K */ { bdl = 6; r = huft_build(__G__ l, 64, 0, cpdist4, extra, &td, &bd); } if (r != 0) { if (r == 1) huft_free(td); huft_free(tl); if (tb != (struct huft *)NULL) huft_free(tb); return (int)r; } if (tb != NULL) { r = explode_lit(__G__ tb, tl, td, bb, bl, bd, bdl); huft_free(tb); } else { r = explode_nolit(__G__ tl, td, bl, bd, bdl); } huft_free(td); huft_free(tl); Trace((stderr, "<%u > ", G.hufts)); return (int)r; } /* so explode.c and inflate.c can be compiled together into one object: */ #undef DECODEHUFT #undef NEEDBITS #undef DUMPBITS #undef wszimpl