libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

extractor_datasource.c (34206B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 /**
     21  * @file main/extractor_datasource.c
     22  * @brief random access and possibly decompression of data from buffer in memory or file on disk
     23  * @author Christian Grothoff
     24  */
     25 #include "platform.h"
     26 #include "extractor_logging.h"
     27 #include "extractor_datasource.h"
     28 
     29 #if HAVE_LIBBZ2
     30 #include <bzlib.h>
     31 #define MIN_BZ2_HEADER 4
     32 #ifndef MIN_COMPRESSED_HEADER
     33 #define MIN_COMPRESSED_HEADER MIN_ZLIB_HEADER
     34 #endif
     35 #endif
     36 
     37 #if HAVE_ZLIB
     38 #include <zlib.h>
     39 #define MIN_ZLIB_HEADER 12
     40 #ifndef MIN_COMPRESSED_HEADER
     41 #define MIN_COMPRESSED_HEADER MIN_BZ2_HEADER
     42 #endif
     43 #endif
     44 
     45 #ifndef MIN_COMPRESSED_HEADER
     46 #define MIN_COMPRESSED_HEADER -1
     47 #endif
     48 
     49 #ifndef O_LARGEFILE
     50 #define O_LARGEFILE 0
     51 #endif
     52 
     53 /**
     54  * Maximum size of an IO buffer.
     55  */
     56 #define MAX_READ (4 * 1024 * 1024)
     57 
     58 /**
     59  * Data is read from the source and shoved into decompressor
     60  * in chunks this big.
     61  */
     62 #define COM_CHUNK_SIZE (16 * 1024)
     63 
     64 
     65 /**
     66  * Enum with the various possible types of compression supported.
     67  */
     68 enum ExtractorCompressionType
     69 {
     70   /**
     71    * We cannot tell from the data (header incomplete).
     72    */
     73   COMP_TYPE_UNDEFINED = -1,
     74 
     75   /**
     76    * Invalid header (likely uncompressed)
     77    */
     78   COMP_TYPE_INVALID = 0,
     79 
     80   /**
     81    * libz / gzip compression.
     82    */
     83   COMP_TYPE_ZLIB = 1,
     84 
     85   /**
     86    * bz2 compression
     87    */
     88   COMP_TYPE_BZ2 = 2
     89 };
     90 
     91 
     92 /**
     93  * Abstraction of the data source (file or a memory buffer)
     94  * for the decompressor.
     95  */
     96 struct BufferedFileDataSource
     97 {
     98   /**
     99    * Pointer to the buffer to read from (may be NULL)
    100    */
    101   const void *data;
    102 
    103   /**
    104    * A buffer to read into. For fd != -1: when data != NULL,
    105    * data is used directly.
    106    */
    107   void *buffer;
    108 
    109   /**
    110    * Size of the file (or the data buffer)
    111    */
    112   uint64_t fsize;
    113 
    114   /**
    115    * Position of the buffer in the file.
    116    */
    117   uint64_t fpos;
    118 
    119   /**
    120    * Position within the buffer.  Our absolute offset in the file
    121    * is thus 'fpos + buffer_pos'.
    122    */
    123   size_t buffer_pos;
    124 
    125   /**
    126    * Number of valid bytes in the buffer (<= buffer_size)
    127    */
    128   size_t buffer_bytes;
    129 
    130   /**
    131    * Allocated size of the buffer
    132    */
    133   size_t buffer_size;
    134 
    135   /**
    136    * Descriptor of the file to read data from (may be -1)
    137    */
    138   int fd;
    139 
    140 };
    141 
    142 
    143 /**
    144  * An object from which uncompressed data can be read
    145  */
    146 struct CompressedFileSource
    147 {
    148   /**
    149    * The source of data
    150    */
    151   struct BufferedFileDataSource *bfds;
    152 
    153   /**
    154    * Decompression target buffer.
    155    */
    156   char result[COM_CHUNK_SIZE];
    157 
    158   /**
    159    * At which offset in 'result' is 'fpos'?
    160    */
    161   size_t result_pos;
    162 
    163   /**
    164    * Size of the source (same as bfds->fsize)
    165    */
    166   int64_t fsize;
    167 
    168   /**
    169    * Position within the (decompressed) source
    170    */
    171   int64_t fpos;
    172 
    173   /**
    174    * Total size of the uncompressed data. Remains -1 until
    175    * decompression is finished.
    176    */
    177   int64_t uncompressed_size;
    178 
    179 #if HAVE_LIBBZ2
    180   /**
    181    * BZ2 stream object
    182    */
    183   bz_stream bstrm;
    184 #endif
    185 
    186 #if HAVE_ZLIB
    187   /**
    188    * ZLIB stream object
    189    */
    190   z_stream strm;
    191 
    192   /**
    193    * Length of gzip header (may be 0, in that case ZLIB parses the header)
    194    */
    195   int gzip_header_length;
    196 #endif
    197 
    198   /**
    199    * The type of compression used in the source
    200    */
    201   enum ExtractorCompressionType compression_type;
    202 
    203 };
    204 
    205 
    206 /**
    207  * Makes bfds seek to 'pos' and read a chunk of bytes there.
    208  * Changes bfds->fpos, bfds->buffer_bytes and bfds->buffer_pos.
    209  * Does almost nothing for memory-backed bfds.
    210  *
    211  * @param bfds bfds
    212  * @param pos position
    213  * @return 0 on success, -1 on error
    214  */
    215 static int
    216 bfds_pick_next_buffer_at (struct BufferedFileDataSource *bfds,
    217                           uint64_t pos)
    218 {
    219   int64_t position;
    220   ssize_t rd;
    221 
    222   if (pos > bfds->fsize)
    223   {
    224     LOG ("Invalid seek operation\n");
    225     return -1;   /* invalid */
    226   }
    227   if (NULL == bfds->buffer)
    228   {
    229     bfds->buffer_pos = pos;
    230     return 0;
    231   }
    232   position = (int64_t) lseek (bfds->fd, pos, SEEK_SET);
    233   if (position < 0)
    234   {
    235     LOG_STRERROR ("lseek");
    236     return -1;
    237   }
    238   bfds->fpos = position;
    239   bfds->buffer_pos = 0;
    240   rd = read (bfds->fd, bfds->buffer, bfds->buffer_size);
    241   if (rd < 0)
    242   {
    243     LOG_STRERROR ("read");
    244     return -1;
    245   }
    246   bfds->buffer_bytes = rd;
    247   return 0;
    248 }
    249 
    250 
    251 /**
    252  * Creates a bfds
    253  *
    254  * @param data data buffer to use as a source (NULL if fd != -1)
    255  * @param fd file descriptor to use as a source (-1 if data != NULL)
    256  * @param fsize size of the file (or the buffer)
    257  * @return newly allocated bfds
    258  */
    259 static struct BufferedFileDataSource *
    260 bfds_new (const void *data,
    261           int fd,
    262           int64_t fsize)
    263 {
    264   struct BufferedFileDataSource *result;
    265   size_t xtra;
    266 
    267   if (fsize > MAX_READ)
    268     xtra = MAX_READ;
    269   else
    270     xtra = (size_t) fsize;
    271   if ( (-1 == fd) && (NULL == data) )
    272   {
    273     LOG ("Invalid arguments\n");
    274     return NULL;
    275   }
    276   if ( (-1 != fd) && (NULL != data) )
    277     fd = -1; /* don't need fd */
    278   if (NULL != data)
    279     xtra = 0;
    280   if (NULL == (result = malloc (sizeof (struct BufferedFileDataSource) + xtra)))
    281   {
    282     LOG_STRERROR ("malloc");
    283     return NULL;
    284   }
    285   memset (result, 0, sizeof (struct BufferedFileDataSource));
    286   result->data = (NULL != data) ? data : &result[1];
    287   result->buffer = (NULL != data) ? NULL : &result[1];
    288   result->buffer_size = (NULL != data) ? fsize : xtra;
    289   result->buffer_bytes = (NULL != data) ? fsize : 0;
    290   result->fsize = fsize;
    291   result->fd = fd;
    292   bfds_pick_next_buffer_at (result, 0);
    293   return result;
    294 }
    295 
    296 
    297 /**
    298  * Unallocates bfds
    299  *
    300  * @param bfds bfds to deallocate
    301  */
    302 static void
    303 bfds_delete (struct BufferedFileDataSource *bfds)
    304 {
    305   free (bfds);
    306 }
    307 
    308 
    309 /**
    310  * Makes bfds seek to 'pos' in 'whence' mode.
    311  * Will try to seek within the buffer, will move the buffer location if
    312  * the seek request falls outside of the buffer range.
    313  *
    314  * @param bfds bfds
    315  * @param pos position to seek to
    316  * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
    317  * @return new absolute position, -1 on error
    318  */
    319 static int64_t
    320 bfds_seek (struct BufferedFileDataSource *bfds,
    321            int64_t pos, int whence)
    322 {
    323   uint64_t npos;
    324   size_t nbpos;
    325 
    326   switch (whence)
    327   {
    328   case SEEK_CUR:
    329     npos = bfds->fpos + bfds->buffer_pos + pos;
    330     if (npos > bfds->fsize)
    331     {
    332       LOG ("Invalid seek operation to %lld from %llu (max is %llu)\n",
    333            (long long) pos,
    334            bfds->fpos + bfds->buffer_pos,
    335            (unsigned long long) bfds->fsize);
    336       return -1;
    337     }
    338     nbpos = bfds->buffer_pos + pos;
    339     if ( (NULL == bfds->buffer) ||
    340          (nbpos < bfds->buffer_bytes) )
    341     {
    342       bfds->buffer_pos = nbpos;
    343       return npos;
    344     }
    345     if (0 != bfds_pick_next_buffer_at (bfds,
    346                                        npos))
    347     {
    348       LOG ("seek operation failed\n");
    349       return -1;
    350     }
    351     return npos;
    352   case SEEK_END:
    353     if (pos > 0)
    354     {
    355       LOG ("Invalid seek operation\n");
    356       return -1;
    357     }
    358     if (bfds->fsize < -pos)
    359     {
    360       LOG ("Invalid seek operation\n");
    361       return -1;
    362     }
    363     pos = bfds->fsize + pos;
    364   /* fall-through! */
    365   case SEEK_SET:
    366     if (pos < 0)
    367     {
    368       LOG ("Invalid seek operation\n");
    369       return -1;
    370     }
    371     if (pos > bfds->fsize)
    372     {
    373       LOG ("Invalid seek operation (%lld > %llu) %d\n",
    374            (long long) pos,
    375            (unsigned long long) bfds->fsize,
    376            SEEK_SET == whence);
    377       return -1;
    378     }
    379     if ( (NULL == bfds->buffer) ||
    380          ( (bfds->fpos <= pos) &&
    381            (bfds->fpos + bfds->buffer_bytes > pos) ) )
    382     {
    383       bfds->buffer_pos = pos - bfds->fpos;
    384       return pos;
    385     }
    386     if (0 != bfds_pick_next_buffer_at (bfds, pos))
    387     {
    388       LOG ("seek operation failed\n");
    389       return -1;
    390     }
    391     ASSERT (pos == bfds->fpos + bfds->buffer_pos);
    392     return pos;
    393   }
    394   return -1;
    395 }
    396 
    397 
    398 /**
    399  * Fills 'buf_ptr' with a chunk of data. Will
    400  * fail if 'count' exceeds buffer size.
    401  *
    402  * @param bfds bfds
    403  * @param buf_ptr location to store data
    404  * @param count number of bytes to read
    405  * @return number of bytes (<= count) available at location pointed by buf_ptr,
    406  *         0 for end of stream, -1 on error
    407  */
    408 static ssize_t
    409 bfds_read (struct BufferedFileDataSource *bfds,
    410            void *buf_ptr,
    411            size_t count)
    412 {
    413   char *cbuf = buf_ptr;
    414   uint64_t old_off;
    415   size_t avail;
    416   size_t ret;
    417 
    418   old_off = bfds->fpos + bfds->buffer_pos;
    419   if (old_off == bfds->fsize)
    420     return 0; /* end of stream */
    421   ret = 0;
    422   while (count > 0)
    423   {
    424     if ( (bfds->buffer_bytes == bfds->buffer_pos) &&
    425          (0 != bfds_pick_next_buffer_at (bfds,
    426                                          bfds->fpos + bfds->buffer_bytes)) )
    427     {
    428       /* revert to original position, invalidate buffer */
    429       bfds->fpos = old_off;
    430       bfds->buffer_bytes = 0;
    431       bfds->buffer_pos = 0;
    432       LOG ("read operation failed\n");
    433       return -1; /* getting more failed */
    434     }
    435     avail = bfds->buffer_bytes - bfds->buffer_pos;
    436     if (avail > count)
    437       avail = count;
    438     if (0 == avail)
    439       break;
    440     memcpy (&cbuf[ret], bfds->data + bfds->buffer_pos, avail);
    441     bfds->buffer_pos += avail;
    442     count -= avail;
    443     ret += avail;
    444   }
    445   return ret;
    446 }
    447 
    448 
    449 #if HAVE_ZLIB
    450 /**
    451  * Initializes gz-decompression object. Might report metadata about
    452  * compresse stream, if available. Resets the stream to the beginning.
    453  *
    454  * @param cfs cfs to initialize
    455  * @param proc callback for metadata
    456  * @param proc_cls callback cls
    457  * @return 1 on success, 0 to terminate extraction, -1 on error
    458  */
    459 static int
    460 cfs_init_decompressor_zlib (struct CompressedFileSource *cfs,
    461                             EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
    462 {
    463   unsigned int gzip_header_length = 10;
    464   unsigned char hdata[12];
    465   ssize_t rsize;
    466 
    467   if (0 != bfds_seek (cfs->bfds, 0, SEEK_SET))
    468   {
    469     LOG ("Failed to seek to offset 0!\n");
    470     return -1;
    471   }
    472   /* Process gzip header */
    473   rsize = bfds_read (cfs->bfds, hdata, sizeof (hdata));
    474   if ( (-1 == rsize) ||
    475        (sizeof (hdata) > (size_t) rsize) )
    476     return -1;
    477   if (0 != (hdata[3] & 0x4)) /* FEXTRA  set */
    478     gzip_header_length += 2 + (hdata[10] & 0xff) + ((hdata[11] & 0xff) * 256);
    479 
    480   if (0 != (hdata[3] & 0x8))
    481   {
    482     /* FNAME set */
    483     char fname[1024];
    484     char *cptr;
    485     size_t len;
    486     ssize_t buf_bytes;
    487 
    488     if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
    489                                         SEEK_SET))
    490     {
    491       LOG ("Corrupt gzip, failed to seek to end of header\n");
    492       return -1;
    493     }
    494     buf_bytes = bfds_read (cfs->bfds, fname, sizeof (fname));
    495     if (buf_bytes <= 0)
    496     {
    497       LOG ("Corrupt gzip, failed to read filename\n");
    498       return -1;
    499     }
    500     if (NULL == (cptr = memchr (fname, 0, buf_bytes)))
    501     {
    502       LOG ("Corrupt gzip, failed to read filename terminator\n");
    503       return -1;
    504     }
    505     len = cptr - fname;
    506     if ( (NULL != proc) &&
    507          (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_FILENAME,
    508                      EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
    509                      fname,
    510                      len)) )
    511       return 0; /* done */
    512     gzip_header_length += len + 1;
    513   }
    514 
    515   if (0 != (hdata[3] & 0x16))
    516   {
    517     /* FCOMMENT set */
    518     char fcomment[1024];
    519     char *cptr;
    520     ssize_t buf_bytes;
    521     size_t len;
    522 
    523     if (gzip_header_length > bfds_seek (cfs->bfds, gzip_header_length,
    524                                         SEEK_SET))
    525     {
    526       LOG ("Corrupt gzip, failed to seek to end of header\n");
    527       return -1;
    528     }
    529     buf_bytes = bfds_read (cfs->bfds, fcomment, sizeof (fcomment));
    530     if (buf_bytes <= 0)
    531     {
    532       LOG ("Corrupt gzip, failed to read comment\n");
    533       return -1;
    534     }
    535     if (NULL == (cptr = memchr (fcomment, 0, buf_bytes)))
    536     {
    537       LOG ("Corrupt gzip, failed to read comment terminator\n");
    538       return -1;
    539     }
    540     len = cptr - fcomment;
    541     if ( (NULL != proc) &&
    542          (0 != proc (proc_cls, "<zlib>", EXTRACTOR_METATYPE_COMMENT,
    543                      EXTRACTOR_METAFORMAT_C_STRING, "text/plain",
    544                      (const char *) fcomment,
    545                      len)) )
    546       return 0; /* done */
    547     gzip_header_length += len + 1;
    548   }
    549   if (0 != (hdata[3] & 0x2)) /* FCHRC set */
    550     gzip_header_length += 2;
    551   memset (&cfs->strm, 0, sizeof (z_stream));
    552 
    553 #ifdef ZLIB_VERNUM
    554   /* zlib will take care of its header */
    555   gzip_header_length = 0;
    556 #endif
    557   cfs->gzip_header_length = gzip_header_length;
    558 
    559   if (cfs->gzip_header_length !=
    560       bfds_seek (cfs->bfds, cfs->gzip_header_length, SEEK_SET))
    561   {
    562     LOG ("Failed to seek to start to initialize gzip decompressor\n");
    563     return -1;
    564   }
    565   cfs->strm.avail_out = COM_CHUNK_SIZE;
    566   /*
    567    * note: maybe plain inflateInit(&strm) is adequate,
    568    * it looks more backward-compatible also ;
    569    *
    570    * ZLIB_VERNUM isn't defined by zlib version 1.1.4 ;
    571    * there might be a better check.
    572    */if (Z_OK != inflateInit2 (&cfs->strm,
    573 #ifdef ZLIB_VERNUM
    574                             15 + 32
    575 #else
    576                             -MAX_WBITS
    577 #endif
    578                             ))
    579   {
    580     LOG ("Failed to initialize zlib decompression\n");
    581     return -1;
    582   }
    583   return 1;
    584 }
    585 
    586 
    587 #endif
    588 
    589 
    590 #if HAVE_LIBBZ2
    591 /**
    592  * Initializes bz2-decompression object. Might report metadata about
    593  * compresse stream, if available. Resets the stream to the beginning.
    594  *
    595  * @param cfs cfs to initialize
    596  * @param proc callback for metadata
    597  * @param proc_cls callback cls
    598  * @return 1 on success, -1 on error
    599  */
    600 static int
    601 cfs_init_decompressor_bz2 (struct CompressedFileSource *cfs,
    602                            EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
    603 {
    604   if (0 !=
    605       bfds_seek (cfs->bfds, 0, SEEK_SET))
    606   {
    607     LOG ("Failed to seek to start to initialize BZ2 decompressor\n");
    608     return -1;
    609   }
    610   memset (&cfs->bstrm, 0, sizeof (bz_stream));
    611   if (BZ_OK !=
    612       BZ2_bzDecompressInit (&cfs->bstrm, 0, 0))
    613   {
    614     LOG ("Failed to initialize BZ2 decompressor\n");
    615     return -1;
    616   }
    617   cfs->bstrm.avail_out = COM_CHUNK_SIZE;
    618   return 1;
    619 }
    620 
    621 
    622 #endif
    623 
    624 
    625 /**
    626  * Initializes decompression object. Might report metadata about
    627  * compresse stream, if available. Resets the stream to the beginning.
    628  *
    629  * @param cfs cfs to initialize
    630  * @param proc callback for metadata
    631  * @param proc_cls callback cls
    632  * @return 1 on success, 0 to terminate extraction, -1 on error
    633  */
    634 static int
    635 cfs_init_decompressor (struct CompressedFileSource *cfs,
    636                        EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
    637 {
    638   cfs->result_pos = 0;
    639   cfs->fpos = 0;
    640   switch (cfs->compression_type)
    641   {
    642 #if HAVE_ZLIB
    643   case COMP_TYPE_ZLIB:
    644     return cfs_init_decompressor_zlib (cfs, proc, proc_cls);
    645 #endif
    646 #if HAVE_LIBBZ2
    647   case COMP_TYPE_BZ2:
    648     return cfs_init_decompressor_bz2 (cfs, proc, proc_cls);
    649 #endif
    650   default:
    651     LOG ("invalid compression type selected\n");
    652     return -1;
    653   }
    654 }
    655 
    656 
    657 #if HAVE_ZLIB
    658 /**
    659  * Deinitializes gz-decompression object.
    660  *
    661  * @param cfs cfs to deinitialize
    662  * @return 1 on success, -1 on error
    663  */
    664 static int
    665 cfs_deinit_decompressor_zlib (struct CompressedFileSource *cfs)
    666 {
    667   inflateEnd (&cfs->strm);
    668   return 1;
    669 }
    670 
    671 
    672 #endif
    673 
    674 
    675 #if HAVE_LIBBZ2
    676 /**
    677  * Deinitializes bz2-decompression object.
    678  *
    679  * @param cfs cfs to deinitialize
    680  * @return 1 on success, -1 on error
    681  */
    682 static int
    683 cfs_deinit_decompressor_bz2 (struct CompressedFileSource *cfs)
    684 {
    685   BZ2_bzDecompressEnd (&cfs->bstrm);
    686   return 1;
    687 }
    688 
    689 
    690 #endif
    691 
    692 
    693 /**
    694  * Deinitializes decompression object.
    695  *
    696  * @param cfs cfs to deinitialize
    697  * @return 1 on success, -1 on error
    698  */
    699 static int
    700 cfs_deinit_decompressor (struct CompressedFileSource *cfs)
    701 {
    702   switch (cfs->compression_type)
    703   {
    704 #if HAVE_ZLIB
    705   case COMP_TYPE_ZLIB:
    706     return cfs_deinit_decompressor_zlib (cfs);
    707 #endif
    708 #if HAVE_LIBBZ2
    709   case COMP_TYPE_BZ2:
    710     return cfs_deinit_decompressor_bz2 (cfs);
    711 #endif
    712   default:
    713     LOG ("invalid compression type selected\n");
    714     return -1;
    715   }
    716 }
    717 
    718 
    719 /**
    720  * Resets the compression stream to begin uncompressing
    721  * from the beginning. Used at initialization time, and when
    722  * seeking backward.
    723  *
    724  * @param cfs cfs to reset
    725  * @return 1 on success, 0 to terminate extraction,
    726  *        -1 on error
    727  */
    728 static int
    729 cfs_reset_stream (struct CompressedFileSource *cfs)
    730 {
    731   if (-1 == cfs_deinit_decompressor (cfs))
    732     return -1;
    733   return cfs_init_decompressor (cfs, NULL, NULL);
    734 }
    735 
    736 
    737 /**
    738  * Destroy compressed file source.
    739  *
    740  * @param cfs source to destroy
    741  */
    742 static void
    743 cfs_destroy (struct CompressedFileSource *cfs)
    744 {
    745   cfs_deinit_decompressor (cfs);
    746   free (cfs);
    747 }
    748 
    749 
    750 /**
    751  * Allocates and initializes new cfs object.
    752  *
    753  * @param bfds data source to use
    754  * @param fsize size of the source
    755  * @param compression_type type of compression used
    756  * @param proc metadata callback to call with meta data found upon opening
    757  * @param proc_cls callback cls
    758  * @return newly allocated cfs on success, NULL on error
    759  */
    760 struct CompressedFileSource *
    761 cfs_new (struct BufferedFileDataSource *bfds,
    762          int64_t fsize,
    763          enum ExtractorCompressionType compression_type,
    764          EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
    765 {
    766   struct CompressedFileSource *cfs;
    767 
    768   if (NULL == (cfs = malloc (sizeof (struct CompressedFileSource))))
    769   {
    770     LOG_STRERROR ("malloc");
    771     return NULL;
    772   }
    773   memset (cfs, 0, sizeof (struct CompressedFileSource));
    774   cfs->compression_type = compression_type;
    775   cfs->bfds = bfds;
    776   cfs->fsize = fsize;
    777   cfs->uncompressed_size = -1;
    778   if (1 != cfs_init_decompressor (cfs,
    779                                   proc, proc_cls))
    780   {
    781     free (cfs);
    782     return NULL;
    783   }
    784   return cfs;
    785 }
    786 
    787 
    788 #if HAVE_ZLIB
    789 /**
    790  * Fills 'data' with new uncompressed data.  Does the actual
    791  * decompression. Will set uncompressed_size on the end of compressed
    792  * stream.
    793  *
    794  * @param cfds cfs to read from
    795  * @param data where to copy the data
    796  * @param size number of bytes available in data
    797  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
    798  */
    799 static ssize_t
    800 cfs_read_zlib (struct CompressedFileSource *cfs,
    801                void *data,
    802                size_t size)
    803 {
    804   char *dst = data;
    805   int ret;
    806   size_t rc;
    807   ssize_t in;
    808   unsigned char buf[COM_CHUNK_SIZE];
    809 
    810   if (cfs->fpos == cfs->uncompressed_size)
    811   {
    812     /* end of file */
    813     return 0;
    814   }
    815   rc = 0;
    816   if (COM_CHUNK_SIZE > cfs->strm.avail_out + cfs->result_pos)
    817   {
    818     /* got left-over decompressed data from previous round! */
    819     in = COM_CHUNK_SIZE - (cfs->strm.avail_out + cfs->result_pos);
    820     if (in > size)
    821       in = size;
    822     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
    823     cfs->fpos += in;
    824     cfs->result_pos += in;
    825     rc += in;
    826   }
    827   ret = Z_OK;
    828   while ( (rc < size) && (Z_STREAM_END != ret) )
    829   {
    830     /* read block from original data source */
    831     in = bfds_read (cfs->bfds,
    832                     buf, sizeof (buf));
    833     if (in < 0)
    834     {
    835       LOG ("unexpected EOF\n");
    836       return -1; /* unexpected EOF */
    837     }
    838     if (0 == in)
    839     {
    840       cfs->uncompressed_size = cfs->fpos;
    841       return rc;
    842     }
    843     cfs->strm.next_in = buf;
    844     cfs->strm.avail_in = (uInt) in;
    845     cfs->strm.next_out = (unsigned char *) cfs->result;
    846     cfs->strm.avail_out = COM_CHUNK_SIZE;
    847     cfs->result_pos = 0;
    848     ret = inflate (&cfs->strm, Z_SYNC_FLUSH);
    849     if ( (Z_OK != ret) && (Z_STREAM_END != ret) )
    850     {
    851       LOG ("unexpected gzip inflate error: %d\n", ret);
    852       return -1; /* unexpected error */
    853     }
    854     /* go backwards by the number of bytes left in the buffer */
    855     if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->strm.avail_in, SEEK_CUR))
    856     {
    857       LOG ("seek failed\n");
    858       return -1;
    859     }
    860     /* copy decompressed bytes to target buffer */
    861     in = COM_CHUNK_SIZE - cfs->strm.avail_out;
    862     if (in > size - rc)
    863     {
    864       if (Z_STREAM_END == ret)
    865       {
    866         cfs->uncompressed_size = cfs->fpos + in;
    867         ret = Z_OK;
    868       }
    869       in = size - rc;
    870     }
    871     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
    872     cfs->fpos += in;
    873     cfs->result_pos += in;
    874     rc += in;
    875   }
    876   if (Z_STREAM_END == ret)
    877   {
    878     cfs->uncompressed_size = cfs->fpos;
    879   }
    880   return rc;
    881 }
    882 
    883 
    884 #endif
    885 
    886 
    887 #if HAVE_LIBBZ2
    888 /**
    889  * Fills 'data' with new uncompressed data.  Does the actual
    890  * decompression. Will set uncompressed_size on the end of compressed
    891  * stream.
    892  *
    893  * @param cfds cfs to read from
    894  * @param data where to copy the data
    895  * @param size number of bytes available in data
    896  * @return number of bytes in data. 0 if no more data can be uncompressed, -1 on error
    897  */
    898 static ssize_t
    899 cfs_read_bz2 (struct CompressedFileSource *cfs,
    900               void *data,
    901               size_t size)
    902 {
    903   char *dst = data;
    904   int ret;
    905   size_t rc;
    906   ssize_t in;
    907   char buf[COM_CHUNK_SIZE];
    908 
    909   if (cfs->fpos == cfs->uncompressed_size)
    910   {
    911     /* end of file */
    912     return 0;
    913   }
    914   rc = 0;
    915   if (COM_CHUNK_SIZE > cfs->bstrm.avail_out + cfs->result_pos)
    916   {
    917     /* got left-over decompressed data from previous round! */
    918     in = COM_CHUNK_SIZE - (cfs->bstrm.avail_out + cfs->result_pos);
    919     if (in > size)
    920       in = size;
    921     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
    922     cfs->fpos += in;
    923     cfs->result_pos += in;
    924     rc += in;
    925   }
    926   ret = BZ_OK;
    927   while ( (rc < size) && (BZ_STREAM_END != ret) )
    928   {
    929     /* read block from original data source */
    930     in = bfds_read (cfs->bfds,
    931                     buf, sizeof (buf));
    932     if (in < 0)
    933     {
    934       LOG ("unexpected EOF\n");
    935       return -1; /* unexpected EOF */
    936     }
    937     if (0 == in)
    938     {
    939       cfs->uncompressed_size = cfs->fpos;
    940       return rc;
    941     }
    942     cfs->bstrm.next_in = buf;
    943     cfs->bstrm.avail_in = (unsigned int) in;
    944     cfs->bstrm.next_out = cfs->result;
    945     cfs->bstrm.avail_out = COM_CHUNK_SIZE;
    946     cfs->result_pos = 0;
    947     ret = BZ2_bzDecompress (&cfs->bstrm);
    948     if ( (BZ_OK != ret) && (BZ_STREAM_END != ret) )
    949     {
    950       LOG ("unexpected bzip2 decompress error: %d\n", ret);
    951       return -1; /* unexpected error */
    952     }
    953     /* go backwards by the number of bytes left in the buffer */
    954     if (-1 == bfds_seek (cfs->bfds, -(int64_t) cfs->bstrm.avail_in, SEEK_CUR))
    955     {
    956       LOG ("seek failed\n");
    957       return -1;
    958     }
    959     /* copy decompressed bytes to target buffer */
    960     in = COM_CHUNK_SIZE - cfs->bstrm.avail_out;
    961     if (in > size - rc)
    962     {
    963       if (BZ_STREAM_END == ret)
    964       {
    965         cfs->uncompressed_size = cfs->fpos + in;
    966         ret = BZ_OK;
    967       }
    968       in = size - rc;
    969     }
    970     memcpy (&dst[rc], &cfs->result[cfs->result_pos], in);
    971     cfs->fpos += in;
    972     cfs->result_pos += in;
    973     rc += in;
    974   }
    975   if (BZ_STREAM_END == ret)
    976   {
    977     cfs->uncompressed_size = cfs->fpos;
    978   }
    979   return rc;
    980 }
    981 
    982 
    983 #endif
    984 
    985 
    /**
     * Fills 'data' with new uncompressed data by handing the request to
     * the decompressor that matches the compression type of 'cfs'.  Only
     * the back ends enabled at build time (HAVE_ZLIB, HAVE_LIBBZ2) are
     * considered.  The back ends do the actual decompression and set
     * uncompressed_size at the end of the compressed stream.
     *
     * @param cfs compressed file source to read from
     * @param data where to copy the data
     * @param size number of bytes available in data
     * @return number of bytes in data. 0 if no more data can be
     *         uncompressed, -1 on error (including a compression type
     *         for which no back end is available)
     */
    static ssize_t
    cfs_read (struct CompressedFileSource *cfs,
              void *data,
              size_t size)
    {
    #if HAVE_ZLIB
      if (COMP_TYPE_ZLIB == cfs->compression_type)
        return cfs_read_zlib (cfs, data, size);
    #endif
    #if HAVE_LIBBZ2
      if (COMP_TYPE_BZ2 == cfs->compression_type)
        return cfs_read_bz2 (cfs, data, size);
    #endif
      /* no compiled-in back end handles this type */
      LOG ("invalid compression type selected\n");
      return -1;
    }
   1016 
   1017 
   1018 /**
   1019  * Moves the buffer to 'position' in uncompressed steam. If position
   1020  * requires seeking backwards beyond the boundaries of the buffer, resets the
   1021  * stream and repeats decompression from the beginning to 'position'.
   1022  *
   1023  * @param cfs cfs to seek on
   1024  * @param position new starting point for the buffer
   1025  * @param whence one of the seek constants (SEEK_CUR, SEEK_SET, SEEK_END)
   1026  * @return new absolute buffer position, -1 on error or EOS
   1027  */
   1028 static int64_t
   1029 cfs_seek (struct CompressedFileSource *cfs,
   1030           int64_t position,
   1031           int whence)
   1032 {
   1033   uint64_t nposition;
   1034   int64_t delta;
   1035 
   1036   switch (whence)
   1037   {
   1038   case SEEK_CUR:
   1039     if (cfs->fpos + position < 0)
   1040     {
   1041       /* underflow */
   1042       LOG ("Invalid seek operation\n");
   1043       return -1;
   1044     }
   1045     if ( (-1 != cfs->uncompressed_size) &&
   1046          (cfs->fpos + position > cfs->uncompressed_size) )
   1047     {
   1048       LOG ("Invalid seek operation\n");
   1049       return -1;
   1050     }
   1051     nposition = cfs->fpos + position;
   1052     break;
   1053   case SEEK_END:
   1054     ASSERT (-1 != cfs->uncompressed_size);
   1055     if (position > 0)
   1056     {
   1057       LOG ("Invalid seek operation\n");
   1058       return -1;
   1059     }
   1060     if (cfs->uncompressed_size < -position)
   1061     {
   1062       LOG ("Invalid seek operation\n");
   1063       return -1;
   1064     }
   1065     nposition = cfs->uncompressed_size + position;
   1066     break;
   1067   case SEEK_SET:
   1068     if (position < 0)
   1069     {
   1070       LOG ("Invalid seek operation\n");
   1071       return -1;
   1072     }
   1073     if ( (-1 != cfs->uncompressed_size) &&
   1074          (cfs->uncompressed_size < position) )
   1075     {
   1076       LOG ("Invalid seek operation\n");
   1077       return -1;
   1078     }
   1079     nposition = (uint64_t) position;
   1080     break;
   1081   default:
   1082     LOG ("Invalid seek operation\n");
   1083     return -1;
   1084   }
   1085   delta = nposition - cfs->fpos;
   1086   if (delta < 0)
   1087   {
   1088     if (cfs->result_pos >= -delta)
   1089     {
   1090       cfs->result_pos += delta;
   1091       cfs->fpos += delta;
   1092       delta = 0;
   1093     }
   1094     else
   1095     {
   1096       if (-1 == cfs_reset_stream (cfs))
   1097       {
   1098         LOG ("Failed to restart compressed stream for seek operation\n");
   1099         return -1;
   1100       }
   1101       delta = nposition;
   1102     }
   1103   }
   1104   while (delta > 0)
   1105   {
   1106     char buf[COM_CHUNK_SIZE];
   1107     size_t max;
   1108     int64_t ret;
   1109 
   1110     max = (sizeof (buf) > delta) ? delta : sizeof (buf);
   1111     ret = cfs_read (cfs, buf, max);
   1112     if (-1 == ret)
   1113     {
   1114       LOG ("Failed to read decompressed stream for seek operation\n");
   1115       return -1;
   1116     }
   1117     if (0 == ret)
   1118     {
   1119       LOG (
   1120         "Reached unexpected end of stream at %llu during seek operation to %llu (%d left)\n",
   1121         (unsigned long long) cfs->fpos,
   1122         (unsigned long long) nposition,
   1123         delta);
   1124       return -1;
   1125     }
   1126     ASSERT (ret <= delta);
   1127     delta -= ret;
   1128   }
   1129   return cfs->fpos;
   1130 }
   1131 
   1132 
   1133 /**
   1134  * Detect if we have compressed data on our hands.
   1135  *
   1136  * @param data pointer to a data buffer or NULL (in case fd is not -1)
   1137  * @param fd a file to read data from, or -1 (if data is not NULL)
   1138  * @param fsize size of data (if data is not NULL) or of file (if fd is not -1)
   1139  * @return -1 to indicate an error, 0 to indicate uncompressed data, or a type (> 0) of compression
   1140  */
   1141 static enum ExtractorCompressionType
   1142 get_compression_type (struct BufferedFileDataSource *bfds)
   1143 {
   1144   unsigned char read_data[3];
   1145 
   1146   if (0 != bfds_seek (bfds, 0, SEEK_SET))
   1147     return COMP_TYPE_INVALID;
   1148   if (sizeof (read_data) !=
   1149       bfds_read (bfds, read_data, sizeof (read_data)))
   1150     return COMP_TYPE_UNDEFINED;
   1151 
   1152 #if HAVE_ZLIB
   1153   if ( (bfds->fsize >= MIN_ZLIB_HEADER) &&
   1154        (read_data[0] == 0x1f) &&
   1155        (read_data[1] == 0x8b) &&
   1156        (read_data[2] == 0x08) )
   1157     return COMP_TYPE_ZLIB;
   1158 #endif
   1159 #if HAVE_LIBBZ2
   1160   if ( (bfds->fsize >= MIN_BZ2_HEADER) &&
   1161        (read_data[0] == 'B') &&
   1162        (read_data[1] == 'Z') &&
   1163        (read_data[2] == 'h'))
   1164     return COMP_TYPE_BZ2;
   1165 #endif
   1166   return COMP_TYPE_INVALID;
   1167 }
   1168 
   1169 
   /**
    * Handle to a datasource we can use for the plugins.
    */
   struct EXTRACTOR_Datasource
   {
     /** Underlying buffered data source. */
     struct BufferedFileDataSource *bfds;

     /** Compressed file source layered on 'bfds' (NULL if not applicable). */
     struct CompressedFileSource *cfs;

     /** Underlying file descriptor, -1 for none. */
     int fd;
   };
   1191 
   1192 
   1193 /**
   1194  * Create a datasource from a file on disk.
   1195  *
   1196  * @param filename name of the file on disk
   1197  * @param proc metadata callback to call with meta data found upon opening
   1198  * @param proc_cls callback cls
   1199  * @return handle to the datasource, NULL on error
   1200  */
   1201 struct EXTRACTOR_Datasource *
   1202 EXTRACTOR_datasource_create_from_file_ (const char *filename,
   1203                                         EXTRACTOR_MetaDataProcessor proc,
   1204                                         void *proc_cls)
   1205 {
   1206   struct BufferedFileDataSource *bfds;
   1207   struct EXTRACTOR_Datasource *ds;
   1208   enum ExtractorCompressionType ct;
   1209   int fd;
   1210   struct stat sb;
   1211   int64_t fsize;
   1212   int winmode = 0;
   1213 #if WINDOWS
   1214   winmode = O_BINARY;
   1215 #endif
   1216 
   1217   if (-1 == (fd = open (filename, O_RDONLY | O_LARGEFILE | winmode)))
   1218   {
   1219     LOG_STRERROR_FILE ("open", filename);
   1220     return NULL;
   1221   }
   1222   if ( (0 != fstat (fd, &sb)) ||
   1223        (S_ISDIR (sb.st_mode)) )
   1224   {
   1225     if (! S_ISDIR (sb.st_mode))
   1226       LOG_STRERROR_FILE ("fstat", filename);
   1227     else
   1228       LOG ("Skipping directory `%s'\n", filename);
   1229     (void) close (fd);
   1230     return NULL;
   1231   }
   1232   fsize = (int64_t) sb.st_size;
   1233   if (0 == fsize)
   1234   {
   1235     (void) close (fd);
   1236     return NULL;
   1237   }
   1238   bfds = bfds_new (NULL, fd, fsize);
   1239   if (NULL == bfds)
   1240   {
   1241     (void) close (fd);
   1242     return NULL;
   1243   }
   1244   if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
   1245   {
   1246     LOG_STRERROR ("malloc");
   1247     bfds_delete (bfds);
   1248     (void) close (fd);
   1249     return NULL;
   1250   }
   1251   ds->bfds = bfds;
   1252   ds->fd = fd;
   1253   ds->cfs = NULL;
   1254   ct = get_compression_type (bfds);
   1255   if ( (COMP_TYPE_ZLIB == ct) ||
   1256        (COMP_TYPE_BZ2 == ct) )
   1257   {
   1258     ds->cfs = cfs_new (bfds, fsize, ct, proc, proc_cls);
   1259     if (NULL == ds->cfs)
   1260     {
   1261       LOG ("Failed to initialize decompressor\n");
   1262       bfds_delete (bfds);
   1263       free (ds);
   1264       (void) close (fd);
   1265       return NULL;
   1266     }
   1267   }
   1268   return ds;
   1269 }
   1270 
   1271 
   1272 /**
   1273  * Create a datasource from a buffer in memory.
   1274  *
   1275  * @param buf data in memory
   1276  * @param size number of bytes in 'buf'
   1277  * @param proc metadata callback to call with meta data found upon opening
   1278  * @param proc_cls callback cls
   1279  * @return handle to the datasource
   1280  */
   1281 struct EXTRACTOR_Datasource *
   1282 EXTRACTOR_datasource_create_from_buffer_ (const char *buf,
   1283                                           size_t size,
   1284                                           EXTRACTOR_MetaDataProcessor proc,
   1285                                           void *proc_cls)
   1286 {
   1287   struct BufferedFileDataSource *bfds;
   1288   struct EXTRACTOR_Datasource *ds;
   1289   enum ExtractorCompressionType ct;
   1290 
   1291   if (0 == size)
   1292     return NULL;
   1293   if (NULL == (bfds = bfds_new (buf, -1, size)))
   1294   {
   1295     LOG ("Failed to initialize buffer data source\n");
   1296     return NULL;
   1297   }
   1298   if (NULL == (ds = malloc (sizeof (struct EXTRACTOR_Datasource))))
   1299   {
   1300     LOG_STRERROR ("malloc");
   1301     bfds_delete (bfds);
   1302     return NULL;
   1303   }
   1304   ds->bfds = bfds;
   1305   ds->fd = -1;
   1306   ds->cfs = NULL;
   1307   ct = get_compression_type (bfds);
   1308   if ( (COMP_TYPE_ZLIB == ct) ||
   1309        (COMP_TYPE_BZ2 == ct) )
   1310   {
   1311     ds->cfs = cfs_new (bfds, size, ct, proc, proc_cls);
   1312     if (NULL == ds->cfs)
   1313     {
   1314       LOG ("Failed to initialize decompressor\n");
   1315       bfds_delete (bfds);
   1316       free (ds);
   1317       return NULL;
   1318     }
   1319   }
   1320   return ds;
   1321 }
   1322 
   1323 
   1324 /**
   1325  * Destroy a data source.
   1326  *
   1327  * @param ds source to destroy
   1328  */
   1329 void
   1330 EXTRACTOR_datasource_destroy_ (struct EXTRACTOR_Datasource *ds)
   1331 {
   1332   if (NULL != ds->cfs)
   1333     cfs_destroy (ds->cfs);
   1334   bfds_delete (ds->bfds);
   1335   if (-1 != ds->fd)
   1336     (void) close (ds->fd);
   1337   free (ds);
   1338 }
   1339 
   1340 
   1341 /**
   1342  * Make 'size' bytes of data from the data source available at 'data'.
   1343  *
   1344  * @param cls must be a 'struct EXTRACTOR_Datasource'
   1345  * @param data where the data should be copied to
   1346  * @param size maximum number of bytes requested
   1347  * @return number of bytes now available in data (can be smaller than 'size'),
   1348  *         -1 on error
   1349  */
   1350 ssize_t
   1351 EXTRACTOR_datasource_read_ (void *cls,
   1352                             void *data,
   1353                             size_t size)
   1354 {
   1355   struct EXTRACTOR_Datasource *ds = cls;
   1356 
   1357   if (NULL != ds->cfs)
   1358     return cfs_read (ds->cfs, data, size);
   1359   return bfds_read (ds->bfds, data, size);
   1360 }
   1361 
   1362 
   1363 /**
   1364  * Seek in the datasource.  Use 'SEEK_CUR' for whence and 'pos' of 0 to
   1365  * obtain the current position in the file.
   1366  *
   1367  * @param cls must be a 'struct EXTRACTOR_Datasource'
   1368  * @param pos position to seek (see 'man lseek')
   1369  * @param whence how to see (absolute to start, relative, absolute to end)
   1370  * @return new absolute position, UINT64_MAX on error (i.e. desired position
   1371  *         does not exist)
   1372  */
   1373 int64_t
   1374 EXTRACTOR_datasource_seek_ (void *cls,
   1375                             int64_t pos,
   1376                             int whence)
   1377 {
   1378   struct EXTRACTOR_Datasource *ds = cls;
   1379 
   1380   if (NULL != ds->cfs)
   1381   {
   1382     if ( (SEEK_END == whence) &&
   1383          (-1 == ds->cfs->uncompressed_size) )
   1384     {
   1385       /* need to obtain uncompressed size */
   1386       (void) EXTRACTOR_datasource_get_size_ (ds, 1);
   1387       if (-1 == ds->cfs->uncompressed_size)
   1388         return -1;
   1389     }
   1390     return cfs_seek (ds->cfs, pos, whence);
   1391   }
   1392   return bfds_seek (ds->bfds, pos, whence);
   1393 }
   1394 
   1395 
   1396 /**
   1397  * Determine the overall size of the data source (after compression).
   1398  *
   1399  * @param cls must be a 'struct EXTRACTOR_Datasource'
   1400  * @param force force computing the size if it is unavailable
   1401  * @return overall file size, UINT64_MAX on error or unknown
   1402  */
   1403 int64_t
   1404 EXTRACTOR_datasource_get_size_ (void *cls,
   1405                                 int force)
   1406 {
   1407   struct EXTRACTOR_Datasource *ds = cls;
   1408   char buf[32 * 1024];
   1409   uint64_t pos;
   1410 
   1411   if (NULL != ds->cfs)
   1412   {
   1413     if ( (force) &&
   1414          (-1 == ds->cfs->uncompressed_size) )
   1415     {
   1416       pos = ds->cfs->fpos;
   1417       while ( (-1 == ds->cfs->uncompressed_size) &&
   1418               (-1 != cfs_read (ds->cfs, buf, sizeof (buf))) )
   1419         ;
   1420       if (-1 == cfs_seek (ds->cfs, pos, SEEK_SET))
   1421       {
   1422         LOG (
   1423           "Serious problem, I moved the buffer to determine the file size but could not restore it...\n");
   1424         return -1;
   1425       }
   1426       if (-1 == ds->cfs->uncompressed_size)
   1427         return -1;
   1428     }
   1429     return ds->cfs->uncompressed_size;
   1430   }
   1431   return ds->bfds->fsize;
   1432 }
   1433 
   1434 
   1435 /* end of extractor_datasource.c */