libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

extractor.c (19487B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 
     21 #include "platform.h"
     22 #include "extractor.h"
     23 #include <dirent.h>
     24 #include <sys/types.h>
     25 #include <signal.h>
     26 #include <ltdl.h>
     27 #include "extractor_datasource.h"
     28 #include "extractor_ipc.h"
     29 #include "extractor_logging.h"
     30 #include "extractor_plugpath.h"
     31 #include "extractor_plugins.h"
     32 
     33 
/**
 * Size (in bytes) used for the shared memory segment.  The same value
 * is used as the size of the IO buffer for in-process plugins.
 */
#define DEFAULT_SHM_SIZE (16 * 1024)
     38 
     39 
/**
 * Closure for #process_plugin_reply().
 */
struct PluginReplyProcessor
{
  /**
   * Function to call if we receive meta data from the plugin.
   * A non-zero return value is treated as a request to stop
   * processing the current file.
   */
  EXTRACTOR_MetaDataProcessor proc;

  /**
   * Closure for @e proc.
   */
  void *proc_cls;

  /**
   * Are we done with processing this file? 0 to continue, 1 to terminate.
   * Set to 1 by #process_plugin_reply() once @e proc returned non-zero.
   */
  int file_finished;

};
     61 
     62 
     63 /**
     64  * Send an 'update' message to the plugin.
     65  *
     66  * @param plugin plugin to notify
     67  * @param shm_off new offset for the SHM
     68  * @param data_available number of bytes available in shm
     69  * @param ds datastore backend we are using
     70  */
     71 static void
     72 send_update_message (struct EXTRACTOR_PluginList *plugin,
     73                      int64_t shm_off,
     74                      size_t data_available,
     75                      struct EXTRACTOR_Datasource *ds)
     76 {
     77   struct UpdateMessage um;
     78 
     79   um.opcode = MESSAGE_UPDATED_SHM;
     80   um.reserved = 0;
     81   um.reserved2 = 0;
     82   um.shm_ready_bytes = (uint32_t) data_available;
     83   um.shm_off = (uint64_t) shm_off;
     84   um.file_size = EXTRACTOR_datasource_get_size_ (ds, 0);
     85   if (sizeof (um) !=
     86       EXTRACTOR_IPC_channel_send_ (plugin->channel,
     87                                    &um,
     88                                    sizeof (um)) )
     89   {
     90     LOG ("Failed to send UPDATED_SHM message to plugin\n");
     91     EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
     92     plugin->channel = NULL;
     93     plugin->round_finished = 1;
     94   }
     95 }
     96 
     97 
     98 /**
     99  * Send a 'discard state' message to the plugin and mark it as finished
    100  * for this round.
    101  *
    102  * @param plugin plugin to notify
    103  */
    104 static void
    105 send_discard_message (struct EXTRACTOR_PluginList *plugin)
    106 {
    107   static unsigned char disc_msg = MESSAGE_DISCARD_STATE;
    108 
    109   if (sizeof (disc_msg) !=
    110       EXTRACTOR_IPC_channel_send_ (plugin->channel,
    111                                    &disc_msg,
    112                                    sizeof (disc_msg)) )
    113   {
    114     LOG ("Failed to send DISCARD_STATE message to plugin\n");
    115     EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
    116     plugin->channel = NULL;
    117     plugin->round_finished = 1;
    118   }
    119 }
    120 
    121 
    122 /**
    123  * We had some serious trouble.  Abort all channels.
    124  *
    125  * @param plugins list of plugins with channels to abort
    126  */
    127 static void
    128 abort_all_channels (struct EXTRACTOR_PluginList *plugins)
    129 {
    130   struct EXTRACTOR_PluginList *pos;
    131 
    132   for (pos = plugins; NULL != pos; pos = pos->next)
    133   {
    134     if (NULL == pos->channel)
    135       continue;
    136     EXTRACTOR_IPC_channel_destroy_ (pos->channel);
    137     pos->channel = NULL;
    138   }
    139 }
    140 
    141 
    142 /**
    143  * Handler for a message from one of the plugins.
    144  *
    145  * @param cls closure with our 'struct PluginReplyProcessor'
    146  * @param plugin plugin of the channel sending the message
    147  * @param meta_type type of the meta data
    148  * @param meta_format format of the meta data
    149  * @param mime mime string send from the plugin
    150  * @param value 'data' send from the plugin
    151  * @param value_len number of bytes in 'value'
    152  */
    153 static void
    154 process_plugin_reply (void *cls,
    155                       struct EXTRACTOR_PluginList *plugin,
    156                       enum EXTRACTOR_MetaType meta_type,
    157                       enum EXTRACTOR_MetaFormat meta_format,
    158                       const char *mime,
    159                       const void *value,
    160                       size_t value_len)
    161 {
    162   static unsigned char cont_msg = MESSAGE_CONTINUE_EXTRACTING;
    163   struct PluginReplyProcessor *prp = cls;
    164 
    165   if (0 != prp->file_finished)
    166   {
    167     /* client already aborted, ignore message, tell plugin about abort */
    168     return;
    169   }
    170   if (0 != prp->proc (prp->proc_cls,
    171                       plugin->short_libname,
    172                       meta_type,
    173                       meta_format,
    174                       mime,
    175                       value,
    176                       value_len))
    177   {
    178     prp->file_finished = 1;
    179 #if DEBUG
    180     fprintf (stderr, "Sending ABRT\n");
    181 #endif
    182     send_discard_message (plugin);
    183     return;
    184   }
    185   if (sizeof (cont_msg) !=
    186       EXTRACTOR_IPC_channel_send_ (plugin->channel,
    187                                    &cont_msg,
    188                                    sizeof (cont_msg)) )
    189   {
    190     LOG ("Failed to send CONTINUE_EXTRACTING message to plugin\n");
    191     EXTRACTOR_IPC_channel_destroy_ (plugin->channel);
    192     plugin->channel = NULL;
    193     plugin->round_finished = 1;
    194   }
    195 }
    196 
    197 
/**
 * Closure for the in-process callbacks (read, seek, get_size and proc)
 * that are handed to plugins running inside of our own process.
 */
struct InProcessContext
{
  /**
   * Current plugin.
   */
  struct EXTRACTOR_PluginList *plugin;

  /**
   * Data source to use.
   */
  struct EXTRACTOR_Datasource *ds;

  /**
   * Function to call with meta data.
   */
  EXTRACTOR_MetaDataProcessor proc;

  /**
   * Closure for @e proc.
   */
  void *proc_cls;

  /**
   * IO buffer.  Each read request from a plugin is served from this
   * buffer, so at most DEFAULT_SHM_SIZE bytes are returned per read.
   */
  char buf[DEFAULT_SHM_SIZE];

  /**
   * 0 to continue extracting, 1 if we are finished
   */
  int finished;
};
    233 
    234 
    235 /**
    236  * Obtain a pointer to up to @a size bytes of data from the file to process.
    237  * Callback used for in-process plugins.
    238  *
    239  * @param cls a `struct InProcessContext`
    240  * @param data pointer to set to the file data, set to NULL on error
    241  * @param size maximum number of bytes requested
    242  * @return number of bytes now available in data (can be smaller than @a size),
    243  *         -1 on error
    244  */
    245 static ssize_t
    246 in_process_read (void *cls,
    247                  void **data,
    248                  size_t size)
    249 {
    250   struct InProcessContext *ctx = cls;
    251   ssize_t ret;
    252   size_t bsize;
    253 
    254   bsize = sizeof (ctx->buf);
    255   if (size < bsize)
    256     bsize = size;
    257   ret = EXTRACTOR_datasource_read_ (ctx->ds,
    258                                     ctx->buf,
    259                                     bsize);
    260   if (-1 == ret)
    261     *data = NULL;
    262   else
    263     *data = ctx->buf;
    264   return ret;
    265 }
    266 
    267 
    268 /**
    269  * Seek in the file.  Use 'SEEK_CUR' for @a whence and @a pos of 0 to
    270  * obtain the current position in the file.
    271  * Callback used for in-process plugins.
    272  *
    273  * @param cls a 'struct InProcessContext'
    274  * @param pos position to seek (see 'man lseek')
    275  * @param whence how to see (absolute to start, relative, absolute to end)
    276  * @return new absolute position, -1 on error (i.e. desired position
    277  *         does not exist)
    278  */
    279 static int64_t
    280 in_process_seek (void *cls,
    281                  int64_t pos,
    282                  int whence)
    283 {
    284   struct InProcessContext *ctx = cls;
    285 
    286   return EXTRACTOR_datasource_seek_ (ctx->ds,
    287                                      pos,
    288                                      whence);
    289 }
    290 
    291 
    292 /**
    293  * Determine the overall size of the file.
    294  * Callback used for in-process plugins.
    295  *
    296  * @param cls a `struct InProcessContext`
    297  * @return overall file size, UINT64_MAX on error (i.e. IPC failure)
    298  */
    299 static uint64_t
    300 in_process_get_size (void *cls)
    301 {
    302   struct InProcessContext *ctx = cls;
    303 
    304   return (uint64_t) EXTRACTOR_datasource_get_size_ (ctx->ds, 0);
    305 }
    306 
    307 
    308 /**
    309  * Type of a function that libextractor calls for each
    310  * meta data item found.
    311  * Callback used for in-process plugins.
    312  *
    313  * @param cls a 'struct InProcessContext'
    314  * @param plugin_name name of the plugin that produced this value;
    315  *        special values can be used (i.e. '&lt;zlib&gt;' for zlib being
    316  *        used in the main libextractor library and yielding
    317  *        meta data).
    318  * @param type libextractor-type describing the meta data
    319  * @param format basic format information about data
    320  * @param data_mime_type mime-type of data (not of the original file);
    321  *        can be NULL (if mime-type is not known)
    322  * @param data actual meta-data found
    323  * @param data_len number of bytes in data
    324  * @return 0 to continue extracting, 1 to abort
    325  */
    326 static int
    327 in_process_proc (void *cls,
    328                  const char *plugin_name,
    329                  enum EXTRACTOR_MetaType type,
    330                  enum EXTRACTOR_MetaFormat format,
    331                  const char *data_mime_type,
    332                  const char *data,
    333                  size_t data_len)
    334 {
    335   struct InProcessContext *ctx = cls;
    336   int ret;
    337 
    338   if (0 != ctx->finished)
    339     return 1;
    340   ret = ctx->proc (ctx->proc_cls,
    341                    plugin_name,
    342                    type,
    343                    format,
    344                    data_mime_type,
    345                    data,
    346                    data_len);
    347   if (0 != ret)
    348     ctx->finished = 1;
    349   return ret;
    350 }
    351 
    352 
    353 /**
    354  * Extract keywords using the given set of plugins.
    355  *
    356  * @param plugins the list of plugins to use
    357  * @param shm shared memory object used by the plugins (NULL if
    358  *        all plugins are in-process)
    359  * @param ds data to process
    360  * @param proc function to call for each meta data item found
    361  * @param proc_cls cls argument to @a proc
    362  */
    363 static void
    364 do_extract (struct EXTRACTOR_PluginList *plugins,
    365             struct EXTRACTOR_SharedMemory *shm,
    366             struct EXTRACTOR_Datasource *ds,
    367             EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
    368 {
    369   unsigned int plugin_count;
    370   unsigned int plugin_off;
    371   struct EXTRACTOR_PluginList *pos;
    372   struct StartMessage start;
    373   struct EXTRACTOR_Channel *channel;
    374   struct PluginReplyProcessor prp;
    375   struct InProcessContext ctx;
    376   struct EXTRACTOR_ExtractContext ec;
    377   int64_t min_seek;
    378   int64_t end;
    379   ssize_t data_available;
    380   ssize_t ready;
    381   int done;
    382   int have_in_memory;
    383 
    384   plugin_count = 0;
    385   for (pos = plugins; NULL != pos; pos = pos->next)
    386     plugin_count++;
    387   if (NULL != shm)
    388     ready = EXTRACTOR_IPC_shared_memory_set_ (shm,
    389                                               ds,
    390                                               0,
    391                                               DEFAULT_SHM_SIZE);
    392   else
    393     ready = 0;
    394   if (-1 == ready)
    395     return; /* failed to ready _any_ data!? */
    396   have_in_memory = 0;
    397   prp.file_finished = 0;
    398   prp.proc = proc;
    399   prp.proc_cls = proc_cls;
    400 
    401   /* send 'start' message */
    402   start.opcode = MESSAGE_EXTRACT_START;
    403   start.reserved = 0;
    404   start.reserved2 = 0;
    405   start.shm_ready_bytes = (uint32_t) ready;
    406   start.file_size = EXTRACTOR_datasource_get_size_ (ds, 0);
    407   for (pos = plugins; NULL != pos; pos = pos->next)
    408   {
    409     if (EXTRACTOR_OPTION_IN_PROCESS == pos->flags)
    410       have_in_memory = 1;
    411     if ( (NULL != pos->channel) &&
    412          (-1 == EXTRACTOR_IPC_channel_send_ (pos->channel,
    413                                              &start,
    414                                              sizeof (start)) ) )
    415     {
    416       LOG ("Failed to send EXTRACT_START message to plugin\n");
    417       EXTRACTOR_IPC_channel_destroy_ (pos->channel);
    418       pos->channel = NULL;
    419     }
    420   }
    421   done = 0;
    422   while (! done)
    423   {
    424     struct EXTRACTOR_Channel *channels[plugin_count];
    425 
    426     /* calculate current 'channels' array */
    427     plugin_off = 0;
    428     for (pos = plugins; NULL != pos; pos = pos->next)
    429     {
    430       if (-1 == pos->seek_request)
    431       {
    432         /* channel is not seeking, must be running or done */
    433         channels[plugin_off] = pos->channel;
    434       }
    435       else
    436       {
    437         /* not running this round, seeking! */
    438         channels[plugin_off] = NULL;
    439       }
    440       plugin_off++;
    441     }
    442     /* give plugins chance to send us meta data, seek or finished messages */
    443     if (-1 ==
    444         EXTRACTOR_IPC_channel_recv_ (channels,
    445                                      plugin_count,
    446                                      &process_plugin_reply,
    447                                      &prp))
    448     {
    449       /* serious problem in IPC; reset *all* channels */
    450       LOG ("Failed to receive message from channels; full reset\n");
    451       abort_all_channels (plugins);
    452       break;
    453     }
    454 
    455     /* calculate minimum seek request (or set done=0 to continue here) */
    456     done = 1;
    457     min_seek = -1;
    458     plugin_off = 0;
    459     for (pos = plugins; NULL != pos; pos = pos->next)
    460     {
    461       plugin_off++;
    462       if ( (1 == pos->round_finished) ||
    463            (NULL == pos->channel) )
    464       {
    465         continue;     /* inactive plugin */
    466       }
    467       if (-1 == pos->seek_request)
    468       {
    469         /* possibly more meta data at current position, at least
    470      this plugin is still working on it... */
    471         done = 0;
    472         break;
    473       }
    474       if (-1 != pos->seek_request)
    475       {
    476         if (SEEK_END == pos->seek_whence)
    477         {
    478           /* convert distance from end to absolute position */
    479           pos->seek_whence = 0;
    480           end = EXTRACTOR_datasource_get_size_ (ds, 1);
    481           if (pos->seek_request > end)
    482           {
    483             LOG ("Cannot seek to before the beginning of the file!\n");
    484             pos->seek_request = 0;
    485           }
    486           else
    487           {
    488             pos->seek_request = end - pos->seek_request;
    489           }
    490         }
    491         if ( (-1 == min_seek) ||
    492              (min_seek > pos->seek_request) )
    493         {
    494           min_seek = pos->seek_request;
    495         }
    496       }
    497     }
    498     data_available = -1;
    499     if ( (1 == done) &&
    500          (-1 != min_seek) &&
    501          (NULL != shm) )
    502     {
    503       /* current position done, but seek requested */
    504       done = 0;
    505       if (-1 ==
    506           (data_available = EXTRACTOR_IPC_shared_memory_set_ (shm,
    507                                                               ds,
    508                                                               min_seek,
    509                                                               DEFAULT_SHM_SIZE)))
    510       {
    511         LOG ("Failed to seek; full reset\n");
    512         abort_all_channels (plugins);
    513         break;
    514       }
    515     }
    516     /* if 'prp.file_finished', send 'abort' to plugins;
    517        if not, send 'seek' notification to plugins in range */
    518     for (pos = plugins; NULL != pos; pos = pos->next)
    519     {
    520       if (NULL == (channel = pos->channel))
    521       {
    522         /* Skipping plugin: channel down */
    523         continue;
    524       }
    525       if ( (-1 != pos->seek_request) &&
    526            (1 == prp.file_finished) )
    527       {
    528         send_discard_message (pos);
    529         pos->round_finished = 1;
    530         pos->seek_request = -1;
    531       }
    532       if ( (-1 != data_available) &&
    533            (-1 != pos->seek_request) &&
    534            (min_seek <= pos->seek_request) &&
    535            ( (min_seek + data_available > pos->seek_request) ||
    536              (min_seek == EXTRACTOR_datasource_get_size_ (ds, 0))) )
    537       {
    538         /* Notify plugin about seek to 'min_seek' */
    539         send_update_message (pos,
    540                              min_seek,
    541                              data_available,
    542                              ds);
    543         pos->seek_request = -1;
    544       }
    545       if (0 == pos->round_finished)
    546         done = 0; /* can't be done, plugin still active */
    547     }
    548   }
    549 
    550   if (0 == have_in_memory)
    551     return;
    552   /* run in-process plugins */
    553   ctx.finished = 0;
    554   ctx.ds = ds;
    555   ctx.proc = proc;
    556   ctx.proc_cls = proc_cls;
    557   ec.cls = &ctx;
    558   ec.read = &in_process_read;
    559   ec.seek = &in_process_seek;
    560   ec.get_size = &in_process_get_size;
    561   ec.proc = &in_process_proc;
    562   for (pos = plugins; NULL != pos; pos = pos->next)
    563   {
    564     if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags)
    565       continue;
    566     if (-1 == EXTRACTOR_plugin_load_ (pos))
    567       continue;
    568     ctx.plugin = pos;
    569     ec.config = pos->plugin_options;
    570     if (-1 == EXTRACTOR_datasource_seek_ (ds, 0, SEEK_SET))
    571     {
    572       LOG ("Failed to seek to 0 for in-memory plugins\n");
    573       return;
    574     }
    575     pos->extract_method (&ec);
    576     if (1 == ctx.finished)
    577       break;
    578   }
    579 }
    580 
    581 
    582 /**
    583  * Extract keywords from a file using the given set of plugins.
    584  * If needed, opens the file and loads its data (via mmap).  Then
    585  * decompresses it if the data is compressed.  Finally runs the
    586  * plugins on the (now possibly decompressed) data.
    587  *
    588  * @param plugins the list of plugins to use
    589  * @param filename the name of the file, can be NULL if data is not NULL
    590  * @param data data of the file in memory, can be NULL (in which
    591  *        case libextractor will open file) if filename is not NULL
    592  * @param size number of bytes in data, ignored if data is NULL
    593  * @param proc function to call for each meta data item found
    594  * @param proc_cls cls argument to @a proc
    595  */
    596 void
    597 EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins,
    598                    const char *filename,
    599                    const void *data,
    600                    size_t size,
    601                    EXTRACTOR_MetaDataProcessor proc,
    602                    void *proc_cls)
    603 {
    604   struct EXTRACTOR_Datasource *datasource;
    605   struct EXTRACTOR_SharedMemory *shm;
    606   struct EXTRACTOR_PluginList *pos;
    607   int have_oop;
    608 
    609   if (NULL == plugins)
    610     return;
    611   if (NULL == filename)
    612     datasource = EXTRACTOR_datasource_create_from_buffer_ (data, size,
    613                                                            proc, proc_cls);
    614   else
    615     datasource = EXTRACTOR_datasource_create_from_file_ (filename,
    616                                                          proc, proc_cls);
    617   if (NULL == datasource)
    618     return;
    619   shm = NULL;
    620   have_oop = 0;
    621   for (pos = plugins; NULL != pos; pos = pos->next)
    622   {
    623     if (NULL == shm)
    624       shm = pos->shm;
    625     if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags)
    626       have_oop = 1;
    627     pos->round_finished = 0;
    628   }
    629   if ( (NULL == shm) &&
    630        (1 == have_oop) )
    631   {
    632     /* need to create shared memory segment */
    633     shm = EXTRACTOR_IPC_shared_memory_create_ (DEFAULT_SHM_SIZE);
    634     if (NULL == shm)
    635     {
    636       LOG ("Failed to setup IPC\n");
    637       EXTRACTOR_datasource_destroy_ (datasource);
    638       return;
    639     }
    640   }
    641   for (pos = plugins; NULL != pos; pos = pos->next)
    642     if ( (NULL == pos->channel) &&
    643          (NULL != shm) &&
    644          (EXTRACTOR_OPTION_IN_PROCESS != pos->flags) )
    645     {
    646       if (NULL == pos->shm)
    647       {
    648         pos->shm = shm;
    649         (void) EXTRACTOR_IPC_shared_memory_change_rc_ (shm, 1);
    650       }
    651       pos->channel = EXTRACTOR_IPC_channel_create_ (pos,
    652                                                     shm);
    653     }
    654   do_extract (plugins,
    655               shm,
    656               datasource,
    657               proc,
    658               proc_cls);
    659   EXTRACTOR_datasource_destroy_ (datasource);
    660 }
    661 
    662 
    663 /**
    664  * Initialize gettext and libltdl (and W32 if needed).
    665  */
    666 void __attribute__ ((constructor))
    667 EXTRACTOR_ltdl_init ()
    668 {
    669   int err;
    670 
    671 #if ENABLE_NLS
    672   bindtextdomain (PACKAGE, LOCALEDIR);
    673 #endif
    674   err = lt_dlinit ();
    675   if (err > 0)
    676   {
    677 #if DEBUG
    678     fprintf (stderr,
    679              _ ("Initialization of plugin mechanism failed: %s!\n"),
    680              lt_dlerror ());
    681 #endif
    682     return;
    683   }
    684 #if WINDOWS
    685   plibc_init_utf8 ("GNU", PACKAGE, 1);
    686   plibc_set_stat_size_size (sizeof (((struct stat *) 0)->st_size));
    687   plibc_set_stat_time_size (sizeof (((struct stat *) 0)->st_mtime));
    688 #endif
    689 }
    690 
    691 
/**
 * Deinit.  Counterpart to #EXTRACTOR_ltdl_init(), declared with the
 * 'destructor' attribute: shuts down plibc (on W32 only) and then
 * libltdl.
 */
void __attribute__ ((destructor))
EXTRACTOR_ltdl_fini ()
{
#if WINDOWS
  plibc_shutdown ();
#endif
  lt_dlexit ();
}
    703 
    704 
    705 /* end of extractor.c */