extractor.c (19487B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2004, 2005, 2006, 2009, 2012 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 */ 20 21 #include "platform.h" 22 #include "extractor.h" 23 #include <dirent.h> 24 #include <sys/types.h> 25 #include <signal.h> 26 #include <ltdl.h> 27 #include "extractor_datasource.h" 28 #include "extractor_ipc.h" 29 #include "extractor_logging.h" 30 #include "extractor_plugpath.h" 31 #include "extractor_plugins.h" 32 33 34 /** 35 * Size used for the shared memory segment. 36 */ 37 #define DEFAULT_SHM_SIZE (16 * 1024) 38 39 40 /** 41 * Closure for #process_plugin_reply() 42 */ 43 struct PluginReplyProcessor 44 { 45 /** 46 * Function to call if we receive meta data from the plugin. 47 */ 48 EXTRACTOR_MetaDataProcessor proc; 49 50 /** 51 * Closure for @e proc. 52 */ 53 void *proc_cls; 54 55 /** 56 * Are we done with processing this file? 0 to continue, 1 to terminate. 57 */ 58 int file_finished; 59 60 }; 61 62 63 /** 64 * Send an 'update' message to the plugin. 65 * 66 * @param plugin plugin to notify 67 * @param shm_off new offset for the SHM 68 * @param data_available number of bytes available in shm 69 * @param ds datastore backend we are using 70 */ 71 static void 72 send_update_message (struct EXTRACTOR_PluginList *plugin, 73 int64_t shm_off, 74 size_t data_available, 75 struct EXTRACTOR_Datasource *ds) 76 { 77 struct UpdateMessage um; 78 79 um.opcode = MESSAGE_UPDATED_SHM; 80 um.reserved = 0; 81 um.reserved2 = 0; 82 um.shm_ready_bytes = (uint32_t) data_available; 83 um.shm_off = (uint64_t) shm_off; 84 um.file_size = EXTRACTOR_datasource_get_size_ (ds, 0); 85 if (sizeof (um) != 86 EXTRACTOR_IPC_channel_send_ (plugin->channel, 87 &um, 88 sizeof (um)) ) 89 { 90 LOG ("Failed to send UPDATED_SHM message to plugin\n"); 91 EXTRACTOR_IPC_channel_destroy_ (plugin->channel); 92 plugin->channel = NULL; 93 plugin->round_finished = 1; 94 } 95 } 96 97 98 /** 99 * Send a 'discard state' message to the plugin and mark it as finished 100 * for this round. 101 * 102 * @param plugin plugin to notify 103 */ 104 static void 105 send_discard_message (struct EXTRACTOR_PluginList *plugin) 106 { 107 static unsigned char disc_msg = MESSAGE_DISCARD_STATE; 108 109 if (sizeof (disc_msg) != 110 EXTRACTOR_IPC_channel_send_ (plugin->channel, 111 &disc_msg, 112 sizeof (disc_msg)) ) 113 { 114 LOG ("Failed to send DISCARD_STATE message to plugin\n"); 115 EXTRACTOR_IPC_channel_destroy_ (plugin->channel); 116 plugin->channel = NULL; 117 plugin->round_finished = 1; 118 } 119 } 120 121 122 /** 123 * We had some serious trouble. Abort all channels. 124 * 125 * @param plugins list of plugins with channels to abort 126 */ 127 static void 128 abort_all_channels (struct EXTRACTOR_PluginList *plugins) 129 { 130 struct EXTRACTOR_PluginList *pos; 131 132 for (pos = plugins; NULL != pos; pos = pos->next) 133 { 134 if (NULL == pos->channel) 135 continue; 136 EXTRACTOR_IPC_channel_destroy_ (pos->channel); 137 pos->channel = NULL; 138 } 139 } 140 141 142 /** 143 * Handler for a message from one of the plugins. 144 * 145 * @param cls closure with our 'struct PluginReplyProcessor' 146 * @param plugin plugin of the channel sending the message 147 * @param meta_type type of the meta data 148 * @param meta_format format of the meta data 149 * @param mime mime string send from the plugin 150 * @param value 'data' send from the plugin 151 * @param value_len number of bytes in 'value' 152 */ 153 static void 154 process_plugin_reply (void *cls, 155 struct EXTRACTOR_PluginList *plugin, 156 enum EXTRACTOR_MetaType meta_type, 157 enum EXTRACTOR_MetaFormat meta_format, 158 const char *mime, 159 const void *value, 160 size_t value_len) 161 { 162 static unsigned char cont_msg = MESSAGE_CONTINUE_EXTRACTING; 163 struct PluginReplyProcessor *prp = cls; 164 165 if (0 != prp->file_finished) 166 { 167 /* client already aborted, ignore message, tell plugin about abort */ 168 return; 169 } 170 if (0 != prp->proc (prp->proc_cls, 171 plugin->short_libname, 172 meta_type, 173 meta_format, 174 mime, 175 value, 176 value_len)) 177 { 178 prp->file_finished = 1; 179 #if DEBUG 180 fprintf (stderr, "Sending ABRT\n"); 181 #endif 182 send_discard_message (plugin); 183 return; 184 } 185 if (sizeof (cont_msg) != 186 EXTRACTOR_IPC_channel_send_ (plugin->channel, 187 &cont_msg, 188 sizeof (cont_msg)) ) 189 { 190 LOG ("Failed to send CONTINUE_EXTRACTING message to plugin\n"); 191 EXTRACTOR_IPC_channel_destroy_ (plugin->channel); 192 plugin->channel = NULL; 193 plugin->round_finished = 1; 194 } 195 } 196 197 198 /** 199 * Closure for the in-process callbacks. 200 */ 201 struct InProcessContext 202 { 203 /** 204 * Current plugin. 205 */ 206 struct EXTRACTOR_PluginList *plugin; 207 208 /** 209 * Data source to use. 210 */ 211 struct EXTRACTOR_Datasource *ds; 212 213 /** 214 * Function to call with meta data. 215 */ 216 EXTRACTOR_MetaDataProcessor proc; 217 218 /** 219 * Closure for @e proc. 220 */ 221 void *proc_cls; 222 223 /** 224 * IO buffer. 225 */ 226 char buf[DEFAULT_SHM_SIZE]; 227 228 /** 229 * 0 to continue extracting, 1 if we are finished 230 */ 231 int finished; 232 }; 233 234 235 /** 236 * Obtain a pointer to up to @a size bytes of data from the file to process. 237 * Callback used for in-process plugins. 238 * 239 * @param cls a `struct InProcessContext` 240 * @param data pointer to set to the file data, set to NULL on error 241 * @param size maximum number of bytes requested 242 * @return number of bytes now available in data (can be smaller than @a size), 243 * -1 on error 244 */ 245 static ssize_t 246 in_process_read (void *cls, 247 void **data, 248 size_t size) 249 { 250 struct InProcessContext *ctx = cls; 251 ssize_t ret; 252 size_t bsize; 253 254 bsize = sizeof (ctx->buf); 255 if (size < bsize) 256 bsize = size; 257 ret = EXTRACTOR_datasource_read_ (ctx->ds, 258 ctx->buf, 259 bsize); 260 if (-1 == ret) 261 *data = NULL; 262 else 263 *data = ctx->buf; 264 return ret; 265 } 266 267 268 /** 269 * Seek in the file. Use 'SEEK_CUR' for @a whence and @a pos of 0 to 270 * obtain the current position in the file. 271 * Callback used for in-process plugins. 272 * 273 * @param cls a 'struct InProcessContext' 274 * @param pos position to seek (see 'man lseek') 275 * @param whence how to see (absolute to start, relative, absolute to end) 276 * @return new absolute position, -1 on error (i.e. desired position 277 * does not exist) 278 */ 279 static int64_t 280 in_process_seek (void *cls, 281 int64_t pos, 282 int whence) 283 { 284 struct InProcessContext *ctx = cls; 285 286 return EXTRACTOR_datasource_seek_ (ctx->ds, 287 pos, 288 whence); 289 } 290 291 292 /** 293 * Determine the overall size of the file. 294 * Callback used for in-process plugins. 295 * 296 * @param cls a `struct InProcessContext` 297 * @return overall file size, UINT64_MAX on error (i.e. IPC failure) 298 */ 299 static uint64_t 300 in_process_get_size (void *cls) 301 { 302 struct InProcessContext *ctx = cls; 303 304 return (uint64_t) EXTRACTOR_datasource_get_size_ (ctx->ds, 0); 305 } 306 307 308 /** 309 * Type of a function that libextractor calls for each 310 * meta data item found. 311 * Callback used for in-process plugins. 312 * 313 * @param cls a 'struct InProcessContext' 314 * @param plugin_name name of the plugin that produced this value; 315 * special values can be used (i.e. '<zlib>' for zlib being 316 * used in the main libextractor library and yielding 317 * meta data). 318 * @param type libextractor-type describing the meta data 319 * @param format basic format information about data 320 * @param data_mime_type mime-type of data (not of the original file); 321 * can be NULL (if mime-type is not known) 322 * @param data actual meta-data found 323 * @param data_len number of bytes in data 324 * @return 0 to continue extracting, 1 to abort 325 */ 326 static int 327 in_process_proc (void *cls, 328 const char *plugin_name, 329 enum EXTRACTOR_MetaType type, 330 enum EXTRACTOR_MetaFormat format, 331 const char *data_mime_type, 332 const char *data, 333 size_t data_len) 334 { 335 struct InProcessContext *ctx = cls; 336 int ret; 337 338 if (0 != ctx->finished) 339 return 1; 340 ret = ctx->proc (ctx->proc_cls, 341 plugin_name, 342 type, 343 format, 344 data_mime_type, 345 data, 346 data_len); 347 if (0 != ret) 348 ctx->finished = 1; 349 return ret; 350 } 351 352 353 /** 354 * Extract keywords using the given set of plugins. 355 * 356 * @param plugins the list of plugins to use 357 * @param shm shared memory object used by the plugins (NULL if 358 * all plugins are in-process) 359 * @param ds data to process 360 * @param proc function to call for each meta data item found 361 * @param proc_cls cls argument to @a proc 362 */ 363 static void 364 do_extract (struct EXTRACTOR_PluginList *plugins, 365 struct EXTRACTOR_SharedMemory *shm, 366 struct EXTRACTOR_Datasource *ds, 367 EXTRACTOR_MetaDataProcessor proc, void *proc_cls) 368 { 369 unsigned int plugin_count; 370 unsigned int plugin_off; 371 struct EXTRACTOR_PluginList *pos; 372 struct StartMessage start; 373 struct EXTRACTOR_Channel *channel; 374 struct PluginReplyProcessor prp; 375 struct InProcessContext ctx; 376 struct EXTRACTOR_ExtractContext ec; 377 int64_t min_seek; 378 int64_t end; 379 ssize_t data_available; 380 ssize_t ready; 381 int done; 382 int have_in_memory; 383 384 plugin_count = 0; 385 for (pos = plugins; NULL != pos; pos = pos->next) 386 plugin_count++; 387 if (NULL != shm) 388 ready = EXTRACTOR_IPC_shared_memory_set_ (shm, 389 ds, 390 0, 391 DEFAULT_SHM_SIZE); 392 else 393 ready = 0; 394 if (-1 == ready) 395 return; /* failed to ready _any_ data!? */ 396 have_in_memory = 0; 397 prp.file_finished = 0; 398 prp.proc = proc; 399 prp.proc_cls = proc_cls; 400 401 /* send 'start' message */ 402 start.opcode = MESSAGE_EXTRACT_START; 403 start.reserved = 0; 404 start.reserved2 = 0; 405 start.shm_ready_bytes = (uint32_t) ready; 406 start.file_size = EXTRACTOR_datasource_get_size_ (ds, 0); 407 for (pos = plugins; NULL != pos; pos = pos->next) 408 { 409 if (EXTRACTOR_OPTION_IN_PROCESS == pos->flags) 410 have_in_memory = 1; 411 if ( (NULL != pos->channel) && 412 (-1 == EXTRACTOR_IPC_channel_send_ (pos->channel, 413 &start, 414 sizeof (start)) ) ) 415 { 416 LOG ("Failed to send EXTRACT_START message to plugin\n"); 417 EXTRACTOR_IPC_channel_destroy_ (pos->channel); 418 pos->channel = NULL; 419 } 420 } 421 done = 0; 422 while (! done) 423 { 424 struct EXTRACTOR_Channel *channels[plugin_count]; 425 426 /* calculate current 'channels' array */ 427 plugin_off = 0; 428 for (pos = plugins; NULL != pos; pos = pos->next) 429 { 430 if (-1 == pos->seek_request) 431 { 432 /* channel is not seeking, must be running or done */ 433 channels[plugin_off] = pos->channel; 434 } 435 else 436 { 437 /* not running this round, seeking! */ 438 channels[plugin_off] = NULL; 439 } 440 plugin_off++; 441 } 442 /* give plugins chance to send us meta data, seek or finished messages */ 443 if (-1 == 444 EXTRACTOR_IPC_channel_recv_ (channels, 445 plugin_count, 446 &process_plugin_reply, 447 &prp)) 448 { 449 /* serious problem in IPC; reset *all* channels */ 450 LOG ("Failed to receive message from channels; full reset\n"); 451 abort_all_channels (plugins); 452 break; 453 } 454 455 /* calculate minimum seek request (or set done=0 to continue here) */ 456 done = 1; 457 min_seek = -1; 458 plugin_off = 0; 459 for (pos = plugins; NULL != pos; pos = pos->next) 460 { 461 plugin_off++; 462 if ( (1 == pos->round_finished) || 463 (NULL == pos->channel) ) 464 { 465 continue; /* inactive plugin */ 466 } 467 if (-1 == pos->seek_request) 468 { 469 /* possibly more meta data at current position, at least 470 this plugin is still working on it... */ 471 done = 0; 472 break; 473 } 474 if (-1 != pos->seek_request) 475 { 476 if (SEEK_END == pos->seek_whence) 477 { 478 /* convert distance from end to absolute position */ 479 pos->seek_whence = 0; 480 end = EXTRACTOR_datasource_get_size_ (ds, 1); 481 if (pos->seek_request > end) 482 { 483 LOG ("Cannot seek to before the beginning of the file!\n"); 484 pos->seek_request = 0; 485 } 486 else 487 { 488 pos->seek_request = end - pos->seek_request; 489 } 490 } 491 if ( (-1 == min_seek) || 492 (min_seek > pos->seek_request) ) 493 { 494 min_seek = pos->seek_request; 495 } 496 } 497 } 498 data_available = -1; 499 if ( (1 == done) && 500 (-1 != min_seek) && 501 (NULL != shm) ) 502 { 503 /* current position done, but seek requested */ 504 done = 0; 505 if (-1 == 506 (data_available = EXTRACTOR_IPC_shared_memory_set_ (shm, 507 ds, 508 min_seek, 509 DEFAULT_SHM_SIZE))) 510 { 511 LOG ("Failed to seek; full reset\n"); 512 abort_all_channels (plugins); 513 break; 514 } 515 } 516 /* if 'prp.file_finished', send 'abort' to plugins; 517 if not, send 'seek' notification to plugins in range */ 518 for (pos = plugins; NULL != pos; pos = pos->next) 519 { 520 if (NULL == (channel = pos->channel)) 521 { 522 /* Skipping plugin: channel down */ 523 continue; 524 } 525 if ( (-1 != pos->seek_request) && 526 (1 == prp.file_finished) ) 527 { 528 send_discard_message (pos); 529 pos->round_finished = 1; 530 pos->seek_request = -1; 531 } 532 if ( (-1 != data_available) && 533 (-1 != pos->seek_request) && 534 (min_seek <= pos->seek_request) && 535 ( (min_seek + data_available > pos->seek_request) || 536 (min_seek == EXTRACTOR_datasource_get_size_ (ds, 0))) ) 537 { 538 /* Notify plugin about seek to 'min_seek' */ 539 send_update_message (pos, 540 min_seek, 541 data_available, 542 ds); 543 pos->seek_request = -1; 544 } 545 if (0 == pos->round_finished) 546 done = 0; /* can't be done, plugin still active */ 547 } 548 } 549 550 if (0 == have_in_memory) 551 return; 552 /* run in-process plugins */ 553 ctx.finished = 0; 554 ctx.ds = ds; 555 ctx.proc = proc; 556 ctx.proc_cls = proc_cls; 557 ec.cls = &ctx; 558 ec.read = &in_process_read; 559 ec.seek = &in_process_seek; 560 ec.get_size = &in_process_get_size; 561 ec.proc = &in_process_proc; 562 for (pos = plugins; NULL != pos; pos = pos->next) 563 { 564 if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags) 565 continue; 566 if (-1 == EXTRACTOR_plugin_load_ (pos)) 567 continue; 568 ctx.plugin = pos; 569 ec.config = pos->plugin_options; 570 if (-1 == EXTRACTOR_datasource_seek_ (ds, 0, SEEK_SET)) 571 { 572 LOG ("Failed to seek to 0 for in-memory plugins\n"); 573 return; 574 } 575 pos->extract_method (&ec); 576 if (1 == ctx.finished) 577 break; 578 } 579 } 580 581 582 /** 583 * Extract keywords from a file using the given set of plugins. 584 * If needed, opens the file and loads its data (via mmap). Then 585 * decompresses it if the data is compressed. Finally runs the 586 * plugins on the (now possibly decompressed) data. 587 * 588 * @param plugins the list of plugins to use 589 * @param filename the name of the file, can be NULL if data is not NULL 590 * @param data data of the file in memory, can be NULL (in which 591 * case libextractor will open file) if filename is not NULL 592 * @param size number of bytes in data, ignored if data is NULL 593 * @param proc function to call for each meta data item found 594 * @param proc_cls cls argument to @a proc 595 */ 596 void 597 EXTRACTOR_extract (struct EXTRACTOR_PluginList *plugins, 598 const char *filename, 599 const void *data, 600 size_t size, 601 EXTRACTOR_MetaDataProcessor proc, 602 void *proc_cls) 603 { 604 struct EXTRACTOR_Datasource *datasource; 605 struct EXTRACTOR_SharedMemory *shm; 606 struct EXTRACTOR_PluginList *pos; 607 int have_oop; 608 609 if (NULL == plugins) 610 return; 611 if (NULL == filename) 612 datasource = EXTRACTOR_datasource_create_from_buffer_ (data, size, 613 proc, proc_cls); 614 else 615 datasource = EXTRACTOR_datasource_create_from_file_ (filename, 616 proc, proc_cls); 617 if (NULL == datasource) 618 return; 619 shm = NULL; 620 have_oop = 0; 621 for (pos = plugins; NULL != pos; pos = pos->next) 622 { 623 if (NULL == shm) 624 shm = pos->shm; 625 if (EXTRACTOR_OPTION_IN_PROCESS != pos->flags) 626 have_oop = 1; 627 pos->round_finished = 0; 628 } 629 if ( (NULL == shm) && 630 (1 == have_oop) ) 631 { 632 /* need to create shared memory segment */ 633 shm = EXTRACTOR_IPC_shared_memory_create_ (DEFAULT_SHM_SIZE); 634 if (NULL == shm) 635 { 636 LOG ("Failed to setup IPC\n"); 637 EXTRACTOR_datasource_destroy_ (datasource); 638 return; 639 } 640 } 641 for (pos = plugins; NULL != pos; pos = pos->next) 642 if ( (NULL == pos->channel) && 643 (NULL != shm) && 644 (EXTRACTOR_OPTION_IN_PROCESS != pos->flags) ) 645 { 646 if (NULL == pos->shm) 647 { 648 pos->shm = shm; 649 (void) EXTRACTOR_IPC_shared_memory_change_rc_ (shm, 1); 650 } 651 pos->channel = EXTRACTOR_IPC_channel_create_ (pos, 652 shm); 653 } 654 do_extract (plugins, 655 shm, 656 datasource, 657 proc, 658 proc_cls); 659 EXTRACTOR_datasource_destroy_ (datasource); 660 } 661 662 663 /** 664 * Initialize gettext and libltdl (and W32 if needed). 665 */ 666 void __attribute__ ((constructor)) 667 EXTRACTOR_ltdl_init () 668 { 669 int err; 670 671 #if ENABLE_NLS 672 bindtextdomain (PACKAGE, LOCALEDIR); 673 #endif 674 err = lt_dlinit (); 675 if (err > 0) 676 { 677 #if DEBUG 678 fprintf (stderr, 679 _ ("Initialization of plugin mechanism failed: %s!\n"), 680 lt_dlerror ()); 681 #endif 682 return; 683 } 684 #if WINDOWS 685 plibc_init_utf8 ("GNU", PACKAGE, 1); 686 plibc_set_stat_size_size (sizeof (((struct stat *) 0)->st_size)); 687 plibc_set_stat_time_size (sizeof (((struct stat *) 0)->st_mtime)); 688 #endif 689 } 690 691 692 /** 693 * Deinit. 694 */ 695 void __attribute__ ((destructor)) 696 EXTRACTOR_ltdl_fini () 697 { 698 #if WINDOWS 699 plibc_shutdown (); 700 #endif 701 lt_dlexit (); 702 } 703 704 705 /* end of extractor.c */