ole2_extractor.c (27803B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2004, 2005, 2006, 2007, 2009, 2012, 2018 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 20 This code makes extensive use of libgsf 21 -- the Gnome Structured File Library 22 Copyright Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org) 23 24 Part of this code was adapted from wordleaker. 25 */ 26 /** 27 * @file plugins/ole2_extractor.c 28 * @brief plugin to support OLE2 (DOC, XLS, etc.) files 29 * @author Christian Grothoff 30 */ 31 #include "platform.h" 32 #include "extractor.h" 33 #include "convert.h" 34 #include <glib-object.h> 35 #include <string.h> 36 #include <stdio.h> 37 #include <ctype.h> 38 #include <gsf/gsf-utils.h> 39 #include <gsf/gsf-input-impl.h> 40 #include <gsf/gsf-input-memory.h> 41 #include <gsf/gsf-impl-utils.h> 42 #include <gsf/gsf-infile.h> 43 #include <gsf/gsf-infile-msole.h> 44 #include <gsf/gsf-msole-utils.h> 45 46 47 /** 48 * Set to 1 to use our own GsfInput subclass which supports seeking 49 * and thus can handle very large files. Set to 0 to use the simple 50 * gsf in-memory buffer (which can only access the first ~16k) for 51 * debugging. 52 */ 53 #define USE_LE_INPUT 1 54 55 56 /** 57 * Give the given UTF8 string to LE by calling 'proc'. 58 * 59 * @param proc callback to invoke 60 * @param proc_cls closure for proc 61 * @param phrase metadata string to pass; may include spaces 62 * just double-quotes or just a space in a double quote; 63 * in those cases, nothing should be done 64 * @param type meta data type to use 65 * @return if 'proc' returned 1, otherwise 0 66 */ 67 static int 68 add_metadata (EXTRACTOR_MetaDataProcessor proc, 69 void *proc_cls, 70 const char *phrase, 71 enum EXTRACTOR_MetaType type) 72 { 73 char *tmp; 74 int ret; 75 76 if (0 == strlen (phrase)) 77 return 0; 78 if (0 == strcmp (phrase, "\"\"")) 79 return 0; 80 if (0 == strcmp (phrase, "\" \"")) 81 return 0; 82 if (0 == strcmp (phrase, " ")) 83 return 0; 84 if (NULL == (tmp = strdup (phrase))) 85 return 0; 86 87 while ( (strlen (tmp) > 0) && 88 (isblank ((unsigned char) tmp [strlen (tmp) - 1])) ) 89 tmp [strlen (tmp) - 1] = '\0'; 90 ret = proc (proc_cls, 91 "ole2", 92 type, 93 EXTRACTOR_METAFORMAT_UTF8, 94 "text/plain", 95 tmp, 96 strlen (tmp) + 1); 97 free (tmp); 98 return ret; 99 } 100 101 102 /** 103 * Entry in the map from OLE meta type strings 104 * to LE types. 105 */ 106 struct Matches 107 { 108 /** 109 * OLE description. 110 */ 111 const char *text; 112 113 /** 114 * Corresponding LE type. 115 */ 116 enum EXTRACTOR_MetaType type; 117 }; 118 119 120 static struct Matches tmap[] = { 121 { "Title", EXTRACTOR_METATYPE_TITLE }, 122 { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT }, 123 { "Category", EXTRACTOR_METATYPE_SECTION }, 124 { "Manager", EXTRACTOR_METATYPE_MANAGER }, 125 { "Company", EXTRACTOR_METATYPE_COMPANY }, 126 { "Subject", EXTRACTOR_METATYPE_SUBJECT }, 127 { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME }, 128 { "Keywords", EXTRACTOR_METATYPE_KEYWORDS }, 129 { "Comments", EXTRACTOR_METATYPE_COMMENT }, 130 { "Template", EXTRACTOR_METATYPE_TEMPLATE }, 131 { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT }, 132 { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE }, 133 { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER }, 134 { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE }, 135 { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE }, 136 { "LastSavedTime", EXTRACTOR_METATYPE_MODIFICATION_DATE }, 137 { "gsf:company", EXTRACTOR_METATYPE_COMPANY }, 138 { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT }, 139 { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT }, 140 { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT }, 141 { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT }, 142 { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT }, 143 { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY }, 144 { "gsf:manager", EXTRACTOR_METATYPE_MANAGER }, 145 { "dc:title", EXTRACTOR_METATYPE_TITLE }, 146 { "dc:creator", EXTRACTOR_METATYPE_CREATOR }, 147 { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, 148 { "dc:subject", EXTRACTOR_METATYPE_SUBJECT }, 149 { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS }, 150 { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED }, 151 { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION }, 152 { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE }, 153 { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 154 { "meta:template", EXTRACTOR_METATYPE_TEMPLATE }, 155 { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES }, 156 /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE }, */ 157 /* { "gsf:security", EXTRACTOR_SECURITY }, */ 158 /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ 159 /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */ 160 /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */ 161 { NULL, 0 } 162 }; 163 164 165 /** 166 * Closure for 'process_metadata'. 167 */ 168 struct ProcContext 169 { 170 /** 171 * Function to call for meta data that was found. 172 */ 173 EXTRACTOR_MetaDataProcessor proc; 174 175 /** 176 * Closure for @e proc. 177 */ 178 void *proc_cls; 179 180 /** 181 * Return value; 0 to continue to extract, 1 if we are done 182 */ 183 int ret; 184 }; 185 186 187 /** 188 * Function invoked by 'gst_msole_metadata_read' with 189 * metadata found in the document. 190 * 191 * @param key 'const char *' describing the meta data 192 * @param value the UTF8 representation of the meta data 193 * @param user_data our 'struct ProcContext' (closure) 194 */ 195 static void 196 process_metadata (gpointer key, 197 gpointer value, 198 gpointer user_data) 199 { 200 const char *type = key; 201 const GsfDocProp *prop = value; 202 struct ProcContext *pc = user_data; 203 const GValue *gval; 204 char *contents; 205 int pos; 206 207 if ( (NULL == key) || 208 (NULL == value) ) 209 return; 210 if (0 != pc->ret) 211 return; 212 gval = gsf_doc_prop_get_val (prop); 213 214 if (G_VALUE_TYPE (gval) == G_TYPE_STRING) 215 { 216 const char *gvals; 217 218 gvals = g_value_get_string (gval); 219 if (NULL == gvals) 220 return; 221 contents = strdup (gvals); 222 } 223 else 224 { 225 /* convert other formats? */ 226 contents = g_strdup_value_contents (gval); 227 } 228 if (NULL == contents) 229 return; 230 if (0 == strcmp (type, 231 "meta:generator")) 232 { 233 const char *mimetype = "application/vnd.ms-files"; 234 struct 235 { 236 const char *v; 237 const char *m; 238 } mm[] = { 239 { "Microsoft Word", "application/msword" }, 240 { "Microsoft Office Word", "application/msword" }, 241 { "Microsoft Excel", "application/vnd.ms-excel" }, 242 { "Microsoft Office Excel", "application/vnd.ms-excel" }, 243 { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" }, 244 { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint"}, 245 { "Microsoft Project", "application/vnd.ms-project" }, 246 { "Microsoft Visio", "application/vnd.visio" }, 247 { "Microsoft Office", "application/vnd.ms-office" }, 248 { NULL, NULL } 249 }; 250 int i; 251 252 for (i = 0; NULL != mm[i].v; i++) 253 if (0 == strncmp (value, 254 mm[i].v, 255 strlen (mm[i].v) + 1)) 256 { 257 mimetype = mm[i].m; 258 break; 259 } 260 if (0 != add_metadata (pc->proc, 261 pc->proc_cls, 262 mimetype, 263 EXTRACTOR_METATYPE_MIMETYPE)) 264 { 265 free (contents); 266 pc->ret = 1; 267 return; 268 } 269 } 270 for (pos = 0; NULL != tmap[pos].text; pos++) 271 if (0 == strcmp (tmap[pos].text, 272 type)) 273 break; 274 if ( (NULL != tmap[pos].text) && 275 (0 != add_metadata (pc->proc, pc->proc_cls, 276 contents, 277 tmap[pos].type)) ) 278 { 279 free (contents); 280 pc->ret = 1; 281 return; 282 } 283 free (contents); 284 } 285 286 287 /** 288 * Function called on (Document)SummaryInformation OLE 289 * streams. 290 * 291 * @param in the input OLE stream 292 * @param proc function to call on meta data found 293 * @param proc_cls closure for proc 294 * @return 0 to continue to extract, 1 if we are done 295 */ 296 static int 297 process (GsfInput *in, 298 EXTRACTOR_MetaDataProcessor proc, 299 void *proc_cls) 300 { 301 struct ProcContext pc; 302 GsfDocMetaData *sections; 303 GError *error; 304 305 pc.proc = proc; 306 pc.proc_cls = proc_cls; 307 pc.ret = 0; 308 sections = gsf_doc_meta_data_new (); 309 #ifdef HAVE_GSF_DOC_META_DATA_READ_FROM_MSOLE 310 error = gsf_doc_meta_data_read_from_msole (sections, in); 311 #else 312 error = gsf_msole_metadata_read (in, sections); 313 #endif 314 if (NULL == error) 315 { 316 gsf_doc_meta_data_foreach (sections, 317 &process_metadata, 318 &pc); 319 } 320 else 321 { 322 g_error_free (error); 323 } 324 g_object_unref (G_OBJECT (sections)); 325 return pc.ret; 326 } 327 328 329 /** 330 * Function called on SfxDocumentInfo OLE 331 * streams. 332 * 333 * @param in the input OLE stream 334 * @param proc function to call on meta data found 335 * @param proc_cls closure for proc 336 * @return 0 to continue to extract, 1 if we are done 337 */ 338 static int 339 process_star_office (GsfInput *src, 340 EXTRACTOR_MetaDataProcessor proc, 341 void *proc_cls) 342 { 343 off_t size = gsf_input_size (src); 344 345 if ( (size < 0x374) || 346 (size > 4 * 1024 * 1024) ) /* == 0x375?? */ 347 return 0; 348 { 349 char buf[size]; 350 351 gsf_input_read (src, size, (unsigned char*) buf); 352 if ( (buf[0] != 0x0F) || 353 (buf[1] != 0x0) || 354 (0 != strncmp (&buf[2], 355 "SfxDocumentInfo", 356 strlen ("SfxDocumentInfo"))) || 357 (buf[0x11] != 0x0B) || 358 (buf[0x13] != 0x00) || /* pw protected! */ 359 (buf[0x12] != 0x00) ) 360 return 0; 361 buf[0xd3] = '\0'; 362 if ( (buf[0x94] + buf[0x93] > 0) && 363 (0 != add_metadata (proc, proc_cls, 364 &buf[0x95], 365 EXTRACTOR_METATYPE_TITLE)) ) 366 return 1; 367 buf[0x114] = '\0'; 368 if ( (buf[0xd5] + buf[0xd4] > 0) && 369 (0 != add_metadata (proc, proc_cls, 370 &buf[0xd6], 371 EXTRACTOR_METATYPE_SUBJECT)) ) 372 return 1; 373 buf[0x215] = '\0'; 374 if ( (buf[0x115] + buf[0x116] > 0) && 375 (0 != add_metadata (proc, proc_cls, 376 &buf[0x117], 377 EXTRACTOR_METATYPE_COMMENT)) ) 378 return 1; 379 buf[0x296] = '\0'; 380 if ( (buf[0x216] + buf[0x217] > 0) && 381 (0 != add_metadata (proc, proc_cls, 382 &buf[0x218], 383 EXTRACTOR_METATYPE_KEYWORDS)) ) 384 return 1; 385 /* fixme: do timestamps, 386 mime-type, user-defined info's */ 387 } 388 return 0; 389 } 390 391 392 /** 393 * We use "__" to translate using iso-639. 394 * 395 * @param a string to translate 396 * @return translated string 397 */ 398 #define __(a) dgettext ("iso-639", a) 399 400 401 /** 402 * Get the language string for the given language ID (lid) 403 * value. 404 * 405 * @param lid language id value 406 * @return language string corresponding to the lid 407 */ 408 static const char * 409 lid_to_language (unsigned int lid) 410 { 411 switch (lid) 412 { 413 case 0x0400: 414 return _ ("No Proofing"); 415 case 0x0401: 416 return __ ("Arabic"); 417 case 0x0402: 418 return __ ("Bulgarian"); 419 case 0x0403: 420 return __ ("Catalan"); 421 case 0x0404: 422 return _ ("Traditional Chinese"); 423 case 0x0804: 424 return _ ("Simplified Chinese"); 425 case 0x0405: 426 return __ ("Chechen"); 427 case 0x0406: 428 return __ ("Danish"); 429 case 0x0407: 430 return __ ("German"); 431 case 0x0807: 432 return _ ("Swiss German"); 433 case 0x0408: 434 return __ ("Greek"); 435 case 0x0409: 436 return _ ("U.S. English"); 437 case 0x0809: 438 return _ ("U.K. English"); 439 case 0x0c09: 440 return _ ("Australian English"); 441 case 0x040a: 442 return _ ("Castilian Spanish"); 443 case 0x080a: 444 return _ ("Mexican Spanish"); 445 case 0x040b: 446 return __ ("Finnish"); 447 case 0x040c: 448 return __ ("French"); 449 case 0x080c: 450 return _ ("Belgian French"); 451 case 0x0c0c: 452 return _ ("Canadian French"); 453 case 0x100c: 454 return _ ("Swiss French"); 455 case 0x040d: 456 return __ ("Hebrew"); 457 case 0x040e: 458 return __ ("Hungarian"); 459 case 0x040f: 460 return __ ("Icelandic"); 461 case 0x0410: 462 return __ ("Italian"); 463 case 0x0810: 464 return _ ("Swiss Italian"); 465 case 0x0411: 466 return __ ("Japanese"); 467 case 0x0412: 468 return __ ("Korean"); 469 case 0x0413: 470 return __ ("Dutch"); 471 case 0x0813: 472 return _ ("Belgian Dutch"); 473 case 0x0414: 474 return _ ("Norwegian Bokmal"); 475 case 0x0814: 476 return __ ("Norwegian Nynorsk"); 477 case 0x0415: 478 return __ ("Polish"); 479 case 0x0416: 480 return __ ("Brazilian Portuguese"); 481 case 0x0816: 482 return __ ("Portuguese"); 483 case 0x0417: 484 return _ ("Rhaeto-Romanic"); 485 case 0x0418: 486 return __ ("Romanian"); 487 case 0x0419: 488 return __ ("Russian"); 489 case 0x041a: 490 return _ ("Croato-Serbian (Latin)"); 491 case 0x081a: 492 return _ ("Serbo-Croatian (Cyrillic)"); 493 case 0x041b: 494 return __ ("Slovak"); 495 case 0x041c: 496 return __ ("Albanian"); 497 case 0x041d: 498 return __ ("Swedish"); 499 case 0x041e: 500 return __ ("Thai"); 501 case 0x041f: 502 return __ ("Turkish"); 503 case 0x0420: 504 return __ ("Urdu"); 505 case 0x0421: 506 return __ ("Bahasa"); 507 case 0x0422: 508 return __ ("Ukrainian"); 509 case 0x0423: 510 return __ ("Byelorussian"); 511 case 0x0424: 512 return __ ("Slovenian"); 513 case 0x0425: 514 return __ ("Estonian"); 515 case 0x0426: 516 return __ ("Latvian"); 517 case 0x0427: 518 return __ ("Lithuanian"); 519 case 0x0429: 520 return _ ("Farsi"); 521 case 0x042D: 522 return __ ("Basque"); 523 case 0x042F: 524 return __ ("Macedonian"); 525 case 0x0436: 526 return __ ("Afrikaans"); 527 case 0x043E: 528 return __ ("Malayalam"); 529 default: 530 return NULL; 531 } 532 } 533 534 535 /** 536 * Extract editing history from XTable stream. 537 * 538 * @param stream OLE stream to process 539 * @param lcSttbSavedBy length of the revision history in bytes 540 * @param fcSttbSavedBy offset of the revision history in the stream 541 * @param proc function to call on meta data found 542 * @param proc_cls closure for proc 543 * @return 0 to continue to extract, 1 if we are done 544 */ 545 static int 546 history_extract (GsfInput *stream, 547 unsigned int lcbSttbSavedBy, 548 unsigned int fcSttbSavedBy, 549 EXTRACTOR_MetaDataProcessor proc, 550 void *proc_cls) 551 { 552 unsigned int where; 553 unsigned char *lbuffer; 554 unsigned int i; 555 unsigned int length; 556 char *author; 557 char *filename; 558 char *rbuf; 559 unsigned int nRev; 560 int ret; 561 562 /* goto offset of revision information */ 563 gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET); 564 if (gsf_input_remaining (stream) < lcbSttbSavedBy) 565 return 0; 566 if (NULL == (lbuffer = malloc (lcbSttbSavedBy))) 567 return 0; 568 /* read all the revision history */ 569 gsf_input_read (stream, lcbSttbSavedBy, lbuffer); 570 /* there are n strings, so n/2 revisions (author & file) */ 571 nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; 572 where = 6; 573 ret = 0; 574 for (i = 0; i < nRev; i++) 575 { 576 if (where >= lcbSttbSavedBy) 577 break; 578 length = lbuffer[where++]; 579 if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || 580 (where + 2 * length + 2 <= where) ) 581 break; 582 author = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where], 583 length * 2, 584 "UTF-16BE"); 585 where += length * 2 + 1; 586 length = lbuffer[where++]; 587 if ( (where + 2 * length >= lcbSttbSavedBy) || 588 (where + 2 * length + 1 <= where) ) 589 { 590 if (NULL != author) 591 free (author); 592 break; 593 } 594 filename = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where], 595 length * 2, 596 "UTF-16BE"); 597 where += length * 2 + 1; 598 if ( (NULL != author) && 599 (NULL != filename) ) 600 { 601 size_t bsize; 602 603 bsize = strlen (author) + strlen (filename) + 512; 604 if (NULL != (rbuf = malloc (bsize))) 605 { 606 int snret; 607 608 snret = snprintf (rbuf, 609 bsize, 610 _ ("Revision #%u: Author `%s' worked on `%s'"), 611 i, 612 author, 613 filename); 614 if ( (-1 != snret) && 615 (bsize > (size_t) snret) ) 616 { 617 ret = add_metadata (proc, 618 proc_cls, 619 rbuf, 620 EXTRACTOR_METATYPE_REVISION_HISTORY); 621 } 622 free (rbuf); 623 } 624 } 625 if (NULL != author) 626 free (author); 627 if (NULL != filename) 628 free (filename); 629 if (0 != ret) 630 break; 631 } 632 free (lbuffer); 633 return ret; 634 } 635 636 637 /* *************************** custom GSF input method ***************** */ 638 639 #define LE_TYPE_INPUT (le_input_get_type ()) 640 #define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), \ 641 LE_TYPE_INPUT, \ 642 LeInput)) 643 #define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), \ 644 LE_TYPE_INPUT, \ 645 LeInputClass)) 646 #define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), \ 647 LE_TYPE_INPUT)) 648 #define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), \ 649 LE_TYPE_INPUT)) 650 #define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), \ 651 LE_TYPE_INPUT, \ 652 LeInputClass)) 653 654 /** 655 * Internal state of an "LeInput" object. 656 */ 657 typedef struct _LeInputPrivate 658 { 659 /** 660 * Our extraction context. 661 */ 662 struct EXTRACTOR_ExtractContext *ec; 663 } LeInputPrivate; 664 665 666 /** 667 * Overall state of an "LeInput" object. 668 */ 669 typedef struct _LeInput 670 { 671 /** 672 * Inherited state from parent (GsfInput). 673 */ 674 GsfInput input; 675 676 /*< private > */ 677 /** 678 * Private state of the LeInput. 679 */ 680 LeInputPrivate *priv; 681 } LeInput; 682 683 684 /** 685 * LeInput's class state. 686 */ 687 typedef struct _LeInputClass 688 { 689 /** 690 * GsfInput is our parent class. 691 */ 692 GsfInputClass parent_class; 693 694 /* Padding for future expansion */ 695 void (*_gtk_reserved1)(void); 696 void (*_gtk_reserved2)(void); 697 void (*_gtk_reserved3)(void); 698 void (*_gtk_reserved4)(void); 699 } LeInputClass; 700 701 702 /** 703 * Constructor for LeInput objects. 704 * 705 * @param ec extraction context to use 706 * @return the LeInput, NULL on error 707 */ 708 GsfInput * 709 le_input_new (struct EXTRACTOR_ExtractContext *ec); 710 711 712 /** 713 * Class initializer for the "LeInput" class. 714 * 715 * @param class class object to initialize 716 */ 717 static void 718 le_input_class_init (LeInputClass *class); 719 720 721 /** 722 * Initialize internal state of fresh input object. 723 * 724 * @param input object to initialize 725 */ 726 static void 727 le_input_init (LeInput *input); 728 729 730 /** 731 * Macro to create LeInput type definition and register the class. 732 */ 733 GSF_CLASS (LeInput, le_input, le_input_class_init, le_input_init, 734 GSF_INPUT_TYPE) 735 736 737 /** 738 * Duplicate input, leaving the new one at the same offset. 739 * 740 * @param input the input to duplicate 741 * @param err location for error reporting, can be NULL 742 * @return NULL on error (always) 743 */ 744 static GsfInput * 745 le_input_dup (GsfInput * input, 746 GError * *err) 747 { 748 if (NULL != err) 749 *err = g_error_new (gsf_input_error_id (), 0, 750 "dup not supported on LeInput"); 751 return NULL; 752 } 753 754 755 /** 756 * Read at least num_bytes. Does not change the current position if 757 * there is an error. Will only read if the entire amount can be 758 * read. Invalidates the buffer associated with previous calls to 759 * gsf_input_read. 760 * 761 * @param input 762 * @param num_bytes 763 * @param optional_buffer 764 * @return buffer where num_bytes data are available, or NULL on error 765 */ 766 static const guint8 * 767 le_input_read (GsfInput *input, 768 size_t num_bytes, 769 guint8 *optional_buffer) 770 { 771 LeInput *li = LE_INPUT (input); 772 struct EXTRACTOR_ExtractContext *ec; 773 void *buf; 774 uint64_t old_off; 775 ssize_t ret; 776 777 ec = li->priv->ec; 778 old_off = ec->seek (ec->cls, 0, SEEK_CUR); 779 if (num_bytes 780 != (ret = ec->read (ec->cls, 781 &buf, 782 num_bytes))) 783 { 784 /* we don't support partial reads; 785 most other GsfInput implementations in this case 786 allocate some huge temporary buffer just to avoid 787 the partial read; we might need to do that as well!? */ 788 ec->seek (ec->cls, SEEK_SET, old_off); 789 return NULL; 790 } 791 if (NULL != optional_buffer) 792 { 793 memcpy (optional_buffer, buf, num_bytes); 794 return optional_buffer; 795 } 796 return buf; 797 } 798 799 800 /** 801 * Move the current location in an input stream 802 * 803 * @param input stream to seek 804 * @param offset target offset 805 * @param whence determines to what the offset is relative to 806 * @return TRUE on error 807 */ 808 static gboolean 809 le_input_seek (GsfInput *input, 810 gsf_off_t offset, 811 GSeekType whence) 812 { 813 LeInput *li = LE_INPUT (input); 814 struct EXTRACTOR_ExtractContext *ec; 815 int w; 816 int64_t ret; 817 818 ec = li->priv->ec; 819 switch (whence) 820 { 821 case G_SEEK_SET: 822 w = SEEK_SET; 823 break; 824 case G_SEEK_CUR: 825 w = SEEK_CUR; 826 break; 827 case G_SEEK_END: 828 w = SEEK_END; 829 break; 830 default: 831 return TRUE; 832 } 833 if (-1 == 834 (ret = ec->seek (ec->cls, 835 offset, 836 w))) 837 return TRUE; 838 return FALSE; 839 } 840 841 842 /** 843 * Class initializer for the "LeInput" class. 844 * 845 * @param class class object to initialize 846 */ 847 static void 848 le_input_class_init (LeInputClass *class) 849 { 850 GsfInputClass *input_class; 851 852 input_class = (GsfInputClass *) class; 853 input_class->Dup = le_input_dup; 854 input_class->Read = le_input_read; 855 input_class->Seek = le_input_seek; 856 g_type_class_add_private (class, sizeof (LeInputPrivate)); 857 } 858 859 860 /** 861 * Initialize internal state of fresh input object. 862 * 863 * @param input object to initialize 864 */ 865 static void 866 le_input_init (LeInput *input) 867 { 868 LeInputPrivate *priv; 869 870 input->priv = 871 G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT, 872 LeInputPrivate); 873 priv = input->priv; 874 priv->ec = NULL; 875 } 876 877 878 /** 879 * Creates a new LeInput object. 880 * 881 * @param ec extractor context to wrap 882 * @return NULL on error 883 */ 884 GsfInput * 885 le_input_new (struct EXTRACTOR_ExtractContext *ec) 886 { 887 LeInput *input; 888 889 input = g_object_new (LE_TYPE_INPUT, NULL); 890 gsf_input_set_size (GSF_INPUT (input), 891 ec->get_size (ec->cls)); 892 gsf_input_seek_emulate (GSF_INPUT (input), 893 0); 894 input->input.name = NULL; 895 input->input.container = NULL; 896 input->priv->ec = ec; 897 898 return GSF_INPUT (input); 899 } 900 901 902 /* *********************** end of custom GSF input method ************* */ 903 904 905 /** 906 * Main entry method for the OLE2 extraction plugin. 907 * 908 * @param ec extraction context provided to the plugin 909 */ 910 void 911 EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec) 912 { 913 GsfInput *input; 914 GsfInfile *infile; 915 GsfInput *src; 916 const char *name; 917 unsigned int i; 918 unsigned int lcb; 919 unsigned int fcb; 920 const unsigned char *data512; 921 unsigned int lid; 922 const char *lang; 923 int ret; 924 void *data; 925 uint64_t fsize; 926 ssize_t data_size; 927 928 fsize = ec->get_size (ec->cls); 929 if (fsize < 512 + 898) 930 { 931 /* File too small for OLE2 */ 932 return; /* can hardly be OLE2 */ 933 } 934 if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize))) 935 { 936 /* Failed to read minimum file size to buffer */ 937 return; 938 } 939 data512 = (const unsigned char*) data + 512; 940 lid = data512[6] + (data512[7] << 8); 941 if ( (NULL != (lang = lid_to_language (lid))) && 942 (0 != (ret = add_metadata (ec->proc, ec->cls, 943 lang, 944 EXTRACTOR_METATYPE_LANGUAGE))) ) 945 return; 946 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) 947 + (data512[729] << 24); 948 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) 949 + (data512[725] << 24); 950 if (0 != ec->seek (ec->cls, 0, SEEK_SET)) 951 { 952 /* seek failed!? */ 953 return; 954 } 955 #if USE_LE_INPUT 956 if (NULL == (input = le_input_new (ec))) 957 { 958 fprintf (stderr, "le_input_new failed\n"); 959 return; 960 } 961 #else 962 input = gsf_input_memory_new ((const guint8 *) data, 963 data_size, 964 FALSE); 965 #endif 966 if (NULL == (infile = gsf_infile_msole_new (input, NULL))) 967 { 968 g_object_unref (G_OBJECT (input)); 969 return; 970 } 971 ret = 0; 972 for (i = 0; i<gsf_infile_num_children (infile); i++) 973 { 974 if (0 != ret) 975 break; 976 if (NULL == (name = gsf_infile_name_by_index (infile, i))) 977 continue; 978 src = NULL; 979 if ( ( (0 == strcmp (name, "\005SummaryInformation")) || 980 (0 == strcmp (name, "\005DocumentSummaryInformation")) ) && 981 (NULL != (src = gsf_infile_child_by_index (infile, i))) ) 982 ret = process (src, 983 ec->proc, 984 ec->cls); 985 if ( (0 == strcmp (name, "SfxDocumentInfo")) && 986 (NULL != (src = gsf_infile_child_by_index (infile, i))) ) 987 ret = process_star_office (src, 988 ec->proc, 989 ec->cls); 990 if (NULL != src) 991 g_object_unref (G_OBJECT (src)); 992 } 993 if (0 != ret) 994 goto CLEANUP; 995 996 if (lcb < 6) 997 goto CLEANUP; 998 for (i = 0; i<gsf_infile_num_children (infile); i++) 999 { 1000 if (ret != 0) 1001 break; 1002 if (NULL == (name = gsf_infile_name_by_index (infile, i))) 1003 continue; 1004 if ( ( (0 == strcmp (name, "1Table")) || 1005 (0 == strcmp (name, "0Table")) ) && 1006 (NULL != (src = gsf_infile_child_by_index (infile, i))) ) 1007 { 1008 ret = history_extract (src, 1009 lcb, 1010 fcb, 1011 ec->proc, ec->cls); 1012 g_object_unref (G_OBJECT (src)); 1013 } 1014 } 1015 CLEANUP: 1016 g_object_unref (G_OBJECT (infile)); 1017 g_object_unref (G_OBJECT (input)); 1018 } 1019 1020 1021 /** 1022 * Custom log function we give to GSF to disable logging. 1023 * 1024 * @param log_domain unused 1025 * @param log_level unused 1026 * @param message unused 1027 * @param user_data unused 1028 */ 1029 static void 1030 nolog (const gchar *log_domain, 1031 GLogLevelFlags log_level, 1032 const gchar *message, 1033 gpointer user_data) 1034 { 1035 /* do nothing */ 1036 } 1037 1038 1039 /** 1040 * OLE2 plugin constructor. Initializes glib and gsf, in particular 1041 * gsf logging is disabled. 1042 */ 1043 void __attribute__ ((constructor)) 1044 ole2_ltdl_init () 1045 { 1046 #if ! GLIB_CHECK_VERSION (2, 35, 0) 1047 g_type_init (); 1048 #endif 1049 #ifdef HAVE_GSF_INIT 1050 gsf_init (); 1051 #endif 1052 /* disable logging -- thanks, Jody! */ 1053 g_log_set_handler ("libgsf:msole", 1054 G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, 1055 &nolog, NULL); 1056 } 1057 1058 1059 /** 1060 * OLE2 plugin destructor. Shutdown of gsf. 1061 */ 1062 void __attribute__ ((destructor)) 1063 ole2_ltdl_fini () 1064 { 1065 #ifdef HAVE_GSF_INIT 1066 gsf_shutdown (); 1067 #endif 1068 } 1069 1070 1071 /* end of ole2_extractor.c */