libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

ole2_extractor.c (27803B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2004, 2005, 2006, 2007, 2009, 2012, 2018 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19 
     20      This code makes extensive use of libgsf
     21      -- the Gnome Structured File Library
     22      Copyright Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
     23 
     24      Part of this code was adapted from wordleaker.
     25 */
     26 /**
     27  * @file plugins/ole2_extractor.c
     28  * @brief plugin to support OLE2 (DOC, XLS, etc.) files
     29  * @author Christian Grothoff
     30  */
     31 #include "platform.h"
     32 #include "extractor.h"
     33 #include "convert.h"
     34 #include <glib-object.h>
     35 #include <string.h>
     36 #include <stdio.h>
     37 #include <ctype.h>
     38 #include <gsf/gsf-utils.h>
     39 #include <gsf/gsf-input-impl.h>
     40 #include <gsf/gsf-input-memory.h>
     41 #include <gsf/gsf-impl-utils.h>
     42 #include <gsf/gsf-infile.h>
     43 #include <gsf/gsf-infile-msole.h>
     44 #include <gsf/gsf-msole-utils.h>
     45 
     46 
     /**
      * Selects the GsfInput implementation handed to libgsf.
      *
      * 1: use our own GsfInput subclass, which supports seeking and can
      *    therefore handle very large files.
      * 0: use the simple gsf in-memory buffer, which can only access the
      *    first ~16k; intended for debugging only.
      */
     #define USE_LE_INPUT 1
     54 
     55 
     56 /**
     57  * Give the given UTF8 string to LE by calling 'proc'.
     58  *
     59  * @param proc callback to invoke
     60  * @param proc_cls closure for proc
     61  * @param phrase metadata string to pass; may include spaces
     62  *        just double-quotes or just a space in a double quote;
     63  *        in those cases, nothing should be done
     64  * @param type meta data type to use
     65  * @return if 'proc' returned 1, otherwise 0
     66  */
     67 static int
     68 add_metadata (EXTRACTOR_MetaDataProcessor proc,
     69               void *proc_cls,
     70               const char *phrase,
     71               enum EXTRACTOR_MetaType type)
     72 {
     73   char *tmp;
     74   int ret;
     75 
     76   if (0 == strlen (phrase))
     77     return 0;
     78   if (0 == strcmp (phrase, "\"\""))
     79     return 0;
     80   if (0 == strcmp (phrase, "\" \""))
     81     return 0;
     82   if (0 == strcmp (phrase, " "))
     83     return 0;
     84   if (NULL == (tmp = strdup (phrase)))
     85     return 0;
     86 
     87   while ( (strlen (tmp) > 0) &&
     88           (isblank ((unsigned char) tmp [strlen (tmp) - 1])) )
     89     tmp [strlen (tmp) - 1] = '\0';
     90   ret = proc (proc_cls,
     91               "ole2",
     92               type,
     93               EXTRACTOR_METAFORMAT_UTF8,
     94               "text/plain",
     95               tmp,
     96               strlen (tmp) + 1);
     97   free (tmp);
     98   return ret;
     99 }
    100 
    101 
    102 /**
    103  * Entry in the map from OLE meta type  strings
    104  * to LE types.
    105  */
    106 struct Matches
    107 {
    108   /**
    109    * OLE description.
    110    */
    111   const char *text;
    112 
    113   /**
    114    * Corresponding LE type.
    115    */
    116   enum EXTRACTOR_MetaType type;
    117 };
    118 
    119 
    120 static struct Matches tmap[] = {
    121   { "Title", EXTRACTOR_METATYPE_TITLE },
    122   { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
    123   { "Category", EXTRACTOR_METATYPE_SECTION },
    124   { "Manager", EXTRACTOR_METATYPE_MANAGER },
    125   { "Company", EXTRACTOR_METATYPE_COMPANY },
    126   { "Subject", EXTRACTOR_METATYPE_SUBJECT },
    127   { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
    128   { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
    129   { "Comments", EXTRACTOR_METATYPE_COMMENT },
    130   { "Template", EXTRACTOR_METATYPE_TEMPLATE },
    131   { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
    132   { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
    133   { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
    134   { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE },
    135   { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
    136   { "LastSavedTime", EXTRACTOR_METATYPE_MODIFICATION_DATE },
    137   { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
    138   { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
    139   { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
    140   { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
    141   { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
    142   { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
    143   { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
    144   { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
    145   { "dc:title", EXTRACTOR_METATYPE_TITLE },
    146   { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
    147   { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
    148   { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
    149   { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
    150   { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
    151   { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
    152   { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
    153   { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
    154   { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
    155   { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES },
    156   /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE },  */
    157   /* { "gsf:security", EXTRACTOR_SECURITY }, */
    158   /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
    159   /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
    160   /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
    161   { NULL, 0 }
    162 };
    163 
    164 
    165 /**
    166  * Closure for 'process_metadata'.
    167  */
    168 struct ProcContext
    169 {
    170   /**
    171    * Function to call for meta data that was found.
    172    */
    173   EXTRACTOR_MetaDataProcessor proc;
    174 
    175   /**
    176    * Closure for @e proc.
    177    */
    178   void *proc_cls;
    179 
    180   /**
    181    * Return value; 0 to continue to extract, 1 if we are done
    182    */
    183   int ret;
    184 };
    185 
    186 
    187 /**
    188  * Function invoked by 'gst_msole_metadata_read' with
    189  * metadata found in the document.
    190  *
    191  * @param key 'const char *' describing the meta data
    192  * @param value the UTF8 representation of the meta data
    193  * @param user_data our 'struct ProcContext' (closure)
    194  */
    195 static void
    196 process_metadata (gpointer key,
    197                   gpointer value,
    198                   gpointer user_data)
    199 {
    200   const char *type = key;
    201   const GsfDocProp *prop = value;
    202   struct ProcContext *pc = user_data;
    203   const GValue *gval;
    204   char *contents;
    205   int pos;
    206 
    207   if ( (NULL == key) ||
    208        (NULL == value) )
    209     return;
    210   if (0 != pc->ret)
    211     return;
    212   gval = gsf_doc_prop_get_val (prop);
    213 
    214   if (G_VALUE_TYPE (gval) == G_TYPE_STRING)
    215   {
    216     const char *gvals;
    217 
    218     gvals = g_value_get_string (gval);
    219     if (NULL == gvals)
    220       return;
    221     contents = strdup (gvals);
    222   }
    223   else
    224   {
    225     /* convert other formats? */
    226     contents = g_strdup_value_contents (gval);
    227   }
    228   if (NULL == contents)
    229     return;
    230   if (0 == strcmp (type,
    231                    "meta:generator"))
    232   {
    233     const char *mimetype = "application/vnd.ms-files";
    234     struct
    235     {
    236       const char *v;
    237       const char *m;
    238     } mm[] = {
    239       { "Microsoft Word", "application/msword" },
    240       { "Microsoft Office Word", "application/msword" },
    241       { "Microsoft Excel", "application/vnd.ms-excel" },
    242       { "Microsoft Office Excel", "application/vnd.ms-excel" },
    243       { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" },
    244       { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint"},
    245       { "Microsoft Project", "application/vnd.ms-project" },
    246       { "Microsoft Visio", "application/vnd.visio" },
    247       { "Microsoft Office", "application/vnd.ms-office" },
    248       { NULL, NULL }
    249     };
    250     int i;
    251 
    252     for (i = 0; NULL != mm[i].v; i++)
    253       if (0 == strncmp (value,
    254                         mm[i].v,
    255                         strlen (mm[i].v) + 1))
    256       {
    257         mimetype = mm[i].m;
    258         break;
    259       }
    260     if (0 != add_metadata (pc->proc,
    261                            pc->proc_cls,
    262                            mimetype,
    263                            EXTRACTOR_METATYPE_MIMETYPE))
    264     {
    265       free (contents);
    266       pc->ret = 1;
    267       return;
    268     }
    269   }
    270   for (pos = 0; NULL != tmap[pos].text; pos++)
    271     if (0 == strcmp (tmap[pos].text,
    272                      type))
    273       break;
    274   if ( (NULL != tmap[pos].text) &&
    275        (0 != add_metadata (pc->proc, pc->proc_cls,
    276                            contents,
    277                            tmap[pos].type)) )
    278   {
    279     free (contents);
    280     pc->ret = 1;
    281     return;
    282   }
    283   free (contents);
    284 }
    285 
    286 
    287 /**
    288  * Function called on (Document)SummaryInformation OLE
    289  * streams.
    290  *
    291  * @param in the input OLE stream
    292  * @param proc function to call on meta data found
    293  * @param proc_cls closure for proc
    294  * @return 0 to continue to extract, 1 if we are done
    295  */
    296 static int
    297 process (GsfInput *in,
    298          EXTRACTOR_MetaDataProcessor proc,
    299          void *proc_cls)
    300 {
    301   struct ProcContext pc;
    302   GsfDocMetaData *sections;
    303   GError *error;
    304 
    305   pc.proc = proc;
    306   pc.proc_cls = proc_cls;
    307   pc.ret = 0;
    308   sections = gsf_doc_meta_data_new ();
    309 #ifdef HAVE_GSF_DOC_META_DATA_READ_FROM_MSOLE
    310   error = gsf_doc_meta_data_read_from_msole (sections, in);
    311 #else
    312   error = gsf_msole_metadata_read (in, sections);
    313 #endif
    314   if (NULL == error)
    315   {
    316     gsf_doc_meta_data_foreach (sections,
    317                                &process_metadata,
    318                                &pc);
    319   }
    320   else
    321   {
    322     g_error_free (error);
    323   }
    324   g_object_unref (G_OBJECT (sections));
    325   return pc.ret;
    326 }
    327 
    328 
    329 /**
    330  * Function called on SfxDocumentInfo OLE
    331  * streams.
    332  *
    333  * @param in the input OLE stream
    334  * @param proc function to call on meta data found
    335  * @param proc_cls closure for proc
    336  * @return 0 to continue to extract, 1 if we are done
    337  */
    338 static int
    339 process_star_office (GsfInput *src,
    340                      EXTRACTOR_MetaDataProcessor proc,
    341                      void *proc_cls)
    342 {
    343   off_t size = gsf_input_size (src);
    344 
    345   if ( (size < 0x374) ||
    346        (size > 4 * 1024 * 1024) ) /* == 0x375?? */
    347     return 0;
    348   {
    349     char buf[size];
    350 
    351     gsf_input_read (src, size, (unsigned char*) buf);
    352     if ( (buf[0] != 0x0F) ||
    353          (buf[1] != 0x0) ||
    354          (0 != strncmp (&buf[2],
    355                         "SfxDocumentInfo",
    356                         strlen ("SfxDocumentInfo"))) ||
    357          (buf[0x11] != 0x0B) ||
    358          (buf[0x13] != 0x00) || /* pw protected! */
    359          (buf[0x12] != 0x00) )
    360       return 0;
    361     buf[0xd3] = '\0';
    362     if ( (buf[0x94] + buf[0x93] > 0) &&
    363          (0 != add_metadata (proc, proc_cls,
    364                              &buf[0x95],
    365                              EXTRACTOR_METATYPE_TITLE)) )
    366       return 1;
    367     buf[0x114] = '\0';
    368     if ( (buf[0xd5] + buf[0xd4] > 0) &&
    369          (0 != add_metadata (proc, proc_cls,
    370                              &buf[0xd6],
    371                              EXTRACTOR_METATYPE_SUBJECT)) )
    372       return 1;
    373     buf[0x215] = '\0';
    374     if ( (buf[0x115] + buf[0x116] > 0) &&
    375          (0 != add_metadata (proc, proc_cls,
    376                              &buf[0x117],
    377                              EXTRACTOR_METATYPE_COMMENT)) )
    378       return 1;
    379     buf[0x296] = '\0';
    380     if ( (buf[0x216] + buf[0x217] > 0) &&
    381          (0 != add_metadata (proc, proc_cls,
    382                              &buf[0x218],
    383                              EXTRACTOR_METATYPE_KEYWORDS)) )
    384       return 1;
    385     /* fixme: do timestamps,
    386        mime-type, user-defined info's */
    387   }
    388   return 0;
    389 }
    390 
    391 
    /**
     * Translate a string using the "iso-639" message domain
     * (as opposed to "_", which is used for all other strings here).
     *
     * @param a string to translate
     * @return translated string
     */
    #define __(a) dgettext ("iso-639", (a))
    399 
    400 
    /**
     * Get the (translated) language name for the given language
     * ID (lid) value.  Plain language names are translated via the
     * "iso-639" domain ("__"), all other names via "_".
     *
     * @param lid language id value
     * @return language string corresponding to the lid,
     *         NULL if the lid is not known
     */
    static const char *
    lid_to_language (unsigned int lid)
    {
      switch (lid)
      {
      case 0x0400:
        return _ ("No Proofing");
      case 0x0401:
        return __ ("Arabic");
      case 0x0402:
        return __ ("Bulgarian");
      case 0x0403:
        return __ ("Catalan");
      case 0x0404:
        return _ ("Traditional Chinese");
      case 0x0804:
        return _ ("Simplified Chinese");
      case 0x0405:
        /* 0x0405 is Czech (not Chechen) */
        return __ ("Czech");
      case 0x0406:
        return __ ("Danish");
      case 0x0407:
        return __ ("German");
      case 0x0807:
        return _ ("Swiss German");
      case 0x0408:
        return __ ("Greek");
      case 0x0409:
        return _ ("U.S. English");
      case 0x0809:
        return _ ("U.K. English");
      case 0x0c09:
        return _ ("Australian English");
      case 0x040a:
        return _ ("Castilian Spanish");
      case 0x080a:
        return _ ("Mexican Spanish");
      case 0x040b:
        return __ ("Finnish");
      case 0x040c:
        return __ ("French");
      case 0x080c:
        return _ ("Belgian French");
      case 0x0c0c:
        return _ ("Canadian French");
      case 0x100c:
        return _ ("Swiss French");
      case 0x040d:
        return __ ("Hebrew");
      case 0x040e:
        return __ ("Hungarian");
      case 0x040f:
        return __ ("Icelandic");
      case 0x0410:
        return __ ("Italian");
      case 0x0810:
        return _ ("Swiss Italian");
      case 0x0411:
        return __ ("Japanese");
      case 0x0412:
        return __ ("Korean");
      case 0x0413:
        return __ ("Dutch");
      case 0x0813:
        return _ ("Belgian Dutch");
      case 0x0414:
        return _ ("Norwegian Bokmal");
      case 0x0814:
        return __ ("Norwegian Nynorsk");
      case 0x0415:
        return __ ("Polish");
      case 0x0416:
        return __ ("Brazilian Portuguese");
      case 0x0816:
        return __ ("Portuguese");
      case 0x0417:
        return _ ("Rhaeto-Romanic");
      case 0x0418:
        return __ ("Romanian");
      case 0x0419:
        return __ ("Russian");
      case 0x041a:
        return _ ("Croato-Serbian (Latin)");
      case 0x081a:
        return _ ("Serbo-Croatian (Cyrillic)");
      case 0x041b:
        return __ ("Slovak");
      case 0x041c:
        return __ ("Albanian");
      case 0x041d:
        return __ ("Swedish");
      case 0x041e:
        return __ ("Thai");
      case 0x041f:
        return __ ("Turkish");
      case 0x0420:
        return __ ("Urdu");
      case 0x0421:
        return __ ("Bahasa");
      case 0x0422:
        return __ ("Ukrainian");
      case 0x0423:
        return __ ("Byelorussian");
      case 0x0424:
        return __ ("Slovenian");
      case 0x0425:
        return __ ("Estonian");
      case 0x0426:
        return __ ("Latvian");
      case 0x0427:
        return __ ("Lithuanian");
      case 0x0429:
        return _ ("Farsi");
      case 0x042D:
        return __ ("Basque");
      case 0x042F:
        return __ ("Macedonian");
      case 0x0436:
        return __ ("Afrikaans");
      case 0x043E:
        /* 0x043E is Malay; Malayalam would be 0x044C */
        return __ ("Malay");
      default:
        return NULL;
      }
    }
    533 
    534 
    535 /**
    536  * Extract editing history from XTable stream.
    537  *
    538  * @param stream OLE stream to process
    539  * @param lcSttbSavedBy length of the revision history in bytes
    540  * @param fcSttbSavedBy offset of the revision history in the stream
    541  * @param proc function to call on meta data found
    542  * @param proc_cls closure for proc
    543  * @return 0 to continue to extract, 1 if we are done
    544  */
    545 static int
    546 history_extract (GsfInput *stream,
    547                  unsigned int lcbSttbSavedBy,
    548                  unsigned int fcSttbSavedBy,
    549                  EXTRACTOR_MetaDataProcessor proc,
    550                  void *proc_cls)
    551 {
    552   unsigned int where;
    553   unsigned char *lbuffer;
    554   unsigned int i;
    555   unsigned int length;
    556   char *author;
    557   char *filename;
    558   char *rbuf;
    559   unsigned int nRev;
    560   int ret;
    561 
    562   /* goto offset of revision information */
    563   gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET);
    564   if (gsf_input_remaining (stream) < lcbSttbSavedBy)
    565     return 0;
    566   if (NULL == (lbuffer = malloc (lcbSttbSavedBy)))
    567     return 0;
    568   /* read all the revision history */
    569   gsf_input_read (stream, lcbSttbSavedBy, lbuffer);
    570   /* there are n strings, so n/2 revisions (author & file) */
    571   nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
    572   where = 6;
    573   ret = 0;
    574   for (i = 0; i < nRev; i++)
    575   {
    576     if (where >= lcbSttbSavedBy)
    577       break;
    578     length = lbuffer[where++];
    579     if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
    580          (where + 2 * length + 2 <= where) )
    581       break;
    582     author = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
    583                                                length * 2,
    584                                                "UTF-16BE");
    585     where += length * 2 + 1;
    586     length = lbuffer[where++];
    587     if ( (where + 2 * length >= lcbSttbSavedBy) ||
    588          (where + 2 * length + 1 <= where) )
    589     {
    590       if (NULL != author)
    591         free (author);
    592       break;
    593     }
    594     filename = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
    595                                                  length * 2,
    596                                                  "UTF-16BE");
    597     where += length * 2 + 1;
    598     if ( (NULL != author) &&
    599          (NULL != filename) )
    600     {
    601       size_t bsize;
    602 
    603       bsize = strlen (author) + strlen (filename) + 512;
    604       if (NULL != (rbuf = malloc (bsize)))
    605       {
    606         int snret;
    607 
    608         snret = snprintf (rbuf,
    609                           bsize,
    610                           _ ("Revision #%u: Author `%s' worked on `%s'"),
    611                           i,
    612                           author,
    613                           filename);
    614         if ( (-1 != snret) &&
    615              (bsize > (size_t) snret) )
    616         {
    617           ret = add_metadata (proc,
    618                               proc_cls,
    619                               rbuf,
    620                               EXTRACTOR_METATYPE_REVISION_HISTORY);
    621         }
    622         free (rbuf);
    623       }
    624     }
    625     if (NULL != author)
    626       free (author);
    627     if (NULL != filename)
    628       free (filename);
    629     if (0 != ret)
    630       break;
    631   }
    632   free (lbuffer);
    633   return ret;
    634 }
    635 
    636 
    637 /* *************************** custom GSF input method ***************** */
    638 
    /* GObject type, cast and type-check helpers for the LeInput class */
    #define LE_TYPE_INPUT \
      (le_input_get_type ())
    #define LE_INPUT(obj) \
      (G_TYPE_CHECK_INSTANCE_CAST ((obj), LE_TYPE_INPUT, LeInput))
    #define LE_INPUT_CLASS(klass) \
      (G_TYPE_CHECK_CLASS_CAST ((klass), LE_TYPE_INPUT, LeInputClass))
    #define IS_LE_INPUT(obj) \
      (G_TYPE_CHECK_INSTANCE_TYPE ((obj), LE_TYPE_INPUT))
    #define IS_LE_INPUT_CLASS(klass) \
      (G_TYPE_CHECK_CLASS_TYPE ((klass), LE_TYPE_INPUT))
    #define LE_INPUT_GET_CLASS(obj) \
      (G_TYPE_INSTANCE_GET_CLASS ((obj), LE_TYPE_INPUT, LeInputClass))
    653 
    /**
     * Private, per-instance state of an "LeInput" object.
     */
    struct _LeInputPrivate
    {
      /**
       * Extraction context this input operates on.
       */
      struct EXTRACTOR_ExtractContext *ec;
    };

    typedef struct _LeInputPrivate LeInputPrivate;
    664 
    665 
    666 /**
    667  * Overall state of an "LeInput" object.
    668  */
    669 typedef struct _LeInput
    670 {
    671   /**
    672    * Inherited state from parent (GsfInput).
    673    */
    674   GsfInput input;
    675 
    676   /*< private > */
    677   /**
    678    * Private state of the LeInput.
    679    */
    680   LeInputPrivate *priv;
    681 } LeInput;
    682 
    683 
    684 /**
    685  * LeInput's class state.
    686  */
    687 typedef struct _LeInputClass
    688 {
    689   /**
    690    * GsfInput is our parent class.
    691    */
    692   GsfInputClass parent_class;
    693 
    694   /* Padding for future expansion */
    695   void (*_gtk_reserved1)(void);
    696   void (*_gtk_reserved2)(void);
    697   void (*_gtk_reserved3)(void);
    698   void (*_gtk_reserved4)(void);
    699 } LeInputClass;
    700 
    701 
    702 /**
    703  * Constructor for LeInput objects.
    704  *
    705  * @param ec extraction context to use
    706  * @return the LeInput, NULL on error
    707  */
    708 GsfInput *
    709 le_input_new (struct EXTRACTOR_ExtractContext *ec);
    710 
    711 
    712 /**
    713  * Class initializer for the "LeInput" class.
    714  *
    715  * @param class class object to initialize
    716  */
    717 static void
    718 le_input_class_init (LeInputClass *class);
    719 
    720 
    721 /**
    722  * Initialize internal state of fresh input object.
    723  *
    724  * @param input object to initialize
    725  */
    726 static void
    727 le_input_init (LeInput *input);
    728 
    729 
    730 /**
    731  * Macro to create LeInput type definition and register the class.
    732  */
    733 GSF_CLASS (LeInput, le_input, le_input_class_init, le_input_init,
    734            GSF_INPUT_TYPE)
    735 
    736 
    737 /**
    738  * Duplicate input, leaving the new one at the same offset.
    739  *
    740  * @param input the input to duplicate
    741  * @param err location for error reporting, can be NULL
    742  * @return NULL on error (always)
    743  */
    744 static GsfInput *
    745 le_input_dup (GsfInput * input,
    746               GError * *err)
    747 {
    748   if (NULL != err)
    749     *err = g_error_new (gsf_input_error_id (), 0,
    750                         "dup not supported on LeInput");
    751   return NULL;
    752 }
    753 
    754 
    755 /**
    756  * Read at least num_bytes. Does not change the current position if
    757  * there is an error. Will only read if the entire amount can be
    758  * read. Invalidates the buffer associated with previous calls to
    759  * gsf_input_read.
    760  *
    761  * @param input
    762  * @param num_bytes
    763  * @param optional_buffer
    764  * @return buffer where num_bytes data are available, or NULL on error
    765  */
    766 static const guint8 *
    767 le_input_read (GsfInput *input,
    768                size_t num_bytes,
    769                guint8 *optional_buffer)
    770 {
    771   LeInput *li = LE_INPUT (input);
    772   struct EXTRACTOR_ExtractContext *ec;
    773   void *buf;
    774   uint64_t old_off;
    775   ssize_t ret;
    776 
    777   ec = li->priv->ec;
    778   old_off = ec->seek (ec->cls, 0, SEEK_CUR);
    779   if (num_bytes
    780       != (ret = ec->read (ec->cls,
    781                           &buf,
    782                           num_bytes)))
    783   {
    784     /* we don't support partial reads;
    785  most other GsfInput implementations in this case
    786  allocate some huge temporary buffer just to avoid
    787  the partial read; we might need to do that as well!? */
    788     ec->seek (ec->cls, SEEK_SET, old_off);
    789     return NULL;
    790   }
    791   if (NULL != optional_buffer)
    792   {
    793     memcpy (optional_buffer, buf, num_bytes);
    794     return optional_buffer;
    795   }
    796   return buf;
    797 }
    798 
    799 
    800 /**
    801  * Move the current location in an input stream
    802  *
    803  * @param input stream to seek
    804  * @param offset target offset
    805  * @param whence determines to what the offset is relative to
    806  * @return TRUE on error
    807  */
    808 static gboolean
    809 le_input_seek (GsfInput *input,
    810                gsf_off_t offset,
    811                GSeekType whence)
    812 {
    813   LeInput *li = LE_INPUT (input);
    814   struct EXTRACTOR_ExtractContext *ec;
    815   int w;
    816   int64_t ret;
    817 
    818   ec = li->priv->ec;
    819   switch (whence)
    820   {
    821   case G_SEEK_SET:
    822     w = SEEK_SET;
    823     break;
    824   case G_SEEK_CUR:
    825     w = SEEK_CUR;
    826     break;
    827   case G_SEEK_END:
    828     w = SEEK_END;
    829     break;
    830   default:
    831     return TRUE;
    832   }
    833   if (-1 ==
    834       (ret = ec->seek (ec->cls,
    835                        offset,
    836                        w)))
    837     return TRUE;
    838   return FALSE;
    839 }
    840 
    841 
    842 /**
    843  * Class initializer for the "LeInput" class.
    844  *
    845  * @param class class object to initialize
    846  */
    847 static void
    848 le_input_class_init (LeInputClass *class)
    849 {
    850   GsfInputClass *input_class;
    851 
    852   input_class = (GsfInputClass *) class;
    853   input_class->Dup = le_input_dup;
    854   input_class->Read = le_input_read;
    855   input_class->Seek = le_input_seek;
    856   g_type_class_add_private (class, sizeof (LeInputPrivate));
    857 }
    858 
    859 
    860 /**
    861  * Initialize internal state of fresh input object.
    862  *
    863  * @param input object to initialize
    864  */
    865 static void
    866 le_input_init (LeInput *input)
    867 {
    868   LeInputPrivate *priv;
    869 
    870   input->priv =
    871     G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT,
    872                                  LeInputPrivate);
    873   priv = input->priv;
    874   priv->ec = NULL;
    875 }
    876 
    877 
    878 /**
    879  * Creates a new LeInput object.
    880  *
    881  * @param ec extractor context to wrap
    882  * @return NULL on error
    883  */
    884 GsfInput *
    885 le_input_new (struct EXTRACTOR_ExtractContext *ec)
    886 {
    887   LeInput *input;
    888 
    889   input = g_object_new (LE_TYPE_INPUT, NULL);
    890   gsf_input_set_size (GSF_INPUT (input),
    891                       ec->get_size (ec->cls));
    892   gsf_input_seek_emulate (GSF_INPUT (input),
    893                           0);
    894   input->input.name = NULL;
    895   input->input.container = NULL;
    896   input->priv->ec = ec;
    897 
    898   return GSF_INPUT (input);
    899 }
    900 
    901 
    902 /* *********************** end of custom GSF input method ************* */
    903 
    904 
    905 /**
    906  * Main entry method for the OLE2 extraction plugin.
    907  *
    908  * @param ec extraction context provided to the plugin
    909  */
    910 void
    911 EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
    912 {
    913   GsfInput *input;
    914   GsfInfile *infile;
    915   GsfInput *src;
    916   const char *name;
    917   unsigned int i;
    918   unsigned int lcb;
    919   unsigned int fcb;
    920   const unsigned char *data512;
    921   unsigned int lid;
    922   const char *lang;
    923   int ret;
    924   void *data;
    925   uint64_t fsize;
    926   ssize_t data_size;
    927 
    928   fsize = ec->get_size (ec->cls);
    929   if (fsize < 512 + 898)
    930   {
    931     /* File too small for OLE2 */
    932     return;   /* can hardly be OLE2 */
    933   }
    934   if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize)))
    935   {
    936     /* Failed to read minimum file size to buffer */
    937     return;
    938   }
    939   data512 = (const unsigned char*) data + 512;
    940   lid = data512[6] + (data512[7] << 8);
    941   if ( (NULL != (lang = lid_to_language (lid))) &&
    942        (0 != (ret = add_metadata (ec->proc, ec->cls,
    943                                   lang,
    944                                   EXTRACTOR_METATYPE_LANGUAGE))) )
    945     return;
    946   lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16)
    947         + (data512[729] << 24);
    948   fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16)
    949         + (data512[725] << 24);
    950   if (0 != ec->seek (ec->cls, 0, SEEK_SET))
    951   {
    952     /* seek failed!? */
    953     return;
    954   }
    955 #if USE_LE_INPUT
    956   if (NULL == (input = le_input_new (ec)))
    957   {
    958     fprintf (stderr, "le_input_new failed\n");
    959     return;
    960   }
    961 #else
    962   input = gsf_input_memory_new ((const guint8 *) data,
    963                                 data_size,
    964                                 FALSE);
    965 #endif
    966   if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
    967   {
    968     g_object_unref (G_OBJECT (input));
    969     return;
    970   }
    971   ret = 0;
    972   for (i = 0; i<gsf_infile_num_children (infile); i++)
    973   {
    974     if (0 != ret)
    975       break;
    976     if (NULL == (name = gsf_infile_name_by_index (infile, i)))
    977       continue;
    978     src = NULL;
    979     if ( ( (0 == strcmp (name, "\005SummaryInformation")) ||
    980            (0 == strcmp (name, "\005DocumentSummaryInformation")) ) &&
    981          (NULL != (src = gsf_infile_child_by_index (infile, i))) )
    982       ret = process (src,
    983                      ec->proc,
    984                      ec->cls);
    985     if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
    986          (NULL != (src = gsf_infile_child_by_index (infile, i))) )
    987       ret = process_star_office (src,
    988                                  ec->proc,
    989                                  ec->cls);
    990     if (NULL != src)
    991       g_object_unref (G_OBJECT (src));
    992   }
    993   if (0 != ret)
    994     goto CLEANUP;
    995 
    996   if (lcb < 6)
    997     goto CLEANUP;
    998   for (i = 0; i<gsf_infile_num_children (infile); i++)
    999   {
   1000     if (ret != 0)
   1001       break;
   1002     if (NULL == (name = gsf_infile_name_by_index (infile, i)))
   1003       continue;
   1004     if ( ( (0 == strcmp (name, "1Table")) ||
   1005            (0 == strcmp (name, "0Table")) ) &&
   1006          (NULL != (src = gsf_infile_child_by_index (infile, i))) )
   1007     {
   1008       ret = history_extract (src,
   1009                              lcb,
   1010                              fcb,
   1011                              ec->proc, ec->cls);
   1012       g_object_unref (G_OBJECT (src));
   1013     }
   1014   }
   1015 CLEANUP:
   1016   g_object_unref (G_OBJECT (infile));
   1017   g_object_unref (G_OBJECT (input));
   1018 }
   1019 
   1020 
   1021 /**
   1022  * Custom log function we give to GSF to disable logging.
   1023  *
   1024  * @param log_domain unused
   1025  * @param log_level unused
   1026  * @param message unused
   1027  * @param user_data unused
   1028  */
   1029 static void
   1030 nolog (const gchar *log_domain,
   1031        GLogLevelFlags log_level,
   1032        const gchar *message,
   1033        gpointer user_data)
   1034 {
   1035   /* do nothing */
   1036 }
   1037 
   1038 
   1039 /**
   1040  * OLE2 plugin constructor. Initializes glib and gsf, in particular
   1041  * gsf logging is disabled.
   1042  */
   1043 void __attribute__ ((constructor))
   1044 ole2_ltdl_init ()
   1045 {
   1046 #if ! GLIB_CHECK_VERSION (2, 35, 0)
   1047   g_type_init ();
   1048 #endif
   1049 #ifdef HAVE_GSF_INIT
   1050   gsf_init ();
   1051 #endif
   1052   /* disable logging -- thanks, Jody! */
   1053   g_log_set_handler ("libgsf:msole",
   1054                      G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,
   1055                      &nolog, NULL);
   1056 }
   1057 
   1058 
   1059 /**
   1060  * OLE2 plugin destructor.  Shutdown of gsf.
   1061  */
   1062 void __attribute__ ((destructor))
   1063 ole2_ltdl_fini ()
   1064 {
   1065 #ifdef HAVE_GSF_INIT
   1066   gsf_shutdown ();
   1067 #endif
   1068 }
   1069 
   1070 
   1071 /* end of ole2_extractor.c */