ole2 - libextractor - GNU libextractor

commit 509826a244154dacbe1c95e81d314b95ed449f7d
parent aa056e44077dc7ed731cdfc1dcdcdd1d18b6004d
Author: Christian Grothoff <christian@grothoff.org>
Date:   Wed, 16 Dec 2009 14:14:01 +0000

ole2

Diffstat:
M configure.ac  | 1 -
M src/include/extractor.h  | 25 ++++++++++++-------------
M src/main/extractor_metatypes.c  | 24 ++++++++++++++++++++++++
M src/plugins/Makefile.am  | 15 +++++++++++++--
M src/plugins/man_extractor.c  | 2 +-
D src/plugins/ole2/INFO  | 6 ------
D src/plugins/ole2/Makefile.am  | 16 ----------------
D src/plugins/ole2/README  | 25 -------------------------
D src/plugins/ole2/ole2extractor.c  | 584 -------------------------------------------------------------------------------
A src/plugins/ole2_extractor.c  | 599 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/plugins/pdf_extractor.cc  | 7 ++++++-
M src/plugins/rpm_extractor.c  | 2 +-

12 files changed, 656 insertions(+), 650 deletions(-)
diff --git a/configure.ac b/configure.ac
@@ -556,7 +556,6 @@ src/intlemu/Makefile
 src/common/Makefile
 src/main/Makefile
 src/plugins/Makefile
-src/plugins/ole2/Makefile
 src/plugins/oo/Makefile
 src/plugins/printable/Makefile
 src/plugins/hash/Makefile
diff --git a/src/include/extractor.h b/src/include/extractor.h
@@ -237,13 +237,23 @@ enum EXTRACTOR_MetaType
 
     /* image specifics */
     EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 112, 
-
-
     EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE = 113, 
     EXTRACTOR_METATYPE_THUMBNAIL = 114,
     EXTRACTOR_METATYPE_IMAGE_RESOLUTION = 115,
     EXTRACTOR_METATYPE_SOURCE = 116,
 
+    /* (text) document processing specifics */
+    EXTRACTOR_METATYPE_CHARACTER_SET = 117,
+    EXTRACTOR_METATYPE_LINE_COUNT = 118,
+    EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 119,
+    EXTRACTOR_METATYPE_WORD_COUNT = 120,
+    EXTRACTOR_METATYPE_CHARACTER_COUNT = 121,
+    EXTRACTOR_METATYPE_PAGE_ORIENTATION = 122,
+    EXTRACTOR_METATYPE_PAPER_SIZE = 123,
+    EXTRACTOR_METATYPE_TEMPLATE = 124,
+    EXTRACTOR_METATYPE_COMPANY = 125,
+    EXTRACTOR_METATYPE_MANAGER = 126,
+    EXTRACTOR_METATYPE_REVISION_NUMBER = 127,
     
     /* fixme: used up to here! */
     EXTRACTOR_METATYPE_SCALE = 108,
@@ -251,14 +261,6 @@ enum EXTRACTOR_MetaType
 
 
     /* FIXME: transcribe & renumber those below */
-    /* (text) document processing specifics */
-    EXTRACTOR_METATYPE_CHARACTER_SET = 104,
-    EXTRACTOR_METATYPE_LINE_COUNT = 105,
-    EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 106,
-    EXTRACTOR_METATYPE_WORD_COUNT = 93,
-    EXTRACTOR_METATYPE_CHARACTER_COUNT = 94,
-    EXTRACTOR_METATYPE_PAGE_ORIENTATION = 35,
-    EXTRACTOR_METATYPE_PAPER_SIZE = 36,
     EXTRACTOR_METATYPE_USED_FONTS = 37,
     EXTRACTOR_METATYPE_PAGE_ORDER = 38,
 
@@ -312,10 +314,7 @@ enum EXTRACTOR_MetaType
     EXTRACTOR_METATYPE_OWNER = 66,
     EXTRACTOR_METATYPE_MEDIA_TYPE = 68,
     EXTRACTOR_METATYPE_CONTACT = 69,
-    EXTRACTOR_METATYPE_TEMPLATE = 88,
     EXTRACTOR_METATYPE_SECURITY = 97,
-    EXTRACTOR_METATYPE_COMPANY = 102,
-    EXTRACTOR_METATYPE_MANAGER = 109,
     EXTRACTOR_METATYPE_INFORMATION = 112,
     EXTRACTOR_METATYPE_FULL_NAME = 113,
     EXTRACTOR_METATYPE_LINK = 116,
diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c
@@ -294,6 +294,30 @@ static const struct MetaTypeDescription meta_type_descriptions[] = {
     gettext_noop ("resolution in dots per inch") }, 
   { gettext_noop ("source"),
     gettext_noop ("Originating entity") }, 
+  { gettext_noop ("character set"), /* 117: entries follow the order of enum EXTRACTOR_MetaType */
+    gettext_noop ("character encoding used") }, 
+  { gettext_noop ("line count"),
+    gettext_noop ("number of lines") }, 
+  { gettext_noop ("paragraph count"),
+    gettext_noop ("number of paragraphs") }, 
+  { gettext_noop ("word count"),
+    gettext_noop ("number of words") }, 
+  { gettext_noop ("character count"), /* 121: keeps the entries below aligned with the enum */
+    gettext_noop ("number of characters") }, 
+  { gettext_noop ("page orientation"),
+    gettext_noop ("") }, 
+  { gettext_noop ("paper size"),
+    gettext_noop ("") }, 
+  { gettext_noop ("template"),
+    gettext_noop ("template the document uses or is based on") }, 
+  { gettext_noop ("company"),
+    gettext_noop ("") }, 
+  { gettext_noop ("manager"),
+    gettext_noop ("") }, 
+  { gettext_noop ("revision number"),
+    gettext_noop ("") }, 
+  { gettext_noop (""),
+    gettext_noop ("") }, 
   { gettext_noop (""),
     gettext_noop ("") }, 
   { gettext_noop (""),
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
@@ -13,7 +13,7 @@ endif
 
 if HAVE_GLIB
 if WITH_GSF
- oledir=ole2
+ ole2=libextractor_ole2.la
 endif
 if HAVE_GTK
  thumbgtk=libextractor_thumbnailgtk.la
@@ -58,7 +58,7 @@ endif
 
 # toggle for development
 SUBDIRS = . 
-# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir)
+# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash
 
 
 if HAVE_VORBISFILE
@@ -95,6 +95,7 @@ plugin_LTLIBRARIES = \
   libextractor_jpeg.la \
   libextractor_man.la \
   libextractor_mime.la \
+  $(ole2) \
   $(pdf) \
   $(rpm) \
   $(thumbgtk)
@@ -176,6 +177,16 @@ libextractor_mime_la_SOURCES = \
 libextractor_mime_la_LDFLAGS = \
   $(PLUGINFLAGS)
 
+libextractor_ole2_la_SOURCES =  \
+  ole2_extractor.c
+libextractor_ole2_la_CFLAGS = \
+  $(GSF_CFLAGS) 
+libextractor_ole2_la_LIBADD = \
+  $(LIBADD) $(GSF_LIBS) \
+  $(top_builddir)/src/common/libextractor_common.la 
+libextractor_ole2_la_LDFLAGS = \
+  $(PLUGINFLAGS) 
+
 libextractor_pdf_la_SOURCES = \
   pdf_extractor.cc 
 libextractor_pdf_la_LDFLAGS = \
diff --git a/src/plugins/man_extractor.c b/src/plugins/man_extractor.c
@@ -1,6 +1,6 @@
 /*
      This file is part of libextractor.
-     (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
+     (C) 2002, 2003, 2004, 2009 Vidyut Samanta and Christian Grothoff
 
      libextractor is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published
diff --git a/src/plugins/ole2/INFO b/src/plugins/ole2/INFO
@@ -1,6 +0,0 @@
-Most of the code in this directory comes from
-libgsf 1.10.1 (Licensed under GPL/LGPL).
-
-libgsf -- The G Structured File Library
-Jody Goldberg <jody@gnome.org>
-
diff --git a/src/plugins/ole2/Makefile.am b/src/plugins/ole2/Makefile.am
@@ -1,16 +0,0 @@
-include ../Makefile-plugins.am
-
-
-plugin_LTLIBRARIES = \
-  libextractor_ole2.la
-
-libextractor_ole2_la_CFLAGS = \
-  $(GSF_CFLAGS) 
-libextractor_ole2_la_LIBADD = \
-  $(LIBADD) $(GSF_LIBS) \
-  $(top_builddir)/src/common/libextractor_common.la \
-  $(top_builddir)/src/main/libextractor.la
-libextractor_ole2_la_LDFLAGS = \
-  $(PLUGINFLAGS) $(retaincommand) 
-libextractor_ole2_la_SOURCES =  \
-        ole2extractor.c
diff --git a/src/plugins/ole2/README b/src/plugins/ole2/README
@@ -1,25 +0,0 @@
-WordLeaker v.0.1 (c) 2005
- by Madelman (http://elligre.tk/madelman/)
- 
-Shows information about a Word file.
-It can show all the summary and the revision history of the file.
-
-It should be portable but, for now, it doesn't work in Linux. I haven't had
-the time to debug it but I'll do when I can.
-
-There are a lot of things that don't work yet, if you want to help contact me.
-
-Copyright and License
-=====================
-
-WordLeaker v.0.1 (c) 2005 by Madelman (madelman@iname.com)
-
-WordLeaker program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2 of the License, or (at your option) any
-later version. 
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
diff --git a/src/plugins/ole2/ole2extractor.c b/src/plugins/ole2/ole2extractor.c
@@ -1,584 +0,0 @@
-/*
-     This file is part of libextractor.
-     (C) 2004, 2005, 2006, 2007 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-
-     This code makes extensive use of libgsf
-     -- the Gnome Structured File Library
-     Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
-
-     Part of this code was borrowed from wordleaker.cpp. See also
-     the README file in this directory.
-*/
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-
-#include <glib-object.h>
-#include <string.h>
-#include <stdio.h>
-#include <ctype.h>
-
-#include <gsf/gsf-utils.h>
-#include <gsf/gsf-input-memory.h>
-#include <gsf/gsf-infile.h>
-#include <gsf/gsf-infile-msole.h>
-#include <gsf/gsf-msole-utils.h>
-
-#define DEBUG_OLE2 0
-
-/* ******************************** main extraction code ************************ */
-
-static struct EXTRACTOR_Keywords *
-addKeyword(EXTRACTOR_KeywordList *oldhead,
-	   const char *phrase,
-	   EXTRACTOR_KeywordType type) {
-  EXTRACTOR_KeywordList * keyword;
-
-  if (strlen(phrase) == 0)
-    return oldhead;
-  if (0 == strcmp(phrase, "\"\""))
-    return oldhead;
-  if (0 == strcmp(phrase, "\" \""))
-    return oldhead;
-  if (0 == strcmp(phrase, " "))
-    return oldhead;
-  keyword = malloc(sizeof(EXTRACTOR_KeywordList));
-  keyword->next = oldhead;
-  keyword->keyword = strdup(phrase);
-  keyword->keywordType = type;
-  return keyword;
-}
-
-
-#if 0
-static guint8 const component_guid [] = {
-	0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
-	0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
-};
-
-static guint8 const document_guid [] = {
-	0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
-	0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
-};
-
-static guint8 const user_guid [] = {
-	0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
-	0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
-};
-#endif
-
-typedef struct {
-  char * text;
-  EXTRACTOR_KeywordType type;
-} Matches;
-
-static Matches tmap[] = {
-  { "Title", EXTRACTOR_TITLE },
-  { "PresentationFormat", EXTRACTOR_FORMAT },
-  { "Category", EXTRACTOR_DESCRIPTION },
-  { "Manager", EXTRACTOR_MANAGER },
-  { "Company", EXTRACTOR_COMPANY },
-  { "Subject", EXTRACTOR_SUBJECT },
-  { "Author", EXTRACTOR_AUTHOR },
-  { "Keywords", EXTRACTOR_KEYWORDS },
-  { "Comments", EXTRACTOR_COMMENT },
-  { "Template", EXTRACTOR_TEMPLATE },
-  { "NumPages", EXTRACTOR_PAGE_COUNT },
-  { "AppName", EXTRACTOR_SOFTWARE },
-  { "RevisionNumber", EXTRACTOR_VERSIONNUMBER },
-  { "Dictionary", EXTRACTOR_LANGUAGE },
-  { "NumBytes", EXTRACTOR_SIZE },
-  { "CreatedTime", EXTRACTOR_CREATION_DATE },
-  { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE },
-  { "gsf:company", EXTRACTOR_COMPANY },
-  /*  { "gsf:security", EXTRACTOR_SECURITY }, */
-  { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT },
-  { "gsf:page-count", EXTRACTOR_PAGE_COUNT },
-  { "gsf:line-count", EXTRACTOR_LINE_COUNT },
-  { "gsf:word-count", EXTRACTOR_WORD_COUNT },
-  { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT },
-  { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY },
-  /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
-  { "gsf:manager", EXTRACTOR_MANAGER },
-  { "dc:title", EXTRACTOR_TITLE },
-  { "dc:creator", EXTRACTOR_CREATOR },
-  { "dc:date", EXTRACTOR_DATE },
-  { "dc:subject", EXTRACTOR_SUBJECT },
-  { "dc:keywords", EXTRACTOR_KEYWORDS },
-  { "dc:last-printed", EXTRACTOR_LAST_PRINTED },
-  { "dc:description", EXTRACTOR_DESCRIPTION },
-  { "meta:creation-date", EXTRACTOR_CREATION_DATE },
-  /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */
-  { "meta:generator", EXTRACTOR_GENERATOR },
-  { "meta:template", EXTRACTOR_TEMPLATE },
-  /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */
-  /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
-  { NULL, 0 },
-};
-
-static void processMetadata(gpointer key,
-			    gpointer value,
-			    gpointer user_data) {
-  struct EXTRACTOR_Keywords ** pprev = user_data;
-  const char * type = key;
-  const GsfDocProp * prop = value;
-  const GValue * gval;
-  char * contents;
-  int pos;
-
-  if ( (key == NULL) ||
-       (value == NULL) )
-    return;
-  gval = gsf_doc_prop_get_val(prop);
-
-  if (G_VALUE_TYPE(gval) == G_TYPE_STRING) {
-    contents = strdup(g_value_get_string(gval));
-  } else {
-    /* convert other formats? */
-    contents = g_strdup_value_contents(gval);
-  }
-  if (contents == NULL)
-    return;
-  if ( (strlen(contents) > 0) &&
-       (contents[strlen(contents)-1] == '\n') )
-    contents[strlen(contents)-1] = '\0';
-  pos = 0;
-  while (tmap[pos].text != NULL) {
-    if (0 == strcmp(tmap[pos].text,
-		    type))
-      break;
-    pos++;
-  }
-  if (tmap[pos].text != NULL)
-    *pprev = addKeyword(*pprev,
-			contents,
-			tmap[pos].type);
-#if DEBUG_OLE2
-  else
-    printf("No match for type `%s'\n",
-	   type);
-#endif
-  free(contents);
-}
-
-
-static struct EXTRACTOR_Keywords *
-process(GsfInput * in,
-	struct EXTRACTOR_Keywords * prev) {
-  GsfDocMetaData * sections;
-  GError * error;
-
-  sections = gsf_doc_meta_data_new();
-  error = gsf_msole_metadata_read(in, sections);
-  if (error == NULL) {
-    gsf_doc_meta_data_foreach(sections,
-			      &processMetadata,
-			      &prev);
-  }
-  g_object_unref(G_OBJECT(sections));
-  return prev;
-}
-
-static struct EXTRACTOR_Keywords *
-processSO(GsfInput * src,
-	  struct EXTRACTOR_Keywords * prev) {
-  off_t size;
-  char * buf;
-
-  size = gsf_input_size(src);
-  if (size < 0x374) /* == 0x375?? */
-    return prev;
-  buf = malloc(size);
-  gsf_input_read(src, size, (unsigned char*) buf);
-  if ( (buf[0] != 0x0F) ||
-       (buf[1] != 0x0) ||
-       (0 != strncmp(&buf[2],
-		     "SfxDocumentInfo",
-		     strlen("SfxDocumentInfo"))) ||
-       (buf[0x11] != 0x0B) ||
-       (buf[0x13] != 0x00) || /* pw protected! */
-       (buf[0x12] != 0x00) ) {
-    free(buf);
-    return prev;
-  }
-  buf[0xd3] = '\0';
-  if (buf[0x94] + buf[0x93] > 0)
-    prev = addKeyword(prev,
-		      &buf[0x95],
-		      EXTRACTOR_TITLE);
-  buf[0x114] = '\0';
-  if (buf[0xd5] + buf[0xd4] > 0)
-    prev = addKeyword(prev,
-		      &buf[0xd6],
-		      EXTRACTOR_SUBJECT);
-  buf[0x215] = '\0';
-  if (buf[0x115] + buf[0x116] > 0)
-    prev = addKeyword(prev,
-		      &buf[0x117],
-		      EXTRACTOR_COMMENT);
-  buf[0x296] = '\0';
-  if (buf[0x216] + buf[0x217] > 0)
-    prev = addKeyword(prev,
-		      &buf[0x218],
-		      EXTRACTOR_KEYWORDS);
-  /* fixme: do timestamps,
-     mime-type, user-defined info's */
-
-  free(buf);
-  return prev;
-}
-
-/* *************** wordleaker stuff *************** */
-
-#define __(a) dgettext("iso-639", a)
-
-static const char * lidToLanguage( unsigned int lid ) {
-  switch ( lid ) {
-  case 0x0400:
-    return _("No Proofing");
-  case 0x0401:
-    return __("Arabic");
-  case 0x0402:
-    return __("Bulgarian");
-  case 0x0403:
-    return __("Catalan");
-  case 0x0404:
-    return _("Traditional Chinese");
-  case 0x0804:
-    return _("Simplified Chinese");
-  case 0x0405:
-    return __("Chechen");
-  case 0x0406:
-    return __("Danish");
-  case 0x0407:
-    return __("German");
-  case 0x0807:
-    return _("Swiss German");
-  case 0x0408:
-    return __("Greek");
-  case 0x0409:
-    return _("U.S. English");
-  case 0x0809:
-    return _("U.K. English");
-  case 0x0c09:
-    return _("Australian English");
-  case 0x040a:
-    return _("Castilian Spanish");
-  case 0x080a:
-    return _("Mexican Spanish");
-  case 0x040b:
-    return __("Finnish");
-  case 0x040c:
-    return __("French");
-  case 0x080c:
-    return _("Belgian French");
-  case 0x0c0c:
-    return _("Canadian French");
-  case 0x100c:
-    return _("Swiss French");
-  case 0x040d:
-    return __("Hebrew");
-  case 0x040e:
-    return __("Hungarian");
-  case 0x040f:
-    return __("Icelandic");
-  case 0x0410:
-    return __("Italian");
-  case 0x0810:
-    return _("Swiss Italian");
-  case 0x0411:
-    return __("Japanese");
-  case 0x0412:
-    return __("Korean");
-  case 0x0413:
-    return __("Dutch");
-  case 0x0813:
-    return _("Belgian Dutch");
-  case 0x0414:
-    return _("Norwegian Bokmal");
-  case 0x0814:
-    return __("Norwegian Nynorsk");
-  case 0x0415:
-    return __("Polish");
-  case 0x0416:
-    return __("Brazilian Portuguese");
-  case 0x0816:
-    return __("Portuguese");
-  case 0x0417:
-    return _("Rhaeto-Romanic");
-  case 0x0418:
-    return __("Romanian");
-  case 0x0419:
-    return __("Russian");
-  case 0x041a:
-    return _("Croato-Serbian (Latin)");
-  case 0x081a:
-    return _("Serbo-Croatian (Cyrillic)");
-  case 0x041b:
-    return __("Slovak");
-  case 0x041c:
-    return __("Albanian");
-  case 0x041d:
-    return __("Swedish");
-  case 0x041e:
-    return __("Thai");
-  case 0x041f:
-    return __("Turkish");
-  case 0x0420:
-    return __("Urdu");
-  case 0x0421:
-    return __("Bahasa");
-  case 0x0422:
-    return __("Ukrainian");
-  case 0x0423:
-    return __("Byelorussian");
-  case 0x0424:
-    return __("Slovenian");
-  case 0x0425:
-    return __("Estonian");
-  case 0x0426:
-    return __("Latvian");
-  case 0x0427:
-    return __("Lithuanian");
-  case 0x0429:
-    return _("Farsi");
-  case 0x042D:
-    return __("Basque");
-  case 0x042F:
-    return __("Macedonian");
-  case 0x0436:
-    return __("Afrikaans");
-  case 0x043E:
-    return __("Malayalam");
-  default:
-    return NULL;
-  }
-}
-
-
-static struct EXTRACTOR_Keywords *
-history_extract(GsfInput * stream,
-		unsigned int lcbSttbSavedBy,
-		unsigned int fcSttbSavedBy,
-		struct EXTRACTOR_Keywords * prev) {
-  unsigned int where = 0;
-  unsigned char * lbuffer;
-  unsigned int i;
-  unsigned int length;
-  char * author;
-  char * filename;
-  char * rbuf;
-  unsigned int nRev;
-
-  // goto offset of revision
-  gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET);
-  if (gsf_input_remaining(stream) < lcbSttbSavedBy)
-    return prev;
-  lbuffer = malloc(lcbSttbSavedBy);
-  // read all the revision history
-  gsf_input_read(stream, lcbSttbSavedBy, lbuffer);
-  // there are n strings, so n/2 revisions (author & file)
-  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
-  where = 6;
-  for (i=0; i < nRev; i++) {
-    if (where >= lcbSttbSavedBy)
-      break;
-    length = lbuffer[where++];
-    if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
-	 (where + 2 * length + 2 <= where) )
-      break;
-    author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
-			   length * 2,
-			   "UTF-16BE");
-    where += length * 2 + 1;
-    length = lbuffer[where++];
-    if ( (where + 2 * length >= lcbSttbSavedBy) ||
-	 (where + 2 * length + 1 <= where) ) {
-      free(author);
-      break;
-    }
-    filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
-			     length * 2,
-			     "UTF-16BE");
-    where += length * 2 + 1;
-    rbuf = malloc(strlen(author) + strlen(filename) + 512);
-    snprintf(rbuf, 512 + strlen(author) + strlen(filename),
-	     _("Revision #%u: Author '%s' worked on '%s'"),
-	     i, author, filename);
-    free(author);
-    free(filename);
-    prev = addKeyword(prev,
-		      rbuf,
-		      EXTRACTOR_REVISION_HISTORY);
-    free(rbuf);
-  }
-  free(lbuffer);
-  return prev;
-}
-
-
-/* ************** main method *********** */
-
-struct EXTRACTOR_Keywords *
-libextractor_ole2_extract(const char * filename,
-			  const char * data,
-			  size_t size,
-			  struct EXTRACTOR_Keywords * prev) {
-  GsfInput * input;
-  GsfInfile * infile;
-  GsfInput * src;
-  const char * name;
-  const char * generator = NULL;
-  int i;
-  unsigned int lcb;
-  unsigned int fcb;
-  const unsigned char * data512;
-  unsigned int lid;
-  const char * lang;
-
-  if (size < 512 + 898)
-    return prev; /* can hardly be OLE2 */
-  input = gsf_input_memory_new((const guint8 *) data,
-			       (gsf_off_t) size,
-			       FALSE);
-  if (input == NULL)
-    return prev;
-
-  infile = gsf_infile_msole_new(input, NULL);
-  if (infile == NULL) {
-    g_object_unref(G_OBJECT(input));
-    return prev;
-  }
-  lcb = 0;
-  fcb = 0;
-  for (i=0;i<gsf_infile_num_children(infile);i++) {
-    name = gsf_infile_name_by_index (infile, i);
-    src = NULL;
-    if (name == NULL)
-      continue;
-    if ( (0 == strcmp(name, "\005SummaryInformation"))
-	 || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
-      src = gsf_infile_child_by_index (infile, i);
-      if (src != NULL)
-	prev = process(src,
-		       prev);
-    }
-    if (0 == strcmp(name, "SfxDocumentInfo")) {
-      src = gsf_infile_child_by_index (infile, i);
-      if (src != NULL)
-	prev = processSO(src,
-			 prev);
-    }
-    if (src != NULL)
-      g_object_unref(G_OBJECT(src));
-  }
-
-  data512 = (const unsigned char*) &data[512];
-  lid = data512[6] + (data512[7] << 8);
-  lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
-  fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
-  lang = lidToLanguage(lid);
-  if (lang != NULL) {
-    prev = addKeyword(prev,
-		      lang,
-		      EXTRACTOR_LANGUAGE);
-  }
-  if (lcb >= 6) {
-    for (i=0;i<gsf_infile_num_children(infile);i++) {
-      name = gsf_infile_name_by_index (infile, i);
-      if (name == NULL)
-	continue;
-      if ( (0 == strcmp(name, "1Table")) ||
-	   (0 == strcmp(name, "0Table")) ) {
-	src = gsf_infile_child_by_index (infile, i);
-	if (src != NULL) {
-	  prev = history_extract(src,
-				 lcb,
-				 fcb,
-				 prev);
-	  g_object_unref(G_OBJECT(src));
-	}
-      }
-    }
-  }
-  g_object_unref(G_OBJECT(infile));
-  g_object_unref(G_OBJECT(input));
-
-  /*
-   * Hack to return an appropriate mimetype
-   */
-  generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev);
-  if (NULL == generator) {
-     /*
-      * when very puzzled, just look at file magic number
-      */
-    if ( (8 < size)
-	 && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
-      generator = "Microsoft Office";
-  }
-
-  if(NULL != generator) {
-    const char * mimetype = "application/vnd.ms-files";
-
-    if((0 == strncmp(generator, "Microsoft Word", 14)) ||
-       (0 == strncmp(generator, "Microsoft Office Word", 21)))
-      mimetype = "application/msword";
-    else if((0 == strncmp(generator, "Microsoft Excel", 15)) ||
-            (0 == strncmp(generator, "Microsoft Office Excel", 22)))
-      mimetype = "application/vnd.ms-excel";
-    else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) ||
-            (0 == strncmp(generator, "Microsoft Office PowerPoint", 27)))
-      mimetype = "application/vnd.ms-powerpoint";
-    else if(0 == strncmp(generator, "Microsoft Project", 17))
-      mimetype = "application/vnd.ms-project";
-    else if(0 == strncmp(generator, "Microsoft Visio", 15))
-      mimetype = "application/vnd.visio";
-    else if(0 == strncmp(generator, "Microsoft Office", 16))
-      mimetype = "application/vnd.ms-office";
-
-    prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE);
-  }
-
-  return prev;
-}
-static void nolog (const gchar *log_domain,
-		   GLogLevelFlags log_level,
-		   const gchar *message,
-		   gpointer user_data) {
-}
-
-void __attribute__ ((constructor)) ole2_ltdl_init() {
- g_type_init();
-#ifdef HAVE_GSF_INIT
-  gsf_init();
-#endif
-  /* disable logging -- thanks, Jody! */
-  g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,  &nolog, NULL);
-  // gsf_init_dynamic(NULL);
-}
-
-void __attribute__ ((destructor)) ole2_ltdl_fini() {
-#ifdef HAVE_GSF_INIT
-  gsf_shutdown();
-#endif
-  // gsf_shutdown_dynamic(NULL);
-}
-
-/* end of ole2extractor.c */
-
diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c
@@ -0,0 +1,599 @@
+/*
+     This file is part of libextractor.
+     (C) 2004, 2005, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff
+
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+
+     This code makes extensive use of libgsf
+     -- the Gnome Structured File Library
+     Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
+
+     Part of this code was borrowed from wordleaker.cpp
+     (WordLeaker v.0.1, (c) 2005 by Madelman, GPL version 2 or later).
+*/
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+
+#include <glib-object.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <gsf/gsf-utils.h>
+#include <gsf/gsf-input-memory.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-infile-msole.h>
+#include <gsf/gsf-msole-utils.h>
+
+#define DEBUG_OLE2 0
+
+/* ******************************** main extraction code ************************ */
+
+/**
+ * Hand one value to the metadata processor, unless it is a placeholder.
+ *
+ * Values that are empty, a single blank, or the quoted forms "" and " "
+ * are dropped without calling 'proc'.
+ *
+ * @param proc callback that receives the item
+ * @param proc_cls closure argument passed through to 'proc'
+ * @param phrase 0-terminated value to report
+ * @param type meta type to report the value under
+ * @return 0 if the value was dropped, otherwise whatever 'proc' returned
+ */
+static int
+addKeyword(EXTRACTOR_MetaDataProcessor proc, void *proc_cls,
+	   const char *phrase, enum EXTRACTOR_MetaType type) {
+  const int placeholder = ('\0' == phrase[0]) || (0 == strcmp(phrase, " ")) ||
+    (0 == strcmp(phrase, "\"\"")) || (0 == strcmp(phrase, "\" \""));
+
+  return placeholder ? 0 : proc (proc_cls, "ole2", type, EXTRACTOR_METAFORMAT_UTF8,
+				 "text/plain", phrase, strlen (phrase) + 1);
+}
+
+typedef struct {   /* one row of the property-name to meta-type table 'tmap' */
+  char * text;   /* property name that processMetadata() compares its key against */
+  enum EXTRACTOR_MetaType type;   /* meta type the property value is reported under */
+} Matches;
+
+static const Matches tmap[] = {  /* read-only map: property name -> meta type, ended by a NULL name */
+  { "Title", EXTRACTOR_METATYPE_TITLE },
+  { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
+  { "Category", EXTRACTOR_METATYPE_SECTION },
+  { "Manager", EXTRACTOR_METATYPE_MANAGER },
+  { "Company", EXTRACTOR_METATYPE_COMPANY },
+  { "Subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "Comments", EXTRACTOR_METATYPE_COMMENT },
+  { "Template", EXTRACTOR_METATYPE_TEMPLATE },
+  { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
+  { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
+  { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
+  { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE },
+  { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
+  { "LastSavedTime" , EXTRACTOR_METATYPE_MODIFICATION_DATE },
+  { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
+  { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
+  { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
+  { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
+  { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
+  { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
+  { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
+  { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
+  { "dc:title", EXTRACTOR_METATYPE_TITLE },
+  { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
+  { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
+  { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
+  { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
+  { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES }, 
+  /* { "Dictionary", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE },  */
+  /* { "gsf:security", EXTRACTOR_SECURITY }, */
+  /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
+  /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
+  /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
+  { NULL, 0 }  /* sentinel that stops the lookup loop in processMetadata() */
+};
+
+
+struct ProcContext   /* state handed by process() to the processMetadata() callback */
+{
+  EXTRACTOR_MetaDataProcessor proc;   /* callback that addKeyword() invokes for each item */
+  void *proc_cls;   /* closure argument passed through to 'proc' */
+  int ret;   /* 0 initially; set to 1 once addKeyword() returned non-zero, later callbacks then return early */
+};
+
+
+/**
+ * Callback for gsf_doc_meta_data_foreach(): report one document property.
+ *
+ * The property name is looked up in 'tmap'; if listed, the value is passed
+ * to addKeyword() under the mapped type.  For "meta:generator" a mime type
+ * guessed from the generator string is reported first.  Once addKeyword()
+ * returns non-zero, pc->ret is set to 1 and later calls return at once.
+ *
+ * @param key property name (0-terminated string)
+ * @param value the property (a GsfDocProp)
+ * @param user_data the 'struct ProcContext' set up by process()
+ */
+static void processMetadata(gpointer key,
+			    gpointer value,
+			    gpointer user_data) {
+  struct ProcContext *pc = user_data;
+  const char * type = key;
+  const GsfDocProp * prop = value;
+  const GValue * gval;
+  char * contents;
+  size_t len;
+  int pos;
+
+  if ( (key == NULL) || (value == NULL) || (pc->ret != 0) )
+    return;
+  gval = gsf_doc_prop_get_val(prop);
+  /* both branches yield g_malloc'd memory; g_strdup() maps NULL to NULL */
+  if (G_VALUE_TYPE(gval) == G_TYPE_STRING)
+    contents = g_strdup(g_value_get_string(gval));
+  else
+    contents = g_strdup_value_contents(gval); /* convert other formats? */
+  if (contents == NULL)
+    return;
+  len = strlen(contents);
+  if ( (len > 0) && (contents[len-1] == '\n') )
+    contents[len-1] = '\0'; /* strip one trailing newline */
+  /* find the table row for this property; the scan stops on the NULL sentinel */
+  pos = 0;
+  while ( (tmap[pos].text != NULL) &&
+          (0 != strcmp(tmap[pos].text, type)) )
+    pos++;
+  if (0 == strcmp (type, "meta:generator"))
+    {
+      /* match on the generator string itself, not on the GsfDocProp pointer */
+      const char * mimetype = "application/vnd.ms-files";
+      if((0 == strncmp(contents, "Microsoft Word", 14)) ||
+         (0 == strncmp(contents, "Microsoft Office Word", 21)))
+        mimetype = "application/msword";
+      else if((0 == strncmp(contents, "Microsoft Excel", 15)) ||
+              (0 == strncmp(contents, "Microsoft Office Excel", 22)))
+        mimetype = "application/vnd.ms-excel";
+      else if((0 == strncmp(contents, "Microsoft PowerPoint", 20)) ||
+              (0 == strncmp(contents, "Microsoft Office PowerPoint", 27)))
+        mimetype = "application/vnd.ms-powerpoint";
+      else if(0 == strncmp(contents, "Microsoft Project", 17))
+        mimetype = "application/vnd.ms-project";
+      else if(0 == strncmp(contents, "Microsoft Visio", 15))
+        mimetype = "application/vnd.visio";
+      else if(0 == strncmp(contents, "Microsoft Office", 16))
+        mimetype = "application/vnd.ms-office";
+
+      if (0 != addKeyword(pc->proc, pc->proc_cls,
+                          mimetype, EXTRACTOR_METATYPE_MIMETYPE))
+        {
+          g_free (contents);
+          pc->ret = 1;
+          return;
+        }
+    }
+  /* report the value itself when the property is listed in tmap */
+  if (tmap[pos].text != NULL)
+    {
+      if (0 != addKeyword(pc->proc, pc->proc_cls,
+                          contents,
+                          tmap[pos].type))
+        pc->ret = 1; /* makes all later callbacks return early */
+    }
+#if DEBUG_OLE2
+  else
+    printf("No match for type `%s'\n",
+	   type);
+#endif
+  g_free(contents);
+}
+
+
+/**
+ * Parse the metadata of stream 'in' with libgsf and report every property.
+ *
+ * @return 0 normally, 1 once a call to addKeyword() returned non-zero
+ */
+static int
+process(GsfInput * in, EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
+{
+  struct ProcContext pc;
+  GsfDocMetaData * sections = gsf_doc_meta_data_new();
+  GError * error = gsf_msole_metadata_read(in, sections);
+
+  pc.proc = proc;
+  pc.proc_cls = proc_cls;
+  pc.ret = 0;
+  if (error == NULL)
+    gsf_doc_meta_data_foreach(sections, &processMetadata, &pc);
+  else
+    g_error_free(error); /* nothing to report, but the error object must not leak */
+  g_object_unref(G_OBJECT(sections));
+  return pc.ret;
+}
+
+/**
+ * Report title, subject, comment and keywords of an "SfxDocumentInfo" stream.
+ *
+ * The stream is copied to the heap, since it may be up to 4 MiB large, and
+ * is skipped if it cannot be read or does not start with the expected header.
+ *
+ * @param src stream to parse; items go to 'proc', called with 'proc_cls'
+ * @return 1 if addKeyword() returned non-zero, 0 otherwise
+ */
+static int
+processSO(GsfInput * src,
+	  EXTRACTOR_MetaDataProcessor proc,
+	  void *proc_cls) {
+  /* start: the two bytes tested before the text; end: where the terminator is forced */
+  static const struct { size_t start; size_t end; enum EXTRACTOR_MetaType type; } fields[] = {
+    { 0x93, 0xd3, EXTRACTOR_METATYPE_TITLE },
+    { 0xd4, 0x114, EXTRACTOR_METATYPE_SUBJECT },
+    { 0x115, 0x215, EXTRACTOR_METATYPE_COMMENT },
+    { 0x216, 0x296, EXTRACTOR_METATYPE_KEYWORDS }
+  };
+  off_t size = gsf_input_size(src);
+  char * buf;
+  unsigned int i;
+  int ret = 0;
+
+  if ( (size < 0x374) || (size > 4*1024*1024) )  /* == 0x375?? */
+    return 0;
+  buf = g_try_malloc(size);
+  if (buf == NULL)
+    return 0;
+  if ( (NULL != gsf_input_read(src, size, (unsigned char*) buf)) &&
+       (buf[0] == 0x0F) && (buf[1] == 0x0) &&
+       (0 == strncmp(&buf[2], "SfxDocumentInfo", strlen("SfxDocumentInfo"))) &&
+       (buf[0x11] == 0x0B) && (buf[0x12] == 0x00) &&
+       (buf[0x13] == 0x00) ) /* pw protected! */
+    for (i = 0; (0 == ret) && (i < sizeof(fields) / sizeof(fields[0])); i++)
+      {
+        buf[fields[i].end] = '\0';
+        if (buf[fields[i].start] + buf[fields[i].start + 1] > 0)
+          ret = (0 != addKeyword(proc, proc_cls, &buf[fields[i].start + 2], fields[i].type));
+      }
+  /* fixme: do timestamps,
+     mime-type, user-defined info's */
+  g_free(buf);
+  return ret;
+}
+
+/* *************** wordleaker stuff *************** */
+
+#define __(a) dgettext("iso-639", a)
+
+/**
+ * Map a language identifier (lid) to a human-readable,
+ * translated language name.
+ *
+ * Names looked up with __() come from the "iso-639" text domain,
+ * names looked up with _() from the regular translation macro.
+ *
+ * @param lid language identifier to look up
+ * @return translated language name, NULL if the lid is not known
+ */
+static const char * lidToLanguage( unsigned int lid ) {
+  switch ( lid ) {
+  case 0x0400:
+    return _("No Proofing");
+  case 0x0401:
+    return __("Arabic");
+  case 0x0402:
+    return __("Bulgarian");
+  case 0x0403:
+    return __("Catalan");
+  case 0x0404:
+    return _("Traditional Chinese");
+  case 0x0804:
+    return _("Simplified Chinese");
+  case 0x0405:
+    return __("Czech"); /* 0x0405 is Czech, not Chechen */
+  case 0x0406:
+    return __("Danish");
+  case 0x0407:
+    return __("German");
+  case 0x0807:
+    return _("Swiss German");
+  case 0x0408:
+    return __("Greek");
+  case 0x0409:
+    return _("U.S. English");
+  case 0x0809:
+    return _("U.K. English");
+  case 0x0c09:
+    return _("Australian English");
+  case 0x040a:
+    return _("Castilian Spanish");
+  case 0x080a:
+    return _("Mexican Spanish");
+  case 0x040b:
+    return __("Finnish");
+  case 0x040c:
+    return __("French");
+  case 0x080c:
+    return _("Belgian French");
+  case 0x0c0c:
+    return _("Canadian French");
+  case 0x100c:
+    return _("Swiss French");
+  case 0x040d:
+    return __("Hebrew");
+  case 0x040e:
+    return __("Hungarian");
+  case 0x040f:
+    return __("Icelandic");
+  case 0x0410:
+    return __("Italian");
+  case 0x0810:
+    return _("Swiss Italian");
+  case 0x0411:
+    return __("Japanese");
+  case 0x0412:
+    return __("Korean");
+  case 0x0413:
+    return __("Dutch");
+  case 0x0813:
+    return _("Belgian Dutch");
+  case 0x0414:
+    return _("Norwegian Bokmal");
+  case 0x0814:
+    return __("Norwegian Nynorsk");
+  case 0x0415:
+    return __("Polish");
+  case 0x0416:
+    return __("Brazilian Portuguese");
+  case 0x0816:
+    return __("Portuguese");
+  case 0x0417:
+    return _("Rhaeto-Romanic");
+  case 0x0418:
+    return __("Romanian");
+  case 0x0419:
+    return __("Russian");
+  case 0x041a:
+    return _("Croato-Serbian (Latin)");
+  case 0x081a:
+    return _("Serbo-Croatian (Cyrillic)");
+  case 0x041b:
+    return __("Slovak");
+  case 0x041c:
+    return __("Albanian");
+  case 0x041d:
+    return __("Swedish");
+  case 0x041e:
+    return __("Thai");
+  case 0x041f:
+    return __("Turkish");
+  case 0x0420:
+    return __("Urdu");
+  case 0x0421:
+    return __("Bahasa");
+  case 0x0422:
+    return __("Ukrainian");
+  case 0x0423:
+    return __("Byelorussian");
+  case 0x0424:
+    return __("Slovenian");
+  case 0x0425:
+    return __("Estonian");
+  case 0x0426:
+    return __("Latvian");
+  case 0x0427:
+    return __("Lithuanian");
+  case 0x0429:
+    return _("Farsi");
+  case 0x042D:
+    return __("Basque");
+  case 0x042F:
+    return __("Macedonian");
+  case 0x0436:
+    return __("Afrikaans");
+  case 0x043E:
+    return __("Malay"); /* 0x043E is Malay; Malayalam would be 0x044C */
+  default:
+    return NULL;
+  }
+}
+
+
+/**
+ * Extract the revision history (pairs of author and file name) from
+ * a table stream and report each revision via addKeyword.
+ *
+ * @param stream stream that contains the "saved by" string table
+ * @param lcbSttbSavedBy number of bytes of the string table
+ * @param fcSttbSavedBy offset of the string table in the stream
+ * @param proc callback handed on to addKeyword
+ * @param proc_cls closure for proc
+ * @return result of the last addKeyword call, 0 if nothing was
+ *         reported or the table could not be read
+ */
+static int
+history_extract(GsfInput * stream,
+		unsigned int lcbSttbSavedBy,
+		unsigned int fcSttbSavedBy,
+		EXTRACTOR_MetaDataProcessor proc,
+		void *proc_cls)
+{
+  unsigned int where;
+  unsigned char * lbuffer;
+  unsigned int i;
+  unsigned int length;
+  char * author;
+  char * filename;
+  char * rbuf;
+  size_t rlen;
+  unsigned int nRev;
+  int ret;
+
+  /* the header below reads lbuffer[2..3]; strings start at offset 6 */
+  if (lcbSttbSavedBy < 6)
+    return 0;
+  // goto offset of revision
+  if (gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET))
+    return 0; /* gsf_input_seek reports failure with TRUE */
+  if (gsf_input_remaining(stream) < lcbSttbSavedBy)
+    return 0;
+  lbuffer = malloc(lcbSttbSavedBy);
+  if (lbuffer == NULL)
+    return 0;
+  // read all the revision history
+  if (NULL == gsf_input_read(stream, lcbSttbSavedBy, lbuffer)) {
+    free(lbuffer);
+    return 0;
+  }
+  // there are n strings, so n/2 revisions (author & file)
+  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
+  where = 6;
+  ret = 0;
+  for (i=0; i < nRev; i++) {
+    if (where >= lcbSttbSavedBy)
+      break;
+    length = lbuffer[where++];
+    /* second test catches wrap-around of the unsigned arithmetic */
+    if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
+	 (where + 2 * length + 2 <= where) )
+      break;
+    author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
+			   length * 2,
+			   "UTF-16BE");
+    if (author == NULL)
+      break; /* defensive: conversion gave us nothing to print */
+    where += length * 2 + 1;
+    length = lbuffer[where++];
+    if ( (where + 2 * length >= lcbSttbSavedBy) ||
+	 (where + 2 * length + 1 <= where) ) {
+      free(author);
+      break;
+    }
+    filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
+			     length * 2,
+			     "UTF-16BE");
+    if (filename == NULL) {
+      free(author);
+      break;
+    }
+    where += length * 2 + 1;
+    rlen = strlen(author) + strlen(filename) + 512;
+    rbuf = malloc(rlen);
+    if (rbuf == NULL) {
+      free(author);
+      free(filename);
+      break;
+    }
+    snprintf(rbuf, rlen,
+	     _("Revision #%u: Author '%s' worked on '%s'"),
+	     i, author, filename);
+    free(author);
+    free(filename);
+    ret = addKeyword(proc, proc_cls,
+		     rbuf,
+		     EXTRACTOR_METATYPE_REVISION_HISTORY);
+    free(rbuf);
+    if (0 != ret)
+      break;
+  }
+  free(lbuffer);
+  return ret;
+}
+
+
+/**
+ * Main entry point of the OLE2 plugin.
+ *
+ * Opens the buffer as an OLE2 compound file and
+ *  - runs process() on the "\005SummaryInformation" and
+ *    "\005DocumentSummaryInformation" streams,
+ *  - runs processSO() on the "SfxDocumentInfo" stream,
+ *  - reports the document language derived from the language id
+ *    stored at offset 6 behind the first 512 bytes,
+ *  - runs history_extract() on the "1Table" / "0Table" streams using
+ *    the offset and length stored at offsets 722 and 726 behind the
+ *    first 512 bytes.
+ *
+ * @param data buffer with the file contents
+ * @param size number of bytes in data
+ * @param proc callback that receives the extracted meta data
+ * @param proc_cls closure for proc
+ * @param options not used by this function
+ * @return 0 if the buffer is too small or cannot be opened as OLE2,
+ *         otherwise the result of the last extraction step; once a
+ *         step returns non-zero no further steps are run
+ */
+int 
+EXTRACTOR_ole2_extract (const char *data,
+			size_t size,
+			EXTRACTOR_MetaDataProcessor proc,
+			void *proc_cls,
+			const char *options)
+{
+  GsfInput * input;
+  GsfInfile * infile;
+  GsfInput * src;
+  const char * name;
+  int i;
+  unsigned int lcb;
+  unsigned int fcb;
+  const unsigned char * data512;
+  unsigned int lid;
+  const char * lang;
+  int ret;
+
+  ret = 0;
+  /* also guarantees that data512[729] below is inside the buffer */
+  if (size < 512 + 898)
+    return 0; /* can hardly be OLE2 */
+  input = gsf_input_memory_new((const guint8 *) data,
+			       (gsf_off_t) size,
+			       FALSE);
+  if (input == NULL)
+    return 0;
+
+  infile = gsf_infile_msole_new(input, NULL);
+  if (infile == NULL) {
+    g_object_unref(G_OBJECT(input));
+    return 0;
+  }
+  lcb = 0;
+  fcb = 0;
+  for (i=0;i<gsf_infile_num_children(infile);i++) {
+    name = gsf_infile_name_by_index (infile, i);
+    src = NULL;
+    if (ret != 0)
+      break;
+    if (name == NULL)
+      continue;
+    if ( (0 == strcmp(name, "\005SummaryInformation"))
+	 || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
+      src = gsf_infile_child_by_index (infile, i);
+      if (src != NULL)
+	ret = process(src,
+		      proc, 
+		      proc_cls);
+    }
+    if (0 == strcmp(name, "SfxDocumentInfo")) {
+      src = gsf_infile_child_by_index (infile, i);
+      if ( (src != NULL) && (ret == 0) )
+	ret = processSO(src,
+			proc,
+			proc_cls);
+    }
+    if (src != NULL)
+      g_object_unref(G_OBJECT(src));
+  }
+
+  data512 = (const unsigned char*) &data[512];
+  /* little-endian fields; the casts keep the shifts in unsigned
+     arithmetic so that a top byte >= 0x80 does not overflow int */
+  lid = (unsigned int) data512[6] | ((unsigned int) data512[7] << 8);
+  lcb = (unsigned int) data512[726]
+    | ((unsigned int) data512[727] << 8)
+    | ((unsigned int) data512[728] << 16)
+    | ((unsigned int) data512[729] << 24);
+  fcb = (unsigned int) data512[722]
+    | ((unsigned int) data512[723] << 8)
+    | ((unsigned int) data512[724] << 16)
+    | ((unsigned int) data512[725] << 24);
+  lang = lidToLanguage(lid);
+  if ( (lang != NULL) && (ret == 0) )
+    ret = addKeyword(proc, proc_cls,
+		     lang,
+		     EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE);  
+  if (lcb >= 6) {
+    for (i=0;i<gsf_infile_num_children(infile);i++) {
+      if (ret != 0)
+	break;
+      name = gsf_infile_name_by_index (infile, i);
+      if (name == NULL)
+	continue;
+      if ( (0 == strcmp(name, "1Table")) ||
+	   (0 == strcmp(name, "0Table")) ) {
+	src = gsf_infile_child_by_index (infile, i);
+	if (src != NULL) {
+	  ret = history_extract(src,
+				lcb,
+				fcb,
+				proc, proc_cls);
+	  g_object_unref(G_OBJECT(src));
+	}
+      }
+    }
+  }
+  g_object_unref(G_OBJECT(infile));
+  g_object_unref(G_OBJECT(input));
+  return ret;
+}
+
+
+/**
+ * Log handler that discards every message it is given.
+ *
+ * @param log_domain ignored
+ * @param log_level ignored
+ * @param message ignored
+ * @param user_data ignored
+ */
+static void 
+nolog (const gchar *log_domain,
+       GLogLevelFlags log_level,
+       const gchar *message,
+       gpointer user_data)
+{
+  (void) log_domain;
+  (void) log_level;
+  (void) message;
+  (void) user_data;
+}
+
+
+/**
+ * Constructor of the plugin: initializes the GLib type system and,
+ * if HAVE_GSF_INIT is defined, libgsf; then routes critical and
+ * warning messages of the "libgsf:msole" log domain to nolog.
+ */
+void __attribute__ ((constructor)) ole2_ltdl_init() {
+  const GLogLevelFlags silenced = G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING;
+
+  g_type_init();
+#ifdef HAVE_GSF_INIT
+  gsf_init();
+#endif
+  /* disable logging -- thanks, Jody! */
+  g_log_set_handler ("libgsf:msole",
+		     silenced,
+		     &nolog,
+		     NULL);
+}
+
+
+void __attribute__ ((destructor)) ole2_ltdl_fini() { /* destructor counterpart of ole2_ltdl_init */
+#ifdef HAVE_GSF_INIT
+  gsf_shutdown(); /* only called when HAVE_GSF_INIT is defined, mirroring the gsf_init() call in ole2_ltdl_init */
+#endif
+}
+
+/* end of ole2_extractor.c */
+
diff --git a/src/plugins/pdf_extractor.cc b/src/plugins/pdf_extractor.cc
@@ -37,6 +37,7 @@
 #include <poppler/Page.h>
 #include <poppler/PDFDoc.h>
 #include <poppler/Error.h>
+#include <poppler/GlobalParams.h>
 #include <poppler/goo/GooString.h>
 
 #define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0)
@@ -167,7 +168,11 @@ extern "C" {
     BaseStream * stream;
     int err;
 
-    /* errorInit();   -- keep commented out, otherwise errors are printed to stderr for non-pdf files! */
+    if (globalParams == NULL)
+      {
+	globalParams = new GlobalParams();
+	globalParams->setErrQuiet (gTrue);
+      }
     obj.initNull();
     err = 0;
     stream = new MemStream( (char*) data, 0, size, &obj);
diff --git a/src/plugins/rpm_extractor.c b/src/plugins/rpm_extractor.c
@@ -1,6 +1,6 @@
 /*
      This file is part of libextractor.
-     (C) 2002, 2003, 2008 Vidyut Samanta and Christian Grothoff
+     (C) 2002, 2003, 2008, 2009 Vidyut Samanta and Christian Grothoff
 
      libextractor is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	configure.ac	\|	1	-
M	src/include/extractor.h	\|	25	++++++++++++-------------
M	src/main/extractor_metatypes.c	\|	24	++++++++++++++++++++++++
M	src/plugins/Makefile.am	\|	15	+++++++++++++--
M	src/plugins/man_extractor.c	\|	2	+-
D	src/plugins/ole2/INFO	\|	6	------
D	src/plugins/ole2/Makefile.am	\|	16	----------------
D	src/plugins/ole2/README	\|	25	-------------------------
D	src/plugins/ole2/ole2extractor.c	\|	584	-------------------------------------------------------------------------------
A	src/plugins/ole2_extractor.c	\|	599	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/plugins/pdf_extractor.cc	\|	7	++++++-
M	src/plugins/rpm_extractor.c	\|	2	+-