commit 509826a244154dacbe1c95e81d314b95ed449f7d
parent aa056e44077dc7ed731cdfc1dcdcdd1d18b6004d
Author: Christian Grothoff <christian@grothoff.org>
Date: Wed, 16 Dec 2009 14:14:01 +0000
ole2
Diffstat:
12 files changed, 656 insertions(+), 650 deletions(-)
diff --git a/configure.ac b/configure.ac
@@ -556,7 +556,6 @@ src/intlemu/Makefile
src/common/Makefile
src/main/Makefile
src/plugins/Makefile
-src/plugins/ole2/Makefile
src/plugins/oo/Makefile
src/plugins/printable/Makefile
src/plugins/hash/Makefile
diff --git a/src/include/extractor.h b/src/include/extractor.h
@@ -237,13 +237,23 @@ enum EXTRACTOR_MetaType
/* image specifics */
EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 112,
-
-
EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE = 113,
EXTRACTOR_METATYPE_THUMBNAIL = 114,
EXTRACTOR_METATYPE_IMAGE_RESOLUTION = 115,
EXTRACTOR_METATYPE_SOURCE = 116,
+ /* (text) document processing specifics */
+ EXTRACTOR_METATYPE_CHARACTER_SET = 117,
+ EXTRACTOR_METATYPE_LINE_COUNT = 118,
+ EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 119,
+ EXTRACTOR_METATYPE_WORD_COUNT = 120,
+ EXTRACTOR_METATYPE_CHARACTER_COUNT = 121,
+ EXTRACTOR_METATYPE_PAGE_ORIENTATION = 122,
+ EXTRACTOR_METATYPE_PAPER_SIZE = 123,
+ EXTRACTOR_METATYPE_TEMPLATE = 124,
+ EXTRACTOR_METATYPE_COMPANY = 125,
+ EXTRACTOR_METATYPE_MANAGER = 126,
+ EXTRACTOR_METATYPE_REVISION_NUMBER = 127,
/* fixme: used up to here! */
EXTRACTOR_METATYPE_SCALE = 108,
@@ -251,14 +261,6 @@ enum EXTRACTOR_MetaType
/* FIXME: transcribe & renumber those below */
- /* (text) document processing specifics */
- EXTRACTOR_METATYPE_CHARACTER_SET = 104,
- EXTRACTOR_METATYPE_LINE_COUNT = 105,
- EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 106,
- EXTRACTOR_METATYPE_WORD_COUNT = 93,
- EXTRACTOR_METATYPE_CHARACTER_COUNT = 94,
- EXTRACTOR_METATYPE_PAGE_ORIENTATION = 35,
- EXTRACTOR_METATYPE_PAPER_SIZE = 36,
EXTRACTOR_METATYPE_USED_FONTS = 37,
EXTRACTOR_METATYPE_PAGE_ORDER = 38,
@@ -312,10 +314,7 @@ enum EXTRACTOR_MetaType
EXTRACTOR_METATYPE_OWNER = 66,
EXTRACTOR_METATYPE_MEDIA_TYPE = 68,
EXTRACTOR_METATYPE_CONTACT = 69,
- EXTRACTOR_METATYPE_TEMPLATE = 88,
EXTRACTOR_METATYPE_SECURITY = 97,
- EXTRACTOR_METATYPE_COMPANY = 102,
- EXTRACTOR_METATYPE_MANAGER = 109,
EXTRACTOR_METATYPE_INFORMATION = 112,
EXTRACTOR_METATYPE_FULL_NAME = 113,
EXTRACTOR_METATYPE_LINK = 116,
diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c
@@ -294,6 +294,30 @@ static const struct MetaTypeDescription meta_type_descriptions[] = {
gettext_noop ("resolution in dots per inch") },
{ gettext_noop ("source"),
gettext_noop ("Originating entity") },
+  /* NOTE(review): these entries presumably have to stay in the order of
+     EXTRACTOR_METATYPE_CHARACTER_SET (117) .. EXTRACTOR_METATYPE_REVISION_NUMBER (127)
+     from extractor.h -- confirm against the lookup code */
+  { gettext_noop ("character set"),
+    gettext_noop ("character encoding used") },
+  { gettext_noop ("line count"),
+    gettext_noop ("number of lines") },
+  { gettext_noop ("paragraph count"),
+    gettext_noop ("number of paragraphs") },
+  { gettext_noop ("word count"),
+    gettext_noop ("number of words") },
+  { gettext_noop ("character count"),
+    gettext_noop ("number of characters") },
+  { gettext_noop ("page orientation"),
+    gettext_noop ("") },
+  { gettext_noop ("paper size"),
+    gettext_noop ("") },
+  { gettext_noop ("template"),
+    gettext_noop ("template the document uses or is based on") },
+  { gettext_noop ("company"),
+    gettext_noop ("") },
+  { gettext_noop ("manager"),
+    gettext_noop ("") },
+  { gettext_noop ("revision number"),
+    gettext_noop ("") },
+ { gettext_noop (""),
+ gettext_noop ("") },
+ { gettext_noop (""),
+ gettext_noop ("") },
{ gettext_noop (""),
gettext_noop ("") },
{ gettext_noop (""),
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
@@ -13,7 +13,7 @@ endif
if HAVE_GLIB
if WITH_GSF
- oledir=ole2
+ ole2=libextractor_ole2.la
endif
if HAVE_GTK
thumbgtk=libextractor_thumbnailgtk.la
@@ -58,7 +58,7 @@ endif
# toggle for development
SUBDIRS = .
-# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir)
+# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash
if HAVE_VORBISFILE
@@ -95,6 +95,7 @@ plugin_LTLIBRARIES = \
libextractor_jpeg.la \
libextractor_man.la \
libextractor_mime.la \
+ $(ole2) \
$(pdf) \
$(rpm) \
$(thumbgtk)
@@ -176,6 +177,16 @@ libextractor_mime_la_SOURCES = \
libextractor_mime_la_LDFLAGS = \
$(PLUGINFLAGS)
+libextractor_ole2_la_SOURCES = \
+ ole2_extractor.c
+libextractor_ole2_la_CFLAGS = \
+ $(GSF_CFLAGS)
+libextractor_ole2_la_LIBADD = \
+ $(LIBADD) $(GSF_LIBS) \
+ $(top_builddir)/src/common/libextractor_common.la
+libextractor_ole2_la_LDFLAGS = \
+ $(PLUGINFLAGS)
+
libextractor_pdf_la_SOURCES = \
pdf_extractor.cc
libextractor_pdf_la_LDFLAGS = \
diff --git a/src/plugins/man_extractor.c b/src/plugins/man_extractor.c
@@ -1,6 +1,6 @@
/*
This file is part of libextractor.
- (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
+ (C) 2002, 2003, 2004, 2009 Vidyut Samanta and Christian Grothoff
libextractor is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
diff --git a/src/plugins/ole2/INFO b/src/plugins/ole2/INFO
@@ -1,6 +0,0 @@
-Most of the code in this directory comes from
-libgsf 1.10.1 (Licensed under GPL/LGPL).
-
-libgsf -- The G Structured File Library
-Jody Goldberg <jody@gnome.org>
-
diff --git a/src/plugins/ole2/Makefile.am b/src/plugins/ole2/Makefile.am
@@ -1,16 +0,0 @@
-include ../Makefile-plugins.am
-
-
-plugin_LTLIBRARIES = \
- libextractor_ole2.la
-
-libextractor_ole2_la_CFLAGS = \
- $(GSF_CFLAGS)
-libextractor_ole2_la_LIBADD = \
- $(LIBADD) $(GSF_LIBS) \
- $(top_builddir)/src/common/libextractor_common.la \
- $(top_builddir)/src/main/libextractor.la
-libextractor_ole2_la_LDFLAGS = \
- $(PLUGINFLAGS) $(retaincommand)
-libextractor_ole2_la_SOURCES = \
- ole2extractor.c
diff --git a/src/plugins/ole2/README b/src/plugins/ole2/README
@@ -1,25 +0,0 @@
-WordLeaker v.0.1 (c) 2005
- by Madelman (http://elligre.tk/madelman/)
-
-Shows information about a Word file.
-It can show all the summary and the revision history of the file.
-
-It should be portable but, for now, it doesn't work in Linux. I haven't had
-the time to debug it but I'll do when I can.
-
-There are a lot of things that don't work yet, if you want to help contact me.
-
-Copyright and License
-=====================
-
-WordLeaker v.0.1 (c) 2005 by Madelman (madelman@iname.com)
-
-WordLeaker program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2 of the License, or (at your option) any
-later version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
diff --git a/src/plugins/ole2/ole2extractor.c b/src/plugins/ole2/ole2extractor.c
@@ -1,584 +0,0 @@
-/*
- This file is part of libextractor.
- (C) 2004, 2005, 2006, 2007 Vidyut Samanta and Christian Grothoff
-
- libextractor is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2, or (at your
- option) any later version.
-
- libextractor is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libextractor; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, USA.
-
- This code makes extensive use of libgsf
- -- the Gnome Structured File Library
- Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
-
- Part of this code was borrowed from wordleaker.cpp. See also
- the README file in this directory.
-*/
-
-#include "platform.h"
-#include "extractor.h"
-#include "convert.h"
-
-#include <glib-object.h>
-#include <string.h>
-#include <stdio.h>
-#include <ctype.h>
-
-#include <gsf/gsf-utils.h>
-#include <gsf/gsf-input-memory.h>
-#include <gsf/gsf-infile.h>
-#include <gsf/gsf-infile-msole.h>
-#include <gsf/gsf-msole-utils.h>
-
-#define DEBUG_OLE2 0
-
-/* ******************************** main extraction code ************************ */
-
-static struct EXTRACTOR_Keywords *
-addKeyword(EXTRACTOR_KeywordList *oldhead,
- const char *phrase,
- EXTRACTOR_KeywordType type) {
- EXTRACTOR_KeywordList * keyword;
-
- if (strlen(phrase) == 0)
- return oldhead;
- if (0 == strcmp(phrase, "\"\""))
- return oldhead;
- if (0 == strcmp(phrase, "\" \""))
- return oldhead;
- if (0 == strcmp(phrase, " "))
- return oldhead;
- keyword = malloc(sizeof(EXTRACTOR_KeywordList));
- keyword->next = oldhead;
- keyword->keyword = strdup(phrase);
- keyword->keywordType = type;
- return keyword;
-}
-
-
-#if 0
-static guint8 const component_guid [] = {
- 0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
- 0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
-};
-
-static guint8 const document_guid [] = {
- 0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
- 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
-};
-
-static guint8 const user_guid [] = {
- 0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
- 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
-};
-#endif
-
-typedef struct {
- char * text;
- EXTRACTOR_KeywordType type;
-} Matches;
-
-static Matches tmap[] = {
- { "Title", EXTRACTOR_TITLE },
- { "PresentationFormat", EXTRACTOR_FORMAT },
- { "Category", EXTRACTOR_DESCRIPTION },
- { "Manager", EXTRACTOR_MANAGER },
- { "Company", EXTRACTOR_COMPANY },
- { "Subject", EXTRACTOR_SUBJECT },
- { "Author", EXTRACTOR_AUTHOR },
- { "Keywords", EXTRACTOR_KEYWORDS },
- { "Comments", EXTRACTOR_COMMENT },
- { "Template", EXTRACTOR_TEMPLATE },
- { "NumPages", EXTRACTOR_PAGE_COUNT },
- { "AppName", EXTRACTOR_SOFTWARE },
- { "RevisionNumber", EXTRACTOR_VERSIONNUMBER },
- { "Dictionary", EXTRACTOR_LANGUAGE },
- { "NumBytes", EXTRACTOR_SIZE },
- { "CreatedTime", EXTRACTOR_CREATION_DATE },
- { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE },
- { "gsf:company", EXTRACTOR_COMPANY },
- /* { "gsf:security", EXTRACTOR_SECURITY }, */
- { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT },
- { "gsf:page-count", EXTRACTOR_PAGE_COUNT },
- { "gsf:line-count", EXTRACTOR_LINE_COUNT },
- { "gsf:word-count", EXTRACTOR_WORD_COUNT },
- { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT },
- { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY },
- /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
- { "gsf:manager", EXTRACTOR_MANAGER },
- { "dc:title", EXTRACTOR_TITLE },
- { "dc:creator", EXTRACTOR_CREATOR },
- { "dc:date", EXTRACTOR_DATE },
- { "dc:subject", EXTRACTOR_SUBJECT },
- { "dc:keywords", EXTRACTOR_KEYWORDS },
- { "dc:last-printed", EXTRACTOR_LAST_PRINTED },
- { "dc:description", EXTRACTOR_DESCRIPTION },
- { "meta:creation-date", EXTRACTOR_CREATION_DATE },
- /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */
- { "meta:generator", EXTRACTOR_GENERATOR },
- { "meta:template", EXTRACTOR_TEMPLATE },
- /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */
- /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
- { NULL, 0 },
-};
-
-static void processMetadata(gpointer key,
- gpointer value,
- gpointer user_data) {
- struct EXTRACTOR_Keywords ** pprev = user_data;
- const char * type = key;
- const GsfDocProp * prop = value;
- const GValue * gval;
- char * contents;
- int pos;
-
- if ( (key == NULL) ||
- (value == NULL) )
- return;
- gval = gsf_doc_prop_get_val(prop);
-
- if (G_VALUE_TYPE(gval) == G_TYPE_STRING) {
- contents = strdup(g_value_get_string(gval));
- } else {
- /* convert other formats? */
- contents = g_strdup_value_contents(gval);
- }
- if (contents == NULL)
- return;
- if ( (strlen(contents) > 0) &&
- (contents[strlen(contents)-1] == '\n') )
- contents[strlen(contents)-1] = '\0';
- pos = 0;
- while (tmap[pos].text != NULL) {
- if (0 == strcmp(tmap[pos].text,
- type))
- break;
- pos++;
- }
- if (tmap[pos].text != NULL)
- *pprev = addKeyword(*pprev,
- contents,
- tmap[pos].type);
-#if DEBUG_OLE2
- else
- printf("No match for type `%s'\n",
- type);
-#endif
- free(contents);
-}
-
-
-static struct EXTRACTOR_Keywords *
-process(GsfInput * in,
- struct EXTRACTOR_Keywords * prev) {
- GsfDocMetaData * sections;
- GError * error;
-
- sections = gsf_doc_meta_data_new();
- error = gsf_msole_metadata_read(in, sections);
- if (error == NULL) {
- gsf_doc_meta_data_foreach(sections,
- &processMetadata,
- &prev);
- }
- g_object_unref(G_OBJECT(sections));
- return prev;
-}
-
-static struct EXTRACTOR_Keywords *
-processSO(GsfInput * src,
- struct EXTRACTOR_Keywords * prev) {
- off_t size;
- char * buf;
-
- size = gsf_input_size(src);
- if (size < 0x374) /* == 0x375?? */
- return prev;
- buf = malloc(size);
- gsf_input_read(src, size, (unsigned char*) buf);
- if ( (buf[0] != 0x0F) ||
- (buf[1] != 0x0) ||
- (0 != strncmp(&buf[2],
- "SfxDocumentInfo",
- strlen("SfxDocumentInfo"))) ||
- (buf[0x11] != 0x0B) ||
- (buf[0x13] != 0x00) || /* pw protected! */
- (buf[0x12] != 0x00) ) {
- free(buf);
- return prev;
- }
- buf[0xd3] = '\0';
- if (buf[0x94] + buf[0x93] > 0)
- prev = addKeyword(prev,
- &buf[0x95],
- EXTRACTOR_TITLE);
- buf[0x114] = '\0';
- if (buf[0xd5] + buf[0xd4] > 0)
- prev = addKeyword(prev,
- &buf[0xd6],
- EXTRACTOR_SUBJECT);
- buf[0x215] = '\0';
- if (buf[0x115] + buf[0x116] > 0)
- prev = addKeyword(prev,
- &buf[0x117],
- EXTRACTOR_COMMENT);
- buf[0x296] = '\0';
- if (buf[0x216] + buf[0x217] > 0)
- prev = addKeyword(prev,
- &buf[0x218],
- EXTRACTOR_KEYWORDS);
- /* fixme: do timestamps,
- mime-type, user-defined info's */
-
- free(buf);
- return prev;
-}
-
-/* *************** wordleaker stuff *************** */
-
-#define __(a) dgettext("iso-639", a)
-
-static const char * lidToLanguage( unsigned int lid ) {
- switch ( lid ) {
- case 0x0400:
- return _("No Proofing");
- case 0x0401:
- return __("Arabic");
- case 0x0402:
- return __("Bulgarian");
- case 0x0403:
- return __("Catalan");
- case 0x0404:
- return _("Traditional Chinese");
- case 0x0804:
- return _("Simplified Chinese");
- case 0x0405:
- return __("Chechen");
- case 0x0406:
- return __("Danish");
- case 0x0407:
- return __("German");
- case 0x0807:
- return _("Swiss German");
- case 0x0408:
- return __("Greek");
- case 0x0409:
- return _("U.S. English");
- case 0x0809:
- return _("U.K. English");
- case 0x0c09:
- return _("Australian English");
- case 0x040a:
- return _("Castilian Spanish");
- case 0x080a:
- return _("Mexican Spanish");
- case 0x040b:
- return __("Finnish");
- case 0x040c:
- return __("French");
- case 0x080c:
- return _("Belgian French");
- case 0x0c0c:
- return _("Canadian French");
- case 0x100c:
- return _("Swiss French");
- case 0x040d:
- return __("Hebrew");
- case 0x040e:
- return __("Hungarian");
- case 0x040f:
- return __("Icelandic");
- case 0x0410:
- return __("Italian");
- case 0x0810:
- return _("Swiss Italian");
- case 0x0411:
- return __("Japanese");
- case 0x0412:
- return __("Korean");
- case 0x0413:
- return __("Dutch");
- case 0x0813:
- return _("Belgian Dutch");
- case 0x0414:
- return _("Norwegian Bokmal");
- case 0x0814:
- return __("Norwegian Nynorsk");
- case 0x0415:
- return __("Polish");
- case 0x0416:
- return __("Brazilian Portuguese");
- case 0x0816:
- return __("Portuguese");
- case 0x0417:
- return _("Rhaeto-Romanic");
- case 0x0418:
- return __("Romanian");
- case 0x0419:
- return __("Russian");
- case 0x041a:
- return _("Croato-Serbian (Latin)");
- case 0x081a:
- return _("Serbo-Croatian (Cyrillic)");
- case 0x041b:
- return __("Slovak");
- case 0x041c:
- return __("Albanian");
- case 0x041d:
- return __("Swedish");
- case 0x041e:
- return __("Thai");
- case 0x041f:
- return __("Turkish");
- case 0x0420:
- return __("Urdu");
- case 0x0421:
- return __("Bahasa");
- case 0x0422:
- return __("Ukrainian");
- case 0x0423:
- return __("Byelorussian");
- case 0x0424:
- return __("Slovenian");
- case 0x0425:
- return __("Estonian");
- case 0x0426:
- return __("Latvian");
- case 0x0427:
- return __("Lithuanian");
- case 0x0429:
- return _("Farsi");
- case 0x042D:
- return __("Basque");
- case 0x042F:
- return __("Macedonian");
- case 0x0436:
- return __("Afrikaans");
- case 0x043E:
- return __("Malayalam");
- default:
- return NULL;
- }
-}
-
-
-static struct EXTRACTOR_Keywords *
-history_extract(GsfInput * stream,
- unsigned int lcbSttbSavedBy,
- unsigned int fcSttbSavedBy,
- struct EXTRACTOR_Keywords * prev) {
- unsigned int where = 0;
- unsigned char * lbuffer;
- unsigned int i;
- unsigned int length;
- char * author;
- char * filename;
- char * rbuf;
- unsigned int nRev;
-
- // goto offset of revision
- gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET);
- if (gsf_input_remaining(stream) < lcbSttbSavedBy)
- return prev;
- lbuffer = malloc(lcbSttbSavedBy);
- // read all the revision history
- gsf_input_read(stream, lcbSttbSavedBy, lbuffer);
- // there are n strings, so n/2 revisions (author & file)
- nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
- where = 6;
- for (i=0; i < nRev; i++) {
- if (where >= lcbSttbSavedBy)
- break;
- length = lbuffer[where++];
- if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
- (where + 2 * length + 2 <= where) )
- break;
- author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
- length * 2,
- "UTF-16BE");
- where += length * 2 + 1;
- length = lbuffer[where++];
- if ( (where + 2 * length >= lcbSttbSavedBy) ||
- (where + 2 * length + 1 <= where) ) {
- free(author);
- break;
- }
- filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
- length * 2,
- "UTF-16BE");
- where += length * 2 + 1;
- rbuf = malloc(strlen(author) + strlen(filename) + 512);
- snprintf(rbuf, 512 + strlen(author) + strlen(filename),
- _("Revision #%u: Author '%s' worked on '%s'"),
- i, author, filename);
- free(author);
- free(filename);
- prev = addKeyword(prev,
- rbuf,
- EXTRACTOR_REVISION_HISTORY);
- free(rbuf);
- }
- free(lbuffer);
- return prev;
-}
-
-
-/* ************** main method *********** */
-
-struct EXTRACTOR_Keywords *
-libextractor_ole2_extract(const char * filename,
- const char * data,
- size_t size,
- struct EXTRACTOR_Keywords * prev) {
- GsfInput * input;
- GsfInfile * infile;
- GsfInput * src;
- const char * name;
- const char * generator = NULL;
- int i;
- unsigned int lcb;
- unsigned int fcb;
- const unsigned char * data512;
- unsigned int lid;
- const char * lang;
-
- if (size < 512 + 898)
- return prev; /* can hardly be OLE2 */
- input = gsf_input_memory_new((const guint8 *) data,
- (gsf_off_t) size,
- FALSE);
- if (input == NULL)
- return prev;
-
- infile = gsf_infile_msole_new(input, NULL);
- if (infile == NULL) {
- g_object_unref(G_OBJECT(input));
- return prev;
- }
- lcb = 0;
- fcb = 0;
- for (i=0;i<gsf_infile_num_children(infile);i++) {
- name = gsf_infile_name_by_index (infile, i);
- src = NULL;
- if (name == NULL)
- continue;
- if ( (0 == strcmp(name, "\005SummaryInformation"))
- || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
- src = gsf_infile_child_by_index (infile, i);
- if (src != NULL)
- prev = process(src,
- prev);
- }
- if (0 == strcmp(name, "SfxDocumentInfo")) {
- src = gsf_infile_child_by_index (infile, i);
- if (src != NULL)
- prev = processSO(src,
- prev);
- }
- if (src != NULL)
- g_object_unref(G_OBJECT(src));
- }
-
- data512 = (const unsigned char*) &data[512];
- lid = data512[6] + (data512[7] << 8);
- lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
- fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
- lang = lidToLanguage(lid);
- if (lang != NULL) {
- prev = addKeyword(prev,
- lang,
- EXTRACTOR_LANGUAGE);
- }
- if (lcb >= 6) {
- for (i=0;i<gsf_infile_num_children(infile);i++) {
- name = gsf_infile_name_by_index (infile, i);
- if (name == NULL)
- continue;
- if ( (0 == strcmp(name, "1Table")) ||
- (0 == strcmp(name, "0Table")) ) {
- src = gsf_infile_child_by_index (infile, i);
- if (src != NULL) {
- prev = history_extract(src,
- lcb,
- fcb,
- prev);
- g_object_unref(G_OBJECT(src));
- }
- }
- }
- }
- g_object_unref(G_OBJECT(infile));
- g_object_unref(G_OBJECT(input));
-
- /*
- * Hack to return an appropriate mimetype
- */
- generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev);
- if (NULL == generator) {
- /*
- * when very puzzled, just look at file magic number
- */
- if ( (8 < size)
- && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
- generator = "Microsoft Office";
- }
-
- if(NULL != generator) {
- const char * mimetype = "application/vnd.ms-files";
-
- if((0 == strncmp(generator, "Microsoft Word", 14)) ||
- (0 == strncmp(generator, "Microsoft Office Word", 21)))
- mimetype = "application/msword";
- else if((0 == strncmp(generator, "Microsoft Excel", 15)) ||
- (0 == strncmp(generator, "Microsoft Office Excel", 22)))
- mimetype = "application/vnd.ms-excel";
- else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) ||
- (0 == strncmp(generator, "Microsoft Office PowerPoint", 27)))
- mimetype = "application/vnd.ms-powerpoint";
- else if(0 == strncmp(generator, "Microsoft Project", 17))
- mimetype = "application/vnd.ms-project";
- else if(0 == strncmp(generator, "Microsoft Visio", 15))
- mimetype = "application/vnd.visio";
- else if(0 == strncmp(generator, "Microsoft Office", 16))
- mimetype = "application/vnd.ms-office";
-
- prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE);
- }
-
- return prev;
-}
-static void nolog (const gchar *log_domain,
- GLogLevelFlags log_level,
- const gchar *message,
- gpointer user_data) {
-}
-
-void __attribute__ ((constructor)) ole2_ltdl_init() {
- g_type_init();
-#ifdef HAVE_GSF_INIT
- gsf_init();
-#endif
- /* disable logging -- thanks, Jody! */
- g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, &nolog, NULL);
- // gsf_init_dynamic(NULL);
-}
-
-void __attribute__ ((destructor)) ole2_ltdl_fini() {
-#ifdef HAVE_GSF_INIT
- gsf_shutdown();
-#endif
- // gsf_shutdown_dynamic(NULL);
-}
-
-/* end of ole2extractor.c */
-
diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c
@@ -0,0 +1,599 @@
+/*
+ This file is part of libextractor.
+ (C) 2004, 2005, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff
+
+ libextractor is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ libextractor is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with libextractor; see the file COPYING. If not, write to the
+ Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ Boston, MA 02111-1307, USA.
+
+ This code makes extensive use of libgsf
+ -- the Gnome Structured File Library
+ Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
+
+ Part of this code was borrowed from wordleaker.cpp
+ (WordLeaker v.0.1, (c) 2005 by Madelman, licensed under the GNU GPL).
+*/
+
+#include "platform.h"
+#include "extractor.h"
+#include "convert.h"
+
+#include <glib-object.h>
+#include <string.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#include <gsf/gsf-utils.h>
+#include <gsf/gsf-input-memory.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-infile-msole.h>
+#include <gsf/gsf-msole-utils.h>
+
+#define DEBUG_OLE2 0
+
+/* ******************************** main extraction code ************************ */
+
+/**
+ * Hand one piece of text metadata to the metadata processor, unless the
+ * phrase is one of a few known "empty" placeholders.
+ *
+ * @param proc function to call with the metadata
+ * @param proc_cls closure argument passed to proc
+ * @param phrase 0-terminated text to report, must not be NULL
+ * @param type meta type to report the phrase under
+ * @return 0 if the phrase was skipped, otherwise whatever proc returned
+ *         (callers in this file stop extracting on a non-zero value)
+ */
+static int
+addKeyword(EXTRACTOR_MetaDataProcessor proc,
+           void *proc_cls,
+           const char *phrase,
+           enum EXTRACTOR_MetaType type) {
+  /* phrases that carry no information and are silently dropped */
+  static const char *const ignored[] = { "", "\"\"", "\" \"", " " };
+  size_t idx;
+
+  for (idx = 0; idx < sizeof (ignored) / sizeof (ignored[0]); idx++)
+    {
+      if (0 == strcmp (phrase, ignored[idx]))
+        return 0;
+    }
+  /* the reported size includes the terminating 0 byte */
+  return proc (proc_cls,
+               "ole2",
+               type,
+               EXTRACTOR_METAFORMAT_UTF8,
+               "text/plain",
+               phrase,
+               strlen (phrase) +1);
+}
+
+/**
+ * One entry of the property-name to meta-type translation table.
+ */
+typedef struct {
+ /* property name to match (compared with strcmp); NULL marks the end of the table */
+ char * text;
+ /* meta type under which a matching property is reported */
+ enum EXTRACTOR_MetaType type;
+} Matches;
+
+/**
+ * Translation table from document property names (the keys handed to
+ * processMetadata) to libextractor meta types.  The table is searched
+ * linearly with an exact string comparison and is terminated by an
+ * entry whose text is NULL.  Properties listed only in the comments at
+ * the end are currently not reported.
+ */
+static Matches tmap[] = {
+ { "Title", EXTRACTOR_METATYPE_TITLE },
+ { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
+ { "Category", EXTRACTOR_METATYPE_SECTION },
+ { "Manager", EXTRACTOR_METATYPE_MANAGER },
+ { "Company", EXTRACTOR_METATYPE_COMPANY },
+ { "Subject", EXTRACTOR_METATYPE_SUBJECT },
+ { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+ { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
+ { "Comments", EXTRACTOR_METATYPE_COMMENT },
+ { "Template", EXTRACTOR_METATYPE_TEMPLATE },
+ { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
+ { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
+ { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
+ { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE },
+ { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
+ { "LastSavedTime" , EXTRACTOR_METATYPE_MODIFICATION_DATE },
+ { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
+ { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
+ { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
+ { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
+ { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
+ { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
+ { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
+ { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
+ { "dc:title", EXTRACTOR_METATYPE_TITLE },
+ { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
+ { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+ { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
+ { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
+ { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
+ { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
+ { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
+ { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+ { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
+ { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES },
+ /* { "Dictionary", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, */
+ /* { "gsf:security", EXTRACTOR_SECURITY }, */
+ /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
+ /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
+ /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
+ /* end-of-table marker */
+ { NULL, 0 }
+};
+
+
+/**
+ * State handed to processMetadata through the user_data pointer of
+ * gsf_doc_meta_data_foreach.
+ */
+struct ProcContext
+{
+ /* function to call for each piece of metadata found */
+ EXTRACTOR_MetaDataProcessor proc;
+ /* closure argument passed to proc */
+ void *proc_cls;
+ /* 0 while extraction should continue; set to 1 once proc returned non-zero */
+ int ret;
+};
+
+
+/**
+ * Map the generator (creating application) string of an OLE2 document
+ * to a mime type using a prefix match.
+ *
+ * @param generator 0-terminated generator string, must not be NULL
+ * @return mime type of the first matching prefix, or the generic
+ *         "application/vnd.ms-files" if no prefix matches
+ */
+static const char *
+generatorToMimetype(const char * generator) {
+  /* order matters: the plain "Microsoft Office" prefix also matches the
+     more specific "Microsoft Office <product>" strings, so it is last */
+  static const struct {
+    const char * prefix;
+    const char * mimetype;
+  } gmap[] = {
+    { "Microsoft Word", "application/msword" },
+    { "Microsoft Office Word", "application/msword" },
+    { "Microsoft Excel", "application/vnd.ms-excel" },
+    { "Microsoft Office Excel", "application/vnd.ms-excel" },
+    { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" },
+    { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint" },
+    { "Microsoft Project", "application/vnd.ms-project" },
+    { "Microsoft Visio", "application/vnd.visio" },
+    { "Microsoft Office", "application/vnd.ms-office" }
+  };
+  size_t i;
+
+  for (i = 0; i < sizeof(gmap) / sizeof(gmap[0]); i++)
+    if (0 == strncmp(generator, gmap[i].prefix, strlen(gmap[i].prefix)))
+      return gmap[i].mimetype;
+  return "application/vnd.ms-files";
+}
+
+
+/**
+ * Callback for gsf_doc_meta_data_foreach: report one document property.
+ *
+ * The property value is converted to a string, a single trailing
+ * newline is stripped, and the result is reported under the meta type
+ * that tmap assigns to the property name.  For "meta:generator" a mime
+ * type derived from the generator string is reported first.  Once the
+ * processor returned non-zero, all further properties are ignored.
+ *
+ * @param key property name (const char *)
+ * @param value property (const GsfDocProp *)
+ * @param user_data the struct ProcContext with processor and status
+ */
+static void processMetadata(gpointer key,
+                            gpointer value,
+                            gpointer user_data) {
+  struct ProcContext *pc = user_data;
+  const char * type = key;
+  const GsfDocProp * prop = value;
+  const GValue * gval;
+  char * contents;
+  size_t len;
+  int pos;
+
+  if ( (key == NULL) ||
+       (value == NULL) )
+    return;
+  if (pc->ret != 0)
+    return;
+  gval = gsf_doc_prop_get_val(prop);
+  if (gval == NULL)
+    return;
+
+  /* always allocate with GLib so that g_free() below is correct for
+     both branches; g_strdup() also tolerates a NULL string value */
+  if (G_VALUE_TYPE(gval) == G_TYPE_STRING)
+    {
+      contents = g_strdup(g_value_get_string(gval));
+    }
+  else
+    {
+      /* convert other formats? */
+      contents = g_strdup_value_contents(gval);
+    }
+  if (contents == NULL)
+    return;
+  len = strlen(contents);
+  if ( (len > 0) &&
+       (contents[len-1] == '\n') )
+    contents[len-1] = '\0';
+  pos = 0;
+  while (tmap[pos].text != NULL)
+    {
+      if (0 == strcmp(tmap[pos].text,
+                      type))
+        break;
+      pos++;
+    }
+  if (0 == strcmp (type, "meta:generator"))
+    {
+      /* the generator text is in 'contents'; 'value' is the GsfDocProp
+         object and must not be interpreted as a string */
+      if (0 != addKeyword(pc->proc,
+                          pc->proc_cls,
+                          generatorToMimetype(contents),
+                          EXTRACTOR_METATYPE_MIMETYPE))
+        pc->ret = 1;
+    }
+  if (pc->ret == 0)
+    {
+      if (tmap[pos].text != NULL)
+        {
+          if (0 != addKeyword(pc->proc, pc->proc_cls,
+                              contents,
+                              tmap[pos].type))
+            pc->ret = 1;
+        }
+#if DEBUG_OLE2
+      else
+        printf("No match for type `%s'\n",
+               type);
+#endif
+    }
+  g_free(contents);
+}
+
+
+/**
+ * Read the metadata of one summary-information stream and report every
+ * property through processMetadata.
+ *
+ * @param in stream to read the metadata from
+ * @param proc function to call for each piece of metadata found
+ * @param proc_cls closure argument passed to proc
+ * @return 0 to continue extracting, 1 if proc asked to stop; a stream
+ *         that cannot be parsed is skipped and yields 0
+ */
+static int
+process(GsfInput * in,
+        EXTRACTOR_MetaDataProcessor proc,
+        void *proc_cls)
+{
+  struct ProcContext pc;
+  GsfDocMetaData * sections;
+  GError * error;
+
+  pc.proc = proc;
+  pc.proc_cls = proc_cls;
+  pc.ret = 0;
+  sections = gsf_doc_meta_data_new();
+  error = gsf_msole_metadata_read(in, sections);
+  if (error == NULL)
+    {
+      gsf_doc_meta_data_foreach(sections,
+                                &processMetadata,
+                                &pc);
+    }
+  else
+    {
+      /* the returned error object belongs to us; release it */
+      g_error_free(error);
+    }
+  g_object_unref(G_OBJECT(sections));
+  return pc.ret;
+}
+
+/**
+ * Extract title, subject, comment and keywords from an
+ * "SfxDocumentInfo" stream.
+ *
+ * The stream is read completely; it is only processed if it starts
+ * with the bytes 0x0F 0x00 followed by "SfxDocumentInfo", has 0x0B at
+ * offset 0x11 and zero bytes at offsets 0x12 and 0x13.  The strings
+ * are taken from fixed offsets (0x95, 0xd6, 0x117, 0x218).
+ *
+ * @param src stream to read
+ * @param proc function to call for each piece of metadata found
+ * @param proc_cls closure argument passed to proc
+ * @return 0 to continue extracting (also if the stream was skipped or
+ *         could not be read), 1 if proc asked to stop
+ */
+static int
+processSO(GsfInput * src,
+          EXTRACTOR_MetaDataProcessor proc,
+          void *proc_cls) {
+  off_t size = gsf_input_size(src);
+  char * buf;
+  int ret;
+
+  if ( (size < 0x374) || (size > 4*1024*1024) ) /* == 0x375?? */
+    return 0;
+  /* up to 4 MiB: too large for the stack, so use the heap */
+  buf = malloc(size);
+  if (buf == NULL)
+    return 0;
+  ret = 0;
+  /* gsf_input_read returns NULL on failure; buf is then undefined */
+  if (NULL == gsf_input_read(src, size, (unsigned char*) buf))
+    goto cleanup;
+  if ( (buf[0] != 0x0F) ||
+       (buf[1] != 0x0) ||
+       (0 != strncmp(&buf[2],
+                     "SfxDocumentInfo",
+                     strlen("SfxDocumentInfo"))) ||
+       (buf[0x11] != 0x0B) ||
+       (buf[0x13] != 0x00) || /* pw protected! */
+       (buf[0x12] != 0x00) )
+    goto cleanup;
+  ret = 1; /* changed back to 0 only if no callback asked to stop */
+  buf[0xd3] = '\0';
+  if (buf[0x94] + buf[0x93] > 0)
+    if (0 != addKeyword(proc, proc_cls,
+                        &buf[0x95],
+                        EXTRACTOR_METATYPE_TITLE))
+      goto cleanup;
+  buf[0x114] = '\0';
+  if (buf[0xd5] + buf[0xd4] > 0)
+    if (0 != addKeyword(proc, proc_cls,
+                        &buf[0xd6],
+                        EXTRACTOR_METATYPE_SUBJECT))
+      goto cleanup;
+  buf[0x215] = '\0';
+  if (buf[0x115] + buf[0x116] > 0)
+    if (0 != addKeyword(proc, proc_cls,
+                        &buf[0x117],
+                        EXTRACTOR_METATYPE_COMMENT))
+      goto cleanup;
+  buf[0x296] = '\0';
+  if (buf[0x216] + buf[0x217] > 0)
+    if (0 != addKeyword(proc, proc_cls,
+                        &buf[0x218],
+                        EXTRACTOR_METATYPE_KEYWORDS))
+      goto cleanup;
+  /* fixme: do timestamps,
+     mime-type, user-defined info's */
+  ret = 0;
+ cleanup:
+  free(buf);
+  return ret;
+}
+
+/* *************** wordleaker stuff *************** */
+
+#define __(a) dgettext("iso-639", a)
+
+/**
+ * Translate a Microsoft language identifier (LID) into a language name.
+ *
+ * Names wrapped in __() are looked up in the "iso-639" message domain,
+ * names wrapped in _() in the default domain.
+ *
+ * @param lid language identifier read from the document header
+ * @return (translated) language name, NULL if the identifier is unknown
+ */
+static const char * lidToLanguage( unsigned int lid ) {
+  switch ( lid ) {
+  case 0x0400:
+    return _("No Proofing");
+  case 0x0401:
+    return __("Arabic");
+  case 0x0402:
+    return __("Bulgarian");
+  case 0x0403:
+    return __("Catalan");
+  case 0x0404:
+    return _("Traditional Chinese");
+  case 0x0804:
+    return _("Simplified Chinese");
+  case 0x0405:
+    /* 0x0405 is Czech (cs-CZ), not Chechen */
+    return __("Czech");
+  case 0x0406:
+    return __("Danish");
+  case 0x0407:
+    return __("German");
+  case 0x0807:
+    return _("Swiss German");
+  case 0x0408:
+    return __("Greek");
+  case 0x0409:
+    return _("U.S. English");
+  case 0x0809:
+    return _("U.K. English");
+  case 0x0c09:
+    return _("Australian English");
+  case 0x040a:
+    return _("Castilian Spanish");
+  case 0x080a:
+    return _("Mexican Spanish");
+  case 0x040b:
+    return __("Finnish");
+  case 0x040c:
+    return __("French");
+  case 0x080c:
+    return _("Belgian French");
+  case 0x0c0c:
+    return _("Canadian French");
+  case 0x100c:
+    return _("Swiss French");
+  case 0x040d:
+    return __("Hebrew");
+  case 0x040e:
+    return __("Hungarian");
+  case 0x040f:
+    return __("Icelandic");
+  case 0x0410:
+    return __("Italian");
+  case 0x0810:
+    return _("Swiss Italian");
+  case 0x0411:
+    return __("Japanese");
+  case 0x0412:
+    return __("Korean");
+  case 0x0413:
+    return __("Dutch");
+  case 0x0813:
+    return _("Belgian Dutch");
+  case 0x0414:
+    return _("Norwegian Bokmal");
+  case 0x0814:
+    return __("Norwegian Nynorsk");
+  case 0x0415:
+    return __("Polish");
+  case 0x0416:
+    return __("Brazilian Portuguese");
+  case 0x0816:
+    return __("Portuguese");
+  case 0x0417:
+    return _("Rhaeto-Romanic");
+  case 0x0418:
+    return __("Romanian");
+  case 0x0419:
+    return __("Russian");
+  case 0x041a:
+    return _("Croato-Serbian (Latin)");
+  case 0x081a:
+    return _("Serbo-Croatian (Cyrillic)");
+  case 0x041b:
+    return __("Slovak");
+  case 0x041c:
+    return __("Albanian");
+  case 0x041d:
+    return __("Swedish");
+  case 0x041e:
+    return __("Thai");
+  case 0x041f:
+    return __("Turkish");
+  case 0x0420:
+    return __("Urdu");
+  case 0x0421:
+    return __("Bahasa");
+  case 0x0422:
+    return __("Ukrainian");
+  case 0x0423:
+    return __("Byelorussian");
+  case 0x0424:
+    return __("Slovenian");
+  case 0x0425:
+    return __("Estonian");
+  case 0x0426:
+    return __("Latvian");
+  case 0x0427:
+    return __("Lithuanian");
+  case 0x0429:
+    return _("Farsi");
+  case 0x042D:
+    return __("Basque");
+  case 0x042F:
+    return __("Macedonian");
+  case 0x0436:
+    return __("Afrikaans");
+  case 0x043E:
+    /* 0x043E is Malay (ms-MY); Malayalam would be 0x044C */
+    return __("Malay");
+  default:
+    return NULL;
+  }
+}
+
+
+/**
+ * Read the "saved by" string table from the given stream and report
+ * every (author, filename) pair found in it as a revision history
+ * keyword.
+ *
+ * @param stream stream to read the string table from
+ * @param lcbSttbSavedBy size of the string table in bytes
+ * @param fcSttbSavedBy offset of the string table within the stream
+ * @param proc callback invoked (through addKeyword) for each revision
+ * @param proc_cls closure for proc
+ * @return 0 if extraction should continue (also used when the table
+ *         cannot be read), otherwise the non-zero result of addKeyword
+ */
+static int
+history_extract(GsfInput * stream,
+                unsigned int lcbSttbSavedBy,
+                unsigned int fcSttbSavedBy,
+                EXTRACTOR_MetaDataProcessor proc,
+                void *proc_cls)
+{
+  unsigned int where;
+  unsigned char * lbuffer;
+  unsigned int i;
+  unsigned int length;
+  char * author;
+  char * filename;
+  char * rbuf;
+  size_t rlen;
+  unsigned int nRev;
+  int ret;
+
+  /* the first string starts at offset 6 and the string count is read
+     from bytes 2 and 3, so a smaller table cannot hold any revision */
+  if (lcbSttbSavedBy < 6)
+    return 0;
+  /* goto offset of revision; gsf_input_seek returns TRUE on error */
+  if (gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET))
+    return 0;
+  if (gsf_input_remaining(stream) < lcbSttbSavedBy)
+    return 0;
+  lbuffer = malloc(lcbSttbSavedBy);
+  if (lbuffer == NULL)
+    return 0;
+  /* read all the revision history; NULL signals a read error */
+  if (NULL == gsf_input_read(stream, lcbSttbSavedBy, lbuffer))
+    {
+      free(lbuffer);
+      return 0;
+    }
+  /* there are n strings, so n/2 revisions (author & file) */
+  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
+  where = 6;
+  ret = 0;
+  for (i = 0; i < nRev; i++)
+    {
+      if (where >= lcbSttbSavedBy)
+        break;
+      /* NOTE(review): only one length byte is consumed and the text is
+         decoded starting at the very next byte -- confirm this matches
+         the on-disk layout of the string table */
+      length = lbuffer[where++];
+      /* stop if the author string would run past the buffer
+         (second test catches unsigned wrap-around) */
+      if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
+           (where + 2 * length + 2 <= where) )
+        break;
+      author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
+                                                length * 2,
+                                                "UTF-16BE");
+      if (author == NULL)
+        break; /* defensive: strlen below must not see NULL */
+      where += length * 2 + 1;
+      length = lbuffer[where++];
+      if ( (where + 2 * length >= lcbSttbSavedBy) ||
+           (where + 2 * length + 1 <= where) )
+        {
+          free(author);
+          break;
+        }
+      filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
+                                                  length * 2,
+                                                  "UTF-16BE");
+      if (filename == NULL)
+        {
+          free(author);
+          break;
+        }
+      where += length * 2 + 1;
+      /* 512 extra bytes leave room for the (translated) template text */
+      rlen = strlen(author) + strlen(filename) + 512;
+      rbuf = malloc(rlen);
+      if (rbuf == NULL)
+        {
+          free(author);
+          free(filename);
+          break;
+        }
+      snprintf(rbuf, rlen,
+               _("Revision #%u: Author '%s' worked on '%s'"),
+               i, author, filename);
+      free(author);
+      free(filename);
+      ret = addKeyword(proc, proc_cls,
+                       rbuf,
+                       EXTRACTOR_METATYPE_REVISION_HISTORY);
+      free(rbuf);
+      if (0 != ret)
+        break;
+    }
+  free(lbuffer);
+  return ret;
+}
+
+
+/**
+ * Main entry point of the OLE2 plugin: extract meta data from an
+ * in-memory OLE2 compound document.
+ *
+ * The summary streams ("\005SummaryInformation",
+ * "\005DocumentSummaryInformation", "SfxDocumentInfo") are handed to
+ * process() / processSO().  In addition, a language identifier and the
+ * location of the revision history table are read at fixed offsets
+ * relative to byte 512 of the file; the revision history itself is then
+ * read from the "1Table" / "0Table" streams.
+ * NOTE(review): the fixed offsets presumably address the Word file
+ * header; nothing here checks that the file is a Word document.
+ *
+ * @param data start of the file contents
+ * @param size number of bytes in data
+ * @param proc callback to invoke for each meta data item found
+ * @param proc_cls closure for proc
+ * @param options unused
+ * @return 0 if extraction should continue, otherwise the first
+ *         non-zero value returned by one of the processing steps
+ */
+int
+EXTRACTOR_ole2_extract (const char *data,
+                        size_t size,
+                        EXTRACTOR_MetaDataProcessor proc,
+                        void *proc_cls,
+                        const char *options)
+{
+  GsfInput * input;
+  GsfInfile * infile;
+  GsfInput * src;
+  const char * name;
+  int i;
+  unsigned int lcb;
+  unsigned int fcb;
+  const unsigned char * data512;
+  unsigned int lid;
+  const char * lang;
+  int ret;
+
+  ret = 0;
+  /* also guarantees that all fixed offsets read below are in bounds */
+  if (size < 512 + 898)
+    return 0; /* can hardly be OLE2 */
+  input = gsf_input_memory_new((const guint8 *) data,
+                               (gsf_off_t) size,
+                               FALSE);
+  if (input == NULL)
+    return 0;
+
+  infile = gsf_infile_msole_new(input, NULL);
+  if (infile == NULL)
+    {
+      g_object_unref(G_OBJECT(input));
+      return 0;
+    }
+  for (i = 0; i < gsf_infile_num_children(infile); i++)
+    {
+      name = gsf_infile_name_by_index (infile, i);
+      src = NULL;
+      if (ret != 0)
+        break;
+      if (name == NULL)
+        continue;
+      if ( (0 == strcmp(name, "\005SummaryInformation"))
+           || (0 == strcmp(name, "\005DocumentSummaryInformation")) )
+        {
+          src = gsf_infile_child_by_index (infile, i);
+          if (src != NULL)
+            ret = process(src,
+                          proc,
+                          proc_cls);
+        }
+      if (0 == strcmp(name, "SfxDocumentInfo"))
+        {
+          src = gsf_infile_child_by_index (infile, i);
+          if ( (src != NULL) && (ret == 0) )
+            ret = processSO(src,
+                            proc,
+                            proc_cls);
+        }
+      if (src != NULL)
+        g_object_unref(G_OBJECT(src));
+    }
+
+  data512 = (const unsigned char*) &data[512];
+  /* assemble little-endian values; the bytes are widened to unsigned
+     int first so that "<< 24" never shifts into the sign bit of int */
+  lid = (unsigned int) data512[6]
+    | ((unsigned int) data512[7] << 8);
+  lcb = (unsigned int) data512[726]
+    | ((unsigned int) data512[727] << 8)
+    | ((unsigned int) data512[728] << 16)
+    | ((unsigned int) data512[729] << 24);
+  fcb = (unsigned int) data512[722]
+    | ((unsigned int) data512[723] << 8)
+    | ((unsigned int) data512[724] << 16)
+    | ((unsigned int) data512[725] << 24);
+  lang = lidToLanguage(lid);
+  if ( (lang != NULL) && (ret == 0) )
+    ret = addKeyword(proc, proc_cls,
+                     lang,
+                     EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE);
+  /* history_extract starts reading strings at offset 6 of the table,
+     so smaller tables cannot contain any revision */
+  if (lcb >= 6)
+    {
+      for (i = 0; i < gsf_infile_num_children(infile); i++)
+        {
+          if (ret != 0)
+            break;
+          name = gsf_infile_name_by_index (infile, i);
+          if (name == NULL)
+            continue;
+          if ( (0 == strcmp(name, "1Table")) ||
+               (0 == strcmp(name, "0Table")) )
+            {
+              src = gsf_infile_child_by_index (infile, i);
+              if (src != NULL)
+                {
+                  ret = history_extract(src,
+                                        lcb,
+                                        fcb,
+                                        proc, proc_cls);
+                  g_object_unref(G_OBJECT(src));
+                }
+            }
+        }
+    }
+  g_object_unref(G_OBJECT(infile));
+  g_object_unref(G_OBJECT(input));
+  return ret;
+}
+
+
+/**
+ * Log handler that intentionally does nothing; all arguments are
+ * ignored.  Installed by ole2_ltdl_init for the "libgsf:msole" log
+ * domain.
+ *
+ * @param log_domain ignored
+ * @param log_level ignored
+ * @param message ignored
+ * @param user_data ignored
+ */
+static void
+nolog (const gchar *log_domain,
+ GLogLevelFlags log_level,
+ const gchar *message,
+ gpointer user_data) {
+}
+
+
+/**
+ * Constructor, run automatically when the plugin is loaded: sets up
+ * the GLib type system, initializes libgsf when HAVE_GSF_INIT is
+ * defined, and routes critical/warning messages of the "libgsf:msole"
+ * log domain to the no-op handler nolog.
+ */
+void __attribute__ ((constructor)) ole2_ltdl_init() {
+  /* log levels that are handed to the no-op handler */
+  const GLogLevelFlags silenced_levels
+    = G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING;
+
+  g_type_init();
+#ifdef HAVE_GSF_INIT
+  gsf_init();
+#endif
+  /* disable logging -- thanks, Jody! */
+  g_log_set_handler ("libgsf:msole", silenced_levels, nolog, NULL);
+}
+
+
+/**
+ * Destructor, run automatically when the plugin is unloaded: shuts
+ * libgsf down again, but only if HAVE_GSF_INIT is defined (the same
+ * condition under which ole2_ltdl_init called gsf_init).
+ */
+void __attribute__ ((destructor)) ole2_ltdl_fini() {
+#ifdef HAVE_GSF_INIT
+ gsf_shutdown();
+#endif
+}
+
+/* end of ole2_extractor.c */
+
diff --git a/src/plugins/pdf_extractor.cc b/src/plugins/pdf_extractor.cc
@@ -37,6 +37,7 @@
#include <poppler/Page.h>
#include <poppler/PDFDoc.h>
#include <poppler/Error.h>
+#include <poppler/GlobalParams.h>
#include <poppler/goo/GooString.h>
#define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0)
@@ -167,7 +168,11 @@ extern "C" {
BaseStream * stream;
int err;
- /* errorInit(); -- keep commented out, otherwise errors are printed to stderr for non-pdf files! */
+ if (globalParams == NULL)
+ {
+ globalParams = new GlobalParams();
+ globalParams->setErrQuiet (gTrue);
+ }
obj.initNull();
err = 0;
stream = new MemStream( (char*) data, 0, size, &obj);
diff --git a/src/plugins/rpm_extractor.c b/src/plugins/rpm_extractor.c
@@ -1,6 +1,6 @@
/*
This file is part of libextractor.
- (C) 2002, 2003, 2008 Vidyut Samanta and Christian Grothoff
+ (C) 2002, 2003, 2008, 2009 Vidyut Samanta and Christian Grothoff
libextractor is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published