libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 509826a244154dacbe1c95e81d314b95ed449f7d
parent aa056e44077dc7ed731cdfc1dcdcdd1d18b6004d
Author: Christian Grothoff <christian@grothoff.org>
Date:   Wed, 16 Dec 2009 14:14:01 +0000

ole2

Diffstat:
Mconfigure.ac | 1-
Msrc/include/extractor.h | 25++++++++++++-------------
Msrc/main/extractor_metatypes.c | 24++++++++++++++++++++++++
Msrc/plugins/Makefile.am | 15+++++++++++++--
Msrc/plugins/man_extractor.c | 2+-
Dsrc/plugins/ole2/INFO | 6------
Dsrc/plugins/ole2/Makefile.am | 16----------------
Dsrc/plugins/ole2/README | 25-------------------------
Dsrc/plugins/ole2/ole2extractor.c | 584-------------------------------------------------------------------------------
Asrc/plugins/ole2_extractor.c | 599+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/plugins/pdf_extractor.cc | 7++++++-
Msrc/plugins/rpm_extractor.c | 2+-
12 files changed, 656 insertions(+), 650 deletions(-)

diff --git a/configure.ac b/configure.ac @@ -556,7 +556,6 @@ src/intlemu/Makefile src/common/Makefile src/main/Makefile src/plugins/Makefile -src/plugins/ole2/Makefile src/plugins/oo/Makefile src/plugins/printable/Makefile src/plugins/hash/Makefile diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -237,13 +237,23 @@ enum EXTRACTOR_MetaType /* image specifics */ EXTRACTOR_METATYPE_IMAGE_DIMENSIONS = 112, - - EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE = 113, EXTRACTOR_METATYPE_THUMBNAIL = 114, EXTRACTOR_METATYPE_IMAGE_RESOLUTION = 115, EXTRACTOR_METATYPE_SOURCE = 116, + /* (text) document processing specifics */ + EXTRACTOR_METATYPE_CHARACTER_SET = 117, + EXTRACTOR_METATYPE_LINE_COUNT = 118, + EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 119, + EXTRACTOR_METATYPE_WORD_COUNT = 120, + EXTRACTOR_METATYPE_CHARACTER_COUNT = 121, + EXTRACTOR_METATYPE_PAGE_ORIENTATION = 122, + EXTRACTOR_METATYPE_PAPER_SIZE = 123, + EXTRACTOR_METATYPE_TEMPLATE = 124, + EXTRACTOR_METATYPE_COMPANY = 125, + EXTRACTOR_METATYPE_MANAGER = 126, + EXTRACTOR_METATYPE_REVISION_NUMBER = 127, /* fixme: used up to here! 
*/ EXTRACTOR_METATYPE_SCALE = 108, @@ -251,14 +261,6 @@ enum EXTRACTOR_MetaType /* FIXME: transcribe & renumber those below */ - /* (text) document processing specifics */ - EXTRACTOR_METATYPE_CHARACTER_SET = 104, - EXTRACTOR_METATYPE_LINE_COUNT = 105, - EXTRACTOR_METATYPE_PARAGRAPH_COUNT = 106, - EXTRACTOR_METATYPE_WORD_COUNT = 93, - EXTRACTOR_METATYPE_CHARACTER_COUNT = 94, - EXTRACTOR_METATYPE_PAGE_ORIENTATION = 35, - EXTRACTOR_METATYPE_PAPER_SIZE = 36, EXTRACTOR_METATYPE_USED_FONTS = 37, EXTRACTOR_METATYPE_PAGE_ORDER = 38, @@ -312,10 +314,7 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_OWNER = 66, EXTRACTOR_METATYPE_MEDIA_TYPE = 68, EXTRACTOR_METATYPE_CONTACT = 69, - EXTRACTOR_METATYPE_TEMPLATE = 88, EXTRACTOR_METATYPE_SECURITY = 97, - EXTRACTOR_METATYPE_COMPANY = 102, - EXTRACTOR_METATYPE_MANAGER = 109, EXTRACTOR_METATYPE_INFORMATION = 112, EXTRACTOR_METATYPE_FULL_NAME = 113, EXTRACTOR_METATYPE_LINK = 116, diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c @@ -294,6 +294,30 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { gettext_noop ("resolution in dots per inch") }, { gettext_noop ("source"), gettext_noop ("Originating entity") }, + { gettext_noop ("character set"), + gettext_noop ("character encoding used") }, + { gettext_noop ("line count"), + gettext_noop ("number of lines") }, + { gettext_noop ("paragraph count"), + gettext_noop ("number o paragraphs") }, + { gettext_noop ("word count"), + gettext_noop ("number of words") }, + { gettext_noop ("page orientation"), + gettext_noop ("") }, + { gettext_noop ("paper size"), + gettext_noop ("") }, + { gettext_noop ("template"), + gettext_noop ("template the document uses or is based on") }, + { gettext_noop ("company"), + gettext_noop ("") }, + { gettext_noop ("manager"), + gettext_noop ("") }, + { gettext_noop ("revision number"), + gettext_noop ("") }, + { gettext_noop (""), + gettext_noop ("") }, + { gettext_noop (""), + gettext_noop ("") }, { gettext_noop 
(""), gettext_noop ("") }, { gettext_noop (""), diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -13,7 +13,7 @@ endif if HAVE_GLIB if WITH_GSF - oledir=ole2 + ole2=libextractor_ole2.la endif if HAVE_GTK thumbgtk=libextractor_thumbnailgtk.la @@ -58,7 +58,7 @@ endif # toggle for development SUBDIRS = . -# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash $(oledir) +# SUBDIRS = . $(thumbgtk) $(thumbffmpeg) $(oodir) $(printdir) hash if HAVE_VORBISFILE @@ -95,6 +95,7 @@ plugin_LTLIBRARIES = \ libextractor_jpeg.la \ libextractor_man.la \ libextractor_mime.la \ + $(ole2) \ $(pdf) \ $(rpm) \ $(thumbgtk) @@ -176,6 +177,16 @@ libextractor_mime_la_SOURCES = \ libextractor_mime_la_LDFLAGS = \ $(PLUGINFLAGS) +libextractor_ole2_la_SOURCES = \ + ole2_extractor.c +libextractor_ole2_la_CFLAGS = \ + $(GSF_CFLAGS) +libextractor_ole2_la_LIBADD = \ + $(LIBADD) $(GSF_LIBS) \ + $(top_builddir)/src/common/libextractor_common.la +libextractor_ole2_la_LDFLAGS = \ + $(PLUGINFLAGS) + libextractor_pdf_la_SOURCES = \ pdf_extractor.cc libextractor_pdf_la_LDFLAGS = \ diff --git a/src/plugins/man_extractor.c b/src/plugins/man_extractor.c @@ -1,6 +1,6 @@ /* This file is part of libextractor. - (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff + (C) 2002, 2003, 2004, 2009 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published diff --git a/src/plugins/ole2/INFO b/src/plugins/ole2/INFO @@ -1,6 +0,0 @@ -Most of the code in this directory comes from -libgsf 1.10.1 (Licensed under GPL/LGPL). 
- -libgsf -- The G Structured File Library -Jody Goldberg <jody@gnome.org> - diff --git a/src/plugins/ole2/Makefile.am b/src/plugins/ole2/Makefile.am @@ -1,16 +0,0 @@ -include ../Makefile-plugins.am - - -plugin_LTLIBRARIES = \ - libextractor_ole2.la - -libextractor_ole2_la_CFLAGS = \ - $(GSF_CFLAGS) -libextractor_ole2_la_LIBADD = \ - $(LIBADD) $(GSF_LIBS) \ - $(top_builddir)/src/common/libextractor_common.la \ - $(top_builddir)/src/main/libextractor.la -libextractor_ole2_la_LDFLAGS = \ - $(PLUGINFLAGS) $(retaincommand) -libextractor_ole2_la_SOURCES = \ - ole2extractor.c diff --git a/src/plugins/ole2/README b/src/plugins/ole2/README @@ -1,25 +0,0 @@ -WordLeaker v.0.1 (c) 2005 - by Madelman (http://elligre.tk/madelman/) - -Shows information about a Word file. -It can show all the summary and the revision history of the file. - -It should be portable but, for now, it doesn't work in Linux. I haven't had -the time to debug it but I'll do when I can. - -There are a lot of things that don't work yet, if you want to help contact me. - -Copyright and License -===================== - -WordLeaker v.0.1 (c) 2005 by Madelman (madelman@iname.com) - -WordLeaker program is free software; you can redistribute it and/or modify it under -the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2 of the License, or (at your option) any -later version. - -This program is distributed in the hope that it will be useful, but WITHOUT ANY -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A -PARTICULAR PURPOSE. See the GNU General Public License for more details. - diff --git a/src/plugins/ole2/ole2extractor.c b/src/plugins/ole2/ole2extractor.c @@ -1,584 +0,0 @@ -/* - This file is part of libextractor. 
- (C) 2004, 2005, 2006, 2007 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - - This code makes extensive use of libgsf - -- the Gnome Structured File Library - Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org) - - Part of this code was borrowed from wordleaker.cpp. See also - the README file in this directory. 
-*/ - -#include "platform.h" -#include "extractor.h" -#include "convert.h" - -#include <glib-object.h> -#include <string.h> -#include <stdio.h> -#include <ctype.h> - -#include <gsf/gsf-utils.h> -#include <gsf/gsf-input-memory.h> -#include <gsf/gsf-infile.h> -#include <gsf/gsf-infile-msole.h> -#include <gsf/gsf-msole-utils.h> - -#define DEBUG_OLE2 0 - -/* ******************************** main extraction code ************************ */ - -static struct EXTRACTOR_Keywords * -addKeyword(EXTRACTOR_KeywordList *oldhead, - const char *phrase, - EXTRACTOR_KeywordType type) { - EXTRACTOR_KeywordList * keyword; - - if (strlen(phrase) == 0) - return oldhead; - if (0 == strcmp(phrase, "\"\"")) - return oldhead; - if (0 == strcmp(phrase, "\" \"")) - return oldhead; - if (0 == strcmp(phrase, " ")) - return oldhead; - keyword = malloc(sizeof(EXTRACTOR_KeywordList)); - keyword->next = oldhead; - keyword->keyword = strdup(phrase); - keyword->keywordType = type; - return keyword; -} - - -#if 0 -static guint8 const component_guid [] = { - 0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10, - 0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9 -}; - -static guint8 const document_guid [] = { - 0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10, - 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae -}; - -static guint8 const user_guid [] = { - 0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10, - 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae -}; -#endif - -typedef struct { - char * text; - EXTRACTOR_KeywordType type; -} Matches; - -static Matches tmap[] = { - { "Title", EXTRACTOR_TITLE }, - { "PresentationFormat", EXTRACTOR_FORMAT }, - { "Category", EXTRACTOR_DESCRIPTION }, - { "Manager", EXTRACTOR_MANAGER }, - { "Company", EXTRACTOR_COMPANY }, - { "Subject", EXTRACTOR_SUBJECT }, - { "Author", EXTRACTOR_AUTHOR }, - { "Keywords", EXTRACTOR_KEYWORDS }, - { "Comments", EXTRACTOR_COMMENT }, - { "Template", EXTRACTOR_TEMPLATE }, - { "NumPages", EXTRACTOR_PAGE_COUNT }, - { "AppName", EXTRACTOR_SOFTWARE }, - { 
"RevisionNumber", EXTRACTOR_VERSIONNUMBER }, - { "Dictionary", EXTRACTOR_LANGUAGE }, - { "NumBytes", EXTRACTOR_SIZE }, - { "CreatedTime", EXTRACTOR_CREATION_DATE }, - { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE }, - { "gsf:company", EXTRACTOR_COMPANY }, - /* { "gsf:security", EXTRACTOR_SECURITY }, */ - { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT }, - { "gsf:page-count", EXTRACTOR_PAGE_COUNT }, - { "gsf:line-count", EXTRACTOR_LINE_COUNT }, - { "gsf:word-count", EXTRACTOR_WORD_COUNT }, - { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT }, - { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY }, - /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ - { "gsf:manager", EXTRACTOR_MANAGER }, - { "dc:title", EXTRACTOR_TITLE }, - { "dc:creator", EXTRACTOR_CREATOR }, - { "dc:date", EXTRACTOR_DATE }, - { "dc:subject", EXTRACTOR_SUBJECT }, - { "dc:keywords", EXTRACTOR_KEYWORDS }, - { "dc:last-printed", EXTRACTOR_LAST_PRINTED }, - { "dc:description", EXTRACTOR_DESCRIPTION }, - { "meta:creation-date", EXTRACTOR_CREATION_DATE }, - /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */ - { "meta:generator", EXTRACTOR_GENERATOR }, - { "meta:template", EXTRACTOR_TEMPLATE }, - /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */ - /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */ - { NULL, 0 }, -}; - -static void processMetadata(gpointer key, - gpointer value, - gpointer user_data) { - struct EXTRACTOR_Keywords ** pprev = user_data; - const char * type = key; - const GsfDocProp * prop = value; - const GValue * gval; - char * contents; - int pos; - - if ( (key == NULL) || - (value == NULL) ) - return; - gval = gsf_doc_prop_get_val(prop); - - if (G_VALUE_TYPE(gval) == G_TYPE_STRING) { - contents = strdup(g_value_get_string(gval)); - } else { - /* convert other formats? 
*/ - contents = g_strdup_value_contents(gval); - } - if (contents == NULL) - return; - if ( (strlen(contents) > 0) && - (contents[strlen(contents)-1] == '\n') ) - contents[strlen(contents)-1] = '\0'; - pos = 0; - while (tmap[pos].text != NULL) { - if (0 == strcmp(tmap[pos].text, - type)) - break; - pos++; - } - if (tmap[pos].text != NULL) - *pprev = addKeyword(*pprev, - contents, - tmap[pos].type); -#if DEBUG_OLE2 - else - printf("No match for type `%s'\n", - type); -#endif - free(contents); -} - - -static struct EXTRACTOR_Keywords * -process(GsfInput * in, - struct EXTRACTOR_Keywords * prev) { - GsfDocMetaData * sections; - GError * error; - - sections = gsf_doc_meta_data_new(); - error = gsf_msole_metadata_read(in, sections); - if (error == NULL) { - gsf_doc_meta_data_foreach(sections, - &processMetadata, - &prev); - } - g_object_unref(G_OBJECT(sections)); - return prev; -} - -static struct EXTRACTOR_Keywords * -processSO(GsfInput * src, - struct EXTRACTOR_Keywords * prev) { - off_t size; - char * buf; - - size = gsf_input_size(src); - if (size < 0x374) /* == 0x375?? */ - return prev; - buf = malloc(size); - gsf_input_read(src, size, (unsigned char*) buf); - if ( (buf[0] != 0x0F) || - (buf[1] != 0x0) || - (0 != strncmp(&buf[2], - "SfxDocumentInfo", - strlen("SfxDocumentInfo"))) || - (buf[0x11] != 0x0B) || - (buf[0x13] != 0x00) || /* pw protected! 
*/ - (buf[0x12] != 0x00) ) { - free(buf); - return prev; - } - buf[0xd3] = '\0'; - if (buf[0x94] + buf[0x93] > 0) - prev = addKeyword(prev, - &buf[0x95], - EXTRACTOR_TITLE); - buf[0x114] = '\0'; - if (buf[0xd5] + buf[0xd4] > 0) - prev = addKeyword(prev, - &buf[0xd6], - EXTRACTOR_SUBJECT); - buf[0x215] = '\0'; - if (buf[0x115] + buf[0x116] > 0) - prev = addKeyword(prev, - &buf[0x117], - EXTRACTOR_COMMENT); - buf[0x296] = '\0'; - if (buf[0x216] + buf[0x217] > 0) - prev = addKeyword(prev, - &buf[0x218], - EXTRACTOR_KEYWORDS); - /* fixme: do timestamps, - mime-type, user-defined info's */ - - free(buf); - return prev; -} - -/* *************** wordleaker stuff *************** */ - -#define __(a) dgettext("iso-639", a) - -static const char * lidToLanguage( unsigned int lid ) { - switch ( lid ) { - case 0x0400: - return _("No Proofing"); - case 0x0401: - return __("Arabic"); - case 0x0402: - return __("Bulgarian"); - case 0x0403: - return __("Catalan"); - case 0x0404: - return _("Traditional Chinese"); - case 0x0804: - return _("Simplified Chinese"); - case 0x0405: - return __("Chechen"); - case 0x0406: - return __("Danish"); - case 0x0407: - return __("German"); - case 0x0807: - return _("Swiss German"); - case 0x0408: - return __("Greek"); - case 0x0409: - return _("U.S. English"); - case 0x0809: - return _("U.K. 
English"); - case 0x0c09: - return _("Australian English"); - case 0x040a: - return _("Castilian Spanish"); - case 0x080a: - return _("Mexican Spanish"); - case 0x040b: - return __("Finnish"); - case 0x040c: - return __("French"); - case 0x080c: - return _("Belgian French"); - case 0x0c0c: - return _("Canadian French"); - case 0x100c: - return _("Swiss French"); - case 0x040d: - return __("Hebrew"); - case 0x040e: - return __("Hungarian"); - case 0x040f: - return __("Icelandic"); - case 0x0410: - return __("Italian"); - case 0x0810: - return _("Swiss Italian"); - case 0x0411: - return __("Japanese"); - case 0x0412: - return __("Korean"); - case 0x0413: - return __("Dutch"); - case 0x0813: - return _("Belgian Dutch"); - case 0x0414: - return _("Norwegian Bokmal"); - case 0x0814: - return __("Norwegian Nynorsk"); - case 0x0415: - return __("Polish"); - case 0x0416: - return __("Brazilian Portuguese"); - case 0x0816: - return __("Portuguese"); - case 0x0417: - return _("Rhaeto-Romanic"); - case 0x0418: - return __("Romanian"); - case 0x0419: - return __("Russian"); - case 0x041a: - return _("Croato-Serbian (Latin)"); - case 0x081a: - return _("Serbo-Croatian (Cyrillic)"); - case 0x041b: - return __("Slovak"); - case 0x041c: - return __("Albanian"); - case 0x041d: - return __("Swedish"); - case 0x041e: - return __("Thai"); - case 0x041f: - return __("Turkish"); - case 0x0420: - return __("Urdu"); - case 0x0421: - return __("Bahasa"); - case 0x0422: - return __("Ukrainian"); - case 0x0423: - return __("Byelorussian"); - case 0x0424: - return __("Slovenian"); - case 0x0425: - return __("Estonian"); - case 0x0426: - return __("Latvian"); - case 0x0427: - return __("Lithuanian"); - case 0x0429: - return _("Farsi"); - case 0x042D: - return __("Basque"); - case 0x042F: - return __("Macedonian"); - case 0x0436: - return __("Afrikaans"); - case 0x043E: - return __("Malayalam"); - default: - return NULL; - } -} - - -static struct EXTRACTOR_Keywords * -history_extract(GsfInput * 
stream, - unsigned int lcbSttbSavedBy, - unsigned int fcSttbSavedBy, - struct EXTRACTOR_Keywords * prev) { - unsigned int where = 0; - unsigned char * lbuffer; - unsigned int i; - unsigned int length; - char * author; - char * filename; - char * rbuf; - unsigned int nRev; - - // goto offset of revision - gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET); - if (gsf_input_remaining(stream) < lcbSttbSavedBy) - return prev; - lbuffer = malloc(lcbSttbSavedBy); - // read all the revision history - gsf_input_read(stream, lcbSttbSavedBy, lbuffer); - // there are n strings, so n/2 revisions (author & file) - nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; - where = 6; - for (i=0; i < nRev; i++) { - if (where >= lcbSttbSavedBy) - break; - length = lbuffer[where++]; - if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || - (where + 2 * length + 2 <= where) ) - break; - author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where], - length * 2, - "UTF-16BE"); - where += length * 2 + 1; - length = lbuffer[where++]; - if ( (where + 2 * length >= lcbSttbSavedBy) || - (where + 2 * length + 1 <= where) ) { - free(author); - break; - } - filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where], - length * 2, - "UTF-16BE"); - where += length * 2 + 1; - rbuf = malloc(strlen(author) + strlen(filename) + 512); - snprintf(rbuf, 512 + strlen(author) + strlen(filename), - _("Revision #%u: Author '%s' worked on '%s'"), - i, author, filename); - free(author); - free(filename); - prev = addKeyword(prev, - rbuf, - EXTRACTOR_REVISION_HISTORY); - free(rbuf); - } - free(lbuffer); - return prev; -} - - -/* ************** main method *********** */ - -struct EXTRACTOR_Keywords * -libextractor_ole2_extract(const char * filename, - const char * data, - size_t size, - struct EXTRACTOR_Keywords * prev) { - GsfInput * input; - GsfInfile * infile; - GsfInput * src; - const char * name; - const char * generator = NULL; - int i; - unsigned int lcb; - unsigned int fcb; - const unsigned 
char * data512; - unsigned int lid; - const char * lang; - - if (size < 512 + 898) - return prev; /* can hardly be OLE2 */ - input = gsf_input_memory_new((const guint8 *) data, - (gsf_off_t) size, - FALSE); - if (input == NULL) - return prev; - - infile = gsf_infile_msole_new(input, NULL); - if (infile == NULL) { - g_object_unref(G_OBJECT(input)); - return prev; - } - lcb = 0; - fcb = 0; - for (i=0;i<gsf_infile_num_children(infile);i++) { - name = gsf_infile_name_by_index (infile, i); - src = NULL; - if (name == NULL) - continue; - if ( (0 == strcmp(name, "\005SummaryInformation")) - || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) { - src = gsf_infile_child_by_index (infile, i); - if (src != NULL) - prev = process(src, - prev); - } - if (0 == strcmp(name, "SfxDocumentInfo")) { - src = gsf_infile_child_by_index (infile, i); - if (src != NULL) - prev = processSO(src, - prev); - } - if (src != NULL) - g_object_unref(G_OBJECT(src)); - } - - data512 = (const unsigned char*) &data[512]; - lid = data512[6] + (data512[7] << 8); - lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24); - fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24); - lang = lidToLanguage(lid); - if (lang != NULL) { - prev = addKeyword(prev, - lang, - EXTRACTOR_LANGUAGE); - } - if (lcb >= 6) { - for (i=0;i<gsf_infile_num_children(infile);i++) { - name = gsf_infile_name_by_index (infile, i); - if (name == NULL) - continue; - if ( (0 == strcmp(name, "1Table")) || - (0 == strcmp(name, "0Table")) ) { - src = gsf_infile_child_by_index (infile, i); - if (src != NULL) { - prev = history_extract(src, - lcb, - fcb, - prev); - g_object_unref(G_OBJECT(src)); - } - } - } - } - g_object_unref(G_OBJECT(infile)); - g_object_unref(G_OBJECT(input)); - - /* - * Hack to return an appropriate mimetype - */ - generator = EXTRACTOR_extractLast(EXTRACTOR_GENERATOR, prev); - if (NULL == generator) { - /* - * when very puzzled, just look at file 
magic number - */ - if ( (8 < size) - && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) ) - generator = "Microsoft Office"; - } - - if(NULL != generator) { - const char * mimetype = "application/vnd.ms-files"; - - if((0 == strncmp(generator, "Microsoft Word", 14)) || - (0 == strncmp(generator, "Microsoft Office Word", 21))) - mimetype = "application/msword"; - else if((0 == strncmp(generator, "Microsoft Excel", 15)) || - (0 == strncmp(generator, "Microsoft Office Excel", 22))) - mimetype = "application/vnd.ms-excel"; - else if((0 == strncmp(generator, "Microsoft PowerPoint", 20)) || - (0 == strncmp(generator, "Microsoft Office PowerPoint", 27))) - mimetype = "application/vnd.ms-powerpoint"; - else if(0 == strncmp(generator, "Microsoft Project", 17)) - mimetype = "application/vnd.ms-project"; - else if(0 == strncmp(generator, "Microsoft Visio", 15)) - mimetype = "application/vnd.visio"; - else if(0 == strncmp(generator, "Microsoft Office", 16)) - mimetype = "application/vnd.ms-office"; - - prev = addKeyword(prev, mimetype, EXTRACTOR_MIMETYPE); - } - - return prev; -} -static void nolog (const gchar *log_domain, - GLogLevelFlags log_level, - const gchar *message, - gpointer user_data) { -} - -void __attribute__ ((constructor)) ole2_ltdl_init() { - g_type_init(); -#ifdef HAVE_GSF_INIT - gsf_init(); -#endif - /* disable logging -- thanks, Jody! */ - g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, &nolog, NULL); - // gsf_init_dynamic(NULL); -} - -void __attribute__ ((destructor)) ole2_ltdl_fini() { -#ifdef HAVE_GSF_INIT - gsf_shutdown(); -#endif - // gsf_shutdown_dynamic(NULL); -} - -/* end of ole2extractor.c */ - diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c @@ -0,0 +1,599 @@ +/* + This file is part of libextractor. 
+ (C) 2004, 2005, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + + This code makes extensive use of libgsf + -- the Gnome Structured File Library + Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org) + + Part of this code was borrowed from wordleaker.cpp. See also + the README file in this directory. 
+*/ + +#include "platform.h" +#include "extractor.h" +#include "convert.h" + +#include <glib-object.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include <gsf/gsf-utils.h> +#include <gsf/gsf-input-memory.h> +#include <gsf/gsf-infile.h> +#include <gsf/gsf-infile-msole.h> +#include <gsf/gsf-msole-utils.h> + +#define DEBUG_OLE2 0 + +/* ******************************** main extraction code ************************ */ + +static int +addKeyword(EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *phrase, + enum EXTRACTOR_MetaType type) { + if (strlen(phrase) == 0) + return 0; + if (0 == strcmp(phrase, "\"\"")) + return 0; + if (0 == strcmp(phrase, "\" \"")) + return 0; + if (0 == strcmp(phrase, " ")) + return 0; + return proc (proc_cls, + "ole2", + type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + phrase, + strlen (phrase) +1); +} + +typedef struct { + char * text; + enum EXTRACTOR_MetaType type; +} Matches; + +static Matches tmap[] = { + { "Title", EXTRACTOR_METATYPE_TITLE }, + { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT }, + { "Category", EXTRACTOR_METATYPE_SECTION }, + { "Manager", EXTRACTOR_METATYPE_MANAGER }, + { "Company", EXTRACTOR_METATYPE_COMPANY }, + { "Subject", EXTRACTOR_METATYPE_SUBJECT }, + { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME }, + { "Keywords", EXTRACTOR_METATYPE_KEYWORDS }, + { "Comments", EXTRACTOR_METATYPE_COMMENT }, + { "Template", EXTRACTOR_METATYPE_TEMPLATE }, + { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT }, + { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE }, + { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER }, + { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE }, + { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE }, + { "LastSavedTime" , EXTRACTOR_METATYPE_MODIFICATION_DATE }, + { "gsf:company", EXTRACTOR_METATYPE_COMPANY }, + { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT }, + { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT }, + { "gsf:line-count", 
EXTRACTOR_METATYPE_LINE_COUNT }, + { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT }, + { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT }, + { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY }, + { "gsf:manager", EXTRACTOR_METATYPE_MANAGER }, + { "dc:title", EXTRACTOR_METATYPE_TITLE }, + { "dc:creator", EXTRACTOR_METATYPE_CREATOR }, + { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, + { "dc:subject", EXTRACTOR_METATYPE_SUBJECT }, + { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS }, + { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED }, + { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION }, + { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE }, + { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, + { "meta:template", EXTRACTOR_METATYPE_TEMPLATE }, + { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES }, + /* { "Dictionary", EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE }, */ + /* { "gsf:security", EXTRACTOR_SECURITY }, */ + /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ + /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */ + /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */ + { NULL, 0 } +}; + + +struct ProcContext +{ + EXTRACTOR_MetaDataProcessor proc; + void *proc_cls; + int ret; +}; + + +static void processMetadata(gpointer key, + gpointer value, + gpointer user_data) { + struct ProcContext *pc = user_data; + const char * type = key; + const GsfDocProp * prop = value; + const GValue * gval; + char * contents; + int pos; + + if ( (key == NULL) || + (value == NULL) ) + return; + if (pc->ret != 0) + return; + gval = gsf_doc_prop_get_val(prop); + + if (G_VALUE_TYPE(gval) == G_TYPE_STRING) + { + contents = strdup(g_value_get_string(gval)); + } + else + { + /* convert other formats? 
*/ + contents = g_strdup_value_contents(gval); + } + if (contents == NULL) + return; + if ( (strlen(contents) > 0) && + (contents[strlen(contents)-1] == '\n') ) + contents[strlen(contents)-1] = '\0'; + pos = 0; + while (tmap[pos].text != NULL) + { + if (0 == strcmp(tmap[pos].text, + type)) + break; + pos++; + } + if (0 == strcmp (type, "meta:generator")) + { + const char * mimetype = "application/vnd.ms-files"; + if((0 == strncmp(value, "Microsoft Word", 14)) || + (0 == strncmp(value, "Microsoft Office Word", 21))) + mimetype = "application/msword"; + else if((0 == strncmp(value, "Microsoft Excel", 15)) || + (0 == strncmp(value, "Microsoft Office Excel", 22))) + mimetype = "application/vnd.ms-excel"; + else if((0 == strncmp(value, "Microsoft PowerPoint", 20)) || + (0 == strncmp(value, "Microsoft Office PowerPoint", 27))) + mimetype = "application/vnd.ms-powerpoint"; + else if(0 == strncmp(value, "Microsoft Project", 17)) + mimetype = "application/vnd.ms-project"; + else if(0 == strncmp(value, "Microsoft Visio", 15)) + mimetype = "application/vnd.visio"; + else if(0 == strncmp(value, "Microsoft Office", 16)) + mimetype = "application/vnd.ms-office"; + + if (0 != addKeyword(pc->proc, + pc->proc_cls, mimetype, EXTRACTOR_METATYPE_MIMETYPE)) + { + free (contents); + pc->ret = 1; + return; + } + } + if (tmap[pos].text != NULL) + { + if (0 != addKeyword(pc->proc, pc->proc_cls, + contents, + tmap[pos].type)) + { + free (contents); + pc->ret = 1; + return; + } + } +#if DEBUG_OLE2 + else + printf("No match for type `%s'\n", + type); +#endif + free(contents); +} + + +static int +process(GsfInput * in, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + struct ProcContext pc; + GsfDocMetaData * sections; + GError * error; + + pc.proc = proc; + pc.proc_cls = proc_cls; + pc.ret = 0; + sections = gsf_doc_meta_data_new(); + error = gsf_msole_metadata_read(in, sections); + if (error == NULL) { + gsf_doc_meta_data_foreach(sections, + &processMetadata, + &pc); + } + 
g_object_unref(G_OBJECT(sections)); + return pc.ret; +} + +static int +processSO(GsfInput * src, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) { + off_t size = gsf_input_size(src); + if ( (size < 0x374) || (size > 4*1024*1024) ) /* == 0x375?? */ + return 0; + char buf[size]; + gsf_input_read(src, size, (unsigned char*) buf); + if ( (buf[0] != 0x0F) || + (buf[1] != 0x0) || + (0 != strncmp(&buf[2], + "SfxDocumentInfo", + strlen("SfxDocumentInfo"))) || + (buf[0x11] != 0x0B) || + (buf[0x13] != 0x00) || /* pw protected! */ + (buf[0x12] != 0x00) ) + return 0; + buf[0xd3] = '\0'; + if (buf[0x94] + buf[0x93] > 0) + if (0 != addKeyword(proc, proc_cls, + &buf[0x95], + EXTRACTOR_METATYPE_TITLE)) + return 1; + buf[0x114] = '\0'; + if (buf[0xd5] + buf[0xd4] > 0) + if (0 != addKeyword(proc, proc_cls, + &buf[0xd6], + EXTRACTOR_METATYPE_SUBJECT)) + return 1; + buf[0x215] = '\0'; + if (buf[0x115] + buf[0x116] > 0) + if (0 != addKeyword(proc, proc_cls, + &buf[0x117], + EXTRACTOR_METATYPE_COMMENT)) + return 1; + buf[0x296] = '\0'; + if (buf[0x216] + buf[0x217] > 0) + if (0 != addKeyword(proc, proc_cls, + &buf[0x218], + EXTRACTOR_METATYPE_KEYWORDS)) + return 1; + /* fixme: do timestamps, + mime-type, user-defined info's */ + return 0; +} + +/* *************** wordleaker stuff *************** */ + +#define __(a) dgettext("iso-639", a) + +static const char * lidToLanguage( unsigned int lid ) { + switch ( lid ) { + case 0x0400: + return _("No Proofing"); + case 0x0401: + return __("Arabic"); + case 0x0402: + return __("Bulgarian"); + case 0x0403: + return __("Catalan"); + case 0x0404: + return _("Traditional Chinese"); + case 0x0804: + return _("Simplified Chinese"); + case 0x0405: + return __("Chechen"); + case 0x0406: + return __("Danish"); + case 0x0407: + return __("German"); + case 0x0807: + return _("Swiss German"); + case 0x0408: + return __("Greek"); + case 0x0409: + return _("U.S. English"); + case 0x0809: + return _("U.K. 
English"); + case 0x0c09: + return _("Australian English"); + case 0x040a: + return _("Castilian Spanish"); + case 0x080a: + return _("Mexican Spanish"); + case 0x040b: + return __("Finnish"); + case 0x040c: + return __("French"); + case 0x080c: + return _("Belgian French"); + case 0x0c0c: + return _("Canadian French"); + case 0x100c: + return _("Swiss French"); + case 0x040d: + return __("Hebrew"); + case 0x040e: + return __("Hungarian"); + case 0x040f: + return __("Icelandic"); + case 0x0410: + return __("Italian"); + case 0x0810: + return _("Swiss Italian"); + case 0x0411: + return __("Japanese"); + case 0x0412: + return __("Korean"); + case 0x0413: + return __("Dutch"); + case 0x0813: + return _("Belgian Dutch"); + case 0x0414: + return _("Norwegian Bokmal"); + case 0x0814: + return __("Norwegian Nynorsk"); + case 0x0415: + return __("Polish"); + case 0x0416: + return __("Brazilian Portuguese"); + case 0x0816: + return __("Portuguese"); + case 0x0417: + return _("Rhaeto-Romanic"); + case 0x0418: + return __("Romanian"); + case 0x0419: + return __("Russian"); + case 0x041a: + return _("Croato-Serbian (Latin)"); + case 0x081a: + return _("Serbo-Croatian (Cyrillic)"); + case 0x041b: + return __("Slovak"); + case 0x041c: + return __("Albanian"); + case 0x041d: + return __("Swedish"); + case 0x041e: + return __("Thai"); + case 0x041f: + return __("Turkish"); + case 0x0420: + return __("Urdu"); + case 0x0421: + return __("Bahasa"); + case 0x0422: + return __("Ukrainian"); + case 0x0423: + return __("Byelorussian"); + case 0x0424: + return __("Slovenian"); + case 0x0425: + return __("Estonian"); + case 0x0426: + return __("Latvian"); + case 0x0427: + return __("Lithuanian"); + case 0x0429: + return _("Farsi"); + case 0x042D: + return __("Basque"); + case 0x042F: + return __("Macedonian"); + case 0x0436: + return __("Afrikaans"); + case 0x043E: + return __("Malayalam"); + default: + return NULL; + } +}
+
+
+/**
+ * Report the revision history ("saved by" table) found in a stream.
+ *
+ * Reads lcbSttbSavedBy bytes at offset fcSttbSavedBy of the stream.
+ * The entry count is taken from bytes 2-3 of the table (little endian)
+ * and halved, because each revision consists of two strings (author and
+ * file name).  Each string is a one-byte length (counted in 2-byte
+ * characters) followed by the characters, which are converted from
+ * "UTF-16BE"; one further byte is skipped after each string.  Every
+ * revision is passed to addKeyword as
+ * EXTRACTOR_METATYPE_REVISION_HISTORY.
+ *
+ * Parsing stops silently at the first truncated entry, failed read or
+ * failed allocation; keywords already reported are kept.
+ *
+ * @param stream stream to read the table from
+ * @param lcbSttbSavedBy size of the table in bytes
+ * @param fcSttbSavedBy offset of the table inside the stream
+ * @param proc function to call for each revision found
+ * @param proc_cls closure for proc
+ * @return 0 if nothing stopped the extraction, otherwise the non-zero
+ *         value returned by addKeyword
+ */
+static int
+history_extract(GsfInput * stream,
+                unsigned int lcbSttbSavedBy,
+                unsigned int fcSttbSavedBy,
+                EXTRACTOR_MetaDataProcessor proc,
+                void *proc_cls)
+{
+  unsigned int where;
+  unsigned char * lbuffer;
+  unsigned int i;
+  unsigned int length;
+  char * author;
+  char * filename;
+  char * rbuf;
+  size_t rbuf_size;
+  unsigned int nRev;
+  int ret;
+
+  /* bytes 2 and 3 of the table are read unconditionally below and the
+     first string starts at offset 6 */
+  if (lcbSttbSavedBy < 6)
+    return 0;
+  /* goto offset of revision; gsf_input_seek reports failure as TRUE */
+  if (gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET))
+    return 0;
+  if (gsf_input_remaining(stream) < lcbSttbSavedBy)
+    return 0;
+  lbuffer = malloc(lcbSttbSavedBy);
+  if (lbuffer == NULL)
+    return 0;
+  /* read all the revision history; never parse a buffer that was not
+     actually filled */
+  if (NULL == gsf_input_read(stream, lcbSttbSavedBy, lbuffer)) {
+    free(lbuffer);
+    return 0;
+  }
+  /* there are n strings, so n/2 revisions (author & file) */
+  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
+  where = 6;
+  ret = 0;
+  for (i=0; i < nRev; i++) {
+    if (where >= lcbSttbSavedBy)
+      break;
+    length = lbuffer[where++];
+    /* the author string, its trailing byte and the length byte of the
+       file name must all be inside the buffer */
+    if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
+         (where + 2 * length + 2 <= where) )
+      break;
+    author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
+                                              length * 2,
+                                              "UTF-16BE");
+    /* NOTE(review): assumes the converter signals failure with NULL;
+       without this check strlen() below would dereference it */
+    if (author == NULL)
+      break;
+    where += length * 2 + 1;
+    length = lbuffer[where++];
+    if ( (where + 2 * length >= lcbSttbSavedBy) ||
+         (where + 2 * length + 1 <= where) ) {
+      free(author);
+      break;
+    }
+    filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where],
+                                                length * 2,
+                                                "UTF-16BE");
+    if (filename == NULL) {
+      free(author);
+      break;
+    }
+    where += length * 2 + 1;
+    /* 512 bytes of slack for the (translated) format text and number */
+    rbuf_size = strlen(author) + strlen(filename) + 512;
+    rbuf = malloc(rbuf_size);
+    if (rbuf == NULL) {
+      free(author);
+      free(filename);
+      break;
+    }
+    snprintf(rbuf, rbuf_size,
+             _("Revision #%u: Author '%s' worked on '%s'"),
+             i, author, filename);
+    free(author);
+    free(filename);
+    ret = addKeyword(proc, proc_cls,
+                     rbuf,
+                     EXTRACTOR_METATYPE_REVISION_HISTORY);
+    free(rbuf);
+    if (0 != ret)
+      break;
+  }
+  free(lbuffer);
+  return ret;
+}
+
+
+/**
+ * Main entry point of the OLE2 plugin.
+ *
+ * Opens the buffer as an OLE2 compound file with libgsf and
+ *  - runs process() on the "\005SummaryInformation" and
+ *    "\005DocumentSummaryInformation" children,
+ *  - runs processSO() on the "SfxDocumentInfo" child,
+ *  - maps the 16-bit value at offset 512 + 6 through lidToLanguage()
+ *    and reports a match as EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE,
+ *  - reads two 32-bit little-endian values at offsets 512 + 722
+ *    (offset) and 512 + 726 (size) and, if the size is at least 6,
+ *    runs history_extract() on the "1Table" / "0Table" children.
+ * NOTE(review): offset 512 presumably is the start of the Word file
+ * information block; this is not checked here -- confirm.
+ *
+ * @param data the file contents
+ * @param size number of bytes in data
+ * @param proc function to call for each meta data item found
+ * @param proc_cls closure for proc
+ * @param options unused by this function
+ * @return 0 if extraction ran to completion (or the data cannot be
+ *         OLE2), otherwise the first non-zero value returned by one of
+ *         the helpers
+ */
+int
+EXTRACTOR_ole2_extract (const char *data,
+                        size_t size,
+                        EXTRACTOR_MetaDataProcessor proc,
+                        void *proc_cls,
+                        const char *options)
+{
+  GsfInput * input;
+  GsfInfile * infile;
+  GsfInput * src;
+  const char * name;
+  int i;
+  unsigned int lcb;
+  unsigned int fcb;
+  const unsigned char * data512;
+  unsigned int lid;
+  const char * lang;
+  int ret;
+
+  ret = 0;
+  /* also guarantees that every data512[] index used below is valid */
+  if (size < 512 + 898)
+    return 0; /* can hardly be OLE2 */
+  input = gsf_input_memory_new((const guint8 *) data,
+                               (gsf_off_t) size,
+                               FALSE);
+  if (input == NULL)
+    return 0;
+
+  infile = gsf_infile_msole_new(input, NULL);
+  if (infile == NULL) {
+    g_object_unref(G_OBJECT(input));
+    return 0;
+  }
+  for (i=0;i<gsf_infile_num_children(infile);i++) {
+    name = gsf_infile_name_by_index (infile, i);
+    src = NULL;
+    if (ret != 0)
+      break;
+    if (name == NULL)
+      continue;
+    if ( (0 == strcmp(name, "\005SummaryInformation"))
+         || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
+      src = gsf_infile_child_by_index (infile, i);
+      if (src != NULL)
+        ret = process(src,
+                      proc,
+                      proc_cls);
+    }
+    if (0 == strcmp(name, "SfxDocumentInfo")) {
+      src = gsf_infile_child_by_index (infile, i);
+      if ( (src != NULL) && (ret == 0) )
+        ret = processSO(src,
+                        proc,
+                        proc_cls);
+    }
+    if (src != NULL)
+      g_object_unref(G_OBJECT(src));
+  }
+
+  data512 = (const unsigned char*) &data[512];
+  lid = data512[6] + (data512[7] << 8);
+  /* cast before shifting: an unsigned char is promoted to (signed) int
+     and shifting a byte >= 0x80 left by 24 would overflow it */
+  lcb = (unsigned int) data512[726]
+    + ((unsigned int) data512[727] << 8)
+    + ((unsigned int) data512[728] << 16)
+    + ((unsigned int) data512[729] << 24);
+  fcb = (unsigned int) data512[722]
+    + ((unsigned int) data512[723] << 8)
+    + ((unsigned int) data512[724] << 16)
+    + ((unsigned int) data512[725] << 24);
+  lang = lidToLanguage(lid);
+  if ( (lang != NULL) && (ret == 0) )
+    ret = addKeyword(proc, proc_cls,
+                     lang,
+                     EXTRACTOR_METATYPE_DOCUMENT_LANGUAGE);
+  if (lcb >= 6) {
+    for (i=0;i<gsf_infile_num_children(infile);i++) {
+      if (ret != 0)
+        break;
+      name = gsf_infile_name_by_index (infile, i);
+      if (name == NULL)
+        continue;
+      if ( (0 == strcmp(name, "1Table")) ||
+           (0 == strcmp(name, "0Table")) ) {
+        src = gsf_infile_child_by_index (infile, i);
+        if (src != NULL) {
+          ret = history_extract(src,
+                                lcb,
+                                fcb,
+                                proc, proc_cls);
+          g_object_unref(G_OBJECT(src));
+        }
+      }
+    }
+  }
+  g_object_unref(G_OBJECT(infile));
+  g_object_unref(G_OBJECT(input));
+  return ret;
+}
+
+
+/**
+ * GLib log handler that discards the message (empty body).
+ *
+ * @param log_domain ignored
+ * @param log_level ignored
+ * @param message ignored
+ * @param user_data ignored
+ */
+static void
+nolog (const gchar *log_domain,
+       GLogLevelFlags log_level,
+       const gchar *message,
+       gpointer user_data) {
+}
+
+
+/**
+ * Constructor, run when the plugin is loaded: calls g_type_init(),
+ * calls gsf_init() if HAVE_GSF_INIT is defined, and installs nolog as
+ * the handler for critical and warning messages of the "libgsf:msole"
+ * log domain.
+ */
+void __attribute__ ((constructor)) ole2_ltdl_init() {
+  g_type_init();
+#ifdef HAVE_GSF_INIT
+  gsf_init();
+#endif
+  /* disable logging -- thanks, Jody! */
+  g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, &nolog, NULL);
+}
+
+
+/**
+ * Destructor, run when the plugin is unloaded: calls gsf_shutdown()
+ * if HAVE_GSF_INIT is defined.
+ */
+void __attribute__ ((destructor)) ole2_ltdl_fini() {
+#ifdef HAVE_GSF_INIT
+  gsf_shutdown();
+#endif
+}
+
+/* end of ole2_extractor.c */
+
diff --git a/src/plugins/pdf_extractor.cc b/src/plugins/pdf_extractor.cc @@ -37,6 +37,7 @@ #include <poppler/Page.h> #include <poppler/PDFDoc.h> #include <poppler/Error.h> +#include <poppler/GlobalParams.h> #include <poppler/goo/GooString.h> #define ADD(s, type) do { if (0!=proc(proc_cls, "pdf", type, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1)) { err = 1; goto EXIT; }} while (0) @@ -167,7 +168,11 @@ extern "C" { BaseStream * stream; int err; - /* errorInit(); -- keep commented out, otherwise errors are printed to stderr for non-pdf files! */ + if (globalParams == NULL) + { + globalParams = new GlobalParams(); + globalParams->setErrQuiet (gTrue); + } obj.initNull(); err = 0; stream = new MemStream( (char*) data, 0, size, &obj); diff --git a/src/plugins/rpm_extractor.c b/src/plugins/rpm_extractor.c @@ -1,6 +1,6 @@ /* This file is part of libextractor. - (C) 2002, 2003, 2008 Vidyut Samanta and Christian Grothoff + (C) 2002, 2003, 2008, 2009 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published