libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 7406fa16dc06ff094b9d2535999d6e695b3c80b3
parent d72714d2581e1d0c54b3bb15b46aa9bd4784768d
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 29 Apr 2006 04:49:06 +0000

integrating wordleaker into ole2 plugin, switching to libgsf

Diffstat:
MChangeLog | 4++++
Mconfigure.ac | 5+++--
Am4/abi-gsf.m4 | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/include/extractor.h | 8++++++++
Msrc/main/extractor.c | 53++++++++++++++++++++++++++++++-----------------------
Msrc/plugins/Makefile.am | 7+++++--
Msrc/plugins/hash/rmd160extractor.c | 9+++++----
Msrc/plugins/ole2/Makefile.am | 7+++----
Rsrc/plugins/wordleaker/README -> src/plugins/ole2/README | 0
Msrc/plugins/ole2/ole2extractor.c | 2401++++++++++++-------------------------------------------------------------------
Dsrc/plugins/wordleaker/Makefile.am | 25-------------------------
Dsrc/plugins/wordleaker/SYMBOLS | 1-
Dsrc/plugins/wordleaker/pole.cpp | 1271-------------------------------------------------------------------------------
Dsrc/plugins/wordleaker/pole.h | 149-------------------------------------------------------------------------------
Dsrc/plugins/wordleaker/wordextractor.cc | 486-------------------------------------------------------------------------------
Dsrc/plugins/wordleaker/wordleaker.cpp | 308-------------------------------------------------------------------------------
Dsrc/plugins/wordleaker/wordleaker.h | 124-------------------------------------------------------------------------------
17 files changed, 494 insertions(+), 4442 deletions(-)

diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,7 @@ +Fri Apr 28 22:26:43 PDT 2006 + Integrated wordleaker into OLE2 plugin. Changed OLE2 plugin to use + libgsf (new dependency!). + Fri Apr 28 16:18:26 PDT 2006 Fixing some i18n issues. Specifically, EXTRACTOR_getKeywordTypeAsString will now never return the translated version of the keyword type diff --git a/configure.ac b/configure.ac @@ -313,6 +313,7 @@ AC_ARG_ENABLE(exiv2, AM_CONDITIONAL(HAVE_EXIV2, test x$exiv2 != x0) AC_DEFINE_UNQUOTED([HAVE_EXIV2], $exiv2, [We use EXIV2]) +ABI_GSF AC_SUBST(CPPFLAGS) AC_SUBST(LDFLAGS) @@ -358,9 +359,9 @@ else AC_MSG_NOTICE([NOTICE: printable plugins enabled]) fi -if test "x$without_glib" = "xtrue" +if test "x$have_gsf" != "xtrue" then - AC_MSG_NOTICE([NOTICE: glib not used, no OLE2 (MS Office) support]) + AC_MSG_NOTICE([NOTICE: libgsf not found, no OLE2 (MS Office) support]) fi if test "x$without_gtk" = "xtrue" diff --git a/m4/abi-gsf.m4 b/m4/abi-gsf.m4 @@ -0,0 +1,78 @@ +# start: abi/ac-helpers/abi-gsf.m4 +# +# Copyright (C) 2005 Christian Neumair +# +# This file is free software; you may copy and/or distribute it with +# or without modifications, as long as this notice is preserved. +# This software is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY, to the extent permitted by law; without even +# the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR +# PURPOSE. +# +# The above license applies to THIS FILE ONLY, the GNUnet code +# itself may be copied and distributed under the terms of the GNU +# GPL, see COPYING for more details +# +# Usage: ABI_GSF + +# Check for gsf + +AC_DEFUN([ABI_GSF], [ + +test_gsf=true +have_gsf=false + +test_gsf_gnome=true +have_gsf_gnome=false + +AC_ARG_ENABLE(gsf,[ --disable-gsf Turn off gsf], [ + if test "x$enableval" = "xno"; then + test_gsf=false + fi +]) + +AC_ARG_ENABLE(gsf-gnome,[ --disable-gnome Turn off gsf-gnome], [ + if test "x$enableval" = "xno"; then + test_gsf_gnome=false + fi +]) + +if test "x$test_gsf" = "xtrue" ; then + PKG_CHECK_MODULES(GSF,[libgsf-1 >= 1.10], [ + have_gsf=true + GSF_CFLAGS="$GSF_CFLAGS -DHAVE_GSF" + ], + [ + have_gsf=false + ]) +fi + +if test "x$have_gsf" = "xtrue" -a "x$test_gsf_gnome" = "xtrue" ; then + PKG_CHECK_MODULES(GSF_GNOME, [libgsf-gnome-1 >= 1.10], [ + have_gsf_gnome=true + GSF_GNOME_CFLAGS="$GSF_GNOME_CFLAGS -DHAVE_GSF_GNOME" + ], + [ + have_gsf_gnome=false + ]) +fi + +AC_SUBST(GSF_CFLAGS) +AC_SUBST(GSF_LIBS) + +AC_SUBST(GSF_GNOME_CFLAGS) +AC_SUBST(GSF_GNOME_LIBS) + +AM_CONDITIONAL(WITH_GSF, test "x$have_gsf" = "xtrue") +AM_CONDITIONAL(WITH_GSF_GNOME, test "x$have_gsf_gnome" = "xtrue") + +if test "x$have_gsf_gnome" = "xtrue" ; then + abi_gsf_message="yes, with GNOME support" +else if test "x$have_gsf" = "xtrue" ; then + abi_gsf_message="yes, without GNOME support" +else + abi_gsf_message="no" +fi +fi + +]) diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -152,6 +152,14 @@ typedef enum { EXTRACTOR_MODIFIED_BY_SOFTWARE = 99, EXTRACTOR_REVISION_HISTORY = 100, EXTRACTOR_LOWERCASE = 101, + EXTRACTOR_COMPANY = 102, + EXTRACTOR_GENERATOR = 103, + EXTRACTOR_CHARACTER_SET = 104, + EXTRACTOR_LINE_COUNT = 105, + EXTRACTOR_PARAGRAPH_COUNT = 106, + EXTRACTOR_EDITING_CYCLES = 107, + EXTRACTOR_SCALE = 108, + EXTRACTOR_MANAGER = 109, } EXTRACTOR_KeywordType; /** diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -41,113 +41,121 @@ * The sources of keywords as strings. */ static const char *keywordTypes[] = { - gettext_noop("unknown"), + gettext_noop("unknown"), /* 0 */ gettext_noop("filename"), gettext_noop("mimetype"), gettext_noop("title"), gettext_noop("author"), - gettext_noop("artist"), + gettext_noop("artist"), /* 5 */ gettext_noop("description"), gettext_noop("comment"), gettext_noop("date"), gettext_noop("publisher"), - gettext_noop("language"), + gettext_noop("language"), /* 10 */ gettext_noop("album"), gettext_noop("genre"), gettext_noop("location"), gettext_noop("version"), - gettext_noop("organization"), + gettext_noop("organization"), /* 15 */ gettext_noop("copyright"), gettext_noop("subject"), gettext_noop("keywords"), gettext_noop("contributor"), - gettext_noop("resource-type"), + gettext_noop("resource-type"), /* 20 */ gettext_noop("format"), gettext_noop("resource-identifier"), gettext_noop("source"), gettext_noop("relation"), - gettext_noop("coverage"), + gettext_noop("coverage"), /* 25 */ gettext_noop("software"), gettext_noop("disclaimer"), gettext_noop("warning"), gettext_noop("translated"), - gettext_noop("creation date"), + gettext_noop("creation date"), /* 30 */ gettext_noop("modification date"), gettext_noop("creator"), gettext_noop("producer"), gettext_noop("page count"), - gettext_noop("page orientation"), + gettext_noop("page orientation"), /* 35 */ gettext_noop("paper size"), gettext_noop("used fonts"), gettext_noop("page order"), gettext_noop("created for"), - gettext_noop("magnification"), + gettext_noop("magnification"), /* 40 */ gettext_noop("release"), gettext_noop("group"), gettext_noop("size"), gettext_noop("summary"), - gettext_noop("packager"), + gettext_noop("packager"), /* 45 */ gettext_noop("vendor"), gettext_noop("license"), gettext_noop("distribution"), gettext_noop("build-host"), - gettext_noop("os"), + gettext_noop("operating system"), /* 50 */ gettext_noop("dependency"), gettext_noop("MD4"), gettext_noop("MD5"), gettext_noop("SHA-0"), - gettext_noop("SHA-1"), + gettext_noop("SHA-1"), /* 55 */ gettext_noop("RipeMD160"), gettext_noop("resolution"), gettext_noop("category"), gettext_noop("book title"), - gettext_noop("priority"), + gettext_noop("priority"), /* 60 */ gettext_noop("conflicts"), gettext_noop("replaces"), gettext_noop("provides"), gettext_noop("conductor"), - gettext_noop("interpreter"), + gettext_noop("interpreter"), /* 65 */ gettext_noop("owner"), gettext_noop("lyrics"), gettext_noop("media type"), gettext_noop("contact"), - gettext_noop("binary thumbnail data"), + gettext_noop("binary thumbnail data"), /* 70 */ gettext_noop("publication date"), gettext_noop("camera make"), gettext_noop("camera model"), gettext_noop("exposure"), - gettext_noop("aperture"), + gettext_noop("aperture"), /* 75 */ gettext_noop("exposure bias"), gettext_noop("flash"), gettext_noop("flash bias"), gettext_noop("focal length"), - gettext_noop("focal length (35mm equivalent)"), + gettext_noop("focal length (35mm equivalent)"), /* 80 */ gettext_noop("iso speed"), gettext_noop("exposure mode"), gettext_noop("metering mode"), gettext_noop("macro mode"), - gettext_noop("image quality"), + gettext_noop("image quality"), /* 85 */ gettext_noop("white balance"), gettext_noop("orientation"), gettext_noop("template"), gettext_noop("split"), - gettext_noop("product version"), + gettext_noop("product version"), /* 90 */ gettext_noop("last saved by"), gettext_noop("last printed"), gettext_noop("word count"), gettext_noop("character count"), - gettext_noop("total editing time"), + gettext_noop("total editing time"), /* 95 */ gettext_noop("thumbnails"), gettext_noop("security"), gettext_noop("created by software"), gettext_noop("modified by software"), - gettext_noop("revision history"), + gettext_noop("revision history"), /* 100 */ gettext_noop("lower case conversion"), + gettext_noop("company"), + gettext_noop("generator"), + gettext_noop("character set"), + gettext_noop("line count"), /* 105 */ + gettext_noop("paragraph count"), + gettext_noop("editing cycles"), + gettext_noop("scale"), + gettext_noop("manager"), NULL, }; /* the number of keyword types (for bounds-checking) */ -#define HIGHEST_TYPE_NUMBER 102 +#define HIGHEST_TYPE_NUMBER 110 #ifdef HAVE_LIBOGG #if HAVE_VORBIS @@ -211,7 +219,6 @@ libextractor_riff:\ libextractor_mpeg:\ libextractor_elf:\ libextractor_oo:\ -libextractor_word:\ libextractor_asf" #define DEFAULT_LIBRARIES EXSO OLESO OGGSO QTSO DEFSO diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -1,7 +1,9 @@ include Makefile-plugins.am if HAVE_GLIB -oledir=ole2 +if WITH_GSF + oledir=ole2 +endif if HAVE_GTK thumbdir=thumbnail endif @@ -15,6 +17,7 @@ if HAVE_EXIV2 exiv2dir=exiv2 endif + if HAVE_XPDF xpdfdir=pdf else @@ -23,7 +26,7 @@ endif # toggle for development # SUBDIRS = . -SUBDIRS = . $(oodir) $(printdir) hash $(oledir) rpm $(xpdfdir) $(thumbdir) $(exiv2dir) wordleaker +SUBDIRS = . $(oodir) $(printdir) hash $(oledir) rpm $(xpdfdir) $(thumbdir) $(exiv2dir) if HAVE_VORBISFILE diff --git a/src/plugins/hash/rmd160extractor.c b/src/plugins/hash/rmd160extractor.c @@ -619,10 +619,11 @@ static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordList *oldhead, #define rmd160_new() rmd160_copy(NULL,NULL) -struct EXTRACTOR_Keywords * libextractor_hash_rmd160_extract(const char * filename, - char * data, - size_t size, - struct EXTRACTOR_Keywords * prev) { +struct EXTRACTOR_Keywords * +libextractor_hash_rmd160_extract(const char * filename, + const unsigned char * data, + size_t size, + struct EXTRACTOR_Keywords * prev) { unsigned char bin_buffer[MAX_DIGEST_BIN_BYTES]; char hash[8 * MAX_DIGEST_BIN_BYTES]; char buf[16]; diff --git a/src/plugins/ole2/Makefile.am b/src/plugins/ole2/Makefile.am @@ -4,12 +4,11 @@ include ../Makefile-plugins.am plugin_LTLIBRARIES = \ libextractor_ole2.la -AM_CFLAGS = $(GLIB_CFLAGS) - libextractor_ole2_la_CFLAGS = \ - $(GLIB_CFLAGS) + $(GSF_CFLAGS) libextractor_ole2_la_LIBADD = \ - $(LIBADD) $(GLIB_LIBS) -lgobject-2.0 \ + $(LIBADD) $(GSF_LIBS) \ + $(top_builddir)/src/plugins/libconvert.la \ $(top_builddir)/src/main/libextractor.la libextractor_ole2_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) diff --git a/src/plugins/wordleaker/README b/src/plugins/ole2/README diff --git a/src/plugins/ole2/ole2extractor.c b/src/plugins/ole2/ole2extractor.c @@ -1,6 +1,6 @@ /* This file is part of libextractor. - (C) 2004,2005 Vidyut Samanta and Christian Grothoff + (C) 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff libextractor is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published @@ -17,1217 +17,30 @@ Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - Most of the code in this directory comes from - libgsf 1.10.1 (Licensed under GPL/LGPL). - - libgsf -- The G Structured File Library + This code makes extensive use of libgsf + -- the Gnome Structured File Library Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org) + Part of this code was borrowed from wordleaker.cpp. See also + the README file in this directory. */ #include "platform.h" #include "extractor.h" +#include "../convert.h" + #include <glib-object.h> #include <string.h> #include <stdio.h> #include <ctype.h> -#define DEBUG_OLE2 0 - -#if DEBUG_OLE2 -#define d(code) do { code } while (0) -#define warning printf -#else -#define d(code) - static void warning(const char * format, ...) {} -#endif - -#undef g_return_val_if_fail -#define g_return_val_if_fail(a,b) if (! (a)) return (b); - -/* *********************** formerly gsf-input.c ************* */ - -typedef struct GsfInput { - off_t size; - off_t cur_offset; - char * name; - const unsigned char * buf; - int needs_free; -} GsfInput; - - -static void -gsf_input_init (GsfInput * input) -{ - input->size = 0; - input->cur_offset = 0; - input->name = NULL; - input->buf = NULL; -} - -/** - * gsf_input_memory_new: - * @buf: The input bytes - * @length: The length of @buf - * @needs_free: Whether you want this memory to be free'd at object destruction - * - * Returns: A new #GsfInputMemory - */ -static GsfInput * -gsf_input_new (const unsigned char * buf, - off_t length, - int needs_free) -{ - GsfInput *mem = malloc(sizeof(GsfInput)); - if (mem == NULL) - return NULL; - gsf_input_init(mem); - mem->buf = buf; - mem->size = length; - mem->needs_free = needs_free; - return mem; -} - -static void -gsf_input_finalize (GsfInput * input) -{ - if (input->name != NULL) { - free (input->name); - input->name = NULL; - } - if ( (input->buf) && input->needs_free) - free((void*) input->buf); - free(input); -} - -/** - * gsf_input_set_name : - * @input : - * @name : - * - * protected. - * - * Returns : TRUE if the assignment was ok. - **/ -static int -gsf_input_set_name (GsfInput *input, char const *name) -{ - char *buf; - - g_return_val_if_fail (input != NULL, 0); - - buf = strdup (name); - if (input->name != NULL) - free (input->name); - input->name = buf; - return 1; -} - - - -static GsfInput * -gsf_input_dup (GsfInput *src) -{ - GsfInput * dst = malloc(sizeof(GsfInput)); - if (dst == NULL) - return NULL; - gsf_input_init(dst); - dst->buf = src->buf; - dst->needs_free = 0; - dst->size = src->size; - if (src->name != NULL) - gsf_input_set_name (dst, src->name); - dst->cur_offset = src->cur_offset; - return dst; -} - -static const unsigned char * -gsf_input_read (GsfInput * mem, size_t num_bytes, unsigned char * optional_buffer) -{ - const unsigned char *src = mem->buf; - if (src == NULL) - return NULL; - if (optional_buffer) { - memcpy (optional_buffer, src + mem->cur_offset, num_bytes); - mem->cur_offset += num_bytes; - - return optional_buffer; - } else { - const unsigned char * ret = src + mem->cur_offset; - mem->cur_offset += num_bytes; - return ret; - } -} - -/** - * gsf_input_size : - * @input : The input - * - * Looks up and caches the number of bytes in the input - * - * Returns : the size or -1 on error - **/ -static off_t -gsf_input_size (GsfInput *input) -{ - g_return_val_if_fail (input != NULL, -1); - return input->size; -} - -/** - * gsf_input_seek : - * @input : - * @offset : - * @whence : - * - * Returns TRUE on error. - **/ -static int -gsf_input_seek (GsfInput *input, off_t offset, int whence) -{ - off_t pos = offset; - - g_return_val_if_fail (input != NULL, 1); - - switch (whence) { - case SEEK_SET : break; - case SEEK_CUR : pos += input->cur_offset; break; - case SEEK_END : pos += input->size; break; - default : return 1; - } - - if (pos < 0 || pos > input->size) - return 1; - - /* - * If we go nowhere, just return. This in particular handles null - * seeks for streams with no seek method. - */ - if (pos == input->cur_offset) - return 0; - - input->cur_offset = pos; - return 0; -} - - - - -/* ******************** formerly gsf-utils.c **************** */ - - -/* Do this the ugly way so that we don't have to worry about alignment */ -#define GSF_LE_GET_GUINT8(p) (*(guint8 const *)(p)) -#define GSF_LE_GET_GUINT16(p) \ - (guint16)((((guint8 const *)(p))[0] << 0) | \ - (((guint8 const *)(p))[1] << 8)) -#define GSF_LE_GET_GUINT32(p) \ - (guint32)((((guint8 const *)(p))[0] << 0) | \ - (((guint8 const *)(p))[1] << 8) | \ - (((guint8 const *)(p))[2] << 16) | \ - (((guint8 const *)(p))[3] << 24)) - -#define GSF_LE_GET_GUINT64(p) (gsf_le_get_guint64 (p)) -#define GSF_LE_GET_GINT64(p) ((gint64)GSF_LE_GET_GUINT64(p)) -#define GSF_LE_GET_GINT8(p) ((gint8)GSF_LE_GET_GUINT8(p)) -#define GSF_LE_GET_GINT16(p) ((gint16)GSF_LE_GET_GUINT16(p)) -#define GSF_LE_GET_GINT32(p) ((gint32)GSF_LE_GET_GUINT32(p)) -#define GSF_LE_GET_FLOAT(p) (gsf_le_get_float (p)) -#define GSF_LE_GET_DOUBLE(p) (gsf_le_get_double (p)) -#define GSF_LE_SET_GUINT8(p, dat) \ - (*((guint8 *)(p)) = ((dat) & 0xff)) -#define GSF_LE_SET_GUINT16(p, dat) \ - ((*((guint8 *)(p) + 0) = ((dat) & 0xff)),\ - (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff)) -#define GSF_LE_SET_GUINT32(p, dat) \ - ((*((guint8 *)(p) + 0) = ((dat)) & 0xff), \ - (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff), \ - (*((guint8 *)(p) + 2) = ((dat) >> 16) & 0xff), \ - (*((guint8 *)(p) + 3) = ((dat) >> 24) & 0xff)) -#define GSF_LE_SET_GINT8(p,dat) GSF_LE_SET_GUINT8((p),(dat)) -#define GSF_LE_SET_GINT16(p,dat) GSF_LE_SET_GUINT16((p),(dat)) -#define GSF_LE_SET_GINT32(p,dat) GSF_LE_SET_GUINT32((p),(dat)) - - -/* - * Glib gets this wrong, really. ARM's floating point format is a weird - * mixture. - */ -#define G_ARMFLOAT_ENDIAN 56781234 -#if defined(__arm__) && !defined(__vfp__) && (G_BYTE_ORDER == G_LITTLE_ENDIAN) -#define G_FLOAT_BYTE_ORDER G_ARMFLOAT_ENDIAN -#else -#define G_FLOAT_BYTE_ORDER G_BYTE_ORDER -#endif - -static guint64 -gsf_le_get_guint64 (void const *p) -{ -#if G_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (guint64) == 8) { - guint64 li; - int i; - guint8 *t = (guint8 *)&li; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (li); - - for (i = 0; i < sd; i++) - t[i] = p2[sd - 1 - i]; - - return li; - } else { - g_error ("Big endian machine, but weird size of guint64"); - } -#elif G_BYTE_ORDER == G_LITTLE_ENDIAN - if (sizeof (guint64) == 8) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - guint64 data; - memcpy (&data, p, sizeof (data)); - return data; - } else { - g_error ("Little endian machine, but weird size of guint64"); - } -#else -#error "Byte order not recognised -- out of luck" -#endif -} - -static float -gsf_le_get_float (void const *p) -{ -#if G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (float) == 4) { - float f; - int i; - guint8 *t = (guint8 *)&f; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (f); - - for (i = 0; i < sd; i++) - t[i] = p2[sd - 1 - i]; - - return f; - } else { - g_error ("Big endian machine, but weird size of floats"); - } -#elif (G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN) || (G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN) - if (sizeof (float) == 4) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - float data; - memcpy (&data, p, sizeof (data)); - return data; - } else { - g_error ("Little endian machine, but weird size of floats"); - } -#else -#error "Floating-point byte order not recognised -- out of luck" -#endif -} - -static double -gsf_le_get_double (void const *p) -{ -#if G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN - double data; - memcpy ((char *)&data + 4, p, 4); - memcpy ((char *)&data, (const char *)p + 4, 4); - return data; -#elif G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (double) == 8) { - double d; - int i; - guint8 *t = (guint8 *)&d; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (d); - - for (i = 0; i < sd; i++) - t[i] = p2[sd - 1 - i]; - - return d; - } else { - g_error ("Big endian machine, but weird size of doubles"); - } -#elif G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN - if (sizeof (double) == 8) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - double data; - memcpy (&data, p, sizeof (data)); - return data; - } else { - g_error ("Little endian machine, but weird size of doubles"); - } -#else -#error "Floating-point byte order not recognised -- out of luck" -#endif -} - -/** - * gsf_iconv_close : A utility wrapper to safely close an iconv handle - * @handle : - **/ -static void -gsf_iconv_close (GIConv handle) -{ - if (handle != NULL && handle != ((GIConv)-1)) - g_iconv_close (handle); -} - - -/* ***************************** formerly gsf-infile-msole.c ********************* */ - -#define OLE_HEADER_SIZE 0x200 /* independent of big block size size */ -#define OLE_HEADER_SIGNATURE 0x00 -#define OLE_HEADER_CLSID 0x08 /* See ReadClassStg */ -#define OLE_HEADER_MINOR_VER 0x18 /* 0x33 and 0x3e have been seen */ -#define OLE_HEADER_MAJOR_VER 0x1a /* 0x3 been seen in wild */ -#define OLE_HEADER_BYTE_ORDER 0x1c /* 0xfe 0xff == Intel Little Endian */ -#define OLE_HEADER_BB_SHIFT 0x1e -#define OLE_HEADER_SB_SHIFT 0x20 -/* 0x22..0x27 reserved == 0 */ -#define OLE_HEADER_CSECTDIR 0x28 -#define OLE_HEADER_NUM_BAT 0x2c -#define OLE_HEADER_DIRENT_START 0x30 -/* 0x34..0x37 transacting signature must be 0 */ -#define OLE_HEADER_THRESHOLD 0x38 -#define OLE_HEADER_SBAT_START 0x3c -#define OLE_HEADER_NUM_SBAT 0x40 -#define OLE_HEADER_METABAT_BLOCK 0x44 -#define OLE_HEADER_NUM_METABAT 0x48 -#define OLE_HEADER_START_BAT 0x4c -#define BAT_INDEX_SIZE 4 -#define OLE_HEADER_METABAT_SIZE ((OLE_HEADER_SIZE - OLE_HEADER_START_BAT) / BAT_INDEX_SIZE) - -#define DIRENT_MAX_NAME_SIZE 0x40 -#define DIRENT_DETAILS_SIZE 0x40 -#define DIRENT_SIZE (DIRENT_MAX_NAME_SIZE + DIRENT_DETAILS_SIZE) -#define DIRENT_NAME_LEN 0x40 /* length in bytes incl 0 terminator */ -#define DIRENT_TYPE 0x42 -#define DIRENT_COLOUR 0x43 -#define DIRENT_PREV 0x44 -#define DIRENT_NEXT 0x48 -#define DIRENT_CHILD 0x4c -#define DIRENT_CLSID 0x50 /* only for dirs */ -#define DIRENT_USERFLAGS 0x60 /* only for dirs */ -#define DIRENT_CREATE_TIME 0x64 /* for files */ -#define DIRENT_MODIFY_TIME 0x6c /* for files */ -#define DIRENT_FIRSTBLOCK 0x74 -#define DIRENT_FILE_SIZE 0x78 -/* 0x7c..0x7f reserved == 0 */ - -#define DIRENT_TYPE_INVALID 0 -#define DIRENT_TYPE_DIR 1 -#define DIRENT_TYPE_FILE 2 -#define DIRENT_TYPE_LOCKBYTES 3 /* ? */ -#define DIRENT_TYPE_PROPERTY 4 /* ? */ -#define DIRENT_TYPE_ROOTDIR 5 -#define DIRENT_MAGIC_END 0xffffffff - -/* flags in the block allocation list to denote special blocks */ -#define BAT_MAGIC_UNUSED 0xffffffff /* -1 */ -#define BAT_MAGIC_END_OF_CHAIN 0xfffffffe /* -2 */ -#define BAT_MAGIC_BAT 0xfffffffd /* a bat block, -3 */ -#define BAT_MAGIC_METABAT 0xfffffffc /* a metabat block -4 */ - - - - -typedef struct { - guint32 *block; - guint32 num_blocks; -} MSOleBAT; - -typedef struct { - char *name; - char *collation_name; - int index; - size_t size; - gboolean use_sb; - guint32 first_block; - gboolean is_directory; - GList *children; - unsigned char clsid[16]; /* 16 byte GUID used by some apps */ -} MSOleDirent; - -typedef struct { - struct { - MSOleBAT bat; - unsigned shift; - unsigned filter; - size_t size; - } bb, sb; - off_t max_block; - guint32 threshold; /* transition between small and big blocks */ - guint32 sbat_start, num_sbat; - - MSOleDirent *root_dir; - struct GsfInput *sb_file; - - int ref_count; -} MSOleInfo; - -typedef struct GsfInfileMSOle { - off_t size; - off_t cur_offset; - struct GsfInput *input; - MSOleInfo *info; - MSOleDirent *dirent; - MSOleBAT bat; - off_t cur_block; - - struct { - guint8 *buf; - size_t buf_size; - } stream; -} GsfInfileMSOle; - -/* utility macros */ -#define OLE_BIG_BLOCK(index, ole) ((index) >> ole->info->bb.shift) - -static struct GsfInput *gsf_infile_msole_new_child (GsfInfileMSOle *parent, - MSOleDirent *dirent); - -/** - * ole_get_block : - * @ole : the infile - * @block : - * @buffer : optionally NULL - * - * Read a block of data from the underlying input. - * Be really anal. - **/ -static const guint8 * -ole_get_block (const GsfInfileMSOle *ole, guint32 block, guint8 *buffer) -{ - g_return_val_if_fail (block < ole->info->max_block, NULL); - - /* OLE_HEADER_SIZE is fixed at 512, but the sector containing the - * header is padded out to bb.size (sector size) when bb.size > 512. */ - if (gsf_input_seek (ole->input, - (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (block << ole->info->bb.shift)), - SEEK_SET) < 0) - return NULL; - - return gsf_input_read (ole->input, ole->info->bb.size, buffer); -} - -/** - * ole_make_bat : - * @metabat : a meta bat to connect to the raw blocks (small or large) - * @size_guess : An optional guess as to how many blocks are in the file - * @block : The first block in the list. - * @res : where to store the result. - * - * Walk the linked list of the supplied block allocation table and build up a - * table for the list starting in @block. - * - * Returns TRUE on error. - */ -static gboolean -ole_make_bat (MSOleBAT const *metabat, size_t size_guess, guint32 block, - MSOleBAT *res) -{ - /* NOTE : Only use size as a suggestion, sometimes it is wrong */ - GArray *bat = g_array_sized_new (FALSE, FALSE, - sizeof (guint32), size_guess); - - guint8 *used = (guint8*)g_alloca (1 + metabat->num_blocks / 8); - memset (used, 0, 1 + metabat->num_blocks / 8); - - if (block < metabat->num_blocks) - do { - /* Catch cycles in the bat list */ - g_return_val_if_fail (0 == (used[block/8] & (1 << (block & 0x7))), TRUE); - used[block/8] |= 1 << (block & 0x7); - - g_array_append_val (bat, block); - block = metabat->block [block]; - } while (block < metabat->num_blocks); - - res->block = NULL; - - res->num_blocks = bat->len; - res->block = (guint32 *) (gpointer) g_array_free (bat, FALSE); - - if (block != BAT_MAGIC_END_OF_CHAIN) { -#if 0 - g_warning ("This OLE2 file is invalid.\n" - "The Block Allocation Table for one of the streams had %x instead of a terminator (%x).\n" - "We might still be able to extract some data, but you'll want to check the file.", - block, BAT_MAGIC_END_OF_CHAIN); -#endif - } - - return FALSE; -} - -static void -ols_bat_release (MSOleBAT *bat) -{ - if (bat->block != NULL) { - g_free (bat->block); - bat->block = NULL; - bat->num_blocks = 0; - } -} - -/** - * ole_info_read_metabat : - * @ole : - * @bats : - * - * A small utility routine to read a set of references to bat blocks - * either from the OLE header, or a meta-bat block. - * - * Returns a pointer to the element after the last position filled. - **/ -static guint32 * -ole_info_read_metabat (GsfInfileMSOle *ole, guint32 *bats, guint32 max, - guint32 const *metabat, guint32 const *metabat_end) -{ - guint8 const *bat, *end; - - for (; metabat < metabat_end; metabat++) { - bat = ole_get_block (ole, *metabat, NULL); - if (bat == NULL) - return NULL; - end = bat + ole->info->bb.size; - for ( ; bat < end ; bat += BAT_INDEX_SIZE, bats++) { - *bats = GSF_LE_GET_GUINT32 (bat); - g_return_val_if_fail (*bats < max || - *bats >= BAT_MAGIC_METABAT, NULL); - } - } - return bats; -} - -/** - * gsf_ole_get_guint32s : - * @dst : - * @src : - * @num_bytes : - * - * Copy some some raw data into an array of guint32. - **/ -static void -gsf_ole_get_guint32s (guint32 *dst, guint8 const *src, int num_bytes) -{ - for (; (num_bytes -= BAT_INDEX_SIZE) >= 0 ; src += BAT_INDEX_SIZE) - *dst++ = GSF_LE_GET_GUINT32 (src); -} - -static struct GsfInput * -ole_info_get_sb_file (GsfInfileMSOle *parent) -{ - MSOleBAT meta_sbat; - - if (parent->info->sb_file != NULL) - return parent->info->sb_file; - - parent->info->sb_file = gsf_infile_msole_new_child (parent, - parent->info->root_dir); - - if (NULL == parent->info->sb_file) - return NULL; - - g_return_val_if_fail (parent->info->sb.bat.block == NULL, NULL); - - if (ole_make_bat (&parent->info->bb.bat, - parent->info->num_sbat, - parent->info->sbat_start, - &meta_sbat)) { - return NULL; - } - - parent->info->sb.bat.num_blocks = meta_sbat.num_blocks * (parent->info->bb.size / BAT_INDEX_SIZE); - parent->info->sb.bat.block = g_new0 (guint32, parent->info->sb.bat.num_blocks); - ole_info_read_metabat (parent, parent->info->sb.bat.block, - parent->info->sb.bat.num_blocks, - meta_sbat.block, meta_sbat.block + meta_sbat.num_blocks); - ols_bat_release (&meta_sbat); - - return parent->info->sb_file; -} - -static gint -ole_dirent_cmp (const MSOleDirent *a, const MSOleDirent *b) -{ - g_return_val_if_fail (a, 0); - g_return_val_if_fail (b, 0); - - g_return_val_if_fail (a->collation_name, 0); - g_return_val_if_fail (b->collation_name, 0); - - return strcmp (b->collation_name, a->collation_name); -} - -/** - * ole_dirent_new : - * @ole : - * @entry : - * @parent : optional - * - * Parse dirent number @entry and recursively handle its siblings and children. - **/ -static MSOleDirent * -ole_dirent_new (GsfInfileMSOle *ole, guint32 entry, MSOleDirent *parent) -{ - MSOleDirent *dirent; - guint32 block, next, prev, child, size; - guint8 const *data; - guint8 type; - guint16 name_len; - - if (entry >= DIRENT_MAGIC_END) - return NULL; - - block = OLE_BIG_BLOCK (entry * DIRENT_SIZE, ole); - - g_return_val_if_fail (block < ole->bat.num_blocks, NULL); - data = ole_get_block (ole, ole->bat.block [block], NULL); - if (data == NULL) - return NULL; - data += (DIRENT_SIZE * entry) % ole->info->bb.size; - - type = GSF_LE_GET_GUINT8 (data + DIRENT_TYPE); - if (type != DIRENT_TYPE_DIR && - type != DIRENT_TYPE_FILE && - type != DIRENT_TYPE_ROOTDIR) { -#if 0 - g_warning ("Unknown stream type 0x%x", type); -#endif - return NULL; - } - - /* It looks like directory (and root directory) sizes are sometimes bogus */ - size = GSF_LE_GET_GUINT32 (data + DIRENT_FILE_SIZE); - g_return_val_if_fail (type == DIRENT_TYPE_DIR || type == DIRENT_TYPE_ROOTDIR || - size <= (guint32)gsf_input_size(ole->input), NULL); - - dirent = g_new0 (MSOleDirent, 1); - dirent->index = entry; - dirent->size = size; - /* Store the class id which is 16 byte identifier used by some apps */ - memcpy(dirent->clsid, data + DIRENT_CLSID, sizeof(dirent->clsid)); - - /* root dir is always big block */ - dirent->use_sb = parent && (size < ole->info->threshold); - dirent->first_block = (GSF_LE_GET_GUINT32 (data + DIRENT_FIRSTBLOCK)); - dirent->is_directory = (type != DIRENT_TYPE_FILE); - dirent->children = NULL; - prev = GSF_LE_GET_GUINT32 (data + DIRENT_PREV); - next = GSF_LE_GET_GUINT32 (data + DIRENT_NEXT); - child = GSF_LE_GET_GUINT32 (data + DIRENT_CHILD); - name_len = GSF_LE_GET_GUINT16 (data + DIRENT_NAME_LEN); - dirent->name = NULL; - if (0 < name_len && name_len <= DIRENT_MAX_NAME_SIZE) { - gunichar2 uni_name [DIRENT_MAX_NAME_SIZE+1]; - gchar const *end; - int i; - - /* !#%!@$#^ - * Sometimes, rarely, people store the stream name as ascii - * rather than utf16. Do a validation first just in case. - */ - if (!g_utf8_validate ((const char*) data, -1, &end) || - ((guint8 const *)end - data + 1) != name_len) { - /* be wary about endianness */ - for (i = 0 ; i < name_len ; i += 2) - uni_name [i/2] = GSF_LE_GET_GUINT16 (data + i); - uni_name [i/2] = 0; - - dirent->name = g_utf16_to_utf8 (uni_name, -1, NULL, NULL, NULL); - } else - dirent->name = g_strndup ((gchar *)data, (gsize)((guint8 const *)end - data + 1)); - } - /* be really anal in the face of screwups */ - if (dirent->name == NULL) - dirent->name = g_strdup (""); - dirent->collation_name = g_utf8_collate_key (dirent->name, -1); - - if (parent != NULL) - parent->children = g_list_insert_sorted (parent->children, - dirent, (GCompareFunc)ole_dirent_cmp); - - /* NOTE : These links are a tree, not a linked list */ - if (prev != entry) - ole_dirent_new (ole, prev, parent); - if (next != entry) - ole_dirent_new (ole, next, parent); - - if (dirent->is_directory) - ole_dirent_new (ole, child, dirent); - return dirent; -} - -static void -ole_dirent_free (MSOleDirent *dirent) -{ - GList *tmp; - g_return_if_fail (dirent != NULL); - - g_free (dirent->name); - g_free (dirent->collation_name); - - for (tmp = dirent->children; tmp; tmp = tmp->next) - ole_dirent_free ((MSOleDirent *)tmp->data); - g_list_free (dirent->children); - g_free (dirent); -} - -/*****************************************************************************/ - -static void -ole_info_unref (MSOleInfo *info) -{ - if (info->ref_count-- != 1) - return; - - ols_bat_release (&info->bb.bat); - ols_bat_release (&info->sb.bat); - if (info->root_dir != NULL) { - ole_dirent_free (info->root_dir); - info->root_dir = NULL; - } - if (info->sb_file != NULL) { - gsf_input_finalize(info->sb_file); - info->sb_file = NULL; - } - g_free (info); -} - -static MSOleInfo * -ole_info_ref (MSOleInfo *info) -{ - info->ref_count++; - return info; -} - -static void -gsf_infile_msole_init (GsfInfileMSOle * ole) -{ - ole->cur_offset = 0; - ole->size = 0; - ole->input = NULL; - ole->info = NULL; - ole->bat.block = NULL; - ole->bat.num_blocks = 0; - ole->cur_block = BAT_MAGIC_UNUSED; - ole->stream.buf = NULL; - ole->stream.buf_size = 0; -} - -static void -gsf_infile_msole_finalize (GsfInfileMSOle * ole) -{ - if (ole->input != NULL) { - gsf_input_finalize(ole->input); - ole->input = NULL; - } - if (ole->info != NULL) { - ole_info_unref (ole->info); - ole->info = NULL; - } - ols_bat_release (&ole->bat); - - g_free (ole->stream.buf); - free(ole); -} - -/** - * ole_dup : - * @src : - * - * Utility routine to _partially_ replicate a file. It does NOT copy the bat - * blocks, or init the dirent. - * - * Return value: the partial duplicate. - **/ -static GsfInfileMSOle * -ole_dup (GsfInfileMSOle const * src) -{ - GsfInfileMSOle *dst; - struct GsfInput *input; - - g_return_val_if_fail (src != NULL, NULL); - - dst = malloc(sizeof(GsfInfileMSOle)); - if (dst == NULL) - return NULL; - gsf_infile_msole_init(dst); - input = gsf_input_dup (src->input); - if (input == NULL) { - gsf_infile_msole_finalize(dst); - return NULL; - } - dst->input = input; - dst->info = ole_info_ref (src->info); - - /* buf and buf_size are initialized to NULL */ - - return dst; -} - -/** - * ole_init_info : - * @ole : - * - * Read an OLE header and do some sanity checking - * along the way. - * - * Return value: TRUE on error - **/ -static gboolean -ole_init_info (GsfInfileMSOle *ole) -{ - static guint8 const signature[] = - { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 }; - guint8 const *header, *tmp; - guint32 *metabat = NULL; - MSOleInfo *info; - guint32 bb_shift, sb_shift, num_bat, num_metabat, last, dirent_start; - guint32 metabat_block, *ptr; - - /* check the header */ - if (gsf_input_seek (ole->input, (off_t) 0, SEEK_SET) || - NULL == (header = gsf_input_read (ole->input, OLE_HEADER_SIZE, NULL)) || - 0 != memcmp (header, signature, sizeof (signature))) { - return TRUE; - } - - bb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_BB_SHIFT); - sb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_SB_SHIFT); - num_bat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_BAT); - dirent_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_DIRENT_START); - metabat_block = GSF_LE_GET_GUINT32 (header + OLE_HEADER_METABAT_BLOCK); - num_metabat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_METABAT); - - /* Some sanity checks - * 1) There should always be at least 1 BAT block - * 2) It makes no sense to have a block larger than 2^31 for now. - * Maybe relax this later, but not much. - */ - if (6 > bb_shift || bb_shift >= 31 || sb_shift > bb_shift) { - return TRUE; - } - - info = g_new0 (MSOleInfo, 1); - ole->info = info; - - info->ref_count = 1; - info->bb.shift = bb_shift; - info->bb.size = 1 << info->bb.shift; - info->bb.filter = info->bb.size - 1; - info->sb.shift = sb_shift; - info->sb.size = 1 << info->sb.shift; - info->sb.filter = info->sb.size - 1; - info->threshold = GSF_LE_GET_GUINT32 (header + OLE_HEADER_THRESHOLD); - info->sbat_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_SBAT_START); - info->num_sbat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_SBAT); - info->max_block = (gsf_input_size (ole->input) - OLE_HEADER_SIZE) / info->bb.size; - info->sb_file = NULL; - - if (info->num_sbat == 0 && info->sbat_start != BAT_MAGIC_END_OF_CHAIN) { -#if 0 - g_warning ("There is are not supposed to be any blocks in the small block allocation table, yet there is a link to some. Ignoring it."); -#endif - } - - /* very rough heuristic, just in case */ - if (num_bat < info->max_block) { - info->bb.bat.num_blocks = num_bat * (info->bb.size / BAT_INDEX_SIZE); - info->bb.bat.block = g_new0 (guint32, info->bb.bat.num_blocks); - - metabat = (guint32 *)g_alloca (MAX (info->bb.size, OLE_HEADER_SIZE)); - - /* Reading the elements invalidates this memory, make copy */ - gsf_ole_get_guint32s (metabat, header + OLE_HEADER_START_BAT, - OLE_HEADER_SIZE - OLE_HEADER_START_BAT); - last = num_bat; - if (last > OLE_HEADER_METABAT_SIZE) - last = OLE_HEADER_METABAT_SIZE; - - ptr = ole_info_read_metabat (ole, info->bb.bat.block, - info->bb.bat.num_blocks, metabat, metabat + last); - num_bat -= last; - } else - ptr = NULL; - - last = (info->bb.size - BAT_INDEX_SIZE) / BAT_INDEX_SIZE; - while (ptr != NULL && num_metabat-- > 0) { - tmp = ole_get_block (ole, metabat_block, NULL); - if (tmp == NULL) { - ptr = NULL; - break; - } - - /* Reading the elements invalidates this memory, make copy */ - gsf_ole_get_guint32s (metabat, tmp, (int)info->bb.size); - - if (num_metabat == 0) { - if (last < num_bat) { - /* there should be less that a full metabat block - * remaining */ - ptr = NULL; - break; - } - last = num_bat; - } else if (num_metabat > 0) { - metabat_block = metabat[last]; - num_bat -= last; - } - - ptr = ole_info_read_metabat (ole, ptr, - info->bb.bat.num_blocks, metabat, metabat + last); - } - - if (ptr == NULL) { - return TRUE; - } - - /* Read the directory's bat, we do not know the size */ - if (ole_make_bat (&info->bb.bat, 0, dirent_start, &ole->bat)) { - return TRUE; - } - - /* Read the directory */ - ole->dirent = info->root_dir = ole_dirent_new (ole, 0, NULL); - if (ole->dirent == NULL) { - return TRUE; - } - - return FALSE; -} - -static guint8 const * -gsf_infile_msole_read (GsfInfileMSOle *ole, size_t num_bytes, guint8 *buffer) -{ - off_t first_block, last_block, raw_block, offset, i; - guint8 const *data; - guint8 *ptr; - size_t count; - - /* small block files are preload */ - if (ole->dirent != NULL && ole->dirent->use_sb) { - if (buffer != NULL) { - memcpy (buffer, ole->stream.buf + ole->cur_offset, num_bytes); - ole->cur_offset += num_bytes; - return buffer; - } - data = ole->stream.buf + ole->cur_offset; - ole->cur_offset += num_bytes; - return data; - } - - /* GsfInput guarantees that num_bytes > 0 */ - first_block = OLE_BIG_BLOCK (ole->cur_offset, ole); - last_block = OLE_BIG_BLOCK (ole->cur_offset + num_bytes - 1, ole); - offset = ole->cur_offset & ole->info->bb.filter; - - /* optimization : are all the raw blocks contiguous */ - i = first_block; - raw_block = ole->bat.block [i]; - while (++i <= last_block && ++raw_block == ole->bat.block [i]) - ; - if (i > last_block) { - /* optimization don't seek if we don't need to */ - if (ole->cur_block != first_block) { - if (gsf_input_seek (ole->input, - (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (ole->bat.block [first_block] << ole->info->bb.shift) + offset), - SEEK_SET) < 0) - return NULL; - } - ole->cur_block = last_block; - return gsf_input_read (ole->input, - num_bytes, - (unsigned char*) buffer); - } - - /* damn, we need to copy it block by block */ - if (buffer == NULL) { - if (ole->stream.buf_size < num_bytes) { - if (ole->stream.buf != NULL) - g_free (ole->stream.buf); - ole->stream.buf_size = num_bytes; - ole->stream.buf = g_new (guint8, num_bytes); - } - buffer = ole->stream.buf; - } - - ptr = buffer; - for (i = first_block ; i <= last_block ; i++ , ptr += count, num_bytes -= count) { - count = ole->info->bb.size - offset; - if (count > num_bytes) - count = num_bytes; - data = ole_get_block (ole, ole->bat.block [i], NULL); - if (data == NULL) - return NULL; - - /* TODO : this could be optimized to avoid the copy */ - memcpy (ptr, data + offset, count); - offset = 0; - } - ole->cur_block = BAT_MAGIC_UNUSED; - ole->cur_offset += num_bytes; - return buffer; -} - -static struct GsfInput * -gsf_infile_msole_new_child (GsfInfileMSOle *parent, - MSOleDirent *dirent) -{ - GsfInfileMSOle * child; - MSOleInfo *info; - MSOleBAT const *metabat; - struct GsfInput *sb_file = NULL; - size_t size_guess; - char * buf; - - - if ( (dirent->index != 0) && - (dirent->is_directory) ) { - /* be wary. It seems as if some implementations pretend that the - * directories contain data */ - return gsf_input_new((const unsigned char*) "", - (off_t) 0, - 0); - } - child = ole_dup (parent); - if (child == NULL) - return NULL; - child->dirent = dirent; - child->size = (off_t) dirent->size; - - info = parent->info; - - if (dirent->use_sb) { /* build the bat */ - metabat = &info->sb.bat; - size_guess = dirent->size >> info->sb.shift; - sb_file = ole_info_get_sb_file (parent); - } else { - metabat = &info->bb.bat; - size_guess = dirent->size >> info->bb.shift; - } - if (ole_make_bat (metabat, size_guess + 1, dirent->first_block, &child->bat)) { - gsf_infile_msole_finalize(child); - return NULL; - } - - if (dirent->use_sb) { - unsigned i; - guint8 const *data; - - if (sb_file == NULL) { - gsf_infile_msole_finalize(child); - return NULL; - } - - child->stream.buf_size = info->threshold; - child->stream.buf = g_new (guint8, info->threshold); - - for (i = 0 ; i < child->bat.num_blocks; i++) - if (gsf_input_seek (sb_file, - (off_t)(child->bat.block [i] << info->sb.shift), SEEK_SET) < 0 || - (data = gsf_input_read (sb_file, - info->sb.size, - child->stream.buf + (i << info->sb.shift))) == NULL) { - gsf_infile_msole_finalize(child); - return NULL; - } - } - buf = malloc(child->size); - if (buf == NULL) { - gsf_infile_msole_finalize(child); - return NULL; - } - if (NULL == gsf_infile_msole_read(child, - child->size, - (guint8*) buf)) { - gsf_infile_msole_finalize(child); - return NULL; - } - gsf_infile_msole_finalize(child); - return gsf_input_new((const unsigned char*) buf, - (off_t) dirent->size, - 1); -} - - -static struct GsfInput * -gsf_infile_msole_child_by_index (GsfInfileMSOle * ole, int target) -{ - GList *p; - - for (p = ole->dirent->children; p != NULL ; p = p->next) - if (target-- <= 0) - return gsf_infile_msole_new_child (ole, - (MSOleDirent *)p->data); - return NULL; -} - -static char const * -gsf_infile_msole_name_by_index (GsfInfileMSOle * ole, int target) -{ - GList *p; - - for (p = ole->dirent->children; p != NULL ; p = p->next) - if (target-- <= 0) - return ((MSOleDirent *)p->data)->name; - return NULL; -} - -static int -gsf_infile_msole_num_children (GsfInfileMSOle * ole) -{ - g_return_val_if_fail (ole->dirent != NULL, -1); - - if (!ole->dirent->is_directory) - return -1; - return g_list_length (ole->dirent->children); -} - - -/** - * gsf_infile_msole_new : - * @source : - * - * Opens the root directory of an MS OLE file. - * NOTE : adds a reference to @source - * - * Returns : the new ole file handler - **/ -static GsfInfileMSOle * -gsf_infile_msole_new (struct GsfInput *source) -{ - GsfInfileMSOle * ole; - - ole = malloc(sizeof(GsfInfileMSOle)); - if (ole == NULL) - return NULL; - gsf_infile_msole_init(ole); - ole->input = source; - ole->size = (off_t) 0; - - if (ole_init_info (ole)) { - gsf_infile_msole_finalize(ole); - return NULL; - } - - return ole; -} - - - - +#include <gsf/gsf-utils.h> +#include <gsf/gsf-input-memory.h> +#include <gsf/gsf-infile.h> +#include <gsf/gsf-infile-msole.h> +#include <gsf/gsf-msole-utils.h> +#define DEBUG_OLE2 0 /* ******************************** main extraction code ************************ */ @@ -1240,21 +53,21 @@ static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordList *oldhead, const char *phrase, EXTRACTOR_KeywordType type) { - EXTRACTOR_KeywordList * keyword; - - if (strlen(phrase) == 0) - return oldhead; - if (0 == strcmp(phrase, "\"\"")) - return oldhead; - if (0 == strcmp(phrase, "\" \"")) - return oldhead; - if (0 == strcmp(phrase, " ")) - return oldhead; - keyword = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList)); - keyword->next = oldhead; - keyword->keyword = strdup(phrase); - keyword->keywordType = type; - return keyword; + EXTRACTOR_KeywordList * keyword; + + if (strlen(phrase) == 0) + return oldhead; + if (0 == strcmp(phrase, "\"\"")) + return oldhead; + if (0 == strcmp(phrase, "\" \"")) + return oldhead; + if (0 == strcmp(phrase, " ")) + return oldhead; + keyword = malloc(sizeof(EXTRACTOR_KeywordList)); + keyword->next = oldhead; + keyword->keyword = strdup(phrase); + keyword->keywordType = type; + return keyword; } @@ -1273,122 +86,6 @@ static guint8 const user_guid [] = { 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae }; -typedef enum { - GSF_MSOLE_META_DATA_COMPONENT, - GSF_MSOLE_META_DATA_DOCUMENT, - GSF_MSOLE_META_DATA_USER -} GsfMSOleMetaDataType; - -typedef enum { - LE_VT_EMPTY = 0, - LE_VT_NULL = 1, - LE_VT_I2 = 2, - LE_VT_I4 = 3, - LE_VT_R4 = 4, - LE_VT_R8 = 5, - LE_VT_CY = 6, - LE_VT_DATE = 7, - LE_VT_BSTR = 8, - LE_VT_DISPATCH = 9, - LE_VT_ERROR = 10, - LE_VT_BOOL = 11, - LE_VT_VARIANT = 12, - LE_VT_UNKNOWN = 13, - LE_VT_DECIMAL = 14, - LE_VT_I1 = 16, - LE_VT_UI1 = 17, - LE_VT_UI2 = 18, - LE_VT_UI4 = 19, - LE_VT_I8 = 20, - LE_VT_UI8 = 21, - LE_VT_INT = 22, - LE_VT_UINT = 23, - LE_VT_VOID = 24, - LE_VT_HRESULT = 25, - LE_VT_PTR = 26, - LE_VT_SAFEARRAY = 27, - LE_VT_CARRAY = 28, - LE_VT_USERDEFINED = 29, - LE_VT_LPSTR = 30, - LE_VT_LPWSTR = 31, - LE_VT_FILETIME = 64, - LE_VT_BLOB = 65, - LE_VT_STREAM = 66, - LE_VT_STORAGE = 67, - LE_VT_STREAMED_OBJECT = 68, - LE_VT_STORED_OBJECT = 69, - LE_VT_BLOB_OBJECT = 70, - LE_VT_CF = 71, - LE_VT_CLSID = 72, - LE_VT_VECTOR = 0x1000 -} GsfMSOleVariantType; - -typedef struct { - char const *name; - guint32 id; - GsfMSOleVariantType prefered_type; -} GsfMSOleMetaDataPropMap; - -typedef struct { - guint32 id; - off_t offset; -} GsfMSOleMetaDataProp; - -typedef struct { - GsfMSOleMetaDataType type; - off_t offset; - guint32 size, num_props; - GIConv iconv_handle; - unsigned char_size; - GHashTable *dict; -} GsfMSOleMetaDataSection; - -static GsfMSOleMetaDataPropMap const document_props[] = { - { "Category", 2, LE_VT_LPSTR }, - { "PresentationFormat", 3, LE_VT_LPSTR }, - { "NumBytes", 4, LE_VT_I4 }, - { "NumLines", 5, LE_VT_I4 }, - { "NumParagraphs", 6, LE_VT_I4 }, - { "NumSlides", 7, LE_VT_I4 }, - { "NumNotes", 8, LE_VT_I4 }, - { "NumHiddenSlides", 9, LE_VT_I4 }, - { "NumMMClips", 10, LE_VT_I4 }, - { "Scale", 11, LE_VT_BOOL }, - { "HeadingPairs", 12, LE_VT_VECTOR | LE_VT_VARIANT }, - { "DocumentParts", 13, LE_VT_VECTOR | LE_VT_LPSTR }, - { "Manager", 14, LE_VT_LPSTR }, - { "Company", 15, LE_VT_LPSTR }, - { "LinksDirty", 16, LE_VT_BOOL } -}; - -static GsfMSOleMetaDataPropMap const component_props[] = { - { "Title", 2, LE_VT_LPSTR }, - { "Subject", 3, LE_VT_LPSTR }, - { "Author", 4, LE_VT_LPSTR }, - { "Keywords", 5, LE_VT_LPSTR }, - { "Comments", 6, LE_VT_LPSTR }, - { "Template", 7, LE_VT_LPSTR }, - { "LastSavedBy", 8, LE_VT_LPSTR }, - { "RevisionNumber", 9, LE_VT_LPSTR }, - { "TotalEditingTime", 10, LE_VT_FILETIME }, - { "LastPrinted", 11, LE_VT_FILETIME }, - { "CreateTime", 12, LE_VT_FILETIME }, - { "LastSavedTime", 13, LE_VT_FILETIME }, - { "NumPages", 14, LE_VT_I4 }, - { "NumWords", 15, LE_VT_I4 }, - { "NumCharacters", 16, LE_VT_I4 }, - { "Thumbnail", 17, LE_VT_CF }, - { "AppName", 18, LE_VT_LPSTR }, - { "Security", 19, LE_VT_I4 } -}; - -static GsfMSOleMetaDataPropMap const common_props[] = { - { "Dictionary", 0, 0, /* magic */}, - { "CodePage", 1, LE_VT_UI2 }, - { "LOCALE_SYSTEM_DEFAULT", 0x80000000, LE_VT_UI4}, - { "CASE_SENSITIVE", 0x80000003, LE_VT_UI4}, -}; - typedef struct { char * text; EXTRACTOR_KeywordType type; @@ -1398,8 +95,8 @@ static Matches tmap[] = { { "Title", EXTRACTOR_TITLE }, { "PresentationFormat", EXTRACTOR_FORMAT }, { "Category", EXTRACTOR_DESCRIPTION }, - { "Manager", EXTRACTOR_CREATED_FOR }, - { "Company", EXTRACTOR_ORGANIZATION }, + { "Manager", EXTRACTOR_MANAGER }, + { "Company", EXTRACTOR_COMPANY }, { "Subject", EXTRACTOR_SUBJECT }, { "Author", EXTRACTOR_AUTHOR }, { "Keywords", EXTRACTOR_KEYWORDS }, @@ -1412,709 +109,98 @@ static Matches tmap[] = { { "NumBytes", EXTRACTOR_SIZE }, { "CreatedTime", EXTRACTOR_CREATION_DATE }, { "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE }, + { "gsf:company", EXTRACTOR_COMPANY }, + /* { "gsf:security", EXTRACTOR_SECURITY }, */ + { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT }, + { "gsf:page-count", EXTRACTOR_PAGE_COUNT }, + { "gsf:line-count", EXTRACTOR_LINE_COUNT }, + { "gsf:word-count", EXTRACTOR_WORD_COUNT }, + { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT }, + { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY }, + /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */ + { "gsf:manager", EXTRACTOR_MANAGER }, + { "dc:title", EXTRACTOR_TITLE }, + { "dc:creator", EXTRACTOR_CREATOR }, + { "dc:date", EXTRACTOR_DATE }, + { "dc:subject", EXTRACTOR_SUBJECT }, + { "dc:keywords", EXTRACTOR_KEYWORDS }, + { "dc:last-printed", EXTRACTOR_LAST_PRINTED }, + { "dc:description", EXTRACTOR_DESCRIPTION }, + { "meta:creation-date", EXTRACTOR_CREATION_DATE }, + /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */ + { "meta:generator", EXTRACTOR_GENERATOR }, + { "meta:template", EXTRACTOR_TEMPLATE }, + /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */ + /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */ { NULL, 0 }, }; - -static char const * -msole_prop_id_to_gsf (GsfMSOleMetaDataSection *section, guint32 id) -{ - char const *res = NULL; - GsfMSOleMetaDataPropMap const *map = NULL; - unsigned i = 0; - - if (section->dict != NULL) { - if (id & 0x1000000) { - id &= ~0x1000000; - d (printf ("LINKED ");); - } - - res = g_hash_table_lookup (section->dict, GINT_TO_POINTER (id)); - - if (res != NULL) { - d (printf (res);); - return res; - } - } - - if (section->type == GSF_MSOLE_META_DATA_COMPONENT) { - map = component_props; - i = G_N_ELEMENTS (component_props); - } else if (section->type == GSF_MSOLE_META_DATA_DOCUMENT) { - map = document_props; - i = G_N_ELEMENTS (document_props); - } - while (i-- > 0) - if (map[i].id == id) { - d (printf (map[i].name);); - return map[i].name; - } - - map = common_props; - i = G_N_ELEMENTS (common_props); - while (i-- > 0) - if (map[i].id == id) { - d (printf (map[i].name);); - return map[i].name; - } - - d (printf ("_UNKNOWN_(0x%x %d)", id, id);); - - return NULL; -} - -static GValue * -msole_prop_parse(GsfMSOleMetaDataSection *section, - guint32 type, - guint8 const **data, - guint8 const *data_end) -{ - GValue *res; - char *str; - guint32 len; - gboolean const is_vector = type & LE_VT_VECTOR; - GError * error; - - g_return_val_if_fail (!(type & (unsigned)(~0x1fff)), NULL); /* not valid in a prop set */ - - type &= 0xfff; - - if (is_vector) { - unsigned i, n; - - g_return_val_if_fail (*data + 4 <= data_end, NULL); - - n = GSF_LE_GET_GUINT32 (*data); - *data += 4; - - d (printf (" array with %d elem\n", n);); - for (i = 0 ; i < n ; i++) { - GValue *v; - d (printf ("\t[%d] ", i);); - v = msole_prop_parse (section, type, data, data_end); - if (v) { - /* FIXME: do something with it. */ - if (G_IS_VALUE (v)) - g_value_unset (v); - g_free (v); - } - } - return NULL; - } - - res = g_new0 (GValue, 1); - switch (type) { - case LE_VT_EMPTY : d (puts ("VT_EMPTY");); - /* value::unset == empty */ - break; - - case LE_VT_NULL : d (puts ("VT_NULL");); - /* value::unset == null too :-) do we need to distinguish ? */ - break; - - case LE_VT_I2 : d (puts ("VT_I2");); - g_return_val_if_fail (*data + 2 <= data_end, NULL); - g_value_init (res, G_TYPE_INT); - g_value_set_int (res, GSF_LE_GET_GINT16 (*data)); - *data += 2; - break; - - case LE_VT_I4 : d (puts ("VT_I4");); - g_return_val_if_fail (*data + 4 <= data_end, NULL); - g_value_init (res, G_TYPE_INT); - g_value_set_int (res, GSF_LE_GET_GINT32 (*data)); - *data += 4; - break; - - case LE_VT_R4 : d (puts ("VT_R4");); - g_return_val_if_fail (*data + 4 <= data_end, NULL); - g_value_init (res, G_TYPE_FLOAT); - g_value_set_float (res, GSF_LE_GET_FLOAT (*data)); - *data += 4; - break; - - case LE_VT_R8 : d (puts ("VT_R8");); - g_return_val_if_fail (*data + 8 <= data_end, NULL); - g_value_init (res, G_TYPE_DOUBLE); - g_value_set_double (res, GSF_LE_GET_DOUBLE (*data)); - *data += 8; - break; - - case LE_VT_CY : d (puts ("VT_CY");); - /* 8-byte two's complement integer (scaled by 10,000) */ - /* CHEAT : just store as an int64 for now */ - g_return_val_if_fail (*data + 8 <= data_end, NULL); - g_value_init (res, G_TYPE_INT64); - g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data)); - break; - - case LE_VT_DATE : d (puts ("VT_DATE");); - break; - - case LE_VT_BSTR : d (puts ("VT_BSTR");); - break; - - case LE_VT_DISPATCH : d (puts ("VT_DISPATCH");); - break; - - case LE_VT_BOOL : d (puts ("VT_BOOL");); - g_return_val_if_fail (*data + 1 <= data_end, NULL); - g_value_init (res, G_TYPE_BOOLEAN); - g_value_set_boolean (res, **data ? TRUE : FALSE); - *data += 1; - break; - - case LE_VT_VARIANT : d (printf ("VT_VARIANT containing a ");); - g_free (res); - type = GSF_LE_GET_GUINT32 (*data); - *data += 4; - return msole_prop_parse (section, type, data, data_end); - - case LE_VT_UI1 : d (puts ("VT_UI1");); - g_return_val_if_fail (*data + 1 <= data_end, NULL); - g_value_init (res, G_TYPE_UCHAR); - g_value_set_uchar (res, (guchar)(**data)); - *data += 1; - break; - - case LE_VT_UI2 : d (puts ("VT_UI2");); - g_return_val_if_fail (*data + 2 <= data_end, NULL); - g_value_init (res, G_TYPE_UINT); - g_value_set_uint (res, GSF_LE_GET_GUINT16 (*data)); - *data += 2; - break; - - case LE_VT_UI4 : d (puts ("VT_UI4");); - g_return_val_if_fail (*data + 4 <= data_end, NULL); - g_value_init (res, G_TYPE_UINT); - *data += 4; - d (printf ("%u\n", GSF_LE_GET_GUINT32 (*data));); - break; - - case LE_VT_I8 : d (puts ("VT_I8");); - g_return_val_if_fail (*data + 8 <= data_end, NULL); - g_value_init (res, G_TYPE_INT64); - g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data)); - *data += 8; - break; - - case LE_VT_UI8 : d (puts ("VT_UI8");); - g_return_val_if_fail (*data + 8 <= data_end, NULL); - g_value_init (res, G_TYPE_UINT64); - g_value_set_uint64 (res, GSF_LE_GET_GUINT64 (*data)); - *data += 8; - break; - - case LE_VT_LPSTR : d (puts ("VT_LPSTR");); - /* - * This is the representation of many strings. It is stored in - * the same representation as VT_BSTR. Note that the serialized - * representation of VP_LPSTR has a preceding byte count, whereas - * the in-memory representation does not. - */ - /* be anal and safe */ - g_return_val_if_fail (*data + 4 <= data_end, NULL); - - len = GSF_LE_GET_GUINT32 (*data); - - g_return_val_if_fail (len < 0x10000, NULL); - g_return_val_if_fail (*data + 4 + len*section->char_size <= data_end, NULL); - - error = NULL; - d (gsf_mem_dump (*data + 4, len * section->char_size);); - str = g_convert_with_iconv ((char*) *data + 4, - len * section->char_size, - section->iconv_handle, NULL, NULL, &error); - - g_value_init (res, G_TYPE_STRING); - if (NULL != str) { - g_value_set_string (res, str); - g_free (str); - } else if (NULL != error) { - g_warning ("error: %s", error->message); - g_error_free (error); - } else { - // g_warning ("unknown error converting string property, using blank"); - } - *data += 4 + len * section->char_size; - break; - - case LE_VT_LPWSTR : d (puts ("VT_LPWSTR");); - /* - * A counted and null-terminated Unicode string; a DWORD character - * count (where the count includes the terminating null) followed - * by that many Unicode (16-bit) characters. Note that the count - * is character count, not byte count. - */ - /* be anal and safe */ - g_return_val_if_fail (*data + 4 <= data_end, NULL); - - len = GSF_LE_GET_GUINT32 (*data); - - g_return_val_if_fail (len < 0x10000, NULL); - g_return_val_if_fail (*data + 4 + len <= data_end, NULL); - - error = NULL; - d (gsf_mem_dump (*data + 4, len*2);); - str = g_convert ((char*) *data + 4, - len*2, - "UTF-8", - "UTF-16LE", - NULL, - NULL, - &error); - - g_value_init (res, G_TYPE_STRING); - if (NULL != str) { - g_value_set_string (res, str); - g_free (str); - } else if (NULL != error) { - g_warning ("error: %s", error->message); - g_error_free (error); - } else { - g_warning ("unknown error converting string property, using blank"); - } - *data += 4 + len*2; - break; - - case LE_VT_FILETIME : d (puts ("VT_FILETIME");); - - g_return_val_if_fail (*data + 8 <= data_end, NULL); - - g_value_init (res, G_TYPE_STRING); - { - /* ft * 100ns since Jan 1 1601 */ - guint64 ft = GSF_LE_GET_GUINT64 (*data); - - ft /= 10000000; /* convert to seconds */ -#ifdef _MSC_VER - ft -= 11644473600i64; /* move to Jan 1 1970 */ -#else - ft -= 11644473600ULL; /* move to Jan 1 1970 */ -#endif - - str = g_strdup(ctime((time_t*)&ft)); - - g_value_set_string (res, str); - - *data += 8; - break; - } - case LE_VT_BLOB : d (puts ("VT_BLOB");); - g_free (res); - res = NULL; - break; - case LE_VT_STREAM : d (puts ("VT_STREAM");); - g_free (res); - res = NULL; - break; - case LE_VT_STORAGE : d (puts ("VT_STORAGE");); - g_free (res); - res = NULL; - break; - case LE_VT_STREAMED_OBJECT: d (puts ("VT_STREAMED_OBJECT");); - g_free (res); - res = NULL; - break; - case LE_VT_STORED_OBJECT : d (puts ("VT_STORED_OBJECT");); - g_free (res); - res = NULL; - break; - case LE_VT_BLOB_OBJECT : d (puts ("VT_BLOB_OBJECT");); - g_free (res); - res = NULL; - break; - case LE_VT_CF : d (puts ("VT_CF");); - break; - case LE_VT_CLSID : d (puts ("VT_CLSID");); - *data += 16; - g_free (res); - res = NULL; - break; - - case LE_VT_ERROR : - case LE_VT_UNKNOWN : - case LE_VT_DECIMAL : - case LE_VT_I1 : - case LE_VT_INT : - case LE_VT_UINT : - case LE_VT_VOID : - case LE_VT_HRESULT : - case LE_VT_PTR : - case LE_VT_SAFEARRAY : - case LE_VT_CARRAY : - case LE_VT_USERDEFINED : - warning ("type %d (0x%x) is not permitted in property sets", - type, type); - g_free (res); - res = NULL; - break; - - default : - warning ("Unknown property type %d (0x%x)", type, type); - g_free (res); - res = NULL; - }; - - d ( if (res != NULL && G_IS_VALUE (res)) { - char *val = g_strdup_value_contents (res); - d(printf ("%s\n", val);); - g_free (val); - } else - puts ("<unparsed>\n"); - ); - return res; -} - -static GValue * -msole_prop_read (struct GsfInput *in, - GsfMSOleMetaDataSection *section, - GsfMSOleMetaDataProp *props, - unsigned i) -{ - guint32 type; - guint8 const *data; - /* TODO : why size-4 ? I must be missing something */ - off_t size = ((i+1) >= section->num_props) - ? section->size-4 : props[i+1].offset; - char const *prop_name; - - g_return_val_if_fail (i < section->num_props, NULL); - g_return_val_if_fail (size >= props[i].offset + 4, NULL); - - size -= props[i].offset; /* includes the type id */ - if (gsf_input_seek (in, section->offset+props[i].offset, SEEK_SET) || - NULL == (data = gsf_input_read (in, size, NULL))) { - warning ("failed to read prop #%d", i); - return NULL; - } - - type = GSF_LE_GET_GUINT32 (data); - data += 4; - - /* dictionary is magic */ - if (props[i].id == 0) { - guint32 len, id, i, n; - gsize gslen; - char *name; - guint8 const *start = data; - - g_return_val_if_fail (section->dict == NULL, NULL); - - section->dict = g_hash_table_new_full ( - g_direct_hash, g_direct_equal, - NULL, g_free); - - n = type; - for (i = 0 ; i < n ; i++) { - id = GSF_LE_GET_GUINT32 (data); - len = GSF_LE_GET_GUINT32 (data + 4); - - g_return_val_if_fail (len < 0x10000, NULL); - - gslen = 0; - name = g_convert_with_iconv ((char*) data + 8, - len * section->char_size, - section->iconv_handle, &gslen, NULL, NULL); - - len = (guint32)gslen; - data += 8 + len; - - d (printf ("\t%u == %s\n", id, name);); - g_hash_table_replace (section->dict, - GINT_TO_POINTER (id), name); - - /* MS documentation blows goats ! - * The docs claim there are padding bytes in the dictionary. - * Their examples show padding bytes. - * In reality non-unicode strings do not see to have padding. - */ - if (section->char_size != 1 && (data - start) % 4) - data += 4 - ((data - start) % 4); - } - - return NULL; - } - - d (printf ("%u) ", i);); - prop_name = msole_prop_id_to_gsf (section, props[i].id); - - d (printf (" @ %x %x = ", (unsigned)props[i].offset, (unsigned)size);); - return msole_prop_parse (section, type, &data, data + size); -} - -static int -msole_prop_cmp (gconstpointer a, gconstpointer b) -{ - GsfMSOleMetaDataProp const *prop_a = a ; - GsfMSOleMetaDataProp const *prop_b = b ; - return prop_a->offset - prop_b->offset; -} - -/** - * gsf_msole_iconv_open_codepage_for_import : - * @to: - * @codepage : - * - * Returns an iconv converter for @codepage -> utf8. - **/ -static GIConv -gsf_msole_iconv_open_codepage_for_import(char const *to, - int codepage) -{ - GIConv iconv_handle; - - g_return_val_if_fail (to != NULL, (GIConv)(-1)); - /* sometimes it is stored as signed short */ - if (codepage == 65001 || codepage == -535) { - iconv_handle = g_iconv_open (to, "UTF-8"); - if (iconv_handle != (GIConv)(-1)) - return iconv_handle; - } else if (codepage != 1200 && codepage != 1201) { - char* src_charset = g_strdup_printf ("CP%d", codepage); - iconv_handle = g_iconv_open (to, src_charset); - g_free (src_charset); - if (iconv_handle != (GIConv)(-1)) - return iconv_handle; +static void processMetadata(gpointer key, + gpointer value, + gpointer user_data) { + struct EXTRACTOR_Keywords ** pprev = user_data; + const char * type = key; + const GsfDocProp * prop = value; + const GValue * gval; + char * contents; + int pos; + + if ( (key == NULL) || + (value == NULL) ) + return; + gval = gsf_doc_prop_get_val(prop); + + if (G_VALUE_TYPE(gval) == G_TYPE_STRING) { + contents = strdup(g_value_get_string(gval)); } else { - char const *from = (codepage == 1200) ? "UTF-16LE" : "UTF-16BE"; - iconv_handle = g_iconv_open (to, from); - if (iconv_handle != (GIConv)(-1)) - return iconv_handle; - } - - /* Try aliases. */ - if (codepage == 10000) { - /* gnu iconv. */ - iconv_handle = g_iconv_open (to, "MACROMAN"); - if (iconv_handle != (GIConv)(-1)) - return iconv_handle; - - /* glibc. */ - iconv_handle = g_iconv_open (to, "MACINTOSH"); - if (iconv_handle != (GIConv)(-1)) - return iconv_handle; + /* convert other formats? */ + contents = g_strdup_value_contents(gval); } - - warning ("Unable to open an iconv handle from codepage %d -> %s", - codepage, to); - return (GIConv)(-1); -} - -/** - * gsf_msole_iconv_open_for_import : - * @codepage : - * - * Returns an iconv converter for single byte encodings @codepage -> utf8. - * Attempt to handle the semantics of a specification for multibyte encodings - * since this is only supposed to be used for single bytes. - **/ -static GIConv -gsf_msole_iconv_open_for_import (int codepage) -{ - return gsf_msole_iconv_open_codepage_for_import ("UTF-8", codepage); -} - - - - - -static struct EXTRACTOR_Keywords * process(struct GsfInput * in, - struct EXTRACTOR_Keywords * prev) { - guint8 const *data = gsf_input_read (in, 28, NULL); - guint16 version; - guint32 os, num_sections; - unsigned i, j; - GsfMSOleMetaDataSection *sections; - GsfMSOleMetaDataProp *props; - - if (NULL == data) - return prev; - - /* NOTE : high word is the os, low word is the os version - * 0 = win16 - * 1 = mac - * 2 = win32 - */ - os = GSF_LE_GET_GUINT16 (data + 6); - - version = GSF_LE_GET_GUINT16 (data + 2); - - num_sections = GSF_LE_GET_GUINT32 (data + 24); - if (GSF_LE_GET_GUINT16 (data + 0) != 0xfffe - || (version != 0 && version != 1) - || os > 2 - || num_sections > 100) { /* arbitrary sanity check */ - return prev; + if ( (strlen(contents) > 0) && + (contents[strlen(contents)-1] == '\n') ) + contents[strlen(contents)-1] = '\0'; + if (contents == NULL) + return; + pos = 0; + while (tmap[pos].text != NULL) { + if (0 == strcmp(tmap[pos].text, + type)) + break; + pos++; } - - /* extract the section info */ - sections = (GsfMSOleMetaDataSection *)g_alloca (sizeof (GsfMSOleMetaDataSection)* num_sections); - for (i = 0 ; i < num_sections ; i++) { - data = gsf_input_read (in, 20, NULL); - if (NULL == data) { - return prev; - } - if (!memcmp (data, component_guid, sizeof (component_guid))) - sections [i].type = GSF_MSOLE_META_DATA_COMPONENT; - else if (!memcmp (data, document_guid, sizeof (document_guid))) - sections [i].type = GSF_MSOLE_META_DATA_DOCUMENT; - else if (!memcmp (data, user_guid, sizeof (user_guid))) - sections [i].type = GSF_MSOLE_META_DATA_USER; - else { - sections [i].type = GSF_MSOLE_META_DATA_USER; - warning ("Unknown property section type, treating it as USER"); - } - - sections [i].offset = GSF_LE_GET_GUINT32 (data + 16); -#ifndef NO_DEBUG_OLE_PROPS - d(printf ("0x%x\n", (guint32)sections [i].offset);); + if (tmap[pos].text != NULL) + *pprev = addKeyword(*pprev, + contents, + tmap[pos].type); +#if DEBUG_OLE2 + else + printf("No match for type `%s'\n", + type); #endif - } - for (i = 0 ; i < num_sections ; i++) { - if (gsf_input_seek (in, sections[i].offset, SEEK_SET) || - NULL == (data = gsf_input_read (in, 8, NULL))) { - return prev; - } - - sections[i].iconv_handle = (GIConv)-1; - sections[i].char_size = 1; - sections[i].dict = NULL; - sections[i].size = GSF_LE_GET_GUINT32 (data); /* includes header */ - sections[i].num_props = GSF_LE_GET_GUINT32 (data + 4); - if (sections[i].num_props <= 0) - continue; - props = g_new (GsfMSOleMetaDataProp, sections[i].num_props); - for (j = 0; j < sections[i].num_props; j++) { - if (NULL == (data = gsf_input_read (in, 8, NULL))) { - g_free (props); - return prev; - } - - props [j].id = GSF_LE_GET_GUINT32 (data); - props [j].offset = GSF_LE_GET_GUINT32 (data + 4); - } - - /* order prop info by offset to facilitate bounds checking */ - qsort (props, sections[i].num_props, - sizeof (GsfMSOleMetaDataProp), - msole_prop_cmp); + free(contents); +} - sections[i].iconv_handle = (GIConv)-1; - sections[i].char_size = 1; - for (j = 0; j < sections[i].num_props; j++) /* first codepage */ - if (props[j].id == 1) { - GValue *v = msole_prop_read (in, sections+i, props, j); - if (v != NULL) { - if (G_IS_VALUE (v)) { - if (G_VALUE_HOLDS_INT (v)) { - int codepage = g_value_get_int (v); - sections[i].iconv_handle = gsf_msole_iconv_open_for_import (codepage); - if (codepage == 1200 || codepage == 1201) - sections[i].char_size = 2; - } - g_value_unset (v); - } - g_free (v) ; - } - } - if (sections[i].iconv_handle == (GIConv)-1) - sections[i].iconv_handle = gsf_msole_iconv_open_for_import (1252); - for (j = 0; j < sections[i].num_props; j++) /* then dictionary */ - if (props[j].id == 0) { - GValue *v = msole_prop_read (in, sections+i, props, j); - if (v) { - if (G_VALUE_TYPE(v) == G_TYPE_STRING) { - gchar * contents = g_strdup_value_contents(v); - free(contents); - } else { - - /* FIXME: do something with non-strings... */ - } - if (G_IS_VALUE (v)) - g_value_unset (v); - g_free (v); - } - } - for (j = 0; j < sections[i].num_props; j++) /* the rest */ - if (props[j].id > 1) { - GValue *v = msole_prop_read (in, sections+i, props, j); - if (v && G_IS_VALUE(v)) { - gchar * contents = NULL; - int pc; - int ipc; - - if (G_VALUE_TYPE(v) == G_TYPE_STRING) { - contents = strdup(g_value_get_string(v)); - } else { - /* convert other formats? */ - contents = g_strdup_value_contents(v); - } - pc = 0; - if (contents != NULL) { - for (ipc=strlen(contents)-1;ipc>=0;ipc--) - if ( (isprint(contents[ipc])) && - (! isspace(contents[ipc])) ) - pc++; - if ( (strlen(contents) > 0) && - (contents[strlen(contents)-1] == '\n') ) - contents[strlen(contents)-1] = '\0'; - } - if (pc > 0) { - int pos = 0; - const char * prop - = msole_prop_id_to_gsf(sections+i, props[j].id); - if (prop != NULL) { - while (tmap[pos].text != NULL) { - if (0 == strcmp(tmap[pos].text, - prop)) - break; - pos++; - } - if (tmap[pos].text != NULL) - prev = addKeyword(prev, - contents, - tmap[pos].type); - } - } - if (contents != NULL) - free(contents); - } - if (v) { - if (G_IS_VALUE (v)) - g_value_unset (v); - g_free (v); - } - } +static struct EXTRACTOR_Keywords * +process(GsfInput * in, + struct EXTRACTOR_Keywords * prev) { + GsfDocMetaData * sections; + GError * error; - gsf_iconv_close (sections[i].iconv_handle); - g_free (props); - if (sections[i].dict != NULL) - g_hash_table_destroy (sections[i].dict); - } - switch (os) { - case 0: - prev = addKeyword(prev, - "Win16", - EXTRACTOR_OS); - break; - case 1: - prev = addKeyword(prev, - "MacOS", - EXTRACTOR_OS); - break; - case 2: - prev = addKeyword(prev, - "Win32", - EXTRACTOR_OS); - break; + sections = gsf_doc_meta_data_new(); + error = gsf_msole_metadata_read(in, sections); + if (error == NULL) { + gsf_doc_meta_data_foreach(sections, + &processMetadata, + &prev); } + g_object_unref(G_OBJECT(sections)); return prev; } -static struct EXTRACTOR_Keywords * processSO(struct GsfInput * src, - struct EXTRACTOR_Keywords * prev) { +static struct EXTRACTOR_Keywords * +processSO(GsfInput * src, + struct EXTRACTOR_Keywords * prev) { off_t size; char * buf; @@ -2161,61 +247,290 @@ static struct EXTRACTOR_Keywords * processSO(struct GsfInput * src, return prev; } +/* *************** wordleaker stuff *************** */ + +#define __(a) dgettext("iso-639", a) + +static const char * lidToLanguage( unsigned int lid ) { + switch ( lid ) { + case 0x0400: + return _("No Proofing"); + case 0x0401: + return __("Arabic"); + case 0x0402: + return __("Bulgarian"); + case 0x0403: + return __("Catalan"); + case 0x0404: + return _("Traditional Chinese"); + case 0x0804: + return _("Simplified Chinese"); + case 0x0405: + return __("Chechen"); + case 0x0406: + return __("Danish"); + case 0x0407: + return __("German"); + case 0x0807: + return _("Swiss German"); + case 0x0408: + return __("Greek"); + case 0x0409: + return _("U.S. English"); + case 0x0809: + return _("U.K. English"); + case 0x0c09: + return _("Australian English"); + case 0x040a: + return _("Castilian Spanish"); + case 0x080a: + return _("Mexican Spanish"); + case 0x040b: + return __("Finnish"); + case 0x040c: + return __("French"); + case 0x080c: + return _("Belgian French"); + case 0x0c0c: + return _("Canadian French"); + case 0x100c: + return _("Swiss French"); + case 0x040d: + return __("Hebrew"); + case 0x040e: + return __("Hungarian"); + case 0x040f: + return __("Icelandic"); + case 0x0410: + return __("Italian"); + case 0x0810: + return _("Swiss Italian"); + case 0x0411: + return __("Japanese"); + case 0x0412: + return __("Korean"); + case 0x0413: + return __("Dutch"); + case 0x0813: + return _("Belgian Dutch"); + case 0x0414: + return _("Norwegian Bokmal"); + case 0x0814: + return __("Norwegian Nynorsk"); + case 0x0415: + return __("Polish"); + case 0x0416: + return __("Brazilian Portuguese"); + case 0x0816: + return __("Portuguese"); + case 0x0417: + return _("Rhaeto-Romanic"); + case 0x0418: + return __("Romanian"); + case 0x0419: + return __("Russian"); + case 0x041a: + return _("Croato-Serbian (Latin)"); + case 0x081a: + return _("Serbo-Croatian (Cyrillic)"); + case 0x041b: + return __("Slovak"); + case 0x041c: + return __("Albanian"); + case 0x041d: + return __("Swedish"); + case 0x041e: + return __("Thai"); + case 0x041f: + return __("Turkish"); + case 0x0420: + return __("Urdu"); + case 0x0421: + return __("Bahasa"); + case 0x0422: + return __("Ukrainian"); + case 0x0423: + return __("Byelorussian"); + case 0x0424: + return __("Slovenian"); + case 0x0425: + return __("Estonian"); + case 0x0426: + return __("Latvian"); + case 0x0427: + return __("Lithuanian"); + case 0x0429: + return _("Farsi"); + case 0x042D: + return __("Basque"); + case 0x042F: + return __("Macedonian"); + case 0x0436: + return __("Afrikaans"); + case 0x043E: + return __("Malayalam"); + default: + return NULL; + } +} + + +static struct EXTRACTOR_Keywords * +history_extract(GsfInput * stream, + unsigned int lcbSttbSavedBy, + unsigned int fcSttbSavedBy, + struct EXTRACTOR_Keywords * prev) { + unsigned int where = 0; + unsigned char * lbuffer; + unsigned int i; + unsigned int length; + char * author; + char * filename; + char * rbuf; + unsigned int nRev; + + // goto offset of revision + gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET); + if (gsf_input_remaining(stream) < lcbSttbSavedBy) + return prev; + lbuffer = malloc(lcbSttbSavedBy); + // read all the revision history + gsf_input_read(stream, lcbSttbSavedBy, lbuffer); + // there are n strings, so n/2 revisions (author & file) + nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; + where = 6; + for (i=0; i < nRev; i++) { + if (where >= lcbSttbSavedBy) + break; + length = lbuffer[where++]; + if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || + (where + 2 * length + 2 <= where) ) + break; + author = convertToUtf8((const char*) &lbuffer[where], + length * 2, + "UTF-16BE"); + where += length * 2 + 1; + length = lbuffer[where++]; + if ( (where + 2 * length >= lcbSttbSavedBy) || + (where + 2 * length + 1 <= where) ) + break; + filename = convertToUtf8((const char*) &lbuffer[where], + length * 2, + "UTF-16BE"); + where += length * 2 + 1; + rbuf = malloc(strlen(author) + strlen(filename) + 512); + snprintf(rbuf, 512 + strlen(author) + strlen(filename), + _("Revision #%u: Author '%s' worked on '%s'"), + i, author, filename); + free(author); + free(filename); + prev = addKeyword(prev, + rbuf, + EXTRACTOR_REVISION_HISTORY); + free(rbuf); + } + free(lbuffer); + return prev; +} + + +/* ************** main method *********** */ + struct EXTRACTOR_Keywords * libextractor_ole2_extract(const char * filename, const char * data, size_t size, struct EXTRACTOR_Keywords * prev) { - struct GsfInput *input; - struct GsfInfileMSOle * infile; - struct GsfInput * src; + GsfInput * input; + GsfInfile * infile; + GsfInput * src; + GError * err = NULL; const char * name; - const char * software = 0; + const char * software = NULL; int i; - - input = gsf_input_new((const unsigned char*) data, - (off_t) size, - 0); + unsigned int lcb; + unsigned int fcb; + const unsigned char * data512; + unsigned int lid; + const char * lang; + + if (size < 512 + 898) + return prev; /* can hardly be OLE2 */ + input = gsf_input_memory_new((const guint8 *) data, + (gsf_off_t) size, + FALSE); if (input == NULL) return prev; - infile = gsf_infile_msole_new(input); - if (infile == NULL) + infile = gsf_infile_msole_new(input, &err); + if (infile == NULL) { + g_object_unref(G_OBJECT(input)); return prev; - - for (i=0;i<gsf_infile_msole_num_children(infile);i++) { - name = gsf_infile_msole_name_by_index (infile, i); + } + lcb = 0; + fcb = 0; + for (i=0;i<gsf_infile_num_children(infile);i++) { + name = gsf_infile_name_by_index (infile, i); src = NULL; if (name == NULL) continue; if ( (0 == strcmp(name, "\005SummaryInformation")) || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) { - src = gsf_infile_msole_child_by_index (infile, i); - if (src != NULL) + src = gsf_infile_child_by_index (infile, i); + if (src != NULL) prev = process(src, prev); } if (0 == strcmp(name, "SfxDocumentInfo")) { - src = gsf_infile_msole_child_by_index (infile, i); + src = gsf_infile_child_by_index (infile, i); if (src != NULL) prev = processSO(src, prev); } if (src != NULL) - gsf_input_finalize(src); + g_object_unref(G_OBJECT(src)); } - gsf_infile_msole_finalize(infile); + + data512 = (const unsigned char*) &data[512]; + lid = data512[6] + (data512[7] << 8); + lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24); + fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24); + lang = lidToLanguage(lid); + if (lang != NULL) { + prev = addKeyword(prev, + lang, + EXTRACTOR_LANGUAGE); + } + if (lcb >= 6) { + for (i=0;i<gsf_infile_num_children(infile);i++) { + name = gsf_infile_name_by_index (infile, i); + if (name == NULL) + continue; + if ( (0 == strcmp(name, "1Table")) || + (0 == strcmp(name, "0Table")) ) { + src = gsf_infile_child_by_index (infile, i); + if (src != NULL) { + prev = history_extract(src, + lcb, + fcb, + prev); + g_object_unref(G_OBJECT(src)); + } + } + } + } + g_object_unref(G_OBJECT(infile)); /* * Hack to return an appropriate mimetype */ software = EXTRACTOR_extractLast(EXTRACTOR_SOFTWARE, prev); - if(NULL == software) { + if (NULL == software) { /* * when very puzzled, just look at file magic number */ - if( (8 < size) - && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) ) + if ( (8 < size) + && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) ) software = "Microsoft Office"; } diff --git a/src/plugins/wordleaker/Makefile.am b/src/plugins/wordleaker/Makefile.am @@ -1,25 +0,0 @@ -include ../Makefile-plugins.am - -plugin_LTLIBRARIES = \ - libextractor_word.la - -libextractor_word_la_LINK = \ - /bin/sh ../../../libtool --mode=link $(CXXLD) -o libextractor_word.la -libextractor_word_la_LDFLAGS = \ - $(PLUGINFLAGS) $(retaincommand) \ - $(XTRA_CPPLIBS) -libextractor_word_la_LIBADD = \ - $(top_builddir)/src/main/libextractor.la \ - $(top_builddir)/src/plugins/libconvert.la \ - -lm - -libextractor_word_la_SOURCES = \ - pole.h pole.cpp \ - wordleaker.h \ - wordextractor.cc - -# gcc 3.3 produces BROKEN code for -O1 and -O2 (PDF extraction -# would fail silently) hence we MUST override the user flag here -# which may contain -O1 or -O2! -# CXXFLAGS = -O0 - diff --git a/src/plugins/wordleaker/SYMBOLS b/src/plugins/wordleaker/SYMBOLS @@ -1 +0,0 @@ -libextractor_word_extract diff --git a/src/plugins/wordleaker/pole.cpp b/src/plugins/wordleaker/pole.cpp @@ -1,1271 +0,0 @@ -/* POLE - Portable C++ library to access OLE Storage - Copyright (C) 2002-2004 Ariya Hidayat <ariya@kde.org> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, US -*/ - -#include <fstream> -#include <iostream> -#include <list> -#include <string> -#include <vector> - -#include "pole.h" - -namespace POLE -{ - -class Header -{ - public: - unsigned char id[8]; // signature, or magic identifier - unsigned b_shift; // bbat->blockSize = 1 << b_shift - unsigned s_shift; // sbat->blockSize = 1 << s_shift - unsigned num_bat; // blocks allocated for big bat - unsigned dirent_start; // starting block for directory info - unsigned threshold; // switch from small to big file (usually 4K) - unsigned sbat_start; // starting block index to store small bat - unsigned num_sbat; // blocks allocated for small bat - unsigned mbat_start; // starting block to store meta bat - unsigned num_mbat; // blocks allocated for meta bat - unsigned long bb_blocks[109]; - - Header(); - void load( const unsigned char* buffer ); - void save( unsigned char* buffer ); - void debug(); -}; - -class AllocTable -{ - public: - static const unsigned Eof; - static const unsigned Avail; - static const unsigned Bat; - unsigned blockSize; - AllocTable(); - void clear(); - unsigned long count(); - void resize( unsigned long newsize ); - void preserve( unsigned long n ); - void set( unsigned long index, unsigned long val ); - unsigned unused(); - void setChain( std::vector<unsigned long> ); - std::vector<unsigned long> follow( unsigned long start ); - unsigned long operator[](unsigned long index ); - void load( const unsigned char* buffer, unsigned len ); - void save( unsigned char* buffer ); - unsigned size(); - void debug(); - private: - std::vector<unsigned long> data; - AllocTable( const AllocTable& ); - AllocTable& operator=( const AllocTable& ); -}; - -class DirEntry -{ - public: - std::string name; - bool dir; // true if directory - unsigned long size; // size (not valid if directory) - unsigned long start; // starting block - unsigned prev; // previous sibling - unsigned next; // next sibling - unsigned child; // first child -}; - -class DirTree -{ - public: - static const unsigned End; - DirTree(); - void clear(); - unsigned entryCount(); - DirEntry* entry( unsigned index ); - DirEntry* entry( const std::string& name, bool create=false ); - int indexOf( DirEntry* e ); - int parent( unsigned index ); - std::string fullName( unsigned index ); - std::vector<unsigned> children( unsigned index ); - std::vector<DirEntry*> listDirectory(); - bool enterDirectory( const std::string& dir ); - void leaveDirectory(); - std::string path(); - void load( unsigned char* buffer, unsigned len ); - void save( unsigned char* buffer ); - unsigned size(); - void debug(); - private: - unsigned current; - std::vector<DirEntry> entries; - DirTree( const DirTree& ); - DirTree& operator=( const DirTree& ); -}; - -class StorageIO -{ - public: - Storage* storage; - std::string filename; - std::fstream file; - int result; // result of operation - bool opened; // true if file is opened - unsigned long filesize; // size of the file - - Header* header; // storage header - DirTree* dirtree; // directory tree - AllocTable* bbat; // allocation table for big blocks - AllocTable* sbat; // allocation table for small blocks - - std::vector<unsigned long> sb_blocks; // blocks for "small" files - - std::list<Stream*> streams; - - StorageIO( Storage* storage, const char* filename ); - ~StorageIO(); - - bool open(); - void close(); - void flush(); - void load(); - void create(); - - unsigned long loadBigBlocks( std::vector<unsigned long> blocks, unsigned char* buffer, unsigned long maxlen ); - - unsigned long loadBigBlock( unsigned long block, unsigned char* buffer, unsigned long maxlen ); - - unsigned long loadSmallBlocks( std::vector<unsigned long> blocks, unsigned char* buffer, unsigned long maxlen ); - - unsigned long loadSmallBlock( unsigned long block, unsigned char* buffer, unsigned long maxlen ); - - private: - // no copy or assign - StorageIO( const StorageIO& ); - StorageIO& operator=( const StorageIO& ); - -}; - -class StreamImpl -{ - public: - StreamImpl( StorageIO* io, DirEntry* entry ); - ~StreamImpl(); - unsigned long size(); - void seek( unsigned long pos ); - unsigned long tell(); - int getch(); - unsigned long read( unsigned char* data, unsigned long maxlen ); - unsigned long read( unsigned long pos, unsigned char* data, unsigned long maxlen ); - - StorageIO* io; - DirEntry* entry; - - private: - std::vector<unsigned long> blocks; - - // no copy or assign - StreamImpl( const StreamImpl& ); - StreamImpl& operator=( const StreamImpl& ); - - // pointer for read - unsigned long m_pos; - - // simple cache system to speed-up getch() - unsigned char* cache_data; - unsigned long cache_size; - unsigned long cache_pos; - void updateCache(); -}; - -}; // namespace POLE - -using namespace POLE; - -static inline unsigned long readU16( const unsigned char* ptr ) -{ - return ptr[0]+(ptr[1]<<8); -} - -static inline unsigned long readU32( const unsigned char* ptr ) -{ - return ptr[0]+(ptr[1]<<8)+(ptr[2]<<16)+(ptr[3]<<24); -} - -static inline void writeU16( unsigned char* ptr, unsigned long data ) -{ - ptr[0] = data & 0xff; - ptr[1] = (data >> 8) & 0xff; -} - -static inline void writeU32( unsigned char* ptr, unsigned long data ) -{ - ptr[0] = data & 0xff; - ptr[1] = (data >> 8) & 0xff; - ptr[2] = (data >> 16) & 0xff; - ptr[3] = (data >> 24) & 0xff; -} - -static const unsigned char pole_magic[] = - { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 }; - -// =========== Header ========== - -Header::Header() -{ - b_shift = 9; - s_shift = 6; - num_bat = 0; - dirent_start = 0; - threshold = 4096; - sbat_start = 0; - num_sbat = 0; - mbat_start = 0; - num_mbat = 0; - - for( unsigned i = 0; i < 8; i++ ) - id[i] = pole_magic[i]; - for( unsigned i=0; i<109; i++ ) - bb_blocks[i] = AllocTable::Avail; -} - -void Header::load( const unsigned char* buffer ) -{ - b_shift = readU16( buffer + 0x1e ); - s_shift = readU16( buffer + 0x20 ); - num_bat = readU32( buffer + 0x2c ); - dirent_start = readU32( buffer + 0x30 ); - threshold = readU32( buffer + 0x38 ); - sbat_start = readU32( buffer + 0x3c ); - num_sbat = readU32( buffer + 0x40 ); - mbat_start = readU32( buffer + 0x44 ); - num_mbat = readU32( buffer + 0x48 ); - - for( unsigned i = 0; i < 8; i++ ) - id[i] = buffer[i]; - for( unsigned i=0; i<109; i++ ) - bb_blocks[i] = readU32( buffer + 0x4C+i*4 ); -} - -void Header::save( unsigned char* buffer ) -{ - memset( buffer, 0, 0x4c ); - memcpy( buffer, pole_magic, 8 ); // ole signature - writeU32( buffer + 8, 0 ); // unknown - writeU32( buffer + 12, 0 ); // unknown - writeU32( buffer + 16, 0 ); // unknown - writeU16( buffer + 24, 0x003e ); // revision ? - writeU16( buffer + 26, 3 ); // version ? - writeU16( buffer + 28, 0xfffe ); // unknown - writeU16( buffer + 0x1e, b_shift ); - writeU16( buffer + 0x20, s_shift ); - writeU32( buffer + 0x2c, num_bat ); - writeU32( buffer + 0x30, dirent_start ); - writeU32( buffer + 0x38, threshold ); - writeU32( buffer + 0x3c, sbat_start ); - writeU32( buffer + 0x40, num_sbat ); - writeU32( buffer + 0x44, mbat_start ); - writeU32( buffer + 0x48, num_mbat ); - - for( unsigned i=0; i<109; i++ ) - writeU32( buffer + 0x4C+i*4, bb_blocks[i] ); -} - -void Header::debug() -{ - std::cout << std::endl; - std::cout << "b_shift " << b_shift << std::endl; - std::cout << "s_shift " << s_shift << std::endl; - std::cout << "num_bat " << num_bat << std::endl; - std::cout << "dirent_start " << dirent_start << std::endl; - std::cout << "threshold " << threshold << std::endl; - std::cout << "sbat_start " << sbat_start << std::endl; - std::cout << "num_sbat " << num_sbat << std::endl; - std::cout << "mbat_start " << mbat_start << std::endl; - std::cout << "num_mbat " << num_mbat << std::endl; - - unsigned s = (num_bat<=109) ? num_bat : 109; - std::cout << "bat blocks: "; - for( unsigned i = 0; i < s; i++ ) - std::cout << bb_blocks[i] << " "; - std::cout << std::endl; -} - -// =========== AllocTable ========== - -const unsigned AllocTable::Avail = 0xffffffff; -const unsigned AllocTable::Eof = 0xfffffffe; -const unsigned AllocTable::Bat = 0xfffffffd; - -AllocTable::AllocTable() -{ - blockSize = 4096; - // initial size - resize( 128 ); -} - -unsigned long AllocTable::count() -{ - return data.size(); -} - -void AllocTable::resize( unsigned long newsize ) -{ - unsigned oldsize = data.size(); - data.resize( newsize ); - if( newsize > oldsize ) - for( unsigned i = oldsize; i<newsize; i++ ) - data[i] = Avail; -} - -// make sure there're still free blocks -void AllocTable::preserve( unsigned long n ) -{ - std::vector<unsigned long> pre; - for( unsigned i=0; i < n; i++ ) - pre.push_back( unused() ); -} - -unsigned long AllocTable::operator[]( unsigned long index ) -{ - unsigned long result; - result = data[index]; - return result; -} - -void AllocTable::set( unsigned long index, unsigned long value ) -{ - if( index >= count() ) resize( index + 1); - data[ index ] = value; -} - -void AllocTable::setChain( std::vector<unsigned long> chain ) -{ - if( chain.size() ) - { - for( unsigned i=0; i<chain.size()-1; i++ ) - set( chain[i], chain[i+1] ); - set( chain[ chain.size()-1 ], AllocTable::Eof ); - } -} - -// follow -std::vector<unsigned long> AllocTable::follow( unsigned long start ) -{ - std::vector<unsigned long> chain; - - if( start >= count() ) return chain; - - unsigned long p = start; - while( p < count() ) - { - if( p >= (unsigned long)Eof ) break; - if( p >= count() ) break; - chain.push_back( p ); - if( data[p] >= count() ) break; - p = data[ p ]; - } - - return chain; -} - -unsigned AllocTable::unused() -{ - // find first available block - for( unsigned i = 0; i < data.size(); i++ ) - if( data[i] == Avail ) - return i; - - // completely full, so enlarge the table - unsigned block = data.size(); - resize( data.size()+10 ); - return block; -} - -void AllocTable::load( const unsigned char* buffer, unsigned len ) -{ - resize( len / 4 ); - for( unsigned i = 0; i < count(); i++ ) - set( i, readU32( buffer + i*4 ) ); -} - -// return space required to save this dirtree -unsigned AllocTable::size() -{ - return count() * 4; -} - -void AllocTable::save( unsigned char* buffer ) -{ - for( unsigned i = 0; i < count(); i++ ) - writeU32( buffer + i*4, data[i] ); -} - -void AllocTable::debug() -{ - std::cout << "block size " << data.size() << std::endl; - for( unsigned i=0; i< data.size(); i++ ) - { - if( data[i] == Avail ) continue; - std::cout << i << ": "; - if( data[i] == Eof ) std::cout << "eof"; - else std::cout << data[i]; - std::cout << std::endl; - } -} - -// =========== DirTree ========== - -const unsigned DirTree::End = 0xffffffff; - -DirTree::DirTree() -{ - current = 0; - clear(); -} - -void DirTree::clear() -{ - // leave only root entry - entries.resize( 1 ); - entries[0].name = "Root Entry"; - entries[0].dir = true; - entries[0].size = 0; - entries[0].start = End; - entries[0].prev = End; - entries[0].next = End; - entries[0].child = End; - current = 0; -} - -unsigned DirTree::entryCount() -{ - return entries.size(); -} - -DirEntry* DirTree::entry( unsigned index ) -{ - if( index >= entryCount() ) return (DirEntry*) 0; - return &entries[ index ]; -} - -int DirTree::indexOf( DirEntry* e ) -{ - for( unsigned i = 0; i < entryCount(); i++ ) - if( entry( i ) == e ) return i; - - return -1; -} - -int DirTree::parent( unsigned index ) -{ - // brute-force, basically we iterate for each entries, find its children - // and check if one of the children is 'index' - for( unsigned j=0; j<entryCount(); j++ ) - { - std::vector<unsigned> chi = children( j ); - for( unsigned i=0; i<chi.size();i++ ) - if( chi[i] == index ) - return j; - } - - return -1; -} - -std::string DirTree::fullName( unsigned index ) -{ - // don't use root name ("Root Entry"), just give "/" - if( index == 0 ) return "/"; - - std::string result = entry( index )->name; - result.insert( 0, "/" ); - int p = parent( index ); - while( p > 0 ) - { - result.insert( 0, entry( p )->name ); - result.insert( 0, "/" ); - index = p; - if( index <= 0 ) break; - } - return result; -} - -// given a fullname (e.g "/ObjectPool/_1020961869"), find the entry -// if not found and create is false, return 0 -// if create is true, a new entry is returned -DirEntry* DirTree::entry( const std::string& name, bool create ) -{ - if( !name.length() ) return (DirEntry*)0; - - // quick check for "/" (that's root) - if( name == "/" ) return entry( 0 ); - - // split the names, e.g "/ObjectPool/_1020961869" will become: - // "ObjectPool" and "_1020961869" - std::list<std::string> names; - std::string::size_type start = 0, end = 0; - while( start < name.length() ) - { - end = name.find_first_of( '/', start ); - if( end == std::string::npos ) end = name.length(); - names.push_back( name.substr( start, end-start ) ); - start = end+1; - } - - // start from root when name is absolute - // or current directory when name is relative - int index = (name[0] == '/' ) ? 0 : current; - - // trace one by one - std::list<std::string>::iterator it; - for( it = names.begin(); it != names.end(); ++it ) - { - // find among the children of index - std::vector<unsigned> chi = children( index ); - unsigned child = 0; - for( unsigned i = 0; i < chi.size(); i++ ) - { - DirEntry* ce = entry( chi[i] ); - if( ce ) if( ce->name == *it ) - child = chi[i]; - } - - // traverse to the child - if( child > 0 ) index = child; - else - { - // not found among children - if( !create ) return (DirEntry*)0; - - // create a new entry - unsigned parent = index; - entries.push_back( DirEntry() ); - index = entryCount()-1; - DirEntry* e = entry( index ); - e->name = *it; - e->dir = false; - e->size = 0; - e->start = 0; - e->child = End; - e->prev = End; - e->next = entry(parent)->child; - entry(parent)->child = index; - } - } - - return entry( index ); -} - -// helper function: recursively find siblings of index -void dirtree_find_siblings( DirTree* dirtree, std::vector<unsigned>& result, - unsigned index ) -{ - DirEntry* e = dirtree->entry( index ); - if( !e ) return; - - // prevent infinite loop - for( unsigned i = 0; i < result.size(); i++ ) - if( result[i] == index ) return; - - // add myself - result.push_back( index ); - - // visit previous sibling, don't go infinitely - unsigned prev = e->prev; - if( ( prev > 0 ) && ( prev < dirtree->entryCount() ) ) - { - for( unsigned i = 0; i < result.size(); i++ ) - if( result[i] == prev ) prev = 0; - if( prev ) dirtree_find_siblings( dirtree, result, prev ); - } - - // visit next sibling, don't go infinitely - unsigned next = e->next; - if( ( next > 0 ) && ( next < dirtree->entryCount() ) ) - { - for( unsigned i = 0; i < result.size(); i++ ) - if( result[i] == next ) next = 0; - if( next ) dirtree_find_siblings( dirtree, result, next ); - } -} - -std::vector<unsigned> DirTree::children( unsigned index ) -{ - std::vector<unsigned> result; - - DirEntry* e = entry( index ); - if( e ) if( e->child < entryCount() ) - dirtree_find_siblings( this, result, e->child ); - - return result; -} - -std::vector<DirEntry*> DirTree::listDirectory() -{ - std::vector<DirEntry*> result; - - std::vector<unsigned> chi = children( current ); - for( unsigned i = 0; i < chi.size(); i++ ) - result.push_back( entry( chi[i] ) ); - - return result; -} - -bool DirTree::enterDirectory( const std::string& dir ) -{ - DirEntry* e = entry( dir ); - if( !e ) return false; - if( !e->dir ) return false; - - int index = indexOf( e ); - if( index < 0 ) return false; - - current = index; - return true; -} - -void DirTree::leaveDirectory() -{ - // already at root ? - if( current == 0 ) return; - - int p = parent( current ); - if( p >= 0 ) current = p; -} - -std::string DirTree::path() -{ - return fullName( current ); -} - -void DirTree::load( unsigned char* buffer, unsigned size ) -{ - entries.clear(); - current = 0; - - for( unsigned i = 0; i < size/128; i++ ) - { - unsigned p = i * 128; - - // would be < 32 if first char in the name isn't printable - unsigned prefix = 32; - - // parse name of this entry, which stored as Unicode 16-bit - std::string name; - int name_len = readU16( buffer + 0x40+p ); - for( int j=0; ( buffer[j+p]) && (j<name_len); j+= 2 ) - name.append( 1, buffer[j+p] ); - - // first char isn't printable ? remove it... - if( buffer[p] < 32 ) - { - prefix = buffer[0]; - name.erase( 0,1 ); - } - - DirEntry e; - e.name = name; - e.start = readU32( buffer + 0x74+p ); - e.size = readU32( buffer + 0x78+p ); - e.prev = readU32( buffer + 0x44+p ); - e.next = readU32( buffer + 0x48+p ); - e.child = readU32( buffer + 0x4C+p ); - e.dir = ( buffer[ 0x42 + p]!=2 ); - - entries.push_back( e ); - } -} - -// return space required to save this dirtree -unsigned DirTree::size() -{ - return entryCount() * 128; -} - -void DirTree::save( unsigned char* buffer ) -{ - memset( buffer, 0, size() ); - - // root is fixed as "Root Entry" - DirEntry* root = entry( 0 ); - std::string name = "Root Entry"; - for( unsigned j = 0; j < name.length(); j++ ) - buffer[ j*2 ] = name[j]; - writeU16( buffer + 0x40, name.length()*2 + 2 ); - writeU32( buffer + 0x74, 0xffffffff ); - writeU32( buffer + 0x78, 0 ); - writeU32( buffer + 0x44, 0xffffffff ); - writeU32( buffer + 0x48, 0xffffffff ); - writeU32( buffer + 0x4c, root->child ); - buffer[ 0x42 ] = 5; - buffer[ 0x43 ] = 1; - - for( unsigned i = 1; i < entryCount(); i++ ) - { - DirEntry* e = entry( i ); - if( !e ) continue; - if( e->dir ) - { - e->start = 0xffffffff; - e->size = 0; - } - - // max length for name is 32 chars - std::string name = e->name; - if( name.length() > 32 ) - name.erase( 32, name.length() ); - - // write name as Unicode 16-bit - for( unsigned j = 0; j < name.length(); j++ ) - buffer[ i*128 + j*2 ] = name[j]; - - writeU16( buffer + i*128 + 0x40, name.length()*2 + 2 ); - writeU32( buffer + i*128 + 0x74, e->start ); - writeU32( buffer + i*128 + 0x78, e->size ); - writeU32( buffer + i*128 + 0x44, e->prev ); - writeU32( buffer + i*128 + 0x48, e->next ); - writeU32( buffer + i*128 + 0x4c, e->child ); - buffer[ i*128 + 0x42 ] = e->dir ? 1 : 2; - buffer[ i*128 + 0x43 ] = 1; // always black - } -} - -void DirTree::debug() -{ - for( unsigned i = 0; i < entryCount(); i++ ) - { - DirEntry* e = entry( i ); - if( !e ) continue; - std::cout << i << ": "; - std::cout << e->name << " "; - if( e->dir ) std::cout << "(Dir) "; - else std::cout << "(File) "; - std::cout << e->size << " "; - std::cout << "s:" << e->start << " "; - std::cout << "("; - if( e->child == End ) std::cout << "-"; else std::cout << e->child; - std::cout << " "; - if( e->prev == End ) std::cout << "-"; else std::cout << e->prev; - std::cout << ":"; - if( e->next == End ) std::cout << "-"; else std::cout << e->next; - std::cout << ")"; - std::cout << std::endl; - } -} - -// =========== StorageIO ========== - -StorageIO::StorageIO( Storage* st, const char* fname ) -{ - storage = st; - filename = fname; - result = Storage::Ok; - opened = false; - - header = new Header(); - dirtree = new DirTree(); - bbat = new AllocTable(); - sbat = new AllocTable(); - - filesize = 0; - bbat->blockSize = 1 << header->b_shift; - sbat->blockSize = 1 << header->s_shift; -} - -StorageIO::~StorageIO() -{ - if( opened ) close(); - delete sbat; - delete bbat; - delete dirtree; - delete header; -} - -bool StorageIO::open() -{ - // already opened ? close first - if( opened ) close(); - - load(); - - return result == Storage::Ok; -} - -void StorageIO::load() -{ - unsigned char* buffer = 0; - unsigned long buflen = 0; - std::vector<unsigned long> blocks; - - // open the file, check for error - result = Storage::OpenFailed; - file.open( filename.c_str(), std::ios::binary | std::ios::in ); - if( !file.good() ) return; - - // find size of input file - file.seekg( 0, std::ios::end ); - filesize = file.tellg(); - - // load header - buffer = new unsigned char[512]; - file.seekg( 0 ); - file.read( (char*)buffer, 512 ); - header->load( buffer ); - delete[] buffer; - - // check OLE magic id - result = Storage::NotOLE; - for( unsigned i=0; i<8; i++ ) - if( header->id[i] != pole_magic[i] ) - return; - - // sanity checks - result = Storage::BadOLE; - if( header->threshold != 4096 ) return; - if( header->num_bat == 0 ) return; - if( header->s_shift > header->b_shift ) return; - if( header->b_shift <= 6 ) return; - if( header->b_shift >=31 ) return; - - // important block size - bbat->blockSize = 1 << header->b_shift; - sbat->blockSize = 1 << header->s_shift; - - // find blocks allocated to store big bat - // the first 109 blocks are in header, the rest in meta bat - blocks.resize( header->num_bat ); - for( unsigned i = 0; i < header->num_bat; i++ ) - if( i < 109 ) blocks[i] = header->bb_blocks[i]; - if( header->num_bat > 109 ) - if( header->num_mbat > 0 ) - { - buffer = new unsigned char[ bbat->blockSize ]; - unsigned k = 109; - for( unsigned r = 0; r < header->num_mbat; r++ ) - { - loadBigBlock( header->mbat_start+r, buffer, bbat->blockSize ); - for( unsigned s=0; s < bbat->blockSize/4; s+=4 ) - blocks[k++] = readU32( buffer + s ); - // FIXME check if k > num_bat - } - delete[] buffer; - } - - // load big bat - buflen = blocks.size()*bbat->blockSize; - buffer = new unsigned char[ buflen ]; - loadBigBlocks( blocks, buffer, buflen ); - bbat->load( buffer, buflen ); - delete[] buffer; - - // load small bat - blocks.clear(); - blocks = bbat->follow( header->sbat_start ); - buflen = blocks.size()*bbat->blockSize; - buffer = new unsigned char[ buflen ]; - loadBigBlocks( blocks, buffer, buflen ); - sbat->load( buffer, buflen ); - delete[] buffer; - - // load directory tree - blocks = bbat->follow( header->dirent_start ); - buflen = blocks.size()*bbat->blockSize; - buffer = new unsigned char[ buflen ]; - loadBigBlocks( blocks, buffer, buflen ); - sb_blocks = bbat->follow( readU32( buffer + 0x74 ) ); // small files - dirtree->load( buffer, buflen ); - - // fetch block chain as data for small-files - delete[] buffer; - - // so far so good - result = Storage::Ok; - opened = true; -} - -void StorageIO::create() -{ - // std::cout << "Creating " << filename << std::endl; - - file.open( filename.c_str(), std::ios::out|std::ios::binary ); - if( !file.good() ) - { - std::cerr << "Can't create " << filename << std::endl; - result = Storage::OpenFailed; - return; - } - - // so far so good - opened = true; - result = Storage::Ok; -} - -void StorageIO::close() -{ - if( !opened ) return; - - file.close(); - opened = false; - - std::list<Stream*>::iterator it; - for( it = streams.begin(); it != streams.end(); ++it ) - delete *it; -} - -unsigned long StorageIO::loadBigBlocks( std::vector<unsigned long> blocks, - unsigned char* data, unsigned long maxlen ) -{ - // sentinel - if( !data ) return 0; - if( !file.good() ) return 0; - if( blocks.size() < 1 ) return 0; - if( maxlen == 0 ) return 0; - - // read block one by one, seems fast enough - unsigned long bytes = 0; - for( unsigned long i=0; (i < blocks.size() ) & ( bytes<maxlen ); i++ ) - { - unsigned long block = blocks[i]; - if( block < 0 ) continue; - unsigned long pos = bbat->blockSize * ( block+1 ); - unsigned long p = (bbat->blockSize < maxlen-bytes) ? bbat->blockSize : maxlen-bytes; - if( pos + p > filesize ) p = filesize - pos; - file.seekg( pos ); - file.read( (char*)data + bytes, p ); - bytes += p; - } - - return bytes; -} - -unsigned long StorageIO::loadBigBlock( unsigned long block, - unsigned char* data, unsigned long maxlen ) -{ - // sentinel - if( !data ) return 0; - if( !file.good() ) return 0; - if( block < 0 ) return 0; - - // wraps call for loadBigBlocks - std::vector<unsigned long> blocks; - blocks.resize( 1 ); - blocks[ 0 ] = block; - - return loadBigBlocks( blocks, data, maxlen ); -} - -// return number of bytes which has been read -unsigned long StorageIO::loadSmallBlocks( std::vector<unsigned long> blocks, - unsigned char* data, unsigned long maxlen ) -{ - // sentinel - if( !data ) return 0; - if( !file.good() ) return 0; - if( blocks.size() < 1 ) return 0; - if( maxlen == 0 ) return 0; - - // our own local buffer - unsigned char buf[ bbat->blockSize ]; - - // read small block one by one - unsigned long bytes = 0; - for( unsigned long i=0; ( i<blocks.size() ) & ( bytes<maxlen ); i++ ) - { - unsigned long block = blocks[i]; - if( block < 0 ) continue; - - // find where the small-block exactly is - unsigned long pos = block * sbat->blockSize; - unsigned long bbindex = pos / bbat->blockSize; - if( bbindex >= sb_blocks.size() ) break; - - loadBigBlock( sb_blocks[ bbindex ], buf, bbat->blockSize ); - - // copy the data - unsigned offset = pos % bbat->blockSize; - unsigned long p = (maxlen-bytes < bbat->blockSize-offset ) ? maxlen-bytes : bbat->blockSize-offset; - p = (sbat->blockSize<p ) ? sbat->blockSize : p; - memcpy( data + bytes, buf + offset, p ); - bytes += p; - } - - return bytes; -} - -unsigned long StorageIO::loadSmallBlock( unsigned long block, - unsigned char* data, unsigned long maxlen ) -{ - // sentinel - if( !data ) return 0; - if( !file.good() ) return 0; - if( block < 0 ) return 0; - - // wraps call for loadSmallBlocks - std::vector<unsigned long> blocks; - blocks.resize( 1 ); - blocks.assign( 1, block ); - - return loadSmallBlocks( blocks, data, maxlen ); -} - -// =========== StreamImpl ========== - -StreamImpl::StreamImpl( StorageIO* s, DirEntry* e) -{ - io = s; - entry = e; - m_pos = 0; - - if( entry->size >= io->header->threshold ) - blocks = io->bbat->follow( entry->start ); - else - blocks = io->sbat->follow( entry->start ); - - // prepare cache - cache_pos = 0; - cache_size = 4096; // optimal ? - cache_data = new unsigned char[cache_size]; - updateCache(); -} - -// FIXME tell parent we're gone -StreamImpl::~StreamImpl() -{ - delete[] cache_data; -} - -void StreamImpl::seek( unsigned long pos ) -{ - m_pos = pos; -} - -unsigned long StreamImpl::tell() -{ - return m_pos; -} - -int StreamImpl::getch() -{ - // past end-of-file ? - if( m_pos > entry->size ) return -1; - - // need to update cache ? - if( !cache_size || ( m_pos < cache_pos ) || - ( m_pos >= cache_pos + cache_size ) ) - updateCache(); - - // something bad if we don't get good cache - if( !cache_size ) return -1; - - int data = cache_data[m_pos - cache_pos]; - m_pos++; - - return data; -} - -unsigned long StreamImpl::read( unsigned long pos, unsigned char* data, unsigned long maxlen ) -{ - // sanity checks - if( !data ) return 0; - if( maxlen == 0 ) return 0; - - unsigned long totalbytes = 0; - - if ( entry->size < io->header->threshold ) - { - // small file - unsigned long index = pos / io->sbat->blockSize; - - if( index >= blocks.size() ) return 0; - - unsigned char buf[ io->sbat->blockSize ]; - unsigned long offset = pos % io->sbat->blockSize; - while( totalbytes < maxlen ) - { - if( index >= blocks.size() ) break; - io->loadSmallBlock( blocks[index], buf, io->bbat->blockSize ); - unsigned long count = io->sbat->blockSize - offset; - if( count > maxlen-totalbytes ) count = maxlen-totalbytes; - memcpy( data+totalbytes, buf + offset, count ); - totalbytes += count; - offset = 0; - index++; - } - - } - else - { - // big file - unsigned long index = pos / io->bbat->blockSize; - - if( index >= blocks.size() ) return 0; - - unsigned char buf[ io->bbat->blockSize ]; - unsigned long offset = pos % io->bbat->blockSize; - while( totalbytes < maxlen ) - { - if( index >= blocks.size() ) break; - io->loadBigBlock( blocks[index], buf, io->bbat->blockSize ); - unsigned long count = io->bbat->blockSize - offset; - if( count > maxlen-totalbytes ) count = maxlen-totalbytes; - memcpy( data+totalbytes, buf + offset, count ); - totalbytes += count; - index++; - offset = 0; - } - - } - - return totalbytes; -} - -unsigned long StreamImpl::read( unsigned char* data, unsigned long maxlen ) -{ - unsigned long bytes = read( tell(), data, maxlen ); - m_pos += bytes; - return bytes; -} - -void StreamImpl::updateCache() -{ - // sanity check - if( !cache_data ) return; - - cache_pos = m_pos - ( m_pos % cache_size ); - unsigned long bytes = cache_size; - if( cache_pos + bytes > entry->size ) bytes = entry->size - cache_pos; - cache_size = read( cache_pos, cache_data, bytes ); -} - - -// =========== Storage ========== - -Storage::Storage( const char* filename ) -{ - io = new StorageIO( this, filename ); -} - -Storage::~Storage() -{ - delete io; -} - -int Storage::result() -{ - return io->result; -} - -bool Storage::open() -{ - return io->open(); -} - -void Storage::close() -{ - io->close(); -} - -// list all files and subdirs in current path -std::list<std::string> Storage::listDirectory() -{ - std::list<std::string> result; - - std::vector<DirEntry*> entries; - entries = io->dirtree->listDirectory(); - for( unsigned i = 0; i < entries.size(); i++ ) - result.push_back( entries[i]->name ); - - return result; -} - -// enters a sub-directory, returns false if not a directory or not found -bool Storage::enterDirectory( const std::string& directory ) -{ - return io->dirtree->enterDirectory( directory ); -} - -// goes up one level (like cd ..) -void Storage::leaveDirectory() -{ - return io->dirtree->leaveDirectory(); -} - -std::string Storage::path() -{ - return io->dirtree->path(); -} - -Stream* Storage::stream( const std::string& name ) -{ - // sanity check - if( !name.length() ) return (Stream*)0; - if( !io ) return (Stream*)0; - - // make absolute if necesary - std::string fullName = name; - if( name[0] != '/' ) fullName.insert( 0, path() + "/" ); - - DirEntry* entry = io->dirtree->entry( name ); - if( !entry ) return (Stream*)0; - - Stream* s = new Stream(); - s->impl = new StreamImpl( io, entry ); - io->streams.push_back( s ); - - return s; -} - - - -// =========== Stream ========== - -Stream::Stream() -{ - // just nullify, will be managed later Storage::stream - impl = 0; -} - -// FIXME tell parent we're gone -Stream::~Stream() -{ - delete impl; -} - -unsigned long Stream::tell() -{ - return impl ? impl->tell() : 0; -} - -void Stream::seek( unsigned long newpos ) -{ - if( impl ) impl->seek( newpos ); -} - -unsigned long Stream::size() -{ - return impl ? impl->entry->size : 0; -} - -int Stream::getch() -{ - return impl ? impl->getch() : 0; -} - -unsigned long Stream::read( unsigned char* data, unsigned long maxlen ) -{ - return impl ? impl->read( data, maxlen ) : 0; -} - diff --git a/src/plugins/wordleaker/pole.h b/src/plugins/wordleaker/pole.h @@ -1,149 +0,0 @@ -/* POLE - Portable C++ library to access OLE Storage - Copyright (C) 2002-2004 Ariya Hidayat <ariya@kde.org> - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public License - along with this library; see the file COPYING.LIB. If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, US -*/ - -#ifndef POLE_H -#define POLE_H - -#include <string> -#include <list> - -namespace POLE -{ - -class StorageIO; -class Stream; -class StreamImpl; - -class Storage -{ - friend class Stream; - friend class StreamOut; - -public: - - enum { Ok, OpenFailed, NotOLE, BadOLE, UnknownError, - StupidWorkaroundForBrokenCompiler=255 }; - - /** - * Constructs a storage with name filename. - **/ - Storage( const char* filename ); - - /** - * Destroys the storage. - **/ - ~Storage(); - - /** - * Opens the storage. Returns true if no error occurs. - **/ - bool open(); - - /** - * Closes the storage. - **/ - void close(); - - /** - * Returns the error code of last operation. - **/ - int result(); - - /** - * Returns the current path. - **/ - std::string path(); - - /** - * Finds all stream and directories in current path. - **/ - std::list<std::string> listDirectory(); - - /** - * Changes path to directory. Returns true if no error occurs. - **/ - bool enterDirectory( const std::string& directory ); - - /** - * Goes to one directory up. - **/ - void leaveDirectory(); - - /** - * Finds and returns a stream with the specified name. - **/ - Stream* stream( const std::string& name ); - -private: - StorageIO* io; - - // no copy or assign - Storage( const Storage& ); - Storage& operator=( const Storage& ); - -}; - -class Stream -{ - friend class Storage; - friend class StorageIO; - -public: - - /** - * Returns the stream size. - **/ - unsigned long size(); - - /** - * Returns the read pointer. - **/ - unsigned long tell(); - - /** - * Sets the read position. - **/ - void seek( unsigned long pos ); - - /** - * Reads a byte. - **/ - int getch(); - - /** - * Reads a block of data. - **/ - unsigned long read( unsigned char* data, unsigned long maxlen ); - -private: - - Stream(); - ~Stream(); - - // no copy or assign - Stream( const Stream& ); - Stream& operator=( const Stream& ); - - StreamImpl* impl; -}; - - -} - -#endif // POLE_H diff --git a/src/plugins/wordleaker/wordextractor.cc b/src/plugins/wordleaker/wordextractor.cc @@ -1,486 +0,0 @@ -/* - This file is part of libextractor. - (C) 2006 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - - This code depends heavily on the wordleaker code and - a lot of code was borrowed from wordleaker.cpp. See also - the README file in this directory. - */ - -#include <math.h> -#include <time.h> - -#include "wordleaker.h" -#include "pole.h" -#include "platform.h" -#include "extractor.h" -#include "../convert.h" - -#define __(a) dgettext("iso-639", a) - -extern "C" { - - static EXTRACTOR_KeywordType - SummaryProperties[] = { - EXTRACTOR_UNKNOWN, - EXTRACTOR_UNKNOWN, - EXTRACTOR_TITLE, - EXTRACTOR_SUBJECT, - EXTRACTOR_AUTHOR, - EXTRACTOR_KEYWORDS, - EXTRACTOR_COMMENT, - EXTRACTOR_TEMPLATE, - EXTRACTOR_LAST_SAVED_BY, - EXTRACTOR_VERSIONNUMBER, - EXTRACTOR_TOTAL_EDITING_TIME, - EXTRACTOR_LAST_PRINTED, - EXTRACTOR_CREATION_DATE, - EXTRACTOR_MODIFICATION_DATE, - EXTRACTOR_PAGE_COUNT, - EXTRACTOR_WORD_COUNT, - EXTRACTOR_CHARACTER_COUNT, - EXTRACTOR_THUMBNAILS, - EXTRACTOR_SOFTWARE, - EXTRACTOR_SECURITY, - }; - - static char * xstrndup(const char * s, size_t n){ - char * d; - - d = (char *) malloc(n+1); - memcpy(d,s,n); - d[n]='\0'; - return d; - } - - static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type, - const char * keyword, - struct EXTRACTOR_Keywords * next) { - EXTRACTOR_KeywordList * result; - - if (keyword == NULL) - return next; - result = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = strdup(keyword); - result->keywordType = type; - return result; - } - - static char * dateToString( unsigned long date ) { - char f[128]; - struct tm t; - memset(&t, 0, sizeof(struct tm)); - t.tm_year = 1900 + date % 100; - t.tm_mon = date / 100 % 100; - t.tm_mday = date / 10000 % 100; - if (0 == strftime(f, 128, - nl_langinfo(D_FMT), - &t)) - return NULL; - - return xstrndup(f, 128); - } - - static const char * idToProduct( unsigned int id ) { - // TODO: find the rest of ids (and check existing ones!) - switch ( id ) { - case 0x6954: - case 0x656d: - return "Word 97 (Windows NT)?"; - case 0x206d: - case 0x696c: - return "Word 6 (MS DOS)?"; - case 0x6A62: - return "Word 97"; - case 0x626A: - return "Word 98 (Mac)"; - default: - return NULL; - } - } - - static const char * lidToLanguage( unsigned int lid ) { - switch ( lid ) { - case 0x0400: - return _("No Proofing"); - case 0x0401: - return __("Arabic"); - case 0x0402: - return __("Bulgarian"); - case 0x0403: - return __("Catalan"); - case 0x0404: - return _("Traditional Chinese"); - case 0x0804: - return _("Simplified Chinese"); - case 0x0405: - return __("Chechen"); - case 0x0406: - return __("Danish"); - case 0x0407: - return __("German"); - case 0x0807: - return _("Swiss German"); - case 0x0408: - return __("Greek"); - case 0x0409: - return _("U.S. English"); - case 0x0809: - return _("U.K. English"); - case 0x0c09: - return _("Australian English"); - case 0x040a: - return _("Castilian Spanish"); - case 0x080a: - return _("Mexican Spanish"); - case 0x040b: - return __("Finnish"); - case 0x040c: - return __("French"); - case 0x080c: - return _("Belgian French"); - case 0x0c0c: - return _("Canadian French"); - case 0x100c: - return _("Swiss French"); - case 0x040d: - return __("Hebrew"); - case 0x040e: - return __("Hungarian"); - case 0x040f: - return __("Icelandic"); - case 0x0410: - return __("Italian"); - case 0x0810: - return _("Swiss Italian"); - case 0x0411: - return __("Japanese"); - case 0x0412: - return __("Korean"); - case 0x0413: - return __("Dutch"); - case 0x0813: - return _("Belgian Dutch"); - case 0x0414: - return _("Norwegian Bokmal"); - case 0x0814: - return __("Norwegian Nynorsk"); - case 0x0415: - return __("Polish"); - case 0x0416: - return __("Brazilian Portuguese"); - case 0x0816: - return __("Portuguese"); - case 0x0417: - return _("Rhaeto-Romanic"); - case 0x0418: - return __("Romanian"); - case 0x0419: - return __("Russian"); - case 0x041a: - return _("Croato-Serbian (Latin)"); - case 0x081a: - return _("Serbo-Croatian (Cyrillic)"); - case 0x041b: - return __("Slovak"); - case 0x041c: - return __("Albanian"); - case 0x041d: - return __("Swedish"); - case 0x041e: - return __("Thai"); - case 0x041f: - return __("Turkish"); - case 0x0420: - return __("Urdu"); - case 0x0421: - return __("Bahasa"); - case 0x0422: - return __("Ukrainian"); - case 0x0423: - return __("Byelorussian"); - case 0x0424: - return __("Slovenian"); - case 0x0425: - return __("Estonian"); - case 0x0426: - return __("Latvian"); - case 0x0427: - return __("Lithuanian"); - case 0x0429: - return _("Farsi"); - case 0x042D: - return __("Basque"); - case 0x042F: - return __("Macedonian"); - case 0x0436: - return __("Afrikaans"); - case 0x043E: - return __("Malayalam"); - default: - return NULL; - } - } - - - - // read the type of the property and displays its value - static char * getProperty( POLE::Stream* stream ) { - unsigned char buffer[256]; - unsigned char c; - unsigned long i; - unsigned int j; - unsigned long t, t1, t2; - char *s; - - unsigned long read = stream->read(buffer, 4); - if (read != 4) - return NULL; - unsigned int type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - - switch (type) { - case 2: // VT_I2 - read = stream->read(buffer, 2); - if (read != 2) - return NULL; - i = buffer[0] + (buffer[1] << 8); - s = (char*) malloc(16); - snprintf(s, 16, "%u", i); - return s; - case 3: // VT_I4 - read = stream->read(buffer, 4); - if (read != 4) - return NULL; - i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - s = (char*) malloc(16); - snprintf(s, 16, "%u", i); - return s; - case 11: // VT_BOOL - read = stream->read(buffer, 1); - if (read != 1) - return NULL; - if ((char) buffer[0] == -1) - return strdup("true"); - return strdup("false"); - case 30: // VT_LPSTR - read = stream->read(buffer, 4); - if (read != 4) - return NULL; - i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - if ( (i < 0) || (i > 16*1024*1024)) - return NULL; - s = (char*) malloc(i+1); - s[i] = '\0'; - j = 0; - while ( ((c = stream->getch()) != 0) && (i > j) ) - s[j++] = c; - if ( (j > 0) && (s[j-1] == '\n') ) - s[--j] = '\0'; - if (j != i) { - free(s); - return NULL; - } - return s; - case 64: // VT_FILETIME - read = stream->read(buffer, 8); - if (read != 8) - return NULL; - t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - t = filetime_to_unixtime(t1, t2); - char * ret = ctime_r((time_t *) &t, (char*)malloc(32)); - ret[strlen(ret)-1] = '\0'; /* kill newline */ - return ret; - } - return NULL; - } - - - struct EXTRACTOR_Keywords * libextractor_word_extract(const char * filename, - const char * data, - size_t size, - struct EXTRACTOR_Keywords * prev) { - char ver[16]; - char product[128]; - unsigned char buffer[256]; - - if ( (size < 512 + 898) || (filename == NULL) ) - return prev; - if (0 != memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) - /* look at file magic number to avoid false positives */ - return prev; - - - POLE::Storage* storage = new POLE::Storage(filename); - storage->open(); - if (storage->result() != POLE::Storage::Ok ) { - delete storage; - return prev; - } - - POLE::Stream * stream = storage->stream( "SummaryInformation" ); - if (! stream) { - delete storage; - return prev; - } - - // ClassID & Offset - stream->seek(28); - if (20 != stream->read(buffer, 20)) { - delete storage; - return prev; - } - - // beginning of section - unsigned long begin = stream->tell(); - // skip length of section - stream->read(buffer, 4); - // number of properties - if (4 == stream->read(buffer, 4)) { - unsigned int nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - // properties - for (unsigned int i = 0; i < nproperties; i++) { - if (8 != stream->read(buffer, 8)) - break; - unsigned int propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - unsigned int offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - if (propertyID > 1 && propertyID < 20) { - unsigned long offsetCur = stream->tell(); - stream->seek(offsetProp + begin); - if (propertyID == 10) { - /* FIXME: how is editing time encoded? */ - } if (propertyID == 19) { - /* FIXME: how to interpret the security integer? */ - } else { - char * prop = getProperty(stream); - if (prop != NULL) { - prev = addKeyword(SummaryProperties[propertyID], - prop, - prev); - free(prop); - } - } - stream->seek(offsetCur); - } - } - } - - - const unsigned char * data512 = (const unsigned char*) &data[512]; - unsigned int wIdent = data512[0] + (data512[1] << 8); - unsigned int nProduct = data512[4] + (data512[5] << 8); - unsigned int lid = data512[6] + (data512[7] << 8); - unsigned int envr = data512[18]; - unsigned int wMagicCreated = data512[34] + (data512[35] << 8); - unsigned int wMagicRevised = data512[36] + (data512[37] << 8); - unsigned long lProductCreated = data512[68] + (data512[69] << 8) + (data512[70] << 16) + (data512[71] << 24); - unsigned long lProductRevised = data512[72] + (data512[73] << 8) + (data512[74] << 16) + (data512[75] << 24); - unsigned long fcSttbSavedBy = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24); - unsigned long lcbSttbSavedBy = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24); - - if (nProduct != 0) { - snprintf(ver, 16, "%u", nProduct); - prev = addKeyword(EXTRACTOR_PRODUCTVERSION, - ver, - prev); - } - const char * lang = lidToLanguage(lid); - if (lang != NULL) { - prev = addKeyword(EXTRACTOR_LANGUAGE, - lang, - prev); - } - const char * prod = idToProduct(wMagicCreated); - if (prod != NULL) { - char * date = dateToString(lProductCreated); - snprintf(product, 128, _("%s (Build %s)"), - prod, - date); - free(date); - prev = addKeyword(EXTRACTOR_CREATED_BY_SOFTWARE, - product, - prev); - } - prod = idToProduct(wMagicRevised); - if (prod != NULL) { - char * date = dateToString(lProductRevised); - snprintf(product, 128, _("%s (Build %s)"), - prod, - date); - free(date); - prev = addKeyword(EXTRACTOR_MODIFIED_BY_SOFTWARE, - product, - prev); - } - - - unsigned int where = 0; - stream = storage->stream("1Table"); - if (! stream) - stream = storage->stream("0Table"); - if ( (stream) && (lcbSttbSavedBy >= 6)) { - unsigned char * lbuffer = (unsigned char*) malloc(lcbSttbSavedBy); - - // goto offset of revision - stream->seek(fcSttbSavedBy); - // read all the revision history - if (lcbSttbSavedBy == stream->read(lbuffer, lcbSttbSavedBy)) { - // there are n strings, so n/2 revisions (author & file) - unsigned int nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; - where = 6; - for (unsigned int i=0; i < nRev; i++) { - if (where >= lcbSttbSavedBy) - break; - unsigned int length = lbuffer[where++]; - if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || - (where + 2 * length + 2 <= where) ) - break; - char * author = convertToUtf8((const char*) &lbuffer[where], - length * 2, - "UTF-16BE"); - where += length * 2 + 1; - length = lbuffer[where++]; - if ( (where + 2 * length >= lcbSttbSavedBy) || - (where + 2 * length + 1 <= where) ) - break; - char * filename = convertToUtf8((const char*) &lbuffer[where], - length * 2, - "UTF-16BE"); - where += length * 2 + 1; - char * rbuf = (char*) malloc(strlen(author) + strlen(filename) + 512); - snprintf(rbuf, 512 + strlen(author) + strlen(filename), - _("Revision #%u: Author '%s' worked on '%s'"), - i, author, filename); - free(author); - free(filename); - prev = addKeyword(EXTRACTOR_REVISION_HISTORY, - rbuf, - prev); - free(rbuf); - } - } - free(lbuffer); - } - delete storage; - - return prev; - } - -} - diff --git a/src/plugins/wordleaker/wordleaker.cpp b/src/plugins/wordleaker/wordleaker.cpp @@ -1,308 +0,0 @@ -/* - WordLeaker - Shows information about Word DOC files - Copyright (C) 2005 Sacha Fuentes <madelman@iname.com> - - Based on poledump.c - Original idea from WordDumper (http://www.computerbytesman.com) - Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm - Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this library; see the file COPYING. If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, US -*/ - -#include <iostream> -#include <fstream> -#include <stdlib.h> -#include <list> -#include <ctime> - -#include "pole.h" -#include "WordLeaker.h" - -unsigned long fcSttbSavedBy; -unsigned long lcbSttbSavedBy; - - - -// read the type of the property and displays its value -void showProperty( POLE::Stream* stream ) { - unsigned long read, type; - unsigned char buffer[256]; - unsigned char c; - unsigned long i; - unsigned long t, t1, t2; - char *s; - - read = stream->read(buffer, 4); - type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - - switch (type) { - case 2: // VT_I2 - read = stream->read(buffer, 2); - i = buffer[0] + (buffer[1] << 8); - cout << i << endl; - break; - case 3: // VT_I4 - read = stream->read(buffer, 4); - i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - cout << i << endl; - break; - case 11: // VT_BOOL - read = stream->read(buffer, 1); - if ((char) buffer[0] == -1) - cout << "true" << endl; - else - cout << "false" << endl; - break; - case 30: // VT_LPSTR - read = stream->read(buffer, 4); - i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - while ((c = stream->getch()) != 0) - cout << c; - cout << endl; - break; - case 64: // VT_FILETIME - read = stream->read(buffer, 8); - t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - t = filetime_to_unixtime(t1, t2); - s = ctime((time_t *) &t); - cout << s; - break; - default: - cout << "Unknown format " << type << endl; - } -} - -// show the revision data (users and files) -void dumpRevision( POLE::Storage* storage ) { - unsigned int nRev; - unsigned int where = 0; - POLE::Stream* stream; - - cout << "Revision:" << endl; - cout << "---------" << endl << endl; - - // FIXME: should look if using 0Table or 1Table - stream = storage->stream( "1Table" ); - if( !stream ) { - cout << "There's no revision information" << endl; - return; - } - - unsigned char * buffer = new unsigned char[lcbSttbSavedBy]; - unsigned char buffer2[1024]; - unsigned int length; - - // goto offset of revision - stream->seek(fcSttbSavedBy); - // read all the revision history - stream->read(buffer, lcbSttbSavedBy); - - // there are n strings, so n/2 revisions (author & file) - nRev = (buffer[2] + (buffer[3] << 8)) / 2; - where = 6; - - for (unsigned int i=0; i < nRev; i++) { - cout << "Rev #" << i << ": Author \""; - length = buffer[where++]; - // it's unicode, for now we only get the low byte - for (unsigned int j=0; j < length; j++) { - where++; - cout << buffer[where]; - where++; - } - where++; - cout << "\" worked on file \""; - length = buffer[where++]; - // it's unicode, for now we only get the low byte - for (unsigned int j=0; j < length; j++) { - where++; - cout << buffer[where]; - where++; - } - where++; - cout << "\"" << endl; - } - - cout << endl; - delete buffer; - -} - -// show data from DocumentSummary stream -void dumpDocumentSummary( POLE::Storage* storage ) { - POLE::Stream* stream; - unsigned long read, nproperties, propertyID, offsetProp, offsetCur; - unsigned long begin; - - cout << "Document Summary:" << endl; - cout << "-----------------" << endl << endl; - - stream = storage->stream( "DocumentSummaryInformation" ); - if( !stream ) { - cout << "There's no document summary information" << endl; - return; - } - - unsigned char buffer[256]; - - // ClassID & Offset - stream->seek(28); - stream->read(buffer, 20); - // beginning of section - begin = stream->tell(); - // length of section - read = stream->read(buffer, 4); - // number of properties - read = stream->read(buffer, 4); - nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - // properties - - for (unsigned long i = 0; i < nproperties; i++) { - read = stream->read(buffer, 8); - propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - if (propertyID > 1 && propertyID < 16) { - cout << DocumentSummaryProperties[propertyID] << ": "; - offsetCur = stream->tell(); - stream->seek(offsetProp + begin); - // read and show the property - showProperty(stream); - stream->seek(offsetCur); - } - } - - cout << endl; -} - -// show data from Summary stream -void dumpSummary( POLE::Storage* storage ) { - POLE::Stream* stream; - unsigned long read, nproperties, propertyID, offsetProp, offsetCur; - unsigned long begin; - - cout << "Summary:" << endl; - cout << "--------" << endl << endl; - - stream = storage->stream( "SummaryInformation" ); - if( !stream ) { - cout << "There's no summary information" << endl; - return; - } - - unsigned char buffer[256]; - - // ClassID & Offset - stream->seek(28); - stream->read(buffer, 20); - // beginning of section - begin = stream->tell(); - // length of section - read = stream->read(buffer, 4); - // number of properties - read = stream->read(buffer, 4); - nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - // properties - for (unsigned long i = 0; i < nproperties; i++) { - read = stream->read(buffer, 8); - propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - if (propertyID > 1 && propertyID < 20) { - cout << SummaryProperties[propertyID] << ": "; - offsetCur = stream->tell(); - stream->seek(offsetProp + begin); - // read and show the property - showProperty(stream); - stream->seek(offsetCur); - } - } - - cout << endl; -} - -// reads the header of the file -bool readFIB( char* filename ) { - fstream file; - - file.open( filename, std::ios::binary | std::ios::in ); - if( !file.good() ) { - cout << "Can't find the file" << endl; - return false; - } - - unsigned char * buffer = new unsigned char[898]; - file.seekg( 512 ); - file.read( (char*)buffer, 898 ); - file.close(); - - unsigned int wIdent = buffer[0] + (buffer[1] << 8); - unsigned int nProduct = buffer[4] + (buffer[5] << 8); - unsigned int lid = buffer[6] + (buffer[7] << 8); - unsigned int envr = buffer[18]; - unsigned int wMagicCreated = buffer[34] + (buffer[35] << 8); - unsigned int wMagicRevised = buffer[36] + (buffer[37] << 8); - unsigned long lProductCreated = buffer[68] + (buffer[69] << 8) + (buffer[70] << 16) + (buffer[71] << 24); - unsigned long lProductRevised = buffer[72] + (buffer[73] << 8) + (buffer[74] << 16) + (buffer[75] << 24); - fcSttbSavedBy = buffer[722] + (buffer[723] << 8) + (buffer[724] << 16) + (buffer[725] << 24); - lcbSttbSavedBy = buffer[726] + (buffer[727] << 8) + (buffer[728] << 16) + (buffer[729] << 24); - delete[] buffer; - - cout << "File: " << filename << endl; - cout << "Product version: " << nProduct << endl; - cout << "Language: " << lidToLanguage(lid) << endl; - cout << "Created by: " << idToProduct(wMagicCreated) << " (Build " << dateToString(lProductCreated) << ")" << endl; - cout << "Revised by: " << idToProduct(wMagicRevised) << " (Build " << dateToString(lProductRevised) << ")" << endl; - cout << endl; - - return true; - -} - -int main(int argc, char *argv[]) { - cout << endl << "WordLeaker v.0.1" << endl; - cout << " by Madelman (http://elligre.tk/madelman/)" << endl << endl; - - - if( argc < 2 ) { - cout << " You must supply a filename" << endl << endl; - return 0; - } - - char* filename = argv[1]; - - if ( !readFIB(filename) ) - return 1; - - POLE::Storage* storage = new POLE::Storage( filename ); - storage->open(); - if( storage->result() != POLE::Storage::Ok ) { - cout << "The file " << filename << " is not a Word document" << endl; - return 1; - } - - dumpSummary( storage ); - // FIXME: doesn't always work - // but there's nothing really interesting in here - //dumpDocumentSummary( storage ); - dumpRevision( storage ); - // TODO: we don't show the GUID - // TODO: we don't show the macros - - delete storage; - - return 0; -} diff --git a/src/plugins/wordleaker/wordleaker.h b/src/plugins/wordleaker/wordleaker.h @@ -1,124 +0,0 @@ -/* - WordLeaker - Shows information about Word DOC files - Copyright (C) 2005 Sacha Fuentes <madelman@iname.com> - - Based on poledump.c - Original idea from WordDumper (http://www.computerbytesman.com) - Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm - Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this library; see the file COPYING. If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, US -*/ - -#include <string> - -using namespace std; - -static char* -DocumentSummaryProperties[] = { -"Dictionary", -"Code page", -"Category", -"PresentationTarget", -"Bytes", -"Lines", -"Paragraphs", -"Slides", -"Notes", -"HiddenSlides", -"MMClips", -"ScaleCrop", -"HeadingPairs", -"TitlesofParts", -"Manager", -"Company", -"LinksUpTo" -}; - -/* - * filetime_to_unixtime - * - * Adapted from work in 'wv' by: - * Caolan McNamara (Caolan.McNamara@ul.ie) - */ -#define HIGH32_DELTA 27111902 -#define MID16_DELTA 54590 -#define LOW16_DELTA 32768 - -unsigned long filetime_to_unixtime (unsigned long low_time, unsigned long high_time) { - unsigned long low16;/* 16 bit, low bits */ - unsigned long mid16;/* 16 bit, medium bits */ - unsigned long hi32;/* 32 bit, high bits */ - unsigned int carry;/* carry bit for subtraction */ - int negative;/* whether a represents a negative value */ - -/* Copy the time values to hi32/mid16/low16 */ -hi32 = high_time; -mid16 = low_time >> 16; -low16 = low_time & 0xffff; - -/* Subtract the time difference */ -if (low16 >= LOW16_DELTA ) -low16 -= LOW16_DELTA , carry = 0; -else -low16 += (1 << 16) - LOW16_DELTA , carry = 1; - -if (mid16 >= MID16_DELTA + carry) -mid16 -= MID16_DELTA + carry, carry = 0; -else -mid16 += (1 << 16) - MID16_DELTA - carry, carry = 1; - -hi32 -= HIGH32_DELTA + carry; - -/* If a is negative, replace a by (-1-a) */ -negative = (hi32 >= ((unsigned long)1) << 31); -if (negative) { -/* Set a to -a - 1 (a is hi32/mid16/low16) */ -low16 = 0xffff - low16; -mid16 = 0xffff - mid16; -hi32 = ~hi32; -} - -/* - * Divide a by 10000000 (a = hi32/mid16/low16), put the rest into r. - * Split the divisor into 10000 * 1000 which are both less than 0xffff. - */ -mid16 += (hi32 % 10000) << 16; -hi32 /= 10000; -low16 += (mid16 % 10000) << 16; -mid16 /= 10000; -low16 /= 10000; - -mid16 += (hi32 % 1000) << 16; -hi32 /= 1000; -low16 += (mid16 % 1000) << 16; -mid16 /= 1000; -low16 /= 1000; - -/* If a was negative, replace a by (-1-a) and r by (9999999 - r) */ -if (negative) { -/* Set a to -a - 1 (a is hi32/mid16/low16) */ -low16 = 0xffff - low16; -mid16 = 0xffff - mid16; -hi32 = ~hi32; -} - -/* Do not replace this by << 32, it gives a compiler warning and - * it does not work - */ -return ((((unsigned long)hi32) << 16) << 16) + (mid16 << 16) + low16; - -}