commit 7406fa16dc06ff094b9d2535999d6e695b3c80b3
parent d72714d2581e1d0c54b3bb15b46aa9bd4784768d
Author: Christian Grothoff <christian@grothoff.org>
Date: Sat, 29 Apr 2006 04:49:06 +0000
integrating wordleaker into ole2 plugin, switching to libgsf
Diffstat:
17 files changed, 494 insertions(+), 4442 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,7 @@
+Fri Apr 28 22:26:43 PDT 2006
+ Integrated wordleaker into OLE2 plugin. Changed OLE2 plugin to use
+ libgsf (new dependency!).
+
Fri Apr 28 16:18:26 PDT 2006
Fixing some i18n issues. Specifically, EXTRACTOR_getKeywordTypeAsString
will now never return the translated version of the keyword type
diff --git a/configure.ac b/configure.ac
@@ -313,6 +313,7 @@ AC_ARG_ENABLE(exiv2,
AM_CONDITIONAL(HAVE_EXIV2, test x$exiv2 != x0)
AC_DEFINE_UNQUOTED([HAVE_EXIV2], $exiv2, [We use EXIV2])
+ABI_GSF
AC_SUBST(CPPFLAGS)
AC_SUBST(LDFLAGS)
@@ -358,9 +359,9 @@ else
AC_MSG_NOTICE([NOTICE: printable plugins enabled])
fi
-if test "x$without_glib" = "xtrue"
+if test "x$have_gsf" != "xtrue"
then
- AC_MSG_NOTICE([NOTICE: glib not used, no OLE2 (MS Office) support])
+ AC_MSG_NOTICE([NOTICE: libgsf not found, no OLE2 (MS Office) support])
fi
if test "x$without_gtk" = "xtrue"
diff --git a/m4/abi-gsf.m4 b/m4/abi-gsf.m4
@@ -0,0 +1,78 @@
+# start: abi/ac-helpers/abi-gsf.m4
+#
+# Copyright (C) 2005 Christian Neumair
+#
+# This file is free software; you may copy and/or distribute it with
+# or without modifications, as long as this notice is preserved.
+# This software is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY, to the extent permitted by law; without even
+# the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+# PURPOSE.
+#
+# The above license applies to THIS FILE ONLY; the libextractor code
+# itself may be copied and distributed under the terms of the GNU
+# GPL, see COPYING for more details.
+#
+# Usage: ABI_GSF
+
+# Check for gsf
+
+# ABI_GSF
+# -------
+# Probe for libgsf and, optionally, libgsf-gnome via pkg-config.
+#
+# Configure options:
+#   --disable-gsf        skip the libgsf probe
+#   --disable-gsf-gnome  skip the libgsf-gnome probe
+#
+# Results:
+#   shell variables  have_gsf, have_gsf_gnome (each "true" or "false")
+#                    abi_gsf_message (human-readable summary)
+#   substitutions    GSF_CFLAGS, GSF_LIBS, GSF_GNOME_CFLAGS, GSF_GNOME_LIBS
+#   conditionals     WITH_GSF, WITH_GSF_GNOME
+AC_DEFUN([ABI_GSF], [
+
+# Both probes run by default; the enable options below can switch them off.
+test_gsf=true
+have_gsf=false
+
+test_gsf_gnome=true
+have_gsf_gnome=false
+
+AC_ARG_ENABLE(gsf,[ --disable-gsf Turn off gsf], [
+ if test "x$enableval" = "xno"; then
+ test_gsf=false
+ fi
+])
+
+# The help text must name the option exactly as it is registered here,
+# otherwise users are told to pass a flag that has no effect.
+AC_ARG_ENABLE(gsf-gnome,[ --disable-gsf-gnome Turn off gsf-gnome], [
+ if test "x$enableval" = "xno"; then
+ test_gsf_gnome=false
+ fi
+])
+
+if test "x$test_gsf" = "xtrue" ; then
+ PKG_CHECK_MODULES(GSF,[libgsf-1 >= 1.10], [
+ have_gsf=true
+ GSF_CFLAGS="$GSF_CFLAGS -DHAVE_GSF"
+ ],
+ [
+ have_gsf=false
+ ])
+fi
+
+# The GNOME layer is only probed when the base library was found.
+# Two chained test commands are used because the -a operator of test
+# is not portable across shells.
+if test "x$have_gsf" = "xtrue" && test "x$test_gsf_gnome" = "xtrue" ; then
+ PKG_CHECK_MODULES(GSF_GNOME, [libgsf-gnome-1 >= 1.10], [
+ have_gsf_gnome=true
+ GSF_GNOME_CFLAGS="$GSF_GNOME_CFLAGS -DHAVE_GSF_GNOME"
+ ],
+ [
+ have_gsf_gnome=false
+ ])
+fi
+
+AC_SUBST(GSF_CFLAGS)
+AC_SUBST(GSF_LIBS)
+
+AC_SUBST(GSF_GNOME_CFLAGS)
+AC_SUBST(GSF_GNOME_LIBS)
+
+AM_CONDITIONAL(WITH_GSF, test "x$have_gsf" = "xtrue")
+AM_CONDITIONAL(WITH_GSF_GNOME, test "x$have_gsf_gnome" = "xtrue")
+
+# Summary string for the caller to print at the end of configure.
+if test "x$have_gsf_gnome" = "xtrue" ; then
+ abi_gsf_message="yes, with GNOME support"
+elif test "x$have_gsf" = "xtrue" ; then
+ abi_gsf_message="yes, without GNOME support"
+else
+ abi_gsf_message="no"
+fi
+
+])
diff --git a/src/include/extractor.h b/src/include/extractor.h
@@ -152,6 +152,14 @@ typedef enum {
EXTRACTOR_MODIFIED_BY_SOFTWARE = 99,
EXTRACTOR_REVISION_HISTORY = 100,
EXTRACTOR_LOWERCASE = 101,
+ EXTRACTOR_COMPANY = 102,
+ EXTRACTOR_GENERATOR = 103,
+ EXTRACTOR_CHARACTER_SET = 104,
+ EXTRACTOR_LINE_COUNT = 105,
+ EXTRACTOR_PARAGRAPH_COUNT = 106,
+ EXTRACTOR_EDITING_CYCLES = 107,
+ EXTRACTOR_SCALE = 108,
+ EXTRACTOR_MANAGER = 109,
} EXTRACTOR_KeywordType;
/**
diff --git a/src/main/extractor.c b/src/main/extractor.c
@@ -41,113 +41,121 @@
* The sources of keywords as strings.
*/
static const char *keywordTypes[] = {
- gettext_noop("unknown"),
+ gettext_noop("unknown"), /* 0 */
gettext_noop("filename"),
gettext_noop("mimetype"),
gettext_noop("title"),
gettext_noop("author"),
- gettext_noop("artist"),
+ gettext_noop("artist"), /* 5 */
gettext_noop("description"),
gettext_noop("comment"),
gettext_noop("date"),
gettext_noop("publisher"),
- gettext_noop("language"),
+ gettext_noop("language"), /* 10 */
gettext_noop("album"),
gettext_noop("genre"),
gettext_noop("location"),
gettext_noop("version"),
- gettext_noop("organization"),
+ gettext_noop("organization"), /* 15 */
gettext_noop("copyright"),
gettext_noop("subject"),
gettext_noop("keywords"),
gettext_noop("contributor"),
- gettext_noop("resource-type"),
+ gettext_noop("resource-type"), /* 20 */
gettext_noop("format"),
gettext_noop("resource-identifier"),
gettext_noop("source"),
gettext_noop("relation"),
- gettext_noop("coverage"),
+ gettext_noop("coverage"), /* 25 */
gettext_noop("software"),
gettext_noop("disclaimer"),
gettext_noop("warning"),
gettext_noop("translated"),
- gettext_noop("creation date"),
+ gettext_noop("creation date"), /* 30 */
gettext_noop("modification date"),
gettext_noop("creator"),
gettext_noop("producer"),
gettext_noop("page count"),
- gettext_noop("page orientation"),
+ gettext_noop("page orientation"), /* 35 */
gettext_noop("paper size"),
gettext_noop("used fonts"),
gettext_noop("page order"),
gettext_noop("created for"),
- gettext_noop("magnification"),
+ gettext_noop("magnification"), /* 40 */
gettext_noop("release"),
gettext_noop("group"),
gettext_noop("size"),
gettext_noop("summary"),
- gettext_noop("packager"),
+ gettext_noop("packager"), /* 45 */
gettext_noop("vendor"),
gettext_noop("license"),
gettext_noop("distribution"),
gettext_noop("build-host"),
- gettext_noop("os"),
+ gettext_noop("operating system"), /* 50 */
gettext_noop("dependency"),
gettext_noop("MD4"),
gettext_noop("MD5"),
gettext_noop("SHA-0"),
- gettext_noop("SHA-1"),
+ gettext_noop("SHA-1"), /* 55 */
gettext_noop("RipeMD160"),
gettext_noop("resolution"),
gettext_noop("category"),
gettext_noop("book title"),
- gettext_noop("priority"),
+ gettext_noop("priority"), /* 60 */
gettext_noop("conflicts"),
gettext_noop("replaces"),
gettext_noop("provides"),
gettext_noop("conductor"),
- gettext_noop("interpreter"),
+ gettext_noop("interpreter"), /* 65 */
gettext_noop("owner"),
gettext_noop("lyrics"),
gettext_noop("media type"),
gettext_noop("contact"),
- gettext_noop("binary thumbnail data"),
+ gettext_noop("binary thumbnail data"), /* 70 */
gettext_noop("publication date"),
gettext_noop("camera make"),
gettext_noop("camera model"),
gettext_noop("exposure"),
- gettext_noop("aperture"),
+ gettext_noop("aperture"), /* 75 */
gettext_noop("exposure bias"),
gettext_noop("flash"),
gettext_noop("flash bias"),
gettext_noop("focal length"),
- gettext_noop("focal length (35mm equivalent)"),
+ gettext_noop("focal length (35mm equivalent)"), /* 80 */
gettext_noop("iso speed"),
gettext_noop("exposure mode"),
gettext_noop("metering mode"),
gettext_noop("macro mode"),
- gettext_noop("image quality"),
+ gettext_noop("image quality"), /* 85 */
gettext_noop("white balance"),
gettext_noop("orientation"),
gettext_noop("template"),
gettext_noop("split"),
- gettext_noop("product version"),
+ gettext_noop("product version"), /* 90 */
gettext_noop("last saved by"),
gettext_noop("last printed"),
gettext_noop("word count"),
gettext_noop("character count"),
- gettext_noop("total editing time"),
+ gettext_noop("total editing time"), /* 95 */
gettext_noop("thumbnails"),
gettext_noop("security"),
gettext_noop("created by software"),
gettext_noop("modified by software"),
- gettext_noop("revision history"),
+ gettext_noop("revision history"), /* 100 */
gettext_noop("lower case conversion"),
+ gettext_noop("company"),
+ gettext_noop("generator"),
+ gettext_noop("character set"),
+ gettext_noop("line count"), /* 105 */
+ gettext_noop("paragraph count"),
+ gettext_noop("editing cycles"),
+ gettext_noop("scale"),
+ gettext_noop("manager"),
NULL,
};
/* the number of keyword types (for bounds-checking) */
-#define HIGHEST_TYPE_NUMBER 102
+#define HIGHEST_TYPE_NUMBER 110
#ifdef HAVE_LIBOGG
#if HAVE_VORBIS
@@ -211,7 +219,6 @@ libextractor_riff:\
libextractor_mpeg:\
libextractor_elf:\
libextractor_oo:\
-libextractor_word:\
libextractor_asf"
#define DEFAULT_LIBRARIES EXSO OLESO OGGSO QTSO DEFSO
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
@@ -1,7 +1,9 @@
include Makefile-plugins.am
if HAVE_GLIB
-oledir=ole2
+if WITH_GSF
+ oledir=ole2
+endif
if HAVE_GTK
thumbdir=thumbnail
endif
@@ -15,6 +17,7 @@ if HAVE_EXIV2
exiv2dir=exiv2
endif
+
if HAVE_XPDF
xpdfdir=pdf
else
@@ -23,7 +26,7 @@ endif
# toggle for development
# SUBDIRS = .
-SUBDIRS = . $(oodir) $(printdir) hash $(oledir) rpm $(xpdfdir) $(thumbdir) $(exiv2dir) wordleaker
+SUBDIRS = . $(oodir) $(printdir) hash $(oledir) rpm $(xpdfdir) $(thumbdir) $(exiv2dir)
if HAVE_VORBISFILE
diff --git a/src/plugins/hash/rmd160extractor.c b/src/plugins/hash/rmd160extractor.c
@@ -619,10 +619,11 @@ static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordList *oldhead,
#define rmd160_new() rmd160_copy(NULL,NULL)
-struct EXTRACTOR_Keywords * libextractor_hash_rmd160_extract(const char * filename,
- char * data,
- size_t size,
- struct EXTRACTOR_Keywords * prev) {
+struct EXTRACTOR_Keywords *
+libextractor_hash_rmd160_extract(const char * filename,
+ const unsigned char * data,
+ size_t size,
+ struct EXTRACTOR_Keywords * prev) {
unsigned char bin_buffer[MAX_DIGEST_BIN_BYTES];
char hash[8 * MAX_DIGEST_BIN_BYTES];
char buf[16];
diff --git a/src/plugins/ole2/Makefile.am b/src/plugins/ole2/Makefile.am
@@ -4,12 +4,11 @@ include ../Makefile-plugins.am
plugin_LTLIBRARIES = \
libextractor_ole2.la
-AM_CFLAGS = $(GLIB_CFLAGS)
-
libextractor_ole2_la_CFLAGS = \
- $(GLIB_CFLAGS)
+ $(GSF_CFLAGS)
libextractor_ole2_la_LIBADD = \
- $(LIBADD) $(GLIB_LIBS) -lgobject-2.0 \
+ $(LIBADD) $(GSF_LIBS) \
+ $(top_builddir)/src/plugins/libconvert.la \
$(top_builddir)/src/main/libextractor.la
libextractor_ole2_la_LDFLAGS = \
$(PLUGINFLAGS) $(retaincommand)
diff --git a/src/plugins/wordleaker/README b/src/plugins/ole2/README
diff --git a/src/plugins/ole2/ole2extractor.c b/src/plugins/ole2/ole2extractor.c
@@ -1,6 +1,6 @@
/*
This file is part of libextractor.
- (C) 2004,2005 Vidyut Samanta and Christian Grothoff
+ (C) 2004, 2005, 2006 Vidyut Samanta and Christian Grothoff
libextractor is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published
@@ -17,1217 +17,30 @@
Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
- Most of the code in this directory comes from
- libgsf 1.10.1 (Licensed under GPL/LGPL).
-
- libgsf -- The G Structured File Library
+ This code makes extensive use of libgsf
+ -- the Gnome Structured File Library
Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
+ Part of this code was borrowed from wordleaker.cpp. See also
+ the README file in this directory.
*/
#include "platform.h"
#include "extractor.h"
+#include "../convert.h"
+
#include <glib-object.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
-#define DEBUG_OLE2 0
-
-#if DEBUG_OLE2
-#define d(code) do { code } while (0)
-#define warning printf
-#else
-#define d(code)
- static void warning(const char * format, ...) {}
-#endif
-
-#undef g_return_val_if_fail
-#define g_return_val_if_fail(a,b) if (! (a)) return (b);
-
-/* *********************** formerly gsf-input.c ************* */
-
-typedef struct GsfInput {
- off_t size;
- off_t cur_offset;
- char * name;
- const unsigned char * buf;
- int needs_free;
-} GsfInput;
-
-
-static void
-gsf_input_init (GsfInput * input)
-{
- input->size = 0;
- input->cur_offset = 0;
- input->name = NULL;
- input->buf = NULL;
-}
-
-/**
- * gsf_input_memory_new:
- * @buf: The input bytes
- * @length: The length of @buf
- * @needs_free: Whether you want this memory to be free'd at object destruction
- *
- * Returns: A new #GsfInputMemory
- */
-static GsfInput *
-gsf_input_new (const unsigned char * buf,
- off_t length,
- int needs_free)
-{
- GsfInput *mem = malloc(sizeof(GsfInput));
- if (mem == NULL)
- return NULL;
- gsf_input_init(mem);
- mem->buf = buf;
- mem->size = length;
- mem->needs_free = needs_free;
- return mem;
-}
-
-static void
-gsf_input_finalize (GsfInput * input)
-{
- if (input->name != NULL) {
- free (input->name);
- input->name = NULL;
- }
- if ( (input->buf) && input->needs_free)
- free((void*) input->buf);
- free(input);
-}
-
-/**
- * gsf_input_set_name :
- * @input :
- * @name :
- *
- * protected.
- *
- * Returns : TRUE if the assignment was ok.
- **/
-static int
-gsf_input_set_name (GsfInput *input, char const *name)
-{
- char *buf;
-
- g_return_val_if_fail (input != NULL, 0);
-
- buf = strdup (name);
- if (input->name != NULL)
- free (input->name);
- input->name = buf;
- return 1;
-}
-
-
-
-static GsfInput *
-gsf_input_dup (GsfInput *src)
-{
- GsfInput * dst = malloc(sizeof(GsfInput));
- if (dst == NULL)
- return NULL;
- gsf_input_init(dst);
- dst->buf = src->buf;
- dst->needs_free = 0;
- dst->size = src->size;
- if (src->name != NULL)
- gsf_input_set_name (dst, src->name);
- dst->cur_offset = src->cur_offset;
- return dst;
-}
-
-static const unsigned char *
-gsf_input_read (GsfInput * mem, size_t num_bytes, unsigned char * optional_buffer)
-{
- const unsigned char *src = mem->buf;
- if (src == NULL)
- return NULL;
- if (optional_buffer) {
- memcpy (optional_buffer, src + mem->cur_offset, num_bytes);
- mem->cur_offset += num_bytes;
-
- return optional_buffer;
- } else {
- const unsigned char * ret = src + mem->cur_offset;
- mem->cur_offset += num_bytes;
- return ret;
- }
-}
-
-/**
- * gsf_input_size :
- * @input : The input
- *
- * Looks up and caches the number of bytes in the input
- *
- * Returns : the size or -1 on error
- **/
-static off_t
-gsf_input_size (GsfInput *input)
-{
- g_return_val_if_fail (input != NULL, -1);
- return input->size;
-}
-
-/**
- * gsf_input_seek :
- * @input :
- * @offset :
- * @whence :
- *
- * Returns TRUE on error.
- **/
-static int
-gsf_input_seek (GsfInput *input, off_t offset, int whence)
-{
- off_t pos = offset;
-
- g_return_val_if_fail (input != NULL, 1);
-
- switch (whence) {
- case SEEK_SET : break;
- case SEEK_CUR : pos += input->cur_offset; break;
- case SEEK_END : pos += input->size; break;
- default : return 1;
- }
-
- if (pos < 0 || pos > input->size)
- return 1;
-
- /*
- * If we go nowhere, just return. This in particular handles null
- * seeks for streams with no seek method.
- */
- if (pos == input->cur_offset)
- return 0;
-
- input->cur_offset = pos;
- return 0;
-}
-
-
-
-
-/* ******************** formerly gsf-utils.c **************** */
-
-
-/* Do this the ugly way so that we don't have to worry about alignment */
-#define GSF_LE_GET_GUINT8(p) (*(guint8 const *)(p))
-#define GSF_LE_GET_GUINT16(p) \
- (guint16)((((guint8 const *)(p))[0] << 0) | \
- (((guint8 const *)(p))[1] << 8))
-#define GSF_LE_GET_GUINT32(p) \
- (guint32)((((guint8 const *)(p))[0] << 0) | \
- (((guint8 const *)(p))[1] << 8) | \
- (((guint8 const *)(p))[2] << 16) | \
- (((guint8 const *)(p))[3] << 24))
-
-#define GSF_LE_GET_GUINT64(p) (gsf_le_get_guint64 (p))
-#define GSF_LE_GET_GINT64(p) ((gint64)GSF_LE_GET_GUINT64(p))
-#define GSF_LE_GET_GINT8(p) ((gint8)GSF_LE_GET_GUINT8(p))
-#define GSF_LE_GET_GINT16(p) ((gint16)GSF_LE_GET_GUINT16(p))
-#define GSF_LE_GET_GINT32(p) ((gint32)GSF_LE_GET_GUINT32(p))
-#define GSF_LE_GET_FLOAT(p) (gsf_le_get_float (p))
-#define GSF_LE_GET_DOUBLE(p) (gsf_le_get_double (p))
-#define GSF_LE_SET_GUINT8(p, dat) \
- (*((guint8 *)(p)) = ((dat) & 0xff))
-#define GSF_LE_SET_GUINT16(p, dat) \
- ((*((guint8 *)(p) + 0) = ((dat) & 0xff)),\
- (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff))
-#define GSF_LE_SET_GUINT32(p, dat) \
- ((*((guint8 *)(p) + 0) = ((dat)) & 0xff), \
- (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff), \
- (*((guint8 *)(p) + 2) = ((dat) >> 16) & 0xff), \
- (*((guint8 *)(p) + 3) = ((dat) >> 24) & 0xff))
-#define GSF_LE_SET_GINT8(p,dat) GSF_LE_SET_GUINT8((p),(dat))
-#define GSF_LE_SET_GINT16(p,dat) GSF_LE_SET_GUINT16((p),(dat))
-#define GSF_LE_SET_GINT32(p,dat) GSF_LE_SET_GUINT32((p),(dat))
-
-
-/*
- * Glib gets this wrong, really. ARM's floating point format is a weird
- * mixture.
- */
-#define G_ARMFLOAT_ENDIAN 56781234
-#if defined(__arm__) && !defined(__vfp__) && (G_BYTE_ORDER == G_LITTLE_ENDIAN)
-#define G_FLOAT_BYTE_ORDER G_ARMFLOAT_ENDIAN
-#else
-#define G_FLOAT_BYTE_ORDER G_BYTE_ORDER
-#endif
-
-static guint64
-gsf_le_get_guint64 (void const *p)
-{
-#if G_BYTE_ORDER == G_BIG_ENDIAN
- if (sizeof (guint64) == 8) {
- guint64 li;
- int i;
- guint8 *t = (guint8 *)&li;
- guint8 *p2 = (guint8 *)p;
- int sd = sizeof (li);
-
- for (i = 0; i < sd; i++)
- t[i] = p2[sd - 1 - i];
-
- return li;
- } else {
- g_error ("Big endian machine, but weird size of guint64");
- }
-#elif G_BYTE_ORDER == G_LITTLE_ENDIAN
- if (sizeof (guint64) == 8) {
- /*
- * On i86, we could access directly, but Alphas require
- * aligned access.
- */
- guint64 data;
- memcpy (&data, p, sizeof (data));
- return data;
- } else {
- g_error ("Little endian machine, but weird size of guint64");
- }
-#else
-#error "Byte order not recognised -- out of luck"
-#endif
-}
-
-static float
-gsf_le_get_float (void const *p)
-{
-#if G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN
- if (sizeof (float) == 4) {
- float f;
- int i;
- guint8 *t = (guint8 *)&f;
- guint8 *p2 = (guint8 *)p;
- int sd = sizeof (f);
-
- for (i = 0; i < sd; i++)
- t[i] = p2[sd - 1 - i];
-
- return f;
- } else {
- g_error ("Big endian machine, but weird size of floats");
- }
-#elif (G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN) || (G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN)
- if (sizeof (float) == 4) {
- /*
- * On i86, we could access directly, but Alphas require
- * aligned access.
- */
- float data;
- memcpy (&data, p, sizeof (data));
- return data;
- } else {
- g_error ("Little endian machine, but weird size of floats");
- }
-#else
-#error "Floating-point byte order not recognised -- out of luck"
-#endif
-}
-
-static double
-gsf_le_get_double (void const *p)
-{
-#if G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN
- double data;
- memcpy ((char *)&data + 4, p, 4);
- memcpy ((char *)&data, (const char *)p + 4, 4);
- return data;
-#elif G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN
- if (sizeof (double) == 8) {
- double d;
- int i;
- guint8 *t = (guint8 *)&d;
- guint8 *p2 = (guint8 *)p;
- int sd = sizeof (d);
-
- for (i = 0; i < sd; i++)
- t[i] = p2[sd - 1 - i];
-
- return d;
- } else {
- g_error ("Big endian machine, but weird size of doubles");
- }
-#elif G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN
- if (sizeof (double) == 8) {
- /*
- * On i86, we could access directly, but Alphas require
- * aligned access.
- */
- double data;
- memcpy (&data, p, sizeof (data));
- return data;
- } else {
- g_error ("Little endian machine, but weird size of doubles");
- }
-#else
-#error "Floating-point byte order not recognised -- out of luck"
-#endif
-}
-
-/**
- * gsf_iconv_close : A utility wrapper to safely close an iconv handle
- * @handle :
- **/
-static void
-gsf_iconv_close (GIConv handle)
-{
- if (handle != NULL && handle != ((GIConv)-1))
- g_iconv_close (handle);
-}
-
-
-/* ***************************** formerly gsf-infile-msole.c ********************* */
-
-#define OLE_HEADER_SIZE 0x200 /* independent of big block size size */
-#define OLE_HEADER_SIGNATURE 0x00
-#define OLE_HEADER_CLSID 0x08 /* See ReadClassStg */
-#define OLE_HEADER_MINOR_VER 0x18 /* 0x33 and 0x3e have been seen */
-#define OLE_HEADER_MAJOR_VER 0x1a /* 0x3 been seen in wild */
-#define OLE_HEADER_BYTE_ORDER 0x1c /* 0xfe 0xff == Intel Little Endian */
-#define OLE_HEADER_BB_SHIFT 0x1e
-#define OLE_HEADER_SB_SHIFT 0x20
-/* 0x22..0x27 reserved == 0 */
-#define OLE_HEADER_CSECTDIR 0x28
-#define OLE_HEADER_NUM_BAT 0x2c
-#define OLE_HEADER_DIRENT_START 0x30
-/* 0x34..0x37 transacting signature must be 0 */
-#define OLE_HEADER_THRESHOLD 0x38
-#define OLE_HEADER_SBAT_START 0x3c
-#define OLE_HEADER_NUM_SBAT 0x40
-#define OLE_HEADER_METABAT_BLOCK 0x44
-#define OLE_HEADER_NUM_METABAT 0x48
-#define OLE_HEADER_START_BAT 0x4c
-#define BAT_INDEX_SIZE 4
-#define OLE_HEADER_METABAT_SIZE ((OLE_HEADER_SIZE - OLE_HEADER_START_BAT) / BAT_INDEX_SIZE)
-
-#define DIRENT_MAX_NAME_SIZE 0x40
-#define DIRENT_DETAILS_SIZE 0x40
-#define DIRENT_SIZE (DIRENT_MAX_NAME_SIZE + DIRENT_DETAILS_SIZE)
-#define DIRENT_NAME_LEN 0x40 /* length in bytes incl 0 terminator */
-#define DIRENT_TYPE 0x42
-#define DIRENT_COLOUR 0x43
-#define DIRENT_PREV 0x44
-#define DIRENT_NEXT 0x48
-#define DIRENT_CHILD 0x4c
-#define DIRENT_CLSID 0x50 /* only for dirs */
-#define DIRENT_USERFLAGS 0x60 /* only for dirs */
-#define DIRENT_CREATE_TIME 0x64 /* for files */
-#define DIRENT_MODIFY_TIME 0x6c /* for files */
-#define DIRENT_FIRSTBLOCK 0x74
-#define DIRENT_FILE_SIZE 0x78
-/* 0x7c..0x7f reserved == 0 */
-
-#define DIRENT_TYPE_INVALID 0
-#define DIRENT_TYPE_DIR 1
-#define DIRENT_TYPE_FILE 2
-#define DIRENT_TYPE_LOCKBYTES 3 /* ? */
-#define DIRENT_TYPE_PROPERTY 4 /* ? */
-#define DIRENT_TYPE_ROOTDIR 5
-#define DIRENT_MAGIC_END 0xffffffff
-
-/* flags in the block allocation list to denote special blocks */
-#define BAT_MAGIC_UNUSED 0xffffffff /* -1 */
-#define BAT_MAGIC_END_OF_CHAIN 0xfffffffe /* -2 */
-#define BAT_MAGIC_BAT 0xfffffffd /* a bat block, -3 */
-#define BAT_MAGIC_METABAT 0xfffffffc /* a metabat block -4 */
-
-
-
-
-typedef struct {
- guint32 *block;
- guint32 num_blocks;
-} MSOleBAT;
-
-typedef struct {
- char *name;
- char *collation_name;
- int index;
- size_t size;
- gboolean use_sb;
- guint32 first_block;
- gboolean is_directory;
- GList *children;
- unsigned char clsid[16]; /* 16 byte GUID used by some apps */
-} MSOleDirent;
-
-typedef struct {
- struct {
- MSOleBAT bat;
- unsigned shift;
- unsigned filter;
- size_t size;
- } bb, sb;
- off_t max_block;
- guint32 threshold; /* transition between small and big blocks */
- guint32 sbat_start, num_sbat;
-
- MSOleDirent *root_dir;
- struct GsfInput *sb_file;
-
- int ref_count;
-} MSOleInfo;
-
-typedef struct GsfInfileMSOle {
- off_t size;
- off_t cur_offset;
- struct GsfInput *input;
- MSOleInfo *info;
- MSOleDirent *dirent;
- MSOleBAT bat;
- off_t cur_block;
-
- struct {
- guint8 *buf;
- size_t buf_size;
- } stream;
-} GsfInfileMSOle;
-
-/* utility macros */
-#define OLE_BIG_BLOCK(index, ole) ((index) >> ole->info->bb.shift)
-
-static struct GsfInput *gsf_infile_msole_new_child (GsfInfileMSOle *parent,
- MSOleDirent *dirent);
-
-/**
- * ole_get_block :
- * @ole : the infile
- * @block :
- * @buffer : optionally NULL
- *
- * Read a block of data from the underlying input.
- * Be really anal.
- **/
-static const guint8 *
-ole_get_block (const GsfInfileMSOle *ole, guint32 block, guint8 *buffer)
-{
- g_return_val_if_fail (block < ole->info->max_block, NULL);
-
- /* OLE_HEADER_SIZE is fixed at 512, but the sector containing the
- * header is padded out to bb.size (sector size) when bb.size > 512. */
- if (gsf_input_seek (ole->input,
- (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (block << ole->info->bb.shift)),
- SEEK_SET) < 0)
- return NULL;
-
- return gsf_input_read (ole->input, ole->info->bb.size, buffer);
-}
-
-/**
- * ole_make_bat :
- * @metabat : a meta bat to connect to the raw blocks (small or large)
- * @size_guess : An optional guess as to how many blocks are in the file
- * @block : The first block in the list.
- * @res : where to store the result.
- *
- * Walk the linked list of the supplied block allocation table and build up a
- * table for the list starting in @block.
- *
- * Returns TRUE on error.
- */
-static gboolean
-ole_make_bat (MSOleBAT const *metabat, size_t size_guess, guint32 block,
- MSOleBAT *res)
-{
- /* NOTE : Only use size as a suggestion, sometimes it is wrong */
- GArray *bat = g_array_sized_new (FALSE, FALSE,
- sizeof (guint32), size_guess);
-
- guint8 *used = (guint8*)g_alloca (1 + metabat->num_blocks / 8);
- memset (used, 0, 1 + metabat->num_blocks / 8);
-
- if (block < metabat->num_blocks)
- do {
- /* Catch cycles in the bat list */
- g_return_val_if_fail (0 == (used[block/8] & (1 << (block & 0x7))), TRUE);
- used[block/8] |= 1 << (block & 0x7);
-
- g_array_append_val (bat, block);
- block = metabat->block [block];
- } while (block < metabat->num_blocks);
-
- res->block = NULL;
-
- res->num_blocks = bat->len;
- res->block = (guint32 *) (gpointer) g_array_free (bat, FALSE);
-
- if (block != BAT_MAGIC_END_OF_CHAIN) {
-#if 0
- g_warning ("This OLE2 file is invalid.\n"
- "The Block Allocation Table for one of the streams had %x instead of a terminator (%x).\n"
- "We might still be able to extract some data, but you'll want to check the file.",
- block, BAT_MAGIC_END_OF_CHAIN);
-#endif
- }
-
- return FALSE;
-}
-
-static void
-ols_bat_release (MSOleBAT *bat)
-{
- if (bat->block != NULL) {
- g_free (bat->block);
- bat->block = NULL;
- bat->num_blocks = 0;
- }
-}
-
-/**
- * ole_info_read_metabat :
- * @ole :
- * @bats :
- *
- * A small utility routine to read a set of references to bat blocks
- * either from the OLE header, or a meta-bat block.
- *
- * Returns a pointer to the element after the last position filled.
- **/
-static guint32 *
-ole_info_read_metabat (GsfInfileMSOle *ole, guint32 *bats, guint32 max,
- guint32 const *metabat, guint32 const *metabat_end)
-{
- guint8 const *bat, *end;
-
- for (; metabat < metabat_end; metabat++) {
- bat = ole_get_block (ole, *metabat, NULL);
- if (bat == NULL)
- return NULL;
- end = bat + ole->info->bb.size;
- for ( ; bat < end ; bat += BAT_INDEX_SIZE, bats++) {
- *bats = GSF_LE_GET_GUINT32 (bat);
- g_return_val_if_fail (*bats < max ||
- *bats >= BAT_MAGIC_METABAT, NULL);
- }
- }
- return bats;
-}
-
-/**
- * gsf_ole_get_guint32s :
- * @dst :
- * @src :
- * @num_bytes :
- *
- * Copy some some raw data into an array of guint32.
- **/
-static void
-gsf_ole_get_guint32s (guint32 *dst, guint8 const *src, int num_bytes)
-{
- for (; (num_bytes -= BAT_INDEX_SIZE) >= 0 ; src += BAT_INDEX_SIZE)
- *dst++ = GSF_LE_GET_GUINT32 (src);
-}
-
-static struct GsfInput *
-ole_info_get_sb_file (GsfInfileMSOle *parent)
-{
- MSOleBAT meta_sbat;
-
- if (parent->info->sb_file != NULL)
- return parent->info->sb_file;
-
- parent->info->sb_file = gsf_infile_msole_new_child (parent,
- parent->info->root_dir);
-
- if (NULL == parent->info->sb_file)
- return NULL;
-
- g_return_val_if_fail (parent->info->sb.bat.block == NULL, NULL);
-
- if (ole_make_bat (&parent->info->bb.bat,
- parent->info->num_sbat,
- parent->info->sbat_start,
- &meta_sbat)) {
- return NULL;
- }
-
- parent->info->sb.bat.num_blocks = meta_sbat.num_blocks * (parent->info->bb.size / BAT_INDEX_SIZE);
- parent->info->sb.bat.block = g_new0 (guint32, parent->info->sb.bat.num_blocks);
- ole_info_read_metabat (parent, parent->info->sb.bat.block,
- parent->info->sb.bat.num_blocks,
- meta_sbat.block, meta_sbat.block + meta_sbat.num_blocks);
- ols_bat_release (&meta_sbat);
-
- return parent->info->sb_file;
-}
-
-static gint
-ole_dirent_cmp (const MSOleDirent *a, const MSOleDirent *b)
-{
- g_return_val_if_fail (a, 0);
- g_return_val_if_fail (b, 0);
-
- g_return_val_if_fail (a->collation_name, 0);
- g_return_val_if_fail (b->collation_name, 0);
-
- return strcmp (b->collation_name, a->collation_name);
-}
-
-/**
- * ole_dirent_new :
- * @ole :
- * @entry :
- * @parent : optional
- *
- * Parse dirent number @entry and recursively handle its siblings and children.
- **/
-static MSOleDirent *
-ole_dirent_new (GsfInfileMSOle *ole, guint32 entry, MSOleDirent *parent)
-{
- MSOleDirent *dirent;
- guint32 block, next, prev, child, size;
- guint8 const *data;
- guint8 type;
- guint16 name_len;
-
- if (entry >= DIRENT_MAGIC_END)
- return NULL;
-
- block = OLE_BIG_BLOCK (entry * DIRENT_SIZE, ole);
-
- g_return_val_if_fail (block < ole->bat.num_blocks, NULL);
- data = ole_get_block (ole, ole->bat.block [block], NULL);
- if (data == NULL)
- return NULL;
- data += (DIRENT_SIZE * entry) % ole->info->bb.size;
-
- type = GSF_LE_GET_GUINT8 (data + DIRENT_TYPE);
- if (type != DIRENT_TYPE_DIR &&
- type != DIRENT_TYPE_FILE &&
- type != DIRENT_TYPE_ROOTDIR) {
-#if 0
- g_warning ("Unknown stream type 0x%x", type);
-#endif
- return NULL;
- }
-
- /* It looks like directory (and root directory) sizes are sometimes bogus */
- size = GSF_LE_GET_GUINT32 (data + DIRENT_FILE_SIZE);
- g_return_val_if_fail (type == DIRENT_TYPE_DIR || type == DIRENT_TYPE_ROOTDIR ||
- size <= (guint32)gsf_input_size(ole->input), NULL);
-
- dirent = g_new0 (MSOleDirent, 1);
- dirent->index = entry;
- dirent->size = size;
- /* Store the class id which is 16 byte identifier used by some apps */
- memcpy(dirent->clsid, data + DIRENT_CLSID, sizeof(dirent->clsid));
-
- /* root dir is always big block */
- dirent->use_sb = parent && (size < ole->info->threshold);
- dirent->first_block = (GSF_LE_GET_GUINT32 (data + DIRENT_FIRSTBLOCK));
- dirent->is_directory = (type != DIRENT_TYPE_FILE);
- dirent->children = NULL;
- prev = GSF_LE_GET_GUINT32 (data + DIRENT_PREV);
- next = GSF_LE_GET_GUINT32 (data + DIRENT_NEXT);
- child = GSF_LE_GET_GUINT32 (data + DIRENT_CHILD);
- name_len = GSF_LE_GET_GUINT16 (data + DIRENT_NAME_LEN);
- dirent->name = NULL;
- if (0 < name_len && name_len <= DIRENT_MAX_NAME_SIZE) {
- gunichar2 uni_name [DIRENT_MAX_NAME_SIZE+1];
- gchar const *end;
- int i;
-
- /* !#%!@$#^
- * Sometimes, rarely, people store the stream name as ascii
- * rather than utf16. Do a validation first just in case.
- */
- if (!g_utf8_validate ((const char*) data, -1, &end) ||
- ((guint8 const *)end - data + 1) != name_len) {
- /* be wary about endianness */
- for (i = 0 ; i < name_len ; i += 2)
- uni_name [i/2] = GSF_LE_GET_GUINT16 (data + i);
- uni_name [i/2] = 0;
-
- dirent->name = g_utf16_to_utf8 (uni_name, -1, NULL, NULL, NULL);
- } else
- dirent->name = g_strndup ((gchar *)data, (gsize)((guint8 const *)end - data + 1));
- }
- /* be really anal in the face of screwups */
- if (dirent->name == NULL)
- dirent->name = g_strdup ("");
- dirent->collation_name = g_utf8_collate_key (dirent->name, -1);
-
- if (parent != NULL)
- parent->children = g_list_insert_sorted (parent->children,
- dirent, (GCompareFunc)ole_dirent_cmp);
-
- /* NOTE : These links are a tree, not a linked list */
- if (prev != entry)
- ole_dirent_new (ole, prev, parent);
- if (next != entry)
- ole_dirent_new (ole, next, parent);
-
- if (dirent->is_directory)
- ole_dirent_new (ole, child, dirent);
- return dirent;
-}
-
-static void
-ole_dirent_free (MSOleDirent *dirent)
-{
- GList *tmp;
- g_return_if_fail (dirent != NULL);
-
- g_free (dirent->name);
- g_free (dirent->collation_name);
-
- for (tmp = dirent->children; tmp; tmp = tmp->next)
- ole_dirent_free ((MSOleDirent *)tmp->data);
- g_list_free (dirent->children);
- g_free (dirent);
-}
-
-/*****************************************************************************/
-
-static void
-ole_info_unref (MSOleInfo *info)
-{
- if (info->ref_count-- != 1)
- return;
-
- ols_bat_release (&info->bb.bat);
- ols_bat_release (&info->sb.bat);
- if (info->root_dir != NULL) {
- ole_dirent_free (info->root_dir);
- info->root_dir = NULL;
- }
- if (info->sb_file != NULL) {
- gsf_input_finalize(info->sb_file);
- info->sb_file = NULL;
- }
- g_free (info);
-}
-
-static MSOleInfo *
-ole_info_ref (MSOleInfo *info)
-{
- info->ref_count++;
- return info;
-}
-
-static void
-gsf_infile_msole_init (GsfInfileMSOle * ole)
-{
- ole->cur_offset = 0;
- ole->size = 0;
- ole->input = NULL;
- ole->info = NULL;
- ole->bat.block = NULL;
- ole->bat.num_blocks = 0;
- ole->cur_block = BAT_MAGIC_UNUSED;
- ole->stream.buf = NULL;
- ole->stream.buf_size = 0;
-}
-
-static void
-gsf_infile_msole_finalize (GsfInfileMSOle * ole)
-{
- if (ole->input != NULL) {
- gsf_input_finalize(ole->input);
- ole->input = NULL;
- }
- if (ole->info != NULL) {
- ole_info_unref (ole->info);
- ole->info = NULL;
- }
- ols_bat_release (&ole->bat);
-
- g_free (ole->stream.buf);
- free(ole);
-}
-
-/**
- * ole_dup :
- * @src :
- *
- * Utility routine to _partially_ replicate a file. It does NOT copy the bat
- * blocks, or init the dirent.
- *
- * Return value: the partial duplicate.
- **/
-static GsfInfileMSOle *
-ole_dup (GsfInfileMSOle const * src)
-{
- GsfInfileMSOle *dst;
- struct GsfInput *input;
-
- g_return_val_if_fail (src != NULL, NULL);
-
- dst = malloc(sizeof(GsfInfileMSOle));
- if (dst == NULL)
- return NULL;
- gsf_infile_msole_init(dst);
- input = gsf_input_dup (src->input);
- if (input == NULL) {
- gsf_infile_msole_finalize(dst);
- return NULL;
- }
- dst->input = input;
- dst->info = ole_info_ref (src->info);
-
- /* buf and buf_size are initialized to NULL */
-
- return dst;
-}
-
-/**
- * ole_init_info :
- * @ole :
- *
- * Read an OLE header and do some sanity checking
- * along the way.
- *
- * Return value: TRUE on error
- **/
-static gboolean
-ole_init_info (GsfInfileMSOle *ole)
-{
- static guint8 const signature[] =
- { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
- guint8 const *header, *tmp;
- guint32 *metabat = NULL;
- MSOleInfo *info;
- guint32 bb_shift, sb_shift, num_bat, num_metabat, last, dirent_start;
- guint32 metabat_block, *ptr;
-
- /* check the header */
- if (gsf_input_seek (ole->input, (off_t) 0, SEEK_SET) ||
- NULL == (header = gsf_input_read (ole->input, OLE_HEADER_SIZE, NULL)) ||
- 0 != memcmp (header, signature, sizeof (signature))) {
- return TRUE;
- }
-
- bb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_BB_SHIFT);
- sb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_SB_SHIFT);
- num_bat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_BAT);
- dirent_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_DIRENT_START);
- metabat_block = GSF_LE_GET_GUINT32 (header + OLE_HEADER_METABAT_BLOCK);
- num_metabat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_METABAT);
-
- /* Some sanity checks
- * 1) There should always be at least 1 BAT block
- * 2) It makes no sense to have a block larger than 2^31 for now.
- * Maybe relax this later, but not much.
- */
- if (6 > bb_shift || bb_shift >= 31 || sb_shift > bb_shift) {
- return TRUE;
- }
-
- info = g_new0 (MSOleInfo, 1);
- ole->info = info;
-
- info->ref_count = 1;
- info->bb.shift = bb_shift;
- info->bb.size = 1 << info->bb.shift;
- info->bb.filter = info->bb.size - 1;
- info->sb.shift = sb_shift;
- info->sb.size = 1 << info->sb.shift;
- info->sb.filter = info->sb.size - 1;
- info->threshold = GSF_LE_GET_GUINT32 (header + OLE_HEADER_THRESHOLD);
- info->sbat_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_SBAT_START);
- info->num_sbat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_SBAT);
- info->max_block = (gsf_input_size (ole->input) - OLE_HEADER_SIZE) / info->bb.size;
- info->sb_file = NULL;
-
- if (info->num_sbat == 0 && info->sbat_start != BAT_MAGIC_END_OF_CHAIN) {
-#if 0
- g_warning ("There is are not supposed to be any blocks in the small block allocation table, yet there is a link to some. Ignoring it.");
-#endif
- }
-
- /* very rough heuristic, just in case */
- if (num_bat < info->max_block) {
- info->bb.bat.num_blocks = num_bat * (info->bb.size / BAT_INDEX_SIZE);
- info->bb.bat.block = g_new0 (guint32, info->bb.bat.num_blocks);
-
- metabat = (guint32 *)g_alloca (MAX (info->bb.size, OLE_HEADER_SIZE));
-
- /* Reading the elements invalidates this memory, make copy */
- gsf_ole_get_guint32s (metabat, header + OLE_HEADER_START_BAT,
- OLE_HEADER_SIZE - OLE_HEADER_START_BAT);
- last = num_bat;
- if (last > OLE_HEADER_METABAT_SIZE)
- last = OLE_HEADER_METABAT_SIZE;
-
- ptr = ole_info_read_metabat (ole, info->bb.bat.block,
- info->bb.bat.num_blocks, metabat, metabat + last);
- num_bat -= last;
- } else
- ptr = NULL;
-
- last = (info->bb.size - BAT_INDEX_SIZE) / BAT_INDEX_SIZE;
- while (ptr != NULL && num_metabat-- > 0) {
- tmp = ole_get_block (ole, metabat_block, NULL);
- if (tmp == NULL) {
- ptr = NULL;
- break;
- }
-
- /* Reading the elements invalidates this memory, make copy */
- gsf_ole_get_guint32s (metabat, tmp, (int)info->bb.size);
-
- if (num_metabat == 0) {
- if (last < num_bat) {
- /* there should be less that a full metabat block
- * remaining */
- ptr = NULL;
- break;
- }
- last = num_bat;
- } else if (num_metabat > 0) {
- metabat_block = metabat[last];
- num_bat -= last;
- }
-
- ptr = ole_info_read_metabat (ole, ptr,
- info->bb.bat.num_blocks, metabat, metabat + last);
- }
-
- if (ptr == NULL) {
- return TRUE;
- }
-
- /* Read the directory's bat, we do not know the size */
- if (ole_make_bat (&info->bb.bat, 0, dirent_start, &ole->bat)) {
- return TRUE;
- }
-
- /* Read the directory */
- ole->dirent = info->root_dir = ole_dirent_new (ole, 0, NULL);
- if (ole->dirent == NULL) {
- return TRUE;
- }
-
- return FALSE;
-}
-
-static guint8 const *
-gsf_infile_msole_read (GsfInfileMSOle *ole, size_t num_bytes, guint8 *buffer)
-{
- off_t first_block, last_block, raw_block, offset, i;
- guint8 const *data;
- guint8 *ptr;
- size_t count;
-
- /* small block files are preload */
- if (ole->dirent != NULL && ole->dirent->use_sb) {
- if (buffer != NULL) {
- memcpy (buffer, ole->stream.buf + ole->cur_offset, num_bytes);
- ole->cur_offset += num_bytes;
- return buffer;
- }
- data = ole->stream.buf + ole->cur_offset;
- ole->cur_offset += num_bytes;
- return data;
- }
-
- /* GsfInput guarantees that num_bytes > 0 */
- first_block = OLE_BIG_BLOCK (ole->cur_offset, ole);
- last_block = OLE_BIG_BLOCK (ole->cur_offset + num_bytes - 1, ole);
- offset = ole->cur_offset & ole->info->bb.filter;
-
- /* optimization : are all the raw blocks contiguous */
- i = first_block;
- raw_block = ole->bat.block [i];
- while (++i <= last_block && ++raw_block == ole->bat.block [i])
- ;
- if (i > last_block) {
- /* optimization don't seek if we don't need to */
- if (ole->cur_block != first_block) {
- if (gsf_input_seek (ole->input,
- (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (ole->bat.block [first_block] << ole->info->bb.shift) + offset),
- SEEK_SET) < 0)
- return NULL;
- }
- ole->cur_block = last_block;
- return gsf_input_read (ole->input,
- num_bytes,
- (unsigned char*) buffer);
- }
-
- /* damn, we need to copy it block by block */
- if (buffer == NULL) {
- if (ole->stream.buf_size < num_bytes) {
- if (ole->stream.buf != NULL)
- g_free (ole->stream.buf);
- ole->stream.buf_size = num_bytes;
- ole->stream.buf = g_new (guint8, num_bytes);
- }
- buffer = ole->stream.buf;
- }
-
- ptr = buffer;
- for (i = first_block ; i <= last_block ; i++ , ptr += count, num_bytes -= count) {
- count = ole->info->bb.size - offset;
- if (count > num_bytes)
- count = num_bytes;
- data = ole_get_block (ole, ole->bat.block [i], NULL);
- if (data == NULL)
- return NULL;
-
- /* TODO : this could be optimized to avoid the copy */
- memcpy (ptr, data + offset, count);
- offset = 0;
- }
- ole->cur_block = BAT_MAGIC_UNUSED;
- ole->cur_offset += num_bytes;
- return buffer;
-}
-
-static struct GsfInput *
-gsf_infile_msole_new_child (GsfInfileMSOle *parent,
- MSOleDirent *dirent)
-{
- GsfInfileMSOle * child;
- MSOleInfo *info;
- MSOleBAT const *metabat;
- struct GsfInput *sb_file = NULL;
- size_t size_guess;
- char * buf;
-
-
- if ( (dirent->index != 0) &&
- (dirent->is_directory) ) {
- /* be wary. It seems as if some implementations pretend that the
- * directories contain data */
- return gsf_input_new((const unsigned char*) "",
- (off_t) 0,
- 0);
- }
- child = ole_dup (parent);
- if (child == NULL)
- return NULL;
- child->dirent = dirent;
- child->size = (off_t) dirent->size;
-
- info = parent->info;
-
- if (dirent->use_sb) { /* build the bat */
- metabat = &info->sb.bat;
- size_guess = dirent->size >> info->sb.shift;
- sb_file = ole_info_get_sb_file (parent);
- } else {
- metabat = &info->bb.bat;
- size_guess = dirent->size >> info->bb.shift;
- }
- if (ole_make_bat (metabat, size_guess + 1, dirent->first_block, &child->bat)) {
- gsf_infile_msole_finalize(child);
- return NULL;
- }
-
- if (dirent->use_sb) {
- unsigned i;
- guint8 const *data;
-
- if (sb_file == NULL) {
- gsf_infile_msole_finalize(child);
- return NULL;
- }
-
- child->stream.buf_size = info->threshold;
- child->stream.buf = g_new (guint8, info->threshold);
-
- for (i = 0 ; i < child->bat.num_blocks; i++)
- if (gsf_input_seek (sb_file,
- (off_t)(child->bat.block [i] << info->sb.shift), SEEK_SET) < 0 ||
- (data = gsf_input_read (sb_file,
- info->sb.size,
- child->stream.buf + (i << info->sb.shift))) == NULL) {
- gsf_infile_msole_finalize(child);
- return NULL;
- }
- }
- buf = malloc(child->size);
- if (buf == NULL) {
- gsf_infile_msole_finalize(child);
- return NULL;
- }
- if (NULL == gsf_infile_msole_read(child,
- child->size,
- (guint8*) buf)) {
- gsf_infile_msole_finalize(child);
- return NULL;
- }
- gsf_infile_msole_finalize(child);
- return gsf_input_new((const unsigned char*) buf,
- (off_t) dirent->size,
- 1);
-}
-
-
-static struct GsfInput *
-gsf_infile_msole_child_by_index (GsfInfileMSOle * ole, int target)
-{
- GList *p;
-
- for (p = ole->dirent->children; p != NULL ; p = p->next)
- if (target-- <= 0)
- return gsf_infile_msole_new_child (ole,
- (MSOleDirent *)p->data);
- return NULL;
-}
-
-static char const *
-gsf_infile_msole_name_by_index (GsfInfileMSOle * ole, int target)
-{
- GList *p;
-
- for (p = ole->dirent->children; p != NULL ; p = p->next)
- if (target-- <= 0)
- return ((MSOleDirent *)p->data)->name;
- return NULL;
-}
-
-static int
-gsf_infile_msole_num_children (GsfInfileMSOle * ole)
-{
- g_return_val_if_fail (ole->dirent != NULL, -1);
-
- if (!ole->dirent->is_directory)
- return -1;
- return g_list_length (ole->dirent->children);
-}
-
-
-/**
- * gsf_infile_msole_new :
- * @source :
- *
- * Opens the root directory of an MS OLE file.
- * NOTE : adds a reference to @source
- *
- * Returns : the new ole file handler
- **/
-static GsfInfileMSOle *
-gsf_infile_msole_new (struct GsfInput *source)
-{
- GsfInfileMSOle * ole;
-
- ole = malloc(sizeof(GsfInfileMSOle));
- if (ole == NULL)
- return NULL;
- gsf_infile_msole_init(ole);
- ole->input = source;
- ole->size = (off_t) 0;
-
- if (ole_init_info (ole)) {
- gsf_infile_msole_finalize(ole);
- return NULL;
- }
-
- return ole;
-}
-
-
-
-
+#include <gsf/gsf-utils.h>
+#include <gsf/gsf-input-memory.h>
+#include <gsf/gsf-infile.h>
+#include <gsf/gsf-infile-msole.h>
+#include <gsf/gsf-msole-utils.h>
+#define DEBUG_OLE2 0
/* ******************************** main extraction code ************************ */
@@ -1240,21 +53,21 @@ static struct EXTRACTOR_Keywords *
addKeyword(EXTRACTOR_KeywordList *oldhead,
const char *phrase,
EXTRACTOR_KeywordType type) {
- EXTRACTOR_KeywordList * keyword;
-
- if (strlen(phrase) == 0)
- return oldhead;
- if (0 == strcmp(phrase, "\"\""))
- return oldhead;
- if (0 == strcmp(phrase, "\" \""))
- return oldhead;
- if (0 == strcmp(phrase, " "))
- return oldhead;
- keyword = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList));
- keyword->next = oldhead;
- keyword->keyword = strdup(phrase);
- keyword->keywordType = type;
- return keyword;
+ EXTRACTOR_KeywordList * keyword;
+
+ if (strlen(phrase) == 0)
+ return oldhead;
+ if (0 == strcmp(phrase, "\"\""))
+ return oldhead;
+ if (0 == strcmp(phrase, "\" \""))
+ return oldhead;
+ if (0 == strcmp(phrase, " "))
+ return oldhead;
+ keyword = malloc(sizeof(EXTRACTOR_KeywordList));
+ keyword->next = oldhead;
+ keyword->keyword = strdup(phrase);
+ keyword->keywordType = type;
+ return keyword;
}
@@ -1273,122 +86,6 @@ static guint8 const user_guid [] = {
0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
};
-typedef enum {
- GSF_MSOLE_META_DATA_COMPONENT,
- GSF_MSOLE_META_DATA_DOCUMENT,
- GSF_MSOLE_META_DATA_USER
-} GsfMSOleMetaDataType;
-
-typedef enum {
- LE_VT_EMPTY = 0,
- LE_VT_NULL = 1,
- LE_VT_I2 = 2,
- LE_VT_I4 = 3,
- LE_VT_R4 = 4,
- LE_VT_R8 = 5,
- LE_VT_CY = 6,
- LE_VT_DATE = 7,
- LE_VT_BSTR = 8,
- LE_VT_DISPATCH = 9,
- LE_VT_ERROR = 10,
- LE_VT_BOOL = 11,
- LE_VT_VARIANT = 12,
- LE_VT_UNKNOWN = 13,
- LE_VT_DECIMAL = 14,
- LE_VT_I1 = 16,
- LE_VT_UI1 = 17,
- LE_VT_UI2 = 18,
- LE_VT_UI4 = 19,
- LE_VT_I8 = 20,
- LE_VT_UI8 = 21,
- LE_VT_INT = 22,
- LE_VT_UINT = 23,
- LE_VT_VOID = 24,
- LE_VT_HRESULT = 25,
- LE_VT_PTR = 26,
- LE_VT_SAFEARRAY = 27,
- LE_VT_CARRAY = 28,
- LE_VT_USERDEFINED = 29,
- LE_VT_LPSTR = 30,
- LE_VT_LPWSTR = 31,
- LE_VT_FILETIME = 64,
- LE_VT_BLOB = 65,
- LE_VT_STREAM = 66,
- LE_VT_STORAGE = 67,
- LE_VT_STREAMED_OBJECT = 68,
- LE_VT_STORED_OBJECT = 69,
- LE_VT_BLOB_OBJECT = 70,
- LE_VT_CF = 71,
- LE_VT_CLSID = 72,
- LE_VT_VECTOR = 0x1000
-} GsfMSOleVariantType;
-
-typedef struct {
- char const *name;
- guint32 id;
- GsfMSOleVariantType prefered_type;
-} GsfMSOleMetaDataPropMap;
-
-typedef struct {
- guint32 id;
- off_t offset;
-} GsfMSOleMetaDataProp;
-
-typedef struct {
- GsfMSOleMetaDataType type;
- off_t offset;
- guint32 size, num_props;
- GIConv iconv_handle;
- unsigned char_size;
- GHashTable *dict;
-} GsfMSOleMetaDataSection;
-
-static GsfMSOleMetaDataPropMap const document_props[] = {
- { "Category", 2, LE_VT_LPSTR },
- { "PresentationFormat", 3, LE_VT_LPSTR },
- { "NumBytes", 4, LE_VT_I4 },
- { "NumLines", 5, LE_VT_I4 },
- { "NumParagraphs", 6, LE_VT_I4 },
- { "NumSlides", 7, LE_VT_I4 },
- { "NumNotes", 8, LE_VT_I4 },
- { "NumHiddenSlides", 9, LE_VT_I4 },
- { "NumMMClips", 10, LE_VT_I4 },
- { "Scale", 11, LE_VT_BOOL },
- { "HeadingPairs", 12, LE_VT_VECTOR | LE_VT_VARIANT },
- { "DocumentParts", 13, LE_VT_VECTOR | LE_VT_LPSTR },
- { "Manager", 14, LE_VT_LPSTR },
- { "Company", 15, LE_VT_LPSTR },
- { "LinksDirty", 16, LE_VT_BOOL }
-};
-
-static GsfMSOleMetaDataPropMap const component_props[] = {
- { "Title", 2, LE_VT_LPSTR },
- { "Subject", 3, LE_VT_LPSTR },
- { "Author", 4, LE_VT_LPSTR },
- { "Keywords", 5, LE_VT_LPSTR },
- { "Comments", 6, LE_VT_LPSTR },
- { "Template", 7, LE_VT_LPSTR },
- { "LastSavedBy", 8, LE_VT_LPSTR },
- { "RevisionNumber", 9, LE_VT_LPSTR },
- { "TotalEditingTime", 10, LE_VT_FILETIME },
- { "LastPrinted", 11, LE_VT_FILETIME },
- { "CreateTime", 12, LE_VT_FILETIME },
- { "LastSavedTime", 13, LE_VT_FILETIME },
- { "NumPages", 14, LE_VT_I4 },
- { "NumWords", 15, LE_VT_I4 },
- { "NumCharacters", 16, LE_VT_I4 },
- { "Thumbnail", 17, LE_VT_CF },
- { "AppName", 18, LE_VT_LPSTR },
- { "Security", 19, LE_VT_I4 }
-};
-
-static GsfMSOleMetaDataPropMap const common_props[] = {
- { "Dictionary", 0, 0, /* magic */},
- { "CodePage", 1, LE_VT_UI2 },
- { "LOCALE_SYSTEM_DEFAULT", 0x80000000, LE_VT_UI4},
- { "CASE_SENSITIVE", 0x80000003, LE_VT_UI4},
-};
-
typedef struct {
char * text;
EXTRACTOR_KeywordType type;
@@ -1398,8 +95,8 @@ static Matches tmap[] = {
{ "Title", EXTRACTOR_TITLE },
{ "PresentationFormat", EXTRACTOR_FORMAT },
{ "Category", EXTRACTOR_DESCRIPTION },
- { "Manager", EXTRACTOR_CREATED_FOR },
- { "Company", EXTRACTOR_ORGANIZATION },
+ { "Manager", EXTRACTOR_MANAGER },
+ { "Company", EXTRACTOR_COMPANY },
{ "Subject", EXTRACTOR_SUBJECT },
{ "Author", EXTRACTOR_AUTHOR },
{ "Keywords", EXTRACTOR_KEYWORDS },
@@ -1412,709 +109,98 @@ static Matches tmap[] = {
{ "NumBytes", EXTRACTOR_SIZE },
{ "CreatedTime", EXTRACTOR_CREATION_DATE },
{ "LastSavedTime" , EXTRACTOR_MODIFICATION_DATE },
+ { "gsf:company", EXTRACTOR_COMPANY },
+ /* { "gsf:security", EXTRACTOR_SECURITY }, */
+ { "gsf:character-count", EXTRACTOR_CHARACTER_COUNT },
+ { "gsf:page-count", EXTRACTOR_PAGE_COUNT },
+ { "gsf:line-count", EXTRACTOR_LINE_COUNT },
+ { "gsf:word-count", EXTRACTOR_WORD_COUNT },
+ { "gsf:paragraph-count", EXTRACTOR_PARAGRAPH_COUNT },
+ { "gsf:last-saved-by", EXTRACTOR_LAST_SAVED_BY },
+ /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
+ { "gsf:manager", EXTRACTOR_MANAGER },
+ { "dc:title", EXTRACTOR_TITLE },
+ { "dc:creator", EXTRACTOR_CREATOR },
+ { "dc:date", EXTRACTOR_DATE },
+ { "dc:subject", EXTRACTOR_SUBJECT },
+ { "dc:keywords", EXTRACTOR_KEYWORDS },
+ { "dc:last-printed", EXTRACTOR_LAST_PRINTED },
+ { "dc:description", EXTRACTOR_DESCRIPTION },
+ { "meta:creation-date", EXTRACTOR_CREATION_DATE },
+ /* { "meta:editing-duration", EXTRACTOR_TOTAL_EDITING_TIME }, // encoding? */
+ { "meta:generator", EXTRACTOR_GENERATOR },
+ { "meta:template", EXTRACTOR_TEMPLATE },
+ /* { "meta:editing-cycles", EXTRACTOR_EDITING_CYCLES }, // usually "FALSE" */
+ /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
{ NULL, 0 },
};
-
-static char const *
-msole_prop_id_to_gsf (GsfMSOleMetaDataSection *section, guint32 id)
-{
- char const *res = NULL;
- GsfMSOleMetaDataPropMap const *map = NULL;
- unsigned i = 0;
-
- if (section->dict != NULL) {
- if (id & 0x1000000) {
- id &= ~0x1000000;
- d (printf ("LINKED "););
- }
-
- res = g_hash_table_lookup (section->dict, GINT_TO_POINTER (id));
-
- if (res != NULL) {
- d (printf (res););
- return res;
- }
- }
-
- if (section->type == GSF_MSOLE_META_DATA_COMPONENT) {
- map = component_props;
- i = G_N_ELEMENTS (component_props);
- } else if (section->type == GSF_MSOLE_META_DATA_DOCUMENT) {
- map = document_props;
- i = G_N_ELEMENTS (document_props);
- }
- while (i-- > 0)
- if (map[i].id == id) {
- d (printf (map[i].name););
- return map[i].name;
- }
-
- map = common_props;
- i = G_N_ELEMENTS (common_props);
- while (i-- > 0)
- if (map[i].id == id) {
- d (printf (map[i].name););
- return map[i].name;
- }
-
- d (printf ("_UNKNOWN_(0x%x %d)", id, id););
-
- return NULL;
-}
-
-static GValue *
-msole_prop_parse(GsfMSOleMetaDataSection *section,
- guint32 type,
- guint8 const **data,
- guint8 const *data_end)
-{
- GValue *res;
- char *str;
- guint32 len;
- gboolean const is_vector = type & LE_VT_VECTOR;
- GError * error;
-
- g_return_val_if_fail (!(type & (unsigned)(~0x1fff)), NULL); /* not valid in a prop set */
-
- type &= 0xfff;
-
- if (is_vector) {
- unsigned i, n;
-
- g_return_val_if_fail (*data + 4 <= data_end, NULL);
-
- n = GSF_LE_GET_GUINT32 (*data);
- *data += 4;
-
- d (printf (" array with %d elem\n", n););
- for (i = 0 ; i < n ; i++) {
- GValue *v;
- d (printf ("\t[%d] ", i););
- v = msole_prop_parse (section, type, data, data_end);
- if (v) {
- /* FIXME: do something with it. */
- if (G_IS_VALUE (v))
- g_value_unset (v);
- g_free (v);
- }
- }
- return NULL;
- }
-
- res = g_new0 (GValue, 1);
- switch (type) {
- case LE_VT_EMPTY : d (puts ("VT_EMPTY"););
- /* value::unset == empty */
- break;
-
- case LE_VT_NULL : d (puts ("VT_NULL"););
- /* value::unset == null too :-) do we need to distinguish ? */
- break;
-
- case LE_VT_I2 : d (puts ("VT_I2"););
- g_return_val_if_fail (*data + 2 <= data_end, NULL);
- g_value_init (res, G_TYPE_INT);
- g_value_set_int (res, GSF_LE_GET_GINT16 (*data));
- *data += 2;
- break;
-
- case LE_VT_I4 : d (puts ("VT_I4"););
- g_return_val_if_fail (*data + 4 <= data_end, NULL);
- g_value_init (res, G_TYPE_INT);
- g_value_set_int (res, GSF_LE_GET_GINT32 (*data));
- *data += 4;
- break;
-
- case LE_VT_R4 : d (puts ("VT_R4"););
- g_return_val_if_fail (*data + 4 <= data_end, NULL);
- g_value_init (res, G_TYPE_FLOAT);
- g_value_set_float (res, GSF_LE_GET_FLOAT (*data));
- *data += 4;
- break;
-
- case LE_VT_R8 : d (puts ("VT_R8"););
- g_return_val_if_fail (*data + 8 <= data_end, NULL);
- g_value_init (res, G_TYPE_DOUBLE);
- g_value_set_double (res, GSF_LE_GET_DOUBLE (*data));
- *data += 8;
- break;
-
- case LE_VT_CY : d (puts ("VT_CY"););
- /* 8-byte two's complement integer (scaled by 10,000) */
- /* CHEAT : just store as an int64 for now */
- g_return_val_if_fail (*data + 8 <= data_end, NULL);
- g_value_init (res, G_TYPE_INT64);
- g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data));
- break;
-
- case LE_VT_DATE : d (puts ("VT_DATE"););
- break;
-
- case LE_VT_BSTR : d (puts ("VT_BSTR"););
- break;
-
- case LE_VT_DISPATCH : d (puts ("VT_DISPATCH"););
- break;
-
- case LE_VT_BOOL : d (puts ("VT_BOOL"););
- g_return_val_if_fail (*data + 1 <= data_end, NULL);
- g_value_init (res, G_TYPE_BOOLEAN);
- g_value_set_boolean (res, **data ? TRUE : FALSE);
- *data += 1;
- break;
-
- case LE_VT_VARIANT : d (printf ("VT_VARIANT containing a "););
- g_free (res);
- type = GSF_LE_GET_GUINT32 (*data);
- *data += 4;
- return msole_prop_parse (section, type, data, data_end);
-
- case LE_VT_UI1 : d (puts ("VT_UI1"););
- g_return_val_if_fail (*data + 1 <= data_end, NULL);
- g_value_init (res, G_TYPE_UCHAR);
- g_value_set_uchar (res, (guchar)(**data));
- *data += 1;
- break;
-
- case LE_VT_UI2 : d (puts ("VT_UI2"););
- g_return_val_if_fail (*data + 2 <= data_end, NULL);
- g_value_init (res, G_TYPE_UINT);
- g_value_set_uint (res, GSF_LE_GET_GUINT16 (*data));
- *data += 2;
- break;
-
- case LE_VT_UI4 : d (puts ("VT_UI4"););
- g_return_val_if_fail (*data + 4 <= data_end, NULL);
- g_value_init (res, G_TYPE_UINT);
- *data += 4;
- d (printf ("%u\n", GSF_LE_GET_GUINT32 (*data)););
- break;
-
- case LE_VT_I8 : d (puts ("VT_I8"););
- g_return_val_if_fail (*data + 8 <= data_end, NULL);
- g_value_init (res, G_TYPE_INT64);
- g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data));
- *data += 8;
- break;
-
- case LE_VT_UI8 : d (puts ("VT_UI8"););
- g_return_val_if_fail (*data + 8 <= data_end, NULL);
- g_value_init (res, G_TYPE_UINT64);
- g_value_set_uint64 (res, GSF_LE_GET_GUINT64 (*data));
- *data += 8;
- break;
-
- case LE_VT_LPSTR : d (puts ("VT_LPSTR"););
- /*
- * This is the representation of many strings. It is stored in
- * the same representation as VT_BSTR. Note that the serialized
- * representation of VP_LPSTR has a preceding byte count, whereas
- * the in-memory representation does not.
- */
- /* be anal and safe */
- g_return_val_if_fail (*data + 4 <= data_end, NULL);
-
- len = GSF_LE_GET_GUINT32 (*data);
-
- g_return_val_if_fail (len < 0x10000, NULL);
- g_return_val_if_fail (*data + 4 + len*section->char_size <= data_end, NULL);
-
- error = NULL;
- d (gsf_mem_dump (*data + 4, len * section->char_size););
- str = g_convert_with_iconv ((char*) *data + 4,
- len * section->char_size,
- section->iconv_handle, NULL, NULL, &error);
-
- g_value_init (res, G_TYPE_STRING);
- if (NULL != str) {
- g_value_set_string (res, str);
- g_free (str);
- } else if (NULL != error) {
- g_warning ("error: %s", error->message);
- g_error_free (error);
- } else {
- // g_warning ("unknown error converting string property, using blank");
- }
- *data += 4 + len * section->char_size;
- break;
-
- case LE_VT_LPWSTR : d (puts ("VT_LPWSTR"););
- /*
- * A counted and null-terminated Unicode string; a DWORD character
- * count (where the count includes the terminating null) followed
- * by that many Unicode (16-bit) characters. Note that the count
- * is character count, not byte count.
- */
- /* be anal and safe */
- g_return_val_if_fail (*data + 4 <= data_end, NULL);
-
- len = GSF_LE_GET_GUINT32 (*data);
-
- g_return_val_if_fail (len < 0x10000, NULL);
- g_return_val_if_fail (*data + 4 + len <= data_end, NULL);
-
- error = NULL;
- d (gsf_mem_dump (*data + 4, len*2););
- str = g_convert ((char*) *data + 4,
- len*2,
- "UTF-8",
- "UTF-16LE",
- NULL,
- NULL,
- &error);
-
- g_value_init (res, G_TYPE_STRING);
- if (NULL != str) {
- g_value_set_string (res, str);
- g_free (str);
- } else if (NULL != error) {
- g_warning ("error: %s", error->message);
- g_error_free (error);
- } else {
- g_warning ("unknown error converting string property, using blank");
- }
- *data += 4 + len*2;
- break;
-
- case LE_VT_FILETIME : d (puts ("VT_FILETIME"););
-
- g_return_val_if_fail (*data + 8 <= data_end, NULL);
-
- g_value_init (res, G_TYPE_STRING);
- {
- /* ft * 100ns since Jan 1 1601 */
- guint64 ft = GSF_LE_GET_GUINT64 (*data);
-
- ft /= 10000000; /* convert to seconds */
-#ifdef _MSC_VER
- ft -= 11644473600i64; /* move to Jan 1 1970 */
-#else
- ft -= 11644473600ULL; /* move to Jan 1 1970 */
-#endif
-
- str = g_strdup(ctime((time_t*)&ft));
-
- g_value_set_string (res, str);
-
- *data += 8;
- break;
- }
- case LE_VT_BLOB : d (puts ("VT_BLOB"););
- g_free (res);
- res = NULL;
- break;
- case LE_VT_STREAM : d (puts ("VT_STREAM"););
- g_free (res);
- res = NULL;
- break;
- case LE_VT_STORAGE : d (puts ("VT_STORAGE"););
- g_free (res);
- res = NULL;
- break;
- case LE_VT_STREAMED_OBJECT: d (puts ("VT_STREAMED_OBJECT"););
- g_free (res);
- res = NULL;
- break;
- case LE_VT_STORED_OBJECT : d (puts ("VT_STORED_OBJECT"););
- g_free (res);
- res = NULL;
- break;
- case LE_VT_BLOB_OBJECT : d (puts ("VT_BLOB_OBJECT"););
- g_free (res);
- res = NULL;
- break;
- case LE_VT_CF : d (puts ("VT_CF"););
- break;
- case LE_VT_CLSID : d (puts ("VT_CLSID"););
- *data += 16;
- g_free (res);
- res = NULL;
- break;
-
- case LE_VT_ERROR :
- case LE_VT_UNKNOWN :
- case LE_VT_DECIMAL :
- case LE_VT_I1 :
- case LE_VT_INT :
- case LE_VT_UINT :
- case LE_VT_VOID :
- case LE_VT_HRESULT :
- case LE_VT_PTR :
- case LE_VT_SAFEARRAY :
- case LE_VT_CARRAY :
- case LE_VT_USERDEFINED :
- warning ("type %d (0x%x) is not permitted in property sets",
- type, type);
- g_free (res);
- res = NULL;
- break;
-
- default :
- warning ("Unknown property type %d (0x%x)", type, type);
- g_free (res);
- res = NULL;
- };
-
- d ( if (res != NULL && G_IS_VALUE (res)) {
- char *val = g_strdup_value_contents (res);
- d(printf ("%s\n", val););
- g_free (val);
- } else
- puts ("<unparsed>\n");
- );
- return res;
-}
-
-static GValue *
-msole_prop_read (struct GsfInput *in,
- GsfMSOleMetaDataSection *section,
- GsfMSOleMetaDataProp *props,
- unsigned i)
-{
- guint32 type;
- guint8 const *data;
- /* TODO : why size-4 ? I must be missing something */
- off_t size = ((i+1) >= section->num_props)
- ? section->size-4 : props[i+1].offset;
- char const *prop_name;
-
- g_return_val_if_fail (i < section->num_props, NULL);
- g_return_val_if_fail (size >= props[i].offset + 4, NULL);
-
- size -= props[i].offset; /* includes the type id */
- if (gsf_input_seek (in, section->offset+props[i].offset, SEEK_SET) ||
- NULL == (data = gsf_input_read (in, size, NULL))) {
- warning ("failed to read prop #%d", i);
- return NULL;
- }
-
- type = GSF_LE_GET_GUINT32 (data);
- data += 4;
-
- /* dictionary is magic */
- if (props[i].id == 0) {
- guint32 len, id, i, n;
- gsize gslen;
- char *name;
- guint8 const *start = data;
-
- g_return_val_if_fail (section->dict == NULL, NULL);
-
- section->dict = g_hash_table_new_full (
- g_direct_hash, g_direct_equal,
- NULL, g_free);
-
- n = type;
- for (i = 0 ; i < n ; i++) {
- id = GSF_LE_GET_GUINT32 (data);
- len = GSF_LE_GET_GUINT32 (data + 4);
-
- g_return_val_if_fail (len < 0x10000, NULL);
-
- gslen = 0;
- name = g_convert_with_iconv ((char*) data + 8,
- len * section->char_size,
- section->iconv_handle, &gslen, NULL, NULL);
-
- len = (guint32)gslen;
- data += 8 + len;
-
- d (printf ("\t%u == %s\n", id, name););
- g_hash_table_replace (section->dict,
- GINT_TO_POINTER (id), name);
-
- /* MS documentation blows goats !
- * The docs claim there are padding bytes in the dictionary.
- * Their examples show padding bytes.
- * In reality non-unicode strings do not see to have padding.
- */
- if (section->char_size != 1 && (data - start) % 4)
- data += 4 - ((data - start) % 4);
- }
-
- return NULL;
- }
-
- d (printf ("%u) ", i););
- prop_name = msole_prop_id_to_gsf (section, props[i].id);
-
- d (printf (" @ %x %x = ", (unsigned)props[i].offset, (unsigned)size););
- return msole_prop_parse (section, type, &data, data + size);
-}
-
-static int
-msole_prop_cmp (gconstpointer a, gconstpointer b)
-{
- GsfMSOleMetaDataProp const *prop_a = a ;
- GsfMSOleMetaDataProp const *prop_b = b ;
- return prop_a->offset - prop_b->offset;
-}
-
-/**
- * gsf_msole_iconv_open_codepage_for_import :
- * @to:
- * @codepage :
- *
- * Returns an iconv converter for @codepage -> utf8.
- **/
-static GIConv
-gsf_msole_iconv_open_codepage_for_import(char const *to,
- int codepage)
-{
- GIConv iconv_handle;
-
- g_return_val_if_fail (to != NULL, (GIConv)(-1));
- /* sometimes it is stored as signed short */
- if (codepage == 65001 || codepage == -535) {
- iconv_handle = g_iconv_open (to, "UTF-8");
- if (iconv_handle != (GIConv)(-1))
- return iconv_handle;
- } else if (codepage != 1200 && codepage != 1201) {
- char* src_charset = g_strdup_printf ("CP%d", codepage);
- iconv_handle = g_iconv_open (to, src_charset);
- g_free (src_charset);
- if (iconv_handle != (GIConv)(-1))
- return iconv_handle;
+static void processMetadata(gpointer key,
+ gpointer value,
+ gpointer user_data) {
+ struct EXTRACTOR_Keywords ** pprev = user_data;
+ const char * type = key;
+ const GsfDocProp * prop = value;
+ const GValue * gval;
+ char * contents;
+ int pos;
+
+ if ( (key == NULL) ||
+ (value == NULL) )
+ return;
+ gval = gsf_doc_prop_get_val(prop);
+
+ if (G_VALUE_TYPE(gval) == G_TYPE_STRING) {
+ contents = strdup(g_value_get_string(gval));
} else {
- char const *from = (codepage == 1200) ? "UTF-16LE" : "UTF-16BE";
- iconv_handle = g_iconv_open (to, from);
- if (iconv_handle != (GIConv)(-1))
- return iconv_handle;
- }
-
- /* Try aliases. */
- if (codepage == 10000) {
- /* gnu iconv. */
- iconv_handle = g_iconv_open (to, "MACROMAN");
- if (iconv_handle != (GIConv)(-1))
- return iconv_handle;
-
- /* glibc. */
- iconv_handle = g_iconv_open (to, "MACINTOSH");
- if (iconv_handle != (GIConv)(-1))
- return iconv_handle;
+ /* convert other formats? */
+ contents = g_strdup_value_contents(gval);
}
-
- warning ("Unable to open an iconv handle from codepage %d -> %s",
- codepage, to);
- return (GIConv)(-1);
-}
-
-/**
- * gsf_msole_iconv_open_for_import :
- * @codepage :
- *
- * Returns an iconv converter for single byte encodings @codepage -> utf8.
- * Attempt to handle the semantics of a specification for multibyte encodings
- * since this is only supposed to be used for single bytes.
- **/
-static GIConv
-gsf_msole_iconv_open_for_import (int codepage)
-{
- return gsf_msole_iconv_open_codepage_for_import ("UTF-8", codepage);
-}
-
-
-
-
-
-static struct EXTRACTOR_Keywords * process(struct GsfInput * in,
- struct EXTRACTOR_Keywords * prev) {
- guint8 const *data = gsf_input_read (in, 28, NULL);
- guint16 version;
- guint32 os, num_sections;
- unsigned i, j;
- GsfMSOleMetaDataSection *sections;
- GsfMSOleMetaDataProp *props;
-
- if (NULL == data)
- return prev;
-
- /* NOTE : high word is the os, low word is the os version
- * 0 = win16
- * 1 = mac
- * 2 = win32
- */
- os = GSF_LE_GET_GUINT16 (data + 6);
-
- version = GSF_LE_GET_GUINT16 (data + 2);
-
- num_sections = GSF_LE_GET_GUINT32 (data + 24);
- if (GSF_LE_GET_GUINT16 (data + 0) != 0xfffe
- || (version != 0 && version != 1)
- || os > 2
- || num_sections > 100) { /* arbitrary sanity check */
- return prev;
+ if ( (strlen(contents) > 0) &&
+ (contents[strlen(contents)-1] == '\n') )
+ contents[strlen(contents)-1] = '\0';
+ if (contents == NULL)
+ return;
+ pos = 0;
+ while (tmap[pos].text != NULL) {
+ if (0 == strcmp(tmap[pos].text,
+ type))
+ break;
+ pos++;
}
-
- /* extract the section info */
- sections = (GsfMSOleMetaDataSection *)g_alloca (sizeof (GsfMSOleMetaDataSection)* num_sections);
- for (i = 0 ; i < num_sections ; i++) {
- data = gsf_input_read (in, 20, NULL);
- if (NULL == data) {
- return prev;
- }
- if (!memcmp (data, component_guid, sizeof (component_guid)))
- sections [i].type = GSF_MSOLE_META_DATA_COMPONENT;
- else if (!memcmp (data, document_guid, sizeof (document_guid)))
- sections [i].type = GSF_MSOLE_META_DATA_DOCUMENT;
- else if (!memcmp (data, user_guid, sizeof (user_guid)))
- sections [i].type = GSF_MSOLE_META_DATA_USER;
- else {
- sections [i].type = GSF_MSOLE_META_DATA_USER;
- warning ("Unknown property section type, treating it as USER");
- }
-
- sections [i].offset = GSF_LE_GET_GUINT32 (data + 16);
-#ifndef NO_DEBUG_OLE_PROPS
- d(printf ("0x%x\n", (guint32)sections [i].offset););
+ if (tmap[pos].text != NULL)
+ *pprev = addKeyword(*pprev,
+ contents,
+ tmap[pos].type);
+#if DEBUG_OLE2
+ else
+ printf("No match for type `%s'\n",
+ type);
#endif
- }
- for (i = 0 ; i < num_sections ; i++) {
- if (gsf_input_seek (in, sections[i].offset, SEEK_SET) ||
- NULL == (data = gsf_input_read (in, 8, NULL))) {
- return prev;
- }
-
- sections[i].iconv_handle = (GIConv)-1;
- sections[i].char_size = 1;
- sections[i].dict = NULL;
- sections[i].size = GSF_LE_GET_GUINT32 (data); /* includes header */
- sections[i].num_props = GSF_LE_GET_GUINT32 (data + 4);
- if (sections[i].num_props <= 0)
- continue;
- props = g_new (GsfMSOleMetaDataProp, sections[i].num_props);
- for (j = 0; j < sections[i].num_props; j++) {
- if (NULL == (data = gsf_input_read (in, 8, NULL))) {
- g_free (props);
- return prev;
- }
-
- props [j].id = GSF_LE_GET_GUINT32 (data);
- props [j].offset = GSF_LE_GET_GUINT32 (data + 4);
- }
-
- /* order prop info by offset to facilitate bounds checking */
- qsort (props, sections[i].num_props,
- sizeof (GsfMSOleMetaDataProp),
- msole_prop_cmp);
+ free(contents);
+}
- sections[i].iconv_handle = (GIConv)-1;
- sections[i].char_size = 1;
- for (j = 0; j < sections[i].num_props; j++) /* first codepage */
- if (props[j].id == 1) {
- GValue *v = msole_prop_read (in, sections+i, props, j);
- if (v != NULL) {
- if (G_IS_VALUE (v)) {
- if (G_VALUE_HOLDS_INT (v)) {
- int codepage = g_value_get_int (v);
- sections[i].iconv_handle = gsf_msole_iconv_open_for_import (codepage);
- if (codepage == 1200 || codepage == 1201)
- sections[i].char_size = 2;
- }
- g_value_unset (v);
- }
- g_free (v) ;
- }
- }
- if (sections[i].iconv_handle == (GIConv)-1)
- sections[i].iconv_handle = gsf_msole_iconv_open_for_import (1252);
- for (j = 0; j < sections[i].num_props; j++) /* then dictionary */
- if (props[j].id == 0) {
- GValue *v = msole_prop_read (in, sections+i, props, j);
- if (v) {
- if (G_VALUE_TYPE(v) == G_TYPE_STRING) {
- gchar * contents = g_strdup_value_contents(v);
- free(contents);
- } else {
-
- /* FIXME: do something with non-strings... */
- }
- if (G_IS_VALUE (v))
- g_value_unset (v);
- g_free (v);
- }
- }
- for (j = 0; j < sections[i].num_props; j++) /* the rest */
- if (props[j].id > 1) {
- GValue *v = msole_prop_read (in, sections+i, props, j);
- if (v && G_IS_VALUE(v)) {
- gchar * contents = NULL;
- int pc;
- int ipc;
-
- if (G_VALUE_TYPE(v) == G_TYPE_STRING) {
- contents = strdup(g_value_get_string(v));
- } else {
- /* convert other formats? */
- contents = g_strdup_value_contents(v);
- }
- pc = 0;
- if (contents != NULL) {
- for (ipc=strlen(contents)-1;ipc>=0;ipc--)
- if ( (isprint(contents[ipc])) &&
- (! isspace(contents[ipc])) )
- pc++;
- if ( (strlen(contents) > 0) &&
- (contents[strlen(contents)-1] == '\n') )
- contents[strlen(contents)-1] = '\0';
- }
- if (pc > 0) {
- int pos = 0;
- const char * prop
- = msole_prop_id_to_gsf(sections+i, props[j].id);
- if (prop != NULL) {
- while (tmap[pos].text != NULL) {
- if (0 == strcmp(tmap[pos].text,
- prop))
- break;
- pos++;
- }
- if (tmap[pos].text != NULL)
- prev = addKeyword(prev,
- contents,
- tmap[pos].type);
- }
- }
- if (contents != NULL)
- free(contents);
- }
- if (v) {
- if (G_IS_VALUE (v))
- g_value_unset (v);
- g_free (v);
- }
- }
+/**
+ * Read the property-set metadata of an OLE2 stream with libgsf and
+ * hand every property to processMetadata, which extends the keyword
+ * list.
+ *
+ * @param in stream containing the property set
+ * @param prev keyword list to extend
+ * @return the (possibly extended) keyword list
+ */
+static struct EXTRACTOR_Keywords *
+process(GsfInput * in,
+        struct EXTRACTOR_Keywords * prev) {
+  GsfDocMetaData * sections;
+  GError * error;
- gsf_iconv_close (sections[i].iconv_handle);
- g_free (props);
- if (sections[i].dict != NULL)
- g_hash_table_destroy (sections[i].dict);
- }
- switch (os) {
- case 0:
- prev = addKeyword(prev,
- "Win16",
- EXTRACTOR_OS);
- break;
- case 1:
- prev = addKeyword(prev,
- "MacOS",
- EXTRACTOR_OS);
- break;
- case 2:
- prev = addKeyword(prev,
- "Win32",
- EXTRACTOR_OS);
- break;
+  sections = gsf_doc_meta_data_new();
+  error = gsf_msole_metadata_read(in, sections);
+  if (error == NULL) {
+    gsf_doc_meta_data_foreach(sections,
+                              &processMetadata,
+                              &prev);
   }
+  else {
+    /* the error object is owned by us; not freeing it leaks
+       memory for every stream that fails to parse */
+    g_error_free(error);
+  }
+  g_object_unref(G_OBJECT(sections));
   return prev;
 }
-static struct EXTRACTOR_Keywords * processSO(struct GsfInput * src,
- struct EXTRACTOR_Keywords * prev) {
+static struct EXTRACTOR_Keywords *
+processSO(GsfInput * src,
+ struct EXTRACTOR_Keywords * prev) {
off_t size;
char * buf;
@@ -2161,61 +247,290 @@ static struct EXTRACTOR_Keywords * processSO(struct GsfInput * src,
return prev;
}
+/* *************** wordleaker stuff *************** */
+
+#define __(a) dgettext("iso-639", a)
+
+/**
+ * Map a Microsoft language identifier (LID) to a translated,
+ * human-readable language name.
+ *
+ * Names wrapped in __() are looked up in the "iso-639" gettext
+ * domain; names wrapped in _() use the translation macro defined
+ * elsewhere in this file.
+ *
+ * @param lid language identifier to translate
+ * @return name of the language, NULL if the identifier is not known
+ */
+static const char * lidToLanguage( unsigned int lid ) {
+  switch ( lid ) {
+  case 0x0400:
+    return _("No Proofing");
+  case 0x0401:
+    return __("Arabic");
+  case 0x0402:
+    return __("Bulgarian");
+  case 0x0403:
+    return __("Catalan");
+  case 0x0404:
+    return _("Traditional Chinese");
+  case 0x0804:
+    return _("Simplified Chinese");
+  case 0x0405:
+    /* 0x0405 is cs-CZ (Czech), not Chechen */
+    return __("Czech");
+  case 0x0406:
+    return __("Danish");
+  case 0x0407:
+    return __("German");
+  case 0x0807:
+    return _("Swiss German");
+  case 0x0408:
+    return __("Greek");
+  case 0x0409:
+    return _("U.S. English");
+  case 0x0809:
+    return _("U.K. English");
+  case 0x0c09:
+    return _("Australian English");
+  case 0x040a:
+    return _("Castilian Spanish");
+  case 0x080a:
+    return _("Mexican Spanish");
+  case 0x040b:
+    return __("Finnish");
+  case 0x040c:
+    return __("French");
+  case 0x080c:
+    return _("Belgian French");
+  case 0x0c0c:
+    return _("Canadian French");
+  case 0x100c:
+    return _("Swiss French");
+  case 0x040d:
+    return __("Hebrew");
+  case 0x040e:
+    return __("Hungarian");
+  case 0x040f:
+    return __("Icelandic");
+  case 0x0410:
+    return __("Italian");
+  case 0x0810:
+    return _("Swiss Italian");
+  case 0x0411:
+    return __("Japanese");
+  case 0x0412:
+    return __("Korean");
+  case 0x0413:
+    return __("Dutch");
+  case 0x0813:
+    return _("Belgian Dutch");
+  case 0x0414:
+    return _("Norwegian Bokmal");
+  case 0x0814:
+    return __("Norwegian Nynorsk");
+  case 0x0415:
+    return __("Polish");
+  case 0x0416:
+    return __("Brazilian Portuguese");
+  case 0x0816:
+    return __("Portuguese");
+  case 0x0417:
+    return _("Rhaeto-Romanic");
+  case 0x0418:
+    return __("Romanian");
+  case 0x0419:
+    return __("Russian");
+  case 0x041a:
+    return _("Croato-Serbian (Latin)");
+  case 0x081a:
+    return _("Serbo-Croatian (Cyrillic)");
+  case 0x041b:
+    return __("Slovak");
+  case 0x041c:
+    return __("Albanian");
+  case 0x041d:
+    return __("Swedish");
+  case 0x041e:
+    return __("Thai");
+  case 0x041f:
+    return __("Turkish");
+  case 0x0420:
+    return __("Urdu");
+  case 0x0421:
+    return __("Bahasa");
+  case 0x0422:
+    return __("Ukrainian");
+  case 0x0423:
+    return __("Byelorussian");
+  case 0x0424:
+    return __("Slovenian");
+  case 0x0425:
+    return __("Estonian");
+  case 0x0426:
+    return __("Latvian");
+  case 0x0427:
+    return __("Lithuanian");
+  case 0x0429:
+    return _("Farsi");
+  case 0x042D:
+    return __("Basque");
+  case 0x042F:
+    return __("Macedonian");
+  case 0x0436:
+    return __("Afrikaans");
+  case 0x043E:
+    /* 0x043E is ms-MY (Malay); Malayalam would be 0x044C */
+    return __("Malay");
+  default:
+    return NULL;
+  }
+}
+
+
+/**
+ * Read one length-prefixed string from a revision table.
+ *
+ * The string is stored as a 16-bit little-endian character count
+ * followed by that many 16-bit little-endian characters.
+ *
+ * @param buf table contents
+ * @param size number of bytes in buf
+ * @param where offset of the length field; advanced past the string
+ *        on success, left untouched otherwise
+ * @return the string converted to UTF-8 (caller must free it), NULL
+ *         if the string does not fit into the table or could not be
+ *         converted
+ */
+static char *
+history_read_string(const unsigned char * buf,
+                    unsigned int size,
+                    unsigned int * where) {
+  unsigned int pos = *where;
+  unsigned int length;
+
+  if ( (pos > size) ||
+       (size - pos < 2) )
+    return NULL;
+  length = buf[pos] + (buf[pos + 1] << 8);
+  pos += 2;
+  /* written as a subtraction so that the check cannot overflow */
+  if (2 * length > size - pos)
+    return NULL;
+  *where = pos + 2 * length;
+  return convertToUtf8((const char*) &buf[pos],
+                       2 * length,
+                       "UTF-16LE");
+}
+
+/**
+ * Extract the "saved by" revision history from a Word table stream
+ * and add one EXTRACTOR_REVISION_HISTORY keyword per revision.
+ *
+ * The table is parsed as raw bytes: the number of strings is a
+ * 16-bit little-endian value at offset 2, the strings start at
+ * offset 6 and alternate between author and file name.
+ *
+ * @param stream stream to read the table from
+ * @param lcbSttbSavedBy size of the table in bytes
+ * @param fcSttbSavedBy offset of the table within the stream
+ * @param prev keyword list to extend
+ * @return the (possibly extended) keyword list
+ */
+static struct EXTRACTOR_Keywords *
+history_extract(GsfInput * stream,
+                unsigned int lcbSttbSavedBy,
+                unsigned int fcSttbSavedBy,
+                struct EXTRACTOR_Keywords * prev) {
+  unsigned int where;
+  unsigned char * lbuffer;
+  unsigned int i;
+  char * author;
+  char * filename;
+  char * rbuf;
+  size_t rlen;
+  unsigned int nRev;
+
+  /* the string count is read from offsets 2 and 3 and the first
+     string starts at offset 6, so anything shorter is useless */
+  if (lcbSttbSavedBy < 6)
+    return prev;
+  /* goto offset of revision (gsf_input_seek returns TRUE on error) */
+  if (gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET))
+    return prev;
+  if (gsf_input_remaining(stream) < lcbSttbSavedBy)
+    return prev;
+  lbuffer = malloc(lcbSttbSavedBy);
+  if (lbuffer == NULL)
+    return prev;
+  /* read all the revision history */
+  if (NULL == gsf_input_read(stream, lcbSttbSavedBy, lbuffer)) {
+    free(lbuffer);
+    return prev;
+  }
+  /* there are n strings, so n/2 revisions (author & file) */
+  nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
+  where = 6;
+  for (i=0; i < nRev; i++) {
+    author = history_read_string(lbuffer, lcbSttbSavedBy, &where);
+    if (author == NULL)
+      break;
+    filename = history_read_string(lbuffer, lcbSttbSavedBy, &where);
+    if (filename == NULL) {
+      free(author);
+      break;
+    }
+    rlen = strlen(author) + strlen(filename) + 512;
+    rbuf = malloc(rlen);
+    if (rbuf == NULL) {
+      free(author);
+      free(filename);
+      break;
+    }
+    snprintf(rbuf, rlen,
+             _("Revision #%u: Author '%s' worked on '%s'"),
+             i, author, filename);
+    free(author);
+    free(filename);
+    prev = addKeyword(prev,
+                      rbuf,
+                      EXTRACTOR_REVISION_HISTORY);
+    free(rbuf);
+  }
+  free(lbuffer);
+  return prev;
+}
+
+
+/* ************** main method *********** */
+
struct EXTRACTOR_Keywords *
libextractor_ole2_extract(const char * filename,
const char * data,
size_t size,
struct EXTRACTOR_Keywords * prev) {
- struct GsfInput *input;
- struct GsfInfileMSOle * infile;
- struct GsfInput * src;
+ GsfInput * input;
+ GsfInfile * infile;
+ GsfInput * src;
+ GError * err = NULL;
const char * name;
- const char * software = 0;
+ const char * software = NULL;
int i;
-
- input = gsf_input_new((const unsigned char*) data,
- (off_t) size,
- 0);
+ unsigned int lcb;
+ unsigned int fcb;
+ const unsigned char * data512;
+ unsigned int lid;
+ const char * lang;
+
+ if (size < 512 + 898)
+ return prev; /* can hardly be OLE2 */
+ input = gsf_input_memory_new((const guint8 *) data,
+ (gsf_off_t) size,
+ FALSE);
if (input == NULL)
return prev;
- infile = gsf_infile_msole_new(input);
- if (infile == NULL)
+ infile = gsf_infile_msole_new(input, &err);
+ if (infile == NULL) {
+ g_object_unref(G_OBJECT(input));
return prev;
-
- for (i=0;i<gsf_infile_msole_num_children(infile);i++) {
- name = gsf_infile_msole_name_by_index (infile, i);
+ }
+ lcb = 0;
+ fcb = 0;
+ for (i=0;i<gsf_infile_num_children(infile);i++) {
+ name = gsf_infile_name_by_index (infile, i);
src = NULL;
if (name == NULL)
continue;
if ( (0 == strcmp(name, "\005SummaryInformation"))
|| (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
- src = gsf_infile_msole_child_by_index (infile, i);
- if (src != NULL)
+ src = gsf_infile_child_by_index (infile, i);
+ if (src != NULL)
prev = process(src,
prev);
}
if (0 == strcmp(name, "SfxDocumentInfo")) {
- src = gsf_infile_msole_child_by_index (infile, i);
+ src = gsf_infile_child_by_index (infile, i);
if (src != NULL)
prev = processSO(src,
prev);
}
if (src != NULL)
- gsf_input_finalize(src);
+ g_object_unref(G_OBJECT(src));
}
- gsf_infile_msole_finalize(infile);
+
+ data512 = (const unsigned char*) &data[512];
+ lid = data512[6] + (data512[7] << 8);
+ lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
+ fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
+ lang = lidToLanguage(lid);
+ if (lang != NULL) {
+ prev = addKeyword(prev,
+ lang,
+ EXTRACTOR_LANGUAGE);
+ }
+ if (lcb >= 6) {
+ for (i=0;i<gsf_infile_num_children(infile);i++) {
+ name = gsf_infile_name_by_index (infile, i);
+ if (name == NULL)
+ continue;
+ if ( (0 == strcmp(name, "1Table")) ||
+ (0 == strcmp(name, "0Table")) ) {
+ src = gsf_infile_child_by_index (infile, i);
+ if (src != NULL) {
+ prev = history_extract(src,
+ lcb,
+ fcb,
+ prev);
+ g_object_unref(G_OBJECT(src));
+ }
+ }
+ }
+ }
+ g_object_unref(G_OBJECT(infile));
/*
* Hack to return an appropriate mimetype
*/
software = EXTRACTOR_extractLast(EXTRACTOR_SOFTWARE, prev);
- if(NULL == software) {
+ if (NULL == software) {
/*
* when very puzzled, just look at file magic number
*/
- if( (8 < size)
- && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
+ if ( (8 < size)
+ && (0 == memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8)) )
software = "Microsoft Office";
}
diff --git a/src/plugins/wordleaker/Makefile.am b/src/plugins/wordleaker/Makefile.am
@@ -1,25 +0,0 @@
-include ../Makefile-plugins.am
-
-plugin_LTLIBRARIES = \
- libextractor_word.la
-
-libextractor_word_la_LINK = \
- /bin/sh ../../../libtool --mode=link $(CXXLD) -o libextractor_word.la
-libextractor_word_la_LDFLAGS = \
- $(PLUGINFLAGS) $(retaincommand) \
- $(XTRA_CPPLIBS)
-libextractor_word_la_LIBADD = \
- $(top_builddir)/src/main/libextractor.la \
- $(top_builddir)/src/plugins/libconvert.la \
- -lm
-
-libextractor_word_la_SOURCES = \
- pole.h pole.cpp \
- wordleaker.h \
- wordextractor.cc
-
-# gcc 3.3 produces BROKEN code for -O1 and -O2 (PDF extraction
-# would fail silently) hence we MUST override the user flag here
-# which may contain -O1 or -O2!
-# CXXFLAGS = -O0
-
diff --git a/src/plugins/wordleaker/SYMBOLS b/src/plugins/wordleaker/SYMBOLS
@@ -1 +0,0 @@
-libextractor_word_extract
diff --git a/src/plugins/wordleaker/pole.cpp b/src/plugins/wordleaker/pole.cpp
@@ -1,1271 +0,0 @@
-/* POLE - Portable C++ library to access OLE Storage
- Copyright (C) 2002-2004 Ariya Hidayat <ariya@kde.org>
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public License
- along with this library; see the file COPYING.LIB. If not, write to
- the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, US
-*/
-
-#include <fstream>
-#include <iostream>
-#include <list>
-#include <string>
-#include <vector>
-
-#include "pole.h"
-
-namespace POLE
-{
-
-class Header
-{
- public:
- unsigned char id[8]; // signature, or magic identifier
- unsigned b_shift; // bbat->blockSize = 1 << b_shift
- unsigned s_shift; // sbat->blockSize = 1 << s_shift
- unsigned num_bat; // blocks allocated for big bat
- unsigned dirent_start; // starting block for directory info
- unsigned threshold; // switch from small to big file (usually 4K)
- unsigned sbat_start; // starting block index to store small bat
- unsigned num_sbat; // blocks allocated for small bat
- unsigned mbat_start; // starting block to store meta bat
- unsigned num_mbat; // blocks allocated for meta bat
- unsigned long bb_blocks[109];
-
- Header();
- void load( const unsigned char* buffer );
- void save( unsigned char* buffer );
- void debug();
-};
-
-class AllocTable
-{
- public:
- static const unsigned Eof;
- static const unsigned Avail;
- static const unsigned Bat;
- unsigned blockSize;
- AllocTable();
- void clear();
- unsigned long count();
- void resize( unsigned long newsize );
- void preserve( unsigned long n );
- void set( unsigned long index, unsigned long val );
- unsigned unused();
- void setChain( std::vector<unsigned long> );
- std::vector<unsigned long> follow( unsigned long start );
- unsigned long operator[](unsigned long index );
- void load( const unsigned char* buffer, unsigned len );
- void save( unsigned char* buffer );
- unsigned size();
- void debug();
- private:
- std::vector<unsigned long> data;
- AllocTable( const AllocTable& );
- AllocTable& operator=( const AllocTable& );
-};
-
-class DirEntry
-{
- public:
- std::string name;
- bool dir; // true if directory
- unsigned long size; // size (not valid if directory)
- unsigned long start; // starting block
- unsigned prev; // previous sibling
- unsigned next; // next sibling
- unsigned child; // first child
-};
-
-class DirTree
-{
- public:
- static const unsigned End;
- DirTree();
- void clear();
- unsigned entryCount();
- DirEntry* entry( unsigned index );
- DirEntry* entry( const std::string& name, bool create=false );
- int indexOf( DirEntry* e );
- int parent( unsigned index );
- std::string fullName( unsigned index );
- std::vector<unsigned> children( unsigned index );
- std::vector<DirEntry*> listDirectory();
- bool enterDirectory( const std::string& dir );
- void leaveDirectory();
- std::string path();
- void load( unsigned char* buffer, unsigned len );
- void save( unsigned char* buffer );
- unsigned size();
- void debug();
- private:
- unsigned current;
- std::vector<DirEntry> entries;
- DirTree( const DirTree& );
- DirTree& operator=( const DirTree& );
-};
-
-class StorageIO
-{
- public:
- Storage* storage;
- std::string filename;
- std::fstream file;
- int result; // result of operation
- bool opened; // true if file is opened
- unsigned long filesize; // size of the file
-
- Header* header; // storage header
- DirTree* dirtree; // directory tree
- AllocTable* bbat; // allocation table for big blocks
- AllocTable* sbat; // allocation table for small blocks
-
- std::vector<unsigned long> sb_blocks; // blocks for "small" files
-
- std::list<Stream*> streams;
-
- StorageIO( Storage* storage, const char* filename );
- ~StorageIO();
-
- bool open();
- void close();
- void flush();
- void load();
- void create();
-
- unsigned long loadBigBlocks( std::vector<unsigned long> blocks, unsigned char* buffer, unsigned long maxlen );
-
- unsigned long loadBigBlock( unsigned long block, unsigned char* buffer, unsigned long maxlen );
-
- unsigned long loadSmallBlocks( std::vector<unsigned long> blocks, unsigned char* buffer, unsigned long maxlen );
-
- unsigned long loadSmallBlock( unsigned long block, unsigned char* buffer, unsigned long maxlen );
-
- private:
- // no copy or assign
- StorageIO( const StorageIO& );
- StorageIO& operator=( const StorageIO& );
-
-};
-
-class StreamImpl
-{
- public:
- StreamImpl( StorageIO* io, DirEntry* entry );
- ~StreamImpl();
- unsigned long size();
- void seek( unsigned long pos );
- unsigned long tell();
- int getch();
- unsigned long read( unsigned char* data, unsigned long maxlen );
- unsigned long read( unsigned long pos, unsigned char* data, unsigned long maxlen );
-
- StorageIO* io;
- DirEntry* entry;
-
- private:
- std::vector<unsigned long> blocks;
-
- // no copy or assign
- StreamImpl( const StreamImpl& );
- StreamImpl& operator=( const StreamImpl& );
-
- // pointer for read
- unsigned long m_pos;
-
- // simple cache system to speed-up getch()
- unsigned char* cache_data;
- unsigned long cache_size;
- unsigned long cache_pos;
- void updateCache();
-};
-
-}; // namespace POLE
-
-using namespace POLE;
-
-static inline unsigned long readU16( const unsigned char* ptr )
-{
- return ptr[0]+(ptr[1]<<8);
-}
-
-static inline unsigned long readU32( const unsigned char* ptr )
-{
- return ptr[0]+(ptr[1]<<8)+(ptr[2]<<16)+(ptr[3]<<24);
-}
-
-static inline void writeU16( unsigned char* ptr, unsigned long data )
-{
- ptr[0] = data & 0xff;
- ptr[1] = (data >> 8) & 0xff;
-}
-
-static inline void writeU32( unsigned char* ptr, unsigned long data )
-{
- ptr[0] = data & 0xff;
- ptr[1] = (data >> 8) & 0xff;
- ptr[2] = (data >> 16) & 0xff;
- ptr[3] = (data >> 24) & 0xff;
-}
-
-static const unsigned char pole_magic[] =
- { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
-
-// =========== Header ==========
-
-Header::Header()
-{
- b_shift = 9;
- s_shift = 6;
- num_bat = 0;
- dirent_start = 0;
- threshold = 4096;
- sbat_start = 0;
- num_sbat = 0;
- mbat_start = 0;
- num_mbat = 0;
-
- for( unsigned i = 0; i < 8; i++ )
- id[i] = pole_magic[i];
- for( unsigned i=0; i<109; i++ )
- bb_blocks[i] = AllocTable::Avail;
-}
-
-void Header::load( const unsigned char* buffer )
-{
- b_shift = readU16( buffer + 0x1e );
- s_shift = readU16( buffer + 0x20 );
- num_bat = readU32( buffer + 0x2c );
- dirent_start = readU32( buffer + 0x30 );
- threshold = readU32( buffer + 0x38 );
- sbat_start = readU32( buffer + 0x3c );
- num_sbat = readU32( buffer + 0x40 );
- mbat_start = readU32( buffer + 0x44 );
- num_mbat = readU32( buffer + 0x48 );
-
- for( unsigned i = 0; i < 8; i++ )
- id[i] = buffer[i];
- for( unsigned i=0; i<109; i++ )
- bb_blocks[i] = readU32( buffer + 0x4C+i*4 );
-}
-
-void Header::save( unsigned char* buffer )
-{
- memset( buffer, 0, 0x4c );
- memcpy( buffer, pole_magic, 8 ); // ole signature
- writeU32( buffer + 8, 0 ); // unknown
- writeU32( buffer + 12, 0 ); // unknown
- writeU32( buffer + 16, 0 ); // unknown
- writeU16( buffer + 24, 0x003e ); // revision ?
- writeU16( buffer + 26, 3 ); // version ?
- writeU16( buffer + 28, 0xfffe ); // unknown
- writeU16( buffer + 0x1e, b_shift );
- writeU16( buffer + 0x20, s_shift );
- writeU32( buffer + 0x2c, num_bat );
- writeU32( buffer + 0x30, dirent_start );
- writeU32( buffer + 0x38, threshold );
- writeU32( buffer + 0x3c, sbat_start );
- writeU32( buffer + 0x40, num_sbat );
- writeU32( buffer + 0x44, mbat_start );
- writeU32( buffer + 0x48, num_mbat );
-
- for( unsigned i=0; i<109; i++ )
- writeU32( buffer + 0x4C+i*4, bb_blocks[i] );
-}
-
-void Header::debug()
-{
- std::cout << std::endl;
- std::cout << "b_shift " << b_shift << std::endl;
- std::cout << "s_shift " << s_shift << std::endl;
- std::cout << "num_bat " << num_bat << std::endl;
- std::cout << "dirent_start " << dirent_start << std::endl;
- std::cout << "threshold " << threshold << std::endl;
- std::cout << "sbat_start " << sbat_start << std::endl;
- std::cout << "num_sbat " << num_sbat << std::endl;
- std::cout << "mbat_start " << mbat_start << std::endl;
- std::cout << "num_mbat " << num_mbat << std::endl;
-
- unsigned s = (num_bat<=109) ? num_bat : 109;
- std::cout << "bat blocks: ";
- for( unsigned i = 0; i < s; i++ )
- std::cout << bb_blocks[i] << " ";
- std::cout << std::endl;
-}
-
-// =========== AllocTable ==========
-
-const unsigned AllocTable::Avail = 0xffffffff;
-const unsigned AllocTable::Eof = 0xfffffffe;
-const unsigned AllocTable::Bat = 0xfffffffd;
-
-AllocTable::AllocTable()
-{
- blockSize = 4096;
- // initial size
- resize( 128 );
-}
-
-unsigned long AllocTable::count()
-{
- return data.size();
-}
-
-void AllocTable::resize( unsigned long newsize )
-{
- unsigned oldsize = data.size();
- data.resize( newsize );
- if( newsize > oldsize )
- for( unsigned i = oldsize; i<newsize; i++ )
- data[i] = Avail;
-}
-
-// make sure there're still free blocks
-void AllocTable::preserve( unsigned long n )
-{
- std::vector<unsigned long> pre;
- for( unsigned i=0; i < n; i++ )
- pre.push_back( unused() );
-}
-
-unsigned long AllocTable::operator[]( unsigned long index )
-{
- unsigned long result;
- result = data[index];
- return result;
-}
-
-void AllocTable::set( unsigned long index, unsigned long value )
-{
- if( index >= count() ) resize( index + 1);
- data[ index ] = value;
-}
-
-void AllocTable::setChain( std::vector<unsigned long> chain )
-{
- if( chain.size() )
- {
- for( unsigned i=0; i<chain.size()-1; i++ )
- set( chain[i], chain[i+1] );
- set( chain[ chain.size()-1 ], AllocTable::Eof );
- }
-}
-
-// follow
-std::vector<unsigned long> AllocTable::follow( unsigned long start )
-{
- std::vector<unsigned long> chain;
-
- if( start >= count() ) return chain;
-
- unsigned long p = start;
- while( p < count() )
- {
- if( p >= (unsigned long)Eof ) break;
- if( p >= count() ) break;
- chain.push_back( p );
- if( data[p] >= count() ) break;
- p = data[ p ];
- }
-
- return chain;
-}
-
-unsigned AllocTable::unused()
-{
- // find first available block
- for( unsigned i = 0; i < data.size(); i++ )
- if( data[i] == Avail )
- return i;
-
- // completely full, so enlarge the table
- unsigned block = data.size();
- resize( data.size()+10 );
- return block;
-}
-
-void AllocTable::load( const unsigned char* buffer, unsigned len )
-{
- resize( len / 4 );
- for( unsigned i = 0; i < count(); i++ )
- set( i, readU32( buffer + i*4 ) );
-}
-
-// return space required to save this dirtree
-unsigned AllocTable::size()
-{
- return count() * 4;
-}
-
-void AllocTable::save( unsigned char* buffer )
-{
- for( unsigned i = 0; i < count(); i++ )
- writeU32( buffer + i*4, data[i] );
-}
-
-void AllocTable::debug()
-{
- std::cout << "block size " << data.size() << std::endl;
- for( unsigned i=0; i< data.size(); i++ )
- {
- if( data[i] == Avail ) continue;
- std::cout << i << ": ";
- if( data[i] == Eof ) std::cout << "eof";
- else std::cout << data[i];
- std::cout << std::endl;
- }
-}
-
-// =========== DirTree ==========
-
-const unsigned DirTree::End = 0xffffffff;
-
-DirTree::DirTree()
-{
- current = 0;
- clear();
-}
-
-void DirTree::clear()
-{
- // leave only root entry
- entries.resize( 1 );
- entries[0].name = "Root Entry";
- entries[0].dir = true;
- entries[0].size = 0;
- entries[0].start = End;
- entries[0].prev = End;
- entries[0].next = End;
- entries[0].child = End;
- current = 0;
-}
-
-unsigned DirTree::entryCount()
-{
- return entries.size();
-}
-
-DirEntry* DirTree::entry( unsigned index )
-{
- if( index >= entryCount() ) return (DirEntry*) 0;
- return &entries[ index ];
-}
-
-int DirTree::indexOf( DirEntry* e )
-{
- for( unsigned i = 0; i < entryCount(); i++ )
- if( entry( i ) == e ) return i;
-
- return -1;
-}
-
-int DirTree::parent( unsigned index )
-{
- // brute-force, basically we iterate for each entries, find its children
- // and check if one of the children is 'index'
- for( unsigned j=0; j<entryCount(); j++ )
- {
- std::vector<unsigned> chi = children( j );
- for( unsigned i=0; i<chi.size();i++ )
- if( chi[i] == index )
- return j;
- }
-
- return -1;
-}
-
-std::string DirTree::fullName( unsigned index )
-{
- // don't use root name ("Root Entry"), just give "/"
- if( index == 0 ) return "/";
-
- std::string result = entry( index )->name;
- result.insert( 0, "/" );
- int p = parent( index );
- while( p > 0 )
- {
- result.insert( 0, entry( p )->name );
- result.insert( 0, "/" );
- index = p;
- if( index <= 0 ) break;
- }
- return result;
-}
-
-// given a fullname (e.g "/ObjectPool/_1020961869"), find the entry
-// if not found and create is false, return 0
-// if create is true, a new entry is returned
-DirEntry* DirTree::entry( const std::string& name, bool create )
-{
- if( !name.length() ) return (DirEntry*)0;
-
- // quick check for "/" (that's root)
- if( name == "/" ) return entry( 0 );
-
- // split the names, e.g "/ObjectPool/_1020961869" will become:
- // "ObjectPool" and "_1020961869"
- std::list<std::string> names;
- std::string::size_type start = 0, end = 0;
- while( start < name.length() )
- {
- end = name.find_first_of( '/', start );
- if( end == std::string::npos ) end = name.length();
- names.push_back( name.substr( start, end-start ) );
- start = end+1;
- }
-
- // start from root when name is absolute
- // or current directory when name is relative
- int index = (name[0] == '/' ) ? 0 : current;
-
- // trace one by one
- std::list<std::string>::iterator it;
- for( it = names.begin(); it != names.end(); ++it )
- {
- // find among the children of index
- std::vector<unsigned> chi = children( index );
- unsigned child = 0;
- for( unsigned i = 0; i < chi.size(); i++ )
- {
- DirEntry* ce = entry( chi[i] );
- if( ce ) if( ce->name == *it )
- child = chi[i];
- }
-
- // traverse to the child
- if( child > 0 ) index = child;
- else
- {
- // not found among children
- if( !create ) return (DirEntry*)0;
-
- // create a new entry
- unsigned parent = index;
- entries.push_back( DirEntry() );
- index = entryCount()-1;
- DirEntry* e = entry( index );
- e->name = *it;
- e->dir = false;
- e->size = 0;
- e->start = 0;
- e->child = End;
- e->prev = End;
- e->next = entry(parent)->child;
- entry(parent)->child = index;
- }
- }
-
- return entry( index );
-}
-
-// helper function: recursively find siblings of index
-void dirtree_find_siblings( DirTree* dirtree, std::vector<unsigned>& result,
- unsigned index )
-{
- DirEntry* e = dirtree->entry( index );
- if( !e ) return;
-
- // prevent infinite loop
- for( unsigned i = 0; i < result.size(); i++ )
- if( result[i] == index ) return;
-
- // add myself
- result.push_back( index );
-
- // visit previous sibling, don't go infinitely
- unsigned prev = e->prev;
- if( ( prev > 0 ) && ( prev < dirtree->entryCount() ) )
- {
- for( unsigned i = 0; i < result.size(); i++ )
- if( result[i] == prev ) prev = 0;
- if( prev ) dirtree_find_siblings( dirtree, result, prev );
- }
-
- // visit next sibling, don't go infinitely
- unsigned next = e->next;
- if( ( next > 0 ) && ( next < dirtree->entryCount() ) )
- {
- for( unsigned i = 0; i < result.size(); i++ )
- if( result[i] == next ) next = 0;
- if( next ) dirtree_find_siblings( dirtree, result, next );
- }
-}
-
-std::vector<unsigned> DirTree::children( unsigned index )
-{
- std::vector<unsigned> result;
-
- DirEntry* e = entry( index );
- if( e ) if( e->child < entryCount() )
- dirtree_find_siblings( this, result, e->child );
-
- return result;
-}
-
-std::vector<DirEntry*> DirTree::listDirectory()
-{
- std::vector<DirEntry*> result;
-
- std::vector<unsigned> chi = children( current );
- for( unsigned i = 0; i < chi.size(); i++ )
- result.push_back( entry( chi[i] ) );
-
- return result;
-}
-
-bool DirTree::enterDirectory( const std::string& dir )
-{
- DirEntry* e = entry( dir );
- if( !e ) return false;
- if( !e->dir ) return false;
-
- int index = indexOf( e );
- if( index < 0 ) return false;
-
- current = index;
- return true;
-}
-
-void DirTree::leaveDirectory()
-{
- // already at root ?
- if( current == 0 ) return;
-
- int p = parent( current );
- if( p >= 0 ) current = p;
-}
-
-std::string DirTree::path()
-{
- return fullName( current );
-}
-
-void DirTree::load( unsigned char* buffer, unsigned size )
-{
- entries.clear();
- current = 0;
-
- for( unsigned i = 0; i < size/128; i++ )
- {
- unsigned p = i * 128;
-
- // would be < 32 if first char in the name isn't printable
- unsigned prefix = 32;
-
- // parse name of this entry, which stored as Unicode 16-bit
- std::string name;
- int name_len = readU16( buffer + 0x40+p );
- for( int j=0; ( buffer[j+p]) && (j<name_len); j+= 2 )
- name.append( 1, buffer[j+p] );
-
- // first char isn't printable ? remove it...
- if( buffer[p] < 32 )
- {
- prefix = buffer[0];
- name.erase( 0,1 );
- }
-
- DirEntry e;
- e.name = name;
- e.start = readU32( buffer + 0x74+p );
- e.size = readU32( buffer + 0x78+p );
- e.prev = readU32( buffer + 0x44+p );
- e.next = readU32( buffer + 0x48+p );
- e.child = readU32( buffer + 0x4C+p );
- e.dir = ( buffer[ 0x42 + p]!=2 );
-
- entries.push_back( e );
- }
-}
-
-// return space required to save this dirtree
-unsigned DirTree::size()
-{
- return entryCount() * 128;
-}
-
-void DirTree::save( unsigned char* buffer )
-{
- memset( buffer, 0, size() );
-
- // root is fixed as "Root Entry"
- DirEntry* root = entry( 0 );
- std::string name = "Root Entry";
- for( unsigned j = 0; j < name.length(); j++ )
- buffer[ j*2 ] = name[j];
- writeU16( buffer + 0x40, name.length()*2 + 2 );
- writeU32( buffer + 0x74, 0xffffffff );
- writeU32( buffer + 0x78, 0 );
- writeU32( buffer + 0x44, 0xffffffff );
- writeU32( buffer + 0x48, 0xffffffff );
- writeU32( buffer + 0x4c, root->child );
- buffer[ 0x42 ] = 5;
- buffer[ 0x43 ] = 1;
-
- for( unsigned i = 1; i < entryCount(); i++ )
- {
- DirEntry* e = entry( i );
- if( !e ) continue;
- if( e->dir )
- {
- e->start = 0xffffffff;
- e->size = 0;
- }
-
- // max length for name is 32 chars
- std::string name = e->name;
- if( name.length() > 32 )
- name.erase( 32, name.length() );
-
- // write name as Unicode 16-bit
- for( unsigned j = 0; j < name.length(); j++ )
- buffer[ i*128 + j*2 ] = name[j];
-
- writeU16( buffer + i*128 + 0x40, name.length()*2 + 2 );
- writeU32( buffer + i*128 + 0x74, e->start );
- writeU32( buffer + i*128 + 0x78, e->size );
- writeU32( buffer + i*128 + 0x44, e->prev );
- writeU32( buffer + i*128 + 0x48, e->next );
- writeU32( buffer + i*128 + 0x4c, e->child );
- buffer[ i*128 + 0x42 ] = e->dir ? 1 : 2;
- buffer[ i*128 + 0x43 ] = 1; // always black
- }
-}
-
-void DirTree::debug()
-{
- for( unsigned i = 0; i < entryCount(); i++ )
- {
- DirEntry* e = entry( i );
- if( !e ) continue;
- std::cout << i << ": ";
- std::cout << e->name << " ";
- if( e->dir ) std::cout << "(Dir) ";
- else std::cout << "(File) ";
- std::cout << e->size << " ";
- std::cout << "s:" << e->start << " ";
- std::cout << "(";
- if( e->child == End ) std::cout << "-"; else std::cout << e->child;
- std::cout << " ";
- if( e->prev == End ) std::cout << "-"; else std::cout << e->prev;
- std::cout << ":";
- if( e->next == End ) std::cout << "-"; else std::cout << e->next;
- std::cout << ")";
- std::cout << std::endl;
- }
-}
-
-// =========== StorageIO ==========
-
-StorageIO::StorageIO( Storage* st, const char* fname )
-{
- storage = st;
- filename = fname;
- result = Storage::Ok;
- opened = false;
-
- header = new Header();
- dirtree = new DirTree();
- bbat = new AllocTable();
- sbat = new AllocTable();
-
- filesize = 0;
- bbat->blockSize = 1 << header->b_shift;
- sbat->blockSize = 1 << header->s_shift;
-}
-
-StorageIO::~StorageIO()
-{
- if( opened ) close();
- delete sbat;
- delete bbat;
- delete dirtree;
- delete header;
-}
-
-bool StorageIO::open()
-{
- // already opened ? close first
- if( opened ) close();
-
- load();
-
- return result == Storage::Ok;
-}
-
-void StorageIO::load()
-{
- unsigned char* buffer = 0;
- unsigned long buflen = 0;
- std::vector<unsigned long> blocks;
-
- // open the file, check for error
- result = Storage::OpenFailed;
- file.open( filename.c_str(), std::ios::binary | std::ios::in );
- if( !file.good() ) return;
-
- // find size of input file
- file.seekg( 0, std::ios::end );
- filesize = file.tellg();
-
- // load header
- buffer = new unsigned char[512];
- file.seekg( 0 );
- file.read( (char*)buffer, 512 );
- header->load( buffer );
- delete[] buffer;
-
- // check OLE magic id
- result = Storage::NotOLE;
- for( unsigned i=0; i<8; i++ )
- if( header->id[i] != pole_magic[i] )
- return;
-
- // sanity checks
- result = Storage::BadOLE;
- if( header->threshold != 4096 ) return;
- if( header->num_bat == 0 ) return;
- if( header->s_shift > header->b_shift ) return;
- if( header->b_shift <= 6 ) return;
- if( header->b_shift >=31 ) return;
-
- // important block size
- bbat->blockSize = 1 << header->b_shift;
- sbat->blockSize = 1 << header->s_shift;
-
- // find blocks allocated to store big bat
- // the first 109 blocks are in header, the rest in meta bat
- blocks.resize( header->num_bat );
- for( unsigned i = 0; i < header->num_bat; i++ )
- if( i < 109 ) blocks[i] = header->bb_blocks[i];
- if( header->num_bat > 109 )
- if( header->num_mbat > 0 )
- {
- buffer = new unsigned char[ bbat->blockSize ];
- unsigned k = 109;
- for( unsigned r = 0; r < header->num_mbat; r++ )
- {
- loadBigBlock( header->mbat_start+r, buffer, bbat->blockSize );
- for( unsigned s=0; s < bbat->blockSize/4; s+=4 )
- blocks[k++] = readU32( buffer + s );
- // FIXME check if k > num_bat
- }
- delete[] buffer;
- }
-
- // load big bat
- buflen = blocks.size()*bbat->blockSize;
- buffer = new unsigned char[ buflen ];
- loadBigBlocks( blocks, buffer, buflen );
- bbat->load( buffer, buflen );
- delete[] buffer;
-
- // load small bat
- blocks.clear();
- blocks = bbat->follow( header->sbat_start );
- buflen = blocks.size()*bbat->blockSize;
- buffer = new unsigned char[ buflen ];
- loadBigBlocks( blocks, buffer, buflen );
- sbat->load( buffer, buflen );
- delete[] buffer;
-
- // load directory tree
- blocks = bbat->follow( header->dirent_start );
- buflen = blocks.size()*bbat->blockSize;
- buffer = new unsigned char[ buflen ];
- loadBigBlocks( blocks, buffer, buflen );
- sb_blocks = bbat->follow( readU32( buffer + 0x74 ) ); // small files
- dirtree->load( buffer, buflen );
-
- // fetch block chain as data for small-files
- delete[] buffer;
-
- // so far so good
- result = Storage::Ok;
- opened = true;
-}
-
-void StorageIO::create()
-{
- // std::cout << "Creating " << filename << std::endl;
-
- file.open( filename.c_str(), std::ios::out|std::ios::binary );
- if( !file.good() )
- {
- std::cerr << "Can't create " << filename << std::endl;
- result = Storage::OpenFailed;
- return;
- }
-
- // so far so good
- opened = true;
- result = Storage::Ok;
-}
-
-void StorageIO::close()
-{
- if( !opened ) return;
-
- file.close();
- opened = false;
-
- std::list<Stream*>::iterator it;
- for( it = streams.begin(); it != streams.end(); ++it )
- delete *it;
-}
-
-unsigned long StorageIO::loadBigBlocks( std::vector<unsigned long> blocks,
- unsigned char* data, unsigned long maxlen )
-{
- // sentinel
- if( !data ) return 0;
- if( !file.good() ) return 0;
- if( blocks.size() < 1 ) return 0;
- if( maxlen == 0 ) return 0;
-
- // read block one by one, seems fast enough
- unsigned long bytes = 0;
- for( unsigned long i=0; (i < blocks.size() ) & ( bytes<maxlen ); i++ )
- {
- unsigned long block = blocks[i];
- if( block < 0 ) continue;
- unsigned long pos = bbat->blockSize * ( block+1 );
- unsigned long p = (bbat->blockSize < maxlen-bytes) ? bbat->blockSize : maxlen-bytes;
- if( pos + p > filesize ) p = filesize - pos;
- file.seekg( pos );
- file.read( (char*)data + bytes, p );
- bytes += p;
- }
-
- return bytes;
-}
-
-unsigned long StorageIO::loadBigBlock( unsigned long block,
- unsigned char* data, unsigned long maxlen )
-{
- // sentinel
- if( !data ) return 0;
- if( !file.good() ) return 0;
- if( block < 0 ) return 0;
-
- // wraps call for loadBigBlocks
- std::vector<unsigned long> blocks;
- blocks.resize( 1 );
- blocks[ 0 ] = block;
-
- return loadBigBlocks( blocks, data, maxlen );
-}
-
-// return number of bytes which has been read
-unsigned long StorageIO::loadSmallBlocks( std::vector<unsigned long> blocks,
- unsigned char* data, unsigned long maxlen )
-{
- // sentinel
- if( !data ) return 0;
- if( !file.good() ) return 0;
- if( blocks.size() < 1 ) return 0;
- if( maxlen == 0 ) return 0;
-
- // our own local buffer
- unsigned char buf[ bbat->blockSize ];
-
- // read small block one by one
- unsigned long bytes = 0;
- for( unsigned long i=0; ( i<blocks.size() ) & ( bytes<maxlen ); i++ )
- {
- unsigned long block = blocks[i];
- if( block < 0 ) continue;
-
- // find where the small-block exactly is
- unsigned long pos = block * sbat->blockSize;
- unsigned long bbindex = pos / bbat->blockSize;
- if( bbindex >= sb_blocks.size() ) break;
-
- loadBigBlock( sb_blocks[ bbindex ], buf, bbat->blockSize );
-
- // copy the data
- unsigned offset = pos % bbat->blockSize;
- unsigned long p = (maxlen-bytes < bbat->blockSize-offset ) ? maxlen-bytes : bbat->blockSize-offset;
- p = (sbat->blockSize<p ) ? sbat->blockSize : p;
- memcpy( data + bytes, buf + offset, p );
- bytes += p;
- }
-
- return bytes;
-}
-
-unsigned long StorageIO::loadSmallBlock( unsigned long block,
- unsigned char* data, unsigned long maxlen )
-{
- // sentinel
- if( !data ) return 0;
- if( !file.good() ) return 0;
- if( block < 0 ) return 0;
-
- // wraps call for loadSmallBlocks
- std::vector<unsigned long> blocks;
- blocks.resize( 1 );
- blocks.assign( 1, block );
-
- return loadSmallBlocks( blocks, data, maxlen );
-}
-
-// =========== StreamImpl ==========
-
-StreamImpl::StreamImpl( StorageIO* s, DirEntry* e)
-{
- io = s;
- entry = e;
- m_pos = 0;
-
- if( entry->size >= io->header->threshold )
- blocks = io->bbat->follow( entry->start );
- else
- blocks = io->sbat->follow( entry->start );
-
- // prepare cache
- cache_pos = 0;
- cache_size = 4096; // optimal ?
- cache_data = new unsigned char[cache_size];
- updateCache();
-}
-
-// FIXME tell parent we're gone
-StreamImpl::~StreamImpl()
-{
- delete[] cache_data;
-}
-
-void StreamImpl::seek( unsigned long pos )
-{
- m_pos = pos;
-}
-
-unsigned long StreamImpl::tell()
-{
- return m_pos;
-}
-
-int StreamImpl::getch()
-{
- // past end-of-file ?
- if( m_pos > entry->size ) return -1;
-
- // need to update cache ?
- if( !cache_size || ( m_pos < cache_pos ) ||
- ( m_pos >= cache_pos + cache_size ) )
- updateCache();
-
- // something bad if we don't get good cache
- if( !cache_size ) return -1;
-
- int data = cache_data[m_pos - cache_pos];
- m_pos++;
-
- return data;
-}
-
-unsigned long StreamImpl::read( unsigned long pos, unsigned char* data, unsigned long maxlen )
-{
- // sanity checks
- if( !data ) return 0;
- if( maxlen == 0 ) return 0;
-
- unsigned long totalbytes = 0;
-
- if ( entry->size < io->header->threshold )
- {
- // small file
- unsigned long index = pos / io->sbat->blockSize;
-
- if( index >= blocks.size() ) return 0;
-
- unsigned char buf[ io->sbat->blockSize ];
- unsigned long offset = pos % io->sbat->blockSize;
- while( totalbytes < maxlen )
- {
- if( index >= blocks.size() ) break;
- io->loadSmallBlock( blocks[index], buf, io->bbat->blockSize );
- unsigned long count = io->sbat->blockSize - offset;
- if( count > maxlen-totalbytes ) count = maxlen-totalbytes;
- memcpy( data+totalbytes, buf + offset, count );
- totalbytes += count;
- offset = 0;
- index++;
- }
-
- }
- else
- {
- // big file
- unsigned long index = pos / io->bbat->blockSize;
-
- if( index >= blocks.size() ) return 0;
-
- unsigned char buf[ io->bbat->blockSize ];
- unsigned long offset = pos % io->bbat->blockSize;
- while( totalbytes < maxlen )
- {
- if( index >= blocks.size() ) break;
- io->loadBigBlock( blocks[index], buf, io->bbat->blockSize );
- unsigned long count = io->bbat->blockSize - offset;
- if( count > maxlen-totalbytes ) count = maxlen-totalbytes;
- memcpy( data+totalbytes, buf + offset, count );
- totalbytes += count;
- index++;
- offset = 0;
- }
-
- }
-
- return totalbytes;
-}
-
-unsigned long StreamImpl::read( unsigned char* data, unsigned long maxlen )
-{
- unsigned long bytes = read( tell(), data, maxlen );
- m_pos += bytes;
- return bytes;
-}
-
-void StreamImpl::updateCache()
-{
- // sanity check
- if( !cache_data ) return;
-
- cache_pos = m_pos - ( m_pos % cache_size );
- unsigned long bytes = cache_size;
- if( cache_pos + bytes > entry->size ) bytes = entry->size - cache_pos;
- cache_size = read( cache_pos, cache_data, bytes );
-}
-
-
-// =========== Storage ==========
-
-Storage::Storage( const char* filename )
-{
- io = new StorageIO( this, filename );
-}
-
-Storage::~Storage()
-{
- delete io;
-}
-
-int Storage::result()
-{
- return io->result;
-}
-
-bool Storage::open()
-{
- return io->open();
-}
-
-void Storage::close()
-{
- io->close();
-}
-
-// list all files and subdirs in current path
-std::list<std::string> Storage::listDirectory()
-{
- std::list<std::string> result;
-
- std::vector<DirEntry*> entries;
- entries = io->dirtree->listDirectory();
- for( unsigned i = 0; i < entries.size(); i++ )
- result.push_back( entries[i]->name );
-
- return result;
-}
-
-// enters a sub-directory, returns false if not a directory or not found
-bool Storage::enterDirectory( const std::string& directory )
-{
- return io->dirtree->enterDirectory( directory );
-}
-
-// goes up one level (like cd ..)
-void Storage::leaveDirectory()
-{
- return io->dirtree->leaveDirectory();
-}
-
-std::string Storage::path()
-{
- return io->dirtree->path();
-}
-
-Stream* Storage::stream( const std::string& name )
-{
- // sanity check
- if( !name.length() ) return (Stream*)0;
- if( !io ) return (Stream*)0;
-
- // make absolute if necesary
- std::string fullName = name;
- if( name[0] != '/' ) fullName.insert( 0, path() + "/" );
-
- DirEntry* entry = io->dirtree->entry( name );
- if( !entry ) return (Stream*)0;
-
- Stream* s = new Stream();
- s->impl = new StreamImpl( io, entry );
- io->streams.push_back( s );
-
- return s;
-}
-
-
-
-// =========== Stream ==========
-
-Stream::Stream()
-{
- // just nullify, will be managed later Storage::stream
- impl = 0;
-}
-
-// FIXME tell parent we're gone
-Stream::~Stream()
-{
- delete impl;
-}
-
-unsigned long Stream::tell()
-{
- return impl ? impl->tell() : 0;
-}
-
-void Stream::seek( unsigned long newpos )
-{
- if( impl ) impl->seek( newpos );
-}
-
-unsigned long Stream::size()
-{
- return impl ? impl->entry->size : 0;
-}
-
-int Stream::getch()
-{
- return impl ? impl->getch() : 0;
-}
-
-unsigned long Stream::read( unsigned char* data, unsigned long maxlen )
-{
- return impl ? impl->read( data, maxlen ) : 0;
-}
-
diff --git a/src/plugins/wordleaker/pole.h b/src/plugins/wordleaker/pole.h
@@ -1,149 +0,0 @@
-/* POLE - Portable C++ library to access OLE Storage
- Copyright (C) 2002-2004 Ariya Hidayat <ariya@kde.org>
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public License
- along with this library; see the file COPYING.LIB. If not, write to
- the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, US
-*/
-
-#ifndef POLE_H
-#define POLE_H
-
-#include <string>
-#include <list>
-
-namespace POLE
-{
-
-class StorageIO;
-class Stream;
-class StreamImpl;
-
-class Storage
-{
- friend class Stream;
- friend class StreamOut;
-
-public:
-
- enum { Ok, OpenFailed, NotOLE, BadOLE, UnknownError,
- StupidWorkaroundForBrokenCompiler=255 };
-
- /**
- * Constructs a storage with name filename.
- **/
- Storage( const char* filename );
-
- /**
- * Destroys the storage.
- **/
- ~Storage();
-
- /**
- * Opens the storage. Returns true if no error occurs.
- **/
- bool open();
-
- /**
- * Closes the storage.
- **/
- void close();
-
- /**
- * Returns the error code of last operation.
- **/
- int result();
-
- /**
- * Returns the current path.
- **/
- std::string path();
-
- /**
- * Finds all stream and directories in current path.
- **/
- std::list<std::string> listDirectory();
-
- /**
- * Changes path to directory. Returns true if no error occurs.
- **/
- bool enterDirectory( const std::string& directory );
-
- /**
- * Goes to one directory up.
- **/
- void leaveDirectory();
-
- /**
- * Finds and returns a stream with the specified name.
- **/
- Stream* stream( const std::string& name );
-
-private:
- StorageIO* io;
-
- // no copy or assign
- Storage( const Storage& );
- Storage& operator=( const Storage& );
-
-};
-
-class Stream
-{
- friend class Storage;
- friend class StorageIO;
-
-public:
-
- /**
- * Returns the stream size.
- **/
- unsigned long size();
-
- /**
- * Returns the read pointer.
- **/
- unsigned long tell();
-
- /**
- * Sets the read position.
- **/
- void seek( unsigned long pos );
-
- /**
- * Reads a byte.
- **/
- int getch();
-
- /**
- * Reads a block of data.
- **/
- unsigned long read( unsigned char* data, unsigned long maxlen );
-
-private:
-
- Stream();
- ~Stream();
-
- // no copy or assign
- Stream( const Stream& );
- Stream& operator=( const Stream& );
-
- StreamImpl* impl;
-};
-
-
-}
-
-#endif // POLE_H
diff --git a/src/plugins/wordleaker/wordextractor.cc b/src/plugins/wordleaker/wordextractor.cc
@@ -1,486 +0,0 @@
-/*
- This file is part of libextractor.
- (C) 2006 Vidyut Samanta and Christian Grothoff
-
- libextractor is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published
- by the Free Software Foundation; either version 2, or (at your
- option) any later version.
-
- libextractor is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with libextractor; see the file COPYING. If not, write to the
- Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, USA.
-
- This code depends heavily on the wordleaker code and
- a lot of code was borrowed from wordleaker.cpp. See also
- the README file in this directory.
- */
-
-#include <math.h>
-#include <time.h>
-
-#include "wordleaker.h"
-#include "pole.h"
-#include "platform.h"
-#include "extractor.h"
-#include "../convert.h"
-
-#define __(a) dgettext("iso-639", a)
-
-extern "C" {
-
- static EXTRACTOR_KeywordType
- SummaryProperties[] = {
- EXTRACTOR_UNKNOWN,
- EXTRACTOR_UNKNOWN,
- EXTRACTOR_TITLE,
- EXTRACTOR_SUBJECT,
- EXTRACTOR_AUTHOR,
- EXTRACTOR_KEYWORDS,
- EXTRACTOR_COMMENT,
- EXTRACTOR_TEMPLATE,
- EXTRACTOR_LAST_SAVED_BY,
- EXTRACTOR_VERSIONNUMBER,
- EXTRACTOR_TOTAL_EDITING_TIME,
- EXTRACTOR_LAST_PRINTED,
- EXTRACTOR_CREATION_DATE,
- EXTRACTOR_MODIFICATION_DATE,
- EXTRACTOR_PAGE_COUNT,
- EXTRACTOR_WORD_COUNT,
- EXTRACTOR_CHARACTER_COUNT,
- EXTRACTOR_THUMBNAILS,
- EXTRACTOR_SOFTWARE,
- EXTRACTOR_SECURITY,
- };
-
- static char * xstrndup(const char * s, size_t n){
- char * d;
-
- d = (char *) malloc(n+1);
- memcpy(d,s,n);
- d[n]='\0';
- return d;
- }
-
- static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,
- const char * keyword,
- struct EXTRACTOR_Keywords * next) {
- EXTRACTOR_KeywordList * result;
-
- if (keyword == NULL)
- return next;
- result = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList));
- result->next = next;
- result->keyword = strdup(keyword);
- result->keywordType = type;
- return result;
- }
-
- static char * dateToString( unsigned long date ) {
- char f[128];
- struct tm t;
- memset(&t, 0, sizeof(struct tm));
- t.tm_year = 1900 + date % 100;
- t.tm_mon = date / 100 % 100;
- t.tm_mday = date / 10000 % 100;
- if (0 == strftime(f, 128,
- nl_langinfo(D_FMT),
- &t))
- return NULL;
-
- return xstrndup(f, 128);
- }
-
- static const char * idToProduct( unsigned int id ) {
- // TODO: find the rest of ids (and check existing ones!)
- switch ( id ) {
- case 0x6954:
- case 0x656d:
- return "Word 97 (Windows NT)?";
- case 0x206d:
- case 0x696c:
- return "Word 6 (MS DOS)?";
- case 0x6A62:
- return "Word 97";
- case 0x626A:
- return "Word 98 (Mac)";
- default:
- return NULL;
- }
- }
-
- static const char * lidToLanguage( unsigned int lid ) {
- switch ( lid ) {
- case 0x0400:
- return _("No Proofing");
- case 0x0401:
- return __("Arabic");
- case 0x0402:
- return __("Bulgarian");
- case 0x0403:
- return __("Catalan");
- case 0x0404:
- return _("Traditional Chinese");
- case 0x0804:
- return _("Simplified Chinese");
- case 0x0405:
- return __("Chechen");
- case 0x0406:
- return __("Danish");
- case 0x0407:
- return __("German");
- case 0x0807:
- return _("Swiss German");
- case 0x0408:
- return __("Greek");
- case 0x0409:
- return _("U.S. English");
- case 0x0809:
- return _("U.K. English");
- case 0x0c09:
- return _("Australian English");
- case 0x040a:
- return _("Castilian Spanish");
- case 0x080a:
- return _("Mexican Spanish");
- case 0x040b:
- return __("Finnish");
- case 0x040c:
- return __("French");
- case 0x080c:
- return _("Belgian French");
- case 0x0c0c:
- return _("Canadian French");
- case 0x100c:
- return _("Swiss French");
- case 0x040d:
- return __("Hebrew");
- case 0x040e:
- return __("Hungarian");
- case 0x040f:
- return __("Icelandic");
- case 0x0410:
- return __("Italian");
- case 0x0810:
- return _("Swiss Italian");
- case 0x0411:
- return __("Japanese");
- case 0x0412:
- return __("Korean");
- case 0x0413:
- return __("Dutch");
- case 0x0813:
- return _("Belgian Dutch");
- case 0x0414:
- return _("Norwegian Bokmal");
- case 0x0814:
- return __("Norwegian Nynorsk");
- case 0x0415:
- return __("Polish");
- case 0x0416:
- return __("Brazilian Portuguese");
- case 0x0816:
- return __("Portuguese");
- case 0x0417:
- return _("Rhaeto-Romanic");
- case 0x0418:
- return __("Romanian");
- case 0x0419:
- return __("Russian");
- case 0x041a:
- return _("Croato-Serbian (Latin)");
- case 0x081a:
- return _("Serbo-Croatian (Cyrillic)");
- case 0x041b:
- return __("Slovak");
- case 0x041c:
- return __("Albanian");
- case 0x041d:
- return __("Swedish");
- case 0x041e:
- return __("Thai");
- case 0x041f:
- return __("Turkish");
- case 0x0420:
- return __("Urdu");
- case 0x0421:
- return __("Bahasa");
- case 0x0422:
- return __("Ukrainian");
- case 0x0423:
- return __("Byelorussian");
- case 0x0424:
- return __("Slovenian");
- case 0x0425:
- return __("Estonian");
- case 0x0426:
- return __("Latvian");
- case 0x0427:
- return __("Lithuanian");
- case 0x0429:
- return _("Farsi");
- case 0x042D:
- return __("Basque");
- case 0x042F:
- return __("Macedonian");
- case 0x0436:
- return __("Afrikaans");
- case 0x043E:
- return __("Malayalam");
- default:
- return NULL;
- }
- }
-
-
-
- // read the type of the property and displays its value
- static char * getProperty( POLE::Stream* stream ) {
- unsigned char buffer[256];
- unsigned char c;
- unsigned long i;
- unsigned int j;
- unsigned long t, t1, t2;
- char *s;
-
- unsigned long read = stream->read(buffer, 4);
- if (read != 4)
- return NULL;
- unsigned int type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
-
- switch (type) {
- case 2: // VT_I2
- read = stream->read(buffer, 2);
- if (read != 2)
- return NULL;
- i = buffer[0] + (buffer[1] << 8);
- s = (char*) malloc(16);
- snprintf(s, 16, "%u", i);
- return s;
- case 3: // VT_I4
- read = stream->read(buffer, 4);
- if (read != 4)
- return NULL;
- i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- s = (char*) malloc(16);
- snprintf(s, 16, "%u", i);
- return s;
- case 11: // VT_BOOL
- read = stream->read(buffer, 1);
- if (read != 1)
- return NULL;
- if ((char) buffer[0] == -1)
- return strdup("true");
- return strdup("false");
- case 30: // VT_LPSTR
- read = stream->read(buffer, 4);
- if (read != 4)
- return NULL;
- i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- if ( (i < 0) || (i > 16*1024*1024))
- return NULL;
- s = (char*) malloc(i+1);
- s[i] = '\0';
- j = 0;
- while ( ((c = stream->getch()) != 0) && (i > j) )
- s[j++] = c;
- if ( (j > 0) && (s[j-1] == '\n') )
- s[--j] = '\0';
- if (j != i) {
- free(s);
- return NULL;
- }
- return s;
- case 64: // VT_FILETIME
- read = stream->read(buffer, 8);
- if (read != 8)
- return NULL;
- t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
- t = filetime_to_unixtime(t1, t2);
- char * ret = ctime_r((time_t *) &t, (char*)malloc(32));
- ret[strlen(ret)-1] = '\0'; /* kill newline */
- return ret;
- }
- return NULL;
- }
-
-
- struct EXTRACTOR_Keywords * libextractor_word_extract(const char * filename,
- const char * data,
- size_t size,
- struct EXTRACTOR_Keywords * prev) {
- char ver[16];
- char product[128];
- unsigned char buffer[256];
-
- if ( (size < 512 + 898) || (filename == NULL) )
- return prev;
- if (0 != memcmp(data, "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1", 8))
- /* look at file magic number to avoid false positives */
- return prev;
-
-
- POLE::Storage* storage = new POLE::Storage(filename);
- storage->open();
- if (storage->result() != POLE::Storage::Ok ) {
- delete storage;
- return prev;
- }
-
- POLE::Stream * stream = storage->stream( "SummaryInformation" );
- if (! stream) {
- delete storage;
- return prev;
- }
-
- // ClassID & Offset
- stream->seek(28);
- if (20 != stream->read(buffer, 20)) {
- delete storage;
- return prev;
- }
-
- // beginning of section
- unsigned long begin = stream->tell();
- // skip length of section
- stream->read(buffer, 4);
- // number of properties
- if (4 == stream->read(buffer, 4)) {
- unsigned int nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- // properties
- for (unsigned int i = 0; i < nproperties; i++) {
- if (8 != stream->read(buffer, 8))
- break;
- unsigned int propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- unsigned int offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
- if (propertyID > 1 && propertyID < 20) {
- unsigned long offsetCur = stream->tell();
- stream->seek(offsetProp + begin);
- if (propertyID == 10) {
- /* FIXME: how is editing time encoded? */
- } if (propertyID == 19) {
- /* FIXME: how to interpret the security integer? */
- } else {
- char * prop = getProperty(stream);
- if (prop != NULL) {
- prev = addKeyword(SummaryProperties[propertyID],
- prop,
- prev);
- free(prop);
- }
- }
- stream->seek(offsetCur);
- }
- }
- }
-
-
- const unsigned char * data512 = (const unsigned char*) &data[512];
- unsigned int wIdent = data512[0] + (data512[1] << 8);
- unsigned int nProduct = data512[4] + (data512[5] << 8);
- unsigned int lid = data512[6] + (data512[7] << 8);
- unsigned int envr = data512[18];
- unsigned int wMagicCreated = data512[34] + (data512[35] << 8);
- unsigned int wMagicRevised = data512[36] + (data512[37] << 8);
- unsigned long lProductCreated = data512[68] + (data512[69] << 8) + (data512[70] << 16) + (data512[71] << 24);
- unsigned long lProductRevised = data512[72] + (data512[73] << 8) + (data512[74] << 16) + (data512[75] << 24);
- unsigned long fcSttbSavedBy = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
- unsigned long lcbSttbSavedBy = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
-
- if (nProduct != 0) {
- snprintf(ver, 16, "%u", nProduct);
- prev = addKeyword(EXTRACTOR_PRODUCTVERSION,
- ver,
- prev);
- }
- const char * lang = lidToLanguage(lid);
- if (lang != NULL) {
- prev = addKeyword(EXTRACTOR_LANGUAGE,
- lang,
- prev);
- }
- const char * prod = idToProduct(wMagicCreated);
- if (prod != NULL) {
- char * date = dateToString(lProductCreated);
- snprintf(product, 128, _("%s (Build %s)"),
- prod,
- date);
- free(date);
- prev = addKeyword(EXTRACTOR_CREATED_BY_SOFTWARE,
- product,
- prev);
- }
- prod = idToProduct(wMagicRevised);
- if (prod != NULL) {
- char * date = dateToString(lProductRevised);
- snprintf(product, 128, _("%s (Build %s)"),
- prod,
- date);
- free(date);
- prev = addKeyword(EXTRACTOR_MODIFIED_BY_SOFTWARE,
- product,
- prev);
- }
-
-
- unsigned int where = 0;
- stream = storage->stream("1Table");
- if (! stream)
- stream = storage->stream("0Table");
- if ( (stream) && (lcbSttbSavedBy >= 6)) {
- unsigned char * lbuffer = (unsigned char*) malloc(lcbSttbSavedBy);
-
- // goto offset of revision
- stream->seek(fcSttbSavedBy);
- // read all the revision history
- if (lcbSttbSavedBy == stream->read(lbuffer, lcbSttbSavedBy)) {
- // there are n strings, so n/2 revisions (author & file)
- unsigned int nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
- where = 6;
- for (unsigned int i=0; i < nRev; i++) {
- if (where >= lcbSttbSavedBy)
- break;
- unsigned int length = lbuffer[where++];
- if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
- (where + 2 * length + 2 <= where) )
- break;
- char * author = convertToUtf8((const char*) &lbuffer[where],
- length * 2,
- "UTF-16BE");
- where += length * 2 + 1;
- length = lbuffer[where++];
- if ( (where + 2 * length >= lcbSttbSavedBy) ||
- (where + 2 * length + 1 <= where) )
- break;
- char * filename = convertToUtf8((const char*) &lbuffer[where],
- length * 2,
- "UTF-16BE");
- where += length * 2 + 1;
- char * rbuf = (char*) malloc(strlen(author) + strlen(filename) + 512);
- snprintf(rbuf, 512 + strlen(author) + strlen(filename),
- _("Revision #%u: Author '%s' worked on '%s'"),
- i, author, filename);
- free(author);
- free(filename);
- prev = addKeyword(EXTRACTOR_REVISION_HISTORY,
- rbuf,
- prev);
- free(rbuf);
- }
- }
- free(lbuffer);
- }
- delete storage;
-
- return prev;
- }
-
-}
-
diff --git a/src/plugins/wordleaker/wordleaker.cpp b/src/plugins/wordleaker/wordleaker.cpp
@@ -1,308 +0,0 @@
-/*
- WordLeaker - Shows information about Word DOC files
- Copyright (C) 2005 Sacha Fuentes <madelman@iname.com>
-
- Based on poledump.c
- Original idea from WordDumper (http://www.computerbytesman.com)
- Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm
- Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html
-
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this library; see the file COPYING. If not, write to
- the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, US
-*/
-
-#include <iostream>
-#include <fstream>
-#include <stdlib.h>
-#include <list>
-#include <ctime>
-
-#include "pole.h"
-#include "WordLeaker.h"
-
-unsigned long fcSttbSavedBy;
-unsigned long lcbSttbSavedBy;
-
-
-
-// read the type of the property and displays its value
-void showProperty( POLE::Stream* stream ) {
- unsigned long read, type;
- unsigned char buffer[256];
- unsigned char c;
- unsigned long i;
- unsigned long t, t1, t2;
- char *s;
-
- read = stream->read(buffer, 4);
- type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
-
- switch (type) {
- case 2: // VT_I2
- read = stream->read(buffer, 2);
- i = buffer[0] + (buffer[1] << 8);
- cout << i << endl;
- break;
- case 3: // VT_I4
- read = stream->read(buffer, 4);
- i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- cout << i << endl;
- break;
- case 11: // VT_BOOL
- read = stream->read(buffer, 1);
- if ((char) buffer[0] == -1)
- cout << "true" << endl;
- else
- cout << "false" << endl;
- break;
- case 30: // VT_LPSTR
- read = stream->read(buffer, 4);
- i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- while ((c = stream->getch()) != 0)
- cout << c;
- cout << endl;
- break;
- case 64: // VT_FILETIME
- read = stream->read(buffer, 8);
- t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
- t = filetime_to_unixtime(t1, t2);
- s = ctime((time_t *) &t);
- cout << s;
- break;
- default:
- cout << "Unknown format " << type << endl;
- }
-}
-
-// show the revision data (users and files)
-void dumpRevision( POLE::Storage* storage ) {
- unsigned int nRev;
- unsigned int where = 0;
- POLE::Stream* stream;
-
- cout << "Revision:" << endl;
- cout << "---------" << endl << endl;
-
- // FIXME: should look if using 0Table or 1Table
- stream = storage->stream( "1Table" );
- if( !stream ) {
- cout << "There's no revision information" << endl;
- return;
- }
-
- unsigned char * buffer = new unsigned char[lcbSttbSavedBy];
- unsigned char buffer2[1024];
- unsigned int length;
-
- // goto offset of revision
- stream->seek(fcSttbSavedBy);
- // read all the revision history
- stream->read(buffer, lcbSttbSavedBy);
-
- // there are n strings, so n/2 revisions (author & file)
- nRev = (buffer[2] + (buffer[3] << 8)) / 2;
- where = 6;
-
- for (unsigned int i=0; i < nRev; i++) {
- cout << "Rev #" << i << ": Author \"";
- length = buffer[where++];
- // it's unicode, for now we only get the low byte
- for (unsigned int j=0; j < length; j++) {
- where++;
- cout << buffer[where];
- where++;
- }
- where++;
- cout << "\" worked on file \"";
- length = buffer[where++];
- // it's unicode, for now we only get the low byte
- for (unsigned int j=0; j < length; j++) {
- where++;
- cout << buffer[where];
- where++;
- }
- where++;
- cout << "\"" << endl;
- }
-
- cout << endl;
- delete buffer;
-
-}
-
-// show data from DocumentSummary stream
-void dumpDocumentSummary( POLE::Storage* storage ) {
- POLE::Stream* stream;
- unsigned long read, nproperties, propertyID, offsetProp, offsetCur;
- unsigned long begin;
-
- cout << "Document Summary:" << endl;
- cout << "-----------------" << endl << endl;
-
- stream = storage->stream( "DocumentSummaryInformation" );
- if( !stream ) {
- cout << "There's no document summary information" << endl;
- return;
- }
-
- unsigned char buffer[256];
-
- // ClassID & Offset
- stream->seek(28);
- stream->read(buffer, 20);
- // beginning of section
- begin = stream->tell();
- // length of section
- read = stream->read(buffer, 4);
- // number of properties
- read = stream->read(buffer, 4);
- nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- // properties
-
- for (unsigned long i = 0; i < nproperties; i++) {
- read = stream->read(buffer, 8);
- propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
- if (propertyID > 1 && propertyID < 16) {
- cout << DocumentSummaryProperties[propertyID] << ": ";
- offsetCur = stream->tell();
- stream->seek(offsetProp + begin);
- // read and show the property
- showProperty(stream);
- stream->seek(offsetCur);
- }
- }
-
- cout << endl;
-}
-
-// show data from Summary stream
-void dumpSummary( POLE::Storage* storage ) {
- POLE::Stream* stream;
- unsigned long read, nproperties, propertyID, offsetProp, offsetCur;
- unsigned long begin;
-
- cout << "Summary:" << endl;
- cout << "--------" << endl << endl;
-
- stream = storage->stream( "SummaryInformation" );
- if( !stream ) {
- cout << "There's no summary information" << endl;
- return;
- }
-
- unsigned char buffer[256];
-
- // ClassID & Offset
- stream->seek(28);
- stream->read(buffer, 20);
- // beginning of section
- begin = stream->tell();
- // length of section
- read = stream->read(buffer, 4);
- // number of properties
- read = stream->read(buffer, 4);
- nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- // properties
- for (unsigned long i = 0; i < nproperties; i++) {
- read = stream->read(buffer, 8);
- propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24);
- offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24);
- if (propertyID > 1 && propertyID < 20) {
- cout << SummaryProperties[propertyID] << ": ";
- offsetCur = stream->tell();
- stream->seek(offsetProp + begin);
- // read and show the property
- showProperty(stream);
- stream->seek(offsetCur);
- }
- }
-
- cout << endl;
-}
-
-// reads the header of the file
-bool readFIB( char* filename ) {
- fstream file;
-
- file.open( filename, std::ios::binary | std::ios::in );
- if( !file.good() ) {
- cout << "Can't find the file" << endl;
- return false;
- }
-
- unsigned char * buffer = new unsigned char[898];
- file.seekg( 512 );
- file.read( (char*)buffer, 898 );
- file.close();
-
- unsigned int wIdent = buffer[0] + (buffer[1] << 8);
- unsigned int nProduct = buffer[4] + (buffer[5] << 8);
- unsigned int lid = buffer[6] + (buffer[7] << 8);
- unsigned int envr = buffer[18];
- unsigned int wMagicCreated = buffer[34] + (buffer[35] << 8);
- unsigned int wMagicRevised = buffer[36] + (buffer[37] << 8);
- unsigned long lProductCreated = buffer[68] + (buffer[69] << 8) + (buffer[70] << 16) + (buffer[71] << 24);
- unsigned long lProductRevised = buffer[72] + (buffer[73] << 8) + (buffer[74] << 16) + (buffer[75] << 24);
- fcSttbSavedBy = buffer[722] + (buffer[723] << 8) + (buffer[724] << 16) + (buffer[725] << 24);
- lcbSttbSavedBy = buffer[726] + (buffer[727] << 8) + (buffer[728] << 16) + (buffer[729] << 24);
- delete[] buffer;
-
- cout << "File: " << filename << endl;
- cout << "Product version: " << nProduct << endl;
- cout << "Language: " << lidToLanguage(lid) << endl;
- cout << "Created by: " << idToProduct(wMagicCreated) << " (Build " << dateToString(lProductCreated) << ")" << endl;
- cout << "Revised by: " << idToProduct(wMagicRevised) << " (Build " << dateToString(lProductRevised) << ")" << endl;
- cout << endl;
-
- return true;
-
-}
-
-int main(int argc, char *argv[]) {
- cout << endl << "WordLeaker v.0.1" << endl;
- cout << " by Madelman (http://elligre.tk/madelman/)" << endl << endl;
-
-
- if( argc < 2 ) {
- cout << " You must supply a filename" << endl << endl;
- return 0;
- }
-
- char* filename = argv[1];
-
- if ( !readFIB(filename) )
- return 1;
-
- POLE::Storage* storage = new POLE::Storage( filename );
- storage->open();
- if( storage->result() != POLE::Storage::Ok ) {
- cout << "The file " << filename << " is not a Word document" << endl;
- return 1;
- }
-
- dumpSummary( storage );
- // FIXME: doesn't always work
- // but there's nothing really interesting in here
- //dumpDocumentSummary( storage );
- dumpRevision( storage );
- // TODO: we don't show the GUID
- // TODO: we don't show the macros
-
- delete storage;
-
- return 0;
-}
diff --git a/src/plugins/wordleaker/wordleaker.h b/src/plugins/wordleaker/wordleaker.h
@@ -1,124 +0,0 @@
-/*
- WordLeaker - Shows information about Word DOC files
- Copyright (C) 2005 Sacha Fuentes <madelman@iname.com>
-
- Based on poledump.c
- Original idea from WordDumper (http://www.computerbytesman.com)
- Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm
- Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html
-
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this library; see the file COPYING. If not, write to
- the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- Boston, MA 02111-1307, US
-*/
-
-#include <string>
-
-using namespace std;
-
-static char*
-DocumentSummaryProperties[] = {
-"Dictionary",
-"Code page",
-"Category",
-"PresentationTarget",
-"Bytes",
-"Lines",
-"Paragraphs",
-"Slides",
-"Notes",
-"HiddenSlides",
-"MMClips",
-"ScaleCrop",
-"HeadingPairs",
-"TitlesofParts",
-"Manager",
-"Company",
-"LinksUpTo"
-};
-
-/*
- * filetime_to_unixtime
- *
- * Adapted from work in 'wv' by:
- * Caolan McNamara (Caolan.McNamara@ul.ie)
- */
-#define HIGH32_DELTA 27111902
-#define MID16_DELTA 54590
-#define LOW16_DELTA 32768
-
-unsigned long filetime_to_unixtime (unsigned long low_time, unsigned long high_time) {
- unsigned long low16;/* 16 bit, low bits */
- unsigned long mid16;/* 16 bit, medium bits */
- unsigned long hi32;/* 32 bit, high bits */
- unsigned int carry;/* carry bit for subtraction */
- int negative;/* whether a represents a negative value */
-
-/* Copy the time values to hi32/mid16/low16 */
-hi32 = high_time;
-mid16 = low_time >> 16;
-low16 = low_time & 0xffff;
-
-/* Subtract the time difference */
-if (low16 >= LOW16_DELTA )
-low16 -= LOW16_DELTA , carry = 0;
-else
-low16 += (1 << 16) - LOW16_DELTA , carry = 1;
-
-if (mid16 >= MID16_DELTA + carry)
-mid16 -= MID16_DELTA + carry, carry = 0;
-else
-mid16 += (1 << 16) - MID16_DELTA - carry, carry = 1;
-
-hi32 -= HIGH32_DELTA + carry;
-
-/* If a is negative, replace a by (-1-a) */
-negative = (hi32 >= ((unsigned long)1) << 31);
-if (negative) {
-/* Set a to -a - 1 (a is hi32/mid16/low16) */
-low16 = 0xffff - low16;
-mid16 = 0xffff - mid16;
-hi32 = ~hi32;
-}
-
-/*
- * Divide a by 10000000 (a = hi32/mid16/low16), put the rest into r.
- * Split the divisor into 10000 * 1000 which are both less than 0xffff.
- */
-mid16 += (hi32 % 10000) << 16;
-hi32 /= 10000;
-low16 += (mid16 % 10000) << 16;
-mid16 /= 10000;
-low16 /= 10000;
-
-mid16 += (hi32 % 1000) << 16;
-hi32 /= 1000;
-low16 += (mid16 % 1000) << 16;
-mid16 /= 1000;
-low16 /= 1000;
-
-/* If a was negative, replace a by (-1-a) and r by (9999999 - r) */
-if (negative) {
-/* Set a to -a - 1 (a is hi32/mid16/low16) */
-low16 = 0xffff - low16;
-mid16 = 0xffff - mid16;
-hi32 = ~hi32;
-}
-
-/* Do not replace this by << 32, it gives a compiler warning and
- * it does not work
- */
-return ((((unsigned long)hi32) << 16) << 16) + (mid16 << 16) + low16;
-
-}