commit 194df7957ff6877e4a3899b631c7e4435d77e193
parent 72944e8c23a2a0c2592569ffb7a6f76e09258bbb
Author: Christian Grothoff <christian@grothoff.org>
Date: Fri, 10 Aug 2012 19:22:14 +0000
implementing OLE2 support
Diffstat:
3 files changed, 276 insertions(+), 60 deletions(-)
diff --git a/configure.ac b/configure.ac
@@ -575,7 +575,7 @@ AM_CONDITIONAL(HAVE_QT_SVG4, test x$qt_svg4 != x0)
ABI_GSF
-AM_CONDITIONAL(WITH_GSF, test "x$have_gsf" = "xtrue")
+AM_CONDITIONAL(HAVE_GSF, test "x$have_gsf" = "xtrue")
# produce new line
echo ""
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
@@ -67,6 +67,11 @@ PLUGIN_EXIV2=libextractor_exiv2.la
TEST_EXIV2=test_exiv2
endif
+if HAVE_GSF
+PLUGIN_GSF=libextractor_ole2.la
+TEST_GSF=test_ole2
+endif
+
plugin_LTLIBRARIES = \
libextractor_it.la \
@@ -80,7 +85,8 @@ plugin_LTLIBRARIES = \
$(PLUGIN_FLAC) \
$(PLUGIN_MPEG) \
$(PLUGIN_JPEG) \
- $(PLUGIN_EXIV2)
+ $(PLUGIN_EXIV2) \
+ $(PLUGIN_GSF)
if HAVE_ZZUF
fuzz_tests=fuzz_default.sh
@@ -97,7 +103,8 @@ check_PROGRAMS = \
$(TEST_FLAC) \
$(TEST_MPEG) \
$(TEST_JPEG) \
- $(TEST_EXIV2)
+ $(TEST_EXIV2) \
+ $(TEST_GSF)
TESTS = \
$(fuzz_tests) \
@@ -256,3 +263,21 @@ test_exiv2_LDADD = \
$(top_builddir)/src/plugins/libtest.la
+libextractor_ole2_la_SOURCES = \
+ ole2_extractor.c
+libextractor_ole2_la_CFLAGS = \
+ $(GSF_CFLAGS)
+libextractor_ole2_la_CPPFLAGS = \
+ $(GSF_CFLAGS)
+libextractor_ole2_la_LDFLAGS = \
+ $(PLUGINFLAGS)
+libextractor_ole2_la_LIBADD = \
+ $(top_builddir)/src/common/libextractor_common.la \
+ $(GSF_LIBS)
+
+test_ole2_SOURCES = \
+ test_ole2.c
+test_ole2_LDADD = \
+ $(top_builddir)/src/plugins/libtest.la
+
+
diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c
@@ -37,12 +37,21 @@
#include <stdio.h>
#include <ctype.h>
#include <gsf/gsf-utils.h>
+#include <gsf/gsf-input-impl.h>
#include <gsf/gsf-input-memory.h>
+#include <gsf/gsf-impl-utils.h>
#include <gsf/gsf-infile.h>
#include <gsf/gsf-infile-msole.h>
#include <gsf/gsf-msole-utils.h>
-#define DEBUG_OLE2 0
+
+/**
+ * Set to 1 to use our own GsfInput subclass which supports seeking
+ * and thus can handle very large files. Set to 0 to use the simple
+ * gsf in-memory buffer (which can only access the first ~16k) for
+ * debugging.
+ */
+#define USE_LE_INPUT 1
/**
@@ -325,7 +334,7 @@ process_star_office (GsfInput *src,
if ( (buf[0xd5] + buf[0xd4] > 0) &&
(0 != add_metadata (proc, proc_cls,
&buf[0xd6],
- EXTRACTOR_METATYPE_SUBJECT)) _)
+ EXTRACTOR_METATYPE_SUBJECT)) )
return 1;
buf[0x215] = '\0';
if ( (buf[0x115] + buf[0x116] > 0) &&
@@ -450,7 +459,7 @@ lid_to_language (unsigned int lid)
case 0x041b:
return __("Slovak");
case 0x041c:
- return __("Albanian");
+ return __("Albanian");
case 0x041d:
return __("Swedish");
case 0x041e:
@@ -583,13 +592,24 @@ history_extract (GsfInput *stream,
/* *************************** custom GSF input method ***************** */
-G_BEGIN_DECLS
#define LE_TYPE_INPUT (le_input_get_type ())
-#define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), TYPE_LE_INPUT, LeInput))
-#define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), TYPE_LE_INPUT, LeInputClass))
-#define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), TYPE_LE_INPUT))
-#define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), TYPE_LE_INPUT))
-#define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), TYPE_LE_INPUT, LeInputClass))
+#define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), LE_TYPE_INPUT, LeInput))
+#define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), LE_TYPE_INPUT, LeInputClass))
+#define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), LE_TYPE_INPUT))
+#define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), LE_TYPE_INPUT))
+#define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), LE_TYPE_INPUT, LeInputClass))
+
+/**
+ * Internal state of an "LeInput" object.
+ */
+typedef struct _LeInputPrivate
+{
+ /**
+ * Our extraction context.
+ */
+ struct EXTRACTOR_ExtractContext *ec;
+} LeInputPrivate;
+
/**
* Overall state of an "LeInput" object.
@@ -610,18 +630,6 @@ typedef struct _LeInput
/**
- * Internal state of an "LeInput" object.
- */
-typedef struct _LeInputPrivate
-{
- /**
- * Our extraction context.
- */
- struct EXTRACTOR_ExtractContext *ec;
-} LeInputPrivate;
-
-
-/**
* LeInput's class state.
*/
typedef struct _LeInputClass
@@ -640,13 +648,6 @@ typedef struct _LeInputClass
/**
- * Required method to obtain the LeInput "type".
- */
-GType
-le_input_get_type (void) G_GNUC_CONST;
-
-
-/**
* Constructor for LeInput objects.
*
* @param ec extraction context to use
@@ -654,31 +655,198 @@ le_input_get_type (void) G_GNUC_CONST;
*/
GsfInput *
le_input_new (struct EXTRACTOR_ExtractContext *ec);
-G_END_DECLS
/**
- * Macro to create LeInput type definition.
+ * Class initializer for the "LeInput" class.
+ *
+ * @param class class object to initialize
+ */
+static void
+le_input_class_init (LeInputClass *class);
+
+
+/**
+ * Initialize internal state of fresh input object.
+ *
+ * @param input object to initialize
+ */
+static void
+le_input_init (LeInput *input);
+
+
+/**
+ * Macro to create LeInput type definition and register the class.
+ */
+GSF_CLASS (LeInput, le_input, le_input_class_init, le_input_init, GSF_INPUT_TYPE)
+
+
+/**
+ * Duplicate input, leaving the new one at the same offset.
+ *
+ * @param input the input to duplicate
+ * @param err location for error reporting, can be NULL
+ * @return NULL on error (always)
+ */
+static GsfInput *
+le_input_dup (GsfInput *input,
+ GError **err)
+{
+ if (NULL != err)
+ *err = g_error_new (gsf_input_error_id (), 0,
+ "dup not supported on LeInput");
+ return NULL;
+}
+
+
+/**
+ * Read at least num_bytes. Does not change the current position if
+ * there is an error. Will only read if the entire amount can be
+ * read. Invalidates the buffer associated with previous calls to
+ * gsf_input_read.
+ *
+ * @param input
+ * @param num_bytes
+ * @param optional_buffer
+ * @return buffer where num_bytes data are available, or NULL on error
+ */
+static const guint8 *
+le_input_read (GsfInput *input,
+ size_t num_bytes,
+ guint8 *optional_buffer)
+{
+ LeInput *li = LE_INPUT (input);
+ struct EXTRACTOR_ExtractContext *ec;
+ void *buf;
+ uint64_t old_off;
+ ssize_t ret;
+
+ ec = li->priv->ec;
+ old_off = ec->seek (ec->cls, 0, SEEK_CUR);
+ if (num_bytes
+ != (ret = ec->read (ec->cls,
+ &buf,
+ num_bytes)))
+ {
+ /* we don't support partial reads;
+ most other GsfInput implementations in this case
+ allocate some huge temporary buffer just to avoid
+ the partial read; we might need to do that as well!? */
+ ec->seek (ec->cls, SEEK_SET, old_off);
+ return NULL;
+ }
+ if (NULL != optional_buffer)
+ {
+ memcpy (optional_buffer, buf, num_bytes);
+ return optional_buffer;
+ }
+ return buf;
+}
+
+
+/**
+ * Move the current location in an input stream
+ *
+ * @param input stream to seek
+ * @param offset target offset
+ * @param whence determines to what the offset is relative to
+ * @return TRUE on error
*/
-G_DEFINE_TYPE (LeInput, le_input, GSF_TYPE_INPUT)
+static gboolean
+le_input_seek (GsfInput *input,
+ gsf_off_t offset,
+ GSeekType whence)
+{
+ LeInput *li = LE_INPUT (input);
+ struct EXTRACTOR_ExtractContext *ec;
+ int w;
+ int64_t ret;
+
+ ec = li->priv->ec;
+ switch (whence)
+ {
+ case G_SEEK_SET:
+ w = SEEK_SET;
+ break;
+ case G_SEEK_CUR:
+ w = SEEK_CUR;
+ break;
+ case G_SEEK_END:
+ w = SEEK_END;
+ break;
+ default:
+ return TRUE;
+ }
+ if (-1 ==
+ (ret = ec->seek (ec->cls,
+ offset,
+ w)))
+ return TRUE;
+ return FALSE;
+}
/**
+ * Class initializer for the "LeInput" class.
*
+ * @param class class object to initialize
*/
static void
le_input_class_init (LeInputClass *class)
{
- // GObjectClass *gobject_class;
GsfInputClass *input_class;
- // gobject_class = (GObjectClass *) class;
input_class = (GsfInputClass *) class;
- input_class->read = le_input_read;
+ input_class->Dup = le_input_dup;
+ input_class->Read = le_input_read;
+ input_class->Seek = le_input_seek;
g_type_class_add_private (class, sizeof (LeInputPrivate));
}
+/**
+ * Initialize internal state of fresh input object.
+ *
+ * @param input object to initialize
+ */
+static void
+le_input_init (LeInput *input)
+{
+ LeInputPrivate *priv;
+
+ input->priv =
+ G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT,
+ LeInputPrivate);
+ priv = input->priv;
+ priv->ec = NULL;
+}
+
+
+/**
+ * Creates a new LeInput object.
+ *
+ * @param ec extractor context to wrap
+ * @return NULL on error
+ */
+GsfInput *
+le_input_new (struct EXTRACTOR_ExtractContext *ec)
+{
+ LeInput *input;
+
+ input = g_object_new (LE_TYPE_INPUT, NULL);
+ gsf_input_set_size (GSF_INPUT (input),
+ ec->get_size (ec->cls));
+ gsf_input_seek_emulate (GSF_INPUT (input),
+ 0);
+ input->input.name = NULL;
+ input->input.container = NULL;
+ input->priv->ec = ec;
+
+ return GSF_INPUT (input);
+}
+
+
+
/* *********************** end of custom GSF input method ************* */
@@ -702,17 +870,50 @@ EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
unsigned int lid;
const char *lang;
int ret;
+ void *data;
+ uint64_t fsize;
+ ssize_t data_size;
- if (size < 512 + 898)
- return; /* can hardly be OLE2 */
- if (NULL == (input = gsf_input_memory_new ((const guint8 *) data,
- (gsf_off_t) size,
- FALSE)))
+ fsize = ec->get_size (ec->cls);
+ if (fsize < 512 + 898)
+ {
+ /* File too small for OLE2 */
+ return; /* can hardly be OLE2 */
+ }
+ if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize)))
+ {
+ /* Failed to read minimum file size to buffer */
+ return;
+ }
+ data512 = (const unsigned char*) data + 512;
+ lid = data512[6] + (data512[7] << 8);
+ if ( (NULL != (lang = lid_to_language (lid))) &&
+ (0 != (ret = add_metadata (ec->proc, ec->cls,
+ lang,
+ EXTRACTOR_METATYPE_LANGUAGE))) )
return;
+ lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
+ fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
+ if (0 != ec->seek (ec->cls, 0, SEEK_SET))
+ {
+ /* seek failed!? */
+ return;
+ }
+#if USE_LE_INPUT
+ if (NULL == (input = le_input_new (ec)))
+ {
+ fprintf (stderr, "le_input_new failed\n");
+ return;
+ }
+#else
+ input = gsf_input_memory_new ((const guint8 *) data,
+ data_size,
+ FALSE);
+#endif
if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
{
g_object_unref (G_OBJECT (input));
- return 0;
+ return;
}
ret = 0;
for (i=0;i<gsf_infile_num_children (infile);i++)
@@ -722,32 +923,23 @@ EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
if (NULL == (name = gsf_infile_name_by_index (infile, i)))
continue;
src = NULL;
- if ( ( (0 == strcmp(name, "\005SummaryInformation")) ||
- (0 == strcmp(name, "\005DocumentSummaryInformation")) ) &&
+ if ( ( (0 == strcmp (name, "\005SummaryInformation")) ||
+ (0 == strcmp (name, "\005DocumentSummaryInformation")) ) &&
(NULL != (src = gsf_infile_child_by_index (infile, i))) )
ret = process (src,
- proc,
- proc_cls);
+ ec->proc,
+ ec->cls);
if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
(NULL != (src = gsf_infile_child_by_index (infile, i))) )
ret = process_star_office (src,
- proc,
- proc_cls);
+ ec->proc,
+ ec->cls);
if (NULL != src)
g_object_unref (G_OBJECT (src));
}
if (0 != ret)
goto CLEANUP;
- data512 = (const unsigned char*) &data[512];
- lid = data512[6] + (data512[7] << 8);
- if ( (NULL != (lang = lid_to_language (lid))) &&
- (0 != (ret = add_metadata (proc, proc_cls,
- lang,
- EXTRACTOR_METATYPE_LANGUAGE))) )
- goto CLEANUP;
- lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
- fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
if (lcb < 6)
goto CLEANUP;
for (i=0;i<gsf_infile_num_children (infile);i++)
@@ -763,14 +955,13 @@ EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
ret = history_extract (src,
lcb,
fcb,
- proc, proc_cls);
+ ec->proc, ec->cls);
g_object_unref (G_OBJECT (src));
}
}
CLEANUP:
g_object_unref (G_OBJECT (infile));
g_object_unref (G_OBJECT (input));
- return ret;
}