aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christian Grothoff <christian@grothoff.org> 2012-08-10 19:22:14 +0000
committer Christian Grothoff <christian@grothoff.org> 2012-08-10 19:22:14 +0000
commit 194df7957ff6877e4a3899b631c7e4435d77e193 (patch)
tree 5c70abb6799752f36136cd08a37c9cf53a9e026b
parent 72944e8c23a2a0c2592569ffb7a6f76e09258bbb (diff)
download libextractor-194df7957ff6877e4a3899b631c7e4435d77e193.tar.gz
libextractor-194df7957ff6877e4a3899b631c7e4435d77e193.zip
implementing OLE2 support
-rw-r--r-- configure.ac 2
-rw-r--r-- src/plugins/Makefile.am 29
-rw-r--r-- src/plugins/ole2_extractor.c 305
3 files changed, 276 insertions, 60 deletions
diff --git a/configure.ac b/configure.ac
index 071145c..d63e9b9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -575,7 +575,7 @@ AM_CONDITIONAL(HAVE_QT_SVG4, test x$qt_svg4 != x0)
575 575
576ABI_GSF 576ABI_GSF
577 577
578AM_CONDITIONAL(WITH_GSF, test "x$have_gsf" = "xtrue") 578AM_CONDITIONAL(HAVE_GSF, test "x$have_gsf" = "xtrue")
579 579
580# produce new line 580# produce new line
581echo "" 581echo ""
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
index e8cf3fb..6741861 100644
--- a/src/plugins/Makefile.am
+++ b/src/plugins/Makefile.am
@@ -67,6 +67,11 @@ PLUGIN_EXIV2=libextractor_exiv2.la
67TEST_EXIV2=test_exiv2 67TEST_EXIV2=test_exiv2
68endif 68endif
69 69
70if HAVE_GSF
71PLUGIN_GSF=libextractor_ole2.la
72TEST_GSF=test_ole2
73endif
74
70 75
71plugin_LTLIBRARIES = \ 76plugin_LTLIBRARIES = \
72 libextractor_it.la \ 77 libextractor_it.la \
@@ -80,7 +85,8 @@ plugin_LTLIBRARIES = \
80 $(PLUGIN_FLAC) \ 85 $(PLUGIN_FLAC) \
81 $(PLUGIN_MPEG) \ 86 $(PLUGIN_MPEG) \
82 $(PLUGIN_JPEG) \ 87 $(PLUGIN_JPEG) \
83 $(PLUGIN_EXIV2) 88 $(PLUGIN_EXIV2) \
89 $(PLUGIN_GSF)
84 90
85if HAVE_ZZUF 91if HAVE_ZZUF
86 fuzz_tests=fuzz_default.sh 92 fuzz_tests=fuzz_default.sh
@@ -97,7 +103,8 @@ check_PROGRAMS = \
97 $(TEST_FLAC) \ 103 $(TEST_FLAC) \
98 $(TEST_MPEG) \ 104 $(TEST_MPEG) \
99 $(TEST_JPEG) \ 105 $(TEST_JPEG) \
100 $(TEST_EXIV2) 106 $(TEST_EXIV2) \
107 $(TEST_GSF)
101 108
102TESTS = \ 109TESTS = \
103 $(fuzz_tests) \ 110 $(fuzz_tests) \
@@ -256,3 +263,21 @@ test_exiv2_LDADD = \
256 $(top_builddir)/src/plugins/libtest.la 263 $(top_builddir)/src/plugins/libtest.la
257 264
258 265
266libextractor_ole2_la_SOURCES = \
267 ole2_extractor.c
268libextractor_ole2_la_CFLAGS = \
269 $(GSF_CFLAGS)
270libextractor_ole2_la_CPPFLAGS = \
271 $(GSF_CFLAGS)
272libextractor_ole2_la_LDFLAGS = \
273 $(PLUGINFLAGS)
274libextractor_ole2_la_LIBADD = \
275 $(top_builddir)/src/common/libextractor_common.la \
276 $(GSF_LIBS)
277
278test_ole2_SOURCES = \
279 test_ole2.c
280test_ole2_LDADD = \
281 $(top_builddir)/src/plugins/libtest.la
282
283
diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c
index afa451e..ccfc3cb 100644
--- a/src/plugins/ole2_extractor.c
+++ b/src/plugins/ole2_extractor.c
@@ -37,12 +37,21 @@
37#include <stdio.h> 37#include <stdio.h>
38#include <ctype.h> 38#include <ctype.h>
39#include <gsf/gsf-utils.h> 39#include <gsf/gsf-utils.h>
40#include <gsf/gsf-input-impl.h>
40#include <gsf/gsf-input-memory.h> 41#include <gsf/gsf-input-memory.h>
42#include <gsf/gsf-impl-utils.h>
41#include <gsf/gsf-infile.h> 43#include <gsf/gsf-infile.h>
42#include <gsf/gsf-infile-msole.h> 44#include <gsf/gsf-infile-msole.h>
43#include <gsf/gsf-msole-utils.h> 45#include <gsf/gsf-msole-utils.h>
44 46
45#define DEBUG_OLE2 0 47
48/**
49 * Set to 1 to use our own GsfInput subclass which supports seeking
50 * and thus can handle very large files. Set to 0 to use the simple
51 * gsf in-memory buffer (which can only access the first ~16k) for
52 * debugging.
53 */
54#define USE_LE_INPUT 1
46 55
47 56
48/** 57/**
@@ -325,7 +334,7 @@ process_star_office (GsfInput *src,
325 if ( (buf[0xd5] + buf[0xd4] > 0) && 334 if ( (buf[0xd5] + buf[0xd4] > 0) &&
326 (0 != add_metadata (proc, proc_cls, 335 (0 != add_metadata (proc, proc_cls,
327 &buf[0xd6], 336 &buf[0xd6],
328 EXTRACTOR_METATYPE_SUBJECT)) _) 337 EXTRACTOR_METATYPE_SUBJECT)) )
329 return 1; 338 return 1;
330 buf[0x215] = '\0'; 339 buf[0x215] = '\0';
331 if ( (buf[0x115] + buf[0x116] > 0) && 340 if ( (buf[0x115] + buf[0x116] > 0) &&
@@ -450,7 +459,7 @@ lid_to_language (unsigned int lid)
450 case 0x041b: 459 case 0x041b:
451 return __("Slovak"); 460 return __("Slovak");
452 case 0x041c: 461 case 0x041c:
453 return __("Albanian"); 462 return __("Albanian");
454 case 0x041d: 463 case 0x041d:
455 return __("Swedish"); 464 return __("Swedish");
456 case 0x041e: 465 case 0x041e:
@@ -583,13 +592,24 @@ history_extract (GsfInput *stream,
583 592
584/* *************************** custom GSF input method ***************** */ 593/* *************************** custom GSF input method ***************** */
585 594
586G_BEGIN_DECLS
587#define LE_TYPE_INPUT (le_input_get_type ()) 595#define LE_TYPE_INPUT (le_input_get_type ())
588#define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), TYPE_LE_INPUT, LeInput)) 596#define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), LE_TYPE_INPUT, LeInput))
589#define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), TYPE_LE_INPUT, LeInputClass)) 597#define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), LE_TYPE_INPUT, LeInputClass))
590#define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), TYPE_LE_INPUT)) 598#define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), LE_TYPE_INPUT))
591#define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), TYPE_LE_INPUT)) 599#define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), LE_TYPE_INPUT))
592#define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), TYPE_LE_INPUT, LeInputClass)) 600#define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), LE_TYPE_INPUT, LeInputClass))
601
/**
 * Private, per-instance data of an "LeInput" object.
 */
typedef struct _LeInputPrivate
{

  /**
   * Extraction context this input wraps; reads and seeks on the
   * LeInput are forwarded to it.
   */
  struct EXTRACTOR_ExtractContext *ec;

} LeInputPrivate;
612
593 613
594/** 614/**
595 * Overall state of an "LeInput" object. 615 * Overall state of an "LeInput" object.
@@ -610,18 +630,6 @@ typedef struct _LeInput
610 630
611 631
612/** 632/**
613 * Internal state of an "LeInput" object.
614 */
615typedef struct _LeInputPrivate
616{
617 /**
618 * Our extraction context.
619 */
620 struct EXTRACTOR_ExtractContext *ec;
621} LeInputPrivate;
622
623
624/**
625 * LeInput's class state. 633 * LeInput's class state.
626 */ 634 */
627typedef struct _LeInputClass 635typedef struct _LeInputClass
@@ -640,13 +648,6 @@ typedef struct _LeInputClass
640 648
641 649
642/** 650/**
643 * Required method to obtain the LeInput "type".
644 */
645GType
646le_input_get_type (void) G_GNUC_CONST;
647
648
649/**
650 * Constructor for LeInput objects. 651 * Constructor for LeInput objects.
651 * 652 *
652 * @param ec extraction context to use 653 * @param ec extraction context to use
@@ -654,31 +655,198 @@ le_input_get_type (void) G_GNUC_CONST;
654 */ 655 */
655GsfInput * 656GsfInput *
656le_input_new (struct EXTRACTOR_ExtractContext *ec); 657le_input_new (struct EXTRACTOR_ExtractContext *ec);
657G_END_DECLS
658 658
659 659
660/** 660/**
661 * Macro to create LeInput type definition. 661 * Class initializer for the "LeInput" class.
662 *
663 * @param class class object to initialize
664 */
665static void
666le_input_class_init (LeInputClass *class);
667
668
669/**
670 * Initialize internal state of fresh input object.
671 *
672 * @param input object to initialize
673 */
674static void
675le_input_init (LeInput *input);
676
677
678/**
679 * Macro to create LeInput type definition and register the class.
680 */
681GSF_CLASS (LeInput, le_input, le_input_class_init, le_input_init, GSF_INPUT_TYPE)
682
683
684/**
685 * Duplicate input, leaving the new one at the same offset.
686 *
687 * @param input the input to duplicate
688 * @param err location for error reporting, can be NULL
689 * @return NULL on error (always)
690 */
691static GsfInput *
692le_input_dup (GsfInput *input,
693 GError **err)
694{
695 if (NULL != err)
696 *err = g_error_new (gsf_input_error_id (), 0,
697 "dup not supported on LeInput");
698 return NULL;
699}
700
701
702/**
703 * Read at least num_bytes. Does not change the current position if
704 * there is an error. Will only read if the entire amount can be
705 * read. Invalidates the buffer associated with previous calls to
706 * gsf_input_read.
707 *
708 * @param input
709 * @param num_bytes
710 * @param optional_buffer
711 * @return buffer where num_bytes data are available, or NULL on error
712 */
713static const guint8 *
714le_input_read (GsfInput *input,
715 size_t num_bytes,
716 guint8 *optional_buffer)
717{
718 LeInput *li = LE_INPUT (input);
719 struct EXTRACTOR_ExtractContext *ec;
720 void *buf;
721 uint64_t old_off;
722 ssize_t ret;
723
724 ec = li->priv->ec;
725 old_off = ec->seek (ec->cls, 0, SEEK_CUR);
726 if (num_bytes
727 != (ret = ec->read (ec->cls,
728 &buf,
729 num_bytes)))
730 {
731 /* we don't support partial reads;
732 most other GsfInput implementations in this case
733 allocate some huge temporary buffer just to avoid
734 the partial read; we might need to do that as well!? */
735 ec->seek (ec->cls, SEEK_SET, old_off);
736 return NULL;
737 }
738 if (NULL != optional_buffer)
739 {
740 memcpy (optional_buffer, buf, num_bytes);
741 return optional_buffer;
742 }
743 return buf;
744}
745
746
747/**
748 * Move the current location in an input stream
749 *
750 * @param input stream to seek
751 * @param offset target offset
752 * @param whence determines to what the offset is relative to
753 * @return TRUE on error
662 */ 754 */
663G_DEFINE_TYPE (LeInput, le_input, GSF_TYPE_INPUT) 755static gboolean
756le_input_seek (GsfInput *input,
757 gsf_off_t offset,
758 GSeekType whence)
759{
760 LeInput *li = LE_INPUT (input);
761 struct EXTRACTOR_ExtractContext *ec;
762 int w;
763 int64_t ret;
764
765 ec = li->priv->ec;
766 switch (whence)
767 {
768 case G_SEEK_SET:
769 w = SEEK_SET;
770 break;
771 case G_SEEK_CUR:
772 w = SEEK_CUR;
773 break;
774 case G_SEEK_END:
775 w = SEEK_END;
776 break;
777 default:
778 return TRUE;
779 }
780 if (-1 ==
781 (ret = ec->seek (ec->cls,
782 offset,
783 w)))
784 return TRUE;
785 return FALSE;
786}
664 787
665 788
666/** 789/**
790 * Class initializer for the "LeInput" class.
667 * 791 *
792 * @param class class object to initialize
668 */ 793 */
669static void 794static void
670le_input_class_init (LeInputClass *class) 795le_input_class_init (LeInputClass *class)
671{ 796{
672 // GObjectClass *gobject_class;
673 GsfInputClass *input_class; 797 GsfInputClass *input_class;
674 798
675 // gobject_class = (GObjectClass *) class;
676 input_class = (GsfInputClass *) class; 799 input_class = (GsfInputClass *) class;
677 input_class->read = le_input_read; 800 input_class->Dup = le_input_dup;
801 input_class->Read = le_input_read;
802 input_class->Seek = le_input_seek;
678 g_type_class_add_private (class, sizeof (LeInputPrivate)); 803 g_type_class_add_private (class, sizeof (LeInputPrivate));
679} 804}
680 805
681 806
807/**
808 * Initialize internal state of fresh input object.
809 *
810 * @param input object to initialize
811 */
812static void
813le_input_init (LeInput *input)
814{
815 LeInputPrivate *priv;
816
817 input->priv =
818 G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT,
819 LeInputPrivate);
820 priv = input->priv;
821 priv->ec = NULL;
822}
823
824
825/**
826 * Creates a new LeInput object.
827 *
828 * @param ec extractor context to wrap
829 * @return NULL on error
830 */
831GsfInput *
832le_input_new (struct EXTRACTOR_ExtractContext *ec)
833{
834 LeInput *input;
835
836 input = g_object_new (LE_TYPE_INPUT, NULL);
837 gsf_input_set_size (GSF_INPUT (input),
838 ec->get_size (ec->cls));
839 gsf_input_seek_emulate (GSF_INPUT (input),
840 0);
841 input->input.name = NULL;
842 input->input.container = NULL;
843 input->priv->ec = ec;
844
845 return GSF_INPUT (input);
846}
847
848
849
682 850
683/* *********************** end of custom GSF input method ************* */ 851/* *********************** end of custom GSF input method ************* */
684 852
@@ -702,17 +870,50 @@ EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
702 unsigned int lid; 870 unsigned int lid;
703 const char *lang; 871 const char *lang;
704 int ret; 872 int ret;
873 void *data;
874 uint64_t fsize;
875 ssize_t data_size;
705 876
706 if (size < 512 + 898) 877 fsize = ec->get_size (ec->cls);
707 return; /* can hardly be OLE2 */ 878 if (fsize < 512 + 898)
708 if (NULL == (input = gsf_input_memory_new ((const guint8 *) data, 879 {
709 (gsf_off_t) size, 880 /* File too small for OLE2 */
710 FALSE))) 881 return; /* can hardly be OLE2 */
882 }
883 if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize)))
884 {
885 /* Failed to read minimum file size to buffer */
886 return;
887 }
888 data512 = (const unsigned char*) data + 512;
889 lid = data512[6] + (data512[7] << 8);
890 if ( (NULL != (lang = lid_to_language (lid))) &&
891 (0 != (ret = add_metadata (ec->proc, ec->cls,
892 lang,
893 EXTRACTOR_METATYPE_LANGUAGE))) )
711 return; 894 return;
895 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
896 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
897 if (0 != ec->seek (ec->cls, 0, SEEK_SET))
898 {
899 /* seek failed!? */
900 return;
901 }
902#if USE_LE_INPUT
903 if (NULL == (input = le_input_new (ec)))
904 {
905 fprintf (stderr, "le_input_new failed\n");
906 return;
907 }
908#else
909 input = gsf_input_memory_new ((const guint8 *) data,
910 data_size,
911 FALSE);
912#endif
712 if (NULL == (infile = gsf_infile_msole_new (input, NULL))) 913 if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
713 { 914 {
714 g_object_unref (G_OBJECT (input)); 915 g_object_unref (G_OBJECT (input));
715 return 0; 916 return;
716 } 917 }
717 ret = 0; 918 ret = 0;
718 for (i=0;i<gsf_infile_num_children (infile);i++) 919 for (i=0;i<gsf_infile_num_children (infile);i++)
@@ -722,32 +923,23 @@ EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
722 if (NULL == (name = gsf_infile_name_by_index (infile, i))) 923 if (NULL == (name = gsf_infile_name_by_index (infile, i)))
723 continue; 924 continue;
724 src = NULL; 925 src = NULL;
725 if ( ( (0 == strcmp(name, "\005SummaryInformation")) || 926 if ( ( (0 == strcmp (name, "\005SummaryInformation")) ||
726 (0 == strcmp(name, "\005DocumentSummaryInformation")) ) && 927 (0 == strcmp (name, "\005DocumentSummaryInformation")) ) &&
727 (NULL != (src = gsf_infile_child_by_index (infile, i))) ) 928 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
728 ret = process (src, 929 ret = process (src,
729 proc, 930 ec->proc,
730 proc_cls); 931 ec->cls);
731 if ( (0 == strcmp (name, "SfxDocumentInfo")) && 932 if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
732 (NULL != (src = gsf_infile_child_by_index (infile, i))) ) 933 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
733 ret = process_star_office (src, 934 ret = process_star_office (src,
734 proc, 935 ec->proc,
735 proc_cls); 936 ec->cls);
736 if (NULL != src) 937 if (NULL != src)
737 g_object_unref (G_OBJECT (src)); 938 g_object_unref (G_OBJECT (src));
738 } 939 }
739 if (0 != ret) 940 if (0 != ret)
740 goto CLEANUP; 941 goto CLEANUP;
741 942
742 data512 = (const unsigned char*) &data[512];
743 lid = data512[6] + (data512[7] << 8);
744 if ( (NULL != (lang = lid_to_language (lid))) &&
745 (0 != (ret = add_metadata (proc, proc_cls,
746 lang,
747 EXTRACTOR_METATYPE_LANGUAGE))) )
748 goto CLEANUP;
749 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
750 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
751 if (lcb < 6) 943 if (lcb < 6)
752 goto CLEANUP; 944 goto CLEANUP;
753 for (i=0;i<gsf_infile_num_children (infile);i++) 945 for (i=0;i<gsf_infile_num_children (infile);i++)
@@ -763,14 +955,13 @@ EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
763 ret = history_extract (src, 955 ret = history_extract (src,
764 lcb, 956 lcb,
765 fcb, 957 fcb,
766 proc, proc_cls); 958 ec->proc, ec->cls);
767 g_object_unref (G_OBJECT (src)); 959 g_object_unref (G_OBJECT (src));
768 } 960 }
769 } 961 }
770 CLEANUP: 962 CLEANUP:
771 g_object_unref (G_OBJECT (infile)); 963 g_object_unref (G_OBJECT (infile));
772 g_object_unref (G_OBJECT (input)); 964 g_object_unref (G_OBJECT (input));
773 return ret;
774} 965}
775 966
776 967