libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 4431afd4ebdc03782a4f83b9c5705c86f4be4b1a
parent 15a096f7b058ce2a27e4d4177b83da1ed0317f5e
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun,  4 Sep 2005 07:47:58 +0000

rewrite

Diffstat:
Msrc/plugins/ole2/Makefile.am | 16++++++----------
Msrc/plugins/ole2/gsf-infile-msole.c | 868-------------------------------------------------------------------------------
Msrc/plugins/ole2/gsf-infile-msole.h | 2--
Msrc/plugins/ole2/gsf-input.c | 231-------------------------------------------------------------------------------
Msrc/plugins/ole2/gsf-utils.c | 230-------------------------------------------------------------------------------
Msrc/plugins/ole2/gsf-utils.h | 36------------------------------------
Msrc/plugins/ole2/ole2extractor.c | 1444+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
7 files changed, 1425 insertions(+), 1402 deletions(-)

diff --git a/src/plugins/ole2/Makefile.am b/src/plugins/ole2/Makefile.am @@ -1,6 +1,7 @@ INCLUDES = \ -I$(top_srcdir)/src/include +EXTRA_DIST = SYMBOLS LIBS = \ @LTLIBINTL@ @LIBS@ @@ -8,7 +9,8 @@ LIBS = \ # install plugins under: plugindir = $(libdir)/libextractor -PLUGINFLAGS = -export-dynamic -avoid-version -module +PLUGINFLAGS = -Wl,-Bsymbolic -avoid-version -module -no-undefined + plugin_LTLIBRARIES = \ libextractor_ole2.la @@ -16,17 +18,11 @@ plugin_LTLIBRARIES = \ AM_CFLAGS = $(GLIB_CFLAGS) libextractor_ole2_la_CFLAGS = \ - $(GLIB_CFLAGS) + $(GLIB_CFLAGS) libextractor_ole2_la_LIBADD = \ $(LIBADD) $(GLIB_LIBS) -lgobject-2.0 \ $(top_builddir)/src/main/libextractor.la libextractor_ole2_la_LDFLAGS = \ - $(PLUGINFLAGS) + $(PLUGINFLAGS) -Wl,--retain-symbols-file -Wl,SYMBOLS libextractor_ole2_la_SOURCES = \ - ole2extractor.c \ - gsf-utils.c \ - gsf-utils.h \ - gsf-input.c \ - gsf-input.h \ - gsf-infile-msole.c \ - gsf-infile-msole.h + ole2extractor.c diff --git a/src/plugins/ole2/gsf-infile-msole.c b/src/plugins/ole2/gsf-infile-msole.c @@ -21,871 +21,3 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA */ - -#include "platform.h" -#include <glib-object.h> -#include "gsf-input.h" -#include "gsf-infile-msole.h" -#include "gsf-utils.h" - -#include <string.h> -#include <stdio.h> - -#define OLE_HEADER_SIZE 0x200 /* independent of big block size size */ -#define OLE_HEADER_SIGNATURE 0x00 -#define OLE_HEADER_CLSID 0x08 /* See ReadClassStg */ -#define OLE_HEADER_MINOR_VER 0x18 /* 0x33 and 0x3e have been seen */ -#define OLE_HEADER_MAJOR_VER 0x1a /* 0x3 been seen in wild */ -#define OLE_HEADER_BYTE_ORDER 0x1c /* 0xfe 0xff == Intel Little Endian */ -#define OLE_HEADER_BB_SHIFT 0x1e -#define OLE_HEADER_SB_SHIFT 0x20 -/* 0x22..0x27 reserved == 0 */ -#define OLE_HEADER_CSECTDIR 0x28 -#define OLE_HEADER_NUM_BAT 0x2c -#define OLE_HEADER_DIRENT_START 0x30 -/* 0x34..0x37 transacting signature must be 0 */ -#define OLE_HEADER_THRESHOLD 0x38 -#define OLE_HEADER_SBAT_START 0x3c -#define OLE_HEADER_NUM_SBAT 0x40 -#define OLE_HEADER_METABAT_BLOCK 0x44 -#define OLE_HEADER_NUM_METABAT 0x48 -#define OLE_HEADER_START_BAT 0x4c -#define BAT_INDEX_SIZE 4 -#define OLE_HEADER_METABAT_SIZE ((OLE_HEADER_SIZE - OLE_HEADER_START_BAT) / BAT_INDEX_SIZE) - -#define DIRENT_MAX_NAME_SIZE 0x40 -#define DIRENT_DETAILS_SIZE 0x40 -#define DIRENT_SIZE (DIRENT_MAX_NAME_SIZE + DIRENT_DETAILS_SIZE) -#define DIRENT_NAME_LEN 0x40 /* length in bytes incl 0 terminator */ -#define DIRENT_TYPE 0x42 -#define DIRENT_COLOUR 0x43 -#define DIRENT_PREV 0x44 -#define DIRENT_NEXT 0x48 -#define DIRENT_CHILD 0x4c -#define DIRENT_CLSID 0x50 /* only for dirs */ -#define DIRENT_USERFLAGS 0x60 /* only for dirs */ -#define DIRENT_CREATE_TIME 0x64 /* for files */ -#define DIRENT_MODIFY_TIME 0x6c /* for files */ -#define DIRENT_FIRSTBLOCK 0x74 -#define DIRENT_FILE_SIZE 0x78 -/* 0x7c..0x7f reserved == 0 */ - -#define DIRENT_TYPE_INVALID 0 -#define DIRENT_TYPE_DIR 1 -#define DIRENT_TYPE_FILE 2 -#define DIRENT_TYPE_LOCKBYTES 3 /* ? */ -#define DIRENT_TYPE_PROPERTY 4 /* ? */ -#define DIRENT_TYPE_ROOTDIR 5 -#define DIRENT_MAGIC_END 0xffffffff - -/* flags in the block allocation list to denote special blocks */ -#define BAT_MAGIC_UNUSED 0xffffffff /* -1 */ -#define BAT_MAGIC_END_OF_CHAIN 0xfffffffe /* -2 */ -#define BAT_MAGIC_BAT 0xfffffffd /* a bat block, -3 */ -#define BAT_MAGIC_METABAT 0xfffffffc /* a metabat block -4 */ - - - - -typedef struct { - guint32 *block; - guint32 num_blocks; -} MSOleBAT; - -typedef struct { - char *name; - char *collation_name; - int index; - size_t size; - gboolean use_sb; - guint32 first_block; - gboolean is_directory; - GList *children; - unsigned char clsid[16]; /* 16 byte GUID used by some apps */ -} MSOleDirent; - -typedef struct { - struct { - MSOleBAT bat; - unsigned shift; - unsigned filter; - size_t size; - } bb, sb; - off_t max_block; - guint32 threshold; /* transition between small and big blocks */ - guint32 sbat_start, num_sbat; - - MSOleDirent *root_dir; - struct GsfInput *sb_file; - - int ref_count; -} MSOleInfo; - -typedef struct GsfInfileMSOle { - off_t size; - off_t cur_offset; - struct GsfInput *input; - MSOleInfo *info; - MSOleDirent *dirent; - MSOleBAT bat; - off_t cur_block; - - struct { - guint8 *buf; - size_t buf_size; - } stream; -} GsfInfileMSOle; - -/* utility macros */ -#define OLE_BIG_BLOCK(index, ole) ((index) >> ole->info->bb.shift) - -static struct GsfInput *gsf_infile_msole_new_child (GsfInfileMSOle *parent, - MSOleDirent *dirent); - -/** - * ole_get_block : - * @ole : the infile - * @block : - * @buffer : optionally NULL - * - * Read a block of data from the underlying input. - * Be really anal. - **/ -static const guint8 * -ole_get_block (const GsfInfileMSOle *ole, guint32 block, guint8 *buffer) -{ - g_return_val_if_fail (block < ole->info->max_block, NULL); - - /* OLE_HEADER_SIZE is fixed at 512, but the sector containing the - * header is padded out to bb.size (sector size) when bb.size > 512. */ - if (gsf_input_seek (ole->input, - (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (block << ole->info->bb.shift)), - SEEK_SET) < 0) - return NULL; - - return gsf_input_read (ole->input, ole->info->bb.size, buffer); -} - -/** - * ole_make_bat : - * @metabat : a meta bat to connect to the raw blocks (small or large) - * @size_guess : An optional guess as to how many blocks are in the file - * @block : The first block in the list. - * @res : where to store the result. - * - * Walk the linked list of the supplied block allocation table and build up a - * table for the list starting in @block. - * - * Returns TRUE on error. - */ -static gboolean -ole_make_bat (MSOleBAT const *metabat, size_t size_guess, guint32 block, - MSOleBAT *res) -{ - /* NOTE : Only use size as a suggestion, sometimes it is wrong */ - GArray *bat = g_array_sized_new (FALSE, FALSE, - sizeof (guint32), size_guess); - - guint8 *used = (guint8*)g_alloca (1 + metabat->num_blocks / 8); - memset (used, 0, 1 + metabat->num_blocks / 8); - - if (block < metabat->num_blocks) - do { - /* Catch cycles in the bat list */ - g_return_val_if_fail (0 == (used[block/8] & (1 << (block & 0x7))), TRUE); - used[block/8] |= 1 << (block & 0x7); - - g_array_append_val (bat, block); - block = metabat->block [block]; - } while (block < metabat->num_blocks); - - res->block = NULL; - - res->num_blocks = bat->len; - res->block = (guint32 *) (gpointer) g_array_free (bat, FALSE); - - if (block != BAT_MAGIC_END_OF_CHAIN) { -#if 0 - g_warning ("This OLE2 file is invalid.\n" - "The Block Allocation Table for one of the streams had %x instead of a terminator (%x).\n" - "We might still be able to extract some data, but you'll want to check the file.", - block, BAT_MAGIC_END_OF_CHAIN); -#endif - } - - return FALSE; -} - -static void -ols_bat_release (MSOleBAT *bat) -{ - if (bat->block != NULL) { - g_free (bat->block); - bat->block = NULL; - bat->num_blocks = 0; - } -} - -/** - * ole_info_read_metabat : - * @ole : - * @bats : - * - * A small utility routine to read a set of references to bat blocks - * either from the OLE header, or a meta-bat block. - * - * Returns a pointer to the element after the last position filled. - **/ -static guint32 * -ole_info_read_metabat (GsfInfileMSOle *ole, guint32 *bats, guint32 max, - guint32 const *metabat, guint32 const *metabat_end) -{ - guint8 const *bat, *end; - - for (; metabat < metabat_end; metabat++) { - bat = ole_get_block (ole, *metabat, NULL); - if (bat == NULL) - return NULL; - end = bat + ole->info->bb.size; - for ( ; bat < end ; bat += BAT_INDEX_SIZE, bats++) { - *bats = GSF_LE_GET_GUINT32 (bat); - g_return_val_if_fail (*bats < max || - *bats >= BAT_MAGIC_METABAT, NULL); - } - } - return bats; -} - -/** - * gsf_ole_get_guint32s : - * @dst : - * @src : - * @num_bytes : - * - * Copy some some raw data into an array of guint32. - **/ -static void -gsf_ole_get_guint32s (guint32 *dst, guint8 const *src, int num_bytes) -{ - for (; (num_bytes -= BAT_INDEX_SIZE) >= 0 ; src += BAT_INDEX_SIZE) - *dst++ = GSF_LE_GET_GUINT32 (src); -} - -static struct GsfInput * -ole_info_get_sb_file (GsfInfileMSOle *parent) -{ - MSOleBAT meta_sbat; - - if (parent->info->sb_file != NULL) - return parent->info->sb_file; - - parent->info->sb_file = gsf_infile_msole_new_child (parent, - parent->info->root_dir); - - if (NULL == parent->info->sb_file) - return NULL; - - g_return_val_if_fail (parent->info->sb.bat.block == NULL, NULL); - - if (ole_make_bat (&parent->info->bb.bat, - parent->info->num_sbat, - parent->info->sbat_start, - &meta_sbat)) { - return NULL; - } - - parent->info->sb.bat.num_blocks = meta_sbat.num_blocks * (parent->info->bb.size / BAT_INDEX_SIZE); - parent->info->sb.bat.block = g_new0 (guint32, parent->info->sb.bat.num_blocks); - ole_info_read_metabat (parent, parent->info->sb.bat.block, - parent->info->sb.bat.num_blocks, - meta_sbat.block, meta_sbat.block + meta_sbat.num_blocks); - ols_bat_release (&meta_sbat); - - return parent->info->sb_file; -} - -static gint -ole_dirent_cmp (const MSOleDirent *a, const MSOleDirent *b) -{ - g_return_val_if_fail (a, 0); - g_return_val_if_fail (b, 0); - - g_return_val_if_fail (a->collation_name, 0); - g_return_val_if_fail (b->collation_name, 0); - - return strcmp (b->collation_name, a->collation_name); -} - -/** - * ole_dirent_new : - * @ole : - * @entry : - * @parent : optional - * - * Parse dirent number @entry and recursively handle its siblings and children. - **/ -static MSOleDirent * -ole_dirent_new (GsfInfileMSOle *ole, guint32 entry, MSOleDirent *parent) -{ - MSOleDirent *dirent; - guint32 block, next, prev, child, size; - guint8 const *data; - guint8 type; - guint16 name_len; - - if (entry >= DIRENT_MAGIC_END) - return NULL; - - block = OLE_BIG_BLOCK (entry * DIRENT_SIZE, ole); - - g_return_val_if_fail (block < ole->bat.num_blocks, NULL); - data = ole_get_block (ole, ole->bat.block [block], NULL); - if (data == NULL) - return NULL; - data += (DIRENT_SIZE * entry) % ole->info->bb.size; - - type = GSF_LE_GET_GUINT8 (data + DIRENT_TYPE); - if (type != DIRENT_TYPE_DIR && - type != DIRENT_TYPE_FILE && - type != DIRENT_TYPE_ROOTDIR) { -#if 0 - g_warning ("Unknown stream type 0x%x", type); -#endif - return NULL; - } - - /* It looks like directory (and root directory) sizes are sometimes bogus */ - size = GSF_LE_GET_GUINT32 (data + DIRENT_FILE_SIZE); - g_return_val_if_fail (type == DIRENT_TYPE_DIR || type == DIRENT_TYPE_ROOTDIR || - size <= (guint32)gsf_input_size(ole->input), NULL); - - dirent = g_new0 (MSOleDirent, 1); - dirent->index = entry; - dirent->size = size; - /* Store the class id which is 16 byte identifier used by some apps */ - memcpy(dirent->clsid, data + DIRENT_CLSID, sizeof(dirent->clsid)); - - /* root dir is always big block */ - dirent->use_sb = parent && (size < ole->info->threshold); - dirent->first_block = (GSF_LE_GET_GUINT32 (data + DIRENT_FIRSTBLOCK)); - dirent->is_directory = (type != DIRENT_TYPE_FILE); - dirent->children = NULL; - prev = GSF_LE_GET_GUINT32 (data + DIRENT_PREV); - next = GSF_LE_GET_GUINT32 (data + DIRENT_NEXT); - child = GSF_LE_GET_GUINT32 (data + DIRENT_CHILD); - name_len = GSF_LE_GET_GUINT16 (data + DIRENT_NAME_LEN); - dirent->name = NULL; - if (0 < name_len && name_len <= DIRENT_MAX_NAME_SIZE) { - gunichar2 uni_name [DIRENT_MAX_NAME_SIZE+1]; - gchar const *end; - int i; - - /* !#%!@$#^ - * Sometimes, rarely, people store the stream name as ascii - * rather than utf16. Do a validation first just in case. - */ - if (!g_utf8_validate (data, -1, &end) || - ((guint8 const *)end - data + 1) != name_len) { - /* be wary about endianness */ - for (i = 0 ; i < name_len ; i += 2) - uni_name [i/2] = GSF_LE_GET_GUINT16 (data + i); - uni_name [i/2] = 0; - - dirent->name = g_utf16_to_utf8 (uni_name, -1, NULL, NULL, NULL); - } else - dirent->name = g_strndup ((gchar *)data, (gsize)((guint8 const *)end - data + 1)); - } - /* be really anal in the face of screwups */ - if (dirent->name == NULL) - dirent->name = g_strdup (""); - dirent->collation_name = g_utf8_collate_key (dirent->name, -1); - - if (parent != NULL) - parent->children = g_list_insert_sorted (parent->children, - dirent, (GCompareFunc)ole_dirent_cmp); - - /* NOTE : These links are a tree, not a linked list */ - if (prev != entry) - ole_dirent_new (ole, prev, parent); - if (next != entry) - ole_dirent_new (ole, next, parent); - - if (dirent->is_directory) - ole_dirent_new (ole, child, dirent); - return dirent; -} - -static void -ole_dirent_free (MSOleDirent *dirent) -{ - GList *tmp; - g_return_if_fail (dirent != NULL); - - g_free (dirent->name); - g_free (dirent->collation_name); - - for (tmp = dirent->children; tmp; tmp = tmp->next) - ole_dirent_free ((MSOleDirent *)tmp->data); - g_list_free (dirent->children); - g_free (dirent); -} - -/*****************************************************************************/ - -static void -ole_info_unref (MSOleInfo *info) -{ - if (info->ref_count-- != 1) - return; - - ols_bat_release (&info->bb.bat); - ols_bat_release (&info->sb.bat); - if (info->root_dir != NULL) { - ole_dirent_free (info->root_dir); - info->root_dir = NULL; - } - if (info->sb_file != NULL) { - gsf_input_finalize(info->sb_file); - info->sb_file = NULL; - } - g_free (info); -} - -static MSOleInfo * -ole_info_ref (MSOleInfo *info) -{ - info->ref_count++; - return info; -} - -static void -gsf_infile_msole_init (GsfInfileMSOle * ole) -{ - ole->cur_offset = 0; - ole->size = 0; - ole->input = NULL; - ole->info = NULL; - ole->bat.block = NULL; - ole->bat.num_blocks = 0; - ole->cur_block = BAT_MAGIC_UNUSED; - ole->stream.buf = NULL; - ole->stream.buf_size = 0; -} - -/** - * ole_dup : - * @src : - * - * Utility routine to _partially_ replicate a file. It does NOT copy the bat - * blocks, or init the dirent. - * - * Return value: the partial duplicate. - **/ -static GsfInfileMSOle * -ole_dup (GsfInfileMSOle const * src) -{ - GsfInfileMSOle *dst; - struct GsfInput *input; - - g_return_val_if_fail (src != NULL, NULL); - - dst = malloc(sizeof(GsfInfileMSOle)); - if (dst == NULL) - return NULL; - gsf_infile_msole_init(dst); - input = gsf_input_dup (src->input); - if (input == NULL) { - gsf_infile_msole_finalize(dst); - return NULL; - } - dst->input = input; - dst->info = ole_info_ref (src->info); - - /* buf and buf_size are initialized to NULL */ - - return dst; -} - -/** - * ole_init_info : - * @ole : - * - * Read an OLE header and do some sanity checking - * along the way. - * - * Return value: TRUE on error - **/ -static gboolean -ole_init_info (GsfInfileMSOle *ole) -{ - static guint8 const signature[] = - { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 }; - guint8 const *header, *tmp; - guint32 *metabat = NULL; - MSOleInfo *info; - guint32 bb_shift, sb_shift, num_bat, num_metabat, last, dirent_start; - guint32 metabat_block, *ptr; - - /* check the header */ - if (gsf_input_seek (ole->input, (off_t) 0, SEEK_SET) || - NULL == (header = gsf_input_read (ole->input, OLE_HEADER_SIZE, NULL)) || - 0 != memcmp (header, signature, sizeof (signature))) { - return TRUE; - } - - bb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_BB_SHIFT); - sb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_SB_SHIFT); - num_bat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_BAT); - dirent_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_DIRENT_START); - metabat_block = GSF_LE_GET_GUINT32 (header + OLE_HEADER_METABAT_BLOCK); - num_metabat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_METABAT); - - /* Some sanity checks - * 1) There should always be at least 1 BAT block - * 2) It makes no sense to have a block larger than 2^31 for now. - * Maybe relax this later, but not much. - */ - if (6 > bb_shift || bb_shift >= 31 || sb_shift > bb_shift) { - return TRUE; - } - - info = g_new0 (MSOleInfo, 1); - ole->info = info; - - info->ref_count = 1; - info->bb.shift = bb_shift; - info->bb.size = 1 << info->bb.shift; - info->bb.filter = info->bb.size - 1; - info->sb.shift = sb_shift; - info->sb.size = 1 << info->sb.shift; - info->sb.filter = info->sb.size - 1; - info->threshold = GSF_LE_GET_GUINT32 (header + OLE_HEADER_THRESHOLD); - info->sbat_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_SBAT_START); - info->num_sbat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_SBAT); - info->max_block = (gsf_input_size (ole->input) - OLE_HEADER_SIZE) / info->bb.size; - info->sb_file = NULL; - - if (info->num_sbat == 0 && info->sbat_start != BAT_MAGIC_END_OF_CHAIN) { -#if 0 - g_warning ("There is are not supposed to be any blocks in the small block allocation table, yet there is a link to some. Ignoring it."); -#endif - } - - /* very rough heuristic, just in case */ - if (num_bat < info->max_block) { - info->bb.bat.num_blocks = num_bat * (info->bb.size / BAT_INDEX_SIZE); - info->bb.bat.block = g_new0 (guint32, info->bb.bat.num_blocks); - - metabat = (guint32 *)g_alloca (MAX (info->bb.size, OLE_HEADER_SIZE)); - - /* Reading the elements invalidates this memory, make copy */ - gsf_ole_get_guint32s (metabat, header + OLE_HEADER_START_BAT, - OLE_HEADER_SIZE - OLE_HEADER_START_BAT); - last = num_bat; - if (last > OLE_HEADER_METABAT_SIZE) - last = OLE_HEADER_METABAT_SIZE; - - ptr = ole_info_read_metabat (ole, info->bb.bat.block, - info->bb.bat.num_blocks, metabat, metabat + last); - num_bat -= last; - } else - ptr = NULL; - - last = (info->bb.size - BAT_INDEX_SIZE) / BAT_INDEX_SIZE; - while (ptr != NULL && num_metabat-- > 0) { - tmp = ole_get_block (ole, metabat_block, NULL); - if (tmp == NULL) { - ptr = NULL; - break; - } - - /* Reading the elements invalidates this memory, make copy */ - gsf_ole_get_guint32s (metabat, tmp, (int)info->bb.size); - - if (num_metabat == 0) { - if (last < num_bat) { - /* there should be less that a full metabat block - * remaining */ - ptr = NULL; - break; - } - last = num_bat; - } else if (num_metabat > 0) { - metabat_block = metabat[last]; - num_bat -= last; - } - - ptr = ole_info_read_metabat (ole, ptr, - info->bb.bat.num_blocks, metabat, metabat + last); - } - - if (ptr == NULL) { - return TRUE; - } - - /* Read the directory's bat, we do not know the size */ - if (ole_make_bat (&info->bb.bat, 0, dirent_start, &ole->bat)) { - return TRUE; - } - - /* Read the directory */ - ole->dirent = info->root_dir = ole_dirent_new (ole, 0, NULL); - if (ole->dirent == NULL) { - return TRUE; - } - - return FALSE; -} - -void -gsf_infile_msole_finalize (GsfInfileMSOle * ole) -{ - if (ole->input != NULL) { - gsf_input_finalize(ole->input); - ole->input = NULL; - } - if (ole->info != NULL) { - ole_info_unref (ole->info); - ole->info = NULL; - } - ols_bat_release (&ole->bat); - - g_free (ole->stream.buf); - free(ole); -} - -static guint8 const * -gsf_infile_msole_read (GsfInfileMSOle *ole, size_t num_bytes, guint8 *buffer) -{ - off_t first_block, last_block, raw_block, offset, i; - guint8 const *data; - guint8 *ptr; - size_t count; - - /* small block files are preload */ - if (ole->dirent != NULL && ole->dirent->use_sb) { - if (buffer != NULL) { - memcpy (buffer, ole->stream.buf + ole->cur_offset, num_bytes); - ole->cur_offset += num_bytes; - return buffer; - } - data = ole->stream.buf + ole->cur_offset; - ole->cur_offset += num_bytes; - return data; - } - - /* GsfInput guarantees that num_bytes > 0 */ - first_block = OLE_BIG_BLOCK (ole->cur_offset, ole); - last_block = OLE_BIG_BLOCK (ole->cur_offset + num_bytes - 1, ole); - offset = ole->cur_offset & ole->info->bb.filter; - - /* optimization : are all the raw blocks contiguous */ - i = first_block; - raw_block = ole->bat.block [i]; - while (++i <= last_block && ++raw_block == ole->bat.block [i]) - ; - if (i > last_block) { - /* optimization don't seek if we don't need to */ - if (ole->cur_block != first_block) { - if (gsf_input_seek (ole->input, - (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (ole->bat.block [first_block] << ole->info->bb.shift) + offset), - SEEK_SET) < 0) - return NULL; - } - ole->cur_block = last_block; - return gsf_input_read (ole->input, num_bytes, buffer); - } - - /* damn, we need to copy it block by block */ - if (buffer == NULL) { - if (ole->stream.buf_size < num_bytes) { - if (ole->stream.buf != NULL) - g_free (ole->stream.buf); - ole->stream.buf_size = num_bytes; - ole->stream.buf = g_new (guint8, num_bytes); - } - buffer = ole->stream.buf; - } - - ptr = buffer; - for (i = first_block ; i <= last_block ; i++ , ptr += count, num_bytes -= count) { - count = ole->info->bb.size - offset; - if (count > num_bytes) - count = num_bytes; - data = ole_get_block (ole, ole->bat.block [i], NULL); - if (data == NULL) - return NULL; - - /* TODO : this could be optimized to avoid the copy */ - memcpy (ptr, data + offset, count); - offset = 0; - } - ole->cur_block = BAT_MAGIC_UNUSED; - ole->cur_offset += num_bytes; - return buffer; -} - -static struct GsfInput * -gsf_infile_msole_new_child (GsfInfileMSOle *parent, - MSOleDirent *dirent) -{ - GsfInfileMSOle * child; - MSOleInfo *info; - MSOleBAT const *metabat; - struct GsfInput *sb_file = NULL; - size_t size_guess; - char * buf; - - - if ( (dirent->index != 0) && - (dirent->is_directory) ) { - /* be wary. It seems as if some implementations pretend that the - * directories contain data */ - return gsf_input_new("", - (off_t) 0, - 0); - } - child = ole_dup (parent); - if (child == NULL) - return NULL; - child->dirent = dirent; - child->size = (off_t) dirent->size; - - info = parent->info; - - if (dirent->use_sb) { /* build the bat */ - metabat = &info->sb.bat; - size_guess = dirent->size >> info->sb.shift; - sb_file = ole_info_get_sb_file (parent); - } else { - metabat = &info->bb.bat; - size_guess = dirent->size >> info->bb.shift; - } - if (ole_make_bat (metabat, size_guess + 1, dirent->first_block, &child->bat)) { - gsf_infile_msole_finalize(child); - return NULL; - } - - if (dirent->use_sb) { - unsigned i; - guint8 const *data; - - if (sb_file == NULL) { - gsf_infile_msole_finalize(child); - return NULL; - } - - child->stream.buf_size = info->threshold; - child->stream.buf = g_new (guint8, info->threshold); - - for (i = 0 ; i < child->bat.num_blocks; i++) - if (gsf_input_seek (sb_file, - (off_t)(child->bat.block [i] << info->sb.shift), SEEK_SET) < 0 || - (data = gsf_input_read (sb_file, - info->sb.size, - child->stream.buf + (i << info->sb.shift))) == NULL) { - gsf_infile_msole_finalize(child); - return NULL; - } - } - buf = malloc(child->size); - if (buf == NULL) { - gsf_infile_msole_finalize(child); - return NULL; - } - if (NULL == gsf_infile_msole_read(child, - child->size, - buf)) { - gsf_infile_msole_finalize(child); - return NULL; - } - gsf_infile_msole_finalize(child); - return gsf_input_new(buf, - (off_t) dirent->size, - 1); -} - - -struct GsfInput * -gsf_infile_msole_child_by_index (GsfInfileMSOle * ole, int target) -{ - GList *p; - - for (p = ole->dirent->children; p != NULL ; p = p->next) - if (target-- <= 0) - return gsf_infile_msole_new_child (ole, - (MSOleDirent *)p->data); - return NULL; -} - -char const * -gsf_infile_msole_name_by_index (GsfInfileMSOle * ole, int target) -{ - GList *p; - - for (p = ole->dirent->children; p != NULL ; p = p->next) - if (target-- <= 0) - return ((MSOleDirent *)p->data)->name; - return NULL; -} - -int -gsf_infile_msole_num_children (GsfInfileMSOle * ole) -{ - g_return_val_if_fail (ole->dirent != NULL, -1); - - if (!ole->dirent->is_directory) - return -1; - return g_list_length (ole->dirent->children); -} - - -/** - * gsf_infile_msole_new : - * @source : - * - * Opens the root directory of an MS OLE file. - * NOTE : adds a reference to @source - * - * Returns : the new ole file handler - **/ -GsfInfileMSOle * -gsf_infile_msole_new (struct GsfInput *source) -{ - GsfInfileMSOle * ole; - - ole = malloc(sizeof(GsfInfileMSOle)); - if (ole == NULL) - return NULL; - gsf_infile_msole_init(ole); - ole->input = source; - ole->size = (off_t) 0; - - if (ole_init_info (ole)) { - gsf_infile_msole_finalize(ole); - return NULL; - } - - return ole; -} - -/** - * gsf_infile_msole_get_class_id : - * @ole: a #GsfInfileMSOle - * @res: 16 byte identifier (often a GUID in MS Windows apps) - * - * Retrieves the 16 byte indentifier (often a GUID in MS Windows apps) - * stored within the directory associated with @ole and stores it in @res. - * - * Returns TRUE on success - **/ -int -gsf_infile_msole_get_class_id (const GsfInfileMSOle *ole, - unsigned char * res) -{ - g_return_val_if_fail (ole != NULL && ole->dirent != NULL, 0); - - memcpy (res, ole->dirent->clsid, - sizeof(ole->dirent->clsid)); - return 1; -} diff --git a/src/plugins/ole2/gsf-infile-msole.h b/src/plugins/ole2/gsf-infile-msole.h @@ -27,8 +27,6 @@ struct GsfInfileMSOle; struct GsfInfileMSOle * gsf_infile_msole_new (struct GsfInput *source); -int gsf_infile_msole_get_class_id (const struct GsfInfileMSOle * ole, - unsigned char * res); int gsf_infile_msole_num_children (struct GsfInfileMSOle *infile); diff --git a/src/plugins/ole2/gsf-input.c b/src/plugins/ole2/gsf-input.c @@ -24,236 +24,5 @@ #include "gsf-utils.h" #include <string.h> -typedef struct GsfInput { - off_t size; - off_t cur_offset; - char * name; - const unsigned char * buf; - int needs_free; -} GsfInput; -static void -gsf_input_init (GsfInput * input) -{ - input->size = 0; - input->cur_offset = 0; - input->name = NULL; - input->buf = NULL; -} - -/** - * gsf_input_memory_new: - * @buf: The input bytes - * @length: The length of @buf - * @needs_free: Whether you want this memory to be free'd at object destruction - * - * Returns: A new #GsfInputMemory - */ -GsfInput * -gsf_input_new (const unsigned char * buf, - off_t length, - int needs_free) -{ - GsfInput *mem = malloc(sizeof(GsfInput)); - if (mem == NULL) - return NULL; - gsf_input_init(mem); - mem->buf = buf; - mem->size = length; - mem->needs_free = needs_free; - return mem; -} - -void -gsf_input_finalize (GsfInput * input) -{ - if (input->name != NULL) { - free (input->name); - input->name = NULL; - } - if ( (input->buf) && input->needs_free) - free((void*) input->buf); - free(input); -} - -GsfInput * -gsf_input_dup (GsfInput *src) -{ - GsfInput * dst = malloc(sizeof(GsfInput)); - if (dst == NULL) - return NULL; - gsf_input_init(dst); - dst->buf = src->buf; - dst->needs_free = 0; - dst->size = src->size; - if (src->name != NULL) - gsf_input_set_name (dst, src->name); - dst->cur_offset = src->cur_offset; - return dst; -} - -const unsigned char * -gsf_input_read (GsfInput * mem, size_t num_bytes, unsigned char * optional_buffer) -{ - const unsigned char *src = mem->buf; - if (src == NULL) - return NULL; - if (optional_buffer) { - memcpy (optional_buffer, src + mem->cur_offset, num_bytes); - mem->cur_offset += num_bytes; - - return optional_buffer; - } else { - const unsigned char * ret = src + mem->cur_offset; - mem->cur_offset += num_bytes; - return ret; - } -} - -/** - * gsf_input_name : - * @input : - * - * Returns @input's name in utf8 form, DO NOT FREE THIS STRING - **/ -const char * -gsf_input_name (GsfInput *input) -{ - return input->name; -} - -/** - * gsf_input_size : - * @input : The input - * - * Looks up and caches the number of bytes in the input - * - * Returns : the size or -1 on error - **/ -off_t -gsf_input_size (GsfInput *input) -{ - g_return_val_if_fail (input != NULL, -1); - return input->size; -} - -/** - * gsf_input_eof : - * @input : the input - * - * Are we at the end of the file ? - * - * Returns : TRUE if the input is at the eof. - **/ -int -gsf_input_eof (GsfInput *input) -{ - g_return_val_if_fail (input != NULL, 0); - - return input->cur_offset >= input->size; -} - -/** - * gsf_input_remaining : - * @input : - * - * Returns the number of bytes left in the file. - **/ -off_t -gsf_input_remaining (GsfInput *input) -{ - g_return_val_if_fail (input != NULL, 0); - - return input->size - input->cur_offset; -} - -/** - * gsf_input_tell : - * @input : - * - * Returns the current offset in the file. - **/ -off_t -gsf_input_tell (GsfInput *input) -{ - g_return_val_if_fail (input != NULL, 0); - - return input->cur_offset; -} - -/** - * gsf_input_seek : - * @input : - * @offset : - * @whence : - * - * Returns TRUE on error. - **/ -int -gsf_input_seek (GsfInput *input, off_t offset, int whence) -{ - off_t pos = offset; - - g_return_val_if_fail (input != NULL, 1); - - switch (whence) { - case SEEK_SET : break; - case SEEK_CUR : pos += input->cur_offset; break; - case SEEK_END : pos += input->size; break; - default : return 1; - } - - if (pos < 0 || pos > input->size) - return 1; - - /* - * If we go nowhere, just return. This in particular handles null - * seeks for streams with no seek method. - */ - if (pos == input->cur_offset) - return 0; - - input->cur_offset = pos; - return 0; -} - -/** - * gsf_input_set_name : - * @input : - * @name : - * - * protected. - * - * Returns : TRUE if the assignment was ok. - **/ -int -gsf_input_set_name (GsfInput *input, char const *name) -{ - char *buf; - - g_return_val_if_fail (input != NULL, 0); - - buf = strdup (name); - if (input->name != NULL) - free (input->name); - input->name = buf; - return 1; -} - -/** - * gsf_input_set_size : - * @input : - * @size : - * - * Returns : TRUE if the assignment was ok. - */ -int -gsf_input_set_size (GsfInput *input, off_t size) -{ - g_return_val_if_fail (input != NULL, 0); - - input->size = size; - return 1; -} - diff --git a/src/plugins/ole2/gsf-utils.c b/src/plugins/ole2/gsf-utils.c @@ -22,233 +22,3 @@ #include "platform.h" #include "gsf-utils.h" #include "gsf-input.h" - -#include <ctype.h> -#include <stdio.h> -#include <string.h> - -/* - * Glib gets this wrong, really. ARM's floating point format is a weird - * mixture. - */ -#define G_ARMFLOAT_ENDIAN 56781234 -#if defined(__arm__) && !defined(__vfp__) && (G_BYTE_ORDER == G_LITTLE_ENDIAN) -#define G_FLOAT_BYTE_ORDER G_ARMFLOAT_ENDIAN -#else -#define G_FLOAT_BYTE_ORDER G_BYTE_ORDER -#endif - -guint64 -gsf_le_get_guint64 (void const *p) -{ -#if G_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (guint64) == 8) { - guint64 li; - int i; - guint8 *t = (guint8 *)&li; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (li); - - for (i = 0; i < sd; i++) - t[i] = p2[sd - 1 - i]; - - return li; - } else { - g_error ("Big endian machine, but weird size of guint64"); - } -#elif G_BYTE_ORDER == G_LITTLE_ENDIAN - if (sizeof (guint64) == 8) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - guint64 data; - memcpy (&data, p, sizeof (data)); - return data; - } else { - g_error ("Little endian machine, but weird size of guint64"); - } -#else -#error "Byte order not recognised -- out of luck" -#endif -} - -float -gsf_le_get_float (void const *p) -{ -#if G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (float) == 4) { - float f; - int i; - guint8 *t = (guint8 *)&f; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (f); - - for (i = 0; i < sd; i++) - t[i] = p2[sd - 1 - i]; - - return f; - } else { - g_error ("Big endian machine, but weird size of floats"); - } -#elif (G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN) || (G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN) - if (sizeof (float) == 4) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - float data; - memcpy (&data, p, sizeof (data)); - return data; - } else { - g_error ("Little endian machine, but weird size of floats"); - } -#else -#error "Floating-point byte order not recognised -- out of luck" -#endif -} - -void -gsf_le_set_float (void *p, float d) -{ -#if G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (float) == 4) { - int i; - guint8 *t = (guint8 *)&d; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (d); - - for (i = 0; i < sd; i++) - p2[sd - 1 - i] = t[i]; - } else { - g_error ("Big endian machine, but weird size of floats"); - } -#elif (G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN) || (G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN) - if (sizeof (float) == 4) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - memcpy (p, &d, sizeof (d)); - } else { - g_error ("Little endian machine, but weird size of floats"); - } -#else -#error "Floating-point byte order not recognised -- out of luck" -#endif -} - -double -gsf_le_get_double (void const *p) -{ -#if G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN - double data; - memcpy ((char *)&data + 4, p, 4); - memcpy ((char *)&data, (const char *)p + 4, 4); - return data; -#elif G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (double) == 8) { - double d; - int i; - guint8 *t = (guint8 *)&d; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (d); - - for (i = 0; i < sd; i++) - t[i] = p2[sd - 1 - i]; - - return d; - } else { - g_error ("Big endian machine, but weird size of doubles"); - } -#elif G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN - if (sizeof (double) == 8) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - double data; - memcpy (&data, p, sizeof (data)); - return data; - } else { - g_error ("Little endian machine, but weird size of doubles"); - } -#else -#error "Floating-point byte order not recognised -- out of luck" -#endif -} - -void -gsf_le_set_double (void *p, double d) -{ -#if G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN - memcpy (p, (const char *)&d + 4, 4); - memcpy ((char *)p + 4, &d, 4); -#elif G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN - if (sizeof (double) == 8) { - int i; - guint8 *t = (guint8 *)&d; - guint8 *p2 = (guint8 *)p; - int sd = sizeof (d); - - for (i = 0; i < sd; i++) - p2[sd - 1 - i] = t[i]; - } else { - g_error ("Big endian machine, but weird size of doubles"); - } -#elif G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN - if (sizeof (double) == 8) { - /* - * On i86, we could access directly, but Alphas require - * aligned access. - */ - memcpy (p, &d, sizeof (d)); - } else { - g_error ("Little endian machine, but weird size of doubles"); - } -#else -#error "Floating-point byte order not recognised -- out of luck" -#endif -} - -/** - * gsf_extension_pointer: - * @path: A filename or file path. - * - * Extracts the extension from the end of a filename (the part after the final - * '.' in the filename). - * - * Returns: A pointer to the extension part of the filename, or a - * pointer to the end of the string if the filename does not - * have an extension. - */ -char const * -gsf_extension_pointer (char const *path) -{ - char *s, *t; - - g_return_val_if_fail (path != NULL, NULL); - - t = strrchr (path, G_DIR_SEPARATOR); - s = strrchr ((t != NULL) ? t : path, '.'); - if (s != NULL) - return s + 1; - return path + strlen(path); -} - -/** - * gsf_iconv_close : A utility wrapper to safely close an iconv handle - * @handle : - **/ -void -gsf_iconv_close (GIConv handle) -{ - if (handle != NULL && handle != ((GIConv)-1)) - g_iconv_close (handle); -} - -/* FIXME: what about translations? */ -#ifndef _ -#define _(x) x -#endif - diff --git a/src/plugins/ole2/gsf-utils.h b/src/plugins/ole2/gsf-utils.h @@ -23,42 +23,6 @@ #define GSF_UTILS_H #include <glib-object.h> - -/* Do this the ugly way so that we don't have to worry about alignment */ -#define GSF_LE_GET_GUINT8(p) (*(guint8 const *)(p)) -#define GSF_LE_GET_GUINT16(p) \ - (guint16)((((guint8 const *)(p))[0] << 0) | \ - (((guint8 const *)(p))[1] << 8)) -#define GSF_LE_GET_GUINT32(p) \ - (guint32)((((guint8 const *)(p))[0] << 0) | \ - (((guint8 const *)(p))[1] << 8) | \ - (((guint8 const *)(p))[2] << 16) | \ - (((guint8 const *)(p))[3] << 24)) -#define GSF_LE_GET_GUINT64(p) (gsf_le_get_guint64 (p)) -#define GSF_LE_GET_GINT8(p) ((gint8)GSF_LE_GET_GUINT8(p)) -#define GSF_LE_GET_GINT16(p) ((gint16)GSF_LE_GET_GUINT16(p)) -#define GSF_LE_GET_GINT32(p) ((gint32)GSF_LE_GET_GUINT32(p)) -#define GSF_LE_GET_FLOAT(p) (gsf_le_get_float (p)) -#define GSF_LE_GET_DOUBLE(p) (gsf_le_get_double (p)) -guint64 gsf_le_get_guint64 (void const *p); -float gsf_le_get_float (void const *p); -double gsf_le_get_double (void const *p); - -#define GSF_LE_SET_GUINT8(p, dat) \ - (*((guint8 *)(p)) = ((dat) & 0xff)) -#define GSF_LE_SET_GUINT16(p, dat) \ - ((*((guint8 *)(p) + 0) = ((dat) & 0xff)),\ - (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff)) -#define GSF_LE_SET_GUINT32(p, dat) \ - ((*((guint8 *)(p) + 0) = ((dat)) & 0xff), \ - (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff), \ - (*((guint8 *)(p) + 2) = ((dat) >> 16) & 0xff), \ - (*((guint8 *)(p) + 3) = ((dat) >> 24) & 0xff)) -#define GSF_LE_SET_GINT8(p,dat) GSF_LE_SET_GUINT8((p),(dat)) -#define GSF_LE_SET_GINT16(p,dat) GSF_LE_SET_GUINT16((p),(dat)) -#define GSF_LE_SET_GINT32(p,dat) GSF_LE_SET_GUINT32((p),(dat)) -#define GSF_LE_SET_FLOAT(p,dat) gsf_le_set_float((p),(dat)) -#define GSF_LE_SET_DOUBLE(p,dat) gsf_le_set_double((p),(dat)) void gsf_le_set_float (void *p, float f); void gsf_le_set_double (void *p, double d); diff --git a/src/plugins/ole2/ole2extractor.c b/src/plugins/ole2/ole2extractor.c @@ -21,9 +21,9 @@ #include "platform.h" #include "extractor.h" #include <glib-object.h> -#include "gsf-infile-msole.h" -#include "gsf-input.h" -#include "gsf-utils.h" +#include <string.h> +#include <stdio.h> +#include <ctype.h> #define DEBUG_OLE2 0 @@ -35,6 +35,1351 @@ static void warning(const char * format, ...) {} #endif +/* *********************** formerly gsf-input.c ************* */ + +typedef struct GsfInput { + off_t size; + off_t cur_offset; + char * name; + const unsigned char * buf; + int needs_free; +} GsfInput; + + +static void +gsf_input_init (GsfInput * input) +{ + input->size = 0; + input->cur_offset = 0; + input->name = NULL; + input->buf = NULL; +} + +/** + * gsf_input_memory_new: + * @buf: The input bytes + * @length: The length of @buf + * @needs_free: Whether you want this memory to be free'd at object destruction + * + * Returns: A new #GsfInputMemory + */ +static GsfInput * +gsf_input_new (const unsigned char * buf, + off_t length, + int needs_free) +{ + GsfInput *mem = malloc(sizeof(GsfInput)); + if (mem == NULL) + return NULL; + gsf_input_init(mem); + mem->buf = buf; + mem->size = length; + mem->needs_free = needs_free; + return mem; +} + +static void +gsf_input_finalize (GsfInput * input) +{ + if (input->name != NULL) { + free (input->name); + input->name = NULL; + } + if ( (input->buf) && input->needs_free) + free((void*) input->buf); + free(input); +} + +/** + * gsf_input_set_name : + * @input : + * @name : + * + * protected. + * + * Returns : TRUE if the assignment was ok. + **/ +static int +gsf_input_set_name (GsfInput *input, char const *name) +{ + char *buf; + + g_return_val_if_fail (input != NULL, 0); + + buf = strdup (name); + if (input->name != NULL) + free (input->name); + input->name = buf; + return 1; +} + + + +static GsfInput * +gsf_input_dup (GsfInput *src) +{ + GsfInput * dst = malloc(sizeof(GsfInput)); + if (dst == NULL) + return NULL; + gsf_input_init(dst); + dst->buf = src->buf; + dst->needs_free = 0; + dst->size = src->size; + if (src->name != NULL) + gsf_input_set_name (dst, src->name); + dst->cur_offset = src->cur_offset; + return dst; +} + +static const unsigned char * +gsf_input_read (GsfInput * mem, size_t num_bytes, unsigned char * optional_buffer) +{ + const unsigned char *src = mem->buf; + if (src == NULL) + return NULL; + if (optional_buffer) { + memcpy (optional_buffer, src + mem->cur_offset, num_bytes); + mem->cur_offset += num_bytes; + + return optional_buffer; + } else { + const unsigned char * ret = src + mem->cur_offset; + mem->cur_offset += num_bytes; + return ret; + } +} + +/** + * gsf_input_name : + * @input : + * + * Returns @input's name in utf8 form, DO NOT FREE THIS STRING + **/ +static const char * +gsf_input_name (GsfInput *input) +{ + return input->name; +} + +/** + * gsf_input_size : + * @input : The input + * + * Looks up and caches the number of bytes in the input + * + * Returns : the size or -1 on error + **/ +static off_t +gsf_input_size (GsfInput *input) +{ + g_return_val_if_fail (input != NULL, -1); + return input->size; +} + +/** + * gsf_input_eof : + * @input : the input + * + * Are we at the end of the file ? + * + * Returns : TRUE if the input is at the eof. + **/ +static int +gsf_input_eof (GsfInput *input) +{ + g_return_val_if_fail (input != NULL, 0); + + return input->cur_offset >= input->size; +} + +/** + * gsf_input_remaining : + * @input : + * + * Returns the number of bytes left in the file. + **/ +static off_t +gsf_input_remaining (GsfInput *input) +{ + g_return_val_if_fail (input != NULL, 0); + + return input->size - input->cur_offset; +} + +/** + * gsf_input_tell : + * @input : + * + * Returns the current offset in the file. + **/ +static off_t +gsf_input_tell (GsfInput *input) +{ + g_return_val_if_fail (input != NULL, 0); + + return input->cur_offset; +} + +/** + * gsf_input_seek : + * @input : + * @offset : + * @whence : + * + * Returns TRUE on error. + **/ +static int +gsf_input_seek (GsfInput *input, off_t offset, int whence) +{ + off_t pos = offset; + + g_return_val_if_fail (input != NULL, 1); + + switch (whence) { + case SEEK_SET : break; + case SEEK_CUR : pos += input->cur_offset; break; + case SEEK_END : pos += input->size; break; + default : return 1; + } + + if (pos < 0 || pos > input->size) + return 1; + + /* + * If we go nowhere, just return. This in particular handles null + * seeks for streams with no seek method. + */ + if (pos == input->cur_offset) + return 0; + + input->cur_offset = pos; + return 0; +} + +/** + * gsf_input_set_size : + * @input : + * @size : + * + * Returns : TRUE if the assignment was ok. + */ +static int +gsf_input_set_size (GsfInput *input, off_t size) +{ + g_return_val_if_fail (input != NULL, 0); + + input->size = size; + return 1; +} + + + + +/* ******************** formerly gsf-utils.c **************** */ + + +/* Do this the ugly way so that we don't have to worry about alignment */ +#define GSF_LE_GET_GUINT8(p) (*(guint8 const *)(p)) +#define GSF_LE_GET_GUINT16(p) \ + (guint16)((((guint8 const *)(p))[0] << 0) | \ + (((guint8 const *)(p))[1] << 8)) +#define GSF_LE_GET_GUINT32(p) \ + (guint32)((((guint8 const *)(p))[0] << 0) | \ + (((guint8 const *)(p))[1] << 8) | \ + (((guint8 const *)(p))[2] << 16) | \ + (((guint8 const *)(p))[3] << 24)) + +#define GSF_LE_GET_GUINT64(p) (gsf_le_get_guint64 (p)) +#define GSF_LE_GET_GINT64(p) ((gint64)GSF_LE_GET_GUINT64(p)) +#define GSF_LE_GET_GINT8(p) ((gint8)GSF_LE_GET_GUINT8(p)) +#define GSF_LE_GET_GINT16(p) ((gint16)GSF_LE_GET_GUINT16(p)) +#define GSF_LE_GET_GINT32(p) ((gint32)GSF_LE_GET_GUINT32(p)) +#define GSF_LE_GET_FLOAT(p) (gsf_le_get_float (p)) +#define GSF_LE_GET_DOUBLE(p) (gsf_le_get_double (p)) +#define GSF_LE_SET_GUINT8(p, dat) \ + (*((guint8 *)(p)) = ((dat) & 0xff)) +#define GSF_LE_SET_GUINT16(p, dat) \ + ((*((guint8 *)(p) + 0) = ((dat) & 0xff)),\ + (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff)) +#define GSF_LE_SET_GUINT32(p, dat) \ + ((*((guint8 *)(p) + 0) = ((dat)) & 0xff), \ + (*((guint8 *)(p) + 1) = ((dat) >> 8) & 0xff), \ + (*((guint8 *)(p) + 2) = ((dat) >> 16) & 0xff), \ + (*((guint8 *)(p) + 3) = ((dat) >> 24) & 0xff)) +#define GSF_LE_SET_GINT8(p,dat) GSF_LE_SET_GUINT8((p),(dat)) +#define GSF_LE_SET_GINT16(p,dat) GSF_LE_SET_GUINT16((p),(dat)) +#define GSF_LE_SET_GINT32(p,dat) GSF_LE_SET_GUINT32((p),(dat)) +#define GSF_LE_SET_FLOAT(p,dat) gsf_le_set_float((p),(dat)) +#define GSF_LE_SET_DOUBLE(p,dat) gsf_le_set_double((p),(dat)) + + +/* + * Glib gets this wrong, really. ARM's floating point format is a weird + * mixture. + */ +#define G_ARMFLOAT_ENDIAN 56781234 +#if defined(__arm__) && !defined(__vfp__) && (G_BYTE_ORDER == G_LITTLE_ENDIAN) +#define G_FLOAT_BYTE_ORDER G_ARMFLOAT_ENDIAN +#else +#define G_FLOAT_BYTE_ORDER G_BYTE_ORDER +#endif + +static guint64 +gsf_le_get_guint64 (void const *p) +{ +#if G_BYTE_ORDER == G_BIG_ENDIAN + if (sizeof (guint64) == 8) { + guint64 li; + int i; + guint8 *t = (guint8 *)&li; + guint8 *p2 = (guint8 *)p; + int sd = sizeof (li); + + for (i = 0; i < sd; i++) + t[i] = p2[sd - 1 - i]; + + return li; + } else { + g_error ("Big endian machine, but weird size of guint64"); + } +#elif G_BYTE_ORDER == G_LITTLE_ENDIAN + if (sizeof (guint64) == 8) { + /* + * On i86, we could access directly, but Alphas require + * aligned access. + */ + guint64 data; + memcpy (&data, p, sizeof (data)); + return data; + } else { + g_error ("Little endian machine, but weird size of guint64"); + } +#else +#error "Byte order not recognised -- out of luck" +#endif +} + +static float +gsf_le_get_float (void const *p) +{ +#if G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN + if (sizeof (float) == 4) { + float f; + int i; + guint8 *t = (guint8 *)&f; + guint8 *p2 = (guint8 *)p; + int sd = sizeof (f); + + for (i = 0; i < sd; i++) + t[i] = p2[sd - 1 - i]; + + return f; + } else { + g_error ("Big endian machine, but weird size of floats"); + } +#elif (G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN) || (G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN) + if (sizeof (float) == 4) { + /* + * On i86, we could access directly, but Alphas require + * aligned access. + */ + float data; + memcpy (&data, p, sizeof (data)); + return data; + } else { + g_error ("Little endian machine, but weird size of floats"); + } +#else +#error "Floating-point byte order not recognised -- out of luck" +#endif +} + +static void +gsf_le_set_float (void *p, float d) +{ +#if G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN + if (sizeof (float) == 4) { + int i; + guint8 *t = (guint8 *)&d; + guint8 *p2 = (guint8 *)p; + int sd = sizeof (d); + + for (i = 0; i < sd; i++) + p2[sd - 1 - i] = t[i]; + } else { + g_error ("Big endian machine, but weird size of floats"); + } +#elif (G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN) || (G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN) + if (sizeof (float) == 4) { + /* + * On i86, we could access directly, but Alphas require + * aligned access. + */ + memcpy (p, &d, sizeof (d)); + } else { + g_error ("Little endian machine, but weird size of floats"); + } +#else +#error "Floating-point byte order not recognised -- out of luck" +#endif +} + +static double +gsf_le_get_double (void const *p) +{ +#if G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN + double data; + memcpy ((char *)&data + 4, p, 4); + memcpy ((char *)&data, (const char *)p + 4, 4); + return data; +#elif G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN + if (sizeof (double) == 8) { + double d; + int i; + guint8 *t = (guint8 *)&d; + guint8 *p2 = (guint8 *)p; + int sd = sizeof (d); + + for (i = 0; i < sd; i++) + t[i] = p2[sd - 1 - i]; + + return d; + } else { + g_error ("Big endian machine, but weird size of doubles"); + } +#elif G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN + if (sizeof (double) == 8) { + /* + * On i86, we could access directly, but Alphas require + * aligned access. + */ + double data; + memcpy (&data, p, sizeof (data)); + return data; + } else { + g_error ("Little endian machine, but weird size of doubles"); + } +#else +#error "Floating-point byte order not recognised -- out of luck" +#endif +} + +static void +gsf_le_set_double (void *p, double d) +{ +#if G_FLOAT_BYTE_ORDER == G_ARMFLOAT_ENDIAN + memcpy (p, (const char *)&d + 4, 4); + memcpy ((char *)p + 4, &d, 4); +#elif G_FLOAT_BYTE_ORDER == G_BIG_ENDIAN + if (sizeof (double) == 8) { + int i; + guint8 *t = (guint8 *)&d; + guint8 *p2 = (guint8 *)p; + int sd = sizeof (d); + + for (i = 0; i < sd; i++) + p2[sd - 1 - i] = t[i]; + } else { + g_error ("Big endian machine, but weird size of doubles"); + } +#elif G_FLOAT_BYTE_ORDER == G_LITTLE_ENDIAN + if (sizeof (double) == 8) { + /* + * On i86, we could access directly, but Alphas require + * aligned access. + */ + memcpy (p, &d, sizeof (d)); + } else { + g_error ("Little endian machine, but weird size of doubles"); + } +#else +#error "Floating-point byte order not recognised -- out of luck" +#endif +} + +/** + * gsf_extension_pointer: + * @path: A filename or file path. + * + * Extracts the extension from the end of a filename (the part after the final + * '.' in the filename). + * + * Returns: A pointer to the extension part of the filename, or a + * pointer to the end of the string if the filename does not + * have an extension. + */ +static char const * +gsf_extension_pointer (char const *path) +{ + char *s, *t; + + g_return_val_if_fail (path != NULL, NULL); + + t = strrchr (path, G_DIR_SEPARATOR); + s = strrchr ((t != NULL) ? t : path, '.'); + if (s != NULL) + return s + 1; + return path + strlen(path); +} + +/** + * gsf_iconv_close : A utility wrapper to safely close an iconv handle + * @handle : + **/ +static void +gsf_iconv_close (GIConv handle) +{ + if (handle != NULL && handle != ((GIConv)-1)) + g_iconv_close (handle); +} + + +/* ***************************** formerly gsf-infile-msole.c ********************* */ + +#define OLE_HEADER_SIZE 0x200 /* independent of big block size size */ +#define OLE_HEADER_SIGNATURE 0x00 +#define OLE_HEADER_CLSID 0x08 /* See ReadClassStg */ +#define OLE_HEADER_MINOR_VER 0x18 /* 0x33 and 0x3e have been seen */ +#define OLE_HEADER_MAJOR_VER 0x1a /* 0x3 been seen in wild */ +#define OLE_HEADER_BYTE_ORDER 0x1c /* 0xfe 0xff == Intel Little Endian */ +#define OLE_HEADER_BB_SHIFT 0x1e +#define OLE_HEADER_SB_SHIFT 0x20 +/* 0x22..0x27 reserved == 0 */ +#define OLE_HEADER_CSECTDIR 0x28 +#define OLE_HEADER_NUM_BAT 0x2c +#define OLE_HEADER_DIRENT_START 0x30 +/* 0x34..0x37 transacting signature must be 0 */ +#define OLE_HEADER_THRESHOLD 0x38 +#define OLE_HEADER_SBAT_START 0x3c +#define OLE_HEADER_NUM_SBAT 0x40 +#define OLE_HEADER_METABAT_BLOCK 0x44 +#define OLE_HEADER_NUM_METABAT 0x48 +#define OLE_HEADER_START_BAT 0x4c +#define BAT_INDEX_SIZE 4 +#define OLE_HEADER_METABAT_SIZE ((OLE_HEADER_SIZE - OLE_HEADER_START_BAT) / BAT_INDEX_SIZE) + +#define DIRENT_MAX_NAME_SIZE 0x40 +#define DIRENT_DETAILS_SIZE 0x40 +#define DIRENT_SIZE (DIRENT_MAX_NAME_SIZE + DIRENT_DETAILS_SIZE) +#define DIRENT_NAME_LEN 0x40 /* length in bytes incl 0 terminator */ +#define DIRENT_TYPE 0x42 +#define DIRENT_COLOUR 0x43 +#define DIRENT_PREV 0x44 +#define DIRENT_NEXT 0x48 +#define DIRENT_CHILD 0x4c +#define DIRENT_CLSID 0x50 /* only for dirs */ +#define DIRENT_USERFLAGS 0x60 /* only for dirs */ +#define DIRENT_CREATE_TIME 0x64 /* for files */ +#define DIRENT_MODIFY_TIME 0x6c /* for files */ +#define DIRENT_FIRSTBLOCK 0x74 +#define DIRENT_FILE_SIZE 0x78 +/* 0x7c..0x7f reserved == 0 */ + +#define DIRENT_TYPE_INVALID 0 +#define DIRENT_TYPE_DIR 1 +#define DIRENT_TYPE_FILE 2 +#define DIRENT_TYPE_LOCKBYTES 3 /* ? */ +#define DIRENT_TYPE_PROPERTY 4 /* ? */ +#define DIRENT_TYPE_ROOTDIR 5 +#define DIRENT_MAGIC_END 0xffffffff + +/* flags in the block allocation list to denote special blocks */ +#define BAT_MAGIC_UNUSED 0xffffffff /* -1 */ +#define BAT_MAGIC_END_OF_CHAIN 0xfffffffe /* -2 */ +#define BAT_MAGIC_BAT 0xfffffffd /* a bat block, -3 */ +#define BAT_MAGIC_METABAT 0xfffffffc /* a metabat block -4 */ + + + + +typedef struct { + guint32 *block; + guint32 num_blocks; +} MSOleBAT; + +typedef struct { + char *name; + char *collation_name; + int index; + size_t size; + gboolean use_sb; + guint32 first_block; + gboolean is_directory; + GList *children; + unsigned char clsid[16]; /* 16 byte GUID used by some apps */ +} MSOleDirent; + +typedef struct { + struct { + MSOleBAT bat; + unsigned shift; + unsigned filter; + size_t size; + } bb, sb; + off_t max_block; + guint32 threshold; /* transition between small and big blocks */ + guint32 sbat_start, num_sbat; + + MSOleDirent *root_dir; + struct GsfInput *sb_file; + + int ref_count; +} MSOleInfo; + +typedef struct GsfInfileMSOle { + off_t size; + off_t cur_offset; + struct GsfInput *input; + MSOleInfo *info; + MSOleDirent *dirent; + MSOleBAT bat; + off_t cur_block; + + struct { + guint8 *buf; + size_t buf_size; + } stream; +} GsfInfileMSOle; + +/* utility macros */ +#define OLE_BIG_BLOCK(index, ole) ((index) >> ole->info->bb.shift) + +static struct GsfInput *gsf_infile_msole_new_child (GsfInfileMSOle *parent, + MSOleDirent *dirent); + +/** + * ole_get_block : + * @ole : the infile + * @block : + * @buffer : optionally NULL + * + * Read a block of data from the underlying input. + * Be really anal. + **/ +static const guint8 * +ole_get_block (const GsfInfileMSOle *ole, guint32 block, guint8 *buffer) +{ + g_return_val_if_fail (block < ole->info->max_block, NULL); + + /* OLE_HEADER_SIZE is fixed at 512, but the sector containing the + * header is padded out to bb.size (sector size) when bb.size > 512. */ + if (gsf_input_seek (ole->input, + (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (block << ole->info->bb.shift)), + SEEK_SET) < 0) + return NULL; + + return gsf_input_read (ole->input, ole->info->bb.size, buffer); +} + +/** + * ole_make_bat : + * @metabat : a meta bat to connect to the raw blocks (small or large) + * @size_guess : An optional guess as to how many blocks are in the file + * @block : The first block in the list. + * @res : where to store the result. + * + * Walk the linked list of the supplied block allocation table and build up a + * table for the list starting in @block. + * + * Returns TRUE on error. + */ +static gboolean +ole_make_bat (MSOleBAT const *metabat, size_t size_guess, guint32 block, + MSOleBAT *res) +{ + /* NOTE : Only use size as a suggestion, sometimes it is wrong */ + GArray *bat = g_array_sized_new (FALSE, FALSE, + sizeof (guint32), size_guess); + + guint8 *used = (guint8*)g_alloca (1 + metabat->num_blocks / 8); + memset (used, 0, 1 + metabat->num_blocks / 8); + + if (block < metabat->num_blocks) + do { + /* Catch cycles in the bat list */ + g_return_val_if_fail (0 == (used[block/8] & (1 << (block & 0x7))), TRUE); + used[block/8] |= 1 << (block & 0x7); + + g_array_append_val (bat, block); + block = metabat->block [block]; + } while (block < metabat->num_blocks); + + res->block = NULL; + + res->num_blocks = bat->len; + res->block = (guint32 *) (gpointer) g_array_free (bat, FALSE); + + if (block != BAT_MAGIC_END_OF_CHAIN) { +#if 0 + g_warning ("This OLE2 file is invalid.\n" + "The Block Allocation Table for one of the streams had %x instead of a terminator (%x).\n" + "We might still be able to extract some data, but you'll want to check the file.", + block, BAT_MAGIC_END_OF_CHAIN); +#endif + } + + return FALSE; +} + +static void +ols_bat_release (MSOleBAT *bat) +{ + if (bat->block != NULL) { + g_free (bat->block); + bat->block = NULL; + bat->num_blocks = 0; + } +} + +/** + * ole_info_read_metabat : + * @ole : + * @bats : + * + * A small utility routine to read a set of references to bat blocks + * either from the OLE header, or a meta-bat block. + * + * Returns a pointer to the element after the last position filled. + **/ +static guint32 * +ole_info_read_metabat (GsfInfileMSOle *ole, guint32 *bats, guint32 max, + guint32 const *metabat, guint32 const *metabat_end) +{ + guint8 const *bat, *end; + + for (; metabat < metabat_end; metabat++) { + bat = ole_get_block (ole, *metabat, NULL); + if (bat == NULL) + return NULL; + end = bat + ole->info->bb.size; + for ( ; bat < end ; bat += BAT_INDEX_SIZE, bats++) { + *bats = GSF_LE_GET_GUINT32 (bat); + g_return_val_if_fail (*bats < max || + *bats >= BAT_MAGIC_METABAT, NULL); + } + } + return bats; +} + +/** + * gsf_ole_get_guint32s : + * @dst : + * @src : + * @num_bytes : + * + * Copy some some raw data into an array of guint32. + **/ +static void +gsf_ole_get_guint32s (guint32 *dst, guint8 const *src, int num_bytes) +{ + for (; (num_bytes -= BAT_INDEX_SIZE) >= 0 ; src += BAT_INDEX_SIZE) + *dst++ = GSF_LE_GET_GUINT32 (src); +} + +static struct GsfInput * +ole_info_get_sb_file (GsfInfileMSOle *parent) +{ + MSOleBAT meta_sbat; + + if (parent->info->sb_file != NULL) + return parent->info->sb_file; + + parent->info->sb_file = gsf_infile_msole_new_child (parent, + parent->info->root_dir); + + if (NULL == parent->info->sb_file) + return NULL; + + g_return_val_if_fail (parent->info->sb.bat.block == NULL, NULL); + + if (ole_make_bat (&parent->info->bb.bat, + parent->info->num_sbat, + parent->info->sbat_start, + &meta_sbat)) { + return NULL; + } + + parent->info->sb.bat.num_blocks = meta_sbat.num_blocks * (parent->info->bb.size / BAT_INDEX_SIZE); + parent->info->sb.bat.block = g_new0 (guint32, parent->info->sb.bat.num_blocks); + ole_info_read_metabat (parent, parent->info->sb.bat.block, + parent->info->sb.bat.num_blocks, + meta_sbat.block, meta_sbat.block + meta_sbat.num_blocks); + ols_bat_release (&meta_sbat); + + return parent->info->sb_file; +} + +static gint +ole_dirent_cmp (const MSOleDirent *a, const MSOleDirent *b) +{ + g_return_val_if_fail (a, 0); + g_return_val_if_fail (b, 0); + + g_return_val_if_fail (a->collation_name, 0); + g_return_val_if_fail (b->collation_name, 0); + + return strcmp (b->collation_name, a->collation_name); +} + +/** + * ole_dirent_new : + * @ole : + * @entry : + * @parent : optional + * + * Parse dirent number @entry and recursively handle its siblings and children. + **/ +static MSOleDirent * +ole_dirent_new (GsfInfileMSOle *ole, guint32 entry, MSOleDirent *parent) +{ + MSOleDirent *dirent; + guint32 block, next, prev, child, size; + guint8 const *data; + guint8 type; + guint16 name_len; + + if (entry >= DIRENT_MAGIC_END) + return NULL; + + block = OLE_BIG_BLOCK (entry * DIRENT_SIZE, ole); + + g_return_val_if_fail (block < ole->bat.num_blocks, NULL); + data = ole_get_block (ole, ole->bat.block [block], NULL); + if (data == NULL) + return NULL; + data += (DIRENT_SIZE * entry) % ole->info->bb.size; + + type = GSF_LE_GET_GUINT8 (data + DIRENT_TYPE); + if (type != DIRENT_TYPE_DIR && + type != DIRENT_TYPE_FILE && + type != DIRENT_TYPE_ROOTDIR) { +#if 0 + g_warning ("Unknown stream type 0x%x", type); +#endif + return NULL; + } + + /* It looks like directory (and root directory) sizes are sometimes bogus */ + size = GSF_LE_GET_GUINT32 (data + DIRENT_FILE_SIZE); + g_return_val_if_fail (type == DIRENT_TYPE_DIR || type == DIRENT_TYPE_ROOTDIR || + size <= (guint32)gsf_input_size(ole->input), NULL); + + dirent = g_new0 (MSOleDirent, 1); + dirent->index = entry; + dirent->size = size; + /* Store the class id which is 16 byte identifier used by some apps */ + memcpy(dirent->clsid, data + DIRENT_CLSID, sizeof(dirent->clsid)); + + /* root dir is always big block */ + dirent->use_sb = parent && (size < ole->info->threshold); + dirent->first_block = (GSF_LE_GET_GUINT32 (data + DIRENT_FIRSTBLOCK)); + dirent->is_directory = (type != DIRENT_TYPE_FILE); + dirent->children = NULL; + prev = GSF_LE_GET_GUINT32 (data + DIRENT_PREV); + next = GSF_LE_GET_GUINT32 (data + DIRENT_NEXT); + child = GSF_LE_GET_GUINT32 (data + DIRENT_CHILD); + name_len = GSF_LE_GET_GUINT16 (data + DIRENT_NAME_LEN); + dirent->name = NULL; + if (0 < name_len && name_len <= DIRENT_MAX_NAME_SIZE) { + gunichar2 uni_name [DIRENT_MAX_NAME_SIZE+1]; + gchar const *end; + int i; + + /* !#%!@$#^ + * Sometimes, rarely, people store the stream name as ascii + * rather than utf16. Do a validation first just in case. + */ + if (!g_utf8_validate (data, -1, &end) || + ((guint8 const *)end - data + 1) != name_len) { + /* be wary about endianness */ + for (i = 0 ; i < name_len ; i += 2) + uni_name [i/2] = GSF_LE_GET_GUINT16 (data + i); + uni_name [i/2] = 0; + + dirent->name = g_utf16_to_utf8 (uni_name, -1, NULL, NULL, NULL); + } else + dirent->name = g_strndup ((gchar *)data, (gsize)((guint8 const *)end - data + 1)); + } + /* be really anal in the face of screwups */ + if (dirent->name == NULL) + dirent->name = g_strdup (""); + dirent->collation_name = g_utf8_collate_key (dirent->name, -1); + + if (parent != NULL) + parent->children = g_list_insert_sorted (parent->children, + dirent, (GCompareFunc)ole_dirent_cmp); + + /* NOTE : These links are a tree, not a linked list */ + if (prev != entry) + ole_dirent_new (ole, prev, parent); + if (next != entry) + ole_dirent_new (ole, next, parent); + + if (dirent->is_directory) + ole_dirent_new (ole, child, dirent); + return dirent; +} + +static void +ole_dirent_free (MSOleDirent *dirent) +{ + GList *tmp; + g_return_if_fail (dirent != NULL); + + g_free (dirent->name); + g_free (dirent->collation_name); + + for (tmp = dirent->children; tmp; tmp = tmp->next) + ole_dirent_free ((MSOleDirent *)tmp->data); + g_list_free (dirent->children); + g_free (dirent); +} + +/*****************************************************************************/ + +static void +ole_info_unref (MSOleInfo *info) +{ + if (info->ref_count-- != 1) + return; + + ols_bat_release (&info->bb.bat); + ols_bat_release (&info->sb.bat); + if (info->root_dir != NULL) { + ole_dirent_free (info->root_dir); + info->root_dir = NULL; + } + if (info->sb_file != NULL) { + gsf_input_finalize(info->sb_file); + info->sb_file = NULL; + } + g_free (info); +} + +static MSOleInfo * +ole_info_ref (MSOleInfo *info) +{ + info->ref_count++; + return info; +} + +static void +gsf_infile_msole_init (GsfInfileMSOle * ole) +{ + ole->cur_offset = 0; + ole->size = 0; + ole->input = NULL; + ole->info = NULL; + ole->bat.block = NULL; + ole->bat.num_blocks = 0; + ole->cur_block = BAT_MAGIC_UNUSED; + ole->stream.buf = NULL; + ole->stream.buf_size = 0; +} + +static void +gsf_infile_msole_finalize (GsfInfileMSOle * ole) +{ + if (ole->input != NULL) { + gsf_input_finalize(ole->input); + ole->input = NULL; + } + if (ole->info != NULL) { + ole_info_unref (ole->info); + ole->info = NULL; + } + ols_bat_release (&ole->bat); + + g_free (ole->stream.buf); + free(ole); +} + +/** + * ole_dup : + * @src : + * + * Utility routine to _partially_ replicate a file. It does NOT copy the bat + * blocks, or init the dirent. + * + * Return value: the partial duplicate. + **/ +static GsfInfileMSOle * +ole_dup (GsfInfileMSOle const * src) +{ + GsfInfileMSOle *dst; + struct GsfInput *input; + + g_return_val_if_fail (src != NULL, NULL); + + dst = malloc(sizeof(GsfInfileMSOle)); + if (dst == NULL) + return NULL; + gsf_infile_msole_init(dst); + input = gsf_input_dup (src->input); + if (input == NULL) { + gsf_infile_msole_finalize(dst); + return NULL; + } + dst->input = input; + dst->info = ole_info_ref (src->info); + + /* buf and buf_size are initialized to NULL */ + + return dst; +} + +/** + * ole_init_info : + * @ole : + * + * Read an OLE header and do some sanity checking + * along the way. + * + * Return value: TRUE on error + **/ +static gboolean +ole_init_info (GsfInfileMSOle *ole) +{ + static guint8 const signature[] = + { 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 }; + guint8 const *header, *tmp; + guint32 *metabat = NULL; + MSOleInfo *info; + guint32 bb_shift, sb_shift, num_bat, num_metabat, last, dirent_start; + guint32 metabat_block, *ptr; + + /* check the header */ + if (gsf_input_seek (ole->input, (off_t) 0, SEEK_SET) || + NULL == (header = gsf_input_read (ole->input, OLE_HEADER_SIZE, NULL)) || + 0 != memcmp (header, signature, sizeof (signature))) { + return TRUE; + } + + bb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_BB_SHIFT); + sb_shift = GSF_LE_GET_GUINT16 (header + OLE_HEADER_SB_SHIFT); + num_bat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_BAT); + dirent_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_DIRENT_START); + metabat_block = GSF_LE_GET_GUINT32 (header + OLE_HEADER_METABAT_BLOCK); + num_metabat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_METABAT); + + /* Some sanity checks + * 1) There should always be at least 1 BAT block + * 2) It makes no sense to have a block larger than 2^31 for now. + * Maybe relax this later, but not much. + */ + if (6 > bb_shift || bb_shift >= 31 || sb_shift > bb_shift) { + return TRUE; + } + + info = g_new0 (MSOleInfo, 1); + ole->info = info; + + info->ref_count = 1; + info->bb.shift = bb_shift; + info->bb.size = 1 << info->bb.shift; + info->bb.filter = info->bb.size - 1; + info->sb.shift = sb_shift; + info->sb.size = 1 << info->sb.shift; + info->sb.filter = info->sb.size - 1; + info->threshold = GSF_LE_GET_GUINT32 (header + OLE_HEADER_THRESHOLD); + info->sbat_start = GSF_LE_GET_GUINT32 (header + OLE_HEADER_SBAT_START); + info->num_sbat = GSF_LE_GET_GUINT32 (header + OLE_HEADER_NUM_SBAT); + info->max_block = (gsf_input_size (ole->input) - OLE_HEADER_SIZE) / info->bb.size; + info->sb_file = NULL; + + if (info->num_sbat == 0 && info->sbat_start != BAT_MAGIC_END_OF_CHAIN) { +#if 0 + g_warning ("There is are not supposed to be any blocks in the small block allocation table, yet there is a link to some. Ignoring it."); +#endif + } + + /* very rough heuristic, just in case */ + if (num_bat < info->max_block) { + info->bb.bat.num_blocks = num_bat * (info->bb.size / BAT_INDEX_SIZE); + info->bb.bat.block = g_new0 (guint32, info->bb.bat.num_blocks); + + metabat = (guint32 *)g_alloca (MAX (info->bb.size, OLE_HEADER_SIZE)); + + /* Reading the elements invalidates this memory, make copy */ + gsf_ole_get_guint32s (metabat, header + OLE_HEADER_START_BAT, + OLE_HEADER_SIZE - OLE_HEADER_START_BAT); + last = num_bat; + if (last > OLE_HEADER_METABAT_SIZE) + last = OLE_HEADER_METABAT_SIZE; + + ptr = ole_info_read_metabat (ole, info->bb.bat.block, + info->bb.bat.num_blocks, metabat, metabat + last); + num_bat -= last; + } else + ptr = NULL; + + last = (info->bb.size - BAT_INDEX_SIZE) / BAT_INDEX_SIZE; + while (ptr != NULL && num_metabat-- > 0) { + tmp = ole_get_block (ole, metabat_block, NULL); + if (tmp == NULL) { + ptr = NULL; + break; + } + + /* Reading the elements invalidates this memory, make copy */ + gsf_ole_get_guint32s (metabat, tmp, (int)info->bb.size); + + if (num_metabat == 0) { + if (last < num_bat) { + /* there should be less that a full metabat block + * remaining */ + ptr = NULL; + break; + } + last = num_bat; + } else if (num_metabat > 0) { + metabat_block = metabat[last]; + num_bat -= last; + } + + ptr = ole_info_read_metabat (ole, ptr, + info->bb.bat.num_blocks, metabat, metabat + last); + } + + if (ptr == NULL) { + return TRUE; + } + + /* Read the directory's bat, we do not know the size */ + if (ole_make_bat (&info->bb.bat, 0, dirent_start, &ole->bat)) { + return TRUE; + } + + /* Read the directory */ + ole->dirent = info->root_dir = ole_dirent_new (ole, 0, NULL); + if (ole->dirent == NULL) { + return TRUE; + } + + return FALSE; +} + +static guint8 const * +gsf_infile_msole_read (GsfInfileMSOle *ole, size_t num_bytes, guint8 *buffer) +{ + off_t first_block, last_block, raw_block, offset, i; + guint8 const *data; + guint8 *ptr; + size_t count; + + /* small block files are preload */ + if (ole->dirent != NULL && ole->dirent->use_sb) { + if (buffer != NULL) { + memcpy (buffer, ole->stream.buf + ole->cur_offset, num_bytes); + ole->cur_offset += num_bytes; + return buffer; + } + data = ole->stream.buf + ole->cur_offset; + ole->cur_offset += num_bytes; + return data; + } + + /* GsfInput guarantees that num_bytes > 0 */ + first_block = OLE_BIG_BLOCK (ole->cur_offset, ole); + last_block = OLE_BIG_BLOCK (ole->cur_offset + num_bytes - 1, ole); + offset = ole->cur_offset & ole->info->bb.filter; + + /* optimization : are all the raw blocks contiguous */ + i = first_block; + raw_block = ole->bat.block [i]; + while (++i <= last_block && ++raw_block == ole->bat.block [i]) + ; + if (i > last_block) { + /* optimization don't seek if we don't need to */ + if (ole->cur_block != first_block) { + if (gsf_input_seek (ole->input, + (off_t)(MAX (OLE_HEADER_SIZE, ole->info->bb.size) + (ole->bat.block [first_block] << ole->info->bb.shift) + offset), + SEEK_SET) < 0) + return NULL; + } + ole->cur_block = last_block; + return gsf_input_read (ole->input, num_bytes, buffer); + } + + /* damn, we need to copy it block by block */ + if (buffer == NULL) { + if (ole->stream.buf_size < num_bytes) { + if (ole->stream.buf != NULL) + g_free (ole->stream.buf); + ole->stream.buf_size = num_bytes; + ole->stream.buf = g_new (guint8, num_bytes); + } + buffer = ole->stream.buf; + } + + ptr = buffer; + for (i = first_block ; i <= last_block ; i++ , ptr += count, num_bytes -= count) { + count = ole->info->bb.size - offset; + if (count > num_bytes) + count = num_bytes; + data = ole_get_block (ole, ole->bat.block [i], NULL); + if (data == NULL) + return NULL; + + /* TODO : this could be optimized to avoid the copy */ + memcpy (ptr, data + offset, count); + offset = 0; + } + ole->cur_block = BAT_MAGIC_UNUSED; + ole->cur_offset += num_bytes; + return buffer; +} + +static struct GsfInput * +gsf_infile_msole_new_child (GsfInfileMSOle *parent, + MSOleDirent *dirent) +{ + GsfInfileMSOle * child; + MSOleInfo *info; + MSOleBAT const *metabat; + struct GsfInput *sb_file = NULL; + size_t size_guess; + char * buf; + + + if ( (dirent->index != 0) && + (dirent->is_directory) ) { + /* be wary. It seems as if some implementations pretend that the + * directories contain data */ + return gsf_input_new("", + (off_t) 0, + 0); + } + child = ole_dup (parent); + if (child == NULL) + return NULL; + child->dirent = dirent; + child->size = (off_t) dirent->size; + + info = parent->info; + + if (dirent->use_sb) { /* build the bat */ + metabat = &info->sb.bat; + size_guess = dirent->size >> info->sb.shift; + sb_file = ole_info_get_sb_file (parent); + } else { + metabat = &info->bb.bat; + size_guess = dirent->size >> info->bb.shift; + } + if (ole_make_bat (metabat, size_guess + 1, dirent->first_block, &child->bat)) { + gsf_infile_msole_finalize(child); + return NULL; + } + + if (dirent->use_sb) { + unsigned i; + guint8 const *data; + + if (sb_file == NULL) { + gsf_infile_msole_finalize(child); + return NULL; + } + + child->stream.buf_size = info->threshold; + child->stream.buf = g_new (guint8, info->threshold); + + for (i = 0 ; i < child->bat.num_blocks; i++) + if (gsf_input_seek (sb_file, + (off_t)(child->bat.block [i] << info->sb.shift), SEEK_SET) < 0 || + (data = gsf_input_read (sb_file, + info->sb.size, + child->stream.buf + (i << info->sb.shift))) == NULL) { + gsf_infile_msole_finalize(child); + return NULL; + } + } + buf = malloc(child->size); + if (buf == NULL) { + gsf_infile_msole_finalize(child); + return NULL; + } + if (NULL == gsf_infile_msole_read(child, + child->size, + buf)) { + gsf_infile_msole_finalize(child); + return NULL; + } + gsf_infile_msole_finalize(child); + return gsf_input_new(buf, + (off_t) dirent->size, + 1); +} + + +static struct GsfInput * +gsf_infile_msole_child_by_index (GsfInfileMSOle * ole, int target) +{ + GList *p; + + for (p = ole->dirent->children; p != NULL ; p = p->next) + if (target-- <= 0) + return gsf_infile_msole_new_child (ole, + (MSOleDirent *)p->data); + return NULL; +} + +static char const * +gsf_infile_msole_name_by_index (GsfInfileMSOle * ole, int target) +{ + GList *p; + + for (p = ole->dirent->children; p != NULL ; p = p->next) + if (target-- <= 0) + return ((MSOleDirent *)p->data)->name; + return NULL; +} + +static int +gsf_infile_msole_num_children (GsfInfileMSOle * ole) +{ + g_return_val_if_fail (ole->dirent != NULL, -1); + + if (!ole->dirent->is_directory) + return -1; + return g_list_length (ole->dirent->children); +} + + +/** + * gsf_infile_msole_new : + * @source : + * + * Opens the root directory of an MS OLE file. + * NOTE : adds a reference to @source + * + * Returns : the new ole file handler + **/ +static GsfInfileMSOle * +gsf_infile_msole_new (struct GsfInput *source) +{ + GsfInfileMSOle * ole; + + ole = malloc(sizeof(GsfInfileMSOle)); + if (ole == NULL) + return NULL; + gsf_infile_msole_init(ole); + ole->input = source; + ole->size = (off_t) 0; + + if (ole_init_info (ole)) { + gsf_infile_msole_finalize(ole); + return NULL; + } + + return ole; +} + + + + + + +/* ******************************** main extraction code ************************ */ /* using libgobject, needs init! */ void __attribute__ ((constructor)) ole_gobject_init(void) { @@ -45,8 +1390,8 @@ static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordList *oldhead, const char *phrase, EXTRACTOR_KeywordType type) { - EXTRACTOR_KeywordList * keyword; + if (strlen(phrase) == 0) return oldhead; if (0 == strcmp(phrase, "\"\"")) @@ -279,6 +1624,7 @@ msole_prop_parse(GsfMSOleMetaDataSection *section, guint32 len; gsize gslen; gboolean const is_vector = type & LE_VT_VECTOR; + GError * error; g_return_val_if_fail (!(type & (unsigned)(~0x1fff)), NULL); /* not valid in a prop set */ @@ -346,6 +1692,11 @@ msole_prop_parse(GsfMSOleMetaDataSection *section, break; case LE_VT_CY : d (puts ("VT_CY");); + /* 8-byte two's complement integer (scaled by 10,000) */ + /* CHEAT : just store as an int64 for now */ + g_return_val_if_fail (*data + 8 <= data_end, NULL); + g_value_init (res, G_TYPE_INT64); + g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data)); break; case LE_VT_DATE : d (puts ("VT_DATE");); @@ -394,53 +1745,82 @@ msole_prop_parse(GsfMSOleMetaDataSection *section, case LE_VT_I8 : d (puts ("VT_I8");); g_return_val_if_fail (*data + 8 <= data_end, NULL); g_value_init (res, G_TYPE_INT64); - *data += 8; + g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data)); + *data += 8; break; case LE_VT_UI8 : d (puts ("VT_UI8");); g_return_val_if_fail (*data + 8 <= data_end, NULL); g_value_init (res, G_TYPE_UINT64); + g_value_set_uint64 (res, GSF_LE_GET_GUINT64 (*data)); *data += 8; break; case LE_VT_LPSTR : d (puts ("VT_LPSTR");); + /* + * This is the representation of many strings. It is stored in + * the same representation as VT_BSTR. Note that the serialized + * representation of VP_LPSTR has a preceding byte count, whereas + * the in-memory representation does not. + */ /* be anal and safe */ g_return_val_if_fail (*data + 4 <= data_end, NULL); - + len = GSF_LE_GET_GUINT32 (*data); - + g_return_val_if_fail (len < 0x10000, NULL); g_return_val_if_fail (*data + 4 + len*section->char_size <= data_end, NULL); - - gslen = 0; + + error = NULL; + d (gsf_mem_dump (*data + 4, len * section->char_size);); str = g_convert_with_iconv (*data + 4, len * section->char_size, - section->iconv_handle, &gslen, NULL, NULL); - len = (guint32)gslen; - + section->iconv_handle, NULL, NULL, &error); + g_value_init (res, G_TYPE_STRING); - g_value_set_string (res, str); - g_free (str); - *data += 4 + len; + if (NULL != str) { + g_value_set_string (res, str); + g_free (str); + } else if (NULL != error) { + g_warning ("error: %s", error->message); + g_error_free (error); + } else { + g_warning ("unknown error converting string property, using blank"); + } + *data += 4 + len * section->char_size; break; case LE_VT_LPWSTR : d (puts ("VT_LPWSTR");); + /* + * A counted and null-terminated Unicode string; a DWORD character + * count (where the count includes the terminating null) followed + * by that many Unicode (16-bit) characters. Note that the count + * is character count, not byte count. + */ /* be anal and safe */ g_return_val_if_fail (*data + 4 <= data_end, NULL); - + len = GSF_LE_GET_GUINT32 (*data); - + g_return_val_if_fail (len < 0x10000, NULL); g_return_val_if_fail (*data + 4 + len <= data_end, NULL); - + + error = NULL; + d (gsf_mem_dump (*data + 4, len*2);); str = g_convert (*data + 4, len*2, - "UTF-8", "UTF-16LE", &gslen, NULL, NULL); - len = (guint32)gslen; - + "UTF-8", "UTF-16LE", NULL, NULL, &error); + g_value_init (res, G_TYPE_STRING); - g_value_set_string (res, str); - g_free (str); - *data += 4 + len; + if (NULL != str) { + g_value_set_string (res, str); + g_free (str); + } else if (NULL != error) { + g_warning ("error: %s", error->message); + g_error_free (error); + } else { + g_warning ("unknown error converting string property, using blank"); + } + *data += 4 + len*2; break; case LE_VT_FILETIME : d (puts ("VT_FILETIME");); @@ -467,21 +1847,35 @@ msole_prop_parse(GsfMSOleMetaDataSection *section, break; } case LE_VT_BLOB : d (puts ("VT_BLOB");); + g_free (res); + res = NULL; break; case LE_VT_STREAM : d (puts ("VT_STREAM");); - break; + g_free (res); + res = NULL; + break; case LE_VT_STORAGE : d (puts ("VT_STORAGE");); + g_free (res); + res = NULL; break; case LE_VT_STREAMED_OBJECT: d (puts ("VT_STREAMED_OBJECT");); + g_free (res); + res = NULL; break; case LE_VT_STORED_OBJECT : d (puts ("VT_STORED_OBJECT");); + g_free (res); + res = NULL; break; case LE_VT_BLOB_OBJECT : d (puts ("VT_BLOB_OBJECT");); + g_free (res); + res = NULL; break; case LE_VT_CF : d (puts ("VT_CF");); break; case LE_VT_CLSID : d (puts ("VT_CLSID");); *data += 16; + g_free (res); + res = NULL; break; case LE_VT_ERROR :