libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit c099ad78d68ca9d0e91ddbde4da3141b82f1a730
parent a4d078ecfb2703ccd4c17d27559a9ec2f346e5c7
Author: Christian Grothoff <christian@grothoff.org>
Date:   Wed,  8 Mar 2006 13:52:16 +0000

wordleaker integration -- draft

Diffstat:
M configure.ac | 1 +
M src/include/extractor.h | 2 ++
M src/main/extractor.c | 3 ++-
A src/plugins/wordleaker/Makefile.am | 25 +++++++++++++++++++++++++
A src/plugins/wordleaker/SYMBOLS | 1 +
D src/plugins/wordleaker/WordLeaker.cpp | 310 -------------------------------------------------------------------------------
D src/plugins/wordleaker/WordLeaker.h | 287 -------------------------------------------------------------------------------
A src/plugins/wordleaker/wordextractor.cc | 221 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/wordleaker/wordleaker.cpp | 311 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/plugins/wordleaker/wordleaker.h | 287 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 850 insertions(+), 598 deletions(-)

diff --git a/configure.ac b/configure.ac @@ -298,6 +298,7 @@ src/plugins/printable/Makefile src/plugins/hash/Makefile src/plugins/thumbnail/Makefile src/plugins/exiv2/Makefile +src/plugins/wordleaker/Makefile src/test/Makefile ]) diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -140,6 +140,8 @@ typedef enum { EXTRACTOR_ORIENTATION = 87, EXTRACTOR_TEMPLATE = 88, EXTRACTOR_SPLIT = 89, + + EXTRACTOR_PRODUCTVERSION = 90, } EXTRACTOR_KeywordType; /** diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -131,11 +131,12 @@ static const char *keywordTypes[] = { gettext_noop("orientation"), gettext_noop("template"), gettext_noop("split"), + gettext_noop("product version"), NULL, }; /* the number of keyword types (for bounds-checking) */ -#define HIGHEST_TYPE_NUMBER 90 +#define HIGHEST_TYPE_NUMBER 91 #ifdef HAVE_LIBOGG #if HAVE_VORBIS diff --git a/src/plugins/wordleaker/Makefile.am b/src/plugins/wordleaker/Makefile.am @@ -0,0 +1,25 @@ +include ../Makefile-plugins.am + +plugin_LTLIBRARIES = \ + libextractor_word.la + +libextractor_word_la_LINK = \ + /bin/sh ../../../libtool --mode=link $(CXXLD) -o libextractor_word.la +libextractor_word_la_LDFLAGS = \ + $(PLUGINFLAGS) $(retaincommand) \ + $(XTRA_CPPLIBS) +libextractor_word_la_LIBADD = \ + $(top_builddir)/src/main/libextractor.la \ + $(top_builddir)/src/plugins/libconvert.la \ + -lm + +libextractor_word_la_SOURCES = \ + pole.h pole.cpp \ + wordleaker.h \ + wordextractor.cc + +# gcc 3.3 produces BROKEN code for -O1 and -O2 (PDF extraction +# would fail silently) hence we MUST override the user flag here +# which may contain -O1 or -O2! 
+# CXXFLAGS = -O0 + diff --git a/src/plugins/wordleaker/SYMBOLS b/src/plugins/wordleaker/SYMBOLS @@ -0,0 +1 @@ +libextractor_word_extract diff --git a/src/plugins/wordleaker/WordLeaker.cpp b/src/plugins/wordleaker/WordLeaker.cpp @@ -1,310 +0,0 @@ -/* - WordLeaker - Shows information about Word DOC files - Copyright (C) 2005 Sacha Fuentes <madelman@iname.com> - - Based on poledump.c - Original idea from WordDumper (http://www.computerbytesman.com) - Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm - Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this library; see the file COPYING. 
If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, US -*/ - -// TAKE CARE: there's not a single check for validity of data, -// so any malformed or malicious Word file will break it - -#include <iostream> -#include <fstream> -#include <stdlib.h> -#include <list> -#include <ctime> - -#include "pole.h" -#include "WordLeaker.h" - -unsigned long fcSttbSavedBy; -unsigned long lcbSttbSavedBy; - -// read the type of the property and displays its value -void showProperty( POLE::Stream* stream ) { - unsigned long read, type; - unsigned char buffer[256]; - unsigned char c; - unsigned long i; - unsigned long t, t1, t2; - char *s; - - read = stream->read(buffer, 4); - type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - - switch (type) { - case 2: // VT_I2 - read = stream->read(buffer, 2); - i = buffer[0] + (buffer[1] << 8); - cout << i << endl; - break; - case 3: // VT_I4 - read = stream->read(buffer, 4); - i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - cout << i << endl; - break; - case 11: // VT_BOOL - read = stream->read(buffer, 1); - if ((char) buffer[0] == -1) - cout << "true" << endl; - else - cout << "false" << endl; - break; - case 30: // VT_LPSTR - read = stream->read(buffer, 4); - i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - while ((c = stream->getch()) != 0) - cout << c; - cout << endl; - break; - case 64: // VT_FILETIME - read = stream->read(buffer, 8); - t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - t = filetime_to_unixtime(t1, t2); - s = ctime((time_t *) &t); - cout << s; - break; - default: - cout << "Unknown format " << type << endl; - } -} - -// show the revision data (users and files) -void dumpRevision( POLE::Storage* storage ) { - unsigned int nRev; - unsigned int where = 0; - POLE::Stream* stream; - - cout 
<< "Revision:" << endl; - cout << "---------" << endl << endl; - - // FIXME: should look if using 0Table or 1Table - stream = storage->stream( "1Table" ); - if( !stream ) { - cout << "There's no revision information" << endl; - return; - } - - unsigned char * buffer = new unsigned char[lcbSttbSavedBy]; - unsigned char buffer2[1024]; - unsigned int length; - - // goto offset of revision - stream->seek(fcSttbSavedBy); - // read all the revision history - stream->read(buffer, lcbSttbSavedBy); - - // there are n strings, so n/2 revisions (author & file) - nRev = (buffer[2] + (buffer[3] << 8)) / 2; - where = 6; - - for (unsigned int i=0; i < nRev; i++) { - cout << "Rev #" << i << ": Author \""; - length = buffer[where++]; - // it's unicode, for now we only get the low byte - for (unsigned int j=0; j < length; j++) { - where++; - cout << buffer[where]; - where++; - } - where++; - cout << "\" worked on file \""; - length = buffer[where++]; - // it's unicode, for now we only get the low byte - for (unsigned int j=0; j < length; j++) { - where++; - cout << buffer[where]; - where++; - } - where++; - cout << "\"" << endl; - } - - cout << endl; - delete buffer; - -} - -// show data from DocumentSummary stream -void dumpDocumentSummary( POLE::Storage* storage ) { - POLE::Stream* stream; - unsigned long read, nproperties, propertyID, offsetProp, offsetCur; - unsigned long begin; - - cout << "Document Summary:" << endl; - cout << "-----------------" << endl << endl; - - stream = storage->stream( "DocumentSummaryInformation" ); - if( !stream ) { - cout << "There's no document summary information" << endl; - return; - } - - unsigned char buffer[256]; - - // ClassID & Offset - stream->seek(28); - stream->read(buffer, 20); - // beginning of section - begin = stream->tell(); - // length of section - read = stream->read(buffer, 4); - // number of properties - read = stream->read(buffer, 4); - nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - // 
properties - - for (unsigned long i = 0; i < nproperties; i++) { - read = stream->read(buffer, 8); - propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - if (propertyID > 1 && propertyID < 16) { - cout << DocumentSummaryProperties[propertyID] << ": "; - offsetCur = stream->tell(); - stream->seek(offsetProp + begin); - // read and show the property - showProperty(stream); - stream->seek(offsetCur); - } - } - - cout << endl; -} - -// show data from Summary stream -void dumpSummary( POLE::Storage* storage ) { - POLE::Stream* stream; - unsigned long read, nproperties, propertyID, offsetProp, offsetCur; - unsigned long begin; - - cout << "Summary:" << endl; - cout << "--------" << endl << endl; - - stream = storage->stream( "SummaryInformation" ); - if( !stream ) { - cout << "There's no summary information" << endl; - return; - } - - unsigned char buffer[256]; - - // ClassID & Offset - stream->seek(28); - stream->read(buffer, 20); - // beginning of section - begin = stream->tell(); - // length of section - read = stream->read(buffer, 4); - // number of properties - read = stream->read(buffer, 4); - nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - // properties - for (unsigned long i = 0; i < nproperties; i++) { - read = stream->read(buffer, 8); - propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); - offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); - if (propertyID > 1 && propertyID < 20) { - cout << SummaryProperties[propertyID] << ": "; - offsetCur = stream->tell(); - stream->seek(offsetProp + begin); - // read and show the property - showProperty(stream); - stream->seek(offsetCur); - } - } - - cout << endl; -} - -// reads the header of the file -bool readFIB( char* filename ) { - fstream file; - - file.open( filename, std::ios::binary | std::ios::in ); 
- if( !file.good() ) { - cout << "Can't find the file" << endl; - return false; - } - - unsigned char * buffer = new unsigned char[898]; - file.seekg( 512 ); - file.read( (char*)buffer, 898 ); - file.close(); - - unsigned int wIdent = buffer[0] + (buffer[1] << 8); - unsigned int nProduct = buffer[4] + (buffer[5] << 8); - unsigned int lid = buffer[6] + (buffer[7] << 8); - unsigned int envr = buffer[18]; - unsigned int wMagicCreated = buffer[34] + (buffer[35] << 8); - unsigned int wMagicRevised = buffer[36] + (buffer[37] << 8); - unsigned long lProductCreated = buffer[68] + (buffer[69] << 8) + (buffer[70] << 16) + (buffer[71] << 24); - unsigned long lProductRevised = buffer[72] + (buffer[73] << 8) + (buffer[74] << 16) + (buffer[75] << 24); - fcSttbSavedBy = buffer[722] + (buffer[723] << 8) + (buffer[724] << 16) + (buffer[725] << 24); - lcbSttbSavedBy = buffer[726] + (buffer[727] << 8) + (buffer[728] << 16) + (buffer[729] << 24); - delete[] buffer; - - cout << "File: " << filename << endl; - cout << "Product version: " << nProduct << endl; - cout << "Language: " << lidToLanguage(lid) << endl; - cout << "Created by: " << idToProduct(wMagicCreated) << " (Build " << dateToString(lProductCreated) << ")" << endl; - cout << "Revised by: " << idToProduct(wMagicRevised) << " (Build " << dateToString(lProductRevised) << ")" << endl; - cout << endl; - - return true; - -} - -int main(int argc, char *argv[]) { - cout << endl << "WordLeaker v.0.1" << endl; - cout << " by Madelman (http://elligre.tk/madelman/)" << endl << endl; - - - if( argc < 2 ) { - cout << " You must supply a filename" << endl << endl; - return 0; - } - - char* filename = argv[1]; - - if ( !readFIB(filename) ) - return 1; - - POLE::Storage* storage = new POLE::Storage( filename ); - storage->open(); - if( storage->result() != POLE::Storage::Ok ) { - cout << "The file " << filename << " is not a Word document" << endl; - return 1; - } - - dumpSummary( storage ); - // FIXME: doesn't always work - // but there's 
nothing really interesting in here - //dumpDocumentSummary( storage ); - dumpRevision( storage ); - // TODO: we don't show the GUID - // TODO: we don't show the macros - - delete storage; - - return 0; -} - diff --git a/src/plugins/wordleaker/WordLeaker.h b/src/plugins/wordleaker/WordLeaker.h @@ -1,287 +0,0 @@ -/* - WordLeaker - Shows information about Word DOC files - Copyright (C) 2005 Sacha Fuentes <madelman@iname.com> - - Based on poledump.c - Original idea from WordDumper (http://www.computerbytesman.com) - Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm - Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this library; see the file COPYING. 
If not, write to - the Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, US -*/ - -#include <string> - -using namespace std; - -static char* SummaryProperties[] = { -"Unknown", -"Unknown", -"Title", -"Subject", -"Author", -"Keywords", -"Comments", -"Template", -"Last Saved By", -"Revision Number", -"Total Editing Time", -"Last Printed", -"Create Time/Date", -"Last Saved Time/Date", -"Number of Pages", -"Number of Words", -"Number of Characters", -"Thumbnails", -"Creating Application", -"Security" -}; - -static char* DocumentSummaryProperties[] = { -"Dictionary", -"Code page", -"Category", -"PresentationTarget", -"Bytes", -"Lines", -"Paragraphs", -"Slides", -"Notes", -"HiddenSlides", -"MMClips", -"ScaleCrop", -"HeadingPairs", -"TitlesofParts", -"Manager", -"Company", -"LinksUpTo" -}; - -string dateToString( unsigned long date ) { - char f[9]; - sprintf(f, "%d/%d/%d", (date / 10000 % 100), (date / 100 % 100), (date % 100)); - return f; -} - -string idToProduct( unsigned int id ) { - // TODO: find the rest of ids - switch ( id ) { - case 0x6A62: - return "Word 97"; - case 0x626A: - return "Word 98 (Mac)"; - default: - return "Unknown"; - } -} - -string lidToLanguage( unsigned int lid ) { - switch ( lid ) { - case 0x0400: - return "No Proofing"; - case 0x0401: - return "Arabic"; - case 0x0402: - return "Bulgarian"; - case 0x0403: - return "Catalan"; - case 0x0404: - return "Traditional Chinese"; - case 0x0804: - return "Simplified Chinese"; - case 0x0405: - return "Czech"; - case 0x0406: - return "Danish"; - case 0x0407: - return "German"; - case 0x0807: - return "Swiss German"; - case 0x0408: - return "Greek"; - case 0x0409: - return "U.S. English"; - case 0x0809: - return "U.K. 
English"; - case 0x0c09: - return "Australian English"; - case 0x040a: - return "Castilian Spanish"; - case 0x080a: - return "Mexican Spanish"; - case 0x040b: - return "Finnish"; - case 0x040c: - return "French"; - case 0x080c: - return "Belgian French"; - case 0x0c0c: - return "Canadian French"; - case 0x100c: - return "Swiss French"; - case 0x040d: - return "Hebrew"; - case 0x040e: - return "Hungarian"; - case 0x040f: - return "Icelandic"; - case 0x0410: - return "Italian"; - case 0x0810: - return "Swiss Italian"; - case 0x0411: - return "Japanese"; - case 0x0412: - return "Korean"; - case 0x0413: - return "Dutch"; - case 0x0813: - return "Belgian Dutch"; - case 0x0414: - return "Norwegian - Bokmal"; - case 0x0814: - return "Norwegian - Nynorsk"; - case 0x0415: - return "Polish"; - case 0x0416: - return "Brazilian Portuguese"; - case 0x0816: - return "Portuguese"; - case 0x0417: - return "Rhaeto-Romanic"; - case 0x0418: - return "Romanian"; - case 0x0419: - return "Russian"; - case 0x041a: - return "Croato-Serbian (Latin)"; - case 0x081a: - return "Serbo-Croatian (Cyrillic)"; - case 0x041b: - return "Slovak"; - case 0x041c: - return "Albanian"; - case 0x041d: - return "Swedish"; - case 0x041e: - return "Thai"; - case 0x041f: - return "Turkish"; - case 0x0420: - return "Urdu"; - case 0x0421: - return "Bahasa"; - case 0x0422: - return "Ukrainian"; - case 0x0423: - return "Byelorussian"; - case 0x0424: - return "Slovenian"; - case 0x0425: - return "Estonian"; - case 0x0426: - return "Latvian"; - case 0x0427: - return "Lithuanian"; - case 0x0429: - return "Farsi"; - case 0x042D: - return "Basque"; - case 0x042F: - return "Macedonian"; - case 0x0436: - return "Afrikaans"; - case 0x043E: - return "Malaysian"; - default: - return "Unknown"; - } -} - -/* - * filetime_to_unixtime - * - * Adapted from work in 'wv' by: - * Caolan McNamara (Caolan.McNamara@ul.ie) - */ -#define HIGH32_DELTA 27111902 -#define MID16_DELTA 54590 -#define LOW16_DELTA 32768 - -unsigned long 
filetime_to_unixtime (unsigned long low_time, unsigned long high_time) { - unsigned long low16;/* 16 bit, low bits */ - unsigned long mid16;/* 16 bit, medium bits */ - unsigned long hi32;/* 32 bit, high bits */ - unsigned int carry;/* carry bit for subtraction */ - int negative;/* whether a represents a negative value */ - -/* Copy the time values to hi32/mid16/low16 */ -hi32 = high_time; -mid16 = low_time >> 16; -low16 = low_time & 0xffff; - -/* Subtract the time difference */ -if (low16 >= LOW16_DELTA ) -low16 -= LOW16_DELTA , carry = 0; -else -low16 += (1 << 16) - LOW16_DELTA , carry = 1; - -if (mid16 >= MID16_DELTA + carry) -mid16 -= MID16_DELTA + carry, carry = 0; -else -mid16 += (1 << 16) - MID16_DELTA - carry, carry = 1; - -hi32 -= HIGH32_DELTA + carry; - -/* If a is negative, replace a by (-1-a) */ -negative = (hi32 >= ((unsigned long)1) << 31); -if (negative) { -/* Set a to -a - 1 (a is hi32/mid16/low16) */ -low16 = 0xffff - low16; -mid16 = 0xffff - mid16; -hi32 = ~hi32; -} - -/* - * Divide a by 10000000 (a = hi32/mid16/low16), put the rest into r. - * Split the divisor into 10000 * 1000 which are both less than 0xffff. - */ -mid16 += (hi32 % 10000) << 16; -hi32 /= 10000; -low16 += (mid16 % 10000) << 16; -mid16 /= 10000; -low16 /= 10000; - -mid16 += (hi32 % 1000) << 16; -hi32 /= 1000; -low16 += (mid16 % 1000) << 16; -mid16 /= 1000; -low16 /= 1000; - -/* If a was negative, replace a by (-1-a) and r by (9999999 - r) */ -if (negative) { -/* Set a to -a - 1 (a is hi32/mid16/low16) */ -low16 = 0xffff - low16; -mid16 = 0xffff - mid16; -hi32 = ~hi32; -} - -/* Do not replace this by << 32, it gives a compiler warning and - * it does not work - */ -return ((((unsigned long)hi32) << 16) << 16) + (mid16 << 16) + low16; - -} diff --git a/src/plugins/wordleaker/wordextractor.cc b/src/plugins/wordleaker/wordextractor.cc @@ -0,0 +1,221 @@ +/* + This file is part of libextractor. 
+ (C) 2006 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + + This code depends heavily on the wordleaker code and + a lot of code was borrowed from wordleaker.cpp. See also + the README file in this directory. + */ + +#include "platform.h" +#include "extractor.h" +#include "../convert.h" +#include <math.h> + +#include "wordleaker.h" +#include "pole.h" + +extern "C" { + + static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type, + const char * keyword, + struct EXTRACTOR_Keywords * next) { + EXTRACTOR_KeywordList * result; + + if (keyword == NULL) + return next; + result = (EXTRACTOR_KeywordList*) malloc(sizeof(EXTRACTOR_KeywordList)); + result->next = next; + result->keyword = strdup(keyword); + result->keywordType = type; + return result; + } + + + // read the type of the property and displays its value + char * getProperty( POLE::Stream* stream ) { + unsigned long read, type; + unsigned char buffer[256]; + unsigned char c; + unsigned long i; + unsigned int j; + unsigned long t, t1, t2; + char *s; + + read = stream->read(buffer, 4); + type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + + switch (type) { + case 2: // VT_I2 + read = stream->read(buffer, 2); + i = buffer[0] + (buffer[1] << 8); + s = (char*) malloc(16); + snprintf(s, 
16, "%u", i); + return s; + case 3: // VT_I4 + read = stream->read(buffer, 4); + i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + s = (char*) malloc(16); + snprintf(s, 16, "%u", i); + return s; + case 11: // VT_BOOL + read = stream->read(buffer, 1); + if ((char) buffer[0] == -1) + return strdup("true"); + return strdup("false"); + case 30: // VT_LPSTR + read = stream->read(buffer, 4); + i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + if ( (i < 0) || (i > 16*1024*1024)) + return NULL; + s = (char*) malloc(i+1); + s[i] = '\0'; + j = 0; + while ( ((c = stream->getch()) != 0) && (i > j) ) + s[j++] = c; + if (j != i) { + free(s); + return NULL; + } + return s; + case 64: // VT_FILETIME + read = stream->read(buffer, 8); + t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); + t = filetime_to_unixtime(t1, t2); + return ctime_r((time_t *) &t, (char*)malloc(32)); + } + return NULL; + } + + + struct EXTRACTOR_Keywords * libextractor_word_extract(const char * filename, + const char * data, + size_t size, + struct EXTRACTOR_Keywords * prev) { + char ver[16]; + if (size < 512 + 898) + return prev; + const unsigned char * buffer = (const unsigned char*) &data[512]; + unsigned int wIdent = buffer[0] + (buffer[1] << 8); + unsigned int nProduct = buffer[4] + (buffer[5] << 8); + unsigned int lid = buffer[6] + (buffer[7] << 8); + unsigned int envr = buffer[18]; + unsigned int wMagicCreated = buffer[34] + (buffer[35] << 8); + unsigned int wMagicRevised = buffer[36] + (buffer[37] << 8); + unsigned long lProductCreated = buffer[68] + (buffer[69] << 8) + (buffer[70] << 16) + (buffer[71] << 24); + unsigned long lProductRevised = buffer[72] + (buffer[73] << 8) + (buffer[74] << 16) + (buffer[75] << 24); + unsigned long fcSttbSavedBy = buffer[722] + (buffer[723] << 8) + (buffer[724] << 16) + (buffer[725] << 24); + unsigned long 
lcbSttbSavedBy = buffer[726] + (buffer[727] << 8) + (buffer[728] << 16) + (buffer[729] << 24); + + snprintf(ver, 16, "%u", nProduct); + prev = addKeyword(EXTRACTOR_PRODUCTVERSION, + ver, + prev); + prev = addKeyword(EXTRACTOR_LANGUAGE, + lidToLanguage(lid), + prev); + + // cout << "Created by: " << idToProduct(wMagicCreated) << " (Build " << dateToString(lProductCreated) << ")" << endl; + // cout << "Revised by: " << idToProduct(wMagicRevised) << " (Build " << dateToString(lProductRevised) << ")" << endl; + + POLE::Storage* storage = new POLE::Storage( filename ); + storage->open(); + if( storage->result() != POLE::Storage::Ok ) + return prev; + + POLE::Stream * stream = storage->stream( "SummaryInformation" ); + if (stream) { + unsigned char buffer[256]; + + // ClassID & Offset + stream->seek(28); + stream->read(buffer, 20); + // beginning of section + unsigned long begin = stream->tell(); + // length of section + unsigned long read = stream->read(buffer, 4); + // number of properties + read = stream->read(buffer, 4); + unsigned int nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + // properties + for (unsigned int i = 0; i < nproperties; i++) { + read = stream->read(buffer, 8); + unsigned int propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + unsigned int offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); + if (propertyID > 1 && propertyID < 20) { + // cout << SummaryProperties[propertyID] << ": "; + unsigned long offsetCur = stream->tell(); + stream->seek(offsetProp + begin); + // read and show the property + char * prop = getProperty(stream); + free(prop); + stream->seek(offsetCur); + } + } + } + + unsigned int where = 0; + + // FIXME: should look if using 0Table or 1Table + stream = storage->stream( "1Table" ); + if (stream) { + unsigned char * buffer = new unsigned char[lcbSttbSavedBy]; + unsigned char buffer2[1024]; + + // goto offset of revision + 
stream->seek(fcSttbSavedBy); + // read all the revision history + stream->read(buffer, lcbSttbSavedBy); + + // there are n strings, so n/2 revisions (author & file) + unsigned int nRev = (buffer[2] + (buffer[3] << 8)) / 2; + where = 6; + + for (unsigned int i=0; i < nRev; i++) { + // cout << "Rev #" << i << ": Author \""; + unsigned int length = buffer[where++]; + // it's unicode, for now we only get the low byte + for (unsigned int j=0; j < length; j++) { + where++; + // cout << buffer[where]; + where++; + } + where++; + // cout << "\" worked on file \""; + length = buffer[where++]; + // it's unicode, for now we only get the low byte + for (unsigned int j=0; j < length; j++) { + where++; + // cout << buffer[where]; + where++; + } + where++; + // cout << "\"" << endl; + } + + delete buffer; + + } + delete storage; + + return prev; + } + +} + diff --git a/src/plugins/wordleaker/wordleaker.cpp b/src/plugins/wordleaker/wordleaker.cpp @@ -0,0 +1,311 @@ +/* + WordLeaker - Shows information about Word DOC files + Copyright (C) 2005 Sacha Fuentes <madelman@iname.com> + + Based on poledump.c + Original idea from WordDumper (http://www.computerbytesman.com) + Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm + Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this library; see the file COPYING. 
If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, US +*/ + +// TAKE CARE: there's not a single check for validity of data, +// so any malformed or malicious Word file will break it + +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <list> +#include <ctime> + +#include "pole.h" +#include "WordLeaker.h" + +unsigned long fcSttbSavedBy; +unsigned long lcbSttbSavedBy; + +// read the type of the property and displays its value +void showProperty( POLE::Stream* stream ) { + unsigned long read, type; + unsigned char buffer[256]; + unsigned char c; + unsigned long i; + unsigned long t, t1, t2; + char *s; + + read = stream->read(buffer, 4); + type = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + + switch (type) { + case 2: // VT_I2 + read = stream->read(buffer, 2); + i = buffer[0] + (buffer[1] << 8); + cout << i << endl; + break; + case 3: // VT_I4 + read = stream->read(buffer, 4); + i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + cout << i << endl; + break; + case 11: // VT_BOOL + read = stream->read(buffer, 1); + if ((char) buffer[0] == -1) + cout << "true" << endl; + else + cout << "false" << endl; + break; + case 30: // VT_LPSTR + read = stream->read(buffer, 4); + i = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + while ((c = stream->getch()) != 0) + cout << c; + cout << endl; + break; + case 64: // VT_FILETIME + read = stream->read(buffer, 8); + t1 = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + t2 = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); + t = filetime_to_unixtime(t1, t2); + s = ctime((time_t *) &t); + cout << s; + break; + default: + cout << "Unknown format " << type << endl; + } +} + +// show the revision data (users and files) +void dumpRevision( POLE::Storage* storage ) { + unsigned int nRev; + unsigned int where = 0; + POLE::Stream* stream; + + cout 
<< "Revision:" << endl; + cout << "---------" << endl << endl; + + // FIXME: should look if using 0Table or 1Table + stream = storage->stream( "1Table" ); + if( !stream ) { + cout << "There's no revision information" << endl; + return; + } + + unsigned char * buffer = new unsigned char[lcbSttbSavedBy]; + unsigned char buffer2[1024]; + unsigned int length; + + // goto offset of revision + stream->seek(fcSttbSavedBy); + // read all the revision history + stream->read(buffer, lcbSttbSavedBy); + + // there are n strings, so n/2 revisions (author & file) + nRev = (buffer[2] + (buffer[3] << 8)) / 2; + where = 6; + + for (unsigned int i=0; i < nRev; i++) { + cout << "Rev #" << i << ": Author \""; + length = buffer[where++]; + // it's unicode, for now we only get the low byte + for (unsigned int j=0; j < length; j++) { + where++; + cout << buffer[where]; + where++; + } + where++; + cout << "\" worked on file \""; + length = buffer[where++]; + // it's unicode, for now we only get the low byte + for (unsigned int j=0; j < length; j++) { + where++; + cout << buffer[where]; + where++; + } + where++; + cout << "\"" << endl; + } + + cout << endl; + delete buffer; + +} + +// show data from DocumentSummary stream +void dumpDocumentSummary( POLE::Storage* storage ) { + POLE::Stream* stream; + unsigned long read, nproperties, propertyID, offsetProp, offsetCur; + unsigned long begin; + + cout << "Document Summary:" << endl; + cout << "-----------------" << endl << endl; + + stream = storage->stream( "DocumentSummaryInformation" ); + if( !stream ) { + cout << "There's no document summary information" << endl; + return; + } + + unsigned char buffer[256]; + + // ClassID & Offset + stream->seek(28); + stream->read(buffer, 20); + // beginning of section + begin = stream->tell(); + // length of section + read = stream->read(buffer, 4); + // number of properties + read = stream->read(buffer, 4); + nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + // 
properties + + for (unsigned long i = 0; i < nproperties; i++) { + read = stream->read(buffer, 8); + propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); + if (propertyID > 1 && propertyID < 16) { + cout << DocumentSummaryProperties[propertyID] << ": "; + offsetCur = stream->tell(); + stream->seek(offsetProp + begin); + // read and show the property + showProperty(stream); + stream->seek(offsetCur); + } + } + + cout << endl; +} + +// show data from Summary stream +void dumpSummary( POLE::Storage* storage ) { + POLE::Stream* stream; + unsigned long read, nproperties, propertyID, offsetProp, offsetCur; + unsigned long begin; + + cout << "Summary:" << endl; + cout << "--------" << endl << endl; + + stream = storage->stream( "SummaryInformation" ); + if( !stream ) { + cout << "There's no summary information" << endl; + return; + } + + unsigned char buffer[256]; + + // ClassID & Offset + stream->seek(28); + stream->read(buffer, 20); + // beginning of section + begin = stream->tell(); + // length of section + read = stream->read(buffer, 4); + // number of properties + read = stream->read(buffer, 4); + nproperties = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + // properties + for (unsigned long i = 0; i < nproperties; i++) { + read = stream->read(buffer, 8); + propertyID = buffer[0] + (buffer[1] << 8) + (buffer[2] << 16) + (buffer[3] << 24); + offsetProp = buffer[4] + (buffer[5] << 8) + (buffer[6] << 16) + (buffer[7] << 24); + if (propertyID > 1 && propertyID < 20) { + cout << SummaryProperties[propertyID] << ": "; + offsetCur = stream->tell(); + stream->seek(offsetProp + begin); + // read and show the property + showProperty(stream); + stream->seek(offsetCur); + } + } + + cout << endl; +} + +// reads the header of the file +bool readFIB( char* filename ) { + fstream file; + + file.open( filename, std::ios::binary | std::ios::in ); 
+ if( !file.good() ) { + cout << "Can't find the file" << endl; + return false; + } + + unsigned char * buffer = new unsigned char[898]; + file.seekg( 512 ); + file.read( (char*)buffer, 898 ); + file.close(); + + unsigned int wIdent = buffer[0] + (buffer[1] << 8); + unsigned int nProduct = buffer[4] + (buffer[5] << 8); + unsigned int lid = buffer[6] + (buffer[7] << 8); + unsigned int envr = buffer[18]; + unsigned int wMagicCreated = buffer[34] + (buffer[35] << 8); + unsigned int wMagicRevised = buffer[36] + (buffer[37] << 8); + unsigned long lProductCreated = buffer[68] + (buffer[69] << 8) + (buffer[70] << 16) + (buffer[71] << 24); + unsigned long lProductRevised = buffer[72] + (buffer[73] << 8) + (buffer[74] << 16) + (buffer[75] << 24); + fcSttbSavedBy = buffer[722] + (buffer[723] << 8) + (buffer[724] << 16) + (buffer[725] << 24); + lcbSttbSavedBy = buffer[726] + (buffer[727] << 8) + (buffer[728] << 16) + (buffer[729] << 24); + delete[] buffer; + + cout << "File: " << filename << endl; + cout << "Product version: " << nProduct << endl; + cout << "Language: " << lidToLanguage(lid) << endl; + cout << "Created by: " << idToProduct(wMagicCreated) << " (Build " << dateToString(lProductCreated) << ")" << endl; + cout << "Revised by: " << idToProduct(wMagicRevised) << " (Build " << dateToString(lProductRevised) << ")" << endl; + cout << endl; + + return true; + +} + +#if HAVE_MAIN +int main(int argc, char *argv[]) { + cout << endl << "WordLeaker v.0.1" << endl; + cout << " by Madelman (http://elligre.tk/madelman/)" << endl << endl; + + + if( argc < 2 ) { + cout << " You must supply a filename" << endl << endl; + return 0; + } + + char* filename = argv[1]; + + if ( !readFIB(filename) ) + return 1; + + POLE::Storage* storage = new POLE::Storage( filename ); + storage->open(); + if( storage->result() != POLE::Storage::Ok ) { + cout << "The file " << filename << " is not a Word document" << endl; + return 1; + } + + dumpSummary( storage ); + // FIXME: doesn't always work + 
// but there's nothing really interesting in here + //dumpDocumentSummary( storage ); + dumpRevision( storage ); + // TODO: we don't show the GUID + // TODO: we don't show the macros + + delete storage; + + return 0; +} +#endif diff --git a/src/plugins/wordleaker/wordleaker.h b/src/plugins/wordleaker/wordleaker.h @@ -0,0 +1,287 @@ +/* + WordLeaker - Shows information about Word DOC files + Copyright (C) 2005 Sacha Fuentes <madelman@iname.com> + + Based on poledump.c + Original idea from WordDumper (http://www.computerbytesman.com) + Info on Word format: http://www.aozw65.dsl.pipex.com/generator_wword8.htm + Info on Word format: http://jakarta.apache.org/poi/hpsf/internals.html + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this library; see the file COPYING. 
If not, write to + the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, US +*/ + +#include <string> + +using namespace std; + +static char* SummaryProperties[] = { +"Unknown", +"Unknown", +"Title", +"Subject", +"Author", +"Keywords", +"Comments", +"Template", +"Last Saved By", +"Revision Number", +"Total Editing Time", +"Last Printed", +"Create Time/Date", +"Last Saved Time/Date", +"Number of Pages", +"Number of Words", +"Number of Characters", +"Thumbnails", +"Creating Application", +"Security" +}; + +static char* DocumentSummaryProperties[] = { +"Dictionary", +"Code page", +"Category", +"PresentationTarget", +"Bytes", +"Lines", +"Paragraphs", +"Slides", +"Notes", +"HiddenSlides", +"MMClips", +"ScaleCrop", +"HeadingPairs", +"TitlesofParts", +"Manager", +"Company", +"LinksUpTo" +}; + +string dateToString( unsigned long date ) { + char f[9]; + sprintf(f, "%d/%d/%d", (date / 10000 % 100), (date / 100 % 100), (date % 100)); + return f; +} + +string idToProduct( unsigned int id ) { + // TODO: find the rest of ids + switch ( id ) { + case 0x6A62: + return "Word 97"; + case 0x626A: + return "Word 98 (Mac)"; + default: + return "Unknown"; + } +} + +const char * lidToLanguage( unsigned int lid ) { + switch ( lid ) { + case 0x0400: + return "No Proofing"; + case 0x0401: + return "Arabic"; + case 0x0402: + return "Bulgarian"; + case 0x0403: + return "Catalan"; + case 0x0404: + return "Traditional Chinese"; + case 0x0804: + return "Simplified Chinese"; + case 0x0405: + return "Czech"; + case 0x0406: + return "Danish"; + case 0x0407: + return "German"; + case 0x0807: + return "Swiss German"; + case 0x0408: + return "Greek"; + case 0x0409: + return "U.S. English"; + case 0x0809: + return "U.K. 
English"; + case 0x0c09: + return "Australian English"; + case 0x040a: + return "Castilian Spanish"; + case 0x080a: + return "Mexican Spanish"; + case 0x040b: + return "Finnish"; + case 0x040c: + return "French"; + case 0x080c: + return "Belgian French"; + case 0x0c0c: + return "Canadian French"; + case 0x100c: + return "Swiss French"; + case 0x040d: + return "Hebrew"; + case 0x040e: + return "Hungarian"; + case 0x040f: + return "Icelandic"; + case 0x0410: + return "Italian"; + case 0x0810: + return "Swiss Italian"; + case 0x0411: + return "Japanese"; + case 0x0412: + return "Korean"; + case 0x0413: + return "Dutch"; + case 0x0813: + return "Belgian Dutch"; + case 0x0414: + return "Norwegian - Bokmal"; + case 0x0814: + return "Norwegian - Nynorsk"; + case 0x0415: + return "Polish"; + case 0x0416: + return "Brazilian Portuguese"; + case 0x0816: + return "Portuguese"; + case 0x0417: + return "Rhaeto-Romanic"; + case 0x0418: + return "Romanian"; + case 0x0419: + return "Russian"; + case 0x041a: + return "Croato-Serbian (Latin)"; + case 0x081a: + return "Serbo-Croatian (Cyrillic)"; + case 0x041b: + return "Slovak"; + case 0x041c: + return "Albanian"; + case 0x041d: + return "Swedish"; + case 0x041e: + return "Thai"; + case 0x041f: + return "Turkish"; + case 0x0420: + return "Urdu"; + case 0x0421: + return "Bahasa"; + case 0x0422: + return "Ukrainian"; + case 0x0423: + return "Byelorussian"; + case 0x0424: + return "Slovenian"; + case 0x0425: + return "Estonian"; + case 0x0426: + return "Latvian"; + case 0x0427: + return "Lithuanian"; + case 0x0429: + return "Farsi"; + case 0x042D: + return "Basque"; + case 0x042F: + return "Macedonian"; + case 0x0436: + return "Afrikaans"; + case 0x043E: + return "Malaysian"; + default: + return "Unknown"; + } +} + +/* + * filetime_to_unixtime + * + * Adapted from work in 'wv' by: + * Caolan McNamara (Caolan.McNamara@ul.ie) + */ +#define HIGH32_DELTA 27111902 +#define MID16_DELTA 54590 +#define LOW16_DELTA 32768 + +unsigned long 
filetime_to_unixtime (unsigned long low_time, unsigned long high_time) { + unsigned long low16;/* 16 bit, low bits */ + unsigned long mid16;/* 16 bit, medium bits */ + unsigned long hi32;/* 32 bit, high bits */ + unsigned int carry;/* carry bit for subtraction */ + int negative;/* whether a represents a negative value */ + +/* Copy the time values to hi32/mid16/low16 */ +hi32 = high_time; +mid16 = low_time >> 16; +low16 = low_time & 0xffff; + +/* Subtract the time difference */ +if (low16 >= LOW16_DELTA ) +low16 -= LOW16_DELTA , carry = 0; +else +low16 += (1 << 16) - LOW16_DELTA , carry = 1; + +if (mid16 >= MID16_DELTA + carry) +mid16 -= MID16_DELTA + carry, carry = 0; +else +mid16 += (1 << 16) - MID16_DELTA - carry, carry = 1; + +hi32 -= HIGH32_DELTA + carry; + +/* If a is negative, replace a by (-1-a) */ +negative = (hi32 >= ((unsigned long)1) << 31); +if (negative) { +/* Set a to -a - 1 (a is hi32/mid16/low16) */ +low16 = 0xffff - low16; +mid16 = 0xffff - mid16; +hi32 = ~hi32; +} + +/* + * Divide a by 10000000 (a = hi32/mid16/low16), put the rest into r. + * Split the divisor into 10000 * 1000 which are both less than 0xffff. + */ +mid16 += (hi32 % 10000) << 16; +hi32 /= 10000; +low16 += (mid16 % 10000) << 16; +mid16 /= 10000; +low16 /= 10000; + +mid16 += (hi32 % 1000) << 16; +hi32 /= 1000; +low16 += (mid16 % 1000) << 16; +mid16 /= 1000; +low16 /= 1000; + +/* If a was negative, replace a by (-1-a) and r by (9999999 - r) */ +if (negative) { +/* Set a to -a - 1 (a is hi32/mid16/low16) */ +low16 = 0xffff - low16; +mid16 = 0xffff - mid16; +hi32 = ~hi32; +} + +/* Do not replace this by << 32, it gives a compiler warning and + * it does not work + */ +return ((((unsigned long)hi32) << 16) << 16) + (mid16 << 16) + low16; + +}