libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 50ee542a691d498bf546661702e90ed57b28664e
parent 194df7957ff6877e4a3899b631c7e4435d77e193
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 11 Aug 2012 15:10:15 +0000

ole2 testcase

Diffstat:
M src/plugins/Makefile.am | 6 +++++-
M src/plugins/ole2_extractor.c | 37 ++++++++++++++++++++++---------------
A src/plugins/test_ole2.c | 490 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R test/blair.doc -> src/plugins/testdata/ole2_blair.doc | 0
R test/results.xls -> src/plugins/testdata/ole2_excel.xls | 0
R test/Test.doc -> src/plugins/testdata/ole2_msword.doc | 0
R test/sw40.sdw -> src/plugins/testdata/ole2_starwriter40.sdw | 0
7 files changed, 517 insertions(+), 16 deletions(-)

diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -25,7 +25,11 @@ EXTRA_DIST = template_extractor.c \ testdata/wav_noise.wav \ testdata/wav_alert.wav \ testdata/it_dawn.it \ - testdata/s3m_2nd_pm.s3m + testdata/s3m_2nd_pm.s3m \ + testdata/ole2_msword.doc \ + testdata/ole2_starwriter40.sdw \ + testdata/ole2_blair.doc \ + testdata/ole2_excel.xls if HAVE_VORBISFILE PLUGIN_OGG=libextractor_ogg.la diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c @@ -21,8 +21,7 @@ -- the Gnome Structured File Library Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org) - Part of this code was borrowed from wordleaker.cpp. See also - the README file in this directory. + Part of this code was adapted from wordleaker. */ /** * @file plugins/ole2_extractor.c @@ -67,10 +66,13 @@ */ static int add_metadata (EXTRACTOR_MetaDataProcessor proc, - void *proc_cls, - const char *phrase, - enum EXTRACTOR_MetaType type) + void *proc_cls, + const char *phrase, + enum EXTRACTOR_MetaType type) { + char *tmp; + int ret; + if (0 == strlen (phrase)) return 0; if (0 == strcmp (phrase, "\"\"")) @@ -79,13 +81,21 @@ add_metadata (EXTRACTOR_MetaDataProcessor proc, return 0; if (0 == strcmp (phrase, " ")) return 0; - return proc (proc_cls, - "ole2", - type, - EXTRACTOR_METAFORMAT_UTF8, - "text/plain", - phrase, - strlen (phrase) +1); + if (NULL == (tmp = strdup (phrase))) + return 0; + + while ( (strlen (tmp) > 0) && + (isblank ((unsigned char) tmp [strlen (tmp) - 1])) ) + tmp [strlen (tmp) - 1] = '\0'; + ret = proc (proc_cls, + "ole2", + type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + tmp, + strlen (tmp) + 1); + free (tmp); + return ret; } @@ -212,9 +222,6 @@ process_metadata (gpointer key, } if (NULL == contents) return; - if ( (strlen (contents) > 0) && - ('\n' == contents[strlen (contents) - 1]) ) - contents [strlen (contents) - 1] = '\0'; if (0 == strcmp (type, "meta:generator")) { const char *mimetype = "application/vnd.ms-files"; diff --git 
a/src/plugins/test_ole2.c b/src/plugins/test_ole2.c @@ -0,0 +1,490 @@ +/* + This file is part of libextractor. + (C) 2012 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 3, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. +*/ +/** + * @file plugins/test_ole2.c + * @brief testcase for ole2 plugin + * @author Christian Grothoff + */ +#include "platform.h" +#include "test_lib.h" + + +/** + * Main function for the OLE2 testcase. 
+ * + * @param argc number of arguments (ignored) + * @param argv arguments (ignored) + * @return 0 on success + */ +int +main (int argc, char *argv[]) +{ + struct SolutionData ole2_msword_sol[] = + { + { + EXTRACTOR_METATYPE_CREATOR, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Nils Durner", + strlen ("Nils Durner") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_UNKNOWN_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2005-03-21T06:11:12Z", + strlen ("2005-03-21T06:11:12Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_DESCRIPTION, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "This is a small document to test meta data extraction by GNU libextractor.", + strlen ("This is a small document to test meta data extraction by GNU libextractor.") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_KEYWORDS, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "ole ole2 eole2extractor", + strlen ("ole ole2 eole2extractor") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_SUBJECT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "GNU libextractor", + strlen ("GNU libextractor") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Testcase for the ole2 extractor", + strlen ("Testcase for the ole2 extractor") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_LAST_SAVED_BY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Nils Durner", + strlen ("Nils Durner") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATION_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2005-03-21T06:10:19Z", + strlen ("2005-03-21T06:10:19Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_EDITING_CYCLES, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2", + strlen ("2") + 1, + 0 + }, + { 0, 0, NULL, NULL, 0, -1 } + }; + + struct SolutionData ole2_starwriter_sol[] = + { + { + EXTRACTOR_METATYPE_CREATOR, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Christian Grothoff", + strlen ("Christian Grothoff") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_UNKNOWN_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + 
"2004-09-24T02:54:31Z", + strlen ("2004-09-24T02:54:31Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_DESCRIPTION, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The comments", + strlen ("The comments") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_KEYWORDS, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The Keywords", + strlen ("The Keywords") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_SUBJECT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The Subject", + strlen ("The Subject") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The Title", + strlen ("The Title") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_LAST_SAVED_BY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Christian Grothoff", + strlen ("Christian Grothoff") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATION_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2004-09-24T02:53:15Z", + strlen ("2004-09-24T02:53:15Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_EDITING_CYCLES, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "4", + strlen ("4") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The Title", + strlen ("The Title") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_SUBJECT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The Subject", + strlen ("The Subject") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_COMMENT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The comments", + strlen ("The comments") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_KEYWORDS, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "The Keywords", + strlen ("The Keywords") + 1, + 0 + }, + { 0, 0, NULL, NULL, 0, -1 } + }; + + struct SolutionData ole2_blair_sol[] = + { + { + EXTRACTOR_METATYPE_LANGUAGE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "U.S. English", + strlen ("U.S. 
English") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATOR, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "default", + strlen ("default") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_UNKNOWN_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2003-02-03T11:18:00Z", + strlen ("2003-02-03T11:18:00Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_TITLE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Iraq- ITS INFRASTRUCTURE OF CONCEALMENT, DECEPTION AND INTIMIDATION", + strlen ("Iraq- ITS INFRASTRUCTURE OF CONCEALMENT, DECEPTION AND INTIMIDATION") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CHARACTER_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "22090", + strlen ("22090") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_LAST_SAVED_BY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "MKhan", + strlen ("MKhan") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_PAGE_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "1", + strlen ("1") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_WORD_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "3875", + strlen ("3875") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATION_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2003-02-03T09:31:00Z", + strlen ("2003-02-03T09:31:00Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_EDITING_CYCLES, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "4", + strlen ("4") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "application/vnd.ms-files", + strlen ("application/vnd.ms-files") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Microsoft Word 8.0", + strlen ("Microsoft Word 8.0") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_TEMPLATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Normal.dot", + strlen ("Normal.dot") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_LINE_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "184", + strlen ("184") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_PARAGRAPH_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + 
"text/plain", + "44", + strlen ("44") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #0: Author `cic22' worked on `C:\\DOCUME~1\\phamill\\LOCALS~1\\Temp\\AutoRecovery save of Iraq - security.asd'", + strlen ("Revision #0: Author `cic22' worked on `C:\\DOCUME~1\\phamill\\LOCALS~1\\Temp\\AutoRecovery save of Iraq - security.asd'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #1: Author `cic22' worked on `C:\\DOCUME~1\\phamill\\LOCALS~1\\Temp\\AutoRecovery save of Iraq - security.asd'", + strlen ("Revision #1: Author `cic22' worked on `C:\\DOCUME~1\\phamill\\LOCALS~1\\Temp\\AutoRecovery save of Iraq - security.asd'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #2: Author `cic22' worked on `C:\\DOCUME~1\\phamill\\LOCALS~1\\Temp\\AutoRecovery save of Iraq - security.asd'", + strlen ("Revision #2: Author `cic22' worked on `C:\\DOCUME~1\\phamill\\LOCALS~1\\Temp\\AutoRecovery save of Iraq - security.asd'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #3: Author `JPratt' worked on `C:\\TEMP\\Iraq - security.doc'", + strlen ("Revision #3: Author `JPratt' worked on `C:\\TEMP\\Iraq - security.doc'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #4: Author `JPratt' worked on `A:\\Iraq - security.doc'", + strlen ("Revision #4: Author `JPratt' worked on `A:\\Iraq - security.doc'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #5: Author `ablackshaw' worked on `C:\\ABlackshaw\\Iraq - security.doc'", + strlen ("Revision #5: Author `ablackshaw' worked on `C:\\ABlackshaw\\Iraq - security.doc'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + 
"text/plain", + "Revision #6: Author `ablackshaw' worked on `C:\\ABlackshaw\\A;Iraq - security.doc'", + strlen ("Revision #6: Author `ablackshaw' worked on `C:\\ABlackshaw\\A;Iraq - security.doc'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #7: Author `ablackshaw' worked on `A:\\Iraq - security.doc'", + strlen ("Revision #7: Author `ablackshaw' worked on `A:\\Iraq - security.doc'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #8: Author `MKhan' worked on `C:\\TEMP\\Iraq - security.doc'", + strlen ("Revision #8: Author `MKhan' worked on `C:\\TEMP\\Iraq - security.doc'") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_REVISION_HISTORY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Revision #9: Author `MKhan' worked on `C:\\WINNT\\Profiles\\mkhan\\Desktop\\Iraq.doc'", + strlen ("Revision #9: Author `MKhan' worked on `C:\\WINNT\\Profiles\\mkhan\\Desktop\\Iraq.doc'") + 1, + 0 + }, + { 0, 0, NULL, NULL, 0, -1 } + }; + + struct SolutionData ole2_excel_sol[] = + { + { + EXTRACTOR_METATYPE_CREATOR, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "JV", + strlen ("JV") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_LAST_SAVED_BY, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "JV", + strlen ("JV") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATION_DATE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "2002-03-20T21:26:28Z", + strlen ("2002-03-20T21:26:28Z") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "application/vnd.ms-files", + strlen ("application/vnd.ms-files") + 1, + 0 + }, + { + EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "Microsoft Excel", + strlen ("Microsoft Excel") + 1, + 0 + }, + { 0, 0, NULL, NULL, 0, -1 } + }; + + struct ProblemSet ps[] = + { + { "testdata/ole2_msword.doc", + ole2_msword_sol }, + { "testdata/ole2_starwriter40.sdw", + 
ole2_starwriter_sol }, + { "testdata/ole2_blair.doc", + ole2_blair_sol }, + { "testdata/ole2_excel.xls", + ole2_excel_sol }, + { NULL, NULL } + }; + return ET_main ("ole2", ps); +} + +/* end of test_ole2.c */ diff --git a/test/blair.doc b/src/plugins/testdata/ole2_blair.doc Binary files differ. diff --git a/test/results.xls b/src/plugins/testdata/ole2_excel.xls Binary files differ. diff --git a/test/Test.doc b/src/plugins/testdata/ole2_msword.doc Binary files differ. diff --git a/test/sw40.sdw b/src/plugins/testdata/ole2_starwriter40.sdw Binary files differ.