libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 28e77e0ef9ac3719c31cb045feb519ce3d7cbd11
parent 75392c60408c1692c34c2f4c72452273f0a02625
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 25 Mar 2006 20:06:01 +0000

sync

Diffstat:
MChangeLog | 5+++++
MTODO | 1-
Mconfigure.ac | 64++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mdoc/extract.1 | 4++--
Mdoc/libextractor.3 | 2+-
Msrc/plugins/Makefile.am | 20++++++++++++++++++--
Msrc/plugins/pdfextractor.c | 255+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Msrc/plugins/pngextractor.c | 2+-
8 files changed, 294 insertions(+), 59 deletions(-)

diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,8 @@ +Fri Mar 24 21:43:43 PST 2006 + Started re-implementation of PDF support from scratch + (incomplete but working). Improvements to the build + system. + Thu Mar 9 17:46:39 PST 2006 Added support for wordleaker (additional meta-data for OLE2 streams). Releasing libextractor 0.5.11. diff --git a/TODO b/TODO @@ -1,5 +1,4 @@ FIX: -* HTML-extractor now broken (!) Also crappy code. FIX?! * check exiv2 memory consumption on very large files; also investigate 500kb (!) allocation/leak in exiv2 on test/test.html (reported by valgrind) diff --git a/configure.ac b/configure.ac @@ -261,6 +261,25 @@ AC_ARG_ENABLE(printable, printable=1]) AM_CONDITIONAL(HAVE_PRINTABLE, test x$printable != x0) + +xpdf=0 +AC_MSG_CHECKING([whether to enable xpdf-based extractor]) +AC_ARG_ENABLE(xpdf, + [AC_HELP_STRING([--enable-xpdf],[Enable xpdf-based extractor]) + AC_HELP_STRING([--disable-xpdf],[Disable xpdf-based extractor])], + [case "$enableval" in + no) AC_MSG_RESULT(no) + xpdf=0 + ;; + *) AC_MSG_RESULT(yes) + xpdf=1 + ;; + esac], + [ AC_MSG_RESULT(no) + xpdf=0]) +AM_CONDITIONAL(HAVE_XPDF, test x$xpdf != x0) + + exiv2=1 AC_MSG_CHECKING([whether to enable exiv2 extractor]) AC_ARG_ENABLE(exiv2, @@ -303,3 +322,48 @@ src/test/Makefile ]) AC_OUTPUT + +if test "x$xpdf" = "x1" +then + AC_MSG_NOTICE([NOTICE: xpdf enabled (xpdf has a bad security record)]) +else + AC_MSG_NOTICE([NOTICE: xpdf disabled (result: limited PDF support)]) +fi + +if test "x$exiv2" = "x0" +then + AC_MSG_NOTICE([NOTICE: exiv2 disabled]) +fi + + +if test "x$printable" = "x0" +then + AC_MSG_NOTICE([NOTICE: printable plugins disabled]) +else + AC_MSG_NOTICE([NOTICE: printable plugins enabled (will need 150 MB memory to compile)]) +fi + +if test "x$without_glib" = "xtrue" +then + AC_MSG_NOTICE([NOTICE: glib not used, no OLE2 (MS Office) support]) +fi + +if test "x$without_gtk" = "xtrue" +then + AC_MSG_NOTICE([NOTICE: gtk not found, no thumbnail support]) +fi + +if test "x$HAVE_VORBISFILE_TRUE" = "x#" +then + AC_MSG_NOTICE([NOTICE: vorbis support disabled]) +fi + +if test "x$HAVE_BZ2_TRUE" = "x#" +then + AC_MSG_NOTICE([NOTICE: bzip2 support disabled]) +fi + +if test "x$HAVE_ZLIB_TRUE" = "x#" +then + AC_MSG_ERROR([FATAL: zlib not found (headers not installed?)]) +fi diff --git a/doc/extract.1 b/doc/extract.1 @@ -1,4 +1,4 @@ -.TH EXTRACT 1 "April 28, 2005" "libextractor 0.4.2" +.TH EXTRACT 1 "April 28, 2005" "libextractor 0.5.11" .\" $Id .SH NAME extract @@ -32,7 +32,7 @@ extract \&... .br .SH DESCRIPTION -This manual page documents version 0.4.0 of the +This manual page documents version 0.5.11 of the .B extract command. .PP diff --git a/doc/libextractor.3 b/doc/libextractor.3 @@ -1,6 +1,6 @@ .TH LIBEXTRACTOR 3 "Jul 14, 2005" .SH NAME -libextractor \- meta\-information extraction library 0.5.2 +libextractor \- meta\-information extraction library 0.5.11 .SH SYNOPSIS \fB#include <extractor.h> diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -15,7 +15,16 @@ if HAVE_EXIV2 exiv2dir=exiv2 endif -SUBDIRS = . $(oodir) $(printdir) hash $(oledir) rpm pdf $(thumbdir) $(exiv2dir) wordleaker +if HAVE_XPDF + xpdfdir=pdf +else + pdfplugin=libextractor_pdf.la +endif + +# toggle for development +# SUBDIRS = . +SUBDIRS = . $(oodir) $(printdir) hash $(oledir) rpm $(xpdfdir) $(thumbdir) $(exiv2dir) wordleaker + if HAVE_VORBISFILE extraogg = libextractor_ogg.la @@ -30,7 +39,7 @@ extraqt = libextractor_qt.la oodir = oo endif -plugin_LTLIBRARIES = \ +plugin_LTLIBRARIES = $(pdfplugin) \ libextractor_asf.la \ libextractor_deb.la \ libextractor_dvi.la \ @@ -92,6 +101,13 @@ libextractor_wav_la_SOURCES = \ libextractor_wav_la_LDFLAGS = \ $(PLUGINFLAGS) $(retaincommand) +libextractor_pdf_la_SOURCES = \ + pdfextractor.c +libextractor_pdf_la_LDFLAGS = \ + $(PLUGINFLAGS) $(retaincommand) +libextractor_pdf_la_LIBADD = \ + libconvert.la + libextractor_mp3_la_SOURCES = \ mp3extractor.c libextractor_mp3_la_LDFLAGS = \ diff --git a/src/plugins/pdfextractor.c b/src/plugins/pdfextractor.c @@ -18,9 +18,25 @@ Boston, MA 02111-1307, USA. */ +/** + * TODO: + * - code clean up (factor out some parsing aspects?) + * - proper string decoding (escape sequences) + * - proper dictionary support + * - filters (compression!) + * - page count (and other document catalog information, + * such as language, viewer preferences, page layout, + * Metadatastreams (10.2.2), legal and permissions info) + * - pdf 1.5 support ((compressed) cross reference streams) + */ + #include "platform.h" #include "extractor.h" #include <zlib.h> +#ifndef _XOPEN_SOURCE +#define _XOPEN_SOURCE 1 +#endif +#include <time.h> #include "convert.h" static char * stndup(const char * str, @@ -32,23 +48,9 @@ static char * stndup(const char * str, return tmp; } -/** - * strnlen is GNU specific, let's redo it here to be - * POSIX compliant. - */ -static size_t stnlen(const char * str, - size_t maxlen) { - size_t ret; - ret = 0; - while ( (ret < maxlen) && - (str[ret] != '\0') ) - ret++; - return ret; -} - static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type, - const char * keyword, + char * keyword, struct EXTRACTOR_Keywords * next) { EXTRACTOR_KeywordList * result; @@ -56,26 +58,108 @@ addKeyword(EXTRACTOR_KeywordType type, return next; result = malloc(sizeof(EXTRACTOR_KeywordList)); result->next = next; - result->keyword = strdup(keyword); + result->keyword = keyword; result->keywordType = type; return result; } + + +static char * +dateDecode(const char * pdfString) { + unsigned char * ret; + + if (pdfString == NULL) + return NULL; + if (strlen(pdfString) < 4) + return NULL; + return stndup(&pdfString[3], strlen(pdfString) - 4); +} + +static unsigned char * +stringDecode(const char * pdfString, + size_t * size) { + size_t slen; + unsigned char * ret; + char hex[3]; + int i; + int val; + + slen = strlen(pdfString); + if (slen < 2) + return NULL; + switch (pdfString[0]) { + case '(': + if (pdfString[slen-1] != ')') + return NULL; + /* todo: recode escape sequences! */ + *size = slen - 2; + return stndup(&pdfString[1], slen-2); + case '<': + if (pdfString[slen-1] != '>') + return NULL; + hex[2] = '\0'; + ret = malloc(1 + ((slen - 1) / 2)); + for (i=0;i<slen-2;i+=2) { + hex[0] = pdfString[i+1]; + hex[1] = '0'; + if (i + 1 < slen) + hex[1] = pdfString[i+2]; + if ( (1 != sscanf(hex, "%x", &val)) && + (1 != sscanf(hex, "%X", &val)) ) { + free(ret); + return NULL; + } + ret[i/2] = val; + } + ret[(slen-1)/2] = '\0'; + *size = (slen-1) / 2; + return ret; + } + return NULL; +} + +static char * +charsetDecode(const unsigned char * in, + size_t size) { + if (in == NULL) + return NULL; + if ( (size < 2) || + (in[0] != 0xfe) || + (in[1] != 0xff) ) { + /* TODO: extend glibc with + character set that corresponds to + Adobe's extended ISOLATIN1 encoding! */ + return convertToUtf8(in, + size, + "CSISOLATIN1"); + } else { + return convertToUtf8(&in[2], + size - 2, + "UNICODEBIG"); + } + +} + static struct { char * name; EXTRACTOR_KeywordType type; } tagmap[] = { - { "Author" , EXTRACTOR_AUTHOR}, - { "Description" , EXTRACTOR_DESCRIPTION}, - { "Comment", EXTRACTOR_COMMENT}, - { "Copyright", EXTRACTOR_COPYRIGHT}, - { "Source", EXTRACTOR_SOURCE}, - { "Creation Time", EXTRACTOR_DATE}, - { "Title", EXTRACTOR_TITLE}, - { "Software", EXTRACTOR_SOFTWARE}, - { "Disclaimer", EXTRACTOR_DISCLAIMER}, - { "Warning", EXTRACTOR_WARNING}, - { "Signature", EXTRACTOR_RESOURCE_IDENTIFIER}, + { "/CreationDate", EXTRACTOR_CREATION_DATE}, + { "/Author" , EXTRACTOR_AUTHOR}, + { "/Description" , EXTRACTOR_DESCRIPTION}, + { "/Title" , EXTRACTOR_TITLE}, + { "/Comment", EXTRACTOR_COMMENT}, + { "/Copyright", EXTRACTOR_COPYRIGHT}, + { "/Subject", EXTRACTOR_SUBJECT}, + { "/PTEX.Fullbanner", EXTRACTOR_SOFTWARE}, + { "/Creator", EXTRACTOR_CREATOR}, + { "/ModDate", EXTRACTOR_MODIFICATION_DATE}, + { "/Producer", EXTRACTOR_PRODUCER}, + { "/Software", EXTRACTOR_SOFTWARE}, + { "/Keywords", EXTRACTOR_KEYWORDS}, + { "/Warning", EXTRACTOR_WARNING}, + { "/Signature", EXTRACTOR_RESOURCE_IDENTIFIER}, { NULL, EXTRACTOR_UNKNOWN}, }; @@ -97,7 +181,9 @@ libextractor_pdf_extract(const char * filename, size_t size, struct EXTRACTOR_Keywords * prev) { size_t pos; + size_t spos; size_t steps; + size_t mlen; unsigned int xstart; unsigned int xcount; unsigned int xinfo; @@ -107,6 +193,10 @@ libextractor_pdf_extract(const char * filename, unsigned long long info_offset; char buf[MAX_STEPS+1]; int i; + char * meta; + unsigned char * dmeta; + char pcnt[20]; + float version; while ( (size > 0) && (IS_NL(data[size-1])) ) size--; @@ -116,6 +206,20 @@ libextractor_pdf_extract(const char * filename, return prev; if (0 != memcmp(&data[size - strlen(PDF_EOF)], PDF_EOF, strlen(PDF_EOF))) return prev; + /* PDF format is pretty much sure by now */ + memcpy(buf, + data, + 8); + buf[8] = '\0'; + if (1 != sscanf(buf, "%%PDF-%f", &version)) { + return prev; + } + sprintf(pcnt, "PDF %.1f", version); + prev = addKeyword(EXTRACTOR_FORMAT, + strdup(pcnt), + prev); + + pos = size - strlen(PDF_EOF) - strlen(PDF_SXR); steps = 0; @@ -123,21 +227,20 @@ libextractor_pdf_extract(const char * filename, (pos > 0) && (0 != memcmp(&data[pos], PDF_SXR, strlen(PDF_SXR))) ) pos--; - printf("pos: %u\n", pos); - if (0 != memcmp(&data[pos], PDF_SXR, strlen(PDF_SXR))) + if (0 != memcmp(&data[pos], PDF_SXR, strlen(PDF_SXR))) { + /* cross reference streams not yet supported! */ return prev; + } memcpy(buf, &data[pos + strlen(PDF_SXR)], steps); buf[steps] = '\0'; if (1 != sscanf(buf, "%llu", &startxref)) return prev; - printf("startxref: %llu\n", startxref); if (startxref >= size - strlen(PDF_XREF)) return prev; if (0 != memcmp(&data[startxref], PDF_XREF, strlen(PDF_XREF))) return prev; haveValidXref = 0; xrefpos = startxref + strlen(PDF_XREF); - while (1) { pos = xrefpos; while ( (pos < size) && (IS_NL(data[pos])) ) @@ -146,10 +249,6 @@ libextractor_pdf_extract(const char * filename, buf[MIN(MAX_STEPS,size-pos)] = '\0'; if (2 != sscanf(buf, "%u %u", &xstart, &xcount)) break; - printf("xstart: %u - xcount: %u - pos %u\n", - xstart, - xcount, - pos); while ( (pos < size) && (! IS_NL(data[pos])) ) pos++; if ( (pos < size) && IS_NL(data[pos])) @@ -158,8 +257,6 @@ libextractor_pdf_extract(const char * filename, if ( (xrefpos >= size) || (xrefpos < pos) ) return prev; /* invalid xref size */ haveValidXref = 1; - printf("xref portion ends at %llu\n", - xrefpos); } if (! haveValidXref) return prev; @@ -170,6 +267,7 @@ libextractor_pdf_extract(const char * filename, strlen(PDF_TRAILER))) return prev; pos += strlen(PDF_TRAILER); + SKIP("<< \n\r", pos, data, size); while ( (pos < size) && (pos + strlen(PDF_INFO) < size) && @@ -186,8 +284,7 @@ libextractor_pdf_extract(const char * filename, } while ( (pos < size) && (IS_NL(data[pos]) || isspace(data[pos]) ) ) - pos++; - } + pos++; } if ( ! ( (pos < size) && (pos + strlen(PDF_INFO) < size) && (0 == memcmp(&data[pos], @@ -207,7 +304,6 @@ libextractor_pdf_extract(const char * filename, } if (1 != sscanf(buf, "%u", &xinfo)) return prev; - printf("xinfo: %u\n", xinfo); haveValidXref = 0; /* now go find xinfo in xref table */ @@ -220,10 +316,6 @@ libextractor_pdf_extract(const char * filename, buf[MIN(MAX_STEPS,size-pos)] = '\0'; if (2 != sscanf(buf, "%u %u", &xstart, &xcount)) break; - printf("xstart: %u - xcount: %u - pos %u\n", - xstart, - xcount, - pos); while ( (pos < size) && (! IS_NL(data[pos])) ) pos++; if ( (pos < size) && IS_NL(data[pos])) @@ -234,9 +326,7 @@ libextractor_pdf_extract(const char * filename, pos += 20 * xinfo - xstart; memcpy(buf, &data[pos], 20); buf[20] = '\0'; - sscanf(buf, "%10llu %*5u %*c", &info_offset); - - + sscanf(buf, "%10llu %*5u %*c", &info_offset); break; } xrefpos = 20 * xcount + pos; @@ -245,11 +335,72 @@ libextractor_pdf_extract(const char * filename, } if (! haveValidXref) return prev; - - /* read size of xref */ - /* parse xref */ - /* find info index */ - /* parse info */ + pos = info_offset; + + while ( (pos < size - 4) && + (! ( (data[pos] == '<') && + (data[pos+1] == '<') ) ) ) + pos++; + pos++; + if (pos >= size - 4) + return prev; + if ( (data[pos] == ' ') || + (data[pos] == 10) || + (data[pos] == 13) ) + pos++; + + while ( (pos < size - 2) && + ( ! ( (data[pos] == '>') && + (data[pos+1] == '>') ) ) ) { + i = 0; + while (tagmap[i].name != NULL) { + if ( (pos + strlen(tagmap[i].name) > pos) && + (pos + strlen(tagmap[i].name) + 1 < size) && + (0 == memcmp(&data[pos], + tagmap[i].name, + strlen(tagmap[i].name))) ) { + pos += strlen(tagmap[i].name); + if (isspace(data[pos])) + pos++; + spos = pos; + while ( (pos < size + 2) && + (! IS_NL(data[pos])) && + (data[pos] != '/') && + (! ( (data[pos] == '>') && + (data[pos+1] == '>') ) ) ) + pos++; + meta = stndup(&data[spos], + pos - spos); + if (i == 0) { + dmeta = dateDecode(meta); + mlen = strlen(dmeta); + } else { + dmeta = stringDecode(meta, + &mlen); + } + if (meta != NULL) + free(meta); + meta = charsetDecode(dmeta, mlen); + if (dmeta != NULL) + free(dmeta); + if (meta != NULL) { + prev = addKeyword(tagmap[i].type, + meta, + prev); + } + break; + } + i++; + } + if (tagmap[i].name == NULL) { + while ( (pos < size) && + (! IS_NL(data[pos])) ) + pos++; + } + while ( (pos < size) && + (IS_NL(data[pos])) ) + pos++; + } return prev; } diff --git a/src/plugins/pngextractor.c b/src/plugins/pngextractor.c @@ -282,7 +282,7 @@ static struct EXTRACTOR_Keywords * processzTXt(const char * data, -struct EXTRACTOR_Keywords * libextractor_png_extract(char * filename, +struct EXTRACTOR_Keywords * libextractor_png_extract(const char * filename, const char * data, size_t size, struct EXTRACTOR_Keywords * prev) {