libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 9c7cd611928aa5838bf8d5073e81d6193e73d522
parent f3eef358ddd8beb04752e2a3e84a576f61f94897
Author: Christian Grothoff <christian@grothoff.org>
Date:   Fri, 18 Dec 2009 21:01:30 +0000

ps

Diffstat:
M src/include/extractor.h | 3 +--
M src/main/extractor_metatypes.c | 4 ++--
M src/plugins/Makefile.am | 13 ++++++-------
A src/plugins/ps_extractor.c | 192 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/plugins/psextractor.c | 228 -------------------------------------------------------------------------------
5 files changed, 201 insertions(+), 239 deletions(-)

diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -275,7 +275,7 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_SOURCE_DEVICE = 143, EXTRACTOR_METATYPE_DISCLAIMER = 144, EXTRACTOR_METATYPE_WARNING = 145, - + EXTRACTOR_METATYPE_PAGE_ORDER = 146, /* fixme: used up to here! */ @@ -295,7 +295,6 @@ enum EXTRACTOR_MetaType /* FIXME: transcribe & renumber those below */ EXTRACTOR_METATYPE_USED_FONTS = 37, - EXTRACTOR_METATYPE_PAGE_ORDER = 38, /* numeric metrics */ diff --git a/src/main/extractor_metatypes.c b/src/main/extractor_metatypes.c @@ -358,8 +358,8 @@ static const struct MetaTypeDescription meta_type_descriptions[] = { /* 145 */ { gettext_noop ("warning"), gettext_noop ("warning about the nature of the content") }, - { gettext_noop (""), - gettext_noop ("") }, + { gettext_noop ("page order"), + gettext_noop ("order of the pages") }, { gettext_noop (""), gettext_noop ("") }, { gettext_noop (""), diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -88,6 +88,7 @@ plugin_LTLIBRARIES = \ $(ole2) \ $(pdf) \ libextractor_png.la \ + libextractor_ps.la \ libextractor_real.la \ $(rpm) \ libextractor_tar.la \ @@ -243,6 +244,11 @@ libextractor_png_la_LIBADD = \ $(top_builddir)/src/common/libextractor_common.la \ -lz +libextractor_ps_la_SOURCES = \ + ps_extractor.c +libextractor_ps_la_LDFLAGS = \ + $(PLUGINFLAGS) + libextractor_real_la_SOURCES = \ real_extractor.c libextractor_real_la_LDFLAGS = \ @@ -297,7 +303,6 @@ OLD_LIBS = \ $(extrampeg) \ libextractor_nsf.la \ libextractor_nsfe.la \ - libextractor_ps.la \ $(extraqt) \ libextractor_riff.la \ libextractor_s3m.la \ @@ -317,12 +322,6 @@ libextractor_qt_la_LIBADD = \ -lz endif -libextractor_ps_la_SOURCES = \ - psextractor.c -libextractor_ps_la_LDFLAGS = \ - $(PLUGINFLAGS) -libextractor_ps_la_LIBADD = \ - $(top_builddir)/src/main/libextractor.la libextractor_id3v2_la_SOURCES = \ id3v2extractor.c diff --git a/src/plugins/ps_extractor.c b/src/plugins/ps_extractor.c @@ -0,0 +1,192 @@ +/* + This file is 
part of libextractor. + (C) 2002, 2003, 2009 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + +#include "platform.h" +#include "extractor.h" + + +static char * +readline (const char *data, size_t size, size_t pos) +{ + size_t end; + char *res; + + while ((pos < size) && + ((data[pos] == (char) 0x0d) || (data[pos] == (char) 0x0a))) + pos++; + + if (pos >= size) + return NULL; /* end of file */ + end = pos; + while ((end < size) && + (data[end] != (char) 0x0d) && (data[end] != (char) 0x0a)) + end++; + res = malloc (end - pos + 1); + memcpy (res, &data[pos], end - pos); + res[end - pos] = '\0'; + + return res; +} + + +static int +testmeta (char *line, + const char *match, + enum EXTRACTOR_MetaType type, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + char *key; + + if ( (strncmp (line, match, strlen (match)) == 0) && + (strlen (line) > strlen (match)) ) + { + if ((line[strlen (line) - 1] == ')') && (line[strlen (match)] == '(')) + { + key = &line[strlen (match) + 1]; + key[strlen (key) - 1] = '\0'; /* remove ")" */ + } + else + { + key = &line[strlen (match)]; + } + if (0 != proc (proc_cls, + "ps", + type, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + key, + strlen (key)+1)) + return 1; + } + return 0; +} + +typedef struct +{ + const char *prefix; + enum 
EXTRACTOR_MetaType type; +} Matches; + +static Matches tests[] = { + {"%%Title: ", EXTRACTOR_METATYPE_TITLE}, + {"%%Author: ", EXTRACTOR_METATYPE_AUTHOR_NAME}, + {"%%Version: ", EXTRACTOR_METATYPE_REVISION_NUMBER}, + {"%%Creator: ", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE}, + {"%%CreationDate: ", EXTRACTOR_METATYPE_CREATION_DATE}, + {"%%Pages: ", EXTRACTOR_METATYPE_PAGE_COUNT}, + {"%%Orientation: ", EXTRACTOR_METATYPE_PAGE_ORIENTATION}, + {"%%DocumentPaperSizes: ", EXTRACTOR_METATYPE_PAPER_SIZE}, + {"%%PageOrder: ", EXTRACTOR_METATYPE_PAGE_ORDER}, + {"%%LanguageLevel: ", EXTRACTOR_METATYPE_FORMAT_VERSION}, + {"%%Magnification: ", EXTRACTOR_METATYPE_MAGNIFICATION}, + + /* Also widely used but not supported since they + probably make no sense: + "%%BoundingBox: ", + "%%DocumentNeededResources: ", + "%%DocumentSuppliedResources: ", + "%%DocumentProcSets: ", + "%%DocumentData: ", */ + + {NULL, 0} +}; + +#define PS_HEADER "%!PS-Adobe" + +/* mimetype = application/postscript */ +int +EXTRACTOR_ps_extract (const char *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options) +{ + size_t pos; + char *line; + int i; + int lastLine; + int ret; + + pos = strlen (PS_HEADER); + if ( (size < pos) || + (0 != strncmp (PS_HEADER, + data, + pos)) ) + return 0; + ret = 0; + + if (0 != proc (proc_cls, + "ps", + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "application/postscript", + strlen ("application/postscript")+1)) + return 1; + /* skip rest of first line */ + while ((pos < size) && (data[pos] != '\n')) + pos++; + + lastLine = -1; + line = NULL; + /* while Windows-PostScript does not seem to (always?) put + "%%EndComments", this should allow us to not read through most of + the file for all the sane applications... For Windows-generated + PS files, we will bail out at the end of the file. 
*/ + while (0 != strncmp ("%%EndComments", line, strlen ("%%EndComments"))) + { + free (line); + line = readline (data, size, pos); + if (line == NULL) + break; + i = 0; + while (tests[i].prefix != NULL) + { + ret = testmeta (line, tests[i].prefix, tests[i].type, proc, proc_cls); + if (ret != 0) + break; + i++; + } + if (ret != 0) + break; + + /* %%+ continues previous meta-data type... */ + if ( (lastLine != -1) && (0 == strncmp (line, "%%+ ", strlen ("%%+ ")))) + { + ret = testmeta (line, "%%+ ", tests[lastLine].type, proc, proc_cls); + } + else + { + /* update "previous" type */ + if (tests[i].prefix == NULL) + lastLine = -1; + else + lastLine = i; + } + if (pos + strlen (line) + 1 <= pos) + break; /* overflow */ + pos += strlen (line) + 1; /* skip newline, too; guarantee progress! */ + } + free (line); + return ret; +} + +/* end of ps_extractor.c */ diff --git a/src/plugins/psextractor.c b/src/plugins/psextractor.c @@ -1,228 +0,0 @@ -/* - This file is part of libextractor. - (C) 2002, 2003 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. 
- **/ - -#include "platform.h" -#include "extractor.h" - -static struct EXTRACTOR_Keywords * -addKeyword (EXTRACTOR_KeywordType type, - char *keyword, struct EXTRACTOR_Keywords *next) -{ - EXTRACTOR_KeywordList *result; - - if (keyword == NULL) - return next; - result = malloc (sizeof (EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = strdup (keyword); - result->keywordType = type; - return result; -} - -static char * -readline (char *data, size_t size, size_t pos) -{ - size_t end; - char *res; - - while ((pos < size) && - ((data[pos] == (char) 0x0d) || (data[pos] == (char) 0x0a))) - pos++; - - if (pos >= size) - return NULL; /* end of file */ - end = pos; - while ((end < size) && - (data[end] != (char) 0x0d) && (data[end] != (char) 0x0a)) - end++; - res = malloc (end - pos + 1); - memcpy (res, &data[pos], end - pos); - res[end - pos] = '\0'; - - return res; -} - -static struct EXTRACTOR_Keywords * -testmeta (char *line, - const char *match, - EXTRACTOR_KeywordType type, struct EXTRACTOR_Keywords *prev) -{ - if ((strncmp (line, match, strlen (match)) == 0) && - (strlen (line) > strlen (match))) - { - char *key; - - if ((line[strlen (line) - 1] == ')') && (line[strlen (match)] == '(')) - { - key = &line[strlen (match) + 1]; - key[strlen (key) - 1] = '\0'; /* remove ")" */ - } - else - { - key = &line[strlen (match)]; - } - prev = addKeyword (type, key, prev); - } - return prev; -} - -typedef struct -{ - char *prefix; - EXTRACTOR_KeywordType type; -} Matches; - -static Matches tests[] = { - {"%%Title: ", EXTRACTOR_TITLE}, - {"%%Version: ", EXTRACTOR_VERSIONNUMBER}, - {"%%Creator: ", EXTRACTOR_CREATOR}, - {"%%CreationDate: ", EXTRACTOR_CREATION_DATE}, - {"%%Pages: ", EXTRACTOR_PAGE_COUNT}, - {"%%Orientation: ", EXTRACTOR_UNKNOWN}, - {"%%DocumentPaperSizes: ", EXTRACTOR_UNKNOWN}, - {"%%DocumentFonts: ", EXTRACTOR_UNKNOWN}, - {"%%PageOrder: ", EXTRACTOR_UNKNOWN}, - {"%%For: ", EXTRACTOR_UNKNOWN}, - {"%%Magnification: ", EXTRACTOR_UNKNOWN}, - - /* Also 
widely used but not supported since they - probably make no sense: - "%%BoundingBox: ", - "%%DocumentNeededResources: ", - "%%DocumentSuppliedResources: ", - "%%DocumentProcSets: ", - "%%DocumentData: ", */ - - {NULL, 0}, -}; - -/* which mime-types should not be subjected to - the PostScript extractor (no use trying) */ -static char *blacklist[] = { - "image/jpeg", - "image/gif", - "image/png", - "image/x-png", - "audio/real", - "audio/mpeg", - "application/x-gzip", - "application/x-dpkg", - "application/bz2", - "application/x-rpm", - "application/x-rar", - "application/x-zip", - "application/x-arj", - "application/x-compress", - "application/x-tar", - "application/x-lha", - "application/x-gtar", - "application/x-dpkg", - "application/ogg", - "video/real", - "video/asf", - "video/quicktime", - NULL, -}; - -/* mimetype = application/postscript */ -struct EXTRACTOR_Keywords * -libextractor_ps_extract (const char *filename, - char *data, - size_t size, struct EXTRACTOR_Keywords *prev) -{ - size_t pos; - char *psheader = "%!PS-Adobe"; - char *line; - int i; - int lastLine; - const char *mime; - - /* if the mime-type of the file is blacklisted, don't - run the printable extactor! */ - mime = EXTRACTOR_extractLast (EXTRACTOR_MIMETYPE, prev); - if (mime != NULL) - { - int j; - j = 0; - while (blacklist[j] != NULL) - { - if (0 == strcmp (blacklist[j], mime)) - return prev; - j++; - } - } - - - pos = 0; - while ((pos < size) && - (pos < strlen (psheader)) && (data[pos] == psheader[pos])) - pos++; - if (pos != strlen (psheader)) - { - return prev; /* no ps */ - } - - prev = addKeyword (EXTRACTOR_MIMETYPE, "application/postscript", prev); - - /* skip rest of first line */ - while ((pos < size) && (data[pos] != '\n')) - pos++; - - lastLine = -1; - line = strdup (psheader); - - /* while Windows-PostScript does not seem to (always?) put - "%%EndComments", this should allow us to not read through most of - the file for all the sane applications... 
For Windows-generated - PS files, we will bail out at the end of the file. */ - while (0 != strncmp ("%%EndComments", line, strlen ("%%EndComments"))) - { - free (line); - line = readline (data, size, pos); - if (line == NULL) - break; - i = 0; - while (tests[i].prefix != NULL) - { - prev = testmeta (line, tests[i].prefix, tests[i].type, prev); - i++; - } - - /* %%+ continues previous meta-data type... */ - if ((lastLine != -1) && (0 == strncmp (line, "%%+ ", strlen ("%%+ ")))) - { - prev = testmeta (line, "%%+ ", tests[lastLine].type, prev); - } - else - { - /* update "previous" type */ - if (tests[i].prefix == NULL) - lastLine = -1; - else - lastLine = i; - } - pos += strlen (line) + 1; /* skip newline, too; guarantee progress! */ - } - free (line); - - return prev; -} - -/* end of psextractor.c */