ps_extractor.c (5929B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2009, 2012 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 */ 20 /** 21 * @file plugins/ps_extractor.c 22 * @brief plugin to support PostScript files 23 * @author Christian Grothoff 24 */ 25 #include "platform.h" 26 #include "extractor.h" 27 28 29 /** 30 * Maximum length of a single line in the PostScript file we're 31 * willing to look at. While the body of the file can have longer 32 * lines, this should be a sane limit for the lines in the header with 33 * the meta data. 34 */ 35 #define MAX_LINE (1024) 36 37 /** 38 * Header of a PostScript file. 39 */ 40 #define PS_HEADER "%!PS-Adobe" 41 42 43 /** 44 * Pair with prefix in the PS header and corresponding LE type. 45 */ 46 struct Matches 47 { 48 /** 49 * PS header prefix. 50 */ 51 const char *prefix; 52 53 /** 54 * Corresponding LE type. 55 */ 56 enum EXTRACTOR_MetaType type; 57 }; 58 59 60 /** 61 * Map of PS prefixes to LE types. 62 */ 63 static struct Matches tests[] = { 64 { "%%Title: ", EXTRACTOR_METATYPE_TITLE }, 65 { "% Subject: ", EXTRACTOR_METATYPE_SUBJECT }, 66 { "%%Author: ", EXTRACTOR_METATYPE_AUTHOR_NAME }, 67 { "% From: ", EXTRACTOR_METATYPE_AUTHOR_NAME }, 68 { "%%Version: ", EXTRACTOR_METATYPE_REVISION_NUMBER }, 69 { "%%Creator: ", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 70 { "%%CreationDate: ", EXTRACTOR_METATYPE_CREATION_DATE }, 71 { "% Date: ", EXTRACTOR_METATYPE_UNKNOWN_DATE }, 72 { "%%Pages: ", EXTRACTOR_METATYPE_PAGE_COUNT }, 73 { "%%Orientation: ", EXTRACTOR_METATYPE_PAGE_ORIENTATION }, 74 { "%%DocumentPaperSizes: ", EXTRACTOR_METATYPE_PAPER_SIZE }, 75 { "%%PageOrder: ", EXTRACTOR_METATYPE_PAGE_ORDER }, 76 { "%%LanguageLevel: ", EXTRACTOR_METATYPE_FORMAT_VERSION }, 77 { "%%Magnification: ", EXTRACTOR_METATYPE_MAGNIFICATION }, 78 79 /* Also widely used but not supported since they 80 probably make no sense: 81 "%%BoundingBox: ", 82 "%%DocumentNeededResources: ", 83 "%%DocumentSuppliedResources: ", 84 "%%DocumentProcSets: ", 85 "%%DocumentData: ", */ 86 87 { NULL, 0 } 88 }; 89 90 91 /** 92 * Read a single ('\n'-terminated) line of input. 93 * 94 * @param ec context for IO 95 * @return NULL on end-of-file (or if next line exceeds limit) 96 */ 97 static char * 98 readline (struct EXTRACTOR_ExtractContext *ec) 99 { 100 int64_t pos; 101 ssize_t ret; 102 char *res; 103 void *data; 104 const char *cdata; 105 const char *eol; 106 107 pos = ec->seek (ec->cls, 0, SEEK_CUR); 108 if (0 >= (ret = ec->read (ec->cls, &data, MAX_LINE))) 109 return NULL; 110 cdata = data; 111 if (NULL == (eol = memchr (cdata, '\n', ret))) 112 return NULL; /* no end-of-line found */ 113 if (NULL == (res = malloc (eol - cdata + 1))) 114 return NULL; 115 memcpy (res, cdata, eol - cdata); 116 res[eol - cdata] = '\0'; 117 ec->seek (ec->cls, pos + eol - cdata + 1, SEEK_SET); 118 return res; 119 } 120 121 122 /** 123 * Main entry method for the 'application/postscript' extraction plugin. 124 * 125 * @param ec extraction context provided to the plugin 126 */ 127 void 128 EXTRACTOR_ps_extract_method (struct EXTRACTOR_ExtractContext *ec) 129 { 130 unsigned int i; 131 char *line; 132 char *next; 133 char *acc; 134 const char *match; 135 136 if (NULL == (line = readline (ec))) 137 return; 138 if ( (strlen (line) < strlen (PS_HEADER)) || 139 (0 != memcmp (PS_HEADER, 140 line, 141 strlen (PS_HEADER))) ) 142 { 143 free (line); 144 return; 145 } 146 free (line); 147 if (0 != ec->proc (ec->cls, 148 "ps", 149 EXTRACTOR_METATYPE_MIMETYPE, 150 EXTRACTOR_METAFORMAT_UTF8, 151 "text/plain", 152 "application/postscript", 153 strlen ("application/postscript") + 1)) 154 return; 155 156 line = NULL; 157 next = readline (ec); 158 while ( (NULL != next) && 159 ('%' == next[0]) ) 160 { 161 line = next; 162 next = readline (ec); 163 for (i = 0; NULL != tests[i].prefix; i++) 164 { 165 match = tests[i].prefix; 166 if ( (strlen (line) < strlen (match)) || 167 (0 != strncmp (line, match, strlen (match))) ) 168 continue; 169 /* %%+ continues previous meta-data type... */ 170 while ( (NULL != next) && 171 (0 == strncmp (next, "%%+", strlen ("%%+"))) ) 172 { 173 if (NULL == (acc = malloc (strlen (line) + strlen (next) - 1))) 174 break; 175 strcpy (acc, line); 176 strcat (acc, " "); 177 strcat (acc, next + 3); 178 free (line); 179 line = acc; 180 free (next); 181 next = readline (ec); 182 } 183 if ( (line[strlen (line) - 1] == ')') && 184 (line[strlen (match)] == '(') ) 185 { 186 acc = &line[strlen (match) + 1]; 187 acc[strlen (acc) - 1] = '\0'; /* remove ")" */ 188 } 189 else 190 { 191 acc = &line[strlen (match)]; 192 } 193 while (isspace ((unsigned char) acc[0])) 194 acc++; 195 if ( (strlen (acc) > 0) && 196 (0 != ec->proc (ec->cls, 197 "ps", 198 tests[i].type, 199 EXTRACTOR_METAFORMAT_UTF8, 200 "text/plain", 201 acc, 202 strlen (acc) + 1)) ) 203 { 204 free (line); 205 if (NULL != next) 206 free (next); 207 return; 208 } 209 break; 210 } 211 free (line); 212 } 213 if (NULL != next) 214 free (next); 215 } 216 217 218 /* end of ps_extractor.c */