diff options
Diffstat (limited to 'src/plugins/ps_extractor.c')
-rw-r--r-- | src/plugins/ps_extractor.c | 213 |
1 files changed, 213 insertions, 0 deletions
diff --git a/src/plugins/ps_extractor.c b/src/plugins/ps_extractor.c new file mode 100644 index 0000000..bd99c94 --- /dev/null +++ b/src/plugins/ps_extractor.c | |||
@@ -0,0 +1,213 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | (C) 2002, 2003, 2009, 2012 Vidyut Samanta and Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 3, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. | ||
19 | */ | ||
20 | /** | ||
21 | * @file plugins/ps_extractor.c | ||
22 | * @brief plugin to support PostScript files | ||
23 | * @author Christian Grothoff | ||
24 | */ | ||
25 | #include "platform.h" | ||
26 | #include "extractor.h" | ||
27 | |||
28 | |||
/**
 * Upper bound (in bytes) on how much input is requested when looking
 * for the end of a single line.  Lines in the body of a PostScript
 * file may be longer, but the header lines carrying the meta data are
 * expected to fit within this limit.
 */
#define MAX_LINE 1024

/**
 * Magic string that the first line of a PostScript file must start
 * with for this plugin to process the file.
 */
#define PS_HEADER "%!PS-Adobe"
41 | |||
42 | |||
43 | /** | ||
44 | * Pair with prefix in the PS header and corresponding LE type. | ||
45 | */ | ||
46 | struct Matches | ||
47 | { | ||
48 | /** | ||
49 | * PS header prefix. | ||
50 | */ | ||
51 | const char *prefix; | ||
52 | |||
53 | /** | ||
54 | * Corresponding LE type. | ||
55 | */ | ||
56 | enum EXTRACTOR_MetaType type; | ||
57 | }; | ||
58 | |||
59 | |||
60 | /** | ||
61 | * Map of PS prefixes to LE types. | ||
62 | */ | ||
63 | static struct Matches tests[] = { | ||
64 | { "%%Title: ", EXTRACTOR_METATYPE_TITLE }, | ||
65 | { "% Subject: ", EXTRACTOR_METATYPE_SUBJECT }, | ||
66 | { "%%Author: ", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
67 | { "% From: ", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
68 | { "%%Version: ", EXTRACTOR_METATYPE_REVISION_NUMBER }, | ||
69 | { "%%Creator: ", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, | ||
70 | { "%%CreationDate: ", EXTRACTOR_METATYPE_CREATION_DATE }, | ||
71 | { "% Date: ", EXTRACTOR_METATYPE_UNKNOWN_DATE }, | ||
72 | { "%%Pages: ", EXTRACTOR_METATYPE_PAGE_COUNT }, | ||
73 | { "%%Orientation: ", EXTRACTOR_METATYPE_PAGE_ORIENTATION }, | ||
74 | { "%%DocumentPaperSizes: ", EXTRACTOR_METATYPE_PAPER_SIZE }, | ||
75 | { "%%PageOrder: ", EXTRACTOR_METATYPE_PAGE_ORDER }, | ||
76 | { "%%LanguageLevel: ", EXTRACTOR_METATYPE_FORMAT_VERSION }, | ||
77 | { "%%Magnification: ", EXTRACTOR_METATYPE_MAGNIFICATION }, | ||
78 | |||
79 | /* Also widely used but not supported since they | ||
80 | probably make no sense: | ||
81 | "%%BoundingBox: ", | ||
82 | "%%DocumentNeededResources: ", | ||
83 | "%%DocumentSuppliedResources: ", | ||
84 | "%%DocumentProcSets: ", | ||
85 | "%%DocumentData: ", */ | ||
86 | |||
87 | { NULL, 0 } | ||
88 | }; | ||
89 | |||
90 | |||
91 | /** | ||
92 | * Read a single ('\n'-terminated) line of input. | ||
93 | * | ||
94 | * @param ec context for IO | ||
95 | * @return NULL on end-of-file (or if next line exceeds limit) | ||
96 | */ | ||
97 | static char * | ||
98 | readline (struct EXTRACTOR_ExtractContext *ec) | ||
99 | { | ||
100 | int64_t pos; | ||
101 | ssize_t ret; | ||
102 | char *res; | ||
103 | void *data; | ||
104 | const char *cdata; | ||
105 | const char *eol; | ||
106 | |||
107 | pos = ec->seek (ec->cls, 0, SEEK_CUR); | ||
108 | if (0 >= (ret = ec->read (ec->cls, &data, MAX_LINE))) | ||
109 | return NULL; | ||
110 | cdata = data; | ||
111 | if (NULL == (eol = memchr (cdata, '\n', ret))) | ||
112 | return NULL; /* no end-of-line found */ | ||
113 | if (NULL == (res = malloc (eol - cdata + 1))) | ||
114 | return NULL; | ||
115 | memcpy (res, cdata, eol - cdata); | ||
116 | res[eol - cdata] = '\0'; | ||
117 | ec->seek (ec->cls, pos + eol - cdata + 1, SEEK_SET); | ||
118 | return res; | ||
119 | } | ||
120 | |||
121 | |||
122 | /** | ||
123 | * Main entry method for the 'application/postscript' extraction plugin. | ||
124 | * | ||
125 | * @param ec extraction context provided to the plugin | ||
126 | */ | ||
127 | void | ||
128 | EXTRACTOR_ps_extract_method (struct EXTRACTOR_ExtractContext *ec) | ||
129 | { | ||
130 | unsigned int i; | ||
131 | char *line; | ||
132 | char *next; | ||
133 | char *acc; | ||
134 | const char *match; | ||
135 | |||
136 | if (NULL == (line = readline (ec))) | ||
137 | return; | ||
138 | if ( (strlen (line) < strlen (PS_HEADER)) || | ||
139 | (0 != memcmp (PS_HEADER, | ||
140 | line, | ||
141 | strlen (PS_HEADER))) ) | ||
142 | { | ||
143 | free (line); | ||
144 | return; | ||
145 | } | ||
146 | free (line); | ||
147 | if (0 != ec->proc (ec->cls, | ||
148 | "ps", | ||
149 | EXTRACTOR_METATYPE_MIMETYPE, | ||
150 | EXTRACTOR_METAFORMAT_UTF8, | ||
151 | "text/plain", | ||
152 | "application/postscript", | ||
153 | strlen ("application/postscript") + 1)) | ||
154 | return; | ||
155 | |||
156 | line = NULL; | ||
157 | next = readline (ec); | ||
158 | while ( (NULL != next) && | ||
159 | ('%' == next[0]) ) | ||
160 | { | ||
161 | line = next; | ||
162 | next = readline (ec); | ||
163 | for (i = 0; NULL != tests[i].prefix; i++) | ||
164 | { | ||
165 | match = tests[i].prefix; | ||
166 | if ( (strlen (line) < strlen (match)) || | ||
167 | (0 != strncmp (line, match, strlen (match))) ) | ||
168 | continue; | ||
169 | /* %%+ continues previous meta-data type... */ | ||
170 | while ( (NULL != next) && | ||
171 | (0 == strncmp (next, "%%+", strlen ("%%+"))) ) | ||
172 | { | ||
173 | acc = malloc (strlen (line) + strlen (next) - 1); | ||
174 | strcpy (acc, line); | ||
175 | strcat (acc, " "); | ||
176 | strcat (acc, next + 3); | ||
177 | free (line); | ||
178 | line = acc; | ||
179 | next = readline (ec); | ||
180 | } | ||
181 | if ( (line[strlen (line) - 1] == ')') && | ||
182 | (line[strlen (match)] == '(') ) | ||
183 | { | ||
184 | acc = &line[strlen (match) + 1]; | ||
185 | acc[strlen (acc) - 1] = '\0'; /* remove ")" */ | ||
186 | } | ||
187 | else | ||
188 | { | ||
189 | acc = &line[strlen (match)]; | ||
190 | } | ||
191 | while (isspace ((unsigned int) acc[0])) | ||
192 | acc++; | ||
193 | if ( (strlen (acc) > 0) && | ||
194 | (0 != ec->proc (ec->cls, | ||
195 | "ps", | ||
196 | tests[i].type, | ||
197 | EXTRACTOR_METAFORMAT_UTF8, | ||
198 | "text/plain", | ||
199 | acc, | ||
200 | strlen (acc) + 1)) ) | ||
201 | { | ||
202 | free (line); | ||
203 | if (NULL != next) | ||
204 | free (next); | ||
205 | return; | ||
206 | } | ||
207 | break; | ||
208 | } | ||
209 | free (line); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | /* end of ps_extractor.c */ | ||