aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/pdf_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/pdf_extractor.c')
-rw-r--r--src/plugins/pdf_extractor.c229
1 files changed, 229 insertions, 0 deletions
diff --git a/src/plugins/pdf_extractor.c b/src/plugins/pdf_extractor.c
new file mode 100644
index 0000000..b84981f
--- /dev/null
+++ b/src/plugins/pdf_extractor.c
@@ -0,0 +1,229 @@
1/*
2 This file is part of libextractor.
3 Copyright (C) 2016 Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19 */
20/**
21 * @file plugins/pdf_extractor.c
22 * @brief plugin to support PDF files
23 * @author Christian Grothoff
24 *
25 * PDF libraries today are a nightmare (TM). So instead of doing the
26 * fast thing and calling some library functions to parse the PDF,
27 * we execute 'pdfinfo' and parse the output. Because that's 21st
28 * century plumbing: nobody writes reasonable code anymore.
29 */
30#include "platform.h"
31#include <extractor.h>
32#include <sys/types.h>
33#include <sys/wait.h>
34#include <signal.h>
35#include <unistd.h>
36
37/**
38 * Entry in the mapping from control data to LE types.
39 */
40struct Matches
41{
42 /**
43 * Key in the Pdfian control file.
44 */
45 const char *text;
46
47 /**
48 * Corresponding type in LE.
49 */
50 enum EXTRACTOR_MetaType type;
51};
52
53
54/**
55 * Map from pdf-control entries to LE types.
56 *
57 * See output of 'pdfinfo'.
58 */
59static struct Matches tmap[] = {
60 {"Title", EXTRACTOR_METATYPE_TITLE},
61 {"Subject", EXTRACTOR_METATYPE_SUBJECT},
62 {"Keywords", EXTRACTOR_METATYPE_KEYWORDS},
63 {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME},
64 {"Creator", EXTRACTOR_METATYPE_CREATOR},
65 {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE},
66 {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE},
67 {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE},
68 {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION},
69 {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT},
70 {NULL, 0}
71};
72
73
74/**
75 * Process the "stdout" file from pdfinfo.
76 *
77 * @param fout stdout of pdfinfo
78 * @param proc function to call with meta data
79 * @param proc_cls closure for @e proc
80 */
81static void
82process_stdout (FILE *fout,
83 EXTRACTOR_MetaDataProcessor proc,
84 void *proc_cls)
85{
86 unsigned int i;
87 char line[1025];
88 const char *psuffix;
89 const char *colon;
90
91 while (! feof (fout))
92 {
93 if (NULL == fgets (line, sizeof (line) - 1, fout))
94 break;
95 if (0 == strlen (line))
96 continue;
97 if ('\n' == line[strlen(line)-1])
98 line[strlen(line)-1] = '\0';
99 colon = strchr (line, (int) ':');
100 if (NULL == colon)
101 break;
102 psuffix = colon + 1;
103 while (isblank ((int) psuffix[0]))
104 psuffix++;
105 if (0 == strlen (psuffix))
106 continue;
107 for (i = 0; NULL != tmap[i].text; i++)
108 {
109 if (0 != strncasecmp (line,
110 tmap[i].text,
111 colon - line))
112 continue;
113 if (0 != proc (proc_cls,
114 "pdf",
115 tmap[i].type,
116 EXTRACTOR_METAFORMAT_UTF8,
117 "text/plain",
118 psuffix,
119 strlen(psuffix) + 1))
120 return;
121 break;
122 }
123 }
124}
125
126
127/**
128 * Main entry method for the PDF extraction plugin.
129 *
130 * @param ec extraction context provided to the plugin
131 */
132void
133EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec)
134{
135 uint64_t fsize;
136 void *data;
137 pid_t pid;
138 int in[2];
139 int out[2];
140 FILE *fout;
141 uint64_t pos;
142
143 fsize = ec->get_size (ec->cls);
144 if (fsize < 128)
145 return;
146 if (4 !=
147 ec->read (ec->cls, &data, 4))
148 return;
149 if (0 != strncmp ("%PDF", data, 4))
150 return;
151 if (0 !=
152 ec->seek (ec->cls, 0, SEEK_SET))
153 return;
154 if (0 != pipe (in))
155 return;
156 if (0 != pipe (out))
157 {
158 close (in[0]);
159 close (in[1]);
160 return;
161 }
162 pid = fork ();
163 if (-1 == pid)
164 {
165 close (in[0]);
166 close (in[1]);
167 close (out[0]);
168 close (out[1]);
169 return;
170 }
171 if (0 == pid)
172 {
173 char *const args[] = {
174 "pdfinfo",
175 "-",
176 NULL
177 };
178 /* am child, exec 'pdfinfo' */
179 close (0);
180 close (1);
181 dup2 (in[0], 0);
182 dup2 (out[1], 1);
183 close (in[0]);
184 close (in[1]);
185 close (out[0]);
186 close (out[1]);
187 execvp ("pdfinfo", args);
188 exit (1);
189 }
190 /* am parent, send file */
191 close (in[0]);
192 close (out[1]);
193 fout = fdopen (out[0], "r");
194
195 pos = 0;
196 while (pos < fsize)
197 {
198 ssize_t got;
199 size_t wpos;
200
201 data = NULL;
202 got = ec->read (ec->cls,
203 &data,
204 fsize - pos);
205 if ( (-1 == got) ||
206 (NULL == data) )
207 break;
208 wpos = 0;
209 while (wpos < got)
210 {
211 ssize_t out;
212
213 out = write (in[1], data + wpos, got - wpos);
214 if (out <= 0)
215 break;
216 wpos += out;
217 }
218 if (wpos < got)
219 break;
220 pos += got;
221 }
222 close (in[1]);
223 process_stdout (fout, ec->proc, ec->cls);
224 fclose (fout);
225 kill (pid, SIGKILL);
226 waitpid (pid, NULL, 0);
227}
228
229/* end of pdf_extractor.c */