pdf_extractor.c (5530B)
/*
     This file is part of libextractor.
     Copyright (C) 2016 Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 3, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     Boston, MA 02110-1301, USA.
 */
/**
 * @file plugins/pdf_extractor.c
 * @brief plugin to support PDF files
 * @author Christian Grothoff
 *
 * PDF libraries today are a nightmare (TM).  So instead of doing the
 * fast thing and calling some library functions to parse the PDF,
 * we execute 'pdfinfo' and parse the output. Because that's 21st
 * century plumbing: nobody writes reasonable code anymore.
 */
#include "platform.h"
#include <extractor.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <unistd.h>

/**
 * Entry in the mapping from 'pdfinfo' output keys to LE types.
 */
struct Matches
{
  /**
   * Key as it appears in the 'pdfinfo' output, i.e. the text
   * in front of the colon (without the colon itself).
   */
  const char *text;

  /**
   * Corresponding type in LE.
   */
  enum EXTRACTOR_MetaType type;
};


/**
 * Map from 'pdfinfo' output keys to LE types.
 *
 * See output of 'pdfinfo'.
58 */ 59 static struct Matches tmap[] = { 60 {"Title", EXTRACTOR_METATYPE_TITLE}, 61 {"Subject", EXTRACTOR_METATYPE_SUBJECT}, 62 {"Keywords", EXTRACTOR_METATYPE_KEYWORDS}, 63 {"Author", EXTRACTOR_METATYPE_AUTHOR_NAME}, 64 {"Creator", EXTRACTOR_METATYPE_CREATOR}, 65 {"Producer", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE}, 66 {"CreationDate", EXTRACTOR_METATYPE_CREATION_DATE}, 67 {"ModDate", EXTRACTOR_METATYPE_MODIFICATION_DATE}, 68 {"PDF version", EXTRACTOR_METATYPE_ENCODER_VERSION}, 69 {"Pages", EXTRACTOR_METATYPE_PAGE_COUNT}, 70 {NULL, 0} 71 }; 72 73 74 /** 75 * Process the "stdout" file from pdfinfo. 76 * 77 * @param fout stdout of pdfinfo 78 * @param proc function to call with meta data 79 * @param proc_cls closure for @e proc 80 */ 81 static void 82 process_stdout (FILE *fout, 83 EXTRACTOR_MetaDataProcessor proc, 84 void *proc_cls) 85 { 86 unsigned int i; 87 char line[1025]; 88 const char *psuffix; 89 const char *colon; 90 91 while (! feof (fout)) 92 { 93 if (NULL == fgets (line, sizeof (line) - 1, fout)) 94 break; 95 if (0 == strlen (line)) 96 continue; 97 if ('\n' == line[strlen (line) - 1]) 98 line[strlen (line) - 1] = '\0'; 99 colon = strchr (line, (int) ':'); 100 if (NULL == colon) 101 break; 102 psuffix = colon + 1; 103 while (isblank ((unsigned char) psuffix[0])) 104 psuffix++; 105 if (0 == strlen (psuffix)) 106 continue; 107 for (i = 0; NULL != tmap[i].text; i++) 108 { 109 if (0 != strncasecmp (line, 110 tmap[i].text, 111 colon - line)) 112 continue; 113 if (0 != proc (proc_cls, 114 "pdf", 115 tmap[i].type, 116 EXTRACTOR_METAFORMAT_UTF8, 117 "text/plain", 118 psuffix, 119 strlen (psuffix) + 1)) 120 return; 121 break; 122 } 123 } 124 } 125 126 127 /** 128 * Main entry method for the PDF extraction plugin. 
129 * 130 * @param ec extraction context provided to the plugin 131 */ 132 void 133 EXTRACTOR_pdf_extract_method (struct EXTRACTOR_ExtractContext *ec) 134 { 135 uint64_t fsize; 136 void *data; 137 pid_t pid; 138 int in[2]; 139 int out[2]; 140 FILE *fout; 141 uint64_t pos; 142 143 fsize = ec->get_size (ec->cls); 144 if (fsize < 128) 145 return; 146 if (4 != 147 ec->read (ec->cls, &data, 4)) 148 return; 149 if (0 != strncmp ("%PDF", data, 4)) 150 return; 151 if (0 != 152 ec->seek (ec->cls, 0, SEEK_SET)) 153 return; 154 if (0 != pipe (in)) 155 return; 156 if (0 != pipe (out)) 157 { 158 close (in[0]); 159 close (in[1]); 160 return; 161 } 162 pid = fork (); 163 if (-1 == pid) 164 { 165 close (in[0]); 166 close (in[1]); 167 close (out[0]); 168 close (out[1]); 169 return; 170 } 171 if (0 == pid) 172 { 173 char *const args[] = { 174 "pdfinfo", 175 "-", 176 NULL 177 }; 178 /* am child, exec 'pdfinfo' */ 179 close (0); 180 close (1); 181 if ( (-1 == dup2 (in[0], 0)) || 182 (-1 == dup2 (out[1], 1)) ) 183 exit (1); 184 close (in[0]); 185 close (in[1]); 186 close (out[0]); 187 close (out[1]); 188 execvp ("pdfinfo", args); 189 exit (1); 190 } 191 /* am parent, send file */ 192 close (in[0]); 193 close (out[1]); 194 fout = fdopen (out[0], "r"); 195 if (NULL == fout) 196 { 197 close (in[1]); 198 close (out[0]); 199 kill (pid, SIGKILL); 200 waitpid (pid, NULL, 0); 201 return; 202 } 203 pos = 0; 204 while (pos < fsize) 205 { 206 ssize_t got; 207 size_t wpos; 208 209 data = NULL; 210 got = ec->read (ec->cls, 211 &data, 212 fsize - pos); 213 if ( (-1 == got) || 214 (NULL == data) ) 215 break; 216 wpos = 0; 217 while (wpos < got) 218 { 219 ssize_t out; 220 221 out = write (in[1], data + wpos, got - wpos); 222 if (out <= 0) 223 break; 224 wpos += out; 225 } 226 if (wpos < got) 227 break; 228 pos += got; 229 } 230 close (in[1]); 231 process_stdout (fout, ec->proc, ec->cls); 232 fclose (fout); 233 kill (pid, SIGKILL); 234 waitpid (pid, NULL, 0); 235 } 236 237 238 /* end of pdf_extractor.c 
*/