dvi_extractor.c (8709B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2004, 2012, 2017, 2019 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 */ 20 /** 21 * @file plugins/dvi_extractor.c 22 * @brief plugin to support DVI files (from LaTeX) 23 * @author Christian Grothoff 24 */ 25 #include "platform.h" 26 #include "extractor.h" 27 28 29 /** 30 * Pair of a PostScipt prefix and the corresponding LE type. 31 */ 32 struct Matches 33 { 34 /** 35 * Prefix in the PS map. 36 */ 37 const char *text; 38 39 /** 40 * Corresponding LE type. 41 */ 42 enum EXTRACTOR_MetaType type; 43 }; 44 45 46 /** 47 * Map from PS names to LE types. 48 */ 49 static struct Matches tmap[] = { 50 { "/Title (", EXTRACTOR_METATYPE_TITLE }, 51 { "/Subject (", EXTRACTOR_METATYPE_SUBJECT }, 52 { "/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME }, 53 { "/Keywords (", EXTRACTOR_METATYPE_KEYWORDS }, 54 { "/Creator (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 55 { "/Producer (", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE }, 56 { NULL, 0 } 57 }; 58 59 60 /** 61 * Parse a "ZZZ" tag. Specifically, the data may contain a 62 * postscript dictionary with metadata. 63 * 64 * @param data overall input stream 65 * @param pos where in data is the zzz data 66 * @param len how many bytes from 'pos' does the zzz data extend? 67 * @param proc function to call with meta data found 68 * @param proc_cls closure for proc 69 * @return 0 to continue to extract, 1 to stop 70 */ 71 static int 72 parseZZZ (const char *data, 73 size_t pos, size_t len, 74 EXTRACTOR_MetaDataProcessor proc, 75 void *proc_cls) 76 { 77 size_t slen; 78 size_t end; 79 unsigned int i; 80 81 end = pos + len; 82 slen = strlen ("ps:SDict begin ["); 83 if ( (len <= slen) || 84 (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) ) 85 return 0; 86 pos += slen; 87 while (pos < end) 88 { 89 for (i = 0; NULL != tmap[i].text; i++) 90 { 91 slen = strlen (tmap[i].text); 92 if ( (pos + slen > end) || 93 (0 != strncmp (&data[pos], tmap[i].text, slen)) ) 94 continue; 95 pos += slen; 96 slen = pos; 97 while ((slen < end) && (data[slen] != ')')) 98 slen++; 99 slen = slen - pos; 100 { 101 char value[slen + 1]; 102 103 value[slen] = '\0'; 104 memcpy (value, &data[pos], slen); 105 if (0 != proc (proc_cls, 106 "dvi", 107 tmap[i].type, 108 EXTRACTOR_METAFORMAT_C_STRING, 109 "text/plain", 110 value, 111 slen + 1)) 112 return 1; 113 } 114 pos += slen + 1; 115 break; 116 } 117 pos++; 118 } 119 return 0; 120 } 121 122 123 /** 124 * Read 32-bit unsigned integer in big-endian format from 'data'. 125 * 126 * @param data pointer to integer (possibly unaligned) 127 * @return 32-bit integer in host byte order 128 */ 129 static uint32_t 130 getIntAt (const void *data) 131 { 132 uint32_t p; 133 134 memcpy (&p, data, 4); /* ensure alignment! */ 135 return ntohl (p); 136 } 137 138 139 /** 140 * Read 16-bit unsigned integer in big-endian format from 'data'. 141 * 142 * @param data pointer to integer (possibly unaligned) 143 * @return 16-bit integer in host byte order 144 */ 145 static uint16_t 146 getShortAt (const void *data) 147 { 148 uint16_t p; 149 150 memcpy (&p, data, sizeof (uint16_t)); /* ensure alignment! */ 151 return ntohs (p); 152 } 153 154 155 /** 156 * Main entry method for the 'application/x-dvi' extraction plugin. 157 * 158 * @param ec extraction context provided to the plugin 159 */ 160 void 161 EXTRACTOR_dvi_extract_method (struct EXTRACTOR_ExtractContext *ec) 162 { 163 unsigned int klen; 164 uint32_t pos; 165 uint32_t opos; 166 unsigned int len; 167 unsigned int pageCount; 168 char pages[16]; 169 void *buf; 170 unsigned char *data; 171 uint64_t size; 172 uint64_t off; 173 ssize_t iret; 174 175 if (40 >= (iret = ec->read (ec->cls, &buf, 1024))) 176 return; 177 data = buf; 178 if ( (data[0] != 247) || 179 (data[1] != 2) ) 180 return; /* cannot be DVI or unsupported version */ 181 klen = data[14]; 182 size = ec->get_size (ec->cls); 183 if (size > 16 * 1024 * 1024) 184 return; /* too large */ 185 if (klen + 15 > size) 186 return; /* malformed klen */ 187 if (NULL == (data = malloc ((size_t) size))) 188 return; /* out of memory */ 189 memcpy (data, buf, iret); 190 off = iret; 191 while (off < size) 192 { 193 if (0 >= (iret = ec->read (ec->cls, &buf, 16 * 1024))) 194 { 195 free (data); 196 return; 197 } 198 memcpy (&data[off], buf, iret); 199 off += iret; 200 } 201 pos = size - 1; 202 while ( (223 == data[pos]) && 203 (pos > 0) ) 204 pos--; 205 if ( (2 != data[pos]) || 206 (pos < 40) ) 207 goto CLEANUP; 208 pos--; 209 pos -= 4; 210 /* assert pos at 'post_post tag' */ 211 if (data[pos] != 249) 212 goto CLEANUP; 213 opos = pos; 214 pos = getIntAt (&data[opos + 1]); 215 if ( (pos + 25 > size) || 216 (pos + 25 < pos) ) 217 goto CLEANUP; 218 /* assert pos at 'post' command */ 219 if (data[pos] != 248) 220 goto CLEANUP; 221 pageCount = 0; 222 opos = pos; 223 pos = getIntAt (&data[opos + 1]); 224 while (1) 225 { 226 if (UINT32_MAX == pos) 227 break; 228 if ( (pos + 45 > size) || 229 (pos + 45 < pos) ) 230 goto CLEANUP; 231 if (data[pos] != 139) /* expect 'bop' */ 232 goto CLEANUP; 233 pageCount++; 234 opos = pos; 235 pos = getIntAt (&data[opos + 41]); 236 if (UINT32_MAX == pos) 237 break; 238 if (pos >= opos) 239 goto CLEANUP; /* invalid! */ 240 } 241 /* ok, now we believe it's a dvi... */ 242 snprintf (pages, 243 sizeof (pages), 244 "%u", 245 pageCount); 246 if (0 != ec->proc (ec->cls, 247 "dvi", 248 EXTRACTOR_METATYPE_PAGE_COUNT, 249 EXTRACTOR_METAFORMAT_UTF8, 250 "text/plain", 251 pages, 252 strlen (pages) + 1)) 253 goto CLEANUP; 254 if (0 != ec->proc (ec->cls, 255 "dvi", 256 EXTRACTOR_METATYPE_MIMETYPE, 257 EXTRACTOR_METAFORMAT_UTF8, 258 "text/plain", 259 "application/x-dvi", 260 strlen ("application/x-dvi") + 1)) 261 goto CLEANUP; 262 { 263 char comment[klen + 1]; 264 265 comment[klen] = '\0'; 266 memcpy (comment, &data[15], klen); 267 if (0 != ec->proc (ec->cls, 268 "dvi", 269 EXTRACTOR_METATYPE_COMMENT, 270 EXTRACTOR_METAFORMAT_C_STRING, 271 "text/plain", 272 comment, 273 klen + 1)) 274 goto CLEANUP; 275 } 276 /* try to find PDF/ps special */ 277 pos = opos; 278 while ( (size >= 100) && 279 (pos < size - 100) ) 280 { 281 switch (data[pos]) 282 { 283 case 139: /* begin page 'bop', we typically have to skip that one to 284 find the zzz's */ 285 pos += 45; /* skip bop */ 286 break; 287 case 239: /* zzz1 */ 288 len = data[pos + 1]; 289 if ( (pos + 2 + len < size) && 290 (0 != parseZZZ ((const char *) data, pos + 2, len, ec->proc, 291 ec->cls)) ) 292 goto CLEANUP; 293 pos += len + 2; 294 break; 295 case 240: /* zzz2 */ 296 len = getShortAt (&data[pos + 1]); 297 if ( (pos + 3 + len < size) && 298 (0 != parseZZZ ((const char *) data, pos + 3, len, ec->proc, 299 ec->cls)) ) 300 goto CLEANUP; 301 pos += len + 3; 302 break; 303 case 241: /* zzz3, who uses that? */ 304 len = (getShortAt (&data[pos + 1])) + 65536 * data[pos + 3]; 305 if ( (pos + 4 + len < size) && 306 (0 != parseZZZ ((const char *) data, pos + 4, len, ec->proc, 307 ec->cls)) ) 308 goto CLEANUP; 309 pos += len + 4; 310 break; 311 case 242: /* zzz4, hurray! */ 312 len = getIntAt (&data[pos + 1]); 313 if ( (pos + 1 + len < size) && 314 (0 != parseZZZ ((const char *) data, pos + 5, len, ec->proc, 315 ec->cls)) ) 316 goto CLEANUP; 317 pos += len + 5; 318 break; 319 default: /* unsupported opcode, abort scan */ 320 goto CLEANUP; 321 } 322 } 323 CLEANUP: 324 free (data); 325 } 326 327 328 /* end of dvi_extractor.c */