aboutsummaryrefslogtreecommitdiff
path: root/src/main/extractor.c
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2012-07-22 18:20:40 +0000
committerChristian Grothoff <christian@grothoff.org>2012-07-22 18:20:40 +0000
commit8b969da6d45e3a9245320f676b00d87e3768b1a6 (patch)
tree691c6c44e8965a99e39c6e1e79dadcee95b9bd1f /src/main/extractor.c
parent38b0cdd4c4d94c644eca757f5736ee8f3f03cc84 (diff)
downloadlibextractor-8b969da6d45e3a9245320f676b00d87e3768b1a6.tar.gz
libextractor-8b969da6d45e3a9245320f676b00d87e3768b1a6.zip
-misc hacking on train
Diffstat (limited to 'src/main/extractor.c')
-rw-r--r--src/main/extractor.c576
1 files changed, 2 insertions, 574 deletions
diff --git a/src/main/extractor.c b/src/main/extractor.c
index 1f1c561..d5dbd4d 100644
--- a/src/main/extractor.c
+++ b/src/main/extractor.c
@@ -113,35 +113,6 @@
113 */ 113 */
114#define OPMODE_FILE 3 114#define OPMODE_FILE 3
115 115
116/**
117 * Header used for our IPC replies. A header
118 * with all fields being zero is used to indicate
119 * the end of the stream.
120 */
121struct IpcHeader
122{
123 /**
124 * Type of the meta data.
125 */
126 enum EXTRACTOR_MetaType meta_type;
127
128 /**
129 * Format of the meta data.
130 */
131 enum EXTRACTOR_MetaFormat meta_format;
132
133 /**
134 * Number of bytes of meta data (value)
135 */
136 size_t data_len;
137
138 /**
139 * Length of the mime type string describing the meta data value's mime type,
140 * including 0-terminator, 0 for mime type of "NULL".
141 */
142 size_t mime_len;
143};
144
145 116
146/** 117/**
147 * Writes 'size' bytes from 'buf' to 'fd', returns only when 118 * Writes 'size' bytes from 'buf' to 'fd', returns only when
@@ -173,345 +144,12 @@ write_all (int fd,
173} 144}
174 145
175 146
176/**
177 * Function called by a plugin in a child process. Transmits
178 * the meta data back to the parent process.
179 *
180 * @param cls closure, "int*" of the FD for transmission
181 * @param plugin_name name of the plugin that produced this value;
182 * special values can be used (i.e. '<zlib>' for zlib being
183 * used in the main libextractor library and yielding
184 * meta data).
185 * @param type libextractor-type describing the meta data
186 * @param format basic format information about data
187 * @param data_mime_type mime-type of data (not of the original file);
188 * can be NULL (if mime-type is not known)
189 * @param data actual meta-data found
190 * @param data_len number of bytes in data
191 * @return 0 to continue extracting, 1 to abort (transmission error)
192 */
193static int
194transmit_reply (void *cls,
195 const char *plugin_name,
196 enum EXTRACTOR_MetaType type,
197 enum EXTRACTOR_MetaFormat format,
198 const char *data_mime_type,
199 const char *data,
200 size_t data_len)
201{
202 static const unsigned char meta_byte = MESSAGE_META;
203 int *cpipe_out = cls;
204 struct IpcHeader hdr;
205 size_t mime_len;
206
207 if (NULL == data_mime_type)
208 mime_len = 0;
209 else
210 mime_len = strlen (data_mime_type) + 1;
211 if (mime_len > MAX_MIME_LEN)
212 mime_len = MAX_MIME_LEN;
213 hdr.meta_type = type;
214 hdr.meta_format = format;
215 hdr.data_len = data_len;
216 hdr.mime_len = mime_len;
217 if ( (sizeof (meta_byte) !=
218 write_all (*cpipe_out,
219 &meta_byte, sizeof (meta_byte))) ||
220 (sizeof (hdr) !=
221 write_all (*cpipe_out,
222 &hdr, sizeof (hdr))) ||
223 (mime_len !=
224 write_all (*cpipe_out,
225 data_mime_type, mime_len)) ||
226 (data_len !=
227 write_all (*cpipe_out,
228 data, data_len)) )
229 return 1;
230 return 0;
231}
232
233
234/**
235 * Main loop function for plugins.
236 * Reads a message from the plugin input pipe and acts on it.
237 * Can be called recursively (once) in OPMODE_DECOMPRESS.
238 * plugin->waiting_for_update == 1 indicates the recursive call.
239 *
240 * @param plugin plugin context
241 * @return 0, always
242 */
243static int
244process_requests (struct EXTRACTOR_PluginList *plugin)
245{
246 int in, out;
247 int read_result1, read_result2, read_result3, read_result4;
248 unsigned char code;
249 char *shm_name = NULL;
250 size_t shm_name_len;
251 int extract_reply;
252 struct IpcHeader hdr;
253 int do_break;
254#ifdef WINDOWS
255 HANDLE map;
256 MEMORY_BASIC_INFORMATION mi;
257#endif
258
259 in = plugin->pipe_in;
260 out = plugin->cpipe_out;
261
262 /* The point of recursing into this function is to request
263 * a seek from LE server and wait for a reply. This snipper
264 * requests a seek.
265 */
266 if (plugin->waiting_for_update == 1)
267 {
268 unsigned char seek_byte = MESSAGE_SEEK;
269 if (write (out, &seek_byte, 1) != 1)
270 return -1;
271 if (write (out, &plugin->seek_request, sizeof (int64_t)) != sizeof (int64_t))
272 return -1;
273 }
274
275 memset (&hdr, 0, sizeof (hdr));
276 do_break = 0;
277 while (!do_break)
278 {
279 read_result1 = read (in, &code, 1);
280 if (read_result1 <= 0)
281 break;
282 switch (code)
283 {
284 case MESSAGE_INIT_STATE:
285 read_result2 = read (in, &plugin->operation_mode, sizeof (uint8_t));
286 read_result3 = read (in, &plugin->fsize, sizeof (int64_t));
287 read_result4 = read (in, &shm_name_len, sizeof (size_t));
288 if ((read_result2 < sizeof (uint8_t)) ||
289 (read_result3 < sizeof (int64_t)) ||
290 (read_result4 < sizeof (size_t)))
291 {
292 do_break = 1;
293 break;
294 }
295 if (plugin->operation_mode != OPMODE_MEMORY &&
296 plugin->operation_mode != OPMODE_DECOMPRESS &&
297 plugin->operation_mode != OPMODE_FILE)
298 {
299 do_break = 1;
300 break;
301 }
302 if ((plugin->operation_mode == OPMODE_MEMORY ||
303 plugin->operation_mode == OPMODE_DECOMPRESS) &&
304 shm_name_len > MAX_SHM_NAME)
305 {
306 do_break = 1;
307 break;
308 }
309 /* Fsize may be -1 only in decompression mode */
310 if (plugin->operation_mode != OPMODE_DECOMPRESS && plugin->fsize <= 0)
311 {
312 do_break = 1;
313 break;
314 }
315 if (shm_name != NULL)
316 free (shm_name);
317 shm_name = malloc (shm_name_len);
318 if (shm_name == NULL)
319 {
320 do_break = 1;
321 break;
322 }
323 read_result2 = read (in, shm_name, shm_name_len);
324 if (read_result2 < shm_name_len)
325 {
326 do_break = 1;
327 break;
328 }
329 shm_name[shm_name_len - 1] = '\0';
330 do_break = init_state_method (plugin, plugin->operation_mode, plugin->fsize, shm_name);
331 /* in OPMODE_MEMORY and OPMODE_FILE we can start extracting right away,
332 * there won't be UPDATED_SHM message, and we don't need it
333 */
334 if (!do_break && (plugin->operation_mode == OPMODE_MEMORY ||
335 plugin->operation_mode == OPMODE_FILE))
336 {
337 extract_reply = plugin->extract_method (plugin, transmit_reply, &out);
338 unsigned char done_byte = MESSAGE_DONE;
339 if (write (out, &done_byte, 1) != 1)
340 {
341 do_break = 1;
342 break;
343 }
344 if ((plugin->specials != NULL) &&
345 (NULL != strstr (plugin->specials, "force-kill")))
346 {
347 /* we're required to die after each file since this
348 plugin only supports a single file at a time */
349#if !WINDOWS
350 fsync (out);
351#else
352 _commit (out);
353#endif
354 _exit (0);
355 }
356 }
357 break;
358 case MESSAGE_DISCARD_STATE:
359 discard_state_method (plugin);
360 break;
361 case MESSAGE_UPDATED_SHM:
362 if (plugin->operation_mode == OPMODE_DECOMPRESS)
363 {
364 read_result2 = read (in, &plugin->fpos, sizeof (int64_t));
365 read_result3 = read (in, &plugin->map_size, sizeof (size_t));
366 read_result4 = read (in, &plugin->fsize, sizeof (int64_t));
367 if ((read_result2 < sizeof (int64_t)) || (read_result3 < sizeof (size_t)) ||
368 plugin->fpos < 0 || (plugin->operation_mode != OPMODE_DECOMPRESS && (plugin->fsize <= 0 || plugin->fpos >= plugin->fsize)))
369 {
370 do_break = 1;
371 break;
372 }
373 /* FIXME: also check mapped region size (lseek for *nix, VirtualQuery for W32) */
374 /* Re-map the shm */
375#if !WINDOWS
376 if ((-1 == plugin->shm_id) ||
377 (NULL == (plugin->shm_ptr = mmap (NULL, plugin->map_size, PROT_READ, MAP_SHARED, plugin->shm_id, 0))) ||
378 (plugin->shm_ptr == (void *) -1))
379 {
380 do_break = 1;
381 break;
382 }
383#else
384 if ((plugin->map_handle == 0) ||
385 (NULL == (plugin->shm_ptr = MapViewOfFile (plugin->map_handle, FILE_MAP_READ, 0, 0, 0))))
386 {
387 do_break = 1;
388 break;
389 }
390#endif
391 if (plugin->waiting_for_update == 1)
392 {
393 /* We were only waiting for this one message */
394 do_break = 1;
395 plugin->waiting_for_update = 2;
396 break;
397 }
398 /* Run extractor on mapped region (recursive call doesn't reach this
399 * point and breaks out earlier.
400 */
401 extract_reply = plugin->extract_method (plugin, transmit_reply, &out);
402 /* Unmap the shm */
403#if !WINDOWS
404 if ((plugin->shm_ptr != NULL) &&
405 (plugin->shm_ptr != (void*) -1) )
406 munmap (plugin->shm_ptr, plugin->map_size);
407#else
408 if (plugin->shm_ptr != NULL)
409 UnmapViewOfFile (plugin->shm_ptr);
410#endif
411 plugin->shm_ptr = NULL;
412 if (extract_reply == 1)
413 {
414 /* Tell LE that we're done */
415 unsigned char done_byte = MESSAGE_DONE;
416 if (write (out, &done_byte, 1) != 1)
417 {
418 do_break = 1;
419 break;
420 }
421 if ((plugin->specials != NULL) &&
422 (NULL != strstr (plugin->specials, "force-kill")))
423 {
424 /* we're required to die after each file since this
425 plugin only supports a single file at a time */
426#if !WINDOWS
427 fsync (out);
428#else
429 _commit (out);
430#endif
431 _exit (0);
432 }
433 }
434 else
435 {
436 /* Tell LE that we're not done, and we need to seek */
437 unsigned char seek_byte = MESSAGE_SEEK;
438 if (write (out, &seek_byte, 1) != 1)
439 {
440 do_break = 1;
441 break;
442 }
443 if (write (out, &plugin->seek_request, sizeof (int64_t)) != sizeof (int64_t))
444 {
445 do_break = 1;
446 break;
447 }
448 }
449 }
450 else
451 {
452 /* This is mostly to safely skip unrelated messages */
453 int64_t t;
454 size_t t2;
455 read_result2 = read (in, &t, sizeof (int64_t));
456 read_result3 = read (in, &t2, sizeof (size_t));
457 read_result4 = read (in, &t, sizeof (int64_t));
458 }
459 break;
460 }
461 }
462 return 0;
463}
464
465
466/**
467 * 'main' function of the child process. Loads the plugin,
468 * sets up its in and out pipes, then runs the request serving function.
469 *
470 * @param plugin extractor plugin to use
471 * @param in stream to read from
472 * @param out stream to write to
473 */
474static void
475plugin_main (struct EXTRACTOR_PluginList *plugin,
476 int in, int out)
477{
478 if (plugin == NULL)
479 {
480 close (in);
481 close (out);
482 return;
483 }
484 if (0 != EXTRACTOR_plugin_load_ (plugin))
485 {
486 close (in);
487 close (out);
488#if DEBUG
489 fprintf (stderr, "Plugin `%s' failed to load!\n", plugin->short_libname);
490#endif
491 return;
492 }
493 if ((plugin->specials != NULL) &&
494 (NULL != strstr (plugin->specials, "close-stderr")))
495 close (2);
496 if ((plugin->specials != NULL) &&
497 (NULL != strstr (plugin->specials, "close-stdout")))
498 close (1);
499
500 plugin->pipe_in = in;
501 /* Compiler will complain, and it's right. This is a kind of hack...*/
502 plugin->cpipe_out = out;
503 process_requests (plugin);
504
505 close (in);
506 close (out);
507}
508
509 147
510/** 148/**
511 * Open a file 149 * Open a file
512 */ 150 */
513static int 151static int
514file_open(const char *filename, int oflag, ...) 152file_open (const char *filename, int oflag, ...)
515{ 153{
516 int mode; 154 int mode;
517 const char *fn; 155 const char *fn;
@@ -794,112 +432,6 @@ pl_pick_next_buffer_at (struct EXTRACTOR_PluginList *plugin,
794} 432}
795 433
796 434
797/**
798 * Moves current absolute buffer position to @pos in @whence mode.
799 * Will move logical position withouth shifting the buffer, if possible.
800 * Will not move beyond the end of file.
801 *
802 * @param plugin plugin context
803 * @param pos position to move to
804 * @param whence seek mode (SEEK_CUR, SEEK_SET, SEEK_END)
805 * @return new absolute position, -1 on error
806 */
807static int64_t
808pl_seek (struct EXTRACTOR_PluginList *plugin, int64_t pos, int whence)
809{
810 switch (whence)
811 {
812 case SEEK_CUR:
813 if (plugin->shm_pos + pos < plugin->map_size && plugin->shm_pos + pos >= 0)
814 {
815 plugin->shm_pos += pos;
816 return plugin->fpos + plugin->shm_pos;
817 }
818 if (0 != pl_pick_next_buffer_at (plugin, plugin->fpos + plugin->shm_pos + pos, 1))
819 return -1;
820 plugin->shm_pos += pos;
821 return plugin->fpos + plugin->shm_pos;
822 break;
823 case SEEK_SET:
824 if (pos < 0)
825 return -1;
826 if (pos >= plugin->fpos && pos < plugin->fpos + plugin->map_size)
827 {
828 plugin->shm_pos = pos - plugin->fpos;
829 return pos;
830 }
831 if (0 != pl_pick_next_buffer_at (plugin, pos, 1))
832 return -1;
833 if (pos >= plugin->fpos && pos < plugin->fpos + plugin->map_size)
834 {
835 plugin->shm_pos = pos - plugin->fpos;
836 return pos;
837 }
838 return -1;
839 break;
840 case SEEK_END:
841 while (plugin->fsize == -1)
842 {
843 pl_pick_next_buffer_at (plugin, plugin->fpos + plugin->map_size + pos, 0);
844 }
845 if (plugin->fsize + pos - 1 < plugin->fpos || plugin->fsize + pos - 1 > plugin->fpos + plugin->map_size)
846 {
847 if (0 != pl_pick_next_buffer_at (plugin, plugin->fsize - MAX_READ, 0))
848 return -1;
849 }
850 plugin->shm_pos = plugin->fsize + pos - plugin->fpos;
851 if (plugin->shm_pos < 0)
852 plugin->shm_pos = 0;
853 else if (plugin->shm_pos >= plugin->map_size)
854 plugin->shm_pos = plugin->map_size - 1;
855 return plugin->fpos + plugin->shm_pos - 1;
856 break;
857 }
858 return -1;
859}
860
861
862static int64_t
863pl_get_fsize (struct EXTRACTOR_PluginList *plugin)
864{
865 return plugin->fsize;
866}
867
868
869/**
870 * Fills @data with a pointer to the data buffer.
871 * Equivalent to read(), except you don't have to allocate and free
872 * a buffer, since the data is already in memory.
873 * Will move the buffer, if necessary
874 *
875 * @param plugin plugin context
876 * @param data location to store data pointer
877 * @param count number of bytes to read
878 * @return number of bytes (<= count) avalable in @data, -1 on error
879 */
880static int64_t
881pl_read (struct EXTRACTOR_PluginList *plugin, unsigned char **data, size_t count)
882{
883 *data = NULL;
884 if (count > MAX_READ)
885 return -1;
886 if (count > plugin->map_size - plugin->shm_pos)
887 {
888 int64_t actual_count;
889 if (plugin->fpos + plugin->shm_pos != pl_seek (plugin, plugin->fpos + plugin->shm_pos, SEEK_SET))
890 return -1;
891 *data = &plugin->shm_ptr[plugin->shm_pos];
892 actual_count = (count < plugin->map_size - plugin->shm_pos) ? count : (plugin->map_size - plugin->shm_pos);
893 plugin->shm_pos += actual_count;
894 return actual_count;
895 }
896 else
897 {
898 *data = &plugin->shm_ptr[plugin->shm_pos];
899 plugin->shm_pos += count;
900 return count;
901 }
902}
903 435
904 436
905/** 437/**
@@ -1002,111 +534,7 @@ ask_in_process_plugin (struct EXTRACTOR_PluginList *plugin,
1002} 534}
1003 535
1004 536
1005/** 537
1006 * Receive a reply from plugin (seek request, metadata and done message)
1007 *
1008 * @param plugin plugin context
1009 * @param proc metadata callback
1010 * @param proc_cls callback cls
1011 * @return 0 on success, -1 on error
1012 */
1013static int
1014receive_reply (struct EXTRACTOR_PluginList *plugin,
1015 EXTRACTOR_MetaDataProcessor proc, void *proc_cls)
1016{
1017 int read_result;
1018 unsigned char code;
1019 int64_t seek_position;
1020 struct IpcHeader hdr;
1021 char *mime_type;
1022 char *data;
1023 int must_read = 1;
1024
1025 while (must_read)
1026 {
1027 read_result = plugin_read (plugin, &code, 1);
1028 if (read_result < 1)
1029 return -1;
1030 switch (code)
1031 {
1032 case MESSAGE_DONE: /* Done */
1033 plugin->seek_request = -1;
1034 must_read = 0;
1035 break;
1036 case MESSAGE_SEEK: /* Seek */
1037 read_result = plugin_read (plugin,
1038 &seek_position, sizeof (int64_t));
1039 if (read_result < sizeof (int64_t))
1040 return -1;
1041 plugin->seek_request = seek_position;
1042 must_read = 0;
1043 break;
1044 case MESSAGE_META: /* Meta */
1045 read_result = plugin_read (plugin,
1046 &hdr, sizeof (hdr));
1047 if (read_result < sizeof (hdr))
1048 return -1;
1049 /* FIXME: check hdr for sanity */
1050 if (hdr.data_len > MAX_META_DATA)
1051 return -1; /* not allowing more than MAX_META_DATA meta data */
1052 if (0 == hdr.mime_len)
1053 {
1054 mime_type = NULL;
1055 }
1056 else
1057 {
1058 if (NULL == (mime_type = malloc (hdr.mime_len)))
1059 return -1;
1060 read_result = plugin_read (plugin,
1061 mime_type,
1062 hdr.mime_len);
1063 if ( (read_result < hdr.mime_len) ||
1064 ('\0' != mime_type[hdr.mime_len-1]) )
1065 {
1066 if (NULL != mime_type)
1067 free (mime_type);
1068 return -1;
1069 }
1070 }
1071 if (0 == hdr.data_len)
1072 {
1073 data = NULL;
1074 }
1075 else
1076 {
1077 if (NULL == (data = malloc (hdr.data_len)))
1078 {
1079 if (NULL != mime_type)
1080 free (mime_type);
1081 return -1;
1082 }
1083 read_result = plugin_read (plugin,
1084 data, hdr.data_len);
1085 if (read_result < hdr.data_len)
1086 {
1087 if (NULL != mime_type)
1088 free (mime_type);
1089 free (data);
1090 return -1;
1091 }
1092 }
1093 read_result = proc (proc_cls,
1094 plugin->short_libname,
1095 hdr.meta_type, hdr.meta_format,
1096 mime_type, data, hdr.data_len);
1097 if (NULL != mime_type)
1098 free (mime_type);
1099 if (NULL != data)
1100 free (data);
1101 if (0 != read_result)
1102 return 1;
1103 break;
1104 default:
1105 return -1;
1106 }
1107 }
1108 return 0;
1109}
1110 538
1111 539
1112/** 540/**