aboutsummaryrefslogtreecommitdiff
path: root/src/fs/fs_dirmetascan.c
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2012-01-14 16:04:58 +0000
committerChristian Grothoff <christian@grothoff.org>2012-01-14 16:04:58 +0000
commit85967c4d4bd03d68a677f6e8023b192b8b4453f5 (patch)
treefdfea34d80d25bf40c1a1e117ef86b65dba5c096 /src/fs/fs_dirmetascan.c
parent385d99b60ab8eedc6d26b1e66949a43afafdd79e (diff)
downloadgnunet-85967c4d4bd03d68a677f6e8023b192b8b4453f5.tar.gz
gnunet-85967c4d4bd03d68a677f6e8023b192b8b4453f5.zip
-file was missing, forgot to add earlier
Diffstat (limited to 'src/fs/fs_dirmetascan.c')
-rw-r--r--src/fs/fs_dirmetascan.c1282
1 files changed, 1282 insertions, 0 deletions
diff --git a/src/fs/fs_dirmetascan.c b/src/fs/fs_dirmetascan.c
new file mode 100644
index 000000000..372579ccb
--- /dev/null
+++ b/src/fs/fs_dirmetascan.c
@@ -0,0 +1,1282 @@
1/*
2 This file is part of GNUnet
3 (C) 2005-2012 Christian Grothoff (and other contributing authors)
4
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19*/
20
21#include "platform.h"
22#include "gnunet_fs_service.h"
23#include "gnunet_scheduler_lib.h"
24
/**
 * Per-keyword tally.  One entry exists for each distinct keyword and
 * records the keyword together with the number of times it occurred.
 */
struct KeywordCounter
{

  /**
   * The keyword this entry stands for.
   */
  const char *value;

  /**
   * Number of files that carry this keyword.
   */
  unsigned int count;

  /**
   * Previous entry (entries form a doubly-linked list).
   */
  struct KeywordCounter *prev;

  /**
   * Next entry (entries form a doubly-linked list).
   */
  struct KeywordCounter *next;
};
52
53/**
54 * Aggregate information we keep for meta data in each directory.
55 */
56struct MetaCounter
57{
58 /**
59 * The actual meta data.
60 */
61 const char *data;
62
63 /**
64 * Number of bytes in 'data'.
65 */
66 size_t data_size;
67
68 /**
69 * Name of the plugin that provided that piece of metadata
70 */
71 const char *plugin_name;
72
73 /**
74 * Type of the data
75 */
76 enum EXTRACTOR_MetaType type;
77
78 /**
79 * Format of the data
80 */
81 enum EXTRACTOR_MetaFormat format;
82
83 /**
84 * MIME-type of the metadata itself
85 */
86 const char *data_mime_type;
87
88 /**
89 * How many files have meta entries matching this value?
90 * (type and format do not have to match).
91 */
92 unsigned int count;
93
94 /**
95 * This is a doubly-linked list
96 */
97 struct MetaCounter *prev;
98
99 /**
100 * This is a doubly-linked list
101 */
102 struct MetaCounter *next;
103};
104
/**
 * State handed to 'scan_directory' (the scanner's execution context).
 * The initiator thread owns this structure.
 */
struct AddDirContext
{
  /**
   * Directory item under which newly scanned entries are inserted.
   * Once the scan has finished, this points to the top-level
   * directory entry of the tree that the scanner built.
   */
  struct ShareTreeItem *parent;

  /**
   * Expanded name of the directory to scan (as supplied by the scan
   * initiator).  The scanner thread keeps this copy and releases it
   * when it is done.
   */
  char *filename_expanded;

  /**
   * Synchronization primitive owned by the initiator thread.  A change
   * of its state tells the scanner that the initiator wants it to
   * wrap up.
   */
#if WINDOWS
  HANDLE stop;
#else
  sem_t *stop;
#endif

  /**
   * 1 once the scanner is supposed to stop, 0 otherwise.  Set because
   * of a communication error or because the initiator asked for the
   * scan to end.
   */
  char do_stop;

  /**
   * Write end of the pipe that carries progress messages.  The pipe
   * belongs to the initiator thread; the scanner thread never closes
   * this end, so the initiator MUST keep the pipe alive until the
   * scanner thread has finished.
   */
  const struct GNUNET_DISK_FileHandle *progress_write;


  /**
   * libextractor plugins used for extraction.  Set up when the scan
   * starts and removed again when it finishes.
   */
  struct EXTRACTOR_PluginList *plugins;
};
160
161/**
162 * An opaque structure a pointer to which is returned to the
163 * caller to be used to control the scanner.
164 */
165struct GNUNET_FS_DirScanner
166{
167 /**
168 * A synchronization privitive that is used to signal the scanner to stop.
169 * Owned by the initiator thread.
170 */
171#if WINDOWS
172 HANDLE stop;
173#else
174 sem_t *stop;
175#endif
176
177 /**
178 * A thread object for the scanner thread.
179 * Owned by the initiator thread.
180 */
181#if WINDOWS
182 HANDLE thread;
183#else
184 pthread_t thread;
185#endif
186
187 /**
188 * A task for reading progress messages from the scanner.
189 */
190 GNUNET_SCHEDULER_TaskIdentifier progress_read_task;
191
192 /**
193 * The end of the pipe that is used to read progress messages.
194 */
195 const struct GNUNET_DISK_FileHandle *progress_read;
196
197 /**
198 * The pipe that is used to read progress messages.
199 * Owned (along with both of its ends) by the initiator thread.
200 * Only closed after the scanner thread is finished.
201 */
202 struct GNUNET_DISK_PipeHandle *progress_pipe;
203
204 /**
205 * The function that will be called every time there's a progress
206 * message.
207 */
208 GNUNET_FS_DirScannerProgressCallback progress_callback;
209
210 /**
211 * A closure for progress_callback.
212 */
213 void *cls;
214
215 /**
216 * A pointer to the context of the scanner.
217 * Owned by the initiator thread.
218 * Initiator thread shouldn't touch it until the scanner thread
219 * is finished.
220 */
221 struct AddDirContext *adc;
222};
223
/**
 * Element of a singly-linked list that is used as a stack by the
 * metadata-processing function.
 */
struct ProcessMetadataStackItem
{
  /**
   * Metadata-processing context; identical for all stack items.
   */
  struct ProcessMetadataContext *ctx;

  /**
   * Link to the parent stack item.  Only the end of the list is
   * remembered, and the list is traversed backwards via this pointer.
   */
  struct ProcessMetadataStackItem *parent;

  /**
   * Maps the hash of a keyword to a 'struct KeywordCounter *' that
   * tells how often the keyword was seen in the current directory.
   */
  struct GNUNET_CONTAINER_MultiHashMap *keywordcounter;

  /**
   * Maps the hash of a meta data value to a 'struct MetaCounter *'
   * that tells how often the value was seen in the current directory.
   */
  struct GNUNET_CONTAINER_MultiHashMap *metacounter;

  /**
   * Number of files in the current directory.
   */
  unsigned int dir_entry_count;

  /**
   * Keywords that must not be used for KSK because they will be
   * associated with the parent as well; NULL if nothing is blocked.
   */
  struct GNUNET_FS_Uri *exclude_ksk;

  /**
   * Share tree item currently being processed.
   */
  struct ShareTreeItem *item;

  /**
   * GNUNET_YES if the directory that 'item' points to has been
   * processed already and processing should move on to the next item;
   * otherwise the directory is recursed into.
   */
  int end_directory;

};
280
281/**
282 * The structure to keep the state of metadata processing
283 */
284struct ProcessMetadataContext
285{
286 /**
287 * The top of the stack.
288 */
289 struct ProcessMetadataStackItem *stack;
290
291 /**
292 * Callback to invoke when processing is finished
293 */
294 GNUNET_SCHEDULER_Task cb;
295
296 /**
297 * Closure for 'cb'
298 */
299 void *cls;
300
301 /**
302 * Toplevel directory item of the tree to process.
303 */
304 struct ShareTreeItem *toplevel;
305};
306
307/**
308 * Called every now and then by the scanner.
309 * Checks the synchronization privitive.
310 * Returns 1 if the scanner should stop, 0 otherwise.
311 */
312static int
313should_stop (struct AddDirContext *adc)
314{
315#if WINDOWS
316 if (WaitForSingleObject (adc->stop, 0) == WAIT_TIMEOUT)
317 return 0;
318 adc->do_stop = 1;
319 return 1;
320#else
321 int value;
322 sem_getvalue(adc->stop, &value);
323 if (value > 0)
324 {
325 adc->do_stop = 1;
326 return 1;
327 }
328 return 0;
329#endif
330}
331
332/**
333 * Write progress message.
334 * Format is:
335 * <reason><filename length><filename><directory flag>
336 * If filename is NULL, filename is not written, and its length
337 * is written as 0, and nothing else is written. It signals the initiator
338 * thread that the scanner is finished, and that it can now join its thread.
339 *
340 * Also checks if the initiator thread wants the scanner to stop,
341 * Returns 1 to stop scanning (if the signal was received, or
342 * if the pipe was broken somehow), 0 otherwise.
343 */
344static int
345write_progress (struct AddDirContext *adc, const char *filename,
346 char is_directory, enum GNUNET_DirScannerProgressUpdateReason reason)
347{
348 size_t filename_len;
349 size_t wr;
350 size_t total_write;
351 if ((adc->do_stop || should_stop (adc)) && reason != GNUNET_DIR_SCANNER_ASKED_TO_STOP
352 && reason != GNUNET_DIR_SCANNER_FINISHED)
353 return 1;
354 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
355 &reason, sizeof (reason));
356 while (wr > 0 && total_write < sizeof (reason))
357 {
358 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
359 &((char *)&reason)[total_write], sizeof (reason) - total_write);
360 if (wr > 0)
361 total_write += wr;
362 }
363 if (sizeof (reason) != wr)
364 return 1;
365 if (filename)
366 filename_len = strlen (filename) + 1;
367 else
368 filename_len = 0;
369 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
370 &filename_len, sizeof (size_t));
371 while (wr > 0 && total_write < sizeof (size_t))
372 {
373 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
374 &((char *)&filename_len)[total_write], sizeof (size_t) - total_write);
375 if (wr > 0)
376 total_write += wr;
377 }
378 if (sizeof (size_t) != wr)
379 return 1;
380 if (filename)
381 {
382 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
383 filename, filename_len);
384 while (wr > 0 && total_write < filename_len)
385 {
386 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
387 &((char *)filename)[total_write], filename_len - total_write);
388 if (wr > 0)
389 total_write += wr;
390 }
391 if (filename_len != wr)
392 return 1;
393 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
394 &is_directory, sizeof (char));
395 while (wr > 0 && total_write < sizeof (char))
396 {
397 total_write = wr = GNUNET_DISK_file_write (adc->progress_write,
398 &((char *)&is_directory)[total_write], sizeof (char) - total_write);
399 if (wr > 0)
400 total_write += wr;
401 }
402 if (sizeof (char) != wr)
403 return 1;
404 }
405 return 0;
406}
407
408/**
409 * Add the given keyword to the
410 * keyword statistics tracker.
411 *
412 * @param cls closure (user-defined)
413 * @param keyword the keyword to count
414 * @param is_mandatory ignored
415 * @return always GNUNET_OK
416 */
417static int
418add_to_keyword_counter (void *cls, const char *keyword, int is_mandatory)
419{
420 struct GNUNET_CONTAINER_MultiHashMap *mcm = cls;
421 struct KeywordCounter *cnt, *first_cnt;
422 GNUNET_HashCode hc;
423 size_t klen;
424
425 klen = strlen (keyword) + 1;
426 GNUNET_CRYPTO_hash (keyword, klen - 1, &hc);
427 /* Since the map might contain multiple values per keyword, we only
428 * store one value, and attach all other to it, forming a linked list.
429 * Somewhat easier than retrieving multiple items via callback.
430 */
431 first_cnt = GNUNET_CONTAINER_multihashmap_get (mcm, &hc);
432 for (cnt = first_cnt; cnt && strcmp (cnt->value, keyword) != 0; cnt = cnt->next);
433 if (cnt == NULL)
434 {
435 cnt = GNUNET_malloc (sizeof (struct KeywordCounter) + klen);
436 cnt->value = (const char *) &cnt[1];
437 memcpy (&cnt[1], keyword, klen);
438 if (first_cnt != NULL)
439 {
440 if (first_cnt->prev != NULL)
441 {
442 first_cnt->prev->next = cnt;
443 cnt->prev = first_cnt->prev;
444 }
445 first_cnt->prev = cnt;
446 cnt->next = first_cnt;
447 }
448 else
449 GNUNET_CONTAINER_multihashmap_put (mcm, &hc, cnt,
450 GNUNET_CONTAINER_MULTIHASHMAPOPTION_MULTIPLE);
451 }
452 cnt->count++;
453 return GNUNET_OK;
454}
455
456/**
457 * Type of a function that libextractor calls for each
458 * meta data item found.
459 *
460 * @param cls the container multihashmap to update
461 * @param plugin_name name of the plugin that produced this value;
462 * special values can be used (i.e. '&lt;zlib&gt;' for zlib being
463 * used in the main libextractor library and yielding
464 * meta data).
465 * @param type libextractor-type describing the meta data
466 * @param format basic format information about data
467 * @param data_mime_type mime-type of data (not of the original file);
468 * can be NULL (if mime-type is not known)
469 * @param data actual meta-data found
470 * @param data_len number of bytes in data
471 * @return GNUNET_OK to continue extracting / iterating
472 */
473static int
474add_to_meta_counter (void *cls, const char *plugin_name,
475 enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format,
476 const char *data_mime_type, const char *data, size_t data_len)
477{
478 struct GNUNET_CONTAINER_MultiHashMap *map = cls;
479 GNUNET_HashCode key;
480 struct MetaCounter *cnt, *first_cnt;
481
482 GNUNET_CRYPTO_hash (data, data_len, &key);
483 first_cnt = GNUNET_CONTAINER_multihashmap_get (map, &key);
484 for (cnt = first_cnt; cnt
485 && cnt->data_size != data_len
486 && memcmp (cnt->data, data, cnt->data_size) != 0; cnt = cnt->next);
487 if (cnt == NULL)
488 {
489 cnt = GNUNET_malloc (sizeof (struct MetaCounter));
490 cnt->data = data;
491 cnt->data_size = data_len;
492 cnt->plugin_name = plugin_name;
493 cnt->type = type;
494 cnt->format = format;
495 cnt->data_mime_type = data_mime_type;
496
497 if (first_cnt != NULL)
498 {
499 if (first_cnt->prev != NULL)
500 {
501 first_cnt->prev->next = cnt;
502 cnt->prev = first_cnt->prev;
503 }
504 first_cnt->prev = cnt;
505 cnt->next = first_cnt;
506 }
507 else
508 GNUNET_CONTAINER_multihashmap_put (map, &key, cnt,
509 GNUNET_CONTAINER_MULTIHASHMAPOPTION_MULTIPLE);
510 }
511 cnt->count++;
512 return 0;
513}
514
515/**
516 * Allocates a struct ShareTreeItem and adds it to its parent.
517 */
518static struct ShareTreeItem *
519make_item (struct ShareTreeItem *parent)
520{
521 struct ShareTreeItem *item;
522 item = GNUNET_malloc (sizeof (struct ShareTreeItem));
523
524 item->parent = parent;
525 if (parent)
526 GNUNET_CONTAINER_DLL_insert (parent->children_head, parent->children_tail,
527 item);
528 return item;
529}
530
531/**
532 * Extract metadata from a file and add it to the share tree
533 *
534 * @param adc context to modify
535 * @param filename name of the file to process
536 */
537static void
538extract_file (struct AddDirContext *adc, const char *filename)
539{
540 struct ShareTreeItem *item;
541 const char *short_fn;
542
543 item = make_item (adc->parent);
544
545 GNUNET_DISK_file_size (filename, &item->file_size, GNUNET_YES);
546 item->is_directory = GNUNET_NO;
547
548 item->meta = GNUNET_CONTAINER_meta_data_create ();
549 GNUNET_FS_meta_data_extract_from_file (item->meta, filename,
550 adc->plugins);
551 GNUNET_CONTAINER_meta_data_delete (item->meta, EXTRACTOR_METATYPE_FILENAME,
552 NULL, 0);
553 short_fn = GNUNET_STRINGS_get_short_name (filename);
554
555 item->filename = GNUNET_strdup (filename);
556 item->short_filename = GNUNET_strdup (short_fn);
557
558 GNUNET_CONTAINER_meta_data_insert (item->meta, "<libgnunetfs>",
559 EXTRACTOR_METATYPE_FILENAME,
560 EXTRACTOR_METAFORMAT_UTF8, "text/plain",
561 short_fn, strlen (short_fn) + 1);
562}
563
564/**
565 * Remove the keyword from the ksk URI.
566 *
567 * @param cls the ksk uri
568 * @param keyword the word to remove
569 * @param is_mandatory ignored
570 * @return always GNUNET_OK
571 */
572static int
573remove_keyword (void *cls, const char *keyword, int is_mandatory)
574{
575 struct GNUNET_FS_Uri *ksk = cls;
576
577 GNUNET_FS_uri_ksk_remove_keyword (ksk, keyword);
578 return GNUNET_OK;
579}
580
581/**
582 * Remove keywords from current directory's children, if they are
583 * in the exluded keywords list of that directory.
584 *
585 * @param cls the ksk uri
586 * @param keyword the word to remove
587 * @param is_mandatory ignored
588 * @return always GNUNET_OK
589 */
590static int
591remove_keywords (struct ProcessMetadataStackItem *stack, struct ShareTreeItem *dir)
592{
593 struct ShareTreeItem *item;
594
595 for (item = dir->children_head; item; item = item->next)
596 {
597 if (stack->exclude_ksk != NULL)
598 GNUNET_FS_uri_ksk_get_keywords (stack->exclude_ksk, &remove_keyword, item->ksk_uri);
599 }
600 return GNUNET_OK;
601}
602
/**
 * Closure handed to 'migrate_and_drop'.
 */
struct KeywordProcessContext
{
  /**
   * Collects all keywords that were migrated to the parent.
   */
  struct GNUNET_FS_Uri *ksk;

  /**
   * Minimum number of occurrences a keyword needs in order to be
   * migrated to the parent.
   */
  unsigned int threshold;
};
619
/**
 * Closure handed to 'migrate_and_drop_metadata'.
 */
struct MetaProcessContext
{
  /**
   * Collects all meta data that is copied to the parent.
   */
  struct GNUNET_CONTAINER_MetaData *meta;

  /**
   * Minimum number of occurrences a meta data item needs in order to
   * be copied to the parent.
   */
  unsigned int threshold;
};
636
637
638/**
639 * Move "frequent" keywords over to the
640 * target ksk uri, free the counters.
641 *
642 */
643static int
644migrate_and_drop (void *cls, const GNUNET_HashCode * key, void *value)
645{
646 struct KeywordProcessContext *kpc = cls;
647 struct KeywordCounter *counter = value;
648
649 if (counter->count >= kpc->threshold && counter->count > 1)
650 {
651 GNUNET_FS_uri_ksk_add_keyword (kpc->ksk, counter->value, GNUNET_NO);
652 }
653 GNUNET_free (counter);
654 return GNUNET_YES;
655}
656/**
657 * Copy "frequent" metadata items over to the
658 * target metadata container, free the counters.
659 *
660 */
661static int
662migrate_and_drop_metadata (void *cls, const GNUNET_HashCode * key, void *value)
663{
664 struct MetaProcessContext *mpc = cls;
665 struct MetaCounter *counter = value;
666
667 if (counter->count >= mpc->threshold && counter->count > 1)
668 {
669 GNUNET_CONTAINER_meta_data_insert (mpc->meta,
670 counter->plugin_name,
671 counter->type,
672 counter->format,
673 counter->data_mime_type, counter->data,
674 counter->data_size);
675 }
676 GNUNET_free (counter);
677 return GNUNET_YES;
678}
679
680/**
681 * Go over the collected keywords from all entries in the
682 * directory and push common keywords up one level (by
683 * adding it to the returned struct). Do the same for metadata.
684 * Destroys keywordcounter and metacoutner for current directory.
685 *
686 * @param adc collection of child meta data
687 * @param exclude_ksk pointer to where moveable keywords will be stored
688 * @param copy_meta pointer to where copyable metadata will be stored
689 */
690static void
691process_keywords_and_metadata (struct ProcessMetadataStackItem *stack,
692 struct GNUNET_FS_Uri **exclude_ksk,
693 struct GNUNET_CONTAINER_MetaData **copy_meta)
694{
695 struct KeywordProcessContext kpc;
696 struct MetaProcessContext mpc;
697 struct GNUNET_CONTAINER_MetaData *tmp;
698
699 /* Surprisingly, it's impossible to create a ksk with 0 keywords directly.
700 * But we can create one from an empty metadata set
701 */
702 tmp = GNUNET_CONTAINER_meta_data_create ();
703 kpc.ksk = GNUNET_FS_uri_ksk_create_from_meta_data (tmp);
704 GNUNET_CONTAINER_meta_data_destroy (tmp);
705 mpc.meta = GNUNET_CONTAINER_meta_data_create ();
706
707 kpc.threshold = mpc.threshold = (stack->dir_entry_count + 1) / 2; /* 50% */
708
709 GNUNET_CONTAINER_multihashmap_iterate (stack->keywordcounter,
710 &migrate_and_drop, &kpc);
711 GNUNET_CONTAINER_multihashmap_iterate (stack->metacounter,
712 &migrate_and_drop_metadata, &mpc);
713
714 GNUNET_CONTAINER_multihashmap_destroy (stack->keywordcounter);
715 GNUNET_CONTAINER_multihashmap_destroy (stack->metacounter);
716 *exclude_ksk = kpc.ksk;
717 *copy_meta = mpc.meta;
718}
719
720/**
721 * Function called by the directory iterator to
722 * (recursively) add all of the files in the
723 * directory to the tree.
724 * Called by the directory scanner to initiate the
725 * scan.
726 * TODO: find a way to make it non-recursive.
727 *
728 * @param cls the 'struct AddDirContext*' we're in
729 * @param filename file or directory to scan
730 */
731static int
732scan_directory (void *cls, const char *filename)
733{
734 struct AddDirContext *adc = cls, recurse_adc;
735 struct stat sbuf;
736 struct ShareTreeItem *item;
737 const char *short_fn;
738 int do_stop = 0;
739
740 /* Wrap up fast */
741 if (adc->do_stop)
742 return GNUNET_SYSERR;
743
744 /* If the file doesn't exist (or is not statable for any other reason,
745 * skip it, and report it.
746 */
747 if (0 != STAT (filename, &sbuf))
748 {
749 do_stop = write_progress (adc, filename, S_ISDIR (sbuf.st_mode),
750 GNUNET_DIR_SCANNER_DOES_NOT_EXIST);
751 return GNUNET_OK;
752 }
753
754 /* Report the progress */
755 do_stop = write_progress (adc, filename, S_ISDIR (sbuf.st_mode),
756 GNUNET_DIR_SCANNER_NEW_FILE);
757 if (do_stop)
758 {
759 /* We were asked to stop, acknowledge that and return */
760 do_stop = write_progress (adc, filename, S_ISDIR (sbuf.st_mode),
761 GNUNET_DIR_SCANNER_ASKED_TO_STOP);
762 return GNUNET_SYSERR;
763 }
764
765 if (!S_ISDIR (sbuf.st_mode))
766 extract_file (adc, filename);
767 else
768 {
769 item = make_item (adc->parent);
770 item->meta = GNUNET_CONTAINER_meta_data_create ();
771
772 item->is_directory = GNUNET_YES;
773
774 /* copy fields from adc */
775 recurse_adc = *adc;
776 /* replace recurse_adc contents with the ones for this directory */
777 recurse_adc.parent = item;
778
779 /* recurse into directory */
780 GNUNET_DISK_directory_scan (filename, &scan_directory, &recurse_adc);
781
782 short_fn = GNUNET_STRINGS_get_short_name (filename);
783
784 item->filename = GNUNET_strdup (filename);
785 item->short_filename = GNUNET_strdup (short_fn);
786
787 if (adc->parent == NULL)
788 {
789 /* we're finished with the scan, make sure caller gets the top-level
790 * directory pointer
791 */
792 adc->parent = item;
793 }
794 }
795 return GNUNET_OK;
796}
797
798/**
799 * Signals the scanner to finish the scan as fast as possible.
800 * Does not block.
801 * Can close the pipe if asked to, but that is only used by the
802 * internal call to this function during cleanup. The client
803 * must understand the consequences of closing the pipe too early.
804 *
805 * @param ds directory scanner structure
806 * @param close_pipe GNUNET_YES to close
807 */
808void
809GNUNET_FS_directory_scan_finish (struct GNUNET_FS_DirScanner *ds,
810 int close_pipe)
811{
812#if WINDOWS
813 SetEvent (ds->stop);
814#else
815 sem_post (&ds->stop);
816#endif
817 if (close_pipe)
818 {
819 if (ds->progress_read_task != GNUNET_SCHEDULER_NO_TASK)
820 {
821 GNUNET_SCHEDULER_cancel (ds->progress_read_task);
822 ds->progress_read_task = GNUNET_SCHEDULER_NO_TASK;
823 }
824 GNUNET_DISK_pipe_close_end (ds->progress_pipe, GNUNET_DISK_PIPE_END_READ);
825 ds->progress_read = NULL;
826 }
827}
828
829/**
830 * Signals the scanner thread to finish (in case it isn't finishing
831 * already) and joins the scanner thread. Closes the pipes, frees the
832 * scanner contexts (both of them), returns the results of the scan.
833 * Results are valid (and have to be freed) even if the scanner had
834 * an error or was rushed to finish prematurely.
835 * Blocks until the scanner is finished.
836 *
837 * @param ds directory scanner structure
838 * @return the results of the scan (a directory tree)
839 */
840struct ShareTreeItem *
841GNUNET_FS_directory_scan_cleanup (struct GNUNET_FS_DirScanner *ds)
842{
843 struct ShareTreeItem *result;
844
845 GNUNET_FS_directory_scan_finish (ds, GNUNET_YES);
846#if WINDOWS
847 WaitForSingleObject (ds->thread, INFINITE);
848 CloseHandle (ds->stop);
849 CloseHandle (ds->thread);
850#else
851 pthread_join (ds->thread, NULL);
852 sem_destroy (&ds->stop);
853 pthread_detach (ds->thread);
854#endif
855
856 GNUNET_DISK_pipe_close (ds->progress_pipe);
857 result = ds->adc->parent;
858 GNUNET_free (ds->adc);
859 GNUNET_free (ds);
860 return result;
861}
862
863/**
864 * The function from which the scanner thread starts
865 */
866#if WINDOWS
867static DWORD
868#else
869static int
870#endif
871run_directory_scan_thread (struct AddDirContext *adc)
872{
873 scan_directory (adc, adc->filename_expanded);
874 GNUNET_free (adc->filename_expanded);
875 if (adc->plugins != NULL)
876 EXTRACTOR_plugin_remove_all (adc->plugins);
877 /* Tell the initiator that we're finished, it can now join the thread */
878 write_progress (adc, NULL, 0, GNUNET_DIR_SCANNER_FINISHED);
879 return 0;
880}
881
882/**
883 * Called every time there is data to read from the scanner.
884 * Calls the scanner progress handler.
885 *
886 * @param cls the closure (directory scanner object)
887 * @param tc task context in which the task is running
888 */
889static void
890read_progress_task (void *cls, const struct GNUNET_SCHEDULER_TaskContext *tc)
891{
892 struct GNUNET_FS_DirScanner *ds;
893 int end_it = 0;
894 enum GNUNET_DirScannerProgressUpdateReason reason;
895 ssize_t rd;
896 ssize_t total_read;
897
898 size_t filename_len;
899 char is_directory;
900 char *filename;
901
902 ds = cls;
903
904 ds->progress_read_task = GNUNET_SCHEDULER_NO_TASK;
905
906 if (!(tc->reason & GNUNET_SCHEDULER_REASON_READ_READY))
907 {
908 ds->progress_callback (ds->cls, ds, NULL, 0, GNUNET_DIR_SCANNER_SHUTDOWN);
909 return;
910 }
911
912 /* Read one message. If message is malformed or can't be read, end the scanner */
913 total_read = rd = GNUNET_DISK_file_read (ds->progress_read, &reason, sizeof (reason));
914 while (rd > 0 && total_read < sizeof (reason))
915 {
916 rd = GNUNET_DISK_file_read (ds->progress_read,
917 &((char *) &reason)[total_read],
918 sizeof (reason) - total_read);
919 if (rd > 0)
920 total_read += rd;
921 }
922 if (total_read != sizeof (reason)
923 || reason <= GNUNET_DIR_SCANNER_FIRST
924 || reason >= GNUNET_DIR_SCANNER_LAST)
925 {
926 end_it = 1;
927 reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;
928 }
929
930 if (!end_it)
931 {
932 total_read = rd = GNUNET_DISK_file_read (ds->progress_read, &filename_len,
933 sizeof (size_t));
934 while (rd > 0 && total_read < sizeof (size_t))
935 {
936 rd = GNUNET_DISK_file_read (ds->progress_read,
937 &((char *) &filename_len)[total_read],
938 sizeof (size_t) - total_read);
939 if (rd > 0)
940 total_read += rd;
941 }
942 if (rd != sizeof (size_t))
943 {
944 end_it = 1;
945 reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;
946 }
947 }
948 if (!end_it)
949 {
950 if (filename_len == 0)
951 end_it = 1;
952 else if (filename_len > MAX_PATH)
953 {
954 end_it = 1;
955 reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;
956 }
957 }
958 if (!end_it)
959 {
960 filename = GNUNET_malloc (filename_len);
961 total_read = rd = GNUNET_DISK_file_read (ds->progress_read, filename,
962 filename_len);
963 while (rd > 0 && total_read < filename_len)
964 {
965 rd = GNUNET_DISK_file_read (ds->progress_read, &filename[total_read],
966 filename_len - total_read);
967 if (rd > 0)
968 total_read += rd;
969 }
970 if (rd != filename_len)
971 {
972 GNUNET_free (filename);
973 reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;
974 end_it = 1;
975 }
976 }
977 if (!end_it && filename_len > 0)
978 {
979 total_read = rd = GNUNET_DISK_file_read (ds->progress_read, &is_directory,
980 sizeof (char));
981 while (rd > 0 && total_read < sizeof (char))
982 {
983 rd = GNUNET_DISK_file_read (ds->progress_read, &(&is_directory)[total_read],
984 sizeof (char) - total_read);
985 if (rd > 0)
986 total_read += rd;
987 }
988 if (rd != sizeof (char))
989 {
990 GNUNET_free (filename);
991 reason = GNUNET_DIR_SCANNER_PROTOCOL_ERROR;
992 end_it = 1;
993 }
994 }
995 if (!end_it)
996 {
997 end_it = ds->progress_callback (ds->cls, ds, (const char *) filename, is_directory, reason);
998 GNUNET_free (filename);
999 if (!end_it)
1000 {
1001 ds->progress_read_task = GNUNET_SCHEDULER_add_read_file (
1002 GNUNET_TIME_UNIT_FOREVER_REL, ds->progress_read, &read_progress_task,
1003 cls);
1004 }
1005 }
1006 else
1007 {
1008 ds->progress_callback (ds->cls, ds, NULL, 0, reason);
1009 }
1010}
1011
1012
1013/**
1014 * Start a directory scanner thread.
1015 *
1016 * @param filename name of the directory to scan
1017 * @param GNUNET_YES to not to run libextractor on files (only build a tree)
1018 * @param ex if not NULL, must be a list of extra plugins for extractor
1019 * @param cb the callback to call when there are scanning progress messages
1020 * @param cls closure for 'cb'
1021 * @return directory scanner object to be used for controlling the scanner
1022 */
1023struct GNUNET_FS_DirScanner *
1024GNUNET_FS_directory_scan_start (const char *filename,
1025 int disable_extractor, const char *ex,
1026 GNUNET_FS_DirScannerProgressCallback cb, void *cls)
1027{
1028 struct stat sbuf;
1029 struct AddDirContext *adc;
1030 char *filename_expanded;
1031 struct GNUNET_FS_DirScanner *ds;
1032 struct GNUNET_DISK_PipeHandle *progress_pipe;
1033 int ok;
1034
1035 if (0 != STAT (filename, &sbuf))
1036 return NULL;
1037 /* TODO: consider generalizing this for files too! */
1038 if (!S_ISDIR (sbuf.st_mode))
1039 {
1040 GNUNET_break (0);
1041 return NULL;
1042 }
1043 /* scan_directory() is guaranteed to be given expanded filenames,
1044 * so expand we will!
1045 */
1046 filename_expanded = GNUNET_STRINGS_filename_expand (filename);
1047 if (filename_expanded == NULL)
1048 return NULL;
1049
1050 progress_pipe = GNUNET_DISK_pipe (GNUNET_NO, GNUNET_NO, GNUNET_NO);
1051 if (progress_pipe == NULL)
1052 {
1053 GNUNET_free (filename_expanded);
1054 return NULL;
1055 }
1056
1057 adc = GNUNET_malloc (sizeof (struct AddDirContext));
1058
1059 ds = GNUNET_malloc (sizeof (struct GNUNET_FS_DirScanner));
1060
1061 ds->adc = adc;
1062
1063#if WINDOWS
1064 ds->stop = CreateEvent (NULL, TRUE, FALSE, NULL);
1065 adc->stop = ds->stop;
1066 ok = ds->stop != INVALID_HANDLE_VALUE;
1067#else
1068 ok = !sem_init (&ds->stop, 0, 0);
1069 adc = &ds->stop;
1070#endif
1071 if (!ok)
1072 {
1073 GNUNET_free (adc);
1074 GNUNET_free (ds);
1075 GNUNET_free (filename_expanded);
1076 GNUNET_DISK_pipe_close (progress_pipe);
1077 return NULL;
1078 }
1079
1080 adc->plugins = NULL;
1081 if (!disable_extractor)
1082 {
1083 adc->plugins = EXTRACTOR_plugin_add_defaults (
1084 EXTRACTOR_OPTION_DEFAULT_POLICY);
1085 if (ex && strlen (ex) > 0)
1086 adc->plugins = EXTRACTOR_plugin_add_config (adc->plugins, ex,
1087 EXTRACTOR_OPTION_DEFAULT_POLICY);
1088 }
1089
1090 adc->filename_expanded = filename_expanded;
1091 adc->progress_write = GNUNET_DISK_pipe_handle (progress_pipe,
1092 GNUNET_DISK_PIPE_END_WRITE);
1093
1094
1095 ds->progress_read = GNUNET_DISK_pipe_handle (progress_pipe,
1096 GNUNET_DISK_PIPE_END_READ);
1097
1098#if WINDOWS
1099 ds->thread = CreateThread (NULL, 0,
1100 (LPTHREAD_START_ROUTINE) &run_directory_scan_thread, (LPVOID) adc,
1101 0, NULL);
1102 ok = ds->thread != NULL;
1103#else
1104 ok = !pthread_create (&ds->thread, NULL, &run_directory_scan_thread,
1105 (void *) adc);
1106#endif
1107 if (!ok)
1108 {
1109 GNUNET_free (adc);
1110 GNUNET_free (filename_expanded);
1111 GNUNET_DISK_pipe_close (progress_pipe);
1112 GNUNET_free (ds);
1113 return NULL;
1114 }
1115
1116 ds->progress_callback = cb;
1117 ds->cls = cls;
1118 ds->adc = adc;
1119 ds->progress_pipe = progress_pipe;
1120
1121 ds->progress_read_task = GNUNET_SCHEDULER_add_read_file (
1122 GNUNET_TIME_UNIT_FOREVER_REL, ds->progress_read, &read_progress_task,
1123 ds);
1124
1125 return ds;
1126}
1127
1128/**
1129 * Task that post-processes the share item tree.
1130 * This processing has to be done in the main thread, because
1131 * it requires access to libgcrypt's hashing functions, and
1132 * libgcrypt is not thread-safe without some special magic.
1133 *
1134 * @param cls top of the stack
1135 * @param tc task context
1136 */
1137static void
1138trim_share_tree_task (void *cls,
1139 const struct GNUNET_SCHEDULER_TaskContext *tc)
1140{
1141 struct ProcessMetadataStackItem *stack = cls;
1142 struct ProcessMetadataStackItem *next = stack;
1143 /* FIXME: figure out what to do when tc says we're shutting down */
1144
1145 /* item == NULL means that we've just finished going over the children of
1146 * current directory.
1147 */
1148 if (stack->item == NULL)
1149 {
1150 if (stack->parent->item != NULL)
1151 {
1152 /* end of a directory */
1153 struct GNUNET_FS_Uri *ksk;
1154
1155 /* use keyword and metadata counters to create lists of keywords to move
1156 * and metadata to copy.
1157 */
1158 process_keywords_and_metadata (stack, &stack->parent->exclude_ksk, &stack->parent->item->meta);
1159
1160 /* create keywords from metadata (copies all text-metadata as keywords,
1161 * AND parses the directory name we've just added, producing even more
1162 * keywords.
1163 * then merge these keywords with the ones moved from children.
1164 */
1165 ksk = GNUNET_FS_uri_ksk_create_from_meta_data (stack->parent->item->meta);
1166 stack->parent->item->ksk_uri = GNUNET_FS_uri_ksk_merge (ksk, stack->parent->exclude_ksk);
1167 GNUNET_FS_uri_destroy (ksk);
1168
1169 /* remove moved keywords from children (complete the move) */
1170 remove_keywords (stack->parent, stack->parent->item);
1171 GNUNET_FS_uri_destroy (stack->parent->exclude_ksk);
1172
1173 /* go up the stack */
1174 next = stack->parent;
1175 GNUNET_free (stack);
1176 next->end_directory = GNUNET_YES;
1177 }
1178 else
1179 {
1180 /* we've just finished processing the toplevel directory */
1181 struct ProcessMetadataContext *ctx = stack->ctx;
1182 next = NULL;
1183 GNUNET_SCHEDULER_add_continuation (ctx->cb, ctx->cls,
1184 GNUNET_SCHEDULER_REASON_PREREQ_DONE);
1185 GNUNET_free (stack->parent);
1186 GNUNET_free (stack);
1187 GNUNET_free (ctx);
1188 }
1189 }
1190 else if (stack->item->is_directory
1191 && !stack->end_directory
1192 && stack->item->children_head != NULL)
1193 {
1194 /* recurse into subdirectory */
1195 next = GNUNET_malloc (sizeof (struct ProcessMetadataStackItem));
1196 next->ctx = stack->ctx;
1197 next->item = stack->item->children_head;
1198 next->keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024);
1199 next->metacounter = GNUNET_CONTAINER_multihashmap_create (1024);
1200 next->dir_entry_count = 0;
1201 next->parent = stack;
1202 }
1203 else
1204 {
1205 /* process a child entry (a file or a directory) and move to the next one*/
1206 if (stack->item->is_directory)
1207 stack->end_directory = GNUNET_NO;
1208 stack->dir_entry_count++;
1209 GNUNET_CONTAINER_meta_data_iterate (stack->item->meta, &add_to_meta_counter, stack->metacounter);
1210
1211 if (stack->item->is_directory)
1212 {
1213 char *user = getenv ("USER");
1214 if ((user == NULL) || (0 != strncasecmp (user, stack->item->short_filename, strlen(user))))
1215 {
1216 /* only use filename if it doesn't match $USER */
1217 GNUNET_CONTAINER_meta_data_insert (stack->item->meta, "<libgnunetfs>",
1218 EXTRACTOR_METATYPE_FILENAME,
1219 EXTRACTOR_METAFORMAT_UTF8,
1220 "text/plain", stack->item->short_filename,
1221 strlen (stack->item->short_filename) + 1);
1222 GNUNET_CONTAINER_meta_data_insert (stack->item->meta, "<libgnunetfs>",
1223 EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME,
1224 EXTRACTOR_METAFORMAT_UTF8,
1225 "text/plain", stack->item->short_filename,
1226 strlen (stack->item->short_filename) + 1);
1227 }
1228 }
1229
1230 stack->item->ksk_uri = GNUNET_FS_uri_ksk_create_from_meta_data (stack->item->meta);
1231 GNUNET_FS_uri_ksk_get_keywords (stack->item->ksk_uri, &add_to_keyword_counter, stack->keywordcounter);
1232 stack->item = stack->item->next;
1233 }
1234 /* Call this task again later, if there are more entries to process */
1235 if (next)
1236 GNUNET_SCHEDULER_add_continuation (&trim_share_tree_task, next,
1237 GNUNET_SCHEDULER_REASON_PREREQ_DONE);
1238}
1239
1240/**
1241 * Process a share item tree, moving frequent keywords up and
1242 * copying frequent metadata up.
1243 *
1244 * @param toplevel toplevel directory in the tree, returned by the scanner
1245 * @param cb called after processing is done
1246 * @param cls closure for 'cb'
1247 */
1248struct ProcessMetadataContext *
1249GNUNET_FS_trim_share_tree (struct ShareTreeItem *toplevel,
1250 GNUNET_SCHEDULER_Task cb, void *cls)
1251{
1252 struct ProcessMetadataContext *ret;
1253
1254 if (toplevel == NULL)
1255 {
1256 struct GNUNET_SCHEDULER_TaskContext tc;
1257 tc.reason = GNUNET_SCHEDULER_REASON_PREREQ_DONE;
1258 cb (cls, &tc);
1259 return NULL;
1260 }
1261
1262 ret = GNUNET_malloc (sizeof (struct ProcessMetadataContext));
1263 ret->toplevel = toplevel;
1264 ret->stack = GNUNET_malloc (sizeof (struct ProcessMetadataStackItem));
1265 ret->stack->ctx = ret;
1266 ret->stack->item = toplevel;
1267 ret->stack->keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024);
1268 ret->stack->metacounter = GNUNET_CONTAINER_multihashmap_create (1024);
1269 ret->stack->dir_entry_count = 0;
1270 ret->stack->end_directory = GNUNET_NO;
1271
1272 /* dummy stack entry that tells us we're at the top of the stack */
1273 ret->stack->parent = GNUNET_malloc (sizeof (struct ProcessMetadataStackItem));
1274 ret->stack->parent->ctx = ret;
1275
1276 ret->cb = cb;
1277 ret->cls = cls;
1278
1279 GNUNET_SCHEDULER_add_continuation (&trim_share_tree_task, ret->stack,
1280 GNUNET_SCHEDULER_REASON_PREREQ_DONE);
1281 return ret;
1282} \ No newline at end of file