about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christian Grothoff <christian@grothoff.org> 2011-12-30 23:52:54 +0000
committer Christian Grothoff <christian@grothoff.org> 2011-12-30 23:52:54 +0000
commit fa479ec59f4dd772e9ea562e4a30ed3031f47925 (patch)
tree c6ba808051efc048cb9e71449a319d4cbd04985a
parent 684af0945283ced2830b0533fb22e818713a8a48 (diff)
download gnunet-gtk-fa479ec59f4dd772e9ea562e4a30ed3031f47925.tar.gz
gnunet-gtk-fa479ec59f4dd772e9ea562e4a30ed3031f47925.zip
-LRN: applying patch 4 from #2046 - Count and propagate keywords instead of metadata
-rw-r--r-- src/fs/gnunet-fs-gtk-main_window_file_publish.c 186
1 files changed, 76 insertions, 110 deletions
diff --git a/src/fs/gnunet-fs-gtk-main_window_file_publish.c b/src/fs/gnunet-fs-gtk-main_window_file_publish.c
index 16df7b93..b1f8be98 100644
--- a/src/fs/gnunet-fs-gtk-main_window_file_publish.c
+++ b/src/fs/gnunet-fs-gtk-main_window_file_publish.c
@@ -329,6 +329,11 @@ struct PublishData
329 struct GNUNET_CONTAINER_MetaData *meta; 329 struct GNUNET_CONTAINER_MetaData *meta;
330 330
331 /** 331 /**
332 * Keywords for the file (derived from metadata).
333 */
334 struct GNUNET_FS_Uri *ksk_uri;
335
336 /**
332 * Iterator for the entry. 337 * Iterator for the entry.
333 */ 338 */
334 GtkTreeIter iter; 339 GtkTreeIter iter;
@@ -336,10 +341,10 @@ struct PublishData
336 341
337 342
338/** 343/**
339 * Entry for each unique meta data entry to track how often 344 * Entry for each unique keyword to track how often
340 * it occurred. Contains the keyword and the counter. 345 * it occurred. Contains the keyword and the counter.
341 */ 346 */
342struct MetaCounter 347struct KeywordCounter
343{ 348{
344 349
345 /** 350 /**
@@ -348,21 +353,6 @@ struct MetaCounter
348 const char *value; 353 const char *value;
349 354
350 /** 355 /**
351 * Mimetype of the value.
352 */
353 const char *value_mimetype;
354
355 /**
356 * Type of the value.
357 */
358 enum EXTRACTOR_MetaType type;
359
360 /**
361 * Format of the value.
362 */
363 enum EXTRACTOR_MetaFormat format;
364
365 /**
366 * How many files have meta entries matching this value? 356 * How many files have meta entries matching this value?
367 * (type and format do not have to match). 357 * (type and format do not have to match).
368 */ 358 */
@@ -388,11 +378,11 @@ struct AddDirContext
388 GtkTreeStore *ts; 378 GtkTreeStore *ts;
389 379
390 /** 380 /**
391 * Map from the hash over the meta value to an 'struct MetaCounter' 381 * Map from the hash over the keyword to an 'struct KeywordCounter'
392 * counter that says how often this value was 382 * counter that says how often this keyword was
393 * encountered in the current directory. 383 * encountered in the current directory.
394 */ 384 */
395 struct GNUNET_CONTAINER_MultiHashMap *metacounter; 385 struct GNUNET_CONTAINER_MultiHashMap *keywordcounter;
396 386
397 /** 387 /**
398 * Map from the hash of a filename in the current directory 388 * Map from the hash of a filename in the current directory
@@ -401,10 +391,10 @@ struct AddDirContext
401 struct GNUNET_CONTAINER_MultiHashMap *metamap; 391 struct GNUNET_CONTAINER_MultiHashMap *metamap;
402 392
403 /** 393 /**
404 * Metadata to exclude from using for KSK since it'll be associated 394 * Keywords to exclude from using for KSK since they'll be associated
405 * with the parent as well. NULL for nothing blocked. 395 * with the parent as well. NULL for nothing blocked.
406 */ 396 */
407 struct GNUNET_CONTAINER_MetaData *no_ksk; 397 struct GNUNET_FS_Uri *exclude_ksk;
408 398
409 /** 399 /**
410 * Block options to use. 400 * Block options to use.
@@ -424,71 +414,45 @@ struct AddDirContext
424 414
425 415
426/** 416/**
427 * Add the given meta data item to the 417 * Add the given keyword to the
428 * meta data statistics tracker. 418 * keyword statistics tracker.
429 * 419 *
430 * @param cls closure (user-defined) 420 * @param cls closure (user-defined)
431 * @param plugin_name name of the plugin that produced this value; 421 * @param keyword the keyword to count
432 * special values can be used (i.e. '<zlib>' for zlib being 422 * @param is_mandatory ignored
433 * used in the main libextractor library and yielding 423 * @return always GNUNET_OK
434 * meta data).
435 * @param type libextractor-type describing the meta data
436 * @param format basic format information about data
437 * @param data_mime_type mime-type of data (not of the original file);
438 * can be NULL (if mime-type is not known)
439 * @param data actual meta-data found
440 * @param data_len number of bytes in data
441 * @return 0 to continue extracting, 1 to abort
442 */ 424 */
443static int 425static int
444add_to_meta_counter (void *cls, const char *plugin_name, 426add_to_keyword_counter (void *cls, const char *keyword, int is_mandatory)
445 enum EXTRACTOR_MetaType type,
446 enum EXTRACTOR_MetaFormat format,
447 const char *data_mime_type, const char *data,
448 size_t data_len)
449{ 427{
450 struct GNUNET_CONTAINER_MultiHashMap *mcm = cls; 428 struct GNUNET_CONTAINER_MultiHashMap *mcm = cls;
451 struct MetaCounter *cnt; 429 struct KeywordCounter *cnt;
452 GNUNET_HashCode hc; 430 GNUNET_HashCode hc;
453 size_t mlen; 431 size_t klen;
454 size_t dlen; 432
455 433 klen = strlen (keyword) + 1;
456 if ((format != EXTRACTOR_METAFORMAT_UTF8) && 434 GNUNET_CRYPTO_hash (keyword, klen - 1, &hc);
457 (format != EXTRACTOR_METAFORMAT_C_STRING))
458 return 0;
459 dlen = strlen (data) + 1;
460 GNUNET_CRYPTO_hash (data, dlen - 1, &hc);
461 cnt = GNUNET_CONTAINER_multihashmap_get (mcm, &hc); 435 cnt = GNUNET_CONTAINER_multihashmap_get (mcm, &hc);
462 if (cnt == NULL) 436 if (cnt == NULL)
463 { 437 {
464 mlen = strlen (data_mime_type) + 1; 438 cnt = GNUNET_malloc (sizeof (struct KeywordCounter) + klen);
465 cnt = GNUNET_malloc (sizeof (struct MetaCounter) + dlen + mlen);
466 cnt->count = 1; 439 cnt->count = 1;
467 cnt->value = (const char *) &cnt[1]; 440 cnt->value = (const char *) &cnt[1];
468 cnt->value_mimetype = &cnt->value[dlen]; 441 memcpy (&cnt[1], keyword, klen);
469 memcpy (&cnt[1], data, dlen);
470 memcpy ((char *) cnt->value_mimetype, data_mime_type, mlen);
471 cnt->type = type;
472 cnt->format = format;
473 GNUNET_CONTAINER_multihashmap_put (mcm, &hc, cnt, 442 GNUNET_CONTAINER_multihashmap_put (mcm, &hc, cnt,
474 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY); 443 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY);
475
476 } 444 }
477 else 445 else
478 { 446 {
479 cnt->count++; 447 cnt->count++;
480 if (cnt->format == EXTRACTOR_METAFORMAT_C_STRING)
481 cnt->format = format; /* possibly improve to UTF8 */
482 if (cnt->type == EXTRACTOR_METATYPE_UNKNOWN)
483 cnt->type = type;
484 } 448 }
485 return 0; 449 return GNUNET_OK;
486} 450}
487 451
488 452
489/** 453/**
490 * Extract metadata from a file and add it to the metamap and 454 * Extract metadata from a file and add it to the metamap and
491 * the metacounter. 455 * the keywordcounter.
492 * 456 *
493 * @param adc context to modify 457 * @param adc context to modify
494 * @param filename name of the file to process 458 * @param filename name of the file to process
@@ -521,8 +485,8 @@ extract_file (struct AddDirContext *adc, const char *filename)
521 GNUNET_CONTAINER_multihashmap_put (adc->metamap, &hc, pd, 485 GNUNET_CONTAINER_multihashmap_put (adc->metamap, &hc, pd,
522 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY); 486 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY);
523 /* FIXME: what if this put fails? I think it actually can... Why unique only? */ 487 /* FIXME: what if this put fails? I think it actually can... Why unique only? */
524 GNUNET_CONTAINER_meta_data_iterate (pd->meta, &add_to_meta_counter, 488 pd->ksk_uri = GNUNET_FS_uri_ksk_create_from_meta_data (pd->meta);
525 adc->metacounter); 489 GNUNET_FS_uri_ksk_get_keywords (pd->ksk_uri, &add_to_keyword_counter, adc->keywordcounter);
526} 490}
527 491
528 492
@@ -546,21 +510,22 @@ remove_keyword (void *cls, const char *keyword, int is_mandatory)
546 510
547/** 511/**
548 * Add the specifics of the given entry to the tree store. 512 * Add the specifics of the given entry to the tree store.
549 * Derive KSK from the given meta data, but exclude meta 513 * Use keywords from ksk_uri, but exclude the ones given in
550 * data given in "md_no_ksk" for keyword generation. 514 * "md_no_ksk".
551 * 515 *
552 * @param ts tree store to modify 516 * @param ts tree store to modify
553 * @param iter position in the tree store for this file 517 * @param iter position in the tree store for this file
554 * @param filename file to add 518 * @param filename file to add
555 * @param bo block options 519 * @param bo block options
556 * @param do_index should we index or insert? 520 * @param do_index should we index or insert?
557 * @param md_no_ksk metadata with keywords NOT to add 521 * @param ksk_uri keywords to use. Will be destroyed at the end.
558 * @param meta metadata for the file 522 * @param exclude_ksk keywords NOT to use. Won't be modified.
523 * @param meta metadata for the file. Will be destroyed at the end.
559 */ 524 */
560static void 525static void
561add_entry_to_ts (GtkTreeStore * ts, GtkTreeIter * iter, const char *filename, 526add_entry_to_ts (GtkTreeStore * ts, GtkTreeIter * iter, const char *filename,
562 const struct GNUNET_FS_BlockOptions *bo, int do_index, 527 const struct GNUNET_FS_BlockOptions *bo, int do_index,
563 struct GNUNET_CONTAINER_MetaData *md_no_ksk, 528 struct GNUNET_FS_Uri *ksk_uri, struct GNUNET_FS_Uri *exclude_ksk,
564 struct GNUNET_CONTAINER_MetaData *meta) 529 struct GNUNET_CONTAINER_MetaData *meta)
565{ 530{
566 char *file_size_fancy; 531 char *file_size_fancy;
@@ -568,8 +533,6 @@ add_entry_to_ts (GtkTreeStore * ts, GtkTreeIter * iter, const char *filename,
568 GtkTreeRowReference *row_reference; 533 GtkTreeRowReference *row_reference;
569 GtkTreePath *path; 534 GtkTreePath *path;
570 uint64_t file_size; 535 uint64_t file_size;
571 struct GNUNET_FS_Uri *ksk_uri;
572 struct GNUNET_FS_Uri *kill_ksk;
573 const char *ss; 536 const char *ss;
574 const char *short_fn; 537 const char *short_fn;
575 struct stat sbuf; 538 struct stat sbuf;
@@ -588,12 +551,9 @@ add_entry_to_ts (GtkTreeStore * ts, GtkTreeIter * iter, const char *filename,
588 return; 551 return;
589 } 552 }
590 } 553 }
591 ksk_uri = GNUNET_FS_uri_ksk_create_from_meta_data (meta); 554 if (exclude_ksk != NULL)
592 kill_ksk = GNUNET_FS_uri_ksk_create_from_meta_data (md_no_ksk);
593 if (kill_ksk != NULL)
594 { 555 {
595 GNUNET_FS_uri_ksk_get_keywords (kill_ksk, &remove_keyword, ksk_uri); 556 GNUNET_FS_uri_ksk_get_keywords (exclude_ksk, &remove_keyword, ksk_uri);
596 GNUNET_FS_uri_destroy (kill_ksk);
597 } 557 }
598 path = gtk_tree_model_get_path (GTK_TREE_MODEL (ts), iter); 558 path = gtk_tree_model_get_path (GTK_TREE_MODEL (ts), iter);
599 row_reference = gtk_tree_row_reference_new (GTK_TREE_MODEL (ts), path); 559 row_reference = gtk_tree_row_reference_new (GTK_TREE_MODEL (ts), path);
@@ -648,7 +608,7 @@ publish_entry (void *cls, const char *filename)
648 GNUNET_CRYPTO_hash (filename, strlen (filename), &hc); 608 GNUNET_CRYPTO_hash (filename, strlen (filename), &hc);
649 pd = GNUNET_CONTAINER_multihashmap_get (adc->metamap, &hc); 609 pd = GNUNET_CONTAINER_multihashmap_get (adc->metamap, &hc);
650 add_entry_to_ts (adc->ts, &pd->iter, filename, &adc->bo, adc->do_index, 610 add_entry_to_ts (adc->ts, &pd->iter, filename, &adc->bo, adc->do_index,
651 adc->no_ksk, pd->meta); 611 pd->ksk_uri, adc->exclude_ksk, pd->meta);
652 GNUNET_CONTAINER_multihashmap_remove (adc->metamap, &hc, pd); 612 GNUNET_CONTAINER_multihashmap_remove (adc->metamap, &hc, pd);
653 GNUNET_free (pd); 613 GNUNET_free (pd);
654 return GNUNET_OK; 614 return GNUNET_OK;
@@ -658,12 +618,12 @@ publish_entry (void *cls, const char *filename)
658/** 618/**
659 * Context passed to 'migrate_and_drop'. 619 * Context passed to 'migrate_and_drop'.
660 */ 620 */
661struct MetaProcessContext 621struct KeywordProcessContext
662{ 622{
663 /** 623 /**
664 * Metadata with all the keywords we migrated to the parent. 624 * All the keywords we migrated to the parent.
665 */ 625 */
666 struct GNUNET_CONTAINER_MetaData *md; 626 struct GNUNET_FS_Uri *ksk;
667 627
668 /** 628 /**
669 * How often does a keyword have to occur to be 629 * How often does a keyword have to occur to be
@@ -674,22 +634,19 @@ struct MetaProcessContext
674 634
675 635
676/** 636/**
677 * Copy "frequent" meta data entries over to the 637 * Copy "frequent" keywords over to the
678 * target meta data struct, free the counters. 638 * target ksk uri, free the counters.
679 * 639 *
680 */ 640 */
681static int 641static int
682migrate_and_drop (void *cls, const GNUNET_HashCode * key, void *value) 642migrate_and_drop (void *cls, const GNUNET_HashCode * key, void *value)
683{ 643{
684 struct MetaProcessContext *mpc = cls; 644 struct KeywordProcessContext *kpc = cls;
685 struct MetaCounter *counter = value; 645 struct KeywordCounter *counter = value;
686 646
687 if (counter->count >= mpc->threshold && counter->count > 1) 647 if (counter->count >= kpc->threshold && counter->count > 1)
688 { 648 {
689 GNUNET_CONTAINER_meta_data_insert (mpc->md, "<gnunet-gtk>", counter->type, 649 GNUNET_FS_uri_ksk_add_keyword (kpc->ksk, counter->value, GNUNET_NO);
690 counter->format, counter->value_mimetype,
691 counter->value,
692 strlen (counter->value) + 1);
693 } 650 }
694 GNUNET_free (counter); 651 GNUNET_free (counter);
695 return GNUNET_YES; 652 return GNUNET_YES;
@@ -697,24 +654,31 @@ migrate_and_drop (void *cls, const GNUNET_HashCode * key, void *value)
697 654
698 655
699/** 656/**
700 * Go over the collected meta data from all entries in the 657 * Go over the collected keywords from all entries in the
701 * directory and push common meta data up one level (by 658 * directory and push common keywords up one level (by
702 * adding it to the returned struct). 659 * adding it to the returned struct).
703 * 660 *
704 * @param adc collection of child meta data 661 * @param adc collection of child meta data
705 * @return meta data to be moved to parent 662 * @return meta data to be moved to parent
706 */ 663 */
707static struct GNUNET_CONTAINER_MetaData * 664static struct GNUNET_FS_Uri *
708process_metadata (struct AddDirContext *adc) 665process_keywords (struct AddDirContext *adc)
709{ 666{
710 struct MetaProcessContext mpc; 667 struct KeywordProcessContext kpc;
711 668 struct GNUNET_CONTAINER_MetaData *tmp;
712 mpc.md = GNUNET_CONTAINER_meta_data_create (); 669
713 mpc.threshold = (adc->dir_entry_count + 1) / 2; /* 50% */ 670 tmp = GNUNET_CONTAINER_meta_data_create ();
714 GNUNET_CONTAINER_multihashmap_iterate (adc->metacounter, &migrate_and_drop, 671
715 &mpc); 672 /* Surprisingly, it's impossible to create a ksk with 0 keywords directly.
716 GNUNET_CONTAINER_multihashmap_destroy (adc->metacounter); 673 * But we can create one from an empty metadata set
717 return mpc.md; 674 */
675 kpc.ksk = GNUNET_FS_uri_ksk_create_from_meta_data (tmp);
676 GNUNET_CONTAINER_meta_data_destroy (tmp);
677 kpc.threshold = (adc->dir_entry_count + 1) / 2; /* 50% */
678 GNUNET_CONTAINER_multihashmap_iterate (adc->keywordcounter, &migrate_and_drop,
679 &kpc);
680 GNUNET_CONTAINER_multihashmap_destroy (adc->keywordcounter);
681 return kpc.ksk;
718} 682}
719 683
720 684
@@ -735,7 +699,7 @@ scan_directory (void *cls, const char *filename)
735 struct PublishData *pd; 699 struct PublishData *pd;
736 GNUNET_HashCode hc; 700 GNUNET_HashCode hc;
737 struct GNUNET_CONTAINER_MultiHashMap *mhm; 701 struct GNUNET_CONTAINER_MultiHashMap *mhm;
738 struct GNUNET_CONTAINER_MultiHashMap *mcm; 702 struct GNUNET_CONTAINER_MultiHashMap *kcm;
739 unsigned int pc; 703 unsigned int pc;
740 const char *ss; 704 const char *ss;
741 const char *short_fn; 705 const char *short_fn;
@@ -747,21 +711,22 @@ scan_directory (void *cls, const char *filename)
747 { 711 {
748 parent = adc->parent; 712 parent = adc->parent;
749 mhm = adc->metamap; 713 mhm = adc->metamap;
750 mcm = adc->metacounter; 714 kcm = adc->keywordcounter;
751 pc = adc->dir_entry_count; 715 pc = adc->dir_entry_count;
752 adc->metamap = GNUNET_CONTAINER_multihashmap_create (1024); 716 adc->metamap = GNUNET_CONTAINER_multihashmap_create (1024);
753 adc->metacounter = GNUNET_CONTAINER_multihashmap_create (1024); 717 adc->keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024);
754 adc->dir_entry_count = 0; 718 adc->dir_entry_count = 0;
755 pd = GNUNET_malloc (sizeof (struct PublishData)); 719 pd = GNUNET_malloc (sizeof (struct PublishData));
756 gtk_tree_store_insert_before (adc->ts, &pd->iter, parent, NULL); 720 gtk_tree_store_insert_before (adc->ts, &pd->iter, parent, NULL);
757 adc->parent = &pd->iter; 721 adc->parent = &pd->iter;
758 GNUNET_DISK_directory_scan (filename, &scan_directory, adc); 722 GNUNET_DISK_directory_scan (filename, &scan_directory, adc);
759 pd->meta = process_metadata (adc); 723 pd->ksk_uri = process_keywords (adc);
760 adc->no_ksk = pd->meta; 724 pd->meta = GNUNET_CONTAINER_meta_data_create ();
725 adc->exclude_ksk = GNUNET_FS_uri_dup (pd->ksk_uri);
761 GNUNET_DISK_directory_scan (filename, &publish_entry, adc); 726 GNUNET_DISK_directory_scan (filename, &publish_entry, adc);
762 GNUNET_CONTAINER_multihashmap_destroy (adc->metamap); 727 GNUNET_CONTAINER_multihashmap_destroy (adc->metamap);
763 adc->metamap = mhm; 728 adc->metamap = mhm;
764 adc->metacounter = mcm; 729 adc->keywordcounter = kcm;
765 adc->parent = parent; 730 adc->parent = parent;
766 adc->dir_entry_count = pc + 1; 731 adc->dir_entry_count = pc + 1;
767 short_fn = filename; 732 short_fn = filename;
@@ -787,12 +752,13 @@ scan_directory (void *cls, const char *filename)
787 GNUNET_CONTAINER_multihashmap_put (adc->metamap, &hc, pd, 752 GNUNET_CONTAINER_multihashmap_put (adc->metamap, &hc, pd,
788 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY); 753 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY);
789 /* FIXME: what if this put fails? I think it actually can... Why unique only? */ 754 /* FIXME: what if this put fails? I think it actually can... Why unique only? */
790 GNUNET_CONTAINER_meta_data_iterate (pd->meta, &add_to_meta_counter, mcm); 755 GNUNET_FS_uri_ksk_get_keywords (pd->ksk_uri, &add_to_keyword_counter, kcm);
791 } 756 }
792 else 757 else
793 { 758 {
759 GNUNET_assert (kcm == NULL);
794 add_entry_to_ts (adc->ts, &pd->iter, filename, &adc->bo, adc->do_index, 760 add_entry_to_ts (adc->ts, &pd->iter, filename, &adc->bo, adc->do_index,
795 NULL, pd->meta); 761 pd->ksk_uri, NULL, pd->meta);
796 } 762 }
797 } 763 }
798 else 764 else