aboutsummaryrefslogtreecommitdiff
path: root/src/fs/fs_file_information.c
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2011-10-21 13:48:25 +0000
committerChristian Grothoff <christian@grothoff.org>2011-10-21 13:48:25 +0000
commit52bf657944215306b0753eede6285d4296baa884 (patch)
tree5b06e4d928b8de1fa658f8334b2b7adb6828b446 /src/fs/fs_file_information.c
parenta30718300ec364b57daf8ef68559f947c9efa672 (diff)
downloadgnunet-52bf657944215306b0753eede6285d4296baa884.tar.gz
gnunet-52bf657944215306b0753eede6285d4296baa884.zip
more keyword canonicalization, fix threshold, update testcases:
Diffstat (limited to 'src/fs/fs_file_information.c')
-rw-r--r--src/fs/fs_file_information.c125
1 files changed, 109 insertions, 16 deletions
diff --git a/src/fs/fs_file_information.c b/src/fs/fs_file_information.c
index a07ebe2cf..ce4f189a9 100644
--- a/src/fs/fs_file_information.c
+++ b/src/fs/fs_file_information.c
@@ -520,6 +520,25 @@ update_metamap (void *cls,
520 520
521 521
522/** 522/**
523 * Aggregate information we keep for keywords in each directory.
524 */
525struct KeywordInformation
526{
527
528 /**
529 * The keyword (points into the entry's keyword string, past its first character).
530 */
531 const char *keyword;
532
533 /**
534 * How often does this keyword occur in this directory?
535 */
536 unsigned int frequency;
537
538};
539
540
541/**
523 * Closure for dirproc function. 542 * Closure for dirproc function.
524 */ 543 */
525struct EntryProcCls 544struct EntryProcCls
@@ -538,6 +557,13 @@ struct EntryProcCls
538 struct GNUNET_CONTAINER_MultiHashMap *metamap; 557 struct GNUNET_CONTAINER_MultiHashMap *metamap;
539 558
540 /** 559 /**
560 * Map describing the keywords for all entries in the
561 * directory. Keys are the hash of the keyword,
562 * values are of type 'struct KeywordInformation'.
563 */
564 struct GNUNET_CONTAINER_MultiHashMap *keywordmap;
565
566 /**
541 * Number of entries in 'entries'. 567 * Number of entries in 'entries'.
542 */ 568 */
543 unsigned int count; 569 unsigned int count;
@@ -547,17 +573,23 @@ struct EntryProcCls
547 573
548/** 574/**
549 * Function that processes a directory entry that 575 * Function that processes a directory entry that
550 * was obtained from the scanner. 576 * was obtained from the scanner. Adds each entry to
577 * the directory and updates the directory's meta map and keyword map.
578 *
551 * @param cls our closure 579 * @param cls our closure
552 * @param filename name of the file (unused, why there???) 580 * @param filename name of the file (unused, why there???)
553 * @param fi information for publishing the file 581 * @param fi information for publishing the file
554 */ 582 */
555static void 583static void
556dirproc (void *cls, const char *filename, 584dirproc_add (void *cls, const char *filename,
557 struct GNUNET_FS_FileInformation *fi) 585 struct GNUNET_FS_FileInformation *fi)
558{ 586{
559 struct EntryProcCls *dc = cls; 587 struct EntryProcCls *dc = cls;
560 588 unsigned int i;
589 const char *kw;
590 struct KeywordInformation *ki;
591 GNUNET_HashCode key;
592
561 GNUNET_assert (fi->next == NULL); 593 GNUNET_assert (fi->next == NULL);
562 GNUNET_assert (fi->dir == NULL); 594 GNUNET_assert (fi->dir == NULL);
563 fi->next = dc->entries; 595 fi->next = dc->entries;
@@ -567,6 +599,20 @@ dirproc (void *cls, const char *filename,
567 GNUNET_CONTAINER_meta_data_iterate (fi->meta, 599 GNUNET_CONTAINER_meta_data_iterate (fi->meta,
568 &update_metamap, 600 &update_metamap,
569 dc->metamap); 601 dc->metamap);
602 for (i=0;i<fi->keywords->data.ksk.keywordCount;i++)
603 {
604 kw = fi->keywords->data.ksk.keywords[i];
605 GNUNET_CRYPTO_hash (kw, strlen(kw), &key);
606 ki = GNUNET_CONTAINER_multihashmap_get (dc->keywordmap, &key);
607 if (ki == NULL)
608 {
609 ki = GNUNET_malloc (sizeof (struct KeywordInformation));
610 ki->keyword = &kw[1];
611 GNUNET_CONTAINER_multihashmap_put (dc->keywordmap, &key, ki,
612 GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY);
613 }
614 ki->frequency++;
615 }
570} 616}
571 617
572 618
@@ -611,15 +657,16 @@ compute_directory_metadata (void *cls,
611 struct ComputeDirectoryMetadataContext *cdmc = cls; 657 struct ComputeDirectoryMetadataContext *cdmc = cls;
612 struct MetaValueInformation *mvi = value; 658 struct MetaValueInformation *mvi = value;
613 659
614 if (mvi->frequency > cdmc->threshold) 660 if (mvi->frequency > cdmc->threshold)
615 { 661 {
616 (void) GNUNET_CONTAINER_meta_data_insert (cdmc->meta, 662 if (mvi->type != EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME)
617 "<children>", 663 (void) GNUNET_CONTAINER_meta_data_insert (cdmc->meta,
618 mvi->type, 664 "<children>",
619 mvi->format, 665 mvi->type,
620 mvi->mime_type, 666 mvi->format,
621 mvi->data, 667 mvi->mime_type,
622 mvi->data_size); 668 mvi->data,
669 mvi->data_size);
623 if ( (mvi->format == EXTRACTOR_METAFORMAT_UTF8) || 670 if ( (mvi->format == EXTRACTOR_METAFORMAT_UTF8) ||
624 (mvi->format == EXTRACTOR_METAFORMAT_C_STRING) ) 671 (mvi->format == EXTRACTOR_METAFORMAT_C_STRING) )
625 GNUNET_FS_uri_ksk_add_keyword (cdmc->ksk, 672 GNUNET_FS_uri_ksk_add_keyword (cdmc->ksk,
@@ -632,6 +679,32 @@ compute_directory_metadata (void *cls,
632 679
633 680
634/** 681/**
682 * Add keywords that occur in more than the threshold entries of the
683 * directory to the directory itself.
684 *
685 * @param cls the 'struct ComputeDirectoryMetadataContext'
686 * @param key unused
687 * @param value the 'struct KeywordInformation' (to be freed as well)
688 * @return GNUNET_OK
689 */
690static int
691compute_directory_keywords (void *cls,
692 const GNUNET_HashCode *key,
693 void *value)
694{
695 struct ComputeDirectoryMetadataContext *cdmc = cls;
696 struct KeywordInformation *ki = value;
697
698 if (ki->frequency > cdmc->threshold)
699 (void) GNUNET_FS_uri_ksk_add_keyword (cdmc->ksk,
700 ki->keyword,
701 GNUNET_NO);
702 GNUNET_free (ki);
703 return GNUNET_OK;
704}
705
706
707/**
635 * Create a publish-structure from an existing file hierarchy, inferring 708 * Create a publish-structure from an existing file hierarchy, inferring
636 * and organizing keywords and metadata as much as possible. This 709 * and organizing keywords and metadata as much as possible. This
637 * function primarily performs the recursive build and re-organizes 710 * function primarily performs the recursive build and re-organizes
@@ -669,26 +742,46 @@ GNUNET_FS_file_information_create_from_directory (struct GNUNET_FS_Handle *h,
669 const char *ss; 742 const char *ss;
670 struct GNUNET_FS_Uri *cksk; 743 struct GNUNET_FS_Uri *cksk;
671 char *dn; 744 char *dn;
745 struct GNUNET_FS_FileInformation *epos;
746 unsigned int i;
747 const char *kw;
672 748
673 dc.entries = NULL; 749 dc.entries = NULL;
674 dc.count = 0; 750 dc.count = 0;
675 dc.metamap = GNUNET_CONTAINER_multihashmap_create (64); 751 dc.metamap = GNUNET_CONTAINER_multihashmap_create (64);
676 scanner (scanner_cls, h, filename, do_index, bo, &dirproc, &dc, emsg); 752 dc.keywordmap = GNUNET_CONTAINER_multihashmap_create (64);
753 /* update children to point to directory and generate statistics
754 on all meta data in children */
755 scanner (scanner_cls, h, filename, do_index, bo, &dirproc_add, &dc, emsg);
677 cdmc.meta = GNUNET_CONTAINER_meta_data_create (); 756 cdmc.meta = GNUNET_CONTAINER_meta_data_create ();
678 cdmc.ksk = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri)); 757 cdmc.ksk = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri));
679 cdmc.ksk->type = ksk; 758 cdmc.ksk->type = ksk;
680 cdmc.threshold = dc.count / 2; /* 50% threshold for now */ 759 cdmc.threshold = 1 + dc.count / 2; /* 50% threshold for now */
681 GNUNET_FS_meta_data_make_directory (cdmc.meta); 760 GNUNET_FS_meta_data_make_directory (cdmc.meta);
682 /* FIXME: remove meta data above a certain threshold from files
683 to *only* have it for the directory? */
684 GNUNET_CONTAINER_multihashmap_iterate (dc.metamap, 761 GNUNET_CONTAINER_multihashmap_iterate (dc.metamap,
685 &compute_directory_metadata, 762 &compute_directory_metadata,
686 &cdmc); 763 &cdmc);
764 GNUNET_CONTAINER_multihashmap_iterate (dc.keywordmap,
765 &compute_directory_keywords,
766 &cdmc);
687 GNUNET_CONTAINER_multihashmap_destroy (dc.metamap); 767 GNUNET_CONTAINER_multihashmap_destroy (dc.metamap);
768 GNUNET_CONTAINER_multihashmap_destroy (dc.keywordmap);
688 GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, 769 GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk,
689 GNUNET_FS_DIRECTORY_MIME, 770 GNUNET_FS_DIRECTORY_MIME,
690 GNUNET_NO); 771 GNUNET_NO);
691 cksk = GNUNET_FS_uri_ksk_canonicalize (cdmc.ksk); 772 cksk = GNUNET_FS_uri_ksk_canonicalize (cdmc.ksk);
773
774 /* remove keywords in children that are already in the
775 parent */
776 for (epos = dc.entries; NULL != epos; epos = epos->next)
777 {
778 for (i=0;i<cksk->data.ksk.keywordCount;i++)
779 {
780 kw = cksk->data.ksk.keywords[i];
781 GNUNET_FS_uri_ksk_remove_keyword (epos->keywords,
782 &kw[1]);
783 }
784 }
692 ret = 785 ret =
693 GNUNET_FS_file_information_create_empty_directory (h, client_info, cksk, 786 GNUNET_FS_file_information_create_empty_directory (h, client_info, cksk,
694 cdmc.meta, bo); 787 cdmc.meta, bo);