diff options
author | Christian Grothoff <christian@grothoff.org> | 2011-10-21 13:48:25 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2011-10-21 13:48:25 +0000 |
commit | 52bf657944215306b0753eede6285d4296baa884 (patch) | |
tree | 5b06e4d928b8de1fa658f8334b2b7adb6828b446 /src/fs/fs_file_information.c | |
parent | a30718300ec364b57daf8ef68559f947c9efa672 (diff) | |
download | gnunet-52bf657944215306b0753eede6285d4296baa884.tar.gz gnunet-52bf657944215306b0753eede6285d4296baa884.zip |
more keyword canonicalization, fix threshold, update testcases:
Diffstat (limited to 'src/fs/fs_file_information.c')
-rw-r--r-- | src/fs/fs_file_information.c | 125 |
1 files changed, 109 insertions, 16 deletions
diff --git a/src/fs/fs_file_information.c b/src/fs/fs_file_information.c index a07ebe2cf..ce4f189a9 100644 --- a/src/fs/fs_file_information.c +++ b/src/fs/fs_file_information.c | |||
@@ -520,6 +520,25 @@ update_metamap (void *cls, | |||
520 | 520 | ||
521 | 521 | ||
522 | /** | 522 | /** |
523 | * Aggregate information we keep for keywords in each directory. | ||
524 | */ | ||
525 | struct KeywordInformation | ||
526 | { | ||
527 | |||
528 | /** | ||
529 | * The keyword (text following its one-character prefix). | |
530 | */ | ||
531 | const char *keyword; | ||
532 | |||
533 | /** | ||
534 | * How often does this keyword occur in this directory? | |
535 | */ | ||
536 | unsigned int frequency; | ||
537 | |||
538 | }; | ||
539 | |||
540 | |||
541 | /** | ||
523 | * Closure for dirproc function. | 542 | * Closure for dirproc function. |
524 | */ | 543 | */ |
525 | struct EntryProcCls | 544 | struct EntryProcCls |
@@ -538,6 +557,13 @@ struct EntryProcCls | |||
538 | struct GNUNET_CONTAINER_MultiHashMap *metamap; | 557 | struct GNUNET_CONTAINER_MultiHashMap *metamap; |
539 | 558 | ||
540 | /** | 559 | /** |
560 | * Map describing the keywords for all entries in the | ||
561 | * directory. Keys are the hash of the keyword, | ||
562 | * values are of type 'struct KeywordInformation'. | ||
563 | */ | ||
564 | struct GNUNET_CONTAINER_MultiHashMap *keywordmap; | ||
565 | |||
566 | /** | ||
541 | * Number of entries in 'entries'. | 567 | * Number of entries in 'entries'. |
542 | */ | 568 | */ |
543 | unsigned int count; | 569 | unsigned int count; |
@@ -547,17 +573,23 @@ struct EntryProcCls | |||
547 | 573 | ||
548 | /** | 574 | /** |
549 | * Function that processes a directory entry that | 575 | * Function that processes a directory entry that |
550 | * was obtained from the scanner. | 576 | * was obtained from the scanner. Adds each entry to |
577 | * the directory and updates the directory meta map and keyword map. | |
578 | * | ||
551 | * @param cls our closure | 579 | * @param cls our closure |
552 | * @param filename name of the file (unused, why there???) | 580 | * @param filename name of the file (unused, why there???) |
553 | * @param fi information for publishing the file | 581 | * @param fi information for publishing the file |
554 | */ | 582 | */ |
555 | static void | 583 | static void |
556 | dirproc (void *cls, const char *filename, | 584 | dirproc_add (void *cls, const char *filename, |
557 | struct GNUNET_FS_FileInformation *fi) | 585 | struct GNUNET_FS_FileInformation *fi) |
558 | { | 586 | { |
559 | struct EntryProcCls *dc = cls; | 587 | struct EntryProcCls *dc = cls; |
560 | 588 | unsigned int i; | |
589 | const char *kw; | ||
590 | struct KeywordInformation *ki; | ||
591 | GNUNET_HashCode key; | ||
592 | |||
561 | GNUNET_assert (fi->next == NULL); | 593 | GNUNET_assert (fi->next == NULL); |
562 | GNUNET_assert (fi->dir == NULL); | 594 | GNUNET_assert (fi->dir == NULL); |
563 | fi->next = dc->entries; | 595 | fi->next = dc->entries; |
@@ -567,6 +599,20 @@ dirproc (void *cls, const char *filename, | |||
567 | GNUNET_CONTAINER_meta_data_iterate (fi->meta, | 599 | GNUNET_CONTAINER_meta_data_iterate (fi->meta, |
568 | &update_metamap, | 600 | &update_metamap, |
569 | dc->metamap); | 601 | dc->metamap); |
602 | for (i=0;i<fi->keywords->data.ksk.keywordCount;i++) | ||
603 | { | ||
604 | kw = fi->keywords->data.ksk.keywords[i]; | ||
605 | GNUNET_CRYPTO_hash (kw, strlen(kw), &key); | ||
606 | ki = GNUNET_CONTAINER_multihashmap_get (dc->keywordmap, &key); | ||
607 | if (ki == NULL) | ||
608 | { | ||
609 | ki = GNUNET_malloc (sizeof (struct KeywordInformation)); | ||
610 | ki->keyword = &kw[1]; | ||
611 | GNUNET_CONTAINER_multihashmap_put (dc->keywordmap, &key, ki, | ||
612 | GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY); | ||
613 | } | ||
614 | ki->frequency++; | ||
615 | } | ||
570 | } | 616 | } |
571 | 617 | ||
572 | 618 | ||
@@ -611,15 +657,16 @@ compute_directory_metadata (void *cls, | |||
611 | struct ComputeDirectoryMetadataContext *cdmc = cls; | 657 | struct ComputeDirectoryMetadataContext *cdmc = cls; |
612 | struct MetaValueInformation *mvi = value; | 658 | struct MetaValueInformation *mvi = value; |
613 | 659 | ||
614 | if (mvi->frequency > cdmc->threshold) | 660 | if (mvi->frequency > cdmc->threshold) |
615 | { | 661 | { |
616 | (void) GNUNET_CONTAINER_meta_data_insert (cdmc->meta, | 662 | if (mvi->type != EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME) |
617 | "<children>", | 663 | (void) GNUNET_CONTAINER_meta_data_insert (cdmc->meta, |
618 | mvi->type, | 664 | "<children>", |
619 | mvi->format, | 665 | mvi->type, |
620 | mvi->mime_type, | 666 | mvi->format, |
621 | mvi->data, | 667 | mvi->mime_type, |
622 | mvi->data_size); | 668 | mvi->data, |
669 | mvi->data_size); | ||
623 | if ( (mvi->format == EXTRACTOR_METAFORMAT_UTF8) || | 670 | if ( (mvi->format == EXTRACTOR_METAFORMAT_UTF8) || |
624 | (mvi->format == EXTRACTOR_METAFORMAT_C_STRING) ) | 671 | (mvi->format == EXTRACTOR_METAFORMAT_C_STRING) ) |
625 | GNUNET_FS_uri_ksk_add_keyword (cdmc->ksk, | 672 | GNUNET_FS_uri_ksk_add_keyword (cdmc->ksk, |
@@ -632,6 +679,32 @@ compute_directory_metadata (void *cls, | |||
632 | 679 | ||
633 | 680 | ||
634 | /** | 681 | /** |
682 | * Add keywords that occur in more than the threshold entries of the | ||
683 | * directory to the directory itself. | ||
684 | * | ||
685 | * @param cls the 'struct ComputeDirectoryMetadataContext' | ||
686 | * @param key unused | ||
687 | * @param value the 'struct KeywordInformation' (to be freed as well) | |
688 | * @return GNUNET_OK | ||
689 | */ | ||
690 | static int | ||
691 | compute_directory_keywords (void *cls, | ||
692 | const GNUNET_HashCode *key, | ||
693 | void *value) | ||
694 | { | ||
695 | struct ComputeDirectoryMetadataContext *cdmc = cls; | ||
696 | struct KeywordInformation *ki = value; | ||
697 | |||
698 | if (ki->frequency > cdmc->threshold) | ||
699 | (void) GNUNET_FS_uri_ksk_add_keyword (cdmc->ksk, | ||
700 | ki->keyword, | ||
701 | GNUNET_NO); | ||
702 | GNUNET_free (ki); | ||
703 | return GNUNET_OK; | ||
704 | } | ||
705 | |||
706 | |||
707 | /** | ||
635 | * Create a publish-structure from an existing file hierarchy, inferring | 708 | * Create a publish-structure from an existing file hierarchy, inferring |
636 | * and organizing keywords and metadata as much as possible. This | 709 | * and organizing keywords and metadata as much as possible. This |
637 | * function primarily performs the recursive build and re-organizes | 710 | * function primarily performs the recursive build and re-organizes |
@@ -669,26 +742,46 @@ GNUNET_FS_file_information_create_from_directory (struct GNUNET_FS_Handle *h, | |||
669 | const char *ss; | 742 | const char *ss; |
670 | struct GNUNET_FS_Uri *cksk; | 743 | struct GNUNET_FS_Uri *cksk; |
671 | char *dn; | 744 | char *dn; |
745 | struct GNUNET_FS_FileInformation *epos; | ||
746 | unsigned int i; | ||
747 | const char *kw; | ||
672 | 748 | ||
673 | dc.entries = NULL; | 749 | dc.entries = NULL; |
674 | dc.count = 0; | 750 | dc.count = 0; |
675 | dc.metamap = GNUNET_CONTAINER_multihashmap_create (64); | 751 | dc.metamap = GNUNET_CONTAINER_multihashmap_create (64); |
676 | scanner (scanner_cls, h, filename, do_index, bo, &dirproc, &dc, emsg); | 752 | dc.keywordmap = GNUNET_CONTAINER_multihashmap_create (64); |
753 | /* update children to point to directory and generate statistics | ||
754 | on all meta data in children */ | ||
755 | scanner (scanner_cls, h, filename, do_index, bo, &dirproc_add, &dc, emsg); | ||
677 | cdmc.meta = GNUNET_CONTAINER_meta_data_create (); | 756 | cdmc.meta = GNUNET_CONTAINER_meta_data_create (); |
678 | cdmc.ksk = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri)); | 757 | cdmc.ksk = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri)); |
679 | cdmc.ksk->type = ksk; | 758 | cdmc.ksk->type = ksk; |
680 | cdmc.threshold = dc.count / 2; /* 50% threshold for now */ | 759 | cdmc.threshold = 1 + dc.count / 2; /* 50% threshold for now */ |
681 | GNUNET_FS_meta_data_make_directory (cdmc.meta); | 760 | GNUNET_FS_meta_data_make_directory (cdmc.meta); |
682 | /* FIXME: remove meta data above a certain threshold from files | ||
683 | to *only* have it for the directory? */ | ||
684 | GNUNET_CONTAINER_multihashmap_iterate (dc.metamap, | 761 | GNUNET_CONTAINER_multihashmap_iterate (dc.metamap, |
685 | &compute_directory_metadata, | 762 | &compute_directory_metadata, |
686 | &cdmc); | 763 | &cdmc); |
764 | GNUNET_CONTAINER_multihashmap_iterate (dc.keywordmap, | ||
765 | &compute_directory_keywords, | ||
766 | &cdmc); | ||
687 | GNUNET_CONTAINER_multihashmap_destroy (dc.metamap); | 767 | GNUNET_CONTAINER_multihashmap_destroy (dc.metamap); |
768 | GNUNET_CONTAINER_multihashmap_destroy (dc.keywordmap); | ||
688 | GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, | 769 | GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, |
689 | GNUNET_FS_DIRECTORY_MIME, | 770 | GNUNET_FS_DIRECTORY_MIME, |
690 | GNUNET_NO); | 771 | GNUNET_NO); |
691 | cksk = GNUNET_FS_uri_ksk_canonicalize (cdmc.ksk); | 772 | cksk = GNUNET_FS_uri_ksk_canonicalize (cdmc.ksk); |
773 | |||
774 | /* remove keywords in children that are already in the | ||
775 | parent */ | ||
776 | for (epos = dc.entries; NULL != epos; epos = epos->next) | ||
777 | { | ||
778 | for (i=0;i<cksk->data.ksk.keywordCount;i++) | ||
779 | { | ||
780 | kw = cksk->data.ksk.keywords[i]; | ||
781 | GNUNET_FS_uri_ksk_remove_keyword (epos->keywords, | ||
782 | &kw[1]); | ||
783 | } | ||
784 | } | ||
692 | ret = | 785 | ret = |
693 | GNUNET_FS_file_information_create_empty_directory (h, client_info, cksk, | 786 | GNUNET_FS_file_information_create_empty_directory (h, client_info, cksk, |
694 | cdmc.meta, bo); | 787 | cdmc.meta, bo); |