aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2011-12-25 21:16:11 +0000
committerChristian Grothoff <christian@grothoff.org>2011-12-25 21:16:11 +0000
commitcdee03748e83189713b32bb87bc77cde659c20d9 (patch)
treeddafc550fe8b6038106ba81140cf6e94e52f6858
parentfd20a7d04e49f9317de85b7f40f79d447d6e1715 (diff)
downloadgnunet-cdee03748e83189713b32bb87bc77cde659c20d9.tar.gz
gnunet-cdee03748e83189713b32bb87bc77cde659c20d9.zip
-remove code for keyword canonicalization/normalization, makes no sense in international application, normalization methods are questionable to begin with
-rw-r--r--doc/man/gnunet-publish.12
-rw-r--r--src/fs/fs_file_information.c15
-rw-r--r--src/fs/fs_uri.c123
-rw-r--r--src/include/gnunet_fs_service.h24
4 files changed, 12 insertions, 152 deletions
diff --git a/doc/man/gnunet-publish.1 b/doc/man/gnunet-publish.1
index 402fef2d0..f35234d13 100644
--- a/doc/man/gnunet-publish.1
+++ b/doc/man/gnunet-publish.1
@@ -10,7 +10,7 @@ In order to share files with other GNUnet users, the files must first be made av
10.PP 10.PP
11In order to start sharing files, the files must be added either using gnunet\-publish or a graphical interface such as gnunet\-gtk. The command line tool gnunet\-publish is more useful if many files are supposed to be added. gnunet\-publish can automatically publish batches of files, recursively publish directories, create directories that can be browsed within GNUnet and publish file lists in a namespace. When run on a directory, gnunet\-publish will always recursively publish all of the files in the directory. 11In order to start sharing files, the files must be added either using gnunet\-publish or a graphical interface such as gnunet\-gtk. The command line tool gnunet\-publish is more useful if many files are supposed to be added. gnunet\-publish can automatically publish batches of files, recursively publish directories, create directories that can be browsed within GNUnet and publish file lists in a namespace. When run on a directory, gnunet\-publish will always recursively publish all of the files in the directory.
12.PP 12.PP
13gnunet\-publish can automatically extract keywords from the files that are shared. Users that want to download files from GNUnet use keywords to search for the appropriate content. You can disable keyword extraction with the \-D option. You can manually add keywords using the \-k option. The keywords are case\-sensitive. (However, keyword normalization can also be used.) 13gnunet\-publish can automatically extract keywords from the files that are shared. Users that want to download files from GNUnet use keywords to search for the appropriate content. You can disable keyword extraction with the \-D option. You can manually add keywords using the \-k option. The keywords are case\-sensitive.
14.PP 14.PP
15You can use automatic meta\-data extraction (based on libextractor) or the command\-line option \-m to specify meta-data. For the \-m option you need to use the form keyword\-type:value. For example, use "\-m os:Linux" to specify that the operating system is Linux. Common meta\-data types are "author name", "title" , "mimetype", "filename", "language", "subject" and "keywords". A full list can be obtained from the extract tool using the option \-\-list. The meta\-data is used to help users in searching for files on the network. 15You can use automatic meta\-data extraction (based on libextractor) or the command\-line option \-m to specify meta-data. For the \-m option you need to use the form keyword\-type:value. For example, use "\-m os:Linux" to specify that the operating system is Linux. Common meta\-data types are "author name", "title" , "mimetype", "filename", "language", "subject" and "keywords". A full list can be obtained from the extract tool using the option \-\-list. The meta\-data is used to help users in searching for files on the network.
16.PP 16.PP
diff --git a/src/fs/fs_file_information.c b/src/fs/fs_file_information.c
index 0925d57ec..4ea264892 100644
--- a/src/fs/fs_file_information.c
+++ b/src/fs/fs_file_information.c
@@ -341,7 +341,6 @@ dir_scan_cb (void *cls, const char *filename)
341 struct DirScanCls *dsc = cls; 341 struct DirScanCls *dsc = cls;
342 struct stat sbuf; 342 struct stat sbuf;
343 struct GNUNET_FS_FileInformation *fi; 343 struct GNUNET_FS_FileInformation *fi;
344 struct GNUNET_FS_Uri *ksk_uri;
345 struct GNUNET_FS_Uri *keywords; 344 struct GNUNET_FS_Uri *keywords;
346 struct GNUNET_CONTAINER_MetaData *meta; 345 struct GNUNET_CONTAINER_MetaData *meta;
347 346
@@ -370,13 +369,11 @@ dir_scan_cb (void *cls, const char *filename)
370 meta = GNUNET_CONTAINER_meta_data_create (); 369 meta = GNUNET_CONTAINER_meta_data_create ();
371 GNUNET_FS_meta_data_extract_from_file (meta, filename, dsc->extractors); 370 GNUNET_FS_meta_data_extract_from_file (meta, filename, dsc->extractors);
372 keywords = GNUNET_FS_uri_ksk_create_from_meta_data (meta); 371 keywords = GNUNET_FS_uri_ksk_create_from_meta_data (meta);
373 ksk_uri = GNUNET_FS_uri_ksk_canonicalize (keywords);
374 fi = GNUNET_FS_file_information_create_from_file (dsc->h, NULL, filename, 372 fi = GNUNET_FS_file_information_create_from_file (dsc->h, NULL, filename,
375 ksk_uri, meta, 373 keywords, meta,
376 dsc->do_index, dsc->bo); 374 dsc->do_index, dsc->bo);
377 GNUNET_CONTAINER_meta_data_destroy (meta); 375 GNUNET_CONTAINER_meta_data_destroy (meta);
378 GNUNET_FS_uri_destroy (keywords); 376 GNUNET_FS_uri_destroy (keywords);
379 GNUNET_FS_uri_destroy (ksk_uri);
380 } 377 }
381 dsc->proc (dsc->proc_cls, filename, fi); 378 dsc->proc (dsc->proc_cls, filename, fi);
382 return GNUNET_OK; 379 return GNUNET_OK;
@@ -723,7 +720,6 @@ GNUNET_FS_file_information_create_from_directory (struct GNUNET_FS_Handle *h,
723 struct EntryProcCls dc; 720 struct EntryProcCls dc;
724 const char *fn; 721 const char *fn;
725 const char *ss; 722 const char *ss;
726 struct GNUNET_FS_Uri *cksk;
727 char *dn; 723 char *dn;
728 struct GNUNET_FS_FileInformation *epos; 724 struct GNUNET_FS_FileInformation *epos;
729 unsigned int i; 725 unsigned int i;
@@ -747,21 +743,20 @@ GNUNET_FS_file_information_create_from_directory (struct GNUNET_FS_Handle *h,
747 &compute_directory_keywords, &cdmc); 743 &compute_directory_keywords, &cdmc);
748 GNUNET_CONTAINER_multihashmap_destroy (dc.metamap); 744 GNUNET_CONTAINER_multihashmap_destroy (dc.metamap);
749 GNUNET_CONTAINER_multihashmap_destroy (dc.keywordmap); 745 GNUNET_CONTAINER_multihashmap_destroy (dc.keywordmap);
750 GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, GNUNET_FS_DIRECTORY_MIME, GNUNET_NO);
751 cksk = GNUNET_FS_uri_ksk_canonicalize (cdmc.ksk);
752 746
753 /* remove keywords in children that are already in the 747 /* remove keywords in children that are already in the
754 * parent */ 748 * parent */
755 for (epos = dc.entries; NULL != epos; epos = epos->next) 749 for (epos = dc.entries; NULL != epos; epos = epos->next)
756 { 750 {
757 for (i = 0; i < cksk->data.ksk.keywordCount; i++) 751 for (i = 0; i < cdmc.ksk->data.ksk.keywordCount; i++)
758 { 752 {
759 kw = cksk->data.ksk.keywords[i]; 753 kw = cdmc.ksk->data.ksk.keywords[i];
760 GNUNET_FS_uri_ksk_remove_keyword (epos->keywords, &kw[1]); 754 GNUNET_FS_uri_ksk_remove_keyword (epos->keywords, &kw[1]);
761 } 755 }
762 } 756 }
757 GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, GNUNET_FS_DIRECTORY_MIME, GNUNET_NO);
763 ret = 758 ret =
764 GNUNET_FS_file_information_create_empty_directory (h, client_info, cksk, 759 GNUNET_FS_file_information_create_empty_directory (h, client_info, cdmc.ksk,
765 cdmc.meta, bo); 760 cdmc.meta, bo);
766 GNUNET_CONTAINER_meta_data_destroy (cdmc.meta); 761 GNUNET_CONTAINER_meta_data_destroy (cdmc.meta);
767 GNUNET_FS_uri_destroy (cdmc.ksk); 762 GNUNET_FS_uri_destroy (cdmc.ksk);
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index 55503b71b..d3fcdd8ca 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -970,129 +970,6 @@ GNUNET_FS_uri_sks_create_from_nsid (GNUNET_HashCode * nsid, const char *id)
970 970
971 971
972/** 972/**
973 * Canonicalize a keyword.
974 *
975 * @param in input string (the keyword)
976 * @return canonicalized keyword
977 */
978static char *
979canonicalize_keyword (const char *in)
980{
981 char *ret;
982 char *wpos;
983 const char *rpos;
984
985 ret = GNUNET_strdup (in);
986 wpos = ret;
987 rpos = in;
988 while ('\0' != *rpos)
989 {
990 switch (tolower ((unsigned char) *rpos))
991 {
992 case 'a':
993 case 'e':
994 case 'i':
995 case 'o':
996 case 'u':
997 case ' ':
998 case '\t':
999 case '\n':
1000 case '\r':
1001 /* skip characters listed above */
1002 break;
1003 case 'b':
1004 case 'c':
1005 case 'd':
1006 case 'f':
1007 case 'g':
1008 case 'h':
1009 case 'j':
1010 case 'k':
1011 case 'l':
1012 case 'm':
1013 case 'n':
1014 case 'p':
1015 case 'r':
1016 case 's':
1017 case 't':
1018 case 'v':
1019 case 'w':
1020 case 'x':
1021 case 'y':
1022 case 'z':
1023 /* convert characters listed above to lower case */
1024 *wpos = tolower ((unsigned char) *rpos);
1025 wpos++;
1026 break;
1027 case '!':
1028 case '.':
1029 case '?':
1030 case '-':
1031 /* keep characters listed above without changes */
1032 *wpos = *rpos;
1033 wpos++;
1034 break;
1035 default:
1036 if (isspace ((unsigned char) *rpos) || isdigit ((unsigned char) *rpos))
1037 break;
1038 /* replace characters listed above with '_' */
1039 *wpos = '_';
1040 wpos++;
1041 break;
1042 }
1043 rpos++;
1044 }
1045 *wpos = '\0';
1046 return ret;
1047}
1048
1049
1050/**
1051 * Canonicalize keyword URI. Performs operations such
1052 * as decapitalization and removal of certain characters.
1053 * (useful for search).
1054 *
1055 * @param uri the URI to canonicalize
1056 * @return canonicalized version of the URI, NULL on error
1057 */
1058struct GNUNET_FS_Uri *
1059GNUNET_FS_uri_ksk_canonicalize (const struct GNUNET_FS_Uri *uri)
1060{
1061 struct GNUNET_FS_Uri *ret;
1062 unsigned int kc;
1063 unsigned int i;
1064 const char *in;
1065 char *sb;
1066 char *cc;
1067 const char *tok;
1068
1069 ret = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri));
1070 ret->type = ksk;
1071 kc = uri->data.ksk.keywordCount;
1072 for (i = 0; i < kc; i++)
1073 {
1074 in = uri->data.ksk.keywords[i];
1075 GNUNET_FS_uri_ksk_add_keyword (ret, &in[1],
1076 (in[0] == '+') ? GNUNET_YES : GNUNET_NO);
1077 sb = GNUNET_strdup (&in[1]);
1078#define DELIMS " \\|\"'`/&@-_,.;!?+-*^$#~=[]{}()<>"
1079 for (tok = strtok (sb, DELIMS); NULL != tok; tok = strtok (NULL, DELIMS))
1080#undef DELIMS
1081 {
1082 if (strlen (tok) < 3)
1083 continue;
1084 GNUNET_FS_uri_ksk_add_keyword (ret, tok, GNUNET_NO);
1085 cc = canonicalize_keyword (tok);
1086 if (strlen (cc) > 2)
1087 GNUNET_FS_uri_ksk_add_keyword (ret, cc, GNUNET_NO);
1088 }
1089 GNUNET_free (sb);
1090 }
1091 return ret;
1092}
1093
1094
1095/**
1096 * Merge the sets of keywords from two KSK URIs. 973 * Merge the sets of keywords from two KSK URIs.
1097 * (useful for merging the canonicalized keywords with 974 * (useful for merging the canonicalized keywords with
1098 * the original keywords for sharing). 975 * the original keywords for sharing).
diff --git a/src/include/gnunet_fs_service.h b/src/include/gnunet_fs_service.h
index db1d74589..ad4441bd3 100644
--- a/src/include/gnunet_fs_service.h
+++ b/src/include/gnunet_fs_service.h
@@ -53,7 +53,7 @@ extern "C"
53 * 6.1.x: with simplified namespace support 53 * 6.1.x: with simplified namespace support
54 * 9.0.0: CPS-style integrated API 54 * 9.0.0: CPS-style integrated API
55 */ 55 */
56#define GNUNET_FS_VERSION 0x00090000 56#define GNUNET_FS_VERSION 0x00090001
57 57
58 58
59/* ******************** URI API *********************** */ 59/* ******************** URI API *********************** */
@@ -228,21 +228,7 @@ GNUNET_FS_uri_loc_create (const struct GNUNET_FS_Uri *baseUri,
228 228
229 229
230/** 230/**
231 * Canonicalize keyword URI. Performs operations such
232 * as decapitalization and removal of certain characters.
233 * (useful for search).
234 *
235 * @param uri the URI to canonicalize
236 * @return canonicalized version of the URI, NULL on error
237 */
238struct GNUNET_FS_Uri *
239GNUNET_FS_uri_ksk_canonicalize (const struct GNUNET_FS_Uri *uri);
240
241
242/**
243 * Merge the sets of keywords from two KSK URIs. 231 * Merge the sets of keywords from two KSK URIs.
244 * (useful for merging the canonicalized keywords with
245 * the original keywords for sharing).
246 * 232 *
247 * @param u1 first uri 233 * @param u1 first uri
248 * @param u2 second uri 234 * @param u2 second uri
@@ -1898,9 +1884,11 @@ typedef int (*GNUNET_FS_DirectoryScanner) (void *cls,
1898 * files (those starting with a "."). Metadata will be extracted 1884 * files (those starting with a "."). Metadata will be extracted
1899 * using GNU libextractor; the specific list of plugins should be 1885 * using GNU libextractor; the specific list of plugins should be
1900 * specified in "cls", passing NULL will disable (!) metadata 1886 * specified in "cls", passing NULL will disable (!) metadata
1901 * extraction. Keywords will be derived from the metadata and be 1887 * extraction. Keywords will be derived from the metadata and
1902 * subject to default canonicalization. This is strictly a 1888 * associated with directories as appropriate. This is strictly a
1903 * convenience function. 1889 * convenience function (however, if all tools use it, there will
1890 * be less of a chance of distinguishing users by the specific
1891 * user-interface they were using).
1904 * 1892 *
1905 * @param cls must be of type "struct EXTRACTOR_Extractor*" 1893 * @param cls must be of type "struct EXTRACTOR_Extractor*"
1906 * @param h handle to the file sharing subsystem 1894 * @param h handle to the file sharing subsystem