From cdee03748e83189713b32bb87bc77cde659c20d9 Mon Sep 17 00:00:00 2001 From: Christian Grothoff Date: Sun, 25 Dec 2011 21:16:11 +0000 Subject: -remove code for keyword canonicalization/normalization, makes no sense in international application, normalization methods are questionable to begin with --- doc/man/gnunet-publish.1 | 2 +- src/fs/fs_file_information.c | 15 ++--- src/fs/fs_uri.c | 123 ---------------------------------------- src/include/gnunet_fs_service.h | 24 ++------ 4 files changed, 12 insertions(+), 152 deletions(-) diff --git a/doc/man/gnunet-publish.1 b/doc/man/gnunet-publish.1 index 402fef2d0..f35234d13 100644 --- a/doc/man/gnunet-publish.1 +++ b/doc/man/gnunet-publish.1 @@ -10,7 +10,7 @@ In order to share files with other GNUnet users, the files must first be made av .PP In order to start sharing files, the files must be added either using gnunet\-publish or a graphical interface such as gnunet\-gtk. The command line tool gnunet\-publish is more useful if many files are supposed to be added. gnunet\-publish can automatically publish batches of files, recursively publish directories, create directories that can be browsed within GNUnet and publish file lists in a namespace. When run on a directory, gnunet\-publish will always recursively publish all of the files in the directory. .PP -gnunet\-publish can automatically extract keywords from the files that are shared. Users that want to download files from GNUnet use keywords to search for the appropriate content. You can disable keyword extraction with the \-D option. You can manually add keywords using the \-k option. The keywords are case\-sensitive. (However, keyword normalization can also be used.) +gnunet\-publish can automatically extract keywords from the files that are shared. Users that want to download files from GNUnet use keywords to search for the appropriate content. You can disable keyword extraction with the \-D option. You can manually add keywords using the \-k option. 
The keywords are case\-sensitive. .PP You can use automatic meta\-data extraction (based on libextractor) or the command\-line option \-m to specify meta-data. For the \-m option you need to use the form keyword\-type:value. For example, use "\-m os:Linux" to specify that the operating system is Linux. Common meta\-data types are "author name", "title" , "mimetype", "filename", "language", "subject" and "keywords". A full list can be obtained from the extract tool using the option \-\-list. The meta\-data is used to help users in searching for files on the network. .PP diff --git a/src/fs/fs_file_information.c b/src/fs/fs_file_information.c index 0925d57ec..4ea264892 100644 --- a/src/fs/fs_file_information.c +++ b/src/fs/fs_file_information.c @@ -341,7 +341,6 @@ dir_scan_cb (void *cls, const char *filename) struct DirScanCls *dsc = cls; struct stat sbuf; struct GNUNET_FS_FileInformation *fi; - struct GNUNET_FS_Uri *ksk_uri; struct GNUNET_FS_Uri *keywords; struct GNUNET_CONTAINER_MetaData *meta; @@ -370,13 +369,11 @@ dir_scan_cb (void *cls, const char *filename) meta = GNUNET_CONTAINER_meta_data_create (); GNUNET_FS_meta_data_extract_from_file (meta, filename, dsc->extractors); keywords = GNUNET_FS_uri_ksk_create_from_meta_data (meta); - ksk_uri = GNUNET_FS_uri_ksk_canonicalize (keywords); fi = GNUNET_FS_file_information_create_from_file (dsc->h, NULL, filename, - ksk_uri, meta, + keywords, meta, dsc->do_index, dsc->bo); GNUNET_CONTAINER_meta_data_destroy (meta); GNUNET_FS_uri_destroy (keywords); - GNUNET_FS_uri_destroy (ksk_uri); } dsc->proc (dsc->proc_cls, filename, fi); return GNUNET_OK; @@ -723,7 +720,6 @@ GNUNET_FS_file_information_create_from_directory (struct GNUNET_FS_Handle *h, struct EntryProcCls dc; const char *fn; const char *ss; - struct GNUNET_FS_Uri *cksk; char *dn; struct GNUNET_FS_FileInformation *epos; unsigned int i; @@ -747,21 +743,20 @@ GNUNET_FS_file_information_create_from_directory (struct GNUNET_FS_Handle *h, &compute_directory_keywords, 
&cdmc); GNUNET_CONTAINER_multihashmap_destroy (dc.metamap); GNUNET_CONTAINER_multihashmap_destroy (dc.keywordmap); - GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, GNUNET_FS_DIRECTORY_MIME, GNUNET_NO); - cksk = GNUNET_FS_uri_ksk_canonicalize (cdmc.ksk); /* remove keywords in children that are already in the * parent */ for (epos = dc.entries; NULL != epos; epos = epos->next) { - for (i = 0; i < cksk->data.ksk.keywordCount; i++) + for (i = 0; i < cdmc.ksk->data.ksk.keywordCount; i++) { - kw = cksk->data.ksk.keywords[i]; + kw = cdmc.ksk->data.ksk.keywords[i]; GNUNET_FS_uri_ksk_remove_keyword (epos->keywords, &kw[1]); } } + GNUNET_FS_uri_ksk_add_keyword (cdmc.ksk, GNUNET_FS_DIRECTORY_MIME, GNUNET_NO); ret = - GNUNET_FS_file_information_create_empty_directory (h, client_info, cksk, + GNUNET_FS_file_information_create_empty_directory (h, client_info, cdmc.ksk, cdmc.meta, bo); GNUNET_CONTAINER_meta_data_destroy (cdmc.meta); GNUNET_FS_uri_destroy (cdmc.ksk); diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c index 55503b71b..d3fcdd8ca 100644 --- a/src/fs/fs_uri.c +++ b/src/fs/fs_uri.c @@ -969,129 +969,6 @@ GNUNET_FS_uri_sks_create_from_nsid (GNUNET_HashCode * nsid, const char *id) } -/** - * Canonicalize a keyword. 
- * - * @param in input string (the keyword) - * @return canonicalized keyword - */ -static char * -canonicalize_keyword (const char *in) -{ - char *ret; - char *wpos; - const char *rpos; - - ret = GNUNET_strdup (in); - wpos = ret; - rpos = in; - while ('\0' != *rpos) - { - switch (tolower ((unsigned char) *rpos)) - { - case 'a': - case 'e': - case 'i': - case 'o': - case 'u': - case ' ': - case '\t': - case '\n': - case '\r': - /* skip characters listed above */ - break; - case 'b': - case 'c': - case 'd': - case 'f': - case 'g': - case 'h': - case 'j': - case 'k': - case 'l': - case 'm': - case 'n': - case 'p': - case 'r': - case 's': - case 't': - case 'v': - case 'w': - case 'x': - case 'y': - case 'z': - /* convert characters listed above to lower case */ - *wpos = tolower ((unsigned char) *rpos); - wpos++; - break; - case '!': - case '.': - case '?': - case '-': - /* keep characters listed above without changes */ - *wpos = *rpos; - wpos++; - break; - default: - if (isspace ((unsigned char) *rpos) || isdigit ((unsigned char) *rpos)) - break; - /* replace characters listed above with '_' */ - *wpos = '_'; - wpos++; - break; - } - rpos++; - } - *wpos = '\0'; - return ret; -} - - -/** - * Canonicalize keyword URI. Performs operations such - * as decapitalization and removal of certain characters. - * (useful for search). - * - * @param uri the URI to canonicalize - * @return canonicalized version of the URI, NULL on error - */ -struct GNUNET_FS_Uri * -GNUNET_FS_uri_ksk_canonicalize (const struct GNUNET_FS_Uri *uri) -{ - struct GNUNET_FS_Uri *ret; - unsigned int kc; - unsigned int i; - const char *in; - char *sb; - char *cc; - const char *tok; - - ret = GNUNET_malloc (sizeof (struct GNUNET_FS_Uri)); - ret->type = ksk; - kc = uri->data.ksk.keywordCount; - for (i = 0; i < kc; i++) - { - in = uri->data.ksk.keywords[i]; - GNUNET_FS_uri_ksk_add_keyword (ret, &in[1], - (in[0] == '+') ? 
GNUNET_YES : GNUNET_NO); - sb = GNUNET_strdup (&in[1]); -#define DELIMS " \\|\"'`/&@-_,.;!?+-*^$#~=[]{}()<>" - for (tok = strtok (sb, DELIMS); NULL != tok; tok = strtok (NULL, DELIMS)) -#undef DELIMS - { - if (strlen (tok) < 3) - continue; - GNUNET_FS_uri_ksk_add_keyword (ret, tok, GNUNET_NO); - cc = canonicalize_keyword (tok); - if (strlen (cc) > 2) - GNUNET_FS_uri_ksk_add_keyword (ret, cc, GNUNET_NO); - } - GNUNET_free (sb); - } - return ret; -} - - /** * Merge the sets of keywords from two KSK URIs. * (useful for merging the canonicalized keywords with diff --git a/src/include/gnunet_fs_service.h b/src/include/gnunet_fs_service.h index db1d74589..ad4441bd3 100644 --- a/src/include/gnunet_fs_service.h +++ b/src/include/gnunet_fs_service.h @@ -53,7 +53,7 @@ extern "C" * 6.1.x: with simplified namespace support * 9.0.0: CPS-style integrated API */ -#define GNUNET_FS_VERSION 0x00090000 +#define GNUNET_FS_VERSION 0x00090001 /* ******************** URI API *********************** */ @@ -227,22 +227,8 @@ GNUNET_FS_uri_loc_create (const struct GNUNET_FS_Uri *baseUri, struct GNUNET_TIME_Absolute expiration_time); -/** - * Canonicalize keyword URI. Performs operations such - * as decapitalization and removal of certain characters. - * (useful for search). - * - * @param uri the URI to canonicalize - * @return canonicalized version of the URI, NULL on error - */ -struct GNUNET_FS_Uri * -GNUNET_FS_uri_ksk_canonicalize (const struct GNUNET_FS_Uri *uri); - - /** * Merge the sets of keywords from two KSK URIs. - * (useful for merging the canonicalized keywords with - * the original keywords for sharing). * * @param u1 first uri * @param u2 second uri @@ -1898,9 +1884,11 @@ typedef int (*GNUNET_FS_DirectoryScanner) (void *cls, * files (those starting with a "."). Metadata will be extracted * using GNU libextractor; the specific list of plugins should be * specified in "cls", passing NULL will disable (!) metadata - * extraction. 
Keywords will be derived from the metadata and be - * subject to default canonicalization. This is strictly a - * convenience function. + * extraction. Keywords will be derived from the metadata and + * associated with directories as appropriate. This is strictly a + * convenience function (however, if all tools use it, there will + * be less of a chance of distinguishing users by the specific + * user-interface they were using). * * @param cls must be of type "struct EXTRACTOR_Extractor*" * @param h handle to the file sharing subsystem -- cgit v1.2.3