diff options
author | Christian Grothoff <christian@grothoff.org> | 2012-01-14 15:20:55 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2012-01-14 15:20:55 +0000 |
commit | ea58fb0d848465e40c8b2a56ac482a4afabfa7d4 (patch) | |
tree | 2ec4246ff075a10f216dc58a41db43dbb7a34fba /src | |
parent | ea03f11cfac38afdad7ea45b3a7787bd9b4f4711 (diff) | |
download | gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.tar.gz gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.zip |
LRN: skip short keywords when generating keywords automatically from metadata
Diffstat (limited to 'src')
-rw-r--r-- | src/fs/fs_uri.c | 24 |
1 file changed, 21 insertions, 3 deletions
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c index f8bd76f56..157295b09 100644 --- a/src/fs/fs_uri.c +++ b/src/fs/fs_uri.c | |||
@@ -1597,11 +1597,17 @@ get_keywords_from_parens (const char *s, char **array, int index) | |||
1597 | } | 1597 | } |
1598 | if (match && (close_paren - open_paren > 1)) | 1598 | if (match && (close_paren - open_paren > 1)) |
1599 | { | 1599 | { |
1600 | tmp = close_paren[0]; | ||
1601 | close_paren[0] = '\0'; | ||
1602 | /* Keywords must be at least 3 characters long */ | ||
1603 | if (u8_strlen ((const uint8_t *) &open_paren[1]) <= 2) | ||
1604 | { | ||
1605 | close_paren[0] = tmp; | ||
1606 | continue; | ||
1607 | } | ||
1600 | if (NULL != array) | 1608 | if (NULL != array) |
1601 | { | 1609 | { |
1602 | char *normalized; | 1610 | char *normalized; |
1603 | tmp = close_paren[0]; | ||
1604 | close_paren[0] = '\0'; | ||
1605 | if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], | 1611 | if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], |
1606 | (const char **) array, index + count)) | 1612 | (const char **) array, index + count)) |
1607 | { | 1613 | { |
@@ -1622,10 +1628,10 @@ get_keywords_from_parens (const char *s, char **array, int index) | |||
1622 | } | 1628 | } |
1623 | GNUNET_free (normalized); | 1629 | GNUNET_free (normalized); |
1624 | } | 1630 | } |
1625 | close_paren[0] = tmp; | ||
1626 | } | 1631 | } |
1627 | else | 1632 | else |
1628 | count++; | 1633 | count++; |
1634 | close_paren[0] = tmp; | ||
1629 | } | 1635 | } |
1630 | } | 1636 | } |
1631 | GNUNET_free (ss); | 1637 | GNUNET_free (ss); |
@@ -1662,6 +1668,9 @@ get_keywords_from_tokens (const char *s, char **array, int index) | |||
1662 | ss = GNUNET_strdup (s); | 1668 | ss = GNUNET_strdup (s); |
1663 | for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS)) | 1669 | for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS)) |
1664 | { | 1670 | { |
1671 | /* Keywords must be at least 3 characters long */ | ||
1672 | if (u8_strlen ((const uint8_t *) p) <= 2) | ||
1673 | continue; | ||
1665 | if (NULL != array) | 1674 | if (NULL != array) |
1666 | { | 1675 | { |
1667 | char *normalized; | 1676 | char *normalized; |
@@ -1721,6 +1730,15 @@ gather_uri_data (void *cls, const char *plugin_name, | |||
1721 | if ((format != EXTRACTOR_METAFORMAT_UTF8) && | 1730 | if ((format != EXTRACTOR_METAFORMAT_UTF8) && |
1722 | (format != EXTRACTOR_METAFORMAT_C_STRING)) | 1731 | (format != EXTRACTOR_METAFORMAT_C_STRING)) |
1723 | return 0; | 1732 | return 0; |
1733 | /* Keywords must be at least 3 characters long | ||
1734 | * If given non-utf8 string it will, most likely, find it to be invalid, | ||
1735 | * and will return the length of its valid part, skipping the keyword. | ||
1736 | * If it does - fix the extractor, not this check! | ||
1737 | */ | ||
1738 | if (u8_strlen ((const uint8_t *) data) <= 2) | ||
1739 | { | ||
1740 | return 0; | ||
1741 | } | ||
1724 | normalized_data = normalize_metadata (format, data, data_len); | 1742 | normalized_data = normalize_metadata (format, data, data_len); |
1725 | if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) | 1743 | if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) |
1726 | { | 1744 | { |