aboutsummaryrefslogtreecommitdiff
path: root/src/fs
diff options
context:
space:
mode:
author Christian Grothoff <christian@grothoff.org> 2012-01-14 15:20:55 +0000
committer Christian Grothoff <christian@grothoff.org> 2012-01-14 15:20:55 +0000
commit ea58fb0d848465e40c8b2a56ac482a4afabfa7d4 (patch)
tree 2ec4246ff075a10f216dc58a41db43dbb7a34fba /src/fs
parent ea03f11cfac38afdad7ea45b3a7787bd9b4f4711 (diff)
download gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.tar.gz
gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.zip
LRN: skip short keywords when generating keywords automatically from metadata
Diffstat (limited to 'src/fs')
-rw-r--r-- src/fs/fs_uri.c 24
1 files changed, 21 insertions, 3 deletions
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index f8bd76f56..157295b09 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -1597,11 +1597,17 @@ get_keywords_from_parens (const char *s, char **array, int index)
1597 } 1597 }
1598 if (match && (close_paren - open_paren > 1)) 1598 if (match && (close_paren - open_paren > 1))
1599 { 1599 {
1600 tmp = close_paren[0];
1601 close_paren[0] = '\0';
1602 /* Keywords must be at least 3 characters long */
1603 if (u8_strlen ((const uint8_t *) &open_paren[1]) <= 2)
1604 {
1605 close_paren[0] = tmp;
1606 continue;
1607 }
1600 if (NULL != array) 1608 if (NULL != array)
1601 { 1609 {
1602 char *normalized; 1610 char *normalized;
1603 tmp = close_paren[0];
1604 close_paren[0] = '\0';
1605 if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], 1611 if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1],
1606 (const char **) array, index + count)) 1612 (const char **) array, index + count))
1607 { 1613 {
@@ -1622,10 +1628,10 @@ get_keywords_from_parens (const char *s, char **array, int index)
1622 } 1628 }
1623 GNUNET_free (normalized); 1629 GNUNET_free (normalized);
1624 } 1630 }
1625 close_paren[0] = tmp;
1626 } 1631 }
1627 else 1632 else
1628 count++; 1633 count++;
1634 close_paren[0] = tmp;
1629 } 1635 }
1630 } 1636 }
1631 GNUNET_free (ss); 1637 GNUNET_free (ss);
@@ -1662,6 +1668,9 @@ get_keywords_from_tokens (const char *s, char **array, int index)
1662 ss = GNUNET_strdup (s); 1668 ss = GNUNET_strdup (s);
1663 for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS)) 1669 for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS))
1664 { 1670 {
1671 /* Keywords must be at least 3 characters long */
1672 if (u8_strlen ((const uint8_t *) p) <= 2)
1673 continue;
1665 if (NULL != array) 1674 if (NULL != array)
1666 { 1675 {
1667 char *normalized; 1676 char *normalized;
@@ -1721,6 +1730,15 @@ gather_uri_data (void *cls, const char *plugin_name,
1721 if ((format != EXTRACTOR_METAFORMAT_UTF8) && 1730 if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
1722 (format != EXTRACTOR_METAFORMAT_C_STRING)) 1731 (format != EXTRACTOR_METAFORMAT_C_STRING))
1723 return 0; 1732 return 0;
1733 /* Keywords must be at least 3 characters long
1734 * If given non-utf8 string it will, most likely, find it to be invalid,
1735 * and will return the length of its valid part, skipping the keyword.
1736 * If it does - fix the extractor, not this check!
1737 */
1738 if (u8_strlen ((const uint8_t *) data) <= 2)
1739 {
1740 return 0;
1741 }
1724 normalized_data = normalize_metadata (format, data, data_len); 1742 normalized_data = normalize_metadata (format, data, data_len);
1725 if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) 1743 if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
1726 { 1744 {