LRN: skip short keywords when generating keywords automatically from metadata

author: Christian Grothoff <christian@grothoff.org> 2012-01-14 15:20:55 +0000
committer: Christian Grothoff <christian@grothoff.org> 2012-01-14 15:20:55 +0000
commit: ea58fb0d848465e40c8b2a56ac482a4afabfa7d4 (patch)
tree: 2ec4246ff075a10f216dc58a41db43dbb7a34fba /src
parent: ea03f11cfac38afdad7ea45b3a7787bd9b4f4711 (diff)
download: gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.tar.gz
gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.zip
1 file changed, 21 insertions, 3 deletions
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index f8bd76f56..157295b09 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -1597,11 +1597,17 @@ get_keywords_from_parens (const char *s, char **array, int index)
    }
    if (match && (close_paren - open_paren > 1))
    {
+      tmp = close_paren[0];
+      close_paren[0] = '\0';
+      /* Keywords must be at least 3 characters long */
+      if (u8_strlen ((const uint8_t *) &open_paren[1]) <= 2)
+      {
+        close_paren[0] = tmp;
+        continue;
+      }
      if (NULL != array)
      {
        char *normalized;
-        tmp = close_paren[0];
-        close_paren[0] = '\0';
        if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1],
            (const char **) array, index + count))
        {
@@ -1622,10 +1628,10 @@ get_keywords_from_parens (const char *s, char **array, int index)
          }
          GNUNET_free (normalized);
        }
-        close_paren[0] = tmp;
      }
      else
        count++;
+      close_paren[0] = tmp;
    }   
  }
  GNUNET_free (ss);
@@ -1662,6 +1668,9 @@ get_keywords_from_tokens (const char *s, char **array, int index)
  ss = GNUNET_strdup (s);
  for (p = strtok (ss, TOKENS); p != NULL; p = strtok (NULL, TOKENS))
  {
+    /* Keywords must be at least 3 characters long */
+    if (u8_strlen ((const uint8_t *) p) <= 2)
+      continue;
    if (NULL != array)
    {
      char *normalized;
@@ -1721,6 +1730,15 @@ gather_uri_data (void *cls, const char *plugin_name,
  if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
      (format != EXTRACTOR_METAFORMAT_C_STRING))
    return 0;
+  /* Keywords must be at least 3 characters long.
+   * If given a non-UTF-8 string it will, most likely, find it to be invalid
+   * and return only the length of its valid part, so the keyword is skipped.
+   * If that happens, fix the extractor, not this check!
+   */
+  if (u8_strlen ((const uint8_t *) data) <= 2)
+  {
+    return 0;
+  }
  normalized_data = normalize_metadata (format, data, data_len);
  if (!find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
  {
author	Christian Grothoff <christian@grothoff.org>	2012-01-14 15:20:55 +0000
committer	Christian Grothoff <christian@grothoff.org>	2012-01-14 15:20:55 +0000
commit	ea58fb0d848465e40c8b2a56ac482a4afabfa7d4 (patch)
tree	2ec4246ff075a10f216dc58a41db43dbb7a34fba /src
parent	ea03f11cfac38afdad7ea45b3a7787bd9b4f4711 (diff)
download	gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.tar.gz gnunet-ea58fb0d848465e40c8b2a56ac482a4afabfa7d4.zip