aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorChristian Grothoff <christian@grothoff.org>2011-12-24 16:04:53 +0000
committerChristian Grothoff <christian@grothoff.org>2011-12-24 16:04:53 +0000
commit8feaaffac09035bd7203d456f5bf96d79fa49be8 (patch)
treea687163070b7e3534431ee267a55dab7ce9c1554 /src
parenta298109020987faa3595e0414c44f6d10660132c (diff)
downloadgnunet-8feaaffac09035bd7203d456f5bf96d79fa49be8.tar.gz
gnunet-8feaaffac09035bd7203d456f5bf96d79fa49be8.zip
-LRN/CG: extract keywords from file names (#2032)
Diffstat (limited to 'src')
-rw-r--r--src/fs/fs_uri.c197
1 files changed, 188 insertions, 9 deletions
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c
index 62fd51304..9da78b6f4 100644
--- a/src/fs/fs_uri.c
+++ b/src/fs/fs_uri.c
@@ -1579,6 +1579,162 @@ GNUNET_FS_uri_test_loc (const struct GNUNET_FS_Uri *uri)
1579 1579
1580 1580
/**
 * Format a keyword with a leading ' ' (the marker for a
 * non-mandatory keyword) and store the resulting string in
 * the keyword list at position 'index'.
 *
 * No bounds checking is done here: the caller has to make sure
 * that the array has room for an entry at 'index'.
 *
 * @param s keyword to add (without any prefix)
 * @param array keyword list receiving the new entry
 * @param index slot in 'array' that is overwritten with the new entry
 */
static void
insert_non_mandatory_keyword (const char *s, char **array, int index)
{
  /* the space in front of the keyword marks it as 'non mandatory' */
  GNUNET_asprintf (&array[index], " %s", s);
}
1597
1598
1599/**
1600 * Test if the given keyword 's' is already present in the
1601 * given array, ignoring the '+'-mandatory prefix in the array.
1602 *
1603 * @param s keyword to test
1604 * @param array keywords to test against, with ' ' or '+' prefix to ignore
1605 * @param array_length length of the array
1606 * @return GNUNET_YES if the keyword exists, GNUNET_NO if not
1607 */
1608static int
1609find_duplicate (const char *s, const char **array, int array_length)
1610{
1611 int j;
1612
1613 for (j = array_length - 1; j >= 0; j--)
1614 if (0 == strcmp (&array[j][1], s))
1615 return GNUNET_YES;
1616 return GNUNET_NO;
1617}
1618
1619
1620/**
1621 * Break the filename up by matching [], () and {} pairs to make
1622 * keywords. In case of nesting parentheses only the inner pair counts.
1623 * You can't escape parentheses to scan something like "[blah\{foo]" to
1624 * make a "blah{foo" keyword, this function is only a heuristic!
1625 *
1626 * @param s string to break down.
1627 * @param array array to fill with enclosed tokens. If NULL, then tokens
1628 * are only counted.
1629 * @param index index at which to start filling the array (entries prior
1630 * to it are used to check for duplicates). ignored if array == NULL.
1631 * @return number of tokens counted (including duplicates), or number of
1632 * tokens extracted (excluding duplicates). 0 if there are no
1633 * matching parens in the string (when counting), or when all tokens
1634 * were duplicates (when extracting).
1635 */
1636static int
1637get_keywords_from_parens (const char *s, char **array, int index)
1638{
1639 int count = 0;
1640 char *open_paren;
1641 char *close_paren;
1642 char *ss;
1643 char tmp;
1644
1645 if (NULL == s)
1646 return 0;
1647 ss = GNUNET_strdup (s);
1648 open_paren = ss - 1;
1649 while (NULL != (open_paren = strpbrk (open_paren + 1, "[{(")))
1650 {
1651 int match = 0;
1652
1653 close_paren = strpbrk (open_paren + 1, "]})");
1654 if (NULL == close_paren)
1655 continue;
1656 switch (open_paren[0])
1657 {
1658 case '[':
1659 if (']' == close_paren[0])
1660 match = 1;
1661 break;
1662 case '{':
1663 if ('}' == close_paren[0])
1664 match = 1;
1665 break;
1666 case '(':
1667 if (')' == close_paren[0])
1668 match = 1;
1669 break;
1670 default:
1671 break;
1672 }
1673 if (match && (close_paren - open_paren > 1))
1674 {
1675 if (NULL != array)
1676 {
1677 tmp = close_paren[0];
1678 close_paren[0] = '\0';
1679 if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], (const char **) array, index + count))
1680 {
1681 insert_non_mandatory_keyword ((const char *) &open_paren[1], array,
1682 index + count);
1683 count++;
1684 }
1685 close_paren[0] = tmp;
1686 }
1687 else
1688 count++;
1689 }
1690 }
1691 GNUNET_free (ss);
1692 return count;
1693}
1694
1695
1696/**
1697 * Break the filename up by "_", " " and "." (any other separators?) to make
1698 * keywords.
1699 *
1700 * @param s string to break down.
1701 * @param array array to fill with tokens. If NULL, then tokens are only
1702 * counted.
1703 * @param index index at which to start filling the array (entries prior
1704 * to it are used to check for duplicates). ignored if array == NULL.
1705 * @return number of tokens (>1) counted (including duplicates), or number of
1706 * tokens extracted (excluding duplicates). 0 if there are no
1707 * separators in the string (when counting), or when all tokens were
1708 * duplicates (when extracting).
1709 */
1710static int
1711get_keywords_from_tokens (const char *s, char **array, int index)
1712{
1713 char *p;
1714 char *ss;
1715 int seps = 0;
1716
1717 ss = GNUNET_strdup (s);
1718 for (p = strtok (ss, "_. "); p != NULL; p = strtok (NULL, "_, "))
1719 {
1720 if (NULL != array)
1721 {
1722 if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps))
1723 {
1724 insert_non_mandatory_keyword (p, array,
1725 index + seps);
1726 seps++;
1727 }
1728 }
1729 else
1730 seps++;
1731 }
1732 GNUNET_free (ss);
1733 return seps;
1734}
1735
1736
1737/**
1582 * Function called on each value in the meta data. 1738 * Function called on each value in the meta data.
1583 * Adds it to the URI. 1739 * Adds it to the URI.
1584 * 1740 *
@@ -1601,18 +1757,15 @@ gather_uri_data (void *cls, const char *plugin_name,
1601 const char *data_mime_type, const char *data, size_t data_len) 1757 const char *data_mime_type, const char *data, size_t data_len)
1602{ 1758{
1603 struct GNUNET_FS_Uri *uri = cls; 1759 struct GNUNET_FS_Uri *uri = cls;
1604 char *nkword;
1605 int j;
1606 1760
1607 if ((format != EXTRACTOR_METAFORMAT_UTF8) && 1761 if ((format != EXTRACTOR_METAFORMAT_UTF8) &&
1608 (format != EXTRACTOR_METAFORMAT_C_STRING)) 1762 (format != EXTRACTOR_METAFORMAT_C_STRING))
1609 return 0; 1763 return 0;
1610 for (j = uri->data.ksk.keywordCount - 1; j >= 0; j--) 1764 if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount))
1611 if (0 == strcmp (&uri->data.ksk.keywords[j][1], data)) 1765 return GNUNET_OK;
1612 return GNUNET_OK; 1766 insert_non_mandatory_keyword (data,
1613 GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */ 1767 uri->data.ksk.keywords, uri->data.ksk.keywordCount);
1614 data); 1768 uri->data.ksk.keywordCount++;
1615 uri->data.ksk.keywords[uri->data.ksk.keywordCount++] = nkword;
1616 return 0; 1769 return 0;
1617} 1770}
1618 1771
@@ -1630,7 +1783,12 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
1630 *md) 1783 *md)
1631{ 1784{
1632 struct GNUNET_FS_Uri *ret; 1785 struct GNUNET_FS_Uri *ret;
1786 char *filename;
1787 char *full_name;
1788 char *ss;
1633 int ent; 1789 int ent;
1790 int tok_keywords = 0;
1791 int paren_keywords = 0;
1634 1792
1635 if (md == NULL) 1793 if (md == NULL)
1636 return NULL; 1794 return NULL;
@@ -1639,9 +1797,30 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData
1639 ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL); 1797 ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL);
1640 if (ent > 0) 1798 if (ent > 0)
1641 { 1799 {
1642 ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * ent); 1800 full_name = GNUNET_CONTAINER_meta_data_get_first_by_types (md,
1801 EXTRACTOR_METATYPE_FILENAME, -1);
1802 if (NULL != full_name)
1803 {
1804 filename = full_name;
1805 while (NULL != (ss = strstr (filename, DIR_SEPARATOR_STR)))
1806 filename = ss + 1;
1807 tok_keywords = get_keywords_from_tokens (filename, NULL, 0);
1808 paren_keywords = get_keywords_from_parens (filename, NULL, 0);
1809 }
1810 ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent
1811 + tok_keywords + paren_keywords));
1643 GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret); 1812 GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret);
1644 } 1813 }
1814 if (tok_keywords > 0)
1815 ret->data.ksk.keywordCount += get_keywords_from_tokens (filename,
1816 ret->data.ksk.keywords,
1817 ret->data.ksk.keywordCount);
1818 if (paren_keywords > 0)
1819 ret->data.ksk.keywordCount += get_keywords_from_parens (filename,
1820 ret->data.ksk.keywords,
1821 ret->data.ksk.keywordCount);
1822 if (ent > 0)
1823 GNUNET_free (full_name);
1645 return ret; 1824 return ret;
1646} 1825}
1647 1826