diff options
author | Christian Grothoff <christian@grothoff.org> | 2011-12-24 16:04:53 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2011-12-24 16:04:53 +0000 |
commit | 8feaaffac09035bd7203d456f5bf96d79fa49be8 (patch) | |
tree | a687163070b7e3534431ee267a55dab7ce9c1554 /src/fs/fs_uri.c | |
parent | a298109020987faa3595e0414c44f6d10660132c (diff) | |
download | gnunet-8feaaffac09035bd7203d456f5bf96d79fa49be8.tar.gz gnunet-8feaaffac09035bd7203d456f5bf96d79fa49be8.zip |
-LRN/CG: extract keywords from file names (#2032)
Diffstat (limited to 'src/fs/fs_uri.c')
-rw-r--r-- | src/fs/fs_uri.c | 197 |
1 files changed, 188 insertions, 9 deletions
diff --git a/src/fs/fs_uri.c b/src/fs/fs_uri.c index 62fd51304..9da78b6f4 100644 --- a/src/fs/fs_uri.c +++ b/src/fs/fs_uri.c | |||
@@ -1579,6 +1579,162 @@ GNUNET_FS_uri_test_loc (const struct GNUNET_FS_Uri *uri) | |||
1579 | 1579 | ||
1580 | 1580 | ||
/**
 * Store keyword 's', prefixed with a single space (the marker for a
 * non-mandatory keyword), in slot 'index' of 'array'.  The string is
 * produced by GNUNET_asprintf.  No bounds check is performed here:
 * the caller must guarantee that the slot exists.
 *
 * @param s keyword text to store
 * @param array keyword array that receives the formatted string
 * @param index slot of 'array' to write to
 */
static void
insert_non_mandatory_keyword (const char *s, char **array, int index)
{
  /* the leading ' ' marks the keyword as 'non mandatory' */
  GNUNET_asprintf (&array[index], " %s", s);
}
1597 | |||
1598 | |||
1599 | /** | ||
1600 | * Test if the given keyword 's' is already present in the | ||
1601 | * given array, ignoring the '+'-mandatory prefix in the array. | ||
1602 | * | ||
1603 | * @param s keyword to test | ||
1604 | * @param array keywords to test against, with ' ' or '+' prefix to ignore | ||
1605 | * @param array_length length of the array | ||
1606 | * @return GNUNET_YES if the keyword exists, GNUNET_NO if not | ||
1607 | */ | ||
1608 | static int | ||
1609 | find_duplicate (const char *s, const char **array, int array_length) | ||
1610 | { | ||
1611 | int j; | ||
1612 | |||
1613 | for (j = array_length - 1; j >= 0; j--) | ||
1614 | if (0 == strcmp (&array[j][1], s)) | ||
1615 | return GNUNET_YES; | ||
1616 | return GNUNET_NO; | ||
1617 | } | ||
1618 | |||
1619 | |||
1620 | /** | ||
1621 | * Break the filename up by matching [], () and {} pairs to make | ||
1622 | * keywords. In case of nesting parentheses only the inner pair counts. | ||
1623 | * You can't escape parentheses to scan something like "[blah\{foo]" to | ||
1624 | * make a "blah{foo" keyword, this function is only a heuristic! | ||
1625 | * | ||
1626 | * @param s string to break down. | ||
1627 | * @param array array to fill with enclosed tokens. If NULL, then tokens | ||
1628 | * are only counted. | ||
1629 | * @param index index at which to start filling the array (entries prior | ||
1630 | * to it are used to check for duplicates). ignored if array == NULL. | ||
1631 | * @return number of tokens counted (including duplicates), or number of | ||
1632 | * tokens extracted (excluding duplicates). 0 if there are no | ||
1633 | * matching parens in the string (when counting), or when all tokens | ||
1634 | * were duplicates (when extracting). | ||
1635 | */ | ||
1636 | static int | ||
1637 | get_keywords_from_parens (const char *s, char **array, int index) | ||
1638 | { | ||
1639 | int count = 0; | ||
1640 | char *open_paren; | ||
1641 | char *close_paren; | ||
1642 | char *ss; | ||
1643 | char tmp; | ||
1644 | |||
1645 | if (NULL == s) | ||
1646 | return 0; | ||
1647 | ss = GNUNET_strdup (s); | ||
1648 | open_paren = ss - 1; | ||
1649 | while (NULL != (open_paren = strpbrk (open_paren + 1, "[{("))) | ||
1650 | { | ||
1651 | int match = 0; | ||
1652 | |||
1653 | close_paren = strpbrk (open_paren + 1, "]})"); | ||
1654 | if (NULL == close_paren) | ||
1655 | continue; | ||
1656 | switch (open_paren[0]) | ||
1657 | { | ||
1658 | case '[': | ||
1659 | if (']' == close_paren[0]) | ||
1660 | match = 1; | ||
1661 | break; | ||
1662 | case '{': | ||
1663 | if ('}' == close_paren[0]) | ||
1664 | match = 1; | ||
1665 | break; | ||
1666 | case '(': | ||
1667 | if (')' == close_paren[0]) | ||
1668 | match = 1; | ||
1669 | break; | ||
1670 | default: | ||
1671 | break; | ||
1672 | } | ||
1673 | if (match && (close_paren - open_paren > 1)) | ||
1674 | { | ||
1675 | if (NULL != array) | ||
1676 | { | ||
1677 | tmp = close_paren[0]; | ||
1678 | close_paren[0] = '\0'; | ||
1679 | if (GNUNET_NO == find_duplicate ((const char *) &open_paren[1], (const char **) array, index + count)) | ||
1680 | { | ||
1681 | insert_non_mandatory_keyword ((const char *) &open_paren[1], array, | ||
1682 | index + count); | ||
1683 | count++; | ||
1684 | } | ||
1685 | close_paren[0] = tmp; | ||
1686 | } | ||
1687 | else | ||
1688 | count++; | ||
1689 | } | ||
1690 | } | ||
1691 | GNUNET_free (ss); | ||
1692 | return count; | ||
1693 | } | ||
1694 | |||
1695 | |||
1696 | /** | ||
1697 | * Break the filename up by "_", " " and "." (any other separators?) to make | ||
1698 | * keywords. | ||
1699 | * | ||
1700 | * @param s string to break down. | ||
1701 | * @param array array to fill with tokens. If NULL, then tokens are only | ||
1702 | * counted. | ||
1703 | * @param index index at which to start filling the array (entries prior | ||
1704 | * to it are used to check for duplicates). ignored if array == NULL. | ||
1705 | * @return number of tokens (>1) counted (including duplicates), or number of | ||
1706 | * tokens extracted (excluding duplicates). 0 if there are no | ||
1707 | * separators in the string (when counting), or when all tokens were | ||
1708 | * duplicates (when extracting). | ||
1709 | */ | ||
1710 | static int | ||
1711 | get_keywords_from_tokens (const char *s, char **array, int index) | ||
1712 | { | ||
1713 | char *p; | ||
1714 | char *ss; | ||
1715 | int seps = 0; | ||
1716 | |||
1717 | ss = GNUNET_strdup (s); | ||
1718 | for (p = strtok (ss, "_. "); p != NULL; p = strtok (NULL, "_, ")) | ||
1719 | { | ||
1720 | if (NULL != array) | ||
1721 | { | ||
1722 | if (GNUNET_NO == find_duplicate (p, (const char **) array, index + seps)) | ||
1723 | { | ||
1724 | insert_non_mandatory_keyword (p, array, | ||
1725 | index + seps); | ||
1726 | seps++; | ||
1727 | } | ||
1728 | } | ||
1729 | else | ||
1730 | seps++; | ||
1731 | } | ||
1732 | GNUNET_free (ss); | ||
1733 | return seps; | ||
1734 | } | ||
1735 | |||
1736 | |||
1737 | /** | ||
1582 | * Function called on each value in the meta data. | 1738 | * Function called on each value in the meta data. |
1583 | * Adds it to the URI. | 1739 | * Adds it to the URI. |
1584 | * | 1740 | * |
@@ -1601,18 +1757,15 @@ gather_uri_data (void *cls, const char *plugin_name, | |||
1601 | const char *data_mime_type, const char *data, size_t data_len) | 1757 | const char *data_mime_type, const char *data, size_t data_len) |
1602 | { | 1758 | { |
1603 | struct GNUNET_FS_Uri *uri = cls; | 1759 | struct GNUNET_FS_Uri *uri = cls; |
1604 | char *nkword; | ||
1605 | int j; | ||
1606 | 1760 | ||
1607 | if ((format != EXTRACTOR_METAFORMAT_UTF8) && | 1761 | if ((format != EXTRACTOR_METAFORMAT_UTF8) && |
1608 | (format != EXTRACTOR_METAFORMAT_C_STRING)) | 1762 | (format != EXTRACTOR_METAFORMAT_C_STRING)) |
1609 | return 0; | 1763 | return 0; |
1610 | for (j = uri->data.ksk.keywordCount - 1; j >= 0; j--) | 1764 | if (find_duplicate (data, (const char **) uri->data.ksk.keywords, uri->data.ksk.keywordCount)) |
1611 | if (0 == strcmp (&uri->data.ksk.keywords[j][1], data)) | 1765 | return GNUNET_OK; |
1612 | return GNUNET_OK; | 1766 | insert_non_mandatory_keyword (data, |
1613 | GNUNET_asprintf (&nkword, " %s", /* space to mark as 'non mandatory' */ | 1767 | uri->data.ksk.keywords, uri->data.ksk.keywordCount); |
1614 | data); | 1768 | uri->data.ksk.keywordCount++; |
1615 | uri->data.ksk.keywords[uri->data.ksk.keywordCount++] = nkword; | ||
1616 | return 0; | 1769 | return 0; |
1617 | } | 1770 | } |
1618 | 1771 | ||
@@ -1630,7 +1783,12 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData | |||
1630 | *md) | 1783 | *md) |
1631 | { | 1784 | { |
1632 | struct GNUNET_FS_Uri *ret; | 1785 | struct GNUNET_FS_Uri *ret; |
1786 | char *filename; | ||
1787 | char *full_name; | ||
1788 | char *ss; | ||
1633 | int ent; | 1789 | int ent; |
1790 | int tok_keywords = 0; | ||
1791 | int paren_keywords = 0; | ||
1634 | 1792 | ||
1635 | if (md == NULL) | 1793 | if (md == NULL) |
1636 | return NULL; | 1794 | return NULL; |
@@ -1639,9 +1797,30 @@ GNUNET_FS_uri_ksk_create_from_meta_data (const struct GNUNET_CONTAINER_MetaData | |||
1639 | ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL); | 1797 | ent = GNUNET_CONTAINER_meta_data_iterate (md, NULL, NULL); |
1640 | if (ent > 0) | 1798 | if (ent > 0) |
1641 | { | 1799 | { |
1642 | ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * ent); | 1800 | full_name = GNUNET_CONTAINER_meta_data_get_first_by_types (md, |
1801 | EXTRACTOR_METATYPE_FILENAME, -1); | ||
1802 | if (NULL != full_name) | ||
1803 | { | ||
1804 | filename = full_name; | ||
1805 | while (NULL != (ss = strstr (filename, DIR_SEPARATOR_STR))) | ||
1806 | filename = ss + 1; | ||
1807 | tok_keywords = get_keywords_from_tokens (filename, NULL, 0); | ||
1808 | paren_keywords = get_keywords_from_parens (filename, NULL, 0); | ||
1809 | } | ||
1810 | ret->data.ksk.keywords = GNUNET_malloc (sizeof (char *) * (ent | ||
1811 | + tok_keywords + paren_keywords)); | ||
1643 | GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret); | 1812 | GNUNET_CONTAINER_meta_data_iterate (md, &gather_uri_data, ret); |
1644 | } | 1813 | } |
1814 | if (tok_keywords > 0) | ||
1815 | ret->data.ksk.keywordCount += get_keywords_from_tokens (filename, | ||
1816 | ret->data.ksk.keywords, | ||
1817 | ret->data.ksk.keywordCount); | ||
1818 | if (paren_keywords > 0) | ||
1819 | ret->data.ksk.keywordCount += get_keywords_from_parens (filename, | ||
1820 | ret->data.ksk.keywords, | ||
1821 | ret->data.ksk.keywordCount); | ||
1822 | if (ent > 0) | ||
1823 | GNUNET_free (full_name); | ||
1645 | return ret; | 1824 | return ret; |
1646 | } | 1825 | } |
1647 | 1826 | ||