diff options
Diffstat (limited to 'src/plugins/html_extractor.c')
-rw-r--r-- | src/plugins/html_extractor.c | 40 |
1 files changed, 20 insertions, 20 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c index 6ac0809..8cd4aba 100644 --- a/src/plugins/html_extractor.c +++ b/src/plugins/html_extractor.c | |||
@@ -27,7 +27,7 @@ | |||
27 | #include "extractor.h" | 27 | #include "extractor.h" |
28 | #include <magic.h> | 28 | #include <magic.h> |
29 | #include <tidy/tidy.h> | 29 | #include <tidy/tidy.h> |
30 | #include <tidy/buffio.h> | 30 | #include <tidy/tidybuffio.h> |
31 | 31 | ||
32 | /** | 32 | /** |
33 | * Mapping of HTML META names to LE types. | 33 | * Mapping of HTML META names to LE types. |
@@ -59,7 +59,7 @@ static struct | |||
59 | { "rights", EXTRACTOR_METATYPE_RIGHTS }, | 59 | { "rights", EXTRACTOR_METATYPE_RIGHTS }, |
60 | { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, | 60 | { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, |
61 | { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, | 61 | { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, |
62 | { "language", EXTRACTOR_METATYPE_LANGUAGE }, | 62 | { "language", EXTRACTOR_METATYPE_LANGUAGE }, |
63 | { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, | 63 | { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, |
64 | { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, | 64 | { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, |
65 | { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, | 65 | { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, |
@@ -82,7 +82,7 @@ static magic_t magic; | |||
82 | * @param tag tag to map | 82 | * @param tag tag to map |
83 | * @return EXTRACTOR_METATYPE_RESERVED if the type was not found | 83 | * @return EXTRACTOR_METATYPE_RESERVED if the type was not found |
84 | */ | 84 | */ |
85 | static enum EXTRACTOR_MetaType | 85 | static enum EXTRACTOR_MetaType |
86 | tag_to_type (const char *tag) | 86 | tag_to_type (const char *tag) |
87 | { | 87 | { |
88 | unsigned int i; | 88 | unsigned int i; |
@@ -146,7 +146,7 @@ static void TIDY_CALL | |||
146 | unget_byte_cb (void *sourceData, byte bt) | 146 | unget_byte_cb (void *sourceData, byte bt) |
147 | { | 147 | { |
148 | struct EXTRACTOR_ExtractContext *ec = sourceData; | 148 | struct EXTRACTOR_ExtractContext *ec = sourceData; |
149 | 149 | ||
150 | (void) ec->seek (ec->cls, -1, SEEK_CUR); | 150 | (void) ec->seek (ec->cls, -1, SEEK_CUR); |
151 | } | 151 | } |
152 | 152 | ||
@@ -167,11 +167,11 @@ eof_cb (void *sourceData) | |||
167 | 167 | ||
168 | 168 | ||
169 | /** | 169 | /** |
170 | * Main entry method for the 'text/html' extraction plugin. | 170 | * Main entry method for the 'text/html' extraction plugin. |
171 | * | 171 | * |
172 | * @param ec extraction context provided to the plugin | 172 | * @param ec extraction context provided to the plugin |
173 | */ | 173 | */ |
174 | void | 174 | void |
175 | EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) | 175 | EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) |
176 | { | 176 | { |
177 | TidyDoc doc; | 177 | TidyDoc doc; |
@@ -250,9 +250,9 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) | |||
250 | case TidyNode_Php: | 250 | case TidyNode_Php: |
251 | break; | 251 | break; |
252 | case TidyNode_XmlDecl: | 252 | case TidyNode_XmlDecl: |
253 | break; | 253 | break; |
254 | case TidyNode_Start: | 254 | case TidyNode_Start: |
255 | case TidyNode_StartEnd: | 255 | case TidyNode_StartEnd: |
256 | name = tidyNodeGetName (child); | 256 | name = tidyNodeGetName (child); |
257 | if ( (0 == strcasecmp (name, "title")) && | 257 | if ( (0 == strcasecmp (name, "title")) && |
258 | (NULL != (title = tidyGetChild (child))) ) | 258 | (NULL != (title = tidyGetChild (child))) ) |
@@ -278,13 +278,13 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) | |||
278 | } | 278 | } |
279 | if (0 == strcasecmp (name, "meta")) | 279 | if (0 == strcasecmp (name, "meta")) |
280 | { | 280 | { |
281 | if (NULL == (attr = tidyAttrGetById (child, | 281 | if (NULL == (attr = tidyAttrGetById (child, |
282 | TidyAttr_NAME))) | 282 | TidyAttr_NAME))) |
283 | break; | 283 | break; |
284 | if (EXTRACTOR_METATYPE_RESERVED == | 284 | if (EXTRACTOR_METATYPE_RESERVED == |
285 | (type = tag_to_type (tidyAttrValue (attr)))) | 285 | (type = tag_to_type (tidyAttrValue (attr)))) |
286 | break; | 286 | break; |
287 | if (NULL == (attr = tidyAttrGetById (child, | 287 | if (NULL == (attr = tidyAttrGetById (child, |
288 | TidyAttr_CONTENT))) | 288 | TidyAttr_CONTENT))) |
289 | break; | 289 | break; |
290 | name = tidyAttrValue (attr); | 290 | name = tidyAttrValue (attr); |
@@ -297,14 +297,14 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) | |||
297 | name, | 297 | name, |
298 | strlen (name) + 1)) | 298 | strlen (name) + 1)) |
299 | goto CLEANUP; | 299 | goto CLEANUP; |
300 | break; | 300 | break; |
301 | } | 301 | } |
302 | break; | 302 | break; |
303 | case TidyNode_End: | 303 | case TidyNode_End: |
304 | break; | 304 | break; |
305 | default: | 305 | default: |
306 | break; | 306 | break; |
307 | } | 307 | } |
308 | } | 308 | } |
309 | CLEANUP: | 309 | CLEANUP: |
310 | tidyRelease (doc); | 310 | tidyRelease (doc); |
@@ -463,7 +463,7 @@ findInTags (struct TagInfo * t, | |||
463 | 463 | ||
464 | 464 | ||
465 | /* mimetype = text/html */ | 465 | /* mimetype = text/html */ |
466 | int | 466 | int |
467 | EXTRACTOR_html_extract (const char *data, | 467 | EXTRACTOR_html_extract (const char *data, |
468 | size_t size, | 468 | size_t size, |
469 | EXTRACTOR_MetaDataProcessor proc, | 469 | EXTRACTOR_MetaDataProcessor proc, |
@@ -562,7 +562,7 @@ EXTRACTOR_html_extract (const char *data, | |||
562 | if text/html is present, we take that as the mime-type; if charset= | 562 | if text/html is present, we take that as the mime-type; if charset= |
563 | is present, we try to use that for character set conversion. */ | 563 | is present, we try to use that for character set conversion. */ |
564 | if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) | 564 | if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) |
565 | ret = proc (proc_cls, | 565 | ret = proc (proc_cls, |
566 | "html", | 566 | "html", |
567 | EXTRACTOR_METATYPE_MIMETYPE, | 567 | EXTRACTOR_METATYPE_MIMETYPE, |
568 | EXTRACTOR_METAFORMAT_UTF8, | 568 | EXTRACTOR_METAFORMAT_UTF8, |
@@ -613,7 +613,7 @@ EXTRACTOR_html_extract (const char *data, | |||
613 | free (tmp); | 613 | free (tmp); |
614 | i++; | 614 | i++; |
615 | } | 615 | } |
616 | while (tags != NULL) | 616 | while (tags != NULL) |
617 | { | 617 | { |
618 | t = tags; | 618 | t = tags; |
619 | if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && | 619 | if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && |
@@ -667,7 +667,7 @@ EXTRACTOR_html_extract (const char *data, | |||
667 | /** | 667 | /** |
668 | * Initialize glib and load magic file. | 668 | * Initialize glib and load magic file. |
669 | */ | 669 | */ |
670 | void __attribute__ ((constructor)) | 670 | void __attribute__ ((constructor)) |
671 | html_gobject_init () | 671 | html_gobject_init () |
672 | { | 672 | { |
673 | magic = magic_open (MAGIC_MIME_TYPE); | 673 | magic = magic_open (MAGIC_MIME_TYPE); |
@@ -681,8 +681,8 @@ html_gobject_init () | |||
681 | /** | 681 | /** |
682 | * Destructor for the library, cleans up. | 682 | * Destructor for the library, cleans up. |
683 | */ | 683 | */ |
684 | void __attribute__ ((destructor)) | 684 | void __attribute__ ((destructor)) |
685 | html_ltdl_fini () | 685 | html_ltdl_fini () |
686 | { | 686 | { |
687 | if (NULL != magic) | 687 | if (NULL != magic) |
688 | { | 688 | { |