aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/html_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/html_extractor.c')
-rw-r--r--src/plugins/html_extractor.c40
1 files changed, 20 insertions, 20 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c
index 6ac0809..8cd4aba 100644
--- a/src/plugins/html_extractor.c
+++ b/src/plugins/html_extractor.c
@@ -27,7 +27,7 @@
27#include "extractor.h" 27#include "extractor.h"
28#include <magic.h> 28#include <magic.h>
29#include <tidy/tidy.h> 29#include <tidy/tidy.h>
30#include <tidy/buffio.h> 30#include <tidy/tidybuffio.h>
31 31
32/** 32/**
33 * Mapping of HTML META names to LE types. 33 * Mapping of HTML META names to LE types.
@@ -59,7 +59,7 @@ static struct
59 { "rights", EXTRACTOR_METATYPE_RIGHTS }, 59 { "rights", EXTRACTOR_METATYPE_RIGHTS },
60 { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, 60 { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
61 { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, 61 { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
62 { "language", EXTRACTOR_METATYPE_LANGUAGE }, 62 { "language", EXTRACTOR_METATYPE_LANGUAGE },
63 { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, 63 { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
64 { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, 64 { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
65 { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 65 { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
@@ -82,7 +82,7 @@ static magic_t magic;
82 * @param tag tag to map 82 * @param tag tag to map
83 * @return EXTRACTOR_METATYPE_RESERVED if the type was not found 83 * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
84 */ 84 */
85static enum EXTRACTOR_MetaType 85static enum EXTRACTOR_MetaType
86tag_to_type (const char *tag) 86tag_to_type (const char *tag)
87{ 87{
88 unsigned int i; 88 unsigned int i;
@@ -146,7 +146,7 @@ static void TIDY_CALL
146unget_byte_cb (void *sourceData, byte bt) 146unget_byte_cb (void *sourceData, byte bt)
147{ 147{
148 struct EXTRACTOR_ExtractContext *ec = sourceData; 148 struct EXTRACTOR_ExtractContext *ec = sourceData;
149 149
150 (void) ec->seek (ec->cls, -1, SEEK_CUR); 150 (void) ec->seek (ec->cls, -1, SEEK_CUR);
151} 151}
152 152
@@ -167,11 +167,11 @@ eof_cb (void *sourceData)
167 167
168 168
169/** 169/**
170 * Main entry method for the 'text/html' extraction plugin. 170 * Main entry method for the 'text/html' extraction plugin.
171 * 171 *
172 * @param ec extraction context provided to the plugin 172 * @param ec extraction context provided to the plugin
173 */ 173 */
174void 174void
175EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) 175EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
176{ 176{
177 TidyDoc doc; 177 TidyDoc doc;
@@ -250,9 +250,9 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
250 case TidyNode_Php: 250 case TidyNode_Php:
251 break; 251 break;
252 case TidyNode_XmlDecl: 252 case TidyNode_XmlDecl:
253 break; 253 break;
254 case TidyNode_Start: 254 case TidyNode_Start:
255 case TidyNode_StartEnd: 255 case TidyNode_StartEnd:
256 name = tidyNodeGetName (child); 256 name = tidyNodeGetName (child);
257 if ( (0 == strcasecmp (name, "title")) && 257 if ( (0 == strcasecmp (name, "title")) &&
258 (NULL != (title = tidyGetChild (child))) ) 258 (NULL != (title = tidyGetChild (child))) )
@@ -278,13 +278,13 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
278 } 278 }
279 if (0 == strcasecmp (name, "meta")) 279 if (0 == strcasecmp (name, "meta"))
280 { 280 {
281 if (NULL == (attr = tidyAttrGetById (child, 281 if (NULL == (attr = tidyAttrGetById (child,
282 TidyAttr_NAME))) 282 TidyAttr_NAME)))
283 break; 283 break;
284 if (EXTRACTOR_METATYPE_RESERVED == 284 if (EXTRACTOR_METATYPE_RESERVED ==
285 (type = tag_to_type (tidyAttrValue (attr)))) 285 (type = tag_to_type (tidyAttrValue (attr))))
286 break; 286 break;
287 if (NULL == (attr = tidyAttrGetById (child, 287 if (NULL == (attr = tidyAttrGetById (child,
288 TidyAttr_CONTENT))) 288 TidyAttr_CONTENT)))
289 break; 289 break;
290 name = tidyAttrValue (attr); 290 name = tidyAttrValue (attr);
@@ -297,14 +297,14 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
297 name, 297 name,
298 strlen (name) + 1)) 298 strlen (name) + 1))
299 goto CLEANUP; 299 goto CLEANUP;
300 break; 300 break;
301 } 301 }
302 break; 302 break;
303 case TidyNode_End: 303 case TidyNode_End:
304 break; 304 break;
305 default: 305 default:
306 break; 306 break;
307 } 307 }
308 } 308 }
309 CLEANUP: 309 CLEANUP:
310 tidyRelease (doc); 310 tidyRelease (doc);
@@ -463,7 +463,7 @@ findInTags (struct TagInfo * t,
463 463
464 464
465/* mimetype = text/html */ 465/* mimetype = text/html */
466int 466int
467EXTRACTOR_html_extract (const char *data, 467EXTRACTOR_html_extract (const char *data,
468 size_t size, 468 size_t size,
469 EXTRACTOR_MetaDataProcessor proc, 469 EXTRACTOR_MetaDataProcessor proc,
@@ -562,7 +562,7 @@ EXTRACTOR_html_extract (const char *data,
562 if text/html is present, we take that as the mime-type; if charset= 562 if text/html is present, we take that as the mime-type; if charset=
563 is present, we try to use that for character set conversion. */ 563 is present, we try to use that for character set conversion. */
564 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) 564 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
565 ret = proc (proc_cls, 565 ret = proc (proc_cls,
566 "html", 566 "html",
567 EXTRACTOR_METATYPE_MIMETYPE, 567 EXTRACTOR_METATYPE_MIMETYPE,
568 EXTRACTOR_METAFORMAT_UTF8, 568 EXTRACTOR_METAFORMAT_UTF8,
@@ -613,7 +613,7 @@ EXTRACTOR_html_extract (const char *data,
613 free (tmp); 613 free (tmp);
614 i++; 614 i++;
615 } 615 }
616 while (tags != NULL) 616 while (tags != NULL)
617 { 617 {
618 t = tags; 618 t = tags;
619 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && 619 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
@@ -667,7 +667,7 @@ EXTRACTOR_html_extract (const char *data,
667/** 667/**
668 * Initialize glib and load magic file. 668 * Initialize glib and load magic file.
669 */ 669 */
670void __attribute__ ((constructor)) 670void __attribute__ ((constructor))
671html_gobject_init () 671html_gobject_init ()
672{ 672{
673 magic = magic_open (MAGIC_MIME_TYPE); 673 magic = magic_open (MAGIC_MIME_TYPE);
@@ -681,8 +681,8 @@ html_gobject_init ()
681/** 681/**
682 * Destructor for the library, cleans up. 682 * Destructor for the library, cleans up.
683 */ 683 */
684void __attribute__ ((destructor)) 684void __attribute__ ((destructor))
685html_ltdl_fini () 685html_ltdl_fini ()
686{ 686{
687 if (NULL != magic) 687 if (NULL != magic)
688 { 688 {