aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/html_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/html_extractor.c')
-rw-r--r--src/plugins/html_extractor.c694
1 files changed, 694 insertions, 0 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c
new file mode 100644
index 0000000..65fb535
--- /dev/null
+++ b/src/plugins/html_extractor.c
@@ -0,0 +1,694 @@
1/*
2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19
20 */
21/**
22 * @file plugins/html_extractor.c
23 * @brief plugin to support HTML files
24 * @author Christian Grothoff
25 */
26#include "platform.h"
27#include "extractor.h"
28#include <magic.h>
29#include <tidy/tidy.h>
30#include <tidy/buffio.h>
31
32/**
33 * Mapping of HTML META names to LE types.
34 */
35static struct
36{
37 /**
38 * HTML META name.
39 */
40 const char *name;
41
42 /**
43 * Corresponding LE type.
44 */
45 enum EXTRACTOR_MetaType type;
46} tagmap[] = {
47 { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
48 { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
49 { "title", EXTRACTOR_METATYPE_TITLE },
50 { "dc.title", EXTRACTOR_METATYPE_TITLE},
51 { "description", EXTRACTOR_METATYPE_DESCRIPTION },
52 { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
53 { "subject", EXTRACTOR_METATYPE_SUBJECT},
54 { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
55 { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
56 { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
57 { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
58 { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
59 { "rights", EXTRACTOR_METATYPE_RIGHTS },
60 { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
61 { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
62 { "language", EXTRACTOR_METATYPE_LANGUAGE },
63 { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
64 { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
65 { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
66 { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
67 { "dc.identifier", EXTRACTOR_METATYPE_URI },
68 { "dc.format", EXTRACTOR_METATYPE_FORMAT },
69 { NULL, EXTRACTOR_METATYPE_RESERVED }
70};
71
72
73/**
74 * Global handle to MAGIC data.
75 */
76static magic_t magic;
77
78
79/**
80 * Map 'meta' tag to LE type.
81 *
82 * @param tag tag to map
83 * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
84 */
85static enum EXTRACTOR_MetaType
86tag_to_type (const char *tag)
87{
88 unsigned int i;
89
90 for (i=0; NULL != tagmap[i].name; i++)
91 if (0 == strcasecmp (tag,
92 tagmap[i].name))
93 return tagmap[i].type;
94 return EXTRACTOR_METATYPE_RESERVED;
95}
96
97
98/**
99 * Function called by libtidy for error reporting.
100 *
101 * @param doc tidy doc being processed
102 * @param lvl report level
103 * @param line input line
104 * @param col input column
105 * @param mssg message
106 * @return FALSE (no output)
107 */
108static Bool
109report_cb (TidyDoc doc,
110 TidyReportLevel lvl,
111 uint line,
112 uint col,
113 ctmbstr mssg)
114{
115 return 0;
116}
117
118
119/**
120 * Input callback: get next byte of input.
121 *
122 * @param sourceData our 'struct EXTRACTOR_ExtractContext'
123 * @return next byte of input, EndOfStream on errors and EOF
124 */
125static int
126get_byte_cb (void *sourceData)
127{
128 struct EXTRACTOR_ExtractContext *ec = sourceData;
129 void *data;
130
131 if (1 !=
132 ec->read (ec->cls,
133 &data, 1))
134 return EndOfStream;
135 return *(unsigned char*) data;
136}
137
138
139/**
140 * Input callback: unget last byte of input.
141 *
142 * @param sourceData our 'struct EXTRACTOR_ExtractContext'
143 * @param bt byte to unget (ignored)
144 */
145static void
146unget_byte_cb (void *sourceData, byte bt)
147{
148 struct EXTRACTOR_ExtractContext *ec = sourceData;
149
150 (void) ec->seek (ec->cls, -1, SEEK_CUR);
151}
152
153
154/**
155 * Input callback: check for EOF.
156 *
157 * @param sourceData our 'struct EXTRACTOR_ExtractContext'
158 * @return true if we are at the EOF
159 */
160static Bool
161eof_cb (void *sourceData)
162{
163 struct EXTRACTOR_ExtractContext *ec = sourceData;
164
165 return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
166}
167
168
169/**
170 * Main entry method for the 'text/html' extraction plugin.
171 *
172 * @param ec extraction context provided to the plugin
173 */
174void
175EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
176{
177 TidyDoc doc;
178 TidyNode head;
179 TidyNode child;
180 TidyNode title;
181 TidyInputSource src;
182 const char *name;
183 TidyBuffer tbuf;
184 TidyAttr attr;
185 enum EXTRACTOR_MetaType type;
186 ssize_t iret;
187 void *data;
188 const char *mime;
189
190 if (-1 == (iret = ec->read (ec->cls,
191 &data,
192 16 * 1024)))
193 return;
194 if (NULL == (mime = magic_buffer (magic, data, iret)))
195 return;
196 if (0 != strncmp (mime,
197 "text/html",
198 strlen ("text/html")))
199 return; /* not HTML */
200
201 if (0 != ec->seek (ec->cls, 0, SEEK_SET))
202 return; /* seek failed !? */
203
204 tidyInitSource (&src, ec,
205 &get_byte_cb,
206 &unget_byte_cb,
207 &eof_cb);
208 if (NULL == (doc = tidyCreate ()))
209 return;
210 tidySetReportFilter (doc, &report_cb);
211 tidySetAppData (doc, ec);
212 if (0 > tidyParseSource (doc, &src))
213 {
214 tidyRelease (doc);
215 return;
216 }
217 if (1 != tidyStatus (doc))
218 {
219 tidyRelease (doc);
220 return;
221 }
222 if (NULL == (head = tidyGetHead (doc)))
223 {
224 fprintf (stderr, "no head\n");
225 tidyRelease (doc);
226 return;
227 }
228 for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
229 {
230 switch (tidyNodeGetType(child))
231 {
232 case TidyNode_Root:
233 break;
234 case TidyNode_DocType:
235 break;
236 case TidyNode_Comment:
237 break;
238 case TidyNode_ProcIns:
239 break;
240 case TidyNode_Text:
241 break;
242 case TidyNode_CDATA:
243 break;
244 case TidyNode_Section:
245 break;
246 case TidyNode_Asp:
247 break;
248 case TidyNode_Jste:
249 break;
250 case TidyNode_Php:
251 break;
252 case TidyNode_XmlDecl:
253 break;
254 case TidyNode_Start:
255 case TidyNode_StartEnd:
256 name = tidyNodeGetName (child);
257 if ( (0 == strcasecmp (name, "title")) &&
258 (NULL != (title = tidyGetChild (child))) )
259 {
260 tidyBufInit (&tbuf);
261 tidyNodeGetValue (doc, title, &tbuf);
262 /* add 0-termination */
263 tidyBufPutByte (&tbuf, 0);
264 if (0 !=
265 ec->proc (ec->cls,
266 "html",
267 EXTRACTOR_METATYPE_TITLE,
268 EXTRACTOR_METAFORMAT_UTF8,
269 "text/plain",
270 (const char *) tbuf.bp,
271 tbuf.size))
272 {
273 tidyBufFree (&tbuf);
274 goto CLEANUP;
275 }
276 tidyBufFree (&tbuf);
277 break;
278 }
279 if (0 == strcasecmp (name, "meta"))
280 {
281 if (NULL == (attr = tidyAttrGetById (child,
282 TidyAttr_NAME)))
283 break;
284 if (EXTRACTOR_METATYPE_RESERVED ==
285 (type = tag_to_type (tidyAttrValue (attr))))
286 break;
287 if (NULL == (attr = tidyAttrGetById (child,
288 TidyAttr_CONTENT)))
289 break;
290 name = tidyAttrValue (attr);
291 if (0 !=
292 ec->proc (ec->cls,
293 "html",
294 type,
295 EXTRACTOR_METAFORMAT_UTF8,
296 "text/plain",
297 name,
298 strlen (name) + 1))
299 goto CLEANUP;
300 break;
301 }
302 break;
303 case TidyNode_End:
304 break;
305 default:
306 break;
307 }
308 }
309 CLEANUP:
310 tidyRelease (doc);
311}
312
313
314
315#if OLD
316
317
318/* ******************** parser helper functions ************** */
319
/**
 * Compare a tag name against the text in the range [s, e),
 * ignoring case.
 *
 * @param tag 0-terminated tag name
 * @param s first character of the range
 * @param e one past the last character of the range
 * @return 1 if the range has the length of 'tag' and equals it, 0 otherwise
 */
static int
tagMatch (const char *tag, const char *s, const char *e)
{
  if ((e - s) != strlen (tag))
    return 0;
  return 0 == strncasecmp (tag, s, e - s);
}
325
/**
 * Advance '*pos' to the next occurrence of 'c' in 'data'.
 *
 * @param c character to search for
 * @param pos in: where to start; out: position of 'c', or 'size' if
 *        the end was reached; left untouched if a 0-byte is hit first
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if 'c' was found, 0 if the end of the buffer or a
 *         0-byte (other than 'c' itself) was reached first
 */
static int
lookFor (char c, size_t *pos, const char *data, size_t size)
{
  size_t idx;

  for (idx = *pos; idx < size; idx++)
    {
      if (c == data[idx])
        break;
      if ('\0' == data[idx])
        return 0;
    }
  *pos = idx;
  return (idx < size) ? 1 : 0;
}
340
/**
 * Advance '*pos' past any whitespace (as defined by 'isspace').
 *
 * A 0-byte is not whitespace, so the skip simply stops in front of
 * it; the former explicit 0-byte test inside the loop could never
 * trigger and has been dropped.
 *
 * @param pos in: where to start; out: first position that does not
 *        hold whitespace, or 'size' if the end was reached
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if '*pos' is still inside the buffer, 0 if the end was reached
 */
static int
skipWhitespace (size_t *pos, const char *data, size_t size)
{
  size_t p = *pos;

  while ( (p < size) && (isspace ((unsigned char) data[p])) )
    p++;
  *pos = p;
  return p < size;
}
355
/**
 * Advance '*pos' past any letters (as defined by 'isalpha').
 *
 * A 0-byte is not a letter, so the skip simply stops in front of
 * it; the former explicit 0-byte test inside the loop could never
 * trigger and has been dropped.
 *
 * @param pos in: where to start; out: first position that does not
 *        hold a letter, or 'size' if the end was reached
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if '*pos' is still inside the buffer, 0 if the end was reached
 */
static int
skipLetters (size_t *pos, const char *data, size_t size)
{
  size_t p = *pos;

  while ( (p < size) && (isalpha ((unsigned char) data[p])) )
    p++;
  *pos = p;
  return p < size;
}
370
/**
 * Advance '*pos' to the next character of 'data' that is contained
 * in the set 'c'.
 *
 * The 0-byte test has to come before 'strchr': searching a string
 * for '\0' finds its terminator, which would otherwise make every
 * 0-byte in 'data' look like a match.
 *
 * @param c 0-terminated set of characters to search for
 * @param pos in: where to start; out: position of the match, or
 *        'size' if the end was reached; left untouched if a 0-byte
 *        is hit first
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if a character from 'c' was found, 0 if the end of the
 *         buffer or a 0-byte was reached first
 */
static int
lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
{
  size_t p;

  for (p = *pos; p < size; p++)
    {
      if ('\0' == data[p])
        return 0;
      if (NULL != strchr (c, data[p]))
        break;
    }
  *pos = p;
  return p < size;
}
385
/**
 * Locate the value of the attribute 'key' ("key=value") inside the
 * range [start, end).  The key is compared ignoring case.  The first
 * character of the range is never considered as the start of a key.
 * A value that begins with a single or double quote extends to the
 * matching quote (quotes excluded); any other value extends to the
 * next whitespace.  In both cases the value ends at 'end' at the latest.
 *
 * @param key 0-terminated attribute name
 * @param start beginning of the range to search
 * @param end one past the last character of the range
 * @param mstart set to the first character of the value,
 *        NULL if the key was not found
 * @param mend set to one past the last character of the value,
 *        NULL if the key was not found
 */
static void
findEntry (const char *key,
           const char *start,
           const char *end, const char **mstart, const char **mend)
{
  size_t len;

  *mstart = NULL;
  *mend = NULL;
  if ( (NULL == start) || (NULL == end) || (start >= end) )
    return;
  len = strlen (key);
  /* compare the remaining length instead of computing 'end - len - 1',
     which may point in front of the buffer for short ranges */
  while ((size_t) (end - start) > len + 1)
    {
      start++;
      if ('=' != start[len])
        continue;
      if (0 != strncasecmp (start, key, len))
        continue;
      start += len + 1;
      *mstart = start;
      /* 'start' may be equal to 'end' here; do not look at '*end' */
      if ( (start < end) &&
           ( ('\"' == *start) || ('\'' == *start) ) )
        {
          const char quote = *start;

          start++;
          while ( (start < end) && (quote != *start) )
            start++;
          (*mstart)++;          /* skip quote */
        }
      else
        {
          while ( (start < end) && (! isspace ((unsigned char) *start)) )
            start++;
        }
      *mend = start;
      return;
    }
}
422
423/**
424 * Search all tags that correspond to "tagname". Example:
425 * If the tag is <meta name="foo" desc="bar">, and
426 * tagname == "meta", keyname="name", keyvalue="foo",
427 * and searchname="desc", then this function returns a
428 * copy (!) of "bar". Easy enough?
429 *
430 * @return NULL if nothing is found
431 */
432static char *
433findInTags (struct TagInfo * t,
434 const char *tagname,
435 const char *keyname, const char *keyvalue, const char *searchname)
436{
437 const char *pstart;
438 const char *pend;
439
440 while (t != NULL)
441 {
442 if (tagMatch (tagname, t->tagStart, t->tagEnd))
443 {
444 findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
445 if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
446 {
447 findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
448 if (pstart != NULL)
449 {
450 char *ret = malloc (pend - pstart + 1);
451 if (ret == NULL)
452 return NULL;
453 memcpy (ret, pstart, pend - pstart);
454 ret[pend - pstart] = '\0';
455 return ret;
456 }
457 }
458 }
459 t = t->next;
460 }
461 return NULL;
462}
463
464
465/* mimetype = text/html */
466int
467EXTRACTOR_html_extract (const char *data,
468 size_t size,
469 EXTRACTOR_MetaDataProcessor proc,
470 void *proc_cls,
471 const char *options)
472{
473 size_t xsize;
474 struct TagInfo *tags;
475 struct TagInfo *t;
476 struct TagInfo tag;
477 size_t pos;
478 size_t tpos;
479 int i;
480 char *charset;
481 char *tmp;
482 char *xtmp;
483 int ret;
484
485 ret = 0;
486 if (size == 0)
487 return 0;
488 /* only scan first 32k */
489 if (size > 1024 * 32)
490 xsize = 1024 * 32;
491 else
492 xsize = size;
493 tags = NULL;
494 tag.next = NULL;
495 pos = 0;
496 while (pos < xsize)
497 {
498 if (!lookFor ('<', &pos, data, size))
499 break;
500 tag.tagStart = &data[++pos];
501 if (!skipLetters (&pos, data, size))
502 break;
503 tag.tagEnd = &data[pos];
504 if (!skipWhitespace (&pos, data, size))
505 break;
506 STEP3:
507 if (!lookForMultiple (">\"\'", &pos, data, size))
508 break;
509 if (data[pos] != '>')
510 {
511 /* find end-quote, ignore escaped quotes (\') */
512 do
513 {
514 tpos = pos;
515 pos++;
516 if (!lookFor (data[tpos], &pos, data, size))
517 break;
518 }
519 while (data[pos - 1] == '\\');
520 pos++;
521 goto STEP3;
522 }
523 pos++;
524 if (!skipWhitespace (&pos, data, size))
525 break;
526 tag.dataStart = &data[pos];
527 if (!lookFor ('<', &pos, data, size))
528 break;
529 tag.dataEnd = &data[pos];
530 i = 0;
531 while (relevantTags[i] != NULL)
532 {
533 if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
534 (0 == strncasecmp (relevantTags[i],
535 tag.tagStart, tag.tagEnd - tag.tagStart)))
536 {
537 t = malloc (sizeof (struct TagInfo));
538 if (t == NULL)
539 return 0;
540 *t = tag;
541 t->next = tags;
542 tags = t;
543 break;
544 }
545 i++;
546 }
547 /* abort early if we hit the body tag */
548 if (tagMatch ("body", tag.tagStart, tag.tagEnd))
549 break;
550 }
551
552 /* fast exit */
553 if (tags == NULL)
554 return 0;
555
556 charset = NULL;
557 /* first, try to determine mime type and/or character set */
558 tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
559 if (tmp != NULL)
560 {
561 /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
562 if text/html is present, we take that as the mime-type; if charset=
563 is present, we try to use that for character set conversion. */
564 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
565 ret = proc (proc_cls,
566 "html",
567 EXTRACTOR_METATYPE_MIMETYPE,
568 EXTRACTOR_METAFORMAT_UTF8,
569 "text/plain",
570 "text/html",
571 strlen ("text/html")+1);
572 charset = strcasestr (tmp, "charset=");
573 if (charset != NULL)
574 charset = strdup (&charset[strlen ("charset=")]);
575 free (tmp);
576 }
577 i = 0;
578 while (tagmap[i].name != NULL)
579 {
580 tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
581 if ( (tmp != NULL) &&
582 (ret == 0) )
583 {
584 if (charset == NULL)
585 {
586 ret = proc (proc_cls,
587 "html",
588 tagmap[i].type,
589 EXTRACTOR_METAFORMAT_C_STRING,
590 "text/plain",
591 tmp,
592 strlen (tmp) + 1);
593 }
594 else
595 {
596 xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
597 strlen (tmp),
598 charset);
599 if (xtmp != NULL)
600 {
601 ret = proc (proc_cls,
602 "html",
603 tagmap[i].type,
604 EXTRACTOR_METAFORMAT_UTF8,
605 "text/plain",
606 xtmp,
607 strlen (xtmp) + 1);
608 free (xtmp);
609 }
610 }
611 }
612 if (tmp != NULL)
613 free (tmp);
614 i++;
615 }
616 while (tags != NULL)
617 {
618 t = tags;
619 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
620 (ret == 0) )
621 {
622 if (charset == NULL)
623 {
624 xtmp = malloc (t->dataEnd - t->dataStart + 1);
625 if (xtmp != NULL)
626 {
627 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
628 xtmp[t->dataEnd - t->dataStart] = '\0';
629 ret = proc (proc_cls,
630 "html",
631 EXTRACTOR_METATYPE_TITLE,
632 EXTRACTOR_METAFORMAT_C_STRING,
633 "text/plain",
634 xtmp,
635 strlen (xtmp) + 1);
636 free (xtmp);
637 }
638 }
639 else
640 {
641 xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
642 t->dataEnd - t->dataStart,
643 charset);
644 if (xtmp != NULL)
645 {
646 ret = proc (proc_cls,
647 "html",
648 EXTRACTOR_METATYPE_TITLE,
649 EXTRACTOR_METAFORMAT_UTF8,
650 "text/plain",
651 xtmp,
652 strlen (xtmp) + 1);
653 free (xtmp);
654 }
655 }
656 }
657 tags = t->next;
658 free (t);
659 }
660 if (charset != NULL)
661 free (charset);
662 return ret;
663}
664#endif
665
666
667/**
668 * Initialize glib and load magic file.
669 */
670void __attribute__ ((constructor))
671html_gobject_init ()
672{
673 magic = magic_open (MAGIC_MIME_TYPE);
674 if (0 != magic_load (magic, NULL))
675 {
676 /* FIXME: how to deal with errors? */
677 }
678}
679
680
681/**
682 * Destructor for the library, cleans up.
683 */
684void __attribute__ ((destructor))
685html_ltdl_fini ()
686{
687 if (NULL != magic)
688 {
689 magic_close (magic);
690 magic = NULL;
691 }
692}
693
694/* end of html_extractor.c */