diff options
Diffstat (limited to 'src/plugins/html_extractor.c')
-rw-r--r-- | src/plugins/html_extractor.c | 694 |
1 files changed, 694 insertions, 0 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c new file mode 100644 index 0000000..65fb535 --- /dev/null +++ b/src/plugins/html_extractor.c | |||
@@ -0,0 +1,694 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 2, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. | ||
19 | |||
20 | */ | ||
21 | /** | ||
22 | * @file plugins/html_extractor.c | ||
23 | * @brief plugin to support HTML files | ||
24 | * @author Christian Grothoff | ||
25 | */ | ||
26 | #include "platform.h" | ||
27 | #include "extractor.h" | ||
28 | #include <magic.h> | ||
29 | #include <tidy/tidy.h> | ||
30 | #include <tidy/buffio.h> | ||
31 | |||
32 | /** | ||
33 | * Mapping of HTML META names to LE types. | ||
34 | */ | ||
35 | static struct | ||
36 | { | ||
37 | /** | ||
38 | * HTML META name. | ||
39 | */ | ||
40 | const char *name; | ||
41 | |||
42 | /** | ||
43 | * Corresponding LE type. | ||
44 | */ | ||
45 | enum EXTRACTOR_MetaType type; | ||
46 | } tagmap[] = { | ||
47 | { "author", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
48 | { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
49 | { "title", EXTRACTOR_METATYPE_TITLE }, | ||
50 | { "dc.title", EXTRACTOR_METATYPE_TITLE}, | ||
51 | { "description", EXTRACTOR_METATYPE_DESCRIPTION }, | ||
52 | { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION }, | ||
53 | { "subject", EXTRACTOR_METATYPE_SUBJECT}, | ||
54 | { "dc.subject", EXTRACTOR_METATYPE_SUBJECT}, | ||
55 | { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, | ||
56 | { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE}, | ||
57 | { "publisher", EXTRACTOR_METATYPE_PUBLISHER }, | ||
58 | { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER}, | ||
59 | { "rights", EXTRACTOR_METATYPE_RIGHTS }, | ||
60 | { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, | ||
61 | { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, | ||
62 | { "language", EXTRACTOR_METATYPE_LANGUAGE }, | ||
63 | { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, | ||
64 | { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, | ||
65 | { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, | ||
66 | { "dc.creator", EXTRACTOR_METATYPE_CREATOR}, | ||
67 | { "dc.identifier", EXTRACTOR_METATYPE_URI }, | ||
68 | { "dc.format", EXTRACTOR_METATYPE_FORMAT }, | ||
69 | { NULL, EXTRACTOR_METATYPE_RESERVED } | ||
70 | }; | ||
71 | |||
72 | |||
/**
 * Global handle to MAGIC data.
 *
 * Opened with MAGIC_MIME_TYPE by the library constructor, used by the
 * extract method to check that the input is 'text/html', and closed
 * (and reset to NULL) by the library destructor.
 */
static magic_t magic;
77 | |||
78 | |||
79 | /** | ||
80 | * Map 'meta' tag to LE type. | ||
81 | * | ||
82 | * @param tag tag to map | ||
83 | * @return EXTRACTOR_METATYPE_RESERVED if the type was not found | ||
84 | */ | ||
85 | static enum EXTRACTOR_MetaType | ||
86 | tag_to_type (const char *tag) | ||
87 | { | ||
88 | unsigned int i; | ||
89 | |||
90 | for (i=0; NULL != tagmap[i].name; i++) | ||
91 | if (0 == strcasecmp (tag, | ||
92 | tagmap[i].name)) | ||
93 | return tagmap[i].type; | ||
94 | return EXTRACTOR_METATYPE_RESERVED; | ||
95 | } | ||
96 | |||
97 | |||
98 | /** | ||
99 | * Function called by libtidy for error reporting. | ||
100 | * | ||
101 | * @param doc tidy doc being processed | ||
102 | * @param lvl report level | ||
103 | * @param line input line | ||
104 | * @param col input column | ||
105 | * @param mssg message | ||
106 | * @return FALSE (no output) | ||
107 | */ | ||
108 | static Bool | ||
109 | report_cb (TidyDoc doc, | ||
110 | TidyReportLevel lvl, | ||
111 | uint line, | ||
112 | uint col, | ||
113 | ctmbstr mssg) | ||
114 | { | ||
115 | return 0; | ||
116 | } | ||
117 | |||
118 | |||
119 | /** | ||
120 | * Input callback: get next byte of input. | ||
121 | * | ||
122 | * @param sourceData our 'struct EXTRACTOR_ExtractContext' | ||
123 | * @return next byte of input, EndOfStream on errors and EOF | ||
124 | */ | ||
125 | static int | ||
126 | get_byte_cb (void *sourceData) | ||
127 | { | ||
128 | struct EXTRACTOR_ExtractContext *ec = sourceData; | ||
129 | void *data; | ||
130 | |||
131 | if (1 != | ||
132 | ec->read (ec->cls, | ||
133 | &data, 1)) | ||
134 | return EndOfStream; | ||
135 | return *(unsigned char*) data; | ||
136 | } | ||
137 | |||
138 | |||
139 | /** | ||
140 | * Input callback: unget last byte of input. | ||
141 | * | ||
142 | * @param sourceData our 'struct EXTRACTOR_ExtractContext' | ||
143 | * @param bt byte to unget (ignored) | ||
144 | */ | ||
145 | static void | ||
146 | unget_byte_cb (void *sourceData, byte bt) | ||
147 | { | ||
148 | struct EXTRACTOR_ExtractContext *ec = sourceData; | ||
149 | |||
150 | (void) ec->seek (ec->cls, -1, SEEK_CUR); | ||
151 | } | ||
152 | |||
153 | |||
154 | /** | ||
155 | * Input callback: check for EOF. | ||
156 | * | ||
157 | * @param sourceData our 'struct EXTRACTOR_ExtractContext' | ||
158 | * @return true if we are at the EOF | ||
159 | */ | ||
160 | static Bool | ||
161 | eof_cb (void *sourceData) | ||
162 | { | ||
163 | struct EXTRACTOR_ExtractContext *ec = sourceData; | ||
164 | |||
165 | return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls); | ||
166 | } | ||
167 | |||
168 | |||
169 | /** | ||
170 | * Main entry method for the 'text/html' extraction plugin. | ||
171 | * | ||
172 | * @param ec extraction context provided to the plugin | ||
173 | */ | ||
174 | void | ||
175 | EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) | ||
176 | { | ||
177 | TidyDoc doc; | ||
178 | TidyNode head; | ||
179 | TidyNode child; | ||
180 | TidyNode title; | ||
181 | TidyInputSource src; | ||
182 | const char *name; | ||
183 | TidyBuffer tbuf; | ||
184 | TidyAttr attr; | ||
185 | enum EXTRACTOR_MetaType type; | ||
186 | ssize_t iret; | ||
187 | void *data; | ||
188 | const char *mime; | ||
189 | |||
190 | if (-1 == (iret = ec->read (ec->cls, | ||
191 | &data, | ||
192 | 16 * 1024))) | ||
193 | return; | ||
194 | if (NULL == (mime = magic_buffer (magic, data, iret))) | ||
195 | return; | ||
196 | if (0 != strncmp (mime, | ||
197 | "text/html", | ||
198 | strlen ("text/html"))) | ||
199 | return; /* not HTML */ | ||
200 | |||
201 | if (0 != ec->seek (ec->cls, 0, SEEK_SET)) | ||
202 | return; /* seek failed !? */ | ||
203 | |||
204 | tidyInitSource (&src, ec, | ||
205 | &get_byte_cb, | ||
206 | &unget_byte_cb, | ||
207 | &eof_cb); | ||
208 | if (NULL == (doc = tidyCreate ())) | ||
209 | return; | ||
210 | tidySetReportFilter (doc, &report_cb); | ||
211 | tidySetAppData (doc, ec); | ||
212 | if (0 > tidyParseSource (doc, &src)) | ||
213 | { | ||
214 | tidyRelease (doc); | ||
215 | return; | ||
216 | } | ||
217 | if (1 != tidyStatus (doc)) | ||
218 | { | ||
219 | tidyRelease (doc); | ||
220 | return; | ||
221 | } | ||
222 | if (NULL == (head = tidyGetHead (doc))) | ||
223 | { | ||
224 | fprintf (stderr, "no head\n"); | ||
225 | tidyRelease (doc); | ||
226 | return; | ||
227 | } | ||
228 | for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child)) | ||
229 | { | ||
230 | switch (tidyNodeGetType(child)) | ||
231 | { | ||
232 | case TidyNode_Root: | ||
233 | break; | ||
234 | case TidyNode_DocType: | ||
235 | break; | ||
236 | case TidyNode_Comment: | ||
237 | break; | ||
238 | case TidyNode_ProcIns: | ||
239 | break; | ||
240 | case TidyNode_Text: | ||
241 | break; | ||
242 | case TidyNode_CDATA: | ||
243 | break; | ||
244 | case TidyNode_Section: | ||
245 | break; | ||
246 | case TidyNode_Asp: | ||
247 | break; | ||
248 | case TidyNode_Jste: | ||
249 | break; | ||
250 | case TidyNode_Php: | ||
251 | break; | ||
252 | case TidyNode_XmlDecl: | ||
253 | break; | ||
254 | case TidyNode_Start: | ||
255 | case TidyNode_StartEnd: | ||
256 | name = tidyNodeGetName (child); | ||
257 | if ( (0 == strcasecmp (name, "title")) && | ||
258 | (NULL != (title = tidyGetChild (child))) ) | ||
259 | { | ||
260 | tidyBufInit (&tbuf); | ||
261 | tidyNodeGetValue (doc, title, &tbuf); | ||
262 | /* add 0-termination */ | ||
263 | tidyBufPutByte (&tbuf, 0); | ||
264 | if (0 != | ||
265 | ec->proc (ec->cls, | ||
266 | "html", | ||
267 | EXTRACTOR_METATYPE_TITLE, | ||
268 | EXTRACTOR_METAFORMAT_UTF8, | ||
269 | "text/plain", | ||
270 | (const char *) tbuf.bp, | ||
271 | tbuf.size)) | ||
272 | { | ||
273 | tidyBufFree (&tbuf); | ||
274 | goto CLEANUP; | ||
275 | } | ||
276 | tidyBufFree (&tbuf); | ||
277 | break; | ||
278 | } | ||
279 | if (0 == strcasecmp (name, "meta")) | ||
280 | { | ||
281 | if (NULL == (attr = tidyAttrGetById (child, | ||
282 | TidyAttr_NAME))) | ||
283 | break; | ||
284 | if (EXTRACTOR_METATYPE_RESERVED == | ||
285 | (type = tag_to_type (tidyAttrValue (attr)))) | ||
286 | break; | ||
287 | if (NULL == (attr = tidyAttrGetById (child, | ||
288 | TidyAttr_CONTENT))) | ||
289 | break; | ||
290 | name = tidyAttrValue (attr); | ||
291 | if (0 != | ||
292 | ec->proc (ec->cls, | ||
293 | "html", | ||
294 | type, | ||
295 | EXTRACTOR_METAFORMAT_UTF8, | ||
296 | "text/plain", | ||
297 | name, | ||
298 | strlen (name) + 1)) | ||
299 | goto CLEANUP; | ||
300 | break; | ||
301 | } | ||
302 | break; | ||
303 | case TidyNode_End: | ||
304 | break; | ||
305 | default: | ||
306 | break; | ||
307 | } | ||
308 | } | ||
309 | CLEANUP: | ||
310 | tidyRelease (doc); | ||
311 | } | ||
312 | |||
313 | |||
314 | |||
315 | #if OLD | ||
316 | |||
317 | |||
318 | /* ******************** parser helper functions ************** */ | ||
319 | |||
/**
 * Case-insensitively compare a 0-terminated tag name against the
 * character range [s, e).
 *
 * @param tag 0-terminated name to look for
 * @param s first character of the range
 * @param e one past the last character of the range
 * @return 1 if the range has the same length as 'tag' and matches it
 *         ignoring case, 0 otherwise
 */
static int
tagMatch (const char *tag, const char *s, const char *e)
{
  size_t span = (size_t) (e - s);

  if (span != strlen (tag))
    return 0;
  return 0 == strncasecmp (tag, s, span);
}
325 | |||
/**
 * Advance '*pos' to the next occurrence of 'c' in 'data'.
 *
 * @param c character to search for
 * @param pos in: offset to start at; out: offset of the match, or
 *        'size' if the end was reached (left untouched if a 0-byte
 *        stopped the search)
 * @param data buffer to search
 * @param size number of bytes in 'data'
 * @return 1 if 'c' was found, 0 if a 0-byte or the end of the buffer
 *         was hit first
 */
static int
lookFor (char c, size_t * pos, const char *data, size_t size)
{
  size_t idx;

  for (idx = *pos; idx < size; idx++)
  {
    if (c == data[idx])
      break;
    if ('\0' == data[idx])
      return 0;
  }
  *pos = idx;
  return idx < size;
}
340 | |||
/**
 * Advance '*pos' past any whitespace (as per isspace()).
 *
 * A 0-byte is not whitespace and therefore simply ends the run; the
 * former explicit 0-byte test inside the loop could never trigger
 * and has been dropped.
 *
 * @param pos in: offset to start at; out: offset of the first
 *        non-whitespace byte, or 'size' if there is none
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if '*pos' is still inside the buffer, 0 if the end of
 *         the buffer was reached
 */
static int
skipWhitespace (size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  while ((p < size) && (isspace ((unsigned char) data[p])))
    p++;
  *pos = p;
  return p < size;
}
355 | |||
/**
 * Advance '*pos' past any letters (as per isalpha()).
 *
 * A 0-byte is not a letter and therefore simply ends the run; the
 * former explicit 0-byte test inside the loop could never trigger
 * and has been dropped.
 *
 * @param pos in: offset to start at; out: offset of the first
 *        non-letter byte, or 'size' if there is none
 * @param data buffer to scan
 * @param size number of bytes in 'data'
 * @return 1 if '*pos' is still inside the buffer, 0 if the end of
 *         the buffer was reached
 */
static int
skipLetters (size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  while ((p < size) && (isalpha ((unsigned char) data[p])))
    p++;
  *pos = p;
  return p < size;
}
370 | |||
/**
 * Advance '*pos' to the next byte of 'data' that is one of the
 * characters in 'c'.
 *
 * The 0-byte test has to come before the strchr() call: strchr()
 * also matches the terminating 0 of 'c', so a 0-byte in the data
 * would otherwise be reported as a hit.
 *
 * @param c 0-terminated set of characters to search for
 * @param pos in: offset to start at; out: offset of the match, or
 *        'size' if the end was reached (left untouched if a 0-byte
 *        stopped the search)
 * @param data buffer to search
 * @param size number of bytes in 'data'
 * @return 1 if one of the characters was found, 0 if a 0-byte or
 *         the end of the buffer was hit first
 */
static int
lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  while (p < size)
  {
    if ('\0' == data[p])
      return 0;
    if (NULL != strchr (c, data[p]))
      break;
    p++;
  }
  *pos = p;
  return p < size;
}
385 | |||
/**
 * Locate the value of the attribute 'key' inside the character range
 * [start, end), i.e. look for "key=value", "key='value'" or
 * "key="value"" (key compared ignoring case).
 *
 * The very first character of the range is never considered as the
 * start of a key (the search begins at 'start + 1').
 *
 * @param key 0-terminated attribute name
 * @param start beginning of the range to search
 * @param end one past the last character of the range
 * @param mstart set to the first character of the value (after the
 *        opening quote, if any), or to NULL if 'key' was not found
 * @param mend set to one past the last character of the value (the
 *        closing quote, the first whitespace, or 'end'), or to NULL
 *        if 'key' was not found
 */
static void
findEntry (const char *key,
           const char *start,
           const char *end, const char **mstart, const char **mend)
{
  size_t len;
  char quote;

  *mstart = NULL;
  *mend = NULL;
  len = strlen (key);
  /* compare distances instead of computing 'end - len - 1', which
     would point before the buffer for short ranges */
  while ((start < end) && ((size_t) (end - start) > len + 1))
  {
    start++;
    if (start[len] != '=')
      continue;
    if (0 != strncasecmp (start, key, len))
      continue;
    start += len + 1;
    /* 'start' may equal 'end' here (empty value at the very end of
       the range); do not look at '*start' in that case */
    if ((start < end) && (('\"' == *start) || ('\'' == *start)))
    {
      quote = *start;
      start++;                  /* skip quote */
      *mstart = start;
      while ((start < end) && (*start != quote))
        start++;
    }
    else
    {
      *mstart = start;
      while ((start < end) && (!isspace ((unsigned char) *start)))
        start++;
    }
    *mend = start;
    return;
  }
}
422 | |||
423 | /** | ||
424 | * Search all tags that correspond to "tagname". Example: | ||
425 | * If the tag is <meta name="foo" desc="bar">, and | ||
426 | * tagname == "meta", keyname="name", keyvalue="foo", | ||
427 | * and searchname="desc", then this function returns a | ||
428 | * copy (!) of "bar". Easy enough? | ||
429 | * | ||
430 | * @return NULL if nothing is found | ||
431 | */ | ||
432 | static char * | ||
433 | findInTags (struct TagInfo * t, | ||
434 | const char *tagname, | ||
435 | const char *keyname, const char *keyvalue, const char *searchname) | ||
436 | { | ||
437 | const char *pstart; | ||
438 | const char *pend; | ||
439 | |||
440 | while (t != NULL) | ||
441 | { | ||
442 | if (tagMatch (tagname, t->tagStart, t->tagEnd)) | ||
443 | { | ||
444 | findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); | ||
445 | if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) | ||
446 | { | ||
447 | findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); | ||
448 | if (pstart != NULL) | ||
449 | { | ||
450 | char *ret = malloc (pend - pstart + 1); | ||
451 | if (ret == NULL) | ||
452 | return NULL; | ||
453 | memcpy (ret, pstart, pend - pstart); | ||
454 | ret[pend - pstart] = '\0'; | ||
455 | return ret; | ||
456 | } | ||
457 | } | ||
458 | } | ||
459 | t = t->next; | ||
460 | } | ||
461 | return NULL; | ||
462 | } | ||
463 | |||
464 | |||
465 | /* mimetype = text/html */ | ||
466 | int | ||
467 | EXTRACTOR_html_extract (const char *data, | ||
468 | size_t size, | ||
469 | EXTRACTOR_MetaDataProcessor proc, | ||
470 | void *proc_cls, | ||
471 | const char *options) | ||
472 | { | ||
473 | size_t xsize; | ||
474 | struct TagInfo *tags; | ||
475 | struct TagInfo *t; | ||
476 | struct TagInfo tag; | ||
477 | size_t pos; | ||
478 | size_t tpos; | ||
479 | int i; | ||
480 | char *charset; | ||
481 | char *tmp; | ||
482 | char *xtmp; | ||
483 | int ret; | ||
484 | |||
485 | ret = 0; | ||
486 | if (size == 0) | ||
487 | return 0; | ||
488 | /* only scan first 32k */ | ||
489 | if (size > 1024 * 32) | ||
490 | xsize = 1024 * 32; | ||
491 | else | ||
492 | xsize = size; | ||
493 | tags = NULL; | ||
494 | tag.next = NULL; | ||
495 | pos = 0; | ||
496 | while (pos < xsize) | ||
497 | { | ||
498 | if (!lookFor ('<', &pos, data, size)) | ||
499 | break; | ||
500 | tag.tagStart = &data[++pos]; | ||
501 | if (!skipLetters (&pos, data, size)) | ||
502 | break; | ||
503 | tag.tagEnd = &data[pos]; | ||
504 | if (!skipWhitespace (&pos, data, size)) | ||
505 | break; | ||
506 | STEP3: | ||
507 | if (!lookForMultiple (">\"\'", &pos, data, size)) | ||
508 | break; | ||
509 | if (data[pos] != '>') | ||
510 | { | ||
511 | /* find end-quote, ignore escaped quotes (\') */ | ||
512 | do | ||
513 | { | ||
514 | tpos = pos; | ||
515 | pos++; | ||
516 | if (!lookFor (data[tpos], &pos, data, size)) | ||
517 | break; | ||
518 | } | ||
519 | while (data[pos - 1] == '\\'); | ||
520 | pos++; | ||
521 | goto STEP3; | ||
522 | } | ||
523 | pos++; | ||
524 | if (!skipWhitespace (&pos, data, size)) | ||
525 | break; | ||
526 | tag.dataStart = &data[pos]; | ||
527 | if (!lookFor ('<', &pos, data, size)) | ||
528 | break; | ||
529 | tag.dataEnd = &data[pos]; | ||
530 | i = 0; | ||
531 | while (relevantTags[i] != NULL) | ||
532 | { | ||
533 | if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && | ||
534 | (0 == strncasecmp (relevantTags[i], | ||
535 | tag.tagStart, tag.tagEnd - tag.tagStart))) | ||
536 | { | ||
537 | t = malloc (sizeof (struct TagInfo)); | ||
538 | if (t == NULL) | ||
539 | return 0; | ||
540 | *t = tag; | ||
541 | t->next = tags; | ||
542 | tags = t; | ||
543 | break; | ||
544 | } | ||
545 | i++; | ||
546 | } | ||
547 | /* abort early if we hit the body tag */ | ||
548 | if (tagMatch ("body", tag.tagStart, tag.tagEnd)) | ||
549 | break; | ||
550 | } | ||
551 | |||
552 | /* fast exit */ | ||
553 | if (tags == NULL) | ||
554 | return 0; | ||
555 | |||
556 | charset = NULL; | ||
557 | /* first, try to determine mime type and/or character set */ | ||
558 | tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); | ||
559 | if (tmp != NULL) | ||
560 | { | ||
561 | /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; | ||
562 | if text/html is present, we take that as the mime-type; if charset= | ||
563 | is present, we try to use that for character set conversion. */ | ||
564 | if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) | ||
565 | ret = proc (proc_cls, | ||
566 | "html", | ||
567 | EXTRACTOR_METATYPE_MIMETYPE, | ||
568 | EXTRACTOR_METAFORMAT_UTF8, | ||
569 | "text/plain", | ||
570 | "text/html", | ||
571 | strlen ("text/html")+1); | ||
572 | charset = strcasestr (tmp, "charset="); | ||
573 | if (charset != NULL) | ||
574 | charset = strdup (&charset[strlen ("charset=")]); | ||
575 | free (tmp); | ||
576 | } | ||
577 | i = 0; | ||
578 | while (tagmap[i].name != NULL) | ||
579 | { | ||
580 | tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); | ||
581 | if ( (tmp != NULL) && | ||
582 | (ret == 0) ) | ||
583 | { | ||
584 | if (charset == NULL) | ||
585 | { | ||
586 | ret = proc (proc_cls, | ||
587 | "html", | ||
588 | tagmap[i].type, | ||
589 | EXTRACTOR_METAFORMAT_C_STRING, | ||
590 | "text/plain", | ||
591 | tmp, | ||
592 | strlen (tmp) + 1); | ||
593 | } | ||
594 | else | ||
595 | { | ||
596 | xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, | ||
597 | strlen (tmp), | ||
598 | charset); | ||
599 | if (xtmp != NULL) | ||
600 | { | ||
601 | ret = proc (proc_cls, | ||
602 | "html", | ||
603 | tagmap[i].type, | ||
604 | EXTRACTOR_METAFORMAT_UTF8, | ||
605 | "text/plain", | ||
606 | xtmp, | ||
607 | strlen (xtmp) + 1); | ||
608 | free (xtmp); | ||
609 | } | ||
610 | } | ||
611 | } | ||
612 | if (tmp != NULL) | ||
613 | free (tmp); | ||
614 | i++; | ||
615 | } | ||
616 | while (tags != NULL) | ||
617 | { | ||
618 | t = tags; | ||
619 | if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && | ||
620 | (ret == 0) ) | ||
621 | { | ||
622 | if (charset == NULL) | ||
623 | { | ||
624 | xtmp = malloc (t->dataEnd - t->dataStart + 1); | ||
625 | if (xtmp != NULL) | ||
626 | { | ||
627 | memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); | ||
628 | xtmp[t->dataEnd - t->dataStart] = '\0'; | ||
629 | ret = proc (proc_cls, | ||
630 | "html", | ||
631 | EXTRACTOR_METATYPE_TITLE, | ||
632 | EXTRACTOR_METAFORMAT_C_STRING, | ||
633 | "text/plain", | ||
634 | xtmp, | ||
635 | strlen (xtmp) + 1); | ||
636 | free (xtmp); | ||
637 | } | ||
638 | } | ||
639 | else | ||
640 | { | ||
641 | xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart, | ||
642 | t->dataEnd - t->dataStart, | ||
643 | charset); | ||
644 | if (xtmp != NULL) | ||
645 | { | ||
646 | ret = proc (proc_cls, | ||
647 | "html", | ||
648 | EXTRACTOR_METATYPE_TITLE, | ||
649 | EXTRACTOR_METAFORMAT_UTF8, | ||
650 | "text/plain", | ||
651 | xtmp, | ||
652 | strlen (xtmp) + 1); | ||
653 | free (xtmp); | ||
654 | } | ||
655 | } | ||
656 | } | ||
657 | tags = t->next; | ||
658 | free (t); | ||
659 | } | ||
660 | if (charset != NULL) | ||
661 | free (charset); | ||
662 | return ret; | ||
663 | } | ||
664 | #endif | ||
665 | |||
666 | |||
667 | /** | ||
668 | * Initialize glib and load magic file. | ||
669 | */ | ||
670 | void __attribute__ ((constructor)) | ||
671 | html_gobject_init () | ||
672 | { | ||
673 | magic = magic_open (MAGIC_MIME_TYPE); | ||
674 | if (0 != magic_load (magic, NULL)) | ||
675 | { | ||
676 | /* FIXME: how to deal with errors? */ | ||
677 | } | ||
678 | } | ||
679 | |||
680 | |||
681 | /** | ||
682 | * Destructor for the library, cleans up. | ||
683 | */ | ||
684 | void __attribute__ ((destructor)) | ||
685 | html_ltdl_fini () | ||
686 | { | ||
687 | if (NULL != magic) | ||
688 | { | ||
689 | magic_close (magic); | ||
690 | magic = NULL; | ||
691 | } | ||
692 | } | ||
693 | |||
694 | /* end of html_extractor.c */ | ||