aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/html_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/html_extractor.c')
-rw-r--r--src/plugins/html_extractor.c652
1 files changed, 330 insertions, 322 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c
index 8cd4aba..5ebf97b 100644
--- a/src/plugins/html_extractor.c
+++ b/src/plugins/html_extractor.c
@@ -87,9 +87,9 @@ tag_to_type (const char *tag)
87{ 87{
88 unsigned int i; 88 unsigned int i;
89 89
90 for (i=0; NULL != tagmap[i].name; i++) 90 for (i = 0; NULL != tagmap[i].name; i++)
91 if (0 == strcasecmp (tag, 91 if (0 == strcasecmp (tag,
92 tagmap[i].name)) 92 tagmap[i].name))
93 return tagmap[i].type; 93 return tagmap[i].type;
94 return EXTRACTOR_METATYPE_RESERVED; 94 return EXTRACTOR_METATYPE_RESERVED;
95} 95}
@@ -107,10 +107,10 @@ tag_to_type (const char *tag)
107 */ 107 */
108static Bool TIDY_CALL 108static Bool TIDY_CALL
109report_cb (TidyDoc doc, 109report_cb (TidyDoc doc,
110 TidyReportLevel lvl, 110 TidyReportLevel lvl,
111 uint line, 111 uint line,
112 uint col, 112 uint col,
113 ctmbstr mssg) 113 ctmbstr mssg)
114{ 114{
115 return 0; 115 return 0;
116} 116}
@@ -130,7 +130,7 @@ get_byte_cb (void *sourceData)
130 130
131 if (1 != 131 if (1 !=
132 ec->read (ec->cls, 132 ec->read (ec->cls,
133 &data, 1)) 133 &data, 1))
134 return EndOfStream; 134 return EndOfStream;
135 return *(unsigned char*) data; 135 return *(unsigned char*) data;
136} 136}
@@ -188,130 +188,129 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
188 const char *mime; 188 const char *mime;
189 189
190 if (-1 == (iret = ec->read (ec->cls, 190 if (-1 == (iret = ec->read (ec->cls,
191 &data, 191 &data,
192 16 * 1024))) 192 16 * 1024)))
193 return; 193 return;
194 if (NULL == (mime = magic_buffer (magic, data, iret))) 194 if (NULL == (mime = magic_buffer (magic, data, iret)))
195 return; 195 return;
196 if (0 != strncmp (mime, 196 if (0 != strncmp (mime,
197 "text/html", 197 "text/html",
198 strlen ("text/html"))) 198 strlen ("text/html")))
199 return; /* not HTML */ 199 return; /* not HTML */
200 200
201 if (0 != ec->seek (ec->cls, 0, SEEK_SET)) 201 if (0 != ec->seek (ec->cls, 0, SEEK_SET))
202 return; /* seek failed !? */ 202 return; /* seek failed !? */
203 203
204 tidyInitSource (&src, ec, 204 tidyInitSource (&src, ec,
205 &get_byte_cb, 205 &get_byte_cb,
206 &unget_byte_cb, 206 &unget_byte_cb,
207 &eof_cb); 207 &eof_cb);
208 if (NULL == (doc = tidyCreate ())) 208 if (NULL == (doc = tidyCreate ()))
209 return; 209 return;
210 tidySetReportFilter (doc, &report_cb); 210 tidySetReportFilter (doc, &report_cb);
211 tidySetAppData (doc, ec); 211 tidySetAppData (doc, ec);
212 if (0 > tidyParseSource (doc, &src)) 212 if (0 > tidyParseSource (doc, &src))
213 { 213 {
214 tidyRelease (doc); 214 tidyRelease (doc);
215 return; 215 return;
216 } 216 }
217 if (1 != tidyStatus (doc)) 217 if (1 != tidyStatus (doc))
218 { 218 {
219 tidyRelease (doc); 219 tidyRelease (doc);
220 return; 220 return;
221 } 221 }
222 if (NULL == (head = tidyGetHead (doc))) 222 if (NULL == (head = tidyGetHead (doc)))
223 { 223 {
224 fprintf (stderr, "no head\n"); 224 fprintf (stderr, "no head\n");
225 tidyRelease (doc); 225 tidyRelease (doc);
226 return; 226 return;
227 } 227 }
228 for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child)) 228 for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
229 {
230 switch (tidyNodeGetType (child))
229 { 231 {
230 switch (tidyNodeGetType(child)) 232 case TidyNode_Root:
231 { 233 break;
232 case TidyNode_Root: 234 case TidyNode_DocType:
233 break; 235 break;
234 case TidyNode_DocType: 236 case TidyNode_Comment:
235 break; 237 break;
236 case TidyNode_Comment: 238 case TidyNode_ProcIns:
237 break; 239 break;
238 case TidyNode_ProcIns: 240 case TidyNode_Text:
239 break; 241 break;
240 case TidyNode_Text: 242 case TidyNode_CDATA:
241 break; 243 break;
242 case TidyNode_CDATA: 244 case TidyNode_Section:
243 break; 245 break;
244 case TidyNode_Section: 246 case TidyNode_Asp:
245 break; 247 break;
246 case TidyNode_Asp: 248 case TidyNode_Jste:
247 break; 249 break;
248 case TidyNode_Jste: 250 case TidyNode_Php:
249 break; 251 break;
250 case TidyNode_Php: 252 case TidyNode_XmlDecl:
251 break; 253 break;
252 case TidyNode_XmlDecl: 254 case TidyNode_Start:
253 break; 255 case TidyNode_StartEnd:
254 case TidyNode_Start: 256 name = tidyNodeGetName (child);
255 case TidyNode_StartEnd: 257 if ( (0 == strcasecmp (name, "title")) &&
256 name = tidyNodeGetName (child); 258 (NULL != (title = tidyGetChild (child))) )
257 if ( (0 == strcasecmp (name, "title")) && 259 {
258 (NULL != (title = tidyGetChild (child))) ) 260 tidyBufInit (&tbuf);
259 { 261 tidyNodeGetValue (doc, title, &tbuf);
260 tidyBufInit (&tbuf); 262 /* add 0-termination */
261 tidyNodeGetValue (doc, title, &tbuf); 263 tidyBufPutByte (&tbuf, 0);
262 /* add 0-termination */ 264 if (0 !=
263 tidyBufPutByte (&tbuf, 0); 265 ec->proc (ec->cls,
264 if (0 != 266 "html",
265 ec->proc (ec->cls, 267 EXTRACTOR_METATYPE_TITLE,
266 "html", 268 EXTRACTOR_METAFORMAT_UTF8,
267 EXTRACTOR_METATYPE_TITLE, 269 "text/plain",
268 EXTRACTOR_METAFORMAT_UTF8, 270 (const char *) tbuf.bp,
269 "text/plain", 271 tbuf.size))
270 (const char *) tbuf.bp, 272 {
271 tbuf.size)) 273 tidyBufFree (&tbuf);
272 { 274 goto CLEANUP;
273 tidyBufFree (&tbuf); 275 }
274 goto CLEANUP; 276 tidyBufFree (&tbuf);
275 } 277 break;
276 tidyBufFree (&tbuf); 278 }
277 break; 279 if (0 == strcasecmp (name, "meta"))
278 } 280 {
279 if (0 == strcasecmp (name, "meta")) 281 if (NULL == (attr = tidyAttrGetById (child,
280 { 282 TidyAttr_NAME)))
281 if (NULL == (attr = tidyAttrGetById (child, 283 break;
282 TidyAttr_NAME))) 284 if (EXTRACTOR_METATYPE_RESERVED ==
283 break; 285 (type = tag_to_type (tidyAttrValue (attr))))
284 if (EXTRACTOR_METATYPE_RESERVED == 286 break;
285 (type = tag_to_type (tidyAttrValue (attr)))) 287 if (NULL == (attr = tidyAttrGetById (child,
286 break; 288 TidyAttr_CONTENT)))
287 if (NULL == (attr = tidyAttrGetById (child, 289 break;
288 TidyAttr_CONTENT))) 290 name = tidyAttrValue (attr);
289 break; 291 if (0 !=
290 name = tidyAttrValue (attr); 292 ec->proc (ec->cls,
291 if (0 != 293 "html",
292 ec->proc (ec->cls, 294 type,
293 "html", 295 EXTRACTOR_METAFORMAT_UTF8,
294 type, 296 "text/plain",
295 EXTRACTOR_METAFORMAT_UTF8, 297 name,
296 "text/plain", 298 strlen (name) + 1))
297 name, 299 goto CLEANUP;
298 strlen (name) + 1)) 300 break;
299 goto CLEANUP; 301 }
300 break; 302 break;
301 } 303 case TidyNode_End:
302 break; 304 break;
303 case TidyNode_End: 305 default:
304 break; 306 break;
305 default:
306 break;
307 }
308 } 307 }
309 CLEANUP: 308 }
309CLEANUP:
310 tidyRelease (doc); 310 tidyRelease (doc);
311} 311}
312 312
313 313
314
315#if OLD 314#if OLD
316 315
317 316
@@ -323,66 +322,71 @@ tagMatch (const char *tag, const char *s, const char *e)
323 return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s))); 322 return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
324} 323}
325 324
325
326static int 326static int
327lookFor (char c, size_t * pos, const char *data, size_t size) 327lookFor (char c, size_t *pos, const char *data, size_t size)
328{ 328{
329 size_t p = *pos; 329 size_t p = *pos;
330 330
331 while ((p < size) && (data[p] != c)) 331 while ((p < size) && (data[p] != c))
332 { 332 {
333 if (data[p] == '\0') 333 if (data[p] == '\0')
334 return 0; 334 return 0;
335 p++; 335 p++;
336 } 336 }
337 *pos = p; 337 *pos = p;
338 return p < size; 338 return p < size;
339} 339}
340 340
341
341static int 342static int
342skipWhitespace (size_t * pos, const char *data, size_t size) 343skipWhitespace (size_t *pos, const char *data, size_t size)
343{ 344{
344 size_t p = *pos; 345 size_t p = *pos;
345 346
346 while ((p < size) && (isspace ( (unsigned char) data[p]))) 347 while ((p < size) && (isspace ( (unsigned char) data[p])))
347 { 348 {
348 if (data[p] == '\0') 349 if (data[p] == '\0')
349 return 0; 350 return 0;
350 p++; 351 p++;
351 } 352 }
352 *pos = p; 353 *pos = p;
353 return p < size; 354 return p < size;
354} 355}
355 356
357
356static int 358static int
357skipLetters (size_t * pos, const char *data, size_t size) 359skipLetters (size_t *pos, const char *data, size_t size)
358{ 360{
359 size_t p = *pos; 361 size_t p = *pos;
360 362
361 while ((p < size) && (isalpha ( (unsigned char) data[p]))) 363 while ((p < size) && (isalpha ( (unsigned char) data[p])))
362 { 364 {
363 if (data[p] == '\0') 365 if (data[p] == '\0')
364 return 0; 366 return 0;
365 p++; 367 p++;
366 } 368 }
367 *pos = p; 369 *pos = p;
368 return p < size; 370 return p < size;
369} 371}
370 372
373
371static int 374static int
372lookForMultiple (const char *c, size_t * pos, const char *data, size_t size) 375lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
373{ 376{
374 size_t p = *pos; 377 size_t p = *pos;
375 378
376 while ((p < size) && (strchr (c, data[p]) == NULL)) 379 while ((p < size) && (strchr (c, data[p]) == NULL))
377 { 380 {
378 if (data[p] == '\0') 381 if (data[p] == '\0')
379 return 0; 382 return 0;
380 p++; 383 p++;
381 } 384 }
382 *pos = p; 385 *pos = p;
383 return p < size; 386 return p < size;
384} 387}
385 388
389
386static void 390static void
387findEntry (const char *key, 391findEntry (const char *key,
388 const char *start, 392 const char *start,
@@ -394,32 +398,33 @@ findEntry (const char *key,
394 *mend = NULL; 398 *mend = NULL;
395 len = strlen (key); 399 len = strlen (key);
396 while (start < end - len - 1) 400 while (start < end - len - 1)
401 {
402 start++;
403 if (start[len] != '=')
404 continue;
405 if (0 == strncasecmp (start, key, len))
397 { 406 {
398 start++; 407 start += len + 1;
399 if (start[len] != '=') 408 *mstart = start;
400 continue; 409 if ((*start == '\"') || (*start == '\''))
401 if (0 == strncasecmp (start, key, len)) 410 {
402 { 411 start++;
403 start += len + 1; 412 while ((start < end) && (*start != **mstart))
404 *mstart = start; 413 start++;
405 if ((*start == '\"') || (*start == '\'')) 414 (*mstart)++; /* skip quote */
406 { 415 }
407 start++; 416 else
408 while ((start < end) && (*start != **mstart)) 417 {
409 start++; 418 while ((start < end) && (! isspace ( (unsigned char) *start)))
410 (*mstart)++; /* skip quote */ 419 start++;
411 } 420 }
412 else 421 *mend = start;
413 { 422 return;
414 while ((start < end) && (!isspace ( (unsigned char) *start)))
415 start++;
416 }
417 *mend = start;
418 return;
419 }
420 } 423 }
424 }
421} 425}
422 426
427
423/** 428/**
424 * Search all tags that correspond to "tagname". Example: 429 * Search all tags that correspond to "tagname". Example:
425 * If the tag is <meta name="foo" desc="bar">, and 430 * If the tag is <meta name="foo" desc="bar">, and
@@ -430,7 +435,7 @@ findEntry (const char *key,
430 * @return NULL if nothing is found 435 * @return NULL if nothing is found
431 */ 436 */
432static char * 437static char *
433findInTags (struct TagInfo * t, 438findInTags (struct TagInfo *t,
434 const char *tagname, 439 const char *tagname,
435 const char *keyname, const char *keyvalue, const char *searchname) 440 const char *keyname, const char *keyvalue, const char *searchname)
436{ 441{
@@ -438,26 +443,26 @@ findInTags (struct TagInfo * t,
438 const char *pend; 443 const char *pend;
439 444
440 while (t != NULL) 445 while (t != NULL)
446 {
447 if (tagMatch (tagname, t->tagStart, t->tagEnd))
441 { 448 {
442 if (tagMatch (tagname, t->tagStart, t->tagEnd)) 449 findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
450 if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
451 {
452 findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
453 if (pstart != NULL)
443 { 454 {
444 findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); 455 char *ret = malloc (pend - pstart + 1);
445 if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) 456 if (ret == NULL)
446 { 457 return NULL;
447 findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); 458 memcpy (ret, pstart, pend - pstart);
448 if (pstart != NULL) 459 ret[pend - pstart] = '\0';
449 { 460 return ret;
450 char *ret = malloc (pend - pstart + 1);
451 if (ret == NULL)
452 return NULL;
453 memcpy (ret, pstart, pend - pstart);
454 ret[pend - pstart] = '\0';
455 return ret;
456 }
457 }
458 } 461 }
459 t = t->next; 462 }
460 } 463 }
464 t = t->next;
465 }
461 return NULL; 466 return NULL;
462} 467}
463 468
@@ -465,10 +470,10 @@ findInTags (struct TagInfo * t,
465/* mimetype = text/html */ 470/* mimetype = text/html */
466int 471int
467EXTRACTOR_html_extract (const char *data, 472EXTRACTOR_html_extract (const char *data,
468 size_t size, 473 size_t size,
469 EXTRACTOR_MetaDataProcessor proc, 474 EXTRACTOR_MetaDataProcessor proc,
470 void *proc_cls, 475 void *proc_cls,
471 const char *options) 476 const char *options)
472{ 477{
473 size_t xsize; 478 size_t xsize;
474 struct TagInfo *tags; 479 struct TagInfo *tags;
@@ -494,60 +499,60 @@ EXTRACTOR_html_extract (const char *data,
494 tag.next = NULL; 499 tag.next = NULL;
495 pos = 0; 500 pos = 0;
496 while (pos < xsize) 501 while (pos < xsize)
502 {
503 if (! lookFor ('<', &pos, data, size))
504 break;
505 tag.tagStart = &data[++pos];
506 if (! skipLetters (&pos, data, size))
507 break;
508 tag.tagEnd = &data[pos];
509 if (! skipWhitespace (&pos, data, size))
510 break;
511STEP3:
512 if (! lookForMultiple (">\"\'", &pos, data, size))
513 break;
514 if (data[pos] != '>')
497 { 515 {
498 if (!lookFor ('<', &pos, data, size)) 516 /* find end-quote, ignore escaped quotes (\') */
499 break; 517 do
500 tag.tagStart = &data[++pos]; 518 {
501 if (!skipLetters (&pos, data, size)) 519 tpos = pos;
502 break; 520 pos++;
503 tag.tagEnd = &data[pos]; 521 if (! lookFor (data[tpos], &pos, data, size))
504 if (!skipWhitespace (&pos, data, size)) 522 break;
505 break; 523 }
506 STEP3: 524 while (data[pos - 1] == '\\');
507 if (!lookForMultiple (">\"\'", &pos, data, size))
508 break;
509 if (data[pos] != '>')
510 {
511 /* find end-quote, ignore escaped quotes (\') */
512 do
513 {
514 tpos = pos;
515 pos++;
516 if (!lookFor (data[tpos], &pos, data, size))
517 break;
518 }
519 while (data[pos - 1] == '\\');
520 pos++;
521 goto STEP3;
522 }
523 pos++; 525 pos++;
524 if (!skipWhitespace (&pos, data, size)) 526 goto STEP3;
525 break; 527 }
526 tag.dataStart = &data[pos]; 528 pos++;
527 if (!lookFor ('<', &pos, data, size)) 529 if (! skipWhitespace (&pos, data, size))
528 break; 530 break;
529 tag.dataEnd = &data[pos]; 531 tag.dataStart = &data[pos];
530 i = 0; 532 if (! lookFor ('<', &pos, data, size))
531 while (relevantTags[i] != NULL) 533 break;
532 { 534 tag.dataEnd = &data[pos];
533 if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && 535 i = 0;
534 (0 == strncasecmp (relevantTags[i], 536 while (relevantTags[i] != NULL)
535 tag.tagStart, tag.tagEnd - tag.tagStart))) 537 {
536 { 538 if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
537 t = malloc (sizeof (struct TagInfo)); 539 (0 == strncasecmp (relevantTags[i],
538 if (t == NULL) 540 tag.tagStart, tag.tagEnd - tag.tagStart)))
539 return 0; 541 {
540 *t = tag; 542 t = malloc (sizeof (struct TagInfo));
541 t->next = tags; 543 if (t == NULL)
542 tags = t; 544 return 0;
543 break; 545 *t = tag;
544 } 546 t->next = tags;
545 i++; 547 tags = t;
546 }
547 /* abort early if we hit the body tag */
548 if (tagMatch ("body", tag.tagStart, tag.tagEnd))
549 break; 548 break;
549 }
550 i++;
550 } 551 }
552 /* abort early if we hit the body tag */
553 if (tagMatch ("body", tag.tagStart, tag.tagEnd))
554 break;
555 }
551 556
552 /* fast exit */ 557 /* fast exit */
553 if (tags == NULL) 558 if (tags == NULL)
@@ -557,110 +562,112 @@ EXTRACTOR_html_extract (const char *data,
557 /* first, try to determine mime type and/or character set */ 562 /* first, try to determine mime type and/or character set */
558 tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); 563 tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
559 if (tmp != NULL) 564 if (tmp != NULL)
560 { 565 {
561 /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; 566 /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
562 if text/html is present, we take that as the mime-type; if charset= 567 if text/html is present, we take that as the mime-type; if charset=
563 is present, we try to use that for character set conversion. */ 568 is present, we try to use that for character set conversion. */
564 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) 569 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
565 ret = proc (proc_cls, 570 ret = proc (proc_cls,
566 "html", 571 "html",
567 EXTRACTOR_METATYPE_MIMETYPE, 572 EXTRACTOR_METATYPE_MIMETYPE,
568 EXTRACTOR_METAFORMAT_UTF8, 573 EXTRACTOR_METAFORMAT_UTF8,
569 "text/plain", 574 "text/plain",
570 "text/html", 575 "text/html",
571 strlen ("text/html")+1); 576 strlen ("text/html") + 1);
572 charset = strcasestr (tmp, "charset="); 577 charset = strcasestr (tmp, "charset=");
573 if (charset != NULL) 578 if (charset != NULL)
574 charset = strdup (&charset[strlen ("charset=")]); 579 charset = strdup (&charset[strlen ("charset=")]);
575 free (tmp); 580 free (tmp);
576 } 581 }
577 i = 0; 582 i = 0;
578 while (tagmap[i].name != NULL) 583 while (tagmap[i].name != NULL)
584 {
585 tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
586 if ( (tmp != NULL) &&
587 (ret == 0) )
579 { 588 {
580 tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); 589 if (charset == NULL)
581 if ( (tmp != NULL) && 590 {
582 (ret == 0) ) 591 ret = proc (proc_cls,
592 "html",
593 tagmap[i].type,
594 EXTRACTOR_METAFORMAT_C_STRING,
595 "text/plain",
596 tmp,
597 strlen (tmp) + 1);
598 }
599 else
600 {
601 xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
602 strlen (tmp),
603 charset);
604 if (xtmp != NULL)
583 { 605 {
584 if (charset == NULL) 606 ret = proc (proc_cls,
585 { 607 "html",
586 ret = proc (proc_cls, 608 tagmap[i].type,
587 "html", 609 EXTRACTOR_METAFORMAT_UTF8,
588 tagmap[i].type, 610 "text/plain",
589 EXTRACTOR_METAFORMAT_C_STRING, 611 xtmp,
590 "text/plain", 612 strlen (xtmp) + 1);
591 tmp, 613 free (xtmp);
592 strlen (tmp) + 1);
593 }
594 else
595 {
596 xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
597 strlen (tmp),
598 charset);
599 if (xtmp != NULL)
600 {
601 ret = proc (proc_cls,
602 "html",
603 tagmap[i].type,
604 EXTRACTOR_METAFORMAT_UTF8,
605 "text/plain",
606 xtmp,
607 strlen (xtmp) + 1);
608 free (xtmp);
609 }
610 }
611 } 614 }
612 if (tmp != NULL) 615 }
613 free (tmp);
614 i++;
615 } 616 }
617 if (tmp != NULL)
618 free (tmp);
619 i++;
620 }
616 while (tags != NULL) 621 while (tags != NULL)
622 {
623 t = tags;
624 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
625 (ret == 0) )
617 { 626 {
618 t = tags; 627 if (charset == NULL)
619 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && 628 {
620 (ret == 0) ) 629 xtmp = malloc (t->dataEnd - t->dataStart + 1);
621 { 630 if (xtmp != NULL)
622 if (charset == NULL) 631 {
623 { 632 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
624 xtmp = malloc (t->dataEnd - t->dataStart + 1); 633 xtmp[t->dataEnd - t->dataStart] = '\0';
625 if (xtmp != NULL) 634 ret = proc (proc_cls,
626 { 635 "html",
627 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); 636 EXTRACTOR_METATYPE_TITLE,
628 xtmp[t->dataEnd - t->dataStart] = '\0'; 637 EXTRACTOR_METAFORMAT_C_STRING,
629 ret = proc (proc_cls, 638 "text/plain",
630 "html", 639 xtmp,
631 EXTRACTOR_METATYPE_TITLE, 640 strlen (xtmp) + 1);
632 EXTRACTOR_METAFORMAT_C_STRING, 641 free (xtmp);
633 "text/plain", 642 }
634 xtmp, 643 }
635 strlen (xtmp) + 1); 644 else
636 free (xtmp); 645 {
637 } 646 xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
638 } 647 t->dataEnd - t->dataStart,
639 else 648 charset);
640 { 649 if (xtmp != NULL)
641 xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart, 650 {
642 t->dataEnd - t->dataStart, 651 ret = proc (proc_cls,
643 charset); 652 "html",
644 if (xtmp != NULL) 653 EXTRACTOR_METATYPE_TITLE,
645 { 654 EXTRACTOR_METAFORMAT_UTF8,
646 ret = proc (proc_cls, 655 "text/plain",
647 "html", 656 xtmp,
648 EXTRACTOR_METATYPE_TITLE, 657 strlen (xtmp) + 1);
649 EXTRACTOR_METAFORMAT_UTF8, 658 free (xtmp);
650 "text/plain", 659 }
651 xtmp, 660 }
652 strlen (xtmp) + 1);
653 free (xtmp);
654 }
655 }
656 }
657 tags = t->next;
658 free (t);
659 } 661 }
662 tags = t->next;
663 free (t);
664 }
660 if (charset != NULL) 665 if (charset != NULL)
661 free (charset); 666 free (charset);
662 return ret; 667 return ret;
663} 668}
669
670
664#endif 671#endif
665 672
666 673
@@ -672,9 +679,9 @@ html_gobject_init ()
672{ 679{
673 magic = magic_open (MAGIC_MIME_TYPE); 680 magic = magic_open (MAGIC_MIME_TYPE);
674 if (0 != magic_load (magic, NULL)) 681 if (0 != magic_load (magic, NULL))
675 { 682 {
676 /* FIXME: how to deal with errors? */ 683 /* FIXME: how to deal with errors? */
677 } 684 }
678} 685}
679 686
680 687
@@ -685,10 +692,11 @@ void __attribute__ ((destructor))
685html_ltdl_fini () 692html_ltdl_fini ()
686{ 693{
687 if (NULL != magic) 694 if (NULL != magic)
688 { 695 {
689 magic_close (magic); 696 magic_close (magic);
690 magic = NULL; 697 magic = NULL;
691 } 698 }
692} 699}
693 700
701
694/* end of html_extractor.c */ 702/* end of html_extractor.c */