html_extractor.c (16856B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 2, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 20 */ 21 /** 22 * @file plugins/html_extractor.c 23 * @brief plugin to support HTML files 24 * @author Christian Grothoff 25 */ 26 #include "platform.h" 27 #include "extractor.h" 28 #include <magic.h> 29 #if HAVE_TIDY_H 30 #include <tidy.h> 31 #include <tidybuffio.h> 32 #elif HAVE_TIDY_TIDY_H 33 #include <tidy/tidy.h> 34 #include <tidy/tidybuffio.h> 35 #else 36 Broken build, fix tidy detection. 37 #endif 38 39 /** 40 * Mapping of HTML META names to LE types. 41 */ 42 static struct 43 { 44 /** 45 * HTML META name. 46 */ 47 const char *name; 48 49 /** 50 * Corresponding LE type. 51 */ 52 enum EXTRACTOR_MetaType type; 53 } tagmap[] = { 54 { "author", EXTRACTOR_METATYPE_AUTHOR_NAME }, 55 { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME }, 56 { "title", EXTRACTOR_METATYPE_TITLE }, 57 { "dc.title", EXTRACTOR_METATYPE_TITLE}, 58 { "description", EXTRACTOR_METATYPE_DESCRIPTION }, 59 { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION }, 60 { "subject", EXTRACTOR_METATYPE_SUBJECT}, 61 { "dc.subject", EXTRACTOR_METATYPE_SUBJECT}, 62 { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, 63 { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE}, 64 { "publisher", EXTRACTOR_METATYPE_PUBLISHER }, 65 { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER}, 66 { "rights", EXTRACTOR_METATYPE_RIGHTS }, 67 { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, 68 { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, 69 { "language", EXTRACTOR_METATYPE_LANGUAGE }, 70 { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, 71 { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, 72 { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 73 { "dc.creator", EXTRACTOR_METATYPE_CREATOR}, 74 { "dc.identifier", EXTRACTOR_METATYPE_URI }, 75 { "dc.format", EXTRACTOR_METATYPE_FORMAT }, 76 { NULL, EXTRACTOR_METATYPE_RESERVED } 77 }; 78 79 80 /** 81 * Global handle to MAGIC data. 82 */ 83 static magic_t magic; 84 85 86 /** 87 * Map 'meta' tag to LE type. 88 * 89 * @param tag tag to map 90 * @return EXTRACTOR_METATYPE_RESERVED if the type was not found 91 */ 92 static enum EXTRACTOR_MetaType 93 tag_to_type (const char *tag) 94 { 95 unsigned int i; 96 97 for (i = 0; NULL != tagmap[i].name; i++) 98 if (0 == strcasecmp (tag, 99 tagmap[i].name)) 100 return tagmap[i].type; 101 return EXTRACTOR_METATYPE_RESERVED; 102 } 103 104 105 /** 106 * Function called by libtidy for error reporting. 107 * 108 * @param doc tidy doc being processed 109 * @param lvl report level 110 * @param line input line 111 * @param col input column 112 * @param mssg message 113 * @return FALSE (no output) 114 */ 115 static Bool TIDY_CALL 116 report_cb (TidyDoc doc, 117 TidyReportLevel lvl, 118 uint line, 119 uint col, 120 ctmbstr mssg) 121 { 122 return 0; 123 } 124 125 126 /** 127 * Input callback: get next byte of input. 128 * 129 * @param sourceData our 'struct EXTRACTOR_ExtractContext' 130 * @return next byte of input, EndOfStream on errors and EOF 131 */ 132 static int TIDY_CALL 133 get_byte_cb (void *sourceData) 134 { 135 struct EXTRACTOR_ExtractContext *ec = sourceData; 136 void *data; 137 138 if (1 != 139 ec->read (ec->cls, 140 &data, 1)) 141 return EndOfStream; 142 return *(unsigned char*) data; 143 } 144 145 146 /** 147 * Input callback: unget last byte of input. 148 * 149 * @param sourceData our 'struct EXTRACTOR_ExtractContext' 150 * @param bt byte to unget (ignored) 151 */ 152 static void TIDY_CALL 153 unget_byte_cb (void *sourceData, byte bt) 154 { 155 struct EXTRACTOR_ExtractContext *ec = sourceData; 156 157 (void) ec->seek (ec->cls, -1, SEEK_CUR); 158 } 159 160 161 /** 162 * Input callback: check for EOF. 163 * 164 * @param sourceData our 'struct EXTRACTOR_ExtractContext' 165 * @return true if we are at the EOF 166 */ 167 static Bool TIDY_CALL 168 eof_cb (void *sourceData) 169 { 170 struct EXTRACTOR_ExtractContext *ec = sourceData; 171 172 return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls); 173 } 174 175 176 /** 177 * Main entry method for the 'text/html' extraction plugin. 178 * 179 * @param ec extraction context provided to the plugin 180 */ 181 void 182 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) 183 { 184 TidyDoc doc; 185 TidyNode head; 186 TidyNode child; 187 TidyNode title; 188 TidyInputSource src; 189 const char *name; 190 TidyBuffer tbuf; 191 TidyAttr attr; 192 enum EXTRACTOR_MetaType type; 193 ssize_t iret; 194 void *data; 195 const char *mime; 196 197 if (-1 == (iret = ec->read (ec->cls, 198 &data, 199 16 * 1024))) 200 return; 201 if (NULL == (mime = magic_buffer (magic, data, iret))) 202 return; 203 if (0 != strncmp (mime, 204 "text/html", 205 strlen ("text/html"))) 206 return; /* not HTML */ 207 208 if (0 != ec->seek (ec->cls, 0, SEEK_SET)) 209 return; /* seek failed !? */ 210 211 tidyInitSource (&src, ec, 212 &get_byte_cb, 213 &unget_byte_cb, 214 &eof_cb); 215 if (NULL == (doc = tidyCreate ())) 216 return; 217 tidySetReportFilter (doc, &report_cb); 218 tidySetAppData (doc, ec); 219 if (0 > tidyParseSource (doc, &src)) 220 { 221 tidyRelease (doc); 222 return; 223 } 224 if (1 != tidyStatus (doc)) 225 { 226 tidyRelease (doc); 227 return; 228 } 229 if (NULL == (head = tidyGetHead (doc))) 230 { 231 fprintf (stderr, "no head\n"); 232 tidyRelease (doc); 233 return; 234 } 235 for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child)) 236 { 237 switch (tidyNodeGetType (child)) 238 { 239 case TidyNode_Root: 240 break; 241 case TidyNode_DocType: 242 break; 243 case TidyNode_Comment: 244 break; 245 case TidyNode_ProcIns: 246 break; 247 case TidyNode_Text: 248 break; 249 case TidyNode_CDATA: 250 break; 251 case TidyNode_Section: 252 break; 253 case TidyNode_Asp: 254 break; 255 case TidyNode_Jste: 256 break; 257 case TidyNode_Php: 258 break; 259 case TidyNode_XmlDecl: 260 break; 261 case TidyNode_Start: 262 case TidyNode_StartEnd: 263 name = tidyNodeGetName (child); 264 if ( (0 == strcasecmp (name, "title")) && 265 (NULL != (title = tidyGetChild (child))) ) 266 { 267 tidyBufInit (&tbuf); 268 tidyNodeGetValue (doc, title, &tbuf); 269 /* add 0-termination */ 270 tidyBufPutByte (&tbuf, 0); 271 if (0 != 272 ec->proc (ec->cls, 273 "html", 274 EXTRACTOR_METATYPE_TITLE, 275 EXTRACTOR_METAFORMAT_UTF8, 276 "text/plain", 277 (const char *) tbuf.bp, 278 tbuf.size)) 279 { 280 tidyBufFree (&tbuf); 281 goto CLEANUP; 282 } 283 tidyBufFree (&tbuf); 284 break; 285 } 286 if (0 == strcasecmp (name, "meta")) 287 { 288 if (NULL == (attr = tidyAttrGetById (child, 289 TidyAttr_NAME))) 290 break; 291 if (EXTRACTOR_METATYPE_RESERVED == 292 (type = tag_to_type (tidyAttrValue (attr)))) 293 break; 294 if (NULL == (attr = tidyAttrGetById (child, 295 TidyAttr_CONTENT))) 296 break; 297 name = tidyAttrValue (attr); 298 if (0 != 299 ec->proc (ec->cls, 300 "html", 301 type, 302 EXTRACTOR_METAFORMAT_UTF8, 303 "text/plain", 304 name, 305 strlen (name) + 1)) 306 goto CLEANUP; 307 break; 308 } 309 break; 310 case TidyNode_End: 311 break; 312 default: 313 break; 314 } 315 } 316 CLEANUP: 317 tidyRelease (doc); 318 } 319 320 321 #if OLD 322 323 324 /* ******************** parser helper functions ************** */ 325 326 static int 327 tagMatch (const char *tag, const char *s, const char *e) 328 { 329 return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s))); 330 } 331 332 333 static int 334 lookFor (char c, size_t *pos, const char *data, size_t size) 335 { 336 size_t p = *pos; 337 338 while ((p < size) && (data[p] != c)) 339 { 340 if (data[p] == '\0') 341 return 0; 342 p++; 343 } 344 *pos = p; 345 return p < size; 346 } 347 348 349 static int 350 skipWhitespace (size_t *pos, const char *data, size_t size) 351 { 352 size_t p = *pos; 353 354 while ((p < size) && (isspace ( (unsigned char) data[p]))) 355 { 356 if (data[p] == '\0') 357 return 0; 358 p++; 359 } 360 *pos = p; 361 return p < size; 362 } 363 364 365 static int 366 skipLetters (size_t *pos, const char *data, size_t size) 367 { 368 size_t p = *pos; 369 370 while ((p < size) && (isalpha ( (unsigned char) data[p]))) 371 { 372 if (data[p] == '\0') 373 return 0; 374 p++; 375 } 376 *pos = p; 377 return p < size; 378 } 379 380 381 static int 382 lookForMultiple (const char *c, size_t *pos, const char *data, size_t size) 383 { 384 size_t p = *pos; 385 386 while ((p < size) && (strchr (c, data[p]) == NULL)) 387 { 388 if (data[p] == '\0') 389 return 0; 390 p++; 391 } 392 *pos = p; 393 return p < size; 394 } 395 396 397 static void 398 findEntry (const char *key, 399 const char *start, 400 const char *end, const char **mstart, const char **mend) 401 { 402 size_t len; 403 404 *mstart = NULL; 405 *mend = NULL; 406 len = strlen (key); 407 while (start < end - len - 1) 408 { 409 start++; 410 if (start[len] != '=') 411 continue; 412 if (0 == strncasecmp (start, key, len)) 413 { 414 start += len + 1; 415 *mstart = start; 416 if ((*start == '\"') || (*start == '\'')) 417 { 418 start++; 419 while ((start < end) && (*start != **mstart)) 420 start++; 421 (*mstart)++; /* skip quote */ 422 } 423 else 424 { 425 while ((start < end) && (! isspace ( (unsigned char) *start))) 426 start++; 427 } 428 *mend = start; 429 return; 430 } 431 } 432 } 433 434 435 /** 436 * Search all tags that correspond to "tagname". Example: 437 * If the tag is <meta name="foo" desc="bar">, and 438 * tagname == "meta", keyname="name", keyvalue="foo", 439 * and searchname="desc", then this function returns a 440 * copy (!) of "bar". Easy enough? 441 * 442 * @return NULL if nothing is found 443 */ 444 static char * 445 findInTags (struct TagInfo *t, 446 const char *tagname, 447 const char *keyname, const char *keyvalue, const char *searchname) 448 { 449 const char *pstart; 450 const char *pend; 451 452 while (t != NULL) 453 { 454 if (tagMatch (tagname, t->tagStart, t->tagEnd)) 455 { 456 findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); 457 if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) 458 { 459 findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); 460 if (pstart != NULL) 461 { 462 char *ret = malloc (pend - pstart + 1); 463 if (ret == NULL) 464 return NULL; 465 memcpy (ret, pstart, pend - pstart); 466 ret[pend - pstart] = '\0'; 467 return ret; 468 } 469 } 470 } 471 t = t->next; 472 } 473 return NULL; 474 } 475 476 477 /* mimetype = text/html */ 478 int 479 EXTRACTOR_html_extract (const char *data, 480 size_t size, 481 EXTRACTOR_MetaDataProcessor proc, 482 void *proc_cls, 483 const char *options) 484 { 485 size_t xsize; 486 struct TagInfo *tags; 487 struct TagInfo *t; 488 struct TagInfo tag; 489 size_t pos; 490 size_t tpos; 491 int i; 492 char *charset; 493 char *tmp; 494 char *xtmp; 495 int ret; 496 497 ret = 0; 498 if (size == 0) 499 return 0; 500 /* only scan first 32k */ 501 if (size > 1024 * 32) 502 xsize = 1024 * 32; 503 else 504 xsize = size; 505 tags = NULL; 506 tag.next = NULL; 507 pos = 0; 508 while (pos < xsize) 509 { 510 if (! lookFor ('<', &pos, data, size)) 511 break; 512 tag.tagStart = &data[++pos]; 513 if (! skipLetters (&pos, data, size)) 514 break; 515 tag.tagEnd = &data[pos]; 516 if (! skipWhitespace (&pos, data, size)) 517 break; 518 STEP3: 519 if (! lookForMultiple (">\"\'", &pos, data, size)) 520 break; 521 if (data[pos] != '>') 522 { 523 /* find end-quote, ignore escaped quotes (\') */ 524 do 525 { 526 tpos = pos; 527 pos++; 528 if (! lookFor (data[tpos], &pos, data, size)) 529 break; 530 } 531 while (data[pos - 1] == '\\'); 532 pos++; 533 goto STEP3; 534 } 535 pos++; 536 if (! skipWhitespace (&pos, data, size)) 537 break; 538 tag.dataStart = &data[pos]; 539 if (! lookFor ('<', &pos, data, size)) 540 break; 541 tag.dataEnd = &data[pos]; 542 i = 0; 543 while (relevantTags[i] != NULL) 544 { 545 if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && 546 (0 == strncasecmp (relevantTags[i], 547 tag.tagStart, tag.tagEnd - tag.tagStart))) 548 { 549 t = malloc (sizeof (struct TagInfo)); 550 if (t == NULL) 551 return 0; 552 *t = tag; 553 t->next = tags; 554 tags = t; 555 break; 556 } 557 i++; 558 } 559 /* abort early if we hit the body tag */ 560 if (tagMatch ("body", tag.tagStart, tag.tagEnd)) 561 break; 562 } 563 564 /* fast exit */ 565 if (tags == NULL) 566 return 0; 567 568 charset = NULL; 569 /* first, try to determine mime type and/or character set */ 570 tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); 571 if (tmp != NULL) 572 { 573 /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; 574 if text/html is present, we take that as the mime-type; if charset= 575 is present, we try to use that for character set conversion. */ 576 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) 577 ret = proc (proc_cls, 578 "html", 579 EXTRACTOR_METATYPE_MIMETYPE, 580 EXTRACTOR_METAFORMAT_UTF8, 581 "text/plain", 582 "text/html", 583 strlen ("text/html") + 1); 584 charset = strcasestr (tmp, "charset="); 585 if (charset != NULL) 586 charset = strdup (&charset[strlen ("charset=")]); 587 free (tmp); 588 } 589 i = 0; 590 while (tagmap[i].name != NULL) 591 { 592 tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); 593 if ( (tmp != NULL) && 594 (ret == 0) ) 595 { 596 if (charset == NULL) 597 { 598 ret = proc (proc_cls, 599 "html", 600 tagmap[i].type, 601 EXTRACTOR_METAFORMAT_C_STRING, 602 "text/plain", 603 tmp, 604 strlen (tmp) + 1); 605 } 606 else 607 { 608 xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, 609 strlen (tmp), 610 charset); 611 if (xtmp != NULL) 612 { 613 ret = proc (proc_cls, 614 "html", 615 tagmap[i].type, 616 EXTRACTOR_METAFORMAT_UTF8, 617 "text/plain", 618 xtmp, 619 strlen (xtmp) + 1); 620 free (xtmp); 621 } 622 } 623 } 624 if (tmp != NULL) 625 free (tmp); 626 i++; 627 } 628 while (tags != NULL) 629 { 630 t = tags; 631 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && 632 (ret == 0) ) 633 { 634 if (charset == NULL) 635 { 636 xtmp = malloc (t->dataEnd - t->dataStart + 1); 637 if (xtmp != NULL) 638 { 639 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); 640 xtmp[t->dataEnd - t->dataStart] = '\0'; 641 ret = proc (proc_cls, 642 "html", 643 EXTRACTOR_METATYPE_TITLE, 644 EXTRACTOR_METAFORMAT_C_STRING, 645 "text/plain", 646 xtmp, 647 strlen (xtmp) + 1); 648 free (xtmp); 649 } 650 } 651 else 652 { 653 xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart, 654 t->dataEnd - t->dataStart, 655 charset); 656 if (xtmp != NULL) 657 { 658 ret = proc (proc_cls, 659 "html", 660 EXTRACTOR_METATYPE_TITLE, 661 EXTRACTOR_METAFORMAT_UTF8, 662 "text/plain", 663 xtmp, 664 strlen (xtmp) + 1); 665 free (xtmp); 666 } 667 } 668 } 669 tags = t->next; 670 free (t); 671 } 672 if (charset != NULL) 673 free (charset); 674 return ret; 675 } 676 677 678 #endif 679 680 681 /** 682 * Initialize glib and load magic file. 683 */ 684 void __attribute__ ((constructor)) 685 html_gobject_init () 686 { 687 magic = magic_open (MAGIC_MIME_TYPE); 688 if (0 != magic_load (magic, NULL)) 689 { 690 /* FIXME: how to deal with errors? */ 691 } 692 } 693 694 695 /** 696 * Destructor for the library, cleans up. 697 */ 698 void __attribute__ ((destructor)) 699 html_ltdl_fini () 700 { 701 if (NULL != magic) 702 { 703 magic_close (magic); 704 magic = NULL; 705 } 706 } 707 708 709 /* end of html_extractor.c */