html_extractor.c - libextractor

html_extractor.c (16856B)
      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 2, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19 
     20  */
     21 /**
     22  * @file plugins/html_extractor.c
     23  * @brief plugin to support HTML files
     24  * @author Christian Grothoff
     25  */
     26 #include "platform.h"
     27 #include "extractor.h"
     28 #include <magic.h>
     29 #if HAVE_TIDY_H
     30 #include <tidy.h>
     31 #include <tidybuffio.h>
     32 #elif HAVE_TIDY_TIDY_H
     33 #include <tidy/tidy.h>
     34 #include <tidy/tidybuffio.h>
     35 #else
     36 Broken build, fix tidy detection.
     37 #endif
     38 
     39 /**
     40  * Mapping of HTML META names to LE types.
     41  */
     42 static struct
     43 {
     44   /**
     45    * HTML META name.
     46    */
     47   const char *name;
     48 
     49   /**
     50    * Corresponding LE type.
     51    */
     52   enum EXTRACTOR_MetaType type;
     53 } tagmap[] = {
     54   { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
     55   { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
     56   { "title", EXTRACTOR_METATYPE_TITLE },
     57   { "dc.title", EXTRACTOR_METATYPE_TITLE},
     58   { "description", EXTRACTOR_METATYPE_DESCRIPTION },
     59   { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
     60   { "subject", EXTRACTOR_METATYPE_SUBJECT},
     61   { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
     62   { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
     63   { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
     64   { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
     65   { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
     66   { "rights", EXTRACTOR_METATYPE_RIGHTS },
     67   { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
     68   { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
     69   { "language", EXTRACTOR_METATYPE_LANGUAGE },
     70   { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
     71   { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
     72   { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
     73   { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
     74   { "dc.identifier", EXTRACTOR_METATYPE_URI },
     75   { "dc.format", EXTRACTOR_METATYPE_FORMAT },
     76   { NULL, EXTRACTOR_METATYPE_RESERVED }
     77 };
     78 
     79 
/**
 * Global handle to the libmagic database; opened by the library
 * constructor and closed again by the library destructor.
 */
     83 static magic_t magic;
     84 
     85 
     86 /**
     87  * Map 'meta' tag to LE type.
     88  *
     89  * @param tag tag to map
     90  * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
     91  */
     92 static enum EXTRACTOR_MetaType
     93 tag_to_type (const char *tag)
     94 {
     95   unsigned int i;
     96 
     97   for (i = 0; NULL != tagmap[i].name; i++)
     98     if (0 == strcasecmp (tag,
     99                          tagmap[i].name))
    100       return tagmap[i].type;
    101   return EXTRACTOR_METATYPE_RESERVED;
    102 }
    103 
    104 
    105 /**
    106  * Function called by libtidy for error reporting.
    107  *
    108  * @param doc tidy doc being processed
    109  * @param lvl report level
    110  * @param line input line
    111  * @param col input column
    112  * @param mssg message
    113  * @return FALSE (no output)
    114  */
    115 static Bool TIDY_CALL
    116 report_cb (TidyDoc doc,
    117            TidyReportLevel lvl,
    118            uint line,
    119            uint col,
    120            ctmbstr mssg)
    121 {
    122   return 0;
    123 }
    124 
    125 
    126 /**
    127  * Input callback: get next byte of input.
    128  *
    129  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
    130  * @return next byte of input, EndOfStream on errors and EOF
    131  */
    132 static int TIDY_CALL
    133 get_byte_cb (void *sourceData)
    134 {
    135   struct EXTRACTOR_ExtractContext *ec = sourceData;
    136   void *data;
    137 
    138   if (1 !=
    139       ec->read (ec->cls,
    140                 &data, 1))
    141     return EndOfStream;
    142   return *(unsigned char*) data;
    143 }
    144 
    145 
    146 /**
    147  * Input callback: unget last byte of input.
    148  *
    149  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
    150  * @param bt byte to unget (ignored)
    151  */
    152 static void TIDY_CALL
    153 unget_byte_cb (void *sourceData, byte bt)
    154 {
    155   struct EXTRACTOR_ExtractContext *ec = sourceData;
    156 
    157   (void) ec->seek (ec->cls, -1, SEEK_CUR);
    158 }
    159 
    160 
    161 /**
    162  * Input callback: check for EOF.
    163  *
    164  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
    165  * @return true if we are at the EOF
    166  */
    167 static Bool TIDY_CALL
    168 eof_cb (void *sourceData)
    169 {
    170   struct EXTRACTOR_ExtractContext *ec = sourceData;
    171 
    172   return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
    173 }
    174 
    175 
    176 /**
    177  * Main entry method for the 'text/html' extraction plugin.
    178  *
    179  * @param ec extraction context provided to the plugin
    180  */
    181 void
    182 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
    183 {
    184   TidyDoc doc;
    185   TidyNode head;
    186   TidyNode child;
    187   TidyNode title;
    188   TidyInputSource src;
    189   const char *name;
    190   TidyBuffer tbuf;
    191   TidyAttr attr;
    192   enum EXTRACTOR_MetaType type;
    193   ssize_t iret;
    194   void *data;
    195   const char *mime;
    196 
    197   if (-1 == (iret = ec->read (ec->cls,
    198                               &data,
    199                               16 * 1024)))
    200     return;
    201   if (NULL == (mime = magic_buffer (magic, data, iret)))
    202     return;
    203   if (0 != strncmp (mime,
    204                     "text/html",
    205                     strlen ("text/html")))
    206     return; /* not HTML */
    207 
    208   if (0 != ec->seek (ec->cls, 0, SEEK_SET))
    209     return; /* seek failed !? */
    210 
    211   tidyInitSource (&src, ec,
    212                   &get_byte_cb,
    213                   &unget_byte_cb,
    214                   &eof_cb);
    215   if (NULL == (doc = tidyCreate ()))
    216     return;
    217   tidySetReportFilter (doc, &report_cb);
    218   tidySetAppData (doc, ec);
    219   if (0 > tidyParseSource (doc, &src))
    220   {
    221     tidyRelease (doc);
    222     return;
    223   }
    224   if (1 != tidyStatus (doc))
    225   {
    226     tidyRelease (doc);
    227     return;
    228   }
    229   if (NULL == (head = tidyGetHead (doc)))
    230   {
    231     fprintf (stderr, "no head\n");
    232     tidyRelease (doc);
    233     return;
    234   }
    235   for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
    236   {
    237     switch (tidyNodeGetType (child))
    238     {
    239     case TidyNode_Root:
    240       break;
    241     case TidyNode_DocType:
    242       break;
    243     case TidyNode_Comment:
    244       break;
    245     case TidyNode_ProcIns:
    246       break;
    247     case TidyNode_Text:
    248       break;
    249     case TidyNode_CDATA:
    250       break;
    251     case TidyNode_Section:
    252       break;
    253     case TidyNode_Asp:
    254       break;
    255     case TidyNode_Jste:
    256       break;
    257     case TidyNode_Php:
    258       break;
    259     case TidyNode_XmlDecl:
    260       break;
    261     case TidyNode_Start:
    262     case TidyNode_StartEnd:
    263       name = tidyNodeGetName (child);
    264       if ( (0 == strcasecmp (name, "title")) &&
    265            (NULL != (title = tidyGetChild (child))) )
    266       {
    267         tidyBufInit (&tbuf);
    268         tidyNodeGetValue (doc, title, &tbuf);
    269         /* add 0-termination */
    270         tidyBufPutByte (&tbuf, 0);
    271         if (0 !=
    272             ec->proc (ec->cls,
    273                       "html",
    274                       EXTRACTOR_METATYPE_TITLE,
    275                       EXTRACTOR_METAFORMAT_UTF8,
    276                       "text/plain",
    277                       (const char *) tbuf.bp,
    278                       tbuf.size))
    279         {
    280           tidyBufFree (&tbuf);
    281           goto CLEANUP;
    282         }
    283         tidyBufFree (&tbuf);
    284         break;
    285       }
    286       if (0 == strcasecmp (name, "meta"))
    287       {
    288         if (NULL == (attr = tidyAttrGetById (child,
    289                                              TidyAttr_NAME)))
    290           break;
    291         if (EXTRACTOR_METATYPE_RESERVED ==
    292             (type = tag_to_type (tidyAttrValue (attr))))
    293           break;
    294         if (NULL == (attr = tidyAttrGetById (child,
    295                                              TidyAttr_CONTENT)))
    296           break;
    297         name = tidyAttrValue (attr);
    298         if (0 !=
    299             ec->proc (ec->cls,
    300                       "html",
    301                       type,
    302                       EXTRACTOR_METAFORMAT_UTF8,
    303                       "text/plain",
    304                       name,
    305                       strlen (name) + 1))
    306           goto CLEANUP;
    307         break;
    308       }
    309       break;
    310     case TidyNode_End:
    311       break;
    312     default:
    313       break;
    314     }
    315   }
    316 CLEANUP:
    317   tidyRelease (doc);
    318 }
    319 
    320 
    321 #if OLD
    322 
    323 
    324 /* ******************** parser helper functions ************** */
    325 
    326 static int
    327 tagMatch (const char *tag, const char *s, const char *e)
    328 {
    329   return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
    330 }
    331 
    332 
    333 static int
    334 lookFor (char c, size_t *pos, const char *data, size_t size)
    335 {
    336   size_t p = *pos;
    337 
    338   while ((p < size) && (data[p] != c))
    339   {
    340     if (data[p] == '\0')
    341       return 0;
    342     p++;
    343   }
    344   *pos = p;
    345   return p < size;
    346 }
    347 
    348 
    349 static int
    350 skipWhitespace (size_t *pos, const char *data, size_t size)
    351 {
    352   size_t p = *pos;
    353 
    354   while ((p < size) && (isspace ( (unsigned char) data[p])))
    355   {
    356     if (data[p] == '\0')
    357       return 0;
    358     p++;
    359   }
    360   *pos = p;
    361   return p < size;
    362 }
    363 
    364 
    365 static int
    366 skipLetters (size_t *pos, const char *data, size_t size)
    367 {
    368   size_t p = *pos;
    369 
    370   while ((p < size) && (isalpha ( (unsigned char) data[p])))
    371   {
    372     if (data[p] == '\0')
    373       return 0;
    374     p++;
    375   }
    376   *pos = p;
    377   return p < size;
    378 }
    379 
    380 
    381 static int
    382 lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
    383 {
    384   size_t p = *pos;
    385 
    386   while ((p < size) && (strchr (c, data[p]) == NULL))
    387   {
    388     if (data[p] == '\0')
    389       return 0;
    390     p++;
    391   }
    392   *pos = p;
    393   return p < size;
    394 }
    395 
    396 
    397 static void
    398 findEntry (const char *key,
    399            const char *start,
    400            const char *end, const char **mstart, const char **mend)
    401 {
    402   size_t len;
    403 
    404   *mstart = NULL;
    405   *mend = NULL;
    406   len = strlen (key);
    407   while (start < end - len - 1)
    408   {
    409     start++;
    410     if (start[len] != '=')
    411       continue;
    412     if (0 == strncasecmp (start, key, len))
    413     {
    414       start += len + 1;
    415       *mstart = start;
    416       if ((*start == '\"') || (*start == '\''))
    417       {
    418         start++;
    419         while ((start < end) && (*start != **mstart))
    420           start++;
    421         (*mstart)++;            /* skip quote */
    422       }
    423       else
    424       {
    425         while ((start < end) && (! isspace ( (unsigned char) *start)))
    426           start++;
    427       }
    428       *mend = start;
    429       return;
    430     }
    431   }
    432 }
    433 
    434 
    435 /**
    436  * Search all tags that correspond to "tagname".  Example:
    437  * If the tag is <meta name="foo" desc="bar">, and
    438  * tagname == "meta", keyname="name", keyvalue="foo",
    439  * and searchname="desc", then this function returns a
    440  * copy (!) of "bar".  Easy enough?
    441  *
    442  * @return NULL if nothing is found
    443  */
    444 static char *
    445 findInTags (struct TagInfo *t,
    446             const char *tagname,
    447             const char *keyname, const char *keyvalue, const char *searchname)
    448 {
    449   const char *pstart;
    450   const char *pend;
    451 
    452   while (t != NULL)
    453   {
    454     if (tagMatch (tagname, t->tagStart, t->tagEnd))
    455     {
    456       findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
    457       if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
    458       {
    459         findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
    460         if (pstart != NULL)
    461         {
    462           char *ret = malloc (pend - pstart + 1);
    463           if (ret == NULL)
    464             return NULL;
    465           memcpy (ret, pstart, pend - pstart);
    466           ret[pend - pstart] = '\0';
    467           return ret;
    468         }
    469       }
    470     }
    471     t = t->next;
    472   }
    473   return NULL;
    474 }
    475 
    476 
    477 /* mimetype = text/html */
    478 int
    479 EXTRACTOR_html_extract (const char *data,
    480                         size_t size,
    481                         EXTRACTOR_MetaDataProcessor proc,
    482                         void *proc_cls,
    483                         const char *options)
    484 {
    485   size_t xsize;
    486   struct TagInfo *tags;
    487   struct TagInfo *t;
    488   struct TagInfo tag;
    489   size_t pos;
    490   size_t tpos;
    491   int i;
    492   char *charset;
    493   char *tmp;
    494   char *xtmp;
    495   int ret;
    496 
    497   ret = 0;
    498   if (size == 0)
    499     return 0;
    500   /* only scan first 32k */
    501   if (size > 1024 * 32)
    502     xsize = 1024 * 32;
    503   else
    504     xsize = size;
    505   tags = NULL;
    506   tag.next = NULL;
    507   pos = 0;
    508   while (pos < xsize)
    509   {
    510     if (! lookFor ('<', &pos, data, size))
    511       break;
    512     tag.tagStart = &data[++pos];
    513     if (! skipLetters (&pos, data, size))
    514       break;
    515     tag.tagEnd = &data[pos];
    516     if (! skipWhitespace (&pos, data, size))
    517       break;
    518 STEP3:
    519     if (! lookForMultiple (">\"\'", &pos, data, size))
    520       break;
    521     if (data[pos] != '>')
    522     {
    523       /* find end-quote, ignore escaped quotes (\') */
    524       do
    525       {
    526         tpos = pos;
    527         pos++;
    528         if (! lookFor (data[tpos], &pos, data, size))
    529           break;
    530       }
    531       while (data[pos - 1] == '\\');
    532       pos++;
    533       goto STEP3;
    534     }
    535     pos++;
    536     if (! skipWhitespace (&pos, data, size))
    537       break;
    538     tag.dataStart = &data[pos];
    539     if (! lookFor ('<', &pos, data, size))
    540       break;
    541     tag.dataEnd = &data[pos];
    542     i = 0;
    543     while (relevantTags[i] != NULL)
    544     {
    545       if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
    546           (0 == strncasecmp (relevantTags[i],
    547                              tag.tagStart, tag.tagEnd - tag.tagStart)))
    548       {
    549         t = malloc (sizeof (struct TagInfo));
    550         if (t == NULL)
    551           return 0;
    552         *t = tag;
    553         t->next = tags;
    554         tags = t;
    555         break;
    556       }
    557       i++;
    558     }
    559     /* abort early if we hit the body tag */
    560     if (tagMatch ("body", tag.tagStart, tag.tagEnd))
    561       break;
    562   }
    563 
    564   /* fast exit */
    565   if (tags == NULL)
    566     return 0;
    567 
    568   charset = NULL;
    569   /* first, try to determine mime type and/or character set */
    570   tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
    571   if (tmp != NULL)
    572   {
    573     /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
    574        if text/html is present, we take that as the mime-type; if charset=
    575        is present, we try to use that for character set conversion. */
    576     if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
    577       ret = proc (proc_cls,
    578                   "html",
    579                   EXTRACTOR_METATYPE_MIMETYPE,
    580                   EXTRACTOR_METAFORMAT_UTF8,
    581                   "text/plain",
    582                   "text/html",
    583                   strlen ("text/html") + 1);
    584     charset = strcasestr (tmp, "charset=");
    585     if (charset != NULL)
    586       charset = strdup (&charset[strlen ("charset=")]);
    587     free (tmp);
    588   }
    589   i = 0;
    590   while (tagmap[i].name != NULL)
    591   {
    592     tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
    593     if ( (tmp != NULL) &&
    594          (ret == 0) )
    595     {
    596       if (charset == NULL)
    597       {
    598         ret = proc (proc_cls,
    599                     "html",
    600                     tagmap[i].type,
    601                     EXTRACTOR_METAFORMAT_C_STRING,
    602                     "text/plain",
    603                     tmp,
    604                     strlen (tmp) + 1);
    605       }
    606       else
    607       {
    608         xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
    609                                                  strlen (tmp),
    610                                                  charset);
    611         if (xtmp != NULL)
    612         {
    613           ret = proc (proc_cls,
    614                       "html",
    615                       tagmap[i].type,
    616                       EXTRACTOR_METAFORMAT_UTF8,
    617                       "text/plain",
    618                       xtmp,
    619                       strlen (xtmp) + 1);
    620           free (xtmp);
    621         }
    622       }
    623     }
    624     if (tmp != NULL)
    625       free (tmp);
    626     i++;
    627   }
    628   while (tags != NULL)
    629   {
    630     t = tags;
    631     if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
    632          (ret == 0) )
    633     {
    634       if (charset == NULL)
    635       {
    636         xtmp = malloc (t->dataEnd - t->dataStart + 1);
    637         if (xtmp != NULL)
    638         {
    639           memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
    640           xtmp[t->dataEnd - t->dataStart] = '\0';
    641           ret = proc (proc_cls,
    642                       "html",
    643                       EXTRACTOR_METATYPE_TITLE,
    644                       EXTRACTOR_METAFORMAT_C_STRING,
    645                       "text/plain",
    646                       xtmp,
    647                       strlen (xtmp) + 1);
    648           free (xtmp);
    649         }
    650       }
    651       else
    652       {
    653         xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
    654                                                  t->dataEnd - t->dataStart,
    655                                                  charset);
    656         if (xtmp != NULL)
    657         {
    658           ret = proc (proc_cls,
    659                       "html",
    660                       EXTRACTOR_METATYPE_TITLE,
    661                       EXTRACTOR_METAFORMAT_UTF8,
    662                       "text/plain",
    663                       xtmp,
    664                       strlen (xtmp) + 1);
    665           free (xtmp);
    666         }
    667       }
    668     }
    669     tags = t->next;
    670     free (t);
    671   }
    672   if (charset != NULL)
    673     free (charset);
    674   return ret;
    675 }
    676 
    677 
    678 #endif
    679 
    680 
    681 /**
    682  * Initialize glib and load magic file.
    683  */
    684 void __attribute__ ((constructor))
    685 html_gobject_init ()
    686 {
    687   magic = magic_open (MAGIC_MIME_TYPE);
    688   if (0 != magic_load (magic, NULL))
    689   {
    690     /* FIXME: how to deal with errors? */
    691   }
    692 }
    693 
    694 
    695 /**
    696  * Destructor for the library, cleans up.
    697  */
    698 void __attribute__ ((destructor))
    699 html_ltdl_fini ()
    700 {
    701   if (NULL != magic)
    702   {
    703     magic_close (magic);
    704     magic = NULL;
    705   }
    706 }
    707 
    708 
    709 /* end of html_extractor.c */
	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE