libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

html_extractor.c (8669B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 2, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19 
     20  */
     21 /**
     22  * @file plugins/html_extractor.c
     23  * @brief plugin to support HTML files
     24  * @author Christian Grothoff
     25  */
     26 #include "platform.h"
     27 #include "extractor.h"
     28 #include <magic.h>
     29 #if HAVE_TIDY_H
     30 #include <tidy.h>
     31 #include <tidybuffio.h>
     32 #elif HAVE_TIDY_TIDY_H
     33 #include <tidy/tidy.h>
     34 #include <tidy/tidybuffio.h>
     35 #else
     36 Broken build, fix tidy detection.
     37 #endif
     38 
     39 /**
     40  * Mapping of HTML META names to LE types.
     41  */
     42 static struct
     43 {
     44   /**
     45    * HTML META name.
     46    */
     47   const char *name;
     48 
     49   /**
     50    * Corresponding LE type.
     51    */
     52   enum EXTRACTOR_MetaType type;
     53 } tagmap[] = {
     54   { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
     55   { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
     56   { "title", EXTRACTOR_METATYPE_TITLE },
     57   { "dc.title", EXTRACTOR_METATYPE_TITLE},
     58   { "description", EXTRACTOR_METATYPE_DESCRIPTION },
     59   { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
     60   { "subject", EXTRACTOR_METATYPE_SUBJECT},
     61   { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
     62   { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
     63   { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
     64   { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
     65   { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
     66   { "rights", EXTRACTOR_METATYPE_RIGHTS },
     67   { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
     68   { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
     69   { "language", EXTRACTOR_METATYPE_LANGUAGE },
     70   { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
     71   { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
     72   { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
     73   { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
     74   { "dc.identifier", EXTRACTOR_METATYPE_URI },
     75   { "dc.format", EXTRACTOR_METATYPE_FORMAT },
     76   { NULL, EXTRACTOR_METATYPE_RESERVED }
     77 };
     78 
     79 
/**
 * Global handle to MAGIC data.  Assigned from magic_open() in the
 * library constructor, passed to magic_buffer() by the extract
 * method to sniff the MIME type of the input, and released with
 * magic_close() (then reset to NULL) in the library destructor.
 */
static magic_t magic;
     84 
     85 
     86 /**
     87  * Map 'meta' tag to LE type.
     88  *
     89  * @param tag tag to map
     90  * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
     91  */
     92 static enum EXTRACTOR_MetaType
     93 tag_to_type (const char *tag)
     94 {
     95   unsigned int i;
     96 
     97   for (i = 0; NULL != tagmap[i].name; i++)
     98     if (0 == strcasecmp (tag,
     99                          tagmap[i].name))
    100       return tagmap[i].type;
    101   return EXTRACTOR_METATYPE_RESERVED;
    102 }
    103 
    104 
    105 /**
    106  * Function called by libtidy for error reporting.
    107  *
    108  * @param doc tidy doc being processed
    109  * @param lvl report level
    110  * @param line input line
    111  * @param col input column
    112  * @param mssg message
    113  * @return FALSE (no output)
    114  */
    115 static Bool TIDY_CALL
    116 report_cb (TidyDoc doc,
    117            TidyReportLevel lvl,
    118            uint line,
    119            uint col,
    120            ctmbstr mssg)
    121 {
    122   return 0;
    123 }
    124 
    125 
    126 /**
    127  * Input callback: get next byte of input.
    128  *
    129  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
    130  * @return next byte of input, EndOfStream on errors and EOF
    131  */
    132 static int TIDY_CALL
    133 get_byte_cb (void *sourceData)
    134 {
    135   struct EXTRACTOR_ExtractContext *ec = sourceData;
    136   void *data;
    137 
    138   if (1 !=
    139       ec->read (ec->cls,
    140                 &data, 1))
    141     return EndOfStream;
    142   return *(unsigned char*) data;
    143 }
    144 
    145 
    146 /**
    147  * Input callback: unget last byte of input.
    148  *
    149  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
    150  * @param bt byte to unget (ignored)
    151  */
    152 static void TIDY_CALL
    153 unget_byte_cb (void *sourceData, byte bt)
    154 {
    155   struct EXTRACTOR_ExtractContext *ec = sourceData;
    156 
    157   (void) ec->seek (ec->cls, -1, SEEK_CUR);
    158 }
    159 
    160 
    161 /**
    162  * Input callback: check for EOF.
    163  *
    164  * @param sourceData our 'struct EXTRACTOR_ExtractContext'
    165  * @return true if we are at the EOF
    166  */
    167 static Bool TIDY_CALL
    168 eof_cb (void *sourceData)
    169 {
    170   struct EXTRACTOR_ExtractContext *ec = sourceData;
    171 
    172   return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
    173 }
    174 
    175 
    176 /**
    177  * Main entry method for the 'text/html' extraction plugin.
    178  *
    179  * @param ec extraction context provided to the plugin
    180  */
    181 void
    182 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec);
    183 
    184 void
    185 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
    186 {
    187   TidyDoc doc;
    188   TidyNode head;
    189   TidyNode child;
    190   TidyNode title;
    191   TidyInputSource src;
    192   const char *name;
    193   TidyBuffer tbuf;
    194   TidyAttr attr;
    195   enum EXTRACTOR_MetaType type;
    196   ssize_t iret;
    197   void *data;
    198   const char *mime;
    199 
    200   if (-1 == (iret = ec->read (ec->cls,
    201                               &data,
    202                               16 * 1024)))
    203     return;
    204   if (NULL == (mime = magic_buffer (magic, data, iret)))
    205     return;
    206   if (0 != strncmp (mime,
    207                     "text/html",
    208                     strlen ("text/html")))
    209     return; /* not HTML */
    210 
    211   if (0 != ec->seek (ec->cls, 0, SEEK_SET))
    212     return; /* seek failed !? */
    213 
    214   tidyInitSource (&src, ec,
    215                   &get_byte_cb,
    216                   &unget_byte_cb,
    217                   &eof_cb);
    218   if (NULL == (doc = tidyCreate ()))
    219     return;
    220   tidySetReportFilter (doc, &report_cb);
    221   tidySetAppData (doc, ec);
    222   if (0 > tidyParseSource (doc, &src))
    223   {
    224     tidyRelease (doc);
    225     return;
    226   }
    227   if (1 != tidyStatus (doc))
    228   {
    229     tidyRelease (doc);
    230     return;
    231   }
    232   if (NULL == (head = tidyGetHead (doc)))
    233   {
    234     fprintf (stderr, "no head\n");
    235     tidyRelease (doc);
    236     return;
    237   }
    238   for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
    239   {
    240     switch (tidyNodeGetType (child))
    241     {
    242     case TidyNode_Root:
    243       break;
    244     case TidyNode_DocType:
    245       break;
    246     case TidyNode_Comment:
    247       break;
    248     case TidyNode_ProcIns:
    249       break;
    250     case TidyNode_Text:
    251       break;
    252     case TidyNode_CDATA:
    253       break;
    254     case TidyNode_Section:
    255       break;
    256     case TidyNode_Asp:
    257       break;
    258     case TidyNode_Jste:
    259       break;
    260     case TidyNode_Php:
    261       break;
    262     case TidyNode_XmlDecl:
    263       break;
    264     case TidyNode_Start:
    265     case TidyNode_StartEnd:
    266       name = tidyNodeGetName (child);
    267       if ( (0 == strcasecmp (name, "title")) &&
    268            (NULL != (title = tidyGetChild (child))) )
    269       {
    270         tidyBufInit (&tbuf);
    271         tidyNodeGetValue (doc, title, &tbuf);
    272         /* add 0-termination */
    273         tidyBufPutByte (&tbuf, 0);
    274         if (0 !=
    275             ec->proc (ec->cls,
    276                       "html",
    277                       EXTRACTOR_METATYPE_TITLE,
    278                       EXTRACTOR_METAFORMAT_UTF8,
    279                       "text/plain",
    280                       (const char *) tbuf.bp,
    281                       tbuf.size))
    282         {
    283           tidyBufFree (&tbuf);
    284           goto CLEANUP;
    285         }
    286         tidyBufFree (&tbuf);
    287         break;
    288       }
    289       if (0 == strcasecmp (name, "meta"))
    290       {
    291         if (NULL == (attr = tidyAttrGetById (child,
    292                                              TidyAttr_NAME)))
    293           break;
    294         if (EXTRACTOR_METATYPE_RESERVED ==
    295             (type = tag_to_type (tidyAttrValue (attr))))
    296           break;
    297         if (NULL == (attr = tidyAttrGetById (child,
    298                                              TidyAttr_CONTENT)))
    299           break;
    300         name = tidyAttrValue (attr);
    301         if (0 !=
    302             ec->proc (ec->cls,
    303                       "html",
    304                       type,
    305                       EXTRACTOR_METAFORMAT_UTF8,
    306                       "text/plain",
    307                       name,
    308                       strlen (name) + 1))
    309           goto CLEANUP;
    310         break;
    311       }
    312       break;
    313     case TidyNode_End:
    314       break;
    315     default:
    316       break;
    317     }
    318   }
    319 CLEANUP:
    320   tidyRelease (doc);
    321 }
    322 
    323 
    324 /**
    325  * Initialize glib and load magic file.
    326  */
    327 void __attribute__ ((constructor))
    328 html_gobject_init (void);
    329 
    330 void __attribute__ ((constructor))
    331 html_gobject_init ()
    332 {
    333   magic = magic_open (MAGIC_MIME_TYPE);
    334   if (0 != magic_load (magic, NULL))
    335   {
    336     /* FIXME: how to deal with errors? */
    337   }
    338 }
    339 
    340 
    341 /**
    342  * Destructor for the library, cleans up.
    343  */
    344 void __attribute__ ((destructor))
    345 html_ltdl_fini (void);
    346 
    347 void __attribute__ ((destructor))
    348 html_ltdl_fini ()
    349 {
    350   if (NULL != magic)
    351   {
    352     magic_close (magic);
    353     magic = NULL;
    354   }
    355 }
    356 
    357 
    358 /* end of html_extractor.c */