html_extractor.c (8669B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 2, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 20 */ 21 /** 22 * @file plugins/html_extractor.c 23 * @brief plugin to support HTML files 24 * @author Christian Grothoff 25 */ 26 #include "platform.h" 27 #include "extractor.h" 28 #include <magic.h> 29 #if HAVE_TIDY_H 30 #include <tidy.h> 31 #include <tidybuffio.h> 32 #elif HAVE_TIDY_TIDY_H 33 #include <tidy/tidy.h> 34 #include <tidy/tidybuffio.h> 35 #else 36 Broken build, fix tidy detection. 37 #endif 38 39 /** 40 * Mapping of HTML META names to LE types. 41 */ 42 static struct 43 { 44 /** 45 * HTML META name. 46 */ 47 const char *name; 48 49 /** 50 * Corresponding LE type. 51 */ 52 enum EXTRACTOR_MetaType type; 53 } tagmap[] = { 54 { "author", EXTRACTOR_METATYPE_AUTHOR_NAME }, 55 { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME }, 56 { "title", EXTRACTOR_METATYPE_TITLE }, 57 { "dc.title", EXTRACTOR_METATYPE_TITLE}, 58 { "description", EXTRACTOR_METATYPE_DESCRIPTION }, 59 { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION }, 60 { "subject", EXTRACTOR_METATYPE_SUBJECT}, 61 { "dc.subject", EXTRACTOR_METATYPE_SUBJECT}, 62 { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, 63 { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE}, 64 { "publisher", EXTRACTOR_METATYPE_PUBLISHER }, 65 { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER}, 66 { "rights", EXTRACTOR_METATYPE_RIGHTS }, 67 { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, 68 { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, 69 { "language", EXTRACTOR_METATYPE_LANGUAGE }, 70 { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, 71 { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, 72 { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 73 { "dc.creator", EXTRACTOR_METATYPE_CREATOR}, 74 { "dc.identifier", EXTRACTOR_METATYPE_URI }, 75 { "dc.format", EXTRACTOR_METATYPE_FORMAT }, 76 { NULL, EXTRACTOR_METATYPE_RESERVED } 77 }; 78 79 80 /** 81 * Global handle to MAGIC data. 82 */ 83 static magic_t magic; 84 85 86 /** 87 * Map 'meta' tag to LE type. 88 * 89 * @param tag tag to map 90 * @return EXTRACTOR_METATYPE_RESERVED if the type was not found 91 */ 92 static enum EXTRACTOR_MetaType 93 tag_to_type (const char *tag) 94 { 95 unsigned int i; 96 97 for (i = 0; NULL != tagmap[i].name; i++) 98 if (0 == strcasecmp (tag, 99 tagmap[i].name)) 100 return tagmap[i].type; 101 return EXTRACTOR_METATYPE_RESERVED; 102 } 103 104 105 /** 106 * Function called by libtidy for error reporting. 107 * 108 * @param doc tidy doc being processed 109 * @param lvl report level 110 * @param line input line 111 * @param col input column 112 * @param mssg message 113 * @return FALSE (no output) 114 */ 115 static Bool TIDY_CALL 116 report_cb (TidyDoc doc, 117 TidyReportLevel lvl, 118 uint line, 119 uint col, 120 ctmbstr mssg) 121 { 122 return 0; 123 } 124 125 126 /** 127 * Input callback: get next byte of input. 128 * 129 * @param sourceData our 'struct EXTRACTOR_ExtractContext' 130 * @return next byte of input, EndOfStream on errors and EOF 131 */ 132 static int TIDY_CALL 133 get_byte_cb (void *sourceData) 134 { 135 struct EXTRACTOR_ExtractContext *ec = sourceData; 136 void *data; 137 138 if (1 != 139 ec->read (ec->cls, 140 &data, 1)) 141 return EndOfStream; 142 return *(unsigned char*) data; 143 } 144 145 146 /** 147 * Input callback: unget last byte of input. 148 * 149 * @param sourceData our 'struct EXTRACTOR_ExtractContext' 150 * @param bt byte to unget (ignored) 151 */ 152 static void TIDY_CALL 153 unget_byte_cb (void *sourceData, byte bt) 154 { 155 struct EXTRACTOR_ExtractContext *ec = sourceData; 156 157 (void) ec->seek (ec->cls, -1, SEEK_CUR); 158 } 159 160 161 /** 162 * Input callback: check for EOF. 163 * 164 * @param sourceData our 'struct EXTRACTOR_ExtractContext' 165 * @return true if we are at the EOF 166 */ 167 static Bool TIDY_CALL 168 eof_cb (void *sourceData) 169 { 170 struct EXTRACTOR_ExtractContext *ec = sourceData; 171 172 return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls); 173 } 174 175 176 /** 177 * Main entry method for the 'text/html' extraction plugin. 178 * 179 * @param ec extraction context provided to the plugin 180 */ 181 void 182 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec); 183 184 void 185 EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) 186 { 187 TidyDoc doc; 188 TidyNode head; 189 TidyNode child; 190 TidyNode title; 191 TidyInputSource src; 192 const char *name; 193 TidyBuffer tbuf; 194 TidyAttr attr; 195 enum EXTRACTOR_MetaType type; 196 ssize_t iret; 197 void *data; 198 const char *mime; 199 200 if (-1 == (iret = ec->read (ec->cls, 201 &data, 202 16 * 1024))) 203 return; 204 if (NULL == (mime = magic_buffer (magic, data, iret))) 205 return; 206 if (0 != strncmp (mime, 207 "text/html", 208 strlen ("text/html"))) 209 return; /* not HTML */ 210 211 if (0 != ec->seek (ec->cls, 0, SEEK_SET)) 212 return; /* seek failed !? */ 213 214 tidyInitSource (&src, ec, 215 &get_byte_cb, 216 &unget_byte_cb, 217 &eof_cb); 218 if (NULL == (doc = tidyCreate ())) 219 return; 220 tidySetReportFilter (doc, &report_cb); 221 tidySetAppData (doc, ec); 222 if (0 > tidyParseSource (doc, &src)) 223 { 224 tidyRelease (doc); 225 return; 226 } 227 if (1 != tidyStatus (doc)) 228 { 229 tidyRelease (doc); 230 return; 231 } 232 if (NULL == (head = tidyGetHead (doc))) 233 { 234 fprintf (stderr, "no head\n"); 235 tidyRelease (doc); 236 return; 237 } 238 for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child)) 239 { 240 switch (tidyNodeGetType (child)) 241 { 242 case TidyNode_Root: 243 break; 244 case TidyNode_DocType: 245 break; 246 case TidyNode_Comment: 247 break; 248 case TidyNode_ProcIns: 249 break; 250 case TidyNode_Text: 251 break; 252 case TidyNode_CDATA: 253 break; 254 case TidyNode_Section: 255 break; 256 case TidyNode_Asp: 257 break; 258 case TidyNode_Jste: 259 break; 260 case TidyNode_Php: 261 break; 262 case TidyNode_XmlDecl: 263 break; 264 case TidyNode_Start: 265 case TidyNode_StartEnd: 266 name = tidyNodeGetName (child); 267 if ( (0 == strcasecmp (name, "title")) && 268 (NULL != (title = tidyGetChild (child))) ) 269 { 270 tidyBufInit (&tbuf); 271 tidyNodeGetValue (doc, title, &tbuf); 272 /* add 0-termination */ 273 tidyBufPutByte (&tbuf, 0); 274 if (0 != 275 ec->proc (ec->cls, 276 "html", 277 EXTRACTOR_METATYPE_TITLE, 278 EXTRACTOR_METAFORMAT_UTF8, 279 "text/plain", 280 (const char *) tbuf.bp, 281 tbuf.size)) 282 { 283 tidyBufFree (&tbuf); 284 goto CLEANUP; 285 } 286 tidyBufFree (&tbuf); 287 break; 288 } 289 if (0 == strcasecmp (name, "meta")) 290 { 291 if (NULL == (attr = tidyAttrGetById (child, 292 TidyAttr_NAME))) 293 break; 294 if (EXTRACTOR_METATYPE_RESERVED == 295 (type = tag_to_type (tidyAttrValue (attr)))) 296 break; 297 if (NULL == (attr = tidyAttrGetById (child, 298 TidyAttr_CONTENT))) 299 break; 300 name = tidyAttrValue (attr); 301 if (0 != 302 ec->proc (ec->cls, 303 "html", 304 type, 305 EXTRACTOR_METAFORMAT_UTF8, 306 "text/plain", 307 name, 308 strlen (name) + 1)) 309 goto CLEANUP; 310 break; 311 } 312 break; 313 case TidyNode_End: 314 break; 315 default: 316 break; 317 } 318 } 319 CLEANUP: 320 tidyRelease (doc); 321 } 322 323 324 /** 325 * Initialize glib and load magic file. 326 */ 327 void __attribute__ ((constructor)) 328 html_gobject_init (void); 329 330 void __attribute__ ((constructor)) 331 html_gobject_init () 332 { 333 magic = magic_open (MAGIC_MIME_TYPE); 334 if (0 != magic_load (magic, NULL)) 335 { 336 /* FIXME: how to deal with errors? */ 337 } 338 } 339 340 341 /** 342 * Destructor for the library, cleans up. 343 */ 344 void __attribute__ ((destructor)) 345 html_ltdl_fini (void); 346 347 void __attribute__ ((destructor)) 348 html_ltdl_fini () 349 { 350 if (NULL != magic) 351 { 352 magic_close (magic); 353 magic = NULL; 354 } 355 } 356 357 358 /* end of html_extractor.c */