1 files changed, 694 insertions, 0 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c
new file mode 100644
index 0000000..65fb535
--- /dev/null
+++ b/src/plugins/html_extractor.c
@@ -0,0 +1,694 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */
+/**
+ * @file plugins/html_extractor.c
+ * @brief plugin to support HTML files
+ * @author Christian Grothoff
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <magic.h>
+#include <tidy/tidy.h>
+#include <tidy/buffio.h>
+/**
+ * Mapping of HTML META names to LE types.
+ *
+ * The table is read-only and terminated by an entry whose name is
+ * NULL; names are compared case-insensitively by tag_to_type().
+ */
+static const struct
+{
+  /**
+   * HTML META name.
+   */
+  const char *name;
+  /**
+   * Corresponding LE type.
+   */
+  enum EXTRACTOR_MetaType type;
+} tagmap[] = {
+  { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "title", EXTRACTOR_METATYPE_TITLE },
+  { "dc.title", EXTRACTOR_METATYPE_TITLE },
+  { "description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "dc.subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
+  { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER },
+  { "rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
+  { "language", EXTRACTOR_METATYPE_LANGUAGE },
+  { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
+  { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { "dc.creator", EXTRACTOR_METATYPE_CREATOR },
+  { "dc.identifier", EXTRACTOR_METATYPE_URI },
+  { "dc.format", EXTRACTOR_METATYPE_FORMAT },
+  /* sentinel: marks the end of the table */
+  { NULL, EXTRACTOR_METATYPE_RESERVED }
+};
+/**
+ * Global handle to MAGIC data.
+ *
+ * Opened by the library constructor (html_gobject_init), used by
+ * EXTRACTOR_html_extract_method() to detect the MIME type of the
+ * input and closed again by the library destructor (html_ltdl_fini).
+ */
+static magic_t magic;
+/**
+ * Map 'meta' tag to LE type.
+ *
+ * The comparison against the names in 'tagmap' is case-insensitive.
+ *
+ * @param tag tag to map, may be NULL (e.g. a 'name' attribute
+ *        without a value), in which case nothing matches
+ * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
+ */
+static enum EXTRACTOR_MetaType
+tag_to_type (const char *tag)
+{
+  unsigned int i;
+
+  /* strcasecmp() on a NULL pointer is undefined behaviour */
+  if (NULL == tag)
+    return EXTRACTOR_METATYPE_RESERVED;
+  for (i = 0; NULL != tagmap[i].name; i++)
+    if (0 == strcasecmp (tag,
+                         tagmap[i].name))
+      return tagmap[i].type;
+  return EXTRACTOR_METATYPE_RESERVED;
+}
+/**
+ * Report filter installed into libtidy; it discards every
+ * diagnostic so that parsing never produces any output.
+ *
+ * @param doc tidy doc being processed (unused)
+ * @param lvl report level (unused)
+ * @param line input line (unused)
+ * @param col input column (unused)
+ * @param mssg message (unused)
+ * @return always 'no' (FALSE), i.e. do not output the message
+ */
+static Bool
+report_cb (TidyDoc doc,
+           TidyReportLevel lvl,
+           uint line,
+           uint col,
+           ctmbstr mssg)
+{
+  (void) doc;
+  (void) lvl;
+  (void) line;
+  (void) col;
+  (void) mssg;
+  return no;
+}
+/**
+ * Input callback for libtidy: fetch the next byte of the input
+ * via the 'read' function of the extraction context.
+ *
+ * @param sourceData our 'struct EXTRACTOR_ExtractContext'
+ * @return next byte of input (as unsigned value), EndOfStream
+ *         whenever 'read' did not deliver exactly one byte
+ */
+static int
+get_byte_cb (void *sourceData)
+{
+  struct EXTRACTOR_ExtractContext *ctx = sourceData;
+  void *chunk;
+  const unsigned char *octet;
+
+  if (1 != ctx->read (ctx->cls, &chunk, 1))
+    return EndOfStream;
+  octet = chunk;
+  return *octet;
+}
+/**
+ * Input callback for libtidy: push back the last byte read.
+ * Implemented by seeking one byte backwards in the input; the
+ * result of the seek is deliberately ignored.
+ *
+ * @param sourceData our 'struct EXTRACTOR_ExtractContext'
+ * @param bt byte to unget (ignored, the input is simply re-read)
+ */
+static void
+unget_byte_cb (void *sourceData, byte bt)
+{
+  struct EXTRACTOR_ExtractContext *ctx = sourceData;
+
+  (void) bt;
+  (void) ctx->seek (ctx->cls, -1, SEEK_CUR);
+}
+/**
+ * Input callback for libtidy: check for EOF by comparing the
+ * current position (obtained with a zero-offset seek) against the
+ * size reported by the extraction context.
+ *
+ * @param sourceData our 'struct EXTRACTOR_ExtractContext'
+ * @return 'yes' if the current position equals the input size,
+ *         'no' otherwise
+ */
+static Bool
+eof_cb (void *sourceData)
+{
+  struct EXTRACTOR_ExtractContext *ctx = sourceData;
+
+  if (ctx->seek (ctx->cls, 0, SEEK_CUR) != ctx->get_size (ctx->cls))
+    return no;
+  return yes;
+}
+/**
+ * Main entry method for the 'text/html' extraction plugin.
+ *
+ * Reads up to 16k of the input and asks libmagic for the MIME type;
+ * anything that is not reported as "text/html" is ignored.  For HTML
+ * the input is rewound and parsed with libtidy, then the children of
+ * the HEAD element are inspected:
+ *  - the text of TITLE is reported as EXTRACTOR_METATYPE_TITLE,
+ *  - META elements whose 'name' is listed in 'tagmap' are reported
+ *    with the mapped type and the value of their 'content' attribute.
+ * Extraction stops as soon as 'ec->proc' returns non-zero.
+ *
+ * @param ec extraction context provided to the plugin
+ */
+void
+EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
+{
+  TidyDoc doc;
+  TidyNode head;
+  TidyNode child;
+  TidyNode title;
+  TidyNodeType ntype;
+  TidyInputSource src;
+  const char *name;
+  const char *value;
+  TidyBuffer tbuf;
+  TidyAttr attr;
+  enum EXTRACTOR_MetaType type;
+  ssize_t iret;
+  void *data;
+  const char *mime;
+  int status;
+  int stop;
+
+  if (NULL == magic)
+    return; /* libmagic not available, cannot identify HTML */
+  if (-1 == (iret = ec->read (ec->cls,
+                              &data,
+                              16 * 1024)))
+    return;
+  if (NULL == (mime = magic_buffer (magic, data, iret)))
+    return;
+  if (0 != strncmp (mime,
+                    "text/html",
+                    strlen ("text/html")))
+    return; /* not HTML */
+  /* rewind, tidy has to see the document from the beginning */
+  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
+    return; /* seek failed !? */
+  tidyInitSource (&src, ec,
+                  &get_byte_cb,
+                  &unget_byte_cb,
+                  &eof_cb);
+  if (NULL == (doc = tidyCreate ()))
+    return;
+  tidySetReportFilter (doc, &report_cb);
+  tidySetAppData (doc, ec);
+  if (0 > tidyParseSource (doc, &src))
+    goto CLEANUP;
+  /* tidyStatus: 0 = no problems, 1 = warnings, 2 = errors.  The
+     former test '1 != status' also discarded documents that parsed
+     without a single warning; only give up on errors.
+     NOTE(review): confirm that clean documents should be processed. */
+  status = tidyStatus (doc);
+  if ( (0 != status) &&
+       (1 != status) )
+    goto CLEANUP;
+  if (NULL == (head = tidyGetHead (doc)))
+    {
+      fprintf (stderr, "no head\n");
+      goto CLEANUP;
+    }
+  for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
+    {
+      /* only element nodes can be TITLE or META */
+      ntype = tidyNodeGetType (child);
+      if ( (TidyNode_Start != ntype) &&
+           (TidyNode_StartEnd != ntype) )
+        continue;
+      if (NULL == (name = tidyNodeGetName (child)))
+        continue;
+      if ( (0 == strcasecmp (name, "title")) &&
+           (NULL != (title = tidyGetChild (child))) )
+        {
+          tidyBufInit (&tbuf);
+          if (no == tidyNodeGetValue (doc, title, &tbuf))
+            {
+              /* child carries no text value, nothing to report */
+              tidyBufFree (&tbuf);
+              continue;
+            }
+          /* add 0-termination */
+          tidyBufPutByte (&tbuf, 0);
+          stop = ec->proc (ec->cls,
+                           "html",
+                           EXTRACTOR_METATYPE_TITLE,
+                           EXTRACTOR_METAFORMAT_UTF8,
+                           "text/plain",
+                           (const char *) tbuf.bp,
+                           tbuf.size);
+          tidyBufFree (&tbuf);
+          if (0 != stop)
+            goto CLEANUP;
+          continue;
+        }
+      if (0 != strcasecmp (name, "meta"))
+        continue;
+      if (NULL == (attr = tidyAttrGetById (child,
+                                           TidyAttr_NAME)))
+        continue;
+      /* attributes without a value yield NULL */
+      if (NULL == (value = tidyAttrValue (attr)))
+        continue;
+      if (EXTRACTOR_METATYPE_RESERVED ==
+          (type = tag_to_type (value)))
+        continue;
+      if (NULL == (attr = tidyAttrGetById (child,
+                                           TidyAttr_CONTENT)))
+        continue;
+      /* <meta name="..." content> has no value: strlen (NULL) would crash */
+      if (NULL == (value = tidyAttrValue (attr)))
+        continue;
+      if (0 !=
+          ec->proc (ec->cls,
+                    "html",
+                    type,
+                    EXTRACTOR_METAFORMAT_UTF8,
+                    "text/plain",
+                    value,
+                    strlen (value) + 1))
+        goto CLEANUP;
+    }
+ CLEANUP:
+  tidyRelease (doc);
+}
+#if OLD
+/* ******************** parser helper functions ************** */
+/**
+ * Case-insensitive comparison of a 0-terminated tag name with the
+ * character range [s, e).
+ *
+ * @param tag 0-terminated name to compare against
+ * @param s first character of the range
+ * @param e first character after the range
+ * @return 1 if the range has the same length as 'tag' and matches
+ *         it ignoring case, 0 otherwise
+ */
+static int
+tagMatch (const char *tag, const char *s, const char *e)
+{
+  size_t span = e - s;
+
+  if (span != strlen (tag))
+    return 0;
+  return 0 == strncasecmp (tag, s, span);
+}
+/**
+ * Advance '*pos' to the next occurrence of 'c' in 'data'.
+ *
+ * @param c character to search for
+ * @param pos in: offset to start at; out: offset of the match, or
+ *        'size' if the end was reached.  Left untouched if a 0-byte
+ *        (other than a searched-for 0) is hit first.
+ * @param data buffer to scan
+ * @param size number of bytes in 'data'
+ * @return 1 if 'c' was found, 0 if a 0-byte or the end of the
+ *         buffer was reached first
+ */
+static int
+lookFor (char c, size_t * pos, const char *data, size_t size)
+{
+  size_t off;
+
+  for (off = *pos; off < size; off++)
+    {
+      if (c == data[off])
+        break;
+      if ('\0' == data[off])
+        return 0;
+    }
+  *pos = off;
+  return off < size;
+}
+/**
+ * Advance '*pos' past any whitespace in 'data'.
+ *
+ * A 0-byte is not whitespace and therefore simply ends the run; the
+ * former explicit 0-byte test inside the loop could never trigger.
+ *
+ * @param pos in: offset to start at; out: offset of the first
+ *        non-whitespace byte, or 'size' if the end was reached
+ * @param data buffer to scan
+ * @param size number of bytes in 'data'
+ * @return 1 if '*pos' is still inside the buffer, 0 otherwise
+ */
+static int
+skipWhitespace (size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while ((p < size) && (isspace ( (unsigned char) data[p])))
+    p++;
+  *pos = p;
+  return p < size;
+}
+/**
+ * Advance '*pos' past any alphabetic characters in 'data'.
+ *
+ * A 0-byte is not alphabetic and therefore simply ends the run; the
+ * former explicit 0-byte test inside the loop could never trigger.
+ *
+ * @param pos in: offset to start at; out: offset of the first
+ *        non-alphabetic byte, or 'size' if the end was reached
+ * @param data buffer to scan
+ * @param size number of bytes in 'data'
+ * @return 1 if '*pos' is still inside the buffer, 0 otherwise
+ */
+static int
+skipLetters (size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while ((p < size) && (isalpha ( (unsigned char) data[p])))
+    p++;
+  *pos = p;
+  return p < size;
+}
+/**
+ * Advance '*pos' to the next byte of 'data' that is contained in
+ * the character set 'c'.
+ *
+ * strchr() also matches the terminating 0 of the set, so the 0-byte
+ * has to be tested before the set lookup; otherwise a 0-byte in the
+ * input would be reported as a hit instead of aborting the scan
+ * (as lookFor() does).
+ *
+ * @param c 0-terminated set of characters to search for
+ * @param pos in: offset to start at; out: offset of the match, or
+ *        'size' if the end was reached.  Left untouched if a 0-byte
+ *        is hit first.
+ * @param data buffer to scan
+ * @param size number of bytes in 'data'
+ * @return 1 if a character from 'c' was found, 0 if a 0-byte or the
+ *         end of the buffer was reached first
+ */
+static int
+lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+
+  while (p < size)
+    {
+      if (data[p] == '\0')
+        return 0;
+      if (strchr (c, data[p]) != NULL)
+        break;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+/**
+ * Locate the value of the attribute 'key' ("key=value",
+ * "key='value'" or "key=\"value\"") within the range [start, end).
+ * The key is matched case-insensitively; the very first character
+ * of the range is never considered as the beginning of a key.
+ *
+ * @param key 0-terminated attribute name
+ * @param start beginning of the range to search
+ * @param end end of the range to search
+ * @param mstart set to the first character of the value (after the
+ *        opening quote, if any), NULL if the key was not found
+ * @param mend set to the first character after the value (the
+ *        closing quote, the first whitespace or 'end'), NULL if the
+ *        key was not found
+ */
+static void
+findEntry (const char *key,
+           const char *start,
+           const char *end, const char **mstart, const char **mend)
+{
+  size_t len;
+  char quote;
+
+  *mstart = NULL;
+  *mend = NULL;
+  len = strlen (key);
+  /* same as 'start < end - len - 1', but without forming a pointer
+     in front of the buffer when the range is shorter than the key */
+  while ( (end > start) &&
+          ((size_t) (end - start) > len + 1) )
+    {
+      start++;
+      if (start[len] != '=')
+        continue;
+      if (0 != strncasecmp (start, key, len))
+        continue;
+      start += len + 1;
+      *mstart = start;
+      if ((*start == '\"') || (*start == '\''))
+        {
+          /* quoted value: runs up to the matching quote */
+          quote = *start;
+          start++;
+          while ((start < end) && (*start != quote))
+            start++;
+          (*mstart)++;      /* skip quote */
+        }
+      else
+        {
+          /* unquoted value: runs up to the next whitespace */
+          while ((start < end) && (!isspace ( (unsigned char) *start)))
+            start++;
+        }
+      *mend = start;
+      return;
+    }
+}
+/**
+ * Walk the tag list and look for a tag named "tagname" that has an
+ * attribute keyname=keyvalue; for the first such tag that also has
+ * an attribute "searchname", return a copy of that attribute's
+ * value.  Example: for <meta name="foo" desc="bar"> with
+ * tagname == "meta", keyname == "name", keyvalue == "foo" and
+ * searchname == "desc" the result is a copy of "bar".
+ *
+ * @param t head of the tag list
+ * @param tagname name of the tag to look at
+ * @param keyname attribute that selects the tag
+ * @param keyvalue value the selecting attribute must have
+ * @param searchname attribute whose value is returned
+ * @return freshly allocated, 0-terminated copy of the value (to be
+ *         freed by the caller); NULL if nothing is found or the
+ *         allocation failed
+ */
+static char *
+findInTags (struct TagInfo * t,
+            const char *tagname,
+            const char *keyname, const char *keyvalue, const char *searchname)
+{
+  const char *vstart;
+  const char *vend;
+  char *copy;
+  size_t vlen;
+
+  for (; NULL != t; t = t->next)
+    {
+      if (! tagMatch (tagname, t->tagStart, t->tagEnd))
+        continue;
+      /* attributes live between the tag name and the tag's data */
+      findEntry (keyname, t->tagEnd, t->dataStart, &vstart, &vend);
+      if (NULL == vstart)
+        continue;
+      if (! tagMatch (keyvalue, vstart, vend))
+        continue;
+      findEntry (searchname, t->tagEnd, t->dataStart, &vstart, &vend);
+      if (NULL == vstart)
+        continue;
+      vlen = vend - vstart;
+      copy = malloc (vlen + 1);
+      if (NULL == copy)
+        return NULL;
+      memcpy (copy, vstart, vlen);
+      copy[vlen] = '\0';
+      return copy;
+    }
+  return NULL;
+}
+/**
+ * Old (hand-written parser) extraction function for
+ * mimetype = text/html; only compiled if OLD is set.
+ *
+ * Scans the first 32k of 'data' for tags listed in 'relevantTags'
+ * (stopping at the BODY tag), then reports
+ *  - the mime type, if a META http-equiv="content-type" with a
+ *    content starting with "text/html" is present,
+ *  - the content of every META tag whose name is listed in 'tagmap',
+ *  - the data of every TITLE tag.
+ * If the content-type declares a "charset=", values are converted
+ * to UTF-8 first; otherwise they are passed on as C strings.
+ *
+ * @param data input buffer
+ * @param size number of bytes in 'data'
+ * @param proc function to call for each meta data item found
+ * @param proc_cls closure for 'proc'
+ * @param options unused
+ * @return 0 to continue extracting, otherwise the non-zero value
+ *         returned by 'proc'
+ */
+int
+EXTRACTOR_html_extract (const char *data,
+                        size_t size,
+                        EXTRACTOR_MetaDataProcessor proc,
+                        void *proc_cls,
+                        const char *options)
+{
+  size_t xsize;
+  struct TagInfo *tags;
+  struct TagInfo *t;
+  struct TagInfo tag;
+  size_t pos;
+  size_t tpos;
+  int i;
+  char *charset;
+  char *tmp;
+  char *xtmp;
+  int ret;
+
+  ret = 0;
+  if (size == 0)
+    return 0;
+  /* only scan first 32k */
+  if (size > 1024 * 32)
+    xsize = 1024 * 32;
+  else
+    xsize = size;
+  tags = NULL;
+  tag.next = NULL;
+  pos = 0;
+  while (pos < xsize)
+    {
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.tagStart = &data[++pos];
+      if (!skipLetters (&pos, data, size))
+        break;
+      tag.tagEnd = &data[pos];
+      if (!skipWhitespace (&pos, data, size))
+        break;
+    STEP3:
+      if (!lookForMultiple (">\"\'", &pos, data, size))
+        break;
+      if (data[pos] != '>')
+        {
+          /* find end-quote, ignore escaped quotes (\') */
+          do
+            {
+              tpos = pos;
+              pos++;
+              if (!lookFor (data[tpos], &pos, data, size))
+                break;
+            }
+          while (data[pos - 1] == '\\');
+          pos++;
+          goto STEP3;
+        }
+      pos++;
+      if (!skipWhitespace (&pos, data, size))
+        break;
+      tag.dataStart = &data[pos];
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.dataEnd = &data[pos];
+      i = 0;
+      while (relevantTags[i] != NULL)
+        {
+          if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
+              (0 == strncasecmp (relevantTags[i],
+                                 tag.tagStart, tag.tagEnd - tag.tagStart)))
+            {
+              t = malloc (sizeof (struct TagInfo));
+              if (t == NULL)
+                {
+                  /* out of memory: release the tags collected so
+                     far instead of leaking them */
+                  while (tags != NULL)
+                    {
+                      t = tags;
+                      tags = t->next;
+                      free (t);
+                    }
+                  return 0;
+                }
+              /* prepend a copy of the tag to the list */
+              *t = tag;
+              t->next = tags;
+              tags = t;
+              break;
+            }
+          i++;
+        }
+      /* abort early if we hit the body tag */
+      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
+        break;
+    }
+  /* fast exit */
+  if (tags == NULL)
+    return 0;
+  charset = NULL;
+  /* first, try to determine mime type and/or character set */
+  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
+  if (tmp != NULL)
+    {
+      /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
+         if text/html is present, we take that as the mime-type; if charset=
+         is present, we try to use that for character set conversion. */
+      if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
+        ret = proc (proc_cls,
+                    "html",
+                    EXTRACTOR_METATYPE_MIMETYPE,
+                    EXTRACTOR_METAFORMAT_UTF8,
+                    "text/plain",
+                    "text/html",
+                    strlen ("text/html")+1);
+      charset = strcasestr (tmp, "charset=");
+      if (charset != NULL)
+        charset = strdup (&charset[strlen ("charset=")]);
+      free (tmp);
+    }
+  /* report all META tags with a known name (unless 'proc' asked us
+     to stop, in which case the values are only freed) */
+  i = 0;
+  while (tagmap[i].name != NULL)
+    {
+      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
+      if ( (tmp != NULL) &&
+           (ret == 0) )
+        {
+          if (charset == NULL)
+            {
+              ret = proc (proc_cls,
+                          "html",
+                          tagmap[i].type,
+                          EXTRACTOR_METAFORMAT_C_STRING,
+                          "text/plain",
+                          tmp,
+                          strlen (tmp) + 1);
+            }
+          else
+            {
+              xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
+                                                       strlen (tmp),
+                                                       charset);
+              if (xtmp != NULL)
+                {
+                  ret = proc (proc_cls,
+                              "html",
+                              tagmap[i].type,
+                              EXTRACTOR_METAFORMAT_UTF8,
+                              "text/plain",
+                              xtmp,
+                              strlen (xtmp) + 1);
+                  free (xtmp);
+                }
+            }
+        }
+      if (tmp != NULL)
+        free (tmp);
+      i++;
+    }
+  /* report TITLE tags and free the tag list while walking it */
+  while (tags != NULL)
+    {
+      t = tags;
+      if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
+           (ret == 0) )
+        {
+          if (charset == NULL)
+            {
+              xtmp = malloc (t->dataEnd - t->dataStart + 1);
+              if (xtmp != NULL)
+                {
+                  memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
+                  xtmp[t->dataEnd - t->dataStart] = '\0';
+                  ret = proc (proc_cls,
+                              "html",
+                              EXTRACTOR_METATYPE_TITLE,
+                              EXTRACTOR_METAFORMAT_C_STRING,
+                              "text/plain",
+                              xtmp,
+                              strlen (xtmp) + 1);
+                  free (xtmp);
+                }
+            }
+          else
+            {
+              xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
+                                                       t->dataEnd - t->dataStart,
+                                                       charset);
+              if (xtmp != NULL)
+                {
+                  ret = proc (proc_cls,
+                              "html",
+                              EXTRACTOR_METATYPE_TITLE,
+                              EXTRACTOR_METAFORMAT_UTF8,
+                              "text/plain",
+                              xtmp,
+                              strlen (xtmp) + 1);
+                  free (xtmp);
+                }
+            }
+        }
+      tags = t->next;
+      free (t);
+    }
+  if (charset != NULL)
+    free (charset);
+  return ret;
+}
+#endif
+/**
+ * Constructor for the library: open the libmagic handle and load
+ * the default magic database.
+ *
+ * On failure 'magic' is left as NULL; libmagic then reports no
+ * MIME type and the plugin does not extract anything.
+ */
+void __attribute__ ((constructor))
+html_gobject_init ()
+{
+  magic = magic_open (MAGIC_MIME_TYPE);
+  if (NULL == magic)
+    return;
+  if (0 != magic_load (magic, NULL))
+    {
+      /* without a database the handle is useless, release it */
+      magic_close (magic);
+      magic = NULL;
+    }
+}
+/**
+ * Destructor for the library: closes the libmagic handle (if one
+ * is open) and resets the global to NULL.
+ */
+void __attribute__ ((destructor))
+html_ltdl_fini ()
+{
+  if (NULL == magic)
+    return;
+  magic_close (magic);
+  magic = NULL;
+}
+/* end of html_extractor.c */

diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c new file mode 100644 index 0000000..65fb535 --- /dev/null +++ b/src/plugins/html_extractor.c
@@ -0,0 +1,694 @@
	1	/*
	2	This file is part of libextractor.
	3	(C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
	4
	5	libextractor is free software; you can redistribute it and/or modify
	6	it under the terms of the GNU General Public License as published
	7	by the Free Software Foundation; either version 2, or (at your
	8	option) any later version.
	9
	10	libextractor is distributed in the hope that it will be useful, but
	11	WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	General Public License for more details.
	14
	15	You should have received a copy of the GNU General Public License
	16	along with libextractor; see the file COPYING. If not, write to the
	17	Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	18	Boston, MA 02111-1307, USA.
	19
	20	*/
	21	/**
	22	* @file plugins/html_extractor.c
	23	* @brief plugin to support HTML files
	24	* @author Christian Grothoff
	25	*/
	26	#include "platform.h"
	27	#include "extractor.h"
	28	#include <magic.h>
	29	#include <tidy/tidy.h>
	30	#include <tidy/buffio.h>
	31
	32	/**
	33	* Mapping of HTML META names to LE types.
	34	*/
	35	static struct
	36	{
	37	/**
	38	* HTML META name.
	39	*/
	40	const char *name;
	41
	42	/**
	43	* Corresponding LE type.
	44	*/
	45	enum EXTRACTOR_MetaType type;
	46	} tagmap[] = {
	47	{ "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
	48	{ "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
	49	{ "title", EXTRACTOR_METATYPE_TITLE },
	50	{ "dc.title", EXTRACTOR_METATYPE_TITLE},
	51	{ "description", EXTRACTOR_METATYPE_DESCRIPTION },
	52	{ "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
	53	{ "subject", EXTRACTOR_METATYPE_SUBJECT},
	54	{ "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
	55	{ "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
	56	{ "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
	57	{ "publisher", EXTRACTOR_METATYPE_PUBLISHER },
	58	{ "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
	59	{ "rights", EXTRACTOR_METATYPE_RIGHTS },
	60	{ "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
	61	{ "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
	62	{ "language", EXTRACTOR_METATYPE_LANGUAGE },
	63	{ "keywords", EXTRACTOR_METATYPE_KEYWORDS },
	64	{ "abstract", EXTRACTOR_METATYPE_ABSTRACT },
	65	{ "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
	66	{ "dc.creator", EXTRACTOR_METATYPE_CREATOR},
	67	{ "dc.identifier", EXTRACTOR_METATYPE_URI },
	68	{ "dc.format", EXTRACTOR_METATYPE_FORMAT },
	69	{ NULL, EXTRACTOR_METATYPE_RESERVED }
	70	};
	71
	72
	73	/**
	74	* Global handle to MAGIC data.
	75	*/
	76	static magic_t magic;
	77
	78
	79	/**
	80	* Map 'meta' tag to LE type.
	81	*
	82	* @param tag tag to map
	83	* @return EXTRACTOR_METATYPE_RESERVED if the type was not found
	84	*/
	85	static enum EXTRACTOR_MetaType
	86	tag_to_type (const char *tag)
	87	{
	88	unsigned int i;
	89
	90	for (i=0; NULL != tagmap[i].name; i++)
	91	if (0 == strcasecmp (tag,
	92	tagmap[i].name))
	93	return tagmap[i].type;
	94	return EXTRACTOR_METATYPE_RESERVED;
	95	}
	96
	97
	98	/**
	99	* Function called by libtidy for error reporting.
	100	*
	101	* @param doc tidy doc being processed
	102	* @param lvl report level
	103	* @param line input line
	104	* @param col input column
	105	* @param mssg message
	106	* @return FALSE (no output)
	107	*/
	108	static Bool
	109	report_cb (TidyDoc doc,
	110	TidyReportLevel lvl,
	111	uint line,
	112	uint col,
	113	ctmbstr mssg)
	114	{
	115	return 0;
	116	}
	117
	118
	119	/**
	120	* Input callback: get next byte of input.
	121	*
	122	* @param sourceData our 'struct EXTRACTOR_ExtractContext'
	123	* @return next byte of input, EndOfStream on errors and EOF
	124	*/
	125	static int
	126	get_byte_cb (void *sourceData)
	127	{
	128	struct EXTRACTOR_ExtractContext *ec = sourceData;
	129	void *data;
	130
	131	if (1 !=
	132	ec->read (ec->cls,
	133	&data, 1))
	134	return EndOfStream;
	135	return *(unsigned char*) data;
	136	}
	137
	138
	139	/**
	140	* Input callback: unget last byte of input.
	141	*
	142	* @param sourceData our 'struct EXTRACTOR_ExtractContext'
	143	* @param bt byte to unget (ignored)
	144	*/
	145	static void
	146	unget_byte_cb (void *sourceData, byte bt)
	147	{
	148	struct EXTRACTOR_ExtractContext *ec = sourceData;
	149
	150	(void) ec->seek (ec->cls, -1, SEEK_CUR);
	151	}
	152
	153
	154	/**
	155	* Input callback: check for EOF.
	156	*
	157	* @param sourceData our 'struct EXTRACTOR_ExtractContext'
	158	* @return true if we are at the EOF
	159	*/
	160	static Bool
	161	eof_cb (void *sourceData)
	162	{
	163	struct EXTRACTOR_ExtractContext *ec = sourceData;
	164
	165	return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
	166	}
	167
	168
	169	/**
	170	* Main entry method for the 'text/html' extraction plugin.
	171	*
	172	* @param ec extraction context provided to the plugin
	173	*/
	174	void
	175	EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
	176	{
	177	TidyDoc doc;
	178	TidyNode head;
	179	TidyNode child;
	180	TidyNode title;
	181	TidyInputSource src;
	182	const char *name;
	183	TidyBuffer tbuf;
	184	TidyAttr attr;
	185	enum EXTRACTOR_MetaType type;
	186	ssize_t iret;
	187	void *data;
	188	const char *mime;
	189
	190	if (-1 == (iret = ec->read (ec->cls,
	191	&data,
	192	16 * 1024)))
	193	return;
	194	if (NULL == (mime = magic_buffer (magic, data, iret)))
	195	return;
	196	if (0 != strncmp (mime,
	197	"text/html",
	198	strlen ("text/html")))
	199	return; /* not HTML */
	200
	201	if (0 != ec->seek (ec->cls, 0, SEEK_SET))
	202	return; /* seek failed !? */
	203
	204	tidyInitSource (&src, ec,
	205	&get_byte_cb,
	206	&unget_byte_cb,
	207	&eof_cb);
	208	if (NULL == (doc = tidyCreate ()))
	209	return;
	210	tidySetReportFilter (doc, &report_cb);
	211	tidySetAppData (doc, ec);
	212	if (0 > tidyParseSource (doc, &src))
	213	{
	214	tidyRelease (doc);
	215	return;
	216	}
	217	if (1 != tidyStatus (doc))
	218	{
	219	tidyRelease (doc);
	220	return;
	221	}
	222	if (NULL == (head = tidyGetHead (doc)))
	223	{
	224	fprintf (stderr, "no head\n");
	225	tidyRelease (doc);
	226	return;
	227	}
	228	for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
	229	{
	230	switch (tidyNodeGetType(child))
	231	{
	232	case TidyNode_Root:
	233	break;
	234	case TidyNode_DocType:
	235	break;
	236	case TidyNode_Comment:
	237	break;
	238	case TidyNode_ProcIns:
	239	break;
	240	case TidyNode_Text:
	241	break;
	242	case TidyNode_CDATA:
	243	break;
	244	case TidyNode_Section:
	245	break;
	246	case TidyNode_Asp:
	247	break;
	248	case TidyNode_Jste:
	249	break;
	250	case TidyNode_Php:
	251	break;
	252	case TidyNode_XmlDecl:
	253	break;
	254	case TidyNode_Start:
	255	case TidyNode_StartEnd:
	256	name = tidyNodeGetName (child);
	257	if ( (0 == strcasecmp (name, "title")) &&
	258	(NULL != (title = tidyGetChild (child))) )
	259	{
	260	tidyBufInit (&tbuf);
	261	tidyNodeGetValue (doc, title, &tbuf);
	262	/* add 0-termination */
	263	tidyBufPutByte (&tbuf, 0);
	264	if (0 !=
	265	ec->proc (ec->cls,
	266	"html",
	267	EXTRACTOR_METATYPE_TITLE,
	268	EXTRACTOR_METAFORMAT_UTF8,
	269	"text/plain",
	270	(const char *) tbuf.bp,
	271	tbuf.size))
	272	{
	273	tidyBufFree (&tbuf);
	274	goto CLEANUP;
	275	}
	276	tidyBufFree (&tbuf);
	277	break;
	278	}
	279	if (0 == strcasecmp (name, "meta"))
	280	{
	281	if (NULL == (attr = tidyAttrGetById (child,
	282	TidyAttr_NAME)))
	283	break;
	284	if (EXTRACTOR_METATYPE_RESERVED ==
	285	(type = tag_to_type (tidyAttrValue (attr))))
	286	break;
	287	if (NULL == (attr = tidyAttrGetById (child,
	288	TidyAttr_CONTENT)))
	289	break;
	290	name = tidyAttrValue (attr);
	291	if (0 !=
	292	ec->proc (ec->cls,
	293	"html",
	294	type,
	295	EXTRACTOR_METAFORMAT_UTF8,
	296	"text/plain",
	297	name,
	298	strlen (name) + 1))
	299	goto CLEANUP;
	300	break;
	301	}
	302	break;
	303	case TidyNode_End:
	304	break;
	305	default:
	306	break;
	307	}
	308	}
	309	CLEANUP:
	310	tidyRelease (doc);
	311	}
	312
	313
	314
	315	#if OLD
	316
	317
	318	/* ****************** parser helper functions ************ */
	319
	320	static int
	321	tagMatch (const char *tag, const char *s, const char *e)
	322	{
	323	return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
	324	}
	325
	326	static int
	327	lookFor (char c, size_t * pos, const char *data, size_t size)
	328	{
	329	size_t p = *pos;
	330
	331	while ((p < size) && (data[p] != c))
	332	{
	333	if (data[p] == '\0')
	334	return 0;
	335	p++;
	336	}
	337	*pos = p;
	338	return p < size;
	339	}
	340
	341	static int
	342	skipWhitespace (size_t * pos, const char *data, size_t size)
	343	{
	344	size_t p = *pos;
	345
	346	while ((p < size) && (isspace ( (unsigned char) data[p])))
	347	{
	348	if (data[p] == '\0')
	349	return 0;
	350	p++;
	351	}
	352	*pos = p;
	353	return p < size;
	354	}
	355
	356	static int
	357	skipLetters (size_t * pos, const char *data, size_t size)
	358	{
	359	size_t p = *pos;
	360
	361	while ((p < size) && (isalpha ( (unsigned char) data[p])))
	362	{
	363	if (data[p] == '\0')
	364	return 0;
	365	p++;
	366	}
	367	*pos = p;
	368	return p < size;
	369	}
	370
	371	static int
	372	lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
	373	{
	374	size_t p = *pos;
	375
	376	while ((p < size) && (strchr (c, data[p]) == NULL))
	377	{
	378	if (data[p] == '\0')
	379	return 0;
	380	p++;
	381	}
	382	*pos = p;
	383	return p < size;
	384	}
	385
	386	static void
	387	findEntry (const char *key,
	388	const char *start,
	389	const char *end, const char **mstart, const char **mend)
	390	{
	391	size_t len;
	392
	393	*mstart = NULL;
	394	*mend = NULL;
	395	len = strlen (key);
	396	while (start < end - len - 1)
	397	{
	398	start++;
	399	if (start[len] != '=')
	400	continue;
	401	if (0 == strncasecmp (start, key, len))
	402	{
	403	start += len + 1;
	404	*mstart = start;
	405	if ((*start == '\"') || (*start == '\''))
	406	{
	407	start++;
	408	while ((start < end) && (*start != **mstart))
	409	start++;
	410	(*mstart)++; /* skip quote */
	411	}
	412	else
	413	{
	414	while ((start < end) && (!isspace ( (unsigned char) *start)))
	415	start++;
	416	}
	417	*mend = start;
	418	return;
	419	}
	420	}
	421	}
	422
	423	/**
	424	* Search all tags that correspond to "tagname". Example:
	425	* If the tag is <meta name="foo" desc="bar">, and
	426	* tagname == "meta", keyname="name", keyvalue="foo",
	427	* and searchname="desc", then this function returns a
	428	* copy (!) of "bar". Easy enough?
	429	*
	430	* @return NULL if nothing is found
	431	*/
	432	static char *
	433	findInTags (struct TagInfo * t,
	434	const char *tagname,
	435	const char *keyname, const char *keyvalue, const char *searchname)
	436	{
	437	const char *pstart;
	438	const char *pend;
	439
	440	while (t != NULL)
	441	{
	442	if (tagMatch (tagname, t->tagStart, t->tagEnd))
	443	{
	444	findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
	445	if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
	446	{
	447	findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
	448	if (pstart != NULL)
	449	{
	450	char *ret = malloc (pend - pstart + 1);
	451	if (ret == NULL)
	452	return NULL;
	453	memcpy (ret, pstart, pend - pstart);
	454	ret[pend - pstart] = '\0';
	455	return ret;
	456	}
	457	}
	458	}
	459	t = t->next;
	460	}
	461	return NULL;
	462	}
	463
	464
	465	/* mimetype = text/html */
	466	int
	467	EXTRACTOR_html_extract (const char *data,
	468	size_t size,
	469	EXTRACTOR_MetaDataProcessor proc,
	470	void *proc_cls,
	471	const char *options)
	472	{
	473	size_t xsize;
	474	struct TagInfo *tags;
	475	struct TagInfo *t;
	476	struct TagInfo tag;
	477	size_t pos;
	478	size_t tpos;
	479	int i;
	480	char *charset;
	481	char *tmp;
	482	char *xtmp;
	483	int ret;
	484
	485	ret = 0;
	486	if (size == 0)
	487	return 0;
	488	/* only scan first 32k */
	489	if (size > 1024 * 32)
	490	xsize = 1024 * 32;
	491	else
	492	xsize = size;
	493	tags = NULL;
	494	tag.next = NULL;
	495	pos = 0;
	496	while (pos < xsize)
	497	{
	498	if (!lookFor ('<', &pos, data, size))
	499	break;
	500	tag.tagStart = &data[++pos];
	501	if (!skipLetters (&pos, data, size))
	502	break;
	503	tag.tagEnd = &data[pos];
	504	if (!skipWhitespace (&pos, data, size))
	505	break;
	506	STEP3:
	507	if (!lookForMultiple (">\"\'", &pos, data, size))
	508	break;
	509	if (data[pos] != '>')
	510	{
	511	/* find end-quote, ignore escaped quotes (\') */
	512	do
	513	{
	514	tpos = pos;
	515	pos++;
	516	if (!lookFor (data[tpos], &pos, data, size))
	517	break;
	518	}
	519	while (data[pos - 1] == '\\');
	520	pos++;
	521	goto STEP3;
	522	}
	523	pos++;
	524	if (!skipWhitespace (&pos, data, size))
	525	break;
	526	tag.dataStart = &data[pos];
	527	if (!lookFor ('<', &pos, data, size))
	528	break;
	529	tag.dataEnd = &data[pos];
	530	i = 0;
	531	while (relevantTags[i] != NULL)
	532	{
	533	if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
	534	(0 == strncasecmp (relevantTags[i],
	535	tag.tagStart, tag.tagEnd - tag.tagStart)))
	536	{
	537	t = malloc (sizeof (struct TagInfo));
	538	if (t == NULL)
	539	return 0;
	540	*t = tag;
	541	t->next = tags;
	542	tags = t;
	543	break;
	544	}
	545	i++;
	546	}
	547	/* abort early if we hit the body tag */
	548	if (tagMatch ("body", tag.tagStart, tag.tagEnd))
	549	break;
	550	}
	551
	552	/* fast exit */
	553	if (tags == NULL)
	554	return 0;
	555
	556	charset = NULL;
	557	/* first, try to determine mime type and/or character set */
	558	tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
	559	if (tmp != NULL)
	560	{
	561	/* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
	562	if text/html is present, we take that as the mime-type; if charset=
	563	is present, we try to use that for character set conversion. */
	564	if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
	565	ret = proc (proc_cls,
	566	"html",
	567	EXTRACTOR_METATYPE_MIMETYPE,
	568	EXTRACTOR_METAFORMAT_UTF8,
	569	"text/plain",
	570	"text/html",
	571	strlen ("text/html")+1);
	572	charset = strcasestr (tmp, "charset=");
	573	if (charset != NULL)
	574	charset = strdup (&charset[strlen ("charset=")]);
	575	free (tmp);
	576	}
	577	i = 0;
	578	while (tagmap[i].name != NULL)
	579	{
	580	tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
	581	if ( (tmp != NULL) &&
	582	(ret == 0) )
	583	{
	584	if (charset == NULL)
	585	{
	586	ret = proc (proc_cls,
	587	"html",
	588	tagmap[i].type,
	589	EXTRACTOR_METAFORMAT_C_STRING,
	590	"text/plain",
	591	tmp,
	592	strlen (tmp) + 1);
	593	}
	594	else
	595	{
	596	xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
	597	strlen (tmp),
	598	charset);
	599	if (xtmp != NULL)
	600	{
	601	ret = proc (proc_cls,
	602	"html",
	603	tagmap[i].type,
	604	EXTRACTOR_METAFORMAT_UTF8,
	605	"text/plain",
	606	xtmp,
	607	strlen (xtmp) + 1);
	608	free (xtmp);
	609	}
	610	}
	611	}
	612	if (tmp != NULL)
	613	free (tmp);
	614	i++;
	615	}
	616	while (tags != NULL)
	617	{
	618	t = tags;
	619	if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
	620	(ret == 0) )
	621	{
	622	if (charset == NULL)
	623	{
	624	xtmp = malloc (t->dataEnd - t->dataStart + 1);
	625	if (xtmp != NULL)
	626	{
	627	memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
	628	xtmp[t->dataEnd - t->dataStart] = '\0';
	629	ret = proc (proc_cls,
	630	"html",
	631	EXTRACTOR_METATYPE_TITLE,
	632	EXTRACTOR_METAFORMAT_C_STRING,
	633	"text/plain",
	634	xtmp,
	635	strlen (xtmp) + 1);
	636	free (xtmp);
	637	}
	638	}
	639	else
	640	{
	641	xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
	642	t->dataEnd - t->dataStart,
	643	charset);
	644	if (xtmp != NULL)
	645	{
	646	ret = proc (proc_cls,
	647	"html",
	648	EXTRACTOR_METATYPE_TITLE,
	649	EXTRACTOR_METAFORMAT_UTF8,
	650	"text/plain",
	651	xtmp,
	652	strlen (xtmp) + 1);
	653	free (xtmp);
	654	}
	655	}
	656	}
	657	tags = t->next;
	658	free (t);
	659	}
	660	if (charset != NULL)
	661	free (charset);
	662	return ret;
	663	}
	664	#endif
	665
	666
	667	/**
	668	* Initialize glib and load magic file.
	669	*/
	670	void __attribute__ ((constructor))
	671	html_gobject_init ()
	672	{
	673	magic = magic_open (MAGIC_MIME_TYPE);
	674	if (0 != magic_load (magic, NULL))
	675	{
	676	/* FIXME: how to deal with errors? */
	677	}
	678	}
	679
	680
	681	/**
	682	* Destructor for the library, cleans up.
	683	*/
	684	void __attribute__ ((destructor))
	685	html_ltdl_fini ()
	686	{
	687	if (NULL != magic)
	688	{
	689	magic_close (magic);
	690	magic = NULL;
	691	}
	692	}
	693
	694	/* end of html_extractor.c */