1 files changed, 420 insertions, 0 deletions
diff --git a/src/plugins/old/html_extractor.c b/src/plugins/old/html_extractor.c
new file mode 100644
index 0000000..004d22a
--- /dev/null
+++ b/src/plugins/old/html_extractor.c
@@ -0,0 +1,420 @@
+/*
+     This file is part of libextractor.
+     (C) 2002, 2003, 2004, 2005, 2009 Vidyut Samanta and Christian Grothoff
+     libextractor is free software; you can redistribute it and/or modify
+     it under the terms of the GNU General Public License as published
+     by the Free Software Foundation; either version 2, or (at your
+     option) any later version.
+     libextractor is distributed in the hope that it will be useful, but
+     WITHOUT ANY WARRANTY; without even the implied warranty of
+     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+     General Public License for more details.
+     You should have received a copy of the GNU General Public License
+     along with libextractor; see the file COPYING.  If not, write to the
+     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+     Boston, MA 02111-1307, USA.
+ */
+#include "platform.h"
+#include "extractor.h"
+#include <string.h>
+#include "convert.h"
+/**
+ * Lookup table used for <meta name="..." content="..."> tags: "name" is
+ * the value of the tag's name attribute (compared case-insensitively by
+ * the extractor) and "type" is the meta type under which the matching
+ * content attribute is reported.  The table is read-only and is
+ * terminated by an entry whose name is NULL.
+ */
+static const struct
+{
+  const char *name;
+  enum EXTRACTOR_MetaType type;
+} tagmap[] = {
+  { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
+  { "title", EXTRACTOR_METATYPE_TITLE },
+  { "dc.title", EXTRACTOR_METATYPE_TITLE },
+  { "description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
+  { "subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "dc.subject", EXTRACTOR_METATYPE_SUBJECT },
+  { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
+  { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
+  { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER },
+  { "rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
+  { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
+  { "language", EXTRACTOR_METATYPE_LANGUAGE },
+  { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
+  { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
+  { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
+  { "dc.creator", EXTRACTOR_METATYPE_CREATOR },
+  { "dc.identifier", EXTRACTOR_METATYPE_URI },
+  { "dc.format", EXTRACTOR_METATYPE_FORMAT },
+  { NULL, EXTRACTOR_METATYPE_RESERVED }
+};
+/**
+ * Names of the HTML tags the extractor records while scanning
+ * (compared case-insensitively); all other tags are skipped.
+ * The list is read-only and NULL-terminated.
+ */
+static const char *const relevantTags[] = {
+  "title",
+  "meta",
+  NULL,
+};
+/**
+ * One HTML tag found by the scanner.  All pointers refer to positions
+ * inside the caller-supplied input buffer; nothing is copied.  Recorded
+ * tags are chained into a singly linked list.
+ */
+typedef struct TI
+{
+  /* next recorded tag, NULL at the end of the list */
+  struct TI *next;
+  /* first character of the tag name (the byte right after '<') */
+  const char *tagStart;
+  /* one past the last letter of the tag name */
+  const char *tagEnd;
+  /* first non-whitespace byte after the tag's closing '>' */
+  const char *dataStart;
+  /* position of the next '<' after dataStart (exclusive end of the data) */
+  const char *dataEnd;
+} TagInfo;
+/* ******************** parser helper functions ************** */
+/**
+ * Compare a NUL-terminated name against the character range [s, e),
+ * ignoring case.
+ *
+ * @param tag NUL-terminated name to look for
+ * @param s first character of the candidate range
+ * @param e one past the last character of the candidate range
+ * @return 1 if the range is exactly as long as tag and equals it
+ *         ignoring case, 0 otherwise
+ */
+static int
+tagMatch (const char *tag, const char *s, const char *e)
+{
+  const size_t want = strlen (tag);
+  /* lengths must agree before the contents are worth comparing */
+  if ((e - s) != want)
+    return 0;
+  return 0 == strncasecmp (tag, s, e - s);
+}
+/**
+ * Advance *pos to the next occurrence of character c in data.
+ *
+ * @param c character to search for
+ * @param pos in: offset to start at; out: offset of the match, or the
+ *        offset where scanning stopped at the end of the buffer
+ *        (left untouched if a NUL byte is hit first)
+ * @param data buffer to scan
+ * @param size number of valid bytes in data
+ * @return 1 if c was found before the end of the buffer, 0 if the end
+ *         was reached or a NUL byte was seen before c
+ */
+static int
+lookFor (char c, size_t * pos, const char *data, size_t size)
+{
+  size_t idx;
+  for (idx = *pos; idx < size; idx++)
+    {
+      const char cur = data[idx];
+      if (cur == c)
+        break;
+      /* a NUL byte before the match aborts the search */
+      if (cur == '\0')
+        return 0;
+    }
+  *pos = idx;
+  return (idx < size) ? 1 : 0;
+}
+/**
+ * Advance *pos past any run of whitespace (as classified by isspace).
+ *
+ * @param pos in: offset to start at; out: offset of the first
+ *        non-whitespace byte, or size if the buffer ended first
+ * @param data buffer to scan
+ * @param size number of valid bytes in data
+ * @return 1 if a non-whitespace byte was found inside the buffer,
+ *         0 if the buffer ended
+ */
+static int
+skipWhitespace (size_t * pos, const char *data, size_t size)
+{
+  size_t cursor;
+  for (cursor = *pos; cursor < size; cursor++)
+    {
+      const unsigned char ch = (unsigned char) data[cursor];
+      if (!isspace (ch))
+        break;
+      /* only reachable if isspace() classifies NUL as whitespace,
+         which it does not in the "C" locale */
+      if (ch == '\0')
+        return 0;
+    }
+  *pos = cursor;
+  return (cursor < size) ? 1 : 0;
+}
+/**
+ * Advance *pos past any run of alphabetic characters (as classified
+ * by isalpha).
+ *
+ * @param pos in: offset to start at; out: offset of the first
+ *        non-letter byte, or size if the buffer ended first
+ * @param data buffer to scan
+ * @param size number of valid bytes in data
+ * @return 1 if a non-letter byte was found inside the buffer,
+ *         0 if the buffer ended
+ */
+static int
+skipLetters (size_t * pos, const char *data, size_t size)
+{
+  size_t cursor;
+  for (cursor = *pos; cursor < size; cursor++)
+    {
+      const unsigned char ch = (unsigned char) data[cursor];
+      if (!isalpha (ch))
+        break;
+      /* only reachable if isalpha() classifies NUL as a letter,
+         which it does not in the "C" locale */
+      if (ch == '\0')
+        return 0;
+    }
+  *pos = cursor;
+  return (cursor < size) ? 1 : 0;
+}
+/**
+ * Advance *pos to the next byte of data that is one of the characters
+ * in the set c.
+ *
+ * @param c NUL-terminated set of characters to search for
+ * @param pos in: offset to start at; out: offset of the match, or the
+ *        offset where scanning stopped at the end of the buffer
+ *        (left untouched if a NUL byte is hit first)
+ * @param data buffer to scan
+ * @param size number of valid bytes in data
+ * @return 1 if a character from c was found before the end of the
+ *         buffer, 0 if the end was reached or a NUL byte was seen first
+ */
+static int
+lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
+{
+  size_t p = *pos;
+  while (p < size)
+    {
+      /* Test for NUL before strchr(): strchr (c, '\0') returns the
+         terminator of c, so a NUL byte would otherwise be reported
+         as a match instead of aborting the search. */
+      if (data[p] == '\0')
+        return 0;
+      if (strchr (c, data[p]) != NULL)
+        break;
+      p++;
+    }
+  *pos = p;
+  return p < size;
+}
+/**
+ * Locate the value of attribute "key" inside the attribute region
+ * [start, end) of a tag, i.e. look for "key=" (case-insensitive) and
+ * report the range of the value that follows.
+ *
+ * The value may be enclosed in single or double quotes (the quotes are
+ * not part of the reported range; an unterminated quote runs to end),
+ * or unquoted, in which case it ends at the first whitespace, at the
+ * tag's closing '>' or at end.  The search starts at start + 1, so
+ * start is expected to point at the separator in front of the first
+ * attribute.
+ *
+ * @param key NUL-terminated attribute name
+ * @param start beginning of the attribute region
+ * @param end end of the attribute region (exclusive)
+ * @param mstart set to the first character of the value, NULL if the
+ *        attribute was not found
+ * @param mend set to one past the last character of the value, NULL if
+ *        the attribute was not found
+ */
+static void
+findEntry (const char *key,
+           const char *start,
+           const char *end, const char **mstart, const char **mend)
+{
+  size_t len;
+  *mstart = NULL;
+  *mend = NULL;
+  if ((start == NULL) || (end == NULL) || (end <= start))
+    return;
+  len = strlen (key);
+  /* Compare the remaining length as an integer; computing
+     "end - len - 1" could form a pointer in front of the buffer
+     when the region is shorter than the key. */
+  while ((size_t) (end - start) > len + 1)
+    {
+      start++;
+      if (start[len] != '=')
+        continue;
+      if (0 != strncasecmp (start, key, len))
+        continue;
+      start += len + 1;
+      *mstart = start;
+      if ((*start == '\"') || (*start == '\''))
+        {
+          const char quote = *start;
+          start++;
+          while ((start < end) && (*start != quote))
+            start++;
+          (*mstart)++;      /* skip opening quote */
+        }
+      else
+        {
+          /* an unquoted value must not swallow the '>' closing the tag */
+          while ((start < end) &&
+                 (*start != '>') &&
+                 (!isspace ( (unsigned char) *start)))
+            start++;
+        }
+      *mend = start;
+      return;
+    }
+}
+/**
+ * Walk the tag list and return the value of one attribute of the
+ * first suitable tag.  A tag is suitable if its name equals "tagname",
+ * its attribute "keyname" has the value "keyvalue" (both compared
+ * case-insensitively) and it also carries the attribute "searchname".
+ *
+ * Example: for <meta name="foo" desc="bar"> with tagname == "meta",
+ * keyname == "name", keyvalue == "foo" and searchname == "desc" the
+ * result is a freshly allocated copy of "bar".
+ *
+ * @param t head of the tag list (may be NULL)
+ * @param tagname tag name to look for
+ * @param keyname attribute that selects the tag
+ * @param keyvalue value the selecting attribute must have
+ * @param searchname attribute whose value is returned
+ * @return NUL-terminated copy of the value that the caller must
+ *         free, or NULL if nothing was found or allocation failed
+ */
+static char *
+findInTags (TagInfo * t,
+            const char *tagname,
+            const char *keyname, const char *keyvalue, const char *searchname)
+{
+  const char *valStart;
+  const char *valEnd;
+  TagInfo *cur;
+  for (cur = t; cur != NULL; cur = cur->next)
+    {
+      size_t n;
+      char *copy;
+      if (!tagMatch (tagname, cur->tagStart, cur->tagEnd))
+        continue;
+      /* the attributes live between the tag name and the tag's data */
+      findEntry (keyname, cur->tagEnd, cur->dataStart, &valStart, &valEnd);
+      if (valStart == NULL)
+        continue;
+      if (!tagMatch (keyvalue, valStart, valEnd))
+        continue;
+      findEntry (searchname, cur->tagEnd, cur->dataStart, &valStart, &valEnd);
+      if (valStart == NULL)
+        continue;
+      n = valEnd - valStart;
+      copy = malloc (n + 1);
+      if (copy == NULL)
+        return NULL;
+      memcpy (copy, valStart, n);
+      copy[n] = '\0';
+      return copy;
+    }
+  return NULL;
+}
+/**
+ * Extract meta data from an HTML document (mimetype = text/html).
+ *
+ * The function scans the document for <title> and <meta> tags; new
+ * tags are only started within the first 32k of the input and the scan
+ * stops at the <body> tag.  It then reports, in this order:
+ *  - the mime type "text/html", if a
+ *    <meta http-equiv="content-type" content="text/html..."> is present;
+ *  - for every entry of tagmap, the content attribute of the matching
+ *    <meta name="..."> tag;
+ *  - the text following each <title> tag.
+ * If the content-type declaration names a charset, values are
+ * converted to UTF-8 with EXTRACTOR_common_convert_to_utf8 before they
+ * are reported (values whose conversion fails are skipped); otherwise
+ * they are passed on unchanged as C strings.
+ *
+ * @param data the document
+ * @param size number of bytes in data
+ * @param proc callback invoked for each meta data item; once it returns
+ *        non-zero no further items are reported
+ * @param proc_cls closure argument handed to proc
+ * @param options unused
+ * @return the last value returned by proc, 0 if proc was never called
+ *         or if memory ran out while collecting tags
+ */
+int
+EXTRACTOR_html_extract (const char *data,
+                        size_t size,
+                        EXTRACTOR_MetaDataProcessor proc,
+                        void *proc_cls,
+                        const char *options)
+{
+  size_t xsize;
+  TagInfo *tags;
+  TagInfo *t;
+  TagInfo tag;
+  size_t pos;
+  size_t tpos;
+  int i;
+  char *charset;
+  char *tmp;
+  char *xtmp;
+  int ret;
+  (void) options;
+  ret = 0;
+  charset = NULL;
+  if (size == 0)
+    return 0;
+  /* only scan first 32k */
+  if (size > 1024 * 32)
+    xsize = 1024 * 32;
+  else
+    xsize = size;
+  tags = NULL;
+  tag.next = NULL;
+  pos = 0;
+  while (pos < xsize)
+    {
+      /* step 1: tag name, "<" followed by letters */
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.tagStart = &data[++pos];
+      if (!skipLetters (&pos, data, size))
+        break;
+      tag.tagEnd = &data[pos];
+      if (!skipWhitespace (&pos, data, size))
+        break;
+      /* step 2: find the closing '>' while stepping over quoted
+         attribute values, which may themselves contain '>' */
+    STEP3:
+      if (!lookForMultiple (">\"\'", &pos, data, size))
+        break;
+      if (data[pos] != '>')
+        {
+          /* find end-quote, ignore escaped quotes (\') */
+          do
+            {
+              tpos = pos;
+              pos++;
+              if (!lookFor (data[tpos], &pos, data, size))
+                break;
+            }
+          while (data[pos - 1] == '\\');
+          pos++;
+          goto STEP3;
+        }
+      pos++;
+      /* step 3: the data following the tag, up to the next '<' */
+      if (!skipWhitespace (&pos, data, size))
+        break;
+      tag.dataStart = &data[pos];
+      if (!lookFor ('<', &pos, data, size))
+        break;
+      tag.dataEnd = &data[pos];
+      /* remember the tag if it is one we care about; the list is
+         built by prepending, so it ends up in reverse document order */
+      for (i = 0; relevantTags[i] != NULL; i++)
+        {
+          if (!tagMatch (relevantTags[i], tag.tagStart, tag.tagEnd))
+            continue;
+          t = malloc (sizeof (TagInfo));
+          if (t == NULL)
+            goto CLEANUP;       /* do not leak the tags collected so far */
+          *t = tag;
+          t->next = tags;
+          tags = t;
+          break;
+        }
+      /* abort early if we hit the body tag */
+      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
+        break;
+    }
+  /* fast exit */
+  if (tags == NULL)
+    return 0;
+  /* first, try to determine mime type and/or character set */
+  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
+  if (tmp != NULL)
+    {
+      /* ideally, tmp == "text/html; charset=ISO-XXXX-Y" or something like that;
+         if text/html is present, we take that as the mime-type; if charset=
+         is present, we try to use that for character set conversion. */
+      if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
+        ret = proc (proc_cls,
+                    "html",
+                    EXTRACTOR_METATYPE_MIMETYPE,
+                    EXTRACTOR_METAFORMAT_UTF8,
+                    "text/plain",
+                    "text/html",
+                    strlen ("text/html")+1);
+      charset = strcasestr (tmp, "charset=");
+      if (charset != NULL)
+        charset = strdup (&charset[strlen ("charset=")]);
+      free (tmp);
+    }
+  /* second, report all <meta name="..."> entries known to tagmap */
+  for (i = 0; tagmap[i].name != NULL; i++)
+    {
+      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
+      if ( (tmp != NULL) &&
+           (ret == 0) )
+        {
+          if (charset == NULL)
+            {
+              ret = proc (proc_cls,
+                          "html",
+                          tagmap[i].type,
+                          EXTRACTOR_METAFORMAT_C_STRING,
+                          "text/plain",
+                          tmp,
+                          strlen (tmp) + 1);
+            }
+          else
+            {
+              xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
+                                                       strlen (tmp),
+                                                       charset);
+              if (xtmp != NULL)
+                {
+                  ret = proc (proc_cls,
+                              "html",
+                              tagmap[i].type,
+                              EXTRACTOR_METAFORMAT_UTF8,
+                              "text/plain",
+                              xtmp,
+                              strlen (xtmp) + 1);
+                  free (xtmp);
+                }
+            }
+        }
+      free (tmp);
+    }
+  /* third, report the text of every <title> tag */
+  for (t = tags; (t != NULL) && (ret == 0); t = t->next)
+    {
+      if (!tagMatch ("title", t->tagStart, t->tagEnd))
+        continue;
+      if (charset == NULL)
+        {
+          xtmp = malloc (t->dataEnd - t->dataStart + 1);
+          if (xtmp != NULL)
+            {
+              memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
+              xtmp[t->dataEnd - t->dataStart] = '\0';
+              ret = proc (proc_cls,
+                          "html",
+                          EXTRACTOR_METATYPE_TITLE,
+                          EXTRACTOR_METAFORMAT_C_STRING,
+                          "text/plain",
+                          xtmp,
+                          strlen (xtmp) + 1);
+              free (xtmp);
+            }
+        }
+      else
+        {
+          xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
+                                                   t->dataEnd - t->dataStart,
+                                                   charset);
+          if (xtmp != NULL)
+            {
+              ret = proc (proc_cls,
+                          "html",
+                          EXTRACTOR_METATYPE_TITLE,
+                          EXTRACTOR_METAFORMAT_UTF8,
+                          "text/plain",
+                          xtmp,
+                          strlen (xtmp) + 1);
+              free (xtmp);
+            }
+        }
+    }
+  /* common exit: release the tag list and the charset name */
+ CLEANUP:
+  while (tags != NULL)
+    {
+      t = tags;
+      tags = t->next;
+      free (t);
+    }
+  free (charset);
+  return ret;
+}

diff --git a/src/plugins/old/html_extractor.c b/src/plugins/old/html_extractor.c new file mode 100644 index 0000000..004d22a --- /dev/null +++ b/src/plugins/old/html_extractor.c
@@ -0,0 +1,420 @@
	1	/*
	2	This file is part of libextractor.
	3	(C) 2002, 2003, 2004, 2005, 2009 Vidyut Samanta and Christian Grothoff
	4
	5	libextractor is free software; you can redistribute it and/or modify
	6	it under the terms of the GNU General Public License as published
	7	by the Free Software Foundation; either version 2, or (at your
	8	option) any later version.
	9
	10	libextractor is distributed in the hope that it will be useful, but
	11	WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	General Public License for more details.
	14
	15	You should have received a copy of the GNU General Public License
	16	along with libextractor; see the file COPYING. If not, write to the
	17	Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	18	Boston, MA 02111-1307, USA.
	19
	20	*/
	21
	22	#include "platform.h"
	23	#include "extractor.h"
	24	#include <string.h>
	25	#include "convert.h"
	26
	27	static struct
	28	{
	29	const char *name;
	30	enum EXTRACTOR_MetaType type;
	31	} tagmap[] = {
	32	{ "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
	33	{ "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
	34	{ "title", EXTRACTOR_METATYPE_TITLE },
	35	{ "dc.title", EXTRACTOR_METATYPE_TITLE},
	36	{ "description", EXTRACTOR_METATYPE_DESCRIPTION },
	37	{ "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
	38	{ "subject", EXTRACTOR_METATYPE_SUBJECT},
	39	{ "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
	40	{ "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
	41	{ "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
	42	{ "publisher", EXTRACTOR_METATYPE_PUBLISHER },
	43	{ "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
	44	{ "rights", EXTRACTOR_METATYPE_RIGHTS },
	45	{ "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
	46	{ "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
	47	{ "language", EXTRACTOR_METATYPE_LANGUAGE },
	48	{ "keywords", EXTRACTOR_METATYPE_KEYWORDS },
	49	{ "abstract", EXTRACTOR_METATYPE_ABSTRACT },
	50	{ "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
	51	{ "dc.creator", EXTRACTOR_METATYPE_CREATOR},
	52	{ "dc.identifier", EXTRACTOR_METATYPE_URI },
	53	{ "dc.format", EXTRACTOR_METATYPE_FORMAT },
	54	{ NULL, EXTRACTOR_METATYPE_RESERVED }
	55	};
	56
	57	static const char *relevantTags[] = {
	58	"title",
	59	"meta",
	60	NULL,
	61	};
	62
	63	typedef struct TI
	64	{
	65	struct TI *next;
	66	const char *tagStart;
	67	const char *tagEnd;
	68	const char *dataStart;
	69	const char *dataEnd;
	70	} TagInfo;
	71
	72
	73
	74
	75	/* ****************** parser helper functions ************ */
	76
	77	static int
	78	tagMatch (const char *tag, const char *s, const char *e)
	79	{
	80	return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
	81	}
	82
	83	static int
	84	lookFor (char c, size_t * pos, const char *data, size_t size)
	85	{
	86	size_t p = *pos;
	87
	88	while ((p < size) && (data[p] != c))
	89	{
	90	if (data[p] == '\0')
	91	return 0;
	92	p++;
	93	}
	94	*pos = p;
	95	return p < size;
	96	}
	97
	98	static int
	99	skipWhitespace (size_t * pos, const char *data, size_t size)
	100	{
	101	size_t p = *pos;
	102
	103	while ((p < size) && (isspace ( (unsigned char) data[p])))
	104	{
	105	if (data[p] == '\0')
	106	return 0;
	107	p++;
	108	}
	109	*pos = p;
	110	return p < size;
	111	}
	112
	113	static int
	114	skipLetters (size_t * pos, const char *data, size_t size)
	115	{
	116	size_t p = *pos;
	117
	118	while ((p < size) && (isalpha ( (unsigned char) data[p])))
	119	{
	120	if (data[p] == '\0')
	121	return 0;
	122	p++;
	123	}
	124	*pos = p;
	125	return p < size;
	126	}
	127
	128	static int
	129	lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
	130	{
	131	size_t p = *pos;
	132
	133	while ((p < size) && (strchr (c, data[p]) == NULL))
	134	{
	135	if (data[p] == '\0')
	136	return 0;
	137	p++;
	138	}
	139	*pos = p;
	140	return p < size;
	141	}
	142
	143	static void
	144	findEntry (const char *key,
	145	const char *start,
	146	const char *end, const char **mstart, const char **mend)
	147	{
	148	size_t len;
	149
	150	*mstart = NULL;
	151	*mend = NULL;
	152	len = strlen (key);
	153	while (start < end - len - 1)
	154	{
	155	start++;
	156	if (start[len] != '=')
	157	continue;
	158	if (0 == strncasecmp (start, key, len))
	159	{
	160	start += len + 1;
	161	*mstart = start;
	162	if ((*start == '\"') || (*start == '\''))
	163	{
	164	start++;
	165	while ((start < end) && (*start != **mstart))
	166	start++;
	167	(*mstart)++; /* skip quote */
	168	}
	169	else
	170	{
	171	while ((start < end) && (!isspace ( (unsigned char) *start)))
	172	start++;
	173	}
	174	*mend = start;
	175	return;
	176	}
	177	}
	178	}
	179
	180	/**
	181	* Search all tags that correspond to "tagname". Example:
	182	* If the tag is <meta name="foo" desc="bar">, and
	183	* tagname == "meta", keyname="name", keyvalue="foo",
	184	* and searchname="desc", then this function returns a
	185	* copy (!) of "bar". Easy enough?
	186	*
	187	* @return NULL if nothing is found
	188	*/
	189	static char *
	190	findInTags (TagInfo * t,
	191	const char *tagname,
	192	const char *keyname, const char *keyvalue, const char *searchname)
	193	{
	194	const char *pstart;
	195	const char *pend;
	196
	197	while (t != NULL)
	198	{
	199	if (tagMatch (tagname, t->tagStart, t->tagEnd))
	200	{
	201	findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
	202	if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
	203	{
	204	findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
	205	if (pstart != NULL)
	206	{
	207	char *ret = malloc (pend - pstart + 1);
	208	if (ret == NULL)
	209	return NULL;
	210	memcpy (ret, pstart, pend - pstart);
	211	ret[pend - pstart] = '\0';
	212	return ret;
	213	}
	214	}
	215	}
	216	t = t->next;
	217	}
	218	return NULL;
	219	}
	220
	221
	222	/* mimetype = text/html */
	223	int
	224	EXTRACTOR_html_extract (const char *data,
	225	size_t size,
	226	EXTRACTOR_MetaDataProcessor proc,
	227	void *proc_cls,
	228	const char *options)
	229	{
	230	size_t xsize;
	231	TagInfo *tags;
	232	TagInfo *t;
	233	TagInfo tag;
	234	size_t pos;
	235	size_t tpos;
	236	int i;
	237	char *charset;
	238	char *tmp;
	239	char *xtmp;
	240	int ret;
	241
	242	ret = 0;
	243	if (size == 0)
	244	return 0;
	245	/* only scan first 32k */
	246	if (size > 1024 * 32)
	247	xsize = 1024 * 32;
	248	else
	249	xsize = size;
	250	tags = NULL;
	251	tag.next = NULL;
	252	pos = 0;
	253	while (pos < xsize)
	254	{
	255	if (!lookFor ('<', &pos, data, size))
	256	break;
	257	tag.tagStart = &data[++pos];
	258	if (!skipLetters (&pos, data, size))
	259	break;
	260	tag.tagEnd = &data[pos];
	261	if (!skipWhitespace (&pos, data, size))
	262	break;
	263	STEP3:
	264	if (!lookForMultiple (">\"\'", &pos, data, size))
	265	break;
	266	if (data[pos] != '>')
	267	{
	268	/* find end-quote, ignore escaped quotes (\') */
	269	do
	270	{
	271	tpos = pos;
	272	pos++;
	273	if (!lookFor (data[tpos], &pos, data, size))
	274	break;
	275	}
	276	while (data[pos - 1] == '\\');
	277	pos++;
	278	goto STEP3;
	279	}
	280	pos++;
	281	if (!skipWhitespace (&pos, data, size))
	282	break;
	283	tag.dataStart = &data[pos];
	284	if (!lookFor ('<', &pos, data, size))
	285	break;
	286	tag.dataEnd = &data[pos];
	287	i = 0;
	288	while (relevantTags[i] != NULL)
	289	{
	290	if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
	291	(0 == strncasecmp (relevantTags[i],
	292	tag.tagStart, tag.tagEnd - tag.tagStart)))
	293	{
	294	t = malloc (sizeof (TagInfo));
	295	if (t == NULL)
	296	return 0;
	297	*t = tag;
	298	t->next = tags;
	299	tags = t;
	300	break;
	301	}
	302	i++;
	303	}
	304	/* abort early if we hit the body tag */
	305	if (tagMatch ("body", tag.tagStart, tag.tagEnd))
	306	break;
	307	}
	308
	309	/* fast exit */
	310	if (tags == NULL)
	311	return 0;
	312
	313	charset = NULL;
	314	/* first, try to determine mime type and/or character set */
	315	tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
	316	if (tmp != NULL)
	317	{
	318	/* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
	319	if text/html is present, we take that as the mime-type; if charset=
	320	is present, we try to use that for character set conversion. */
	321	if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
	322	ret = proc (proc_cls,
	323	"html",
	324	EXTRACTOR_METATYPE_MIMETYPE,
	325	EXTRACTOR_METAFORMAT_UTF8,
	326	"text/plain",
	327	"text/html",
	328	strlen ("text/html")+1);
	329	charset = strcasestr (tmp, "charset=");
	330	if (charset != NULL)
	331	charset = strdup (&charset[strlen ("charset=")]);
	332	free (tmp);
	333	}
	334	i = 0;
	335	while (tagmap[i].name != NULL)
	336	{
	337	tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
	338	if ( (tmp != NULL) &&
	339	(ret == 0) )
	340	{
	341	if (charset == NULL)
	342	{
	343	ret = proc (proc_cls,
	344	"html",
	345	tagmap[i].type,
	346	EXTRACTOR_METAFORMAT_C_STRING,
	347	"text/plain",
	348	tmp,
	349	strlen (tmp) + 1);
	350	}
	351	else
	352	{
	353	xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
	354	strlen (tmp),
	355	charset);
	356	if (xtmp != NULL)
	357	{
	358	ret = proc (proc_cls,
	359	"html",
	360	tagmap[i].type,
	361	EXTRACTOR_METAFORMAT_UTF8,
	362	"text/plain",
	363	xtmp,
	364	strlen (xtmp) + 1);
	365	free (xtmp);
	366	}
	367	}
	368	}
	369	if (tmp != NULL)
	370	free (tmp);
	371	i++;
	372	}
	373	while (tags != NULL)
	374	{
	375	t = tags;
	376	if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
	377	(ret == 0) )
	378	{
	379	if (charset == NULL)
	380	{
	381	xtmp = malloc (t->dataEnd - t->dataStart + 1);
	382	if (xtmp != NULL)
	383	{
	384	memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
	385	xtmp[t->dataEnd - t->dataStart] = '\0';
	386	ret = proc (proc_cls,
	387	"html",
	388	EXTRACTOR_METATYPE_TITLE,
	389	EXTRACTOR_METAFORMAT_C_STRING,
	390	"text/plain",
	391	xtmp,
	392	strlen (xtmp) + 1);
	393	free (xtmp);
	394	}
	395	}
	396	else
	397	{
	398	xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
	399	t->dataEnd - t->dataStart,
	400	charset);
	401	if (xtmp != NULL)
	402	{
	403	ret = proc (proc_cls,
	404	"html",
	405	EXTRACTOR_METATYPE_TITLE,
	406	EXTRACTOR_METAFORMAT_UTF8,
	407	"text/plain",
	408	xtmp,
	409	strlen (xtmp) + 1);
	410	free (xtmp);
	411	}
	412	}
	413	}
	414	tags = t->next;
	415	free (t);
	416	}
	417	if (charset != NULL)
	418	free (charset);
	419	return ret;
	420	}