1 files changed, 330 insertions, 322 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c
index 8cd4aba..5ebf97b 100644
--- a/src/plugins/html_extractor.c
+++ b/src/plugins/html_extractor.c
@@ -87,9 +87,9 @@ tag_to_type (const char *tag)
 {
  unsigned int i;
-  for (i=0; NULL != tagmap[i].name; i++)
+  for (i = 0; NULL != tagmap[i].name; i++)
    if (0 == strcasecmp (tag,
-                         tagmap[i].name))
+                         tagmap[i].name))
      return tagmap[i].type;
  return EXTRACTOR_METATYPE_RESERVED;
 }
@@ -107,10 +107,10 @@ tag_to_type (const char *tag)
 */
 static Bool TIDY_CALL
 report_cb (TidyDoc doc,
-           TidyReportLevel lvl,
+           TidyReportLevel lvl,
-           uint line,
+           uint line,
-           uint col,
+           uint col,
-           ctmbstr mssg)
+           ctmbstr mssg)
 {
  return 0;
 }
@@ -130,7 +130,7 @@ get_byte_cb (void *sourceData)
  if (1 !=
      ec->read (ec->cls,
-                &data, 1))
+                &data, 1))
    return EndOfStream;
  return *(unsigned char*) data;
 }
@@ -188,130 +188,129 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
  const char *mime;
  if (-1 == (iret = ec->read (ec->cls,
-                              &data,
+                              &data,
-                              16 * 1024)))
+                              16 * 1024)))
    return;
  if (NULL == (mime = magic_buffer (magic, data, iret)))
    return;
  if (0 != strncmp (mime,
-                    "text/html",
+                    "text/html",
-                    strlen ("text/html")))
+                    strlen ("text/html")))
    return; /* not HTML */
  if (0 != ec->seek (ec->cls, 0, SEEK_SET))
    return; /* seek failed !? */
  tidyInitSource (&src, ec,
-                  &get_byte_cb,
+                  &get_byte_cb,
-                  &unget_byte_cb,
+                  &unget_byte_cb,
-                  &eof_cb);
+                  &eof_cb);
  if (NULL == (doc = tidyCreate ()))
    return;
  tidySetReportFilter (doc, &report_cb);
  tidySetAppData (doc, ec);
  if (0 > tidyParseSource (doc, &src))
-    {
+  {
-      tidyRelease (doc);
+    tidyRelease (doc);
-      return;
+    return;
-    }
+  }
  if (1 != tidyStatus (doc))
-    {
+  {
-      tidyRelease (doc);
+    tidyRelease (doc);
-      return;
+    return;
-    }
+  }
  if (NULL == (head = tidyGetHead (doc)))
-    {
+  {
-      fprintf (stderr, "no head\n");
+    fprintf (stderr, "no head\n");
-      tidyRelease (doc);
+    tidyRelease (doc);
-      return;
+    return;
-    }
+  }
  for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
+  {
+    switch (tidyNodeGetType (child))
    {
-      switch (tidyNodeGetType(child))
+    case TidyNode_Root:
-        {
+      break;
-        case TidyNode_Root:
+    case TidyNode_DocType:
-          break;
+      break;
-        case TidyNode_DocType:
+    case TidyNode_Comment:
-          break;
+      break;
-        case TidyNode_Comment:
+    case TidyNode_ProcIns:
-          break;
+      break;
-        case TidyNode_ProcIns:
+    case TidyNode_Text:
-          break;
+      break;
-        case TidyNode_Text:
+    case TidyNode_CDATA:
-          break;
+      break;
-        case TidyNode_CDATA:
+    case TidyNode_Section:
-          break;
+      break;
-        case TidyNode_Section:
+    case TidyNode_Asp:
-          break;
+      break;
-        case TidyNode_Asp:
+    case TidyNode_Jste:
-          break;
+      break;
-        case TidyNode_Jste:
+    case TidyNode_Php:
-          break;
+      break;
-        case TidyNode_Php:
+    case TidyNode_XmlDecl:
-          break;
+      break;
-        case TidyNode_XmlDecl:
+    case TidyNode_Start:
-          break;
+    case TidyNode_StartEnd:
-        case TidyNode_Start:
+      name = tidyNodeGetName (child);
-        case TidyNode_StartEnd:
+      if ( (0 == strcasecmp (name, "title")) &&
-          name = tidyNodeGetName (child);
+           (NULL != (title = tidyGetChild (child))) )
-          if ( (0 == strcasecmp (name, "title")) &&
+      {
-               (NULL != (title = tidyGetChild (child))) )
+        tidyBufInit (&tbuf);
-            {
+        tidyNodeGetValue (doc, title, &tbuf);
-              tidyBufInit (&tbuf);
+        /* add 0-termination */
-              tidyNodeGetValue (doc, title, &tbuf);
+        tidyBufPutByte (&tbuf, 0);
-              /* add 0-termination */
+        if (0 !=
-              tidyBufPutByte (&tbuf, 0);
+            ec->proc (ec->cls,
-              if (0 !=
+                      "html",
-                  ec->proc (ec->cls,
+                      EXTRACTOR_METATYPE_TITLE,
-                            "html",
+                      EXTRACTOR_METAFORMAT_UTF8,
-                            EXTRACTOR_METATYPE_TITLE,
+                      "text/plain",
-                            EXTRACTOR_METAFORMAT_UTF8,
+                      (const char *) tbuf.bp,
-                            "text/plain",
+                      tbuf.size))
-                            (const char *) tbuf.bp,
+        {
-                            tbuf.size))
+          tidyBufFree (&tbuf);
-                {
+          goto CLEANUP;
-                  tidyBufFree (&tbuf);
+        }
-                  goto CLEANUP;
+        tidyBufFree (&tbuf);
-                }
+        break;
-              tidyBufFree (&tbuf);
+      }
-              break;
+      if (0 == strcasecmp (name, "meta"))
-            }
+      {
-          if (0 == strcasecmp (name, "meta"))
+        if (NULL == (attr = tidyAttrGetById (child,
-            {
+                                             TidyAttr_NAME)))
-              if (NULL == (attr = tidyAttrGetById (child,
+          break;
-                                                   TidyAttr_NAME)))
+        if (EXTRACTOR_METATYPE_RESERVED ==
-                break;
+            (type = tag_to_type (tidyAttrValue (attr))))
-              if (EXTRACTOR_METATYPE_RESERVED ==
+          break;
-                  (type = tag_to_type (tidyAttrValue (attr))))
+        if (NULL == (attr = tidyAttrGetById (child,
-                break;
+                                             TidyAttr_CONTENT)))
-              if (NULL == (attr = tidyAttrGetById (child,
+          break;
-                                                   TidyAttr_CONTENT)))
+        name = tidyAttrValue (attr);
-                break;
+        if (0 !=
-              name = tidyAttrValue (attr);
+            ec->proc (ec->cls,
-              if (0 !=
+                      "html",
-                  ec->proc (ec->cls,
+                      type,
-                            "html",
+                      EXTRACTOR_METAFORMAT_UTF8,
-                            type,
+                      "text/plain",
-                            EXTRACTOR_METAFORMAT_UTF8,
+                      name,
-                            "text/plain",
+                      strlen (name) + 1))
-                            name,
+          goto CLEANUP;
-                            strlen (name) + 1))
+        break;
-                goto CLEANUP;
+      }
-              break;
+      break;
-            }
+    case TidyNode_End:
-          break;
+      break;
-        case TidyNode_End:
+    default:
-          break;
+      break;
-        default:
-          break;
-        }
    }
- CLEANUP:
+  }
+CLEANUP:
  tidyRelease (doc);
 }
 #if OLD
@@ -323,66 +322,71 @@ tagMatch (const char *tag, const char *s, const char *e)
  return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
 }
 static int
-lookFor (char c, size_t * pos, const char *data, size_t size)
+lookFor (char c, size_t *pos, const char *data, size_t size)
 {
  size_t p = *pos;
  while ((p < size) && (data[p] != c))
-    {
+  {
-      if (data[p] == '\0')
+    if (data[p] == '\0')
-        return 0;
+      return 0;
-      p++;
+    p++;
-    }
+  }
  *pos = p;
  return p < size;
 }
 static int
-skipWhitespace (size_t * pos, const char *data, size_t size)
+skipWhitespace (size_t *pos, const char *data, size_t size)
 {
  size_t p = *pos;
  while ((p < size) && (isspace ( (unsigned char) data[p])))
-    {
+  {
-      if (data[p] == '\0')
+    if (data[p] == '\0')
-        return 0;
+      return 0;
-      p++;
+    p++;
-    }
+  }
  *pos = p;
  return p < size;
 }
 static int
-skipLetters (size_t * pos, const char *data, size_t size)
+skipLetters (size_t *pos, const char *data, size_t size)
 {
  size_t p = *pos;
  while ((p < size) && (isalpha ( (unsigned char) data[p])))
-    {
+  {
-      if (data[p] == '\0')
+    if (data[p] == '\0')
-        return 0;
+      return 0;
-      p++;
+    p++;
-    }
+  }
  *pos = p;
  return p < size;
 }
 static int
-lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
+lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
 {
  size_t p = *pos;
  while ((p < size) && (strchr (c, data[p]) == NULL))
-    {
+  {
-      if (data[p] == '\0')
+    if (data[p] == '\0')
-        return 0;
+      return 0;
-      p++;
+    p++;
-    }
+  }
  *pos = p;
  return p < size;
 }
 static void
 findEntry (const char *key,
           const char *start,
@@ -394,32 +398,33 @@ findEntry (const char *key,
  *mend = NULL;
  len = strlen (key);
  while (start < end - len - 1)
+  {
+    start++;
+    if (start[len] != '=')
+      continue;
+    if (0 == strncasecmp (start, key, len))
    {
-      start++;
+      start += len + 1;
-      if (start[len] != '=')
+      *mstart = start;
-        continue;
+      if ((*start == '\"') || (*start == '\''))
-      if (0 == strncasecmp (start, key, len))
+      {
-        {
+        start++;
-          start += len + 1;
+        while ((start < end) && (*start != **mstart))
-          *mstart = start;
+          start++;
-          if ((*start == '\"') || (*start == '\''))
+        (*mstart)++;            /* skip quote */
-            {
+      }
-              start++;
+      else
-              while ((start < end) && (*start != **mstart))
+      {
-                start++;
+        while ((start < end) && (! isspace ( (unsigned char) *start)))
-              (*mstart)++;      /* skip quote */
+          start++;
-            }
+      }
-          else
+      *mend = start;
-            {
+      return;
-              while ((start < end) && (!isspace ( (unsigned char) *start)))
-                start++;
-            }
-          *mend = start;
-          return;
-        }
    }
+  }
 }
 /**
 * Search all tags that correspond to "tagname".  Example:
 * If the tag is <meta name="foo" desc="bar">, and
@@ -430,7 +435,7 @@ findEntry (const char *key,
 * @return NULL if nothing is found
 */
 static char *
-findInTags (struct TagInfo * t,
+findInTags (struct TagInfo *t,
            const char *tagname,
            const char *keyname, const char *keyvalue, const char *searchname)
 {
@@ -438,26 +443,26 @@ findInTags (struct TagInfo * t,
  const char *pend;
  while (t != NULL)
+  {
+    if (tagMatch (tagname, t->tagStart, t->tagEnd))
    {
-      if (tagMatch (tagname, t->tagStart, t->tagEnd))
+      findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
+      if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
+      {
+        findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
+        if (pstart != NULL)
        {
-          findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
+          char *ret = malloc (pend - pstart + 1);
-          if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
+          if (ret == NULL)
-            {
+            return NULL;
-              findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
+          memcpy (ret, pstart, pend - pstart);
-              if (pstart != NULL)
+          ret[pend - pstart] = '\0';
-                {
+          return ret;
-                  char *ret = malloc (pend - pstart + 1);
-                  if (ret == NULL)
-                    return NULL;
-                  memcpy (ret, pstart, pend - pstart);
-                  ret[pend - pstart] = '\0';
-                  return ret;
-                }
-            }
        }
-      t = t->next;
+      }
    }
+    t = t->next;
+  }
  return NULL;
 }
@@ -465,10 +470,10 @@ findInTags (struct TagInfo * t,
 /* mimetype = text/html */
 int
 EXTRACTOR_html_extract (const char *data,
-                        size_t size,
+                        size_t size,
-                        EXTRACTOR_MetaDataProcessor proc,
+                        EXTRACTOR_MetaDataProcessor proc,
-                        void *proc_cls,
+                        void *proc_cls,
-                        const char *options)
+                        const char *options)
 {
  size_t xsize;
  struct TagInfo *tags;
@@ -494,60 +499,60 @@ EXTRACTOR_html_extract (const char *data,
  tag.next = NULL;
  pos = 0;
  while (pos < xsize)
+  {
+    if (! lookFor ('<', &pos, data, size))
+      break;
+    tag.tagStart = &data[++pos];
+    if (! skipLetters (&pos, data, size))
+      break;
+    tag.tagEnd = &data[pos];
+    if (! skipWhitespace (&pos, data, size))
+      break;
+STEP3:
+    if (! lookForMultiple (">\"\'", &pos, data, size))
+      break;
+    if (data[pos] != '>')
    {
-      if (!lookFor ('<', &pos, data, size))
+      /* find end-quote, ignore escaped quotes (\') */
-        break;
+      do
-      tag.tagStart = &data[++pos];
+      {
-      if (!skipLetters (&pos, data, size))
+        tpos = pos;
-        break;
+        pos++;
-      tag.tagEnd = &data[pos];
+        if (! lookFor (data[tpos], &pos, data, size))
-      if (!skipWhitespace (&pos, data, size))
+          break;
-        break;
+      }
-    STEP3:
+      while (data[pos - 1] == '\\');
-      if (!lookForMultiple (">\"\'", &pos, data, size))
-        break;
-      if (data[pos] != '>')
-        {
-          /* find end-quote, ignore escaped quotes (\') */
-          do
-            {
-              tpos = pos;
-              pos++;
-              if (!lookFor (data[tpos], &pos, data, size))
-                break;
-            }
-          while (data[pos - 1] == '\\');
-          pos++;
-          goto STEP3;
-        }
      pos++;
-      if (!skipWhitespace (&pos, data, size))
+      goto STEP3;
-        break;
+    }
-      tag.dataStart = &data[pos];
+    pos++;
-      if (!lookFor ('<', &pos, data, size))
+    if (! skipWhitespace (&pos, data, size))
-        break;
+      break;
-      tag.dataEnd = &data[pos];
+    tag.dataStart = &data[pos];
-      i = 0;
+    if (! lookFor ('<', &pos, data, size))
-      while (relevantTags[i] != NULL)
+      break;
-        {
+    tag.dataEnd = &data[pos];
-          if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
+    i = 0;
-              (0 == strncasecmp (relevantTags[i],
+    while (relevantTags[i] != NULL)
-                                 tag.tagStart, tag.tagEnd - tag.tagStart)))
+    {
-            {
+      if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
-              t = malloc (sizeof (struct TagInfo));
+          (0 == strncasecmp (relevantTags[i],
-              if (t == NULL)
+                             tag.tagStart, tag.tagEnd - tag.tagStart)))
-                return 0;
+      {
-              *t = tag;
+        t = malloc (sizeof (struct TagInfo));
-              t->next = tags;
+        if (t == NULL)
-              tags = t;
+          return 0;
-              break;
+        *t = tag;
-            }
+        t->next = tags;
-          i++;
+        tags = t;
-        }
-      /* abort early if we hit the body tag */
-      if (tagMatch ("body", tag.tagStart, tag.tagEnd))
        break;
+      }
+      i++;
    }
+    /* abort early if we hit the body tag */
+    if (tagMatch ("body", tag.tagStart, tag.tagEnd))
+      break;
+  }
  /* fast exit */
  if (tags == NULL)
@@ -557,110 +562,112 @@ EXTRACTOR_html_extract (const char *data,
  /* first, try to determine mime type and/or character set */
  tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
  if (tmp != NULL)
-    {
+  {
-      /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
+    /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
-         if text/html is present, we take that as the mime-type; if charset=
+       if text/html is present, we take that as the mime-type; if charset=
-         is present, we try to use that for character set conversion. */
+       is present, we try to use that for character set conversion. */
-      if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
+    if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
-        ret = proc (proc_cls,
+      ret = proc (proc_cls,
-                    "html",
+                  "html",
-                    EXTRACTOR_METATYPE_MIMETYPE,
+                  EXTRACTOR_METATYPE_MIMETYPE,
-                    EXTRACTOR_METAFORMAT_UTF8,
+                  EXTRACTOR_METAFORMAT_UTF8,
-                    "text/plain",
+                  "text/plain",
-                    "text/html",
+                  "text/html",
-                    strlen ("text/html")+1);
+                  strlen ("text/html") + 1);
-      charset = strcasestr (tmp, "charset=");
+    charset = strcasestr (tmp, "charset=");
-      if (charset != NULL)
+    if (charset != NULL)
-        charset = strdup (&charset[strlen ("charset=")]);
+      charset = strdup (&charset[strlen ("charset=")]);
-      free (tmp);
+    free (tmp);
-    }
+  }
  i = 0;
  while (tagmap[i].name != NULL)
+  {
+    tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
+    if ( (tmp != NULL) &&
+         (ret == 0) )
    {
-      tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
+      if (charset == NULL)
-      if ( (tmp != NULL) &&
+      {
-           (ret == 0) )
+        ret = proc (proc_cls,
+                    "html",
+                    tagmap[i].type,
+                    EXTRACTOR_METAFORMAT_C_STRING,
+                    "text/plain",
+                    tmp,
+                    strlen (tmp) + 1);
+      }
+      else
+      {
+        xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
+                                                 strlen (tmp),
+                                                 charset);
+        if (xtmp != NULL)
        {
-          if (charset == NULL)
+          ret = proc (proc_cls,
-            {
+                      "html",
-              ret = proc (proc_cls,
+                      tagmap[i].type,
-                          "html",
+                      EXTRACTOR_METAFORMAT_UTF8,
-                          tagmap[i].type,
+                      "text/plain",
-                          EXTRACTOR_METAFORMAT_C_STRING,
+                      xtmp,
-                          "text/plain",
+                      strlen (xtmp) + 1);
-                          tmp,
+          free (xtmp);
-                          strlen (tmp) + 1);
-            }
-          else
-            {
-              xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
-                                                       strlen (tmp),
-                                                       charset);
-              if (xtmp != NULL)
-                {
-                  ret = proc (proc_cls,
-                              "html",
-                              tagmap[i].type,
-                              EXTRACTOR_METAFORMAT_UTF8,
-                              "text/plain",
-                              xtmp,
-                              strlen (xtmp) + 1);
-                  free (xtmp);
-                }
-            }
        }
-      if (tmp != NULL)
+      }
-        free (tmp);
-      i++;
    }
+    if (tmp != NULL)
+      free (tmp);
+    i++;
+  }
  while (tags != NULL)
+  {
+    t = tags;
+    if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
+         (ret == 0) )
    {
-      t = tags;
+      if (charset == NULL)
-      if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
+      {
-           (ret == 0) )
+        xtmp = malloc (t->dataEnd - t->dataStart + 1);
-        {
+        if (xtmp != NULL)
-          if (charset == NULL)
+        {
-            {
+          memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
-              xtmp = malloc (t->dataEnd - t->dataStart + 1);
+          xtmp[t->dataEnd - t->dataStart] = '\0';
-              if (xtmp != NULL)
+          ret = proc (proc_cls,
-                {
+                      "html",
-                  memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
+                      EXTRACTOR_METATYPE_TITLE,
-                  xtmp[t->dataEnd - t->dataStart] = '\0';
+                      EXTRACTOR_METAFORMAT_C_STRING,
-                  ret = proc (proc_cls,
+                      "text/plain",
-                              "html",
+                      xtmp,
-                              EXTRACTOR_METATYPE_TITLE,
+                      strlen (xtmp) + 1);
-                              EXTRACTOR_METAFORMAT_C_STRING,
+          free (xtmp);
-                              "text/plain",
+        }
-                              xtmp,
+      }
-                              strlen (xtmp) + 1);
+      else
-                  free (xtmp);
+      {
-                }
+        xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
-            }
+                                                 t->dataEnd - t->dataStart,
-          else
+                                                 charset);
-            {
+        if (xtmp != NULL)
-              xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
+        {
-                                                       t->dataEnd - t->dataStart,
+          ret = proc (proc_cls,
-                                                       charset);
+                      "html",
-              if (xtmp != NULL)
+                      EXTRACTOR_METATYPE_TITLE,
-                {
+                      EXTRACTOR_METAFORMAT_UTF8,
-                  ret = proc (proc_cls,
+                      "text/plain",
-                              "html",
+                      xtmp,
-                              EXTRACTOR_METATYPE_TITLE,
+                      strlen (xtmp) + 1);
-                              EXTRACTOR_METAFORMAT_UTF8,
+          free (xtmp);
-                              "text/plain",
+        }
-                              xtmp,
+      }
-                              strlen (xtmp) + 1);
-                  free (xtmp);
-                }
-            }
-        }
-      tags = t->next;
-      free (t);
    }
+    tags = t->next;
+    free (t);
+  }
  if (charset != NULL)
    free (charset);
  return ret;
 }
 #endif
@@ -672,9 +679,9 @@ html_gobject_init ()
 {
  magic = magic_open (MAGIC_MIME_TYPE);
  if (0 != magic_load (magic, NULL))
-    {
+  {
-      /* FIXME: how to deal with errors? */
+    /* FIXME: how to deal with errors? */
-    }
+  }
 }
@@ -685,10 +692,11 @@ void __attribute__ ((destructor))
 html_ltdl_fini ()
 {
  if (NULL != magic)
-    {
+  {
-      magic_close (magic);
+    magic_close (magic);
-      magic = NULL;
+    magic = NULL;
-    }
+  }
 }
 /* end of html_extractor.c */

diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c index 8cd4aba..5ebf97b 100644 --- a/src/plugins/html_extractor.c +++ b/src/plugins/html_extractor.c
@@ -87,9 +87,9 @@ tag_to_type (const char *tag)
87	{	87	{
88	unsigned int i;	88	unsigned int i;
89		89
90	for (i=0; NULL != tagmap[i].name; i++)	90	for (i = 0; NULL != tagmap[i].name; i++)
91	if (0 == strcasecmp (tag,	91	if (0 == strcasecmp (tag,
92	tagmap[i].name))	92	tagmap[i].name))
93	return tagmap[i].type;	93	return tagmap[i].type;
94	return EXTRACTOR_METATYPE_RESERVED;	94	return EXTRACTOR_METATYPE_RESERVED;
95	}	95	}
@@ -107,10 +107,10 @@ tag_to_type (const char *tag)
107	*/	107	*/
108	static Bool TIDY_CALL	108	static Bool TIDY_CALL
109	report_cb (TidyDoc doc,	109	report_cb (TidyDoc doc,
110	TidyReportLevel lvl,	110	TidyReportLevel lvl,
111	uint line,	111	uint line,
112	uint col,	112	uint col,
113	ctmbstr mssg)	113	ctmbstr mssg)
114	{	114	{
115	return 0;	115	return 0;
116	}	116	}
@@ -130,7 +130,7 @@ get_byte_cb (void *sourceData)
130		130
131	if (1 !=	131	if (1 !=
132	ec->read (ec->cls,	132	ec->read (ec->cls,
133	&data, 1))	133	&data, 1))
134	return EndOfStream;	134	return EndOfStream;
135	return *(unsigned char*) data;	135	return *(unsigned char*) data;
136	}	136	}
@@ -188,130 +188,129 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
188	const char *mime;	188	const char *mime;
189		189
190	if (-1 == (iret = ec->read (ec->cls,	190	if (-1 == (iret = ec->read (ec->cls,
191	&data,	191	&data,
192	16 * 1024)))	192	16 * 1024)))
193	return;	193	return;
194	if (NULL == (mime = magic_buffer (magic, data, iret)))	194	if (NULL == (mime = magic_buffer (magic, data, iret)))
195	return;	195	return;
196	if (0 != strncmp (mime,	196	if (0 != strncmp (mime,
197	"text/html",	197	"text/html",
198	strlen ("text/html")))	198	strlen ("text/html")))
199	return; /* not HTML */	199	return; /* not HTML */
200		200
201	if (0 != ec->seek (ec->cls, 0, SEEK_SET))	201	if (0 != ec->seek (ec->cls, 0, SEEK_SET))
202	return; /* seek failed !? */	202	return; /* seek failed !? */
203		203
204	tidyInitSource (&src, ec,	204	tidyInitSource (&src, ec,
205	&get_byte_cb,	205	&get_byte_cb,
206	&unget_byte_cb,	206	&unget_byte_cb,
207	&eof_cb);	207	&eof_cb);
208	if (NULL == (doc = tidyCreate ()))	208	if (NULL == (doc = tidyCreate ()))
209	return;	209	return;
210	tidySetReportFilter (doc, &report_cb);	210	tidySetReportFilter (doc, &report_cb);
211	tidySetAppData (doc, ec);	211	tidySetAppData (doc, ec);
212	if (0 > tidyParseSource (doc, &src))	212	if (0 > tidyParseSource (doc, &src))
213	{	213	{
214	tidyRelease (doc);	214	tidyRelease (doc);
215	return;	215	return;
216	}	216	}
217	if (1 != tidyStatus (doc))	217	if (1 != tidyStatus (doc))
218	{	218	{
219	tidyRelease (doc);	219	tidyRelease (doc);
220	return;	220	return;
221	}	221	}
222	if (NULL == (head = tidyGetHead (doc)))	222	if (NULL == (head = tidyGetHead (doc)))
223	{	223	{
224	fprintf (stderr, "no head\n");	224	fprintf (stderr, "no head\n");
225	tidyRelease (doc);	225	tidyRelease (doc);
226	return;	226	return;
227	}	227	}
228	for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))	228	for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
		229	{
		230	switch (tidyNodeGetType (child))
229	{	231	{
230	switch (tidyNodeGetType(child))	232	case TidyNode_Root:
231	{	233	break;
232	case TidyNode_Root:	234	case TidyNode_DocType:
233	break;	235	break;
234	case TidyNode_DocType:	236	case TidyNode_Comment:
235	break;	237	break;
236	case TidyNode_Comment:	238	case TidyNode_ProcIns:
237	break;	239	break;
238	case TidyNode_ProcIns:	240	case TidyNode_Text:
239	break;	241	break;
240	case TidyNode_Text:	242	case TidyNode_CDATA:
241	break;	243	break;
242	case TidyNode_CDATA:	244	case TidyNode_Section:
243	break;	245	break;
244	case TidyNode_Section:	246	case TidyNode_Asp:
245	break;	247	break;
246	case TidyNode_Asp:	248	case TidyNode_Jste:
247	break;	249	break;
248	case TidyNode_Jste:	250	case TidyNode_Php:
249	break;	251	break;
250	case TidyNode_Php:	252	case TidyNode_XmlDecl:
251	break;	253	break;
252	case TidyNode_XmlDecl:	254	case TidyNode_Start:
253	break;	255	case TidyNode_StartEnd:
254	case TidyNode_Start:	256	name = tidyNodeGetName (child);
255	case TidyNode_StartEnd:	257	if ( (0 == strcasecmp (name, "title")) &&
256	name = tidyNodeGetName (child);	258	(NULL != (title = tidyGetChild (child))) )
257	if ( (0 == strcasecmp (name, "title")) &&	259	{
258	(NULL != (title = tidyGetChild (child))) )	260	tidyBufInit (&tbuf);
259	{	261	tidyNodeGetValue (doc, title, &tbuf);
260	tidyBufInit (&tbuf);	262	/* add 0-termination */
261	tidyNodeGetValue (doc, title, &tbuf);	263	tidyBufPutByte (&tbuf, 0);
262	/* add 0-termination */	264	if (0 !=
263	tidyBufPutByte (&tbuf, 0);	265	ec->proc (ec->cls,
264	if (0 !=	266	"html",
265	ec->proc (ec->cls,	267	EXTRACTOR_METATYPE_TITLE,
266	"html",	268	EXTRACTOR_METAFORMAT_UTF8,
267	EXTRACTOR_METATYPE_TITLE,	269	"text/plain",
268	EXTRACTOR_METAFORMAT_UTF8,	270	(const char *) tbuf.bp,
269	"text/plain",	271	tbuf.size))
270	(const char *) tbuf.bp,	272	{
271	tbuf.size))	273	tidyBufFree (&tbuf);
272	{	274	goto CLEANUP;
273	tidyBufFree (&tbuf);	275	}
274	goto CLEANUP;	276	tidyBufFree (&tbuf);
275	}	277	break;
276	tidyBufFree (&tbuf);	278	}
277	break;	279	if (0 == strcasecmp (name, "meta"))
278	}	280	{
279	if (0 == strcasecmp (name, "meta"))	281	if (NULL == (attr = tidyAttrGetById (child,
280	{	282	TidyAttr_NAME)))
281	if (NULL == (attr = tidyAttrGetById (child,	283	break;
282	TidyAttr_NAME)))	284	if (EXTRACTOR_METATYPE_RESERVED ==
283	break;	285	(type = tag_to_type (tidyAttrValue (attr))))
284	if (EXTRACTOR_METATYPE_RESERVED ==	286	break;
285	(type = tag_to_type (tidyAttrValue (attr))))	287	if (NULL == (attr = tidyAttrGetById (child,
286	break;	288	TidyAttr_CONTENT)))
287	if (NULL == (attr = tidyAttrGetById (child,	289	break;
288	TidyAttr_CONTENT)))	290	name = tidyAttrValue (attr);
289	break;	291	if (0 !=
290	name = tidyAttrValue (attr);	292	ec->proc (ec->cls,
291	if (0 !=	293	"html",
292	ec->proc (ec->cls,	294	type,
293	"html",	295	EXTRACTOR_METAFORMAT_UTF8,
294	type,	296	"text/plain",
295	EXTRACTOR_METAFORMAT_UTF8,	297	name,
296	"text/plain",	298	strlen (name) + 1))
297	name,	299	goto CLEANUP;
298	strlen (name) + 1))	300	break;
299	goto CLEANUP;	301	}
300	break;	302	break;
301	}	303	case TidyNode_End:
302	break;	304	break;
303	case TidyNode_End:	305	default:
304	break;	306	break;
305	default:
306	break;
307	}
308	}	307	}
309	CLEANUP:	308	}
		309	CLEANUP:
310	tidyRelease (doc);	310	tidyRelease (doc);
311	}	311	}
312		312
313		313
314
315	#if OLD	314	#if OLD
316		315
317		316
@@ -323,66 +322,71 @@ tagMatch (const char *tag, const char *s, const char *e)
323	return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));	322	return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s)));
324	}	323	}
325		324
		325
326	static int	326	static int
327	lookFor (char c, size_t * pos, const char *data, size_t size)	327	lookFor (char c, size_t *pos, const char *data, size_t size)
328	{	328	{
329	size_t p = *pos;	329	size_t p = *pos;
330		330
331	while ((p < size) && (data[p] != c))	331	while ((p < size) && (data[p] != c))
332	{	332	{
333	if (data[p] == '\0')	333	if (data[p] == '\0')
334	return 0;	334	return 0;
335	p++;	335	p++;
336	}	336	}
337	*pos = p;	337	*pos = p;
338	return p < size;	338	return p < size;
339	}	339	}
340		340
		341
341	static int	342	static int
342	skipWhitespace (size_t * pos, const char *data, size_t size)	343	skipWhitespace (size_t *pos, const char *data, size_t size)
343	{	344	{
344	size_t p = *pos;	345	size_t p = *pos;
345		346
346	while ((p < size) && (isspace ( (unsigned char) data[p])))	347	while ((p < size) && (isspace ( (unsigned char) data[p])))
347	{	348	{
348	if (data[p] == '\0')	349	if (data[p] == '\0')
349	return 0;	350	return 0;
350	p++;	351	p++;
351	}	352	}
352	*pos = p;	353	*pos = p;
353	return p < size;	354	return p < size;
354	}	355	}
355		356
		357
356	static int	358	static int
357	skipLetters (size_t * pos, const char *data, size_t size)	359	skipLetters (size_t *pos, const char *data, size_t size)
358	{	360	{
359	size_t p = *pos;	361	size_t p = *pos;
360		362
361	while ((p < size) && (isalpha ( (unsigned char) data[p])))	363	while ((p < size) && (isalpha ( (unsigned char) data[p])))
362	{	364	{
363	if (data[p] == '\0')	365	if (data[p] == '\0')
364	return 0;	366	return 0;
365	p++;	367	p++;
366	}	368	}
367	*pos = p;	369	*pos = p;
368	return p < size;	370	return p < size;
369	}	371	}
370		372
		373
371	static int	374	static int
372	lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)	375	lookForMultiple (const char *c, size_t *pos, const char *data, size_t size)
373	{	376	{
374	size_t p = *pos;	377	size_t p = *pos;
375		378
376	while ((p < size) && (strchr (c, data[p]) == NULL))	379	while ((p < size) && (strchr (c, data[p]) == NULL))
377	{	380	{
378	if (data[p] == '\0')	381	if (data[p] == '\0')
379	return 0;	382	return 0;
380	p++;	383	p++;
381	}	384	}
382	*pos = p;	385	*pos = p;
383	return p < size;	386	return p < size;
384	}	387	}
385		388
		389
386	static void	390	static void
387	findEntry (const char *key,	391	findEntry (const char *key,
388	const char *start,	392	const char *start,
@@ -394,32 +398,33 @@ findEntry (const char *key,
394	*mend = NULL;	398	*mend = NULL;
395	len = strlen (key);	399	len = strlen (key);
396	while (start < end - len - 1)	400	while (start < end - len - 1)
		401	{
		402	start++;
		403	if (start[len] != '=')
		404	continue;
		405	if (0 == strncasecmp (start, key, len))
397	{	406	{
398	start++;	407	start += len + 1;
399	if (start[len] != '=')	408	*mstart = start;
400	continue;	409	if ((*start == '\"') || (*start == '\''))
401	if (0 == strncasecmp (start, key, len))	410	{
402	{	411	start++;
403	start += len + 1;	412	while ((start < end) && (*start != **mstart))
404	*mstart = start;	413	start++;
405	if ((*start == '\"') || (*start == '\''))	414	(*mstart)++; /* skip quote */
406	{	415	}
407	start++;	416	else
408	while ((start < end) && (*start != **mstart))	417	{
409	start++;	418	while ((start < end) && (! isspace ( (unsigned char) *start)))
410	(*mstart)++; /* skip quote */	419	start++;
411	}	420	}
412	else	421	*mend = start;
413	{	422	return;
414	while ((start < end) && (!isspace ( (unsigned char) *start)))
415	start++;
416	}
417	*mend = start;
418	return;
419	}
420	}	423	}
		424	}
421	}	425	}
422		426
		427
423	/**	428	/**
424	* Search all tags that correspond to "tagname". Example:	429	* Search all tags that correspond to "tagname". Example:
425	* If the tag is <meta name="foo" desc="bar">, and	430	* If the tag is <meta name="foo" desc="bar">, and
@@ -430,7 +435,7 @@ findEntry (const char *key,
430	* @return NULL if nothing is found	435	* @return NULL if nothing is found
431	*/	436	*/
432	static char *	437	static char *
433	findInTags (struct TagInfo * t,	438	findInTags (struct TagInfo *t,
434	const char *tagname,	439	const char *tagname,
435	const char *keyname, const char *keyvalue, const char *searchname)	440	const char *keyname, const char *keyvalue, const char *searchname)
436	{	441	{
@@ -438,26 +443,26 @@ findInTags (struct TagInfo * t,
438	const char *pend;	443	const char *pend;
439		444
440	while (t != NULL)	445	while (t != NULL)
		446	{
		447	if (tagMatch (tagname, t->tagStart, t->tagEnd))
441	{	448	{
442	if (tagMatch (tagname, t->tagStart, t->tagEnd))	449	findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
		450	if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
		451	{
		452	findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
		453	if (pstart != NULL)
443	{	454	{
444	findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);	455	char *ret = malloc (pend - pstart + 1);
445	if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))	456	if (ret == NULL)
446	{	457	return NULL;
447	findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);	458	memcpy (ret, pstart, pend - pstart);
448	if (pstart != NULL)	459	ret[pend - pstart] = '\0';
449	{	460	return ret;
450	char *ret = malloc (pend - pstart + 1);
451	if (ret == NULL)
452	return NULL;
453	memcpy (ret, pstart, pend - pstart);
454	ret[pend - pstart] = '\0';
455	return ret;
456	}
457	}
458	}	461	}
459	t = t->next;	462	}
460	}	463	}
		464	t = t->next;
		465	}
461	return NULL;	466	return NULL;
462	}	467	}
463		468
@@ -465,10 +470,10 @@ findInTags (struct TagInfo * t,
465	/* mimetype = text/html */	470	/* mimetype = text/html */
466	int	471	int
467	EXTRACTOR_html_extract (const char *data,	472	EXTRACTOR_html_extract (const char *data,
468	size_t size,	473	size_t size,
469	EXTRACTOR_MetaDataProcessor proc,	474	EXTRACTOR_MetaDataProcessor proc,
470	void *proc_cls,	475	void *proc_cls,
471	const char *options)	476	const char *options)
472	{	477	{
473	size_t xsize;	478	size_t xsize;
474	struct TagInfo *tags;	479	struct TagInfo *tags;
@@ -494,60 +499,60 @@ EXTRACTOR_html_extract (const char *data,
494	tag.next = NULL;	499	tag.next = NULL;
495	pos = 0;	500	pos = 0;
496	while (pos < xsize)	501	while (pos < xsize)
		502	{
		503	if (! lookFor ('<', &pos, data, size))
		504	break;
		505	tag.tagStart = &data[++pos];
		506	if (! skipLetters (&pos, data, size))
		507	break;
		508	tag.tagEnd = &data[pos];
		509	if (! skipWhitespace (&pos, data, size))
		510	break;
		511	STEP3:
		512	if (! lookForMultiple (">\"\'", &pos, data, size))
		513	break;
		514	if (data[pos] != '>')
497	{	515	{
498	if (!lookFor ('<', &pos, data, size))	516	/* find end-quote, ignore escaped quotes (\') */
499	break;	517	do
500	tag.tagStart = &data[++pos];	518	{
501	if (!skipLetters (&pos, data, size))	519	tpos = pos;
502	break;	520	pos++;
503	tag.tagEnd = &data[pos];	521	if (! lookFor (data[tpos], &pos, data, size))
504	if (!skipWhitespace (&pos, data, size))	522	break;
505	break;	523	}
506	STEP3:	524	while (data[pos - 1] == '\\');
507	if (!lookForMultiple (">\"\'", &pos, data, size))
508	break;
509	if (data[pos] != '>')
510	{
511	/* find end-quote, ignore escaped quotes (\') */
512	do
513	{
514	tpos = pos;
515	pos++;
516	if (!lookFor (data[tpos], &pos, data, size))
517	break;
518	}
519	while (data[pos - 1] == '\\');
520	pos++;
521	goto STEP3;
522	}
523	pos++;	525	pos++;
524	if (!skipWhitespace (&pos, data, size))	526	goto STEP3;
525	break;	527	}
526	tag.dataStart = &data[pos];	528	pos++;
527	if (!lookFor ('<', &pos, data, size))	529	if (! skipWhitespace (&pos, data, size))
528	break;	530	break;
529	tag.dataEnd = &data[pos];	531	tag.dataStart = &data[pos];
530	i = 0;	532	if (! lookFor ('<', &pos, data, size))
531	while (relevantTags[i] != NULL)	533	break;
532	{	534	tag.dataEnd = &data[pos];
533	if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&	535	i = 0;
534	(0 == strncasecmp (relevantTags[i],	536	while (relevantTags[i] != NULL)
535	tag.tagStart, tag.tagEnd - tag.tagStart)))	537	{
536	{	538	if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
537	t = malloc (sizeof (struct TagInfo));	539	(0 == strncasecmp (relevantTags[i],
538	if (t == NULL)	540	tag.tagStart, tag.tagEnd - tag.tagStart)))
539	return 0;	541	{
540	*t = tag;	542	t = malloc (sizeof (struct TagInfo));
541	t->next = tags;	543	if (t == NULL)
542	tags = t;	544	return 0;
543	break;	545	*t = tag;
544	}	546	t->next = tags;
545	i++;	547	tags = t;
546	}
547	/* abort early if we hit the body tag */
548	if (tagMatch ("body", tag.tagStart, tag.tagEnd))
549	break;	548	break;
		549	}
		550	i++;
550	}	551	}
		552	/* abort early if we hit the body tag */
		553	if (tagMatch ("body", tag.tagStart, tag.tagEnd))
		554	break;
		555	}
551		556
552	/* fast exit */	557	/* fast exit */
553	if (tags == NULL)	558	if (tags == NULL)
@@ -557,110 +562,112 @@ EXTRACTOR_html_extract (const char *data,
557	/* first, try to determine mime type and/or character set */	562	/* first, try to determine mime type and/or character set */
558	tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");	563	tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
559	if (tmp != NULL)	564	if (tmp != NULL)
560	{	565	{
561	/* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;	566	/* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
562	if text/html is present, we take that as the mime-type; if charset=	567	if text/html is present, we take that as the mime-type; if charset=
563	is present, we try to use that for character set conversion. */	568	is present, we try to use that for character set conversion. */
564	if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))	569	if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
565	ret = proc (proc_cls,	570	ret = proc (proc_cls,
566	"html",	571	"html",
567	EXTRACTOR_METATYPE_MIMETYPE,	572	EXTRACTOR_METATYPE_MIMETYPE,
568	EXTRACTOR_METAFORMAT_UTF8,	573	EXTRACTOR_METAFORMAT_UTF8,
569	"text/plain",	574	"text/plain",
570	"text/html",	575	"text/html",
571	strlen ("text/html")+1);	576	strlen ("text/html") + 1);
572	charset = strcasestr (tmp, "charset=");	577	charset = strcasestr (tmp, "charset=");
573	if (charset != NULL)	578	if (charset != NULL)
574	charset = strdup (&charset[strlen ("charset=")]);	579	charset = strdup (&charset[strlen ("charset=")]);
575	free (tmp);	580	free (tmp);
576	}	581	}
577	i = 0;	582	i = 0;
578	while (tagmap[i].name != NULL)	583	while (tagmap[i].name != NULL)
		584	{
		585	tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
		586	if ( (tmp != NULL) &&
		587	(ret == 0) )
579	{	588	{
580	tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");	589	if (charset == NULL)
581	if ( (tmp != NULL) &&	590	{
582	(ret == 0) )	591	ret = proc (proc_cls,
		592	"html",
		593	tagmap[i].type,
		594	EXTRACTOR_METAFORMAT_C_STRING,
		595	"text/plain",
		596	tmp,
		597	strlen (tmp) + 1);
		598	}
		599	else
		600	{
		601	xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
		602	strlen (tmp),
		603	charset);
		604	if (xtmp != NULL)
583	{	605	{
584	if (charset == NULL)	606	ret = proc (proc_cls,
585	{	607	"html",
586	ret = proc (proc_cls,	608	tagmap[i].type,
587	"html",	609	EXTRACTOR_METAFORMAT_UTF8,
588	tagmap[i].type,	610	"text/plain",
589	EXTRACTOR_METAFORMAT_C_STRING,	611	xtmp,
590	"text/plain",	612	strlen (xtmp) + 1);
591	tmp,	613	free (xtmp);
592	strlen (tmp) + 1);
593	}
594	else
595	{
596	xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
597	strlen (tmp),
598	charset);
599	if (xtmp != NULL)
600	{
601	ret = proc (proc_cls,
602	"html",
603	tagmap[i].type,
604	EXTRACTOR_METAFORMAT_UTF8,
605	"text/plain",
606	xtmp,
607	strlen (xtmp) + 1);
608	free (xtmp);
609	}
610	}
611	}	614	}
612	if (tmp != NULL)	615	}
613	free (tmp);
614	i++;
615	}	616	}
		617	if (tmp != NULL)
		618	free (tmp);
		619	i++;
		620	}
616	while (tags != NULL)	621	while (tags != NULL)
		622	{
		623	t = tags;
		624	if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
		625	(ret == 0) )
617	{	626	{
618	t = tags;	627	if (charset == NULL)
619	if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&	628	{
620	(ret == 0) )	629	xtmp = malloc (t->dataEnd - t->dataStart + 1);
621	{	630	if (xtmp != NULL)
622	if (charset == NULL)	631	{
623	{	632	memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
624	xtmp = malloc (t->dataEnd - t->dataStart + 1);	633	xtmp[t->dataEnd - t->dataStart] = '\0';
625	if (xtmp != NULL)	634	ret = proc (proc_cls,
626	{	635	"html",
627	memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);	636	EXTRACTOR_METATYPE_TITLE,
628	xtmp[t->dataEnd - t->dataStart] = '\0';	637	EXTRACTOR_METAFORMAT_C_STRING,
629	ret = proc (proc_cls,	638	"text/plain",
630	"html",	639	xtmp,
631	EXTRACTOR_METATYPE_TITLE,	640	strlen (xtmp) + 1);
632	EXTRACTOR_METAFORMAT_C_STRING,	641	free (xtmp);
633	"text/plain",	642	}
634	xtmp,	643	}
635	strlen (xtmp) + 1);	644	else
636	free (xtmp);	645	{
637	}	646	xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
638	}	647	t->dataEnd - t->dataStart,
639	else	648	charset);
640	{	649	if (xtmp != NULL)
641	xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,	650	{
642	t->dataEnd - t->dataStart,	651	ret = proc (proc_cls,
643	charset);	652	"html",
644	if (xtmp != NULL)	653	EXTRACTOR_METATYPE_TITLE,
645	{	654	EXTRACTOR_METAFORMAT_UTF8,
646	ret = proc (proc_cls,	655	"text/plain",
647	"html",	656	xtmp,
648	EXTRACTOR_METATYPE_TITLE,	657	strlen (xtmp) + 1);
649	EXTRACTOR_METAFORMAT_UTF8,	658	free (xtmp);
650	"text/plain",	659	}
651	xtmp,	660	}
652	strlen (xtmp) + 1);
653	free (xtmp);
654	}
655	}
656	}
657	tags = t->next;
658	free (t);
659	}	661	}
		662	tags = t->next;
		663	free (t);
		664	}
660	if (charset != NULL)	665	if (charset != NULL)
661	free (charset);	666	free (charset);
662	return ret;	667	return ret;
663	}	668	}
		669
		670
664	#endif	671	#endif
665		672
666		673
@@ -672,9 +679,9 @@ html_gobject_init ()
672	{	679	{
673	magic = magic_open (MAGIC_MIME_TYPE);	680	magic = magic_open (MAGIC_MIME_TYPE);
674	if (0 != magic_load (magic, NULL))	681	if (0 != magic_load (magic, NULL))
675	{	682	{
676	/* FIXME: how to deal with errors? */	683	/* FIXME: how to deal with errors? */
677	}	684	}
678	}	685	}
679		686
680		687
@@ -685,10 +692,11 @@ void __attribute__ ((destructor))
685	html_ltdl_fini ()	692	html_ltdl_fini ()
686	{	693	{
687	if (NULL != magic)	694	if (NULL != magic)
688	{	695	{
689	magic_close (magic);	696	magic_close (magic);
690	magic = NULL;	697	magic = NULL;
691	}	698	}
692	}	699	}
693		700
		701
694	/* end of html_extractor.c */	702	/* end of html_extractor.c */