update - libextractor - GNU libextractor

commit b5b023205ca70b1ce23c565974e2e6bc58799498
parent 54130de64ae4ef42ec2e33b91ad50692e8fd9246
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 18 Sep 2005 14:09:14 +0000

update

Diffstat:
M TODO  | 8 ++++----
D src/main/extract.py  | 32 --------------------------------
M src/plugins/Makefile.am  | 4 +++-
M src/plugins/debextractor.c  | 1 -
M src/plugins/htmlextractor.c  | 1426 +++++++++++++++----------------------------------------------------------------

5 files changed, 266 insertions(+), 1205 deletions(-)
diff --git a/TODO b/TODO
@@ -1,9 +1,9 @@
 FIX:
 * HTML-extractor now broken (!) Also crappy code. FIX?!
-* check exiv2 memory consumption on very large files
-* integrate pt dictionary -- address charset issues!
-* complete language plugin
-
+* check exiv2 memory consumption on very large files;
+  also investigate 500 KB (!) allocation/leak in exiv2 on test/test.html
+  (reported by valgrind)
+* 500 KB leak for each load/unload of exiv2 plugin (glibc?)
 
 
 Core:
diff --git a/src/main/extract.py b/src/main/extract.py
@@ -1,32 +0,0 @@
-"""extract.py
-
-     This file is part of libextractor.
-     (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff
-
-     libextractor is free software; you can redistribute it and/or modify
-     it under the terms of the GNU General Public License as published
-     by the Free Software Foundation; either version 2, or (at your
-     option) any later version.
-
-     libextractor is distributed in the hope that it will be useful, but
-     WITHOUT ANY WARRANTY; without even the implied warranty of
-     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-     General Public License for more details.
-
-     You should have received a copy of the GNU General Public License
-     along with libextractor; see the file COPYING.  If not, write to the
-     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-     Boston, MA 02111-1307, USA.
-
-Little demo how to use the libextractor Python binding.
-
-"""
-import Extractor
-import sys
-
-xtract = Extractor.Extractor()
-for arg in sys.argv[1:]:
-    print "Keywords from " + arg
-    keys = xtract.extract(arg);
-    for i in keys:
-        print i
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
@@ -186,8 +186,10 @@ libextractor_jpeg_la_LDFLAGS = \
 libextractor_html_la_SOURCES = \
   htmlextractor.c 
 libextractor_html_la_LDFLAGS = \
-  $(top_builddir)/src/main/libextractor.la \
   $(PLUGINFLAGS)   $(retaincommand)
+libextractor_html_la_LIBADD = \
+  $(top_builddir)/src/main/libextractor.la \
+  libconvert.la 
 
 libextractor_real_la_SOURCES = \
   realextractor.c 
diff --git a/src/plugins/debextractor.c b/src/plugins/debextractor.c
@@ -21,7 +21,6 @@
 #include "platform.h"
 #include "extractor.h"
 #include <zlib.h>
-#include <pthread.h>
 
 /*
  * The .deb is an ar-chive file.  It contains a tar.gz file
diff --git a/src/plugins/htmlextractor.c b/src/plugins/htmlextractor.c
@@ -17,1171 +17,48 @@
      Free Software Foundation, Inc., 59 Temple Place - Suite 330,
      Boston, MA 02111-1307, USA.
 
-     Portions of this code were adapted from libhtmlparse by
-     Mooneer Salem (mooneer@translator.cs).  The main changes
-     to libhtmlparse were the removal of globals to make the
-     code reentrant.
  */
 
 #include "platform.h"
 #include "extractor.h"
 #include <string.h>
-
-/* struct holding the arguments of tags */
-struct ArgvTable {
-  char *arg, *val;
-};
-
-
-/**
- * libhtmlparse has the callbacks defined as globals,
- * which is bad for making libextractor re-entrant.
- * We now put them all in one big table that is passed
- * around inside the parser.
- *
- *                        The CallBacks
- * You may call one ore several or even all callbacks. Except of the
- * XHTMLCallBack, all CallBacks will work as expected and described
- *
- * XHTMLCallBack:
- * The XHTMLCallBack is a special case, because you can decide, if the
- * XHTML specific tags should be handeled as a start- AND endtag, or
- * as an XHTML tag. If you call nothing, except start and endtag, the
- * behaviour is, that you'll get a start AND an endtag called back.
- * If you call XHTMLCallBack, it will only give you the XHTML call back.
- *
- * If you are in doubt or simply confused now, call XHTMLCallBack()
- */
-typedef struct PC_ {
-/* handle comments and javascript */
-  int (*commentCallBack) (char *comment, struct PC_ * pc);
-  int (*commentStartCallBack) (struct PC_ * pc);
-  int (*commentEndCallBack) (struct PC_ * pc);
-
-  /* Declaration e.g. <!DOCTYPE HTML ... */
-  int (*declCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
-
-  /* Start tag e.g. <html>, with arguments, args may be NULL, numargs may be 0 */
-  int (*startCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
-
-  /* End tag e.g. </html>*/
-  int (*endCallBack) (char *tag, struct PC_ * pc);
-
-  /* handle plain text */
-  int (*textCallBack) (char *text, struct PC_ * pc);
-  int (*textStartCallBack) (struct PC_ * pc);
-  int (*textEndCallBack) (struct PC_ * pc);
-
-  /* PHP inserts. BUG(?): if someone prints another PHP function from this PHP function
-     our lib will get confused. */
-  int (*phpCallBack) (char *text, struct PC_ * pc);
-
-  /* empty tags like <hr/>, <br/>, with arguments, args may be NULL, numargs may be 0 */
-  int (*XHTMLCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
-
-  /* XML tags <?xml>, with arguments, args may be NULL, numargs may be 0 */
-  int (*xmlCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc);
-
-  /* entities like &auml;,&#228; text will inherit all chars between '&' and ';' */
-  int (*entityCallBack) (char *text, struct PC_ * pc);
-
-  /* and we also put some formaly static variables in this */
-
-  /* needed to pass text in <script> tags verbatim */
-  unsigned int lhtml_script_passthru;
-
-  const char * end;
-
-  int numArgs;
-
-  int numArgsStatus;
-
-  /**
-   * 0: ignore, 1: add keyword
-   */
-  int nextTextAction;
-
-  /**
-   * If nextTextAction == 1, this gives the type of the
-   * keyword.
-   */
-  EXTRACTOR_KeywordType nextKeywordType;
-
-  /**
-   * Result of the current pass.
-   */
-  struct EXTRACTOR_Keywords * result;
-
-} ParserContext;
-
-
-/**********************************************************************/
-
-
-/* argument caching (e.g width="80%") */
-static struct ArgvTable *addArgToTable(struct ArgvTable *args, char *arg, char *val,
-				       struct PC_ * pc) {
-  pc->numArgs++;
-  if (args == NULL) {
-    args = (struct ArgvTable*) calloc(1, 
-				      sizeof(struct ArgvTable)*(pc->numArgs+1));
-  } else {
-    args = (struct ArgvTable*) realloc(args, 
-				       sizeof(struct ArgvTable)*(pc->numArgs+1));
-  }
-  if (args == NULL) {
-    fprintf(stderr,
-	    _("Fatal: could not allocate (%s at %s:%d).\n"),
-	    strerror(errno),
-	    __FILE__, __LINE__);
-    exit(EXIT_FAILURE);
-  }
-  args[pc->numArgs-1].arg = arg;
-  args[pc->numArgs-1].val = val;
-  return args;
-}
-
-/* clean up memory */
-static void freeArgs (struct ArgvTable *args,
-		      struct PC_ * pc) {
-  int i;
-
-  if (args != NULL) {
-    for(i=0; i<pc->numArgs; i++) {
-      free(args[i].arg);
-      free(args[i].val);
-    }
-    free(args);
-    args=NULL;
-    pc->numArgs=0;
-  }
-}
-
-/* prototype */
-static const char *parseForEntities(const char *, struct PC_ * pc);
-
-
-static const char *parseText(const char *html, struct PC_ * pc) {
-  char *tmp;
-  const char *tmp2;
-  int ret=0;
-
-  while( (html < pc->end) && isspace((int) *html)) html++;
-
-  if (html >= pc->end) 
-    return html;
-  if (*html == '<') return html;
-
-  tmp2 = html;
-  while ( (html < pc->end) && (*html != '<') ) html++;
-
-  tmp = (char *)calloc(1, (size_t)(html-tmp2+1));
-  if (!tmp) return pc->end;
-
-  memcpy(tmp, tmp2, (size_t)(html-tmp2));
-
-  if (strlen(tmp) > 0) {
-    if (pc->textStartCallBack) {
-      ret = pc->textStartCallBack(pc);
-      if (ret != 0) {
-	free(tmp);
-	return pc->end;
-      }
-    }
-    if (pc->textCallBack) {
-      if (pc->entityCallBack){ /* that is textCallBack(text)
-			      with entityCallBack(entity) as an extrabonus */
-	/*printf("entity is here\n");*/
-	parseForEntities(tmp, pc);
-      } else{
-	ret = pc->textCallBack(tmp, pc);
-	if (ret != 0) {
-	  free(tmp);
-	  return pc->end;
-	}
-      }
-    }
-    if (pc->textEndCallBack) {
-      ret = pc->textEndCallBack(pc);
-      if (ret != 0) {
-	free(tmp);
-	return pc->end;
-      }
-    }
-  }
-  free(tmp);
-  if (html < pc->end-1)
-    if (*(html+1) == '>') html += 2;
-  return html;
-}
-
-static const char *parseComment (const char *html, struct PC_ * pc) {
-  char *tmp;
-  const char *tmp2;
-  int ret=0;
-
-  while ( (html < pc->end) &&
-	  ( (*html == '-') || isspace((int)*html)) ) html++;
-
-  tmp2 = html;
-  while ( (html+2 < pc->end) && 
-	  !(*html == '-' && *(html+1) == '-' && *(html+2) == '>')) html++;
-
-  tmp = (char *)calloc(1, (size_t)(html-tmp2+1));
-  if (!tmp) return pc->end;
-
-  memcpy(tmp, tmp2, (size_t)(html-tmp2));
-
-  if (html+3 < pc->end) {
-    html += 3;
-  } else {
-    free(tmp);
-    return pc->end;
-  }
-
-  if (pc->commentStartCallBack) {
-    ret = pc->commentStartCallBack(pc);
-    if (ret != 0) {
-      free(tmp);
-      return pc->end;
-    }
-  }
-  if (pc->commentCallBack) {
-    ret = pc->commentCallBack(tmp, pc);
-    if (ret != 0) {
-      free(tmp);
-      return pc->end;
-    }
-  }
-  if (pc->commentEndCallBack) {
-    ret = pc->commentEndCallBack(pc);
-    if (ret != 0) {
-      free(tmp);
-      return pc->end;
-    }
-  }
-  free(tmp);
-  return html;
-}
-
-static const char *parseEndTag(const char *html, struct PC_ * pc) {
-  char *tmp;
-  const char *tmp2;
-  int ret=0;
-
-  if (html >= pc->end)
-    return html;
-
-  html++;
-  tmp2 = html;
-  while(html < pc->end && *html != '>') html++;
-
-  tmp =(char *) calloc(1, (size_t)(html-tmp2+1));
-  if (!tmp) return pc->end;
-
-  memcpy(tmp, tmp2, (size_t)(html-tmp2));
-
-  if (pc->endCallBack) {
-    ret = pc->endCallBack(tmp,pc);
-    if (ret != 0) {
-      free(tmp);
-      return pc->end;
-    }
-  }
-  if ( (html < pc->end) && (*html == '>') ) html++;
-  free(tmp);
-  return html;
-}
-
-static const char *parsePHP(const char *html, struct PC_ * pc) {
-  const char *tmp;
-  char *tmp2;
-  int ret=0;
-
-  html += 4;
-  while(html < pc->end && isspace((int)*html)) html++;
-
-  tmp = html;
-
-  while ( (html+1 < pc->end) && !(*html == '?' && *(html+1) == '>')) html++;
-  tmp2 = (char *)calloc(1, (size_t)(html-tmp+1));
-  if (!tmp2) return pc->end;
-
-  memcpy(tmp2, tmp, (size_t)(html-tmp));
-
-  if (pc->phpCallBack) {
-    ret = pc->phpCallBack(tmp2, pc);
-    if (ret != 0) {
-      free(tmp2);
-      return pc->end;
-    }
-  }
-  free(tmp2);
-  html += 2;
-  return html;
-}
-
-/* parse the XML tag itself */
-static const char *parseXMLtag(const char *html, struct PC_ * pc) {
-  char *tag, *name, *value;
-  const char *tmp;
-  int ret;
-  struct ArgvTable *tmp2 = NULL;
-
-  pc->numArgs = 0;
-  tmp = html;
-  while (html < pc->end && !isspace((int)*html) && *html != '>') html++;
-
-  /* you may want to upper/lower tags, so I leave the tag itself untouched */
-  tag = (char *)calloc(1, (size_t)(html-tmp+1));
-  if (!tag) {
-    return pc->end;
-  }
-  memcpy(tag, tmp, (size_t)(html-tmp)); 
-  if (html >= pc->end) {
-    free(tag);
-    return html;
-  }
-  if (*html == '>') {
-    if (pc->xmlCallBack != NULL) {
-      ret = pc->xmlCallBack(tag, NULL, 0, pc);
-      free(tag);
-      if (*html == '>') html++;
-      return ((ret != 0) ? pc->end : html);
-    }
-  }
-  while((html < pc->end) && isspace((int)*html)) html++;
-
-  while( (html < pc->end) && *html != '>' ) {
-    while ( (html < pc->end) && (isspace((int)*html)) ) html++;
-    if (html >= pc->end) 
-      return pc->end;
-    if (*html == '>') break;
-
-    tmp = html;
-    while( (html < pc->end) && !isspace((int)*html) && *html != '=' && *html != '>') html++;
-    name = (char *)calloc(1, (size_t)(html-tmp+1));
-    if (!name) {
-      free(tag);
-      tag = NULL;
-      return pc->end;
-    }
-    memcpy(name, tmp, (size_t)(html-tmp));
-    if (isspace((int)*html)) {
-      tmp2 = addArgToTable(tmp2, name, NULL, pc);
-      while(html < pc->end && isspace((int)*html) && *html != '>') html++;
-    }
-    if (html >= pc->end) {
-      free(tag);
-      return html;
-    }
-    if (*html == '>') {
-      tmp2 = addArgToTable(tmp2, name, NULL, pc);
-      html++;
-      break;
-    }
-    if (*html == '=') html++;
-    if (html >= pc->end) {
-      free(tag);
-      return html;
-    }
-    if (*html != '"' && *html != '\'') {
-      tmp = html;
-      while(html < pc->end && *html != '>' && !isspace((int)*html)) html++;
-      value = (char *)calloc(1, (size_t)(html-tmp+1));
-      if (!value) {
-	free(name);
-	name = NULL;
-	free(tag);
-	tag = NULL;
-	
-	if (tmp2 != NULL) {
-	  freeArgs(tmp2, pc);
-	  tmp2 = NULL;
-	}
-	return pc->end;
-      }
-      memcpy(value, tmp, (size_t)(html-tmp));
-      tmp2 = addArgToTable(tmp2, name, value, pc);
-    } else if (*html == '"') {
-      html++;
-      if (html >= pc->end) {
-	free(tag);
-	return html;
-      }
-      tmp = html;
-      while(html < pc->end && !(*html == '"' && *(html-1) != '\\')) html++;
-      value = (char *) calloc(1, (size_t)(html-tmp+1));
-      if (!value) {
-	free(name);
-	name = NULL;
-	free(tag);
-	tag = NULL;
-	
-	if (tmp2 != NULL) {
-	  freeArgs(tmp2, pc);
-	  tmp2 = NULL;
-	}
-	return pc->end;
-      }
-      memcpy(value, tmp, (size_t)(html-tmp));
-      if (html < pc->end)
-	html++;
-      tmp2 = addArgToTable(tmp2, name, value, pc);
-    } else if (*html == '\'') {
-      html++;
-      if (html >= pc->end) {
-	free(tag);
-	return html;
-      }
-      tmp = html;
-      while(html < pc->end && !(*html == '\'' && *(html-1) != '\\')) html++;
-
-      value =  (char *)calloc(1, (size_t)(html-tmp+1));
-      if (!value) {
-	free(name);
-	name = NULL;
-	free(tag);
-	tag = NULL;	
-	if (tmp2 != NULL) {
-	  freeArgs(tmp2, pc);
-	  tmp2 = NULL;
-	}
-	return pc->end;
-      }
-      memcpy(value, tmp, (size_t)(html-tmp));
-      if (html < pc->end)
-	html++;
-      tmp2 = addArgToTable(tmp2, name, value, pc);
-    }
-    tmp = NULL;
-    value = NULL;
-    name = NULL;
-  }
-  if (html < pc->end) html++;
-  ret = pc->xmlCallBack(tag, tmp2, pc->numArgs, pc);
-  if (tmp2 != NULL) {
-    freeArgs(tmp2, pc);
-    tmp2 = NULL;
-  }
-  free(tag);
-  tag = NULL;
-  pc->numArgsStatus=0;
-  return (ret != 0 ? pc->end : html);
-}
-
-/* cannibalistic function, munches the actuall tag */
-static const char *eatUp(const char *html,
-			 struct PC_ * pc){
-  while ( (html < pc->end) &&
-	  (*html != '>') ) {
-    html++;
-  }
-  if (html < pc->end)
-    html++;
-  return html;
-}
-
-/* cannibalistic function, munches the actuall text */
-static const char *eatUpText(const char *html,
-			     struct PC_ * pc){
-  while ( (html < pc->end)
-	  && (*html != '<') )
-    html++;
-  return html;
-}
-
-
-/* decides, if a found '?' leads to PHP or XML if requisited
-   otherwise it gormandizes them up. *burps* */
-static const char *parseXML(const char *html, struct PC_ * pc) {
-  /* conditional expressions inside a conditional expression
-     don't try _this_ at home kids! ;-) */
-  if (html+1 >= pc->end) 
-    return html;
-  html=(((tolower((int)(*(html+1))))==(int)('p')) ?
-	( (pc->phpCallBack) ? parsePHP   (html, pc) :  eatUp(html, pc) ) :
-	( (pc->xmlCallBack) ? parseXMLtag(html, pc) :  eatUp(html, pc) )   );
-  return html;
-}
-
-static const char *parseStartTag (const char *html, struct PC_ * pc) {
-  char *tag, *name, *value;
-  const char * tmp;
-  const char * start = html;
-  int ret = 0;
-  struct ArgvTable *tmp2 = NULL;
-
-  pc->numArgs = 0;
-  tmp = html;
-  while(html < pc->end && !isspace((int)*html) &&
-	*html != '>' && *html != '/') html++;
-  
-  tag = (char *)calloc(1, (size_t)(html-tmp+1));
-  if (!tag) {
-    return pc->end;
-  }
-  memcpy(tag, tmp, (size_t)(html-tmp));
-
-  if (strncasecmp("script", tag, 6) == 0) {
-    pc->lhtml_script_passthru = 1;
-  }
-  else if (strncasecmp("pre", tag, 3) == 0) {
-    pc->lhtml_script_passthru = 2;
-  }
-  if (html >= pc->end)
-    return pc->end;
-
-  if (*html == '>') {
-    if (pc->startCallBack) {
-      ret = pc->startCallBack(tag, NULL, 0, pc);
-      free(tag);
-      tag = NULL;
-
-      /* this check is redundant */
-      /* if (*html == '>') */ html++;
-      return((ret != 0) ? pc->end : html);
-    }
-  }
-  else if (*html == '/' ) {   /* XHTML empty tag like <hr/>, <br/>*/
-    /**********************************************
-     * You may choose now between two behaviors    *
-     * of libhtmlparse to handle XHTML empty tags: *
-     * a) call XHTMLCallBack                       *
-     * b) call start- AND endCallBack              *
-     ***********************************************/
-    if (pc->startCallBack != NULL && !(pc->XHTMLCallBack)) {
-      ret = pc->startCallBack(tag, NULL, 0, pc);
-    }
-    if (pc->endCallBack != NULL && ret==0 && !(pc->XHTMLCallBack)) {
-      ret = pc->endCallBack(tag, pc);
-    }
-    if(pc->XHTMLCallBack){
-      ret = pc->XHTMLCallBack(tag, NULL, 0, pc);
-    }
-
-    free(tag);
-    tag = NULL;
-
-    html += 2;
-    return((ret != 0) ? pc->end : html);
-  }
-
-  while(html < pc->end && isspace((int)*html)) html++;
-
-  while(html < pc->end && *html != '>' ) {
-    while ( (html < pc->end) && (isspace((int)*html))) html++;
-    if (html+1 >= pc->end)
-      break;
-    if (*html == '>') 
-      break;
-
-    if (*html == '/' && *(html+1) == '>') {
-      html++; 
-      break;
-    }
-
-    tmp = html;
-    while(html < pc->end && !isspace((int)*html) &&
-	  *html != '=' && *html != '>') html++;
-    name = (char *)calloc(1, (size_t)(html-tmp+1));
-    if (!name) {
-      free(tag);
-      return pc->end;
-    }
-
-    memcpy(name, tmp, (size_t)(html-tmp));
-    if (html >= pc->end) {
-      free(tag);
-      return pc->end;
-    }
-    if (isspace((int)*html)) {
-      const char *x = html;
-      while (x < pc->end && *x != '>' && *x != '=') x++;
-      if (x >= pc->end) {
-	free(tag);
-	return pc->end;
-      }
-      if (*x == '=') {
-	html = x;
-	goto namevalue;
-      }
-      tmp2 = addArgToTable(tmp2, name, NULL, pc);
-      while(html+1 < pc->end && isspace((int)*html) &&
-	    *html != '>' &&
-	    !(*html == '/' && *(html+1) == '>'))
-	html++;
-    } else {
-      
-      if (*html == '/') {
-	html++;
-	break;
-      }
-
-      /* html++ is repeated after the while loop
-       * and may cause deletion of important info */
-      if (*html == '>') {
-	tmp2 = addArgToTable(tmp2, name, NULL, pc);
-	/*html++;*/
-	break;
-      }
-
-    namevalue:
-      if (*html == '=') html++;
-
-      while ( (html < pc->end) && (isspace(*html))) html++;
-
-      if (html >= pc->end) {
-	free(tag);
-	return pc->end;
-      }
-      if (*html != '\'') {
-	tmp = html;
-	while(html+1 < pc->end && *html != '>' &&
-	      !isspace((int)*html) &&
-	      !(*html == '/' && *(html+1) == '>'))
-	  html++;
-	value = (char *)calloc(1, (size_t)(html-tmp+1));
-	if (value == NULL) {
-	  free(name);
-	  name = NULL;
-	  free(tag);
-	  tag = NULL;
-	  
-	  freeArgs(tmp2, pc);
-	  return pc->end;
-	}	
-	memcpy(value, tmp, (size_t)(html-tmp));
-	tmp2 = addArgToTable(tmp2, name, value, pc);
-      } else if (*html == '"') {
-	html++;
-	tmp = html;
-	while (html < pc->end &&
-	       !(*html == '"' && *(html-1) != '\\'))
-	  html++;
-	value = (char *) calloc(1, (size_t)(html-tmp+1));
-	if (value == NULL) {
-	  free(name);
-	  name = NULL;
-	  free(tag);
-	  tag = NULL;
-	  
-	  freeArgs(tmp2, pc);
-	  return pc->end;
-	}
-	
-	memcpy(value, tmp, (size_t)(html-tmp));
-	if (html < pc->end)
-	  html++;
-	tmp2 = addArgToTable(tmp2, name, value, pc);
-      } else if (*html == '\'') {
-	html++;
-	tmp = html;
-	while(html < pc->end && !(*html == '\'' &&
-				 *(html-1) != '\\')) html++;
-	
-	value = (char *)calloc(1, (size_t)(html-tmp+1));
-	if (value == NULL) {
-	  free(name);
-	  name = NULL;
-	  free(tag);
-	  tag = NULL;
-	
-	  freeArgs(tmp2, pc);
-	  return pc->end;
-	}
-	
-	memcpy(value, tmp, (size_t)(html-tmp));
-	if (html < pc->end)
-	  html++;
-	tmp2 = addArgToTable(tmp2, name, value, pc);
-      }
-      tmp = NULL;
-    }
-  }
-  if (html < pc->end) html++;
-
-  if (html - start > 2) {
-    if (pc->startCallBack != NULL && (*(html-2)!='/')) {
-      ret = pc->startCallBack(tag, tmp2, pc->numArgs, pc);
-    }
-    if (pc->endCallBack != NULL && ret==0 && *(html-2)=='/'
-	&& !(pc->XHTMLCallBack)) {
-      ret = pc->endCallBack(tag, pc);
-    }
-    /* these tags may have arguments too, e.g. <hr noshade/> */
-    if (pc->XHTMLCallBack != NULL && *(html-2)=='/') {
-      ret = pc->XHTMLCallBack(tag, tmp2, pc->numArgs, pc);
-    }
-  }
-  if(tmp2 != NULL){
-    freeArgs(tmp2, pc);
-  }
-  free(tag);
-  tag = NULL;
-
-  pc->numArgsStatus=0;
-
-  /* this is a bad hack, feel free to write a better one (maybe a more readable one? ;-)*/
-  return
-    (pc->XHTMLCallBack != NULL) ?
-    (html) :
-    ((ret != 0) ? pc->end : html);
-}
-
-static const char *parseDecl(const char *html, struct PC_ * pc) {
-  char *tag, *name, *value;
-  const char *tmp;
-  int ret=0;
-  struct ArgvTable *tmp2 = NULL;
-
-  pc->numArgs = 0;
-  tmp = html;
-  while(html < pc->end && !isspace((int)*html) && *html != '>') html++;
-  if (html >= pc->end)
-    return pc->end;
-  tag = (char *)calloc(1, (size_t)(html-tmp+1));
-  if (!tag) {
-    return pc->end;
-  }
-
-  memcpy(tag, tmp, (size_t)(html-tmp));
-
-  if (*html == '>') {
-    if (pc->declCallBack) {
-      ret = pc->declCallBack(tag, NULL, 0, pc);
-      free(tag);
-      tag = NULL;
-
-      if (*html == '>') html++;
-      return((ret != 0) ? pc->end : html);
-    }
-  }
-
-  while(html < pc->end && isspace((int)*html)) html++;
-
-  while(html < pc->end && *html != '>') {
-    while ( (html<pc->end) && (isspace((int)*html)) ) html++;
-    if (html >= pc->end)
-      return pc->end;
-    if (*html == '>') break;
-    tmp = html;
-    switch(*tmp) {
-    case '\'' :
-      html++;
-      tmp = html;
-      while (html < pc->end && !(*html == '\'' && *html != '\\'))
-	html++;
-      break;
-    case '"'  :
-      html++;
-      tmp = html;
-      while(html < pc->end && !(*html == '"' && *html != '\\'))
-	html++;
-      break;
-    default  :
-      while(html < pc->end && !isspace((int)*html) && *html != '=' && *html != '>')
-	html++;
-      break;
-    }
-
-    name = (char *) calloc(1, (size_t)(html-tmp+1));
-    if (!name) {
-      free(tag);
-      tag = NULL;
-      return pc->end;
-    }
-
-    memcpy(name, tmp, (size_t)(html-tmp));
-    if (html >= pc->end) {
-      free(tag);
-      free(name);
-      return pc->end;
-    }
-
-    if (isspace((int)*html)) {
-      tmp2 = addArgToTable(tmp2, name, NULL, pc);
-      while (html < pc->end && isspace((int)*html) && *html != '>')
-	html++;
-      continue;
-    }
-    if (html >= pc->end) {
-      free(tag);
-      free(name);
-      return pc->end;
-    }
-
-    if (*html == '>') {
-      tmp2 = addArgToTable(tmp2, name, NULL, pc);
-      html++;
-      break;
-    }
-    if (html+1 >= pc->end) {
-      free(tag);
-      free(name);
-      return pc->end;
-    }
-
-    if (*(html+1) == '>') {
-      tmp2 = addArgToTable(tmp2, name, NULL, pc);
-      html += 2;
-      break;
-    }
-    if (html >= pc->end) {
-      free(tag);
-      free(name);
-      return pc->end;
-    }
-
-    if (*html == '=') html++;
-    switch(*html){
-    case '\''  :
-      html++;
-      tmp = html;
-      while(html < pc->end && !(*html == '\'' && *(html-1) != '\\'))
-	html++;
-
-      value = (char *) calloc(1, (size_t)(html-tmp+1));
-      if (!value) {
-	free(name);
-	name = NULL;
-	free(tag);
-	tag = NULL;
-	
-	freeArgs(tmp2, pc);
-	return pc->end;
-      }
-
-      memcpy(value, tmp, (size_t)(html-tmp));
-      if (html < pc->end)
-	html++;
-      tmp2 = addArgToTable(tmp2, name, value, pc);
-      break;
-    case '"'  :
-      html++;
-      tmp = html;
-      while (html < pc->end && !(*html == '"' && *(html-1) != '\\'))
-	html++;
-      value =  (char *)calloc(1, (size_t)(html-tmp+1));
-      if (!value) {
-	free(name);       
-	free(tag);	
-	freeArgs(tmp2, pc);
-	return pc->end;
-      }
-
-      memcpy(value, tmp, (size_t)(html-tmp));
-      if (html < pc->end)
-	html++;
-      tmp2 = addArgToTable(tmp2, name, value, pc);
-      break;
-    default  :
-      html++;
-      tmp = html;
-      while (html < pc->end && *html != '>' && !isspace((int)*html))
-	html++;
-      value = (char *) calloc(1, (size_t)(html-tmp+1));
-      if (!value) {
-	free(name);
-	name = NULL;
-	free(tag);
-	tag = NULL;
-	
-	freeArgs(tmp2, pc);
-	return pc->end;
-      }
-
-      memcpy(value, tmp, (size_t)(html-tmp));
-      tmp2 = addArgToTable(tmp2, name, value, pc);
-      break;
-    }
-    tmp = NULL;
-  }
-
-  if (html < pc->end) html++;
-
-  if (pc->declCallBack) {
-    ret = pc->declCallBack(tag, tmp2, pc->numArgs, pc);
-    freeArgs(tmp2, pc);
-    free(tag);
-    tag = NULL;
-    return((ret != 0) ? pc->end : html);
-  }
-  freeArgs(tmp2, pc);
-  pc->numArgsStatus=0;
-
-  return html;
-}
-
-static const char *parseForEntities (const char *tmp, struct PC_ * pc){
-  char *entity, *text ;
-  const char *tmp1, *tmp2;
-  int ret=0, count=0;
-  while (tmp < pc->end){
-    tmp1 = tmp;
-    while (tmp < pc->end && *tmp != '&')tmp++;
-
-    text = (char *)calloc(1, (size_t)(tmp-tmp1+1));
-    if (text == NULL) {
-      return pc->end;
-    }
-
-    memcpy(text, tmp1, (size_t)(tmp-tmp1));
-    /* the chunk of text before the first entity will
-       not be called, if it starts with an entity*/
-    if(strlen(text)>0 && (!(isspace((int)*text)))){
-      if (pc->textCallBack) {
-	ret = pc->textCallBack(text, pc);
-      }
-      free(text);
-      text = NULL;
-      tmp1 = pc->end;
-    }
-    if(*tmp == '&'){
-      tmp++;
-      tmp2=tmp;
-      /* sometimes the ';' is absent, it's a bad hack, just to avoid more trouble */
-      while( tmp < pc->end && (*tmp != ';' && count != 9) ){
-	tmp++;
-	count++;
-      }
-      entity = (char *)calloc(1, (size_t)(tmp-tmp2+1));
-      if (!entity) {
-	return pc->end;
-      } else {
-	memcpy(entity, tmp2, (size_t)(tmp-tmp2));
-	if (*tmp == ';' || count == 9){  /* should I add an errortrap here? */
-	  ret = pc->entityCallBack(entity, pc);
-	  free(entity);
-	  entity = NULL;
-	  tmp2 = pc->end;
-	  count = 0;
-	}
-      }
-    }
-    if (tmp < pc->end) tmp++;
-  }
-  return tmp;
-}
-
-static void parse (const char *html, struct PC_ * pc) {
-  while (html < pc->end) {
-    /* while(isspace(*html)){html++;} there may be leading blanks in some autogenerated files
-       add this or not, that is the question ;-)) */
-
-    if (pc->lhtml_script_passthru != 0) {
-      const char *text;
-      char *tmp;
-
-      text = html;
-      if (pc->lhtml_script_passthru == 1 ){
-	while(text+7 < pc->end) {
-	  if (*text == '<') {
-	    if (*(text+2) == 's' || *(text+2) == 'S') {
-	      if (*(text+7) == 't' || *(text+7) == 'T') {
-		break;
-	      }
-	    }
-	  }
-	  if (text < pc->end) text++;
-	} 
-      }
-      if (pc->lhtml_script_passthru == 2 ){
-	while (text + 4 < pc->end) {
-	  if (*text == '<') {
-	    if (*(text+2) == 'p' || *(text+2) == 'P') {
-	      if (*(text+4) == 'e' || *(text+4) == 'E') {
-		break;
-	      }
-	    }
-	  }
-	  if (text < pc->end) text++;
-	}
-      }
-      if (pc->textCallBack != NULL) {
-	tmp = (char *) malloc((size_t)(text-html+1));
-	if (tmp == NULL) 
-	  return;
-	strncpy(tmp, html, (size_t)(text-html));
-	tmp[text-html] = '\0';  /* strncpy does not zero-terminate! */
-	int ret = pc->textCallBack(tmp, pc);
-	if (ret != 0) {
-	  free(tmp);
-	  tmp = NULL;	
-	  return;
-	}	
-	free(tmp);
-	tmp = NULL;
-      }
-
-      pc->lhtml_script_passthru = 0;
-      html = text;
-    }
-
-    if (*html == '<'){
-      html++;
-      if (html < pc->end) {
-	switch (*html) { 
-	case '!'   :
-	  html++;
-	  
-	  /* I must admit, I like conditional expressions,
-	     they are so obviously obfuscated ;-)          */
-	  
-	  html = (*html == '-') ?
-	    ((pc->commentCallBack) ? parseComment(html, pc) : eatUp(html, pc)) :
-	    ((pc->declCallBack)    ? parseDecl(html, pc)    : eatUp(html, pc))  ;
-	  break;
-	case '?'  : 			/* XML/PHP tag */
-	  html = (pc->xmlCallBack != NULL || pc->phpCallBack != NULL) ?
-	    parseXML(html, pc) :
-	    eatUp(html, pc);
-	    break;
-	case '/'  : 			/* HTML end tag */
-	  html = (pc->endCallBack) ?
-	    parseEndTag(html, pc) :
-	    eatUp(html, pc);
-	    break;
-	default  : 			/* HTML start tag */
-	  html = (pc->XHTMLCallBack != NULL || pc->startCallBack != NULL) ?
-	    parseStartTag(html, pc) :
-	    eatUp(html, pc);
-	    break;
-	}
-      }
-    } else {				 /* All other text */
-      /* while(isspace(*html))html++;   it seems to be faster inside the function */
-      html = (pc->textCallBack)  ?
-	parseText(html, pc):
-	eatUpText(html, pc);
-    }
-  }
-  return;
-}
-
-
-
-/* ******************* now: LE specifics *************** */
-
-
-/**
- * Add a keyword.
- **/
-static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type,
-					      char * keyword,
-					      struct EXTRACTOR_Keywords * next) {
-  EXTRACTOR_KeywordList * result;
-
-  if (keyword == NULL)
-    return next;
-  result = (EXTRACTOR_KeywordList*)malloc(sizeof(EXTRACTOR_KeywordList));
-  result->next = next;
-  result->keyword = strdup(keyword);
-  result->keywordType = type;
-  return result;
-}
-
-/**
- * Called by the parser whenever we see text.
- **/
-static int texts (char *comment, struct PC_ * pc) {
-  if (pc->nextTextAction) {
-    pc->result = addKeyword(pc->nextKeywordType,
-			    comment,
-			    pc->result);
-    pc->nextTextAction = 0;
-  }
-  return 0;
-}
-
-static int hasTag(char * arg,
-		  char * val,
-		  struct ArgvTable * args,
-		  int numargs) {
-  int i;
-  for (i=0;i<numargs;i++) {
-    if ( (NULL != args[i].arg) &&
-	 (NULL != args[i].val) &&
-	 (0 == strcasecmp(args[i].arg, arg)) &&
-	 (0 == strcasecmp(args[i].val, val)) )
-      return 1;
-  }
-  return 0;
-}
-
-static char * getTag(char * arg,
-		     struct ArgvTable * args,
-		     int numargs) {
-  int i;
-  for (i=0;i<numargs;i++)
-    if (0 == strcasecmp(args[i].arg, arg))
-      return args[i].val;
-  return NULL;
-}
+#include "convert.h"
 
 static struct {
   char * name;
   EXTRACTOR_KeywordType type;
 } tagmap[] = {
-   { "author" , EXTRACTOR_AUTHOR},
-   { "description" , EXTRACTOR_DESCRIPTION},
-   { "language", EXTRACTOR_LANGUAGE},
-   { "rights", EXTRACTOR_COPYRIGHT},
-   { "publisher", EXTRACTOR_PUBLISHER},
-   { "date", EXTRACTOR_DATE},
-   { "keywords", EXTRACTOR_KEYWORDS},
-   {NULL, EXTRACTOR_UNKNOWN},
+  { "author" ,         EXTRACTOR_AUTHOR}, /* each entry pairs a name string with a keyword type; NOTE(review): presumably matched against <meta name="..."> values - confirm against the parser */
+  { "title" ,          EXTRACTOR_TITLE},
+  { "description" ,    EXTRACTOR_DESCRIPTION},
+  { "language",        EXTRACTOR_LANGUAGE},
+  { "rights",          EXTRACTOR_COPYRIGHT},
+  { "publisher",       EXTRACTOR_PUBLISHER},
+  { "formatter",       EXTRACTOR_SOFTWARE},
+  { "copyright",       EXTRACTOR_COPYRIGHT},
+  { "abstract",        EXTRACTOR_SUMMARY},
+  { "subject",         EXTRACTOR_SUBJECT},
+  { "abstract",        EXTRACTOR_SUMMARY}, /* NOTE(review): repeats the "abstract" entry two lines up - looks redundant, confirm and drop */
+  { "date",            EXTRACTOR_DATE},
+  { "keywords",        EXTRACTOR_KEYWORDS},
+  { "dc.author" ,      EXTRACTOR_AUTHOR}, /* "dc."-prefixed names start here; NOTE(review): presumably Dublin Core metadata names - confirm */
+  { "dc.title" ,       EXTRACTOR_TITLE},
+  { "dc.description" , EXTRACTOR_DESCRIPTION},
+  { "dc.subject",      EXTRACTOR_SUBJECT},
+  { "dc.creator",      EXTRACTOR_CREATOR},
+  { "dc.publisher",    EXTRACTOR_PUBLISHER},
+  { "dc.date",         EXTRACTOR_DATE},
+  { "dc.format",       EXTRACTOR_FORMAT},
+  { "dc.identifier",   EXTRACTOR_RESOURCE_IDENTIFIER},
+  { "dc.rights",       EXTRACTOR_COPYRIGHT},
+  {NULL, EXTRACTOR_UNKNOWN}, /* last entry: name is NULL; NOTE(review): presumably the end-of-table sentinel - confirm */
 };
 
-
-
-static int starttag(char *tag,
-		    struct ArgvTable *args,
-		    int numargs,
-		    struct PC_ * pc) {
-  int i;
-
-  if (0 == strcasecmp(tag,"title")) {
-    pc->nextTextAction = 1;
-    pc->nextKeywordType = EXTRACTOR_TITLE;
-    return 0;
-  }
-  if (0 == strcasecmp(tag,"meta")) {
-    i = 0;
-    while (tagmap[i].name != NULL) {
-      if (hasTag("name",tagmap[i].name,args, numargs))
-	pc->result = addKeyword(tagmap[i].type,
-				getTag("content",
-				       args, numargs),
-				pc->result);
-      i++;
-    }
-  }
-  /* Don't do this, you can't be certain...*/
-#if I_AM_CERTAIN
-  if (0 == strcasecmp(tag,"html")) {
-    pc->result = addKeyword(EXTRACTOR_MIMETYPE,
-			    "text/html",
-			    pc->result);
-    return 0;
-  }
-#endif
-  if ( (tag != NULL) &&
-       ( (0 == strcasecmp(tag, "body")) ||
-	 (0 == strcasecmp(tag, "/body")) ) )
-    return 1;
-  return 0;
-}
-
-static int endtag (char *tag, struct PC_ * pc) {
-  pc->nextTextAction = 0;
-  if ( (tag != NULL) &&
-       ( (0 == strcasecmp(tag, "head")) ||
-	 (0 == strcasecmp(tag, "/head")) ) )
-    return 1;
-  return 0;
-}
-
+static char * relevantTags[] = { /* array of tag-name strings ending in a NULL entry; NOTE(review): presumably the only tags the new parser collects - confirm */
+  "title",
+  "meta",
+  NULL,
+};
 
 /* which mime-types should not be subjected to
    the HTML extractor (no use trying & parsing
@@ -1221,15 +98,128 @@ static char * blacklist[] = {
   NULL,
 };
 
+typedef struct TI {
+  struct TI * next;
+  const char * tagStart;
+  const char * tagEnd;
+  const char * dataStart;
+  const char * dataEnd;
+} TagInfo;
+
+/**
+ * Add a keyword.
+ */
+static struct EXTRACTOR_Keywords * 
+addKeyword(EXTRACTOR_KeywordType type,
+	   char * keyword,
+	   struct EXTRACTOR_Keywords * next) {
+  EXTRACTOR_KeywordList * result;
+
+  result = malloc(sizeof(EXTRACTOR_KeywordList));
+  result->next = next;
+  result->keyword = keyword;
+  result->keywordType = type;
+  return result;
+}
+
+/* ******************** parser helper functions ************** */
+
+static int lookFor(char c, 
+		   size_t * pos, 
+		   const char * data,
+		   size_t size) {
+  size_t p = *pos;
+
+  if (data[p] == '\0') return 0;
+  while ( (p < size) &&
+	  (data[p] != c) ) {
+    p++;
+    if (data[p] == '\0') return 0;
+  }
+  *pos = p;
+  return p < size;
+}
+
+static int skipWhitespace(size_t * pos, 
+			  const char * data,
+			  size_t size) {
+  size_t p = *pos;
+
+  if (data[p] == '\0') return 0;
+  while ( (p < size) &&
+	  (isspace(data[p])) ) {
+    p++;
+    if (data[p] == '\0') return 0;
+  }
+  *pos = p;
+  return p < size;
+}
+
+static int skipLetters(size_t * pos, 
+		       const char * data,
+		       size_t size) {
+  size_t p = *pos;
+  
+  if (data[p] == '\0') return 0;
+  while ( (p < size) &&
+	  (isalpha(data[p])) ) {
+    p++;
+    if (data[p] == '\0') return 0;
+  }
+  *pos = p;
+  return p < size;
+}
+
+static int lookForMultiple(const char * c, 
+			   size_t * pos, 
+			   const char * data,
+			   size_t size) {
+  size_t p = *pos;
+
+  if (data[p] == '\0') return 0;
+  while ( (p < size) &&
+	  (strchr(c, data[p]) == NULL) ) {
+    p++;
+    if (data[p] == '\0') return 0;
+  }
+  *pos = p;
+  return p < size;
+}
+
+/**
+ * Search all tags that correspond to "tagname".  Example:
+ * If the tag is <meta name="foo" desc="bar">, and
+ * tagname == "meta", keyname="name", keyvalue="foo",
+ * and searchname="desc", then this function returns a 
+ * copy (!) of "bar".  Easy enough?
+ *
+ * @return NULL if nothing is found (NOTE: currently an unimplemented stub that always returns NULL)
+ */
+static char * findInTags(TagInfo * t,
+			 const char * tagname,
+			 const char * keyname,
+			 const char * keyvalue,
+			 const char * searchname) {
+  return NULL;
+}
+
+
 /* mimetype = text/html */
 struct EXTRACTOR_Keywords * 
 libextractor_html_extract(const char * filename,
 			  const char * data,
 			  const size_t size,
 			  struct EXTRACTOR_Keywords * prev) {
-  ParserContext pc;
   size_t xsize;
   const char * mime;
+  TagInfo * tags;
+  TagInfo * t;
+  TagInfo tag;
+  size_t pos;
+  size_t tpos;
+  int i;
+  char * charset;
+  char * tmp;
 
   if (size == 0)
     return prev;
@@ -1246,23 +236,125 @@ libextractor_html_extract(const char * filename,
     }
   }
 
-  memset(&pc,
-	 0,
-	 sizeof(ParserContext));
-  pc.end = &data[size];
-  pc.result = prev;
-  pc.textCallBack = &texts;
-  pc.startCallBack = &starttag;
-  pc.endCallBack = &endtag;
+  /* only scan first 32k */
   if (size > 1024 * 32)
     xsize = 1024 * 32;
   else
     xsize = size;
-#ifdef strnlen
-  if (strnlen(data, xsize) < xsize - 1)
+  tags = NULL;
+  tag.next = NULL;
+  pos = 0;
+  while (pos < xsize) {
+    if (! lookFor('<', &pos, data, size)) break;
+    tag.tagStart = &data[++pos];
+    if (! skipLetters(&pos, data, size)) break;
+    tag.tagEnd = &data[pos];
+    if (! skipWhitespace(&pos, data, size)) break;
+  STEP3:
+    if (! lookForMultiple(">\"\'", &pos, data, size)) break;
+    if (data[pos] != '>') {      
+      /* find end-quote, ignore escaped quotes (\') */
+      do {
+	tpos = pos;
+	pos++;
+	if (! lookFor(data[tpos], &pos, data, size)) 
+	  break;
+      } while (data[pos-1] == '\\');
+      pos++;
+      goto STEP3;
+    }
+    pos++;
+    if (! skipWhitespace(&pos, data, size)) break;   
+    tag.dataStart = &data[pos];
+    if (! lookFor('<', &pos, data, size)) break;
+    tag.dataEnd = &data[pos];
+    i = 0;
+    while (relevantTags[i] != NULL) {
+      if ( (strlen(relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
+	   (0 == strncasecmp(relevantTags[i],
+			     tag.tagStart,
+			     tag.tagEnd - tag.tagStart)) ) {
+	t = malloc(sizeof(TagInfo));
+	*t = tag;
+	t->next = tags;
+	tags = t;
+	break;
+      }
+      i++;
+    } 
+    /* abort early if we hit the body tag */
+    if ( (tag.tagEnd - tag.tagStart == strlen("body")) &&
+	 0 == strncasecmp("body",
+			  tag.tagStart,
+			  tag.tagEnd - tag.tagStart))
+      break; 
+  }
+
+  /* fast exit */
+  if (tags == NULL)
     return prev;
-#endif
-  parse(data, &pc);
-  return pc.result;
+
+  charset = NULL;
+
+  /* first, try to determine mime type and/or character set */
+  tmp = findInTags(tags,
+		   "meta", 
+		   "http-equiv", "content-type",
+		   "content");
+  if (tmp != NULL) {
+    /* ideally, tmp == "text/html; charset=ISO-XXXX-Y" or something like that;
+       if text/html is present, we take that as the mime-type; if charset=
+       is present, we try to use that for character set conversion. */
+    if (0 == strncmp(tmp,
+		     "text/html",
+		     strlen("text/html"))) 
+      prev = addKeyword(EXTRACTOR_MIMETYPE,
+			strdup("text/html"),
+			prev);
+    
+    charset = strstr(tmp, "charset=");
+    free(tmp);
+  }
+  if (charset != NULL)
+    charset = strdup(&charset[strlen("charset=")]);
+  else
+    charset = strdup("ISO-8859-1"); /* try a sensible default */
+  
+  
+  i = 0;
+  while (tagmap[i].name != NULL) {
+    tmp = findInTags(tags,
+		     "meta",
+		     "name", tagmap[i].name,
+		     "content");
+    if (tmp != NULL) {
+      prev = addKeyword(tagmap[i].type,
+			convertToUtf8(tmp,
+				      strlen(tmp),
+				      charset),
+			prev);    
+      free(tmp);
+    }
+    i++;
+  }
+
+  
+  while (tags != NULL) {
+    t = tags;
+    if ( (t->tagEnd - t->tagStart == strlen("title")) &&
+	 0 == strncasecmp("title",
+			  t->tagStart,
+			  t->tagEnd - t->tagStart))
+      prev = addKeyword(EXTRACTOR_TITLE,
+			convertToUtf8(t->dataStart,
+				      t->dataEnd - t->dataStart,
+				      charset),
+			prev);    
+    tags = t->next;
+    free(t);
+  }
+  free(charset);
+
+  return prev;
 }

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	TODO	\|	8	++++----
D	src/main/extract.py	\|	32	--------------------------------
M	src/plugins/Makefile.am	\|	4	+++-
M	src/plugins/debextractor.c	\|	1	-
M	src/plugins/htmlextractor.c	\|	1426	+++++++++++++++----------------------------------------------------------------