libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit b5b023205ca70b1ce23c565974e2e6bc58799498
parent 54130de64ae4ef42ec2e33b91ad50692e8fd9246
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 18 Sep 2005 14:09:14 +0000

update

Diffstat:
MTODO | 8++++----
Dsrc/main/extract.py | 32--------------------------------
Msrc/plugins/Makefile.am | 4+++-
Msrc/plugins/debextractor.c | 1-
Msrc/plugins/htmlextractor.c | 1426+++++++++++++++----------------------------------------------------------------
5 files changed, 266 insertions(+), 1205 deletions(-)

diff --git a/TODO b/TODO @@ -1,9 +1,9 @@ FIX: * HTML-extractor now broken (!) Also crappy code. FIX?! -* check exiv2 memory consumption on very large files -* integrate pt dictionary -- address charset issues! -* complete language plugin - +* check exiv2 memory consumption on very large files; + also investigate 500kb (!) allocation/leak in exiv2 on test/test.html + (reported by valgrind) +* 500 kb leak for each load/unload of exiv2 plugin (glibc?) Core: diff --git a/src/main/extract.py b/src/main/extract.py @@ -1,32 +0,0 @@ -"""extract.py - - This file is part of libextractor. - (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - -Little demo how to use the libextractor Python binding. - -""" -import Extractor -import sys - -xtract = Extractor.Extractor() -for arg in sys.argv[1:]: - print "Keywords from " + arg - keys = xtract.extract(arg); - for i in keys: - print i diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -186,8 +186,10 @@ libextractor_jpeg_la_LDFLAGS = \ libextractor_html_la_SOURCES = \ htmlextractor.c libextractor_html_la_LDFLAGS = \ - $(top_builddir)/src/main/libextractor.la \ $(PLUGINFLAGS) $(retaincommand) +libextractor_html_la_LIBADD = \ + $(top_builddir)/src/main/libextractor.la \ + libconvert.la libextractor_real_la_SOURCES = \ realextractor.c diff --git a/src/plugins/debextractor.c b/src/plugins/debextractor.c @@ -21,7 +21,6 @@ #include "platform.h" #include "extractor.h" #include <zlib.h> -#include <pthread.h> /* * The .deb is an ar-chive file. It contains a tar.gz file diff --git a/src/plugins/htmlextractor.c b/src/plugins/htmlextractor.c @@ -17,1171 +17,48 @@ Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - Portions of this code were adapted from libhtmlparse by - Mooneer Salem (mooneer@translator.cs). The main changes - to libhtmlparse were the removal of globals to make the - code reentrant. */ #include "platform.h" #include "extractor.h" #include <string.h> - -/* struct holding the arguments of tags */ -struct ArgvTable { - char *arg, *val; -}; - - -/** - * libhtmlparse has the callbacks defined as globals, - * which is bad for making libextractor re-entrant. - * We now put them all in one big table that is passed - * around inside the parser. - * - * The CallBacks - * You may call one ore several or even all callbacks. Except of the - * XHTMLCallBack, all CallBacks will work as expected and described - * - * XHTMLCallBack: - * The XHTMLCallBack is a special case, because you can decide, if the - * XHTML specific tags should be handeled as a start- AND endtag, or - * as an XHTML tag. If you call nothing, except start and endtag, the - * behaviour is, that you'll get a start AND an endtag called back. - * If you call XHTMLCallBack, it will only give you the XHTML call back. - * - * If you are in doubt or simply confused now, call XHTMLCallBack() - */ -typedef struct PC_ { -/* handle comments and javascript */ - int (*commentCallBack) (char *comment, struct PC_ * pc); - int (*commentStartCallBack) (struct PC_ * pc); - int (*commentEndCallBack) (struct PC_ * pc); - - /* Declaration e.g. <!DOCTYPE HTML ... */ - int (*declCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc); - - /* Start tag e.g. <html>, with arguments, args may be NULL, numargs may be 0 */ - int (*startCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc); - - /* End tag e.g. </html>*/ - int (*endCallBack) (char *tag, struct PC_ * pc); - - /* handle plain text */ - int (*textCallBack) (char *text, struct PC_ * pc); - int (*textStartCallBack) (struct PC_ * pc); - int (*textEndCallBack) (struct PC_ * pc); - - /* PHP inserts. BUG(?): if someone prints another PHP function from this PHP function - our lib will get confused. */ - int (*phpCallBack) (char *text, struct PC_ * pc); - - /* empty tags like <hr/>, <br/>, with arguments, args may be NULL, numargs may be 0 */ - int (*XHTMLCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc); - - /* XML tags <?xml>, with arguments, args may be NULL, numargs may be 0 */ - int (*xmlCallBack) (char *tag, /*@null@*/ struct ArgvTable *args, int numargs, struct PC_ * pc); - - /* entities like &auml;,&#228; text will inherit all chars between '&' and ';' */ - int (*entityCallBack) (char *text, struct PC_ * pc); - - /* and we also put some formaly static variables in this */ - - /* needed to pass text in <script> tags verbatim */ - unsigned int lhtml_script_passthru; - - const char * end; - - int numArgs; - - int numArgsStatus; - - /** - * 0: ignore, 1: add keyword - */ - int nextTextAction; - - /** - * If nextTextAction == 1, this gives the type of the - * keyword. - */ - EXTRACTOR_KeywordType nextKeywordType; - - /** - * Result of the current pass. - */ - struct EXTRACTOR_Keywords * result; - -} ParserContext; - - -/**********************************************************************/ - - -/* argument caching (e.g width="80%") */ -static struct ArgvTable *addArgToTable(struct ArgvTable *args, char *arg, char *val, - struct PC_ * pc) { - pc->numArgs++; - if (args == NULL) { - args = (struct ArgvTable*) calloc(1, - sizeof(struct ArgvTable)*(pc->numArgs+1)); - } else { - args = (struct ArgvTable*) realloc(args, - sizeof(struct ArgvTable)*(pc->numArgs+1)); - } - if (args == NULL) { - fprintf(stderr, - _("Fatal: could not allocate (%s at %s:%d).\n"), - strerror(errno), - __FILE__, __LINE__); - exit(EXIT_FAILURE); - } - args[pc->numArgs-1].arg = arg; - args[pc->numArgs-1].val = val; - return args; -} - -/* clean up memory */ -static void freeArgs (struct ArgvTable *args, - struct PC_ * pc) { - int i; - - if (args != NULL) { - for(i=0; i<pc->numArgs; i++) { - free(args[i].arg); - free(args[i].val); - } - free(args); - args=NULL; - pc->numArgs=0; - } -} - -/* prototype */ -static const char *parseForEntities(const char *, struct PC_ * pc); - - -static const char *parseText(const char *html, struct PC_ * pc) { - char *tmp; - const char *tmp2; - int ret=0; - - while( (html < pc->end) && isspace((int) *html)) html++; - - if (html >= pc->end) - return html; - if (*html == '<') return html; - - tmp2 = html; - while ( (html < pc->end) && (*html != '<') ) html++; - - tmp = (char *)calloc(1, (size_t)(html-tmp2+1)); - if (!tmp) return pc->end; - - memcpy(tmp, tmp2, (size_t)(html-tmp2)); - - if (strlen(tmp) > 0) { - if (pc->textStartCallBack) { - ret = pc->textStartCallBack(pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - if (pc->textCallBack) { - if (pc->entityCallBack){ /* that is textCallBack(text) - with entityCallBack(entity) as an extrabonus */ - /*printf("entity is here\n");*/ - parseForEntities(tmp, pc); - } else{ - ret = pc->textCallBack(tmp, pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - } - if (pc->textEndCallBack) { - ret = pc->textEndCallBack(pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - } - free(tmp); - if (html < pc->end-1) - if (*(html+1) == '>') html += 2; - return html; -} - -static const char *parseComment (const char *html, struct PC_ * pc) { - char *tmp; - const char *tmp2; - int ret=0; - - while ( (html < pc->end) && - ( (*html == '-') || isspace((int)*html)) ) html++; - - tmp2 = html; - while ( (html+2 < pc->end) && - !(*html == '-' && *(html+1) == '-' && *(html+2) == '>')) html++; - - tmp = (char *)calloc(1, (size_t)(html-tmp2+1)); - if (!tmp) return pc->end; - - memcpy(tmp, tmp2, (size_t)(html-tmp2)); - - if (html+3 < pc->end) { - html += 3; - } else { - free(tmp); - return pc->end; - } - - if (pc->commentStartCallBack) { - ret = pc->commentStartCallBack(pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - if (pc->commentCallBack) { - ret = pc->commentCallBack(tmp, pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - if (pc->commentEndCallBack) { - ret = pc->commentEndCallBack(pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - free(tmp); - return html; -} - -static const char *parseEndTag(const char *html, struct PC_ * pc) { - char *tmp; - const char *tmp2; - int ret=0; - - if (html >= pc->end) - return html; - - html++; - tmp2 = html; - while(html < pc->end && *html != '>') html++; - - tmp =(char *) calloc(1, (size_t)(html-tmp2+1)); - if (!tmp) return pc->end; - - memcpy(tmp, tmp2, (size_t)(html-tmp2)); - - if (pc->endCallBack) { - ret = pc->endCallBack(tmp,pc); - if (ret != 0) { - free(tmp); - return pc->end; - } - } - if ( (html < pc->end) && (*html == '>') ) html++; - free(tmp); - return html; -} - -static const char *parsePHP(const char *html, struct PC_ * pc) { - const char *tmp; - char *tmp2; - int ret=0; - - html += 4; - while(html < pc->end && isspace((int)*html)) html++; - - tmp = html; - - while ( (html+1 < pc->end) && !(*html == '?' && *(html+1) == '>')) html++; - tmp2 = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!tmp2) return pc->end; - - memcpy(tmp2, tmp, (size_t)(html-tmp)); - - if (pc->phpCallBack) { - ret = pc->phpCallBack(tmp2, pc); - if (ret != 0) { - free(tmp2); - return pc->end; - } - } - free(tmp2); - html += 2; - return html; -} - -/* parse the XML tag itself */ -static const char *parseXMLtag(const char *html, struct PC_ * pc) { - char *tag, *name, *value; - const char *tmp; - int ret; - struct ArgvTable *tmp2 = NULL; - - pc->numArgs = 0; - tmp = html; - while (html < pc->end && !isspace((int)*html) && *html != '>') html++; - - /* you may want to upper/lower tags, so I leave the tag itself untouched */ - tag = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!tag) { - return pc->end; - } - memcpy(tag, tmp, (size_t)(html-tmp)); - if (html >= pc->end) { - free(tag); - return html; - } - if (*html == '>') { - if (pc->xmlCallBack != NULL) { - ret = pc->xmlCallBack(tag, NULL, 0, pc); - free(tag); - if (*html == '>') html++; - return ((ret != 0) ? pc->end : html); - } - } - while((html < pc->end) && isspace((int)*html)) html++; - - while( (html < pc->end) && *html != '>' ) { - while ( (html < pc->end) && (isspace((int)*html)) ) html++; - if (html >= pc->end) - return pc->end; - if (*html == '>') break; - - tmp = html; - while( (html < pc->end) && !isspace((int)*html) && *html != '=' && *html != '>') html++; - name = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!name) { - free(tag); - tag = NULL; - return pc->end; - } - memcpy(name, tmp, (size_t)(html-tmp)); - if (isspace((int)*html)) { - tmp2 = addArgToTable(tmp2, name, NULL, pc); - while(html < pc->end && isspace((int)*html) && *html != '>') html++; - } - if (html >= pc->end) { - free(tag); - return html; - } - if (*html == '>') { - tmp2 = addArgToTable(tmp2, name, NULL, pc); - html++; - break; - } - if (*html == '=') html++; - if (html >= pc->end) { - free(tag); - return html; - } - if (*html != '"' && *html != '\'') { - tmp = html; - while(html < pc->end && *html != '>' && !isspace((int)*html)) html++; - value = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!value) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - if (tmp2 != NULL) { - freeArgs(tmp2, pc); - tmp2 = NULL; - } - return pc->end; - } - memcpy(value, tmp, (size_t)(html-tmp)); - tmp2 = addArgToTable(tmp2, name, value, pc); - } else if (*html == '"') { - html++; - if (html >= pc->end) { - free(tag); - return html; - } - tmp = html; - while(html < pc->end && !(*html == '"' && *(html-1) != '\\')) html++; - value = (char *) calloc(1, (size_t)(html-tmp+1)); - if (!value) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - if (tmp2 != NULL) { - freeArgs(tmp2, pc); - tmp2 = NULL; - } - return pc->end; - } - memcpy(value, tmp, (size_t)(html-tmp)); - if (html < pc->end) - html++; - tmp2 = addArgToTable(tmp2, name, value, pc); - } else if (*html == '\'') { - html++; - if (html >= pc->end) { - free(tag); - return html; - } - tmp = html; - while(html < pc->end && !(*html == '\'' && *(html-1) != '\\')) html++; - - value = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!value) { - free(name); - name = NULL; - free(tag); - tag = NULL; - if (tmp2 != NULL) { - freeArgs(tmp2, pc); - tmp2 = NULL; - } - return pc->end; - } - memcpy(value, tmp, (size_t)(html-tmp)); - if (html < pc->end) - html++; - tmp2 = addArgToTable(tmp2, name, value, pc); - } - tmp = NULL; - value = NULL; - name = NULL; - } - if (html < pc->end) html++; - ret = pc->xmlCallBack(tag, tmp2, pc->numArgs, pc); - if (tmp2 != NULL) { - freeArgs(tmp2, pc); - tmp2 = NULL; - } - free(tag); - tag = NULL; - pc->numArgsStatus=0; - return (ret != 0 ? pc->end : html); -} - -/* cannibalistic function, munches the actuall tag */ -static const char *eatUp(const char *html, - struct PC_ * pc){ - while ( (html < pc->end) && - (*html != '>') ) { - html++; - } - if (html < pc->end) - html++; - return html; -} - -/* cannibalistic function, munches the actuall text */ -static const char *eatUpText(const char *html, - struct PC_ * pc){ - while ( (html < pc->end) - && (*html != '<') ) - html++; - return html; -} - - -/* decides, if a found '?' leads to PHP or XML if requisited - otherwise it gormandizes them up. *burps* */ -static const char *parseXML(const char *html, struct PC_ * pc) { - /* conditional expressions inside a conditional expression - don't try _this_ at home kids! ;-) */ - if (html+1 >= pc->end) - return html; - html=(((tolower((int)(*(html+1))))==(int)('p')) ? - ( (pc->phpCallBack) ? parsePHP (html, pc) : eatUp(html, pc) ) : - ( (pc->xmlCallBack) ? parseXMLtag(html, pc) : eatUp(html, pc) ) ); - return html; -} - -static const char *parseStartTag (const char *html, struct PC_ * pc) { - char *tag, *name, *value; - const char * tmp; - const char * start = html; - int ret = 0; - struct ArgvTable *tmp2 = NULL; - - pc->numArgs = 0; - tmp = html; - while(html < pc->end && !isspace((int)*html) && - *html != '>' && *html != '/') html++; - - tag = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!tag) { - return pc->end; - } - memcpy(tag, tmp, (size_t)(html-tmp)); - - if (strncasecmp("script", tag, 6) == 0) { - pc->lhtml_script_passthru = 1; - } - else if (strncasecmp("pre", tag, 3) == 0) { - pc->lhtml_script_passthru = 2; - } - if (html >= pc->end) - return pc->end; - - if (*html == '>') { - if (pc->startCallBack) { - ret = pc->startCallBack(tag, NULL, 0, pc); - free(tag); - tag = NULL; - - /* this check is redundant */ - /* if (*html == '>') */ html++; - return((ret != 0) ? pc->end : html); - } - } - else if (*html == '/' ) { /* XHTML empty tag like <hr/>, <br/>*/ - /********************************************** - * You may choose now between two behaviors * - * of libhtmlparse to handle XHTML empty tags: * - * a) call XHTMLCallBack * - * b) call start- AND endCallBack * - ***********************************************/ - if (pc->startCallBack != NULL && !(pc->XHTMLCallBack)) { - ret = pc->startCallBack(tag, NULL, 0, pc); - } - if (pc->endCallBack != NULL && ret==0 && !(pc->XHTMLCallBack)) { - ret = pc->endCallBack(tag, pc); - } - if(pc->XHTMLCallBack){ - ret = pc->XHTMLCallBack(tag, NULL, 0, pc); - } - - free(tag); - tag = NULL; - - html += 2; - return((ret != 0) ? pc->end : html); - } - - while(html < pc->end && isspace((int)*html)) html++; - - while(html < pc->end && *html != '>' ) { - while ( (html < pc->end) && (isspace((int)*html))) html++; - if (html+1 >= pc->end) - break; - if (*html == '>') - break; - - if (*html == '/' && *(html+1) == '>') { - html++; - break; - } - - tmp = html; - while(html < pc->end && !isspace((int)*html) && - *html != '=' && *html != '>') html++; - name = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!name) { - free(tag); - return pc->end; - } - - memcpy(name, tmp, (size_t)(html-tmp)); - if (html >= pc->end) { - free(tag); - return pc->end; - } - if (isspace((int)*html)) { - const char *x = html; - while (x < pc->end && *x != '>' && *x != '=') x++; - if (x >= pc->end) { - free(tag); - return pc->end; - } - if (*x == '=') { - html = x; - goto namevalue; - } - tmp2 = addArgToTable(tmp2, name, NULL, pc); - while(html+1 < pc->end && isspace((int)*html) && - *html != '>' && - !(*html == '/' && *(html+1) == '>')) - html++; - } else { - - if (*html == '/') { - html++; - break; - } - - /* html++ is repeated after the while loop - * and may cause deletion of important info */ - if (*html == '>') { - tmp2 = addArgToTable(tmp2, name, NULL, pc); - /*html++;*/ - break; - } - - namevalue: - if (*html == '=') html++; - - while ( (html < pc->end) && (isspace(*html))) html++; - - if (html >= pc->end) { - free(tag); - return pc->end; - } - if (*html != '\'') { - tmp = html; - while(html+1 < pc->end && *html != '>' && - !isspace((int)*html) && - !(*html == '/' && *(html+1) == '>')) - html++; - value = (char *)calloc(1, (size_t)(html-tmp+1)); - if (value == NULL) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - freeArgs(tmp2, pc); - return pc->end; - } - memcpy(value, tmp, (size_t)(html-tmp)); - tmp2 = addArgToTable(tmp2, name, value, pc); - } else if (*html == '"') { - html++; - tmp = html; - while (html < pc->end && - !(*html == '"' && *(html-1) != '\\')) - html++; - value = (char *) calloc(1, (size_t)(html-tmp+1)); - if (value == NULL) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - freeArgs(tmp2, pc); - return pc->end; - } - - memcpy(value, tmp, (size_t)(html-tmp)); - if (html < pc->end) - html++; - tmp2 = addArgToTable(tmp2, name, value, pc); - } else if (*html == '\'') { - html++; - tmp = html; - while(html < pc->end && !(*html == '\'' && - *(html-1) != '\\')) html++; - - value = (char *)calloc(1, (size_t)(html-tmp+1)); - if (value == NULL) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - freeArgs(tmp2, pc); - return pc->end; - } - - memcpy(value, tmp, (size_t)(html-tmp)); - if (html < pc->end) - html++; - tmp2 = addArgToTable(tmp2, name, value, pc); - } - tmp = NULL; - } - } - if (html < pc->end) html++; - - if (html - start > 2) { - if (pc->startCallBack != NULL && (*(html-2)!='/')) { - ret = pc->startCallBack(tag, tmp2, pc->numArgs, pc); - } - if (pc->endCallBack != NULL && ret==0 && *(html-2)=='/' - && !(pc->XHTMLCallBack)) { - ret = pc->endCallBack(tag, pc); - } - /* these tags may have arguments too, e.g. <hr noshade/> */ - if (pc->XHTMLCallBack != NULL && *(html-2)=='/') { - ret = pc->XHTMLCallBack(tag, tmp2, pc->numArgs, pc); - } - } - if(tmp2 != NULL){ - freeArgs(tmp2, pc); - } - free(tag); - tag = NULL; - - pc->numArgsStatus=0; - - /* this is a bad hack, feel free to write a better one (maybe a more readable one? ;-)*/ - return - (pc->XHTMLCallBack != NULL) ? - (html) : - ((ret != 0) ? pc->end : html); -} - -static const char *parseDecl(const char *html, struct PC_ * pc) { - char *tag, *name, *value; - const char *tmp; - int ret=0; - struct ArgvTable *tmp2 = NULL; - - pc->numArgs = 0; - tmp = html; - while(html < pc->end && !isspace((int)*html) && *html != '>') html++; - if (html >= pc->end) - return pc->end; - tag = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!tag) { - return pc->end; - } - - memcpy(tag, tmp, (size_t)(html-tmp)); - - if (*html == '>') { - if (pc->declCallBack) { - ret = pc->declCallBack(tag, NULL, 0, pc); - free(tag); - tag = NULL; - - if (*html == '>') html++; - return((ret != 0) ? pc->end : html); - } - } - - while(html < pc->end && isspace((int)*html)) html++; - - while(html < pc->end && *html != '>') { - while ( (html<pc->end) && (isspace((int)*html)) ) html++; - if (html >= pc->end) - return pc->end; - if (*html == '>') break; - tmp = html; - switch(*tmp) { - case '\'' : - html++; - tmp = html; - while (html < pc->end && !(*html == '\'' && *html != '\\')) - html++; - break; - case '"' : - html++; - tmp = html; - while(html < pc->end && !(*html == '"' && *html != '\\')) - html++; - break; - default : - while(html < pc->end && !isspace((int)*html) && *html != '=' && *html != '>') - html++; - break; - } - - name = (char *) calloc(1, (size_t)(html-tmp+1)); - if (!name) { - free(tag); - tag = NULL; - return pc->end; - } - - memcpy(name, tmp, (size_t)(html-tmp)); - if (html >= pc->end) { - free(tag); - free(name); - return pc->end; - } - - if (isspace((int)*html)) { - tmp2 = addArgToTable(tmp2, name, NULL, pc); - while (html < pc->end && isspace((int)*html) && *html != '>') - html++; - continue; - } - if (html >= pc->end) { - free(tag); - free(name); - return pc->end; - } - - if (*html == '>') { - tmp2 = addArgToTable(tmp2, name, NULL, pc); - html++; - break; - } - if (html+1 >= pc->end) { - free(tag); - free(name); - return pc->end; - } - - if (*(html+1) == '>') { - tmp2 = addArgToTable(tmp2, name, NULL, pc); - html += 2; - break; - } - if (html >= pc->end) { - free(tag); - free(name); - return pc->end; - } - - if (*html == '=') html++; - switch(*html){ - case '\'' : - html++; - tmp = html; - while(html < pc->end && !(*html == '\'' && *(html-1) != '\\')) - html++; - - value = (char *) calloc(1, (size_t)(html-tmp+1)); - if (!value) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - freeArgs(tmp2, pc); - return pc->end; - } - - memcpy(value, tmp, (size_t)(html-tmp)); - if (html < pc->end) - html++; - tmp2 = addArgToTable(tmp2, name, value, pc); - break; - case '"' : - html++; - tmp = html; - while (html < pc->end && !(*html == '"' && *(html-1) != '\\')) - html++; - value = (char *)calloc(1, (size_t)(html-tmp+1)); - if (!value) { - free(name); - free(tag); - freeArgs(tmp2, pc); - return pc->end; - } - - memcpy(value, tmp, (size_t)(html-tmp)); - if (html < pc->end) - html++; - tmp2 = addArgToTable(tmp2, name, value, pc); - break; - default : - html++; - tmp = html; - while (html < pc->end && *html != '>' && !isspace((int)*html)) - html++; - value = (char *) calloc(1, (size_t)(html-tmp+1)); - if (!value) { - free(name); - name = NULL; - free(tag); - tag = NULL; - - freeArgs(tmp2, pc); - return pc->end; - } - - memcpy(value, tmp, (size_t)(html-tmp)); - tmp2 = addArgToTable(tmp2, name, value, pc); - break; - } - tmp = NULL; - } - - if (html < pc->end) html++; - - if (pc->declCallBack) { - ret = pc->declCallBack(tag, tmp2, pc->numArgs, pc); - freeArgs(tmp2, pc); - free(tag); - tag = NULL; - return((ret != 0) ? pc->end : html); - } - freeArgs(tmp2, pc); - pc->numArgsStatus=0; - - return html; -} - -static const char *parseForEntities (const char *tmp, struct PC_ * pc){ - char *entity, *text ; - const char *tmp1, *tmp2; - int ret=0, count=0; - while (tmp < pc->end){ - tmp1 = tmp; - while (tmp < pc->end && *tmp != '&')tmp++; - - text = (char *)calloc(1, (size_t)(tmp-tmp1+1)); - if (text == NULL) { - return pc->end; - } - - memcpy(text, tmp1, (size_t)(tmp-tmp1)); - /* the chunk of text before the first entity will - not be called, if it starts with an entity*/ - if(strlen(text)>0 && (!(isspace((int)*text)))){ - if (pc->textCallBack) { - ret = pc->textCallBack(text, pc); - } - free(text); - text = NULL; - tmp1 = pc->end; - } - if(*tmp == '&'){ - tmp++; - tmp2=tmp; - /* sometimes the ';' is absent, it's a bad hack, just to avoid more trouble */ - while( tmp < pc->end && (*tmp != ';' && count != 9) ){ - tmp++; - count++; - } - entity = (char *)calloc(1, (size_t)(tmp-tmp2+1)); - if (!entity) { - return pc->end; - } else { - memcpy(entity, tmp2, (size_t)(tmp-tmp2)); - if (*tmp == ';' || count == 9){ /* should I add an errortrap here? */ - ret = pc->entityCallBack(entity, pc); - free(entity); - entity = NULL; - tmp2 = pc->end; - count = 0; - } - } - } - if (tmp < pc->end) tmp++; - } - return tmp; -} - -static void parse (const char *html, struct PC_ * pc) { - while (html < pc->end) { - /* while(isspace(*html)){html++;} there may be leading blanks in some autogenerated files - add this or not, that is the question ;-)) */ - - if (pc->lhtml_script_passthru != 0) { - const char *text; - char *tmp; - - text = html; - if (pc->lhtml_script_passthru == 1 ){ - while(text+7 < pc->end) { - if (*text == '<') { - if (*(text+2) == 's' || *(text+2) == 'S') { - if (*(text+7) == 't' || *(text+7) == 'T') { - break; - } - } - } - if (text < pc->end) text++; - } - } - if (pc->lhtml_script_passthru == 2 ){ - while (text + 4 < pc->end) { - if (*text == '<') { - if (*(text+2) == 'p' || *(text+2) == 'P') { - if (*(text+4) == 'e' || *(text+4) == 'E') { - break; - } - } - } - if (text < pc->end) text++; - } - } - if (pc->textCallBack != NULL) { - tmp = (char *) malloc((size_t)(text-html+1)); - if (tmp == NULL) - return; - strncpy(tmp, html, (size_t)(text-html)); - tmp[text-html] = '\0'; /* strncpy does not zero-terminate! */ - int ret = pc->textCallBack(tmp, pc); - if (ret != 0) { - free(tmp); - tmp = NULL; - return; - } - free(tmp); - tmp = NULL; - } - - pc->lhtml_script_passthru = 0; - html = text; - } - - if (*html == '<'){ - html++; - if (html < pc->end) { - switch (*html) { - case '!' : - html++; - - /* I must admit, I like conditional expressions, - they are so obviously obfuscated ;-) */ - - html = (*html == '-') ? - ((pc->commentCallBack) ? parseComment(html, pc) : eatUp(html, pc)) : - ((pc->declCallBack) ? parseDecl(html, pc) : eatUp(html, pc)) ; - break; - case '?' : /* XML/PHP tag */ - html = (pc->xmlCallBack != NULL || pc->phpCallBack != NULL) ? - parseXML(html, pc) : - eatUp(html, pc); - break; - case '/' : /* HTML end tag */ - html = (pc->endCallBack) ? - parseEndTag(html, pc) : - eatUp(html, pc); - break; - default : /* HTML start tag */ - html = (pc->XHTMLCallBack != NULL || pc->startCallBack != NULL) ? - parseStartTag(html, pc) : - eatUp(html, pc); - break; - } - } - } else { /* All other text */ - /* while(isspace(*html))html++; it seems to be faster inside the function */ - html = (pc->textCallBack) ? - parseText(html, pc): - eatUpText(html, pc); - } - } - return; -} - - - -/* ******************* now: LE specifics *************** */ - - -/** - * Add a keyword. - **/ -static struct EXTRACTOR_Keywords * addKeyword(EXTRACTOR_KeywordType type, - char * keyword, - struct EXTRACTOR_Keywords * next) { - EXTRACTOR_KeywordList * result; - - if (keyword == NULL) - return next; - result = (EXTRACTOR_KeywordList*)malloc(sizeof(EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = strdup(keyword); - result->keywordType = type; - return result; -} - -/** - * Called by the parser whenever we see text. - **/ -static int texts (char *comment, struct PC_ * pc) { - if (pc->nextTextAction) { - pc->result = addKeyword(pc->nextKeywordType, - comment, - pc->result); - pc->nextTextAction = 0; - } - return 0; -} - -static int hasTag(char * arg, - char * val, - struct ArgvTable * args, - int numargs) { - int i; - for (i=0;i<numargs;i++) { - if ( (NULL != args[i].arg) && - (NULL != args[i].val) && - (0 == strcasecmp(args[i].arg, arg)) && - (0 == strcasecmp(args[i].val, val)) ) - return 1; - } - return 0; -} - -static char * getTag(char * arg, - struct ArgvTable * args, - int numargs) { - int i; - for (i=0;i<numargs;i++) - if (0 == strcasecmp(args[i].arg, arg)) - return args[i].val; - return NULL; -} +#include "convert.h" static struct { char * name; EXTRACTOR_KeywordType type; } tagmap[] = { - { "author" , EXTRACTOR_AUTHOR}, - { "description" , EXTRACTOR_DESCRIPTION}, - { "language", EXTRACTOR_LANGUAGE}, - { "rights", EXTRACTOR_COPYRIGHT}, - { "publisher", EXTRACTOR_PUBLISHER}, - { "date", EXTRACTOR_DATE}, - { "keywords", EXTRACTOR_KEYWORDS}, - {NULL, EXTRACTOR_UNKNOWN}, + { "author" , EXTRACTOR_AUTHOR}, + { "title" , EXTRACTOR_TITLE}, + { "description" , EXTRACTOR_DESCRIPTION}, + { "language", EXTRACTOR_LANGUAGE}, + { "rights", EXTRACTOR_COPYRIGHT}, + { "publisher", EXTRACTOR_PUBLISHER}, + { "formatter", EXTRACTOR_SOFTWARE}, + { "copyright", EXTRACTOR_COPYRIGHT}, + { "abstract", EXTRACTOR_SUMMARY}, + { "subject", EXTRACTOR_SUBJECT}, + { "abstract", EXTRACTOR_SUMMARY}, + { "date", EXTRACTOR_DATE}, + { "keywords", EXTRACTOR_KEYWORDS}, + { "dc.author" , EXTRACTOR_AUTHOR}, + { "dc.title" , EXTRACTOR_TITLE}, + { "dc.description" , EXTRACTOR_DESCRIPTION}, + { "dc.subject", EXTRACTOR_SUBJECT}, + { "dc.creator", EXTRACTOR_CREATOR}, + { "dc.publisher", EXTRACTOR_PUBLISHER}, + { "dc.date", EXTRACTOR_DATE}, + { "dc.format", EXTRACTOR_FORMAT}, + { "dc.identifier", EXTRACTOR_RESOURCE_IDENTIFIER}, + { "dc.rights", EXTRACTOR_COPYRIGHT}, + {NULL, EXTRACTOR_UNKNOWN}, }; - - -static int starttag(char *tag, - struct ArgvTable *args, - int numargs, - struct PC_ * pc) { - int i; - - if (0 == strcasecmp(tag,"title")) { - pc->nextTextAction = 1; - pc->nextKeywordType = EXTRACTOR_TITLE; - return 0; - } - if (0 == strcasecmp(tag,"meta")) { - i = 0; - while (tagmap[i].name != NULL) { - if (hasTag("name",tagmap[i].name,args, numargs)) - pc->result = addKeyword(tagmap[i].type, - getTag("content", - args, numargs), - pc->result); - i++; - } - } - /* Don't do this, you can't be certain...*/ -#if I_AM_CERTAIN - if (0 == strcasecmp(tag,"html")) { - pc->result = addKeyword(EXTRACTOR_MIMETYPE, - "text/html", - pc->result); - return 0; - } -#endif - if ( (tag != NULL) && - ( (0 == strcasecmp(tag, "body")) || - (0 == strcasecmp(tag, "/body")) ) ) - return 1; - return 0; -} - -static int endtag (char *tag, struct PC_ * pc) { - pc->nextTextAction = 0; - if ( (tag != NULL) && - ( (0 == strcasecmp(tag, "head")) || - (0 == strcasecmp(tag, "/head")) ) ) - return 1; - return 0; -} - +static char * relevantTags[] = { + "title", + "meta", + NULL, +}; /* which mime-types should not be subjected to the HTML extractor (no use trying & parsing @@ -1221,15 +98,128 @@ static char * blacklist[] = { NULL, }; +typedef struct TI { + struct TI * next; + const char * tagStart; + const char * tagEnd; + const char * dataStart; + const char * dataEnd; +} TagInfo; + +/** + * Add a keyword. + */ +static struct EXTRACTOR_Keywords * +addKeyword(EXTRACTOR_KeywordType type, + char * keyword, + struct EXTRACTOR_Keywords * next) { + EXTRACTOR_KeywordList * result; + + result = malloc(sizeof(EXTRACTOR_KeywordList)); + result->next = next; + result->keyword = keyword; + result->keywordType = type; + return result; +} + +/* ******************** parser helper functions ************** */ + +static int lookFor(char c, + size_t * pos, + const char * data, + size_t size) { + size_t p = *pos; + + if (data[p] == '\0') return 0; + while ( (p < size) && + (data[p] != c) ) { + p++; + if (data[p] == '\0') return 0; + } + *pos = p; + return p < size; +} + +static int skipWhitespace(size_t * pos, + const char * data, + size_t size) { + size_t p = *pos; + + if (data[p] == '\0') return 0; + while ( (p < size) && + (isspace(data[p])) ) { + p++; + if (data[p] == '\0') return 0; + } + *pos = p; + return p < size; +} + +static int skipLetters(size_t * pos, + const char * data, + size_t size) { + size_t p = *pos; + + if (data[p] == '\0') return 0; + while ( (p < size) && + (isalpha(data[p])) ) { + p++; + if (data[p] == '\0') return 0; + } + *pos = p; + return p < size; +} + +static int lookForMultiple(const char * c, + size_t * pos, + const char * data, + size_t size) { + size_t p = *pos; + + if (data[p] == '\0') return 0; + while ( (p < size) && + (strchr(c, data[p]) == NULL) ) { + p++; + if (data[p] == '\0') return 0; + } + *pos = p; + return p < size; +} + +/** + * Search all tags that correspond to "tagname". Example: + * If the tag is <meta name="foo" desc="bar">, and + * tagname == "meta", keyname="name", keyvalue="foo", + * and searchname="desc", then this function returns a + * copy (!) of "bar". Easy enough? + * + * @return NULL if nothing is found + */ +static char * findInTags(TagInfo * t, + const char * tagname, + const char * keyname, + const char * keyvalue, + const char * searchname) { + return NULL; +} + + /* mimetype = text/html */ struct EXTRACTOR_Keywords * libextractor_html_extract(const char * filename, const char * data, const size_t size, struct EXTRACTOR_Keywords * prev) { - ParserContext pc; size_t xsize; const char * mime; + TagInfo * tags; + TagInfo * t; + TagInfo tag; + size_t pos; + size_t tpos; + int i; + char * charset; + char * tmp; if (size == 0) return prev; @@ -1246,23 +236,125 @@ libextractor_html_extract(const char * filename, } } - memset(&pc, - 0, - sizeof(ParserContext)); - pc.end = &data[size]; - pc.result = prev; - pc.textCallBack = &texts; - pc.startCallBack = &starttag; - pc.endCallBack = &endtag; + /* only scan first 32k */ if (size > 1024 * 32) xsize = 1024 * 32; else xsize = size; -#ifdef strnlen - if (strnlen(data, xsize) < xsize - 1) + tags = NULL; + tag.next = NULL; + pos = 0; + while (pos < xsize) { + if (! lookFor('<', &pos, data, size)) break; + tag.tagStart = &data[++pos]; + if (! skipLetters(&pos, data, size)) break; + tag.tagEnd = &data[pos]; + if (! skipWhitespace(&pos, data, size)) break; + STEP3: + if (! lookForMultiple(">\"\'", &pos, data, size)) break; + if (data[pos] != '>') { + /* find end-quote, ignore escaped quotes (\') */ + do { + tpos = pos; + pos++; + if (! lookFor(data[tpos], &pos, data, size)) + break; + } while (data[pos-1] == '\\'); + pos++; + goto STEP3; + } + pos++; + if (! skipWhitespace(&pos, data, size)) break; + tag.dataStart = &data[pos]; + if (! lookFor('<', &pos, data, size)) break; + tag.dataEnd = &data[pos]; + i = 0; + while (relevantTags[i] != NULL) { + if ( (strlen(relevantTags[i]) == tag.tagEnd - tag.tagStart) && + (0 == strncasecmp(relevantTags[i], + tag.tagStart, + tag.tagEnd - tag.tagStart)) ) { + t = malloc(sizeof(TagInfo)); + *t = tag; + t->next = tags; + tags = t; + break; + } + i++; + } + /* abort early if we hit the body tag */ + if ( (tag.tagEnd - tag.tagStart == strlen("body")) && + 0 == strncasecmp("body", + tag.tagStart, + tag.tagEnd - tag.tagStart)) + break; + } + + /* fast exit */ + if (tags == NULL) return prev; -#endif - parse(data, &pc); - return pc.result; + + charset = NULL; + + /* first, try to determine mime type and/or character set */ + tmp = findInTags(tags, + "meta", + "http-equiv", "content-type", + "content"); + if (tmp != NULL) { + /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; + if text/html is present, we take that as the mime-type; if charset= + is present, we try to use that for character set conversion. */ + if (0 == strncmp(tmp, + "text/html", + strlen("text/html"))) + prev = addKeyword(EXTRACTOR_MIMETYPE, + strdup("text/html"), + prev); + + charset = strstr(tmp, "charset="); + free(tmp); + } + if (charset != NULL) + charset = strdup(&charset[strlen("charset=")]); + else + charset = strdup("ISO-8859-1"); /* try a sensible default */ + + + i = 0; + while (tagmap[i].name != NULL) { + tmp = findInTags(tags, + "meta", + "name", tagmap[i].name, + "content"); + if (tmp != NULL) { + prev = addKeyword(tagmap[i].type, + convertToUtf8(tmp, + strlen(tmp), + charset), + prev); + free(tmp); + } + i++; + } + + + while (tags != NULL) { + t = tags; + if ( (t->tagEnd - t->tagStart == strlen("title")) && + 0 == strncasecmp("title", + t->tagStart, + t->tagEnd - t->tagStart)) + prev = addKeyword(EXTRACTOR_TITLE, + convertToUtf8(t->dataStart, + t->dataEnd - t->dataStart, + charset), + prev); + tags = t->next; + free(t); + } + free(charset); + + return prev; }