libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 1c1ab9345ffb30fdabe892cec619f8935c8f2e32
parent 3d6637f0f4521ce1df920e8919046ba237d086aa
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sun, 20 Feb 2005 03:57:58 +0000

bugfix

Diffstat:
M ChangeLog | 5 +++++
M configure.ac | 2 +-
M src/main/Makefile.am | 3 ++-
M src/main/extract.c | 47 +++++++----------------------------------------
M src/main/extractor.c | 58 ++++++++++++++++------------------------------------------
A src/main/iconv.c | 35 +++++++++++++++++++++++++++++++++++
M src/plugins/convert.c | 9 +++++++--
M src/plugins/pdf/pdfextractor.cc | 53 ++++++++++++++++++++++++++++++++++++++++++++---------
M src/plugins/pngextractor.c | 8 ++++----
9 files changed, 121 insertions(+), 99 deletions(-)

diff --git a/ChangeLog b/ChangeLog @@ -1,3 +1,8 @@ +Sat Feb 19 22:58:30 EST 2005 + Fixed problems with wrong byteorder for Unicode decoding + in PDF meta-data. Fixed minor problems with character + set conversion error handling. + Wed Jan 26 19:31:04 EST 2005 Workaround possible bug in glib quarks (OLE2 extractor). Improved QT support (?nam tag, support for description). diff --git a/configure.ac b/configure.ac @@ -165,7 +165,7 @@ AC_FUNC_VPRINTF AC_FUNC_MMAP AC_FUNC_STAT AC_FUNC_ERROR_AT_LINE -AC_CHECK_FUNCS([strndup munmap strcasecmp strdup strncasecmp memmove memset strtoul floor getcwd pow setenv sqrt strchr strcspn strrchr strnlen strndup]) +AC_CHECK_FUNCS([mkstemp strndup munmap strcasecmp strdup strncasecmp memmove memset strtoul floor getcwd pow setenv sqrt strchr strcspn strrchr strnlen strndup]) AM_GNU_GETTEXT_VERSION(0.14) AM_GNU_GETTEXT([external]) diff --git a/src/main/Makefile.am b/src/main/Makefile.am @@ -34,7 +34,8 @@ libextractor_la_DEPENDENCIES = \ EXTRA_DIST = \ winproc.c \ libextractor_python.c \ - extract.py + extract.py \ + iconv.c if MINGW winproc = winproc.c diff --git a/src/main/extract.c b/src/main/extract.c @@ -168,40 +168,7 @@ printHelp () } -/** - * Convert the given input using the given converter - * and return as a 0-terminated string. - */ -static char * iconvHelper(iconv_t cd, - const char * in) { - size_t inSize; - char * buf; - char * ibuf; - size_t outSize; - size_t outLeft; - size_t ret; - - /* reset iconv */ - iconv(cd, NULL, NULL, NULL, NULL); - - inSize = strlen(in); - outSize = 4 * strlen(in) + 2; - outLeft = outSize - 2; /* make sure we have 2 0-terminations! 
*/ - buf = malloc(outSize); - ibuf = buf; - memset(buf, 0, outSize); - ret = iconv(cd, - (char**) &in, - &inSize, - &ibuf, - &outLeft); - if (ret == (size_t)-1) { - /* conversion failed */ - free(buf); - return strdup(in); - } - return buf; -} +#include "iconv.c" /** @@ -221,12 +188,12 @@ printSelectedKeywords (FILE * handle, iconv_t cd; char * buf; - cd = iconv_open( -#ifdef MINGW - "" -#else - nl_langinfo(CODESET) -#endif + cd = iconv_open( +#ifdef MINGW + "char" +#else + nl_langinfo(CODESET) +#endif , "UTF-8"); while (keywords != NULL) { buf = NULL; diff --git a/src/main/extractor.c b/src/main/extractor.c @@ -27,10 +27,10 @@ #include <../../libltdl/ltdl.h> #endif #include <locale.h> -#include <iconv.h> +#include <iconv.h> #ifndef MINGW #include <langinfo.h> -#endif +#endif #define DEBUG 1 @@ -720,37 +720,7 @@ EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list) return list; } -/** - * Convert the given input using the given converter - * and return as a 0-terminated string. - */ -static char * iconvHelper(iconv_t cd, - const char * in) { - size_t inSize; - char * buf; - char * ibuf; - size_t outSize; - size_t outLeft; - /* reset iconv */ - iconv(cd, NULL, NULL, NULL, NULL); - - inSize = strlen(in); - outSize = 4 * strlen(in) + 2; - outLeft = outSize - 2; /* make sure we have 2 0-terminations! */ - buf = malloc(outSize); - ibuf = buf; - memset(buf, 0, outSize); - if (iconv(cd, - (char**) &in, - &inSize, - &ibuf, - &outLeft) == (size_t)-1) { - /* conversion failed */ - free(buf); - return strdup(in); - } - return buf; -} +#include "iconv.c" /** * Print a keyword list to a file. 
@@ -765,17 +735,20 @@ EXTRACTOR_printKeywords (FILE * handle, iconv_t cd; char * buf; - cd = iconv_open( -#ifdef MINGW - "" -#else - nl_langinfo(CODESET) -#endif + cd = iconv_open( +#ifdef MINGW + "" +#else + nl_langinfo(CODESET) +#endif , "UTF-8"); while (keywords != NULL) { - buf = iconvHelper(cd, - keywords->keyword); + if (cd == (iconv_t) -1) + buf = strdup(keywords->keyword); + else + buf = iconvHelper(cd, + keywords->keyword); if (keywords->keywordType >= HIGHEST_TYPE_NUMBER) fprintf(handle, _("INVALID TYPE - %s\n"), @@ -788,7 +761,8 @@ EXTRACTOR_printKeywords (FILE * handle, free(buf); keywords = keywords->next; } - iconv_close(cd); + if (cd != (iconv_t) -1) + iconv_close(cd); } /** diff --git a/src/main/iconv.c b/src/main/iconv.c @@ -0,0 +1,35 @@ + +/** + * Convert the given input using the given converter + * and return as a 0-terminated string. + */ +static char * iconvHelper(iconv_t cd, + const char * in) { + size_t inSize; + char * buf; + char * ibuf; + const char * i; + size_t outSize; + size_t outLeft; + + i = in; + /* reset iconv */ + iconv(cd, NULL, NULL, NULL, NULL); + + inSize = strlen(in); + outSize = 4 * strlen(in) + 2; + outLeft = outSize - 2; /* make sure we have 2 0-terminations! 
*/ + buf = malloc(outSize); + ibuf = buf; + memset(buf, 0, outSize); + if (iconv(cd, + (char**) &in, + &inSize, + &ibuf, + &outLeft) == (size_t)-1) { + /* conversion failed */ + free(buf); + return strdup(i); + } + return buf; +} diff --git a/src/plugins/convert.c b/src/plugins/convert.c @@ -23,6 +23,9 @@ #include "convert.h" #include <iconv.h> +#ifndef MINGW +#include <langinfo.h> +#endif /** * Convert the len characters long character sequence @@ -40,11 +43,13 @@ char * convertToUtf8(const char * input, char * tmp; char * ret; char * itmp; + const char * i; iconv_t cd; + i = input; cd = iconv_open("UTF-8", charset); if (cd == (iconv_t) -1) - return strdup(charset); + return strdup(i); tmpSize = 3 * len + 4; tmp = malloc(tmpSize); itmp = tmp; @@ -56,7 +61,7 @@ char * convertToUtf8(const char * input, &finSize) == (size_t)-1) { iconv_close(cd); free(tmp); - return strdup(charset); + return strdup(i); } ret = malloc(tmpSize - finSize + 1); memcpy(ret, diff --git a/src/plugins/pdf/pdfextractor.cc b/src/plugins/pdf/pdfextractor.cc @@ -72,12 +72,26 @@ extern "C" { s = s1->getCString(); if ((((unsigned char)s[0]) & 0xff) == 0xfe && (((unsigned char)s[1]) & 0xff) == 0xff) { - s = &s[2]; + char * result; + unsigned char u[2]; + unsigned int pos; + unsigned int len; + char * con; + + result = (char*) malloc(s1->getLength() * 4); + result[0] = '\0'; + len = s1->getLength(); + for (pos=0;pos<len;pos+=2) { + u[0] = s1->getChar(pos+1); + u[1] = s1->getChar(pos); + con = (char*) convertToUtf8((const char*) u, 2, "UNICODE"); + strcat(result, con); + free(con); + } next = addKeyword(type, - convertToUtf8(s, - strlen(s), - "UNICODE"), + strdup(result), next); + free(result); } else { next = addKeyword(type, convertToUtf8(s, @@ -105,12 +119,26 @@ extern "C" { if ((s1->getChar(0) & 0xff) == 0xfe && (s1->getChar(1) & 0xff) == 0xff) { /* isUnicode */ - s = &s[2]; + char * result; + unsigned char u[2]; + unsigned int pos; + unsigned int len; + char * con; + + result = (char*) 
malloc(s1->getLength() * 4); + result[0] = '\0'; + len = s1->getLength(); + for (pos=0;pos<len;pos+=2) { + u[0] = s1->getChar(pos+1); + u[1] = s1->getChar(pos); + con = (char*) convertToUtf8((const char*) u, 2, "UNICODE"); + strcat(result, con); + free(con); + } next = addKeyword(type, - convertToUtf8(s, - strlen(s), - "UNICODE"), + strdup(result), next); + free(result); } else { if (s[0] == 'D' && s[1] == ':') { s += 2; @@ -238,6 +266,13 @@ extern "C" { strdup(pcnt), result); } + { + char pcnt[20]; + sprintf(pcnt, "PDF %.1f", doc->getPDFVersion()); + result = addKeyword(EXTRACTOR_FORMAT, + strdup(pcnt), + result); + } result = printInfoDate(info.getDict(), "CreationDate", EXTRACTOR_CREATION_DATE, @@ -247,6 +282,7 @@ extern "C" { EXTRACTOR_MODIFICATION_DATE, result); } + info.free(); delete doc; freeParams(); @@ -254,4 +290,3 @@ extern "C" { return result; } } - diff --git a/src/plugins/pngextractor.c b/src/plugins/pngextractor.c @@ -269,11 +269,11 @@ static struct EXTRACTOR_Keywords * processzTXt(unsigned char * data, struct EXTRACTOR_Keywords * libextractor_png_extract(char * filename, - unsigned char * data, + const unsigned char * data, size_t size, struct EXTRACTOR_Keywords * prev) { - unsigned char * pos; - unsigned char * end; + const unsigned char * pos; + const unsigned char * end; struct EXTRACTOR_Keywords * result; unsigned int length; @@ -290,7 +290,7 @@ struct EXTRACTOR_Keywords * libextractor_png_extract(char * filename, while(1) { if (pos+12 >= end) break; - length = htonl(getIntAt(pos)); pos+=4; + length = htonl(getIntAt(pos)); pos+=4; if (pos+4+length+4 > end) break; if (0 == strncmp(pos, "IHDR", 4))