commit 1c1ab9345ffb30fdabe892cec619f8935c8f2e32
parent 3d6637f0f4521ce1df920e8919046ba237d086aa
Author: Christian Grothoff <christian@grothoff.org>
Date: Sun, 20 Feb 2005 03:57:58 +0000
bugfix
Diffstat:
9 files changed, 121 insertions(+), 99 deletions(-)
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,8 @@
+Sat Feb 19 22:58:30 EST 2005
+ Fixed problems with wrong byte order for Unicode decoding
+ in PDF meta-data. Fixed minor problems with character
+ set conversion error handling.
+
Wed Jan 26 19:31:04 EST 2005
Workaround possible bug in glib quarks (OLE2 extractor).
Improved QT support (?nam tag, support for description).
diff --git a/configure.ac b/configure.ac
@@ -165,7 +165,7 @@ AC_FUNC_VPRINTF
AC_FUNC_MMAP
AC_FUNC_STAT
AC_FUNC_ERROR_AT_LINE
-AC_CHECK_FUNCS([strndup munmap strcasecmp strdup strncasecmp memmove memset strtoul floor getcwd pow setenv sqrt strchr strcspn strrchr strnlen strndup])
+AC_CHECK_FUNCS([mkstemp strndup munmap strcasecmp strdup strncasecmp memmove memset strtoul floor getcwd pow setenv sqrt strchr strcspn strrchr strnlen strndup])
AM_GNU_GETTEXT_VERSION(0.14)
AM_GNU_GETTEXT([external])
diff --git a/src/main/Makefile.am b/src/main/Makefile.am
@@ -34,7 +34,8 @@ libextractor_la_DEPENDENCIES = \
EXTRA_DIST = \
winproc.c \
libextractor_python.c \
- extract.py
+ extract.py \
+ iconv.c
if MINGW
winproc = winproc.c
diff --git a/src/main/extract.c b/src/main/extract.c
@@ -168,40 +168,7 @@ printHelp ()
}
-/**
- * Convert the given input using the given converter
- * and return as a 0-terminated string.
- */
-static char * iconvHelper(iconv_t cd,
- const char * in) {
- size_t inSize;
- char * buf;
- char * ibuf;
- size_t outSize;
- size_t outLeft;
- size_t ret;
-
- /* reset iconv */
- iconv(cd, NULL, NULL, NULL, NULL);
-
- inSize = strlen(in);
- outSize = 4 * strlen(in) + 2;
- outLeft = outSize - 2; /* make sure we have 2 0-terminations! */
- buf = malloc(outSize);
- ibuf = buf;
- memset(buf, 0, outSize);
- ret = iconv(cd,
- (char**) &in,
- &inSize,
- &ibuf,
- &outLeft);
- if (ret == (size_t)-1) {
- /* conversion failed */
- free(buf);
- return strdup(in);
- }
- return buf;
-}
+#include "iconv.c"
/**
@@ -221,12 +188,12 @@ printSelectedKeywords (FILE * handle,
iconv_t cd;
char * buf;
- cd = iconv_open(
-#ifdef MINGW
- ""
-#else
- nl_langinfo(CODESET)
-#endif
+ cd = iconv_open(
+#ifdef MINGW
+ "char"
+#else
+ nl_langinfo(CODESET)
+#endif
, "UTF-8");
while (keywords != NULL) {
buf = NULL;
diff --git a/src/main/extractor.c b/src/main/extractor.c
@@ -27,10 +27,10 @@
#include <../../libltdl/ltdl.h>
#endif
#include <locale.h>
-#include <iconv.h>
+#include <iconv.h>
#ifndef MINGW
#include <langinfo.h>
-#endif
+#endif
#define DEBUG 1
@@ -720,37 +720,7 @@ EXTRACTOR_removeEmptyKeywords (EXTRACTOR_KeywordList * list)
return list;
}
-/**
- * Convert the given input using the given converter
- * and return as a 0-terminated string.
- */
-static char * iconvHelper(iconv_t cd,
- const char * in) {
- size_t inSize;
- char * buf;
- char * ibuf;
- size_t outSize;
- size_t outLeft;
- /* reset iconv */
- iconv(cd, NULL, NULL, NULL, NULL);
-
- inSize = strlen(in);
- outSize = 4 * strlen(in) + 2;
- outLeft = outSize - 2; /* make sure we have 2 0-terminations! */
- buf = malloc(outSize);
- ibuf = buf;
- memset(buf, 0, outSize);
- if (iconv(cd,
- (char**) &in,
- &inSize,
- &ibuf,
- &outLeft) == (size_t)-1) {
- /* conversion failed */
- free(buf);
- return strdup(in);
- }
- return buf;
-}
+#include "iconv.c"
/**
* Print a keyword list to a file.
@@ -765,17 +735,20 @@ EXTRACTOR_printKeywords (FILE * handle,
iconv_t cd;
char * buf;
- cd = iconv_open(
-#ifdef MINGW
- ""
-#else
- nl_langinfo(CODESET)
-#endif
+ cd = iconv_open(
+#ifdef MINGW
+ ""
+#else
+ nl_langinfo(CODESET)
+#endif
, "UTF-8");
while (keywords != NULL)
{
- buf = iconvHelper(cd,
- keywords->keyword);
+ if (cd == (iconv_t) -1)
+ buf = strdup(keywords->keyword);
+ else
+ buf = iconvHelper(cd,
+ keywords->keyword);
if (keywords->keywordType >= HIGHEST_TYPE_NUMBER)
fprintf(handle,
_("INVALID TYPE - %s\n"),
@@ -788,7 +761,8 @@ EXTRACTOR_printKeywords (FILE * handle,
free(buf);
keywords = keywords->next;
}
- iconv_close(cd);
+ if (cd != (iconv_t) -1)
+ iconv_close(cd);
}
/**
diff --git a/src/main/iconv.c b/src/main/iconv.c
@@ -0,0 +1,35 @@
+
+/**
+ * Convert the 0-terminated string "in" using the converter "cd" and return the result as a newly malloc'ed, 0-terminated string.
+ * If iconv() reports an error, the partial output is discarded and a strdup() copy of the unconverted input is returned instead; the caller is expected to free() the result.
+ */
+static char * iconvHelper(iconv_t cd,
+ const char * in) {
+ size_t inSize;
+ char * buf;
+ char * ibuf;
+ const char * i; /* original start of the input, kept for the error path */
+ size_t outSize;
+ size_t outLeft;
+
+ i = in; /* iconv() is handed &in below and may advance it, so remember where the input began */
+ /* reset the converter to its initial state before using it on this string */
+ iconv(cd, NULL, NULL, NULL, NULL);
+
+ inSize = strlen(in);
+ outSize = 4 * strlen(in) + 2; /* room for 4 output bytes per input byte plus 2 terminating bytes */
+ outLeft = outSize - 2; /* hide the last 2 bytes from iconv() so that 2 0-terminations always remain */
+ buf = malloc(outSize); /* NOTE(review): the malloc() result is not checked before it is used */
+ ibuf = buf;
+ memset(buf, 0, outSize); /* zero-fill so the output is terminated wherever iconv() stops writing */
+ if (iconv(cd,
+ (char**) &in,
+ &inSize,
+ &ibuf,
+ &outLeft) == (size_t)-1) {
+ /* conversion failed: drop the partial output and return a copy of the original, unconverted input */
+ free(buf);
+ return strdup(i);
+ }
+ return buf;
+}
diff --git a/src/plugins/convert.c b/src/plugins/convert.c
@@ -23,6 +23,9 @@
#include "convert.h"
#include <iconv.h>
+#ifndef MINGW
+#include <langinfo.h>
+#endif
/**
* Convert the len characters long character sequence
@@ -40,11 +43,13 @@ char * convertToUtf8(const char * input,
char * tmp;
char * ret;
char * itmp;
+ const char * i;
iconv_t cd;
+ i = input;
cd = iconv_open("UTF-8", charset);
if (cd == (iconv_t) -1)
- return strdup(charset);
+ return strdup(i);
tmpSize = 3 * len + 4;
tmp = malloc(tmpSize);
itmp = tmp;
@@ -56,7 +61,7 @@ char * convertToUtf8(const char * input,
&finSize) == (size_t)-1) {
iconv_close(cd);
free(tmp);
- return strdup(charset);
+ return strdup(i);
}
ret = malloc(tmpSize - finSize + 1);
memcpy(ret,
diff --git a/src/plugins/pdf/pdfextractor.cc b/src/plugins/pdf/pdfextractor.cc
@@ -72,12 +72,26 @@ extern "C" {
s = s1->getCString();
if ((((unsigned char)s[0]) & 0xff) == 0xfe &&
(((unsigned char)s[1]) & 0xff) == 0xff) {
- s = &s[2];
+ char * result;
+ unsigned char u[2];
+ unsigned int pos;
+ unsigned int len;
+ char * con;
+
+ result = (char*) malloc(s1->getLength() * 4);
+ result[0] = '\0';
+ len = s1->getLength();
+ for (pos=0;pos<len;pos+=2) {
+ u[0] = s1->getChar(pos+1);
+ u[1] = s1->getChar(pos);
+ con = (char*) convertToUtf8((const char*) u, 2, "UNICODE");
+ strcat(result, con);
+ free(con);
+ }
next = addKeyword(type,
- convertToUtf8(s,
- strlen(s),
- "UNICODE"),
+ strdup(result),
next);
+ free(result);
} else {
next = addKeyword(type,
convertToUtf8(s,
@@ -105,12 +119,26 @@ extern "C" {
if ((s1->getChar(0) & 0xff) == 0xfe &&
(s1->getChar(1) & 0xff) == 0xff) {
/* isUnicode */
- s = &s[2];
+ char * result;
+ unsigned char u[2];
+ unsigned int pos;
+ unsigned int len;
+ char * con;
+
+ result = (char*) malloc(s1->getLength() * 4);
+ result[0] = '\0';
+ len = s1->getLength();
+ for (pos=0;pos<len;pos+=2) {
+ u[0] = s1->getChar(pos+1);
+ u[1] = s1->getChar(pos);
+ con = (char*) convertToUtf8((const char*) u, 2, "UNICODE");
+ strcat(result, con);
+ free(con);
+ }
next = addKeyword(type,
- convertToUtf8(s,
- strlen(s),
- "UNICODE"),
+ strdup(result),
next);
+ free(result);
} else {
if (s[0] == 'D' && s[1] == ':') {
s += 2;
@@ -238,6 +266,13 @@ extern "C" {
strdup(pcnt),
result);
}
+ {
+ char pcnt[20];
+ sprintf(pcnt, "PDF %.1f", doc->getPDFVersion());
+ result = addKeyword(EXTRACTOR_FORMAT,
+ strdup(pcnt),
+ result);
+ }
result = printInfoDate(info.getDict(),
"CreationDate",
EXTRACTOR_CREATION_DATE,
@@ -247,6 +282,7 @@ extern "C" {
EXTRACTOR_MODIFICATION_DATE,
result);
}
+
info.free();
delete doc;
freeParams();
@@ -254,4 +290,3 @@ extern "C" {
return result;
}
}
-
diff --git a/src/plugins/pngextractor.c b/src/plugins/pngextractor.c
@@ -269,11 +269,11 @@ static struct EXTRACTOR_Keywords * processzTXt(unsigned char * data,
struct EXTRACTOR_Keywords * libextractor_png_extract(char * filename,
- unsigned char * data,
+ const unsigned char * data,
size_t size,
struct EXTRACTOR_Keywords * prev) {
- unsigned char * pos;
- unsigned char * end;
+ const unsigned char * pos;
+ const unsigned char * end;
struct EXTRACTOR_Keywords * result;
unsigned int length;
@@ -290,7 +290,7 @@ struct EXTRACTOR_Keywords * libextractor_png_extract(char * filename,
while(1) {
if (pos+12 >= end)
break;
- length = htonl(getIntAt(pos)); pos+=4;
+ length = htonl(getIntAt(pos)); pos+=4;
if (pos+4+length+4 > end)
break;
if (0 == strncmp(pos, "IHDR", 4))