aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/plugins/Makefile.am45
-rw-r--r--src/plugins/html_extractor.c (renamed from src/plugins/old/html_extractor.c)316
-rw-r--r--src/plugins/man_extractor.c292
-rw-r--r--src/plugins/old/man_extractor.c232
-rw-r--r--src/plugins/old/riff_extractor.c123
-rw-r--r--src/plugins/riff_extractor.c157
-rw-r--r--src/plugins/test_html.c124
-rw-r--r--src/plugins/test_man.c85
-rw-r--r--src/plugins/testdata/html_grothoff.html44
-rw-r--r--src/plugins/testdata/man_extract.1109
-rw-r--r--src/plugins/thumbnailgtk_extractor.c7
11 files changed, 1153 insertions, 381 deletions
diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am
index 11cb2b5..f6a4a6a 100644
--- a/src/plugins/Makefile.am
+++ b/src/plugins/Makefile.am
@@ -36,7 +36,9 @@ EXTRA_DIST = template_extractor.c \
36 testdata/nsf_arkanoid.nsf \ 36 testdata/nsf_arkanoid.nsf \
37 testdata/nsfe_classics.nsfe \ 37 testdata/nsfe_classics.nsfe \
38 testdata/xm_diesel.xm \ 38 testdata/xm_diesel.xm \
39 testdata/tiff_haute.tiff 39 testdata/tiff_haute.tiff \
40 testdata/man_extract.1 \
41 testdata/html_grothoff.html
40 42
41if HAVE_VORBISFILE 43if HAVE_VORBISFILE
42PLUGIN_OGG=libextractor_ogg.la 44PLUGIN_OGG=libextractor_ogg.la
@@ -58,6 +60,11 @@ if HAVE_FFMPEG
58PLUGIN_FFMPEG=libextractor_thumbnailffmpeg.la 60PLUGIN_FFMPEG=libextractor_thumbnailffmpeg.la
59TEST_FFMPEG=test_thumbnailffmpeg 61TEST_FFMPEG=test_thumbnailffmpeg
60endif 62endif
63
64if HAVE_TIDY
65PLUGIN_HTML=libextractor_html.la
66TEST_HTML=test_html
67endif
61endif 68endif
62 69
63if HAVE_GIF 70if HAVE_GIF
@@ -105,6 +112,7 @@ PLUGIN_ZLIB=libextractor_deb.la
105TEST_ZLIB=test_deb 112TEST_ZLIB=test_deb
106endif 113endif
107 114
115
108if HAVE_GSTREAMER 116if HAVE_GSTREAMER
109PLUGIN_GSTREAMER=libextractor_gstreamer.la 117PLUGIN_GSTREAMER=libextractor_gstreamer.la
110TEST_GSTREAMER=test_gstreamer 118TEST_GSTREAMER=test_gstreamer
@@ -112,6 +120,7 @@ endif
112 120
113plugin_LTLIBRARIES = \ 121plugin_LTLIBRARIES = \
114 libextractor_it.la \ 122 libextractor_it.la \
123 libextractor_man.la \
115 libextractor_nsf.la \ 124 libextractor_nsf.la \
116 libextractor_nsfe.la \ 125 libextractor_nsfe.la \
117 libextractor_odf.la \ 126 libextractor_odf.la \
@@ -119,9 +128,11 @@ plugin_LTLIBRARIES = \
119 libextractor_xm.la \ 128 libextractor_xm.la \
120 libextractor_s3m.la \ 129 libextractor_s3m.la \
121 libextractor_sid.la \ 130 libextractor_sid.la \
131 libextractor_riff.la \
122 libextractor_wav.la \ 132 libextractor_wav.la \
123 libextractor_zip.la \ 133 libextractor_zip.la \
124 $(PLUGIN_GTK) \ 134 $(PLUGIN_GTK) \
135 $(PLUGIN_HTML) \
125 $(PLUGIN_FFMPEG) \ 136 $(PLUGIN_FFMPEG) \
126 $(PLUGIN_ZLIB) \ 137 $(PLUGIN_ZLIB) \
127 $(PLUGIN_OGG) \ 138 $(PLUGIN_OGG) \
@@ -142,6 +153,7 @@ endif
142 153
143check_PROGRAMS = \ 154check_PROGRAMS = \
144 test_wav \ 155 test_wav \
156 test_man \
145 test_it \ 157 test_it \
146 test_s3m \ 158 test_s3m \
147 test_png \ 159 test_png \
@@ -151,6 +163,7 @@ check_PROGRAMS = \
151 test_nsf \ 163 test_nsf \
152 test_nsfe \ 164 test_nsfe \
153 $(TEST_ZLIB) \ 165 $(TEST_ZLIB) \
166 $(TEST_HTML) \
154 $(TEST_GTK) \ 167 $(TEST_GTK) \
155 $(TEST_FFMPEG) \ 168 $(TEST_FFMPEG) \
156 $(TEST_OGG) \ 169 $(TEST_OGG) \
@@ -201,6 +214,17 @@ test_deb_LDADD = \
201 $(top_builddir)/src/plugins/libtest.la 214 $(top_builddir)/src/plugins/libtest.la
202 215
203 216
217libextractor_man_la_SOURCES = \
218 man_extractor.c
219libextractor_man_la_LDFLAGS = \
220 $(PLUGINFLAGS)
221
222test_man_SOURCES = \
223 test_man.c
224test_man_LDADD = \
225 $(top_builddir)/src/plugins/libtest.la
226
227
204libextractor_nsf_la_SOURCES = \ 228libextractor_nsf_la_SOURCES = \
205 nsf_extractor.c 229 nsf_extractor.c
206libextractor_nsf_la_LDFLAGS = \ 230libextractor_nsf_la_LDFLAGS = \
@@ -279,6 +303,12 @@ libextractor_sid_la_LDFLAGS = \
279 $(PLUGINFLAGS) 303 $(PLUGINFLAGS)
280 304
281 305
306libextractor_riff_la_SOURCES = \
307 riff_extractor.c
308libextractor_riff_la_LDFLAGS = \
309 $(PLUGINFLAGS)
310
311
282libextractor_s3m_la_SOURCES = \ 312libextractor_s3m_la_SOURCES = \
283 s3m_extractor.c 313 s3m_extractor.c
284libextractor_s3m_la_LDFLAGS = \ 314libextractor_s3m_la_LDFLAGS = \
@@ -477,3 +507,16 @@ test_thumbnailffmpeg_SOURCES = \
477 test_thumbnailffmpeg.c 507 test_thumbnailffmpeg.c
478test_thumbnailffmpeg_LDADD = \ 508test_thumbnailffmpeg_LDADD = \
479 $(top_builddir)/src/plugins/libtest.la 509 $(top_builddir)/src/plugins/libtest.la
510
511
512libextractor_html_la_SOURCES = \
513 html_extractor.c
514libextractor_html_la_LDFLAGS = \
515 $(PLUGINFLAGS)
516libextractor_html_la_LIBADD = \
517 -ltidy -lmagic
518
519test_html_SOURCES = \
520 test_html.c
521test_html_LDADD = \
522 $(top_builddir)/src/plugins/libtest.la
diff --git a/src/plugins/old/html_extractor.c b/src/plugins/html_extractor.c
index 004d22a..65fb535 100644
--- a/src/plugins/old/html_extractor.c
+++ b/src/plugins/html_extractor.c
@@ -1,6 +1,6 @@
1/* 1/*
2 This file is part of libextractor. 2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2005, 2009 Vidyut Samanta and Christian Grothoff 3 (C) 2002, 2003, 2004, 2005, 2009, 2012 Vidyut Samanta and Christian Grothoff
4 4
5 libextractor is free software; you can redistribute it and/or modify 5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published 6 it under the terms of the GNU General Public License as published
@@ -18,15 +18,30 @@
18 Boston, MA 02111-1307, USA. 18 Boston, MA 02111-1307, USA.
19 19
20 */ 20 */
21 21/**
22 * @file plugins/html_extractor.c
23 * @brief plugin to support HTML files
24 * @author Christian Grothoff
25 */
22#include "platform.h" 26#include "platform.h"
23#include "extractor.h" 27#include "extractor.h"
24#include <string.h> 28#include <magic.h>
25#include "convert.h" 29#include <tidy/tidy.h>
30#include <tidy/buffio.h>
26 31
32/**
33 * Mapping of HTML META names to LE types.
34 */
27static struct 35static struct
28{ 36{
37 /**
38 * HTML META name.
39 */
29 const char *name; 40 const char *name;
41
42 /**
43 * Corresponding LE type.
44 */
30 enum EXTRACTOR_MetaType type; 45 enum EXTRACTOR_MetaType type;
31} tagmap[] = { 46} tagmap[] = {
32 { "author", EXTRACTOR_METATYPE_AUTHOR_NAME }, 47 { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
@@ -54,22 +69,250 @@ static struct
54 { NULL, EXTRACTOR_METATYPE_RESERVED } 69 { NULL, EXTRACTOR_METATYPE_RESERVED }
55}; 70};
56 71
57static const char *relevantTags[] = {
58 "title",
59 "meta",
60 NULL,
61};
62 72
63typedef struct TI 73/**
74 * Global handle to MAGIC data.
75 */
76static magic_t magic;
77
78
79/**
80 * Map 'meta' tag to LE type.
81 *
82 * @param tag tag to map
83 * @return EXTRACTOR_METATYPE_RESERVED if the type was not found
84 */
85static enum EXTRACTOR_MetaType
86tag_to_type (const char *tag)
87{
88 unsigned int i;
89
90 for (i=0; NULL != tagmap[i].name; i++)
91 if (0 == strcasecmp (tag,
92 tagmap[i].name))
93 return tagmap[i].type;
94 return EXTRACTOR_METATYPE_RESERVED;
95}
96
97
98/**
99 * Function called by libtidy for error reporting.
100 *
101 * @param doc tidy doc being processed
102 * @param lvl report level
103 * @param line input line
104 * @param col input column
105 * @param mssg message
106 * @return FALSE (no output)
107 */
108static Bool
109report_cb (TidyDoc doc,
110 TidyReportLevel lvl,
111 uint line,
112 uint col,
113 ctmbstr mssg)
114{
115 return 0;
116}
117
118
119/**
120 * Input callback: get next byte of input.
121 *
122 * @param sourceData our 'struct EXTRACTOR_ExtractContext'
123 * @return next byte of input, EndOfStream on errors and EOF
124 */
125static int
126get_byte_cb (void *sourceData)
127{
128 struct EXTRACTOR_ExtractContext *ec = sourceData;
129 void *data;
130
131 if (1 !=
132 ec->read (ec->cls,
133 &data, 1))
134 return EndOfStream;
135 return *(unsigned char*) data;
136}
137
138
139/**
140 * Input callback: unget last byte of input.
141 *
142 * @param sourceData our 'struct EXTRACTOR_ExtractContext'
143 * @param bt byte to unget (ignored)
144 */
145static void
146unget_byte_cb (void *sourceData, byte bt)
147{
148 struct EXTRACTOR_ExtractContext *ec = sourceData;
149
150 (void) ec->seek (ec->cls, -1, SEEK_CUR);
151}
152
153
154/**
155 * Input callback: check for EOF.
156 *
157 * @param sourceData our 'struct EXTRACTOR_ExtractContext'
158 * @return true if we are at the EOF
159 */
160static Bool
161eof_cb (void *sourceData)
162{
163 struct EXTRACTOR_ExtractContext *ec = sourceData;
164
165 return ec->seek (ec->cls, 0, SEEK_CUR) == ec->get_size (ec->cls);
166}
167
168
169/**
170 * Main entry method for the 'text/html' extraction plugin.
171 *
172 * @param ec extraction context provided to the plugin
173 */
174void
175EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec)
64{ 176{
65 struct TI *next; 177 TidyDoc doc;
66 const char *tagStart; 178 TidyNode head;
67 const char *tagEnd; 179 TidyNode child;
68 const char *dataStart; 180 TidyNode title;
69 const char *dataEnd; 181 TidyInputSource src;
70} TagInfo; 182 const char *name;
183 TidyBuffer tbuf;
184 TidyAttr attr;
185 enum EXTRACTOR_MetaType type;
186 ssize_t iret;
187 void *data;
188 const char *mime;
71 189
190 if (-1 == (iret = ec->read (ec->cls,
191 &data,
192 16 * 1024)))
193 return;
194 if (NULL == (mime = magic_buffer (magic, data, iret)))
195 return;
196 if (0 != strncmp (mime,
197 "text/html",
198 strlen ("text/html")))
199 return; /* not HTML */
72 200
201 if (0 != ec->seek (ec->cls, 0, SEEK_SET))
202 return; /* seek failed !? */
203
204 tidyInitSource (&src, ec,
205 &get_byte_cb,
206 &unget_byte_cb,
207 &eof_cb);
208 if (NULL == (doc = tidyCreate ()))
209 return;
210 tidySetReportFilter (doc, &report_cb);
211 tidySetAppData (doc, ec);
212 if (0 > tidyParseSource (doc, &src))
213 {
214 tidyRelease (doc);
215 return;
216 }
217 if (1 != tidyStatus (doc))
218 {
219 tidyRelease (doc);
220 return;
221 }
222 if (NULL == (head = tidyGetHead (doc)))
223 {
224 fprintf (stderr, "no head\n");
225 tidyRelease (doc);
226 return;
227 }
228 for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child))
229 {
230 switch (tidyNodeGetType(child))
231 {
232 case TidyNode_Root:
233 break;
234 case TidyNode_DocType:
235 break;
236 case TidyNode_Comment:
237 break;
238 case TidyNode_ProcIns:
239 break;
240 case TidyNode_Text:
241 break;
242 case TidyNode_CDATA:
243 break;
244 case TidyNode_Section:
245 break;
246 case TidyNode_Asp:
247 break;
248 case TidyNode_Jste:
249 break;
250 case TidyNode_Php:
251 break;
252 case TidyNode_XmlDecl:
253 break;
254 case TidyNode_Start:
255 case TidyNode_StartEnd:
256 name = tidyNodeGetName (child);
257 if ( (0 == strcasecmp (name, "title")) &&
258 (NULL != (title = tidyGetChild (child))) )
259 {
260 tidyBufInit (&tbuf);
261 tidyNodeGetValue (doc, title, &tbuf);
262 /* add 0-termination */
263 tidyBufPutByte (&tbuf, 0);
264 if (0 !=
265 ec->proc (ec->cls,
266 "html",
267 EXTRACTOR_METATYPE_TITLE,
268 EXTRACTOR_METAFORMAT_UTF8,
269 "text/plain",
270 (const char *) tbuf.bp,
271 tbuf.size))
272 {
273 tidyBufFree (&tbuf);
274 goto CLEANUP;
275 }
276 tidyBufFree (&tbuf);
277 break;
278 }
279 if (0 == strcasecmp (name, "meta"))
280 {
281 if (NULL == (attr = tidyAttrGetById (child,
282 TidyAttr_NAME)))
283 break;
284 if (EXTRACTOR_METATYPE_RESERVED ==
285 (type = tag_to_type (tidyAttrValue (attr))))
286 break;
287 if (NULL == (attr = tidyAttrGetById (child,
288 TidyAttr_CONTENT)))
289 break;
290 name = tidyAttrValue (attr);
291 if (0 !=
292 ec->proc (ec->cls,
293 "html",
294 type,
295 EXTRACTOR_METAFORMAT_UTF8,
296 "text/plain",
297 name,
298 strlen (name) + 1))
299 goto CLEANUP;
300 break;
301 }
302 break;
303 case TidyNode_End:
304 break;
305 default:
306 break;
307 }
308 }
309 CLEANUP:
310 tidyRelease (doc);
311}
312
313
314
315#if OLD
73 316
74 317
75/* ******************** parser helper functions ************** */ 318/* ******************** parser helper functions ************** */
@@ -187,7 +430,7 @@ findEntry (const char *key,
187 * @return NULL if nothing is found 430 * @return NULL if nothing is found
188 */ 431 */
189static char * 432static char *
190findInTags (TagInfo * t, 433findInTags (struct TagInfo * t,
191 const char *tagname, 434 const char *tagname,
192 const char *keyname, const char *keyvalue, const char *searchname) 435 const char *keyname, const char *keyvalue, const char *searchname)
193{ 436{
@@ -228,9 +471,9 @@ EXTRACTOR_html_extract (const char *data,
228 const char *options) 471 const char *options)
229{ 472{
230 size_t xsize; 473 size_t xsize;
231 TagInfo *tags; 474 struct TagInfo *tags;
232 TagInfo *t; 475 struct TagInfo *t;
233 TagInfo tag; 476 struct TagInfo tag;
234 size_t pos; 477 size_t pos;
235 size_t tpos; 478 size_t tpos;
236 int i; 479 int i;
@@ -291,7 +534,7 @@ EXTRACTOR_html_extract (const char *data,
291 (0 == strncasecmp (relevantTags[i], 534 (0 == strncasecmp (relevantTags[i],
292 tag.tagStart, tag.tagEnd - tag.tagStart))) 535 tag.tagStart, tag.tagEnd - tag.tagStart)))
293 { 536 {
294 t = malloc (sizeof (TagInfo)); 537 t = malloc (sizeof (struct TagInfo));
295 if (t == NULL) 538 if (t == NULL)
296 return 0; 539 return 0;
297 *t = tag; 540 *t = tag;
@@ -418,3 +661,34 @@ EXTRACTOR_html_extract (const char *data,
418 free (charset); 661 free (charset);
419 return ret; 662 return ret;
420} 663}
664#endif
665
666
667/**
668 * Initialize glib and load magic file.
669 */
670void __attribute__ ((constructor))
671html_gobject_init ()
672{
673 magic = magic_open (MAGIC_MIME_TYPE);
674 if (0 != magic_load (magic, NULL))
675 {
676 /* FIXME: how to deal with errors? */
677 }
678}
679
680
681/**
682 * Destructor for the library, cleans up.
683 */
684void __attribute__ ((destructor))
685html_ltdl_fini ()
686{
687 if (NULL != magic)
688 {
689 magic_close (magic);
690 magic = NULL;
691 }
692}
693
694/* end of html_extractor.c */
diff --git a/src/plugins/man_extractor.c b/src/plugins/man_extractor.c
new file mode 100644
index 0000000..f074e5b
--- /dev/null
+++ b/src/plugins/man_extractor.c
@@ -0,0 +1,292 @@
1/*
2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19 */
20/**
21 * @file plugins/man_extractor.c
22 * @brief plugin to support man pages
23 * @author Christian Grothoff
24 */
25#include "platform.h"
26#include "extractor.h"
27#include <ctype.h>
28
29
/**
 * Duplicate the first 'n' bytes of 'str' into a freshly allocated,
 * 0-terminated string.  Exactly 'n' bytes are copied, so 'str' must
 * provide at least that many.
 *
 * @param str input string
 * @param n number of bytes to copy (the 0-terminator is added on top)
 * @return newly allocated copy (to be released with 'free'), NULL on error
 */
static char *
stndup (const char *str, size_t n)
{
  char *copy = malloc (n + 1);

  if (NULL != copy)
    {
      memcpy (copy, str, n);
      copy[n] = '\0';
    }
  return copy;
}
48
49
50/**
51 * Give a metadata item to LE. Removes double-quotes and
52 * makes sure we don't pass empty strings or NULL pointers.
53 *
54 * @param type metadata type to use
55 * @param keyword metdata value; freed in the process
56 * @param proc function to call with meta data
57 * @param proc_cls closure for 'proc'
58 * @return 0 to continue extracting, 1 if we are done
59 */
60static int
61add_keyword (enum EXTRACTOR_MetaType type,
62 char *keyword,
63 EXTRACTOR_MetaDataProcessor proc,
64 void *proc_cls)
65{
66 int ret;
67 char *value;
68
69 if (NULL == keyword)
70 return 0;
71 if ( (keyword[0] == '\"') &&
72 (keyword[strlen (keyword) - 1] == '\"') )
73 {
74 keyword[strlen (keyword) - 1] = '\0';
75 value = &keyword[1];
76 }
77 else
78 value = keyword;
79 if (0 == strlen (value))
80 {
81 free (keyword);
82 return 0;
83 }
84 ret = proc (proc_cls,
85 "man",
86 type,
87 EXTRACTOR_METAFORMAT_UTF8,
88 "text/plain",
89 value,
90 strlen (value)+1);
91 free (keyword);
92 return ret;
93}
94
95
/**
 * Advance to the end of the current token.  A token ends at the first
 * space that is not enclosed in double-quotes, or at the end of the
 * buffer.
 *
 * @param end on input the offset where the token begins; on output
 *        the offset just behind the token, or size + 1 if a quote
 *        was opened but never closed
 * @param buf input buffer with the characters
 * @param size number of bytes in buf
 */
static void
find_end_of_token (size_t *end,
                   const char *buf,
                   const size_t size)
{
  size_t off;
  int in_quotes;

  in_quotes = 0;
  for (off = *end; off < size; off++)
    {
      if ( (! in_quotes) &&
           (' ' == buf[off]) )
        break;
      if ('\"' == buf[off])
        in_quotes = ! in_quotes;
    }
  *end = in_quotes ? (size + 1) : off;
}
122
123
124/**
125 * How many bytes do we actually try to scan? (from the beginning
126 * of the file).
127 */
128#define MAX_READ (16 * 1024)
129
130
131/**
132 * Add a keyword to LE.
133 *
134 * @param t type to use
135 * @param s keyword to give to LE
136 */
137#define ADD(t,s) do { if (0 != add_keyword (t, s, ec->proc, ec->cls)) return; } while (0)
138
139
140/**
141 * Main entry method for the man page extraction plugin.
142 *
143 * @param ec extraction context provided to the plugin
144 */
145void
146EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec)
147{
148 const size_t xlen = strlen (".TH ");
149 size_t pos;
150 size_t xsize;
151 size_t end;
152 void *data;
153 ssize_t size;
154 char *buf;
155
156 if (0 >= (size = ec->read (ec->cls, &data, MAX_READ)))
157 return;
158 buf = data;
159 pos = 0;
160 if (size < xlen)
161 return;
162 /* find actual beginning of the man page (.TH);
163 abort if we find non-printable characters */
164 while ( (pos < size - xlen) &&
165 ( (0 != strncmp (".TH ",
166 &buf[pos],
167 xlen)) ||
168 ( (0 != pos) &&
169 (buf[pos - 1] != '\n') ) ) )
170 {
171 if ( (! isgraph ((unsigned char) buf[pos])) &&
172 (! isspace ((unsigned char) buf[pos])) )
173 return;
174 pos++;
175 }
176 if (0 != strncmp (".TH ", &buf[pos], xlen))
177 return;
178
179 /* find end of ".TH"-line */
180 xsize = pos;
181 while ( (xsize < size) && ('\n' != buf[xsize]) )
182 xsize++;
183 /* limit processing to ".TH" line */
184 size = xsize;
185
186 /* skip over ".TH" */
187 pos += xlen;
188
189 /* first token is the title */
190 end = pos;
191 find_end_of_token (&end, buf, size);
192 if (end > size)
193 return;
194 if (end > pos)
195 {
196 ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
197 pos = end + 1;
198 }
199 if (pos >= size)
200 return;
201
202 /* next token is the section */
203 end = pos;
204 find_end_of_token (&end, buf, size);
205 if (end > size)
206 return;
207 if ('\"' == buf[pos])
208 pos++;
209 if ((end - pos >= 1) && (end - pos <= 4))
210 {
211 switch (buf[pos])
212 {
213 case '1':
214 ADD (EXTRACTOR_METATYPE_SECTION,
215 strdup (_("Commands")));
216 break;
217 case '2':
218 ADD (EXTRACTOR_METATYPE_SECTION,
219 strdup (_("System calls")));
220 break;
221 case '3':
222 ADD (EXTRACTOR_METATYPE_SECTION,
223 strdup (_("Library calls")));
224 break;
225 case '4':
226 ADD (EXTRACTOR_METATYPE_SECTION,
227 strdup (_("Special files")));
228 break;
229 case '5':
230 ADD (EXTRACTOR_METATYPE_SECTION,
231 strdup (_("File formats and conventions")));
232 break;
233 case '6':
234 ADD (EXTRACTOR_METATYPE_SECTION,
235 strdup (_("Games")));
236 break;
237 case '7':
238 ADD (EXTRACTOR_METATYPE_SECTION,
239 strdup (_("Conventions and miscellaneous")));
240 break;
241 case '8':
242 ADD (EXTRACTOR_METATYPE_SECTION,
243 strdup (_("System management commands")));
244 break;
245 case '9':
246 ADD (EXTRACTOR_METATYPE_SECTION,
247 strdup (_("Kernel routines")));
248 break;
249 default:
250 ADD (EXTRACTOR_METATYPE_SECTION,
251 stndup (&buf[pos], 1));
252 }
253 pos = end + 1;
254 }
255 end = pos;
256
257 /* next token is the modification date */
258 find_end_of_token (&end, buf, size);
259 if (end > size)
260 return;
261 if (end > pos)
262 {
263 ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos));
264 pos = end + 1;
265 }
266
267 /* next token is the source of the man page */
268 end = pos;
269 find_end_of_token (&end, buf, size);
270 if (end > size)
271 return;
272 if (end > pos)
273 {
274 ADD (EXTRACTOR_METATYPE_SOURCE,
275 stndup (&buf[pos], end - pos));
276 pos = end + 1;
277 }
278
279 /* last token is the title of the book the man page belongs to */
280 end = pos;
281 find_end_of_token (&end, buf, size);
282 if (end > size)
283 return;
284 if (end > pos)
285 {
286 ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
287 stndup (&buf[pos], end - pos));
288 pos = end + 1;
289 }
290}
291
292/* end of man_extractor.c */
diff --git a/src/plugins/old/man_extractor.c b/src/plugins/old/man_extractor.c
deleted file mode 100644
index eeb40a8..0000000
--- a/src/plugins/old/man_extractor.c
+++ /dev/null
@@ -1,232 +0,0 @@
1/*
2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2009 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19 */
20
21#include "platform.h"
22#include "extractor.h"
23#include <ctype.h>
24
25static char *
26stndup (const char *str, size_t n)
27{
28 char *tmp;
29 tmp = malloc (n + 1);
30 if (tmp == NULL)
31 return NULL;
32 tmp[n] = '\0';
33 memcpy (tmp, str, n);
34 return tmp;
35}
36
37static int
38addKeyword (enum EXTRACTOR_MetaType type,
39 char *keyword,
40 EXTRACTOR_MetaDataProcessor proc,
41 void *proc_cls)
42{
43 int ret;
44 if (keyword == NULL)
45 return 0;
46 if (strlen (keyword) == 0)
47 {
48 free (keyword);
49 return 0;
50 }
51 if ((keyword[0] == '\"') && (keyword[strlen (keyword) - 1] == '\"'))
52 {
53 char *tmp;
54
55 keyword[strlen (keyword) - 1] = '\0';
56 tmp = strdup (&keyword[1]);
57 free (keyword);
58 if (tmp == NULL)
59 return 0;
60 keyword = tmp;
61 }
62 if (strlen (keyword) == 0)
63 {
64 free (keyword);
65 return 0;
66 }
67 ret = proc (proc_cls,
68 "man",
69 type,
70 EXTRACTOR_METAFORMAT_UTF8,
71 "text/plain",
72 keyword,
73 strlen (keyword)+1);
74 free (keyword);
75 return ret;
76}
77
78static void
79NEXT (size_t * end, const char *buf, const size_t size)
80{
81 int quot;
82
83 quot = 0;
84 while ((*end < size) && (((quot & 1) != 0) || ((buf[*end] != ' '))))
85 {
86 if (buf[*end] == '\"')
87 quot++;
88 (*end)++;
89 }
90 if ((quot & 1) == 1)
91 (*end) = size + 1;
92}
93
94/**
95 * How many bytes do we actually try to scan? (from the beginning
96 * of the file).
97 */
98#define MAX_READ (16 * 1024)
99
100#define ADD(t,s) do { if (0 != addKeyword (t, s, proc, proc_cls)) return 1; } while (0)
101
102int
103EXTRACTOR_man_extract (const char *buf,
104 size_t size,
105 EXTRACTOR_MetaDataProcessor proc,
106 void *proc_cls,
107 const char *options)
108{
109 int pos;
110 size_t xsize;
111 const size_t xlen = strlen (".TH ");
112
113 if (size > MAX_READ)
114 size = MAX_READ;
115 pos = 0;
116 if (size < xlen)
117 return 0;
118 while ((pos < size - xlen) &&
119 ((0 != strncmp (".TH ",
120 &buf[pos],
121 xlen)) || ((pos != 0) && (buf[pos - 1] != '\n'))))
122 {
123 if (!isgraph ((unsigned char) buf[pos]) &&
124 !isspace ((unsigned char) buf[pos]))
125 return 0;
126 pos++;
127 }
128 xsize = pos;
129 while ((xsize < size) && (buf[xsize] != '\n'))
130 xsize++;
131 size = xsize;
132
133 if (0 == strncmp (".TH ", &buf[pos], xlen))
134 {
135 size_t end;
136
137 pos += xlen;
138 end = pos;
139 NEXT (&end, buf, size);
140 if (end > size)
141 return 0;
142 if (end - pos > 0)
143 {
144 ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
145 pos = end + 1;
146 }
147 if (pos >= size)
148 return 0;
149 end = pos;
150 NEXT (&end, buf, size);
151 if (end > size)
152 return 0;
153 if (buf[pos] == '\"')
154 pos++;
155 if ((end - pos >= 1) && (end - pos <= 4))
156 {
157 switch (buf[pos])
158 {
159 case '1':
160 ADD (EXTRACTOR_METATYPE_SECTION,
161 strdup (_("Commands")));
162 break;
163 case '2':
164 ADD (EXTRACTOR_METATYPE_SECTION,
165 strdup (_("System calls")));
166 break;
167 case '3':
168 ADD (EXTRACTOR_METATYPE_SECTION,
169 strdup (_("Library calls")));
170 break;
171 case '4':
172 ADD (EXTRACTOR_METATYPE_SECTION,
173 strdup (_("Special files")));
174 break;
175 case '5':
176 ADD (EXTRACTOR_METATYPE_SECTION,
177 strdup (_("File formats and conventions")));
178 break;
179 case '6':
180 ADD (EXTRACTOR_METATYPE_SECTION,
181 strdup (_("Games")));
182 break;
183 case '7':
184 ADD (EXTRACTOR_METATYPE_SECTION,
185 strdup (_("Conventions and miscellaneous")));
186 break;
187 case '8':
188 ADD (EXTRACTOR_METATYPE_SECTION,
189 strdup (_("System management commands")));
190 break;
191 case '9':
192 ADD (EXTRACTOR_METATYPE_SECTION,
193 strdup (_("Kernel routines")));
194 break;
195 }
196 pos = end + 1;
197 }
198 end = pos;
199 NEXT (&end, buf, size);
200 if (end > size)
201 return 0;
202 if (end - pos > 0)
203 {
204 ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos));
205 pos = end + 1;
206 }
207 end = pos;
208 NEXT (&end, buf, size);
209 if (end > size)
210 return 0;
211 if (end - pos > 0)
212 {
213 ADD (EXTRACTOR_METATYPE_SOURCE,
214 stndup (&buf[pos], end - pos));
215 pos = end + 1;
216 }
217 end = pos;
218 NEXT (&end, buf, size);
219 if (end > size)
220 return 0;
221 if (end - pos > 0)
222 {
223 ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
224 stndup (&buf[pos], end - pos));
225 pos = end + 1;
226 }
227 }
228
229 return 0;
230}
231
232/* end of man_extractor.c */
diff --git a/src/plugins/old/riff_extractor.c b/src/plugins/old/riff_extractor.c
deleted file mode 100644
index f6cd7f6..0000000
--- a/src/plugins/old/riff_extractor.c
+++ /dev/null
@@ -1,123 +0,0 @@
1/*
2 This file is part of libextractor.
3 (C) 2004, 2009 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19
20 This code was based on AVInfo 1.0 alpha 11
21 (c) George Shuklin, gs]AT[shounen.ru, 2002-2004
22 http://shounen.ru/soft/avinfo/
23
24 and bitcollider 0.6.0
25 (PD) 2004 The Bitzi Corporation
26 http://bitzi.com/
27 */
28
29#include "platform.h"
30#include "extractor.h"
31#include <math.h>
32
33/**
34 * Read the specified number of bytes as a little-endian (least
35 * significant byte first) integer.
36 */
37static unsigned int
38fread_le (const char *data)
39{
40 int x;
41 unsigned int result = 0;
42
43 for (x = 0; x < 4; x++)
44 result |= ((unsigned char) data[x]) << (x * 8);
45 return result;
46}
47
48/* We implement our own rounding function, because the availability of
49 * C99's round(), nearbyint(), rint(), etc. seems to be spotty, whereas
50 * floor() is available in math.h on all C compilers.
51 */
52static double
53round_double (double num)
54{
55 return floor (num + 0.5);
56}
57
58#define ADD(s,t) do { if (0 != (ret = proc (proc_cls, "riff", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen(s)+1))) goto FINISH; } while (0)
59
60/* video/x-msvideo */
61int
62EXTRACTOR_riff_extract (const char *xdata,
63 size_t xsize,
64 EXTRACTOR_MetaDataProcessor proc,
65 void *proc_cls,
66 const char *options)
67{
68 unsigned int blockLen;
69 unsigned int fps;
70 unsigned int duration;
71 size_t pos;
72 unsigned int width;
73 unsigned int height;
74 char codec[5];
75 char format[256];
76 int ret;
77
78 if (xsize < 32)
79 return 0;
80 if ((memcmp (&xdata[0],
81 "RIFF", 4) != 0) || (memcmp (&xdata[8], "AVI ", 4) != 0))
82 return 0;
83 if (memcmp (&xdata[12], "LIST", 4) != 0)
84 return 0;
85 if (memcmp (&xdata[20], "hdrlavih", 8) != 0)
86 return 0;
87
88 blockLen = fread_le (&xdata[28]);
89
90 /* begin of AVI header at 32 */
91 fps = (unsigned int) round_double ((double) 1.0e6 / fread_le (&xdata[32]));
92 duration = (unsigned int) round_double ((double) fread_le (&xdata[48])
93 * 1000 / fps);
94 width = fread_le (&xdata[64]);
95 height = fread_le (&xdata[68]);
96 /* pos: begin of video stream header */
97 pos = blockLen + 32;
98
99 if ((pos < blockLen) || (pos + 32 > xsize) || (pos > xsize))
100 return 0;
101 if (memcmp (&xdata[pos], "LIST", 4) != 0)
102 return 0;
103 blockLen = fread_le (&xdata[pos + 4]);
104 if (memcmp (&xdata[pos + 8], "strlstrh", 8) != 0)
105 return 0;
106 if (memcmp (&xdata[pos + 20], "vids", 4) != 0)
107 return 0;
108 ret = 0;
109 /* pos + 24: video stream header */
110 memcpy (codec, &xdata[pos + 24], 4);
111 codec[4] = '\0';
112 snprintf (format,
113 sizeof(format),
114 _("codec: %s, %u fps, %u ms"), codec, fps, duration);
115 ADD (format, EXTRACTOR_METATYPE_FORMAT);
116 snprintf (format,
117 sizeof(format),
118 "%ux%u", width, height);
119 ADD (format, EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
120 ADD ("video/x-msvideo", EXTRACTOR_METATYPE_MIMETYPE);
121 FINISH:
122 return ret;
123}
diff --git a/src/plugins/riff_extractor.c b/src/plugins/riff_extractor.c
new file mode 100644
index 0000000..b9cb5b3
--- /dev/null
+++ b/src/plugins/riff_extractor.c
@@ -0,0 +1,157 @@
1/*
2 This file is part of libextractor.
3 (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19
20 This code was based on AVInfo 1.0 alpha 11
21 (c) George Shuklin, gs]AT[shounen.ru, 2002-2004
22 http://shounen.ru/soft/avinfo/
23
24 and bitcollider 0.6.0
25 (PD) 2004 The Bitzi Corporation
26 http://bitzi.com/
27 */
28/**
29 * @file plugins/riff_extractor.c
30 * @brief plugin to support RIFF files (ms-video)
31 * @author Christian Grothoff
32 */
33#include "platform.h"
34#include "extractor.h"
35#include <math.h>
36
37
/**
 * Read an uint32_t as a little-endian (least
 * significant byte first) integer from 'data'
 *
 * @param data input data, at least 4 bytes must be readable
 * @return integer read
 */
static uint32_t
fread_le (const char *data)
{
  uint32_t result = 0;
  unsigned int x;

  /* widen each byte to uint32_t *before* shifting: an 'unsigned char'
     is promoted to (signed) 'int', and shifting a value >= 0x80 left
     by 24 bits would overflow that int (undefined behaviour) */
  for (x = 0; x < 4; x++)
    result |= ((uint32_t) (unsigned char) data[x]) << (x * 8);
  return result;
}
55
56
57/**
58 * We implement our own rounding function, because the availability of
59 * C99's round(), nearbyint(), rint(), etc. seems to be spotty, whereas
60 * floor() is available in math.h on all C compilers.
61 *
62 * @param num value to round
63 * @return rounded-to-nearest value
64 */
65static double
66round_double (double num)
67{
68 return floor (num + 0.5);
69}
70
71
72/**
73 * Pass the given UTF-8 string to the 'proc' callback using
74 * the given type. Uses 'return' if 'proc' returns non-0.
75 *
76 * @param s 0-terminated UTF8 string value with the meta data
77 * @param t libextractor type for the meta data
78 */
79#define ADD(s,t) do { if (0 != ec->proc (ec->cls, "riff", t, EXTRACTOR_METAFORMAT_UTF8, "text/plain", s, strlen (s) + 1)) return; } while (0)
80
81
82/**
83 * Main entry method for the 'video/x-msvideo' extraction plugin.
84 *
85 * @param ec extraction context provided to the plugin
86 */
87void
88EXTRACTOR_riff_extract_method (struct EXTRACTOR_ExtractContext *ec)
89{
90 ssize_t xsize;
91 void *data;
92 char *xdata;
93 uint32_t blockLen;
94 unsigned int fps;
95 unsigned int duration;
96 uint64_t pos;
97 uint32_t width;
98 uint32_t height;
99 char codec[5];
100 char format[256];
101
102 /* read header */
103 if (72 > (xsize = ec->read (ec->cls, &data, 72)))
104 return;
105 xdata = data;
106
107 /* check magic values */
108 if ( (0 != memcmp (&xdata[0],
109 "RIFF", 4)) ||
110 (0 != memcmp (&xdata[8], "AVI ", 4)) ||
111 (0 != memcmp (&xdata[12], "LIST", 4)) ||
112 (0 != memcmp (&xdata[20], "hdrlavih", 8)) )
113 return;
114
115 blockLen = fread_le (&xdata[28]);
116
117 /* begin of AVI header at 32 */
118 fps = (unsigned int) round_double ((double) 1.0e6 / fread_le (&xdata[32]));
119 duration = (unsigned int) round_double ((double) fread_le (&xdata[48])
120 * 1000 / fps);
121 width = fread_le (&xdata[64]);
122 height = fread_le (&xdata[68]);
123
124 /* pos: begin of video stream header */
125 pos = blockLen + 32;
126
127 if (pos !=
128 ec->seek (ec->cls, pos, SEEK_SET))
129 return;
130 if (32 > ec->read (ec->cls, &data, 32))
131 return;
132 xdata = data;
133
134 /* check magic */
135 if ( (0 != memcmp (xdata, "LIST", 4)) ||
136 (0 != memcmp (&xdata[8], "strlstrh", 8)) ||
137 (0 != memcmp (&xdata[20], "vids", 4)) )
138 return;
139
140 /* pos + 24: video stream header with codec */
141 memcpy (codec, &xdata[24], 4);
142 codec[4] = '\0';
143 snprintf (format,
144 sizeof (format),
145 _("codec: %s, %u fps, %u ms"),
146 codec, fps, duration);
147 ADD (format, EXTRACTOR_METATYPE_FORMAT);
148 snprintf (format,
149 sizeof (format),
150 "%ux%u",
151 (unsigned int) width,
152 (unsigned int) height);
153 ADD (format, EXTRACTOR_METATYPE_IMAGE_DIMENSIONS);
154 ADD ("video/x-msvideo", EXTRACTOR_METATYPE_MIMETYPE);
155}
156
157/* end of riff_extractor.c */
diff --git a/src/plugins/test_html.c b/src/plugins/test_html.c
new file mode 100644
index 0000000..150dac1
--- /dev/null
+++ b/src/plugins/test_html.c
@@ -0,0 +1,124 @@
1/*
2 This file is part of libextractor.
3 (C) 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19*/
20/**
21 * @file plugins/test_html.c
22 * @brief testcase for html plugin
23 * @author Christian Grothoff
24 */
25#include "platform.h"
26#include "test_lib.h"
27
28
29/**
30 * Main function for the HTML testcase.
31 *
32 * @param argc number of arguments (ignored)
33 * @param argv arguments (ignored)
34 * @return 0 on success
35 */
36int
37main (int argc, char *argv[])
38{
39 struct SolutionData html_grothoff_sol[] =
40 {
41 {
42 EXTRACTOR_METATYPE_TITLE,
43 EXTRACTOR_METAFORMAT_UTF8,
44 "text/plain",
45 "Christian Grothoff",
46 strlen ("Christian Grothoff") + 1,
47 0
48 },
49 {
50 EXTRACTOR_METATYPE_DESCRIPTION,
51 EXTRACTOR_METAFORMAT_UTF8,
52 "text/plain",
53 "Homepage of Christian Grothoff",
54 strlen ("Homepage of Christian Grothoff") + 1,
55 0
56 },
57 {
58 EXTRACTOR_METATYPE_AUTHOR_NAME,
59 EXTRACTOR_METAFORMAT_UTF8,
60 "text/plain",
61 "Christian Grothoff",
62 strlen ("Christian Grothoff") + 1,
63 0
64 },
65 {
66 EXTRACTOR_METATYPE_KEYWORDS,
67 EXTRACTOR_METAFORMAT_UTF8,
68 "text/plain",
69 "Christian,Grothoff",
70 strlen ("Christian,Grothoff") + 1,
71 0
72 },
73 {
74 EXTRACTOR_METATYPE_TITLE,
75 EXTRACTOR_METAFORMAT_UTF8,
76 "text/plain",
77 "Welcome to Christian Grothoff",
78 strlen ("Welcome to Christian Grothoff") + 1,
79 0
80 },
81 {
82 EXTRACTOR_METATYPE_LANGUAGE,
83 EXTRACTOR_METAFORMAT_UTF8,
84 "text/plain",
85 "en",
86 strlen ("en") + 1,
87 0
88 },
89 {
90 EXTRACTOR_METATYPE_PUBLISHER,
91 EXTRACTOR_METAFORMAT_UTF8,
92 "text/plain",
93 "Christian Grothoff",
94 strlen ("Christian Grothoff") + 1,
95 0
96 },
97 {
98 EXTRACTOR_METATYPE_UNKNOWN_DATE,
99 EXTRACTOR_METAFORMAT_UTF8,
100 "text/plain",
101 "2000-08-20",
102 strlen ("2000-08-20") + 1,
103 0
104 },
105 {
106 EXTRACTOR_METATYPE_RIGHTS,
107 EXTRACTOR_METAFORMAT_UTF8,
108 "text/plain",
109 "(C) 2000 by Christian Grothoff",
110 strlen ("(C) 2000 by Christian Grothoff") + 1,
111 0
112 },
113 { 0, 0, NULL, NULL, 0, -1 }
114 };
115 struct ProblemSet ps[] =
116 {
117 { "testdata/html_grothoff.html",
118 html_grothoff_sol },
119 { NULL, NULL }
120 };
121 return ET_main ("html", ps);
122}
123
124/* end of test_html.c */
diff --git a/src/plugins/test_man.c b/src/plugins/test_man.c
new file mode 100644
index 0000000..a7c33d3
--- /dev/null
+++ b/src/plugins/test_man.c
@@ -0,0 +1,85 @@
1/*
2 This file is part of libextractor.
3 (C) 2012 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19*/
20/**
21 * @file plugins/test_man.c
22 * @brief testcase for man plugin
23 * @author Christian Grothoff
24 */
25#include "platform.h"
26#include "test_lib.h"
27
28
29
30/**
31 * Main function for the MAN testcase.
32 *
33 * @param argc number of arguments (ignored)
34 * @param argv arguments (ignored)
35 * @return 0 on success
36 */
37int
38main (int argc, char *argv[])
39{
40 struct SolutionData man_extract_sol[] =
41 {
42 {
43 EXTRACTOR_METATYPE_TITLE,
44 EXTRACTOR_METAFORMAT_UTF8,
45 "text/plain",
46 "EXTRACT",
47 strlen ("EXTRACT") + 1,
48 0
49 },
50 {
51 EXTRACTOR_METATYPE_SECTION,
52 EXTRACTOR_METAFORMAT_UTF8,
53 "text/plain",
54 _("Commands"),
55 strlen (_("Commands")) + 1,
56 0
57 },
58 {
59 EXTRACTOR_METATYPE_MODIFICATION_DATE,
60 EXTRACTOR_METAFORMAT_UTF8,
61 "text/plain",
62 "Aug 7, 2012",
63 strlen ("Aug 7, 2012") + 1,
64 0
65 },
66 {
67 EXTRACTOR_METATYPE_SOURCE,
68 EXTRACTOR_METAFORMAT_UTF8,
69 "text/plain",
70 _("libextractor 0.7.0"),
71 strlen (_("libextractor 0.7.0")) + 1,
72 0
73 },
74 { 0, 0, NULL, NULL, 0, -1 }
75 };
76 struct ProblemSet ps[] =
77 {
78 { "testdata/man_extract.1",
79 man_extract_sol },
80 { NULL, NULL }
81 };
82 return ET_main ("man", ps);
83}
84
85/* end of test_man.c */
diff --git a/src/plugins/testdata/html_grothoff.html b/src/plugins/testdata/html_grothoff.html
new file mode 100644
index 0000000..fc7c620
--- /dev/null
+++ b/src/plugins/testdata/html_grothoff.html
@@ -0,0 +1,44 @@
1<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN">
2<html lang="en">
3<head>
4<title>Christian Grothoff</title>
5<meta name="description" content="Homepage of Christian Grothoff">
6<meta name="author" content="Christian Grothoff">
7<meta name="keywords" content="Christian,Grothoff">
8<meta name="robots" content="index,follow">
9<meta name="revisit-after" content="28 days">
10<meta name="title" content="Welcome to Christian Grothoff">
11<meta name="content-language" content="en">
12<meta name="language" content="en">
13<meta name="publisher" content="Christian Grothoff">
14<meta name="date" content="2000-08-20">
15<meta name="rights" content="(C) 2000 by Christian Grothoff">
16<meta http-equiv="expires" content="43200">
17<meta http-equiv="content-type" content="text/html;CHARSET=iso8859-1">
18<meta http-equiv="Content-Style-Type" content="text/css">
19<link rel=stylesheet type="text/css" href="grothoff.css">
20<script language="JavaScript">
21<!--
22 if(top.frames.length > 0)
23 top.location.href=self.location;
24//-->
25</script>
26</head>
27<frameset cols="180,*" border=5 frameborder=5 framespacing=5 bordercolor="#000000">
28<frame src="navigation.php3?currenttopic=Welcome" name="navigation">
29<frame src="welcome.php3" name="contentwindow">
30</frameset>
31<body>
32<h1>Welcome to Christian Grothoff</h1>
33<hr class="big">
34<br clear=all>
35<ul>
36<li><A href="welcome.php3">Welcome</A></li>
37<li><A href="cs/">Computer Science</A></li>
38<li><A href="linux/">Linux</A></li>
39<li><A href="http://www.stud.uni-wuppertal.de/~ma0035/">Willkommen (my german homepage)</A></li>
40</ul>
41<hr>
42<A href="mailto:grothoff@cs.purdue.edu"><em>grothoff@cs.purdue.edu</em></A>
43</body>
44</html>
diff --git a/src/plugins/testdata/man_extract.1 b/src/plugins/testdata/man_extract.1
new file mode 100644
index 0000000..500c061
--- /dev/null
+++ b/src/plugins/testdata/man_extract.1
@@ -0,0 +1,109 @@
1.TH EXTRACT 1 "Aug 7, 2012" "libextractor 0.7.0"
2.\" $Id
3.SH NAME
4extract
5\- determine meta-information about a file
6.SH SYNOPSIS
7.B extract
8[
9.B \-bgihLmnvV
10]
11[
12.B \-l
13.I library
14]
15[
16.B \-p
17.I type
18]
19[
20.B \-x
21.I type
22]
23.I file
24\&...
25.br
26.SH DESCRIPTION
27This manual page documents version 0.7.0 of the
28.B extract
29command.
30.PP
31.B extract
32tests each file specified in the argument list in an attempt to infer meta\-information from it. Each file is subjected to the meta\-data extraction libraries from
33.I libextractor.
34.PP
35libextractor classifies meta\-information (also referred to as keywords) into types. A list of all types can be obtained with the
36.B \-L
37option.
38
39.SH OPTIONS
40.TP 8
41.B \-b
42Display the output in BiBTeX format.
43.TP 8
44.B \-g
45Use grep\-friendly output (all keywords on a single line for each file). Use the verbose option to print the filename first, followed by the keywords. Use the verbose option twice to also display the keyword types. This option will not print keyword types or non\-textual metadata.
46.TP 8
47.B \-h
48Print a brief summary of the options.
49.TP 8
50.B \-i
51Run plugins in\-process (for debugging). By default, each plugin is run in its own process.
52.TP 8
53.BI \-l " libraries"
54Use the specified libraries to extract keywords. The general format of libraries is .I [[\-]LIBRARYNAME[:[\-]LIBRARYNAME]*] where LIBRARYNAME is a libextractor compatible library and typically of the form .Ijpeg\. The minus before the libraryname indicates that this library should be removed from the existing list. To run only a few selected plugins, use \-l in combination with \-n.
55.TP 8
56.B \-L
57Print a list of all known keyword types.
58.TP 8
59.B \-m
60Load the file into memory and perform extraction from memory (for debugging).
61.TP 8
62.B \-n
63Do not use the default set of extractors (typically all standard extractors, currently mp3, ogg, jpg, gif, png, tiff, real, html, pdf and mime\-types), use only the extractors specified with the .B \-l option.
64.TP
65.B \-p " type"
66Print only the keywords matching the specified type. By default, all keywords that are found and not removed as duplicates are printed.
67.TP 8
68.B \-v
69Print the version number and exit.
70.TP 8
71.B \-V
72Be verbose. This option can be specified multiple times to increase verbosity further.
73.TP 8
74.I \-x " type"
75Exclude keywords of the specified type from the output. By default, all keywords that are found and not removed as duplicates are printed.
76.SH SEE ALSO
77.BR libextractor (3)
78\- description of the libextractor library
79.br
80.SH EXAMPLES
81.nf
82$ extract test/test.jpg
83comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1
84mimetype \- image/jpeg
85
86$ extract \-V \-x comment test/test.jpg
87Keywords for file test/test.jpg:
88mimetype \- image/jpeg
89
90$ extract \-p comment test/test.jpg
91comment \- (C) 2001 by Christian Grothoff, using gimp 1.2 1
92
93$ extract \-nV \-l png.so \-p comment test/test.jpg test/test.png
94Keywords for file test/test.jpg:
95Keywords for file test/test.png:
96comment \- Testing keyword extraction
97
98.SH LEGAL NOTICE
99libextractor and the extract tool are released under the GPL. libextractor is a GNU package.
100
101.SH BUGS
102A couple of file\-formats (on the order of 10^3) are not recognized...
103
104.SH AUTHORS
105.B extract
106was originally written by Christian Grothoff <christian@grothoff.org> and Vidyut Samanta <vids@cs.ucla.edu>. Use <libextractor@gnu.org> to contact the current maintainer(s).
107
108.SH AVAILABILITY
109You can obtain the original author's latest version from http://www.gnu.org/software/libextractor/
diff --git a/src/plugins/thumbnailgtk_extractor.c b/src/plugins/thumbnailgtk_extractor.c
index a68df7b..3676177 100644
--- a/src/plugins/thumbnailgtk_extractor.c
+++ b/src/plugins/thumbnailgtk_extractor.c
@@ -74,10 +74,9 @@ EXTRACTOR_thumbnailgtk_extract_method (struct EXTRACTOR_ExtractContext *ec)
74 void *buf; 74 void *buf;
75 const char *mime; 75 const char *mime;
76 76
77 iret = ec->read (ec->cls, 77 if (-1 == (iret = ec->read (ec->cls,
78 &data, 78 &data,
79 16 * 1024); 79 16 * 1024)))
80 if (-1 == iret)
81 return; 80 return;
82 if (NULL == (mime = magic_buffer (magic, data, iret))) 81 if (NULL == (mime = magic_buffer (magic, data, iret)))
83 return; 82 return;