aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/old/html_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/old/html_extractor.c')
-rw-r--r--src/plugins/old/html_extractor.c420
1 files changed, 420 insertions, 0 deletions
diff --git a/src/plugins/old/html_extractor.c b/src/plugins/old/html_extractor.c
new file mode 100644
index 0000000..004d22a
--- /dev/null
+++ b/src/plugins/old/html_extractor.c
@@ -0,0 +1,420 @@
1/*
2 This file is part of libextractor.
3 (C) 2002, 2003, 2004, 2005, 2009 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19
20 */
21
22#include "platform.h"
23#include "extractor.h"
24#include <string.h>
25#include "convert.h"
26
27static struct
28{
29 const char *name;
30 enum EXTRACTOR_MetaType type;
31} tagmap[] = {
32 { "author", EXTRACTOR_METATYPE_AUTHOR_NAME },
33 { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME },
34 { "title", EXTRACTOR_METATYPE_TITLE },
35 { "dc.title", EXTRACTOR_METATYPE_TITLE},
36 { "description", EXTRACTOR_METATYPE_DESCRIPTION },
37 { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION },
38 { "subject", EXTRACTOR_METATYPE_SUBJECT},
39 { "dc.subject", EXTRACTOR_METATYPE_SUBJECT},
40 { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
41 { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE},
42 { "publisher", EXTRACTOR_METATYPE_PUBLISHER },
43 { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER},
44 { "rights", EXTRACTOR_METATYPE_RIGHTS },
45 { "dc.rights", EXTRACTOR_METATYPE_RIGHTS },
46 { "copyright", EXTRACTOR_METATYPE_COPYRIGHT },
47 { "language", EXTRACTOR_METATYPE_LANGUAGE },
48 { "keywords", EXTRACTOR_METATYPE_KEYWORDS },
49 { "abstract", EXTRACTOR_METATYPE_ABSTRACT },
50 { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
51 { "dc.creator", EXTRACTOR_METATYPE_CREATOR},
52 { "dc.identifier", EXTRACTOR_METATYPE_URI },
53 { "dc.format", EXTRACTOR_METATYPE_FORMAT },
54 { NULL, EXTRACTOR_METATYPE_RESERVED }
55};
56
/**
 * Names of the tags that the parser keeps for later inspection; all
 * other tags are skipped.  NULL-terminated.  Neither the strings nor
 * the array are ever modified, so both are const.
 */
static const char *const relevantTags[] = {
  "title",
  "meta",
  NULL
};
62
/**
 * Description of one tag found in the document.  All pointers refer
 * into the buffer that is being parsed; nothing is copied.
 */
struct TI
{
  /* next entry in the singly linked list of remembered tags */
  struct TI *next;

  /* first character of the tag name (the one after '<') */
  const char *tagStart;

  /* first character after the tag name */
  const char *tagEnd;

  /* first non-whitespace character after the closing '>' */
  const char *dataStart;

  /* position of the next '<' after dataStart */
  const char *dataEnd;
};

typedef struct TI TagInfo;
71
72
73
74
75/* ******************** parser helper functions ************** */
76
/**
 * Case-insensitively compare the 0-terminated string "tag" with the
 * character range [s, e).
 *
 * @param tag 0-terminated name to compare against
 * @param s first character of the range
 * @param e first character after the range
 * @return 1 if the range has the same length as tag and the same
 *         characters (ignoring case), 0 otherwise
 */
static int
tagMatch (const char *tag, const char *s, const char *e)
{
  size_t span = e - s;

  if (span != strlen (tag))
    return 0;
  return 0 == strncasecmp (tag, s, span);
}
82
/**
 * Advance *pos to the next occurrence of the character c in data.
 *
 * @param c character to search for
 * @param pos in: offset to start at; out: offset of the match, or the
 *        offset at which the scan ran out of input; left untouched if
 *        a 0-byte was encountered before c
 * @param data buffer to search
 * @param size number of bytes in data
 * @return 1 if c was found before the end of the buffer, 0 if the end
 *         of the buffer or a 0-byte was reached first
 */
static int
lookFor (char c, size_t * pos, const char *data, size_t size)
{
  size_t idx;

  for (idx = *pos; idx < size; idx++)
    {
      if (c == data[idx])
        break;
      if ('\0' == data[idx])
        return 0;
    }
  *pos = idx;
  return idx < size;
}
97
/**
 * Advance *pos past any whitespace (as classified by isspace).
 *
 * A 0-byte is not whitespace, so the scan simply stops in front of it;
 * the former check for a 0-byte inside the loop could never trigger
 * and has been removed.
 *
 * @param pos in: offset to start at; out: offset of the first
 *        non-whitespace byte, or the offset at which the input ended
 * @param data buffer to scan
 * @param size number of bytes in data
 * @return 1 if a non-whitespace byte was found before the end of the
 *         buffer, 0 if the end of the buffer was reached
 */
static int
skipWhitespace (size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  /* cast: passing a negative char to isspace would be undefined */
  while ((p < size) && (isspace ((unsigned char) data[p])))
    p++;
  *pos = p;
  return p < size;
}
112
/**
 * Advance *pos past any alphabetic characters (as classified by
 * isalpha).
 *
 * A 0-byte is not alphabetic, so the scan simply stops in front of it;
 * the former check for a 0-byte inside the loop could never trigger
 * and has been removed.
 *
 * @param pos in: offset to start at; out: offset of the first
 *        non-alphabetic byte, or the offset at which the input ended
 * @param data buffer to scan
 * @param size number of bytes in data
 * @return 1 if a non-alphabetic byte was found before the end of the
 *         buffer, 0 if the end of the buffer was reached
 */
static int
skipLetters (size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  /* cast: passing a negative char to isalpha would be undefined */
  while ((p < size) && (isalpha ((unsigned char) data[p])))
    p++;
  *pos = p;
  return p < size;
}
127
/**
 * Advance *pos to the next byte in data that is one of the characters
 * listed in c.
 *
 * The 0-byte test has to come before the strchr call: strchr treats
 * the terminating 0 of "c" as part of the string, so strchr (c, '\0')
 * is never NULL and a 0-byte in the input would otherwise be reported
 * as a match.
 *
 * @param c 0-terminated set of characters to search for
 * @param pos in: offset to start at; out: offset of the match, or the
 *        offset at which the scan ran out of input; left untouched if
 *        a 0-byte was encountered first
 * @param data buffer to search
 * @param size number of bytes in data
 * @return 1 if one of the characters was found before the end of the
 *         buffer, 0 if the end of the buffer or a 0-byte was reached
 *         first
 */
static int
lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  while (p < size)
    {
      if (data[p] == '\0')
        return 0;
      if (strchr (c, data[p]) != NULL)
        break;
      p++;
    }
  *pos = p;
  return p < size;
}
142
/**
 * Find the value of the attribute "key" within the range [start, end).
 *
 * The range is searched for the first position (after its very first
 * character, which is always skipped) where "key" -- compared without
 * regard to case -- is directly followed by '='.  A value that starts
 * with a single or double quote extends to the matching quote (or to
 * the end of the range if there is none); the quotes are not part of
 * the result.  An unquoted value extends to the next whitespace, the
 * closing '>' of the tag, or the end of the range.
 *
 * @param key 0-terminated attribute name
 * @param start first character of the range to search
 * @param end first character after the range
 * @param mstart set to the first character of the value, NULL if the
 *        attribute was not found
 * @param mend set to the first character after the value, NULL if the
 *        attribute was not found
 */
static void
findEntry (const char *key,
           const char *start,
           const char *end, const char **mstart, const char **mend)
{
  size_t len;
  char quote;

  *mstart = NULL;
  *mend = NULL;
  if (end <= start)
    return;
  len = strlen (key);
  /* Compare lengths rather than computing "end - len - 1": for a range
     shorter than the key that pointer would lie before the buffer. */
  while ((size_t) (end - start) > len + 1)
    {
      start++;
      if (start[len] != '=')
        continue;
      if (0 != strncasecmp (start, key, len))
        continue;
      start += len + 1;
      *mstart = start;
      /* start may be equal to end here; do not read past the range */
      if ((start < end) && ((*start == '\"') || (*start == '\'')))
        {
          quote = *start;
          start++;
          while ((start < end) && (*start != quote))
            start++;
          (*mstart)++;          /* skip quote */
        }
      else
        {
          while ((start < end) &&
                 (*start != '>') && (!isspace ((unsigned char) *start)))
            start++;
        }
      *mend = start;
      return;
    }
}
179
180/**
181 * Search all tags that correspond to "tagname". Example:
182 * If the tag is <meta name="foo" desc="bar">, and
183 * tagname == "meta", keyname="name", keyvalue="foo",
184 * and searchname="desc", then this function returns a
185 * copy (!) of "bar". Easy enough?
186 *
187 * @return NULL if nothing is found
188 */
189static char *
190findInTags (TagInfo * t,
191 const char *tagname,
192 const char *keyname, const char *keyvalue, const char *searchname)
193{
194 const char *pstart;
195 const char *pend;
196
197 while (t != NULL)
198 {
199 if (tagMatch (tagname, t->tagStart, t->tagEnd))
200 {
201 findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend);
202 if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend)))
203 {
204 findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend);
205 if (pstart != NULL)
206 {
207 char *ret = malloc (pend - pstart + 1);
208 if (ret == NULL)
209 return NULL;
210 memcpy (ret, pstart, pend - pstart);
211 ret[pend - pstart] = '\0';
212 return ret;
213 }
214 }
215 }
216 t = t->next;
217 }
218 return NULL;
219}
220
221
222/* mimetype = text/html */
223int
224EXTRACTOR_html_extract (const char *data,
225 size_t size,
226 EXTRACTOR_MetaDataProcessor proc,
227 void *proc_cls,
228 const char *options)
229{
230 size_t xsize;
231 TagInfo *tags;
232 TagInfo *t;
233 TagInfo tag;
234 size_t pos;
235 size_t tpos;
236 int i;
237 char *charset;
238 char *tmp;
239 char *xtmp;
240 int ret;
241
242 ret = 0;
243 if (size == 0)
244 return 0;
245 /* only scan first 32k */
246 if (size > 1024 * 32)
247 xsize = 1024 * 32;
248 else
249 xsize = size;
250 tags = NULL;
251 tag.next = NULL;
252 pos = 0;
253 while (pos < xsize)
254 {
255 if (!lookFor ('<', &pos, data, size))
256 break;
257 tag.tagStart = &data[++pos];
258 if (!skipLetters (&pos, data, size))
259 break;
260 tag.tagEnd = &data[pos];
261 if (!skipWhitespace (&pos, data, size))
262 break;
263 STEP3:
264 if (!lookForMultiple (">\"\'", &pos, data, size))
265 break;
266 if (data[pos] != '>')
267 {
268 /* find end-quote, ignore escaped quotes (\') */
269 do
270 {
271 tpos = pos;
272 pos++;
273 if (!lookFor (data[tpos], &pos, data, size))
274 break;
275 }
276 while (data[pos - 1] == '\\');
277 pos++;
278 goto STEP3;
279 }
280 pos++;
281 if (!skipWhitespace (&pos, data, size))
282 break;
283 tag.dataStart = &data[pos];
284 if (!lookFor ('<', &pos, data, size))
285 break;
286 tag.dataEnd = &data[pos];
287 i = 0;
288 while (relevantTags[i] != NULL)
289 {
290 if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) &&
291 (0 == strncasecmp (relevantTags[i],
292 tag.tagStart, tag.tagEnd - tag.tagStart)))
293 {
294 t = malloc (sizeof (TagInfo));
295 if (t == NULL)
296 return 0;
297 *t = tag;
298 t->next = tags;
299 tags = t;
300 break;
301 }
302 i++;
303 }
304 /* abort early if we hit the body tag */
305 if (tagMatch ("body", tag.tagStart, tag.tagEnd))
306 break;
307 }
308
309 /* fast exit */
310 if (tags == NULL)
311 return 0;
312
313 charset = NULL;
314 /* first, try to determine mime type and/or character set */
315 tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content");
316 if (tmp != NULL)
317 {
318 /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that;
319 if text/html is present, we take that as the mime-type; if charset=
320 is present, we try to use that for character set conversion. */
321 if (0 == strncasecmp (tmp, "text/html", strlen ("text/html")))
322 ret = proc (proc_cls,
323 "html",
324 EXTRACTOR_METATYPE_MIMETYPE,
325 EXTRACTOR_METAFORMAT_UTF8,
326 "text/plain",
327 "text/html",
328 strlen ("text/html")+1);
329 charset = strcasestr (tmp, "charset=");
330 if (charset != NULL)
331 charset = strdup (&charset[strlen ("charset=")]);
332 free (tmp);
333 }
334 i = 0;
335 while (tagmap[i].name != NULL)
336 {
337 tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content");
338 if ( (tmp != NULL) &&
339 (ret == 0) )
340 {
341 if (charset == NULL)
342 {
343 ret = proc (proc_cls,
344 "html",
345 tagmap[i].type,
346 EXTRACTOR_METAFORMAT_C_STRING,
347 "text/plain",
348 tmp,
349 strlen (tmp) + 1);
350 }
351 else
352 {
353 xtmp = EXTRACTOR_common_convert_to_utf8 (tmp,
354 strlen (tmp),
355 charset);
356 if (xtmp != NULL)
357 {
358 ret = proc (proc_cls,
359 "html",
360 tagmap[i].type,
361 EXTRACTOR_METAFORMAT_UTF8,
362 "text/plain",
363 xtmp,
364 strlen (xtmp) + 1);
365 free (xtmp);
366 }
367 }
368 }
369 if (tmp != NULL)
370 free (tmp);
371 i++;
372 }
373 while (tags != NULL)
374 {
375 t = tags;
376 if ( (tagMatch ("title", t->tagStart, t->tagEnd)) &&
377 (ret == 0) )
378 {
379 if (charset == NULL)
380 {
381 xtmp = malloc (t->dataEnd - t->dataStart + 1);
382 if (xtmp != NULL)
383 {
384 memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart);
385 xtmp[t->dataEnd - t->dataStart] = '\0';
386 ret = proc (proc_cls,
387 "html",
388 EXTRACTOR_METATYPE_TITLE,
389 EXTRACTOR_METAFORMAT_C_STRING,
390 "text/plain",
391 xtmp,
392 strlen (xtmp) + 1);
393 free (xtmp);
394 }
395 }
396 else
397 {
398 xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart,
399 t->dataEnd - t->dataStart,
400 charset);
401 if (xtmp != NULL)
402 {
403 ret = proc (proc_cls,
404 "html",
405 EXTRACTOR_METATYPE_TITLE,
406 EXTRACTOR_METAFORMAT_UTF8,
407 "text/plain",
408 xtmp,
409 strlen (xtmp) + 1);
410 free (xtmp);
411 }
412 }
413 }
414 tags = t->next;
415 free (t);
416 }
417 if (charset != NULL)
418 free (charset);
419 return ret;
420}