diff options
Diffstat (limited to 'src/plugins/old/html_extractor.c')
-rw-r--r-- | src/plugins/old/html_extractor.c | 420 |
1 files changed, 420 insertions, 0 deletions
diff --git a/src/plugins/old/html_extractor.c b/src/plugins/old/html_extractor.c new file mode 100644 index 0000000..004d22a --- /dev/null +++ b/src/plugins/old/html_extractor.c | |||
@@ -0,0 +1,420 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | (C) 2002, 2003, 2004, 2005, 2009 Vidyut Samanta and Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 2, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. | ||
19 | |||
20 | */ | ||
21 | |||
22 | #include "platform.h" | ||
23 | #include "extractor.h" | ||
24 | #include <string.h> | ||
25 | #include "convert.h" | ||
26 | |||
27 | static struct | ||
28 | { | ||
29 | const char *name; | ||
30 | enum EXTRACTOR_MetaType type; | ||
31 | } tagmap[] = { | ||
32 | { "author", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
33 | { "dc.author", EXTRACTOR_METATYPE_AUTHOR_NAME }, | ||
34 | { "title", EXTRACTOR_METATYPE_TITLE }, | ||
35 | { "dc.title", EXTRACTOR_METATYPE_TITLE}, | ||
36 | { "description", EXTRACTOR_METATYPE_DESCRIPTION }, | ||
37 | { "dc.description", EXTRACTOR_METATYPE_DESCRIPTION }, | ||
38 | { "subject", EXTRACTOR_METATYPE_SUBJECT}, | ||
39 | { "dc.subject", EXTRACTOR_METATYPE_SUBJECT}, | ||
40 | { "date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, | ||
41 | { "dc.date", EXTRACTOR_METATYPE_UNKNOWN_DATE}, | ||
42 | { "publisher", EXTRACTOR_METATYPE_PUBLISHER }, | ||
43 | { "dc.publisher", EXTRACTOR_METATYPE_PUBLISHER}, | ||
44 | { "rights", EXTRACTOR_METATYPE_RIGHTS }, | ||
45 | { "dc.rights", EXTRACTOR_METATYPE_RIGHTS }, | ||
46 | { "copyright", EXTRACTOR_METATYPE_COPYRIGHT }, | ||
47 | { "language", EXTRACTOR_METATYPE_LANGUAGE }, | ||
48 | { "keywords", EXTRACTOR_METATYPE_KEYWORDS }, | ||
49 | { "abstract", EXTRACTOR_METATYPE_ABSTRACT }, | ||
50 | { "formatter", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, | ||
51 | { "dc.creator", EXTRACTOR_METATYPE_CREATOR}, | ||
52 | { "dc.identifier", EXTRACTOR_METATYPE_URI }, | ||
53 | { "dc.format", EXTRACTOR_METATYPE_FORMAT }, | ||
54 | { NULL, EXTRACTOR_METATYPE_RESERVED } | ||
55 | }; | ||
56 | |||
/**
 * Names of the HTML tags that the parser records for later
 * inspection; every other tag is skipped.  NULL-terminated and
 * read-only (both the strings and the array itself).
 */
static const char *const relevantTags[] = {
  "title",
  "meta",
  NULL,
};
62 | |||
/**
 * One recorded HTML tag.  All pointers refer into the buffer that is
 * being parsed; nothing is copied.
 */
typedef struct TI
{
  /* next entry of the singly linked list, NULL at the end */
  struct TI *next;

  /* first character of the tag name (just after the '<') */
  const char *tagStart;

  /* one past the last letter of the tag name */
  const char *tagEnd;

  /* first non-whitespace character following the closing '>' */
  const char *dataStart;

  /* position of the next '<' after dataStart */
  const char *dataEnd;
} TagInfo;
71 | |||
72 | |||
73 | |||
74 | |||
75 | /* ******************** parser helper functions ************** */ | ||
76 | |||
/**
 * Compare a NUL-terminated name against the character range [s, e),
 * ignoring case.
 *
 * @param tag NUL-terminated name to compare with
 * @param s first character of the candidate range
 * @param e one past the last character of the candidate range
 * @return 1 if the range is exactly as long as tag and equal to it
 *         (case-insensitively), 0 otherwise
 */
static int
tagMatch (const char *tag, const char *s, const char *e)
{
  const size_t want = strlen (tag);

  /* a range of a different length can never match */
  if ((size_t) (e - s) != want)
    return 0;
  return 0 == strncasecmp (tag, s, want);
}
82 | |||
/**
 * Advance *pos to the next occurrence of character c in data.
 *
 * @param c character to search for
 * @param pos in: offset to start at; out: offset of the match, or
 *        size if the end of the buffer was reached (left untouched
 *        if a NUL byte stopped the search)
 * @param data buffer to search
 * @param size number of bytes in data
 * @return 1 if c was found, 0 if the buffer ended or a NUL byte
 *         was encountered before c
 */
static int
lookFor (char c, size_t * pos, const char *data, size_t size)
{
  size_t cur;

  for (cur = *pos; cur < size; cur++)
    {
      if (data[cur] == c)
        break;
      /* embedded NUL: give up without moving *pos */
      if (data[cur] == '\0')
        return 0;
    }
  *pos = cur;
  return cur < size;
}
97 | |||
/**
 * Advance *pos past any run of whitespace characters.
 *
 * A NUL byte is not whitespace, so the scan simply stops on it; no
 * separate NUL check is required.
 *
 * @param pos in: offset to start at; out: offset of the first
 *        non-whitespace byte, or size if the buffer ended
 * @param data buffer to scan
 * @param size number of bytes in data
 * @return 1 if a non-whitespace byte was reached inside the buffer,
 *         0 if the end of the buffer was reached
 */
static int
skipWhitespace (size_t * pos, const char *data, size_t size)
{
  size_t p;

  /* cast keeps isspace() defined for bytes >= 0x80 on signed-char platforms */
  for (p = *pos; (p < size) && isspace ((unsigned char) data[p]); p++)
    ;
  *pos = p;
  return p < size;
}
112 | |||
/**
 * Advance *pos past any run of alphabetic characters.
 *
 * A NUL byte is not alphabetic, so the scan simply stops on it; no
 * separate NUL check is required.
 *
 * @param pos in: offset to start at; out: offset of the first
 *        non-alphabetic byte, or size if the buffer ended
 * @param data buffer to scan
 * @param size number of bytes in data
 * @return 1 if a non-alphabetic byte was reached inside the buffer,
 *         0 if the end of the buffer was reached
 */
static int
skipLetters (size_t * pos, const char *data, size_t size)
{
  size_t p;

  /* cast keeps isalpha() defined for bytes >= 0x80 on signed-char platforms */
  for (p = *pos; (p < size) && isalpha ((unsigned char) data[p]); p++)
    ;
  *pos = p;
  return p < size;
}
127 | |||
/**
 * Advance *pos to the next byte of data that is one of the
 * characters listed in c.
 *
 * The NUL test has to come before the strchr() call: strchr()
 * treats the terminating NUL of c as part of the string, so
 * strchr (c, '\0') is never NULL and a NUL byte in the data would
 * otherwise be reported as a match.
 *
 * @param c NUL-terminated set of characters to search for
 * @param pos in: offset to start at; out: offset of the match, or
 *        size if the end of the buffer was reached (left untouched
 *        if a NUL byte stopped the search)
 * @param data buffer to search
 * @param size number of bytes in data
 * @return 1 if one of the characters was found, 0 if the buffer
 *         ended or a NUL byte was encountered first
 */
static int
lookForMultiple (const char *c, size_t * pos, const char *data, size_t size)
{
  size_t p = *pos;

  while (p < size)
    {
      if (data[p] == '\0')
        return 0;
      if (strchr (c, data[p]) != NULL)
        break;
      p++;
    }
  *pos = p;
  return p < size;
}
142 | |||
/**
 * Locate the value of the attribute "key" inside the character
 * range (start, end) and report where the value begins and ends.
 *
 * The character at start itself is never considered as the
 * beginning of the key (the scan pre-increments).  A value may be
 * enclosed in single or double quotes, in which case the quotes are
 * not part of the reported range; an unquoted value ends at the
 * first whitespace character, at the tag's closing '>' or at end.
 * The key comparison ignores case.
 *
 * @param key NUL-terminated attribute name (without the '=')
 * @param start beginning of the region to search
 * @param end end of the region to search; must be dereferenceable
 * @param mstart set to the first character of the value, or to
 *        NULL if the key was not found
 * @param mend set to one past the last character of the value, or
 *        to NULL if the key was not found
 */
static void
findEntry (const char *key,
           const char *start,
           const char *end, const char **mstart, const char **mend)
{
  size_t len;

  *mstart = NULL;
  *mend = NULL;
  if (end <= start)
    return;
  len = strlen (key);
  /* Compare lengths rather than computing "end - len - 1", which
     would form an out-of-bounds pointer for short regions. */
  while ((size_t) (end - start) > len + 1)
    {
      start++;
      if (start[len] != '=')
        continue;
      if (0 != strncasecmp (start, key, len))
        continue;
      start += len + 1;
      *mstart = start;
      if ((*start == '\"') || (*start == '\''))
        {
          /* quoted value: runs up to the matching quote character */
          const char quote = *start;

          start++;
          while ((start < end) && (*start != quote))
            start++;
          (*mstart)++;          /* skip quote */
        }
      else
        {
          /* unquoted value: must not swallow the closing '>' */
          while ((start < end) &&
                 (*start != '>') && (!isspace ((unsigned char) *start)))
            start++;
        }
      *mend = start;
      return;
    }
}
179 | |||
180 | /** | ||
181 | * Search all tags that correspond to "tagname". Example: | ||
182 | * If the tag is <meta name="foo" desc="bar">, and | ||
183 | * tagname == "meta", keyname="name", keyvalue="foo", | ||
184 | * and searchname="desc", then this function returns a | ||
185 | * copy (!) of "bar". Easy enough? | ||
186 | * | ||
187 | * @return NULL if nothing is found | ||
188 | */ | ||
189 | static char * | ||
190 | findInTags (TagInfo * t, | ||
191 | const char *tagname, | ||
192 | const char *keyname, const char *keyvalue, const char *searchname) | ||
193 | { | ||
194 | const char *pstart; | ||
195 | const char *pend; | ||
196 | |||
197 | while (t != NULL) | ||
198 | { | ||
199 | if (tagMatch (tagname, t->tagStart, t->tagEnd)) | ||
200 | { | ||
201 | findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); | ||
202 | if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) | ||
203 | { | ||
204 | findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); | ||
205 | if (pstart != NULL) | ||
206 | { | ||
207 | char *ret = malloc (pend - pstart + 1); | ||
208 | if (ret == NULL) | ||
209 | return NULL; | ||
210 | memcpy (ret, pstart, pend - pstart); | ||
211 | ret[pend - pstart] = '\0'; | ||
212 | return ret; | ||
213 | } | ||
214 | } | ||
215 | } | ||
216 | t = t->next; | ||
217 | } | ||
218 | return NULL; | ||
219 | } | ||
220 | |||
221 | |||
222 | /* mimetype = text/html */ | ||
223 | int | ||
224 | EXTRACTOR_html_extract (const char *data, | ||
225 | size_t size, | ||
226 | EXTRACTOR_MetaDataProcessor proc, | ||
227 | void *proc_cls, | ||
228 | const char *options) | ||
229 | { | ||
230 | size_t xsize; | ||
231 | TagInfo *tags; | ||
232 | TagInfo *t; | ||
233 | TagInfo tag; | ||
234 | size_t pos; | ||
235 | size_t tpos; | ||
236 | int i; | ||
237 | char *charset; | ||
238 | char *tmp; | ||
239 | char *xtmp; | ||
240 | int ret; | ||
241 | |||
242 | ret = 0; | ||
243 | if (size == 0) | ||
244 | return 0; | ||
245 | /* only scan first 32k */ | ||
246 | if (size > 1024 * 32) | ||
247 | xsize = 1024 * 32; | ||
248 | else | ||
249 | xsize = size; | ||
250 | tags = NULL; | ||
251 | tag.next = NULL; | ||
252 | pos = 0; | ||
253 | while (pos < xsize) | ||
254 | { | ||
255 | if (!lookFor ('<', &pos, data, size)) | ||
256 | break; | ||
257 | tag.tagStart = &data[++pos]; | ||
258 | if (!skipLetters (&pos, data, size)) | ||
259 | break; | ||
260 | tag.tagEnd = &data[pos]; | ||
261 | if (!skipWhitespace (&pos, data, size)) | ||
262 | break; | ||
263 | STEP3: | ||
264 | if (!lookForMultiple (">\"\'", &pos, data, size)) | ||
265 | break; | ||
266 | if (data[pos] != '>') | ||
267 | { | ||
268 | /* find end-quote, ignore escaped quotes (\') */ | ||
269 | do | ||
270 | { | ||
271 | tpos = pos; | ||
272 | pos++; | ||
273 | if (!lookFor (data[tpos], &pos, data, size)) | ||
274 | break; | ||
275 | } | ||
276 | while (data[pos - 1] == '\\'); | ||
277 | pos++; | ||
278 | goto STEP3; | ||
279 | } | ||
280 | pos++; | ||
281 | if (!skipWhitespace (&pos, data, size)) | ||
282 | break; | ||
283 | tag.dataStart = &data[pos]; | ||
284 | if (!lookFor ('<', &pos, data, size)) | ||
285 | break; | ||
286 | tag.dataEnd = &data[pos]; | ||
287 | i = 0; | ||
288 | while (relevantTags[i] != NULL) | ||
289 | { | ||
290 | if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && | ||
291 | (0 == strncasecmp (relevantTags[i], | ||
292 | tag.tagStart, tag.tagEnd - tag.tagStart))) | ||
293 | { | ||
294 | t = malloc (sizeof (TagInfo)); | ||
295 | if (t == NULL) | ||
296 | return 0; | ||
297 | *t = tag; | ||
298 | t->next = tags; | ||
299 | tags = t; | ||
300 | break; | ||
301 | } | ||
302 | i++; | ||
303 | } | ||
304 | /* abort early if we hit the body tag */ | ||
305 | if (tagMatch ("body", tag.tagStart, tag.tagEnd)) | ||
306 | break; | ||
307 | } | ||
308 | |||
309 | /* fast exit */ | ||
310 | if (tags == NULL) | ||
311 | return 0; | ||
312 | |||
313 | charset = NULL; | ||
314 | /* first, try to determine mime type and/or character set */ | ||
315 | tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); | ||
316 | if (tmp != NULL) | ||
317 | { | ||
318 | /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; | ||
319 | if text/html is present, we take that as the mime-type; if charset= | ||
320 | is present, we try to use that for character set conversion. */ | ||
321 | if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) | ||
322 | ret = proc (proc_cls, | ||
323 | "html", | ||
324 | EXTRACTOR_METATYPE_MIMETYPE, | ||
325 | EXTRACTOR_METAFORMAT_UTF8, | ||
326 | "text/plain", | ||
327 | "text/html", | ||
328 | strlen ("text/html")+1); | ||
329 | charset = strcasestr (tmp, "charset="); | ||
330 | if (charset != NULL) | ||
331 | charset = strdup (&charset[strlen ("charset=")]); | ||
332 | free (tmp); | ||
333 | } | ||
334 | i = 0; | ||
335 | while (tagmap[i].name != NULL) | ||
336 | { | ||
337 | tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); | ||
338 | if ( (tmp != NULL) && | ||
339 | (ret == 0) ) | ||
340 | { | ||
341 | if (charset == NULL) | ||
342 | { | ||
343 | ret = proc (proc_cls, | ||
344 | "html", | ||
345 | tagmap[i].type, | ||
346 | EXTRACTOR_METAFORMAT_C_STRING, | ||
347 | "text/plain", | ||
348 | tmp, | ||
349 | strlen (tmp) + 1); | ||
350 | } | ||
351 | else | ||
352 | { | ||
353 | xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, | ||
354 | strlen (tmp), | ||
355 | charset); | ||
356 | if (xtmp != NULL) | ||
357 | { | ||
358 | ret = proc (proc_cls, | ||
359 | "html", | ||
360 | tagmap[i].type, | ||
361 | EXTRACTOR_METAFORMAT_UTF8, | ||
362 | "text/plain", | ||
363 | xtmp, | ||
364 | strlen (xtmp) + 1); | ||
365 | free (xtmp); | ||
366 | } | ||
367 | } | ||
368 | } | ||
369 | if (tmp != NULL) | ||
370 | free (tmp); | ||
371 | i++; | ||
372 | } | ||
373 | while (tags != NULL) | ||
374 | { | ||
375 | t = tags; | ||
376 | if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && | ||
377 | (ret == 0) ) | ||
378 | { | ||
379 | if (charset == NULL) | ||
380 | { | ||
381 | xtmp = malloc (t->dataEnd - t->dataStart + 1); | ||
382 | if (xtmp != NULL) | ||
383 | { | ||
384 | memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); | ||
385 | xtmp[t->dataEnd - t->dataStart] = '\0'; | ||
386 | ret = proc (proc_cls, | ||
387 | "html", | ||
388 | EXTRACTOR_METATYPE_TITLE, | ||
389 | EXTRACTOR_METAFORMAT_C_STRING, | ||
390 | "text/plain", | ||
391 | xtmp, | ||
392 | strlen (xtmp) + 1); | ||
393 | free (xtmp); | ||
394 | } | ||
395 | } | ||
396 | else | ||
397 | { | ||
398 | xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart, | ||
399 | t->dataEnd - t->dataStart, | ||
400 | charset); | ||
401 | if (xtmp != NULL) | ||
402 | { | ||
403 | ret = proc (proc_cls, | ||
404 | "html", | ||
405 | EXTRACTOR_METATYPE_TITLE, | ||
406 | EXTRACTOR_METAFORMAT_UTF8, | ||
407 | "text/plain", | ||
408 | xtmp, | ||
409 | strlen (xtmp) + 1); | ||
410 | free (xtmp); | ||
411 | } | ||
412 | } | ||
413 | } | ||
414 | tags = t->next; | ||
415 | free (t); | ||
416 | } | ||
417 | if (charset != NULL) | ||
418 | free (charset); | ||
419 | return ret; | ||
420 | } | ||