diff options
Diffstat (limited to 'src/plugins/html_extractor.c')
-rw-r--r-- | src/plugins/html_extractor.c | 652 |
1 files changed, 330 insertions, 322 deletions
diff --git a/src/plugins/html_extractor.c b/src/plugins/html_extractor.c index 8cd4aba..5ebf97b 100644 --- a/src/plugins/html_extractor.c +++ b/src/plugins/html_extractor.c | |||
@@ -87,9 +87,9 @@ tag_to_type (const char *tag) | |||
87 | { | 87 | { |
88 | unsigned int i; | 88 | unsigned int i; |
89 | 89 | ||
90 | for (i=0; NULL != tagmap[i].name; i++) | 90 | for (i = 0; NULL != tagmap[i].name; i++) |
91 | if (0 == strcasecmp (tag, | 91 | if (0 == strcasecmp (tag, |
92 | tagmap[i].name)) | 92 | tagmap[i].name)) |
93 | return tagmap[i].type; | 93 | return tagmap[i].type; |
94 | return EXTRACTOR_METATYPE_RESERVED; | 94 | return EXTRACTOR_METATYPE_RESERVED; |
95 | } | 95 | } |
@@ -107,10 +107,10 @@ tag_to_type (const char *tag) | |||
107 | */ | 107 | */ |
108 | static Bool TIDY_CALL | 108 | static Bool TIDY_CALL |
109 | report_cb (TidyDoc doc, | 109 | report_cb (TidyDoc doc, |
110 | TidyReportLevel lvl, | 110 | TidyReportLevel lvl, |
111 | uint line, | 111 | uint line, |
112 | uint col, | 112 | uint col, |
113 | ctmbstr mssg) | 113 | ctmbstr mssg) |
114 | { | 114 | { |
115 | return 0; | 115 | return 0; |
116 | } | 116 | } |
@@ -130,7 +130,7 @@ get_byte_cb (void *sourceData) | |||
130 | 130 | ||
131 | if (1 != | 131 | if (1 != |
132 | ec->read (ec->cls, | 132 | ec->read (ec->cls, |
133 | &data, 1)) | 133 | &data, 1)) |
134 | return EndOfStream; | 134 | return EndOfStream; |
135 | return *(unsigned char*) data; | 135 | return *(unsigned char*) data; |
136 | } | 136 | } |
@@ -188,130 +188,129 @@ EXTRACTOR_html_extract_method (struct EXTRACTOR_ExtractContext *ec) | |||
188 | const char *mime; | 188 | const char *mime; |
189 | 189 | ||
190 | if (-1 == (iret = ec->read (ec->cls, | 190 | if (-1 == (iret = ec->read (ec->cls, |
191 | &data, | 191 | &data, |
192 | 16 * 1024))) | 192 | 16 * 1024))) |
193 | return; | 193 | return; |
194 | if (NULL == (mime = magic_buffer (magic, data, iret))) | 194 | if (NULL == (mime = magic_buffer (magic, data, iret))) |
195 | return; | 195 | return; |
196 | if (0 != strncmp (mime, | 196 | if (0 != strncmp (mime, |
197 | "text/html", | 197 | "text/html", |
198 | strlen ("text/html"))) | 198 | strlen ("text/html"))) |
199 | return; /* not HTML */ | 199 | return; /* not HTML */ |
200 | 200 | ||
201 | if (0 != ec->seek (ec->cls, 0, SEEK_SET)) | 201 | if (0 != ec->seek (ec->cls, 0, SEEK_SET)) |
202 | return; /* seek failed !? */ | 202 | return; /* seek failed !? */ |
203 | 203 | ||
204 | tidyInitSource (&src, ec, | 204 | tidyInitSource (&src, ec, |
205 | &get_byte_cb, | 205 | &get_byte_cb, |
206 | &unget_byte_cb, | 206 | &unget_byte_cb, |
207 | &eof_cb); | 207 | &eof_cb); |
208 | if (NULL == (doc = tidyCreate ())) | 208 | if (NULL == (doc = tidyCreate ())) |
209 | return; | 209 | return; |
210 | tidySetReportFilter (doc, &report_cb); | 210 | tidySetReportFilter (doc, &report_cb); |
211 | tidySetAppData (doc, ec); | 211 | tidySetAppData (doc, ec); |
212 | if (0 > tidyParseSource (doc, &src)) | 212 | if (0 > tidyParseSource (doc, &src)) |
213 | { | 213 | { |
214 | tidyRelease (doc); | 214 | tidyRelease (doc); |
215 | return; | 215 | return; |
216 | } | 216 | } |
217 | if (1 != tidyStatus (doc)) | 217 | if (1 != tidyStatus (doc)) |
218 | { | 218 | { |
219 | tidyRelease (doc); | 219 | tidyRelease (doc); |
220 | return; | 220 | return; |
221 | } | 221 | } |
222 | if (NULL == (head = tidyGetHead (doc))) | 222 | if (NULL == (head = tidyGetHead (doc))) |
223 | { | 223 | { |
224 | fprintf (stderr, "no head\n"); | 224 | fprintf (stderr, "no head\n"); |
225 | tidyRelease (doc); | 225 | tidyRelease (doc); |
226 | return; | 226 | return; |
227 | } | 227 | } |
228 | for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child)) | 228 | for (child = tidyGetChild (head); NULL != child; child = tidyGetNext (child)) |
229 | { | ||
230 | switch (tidyNodeGetType (child)) | ||
229 | { | 231 | { |
230 | switch (tidyNodeGetType(child)) | 232 | case TidyNode_Root: |
231 | { | 233 | break; |
232 | case TidyNode_Root: | 234 | case TidyNode_DocType: |
233 | break; | 235 | break; |
234 | case TidyNode_DocType: | 236 | case TidyNode_Comment: |
235 | break; | 237 | break; |
236 | case TidyNode_Comment: | 238 | case TidyNode_ProcIns: |
237 | break; | 239 | break; |
238 | case TidyNode_ProcIns: | 240 | case TidyNode_Text: |
239 | break; | 241 | break; |
240 | case TidyNode_Text: | 242 | case TidyNode_CDATA: |
241 | break; | 243 | break; |
242 | case TidyNode_CDATA: | 244 | case TidyNode_Section: |
243 | break; | 245 | break; |
244 | case TidyNode_Section: | 246 | case TidyNode_Asp: |
245 | break; | 247 | break; |
246 | case TidyNode_Asp: | 248 | case TidyNode_Jste: |
247 | break; | 249 | break; |
248 | case TidyNode_Jste: | 250 | case TidyNode_Php: |
249 | break; | 251 | break; |
250 | case TidyNode_Php: | 252 | case TidyNode_XmlDecl: |
251 | break; | 253 | break; |
252 | case TidyNode_XmlDecl: | 254 | case TidyNode_Start: |
253 | break; | 255 | case TidyNode_StartEnd: |
254 | case TidyNode_Start: | 256 | name = tidyNodeGetName (child); |
255 | case TidyNode_StartEnd: | 257 | if ( (0 == strcasecmp (name, "title")) && |
256 | name = tidyNodeGetName (child); | 258 | (NULL != (title = tidyGetChild (child))) ) |
257 | if ( (0 == strcasecmp (name, "title")) && | 259 | { |
258 | (NULL != (title = tidyGetChild (child))) ) | 260 | tidyBufInit (&tbuf); |
259 | { | 261 | tidyNodeGetValue (doc, title, &tbuf); |
260 | tidyBufInit (&tbuf); | 262 | /* add 0-termination */ |
261 | tidyNodeGetValue (doc, title, &tbuf); | 263 | tidyBufPutByte (&tbuf, 0); |
262 | /* add 0-termination */ | 264 | if (0 != |
263 | tidyBufPutByte (&tbuf, 0); | 265 | ec->proc (ec->cls, |
264 | if (0 != | 266 | "html", |
265 | ec->proc (ec->cls, | 267 | EXTRACTOR_METATYPE_TITLE, |
266 | "html", | 268 | EXTRACTOR_METAFORMAT_UTF8, |
267 | EXTRACTOR_METATYPE_TITLE, | 269 | "text/plain", |
268 | EXTRACTOR_METAFORMAT_UTF8, | 270 | (const char *) tbuf.bp, |
269 | "text/plain", | 271 | tbuf.size)) |
270 | (const char *) tbuf.bp, | 272 | { |
271 | tbuf.size)) | 273 | tidyBufFree (&tbuf); |
272 | { | 274 | goto CLEANUP; |
273 | tidyBufFree (&tbuf); | 275 | } |
274 | goto CLEANUP; | 276 | tidyBufFree (&tbuf); |
275 | } | 277 | break; |
276 | tidyBufFree (&tbuf); | 278 | } |
277 | break; | 279 | if (0 == strcasecmp (name, "meta")) |
278 | } | 280 | { |
279 | if (0 == strcasecmp (name, "meta")) | 281 | if (NULL == (attr = tidyAttrGetById (child, |
280 | { | 282 | TidyAttr_NAME))) |
281 | if (NULL == (attr = tidyAttrGetById (child, | 283 | break; |
282 | TidyAttr_NAME))) | 284 | if (EXTRACTOR_METATYPE_RESERVED == |
283 | break; | 285 | (type = tag_to_type (tidyAttrValue (attr)))) |
284 | if (EXTRACTOR_METATYPE_RESERVED == | 286 | break; |
285 | (type = tag_to_type (tidyAttrValue (attr)))) | 287 | if (NULL == (attr = tidyAttrGetById (child, |
286 | break; | 288 | TidyAttr_CONTENT))) |
287 | if (NULL == (attr = tidyAttrGetById (child, | 289 | break; |
288 | TidyAttr_CONTENT))) | 290 | name = tidyAttrValue (attr); |
289 | break; | 291 | if (0 != |
290 | name = tidyAttrValue (attr); | 292 | ec->proc (ec->cls, |
291 | if (0 != | 293 | "html", |
292 | ec->proc (ec->cls, | 294 | type, |
293 | "html", | 295 | EXTRACTOR_METAFORMAT_UTF8, |
294 | type, | 296 | "text/plain", |
295 | EXTRACTOR_METAFORMAT_UTF8, | 297 | name, |
296 | "text/plain", | 298 | strlen (name) + 1)) |
297 | name, | 299 | goto CLEANUP; |
298 | strlen (name) + 1)) | 300 | break; |
299 | goto CLEANUP; | 301 | } |
300 | break; | 302 | break; |
301 | } | 303 | case TidyNode_End: |
302 | break; | 304 | break; |
303 | case TidyNode_End: | 305 | default: |
304 | break; | 306 | break; |
305 | default: | ||
306 | break; | ||
307 | } | ||
308 | } | 307 | } |
309 | CLEANUP: | 308 | } |
309 | CLEANUP: | ||
310 | tidyRelease (doc); | 310 | tidyRelease (doc); |
311 | } | 311 | } |
312 | 312 | ||
313 | 313 | ||
314 | |||
315 | #if OLD | 314 | #if OLD |
316 | 315 | ||
317 | 316 | ||
@@ -323,66 +322,71 @@ tagMatch (const char *tag, const char *s, const char *e) | |||
323 | return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s))); | 322 | return (((e - s) == strlen (tag)) && (0 == strncasecmp (tag, s, e - s))); |
324 | } | 323 | } |
325 | 324 | ||
325 | |||
326 | static int | 326 | static int |
327 | lookFor (char c, size_t * pos, const char *data, size_t size) | 327 | lookFor (char c, size_t *pos, const char *data, size_t size) |
328 | { | 328 | { |
329 | size_t p = *pos; | 329 | size_t p = *pos; |
330 | 330 | ||
331 | while ((p < size) && (data[p] != c)) | 331 | while ((p < size) && (data[p] != c)) |
332 | { | 332 | { |
333 | if (data[p] == '\0') | 333 | if (data[p] == '\0') |
334 | return 0; | 334 | return 0; |
335 | p++; | 335 | p++; |
336 | } | 336 | } |
337 | *pos = p; | 337 | *pos = p; |
338 | return p < size; | 338 | return p < size; |
339 | } | 339 | } |
340 | 340 | ||
341 | |||
341 | static int | 342 | static int |
342 | skipWhitespace (size_t * pos, const char *data, size_t size) | 343 | skipWhitespace (size_t *pos, const char *data, size_t size) |
343 | { | 344 | { |
344 | size_t p = *pos; | 345 | size_t p = *pos; |
345 | 346 | ||
346 | while ((p < size) && (isspace ( (unsigned char) data[p]))) | 347 | while ((p < size) && (isspace ( (unsigned char) data[p]))) |
347 | { | 348 | { |
348 | if (data[p] == '\0') | 349 | if (data[p] == '\0') |
349 | return 0; | 350 | return 0; |
350 | p++; | 351 | p++; |
351 | } | 352 | } |
352 | *pos = p; | 353 | *pos = p; |
353 | return p < size; | 354 | return p < size; |
354 | } | 355 | } |
355 | 356 | ||
357 | |||
356 | static int | 358 | static int |
357 | skipLetters (size_t * pos, const char *data, size_t size) | 359 | skipLetters (size_t *pos, const char *data, size_t size) |
358 | { | 360 | { |
359 | size_t p = *pos; | 361 | size_t p = *pos; |
360 | 362 | ||
361 | while ((p < size) && (isalpha ( (unsigned char) data[p]))) | 363 | while ((p < size) && (isalpha ( (unsigned char) data[p]))) |
362 | { | 364 | { |
363 | if (data[p] == '\0') | 365 | if (data[p] == '\0') |
364 | return 0; | 366 | return 0; |
365 | p++; | 367 | p++; |
366 | } | 368 | } |
367 | *pos = p; | 369 | *pos = p; |
368 | return p < size; | 370 | return p < size; |
369 | } | 371 | } |
370 | 372 | ||
373 | |||
371 | static int | 374 | static int |
372 | lookForMultiple (const char *c, size_t * pos, const char *data, size_t size) | 375 | lookForMultiple (const char *c, size_t *pos, const char *data, size_t size) |
373 | { | 376 | { |
374 | size_t p = *pos; | 377 | size_t p = *pos; |
375 | 378 | ||
376 | while ((p < size) && (strchr (c, data[p]) == NULL)) | 379 | while ((p < size) && (strchr (c, data[p]) == NULL)) |
377 | { | 380 | { |
378 | if (data[p] == '\0') | 381 | if (data[p] == '\0') |
379 | return 0; | 382 | return 0; |
380 | p++; | 383 | p++; |
381 | } | 384 | } |
382 | *pos = p; | 385 | *pos = p; |
383 | return p < size; | 386 | return p < size; |
384 | } | 387 | } |
385 | 388 | ||
389 | |||
386 | static void | 390 | static void |
387 | findEntry (const char *key, | 391 | findEntry (const char *key, |
388 | const char *start, | 392 | const char *start, |
@@ -394,32 +398,33 @@ findEntry (const char *key, | |||
394 | *mend = NULL; | 398 | *mend = NULL; |
395 | len = strlen (key); | 399 | len = strlen (key); |
396 | while (start < end - len - 1) | 400 | while (start < end - len - 1) |
401 | { | ||
402 | start++; | ||
403 | if (start[len] != '=') | ||
404 | continue; | ||
405 | if (0 == strncasecmp (start, key, len)) | ||
397 | { | 406 | { |
398 | start++; | 407 | start += len + 1; |
399 | if (start[len] != '=') | 408 | *mstart = start; |
400 | continue; | 409 | if ((*start == '\"') || (*start == '\'')) |
401 | if (0 == strncasecmp (start, key, len)) | 410 | { |
402 | { | 411 | start++; |
403 | start += len + 1; | 412 | while ((start < end) && (*start != **mstart)) |
404 | *mstart = start; | 413 | start++; |
405 | if ((*start == '\"') || (*start == '\'')) | 414 | (*mstart)++; /* skip quote */ |
406 | { | 415 | } |
407 | start++; | 416 | else |
408 | while ((start < end) && (*start != **mstart)) | 417 | { |
409 | start++; | 418 | while ((start < end) && (! isspace ( (unsigned char) *start))) |
410 | (*mstart)++; /* skip quote */ | 419 | start++; |
411 | } | 420 | } |
412 | else | 421 | *mend = start; |
413 | { | 422 | return; |
414 | while ((start < end) && (!isspace ( (unsigned char) *start))) | ||
415 | start++; | ||
416 | } | ||
417 | *mend = start; | ||
418 | return; | ||
419 | } | ||
420 | } | 423 | } |
424 | } | ||
421 | } | 425 | } |
422 | 426 | ||
427 | |||
423 | /** | 428 | /** |
424 | * Search all tags that correspond to "tagname". Example: | 429 | * Search all tags that correspond to "tagname". Example: |
425 | * If the tag is <meta name="foo" desc="bar">, and | 430 | * If the tag is <meta name="foo" desc="bar">, and |
@@ -430,7 +435,7 @@ findEntry (const char *key, | |||
430 | * @return NULL if nothing is found | 435 | * @return NULL if nothing is found |
431 | */ | 436 | */ |
432 | static char * | 437 | static char * |
433 | findInTags (struct TagInfo * t, | 438 | findInTags (struct TagInfo *t, |
434 | const char *tagname, | 439 | const char *tagname, |
435 | const char *keyname, const char *keyvalue, const char *searchname) | 440 | const char *keyname, const char *keyvalue, const char *searchname) |
436 | { | 441 | { |
@@ -438,26 +443,26 @@ findInTags (struct TagInfo * t, | |||
438 | const char *pend; | 443 | const char *pend; |
439 | 444 | ||
440 | while (t != NULL) | 445 | while (t != NULL) |
446 | { | ||
447 | if (tagMatch (tagname, t->tagStart, t->tagEnd)) | ||
441 | { | 448 | { |
442 | if (tagMatch (tagname, t->tagStart, t->tagEnd)) | 449 | findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); |
450 | if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) | ||
451 | { | ||
452 | findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); | ||
453 | if (pstart != NULL) | ||
443 | { | 454 | { |
444 | findEntry (keyname, t->tagEnd, t->dataStart, &pstart, &pend); | 455 | char *ret = malloc (pend - pstart + 1); |
445 | if ((pstart != NULL) && (tagMatch (keyvalue, pstart, pend))) | 456 | if (ret == NULL) |
446 | { | 457 | return NULL; |
447 | findEntry (searchname, t->tagEnd, t->dataStart, &pstart, &pend); | 458 | memcpy (ret, pstart, pend - pstart); |
448 | if (pstart != NULL) | 459 | ret[pend - pstart] = '\0'; |
449 | { | 460 | return ret; |
450 | char *ret = malloc (pend - pstart + 1); | ||
451 | if (ret == NULL) | ||
452 | return NULL; | ||
453 | memcpy (ret, pstart, pend - pstart); | ||
454 | ret[pend - pstart] = '\0'; | ||
455 | return ret; | ||
456 | } | ||
457 | } | ||
458 | } | 461 | } |
459 | t = t->next; | 462 | } |
460 | } | 463 | } |
464 | t = t->next; | ||
465 | } | ||
461 | return NULL; | 466 | return NULL; |
462 | } | 467 | } |
463 | 468 | ||
@@ -465,10 +470,10 @@ findInTags (struct TagInfo * t, | |||
465 | /* mimetype = text/html */ | 470 | /* mimetype = text/html */ |
466 | int | 471 | int |
467 | EXTRACTOR_html_extract (const char *data, | 472 | EXTRACTOR_html_extract (const char *data, |
468 | size_t size, | 473 | size_t size, |
469 | EXTRACTOR_MetaDataProcessor proc, | 474 | EXTRACTOR_MetaDataProcessor proc, |
470 | void *proc_cls, | 475 | void *proc_cls, |
471 | const char *options) | 476 | const char *options) |
472 | { | 477 | { |
473 | size_t xsize; | 478 | size_t xsize; |
474 | struct TagInfo *tags; | 479 | struct TagInfo *tags; |
@@ -494,60 +499,60 @@ EXTRACTOR_html_extract (const char *data, | |||
494 | tag.next = NULL; | 499 | tag.next = NULL; |
495 | pos = 0; | 500 | pos = 0; |
496 | while (pos < xsize) | 501 | while (pos < xsize) |
502 | { | ||
503 | if (! lookFor ('<', &pos, data, size)) | ||
504 | break; | ||
505 | tag.tagStart = &data[++pos]; | ||
506 | if (! skipLetters (&pos, data, size)) | ||
507 | break; | ||
508 | tag.tagEnd = &data[pos]; | ||
509 | if (! skipWhitespace (&pos, data, size)) | ||
510 | break; | ||
511 | STEP3: | ||
512 | if (! lookForMultiple (">\"\'", &pos, data, size)) | ||
513 | break; | ||
514 | if (data[pos] != '>') | ||
497 | { | 515 | { |
498 | if (!lookFor ('<', &pos, data, size)) | 516 | /* find end-quote, ignore escaped quotes (\') */ |
499 | break; | 517 | do |
500 | tag.tagStart = &data[++pos]; | 518 | { |
501 | if (!skipLetters (&pos, data, size)) | 519 | tpos = pos; |
502 | break; | 520 | pos++; |
503 | tag.tagEnd = &data[pos]; | 521 | if (! lookFor (data[tpos], &pos, data, size)) |
504 | if (!skipWhitespace (&pos, data, size)) | 522 | break; |
505 | break; | 523 | } |
506 | STEP3: | 524 | while (data[pos - 1] == '\\'); |
507 | if (!lookForMultiple (">\"\'", &pos, data, size)) | ||
508 | break; | ||
509 | if (data[pos] != '>') | ||
510 | { | ||
511 | /* find end-quote, ignore escaped quotes (\') */ | ||
512 | do | ||
513 | { | ||
514 | tpos = pos; | ||
515 | pos++; | ||
516 | if (!lookFor (data[tpos], &pos, data, size)) | ||
517 | break; | ||
518 | } | ||
519 | while (data[pos - 1] == '\\'); | ||
520 | pos++; | ||
521 | goto STEP3; | ||
522 | } | ||
523 | pos++; | 525 | pos++; |
524 | if (!skipWhitespace (&pos, data, size)) | 526 | goto STEP3; |
525 | break; | 527 | } |
526 | tag.dataStart = &data[pos]; | 528 | pos++; |
527 | if (!lookFor ('<', &pos, data, size)) | 529 | if (! skipWhitespace (&pos, data, size)) |
528 | break; | 530 | break; |
529 | tag.dataEnd = &data[pos]; | 531 | tag.dataStart = &data[pos]; |
530 | i = 0; | 532 | if (! lookFor ('<', &pos, data, size)) |
531 | while (relevantTags[i] != NULL) | 533 | break; |
532 | { | 534 | tag.dataEnd = &data[pos]; |
533 | if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && | 535 | i = 0; |
534 | (0 == strncasecmp (relevantTags[i], | 536 | while (relevantTags[i] != NULL) |
535 | tag.tagStart, tag.tagEnd - tag.tagStart))) | 537 | { |
536 | { | 538 | if ((strlen (relevantTags[i]) == tag.tagEnd - tag.tagStart) && |
537 | t = malloc (sizeof (struct TagInfo)); | 539 | (0 == strncasecmp (relevantTags[i], |
538 | if (t == NULL) | 540 | tag.tagStart, tag.tagEnd - tag.tagStart))) |
539 | return 0; | 541 | { |
540 | *t = tag; | 542 | t = malloc (sizeof (struct TagInfo)); |
541 | t->next = tags; | 543 | if (t == NULL) |
542 | tags = t; | 544 | return 0; |
543 | break; | 545 | *t = tag; |
544 | } | 546 | t->next = tags; |
545 | i++; | 547 | tags = t; |
546 | } | ||
547 | /* abort early if we hit the body tag */ | ||
548 | if (tagMatch ("body", tag.tagStart, tag.tagEnd)) | ||
549 | break; | 548 | break; |
549 | } | ||
550 | i++; | ||
550 | } | 551 | } |
552 | /* abort early if we hit the body tag */ | ||
553 | if (tagMatch ("body", tag.tagStart, tag.tagEnd)) | ||
554 | break; | ||
555 | } | ||
551 | 556 | ||
552 | /* fast exit */ | 557 | /* fast exit */ |
553 | if (tags == NULL) | 558 | if (tags == NULL) |
@@ -557,110 +562,112 @@ EXTRACTOR_html_extract (const char *data, | |||
557 | /* first, try to determine mime type and/or character set */ | 562 | /* first, try to determine mime type and/or character set */ |
558 | tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); | 563 | tmp = findInTags (tags, "meta", "http-equiv", "content-type", "content"); |
559 | if (tmp != NULL) | 564 | if (tmp != NULL) |
560 | { | 565 | { |
561 | /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; | 566 | /* ideally, tmp == "test/html; charset=ISO-XXXX-Y" or something like that; |
562 | if text/html is present, we take that as the mime-type; if charset= | 567 | if text/html is present, we take that as the mime-type; if charset= |
563 | is present, we try to use that for character set conversion. */ | 568 | is present, we try to use that for character set conversion. */ |
564 | if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) | 569 | if (0 == strncasecmp (tmp, "text/html", strlen ("text/html"))) |
565 | ret = proc (proc_cls, | 570 | ret = proc (proc_cls, |
566 | "html", | 571 | "html", |
567 | EXTRACTOR_METATYPE_MIMETYPE, | 572 | EXTRACTOR_METATYPE_MIMETYPE, |
568 | EXTRACTOR_METAFORMAT_UTF8, | 573 | EXTRACTOR_METAFORMAT_UTF8, |
569 | "text/plain", | 574 | "text/plain", |
570 | "text/html", | 575 | "text/html", |
571 | strlen ("text/html")+1); | 576 | strlen ("text/html") + 1); |
572 | charset = strcasestr (tmp, "charset="); | 577 | charset = strcasestr (tmp, "charset="); |
573 | if (charset != NULL) | 578 | if (charset != NULL) |
574 | charset = strdup (&charset[strlen ("charset=")]); | 579 | charset = strdup (&charset[strlen ("charset=")]); |
575 | free (tmp); | 580 | free (tmp); |
576 | } | 581 | } |
577 | i = 0; | 582 | i = 0; |
578 | while (tagmap[i].name != NULL) | 583 | while (tagmap[i].name != NULL) |
584 | { | ||
585 | tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); | ||
586 | if ( (tmp != NULL) && | ||
587 | (ret == 0) ) | ||
579 | { | 588 | { |
580 | tmp = findInTags (tags, "meta", "name", tagmap[i].name, "content"); | 589 | if (charset == NULL) |
581 | if ( (tmp != NULL) && | 590 | { |
582 | (ret == 0) ) | 591 | ret = proc (proc_cls, |
592 | "html", | ||
593 | tagmap[i].type, | ||
594 | EXTRACTOR_METAFORMAT_C_STRING, | ||
595 | "text/plain", | ||
596 | tmp, | ||
597 | strlen (tmp) + 1); | ||
598 | } | ||
599 | else | ||
600 | { | ||
601 | xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, | ||
602 | strlen (tmp), | ||
603 | charset); | ||
604 | if (xtmp != NULL) | ||
583 | { | 605 | { |
584 | if (charset == NULL) | 606 | ret = proc (proc_cls, |
585 | { | 607 | "html", |
586 | ret = proc (proc_cls, | 608 | tagmap[i].type, |
587 | "html", | 609 | EXTRACTOR_METAFORMAT_UTF8, |
588 | tagmap[i].type, | 610 | "text/plain", |
589 | EXTRACTOR_METAFORMAT_C_STRING, | 611 | xtmp, |
590 | "text/plain", | 612 | strlen (xtmp) + 1); |
591 | tmp, | 613 | free (xtmp); |
592 | strlen (tmp) + 1); | ||
593 | } | ||
594 | else | ||
595 | { | ||
596 | xtmp = EXTRACTOR_common_convert_to_utf8 (tmp, | ||
597 | strlen (tmp), | ||
598 | charset); | ||
599 | if (xtmp != NULL) | ||
600 | { | ||
601 | ret = proc (proc_cls, | ||
602 | "html", | ||
603 | tagmap[i].type, | ||
604 | EXTRACTOR_METAFORMAT_UTF8, | ||
605 | "text/plain", | ||
606 | xtmp, | ||
607 | strlen (xtmp) + 1); | ||
608 | free (xtmp); | ||
609 | } | ||
610 | } | ||
611 | } | 614 | } |
612 | if (tmp != NULL) | 615 | } |
613 | free (tmp); | ||
614 | i++; | ||
615 | } | 616 | } |
617 | if (tmp != NULL) | ||
618 | free (tmp); | ||
619 | i++; | ||
620 | } | ||
616 | while (tags != NULL) | 621 | while (tags != NULL) |
622 | { | ||
623 | t = tags; | ||
624 | if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && | ||
625 | (ret == 0) ) | ||
617 | { | 626 | { |
618 | t = tags; | 627 | if (charset == NULL) |
619 | if ( (tagMatch ("title", t->tagStart, t->tagEnd)) && | 628 | { |
620 | (ret == 0) ) | 629 | xtmp = malloc (t->dataEnd - t->dataStart + 1); |
621 | { | 630 | if (xtmp != NULL) |
622 | if (charset == NULL) | 631 | { |
623 | { | 632 | memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); |
624 | xtmp = malloc (t->dataEnd - t->dataStart + 1); | 633 | xtmp[t->dataEnd - t->dataStart] = '\0'; |
625 | if (xtmp != NULL) | 634 | ret = proc (proc_cls, |
626 | { | 635 | "html", |
627 | memcpy (xtmp, t->dataStart, t->dataEnd - t->dataStart); | 636 | EXTRACTOR_METATYPE_TITLE, |
628 | xtmp[t->dataEnd - t->dataStart] = '\0'; | 637 | EXTRACTOR_METAFORMAT_C_STRING, |
629 | ret = proc (proc_cls, | 638 | "text/plain", |
630 | "html", | 639 | xtmp, |
631 | EXTRACTOR_METATYPE_TITLE, | 640 | strlen (xtmp) + 1); |
632 | EXTRACTOR_METAFORMAT_C_STRING, | 641 | free (xtmp); |
633 | "text/plain", | 642 | } |
634 | xtmp, | 643 | } |
635 | strlen (xtmp) + 1); | 644 | else |
636 | free (xtmp); | 645 | { |
637 | } | 646 | xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart, |
638 | } | 647 | t->dataEnd - t->dataStart, |
639 | else | 648 | charset); |
640 | { | 649 | if (xtmp != NULL) |
641 | xtmp = EXTRACTOR_common_convert_to_utf8 (t->dataStart, | 650 | { |
642 | t->dataEnd - t->dataStart, | 651 | ret = proc (proc_cls, |
643 | charset); | 652 | "html", |
644 | if (xtmp != NULL) | 653 | EXTRACTOR_METATYPE_TITLE, |
645 | { | 654 | EXTRACTOR_METAFORMAT_UTF8, |
646 | ret = proc (proc_cls, | 655 | "text/plain", |
647 | "html", | 656 | xtmp, |
648 | EXTRACTOR_METATYPE_TITLE, | 657 | strlen (xtmp) + 1); |
649 | EXTRACTOR_METAFORMAT_UTF8, | 658 | free (xtmp); |
650 | "text/plain", | 659 | } |
651 | xtmp, | 660 | } |
652 | strlen (xtmp) + 1); | ||
653 | free (xtmp); | ||
654 | } | ||
655 | } | ||
656 | } | ||
657 | tags = t->next; | ||
658 | free (t); | ||
659 | } | 661 | } |
662 | tags = t->next; | ||
663 | free (t); | ||
664 | } | ||
660 | if (charset != NULL) | 665 | if (charset != NULL) |
661 | free (charset); | 666 | free (charset); |
662 | return ret; | 667 | return ret; |
663 | } | 668 | } |
669 | |||
670 | |||
664 | #endif | 671 | #endif |
665 | 672 | ||
666 | 673 | ||
@@ -672,9 +679,9 @@ html_gobject_init () | |||
672 | { | 679 | { |
673 | magic = magic_open (MAGIC_MIME_TYPE); | 680 | magic = magic_open (MAGIC_MIME_TYPE); |
674 | if (0 != magic_load (magic, NULL)) | 681 | if (0 != magic_load (magic, NULL)) |
675 | { | 682 | { |
676 | /* FIXME: how to deal with errors? */ | 683 | /* FIXME: how to deal with errors? */ |
677 | } | 684 | } |
678 | } | 685 | } |
679 | 686 | ||
680 | 687 | ||
@@ -685,10 +692,11 @@ void __attribute__ ((destructor)) | |||
685 | html_ltdl_fini () | 692 | html_ltdl_fini () |
686 | { | 693 | { |
687 | if (NULL != magic) | 694 | if (NULL != magic) |
688 | { | 695 | { |
689 | magic_close (magic); | 696 | magic_close (magic); |
690 | magic = NULL; | 697 | magic = NULL; |
691 | } | 698 | } |
692 | } | 699 | } |
693 | 700 | ||
701 | |||
694 | /* end of html_extractor.c */ | 702 | /* end of html_extractor.c */ |