about summary refs log tree commit diff
diff options
context:
space:
mode:
author Christian Grothoff <christian@grothoff.org> 2012-08-10 16:37:53 +0000
committer Christian Grothoff <christian@grothoff.org> 2012-08-10 16:37:53 +0000
commit 72944e8c23a2a0c2592569ffb7a6f76e09258bbb (patch)
tree 613ae9649aa23584564882fbe8dc70b6335fb017
parent 7f4e9149c26051f5e4c4ef6fd56be8f1a69eebc3 (diff)
download libextractor-72944e8c23a2a0c2592569ffb7a6f76e09258bbb.tar.gz
libextractor-72944e8c23a2a0c2592569ffb7a6f76e09258bbb.zip
hacking on OLE plugin
-rw-r--r-- src/plugins/ole2_extractor.c 987
1 files changed, 592 insertions, 395 deletions
diff --git a/src/plugins/ole2_extractor.c b/src/plugins/ole2_extractor.c
index fa6a448..afa451e 100644
--- a/src/plugins/ole2_extractor.c
+++ b/src/plugins/ole2_extractor.c
@@ -1,10 +1,10 @@
1/* 1/*
2 This file is part of libextractor. 2 This file is part of libextractor.
3 (C) 2004, 2005, 2006, 2007, 2009 Vidyut Samanta and Christian Grothoff 3 (C) 2004, 2005, 2006, 2007, 2009, 2012 Vidyut Samanta and Christian Grothoff
4 4
5 libextractor is free software; you can redistribute it and/or modify 5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published 6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your 7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version. 8 option) any later version.
9 9
10 libextractor is distributed in the hope that it will be useful, but 10 libextractor is distributed in the hope that it will be useful, but
@@ -24,16 +24,18 @@
24 Part of this code was borrowed from wordleaker.cpp. See also 24 Part of this code was borrowed from wordleaker.cpp. See also
25 the README file in this directory. 25 the README file in this directory.
26*/ 26*/
27 27/**
28 * @file plugins/ole2_extractor.c
29 * @brief plugin to support OLE2 (DOC, XLS, etc.) files
30 * @author Christian Grothoff
31 */
28#include "platform.h" 32#include "platform.h"
29#include "extractor.h" 33#include "extractor.h"
30#include "convert.h" 34#include "convert.h"
31
32#include <glib-object.h> 35#include <glib-object.h>
33#include <string.h> 36#include <string.h>
34#include <stdio.h> 37#include <stdio.h>
35#include <ctype.h> 38#include <ctype.h>
36
37#include <gsf/gsf-utils.h> 39#include <gsf/gsf-utils.h>
38#include <gsf/gsf-input-memory.h> 40#include <gsf/gsf-input-memory.h>
39#include <gsf/gsf-infile.h> 41#include <gsf/gsf-infile.h>
@@ -42,20 +44,31 @@
42 44
43#define DEBUG_OLE2 0 45#define DEBUG_OLE2 0
44 46
45/* ******************************** main extraction code ************************ */
46 47
48/**
49 * Give the given UTF8 string to LE by calling 'proc'.
50 *
51 * @param proc callback to invoke
52 * @param proc_cls closure for proc
53 * @param phrase metadata string to pass; may be empty, a single space,
54 * just double-quotes or just a space in double quotes;
55 * in those cases, nothing is passed to 'proc'
56 * @param type meta data type to use
57 * @return 1 if 'proc' returned 1, otherwise 0
58 */
47static int 59static int
48addKeyword(EXTRACTOR_MetaDataProcessor proc, 60add_metadata (EXTRACTOR_MetaDataProcessor proc,
49 void *proc_cls, 61 void *proc_cls,
50 const char *phrase, 62 const char *phrase,
51 enum EXTRACTOR_MetaType type) { 63 enum EXTRACTOR_MetaType type)
52 if (strlen(phrase) == 0) 64{
65 if (0 == strlen (phrase))
53 return 0; 66 return 0;
54 if (0 == strcmp(phrase, "\"\"")) 67 if (0 == strcmp (phrase, "\"\""))
55 return 0; 68 return 0;
56 if (0 == strcmp(phrase, "\" \"")) 69 if (0 == strcmp (phrase, "\" \""))
57 return 0; 70 return 0;
58 if (0 == strcmp(phrase, " ")) 71 if (0 == strcmp (phrase, " "))
59 return 0; 72 return 0;
60 return proc (proc_cls, 73 return proc (proc_cls,
61 "ole2", 74 "ole2",
@@ -66,12 +79,26 @@ addKeyword(EXTRACTOR_MetaDataProcessor proc,
66 strlen (phrase) +1); 79 strlen (phrase) +1);
67} 80}
68 81
69typedef struct { 82
70 const char * text; 83/**
84 * Entry in the map from OLE meta type strings
85 * to LE types.
86 */
87struct Matches
88{
89 /**
90 * OLE description.
91 */
92 const char *text;
93
94 /**
95 * Corresponding LE type.
96 */
71 enum EXTRACTOR_MetaType type; 97 enum EXTRACTOR_MetaType type;
72} Matches; 98};
99
73 100
74static Matches tmap[] = { 101static struct Matches tmap[] = {
75 { "Title", EXTRACTOR_METATYPE_TITLE }, 102 { "Title", EXTRACTOR_METATYPE_TITLE },
76 { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT }, 103 { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
77 { "Category", EXTRACTOR_METATYPE_SECTION }, 104 { "Category", EXTRACTOR_METATYPE_SECTION },
@@ -116,513 +143,683 @@ static Matches tmap[] = {
116}; 143};
117 144
118 145
146/**
147 * Closure for 'process_metadata'.
148 */
119struct ProcContext 149struct ProcContext
120{ 150{
151 /**
152 * Function to call for meta data that was found.
153 */
121 EXTRACTOR_MetaDataProcessor proc; 154 EXTRACTOR_MetaDataProcessor proc;
155
156 /**
157 * Closure for 'proc'.
158 */
122 void *proc_cls; 159 void *proc_cls;
160
161 /**
162 * Return value; 0 to continue to extract, 1 if we are done
163 */
123 int ret; 164 int ret;
124}; 165};
125 166
126 167
127static void processMetadata(gpointer key, 168/**
128 gpointer value, 169 * Function invoked by 'gsf_msole_metadata_read' with
129 gpointer user_data) { 170 * metadata found in the document.
171 *
172 * @param key 'const char *' describing the meta data
173 * @param value the UTF8 representation of the meta data
174 * @param user_data our 'struct ProcContext' (closure)
175 */
176static void
177process_metadata (gpointer key,
178 gpointer value,
179 gpointer user_data)
180{
181 const char *type = key;
182 const GsfDocProp *prop = value;
130 struct ProcContext *pc = user_data; 183 struct ProcContext *pc = user_data;
131 const char * type = key; 184 const GValue *gval;
132 const GsfDocProp * prop = value; 185 char *contents;
133 const GValue * gval;
134 char * contents;
135 int pos; 186 int pos;
136 187
137 if ( (key == NULL) || 188 if ( (NULL == key) ||
138 (value == NULL) ) 189 (NULL == value) )
139 return; 190 return;
140 if (pc->ret != 0) 191 if (0 != pc->ret)
141 return; 192 return;
142 gval = gsf_doc_prop_get_val(prop); 193 gval = gsf_doc_prop_get_val (prop);
143 194
144 if (G_VALUE_TYPE(gval) == G_TYPE_STRING) 195 if (G_VALUE_TYPE(gval) == G_TYPE_STRING)
145 { 196 {
146 contents = strdup(g_value_get_string(gval)); 197 contents = strdup (g_value_get_string (gval));
147 } 198 }
148 else 199 else
149 { 200 {
150 /* convert other formats? */ 201 /* convert other formats? */
151 contents = g_strdup_value_contents(gval); 202 contents = g_strdup_value_contents (gval);
152 } 203 }
153 if (contents == NULL) 204 if (NULL == contents)
154 return; 205 return;
155 if ( (strlen(contents) > 0) && 206 if ( (strlen (contents) > 0) &&
156 (contents[strlen(contents)-1] == '\n') ) 207 ('\n' == contents[strlen (contents) - 1]) )
157 contents[strlen(contents)-1] = '\0'; 208 contents [strlen (contents) - 1] = '\0';
158 pos = 0;
159 while (tmap[pos].text != NULL)
160 {
161 if (0 == strcmp(tmap[pos].text,
162 type))
163 break;
164 pos++;
165 }
166 if (0 == strcmp (type, "meta:generator")) 209 if (0 == strcmp (type, "meta:generator"))
167 { 210 {
168 const char * mimetype = "application/vnd.ms-files"; 211 const char *mimetype = "application/vnd.ms-files";
169 if((0 == strncmp(value, "Microsoft Word", 14)) || 212 if ( (0 == strncmp (value, "Microsoft Word", 14)) ||
170 (0 == strncmp(value, "Microsoft Office Word", 21))) 213 (0 == strncmp (value, "Microsoft Office Word", 21)))
171 mimetype = "application/msword"; 214 mimetype = "application/msword";
172 else if((0 == strncmp(value, "Microsoft Excel", 15)) || 215 else if ( (0 == strncmp(value, "Microsoft Excel", 15)) ||
173 (0 == strncmp(value, "Microsoft Office Excel", 22))) 216 (0 == strncmp(value, "Microsoft Office Excel", 22)) )
174 mimetype = "application/vnd.ms-excel"; 217 mimetype = "application/vnd.ms-excel";
175 else if((0 == strncmp(value, "Microsoft PowerPoint", 20)) || 218 else if ( (0 == strncmp(value, "Microsoft PowerPoint", 20)) ||
176 (0 == strncmp(value, "Microsoft Office PowerPoint", 27))) 219 (0 == strncmp(value, "Microsoft Office PowerPoint", 27)) )
177 mimetype = "application/vnd.ms-powerpoint"; 220 mimetype = "application/vnd.ms-powerpoint";
178 else if(0 == strncmp(value, "Microsoft Project", 17)) 221 else if (0 == strncmp(value, "Microsoft Project", 17))
179 mimetype = "application/vnd.ms-project"; 222 mimetype = "application/vnd.ms-project";
180 else if(0 == strncmp(value, "Microsoft Visio", 15)) 223 else if (0 == strncmp(value, "Microsoft Visio", 15))
181 mimetype = "application/vnd.visio"; 224 mimetype = "application/vnd.visio";
182 else if(0 == strncmp(value, "Microsoft Office", 16)) 225 else if (0 == strncmp(value, "Microsoft Office", 16))
183 mimetype = "application/vnd.ms-office"; 226 mimetype = "application/vnd.ms-office";
184 227 if (0 != add_metadata (pc->proc,
185 if (0 != addKeyword(pc->proc, 228 pc->proc_cls,
186 pc->proc_cls, mimetype, EXTRACTOR_METATYPE_MIMETYPE)) 229 mimetype,
230 EXTRACTOR_METATYPE_MIMETYPE))
187 { 231 {
188 free (contents); 232 free (contents);
189 pc->ret = 1; 233 pc->ret = 1;
190 return; 234 return;
191 } 235 }
192 } 236 }
193 if (tmap[pos].text != NULL) 237 for (pos = 0; NULL != tmap[pos].text; pos++)
238 if (0 == strcmp (tmap[pos].text,
239 type))
240 break;
241 if ( (NULL != tmap[pos].text) &&
242 (0 != add_metadata (pc->proc, pc->proc_cls,
243 contents,
244 tmap[pos].type)) )
194 { 245 {
195 if (0 != addKeyword(pc->proc, pc->proc_cls, 246 free (contents);
196 contents, 247 pc->ret = 1;
197 tmap[pos].type)) 248 return;
198 {
199 free (contents);
200 pc->ret = 1;
201 return;
202 }
203 } 249 }
204#if DEBUG_OLE2
205 else
206 printf("No match for type `%s'\n",
207 type);
208#endif
209 free(contents); 250 free(contents);
210} 251}
211 252
212 253
254/**
255 * Function called on (Document)SummaryInformation OLE
256 * streams.
257 *
258 * @param in the input OLE stream
259 * @param proc function to call on meta data found
260 * @param proc_cls closure for proc
261 * @return 0 to continue to extract, 1 if we are done
262 */
213static int 263static int
214process(GsfInput * in, 264process (GsfInput *in,
215 EXTRACTOR_MetaDataProcessor proc, 265 EXTRACTOR_MetaDataProcessor proc,
216 void *proc_cls) 266 void *proc_cls)
217{ 267{
218 struct ProcContext pc; 268 struct ProcContext pc;
219 GsfDocMetaData * sections; 269 GsfDocMetaData *sections;
220 GError * error;
221 270
222 pc.proc = proc; 271 pc.proc = proc;
223 pc.proc_cls = proc_cls; 272 pc.proc_cls = proc_cls;
224 pc.ret = 0; 273 pc.ret = 0;
225 sections = gsf_doc_meta_data_new(); 274 sections = gsf_doc_meta_data_new ();
226 error = gsf_msole_metadata_read(in, sections); 275 if (NULL == gsf_msole_metadata_read (in, sections))
227 if (error == NULL) { 276 {
228 gsf_doc_meta_data_foreach(sections, 277 gsf_doc_meta_data_foreach (sections,
229 &processMetadata, 278 &process_metadata,
230 &pc); 279 &pc);
231 } 280 }
232 g_object_unref(G_OBJECT(sections)); 281 g_object_unref (G_OBJECT (sections));
233 return pc.ret; 282 return pc.ret;
234} 283}
235 284
285
286/**
287 * Function called on SfxDocumentInfo OLE
288 * streams.
289 *
290 * @param in the input OLE stream
291 * @param proc function to call on meta data found
292 * @param proc_cls closure for proc
293 * @return 0 to continue to extract, 1 if we are done
294 */
236static int 295static int
237processSO(GsfInput * src, 296process_star_office (GsfInput *src,
238 EXTRACTOR_MetaDataProcessor proc, 297 EXTRACTOR_MetaDataProcessor proc,
239 void *proc_cls) { 298 void *proc_cls)
240 off_t size = gsf_input_size(src); 299{
241 if ( (size < 0x374) || (size > 4*1024*1024) ) /* == 0x375?? */ 300 off_t size = gsf_input_size (src);
242 return 0; 301
243 char buf[size]; 302 if ( (size < 0x374) ||
244 gsf_input_read(src, size, (unsigned char*) buf); 303 (size > 4*1024*1024) ) /* == 0x375?? */
245 if ( (buf[0] != 0x0F) ||
246 (buf[1] != 0x0) ||
247 (0 != strncmp(&buf[2],
248 "SfxDocumentInfo",
249 strlen("SfxDocumentInfo"))) ||
250 (buf[0x11] != 0x0B) ||
251 (buf[0x13] != 0x00) || /* pw protected! */
252 (buf[0x12] != 0x00) )
253 return 0; 304 return 0;
254 buf[0xd3] = '\0'; 305 {
255 if (buf[0x94] + buf[0x93] > 0) 306 char buf[size];
256 if (0 != addKeyword(proc, proc_cls, 307
257 &buf[0x95], 308 gsf_input_read (src, size, (unsigned char*) buf);
258 EXTRACTOR_METATYPE_TITLE)) 309 if ( (buf[0] != 0x0F) ||
310 (buf[1] != 0x0) ||
311 (0 != strncmp (&buf[2],
312 "SfxDocumentInfo",
313 strlen ("SfxDocumentInfo"))) ||
314 (buf[0x11] != 0x0B) ||
315 (buf[0x13] != 0x00) || /* pw protected! */
316 (buf[0x12] != 0x00) )
317 return 0;
318 buf[0xd3] = '\0';
319 if ( (buf[0x94] + buf[0x93] > 0) &&
320 (0 != add_metadata (proc, proc_cls,
321 &buf[0x95],
322 EXTRACTOR_METATYPE_TITLE)) )
259 return 1; 323 return 1;
260 buf[0x114] = '\0'; 324 buf[0x114] = '\0';
261 if (buf[0xd5] + buf[0xd4] > 0) 325 if ( (buf[0xd5] + buf[0xd4] > 0) &&
262 if (0 != addKeyword(proc, proc_cls, 326 (0 != add_metadata (proc, proc_cls,
263 &buf[0xd6], 327 &buf[0xd6],
264 EXTRACTOR_METATYPE_SUBJECT)) 328 EXTRACTOR_METATYPE_SUBJECT)) _)
265 return 1; 329 return 1;
266 buf[0x215] = '\0'; 330 buf[0x215] = '\0';
267 if (buf[0x115] + buf[0x116] > 0) 331 if ( (buf[0x115] + buf[0x116] > 0) &&
268 if (0 != addKeyword(proc, proc_cls, 332 (0 != add_metadata (proc, proc_cls,
269 &buf[0x117], 333 &buf[0x117],
270 EXTRACTOR_METATYPE_COMMENT)) 334 EXTRACTOR_METATYPE_COMMENT)) )
271 return 1; 335 return 1;
272 buf[0x296] = '\0'; 336 buf[0x296] = '\0';
273 if (buf[0x216] + buf[0x217] > 0) 337 if ( (buf[0x216] + buf[0x217] > 0) &&
274 if (0 != addKeyword(proc, proc_cls, 338 (0 != add_metadata(proc, proc_cls,
275 &buf[0x218], 339 &buf[0x218],
276 EXTRACTOR_METATYPE_KEYWORDS)) 340 EXTRACTOR_METATYPE_KEYWORDS)) )
277 return 1; 341 return 1;
278 /* fixme: do timestamps, 342 /* fixme: do timestamps,
279 mime-type, user-defined info's */ 343 mime-type, user-defined info's */
344 }
280 return 0; 345 return 0;
281} 346}
282 347
283/* *************** wordleaker stuff *************** */
284 348
349/**
350 * We use "__" to translate using iso-639.
351 *
352 * @param a string to translate
353 * @return translated string
354 */
285#define __(a) dgettext("iso-639", a) 355#define __(a) dgettext("iso-639", a)
286 356
287static const char * lidToLanguage( unsigned int lid ) { 357
288 switch ( lid ) { 358/**
289 case 0x0400: 359 * Get the language string for the given language ID (lid)
290 return _("No Proofing"); 360 * value.
291 case 0x0401: 361 *
292 return __("Arabic"); 362 * @param lid language id value
293 case 0x0402: 363 * @return language string corresponding to the lid
294 return __("Bulgarian"); 364 */
295 case 0x0403: 365static const char *
296 return __("Catalan"); 366lid_to_language (unsigned int lid)
297 case 0x0404: 367{
298 return _("Traditional Chinese"); 368 switch (lid)
299 case 0x0804: 369 {
300 return _("Simplified Chinese"); 370 case 0x0400:
301 case 0x0405: 371 return _("No Proofing");
302 return __("Chechen"); 372 case 0x0401:
303 case 0x0406: 373 return __("Arabic");
304 return __("Danish"); 374 case 0x0402:
305 case 0x0407: 375 return __("Bulgarian");
306 return __("German"); 376 case 0x0403:
307 case 0x0807: 377 return __("Catalan");
308 return _("Swiss German"); 378 case 0x0404:
309 case 0x0408: 379 return _("Traditional Chinese");
310 return __("Greek"); 380 case 0x0804:
311 case 0x0409: 381 return _("Simplified Chinese");
312 return _("U.S. English"); 382 case 0x0405:
313 case 0x0809: 383 return __("Chechen");
314 return _("U.K. English"); 384 case 0x0406:
315 case 0x0c09: 385 return __("Danish");
316 return _("Australian English"); 386 case 0x0407:
317 case 0x040a: 387 return __("German");
318 return _("Castilian Spanish"); 388 case 0x0807:
319 case 0x080a: 389 return _("Swiss German");
320 return _("Mexican Spanish"); 390 case 0x0408:
321 case 0x040b: 391 return __("Greek");
322 return __("Finnish"); 392 case 0x0409:
323 case 0x040c: 393 return _("U.S. English");
324 return __("French"); 394 case 0x0809:
325 case 0x080c: 395 return _("U.K. English");
326 return _("Belgian French"); 396 case 0x0c09:
327 case 0x0c0c: 397 return _("Australian English");
328 return _("Canadian French"); 398 case 0x040a:
329 case 0x100c: 399 return _("Castilian Spanish");
330 return _("Swiss French"); 400 case 0x080a:
331 case 0x040d: 401 return _("Mexican Spanish");
332 return __("Hebrew"); 402 case 0x040b:
333 case 0x040e: 403 return __("Finnish");
334 return __("Hungarian"); 404 case 0x040c:
335 case 0x040f: 405 return __("French");
336 return __("Icelandic"); 406 case 0x080c:
337 case 0x0410: 407 return _("Belgian French");
338 return __("Italian"); 408 case 0x0c0c:
339 case 0x0810: 409 return _("Canadian French");
340 return _("Swiss Italian"); 410 case 0x100c:
341 case 0x0411: 411 return _("Swiss French");
342 return __("Japanese"); 412 case 0x040d:
343 case 0x0412: 413 return __("Hebrew");
344 return __("Korean"); 414 case 0x040e:
345 case 0x0413: 415 return __("Hungarian");
346 return __("Dutch"); 416 case 0x040f:
347 case 0x0813: 417 return __("Icelandic");
348 return _("Belgian Dutch"); 418 case 0x0410:
349 case 0x0414: 419 return __("Italian");
350 return _("Norwegian Bokmal"); 420 case 0x0810:
351 case 0x0814: 421 return _("Swiss Italian");
352 return __("Norwegian Nynorsk"); 422 case 0x0411:
353 case 0x0415: 423 return __("Japanese");
354 return __("Polish"); 424 case 0x0412:
355 case 0x0416: 425 return __("Korean");
356 return __("Brazilian Portuguese"); 426 case 0x0413:
357 case 0x0816: 427 return __("Dutch");
358 return __("Portuguese"); 428 case 0x0813:
359 case 0x0417: 429 return _("Belgian Dutch");
360 return _("Rhaeto-Romanic"); 430 case 0x0414:
361 case 0x0418: 431 return _("Norwegian Bokmal");
362 return __("Romanian"); 432 case 0x0814:
363 case 0x0419: 433 return __("Norwegian Nynorsk");
364 return __("Russian"); 434 case 0x0415:
365 case 0x041a: 435 return __("Polish");
366 return _("Croato-Serbian (Latin)"); 436 case 0x0416:
367 case 0x081a: 437 return __("Brazilian Portuguese");
368 return _("Serbo-Croatian (Cyrillic)"); 438 case 0x0816:
369 case 0x041b: 439 return __("Portuguese");
370 return __("Slovak"); 440 case 0x0417:
371 case 0x041c: 441 return _("Rhaeto-Romanic");
442 case 0x0418:
443 return __("Romanian");
444 case 0x0419:
445 return __("Russian");
446 case 0x041a:
447 return _("Croato-Serbian (Latin)");
448 case 0x081a:
449 return _("Serbo-Croatian (Cyrillic)");
450 case 0x041b:
451 return __("Slovak");
452 case 0x041c:
372 return __("Albanian"); 453 return __("Albanian");
373 case 0x041d: 454 case 0x041d:
374 return __("Swedish"); 455 return __("Swedish");
375 case 0x041e: 456 case 0x041e:
376 return __("Thai"); 457 return __("Thai");
377 case 0x041f: 458 case 0x041f:
378 return __("Turkish"); 459 return __("Turkish");
379 case 0x0420: 460 case 0x0420:
380 return __("Urdu"); 461 return __("Urdu");
381 case 0x0421: 462 case 0x0421:
382 return __("Bahasa"); 463 return __("Bahasa");
383 case 0x0422: 464 case 0x0422:
384 return __("Ukrainian"); 465 return __("Ukrainian");
385 case 0x0423: 466 case 0x0423:
386 return __("Byelorussian"); 467 return __("Byelorussian");
387 case 0x0424: 468 case 0x0424:
388 return __("Slovenian"); 469 return __("Slovenian");
389 case 0x0425: 470 case 0x0425:
390 return __("Estonian"); 471 return __("Estonian");
391 case 0x0426: 472 case 0x0426:
392 return __("Latvian"); 473 return __("Latvian");
393 case 0x0427: 474 case 0x0427:
394 return __("Lithuanian"); 475 return __("Lithuanian");
395 case 0x0429: 476 case 0x0429:
396 return _("Farsi"); 477 return _("Farsi");
397 case 0x042D: 478 case 0x042D:
398 return __("Basque"); 479 return __("Basque");
399 case 0x042F: 480 case 0x042F:
400 return __("Macedonian"); 481 return __("Macedonian");
401 case 0x0436: 482 case 0x0436:
402 return __("Afrikaans"); 483 return __("Afrikaans");
403 case 0x043E: 484 case 0x043E:
404 return __("Malayalam"); 485 return __("Malayalam");
405 default: 486 default:
406 return NULL; 487 return NULL;
407 } 488 }
408} 489}
409 490
410 491
492/**
493 * Extract editing history from XTable stream.
494 *
495 * @param stream OLE stream to process
496 * @param lcbSttbSavedBy length of the revision history in bytes
497 * @param fcSttbSavedBy offset of the revision history in the stream
498 * @param proc function to call on meta data found
499 * @param proc_cls closure for proc
500 * @return 0 to continue to extract, 1 if we are done
501 */
411static int 502static int
412history_extract(GsfInput * stream, 503history_extract (GsfInput *stream,
413 unsigned int lcbSttbSavedBy, 504 unsigned int lcbSttbSavedBy,
414 unsigned int fcSttbSavedBy, 505 unsigned int fcSttbSavedBy,
415 EXTRACTOR_MetaDataProcessor proc, 506 EXTRACTOR_MetaDataProcessor proc,
416 void *proc_cls) 507 void *proc_cls)
417{ 508{
418 unsigned int where = 0; 509 unsigned int where;
419 unsigned char * lbuffer; 510 unsigned char *lbuffer;
420 unsigned int i; 511 unsigned int i;
421 unsigned int length; 512 unsigned int length;
422 char * author; 513 char *author;
423 char * filename; 514 char *filename;
424 char * rbuf; 515 char *rbuf;
425 unsigned int nRev; 516 unsigned int nRev;
426 int ret; 517 int ret;
427 518
428 // goto offset of revision 519 /* goto offset of revision information */
429 gsf_input_seek(stream, fcSttbSavedBy, G_SEEK_SET); 520 gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET);
430 if (gsf_input_remaining(stream) < lcbSttbSavedBy) 521 if (gsf_input_remaining (stream) < lcbSttbSavedBy)
431 return 0; 522 return 0;
432 lbuffer = malloc(lcbSttbSavedBy); 523 if (NULL == (lbuffer = malloc (lcbSttbSavedBy)))
433 if (lbuffer == NULL)
434 return 0; 524 return 0;
435 // read all the revision history 525 /* read all the revision history */
436 gsf_input_read(stream, lcbSttbSavedBy, lbuffer); 526 gsf_input_read (stream, lcbSttbSavedBy, lbuffer);
437 // there are n strings, so n/2 revisions (author & file) 527 /* there are n strings, so n/2 revisions (author & file) */
438 nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2; 528 nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
439 where = 6; 529 where = 6;
440 ret = 0; 530 ret = 0;
441 for (i=0; i < nRev; i++) { 531 for (i=0; i < nRev; i++)
442 if (where >= lcbSttbSavedBy) 532 {
443 break; 533 if (where >= lcbSttbSavedBy)
444 length = lbuffer[where++]; 534 break;
445 if ( (where + 2 * length + 2 >= lcbSttbSavedBy) || 535 length = lbuffer[where++];
446 (where + 2 * length + 2 <= where) ) 536 if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
447 break; 537 (where + 2 * length + 2 <= where) )
448 author = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where], 538 break;
449 length * 2, 539 author = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
450 "UTF-16BE"); 540 length * 2,
451 where += length * 2 + 1; 541 "UTF-16BE");
452 length = lbuffer[where++]; 542 where += length * 2 + 1;
453 if ( (where + 2 * length >= lcbSttbSavedBy) || 543 length = lbuffer[where++];
454 (where + 2 * length + 1 <= where) ) { 544 if ( (where + 2 * length >= lcbSttbSavedBy) ||
455 if (author != NULL) 545 (where + 2 * length + 1 <= where) )
456 free(author); 546 {
457 break; 547 if (NULL != author)
548 free(author);
549 break;
550 }
551 filename = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
552 length * 2,
553 "UTF-16BE");
554 where += length * 2 + 1;
555 if ( (NULL != author) &&
556 (NULL != filename) )
557 {
558 if (NULL != (rbuf = malloc (strlen (author) + strlen (filename) + 512)))
559 {
560 snprintf (rbuf,
561 512 + strlen (author) + strlen (filename),
562 _("Revision #%u: Author `%s' worked on `%s'"),
563 i,
564 author,
565 filename);
566 ret = add_metadata (proc, proc_cls,
567 rbuf,
568 EXTRACTOR_METATYPE_REVISION_HISTORY);
569 free (rbuf);
570 }
571 }
572 if (NULL != author)
573 free (author);
574 if (NULL != filename)
575 free (filename);
576 if (0 != ret)
577 break;
458 } 578 }
459 filename = EXTRACTOR_common_convert_to_utf8((const char*) &lbuffer[where], 579 free (lbuffer);
460 length * 2,
461 "UTF-16BE");
462 where += length * 2 + 1;
463 if ( (author != NULL) &&
464 (filename != NULL) )
465 {
466 rbuf = malloc(strlen(author) + strlen(filename) + 512);
467 if (rbuf != NULL)
468 {
469 snprintf(rbuf,
470 512 + strlen(author) + strlen(filename),
471 _("Revision #%u: Author '%s' worked on '%s'"),
472 i, author, filename);
473 ret = addKeyword(proc, proc_cls,
474 rbuf,
475 EXTRACTOR_METATYPE_REVISION_HISTORY);
476 if (rbuf != NULL)
477 free(rbuf);
478 }
479 }
480 if (author != NULL)
481 free(author);
482 if (filename != NULL)
483 free(filename);
484 if (0 != ret)
485 break;
486 }
487 free(lbuffer);
488 return ret; 580 return ret;
489} 581}
490 582
491 583
584/* *************************** custom GSF input method ***************** */
585
586G_BEGIN_DECLS
587#define LE_TYPE_INPUT (le_input_get_type ())
588#define LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj), TYPE_LE_INPUT, LeInput))
589#define LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), TYPE_LE_INPUT, LeInputClass))
590#define IS_LE_INPUT(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj), TYPE_LE_INPUT))
591#define IS_LE_INPUT_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), TYPE_LE_INPUT))
592#define LE_INPUT_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS ((obj), TYPE_LE_INPUT, LeInputClass))
492 593
493const char * 594/**
494EXTRACTOR_ole2_options () 595 * Overall state of an "LeInput" object.
596 */
597typedef struct _LeInput
495{ 598{
496 /* 599 /**
497 Since the Gnome developers think that being unable to 600 * Inherited state from parent (GsfInput).
498 unload plugins is an 'acceptable' limitation, we 601 */
499 require out-of-process execution for plugins depending 602 GsfInput input;
500 on libgsf and other glib-based plugins. 603
501 See also https://bugzilla.gnome.org/show_bug.cgi?id=374940 604 /*< private > */
502 */ 605 /**
503 return "oop-only"; 606 * Private state of the LeInput.
607 */
608 LeInputPrivate *priv;
609} LeInput;
610
611
612/**
613 * Internal state of an "LeInput" object.
614 */
615typedef struct _LeInputPrivate
616{
617 /**
618 * Our extraction context.
619 */
620 struct EXTRACTOR_ExtractContext *ec;
621} LeInputPrivate;
622
623
624/**
625 * LeInput's class state.
626 */
627typedef struct _LeInputClass
628{
629 /**
630 * GsfInput is our parent class.
631 */
632 GsfInputClass parent_class;
633
634 /* Padding for future expansion */
635 void (*_gtk_reserved1) (void);
636 void (*_gtk_reserved2) (void);
637 void (*_gtk_reserved3) (void);
638 void (*_gtk_reserved4) (void);
639} LeInputClass;
640
641
642/**
643 * Required method to obtain the LeInput "type".
644 */
645GType
646le_input_get_type (void) G_GNUC_CONST;
647
648
649/**
650 * Constructor for LeInput objects.
651 *
652 * @param ec extraction context to use
653 * @return the LeInput, NULL on error
654 */
655GsfInput *
656le_input_new (struct EXTRACTOR_ExtractContext *ec);
657G_END_DECLS
658
659
660/**
661 * Macro to create LeInput type definition.
662 */
663G_DEFINE_TYPE (LeInput, le_input, GSF_TYPE_INPUT)
664
665
666/**
667 *
668 */
669static void
670le_input_class_init (LeInputClass *class)
671{
672 // GObjectClass *gobject_class;
673 GsfInputClass *input_class;
674
675 // gobject_class = (GObjectClass *) class;
676 input_class = (GsfInputClass *) class;
677 input_class->read = le_input_read;
678 g_type_class_add_private (class, sizeof (LeInputPrivate));
504} 679}
505 680
506 681
507int 682
508EXTRACTOR_ole2_extract (const char *data, 683/* *********************** end of custom GSF input method ************* */
509 size_t size, 684
510 EXTRACTOR_MetaDataProcessor proc, 685
511 void *proc_cls, 686/**
512 const char *options) 687 * Main entry method for the OLE2 extraction plugin.
688 *
689 * @param ec extraction context provided to the plugin
690 */
691void
692EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
513{ 693{
514 GsfInput * input; 694 GsfInput *input;
515 GsfInfile * infile; 695 GsfInfile *infile;
516 GsfInput * src; 696 GsfInput *src;
517 const char * name; 697 const char *name;
518 int i; 698 unsigned int i;
519 unsigned int lcb; 699 unsigned int lcb;
520 unsigned int fcb; 700 unsigned int fcb;
521 const unsigned char * data512; 701 const unsigned char *data512;
522 unsigned int lid; 702 unsigned int lid;
523 const char * lang; 703 const char *lang;
524 int ret; 704 int ret;
525 705
526 ret = 0;
527 if (size < 512 + 898) 706 if (size < 512 + 898)
528 return 0; /* can hardly be OLE2 */ 707 return; /* can hardly be OLE2 */
529 input = gsf_input_memory_new((const guint8 *) data, 708 if (NULL == (input = gsf_input_memory_new ((const guint8 *) data,
530 (gsf_off_t) size, 709 (gsf_off_t) size,
531 FALSE); 710 FALSE)))
532 if (input == NULL) 711 return;
533 return 0; 712 if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
534 713 {
535 infile = gsf_infile_msole_new(input, NULL); 714 g_object_unref (G_OBJECT (input));
536 if (infile == NULL) { 715 return 0;
537 g_object_unref(G_OBJECT(input));
538 return 0;
539 }
540 lcb = 0;
541 fcb = 0;
542 for (i=0;i<gsf_infile_num_children(infile);i++) {
543 name = gsf_infile_name_by_index (infile, i);
544 src = NULL;
545 if (ret != 0)
546 break;
547 if (name == NULL)
548 continue;
549 if ( (0 == strcmp(name, "\005SummaryInformation"))
550 || (0 == strcmp(name, "\005DocumentSummaryInformation")) ) {
551 src = gsf_infile_child_by_index (infile, i);
552 if (src != NULL)
553 ret = process(src,
554 proc,
555 proc_cls);
556 } 716 }
557 if (0 == strcmp(name, "SfxDocumentInfo")) { 717 ret = 0;
558 src = gsf_infile_child_by_index (infile, i); 718 for (i=0;i<gsf_infile_num_children (infile);i++)
559 if ( (src != NULL) && (ret == 0) ) 719 {
560 ret = processSO(src, 720 if (0 != ret)
561 proc, 721 break;
562 proc_cls); 722 if (NULL == (name = gsf_infile_name_by_index (infile, i)))
723 continue;
724 src = NULL;
725 if ( ( (0 == strcmp(name, "\005SummaryInformation")) ||
726 (0 == strcmp(name, "\005DocumentSummaryInformation")) ) &&
727 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
728 ret = process (src,
729 proc,
730 proc_cls);
731 if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
732 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
733 ret = process_star_office (src,
734 proc,
735 proc_cls);
736 if (NULL != src)
737 g_object_unref (G_OBJECT (src));
563 } 738 }
564 if (src != NULL) 739 if (0 != ret)
565 g_object_unref(G_OBJECT(src)); 740 goto CLEANUP;
566 }
567 741
568 data512 = (const unsigned char*) &data[512]; 742 data512 = (const unsigned char*) &data[512];
569 lid = data512[6] + (data512[7] << 8); 743 lid = data512[6] + (data512[7] << 8);
744 if ( (NULL != (lang = lid_to_language (lid))) &&
745 (0 != (ret = add_metadata (proc, proc_cls,
746 lang,
747 EXTRACTOR_METATYPE_LANGUAGE))) )
748 goto CLEANUP;
570 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24); 749 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16) + (data512[729] << 24);
571 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24); 750 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16) + (data512[725] << 24);
572 lang = lidToLanguage(lid); 751 if (lcb < 6)
573 if ( (lang != NULL) && (ret == 0) ) 752 goto CLEANUP;
574 ret = addKeyword(proc, proc_cls, 753 for (i=0;i<gsf_infile_num_children (infile);i++)
575 lang, 754 {
576 EXTRACTOR_METATYPE_LANGUAGE);
577 if (lcb >= 6) {
578 for (i=0;i<gsf_infile_num_children(infile);i++) {
579 if (ret != 0) 755 if (ret != 0)
580 break; 756 break;
581 name = gsf_infile_name_by_index (infile, i); 757 if (NULL == (name = gsf_infile_name_by_index (infile, i)))
582 if (name == NULL)
583 continue; 758 continue;
584 if ( (0 == strcmp(name, "1Table")) || 759 if ( ( (0 == strcmp (name, "1Table")) ||
585 (0 == strcmp(name, "0Table")) ) { 760 (0 == strcmp (name, "0Table")) ) &&
586 src = gsf_infile_child_by_index (infile, i); 761 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
587 if (src != NULL) { 762 {
588 ret = history_extract(src, 763 ret = history_extract (src,
589 lcb, 764 lcb,
590 fcb, 765 fcb,
591 proc, proc_cls); 766 proc, proc_cls);
592 g_object_unref(G_OBJECT(src)); 767 g_object_unref (G_OBJECT (src));
593 } 768 }
594 }
595 } 769 }
596 } 770 CLEANUP:
597 g_object_unref(G_OBJECT(infile)); 771 g_object_unref (G_OBJECT (infile));
598 g_object_unref(G_OBJECT(input)); 772 g_object_unref (G_OBJECT (input));
599 return ret; 773 return ret;
600} 774}
601 775
602 776
777/**
778 * Custom log function we give to GSF to disable logging.
779 *
780 * @param log_domain unused
781 * @param log_level unused
782 * @param message unused
783 * @param user_data unused
784 */
603static void 785static void
604nolog (const gchar *log_domain, 786nolog (const gchar *log_domain,
605 GLogLevelFlags log_level, 787 GLogLevelFlags log_level,
606 const gchar *message, 788 const gchar *message,
607 gpointer user_data) { 789 gpointer user_data)
790{
791 /* do nothing */
608} 792}
609 793
610 794
611void __attribute__ ((constructor)) ole2_ltdl_init() { 795/**
796 * OLE2 plugin constructor. Initializes glib and gsf, in particular
797 * gsf logging is disabled.
798 */
799void __attribute__ ((constructor))
800ole2_ltdl_init()
801{
612 g_type_init(); 802 g_type_init();
613#ifdef HAVE_GSF_INIT 803#ifdef HAVE_GSF_INIT
614 gsf_init(); 804 gsf_init();
615#endif 805#endif
616 /* disable logging -- thanks, Jody! */ 806 /* disable logging -- thanks, Jody! */
617 g_log_set_handler ("libgsf:msole", G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING, &nolog, NULL); 807 g_log_set_handler ("libgsf:msole",
808 G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,
809 &nolog, NULL);
618} 810}
619 811
620 812
621void __attribute__ ((destructor)) ole2_ltdl_fini() { 813/**
814 * OLE2 plugin destructor. Shutdown of gsf.
815 */
816void __attribute__ ((destructor))
817ole2_ltdl_fini()
818{
622#ifdef HAVE_GSF_INIT 819#ifdef HAVE_GSF_INIT
623 gsf_shutdown(); 820 gsf_shutdown();
624#endif 821#endif
625} 822}
626 823
627/* end of ole2_extractor.c */
628 824
825/* end of ole2_extractor.c */