diff options
Diffstat (limited to 'src/plugins/odf_extractor.c')
-rw-r--r-- | src/plugins/odf_extractor.c | 349 |
1 files changed, 176 insertions, 173 deletions
diff --git a/src/plugins/odf_extractor.c b/src/plugins/odf_extractor.c index 6a68f7c..ff8f9b2 100644 --- a/src/plugins/odf_extractor.c +++ b/src/plugins/odf_extractor.c | |||
@@ -41,12 +41,12 @@ | |||
41 | /** | 41 | /** |
42 | * Mapping from ODF meta data strings to LE types. | 42 | * Mapping from ODF meta data strings to LE types. |
43 | */ | 43 | */ |
44 | struct Matches | 44 | struct Matches |
45 | { | 45 | { |
46 | /** | 46 | /** |
47 | * ODF description. | 47 | * ODF description. |
48 | */ | 48 | */ |
49 | const char * text; | 49 | const char *text; |
50 | 50 | ||
51 | /** | 51 | /** |
52 | * Corresponding LE type. | 52 | * Corresponding LE type. |
@@ -85,7 +85,7 @@ static struct Matches tmap[] = { | |||
85 | * @return NULL if no mimetype could be found, otherwise the mime type | 85 | * @return NULL if no mimetype could be found, otherwise the mime type |
86 | */ | 86 | */ |
87 | static char * | 87 | static char * |
88 | libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile * uf) | 88 | libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile *uf) |
89 | { | 89 | { |
90 | char filename_inzip[MAXFILENAME]; | 90 | char filename_inzip[MAXFILENAME]; |
91 | struct EXTRACTOR_UnzipFileInfo file_info; | 91 | struct EXTRACTOR_UnzipFileInfo file_info; |
@@ -94,61 +94,61 @@ libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile * uf) | |||
94 | 94 | ||
95 | if (EXTRACTOR_UNZIP_OK != | 95 | if (EXTRACTOR_UNZIP_OK != |
96 | EXTRACTOR_common_unzip_go_find_local_file (uf, | 96 | EXTRACTOR_common_unzip_go_find_local_file (uf, |
97 | "mimetype", | 97 | "mimetype", |
98 | 2)) | 98 | 2)) |
99 | return NULL; | 99 | return NULL; |
100 | if (EXTRACTOR_UNZIP_OK != | 100 | if (EXTRACTOR_UNZIP_OK != |
101 | EXTRACTOR_common_unzip_get_current_file_info (uf, | 101 | EXTRACTOR_common_unzip_get_current_file_info (uf, |
102 | &file_info, | 102 | &file_info, |
103 | filename_inzip, | 103 | filename_inzip, |
104 | sizeof (filename_inzip), | 104 | sizeof (filename_inzip), |
105 | NULL, | 105 | NULL, |
106 | 0, | 106 | 0, |
107 | NULL, | 107 | NULL, |
108 | 0)) | 108 | 0)) |
109 | return NULL; | 109 | return NULL; |
110 | if (EXTRACTOR_UNZIP_OK != | 110 | if (EXTRACTOR_UNZIP_OK != |
111 | EXTRACTOR_common_unzip_open_current_file (uf)) | 111 | EXTRACTOR_common_unzip_open_current_file (uf)) |
112 | return NULL; | 112 | return NULL; |
113 | buf_size = file_info.uncompressed_size; | 113 | buf_size = file_info.uncompressed_size; |
114 | if (buf_size > 1024) | 114 | if (buf_size > 1024) |
115 | { | 115 | { |
116 | /* way too large! */ | 116 | /* way too large! */ |
117 | EXTRACTOR_common_unzip_close_current_file (uf); | 117 | EXTRACTOR_common_unzip_close_current_file (uf); |
118 | return NULL; | 118 | return NULL; |
119 | } | 119 | } |
120 | if (NULL == (buf = malloc (1 + buf_size))) | 120 | if (NULL == (buf = malloc (1 + buf_size))) |
121 | { | 121 | { |
122 | /* memory exhausted! */ | 122 | /* memory exhausted! */ |
123 | EXTRACTOR_common_unzip_close_current_file (uf); | 123 | EXTRACTOR_common_unzip_close_current_file (uf); |
124 | return NULL; | 124 | return NULL; |
125 | } | 125 | } |
126 | if (buf_size != | 126 | if (buf_size != |
127 | (size_t) EXTRACTOR_common_unzip_read_current_file (uf, | 127 | (size_t) EXTRACTOR_common_unzip_read_current_file (uf, |
128 | buf, | 128 | buf, |
129 | buf_size)) | 129 | buf_size)) |
130 | { | 130 | { |
131 | free(buf); | 131 | free (buf); |
132 | EXTRACTOR_common_unzip_close_current_file(uf); | 132 | EXTRACTOR_common_unzip_close_current_file (uf); |
133 | return NULL; | 133 | return NULL; |
134 | } | 134 | } |
135 | /* found something */ | 135 | /* found something */ |
136 | buf[buf_size] = '\0'; | 136 | buf[buf_size] = '\0'; |
137 | while ( (0 < buf_size) && | 137 | while ( (0 < buf_size) && |
138 | isspace( (unsigned char) buf[buf_size - 1])) | 138 | isspace ( (unsigned char) buf[buf_size - 1])) |
139 | buf[--buf_size] = '\0'; | 139 | buf[--buf_size] = '\0'; |
140 | if ('\0' == buf[0]) | 140 | if ('\0' == buf[0]) |
141 | { | 141 | { |
142 | free (buf); | 142 | free (buf); |
143 | buf = NULL; | 143 | buf = NULL; |
144 | } | 144 | } |
145 | EXTRACTOR_common_unzip_close_current_file (uf); | 145 | EXTRACTOR_common_unzip_close_current_file (uf); |
146 | return buf; | 146 | return buf; |
147 | } | 147 | } |
148 | 148 | ||
149 | 149 | ||
150 | /** | 150 | /** |
151 | * Main entry method for the ODF extraction plugin. | 151 | * Main entry method for the ODF extraction plugin. |
152 | * | 152 | * |
153 | * @param ec extraction context provided to the plugin | 153 | * @param ec extraction context provided to the plugin |
154 | */ | 154 | */ |
@@ -167,154 +167,157 @@ EXTRACTOR_odf_extract_method (struct EXTRACTOR_ExtractContext *ec) | |||
167 | if (NULL == (uf = EXTRACTOR_common_unzip_open (ec))) | 167 | if (NULL == (uf = EXTRACTOR_common_unzip_open (ec))) |
168 | return; | 168 | return; |
169 | if (NULL != (mimetype = libextractor_oo_getmimetype (uf))) | 169 | if (NULL != (mimetype = libextractor_oo_getmimetype (uf))) |
170 | { | ||
171 | if (0 != ec->proc (ec->cls, | ||
172 | "odf", | ||
173 | EXTRACTOR_METATYPE_MIMETYPE, | ||
174 | EXTRACTOR_METAFORMAT_UTF8, | ||
175 | "text/plain", | ||
176 | mimetype, | ||
177 | strlen (mimetype) + 1)) | ||
170 | { | 178 | { |
171 | if (0 != ec->proc (ec->cls, | 179 | EXTRACTOR_common_unzip_close (uf); |
172 | "odf", | ||
173 | EXTRACTOR_METATYPE_MIMETYPE, | ||
174 | EXTRACTOR_METAFORMAT_UTF8, | ||
175 | "text/plain", | ||
176 | mimetype, | ||
177 | strlen (mimetype) + 1)) | ||
178 | { | ||
179 | EXTRACTOR_common_unzip_close (uf); | ||
180 | free (mimetype); | ||
181 | return; | ||
182 | } | ||
183 | free (mimetype); | 180 | free (mimetype); |
181 | return; | ||
184 | } | 182 | } |
183 | free (mimetype); | ||
184 | } | ||
185 | if (EXTRACTOR_UNZIP_OK != | 185 | if (EXTRACTOR_UNZIP_OK != |
186 | EXTRACTOR_common_unzip_go_find_local_file (uf, | 186 | EXTRACTOR_common_unzip_go_find_local_file (uf, |
187 | METAFILE, | 187 | METAFILE, |
188 | 2)) | 188 | 2)) |
189 | { | 189 | { |
190 | /* metafile not found */ | 190 | /* metafile not found */ |
191 | EXTRACTOR_common_unzip_close (uf); | 191 | EXTRACTOR_common_unzip_close (uf); |
192 | return; | 192 | return; |
193 | } | 193 | } |
194 | if (EXTRACTOR_UNZIP_OK != | 194 | if (EXTRACTOR_UNZIP_OK != |
195 | EXTRACTOR_common_unzip_get_current_file_info (uf, | 195 | EXTRACTOR_common_unzip_get_current_file_info (uf, |
196 | &file_info, | 196 | &file_info, |
197 | filename_inzip, | 197 | filename_inzip, |
198 | sizeof (filename_inzip), | 198 | sizeof (filename_inzip), |
199 | NULL, 0, NULL, 0)) | 199 | NULL, 0, NULL, 0)) |
200 | { | 200 | { |
201 | /* problems accessing metafile */ | 201 | /* problems accessing metafile */ |
202 | EXTRACTOR_common_unzip_close (uf); | 202 | EXTRACTOR_common_unzip_close (uf); |
203 | return; | 203 | return; |
204 | } | 204 | } |
205 | if (EXTRACTOR_UNZIP_OK != | 205 | if (EXTRACTOR_UNZIP_OK != |
206 | EXTRACTOR_common_unzip_open_current_file (uf)) | 206 | EXTRACTOR_common_unzip_open_current_file (uf)) |
207 | { | 207 | { |
208 | /* problems with unzip */ | 208 | /* problems with unzip */ |
209 | EXTRACTOR_common_unzip_close (uf); | 209 | EXTRACTOR_common_unzip_close (uf); |
210 | return; | 210 | return; |
211 | } | 211 | } |
212 | 212 | ||
213 | buf_size = file_info.uncompressed_size; | 213 | buf_size = file_info.uncompressed_size; |
214 | if (buf_size > 128 * 1024) | 214 | if (buf_size > 128 * 1024) |
215 | { | 215 | { |
216 | /* too big to be meta-data! */ | 216 | /* too big to be meta-data! */ |
217 | EXTRACTOR_common_unzip_close_current_file (uf); | 217 | EXTRACTOR_common_unzip_close_current_file (uf); |
218 | EXTRACTOR_common_unzip_close (uf); | 218 | EXTRACTOR_common_unzip_close (uf); |
219 | return; | 219 | return; |
220 | } | 220 | } |
221 | if (NULL == (buf = malloc (buf_size+1))) | 221 | if (NULL == (buf = malloc (buf_size + 1))) |
222 | { | 222 | { |
223 | /* out of memory */ | 223 | /* out of memory */ |
224 | EXTRACTOR_common_unzip_close_current_file (uf); | 224 | EXTRACTOR_common_unzip_close_current_file (uf); |
225 | EXTRACTOR_common_unzip_close (uf); | 225 | EXTRACTOR_common_unzip_close (uf); |
226 | return; | 226 | return; |
227 | } | 227 | } |
228 | if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size)) | 228 | if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size)) |
229 | { | 229 | { |
230 | EXTRACTOR_common_unzip_close_current_file (uf); | 230 | EXTRACTOR_common_unzip_close_current_file (uf); |
231 | goto CLEANUP; | 231 | goto CLEANUP; |
232 | } | 232 | } |
233 | EXTRACTOR_common_unzip_close_current_file (uf); | 233 | EXTRACTOR_common_unzip_close_current_file (uf); |
234 | /* we don't do "proper" parsing of the meta-data but rather use some heuristics | 234 | /* we don't do "proper" parsing of the meta-data but rather use some heuristics |
235 | to get values out that we understand */ | 235 | to get values out that we understand */ |
236 | buf[buf_size] = '\0'; | 236 | buf[buf_size] = '\0'; |
237 | /* printf("%s\n", buf); */ | 237 | /* printf("%s\n", buf); */ |
238 | /* try to find some of the typical OO xml headers */ | 238 | /* try to find some of the typical OO xml headers */ |
239 | if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") != NULL) || | 239 | if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") != |
240 | (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") != NULL) || | 240 | NULL) || |
241 | (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) ) | 241 | (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") != |
242 | NULL) || | ||
243 | (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) ) | ||
244 | { | ||
245 | /* accept as meta-data */ | ||
246 | for (i = 0; NULL != tmap[i].text; i++) | ||
242 | { | 247 | { |
243 | /* accept as meta-data */ | 248 | char *spos; |
244 | for (i = 0; NULL != tmap[i].text; i++) | 249 | char *epos; |
245 | { | 250 | char needle[256]; |
246 | char * spos; | 251 | int oc; |
247 | char * epos; | 252 | |
248 | char needle[256]; | 253 | pbuf = buf; |
249 | int oc; | 254 | |
250 | 255 | while (1) | |
251 | pbuf = buf; | 256 | { |
252 | 257 | strcpy (needle, "<"); | |
253 | while (1) | 258 | strcat (needle, tmap[i].text); |
254 | { | 259 | strcat (needle, ">"); |
255 | strcpy(needle, "<"); | 260 | spos = strstr (pbuf, needle); |
256 | strcat(needle, tmap[i].text); | 261 | if (NULL == spos) |
257 | strcat(needle, ">"); | 262 | { |
258 | spos = strstr(pbuf, needle); | 263 | strcpy (needle, tmap[i].text); |
259 | if (NULL == spos) | 264 | strcat (needle, "=\""); |
260 | { | 265 | spos = strstr (pbuf, needle); |
261 | strcpy(needle, tmap[i].text); | 266 | if (spos == NULL) |
262 | strcat(needle, "=\""); | 267 | break; |
263 | spos = strstr(pbuf, needle); | 268 | spos += strlen (needle); |
264 | if (spos == NULL) | 269 | epos = spos; |
265 | break; | 270 | while ( (epos[0] != '\0') && |
266 | spos += strlen(needle); | 271 | (epos[0] != '"') ) |
267 | epos = spos; | 272 | epos++; |
268 | while ( (epos[0] != '\0') && | 273 | } |
269 | (epos[0] != '"') ) | 274 | else |
270 | epos++; | 275 | { |
271 | } | 276 | oc = 0; |
272 | else | 277 | spos += strlen (needle); |
273 | { | 278 | while ( (spos[0] != '\0') && |
274 | oc = 0; | 279 | ( (spos[0] == '<') || |
275 | spos += strlen(needle); | 280 | (oc > 0) ) ) |
276 | while ( (spos[0] != '\0') && | 281 | { |
277 | ( (spos[0] == '<') || | 282 | if (spos[0] == '<') |
278 | (oc > 0) ) ) | 283 | oc++; |
279 | { | 284 | if (spos[0] == '>') |
280 | if (spos[0] == '<') | 285 | oc--; |
281 | oc++; | 286 | spos++; |
282 | if (spos[0] == '>') | 287 | } |
283 | oc--; | 288 | epos = spos; |
284 | spos++; | 289 | while ( (epos[0] != '\0') && |
285 | } | 290 | (epos[0] != '<') && |
286 | epos = spos; | 291 | (epos[0] != '>') ) |
287 | while ( (epos[0] != '\0') && | 292 | { |
288 | (epos[0] != '<') && | 293 | epos++; |
289 | (epos[0] != '>') ) | 294 | } |
290 | { | 295 | } |
291 | epos++; | 296 | if (spos != epos) |
292 | } | 297 | { |
293 | } | 298 | char key[epos - spos + 1]; |
294 | if (spos != epos) | 299 | |
295 | { | 300 | memcpy (key, spos, epos - spos); |
296 | char key[epos - spos + 1]; | 301 | key[epos - spos] = '\0'; |
297 | 302 | if (0 != ec->proc (ec->cls, | |
298 | memcpy(key, spos, epos-spos); | 303 | "odf", |
299 | key[epos-spos] = '\0'; | 304 | tmap[i].type, |
300 | if (0 != ec->proc (ec->cls, | 305 | EXTRACTOR_METAFORMAT_UTF8, |
301 | "odf", | 306 | "text/plain", |
302 | tmap[i].type, | 307 | key, |
303 | EXTRACTOR_METAFORMAT_UTF8, | 308 | epos - spos + 1)) |
304 | "text/plain", | 309 | goto CLEANUP; |
305 | key, | 310 | pbuf = epos; |
306 | epos - spos + 1)) | 311 | } |
307 | goto CLEANUP; | 312 | else |
308 | pbuf = epos; | 313 | break; |
309 | } | 314 | } |
310 | else | ||
311 | break; | ||
312 | } | ||
313 | } | ||
314 | } | 315 | } |
315 | CLEANUP: | 316 | } |
317 | CLEANUP: | ||
316 | free (buf); | 318 | free (buf); |
317 | EXTRACTOR_common_unzip_close (uf); | 319 | EXTRACTOR_common_unzip_close (uf); |
318 | } | 320 | } |
319 | 321 | ||
322 | |||
320 | /* end of odf_extractor.c */ | 323 | /* end of odf_extractor.c */ |