aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/odf_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/odf_extractor.c')
-rw-r--r--src/plugins/odf_extractor.c349
1 files changed, 176 insertions, 173 deletions
diff --git a/src/plugins/odf_extractor.c b/src/plugins/odf_extractor.c
index 6a68f7c..ff8f9b2 100644
--- a/src/plugins/odf_extractor.c
+++ b/src/plugins/odf_extractor.c
@@ -41,12 +41,12 @@
41/** 41/**
42 * Mapping from ODF meta data strings to LE types. 42 * Mapping from ODF meta data strings to LE types.
43 */ 43 */
44struct Matches 44struct Matches
45{ 45{
46 /** 46 /**
47 * ODF description. 47 * ODF description.
48 */ 48 */
49 const char * text; 49 const char *text;
50 50
51 /** 51 /**
52 * Corresponding LE type. 52 * Corresponding LE type.
@@ -85,7 +85,7 @@ static struct Matches tmap[] = {
85 * @return NULL if no mimetype could be found, otherwise the mime type 85 * @return NULL if no mimetype could be found, otherwise the mime type
86 */ 86 */
87static char * 87static char *
88libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile * uf) 88libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile *uf)
89{ 89{
90 char filename_inzip[MAXFILENAME]; 90 char filename_inzip[MAXFILENAME];
91 struct EXTRACTOR_UnzipFileInfo file_info; 91 struct EXTRACTOR_UnzipFileInfo file_info;
@@ -94,61 +94,61 @@ libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile * uf)
94 94
95 if (EXTRACTOR_UNZIP_OK != 95 if (EXTRACTOR_UNZIP_OK !=
96 EXTRACTOR_common_unzip_go_find_local_file (uf, 96 EXTRACTOR_common_unzip_go_find_local_file (uf,
97 "mimetype", 97 "mimetype",
98 2)) 98 2))
99 return NULL; 99 return NULL;
100 if (EXTRACTOR_UNZIP_OK != 100 if (EXTRACTOR_UNZIP_OK !=
101 EXTRACTOR_common_unzip_get_current_file_info (uf, 101 EXTRACTOR_common_unzip_get_current_file_info (uf,
102 &file_info, 102 &file_info,
103 filename_inzip, 103 filename_inzip,
104 sizeof (filename_inzip), 104 sizeof (filename_inzip),
105 NULL, 105 NULL,
106 0, 106 0,
107 NULL, 107 NULL,
108 0)) 108 0))
109 return NULL; 109 return NULL;
110 if (EXTRACTOR_UNZIP_OK != 110 if (EXTRACTOR_UNZIP_OK !=
111 EXTRACTOR_common_unzip_open_current_file (uf)) 111 EXTRACTOR_common_unzip_open_current_file (uf))
112 return NULL; 112 return NULL;
113 buf_size = file_info.uncompressed_size; 113 buf_size = file_info.uncompressed_size;
114 if (buf_size > 1024) 114 if (buf_size > 1024)
115 { 115 {
116 /* way too large! */ 116 /* way too large! */
117 EXTRACTOR_common_unzip_close_current_file (uf); 117 EXTRACTOR_common_unzip_close_current_file (uf);
118 return NULL; 118 return NULL;
119 } 119 }
120 if (NULL == (buf = malloc (1 + buf_size))) 120 if (NULL == (buf = malloc (1 + buf_size)))
121 { 121 {
122 /* memory exhausted! */ 122 /* memory exhausted! */
123 EXTRACTOR_common_unzip_close_current_file (uf); 123 EXTRACTOR_common_unzip_close_current_file (uf);
124 return NULL; 124 return NULL;
125 } 125 }
126 if (buf_size != 126 if (buf_size !=
127 (size_t) EXTRACTOR_common_unzip_read_current_file (uf, 127 (size_t) EXTRACTOR_common_unzip_read_current_file (uf,
128 buf, 128 buf,
129 buf_size)) 129 buf_size))
130 { 130 {
131 free(buf); 131 free (buf);
132 EXTRACTOR_common_unzip_close_current_file(uf); 132 EXTRACTOR_common_unzip_close_current_file (uf);
133 return NULL; 133 return NULL;
134 } 134 }
135 /* found something */ 135 /* found something */
136 buf[buf_size] = '\0'; 136 buf[buf_size] = '\0';
137 while ( (0 < buf_size) && 137 while ( (0 < buf_size) &&
138 isspace( (unsigned char) buf[buf_size - 1])) 138 isspace ( (unsigned char) buf[buf_size - 1]))
139 buf[--buf_size] = '\0'; 139 buf[--buf_size] = '\0';
140 if ('\0' == buf[0]) 140 if ('\0' == buf[0])
141 { 141 {
142 free (buf); 142 free (buf);
143 buf = NULL; 143 buf = NULL;
144 } 144 }
145 EXTRACTOR_common_unzip_close_current_file (uf); 145 EXTRACTOR_common_unzip_close_current_file (uf);
146 return buf; 146 return buf;
147} 147}
148 148
149 149
150/** 150/**
151 * Main entry method for the ODF extraction plugin. 151 * Main entry method for the ODF extraction plugin.
152 * 152 *
153 * @param ec extraction context provided to the plugin 153 * @param ec extraction context provided to the plugin
154 */ 154 */
@@ -167,154 +167,157 @@ EXTRACTOR_odf_extract_method (struct EXTRACTOR_ExtractContext *ec)
167 if (NULL == (uf = EXTRACTOR_common_unzip_open (ec))) 167 if (NULL == (uf = EXTRACTOR_common_unzip_open (ec)))
168 return; 168 return;
169 if (NULL != (mimetype = libextractor_oo_getmimetype (uf))) 169 if (NULL != (mimetype = libextractor_oo_getmimetype (uf)))
170 {
171 if (0 != ec->proc (ec->cls,
172 "odf",
173 EXTRACTOR_METATYPE_MIMETYPE,
174 EXTRACTOR_METAFORMAT_UTF8,
175 "text/plain",
176 mimetype,
177 strlen (mimetype) + 1))
170 { 178 {
171 if (0 != ec->proc (ec->cls, 179 EXTRACTOR_common_unzip_close (uf);
172 "odf",
173 EXTRACTOR_METATYPE_MIMETYPE,
174 EXTRACTOR_METAFORMAT_UTF8,
175 "text/plain",
176 mimetype,
177 strlen (mimetype) + 1))
178 {
179 EXTRACTOR_common_unzip_close (uf);
180 free (mimetype);
181 return;
182 }
183 free (mimetype); 180 free (mimetype);
181 return;
184 } 182 }
183 free (mimetype);
184 }
185 if (EXTRACTOR_UNZIP_OK != 185 if (EXTRACTOR_UNZIP_OK !=
186 EXTRACTOR_common_unzip_go_find_local_file (uf, 186 EXTRACTOR_common_unzip_go_find_local_file (uf,
187 METAFILE, 187 METAFILE,
188 2)) 188 2))
189 { 189 {
190 /* metafile not found */ 190 /* metafile not found */
191 EXTRACTOR_common_unzip_close (uf); 191 EXTRACTOR_common_unzip_close (uf);
192 return; 192 return;
193 } 193 }
194 if (EXTRACTOR_UNZIP_OK != 194 if (EXTRACTOR_UNZIP_OK !=
195 EXTRACTOR_common_unzip_get_current_file_info (uf, 195 EXTRACTOR_common_unzip_get_current_file_info (uf,
196 &file_info, 196 &file_info,
197 filename_inzip, 197 filename_inzip,
198 sizeof (filename_inzip), 198 sizeof (filename_inzip),
199 NULL, 0, NULL, 0)) 199 NULL, 0, NULL, 0))
200 { 200 {
201 /* problems accessing metafile */ 201 /* problems accessing metafile */
202 EXTRACTOR_common_unzip_close (uf); 202 EXTRACTOR_common_unzip_close (uf);
203 return; 203 return;
204 } 204 }
205 if (EXTRACTOR_UNZIP_OK != 205 if (EXTRACTOR_UNZIP_OK !=
206 EXTRACTOR_common_unzip_open_current_file (uf)) 206 EXTRACTOR_common_unzip_open_current_file (uf))
207 { 207 {
208 /* problems with unzip */ 208 /* problems with unzip */
209 EXTRACTOR_common_unzip_close (uf); 209 EXTRACTOR_common_unzip_close (uf);
210 return; 210 return;
211 } 211 }
212 212
213 buf_size = file_info.uncompressed_size; 213 buf_size = file_info.uncompressed_size;
214 if (buf_size > 128 * 1024) 214 if (buf_size > 128 * 1024)
215 { 215 {
216 /* too big to be meta-data! */ 216 /* too big to be meta-data! */
217 EXTRACTOR_common_unzip_close_current_file (uf); 217 EXTRACTOR_common_unzip_close_current_file (uf);
218 EXTRACTOR_common_unzip_close (uf); 218 EXTRACTOR_common_unzip_close (uf);
219 return; 219 return;
220 } 220 }
221 if (NULL == (buf = malloc (buf_size+1))) 221 if (NULL == (buf = malloc (buf_size + 1)))
222 { 222 {
223 /* out of memory */ 223 /* out of memory */
224 EXTRACTOR_common_unzip_close_current_file (uf); 224 EXTRACTOR_common_unzip_close_current_file (uf);
225 EXTRACTOR_common_unzip_close (uf); 225 EXTRACTOR_common_unzip_close (uf);
226 return; 226 return;
227 } 227 }
228 if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size)) 228 if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size))
229 { 229 {
230 EXTRACTOR_common_unzip_close_current_file (uf); 230 EXTRACTOR_common_unzip_close_current_file (uf);
231 goto CLEANUP; 231 goto CLEANUP;
232 } 232 }
233 EXTRACTOR_common_unzip_close_current_file (uf); 233 EXTRACTOR_common_unzip_close_current_file (uf);
234 /* we don't do "proper" parsing of the meta-data but rather use some heuristics 234 /* we don't do "proper" parsing of the meta-data but rather use some heuristics
235 to get values out that we understand */ 235 to get values out that we understand */
236 buf[buf_size] = '\0'; 236 buf[buf_size] = '\0';
237 /* printf("%s\n", buf); */ 237 /* printf("%s\n", buf); */
238 /* try to find some of the typical OO xml headers */ 238 /* try to find some of the typical OO xml headers */
239 if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") != NULL) || 239 if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") !=
240 (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") != NULL) || 240 NULL) ||
241 (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) ) 241 (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") !=
242 NULL) ||
243 (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) )
244 {
245 /* accept as meta-data */
246 for (i = 0; NULL != tmap[i].text; i++)
242 { 247 {
243 /* accept as meta-data */ 248 char *spos;
244 for (i = 0; NULL != tmap[i].text; i++) 249 char *epos;
245 { 250 char needle[256];
246 char * spos; 251 int oc;
247 char * epos; 252
248 char needle[256]; 253 pbuf = buf;
249 int oc; 254
250 255 while (1)
251 pbuf = buf; 256 {
252 257 strcpy (needle, "<");
253 while (1) 258 strcat (needle, tmap[i].text);
254 { 259 strcat (needle, ">");
255 strcpy(needle, "<"); 260 spos = strstr (pbuf, needle);
256 strcat(needle, tmap[i].text); 261 if (NULL == spos)
257 strcat(needle, ">"); 262 {
258 spos = strstr(pbuf, needle); 263 strcpy (needle, tmap[i].text);
259 if (NULL == spos) 264 strcat (needle, "=\"");
260 { 265 spos = strstr (pbuf, needle);
261 strcpy(needle, tmap[i].text); 266 if (spos == NULL)
262 strcat(needle, "=\""); 267 break;
263 spos = strstr(pbuf, needle); 268 spos += strlen (needle);
264 if (spos == NULL) 269 epos = spos;
265 break; 270 while ( (epos[0] != '\0') &&
266 spos += strlen(needle); 271 (epos[0] != '"') )
267 epos = spos; 272 epos++;
268 while ( (epos[0] != '\0') && 273 }
269 (epos[0] != '"') ) 274 else
270 epos++; 275 {
271 } 276 oc = 0;
272 else 277 spos += strlen (needle);
273 { 278 while ( (spos[0] != '\0') &&
274 oc = 0; 279 ( (spos[0] == '<') ||
275 spos += strlen(needle); 280 (oc > 0) ) )
276 while ( (spos[0] != '\0') && 281 {
277 ( (spos[0] == '<') || 282 if (spos[0] == '<')
278 (oc > 0) ) ) 283 oc++;
279 { 284 if (spos[0] == '>')
280 if (spos[0] == '<') 285 oc--;
281 oc++; 286 spos++;
282 if (spos[0] == '>') 287 }
283 oc--; 288 epos = spos;
284 spos++; 289 while ( (epos[0] != '\0') &&
285 } 290 (epos[0] != '<') &&
286 epos = spos; 291 (epos[0] != '>') )
287 while ( (epos[0] != '\0') && 292 {
288 (epos[0] != '<') && 293 epos++;
289 (epos[0] != '>') ) 294 }
290 { 295 }
291 epos++; 296 if (spos != epos)
292 } 297 {
293 } 298 char key[epos - spos + 1];
294 if (spos != epos) 299
295 { 300 memcpy (key, spos, epos - spos);
296 char key[epos - spos + 1]; 301 key[epos - spos] = '\0';
297 302 if (0 != ec->proc (ec->cls,
298 memcpy(key, spos, epos-spos); 303 "odf",
299 key[epos-spos] = '\0'; 304 tmap[i].type,
300 if (0 != ec->proc (ec->cls, 305 EXTRACTOR_METAFORMAT_UTF8,
301 "odf", 306 "text/plain",
302 tmap[i].type, 307 key,
303 EXTRACTOR_METAFORMAT_UTF8, 308 epos - spos + 1))
304 "text/plain", 309 goto CLEANUP;
305 key, 310 pbuf = epos;
306 epos - spos + 1)) 311 }
307 goto CLEANUP; 312 else
308 pbuf = epos; 313 break;
309 } 314 }
310 else
311 break;
312 }
313 }
314 } 315 }
315 CLEANUP: 316 }
317CLEANUP:
316 free (buf); 318 free (buf);
317 EXTRACTOR_common_unzip_close (uf); 319 EXTRACTOR_common_unzip_close (uf);
318} 320}
319 321
322
320/* end of odf_extractor.c */ 323/* end of odf_extractor.c */