odf_extractor.c (9455B)
1 /* 2 This file is part of libextractor. 3 Copyright (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff 4 5 libextractor is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 libextractor is distributed in the hope that it will be useful, but 11 WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with libextractor; see the file COPYING. If not, write to the 17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 Boston, MA 02110-1301, USA. 19 */ 20 /** 21 * @file plugins/odf_extractor.c 22 * @brief plugin to support ODF files 23 * @author Christian Grothoff 24 */ 25 #include "platform.h" 26 #include <ctype.h> 27 #include "extractor.h" 28 #include "unzip.h" 29 30 /** 31 * Maximum length of a filename allowed inside the ZIP archive. 32 */ 33 #define MAXFILENAME 256 34 35 /** 36 * Name of the file with the meta-data in OO documents. 37 */ 38 #define METAFILE "meta.xml" 39 40 41 /** 42 * Mapping from ODF meta data strings to LE types. 43 */ 44 struct Matches 45 { 46 /** 47 * ODF description. 48 */ 49 const char *text; 50 51 /** 52 * Corresponding LE type. 53 */ 54 enum EXTRACTOR_MetaType type; 55 }; 56 57 58 /** 59 * NULL-terminated map from ODF meta data strings to LE types. 60 */ 61 static struct Matches tmap[] = { 62 { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, 63 { "meta:page-count", EXTRACTOR_METATYPE_PAGE_COUNT }, 64 { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE }, 65 { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE }, 66 { "dc:creator", EXTRACTOR_METATYPE_CREATOR }, 67 { "dc:language", EXTRACTOR_METATYPE_LANGUAGE }, 68 { "dc:title", EXTRACTOR_METATYPE_TITLE }, 69 { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION }, 70 { "dc:subject", EXTRACTOR_METATYPE_SUBJECT }, 71 { "meta:keyword", EXTRACTOR_METATYPE_KEYWORDS }, 72 { "meta:user-defined meta:name=\"Info 1\"", EXTRACTOR_METATYPE_COMMENT }, 73 { "meta:user-defined meta:name=\"Info 2\"", EXTRACTOR_METATYPE_COMMENT }, 74 { "meta:user-defined meta:name=\"Info 3\"", EXTRACTOR_METATYPE_COMMENT }, 75 { "meta:user-defined meta:name=\"Info 4\"", EXTRACTOR_METATYPE_COMMENT }, 76 { NULL, 0 } 77 }; 78 79 80 /** 81 * Obtain the mimetype of the archive by reading the 'mimetype' 82 * file of the ZIP. 83 * 84 * @param uf unzip context to extract the mimetype from 85 * @return NULL if no mimetype could be found, otherwise the mime type 86 */ 87 static char * 88 libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile *uf) 89 { 90 char filename_inzip[MAXFILENAME]; 91 struct EXTRACTOR_UnzipFileInfo file_info; 92 char *buf; 93 size_t buf_size; 94 95 if (EXTRACTOR_UNZIP_OK != 96 EXTRACTOR_common_unzip_go_find_local_file (uf, 97 "mimetype", 98 2)) 99 return NULL; 100 if (EXTRACTOR_UNZIP_OK != 101 EXTRACTOR_common_unzip_get_current_file_info (uf, 102 &file_info, 103 filename_inzip, 104 sizeof (filename_inzip), 105 NULL, 106 0, 107 NULL, 108 0)) 109 return NULL; 110 if (EXTRACTOR_UNZIP_OK != 111 EXTRACTOR_common_unzip_open_current_file (uf)) 112 return NULL; 113 buf_size = file_info.uncompressed_size; 114 if (buf_size > 1024) 115 { 116 /* way too large! */ 117 EXTRACTOR_common_unzip_close_current_file (uf); 118 return NULL; 119 } 120 if (NULL == (buf = malloc (1 + buf_size))) 121 { 122 /* memory exhausted! */ 123 EXTRACTOR_common_unzip_close_current_file (uf); 124 return NULL; 125 } 126 if (buf_size != 127 (size_t) EXTRACTOR_common_unzip_read_current_file (uf, 128 buf, 129 buf_size)) 130 { 131 free (buf); 132 EXTRACTOR_common_unzip_close_current_file (uf); 133 return NULL; 134 } 135 /* found something */ 136 buf[buf_size] = '\0'; 137 while ( (0 < buf_size) && 138 isspace ( (unsigned char) buf[buf_size - 1])) 139 buf[--buf_size] = '\0'; 140 if ('\0' == buf[0]) 141 { 142 free (buf); 143 buf = NULL; 144 } 145 EXTRACTOR_common_unzip_close_current_file (uf); 146 return buf; 147 } 148 149 150 /** 151 * Main entry method for the ODF extraction plugin. 152 * 153 * @param ec extraction context provided to the plugin 154 */ 155 void 156 EXTRACTOR_odf_extract_method (struct EXTRACTOR_ExtractContext *ec) 157 { 158 char filename_inzip[MAXFILENAME]; 159 struct EXTRACTOR_UnzipFile *uf; 160 struct EXTRACTOR_UnzipFileInfo file_info; 161 char *buf; 162 char *pbuf; 163 size_t buf_size; 164 unsigned int i; 165 char *mimetype; 166 167 if (NULL == (uf = EXTRACTOR_common_unzip_open (ec))) 168 return; 169 if (NULL != (mimetype = libextractor_oo_getmimetype (uf))) 170 { 171 if (0 != ec->proc (ec->cls, 172 "odf", 173 EXTRACTOR_METATYPE_MIMETYPE, 174 EXTRACTOR_METAFORMAT_UTF8, 175 "text/plain", 176 mimetype, 177 strlen (mimetype) + 1)) 178 { 179 EXTRACTOR_common_unzip_close (uf); 180 free (mimetype); 181 return; 182 } 183 free (mimetype); 184 } 185 if (EXTRACTOR_UNZIP_OK != 186 EXTRACTOR_common_unzip_go_find_local_file (uf, 187 METAFILE, 188 2)) 189 { 190 /* metafile not found */ 191 EXTRACTOR_common_unzip_close (uf); 192 return; 193 } 194 if (EXTRACTOR_UNZIP_OK != 195 EXTRACTOR_common_unzip_get_current_file_info (uf, 196 &file_info, 197 filename_inzip, 198 sizeof (filename_inzip), 199 NULL, 0, NULL, 0)) 200 { 201 /* problems accessing metafile */ 202 EXTRACTOR_common_unzip_close (uf); 203 return; 204 } 205 if (EXTRACTOR_UNZIP_OK != 206 EXTRACTOR_common_unzip_open_current_file (uf)) 207 { 208 /* problems with unzip */ 209 EXTRACTOR_common_unzip_close (uf); 210 return; 211 } 212 213 buf_size = file_info.uncompressed_size; 214 if (buf_size > 128 * 1024) 215 { 216 /* too big to be meta-data! */ 217 EXTRACTOR_common_unzip_close_current_file (uf); 218 EXTRACTOR_common_unzip_close (uf); 219 return; 220 } 221 if (NULL == (buf = malloc (buf_size + 1))) 222 { 223 /* out of memory */ 224 EXTRACTOR_common_unzip_close_current_file (uf); 225 EXTRACTOR_common_unzip_close (uf); 226 return; 227 } 228 if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size)) 229 { 230 EXTRACTOR_common_unzip_close_current_file (uf); 231 goto CLEANUP; 232 } 233 EXTRACTOR_common_unzip_close_current_file (uf); 234 /* we don't do "proper" parsing of the meta-data but rather use some heuristics 235 to get values out that we understand */ 236 buf[buf_size] = '\0'; 237 /* printf("%s\n", buf); */ 238 /* try to find some of the typical OO xml headers */ 239 if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") != 240 NULL) || 241 (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") != 242 NULL) || 243 (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) ) 244 { 245 /* accept as meta-data */ 246 for (i = 0; NULL != tmap[i].text; i++) 247 { 248 char *spos; 249 char *epos; 250 char needle[256]; 251 int oc; 252 253 pbuf = buf; 254 255 while (1) 256 { 257 strcpy (needle, "<"); 258 strcat (needle, tmap[i].text); 259 strcat (needle, ">"); 260 spos = strstr (pbuf, needle); 261 if (NULL == spos) 262 { 263 strcpy (needle, tmap[i].text); 264 strcat (needle, "=\""); 265 spos = strstr (pbuf, needle); 266 if (spos == NULL) 267 break; 268 spos += strlen (needle); 269 epos = spos; 270 while ( (epos[0] != '\0') && 271 (epos[0] != '"') ) 272 epos++; 273 } 274 else 275 { 276 oc = 0; 277 spos += strlen (needle); 278 while ( (spos[0] != '\0') && 279 ( (spos[0] == '<') || 280 (oc > 0) ) ) 281 { 282 if (spos[0] == '<') 283 oc++; 284 if (spos[0] == '>') 285 oc--; 286 spos++; 287 } 288 epos = spos; 289 while ( (epos[0] != '\0') && 290 (epos[0] != '<') && 291 (epos[0] != '>') ) 292 { 293 epos++; 294 } 295 } 296 if (spos != epos) 297 { 298 char key[epos - spos + 1]; 299 300 memcpy (key, spos, epos - spos); 301 key[epos - spos] = '\0'; 302 if (0 != ec->proc (ec->cls, 303 "odf", 304 tmap[i].type, 305 EXTRACTOR_METAFORMAT_UTF8, 306 "text/plain", 307 key, 308 epos - spos + 1)) 309 goto CLEANUP; 310 pbuf = epos; 311 } 312 else 313 break; 314 } 315 } 316 } 317 CLEANUP: 318 free (buf); 319 EXTRACTOR_common_unzip_close (uf); 320 } 321 322 323 /* end of odf_extractor.c */