libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

odf_extractor.c (9455B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19 */
     20 /**
     21  * @file plugins/odf_extractor.c
     22  * @brief plugin to support ODF files
     23  * @author Christian Grothoff
     24  */
     25 #include "platform.h"
     26 #include <ctype.h>
     27 #include "extractor.h"
     28 #include "unzip.h"
     29 
     30 /**
     31  * Maximum length of a filename allowed inside the ZIP archive.
     32  */
     33 #define MAXFILENAME 256
     34 
     35 /**
     36  * Name of the file with the meta-data in OO documents.
     37  */
     38 #define METAFILE "meta.xml"
     39 
     40 
     41 /**
     42  * Mapping from ODF meta data strings to LE types.
     43  */
     44 struct Matches
     45 {
     46   /**
     47    * ODF description.
     48    */
     49   const char *text;
     50 
     51   /**
     52    * Corresponding LE type.
     53    */
     54   enum EXTRACTOR_MetaType type;
     55 };
     56 
     57 
     58 /**
     59  * NULL-terminated map from ODF meta data strings to LE types.
     60  */
     61 static struct Matches tmap[] = {
     62   { "meta:generator",     EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
     63   { "meta:page-count",    EXTRACTOR_METATYPE_PAGE_COUNT },
     64   { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
     65   { "dc:date",            EXTRACTOR_METATYPE_UNKNOWN_DATE },
     66   { "dc:creator",         EXTRACTOR_METATYPE_CREATOR },
     67   { "dc:language",        EXTRACTOR_METATYPE_LANGUAGE },
     68   { "dc:title",           EXTRACTOR_METATYPE_TITLE },
     69   { "dc:description",     EXTRACTOR_METATYPE_DESCRIPTION },
     70   { "dc:subject",         EXTRACTOR_METATYPE_SUBJECT },
     71   { "meta:keyword",       EXTRACTOR_METATYPE_KEYWORDS },
     72   { "meta:user-defined meta:name=\"Info 1\"", EXTRACTOR_METATYPE_COMMENT },
     73   { "meta:user-defined meta:name=\"Info 2\"", EXTRACTOR_METATYPE_COMMENT },
     74   { "meta:user-defined meta:name=\"Info 3\"", EXTRACTOR_METATYPE_COMMENT },
     75   { "meta:user-defined meta:name=\"Info 4\"", EXTRACTOR_METATYPE_COMMENT },
     76   { NULL, 0 }
     77 };
     78 
     79 
     80 /**
     81  * Obtain the mimetype of the archive by reading the 'mimetype'
     82  * file of the ZIP.
     83  *
     84  * @param uf unzip context to extract the mimetype from
     85  * @return NULL if no mimetype could be found, otherwise the mime type
     86  */
     87 static char *
     88 libextractor_oo_getmimetype (struct EXTRACTOR_UnzipFile *uf)
     89 {
     90   char filename_inzip[MAXFILENAME];
     91   struct EXTRACTOR_UnzipFileInfo file_info;
     92   char *buf;
     93   size_t buf_size;
     94 
     95   if (EXTRACTOR_UNZIP_OK !=
     96       EXTRACTOR_common_unzip_go_find_local_file (uf,
     97                                                  "mimetype",
     98                                                  2))
     99     return NULL;
    100   if (EXTRACTOR_UNZIP_OK !=
    101       EXTRACTOR_common_unzip_get_current_file_info (uf,
    102                                                     &file_info,
    103                                                     filename_inzip,
    104                                                     sizeof (filename_inzip),
    105                                                     NULL,
    106                                                     0,
    107                                                     NULL,
    108                                                     0))
    109     return NULL;
    110   if (EXTRACTOR_UNZIP_OK !=
    111       EXTRACTOR_common_unzip_open_current_file (uf))
    112     return NULL;
    113   buf_size = file_info.uncompressed_size;
    114   if (buf_size > 1024)
    115   {
    116     /* way too large! */
    117     EXTRACTOR_common_unzip_close_current_file (uf);
    118     return NULL;
    119   }
    120   if (NULL == (buf = malloc (1 + buf_size)))
    121   {
    122     /* memory exhausted! */
    123     EXTRACTOR_common_unzip_close_current_file (uf);
    124     return NULL;
    125   }
    126   if (buf_size !=
    127       (size_t) EXTRACTOR_common_unzip_read_current_file (uf,
    128                                                          buf,
    129                                                          buf_size))
    130   {
    131     free (buf);
    132     EXTRACTOR_common_unzip_close_current_file (uf);
    133     return NULL;
    134   }
    135   /* found something */
    136   buf[buf_size] = '\0';
    137   while ( (0 < buf_size) &&
    138           isspace ( (unsigned char) buf[buf_size - 1]))
    139     buf[--buf_size] = '\0';
    140   if ('\0' == buf[0])
    141   {
    142     free (buf);
    143     buf = NULL;
    144   }
    145   EXTRACTOR_common_unzip_close_current_file (uf);
    146   return buf;
    147 }
    148 
    149 
    150 /**
    151  * Main entry method for the ODF extraction plugin.
    152  *
    153  * @param ec extraction context provided to the plugin
    154  */
    155 void
    156 EXTRACTOR_odf_extract_method (struct EXTRACTOR_ExtractContext *ec)
    157 {
    158   char filename_inzip[MAXFILENAME];
    159   struct EXTRACTOR_UnzipFile *uf;
    160   struct EXTRACTOR_UnzipFileInfo file_info;
    161   char *buf;
    162   char *pbuf;
    163   size_t buf_size;
    164   unsigned int i;
    165   char *mimetype;
    166 
    167   if (NULL == (uf = EXTRACTOR_common_unzip_open (ec)))
    168     return;
    169   if (NULL != (mimetype = libextractor_oo_getmimetype (uf)))
    170   {
    171     if (0 != ec->proc (ec->cls,
    172                        "odf",
    173                        EXTRACTOR_METATYPE_MIMETYPE,
    174                        EXTRACTOR_METAFORMAT_UTF8,
    175                        "text/plain",
    176                        mimetype,
    177                        strlen (mimetype) + 1))
    178     {
    179       EXTRACTOR_common_unzip_close (uf);
    180       free (mimetype);
    181       return;
    182     }
    183     free (mimetype);
    184   }
    185   if (EXTRACTOR_UNZIP_OK !=
    186       EXTRACTOR_common_unzip_go_find_local_file (uf,
    187                                                  METAFILE,
    188                                                  2))
    189   {
    190     /* metafile not found */
    191     EXTRACTOR_common_unzip_close (uf);
    192     return;
    193   }
    194   if (EXTRACTOR_UNZIP_OK !=
    195       EXTRACTOR_common_unzip_get_current_file_info (uf,
    196                                                     &file_info,
    197                                                     filename_inzip,
    198                                                     sizeof (filename_inzip),
    199                                                     NULL, 0, NULL, 0))
    200   {
    201     /* problems accessing metafile */
    202     EXTRACTOR_common_unzip_close (uf);
    203     return;
    204   }
    205   if (EXTRACTOR_UNZIP_OK !=
    206       EXTRACTOR_common_unzip_open_current_file (uf))
    207   {
    208     /* problems with unzip */
    209     EXTRACTOR_common_unzip_close (uf);
    210     return;
    211   }
    212 
    213   buf_size = file_info.uncompressed_size;
    214   if (buf_size > 128 * 1024)
    215   {
    216     /* too big to be meta-data! */
    217     EXTRACTOR_common_unzip_close_current_file (uf);
    218     EXTRACTOR_common_unzip_close (uf);
    219     return;
    220   }
    221   if (NULL == (buf = malloc (buf_size + 1)))
    222   {
    223     /* out of memory */
    224     EXTRACTOR_common_unzip_close_current_file (uf);
    225     EXTRACTOR_common_unzip_close (uf);
    226     return;
    227   }
    228   if (buf_size != EXTRACTOR_common_unzip_read_current_file (uf, buf, buf_size))
    229   {
    230     EXTRACTOR_common_unzip_close_current_file (uf);
    231     goto CLEANUP;
    232   }
    233   EXTRACTOR_common_unzip_close_current_file (uf);
    234   /* we don't do "proper" parsing of the meta-data but rather use some heuristics
    235      to get values out that we understand */
    236   buf[buf_size] = '\0';
    237   /* printf("%s\n", buf); */
    238   /* try to find some of the typical OO xml headers */
    239   if ( (strstr (buf, "xmlns:meta=\"http://openoffice.org/2000/meta\"") !=
    240         NULL) ||
    241        (strstr (buf, "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"") !=
    242         NULL) ||
    243        (strstr (buf, "xmlns:xlink=\"http://www.w3.org/1999/xlink\"") != NULL) )
    244   {
    245     /* accept as meta-data */
    246     for (i = 0; NULL  != tmap[i].text; i++)
    247     {
    248       char *spos;
    249       char *epos;
    250       char needle[256];
    251       int oc;
    252 
    253       pbuf = buf;
    254 
    255       while (1)
    256       {
    257         strcpy (needle, "<");
    258         strcat (needle, tmap[i].text);
    259         strcat (needle, ">");
    260         spos = strstr (pbuf, needle);
    261         if (NULL == spos)
    262         {
    263           strcpy (needle, tmap[i].text);
    264           strcat (needle, "=\"");
    265           spos = strstr (pbuf, needle);
    266           if (spos == NULL)
    267             break;
    268           spos += strlen (needle);
    269           epos = spos;
    270           while ( (epos[0] != '\0') &&
    271                   (epos[0] != '"') )
    272             epos++;
    273         }
    274         else
    275         {
    276           oc = 0;
    277           spos += strlen (needle);
    278           while ( (spos[0] != '\0') &&
    279                   ( (spos[0] == '<') ||
    280                     (oc > 0) ) )
    281           {
    282             if (spos[0] == '<')
    283               oc++;
    284             if (spos[0] == '>')
    285               oc--;
    286             spos++;
    287           }
    288           epos = spos;
    289           while ( (epos[0] != '\0') &&
    290                   (epos[0] != '<') &&
    291                   (epos[0] != '>') )
    292           {
    293             epos++;
    294           }
    295         }
    296         if (spos != epos)
    297         {
    298           char key[epos - spos + 1];
    299 
    300           memcpy (key, spos, epos - spos);
    301           key[epos - spos] = '\0';
    302           if (0 != ec->proc (ec->cls,
    303                              "odf",
    304                              tmap[i].type,
    305                              EXTRACTOR_METAFORMAT_UTF8,
    306                              "text/plain",
    307                              key,
    308                              epos - spos + 1))
    309             goto CLEANUP;
    310           pbuf = epos;
    311         }
    312         else
    313           break;
    314       }
    315     }
    316   }
    317 CLEANUP:
    318   free (buf);
    319   EXTRACTOR_common_unzip_close (uf);
    320 }
    321 
    322 
    323 /* end of odf_extractor.c */