libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

man_extractor.c (7055B)


      1 /*
      2      This file is part of libextractor.
      3      Copyright (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff
      4 
      5      libextractor is free software; you can redistribute it and/or modify
      6      it under the terms of the GNU General Public License as published
      7      by the Free Software Foundation; either version 3, or (at your
      8      option) any later version.
      9 
     10      libextractor is distributed in the hope that it will be useful, but
     11      WITHOUT ANY WARRANTY; without even the implied warranty of
     12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13      General Public License for more details.
     14 
     15      You should have received a copy of the GNU General Public License
     16      along with libextractor; see the file COPYING.  If not, write to the
     17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18      Boston, MA 02110-1301, USA.
     19  */
     20 /**
     21  * @file plugins/man_extractor.c
     22  * @brief plugin to support man pages
     23  * @author Christian Grothoff
     24  */
     25 #include "platform.h"
     26 #include "extractor.h"
     27 #include <ctype.h>
     28 
     29 
     30 /**
     31  * Create string from first 'n' characters of 'str'.  See 'strndup'.
     32  *
     33  * @param str input string
     34  * @param n desired output length (plus 0-termination)
     35  * @return copy of first 'n' bytes from 'str' plus 0-terminator, NULL on error
     36  */
     37 static char *
     38 stndup (const char *str, size_t n)
     39 {
     40   char *tmp;
     41 
     42   if (NULL == (tmp = malloc (n + 1)))
     43     return NULL;
     44   tmp[n] = '\0';
     45   memcpy (tmp, str, n);
     46   return tmp;
     47 }
     48 
     49 
     50 /**
     51  * Give a metadata item to LE.  Removes double-quotes and
     52  * makes sure we don't pass empty strings or NULL pointers.
     53  *
     54  * @param type metadata type to use
     55  * @param keyword metadata value; freed in the process
     56  * @param proc function to call with meta data
     57  * @param proc_cls closure for 'proc'
     58  * @return 0 to continue extracting, 1 if we are done
     59  */
     60 static int
     61 add_keyword (enum EXTRACTOR_MetaType type,
     62              char *keyword,
     63              EXTRACTOR_MetaDataProcessor proc,
     64              void *proc_cls)
     65 {
     66   int ret;
     67   char *value;
     68 
     69   if (NULL == keyword)
     70     return 0;
     71   if ( (keyword[0] == '\"') &&
     72        (keyword[strlen (keyword) - 1] == '\"') )
     73   {
     74     keyword[strlen (keyword) - 1] = '\0';
     75     value = &keyword[1];
     76   }
     77   else
     78     value = keyword;
     79   if (0 == strlen (value))
     80   {
     81     free (keyword);
     82     return 0;
     83   }
     84   ret = proc (proc_cls,
     85               "man",
     86               type,
     87               EXTRACTOR_METAFORMAT_UTF8,
     88               "text/plain",
     89               value,
     90               strlen (value) + 1);
     91   free (keyword);
     92   return ret;
     93 }
     94 
     95 
     96 /**
     97  * Find the end of the current token (which may be quoted).
     98  *
     99  * @param end beginning of the current token, updated to its end; set to size + 1 if the token does not end properly
    100  * @param buf input buffer with the characters
    101  * @param size number of bytes in buf
    102  */
    103 static void
    104 find_end_of_token (size_t *end,
    105                    const char *buf,
    106                    const size_t size)
    107 {
    108   int quot;
    109 
    110   quot = 0;
    111   while ( (*end < size) &&
    112           ( (0 != (quot & 1)) ||
    113             ((' ' != buf[*end])) ) )
    114   {
    115     if ('\"' == buf[*end])
    116       quot++;
    117     (*end)++;
    118   }
    119   if (1 == (quot & 1))
    120     (*end) = size + 1;
    121 }
    122 
    123 
    124 /**
    125  * How many bytes do we actually try to scan? (from the beginning
    126  * of the file).
    127  */
    128 #define MAX_READ (16 * 1024)
    129 
    130 
    131 /**
    132  * Add a keyword to LE.
    133  *
    134  * @param t type to use
    135  * @param s keyword to give to LE
    136  */
    137 #define ADD(t,s) do { if (0 != add_keyword (t, s, ec->proc, ec->cls)) return; \
    138 } while (0)
    139 
    140 
    141 /**
    142  * Main entry method for the man page extraction plugin.
    143  *
    144  * @param ec extraction context provided to the plugin
    145  */
    146 void
    147 EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec)
    148 {
    149   const size_t xlen = strlen (".TH ");
    150   size_t pos;
    151   size_t xsize;
    152   size_t end;
    153   void *data;
    154   ssize_t size;
    155   char *buf;
    156 
    157   if (0 >= (size = ec->read (ec->cls, &data, MAX_READ)))
    158     return;
    159   buf = data;
    160   pos = 0;
    161   if (size < xlen)
    162     return;
    163   /* find actual beginning of the man page (.TH);
    164      abort if we find non-printable characters */
    165   while ( (pos < size - xlen) &&
    166           ( (0 != strncmp (".TH ",
    167                            &buf[pos],
    168                            xlen)) ||
    169             ( (0 != pos) &&
    170               (buf[pos - 1] != '\n') ) ) )
    171   {
    172     if ( (! isgraph ((unsigned char) buf[pos])) &&
    173          (! isspace ((unsigned char) buf[pos])) )
    174       return;
    175     pos++;
    176   }
    177   if (0 != strncmp (".TH ", &buf[pos], xlen))
    178     return;
    179 
    180   /* find end of ".TH"-line */
    181   xsize = pos;
    182   while ( (xsize < size) && ('\n' != buf[xsize]) )
    183     xsize++;
    184   /* limit processing to ".TH" line */
    185   size = xsize;
    186 
    187   /* skip over ".TH" */
    188   pos += xlen;
    189 
    190   /* first token is the title */
    191   end = pos;
    192   find_end_of_token (&end, buf, size);
    193   if (end > size)
    194     return;
    195   if (end > pos)
    196   {
    197     ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
    198     pos = end + 1;
    199   }
    200   if (pos >= size)
    201     return;
    202 
    203   /* next token is the section */
    204   end = pos;
    205   find_end_of_token (&end, buf, size);
    206   if (end > size)
    207     return;
    208   if ('\"' == buf[pos])
    209     pos++;
    210   if ((end - pos >= 1) && (end - pos <= 4))
    211   {
    212     switch (buf[pos])
    213     {
    214     case '1':
    215       ADD (EXTRACTOR_METATYPE_SECTION,
    216            strdup (_ ("Commands")));
    217       break;
    218     case '2':
    219       ADD (EXTRACTOR_METATYPE_SECTION,
    220            strdup (_ ("System calls")));
    221       break;
    222     case '3':
    223       ADD (EXTRACTOR_METATYPE_SECTION,
    224            strdup (_ ("Library calls")));
    225       break;
    226     case '4':
    227       ADD (EXTRACTOR_METATYPE_SECTION,
    228            strdup (_ ("Special files")));
    229       break;
    230     case '5':
    231       ADD (EXTRACTOR_METATYPE_SECTION,
    232            strdup (_ ("File formats and conventions")));
    233       break;
    234     case '6':
    235       ADD (EXTRACTOR_METATYPE_SECTION,
    236            strdup (_ ("Games")));
    237       break;
    238     case '7':
    239       ADD (EXTRACTOR_METATYPE_SECTION,
    240            strdup (_ ("Conventions and miscellaneous")));
    241       break;
    242     case '8':
    243       ADD (EXTRACTOR_METATYPE_SECTION,
    244            strdup (_ ("System management commands")));
    245       break;
    246     case '9':
    247       ADD (EXTRACTOR_METATYPE_SECTION,
    248            strdup (_ ("Kernel routines")));
    249       break;
    250     default:
    251       ADD (EXTRACTOR_METATYPE_SECTION,
    252            stndup (&buf[pos], 1));
    253     }
    254     pos = end + 1;
    255   }
    256   end = pos;
    257 
    258   /* next token is the modification date */
    259   find_end_of_token (&end, buf, size);
    260   if (end > size)
    261     return;
    262   if (end > pos)
    263   {
    264     ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos));
    265     pos = end + 1;
    266   }
    267 
    268   /* next token is the source of the man page */
    269   end = pos;
    270   find_end_of_token (&end, buf, size);
    271   if (end > size)
    272     return;
    273   if (end > pos)
    274   {
    275     ADD (EXTRACTOR_METATYPE_SOURCE,
    276          stndup (&buf[pos], end - pos));
    277     pos = end + 1;
    278   }
    279 
    280   /* last token is the title of the book the man page belongs to */
    281   end = pos;
    282   find_end_of_token (&end, buf, size);
    283   if (end > size)
    284     return;
    285   if (end > pos)
    286   {
    287     ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
    288          stndup (&buf[pos], end - pos));
    289     pos = end + 1;
    290   }
    291 }
    292 
    293 
    294 /* end of man_extractor.c */