man_extractor.c (7055B)
#include "platform.h"
#include "extractor.h"
#include <ctype.h>
#include <stdint.h>
#include <string.h>
/*
     This file is part of libextractor.
     Copyright (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 3, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     Boston, MA 02110-1301, USA.
 */
/**
 * @file plugins/man_extractor.c
 * @brief plugin to support man pages
 * @author Christian Grothoff
 */


/**
 * Create a string from at most the first 'n' characters of 'str'.
 * Behaves like 'strndup': copying stops early at a 0-terminator
 * found within the first 'n' bytes, so no byte behind the end of
 * a shorter string is ever read.
 *
 * @param str input string; need not be 0-terminated if it has at
 *        least 'n' readable bytes
 * @param n maximum number of bytes to copy (excluding 0-termination)
 * @return freshly allocated, 0-terminated copy that the caller must
 *         'free'; NULL if 'str' is NULL, if the required size is not
 *         representable, or if the allocation failed
 */
static char *
stndup (const char *str, size_t n)
{
  const char *nul;
  size_t len;
  char *copy;

  if (NULL == str)
    return NULL;
  /* only the first 'n' bytes are inspected, so this is safe for
     buffers that carry no terminator at all */
  nul = memchr (str, '\0', n);
  len = (NULL != nul) ? (size_t) (nul - str) : n;
  if (SIZE_MAX == len)
    return NULL; /* 'len + 1' would wrap around to 0 */
  if (NULL == (copy = malloc (len + 1)))
    return NULL;
  memcpy (copy, str, len);
  copy[len] = '\0';
  return copy;
}
53 * 54 * @param type metadata type to use 55 * @param keyword metadata value; freed in the process 56 * @param proc function to call with meta data 57 * @param proc_cls closure for 'proc' 58 * @return 0 to continue extracting, 1 if we are done 59 */ 60 static int 61 add_keyword (enum EXTRACTOR_MetaType type, 62 char *keyword, 63 EXTRACTOR_MetaDataProcessor proc, 64 void *proc_cls) 65 { 66 int ret; 67 char *value; 68 69 if (NULL == keyword) 70 return 0; 71 if ( (keyword[0] == '\"') && 72 (keyword[strlen (keyword) - 1] == '\"') ) 73 { 74 keyword[strlen (keyword) - 1] = '\0'; 75 value = &keyword[1]; 76 } 77 else 78 value = keyword; 79 if (0 == strlen (value)) 80 { 81 free (keyword); 82 return 0; 83 } 84 ret = proc (proc_cls, 85 "man", 86 type, 87 EXTRACTOR_METAFORMAT_UTF8, 88 "text/plain", 89 value, 90 strlen (value) + 1); 91 free (keyword); 92 return ret; 93 } 94 95 96 /** 97 * Find the end of the current token (which may be quoted). 98 * 99 * @param end beginning of the current token, updated to its end; set to size + 1 if the token does not end properly 100 * @param buf input buffer with the characters 101 * @param size number of bytes in buf 102 */ 103 static void 104 find_end_of_token (size_t *end, 105 const char *buf, 106 const size_t size) 107 { 108 int quot; 109 110 quot = 0; 111 while ( (*end < size) && 112 ( (0 != (quot & 1)) || 113 ((' ' != buf[*end])) ) ) 114 { 115 if ('\"' == buf[*end]) 116 quot++; 117 (*end)++; 118 } 119 if (1 == (quot & 1)) 120 (*end) = size + 1; 121 } 122 123 124 /** 125 * How many bytes do we actually try to scan? (from the beginning 126 * of the file). 127 */ 128 #define MAX_READ (16 * 1024) 129 130 131 /** 132 * Add a keyword to LE. 133 * 134 * @param t type to use 135 * @param s keyword to give to LE 136 */ 137 #define ADD(t,s) do { if (0 != add_keyword (t, s, ec->proc, ec->cls)) return; \ 138 } while (0) 139 140 141 /** 142 * Main entry method for the man page extraction plugin. 
143 * 144 * @param ec extraction context provided to the plugin 145 */ 146 void 147 EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec) 148 { 149 const size_t xlen = strlen (".TH "); 150 size_t pos; 151 size_t xsize; 152 size_t end; 153 void *data; 154 ssize_t size; 155 char *buf; 156 157 if (0 >= (size = ec->read (ec->cls, &data, MAX_READ))) 158 return; 159 buf = data; 160 pos = 0; 161 if (size < xlen) 162 return; 163 /* find actual beginning of the man page (.TH); 164 abort if we find non-printable characters */ 165 while ( (pos < size - xlen) && 166 ( (0 != strncmp (".TH ", 167 &buf[pos], 168 xlen)) || 169 ( (0 != pos) && 170 (buf[pos - 1] != '\n') ) ) ) 171 { 172 if ( (! isgraph ((unsigned char) buf[pos])) && 173 (! isspace ((unsigned char) buf[pos])) ) 174 return; 175 pos++; 176 } 177 if (0 != strncmp (".TH ", &buf[pos], xlen)) 178 return; 179 180 /* find end of ".TH"-line */ 181 xsize = pos; 182 while ( (xsize < size) && ('\n' != buf[xsize]) ) 183 xsize++; 184 /* limit processing to ".TH" line */ 185 size = xsize; 186 187 /* skip over ".TH" */ 188 pos += xlen; 189 190 /* first token is the title */ 191 end = pos; 192 find_end_of_token (&end, buf, size); 193 if (end > size) 194 return; 195 if (end > pos) 196 { 197 ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos)); 198 pos = end + 1; 199 } 200 if (pos >= size) 201 return; 202 203 /* next token is the section */ 204 end = pos; 205 find_end_of_token (&end, buf, size); 206 if (end > size) 207 return; 208 if ('\"' == buf[pos]) 209 pos++; 210 if ((end - pos >= 1) && (end - pos <= 4)) 211 { 212 switch (buf[pos]) 213 { 214 case '1': 215 ADD (EXTRACTOR_METATYPE_SECTION, 216 strdup (_ ("Commands"))); 217 break; 218 case '2': 219 ADD (EXTRACTOR_METATYPE_SECTION, 220 strdup (_ ("System calls"))); 221 break; 222 case '3': 223 ADD (EXTRACTOR_METATYPE_SECTION, 224 strdup (_ ("Library calls"))); 225 break; 226 case '4': 227 ADD (EXTRACTOR_METATYPE_SECTION, 228 strdup (_ ("Special files"))); 229 
break; 230 case '5': 231 ADD (EXTRACTOR_METATYPE_SECTION, 232 strdup (_ ("File formats and conventions"))); 233 break; 234 case '6': 235 ADD (EXTRACTOR_METATYPE_SECTION, 236 strdup (_ ("Games"))); 237 break; 238 case '7': 239 ADD (EXTRACTOR_METATYPE_SECTION, 240 strdup (_ ("Conventions and miscellaneous"))); 241 break; 242 case '8': 243 ADD (EXTRACTOR_METATYPE_SECTION, 244 strdup (_ ("System management commands"))); 245 break; 246 case '9': 247 ADD (EXTRACTOR_METATYPE_SECTION, 248 strdup (_ ("Kernel routines"))); 249 break; 250 default: 251 ADD (EXTRACTOR_METATYPE_SECTION, 252 stndup (&buf[pos], 1)); 253 } 254 pos = end + 1; 255 } 256 end = pos; 257 258 /* next token is the modification date */ 259 find_end_of_token (&end, buf, size); 260 if (end > size) 261 return; 262 if (end > pos) 263 { 264 ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos)); 265 pos = end + 1; 266 } 267 268 /* next token is the source of the man page */ 269 end = pos; 270 find_end_of_token (&end, buf, size); 271 if (end > size) 272 return; 273 if (end > pos) 274 { 275 ADD (EXTRACTOR_METATYPE_SOURCE, 276 stndup (&buf[pos], end - pos)); 277 pos = end + 1; 278 } 279 280 /* last token is the title of the book the man page belongs to */ 281 end = pos; 282 find_end_of_token (&end, buf, size); 283 if (end > size) 284 return; 285 if (end > pos) 286 { 287 ADD (EXTRACTOR_METATYPE_BOOK_TITLE, 288 stndup (&buf[pos], end - pos)); 289 pos = end + 1; 290 } 291 } 292 293 294 /* end of man_extractor.c */