diff options
Diffstat (limited to 'src/plugins/man_extractor.c')
-rw-r--r-- | src/plugins/man_extractor.c | 292 |
1 files changed, 292 insertions, 0 deletions
diff --git a/src/plugins/man_extractor.c b/src/plugins/man_extractor.c new file mode 100644 index 0000000..f074e5b --- /dev/null +++ b/src/plugins/man_extractor.c | |||
@@ -0,0 +1,292 @@ | |||
1 | /* | ||
2 | This file is part of libextractor. | ||
3 | (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff | ||
4 | |||
5 | libextractor is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 3, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | libextractor is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with libextractor; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. | ||
19 | */ | ||
20 | /** | ||
21 | * @file plugins/man_extractor.c | ||
22 | * @brief plugin to support man pages | ||
23 | * @author Christian Grothoff | ||
24 | */ | ||
25 | #include "platform.h" | ||
26 | #include "extractor.h" | ||
27 | #include <ctype.h> | ||
28 | |||
29 | |||
/**
 * Duplicate a prefix of a character buffer into a freshly allocated,
 * 0-terminated string (comparable to 'strndup', except that exactly
 * 'n' bytes are always copied, so 'str' must provide at least 'n'
 * readable bytes).
 *
 * @param str buffer to copy from
 * @param n number of bytes to copy (the result occupies 'n + 1' bytes)
 * @return newly allocated copy that the caller must 'free',
 *         NULL if the allocation failed
 */
static char *
stndup (const char *str, size_t n)
{
  char *copy = malloc (n + 1);

  if (NULL != copy)
  {
    memcpy (copy, str, n);
    copy[n] = '\0';
  }
  return copy;
}
48 | |||
49 | |||
50 | /** | ||
51 | * Give a metadata item to LE. Removes double-quotes and | ||
52 | * makes sure we don't pass empty strings or NULL pointers. | ||
53 | * | ||
54 | * @param type metadata type to use | ||
55 | * @param keyword metdata value; freed in the process | ||
56 | * @param proc function to call with meta data | ||
57 | * @param proc_cls closure for 'proc' | ||
58 | * @return 0 to continue extracting, 1 if we are done | ||
59 | */ | ||
60 | static int | ||
61 | add_keyword (enum EXTRACTOR_MetaType type, | ||
62 | char *keyword, | ||
63 | EXTRACTOR_MetaDataProcessor proc, | ||
64 | void *proc_cls) | ||
65 | { | ||
66 | int ret; | ||
67 | char *value; | ||
68 | |||
69 | if (NULL == keyword) | ||
70 | return 0; | ||
71 | if ( (keyword[0] == '\"') && | ||
72 | (keyword[strlen (keyword) - 1] == '\"') ) | ||
73 | { | ||
74 | keyword[strlen (keyword) - 1] = '\0'; | ||
75 | value = &keyword[1]; | ||
76 | } | ||
77 | else | ||
78 | value = keyword; | ||
79 | if (0 == strlen (value)) | ||
80 | { | ||
81 | free (keyword); | ||
82 | return 0; | ||
83 | } | ||
84 | ret = proc (proc_cls, | ||
85 | "man", | ||
86 | type, | ||
87 | EXTRACTOR_METAFORMAT_UTF8, | ||
88 | "text/plain", | ||
89 | value, | ||
90 | strlen (value)+1); | ||
91 | free (keyword); | ||
92 | return ret; | ||
93 | } | ||
94 | |||
95 | |||
/**
 * Locate the end of the token that starts at offset '*end'.  Tokens
 * are separated by single spaces; spaces between a pair of
 * double-quotes belong to the token.
 *
 * @param end on entry the offset of the first character of the token;
 *        on return the offset of the terminating space (or 'size' if
 *        the token runs to the end of the buffer); set to 'size + 1'
 *        if a double-quote was opened but never closed
 * @param buf input buffer with the characters
 * @param size number of bytes in 'buf'
 */
static void
find_end_of_token (size_t *end,
                   const char *buf,
                   const size_t size)
{
  size_t idx;
  int in_quotes;

  in_quotes = 0;
  for (idx = *end; idx < size; idx++)
  {
    const char c = buf[idx];

    if ('\"' == c)
      in_quotes = ! in_quotes;
    else if ( (' ' == c) && (! in_quotes) )
      break;
  }
  /* an unbalanced quote means the token never terminated */
  *end = in_quotes ? size + 1 : idx;
}
122 | |||
123 | |||
/**
 * Upper bound on the number of bytes requested from the beginning
 * of the file; nothing beyond this prefix is scanned.
 */
#define MAX_READ (16 * 1024)


/**
 * Pass a keyword to LE via 'add_keyword'.  Must be used inside a
 * function returning 'void' that has the extraction context 'ec'
 * in scope: if 'add_keyword' returns non-zero, the macro returns
 * from the enclosing function.
 *
 * @param t metadata type to use
 * @param s heap-allocated keyword to give to LE (ownership is passed on)
 */
#define ADD(t,s) do { if (0 != add_keyword ((t), (s), ec->proc, ec->cls)) return; } while (0)
138 | |||
139 | |||
140 | /** | ||
141 | * Main entry method for the man page extraction plugin. | ||
142 | * | ||
143 | * @param ec extraction context provided to the plugin | ||
144 | */ | ||
145 | void | ||
146 | EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec) | ||
147 | { | ||
148 | const size_t xlen = strlen (".TH "); | ||
149 | size_t pos; | ||
150 | size_t xsize; | ||
151 | size_t end; | ||
152 | void *data; | ||
153 | ssize_t size; | ||
154 | char *buf; | ||
155 | |||
156 | if (0 >= (size = ec->read (ec->cls, &data, MAX_READ))) | ||
157 | return; | ||
158 | buf = data; | ||
159 | pos = 0; | ||
160 | if (size < xlen) | ||
161 | return; | ||
162 | /* find actual beginning of the man page (.TH); | ||
163 | abort if we find non-printable characters */ | ||
164 | while ( (pos < size - xlen) && | ||
165 | ( (0 != strncmp (".TH ", | ||
166 | &buf[pos], | ||
167 | xlen)) || | ||
168 | ( (0 != pos) && | ||
169 | (buf[pos - 1] != '\n') ) ) ) | ||
170 | { | ||
171 | if ( (! isgraph ((unsigned char) buf[pos])) && | ||
172 | (! isspace ((unsigned char) buf[pos])) ) | ||
173 | return; | ||
174 | pos++; | ||
175 | } | ||
176 | if (0 != strncmp (".TH ", &buf[pos], xlen)) | ||
177 | return; | ||
178 | |||
179 | /* find end of ".TH"-line */ | ||
180 | xsize = pos; | ||
181 | while ( (xsize < size) && ('\n' != buf[xsize]) ) | ||
182 | xsize++; | ||
183 | /* limit processing to ".TH" line */ | ||
184 | size = xsize; | ||
185 | |||
186 | /* skip over ".TH" */ | ||
187 | pos += xlen; | ||
188 | |||
189 | /* first token is the title */ | ||
190 | end = pos; | ||
191 | find_end_of_token (&end, buf, size); | ||
192 | if (end > size) | ||
193 | return; | ||
194 | if (end > pos) | ||
195 | { | ||
196 | ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos)); | ||
197 | pos = end + 1; | ||
198 | } | ||
199 | if (pos >= size) | ||
200 | return; | ||
201 | |||
202 | /* next token is the section */ | ||
203 | end = pos; | ||
204 | find_end_of_token (&end, buf, size); | ||
205 | if (end > size) | ||
206 | return; | ||
207 | if ('\"' == buf[pos]) | ||
208 | pos++; | ||
209 | if ((end - pos >= 1) && (end - pos <= 4)) | ||
210 | { | ||
211 | switch (buf[pos]) | ||
212 | { | ||
213 | case '1': | ||
214 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
215 | strdup (_("Commands"))); | ||
216 | break; | ||
217 | case '2': | ||
218 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
219 | strdup (_("System calls"))); | ||
220 | break; | ||
221 | case '3': | ||
222 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
223 | strdup (_("Library calls"))); | ||
224 | break; | ||
225 | case '4': | ||
226 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
227 | strdup (_("Special files"))); | ||
228 | break; | ||
229 | case '5': | ||
230 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
231 | strdup (_("File formats and conventions"))); | ||
232 | break; | ||
233 | case '6': | ||
234 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
235 | strdup (_("Games"))); | ||
236 | break; | ||
237 | case '7': | ||
238 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
239 | strdup (_("Conventions and miscellaneous"))); | ||
240 | break; | ||
241 | case '8': | ||
242 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
243 | strdup (_("System management commands"))); | ||
244 | break; | ||
245 | case '9': | ||
246 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
247 | strdup (_("Kernel routines"))); | ||
248 | break; | ||
249 | default: | ||
250 | ADD (EXTRACTOR_METATYPE_SECTION, | ||
251 | stndup (&buf[pos], 1)); | ||
252 | } | ||
253 | pos = end + 1; | ||
254 | } | ||
255 | end = pos; | ||
256 | |||
257 | /* next token is the modification date */ | ||
258 | find_end_of_token (&end, buf, size); | ||
259 | if (end > size) | ||
260 | return; | ||
261 | if (end > pos) | ||
262 | { | ||
263 | ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos)); | ||
264 | pos = end + 1; | ||
265 | } | ||
266 | |||
267 | /* next token is the source of the man page */ | ||
268 | end = pos; | ||
269 | find_end_of_token (&end, buf, size); | ||
270 | if (end > size) | ||
271 | return; | ||
272 | if (end > pos) | ||
273 | { | ||
274 | ADD (EXTRACTOR_METATYPE_SOURCE, | ||
275 | stndup (&buf[pos], end - pos)); | ||
276 | pos = end + 1; | ||
277 | } | ||
278 | |||
279 | /* last token is the title of the book the man page belongs to */ | ||
280 | end = pos; | ||
281 | find_end_of_token (&end, buf, size); | ||
282 | if (end > size) | ||
283 | return; | ||
284 | if (end > pos) | ||
285 | { | ||
286 | ADD (EXTRACTOR_METATYPE_BOOK_TITLE, | ||
287 | stndup (&buf[pos], end - pos)); | ||
288 | pos = end + 1; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | /* end of man_extractor.c */ | ||