aboutsummaryrefslogtreecommitdiff
path: root/src/plugins/man_extractor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/plugins/man_extractor.c')
-rw-r--r--src/plugins/man_extractor.c292
1 files changed, 292 insertions, 0 deletions
diff --git a/src/plugins/man_extractor.c b/src/plugins/man_extractor.c
new file mode 100644
index 0000000..f074e5b
--- /dev/null
+++ b/src/plugins/man_extractor.c
@@ -0,0 +1,292 @@
/*
     This file is part of libextractor.
     (C) 2002, 2003, 2004, 2009, 2012 Vidyut Samanta and Christian Grothoff

     libextractor is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published
     by the Free Software Foundation; either version 3, or (at your
     option) any later version.

     libextractor is distributed in the hope that it will be useful, but
     WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     General Public License for more details.

     You should have received a copy of the GNU General Public License
     along with libextractor; see the file COPYING.  If not, write to the
     Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     Boston, MA 02111-1307, USA.
 */
/**
 * @file plugins/man_extractor.c
 * @brief plugin to support man pages
 * @author Christian Grothoff
 */
25#include "platform.h"
26#include "extractor.h"
27#include <ctype.h>
28
29
/**
 * Create string from first 'n' characters of 'str'.  See 'strndup'.
 *
 * Unlike 'strndup', exactly 'n' bytes are copied; the caller must
 * ensure that 'str' has at least 'n' readable bytes.
 *
 * @param str input string
 * @param n desired output length (plus 0-termination)
 * @return copy of first 'n' bytes from 'str' plus 0-terminator,
 *         NULL on error (allocation failure or 'n' too large);
 *         the caller owns the result and must 'free' it
 */
static char *
stndup (const char *str, size_t n)
{
  char *copy;

  /* 'n + 1' would wrap around to 0 and the terminator write
     below would land far outside of the allocation */
  if ((size_t) -1 == n)
    return NULL;
  copy = malloc (n + 1);
  if (NULL == copy)
    return NULL;
  memcpy (copy, str, n);
  copy[n] = '\0';
  return copy;
}
48
49
50/**
51 * Give a metadata item to LE. Removes double-quotes and
52 * makes sure we don't pass empty strings or NULL pointers.
53 *
54 * @param type metadata type to use
55 * @param keyword metdata value; freed in the process
56 * @param proc function to call with meta data
57 * @param proc_cls closure for 'proc'
58 * @return 0 to continue extracting, 1 if we are done
59 */
60static int
61add_keyword (enum EXTRACTOR_MetaType type,
62 char *keyword,
63 EXTRACTOR_MetaDataProcessor proc,
64 void *proc_cls)
65{
66 int ret;
67 char *value;
68
69 if (NULL == keyword)
70 return 0;
71 if ( (keyword[0] == '\"') &&
72 (keyword[strlen (keyword) - 1] == '\"') )
73 {
74 keyword[strlen (keyword) - 1] = '\0';
75 value = &keyword[1];
76 }
77 else
78 value = keyword;
79 if (0 == strlen (value))
80 {
81 free (keyword);
82 return 0;
83 }
84 ret = proc (proc_cls,
85 "man",
86 type,
87 EXTRACTOR_METAFORMAT_UTF8,
88 "text/plain",
89 value,
90 strlen (value)+1);
91 free (keyword);
92 return ret;
93}
94
95
/**
 * Find the end of the current token (which may be quoted).
 *
 * A token ends at the first space that is not inside a pair of
 * double-quotes, or at the end of the buffer.
 *
 * @param end on input, offset of the beginning of the current token;
 *        on output, offset of the first byte after the token; set to
 *        size + 1 if the token does not end properly (a double-quote
 *        was opened but never closed)
 * @param buf input buffer with the characters
 * @param size number of bytes in buf
 */
static void
find_end_of_token (size_t *end,
                   const char *buf,
                   const size_t size)
{
  /* only the parity of the number of quotes seen matters, so we
     toggle a flag instead of incrementing a signed counter that
     could overflow on huge inputs */
  int in_quotes = 0;
  size_t off;

  for (off = *end; off < size; off++)
    {
      const char c = buf[off];

      if ( (! in_quotes) && (' ' == c) )
        break;
      if ('\"' == c)
        in_quotes = ! in_quotes;
    }
  *end = in_quotes ? (size + 1) : off;
}
122
123
/**
 * How many bytes do we actually try to scan? (from the beginning
 * of the file).
 */
#define MAX_READ (16 * 1024)


/**
 * Add a keyword to LE.
 *
 * Note that this macro hides control flow: it expects a variable
 * 'ec' (the extraction context) to be in scope at the point of use
 * and executes a plain 'return;' in the enclosing function if
 * 'add_keyword' returns non-zero.  It can thus only be used inside
 * functions returning 'void'.
 *
 * @param t type to use
 * @param s keyword to give to LE (passed on to 'add_keyword',
 *        which frees it)
 */
#define ADD(t,s) do { if (0 != add_keyword ((t), (s), ec->proc, ec->cls)) return; } while (0)
138
139
140/**
141 * Main entry method for the man page extraction plugin.
142 *
143 * @param ec extraction context provided to the plugin
144 */
145void
146EXTRACTOR_man_extract_method (struct EXTRACTOR_ExtractContext *ec)
147{
148 const size_t xlen = strlen (".TH ");
149 size_t pos;
150 size_t xsize;
151 size_t end;
152 void *data;
153 ssize_t size;
154 char *buf;
155
156 if (0 >= (size = ec->read (ec->cls, &data, MAX_READ)))
157 return;
158 buf = data;
159 pos = 0;
160 if (size < xlen)
161 return;
162 /* find actual beginning of the man page (.TH);
163 abort if we find non-printable characters */
164 while ( (pos < size - xlen) &&
165 ( (0 != strncmp (".TH ",
166 &buf[pos],
167 xlen)) ||
168 ( (0 != pos) &&
169 (buf[pos - 1] != '\n') ) ) )
170 {
171 if ( (! isgraph ((unsigned char) buf[pos])) &&
172 (! isspace ((unsigned char) buf[pos])) )
173 return;
174 pos++;
175 }
176 if (0 != strncmp (".TH ", &buf[pos], xlen))
177 return;
178
179 /* find end of ".TH"-line */
180 xsize = pos;
181 while ( (xsize < size) && ('\n' != buf[xsize]) )
182 xsize++;
183 /* limit processing to ".TH" line */
184 size = xsize;
185
186 /* skip over ".TH" */
187 pos += xlen;
188
189 /* first token is the title */
190 end = pos;
191 find_end_of_token (&end, buf, size);
192 if (end > size)
193 return;
194 if (end > pos)
195 {
196 ADD (EXTRACTOR_METATYPE_TITLE, stndup (&buf[pos], end - pos));
197 pos = end + 1;
198 }
199 if (pos >= size)
200 return;
201
202 /* next token is the section */
203 end = pos;
204 find_end_of_token (&end, buf, size);
205 if (end > size)
206 return;
207 if ('\"' == buf[pos])
208 pos++;
209 if ((end - pos >= 1) && (end - pos <= 4))
210 {
211 switch (buf[pos])
212 {
213 case '1':
214 ADD (EXTRACTOR_METATYPE_SECTION,
215 strdup (_("Commands")));
216 break;
217 case '2':
218 ADD (EXTRACTOR_METATYPE_SECTION,
219 strdup (_("System calls")));
220 break;
221 case '3':
222 ADD (EXTRACTOR_METATYPE_SECTION,
223 strdup (_("Library calls")));
224 break;
225 case '4':
226 ADD (EXTRACTOR_METATYPE_SECTION,
227 strdup (_("Special files")));
228 break;
229 case '5':
230 ADD (EXTRACTOR_METATYPE_SECTION,
231 strdup (_("File formats and conventions")));
232 break;
233 case '6':
234 ADD (EXTRACTOR_METATYPE_SECTION,
235 strdup (_("Games")));
236 break;
237 case '7':
238 ADD (EXTRACTOR_METATYPE_SECTION,
239 strdup (_("Conventions and miscellaneous")));
240 break;
241 case '8':
242 ADD (EXTRACTOR_METATYPE_SECTION,
243 strdup (_("System management commands")));
244 break;
245 case '9':
246 ADD (EXTRACTOR_METATYPE_SECTION,
247 strdup (_("Kernel routines")));
248 break;
249 default:
250 ADD (EXTRACTOR_METATYPE_SECTION,
251 stndup (&buf[pos], 1));
252 }
253 pos = end + 1;
254 }
255 end = pos;
256
257 /* next token is the modification date */
258 find_end_of_token (&end, buf, size);
259 if (end > size)
260 return;
261 if (end > pos)
262 {
263 ADD (EXTRACTOR_METATYPE_MODIFICATION_DATE, stndup (&buf[pos], end - pos));
264 pos = end + 1;
265 }
266
267 /* next token is the source of the man page */
268 end = pos;
269 find_end_of_token (&end, buf, size);
270 if (end > size)
271 return;
272 if (end > pos)
273 {
274 ADD (EXTRACTOR_METATYPE_SOURCE,
275 stndup (&buf[pos], end - pos));
276 pos = end + 1;
277 }
278
279 /* last token is the title of the book the man page belongs to */
280 end = pos;
281 find_end_of_token (&end, buf, size);
282 if (end > size)
283 return;
284 if (end > pos)
285 {
286 ADD (EXTRACTOR_METATYPE_BOOK_TITLE,
287 stndup (&buf[pos], end - pos));
288 pos = end + 1;
289 }
290}
291
292/* end of man_extractor.c */