diff options
author | Christian Grothoff <christian@grothoff.org> | 2012-01-28 21:08:10 +0000 |
---|---|---|
committer | Christian Grothoff <christian@grothoff.org> | 2012-01-28 21:08:10 +0000 |
commit | 6535dad120517e6572f93ffc28b800801bf2781d (patch) | |
tree | 4c4afc385e23e0234c8443f77eb91a2a0e01a335 /src/fs | |
parent | 4b9e92b8f0c33dd655abff1cda89b33aaf90ef16 (diff) | |
download | gnunet-6535dad120517e6572f93ffc28b800801bf2781d.tar.gz gnunet-6535dad120517e6572f93ffc28b800801bf2781d.zip |
-cleaner version of sharetree trim code; should be fast enough to run synchrnously
Diffstat (limited to 'src/fs')
-rw-r--r-- | src/fs/fs_sharetree.c | 408 |
1 files changed, 408 insertions, 0 deletions
diff --git a/src/fs/fs_sharetree.c b/src/fs/fs_sharetree.c new file mode 100644 index 000000000..864be58bf --- /dev/null +++ b/src/fs/fs_sharetree.c | |||
@@ -0,0 +1,408 @@ | |||
1 | /* | ||
2 | This file is part of GNUnet | ||
3 | (C) 2005-2012 Christian Grothoff (and other contributing authors) | ||
4 | |||
5 | GNUnet is free software; you can redistribute it and/or modify | ||
6 | it under the terms of the GNU General Public License as published | ||
7 | by the Free Software Foundation; either version 2, or (at your | ||
8 | option) any later version. | ||
9 | |||
10 | GNUnet is distributed in the hope that it will be useful, but | ||
11 | WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
13 | General Public License for more details. | ||
14 | |||
15 | You should have received a copy of the GNU General Public License | ||
16 | along with GNUnet; see the file COPYING. If not, write to the | ||
17 | Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
18 | Boston, MA 02111-1307, USA. | ||
19 | */ | ||
20 | |||
21 | /** | ||
22 | * @file fs/fs_sharetree.c | ||
23 | * @brief code to manipulate the 'struct GNUNET_FS_ShareTreeItem' tree | ||
24 | * @author LRN | ||
25 | * @author Christian Grothoff | ||
26 | */ | ||
27 | #include "platform.h" | ||
28 | #include "gnunet_fs_service.h" | ||
29 | #include "gnunet_scheduler_lib.h" | ||
30 | #include <pthread.h> | ||
31 | |||
32 | |||
33 | /** | ||
34 | * Entry for each unique keyword to track how often | ||
35 | * it occured. Contains the keyword and the counter. | ||
36 | */ | ||
37 | struct KeywordCounter | ||
38 | { | ||
39 | |||
40 | /** | ||
41 | * This is a doubly-linked list | ||
42 | */ | ||
43 | struct KeywordCounter *prev; | ||
44 | |||
45 | /** | ||
46 | * This is a doubly-linked list | ||
47 | */ | ||
48 | struct KeywordCounter *next; | ||
49 | |||
50 | /** | ||
51 | * Keyword that was found. | ||
52 | */ | ||
53 | const char *value; | ||
54 | |||
55 | /** | ||
56 | * How many files have this keyword? | ||
57 | */ | ||
58 | unsigned int count; | ||
59 | |||
60 | }; | ||
61 | |||
62 | |||
63 | /** | ||
64 | * Aggregate information we keep for meta data in each directory. | ||
65 | */ | ||
66 | struct MetaCounter | ||
67 | { | ||
68 | |||
69 | /** | ||
70 | * This is a doubly-linked list | ||
71 | */ | ||
72 | struct MetaCounter *prev; | ||
73 | |||
74 | /** | ||
75 | * This is a doubly-linked list | ||
76 | */ | ||
77 | struct MetaCounter *next; | ||
78 | |||
79 | /** | ||
80 | * Name of the plugin that provided that piece of metadata | ||
81 | */ | ||
82 | const char *plugin_name; | ||
83 | |||
84 | /** | ||
85 | * MIME-type of the metadata itself | ||
86 | */ | ||
87 | const char *data_mime_type; | ||
88 | |||
89 | /** | ||
90 | * The actual meta data. | ||
91 | */ | ||
92 | const char *data; | ||
93 | |||
94 | /** | ||
95 | * Number of bytes in 'data'. | ||
96 | */ | ||
97 | size_t data_size; | ||
98 | |||
99 | /** | ||
100 | * Type of the data | ||
101 | */ | ||
102 | enum EXTRACTOR_MetaType type; | ||
103 | |||
104 | /** | ||
105 | * Format of the data | ||
106 | */ | ||
107 | enum EXTRACTOR_MetaFormat format; | ||
108 | |||
109 | /** | ||
110 | * How many files have meta entries matching this value? | ||
111 | * (type and format do not have to match). | ||
112 | */ | ||
113 | unsigned int count; | ||
114 | |||
115 | }; | ||
116 | |||
117 | |||
118 | /** | ||
119 | * A structure that forms a singly-linked list that serves as a stack | ||
120 | * for metadata-processing function. | ||
121 | */ | ||
122 | struct TrimContext | ||
123 | { | ||
124 | |||
125 | /** | ||
126 | * Map from the hash over the keyword to an 'struct KeywordCounter *' | ||
127 | * counter that says how often this keyword was | ||
128 | * encountered in the current directory. | ||
129 | */ | ||
130 | struct GNUNET_CONTAINER_MultiHashMap *keywordcounter; | ||
131 | |||
132 | /** | ||
133 | * Map from the hash over the metadata to an 'struct MetaCounter *' | ||
134 | * counter that says how often this metadata was | ||
135 | * encountered in the current directory. | ||
136 | */ | ||
137 | struct GNUNET_CONTAINER_MultiHashMap *metacounter; | ||
138 | |||
139 | /** | ||
140 | * Position we are currently manipulating. | ||
141 | */ | ||
142 | struct GNUNET_FS_ShareTreeItem *pos; | ||
143 | |||
144 | /** | ||
145 | * Number of times an item has to be found to be moved to the parent. | ||
146 | */ | ||
147 | unsigned int move_threshold; | ||
148 | |||
149 | }; | ||
150 | |||
151 | |||
152 | /** | ||
153 | * Add the given keyword to the keyword statistics tracker. | ||
154 | * | ||
155 | * @param cls the multihashmap we store the keyword counters in | ||
156 | * @param keyword the keyword to count | ||
157 | * @param is_mandatory ignored | ||
158 | * @return always GNUNET_OK | ||
159 | */ | ||
160 | static int | ||
161 | add_to_keyword_counter (void *cls, const char *keyword, int is_mandatory) | ||
162 | { | ||
163 | struct GNUNET_CONTAINER_MultiHashMap *mcm = cls; | ||
164 | struct KeywordCounter *cnt; | ||
165 | GNUNET_HashCode hc; | ||
166 | size_t klen; | ||
167 | |||
168 | klen = strlen (keyword) + 1; | ||
169 | GNUNET_CRYPTO_hash (keyword, klen - 1, &hc); | ||
170 | cnt = GNUNET_CONTAINER_multihashmap_get (mcm, &hc); | ||
171 | if (cnt == NULL) | ||
172 | { | ||
173 | cnt = GNUNET_malloc (sizeof (struct KeywordCounter) + klen); | ||
174 | cnt->value = (const char *) &cnt[1]; | ||
175 | memcpy (&cnt[1], keyword, klen); | ||
176 | GNUNET_assert (GNUNET_OK == | ||
177 | GNUNET_CONTAINER_multihashmap_put (mcm, | ||
178 | &hc, cnt, | ||
179 | GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY)); | ||
180 | } | ||
181 | cnt->count++; | ||
182 | return GNUNET_OK; | ||
183 | } | ||
184 | |||
185 | |||
186 | /** | ||
187 | * Function called on each meta data item. Increments the | ||
188 | * respective counter. | ||
189 | * | ||
190 | * @param cls the container multihashmap to update | ||
191 | * @param plugin_name name of the plugin that produced this value; | ||
192 | * special values can be used (i.e. '<zlib>' for zlib being | ||
193 | * used in the main libextractor library and yielding | ||
194 | * meta data). | ||
195 | * @param type libextractor-type describing the meta data | ||
196 | * @param format basic format information about data | ||
197 | * @param data_mime_type mime-type of data (not of the original file); | ||
198 | * can be NULL (if mime-type is not known) | ||
199 | * @param data actual meta-data found | ||
200 | * @param data_len number of bytes in data | ||
201 | * @return GNUNET_OK to continue extracting / iterating | ||
202 | */ | ||
203 | static int | ||
204 | add_to_meta_counter (void *cls, const char *plugin_name, | ||
205 | enum EXTRACTOR_MetaType type, enum EXTRACTOR_MetaFormat format, | ||
206 | const char *data_mime_type, const char *data, size_t data_len) | ||
207 | { | ||
208 | struct GNUNET_CONTAINER_MultiHashMap *map = cls; | ||
209 | GNUNET_HashCode key; | ||
210 | struct MetaCounter *cnt; | ||
211 | |||
212 | GNUNET_CRYPTO_hash (data, data_len, &key); | ||
213 | cnt = GNUNET_CONTAINER_multihashmap_get (map, &key); | ||
214 | if (cnt == NULL) | ||
215 | { | ||
216 | cnt = GNUNET_malloc (sizeof (struct MetaCounter)); | ||
217 | cnt->data = data; | ||
218 | cnt->data_size = data_len; | ||
219 | cnt->plugin_name = plugin_name; | ||
220 | cnt->type = type; | ||
221 | cnt->format = format; | ||
222 | cnt->data_mime_type = data_mime_type; | ||
223 | GNUNET_assert (GNUNET_OK == | ||
224 | GNUNET_CONTAINER_multihashmap_put (map, | ||
225 | &key, cnt, | ||
226 | GNUNET_CONTAINER_MULTIHASHMAPOPTION_UNIQUE_ONLY)); | ||
227 | } | ||
228 | cnt->count++; | ||
229 | return 0; | ||
230 | } | ||
231 | |||
232 | |||
233 | /** | ||
234 | * Remove keywords above the threshold. | ||
235 | * | ||
236 | * @param cls the 'struct TrimContext' with the pos to remove the keywords from | ||
237 | * @param keyword the keyword to check | ||
238 | * @param is_mandatory ignored | ||
239 | * @return always GNUNET_OK | ||
240 | */ | ||
241 | static int | ||
242 | remove_high_frequency_keywords (void *cls, const char *keyword, int is_mandatory) | ||
243 | { | ||
244 | struct TrimContext *tc = cls; | ||
245 | struct KeywordCounter *counter; | ||
246 | GNUNET_HashCode hc; | ||
247 | size_t klen; | ||
248 | |||
249 | klen = strlen (keyword) + 1; | ||
250 | GNUNET_CRYPTO_hash (keyword, klen - 1, &hc); | ||
251 | counter = GNUNET_CONTAINER_multihashmap_get (tc->keywordcounter, &hc); | ||
252 | GNUNET_assert (NULL != counter); | ||
253 | if (counter->count < tc->move_threshold) | ||
254 | return GNUNET_OK; | ||
255 | GNUNET_FS_uri_ksk_remove_keyword (tc->pos->ksk_uri, | ||
256 | counter->value); | ||
257 | return GNUNET_OK; | ||
258 | } | ||
259 | |||
260 | |||
261 | /** | ||
262 | * Move "frequent" keywords over to the target ksk uri, free the | ||
263 | * counters. | ||
264 | * | ||
265 | * @param cls the 'struct TrimContext' | ||
266 | * @param key key of the entry | ||
267 | * @param value the 'struct KeywordCounter' | ||
268 | * @return GNUNET_YES (always) | ||
269 | */ | ||
270 | static int | ||
271 | migrate_and_drop_keywords (void *cls, const GNUNET_HashCode * key, void *value) | ||
272 | { | ||
273 | struct TrimContext *tc = cls; | ||
274 | struct KeywordCounter *counter = value; | ||
275 | |||
276 | if (counter->count >= tc->move_threshold) | ||
277 | GNUNET_FS_uri_ksk_add_keyword (tc->pos->ksk_uri, counter->value, GNUNET_NO); | ||
278 | GNUNET_assert (GNUNET_YES == | ||
279 | GNUNET_CONTAINER_multihashmap_remove (tc->keywordcounter, | ||
280 | key, | ||
281 | counter)); | ||
282 | GNUNET_free (counter); | ||
283 | return GNUNET_YES; | ||
284 | } | ||
285 | |||
286 | |||
287 | /** | ||
288 | * Copy "frequent" metadata items over to the | ||
289 | * target metadata container, free the counters. | ||
290 | * | ||
291 | * @param cls the 'struct TrimContext' | ||
292 | * @param key key of the entry | ||
293 | * @param value the 'struct KeywordCounter' | ||
294 | * @return GNUNET_YES (always) | ||
295 | */ | ||
296 | static int | ||
297 | migrate_and_drop_metadata (void *cls, const GNUNET_HashCode * key, void *value) | ||
298 | { | ||
299 | struct TrimContext *tc = cls; | ||
300 | struct MetaCounter *counter = value; | ||
301 | |||
302 | if (counter->count >= tc->move_threshold) | ||
303 | GNUNET_CONTAINER_meta_data_insert (tc->pos->meta, | ||
304 | counter->plugin_name, | ||
305 | counter->type, | ||
306 | counter->format, | ||
307 | counter->data_mime_type, counter->data, | ||
308 | counter->data_size); | ||
309 | GNUNET_assert (GNUNET_YES == | ||
310 | GNUNET_CONTAINER_multihashmap_remove (tc->metacounter, | ||
311 | key, | ||
312 | counter)); | ||
313 | GNUNET_free (counter); | ||
314 | return GNUNET_YES; | ||
315 | } | ||
316 | |||
317 | |||
318 | /** | ||
319 | * Process a share item tree, moving frequent keywords up and | ||
320 | * copying frequent metadata up. | ||
321 | * | ||
322 | * @param tc trim context with hash maps to use | ||
323 | * @param tree tree to trim | ||
324 | */ | ||
325 | static void | ||
326 | share_tree_trim (struct TrimContext *tc, | ||
327 | struct GNUNET_FS_ShareTreeItem *tree) | ||
328 | { | ||
329 | struct GNUNET_FS_ShareTreeItem *pos; | ||
330 | unsigned int num_children; | ||
331 | |||
332 | /* first, trim all children */ | ||
333 | num_children = 0; | ||
334 | for (pos = tree->children_head; NULL != pos; pos = pos->next) | ||
335 | { | ||
336 | share_tree_trim (tc, pos); | ||
337 | num_children++; | ||
338 | } | ||
339 | |||
340 | /* consider adding filename to directory meta data */ | ||
341 | if (tree->is_directory) | ||
342 | { | ||
343 | const char *user = getenv ("USER"); | ||
344 | if ( (user == NULL) || | ||
345 | (0 != strncasecmp (user, tree->short_filename, strlen(user)))) | ||
346 | { | ||
347 | /* only use filename if it doesn't match $USER */ | ||
348 | GNUNET_CONTAINER_meta_data_insert (tree->meta, "<libgnunetfs>", | ||
349 | EXTRACTOR_METATYPE_GNUNET_ORIGINAL_FILENAME, | ||
350 | EXTRACTOR_METAFORMAT_UTF8, | ||
351 | "text/plain", tree->short_filename, | ||
352 | strlen (tree->short_filename) + 1); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | if (1 >= num_children) | ||
357 | return; /* nothing to trim */ | ||
358 | |||
359 | /* now, count keywords and meta data in children */ | ||
360 | for (pos = tree->children_head; NULL != pos; pos = pos->next) | ||
361 | { | ||
362 | GNUNET_CONTAINER_meta_data_iterate (pos->meta, &add_to_meta_counter, tc->metacounter); | ||
363 | GNUNET_FS_uri_ksk_get_keywords (pos->ksk_uri, &add_to_keyword_counter, tc->keywordcounter); | ||
364 | } | ||
365 | |||
366 | /* calculate threshold for moving keywords / meta data */ | ||
367 | tc->move_threshold = 1 + (num_children / 2); | ||
368 | |||
369 | /* remove high-frequency keywords from children */ | ||
370 | for (pos = tree->children_head; NULL != pos; pos = pos->next) | ||
371 | { | ||
372 | tc->pos = pos; | ||
373 | GNUNET_FS_uri_ksk_get_keywords (pos->ksk_uri, &remove_high_frequency_keywords, tc); | ||
374 | } | ||
375 | |||
376 | /* add high-frequency meta data and keywords to parent */ | ||
377 | tc->pos = tree; | ||
378 | GNUNET_CONTAINER_multihashmap_iterate (tc->keywordcounter, | ||
379 | &migrate_and_drop_keywords, | ||
380 | tc); | ||
381 | GNUNET_CONTAINER_multihashmap_iterate (tc->metacounter, | ||
382 | &migrate_and_drop_metadata, | ||
383 | tc); | ||
384 | } | ||
385 | |||
386 | |||
387 | /** | ||
388 | * Process a share item tree, moving frequent keywords up and | ||
389 | * copying frequent metadata up. | ||
390 | * | ||
391 | * @param toplevel toplevel directory in the tree, returned by the scanner | ||
392 | */ | ||
393 | void | ||
394 | GNUNET_FS_share_tree_trim (struct GNUNET_FS_ShareTreeItem *toplevel) | ||
395 | { | ||
396 | struct TrimContext tc; | ||
397 | |||
398 | if (toplevel == NULL) | ||
399 | return; | ||
400 | tc.keywordcounter = GNUNET_CONTAINER_multihashmap_create (1024); | ||
401 | tc.metacounter = GNUNET_CONTAINER_multihashmap_create (1024); | ||
402 | share_tree_trim (&tc, toplevel); | ||
403 | GNUNET_CONTAINER_multihashmap_destroy (tc.keywordcounter); | ||
404 | GNUNET_CONTAINER_multihashmap_destroy (tc.metacounter); | ||
405 | } | ||
406 | |||
407 | /* end fs_sharetree.c */ | ||
408 | |||