aboutsummaryrefslogtreecommitdiff
path: root/src/util/container_bloomfilter.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/util/container_bloomfilter.c')
-rw-r--r--src/util/container_bloomfilter.c677
1 files changed, 677 insertions, 0 deletions
diff --git a/src/util/container_bloomfilter.c b/src/util/container_bloomfilter.c
new file mode 100644
index 000000000..8b76ef8dc
--- /dev/null
+++ b/src/util/container_bloomfilter.c
@@ -0,0 +1,677 @@
1/*
2 This file is part of GNUnet.
3 (C) 2001, 2002, 2003, 2004, 2006, 2008 Christian Grothoff (and other contributing authors)
4
5 GNUnet is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 2, or (at your
8 option) any later version.
9
10 GNUnet is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GNUnet; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA.
19*/
20/**
21 * @file util/container_bloomfilter.c
22 * @brief data structure used to reduce disk accesses.
23 *
24 * The idea basically: Create a signature for each element in the
25 * database. Add those signatures to a bit array. When doing a lookup,
26 * check if the bit array matches the signature of the requested
27 * element. If yes, address the disk, otherwise return 'not found'.
28 *
29 * A property of the bloom filter is that sometimes we will have
30 * a match even if the element is not on the disk (then we do
31 * an unnecessary disk access), but what's most important is that
32 * we never get a single "false negative".
33 *
34 * To be able to delete entries from the bloom filter, we maintain
35 * a 4 bit counter in the file on the drive (we still use only one
36 * bit in memory).
37 *
38 * @author Igor Wronsky
39 * @author Christian Grothoff
40 */
41
42#include "platform.h"
43#include "gnunet_common.h"
44#include "gnunet_container_lib.h"
45#include "gnunet_disk_lib.h"
46
47struct GNUNET_CONTAINER_BloomFilter
48{
49
50 /**
51 * The actual bloomfilter bit array
52 */
53 char *bitArray;
54
55 /**
56 * Filename of the filter
57 */
58 char *filename;
59
60 /**
61 * The bit counter file on disk
62 */
63 int fd;
64
65 /**
66 * How many bits we set for each stored element
67 */
68 unsigned int addressesPerElement;
69
70 /**
71 * Size of bitArray in bytes
72 */
73 unsigned int bitArraySize;
74
75};
76
77
78/**
79 * Sets a bit active in the bitArray. Increment bit-specific
80 * usage counter on disk only if below 4bit max (==15).
81 *
82 * @param bitArray memory area to set the bit in
83 * @param bitIdx which bit to set
84 */
85static void
86setBit (char *bitArray, unsigned int bitIdx)
87{
88 unsigned int arraySlot;
89 unsigned int targetBit;
90
91 arraySlot = bitIdx / 8;
92 targetBit = (1L << (bitIdx % 8));
93 bitArray[arraySlot] |= targetBit;
94}
95
96/**
97 * Clears a bit from bitArray. Bit is cleared from the array
98 * only if the respective usage counter on the disk hits/is zero.
99 *
100 * @param bitArray memory area to set the bit in
101 * @param bitIdx which bit to unset
102 */
103static void
104clearBit (char *bitArray, unsigned int bitIdx)
105{
106 unsigned int slot;
107 unsigned int targetBit;
108
109 slot = bitIdx / 8;
110 targetBit = (1L << (bitIdx % 8));
111 bitArray[slot] = bitArray[slot] & (~targetBit);
112}
113
114/**
115 * Checks if a bit is active in the bitArray
116 *
117 * @param bitArray memory area to set the bit in
118 * @param bitIdx which bit to test
119 * @return GNUNET_YES if the bit is set, GNUNET_NO if not.
120 */
121static int
122testBit (char *bitArray, unsigned int bitIdx)
123{
124 unsigned int slot;
125 unsigned int targetBit;
126
127 slot = bitIdx / 8;
128 targetBit = (1L << (bitIdx % 8));
129 if (bitArray[slot] & targetBit)
130 return GNUNET_YES;
131 else
132 return GNUNET_NO;
133}
134
135/**
136 * Sets a bit active in the bitArray and increments
137 * bit-specific usage counter on disk (but only if
138 * the counter was below 4 bit max (==15)).
139 *
140 * @param bitArray memory area to set the bit in
141 * @param bitIdx which bit to test
142 * @param fd A file to keep the 4 bit address usage counters in
143 */
144static void
145incrementBit (char *bitArray, unsigned int bitIdx, int fd)
146{
147 unsigned int fileSlot;
148 unsigned char value;
149 unsigned int high;
150 unsigned int low;
151 unsigned int targetLoc;
152
153 setBit (bitArray, bitIdx);
154 if (fd == -1)
155 return;
156 /* Update the counter file on disk */
157 fileSlot = bitIdx / 2;
158 targetLoc = bitIdx % 2;
159
160 GNUNET_assert (fileSlot == (unsigned int) LSEEK (fd, fileSlot, SEEK_SET));
161 if (1 != READ (fd, &value, 1))
162 value = 0;
163 low = value & 0xF;
164 high = (value & (~0xF)) >> 4;
165
166 if (targetLoc == 0)
167 {
168 if (low < 0xF)
169 low++;
170 }
171 else
172 {
173 if (high < 0xF)
174 high++;
175 }
176 value = ((high << 4) | low);
177 GNUNET_assert (fileSlot == (unsigned int) LSEEK (fd, fileSlot, SEEK_SET));
178 GNUNET_assert (1 == WRITE (fd, &value, 1));
179}
180
181/**
182 * Clears a bit from bitArray if the respective usage
183 * counter on the disk hits/is zero.
184 *
185 * @param bitArray memory area to set the bit in
186 * @param bitIdx which bit to test
187 * @param fd A file to keep the 4bit address usage counters in
188 */
189static void
190decrementBit (char *bitArray, unsigned int bitIdx, int fd)
191{
192 unsigned int fileSlot;
193 unsigned char value;
194 unsigned int high;
195 unsigned int low;
196 unsigned int targetLoc;
197
198 if (fd == -1)
199 return; /* cannot decrement! */
200 /* Each char slot in the counter file holds two 4 bit counters */
201 fileSlot = bitIdx / 2;
202 targetLoc = bitIdx % 2;
203 LSEEK (fd, fileSlot, SEEK_SET);
204 if (1 != READ (fd, &value, 1))
205 value = 0;
206 low = value & 0xF;
207 high = (value & 0xF0) >> 4;
208
209 /* decrement, but once we have reached the max, never go back! */
210 if (targetLoc == 0)
211 {
212 if ((low > 0) && (low < 0xF))
213 low--;
214 if (low == 0)
215 {
216 clearBit (bitArray, bitIdx);
217 }
218 }
219 else
220 {
221 if ((high > 0) && (high < 0xF))
222 high--;
223 if (high == 0)
224 {
225 clearBit (bitArray, bitIdx);
226 }
227 }
228 value = ((high << 4) | low);
229 LSEEK (fd, fileSlot, SEEK_SET);
230 GNUNET_assert (1 == WRITE (fd, &value, 1));
231}
232
233#define BUFFSIZE 65536
234
235/**
236 * Creates a file filled with zeroes
237 *
238 * @param fd the file handle
239 * @param size the size of the file
240 * @return GNUNET_OK if created ok, GNUNET_SYSERR otherwise
241 */
242static int
243makeEmptyFile (int fd, unsigned int size)
244{
245 char *buffer;
246 unsigned int bytesleft = size;
247 int res = 0;
248
249 if (fd == -1)
250 return GNUNET_SYSERR;
251 buffer = GNUNET_malloc (BUFFSIZE);
252 memset (buffer, 0, BUFFSIZE);
253 LSEEK (fd, 0, SEEK_SET);
254
255 while (bytesleft > 0)
256 {
257 if (bytesleft > BUFFSIZE)
258 {
259 res = WRITE (fd, buffer, BUFFSIZE);
260 bytesleft -= BUFFSIZE;
261 }
262 else
263 {
264 res = WRITE (fd, buffer, bytesleft);
265 bytesleft = 0;
266 }
267 GNUNET_assert (res != -1);
268 }
269 GNUNET_free (buffer);
270 return GNUNET_OK;
271}
272
273/* ************** GNUNET_CONTAINER_BloomFilter GNUNET_CRYPTO_hash iterator ********* */
274
275/**
276 * Iterator (callback) method to be called by the
277 * bloomfilter iterator on each bit that is to be
278 * set or tested for the key.
279 *
280 * @param bf the filter to manipulate
281 * @param bit the current bit
282 * @param additional context specific argument
283 */
284typedef void (*BitIterator) (struct GNUNET_CONTAINER_BloomFilter * bf,
285 unsigned int bit, void *arg);
286
287/**
288 * Call an iterator for each bit that the bloomfilter
289 * must test or set for this element.
290 *
291 * @param bf the filter
292 * @param callback the method to call
293 * @param arg extra argument to callback
294 * @param key the key for which we iterate over the BF bits
295 */
296static void
297iterateBits (struct GNUNET_CONTAINER_BloomFilter *bf,
298 BitIterator callback, void *arg, const GNUNET_HashCode * key)
299{
300 GNUNET_HashCode tmp[2];
301 int bitCount;
302 int round;
303 unsigned int slot = 0;
304
305 bitCount = bf->addressesPerElement;
306 memcpy (&tmp[0], key, sizeof (GNUNET_HashCode));
307 round = 0;
308 while (bitCount > 0)
309 {
310 while (slot < (sizeof (GNUNET_HashCode) / sizeof (unsigned int)))
311 {
312 callback (bf,
313 (((unsigned int *) &tmp[round & 1])[slot]) &
314 ((bf->bitArraySize * 8) - 1), arg);
315 slot++;
316 bitCount--;
317 if (bitCount == 0)
318 break;
319 }
320 if (bitCount > 0)
321 {
322 GNUNET_CRYPTO_hash (&tmp[round & 1], sizeof (GNUNET_HashCode),
323 &tmp[(round + 1) & 1]);
324 round++;
325 slot = 0;
326 }
327 }
328}
329
330/**
331 * Callback: increment bit
332 *
333 * @param bf the filter to manipulate
334 * @param bit the bit to increment
335 * @param arg not used
336 */
337static void
338incrementBitCallback (struct GNUNET_CONTAINER_BloomFilter *bf,
339 unsigned int bit, void *arg)
340{
341 incrementBit (bf->bitArray, bit, bf->fd);
342}
343
344/**
345 * Callback: decrement bit
346 *
347 * @param bf the filter to manipulate
348 * @param bit the bit to decrement
349 * @param arg not used
350 */
351static void
352decrementBitCallback (struct GNUNET_CONTAINER_BloomFilter *bf,
353 unsigned int bit, void *arg)
354{
355 decrementBit (bf->bitArray, bit, bf->fd);
356}
357
358/**
359 * Callback: test if all bits are set
360 *
361 * @param bf the filter
362 * @param bit the bit to test
363 * @param arg pointer set to GNUNET_NO if bit is not set
364 */
365static void
366testBitCallback (struct GNUNET_CONTAINER_BloomFilter *bf, unsigned int bit,
367 void *cls)
368{
369 int *arg = cls;
370 if (GNUNET_NO == testBit (bf->bitArray, bit))
371 *arg = GNUNET_NO;
372}
373
374/* *********************** INTERFACE **************** */
375
376/**
377 * Load a bloom-filter from a file.
378 *
379 * @param filename the name of the file (or the prefix)
380 * @param size the size of the bloom-filter (number of
381 * bytes of storage space to use)
382 * @param k the number of GNUNET_CRYPTO_hash-functions to apply per
383 * element (number of bits set per element in the set)
384 * @return the bloomfilter
385 */
386struct GNUNET_CONTAINER_BloomFilter *
387GNUNET_CONTAINER_bloomfilter_load (const char *filename, unsigned int size,
388 unsigned int k)
389{
390 struct GNUNET_CONTAINER_BloomFilter *bf;
391 char *rbuff;
392 unsigned int pos;
393 int i;
394 unsigned int ui;
395
396 if ((k == 0) || (size == 0))
397 return NULL;
398 if (size < BUFFSIZE)
399 size = BUFFSIZE;
400 ui = 1;
401 while (ui < size)
402 ui *= 2;
403 size = ui; /* make sure it's a power of 2 */
404
405 bf = GNUNET_malloc (sizeof (struct GNUNET_CONTAINER_BloomFilter));
406 /* Try to open a bloomfilter file */
407 if (filename != NULL)
408 {
409#ifndef _MSC_VER
410 bf->fd = GNUNET_DISK_file_open (filename, O_RDWR | O_CREAT,
411 S_IRUSR | S_IWUSR);
412#else
413 bf->fd = GNUNET_DISK_file_open (filename,
414 O_WRONLY | O_CREAT, S_IREAD | S_IWRITE);
415#endif
416 if (-1 == bf->fd)
417 {
418 GNUNET_free (bf);
419 return NULL;
420 }
421 bf->filename = GNUNET_strdup (filename);
422 }
423 else
424 {
425 bf->fd = -1;
426 bf->filename = NULL;
427 }
428 /* Alloc block */
429 bf->bitArray = GNUNET_malloc_large (size);
430 bf->bitArraySize = size;
431 bf->addressesPerElement = k;
432 memset (bf->bitArray, 0, bf->bitArraySize);
433
434 if (bf->fd != -1)
435 {
436 /* Read from the file what bits we can */
437 rbuff = GNUNET_malloc (BUFFSIZE);
438 pos = 0;
439 while (pos < size * 8)
440 {
441 int res;
442
443 res = READ (bf->fd, rbuff, BUFFSIZE);
444 if (res == 0)
445 break; /* is ok! we just did not use that many bits yet */
446 for (i = 0; i < res; i++)
447 {
448 if ((rbuff[i] & 0x0F) != 0)
449 setBit (bf->bitArray, pos + i * 2);
450 if ((rbuff[i] & 0xF0) != 0)
451 setBit (bf->bitArray, pos + i * 2 + 1);
452 }
453 if (res < BUFFSIZE)
454 break;
455 pos += BUFFSIZE * 2; /* 2 bits per byte in the buffer */
456 }
457 GNUNET_free (rbuff);
458 }
459 return bf;
460}
461
462
463/**
464 * Create a bloom filter from raw bits.
465 *
466 * @param data the raw bits in memory (maybe NULL,
467 * in which case all bits should be considered
468 * to be zero).
469 * @param size the size of the bloom-filter (number of
470 * bytes of storage space to use); also size of data
471 * -- unless data is NULL
472 * @param k the number of GNUNET_CRYPTO_hash-functions to apply per
473 * element (number of bits set per element in the set)
474 * @return the bloomfilter
475 */
476struct GNUNET_CONTAINER_BloomFilter *
477GNUNET_CONTAINER_bloomfilter_init (const char *data, unsigned int size,
478 unsigned int k)
479{
480 struct GNUNET_CONTAINER_BloomFilter *bf;
481 unsigned int ui;
482
483 if ((k == 0) || (size == 0))
484 return NULL;
485 ui = 1;
486 while (ui < size)
487 ui *= 2;
488 if (size != ui)
489 {
490 GNUNET_break (0);
491 return NULL;
492 }
493 bf = GNUNET_malloc (sizeof (struct GNUNET_CONTAINER_BloomFilter));
494 bf->fd = -1;
495 bf->filename = NULL;
496 bf->bitArray = GNUNET_malloc_large (size);
497 bf->bitArraySize = size;
498 bf->addressesPerElement = k;
499 if (data != NULL)
500 memcpy (bf->bitArray, data, size);
501 else
502 memset (bf->bitArray, 0, bf->bitArraySize);
503 return bf;
504}
505
506
507/**
508 * Copy the raw data of this bloomfilter into
509 * the given data array.
510 *
511 * @param data where to write the data
512 * @param size the size of the given data array
513 * @return GNUNET_SYSERR if the data array is not big enough
514 */
515int
516GNUNET_CONTAINER_bloomfilter_get_raw_data (struct GNUNET_CONTAINER_BloomFilter
517 *bf, char *data, unsigned int size)
518{
519 if (NULL == bf)
520 return GNUNET_SYSERR;
521
522 if (bf->bitArraySize != size)
523 return GNUNET_SYSERR;
524 memcpy (data, bf->bitArray, size);
525 return GNUNET_OK;
526}
527
528/**
529 * Free the space associated with a filter
530 * in memory, flush to drive if needed (do not
531 * free the space on the drive)
532 *
533 * @param bf the filter
534 */
535void
536GNUNET_CONTAINER_bloomfilter_free (struct GNUNET_CONTAINER_BloomFilter *bf)
537{
538 if (NULL == bf)
539 return;
540 if (bf->fd != -1)
541 GNUNET_DISK_file_close (bf->filename, bf->fd);
542 GNUNET_free_non_null (bf->filename);
543 GNUNET_free (bf->bitArray);
544 GNUNET_free (bf);
545}
546
547/**
548 * Reset a bloom filter to empty. Clears the file on disk.
549 *
550 * @param bf the filter
551 */
552void
553GNUNET_CONTAINER_bloomfilter_clear (struct GNUNET_CONTAINER_BloomFilter *bf)
554{
555 if (NULL == bf)
556 return;
557
558 memset (bf->bitArray, 0, bf->bitArraySize);
559 if (bf->fd != -1)
560 makeEmptyFile (bf->fd, bf->bitArraySize * 4);
561}
562
563
564/**
565 * Test if an element is in the filter.
566 *
567 * @param e the element
568 * @param bf the filter
569 * @return GNUNET_YES if the element is in the filter, GNUNET_NO if not
570 */
571int
572GNUNET_CONTAINER_bloomfilter_test (struct GNUNET_CONTAINER_BloomFilter *bf,
573 const GNUNET_HashCode * e)
574{
575 int res;
576
577 if (NULL == bf)
578 return GNUNET_YES;
579 res = GNUNET_YES;
580 iterateBits (bf, &testBitCallback, &res, e);
581 return res;
582}
583
584/**
585 * Add an element to the filter
586 *
587 * @param bf the filter
588 * @param e the element
589 */
590void
591GNUNET_CONTAINER_bloomfilter_add (struct GNUNET_CONTAINER_BloomFilter *bf,
592 const GNUNET_HashCode * e)
593{
594
595 if (NULL == bf)
596 return;
597 iterateBits (bf, &incrementBitCallback, NULL, e);
598}
599
600
601/**
602 * Or the entries of the given raw data array with the
603 * data of the given bloom filter. Assumes that
604 * the size of the data array and the current filter
605 * match.
606 * @param bf the filter
607 */
608int
609GNUNET_CONTAINER_bloomfilter_or (struct GNUNET_CONTAINER_BloomFilter *bf,
610 const char *data, unsigned int size)
611{
612 unsigned int i;
613
614 if (NULL == bf)
615 return GNUNET_YES;
616 if (bf->bitArraySize != size)
617 return GNUNET_SYSERR;
618 /* FIXME: we could do this 4-8x faster by
619 going over int/long arrays */
620 for (i = 0; i < size; i++)
621 bf->bitArray[i] |= data[i];
622 return GNUNET_OK;
623}
624
625/**
626 * Remove an element from the filter.
627 *
628 * @param bf the filter
629 * @param e the element to remove
630 */
631void
632GNUNET_CONTAINER_bloomfilter_remove (struct GNUNET_CONTAINER_BloomFilter *bf,
633 const GNUNET_HashCode * e)
634{
635 if (NULL == bf)
636 return;
637 if (bf->fd == -1)
638 return;
639 iterateBits (bf, &decrementBitCallback, NULL, e);
640}
641
642/**
643 * Resize a bloom filter. Note that this operation
644 * is pretty costly. Essentially, the bloom filter
645 * needs to be completely re-build.
646 *
647 * @param bf the filter
648 * @param iterator an iterator over all elements stored in the BF
649 * @param iterator_arg argument to the iterator function
650 * @param size the new size for the filter
651 * @param k the new number of GNUNET_CRYPTO_hash-function to apply per element
652 */
653void
654GNUNET_CONTAINER_bloomfilter_resize (struct GNUNET_CONTAINER_BloomFilter *bf,
655 GNUNET_HashCodeIterator iterator,
656 void *iterator_arg, unsigned int size,
657 unsigned int k)
658{
659 GNUNET_HashCode hc;
660 unsigned int i;
661
662 GNUNET_free (bf->bitArray);
663 i = 1;
664 while (i < size)
665 i *= 2;
666 size = i; /* make sure it's a power of 2 */
667
668 bf->bitArraySize = size;
669 bf->bitArray = GNUNET_malloc (size);
670 memset (bf->bitArray, 0, bf->bitArraySize);
671 if (bf->fd != -1)
672 makeEmptyFile (bf->fd, bf->bitArraySize * 4);
673 while (GNUNET_YES == iterator (&hc, iterator_arg))
674 GNUNET_CONTAINER_bloomfilter_add (bf, &hc);
675}
676
677/* end of container_bloomfilter.c */