chapter 3.1-3.4 - lsd0003 - LSD0003: Set Union

commit 4d240cd09d0d2adf4e6c755c2a5437e5c0f29463
parent 2e15bfdc45ea02d516126f2947e8d3ffb409d084
Author: Christian Grothoff <christian@grothoff.org>
Date:   Sat, 12 Jun 2021 18:07:31 +0200

chapter 3.1-3.4

Diffstat:
M draft-summermatter-set-union.xml  | 244 +++++++++++++++++++++++++++++++++++++++++--------------------------------------

1 file changed, 128 insertions(+), 116 deletions(-)
diff --git a/draft-summermatter-set-union.xml b/draft-summermatter-set-union.xml
@@ -363,6 +363,131 @@ hashSum |   HASHSUM   |   HASHSUM   |   HASHSUM   |    HASHSUM  |  H..
                     </figure>
 
             </section>
+
+            <section anchor="ibf_format_id_generation" numbered="true" toc="default">
+              <name>Salted Element ID Calculation</name>
+              <t>
+                IBFs are a probabilistic data structure, hence it can be necessary to
+                recompute the IBF in case operations fail, and then try again. The
+                recomputed IBF would ideally be statistically independent of the
+                failed IBF. This is achieved by introducing an IBF-salt. Given that with
+                benign peers failures should be rare, and that we need to be able to
+                "invert" the application of the IBF-salt to the element IDs, we use an
+                unsigned 32 bit non-random IBF-salt value of which the lowest 6 bits will
+                be used to rotate bits in the element ID computation.
+              </t>
+              <t>
+                64-bit element IDs are generated from a
+                <relref  section="2" target="RFC5869" displayFormat="of">HKDF construction</relref>
+                with HMAC-SHA512 as XTR and HMAC-SHA256 as PRF with a 16-bit KDF-salt set to a
+                unsigned 16-bit representation of zero.
+                The output of the KDF is then truncated to 64-bit.
+                Finally, salting is done by calculating the IBF-salt modulo 64
+                (effectively using only the lowest 6-bits of the IBF-salt)
+                and doing a bitwise right rotation of the output of KDF.  We
+                note that this operation was chosen as it is easily inverted,
+                allowing applications to easily derive element IDs with one
+                IBF-salt value from element IDs generated with a different
+                IBF-salt value.
+              </t>
+              <t>
+                In case the IBF does not decode, the IBF-salt can be changed to
+                compute different element IDs, which will (likely) be mapped
+                to different buckets, likely allowing the IBF to decode in a
+                subsequent iteration.
+              </t>
+              <figure anchor="ibf_format_id_generation_pseudo_code">
+                <artwork name="" type="" align="left" alt=""><![CDATA[
+# INPUTS:
+# key: Pre calculated and truncated key from id_calculation function
+# ibf_salt: Salt of the IBF
+# OUTPUT:
+# value: salted key
+FUNCTION salt_key(key,ibf_salt):
+  s = ibf_salt % 64;
+  /* rotate key */
+  return (key >> s) | (key << (64 - s))
+
+
+# INPUTS:
+# element: element for which we are to calculate the element ID
+# ibf_salt: Salt of the IBF
+# OUTPUT:
+# value: the ID of the element
+FUNCTION id_calculation (element,ibf_salt):
+    kdf_salt = 0 // 16 bits
+    XTR=HMAC-SHA256
+    PRF=HMAC-SHA256
+    key = HKDF(XTR, PRF, kdf_salt, element) modulo 2^64
+    return salt_key(key, ibf_salt)
+                 ]]></artwork>
+              </figure>
+            </section>
+
+            <section anchor="ibf_format_HASH_calculation" numbered="true" toc="default">
+              <name>HASH calculation</name>
+              <t>
+                The HASH of an element ID is computed by calculating the
+                CRC32 checksum of the 64-bit ID value,
+                which returns a 32-bit value. <!-- FIXME: CRC32 needs a reference! -->
+              </t>
+            </section>
+
+            <section anchor="ibf_m" numbered="true" toc="default">
+              <name>Mapping Function</name>
+              <t>
+                The mapping function M decides which buckets a given ID is mapped to.
+                For an IBF, it is beneficial to use an injective mapping function M.
+              </t>
+              <t>
+                The first index is simply the CRC32 of the ID modulo the IBF size. The second
+                index is calculated by creating a new 64-bit value by shifting the previous 32-bit
+                value left and setting the lower 32 bits to the number of indices already processed.
+                From the resulting 64-bit value, another CRC32 checksum is computed.
+                The subsequent index is the modulo of this CRC32 output.
+                The process is repeated until the desired number of indices is generated.
+                In the case the process computes the same index twice,
+                which would mean this bucket could not get pure again,
+                the second hit is just skipped and the next iteration is used instead,
+                creating an injective mapping function.
+              </t>
+              <figure anchor="ibf_format_bucket_identification_pseudo_code">
+                <artwork name="" type="" align="left" alt=""><![CDATA[
+# INPUTS:
+# key: the ID of the element calculated
+# k: numbers of buckets per element
+# L: total number of buckets in the IBF
+# OUTPUT:
+# dst: Array with k bucket IDs
+FUNCTION get_bucket_id (key, k, L)
+  bucket = CRC32(key)
+  i = 0 // unsigned 32-bit index
+  filled = 0
+  WHILE filled < k
+
+    element_already_in_bucket = false
+    j = 0
+    WHILE j < filled
+      IF dst[j] == bucket modulo L THEN
+        element_already_in_bucket = true
+      ENDIF
+      j++
+    ENDWHILE
+
+    IF !element_already_in_bucket THEN
+        dst[filled] = bucket modulo L
+        filled = filled + 1
+    ENDIF
+
+    x = (bucket << 32) | i // 64 bit result
+    bucket = CRC32(x)
+    i = i + 1
+  ENDWHILE
+  return dst
+  ]]></artwork>
+              </figure>
+            </section>
+
             <section anchor="ibf_operations" numbered="true" toc="default">
               <name>Operations</name>
               <t>
@@ -373,7 +498,7 @@ hashSum |   HASHSUM   |   HASHSUM   |   HASHSUM   |    HASHSUM  |  H..
                     <name>Insert Element</name>
                     <t>
                       To add an element to an IBF, the element is mapped to a subset of k buckets using
-                      the mapping function M as described in the <xref target="bf" format="title" /> section introducing
+                      the injective mapping function M as described in section <xref target="ibf_m" format="title" /> section introducing
                       BFs. For the buckets selected by the mapping function, the counter is increased by one and the
                       IDSUM field is set to the XOR of the element ID and the previously stored IDSUM. Furthermore,
                       the HASHSUM is set to the XOR of the hash of the element ID and the previously stored HASHSUM.
@@ -697,123 +822,11 @@ hashSum |    0x0101   |    0x0101   |    0x5050   |    0x0000   |
                     also again a reasonable size for many CPU
                     architectures.
                 </t>
-                <section anchor="ibf_format_id_generation" numbered="true" toc="default">
-                    <name>ID Calculation</name>
-                    <t>
-                        The ID is generated as 64-bit output from a <relref  section="2" target="RFC5869" displayFormat="of">HKDF construction</relref>
-                        with HMAC-SHA512 as XTR and HMAC-SHA256 as PRF and salt is set to the unsigned 64-bit equivalent of 0.
-                        The output is then truncated to 64-bit.
-                        It is important that the elements can be redistributed over the buckets in case the IBF does not
-                        decode. That is why the ID is salted with a random salt given in the SALT field of this message.
-                        Salting is done by calculating a random salt modulo 64 (using only the lowest 6-bits of the salt)
-                        and doing a bitwise right rotation of the output of KDF by the 6-bit salts numeric representation.
-                    </t>
-                    <t>
-                        Representation in pseudocode:
-                    </t>
-                    <figure anchor="ibf_format_id_generation_pseudo_code">
-                        <artwork name="" type="" align="left" alt=""><![CDATA[
-# INPUTS:
-# key: Pre calculated and truncated key from id_calculation function
-# ibf_salt: Salt of the IBF
-# OUTPUT:
-# value: salted key
-FUNCTION salt_key(key,ibf_salt):
-  s = ibf_salt % 64;
-  k = key
-
-  /* rotate ibf key */
-  k = (k >> s) | (k << (64 - k))
-  return key
-
-
-# INPUTS:
-# element: Element to calculated id from.
-# salt: Salt of the IBF
-# OUTPUT:
-# value: the ID of the element
-
-FUNCTION id_calculation (element,ibf_salt):
-    salt = 0
-    XTR=HMAC-SHA256
-    PRF=HMAC-SHA256
-    key = HKDF(XTR, PRF, salt, element)
-    key = key modulo 2^64 // Truncate
-    return salt_key(key,ibf_salt)
-
-
-                 ]]></artwork>
-                    </figure>
-                </section>
-                <section anchor="ibf_format_bucket_identification" numbered="true" toc="default">
-                    <name>Mapping Function</name>
-                    <t>
-                      For an IBF, it is beneficial to use an injective mapping function M.
-                      The mapping function M as described above in the figure <xref target="bf_mapping_function_math" format="default" />
-                        decides in which buckets the ID and HASH have to be binary XORed to. In practice
-                        the following algorithm is used:
-                    </t>
-                    <t>
-                        The first index is simply the HASH modulo the IBF size. The second
-                        index is calculated by creating a new 64-bit value by shifting the 32-bit
-                        value left and setting the lower 32-bit to the number of indexes already processed. From the
-                        resulting 64-bit value a CRC32 checksum is created. The second index is now the modulo of the
-                        CRC32 output, this is repeated until the predefined amount of indexes is generated.
-                        In the case a index is hit twice, which would mean this bucket could not get pure again,
-                        the second hit is just skipped and the next iteration is used.
-                    </t>
-                    <figure anchor="ibf_format_bucket_identification_pseudo_code">
-                        <artwork name="" type="" align="left" alt=""><![CDATA[
-# INPUTS:
-# key: Is the ID of the element calculated in the id_calculation function above.
-# number_of_buckets_per_element: Pre-defined numbers of buckets elements are inserted into
-# ibf_size: the size of the ibf (count of buckets)
-# OUTPUT:
-# dst: Array with bucket IDs to insert ID and HASH
-
-FUNCTION get_bucket_id (key, number_of_buckets_per_element, ibf_size)
-  bucket = CRC32(key)
-
-  i = 0
-  filled = 0
-  WHILE filled < number_of_buckets_per_element
-
-    element_already_in_bucket = false
-    j = 0
-    WHILE j < filled
-      IF dst[j] == bucket modulo ibf_size THEN
-        element_already_in_bucket = true
-      ENDIF
-      j++
-    ENDWHILE
-
-    IF !element_already_in_bucket THEN
-        dst[filled++] = bucket modulo ibf_size
-    ENDIF
-
-    x = (bucket << 32) | i
-    bucket = CRC32(x)
-
-    i++
-  ENDWHILE
-  return dst
-                 ]]></artwork>
-                    </figure>
-            </section>
-            <section anchor="ibf_format_HASH_calculation" numbered="true" toc="default">
-                <name>HASH calculation</name>
-                    <t>
-                        The HASH is calculated by calculating the CRC32 checksum of the 64-bit ID value
-                        which returns a 32-bit value.
-                    </t>
-            </section>
           </section>
         </section>
 
         <section anchor="se" numbered="true" toc="default">
             <name>Strata Estimator</name>
-            <section anchor="se_description" numbered="true" toc="default">
-                <name>Description</name>
                 <t>
                     Strata Estimators help estimate the size of the set difference between two sets of elements.
                     This is necessary to efficiently determinate the tuning parameters for an IBF, in particular
@@ -843,7 +856,6 @@ FUNCTION get_bucket_id (key, number_of_buckets_per_element, ibf_size)
                     on which side which part of the difference is. For this purpose, the pure buckets with
                     counter 1 and -1 must be distinguished and assigned to the respective side when decoding the IBFs.
                 </t>
-            </section>
         </section>
 
         <section anchor="modeofoperation" numbered="true" toc="default">
@@ -1267,7 +1279,7 @@ FUNCTION get_bucket_id (key, number_of_buckets_per_element, ibf_size)
                             </t>
                             <t>
                                 The algorithm to find the correct bucket in which the ID and the HASH have to be added
-                                is described in detail in section <xref target="ibf_format_bucket_identification" format="title" />.
+                                is described in detail in section <xref target="ibf_m" format="title" />.
                             </t>
 
                             <t>
@@ -3082,7 +3094,7 @@ Type    | Name                       | References | Description
                 </t>
                 <figure anchor="test_vectors_map_function_inputs">
                     <artwork name="" type="" align="left" alt=""><![CDATA[
-number_of_buckets_per_element: 3
+k: 3
 ibf_size: 300
 
 key1: 0xFFFFFFFFFFFFFFFF (64-bit)

	lsd0003 LSD0003: Set Union
	Log \| Files \| Refs \| README