Source code for clkhash.bloomfilter

#!/usr/bin/env python3

"""
Generate a Bloom filter
"""

import hmac
import math
import struct
from hashlib import md5, sha1
from typing import Callable, Iterable, List, Optional, Sequence, Text, Tuple
from bitarray import bitarray

from clkhash.field_formats import FieldHashingProperties
from clkhash.schema import Schema
from clkhash.comparators import AbstractComparison, NonComparison

try:
    from hashlib import blake2b
except ImportError:
    # We are in Python older than 3.6.
    from pyblake2 import blake2b  # type: ignore
    # Ignore because otherwise Mypy raises errors, thinking that
    # blake2b is already defined.


def double_hash_encode_ngrams(ngrams: Iterable[str],
                              keys: Sequence[bytes],
                              ks: Sequence[int],
                              l: int,
                              encoding: str
                              ) -> bitarray:
    """ Computes the double hash encoding of the ngrams with the given keys.

        Using the method from:
        Schnell, R., Bachteler, T., & Reiher, J. (2011).
        A Novel Error-Tolerant Anonymous Linking Code.
        http://grlc.german-microsimulation.de/wp-content/uploads/2017/05/downloadwp-grlc-2011-02.pdf

        :param ngrams: list of n-grams to be encoded
        :param keys: hmac secret keys for sha1 and md5 as bytes
        :param ks: ks[i] is the k value to use for ngram[i]
        :param l: length of the output bitarray
        :param encoding: the encoding to use when turning the ngrams to bytes

        :return: bitarray of length l with the bits set which correspond to
                 the encoding of the ngrams
    """
    key_sha1, key_md5 = keys
    bf = bitarray(l)
    bf.setall(False)
    for m, k in zip(ngrams, ks):
        sha1hm = int(
            hmac.new(key_sha1, m.encode(encoding=encoding), sha1).hexdigest(),
            16) % l
        md5hm = int(
            hmac.new(key_md5, m.encode(encoding=encoding), md5).hexdigest(),
            16) % l
        for i in range(k):
            gi = (sha1hm + i * md5hm) % l
            bf[gi] = 1
    return bf
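
# A minimal usage sketch (not part of clkhash; the keys below are made-up
# test values, not anything the library prescribes). Encoding four bigrams
# with k = 20 bits each sets at most 80 bits in a 1024-bit filter:
#
#     >>> ngrams = [' f', 'fo', 'oo', 'o ']
#     >>> keys = [b'0' * 32, b'1' * 32]  # (key_sha1, key_md5)
#     >>> bf = double_hash_encode_ngrams(ngrams, keys, [20] * 4, 1024, 'utf-8')
#     >>> len(bf), bf.count() <= 80
#     (1024, True)
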
def double_hash_encode_ngrams_non_singular(ngrams: Iterable[str],
                                           keys: Sequence[bytes],
                                           ks: Sequence[int],
                                           l: int,
                                           encoding: str
                                           ) -> bitarray:
    """ Computes the double hash encoding of the n-grams with the given keys.

        The original construction of [Schnell2011]_ displays an abnormality
        for certain inputs: an n-gram can be encoded into just one bit,
        irrespective of the value of :math:`k`.

        Their construction goes as follows: the :math:`k` different indices
        :math:`g_i` of the Bloom filter for an n-gram :math:`x` are defined
        as:

        .. math::
            g_{i}(x) = (h_1(x) + i h_2(x)) \\mod l

        with :math:`0 \\leq i < k`, where :math:`l` is the length of the
        Bloom filter. If the hash of :math:`x` under the second hash function
        is a multiple of :math:`l`, then

        .. math::
            h_2(x) = 0 \\mod l

        and thus

        .. math::
            g_i(x) = h_1(x) \\mod l,

        irrespective of the value of :math:`i`. A discussion of this
        potential flaw can be found
        `here <https://github.com/data61/clkhash/issues/33>`_.

        :param ngrams: list of n-grams to be encoded
        :param keys: tuple with (key_sha1, key_md5).
               That is, (hmac secret keys for sha1 as bytes, hmac secret keys
               for md5 as bytes)
        :param ks: ks[i] is the k value to use for ngram[i]
        :param l: length of the output bitarray
        :param encoding: the encoding to use when turning the ngrams to bytes

        :return: bitarray of length l with the bits set which correspond to
                 the encoding of the ngrams
    """
    key_sha1, key_md5 = keys
    bf = bitarray(l)
    bf.setall(False)
    for m, k in zip(ngrams, ks):
        m_bytes = m.encode(encoding=encoding)

        sha1hm_bytes = hmac.new(key_sha1, m_bytes, sha1).digest()
        md5hm_bytes = hmac.new(key_md5, m_bytes, md5).digest()

        sha1hm = int.from_bytes(sha1hm_bytes, 'big') % l
        md5hm = int.from_bytes(md5hm_bytes, 'big') % l

        # Re-hash with a varying suffix until h_2 is non-zero mod l, so the
        # k indices below cannot all collapse onto h_1.
        i = 0
        while md5hm == 0:
            md5hm_bytes = hmac.new(
                key_md5, m_bytes + chr(i).encode(), md5).digest()
            md5hm = int.from_bytes(md5hm_bytes, 'big') % l
            i += 1

        for i in range(k):
            gi = (sha1hm + i * md5hm) % l
            bf[gi] = True
    return bf
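
# Illustrative sketch with made-up keys: the call signature matches
# double_hash_encode_ngrams, so the two variants are interchangeable as
# 'doubleHash' strategies.
#
#     >>> bf = double_hash_encode_ngrams_non_singular(
#     ...     ['ab', 'bc'], [b'0' * 32, b'1' * 32], [10, 10], 1024, 'utf-8')
#     >>> bf.count() <= 20
#     True
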
def blake_encode_ngrams(ngrams: Iterable[str],
                        keys: Sequence[bytes],
                        ks: Sequence[int],
                        l: int,
                        encoding: str
                        ) -> bitarray:
    """ Computes the encoding of the ngrams using the BLAKE2 hash function.

        We deliberately do not use the double hashing scheme as proposed in
        [Schnell2011]_, because this would introduce an exploitable structure
        into the Bloom filter. For more details on the weakness, see
        [Kroll2015]_.

        In short, the double hashing scheme only allows for :math:`l^2`
        different encodings for any possible n-gram, whereas the use of
        :math:`k` different independent hash functions gives you
        :math:`\\sum_{j=1}^{k}{\\binom{l}{j}}` combinations.

        **Our construction**

        It is advantageous to construct Bloom filters using a family of hash
        functions with the property of `k-independence
        <https://en.wikipedia.org/wiki/K-independent_hashing>`_ to compute
        the indices for an entry. This approach minimises the chance of
        collisions.

        An informal definition of *k-independence* of a family of hash
        functions is that selecting a function at random from the family
        guarantees that the hash codes of any designated k keys are
        independent random variables.

        Our construction utilises the fact that the output bits of a
        cryptographic hash function are uniformly distributed, independent,
        binary random variables (well, at least as close as possible; see
        [Kaminsky2011]_ for an analysis). Thus, slicing the output of a
        cryptographic hash function into k different slices gives you k
        independent random variables.

        We chose Blake2 as the cryptographic hash function mainly for two
        reasons:

        * it is fast.
        * in keyed hashing mode, Blake2 provides MACs with just one hash
          function call instead of the two calls in the HMAC construction
          used in the double hashing scheme.

        .. warning::
           Please be aware that, although this construction makes the attack
           of [Kroll2015]_ infeasible, it is most likely not enough to ensure
           security. Or in their own words:

           | However, we think that using independent hash functions alone
             will not be sufficient to ensure security, since in this case
             other approaches (maybe related to or at least inspired through
             work from the area of Frequent Itemset Mining) are promising to
             detect at least the most frequent atoms automatically.

        :param ngrams: list of n-grams to be encoded
        :param keys: secret key for blake2 as bytes
        :param ks: ks[i] is the k value to use for ngram[i]
        :param l: length of the output bitarray (has to be a power of 2)
        :param encoding: the encoding to use when turning the ngrams to bytes

        :return: bitarray of length l with the bits set which correspond to
                 the encoding of the ngrams
    """
    key = keys[0]  # We only need the first key

    log_l = int(math.log(l, 2))
    if not 2 ** log_l == l:
        raise ValueError(
            'parameter "l" has to be a power of two for the BLAKE2 encoding, '
            'but was: {}'.format(l))

    bf = bitarray(l)
    bf.setall(False)

    for m, k in zip(ngrams, ks):
        random_shorts = []  # type: List[int]
        # Each keyed BLAKE2b call yields a 64-byte digest, i.e. 32 indices.
        num_macs = (k + 31) // 32
        for i in range(num_macs):
            hash_bytes = blake2b(m.encode(encoding=encoding),
                                 key=key,
                                 salt=str(i).encode()).digest()
            # Interpret the 64 hash bytes as 32 unsigned shorts.
            random_shorts.extend(struct.unpack('32H', hash_bytes))
        for i in range(k):
            idx = random_shorts[i] % l
            bf[idx] = 1
    return bf
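
# Illustrative sketch (made-up key; hashlib's blake2b accepts keys of at
# most 64 bytes, and l must be a power of two here):
#
#     >>> bf = blake_encode_ngrams(['ab', 'bc'], [b'k' * 32], [30, 30],
#     ...                          1024, 'utf-8')
#     >>> len(bf), bf.count() <= 60
#     (1024, True)
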
def hashing_function_from_properties(
        fhp: FieldHashingProperties
) -> Callable[[Iterable[str], Sequence[bytes], Sequence[int], int, str],
              bitarray]:
    """ Get the hashing function for this field.

        :param fhp: hashing properties for this field
        :return: the hashing function
    """
    if fhp.hash_type == 'doubleHash':
        if fhp.prevent_singularity:
            return double_hash_encode_ngrams_non_singular
        else:
            return double_hash_encode_ngrams
    elif fhp.hash_type == 'blakeHash':
        return blake_encode_ngrams
    else:
        msg = f"Unsupported hash type '{fhp.hash_type}'"
        raise ValueError(msg)
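
# Dispatch sketch: the function only inspects the 'hash_type' and
# 'prevent_singularity' attributes, so a duck-typed stand-in (used here
# purely for illustration, not how clkhash builds its hashing properties)
# shows the mapping:
#
#     >>> from types import SimpleNamespace
#     >>> fake_fhp = SimpleNamespace(hash_type='blakeHash',
#     ...                            prevent_singularity=False)
#     >>> hashing_function_from_properties(fake_fhp) is blake_encode_ngrams
#     True
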
def fold_xor(bloomfilter: bitarray,
             folds: int
             ) -> bitarray:
    """ Performs XOR folding on a Bloom filter.

        If the length of the original Bloom filter is n and we perform
        r folds, then the length of the resulting filter is n / 2 ** r.

        :param bloomfilter: Bloom filter to fold
        :param folds: number of folds

        :return: folded Bloom filter
    """
    if len(bloomfilter) % 2 ** folds != 0:
        msg = ('The length of the bloom filter is {length}. It is not '
               'divisible by 2 ** {folds}, so it cannot be folded {folds} '
               'times.'
               .format(length=len(bloomfilter), folds=folds))
        raise ValueError(msg)

    for _ in range(folds):
        bf1 = bloomfilter[:len(bloomfilter) // 2]
        bf2 = bloomfilter[len(bloomfilter) // 2:]
        bloomfilter = bf1 ^ bf2

    return bloomfilter
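
# Folding sketch: one fold XORs the two halves of the filter, halving its
# length, so a 1024-bit filter folded twice yields 256 bits.
#
#     >>> bf = bitarray(1024)
#     >>> bf.setall(False)
#     >>> bf[0] = True
#     >>> folded = fold_xor(bf, 2)
#     >>> len(folded), folded.count()
#     (256, 1)
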
def crypto_bloom_filter(record: Sequence[str],
                        comparators: List[AbstractComparison],
                        schema: Schema,
                        keys: Sequence[Sequence[bytes]],
                        ) -> Tuple[bitarray, str, int]:
    """ Computes the composite Bloom filter encoding of a record.

        Using the method from
        http://www.record-linkage.de/-download=wp-grlc-2011-02.pdf

        :param record: plaintext record tuple. E.g. (index, name, dob, gender)
        :param comparators: A list of comparators. They provide a 'tokenize'
            function to turn a string into appropriate tokens.
        :param schema: Schema
        :param keys: Keys for the hash functions as a tuple of lists of bytes.

        :return: 3-tuple:

            - the Bloom filter for the record as a bitarray
            - the first element of the record (usually an index)
            - the number of bits set in the Bloom filter
    """
    hash_l = schema.l * 2 ** schema.xor_folds

    bloomfilter = bitarray(hash_l)
    bloomfilter.setall(False)

    for (entry, comparator, field, key) \
            in zip(record, comparators, schema.fields, keys):
        fhp = field.hashing_properties
        if fhp:
            ngrams = list(comparator.tokenize(field.format_value(entry)))
            hash_function = hashing_function_from_properties(fhp)
            if ngrams:
                bloomfilter |= hash_function(
                    ngrams, key,
                    fhp.strategy.bits_per_token(len(ngrams)),
                    hash_l, fhp.encoding)

    bloomfilter = fold_xor(bloomfilter, schema.xor_folds)
    return bloomfilter, record[0], bloomfilter.count()
def stream_bloom_filters(dataset: Iterable[Sequence[str]],
                         keys: Sequence[Sequence[bytes]],
                         schema: Schema
                         ) -> Iterable[Tuple[bitarray, str, int]]:
    """ Compute composite Bloom filters (CLKs) for every record in an
        iterable dataset.

        :param dataset: An iterable of indexable records.
        :param keys: A tuple of two lists of secret keys used in the HMAC.
        :param schema: A Schema instance.

        :return: Generator yielding Bloom filters as 3-tuples
    """
    comparators = [field.hashing_properties.comparator
                   if field.hashing_properties is not None
                   else NonComparison()
                   for field in schema.fields]
    return (crypto_bloom_filter(s, comparators, schema, keys)
            for s in dataset)