Source code for clkhash.tokenizer

# -*- coding: utf-8 -*-

"""
Functions to tokenize words (PII)
"""
from __future__ import unicode_literals

from typing import Callable, Iterable, Optional, Text

from future.builtins import range

from clkhash import field_formats


def get_tokenizer(fhp  # type: Optional[field_formats.FieldHashingProperties]
                  ):
    # type: (...) -> Callable[[Text, Optional[Text]], Iterable[Text]]
    """ Get tokeniser function from the hash settings.

        This function takes a FieldHashingProperties object. It returns a
        function that takes a string and tokenises it based on those
        properties.
    """
    def dummy(word, ignore=None):
        # type: (Text, Optional[Text]) -> Iterable[Text]
        """
        Null tokenizer that returns an empty Iterable.

        The FieldSpec Ignore has hashing_properties = None, and
        get_tokenizer has to return something for this case, even though
        it is never called. An alternative would be to use an
        Optional[Callable].

        :param word: not used
        :param ignore: not used
        :return: empty Iterable
        """
        return ('' for i in range(0))

    if not fhp:
        return dummy

    n = fhp.ngram
    if n < 0:
        raise ValueError('`n` in `n`-gram must be non-negative.')

    positional = fhp.positional

    def tok(word, ignore=None):
        # type: (Text, Optional[Text]) -> Iterable[Text]
        """ Produce `n`-grams of `word`.

        :param word: The string to tokenize.
        :param ignore: The substring whose occurrences we remove from
            `word` before tokenization.
        :return: Iterable of n-gram strings.
        """
        if ignore is not None:
            word = word.replace(ignore, '')

        if len(word) == 0:
            return tuple()

        if n > 1:
            # Pad the word so leading and trailing n-grams are produced.
            word = ' {} '.format(word)

        if positional:
            # These are 1-indexed.
            return ('{} {}'.format(i + 1, word[i:i + n])
                    for i in range(len(word) - n + 1))
        else:
            return (word[i:i + n]
                    for i in range(len(word) - n + 1))

    return tok
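
A brief usage sketch, not part of the module above: get_tokenizer only reads the ngram and positional attributes of the hashing properties it is given, so a SimpleNamespace stand-in (a hypothetical substitute, used here purely for illustration) is enough to show the behaviour. In real use the FieldHashingProperties instance would come from a parsed linkage schema.

# Usage sketch. The SimpleNamespace objects are hypothetical stand-ins for
# field_formats.FieldHashingProperties; only `ngram` and `positional` are read.
from types import SimpleNamespace

from clkhash.tokenizer import get_tokenizer

bigram_props = SimpleNamespace(ngram=2, positional=False)
tokenize = get_tokenizer(bigram_props)
print(list(tokenize('clkhash')))
# For n > 1 the word is padded with spaces, so this prints
# [' c', 'cl', 'lk', 'kh', 'ha', 'as', 'sh', 'h '].

positional_props = SimpleNamespace(ngram=2, positional=True)
tokenize_pos = get_tokenizer(positional_props)
print(list(tokenize_pos('ab', ignore='b')))
# The `ignore` substring is removed first, leaving 'a'; positional tokens
# are 1-indexed, so this prints ['1  a', '2 a '].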