Source code for clkhash.tokenizer
# -*- coding: utf-8 -*-
"""
Functions to tokenize words (PII)
"""
from __future__ import unicode_literals
from typing import Callable, Iterable, Optional, Text
from future.builtins import range
from clkhash import field_formats

def get_tokenizer(fhp  # type: Optional[field_formats.FieldHashingProperties]
                  ):
    # type: (...) -> Callable[[Text, Optional[Text]], Iterable[Text]]
    """ Get tokeniser function from the hash settings.

        This function takes a FieldHashingProperties object. It returns a
        function that takes a string and tokenises based on those properties.
    """
    def dummy(word, ignore=None):
        # type: (Text, Optional[Text]) -> Iterable[Text]
        """ Null tokenizer that returns an empty Iterable.

            A FieldSpec of type Ignore has hashing_properties = None,
            and get_tokenizer has to return something for this case,
            even though it is never called. An alternative would be to
            use an Optional[Callable].

            :param word: not used
            :param ignore: not used
            :return: empty Iterable
        """
        return ('' for i in range(0))
    if not fhp:
        return dummy

    n = fhp.ngram
    if n < 0:
        raise ValueError('`n` in `n`-gram must be non-negative.')

    positional = fhp.positional
    def tok(word, ignore=None):
        # type: (Text, Optional[Text]) -> Iterable[Text]
        """ Produce `n`-grams of `word`.

            :param word: The string to tokenize.
            :param ignore: The substring whose occurrences we remove from
                `word` before tokenization.
            :return: Iterable of n-gram strings.
        """
        if ignore is not None:
            word = word.replace(ignore, '')

        if len(word) == 0:
            return tuple()

        if n > 1:
            # Pad with spaces so the n-grams capture the start and end
            # of the word.
            word = ' {} '.format(word)

        if positional:
            # These are 1-indexed.
            return ('{} {}'.format(i + 1, word[i:i + n])
                    for i in range(len(word) - n + 1))
        else:
            return (word[i:i + n] for i in range(len(word) - n + 1))

    return tok
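
A minimal usage sketch of the tokenizers returned above. Since get_tokenizer
only reads the `ngram` and `positional` attributes, a hypothetical namedtuple
stands in for a real FieldHashingProperties here; the actual class takes more
constructor arguments, which may differ between clkhash versions.

from collections import namedtuple

from clkhash.tokenizer import get_tokenizer

# Hypothetical stand-in exposing only the attributes get_tokenizer reads.
FakeProperties = namedtuple('FakeProperties', ['ngram', 'positional'])

bigram = get_tokenizer(FakeProperties(ngram=2, positional=False))
print(list(bigram('Fred')))
# [' F', 'Fr', 're', 'ed', 'd ']  (space-padded because n > 1)

positional_bigram = get_tokenizer(FakeProperties(ngram=2, positional=True))
print(list(positional_bigram('Fred')))
# ['1  F', '2 Fr', '3 re', '4 ed', '5 d ']

# The optional `ignore` argument strips a substring before tokenizing:
print(list(bigram('Fred Smith', ignore=' ')))  # bigrams of 'FredSmith'

# With no hashing properties (e.g. a FieldSpec of type Ignore), the null
# tokenizer is returned and yields nothing:
print(list(get_tokenizer(None)('anything')))  # []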