| | |
| | |
| | |
| | |
| | |
| | |
| |
|
from collections import deque
from itertools import chain, islice
| |
|
def pad_sequence(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Lazily wrap a sequence with padding symbols ahead of ngram extraction.

    Each requested side receives ``n - 1`` copies of its pad symbol, so that
    every real item can appear in each position of an ngram of degree ``n``.

        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        ['<s>', 1, 2, 3, 4, 5, '</s>']
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        ['<s>', 1, 2, 3, 4, 5]
        >>> list(pad_sequence([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [1, 2, 3, 4, 5, '</s>']

    :param sequence: the source data to be padded
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether padding is added before the first item
    :type pad_left: bool
    :param pad_right: whether padding is added after the last item
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator over the (possibly padded) items
    :rtype: iter
    """
    items = iter(sequence)
    if pad_left:
        # The pad width is only computed when padding is actually requested.
        left_fill = (left_pad_symbol,) * (n - 1)
        items = chain(left_fill, items)
    if pad_right:
        right_fill = (right_pad_symbol,) * (n - 1)
        items = chain(items, right_fill)
    return items
| |
|
| |
|
| | |
| |
|
| |
|
def ngrams(
    sequence,
    n,
    pad_left=False,
    pad_right=False,
    left_pad_symbol=None,
    right_pad_symbol=None,
):
    """
    Return the ngrams generated from a sequence of items, as an iterator.
    For example:

        >>> from nltk.util import ngrams
        >>> list(ngrams([1,2,3,4,5], 3))
        [(1, 2, 3), (2, 3, 4), (3, 4, 5)]

    Wrap with list for a list version of this function.  Set pad_left
    or pad_right to true in order to get additional ngrams:

        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_right=True, right_pad_symbol='</s>'))
        [(1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, left_pad_symbol='<s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5)]
        >>> list(ngrams([1,2,3,4,5], 2, pad_left=True, pad_right=True, left_pad_symbol='<s>', right_pad_symbol='</s>'))
        [('<s>', 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, '</s>')]

    If the (padded) sequence holds fewer than ``n`` items, nothing is yielded.

    :param sequence: the source data to be converted into ngrams
    :type sequence: sequence or iter
    :param n: the degree of the ngrams
    :type n: int
    :param pad_left: whether the ngrams should be left-padded
    :type pad_left: bool
    :param pad_right: whether the ngrams should be right-padded
    :type pad_right: bool
    :param left_pad_symbol: the symbol to use for left padding (default is None)
    :type left_pad_symbol: any
    :param right_pad_symbol: the symbol to use for right padding (default is None)
    :type right_pad_symbol: any
    :return: an iterator yielding each ngram as a tuple of ``n`` items
    :rtype: iter(tuple)
    """
    sequence = pad_sequence(
        sequence, n, pad_left, pad_right, left_pad_symbol, right_pad_symbol
    )

    # Degrees below 1 have always produced 1-tuples here; the window width is
    # clamped so that existing callers keep seeing the same output.
    width = max(n, 1)

    # Prime the window with the first ``width - 1`` items.  The bounded deque
    # discards its oldest entry on append, so sliding the window is O(1)
    # instead of shifting a whole list on every step.
    window = deque(islice(sequence, width - 1), maxlen=width)
    if len(window) < width - 1:
        # The input ran out before a single complete ngram could be formed.
        return

    for item in sequence:
        window.append(item)
        yield tuple(window)