class ToktokTokenizer(TokenizerI): (source)


This is a Python port of tok-tok.pl from https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> print(toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
>>> print(toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
>>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
>>> assert toktok.tokenize(text, return_str=True) == expected
>>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
True
Method tokenize Return a tokenized copy of text.
Constant AMPERCENT Undocumented
Constant CLOSE_PUNCT Undocumented
Constant CLOSE_PUNCT_RE Undocumented
Constant COMMA_IN_NUM Undocumented
Constant CURRENCY_SYM Undocumented
Constant CURRENCY_SYM_RE Undocumented
Constant EN_EM_DASHES Undocumented
Constant FINAL_PERIOD_1 Undocumented
Constant FINAL_PERIOD_2 Undocumented
Constant FUNKY_PUNCT_1 Undocumented
Constant FUNKY_PUNCT_2 Undocumented
Constant LSTRIP Undocumented
Constant MULTI_COMMAS Undocumented
Constant MULTI_DASHES Undocumented
Constant MULTI_DOTS Undocumented
Constant NON_BREAKING Undocumented
Constant ONE_SPACE Undocumented
Constant OPEN_PUNCT Undocumented
Constant OPEN_PUNCT_RE Undocumented
Constant PIPE Undocumented
Constant PROB_SINGLE_QUOTES Undocumented
Constant RSTRIP Undocumented
Constant STUPID_QUOTES_1 Undocumented
Constant STUPID_QUOTES_2 Undocumented
Constant TAB Undocumented
Constant TOKTOK_REGEXES Undocumented
Constant URL_FOE_1 Undocumented
Constant URL_FOE_2 Undocumented
Constant URL_FOE_3 Undocumented
Constant URL_FOE_4 Undocumented

Inherited from TokenizerI:

Method span_tokenize Identify the tokens using integer offsets (start_i, end_i), where s[start_i:end_i] is the corresponding token.
Method span_tokenize_sents Apply self.span_tokenize() to each element of strings, i.e. return [self.span_tokenize(s) for s in strings].
Method tokenize_sents Apply self.tokenize() to each element of strings, i.e. return [self.tokenize(s) for s in strings].
def tokenize(self, text, return_str=False): (source)

Return a tokenized copy of text.

Returns
list of str — the tokens; if return_str=True, a single space-separated string instead.
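For example (assuming NLTK is installed; the tokens below follow the doctest output shown above):

```python
from nltk.tokenize import ToktokTokenizer

toktok = ToktokTokenizer()
# Default: a list of token strings
tokens = toktok.tokenize('Is 9.5 or 525,600 my favorite number?')
print(tokens)
# ['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']
```

With return_str=True the same result is returned as one space-separated string rather than a list.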
AMPERCENT = (source)

Undocumented

Value
(re.compile(r'& '), '&amp; ')
CLOSE_PUNCT = (source)

Undocumented

Value
str(')]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞)]}⦆」')
CLOSE_PUNCT_RE = (source)

Undocumented

Value
(re.compile(u"""([{}])""".format(CLOSE_PUNCT)), '\\1 ')
COMMA_IN_NUM = (source)

Undocumented

Value
(re.compile(r'(?<!,)([,\u060c])(?![,\d])'), ' \\1 ')
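To illustrate, the COMMA_IN_NUM pair pads a comma with spaces unless it sits inside a digit sequence, which is why 525,600 survives tokenization intact. A minimal sketch using only the stdlib re module (ONE_SPACE, another constant from this table, then collapses the doubled spaces):

```python
import re

# COMMA_IN_NUM: pad ',' (or Arabic comma) unless preceded by ',' or
# followed by ',' or a digit, so numbers like 525,600 stay whole.
pattern, replacement = re.compile(r'(?<!,)([,\u060c])(?![,\d])'), r' \1 '
padded = pattern.sub(replacement, 'That, I think, is 525,600 minutes')
# ONE_SPACE: collapse runs of spaces left behind by the substitution
result = re.sub(r' {2,}', ' ', padded)
print(result)
# That , I think , is 525,600 minutes
```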
CURRENCY_SYM = (source)

Undocumented

Value
str('$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩$¢£¥₩')
CURRENCY_SYM_RE = (source)

Undocumented

Value
(re.compile(u"""([{}])""".format(CURRENCY_SYM)), '\\1 ')
EN_EM_DASHES = (source)

Undocumented

Value
(re.compile(r'([\u2013\u2014])'), ' \\1 ')
FINAL_PERIOD_1 = (source)

Undocumented

Value
(re.compile(r'(?<!\.)\.$'), ' .')
FINAL_PERIOD_2 = (source)

Undocumented

Value
(re.compile(r'(?<!\.)\.\s*(["\'\u2019\xbb\u203a\u201d]) *$'), ' . \\1')
FUNKY_PUNCT_1 = (source)

Undocumented

Value
(re.compile(r'([\u060c;\u061b\xbf!"\]\)\}\xbb\u203a\u201d\u061f\xa1%\u066a\xb0\xb1\xa9\xae\u0964\u0965\u2026])'), ' \\1 ')
FUNKY_PUNCT_2 = (source)

Undocumented

Value
(re.compile(r'([\(\{\[\u201c\u2018\u201e\u201a\xab\u2039\u300c\u300e])'), ' \\1 ')
LSTRIP = (source)

Undocumented

Value
(re.compile(r'^ +'), '')
MULTI_COMMAS = (source)

Undocumented

Value
(re.compile(r'(,{2,})'), ' \\1 ')
MULTI_DASHES = (source)

Undocumented

Value
(re.compile(r'(-{2,})'), ' \\1 ')
MULTI_DOTS = (source)

Undocumented

Value
(re.compile(r'(\.{2,})'), ' \\1 ')
NON_BREAKING = (source)

Undocumented

Value
(re.compile(r'\xa0'), ' ')
ONE_SPACE = (source)

Undocumented

Value
(re.compile(r' {2,}'), ' ')
OPEN_PUNCT = (source)

Undocumented

Value
str('([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝([{⦅「')
OPEN_PUNCT_RE = (source)

Undocumented

Value
(re.compile(u"""([{}])""".format(OPEN_PUNCT)), '\\1 ')
PIPE = (source)

Undocumented

Value
(re.compile(r'\|'), ' &#124; ')
PROB_SINGLE_QUOTES = (source)

Undocumented

Value
(re.compile(r'([\'\u2019`])'), ' \\1 ')
RSTRIP = (source)

Undocumented

Value
(re.compile(r'\s+$'), '\n')
STUPID_QUOTES_1 = (source)

Undocumented

Value
(re.compile(r' ` ` '), ' `` ')
STUPID_QUOTES_2 = (source)

Undocumented

Value
(re.compile(r' \' \' '), ' \'\' ')
TAB = (source)

Undocumented

Value
(re.compile(r'\t'), ' &#9; ')
URL_FOE_1 = (source)

Undocumented

Value
(re.compile(r':(?!//)'), ' : ')
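The negative lookahead here is what keeps URLs intact: a colon is padded only when it is not followed by '//'. A small stdlib-only sketch:

```python
import re

# URL_FOE_1: pad ':' with spaces unless it starts '://', so URLs
# such as https://example.com pass through untouched.
pattern, replacement = re.compile(r':(?!//)'), ' : '
padded = pattern.sub(replacement, 'weird:things vs https://example.com')
print(padded)
# weird : things vs https://example.com
```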
URL_FOE_2 = (source)

Undocumented

Value
(re.compile(r'\?(?!\S)'), ' ? ')
URL_FOE_3 = (source)

Undocumented

Value
(re.compile(r'(://)[\S\+\.\S\+/\S\+]/'), ' / ')
URL_FOE_4 = (source)

Undocumented

Value
(re.compile(r' /'), ' / ')
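Taken together, each constant above is a (compiled pattern, replacement) pair, and TOKTOK_REGEXES is the ordered list of them; tokenization applies every substitution in sequence and then splits on whitespace. A minimal stdlib-only sketch reproducing a few of the rules (an illustration, not the full pipeline):

```python
import re

# A handful of the (pattern, replacement) pairs documented above,
# applied in order, as the tokenizer does with TOKTOK_REGEXES.
RULES = [
    (re.compile(r':(?!//)'), ' : '),                       # URL_FOE_1
    (re.compile(r'\?(?!\S)'), ' ? '),                      # URL_FOE_2
    (re.compile(r'(?<!,)([,\u060c])(?![,\d])'), r' \1 '),  # COMMA_IN_NUM
    (re.compile(r' {2,}'), ' '),                           # ONE_SPACE
]

def mini_toktok(text):
    for pattern, replacement in RULES:
        text = pattern.sub(replacement, text)
    return text.strip().split()

print(mini_toktok('Is 9.5 or 525,600 my favorite number?'))
# ['Is', '9.5', 'or', '525,600', 'my', 'favorite', 'number', '?']
```

This reproduces the first doctest above for this input; the real tokenizer simply runs the full TOKTOK_REGEXES list.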