public class ContractionTokenizer extends AbstractWordTokenizer implements WordTokenizer
abbreviations, aposTokens, apostropheCanBeQuote, coalesceAsterisks, coalesceHyphens, contractions, contractionsURL, hyphensMatcher, hyphensPattern, logger, preTokenizer
Constructor and Description |
---|
ContractionTokenizer()
Create a contraction tokenizer.
|
Modifier and Type | Method and Description |
---|---|
java.util.List<java.lang.String> |
extractWords(java.lang.String text)
Break text into word tokens.
|
static java.lang.String |
prepareTextForTokenization(java.lang.String str) |
addWordToSentence, findWordOffsets, getLogger, getPreTokenizer, isClosingQuote, isLetterOrSingleQuote, isMultipleHyphens, isSingleOpeningQuote, loadContractions, preprocessToken, setAbbreviations, setAposTokens, setLogger, setPreTokenizer, splitToken
close
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
addWordToSentence, close, findWordOffsets, getPreTokenizer, preprocessToken, setAbbreviations, setAposTokens, setPreTokenizer
close
public ContractionTokenizer()
public static java.lang.String prepareTextForTokenization(java.lang.String str)
public java.util.List<java.lang.String> extractWords(java.lang.String text)
extractWords
in interface WordTokenizer
extractWords
in class AbstractWordTokenizer
text
- Text to break into word tokens. Word tokens may be words, numbers, punctuation, etc.