public class DefaultWordTokenizer extends AbstractWordTokenizer implements WordTokenizer
abbreviations, aposTokens, apostropheCanBeQuote, coalesceAsterisks, coalesceHyphens, contractions, contractionsURL, hyphensMatcher, hyphensPattern, logger, preTokenizer
Constructor and Description |
---|
DefaultWordTokenizer()
Create a simple word tokenizer.
|
Modifier and Type | Method and Description |
---|---|
void |
addWordToSentence(java.util.List<java.lang.String> sentence,
java.lang.String word)
Add word to list of words in sentence.
|
java.util.List<java.lang.String> |
extractWords(java.lang.String text)
Break text into word tokens.
|
findWordOffsets, getLogger, getPreTokenizer, isClosingQuote, isLetterOrSingleQuote, isMultipleHyphens, isSingleOpeningQuote, loadContractions, preprocessToken, setAbbreviations, setAposTokens, setLogger, setPreTokenizer, splitToken
close
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
close, findWordOffsets, getPreTokenizer, preprocessToken, setAbbreviations, setAposTokens, setPreTokenizer
close
public DefaultWordTokenizer()
public java.util.List<java.lang.String> extractWords(java.lang.String text)
extractWords
in interface WordTokenizer
extractWords
in class AbstractWordTokenizer
text
- Text to break into word tokens. Word tokens may be words, numbers, punctuation, etc.
public void addWordToSentence(java.util.List<java.lang.String> sentence, java.lang.String word)
addWordToSentence
in interface WordTokenizer
addWordToSentence
in class AbstractWordTokenizer
sentence
- Result sentence.
word
- Word to add.