public abstract class AbstractSpellingStandardizer extends IsCloseableObject implements SpellingStandardizer, UsesLogger
Modifier and Type | Field and Description |
---|---|
protected java.util.Set<java.lang.String> |
alternateSpellingsWordClasses
Word classes of alternate spellings.
|
protected static java.lang.String |
defaultSpellingsByWordClassFileName
Path to list of irregular word forms.
|
protected Lexicon |
lexicon
Lexicon associated with this standardizer.
|
protected Logger |
logger
Logger used for output.
|
protected TaggedStrings |
mappedSpellings
The map with alternate spellings as keys and standard spellings
as values.
|
protected Map2D<java.lang.String,java.lang.String,java.lang.String> |
spellingsByWordClass
Irregular forms.
|
protected java.util.Set<java.lang.String> |
standardSpellingSet
The set of standard spellings.
|
Constructor and Description |
---|
AbstractSpellingStandardizer()
Create abstract spelling standardizer.
|
Modifier and Type | Method and Description |
---|---|
void |
addCachedSpelling(java.lang.String alternateSpelling,
java.lang.String standardSpelling)
Cache a generated mapped spelling.
|
void |
addMappedSpelling(java.lang.String alternateSpelling,
java.lang.String standardSpelling)
Add a mapped spelling.
|
void |
addStandardSpelling(java.lang.String standardSpelling)
Add a standard spelling.
|
void |
addStandardSpellings(java.util.Collection<java.lang.String> standardSpellings)
Add standard spellings from a collection.
|
java.lang.String |
fixCapitalization(java.lang.String spelling,
java.lang.String standardSpelling)
Fix capitalization of standardized spelling.
|
Lexicon |
getLexicon()
Get the word lexicon.
|
Logger |
getLogger()
Get the logger.
|
TaggedStrings |
getMappedSpellings()
Return the mapped spellings.
|
int |
getNumberOfAlternateSpellings()
Returns number of alternate spellings.
|
int[] |
getNumberOfAlternateSpellingsByWordClass()
Returns number of alternate spellings by word class.
|
int |
getNumberOfStandardSpellings()
Returns number of standard spellings.
|
java.util.Set<java.lang.String> |
getStandardSpellings()
Return the standard spellings.
|
void |
loadAlternativeSpellings(java.io.Reader reader,
java.lang.String delimChars)
Loads alternative spellings from a reader.
|
void |
loadAlternativeSpellings(java.net.URL url,
boolean compressed,
java.lang.String encoding,
java.lang.String delimChars)
Loads alternate spellings from a URL.
|
void |
loadAlternativeSpellings(java.net.URL url,
java.lang.String encoding,
java.lang.String delimChars)
Loads alternate spellings from a URL.
|
void |
loadAlternativeSpellingsByWordClass(java.net.URL spellingsURL,
java.lang.String encoding)
Load alternate to standard spellings by word class.
|
void |
loadStandardSpellings(java.io.Reader reader)
Loads standard spellings from a reader.
|
void |
loadStandardSpellings(java.net.URL url,
boolean compressed,
java.lang.String encoding)
Loads standard spellings from a URL.
|
void |
loadStandardSpellings(java.net.URL url,
java.lang.String encoding)
Loads standard spellings from a URL.
|
java.lang.String |
preprocessSpelling(java.lang.String spelling)
Preprocess spelling.
|
void |
setLexicon(Lexicon lexicon)
Set the lexicon.
|
void |
setLogger(Logger logger)
Set the logger.
|
void |
setMappedSpellings(TaggedStrings mappedSpellings)
Sets map which maps alternate spellings to standard spellings.
|
void |
setStandardSpellings(java.util.Set<java.lang.String> standardSpellings)
Sets standard spellings.
|
java.lang.String[] |
standardizeSpelling(java.lang.String spelling)
Returns standard spellings given a spelling.
|
java.lang.String |
standardizeSpelling(java.lang.String spelling,
java.lang.String wordClass)
Returns a standard spelling given a standard or alternate spelling.
|
close
protected TaggedStrings mappedSpellings
protected java.util.Set<java.lang.String> standardSpellingSet
protected Map2D<java.lang.String,java.lang.String,java.lang.String> spellingsByWordClass
Spellings disambiguated by word class are stored in a HashMap2D. The compound key consists of the word class and alternate spelling, and the value is the standardized spelling.
protected java.util.Set<java.lang.String> alternateSpellingsWordClasses
protected static java.lang.String defaultSpellingsByWordClassFileName
protected Logger logger
protected Lexicon lexicon
public AbstractSpellingStandardizer()
public void loadAlternativeSpellingsByWordClass(java.net.URL spellingsURL, java.lang.String encoding) throws java.io.IOException
loadAlternativeSpellingsByWordClass
in interface SpellingStandardizer
spellingsURL
- URL of alternative spellings by word class.
encoding
- Character set encoding for spellings.
Throws: java.io.IOException
public void loadAlternativeSpellings(java.net.URL url, boolean compressed, java.lang.String encoding, java.lang.String delimChars) throws java.io.IOException
loadAlternativeSpellings
in interface SpellingStandardizer
url
- URL containing alternate spellings to
standard spellings mappings.
compressed
- true if gzip compressed.
encoding
- Text encoding (utf-8, 8859_1, etc.).
delimChars
- Delimiter characters separating spelling pairs.
Throws: java.io.IOException
public void loadAlternativeSpellings(java.net.URL url, java.lang.String encoding, java.lang.String delimChars) throws java.io.IOException
loadAlternativeSpellings
in interface SpellingStandardizer
url
- URL containing alternate spellings to
standard spellings mappings.
encoding
- Text encoding (utf-8, 8859_1, etc.).
delimChars
- Delimiter characters separating spelling pairs.
Throws: java.io.IOException
public void loadAlternativeSpellings(java.io.Reader reader, java.lang.String delimChars) throws java.io.IOException
loadAlternativeSpellings
in interface SpellingStandardizer
reader
- The reader.
delimChars
- Delimiter characters separating spelling pairs.
Throws: java.io.IOException
public void loadStandardSpellings(java.net.URL url, boolean compressed, java.lang.String encoding) throws java.io.IOException
loadStandardSpellings
in interface SpellingStandardizer
url
- URL containing standard spellings.
compressed
- true if gzip compressed.
encoding
- Character set encoding for spellings.
Throws: java.io.IOException
public void loadStandardSpellings(java.net.URL url, java.lang.String encoding) throws java.io.IOException
loadStandardSpellings
in interface SpellingStandardizer
url
- URL containing standard spellings.
encoding
- Character set encoding for spellings.
Throws: java.io.IOException
public void loadStandardSpellings(java.io.Reader reader) throws java.io.IOException
loadStandardSpellings
in interface SpellingStandardizer
reader
- The reader.
Throws: java.io.IOException
public void addMappedSpelling(java.lang.String alternateSpelling, java.lang.String standardSpelling)
addMappedSpelling
in interface SpellingStandardizer
alternateSpelling
- The alternate spelling.
standardSpelling
- The corresponding standard spelling.
public void addStandardSpelling(java.lang.String standardSpelling)
addStandardSpelling
in interface SpellingStandardizer
standardSpelling
- A standard spelling.
public void addStandardSpellings(java.util.Collection<java.lang.String> standardSpellings)
addStandardSpellings
in interface SpellingStandardizer
standardSpellings
- A collection of standard spellings.
public void addCachedSpelling(java.lang.String alternateSpelling, java.lang.String standardSpelling)
alternateSpelling
- The alternate spelling.
standardSpelling
- The corresponding standard spelling.
public void setMappedSpellings(TaggedStrings mappedSpellings)
setMappedSpellings
in interface SpellingStandardizer
mappedSpellings
- Map with alternate spellings as keys
and standard spellings as values.
public void setStandardSpellings(java.util.Set<java.lang.String> standardSpellings)
setStandardSpellings
in interface SpellingStandardizer
standardSpellings
- Set of standard spellings.
public java.lang.String[] standardizeSpelling(java.lang.String spelling)
standardizeSpelling
in interface SpellingStandardizer
spelling
- The spelling.
If no spelling map is defined, the spelling is returned unchanged.
public java.lang.String standardizeSpelling(java.lang.String spelling, java.lang.String wordClass)
standardizeSpelling
in interface SpellingStandardizer
spelling
- The spelling.
wordClass
- The major word class.
public int getNumberOfAlternateSpellings()
getNumberOfAlternateSpellings
in interface SpellingStandardizer
public int[] getNumberOfAlternateSpellingsByWordClass()
getNumberOfAlternateSpellingsByWordClass
in interface SpellingStandardizer
public int getNumberOfStandardSpellings()
getNumberOfStandardSpellings
in interface SpellingStandardizer
public TaggedStrings getMappedSpellings()
getMappedSpellings
in interface SpellingStandardizer
public java.util.Set<java.lang.String> getStandardSpellings()
getStandardSpellings
in interface SpellingStandardizer
public java.lang.String preprocessSpelling(java.lang.String spelling)
preprocessSpelling
in interface SpellingStandardizer
spelling
- Spelling to preprocess.
By default, no preprocessing is applied; the original spelling is returned unchanged.
public java.lang.String fixCapitalization(java.lang.String spelling, java.lang.String standardSpelling)
fixCapitalization
in interface SpellingStandardizer
spelling
- The original spelling.
standardSpelling
- The candidate standard spelling.
public Logger getLogger()
getLogger
in interface UsesLogger
public void setLogger(Logger logger)
setLogger
in interface UsesLogger
logger
- The logger.
public Lexicon getLexicon()
public void setLexicon(Lexicon lexicon)
lexicon
- Lexicon used for tagging.