public class MorphAdornerUtils
extends java.lang.Object
Static utility methods used by MorphAdorner.
Modifier and Type | Field and Description |
---|---|
protected static java.lang.Runtime |
runTime
Runtime system.
|
protected static java.util.regex.Matcher |
underlineCapCapMatcher |
protected static java.util.regex.Pattern |
underlineCapCapPattern
Pattern to match _CapCap
|
Modifier | Constructor and Description |
---|---|
protected |
MorphAdornerUtils()
Allow overrides but not instantiation.
|
Modifier and Type | Method and Description |
---|---|
static int |
countPageBreaks(org.w3c.dom.Document document)
Count page breaks in a document.
|
static NameStandardizer |
createNameStandardizer(Lexicon wordLexicon,
MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Create proper name standardizer.
|
static SpellingMapper |
createSpellingMapper(UTF8Properties properties)
Create spelling mapper.
|
static SpellingStandardizer |
createSpellingStandardizer(Lexicon wordLexicon,
Names names,
MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Create spelling standardizer.
|
static java.lang.String |
durationString(MorphAdornerSettings adornerSettings,
long startTime)
Get duration value for display.
|
static void |
fixEmptySoftTags(XGOptions xgOptions,
org.w3c.dom.Document document)
Fix empty soft tags.
|
static java.lang.String |
fixSpelling(java.lang.String spelling)
Fix spelling.
|
static void |
fixSupTags(org.w3c.dom.Document document)
Fix sup tags.
|
static TaggedStrings |
getExtraWordsList(java.lang.String wordFileName,
java.lang.String posTag,
java.lang.String loadedMessage,
MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Get extra words list.
|
static java.lang.String[] |
getKWIC(java.util.List<AdornedWord> sentence,
int wordIndex,
int KWICWidth)
Generate a KWIC line for a word in a sentence.
|
static java.lang.String |
getLemma(MorphAdorner adorner,
java.lang.String spelling,
java.lang.String partOfSpeech)
Get lemma (possibly compound) for a spelling.
|
protected static java.lang.String |
getStandardizedSpelling(MorphAdorner adorner,
java.lang.String correctedSpelling,
java.lang.String standardizedSpelling,
java.lang.String partOfSpeech)
Get standardized spelling.
|
static int[] |
getWordAndSentenceCounts(java.util.List<java.util.List<java.lang.String>> sentences)
Get actual word and sentence count.
|
static int |
getWordCount(java.util.List<java.util.List<java.lang.String>> sentences)
Get count of words in a list of sentences.
|
static TaggedStrings |
getWordList(java.lang.String wordFileName,
java.lang.String posTag,
java.lang.String loadedMessage,
MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Get word list.
|
static boolean |
isAdorned(java.lang.String xmlFileName,
int maxLinesToCheck)
Check if file is already adorned.
|
static Lexicon |
loadSuffixLexicon(MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Loads the suffix lexicon.
|
static void |
loadTaggerRules(PartOfSpeechTagger tagger,
MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Loads part of speech tagger rules.
|
static TransitionMatrix |
loadTransitionMatrix(PartOfSpeechTagger tagger,
MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Loads the transition matrix.
|
static Lexicon |
loadWordLexicon(MorphAdornerSettings adornerSettings,
MorphAdornerLogger adornerLogger)
Loads the word lexicon.
|
static void |
logMemoryUsage(MorphAdornerLogger adornerLogger,
java.lang.String label)
Log current memory usage.
|
protected static java.util.regex.Pattern underlineCapCapPattern
protected static final java.util.regex.Matcher underlineCapCapMatcher
protected static java.lang.Runtime runTime
protected MorphAdornerUtils()
public static int countPageBreaks(org.w3c.dom.Document document)
document
- The DOM document.
public static SpellingMapper createSpellingMapper(UTF8Properties properties) throws java.io.IOException
properties
- MorphAdorner properties.
Throws: java.io.IOException
public static NameStandardizer createNameStandardizer(Lexicon wordLexicon, MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger) throws java.io.IOException
wordLexicon
- The word lexicon containing names.
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
Throws: java.io.IOException
public static Lexicon loadWordLexicon(MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger) throws java.io.IOException
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
Throws: java.io.IOException
public static Lexicon loadSuffixLexicon(MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger) throws java.io.IOException
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
Throws: java.io.IOException
public static TransitionMatrix loadTransitionMatrix(PartOfSpeechTagger tagger, MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger) throws java.io.IOException
tagger
- Part of speech tagger.
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
Throws: java.io.IOException
public static void loadTaggerRules(PartOfSpeechTagger tagger, MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger) throws InvalidRuleException, java.io.IOException
tagger
- Part of speech tagger.
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
Throws: InvalidRuleException
Throws: java.io.IOException
public static SpellingStandardizer createSpellingStandardizer(Lexicon wordLexicon, Names names, MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger) throws java.io.IOException
wordLexicon
- The word lexicon.
names
- The names list.
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
Throws: java.io.IOException
public static java.lang.String durationString(MorphAdornerSettings adornerSettings, long startTime)
adornerSettings
- Adorner settings.
startTime
- Start time.
public static void fixEmptySoftTags(XGOptions xgOptions, org.w3c.dom.Document document)
xgOptions
- XML parsing options.
document
- The DOM document.
On exit, the DOM document has empty soft tags
expanded with a single blank as text, except
public static java.lang.String fixSpelling(java.lang.String spelling)
spelling
- Original spelling.
public static void fixSupTags(org.w3c.dom.Document document)
document
- The DOM document.
Prepends a special marker character to the start of the text enclosed in <sup> tags to allow disambiguation of old printer's abbreviations from other types of abbreviations. The special marker character is removed before the adorned XML text is written out.
Abbreviations and their expansions: E = the; yT = that; yc = the; ye = the; yen = then; yere = there; yf = if; yi = thy; ym = them; yn = than; yo = the; yt = that; yu = thou; y = that; wch = which; wt = with.
public static java.lang.String getLemma(MorphAdorner adorner, java.lang.String spelling, java.lang.String partOfSpeech)
adorner
- The adorner.
spelling
- The spelling.
partOfSpeech
- The part of speech tag.
protected static java.lang.String getStandardizedSpelling(MorphAdorner adorner, java.lang.String correctedSpelling, java.lang.String standardizedSpelling, java.lang.String partOfSpeech)
adorner
- Adorner.
correctedSpelling
- The spelling.
standardizedSpelling
- The initial standardized spelling.
partOfSpeech
- The part of speech tag.
public static java.lang.String[] getKWIC(java.util.List<AdornedWord> sentence, int wordIndex, int KWICWidth)
sentence
- The sentence as an array list.
wordIndex
- The index of the word for which to generate a KWIC.
KWICWidth
- Maximum width (in characters) of KWIC text.
public static int getWordCount(java.util.List<java.util.List<java.lang.String>> sentences)
sentences
- List of sentences each containing list of words.
public static int[] getWordAndSentenceCounts(java.util.List<java.util.List<java.lang.String>> sentences)
sentences
- List of sentences each containing list of words.
Sentences and words containing only the special separator marker character are not counted.
public static TaggedStrings getWordList(java.lang.String wordFileName, java.lang.String posTag, java.lang.String loadedMessage, MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger)
wordFileName
- File name of word list.
posTag
- Part of speech tag for each word.
loadedMessage
- Message to display when words loaded.
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
public static TaggedStrings getExtraWordsList(java.lang.String wordFileName, java.lang.String posTag, java.lang.String loadedMessage, MorphAdornerSettings adornerSettings, MorphAdornerLogger adornerLogger)
wordFileName
- File name of word list.
posTag
- Part of speech tag for each word.
loadedMessage
- Message to display when words loaded.
adornerSettings
- The adorner settings.
adornerLogger
- The adorner logger.
public static boolean isAdorned(java.lang.String xmlFileName, int maxLinesToCheck)
xmlFileName
- File to check for being adorned.
maxLinesToCheck
- Maximum # of lines to read looking for a " [remainder of this description was lost in extraction]
public static void logMemoryUsage(MorphAdornerLogger adornerLogger, java.lang.String label)
adornerLogger
- The adorner logger.label
- Label for memory usage.