public class IDFixerFilter extends ExtendedXMLFilterImpl
Modifier and Type | Field and Description |
---|---|
protected java.lang.String |
baseFileName
Base XML file name for generating ID values.
|
protected QueueStack<java.lang.String> |
divStack
Div tag stack.
|
protected java.lang.String |
elementURI
URI for elements without one.
|
protected int |
emittedWordCount
Current number of words emitted.
|
protected java.lang.String |
facsFromPB
"facs" or "ref" from current <pb> element.
|
protected QueueStack<java.lang.String> |
foreignStack
Foreign language attribute stack.
|
protected int |
gapCount
Gap count.
|
protected java.lang.String |
id
Current ID value as string.
|
protected static java.text.NumberFormat |
ID_FORMATTER
Word ID formatters.
|
protected java.lang.String |
idAttrName
ID attribute name.
|
protected int |
idSpacing
ID Spacing.
|
protected MorphAdornerSettings.XMLIDType |
idType
ID Type.
|
protected boolean |
isFirstWord
True if we're processing first word in a sentence.
|
protected QueueStack<XMLWriterState> |
jumpStack
Jump tag stack.
|
protected static java.util.Map<java.lang.String,java.lang.String> |
languageTags
Map of XML language tags to language name.
|
protected java.lang.String |
lastID
Previous ID value.
|
protected java.lang.String |
lastIDString |
protected MorphAdornerSettings |
morphAdornerSettings
MorphAdorner settings.
|
protected java.lang.String |
nFromPB
"n" from current <pb> element.
|
protected boolean |
outputNonredundantAttributesOnly
True to output non-redundant attributes only.
|
protected boolean |
outputNonredundantEosAttribute
True to output non-redundant eos attributes only.
|
protected boolean |
outputNonredundantPartAttribute
True to output non-redundant part attributes only.
|
protected boolean |
outputNonredundantTokenAttribute
True to output non-redundant token attributes only.
|
protected boolean |
outputPseudoPageBoundaryMilestones
True to output page boundary milestones.
|
protected boolean |
outputSentenceBoundaryMilestones
True to output sentence boundary milestones.
|
protected boolean |
outputWhitespace
True to output whitespace elements.
|
protected boolean |
outputWordOrdinal
Output word ordinal.
|
protected static java.text.NumberFormat |
PAGE_FORMATTER
Page number formatter.
|
protected int |
pageColumn
Column (0-based) within current page based upon repeated
facs= value of pb.
|
protected int |
pageNumber
Running page number.
|
protected PendingElement |
pendingWordElement
Pending word element.
|
protected PartOfSpeechTags |
posTags
Part of speech tags used in XML file.
|
protected java.lang.String |
prevFacsFromPB
"facs" or "ref" from previous <pb> element.
|
protected java.util.Set<java.lang.String> |
pseudoPageContainerDivTypes
Pseudo-page ending div types.
|
protected int |
pseudoPageCount
Current pseudo page count.
|
protected int |
pseudoPageSize
Page size in number of tokens.
|
protected boolean |
pseudoPageStarted
True if pseudo page started.
|
protected int |
pseudoPageWordCount
Current pseudo page word count.
|
protected XMLSentenceMelder |
sentenceMelder
XML sentence melder.
|
protected SortedArrayList<SentenceAndWordNumber> |
sortedWords
Sorted sentence and word number information.
|
protected java.util.Map<java.lang.Integer,java.lang.Integer> |
splitWords
Split words map of word ID to # of word parts.
|
protected java.util.Map<java.lang.Integer,java.lang.Integer> |
splitWordsCopy
Copy of split words map.
|
protected boolean |
tokenizingOnly
True if only doing tokenization.
|
protected int |
totalWordsToEmit
Total number of words to emit.
|
protected boolean |
usePCToMarkEndOfSentence
True to use <pc> elements to mark end of sentence.
|
protected static java.text.NumberFormat |
WORD_FORMATTER
Word within page formatter.
|
protected int |
wordNumberWithinPage
Word within page number.
|
protected int |
wordOrdinal
Word ordinal.
|
protected XMLWriter |
writer
XML writer.
|
protected java.lang.String |
xmlTokenLabelAttribute
Token label attribute.
|
protected boolean |
xmlTokenLabelEmit
Emit token label?
|
protected boolean |
xmlTokenLabelPrependWorkName
Token label prepend work name flag.
|
protected int |
xmlTokenLabelSpacing
Token label spacing.
|
protected boolean |
zzzzljTagSeen
"zzzzlj" tag found.
|
Constructor and Description |
---|
IDFixerFilter(org.xml.sax.XMLReader reader,
PartOfSpeechTags posTags,
java.lang.String outFile,
int maxID,
SortedArrayList<SentenceAndWordNumber> sortedWords,
java.util.Map<java.lang.Integer,java.lang.Integer> splitWords,
int totalWords,
int totalPageBreaks,
MorphAdornerSettings morphAdornerSettings,
boolean tokenizingOnly)
Create ID filter.
|
Modifier and Type | Method and Description |
---|---|
void |
characters(char[] ch,
int start,
int length)
Handle character data.
|
PendingElement |
createPseudoPageElement(java.lang.String uri,
boolean forcedEmit,
boolean start,
java.lang.String path)
Create a pseudo page milestone.
|
void |
emitPseudoPageElement(PendingElement pseudoPageElement)
Emit a pseudo page milestone.
|
void |
emitWordElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName,
org.xml.sax.helpers.AttributesImpl atts,
java.lang.String wordText,
boolean allowOutputWhitespace,
boolean forceEOS)
Emit a word element.
|
void |
endElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName)
Handle end of an element.
|
protected java.lang.String |
generateTokenLabel()
Generate token label based upon current pb element attributes.
|
protected static java.lang.String |
getDisplayableAttributes(org.xml.sax.Attributes atts)
Get displayable list of element attributes and values.
|
java.lang.String |
getForeignLanguageTag(java.lang.String qName,
org.xml.sax.Attributes atts)
Get the foreign language tag for XML element.
|
XMLSentenceMelder |
getSentenceMelder()
Get associated sentence melder.
|
protected void |
setIDFormat(java.lang.String outFile,
int maxID,
int maxPageBreaks)
Set word ID format.
|
void |
setPosTags(PartOfSpeechTags posTags)
Set the part of speech tags.
|
void |
setSentenceMelder(XMLSentenceMelder sentenceMelder)
Set associated sentence melder.
|
protected void |
setSplitWords(java.util.Map<java.lang.Integer,java.lang.Integer> splitWords)
Set split words.
|
void |
setWriter(XMLWriter writer)
Set associated XML writer.
|
void |
startElement(java.lang.String uri,
java.lang.String localName,
java.lang.String qName,
org.xml.sax.Attributes atts)
Handle start of an XML element.
|
removeAttribute, setAttributeValue, setAttributeValue, setAttributeValue
endDocument, endPrefixMapping, error, fatalError, getContentHandler, getDTDHandler, getEntityResolver, getErrorHandler, getFeature, getParent, getProperty, ignorableWhitespace, notationDecl, parse, parse, processingInstruction, resolveEntity, setContentHandler, setDocumentLocator, setDTDHandler, setEntityResolver, setErrorHandler, setFeature, setParent, setProperty, skippedEntity, startDocument, startPrefixMapping, unparsedEntityDecl, warning
protected static final java.text.NumberFormat ID_FORMATTER
protected static final java.text.NumberFormat PAGE_FORMATTER
protected static final java.text.NumberFormat WORD_FORMATTER
protected int wordOrdinal
protected java.lang.String lastID
protected java.lang.String lastIDString
protected java.lang.String id
protected java.lang.String idAttrName
protected java.lang.String baseFileName
protected PartOfSpeechTags posTags
protected java.lang.String elementURI
protected boolean outputWhitespace
protected boolean outputNonredundantAttributesOnly
protected boolean outputNonredundantTokenAttribute
protected boolean outputNonredundantPartAttribute
protected boolean outputNonredundantEosAttribute
protected boolean outputSentenceBoundaryMilestones
protected boolean usePCToMarkEndOfSentence
protected boolean outputPseudoPageBoundaryMilestones
protected int pseudoPageSize
protected int pseudoPageCount
protected int pseudoPageWordCount
protected boolean pseudoPageStarted
protected int emittedWordCount
protected XMLSentenceMelder sentenceMelder
protected boolean isFirstWord
protected PendingElement pendingWordElement
protected java.util.Map<java.lang.Integer,java.lang.Integer> splitWords
protected java.util.Map<java.lang.Integer,java.lang.Integer> splitWordsCopy
protected QueueStack<java.lang.String> foreignStack
protected QueueStack<XMLWriterState> jumpStack
protected QueueStack<java.lang.String> divStack
protected java.util.Set<java.lang.String> pseudoPageContainerDivTypes
protected SortedArrayList<SentenceAndWordNumber> sortedWords
protected XMLWriter writer
protected int totalWordsToEmit
protected int pageNumber
protected int wordNumberWithinPage
protected java.lang.String nFromPB
protected java.lang.String facsFromPB
protected java.lang.String prevFacsFromPB
protected int pageColumn
protected int idSpacing
protected MorphAdornerSettings.XMLIDType idType
protected boolean xmlTokenLabelEmit
protected java.lang.String xmlTokenLabelAttribute
protected int xmlTokenLabelSpacing
protected boolean xmlTokenLabelPrependWorkName
protected boolean outputWordOrdinal
protected static java.util.Map<java.lang.String,java.lang.String> languageTags
protected int gapCount
protected MorphAdornerSettings morphAdornerSettings
protected boolean tokenizingOnly
protected boolean zzzzljTagSeen
public IDFixerFilter(org.xml.sax.XMLReader reader, PartOfSpeechTags posTags, java.lang.String outFile, int maxID, SortedArrayList<SentenceAndWordNumber> sortedWords, java.util.Map<java.lang.Integer,java.lang.Integer> splitWords, int totalWords, int totalPageBreaks, MorphAdornerSettings morphAdornerSettings, boolean tokenizingOnly)
reader - The XML reader to filter.
posTags - The part of speech tags.
outFile - The output file name.
maxID - The maximum integer word ID.
sortedWords - Sentence and word numbers sorted by word ID.
splitWords - Split words.
totalWords - Total words.
totalPageBreaks - Total page breaks.
morphAdornerSettings - MorphAdorner settings.
tokenizingOnly - True if only emitting tokenization.
protected java.lang.String generateTokenLabel()
public void startElement(java.lang.String uri, java.lang.String localName, java.lang.String qName, org.xml.sax.Attributes atts) throws org.xml.sax.SAXException
startElement
in interface org.xml.sax.ContentHandler
startElement
in class org.xml.sax.helpers.XMLFilterImpl
org.xml.sax.SAXException
public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException
characters
in interface org.xml.sax.ContentHandler
characters
in class org.xml.sax.helpers.XMLFilterImpl
ch - Array of characters.
start - The starting position in the array.
length - The number of characters.
org.xml.sax.SAXException - If there is an error.
public void emitWordElement(java.lang.String uri, java.lang.String localName, java.lang.String qName, org.xml.sax.helpers.AttributesImpl atts, java.lang.String wordText, boolean allowOutputWhitespace, boolean forceEOS) throws org.xml.sax.SAXException
uri - The word element's URI.
localName - The word element's local name.
qName - The word element's qname.
atts - The word element's attributes.
wordText - The word element's text.
allowOutputWhitespace - True to allow outputting whitespace element for word.
forceEOS - True to force end of sentence for this word.
org.xml.sax.SAXException
public void endElement(java.lang.String uri, java.lang.String localName, java.lang.String qName) throws org.xml.sax.SAXException
endElement
in interface org.xml.sax.ContentHandler
endElement
in class org.xml.sax.helpers.XMLFilterImpl
uri - The XML element's URI.
localName - The XML element's local name.
qName - The XML element's qname.
org.xml.sax.SAXException
public PendingElement createPseudoPageElement(java.lang.String uri, boolean forcedEmit, boolean start, java.lang.String path)
uri - Element URI.
forcedEmit - Emit pseudo page milestone even if not enough words accumulated, as long as at least one word in current block.
start - true if starting milestone, false if ending.
path - Path attribute. May be null.
public void emitPseudoPageElement(PendingElement pseudoPageElement)
pseudoPageElement - The pseudo page element to emit.
public void setPosTags(PartOfSpeechTags posTags)
posTags - The part of speech tags.
protected void setSplitWords(java.util.Map<java.lang.Integer,java.lang.Integer> splitWords)
splitWords - Map of split words.
protected void setIDFormat(java.lang.String outFile, int maxID, int maxPageBreaks)
outFile - Output file name used to derive word IDs.
maxID - Maximum integer word ID value.
maxPageBreaks - Maximum number of page breaks.
public void setWriter(XMLWriter writer)
writer - XML writer.
public void setSentenceMelder(XMLSentenceMelder sentenceMelder)
sentenceMelder - Sentence melder.
public XMLSentenceMelder getSentenceMelder()
public java.lang.String getForeignLanguageTag(java.lang.String qName, org.xml.sax.Attributes atts)
qName - XML element name.
atts - XML element attributes.
protected static java.lang.String getDisplayableAttributes(org.xml.sax.Attributes atts)
atts
- Attributes.