public class XMLTextInputter extends IsCloseableObject implements TextInputter
The XML file can be divided into smaller sections which are stored in a map. MorphAdorner uses a modified XGTagger interface to adorn each section of text separately, and then merge the results to produce the final adorned XML output.
Modifier and Type | Field and Description |
---|---|
protected org.jdom2.Document |
document
The document text object.
|
protected java.lang.String |
encoding
Text encoding.
|
protected boolean |
fixGaps
True to fix gap tags.
|
protected boolean |
fixOrigs
True to fix orig tags.
|
protected boolean |
fixSplitWords
True to fix selected split words.
|
protected java.util.List<PatternReplacer> |
fixSplitWordsPatternReplacers
Pattern replacers for fixing split words.
|
protected java.util.Map<java.lang.String,java.lang.Object> |
segmentMap
Map which holds segmented XML text.
|
protected java.util.List<java.lang.String> |
segmentNames
Segment names.
|
protected boolean |
splitText
True to split text body into segments.
|
protected boolean |
storesSegmentFiles
True if segment files are stored in the segment map
instead of segment text.
|
protected java.lang.String |
teiHeaderPattern
TEI header element pattern.
|
protected int |
textID
Text ID number for generated XML segments.
|
Constructor and Description |
---|
XMLTextInputter()
Create XML text inputter.
|
Modifier and Type | Method and Description |
---|---|
void |
close()
Close inputter.
|
protected void |
doLoadDocument(org.jdom2.Document document,
java.lang.String schemaURI,
java.lang.String docPath)
Loads text from a document into a map.
|
protected void |
doLoadText(java.net.URL url,
java.lang.String encoding,
java.lang.String schemaURI)
Loads text from a URL into a map.
|
void |
enableGapFixer(boolean fixGaps)
Enable gap element fixer.
|
void |
enableOrigFixer(boolean fixOrigs)
Enable orig element fixer.
|
void |
enableSplitWordsFixer(boolean fixSplitWords,
java.util.List<PatternReplacer> patternReplacers)
Enable split words fixer.
|
void |
finalize()
Finalize.
|
org.jdom2.Element |
findChild(org.jdom2.Element parent,
java.lang.String namePat)
Find child node name matching regular expression.
|
protected int |
getNextTextID()
Get next text ID.
|
protected java.lang.String |
getSegment(java.lang.String segmentName)
Get segment text.
|
int |
getSegmentCount()
Returns number of text segments.
|
java.lang.String |
getSegmentName(int segmentNumber)
Returns name of specified segment.
|
java.lang.String |
getSegmentText(int segmentNumber)
Returns specified segment of loaded text.
|
java.lang.String |
getSegmentText(java.lang.String segmentName)
Returns specified segment of loaded text.
|
void |
loadText(java.lang.String str)
Reads text from a String.
|
void |
loadText(java.lang.String str,
java.lang.String xmlSchemaURI)
Reads text from a string using a specified XML schema.
|
void |
loadText(java.net.URL url,
java.lang.String encoding)
Reads text from a URL into a string.
|
void |
loadText(java.net.URL url,
java.lang.String encoding,
java.lang.String xmlSchemaURI)
Reads text from a URL using a specified XML schema.
|
protected void |
putSegment(java.lang.String segmentName,
java.lang.String segmentText)
Save segment text.
|
void |
setSegmentText(int segmentNumber,
java.io.File segmentTextFile)
Updates specified segment of loaded text from file.
|
void |
setSegmentText(int segmentNumber,
java.lang.String segmentText)
Updates specified segment of loaded text.
|
void |
setSegmentText(java.lang.String segmentName,
java.io.File segmentTextFile)
Updates specified segment of loaded text from file.
|
void |
setSegmentText(java.lang.String segmentName,
java.lang.String segmentText)
Updates specified segment of loaded text.
|
boolean |
usesSegmentFiles()
Does inputter use segment files?
|
protected void |
writeChildren(org.jdom2.Element element,
java.lang.String baseFileName,
boolean splitText)
Store children of a DOM element.
|
protected java.util.Map<java.lang.String,java.lang.Object> segmentMap
The key is the segment name. The value is the segment data. The value may be something else in subclasses.
protected int textID
protected java.util.List<java.lang.String> segmentNames
protected java.lang.String encoding
protected boolean splitText
protected boolean fixGaps
protected boolean fixOrigs
protected boolean fixSplitWords
protected java.util.List<PatternReplacer> fixSplitWordsPatternReplacers
protected final java.lang.String teiHeaderPattern
protected org.jdom2.Document document
protected boolean storesSegmentFiles
protected void doLoadDocument(org.jdom2.Document document, java.lang.String schemaURI, java.lang.String docPath) throws org.jdom2.JDOMException, java.io.IOException, java.net.URISyntaxException, org.xml.sax.SAXException
document
- Document from which to read text.
schemaURI
- XML schema URI. Null if none.
docPath
- Path for original document.
java.io.IOException
- If an I/O error occurs.
org.jdom2.JDOMException
java.net.URISyntaxException
org.xml.sax.SAXException
protected void doLoadText(java.net.URL url, java.lang.String encoding, java.lang.String schemaURI) throws org.jdom2.JDOMException, java.io.IOException, java.net.URISyntaxException, org.xml.sax.SAXException
url
- URL from which to read text.
encoding
- Text encoding.
schemaURI
- XML schema URI. Null if none.
java.io.IOException
- If an I/O error occurs.
org.jdom2.JDOMException
java.net.URISyntaxException
org.xml.sax.SAXException
public org.jdom2.Element findChild(org.jdom2.Element parent, java.lang.String namePat)
parent
- Node whose child we want.
namePat
- Regular expression for child name.
public void loadText(java.net.URL url, java.lang.String encoding) throws java.lang.Exception
loadText
in interface TextInputter
url
- URL from which to read text.
encoding
- Text encoding.
java.lang.Exception
- If an error occurs.
public void loadText(java.net.URL url, java.lang.String encoding, java.lang.String xmlSchemaURI) throws java.lang.Exception
loadText
in interface TextInputter
url
- URL from which to read text.
encoding
- Text encoding.
xmlSchemaURI
- String URI specifying XML schema.
java.lang.Exception
- If an error occurs.
The schema and schema type should be ignored when the input is not an XML file.
public void loadText(java.lang.String str) throws java.lang.Exception
loadText
in interface TextInputter
str
- String from which to read text.
java.io.IOException
- If an error occurs.
java.lang.Exception
public void loadText(java.lang.String str, java.lang.String xmlSchemaURI) throws java.lang.Exception
loadText
in interface TextInputter
str
- String from which to read text.
xmlSchemaURI
- String URI specifying XML schema.
java.lang.Exception
- If an error occurs.
The schema and schema type should be ignored when the input is not an XML file.
public int getSegmentCount()
getSegmentCount
in interface TextInputter
public java.lang.String getSegmentName(int segmentNumber)
getSegmentName
in interface TextInputter
segmentNumber
- The segment number (starts at 0).
public java.lang.String getSegmentText(int segmentNumber)
getSegmentText
in interface TextInputter
segmentNumber
- The segment number (starts at 0).
public java.lang.String getSegmentText(java.lang.String segmentName)
getSegmentText
in interface TextInputter
segmentName
- The segment name.
public void setSegmentText(int segmentNumber, java.lang.String segmentText)
setSegmentText
in interface TextInputter
segmentNumber
- The segment number (starts at 0).
segmentText
- The updated segment text.
public void setSegmentText(java.lang.String segmentName, java.lang.String segmentText)
setSegmentText
in interface TextInputter
segmentName
- The segment name.
segmentText
- The updated segment text.
public void setSegmentText(int segmentNumber, java.io.File segmentTextFile)
setSegmentText
in interface TextInputter
segmentNumber
- The segment number (starts at 0).
segmentTextFile
- The file containing the updated segment text.
public void setSegmentText(java.lang.String segmentName, java.io.File segmentTextFile)
setSegmentText
in interface TextInputter
segmentName
- The segment name.
segmentTextFile
- The file containing the updated segment text.
protected int getNextTextID()
protected void writeChildren(org.jdom2.Element element, java.lang.String baseFileName, boolean splitText)
element
- The DOM element to store.
baseFileName
- The base file name for entry names generated from the DOM element.
splitText
- True to split body text into segments.
protected java.lang.String getSegment(java.lang.String segmentName)
segmentName
- Segment name.
protected void putSegment(java.lang.String segmentName, java.lang.String segmentText)
segmentName
- Segment name.
segmentText
- Segment text.
public void enableGapFixer(boolean fixGaps)
enableGapFixer
in interface TextInputter
fixGaps
- true to fix gap tags.
public void enableOrigFixer(boolean fixOrigs)
enableOrigFixer
in interface TextInputter
fixOrigs
- true to fix orig tags.
public void enableSplitWordsFixer(boolean fixSplitWords, java.util.List<PatternReplacer> patternReplacers)
enableSplitWordsFixer
in interface TextInputter
fixSplitWords
- true to fix selected split words.
patternReplacers
- Patterns for fixing split words.
public boolean usesSegmentFiles()
usesSegmentFiles
in interface TextInputter
public void close()
close
in class IsCloseableObject
public void finalize() throws java.lang.Throwable
finalize
in class java.lang.Object
java.lang.Throwable