public class PDFText2HTML extends PDFTextStripper
Modifier and Type | Class and Description |
---|---|
private static class |
PDFText2HTML.FontState
A helper class to maintain the current font state.
|
Modifier and Type | Field and Description |
---|---|
private PDFText2HTML.FontState |
fontState |
private static int |
INITIAL_PDF_TO_HTML_BYTES |
charactersByArticle, document, LINE_SEPARATOR, output
Constructor and Description |
---|
PDFText2HTML()
Constructor.
|
Modifier and Type | Method and Description |
---|---|
private static void |
appendEscaped(java.lang.StringBuilder builder,
char character) |
protected void |
endArticle()
Write out the article separator.
|
void |
endDocument(PDDocument document)
This method is available for subclasses of this class.
|
private static java.lang.String |
escape(java.lang.String chars)
Escape some HTML characters.
|
protected java.lang.String |
getTitle()
This method will attempt to guess the title of the document using
either the document properties or the first lines of text.
|
protected void |
startArticle(boolean isLTR)
Write out the article separator (div tag) with proper text direction
information.
|
protected void |
startDocument(PDDocument document)
This method is available for subclasses of this class.
|
protected void |
writeHeader()
Deprecated.
Use startDocument(PDDocument) instead. |
protected void |
writeParagraphEnd()
Writes the paragraph end "</p>" to the output.
|
protected void |
writeString(java.lang.String chars)
Write a string to the output stream and escape some HTML characters.
|
protected void |
writeString(java.lang.String text,
java.util.List<TextPosition> textPositions)
Write a string to the output stream, maintain font state, and escape some HTML characters.
|
endPage, getAddMoreFormatting, getArticleEnd, getArticleStart, getAverageCharTolerance, getCharactersByArticle, getCurrentPageNo, getDropThreshold, getEndBookmark, getEndPage, getIndentThreshold, getLineSeparator, getListItemPatterns, getOutput, getPageEnd, getPageStart, getParagraphEnd, getParagraphStart, getSeparateByBeads, getSortByPosition, getSpacingTolerance, getStartBookmark, getStartPage, getSuppressDuplicateOverlappingText, getText, getWordSeparator, matchPattern, processPage, processPages, processTextPosition, setAddMoreFormatting, setArticleEnd, setArticleStart, setAverageCharTolerance, setDropThreshold, setEndBookmark, setEndPage, setIndentThreshold, setLineSeparator, setListItemPatterns, setPageEnd, setPageStart, setParagraphEnd, setParagraphStart, setShouldSeparateByBeads, setSortByPosition, setSpacingTolerance, setStartBookmark, setStartPage, setSuppressDuplicateOverlappingText, setWordSeparator, startArticle, startPage, writeCharacters, writeLineSeparator, writePage, writePageEnd, writePageStart, writeParagraphSeparator, writeParagraphStart, writeText, writeWordSeparator
showGlyph
addOperator, applyTextAdjustment, beginMarkedContentSequence, beginText, decreaseLevel, endMarkedContentSequence, endText, getAppearance, getCurrentPage, getGraphicsStackSize, getGraphicsState, getInitialMatrix, getLevel, getResources, getTextLineMatrix, getTextMatrix, increaseLevel, operatorException, processAnnotation, processChildStream, processOperator, processOperator, processSoftMask, processTilingPattern, processTilingPattern, processTransparencyGroup, processType3Stream, registerOperatorProcessor, restoreGraphicsStack, restoreGraphicsState, saveGraphicsStack, saveGraphicsState, setLineDashPattern, setTextLineMatrix, setTextMatrix, showAnnotation, showFontGlyph, showForm, showText, showTextString, showTextStrings, showTransparencyGroup, showType3Glyph, transformedPoint, transformWidth, unsupportedOperator
private static final int INITIAL_PDF_TO_HTML_BYTES
private final PDFText2HTML.FontState fontState
public PDFText2HTML() throws java.io.IOException
java.io.IOException
- If there is an error during initialization.
protected void writeHeader() throws java.io.IOException
Deprecated. Use startDocument(PDDocument) instead.
java.io.IOException
- If there is a problem writing out the header to the document.
protected void startDocument(PDDocument document) throws java.io.IOException
PDFTextStripper
startDocument
in class PDFTextStripper
document
- The PDF document that is being processed.
java.io.IOException
- If an IO error occurs.
public void endDocument(PDDocument document) throws java.io.IOException
endDocument
in class PDFTextStripper
document
- The PDF document that is being processed.
java.io.IOException
- If an IO error occurs.
protected java.lang.String getTitle()
protected void startArticle(boolean isLTR) throws java.io.IOException
startArticle
in class PDFTextStripper
isLTR
- true if direction of text is left to right
java.io.IOException
- If there is an error writing to the stream.
protected void endArticle() throws java.io.IOException
endArticle
in class PDFTextStripper
java.io.IOException
- If there is an error writing to the stream.
protected void writeString(java.lang.String text, java.util.List<TextPosition> textPositions) throws java.io.IOException
writeString
in class PDFTextStripper
text
- The text to write to the stream.
textPositions
- The corresponding text positions.
java.io.IOException
- If there is an error writing to the stream.
protected void writeString(java.lang.String chars) throws java.io.IOException
writeString
in class PDFTextStripper
chars
- String to be written to the stream.
java.io.IOException
- If there is an error writing to the stream.
protected void writeParagraphEnd() throws java.io.IOException
writeParagraphEnd
in class PDFTextStripper
java.io.IOException
- If something went wrong.
private static java.lang.String escape(java.lang.String chars)
chars
- String to be escaped.
private static void appendEscaped(java.lang.StringBuilder builder, char character)