Prevent extractor panic for invalid PDF text objects by adrg · Pull Request #196 · unidoc/unipdf · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Prevent extractor panic for invalid PDF text objects #196

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 30, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions extractor/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
pageText := &PageText{}
state := newTextState()
fontStack := fontStacker{}
var to *textObject
to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &fontStack)
var inTextObj bool

cstreamParser := contentstream.NewContentStreamParser(contents)
operations, err := cstreamParser.Parse()
Expand Down Expand Up @@ -102,16 +103,31 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
state.tfont = fontStack.pop()
}
case "BT": // Begin text
// Begin a text object, initializing the text matrix, Tm, and the text line matrix,
// Tlm, to the identity matrix. Text objects shall not be nested; a second BT shall
// not appear before an ET.
if to != nil {
// Begin a text object, initializing the text matrix, Tm, and
// the text line matrix, Tlm, to the identity matrix. Text
// objects shall not be nested. A second BT shall not appear
// before an ET. However, if that happens, all existing marks
// are added to the page marks, in order to avoid losing content.
if inTextObj {
common.Log.Debug("BT called while in a text object")
pageText.marks = append(pageText.marks, to.marks...)
}
inTextObj = true
to = newTextObject(e, resources, gs, &state, &fontStack)
case "ET": // End Text
// End text object, discarding text matrix. If the current
// text object contains text marks, they are added to the
// page text marks collection.
// The ET operator should always have a matching BT operator.
// However, if ET appears outside of a text object, the behavior
// does not change: the text matrices are discarded and all
// existing marks in the text object are added to the page marks.
if !inTextObj {
common.Log.Debug("ET called outside of a text object")
}
inTextObj = false
pageText.marks = append(pageText.marks, to.marks...)
to = nil
to.reset()
case "T*": // Move to start of next text line
to.nextLine()
case "Td": // Move text location
Expand Down Expand Up @@ -202,10 +218,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
}
to.setCharSpacing(y)
case "Tf": // Set font.
if to == nil {
// This is needed for 26-Hazard-Thermal-environment.pdf
to = newTextObject(e, resources, gs, &state, &fontStack)
}
if ok, err := to.checkOp(op, 2, true); !ok {
common.Log.Debug("ERROR: Tf err=%v", err)
return err
Expand Down Expand Up @@ -659,6 +671,14 @@ func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentst
}
}

// reset clears the marks collection of the text object and sets both its
// text matrix `Tm` and its text line matrix `Tlm` to the identity matrix.
func (to *textObject) reset() {
	to.marks = nil
	to.tm, to.tlm = transform.IdentityMatrix(), transform.IdentityMatrix()
}

// renderText processes and renders byte array `data` for extraction purposes.
func (to *textObject) renderText(data []byte) error {
font := to.getCurrentFont()
Expand Down Expand Up @@ -1205,7 +1225,7 @@ func (pt *PageText) sortPosition(tol float64) {
if pt.marks[i-1].orient != pt.marks[i].orient {
cluster++
} else {
if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
if pt.marks[i-1].orientedStart.Y-pt.marks[i].orientedStart.Y > tol {
cluster++
}
}
Expand Down
0