8000 Convert pdf preprocessor by xf0e · Pull Request #108 · tleyden/open-ocr · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Convert pdf preprocessor #108

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions convert-pdf.go
10000
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
package ocrworker

/* Use this module if you want to call tesseract over
pdfsandwich with an image as input file.
Useful with big documents.

Use cases:
engine: tesseract with file_type: pdf and preprocessor: convert-pdf
engine: sandwich with file_type: [tif, png, jpg] and preprocessor: convert-pdf
*/

import (
"fmt"
"github.com/couchbaselabs/logg"
"io/ioutil"
"os"
"os/exec"
)

// ConvertPdf is a stateless preprocessor; its preprocess method converts a
// PDF payload into a TIFF image.
type ConvertPdf struct{}

// preprocess converts the PDF bytes held in ocrRequest.ImgBytes into a TIFF
// image by invoking Ghostscript ("gs" with the tiffg4 output device) and
// replaces ocrRequest.ImgBytes with the bytes of the generated file.
//
// Both the input PDF and the output TIFF live in temporary files that are
// removed when the function returns.
//
// An error is returned when a temporary file name cannot be created, the
// input cannot be written, gs exits with a failure, or the output file cannot
// be read. On error ocrRequest.ImgBytes is left untouched.
func (c ConvertPdf) preprocess(ocrRequest *OcrRequest) error {

	// Check the error before deriving the final name from the result.
	inputBase, err := createTempFileName()
	if err != nil {
		return err
	}
	tmpFileNameInput := inputBase + ".pdf"
	defer os.Remove(tmpFileNameInput)

	outputBase, err := createTempFileName()
	if err != nil {
		return err
	}
	tmpFileNameOutput := outputBase + ".tif"
	defer os.Remove(tmpFileNameOutput)

	if err := saveBytesToFileName(ocrRequest.ImgBytes, tmpFileNameInput); err != nil {
		return err
	}

	logg.LogTo(
		"PREPROCESSOR_WORKER",
		"Convert PDF %s -> %s",
		tmpFileNameInput,
		tmpFileNameOutput,
	)

	gsArgs := []string{
		"-dQUIET",
		"-dNOPAUSE",
		"-dBATCH",
		"-sOutputFile=" + tmpFileNameOutput,
		"-sDEVICE=tiffg4",
		tmpFileNameInput,
	}
	logg.LogTo("PREPROCESSOR_WORKER", "gs args: %v", gsArgs)

	out, err := exec.Command("gs", gsArgs...).CombinedOutput()
	if err != nil {
		// Report the failure to the caller instead of terminating the whole
		// worker: a single document that gs cannot convert should only fail
		// this request.
		return fmt.Errorf("running gs on %s: %v (output: %s)", tmpFileNameInput, err, out)
	}
	logg.LogTo("PREPROCESSOR_WORKER", "output: %v", string(out))

	// Read the converted image back and hand it to the next stage.
	resultBytes, err := ioutil.ReadFile(tmpFileNameOutput)
	if err != nil {
		return err
	}
	ocrRequest.ImgBytes = resultBytes

	return nil
}
5 changes: 3 additions & 2 deletions preprocessor.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package ocrworker

const PREPROCESSOR_IDENTITY = "identity"
const PREPROCESSOR_STROKE_WIDTH_TRANSFORM = "stroke-width-transform"
const PreprocessorIdentity = "identity"
const PreprocessorStrokeWidthTransform = "stroke-width-transform"
const PreprocessorConvertPdf = "convert-pdf"

type Preprocessor interface {
preprocess(ocrRequest *OcrRequest) error
Expand Down
25 changes: 13 additions & 12 deletions preprocessor_rpc_worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,13 @@ const preprocessor_tag = "preprocessor" // TODO: should be unique for each worke
func NewPreprocessorRpcWorker(rc RabbitConfig, preprocessor string) (*PreprocessorRpcWorker, error) {

preprocessorMap := make(map[string]Preprocessor)
preprocessorMap[PREPROCESSOR_STROKE_WIDTH_TRANSFORM] = StrokeWidthTransformer{}
preprocessorMap[PREPROCESSOR_IDENTITY] = IdentityPreprocessor{}
preprocessorMap[PreprocessorStrokeWidthTransform] = StrokeWidthTransformer{}
preprocessorMap[PreprocessorIdentity] = IdentityPreprocessor{}
preprocessorMap[PreprocessorConvertPdf] = ConvertPdf{}

_, ok := preprocessorMap[preprocessor]
if !ok {
return nil, fmt.Errorf("No preprocessor found for: %q", preprocessor)
return nil, fmt.Errorf("no preprocessor found for: %q", preprocessor)
}

preprocessorRpcWorker := &PreprocessorRpcWorker{
Expand Down Expand Up @@ -71,11 +72,11 @@ func (w PreprocessorRpcWorker) Run() error {
if err = w.channel.ExchangeDeclare(
w.rabbitConfig.Exchange, // name of the exchange
w.rabbitConfig.ExchangeType, // type
true, // durable
false, // delete when complete
false, // internal
false, // noWait
nil, // arguments
true, // durable
false, // delete when complete
false, // internal
false, // noWait
nil, // arguments
); err != nil {
return err
}
Expand All @@ -100,8 +101,8 @@ func (w PreprocessorRpcWorker) Run() error {
queue.Name, // name of the queue
w.bindingKey, // bindingKey
w.rabbitConfig.Exchange, // sourceExchange
false, // noWait
nil, // arguments
false, // noWait
nil, // arguments
); err != nil {
return err
}
Expand All @@ -128,7 +129,7 @@ func (w PreprocessorRpcWorker) Run() error {
func (w *PreprocessorRpcWorker) Shutdown() error {
// will close() the deliveries channel
if err := w.channel.Cancel(w.tag, true); err != nil {
return fmt.Errorf("Worker cancel failed: %s", err)
return fmt.Errorf("worker cancel failed: %s", err)
}

if err := w.conn.Close(); err != nil {
Expand Down Expand Up @@ -231,7 +232,7 @@ func (w *PreprocessorRpcWorker) handleDelivery(d amqp.Delivery) error {
ocrRequest := OcrRequest{}
err := json.Unmarshal(d.Body, &ocrRequest)
if err != nil {
msg := "Error unmarshaling json: %v. Error: %v"
msg := "Error unmarshalling json: %v. Error: %v"
errMsg := fmt.Sprintf(msg, string(d.Body), err)
logg.LogError(fmt.Errorf(errMsg))
return err
Expand Down
2 changes: 1 addition & 1 deletion stroke_width_transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (s StrokeWidthTransformer) extractDarkOnLightParam(ocrRequest OcrRequest) s
val := "1"

preprocessorArgs := ocrRequest.PreprocessorArgs
swtArgs := preprocessorArgs[PREPROCESSOR_STROKE_WIDTH_TRANSFORM]
swtArgs := preprocessorArgs[PreprocessorStrokeWidthTransform]
if swtArgs != nil {
swtArg, ok := swtArgs.(string)
if ok && (swtArg == "0" || swtArg == "1") {
Expand Down
0