-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf_ocr.py
85 lines (67 loc) · 2.1 KB
/
pdf_ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
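# Extract the text of PDF files, running OCR on PDFs that contain only page
# images, and save the results to a CSV file.
# Dependencies: pdfminer, wand, PIL (Pillow) and pyocr with the tesseract backend.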
import re
import csv
import urllib2
from cStringIO import StringIO
try:
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
except:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from wand.image import Image
from PIL import Image as PI
import pyocr
from pyocr import tesseract as tool
import pyocr.builders
import io
def pdf_from_url_to_txt(filename):
    """Extract the embedded text of a PDF file with pdfminer.

    Args:
        filename: Path of the PDF file to read; it is opened in binary mode.

    Returns:
        Whatever text pdfminer's TextConverter (configured with the utf-8
        codec) wrote to the output buffer. The result is empty when nothing
        could be extracted, which the caller uses as the signal to run OCR.

    Raises:
        IOError: If ``filename`` cannot be opened.
    """
    # Older pdfminer releases expose the one-shot ``process_pdf`` helper;
    # newer ones removed it and require walking the pages explicitly.
    # Resolve it locally so the function does not depend on which branch of
    # the module-level import fallback ran.
    try:
        from pdfminer.pdfinterp import process_pdf as legacy_process_pdf
    except ImportError:
        legacy_process_pdf = None

    # Resource manager that stores resources shared between pages.
    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    # Text device: renders the laid-out page content into ``outfp``.
    device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())

    infp = open(filename, 'rb')
    try:
        try:
            if legacy_process_pdf is not None:
                legacy_process_pdf(rsrcmgr, device, infp)
            else:
                from pdfminer.pdfinterp import PDFPageInterpreter
                from pdfminer.pdfpage import PDFPage
                # Merely constructing the interpreter extracts nothing;
                # every page has to be fed through it.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(infp):
                    interpreter.process_page(page)
        except Exception:
            # Deliberate best effort: a PDF that pdfminer cannot parse is
            # reported as "no text" (or the text gathered so far) so the
            # caller can fall back to OCR instead of aborting the run.
            pass
    finally:
        # Always release the file handle, even when extraction fails.
        infp.close()
    return outfp.getvalue()
# Paths of the PDF files to process (local path names), taken from the
# command line, e.g.:  python <this script> first.pdf second.pdf
import sys

localLinks = sys.argv[1:]


def _ocr_pdf_pages(pdf_path):
    """Render each page of ``pdf_path`` to JPEG and return the OCR text per page."""
    page_texts = []
    # wand images wrap native resources; the ``with`` blocks release them as
    # soon as each page has been turned into a JPEG blob.
    with Image(filename=pdf_path, resolution=300) as image_pdf:
        with image_pdf.convert('jpeg') as image_jpeg:
            for img in image_jpeg.sequence:
                with Image(image=img) as img_page:
                    page_blob = img_page.make_blob('jpeg')
                page_texts.append(tool.image_to_string(
                    PI.open(io.BytesIO(page_blob)),
                    builder=pyocr.builders.TextBuilder()
                ))
    return page_texts


# The ``with`` block guarantees the CSV is flushed and closed, even if one of
# the documents makes the run fail part-way through.
with open('outputOCR.csv', 'w') as csv_file:
    outputOCR = csv.writer(csv_file, delimiter='`', quotechar='"',
                           lineterminator='\n')
    outputOCR.writerow(['Index', 'Text'])

    for index, link in enumerate(localLinks):
        # First try the text embedded in the PDF.
        others = str(pdf_from_url_to_txt(str(link)))
        print('_________________________')
        print('%s : %s' % (index + 1, link))
        if not others:
            # Nothing was extracted (e.g. a scanned document): OCR the
            # rendered pages instead. The row stores the string form of the
            # per-page list, as before.
            others = str(_ocr_pdf_pages(link))
        print('Others: %s' % others)
        # NOTE(review): the CSV index is 0-based while the console output is
        # 1-based - confirm this is intended.
        outputOCR.writerow([index, others])

print('********************End********************')