-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathpdf2txt.py
50 lines (38 loc) · 1.63 KB
/
pdf2txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Install Tesseract and add it to the PATH: it is an open-source OCR project by Google.
# Install the Persian language data for Tesseract.
# import the following libraries
# will convert the image to text string
import pytesseract
# adds image processing capabilities
from PIL import Image
#split pdf
from PyPDF2 import PdfFileWriter, PdfFileReader
#convert pdf to image
from pdf2image import convert_from_path
#############################################################
## Split PDF -> render each page to an image -> OCR -> text file
# Add your PDF to the project folder and change "biganeh.pdf" to your PDF
# name (or pass pdf_path=... to pdf_to_text).
def pdf_to_text(
    pdf_path="biganeh.pdf",
    result_path="resultFile.txt",
    pdf_dir="./pdfs",
    image_dir="./images",
    lang="fas",
    tesseract_cmd="C:/Program Files/Tesseract-OCR/tesseract.exe",
):
    """OCR every page of a PDF and write the recognized text to a file.

    For each page the function writes a single-page PDF into ``pdf_dir``
    (``document-page<i>.pdf``), renders it to ``image_dir`` as
    ``page<i>.jpg``, runs Tesseract on that JPEG and appends five newlines
    followed by the recognized text to ``result_path`` (UTF-8).

    Args:
        pdf_path: Path of the PDF to process.
        result_path: Path of the UTF-8 text file to (over)write.
        pdf_dir: Directory for the per-page PDFs; created if missing.
        image_dir: Directory for the per-page JPEGs; created if missing.
        lang: Tesseract language code passed to ``image_to_string``.
        tesseract_cmd: Explicit path to the Tesseract executable. It is only
            applied when the file exists; otherwise pytesseract keeps its
            own default and looks Tesseract up on the PATH.

    Returns:
        The number of pages processed.
    """
    import os

    # The output folders are not part of the repository checkout by default;
    # create them instead of failing on the first open()/save().
    os.makedirs(pdf_dir, exist_ok=True)
    os.makedirs(image_dir, exist_ok=True)

    # Loop-invariant configuration: set it once, and do not override the
    # PATH lookup with a location that does not exist on this machine.
    if tesseract_cmd and os.path.isfile(tesseract_cmd):
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    with open(pdf_path, "rb") as pdf_file:
        inputpdf = PdfFileReader(pdf_file)
        page_count = inputpdf.numPages

        # newline="" disables newline translation, so the bytes on disk are
        # exactly the UTF-8 encoding of the text (as with a binary write).
        with open(result_path, "w", encoding="utf-8", newline="") as result_file:
            for i in range(page_count):
                # Make a single-page PDF.
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                page_pdf = os.path.join(pdf_dir, "document-page%s.pdf" % i)
                with open(page_pdf, "wb") as output_stream:
                    output.write(output_stream)

                # Render that page to a JPEG image.
                page_jpg = os.path.join(image_dir, "page" + str(i) + ".jpg")
                images = convert_from_path(page_pdf)
                images[0].save(page_jpg, "JPEG")

                # OCR the saved JPEG (not the in-memory render) so the
                # recognizer sees the same pixels that are kept on disk.
                with Image.open(page_jpg) as img:
                    page_text = pytesseract.image_to_string(img, lang=lang)

                # Stream each page out immediately: no quadratic string
                # building, and finished pages survive a later failure.
                result_file.write("\n\n\n\n\n" + page_text)
                result_file.flush()

    return page_count


if __name__ == "__main__":
    pdf_to_text()