-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathget_document_detection.py
99 lines (78 loc) · 2.77 KB
/
get_document_detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from template.init import *
def start_job(s3_obj):
    """Start a Textract document text detection job for an S3 object.

    Args:
        s3_obj: Value passed through unchanged as
            ``DocumentLocation['S3Object']``. The script entry point in
            this file builds it as ``{"Bucket": ..., "Name": ...}``.

    Returns:
        The ``JobId`` field of the ``start_document_text_detection``
        response.
    """
    # NOTE(review): `textract` comes from `template.init`; presumably a
    # boto3 Textract client -- confirm there.
    response = textract.start_document_text_detection(
        DocumentLocation={'S3Object': s3_obj},
    )
    return response['JobId']
def is_job_complete(jobId):
    """Poll a Textract text detection job until it leaves IN_PROGRESS.

    Sleeps five seconds before every poll (including the first one) and
    prints the status seen on each poll.

    Args:
        jobId: Job identifier passed as ``JobId`` to
            ``get_document_text_detection``.

    Returns:
        The first ``JobStatus`` string that is not ``"IN_PROGRESS"``.
        Note that this is the status text, not a boolean, despite the
        function's name.
    """
    while True:
        time.sleep(5)
        result = textract.get_document_text_detection(JobId=jobId)
        job_status = result['JobStatus']
        print("Job status: {}".format(job_status))
        if job_status != "IN_PROGRESS":
            return job_status
def detect_document(jobId):
    """Wait for a Textract text detection job and return its full result.

    The job is polled via ``is_job_complete`` until it is no longer in
    progress, then the result is fetched. When the service splits the
    result across several responses (signalled by ``NextToken``), the
    ``Blocks`` of every follow-up response are appended to the first
    response so callers see the complete block list.

    The job status is not validated here: the response is returned
    whatever ``JobStatus`` it carries, so callers that care should
    inspect ``response['JobStatus']`` themselves.

    Args:
        jobId: Job identifier returned by ``start_job``.

    Returns:
        The first ``get_document_text_detection`` response dict, with
        ``Blocks`` extended by any further result pages and the
        ``NextToken`` key removed.
    """
    # The returned status is always a non-empty string, so it cannot act
    # as a success flag; the call is used purely to block until done.
    is_job_complete(jobId)

    response = textract.get_document_text_detection(JobId=jobId)

    # Follow the pagination token so results are not truncated to the
    # first response.
    next_token = response.pop('NextToken', None)
    while next_token:
        page = textract.get_document_text_detection(
            JobId=jobId,
            NextToken=next_token,
        )
        response.setdefault('Blocks', []).extend(page.get('Blocks', []))
        next_token = page.get('NextToken')
    return response
def format_input(response):
    """Extract the first-page LINE blocks of a detection response.

    Args:
        response: Mapping with a ``'Blocks'`` list. Each block is read for
            ``'BlockType'`` and, for LINE blocks, ``'Page'``,
            ``'Geometry'['BoundingBox']``, ``'Text'`` and ``'Confidence'``.

    Returns:
        A list of dicts, one per block whose type is ``'LINE'`` and whose
        page is ``1``, in input order. Each dict has the keys ``'eId'``
        (1-based running number over the kept blocks), ``'geometry'``,
        ``'text'`` and ``'Confidence'``.

    Raises:
        KeyError: If ``response`` has no ``'Blocks'`` key or a kept block
            lacks one of the fields listed above.
    """
    # The type check comes first so that non-LINE blocks are never asked
    # for their 'Page' field.
    first_page_lines = (
        block for block in response['Blocks']
        if block['BlockType'] == 'LINE' and block['Page'] == 1
    )
    return [
        {
            'eId': element_id,
            'geometry': block['Geometry']['BoundingBox'],
            'text': block['Text'],
            'Confidence': block['Confidence'],
        }
        for element_id, block in enumerate(first_page_lines, start=1)
    ]
def save_input_format(input_format, jobId, fileName):
    """Write the formatted lines to disk and record them in the mapping file.

    Two files are written relative to the current working directory:

    * ``input_format/<jobId>.json`` -- ``input_format`` serialized as JSON.
    * ``input_format_mapping.json`` -- a JSON object to which an entry
      ``get_file_name_without_extension(fileName) -> input_format_path``
      is added (or overwritten).

    A missing, empty or unparseable mapping file is treated as an empty
    mapping and is rewritten from scratch, so leftover invalid content is
    never kept in front of the new JSON.

    Args:
        input_format: JSON-serializable data to store (the script passes
            the list produced by ``format_input``).
        jobId: Used to build the output file name.
        fileName: Source file name; the mapping key is derived from it by
            ``get_file_name_without_extension`` (defined outside this
            file).

    Returns:
        The relative path of the written ``input_format`` JSON file.
    """
    mapping_path = 'input_format_mapping.json'
    input_format_path = 'input_format/{0}.json'.format(jobId)

    # Load the current mapping; anything unreadable as JSON starts fresh.
    try:
        with open(mapping_path, 'r') as mapping_file:
            content = json.load(mapping_file)
    except (FileNotFoundError, JSONDecodeError):
        content = {}
    content[get_file_name_without_extension(fileName)] = input_format_path

    # Write the data file before the mapping so the mapping never points
    # at a file that failed to be written.
    os.makedirs('input_format', exist_ok=True)
    with open(input_format_path, 'w') as outFile:
        outFile.write(json.dumps(input_format))

    # Opening with 'w' truncates, so the mapping file always ends up
    # holding exactly one JSON document.
    with open(mapping_path, 'w') as mapping_file:
        mapping_file.write(json.dumps(content))

    return input_format_path
if __name__ == '__main__':
    # Script entry point: run text detection on one S3 object and store
    # its first-page lines. Usage: <script> <bucket name> <file name>
    print("In poc_input")

    # Only argument parsing is guarded, so an IndexError raised later in
    # the pipeline is not misreported as a missing command-line argument.
    try:
        bucketName = sys.argv[1]
        fileName = sys.argv[2]
    except IndexError:
        print('Please provide S3 "Bucket Name" and "File Name" while executing program.')
        # Signal the usage error to the calling shell.
        sys.exit(1)

    s3_obj = {"Bucket": bucketName, "Name": fileName}
    jobId = start_job(s3_obj)
    response = detect_document(jobId)
    input_format = format_input(response)
    input_format_path = save_input_format(input_format, jobId, fileName)
    print('Response of Input Format Path {0}'.format(input_format_path))