-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr_word_parser.py
63 lines (52 loc) · 1.94 KB
/
ocr_word_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/python
import pytesseract
import collections
import pandas as pd
from PIL import Image
import os
import matplotlib.pyplot as plt
#%matplotlib inline
# --- Input images ----------------------------------------------------------
# Every entry of the image directory is turned into a full path.
img_dir = 'G:\\PyProjects\\imgs\\'
imgs = [os.path.join(img_dir, entry) for entry in os.listdir(img_dir)]

# --- Tesseract installation ------------------------------------------------
# pytesseract is pointed at an explicit executable and tessdata directory.
tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
tessdata_dir = r'C:/Program Files (x86)/Tesseract-OCR/tessdata'
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
tesseract_dir_config = '--tessdata-dir "{}"'.format(tessdata_dir)

# --- Word counting state ---------------------------------------------------
# Path of the stopword list (read one entry per line by processWords).
commonwords = r'G:\PyProjects\commonwords.txt'
# Running word -> occurrence tally, shared by every processWords() call.
wordcount = {}
# Characters removed from a token wherever they appear.
_DROP_CHARS = str.maketrans("", "", ',"*')


def _normalize_word(token):
    """Return *token* lower-cased with ignorable punctuation removed."""
    # Sentence punctuation is trimmed from the end only, so tokens such as
    # "3.5" keep their inner characters.
    return token.lower().translate(_DROP_CHARS).rstrip(".:!")


def processWords(imgfile):
    """Count the non-stopword words OCR'd from *imgfile* and chart the tally.

    The recognised text is split into tokens, each token is normalised, and
    every token that is not a stopword is added to the module-level
    ``wordcount`` dict, so counts accumulate across calls. The tally is then
    printed in descending order of frequency and drawn as a pandas bar chart.

    Args:
        imgfile: Path of the image to run through Tesseract.

    Returns:
        None.
    """
    # Close the image handle as soon as Tesseract has consumed it.
    with Image.open(imgfile) as image:
        text = pytesseract.image_to_string(
            image, lang='eng', config=tesseract_dir_config)

    # split() with no argument breaks on any run of whitespace, so words on
    # different OCR lines are not fused together by the newline between them.
    words = text.split()

    # One stopword per line; the file is closed once the set is built.
    with open(commonwords) as stopword_file:
        stopwords = set(line.strip() for line in stopword_file)

    for token in words:
        word = _normalize_word(token)
        # Tokens made only of punctuation normalise to "" and are skipped.
        if not word or word in stopwords:
            continue
        wordcount[word] = wordcount.get(word, 0) + 1

    word_counter = collections.Counter(wordcount)
    # The report is capped at the number of tokens read from this image.
    lst = word_counter.most_common(len(words))
    for word, count in lst:
        print(word, ": ", count)

    if not lst:
        # Nothing was counted; pandas cannot plot a frame without numeric data.
        return

    # Data frame of the most common words, drawn as a bar chart.
    df = pd.DataFrame(lst, columns=['Word', 'Count'])
    df.plot.bar(x='Word', y='Count')
# Run the OCR word count over every image path collected in imgs.
for image_path in imgs:
    processWords(image_path)