Extended hocr #283

Open
wants to merge 11 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ build/
*.so
.~*.vue
doc/.ipynb_checkpoints/
.idea/
24 changes: 23 additions & 1 deletion ocrolib/lstm.py
@@ -37,6 +37,7 @@
from ocrolib.edist import levenshtein
import utils
import unicodedata
import sys
from scipy.ndimage import measurements,filters

initial_range = 0.1
@@ -743,14 +744,35 @@ def translate_back(outputs,threshold=0.7,pos=0):
"""Translate back. Thresholds on class 0, then assigns the maximum class to
each region. ``pos`` determines the depth of character information returned:
* `pos=0`: Return list of recognized characters
* `pos=1`: Return list of position-character tuples
* `pos=1`: Return list of position-character tuples (maxima only)
* `pos=2`: Return list of character-probability tuples
* `pos=3`: Return list of (maximum position, start, end, class) tuples for each region
"""
labels,n = measurements.label(outputs[:,0]<threshold)
mask = tile(labels.reshape(-1,1),(1,outputs.shape[1]))
maxima = measurements.maximum_position(outputs,mask,arange(1,amax(mask)+1))
if pos==1: return maxima # include character position
if pos==2: return [(c, outputs[r,c]) for (r,c) in maxima] # include character probabilities
if pos==3:
p = -1
x0 = None
x = []
for idx, val in enumerate(labels):
if val != 0 and x0 is None:
x0 = idx
p += 1
if val == 0 and x0 is not None:
#obsolete? This skips N/A strings
#if maxima[p][1] == 0:
#ignore '' spaces
# x0 = None
#else:
x.append((maxima[p][0], x0, idx, maxima[p][1]))
x0 = None
# append the last non-zero region to the list if no zero region occurs after it
if x0 is not None:
x.append((maxima[p][0], x0, len(outputs), maxima[p][1]))
return x
return [c for (r,c) in maxima] # only recognized characters

def log_mul(x,y):
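A minimal usage sketch for the new pos=3 mode (not part of the diff): `outputs` is assumed to be the T x nclasses posterior array from a forward pass, and `codec` is a hypothetical mapping from class index to character.

from ocrolib import lstm

# each returned tuple is (frame of the per-class maximum, start frame, end frame, class index)
for pos_max, start, end, cls in lstm.translate_back(outputs, pos=3):
    conf = outputs[pos_max, cls]  # posterior at the maximum, usable as a character confidence
    print("%s: frames %d-%d, conf %.3f" % (codec[cls], start, end, conf))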
33 changes: 33 additions & 0 deletions ocropus-gpageseg
@@ -20,6 +20,9 @@ import os.path
import sys
import traceback
from multiprocessing import Pool
import codecs
import json
from collections import OrderedDict as OD

import numpy as np
from scipy.ndimage import measurements
@@ -90,6 +93,9 @@ group_output.add_argument('-p','--pad',type=int,default=3,
help='padding for extracted lines, default: %(default)s')
group_output.add_argument('-e','--expand',type=int,default=3,
help='expand mask for grayscale extraction, default: %(default)s')
group_output.add_argument('-j','--json',action='store_true',
help='store information about parameters in a *.ocropy.json file \
(this option can be used to create an extended hocr output)')

# other parameters
group_others = parser.add_argument_group('others')
@@ -442,9 +448,36 @@ def process1(job):
lines = [lines[i] for i in lsort]
ocrolib.write_page_segmentation("%s.pseg.png"%outputdir,segmentation)
cleaned = ocrolib.remove_noise(binary,args.noise)

# create ordered-dictobj for json output
if args.json:
func = "gpageseg"
jsondata = OD()
jsondata["global"] = OD()
jsondata["pad"] = OD()
jsondata["pad"][func] = args.pad
jsondata["scale"] = OD()
jsondata["scale"][func] = scale
jsondata["bbox"] = OD()

for i,l in enumerate(lines):
binline = psegutils.extract_masked(1-cleaned,l,pad=args.pad,expand=args.expand)
ocrolib.write_image_binary("%s/01%04x.bin.png"%(outputdir,i+1),binline)

if args.json:
# write dictobj to json file
with codecs.open("%s/01%04x.ocropy.json" % (outputdir,i+1), "w",encoding="utf-8") as jsonfile:
y0, x0, y1, x1 = [int(x) for x in [l.bounds[0].start, l.bounds[1].start, l.bounds[0].stop, l.bounds[1].stop]]
jsondata["global"]["fpath"] = fname
jsondata["global"]["id"] = "01%04x" % (i + 1)
jsondata["bbox"]["line"] = OD()
jsondata["bbox"]["line"]["chars"] = ""
jsondata["bbox"]["line"]["x0"] = x0
jsondata["bbox"]["line"]["x1"] = x1
jsondata["bbox"]["line"]["y0"] = y0
jsondata["bbox"]["line"]["y1"] = y1
json.dump(jsondata, jsonfile,indent=4,ensure_ascii=False)

if args.gray:
grayline = psegutils.extract_masked(gray,l,pad=args.pad,expand=args.expand)
ocrolib.write_image_gray("%s/01%04x.nrm.png"%(outputdir,i+1),grayline)
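For illustration only (not part of the diff, values made up), the per-line file written above by 'ocropus-gpageseg -j' would look roughly like this; the per-character "chars" section and a "scale"/"rpred" entry are expected to be added later by ocropus-rpred and read back by ocropus-hocr below:

{
    "global": {"fpath": "book/0001.bin.png", "id": "010001"},
    "pad": {"gpageseg": 3},
    "scale": {"gpageseg": 42.0},
    "bbox": {
        "line": {"chars": "", "x0": 375, "x1": 2304, "y0": 173, "y1": 247}
    }
}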
128 changes: 119 additions & 9 deletions ocropus-hocr
@@ -3,11 +3,14 @@
import __builtin__ as python
import random as pyrandom
import sys
import os.path
import re
import glob
import argparse
import codecs
import os
from collections import OrderedDict as OD
import json
from copy import deepcopy

import numpy as np
from matplotlib.pyplot import imread
@@ -30,6 +33,8 @@ For each page like 'book/0001.bin.png', it uses the following files:
""")
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-n","--normal",action="store_true",help="surpress the output of extended hocr version (only available if '-j/--json' was set to gpageseg)")
parser.add_argument("-c","--charconfs",action="store_true",help="output confidences of every char, only in combination with extened hocr-file.")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument('files',nargs='+')
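As a usage sketch (not part of the diff), the intended pipeline would be roughly the following; '-j' and '-c' are the new flags from this PR, and '--clocs' on ocropus-rpred is the option referenced in the error message further down:

ocropus-gpageseg -j 'book/????.bin.png'           # also writes one *.ocropy.json per extracted line
ocropus-rpred --clocs 'book/????/??????.bin.png'  # adds per-character positions and probabilities
ocropus-hocr -c 'book/????.bin.png' -o book.html  # emits ocrx_word spans with x_wconf and x_confs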
@@ -145,14 +150,119 @@ for arg in args.files:
if os.path.exists(lbase+".baseline"):
info += "; baseline "+ocrolib.read_text(lbase+".baseline")

# put it all together into a SPAN

PN("<span")
if style!="": PN(" style='"+style+"'")
PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
if not args.nobreaks: P("<br />")
else: P()

# extend the hocr files with information from cloc and prob files
if not args.normal:
if not os.path.exists(lbase + ".ocropy.json"):
print("There is no %s.ocropy.json file." %(lbase))
args.normal = True
else:
with open(lbase + ".ocropy.json", "r") as jsonfile:
jsondata = json.load(jsonfile, object_pairs_hook=OD)

if "chars" not in jsondata.keys() or "bbox" not in jsondata.keys():
continue

# put all information about the ocr_line together into a SPAN

PN(" <span")
if style != "": PN(" style='" + style + "'")
PN(" class='ocr_line' title='%s'>" % info, )
P("<br />")

pad = jsondata["pad"]["gpageseg"]
scale = jsondata["scale"]["rpred"]
jsondata["bbox"]["word"]= OD()
wcount= 0
# convert string key values to int

for key in jsondata["chars"].keys():
if jsondata["chars"][key]["char"] in [">","<","&"]:
jsondata["chars"][key]["char"] = ["&lt;","&gt;","&amp;"][[">","<","&"].index(jsondata["chars"][key]["char"])]
jsondata["chars"][int(key)] = deepcopy(jsondata["chars"][key])
del jsondata["chars"][key]
jsondata["chars"][len(jsondata["chars"])] = OD()
jsondata["chars"][len(jsondata["chars"])-1]["char"]=" "
jsondata["bbox"]["word"] = OD()


# idx0 = start index of the recognized word, idx1 = stop index (the whitespace after it)
idx0 = 0
chars = ""
charconfs = ""
wconf = 1.0

# set line parameters
wy0 = jsondata["bbox"]["line"]["y0"]
wy1 = jsondata["bbox"]["line"]["y1"]
if "x0" in jsondata["chars"][0]:
for idx1 in jsondata["chars"]:
if jsondata["chars"][idx1]["char"] == " ":
if jsondata["bbox"]["line"]["chars"] != "":
# set line parameters
wx0= jsondata["bbox"]["line"]["x0"]
if idx0 != 0:
#Take the end value of the previous whitespace
# scale factor correct?
wx0 = jsondata["bbox"]["line"]["x0"] + jsondata["chars"][idx0]["x1"]
if idx1 == len(jsondata["chars"])-1:
wx1 = jsondata["bbox"]["line"]["x1"]
else:
# Take the maxima value of the current whitespace
wx1= jsondata["bbox"]["line"]["x0"] + jsondata["chars"][idx1]["xmax"]

# Store the char and bbox values in dict
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"])] = OD()
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["chars"] = chars
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["x0"]= wx0
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["x1"]= wx1
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["y0"]= wy0
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["y1"]= wy1

# Stores all information in one string
info = "bbox " + ' '.join(map(str, [wx0,wy0,wx1,wy1]))
if wconf == 1: wconf = 0
info += "; x_wconf "+str(wconf*100)
if args.charconfs:
if charconfs == "": charconfs = " 0"
info += "; x_confs"+charconfs

# reset conf values
charconfs = ""
wconf = 1

# put all information about the ocr_word together into a SPAN
else:
info = "bbox " + ' '.join(map(str, [jsondata["bbox"]["line"]["x0"], wy0, jsondata["bbox"]["line"]["x1"], wy1]))
info += "; x_wconf 0; x_confs 0"

PN(" <span")
if style != "": PN(" style='" + style + "'")
PN(" class='ocrx_word' title='%s'>" % info, chars, "</span>\n")
idx0 = idx1
chars = ""

elif jsondata["chars"][idx1]["char"] != "":
# at the moment, unknown chars are skipped (they could perhaps be used for finer cuts?)
chars += jsondata["chars"][idx1]["char"]
wconf = wconf * jsondata["chars"][idx1]["prob"]
charconfs += " " + str(jsondata["chars"][idx1]["prob"]*100)

PN(" </span>\n")
else:
print("There are no bbox information in %s.ocropy.json file, please set '--clocs'-option on rpred." % (lbase))
args.normal = True
with codecs.open(lbase + ".ocropy.json", "w", encoding="utf-8") as jsonfile:
del jsondata["chars"][len(jsondata["chars"]) - 1]
json.dump(jsondata, jsonfile, indent=4, ensure_ascii=False)
if args.normal:

# put it all together into a SPAN (non-extended hocr output)

PN("<span")
if style!="": PN(" style='"+style+"'")
PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
if not args.nobreaks: P("<br />")
else: P()
finally:
P("</div>")

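For illustration (values made up), an extended line produced by the code above would contain word spans along these lines:

<span class='ocr_line' title='bbox 375 173 2304 247'><br />
 <span class='ocrx_word' title='bbox 375 173 512 247; x_wconf 87.33; x_confs 99.12 95.48 92.70'>Die</span>
 <span class='ocrx_word' title='bbox 530 173 790 247; x_wconf 81.05; x_confs 98.01 96.33 94.87 90.12'>alte</span>
 </span>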