Extended hocr #283

Open
wants to merge 11 commits into master
1 change: 1 addition & 0 deletions .gitignore
@@ -14,3 +14,4 @@ build/
*.so
.~*.vue
doc/.ipynb_checkpoints/
.idea/
24 changes: 23 additions & 1 deletion ocrolib/lstm.py
@@ -37,6 +37,7 @@
from ocrolib.edist import levenshtein
import utils
import unicodedata
import sys
from scipy.ndimage import measurements,filters

initial_range = 0.1
@@ -743,14 +744,35 @@ def translate_back(outputs,threshold=0.7,pos=0):
"""Translate back. Thresholds on class 0, then assigns the maximum class to
each region. ``pos`` determines the depth of character information returned:
* `pos=0`: Return list of recognized characters
* `pos=1`: Return list of position-character tuples
* `pos=1`: Return list of position-character tuples (maxima only)
* `pos=2`: Return list of character-probability tuples
* `pos=3`: Return list of (maximum position, start, end, class) tuples for each region
"""
labels,n = measurements.label(outputs[:,0]<threshold)
mask = tile(labels.reshape(-1,1),(1,outputs.shape[1]))
maxima = measurements.maximum_position(outputs,mask,arange(1,amax(mask)+1))
if pos==1: return maxima # include character position
if pos==2: return [(c, outputs[r,c]) for (r,c) in maxima] # include character probabilities
if pos==3:
p = -1
x0 = None
x = []
for idx, val in enumerate(labels):
if val != 0 and x0 is None:
x0 = idx
p += 1
if val == 0 and x0 is not None:
#obsolete? This skips N/A strings
#if maxima[p][1] == 0:
#ignore '' spaces
# x0 = None
#else:
x.append((maxima[p][0], x0, idx, maxima[p][1]))
x0 = None
# append the last non-zero region to the list if no zero region occurs after it
if x0 is not None:
x.append((maxima[p][0], x0, len(outputs), maxima[p][1]))
return x
return [c for (r,c) in maxima] # only recognized characters

def log_mul(x,y):
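A minimal usage sketch for the new pos=3 mode (not part of the diff): `outputs` is assumed to be the T x nclasses posterior array from a forward pass, and `codec` is a hypothetical mapping from class index to character.

from ocrolib import lstm

# each returned tuple is (frame of the per-class maximum, start frame, end frame, class index)
for pos_max, start, end, cls in lstm.translate_back(outputs, pos=3):
    conf = outputs[pos_max, cls]  # posterior at the maximum, usable as a character confidence
    print("%s: frames %d-%d, conf %.3f" % (codec[cls], start, end, conf))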
33 changes: 33 additions & 0 deletions ocropus-gpageseg
@@ -20,6 +20,9 @@ import os.path
import sys
import traceback
from multiprocessing import Pool
import codecs
import json
from collections import OrderedDict as OD

import numpy as np
from scipy.ndimage import measurements
@@ -90,6 +93,9 @@ group_output.add_argument('-p','--pad',type=int,default=3,
help='padding for extracted lines, default: %(default)s')
group_output.add_argument('-e','--expand',type=int,default=3,
help='expand mask for grayscale extraction, default: %(default)s')
group_output.add_argument('-j','--json',action='store_true',
help='store information about parameters in a *.ocropy.json file \
(this option can be used to create an extended hocr output)')

# other parameters
group_others = parser.add_argument_group('others')
@@ -442,9 +448,36 @@ def process1(job):
lines = [lines[i] for i in lsort]
ocrolib.write_page_segmentation("%s.pseg.png"%outputdir,segmentation)
cleaned = ocrolib.remove_noise(binary,args.noise)

# create ordered-dictobj for json output
if args.json:
func = "gpageseg"
jsondata = OD()
jsondata["global"] = OD()
jsondata["pad"] = OD()
jsondata["pad"][func] = args.pad
jsondata["scale"] = OD()
jsondata["scale"][func] = scale
jsondata["bbox"] = OD()

for i,l in enumerate(lines):
binline = psegutils.extract_masked(1-cleaned,l,pad=args.pad,expand=args.expand)
ocrolib.write_image_binary("%s/01%04x.bin.png"%(outputdir,i+1),binline)

if args.json:
# write dictobj to json file
with codecs.open("%s/01%04x.ocropy.json" % (outputdir,i+1), "w",encoding="utf-8") as jsonfile:
y0, x0, y1, x1 = [int(x) for x in [l.bounds[0].start, l.bounds[1].start, l.bounds[0].stop, l.bounds[1].stop]]
jsondata["global"]["fpath"] = fname
jsondata["global"]["id"] = "01%04x" % (i + 1)
jsondata["bbox"]["line"] = OD()
jsondata["bbox"]["line"]["chars"] = ""
jsondata["bbox"]["line"]["x0"] = x0
jsondata["bbox"]["line"]["x1"] = x1
jsondata["bbox"]["line"]["y0"] = y0
jsondata["bbox"]["line"]["y1"] = y1
json.dump(jsondata, jsonfile,indent=4,ensure_ascii=False)

if args.gray:
grayline = psegutils.extract_masked(gray,l,pad=args.pad,expand=args.expand)
ocrolib.write_image_gray("%s/01%04x.nrm.png"%(outputdir,i+1),grayline)
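For illustration only (not part of the diff, values made up), the per-line file written above by 'ocropus-gpageseg -j' would look roughly like this; the per-character "chars" section and a "scale"/"rpred" entry are expected to be added later by ocropus-rpred and read back by ocropus-hocr below:

{
    "global": {"fpath": "book/0001.bin.png", "id": "010001"},
    "pad": {"gpageseg": 3},
    "scale": {"gpageseg": 42.0},
    "bbox": {
        "line": {"chars": "", "x0": 375, "x1": 2304, "y0": 173, "y1": 247}
    }
}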
128 changes: 119 additions & 9 deletions ocropus-hocr
@@ -3,11 +3,14 @@
import __builtin__ as python
import random as pyrandom
import sys
import os.path
import re
import glob
import argparse
import codecs
import os
from collections import OrderedDict as OD
import json
from copy import deepcopy

import numpy as np
from matplotlib.pyplot import imread
@@ -30,6 +33,8 @@ For each page like 'book/0001.bin.png', it uses the following files:
""")
parser.add_argument("-b","--nobreaks",action="store_true",help="don't output line breaks")
parser.add_argument("-p","--nopars",action="store_true",help="don't output paragraphs")
parser.add_argument("-n","--normal",action="store_true",help="surpress the output of extended hocr version (only available if '-j/--json' was set to gpageseg)")
parser.add_argument("-c","--charconfs",action="store_true",help="output confidences of every char, only in combination with extened hocr-file.")
parser.add_argument("-s","--fscale",type=float,default=1.0,help="scale factor for translating xheights into font size (use 0 to disable), default: %(default)s")
parser.add_argument("-o","--output",default="book.html",help="output file, default: %(default)s")
parser.add_argument('files',nargs='+')
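As a usage sketch (not part of the diff), the intended pipeline would be roughly the following; '-j' and '-c' are the new flags from this PR, and '--clocs' on ocropus-rpred is the option referenced in the error message further down:

ocropus-gpageseg -j 'book/????.bin.png'           # also writes one *.ocropy.json per extracted line
ocropus-rpred --clocs 'book/????/??????.bin.png'  # adds per-character positions and probabilities
ocropus-hocr -c 'book/????.bin.png' -o book.html  # emits ocrx_word spans with x_wconf and x_confs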
@@ -145,14 +150,119 @@ for arg in args.files:
if os.path.exists(lbase+".baseline"):
info += "; baseline "+ocrolib.read_text(lbase+".baseline")

# put it all together into a SPAN

PN("<span")
if style!="": PN(" style='"+style+"'")
PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
if not args.nobreaks: P("<br />")
else: P()

# extend the hocr files with information from cloc and prob files
if not args.normal:
if not os.path.exists(lbase + ".ocropy.json"):
print("There is no %s.ocropy.json file." %(lbase))
args.normal = True
else:
with open(lbase + ".ocropy.json", "r") as jsonfile:
jsondata = json.load(jsonfile, object_pairs_hook=OD)

if "chars" not in jsondata.keys() or "bbox" not in jsondata.keys():
continue

# put all information about the ocr_line together into a SPAN

PN(" <span")
if style != "": PN(" style='" + style + "'")
PN(" class='ocr_line' title='%s'>" % info, )
P("<br />")

pad = jsondata["pad"]["gpageseg"]
scale = jsondata["scale"]["rpred"]
jsondata["bbox"]["word"]= OD()
wcount= 0
# convert string key values to int

for key in jsondata["chars"].keys():
if jsondata["chars"][key]["char"] in [">","<","&"]:
jsondata["chars"][key]["char"] = ["&lt;","&gt;","&amp;"][[">","<","&"].index(jsondata["chars"][key]["char"])]
jsondata["chars"][int(key)] = deepcopy(jsondata["chars"][key])
del jsondata["chars"][key]
jsondata["chars"][len(jsondata["chars"])] = OD()
jsondata["chars"][len(jsondata["chars"])-1]["char"]=" "
jsondata["bbox"]["word"] = OD()


# idx0 = start index of the recognized word, idx1 = stop index (the whitespace after it)
idx0 = 0
chars = ""
charconfs = ""
wconf = 1.0

# set line parameters
wy0 = jsondata["bbox"]["line"]["y0"]
wy1 = jsondata["bbox"]["line"]["y1"]
if "x0" in jsondata["chars"][0]:
for idx1 in jsondata["chars"]:
if jsondata["chars"][idx1]["char"] == " ":
if jsondata["bbox"]["line"]["chars"] != "":
# set line parameters
wx0= jsondata["bbox"]["line"]["x0"]
if idx0 != 0:
#Take the end value of the previous whitespace
# scale factor correct?
wx0 = jsondata["bbox"]["line"]["x0"] + jsondata["chars"][idx0]["x1"]
if idx1 == len(jsondata["chars"])-1:
wx1 = jsondata["bbox"]["line"]["x1"]
else:
# Take the maxima value of the current whitespace
wx1= jsondata["bbox"]["line"]["x0"] + jsondata["chars"][idx1]["xmax"]

# Store the char and bbox values in dict
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"])] = OD()
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["chars"] = chars
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["x0"]= wx0
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["x1"]= wx1
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["y0"]= wy0
jsondata["bbox"]["word"][len(jsondata["bbox"]["word"]) - 1]["y1"]= wy1

# Stores all information in one string
info = "bbox " + ' '.join(map(str, [wx0,wy0,wx1,wy1]))
if wconf == 1: wconf = 0
info += "; x_wconf "+str(wconf*100)
if args.charconfs:
if charconfs == "": charconfs = " 0"
info += "; x_confs"+charconfs

# reset conf values
charconfs = ""
wconf = 1

# put all information about the ocr_word together into a SPAN
else:
info = "bbox " + ' '.join(map(str, [jsondata["bbox"]["line"]["x0"], wy0, jsondata["bbox"]["line"]["x1"], wy1]))
info += "; x_wconf 0; x_confs 0"

PN(" <span")
if style != "": PN(" style='" + style + "'")
PN(" class='ocrx_word' title='%s'>" % info, chars, "</span>\n")
idx0 = idx1
chars = ""

elif jsondata["chars"][idx1]["char"] != "":
# at the moment, unknown chars are skipped (they could perhaps be used for finer cuts?)
chars += jsondata["chars"][idx1]["char"]
wconf = wconf * jsondata["chars"][idx1]["prob"]
charconfs += " " + str(jsondata["chars"][idx1]["prob"]*100)

PN(" </span>\n")
else:
print("There are no bbox information in %s.ocropy.json file, please set '--clocs'-option on rpred." % (lbase))
args.normal = True
with codecs.open(lbase + ".ocropy.json", "w", encoding="utf-8") as jsonfile:
del jsondata["chars"][len(jsondata["chars"]) - 1]
json.dump(jsondata, jsonfile, indent=4, ensure_ascii=False)
if args.normal:

# put it all together into a SPAN (non-extended hocr output)

PN("<span")
if style!="": PN(" style='"+style+"'")
PN(" class='ocr_line' title='%s'>"%info,text,"</span>")
if not args.nobreaks: P("<br />")
else: P()
finally:
P("</div>")

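For illustration (values made up), an extended line produced by the code above would contain word spans along these lines:

<span class='ocr_line' title='bbox 375 173 2304 247'><br />
 <span class='ocrx_word' title='bbox 375 173 512 247; x_wconf 87.33; x_confs 99.12 95.48 92.70'>Die</span>
 <span class='ocrx_word' title='bbox 530 173 790 247; x_wconf 81.05; x_confs 98.01 96.33 94.87 90.12'>alte</span>
 </span>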