-
Notifications
You must be signed in to change notification settings - Fork 6
/
analyze_ocr.py
152 lines (126 loc) · 4.21 KB
/
analyze_ocr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import sys
from iabook import *
from windowed_iterator import windowed_iterator
import find_pagenos
# import find_header_footer
import make_toc
import json
# Parsed command-line options; assigned by main() after option parsing.
opts = None
# Optional page overrides handed unchanged to make_toc.make_toc().
hardcode_toc_pages = None
# hardcode_toc_pages = range(16,18)
hardcode_nottoc_pages = None
# hardcode_nottoc_pages = [18]
# Page source switch: True -> Book.get_pages_as_djvu(),
# False -> Book.get_pages_as_abbyy().
djvu = True
# djvu = False
# When True, annotate() and analyze() call into find_pagenos.
pagenos = False
# Header/footer switch; the only code that consulted it is commented out.
hfs = False
# Prefix prepended to scandata tag names; main() replaces it with the
# value returned by Book.get_scandata_ns().
scandata_ns = ''
def main(argv):
    """Build a table of contents for one scanned book and print it.

    After option parsing, the positional arguments in ``argv`` must be
    one of:

    * ``book_id`` -- ``path`` defaults to ``book_id`` and ``doc`` to ''
    * ``item_id doc path``
    * ``item_id doc path callback`` -- the JSON output is wrapped as
      ``callback( ... )``

    With ``--human`` the 'readable', 'comments' and 'isok' entries of the
    result are printed as text; otherwise the whole result is printed as
    JSON.  Any other number of positional arguments ends the program with
    a usage error (exit status 2).

    Side effects: rebinds the module globals ``opts`` and ``scandata_ns``.
    """
    import optparse
    parser = optparse.OptionParser(usage='usage: %prog [options]',
                                   version='%prog 0.1',
                                   description='make tocs')
    parser.add_option('--human',
                      action='store_true',
                      default=False,
                      help='print some human-readable stuff')
    global opts
    opts, args = parser.parse_args(argv)

    doc = ''
    callback = None
    if len(args) == 4:
        (book_id, doc, path, callback) = args
    elif len(args) == 3:
        (book_id, doc, path) = args
    elif len(args) == 1:
        (book_id,) = args
        path = book_id
    else:
        # Report a wrong argument count as a usage error rather than
        # letting tuple unpacking fail with a bare ValueError.
        parser.error('expected book_id, or item_id doc path [callback]; '
                     'got %d positional argument(s)' % len(args))

    iabook = Book(book_id, doc, path)
    global scandata_ns
    scandata_ns = iabook.get_scandata_ns()

    if djvu:
        pages = iabook.get_pages_as_djvu()
    else:
        pages = iabook.get_pages_as_abbyy()

    # Lazy pipeline: each stage is a generator wrapping the previous one.
    pages = filter(pages)
    pages = annotate(pages)

    def clear_page(page):
        page.clear()

    # clear_page is handed to windowed_iterator as its third argument,
    # next to a window size of 5.
    windowed_pages = windowed_iterator(pages, 5, clear_page)
    pages = analyze(windowed_pages)

    toc_result = make_toc.make_toc(iabook, pages,
                                   hardcode_toc_pages, hardcode_nottoc_pages)
    toc_result['contents_leafnos'] = iabook.get_contents_indices()
    toc_result['readable'] = print_readable(toc_result['qdtoc'])

    # Single-argument print(...) emits the same text under Python 2 and
    # also parses under Python 3.
    if opts.human:
        for r in ('readable', 'comments', 'isok'):
            print(r + ':')
            print(toc_result[r])
            print('')
    elif callback is not None:
        # JSONP-style output: callback( <json> )
        print('%s(' % callback)
        print(json.dumps(toc_result))
        print(')')
    else:
        print(json.dumps(toc_result, indent=4))
    # consume(pages)
def print_readable(a):
    """Return the TOC entries in ``a`` as aligned text, one per line.

    Each entry is a mapping with 'tocpage', 'label', 'title' and
    'pagenum' keys.  Labels and titles are left-justified to the widest
    value plus three columns; 'pagenum' is right-justified to width 3.
    Despite the name, nothing is printed -- the text is returned.
    """
    # Seeding with 0 keeps max() well-defined for an empty list.
    title_width = max([0] + [len(entry['title']) for entry in a]) + 3
    label_width = max([0] + [len(entry['label']) for entry in a]) + 3

    rows = []
    for entry in a:
        rows.append('%s %s %s %s' % (entry['tocpage'],
                                     entry['label'].ljust(label_width),
                                     entry['title'].ljust(title_width),
                                     entry['pagenum'].rjust(3)))
    return '\n'.join(rows)
def filter(pages):
    """Yield only pages whose scandata 'addToAccessFormats' text is 'true'.

    Note that within this module the name hides the builtin ``filter``.
    The tag is looked up with the module-level ``scandata_ns`` prefix.
    """
    tag = 'addToAccessFormats'
    for page in pages:
        # Disabled debugging aid:
        # if page.index % 1 == 0:
        #     drawable = page.get_drawable()
        #     page.draw_basics(drawable)
        #     drawable.save()
        flag = page.scandata.findtext(scandata_ns + tag)
        if flag != 'true':
            continue
        yield page
def annotate(pages):
    """Yield each page after filling in ``page.info`` from its scandata.

    Sets ``info['type']`` and ``info['number']`` (both lower-cased, or ''
    when the scandata lookup returns None) and ``info['bounds']`` from
    ``page.find_text_bounds()``.  When the module-level ``pagenos`` flag
    is set, ``find_pagenos.annotate_page`` is also run on the page.
    """
    for page in pages:
        # findtext can return None (pageNumber is guarded for that below);
        # give pageType the same '' fallback so a missing element does not
        # raise AttributeError on .lower().
        page_type = page.scandata.findtext(scandata_ns + 'pageType')
        if page_type is not None:
            page.info['type'] = page_type.lower()
        else:
            page.info['type'] = ''

        number = page.scandata.findtext(scandata_ns + 'pageNumber')
        if number is not None:
            page.info['number'] = number.lower()
        else:
            page.info['number'] = ''

        page.info['bounds'] = page.find_text_bounds()

        # XXXmm comment out for speed
        if pagenos:
            find_pagenos.annotate_page(page)
        # if hfs:
        #     find_header_footer.annotate_page(page)
        yield page
        # Drop this frame's reference before the next page is pulled from
        # the upstream iterator.
        page = None
def analyze(windowed_pages):
    """Yield every page from ``windowed_pages``, optionally guessing numbers.

    When the module-level ``pagenos`` flag is set,
    ``find_pagenos.guess_best_pageno`` is called for each page with the
    iterator and its ``window`` attribute; a page whose ``info['number']``
    is empty then takes ``str(info['pageno_guess'])`` if a guess exists.
    """
    for page in windowed_pages:
        if pagenos:
            # XXXmm comment out for speed
            window = windowed_pages.window
            find_pagenos.guess_best_pageno(page, windowed_pages, window)

            # Fall back to the guess only when no number was supplied.
            info = page.info
            if not info['number'] and 'pageno_guess' in info:
                info['number'] = str(info['pageno_guess'])
        # if hfs:
        #     find_header_footer.guess_hf(page, windowed_pages)
        yield page
def consume(pages):
    """Exhaust ``pages``, discarding every item; returns None."""
    for page in pages:
        # print page.info
        # Release the item before the next one is fetched.
        del page
# Script entry point: pass the command-line arguments (without the
# program name) to main().
if __name__ == '__main__':
    main(sys.argv[1:])