All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesshelper.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 from __future__ import print_function
3 from builtins import input
4 """
5 tesshelper.py -- Utility operations to compare, report stats, and copy
6  public headers for tesseract 3.0x VS2008 Project
7 
8 $RCSfile: tesshelper.py,v $ $Revision: 7ca575b377aa $ $Date: 2012/03/07 17:26:31 $
9 """
10 
11 r"""
12 Requires:
13 
14  python 2.7 or greater: activestate.com
15  http://www.activestate.com/activepython/downloads
16 
17 because using the new argparse module and new literal set syntax (s={1, 2}) .
18 
19 General Notes:
20 --------------
21 
22 Format for a .vcproj file entry:
23 
24  <File
25  RelativePath="..\src\allheaders.h"
26  >
27  </File>
28 
29 """
30 
# Usage examples appended to the command-line help: main() passes this text as
# the ArgumentParser epilog together with RawDescriptionHelpFormatter.
# Raw string, so the backslashes in the Windows paths are literal.
epilogStr = r"""
Examples:

Assume that tesshelper.py is in c:\buildfolder\tesseract-3.02\vs2008,
which is also the current directory. Then,

 python tesshelper .. compare

will compare c:\buildfolder\tesseract-3.02 "library" directories to the
libtesseract Project
(c:\buildfolder\tesseract-3.02\vs2008\libtesseract\libtesseract.vcproj).

 python tesshelper .. report

will display summary stats for c:\buildfolder\tesseract-3.02 "library"
directories and the libtesseract Project.

 python tesshelper .. copy ..\..\include

will copy all "public" libtesseract header files to
c:\buildfolder\include.

 python tesshelper .. clean

will clean the vs2008 folder of all build directories, and .user, .suo,
.ncb, and other temp files.

"""
59 
60 # imports of python standard library modules
61 # See Python Documentation | Library Reference for details
62 import collections
63 import glob
64 import argparse
65 import os
66 import re
67 import shutil
68 import sys
69 
70 # ====================================================================
71 
# Tool version: the fixed "1.0" followed by the date token taken from the
# RCS $Date$ keyword string.
VERSION = "1.0 %s" % "$Date: 2012/03/07 17:26:31 $".split()[1]

# Sub-directory (relative to the tesseract root) and file name of the project
# file that validateTessDir() requires to exist.
PROJ_SUBDIR = r"vs2008\libtesseract"
PROJFILE = "libtesseract.vcproj"

# Files that tessCompare() writes, relative to the current directory.
NEWHEADERS_FILENAME = "newheaders.txt"
NEWSOURCES_FILENAME = "newsources.txt"

# Project-file entry generated for a file that is missing from the project;
# the single %s receives the file's path relative to the tesseract root.
# Raw string: "\." and "\%" are not valid escape sequences, so a non-raw
# literal only worked by accident and triggers a warning on current Pythons.
fileNodeTemplate = r''' <ClCompile Include="..\..\%s" />'''
81 
82 # ====================================================================
83 
def getProjectfiles(libTessDir, libProjectFile, nTrimChars):
    """Return sets of all, c, h, and resources files in libtesseract Project.

    Args:
        libTessDir: directory against which the relative paths found in the
            project file are resolved.
        libProjectFile: path of the project file to scan.
        nTrimChars: number of leading characters sliced off every resolved
            absolute path before it is stored in the "all files" set.

    Returns:
        A 4-tuple ``(projectFilesSet, projectHFiles, projectCFiles,
        projectRFiles)``.  ``projectFilesSet`` holds the trimmed, lower-cased
        absolute path of every entry; the other three hold the entries
        exactly as written in the project file, grouped by extension
        (.h / .c and .cpp / .rc).
    """
    # "with" closes the handle even when read() raises.
    with open(libProjectFile, "r") as f:
        data = f.read()

    projectCFiles = set()
    projectHFiles = set()
    projectRFiles = set()
    projectFilesSet = set()

    # Extension (lower case) -> set that collects entries of that type.
    setsByExt = {
        ".c": projectCFiles,
        ".cpp": projectCFiles,
        ".h": projectHFiles,
        ".rc": projectRFiles,
    }

    # Only Include="..." values that start with "." are picked up.
    for projectFile in re.findall(r'(?i)Include="(\.[^"]+)"', data):
        ext = os.path.splitext(projectFile.lower())[1]
        bucket = setsByExt.get(ext)
        # Compare against None: an empty set is falsy but still a valid bucket.
        if bucket is None:
            print("unknown file type: %s" % projectFile)
        else:
            bucket.add(projectFile)

        # Entries of unknown type are reported above but still recorded here.
        absPath = os.path.abspath(os.path.join(libTessDir, projectFile))
        projectFilesSet.add(absPath[nTrimChars:].lower())

    return projectFilesSet, projectHFiles, projectCFiles, projectRFiles
114 
def getTessLibFiles(tessDir, nTrimChars):
    """Return set of all libtesseract files in tessDir.

    Each "library" sub-directory listed below is searched (non-recursively)
    for *.c, *.cpp and *.h files.  Every hit is made absolute, has its first
    nTrimChars characters sliced off, and is lower-cased before being added
    to the returned set.
    """
    subDirs = (
        "api",
        "ccmain",
        "ccstruct",
        "ccutil",
        "classify",
        "cube",
        "cutil",
        "dict",
        r"neural_networks\runtime",
        "opencl",
        "textord",
        "viewer",
        "wordrec",
        #"training",
        r"vs2010\port",
        r"vs2010\libtesseract",
    )
    wildcards = ("*.c", "*.cpp", "*.h")

    found = set()
    for subDir in subDirs:
        for wildcard in wildcards:
            for hit in glob.glob(os.path.join(tessDir, subDir, wildcard)):
                found.add(os.path.abspath(hit)[nTrimChars:].lower())
    return found
150 
151 # ====================================================================
152 
def tessCompare(tessDir):
    '''Compare libtesseract Project files and actual "sub-library" files.

    Reads vs2010/libtesseract/libtesseract.vcxproj under tessDir, collects
    the files found by getTessLibFiles(), and prints two lists:

    * "Extra" files: on disk but not referenced by the project.  A
      fileNodeTemplate entry for each is also written to NEWHEADERS_FILENAME
      (.h files) or NEWSOURCES_FILENAME (everything else).
    * "Dead" files: referenced by the project but not found on disk.
    '''
    libTessDir = os.path.join(tessDir, "vs2010", "libtesseract")
    libProjectFile = os.path.join(libTessDir, "libtesseract.vcxproj")
    tessAbsDir = os.path.abspath(tessDir)
    # +1 also drops the path separator that follows the root directory.
    nTrimChars = len(tessAbsDir) + 1
    print('Comparing VS2010 Project "%s" with\n "%s"' % (libProjectFile,
                                                        tessAbsDir))

    # Only the combined set is needed here; the per-type sets are ignored.
    projectFilesSet = getProjectfiles(libTessDir, libProjectFile,
                                      nTrimChars)[0]
    tessFiles = getTessLibFiles(tessDir, nTrimChars)

    extraFiles = sorted(tessFiles - projectFilesSet)
    print("%2d Extra files (in %s but not in Project)" % (len(extraFiles),
                                                          tessAbsDir))
    # Both lists inherit the sorted order of extraFiles.
    headerFiles = []
    sourceFiles = []
    for filename in extraFiles:
        if os.path.splitext(filename.lower())[1] == ".h":
            headerFiles.append(filename)
        else:
            sourceFiles.append(filename)
        print(" %s " % filename)

    print()
    print("%2d new header file items written to %s" % (len(headerFiles),
                                                       NEWHEADERS_FILENAME))
    # NOTE(review): fileNodeTemplate has no trailing newline, so the entries
    # end up back-to-back on one line -- confirm that is intended.
    with open(NEWHEADERS_FILENAME, "w") as f:
        for filename in headerFiles:
            f.write(fileNodeTemplate % filename)

    print("%2d new source file items written to %s" % (len(sourceFiles),
                                                       NEWSOURCES_FILENAME))
    with open(NEWSOURCES_FILENAME, "w") as f:
        for filename in sourceFiles:
            f.write(fileNodeTemplate % filename)
    print()

    deadFiles = sorted(projectFilesSet - tessFiles)
    # The closing parenthesis was missing from this message.
    print("%2d Dead files (in Project but not in %s)" % (len(deadFiles),
                                                         tessAbsDir))
    for filename in deadFiles:
        print(" %s " % filename)
206 
207 # ====================================================================
208 
def tessReport(tessDir):
    """Report summary stats on "sub-library" files and libtesseract Project file.

    Prints, per library folder, how many .h and .cpp files getTessLibFiles()
    found (plus column totals), followed by the number of header, source and
    resource entries getProjectfiles() extracted from the project file.
    """
    # NOTE(review): this reads the vs2008 .vcproj, yet getProjectfiles() only
    # matches Include="..." attributes (what tessCompare() reads from the
    # vs2010 .vcxproj), while the notes at the top of this file describe
    # .vcproj entries as RelativePath="...".  The project counts printed
    # below may therefore always be zero -- confirm which project file is
    # intended.
    libTessDir = os.path.join(tessDir, "vs2008", "libtesseract")
    libProjectFile = os.path.join(libTessDir, "libtesseract.vcproj")
    tessAbsDir = os.path.abspath(tessDir)
    # +1 also drops the path separator that follows the root directory.
    nTrimChars = len(tessAbsDir) + 1

    _, projectHFiles, projectCFiles, projectRFiles = \
        getProjectfiles(libTessDir, libProjectFile, nTrimChars)
    tessFiles = getTessLibFiles(tessDir, nTrimChars)

    print('Summary stats for "%s" library directories' % tessAbsDir)
    # folder -> Counter keyed by extension without the leading dot.
    folderCounters = collections.defaultdict(collections.Counter)
    for tessFile in tessFiles:
        folder, head = os.path.split(tessFile.lower())
        ext = os.path.splitext(head)[1]
        folderCounters[folder][ext[1:]] += 1

    totalFiles = 0
    totalH = 0
    totalCPP = 0

    print()
    print(" total h cpp")
    print(" ----- --- ---")
    for folder in sorted(folderCounters):
        counters = folderCounters[folder]
        nHFiles = counters['h']
        nCPPFiles = counters['cpp']

        # .c files are tallied above but are not part of the printed totals.
        total = nHFiles + nCPPFiles
        totalFiles += total
        totalH += nHFiles
        totalCPP += nCPPFiles

        print(" %5d %3d %3d %s" % (total, nHFiles, nCPPFiles, folder))
    print(" ----- --- ---")
    print(" %5d %3d %3d" % (totalFiles, totalH, totalCPP))

    print()
    print('Summary stats for VS2008 Project "%s"' % libProjectFile)
    print(" %5d %s" %(len(projectHFiles), "Header files"))
    print(" %5d %s" % (len(projectCFiles), "Source files"))
    print(" %5d %s" % (len(projectRFiles), "Resource files"))
    print(" -----")
    print(" %5d" % (len(projectHFiles) + len(projectCFiles) + len(projectRFiles), ))
262 
263 # ====================================================================
264 
def copyIncludes(fileSet, description, tessDir, includeDir):
    """Copy set of files to specified include dir.

    Args:
        fileSet: iterable of file paths relative to tessDir.
        description: label used only in the progress banner.
        tessDir: directory the relative paths are joined onto.
        includeDir: destination passed to shutil.copy2() for every file.

    Files are processed in sorted order.  A path that is not an existing
    file is reported and skipped; a summary of all skipped paths is printed
    at the end.
    """
    print()
    print('Copying libtesseract "%s" headers to %s' % (description, includeDir))
    print()

    count = 0
    errList = []
    for includeFile in sorted(fileSet):
        filepath = os.path.join(tessDir, includeFile)
        if os.path.isfile(filepath):
            shutil.copy2(filepath, includeDir)
            print("Copied: %s" % includeFile)
            count += 1
        else:
            print('***Error: "%s" doesn\'t exist"' % filepath)
            errList.append(filepath)

    print('%d header files successfully copied to "%s"' % (count, includeDir))
    if errList:
        # The count was never substituted before, so a literal "%d" was shown.
        print("The following %d files were not copied:" % len(errList))
        for filepath in errList:
            print(" %s" % filepath)
292 
def tessCopy(tessDir, includeDir):
    '''Copy all "public" libtesseract Project header files to include directory.

    The public headers go to <includeDir>/tesseract (created when missing);
    the "extra" version-number property sheets go to includeDir itself.
    Nothing is copied when <includeDir>/tesseract exists as a regular file.

    NOTE(review): earlier documentation said the directory hierarchy is
    preserved, but copyIncludes() copies every file straight into the one
    destination directory it is given -- confirm which is intended.
    '''

    apiHeaders = {
        r"api\baseapi.h",
        r"api\capi.h",
        r"api\apitypes.h",
        r"ccstruct\publictypes.h",
        r"ccmain\thresholder.h",
        r"ccutil\host.h",
        r"ccutil\basedir.h",
        r"ccutil\tesscallback.h",
        r"ccutil\unichar.h",
        r"ccutil\platform.h",
    }

    strngHeaders = {
        r"ccutil\strngs.h",
        r"ccutil\memry.h",
        r"ccutil\host.h",
        r"ccutil\serialis.h",
        r"ccutil\errcode.h",
        r"ccutil\fileerr.h",
        #r"ccutil\genericvector.h",
    }

    resultIteratorHeaders = {
        r"ccmain\ltrresultiterator.h",
        r"ccmain\pageiterator.h",
        r"ccmain\resultiterator.h",
        r"ccutil\genericvector.h",
        r"ccutil\tesscallback.h",
        r"ccutil\errcode.h",
        r"ccutil\host.h",
        r"ccutil\helpers.h",
        r"ccutil\ndminx.h",
        r"ccutil\params.h",
        r"ccutil\unicharmap.h",
        r"ccutil\unicharset.h",
    }

    # The next two groups are not part of the set that is copied; they are
    # only referenced by the alternative selection commented out further down.
    genericVectorHeaders = {
        r"ccutil\genericvector.h",
        r"ccutil\tesscallback.h",
        r"ccutil\errcode.h",
        r"ccutil\host.h",
        r"ccutil\helpers.h",
        r"ccutil\ndminx.h",
    }

    blobsHeaders = {
        r"ccstruct\blobs.h",
        r"ccstruct\rect.h",
        r"ccstruct\points.h",
        r"ccstruct\ipoints.h",
        r"ccutil\elst.h",
        r"ccutil\host.h",
        r"ccutil\serialis.h",
        r"ccutil\lsterr.h",
        r"ccutil\ndminx.h",
        r"ccutil\tprintf.h",
        r"ccutil\params.h",
        r"viewer\scrollview.h",
        r"ccstruct\vecfuncs.h",
    }

    extraFiles = {
        #r"vs2008\include\stdint.h",
        r"vs2008\include\leptonica_versionnumbers.vsprops",
        r"vs2008\include\tesseract_versionnumbers.vsprops",
    }

    publicDir = os.path.join(includeDir, "tesseract")
    if os.path.isfile(publicDir):
        print('Aborting: "%s" is a file not a directory.' % publicDir)
        return
    elif not os.path.exists(publicDir):
        os.mkdir(publicDir)

    # Alternative selection:
    #   apiHeaders | strngHeaders | genericVectorHeaders | blobsHeaders
    publicHeaders = set().union(apiHeaders, strngHeaders, resultIteratorHeaders)

    copyIncludes(publicHeaders, "public", tessDir, publicDir)
    copyIncludes(extraFiles, "extra", tessDir, includeDir)
379 
380 # ====================================================================
381 
def _tessCleanRemove(path, listOnly, remover):
    """Report path and, unless listOnly is set, delete it with remover(path)."""
    if listOnly:
        print("Would remove: %s" % path)
    else:
        print("Removing: %s" % path)
        remover(path)


def tessClean(tessDir):
    '''Clean vs2008 folder of all build directories and certain temp files.

    Asks for confirmation first; anything other than "yes" (case-insensitive)
    returns without touching the disk.  A second prompt selects list-only
    mode, which is the default unless the reply is "no".

    Walking <tessDir>/vs2008:
    * every LIB_Release, LIB_Debug, DLL_Release and DLL_Debug directory is
      removed and not descended into;
    * in the vs2008 folder itself, every file except tesseract.sln,
      tesshelper.py and readme.txt is removed;
    * in sub-folders, only files whose extension is .suo, .ncb or .user, or
      ends in "~", are removed.
    '''
    vs2008AbsDir = os.path.abspath(os.path.join(tessDir, "vs2008"))

    # input() already returns the typed text.  It used to be wrapped in
    # eval(), which executed the reply as Python code: typing "Yes" raised
    # NameError and any other expression would have been run.
    answer = input(
        'Are you sure you want to clean the\n "%s" folder (Yes/No) [No]? ' %
        vs2008AbsDir)
    if answer.strip().lower() != "yes":
        return
    answer = input('Only list the items to be deleted (Yes/No) [Yes]? ')
    listOnly = answer.strip().lower() != "no"

    buildDirs = ("LIB_Release", "LIB_Debug", "DLL_Release", "DLL_Debug")
    keepInRoot = ("tesseract.sln", "tesshelper.py", "readme.txt")
    tempExts = (".suo", ".ncb", ".user")

    for rootDir, dirs, files in os.walk(vs2008AbsDir):
        for buildDir in buildDirs:
            if buildDir in dirs:
                # Editing dirs in place stops os.walk from descending into it.
                dirs.remove(buildDir)
                _tessCleanRemove(os.path.join(rootDir, buildDir), listOnly,
                                 shutil.rmtree)

        if rootDir == vs2008AbsDir:
            for fileName in files:
                if fileName.lower() not in keepInRoot:
                    _tessCleanRemove(os.path.join(rootDir, fileName),
                                     listOnly, os.remove)
        else:
            for fileName in files:
                ext = os.path.splitext(fileName)[1]
                if ext.lower() in tempExts or ext.endswith("~"):
                    _tessCleanRemove(os.path.join(rootDir, fileName),
                                     listOnly, os.remove)
433 
434 # ====================================================================
435 
def validateTessDir(tessDir):
    """Check that tessDir is a valid tesseract directory.

    argparse "type" callback: returns tessDir unchanged when it is an
    existing directory that contains PROJ_SUBDIR/PROJFILE, otherwise raises
    argparse.ArgumentTypeError describing what is missing.
    """
    if os.path.isdir(tessDir):
        projFile = os.path.join(tessDir, PROJ_SUBDIR, PROJFILE)
        if os.path.isfile(projFile):
            return tessDir
        raise argparse.ArgumentTypeError('Project file "%s" doesn\'t exist.' % projFile)
    raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % tessDir)
445 
def validateDir(dir):
    """Check that dir is a valid directory named include.

    argparse "type" callback: returns dir unchanged when it is an existing
    directory whose final path component, compared case-insensitively, is
    "include"; raises argparse.ArgumentTypeError otherwise.
    """
    if not os.path.isdir(dir):
        raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % dir)

    # Resolve first so that relative paths such as "." are judged by their
    # real directory name.
    tail = os.path.basename(os.path.abspath(dir))
    if tail.lower() == "include":
        return dir

    raise argparse.ArgumentTypeError('Include directory "%s" must be named "include".' % tail)
458 
def main():
    """Parse the command line and run the selected command.

    The positional tessDir is validated by validateTessDir.  One of the
    sub-commands compare, report, copy or clean selects the handler; copy
    additionally takes an includeDir validated by validateDir.
    """
    parser = argparse.ArgumentParser(
        epilog=epilogStr,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--version", action="version",
                        version="%(prog)s " + VERSION)
    parser.add_argument('tessDir', type=validateTessDir,
                        help="tesseract installation directory")

    commands = parser.add_subparsers(dest="subparser_name", title="Commands")

    compareCmd = commands.add_parser(
        'compare', help="compare libtesseract Project with tessDir")
    compareCmd.set_defaults(func=tessCompare)

    reportCmd = commands.add_parser(
        'report', help="report libtesseract summary stats")
    reportCmd.set_defaults(func=tessReport)

    copyCmd = commands.add_parser(
        'copy', help="copy public libtesseract header files to includeDir")
    copyCmd.add_argument('includeDir', type=validateDir,
                         help="Directory to copy header files to.")
    copyCmd.set_defaults(func=tessCopy)

    cleanCmd = commands.add_parser(
        'clean', help="clean vs2008 folder of build folders and .user files")
    cleanCmd.set_defaults(func=tessClean)

    # argparse cannot declare a default sub-command, so when exactly one
    # argument was given, "compare" is appended before parsing.
    if len(sys.argv) == 2:
        sys.argv.append("compare")
    args = parser.parse_args()

    # Only the copy command takes a second argument.
    handler = args.func
    if handler is tessCopy:
        handler(args.tessDir, args.includeDir)
    else:
        handler(args.tessDir)


if __name__ == '__main__':
    main()
def copyIncludes
Definition: tesshelper.py:265
def validateTessDir
Definition: tesshelper.py:436
def getTessLibFiles
Definition: tesshelper.py:115
def tessReport
Definition: tesshelper.py:209
def tessClean
Definition: tesshelper.py:382
def tessCopy
Definition: tesshelper.py:293
def validateDir
Definition: tesshelper.py:446
def getProjectfiles
Definition: tesshelper.py:84
def tessCompare
Definition: tesshelper.py:153