mirror of https://github.com/nodejs/node.git
356 lines
12 KiB
Python
Executable File
356 lines
12 KiB
Python
Executable File
#!/usr/bin/python
|
|
#
|
|
# Copyright (C) 2014 IBM Corporation and Others. All Rights Reserved.
|
|
#
|
|
# @author Steven R. Loomis <srl@icu-project.org>
|
|
#
|
|
# This tool slims down an ICU data (.dat) file according to a config file.
|
|
#
|
|
# See: http://bugs.icu-project.org/trac/ticket/10922
|
|
#
|
|
# Usage:
|
|
# Use "-h" to get help options.
|
|
|
|
from __future__ import print_function
|
|
|
|
import io
|
|
import json
|
|
import optparse
|
|
import os
|
|
import re
|
|
import shutil
|
|
import sys
|
|
|
|
try:
|
|
# for utf-8 on Python 2
|
|
reload(sys)
|
|
sys.setdefaultencoding("utf-8")
|
|
except NameError:
|
|
pass # Python 3 already defaults to utf-8
|
|
|
|
try:
|
|
basestring # Python 2
|
|
except NameError:
|
|
basestring = str # Python 3
|
|
|
|
endian=sys.byteorder
|
|
|
|
parser = optparse.OptionParser(usage="usage: mkdir tmp ; %prog -D ~/Downloads/icudt53l.dat -T tmp -F trim_en.json -O icudt53l.dat" )
|
|
|
|
parser.add_option("-P","--tool-path",
|
|
action="store",
|
|
dest="toolpath",
|
|
help="set the prefix directory for ICU tools")
|
|
|
|
parser.add_option("-D","--input-file",
|
|
action="store",
|
|
dest="datfile",
|
|
help="input data file (icudt__.dat)",
|
|
) # required
|
|
|
|
parser.add_option("-F","--filter-file",
|
|
action="store",
|
|
dest="filterfile",
|
|
help="filter file (JSON format)",
|
|
) # required
|
|
|
|
parser.add_option("-T","--tmp-dir",
|
|
action="store",
|
|
dest="tmpdir",
|
|
help="working directory.",
|
|
) # required
|
|
|
|
parser.add_option("--delete-tmp",
|
|
action="count",
|
|
dest="deltmpdir",
|
|
help="delete working directory.",
|
|
default=0)
|
|
|
|
parser.add_option("-O","--outfile",
|
|
action="store",
|
|
dest="outfile",
|
|
help="outfile (NOT a full path)",
|
|
) # required
|
|
|
|
parser.add_option("-v","--verbose",
|
|
action="count",
|
|
default=0)
|
|
|
|
parser.add_option('-L',"--locales",
|
|
action="store",
|
|
dest="locales",
|
|
help="sets the 'locales.only' variable",
|
|
default=None)
|
|
|
|
parser.add_option('-e', '--endian', action='store', dest='endian', help='endian, big, little or host, your default is "%s".' % endian, default=endian, metavar='endianness')
|
|
|
|
(options, args) = parser.parse_args()
|
|
|
|
optVars = vars(options)
|
|
|
|
for opt in [ "datfile", "filterfile", "tmpdir", "outfile" ]:
|
|
if optVars[opt] is None:
|
|
print("Missing required option: %s" % opt)
|
|
sys.exit(1)
|
|
|
|
if options.verbose>0:
|
|
print("Options: "+str(options))
|
|
|
|
if (os.path.isdir(options.tmpdir) and options.deltmpdir):
|
|
if options.verbose>1:
|
|
print("Deleting tmp dir %s.." % (options.tmpdir))
|
|
shutil.rmtree(options.tmpdir)
|
|
|
|
if not (os.path.isdir(options.tmpdir)):
|
|
os.mkdir(options.tmpdir)
|
|
else:
|
|
print("Please delete tmpdir %s before beginning." % options.tmpdir)
|
|
sys.exit(1)
|
|
|
|
if options.endian not in ("big","little","host"):
|
|
print("Unknown endianness: %s" % options.endian)
|
|
sys.exit(1)
|
|
|
|
if options.endian == "host":
|
|
options.endian = endian
|
|
|
|
if not os.path.isdir(options.tmpdir):
|
|
print("Error, tmpdir not a directory: %s" % (options.tmpdir))
|
|
sys.exit(1)
|
|
|
|
if not os.path.isfile(options.filterfile):
|
|
print("Filterfile doesn't exist: %s" % (options.filterfile))
|
|
sys.exit(1)
|
|
|
|
if not os.path.isfile(options.datfile):
|
|
print("Datfile doesn't exist: %s" % (options.datfile))
|
|
sys.exit(1)
|
|
|
|
if not options.datfile.endswith(".dat"):
|
|
print("Datfile doesn't end with .dat: %s" % (options.datfile))
|
|
sys.exit(1)
|
|
|
|
outfile = os.path.join(options.tmpdir, options.outfile)
|
|
|
|
if os.path.isfile(outfile):
|
|
print("Error, output file does exist: %s" % (outfile))
|
|
sys.exit(1)
|
|
|
|
if not options.outfile.endswith(".dat"):
|
|
print("Outfile doesn't end with .dat: %s" % (options.outfile))
|
|
sys.exit(1)
|
|
|
|
dataname=options.outfile[0:-4]
|
|
|
|
|
|
## TODO: need to improve this. Quotes, etc.
|
|
def runcmd(tool, cmd, doContinue=False):
|
|
if(options.toolpath):
|
|
cmd = os.path.join(options.toolpath, tool) + " " + cmd
|
|
else:
|
|
cmd = tool + " " + cmd
|
|
|
|
if(options.verbose>4):
|
|
print("# " + cmd)
|
|
|
|
rc = os.system(cmd)
|
|
if rc != 0 and not doContinue:
|
|
print("FAILED: %s" % cmd)
|
|
sys.exit(1)
|
|
return rc
|
|
|
|
## STEP 0 - read in json config
|
|
with io.open(options.filterfile, encoding='utf-8') as fi:
|
|
config = json.load(fi)
|
|
|
|
if options.locales:
|
|
config["variables"] = config.get("variables", {})
|
|
config["variables"]["locales"] = config["variables"].get("locales", {})
|
|
config["variables"]["locales"]["only"] = options.locales.split(',')
|
|
|
|
if options.verbose > 6:
|
|
print(config)
|
|
|
|
if "comment" in config:
|
|
print("%s: %s" % (options.filterfile, config["comment"]))
|
|
|
|
## STEP 1 - copy the data file, swapping endianness
|
|
## The first letter of endian_letter will be 'b' or 'l' for big or little
|
|
endian_letter = options.endian[0]
|
|
|
|
runcmd("icupkg", "-t%s %s %s""" % (endian_letter, options.datfile, outfile))
|
|
|
|
## STEP 2 - get listing
|
|
listfile = os.path.join(options.tmpdir,"icudata.lst")
|
|
runcmd("icupkg", "-l %s > %s""" % (outfile, listfile))
|
|
|
|
with open(listfile, 'rb') as fi:
|
|
items = [line.strip() for line in fi.read().decode("utf-8").splitlines()]
|
|
itemset = set(items)
|
|
|
|
if options.verbose > 1:
|
|
print("input file: %d items" % len(items))
|
|
|
|
# list of all trees
|
|
trees = {}
|
|
RES_INDX = "res_index.res"
|
|
remove = None
|
|
# remove - always remove these
|
|
if "remove" in config:
|
|
remove = set(config["remove"])
|
|
else:
|
|
remove = set()
|
|
|
|
# keep - always keep these
|
|
if "keep" in config:
|
|
keep = set(config["keep"])
|
|
else:
|
|
keep = set()
|
|
|
|
def queueForRemoval(tree):
|
|
global remove
|
|
if tree not in config.get("trees", {}):
|
|
return
|
|
mytree = trees[tree]
|
|
if options.verbose > 0:
|
|
print("* %s: %d items" % (tree, len(mytree["locs"])))
|
|
# do varible substitution for this tree here
|
|
if isinstance(config["trees"][tree], basestring):
|
|
treeStr = config["trees"][tree]
|
|
if options.verbose > 5:
|
|
print(" Substituting $%s for tree %s" % (treeStr, tree))
|
|
if treeStr not in config.get("variables", {}):
|
|
print(" ERROR: no variable: variables.%s for tree %s" % (treeStr, tree))
|
|
sys.exit(1)
|
|
config["trees"][tree] = config["variables"][treeStr]
|
|
myconfig = config["trees"][tree]
|
|
if options.verbose > 4:
|
|
print(" Config: %s" % (myconfig))
|
|
# Process this tree
|
|
if(len(myconfig)==0 or len(mytree["locs"])==0):
|
|
if(options.verbose>2):
|
|
print(" No processing for %s - skipping" % (tree))
|
|
else:
|
|
only = None
|
|
if "only" in myconfig:
|
|
only = set(myconfig["only"])
|
|
if (len(only)==0) and (mytree["treeprefix"] != ""):
|
|
thePool = "%spool.res" % (mytree["treeprefix"])
|
|
if (thePool in itemset):
|
|
if(options.verbose>0):
|
|
print("Removing %s because tree %s is empty." % (thePool, tree))
|
|
remove.add(thePool)
|
|
else:
|
|
print("tree %s - no ONLY")
|
|
for l in range(len(mytree["locs"])):
|
|
loc = mytree["locs"][l]
|
|
if (only is not None) and not loc in only:
|
|
# REMOVE loc
|
|
toRemove = "%s%s%s" % (mytree["treeprefix"], loc, mytree["extension"])
|
|
if(options.verbose>6):
|
|
print("Queueing for removal: %s" % toRemove)
|
|
remove.add(toRemove)
|
|
|
|
def addTreeByType(tree, mytree):
|
|
if(options.verbose>1):
|
|
print("(considering %s): %s" % (tree, mytree))
|
|
trees[tree] = mytree
|
|
mytree["locs"]=[]
|
|
for i in range(len(items)):
|
|
item = items[i]
|
|
if item.startswith(mytree["treeprefix"]) and item.endswith(mytree["extension"]):
|
|
mytree["locs"].append(item[len(mytree["treeprefix"]):-4])
|
|
# now, process
|
|
queueForRemoval(tree)
|
|
|
|
addTreeByType("converters",{"treeprefix":"", "extension":".cnv"})
|
|
addTreeByType("stringprep",{"treeprefix":"", "extension":".spp"})
|
|
addTreeByType("translit",{"treeprefix":"translit/", "extension":".res"})
|
|
addTreeByType("brkfiles",{"treeprefix":"brkitr/", "extension":".brk"})
|
|
addTreeByType("brkdict",{"treeprefix":"brkitr/", "extension":"dict"})
|
|
addTreeByType("confusables",{"treeprefix":"", "extension":".cfu"})
|
|
|
|
for i in range(len(items)):
|
|
item = items[i]
|
|
if item.endswith(RES_INDX):
|
|
treeprefix = item[0:item.rindex(RES_INDX)]
|
|
tree = None
|
|
if treeprefix == "":
|
|
tree = "ROOT"
|
|
else:
|
|
tree = treeprefix[0:-1]
|
|
if(options.verbose>6):
|
|
print("procesing %s" % (tree))
|
|
trees[tree] = { "extension": ".res", "treeprefix": treeprefix, "hasIndex": True }
|
|
# read in the resource list for the tree
|
|
treelistfile = os.path.join(options.tmpdir,"%s.lst" % tree)
|
|
runcmd("iculslocs", "-i %s -N %s -T %s -l > %s" % (outfile, dataname, tree, treelistfile))
|
|
with io.open(treelistfile, 'r', encoding='utf-8') as fi:
|
|
treeitems = fi.readlines()
|
|
trees[tree]["locs"] = [line.strip() for line in treeitems]
|
|
if tree not in config.get("trees", {}):
|
|
print(" Warning: filter file %s does not mention trees.%s - will be kept as-is" % (options.filterfile, tree))
|
|
else:
|
|
queueForRemoval(tree)
|
|
|
|
def removeList(count=0):
|
|
# don't allow "keep" items to creep in here.
|
|
global remove
|
|
remove = remove - keep
|
|
if(count > 10):
|
|
print("Giving up - %dth attempt at removal." % count)
|
|
sys.exit(1)
|
|
if(options.verbose>1):
|
|
print("%d items to remove - try #%d" % (len(remove),count))
|
|
if(len(remove)>0):
|
|
oldcount = len(remove)
|
|
hackerrfile=os.path.join(options.tmpdir, "REMOVE.err")
|
|
removefile = os.path.join(options.tmpdir, "REMOVE.lst")
|
|
with open(removefile, 'wb') as fi:
|
|
fi.write('\n'.join(remove).encode("utf-8") + b'\n')
|
|
rc = runcmd("icupkg","-r %s %s 2> %s" % (removefile,outfile,hackerrfile),True)
|
|
if rc != 0:
|
|
if(options.verbose>5):
|
|
print("## Damage control, trying to parse stderr from icupkg..")
|
|
fi = open(hackerrfile, 'rb')
|
|
erritems = fi.readlines()
|
|
fi.close()
|
|
#Item zone/zh_Hant_TW.res depends on missing item zone/zh_Hant.res
|
|
pat = re.compile(br"^Item ([^ ]+) depends on missing item ([^ ]+).*")
|
|
for i in range(len(erritems)):
|
|
line = erritems[i].strip()
|
|
m = pat.match(line)
|
|
if m:
|
|
toDelete = m.group(1).decode("utf-8")
|
|
if(options.verbose > 5):
|
|
print("<< %s added to delete" % toDelete)
|
|
remove.add(toDelete)
|
|
else:
|
|
print("ERROR: could not match errline: %s" % line)
|
|
sys.exit(1)
|
|
if(options.verbose > 5):
|
|
print(" now %d items to remove" % len(remove))
|
|
if(oldcount == len(remove)):
|
|
print(" ERROR: could not add any mor eitems to remove. Fail.")
|
|
sys.exit(1)
|
|
removeList(count+1)
|
|
|
|
# fire it up
|
|
removeList(1)
|
|
|
|
# now, fixup res_index, one at a time
|
|
for tree in trees:
|
|
# skip trees that don't have res_index
|
|
if "hasIndex" not in trees[tree]:
|
|
continue
|
|
treebunddir = options.tmpdir
|
|
if(trees[tree]["treeprefix"]):
|
|
treebunddir = os.path.join(treebunddir, trees[tree]["treeprefix"])
|
|
if not (os.path.isdir(treebunddir)):
|
|
os.mkdir(treebunddir)
|
|
treebundres = os.path.join(treebunddir,RES_INDX)
|
|
treebundtxt = "%s.txt" % (treebundres[0:-4])
|
|
runcmd("iculslocs", "-i %s -N %s -T %s -b %s" % (outfile, dataname, tree, treebundtxt))
|
|
runcmd("genrb","-d %s -s %s res_index.txt" % (treebunddir, treebunddir))
|
|
runcmd("icupkg","-s %s -a %s%s %s" % (options.tmpdir, trees[tree]["treeprefix"], RES_INDX, outfile))
|