# HG changeset patch # User Michael Diamond@Artemis # Date 1276382839 25200 # Node ID 9d004bdf5b1da5aac2f8c24433c6b5169cef6d40 # Parent c00fff9956e66e1b070e4f38d1e92c5965522bb0 Improved the _prefix() method of t, and added a test utility to confirm the new method works and runs faster - the same code on Python 3.1 runs much faster. https://bitbucket.org/dimo414/t-3.1/ diff -r c00fff9956e6 -r 9d004bdf5b1d SpeedResults.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SpeedResults.txt Sat Jun 12 15:47:19 2010 -0700 @@ -0,0 +1,223 @@ +These are the cProfile results of running the original prefix function +against the new prefix function. The original function ran in ~O(n^2) +time, the new one runs in ~O(n) time. + +You can also run prefixcheck.check_prefixes() to confirm there is no +regression from original to new. + +It is worth noting that + +----- + +Size: 1 + +t_prefixes() + 8 function calls in 0.000 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.000 0.000 :1() + 1 0.000 0.000 0.000 0.000 prefixcheck.py:14(t_prefixes) + 1 0.000 0.000 0.000 0.000 {any} + 1 0.000 0.000 0.000 0.000 {len} + 1 0.000 0.000 0.000 0.000 {map} + 1 0.000 0.000 0.000 0.000 {method 'difference' of 'set' objects} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.000 0.000 0.000 0.000 {range} + + +new_prefixes() + 8 function calls in 0.000 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.000 0.000 :1() + 1 0.000 0.000 0.000 0.000 prefixcheck.py:38(new_prefixes) + 1 0.000 0.000 0.000 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} + 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects} + 1 0.000 0.000 0.000 0.000 {range} + 1 0.000 0.000 0.000 0.000 {zip} + + +----- + +Size: 10 + +t_prefixes() + 2973 function calls in 0.009 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.009 0.009 :1() + 1 0.001 0.001 0.009 0.009 prefixcheck.py:14(t_prefixes) + 1323 0.004 0.000 0.006 0.000 prefixcheck.py:29() + 147 0.000 0.000 0.000 0.000 {any} + 10 0.000 0.000 0.000 0.000 {len} + 147 0.002 0.000 0.008 0.000 {map} + 10 0.000 0.000 0.000 0.000 {method 'difference' of 'set' objects} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1323 0.002 0.000 0.002 0.000 {method 'startswith' of 'str' objects} + 10 0.000 0.000 0.000 0.000 {range} + + +new_prefixes() + 30 function calls in 0.000 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.000 0.000 :1() + 1 0.000 0.000 0.000 0.000 prefixcheck.py:38(new_prefixes) + 10 0.000 0.000 0.000 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} + 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects} + 14 0.000 0.000 0.000 0.000 {range} + 1 0.000 0.000 0.000 0.000 {zip} + + +----- + +Size: 100 + +t_prefixes() + 265303 function calls in 0.831 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.831 0.831 :1() + 1 0.009 0.009 0.830 0.830 prefixcheck.py:14(t_prefixes) + 131175 0.397 0.000 0.597 0.000 prefixcheck.py:29() + 1325 0.003 0.000 0.003 0.000 {any} + 100 0.000 0.000 0.000 0.000 {len} + 1325 0.221 0.000 0.817 0.001 {map} + 100 0.001 0.000 0.001 0.000 {method 'difference' of 'set' objects} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 131175 0.200 0.000 0.200 0.000 {method 'startswith' of 'str' objects} + 100 0.000 0.000 0.000 0.000 {range} + + +new_prefixes() + 257 function calls in 0.002 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.002 0.002 :1() + 1 0.001 0.001 0.002 0.002 prefixcheck.py:38(new_prefixes) + 100 0.000 0.000 0.000 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} + 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects} + 151 0.000 0.000 0.000 0.000 {range} + 1 0.000 0.000 0.000 0.000 {zip} + + +----- + +Size: 1000 + +t_prefixes() + 25827686 function calls in 79.873 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 79.873 79.873 :1() + 1 0.248 0.248 79.873 79.873 prefixcheck.py:14(t_prefixes) + 12899060 38.676 0.000 58.231 0.000 prefixcheck.py:29() + 13298 0.073 0.000 0.073 0.000 {any} + 989 0.002 0.000 0.002 0.000 {len} + 13298 21.266 0.002 79.497 0.006 {map} + 989 0.050 0.000 0.050 0.000 {method 'difference' of 'set' objects} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 12899060 19.555 0.000 19.555 0.000 {method 'startswith' of 'str' objects} + 989 0.003 0.000 0.003 0.000 {range} + + +new_prefixes() + 2534 function calls in 0.019 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.000 0.000 0.019 0.019 :1() + 1 0.011 0.011 0.019 0.019 prefixcheck.py:38(new_prefixes) + 989 0.001 0.000 0.001 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects} + 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects} + 1539 0.003 0.000 0.003 0.000 {range} + 1 0.003 0.003 0.003 0.003 {zip} + + +----- + +Size: 10000 + +Too big to run t_prefixes() efficiently. Would take several minutes. + +new_prefixes() + 25448 function calls in 0.212 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.001 0.001 0.212 0.212 :1() + 1 0.125 0.125 0.210 0.210 prefixcheck.py:38(new_prefixes) + 9957 0.014 0.000 0.014 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.003 0.003 0.003 0.003 {method 'keys' of 'dict' objects} + 1 0.001 0.001 0.001 0.001 {method 'values' of 'dict' objects} + 15485 0.029 0.000 0.029 0.000 {range} + 1 0.037 0.037 0.037 0.037 {zip} + + +----- + +Size: 100000 + +Too big to run t_prefixes() efficiently. Would take several minutes. + +new_prefixes() + 253838 function calls in 3.780 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.021 0.021 3.780 3.780 :1() + 1 1.490 1.490 3.759 3.759 prefixcheck.py:38(new_prefixes) + 99615 0.139 0.000 0.139 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.041 0.041 0.041 0.041 {method 'keys' of 'dict' objects} + 1 0.016 0.016 0.016 0.016 {method 'values' of 'dict' objects} + 154217 0.274 0.000 0.274 0.000 {range} + 1 1.798 1.798 1.798 1.798 {zip} + + +----- + +Size: 1000000 + +Too big to run t_prefixes() efficiently. Would take several minutes. + +new_prefixes() + 2530533 function calls in 151.824 CPU seconds + + Ordered by: standard name + + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.256 0.256 151.824 151.824 :1() + 1 16.424 16.424 151.569 151.569 prefixcheck.py:38(new_prefixes) + 996118 1.388 0.000 1.388 0.000 {len} + 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects} + 1 0.397 0.397 0.397 0.397 {method 'keys' of 'dict' objects} + 1 0.159 0.159 0.159 0.159 {method 'values' of 'dict' objects} + 1534409 2.731 0.000 2.731 0.000 {range} + 1 130.471 130.471 130.471 130.471 {zip} \ No newline at end of file diff -r c00fff9956e6 -r 9d004bdf5b1d prefixcheck.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/prefixcheck.py Sat Jun 12 15:47:19 2010 -0700 @@ -0,0 +1,153 @@ +''' +Test the prefix generating method of t vs my bug program + +Created on Jun 11, 2010 + +@author: Michael Diamond +''' + +import math,random,hashlib,cProfile + +# +# Initial t Prefix +# +def t_prefixes(ids): + """Return a mapping of ids to prefixes. + + Each prefix will be the shortest possible substring of the ID that + can uniquely identify it among the given group of IDs. + + If an ID of one task is entirely a substring of another task's ID, the + entire ID will be the prefix. + + """ + prefixes = {} + for task_id in ids: + others = set(ids).difference([task_id]) + for i in range(1, len(task_id)+1): + prefix = task_id[:i] + if not any(map(lambda o: o.startswith(prefix), others)): + prefixes[task_id] = prefix + break + prefixes[task_id] = task_id + return prefixes + +# +# New t Prefix +# +def new_prefixes(ids): + """Return a mapping of ids to prefixes in O(n) time. + + This is much faster than the naitive t function, which + takes O(n^2) time. + + Each prefix will be the shortest possible substring of the ID that + can uniquely identify it among the given group of IDs. + + If an ID of one task is entirely a substring of another task's ID, the + entire ID will be the prefix. + """ + pre = {} + for id in ids: + #print(id) + id_len = len(id) + for i in range(1, id_len+1): + """ identifies an empty prefix slot, or a singular collision """ + prefix = id[:i] + #print (pre) + #print (prefix) + if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]): + break + #print (prefix) + #print ("--") + if prefix in pre: + """ if there is a collision """ + collide = pre[prefix] + for j in range(i,id_len+1): + if collide[:j] == id[:j]: + pre[id[:j]] = ':' + else: + pre[collide[:j]] = collide + pre[id[:j]] = id + break + else: + pre[collide[:id_len+1]] = collide + pre[id] = id + else: + """ no collision, can safely add """ + pre[prefix] = id + #print("Additional") + #print(pre) + #print("\n***\n") + pre = dict(zip(pre.values(),pre.keys())) + if ':' in pre: + del pre[':'] + return pre + +# +# Test the prefix methods +# +def _hash(text): + """ Return a hash of the given text for use as a testing id """ + return hashlib.sha1(text.encode('utf-8')).hexdigest() + +def _gen_ids(num, substr=.4): + """ Generates a list of hashes of size num to be used as ids. + substr specifies the percentage of ids that should be substrings + of other ids """ + out = [] + alphabet = 'abcdefghijklmnopqrstuvwxyz' + min = 5 + max = 15 + sub_prefix = int(math.floor(num*substr)) + num -= sub_prefix + for _ in range(num): + string = '' + for x in random.sample(alphabet,random.randint(min,max)): + string+=x + out.append(_hash(string)) + for _ in range(sub_prefix): + str = random.choice(out) + if len(str) < 2: + sub_prefix += 1 + continue + out.append(str[:random.randint(1,len(str)-1)]) + random.shuffle(out) + return out + +def _check_prefixes(t_pre,new_pre): + """ Tests two prefix maps to confirm they are the same """ + for k in t_pre: + if not k in new_pre: + raise Exception(k+" not in new. Should be: "+new_pre[k]) + if t_pre[k] != new_pre[k]: + raise Exception(k+" doesn't match. t: "+t_pre[k]+" - new: "+new_pre[k]) + for k in new_pre: + if not k in t_pre: + raise Exception(k+" not in t. Expected: "+new_pre[k]) + if new_pre[k] != t_pre[k]: + raise Exception(k+" doesn't match. new: "+new_pre[k]+" - t: "+t_pre[k]) + print("No errors found. Both maps are identical.") + +def check_prefixes(size=1000): + ids = _gen_ids(size) + _check_prefixes(t_prefixes(ids),new_prefixes(ids)) + +def speed_check(base=10,max_pow=7,size_limit=5000): + for i in range(max_pow): + size = int(math.pow(base,i)) + print("-----\n\nSize: %s\n" % size) + globals()['ids'] = _gen_ids(size) + #print(ids) + + if size <= size_limit: + print("t_prefixes()") + cProfile.run("t_prefixes(ids)") + else: + print("Too big to run t_prefixes() efficiently. Would take several minutes.\n") + + print("new_prefixes()") + cProfile.run("new_prefixes(ids)") + +#check_prefixes() +speed_check() \ No newline at end of file diff -r c00fff9956e6 -r 9d004bdf5b1d t.py --- a/t.py Wed Jan 20 20:55:17 2010 -0500 +++ b/t.py Sat Jun 12 15:47:19 2010 -0700 @@ -78,25 +78,45 @@ return tasklines def _prefixes(ids): - """Return a mapping of ids to prefixes. + """Return a mapping of ids to prefixes in O(n) time. + + This is much faster than the naitive t function, which + takes O(n^2) time. Each prefix will be the shortest possible substring of the ID that can uniquely identify it among the given group of IDs. If an ID of one task is entirely a substring of another task's ID, the entire ID will be the prefix. - """ - prefixes = {} - for task_id in ids: - others = set(ids).difference([task_id]) - for i in range(1, len(task_id)+1): - prefix = task_id[:i] - if not any(map(lambda o: o.startswith(prefix), others)): - prefixes[task_id] = prefix + pre = {} + for id in ids: + id_len = len(id) + for i in range(1, id_len+1): + """ identifies an empty prefix slot, or a singular collision """ + prefix = id[:i] + if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]): break - prefixes[task_id] = task_id - return prefixes + if prefix in pre: + """ if there is a collision """ + collide = pre[prefix] + for j in range(i,id_len+1): + if collide[:j] == id[:j]: + pre[id[:j]] = ':' + else: + pre[collide[:j]] = collide + pre[id[:j]] = id + break + else: + pre[collide[:id_len+1]] = collide + pre[id] = id + else: + """ no collision, can safely add """ + pre[prefix] = id + pre = dict(zip(pre.values(),pre.keys())) + if ':' in pre: + del pre[':'] + return pre class TaskDict(object):