--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/SpeedResults.txt Sat Jun 12 15:47:19 2010 -0700
@@ -0,0 +1,223 @@
+These are the cProfile results of running the original prefix function
+against the new prefix function. The original function ran in ~O(n^2)
+time, the new one runs in ~O(n) time.
+
+You can also run prefixcheck.check_prefixes() to confirm there is no
+regression from original to new.
+
+It is worth noting that
+
+-----
+
+Size: 1
+
+t_prefixes()
+ 8 function calls in 0.000 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.000 0.000 <string>:1(<module>)
+ 1 0.000 0.000 0.000 0.000 prefixcheck.py:14(t_prefixes)
+ 1 0.000 0.000 0.000 0.000 {any}
+ 1 0.000 0.000 0.000 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {map}
+ 1 0.000 0.000 0.000 0.000 {method 'difference' of 'set' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.000 0.000 0.000 0.000 {range}
+
+
+new_prefixes()
+ 8 function calls in 0.000 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.000 0.000 <string>:1(<module>)
+ 1 0.000 0.000 0.000 0.000 prefixcheck.py:38(new_prefixes)
+ 1 0.000 0.000 0.000 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects}
+ 1 0.000 0.000 0.000 0.000 {range}
+ 1 0.000 0.000 0.000 0.000 {zip}
+
+
+-----
+
+Size: 10
+
+t_prefixes()
+ 2973 function calls in 0.009 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.009 0.009 <string>:1(<module>)
+ 1 0.001 0.001 0.009 0.009 prefixcheck.py:14(t_prefixes)
+ 1323 0.004 0.000 0.006 0.000 prefixcheck.py:29(<lambda>)
+ 147 0.000 0.000 0.000 0.000 {any}
+ 10 0.000 0.000 0.000 0.000 {len}
+ 147 0.002 0.000 0.008 0.000 {map}
+ 10 0.000 0.000 0.000 0.000 {method 'difference' of 'set' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1323 0.002 0.000 0.002 0.000 {method 'startswith' of 'str' objects}
+ 10 0.000 0.000 0.000 0.000 {range}
+
+
+new_prefixes()
+ 30 function calls in 0.000 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.000 0.000 <string>:1(<module>)
+ 1 0.000 0.000 0.000 0.000 prefixcheck.py:38(new_prefixes)
+ 10 0.000 0.000 0.000 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects}
+ 14 0.000 0.000 0.000 0.000 {range}
+ 1 0.000 0.000 0.000 0.000 {zip}
+
+
+-----
+
+Size: 100
+
+t_prefixes()
+ 265303 function calls in 0.831 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.831 0.831 <string>:1(<module>)
+ 1 0.009 0.009 0.830 0.830 prefixcheck.py:14(t_prefixes)
+ 131175 0.397 0.000 0.597 0.000 prefixcheck.py:29(<lambda>)
+ 1325 0.003 0.000 0.003 0.000 {any}
+ 100 0.000 0.000 0.000 0.000 {len}
+ 1325 0.221 0.000 0.817 0.001 {map}
+ 100 0.001 0.000 0.001 0.000 {method 'difference' of 'set' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 131175 0.200 0.000 0.200 0.000 {method 'startswith' of 'str' objects}
+ 100 0.000 0.000 0.000 0.000 {range}
+
+
+new_prefixes()
+ 257 function calls in 0.002 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.002 0.002 <string>:1(<module>)
+ 1 0.001 0.001 0.002 0.002 prefixcheck.py:38(new_prefixes)
+ 100 0.000 0.000 0.000 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects}
+ 151 0.000 0.000 0.000 0.000 {range}
+ 1 0.000 0.000 0.000 0.000 {zip}
+
+
+-----
+
+Size: 1000
+
+t_prefixes()
+ 25827686 function calls in 79.873 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 79.873 79.873 <string>:1(<module>)
+ 1 0.248 0.248 79.873 79.873 prefixcheck.py:14(t_prefixes)
+ 12899060 38.676 0.000 58.231 0.000 prefixcheck.py:29(<lambda>)
+ 13298 0.073 0.000 0.073 0.000 {any}
+ 989 0.002 0.000 0.002 0.000 {len}
+ 13298 21.266 0.002 79.497 0.006 {map}
+ 989 0.050 0.000 0.050 0.000 {method 'difference' of 'set' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 12899060 19.555 0.000 19.555 0.000 {method 'startswith' of 'str' objects}
+ 989 0.003 0.000 0.003 0.000 {range}
+
+
+new_prefixes()
+ 2534 function calls in 0.019 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.000 0.000 0.019 0.019 <string>:1(<module>)
+ 1 0.011 0.011 0.019 0.019 prefixcheck.py:38(new_prefixes)
+ 989 0.001 0.000 0.001 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'keys' of 'dict' objects}
+ 1 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects}
+ 1539 0.003 0.000 0.003 0.000 {range}
+ 1 0.003 0.003 0.003 0.003 {zip}
+
+
+-----
+
+Size: 10000
+
+Too big to run t_prefixes() efficiently. Would take several minutes.
+
+new_prefixes()
+ 25448 function calls in 0.212 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.001 0.001 0.212 0.212 <string>:1(<module>)
+ 1 0.125 0.125 0.210 0.210 prefixcheck.py:38(new_prefixes)
+ 9957 0.014 0.000 0.014 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.003 0.003 0.003 0.003 {method 'keys' of 'dict' objects}
+ 1 0.001 0.001 0.001 0.001 {method 'values' of 'dict' objects}
+ 15485 0.029 0.000 0.029 0.000 {range}
+ 1 0.037 0.037 0.037 0.037 {zip}
+
+
+-----
+
+Size: 100000
+
+Too big to run t_prefixes() efficiently. Would take several minutes.
+
+new_prefixes()
+ 253838 function calls in 3.780 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.021 0.021 3.780 3.780 <string>:1(<module>)
+ 1 1.490 1.490 3.759 3.759 prefixcheck.py:38(new_prefixes)
+ 99615 0.139 0.000 0.139 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.041 0.041 0.041 0.041 {method 'keys' of 'dict' objects}
+ 1 0.016 0.016 0.016 0.016 {method 'values' of 'dict' objects}
+ 154217 0.274 0.000 0.274 0.000 {range}
+ 1 1.798 1.798 1.798 1.798 {zip}
+
+
+-----
+
+Size: 1000000
+
+Too big to run t_prefixes() efficiently. Would take several minutes.
+
+new_prefixes()
+ 2530533 function calls in 151.824 CPU seconds
+
+ Ordered by: standard name
+
+ ncalls tottime percall cumtime percall filename:lineno(function)
+ 1 0.256 0.256 151.824 151.824 <string>:1(<module>)
+ 1 16.424 16.424 151.569 151.569 prefixcheck.py:38(new_prefixes)
+ 996118 1.388 0.000 1.388 0.000 {len}
+ 1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 1 0.397 0.397 0.397 0.397 {method 'keys' of 'dict' objects}
+ 1 0.159 0.159 0.159 0.159 {method 'values' of 'dict' objects}
+ 1534409 2.731 0.000 2.731 0.000 {range}
+ 1 130.471 130.471 130.471 130.471 {zip}
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prefixcheck.py Sat Jun 12 15:47:19 2010 -0700
@@ -0,0 +1,153 @@
+'''
+Test the prefix generating method of t vs my bug program
+
+Created on Jun 11, 2010
+
+@author: Michael Diamond
+'''
+
+import math,random,hashlib,cProfile
+
+#
+# Initial t Prefix
+#
+def t_prefixes(ids):
+ """Return a mapping of ids to prefixes.
+
+ Each prefix will be the shortest possible substring of the ID that
+ can uniquely identify it among the given group of IDs.
+
+ If an ID of one task is entirely a substring of another task's ID, the
+ entire ID will be the prefix.
+
+ """
+ prefixes = {}
+ for task_id in ids:
+ others = set(ids).difference([task_id])
+ for i in range(1, len(task_id)+1):
+ prefix = task_id[:i]
+ if not any(map(lambda o: o.startswith(prefix), others)):
+ prefixes[task_id] = prefix
+ break
+ prefixes[task_id] = task_id
+ return prefixes
+
+#
+# New t Prefix
+#
+def new_prefixes(ids):
+ """Return a mapping of ids to prefixes in O(n) time.
+
+ This is much faster than the naitive t function, which
+ takes O(n^2) time.
+
+ Each prefix will be the shortest possible substring of the ID that
+ can uniquely identify it among the given group of IDs.
+
+ If an ID of one task is entirely a substring of another task's ID, the
+ entire ID will be the prefix.
+ """
+ pre = {}
+ for id in ids:
+ #print(id)
+ id_len = len(id)
+ for i in range(1, id_len+1):
+ """ identifies an empty prefix slot, or a singular collision """
+ prefix = id[:i]
+ #print (pre)
+ #print (prefix)
+ if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
+ break
+ #print (prefix)
+ #print ("--")
+ if prefix in pre:
+ """ if there is a collision """
+ collide = pre[prefix]
+ for j in range(i,id_len+1):
+ if collide[:j] == id[:j]:
+ pre[id[:j]] = ':'
+ else:
+ pre[collide[:j]] = collide
+ pre[id[:j]] = id
+ break
+ else:
+ pre[collide[:id_len+1]] = collide
+ pre[id] = id
+ else:
+ """ no collision, can safely add """
+ pre[prefix] = id
+ #print("Additional")
+ #print(pre)
+ #print("\n***\n")
+ pre = dict(zip(pre.values(),pre.keys()))
+ if ':' in pre:
+ del pre[':']
+ return pre
+
+#
+# Test the prefix methods
+#
+def _hash(text):
+ """ Return a hash of the given text for use as a testing id """
+ return hashlib.sha1(text.encode('utf-8')).hexdigest()
+
+def _gen_ids(num, substr=.4):
+ """ Generates a list of hashes of size num to be used as ids.
+ substr specifies the percentage of ids that should be substrings
+ of other ids """
+ out = []
+ alphabet = 'abcdefghijklmnopqrstuvwxyz'
+ min = 5
+ max = 15
+ sub_prefix = int(math.floor(num*substr))
+ num -= sub_prefix
+ for _ in range(num):
+ string = ''
+ for x in random.sample(alphabet,random.randint(min,max)):
+ string+=x
+ out.append(_hash(string))
+ for _ in range(sub_prefix):
+ str = random.choice(out)
+ if len(str) < 2:
+ sub_prefix += 1
+ continue
+ out.append(str[:random.randint(1,len(str)-1)])
+ random.shuffle(out)
+ return out
+
+def _check_prefixes(t_pre,new_pre):
+ """ Tests two prefix maps to confirm they are the same """
+ for k in t_pre:
+ if not k in new_pre:
+ raise Exception(k+" not in new. Should be: "+new_pre[k])
+ if t_pre[k] != new_pre[k]:
+ raise Exception(k+" doesn't match. t: "+t_pre[k]+" - new: "+new_pre[k])
+ for k in new_pre:
+ if not k in t_pre:
+ raise Exception(k+" not in t. Expected: "+new_pre[k])
+ if new_pre[k] != t_pre[k]:
+ raise Exception(k+" doesn't match. new: "+new_pre[k]+" - t: "+t_pre[k])
+ print("No errors found. Both maps are identical.")
+
+def check_prefixes(size=1000):
+ ids = _gen_ids(size)
+ _check_prefixes(t_prefixes(ids),new_prefixes(ids))
+
+def speed_check(base=10,max_pow=7,size_limit=5000):
+ for i in range(max_pow):
+ size = int(math.pow(base,i))
+ print("-----\n\nSize: %s\n" % size)
+ globals()['ids'] = _gen_ids(size)
+ #print(ids)
+
+ if size <= size_limit:
+ print("t_prefixes()")
+ cProfile.run("t_prefixes(ids)")
+ else:
+ print("Too big to run t_prefixes() efficiently. Would take several minutes.\n")
+
+ print("new_prefixes()")
+ cProfile.run("new_prefixes(ids)")
+
+#check_prefixes()
+speed_check()
\ No newline at end of file
--- a/t.py Wed Jan 20 20:55:17 2010 -0500
+++ b/t.py Sat Jun 12 15:47:19 2010 -0700
@@ -78,25 +78,45 @@
return tasklines
def _prefixes(ids):
- """Return a mapping of ids to prefixes.
+ """Return a mapping of ids to prefixes in O(n) time.
+
+ This is much faster than the naitive t function, which
+ takes O(n^2) time.
Each prefix will be the shortest possible substring of the ID that
can uniquely identify it among the given group of IDs.
If an ID of one task is entirely a substring of another task's ID, the
entire ID will be the prefix.
-
"""
- prefixes = {}
- for task_id in ids:
- others = set(ids).difference([task_id])
- for i in range(1, len(task_id)+1):
- prefix = task_id[:i]
- if not any(map(lambda o: o.startswith(prefix), others)):
- prefixes[task_id] = prefix
+ pre = {}
+ for id in ids:
+ id_len = len(id)
+ for i in range(1, id_len+1):
+ """ identifies an empty prefix slot, or a singular collision """
+ prefix = id[:i]
+ if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
break
- prefixes[task_id] = task_id
- return prefixes
+ if prefix in pre:
+ """ if there is a collision """
+ collide = pre[prefix]
+ for j in range(i,id_len+1):
+ if collide[:j] == id[:j]:
+ pre[id[:j]] = ':'
+ else:
+ pre[collide[:j]] = collide
+ pre[id[:j]] = id
+ break
+ else:
+ pre[collide[:id_len+1]] = collide
+ pre[id] = id
+ else:
+ """ no collision, can safely add """
+ pre[prefix] = id
+ pre = dict(zip(pre.values(),pre.keys()))
+ if ':' in pre:
+ del pre[':']
+ return pre
class TaskDict(object):