9d004bdf5b1d

Improved the _prefix() method of t, and added a test utility to confirm the new method works and runs faster - the same code on Python 3.1 runs much faster.
https://bitbucket.org/dimo414/t-3.1/
[view raw] [browse files]
author Michael Diamond@Artemis <dimo414@gmail.com>
date Sat, 12 Jun 2010 15:47:19 -0700
parents c00fff9956e6
children 21a4c758ca73
branches/tags (none)
files SpeedResults.txt prefixcheck.py t.py

Changes

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SpeedResults.txt	Sat Jun 12 15:47:19 2010 -0700
@@ -0,0 +1,223 @@
+These are the cProfile results of running the original prefix function
+against the new prefix function.  The original function ran in ~O(n^2)
+time, the new one runs in ~O(n) time.
+
+You can also run prefixcheck.check_prefixes() to confirm there is no
+regression from original to new.
+
+It is worth noting that 
+
+-----
+
+Size: 1
+
+t_prefixes()
+         8 function calls in 0.000 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
+        1    0.000    0.000    0.000    0.000 prefixcheck.py:14(t_prefixes)
+        1    0.000    0.000    0.000    0.000 {any}
+        1    0.000    0.000    0.000    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {map}
+        1    0.000    0.000    0.000    0.000 {method 'difference' of 'set' objects}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.000    0.000    0.000    0.000 {range}
+
+
+new_prefixes()
+         8 function calls in 0.000 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
+        1    0.000    0.000    0.000    0.000 prefixcheck.py:38(new_prefixes)
+        1    0.000    0.000    0.000    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
+        1    0.000    0.000    0.000    0.000 {method 'values' of 'dict' objects}
+        1    0.000    0.000    0.000    0.000 {range}
+        1    0.000    0.000    0.000    0.000 {zip}
+
+
+-----
+
+Size: 10
+
+t_prefixes()
+         2973 function calls in 0.009 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.009    0.009 <string>:1(<module>)
+        1    0.001    0.001    0.009    0.009 prefixcheck.py:14(t_prefixes)
+     1323    0.004    0.000    0.006    0.000 prefixcheck.py:29(<lambda>)
+      147    0.000    0.000    0.000    0.000 {any}
+       10    0.000    0.000    0.000    0.000 {len}
+      147    0.002    0.000    0.008    0.000 {map}
+       10    0.000    0.000    0.000    0.000 {method 'difference' of 'set' objects}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+     1323    0.002    0.000    0.002    0.000 {method 'startswith' of 'str' objects}
+       10    0.000    0.000    0.000    0.000 {range}
+
+
+new_prefixes()
+         30 function calls in 0.000 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.000    0.000 <string>:1(<module>)
+        1    0.000    0.000    0.000    0.000 prefixcheck.py:38(new_prefixes)
+       10    0.000    0.000    0.000    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
+        1    0.000    0.000    0.000    0.000 {method 'values' of 'dict' objects}
+       14    0.000    0.000    0.000    0.000 {range}
+        1    0.000    0.000    0.000    0.000 {zip}
+
+
+-----
+
+Size: 100
+
+t_prefixes()
+         265303 function calls in 0.831 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.831    0.831 <string>:1(<module>)
+        1    0.009    0.009    0.830    0.830 prefixcheck.py:14(t_prefixes)
+   131175    0.397    0.000    0.597    0.000 prefixcheck.py:29(<lambda>)
+     1325    0.003    0.000    0.003    0.000 {any}
+      100    0.000    0.000    0.000    0.000 {len}
+     1325    0.221    0.000    0.817    0.001 {map}
+      100    0.001    0.000    0.001    0.000 {method 'difference' of 'set' objects}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+   131175    0.200    0.000    0.200    0.000 {method 'startswith' of 'str' objects}
+      100    0.000    0.000    0.000    0.000 {range}
+
+
+new_prefixes()
+         257 function calls in 0.002 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.002    0.002 <string>:1(<module>)
+        1    0.001    0.001    0.002    0.002 prefixcheck.py:38(new_prefixes)
+      100    0.000    0.000    0.000    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
+        1    0.000    0.000    0.000    0.000 {method 'values' of 'dict' objects}
+      151    0.000    0.000    0.000    0.000 {range}
+        1    0.000    0.000    0.000    0.000 {zip}
+
+
+-----
+
+Size: 1000
+
+t_prefixes()
+         25827686 function calls in 79.873 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000   79.873   79.873 <string>:1(<module>)
+        1    0.248    0.248   79.873   79.873 prefixcheck.py:14(t_prefixes)
+ 12899060   38.676    0.000   58.231    0.000 prefixcheck.py:29(<lambda>)
+    13298    0.073    0.000    0.073    0.000 {any}
+      989    0.002    0.000    0.002    0.000 {len}
+    13298   21.266    0.002   79.497    0.006 {map}
+      989    0.050    0.000    0.050    0.000 {method 'difference' of 'set' objects}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+ 12899060   19.555    0.000   19.555    0.000 {method 'startswith' of 'str' objects}
+      989    0.003    0.000    0.003    0.000 {range}
+
+
+new_prefixes()
+         2534 function calls in 0.019 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.000    0.000    0.019    0.019 <string>:1(<module>)
+        1    0.011    0.011    0.019    0.019 prefixcheck.py:38(new_prefixes)
+      989    0.001    0.000    0.001    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.000    0.000    0.000    0.000 {method 'keys' of 'dict' objects}
+        1    0.000    0.000    0.000    0.000 {method 'values' of 'dict' objects}
+     1539    0.003    0.000    0.003    0.000 {range}
+        1    0.003    0.003    0.003    0.003 {zip}
+
+
+-----
+
+Size: 10000
+
+Too big to run t_prefixes() efficiently.  Would take several minutes.
+
+new_prefixes()
+         25448 function calls in 0.212 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.001    0.001    0.212    0.212 <string>:1(<module>)
+        1    0.125    0.125    0.210    0.210 prefixcheck.py:38(new_prefixes)
+     9957    0.014    0.000    0.014    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.003    0.003    0.003    0.003 {method 'keys' of 'dict' objects}
+        1    0.001    0.001    0.001    0.001 {method 'values' of 'dict' objects}
+    15485    0.029    0.000    0.029    0.000 {range}
+        1    0.037    0.037    0.037    0.037 {zip}
+
+
+-----
+
+Size: 100000
+
+Too big to run t_prefixes() efficiently.  Would take several minutes.
+
+new_prefixes()
+         253838 function calls in 3.780 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.021    0.021    3.780    3.780 <string>:1(<module>)
+        1    1.490    1.490    3.759    3.759 prefixcheck.py:38(new_prefixes)
+    99615    0.139    0.000    0.139    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.041    0.041    0.041    0.041 {method 'keys' of 'dict' objects}
+        1    0.016    0.016    0.016    0.016 {method 'values' of 'dict' objects}
+   154217    0.274    0.000    0.274    0.000 {range}
+        1    1.798    1.798    1.798    1.798 {zip}
+
+
+-----
+
+Size: 1000000
+
+Too big to run t_prefixes() efficiently.  Would take several minutes.
+
+new_prefixes()
+         2530533 function calls in 151.824 CPU seconds
+
+   Ordered by: standard name
+
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.256    0.256  151.824  151.824 <string>:1(<module>)
+        1   16.424   16.424  151.569  151.569 prefixcheck.py:38(new_prefixes)
+   996118    1.388    0.000    1.388    0.000 {len}
+        1    0.000    0.000    0.000    0.000 {method 'disable' of '_lsprof.Profiler' objects}
+        1    0.397    0.397    0.397    0.397 {method 'keys' of 'dict' objects}
+        1    0.159    0.159    0.159    0.159 {method 'values' of 'dict' objects}
+  1534409    2.731    0.000    2.731    0.000 {range}
+        1  130.471  130.471  130.471  130.471 {zip}
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/prefixcheck.py	Sat Jun 12 15:47:19 2010 -0700
@@ -0,0 +1,153 @@
+'''
+Test the prefix generating method of t vs my bug program
+
+Created on Jun 11, 2010
+
+@author: Michael Diamond
+'''
+
+import math,random,hashlib,cProfile
+
+#
+# Initial t Prefix
+#
+def t_prefixes(ids):
+    """Return a mapping of ids to prefixes.
+    
+    Each prefix will be the shortest possible substring of the ID that
+    can uniquely identify it among the given group of IDs.
+    
+    If an ID of one task is entirely a substring of another task's ID, the
+    entire ID will be the prefix.
+    
+    """
+    prefixes = {}
+    for task_id in ids:
+        others = set(ids).difference([task_id])
+        for i in range(1, len(task_id)+1):
+            prefix = task_id[:i]
+            if not any(map(lambda o: o.startswith(prefix), others)):
+                prefixes[task_id] = prefix
+                break
+            prefixes[task_id] = task_id
+    return prefixes
+
+#
+# New t Prefix
+#
+def new_prefixes(ids):
+    """Return a mapping of ids to prefixes in O(n) time.
+    
+    This is much faster than the naitive t function, which
+    takes O(n^2) time.
+    
+    Each prefix will be the shortest possible substring of the ID that
+    can uniquely identify it among the given group of IDs.
+    
+    If an ID of one task is entirely a substring of another task's ID, the
+    entire ID will be the prefix.
+    """
+    pre = {}
+    for id in ids:
+        #print(id)
+        id_len = len(id)
+        for i in range(1, id_len+1):
+            """ identifies an empty prefix slot, or a singular collision """
+            prefix = id[:i]
+            #print (pre)
+            #print (prefix)
+            if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
+                break
+        #print (prefix)
+        #print ("--")
+        if prefix in pre:
+            """ if there is a collision """
+            collide = pre[prefix]
+            for j in range(i,id_len+1):
+                if collide[:j] == id[:j]:
+                    pre[id[:j]] = ':'
+                else:
+                    pre[collide[:j]] = collide
+                    pre[id[:j]] = id
+                    break
+            else:
+                pre[collide[:id_len+1]] = collide
+                pre[id] = id
+        else:
+            """ no collision, can safely add """
+            pre[prefix] = id
+        #print("Additional")
+        #print(pre)
+        #print("\n***\n")
+    pre = dict(zip(pre.values(),pre.keys()))
+    if ':' in pre:
+        del pre[':']
+    return pre
+
+#
+# Test the prefix methods
+#
+def _hash(text):
+    """ Return a hash of the given text for use as a testing id """
+    return hashlib.sha1(text.encode('utf-8')).hexdigest()
+
+def _gen_ids(num, substr=.4):
+    """ Generates a list of hashes of size num to be used as ids.
+    substr specifies the percentage of ids that should be substrings
+    of other ids """
+    out = []
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+    min = 5
+    max = 15
+    sub_prefix = int(math.floor(num*substr))
+    num -= sub_prefix
+    for _ in range(num):
+        string = ''
+        for x in random.sample(alphabet,random.randint(min,max)):
+            string+=x
+        out.append(_hash(string))
+    for _ in range(sub_prefix):
+        str = random.choice(out)
+        if len(str) < 2:
+            sub_prefix += 1
+            continue
+        out.append(str[:random.randint(1,len(str)-1)])
+    random.shuffle(out)
+    return out
+
+def _check_prefixes(t_pre,new_pre):
+    """ Tests two prefix maps to confirm they are the same """
+    for k in t_pre:
+        if not k in new_pre:
+            raise Exception(k+" not in new.  Should be: "+new_pre[k])
+        if t_pre[k] != new_pre[k]:
+            raise Exception(k+" doesn't match.  t: "+t_pre[k]+" - new: "+new_pre[k])
+    for k in new_pre:
+        if not k in t_pre:
+            raise Exception(k+" not in t.  Expected: "+new_pre[k])
+        if new_pre[k] != t_pre[k]:
+            raise Exception(k+" doesn't match.  new: "+new_pre[k]+" - t: "+t_pre[k])
+    print("No errors found.  Both maps are identical.")
+    
+def check_prefixes(size=1000):
+    ids = _gen_ids(size)
+    _check_prefixes(t_prefixes(ids),new_prefixes(ids))
+
+def speed_check(base=10,max_pow=7,size_limit=5000):
+    for i in range(max_pow):
+        size = int(math.pow(base,i))
+        print("-----\n\nSize: %s\n" % size)
+        globals()['ids'] = _gen_ids(size)
+        #print(ids)
+        
+        if size <= size_limit:
+            print("t_prefixes()")
+            cProfile.run("t_prefixes(ids)")
+        else:
+            print("Too big to run t_prefixes() efficiently.  Would take several minutes.\n")
+            
+        print("new_prefixes()")
+        cProfile.run("new_prefixes(ids)")
+
+#check_prefixes()
+speed_check()
\ No newline at end of file
--- a/t.py	Wed Jan 20 20:55:17 2010 -0500
+++ b/t.py	Sat Jun 12 15:47:19 2010 -0700
@@ -78,25 +78,45 @@
     return tasklines
 
 def _prefixes(ids):
-    """Return a mapping of ids to prefixes.
+    """Return a mapping of ids to prefixes in O(n) time.
+    
+    This is much faster than the naitive t function, which
+    takes O(n^2) time.
     
     Each prefix will be the shortest possible substring of the ID that
     can uniquely identify it among the given group of IDs.
     
     If an ID of one task is entirely a substring of another task's ID, the
     entire ID will be the prefix.
-    
     """
-    prefixes = {}
-    for task_id in ids:
-        others = set(ids).difference([task_id])
-        for i in range(1, len(task_id)+1):
-            prefix = task_id[:i]
-            if not any(map(lambda o: o.startswith(prefix), others)):
-                prefixes[task_id] = prefix
+    pre = {}
+    for id in ids:
+        id_len = len(id)
+        for i in range(1, id_len+1):
+            """ identifies an empty prefix slot, or a singular collision """
+            prefix = id[:i]
+            if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
                 break
-            prefixes[task_id] = task_id
-    return prefixes
+        if prefix in pre:
+            """ if there is a collision """
+            collide = pre[prefix]
+            for j in range(i,id_len+1):
+                if collide[:j] == id[:j]:
+                    pre[id[:j]] = ':'
+                else:
+                    pre[collide[:j]] = collide
+                    pre[id[:j]] = id
+                    break
+            else:
+                pre[collide[:id_len+1]] = collide
+                pre[id] = id
+        else:
+            """ no collision, can safely add """
+            pre[prefix] = id
+    pre = dict(zip(pre.values(),pre.keys()))
+    if ':' in pre:
+        del pre[':']
+    return pre
 
 
 class TaskDict(object):