hg.stevelosh.com > t

--- a/prefixcheck.py	Sat Jun 12 15:47:19 2010 -0700
+++ b/prefixcheck.py	Sun Oct 03 17:45:09 2010 -0400
@@ -1,153 +1,106 @@
-'''
-Test the prefix generating method of t vs my bug program
-
-Created on Jun 11, 2010
-
-@author: Michael Diamond
-'''
-
-import math,random,hashlib,cProfile
-
-#
-# Initial t Prefix
-#
-def t_prefixes(ids):
-    """Return a mapping of ids to prefixes.
-
-    Each prefix will be the shortest possible substring of the ID that
-    can uniquely identify it among the given group of IDs.
-
-    If an ID of one task is entirely a substring of another task's ID, the
-    entire ID will be the prefix.
-
-    """
-    prefixes = {}
-    for task_id in ids:
-        others = set(ids).difference([task_id])
-        for i in range(1, len(task_id)+1):
-            prefix = task_id[:i]
-            if not any(map(lambda o: o.startswith(prefix), others)):
-                prefixes[task_id] = prefix
-                break
-            prefixes[task_id] = task_id
-    return prefixes
-
-#
-# New t Prefix
-#
-def new_prefixes(ids):
-    """Return a mapping of ids to prefixes in O(n) time.
-
-    This is much faster than the naitive t function, which
-    takes O(n^2) time.
-
-    Each prefix will be the shortest possible substring of the ID that
-    can uniquely identify it among the given group of IDs.
-
-    If an ID of one task is entirely a substring of another task's ID, the
-    entire ID will be the prefix.
-    """
-    pre = {}
-    for id in ids:
-        #print(id)
-        id_len = len(id)
-        for i in range(1, id_len+1):
-            """ identifies an empty prefix slot, or a singular collision """
-            prefix = id[:i]
-            #print (pre)
-            #print (prefix)
-            if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
-                break
-        #print (prefix)
-        #print ("--")
-        if prefix in pre:
-            """ if there is a collision """
-            collide = pre[prefix]
-            for j in range(i,id_len+1):
-                if collide[:j] == id[:j]:
-                    pre[id[:j]] = ':'
-                else:
-                    pre[collide[:j]] = collide
-                    pre[id[:j]] = id
-                    break
-            else:
-                pre[collide[:id_len+1]] = collide
-                pre[id] = id
-        else:
-            """ no collision, can safely add """
-            pre[prefix] = id
-        #print("Additional")
-        #print(pre)
-        #print("\n***\n")
-    pre = dict(zip(pre.values(),pre.keys()))
-    if ':' in pre:
-        del pre[':']
-    return pre
-
-#
-# Test the prefix methods
-#
-def _hash(text):
-    """ Return a hash of the given text for use as a testing id """
-    return hashlib.sha1(text.encode('utf-8')).hexdigest()
-
-def _gen_ids(num, substr=.4):
-    """ Generates a list of hashes of size num to be used as ids.
-    substr specifies the percentage of ids that should be substrings
-    of other ids """
-    out = []
-    alphabet = 'abcdefghijklmnopqrstuvwxyz'
-    min = 5
-    max = 15
-    sub_prefix = int(math.floor(num*substr))
-    num -= sub_prefix
-    for _ in range(num):
-        string = ''
-        for x in random.sample(alphabet,random.randint(min,max)):
-            string+=x
-        out.append(_hash(string))
-    for _ in range(sub_prefix):
-        str = random.choice(out)
-        if len(str) < 2:
-            sub_prefix += 1
-            continue
-        out.append(str[:random.randint(1,len(str)-1)])
-    random.shuffle(out)
-    return out
-
-def _check_prefixes(t_pre,new_pre):
-    """ Tests two prefix maps to confirm they are the same """
-    for k in t_pre:
-        if not k in new_pre:
-            raise Exception(k+" not in new.  Should be: "+new_pre[k])
-        if t_pre[k] != new_pre[k]:
-            raise Exception(k+" doesn't match.  t: "+t_pre[k]+" - new: "+new_pre[k])
-    for k in new_pre:
-        if not k in t_pre:
-            raise Exception(k+" not in t.  Expected: "+new_pre[k])
-        if new_pre[k] != t_pre[k]:
-            raise Exception(k+" doesn't match.  new: "+new_pre[k]+" - t: "+t_pre[k])
-    print("No errors found.  Both maps are identical.")
-
-def check_prefixes(size=1000):
-    ids = _gen_ids(size)
-    _check_prefixes(t_prefixes(ids),new_prefixes(ids))
-
-def speed_check(base=10,max_pow=7,size_limit=5000):
-    for i in range(max_pow):
-        size = int(math.pow(base,i))
-        print("-----\n\nSize: %s\n" % size)
-        globals()['ids'] = _gen_ids(size)
-        #print(ids)
-
-        if size <= size_limit:
-            print("t_prefixes()")
-            cProfile.run("t_prefixes(ids)")
-        else:
-            print("Too big to run t_prefixes() efficiently.  Would take several minutes.\n")
-
-        print("new_prefixes()")
-        cProfile.run("new_prefixes(ids)")
-
-#check_prefixes()
-speed_check()
\ No newline at end of file
+#!/usr/bin/env python
+'''
+Test the prefix generating method of t vs my bug program
+
+Created on Jun 11, 2010
+
+@author: Michael Diamond
+'''
+
+import math,random,hashlib,cProfile
+
+#
+# Initial t Prefix
+#
+def t_prefixes(ids):
+    """Return a mapping of ids to prefixes.
+
+    Each prefix will be the shortest possible substring of the ID that
+    can uniquely identify it among the given group of IDs.
+
+    If an ID of one task is entirely a substring of another task's ID, the
+    entire ID will be the prefix.
+
+    """
+    prefixes = {}
+    for task_id in ids:
+        others = set(ids).difference([task_id])
+        for i in range(1, len(task_id)+1):
+            prefix = task_id[:i]
+            if not any(map(lambda o: o.startswith(prefix), others)):
+                prefixes[task_id] = prefix
+                break
+            prefixes[task_id] = task_id
+    return prefixes
+
+#
+# New t Prefix
+#
+from t import _prefixes as new_prefixes
+#
+# Test the prefix methods
+#
+def _hash(text):
+    """ Return a hash of the given text for use as a testing id """
+    return hashlib.sha1(text.encode('utf-8')).hexdigest()
+
+def _gen_ids(num, substr=.4):
+    """ Generates a list of hashes of size num to be used as ids.
+    substr specifies the percentage of ids that should be substrings
+    of other ids """
+    out = []
+    alphabet = 'abcdefghijklmnopqrstuvwxyz'
+    min = 5
+    max = 15
+    sub_prefix = int(math.floor(num*substr))
+    num -= sub_prefix
+    for _ in range(num):
+        string = ''
+        for x in random.sample(alphabet,random.randint(min,max)):
+            string+=x
+        out.append(_hash(string))
+    for _ in range(sub_prefix):
+        str = random.choice(out)
+        if len(str) < 2:
+            sub_prefix += 1
+            continue
+        out.append(str[:random.randint(1,len(str)-1)])
+    random.shuffle(out)
+    return out
+
+def _check_prefixes(t_pre,new_pre):
+    """ Tests two prefix maps to confirm they are the same """
+    for k in t_pre:
+        if not k in new_pre:
+            raise Exception(k+" not in new.  Should be: "+new_pre[k])
+        if t_pre[k] != new_pre[k]:
+            raise Exception(k+" doesn't match.  t: "+t_pre[k]+" - new: "+new_pre[k])
+    for k in new_pre:
+        if not k in t_pre:
+            raise Exception(k+" not in t.  Expected: "+new_pre[k])
+        if new_pre[k] != t_pre[k]:
+            raise Exception(k+" doesn't match.  new: "+new_pre[k]+" - t: "+t_pre[k])
+    print("No errors found.  Both maps are identical.")
+
+def check_prefixes(size=1000):
+    ids = _gen_ids(size)
+    _check_prefixes(t_prefixes(ids),new_prefixes(ids))
+
+def speed_check(base=10,max_pow=4,size_limit=5000):
+    for i in range(max_pow):
+        size = int(math.pow(base,i))
+        print("-----\n\nSize: %s\n" % size)
+        globals()['ids'] = _gen_ids(size)
+        #print(ids)
+
+        if size <= size_limit:
+            print("t_prefixes()")
+            cProfile.run("t_prefixes(ids)")
+        else:
+            print("Too big to run t_prefixes() efficiently.  Would take several minutes.\n")
+
+        print("new_prefixes()")
+        cProfile.run("new_prefixes(ids)")
+
+check_prefixes()
+speed_check()
--- a/t.py	Sat Jun 12 15:47:19 2010 -0700
+++ b/t.py	Sun Oct 03 17:45:09 2010 -0400
@@ -89,34 +89,34 @@
     If an ID of one task is entirely a substring of another task's ID, the
     entire ID will be the prefix.
     """
-    pre = {}
+    ps = {}
     for id in ids:
         id_len = len(id)
         for i in range(1, id_len+1):
-            """ identifies an empty prefix slot, or a singular collision """
+            # identifies an empty prefix slot, or a singular collision
             prefix = id[:i]
-            if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
+            if (not prefix in ps) or (ps[prefix] and prefix != ps[prefix]):
                 break
-        if prefix in pre:
-            """ if there is a collision """
-            collide = pre[prefix]
-            for j in range(i,id_len+1):
-                if collide[:j] == id[:j]:
-                    pre[id[:j]] = ':'
+        if prefix in ps:
+            # if there is a collision
+            other_id = ps[prefix]
+            for j in range(i, id_len+1):
+                if other_id[:j] == id[:j]:
+                    ps[id[:j]] = ''
                 else:
-                    pre[collide[:j]] = collide
-                    pre[id[:j]] = id
+                    ps[other_id[:j]] = other_id
+                    ps[id[:j]] = id
                     break
             else:
-                pre[collide[:id_len+1]] = collide
-                pre[id] = id
+                ps[other_id[:id_len+1]] = other_id
+                ps[id] = id
         else:
-            """ no collision, can safely add """
-            pre[prefix] = id
-    pre = dict(zip(pre.values(),pre.keys()))
-    if ':' in pre:
-        del pre[':']
-    return pre
+            # no collision, can safely add
+            ps[prefix] = id
+    ps = dict(zip(ps.values(), ps.keys()))
+    if '' in ps:
+        del ps['']
+    return ps


 class TaskDict(object):
author	Steve Losh <steve@dwaiter.com>
date	Sun, 03 Oct 2010 17:45:09 -0400
parents	9d004bdf5b1d
children	887b6b2b7564
branches/tags	(none)
files	prefixcheck.py t.py