# HG changeset patch # User Steve Losh # Date 1286142309 14400 # Node ID 21a4c758ca737df4be61a874ff7aebe194a7a1b4 # Parent 9d004bdf5b1da5aac2f8c24433c6b5169cef6d40 Style changes. diff -r 9d004bdf5b1d -r 21a4c758ca73 prefixcheck.py --- a/prefixcheck.py Sat Jun 12 15:47:19 2010 -0700 +++ b/prefixcheck.py Sun Oct 03 17:45:09 2010 -0400 @@ -1,153 +1,106 @@ -''' -Test the prefix generating method of t vs my bug program - -Created on Jun 11, 2010 - -@author: Michael Diamond -''' - -import math,random,hashlib,cProfile - -# -# Initial t Prefix -# -def t_prefixes(ids): - """Return a mapping of ids to prefixes. - - Each prefix will be the shortest possible substring of the ID that - can uniquely identify it among the given group of IDs. - - If an ID of one task is entirely a substring of another task's ID, the - entire ID will be the prefix. - - """ - prefixes = {} - for task_id in ids: - others = set(ids).difference([task_id]) - for i in range(1, len(task_id)+1): - prefix = task_id[:i] - if not any(map(lambda o: o.startswith(prefix), others)): - prefixes[task_id] = prefix - break - prefixes[task_id] = task_id - return prefixes - -# -# New t Prefix -# -def new_prefixes(ids): - """Return a mapping of ids to prefixes in O(n) time. - - This is much faster than the naitive t function, which - takes O(n^2) time. - - Each prefix will be the shortest possible substring of the ID that - can uniquely identify it among the given group of IDs. - - If an ID of one task is entirely a substring of another task's ID, the - entire ID will be the prefix. - """ - pre = {} - for id in ids: - #print(id) - id_len = len(id) - for i in range(1, id_len+1): - """ identifies an empty prefix slot, or a singular collision """ - prefix = id[:i] - #print (pre) - #print (prefix) - if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]): - break - #print (prefix) - #print ("--") - if prefix in pre: - """ if there is a collision """ - collide = pre[prefix] - for j in range(i,id_len+1): - if collide[:j] == id[:j]: - pre[id[:j]] = ':' - else: - pre[collide[:j]] = collide - pre[id[:j]] = id - break - else: - pre[collide[:id_len+1]] = collide - pre[id] = id - else: - """ no collision, can safely add """ - pre[prefix] = id - #print("Additional") - #print(pre) - #print("\n***\n") - pre = dict(zip(pre.values(),pre.keys())) - if ':' in pre: - del pre[':'] - return pre - -# -# Test the prefix methods -# -def _hash(text): - """ Return a hash of the given text for use as a testing id """ - return hashlib.sha1(text.encode('utf-8')).hexdigest() - -def _gen_ids(num, substr=.4): - """ Generates a list of hashes of size num to be used as ids. - substr specifies the percentage of ids that should be substrings - of other ids """ - out = [] - alphabet = 'abcdefghijklmnopqrstuvwxyz' - min = 5 - max = 15 - sub_prefix = int(math.floor(num*substr)) - num -= sub_prefix - for _ in range(num): - string = '' - for x in random.sample(alphabet,random.randint(min,max)): - string+=x - out.append(_hash(string)) - for _ in range(sub_prefix): - str = random.choice(out) - if len(str) < 2: - sub_prefix += 1 - continue - out.append(str[:random.randint(1,len(str)-1)]) - random.shuffle(out) - return out - -def _check_prefixes(t_pre,new_pre): - """ Tests two prefix maps to confirm they are the same """ - for k in t_pre: - if not k in new_pre: - raise Exception(k+" not in new. Should be: "+new_pre[k]) - if t_pre[k] != new_pre[k]: - raise Exception(k+" doesn't match. t: "+t_pre[k]+" - new: "+new_pre[k]) - for k in new_pre: - if not k in t_pre: - raise Exception(k+" not in t. Expected: "+new_pre[k]) - if new_pre[k] != t_pre[k]: - raise Exception(k+" doesn't match. new: "+new_pre[k]+" - t: "+t_pre[k]) - print("No errors found. Both maps are identical.") - -def check_prefixes(size=1000): - ids = _gen_ids(size) - _check_prefixes(t_prefixes(ids),new_prefixes(ids)) - -def speed_check(base=10,max_pow=7,size_limit=5000): - for i in range(max_pow): - size = int(math.pow(base,i)) - print("-----\n\nSize: %s\n" % size) - globals()['ids'] = _gen_ids(size) - #print(ids) - - if size <= size_limit: - print("t_prefixes()") - cProfile.run("t_prefixes(ids)") - else: - print("Too big to run t_prefixes() efficiently. Would take several minutes.\n") - - print("new_prefixes()") - cProfile.run("new_prefixes(ids)") - -#check_prefixes() -speed_check() \ No newline at end of file +#!/usr/bin/env python +''' +Test the prefix generating method of t vs my bug program + +Created on Jun 11, 2010 + +@author: Michael Diamond +''' + +import math,random,hashlib,cProfile + +# +# Initial t Prefix +# +def t_prefixes(ids): + """Return a mapping of ids to prefixes. + + Each prefix will be the shortest possible substring of the ID that + can uniquely identify it among the given group of IDs. + + If an ID of one task is entirely a substring of another task's ID, the + entire ID will be the prefix. + + """ + prefixes = {} + for task_id in ids: + others = set(ids).difference([task_id]) + for i in range(1, len(task_id)+1): + prefix = task_id[:i] + if not any(map(lambda o: o.startswith(prefix), others)): + prefixes[task_id] = prefix + break + prefixes[task_id] = task_id + return prefixes + +# +# New t Prefix +# +from t import _prefixes as new_prefixes +# +# Test the prefix methods +# +def _hash(text): + """ Return a hash of the given text for use as a testing id """ + return hashlib.sha1(text.encode('utf-8')).hexdigest() + +def _gen_ids(num, substr=.4): + """ Generates a list of hashes of size num to be used as ids. + substr specifies the percentage of ids that should be substrings + of other ids """ + out = [] + alphabet = 'abcdefghijklmnopqrstuvwxyz' + min = 5 + max = 15 + sub_prefix = int(math.floor(num*substr)) + num -= sub_prefix + for _ in range(num): + string = '' + for x in random.sample(alphabet,random.randint(min,max)): + string+=x + out.append(_hash(string)) + for _ in range(sub_prefix): + str = random.choice(out) + if len(str) < 2: + sub_prefix += 1 + continue + out.append(str[:random.randint(1,len(str)-1)]) + random.shuffle(out) + return out + +def _check_prefixes(t_pre,new_pre): + """ Tests two prefix maps to confirm they are the same """ + for k in t_pre: + if not k in new_pre: + raise Exception(k+" not in new. Should be: "+new_pre[k]) + if t_pre[k] != new_pre[k]: + raise Exception(k+" doesn't match. t: "+t_pre[k]+" - new: "+new_pre[k]) + for k in new_pre: + if not k in t_pre: + raise Exception(k+" not in t. Expected: "+new_pre[k]) + if new_pre[k] != t_pre[k]: + raise Exception(k+" doesn't match. new: "+new_pre[k]+" - t: "+t_pre[k]) + print("No errors found. Both maps are identical.") + +def check_prefixes(size=1000): + ids = _gen_ids(size) + _check_prefixes(t_prefixes(ids),new_prefixes(ids)) + +def speed_check(base=10,max_pow=4,size_limit=5000): + for i in range(max_pow): + size = int(math.pow(base,i)) + print("-----\n\nSize: %s\n" % size) + globals()['ids'] = _gen_ids(size) + #print(ids) + + if size <= size_limit: + print("t_prefixes()") + cProfile.run("t_prefixes(ids)") + else: + print("Too big to run t_prefixes() efficiently. Would take several minutes.\n") + + print("new_prefixes()") + cProfile.run("new_prefixes(ids)") + +check_prefixes() +speed_check() diff -r 9d004bdf5b1d -r 21a4c758ca73 t.py --- a/t.py Sat Jun 12 15:47:19 2010 -0700 +++ b/t.py Sun Oct 03 17:45:09 2010 -0400 @@ -89,34 +89,34 @@ If an ID of one task is entirely a substring of another task's ID, the entire ID will be the prefix. """ - pre = {} + ps = {} for id in ids: id_len = len(id) for i in range(1, id_len+1): - """ identifies an empty prefix slot, or a singular collision """ + # identifies an empty prefix slot, or a singular collision prefix = id[:i] - if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]): + if (not prefix in ps) or (ps[prefix] and prefix != ps[prefix]): break - if prefix in pre: - """ if there is a collision """ - collide = pre[prefix] - for j in range(i,id_len+1): - if collide[:j] == id[:j]: - pre[id[:j]] = ':' + if prefix in ps: + # if there is a collision + other_id = ps[prefix] + for j in range(i, id_len+1): + if other_id[:j] == id[:j]: + ps[id[:j]] = '' else: - pre[collide[:j]] = collide - pre[id[:j]] = id + ps[other_id[:j]] = other_id + ps[id[:j]] = id break else: - pre[collide[:id_len+1]] = collide - pre[id] = id + ps[other_id[:id_len+1]] = other_id + ps[id] = id else: - """ no collision, can safely add """ - pre[prefix] = id - pre = dict(zip(pre.values(),pre.keys())) - if ':' in pre: - del pre[':'] - return pre + # no collision, can safely add + ps[prefix] = id + ps = dict(zip(ps.values(), ps.keys())) + if '' in ps: + del ps[''] + return ps class TaskDict(object):