prefixcheck.py @ 9d004bdf5b1d

Improved the _prefix() method of t, and added a test utility to confirm the new method works and runs faster - the same code on Python 3.1 runs much faster.
https://bitbucket.org/dimo414/t-3.1/
author Michael Diamond@Artemis <dimo414@gmail.com>
date Sat, 12 Jun 2010 15:47:19 -0700
parents (none)
children 21a4c758ca73
'''
Test the prefix generating method of t vs my bug program

Created on Jun 11, 2010

@author: Michael Diamond
'''

import math,random,hashlib,cProfile

#
# Initial t Prefix
#
def t_prefixes(ids):
    """Return a mapping of ids to prefixes.
    
    Each prefix will be the shortest possible substring of the ID that
    can uniquely identify it among the given group of IDs.
    
    If an ID of one task is entirely a substring of another task's ID, the
    entire ID will be the prefix.
    
    """
    prefixes = {}
    for task_id in ids:
        others = set(ids).difference([task_id])
        for i in range(1, len(task_id)+1):
            prefix = task_id[:i]
            if not any(map(lambda o: o.startswith(prefix), others)):
                prefixes[task_id] = prefix
                break
            prefixes[task_id] = task_id
    return prefixes

#
# New t Prefix
#
def new_prefixes(ids):
    """Return a mapping of ids to prefixes in O(n) time.
    
    This is much faster than the naitive t function, which
    takes O(n^2) time.
    
    Each prefix will be the shortest possible substring of the ID that
    can uniquely identify it among the given group of IDs.
    
    If an ID of one task is entirely a substring of another task's ID, the
    entire ID will be the prefix.
    """
    pre = {}
    for id in ids:
        #print(id)
        id_len = len(id)
        for i in range(1, id_len+1):
            """ identifies an empty prefix slot, or a singular collision """
            prefix = id[:i]
            #print (pre)
            #print (prefix)
            if (not prefix in pre) or (pre[prefix] != ':' and prefix != pre[prefix]):
                break
        #print (prefix)
        #print ("--")
        if prefix in pre:
            """ if there is a collision """
            collide = pre[prefix]
            for j in range(i,id_len+1):
                if collide[:j] == id[:j]:
                    pre[id[:j]] = ':'
                else:
                    pre[collide[:j]] = collide
                    pre[id[:j]] = id
                    break
            else:
                pre[collide[:id_len+1]] = collide
                pre[id] = id
        else:
            """ no collision, can safely add """
            pre[prefix] = id
        #print("Additional")
        #print(pre)
        #print("\n***\n")
    pre = dict(zip(pre.values(),pre.keys()))
    if ':' in pre:
        del pre[':']
    return pre

#
# Test the prefix methods
#
def _hash(text):
    """ Return a hash of the given text for use as a testing id """
    return hashlib.sha1(text.encode('utf-8')).hexdigest()

def _gen_ids(num, substr=.4):
    """ Generates a list of hashes of size num to be used as ids.
    substr specifies the percentage of ids that should be substrings
    of other ids """
    out = []
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    min = 5
    max = 15
    sub_prefix = int(math.floor(num*substr))
    num -= sub_prefix
    for _ in range(num):
        string = ''
        for x in random.sample(alphabet,random.randint(min,max)):
            string+=x
        out.append(_hash(string))
    for _ in range(sub_prefix):
        str = random.choice(out)
        if len(str) < 2:
            sub_prefix += 1
            continue
        out.append(str[:random.randint(1,len(str)-1)])
    random.shuffle(out)
    return out

def _check_prefixes(t_pre,new_pre):
    """ Tests two prefix maps to confirm they are the same """
    for k in t_pre:
        if not k in new_pre:
            raise Exception(k+" not in new.  Should be: "+new_pre[k])
        if t_pre[k] != new_pre[k]:
            raise Exception(k+" doesn't match.  t: "+t_pre[k]+" - new: "+new_pre[k])
    for k in new_pre:
        if not k in t_pre:
            raise Exception(k+" not in t.  Expected: "+new_pre[k])
        if new_pre[k] != t_pre[k]:
            raise Exception(k+" doesn't match.  new: "+new_pre[k]+" - t: "+t_pre[k])
    print("No errors found.  Both maps are identical.")
    
def check_prefixes(size=1000):
    ids = _gen_ids(size)
    _check_prefixes(t_prefixes(ids),new_prefixes(ids))

def speed_check(base=10,max_pow=7,size_limit=5000):
    for i in range(max_pow):
        size = int(math.pow(base,i))
        print("-----\n\nSize: %s\n" % size)
        globals()['ids'] = _gen_ids(size)
        #print(ids)
        
        if size <= size_limit:
            print("t_prefixes()")
            cProfile.run("t_prefixes(ids)")
        else:
            print("Too big to run t_prefixes() efficiently.  Would take several minutes.\n")
            
        print("new_prefixes()")
        cProfile.run("new_prefixes(ids)")

#check_prefixes()
speed_check()