mph-opcodes (3922B)
1 #!/usr/bin/env python3 2 # Easy Perfect Minimal Hashing 3 # By Steve Hanov. Released to the public domain. 4 # 5 # Based on: 6 # Edward A. Fox, Lenwood S. Heath, Qi Fan Chen and Amjad M. Daoud, 7 # "Practical minimal perfect hash functions for large databases", CACM, 35(1):105-121 8 # also a good reference: 9 # Compress, Hash, and Displace algorithm by Djamal Belazzougui, 10 # Fabiano C. Botelho, and Martin Dietzfelbinger 11 import sys 12 13 DICTIONARY = sys.argv[1] 14 15 # Calculates a distinct hash function for a given string. Each value of the 16 # integer d results in a different hash value. 17 def hash( d, str ): 18 if d == 0: d = 0x01000193 19 20 # Use the FNV algorithm from http://isthe.com/chongo/tech/comp/fnv/ 21 for c in str: 22 d = ( (d * 0x01000193) ^ ord(c) ) & 0xffffffff; 23 24 return d 25 26 # Computes a minimal perfect hash table using the given python dictionary. It 27 # returns a tuple (G, V). G and V are both arrays. G contains the intermediate 28 # table of values needed to compute the index of the value in V. V contains the 29 # values of the dictionary. 30 def CreateMinimalPerfectHash( dict ): 31 size = len(dict) 32 33 # Step 1: Place all of the keys into buckets 34 buckets = [ [] for i in range(size) ] 35 G = [0] * size 36 values = [None] * size 37 38 for key in dict.keys(): 39 buckets[hash(0, key) % size].append( key ) 40 41 # Step 2: Sort the buckets and process the ones with the most items first. 42 buckets.sort( key=len, reverse=True ) 43 for b in range( size ): 44 bucket = buckets[b] 45 if len(bucket) <= 1: break 46 47 d = 1 48 item = 0 49 slots = [] 50 51 # Repeatedly try different values of d until we find a hash function 52 # that places all items in the bucket into free slots 53 while item < len(bucket): 54 slot = hash( d, bucket[item] ) % size 55 if values[slot] != None or slot in slots: 56 d += 1 57 item = 0 58 slots = [] 59 else: 60 slots.append( slot ) 61 item += 1 62 63 G[hash(0, bucket[0]) % size] = d 64 for i in range(len(bucket)): 65 values[slots[i]] = dict[bucket[i]] 66 67 # Only buckets with 1 item remain. Process them more quickly by directly 68 # placing them into a free slot. Use a negative value of d to indicate 69 # this. 70 freelist = [] 71 for i in range(size): 72 if values[i] == None: freelist.append( i ) 73 74 for b in range( b, size ): 75 bucket = buckets[b] 76 if len(bucket) == 0: break 77 slot = freelist.pop() 78 # We subtract one to ensure it's negative even if the zeroeth slot was 79 # used. 80 G[hash(0, bucket[0]) % size] = -slot-1 81 values[slot] = dict[bucket[0]] 82 83 return (G, values) 84 85 # Look up a value in the hash table, defined by G and V. 86 def PerfectHashLookup( G, V, key ): 87 d = G[hash(0,key) % len(G)] 88 if d < 0: return V[-d-1] 89 return V[hash(d, key) % len(V)] 90 91 dict = {} 92 line = 1 93 keys = map(lambda x: x.strip(), open(DICTIONARY, "rt").readlines()) 94 for key in keys: 95 dict[key] = line 96 line += 1 97 98 (G, V) = CreateMinimalPerfectHash( dict ) 99 100 Gstrs = ",".join(map(str, G)) 101 Vstrs = ",".join(map(str, V)) 102 103 header = open('oplookup.h', 'w') 104 105 106 107 header.write("/* THIS IS GENERATED FROM mph-opcodes DO NOT EDIT */\n\n") 108 header.write("#ifndef BCS_OPLOOKUP_H\n") 109 header.write("#define BCS_OPLOOKUP_H\n\n") 110 111 header.write("#include \"misc.h\"\n") 112 header.write("extern const int opcodes_g[{}];\n".format(len(G))) 113 header.write("extern const int opcodes_v[{}];\n".format(len(V))) 114 header.write("\n#endif /* BCS_OPLOOKUP_H */\n") 115 header.close() 116 117 cfile = open('oplookup.c', 'w') 118 119 cfile.write("/* THIS IS GENERATED FROM mph-opcodes DO NOT EDIT */\n\n") 120 cfile.write("#include \"oplookup.h\"\n".format(Gstrs)) 121 cfile.write("const int opcodes_g[{}] = {{ {} }};\n".format(len(G), Gstrs)) 122 cfile.write("const int opcodes_v[{}] = {{ {} }};\n".format(len(V), Vstrs)) 123 cfile.close()