Source code for arsenal.integerizer

[docs]class AbstractIntegerizer: def __getitem__(self, i): if isinstance(i, list): return [self._list[ii] for ii in i] return self._list[i] def __call__(self, k): if isinstance(k, list): ks = k return [self._add(k) for k in ks] return self._add(k) def _add(self, k): raise NotImplementedError() # method aliases encode = __call__ decode = __getitem__ lookup = __getitem__ add = __call__
[docs]class Integerizer(AbstractIntegerizer): """ Class for maintaining a perfect hash for a set of keys. >>> a = Integerizer() >>> a(list('abcd')) [0, 1, 2, 3] >>> a(list('ace')) [0, 2, 4] >>> a[list(range(4))] ['a', 'b', 'c', 'd'] >>> a.freeze() >>> a.add('z') Traceback (most recent call last): ... ValueError: Alphabet is frozen. Key "z" not found. >>> list(a) ['a', 'b', 'c', 'd', 'e'] """ def __init__(self, data=None): self._map = {} # str -> int self._list = [] # int -> str self._frozen = False if data: self.add(data) def _add(self, k): try: return self._map[k] except KeyError: #if not isinstance(k, basestring): # raise ValueError("Invalid key (%s): only strings allowed." % (k,)) if self._frozen: raise ValueError('Alphabet is frozen. Key "%s" not found.' % (k,)) x = self._map[k] = len(self._list) self._list.append(k) return x def __contains__(self, k): return k in self._map def __iter__(self): return iter(self._list) def __len__(self): return len(self._list) def __eq__(self, other): return self._list == other._list def __repr__(self): return 'Alphabet(size=%s,frozen=%s)' % (len(self), self._frozen)
[docs] def freeze(self): self._frozen = True return self
[docs] def keys(self): return self._list
[docs] def items(self): return self._map.items()
[docs]def jenkins32(a): assert isinstance(a, int) and a >= 0 a = (a+0x7ed55d16) + (a<<12) a = (a^0xc761c23c) ^ (a>>19) a = (a+0x165667b1) + (a<<5) a = (a+0xd3a2646c) ^ (a<<9) a = (a+0xfd7046c5) + (a<<3) a = (a^0xb55a4f09) ^ (a>>16) return a
# TODO: might be good to write in Cython and provide a few fast hash functions # for strings, ints, and tuples. # ^ I have a reasonable implementation in dyna.learn.features
[docs]class FeatureHashing(AbstractIntegerizer): """ >>> h = FeatureHashing(jenkins32, 8) >>> h([1, 2, 3]) [182, 76, 156] """ def __init__(self, h, bits): self.h = h self.bits = bits self.D = 2**bits self.mask = self.D - 1 # use bit mask rather than mod for efficiency def _add(self, k): h = self.h(k) assert h >= 0 return h & self.mask
if __name__ == '__main__': import doctest doctest.testmod()