#! /usr/bin/python # terse version of huffman10.py - reads counts from stdin (c) David MacKay Dec 2005 # - writes a huffman code This is Free Software. License: GPL import sys, string class node: def __init__(self, count, index , name="" ): self.count = float(count) self.index = index self.name = name ## optional argument if self.name=="" : self.name = index self.word = "" ## codeword will go here def __cmp__(self, other): return cmp(self.count, other.count) def report(self): if (self.index == 1 ) : print '#Symbol\tCount\t\tCodeword' print '%s\t(%-8.4g)\t%s' % (self.name,self.count,self.word) def find(f, seq): """Return first item in sequence where f(item) == True.""" for item in seq: if f(item): return item """ Example usage of iterate: >>> c = []; \ c.append(node(0.5,1,'a')); \ c.append(node(0.25,2,'b')); \ c.append(node(0.125,3,'c')); \ c.append(node(0.125,4,'d')); \ iterate(c) ; reportcode(c) # doctest: +NORMALIZE_WHITESPACE #Symbol Count Codeword a (0.5) 1 b (0.25) 01 c (0.12) 000 d (0.12) 001 """ def iterate (c) : ## The list of nodes c is destroyed as we go, then recreated if ( len(c) > 1 ) : c.sort() ## sort the nodes by count, using the __cmp__ function defined in the node class deletednode = c[0] ## keep copy of smallest node so that we can put it back in later second = c[1].index ## index of second smallest node # MERGE THE BOTTOM TWO c[1].count += c[0].count ## this merged node retains the name of the bigger leaf. del c[0] iterate ( c ) ## find the codeword that has been split/joined at this step co = find( lambda p: p.index == second , c ) deletednode.word = co.word+'0' c.append( deletednode ) ## smaller one gets 0 co.word += '1' co.count -= deletednode.count ## restore correct count else : c[0].word = "" return def test(): ## This is the main example. It must be run from a shell and it reads in counts from stdin ## begin read in the list of counts #################################### c=[] m=0 for line in sys.stdin.readlines(): if line[0] != '#' : words = string.split(line) if len(words) >= 2: m += 1 ; c.append( node( words[0], m, words[1] ) ) elif len(words) >=1 : m += 1 ; c.append( node( words[0], m ) ) ## end read in the list of counts #################################### iterate ( c ) # make huffman code reportcode(c) reportLH(c) def reportcode(c): c.sort(lambda x, y: cmp(x.index, y.index)) # sort by index for co in c : # and write the answer co.report() def reportLH(c,verbose=1): from math import log total=0; H=0 ; L=0 for co in c : total += co.count for co in c : p = co.count * 1.0 / total logp = log(p)/log(2.0) H -= p * logp L += p * len(co.word) if verbose: print "#L = %10.6g H = %10.6g L/H = %10.7g" % ( L, H , L/H) return (L,H) ## end ########################################################################## ## alternate way of calling huffman with a list of counts ## for use as package by other programs ###### ## two example ways of using it: #>>> from Huffman import * #>>> huffman([0.5, 0.25, 0.125, 0.125],1) #>>> c = huffman([1, 2, 3, 4]) def huffman( counts , verbose=0 ) : """ >>> c = huffman([0.5, 0.25, 0.125, 0.125],1) # doctest: +NORMALIZE_WHITESPACE #Symbol Count Codeword 1 (0.5) 1 2 (0.25) 01 3 (0.12) 000 4 (0.12) 001 """ c=[] ## array of nodes m=0 for line in counts : m += 1 ; c.append( node( line, m ) ) iterate ( c ) # make huffman code if (verbose) : reportcode(c) reportLH(c) return c ## end ########################################################################## if __name__ == '__main__': test()