#!/usr/bin/perl -w # # David MacKay Dec 97 # huffman.p - extracted from frequency2.p - writes a huffman code # # See also huffmano.p for a version that writes OCTAVE-friendly output # # input: to standard input, or in a file, should be something like this: # # a 10 # b 7 # c 3 # d 2 # # or # # # Comment lines are allowed # X 0.3 # Y 0.4 # Z 0.3 # # The corresponding output will be the codewords: # # a 0 # b 10 # c 110 # d 111 # # The input and output formats can be varied a little. # input: $dc = 0 ; # how far from right column to go to get the counts $offset = 0 ; # whether to add extra offset to each count. # # the first entry in each row is used as the name of the symbol # # formatting of output: $informativeoutput=1; # change to zero to make output briefer $lengthonly=0; # set to 1 to get lengths printed $latex=0 ; # change to 1 to change all these $openbracket = ":\n" ; $endline = "" ; $realendline = "\n" ; $comma = " " ; $closebracket = "" ; # $verbose = 0 ; # # accept commandline modifications of the above variables. # eval "\$$1=\$2" while ($#ARGV >= 0) && $ARGV[0]=~ /^(\w+)=(.*)/ && shift ; # if ( $latex ) { $openbracket = ":\n" ; $endline = "\t".'\\\\' ; $realendline = "\n" ; $comma = " & " ; $closebracket = "" ; } # $endline = $endline.$realendline ; # # the start. # $cols = 1 ; $r = 0 ; # number of rows $Count = 0 ; READING: while ( <> ) { if ( /^\s*\#/ ) { next READING ; } # ignore comments @b = split ; if ( $r == 0 ) { $cols = $#b ; } elsif ( ($#b != $cols) ) { print STDERR "Warning, row $r odd\n" ; } $f = $b[0] ; $count = $b[$#b - $dc] ; $count += $offset ; if ( $verbose >= 1 ) { print "$f : $count\n" ; } $pr{$f} = $count ; $arr[$r] = $f ; $c[$r] = $count ; # this is destroyed by the huffman algm $ccc[$r] = $count ; # this copppy of c is retained $r ++ ; $Count += $count ; } $M = $r ; if ($M <= 1) { print "too few symbols\n" ; exit(0) ; } # # make huffman code # @sorted = sort bypr @arr ; if($informativeoutput){ print "sorted characters\n" ; print join(' ',@sorted) , "\n" ; } sub bypr { $pr{$a} <=> $pr{$b} ; } sub bynumber { $a <=> $b; } print "\n" ; # make huffman tree &iterate ( $M ) ; # ff and ss now contain lists of branching instructions # now deduce the codewords. Start from the root with the null codeword. $codes[0] = "" ; $lengths[0] = 0 ; for ( $mm = 2 ; $mm <= $M ; $mm ++ ) { $f = $ff[$mm] ; $s = $ss[$mm] ; $co = $codes[$f] ; # this is the one we will expand $le = $lengths[$f] ; # move down everyone below s for ( $m = $mm - 1 ; $m > $s ; $m -- ) { $codes[$m] = $codes[$m-1] ; $lengths[$m] = $lengths[$m-1] ; } $codes[$f] = $co."0" ; $codes[$s] = $co."1" ; $lengths[$f] = $le + 1 ; $lengths[$s] = $le + 1 ; } # write the answer print " codewords$openbracket" ; if ( $latex ) { print '\begin{tabular}{clrrl} \hline' ; print "\n" ; print '$a_i$ & $p_i$ & \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$} & $l_i$ & $c(a_i)$ \\\\[0.1in] \hline ' ; print "\n" ; } $L = 0.0 ; $H = 0.0 ; for ( $mm = 0 ; $mm < $M ; $mm ++ ) { $prob[$mm] = 1.0 * $ccc[$mm] / $Count ; if ( $ccc[$mm] > 0.0 ) { $logp = log($prob[$mm])*1.0/log(2.0) ; $H += $ccc[$mm] * log( 1.0 * $Count / $ccc[$mm] ) ; } else { print STDERR "WARNING ccc[$mm] = $ccc[$mm]\n" ; $logp = 0.0 ; } if ( $latex ) { $arr[$mm] = '{\tt '.$arr[$mm].'}' ; $codes[$mm] = '{\tt '.$codes[$mm].'}' ; } print $arr[$mm] , "\t$comma" ; printf "%-10.4g %s %-6.1f %s %2d" , $prob[$mm] , $comma , -$logp , $comma , $lengths[$mm] if ($informativeoutput) ; printf "%2d" , $lengths[$mm] if ($lengthonly) ; print "$comma" , $codes[$mm] , "$endline" ; $L += 1.0 * $lengths[$mm] * $ccc[$mm] ; } if ( $latex ) { print ' \hline' . "\n" ; print '\end{tabular}'."\n" ; } print "$closebracket" ; $L = 1.0 * $L / $Count ; $H = $H / ( $Count * log(2.0) ) ; printf "total count %10.5g\n" , $Count ; printf "expected length %10.5g\n" , $L ; printf "entropy %10.5g\n" , $H ; printf "length / entropy %10.8g\n" , $L/ $H ; # end sub iterate { #usage ( $m ) local ( $m ) = @_ ; if ( $verbose >= 1 ) { print "iterating, m=$m\n" ; print join ( ' ' , @c ) , "\n" ; } # this is not the most elegant huffman algorithm! # run through the array, finding the smallest two entries @sorted = sort bynumber @c[0..$m-1] ; $th = $sorted[ 1 ] ; $bot = $sorted[ 0 ] ; # find the first that is below $bot for ( $mm = 0 , $found = 0 ; $mm < $m ; $mm ++ ) { if ( $c[$mm] <= $bot ) { $second = $mm ; $found ++ ; last ; } } # find the first that is below th: for ( $mm = 0 ; $mm < $m ; $mm ++ ) { if ( ($second != $mm) && ($c[$mm] <= $th) ) { $first = $mm ; $found ++ ; last ; } } if ( $found != 2 ) { print STDERR "warning didn't find 2! threshold = !\n" ; } if ( $first > $second ) { $tmp = $first ; $first = $second ; $second = $tmp ; } # throw the weight of the 2nd into the 1st $c[$first] += $c[$second] ; # move everyone else up the list for ( $mm = $second ; $mm < $m-1 ; $mm ++ ) { $c[$mm] = $c[$mm+1] ; } # destroy the last entry $c[$mm] = "" ; # record what has been done $ff[$m] = $first ; $ss[$m] = $second ; if ( $verbose >= 1 ) { print "combining " , $first, " & " , $second , "\n" ; } if ( $m > 2 ) { &iterate ( $m-1 ) ; } }