src/huffman.lisp @ 84fa1724b747
default tip
More work on the Huffman encoder and basic project skeleton
author |
Steve Losh <steve@stevelosh.com> |
date |
Sun, 26 Jul 2020 16:49:17 -0400 |
parents |
b2decfc394fa |
children |
(none) |
(in-package :tdcb/huffman)
;;;; Counts and Chunks --------------------------------------------------------
(defun count-bytes (byte-vector)
(loop :with result = (make-array 257 :initial-element 0)
:for b :across byte-vector
:do (incf (aref result b))
:finally (progn
(incf (aref result 256))
(return result))))
(defun next-chunk (counts &key (start 0) (skips 0))
(when-let ((start (position-if #'plusp counts :start start :end 256)))
(iterate (with end = start)
(with skips-remaining = skips)
(for c :in-vector counts :with-index i :from start :below 256)
(if (plusp c)
(setf end i
skips-remaining skips)
(when (minusp (decf skips-remaining))
(finish)))
(returning (list start end (subseq counts start (1+ end)))
i))))
(defun counts-to-chunks (counts &key (skips 0))
(iterate
(with start = 0)
(for (values chunk i) = (next-chunk counts :start start :skips skips))
(while chunk)
(collect chunk)
(setf start i)))
(defun chunks-to-counts (chunks)
(let ((result (make-array 257 :initial-element 0)))
(iterate (for (start end vals) :in chunks)
(replace result vals :start1 start :end1 (1+ end)))
(incf (aref result 256))
result))
(defun print-chunks (chunks stream)
(if (null chunks)
(write-octets stream #(0 0 0))
(iterate (for (start end counts) :in chunks)
(write-octet stream start)
(write-octet stream end)
(write-octets stream counts)
(finally (write-octet stream 0)))))
(defun read-chunks (stream)
(iterate (for start = (read-octet stream))
(if-first-time
(progn)
(until (zerop start)))
(for end = (read-octet stream))
(for chunk = (read-octets stream (1+ (- end start))))
(collect (list start end chunk))))
;;;; Trees --------------------------------------------------------------------
(defclass* tree ()
(b0 b1 weight))
(defclass* leaf ()
(value weight))
(defmethod print-object ((o leaf) stream)
(print-unreadable-object (o stream :type t)
(format stream "~D ~S weight ~D"
(value o)
(if (= 256 (value o))
:eof
(code-char (value o)))
(weight o))))
(defmethod print-object ((o tree) stream)
(print-unreadable-object (o stream :type t)
(format stream "weight ~D" (weight o))))
(defun print-tree (tree)
(recursively ((node tree) (indent 0))
(format t "~vA~A~%" indent "" node)
(when (typep node 'tree)
(recur (b0 node) (+ indent 4))
(recur (b1 node) (+ indent 4))))
tree)
(defun make-tree (counts)
"Create a Huffman tree from `counts`."
(let ((heap (pileup:make-heap #'< :key #'weight :size 257)))
(iterate (for c :in-vector counts :with-index i)
(unless (zerop c)
(pileup:heap-insert (make-instance 'leaf :value i :weight c) heap)))
(iterate
(for a = (pileup:heap-pop heap))
(when (pileup:heap-empty-p heap)
(return a)) ; todo handle single-leaf trees
(for b = (pileup:heap-pop heap))
(for weight = (+ (weight a) (weight b)))
(pileup:heap-insert (make-instance 'tree :b0 a :b1 b :weight weight)
heap))))
;;;; Code Maps ----------------------------------------------------------------
(defun make-code-map (tree)
"Turn a Huffman `tree` into a code map.
A code map is a 257-element vector. The indexes are the bytes, and the values
are `(code . code-length)` conses, or `nil` if that byte was not present in
the tree. The seemingly out-of-bounds 256 value is the EOF marker.
"
(let ((result (make-array 257 :initial-element nil)))
(recursively ((node tree) (code 0) (code-length 0))
(etypecase node
(tree (recur (b0 node) (ash code 1) (1+ code-length))
(recur (b1 node) (+ (ash code 1) 1) (1+ code-length)))
(leaf (setf (aref result (value node))
(cons code (max 1 code-length))))))
result))
(defun print-code-map (map)
(iterate
(with vals = (_ (iterate
(for item :in-vector map :with-index value)
(when item
(collect (list value (car item) (cdr item)))))
(stable-sort _ #'< :key #'second) ; sort by code
(stable-sort _ #'< :key #'third))) ; then by length
(for (value code code-length) :in vals)
(for char = (if (= 256 value)
:eof
(code-char value)) )
(format t "~12<~v,'0B:~;~> byte #x~2,'0X ~S has code ~D (length ~D)~%"
code-length code
value char
code code-length)))
;;;; Trees --------------------------------------------------------------------
(defun encode (byte map bits)
"Encode `byte` to `bits` using the Huffman `map`."
(destructuring-bind (code . code-length) (aref map byte)
(write-bits bits code code-length)))
(defun decode (bits tree)
"Decode a byte from `bits` using the Huffman `tree`."
(recursively ((node tree))
(if (typep node 'leaf)
(value node)
(recur (if (zerop (read-bit bits))
(b0 node)
(b1 node))))))
(defun compress% (in out)
(let* ((data (alexandria:read-stream-content-into-byte-vector in))
(counts (count-bytes data))
(chunks (counts-to-chunks counts))
(tree (make-tree counts))
(map (make-code-map tree)))
(print-chunks chunks out)
(iterate (for byte :in-vector data)
(encode byte map out))
(encode 256 map out)
(values)))
(defun extract% (in out)
(let* ((chunks (read-chunks in))
(counts (chunks-to-counts chunks))
(tree (make-tree counts)))
(iterate
(for byte = (decode in tree))
(until (eql 256 byte))
(write-byte byte out))))
(defparameter *compress-ui*
(adopt:make-interface
:name "c huffman"
:summary "simple Huffman algorithm"
:usage "[OPTIONS]"
:help "Compress using a simple Huffman algorithm."))
(defparameter *extract-ui*
(adopt:make-interface
:name "e huffman"
:summary "simple Huffman algorithm"
:usage "[OPTIONS]"
:help "Extract using a simple Huffman algorithm."))
(defun compress (arguments)
(multiple-value-bind (arguments options)
(adopt:parse-options *compress-ui* arguments)
(destructuring-bind (in-path out-path) arguments
(with-open-file (in in-path :direction :input :element-type 'u8)
(with-open-file (out out-path :direction :output :element-type 'u8)
(with-bits-output (out out)
(compress% in out)))))))
(defun extract (arguments)
(multiple-value-bind (arguments options)
(adopt:parse-options *compress-ui* arguments)
(destructuring-bind (in-path out-path) arguments
(with-open-file (in in-path :direction :input :element-type 'u8)
(with-open-file (out out-path :direction :output :element-type 'u8)
(with-bits-input (in in)
(extract% in out)))))))
(tdcb/main:ensure-compressor "huffman" "simple Huffman algorithm" #'compress)
(tdcb/main:ensure-extractor "huffman" "simple Huffman algorithm" #'extract)