src/huffman.lisp @ 84fa1724b747 default tip

More work on the Huffman encoder and basic project skeleton
author Steve Losh <steve@stevelosh.com>
date Sun, 26 Jul 2020 16:49:17 -0400
parents b2decfc394fa
children (none)
(in-package :tdcb/huffman)

;;;; Counts and Chunks --------------------------------------------------------
(defun count-bytes (byte-vector)
  (loop :with result = (make-array 257 :initial-element 0)
        :for b :across byte-vector
        :do (incf (aref result b))
        :finally (progn
                   (incf (aref result 256))
                   (return result))))

(defun next-chunk (counts &key (start 0) (skips 0))
  (when-let ((start (position-if #'plusp counts :start start :end 256)))
    (iterate (with end = start)
             (with skips-remaining = skips)
             (for c :in-vector counts :with-index i :from start :below 256)
             (if (plusp c)
               (setf end i
                     skips-remaining skips)
               (when (minusp (decf skips-remaining))
                 (finish)))
             (returning (list start end (subseq counts start (1+ end)))
                        i))))

(defun counts-to-chunks (counts &key (skips 0))
  (iterate
    (with start = 0)
    (for (values chunk i) = (next-chunk counts :start start :skips skips))
    (while chunk)
    (collect chunk)
    (setf start i)))

(defun chunks-to-counts (chunks)
  (let ((result (make-array 257 :initial-element 0)))
    (iterate (for (start end vals) :in chunks)
             (replace result vals :start1 start :end1 (1+ end)))
    (incf (aref result 256))
    result))

(defun print-chunks (chunks stream)
  (if (null chunks)
    (write-octets stream #(0 0 0))
    (iterate (for (start end counts) :in chunks)
             (write-octet stream start)
             (write-octet stream end)
             (write-octets stream counts)
             (finally (write-octet stream 0)))))

(defun read-chunks (stream)
  (iterate (for start = (read-octet stream))
           (if-first-time
             (progn)
             (until (zerop start)))
           (for end = (read-octet stream))
           (for chunk = (read-octets stream (1+ (- end start))))
           (collect (list start end chunk))))


;;;; Trees --------------------------------------------------------------------
(defclass* tree ()
  (b0 b1 weight))

(defclass* leaf ()
  (value weight))

(defmethod print-object ((o leaf) stream)
  (print-unreadable-object (o stream :type t)
    (format stream "~D ~S weight ~D"
            (value o)
            (if (= 256 (value o))
              :eof
              (code-char (value o)))
            (weight o))))

(defmethod print-object ((o tree) stream)
  (print-unreadable-object (o stream :type t)
    (format stream "weight ~D" (weight o))))

(defun print-tree (tree)
  (recursively ((node tree) (indent 0))
    (format t "~vA~A~%" indent "" node)
    (when (typep node 'tree)
      (recur (b0 node) (+ indent 4))
      (recur (b1 node) (+ indent 4))))
  tree)

(defun make-tree (counts)
  "Create a Huffman tree from `counts`."
  (let ((heap (pileup:make-heap #'< :key #'weight :size 257)))
    (iterate (for c :in-vector counts :with-index i)
             (unless (zerop c)
               (pileup:heap-insert (make-instance 'leaf :value i :weight c) heap)))
    (iterate
      (for a = (pileup:heap-pop heap))
      (when (pileup:heap-empty-p heap)
        (return a)) ; todo handle single-leaf trees
      (for b = (pileup:heap-pop heap))
      (for weight = (+ (weight a) (weight b)))
      (pileup:heap-insert (make-instance 'tree :b0 a :b1 b :weight weight)
                          heap))))


;;;; Code Maps ----------------------------------------------------------------
(defun make-code-map (tree)
  "Turn a Huffman `tree` into a code map.

  A code map is a 257-element vector.  The indexes are the bytes, and the values
  are `(code . code-length)` conses, or `nil` if that byte was not present in
  the tree.  The seemingly out-of-bounds 256 value is the EOF marker.

  "
  (let ((result (make-array 257 :initial-element nil)))
    (recursively ((node tree) (code 0) (code-length 0))
      (etypecase node
        (tree (recur (b0 node) (ash code 1) (1+ code-length))
              (recur (b1 node) (+ (ash code 1) 1) (1+ code-length)))
        (leaf (setf (aref result (value node))
                    (cons code (max 1 code-length))))))
    result))

(defun print-code-map (map)
  (iterate
    (with vals = (_ (iterate
                      (for item :in-vector map :with-index value)
                      (when item
                        (collect (list value (car item) (cdr item)))))
                   (stable-sort _ #'< :key #'second) ; sort by code
                   (stable-sort _ #'< :key #'third))) ; then by length
    (for (value code code-length) :in vals)
    (for char = (if (= 256 value)
                  :eof
                  (code-char value)) )
    (format t "~12<~v,'0B:~;~> byte #x~2,'0X ~S has code ~D (length ~D)~%"
            code-length code
            value char
            code code-length)))


;;;; Trees --------------------------------------------------------------------
(defun encode (byte map bits)
  "Encode `byte` to `bits` using the Huffman `map`."
  (destructuring-bind (code . code-length) (aref map byte)
    (write-bits bits code code-length)))

(defun decode (bits tree)
  "Decode a byte from `bits` using the Huffman `tree`."
  (recursively ((node tree))
    (if (typep node 'leaf)
      (value node)
      (recur (if (zerop (read-bit bits))
               (b0 node)
               (b1 node))))))


(defun compress% (in out)
  (let* ((data (alexandria:read-stream-content-into-byte-vector in))
         (counts (count-bytes data))
         (chunks (counts-to-chunks counts))
         (tree (make-tree counts))
         (map (make-code-map tree)))
    (print-chunks chunks out)
    (iterate (for byte :in-vector data)
             (encode byte map out))
    (encode 256 map out)
    (values)))

(defun extract% (in out)
  (let* ((chunks (read-chunks in))
         (counts (chunks-to-counts chunks))
         (tree (make-tree counts)))
    (iterate
      (for byte = (decode in tree))
      (until (eql 256 byte))
      (write-byte byte out))))


(defparameter *compress-ui*
  (adopt:make-interface
    :name "c huffman"
    :summary "simple Huffman algorithm"
    :usage "[OPTIONS]"
    :help "Compress using a simple Huffman algorithm."))

(defparameter *extract-ui*
  (adopt:make-interface
    :name "e huffman"
    :summary "simple Huffman algorithm"
    :usage "[OPTIONS]"
    :help "Extract using a simple Huffman algorithm."))

(defun compress (arguments)
  (multiple-value-bind (arguments options)
      (adopt:parse-options *compress-ui* arguments)
    (destructuring-bind (in-path out-path) arguments
      (with-open-file (in in-path :direction :input :element-type 'u8)
        (with-open-file (out out-path :direction :output :element-type 'u8)
          (with-bits-output (out out)
            (compress% in out)))))))

(defun extract (arguments)
  (multiple-value-bind (arguments options)
      (adopt:parse-options *compress-ui* arguments)
    (destructuring-bind (in-path out-path) arguments
      (with-open-file (in in-path :direction :input :element-type 'u8)
        (with-open-file (out out-path :direction :output :element-type 'u8)
          (with-bits-input (in in)
            (extract% in out)))))))

(tdcb/main:ensure-compressor "huffman" "simple Huffman algorithm" #'compress)
(tdcb/main:ensure-extractor "huffman" "simple Huffman algorithm" #'extract)