716d1110c12f

Initial Huffman skeleton
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Sat, 25 Jul 2020 15:48:24 -0400
parents c689117887e3
children b2decfc394fa
branches/tags (none)
files src/bits.lisp src/hamming.lisp src/package.lisp src/utilities.lisp tdcb.asd

Changes

--- a/src/bits.lisp	Sat Jul 25 15:48:15 2020 -0400
+++ b/src/bits.lisp	Sat Jul 25 15:48:24 2020 -0400
@@ -31,6 +31,15 @@
                    (ashf mask -1)))
   byte)
 
+(defun write-octet (b octet)
+  (write-bits b octet 8))
+
+(defun write-octets (b sequence)
+  (map nil (lambda (octet)
+             (write-bits b octet 8))
+       sequence))
+
+
 (defun flush (b)
   (unless (= (mask b) #x80)
     (write-byte (rack b) (underlying-stream b))))
@@ -59,6 +68,16 @@
                    (ashf mask -1))
         :finally (return result)))
 
+(defun read-octet (b)
+  (read-bits b 8))
+
+(defun read-octets (b count)
+  (iterate
+    (with result = (make-array count))
+    (for i :from 0 :below count)
+    (setf (aref result i) (read-octet b))
+    (returning result)))
+
 
 ;;;; Create -------------------------------------------------------------------
 (defmacro with-bits-output ((var stream) &body body)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hamming.lisp	Sat Jul 25 15:48:24 2020 -0400
@@ -0,0 +1,188 @@
+(in-package :tdcb/huffman)
+
+;;;; Counts and Chunks --------------------------------------------------------
+(defun count-bytes (byte-vector)
+  (loop :with result = (make-array 256 :initial-element 0)
+        :for b :across byte-vector
+        :do (incf (aref result b))
+        :finally (return result)))
+
+(defun next-chunk (counts &key (start 0) (skips 0))
+  (when-let ((start (position-if #'plusp counts :start start)))
+    (iterate (with end = nil)
+             (with skips-remaining = skips)
+             (for c :in-vector counts :with-index i :from start)
+             (if (plusp c)
+               (setf end i
+                     skips-remaining skips)
+               (when (minusp (decf skips-remaining))
+                 (finish)))
+             (returning (list start end (subseq counts start (1+ end)))
+                        i))))
+
+(defun counts-to-chunks (counts &key (skips 0))
+  (iterate
+    (with start = 0)
+    (for (values chunk i) = (next-chunk counts :start start :skips skips))
+    (while chunk)
+    (collect chunk)
+    (setf start i)))
+
+(defun chunks-to-counts (chunks)
+  (let ((result (make-array 256 :initial-element 0)))
+    (iterate (for (start end vals) :in chunks)
+             (replace result vals :start1 start :end1 (1+ end)))
+    result))
+
+(defun print-chunks (chunks stream)
+  (if (null chunks)
+    (write-octets stream #(0 0 0))
+    (iterate (for (start end counts) :in chunks)
+             (write-octet stream start)
+             (write-octet stream end)
+             (write-octets stream counts)
+             (finally (write-octet stream 0)))))
+
+(defun read-chunks (stream)
+  (iterate (for start = (read-octet stream))
+           (until (and (zerop start) (not (first-time-p))))
+           (for end = (read-octet stream))
+           (for chunk = (read-octets stream (1+ (- end start))))
+           (collect (list start end chunk))))
+
+
+;;;; Trees --------------------------------------------------------------------
+(defclass* tree ()
+  (b0 b1 weight))
+
+(defclass* leaf ()
+  (value weight))
+
+(defmethod print-object ((o leaf) stream)
+  (print-unreadable-object (o stream :type t)
+    (format stream "~D ~S weight ~D"
+            (value o)
+            (code-char (value o))
+            (weight o))))
+
+(defmethod print-object ((o tree) stream)
+  (print-unreadable-object (o stream :type t)
+    (format stream "weight ~D" (weight o))))
+
+(defun print-tree (tree)
+  (recursively ((node tree) (indent 0))
+    (format t "~vA~A~%" indent "" node)
+    (when (typep node 'tree)
+      (recur (b0 node) (+ indent 4))
+      (recur (b1 node) (+ indent 4))))
+  tree)
+
+(defun make-tree (counts)
+  "Create a Huffman tree from `counts`."
+  (let ((heap (pileup:make-heap #'< :key #'weight :size 256)))
+    (iterate (for c :in-vector counts :with-index i)
+             (unless (zerop c)
+               (pileup:heap-insert (make-instance 'leaf :value i :weight c) heap)))
+    (iterate
+      (for a = (pileup:heap-pop heap))
+      (when (pileup:heap-empty-p heap)
+        (return a)) ; todo handle single-leaf trees
+      (for b = (pileup:heap-pop heap))
+      (for weight = (+ (weight a) (weight b)))
+      (pileup:heap-insert (make-instance 'tree :b0 a :b1 b :weight weight)
+                          heap))))
+
+
+;;;; Code Maps ----------------------------------------------------------------
+(defun make-code-map (tree)
+  "Turn a Huffman `tree` into a code map.
+
+  A code map is a 256-element vector.  The indexes are the bytes, and the values
+  are `(code . code-length)` conses, or `nil` if that byte was not present in
+  the tree.
+
+  "
+  (let ((result (make-array 256 :initial-element nil)))
+    (recursively ((node tree) (code 0) (code-length 0))
+      (etypecase node
+        (tree (recur (b0 node) (ash code 1) (1+ code-length))
+              (recur (b1 node) (+ (ash code 1) 1) (1+ code-length)))
+        (leaf (setf (aref result (value node))
+                    (cons code (max 1 code-length))))))
+    result))
+
+(defun print-code-map (map)
+  (iterate
+    (with vals = (_ (iterate
+                      (for item :in-vector map :with-index value)
+                      (when item
+                        (collect (list value (car item) (cdr item)))))
+                   (stable-sort _ #'< :key #'second) ; sort by code
+                   (stable-sort _ #'< :key #'third))) ; then by length
+    (for (value code code-length) :in vals)
+    (format t "~12<~v,'0B:~;~> byte #x~2,'0X ~S has code ~D (length ~D)~%"
+            code-length code
+            value (code-char value)
+            code code-length)))
+
+
+;;;; Trees --------------------------------------------------------------------
+(defun encode (byte map bits)
+  "Encode `byte` to `bits` using the Huffman `map`."
+  (destructuring-bind (code . code-length) (aref map byte)
+    (write-bits bits code code-length)))
+
+(defun decode (bits tree)
+  "Decode a byte from `bits` using the Huffman `tree`."
+  (recursively ((node tree))
+    (if (typep node 'leaf)
+      (value node)
+      (recur (if (zerop (read-bit bits))
+               (b0 node)
+               (b1 node))))))
+
+
+(defun compress% (in out)
+  (let* ((data (alexandria:read-stream-content-into-byte-vector in))
+         (counts (count-bytes data))
+         (chunks (counts-to-chunks counts))
+         (tree (make-tree counts))
+         (map (make-code-map tree)))
+    (print-chunks chunks out)
+    (iterate (for byte :in-vector data)
+             (encode byte map out))))
+
+(defparameter *compress-ui*
+  (adopt:make-interface
+    :name "c huffman"
+    :summary "simple Huffman algorithm"
+    :usage "[OPTIONS]"
+    :help "Compress using a simple Huffman algorithm."))
+
+(defparameter *extract-ui*
+  (adopt:make-interface
+    :name "e huffman"
+    :summary "simple Huffman algorithm"
+    :usage "[OPTIONS]"
+    :help "Extract using a simple Huffman algorithm."))
+
+(defun compress (arguments)
+  (multiple-value-bind (arguments options)
+      (adopt:parse-options *compress-ui* arguments)
+    (destructuring-bind (in-path out-path) arguments
+      (with-open-file (in in-path :direction :input :element-type 'u8)
+        (with-open-file (out out-path :direction :output :element-type 'u8)
+          (with-bits-output (out out)
+            (compress% in out)))))))
+
+(defun extract (arguments)
+  (multiple-value-bind (arguments options)
+      (adopt:parse-options *compress-ui* arguments)
+    (destructuring-bind (in-path out-path) arguments
+      (with-open-file (in in-path :direction :input :element-type 'u8)
+        (with-open-file (out out-path :direction :output :element-type 'u8)
+          (with-bits-input (in in)
+            (extract% in out)))))))
+
+(tdcb/main:ensure-compressor "huffman" "simple Huffman algorithm" #'compress)
+(tdcb/main:ensure-extractor "huffman" "simple Huffman algorithm" #'extract)
--- a/src/package.lisp	Sat Jul 25 15:48:15 2020 -0400
+++ b/src/package.lisp	Sat Jul 25 15:48:24 2020 -0400
@@ -1,6 +1,12 @@
 (defpackage :tdcb
-  (:use :cl)
-  (:export))
+  (:use :cl :iterate :losh)
+  (:export
+    :ashf :logiorf :u8
+    :with-bits-input :with-bits-output
+    :read-bit :read-bits
+    :read-octet :read-octets
+    :write-bit :write-bits
+    :write-octet :write-octets))
 
 (defpackage :tdcb/main
   (:use :cl)
@@ -9,3 +15,7 @@
     :build/extract
     :ensure-compressor
     :ensure-extractor))
+
+(defpackage :tdcb/huffman
+  (:use :cl :tdcb :iterate :losh)
+  (:export))
--- a/src/utilities.lisp	Sat Jul 25 15:48:15 2020 -0400
+++ b/src/utilities.lisp	Sat Jul 25 15:48:24 2020 -0400
@@ -3,5 +3,6 @@
 (define-modify-macro ashf (count) ash "Arithmetic shift in place")
 (define-modify-macro logiorf (&rest integers) logior "Logical inclusive or in place")
 
+(deftype u8 () '(unsigned-byte 8))
 
 
--- a/tdcb.asd	Sat Jul 25 15:48:15 2020 -0400
+++ b/tdcb.asd	Sat Jul 25 15:48:24 2020 -0400
@@ -4,13 +4,19 @@
 
   :license "MIT"
 
-  :depends-on (:adopt)
+  :depends-on (:adopt
+               :alexandria
+               :losh
+               :iterate
+               :flexi-streams
+               :pileup)
 
   :serial t
   :components ((:module "src" :serial t
                 :components ((:file "package")
                              (:file "utilities")
                              (:file "bits")
-                             (:file "main")))))
+                             (:file "main")
+                             (:file "huffman")))))