lisp/bucket.lisp @ dae3f0203507

More
author Steve Losh <steve@stevelosh.com>
date Mon, 28 Aug 2023 14:37:00 -0400
parents f37e47cda7c4
children (none)
(eval-when (:compile-toplevel :load-toplevel :execute)
  (ql:quickload '(:adopt :alexandria :cl-ppcre :with-user-abort :local-time)
                :silent t))

(defpackage :bucket
  (:use :cl)
  (:export :toplevel :*ui*))

(in-package :bucket)

;;;; Configuration ------------------------------------------------------------
(setf local-time:*default-timezone* local-time:+utc-zone+)


;;;; Functionality ------------------------------------------------------------
(defun collect-values (time-mode path)
  (flet ((collect (stream)
           (loop :for line = (read-line stream nil nil)
                 :while line
                 :for value = (if time-mode
                                (local-time:parse-rfc3339-timestring line)
                                (parse-integer line :junk-allowed t))
                 :when value :collect value)))
    (if (string= path "-")
      (collect *standard-input*)
      (with-open-file (s path)
        (collect s)))))


(defun floor-to-nearest (n divisor)
  (* divisor (floor n divisor)))

(declaim (inline in-range-p in-time-p))
(defun in-range-p (val start end)
  (and (<= start val)
       (< val end)))

(defun in-time-p (val start end)
  (and (local-time:timestamp<= start val)
       (local-time:timestamp< val end)))

(defun bucket-numeric (data width skip-empty output)
  (loop :with data = (sort (coerce data 'vector) #'<)
        :with len = (length data)
        :with i = 0
        :for bs :from (floor-to-nearest (aref data 0) width) :by width
        :for be = (+ bs width)
        :while (< i len)
        :for count = (loop :while (< i len)
                           :while (in-range-p (aref data i) bs be)
                           :sum 1
                           :do (incf i))
        :do (unless (and skip-empty (zerop count))
              (format t "~12F ~10D~%"
                      (ecase output
                        (:lower bs)
                        (:upper be)
                        (:mid (/ (+ bs bs width) 2.0d0)))
                      count))))

(defun bucket-temporal (data width skip-empty output)
  (loop :with data = (sort (coerce data 'vector) #'local-time:timestamp<)
        :with len = (length data)
        :with i = 0
        :with bs = (local-time:timestamp-minimize-part (aref data 0) :sec)
        :for be = (local-time:timestamp+ bs width :sec)
        :while (< i len)
        :for count = (loop :while (< i len)
                           :while (in-time-p (aref data i) bs be)
                           :sum 1
                           :do (incf i))
        :do (unless (and skip-empty (zerop count))
              (format t "~A ~10D~%"
                      (local-time:format-rfc3339-timestring
                        nil
                        (ecase output
                          (:lower bs)
                          (:upper be)
                          (:mid (local-time:timestamp+ bs (truncate (* width 1000000000) 2) :nsec)))) ; hack
                      count))
        :do (setf bs (local-time:timestamp+ bs width :sec))))

(defun run (paths &key time-mode width skip-empty (output :mid))
  (when (null width)
    (error "Bucket width must be specified."))
  (let ((data (mapcan (alexandria:curry #'collect-values time-mode)
                      (or paths '("-")))))
    (when (null data)
      (error "No data found."))
    (if time-mode
      (bucket-temporal data width skip-empty output)
      (bucket-numeric data width skip-empty output))))


;;;; User Interface -----------------------------------------------------------
(defparameter *option/help*
  (adopt:make-option 'help
    :help "Display help and exit."
    :long "help"
    :short #\h
    :reduce (constantly t)))

(adopt:defparameters (*option/time* *option/no-time*)
  (adopt:make-boolean-options 'time
    :help "Bucket in timestamp (RFC3339) mode."
    :help-no "Bucket in numeric mode (the default)."
    :long "time"
    :short #\t))

(adopt:defparameters (*option/skip-empty* *option/no-skip-empty*)
  (adopt:make-boolean-options 'skip-empty
    :help "Skip outputting empty buckets."
    :help-no "Include empty buckets (the default)."
    :long "skip-empty"
    :short #\e))


(defparameter *option/output/lower*
  (adopt:make-option 'output/lower
    :result-key 'output
    :help "Output the lower bound of the bucket."
    :long "lower"
    :short #\l
    :reduce (constantly :lower)))

(defparameter *option/output/mid*
  (adopt:make-option 'output/mid
    :result-key 'output
    :help "Output the midpoint of the bucket (the default)."
    :long "mid"
    :short #\m
    :initial-value :mid
    :reduce (constantly :mid)))

(defparameter *option/output/upper*
  (adopt:make-option 'output/upper
    :result-key 'output
    :help "Output the upper bound of the bucket."
    :long "upper"
    :short #\u
    :reduce (constantly :upper)))


(defparameter *option/width*
  (adopt:make-option 'width
    :result-key 'width
    :help "Set bucket width to N."
    :parameter "N"
    :long "width"
    :short #\W
    :initial-value nil
    :key #'parse-integer
    :reduce #'adopt:last))

(defparameter *option/width/seconds*
  (adopt:make-option 'width/seconds
    :result-key 'width
    :help "Set bucket width to N seconds."
    :parameter "N"
    :long "seconds"
    :short #\S
    :key (lambda (n)
           (parse-integer n))
    :reduce #'adopt:last))

(defparameter *option/width/minutes*
  (adopt:make-option 'width/minutes
    :result-key 'width
    :help "Set bucket width to N minutes."
    :parameter "N"
    :long "minutes"
    :short #\M
    :key (lambda (n)
           (* 60 (parse-integer n)))
    :reduce #'adopt:last))

(defparameter *option/width/hours*
  (adopt:make-option 'width/hours
    :result-key 'width
    :help "Set bucket width to N hours."
    :parameter "N"
    :long "hours"
    :short #\H
    :key (lambda (n)
           (* 60 60 (parse-integer n)))
    :reduce #'adopt:last))


(adopt:define-string *help-text*
  "bucket groups lines into histogrammy buckets.~@
   ~@
   This is handy if you have some non-bucketed data that you want to graph as ~
   a histogram with gnuplot, because gnuplot unbelievably does not have built-in ~
   histogramming.")

(defparameter *examples*
  '())


(defparameter *ui*
  (adopt:make-interface
    :name "bucket"
    :usage "[OPTIONS] [FILE...]"
    :summary "bucket things for easier histogramming"
    :help *help-text*
    :examples *examples*
    :contents (list *option/help*
                    *option/time*
                    *option/no-time*
                    *option/skip-empty*
                    *option/no-skip-empty*
                    (adopt:make-group 'bucket-width
                      :title "Bucket Widths"
                      :options (list
                                 *option/width*
                                 *option/width/seconds*
                                 *option/width/minutes*
                                 *option/width/hours*))
                    (adopt:make-group 'bucket-labels
                      :title "Bucket Labels"
                      :options (list
                                 *option/output/lower*
                                 *option/output/mid*
                                 *option/output/upper*)))))


(defun toplevel ()
  ;; #+sbcl (sb-ext:disable-debugger)
  (handler-case
      (adopt::quit-on-ctrl-c ()
        (multiple-value-bind (arguments options) (adopt:parse-options-or-exit *ui*)
          (cond
            ((gethash 'help options) (adopt:print-help-and-exit *ui*))
            (t (progn (local-time:reread-timezone-repository)
                      (run arguments
                           :time-mode (gethash 'time options)
                           :width (gethash 'width options)
                           :skip-empty (gethash 'skip-empty options)
                           :output (gethash 'output options)))))))
    (error (c) (adopt:print-error-and-exit c))))


#; Scratch --------------------------------------------------------------------


(ppcre:scan "(\\d{4})-(\\d{2})-(\\d{2})[ T](\\d{2}):(\\d{2}):(\\d{2})(?:\\.\\d+)?([+-]\\d{2}:\\d{2}|Z)?"
            "2022-04-25T17:00:41.289049Z")