lisp/parsre.lisp @ 1273bba3a35a

More
author Steve Losh <steve@stevelosh.com>
date Mon, 19 Aug 2024 08:55:42 -0400
parents 958358268b7e
children (none)
(eval-when (:compile-toplevel :load-toplevel :execute)
  (ql:quickload '(:adopt :cl-ppcre :iterate :with-user-abort
                  :jarl :conserve :losh)
                :silent t))

(defpackage :parsre
  (:use :cl :iterate)
  (:export :toplevel :*ui*))

(in-package :parsre)

;;;; Configuration ------------------------------------------------------------
(defparameter *output-format* :text)
(defparameter *case-sensitive* t)


;;;; Errors -------------------------------------------------------------------
(define-condition user-error (error) ())

(define-condition missing-regex (user-error) ()
  (:report "A regular expression is required."))

(define-condition missing-registers (user-error) ()
  (:report "Invalid regex, at least one named register is required."))

(define-condition anonymous-register (user-error) ()
  (:report "Invalid regex, anonymous registers are not supported.  Use non-capturing groups (?:…) if you need grouping but not capturing."))

(define-condition malformed-regex (user-error)
  ((underlying-error :initarg :underlying-error))
  (:report (lambda (c s)
             (format s "Invalid regex: ~A" (slot-value c 'underlying-error)))))


;;;; Functionality ------------------------------------------------------------
(defgeneric header (format names))
(defmethod header (format names) nil)
(defmethod header ((format (eql :csv)) names) (conserve:write-row names))


(defmethod output (format names values))

(defmethod output ((format (eql :text)) names values)
  (iterate (for name :in names)
           (for value :in values)
           (format t "~A: ~A~%" name value))
  (terpri))

(defmethod output ((format (eql :json)) names values)
  (jarl:print (alexandria:alist-hash-table (mapcar #'cons names values) :test #'equal))
  (terpri))

(defmethod output ((format (eql :csv)) names values)
  (declare (ignore names))
  (conserve:write-row values))


(defun register-values (string register-starts register-ends)
  (iterate
    (for start :in-vector register-starts)
    (for end :in-vector register-ends)
    (collect (subseq string start end))))

(defun run%% (scanner register-names string)
  (iterate (with start = 0)
           (for (values ms me rs re) = (ppcre:scan scanner string :start start))
           (while ms)
           (output *output-format* register-names (register-values string rs re))
           (setf start (max (1+ start) me))))

(defun run% (scanner register-names stream)
  (iterate (for line :in-stream stream :using #'read-line)
           (run%% scanner register-names line)
           (force-output)))


(defun make-scanner (pattern)
  (handler-case
      (ppcre:create-scanner pattern
                            :case-insensitive-mode (not *case-sensitive*)
                            :single-line-mode t)
    (ppcre:ppcre-syntax-error (c) (error 'malformed-regex :underlying-error c))))

(defun run (pattern paths)
  (let ((ppcre:*allow-named-registers* t)
        (ppcre:*use-bmh-matchers* t)
        (paths (or paths '("-"))))
    (multiple-value-bind (scanner register-names) (make-scanner pattern)
      (cond ((null register-names) (error 'missing-registers))
            ((member nil register-names) (error 'anonymous-register)))
      (header *output-format* register-names)
      (dolist (path paths)
        (if (string= "-" path)
          (run% scanner register-names *standard-input*)
          (with-open-file (stream path :direction :input)
            (run% scanner register-names stream)))))))


;;;; User Interface -----------------------------------------------------------
(defparameter *examples*
  '(("Parse some web service logs:" .
     "parsre '^(?<timestamp>\\\\S+) (?<level>INFO|WARN|ERROR|DEBUG) (?<message>.*)$'")
    ("Parse wordcount report into JSON:" .
     "wc **.lisp | parsre --json '^ *(?<lines>\\\\d+) +(?<words>\\\\d+) +(?<bytes>\\\\d+) +(?<path>.+)$'")))


(defparameter *option/help*
  (adopt:make-option 'help
    :help "Display help and exit."
    :long "help"
    :short #\h
    :reduce (constantly t)))

(defparameter *option/version*
  (adopt:make-option 'version
    :help "Display version information and exit."
    :long "version"
    :reduce (constantly t)))

(adopt:defparameters (*option/debug* *option/no-debug*)
  (adopt:make-boolean-options 'debug
    :long "debug"
    :short #\d
    :help "Enable the Lisp debugger."
    :help-no "Disable the Lisp debugger (default)."))

(adopt:defparameters (*option/profile* *option/no-profile*)
  (adopt:make-boolean-options 'profile
    :long "profile"
    :help "Profile the run and write results to lisp.prof."
    :help-no "Do not profile (default)."))

(adopt:defparameters (*option/matching/ignore-case* *option/matching/no-ignore-case*)
  (adopt:make-boolean-options 'ignore-case
    :long "ignore-case"
    :short #\i
    :help "Ignore case (i.e. case-insensitive)."
    :help-no "Do not ignore case (i.e. case-sensitive) (default)."
    :initial-value nil))

(defparameter *option/output-format/text*
  (adopt:make-option 'output-format/text
    :result-key 'output-format
    :help "Output results as text (default)."
    :long "text"
    :short #\t
    :initial-value :text
    :reduce (constantly :text)))

(defparameter *option/output-format/json*
  (adopt:make-option 'output-format/json
    :result-key 'output-format
    :help "Output results as JSON."
    :long "json"
    :short #\j
    :reduce (constantly :json)))

(defparameter *option/output-format/csv*
  (adopt:make-option 'output-format/csv
    :result-key 'output-format
    :help "Output results as CSV."
    :long "csv"
    :short #\c
    :reduce (constantly :csv)))


(adopt:define-string *help-text*
  "Parsre takes a Perl-compatible regular expression, matches it against input, ~
   and outputs the expression's registers in a variety of formats.~@
   ~@
   The regular expression will be matched against the input line-by-line.  ~
   Multiple matches per line are supported.~@
   ~@
   Registers can be named or anonymous.  Anonymous registers will be named ~
   arbitrarily (the scheme may change in the future).")


(defparameter *ui*
  (adopt:make-interface
    :name "parsre"
    :usage "[OPTIONS] REGEX [FILE...]"
    :summary "almost a parser"
    :help *help-text*
    :examples *examples*
    :contents (list *option/help*
                    *option/version*
                    (adopt:make-group
                      'matching-options
                      :title "Matching Options"
                      :options (list *option/matching/ignore-case*
                                     *option/matching/no-ignore-case*))
                    (adopt:make-group
                      'output-options
                      :title "Output Options"
                      :options (list *option/output-format/text*
                                     *option/output-format/json*
                                     *option/output-format/csv*))
                    (adopt:make-group
                      'debugging-options
                      :title "Debugging Options"
                      :options (list *option/debug*
                                *option/no-debug*
                                *option/profile*
                                *option/no-profile*)))))

(defun configure (options)
  (setf *output-format* (gethash 'output-format options)
        *case-sensitive* (not (gethash 'ignore-case options)))
  (values))

(defun toplevel ()
  (sb-ext:disable-debugger)
  (multiple-value-bind (arguments options) (adopt:parse-options-or-exit *ui*)
    (when (gethash 'debug options)
      (sb-ext:enable-debugger))
    (handler-case
        (with-user-abort:with-user-abort
          (cond
            ((gethash 'help options) (adopt:print-help-and-exit *ui*))
            ((null arguments) (error 'missing-regex))
            (t (destructuring-bind (pattern . files) arguments
                 (configure options)
                 (if (gethash 'profile options)
                   (losh:profile (run pattern files))
                   (run pattern files))))))
      (with-user-abort:user-abort () (adopt:exit 130))
      (user-error (e) (adopt:print-error-and-exit e)))))