9322171b2cc4

Remove unnecessary consing
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Tue, 25 Dec 2018 22:34:30 -0500
parents a4abd0534b7d
children c50aa06f13bd
branches/tags (none)
files .hgignore src/bench.lisp src/main.lisp

Changes

--- a/.hgignore	Tue Dec 25 14:32:50 2018 -0500
+++ b/.hgignore	Tue Dec 25 22:34:30 2018 -0500
@@ -1,7 +1,5 @@
 syntax: glob
 lisp.prof
 scratch.lisp
-*.log
 docs/build
-images
-test/data/fuzz.*
+test/large*
--- a/src/bench.lisp	Tue Dec 25 14:32:50 2018 -0500
+++ b/src/bench.lisp	Tue Dec 25 22:34:30 2018 -0500
@@ -1,20 +1,25 @@
 (in-package :trivial-csv)
 
+(defparameter *field-length* 50)
+(defparameter *row-length* 50)
+(defparameter *data-rows* 10000)
+(defparameter *data-repetitions* 10)
+
 (defun random-char (string)
   (aref string (random (length string))))
 
 (defun random-field ()
   (with-output-to-string (s)
-    (dotimes (i (random 100))
+    (dotimes (i (random *field-length*))
       (write-char (random-char (concatenate 'string (string #\newline)
                                             " abcdefghijklmnop,\""))
                   s))))
 
 (defun random-row ()
-  (loop :repeat (1+ (random 20)) :collect (random-field)))
+  (loop :repeat (1+ (random *row-length*)) :collect (random-field)))
 
-(defun random-data (rows)
-  (loop :repeat rows :collect (random-row)))
+(defun random-data ()
+  (loop :repeat *data-rows* :collect (random-row)))
 
 (defun bench-this (data)
   (let* ((str (make-string-input-stream
@@ -23,7 +28,7 @@
          (result (time (read-rows str))))
     (equal data result)))
 
-(defun bench-other (data)
+(defun bench-cl-csv (data)
   (let* ((str (make-string-input-stream
                 (with-output-to-string (s)
                   (time (cl-csv:write-csv data :stream s)))))
@@ -31,41 +36,58 @@
     (equal data result)))
 
 (defun bench ()
-  (let ((data (random-data 5000)))
+  (let ((data (random-data)))
     (write-line "MINE")
     (bench-this data)
-    (write-line "OTHER")
-    (bench-other data))
+    (write-line "cl-csv")
+    (bench-cl-csv data))
   )
 
 
-(defparameter *data* nil)
+(defvar *data* nil)
 
 (defun generate-large-data ()
-  (setf *data* (random-data 5000)))
+  (setf *data* (random-data)))
 
 (defun write-file-this ()
   (with-open-file (s "test/large-this.csv"
                      :direction :output
                      :if-exists :supersede)
     (time
-      (loop :repeat 10 :do (write-rows *data* s)))))
+      (loop :repeat *data-repetitions* :do
+            (write-rows *data* s)))))
 
-(defun write-file-other ()
-  (with-open-file (s "test/large-other.csv"
+(defun write-file-cl-csv ()
+  (with-open-file (s "test/large-cl-csv.csv"
                      :direction :output
                      :if-exists :supersede)
-    (time (loop :repeat 10 :do (cl-csv:write-csv *data*
-                                                 :stream s
-                                                 :newline (string #\newline))))))
+    (time (loop :repeat *data-repetitions* :do
+                (cl-csv:write-csv *data*
+                                  :stream s
+                                  :newline (string #\newline))))))
+(defun write-file-fare ()
+  (with-open-file (s "test/large-fare.csv"
+                     :direction :output
+                     :if-exists :supersede)
+    (fare-csv:with-rfc4180-csv-syntax ()
+      (time (loop :repeat *data-repetitions* :do
+                  (fare-csv:write-csv-lines *data* s))))))
 
 (defun bench-write-file ()
   (write-line "Generating data.")
   (time (generate-large-data))
+
   (write-line "Benchmarking this (writing).")
+  #+sbcl (sb-ext:gc :full t)
   (write-file-this)
-  (write-line "Benchmarking other (writing).")
-  (write-file-other))
+
+  (write-line "Benchmarking cl-csv (writing).")
+  #+sbcl (sb-ext:gc :full t)
+  (write-file-cl-csv)
+
+  (write-line "Benchmarking fare-csv (writing).")
+  #+sbcl (sb-ext:gc :full t)
+  (write-file-fare))
 
 
 (defun read-file-this ()
@@ -77,28 +99,67 @@
                                          (setf data *data*))
                                        (pop data))
             :until (eql :eof row)
+            :summing 1
             :do (assert (equal expected-row row))))))
 
-(defun read-file-other ()
-  (with-open-file (s "test/large-other.csv")
-    (time (handler-case
-              (loop
-                :with data = *data*
-                :for row = (cl-csv:read-csv-row s
-                                                :newline (string #\newline)
-                                                :trim-outer-whitespace nil)
-                :for expected-row = (progn (when (null data)
-                                             (setf data *data*))
-                                           (pop data))
-                :do (assert (equal expected-row row)))
-            (end-of-file () nil)))))
+(defun read-file-cl-csv ()
+  (with-open-file (s "test/large-cl-csv.csv")
+    (let ((result 0))
+      (time (handler-case
+                (loop
+                  :with data = *data*
+                  :for row = (cl-csv:read-csv-row s
+                                                  :newline (string #\newline)
+                                                  :trim-outer-whitespace nil)
+                  :for expected-row = (progn (when (null data)
+                                               (setf data *data*))
+                                             (pop data))
+                  :do (incf result) (assert (equal expected-row row)))
+              (end-of-file () result))))))
+
+;; There are a couple of problems with fare-csv that make it annoying to work
+;; with:
+;;
+;; fare-csv:read-csv-line can't tell you the difference between '("") and eof so
+;; we have to check for it manually to know when we're done.  This is only
+;; a problem when we're reading line by line.
+;;
+;; fare-csv also can't roundtrip '("").  CSV itself is incapable of
+;; differentiating '() and '("") unless you force quoting, but fare-csv chooses
+;; '() over '("").  Why would you sell out the marginally-useful case (a 1-col
+;; CSV) in favor of the utterly useless case (a 0-col CSV)?
+(defun read-file-fare ()
+  (with-open-file (s "test/large-fare.csv")
+    (fare-csv:with-rfc4180-csv-syntax ()
+      (time (loop
+              :with data = *data*
+              :for eof = (peek-char nil s nil :eof)
+              :for row = (fare-csv:read-csv-line s)
+              :for expected-row = (progn (when (null data)
+                                           (setf data *data*))
+                                         (pop data))
+              :until (eql :eof eof)
+              :summing 1
+              :do (if (equal expected-row '(""))
+                    (assert (equal nil row))
+                    (assert (equal expected-row row))))))))
 
 (defun bench-read-file ()
   (write-line "Benchmarking this (reading).")
-  (read-file-this)
-  (write-line "Benchmarking other (reading).")
-  (read-file-other))
+  #+sbcl (sb-ext:gc :full t)
+  (format t "Read ~D rows.~2%" (read-file-this))
+
+  (write-line "Benchmarking cl-csv (reading).")
+  #+sbcl (sb-ext:gc :full t)
+  (format t "Read ~D rows.~2%" (read-file-cl-csv))
+
+  (write-line "Benchmarking fare-csv (reading).")
+  #+sbcl (sb-ext:gc :full t)
+  (format t "Read ~D rows.~2%" (read-file-fare))
+
+  (values))
 
 (defun bench-file ()
   (bench-write-file)
   (bench-read-file))
+
--- a/src/main.lisp	Tue Dec 25 14:32:50 2018 -0500
+++ b/src/main.lisp	Tue Dec 25 22:34:30 2018 -0500
@@ -8,6 +8,10 @@
 
 
 ;;;; Reading ------------------------------------------------------------------
+;;; We reuse a single string output stream for all the fields in a row, rather
+;;; than having a fresh one per field.  It's a little more tedious but it's
+;;; *significantly* less consing (e.g. 500mb versus 1.5gb).
+
 (defun read-char-if (char stream &optional (eof-error-p t) eof-value)
   "If the next character in `stream` is `char`, read and return it.  Otherwise, return `nil`."
   (cond
@@ -15,59 +19,61 @@
     ((eql char eof-value) char)
     (t nil)))
 
-(defun read-unquoted-field (stream delimiter)
-  "Read an unquoted field (but not the ending field delimiter) from `stream`."
-  (with-output-to-string (result)
-    (loop :for next = (peek-char nil stream nil delimiter)
-          :until (cond
-                   ((char= next delimiter) t)
-                   ((char= next #\newline) t)
-                   ((char= next #\") (error "Unquoted field contains quote."))
-                   (t nil))
-          :do (write-char (read-char stream) result))))
+(defun read-unquoted-field (stream delimiter result)
+  "Read an unquoted field (but not the ending field delimiter) from `stream` into `result`."
+  (loop :for next = (peek-char nil stream nil delimiter)
+        :until (cond
+                 ((char= next delimiter) t)
+                 ((char= next #\newline) t)
+                 ((char= next #\") (error "Unquoted field contains quote."))
+                 (t nil))
+        :do (write-char (read-char stream) result)))
 
-(defun read-quoted-field (stream delimiter)
-  "Read a quoted field (but not the ending field delimiter) from `stream`."
+(defun read-quoted-field (stream delimiter result)
+  "Read a quoted field (but not the ending field delimiter) from `stream` into `result`."
   (declare (ignore delimiter))
   (read-char stream) ; chomp initial quote
-  (with-output-to-string (result)
-    (loop :for char = (read-char stream)
-          :until (and (char= #\" char)
-                      (not (read-char-if #\" stream nil)))
-          :do (write-char char result))))
+  (loop :for char = (read-char stream)
+        :until (and (char= #\" char)
+                    (not (read-char-if #\" stream nil)))
+        :do (write-char char result)))
 
-(defun read-field (stream delimiter)
-  "Read and return a single field from `stream`."
+(defun read-field (stream delimiter result)
+  "Read and return a single field from `stream` into `result`."
   (let* ((field (case (peek-char nil stream nil :eof)
-                  (#\" (read-quoted-field stream delimiter))
+                  (#\" (read-quoted-field stream delimiter result) t)
                   (#\newline (read-char stream) nil)
                   (:eof nil)
-                  (t (read-unquoted-field stream delimiter))))
+                  (t (read-unquoted-field stream delimiter result) t)))
          (done (cond
                  ((null field) t) ; empty field at the end
                  ((read-char-if delimiter stream nil) nil) ; normal field
                  ((read-char-if #\newline stream nil #\newline) t) ; last field
                  (t (error "Bad data after field ~S: ~S"
                            field (peek-char nil stream))))))
-    (values (or field "") done)))
+    (values (if field
+              (get-output-stream-string result)
+              "")
+            done)))
 
 (defun read-row% (stream delimiter eof-error-p eof-value)
   (if (eql :eof (peek-char nil stream eof-error-p :eof))
     eof-value
-    (loop
-      :with field :with done
-      :do (setf (values field done) (read-field stream delimiter))
-      :collect field
-      :until done)))
+    (loop :with field
+          :with done
+          :with result = (make-string-output-stream)
+          :do (setf (values field done) (read-field stream delimiter result))
+          :collect field
+          :until done)))
 
 
 ;;;; Writing ------------------------------------------------------------------
 (defun field-needs-quoting-p (field delimiter)
-  (find-if (lambda (char)
-             (or (char= char #\newline)
-                 (char= char #\")
-                 (char= char delimiter)))
-           field))
+  (some (lambda (char)
+          (or (char= char #\newline)
+              (char= char #\")
+              (char= char delimiter)))
+        field))
 
 (defun write-quoted-field (field stream)
   (write-char #\" stream)
@@ -96,12 +102,29 @@
 
 ;;;; API ----------------------------------------------------------------------
 (defun read-row (&optional (stream *standard-input*) (eof-error-p t) eof-value)
-  "Read and return a row of fields from `stream`."
+  "Read and return a row of fields from the CSV data in `stream`.
+
+  The result will be completely fresh.
+
+  If the end of file for the stream is encountered immediately, an error is
+  signaled unless `eof-error-p` is false, in which case `eof-value` is returned.
+
+  "
+  (check-type stream stream)
+  (assert (input-stream-p stream) (stream)
+          "Stream ~S is not an input stream." stream)
   (check-delimiter)
   (read-row% stream *delimiter* eof-error-p eof-value))
 
 (defun read-rows (&optional (stream *standard-input*))
-  "Read and return all rows from `stream`."
+  "Read and return all CSV rows from the CSV data in `stream`.
+
+  The result will be completely fresh.
+
+  "
+  (check-type stream stream)
+  (assert (input-stream-p stream) (stream)
+          "Stream ~S is not an input stream." stream)
   (check-delimiter)
   (loop :with delimiter = *delimiter*
         :for row = (read-row% stream delimiter nil :eof)
@@ -109,11 +132,13 @@
         :collect row))
 
 (defun write-row (row &optional (stream *standard-output*))
+  "Write `row` to `stream` as CSV data."
   (check-delimiter)
   (write-row% row stream *delimiter*)
   (values))
 
 (defun write-rows (rows &optional (stream *standard-output*))
+  "Write `rows` to `stream` as CSV data."
   (check-delimiter)
   (loop :with delimiter = *delimiter*
         :for row :in rows