# HG changeset patch # User Steve Losh # Date 1545795270 18000 # Node ID 9322171b2cc42882825df4d4d3b02c4fb76422d6 # Parent a4abd0534b7dfac275216c1bbd3ab863808d066c Remove unnecessary consing diff -r a4abd0534b7d -r 9322171b2cc4 .hgignore --- a/.hgignore Tue Dec 25 14:32:50 2018 -0500 +++ b/.hgignore Tue Dec 25 22:34:30 2018 -0500 @@ -1,7 +1,5 @@ syntax: glob lisp.prof scratch.lisp -*.log docs/build -images -test/data/fuzz.* +test/large* diff -r a4abd0534b7d -r 9322171b2cc4 src/bench.lisp --- a/src/bench.lisp Tue Dec 25 14:32:50 2018 -0500 +++ b/src/bench.lisp Tue Dec 25 22:34:30 2018 -0500 @@ -1,20 +1,25 @@ (in-package :trivial-csv) +(defparameter *field-length* 50) +(defparameter *row-length* 50) +(defparameter *data-rows* 10000) +(defparameter *data-repetitions* 10) + (defun random-char (string) (aref string (random (length string)))) (defun random-field () (with-output-to-string (s) - (dotimes (i (random 100)) + (dotimes (i (random *field-length*)) (write-char (random-char (concatenate 'string (string #\newline) " abcdefghijklmnop,\"")) s)))) (defun random-row () - (loop :repeat (1+ (random 20)) :collect (random-field))) + (loop :repeat (1+ (random *row-length*)) :collect (random-field))) -(defun random-data (rows) - (loop :repeat rows :collect (random-row))) +(defun random-data () + (loop :repeat *data-rows* :collect (random-row))) (defun bench-this (data) (let* ((str (make-string-input-stream @@ -23,7 +28,7 @@ (result (time (read-rows str)))) (equal data result))) -(defun bench-other (data) +(defun bench-cl-csv (data) (let* ((str (make-string-input-stream (with-output-to-string (s) (time (cl-csv:write-csv data :stream s))))) @@ -31,41 +36,58 @@ (equal data result))) (defun bench () - (let ((data (random-data 5000))) + (let ((data (random-data))) (write-line "MINE") (bench-this data) - (write-line "OTHER") - (bench-other data)) + (write-line "cl-csv") + (bench-cl-csv data)) ) -(defparameter *data* nil) +(defvar *data* nil) (defun generate-large-data () - (setf *data* (random-data 5000))) + (setf *data* (random-data))) (defun write-file-this () (with-open-file (s "test/large-this.csv" :direction :output :if-exists :supersede) (time - (loop :repeat 10 :do (write-rows *data* s))))) + (loop :repeat *data-repetitions* :do + (write-rows *data* s))))) -(defun write-file-other () - (with-open-file (s "test/large-other.csv" +(defun write-file-cl-csv () + (with-open-file (s "test/large-cl-csv.csv" :direction :output :if-exists :supersede) - (time (loop :repeat 10 :do (cl-csv:write-csv *data* - :stream s - :newline (string #\newline)))))) + (time (loop :repeat *data-repetitions* :do + (cl-csv:write-csv *data* + :stream s + :newline (string #\newline)))))) +(defun write-file-fare () + (with-open-file (s "test/large-fare.csv" + :direction :output + :if-exists :supersede) + (fare-csv:with-rfc4180-csv-syntax () + (time (loop :repeat *data-repetitions* :do + (fare-csv:write-csv-lines *data* s)))))) (defun bench-write-file () (write-line "Generating data.") (time (generate-large-data)) + (write-line "Benchmarking this (writing).") + #+sbcl (sb-ext:gc :full t) (write-file-this) - (write-line "Benchmarking other (writing).") - (write-file-other)) + + (write-line "Benchmarking cl-csv (writing).") + #+sbcl (sb-ext:gc :full t) + (write-file-cl-csv) + + (write-line "Benchmarking fare-csv (writing).") + #+sbcl (sb-ext:gc :full t) + (write-file-fare)) (defun read-file-this () @@ -77,28 +99,67 @@ (setf data *data*)) (pop data)) :until (eql :eof row) + :summing 1 :do (assert (equal expected-row row)))))) -(defun read-file-other () - (with-open-file (s "test/large-other.csv") - (time (handler-case - (loop - :with data = *data* - :for row = (cl-csv:read-csv-row s - :newline (string #\newline) - :trim-outer-whitespace nil) - :for expected-row = (progn (when (null data) - (setf data *data*)) - (pop data)) - :do (assert (equal expected-row row))) - (end-of-file () nil))))) +(defun read-file-cl-csv () + (with-open-file (s "test/large-cl-csv.csv") + (let ((result 0)) + (time (handler-case + (loop + :with data = *data* + :for row = (cl-csv:read-csv-row s + :newline (string #\newline) + :trim-outer-whitespace nil) + :for expected-row = (progn (when (null data) + (setf data *data*)) + (pop data)) + :do (incf result) (assert (equal expected-row row))) + (end-of-file () result)))))) + +;; There are a couple of problems with fare-csv that make it annoying to work +;; with: +;; +;; fare-csv:read-csv-line can't tell you the difference between '("") and eof so +;; we have to check for it manually to know when we're done. This is only +;; a problem when we're reading line by line. +;; +;; fare-csv also can't roundtrip '(""). CSV itself is incapable of +;; differentiating '() and '("") unless you force quoting, but fare-csv chooses +;; '() over '(""). Why would you sell out the marginally-useful case (a 1-col +;; CSV) in favor of the utterly useless case (a 0-col CSV)? +(defun read-file-fare () + (with-open-file (s "test/large-fare.csv") + (fare-csv:with-rfc4180-csv-syntax () + (time (loop + :with data = *data* + :for eof = (peek-char nil s nil :eof) + :for row = (fare-csv:read-csv-line s) + :for expected-row = (progn (when (null data) + (setf data *data*)) + (pop data)) + :until (eql :eof eof) + :summing 1 + :do (if (equal expected-row '("")) + (assert (equal nil row)) + (assert (equal expected-row row)))))))) (defun bench-read-file () (write-line "Benchmarking this (reading).") - (read-file-this) - (write-line "Benchmarking other (reading).") - (read-file-other)) + #+sbcl (sb-ext:gc :full t) + (format t "Read ~D rows.~2%" (read-file-this)) + + (write-line "Benchmarking cl-csv (reading).") + #+sbcl (sb-ext:gc :full t) + (format t "Read ~D rows.~2%" (read-file-cl-csv)) + + (write-line "Benchmarking fare-csv (reading).") + #+sbcl (sb-ext:gc :full t) + (format t "Read ~D rows.~2%" (read-file-fare)) + + (values)) (defun bench-file () (bench-write-file) (bench-read-file)) + diff -r a4abd0534b7d -r 9322171b2cc4 src/main.lisp --- a/src/main.lisp Tue Dec 25 14:32:50 2018 -0500 +++ b/src/main.lisp Tue Dec 25 22:34:30 2018 -0500 @@ -8,6 +8,10 @@ ;;;; Reading ------------------------------------------------------------------ +;;; We reuse a single string output stream for all the fields in a row, rather +;;; than having a fresh one per field. It's a little more tedious but it's +;;; *significantly* less consing (e.g. 500mb versus 1.5gb). + (defun read-char-if (char stream &optional (eof-error-p t) eof-value) "If the next character in `stream` is `char`, read and return it. Otherwise, return `nil`." (cond @@ -15,59 +19,61 @@ ((eql char eof-value) char) (t nil))) -(defun read-unquoted-field (stream delimiter) - "Read an unquoted field (but not the ending field delimiter) from `stream`." - (with-output-to-string (result) - (loop :for next = (peek-char nil stream nil delimiter) - :until (cond - ((char= next delimiter) t) - ((char= next #\newline) t) - ((char= next #\") (error "Unquoted field contains quote.")) - (t nil)) - :do (write-char (read-char stream) result)))) +(defun read-unquoted-field (stream delimiter result) + "Read an unquoted field (but not the ending field delimiter) from `stream` into `result`." + (loop :for next = (peek-char nil stream nil delimiter) + :until (cond + ((char= next delimiter) t) + ((char= next #\newline) t) + ((char= next #\") (error "Unquoted field contains quote.")) + (t nil)) + :do (write-char (read-char stream) result))) -(defun read-quoted-field (stream delimiter) - "Read a quoted field (but not the ending field delimiter) from `stream`." +(defun read-quoted-field (stream delimiter result) + "Read a quoted field (but not the ending field delimiter) from `stream` into `result`." (declare (ignore delimiter)) (read-char stream) ; chomp initial quote - (with-output-to-string (result) - (loop :for char = (read-char stream) - :until (and (char= #\" char) - (not (read-char-if #\" stream nil))) - :do (write-char char result)))) + (loop :for char = (read-char stream) + :until (and (char= #\" char) + (not (read-char-if #\" stream nil))) + :do (write-char char result))) -(defun read-field (stream delimiter) - "Read and return a single field from `stream`." +(defun read-field (stream delimiter result) + "Read and return a single field from `stream` into `result`." (let* ((field (case (peek-char nil stream nil :eof) - (#\" (read-quoted-field stream delimiter)) + (#\" (read-quoted-field stream delimiter result) t) (#\newline (read-char stream) nil) (:eof nil) - (t (read-unquoted-field stream delimiter)))) + (t (read-unquoted-field stream delimiter result) t))) (done (cond ((null field) t) ; empty field at the end ((read-char-if delimiter stream nil) nil) ; normal field ((read-char-if #\newline stream nil #\newline) t) ; last field (t (error "Bad data after field ~S: ~S" field (peek-char nil stream)))))) - (values (or field "") done))) + (values (if field + (get-output-stream-string result) + "") + done))) (defun read-row% (stream delimiter eof-error-p eof-value) (if (eql :eof (peek-char nil stream eof-error-p :eof)) eof-value - (loop - :with field :with done - :do (setf (values field done) (read-field stream delimiter)) - :collect field - :until done))) + (loop :with field + :with done + :with result = (make-string-output-stream) + :do (setf (values field done) (read-field stream delimiter result)) + :collect field + :until done))) ;;;; Writing ------------------------------------------------------------------ (defun field-needs-quoting-p (field delimiter) - (find-if (lambda (char) - (or (char= char #\newline) - (char= char #\") - (char= char delimiter))) - field)) + (some (lambda (char) + (or (char= char #\newline) + (char= char #\") + (char= char delimiter))) + field)) (defun write-quoted-field (field stream) (write-char #\" stream) @@ -96,12 +102,29 @@ ;;;; API ---------------------------------------------------------------------- (defun read-row (&optional (stream *standard-input*) (eof-error-p t) eof-value) - "Read and return a row of fields from `stream`." + "Read and return a row of fields from the CSV data in `stream`. + + The result will be completely fresh. + + If the end of file for the stream is encountered immediately, an error is + signaled unless `eof-error-p` is false, in which case `eof-value` is returned. + + " + (check-type stream stream) + (assert (input-stream-p stream) (stream) + "Stream ~S is not an input stream." stream) (check-delimiter) (read-row% stream *delimiter* eof-error-p eof-value)) (defun read-rows (&optional (stream *standard-input*)) - "Read and return all rows from `stream`." + "Read and return all CSV rows from the CSV data in `stream`. + + The result will be completely fresh. + + " + (check-type stream stream) + (assert (input-stream-p stream) (stream) + "Stream ~S is not an input stream." stream) (check-delimiter) (loop :with delimiter = *delimiter* :for row = (read-row% stream delimiter nil :eof) @@ -109,11 +132,13 @@ :collect row)) (defun write-row (row &optional (stream *standard-output*)) + "Write `row` to `stream` as CSV data." (check-delimiter) (write-row% row stream *delimiter*) (values)) (defun write-rows (rows &optional (stream *standard-output*)) + "Write `rows` to `stream` as CSV data." (check-delimiter) (loop :with delimiter = *delimiter* :for row :in rows