--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,1 @@
+build/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Makefile Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,10 @@
+all: build/quick-fastq
+
+
+build/asdf-manifest: Makefile quick-fastq.asd
+ mkdir -p build/
+ sbcl --disable-debugger --quit --eval '(ql:write-asdf-manifest-file "build/asdf-manifest")'
+
+build/quick-fastq: quick-fastq.lisp build-binary.sh build/asdf-manifest Makefile
+ mkdir -p build/
+ ./build-binary.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.markdown Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,81 @@
+Sometimes you just want to make a quick FASTQ file for testing.
+
+Usage:
+
+1. Write your FASTQ spec in a `foo.lisp` file (see below for the syntax).
+2. `quick-fastq foo.lisp` to dump a random FASTQ on stdout (or `cat foo.lisp | quick-fastq` if you prefer).
+
+You can also pipe the spec into stdin.
+
+## Syntax
+
+`quick-fastq` will read two Common Lisp forms from stdin (using the standard
+reader for now, so don't run it on untrusted data). The format of the input is:
+
+ bindings
+ expr
+
+`expr` is an expression describing how to generate a random read.
+
+* A literal string like `"ATCG"` generates those bases with random quality scores.
+* An integer like `123` generates that many random bases with random quality scores.
+* A vector like `#(expr1 expr2 …)` evaluates each expression and concatenates the results.
+* A symbol like `x` looks up the value in the bindings (see below).
+* A list performs some operation on the form inside, depending on the symbol at
+ the head of the list:
+ * `(qN expr)` where `N` is 0-90 evaluates `expr` and sets its quality scores to `N`, e.g. `(q12 500)` will generate 500
+ random bases with a qscore of `12`.
+ * `(rev expr)` reverses `expr` (you can also use `(r expr)` as a shortcut).
+ * `(comp expr)` complements `expr` (you can also use `(c expr)` as a shortcut).
+ * `(revcomp expr)` is equivalent to `(rev (comp expr))` (you can also use `(rc expr)` as a shortcut)
+ * `(first n expr)` takes the first `n` bases of `expr` (you can also use `(f n expr)` as a shortcut).
+ * `(last n expr)` takes the last `n` bases of `expr` (you can also use `(l n expr)` as a shortcut).
+ * `(rep n expr)` concatenates `n` copies of `expr` (you can also use `(tr n expr)` as a shortcut).
+ * `(snp freq expr)` modifies `expr` to add SNPs at a rate of `freq` (`freq` must be between 0 and 1).
+ * `(ins freq expr)` modifies `expr` to insert bases at a rate of `freq` (`freq` must be between 0 and 1).
+ * `(del freq expr)` modifies `expr` to delete bases at a rate of `freq` (`freq` must be between 0 and 1).
+ * `(err freq expr)` is equivalent to `(ins freq (del freq (snp freq expr)))` (`freq` must be between 0 and 1).
+
+Bindings must be a (possibly empty) list of bindings, each of the form `(symbol
+expr)`. `expr` will be evaluated and bound to `symbol`. Bindings are performed
+in order as if by `let*`. Several keyword symbols have special meanings:
+
+* Binding `:entries` to an integer `n` will generate that many FASTQ entries instead of just a single one.
+* Binding `:seed` to an integer will seed the RNG with a specific seed, to make runs reproducible.
+
+## Examples
+
+Generate a random 1000bp read:
+
+ ()
+ 1000
+
+Generate a read with the same 100bp beginning and end, with 500bp of random
+bases in the middle:
+
+ ((x 100))
+ #(x 500 x)
+
+Generate a foldback chimeric read, with the second half having a lower quality
+than the first:
+
+ ((x (q40 1000)))
+ #(x (q20 (revcomp x)))
+
+Generate a read with a tandem repeat in the middle:
+
+ ()
+ (1000 (rep 200 "ATTT"))
+
+Generate a foldback chimeric read with a double tandem duplication in the
+foldback strand, with simulated sequencing error, and small chunks of
+low-quality bases to make the transitions between sections as a hack:
+
+ ((x 1000)
+ (lq (q1 10))
+ (a (first 800 x))
+ (b (last 200 x))
+ (dup (last 150 a))
+ (f #(lq a lq dup lq (rc dup) lq dup lq b)))
+
+ (err 0.01 #(x (revcomp f)))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/build-binary.sh Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+buildapp \
+ --load-system 'quick-fastq' \
+ --entry 'quick-fastq:toplevel' \
+ --manifest-file 'build/asdf-manifest' \
+ --compress-core \
+ --output 'build/quick-fastq'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quick-fastq.asd Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,13 @@
+(asdf:defsystem :quick-fastq
+ :description "A tool for quickly generating synthetic FASTQ files."
+ :author "Steve Losh <steve@stevelosh.com>"
+ :homepage "https://github.com/sjl/quick-fastq"
+
+ :license "GPL-3.0-or-later"
+
+ :depends-on (:alexandria :iterate :losh)
+
+ :serial t
+ :components ((:file "quick-fastq")))
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/quick-fastq.lisp Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,176 @@
+(defpackage :quick-fastq
+ (:use :cl :iterate :losh)
+ (:export :toplevel :build))
+
+(in-package :quick-fastq)
+
+;; data is represented as a conses of (bases . quality-scores)
+
+(defun phred-char (q)
+ (code-char (+ (char-code #\!) q)))
+
+(defun random-qscore ()
+ (phred-char (+ 18 (random 4))))
+
+(defun random-base ()
+ (random-elt "ACTG"))
+
+(defun random-dna (n)
+ (cons (coerce (gimme n (random-base)) 'string)
+ (coerce (gimme n (random-qscore)) 'string)))
+
+(defun rev (dna)
+ (cons (reverse (car dna))
+ (reverse (cdr dna))))
+
+(defun complement-base (base)
+ (ecase base
+ (#\A #\T)
+ (#\T #\A)
+ (#\C #\G)
+ (#\G #\C)))
+
+(defun comp (dna)
+ (cons (map 'string #'complement-base (car dna))
+ (cdr dna)))
+
+(defun copy (dna)
+ (cons (copy-seq (car dna))
+ (copy-seq (cdr dna))))
+
+(defun concat (a b)
+ (cons (concatenate 'string (car a) (car b))
+ (concatenate 'string (cdr a) (cdr b))))
+
+(defun revcomp (dna)
+ (rev (comp dna)))
+
+(defun qual (q dna)
+ (cons (car dna)
+ (make-string (length (cdr dna)) :initial-element (phred-char q))))
+
+(defun take-first (n dna)
+ (cons (subseq (car dna) 0 n)
+ (subseq (cdr dna) 0 n)))
+
+(defun take-last (n dna &aux (len (length (car dna))))
+ (cons (subseq (car dna) (- len n) len)
+ (subseq (cdr dna) (- len n) len)))
+
+(defun mutate (base)
+ (random-elt (ecase base
+ (#\A "TCG")
+ (#\T "ACG")
+ (#\C "ATG")
+ (#\G "ATC"))))
+
+(defun add-snp (freq dna)
+ (iterate (with (seq . qs) = (copy dna))
+ (for b :in-string seq :with-index i)
+ (when (randomp freq)
+ (setf (aref seq i) (mutate b)))
+ (returning (cons seq qs))))
+
+(defun add-ins (freq dna)
+ (iterate (for b :in-string (car dna))
+ (for q :in-string (cdr dna))
+ (collect b :into seq)
+ (collect q :into qs)
+ (when (randomp freq)
+ (collect (random-base) :into seq)
+ (collect (random-qscore) :into qs))
+ (returning (cons (coerce seq 'string) (coerce qs 'string)))))
+
+(defun add-del (freq dna)
+ (iterate (for b :in-string (car dna))
+ (for q :in-string (cdr dna))
+ (unless (randomp freq)
+ (collect b :into seq)
+ (collect q :into qs))
+ (returning (cons (coerce seq 'string) (coerce qs 'string)))))
+
+(defun add-err (freq dna)
+ (add-ins freq (add-del freq (add-snp freq dna))))
+
+(defun literal (seq)
+ (cons (copy-seq seq)
+ (coerce (gimme (length seq) (random-qscore)) 'string)))
+
+(defun repeat (n seq)
+ (reduce #'concat (gimme n seq)))
+
+(defun run (binding-forms fastq-form)
+ (let ((bindings (make-hash-table))
+ (entries 1))
+ (labels ((r (form) (rev (eval-form form)))
+ (c (form) (comp (eval-form form)))
+ (rc (form) (revcomp (eval-form form)))
+ (f (n form) (take-first n (eval-form form)))
+ (l (n form) (take-last n (eval-form form)))
+ (q (n form) (qual n (eval-form form)))
+ (snp (freq form) (add-snp freq (eval-form form)))
+ (ins (freq form) (add-ins freq (eval-form form)))
+ (del (freq form) (add-del freq (eval-form form)))
+ (err (freq form) (add-err freq (eval-form form)))
+ (rep (n form) (repeat n (eval-form form)))
+ (eval-form (form)
+ (typecase form
+ (integer (random-dna form))
+ (string (literal form))
+ (vector (reduce #'concat form :key #'eval-form))
+ (symbol (gethash form bindings))
+ (list (destructuring-bind (op . args) form
+ (if (char= #\Q (char (symbol-name op) 0))
+ (apply #'q
+ (parse-integer (subseq (symbol-name op) 1))
+ args)
+ (apply (ecase op
+ ((rep tr) #'rep)
+ ((rev r) #'r)
+ ((comp c) #'c)
+ ((revcomp rc) #'rc)
+ ((first f) #'f)
+ ((last l) #'l)
+ ((snp) #'snp)
+ ((ins) #'ins)
+ ((del) #'del)
+ ((err) #'err))
+ args))))
+ (t form))))
+
+ ;; Process the bindings
+ (dolist (binding-form binding-forms)
+ (destructuring-bind (symbol form) binding-form
+ (case symbol
+ (:seed (setf *random-state* (sb-ext:seed-random-state form)))
+ (:entries (setf entries form))
+ (t (setf (gethash symbol bindings) (eval-form form))))))
+
+ ;; Process the FASTQ form using the bindings and print it
+ (loop :repeat entries
+ :for (seq . qs) = (eval-form fastq-form)
+ :collect (format nil "@quickfastq~%~A~%+~%~A~%" seq qs)))))
+
+(defun read-form (stream)
+ (let ((*package* (find-package :quick-fastq)))
+ ; todo safe-read this
+ (read stream)))
+
+(defun toplevel% (stream)
+ (let ((binding-forms (read-form stream))
+ (fastq-form (read-form stream)))
+ (map nil #'write-string (run binding-forms fastq-form))))
+
+(defun toplevel (argv)
+ (sb-ext:disable-debugger)
+ (pop argv)
+ (when (null argv)
+ (setf argv (list "-")))
+ (setf *random-state* (make-random-state t))
+ (assert (= 1 (length argv)) () "USAGE: quick-fastq [PATH]")
+ (let ((path (first argv)))
+ (if (string= "-" path)
+ (toplevel% *standard-input*)
+ (with-open-file (stream path)
+ (toplevel% stream)))))
+