57a12bdb713e

Initial commit
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Mon, 23 Mar 2026 16:38:57 -0400
parents
children 02081983e4e7
branches/tags (none)
files .hgignore Makefile README.markdown build-binary.sh quick-fastq.asd quick-fastq.lisp

Changes

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.hgignore	Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,1 @@
+build/
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Makefile	Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,10 @@
+all: build/quick-fastq
+
+
+build/asdf-manifest: Makefile quick-fastq.asd
+	mkdir -p build/
+	sbcl --disable-debugger --quit --eval '(ql:write-asdf-manifest-file "build/asdf-manifest")'
+
+build/quick-fastq: quick-fastq.lisp build-binary.sh build/asdf-manifest Makefile
+	mkdir -p build/
+	./build-binary.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.markdown	Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,81 @@
+Sometimes you just want to make a quick FASTQ file for testing.
+
+Usage:
+
+1. Write your FASTQ spec in a `foo.lisp` file (see below for the syntax).
+2. `quick-fastq foo.lisp` to dump a random FASTQ on stdout (or `cat foo.lisp | quick-fastq` if you prefer).
+
+You can also pipe the spec into stdin.
+
+## Syntax
+
+`quick-fastq` will read two Common Lisp forms from stdin (using the standard
+reader for now, so don't run it on untrusted data).  The format of the input is:
+
+    bindings
+    expr
+
+`expr` is an expression describing how to generate a random read.
+
+* A literal string like `"ATCG"` generates those bases with random quality scores.
+* An integer like `123` generates that many random bases with random quality scores.
+* A vector like `#(expr1 expr2 …)` evaluates each expression and concatenates the results.
+* A symbol like `x` looks up the value in the bindings (see below).
+* A list performs some operation on the form inside, depending on the symbol at
+  the head of the list:
+  * `(qN expr)` where `N` is 0-90 evaluates `expr` and sets its quality scores to `N`, e.g. `(q12 500)` will generate 500
+  random bases with a qscore of `12`.
+  * `(rev expr)` reverses `expr` (you can also use `(r expr)` as a shortcut).
+  * `(comp expr)` complements `expr` (you can also use `(c expr)` as a shortcut).
+  * `(revcomp expr)` is equivalent to `(rev (comp expr))` (you can also use `(rc expr)` as a shortcut)
+  * `(first n expr)` takes the first `n` bases of `expr` (you can also use `(f n expr)` as a shortcut).
+  * `(last n expr)` takes the last `n` bases of `expr` (you can also use `(l n expr)` as a shortcut).
+  * `(rep n expr)` concatenates `n` copies of `expr` (you can also use `(tr n expr)` as a shortcut).
+  * `(snp freq expr)` modifies `expr` to add SNPs at a rate of `freq` (`freq` must be between 0 and 1).
+  * `(ins freq expr)` modifies `expr` to insert bases at a rate of `freq` (`freq` must be between 0 and 1).
+  * `(del freq expr)` modifies `expr` to delete bases at a rate of `freq` (`freq` must be between 0 and 1).
+  * `(err freq expr)` is equivalent to `(ins freq (del freq (snp freq expr)))` (`freq` must be between 0 and 1).
+
+Bindings must be a (possibly empty) list of bindings, each of the form `(symbol
+expr)`.  `expr` will be evaluated and bound to `symbol`.  Bindings are performed
+in order as if by `let*`.  Several keyword symbols have special meanings:
+
+* Binding `:entries` to an integer `n` will generate that many FASTQ entries instead of just a single one.
+* Binding `:seed` to an integer will seed the RNG with a specific seed, to make runs reproducible.
+
+## Examples
+
+Generate a random 1000bp read:
+
+    ()
+    1000
+
+Generate a read with the same 100bp beginning and end, with 500bp of random
+bases in the middle:
+
+    ((x 100))
+    #(x 500 x)
+
+Generate a foldback chimeric read, with the second half having a lower quality
+than the first:
+
+    ((x (q40 1000)))
+    #(x (q20 (revcomp x)))
+
+Generate a read with a tandem repeat in the middle:
+
+    ()
+    (1000 (rep 200 "ATTT"))
+
+Generate a foldback chimeric read with a double tandem duplication in the
+foldback strand, with simulated sequencing error, and small chunks of
+low-quality bases to make the transitions between sections as a hack:
+
+    ((x 1000)
+     (lq (q1 10))
+     (a (first 800 x))
+     (b (last 200 x))
+     (dup (last 150 a))
+     (f #(lq a lq dup lq (rc dup) lq dup lq b)))
+
+    (err 0.01 #(x (revcomp f)))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/build-binary.sh	Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+buildapp \
+        --load-system 'quick-fastq' \
+        --entry 'quick-fastq:toplevel' \
+        --manifest-file 'build/asdf-manifest' \
+        --compress-core \
+        --output 'build/quick-fastq'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quick-fastq.asd	Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,13 @@
+(asdf:defsystem :quick-fastq
+  :description "A tool for quickly generating synthetic FASTQ files."
+  :author "Steve Losh <steve@stevelosh.com>"
+  :homepage "https://github.com/sjl/quick-fastq"
+
+  :license "GPL-3.0-or-later"
+
+  :depends-on (:alexandria :iterate :losh)
+
+  :serial t
+  :components ((:file "quick-fastq")))
+
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/quick-fastq.lisp	Mon Mar 23 16:38:57 2026 -0400
@@ -0,0 +1,176 @@
+(defpackage :quick-fastq
+  (:use :cl :iterate :losh)
+  (:export :toplevel :build))
+
+(in-package :quick-fastq)
+
+;; data is represented as a conses of (bases . quality-scores)
+
+(defun phred-char (q)
+  (code-char (+ (char-code #\!) q)))
+
+(defun random-qscore ()
+  (phred-char (+ 18 (random 4))))
+
+(defun random-base ()
+  (random-elt "ACTG"))
+
+(defun random-dna (n)
+  (cons (coerce (gimme n (random-base)) 'string)
+        (coerce (gimme n (random-qscore)) 'string)))
+
+(defun rev (dna)
+  (cons (reverse (car dna))
+        (reverse (cdr dna))))
+
+(defun complement-base (base)
+  (ecase base
+    (#\A #\T)
+    (#\T #\A)
+    (#\C #\G)
+    (#\G #\C)))
+
+(defun comp (dna)
+  (cons (map 'string #'complement-base (car dna))
+        (cdr dna)))
+
+(defun copy (dna)
+  (cons (copy-seq (car dna))
+        (copy-seq (cdr dna))))
+
+(defun concat (a b)
+  (cons (concatenate 'string (car a) (car b))
+        (concatenate 'string (cdr a) (cdr b))))
+
+(defun revcomp (dna)
+  (rev (comp dna)))
+
+(defun qual (q dna)
+  (cons (car dna)
+        (make-string (length (cdr dna)) :initial-element (phred-char q))))
+
+(defun take-first (n dna)
+  (cons (subseq (car dna) 0 n)
+        (subseq (cdr dna) 0 n)))
+
+(defun take-last (n dna &aux (len (length (car dna))))
+  (cons (subseq (car dna) (- len n) len)
+        (subseq (cdr dna) (- len n) len)))
+
+(defun mutate (base)
+  (random-elt (ecase base
+                (#\A "TCG")
+                (#\T "ACG")
+                (#\C "ATG")
+                (#\G "ATC"))))
+
+(defun add-snp (freq dna)
+  (iterate (with (seq . qs) = (copy dna))
+           (for b :in-string seq :with-index i)
+           (when (randomp freq)
+             (setf (aref seq i) (mutate b)))
+           (returning (cons seq qs))))
+
+(defun add-ins (freq dna)
+  (iterate (for b :in-string (car dna))
+           (for q :in-string (cdr dna))
+           (collect b :into seq)
+           (collect q :into qs)
+           (when (randomp freq)
+             (collect (random-base) :into seq)
+             (collect (random-qscore) :into qs))
+           (returning (cons (coerce seq 'string) (coerce qs 'string)))))
+
+(defun add-del (freq dna)
+  (iterate (for b :in-string (car dna))
+           (for q :in-string (cdr dna))
+           (unless (randomp freq)
+             (collect b :into seq)
+             (collect q :into qs))
+           (returning (cons (coerce seq 'string) (coerce qs 'string)))))
+
+(defun add-err (freq dna)
+  (add-ins freq (add-del freq (add-snp freq dna))))
+
+(defun literal (seq)
+  (cons (copy-seq seq)
+        (coerce (gimme (length seq) (random-qscore)) 'string)))
+
+(defun repeat (n seq)
+  (reduce #'concat (gimme n seq)))
+
+(defun run (binding-forms fastq-form)
+  (let ((bindings (make-hash-table))
+        (entries 1))
+    (labels ((r (form) (rev (eval-form form)))
+             (c (form) (comp (eval-form form)))
+             (rc (form) (revcomp (eval-form form)))
+             (f (n form) (take-first n (eval-form form)))
+             (l (n form) (take-last n (eval-form form)))
+             (q (n form) (qual n (eval-form form)))
+             (snp (freq form) (add-snp freq (eval-form form)))
+             (ins (freq form) (add-ins freq (eval-form form)))
+             (del (freq form) (add-del freq (eval-form form)))
+             (err (freq form) (add-err freq (eval-form form)))
+             (rep (n form) (repeat n (eval-form form)))
+             (eval-form (form)
+               (typecase form
+                 (integer (random-dna form))
+                 (string (literal form))
+                 (vector (reduce #'concat form :key #'eval-form))
+                 (symbol (gethash form bindings))
+                 (list (destructuring-bind (op . args) form
+                         (if (char= #\Q (char (symbol-name op) 0))
+                           (apply #'q
+                                  (parse-integer (subseq (symbol-name op) 1))
+                                  args)
+                           (apply (ecase op
+                                    ((rep tr) #'rep)
+                                    ((rev r) #'r)
+                                    ((comp c) #'c)
+                                    ((revcomp rc) #'rc)
+                                    ((first f) #'f)
+                                    ((last l) #'l)
+                                    ((snp) #'snp)
+                                    ((ins) #'ins)
+                                    ((del) #'del)
+                                    ((err) #'err))
+                                  args))))
+                 (t form))))
+
+      ;; Process the bindings
+      (dolist (binding-form binding-forms)
+        (destructuring-bind (symbol form) binding-form
+          (case symbol
+            (:seed (setf *random-state* (sb-ext:seed-random-state form)))
+            (:entries (setf entries form))
+            (t (setf (gethash symbol bindings) (eval-form form))))))
+
+      ;; Process the FASTQ form using the bindings and print it
+      (loop :repeat entries
+            :for (seq . qs) = (eval-form fastq-form)
+            :collect (format nil "@quickfastq~%~A~%+~%~A~%" seq qs)))))
+
+(defun read-form (stream)
+  (let ((*package* (find-package :quick-fastq)))
+    ; todo safe-read this
+    (read stream)))
+
+(defun toplevel% (stream)
+  (let ((binding-forms (read-form stream))
+        (fastq-form (read-form stream)))
+    (map nil #'write-string (run binding-forms fastq-form))))
+
+(defun toplevel (argv)
+  (sb-ext:disable-debugger)
+  (pop argv)
+  (when (null argv)
+    (setf argv (list "-")))
+  (setf *random-state* (make-random-state t))
+  (assert (= 1 (length argv)) () "USAGE: quick-fastq [PATH]")
+  (let ((path (first argv)))
+    (if (string= "-" path)
+      (toplevel% *standard-input*)
+      (with-open-file (stream path)
+        (toplevel% stream)))))
+