# HG changeset patch # User Steve Losh # Date 1774298337 14400 # Node ID 57a12bdb713ea546e964dfe141c58f94830b5475 Initial commit diff -r 000000000000 -r 57a12bdb713e .hgignore --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.hgignore Mon Mar 23 16:38:57 2026 -0400 @@ -0,0 +1,1 @@ +build/ diff -r 000000000000 -r 57a12bdb713e Makefile --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Makefile Mon Mar 23 16:38:57 2026 -0400 @@ -0,0 +1,10 @@ +all: build/quick-fastq + + +build/asdf-manifest: Makefile quick-fastq.asd + mkdir -p build/ + sbcl --disable-debugger --quit --eval '(ql:write-asdf-manifest-file "build/asdf-manifest")' + +build/quick-fastq: quick-fastq.lisp build-binary.sh build/asdf-manifest Makefile + mkdir -p build/ + ./build-binary.sh diff -r 000000000000 -r 57a12bdb713e README.markdown --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.markdown Mon Mar 23 16:38:57 2026 -0400 @@ -0,0 +1,81 @@ +Sometimes you just want to make a quick FASTQ file for testing. + +Usage: + +1. Write your FASTQ spec in a `foo.lisp` file (see below for the syntax). +2. `quick-fastq foo.lisp` to dump a random FASTQ on stdout (or `cat foo.lisp | quick-fastq` if you prefer). + +You can also pipe the spec into stdin. + +## Syntax + +`quick-fastq` will read two Common Lisp forms from stdin (using the standard +reader for now, so don't run it on untrusted data). The format of the input is: + + bindings + expr + +`expr` is an expression describing how to generate a random read. + +* A literal string like `"ATCG"` generates those bases with random quality scores. +* An integer like `123` generates that many random bases with random quality scores. +* A vector like `#(expr1 expr2 …)` evaluates each expression and concatenates the results. +* A symbol like `x` looks up the value in the bindings (see below). +* A list performs some operation on the form inside, depending on the symbol at + the head of the list: + * `(qN expr)` where `N` is 0-90 evaluates `expr` and sets its quality scores to `N`, e.g. `(q12 500)` will generate 500 + random bases with a qscore of `12`. + * `(rev expr)` reverses `expr` (you can also use `(r expr)` as a shortcut). + * `(comp expr)` complements `expr` (you can also use `(c expr)` as a shortcut). + * `(revcomp expr)` is equivalent to `(rev (comp expr))` (you can also use `(rc expr)` as a shortcut) + * `(first n expr)` takes the first `n` bases of `expr` (you can also use `(f n expr)` as a shortcut). + * `(last n expr)` takes the last `n` bases of `expr` (you can also use `(l n expr)` as a shortcut). + * `(rep n expr)` concatenates `n` copies of `expr` (you can also use `(tr n expr)` as a shortcut). + * `(snp freq expr)` modifies `expr` to add SNPs at a rate of `freq` (`freq` must be between 0 and 1). + * `(ins freq expr)` modifies `expr` to insert bases at a rate of `freq` (`freq` must be between 0 and 1). + * `(del freq expr)` modifies `expr` to delete bases at a rate of `freq` (`freq` must be between 0 and 1). + * `(err freq expr)` is equivalent to `(ins freq (del freq (snp freq expr)))` (`freq` must be between 0 and 1). + +Bindings must be a (possibly empty) list of bindings, each of the form `(symbol +expr)`. `expr` will be evaluated and bound to `symbol`. Bindings are performed +in order as if by `let*`. Several keyword symbols have special meanings: + +* Binding `:entries` to an integer `n` will generate that many FASTQ entries instead of just a single one. +* Binding `:seed` to an integer will seed the RNG with a specific seed, to make runs reproducible. + +## Examples + +Generate a random 1000bp read: + + () + 1000 + +Generate a read with the same 100bp beginning and end, with 500bp of random +bases in the middle: + + ((x 100)) + #(x 500 x) + +Generate a foldback chimeric read, with the second half having a lower quality +than the first: + + ((x (q40 1000))) + #(x (q20 (revcomp x))) + +Generate a read with a tandem repeat in the middle: + + () + (1000 (rep 200 "ATTT")) + +Generate a foldback chimeric read with a double tandem duplication in the +foldback strand, with simulated sequencing error, and small chunks of +low-quality bases to make the transitions between sections as a hack: + + ((x 1000) + (lq (q1 10)) + (a (first 800 x)) + (b (last 200 x)) + (dup (last 150 a)) + (f #(lq a lq dup lq (rc dup) lq dup lq b))) + + (err 0.01 #(x (revcomp f))) diff -r 000000000000 -r 57a12bdb713e build-binary.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/build-binary.sh Mon Mar 23 16:38:57 2026 -0400 @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +set -euo pipefail + +buildapp \ + --load-system 'quick-fastq' \ + --entry 'quick-fastq:toplevel' \ + --manifest-file 'build/asdf-manifest' \ + --compress-core \ + --output 'build/quick-fastq' diff -r 000000000000 -r 57a12bdb713e quick-fastq.asd --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quick-fastq.asd Mon Mar 23 16:38:57 2026 -0400 @@ -0,0 +1,13 @@ +(asdf:defsystem :quick-fastq + :description "A tool for quickly generating synthetic FASTQ files." + :author "Steve Losh " + :homepage "https://github.com/sjl/quick-fastq" + + :license "GPL-3.0-or-later" + + :depends-on (:alexandria :iterate :losh) + + :serial t + :components ((:file "quick-fastq"))) + + diff -r 000000000000 -r 57a12bdb713e quick-fastq.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/quick-fastq.lisp Mon Mar 23 16:38:57 2026 -0400 @@ -0,0 +1,176 @@ +(defpackage :quick-fastq + (:use :cl :iterate :losh) + (:export :toplevel :build)) + +(in-package :quick-fastq) + +;; data is represented as a conses of (bases . quality-scores) + +(defun phred-char (q) + (code-char (+ (char-code #\!) q))) + +(defun random-qscore () + (phred-char (+ 18 (random 4)))) + +(defun random-base () + (random-elt "ACTG")) + +(defun random-dna (n) + (cons (coerce (gimme n (random-base)) 'string) + (coerce (gimme n (random-qscore)) 'string))) + +(defun rev (dna) + (cons (reverse (car dna)) + (reverse (cdr dna)))) + +(defun complement-base (base) + (ecase base + (#\A #\T) + (#\T #\A) + (#\C #\G) + (#\G #\C))) + +(defun comp (dna) + (cons (map 'string #'complement-base (car dna)) + (cdr dna))) + +(defun copy (dna) + (cons (copy-seq (car dna)) + (copy-seq (cdr dna)))) + +(defun concat (a b) + (cons (concatenate 'string (car a) (car b)) + (concatenate 'string (cdr a) (cdr b)))) + +(defun revcomp (dna) + (rev (comp dna))) + +(defun qual (q dna) + (cons (car dna) + (make-string (length (cdr dna)) :initial-element (phred-char q)))) + +(defun take-first (n dna) + (cons (subseq (car dna) 0 n) + (subseq (cdr dna) 0 n))) + +(defun take-last (n dna &aux (len (length (car dna)))) + (cons (subseq (car dna) (- len n) len) + (subseq (cdr dna) (- len n) len))) + +(defun mutate (base) + (random-elt (ecase base + (#\A "TCG") + (#\T "ACG") + (#\C "ATG") + (#\G "ATC")))) + +(defun add-snp (freq dna) + (iterate (with (seq . qs) = (copy dna)) + (for b :in-string seq :with-index i) + (when (randomp freq) + (setf (aref seq i) (mutate b))) + (returning (cons seq qs)))) + +(defun add-ins (freq dna) + (iterate (for b :in-string (car dna)) + (for q :in-string (cdr dna)) + (collect b :into seq) + (collect q :into qs) + (when (randomp freq) + (collect (random-base) :into seq) + (collect (random-qscore) :into qs)) + (returning (cons (coerce seq 'string) (coerce qs 'string))))) + +(defun add-del (freq dna) + (iterate (for b :in-string (car dna)) + (for q :in-string (cdr dna)) + (unless (randomp freq) + (collect b :into seq) + (collect q :into qs)) + (returning (cons (coerce seq 'string) (coerce qs 'string))))) + +(defun add-err (freq dna) + (add-ins freq (add-del freq (add-snp freq dna)))) + +(defun literal (seq) + (cons (copy-seq seq) + (coerce (gimme (length seq) (random-qscore)) 'string))) + +(defun repeat (n seq) + (reduce #'concat (gimme n seq))) + +(defun run (binding-forms fastq-form) + (let ((bindings (make-hash-table)) + (entries 1)) + (labels ((r (form) (rev (eval-form form))) + (c (form) (comp (eval-form form))) + (rc (form) (revcomp (eval-form form))) + (f (n form) (take-first n (eval-form form))) + (l (n form) (take-last n (eval-form form))) + (q (n form) (qual n (eval-form form))) + (snp (freq form) (add-snp freq (eval-form form))) + (ins (freq form) (add-ins freq (eval-form form))) + (del (freq form) (add-del freq (eval-form form))) + (err (freq form) (add-err freq (eval-form form))) + (rep (n form) (repeat n (eval-form form))) + (eval-form (form) + (typecase form + (integer (random-dna form)) + (string (literal form)) + (vector (reduce #'concat form :key #'eval-form)) + (symbol (gethash form bindings)) + (list (destructuring-bind (op . args) form + (if (char= #\Q (char (symbol-name op) 0)) + (apply #'q + (parse-integer (subseq (symbol-name op) 1)) + args) + (apply (ecase op + ((rep tr) #'rep) + ((rev r) #'r) + ((comp c) #'c) + ((revcomp rc) #'rc) + ((first f) #'f) + ((last l) #'l) + ((snp) #'snp) + ((ins) #'ins) + ((del) #'del) + ((err) #'err)) + args)))) + (t form)))) + + ;; Process the bindings + (dolist (binding-form binding-forms) + (destructuring-bind (symbol form) binding-form + (case symbol + (:seed (setf *random-state* (sb-ext:seed-random-state form))) + (:entries (setf entries form)) + (t (setf (gethash symbol bindings) (eval-form form)))))) + + ;; Process the FASTQ form using the bindings and print it + (loop :repeat entries + :for (seq . qs) = (eval-form fastq-form) + :collect (format nil "@quickfastq~%~A~%+~%~A~%" seq qs))))) + +(defun read-form (stream) + (let ((*package* (find-package :quick-fastq))) + ; todo safe-read this + (read stream))) + +(defun toplevel% (stream) + (let ((binding-forms (read-form stream)) + (fastq-form (read-form stream))) + (map nil #'write-string (run binding-forms fastq-form)))) + +(defun toplevel (argv) + (sb-ext:disable-debugger) + (pop argv) + (when (null argv) + (setf argv (list "-"))) + (setf *random-state* (make-random-state t)) + (assert (= 1 (length argv)) () "USAGE: quick-fastq [PATH]") + (let ((path (first argv))) + (if (string= "-" path) + (toplevel% *standard-input*) + (with-open-file (stream path) + (toplevel% stream))))) +