--- a/rosalind.asd Sat Nov 03 16:39:10 2018 -0400
+++ b/rosalind.asd Sat Nov 03 17:02:03 2018 -0400
@@ -40,5 +40,6 @@
(:file "iev")
(:file "fibd")
(:file "cons")
- (:file "grph")))))))
+ (:file "grph")
+ (:file "prtm")))))))
--- a/src/problems/grph.lisp Sat Nov 03 16:39:10 2018 -0400
+++ b/src/problems/grph.lisp Sat Nov 03 17:02:03 2018 -0400
@@ -17,6 +17,21 @@
")
+(defun strings-overlap-p (k left right)
+ "Return whether `left` and `right` overlap (in order) by exactly `k` characters.
+
+ (strings-overlap-p 3 \"abcdef\"
+ \"defhi\") ; => T
+
+ (strings-overlap-p 2 \"abcdef\"
+ \"defhi\") ; => NIL
+
+ "
+ (string= left right
+ :start1 (- (length left) k)
+ :end2 k))
+
+
(define-problem grph (data stream)
*input-grph*
*output-grph*
--- a/src/problems/prot.lisp Sat Nov 03 16:39:10 2018 -0400
+++ b/src/problems/prot.lisp Sat Nov 03 17:02:03 2018 -0400
@@ -1,5 +1,70 @@
(in-package :rosalind)
+(defmacro codon-case ((vector index) &rest clauses)
+ ;; Compiles a giant list of clauses into a tree of ECASEs.
+ ;;
+ ;; Each codon will have at most 3 ECASEs to pass through. Each ECASE has at
+ ;; most four options, so in the worst case we end up with 3 * 4 = 12
+ ;; comparisons instead of 64.
+ ;;
+ ;; If we ever convert bases to vectors of (unsigned-byte 2)s we could
+ ;; potentially use a lookup table here, e.g.:
+ ;;
+ ;; (aref +amino-acids+ (+ x (ash y 2) (ash z 4)))
+ (alexandria:once-only (vector index)
+ (alexandria:with-gensyms (x y z)
+ `(let ((,x (aref ,vector ,index))
+ (,y (aref ,vector (+ ,index 1)))
+ (,z (aref ,vector (+ ,index 2))))
+ ,(labels ((strip (clauses)
+ (if (= 1 (length (caar clauses)))
+ (cadar clauses)
+ (iterate (for (head body) :in clauses)
+ (collect (list (subseq head 1) body)))))
+ (split (clauses)
+ (-<> clauses
+ (group-by (rcurry #'aref 0) <> :key #'first)
+ (iterate (for (k v) :in-hashtable <>)
+ (collect (list k (strip v)))))))
+ (recursively ((clauses (split clauses))
+ (codons (list x y z))
+ (i 0))
+ `(ecase ,(first codons)
+ ,@(iterate (for (k remaining) :in clauses)
+ (collect `(,k ,(if (atom remaining)
+ remaining
+ (recur (split remaining)
+ (rest codons)
+ (1+ i)))))))))))))
+
+(defun codon-to-protein (vector index)
+ "Return the amino acid encoded by the codon in `vector` at `index`."
+ (codon-case (vector index)
+ ("UUU" #\F) ("CUU" #\L) ("AUU" #\I) ("GUU" #\V)
+ ("UUC" #\F) ("CUC" #\L) ("AUC" #\I) ("GUC" #\V)
+ ("UUA" #\L) ("CUA" #\L) ("AUA" #\I) ("GUA" #\V)
+ ("UUG" #\L) ("CUG" #\L) ("AUG" #\M) ("GUG" #\V)
+ ("UCU" #\S) ("CCU" #\P) ("ACU" #\T) ("GCU" #\A)
+ ("UCC" #\S) ("CCC" #\P) ("ACC" #\T) ("GCC" #\A)
+ ("UCA" #\S) ("CCA" #\P) ("ACA" #\T) ("GCA" #\A)
+ ("UCG" #\S) ("CCG" #\P) ("ACG" #\T) ("GCG" #\A)
+ ("UAU" #\Y) ("CAU" #\H) ("AAU" #\N) ("GAU" #\D)
+ ("UAC" #\Y) ("CAC" #\H) ("AAC" #\N) ("GAC" #\D)
+ ("UAA" nil) ("CAA" #\Q) ("AAA" #\K) ("GAA" #\E)
+ ("UAG" nil) ("CAG" #\Q) ("AAG" #\K) ("GAG" #\E)
+ ("UGU" #\C) ("CGU" #\R) ("AGU" #\S) ("GGU" #\G)
+ ("UGC" #\C) ("CGC" #\R) ("AGC" #\S) ("GGC" #\G)
+ ("UGA" nil) ("CGA" #\R) ("AGA" #\R) ("GGA" #\G)
+ ("UGG" #\W) ("CGG" #\R) ("AGG" #\R) ("GGG" #\G)))
+
+(defun translate (rna &key (start 0))
+ "Translate a string of RNA bases into a protein string of amino acids."
+ (iterate (for i :from (search "AUG" rna :start2 start) :by 3)
+ (for protein = (codon-to-protein rna i))
+ (while protein)
+ (collect protein :result-type 'string)))
+
+
(define-problem prot (data string)
"AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"
"MAMAPRTEINSTRING"
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/prtm.lisp Sat Nov 03 17:02:03 2018 -0400
@@ -0,0 +1,41 @@
+(in-package :rosalind)
+
+(defconstant +monoisotopic-mass-of-water+ 18.01056d0
+ "The monoisotopic mass of a single water molecule, in Daltons.")
+
+(defun monoisotopic-mass (residue)
+ ;; todo is a hash table faster here? we could also do an array
+ ;; starting at (char-code #\A) if we really wanted
+ (ecase residue
+ ;; These have to be doubles or we get too much rounding error. It's fine.
+ (#\A 71.03711d0)
+ (#\C 103.00919d0)
+ (#\D 115.02694d0)
+ (#\E 129.04259d0)
+ (#\F 147.06841d0)
+ (#\G 57.02146d0)
+ (#\H 137.05891d0)
+ (#\I 113.08406d0)
+ (#\K 128.09496d0)
+ (#\L 113.08406d0)
+ (#\M 131.04049d0)
+ (#\N 114.04293d0)
+ (#\P 97.05276d0)
+ (#\Q 128.05858d0)
+ (#\R 156.10111d0)
+ (#\S 87.03203d0)
+ (#\T 101.04768d0)
+ (#\V 99.06841d0)
+ (#\W 186.07931d0)
+ (#\Y 163.06333d0)))
+
+
+(define-problem prtm (data string)
+ "SKADYEK"
+ "821.392"
+ (-<> data
+ (delete #\newline <>)
+ (summation <> :key #'monoisotopic-mass)
+ (format nil "~,3F" <>)))
+
+
--- a/src/utils.lisp Sat Nov 03 16:39:10 2018 -0400
+++ b/src/utils.lisp Sat Nov 03 17:02:03 2018 -0400
@@ -23,6 +23,7 @@
(defun pbcopy (string)
(values string (sh '("pbcopy") string)))
+
(defun ensure-stream (input)
(ctypecase input
(stream input)
@@ -33,6 +34,7 @@
(stream (alexandria:read-stream-content-into-string input))
(string (copy-seq input))))
+
(defun hamming (sequence1 sequence2 &key (test #'eql))
"Return the Hamming distance between `sequence1` and `sequence2`."
;; todo assert length=?
@@ -44,6 +46,7 @@
sequence2)
result))
+
(defun factorial (x)
(check-type x (integer 0))
(iterate (for i :from 1 :to x)
@@ -53,20 +56,7 @@
(gathering (alexandria:map-permutations #'gather items)))
-(defun strings-overlap-p (k left right)
- "Return whether `left` and `right` overlap (in order) by exactly `k` characters.
-
- (strings-overlap-p 3 \"abcdef\"
- \"defhi\") ; => T
-
- (strings-overlap-p 2 \"abcdef\"
- \"defhi\") ; => NIL
-
- "
- (string= left right
- :start1 (- (length left) k)
- :end2 k))
-
+;;;; Iterate ------------------------------------------------------------------
(defmacro-driver (FOR var SEED seed THEN then)
"Bind `var` to `seed` initially, then to `then` on every iteration.
@@ -160,72 +150,6 @@
(,(if append 'buffer-append 'buffer-push) ,var ,expr)))
-;;;; Translation --------------------------------------------------------------
-(defmacro codon-case ((vector index) &rest clauses)
- ;; Compiles a giant list of clauses into a tree of ECASEs.
- ;;
- ;; Each codon will have at most 3 ECASEs to pass through. Each ECASE has at
- ;; most four options, so in the worst case we end up with 3 * 4 = 12
- ;; comparisons instead of 64.
- ;;
- ;; If we ever convert bases to vectors of (unsigned-byte 2)s we could
- ;; potentially use a lookup table here, e.g.:
- ;;
- ;; (aref +amino-acids+ (+ x (ash y 2) (ash z 4)))
- (alexandria:once-only (vector index)
- (alexandria:with-gensyms (x y z)
- `(let ((,x (aref ,vector ,index))
- (,y (aref ,vector (+ ,index 1)))
- (,z (aref ,vector (+ ,index 2))))
- ,(labels ((strip (clauses)
- (if (= 1 (length (caar clauses)))
- (cadar clauses)
- (iterate (for (head body) :in clauses)
- (collect (list (subseq head 1) body)))))
- (split (clauses)
- (-<> clauses
- (group-by (rcurry #'aref 0) <> :key #'first)
- (iterate (for (k v) :in-hashtable <>)
- (collect (list k (strip v)))))))
- (recursively ((clauses (split clauses))
- (codons (list x y z))
- (i 0))
- `(ecase ,(first codons)
- ,@(iterate (for (k remaining) :in clauses)
- (collect `(,k ,(if (atom remaining)
- remaining
- (recur (split remaining)
- (rest codons)
- (1+ i)))))))))))))
-
-(defun codon-to-protein (vector index)
- "Return the amino acid encoded by the codon in `vector` at `index`."
- (codon-case (vector index)
- ("UUU" #\F) ("CUU" #\L) ("AUU" #\I) ("GUU" #\V)
- ("UUC" #\F) ("CUC" #\L) ("AUC" #\I) ("GUC" #\V)
- ("UUA" #\L) ("CUA" #\L) ("AUA" #\I) ("GUA" #\V)
- ("UUG" #\L) ("CUG" #\L) ("AUG" #\M) ("GUG" #\V)
- ("UCU" #\S) ("CCU" #\P) ("ACU" #\T) ("GCU" #\A)
- ("UCC" #\S) ("CCC" #\P) ("ACC" #\T) ("GCC" #\A)
- ("UCA" #\S) ("CCA" #\P) ("ACA" #\T) ("GCA" #\A)
- ("UCG" #\S) ("CCG" #\P) ("ACG" #\T) ("GCG" #\A)
- ("UAU" #\Y) ("CAU" #\H) ("AAU" #\N) ("GAU" #\D)
- ("UAC" #\Y) ("CAC" #\H) ("AAC" #\N) ("GAC" #\D)
- ("UAA" nil) ("CAA" #\Q) ("AAA" #\K) ("GAA" #\E)
- ("UAG" nil) ("CAG" #\Q) ("AAG" #\K) ("GAG" #\E)
- ("UGU" #\C) ("CGU" #\R) ("AGU" #\S) ("GGU" #\G)
- ("UGC" #\C) ("CGC" #\R) ("AGC" #\S) ("GGC" #\G)
- ("UGA" nil) ("CGA" #\R) ("AGA" #\R) ("GGA" #\G)
- ("UGG" #\W) ("CGG" #\R) ("AGG" #\R) ("GGG" #\G)))
-
-(defun translate (rna &key (start 0))
- "Translate a string of RNA bases into a protein string of amino acids."
- (iterate (for i :from (search "AUG" rna :start2 start) :by 3)
- (for protein = (codon-to-protein rna i))
- (while protein)
- (collect protein :result-type 'string)))
-
-
;;;; File Formats -------------------------------------------------------------
(defun read-fasta (stream)
"Read and return the next FASTA label/data pair from `stream`.