# HG changeset patch # User Steve Losh # Date 1541278923 14400 # Node ID 8de2e6d7c9d918f1c2e3a3224067004bacfec1de # Parent e3dc1e5b762c9990f364410dc8b15136b0eaf4e2 Inline utils into problem files, add PRTM diff -r e3dc1e5b762c -r 8de2e6d7c9d9 rosalind.asd --- a/rosalind.asd Sat Nov 03 16:39:10 2018 -0400 +++ b/rosalind.asd Sat Nov 03 17:02:03 2018 -0400 @@ -40,5 +40,6 @@ (:file "iev") (:file "fibd") (:file "cons") - (:file "grph"))))))) + (:file "grph") + (:file "prtm"))))))) diff -r e3dc1e5b762c -r 8de2e6d7c9d9 src/problems/grph.lisp --- a/src/problems/grph.lisp Sat Nov 03 16:39:10 2018 -0400 +++ b/src/problems/grph.lisp Sat Nov 03 17:02:03 2018 -0400 @@ -17,6 +17,21 @@ ") +(defun strings-overlap-p (k left right) + "Return whether `left` and `right` overlap (in order) by exactly `k` characters. + + (strings-overlap-p 3 \"abcdef\" + \"defhi\") ; => T + + (strings-overlap-p 2 \"abcdef\" + \"defhi\") ; => NIL + + " + (string= left right + :start1 (- (length left) k) + :end2 k)) + + (define-problem grph (data stream) *input-grph* *output-grph* diff -r e3dc1e5b762c -r 8de2e6d7c9d9 src/problems/prot.lisp --- a/src/problems/prot.lisp Sat Nov 03 16:39:10 2018 -0400 +++ b/src/problems/prot.lisp Sat Nov 03 17:02:03 2018 -0400 @@ -1,5 +1,70 @@ (in-package :rosalind) +(defmacro codon-case ((vector index) &rest clauses) + ;; Compiles a giant list of clauses into a tree of ECASEs. + ;; + ;; Each codon will have at most 3 ECASEs to pass through. Each ECASE has at + ;; most four options, so in the worst case we end up with 3 * 4 = 12 + ;; comparisons instead of 64. + ;; + ;; If we ever convert bases to vectors of (unsigned-byte 2)s we could + ;; potentially use a lookup table here, e.g.: + ;; + ;; (aref +amino-acids+ (+ x (ash y 2) (ash z 4))) + (alexandria:once-only (vector index) + (alexandria:with-gensyms (x y z) + `(let ((,x (aref ,vector ,index)) + (,y (aref ,vector (+ ,index 1))) + (,z (aref ,vector (+ ,index 2)))) + ,(labels ((strip (clauses) + (if (= 1 (length (caar clauses))) + (cadar clauses) + (iterate (for (head body) :in clauses) + (collect (list (subseq head 1) body))))) + (split (clauses) + (-<> clauses + (group-by (rcurry #'aref 0) <> :key #'first) + (iterate (for (k v) :in-hashtable <>) + (collect (list k (strip v))))))) + (recursively ((clauses (split clauses)) + (codons (list x y z)) + (i 0)) + `(ecase ,(first codons) + ,@(iterate (for (k remaining) :in clauses) + (collect `(,k ,(if (atom remaining) + remaining + (recur (split remaining) + (rest codons) + (1+ i))))))))))))) + +(defun codon-to-protein (vector index) + "Return the amino acid encoded by the codon in `vector` at `index`." + (codon-case (vector index) + ("UUU" #\F) ("CUU" #\L) ("AUU" #\I) ("GUU" #\V) + ("UUC" #\F) ("CUC" #\L) ("AUC" #\I) ("GUC" #\V) + ("UUA" #\L) ("CUA" #\L) ("AUA" #\I) ("GUA" #\V) + ("UUG" #\L) ("CUG" #\L) ("AUG" #\M) ("GUG" #\V) + ("UCU" #\S) ("CCU" #\P) ("ACU" #\T) ("GCU" #\A) + ("UCC" #\S) ("CCC" #\P) ("ACC" #\T) ("GCC" #\A) + ("UCA" #\S) ("CCA" #\P) ("ACA" #\T) ("GCA" #\A) + ("UCG" #\S) ("CCG" #\P) ("ACG" #\T) ("GCG" #\A) + ("UAU" #\Y) ("CAU" #\H) ("AAU" #\N) ("GAU" #\D) + ("UAC" #\Y) ("CAC" #\H) ("AAC" #\N) ("GAC" #\D) + ("UAA" nil) ("CAA" #\Q) ("AAA" #\K) ("GAA" #\E) + ("UAG" nil) ("CAG" #\Q) ("AAG" #\K) ("GAG" #\E) + ("UGU" #\C) ("CGU" #\R) ("AGU" #\S) ("GGU" #\G) + ("UGC" #\C) ("CGC" #\R) ("AGC" #\S) ("GGC" #\G) + ("UGA" nil) ("CGA" #\R) ("AGA" #\R) ("GGA" #\G) + ("UGG" #\W) ("CGG" #\R) ("AGG" #\R) ("GGG" #\G))) + +(defun translate (rna &key (start 0)) + "Translate a string of RNA bases into a protein string of amino acids." + (iterate (for i :from (search "AUG" rna :start2 start) :by 3) + (for protein = (codon-to-protein rna i)) + (while protein) + (collect protein :result-type 'string))) + + (define-problem prot (data string) "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA" "MAMAPRTEINSTRING" diff -r e3dc1e5b762c -r 8de2e6d7c9d9 src/problems/prtm.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/prtm.lisp Sat Nov 03 17:02:03 2018 -0400 @@ -0,0 +1,41 @@ +(in-package :rosalind) + +(defconstant +monoisotopic-mass-of-water+ 18.01056d0 + "The monoisotopic mass of a single water molecule, in Daltons.") + +(defun monoisotopic-mass (residue) + ;; todo is a hash table faster here? we could also do an array + ;; starting at (char-code #\A) if we really wanted + (ecase residue + ;; These have to be doubles or we get too much rounding error. It's fine. + (#\A 71.03711d0) + (#\C 103.00919d0) + (#\D 115.02694d0) + (#\E 129.04259d0) + (#\F 147.06841d0) + (#\G 57.02146d0) + (#\H 137.05891d0) + (#\I 113.08406d0) + (#\K 128.09496d0) + (#\L 113.08406d0) + (#\M 131.04049d0) + (#\N 114.04293d0) + (#\P 97.05276d0) + (#\Q 128.05858d0) + (#\R 156.10111d0) + (#\S 87.03203d0) + (#\T 101.04768d0) + (#\V 99.06841d0) + (#\W 186.07931d0) + (#\Y 163.06333d0))) + + +(define-problem prtm (data string) + "SKADYEK" + "821.392" + (-<> data + (delete #\newline <>) + (summation <> :key #'monoisotopic-mass) + (format nil "~,3F" <>))) + + diff -r e3dc1e5b762c -r 8de2e6d7c9d9 src/utils.lisp --- a/src/utils.lisp Sat Nov 03 16:39:10 2018 -0400 +++ b/src/utils.lisp Sat Nov 03 17:02:03 2018 -0400 @@ -23,6 +23,7 @@ (defun pbcopy (string) (values string (sh '("pbcopy") string))) + (defun ensure-stream (input) (ctypecase input (stream input) @@ -33,6 +34,7 @@ (stream (alexandria:read-stream-content-into-string input)) (string (copy-seq input)))) + (defun hamming (sequence1 sequence2 &key (test #'eql)) "Return the Hamming distance between `sequence1` and `sequence2`." ;; todo assert length=? @@ -44,6 +46,7 @@ sequence2) result)) + (defun factorial (x) (check-type x (integer 0)) (iterate (for i :from 1 :to x) @@ -53,20 +56,7 @@ (gathering (alexandria:map-permutations #'gather items))) -(defun strings-overlap-p (k left right) - "Return whether `left` and `right` overlap (in order) by exactly `k` characters. - - (strings-overlap-p 3 \"abcdef\" - \"defhi\") ; => T - - (strings-overlap-p 2 \"abcdef\" - \"defhi\") ; => NIL - - " - (string= left right - :start1 (- (length left) k) - :end2 k)) - +;;;; Iterate ------------------------------------------------------------------ (defmacro-driver (FOR var SEED seed THEN then) "Bind `var` to `seed` initially, then to `then` on every iteration. @@ -160,72 +150,6 @@ (,(if append 'buffer-append 'buffer-push) ,var ,expr))) -;;;; Translation -------------------------------------------------------------- -(defmacro codon-case ((vector index) &rest clauses) - ;; Compiles a giant list of clauses into a tree of ECASEs. - ;; - ;; Each codon will have at most 3 ECASEs to pass through. Each ECASE has at - ;; most four options, so in the worst case we end up with 3 * 4 = 12 - ;; comparisons instead of 64. - ;; - ;; If we ever convert bases to vectors of (unsigned-byte 2)s we could - ;; potentially use a lookup table here, e.g.: - ;; - ;; (aref +amino-acids+ (+ x (ash y 2) (ash z 4))) - (alexandria:once-only (vector index) - (alexandria:with-gensyms (x y z) - `(let ((,x (aref ,vector ,index)) - (,y (aref ,vector (+ ,index 1))) - (,z (aref ,vector (+ ,index 2)))) - ,(labels ((strip (clauses) - (if (= 1 (length (caar clauses))) - (cadar clauses) - (iterate (for (head body) :in clauses) - (collect (list (subseq head 1) body))))) - (split (clauses) - (-<> clauses - (group-by (rcurry #'aref 0) <> :key #'first) - (iterate (for (k v) :in-hashtable <>) - (collect (list k (strip v))))))) - (recursively ((clauses (split clauses)) - (codons (list x y z)) - (i 0)) - `(ecase ,(first codons) - ,@(iterate (for (k remaining) :in clauses) - (collect `(,k ,(if (atom remaining) - remaining - (recur (split remaining) - (rest codons) - (1+ i))))))))))))) - -(defun codon-to-protein (vector index) - "Return the amino acid encoded by the codon in `vector` at `index`." - (codon-case (vector index) - ("UUU" #\F) ("CUU" #\L) ("AUU" #\I) ("GUU" #\V) - ("UUC" #\F) ("CUC" #\L) ("AUC" #\I) ("GUC" #\V) - ("UUA" #\L) ("CUA" #\L) ("AUA" #\I) ("GUA" #\V) - ("UUG" #\L) ("CUG" #\L) ("AUG" #\M) ("GUG" #\V) - ("UCU" #\S) ("CCU" #\P) ("ACU" #\T) ("GCU" #\A) - ("UCC" #\S) ("CCC" #\P) ("ACC" #\T) ("GCC" #\A) - ("UCA" #\S) ("CCA" #\P) ("ACA" #\T) ("GCA" #\A) - ("UCG" #\S) ("CCG" #\P) ("ACG" #\T) ("GCG" #\A) - ("UAU" #\Y) ("CAU" #\H) ("AAU" #\N) ("GAU" #\D) - ("UAC" #\Y) ("CAC" #\H) ("AAC" #\N) ("GAC" #\D) - ("UAA" nil) ("CAA" #\Q) ("AAA" #\K) ("GAA" #\E) - ("UAG" nil) ("CAG" #\Q) ("AAG" #\K) ("GAG" #\E) - ("UGU" #\C) ("CGU" #\R) ("AGU" #\S) ("GGU" #\G) - ("UGC" #\C) ("CGC" #\R) ("AGC" #\S) ("GGC" #\G) - ("UGA" nil) ("CGA" #\R) ("AGA" #\R) ("GGA" #\G) - ("UGG" #\W) ("CGG" #\R) ("AGG" #\R) ("GGG" #\G))) - -(defun translate (rna &key (start 0)) - "Translate a string of RNA bases into a protein string of amino acids." - (iterate (for i :from (search "AUG" rna :start2 start) :by 3) - (for protein = (codon-to-protein rna i)) - (while protein) - (collect protein :result-type 'string))) - - ;;;; File Formats ------------------------------------------------------------- (defun read-fasta (stream) "Read and return the next FASTA label/data pair from `stream`.