b3598dc7ef5c

ORF
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Thu, 08 Nov 2018 21:17:47 -0500
parents b5923704ce42
children 23151d9021cf
branches/tags (none)
files rosalind.asd src/problems/mprt.lisp src/problems/orf.lisp src/problems/prot.lisp src/problems/revc.lisp

Changes

--- a/rosalind.asd	Thu Nov 08 20:36:22 2018 -0500
+++ b/rosalind.asd	Thu Nov 08 21:17:47 2018 -0500
@@ -42,7 +42,9 @@
                                            (:file "iprb")
                                            (:file "lcsm")
                                            (:file "lia")
+                                           (:file "mprt")
                                            (:file "mrna")
+                                           (:file "orf")
                                            (:file "perm")
                                            (:file "prot")
                                            (:file "prtm")
--- a/src/problems/mprt.lisp	Thu Nov 08 20:36:22 2018 -0500
+++ b/src/problems/mprt.lisp	Thu Nov 08 21:17:47 2018 -0500
@@ -1,5 +1,10 @@
 (in-package :rosalind)
 
+;; This was pretty simple, except for discovering that cl-ppcre's all-matches
+;; function skips overlapping matches.  Otherwise we just convert the motif to
+;; a regex and handle grabbing the data from Uniprot (which is straightforward
+;; but can be slow).
+
 (defparameter *input-mprt*
   "A2Z669
 B5ZC00
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/orf.lisp	Thu Nov 08 21:17:47 2018 -0500
@@ -0,0 +1,35 @@
+(in-package :rosalind)
+
+(defparameter *input-orf*
+  ">Rosalind_99
+AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG")
+
+(defparameter *output-orf*
+  "M
+MTPRLGLESLLE
+MGMTPRLGLESLLE
+MLLGSFRLIPKETLIQVAGSSPCNLS")
+
+
+(defun translate-all (rna)
+  "Return all proteins translatable from `rna`, trying every start codon in turn."
+  (iterate
+    (for start :first (search "AUG" rna) :then (search "AUG" rna :start2 (1+ start)))
+    (while start) ; `search` yields nil once no start codon remains
+    (for protein = (translate rna :start start))
+    (when protein (collect protein)))) ; nil = no in-frame stop codon; keep scanning
+
+(define-problem orf (data stream)
+    *input-orf*
+    *output-orf*
+  ;; Candidate proteins come from both strands: the DNA as given and its
+  ;; reverse complement, each transcribed before being translated.
+  (let* ((dna (cdr (first (read-fasta-into-alist data))))
+         (forward (translate-all (transcribe dna)))
+         (backward (translate-all (transcribe (reverse-complement dna))))
+         (unique (remove-duplicates (append forward backward) :test #'string=)))
+    (with-output-to-string (s)
+      ;; One protein per line, ordered by increasing length.
+      (format s "~{~A~^~%~}" (sort unique #'< :key #'length)))))
+
+
--- a/src/problems/prot.lisp	Thu Nov 08 20:36:22 2018 -0500
+++ b/src/problems/prot.lisp	Thu Nov 08 21:17:47 2018 -0500
@@ -58,11 +58,29 @@
     ("UGG" #\W) ("CGG" #\R) ("AGG" #\R) ("GGG" #\G)))
 
 (defun translate (rna &key (start 0))
-  "Translate a string of RNA bases into a protein string of amino acids."
-  (iterate (for i :from (search "AUG" rna :start2 start) :by 3)
-           (for protein = (codon-to-protein rna i))
-           (while protein)
-           (collect protein :result-type 'string)))
+  "Translate a string of RNA bases into a protein string of amino acids.
+
+  `rna` will be searched (beginning at index `start`, inclusive) for a start
+  codon and translation will proceed from there.  If no start codon occurs at
+  or after `start` then `nil` will be returned.
+
+  Once a start codon has been found, translation proceeds codon by codon to
+  the next stop codon in that reading frame.  If the end of `rna` is reached
+  before a stop codon, `nil` will be returned.
+
+  Otherwise two values are returned: the protein string and the index into `rna`
+  where it started."
+  (when-let ((start (search "AUG" rna :start2 start))) ; rebinds `start` to the start codon's index
+    (values
+      (iterate (with limit = (- (length rna) 3)) ; last index at which a full codon fits
+               (for i :from start :by 3)
+               ;; Fewer than three bases remain, so the loop ran off the end of `rna`.
+               (when (> i limit)
+                 (return-from translate (values nil nil)))
+               (for protein = (codon-to-protein rna i))
+               (while protein) ; nil ends the protein (presumably a stop codon -- confirm in codon-to-protein)
+               (collect protein :result-type 'string))
+      start)))
 
 
 (define-problem prot (data string)
--- a/src/problems/revc.lisp	Thu Nov 08 20:36:22 2018 -0500
+++ b/src/problems/revc.lisp	Thu Nov 08 21:17:47 2018 -0500
@@ -28,7 +28,7 @@
 ;; polarized ends, with one end being called 3′ and the other being 5′, but I'm
 ;; not 100% sure.
 
-(defun reverse-complement (dna)
+(defun nreverse-complement (dna)
   (flet ((dna-complement (base)
            (case base
              (#\A #\T)
@@ -38,9 +38,12 @@
     (map-into dna #'dna-complement dna)
     (nreverse dna)))
 
+(defun reverse-complement (dna) ; non-destructive wrapper: `dna` itself is not modified
+  (nreverse-complement (copy-seq dna))) ; the in-place variant only touches the fresh copy
+
 (define-problem revc (data string)
     "AAAACCCGGT"
     "ACCGGGTTTT"
   "Return the reverse complement of `data`."
-  (reverse-complement (delete #\newline data)))
+  (nreverse-complement (delete #\newline data))) ; in-place; NOTE(review): assumes `data` may be mutated (delete may already alter it) -- confirm