4cad0eb1a700

Restructure, do GC
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Thu, 01 Nov 2018 21:10:34 -0400 (2018-11-02)
parents f4ede9fed310
children 6088027147e1
branches/tags (none)
files rosalind.asd src/problems.lisp src/problems/dna.lisp src/problems/gc.lisp src/problems/revc.lisp src/problems/rna.lisp src/utils.lisp vendor/quickutils-package.lisp

Changes

--- a/rosalind.asd	Thu Nov 01 19:46:55 2018 -0400
+++ b/rosalind.asd	Thu Nov 01 21:10:34 2018 -0400
@@ -23,5 +23,9 @@
                (:file "package")
                (:module "src" :serial t
                 :components ((:file "utils")
-                             (:file "problems")))))
+                             (:module "problems"
+                              :components ((:file "dna")
+                                           (:file "rna")
+                                           (:file "revc")
+                                           (:file "gc")))))))
 
--- a/src/problems.lisp	Thu Nov 01 19:46:55 2018 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-(in-package :rosalind)
-
-;;;; Testing ------------------------------------------------------------------
-(defmacro define-test (problem input output &optional (test 'string=))
-  `(test ,(symb 'test- problem)
-     (is (,test ,output (,problem ,input)))))
-
-(defun run-tests ()
-  (1am:run))
-
-
-;;;; Problems -----------------------------------------------------------------
-(defmacro define-problem
-    ((number name) args sample-input sample-output &body body)
-  (let ((symbol (symb 'problem- number)))
-    `(progn
-       (defun ,symbol ,args ,@body)
-       (setf (get ',symbol 'rosalind-name) ,(string-downcase name))
-       (define-test ,symbol ,sample-input ,sample-output)
-       ',symbol)))
-
-
-(define-problem (1 dna) (data)
-    "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
-    "20 12 17 21"
-  ;; A string is simply an ordered collection of symbols selected from some
-  ;; alphabet and formed into a word; the length of a string is the number of
-  ;; symbols that it contains.
-  ;;
-  ;; An example of a length 21 DNA string (whose alphabet contains the symbols
-  ;; 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG."
-  ;;
-  ;; Given: A DNA string s of length at most 1000 nt.
-  ;; Return: Four integers (separated by spaces) counting the respective number
-  ;; of times that the symbols 'A', 'C', 'G', and 'T' occur in s.
-  (let ((results (frequencies data)))
-    (format nil "~D ~D ~D ~D"
-            (gethash #\A results)
-            (gethash #\C results)
-            (gethash #\G results)
-            (gethash #\T results))))
-
-(define-problem (2 rna) (data)
-    "GATGGAACTTGACTACGTAAATT"
-    "GAUGGAACUUGACUACGUAAAUU"
-  ;; An RNA string is a string formed from the alphabet containing 'A', 'C',
-  ;; 'G', and 'U'.
-  ;;
-  ;; Given a DNA string t corresponding to a coding strand, its transcribed RNA
-  ;; string u is formed by replacing all occurrences of 'T' in t with 'U' in u.
-  ;;
-  ;; Given: A DNA string t having length at most 1000 nt.
-  ;;
-  ;; Return: The transcribed RNA string of t.
-  (substitute #\U #\T data))
-
-(define-problem (3 revc) (data)
-    "AAAACCCGGT"
-    "ACCGGGTTTT"
-  ;; In DNA strings, symbols 'A' and 'T' are complements of each other, as are
-  ;; 'C' and 'G'.
-  ;;
-  ;; The reverse complement of a DNA string s is the string sc formed by
-  ;; reversing the symbols of s, then taking the complement of each symbol
-  ;; (e.g., the reverse complement of "GTCA" is "TGAC").
-  ;;
-  ;; Given: A DNA string s of length at most 1000 bp.
-  ;;
-  ;; Return: The reverse complement sc of s.
-  (copyf data)
-  (flet ((dna-complement (base)
-           (case base
-             (#\A #\T)
-             (#\T #\A)
-             (#\G #\C)
-             (#\C #\G)
-             (t base)))) ; newline etc
-    (map-into data #'dna-complement data)
-    (nreverse data)))
-
-
-;;;; Solutions ----------------------------------------------------------------
-(defun read-problem-data (problem)
-  (-<> (get problem 'rosalind-name)
-    (format nil "~~/Downloads/rosalind_~A.txt" <>)
-    read-file-into-string))
-
-(defun solve% (problem)
-  (pbcopy (funcall problem (read-problem-data problem))))
-
-(defmacro solve (problem)
-  `(solve% ',problem))
-
-
-;; (problem-3 "AAAACCCGGT")
-
-;; (solve problem-3)
-
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/dna.lisp	Thu Nov 01 21:10:34 2018 -0400
@@ -0,0 +1,23 @@
+(in-package :rosalind)
+
+;; A string is simply an ordered collection of symbols selected from some
+;; alphabet and formed into a word; the length of a string is the number of
+;; symbols that it contains.
+;;
+;; An example of a length 21 DNA string (whose alphabet contains the symbols
+;; 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG."
+;;
+;; Given: A DNA string s of length at most 1000 nt.  Return: Four integers
+;; (separated by spaces) counting the respective number of times that the
+;; symbols 'A', 'C', 'G', and 'T' occur in s.
+
+(define-problem dna (data)
+    "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
+    "20 12 17 21"
+  (let ((results (frequencies data)))
+    (format nil "~D ~D ~D ~D"
+            (gethash #\A results)
+            (gethash #\C results)
+            (gethash #\G results)
+            (gethash #\T results))))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/gc.lisp	Thu Nov 01 21:10:34 2018 -0400
@@ -0,0 +1,55 @@
+(in-package :rosalind)
+
+;; The GC-content of a DNA string is given by the percentage of symbols in
+;; the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG"
+;; is 37.5%. Note that the reverse complement of any DNA string has the same
+;; GC-content.
+;;
+;; DNA strings must be labeled when they are consolidated into a database.
+;; A commonly used method of string labeling is called FASTA format. In this
+;; format, the string is introduced by a line that begins with '>', followed
+;; by some labeling information. Subsequent lines contain the string itself;
+;; the first line to begin with '>' indicates the label of the next string.
+;;
+;; In Rosalind's implementation, a string in FASTA format will be labeled by
+;; the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000
+;; and 9999.
+;;
+;; Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
+;;
+;; Return: The ID of the string having the highest GC-content, followed by the
+;; GC-content of that string. Rosalind allows for a default error of 0.001 in
+;; all decimal answers unless otherwise stated; please see the note on
+;; absolute error below.
+
+(defparameter *input-gc* ">Rosalind_6404
+CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
+TCCCACTAATAATTCTGAGG
+>Rosalind_5959
+CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
+ATATCCATTTGTCAGCAGACACGC
+>Rosalind_0808
+CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
+TGGGAACCTGCGGGCAGTAGGTGGAAT")
+(defparameter *output-gc* "Rosalind_0808
+60.919540")
+
+
+(define-problem gc (data)
+    *input-gc*
+    *output-gc*
+  (labels ((gcp (base)
+             (or (char= #\G base)
+                 (char= #\C base)))
+           (gc-content (string)
+             (/ (count-if #'gcp string)
+                (length string))))
+    (iterate
+      (for (label dna) :in-fasta data)
+      (for gc = (gc-content dna))
+      (finding (format nil "~A~%~,6F" label (* 100 gc))
+               :maximizing gc))))
+
+
+;; (problem-gc *input-gc*)
+;; (solve problem-gc)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/revc.lisp	Thu Nov 01 21:10:34 2018 -0400
@@ -0,0 +1,27 @@
+(in-package :rosalind)
+
+;; In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C'
+;; and 'G'.
+;;
+;; The reverse complement of a DNA string s is the string sc formed by reversing
+;; the symbols of s, then taking the complement of each symbol (e.g., the
+;; reverse complement of "GTCA" is "TGAC").
+;;
+;; Given: A DNA string s of length at most 1000 bp.
+;;
+;; Return: The reverse complement sc of s.
+
+(define-problem revc (data)
+    "AAAACCCGGT"
+    "ACCGGGTTTT"
+  (copyf data)
+  (flet ((dna-complement (base)
+           (case base
+             (#\A #\T)
+             (#\T #\A)
+             (#\G #\C)
+             (#\C #\G)
+             (t base)))) ; newline etc
+    (map-into data #'dna-complement data)
+    (nreverse data)))
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/rna.lisp	Thu Nov 01 21:10:34 2018 -0400
@@ -0,0 +1,17 @@
+(in-package :rosalind)
+
+;; An RNA string is a string formed from the alphabet containing 'A', 'C', 'G',
+;; and 'U'.
+;;
+;; Given a DNA string t corresponding to a coding strand, its transcribed RNA
+;; string u is formed by replacing all occurrences of 'T' in t with 'U' in u.
+;;
+;; Given: A DNA string t having length at most 1000 nt.
+;;
+;; Return: The transcribed RNA string of t.
+
+(define-problem rna (data)
+    "GATGGAACTTGACTACGTAAATT"
+    "GAUGGAACUUGACUACGUAAAUU"
+  (substitute #\U #\T data))
+
--- a/src/utils.lisp	Thu Nov 01 19:46:55 2018 -0400
+++ b/src/utils.lisp	Thu Nov 01 21:10:34 2018 -0400
@@ -1,6 +1,7 @@
 (in-package :rosalind)
 
 
+;;;; Misc ---------------------------------------------------------------------
 (defun sh (command input)
   (declare (ignorable command input))
   #+sbcl
@@ -25,3 +26,93 @@
 
 (defmacro copyf (sequence)
   `(setf ,sequence (copy-seq ,sequence)))
+
+(defun ensure-stream (input)
+  (ctypecase input
+    (stream input)
+    (string (make-string-input-stream input))))
+
+(defun nconcatenate (v1 v2)
+  (let* ((l1 (length v1))
+         (l2 (length v2))
+         (needed (+ l1 l2)))
+    (when (< (array-total-size v1) needed)
+      (adjust-array v1 (max needed (* l1 2))))
+    (setf (fill-pointer v1) needed)
+    (replace v1 v2 :start1 l1)
+    (values)))
+
+(defun make-buffer (&optional (capacity 64))
+  (make-array capacity :element-type 'character :adjustable t :fill-pointer 0))
+
+(defun round-to (number precision)
+  "Round `number` to the given `precision`.
+
+  Examples:
+
+    (round-to 13 10)      ; => 10
+    (round-to 15 10)      ; => 20
+    (round-to 44 25)      ; => 50
+    (round-to 457/87 1/2) ; => 11/2
+
+  "
+  (* precision (round number precision)))
+
+
+;;;; Testing ------------------------------------------------------------------
+(defmacro define-test (problem input output &optional (test 'string=))
+  `(test ,(symb 'test- problem)
+     (is (,test ,output (,problem ,input)))))
+
+(defun run-tests ()
+  (1am:run))
+
+
+;;;; File Formats -------------------------------------------------------------
+
+(defmacro-driver (FOR vars IN-FASTA source)
+  (nest
+    (destructuring-bind (label line) vars)
+    (with-gensyms (stream l))
+    (let ((kwd (if generate 'generate 'for))))
+    `(progn
+       (with ,label = nil)
+       (with ,stream = (ensure-stream ,source))
+       (,kwd ,line :do-next
+        (labels ((labelp (line)
+                   (char= #\> (aref line 0)))
+                 (parse-next ()
+                   (let ((,l (read-line ,stream nil nil nil)))
+                     (cond
+                       ((null ,l) (terminate))
+                       ((zerop (length ,l)) (parse-next))
+                       ((labelp ,l) (progn (setf ,label (subseq ,l 1)
+                                                 ,line (make-buffer))
+                                           (parse-next)))
+                       (t (progn (nconcatenate ,line ,l)
+                                 (unless (char= #\> (peek-char nil ,stream nil #\>)) ; yuck
+                                   (parse-next))))))))
+          (parse-next))))))
+
+
+
+;;;; Problems -----------------------------------------------------------------
+(defmacro define-problem (name args sample-input sample-output &body body)
+  (let ((symbol (symb 'problem- name)))
+    `(progn
+       (defun ,symbol ,args ,@body)
+       (setf (get ',symbol 'rosalind-name) ,(string-downcase name))
+       (define-test ,symbol ,sample-input ,sample-output)
+       ',symbol)))
+
+(defun read-problem-data (problem)
+  (-<> (get problem 'rosalind-name)
+    (format nil "~~/Downloads/rosalind_~A.txt" <>)
+    read-file-into-string))
+
+(defun solve% (problem)
+  (pbcopy (funcall problem (read-problem-data problem))))
+
+(defmacro solve (problem)
+  `(solve% ',problem))
+
--- a/vendor/quickutils-package.lisp	Thu Nov 01 19:46:55 2018 -0400
+++ b/vendor/quickutils-package.lisp	Thu Nov 01 21:10:34 2018 -0400
@@ -1,10 +1,10 @@
 (eval-when (:compile-toplevel :load-toplevel :execute)
-  (unless (find-package "EULER.QUICKUTILS")
-    (defpackage "EULER.QUICKUTILS"
+  (unless (find-package "ROSALIND.QUICKUTILS")
+    (defpackage "ROSALIND.QUICKUTILS"
       (:documentation "Package that contains Quickutil utility functions.")
       (:use #:cl))))
 
-(in-package "EULER.QUICKUTILS")
+(in-package "ROSALIND.QUICKUTILS")
 
 ;; need to define this here so sbcl will shut the hell up about it being
 ;; undefined when compiling quickutils.lisp.  computers are trash.