# HG changeset patch # User Steve Losh # Date 1541121034 14400 # Node ID 4cad0eb1a700fceaab679ad17b35761921f2dfc6 # Parent f4ede9fed310d1c1de3d4c372d4e0f3ed6d71f99 Restructure, do GC diff -r f4ede9fed310 -r 4cad0eb1a700 rosalind.asd --- a/rosalind.asd Thu Nov 01 19:46:55 2018 -0400 +++ b/rosalind.asd Thu Nov 01 21:10:34 2018 -0400 @@ -23,5 +23,9 @@ (:file "package") (:module "src" :serial t :components ((:file "utils") - (:file "problems"))))) + (:module "problems" + :components ((:file "dna") + (:file "rna") + (:file "revc") + (:file "gc"))))))) diff -r f4ede9fed310 -r 4cad0eb1a700 src/problems.lisp --- a/src/problems.lisp Thu Nov 01 19:46:55 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -(in-package :rosalind) - -;;;; Testing ------------------------------------------------------------------ -(defmacro define-test (problem input output &optional (test 'string=)) - `(test ,(symb 'test- problem) - (is (,test ,output (,problem ,input))))) - -(defun run-tests () - (1am:run)) - - -;;;; Problems ----------------------------------------------------------------- -(defmacro define-problem - ((number name) args sample-input sample-output &body body) - (let ((symbol (symb 'problem- number))) - `(progn - (defun ,symbol ,args ,@body) - (setf (get ',symbol 'rosalind-name) ,(string-downcase name)) - (define-test ,symbol ,sample-input ,sample-output) - ',symbol))) - - -(define-problem (1 dna) (data) - "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC" - "20 12 17 21" - ;; A string is simply an ordered collection of symbols selected from some - ;; alphabet and formed into a word; the length of a string is the number of - ;; symbols that it contains. - ;; - ;; An example of a length 21 DNA string (whose alphabet contains the symbols - ;; 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG." - ;; - ;; Given: A DNA string s of length at most 1000 nt. - ;; Return: Four integers (separated by spaces) counting the respective number - ;; of times that the symbols 'A', 'C', 'G', and 'T' occur in s. - (let ((results (frequencies data))) - (format nil "~D ~D ~D ~D" - (gethash #\A results) - (gethash #\C results) - (gethash #\G results) - (gethash #\T results)))) - -(define-problem (2 rna) (data) - "GATGGAACTTGACTACGTAAATT" - "GAUGGAACUUGACUACGUAAAUU" - ;; An RNA string is a string formed from the alphabet containing 'A', 'C', - ;; 'G', and 'U'. - ;; - ;; Given a DNA string t corresponding to a coding strand, its transcribed RNA - ;; string u is formed by replacing all occurrences of 'T' in t with 'U' in u. - ;; - ;; Given: A DNA string t having length at most 1000 nt. - ;; - ;; Return: The transcribed RNA string of t. - (substitute #\U #\T data)) - -(define-problem (3 revc) (data) - "AAAACCCGGT" - "ACCGGGTTTT" - ;; In DNA strings, symbols 'A' and 'T' are complements of each other, as are - ;; 'C' and 'G'. - ;; - ;; The reverse complement of a DNA string s is the string sc formed by - ;; reversing the symbols of s, then taking the complement of each symbol - ;; (e.g., the reverse complement of "GTCA" is "TGAC"). - ;; - ;; Given: A DNA string s of length at most 1000 bp. - ;; - ;; Return: The reverse complement sc of s. - (copyf data) - (flet ((dna-complement (base) - (case base - (#\A #\T) - (#\T #\A) - (#\G #\C) - (#\C #\G) - (t base)))) ; newline etc - (map-into data #'dna-complement data) - (nreverse data))) - - -;;;; Solutions ---------------------------------------------------------------- -(defun read-problem-data (problem) - (-<> (get problem 'rosalind-name) - (format nil "~~/Downloads/rosalind_~A.txt" <>) - read-file-into-string)) - -(defun solve% (problem) - (pbcopy (funcall problem (read-problem-data problem)))) - -(defmacro solve (problem) - `(solve% ',problem)) - - -;; (problem-3 "AAAACCCGGT") - -;; (solve problem-3) - - diff -r f4ede9fed310 -r 4cad0eb1a700 src/problems/dna.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/dna.lisp Thu Nov 01 21:10:34 2018 -0400 @@ -0,0 +1,23 @@ +(in-package :rosalind) + +;; A string is simply an ordered collection of symbols selected from some +;; alphabet and formed into a word; the length of a string is the number of +;; symbols that it contains. +;; +;; An example of a length 21 DNA string (whose alphabet contains the symbols +;; 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG." +;; +;; Given: A DNA string s of length at most 1000 nt. Return: Four integers +;; (separated by spaces) counting the respective number of times that the +;; symbols 'A', 'C', 'G', and 'T' occur in s. + +(define-problem dna (data) + "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC" + "20 12 17 21" + (let ((results (frequencies data))) + (format nil "~D ~D ~D ~D" + (gethash #\A results) + (gethash #\C results) + (gethash #\G results) + (gethash #\T results)))) + diff -r f4ede9fed310 -r 4cad0eb1a700 src/problems/gc.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/gc.lisp Thu Nov 01 21:10:34 2018 -0400 @@ -0,0 +1,55 @@ +(in-package :rosalind) + +;; The GC-content of a DNA string is given by the percentage of symbols in +;; the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" +;; is 37.5%. Note that the reverse complement of any DNA string has the same +;; GC-content. +;; +;; DNA strings must be labeled when they are consolidated into a database. +;; A commonly used method of string labeling is called FASTA format. In this +;; format, the string is introduced by a line that begins with '>', followed +;; by some labeling information. Subsequent lines contain the string itself; +;; the first line to begin with '>' indicates the label of the next string. +;; +;; In Rosalind's implementation, a string in FASTA format will be labeled by +;; the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 +;; and 9999. +;; +;; Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each). +;; +;; Return: The ID of the string having the highest GC-content, followed by the +;; GC-content of that string. Rosalind allows for a default error of 0.001 in +;; all decimal answers unless otherwise stated; please see the note on +;; absolute error below. + +(defparameter *input-gc* ">Rosalind_6404 +CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC +TCCCACTAATAATTCTGAGG +>Rosalind_5959 +CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT +ATATCCATTTGTCAGCAGACACGC +>Rosalind_0808 +CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC +TGGGAACCTGCGGGCAGTAGGTGGAAT") +(defparameter *output-gc* "Rosalind_0808 +60.919540") + + +(define-problem gc (data) + *input-gc* + *output-gc* + (labels ((gcp (base) + (or (char= #\G base) + (char= #\C base))) + (gc-content (string) + (/ (count-if #'gcp string) + (length string)))) + (iterate + (for (label dna) :in-fasta data) + (for gc = (gc-content dna)) + (finding (format nil "~A~%~,6F" label (* 100 gc)) + :maximizing gc)))) + + +;; (problem-gc *input-gc*) +;; (solve problem-gc) diff -r f4ede9fed310 -r 4cad0eb1a700 src/problems/revc.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/revc.lisp Thu Nov 01 21:10:34 2018 -0400 @@ -0,0 +1,27 @@ +(in-package :rosalind) + +;; In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' +;; and 'G'. +;; +;; The reverse complement of a DNA string s is the string sc formed by reversing +;; the symbols of s, then taking the complement of each symbol (e.g., the +;; reverse complement of "GTCA" is "TGAC"). +;; +;; Given: A DNA string s of length at most 1000 bp. +;; +;; Return: The reverse complement sc of s. + +(define-problem revc (data) + "AAAACCCGGT" + "ACCGGGTTTT" + (copyf data) + (flet ((dna-complement (base) + (case base + (#\A #\T) + (#\T #\A) + (#\G #\C) + (#\C #\G) + (t base)))) ; newline etc + (map-into data #'dna-complement data) + (nreverse data))) + diff -r f4ede9fed310 -r 4cad0eb1a700 src/problems/rna.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/rna.lisp Thu Nov 01 21:10:34 2018 -0400 @@ -0,0 +1,17 @@ +(in-package :rosalind) + +;; An RNA string is a string formed from the alphabet containing 'A', 'C', 'G', +;; and 'U'. +;; +;; Given a DNA string t corresponding to a coding strand, its transcribed RNA +;; string u is formed by replacing all occurrences of 'T' in t with 'U' in u. +;; +;; Given: A DNA string t having length at most 1000 nt. +;; +;; Return: The transcribed RNA string of t. + +(define-problem rna (data) + "GATGGAACTTGACTACGTAAATT" + "GAUGGAACUUGACUACGUAAAUU" + (substitute #\U #\T data)) + diff -r f4ede9fed310 -r 4cad0eb1a700 src/utils.lisp --- a/src/utils.lisp Thu Nov 01 19:46:55 2018 -0400 +++ b/src/utils.lisp Thu Nov 01 21:10:34 2018 -0400 @@ -1,6 +1,7 @@ (in-package :rosalind) +;;;; Misc --------------------------------------------------------------------- (defun sh (command input) (declare (ignorable command input)) #+sbcl @@ -25,3 +26,93 @@ (defmacro copyf (sequence) `(setf ,sequence (copy-seq ,sequence))) + +(defun ensure-stream (input) + (ctypecase input + (stream input) + (string (make-string-input-stream input)))) + +(defun nconcatenate (v1 v2) + (let* ((l1 (length v1)) + (l2 (length v2)) + (needed (+ l1 l2))) + (when (< (array-total-size v1) needed) + (adjust-array v1 (max needed (* l1 2)))) + (setf (fill-pointer v1) needed) + (replace v1 v2 :start1 l1) + (values))) + +(defun make-buffer (&optional (capacity 64)) + (make-array capacity :element-type 'character :adjustable t :fill-pointer 0)) + +(defun round-to (number precision) + "Round `number` to the given `precision`. + + Examples: + + (round-to 13 10) ; => 10 + (round-to 15 10) ; => 20 + (round-to 44 25) ; => 50 + (round-to 457/87 1/2) ; => 11/2 + + " + (* precision (round number precision))) + + +;;;; Testing ------------------------------------------------------------------ +(defmacro define-test (problem input output &optional (test 'string=)) + `(test ,(symb 'test- problem) + (is (,test ,output (,problem ,input))))) + +(defun run-tests () + (1am:run)) + + +;;;; File Formats ------------------------------------------------------------- + +(defmacro-driver (FOR vars IN-FASTA source) + (nest + (destructuring-bind (label line) vars) + (with-gensyms (stream l)) + (let ((kwd (if generate 'generate 'for)))) + `(progn + (with ,label = nil) + (with ,stream = (ensure-stream ,source)) + (,kwd ,line :do-next + (labels ((labelp (line) + (char= #\> (aref line 0))) + (parse-next () + (let ((,l (read-line ,stream nil nil nil))) + (cond + ((null ,l) (terminate)) + ((zerop (length ,l)) (parse-next)) + ((labelp ,l) (progn (setf ,label (subseq ,l 1) + ,line (make-buffer)) + (parse-next))) + (t (progn (nconcatenate ,line ,l) + (unless (char= #\> (peek-char nil ,stream nil #\>)) ; yuck + (parse-next)))))))) + (parse-next)))))) + + + +;;;; Problems ----------------------------------------------------------------- +(defmacro define-problem (name args sample-input sample-output &body body) + (let ((symbol (symb 'problem- name))) + `(progn + (defun ,symbol ,args ,@body) + (setf (get ',symbol 'rosalind-name) ,(string-downcase name)) + (define-test ,symbol ,sample-input ,sample-output) + ',symbol))) + +(defun read-problem-data (problem) + (-<> (get problem 'rosalind-name) + (format nil "~~/Downloads/rosalind_~A.txt" <>) + read-file-into-string)) + +(defun solve% (problem) + (pbcopy (funcall problem (read-problem-data problem)))) + +(defmacro solve (problem) + `(solve% ',problem)) + diff -r f4ede9fed310 -r 4cad0eb1a700 vendor/quickutils-package.lisp --- a/vendor/quickutils-package.lisp Thu Nov 01 19:46:55 2018 -0400 +++ b/vendor/quickutils-package.lisp Thu Nov 01 21:10:34 2018 -0400 @@ -1,10 +1,10 @@ (eval-when (:compile-toplevel :load-toplevel :execute) - (unless (find-package "EULER.QUICKUTILS") - (defpackage "EULER.QUICKUTILS" + (unless (find-package "ROSALIND.QUICKUTILS") + (defpackage "ROSALIND.QUICKUTILS" (:documentation "Package that contains Quickutil utility functions.") (:use #:cl)))) -(in-package "EULER.QUICKUTILS") +(in-package "ROSALIND.QUICKUTILS") ;; need to define this here so sbcl will shut the hell up about it being ;; undefined when compiling quickutils.lisp. computers are trash.