11df545d1a41

Remove comments, do CONS
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Sat, 03 Nov 2018 14:41:39 -0400
parents d6e73cb32b9b
children e3dc1e5b762c
branches/tags (none)
files rosalind.asd src/problems/cons.lisp src/problems/dna.lisp src/problems/fib.lisp src/problems/fibd.lisp src/problems/gc.lisp src/problems/hamm.lisp src/problems/iev.lisp src/problems/iprb.lisp src/problems/perm.lisp src/problems/prot.lisp src/problems/revc.lisp src/problems/rna.lisp src/problems/subs.lisp

Changes

--- a/rosalind.asd	Sat Nov 03 12:45:52 2018 -0400
+++ b/rosalind.asd	Sat Nov 03 14:41:39 2018 -0400
@@ -37,5 +37,6 @@
                                            (:file "subs")
                                            (:file "iprb")
                                            (:file "iev")
-                                           (:file "fibd")))))))
+                                           (:file "fibd")
+                                           (:file "cons")))))))
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/cons.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -0,0 +1,133 @@
+(in-package :rosalind)
+
+(defparameter *input-cons* ">Rosalind_1
+ATCCAGCT
+>Rosalind_2
+GGGCAACT
+>Rosalind_3
+ATGGATCT
+>Rosalind_4
+AAGCAACC
+>Rosalind_5
+TTGGAACT
+>Rosalind_6
+ATGCCATT
+>Rosalind_7
+ATGGCACT")
+
+(defparameter *output-cons* "ATGCAACT
+A: 5 1 0 0 5 5 0 0
+C: 0 0 1 4 2 0 6 1
+G: 1 1 6 3 0 1 0 0
+T: 1 5 0 0 0 1 1 6
+")
+
+
+(defun make-profile-matrix% (length)
+  "Allocate a zero-filled profile matrix with 4 rows and `length` columns."
+  (make-array (list 4 length) :initial-element 0))
+
+(defmacro pmref (profile-matrix base pos)
+  `(aref ,profile-matrix
+     (ecase ,base
+       (#\A 0)
+       (#\C 1)
+       (#\G 2)
+       (#\T 3))
+     ,pos))
+
+(defun pmincf (profile-matrix dna)
+  "Increment the profile matrix counts for the given `dna` string."
+  (iterate (for base :in-vector dna :with-index pos)
+           (incf (pmref profile-matrix base pos))))
+
+
+(defun profile-matrix-from-fasta (source)
+  "Read FASTA data from `source` and return its profile matrix.
+
+  All DNA strings in the data must have the same length.
+
+  A profile matrix is a 4xN matrix containing the total counts of A, C, G, and
+  T bases at each position in the input strings.  For example, for the input
+  strings:
+
+    AAAA
+    AGTC
+    GGGG
+    TAAC
+
+  the profile matrix would be:
+
+    pos: 0 1 2 3
+    ------------
+      A: 2 2 2 1
+      C: 0 0 0 2
+      G: 1 2 1 1
+      T: 1 0 1 0
+
+  "
+  (iterate
+    (with result)
+    (with length)
+    (for (nil dna) :in-fasta source)
+    (for n :from 0)
+
+    (if-first-time
+      (setf length (length dna)
+            result (make-profile-matrix length))
+      (assert (= length (length dna)) ()
+        "The ~:R DNA string in the supplied FASTA data has ~D bases (expected ~D)."
+        n (length dna) length))
+
+    (pmincf profile-matrix dna)
+
+    (finally (return result))))
+
+
+(defun consensus-string (profile-matrix)
+  "Return a consensus string from the given `profile-matrix`.
+
+  A consensus string is a string where the base at each position is the most
+  common base at that position among all of the input strings (ties are broken
+  arbitrarily).  For example, for the input strings:
+
+    AAAA
+    AGTC
+    GGGG
+    TAAC
+
+  the consensus string would be \"AGAC\".
+
+  "
+  (iterate
+    (with (nil length) = (array-dimensions profile-matrix))
+    (for pos :below length)
+    (for as = (pmref profile-matrix #\A pos))
+    (for cs = (pmref profile-matrix #\C pos))
+    (for gs = (pmref profile-matrix #\G pos))
+    (for ts = (pmref profile-matrix #\T pos))
+    (for m = (max as cs gs ts))
+    (for winner = (cond ((= m as) #\A)
+                        ((= m cs) #\C)
+                        ((= m gs) #\G)
+                        ((= m ts) #\T)))
+    (collecting winner :result-type 'string)))
+
+
+(define-problem cons (data stream)
+    *input-cons*
+    *output-cons*
+  (let* ((profile-matrix (profile-matrix-from-fasta data))
+         (consensus-string (consensus-string profile-matrix))
+         (length (length consensus-string)))
+    (with-output-to-string (s)
+      (format s "~A~%" consensus-string)
+      (dolist (base '(#\A #\C #\G #\T))
+        (format s "~C:" base)
+        (dotimes (i length)
+          (format s " ~D" (pmref profile-matrix base i)))
+        (terpri s)))))
+
+
+;; (problem-cons)
+;; (solve cons)
--- a/src/problems/dna.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/dna.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,16 +1,5 @@
 (in-package :rosalind)
 
-;; A string is simply an ordered collection of symbols selected from some
-;; alphabet and formed into a word; the length of a string is the number of
-;; symbols that it contains.
-;;
-;; An example of a length 21 DNA string (whose alphabet contains the symbols
-;; 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG."
-;;
-;; Given: A DNA string s of length at most 1000 nt.  Return: Four integers
-;; (separated by spaces) counting the respective number of times that the
-;; symbols 'A', 'C', 'G', and 'T' occur in s.
-
 (define-problem dna (data string)
     "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC"
     "20 12 17 21"
--- a/src/problems/fib.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/fib.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,36 +1,5 @@
 (in-package :rosalind)
 
-;; A sequence is an ordered collection of objects (usually numbers), which are
-;; allowed to repeat. Sequences can be finite or infinite. Two examples are the
-;; finite sequence (π,−2–√,0,π) and the infinite sequence of odd numbers
-;; (1,3,5,7,9,…). We use the notation an to represent the n-th term of
-;; a sequence.
-;;
-;; A recurrence relation is a way of defining the terms of a sequence with
-;; respect to the values of previous terms. In the case of Fibonacci's rabbits
-;; from the introduction, any given month will contain the rabbits that were
-;; alive the previous month, plus any new offspring. A key observation is that
-;; the number of offspring in any month is equal to the number of rabbits that
-;; were alive two months prior. As a result, if Fn represents the number of
-;; rabbit pairs alive after the n-th month, then we obtain the Fibonacci
-;; sequence having terms Fn that are defined by the recurrence relation
-;; Fn=Fn−1+Fn−2 (with F1=F2=1 to initiate the sequence). Although the sequence
-;; bears Fibonacci's name, it was known to Indian mathematicians over two
-;; millennia ago.
-;;
-;; When finding the n-th term of a sequence defined by a recurrence relation, we
-;; can simply use the recurrence relation to generate terms for progressively
-;; larger values of n. This problem introduces us to the computational technique
-;; of dynamic programming, which successively builds up solutions by using the
-;; answers to smaller cases.
-;;
-;; Given: Positive integers n≤40 and k≤5
-;;
-;; Return: The total number of rabbit pairs that will be present after n months,
-;; if we begin with 1 pair and in each generation, every pair of
-;; reproduction-age rabbits produces a litter of k rabbit pairs (instead of only
-;; 1 pair).
-
 (define-problem fib (data stream)
     "5 3"
     "19"
--- a/src/problems/fibd.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/fibd.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,21 +1,5 @@
 (in-package :rosalind)
 
-;; Recall the definition of the Fibonacci numbers from “Rabbits and Recurrence
-;; Relations”, which followed the recurrence relation Fn=Fn−1+Fn−2 and assumed
-;; that each pair of rabbits reaches maturity in one month and produces
-;; a single pair of offspring (one male, one female) each subsequent month.
-;;
-;; Our aim is to somehow modify this recurrence relation to achieve a dynamic
-;; programming solution in the case that all rabbits die out after a fixed
-;; number of months. See Figure 4 for a depiction of a rabbit tree in which
-;; rabbits live for three months (meaning that they reproduce only twice before
-;; dying).
-;;
-;; Given: Positive integers n≤100 and m≤20.
-;;
-;; Return: The total number of pairs of rabbits that will remain after the n-th
-;; month if all rabbits live for m months.
-
 (define-problem fibd (data stream)
     "6 3"
     "4"
--- a/src/problems/gc.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/gc.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,27 +1,5 @@
 (in-package :rosalind)
 
-;; The GC-content of a DNA string is given by the percentage of symbols in
-;; the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG"
-;; is 37.5%. Note that the reverse complement of any DNA string has the same
-;; GC-content.
-;;
-;; DNA strings must be labeled when they are consolidated into a database.
-;; A commonly used method of string labeling is called FASTA format. In this
-;; format, the string is introduced by a line that begins with '>', followed
-;; by some labeling information. Subsequent lines contain the string itself;
-;; the first line to begin with '>' indicates the label of the next string.
-;;
-;; In Rosalind's implementation, a string in FASTA format will be labeled by
-;; the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000
-;; and 9999.
-;;
-;; Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
-;;
-;; Return: The ID of the string having the highest GC-content, followed by the
-;; GC-content of that string. Rosalind allows for a default error of 0.001 in
-;; all decimal answers unless otherwise stated; please see the note on
-;; absolute error below.
-
 (defparameter *input-gc* ">Rosalind_6404
 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
 TCCCACTAATAATTCTGAGG
--- a/src/problems/hamm.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/hamm.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,13 +1,5 @@
 (in-package :rosalind)
 
-;; Given two strings s and t of equal length, the Hamming distance between s and
-;; t, denoted dH(s,t), is the number of corresponding symbols that differ in
-;; s and t.
-;;
-;; Given: Two DNA strings s and t of equal length (not exceeding 1 kbp).
-;;
-;; Return: The Hamming distance dH(s,t)
-
 (defparameter *input-hamm* "GAGCCTACTAACGGGAT
 CATCGTAATGACGGCCT")
 
--- a/src/problems/iev.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/iev.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,39 +1,5 @@
 (in-package :rosalind)
 
-;; For a random variable X taking integer values between 1 and n, the expected
-;; value of X is E(X)=∑nk=1k×Pr(X=k). The expected value offers us a way of
-;; taking the long-term average of a random variable over a large number of
-;; trials.
-;;
-;; As a motivating example, let X be the number on a six-sided die. Over a large
-;; number of rolls, we should expect to obtain an average of 3.5 on the die
-;; (even though it's not possible to roll a 3.5). The formula for expected value
-;; confirms that E(X)=∑6k=1k×Pr(X=k)=3.5.
-;;
-;; More generally, a random variable for which every one of a number of equally
-;; spaced outcomes has the same probability is called a uniform random variable
-;; (in the die example, this "equal spacing" is equal to 1). We can generalize
-;; our die example to find that if X is a uniform random variable with minimum
-;; possible value a and maximum possible value b, then E(X)=a+b2. You may also
-;; wish to verify that for the dice example, if Y is the random variable
-;; associated with the outcome of a second die roll, then E(X+Y)=7.
-;;
-;; Given: Six nonnegative integers, each of which does not exceed 20,000. The
-;; integers correspond to the number of couples in a population possessing each
-;; genotype pairing for a given factor. In order, the six given integers
-;; represent the number of couples having the following genotypes:
-;;
-;;     AA-AA
-;;     AA-Aa
-;;     AA-aa
-;;     Aa-Aa
-;;     Aa-aa
-;;     aa-aa
-;;
-;; Return: The expected number of offspring displaying the dominant phenotype in
-;; the next generation, under the assumption that every couple has exactly two
-;; offspring.
-
 (define-problem iev (data stream)
     "1 0 0 1 0 1"
     "3.5000"
--- a/src/problems/iprb.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/iprb.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,40 +1,5 @@
 (in-package :rosalind)
 
-;; Probability is the mathematical study of randomly occurring phenomena. We
-;; will model such a phenomenon with a random variable, which is simply
-;; a variable that can take a number of different distinct outcomes depending on
-;; the result of an underlying random process.
-;;
-;; For example, say that we have a bag containing 3 red balls and 2 blue balls.
-;; If we let X represent the random variable corresponding to the color of
-;; a drawn ball, then the probability of each of the two outcomes is given by
-;; Pr(X=red)=35 and Pr(X=blue)=25
-;;
-;; Random variables can be combined to yield new random variables. Returning to
-;; the ball example, let Y model the color of a second ball drawn from the bag
-;; (without replacing the first ball). The probability of Y being red depends on
-;; whether the first ball was red or blue. To represent all outcomes of X and Y,
-;; we therefore use a probability tree diagram. This branching diagram
-;; represents all possible individual probabilities for X and Y, with outcomes
-;; at the endpoints ("leaves") of the tree. The probability of any outcome is
-;; given by the product of probabilities along the path from the beginning of
-;; the tree; see Figure 2 for an illustrative example.
-;;
-;; An event is simply a collection of outcomes. Because outcomes are distinct,
-;; the probability of an event can be written as the sum of the probabilities of
-;; its constituent outcomes. For our colored ball example, let A be the event "Y
-;; is blue." Pr(A) is equal to the sum of the probabilities of two different
-;; outcomes: Pr(X=blue and Y=blue)+Pr(X=red and Y=blue), or 310+110=25 (see
-;; Figure 2 above).
-;;
-;; Given: Three positive integers k, m, and n, representing a population
-;; containing k+m+n organisms: k individuals are homozygous dominant for
-;; a factor, m are heterozygous, and n are homozygous recessive.
-;;
-;; Return: The probability that two randomly selected mating organisms will
-;; produce an individual possessing a dominant allele (and thus displaying the
-;; dominant phenotype). Assume that any two organisms can mate.
-
 (define-problem iprb (data stream)
     "2 2 2"
     "0.78333"
--- a/src/problems/perm.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/perm.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,13 +1,5 @@
 (in-package :rosalind)
 
-;; A permutation of length n is an ordering of the positive integers {1,2,…,n}.
-;; For example, π=(5,3,2,1,4) is a permutation of length 5.
-;;
-;; Given: A positive integer n≤7
-;;
-;; Return: The total number of permutations of length n, followed by a list of
-;; all such permutations (in any order).
-
 (defparameter *input-perm* "3")
 
 (defparameter *output-perm* "6
--- a/src/problems/prot.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/prot.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,19 +1,5 @@
 (in-package :rosalind)
 
-;; The 20 commonly occurring amino acids are abbreviated by using 20 letters
-;; from the English alphabet (all letters except for B, J, O, U, X, and Z).
-;; Protein strings are constructed from these 20 symbols. Henceforth, the term
-;; genetic string will incorporate protein strings along with DNA strings and
-;; RNA strings.
-;;
-;; The RNA codon table dictates the details regarding the encoding of specific
-;; codons into the amino acid alphabet.
-;;
-;; Given: An RNA string s corresponding to a strand of mRNA (of length at most
-;; 10 kbp).
-;;
-;; Return: The protein string encoded by s.
-
 (define-problem prot (data string)
     "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA"
     "MAMAPRTEINSTRING"
--- a/src/problems/revc.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/revc.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,16 +1,5 @@
 (in-package :rosalind)
 
-;; In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C'
-;; and 'G'.
-;;
-;; The reverse complement of a DNA string s is the string sc formed by reversing
-;; the symbols of s, then taking the complement of each symbol (e.g., the
-;; reverse complement of "GTCA" is "TGAC").
-;;
-;; Given: A DNA string s of length at most 1000 bp.
-;;
-;; Return: The reverse complement sc of s.
-
 (define-problem revc (data string)
     "AAAACCCGGT"
     "ACCGGGTTTT"
--- a/src/problems/rna.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/rna.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,15 +1,5 @@
 (in-package :rosalind)
 
-;; An RNA string is a string formed from the alphabet containing 'A', 'C', 'G',
-;; and 'U'.
-;;
-;; Given a DNA string t corresponding to a coding strand, its transcribed RNA
-;; string u is formed by replacing all occurrences of 'T' in t with 'U' in u.
-;;
-;; Given: A DNA string t having length at most 1000 nt.
-;;
-;; Return: The transcribed RNA string of t.
-
 (define-problem rna (data string)
     "GATGGAACTTGACTACGTAAATT"
     "GAUGGAACUUGACUACGUAAAUU"
--- a/src/problems/subs.lisp	Sat Nov 03 12:45:52 2018 -0400
+++ b/src/problems/subs.lisp	Sat Nov 03 14:41:39 2018 -0400
@@ -1,26 +1,5 @@
 (in-package :rosalind)
 
-;; Given two strings s and t, t is a substring of s if t is contained as
-;; a contiguous collection of symbols in s (as a result, t must be no longer
-;; than s).
-;;
-;; The position of a symbol in a string is the total number of symbols found to
-;; its left, including itself (e.g., the positions of all occurrences of 'U' in
-;; "AUGCUUCAGAAAGGUCUUACG" are 2, 5, 6, 15, 17, and 18). The symbol at position
-;; i of s is denoted by s[i].
-;;
-;; A substring of s can be represented as s[j:k], where j and k represent the
-;; starting and ending positions of the substring in s; for example, if
-;; s = "AUGCUUCAGAAAGGUCUUACG", then s[2:5] = "UGCU".
-;;
-;; The location of a substring s[j:k] is its beginning position j; note that
-;; t will have multiple locations in s if it occurs more than once as
-;; a substring of s (see the Sample below).
-;;
-;; Given: Two DNA strings s and t (each of length at most 1 kbp).
-;;
-;; Return: All locations of t as a substring of s.
-
 (defparameter *input-subs* "GATATATGCATATACTT
 ATAT")