# HG changeset patch # User Steve Losh # Date 1541270499 14400 # Node ID 11df545d1a41b6e89b83e4ab951bb5438903a8c9 # Parent d6e73cb32b9b955ca28603d50370f003ce7506a9 Remove comments, do CONS diff -r d6e73cb32b9b -r 11df545d1a41 rosalind.asd --- a/rosalind.asd Sat Nov 03 12:45:52 2018 -0400 +++ b/rosalind.asd Sat Nov 03 14:41:39 2018 -0400 @@ -37,5 +37,6 @@ (:file "subs") (:file "iprb") (:file "iev") - (:file "fibd"))))))) + (:file "fibd") + (:file "cons"))))))) diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/cons.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/cons.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -0,0 +1,133 @@ +(in-package :rosalind) + +(defparameter *input-cons* ">Rosalind_1 +ATCCAGCT +>Rosalind_2 +GGGCAACT +>Rosalind_3 +ATGGATCT +>Rosalind_4 +AAGCAACC +>Rosalind_5 +TTGGAACT +>Rosalind_6 +ATGCCATT +>Rosalind_7 +ATGGCACT") + +(defparameter *output-cons* "ATGCAACT +A: 5 1 0 0 5 5 0 0 +C: 0 0 1 4 2 0 6 1 +G: 1 1 6 3 0 1 0 0 +T: 1 5 0 0 0 1 1 6 +") + + +(defun make-profile-matrix% (length) + "Allocate a zero-filled profile matrix with 4 rows and `length` columns." + (make-array (list 4 length) :initial-element 0)) + +(defmacro pmref (profile-matrix base pos) + `(aref ,profile-matrix + (ecase ,base + (#\A 0) + (#\C 1) + (#\G 2) + (#\T 3)) + ,pos)) + +(defun pmincf (profile-matrix dna) + "Increment the profile matrix counts for the given `dna` string." + (iterate (for base :in-vector dna :with-index pos) + (incf (pmref profile-matrix base pos)))) + + +(defun profile-matrix-from-fasta (source) + "Read FASTA data from `source` and return its profile matrix. + + All DNA strings in the data must have the same length. + + A profile matrix is a 4xN matrix containing the total counts of A, C, G, and + T bases at each position in the input strings. For example, for the input + strings: + + AAAA + AGTC + GGGG + TAAC + + the profile matrix would be: + + pos: 0 1 2 3 + ------------ + A: 2 2 2 1 + C: 0 0 0 2 + G: 1 2 1 1 + T: 1 0 1 0 + + " + (iterate + (with result) + (with length) + (for (nil dna) :in-fasta source) + (for n :from 0) + + (if-first-time + (setf length (length dna) + result (make-profile-matrix length)) + (assert (= length (length dna)) () + "The ~:R DNA string in the supplied FASTA data has ~D bases (expected ~D)." + n (length dna) length)) + + (pmincf profile-matrix dna) + + (finally (return result)))) + + +(defun consensus-string (profile-matrix) + "Return a consensus string from the given `profile-matrix`. + + A consensus string is a string where the base at each position is the most + common base at that position among all of the input strings (ties are broken + arbitrarily). For example, for the input strings: + + AAAA + AGTC + GGGG + TAAC + + the consensus string would be \"AGAC\". + + " + (iterate + (with (nil length) = (array-dimensions profile-matrix)) + (for pos :below length) + (for as = (pmref profile-matrix #\A pos)) + (for cs = (pmref profile-matrix #\C pos)) + (for gs = (pmref profile-matrix #\G pos)) + (for ts = (pmref profile-matrix #\T pos)) + (for m = (max as cs gs ts)) + (for winner = (cond ((= m as) #\A) + ((= m cs) #\C) + ((= m gs) #\G) + ((= m ts) #\T))) + (collecting winner :result-type 'string))) + + +(define-problem cons (data stream) + *input-cons* + *output-cons* + (let* ((profile-matrix (profile-matrix-from-fasta data)) + (consensus-string (consensus-string profile-matrix)) + (length (length consensus-string))) + (with-output-to-string (s) + (format s "~A~%" consensus-string) + (dolist (base '(#\A #\C #\G #\T)) + (format s "~C:" base) + (dotimes (i length) + (format s " ~D" (pmref profile-matrix base i))) + (terpri s))))) + + +;; (problem-cons) +;; (solve cons) diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/dna.lisp --- a/src/problems/dna.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/dna.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,16 +1,5 @@ (in-package :rosalind) -;; A string is simply an ordered collection of symbols selected from some -;; alphabet and formed into a word; the length of a string is the number of -;; symbols that it contains. -;; -;; An example of a length 21 DNA string (whose alphabet contains the symbols -;; 'A', 'C', 'G', and 'T') is "ATGCTTCAGAAAGGTCTTACG." -;; -;; Given: A DNA string s of length at most 1000 nt. Return: Four integers -;; (separated by spaces) counting the respective number of times that the -;; symbols 'A', 'C', 'G', and 'T' occur in s. - (define-problem dna (data string) "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC" "20 12 17 21" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/fib.lisp --- a/src/problems/fib.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/fib.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,36 +1,5 @@ (in-package :rosalind) -;; A sequence is an ordered collection of objects (usually numbers), which are -;; allowed to repeat. Sequences can be finite or infinite. Two examples are the -;; finite sequence (π,−2–√,0,π) and the infinite sequence of odd numbers -;; (1,3,5,7,9,…). We use the notation an to represent the n-th term of -;; a sequence. -;; -;; A recurrence relation is a way of defining the terms of a sequence with -;; respect to the values of previous terms. In the case of Fibonacci's rabbits -;; from the introduction, any given month will contain the rabbits that were -;; alive the previous month, plus any new offspring. A key observation is that -;; the number of offspring in any month is equal to the number of rabbits that -;; were alive two months prior. As a result, if Fn represents the number of -;; rabbit pairs alive after the n-th month, then we obtain the Fibonacci -;; sequence having terms Fn that are defined by the recurrence relation -;; Fn=Fn−1+Fn−2 (with F1=F2=1 to initiate the sequence). Although the sequence -;; bears Fibonacci's name, it was known to Indian mathematicians over two -;; millennia ago. -;; -;; When finding the n-th term of a sequence defined by a recurrence relation, we -;; can simply use the recurrence relation to generate terms for progressively -;; larger values of n. This problem introduces us to the computational technique -;; of dynamic programming, which successively builds up solutions by using the -;; answers to smaller cases. -;; -;; Given: Positive integers n≤40 and k≤5 -;; -;; Return: The total number of rabbit pairs that will be present after n months, -;; if we begin with 1 pair and in each generation, every pair of -;; reproduction-age rabbits produces a litter of k rabbit pairs (instead of only -;; 1 pair). - (define-problem fib (data stream) "5 3" "19" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/fibd.lisp --- a/src/problems/fibd.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/fibd.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,21 +1,5 @@ (in-package :rosalind) -;; Recall the definition of the Fibonacci numbers from “Rabbits and Recurrence -;; Relations”, which followed the recurrence relation Fn=Fn−1+Fn−2 and assumed -;; that each pair of rabbits reaches maturity in one month and produces -;; a single pair of offspring (one male, one female) each subsequent month. -;; -;; Our aim is to somehow modify this recurrence relation to achieve a dynamic -;; programming solution in the case that all rabbits die out after a fixed -;; number of months. See Figure 4 for a depiction of a rabbit tree in which -;; rabbits live for three months (meaning that they reproduce only twice before -;; dying). -;; -;; Given: Positive integers n≤100 and m≤20. -;; -;; Return: The total number of pairs of rabbits that will remain after the n-th -;; month if all rabbits live for m months. - (define-problem fibd (data stream) "6 3" "4" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/gc.lisp --- a/src/problems/gc.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/gc.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,27 +1,5 @@ (in-package :rosalind) -;; The GC-content of a DNA string is given by the percentage of symbols in -;; the string that are 'C' or 'G'. For example, the GC-content of "AGCTATAG" -;; is 37.5%. Note that the reverse complement of any DNA string has the same -;; GC-content. -;; -;; DNA strings must be labeled when they are consolidated into a database. -;; A commonly used method of string labeling is called FASTA format. In this -;; format, the string is introduced by a line that begins with '>', followed -;; by some labeling information. Subsequent lines contain the string itself; -;; the first line to begin with '>' indicates the label of the next string. -;; -;; In Rosalind's implementation, a string in FASTA format will be labeled by -;; the ID "Rosalind_xxxx", where "xxxx" denotes a four-digit code between 0000 -;; and 9999. -;; -;; Given: At most 10 DNA strings in FASTA format (of length at most 1 kbp each). -;; -;; Return: The ID of the string having the highest GC-content, followed by the -;; GC-content of that string. Rosalind allows for a default error of 0.001 in -;; all decimal answers unless otherwise stated; please see the note on -;; absolute error below. - (defparameter *input-gc* ">Rosalind_6404 CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC TCCCACTAATAATTCTGAGG diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/hamm.lisp --- a/src/problems/hamm.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/hamm.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,13 +1,5 @@ (in-package :rosalind) -;; Given two strings s and t of equal length, the Hamming distance between s and -;; t, denoted dH(s,t), is the number of corresponding symbols that differ in -;; s and t. -;; -;; Given: Two DNA strings s and t of equal length (not exceeding 1 kbp). -;; -;; Return: The Hamming distance dH(s,t) - (defparameter *input-hamm* "GAGCCTACTAACGGGAT CATCGTAATGACGGCCT") diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/iev.lisp --- a/src/problems/iev.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/iev.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,39 +1,5 @@ (in-package :rosalind) -;; For a random variable X taking integer values between 1 and n, the expected -;; value of X is E(X)=∑nk=1k×Pr(X=k). The expected value offers us a way of -;; taking the long-term average of a random variable over a large number of -;; trials. -;; -;; As a motivating example, let X be the number on a six-sided die. Over a large -;; number of rolls, we should expect to obtain an average of 3.5 on the die -;; (even though it's not possible to roll a 3.5). The formula for expected value -;; confirms that E(X)=∑6k=1k×Pr(X=k)=3.5. -;; -;; More generally, a random variable for which every one of a number of equally -;; spaced outcomes has the same probability is called a uniform random variable -;; (in the die example, this "equal spacing" is equal to 1). We can generalize -;; our die example to find that if X is a uniform random variable with minimum -;; possible value a and maximum possible value b, then E(X)=a+b2. You may also -;; wish to verify that for the dice example, if Y is the random variable -;; associated with the outcome of a second die roll, then E(X+Y)=7. -;; -;; Given: Six nonnegative integers, each of which does not exceed 20,000. The -;; integers correspond to the number of couples in a population possessing each -;; genotype pairing for a given factor. In order, the six given integers -;; represent the number of couples having the following genotypes: -;; -;; AA-AA -;; AA-Aa -;; AA-aa -;; Aa-Aa -;; Aa-aa -;; aa-aa -;; -;; Return: The expected number of offspring displaying the dominant phenotype in -;; the next generation, under the assumption that every couple has exactly two -;; offspring. - (define-problem iev (data stream) "1 0 0 1 0 1" "3.5000" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/iprb.lisp --- a/src/problems/iprb.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/iprb.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,40 +1,5 @@ (in-package :rosalind) -;; Probability is the mathematical study of randomly occurring phenomena. We -;; will model such a phenomenon with a random variable, which is simply -;; a variable that can take a number of different distinct outcomes depending on -;; the result of an underlying random process. -;; -;; For example, say that we have a bag containing 3 red balls and 2 blue balls. -;; If we let X represent the random variable corresponding to the color of -;; a drawn ball, then the probability of each of the two outcomes is given by -;; Pr(X=red)=35 and Pr(X=blue)=25 -;; -;; Random variables can be combined to yield new random variables. Returning to -;; the ball example, let Y model the color of a second ball drawn from the bag -;; (without replacing the first ball). The probability of Y being red depends on -;; whether the first ball was red or blue. To represent all outcomes of X and Y, -;; we therefore use a probability tree diagram. This branching diagram -;; represents all possible individual probabilities for X and Y, with outcomes -;; at the endpoints ("leaves") of the tree. The probability of any outcome is -;; given by the product of probabilities along the path from the beginning of -;; the tree; see Figure 2 for an illustrative example. -;; -;; An event is simply a collection of outcomes. Because outcomes are distinct, -;; the probability of an event can be written as the sum of the probabilities of -;; its constituent outcomes. For our colored ball example, let A be the event "Y -;; is blue." Pr(A) is equal to the sum of the probabilities of two different -;; outcomes: Pr(X=blue and Y=blue)+Pr(X=red and Y=blue), or 310+110=25 (see -;; Figure 2 above). -;; -;; Given: Three positive integers k, m, and n, representing a population -;; containing k+m+n organisms: k individuals are homozygous dominant for -;; a factor, m are heterozygous, and n are homozygous recessive. -;; -;; Return: The probability that two randomly selected mating organisms will -;; produce an individual possessing a dominant allele (and thus displaying the -;; dominant phenotype). Assume that any two organisms can mate. - (define-problem iprb (data stream) "2 2 2" "0.78333" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/perm.lisp --- a/src/problems/perm.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/perm.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,13 +1,5 @@ (in-package :rosalind) -;; A permutation of length n is an ordering of the positive integers {1,2,…,n}. -;; For example, π=(5,3,2,1,4) is a permutation of length 5. -;; -;; Given: A positive integer n≤7 -;; -;; Return: The total number of permutations of length n, followed by a list of -;; all such permutations (in any order). - (defparameter *input-perm* "3") (defparameter *output-perm* "6 diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/prot.lisp --- a/src/problems/prot.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/prot.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,19 +1,5 @@ (in-package :rosalind) -;; The 20 commonly occurring amino acids are abbreviated by using 20 letters -;; from the English alphabet (all letters except for B, J, O, U, X, and Z). -;; Protein strings are constructed from these 20 symbols. Henceforth, the term -;; genetic string will incorporate protein strings along with DNA strings and -;; RNA strings. -;; -;; The RNA codon table dictates the details regarding the encoding of specific -;; codons into the amino acid alphabet. -;; -;; Given: An RNA string s corresponding to a strand of mRNA (of length at most -;; 10 kbp). -;; -;; Return: The protein string encoded by s. - (define-problem prot (data string) "AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA" "MAMAPRTEINSTRING" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/revc.lisp --- a/src/problems/revc.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/revc.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,16 +1,5 @@ (in-package :rosalind) -;; In DNA strings, symbols 'A' and 'T' are complements of each other, as are 'C' -;; and 'G'. -;; -;; The reverse complement of a DNA string s is the string sc formed by reversing -;; the symbols of s, then taking the complement of each symbol (e.g., the -;; reverse complement of "GTCA" is "TGAC"). -;; -;; Given: A DNA string s of length at most 1000 bp. -;; -;; Return: The reverse complement sc of s. - (define-problem revc (data string) "AAAACCCGGT" "ACCGGGTTTT" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/rna.lisp --- a/src/problems/rna.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/rna.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,15 +1,5 @@ (in-package :rosalind) -;; An RNA string is a string formed from the alphabet containing 'A', 'C', 'G', -;; and 'U'. -;; -;; Given a DNA string t corresponding to a coding strand, its transcribed RNA -;; string u is formed by replacing all occurrences of 'T' in t with 'U' in u. -;; -;; Given: A DNA string t having length at most 1000 nt. -;; -;; Return: The transcribed RNA string of t. - (define-problem rna (data string) "GATGGAACTTGACTACGTAAATT" "GAUGGAACUUGACUACGUAAAUU" diff -r d6e73cb32b9b -r 11df545d1a41 src/problems/subs.lisp --- a/src/problems/subs.lisp Sat Nov 03 12:45:52 2018 -0400 +++ b/src/problems/subs.lisp Sat Nov 03 14:41:39 2018 -0400 @@ -1,26 +1,5 @@ (in-package :rosalind) -;; Given two strings s and t, t is a substring of s if t is contained as -;; a contiguous collection of symbols in s (as a result, t must be no longer -;; than s). -;; -;; The position of a symbol in a string is the total number of symbols found to -;; its left, including itself (e.g., the positions of all occurrences of 'U' in -;; "AUGCUUCAGAAAGGUCUUACG" are 2, 5, 6, 15, 17, and 18). The symbol at position -;; i of s is denoted by s[i]. -;; -;; A substring of s can be represented as s[j:k], where j and k represent the -;; starting and ending positions of the substring in s; for example, if -;; s = "AUGCUUCAGAAAGGUCUUACG", then s[2:5] = "UGCU". -;; -;; The location of a substring s[j:k] is its beginning position j; note that -;; t will have multiple locations in s if it occurs more than once as -;; a substring of s (see the Sample below). -;; -;; Given: Two DNA strings s and t (each of length at most 1 kbp). -;; -;; Return: All locations of t as a substring of s. - (defparameter *input-subs* "GATATATGCATATACTT ATAT")