7fcd748a4f00

LCSQ, PRSM
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Mon, 08 Aug 2022 19:00:58 -0400
parents a95ed046cc2c
children 97dda08645b3
branches/tags (none)
files src/package.lisp src/problems/lcsq.lisp src/problems/prsm.lisp src/problems/prtm.lisp src/problems/spec.lisp src/utils.lisp

Changes

--- a/src/package.lisp	Fri Aug 05 00:16:11 2022 -0400
+++ b/src/package.lisp	Mon Aug 08 19:00:58 2022 -0400
@@ -22,6 +22,7 @@
 
     :+monoisotopic-mass-of-water+
     :monoisotopic-mass
+    :residue-with-mass
     :*monoisotopic-masses*
 
     :gcp :base-probability :sequence-probability
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/lcsq.lisp	Mon Aug 08 19:00:58 2022 -0400
@@ -0,0 +1,72 @@
+(defpackage :rosalind/lcsq (:use :cl :rosalind :losh :iterate))
+(in-package :rosalind/lcsq)
+
+(defparameter *input* ">Rosalind_23
+AACCTTGG
+>Rosalind_64
+ACACTGTGA")
+
+(defparameter *output* "AACTTG")
+
+(defun print-2d-array (array)
+  (check-type array (array * (* *)))
+  ;; This will cons a lot.
+  (iterate
+    (with strings = (make-array (array-dimensions array)))
+    (for el :across-flat-array array :with-index i)
+    (for s = (princ-to-string el))
+    (setf (row-major-aref strings i) (princ-to-string el))
+    (maximizing (length s) :into len)
+    (finally (dotimes (row (array-dimension strings 0))
+               (dotimes (col (array-dimension strings 1))
+                 (format t " ~V@A" len (aref strings row col)))
+               (terpri))))
+  array)
+
+(defun build-table (a b)
+  ;; From CLRS 15.4
+  (iterate
+    (with-result table = (make-array (list (1+ (length a))
+                                           (1+ (length b)))
+                           :initial-element 0))
+    (for ca :in-string a)
+    (for i :from 1)
+    (iterate
+      (for cb :in-string b)
+      (for j :from 1)
+      (for up = (aref table (1- i) j))
+      (for left = (aref table i (1- j)))
+      (for upleft = (aref table (1- i) (1- j)))
+      (setf (aref table i j)
+            (cond ((char= ca cb) (1+ upleft))
+                  ((>= up left) up)
+                  (t left))))))
+
+(defun reconstruct-lcs-from-table (a b table)
+  (nreverse
+    (iterate
+      (with i = (length a))
+      (with j = (length b))
+      (until (or (zerop i) (zerop j)))
+      (for ca = (aref a (1- i)))
+      (for cb = (aref b (1- j)))
+      (for up = (aref table (1- i) j))
+      (for left = (aref table i (1- j)))
+      (cond ((char= ca cb) (progn (collect ca :result-type string)
+                                  (decf i)
+                                  (decf j)))
+            ((>= up left) (decf i))
+            (t (decf j))))))
+
+(defun least-common-subsequence (a b)
+  (check-type a string)
+  (check-type b string)
+  (reconstruct-lcs-from-table a b (build-table a b)))
+
+
+(define-problem lcsq (data stream) *input* *output*
+  (destructuring-bind (a b) (mapcar #'cdr (u:read-fasta-into-alist data))
+    (least-common-subsequence a b)))
+
+#; Scratch --------------------------------------------------------------------
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/problems/prsm.lisp	Mon Aug 08 19:00:58 2022 -0400
@@ -0,0 +1,47 @@
+(defpackage :rosalind/prsm (:use :cl :rosalind :losh :iterate))
+(in-package :rosalind/prsm)
+
+(defparameter *input* "4
+GSDMQS
+VWICN
+IASWMQS
+PVSMGAD
+445.17838
+115.02694
+186.07931
+314.13789
+317.1198
+215.09061")
+
+(defparameter *output* "3
+GSDMQS")
+
+
+(defun complete-spectrum (protein)
+  (let ((result (make-hash-table)))
+    (incf (gethash (u:monoisotopic-mass protein) result 0))
+    (iterate (for i :from 1 :below (length protein))
+             (incf (gethash (u:monoisotopic-mass protein :start i) result 0)))
+    (iterate (for i :from (1- (length protein)) :above 0)
+             (incf (gethash (u:monoisotopic-mass protein :end i) result 0)))
+    result))
+
+(defun multiplicity (spectrum string)
+  ;; TODO extract these into utils?
+  (cdr (rosalind/conv::msmax
+         (rosalind/conv::minkowski-difference spectrum
+                                              (complete-spectrum string)))))
+
+
+(define-problem prsm (data stream) *input* *output*
+  (let* ((*read-default-float-format* 'rational)
+         (n (parse-integer (read-line data)))
+         (proteins (gimme n (read-line data)))
+         (spectrum (frequencies (u:read-lines data :key #'read-from-string))))
+    (iterate
+      (for protein :in proteins)
+      (for m = (multiplicity spectrum protein))
+      (finding (cons m protein) :maximizing m :into result)
+      (finally (return (format nil "~D~%~A" (car result) (cdr result)))))))
+
+
--- a/src/problems/prtm.lisp	Fri Aug 05 00:16:11 2022 -0400
+++ b/src/problems/prtm.lisp	Mon Aug 08 19:00:58 2022 -0400
@@ -7,7 +7,7 @@
     "821.392"
   (_ data
     (delete #\newline _)
-    (summation _ :key #'u:monoisotopic-mass)
+    u:monoisotopic-mass
     u:float-string))
 
 
--- a/src/problems/spec.lisp	Fri Aug 05 00:16:11 2022 -0400
+++ b/src/problems/spec.lisp	Mon Aug 08 19:00:58 2022 -0400
@@ -10,15 +10,9 @@
 (defparameter *output* "WMQS")
 
 
-(defun roughly= (a b &optional (epsilon (* 0.000001d0 (min a b))))
-  (< (abs (- a b)) epsilon))
-
-(defun find-amino-acid (weight)
-  (rassocar weight u:*monoisotopic-masses* :test #'roughly=))
-
 (define-problem spec (data stream) *input* *output*
-  (let* ((*read-default-float-format* 'double-float)
-         (prefix-weights (u:read-lines data :key #'parse-float:parse-float))
+  (let* ((*read-default-float-format* 'rational)
+         (prefix-weights (u:read-lines data :key #'read-from-string))
          (weights (mapcar #'- (rest prefix-weights) prefix-weights))
-         (result (mapcar #'find-amino-acid weights)))
+         (result (mapcar #'u:residue-with-mass weights)))
     (string-join "" result)))
--- a/src/utils.lisp	Fri Aug 05 00:16:11 2022 -0400
+++ b/src/utils.lisp	Mon Aug 08 19:00:58 2022 -0400
@@ -166,52 +166,64 @@
   "The monoisotopic mass of a single water molecule, in Daltons.")
 
 (defparameter *monoisotopic-masses*
-  '((#\A .  71.03711d0)
-    (#\C . 103.00919d0)
-    (#\D . 115.02694d0)
-    (#\E . 129.04259d0)
-    (#\F . 147.06841d0)
-    (#\G .  57.02146d0)
-    (#\H . 137.05891d0)
-    (#\I . 113.08406d0)
-    (#\K . 128.09496d0)
-    (#\L . 113.08406d0)
-    (#\M . 131.04049d0)
-    (#\N . 114.04293d0)
-    (#\P .  97.05276d0)
-    (#\Q . 128.05858d0)
-    (#\R . 156.10111d0)
-    (#\S .  87.03203d0)
-    (#\T . 101.04768d0)
-    (#\V .  99.06841d0)
-    (#\W . 186.07931d0)
-    (#\Y . 163.06333d0)))
+  '((#\A .  71.03711r0)
+    (#\C . 103.00919r0)
+    (#\D . 115.02694r0)
+    (#\E . 129.04259r0)
+    (#\F . 147.06841r0)
+    (#\G .  57.02146r0)
+    (#\H . 137.05891r0)
+    (#\I . 113.08406r0)
+    (#\K . 128.09496r0)
+    (#\L . 113.08406r0)
+    (#\M . 131.04049r0)
+    (#\N . 114.04293r0)
+    (#\P .  97.05276r0)
+    (#\Q . 128.05858r0)
+    (#\R . 156.10111r0)
+    (#\S .  87.03203r0)
+    (#\T . 101.04768r0)
+    (#\V .  99.06841r0)
+    (#\W . 186.07931r0)
+    (#\Y . 163.06333r0)))
 
-(defun monoisotopic-mass (residue)
+(defun-inline residue-mass (residue)
   ;; todo is a hash table faster here?  we could also do an array
   ;; starting at (char-code #\A) if we really wanted
   (ecase residue
-    ;; These have to be doubles or we get too much rounding error.  It's fine.
-    (#\A  71.03711d0)
-    (#\C 103.00919d0)
-    (#\D 115.02694d0)
-    (#\E 129.04259d0)
-    (#\F 147.06841d0)
-    (#\G  57.02146d0)
-    (#\H 137.05891d0)
-    (#\I 113.08406d0)
-    (#\K 128.09496d0)
-    (#\L 113.08406d0)
-    (#\M 131.04049d0)
-    (#\N 114.04293d0)
-    (#\P  97.05276d0)
-    (#\Q 128.05858d0)
-    (#\R 156.10111d0)
-    (#\S  87.03203d0)
-    (#\T 101.04768d0)
-    (#\V  99.06841d0)
-    (#\W 186.07931d0)
-    (#\Y 163.06333d0)))
+    ;; Just use rationals to avoid rounding errors and make them usable in hash
+    ;; tables.  This doesn't have to be particularly fast.
+    (#\A  71.03711r0)
+    (#\C 103.00919r0)
+    (#\D 115.02694r0)
+    (#\E 129.04259r0)
+    (#\F 147.06841r0)
+    (#\G  57.02146r0)
+    (#\H 137.05891r0)
+    (#\I 113.08406r0)
+    (#\K 128.09496r0)
+    (#\L 113.08406r0)
+    (#\M 131.04049r0)
+    (#\N 114.04293r0)
+    (#\P  97.05276r0)
+    (#\Q 128.05858r0)
+    (#\R 156.10111r0)
+    (#\S  87.03203r0)
+    (#\T 101.04768r0)
+    (#\V  99.06841r0)
+    (#\W 186.07931r0)
+    (#\Y 163.06333r0)))
+
+(defun roughly= (a b &optional (epsilon (* 0.000001d0 (min a b))))
+  (< (abs (- a b)) epsilon))
+
+(defun residue-with-mass (mass)
+  (rassocar mass u:*monoisotopic-masses* :test #'roughly=))
+
+(defun monoisotopic-mass (protein &key (start 0) (end (length protein)))
+  (check-type protein string)
+  (loop :for i :from start :below end
+        :summing (residue-mass (char protein i))))
 
 
 ;;;; Strings ------------------------------------------------------------------