# HG changeset patch # User Steve Losh # Date 1659999658 14400 # Node ID 7fcd748a4f00830bb3655d2a5c4ad5869b7e5435 # Parent a95ed046cc2caf2cda135c366831175fdf909098 LCSQ, PRSM diff -r a95ed046cc2c -r 7fcd748a4f00 src/package.lisp --- a/src/package.lisp Fri Aug 05 00:16:11 2022 -0400 +++ b/src/package.lisp Mon Aug 08 19:00:58 2022 -0400 @@ -22,6 +22,7 @@ :+monoisotopic-mass-of-water+ :monoisotopic-mass + :residue-with-mass :*monoisotopic-masses* :gcp :base-probability :sequence-probability diff -r a95ed046cc2c -r 7fcd748a4f00 src/problems/lcsq.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/lcsq.lisp Mon Aug 08 19:00:58 2022 -0400 @@ -0,0 +1,72 @@ +(defpackage :rosalind/lcsq (:use :cl :rosalind :losh :iterate)) +(in-package :rosalind/lcsq) + +(defparameter *input* ">Rosalind_23 +AACCTTGG +>Rosalind_64 +ACACTGTGA") + +(defparameter *output* "AACTTG") + +(defun print-2d-array (array) + (check-type array (array * (* *))) + ;; This will cons a lot. + (iterate + (with strings = (make-array (array-dimensions array))) + (for el :across-flat-array array :with-index i) + (for s = (princ-to-string el)) + (setf (row-major-aref strings i) (princ-to-string el)) + (maximizing (length s) :into len) + (finally (dotimes (row (array-dimension strings 0)) + (dotimes (col (array-dimension strings 1)) + (format t " ~V@A" len (aref strings row col))) + (terpri)))) + array) + +(defun build-table (a b) + ;; From CLRS 15.4 + (iterate + (with-result table = (make-array (list (1+ (length a)) + (1+ (length b))) + :initial-element 0)) + (for ca :in-string a) + (for i :from 1) + (iterate + (for cb :in-string b) + (for j :from 1) + (for up = (aref table (1- i) j)) + (for left = (aref table i (1- j))) + (for upleft = (aref table (1- i) (1- j))) + (setf (aref table i j) + (cond ((char= ca cb) (1+ upleft)) + ((>= up left) up) + (t left)))))) + +(defun reconstruct-lcs-from-table (a b table) + (nreverse + (iterate + (with i = (length a)) + (with j = (length b)) + (until (or (zerop i) (zerop j))) + (for ca = (aref a (1- i))) + (for cb = (aref b (1- j))) + (for up = (aref table (1- i) j)) + (for left = (aref table i (1- j))) + (cond ((char= ca cb) (progn (collect ca :result-type string) + (decf i) + (decf j))) + ((>= up left) (decf i)) + (t (decf j)))))) + +(defun least-common-subsequence (a b) + (check-type a string) + (check-type b string) + (reconstruct-lcs-from-table a b (build-table a b))) + + +(define-problem lcsq (data stream) *input* *output* + (destructuring-bind (a b) (mapcar #'cdr (u:read-fasta-into-alist data)) + (least-common-subsequence a b))) + +#; Scratch -------------------------------------------------------------------- + diff -r a95ed046cc2c -r 7fcd748a4f00 src/problems/prsm.lisp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/problems/prsm.lisp Mon Aug 08 19:00:58 2022 -0400 @@ -0,0 +1,47 @@ +(defpackage :rosalind/prsm (:use :cl :rosalind :losh :iterate)) +(in-package :rosalind/prsm) + +(defparameter *input* "4 +GSDMQS +VWICN +IASWMQS +PVSMGAD +445.17838 +115.02694 +186.07931 +314.13789 +317.1198 +215.09061") + +(defparameter *output* "3 +GSDMQS") + + +(defun complete-spectrum (protein) + (let ((result (make-hash-table))) + (incf (gethash (u:monoisotopic-mass protein) result 0)) + (iterate (for i :from 1 :below (length protein)) + (incf (gethash (u:monoisotopic-mass protein :start i) result 0))) + (iterate (for i :from (1- (length protein)) :above 0) + (incf (gethash (u:monoisotopic-mass protein :end i) result 0))) + result)) + +(defun multiplicity (spectrum string) + ;; TODO extract these into utils? + (cdr (rosalind/conv::msmax + (rosalind/conv::minkowski-difference spectrum + (complete-spectrum string))))) + + +(define-problem prsm (data stream) *input* *output* + (let* ((*read-default-float-format* 'rational) + (n (parse-integer (read-line data))) + (proteins (gimme n (read-line data))) + (spectrum (frequencies (u:read-lines data :key #'read-from-string)))) + (iterate + (for protein :in proteins) + (for m = (multiplicity spectrum protein)) + (finding (cons m protein) :maximizing m :into result) + (finally (return (format nil "~D~%~A" (car result) (cdr result))))))) + + diff -r a95ed046cc2c -r 7fcd748a4f00 src/problems/prtm.lisp --- a/src/problems/prtm.lisp Fri Aug 05 00:16:11 2022 -0400 +++ b/src/problems/prtm.lisp Mon Aug 08 19:00:58 2022 -0400 @@ -7,7 +7,7 @@ "821.392" (_ data (delete #\newline _) - (summation _ :key #'u:monoisotopic-mass) + u:monoisotopic-mass u:float-string)) diff -r a95ed046cc2c -r 7fcd748a4f00 src/problems/spec.lisp --- a/src/problems/spec.lisp Fri Aug 05 00:16:11 2022 -0400 +++ b/src/problems/spec.lisp Mon Aug 08 19:00:58 2022 -0400 @@ -10,15 +10,9 @@ (defparameter *output* "WMQS") -(defun roughly= (a b &optional (epsilon (* 0.000001d0 (min a b)))) - (< (abs (- a b)) epsilon)) - -(defun find-amino-acid (weight) - (rassocar weight u:*monoisotopic-masses* :test #'roughly=)) - (define-problem spec (data stream) *input* *output* - (let* ((*read-default-float-format* 'double-float) - (prefix-weights (u:read-lines data :key #'parse-float:parse-float)) + (let* ((*read-default-float-format* 'rational) + (prefix-weights (u:read-lines data :key #'read-from-string)) (weights (mapcar #'- (rest prefix-weights) prefix-weights)) - (result (mapcar #'find-amino-acid weights))) + (result (mapcar #'u:residue-with-mass weights))) (string-join "" result))) diff -r a95ed046cc2c -r 7fcd748a4f00 src/utils.lisp --- a/src/utils.lisp Fri Aug 05 00:16:11 2022 -0400 +++ b/src/utils.lisp Mon Aug 08 19:00:58 2022 -0400 @@ -166,52 +166,64 @@ "The monoisotopic mass of a single water molecule, in Daltons.") (defparameter *monoisotopic-masses* - '((#\A . 71.03711d0) - (#\C . 103.00919d0) - (#\D . 115.02694d0) - (#\E . 129.04259d0) - (#\F . 147.06841d0) - (#\G . 57.02146d0) - (#\H . 137.05891d0) - (#\I . 113.08406d0) - (#\K . 128.09496d0) - (#\L . 113.08406d0) - (#\M . 131.04049d0) - (#\N . 114.04293d0) - (#\P . 97.05276d0) - (#\Q . 128.05858d0) - (#\R . 156.10111d0) - (#\S . 87.03203d0) - (#\T . 101.04768d0) - (#\V . 99.06841d0) - (#\W . 186.07931d0) - (#\Y . 163.06333d0))) + '((#\A . 71.03711r0) + (#\C . 103.00919r0) + (#\D . 115.02694r0) + (#\E . 129.04259r0) + (#\F . 147.06841r0) + (#\G . 57.02146r0) + (#\H . 137.05891r0) + (#\I . 113.08406r0) + (#\K . 128.09496r0) + (#\L . 113.08406r0) + (#\M . 131.04049r0) + (#\N . 114.04293r0) + (#\P . 97.05276r0) + (#\Q . 128.05858r0) + (#\R . 156.10111r0) + (#\S . 87.03203r0) + (#\T . 101.04768r0) + (#\V . 99.06841r0) + (#\W . 186.07931r0) + (#\Y . 163.06333r0))) -(defun monoisotopic-mass (residue) +(defun-inline residue-mass (residue) ;; todo is a hash table faster here? we could also do an array ;; starting at (char-code #\A) if we really wanted (ecase residue - ;; These have to be doubles or we get too much rounding error. It's fine. - (#\A 71.03711d0) - (#\C 103.00919d0) - (#\D 115.02694d0) - (#\E 129.04259d0) - (#\F 147.06841d0) - (#\G 57.02146d0) - (#\H 137.05891d0) - (#\I 113.08406d0) - (#\K 128.09496d0) - (#\L 113.08406d0) - (#\M 131.04049d0) - (#\N 114.04293d0) - (#\P 97.05276d0) - (#\Q 128.05858d0) - (#\R 156.10111d0) - (#\S 87.03203d0) - (#\T 101.04768d0) - (#\V 99.06841d0) - (#\W 186.07931d0) - (#\Y 163.06333d0))) + ;; Just use rationals to avoid rounding errors and make them usable in hash + ;; tables. This doesn't have to be particularly fast. + (#\A 71.03711r0) + (#\C 103.00919r0) + (#\D 115.02694r0) + (#\E 129.04259r0) + (#\F 147.06841r0) + (#\G 57.02146r0) + (#\H 137.05891r0) + (#\I 113.08406r0) + (#\K 128.09496r0) + (#\L 113.08406r0) + (#\M 131.04049r0) + (#\N 114.04293r0) + (#\P 97.05276r0) + (#\Q 128.05858r0) + (#\R 156.10111r0) + (#\S 87.03203r0) + (#\T 101.04768r0) + (#\V 99.06841r0) + (#\W 186.07931r0) + (#\Y 163.06333r0))) + +(defun roughly= (a b &optional (epsilon (* 0.000001d0 (min a b)))) + (< (abs (- a b)) epsilon)) + +(defun residue-with-mass (mass) + (rassocar mass u:*monoisotopic-masses* :test #'roughly=)) + +(defun monoisotopic-mass (protein &key (start 0) (end (length protein))) + (check-type protein string) + (loop :for i :from start :below end + :summing (residue-mass (char protein i)))) ;;;; Strings ------------------------------------------------------------------