src/problems/mprt.lisp @ 23151d9021cf
REVP
| author | Steve Losh <steve@stevelosh.com> |
|---|---|
| date | Thu, 08 Nov 2018 22:06:36 -0500 |
| parents | b3598dc7ef5c |
| children | 2735aa6aab79 |
(in-package :rosalind)

;; This was pretty simple, except for discovering that cl-ppcre's all-matches
;; function skips overlapping matches.  Otherwise we just convert the motif to
;; a regex and handle grabbing the data from Uniprot (which is straightforward
;; but can be slow).

;; Sample input: one protein ID per line.  The newlines matter, because the
;; solver below consumes the data with `read-line`.
(defparameter *input-mprt* "A2Z669
B5ZC00
P07204_TRBM_HUMAN
P20840_SAG1_YEAST")

;; Expected output for the sample input: for every protein with at least one
;; motif match, a line with its ID followed by a line of space-separated match
;; start positions.  Every line (including the last) ends with a newline, to
;; match the `~%` directives in the solver's format string.
(defparameter *output-mprt* "B5ZC00
85 118 142 306 395
P07204_TRBM_HUMAN
47 115 116 382 409
P20840_SAG1_YEAST
79 109 135 248 306 348 364 402 485 501 614
")

;; The motif to search for, in protein motif shorthand: [XY] means "X or Y" and
;; {X} means "anything except X".
(defparameter *motif-n-glycosylation* "N{P}[ST]{P}")

(defun motif-to-regex (motif)
  "Turn a protein motif shorthand into a PPCRE scanner.

  `motif` is a string such as \"N{P}[ST]{P}\".  Only the {X} (\"not X\")
  notation is rewritten; everything else is passed to cl-ppcre verbatim.

  Returns the result of `ppcre:create-scanner` on the rewritten string.

  "
  (-<> motif
    ;; All we have to do is turn {X} into [^X] and compile.
    (ppcre:regex-replace-all "[{]" <> "[^") ; { becomes [^
    (substitute #\] #\} <>)                 ; } becomes ]
    ppcre:create-scanner))

(defun all-matches-dammit (regex target-string)
  "Return a list of start and end positions of all matches of `regex` on `target-string`.

  The result is a flat list `(start-1 end-1 start-2 end-2 ...)` of the
  positions reported by `ppcre:scan`, or `nil` if there are no matches.

  Unlike `ppcre:all-matches` this will return ALL matches, even if they're
  overlapping.

  Example:

    (all-matches-dammit \"a..\" \"aabc\")
    ; =>
    ; (0 3 1 4)

    (ppcre:all-matches \"a..\" \"aabc\")
    ; =>
    ; (0 3) ; dammit

  "
  ;; Scan repeatedly, restarting one character past the START of the previous
  ;; match (rather than at its end) so that matches overlapping the previous
  ;; one are still found.
  (iterate
    (with i = 0)
    (for (values start end) = (ppcre:scan regex target-string :start i))
    (while start) ; `ppcre:scan` returns nil once nothing more matches
    (collect start)
    (collect end)
    (setf i (1+ start))))

;; Solver for the MPRT problem.  For every ID read (one per line) from `data`,
;; look the ID up with `uniprot`, find all (possibly overlapping) occurrences
;; of the N-glycosylation motif, and write the ID plus the 1-based match start
;; positions to the result string.  IDs without any match are omitted.
;;
;; NOTE(review): `define-problem` and `uniprot` are defined elsewhere in the
;; project.  From the destructuring below, `uniprot` presumably returns a cons
;; whose cdr is the protein sequence string -- confirm against its definition.
(define-problem mprt (data stream)
    *input-mprt*
    *output-mprt*
  (with-output-to-string (s)
    (iterate
      ;; Compile the motif once, before processing any IDs.
      (with n-glycosylation = (motif-to-regex *motif-n-glycosylation*))
      (for id :in-stream data :using #'read-line)
      (for (nil . protein) = (uniprot id))
      (for matches = (all-matches-dammit n-glycosylation protein))
      (when matches
        ;; `matches` alternates start/end positions.  Inside ~{...~} the ~D
        ;; prints a start and the ~* skips the end that follows it; ~^ stops
        ;; before emitting a trailing space.  `1+` converts the positions
        ;; from `ppcre:scan` to 1-based for output.
        (format s "~A~%~{~D~*~^ ~}~%" id (mapcar #'1+ matches))))))