f2a11ed01196 default tip

Add Needleman-Wunsch
[view raw] [browse files]
author Steve Losh <steve@stevelosh.com>
date Sat, 02 Nov 2019 11:06:08 -0400
parents ea247d3d5953
children (none)
branches/tags default tip
files sand.asd src/alignment.lisp

Changes

--- a/sand.asd	Fri Feb 02 00:03:38 2018 -0500
+++ b/sand.asd	Sat Nov 02 11:06:08 2019 -0400
@@ -7,41 +7,11 @@
   :license "MIT"
   :version "0.0.1"
 
-  :depends-on (
-
-               #+sbcl :sb-sprof
-               ;; :cffi
-               ;; :cl-algebraic-data-type
-               ;; :cl-charms
-               ;; :cl-fad
-               ;; :cl-ppcre
-               ;; :clss
-               ;; :compiler-macro
-               ;; :cl-conspack
-               ;; :drakma
-               ;; :easing
-               ;; :flexi-streams
-               ;; :function-cache
+  :depends-on (:alexandria
                :iterate
-               :losh
-               ;; :parenscript
-               ;; :parse-float
-               ;; :plump
-               ;; :rs-colors
-               ;; :sanitize
-               ;; :sketch
-               ;; :split-sequence
-               ;; :storable-functions
-               ;; :trivia
-               ;; :trivial-main-thread
-               ;; :vex
-               ;; :yason
-
-               )
+               :losh)
 
   :serial t
-  :components
-  ((:module "vendor"
-    :serial t
-    :components ((:file "quickutils")))
-   ))
+  :components ((:module "vendor"
+                :serial t
+                :components ((:file "quickutils")))))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/alignment.lisp	Sat Nov 02 11:06:08 2019 -0400
@@ -0,0 +1,124 @@
+(defpackage :sand.alignment
+  (:use :cl :losh :iterate))
+
+(in-package :sand.alignment)
+
+;;;; An implementation of the Needleman-Wunsch global alignment algorithm.
+
+(declaim (ftype (function (base-char base-char) fixnum) basic-score))
+
+(defun basic-score (a b)
+  (if (char= a b)
+    1
+    -1))
+
+(defun make-needleman-wunsch-grid (rows cols)
+  (let ((grid (make-array (list rows cols)
+                :element-type 'fixnum
+                :initial-element 0)))
+    (iterate (for row :below rows)
+             (for score :downfrom 0)
+             (setf (aref grid row 0) score))
+    (iterate (for col :below cols)
+             (for score :downfrom 0)
+             (setf (aref grid 0 col) score))
+    grid))
+
+(defun extract-path (prev)
+  (iterate
+    (with (rows cols) = (array-dimensions prev))
+    (for coord
+         :first (cons (1- rows) (1- cols)) ; start at the bottom right
+         :then (aref prev (car coord) (cdr coord))) ; move according to the path
+    (until (null coord))
+    (collect coord :at :start)))
+
+(defun compute-alignment (seq1 seq2 path)
+  (macrolet
+      ((mark (&body dest-value-pairs)
+         `(progn
+            ,@(loop :for (dest value) :on dest-value-pairs :by #'cddr
+                    :collect `(collect ,value
+                                :into ,dest
+                                :result-type 'simple-base-string)))))
+    (iterate
+      (for (r . c) :in path)
+      (for pr :previous r)
+      (for pc :previous c)
+      (if-first-time
+        ;; Chew off the gap at the beginning of either sequence, if any.
+        (progn (dotimes (i r)
+                 (mark align1 #\-
+                       middle #\Space
+                       align2 (aref seq2 i)))
+               (dotimes (i c)
+                 (mark align1 (aref seq1 i)
+                       middle #\Space
+                       align2 #\-)))
+        ;; Mark another character.
+        (progn
+          (for ch1 = (aref seq1 (1- c)))
+          (for ch2 = (aref seq2 (1- r)))
+          (mark align1 (if (= c pc) #\- ch1)
+                align2 (if (= r pr) #\- ch2)
+                middle (cond
+                         ((or (= r pr) (= c pc)) #\space)
+                         ((char= ch1 ch2) #\|)
+                         (t #\x)))))
+      (finally (return (values align1 middle align2))))))
+
+(defun print-grid (seq1 seq2 grid &key (pad 4))
+  (flet ((print-padded (seq)
+           (map nil (lambda (el) (format t "~V@A" pad el)) seq))
+         (row-of (array row) ; ugly hack, but just for debugging…
+           (iterate
+             (for col :below (array-dimension array 1))
+             (collect (aref array row col)))))
+    (print-padded "  ")
+    (print-padded seq1)
+    (terpri)
+    (iterate
+      (for row :below (array-dimension grid 0))
+      (print-padded (list (if (plusp row)
+                            (aref seq2 (1- row))
+                            #\Space)))
+      (print-padded (row-of grid row))
+      (terpri))))
+
+(defun needleman-wunsch (seq1 seq2 &key
+                         (score-function #'basic-score)
+                         (gap-score -1))
+  (setf seq1 (coerce seq1 'simple-base-string)
+        seq2 (coerce seq2 'simple-base-string))
+  (iterate
+    (with rows = (1+ (length seq2)))
+    (with cols = (1+ (length seq1)))
+    (with grid = (make-needleman-wunsch-grid rows cols))
+    (with prev = (make-array (list rows cols) :initial-element nil))
+    (for row :from 1 :below rows)
+    (for ch2 :in-string seq2)
+    (iterate
+      (for col :from 1 :below cols)
+      (for ch1 :in-string seq1)
+      (for row-1 = (1- row))
+      (for col-1 = (1- col))
+      (for match = (+ (aref grid row-1 col-1)
+                      (funcall score-function ch1 ch2)))
+      (for gap-1 = (+ (aref grid row col-1) gap-score))
+      (for gap-2 = (+ (aref grid row-1 col) gap-score))
+      (for best = (max match gap-1 gap-2))
+      (setf (aref grid row col) best
+            (aref prev row col) (cond
+                                  ((= best match) (cons row-1 col-1))
+                                  ((= best gap-1) (cons row col-1))
+                                  ((= best gap-2) (cons col-1 row)))))
+    (finally
+      (print-grid seq1 seq2 grid)
+      (return
+        (compute-alignment seq1 seq2 (extract-path prev))))))
+
+
+
+;; (needleman-wunsch
+;;   "GGGGTTATAAAAC"
+;;   "GGTAT")