src/alignment.lisp @ f2a11ed01196 default tip

Add Needleman-Wunsch
author Steve Losh <steve@stevelosh.com>
date Sat, 02 Nov 2019 11:06:08 -0400
parents (none)
children (none)
(defpackage :sand.alignment
  (:use :cl :losh :iterate))

(in-package :sand.alignment)

;;;; An implementation of the Needleman-Wunsch global alignment algorithm.

(declaim (ftype (function (base-char base-char) fixnum) basic-score))

(defun basic-score (a b)
  (if (char= a b)
    1
    -1))

(defun make-needleman-wunsch-grid (rows cols)
  (let ((grid (make-array (list rows cols)
                :element-type 'fixnum
                :initial-element 0)))
    (iterate (for row :below rows)
             (for score :downfrom 0)
             (setf (aref grid row 0) score))
    (iterate (for col :below cols)
             (for score :downfrom 0)
             (setf (aref grid 0 col) score))
    grid))

(defun extract-path (prev)
  (iterate
    (with (rows cols) = (array-dimensions prev))
    (for coord
         :first (cons (1- rows) (1- cols)) ; start at the bottom right
         :then (aref prev (car coord) (cdr coord))) ; move according to the path
    (until (null coord))
    (collect coord :at :start)))

(defun compute-alignment (seq1 seq2 path)
  (macrolet
      ((mark (&body dest-value-pairs)
         `(progn
            ,@(loop :for (dest value) :on dest-value-pairs :by #'cddr
                    :collect `(collect ,value
                                :into ,dest
                                :result-type 'simple-base-string)))))
    (iterate
      (for (r . c) :in path)
      (for pr :previous r)
      (for pc :previous c)
      (if-first-time
        ;; Chew off the gap at the beginning of either sequence, if any.
        (progn (dotimes (i r)
                 (mark align1 #\-
                       middle #\Space
                       align2 (aref seq2 i)))
               (dotimes (i c)
                 (mark align1 (aref seq1 i)
                       middle #\Space
                       align2 #\-)))
        ;; Mark another character.
        (progn
          (for ch1 = (aref seq1 (1- c)))
          (for ch2 = (aref seq2 (1- r)))
          (mark align1 (if (= c pc) #\- ch1)
                align2 (if (= r pr) #\- ch2)
                middle (cond
                         ((or (= r pr) (= c pc)) #\space)
                         ((char= ch1 ch2) #\|)
                         (t #\x)))))
      (finally (return (values align1 middle align2))))))

(defun print-grid (seq1 seq2 grid &key (pad 4))
  (flet ((print-padded (seq)
           (map nil (lambda (el) (format t "~V@A" pad el)) seq))
         (row-of (array row) ; ugly hack, but just for debugging…
           (iterate
             (for col :below (array-dimension array 1))
             (collect (aref array row col)))))
    (print-padded "  ")
    (print-padded seq1)
    (terpri)
    (iterate
      (for row :below (array-dimension grid 0))
      (print-padded (list (if (plusp row)
                            (aref seq2 (1- row))
                            #\Space)))
      (print-padded (row-of grid row))
      (terpri))))

(defun needleman-wunsch (seq1 seq2 &key
                         (score-function #'basic-score)
                         (gap-score -1))
  (setf seq1 (coerce seq1 'simple-base-string)
        seq2 (coerce seq2 'simple-base-string))
  (iterate
    (with rows = (1+ (length seq2)))
    (with cols = (1+ (length seq1)))
    (with grid = (make-needleman-wunsch-grid rows cols))
    (with prev = (make-array (list rows cols) :initial-element nil))
    (for row :from 1 :below rows)
    (for ch2 :in-string seq2)
    (iterate
      (for col :from 1 :below cols)
      (for ch1 :in-string seq1)
      (for row-1 = (1- row))
      (for col-1 = (1- col))
      (for match = (+ (aref grid row-1 col-1)
                      (funcall score-function ch1 ch2)))
      (for gap-1 = (+ (aref grid row col-1) gap-score))
      (for gap-2 = (+ (aref grid row-1 col) gap-score))
      (for best = (max match gap-1 gap-2))
      (setf (aref grid row col) best
            (aref prev row col) (cond
                                  ((= best match) (cons row-1 col-1))
                                  ((= best gap-1) (cons row col-1))
                                  ((= best gap-2) (cons col-1 row)))))
    (finally
      (print-grid seq1 seq2 grid)
      (return
        (compute-alignment seq1 seq2 (extract-path prev))))))



;; (needleman-wunsch
;;   "GGGGTTATAAAAC"
;;   "GGTAT")