8cf52a515a48
Move Markov to Magitek
author | Steve Losh <steve@stevelosh.com> |
---|---|
date | Tue, 17 Jan 2017 17:29:35 +0000 |
parents | 4add15d40994 |
children | 5c5070c21269 |
branches/tags | (none) |
files | package.lisp sand.asd src/markov.lisp |
Changes
--- a/package.lisp Sat Jan 14 15:00:33 2017 +0000 +++ b/package.lisp Tue Jan 17 17:29:35 2017 +0000 @@ -81,16 +81,6 @@ :sand.quickutils :sand.utils)) -(defpackage :sand.markov - (:use - :cl - :cl-arrows - :losh - :iterate - :split-sequence - :sand.quickutils - :sand.utils)) - (defpackage :sand.dijkstra-maps (:use :cl
--- a/sand.asd Sat Jan 14 15:00:33 2017 +0000 +++ b/sand.asd Tue Jan 17 17:29:35 2017 +0000 @@ -56,7 +56,6 @@ (:file "ropes") (:file "sorting") (:file "ascii") - (:file "markov") (:file "dijkstra-maps") #+sbcl (:file "ffi") #+sbcl (:file "profiling")
--- a/src/markov.lisp Sat Jan 14 15:00:33 2017 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,187 +0,0 @@ -(in-package :sand.markov) - -; (defparameter *text* -; (read-file-into-string "data/lightships-and-lighthouses.txt")) - -(defclass markov () - ((database :initarg :database :accessor markov-database) - (beginnings :initarg :beginnings :accessor markov-beginnings))) - - -(defun make-vector (&optional (initial-length 1)) - (make-array initial-length :fill-pointer 0 :adjustable t)) - - -(defun delimiterp (c) - (member c '(#\space #\newline) :test #'char=)) - -(defun sentence-end-p (word) - (member (aref word (1- (length word))) - '(#\. #\? #\!))) - - -(defun split-words (string) - (split-sequence-if #'delimiterp string :remove-empty-subseqs t)) - -(defun partition-if (pred seq) - (iterate - (for element :in seq) - (collect element :into current) - (when (funcall pred element) - (collect current :into result) - (setf current nil)) - (finally (return result)))) - - -(defun build-markov-generator (corpus order) - (let* ((database (make-hash-table :test 'equal)) - (beginnings nil) - (words (split-words corpus)) - (sentences (partition-if #'sentence-end-p words))) - (iterate - (for sentence :in sentences) - (when (> (length sentence) order) - (iterate - (for chunk :in (n-grams (1+ order) sentence)) - (for prefix = (take order chunk)) - (for suffix = (car (last chunk))) - (if-first-time (pushnew prefix beginnings :test 'equal)) - (vector-push-extend - suffix - (ensure-gethash prefix database (make-vector)))))) - (make-instance 'markov - :database database - :beginnings (coerce beginnings 'vector)))) - -(defun generate-sentence (markov) - (iterate - (repeat 50) - (with start = (random-elt (markov-beginnings markov))) - (for prefix :first start :then (append (cdr prefix) (list word))) - (for word = (random-elt (gethash prefix (markov-database markov)))) - (collect word :into sentence) - (until (sentence-end-p word)) - (finally (return (format nil "~{~A~^ ~}" (append start sentence)))))) - - - -(defun firebase-get (url) - (-> url - drakma:http-request - (flex:octets-to-string :external-format :utf-8) - yason:parse)) - -(defun hn-top () - (firebase-get "https://hacker-news.firebaseio.com/v0/topstories.json")) - -(defun hn-item (id) - (firebase-get - (format nil "https://hacker-news.firebaseio.com/v0/item/~d.json" id))) - -(defun hn-story (story-id) - (hn-item story-id)) - -(defun hn-comment (story-id) - (hn-item story-id)) - -(defun hn-text (comment) - (-> (gethash "text" comment) - sanitize:clean - html-entities:decode-entities)) - -(defparameter *errors* 0) - -(defun hn-comments (story-id) - (iterate - (with story = (hn-story story-id)) - (with children = (gethash "kids" story)) - (repeat 50) - ; (sleep 0.1) - (while children) - (for child-id = (pop children)) - (for child = (handler-case (hn-comment child-id) - (drakma::drakma-simple-error () (incf *errors*) nil))) - (when child - (collect child) - (setf children (append children (gethash "kids" child)))))) - -(defvar *hn* nil) - -(defun build-hn-corpus () - (length (setf *hn* (-<> (hn-top) - (take 15 <>) - (mapcan #'hn-comments <>) - (mapcar #'hn-text <>) - (format nil "~{~a~%~}" <>))))) - - -(defun ratebeer-get (page) - (-<> (format nil "http://www.ratebeer.com/beer-ratings/0/~d/" page) - drakma:http-request - plump:parse)) - -(defun ratebeer-clean (raw) - (-<> raw - (plump:get-elements-by-tag-name <> "table") - car - (plump:get-elements-by-tag-name <> "td") - (mapcar (rcurry #'plump:get-elements-by-tag-name "span") <>) - (remove-if-not #'identity <>) - (mapcar #'first <>) - (mapcar #'plump:text <>))) - -(defvar *beer* nil) - -(defun build-beer-corpus () - (length - (setf *beer* - (iterate - (for page :from 1 :to 30) - (appending (ratebeer-clean (ratebeer-get page)) :into reviews) - (finally (return (format nil "~{~A~%~}" reviews))))))) - - - -(defun wine-get-list (page-number) - (-<> (format nil "http://www.winemag.com/?s=&drink_type=wine&page=~D" - page-number) - drakma:http-request - plump:parse)) - -(defun wine-get-review (url) - (-<> url - drakma:http-request - plump:parse)) - - -(defun wine-clean-list (list-page) - (-<> list-page - (clss:select "a.review-listing" <>) - (map 'list (rcurry #'plump:attribute "href") <>))) - -(defun wine-clean-review (review-page) - (plump:text (elt (clss:select "#review .description" review-page) 0))) - - -(defparameter *wine* nil) -(defun build-wine-corpus () - (length - (setf *wine* - (iterate - (for page :from 1 :to 10) - (for review-links = (wine-clean-list (wine-get-list page))) - (appending (mapcar (compose #'wine-clean-review #'wine-get-review) - review-links) - :into reviews) - (finally (return (format nil "~{~A~%~}" reviews))))))) - -; (defparameter *m* -; (build-markov-generator (concatenate 'string *hn* *wine*) 2)) - - -; (iterate (repeat 50) -; (for sentence = (generate-sentence *m*)) -; (when (<= (length sentence) 140) -; (terpri) -; (terpri) -; (princ sentence)))