src/robots/hacker-booze.lisp @ 0959bec6eaa2
Naming
author |
Steve Losh <steve@stevelosh.com> |
date |
Sat, 21 Jan 2017 01:33:05 +0000 |
parents |
1a365d4ae7bc |
children |
a5de24e85f19 |
(in-package :magitek.robots.hacker-booze)
; https://www.youtube.com/watch?v=2eIFeTn5nJg
;;;; Utils --------------------------------------------------------------------
(defun tick (ch)
(write-char ch)
(finish-output))
(defun read-corpus (path)
(read-file-into-string path))
(defun write-corpus (corpus path)
(write-string-into-file corpus path
:if-exists :supersede))
;;;; Hacker News --------------------------------------------------------------
(defparameter *errors* 0)
(defparameter *stories-per-corpus* 30)
(defparameter *max-comments-per-story* 200)
(defparameter *hn-corpus-path* "corpora/hacker-news.txt")
(defun firebase-get (url)
(-> url
drakma:http-request
(flex:octets-to-string :external-format :utf-8)
(jonathan:parse :as :hash-table)))
(defun hn-top ()
(firebase-get "https://hacker-news.firebaseio.com/v0/topstories.json"))
(defun hn-item (id)
(firebase-get
(format nil "https://hacker-news.firebaseio.com/v0/item/~d.json" id)))
(defun hn-story (story-id)
(hn-item story-id))
(defun hn-comment (story-id)
(hn-item story-id))
(defun hn-text (comment)
(-> (gethash "text" comment)
sanitize:clean
html-entities:decode-entities))
(defun hn-comments (story-id)
(iterate
(with story = (hn-story story-id))
(with children = (gethash "kids" story))
(repeat *max-comments-per-story*)
; (sleep 0.1)
(while children)
(for child-id = (pop children))
(for child = (handler-case (hn-comment child-id)
(drakma::drakma-simple-error () (incf *errors*) nil)))
(if child
(progn
(tick #\.)
(collect child)
(setf children (append children (gethash "kids" child))))
(tick #\x))
(finally (terpri))))
(defun rebuild-hn-corpus ()
(write-corpus (-<> (hn-top)
(take *stories-per-corpus* <>)
(mapcan #'hn-comments <>)
(remove-if-not #'identity <>)
(mapcar #'hn-text <>)
(format nil "~{~a~%~}" <>))
*hn-corpus-path*)
(values))
(defun read-hn-corpus ()
(read-corpus *hn-corpus-path*))
;;;; Beer ---------------------------------------------------------------------
(defparameter *ratebeer-pages* 5)
(defparameter *ratebeer-corpus-path* "corpora/ratebeer.txt")
(clss:define-pseudo-selector no-class (node)
(null (plump:attribute node "class")))
(defun ratebeer-get (page)
(-<> (format nil "http://www.ratebeer.com/beer-ratings/0/~d/" page)
drakma:http-request
plump:parse))
(defun ratebeer-clean (raw)
"Return a list of review strings."
(-<> raw
(clss:select "table.table td > span:no-class" <>)
(map 'list #'plump:text <>)))
(defun rebuild-ratebeer-corpus ()
(write-corpus (iterate
(for page :from 1 :to *ratebeer-pages*)
(appending (ratebeer-clean (ratebeer-get page)) :into reviews)
(tick #\.)
(finally (return (format nil "~{~A~%~}" reviews))))
*ratebeer-corpus-path*)
(values))
(defun read-ratebeer-corpus ()
(read-corpus *ratebeer-corpus-path*))
;;;; Wine ---------------------------------------------------------------------
(defparameter *wine-pages* 20)
(defparameter *wine-corpus-path* "corpora/wine.txt")
(defun wine-get-list (page-number)
(-<> (format nil "http://www.winemag.com/?s=&drink_type=wine&page=~D"
page-number)
drakma:http-request
plump:parse))
(defun wine-get-review (url)
(-<> url
drakma:http-request
plump:parse))
(defun wine-clean-list (list-page)
(-<> list-page
(clss:select "a.review-listing" <>)
(map 'list (rcurry #'plump:attribute "href") <>)))
(defun wine-clean-review (review-page)
(-<> review-page
(clss:select "#review .description" <>)
(elt <> 0)
(plump:text <>)))
(defun wine-get-reviews (page)
(iterate
(for review-link :in (wine-clean-list (wine-get-list page)))
(collect (wine-clean-review (wine-get-review review-link)))
(tick #\.)
(finally (terpri))))
(defun rebuild-wine-corpus ()
(write-corpus (iterate
(for page :from 1 :to *wine-pages*)
(appending (wine-get-reviews page) :into reviews)
(finally (return (format nil "~{~A~%~}" reviews))))
*wine-corpus-path*)
(values))
(defun read-wine-corpus ()
(read-corpus *wine-corpus-path*))
;;;; Generate -----------------------------------------------------------------
(defparameter *markov* nil)
(defparameter *markov-order* 2)
(defun load-corpora ()
(setf *markov*
(magitek.markov:build-markov-generator
(concatenate 'string
(read-wine-corpus)
(read-hn-corpus))
*markov-order*))
(values))
(defun random-string ()
(magitek.markov:generate-sentence *markov*))