Posted to tcl by aspect at Sun Jul 20 16:41:29 GMT 2014view raw
- # coded up over the space of an hour to download an archive of released-for-free MSDN ebooks from:
- set topurl {http://blogs.msdn.com/b/mssmallbiz/archive/2014/07/07/largest-collection-of-free-microsoft-ebooks-ever-including-windows-8-1-windows-8-windows-7-office-2013-office-365-office-2010-sharepoint-2013-dynamics-crm-powershell-exchange-server-lync-2013-system-center-azure-cloud-sql.aspx}
- package require http
- package require tdom
- # support https:
- package require tls
- ::http::register https 443 ::tls::socket
- # helper for composing scripts:
- proc script {args} {
- join [lmap a $args {concat {*}$a}] \;
- }
- # pop 1 or more items from the start of a list (into named args). Returns last item popped.
- proc lpop {_ls args} {
- upvar 1 $_ls ls
- if {$args eq ""} {
- set ls [lassign $ls x]
- return $x
- }
- tailcall try [script {*}[lmap a $args {
- list set [list $a] \[[list lpop $_ls]\]
- }]]
- # kept for checking:
- tailcall try [join [
- lmap a $args {
- concat set [list $a] "\[[list lpop $_ls]\]"
- }
- ] \;]
- }
- # [lany .... body]
- # -> [apply {{}}[lmap ... {if $body {return 1}}]
- # .. this needs to make an [apply] frame too, or gensym a variable in the caller's scope
- #proc lany {args} {
- # set script [lindex $args end]
- # set args [lrange $args 0 end-1]
- # uplevel 1 [format %s {
- # list lmap %s {
- # subst -nocommands {if {[%s]} {expr 1}}
- # }
- # return 0
- # } $args $script]
- #}
- # useful helpers for crawling:
- #
- proc seen? s {
- global seen
- if {[info exists seen($s)]} {
- return true
- }
- incr seen($s)
- return 0
- }
- proc throttle {n ms} {
- upvar 1 throttle($n,$ms) throttle
- set now [clock milliseconds]
- lappend throttle [expr {$now+$ms}]
- while {[llength $throttle] > $n} {
- set sleep [expr {[lpop throttle]-$now}]
- if {$sleep <= 0} continue
- puts "More than $n in the last $ms ms, sleeping $sleep ms .."
- after $ms
- }
- }
- proc geturl {url} {
- try {
- puts "Getting $url ..."
- set url [string map {\ %20} $url] ;# erk
- set tok [::http::geturl $url]
- upvar #0 $tok state
- if {$state(status) ne "ok"} {
- error $state(status)
- }
- return $state(body)
- } finally {
- ::http::cleanup $tok
- }
- }
- proc write_file {filename data} {
- puts "Saving $filename ..."
- if {[file exists $filename]} {
- error "$filename exists!"
- }
- set fd [open $filename w]
- chan configure $fd -translation binary
- puts -nonewline $fd $data
- close $fd
- }
- proc setdom {_dom html} {
- upvar 1 $_dom dom
- if {[info exists dom]} {$dom delete; unset dom}
- set dom [dom parse -html $html]
- }
- set html [geturl $topurl]
- setdom dom $html
- set urls [$dom selectNodes {//a[.="PDF"]/@href}]
- set urls [lmap x $urls {lindex $x 1}]
- set skip 0 ;# - optional pre-skip
- set skipped {} ;# URLs that might need to be visited by hand
- foreach url $urls {
- if {$skip} {incr skip -1; continue}
- while {![seen? $url]} { ;# follow redirections, the lazy man's way
- throttle 10 1000 ;# max 10 requests per second
- set html [geturl $url]
- if {![string match {<html*} $html]} {
- write_file [file tail $url] $html
- break
- }
- setdom dom $html
- set nodes [$dom selectNodes {//a/@href}]
- if {[llength $nodes] != 1} { ;# is there a single link in here?
- error "I don't know how to follow redirects from $url .."
- }
- set url [lindex $nodes 0 1]
- apply {{url} { ;# [apply] gives us a scope to [return -code break]
- foreach {what pattern} {
- ms *microsoft.com/*/confirmation.aspx*
- ms *microsoft.com/*/details.aspx*
- lulu *.lulu.com/*
- codeplex *.codeplex.com/downloads/get*
- } {
- if {[string match $pattern $url]} {
- puts "SKIPPING $what: $url"
- lappend skipped $url
- return -code break
- }
- }
- }} $url
- if {[string match *.pdf $url] && [file exists [file tail $url]]} {
- puts "Have $url .."
- break
- }
- }
- }