Posted to tcl by aspect at Sun Jul 20 16:41:29 GMT 2014
# coded up over the space of an hour to download an archive of
# released-for-free MSDN ebooks from:
set topurl {http://blogs.msdn.com/b/mssmallbiz/archive/2014/07/07/largest-collection-of-free-microsoft-ebooks-ever-including-windows-8-1-windows-8-windows-7-office-2013-office-365-office-2010-sharepoint-2013-dynamics-crm-powershell-exchange-server-lync-2013-system-center-azure-cloud-sql.aspx}

package require http
package require tdom

# support https:
package require tls
::http::register https 443 ::tls::socket

# helper for composing scripts: each argument is a command (as a list);
# flatten each and join them with semicolons
proc script {args} {
    join [lmap a $args {concat {*}$a}] \;
}

# pop 1 or more items from the start of a list (into named args).
# Returns the last item popped.
proc lpop {_ls args} {
    upvar 1 $_ls ls
    if {$args eq ""} {
        set ls [lassign $ls x]
        return $x
    }
    # compose "set $a [lpop $_ls]" for each name and run the result in the
    # caller's frame via [tailcall], so the [upvar 1] above still resolves
    # correctly on the recursive calls
    tailcall try [script {*}[lmap a $args {
        list set [list $a] \[[list lpop $_ls]\]
    }]]
    # kept for checking (unreachable: the [tailcall] above returns):
    tailcall try [join [
        lmap a $args {
            concat set [list $a] "\[[list lpop $_ls]\]"
        }
    ] \;]
}

# [lany ... body]
#  -> [apply {{}} [lmap ... {if $body {return 1}}]]
# .. this needs to make an [apply] frame too, or gensym a variable in the
# caller's scope
#proc lany {args} {
#    set script [lindex $args end]
#    set args [lrange $args 0 end-1]
#    uplevel 1 [format %s {
#        list lmap %s {
#            subst -nocommands {if {[%s]} {expr 1}}
#        }
#        return 0
#    } $args $script]
#}

# useful helpers for crawling:

# have we seen this string before?  (also marks it as seen)
proc seen? {s} {
    global seen
    if {[info exists seen($s)]} {
        return 1
    }
    incr seen($s)
    return 0
}

# sliding-window rate limiter: allow at most $n calls per $ms milliseconds
proc throttle {n ms} {
    upvar 1 throttle($n,$ms) throttle
    set now [clock milliseconds]
    lappend throttle [expr {$now + $ms}]
    while {[llength $throttle] > $n} {
        set sleep [expr {[lpop throttle] - $now}]
        if {$sleep <= 0} continue
        puts "More than $n in the last $ms ms, sleeping $sleep ms .."
        after $sleep    ;# was [after $ms], which always slept the full window
    }
}

proc geturl {url} {
    try {
        puts "Getting $url ..."
        set url [string map {\ %20} $url]   ;# erk
        set tok [::http::geturl $url]
        upvar #0 $tok state
        if {$state(status) ne "ok"} {
            error $state(status)
        }
        return $state(body)
    } finally {
        if {[info exists tok]} {    ;# [::http::geturl] itself may have thrown
            ::http::cleanup $tok
        }
    }
}

proc write_file {filename data} {
    puts "Saving $filename ..."
    if {[file exists $filename]} {
        error "$filename exists!"
    }
    set fd [open $filename w]
    chan configure $fd -translation binary
    puts -nonewline $fd $data
    close $fd
}

# (re)parse $html into a DOM handle held in the named variable, freeing
# any DOM it previously held
proc setdom {_dom html} {
    upvar 1 $_dom dom
    if {[info exists dom]} {$dom delete; unset dom}
    set dom [dom parse -html $html]
}

set html [geturl $topurl]
setdom dom $html
set urls [$dom selectNodes {//a[.="PDF"]/@href}]
set urls [lmap x $urls {lindex $x 1}]   ;# each match is a {href value} pair

set skip 0      ;# optional pre-skip
set skipped {}  ;# URLs that might need to be visited by hand

foreach url $urls {
    if {$skip} {incr skip -1; continue}
    while {![seen? $url]} {     ;# follow redirections, the lazy man's way
        throttle 10 1000        ;# max 10 requests per second
        set html [geturl $url]
        if {![string match {<html*} $html]} {   ;# not HTML: assume the payload
            write_file [file tail $url] $html
            break
        }
        setdom dom $html
        set nodes [$dom selectNodes {//a/@href}]
        if {[llength $nodes] != 1} {    ;# is there a single link in here?
            error "I don't know how to follow redirects from $url .."
        }
        set url [lindex $nodes 0 1]
        apply {{url} {      ;# [apply] gives us a scope to [return -code break]
            upvar 1 skipped skipped     ;# [apply] has its own frame: reach back
            foreach {what pattern} {
                ms       *microsoft.com/*/confirmation.aspx*
                ms       *microsoft.com/*/details.aspx*
                lulu     *.lulu.com/*
                codeplex *.codeplex.com/downloads/get*
            } {
                if {[string match $pattern $url]} {
                    puts "SKIPPING $what: $url"
                    lappend skipped $url
                    return -code break
                }
            }
        }} $url
        if {[string match *.pdf $url] && [file exists [file tail $url]]} {
            puts "Have $url .."
            break
        }
    }
}
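
# For reference, a hedged illustration of [lpop] at an interactive prompt
# (not part of the script above).  The multi-name form builds one [set] per
# name with [script] and evaluates the result in the caller's frame:
#   % set ls {a b c d}
#   a b c d
#   % lpop ls
#   a                   ;# ls is now {b c d}
#   % lpop ls x y
#   c                   ;# x=b, y=c, ls is now {d}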
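
# Likewise, a sketch of the [throttle] sliding window (assuming the
# [after $sleep] fix above): each call appends a deadline of now+$ms, and
# once more than $n deadlines are queued it pops the oldest and sleeps until
# that one expires, so at most $n calls complete per $ms milliseconds:
#   % for {set i 0} {$i < 12} {incr i} {throttle 10 1000}
#   More than 10 in the last 1000 ms, sleeping <n> ms ..
#   More than 10 in the last 1000 ms, sleeping <n> ms ..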
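
# And a minimal sketch of the [apply]-for-break trick used in the main loop:
# [return -code break] makes the *call* to apply return TCL_BREAK, which the
# interpreter treats as a [break] out of whatever loop encloses the call site:
#   % foreach i {1 2 3 4} {
#         apply {{i} {if {$i == 3} {return -code break}}} $i
#         puts $i
#     }
#   1
#   2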