# Posted to tcl by aspect at Sun Jul 20 16:41:29 GMT 2014 (pastebin header)

# coded up over the space of an hour to download an archive of released-for-free MSDN ebooks from:
# (the page lists the ebooks; every anchor whose text is "PDF" is a download or redirect link)
set topurl {http://blogs.msdn.com/b/mssmallbiz/archive/2014/07/07/largest-collection-of-free-microsoft-ebooks-ever-including-windows-8-1-windows-8-windows-7-office-2013-office-365-office-2010-sharepoint-2013-dynamics-crm-powershell-exchange-server-lync-2013-system-center-azure-cloud-sql.aspx}

# fetching and HTML parsing dependencies:
package require http
package require tdom

# support https:
package require tls
::http::register https 443 ::tls::socket

# helper for composing scripts:
# Compose several command fragments into one script string.  Each argument
# is flattened one level with [concat {*}...] and the pieces are joined
# with ";" so the result can be handed to [try]/[eval] as a single script.
proc script {args} {
    set commands {}
    foreach fragment $args {
        lappend commands [concat {*}$fragment]
    }
    return [join $commands \;]
}

# pop 1 or more items from the start of a list (into named args).  Returns last item popped.
# Pop 1 or more items from the start of a list (into named args in the
# caller's scope).  Returns the last item popped.
#
#   set ls {a b c}; lpop ls       -> "a", ls becomes {b c}
#   set ls {a b c}; lpop ls x y   -> "b", x="a", y="b", ls becomes {c}
#
# NOTE(review): the original kept an unreachable second [tailcall] after
# the first one "for checking"; [tailcall] never returns, so that code
# was dead and has been removed.
proc lpop {_ls args} {
    upvar 1 $_ls ls
    if {$args eq ""} {
        # simple case: shift one element off and return it
        set ls [lassign $ls x]
        return $x
    }
    # multi-variable case: build "set v1 [lpop ls];set v2 [lpop ls];..."
    # and [tailcall try] it so the sets run in the caller's frame
    tailcall try [script {*}[lmap a $args {
        list set [list $a] \[[list lpop $_ls]\]
    }]]
}

# [lany .... body]
#  -> [apply {{}}[lmap ... {if $body {return 1}}]
#  .. this needs to make an [apply] frame too, or gensym a variable in the caller's scope
#proc lany {args} {
#    set script [lindex $args end]
#    set args [lrange $args 0 end-1]
#    uplevel 1 [format %s {
#        list lmap %s {
#            subst -nocommands {if {[%s]} {expr 1}}
#        }
#        return 0
#    } $args $script]
#}

# useful helpers for crawling:
#
# Has $s been seen before?  Side effect: records $s in the global ::seen
# array, so the first call for a given $s returns 0 (false) and every
# later call returns 1 (true).
# Fix: the original mixed boolean spellings ("true" vs 0); normalized to
# 1/0 — callers only test truthiness, so this is fully compatible.
proc seen? s {
    global seen
    if {[info exists seen($s)]} {
        return 1
    }
    incr seen($s)
    return 0
}

# Rate limiter: allow at most $n calls per $ms milliseconds.  Keeps a
# sliding window of expiry timestamps in the caller's throttle($n,$ms)
# variable and sleeps just long enough to drop back under the limit.
proc throttle {n ms} {
    upvar 1 throttle($n,$ms) throttle
    set now [clock milliseconds]
    lappend throttle [expr {$now+$ms}]
    while {[llength $throttle] > $n} {
        set sleep [expr {[lpop throttle]-$now}]
        if {$sleep <= 0} continue
        puts "More than $n in the last $ms ms, sleeping $sleep ms .."
        # BUGFIX: was [after $ms] — slept the full window every time
        # instead of the remaining time actually computed (and printed).
        after $sleep
    }
}

# Fetch $url and return the response body.  Raises an error if the
# request cannot be made or finishes with a non-"ok" status.  Always
# releases the http token, even on failure.
proc geturl {url} {
    try {
        puts "Getting $url ..."
        set url [string map {\  %20} $url]  ;# erk: crude space escaping
        set tok [::http::geturl $url]
        upvar #0 $tok state
        if {$state(status) ne "ok"} {
            error $state(status)
        }
        return $state(body)
    } finally {
        # BUGFIX: when ::http::geturl itself throws (bad URL, connect
        # failure) $tok was never set, and the unguarded cleanup raised
        # {can't read "tok"} from the finally clause, masking the real
        # error.  Only clean up a token that actually exists.
        if {[info exists tok]} {
            ::http::cleanup $tok
        }
    }
}

# Persist $data to $filename verbatim (binary-safe, no trailing newline).
# Refuses to overwrite: raises an error if $filename already exists.
proc write_file {filename data} {
    puts "Saving $filename ..."
    if {[file exists $filename]} {
        error "$filename exists!"
    }
    set fd [open $filename w]
    try {
        chan configure $fd -translation binary
        puts -nonewline $fd $data
    } finally {
        # fix: the original leaked the channel if configure/puts errored
        close $fd
    }
}

# (Re)parse $html into a tdom document held in the caller's variable
# named $_dom, first disposing of any document already stored there so
# the previous DOM tree doesn't leak.
proc setdom {_dom html} {
    upvar 1 $_dom dom
    if {[info exists dom]} {
        $dom delete
        unset dom
    }
    set dom [dom parse -html $html]
}

# fetch the index page and parse it into a DOM
set html [geturl $topurl]
setdom dom $html

# every ebook link is an <a> whose text is exactly "PDF"; selectNodes on
# an attribute returns {attrName value} pairs, so keep element 1 (the URL)
set urls [$dom selectNodes {//a[.="PDF"]/@href}]
set urls [lmap x $urls {lindex $x 1}]

set skip 0 ;# - optional pre-skip: number of URLs to pass over on restart
set skipped {}  ;# URLs that might need to be visited by hand

# Main crawl: for each PDF link, follow single-link redirect pages until
# we hit non-HTML content (the actual file), then save it.
foreach url $urls {
    if {$skip} {incr skip -1; continue}
    while {![seen? $url]} { ;# follow redirections, the lazy man's way
        throttle 10 1000    ;# max 10 requests per second
        set html [geturl $url]
        if {![string match {<html*} $html]} {
            write_file [file tail $url] $html
            break
        }
        setdom dom $html
        set nodes [$dom selectNodes {//a/@href}]
        if {[llength $nodes] != 1} {    ;# is there a single link in here?
            error "I don't know how to follow redirects from $url .."
        }
        set url [lindex $nodes 0 1]
        apply {{url} {  ;# [apply] gives us a scope to [return -code break]
            # BUGFIX: the lambda has its own scope, so without this the
            # lappend below wrote to a lambda-local "skipped" that was
            # thrown away — the global skip-list stayed empty.
            global skipped
            foreach {what pattern} {
                    ms      *microsoft.com/*/confirmation.aspx*
                    ms      *microsoft.com/*/details.aspx*
                    lulu    *.lulu.com/*
                    codeplex    *.codeplex.com/downloads/get*
            } {
                if {[string match $pattern $url]} {
                    puts "SKIPPING $what: $url"
                    lappend skipped $url
                    return -code break  ;# breaks the [while] at the call site
                }
            }
        }} $url
        if {[string match *.pdf $url] && [file exists [file tail $url]]} {
            puts "Have $url .."
            break
        }
    }
}