Posted to tcl by aspect at Sun Jul 20 16:41:29 GMT 2014view raw

  1. # coded up over the space of an hour to download an archive of released-for-free MSDN ebooks from:
  2. set topurl {http://blogs.msdn.com/b/mssmallbiz/archive/2014/07/07/largest-collection-of-free-microsoft-ebooks-ever-including-windows-8-1-windows-8-windows-7-office-2013-office-365-office-2010-sharepoint-2013-dynamics-crm-powershell-exchange-server-lync-2013-system-center-azure-cloud-sql.aspx}
  3.  
  4. package require http
  5. package require tdom
  6.  
  7. # support https:
  8. package require tls
  9. ::http::register https 443 ::tls::socket
  10.  
  11. # helper for composing scripts:
  12. proc script {args} {
  13. join [lmap a $args {concat {*}$a}] \;
  14. }
  15.  
  16. # pop 1 or more items from the start of a list (into named args). Returns last item popped.
  17. proc lpop {_ls args} {
  18. upvar 1 $_ls ls
  19. if {$args eq ""} {
  20. set ls [lassign $ls x]
  21. return $x
  22. }
  23. tailcall try [script {*}[lmap a $args {
  24. list set [list $a] \[[list lpop $_ls]\]
  25. }]]
  26. # kept for checking:
  27. tailcall try [join [
  28. lmap a $args {
  29. concat set [list $a] "\[[list lpop $_ls]\]"
  30. }
  31. ] \;]
  32. }
  33.  
  34. # [lany .... body]
  35. # -> [apply {{}}[lmap ... {if $body {return 1}}]
  36. # .. this needs to make an [apply] frame too, or gensym a variable in the caller's scope
  37. #proc lany {args} {
  38. # set script [lindex $args end]
  39. # set args [lrange $args 0 end-1]
  40. # uplevel 1 [format %s {
  41. # list lmap %s {
  42. # subst -nocommands {if {[%s]} {expr 1}}
  43. # }
  44. # return 0
  45. # } $args $script]
  46. #}
  47.  
  48. # useful helpers for crawling:
  49. #
  50. proc seen? s {
  51. global seen
  52. if {[info exists seen($s)]} {
  53. return true
  54. }
  55. incr seen($s)
  56. return 0
  57. }
  58.  
  59. proc throttle {n ms} {
  60. upvar 1 throttle($n,$ms) throttle
  61. set now [clock milliseconds]
  62. lappend throttle [expr {$now+$ms}]
  63. while {[llength $throttle] > $n} {
  64. set sleep [expr {[lpop throttle]-$now}]
  65. if {$sleep <= 0} continue
  66. puts "More than $n in the last $ms ms, sleeping $sleep ms .."
  67. after $ms
  68. }
  69. }
  70.  
  71. proc geturl {url} {
  72. try {
  73. puts "Getting $url ..."
  74. set url [string map {\ %20} $url] ;# erk
  75. set tok [::http::geturl $url]
  76. upvar #0 $tok state
  77. if {$state(status) ne "ok"} {
  78. error $state(status)
  79. }
  80. return $state(body)
  81. } finally {
  82. ::http::cleanup $tok
  83. }
  84. }
  85.  
  86. proc write_file {filename data} {
  87. puts "Saving $filename ..."
  88. if {[file exists $filename]} {
  89. error "$filename exists!"
  90. }
  91. set fd [open $filename w]
  92. chan configure $fd -translation binary
  93. puts -nonewline $fd $data
  94. close $fd
  95. }
  96.  
  97. proc setdom {_dom html} {
  98. upvar 1 $_dom dom
  99. if {[info exists dom]} {$dom delete; unset dom}
  100. set dom [dom parse -html $html]
  101. }
  102.  
  103. set html [geturl $topurl]
  104. setdom dom $html
  105.  
  106. set urls [$dom selectNodes {//a[.="PDF"]/@href}]
  107. set urls [lmap x $urls {lindex $x 1}]
  108.  
  109. set skip 0 ;# - optional pre-skip
  110. set skipped {} ;# URLs that might need to be visited by hand
  111.  
  112. foreach url $urls {
  113. if {$skip} {incr skip -1; continue}
  114. while {![seen? $url]} { ;# follow redirections, the lazy man's way
  115. throttle 10 1000 ;# max 10 requests per second
  116. set html [geturl $url]
  117. if {![string match {<html*} $html]} {
  118. write_file [file tail $url] $html
  119. break
  120. }
  121. setdom dom $html
  122. set nodes [$dom selectNodes {//a/@href}]
  123. if {[llength $nodes] != 1} { ;# is there a single link in here?
  124. error "I don't know how to follow redirects from $url .."
  125. }
  126. set url [lindex $nodes 0 1]
  127. apply {{url} { ;# [apply] gives us a scope to [return -code break]
  128. foreach {what pattern} {
  129. ms *microsoft.com/*/confirmation.aspx*
  130. ms *microsoft.com/*/details.aspx*
  131. lulu *.lulu.com/*
  132. codeplex *.codeplex.com/downloads/get*
  133. } {
  134. if {[string match $pattern $url]} {
  135. puts "SKIPPING $what: $url"
  136. lappend skipped $url
  137. return -code break
  138. }
  139. }
  140. }} $url
  141. if {[string match *.pdf $url] && [file exists [file tail $url]]} {
  142. puts "Have $url .."
  143. break
  144. }
  145. }
  146. }