Posted to tcl by mjanssen at Thu Feb 15 19:46:27 GMT 2018 (view raw)

  1. package require tdom
  2.  
  # debug --
  #
  #   Tracing hook that accepts any number of arguments and ignores them.
  #   Always yields an empty string.
  proc debug {args} {
      return
  }
  4.  
  # doit --
  #
  #   Reads the file "8gbfile" as UTF-8 through a tDOM pull parser command
  #   named "pp". Every time the main loop sees a START_TAG event for a "doc"
  #   element it hands control to parseMainDoc, and after each 100 such
  #   elements it prints how many were handled and the milliseconds spent on
  #   the last batch.
  #
  # Arguments:
  #   None.
  #
  # Results:
  #   An empty string on success. Any error raised while opening or parsing
  #   the file is re-raised unchanged after cleanup.
  #
  # Side effects:
  #   Creates the command "pp" for the duration of the call and writes
  #   progress lines to stdout. The parser command is deleted and the input
  #   channel is closed on both the success and the error path.
  proc doit {} {
      tDOM::pullparser pp -ignorewhitecdata
      set channel {}

      # Run everything that can fail inside catch so the cleanup below is
      # always reached; the captured result/options are re-raised at the end.
      catch {
          set file 8gbfile
          set channel [open $file]
          fconfigure $channel -encoding utf-8
          pp inputchannel $channel

          set start [clock milliseconds]
          puts [pp state]
          set package 0

          while {[set state [pp next]] ne "END_DOCUMENT"} {
              # Only the start of a "doc" element is of interest at this level.
              if {$state ne "START_TAG" || [pp tag] ne "doc"} {
                  continue
              }

              debug "YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY"
              parseMainDoc
              debug "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

              incr package
              if {$package % 100 == 0} {
                  puts "$package packages handled in delta [expr {[clock milliseconds] - $start}] ms"
                  # Restart the timer so each report covers only the last batch.
                  set start [clock milliseconds]
              }
          }
      } result options

      # Cleanup that must happen whether or not parsing succeeded.
      pp delete
      if {$channel ne {}} {
          close $channel
      }

      # Re-deliver the outcome of the guarded script (normal result or error).
      return -options $options $result
  }
  33.  
  # parseMainDoc --
  #
  #   Pulls events from the parser command "pp" until an END_TAG event for a
  #   "doc" element or END_DOCUMENT is seen.
  #
  #   - START_TAG of a "doc" element: delegates to parseRestriction.
  #   - TEXT: trims the text and passes it to debug when it is non-empty.
  #   - END_TAG of a "doc" element: returns to the caller.
  #
  # Results:
  #   An empty string.
  proc parseMainDoc {} {
      while {1} {
          set event [pp next]
          if {$event eq "END_DOCUMENT"} {
              break
          }

          if {$event eq "START_TAG"} {
              # Elements other than "doc" are skipped at this level.
              if {[pp tag] ne "doc"} {
                  continue
              }
              debug "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
              parseRestriction
              debug "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"
          } elseif {$event eq "TEXT"} {
              set content [string trim [pp text]]
              if {$content ne {}} {
                  debug $content
              }
          } elseif {$event eq "END_TAG"} {
              if {[pp tag] eq "doc"} {
                  return
              }
          }
      }
  }
  58.  
  # parseRestriction --
  #
  #   Pulls events from the parser command "pp" until an END_TAG event for a
  #   "doc" element or END_DOCUMENT is seen. Non-empty trimmed TEXT content
  #   is passed to debug; all other events are ignored.
  #
  # Results:
  #   An empty string.
  proc parseRestriction {} {
      for {set event [pp next]} {$event ne "END_DOCUMENT"} {set event [pp next]} {
          if {$event eq "END_TAG"} {
              # The closing "doc" tag ends this level of parsing.
              if {[pp tag] eq "doc"} {
                  return
              }
          } elseif {$event eq "TEXT"} {
              set content [string trim [pp text]]
              if {$content ne {}} {
                  debug $content
              }
          }
      }
  }
  79.  
  # Script entry point: run the parse as soon as the file is evaluated.
  80. doit
  81.