# Posted to tcl by mjanssen at Thu Feb 15 19:46:27 GMT 2018 (view pretty)

package require tdom

proc debug args {}

proc doit {} {
  tDOM::pullparser pp -ignorewhitecdata
  set file  8gbfile 
  set channel [open $file]
  fconfigure $channel -encoding utf-8
  pp inputchannel $channel
  set start [clock milliseconds]
  puts [pp state]
  set package 0
  while {[set state [pp next]] ne "END_DOCUMENT"} {
  # puts -nonewline "$state "
    switch $state {
      "START_TAG" {
        if {[pp tag] eq "doc"} {
          debug "YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY"
          set result [parseMainDoc]
          debug "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
          incr package
          if {$package % 100 == 0} {
            puts "$package packages handled in delta [expr {[clock milliseconds] - $start}] ms"
            set start [clock milliseconds]
          }
        }
      }
    }
  }
  pp delete
}

proc parseMainDoc {} {
  while {[set state [pp next]] ne "END_DOCUMENT"} {
    switch $state {
      "START_TAG" {
        if {[pp tag] eq "doc"} {
          debug "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
          set result [parseRestriction]
          debug "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"
        }
      }   
      "TEXT" {
        set trimmed [string trim [pp text]]
        if {$trimmed ne {}} {
          debug $trimmed
        }
      }
      "END_TAG" {
        if {[pp tag] eq "doc"} {
          return
        }
      }
    }
  }
}

proc parseRestriction {} {
  while {[set state [pp next]] ne "END_DOCUMENT"} {
    switch $state {
      "END_TAG" {
        # puts "xxxxxxxxx[pp tag]"
        if {[pp tag] eq "doc"} {
          # puts $result
          return 
        }
      }
      "TEXT" {
        # set trimmed {}
        set trimmed [string trim [pp text]]
        if {$trimmed ne {}} {
          debug $trimmed
        }
      }
    }
  }
}

# Script entry point: run the streaming parse as soon as the file is sourced.
doit