Posted to tcl by mjanssen at Thu Feb 15 19:46:27 GMT 2018 | view pretty
package require tdom

# debug --
#   Diagnostic hook. Accepts any number of arguments and does nothing.
#   Give it a body (for example: puts $args) to trace the parse.
proc debug args {}

# doit --
#   Streams an XML file through a tDOM pull parser registered as the
#   command "pp". Each <doc> start tag seen by this top-level loop is handed
#   to parseMainDoc; after every 100 of them the elapsed wall-clock time for
#   that batch is printed and the timer is restarted.
#
# Arguments:
#   file  Path of the XML file to read. Optional; defaults to "8gbfile".
#
# Results:
#   Returns an empty string. Prints the initial parser state and the
#   per-batch timings to stdout. Any error raised while parsing is re-raised
#   after the parser command and the channel have been released.
proc doit {{file 8gbfile}} {
    # Open the input first so that a failing open cannot leave a parser behind.
    set channel [open $file]
    set parserCreated 0

    set status [catch {
        # Assumes the input is UTF-8 encoded.
        fconfigure $channel -encoding utf-8

        tDOM::pullparser pp -ignorewhitecdata
        set parserCreated 1
        pp inputchannel $channel

        set start [clock milliseconds]
        puts [pp state]
        set package 0

        while {[set state [pp next]] ne "END_DOCUMENT"} {
            # puts -nonewline "$state "
            if {$state ne "START_TAG"} {
                continue
            }
            if {[pp tag] ne "doc"} {
                continue
            }

            debug "YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY"
            parseMainDoc
            debug "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"

            incr package
            if {$package % 100 == 0} {
                puts "$package packages handled in delta [expr {[clock milliseconds] - $start}] ms"
                set start [clock milliseconds]
            }
        }
    } message options]

    # Release resources on both the success and the error path.
    if {$parserCreated} {
        pp delete
    }
    close $channel

    if {$status} {
        # Re-raise the original error with its errorInfo/errorCode intact.
        return -options $options $message
    }
    return
}

# parseMainDoc --
#   Called after the caller has consumed a <doc> start tag. Pulls events
#   from "pp" until the matching-named </doc> end tag (or END_DOCUMENT) is
#   seen. A nested <doc> start tag is delegated to parseRestriction; text
#   that is non-empty after trimming is passed to debug.
#
# Results:
#   Returns an empty string.
proc parseMainDoc {} {
    while {[set state [pp next]] ne "END_DOCUMENT"} {
        switch $state {
            "START_TAG" {
                if {[pp tag] eq "doc"} {
                    debug "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
                    parseRestriction
                    debug "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"
                }
            }
            "TEXT" {
                set trimmed [string trim [pp text]]
                if {$trimmed ne {}} {
                    debug $trimmed
                }
            }
            "END_TAG" {
                if {[pp tag] eq "doc"} {
                    return
                }
            }
        }
    }
}

# parseRestriction --
#   Pulls events from "pp" until a </doc> end tag (or END_DOCUMENT) is seen.
#   Text that is non-empty after trimming is passed to debug.
#
#   NOTE: start tags are ignored here, so nesting depth is not tracked; the
#   first </doc> end tag encountered ends this procedure.
#
# Results:
#   Returns an empty string.
proc parseRestriction {} {
    while {[set state [pp next]] ne "END_DOCUMENT"} {
        switch $state {
            "END_TAG" {
                # puts "xxxxxxxxx[pp tag]"
                if {[pp tag] eq "doc"} {
                    return
                }
            }
            "TEXT" {
                set trimmed [string trim [pp text]]
                if {$trimmed ne {}} {
                    debug $trimmed
                }
            }
        }
    }
}

doit