Posted to tcl by mjanssen at Thu Feb 15 19:46:27 GMT 2018 (view raw)
- package require tdom
- # debug ?arg ...?
- #
- # Logging stub: accepts any number of arguments, does nothing with them,
- # and returns an empty string.
- proc debug {args} {
-     return
- }
- # doit ?file?
- #
- # Streams an XML file through a tDOM pull parser command named "pp" and
- # calls parseMainDoc for every <doc> start tag it encounters, printing the
- # elapsed time after each batch of 100 handled elements.
- #
- # Arguments:
- #   file  Path of the XML file to read. Defaults to "8gbfile", the name
- #         that used to be hard-coded here.
- #
- # Results:
- #   Returns an empty string. An error raised while parsing is re-raised
- #   unchanged after the parser command and the input channel have been
- #   released.
- proc doit {{file 8gbfile}} {
-     # Open the input before creating the parser so that a failing open
-     # cannot leave a stray "pp" command behind.
-     set channel [open $file]
-     set parserCreated 0
-
-     set failed [catch {
-         fconfigure $channel -encoding utf-8
-         tDOM::pullparser pp -ignorewhitecdata
-         set parserCreated 1
-         pp inputchannel $channel
-
-         set start [clock milliseconds]
-         puts [pp state]
-         set package 0
-         while {[set state [pp next]] ne "END_DOCUMENT"} {
-             # Only <doc> start tags matter at this level; the element's
-             # content is consumed by parseMainDoc. The && short-circuit
-             # keeps "pp tag" from being asked in any other state.
-             if {$state eq "START_TAG" && [pp tag] eq "doc"} {
-                 debug "YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY"
-                 parseMainDoc
-                 debug "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
-                 incr package
-                 if {$package % 100 == 0} {
-                     puts "$package packages handled in delta [expr {[clock milliseconds] - $start}] ms"
-                     # Restart the timer so each report covers one batch.
-                     set start [clock milliseconds]
-                 }
-             }
-         }
-     } result options]
-
-     if {$failed} {
-         # Best-effort cleanup: a secondary failure here must not mask the
-         # original error, which is re-raised with its original options.
-         if {$parserCreated} {
-             catch {pp delete}
-         }
-         catch {close $channel}
-         return -options $options $result
-     }
-
-     # Release the parser first, then the channel it was reading from.
-     pp delete
-     close $channel
-     return
- }
- # parseMainDoc --
- #
- # Reads events from the pull parser command "pp" until the end tag of a
- # <doc> element, or the end of the document, is reached. A <doc> start tag
- # met on the way is handed to parseRestriction, and non-blank text is
- # passed to debug. Takes no arguments and returns an empty string.
- proc parseMainDoc {} {
-     while {1} {
-         set event [pp next]
-         if {$event eq "END_DOCUMENT"} {
-             break
-         }
-         if {$event eq "START_TAG"} {
-             # An inner <doc> element is consumed by parseRestriction.
-             if {[pp tag] eq "doc"} {
-                 debug "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
-                 parseRestriction
-                 debug "BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"
-             }
-         } elseif {$event eq "TEXT"} {
-             # Whitespace-only text is not reported.
-             set content [string trim [pp text]]
-             if {[string length $content] > 0} {
-                 debug $content
-             }
-         } elseif {$event eq "END_TAG"} {
-             # The closing </doc> ends this element's processing.
-             if {[pp tag] eq "doc"} {
-                 return
-             }
-         }
-     }
- }
- # parseRestriction --
- #
- # Reads events from the pull parser command "pp" until the end tag of a
- # <doc> element, or the end of the document, is reached. Non-blank text
- # met on the way is passed to debug; every other event is skipped.
- # Takes no arguments and returns an empty string.
- proc parseRestriction {} {
-     for {set event [pp next]} {$event ne "END_DOCUMENT"} {set event [pp next]} {
-         # The && short-circuit asks for the tag name only on end tags.
-         if {$event eq "END_TAG" && [pp tag] eq "doc"} {
-             return
-         }
-         if {$event ne "TEXT"} {
-             continue
-         }
-         # Whitespace-only text is not reported.
-         set content [string trim [pp text]]
-         if {[string length $content] > 0} {
-             debug $content
-         }
-     }
- }
- # Script entry point: run the parse loop as soon as the file is evaluated.
- doit