Posted to tcl by sebres at Wed Feb 06 17:34:59 GMT 2019 (view raw)
- ## ----------------------------------------------------------------------------
- ## utf-bytes-needed --
- ##
- ## Checks whether the end of a byte array is a complete UTF-8 sequence and
- ## returns how many more bytes are needed to complete the trailing sequence.
- ## ----------------------------------------------------------------------------
- ## Lookup table used by utf-bytes-needed: indexed by a byte value (0..255),
- ## it gives the total length of the sequence that byte is treated as starting.
- ##   0x00-0xBF -> 1  (192 entries: ASCII and continuation bytes)
- ##   0xC0-0xDF -> 2  ( 32 entries)
- ##   0xE0-0xEF -> 3  ( 16 entries)
- ##   0xF0-0xF7 -> 4  (  8 entries)
- ##   0xF8-0xFF -> 1  (  8 entries)
- ## Built with lrepeat so that every element is a plain integer; splitting a
- ## multi-line literal on "," would leave newlines embedded in some elements.
- variable totalBytes [concat [lrepeat 192 1] [lrepeat 32 2] [lrepeat 16 3] [lrepeat 8 4] [lrepeat 8 1]]
- ## utf-bytes-needed barr
- ##
- ## Arguments:
- ##   barr - byte array whose tail is inspected (at most the last 4 bytes).
- ## Results:
- ##   Number of bytes still missing to complete the trailing UTF-8 sequence;
- ##   0 if barr is empty, ends with a complete sequence, or ends with a
- ##   malformed tail. The result is never negative.
- proc utf-bytes-needed {barr} {
-     variable totalBytes
-     if {![string length $barr]} {return 0}
-     ## Walk backwards from the last byte, skipping continuation bytes
-     ## (0x80..0xBF), until an ASCII byte or a lead byte is found.
-     for {set i 0} {$i < 4} {incr i} {
-         set look [string index $barr end-$i]
-         if {$look eq ""} {
-             ## Ran past the start of the array: step back to the last byte
-             ## that was really examined ($b still holds its value).
-             incr i -1
-             break
-         }
-         binary scan $look cu b
-         if {$b < 0x80 || $b >= 0xC0} {
-             break
-         }
-     }
-     ## $i + 1 bytes of the trailing sequence are present. A tail that holds
-     ## more bytes than the sequence length looked up for $b would give a
-     ## negative difference, so the result is clamped to 0.
-     expr {max(0, [lindex $totalBytes $b] - [incr i])}
- }
- ## ----------------------------------------------------------------------
- ## Tests:
- ## ----------------------------------------------------------------------
- ## Report the number of missing bytes for three complete sequences (1, 2 and
- ## 3 bytes long) followed by three truncated ones.
- foreach v [list 0x24 {0xC2 0xA2} {0xE2 0x82 0xAC} 0xC2 0xE2 {0xE2 0x82}] {
-     ## To prepend some ASCII bytes: set v [list 0x41 0x42 0x43 {*}$v]
-     ## cu* packs every element of $v as one unsigned byte.
-     set b [binary format cu* $v]
-     set needed [utf-bytes-needed $b]
-     puts [format "%-30s %-4s\t(%s)\t needs %d byte(s)" $v $b [encoding convertfrom utf-8 $b] $needed]
- }
- ## ----------------------------------------------------------------------
- ## Results:
- ## ----------------------------------------------------------------------
- # 0x24 $ ($) needs 0 byte(s)
- # 0xC2 0xA2 ¢ (¢) needs 0 byte(s)
- # 0xE2 0x82 0xAC € (¬) needs 0 byte(s)
- # 0xC2 Â (Â) needs 1 byte(s)
- # 0xE2 â (â) needs 2 byte(s)
- # 0xE2 0x82 â (â) needs 1 byte(s)