Posted to tcl by sebres at Wed Feb 06 17:34:59 GMT 2019 (view raw)
- ## ----------------------------------------------------------------------------
- ## utf-bytes-needed --
- ## 
- ## Checks whether the byte array ends with a complete utf-8 sequence and
- ## returns how many more bytes are needed to complete it.
- ## ----------------------------------------------------------------------------
-  
- # Lookup table indexed by a byte value (0..255); each entry is the total
- # length in bytes of the utf-8 sequence that the byte would start:
- #   0x00-0xBF -> 1  (192 entries: ASCII, plus continuation bytes, which
- #                    cannot start a sequence)
- #   0xC0-0xDF -> 2  (32 entries)
- #   0xE0-0xEF -> 3  (16 entries)
- #   0xF0-0xF7 -> 4  (8 entries)
- #   0xF8-0xFF -> 1  (8 entries: not usable as lead bytes)
- # Built with lrepeat so every element is a clean integer (no embedded
- # newline/indentation left over from splitting a formatted literal).
- variable totalBytes [concat \
-     [lrepeat 192 1] \
-     [lrepeat  32 2] \
-     [lrepeat  16 3] \
-     [lrepeat   8 4] \
-     [lrepeat   8 1]]
- ## utf-bytes-needed barr
- ##
- ## Inspects the tail of the byte array `barr` and reports how many more
- ## bytes are required before its last utf-8 sequence is complete.
- ##
- ## Arguments:
- ##   barr - byte array (binary string) that may end in a truncated sequence.
- ## Results:
- ##   0 if `barr` is empty, ends with a complete sequence, or ends with a
- ##   malformed tail (e.g. stray continuation bytes); otherwise the number
- ##   of missing bytes (1..3). The result is never negative.
- proc utf-bytes-needed {barr} {
-     if {![string length $barr]} {return 0}
- 
-     # Walk backwards over at most 4 trailing bytes, skipping continuation
-     # bytes (0x80..0xBF), until a byte that can begin a sequence is found.
-     for {set i 0} {$i < 4} {incr i} {
-         set look [string index $barr end-$i]
-         if {$look eq ""} {
-             # Ran past the start of the array: step back to the last byte
-             # that was actually examined (b still holds its value).
-             incr i -1
-             break
-         }
-         binary scan $look cu b
-         # ASCII (< 0x80) or a lead/invalid byte (>= 0xC0) ends the walk.
-         if {$b < 0x80 || $b >= 0xC0} {
-             break
-         }
-     }
- 
-     variable totalBytes
-     # Length announced by byte b minus the bytes already present
-     # (b itself plus the $i bytes that follow it).
-     set missing [expr {[lindex $totalBytes $b] - ($i + 1)}]
-     # A malformed tail has more trailing bytes than b announces, which
-     # would give a negative count; report that as "nothing needed".
-     expr {$missing > 0 ? $missing : 0}
- }
-  
- ## ----------------------------------------------------------------------
- ## Tests:
- ## ----------------------------------------------------------------------
-  
- # Demo driver: each sample is a list of byte values. The sample is packed
- # into a byte array, passed to utf-bytes-needed, and printed together with
- # the raw bytes and their utf-8 decoding.
- foreach v [list 0x24 {0xC2 0xA2} {0xE2 0x82 0xAC} 0xC2 0xE2 {0xE2 0x82}] {
-     # Uncomment to prepend the ASCII bytes 0x41 0x42 0x43 to every sample:
-     # set v [list 0x41 0x42 0x43 {*}$v]
-     # One counted field consumes the whole list of byte values at once.
-     set b [binary format cu[llength $v] $v]
-     set needed [utf-bytes-needed $b]
-     puts [format "%-30s %-4s\t(%s)\t needs %d byte(s)" \
-         $v $b [encoding convertfrom utf-8 $b] $needed]
- }
-  
- ## ----------------------------------------------------------------------
- ## Results:
- ## ----------------------------------------------------------------------
-  
- # 0x24                         $   	($)	 needs 0 byte(s)
- # 0xC2 0xA2                    ¢  	(¢)	 needs 0 byte(s)
- # 0xE2 0x82 0xAC               € 	(¬)	 needs 0 byte(s)
- # 0xC2                         Â   	(Â)	 needs 1 byte(s)
- # 0xE2                         â   	(â)	 needs 2 byte(s)
- # 0xE2 0x82                    â  	(â)	 needs 1 byte(s)
-