package require Tcl 8.5 namespace eval string2 { proc _has_surrogate_pair {string} { set b [_get_utf16_byte $string] binary scan $b su* vals foreach val $vals { if {$val >= 0xd800 && $val <= 0xdfff} { return 1 } } return 0 } proc _get_surrogate_pair_from_unicode_point {codepoint} { set plane [expr {$codepoint >> 16}] set lead2 [expr {(($codepoint >> 10) & 0x3f) + 0xd800 + 0x0040 * ($plane - 1)}] set trail2 [expr {($codepoint & 0x3ff) + 0xdc00}] return [list $lead2 $trail2] } proc _get_unicode_point_from_surrogate_pair {lead trail} { set plane [expr {(($lead - 0xd800) / 0x0040) + 1}] set lead2 [expr {$lead - (0xd800 + 0x0040 * ($plane - 1))}] set trail2 [expr {$trail - 0xdc00}] return [expr {$plane * 0x10000 + $lead2 * 0x100 + $lead2 * 0x300 + $trail2}] } proc _get_utf16_byte {string} { return [encoding convertto unicode $string] } proc _get_chars_list {string} { set l [list] set b [_get_utf16_byte $string] binary scan $b su* vals for {set i 0} {$i < [llength $vals]} {incr i} { set val [lindex $vals $i] set char [encoding convertfrom unicode [binary format su* $val]] set codepoint $val if {$val >= 0xd800 && $val <= 0xdbff} { # Surrogate Pair incr i set val2 [lindex $vals $i] set codepoint [_get_unicode_point_from_surrogate_pair $val $val2] append char [encoding convertfrom unicode [binary format su* $val2]] } if {($codepoint >= 0xe0100 && $codepoint <= 0xe01ef) || ($codepoint >= 0x180b && $codepoint <= 0x180d) || ($codepoint >= 0xfe00 && $codepoint <= 0xfe0f)} { # IVD set temp [lindex $l end] append temp $char lset l end $temp } else { # Not IVD (2 or 4 byte char) lappend l $char } } return $l } proc bytelength {string} { return [string bytelength $string] } #string compare #string equal proc first {needleString haystackString {startIndex 0}} { set haystack [_get_chars_list $haystackString] set needle [_get_chars_list $needleString] set hlen [llength $haystack] set nlen [llength $needle] set first_n [lindex $needle 0] for {set i $startIndex} {$i < $hlen} {incr i} { set first_index [lsearch -start $i $haystack $first_n] if {$first_index == -1} { break } if {$first_index + $nlen > $hlen} { break } set notfound 0 for {set j 0} {$j < $nlen} {incr j} { set c1 [lindex $haystack [expr {$i + $j}]] set c2 [lindex $needle $j] if {$c1 ne $c2} { set notfound 1 break } } if {$notfound == 0} { return $i } } return -1 } proc index {string charIndex} { return [lindex [_get_chars_list $string] $charIndex] } #string is proc last {needleString haystackString {startIndex ""}} { set haystack [_get_chars_list $haystackString] set needle [_get_chars_list $needleString] set hlen [llength $haystack] set nlen [llength $needle] set last_n [lindex $needle end] if {$startIndex eq ""} { set startIndex [expr {$hlen - 1}] } for {set i $startIndex} {$i >= $nlen - 1} {incr i -1} { if {[lindex $haystack $i] ne $last_n} { continue } set notfound 0 for {set j 0} {$j < $nlen} {incr j} { set c1 [lindex $haystack [expr {$i - $nlen + 1 + $j}]] set c2 [lindex $needle $j] if {$c1 ne $c2} { set notfound 1 break } } if {$notfound == 0} { return [expr {$i - $nlen + 1}] } } return -1 } proc length {string} { return [llength [_get_chars_list $string]] } #string map #string match proc range {string first last} { return [join [lrange [_get_chars_list $string] $first $last] ""] } proc repeat {string count} { return [string repeat $string $count] } proc replace {string first last {newstring ""}} { set list [lreplace [_get_chars_list $string] $first $last $newstring] return [join $list ""] } proc reverse {string} { return [join [lreverse [_get_chars_list $string]] ""] } proc tolower {string {first ""} {last ""}} { return [string tolower $string {*}$first {*}$last] } proc totitle {string {first ""} {last ""}} { return [string totitle $string {*}$first {*}$last] } proc toupper {string {first ""} {last ""}} { return [string toupper $string {*}$first {*}$last] } proc trim {string {chars " \t\r\n"}} { return [trimright [trimleft $chars] $chars] } proc trimleft {string {chars " \t\r\n"}} { set slist [_get_chars_list $string] set clist [_get_chars_list $chars] set i 0 for {} {$i < [llength $slist]} {incr i} { if {[lindex $slist $i] ni $clist} { break } } return [join [lrange $slist $i end] ""] } proc trimright {string {chars " \t\r\n"}} { set slist [_get_chars_list $string] set clist [_get_chars_list $chars] set i [expr {[llength $slist] - 1}] for {} {$i >= 0 } {incr i -1} { if {[lindex $slist $i] ni $clist} { break } } return [join [lrange $slist 0 $i] ""] } #string wordend #string wordstart proc _assert {name a b} { if {$a != $b} { puts "failed assert $name $a != $b" } } proc _test {} { # doesnt have surrogate pair set yosinoya1 \u5409\u91ce\u5bb6 # first 4 byte is a char with surrogate pair set yosinoya2 [encoding convertfrom unicode \x42\xd8\xb7\xdf\xce\x91\xb6\x5b] # first 6 byte is a char with extended with ivd set katusikaku [encoding convertfrom unicode \x5b\x84\x40\xdb\x01\xdd\x98\xfe\x3a\x53] _assert 1-0 [_get_surrogate_pair_from_unicode_point 0xe0100] [list [expr 0xdb40] [expr 0xdd00]] _assert 1-1 [_get_unicode_point_from_surrogate_pair 0xdb40 0xdd00] 0xe0100 _assert first-1 [first bc abcdefg] [string first bc abcdefg] _assert first-2 [first ab abcdefg] [string first ab abcdefg] _assert first-3 [first ab abcdeab -1] [string first ab abcdeab -1] _assert first-4 [first ab abcdeab 0] [string first ab abcdeab 0] _assert first-5 [first ab abcdeab 1] [string first ab abcdeab 1] _assert first-6 [first ab abcdeab 2] [string first ab abcdeab 2] _assert first-7 [first ab abcdeab 5] [string first ab abcdeab 5] _assert first-8 [first ab abcdeab 6] [string first ab abcdeab 6] _assert first-9 [first ab abcdeab 7] [string first ab abcdeab 7] _assert first-10 [first \u91ce $yosinoya2] 1 _assert first-11 [first \u91ce $yosinoya2 1] 1 _assert first-12 [first \u91ce $yosinoya2 2] -1 _assert first-13 [first \u533a $katusikaku] 2 _assert first-14 [first \u533a $katusikaku 0] 2 _assert first-15 [first \u533a $katusikaku 1] 2 _assert first-16 [first \u533a $katusikaku 2] 2 _assert first-17 [first \u533a $katusikaku 3] -1 _assert last-1 [last ab abcdeabcd] [string last ab abcdeabcd] _assert last-2 [last ab abcdeabcd 6] [string last ab abcdeabcd 6] _assert last-3 [last ab abcdeabcd 5] [string last ab abcdeabcd 5] _assert last-4 [last ab abcdeabcd 4] [string last ab abcdeabcd 4] _assert last-5 [last ab abcdeabcd 2] [string last ab abcdeabcd 2] _assert last-6 [last ab abcdeabcd 1] [string last ab abcdeabcd 1] _assert last-7 [last ab abcdeabcd 0] [string last ab abcdeabcd 0] _assert last-8 [last ab abcdeabcd -1] [string last ab abcdeabcd -1] _assert last-9 [last \u91ce $yosinoya2] 1 _assert last-10 [last \u91ce $yosinoya2 2] 1 _assert last-11 [last \u91ce $yosinoya2 1] 1 _assert last-12 [last \u91ce $yosinoya2 0] -1 _assert last-13 [last \u533a $katusikaku] 2 _assert last-14 [last \u533a $katusikaku 2] 2 _assert last-15 [last \u533a $katusikaku 1] -1 _assert last-16 [last \u533a $katusikaku 0] -1 _assert length-1 [length $yosinoya1] 3 _assert length-2 [length $yosinoya2] 3 _assert length-3 [length $katusikaku] 3 _assert range-1 [range $katusikaku 0 1] [encoding convertfrom unicode \x5b\x84\x40\xdb\x01\xdd\x98\xfe] _assert range-2 [range $katusikaku 1 2] [encoding convertfrom unicode \x98\xfe\x3a\x53] _assert trimleft-1 [trimleft " abc "] "abc " _assert trimleft-2 [trimleft $katusikaku \u845b] $katusikaku _assert trimright-1 [trimright " abc "] " abc" } set cmdlist [list] foreach cmdname [info commands [namespace current]::*] { set cmdname [namespace tail $cmdname] if {[string match _* $cmdname] == 0} { lappend cmdlist $cmdname } } namespace ensemble create -subcommands $cmdlist } string2::_test
string2名前空間内に切り出し用のコマンド。bytelength, first, index, last, length, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright。一応stringに似せてはある。わかりやすいので一文字として扱う文字列で区切ってリストにして処理している。 サロゲートペアだけでなく、異体字セレクタにもなんとなく対応。モンゴル?とかよくわかりません。regexpもサロゲートペア対応できればいいけど、今はできない。なかなか大変だわこれ・・・。
あとで書く