Diff of Tclによるサロゲートペア文字列の切り出しとか

The added line is THIS COLOR.
The deleted line is THIS COLOR.
Go to Tclによるサロゲートペア文字列の切り出しとか.
Deleting diff of Tclによるサロゲートペア文字列の切り出しとか
**突然ですがコードです [#i600f519]
 package require Tcl 8.5
 
 namespace eval string2 {
    proc _has_surrogate_pair {string} {
        set b [_get_utf16_byte $string]
        binary scan $b su* vals
        foreach val $vals {
            if {$val >= 0xd800 && $val <= 0xdfff} {
                return 1
            }
        }
        return 0
    }
 
    proc _get_surrogate_pair_from_unicode_point {codepoint} {
        set plane  [expr {$codepoint >> 16}]
        set lead2  [expr {(($codepoint >> 10) & 0x3f) + 0xd800 + 0x0040 * ($plane - 1)}]
        set trail2 [expr {($codepoint & 0x3ff) + 0xdc00}]
        return [list $lead2 $trail2]
    }
 
    proc _get_unicode_point_from_surrogate_pair {lead trail} {
        set plane  [expr {(($lead - 0xd800) / 0x0040) + 1}]
        set lead2  [expr {$lead - (0xd800 + 0x0040 * ($plane - 1))}]
        set trail2 [expr {$trail - 0xdc00}]
        return [expr {$plane * 0x10000 + $lead2 * 0x100 + $lead2 * 0x300 + $trail2}]
    }
 
    proc _get_utf16_byte {string} {
        return [encoding convertto unicode $string]
    }
 
    proc _get_chars_list {string} {
        set l [list]
        set b [_get_utf16_byte $string]
        binary scan $b su* vals
        for {set i 0} {$i < [llength $vals]} {incr i} {
            set val [lindex $vals $i]
            set char [encoding convertfrom unicode [binary format su* $val]]
            set codepoint $val
 
            if {$val >= 0xd800 && $val <= 0xdbff} {
                # Surrogate Pair
                incr i
                set val2 [lindex $vals $i]
                set codepoint [_get_unicode_point_from_surrogate_pair $val $val2]
                append char [encoding convertfrom unicode [binary format su* $val2]]
            }
            
            if {($codepoint >= 0xe0100 && $codepoint <= 0xe01ef) ||
                ($codepoint >= 0x180b && $codepoint <= 0x180d) ||
                ($codepoint >= 0xfe00 && $codepoint <= 0xfe0f)} {
                # IVD
                set temp [lindex $l end]
                append temp $char
                lset l end $temp
            } else {
                # Not IVD (2 or 4 byte char)
                lappend l $char
            }
        }
        return $l
    }
    
    proc bytelength {string} {
        return [string bytelength $string]
    }
 
 #string compare
 #string equal
 
    proc first {needleString haystackString {startIndex 0}} {
        set haystack [_get_chars_list $haystackString]
        set needle [_get_chars_list $needleString]
        set hlen [llength $haystack]
        set nlen [llength $needle]
        set first_n [lindex $needle 0]
 
        for {set i $startIndex} {$i < $hlen} {incr i} {
            set first_index [lsearch -start $i $haystack $first_n]
            if {$first_index == -1} {
                break
            }
            if {$first_index + $nlen > $hlen} {
                break
            }
            set notfound 0
            for {set j 0} {$j < $nlen} {incr j} {
                set c1 [lindex $haystack [expr {$i + $j}]]
                set c2 [lindex $needle $j]
                if {$c1 ne $c2} {
                    set notfound 1
                    break
                }
            }
            if {$notfound == 0} {
                return $i
            }
        }
        return -1
    }
 
    proc index {string charIndex} {
        return [lindex [_get_chars_list $string] $charIndex]
    }
 
 #string is
    
    proc last {needleString haystackString {startIndex ""}} {
        set haystack [_get_chars_list $haystackString]
        set needle [_get_chars_list $needleString]
        set hlen [llength $haystack]
        set nlen [llength $needle]
        set last_n [lindex $needle end]
        if {$startIndex eq ""} {
            set startIndex [expr {$hlen - 1}]
        }
 
        for {set i $startIndex} {$i >= $nlen - 1} {incr i -1} {
            if {[lindex $haystack $i] ne $last_n} {
                continue
            }
 
            set notfound 0
            for {set j 0} {$j < $nlen} {incr j} {
                set c1 [lindex $haystack [expr {$i - $nlen + 1 + $j}]]
                set c2 [lindex $needle $j]
                if {$c1 ne $c2} {
                    set notfound 1
                    break
                }
            }
            if {$notfound == 0} {
                return [expr {$i - $nlen + 1}]
            }
        }
        return -1
    }
    
    proc length {string} {
        return [llength [_get_chars_list $string]]
    }
 
 #string map
 #string match
    
    proc range {string first last} {
        return [join [lrange [_get_chars_list $string] $first $last] ""]
    }
 
    proc repeat {string count} {
        return [string repeat $string $count]
    }
    
    proc replace {string first last {newstring ""}} {
        set list [lreplace [_get_chars_list $string] $first $last $newstring]
        return [join $list ""]
    }
 
    proc reverse {string} {
        return [join [lreverse [_get_chars_list $string]] ""]
    }
 
    proc tolower {string {first ""} {last ""}} {
        return [string tolower $string {*}$first {*}$last]
    }
 
    proc totitle {string {first ""} {last ""}} {
        return [string totitle $string {*}$first {*}$last]
    }
 
    proc toupper {string {first ""} {last ""}} {
        return [string toupper $string {*}$first {*}$last]
    }
 
    proc trim {string {chars " \t\r\n"}} {
        return [trimright [trimleft $chars] $chars]
    }
 
    proc trimleft {string {chars " \t\r\n"}} {
        set slist [_get_chars_list $string]
        set clist [_get_chars_list $chars]
        set i 0
        for {} {$i < [llength $slist]} {incr i} {
            if {[lindex $slist $i] ni $clist} {
                break
            }
        }
        return [join [lrange $slist $i end] ""]
    }
 
    proc trimright {string {chars " \t\r\n"}} {
        set slist [_get_chars_list $string]
        set clist [_get_chars_list $chars]
        set i [expr {[llength $slist] - 1}]
        for {} {$i >= 0 } {incr i -1} {
            if {[lindex $slist $i] ni $clist} {
                break
            }
        }
        return [join [lrange $slist 0 $i] ""]
    }
 
 #string wordend
 #string wordstart
    
    proc _assert {name a b} {
        if {$a != $b} { puts "failed assert $name $a != $b" }
    }
 
    proc _test {} {
        # doesnt have surrogate pair
        set yosinoya1 \u5409\u91ce\u5bb6
        # first 4 byte is a char with surrogate pair
        set yosinoya2  [encoding convertfrom unicode \x42\xd8\xb7\xdf\xce\x91\xb6\x5b]
        # first 6 byte is a char with extended with ivd
        set katusikaku [encoding convertfrom unicode \x5b\x84\x40\xdb\x01\xdd\x98\xfe\x3a\x53]
 
        _assert 1-0 [_get_surrogate_pair_from_unicode_point 0xe0100] [list [expr 0xdb40] [expr 0xdd00]]
        _assert 1-1 [_get_unicode_point_from_surrogate_pair 0xdb40 0xdd00] 0xe0100
 
        _assert first-1 [first bc abcdefg] [string first bc abcdefg]
        _assert first-2 [first ab abcdefg] [string first ab abcdefg]
        _assert first-3 [first ab abcdeab -1] [string first ab abcdeab -1]
        _assert first-4 [first ab abcdeab 0] [string first ab abcdeab 0]
        _assert first-5 [first ab abcdeab 1] [string first ab abcdeab 1]
        _assert first-6 [first ab abcdeab 2] [string first ab abcdeab 2]
        _assert first-7 [first ab abcdeab 5] [string first ab abcdeab 5]
        _assert first-8 [first ab abcdeab 6] [string first ab abcdeab 6]
        _assert first-9 [first ab abcdeab 7] [string first ab abcdeab 7]
        _assert first-10 [first \u91ce $yosinoya2] 1
        _assert first-11 [first \u91ce $yosinoya2 1] 1
        _assert first-12 [first \u91ce $yosinoya2 2] -1
        _assert first-13 [first \u533a $katusikaku] 2
        _assert first-14 [first \u533a $katusikaku 0] 2
        _assert first-15 [first \u533a $katusikaku 1] 2
        _assert first-16 [first \u533a $katusikaku 2] 2
        _assert first-17 [first \u533a $katusikaku 3] -1
 
        _assert last-1 [last ab abcdeabcd] [string last ab abcdeabcd]
        _assert last-2 [last ab abcdeabcd 6] [string last ab abcdeabcd 6]
        _assert last-3 [last ab abcdeabcd 5] [string last ab abcdeabcd 5]
        _assert last-4 [last ab abcdeabcd 4] [string last ab abcdeabcd 4]
        _assert last-5 [last ab abcdeabcd 2] [string last ab abcdeabcd 2]
        _assert last-6 [last ab abcdeabcd 1] [string last ab abcdeabcd 1]
        _assert last-7 [last ab abcdeabcd 0] [string last ab abcdeabcd 0]
        _assert last-8 [last ab abcdeabcd -1] [string last ab abcdeabcd -1]
        _assert last-9 [last \u91ce $yosinoya2] 1
        _assert last-10 [last \u91ce $yosinoya2 2] 1
        _assert last-11 [last \u91ce $yosinoya2 1] 1
        _assert last-12 [last \u91ce $yosinoya2 0] -1
        _assert last-13 [last \u533a $katusikaku] 2
        _assert last-14 [last \u533a $katusikaku 2] 2
        _assert last-15 [last \u533a $katusikaku 1] -1
        _assert last-16 [last \u533a $katusikaku 0] -1
 
        _assert length-1 [length $yosinoya1] 3
        _assert length-2 [length $yosinoya2] 3
        _assert length-3 [length $katusikaku] 3
 
        _assert range-1 [range $katusikaku 0 1] [encoding convertfrom unicode \x5b\x84\x40\xdb\x01\xdd\x98\xfe]
        _assert range-2 [range $katusikaku 1 2] [encoding convertfrom unicode \x98\xfe\x3a\x53]
 
        _assert trimleft-1 [trimleft "   abc   "] "abc   "
        _assert trimleft-2 [trimleft $katusikaku \u845b] $katusikaku
        _assert trimright-1 [trimright "   abc   "] "   abc"
    }
 
    set cmdlist [list]
    foreach cmdname [info commands [namespace current]::*] {
        set cmdname [namespace tail $cmdname]
        if {[string match _* $cmdname] == 0} {
            lappend cmdlist $cmdname
        }
    }
    namespace ensemble create -subcommands $cmdlist
 }
 
 string2::_test

string2名前空間内に切り出し用のコマンド。bytelength, first, index, last, length, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright。一応stringに似せてはある。わかりやすいので一文字として扱う文字列で区切ってリストにして処理している。
サロゲートペアだけでなく、異体字セレクタにもなんとなく対応。モンゴル？とかよくわかりません。regexpもサロゲートペア対応できればいいけど、今はできない。なかなか大変だわこれ・・・。

** 使い方 [#nf81b24f]
 あとで書く

**コメントをどーぞ [#v0dbb31c]
- すばらしい！ -- ごろちゃん &new{2016-02-03 (水) 14:36:21};

#comment
----
[[CategoryTclTk]]