Tclによるサロゲートペア文字列の切り出しとか

メニュー

recent(15)

2025-05-21

Speedline

2025-05-04

GIMPで縦書き

2024-01-25

TkSQLite

2023-04-18

突然ですがコードです†

package require Tcl 8.5

namespace eval string2 {
   proc _has_surrogate_pair {string} {
       set b [_get_utf16_byte $string]
       binary scan $b su* vals
       foreach val $vals {
           if {$val >= 0xd800 && $val <= 0xdfff} {
               return 1
           }
       }
       return 0
   }

   proc _get_surrogate_pair_from_unicode_point {codepoint} {
       set plane  [expr {$codepoint >> 16}]
       set lead2  [expr {(($codepoint >> 10) & 0x3f) + 0xd800 + 0x0040 * ($plane - 1)}]
       set trail2 [expr {($codepoint & 0x3ff) + 0xdc00}]
       return [list $lead2 $trail2]
   }

   proc _get_unicode_point_from_surrogate_pair {lead trail} {
       set plane  [expr {(($lead - 0xd800) / 0x0040) + 1}]
       set lead2  [expr {$lead - (0xd800 + 0x0040 * ($plane - 1))}]
       set trail2 [expr {$trail - 0xdc00}]
       return [expr {$plane * 0x10000 + $lead2 * 0x100 + $lead2 * 0x300 + $trail2}]
   }

   proc _get_utf16_byte {string} {
       return [encoding convertto unicode $string]
   }

   proc _get_chars_list {string} {
       set l [list]
       set b [_get_utf16_byte $string]
       binary scan $b su* vals
       for {set i 0} {$i < [llength $vals]} {incr i} {
           set val [lindex $vals $i]
           set char [encoding convertfrom unicode [binary format su* $val]]
           set codepoint $val

           if {$val >= 0xd800 && $val <= 0xdbff} {
               # Surrogate Pair
               incr i
               set val2 [lindex $vals $i]
               set codepoint [_get_unicode_point_from_surrogate_pair $val $val2]
               append char [encoding convertfrom unicode [binary format su* $val2]]
           }
           
           if {($codepoint >= 0xe0100 && $codepoint <= 0xe01ef) ||
               ($codepoint >= 0x180b && $codepoint <= 0x180d) ||
               ($codepoint >= 0xfe00 && $codepoint <= 0xfe0f)} {
               # IVD
               set temp [lindex $l end]
               append temp $char
               lset l end $temp
           } else {
               # Not IVD (2 or 4 byte char)
               lappend l $char
           }
       }
       return $l
   }
   
   proc bytelength {string} {
       return [string bytelength $string]
   }

#string compare
#string equal

   proc first {needleString haystackString {startIndex 0}} {
       set haystack [_get_chars_list $haystackString]
       set needle [_get_chars_list $needleString]
       set hlen [llength $haystack]
       set nlen [llength $needle]
       set first_n [lindex $needle 0]

       for {set i $startIndex} {$i < $hlen} {incr i} {
           set first_index [lsearch -start $i $haystack $first_n]
           if {$first_index == -1} {
               break
           }
           if {$first_index + $nlen > $hlen} {
               break
           }
           set notfound 0
           for {set j 0} {$j < $nlen} {incr j} {
               set c1 [lindex $haystack [expr {$i + $j}]]
               set c2 [lindex $needle $j]
               if {$c1 ne $c2} {
                   set notfound 1
                   break
               }
           }
           if {$notfound == 0} {
               return $i
           }
       }
       return -1
   }

   proc index {string charIndex} {
       return [lindex [_get_chars_list $string] $charIndex]
   }

#string is
   
   proc last {needleString haystackString {startIndex ""}} {
       set haystack [_get_chars_list $haystackString]
       set needle [_get_chars_list $needleString]
       set hlen [llength $haystack]
       set nlen [llength $needle]
       set last_n [lindex $needle end]
       if {$startIndex eq ""} {
           set startIndex [expr {$hlen - 1}]
       }

       for {set i $startIndex} {$i >= $nlen - 1} {incr i -1} {
           if {[lindex $haystack $i] ne $last_n} {
               continue
           }

           set notfound 0
           for {set j 0} {$j < $nlen} {incr j} {
               set c1 [lindex $haystack [expr {$i - $nlen + 1 + $j}]]
               set c2 [lindex $needle $j]
               if {$c1 ne $c2} {
                   set notfound 1
                   break
               }
           }
           if {$notfound == 0} {
               return [expr {$i - $nlen + 1}]
           }
       }
       return -1
   }
   
   proc length {string} {
       return [llength [_get_chars_list $string]]
   }

#string map
#string match
   
   proc range {string first last} {
       return [join [lrange [_get_chars_list $string] $first $last] ""]
   }

   proc repeat {string count} {
       return [string repeat $string $count]
   }
   
   proc replace {string first last {newstring ""}} {
       set list [lreplace [_get_chars_list $string] $first $last $newstring]
       return [join $list ""]
   }

   proc reverse {string} {
       return [join [lreverse [_get_chars_list $string]] ""]
   }

   proc tolower {string {first ""} {last ""}} {
       return [string tolower $string {*}$first {*}$last]
   }

   proc totitle {string {first ""} {last ""}} {
       return [string totitle $string {*}$first {*}$last]
   }

   proc toupper {string {first ""} {last ""}} {
       return [string toupper $string {*}$first {*}$last]
   }

   proc trim {string {chars " \t\r\n"}} {
       return [trimright [trimleft $chars] $chars]
   }

   proc trimleft {string {chars " \t\r\n"}} {
       set slist [_get_chars_list $string]
       set clist [_get_chars_list $chars]
       set i 0
       for {} {$i < [llength $slist]} {incr i} {
           if {[lindex $slist $i] ni $clist} {
               break
           }
       }
       return [join [lrange $slist $i end] ""]
   }

   proc trimright {string {chars " \t\r\n"}} {
       set slist [_get_chars_list $string]
       set clist [_get_chars_list $chars]
       set i [expr {[llength $slist] - 1}]
       for {} {$i >= 0 } {incr i -1} {
           if {[lindex $slist $i] ni $clist} {
               break
           }
       }
       return [join [lrange $slist 0 $i] ""]
   }

#string wordend
#string wordstart
   
   proc _assert {name a b} {
       if {$a != $b} { puts "failed assert $name $a != $b" }
   }

   proc _test {} {
       # doesnt have surrogate pair
       set yosinoya1 \u5409\u91ce\u5bb6
       # first 4 byte is a char with surrogate pair
       set yosinoya2  [encoding convertfrom unicode \x42\xd8\xb7\xdf\xce\x91\xb6\x5b]
       # first 6 byte is a char with extended with ivd
       set katusikaku [encoding convertfrom unicode \x5b\x84\x40\xdb\x01\xdd\x98\xfe\x3a\x53]

       _assert 1-0 [_get_surrogate_pair_from_unicode_point 0xe0100] [list [expr 0xdb40] [expr 0xdd00]]
       _assert 1-1 [_get_unicode_point_from_surrogate_pair 0xdb40 0xdd00] 0xe0100

       _assert first-1 [first bc abcdefg] [string first bc abcdefg]
       _assert first-2 [first ab abcdefg] [string first ab abcdefg]
       _assert first-3 [first ab abcdeab -1] [string first ab abcdeab -1]
       _assert first-4 [first ab abcdeab 0] [string first ab abcdeab 0]
       _assert first-5 [first ab abcdeab 1] [string first ab abcdeab 1]
       _assert first-6 [first ab abcdeab 2] [string first ab abcdeab 2]
       _assert first-7 [first ab abcdeab 5] [string first ab abcdeab 5]
       _assert first-8 [first ab abcdeab 6] [string first ab abcdeab 6]
       _assert first-9 [first ab abcdeab 7] [string first ab abcdeab 7]
       _assert first-10 [first \u91ce $yosinoya2] 1
       _assert first-11 [first \u91ce $yosinoya2 1] 1
       _assert first-12 [first \u91ce $yosinoya2 2] -1
       _assert first-13 [first \u533a $katusikaku] 2
       _assert first-14 [first \u533a $katusikaku 0] 2
       _assert first-15 [first \u533a $katusikaku 1] 2
       _assert first-16 [first \u533a $katusikaku 2] 2
       _assert first-17 [first \u533a $katusikaku 3] -1

       _assert last-1 [last ab abcdeabcd] [string last ab abcdeabcd]
       _assert last-2 [last ab abcdeabcd 6] [string last ab abcdeabcd 6]
       _assert last-3 [last ab abcdeabcd 5] [string last ab abcdeabcd 5]
       _assert last-4 [last ab abcdeabcd 4] [string last ab abcdeabcd 4]
       _assert last-5 [last ab abcdeabcd 2] [string last ab abcdeabcd 2]
       _assert last-6 [last ab abcdeabcd 1] [string last ab abcdeabcd 1]
       _assert last-7 [last ab abcdeabcd 0] [string last ab abcdeabcd 0]
       _assert last-8 [last ab abcdeabcd -1] [string last ab abcdeabcd -1]
       _assert last-9 [last \u91ce $yosinoya2] 1
       _assert last-10 [last \u91ce $yosinoya2 2] 1
       _assert last-11 [last \u91ce $yosinoya2 1] 1
       _assert last-12 [last \u91ce $yosinoya2 0] -1
       _assert last-13 [last \u533a $katusikaku] 2
       _assert last-14 [last \u533a $katusikaku 2] 2
       _assert last-15 [last \u533a $katusikaku 1] -1
       _assert last-16 [last \u533a $katusikaku 0] -1

       _assert length-1 [length $yosinoya1] 3
       _assert length-2 [length $yosinoya2] 3
       _assert length-3 [length $katusikaku] 3

       _assert range-1 [range $katusikaku 0 1] [encoding convertfrom unicode \x5b\x84\x40\xdb\x01\xdd\x98\xfe]
       _assert range-2 [range $katusikaku 1 2] [encoding convertfrom unicode \x98\xfe\x3a\x53]

       _assert trimleft-1 [trimleft "   abc   "] "abc   "
       _assert trimleft-2 [trimleft $katusikaku \u845b] $katusikaku
       _assert trimright-1 [trimright "   abc   "] "   abc"
   }

   set cmdlist [list]
   foreach cmdname [info commands [namespace current]::*] {
       set cmdname [namespace tail $cmdname]
       if {[string match _* $cmdname] == 0} {
           lappend cmdlist $cmdname
       }
   }
   namespace ensemble create -subcommands $cmdlist
}

string2::_test

string2名前空間内に切り出し用のコマンド。bytelength, first, index, last, length, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright。一応stringに似せてはある。わかりやすいので一文字として扱う文字列で区切ってリストにして処理している。サロゲートペアだけでなく、異体字セレクタにもなんとなく対応。モンゴル？とかよくわかりません。regexpもサロゲートペア対応できればいいけど、今はできない。なかなか大変だわこれ・・・。

↑

使い方†

あとで書く

↑

コメントをどーぞ†

すばらしい！ -- ごろちゃん 2016-02-03 (水) 14:36:21

CategoryTclTk