Tclによるサロゲートペア文字列の切り出しとか
をテンプレートにして作成
[
Front page
] [
Page list
|
Search
|
Recent changes
|
RSS of recent changes
]
Start:
**突然ですがコードです [#i600f519]
package require Tcl 8.5
namespace eval string2 {
proc _has_surrogate_pair {string} {
set b [_get_utf16_byte $string]
binary scan $b su* vals
foreach val $vals {
if {$val >= 0xd800 && $val <= 0xdfff} {
return 1
}
}
return 0
}
proc _get_surrogate_pair_from_unicode_point {codepoin...
set plane [expr {$codepoint >> 16}]
set lead2 [expr {(($codepoint >> 10) & 0x3f) + 0...
set trail2 [expr {($codepoint & 0x3ff) + 0xdc00}]
return [list $lead2 $trail2]
}
proc _get_unicode_point_from_surrogate_pair {lead tra...
set plane [expr {(($lead - 0xd800) / 0x0040) + 1}]
set lead2 [expr {$lead - (0xd800 + 0x0040 * ($pl...
set trail2 [expr {$trail - 0xdc00}]
return [expr {$plane * 0x10000 + $lead2 * 0x100 +...
}
proc _get_utf16_byte {string} {
return [encoding convertto unicode $string]
}
proc _get_chars_list {string} {
set l [list]
set b [_get_utf16_byte $string]
binary scan $b su* vals
for {set i 0} {$i < [llength $vals]} {incr i} {
set val [lindex $vals $i]
set char [encoding convertfrom unicode [binar...
set codepoint $val
if {$val >= 0xd800 && $val <= 0xdbff} {
# Surrogate Pair
incr i
set val2 [lindex $vals $i]
set codepoint [_get_unicode_point_from_su...
append char [encoding convertfrom unicode...
}
if {($codepoint >= 0xe0100 && $codepoint <= 0...
($codepoint >= 0x180b && $codepoint <= 0x...
($codepoint >= 0xfe00 && $codepoint <= 0x...
# IVD
set temp [lindex $l end]
append temp $char
lset l end $temp
} else {
# Not IVD (2 or 4 byte char)
lappend l $char
}
}
return $l
}
proc bytelength {string} {
return [string bytelength $string]
}
#string compare
#string equal
proc first {needleString haystackString {startIndex 0...
set haystack [_get_chars_list $haystackString]
set needle [_get_chars_list $needleString]
set hlen [llength $haystack]
set nlen [llength $needle]
set first_n [lindex $needle 0]
for {set i $startIndex} {$i < $hlen} {incr i} {
set first_index [lsearch -start $i $haystack ...
if {$first_index == -1} {
break
}
if {$first_index + $nlen > $hlen} {
break
}
set notfound 0
for {set j 0} {$j < $nlen} {incr j} {
set c1 [lindex $haystack [expr {$i + $j}]]
set c2 [lindex $needle $j]
if {$c1 ne $c2} {
set notfound 1
break
}
}
if {$notfound == 0} {
return $i
}
}
return -1
}
proc index {string charIndex} {
return [lindex [_get_chars_list $string] $charInd...
}
#string is
proc last {needleString haystackString {startIndex ""...
set haystack [_get_chars_list $haystackString]
set needle [_get_chars_list $needleString]
set hlen [llength $haystack]
set nlen [llength $needle]
set last_n [lindex $needle end]
if {$startIndex eq ""} {
set startIndex [expr {$hlen - 1}]
}
for {set i $startIndex} {$i >= $nlen - 1} {incr i...
if {[lindex $haystack $i] ne $last_n} {
continue
}
set notfound 0
for {set j 0} {$j < $nlen} {incr j} {
set c1 [lindex $haystack [expr {$i - $nle...
set c2 [lindex $needle $j]
if {$c1 ne $c2} {
set notfound 1
break
}
}
if {$notfound == 0} {
return [expr {$i - $nlen + 1}]
}
}
return -1
}
proc length {string} {
return [llength [_get_chars_list $string]]
}
#string map
#string match
proc range {string first last} {
return [join [lrange [_get_chars_list $string] $f...
}
proc repeat {string count} {
return [string repeat $string $count]
}
proc replace {string first last {newstring ""}} {
set list [lreplace [_get_chars_list $string] $fir...
return [join $list ""]
}
proc reverse {string} {
return [join [lreverse [_get_chars_list $string]]...
}
proc tolower {string {first ""} {last ""}} {
return [string tolower $string {*}$first {*}$last]
}
proc totitle {string {first ""} {last ""}} {
return [string totitle $string {*}$first {*}$last]
}
proc toupper {string {first ""} {last ""}} {
return [string toupper $string {*}$first {*}$last]
}
proc trim {string {chars " \t\r\n"}} {
return [trimright [trimleft $chars] $chars]
}
proc trimleft {string {chars " \t\r\n"}} {
set slist [_get_chars_list $string]
set clist [_get_chars_list $chars]
set i 0
for {} {$i < [llength $slist]} {incr i} {
if {[lindex $slist $i] ni $clist} {
break
}
}
return [join [lrange $slist $i end] ""]
}
proc trimright {string {chars " \t\r\n"}} {
set slist [_get_chars_list $string]
set clist [_get_chars_list $chars]
set i [expr {[llength $slist] - 1}]
for {} {$i >= 0 } {incr i -1} {
if {[lindex $slist $i] ni $clist} {
break
}
}
return [join [lrange $slist 0 $i] ""]
}
#string wordend
#string wordstart
proc _assert {name a b} {
if {$a != $b} { puts "failed assert $name $a != $...
}
proc _test {} {
# doesnt have surrogate pair
set yosinoya1 \u5409\u91ce\u5bb6
# first 4 byte is a char with surrogate pair
set yosinoya2 [encoding convertfrom unicode \x42...
# first 6 byte is a char with extended with ivd
set katusikaku [encoding convertfrom unicode \x5b...
_assert 1-0 [_get_surrogate_pair_from_unicode_poi...
_assert 1-1 [_get_unicode_point_from_surrogate_pa...
_assert first-1 [first bc abcdefg] [string first ...
_assert first-2 [first ab abcdefg] [string first ...
_assert first-3 [first ab abcdeab -1] [string fir...
_assert first-4 [first ab abcdeab 0] [string firs...
_assert first-5 [first ab abcdeab 1] [string firs...
_assert first-6 [first ab abcdeab 2] [string firs...
_assert first-7 [first ab abcdeab 5] [string firs...
_assert first-8 [first ab abcdeab 6] [string firs...
_assert first-9 [first ab abcdeab 7] [string firs...
_assert first-10 [first \u91ce $yosinoya2] 1
_assert first-11 [first \u91ce $yosinoya2 1] 1
_assert first-12 [first \u91ce $yosinoya2 2] -1
_assert first-13 [first \u533a $katusikaku] 2
_assert first-14 [first \u533a $katusikaku 0] 2
_assert first-15 [first \u533a $katusikaku 1] 2
_assert first-16 [first \u533a $katusikaku 2] 2
_assert first-17 [first \u533a $katusikaku 3] -1
_assert last-1 [last ab abcdeabcd] [string last a...
_assert last-2 [last ab abcdeabcd 6] [string last...
_assert last-3 [last ab abcdeabcd 5] [string last...
_assert last-4 [last ab abcdeabcd 4] [string last...
_assert last-5 [last ab abcdeabcd 2] [string last...
_assert last-6 [last ab abcdeabcd 1] [string last...
_assert last-7 [last ab abcdeabcd 0] [string last...
_assert last-8 [last ab abcdeabcd -1] [string las...
_assert last-9 [last \u91ce $yosinoya2] 1
_assert last-10 [last \u91ce $yosinoya2 2] 1
_assert last-11 [last \u91ce $yosinoya2 1] 1
_assert last-12 [last \u91ce $yosinoya2 0] -1
_assert last-13 [last \u533a $katusikaku] 2
_assert last-14 [last \u533a $katusikaku 2] 2
_assert last-15 [last \u533a $katusikaku 1] -1
_assert last-16 [last \u533a $katusikaku 0] -1
_assert length-1 [length $yosinoya1] 3
_assert length-2 [length $yosinoya2] 3
_assert length-3 [length $katusikaku] 3
_assert range-1 [range $katusikaku 0 1] [encoding...
_assert range-2 [range $katusikaku 1 2] [encoding...
_assert trimleft-1 [trimleft " abc "] "abc "
_assert trimleft-2 [trimleft $katusikaku \u845b] ...
_assert trimright-1 [trimright " abc "] " a...
}
set cmdlist [list]
foreach cmdname [info commands [namespace current]::*...
set cmdname [namespace tail $cmdname]
if {[string match _* $cmdname] == 0} {
lappend cmdlist $cmdname
}
}
namespace ensemble create -subcommands $cmdlist
}
string2::_test
string2名前空間内に切り出し用のコマンド。bytelength, firs...
サロゲートペアだけでなく、異体字セレクタにもなんとなく対...
** 使い方 [#nf81b24f]
あとで書く
**コメントをどーぞ [#v0dbb31c]
- すばらしい! -- ごろちゃん &new{2016-02-03 (水) 14:36:2...
#comment
----
[[CategoryTclTk]]
End:
**突然ですがコードです [#i600f519]
package require Tcl 8.5
namespace eval string2 {
proc _has_surrogate_pair {string} {
set b [_get_utf16_byte $string]
binary scan $b su* vals
foreach val $vals {
if {$val >= 0xd800 && $val <= 0xdfff} {
return 1
}
}
return 0
}
proc _get_surrogate_pair_from_unicode_point {codepoin...
set plane [expr {$codepoint >> 16}]
set lead2 [expr {(($codepoint >> 10) & 0x3f) + 0...
set trail2 [expr {($codepoint & 0x3ff) + 0xdc00}]
return [list $lead2 $trail2]
}
proc _get_unicode_point_from_surrogate_pair {lead tra...
set plane [expr {(($lead - 0xd800) / 0x0040) + 1}]
set lead2 [expr {$lead - (0xd800 + 0x0040 * ($pl...
set trail2 [expr {$trail - 0xdc00}]
return [expr {$plane * 0x10000 + $lead2 * 0x100 +...
}
proc _get_utf16_byte {string} {
return [encoding convertto unicode $string]
}
proc _get_chars_list {string} {
set l [list]
set b [_get_utf16_byte $string]
binary scan $b su* vals
for {set i 0} {$i < [llength $vals]} {incr i} {
set val [lindex $vals $i]
set char [encoding convertfrom unicode [binar...
set codepoint $val
if {$val >= 0xd800 && $val <= 0xdbff} {
# Surrogate Pair
incr i
set val2 [lindex $vals $i]
set codepoint [_get_unicode_point_from_su...
append char [encoding convertfrom unicode...
}
if {($codepoint >= 0xe0100 && $codepoint <= 0...
($codepoint >= 0x180b && $codepoint <= 0x...
($codepoint >= 0xfe00 && $codepoint <= 0x...
# IVD
set temp [lindex $l end]
append temp $char
lset l end $temp
} else {
# Not IVD (2 or 4 byte char)
lappend l $char
}
}
return $l
}
proc bytelength {string} {
return [string bytelength $string]
}
#string compare
#string equal
proc first {needleString haystackString {startIndex 0...
set haystack [_get_chars_list $haystackString]
set needle [_get_chars_list $needleString]
set hlen [llength $haystack]
set nlen [llength $needle]
set first_n [lindex $needle 0]
for {set i $startIndex} {$i < $hlen} {incr i} {
set first_index [lsearch -start $i $haystack ...
if {$first_index == -1} {
break
}
if {$first_index + $nlen > $hlen} {
break
}
set notfound 0
for {set j 0} {$j < $nlen} {incr j} {
set c1 [lindex $haystack [expr {$i + $j}]]
set c2 [lindex $needle $j]
if {$c1 ne $c2} {
set notfound 1
break
}
}
if {$notfound == 0} {
return $i
}
}
return -1
}
proc index {string charIndex} {
return [lindex [_get_chars_list $string] $charInd...
}
#string is
proc last {needleString haystackString {startIndex ""...
set haystack [_get_chars_list $haystackString]
set needle [_get_chars_list $needleString]
set hlen [llength $haystack]
set nlen [llength $needle]
set last_n [lindex $needle end]
if {$startIndex eq ""} {
set startIndex [expr {$hlen - 1}]
}
for {set i $startIndex} {$i >= $nlen - 1} {incr i...
if {[lindex $haystack $i] ne $last_n} {
continue
}
set notfound 0
for {set j 0} {$j < $nlen} {incr j} {
set c1 [lindex $haystack [expr {$i - $nle...
set c2 [lindex $needle $j]
if {$c1 ne $c2} {
set notfound 1
break
}
}
if {$notfound == 0} {
return [expr {$i - $nlen + 1}]
}
}
return -1
}
proc length {string} {
return [llength [_get_chars_list $string]]
}
#string map
#string match
proc range {string first last} {
return [join [lrange [_get_chars_list $string] $f...
}
proc repeat {string count} {
return [string repeat $string $count]
}
proc replace {string first last {newstring ""}} {
set list [lreplace [_get_chars_list $string] $fir...
return [join $list ""]
}
proc reverse {string} {
return [join [lreverse [_get_chars_list $string]]...
}
proc tolower {string {first ""} {last ""}} {
return [string tolower $string {*}$first {*}$last]
}
proc totitle {string {first ""} {last ""}} {
return [string totitle $string {*}$first {*}$last]
}
proc toupper {string {first ""} {last ""}} {
return [string toupper $string {*}$first {*}$last]
}
proc trim {string {chars " \t\r\n"}} {
return [trimright [trimleft $chars] $chars]
}
proc trimleft {string {chars " \t\r\n"}} {
set slist [_get_chars_list $string]
set clist [_get_chars_list $chars]
set i 0
for {} {$i < [llength $slist]} {incr i} {
if {[lindex $slist $i] ni $clist} {
break
}
}
return [join [lrange $slist $i end] ""]
}
proc trimright {string {chars " \t\r\n"}} {
set slist [_get_chars_list $string]
set clist [_get_chars_list $chars]
set i [expr {[llength $slist] - 1}]
for {} {$i >= 0 } {incr i -1} {
if {[lindex $slist $i] ni $clist} {
break
}
}
return [join [lrange $slist 0 $i] ""]
}
#string wordend
#string wordstart
proc _assert {name a b} {
if {$a != $b} { puts "failed assert $name $a != $...
}
proc _test {} {
# doesnt have surrogate pair
set yosinoya1 \u5409\u91ce\u5bb6
# first 4 byte is a char with surrogate pair
set yosinoya2 [encoding convertfrom unicode \x42...
# first 6 byte is a char with extended with ivd
set katusikaku [encoding convertfrom unicode \x5b...
_assert 1-0 [_get_surrogate_pair_from_unicode_poi...
_assert 1-1 [_get_unicode_point_from_surrogate_pa...
_assert first-1 [first bc abcdefg] [string first ...
_assert first-2 [first ab abcdefg] [string first ...
_assert first-3 [first ab abcdeab -1] [string fir...
_assert first-4 [first ab abcdeab 0] [string firs...
_assert first-5 [first ab abcdeab 1] [string firs...
_assert first-6 [first ab abcdeab 2] [string firs...
_assert first-7 [first ab abcdeab 5] [string firs...
_assert first-8 [first ab abcdeab 6] [string firs...
_assert first-9 [first ab abcdeab 7] [string firs...
_assert first-10 [first \u91ce $yosinoya2] 1
_assert first-11 [first \u91ce $yosinoya2 1] 1
_assert first-12 [first \u91ce $yosinoya2 2] -1
_assert first-13 [first \u533a $katusikaku] 2
_assert first-14 [first \u533a $katusikaku 0] 2
_assert first-15 [first \u533a $katusikaku 1] 2
_assert first-16 [first \u533a $katusikaku 2] 2
_assert first-17 [first \u533a $katusikaku 3] -1
_assert last-1 [last ab abcdeabcd] [string last a...
_assert last-2 [last ab abcdeabcd 6] [string last...
_assert last-3 [last ab abcdeabcd 5] [string last...
_assert last-4 [last ab abcdeabcd 4] [string last...
_assert last-5 [last ab abcdeabcd 2] [string last...
_assert last-6 [last ab abcdeabcd 1] [string last...
_assert last-7 [last ab abcdeabcd 0] [string last...
_assert last-8 [last ab abcdeabcd -1] [string las...
_assert last-9 [last \u91ce $yosinoya2] 1
_assert last-10 [last \u91ce $yosinoya2 2] 1
_assert last-11 [last \u91ce $yosinoya2 1] 1
_assert last-12 [last \u91ce $yosinoya2 0] -1
_assert last-13 [last \u533a $katusikaku] 2
_assert last-14 [last \u533a $katusikaku 2] 2
_assert last-15 [last \u533a $katusikaku 1] -1
_assert last-16 [last \u533a $katusikaku 0] -1
_assert length-1 [length $yosinoya1] 3
_assert length-2 [length $yosinoya2] 3
_assert length-3 [length $katusikaku] 3
_assert range-1 [range $katusikaku 0 1] [encoding...
_assert range-2 [range $katusikaku 1 2] [encoding...
_assert trimleft-1 [trimleft " abc "] "abc "
_assert trimleft-2 [trimleft $katusikaku \u845b] ...
_assert trimright-1 [trimright " abc "] " a...
}
set cmdlist [list]
foreach cmdname [info commands [namespace current]::*...
set cmdname [namespace tail $cmdname]
if {[string match _* $cmdname] == 0} {
lappend cmdlist $cmdname
}
}
namespace ensemble create -subcommands $cmdlist
}
string2::_test
string2名前空間内に切り出し用のコマンド。bytelength, firs...
サロゲートペアだけでなく、異体字セレクタにもなんとなく対...
** 使い方 [#nf81b24f]
あとで書く
**コメントをどーぞ [#v0dbb31c]
- すばらしい! -- ごろちゃん &new{2016-02-03 (水) 14:36:2...
#comment
----
[[CategoryTclTk]]
Page:
HTML convert time: 0.004 sec.