#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  edit_word.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on Version 4.0.0, dated 2003-04-17
#



####################################################################################
#
#  Character class definitions from TR 29
#
####################################################################################

# Katakana script plus the prolonged / voiced / semi-voiced sound marks listed by name.
$Katakana  = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
                                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];

$Ideographic = [:Ideographic:];
$Hangul      = [:Script = HANGUL:];

# Letters: alphabetic characters (plus NO-BREAK SPACE and HEBREW PUNCTUATION GERESH),
# minus the scripts/classes that get their own rules or are excluded outright.
# NOTE(review): \u0002 is also placed in the letter class; the reason is not stated
#               in this file (presumably an application-specific placeholder
#               character) - confirm before changing.
$ALetter   = [\u0002 [:Alphabetic:] [:name= NO-BREAK SPACE:] [:name= HEBREW PUNCTUATION GERESH:]
                           - $Ideographic
                           - $Katakana
                           - $Hangul
                           - [:Script = Thai:]
                           - [:Script = Lao:]
                           - [:Script = Hiragana:]];

# Punctuation allowed between two letters of the same word.
$MidLetter = [[:name = APOSTROPHE:] [:name = MIDDLE DOT:] [:name = HEBREW PUNCTUATION GERSHAYIM:]
              [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:]];

# Punctuation allowed between two digits of the same number.
# FULL STOP is explicitly removed from this class.
$MidNum    = [[:LineBreak = Infix_Numeric:] - [:name = FULL STOP:]];
$Numeric   = [:LineBreak = Numeric:];


# ZERO WIDTH SPACE (U+200B); subtracted from both $Control and $Format below.
$TheZWSP = \u200b;

#
#  Character Class Definitions.
#    The names are those from TR29.
#
$CR         = \u000d;
$LF         = \u000a;
$Control    = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
$Extend     = [[:Grapheme_Extend = TRUE:]];




####################################################################################
#
#  Word Break Rules.    Definitions and Rules specific to word break begin Here.
#
####################################################################################

# Format characters (general category Cf), except the zero width space.
$Format    = [[:Cf:] - $TheZWSP];



# Rule 3:  Treat a grapheme cluster as if it were a single character.
#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
#          because we don't need to find the boundaries between adjacent syllables -
#          they won't be word boundaries.
#


#
#  "Extended" definitions.  Grapheme Cluster + Format Chars, treated like the base char.
#  Each one is the base class followed by any number of $Extend characters.
#
$ALetterEx    = $ALetter     $Extend*;
$NumericEx    = $Numeric     $Extend*;
$MidNumEx     = $MidNum      $Extend*;
$MidLetterEx  = $MidLetter   $Extend*;
$KatakanaEx   = $Katakana    $Extend*;
$IdeographicEx= $Ideographic $Extend*;
$HangulEx     = $Hangul      $Extend*;
$FormatEx     = $Format      $Extend*;


#
#  Numbers.  Rules 8, 11, 12 from the TR.
#
#  A number is one or more $NumericEx, optionally joined by a single $MidNumEx,
#  with $FormatEx characters permitted around the joiner.
#
#  The {nnn} suffix on a rule is the rule status value reported for text matched
#  by that rule; the values used here correspond to the UWordBreak ranges in ICU's
#  ubrk.h (100 = number, 200 = letter, 300 = kana, 400 = ideographic).
#
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
$NumberSequence {100};

#
#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
#     - must include at least one letter.
#     - may include both letters and numbers.
#     - may include MidLetter, MidNum punctuation.
#
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* {200};

# Punctuations by themselves
#   Runs of punctuation/symbols other than FULL STOP, and runs of FULL STOP,
#   are matched by separate rules.  Neither rule carries a status tag.
[[:P:][:S:]-[:name = FULL STOP:]]*;
[[:name = FULL STOP:]]*;

#
#  Do not break between Katakana.   Rule #13.
#  The Hiragana rule matches one Hiragana character plus its trailing $Extend
#  characters and uses the same {300} tag.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};

#
#  Ideographic Characters.  Stand by themselves as words.
#                           Separated from the "Everything Else" rule, below, only so that they
#                           can be tagged with a return value.   TODO:  is this what we want?
#  Hangul runs are tagged with the same {400} value.
#
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
$HangulEx ($FormatEx* $HangulEx)* {400};

#
#  Everything Else, with no tag.
#     Non-Control chars combine with $Extend (combining) chars.
#     Controls do not.
#  The negated set excludes both $Control and Ideographic characters.
#  A CR LF pair is kept together by its own rule.
#
[^$Control [:Ideographic:]] $Extend*;
$CR $LF;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up too far,
#                   but must back up at least enough, and must stop on a boundary.)
#

# NonStarters are the set of all characters that can appear at the 2nd - nth position of
#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
#    reaches something that can only be the start (and probably only) char in a "word".
#    A space or punctuation meets the test.
#
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $Extend $Format];

# The leading '!' marks a reverse rule.  Reverse rules are written in the order the
# text is seen when scanning backwards, so "\n \r" here corresponds to a CR LF pair
# in forward text.  The commented-out rule is an earlier alternative, kept for reference.
#!.*;
! ($NonStarters* | \n \r) .;
