#
#   Copyright (C) 2002-2003, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  dict_word.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on Version 4.0.0, dated 2003-04-17
#
#   Layout of this file:
#     1. Character class definitions ($Name = [set];)
#     2. "Extended" classes (base character followed by combining marks)
#     3. Forward rules, some tagged with a {status} value
#     4. Reverse rule (prefixed with '!')
#
#   NOTE: '#' starts a comment that runs to the end of the line, so every
#   definition and rule below must stay on its own line.
#


####################################################################################
#
#  Character class definitions from TR 29
#
####################################################################################

# Katakana script plus the prolonged / voiced / semi-voiced sound marks listed by name.
$Katakana       = [[:Script = KATAKANA:] [:name = KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                   [:name = HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK:]
                   [:name = HALFWIDTH KATAKANA VOICED SOUND MARK:]
                   [:name = HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK:]];

$Ideographic    = [:Ideographic:];
$Hangul         = [:Script = HANGUL:];

# Letters: alphabetic characters (plus COMMERCIAL AT and GERESH) minus the scripts
# that are handled by their own rules or classes below.
# NOTE(review): \u0002 is a control character added on purpose here; presumably an
# application-specific placeholder -- confirm with the consumer of these rules.
$ALetter        = [\u0002 [:Alphabetic:] [:name= COMMERCIAL AT:] [:name= HEBREW PUNCTUATION GERESH:]
                   - $Ideographic
                   - $Katakana
                   - $Hangul
                   - [:Script = Thai:]
                   - [:Script = Lao:]
                   - [:Script = Hiragana:]];

# Punctuation that does not break a word when it sits between two letters.
# NOTE(review): \u0084 is a C1 control character; its role here is not visible
# from this file -- confirm before changing.
$MidLetter      = [[:name = APOSTROPHE:] [:name = GRAVE ACCENT:] \u0084 [:name = SOFT HYPHEN:] [:name = MIDDLE DOT:] [:name = GREEK TONOS:] [:name= FULL STOP:]
                   [:name = HEBREW PUNCTUATION GERSHAYIM:] [:name = DOUBLE VERTICAL LINE:] [:name = LEFT SINGLE QUOTATION MARK:]
                   [:name = RIGHT SINGLE QUOTATION MARK:] [:name = HYPHENATION POINT:] [:name = PRIME:]
                   [:name = HYPHEN-MINUS:] ];

# A single trailing FULL STOP may be absorbed at the end of a word (see the {200} rule).
$SufixLetter    = [:name= FULL STOP:];


# Punctuation that does not break a number when it sits between two digits.
$MidNum         = [[:LineBreak = Infix_Numeric:] [:name= COMMERCIAL AT:] \u0084 [:name = GREEK TONOS:] [:name = ARABIC DECIMAL SEPARATOR:]
                   [:name = LEFT SINGLE QUOTATION MARK:] [:name = RIGHT SINGLE QUOTATION MARK:] [:name = SINGLE HIGH-REVERSED-9 QUOTATION MARK:]
                   [:name = PRIME:]];
$Numeric        = [:LineBreak = Numeric:];


# ZERO WIDTH SPACE is excluded from $Control and $Format below.
$TheZWSP        = \u200b;

#
#  Character Class Definitions.
#    The names are those from TR29.
#
$CR             = \u000d;
$LF             = \u000a;
$Control        = [[[:Zl:] [:Zp:] [:Cc:] [:Cf:]] - $TheZWSP];
$Extend         = [[:Grapheme_Extend = TRUE:]];


####################################################################################
#
#  Word Break Rules.    Definitions and Rules specific to word break begin Here.
#
####################################################################################

$Format         = [[:Cf:] - $TheZWSP];


# Rule 3:  Treat a grapheme cluster as if it were a single character.
#          Hangul Syllables are easier to deal with here than they are in Grapheme Clusters
#          because we don't need to find the boundaries between adjacent syllables -
#          they won't be word boundaries.
#


#
#  "Extended" definitions.  Grapheme Cluster + Format Chars, treated like the base char.
#
$ALetterEx      = $ALetter     $Extend*;
$NumericEx      = $Numeric     $Extend*;
$MidNumEx       = $MidNum      $Extend*;
$MidLetterEx    = $MidLetter   $Extend*;
$SufixLetterEx  = $SufixLetter $Extend*;
$KatakanaEx     = $Katakana    $Extend*;
$IdeographicEx  = $Ideographic $Extend*;
$HangulEx       = $Hangul      $Extend*;
$FormatEx       = $Format      $Extend*;


#
#  Numbers.  Rules 8, 11, 12 from the TR.
#    Digits, optionally joined by a single $MidNum character; tagged {100}.
#    (Presumably the UBRK_WORD_NUMBER status range of ICU's word break
#     iterator -- confirm against ubrk.h.)
#
$NumberSequence = $NumericEx ($FormatEx* $MidNumEx? $FormatEx* $NumericEx)*;
$NumberSequence {100};

#
#  Words.  Alpha-numerics.  Rule 5, 6, 7, 9, 10
#     - must include at least one letter.
#     - may include both letters and numbers.
#     - may include MidLetter, MidNum punctuation.
#     - may end with one $SufixLetter (FULL STOP).
#    Tagged {200}.
#
$LetterSequence = $ALetterEx ($FormatEx* $MidLetterEx? $FormatEx* $ALetterEx)*;     # rules #6, #7
($NumberSequence $FormatEx*)? $LetterSequence ($FormatEx* ($NumberSequence | $LetterSequence))* $SufixLetterEx? {200};

#  Runs of punctuation and symbols are kept together, with no tag.
[[:P:][:S:]]*;

#
#  Do not break between Katakana.   Rule #13.
#    Each Hiragana character (with its combining marks) stands alone.
#    Both are tagged {300}.
#
$KatakanaEx ($FormatEx* $KatakanaEx)* {300};
[:Hiragana:] $Extend* {300};

#
#  Ideographic Characters.  Stand by themselves as words.
#                           Separated from the "Everything Else" rule, below, only so that they
#                           can be tagged with a return value.   TODO:  is this what we want?
#    Note: as written, consecutive ideographs (and consecutive Hangul) are grouped
#    into one run; both rules are tagged {400}.
#
$IdeographicEx ($FormatEx* $IdeographicEx)* {400};
$HangulEx ($FormatEx* $HangulEx)* {400};

#
#  Everything Else, with no tag.
#                   Non-Control chars combine with $Extend (combining) chars.
#                   Controls do not.
#
[^$Control [:Ideographic:]] $Extend*;
$CR $LF;

#
#  Reverse Rules.   Back up over any of the chars that can group together.
#                   (Reverse rules do not need to be exact; they can back up too far,
#                   but must back up at least enough, and must stop on a boundary.)
#

#  NonStarters are the set of all characters that can appear at the 2nd - nth position of
#    a word.   (They may also be the first.)   The reverse rule skips over these, until it
#    reaches something that can only be the start (and probably only) char in a "word".
#    A space or punctuation meets the test.
#
$NonStarters = [$Numeric $ALetter $Katakana $Ideographic $Hangul [:P:] [:S:] $MidLetter $MidNum $SufixLetter $Extend $Format];

#  The leading '!' marks the reverse rule.  "\n \r" is the CR LF pair as it is
#  encountered when scanning backwards.  The commented-out "!.*;" is an earlier,
#  disabled alternative and is kept for reference.
#!.*;
! ($NonStarters* | \n \r) .;