#
#   Copyright (C) 2002-2009, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  char.txt
#
#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
#      See Unicode Standard Annex #29.
#      These rules are based on TR29 Revision 13, for Unicode Version 5.1
#
#   Every rule below describes a sequence of characters that must stay
#   together; a boundary is allowed wherever no rule matches.
#

#
#  Character Class Definitions.
#  These sets come straight from the Grapheme_Cluster_Break property.
#
$CR          = [\p{Grapheme_Cluster_Break = CR}];
$LF          = [\p{Grapheme_Cluster_Break = LF}];
$Control     = [\p{Grapheme_Cluster_Break = Control}];
$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#  Anything that is not a control character or a line terminator.
#  Shared by the Extend / SpacingMark / Prepend rules in both directions.
$NonControl  = [^$Control $CR $LF];

#
#  Indic Letter and Virama Definitions.
#  Unlike the sets above, these are spelled out as explicit code point
#  ranges, one letter set and one virama sign per script.
#
$BengaliLetter        = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
$BengaliSignVirama    = \u09CD;

$GujaratiLetter       = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
$GujaratiSignVirama   = \u0ACD;

$DevanagariLetter     = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
$DevanagariSignVirama = \u094D;

$KannadaLetter        = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
$KannadaSignVirama    = \u0CCD;

$MalayalamLetter      = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
$MalayalamSignVirama  = \u0D4D;

$OriyaLetter          = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
$OriyaSignVirama      = \u0B4D;

$GurmukhiLetter       = [\u0A05-\u0A39 \u0A59-\u0A5E];
$GurmukhiSignVirama   = \u0A4D;

$TamilLetter          = [\u0B85-\u0BB9];
$TamilSignVirama      = \u0BCD;

$TeluguLetter         = [\u0C05-\u0C39 \u0C58-\u0C61];
$TeluguSignVirama     = \u0C4D;

#
#  Korean Syllable Definitions
#
$L   = [\p{Grapheme_Cluster_Break = L}];
$V   = [\p{Grapheme_Cluster_Break = V}];
$T   = [\p{Grapheme_Cluster_Break = T}];

$LV  = [\p{Grapheme_Cluster_Break = LV}];
$LVT = [\p{Grapheme_Cluster_Break = LVT}];


## -------------------------------------------------
#  Rule chaining is switched on for every rule set in this file.
!!chain;

## -------------------------------------------------
#  Forward rules.
!!forward;

#  A CR LF pair is never split (UAX #29 GB3).
$CR $LF;

#  Indic conjuncts: a letter followed by one or more virama signs, each
#  optionally followed by another letter of the same script.
$BengaliLetter    ($BengaliSignVirama    $BengaliLetter?)+;
$GujaratiLetter   ($GujaratiSignVirama   $GujaratiLetter?)+;
$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
$KannadaLetter    ($KannadaSignVirama    $KannadaLetter?)+;
$MalayalamLetter  ($MalayalamSignVirama  $MalayalamLetter?)+;
$OriyaLetter      ($OriyaSignVirama      $OriyaLetter?)+;
$GurmukhiLetter   ($GurmukhiSignVirama   $GurmukhiLetter?)+;
$TamilLetter      ($TamilSignVirama      $TamilLetter?)+;
$TeluguLetter     ($TeluguSignVirama     $TeluguLetter?)+;

#  Hangul syllable sequences (UAX #29 GB6, GB7, GB8).
$L ($L | $V | $LV | $LVT);
($LV | $V) ($V | $T);
($LVT | $T) $T;

#  No break before an Extend character (GB9).
$NonControl $Extend;

#  No break before a SpacingMark (GB9a) or after a Prepend (GB9b).
$NonControl $SpacingMark;
$Prepend $NonControl;


## -------------------------------------------------
#  Reverse rules: each forward rule again, with its sequence mirrored
#  because the text is scanned backwards.

!!reverse;

$LF $CR;

($BengaliLetter?    $BengaliSignVirama)+    $BengaliLetter;
($GujaratiLetter?   $GujaratiSignVirama)+   $GujaratiLetter;
($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
($KannadaLetter?    $KannadaSignVirama)+    $KannadaLetter;
($MalayalamLetter?  $MalayalamSignVirama)+  $MalayalamLetter;
($OriyaLetter?      $OriyaSignVirama)+      $OriyaLetter;
($GurmukhiLetter?   $GurmukhiSignVirama)+   $GurmukhiLetter;
($TamilLetter?      $TamilSignVirama)+      $TamilLetter;
($TeluguLetter?     $TeluguSignVirama)+     $TeluguLetter;

($L | $V | $LV | $LVT) $L;
($V | $T) ($LV | $V);
$T ($LVT | $T);

$Extend      $NonControl;
$SpacingMark $NonControl;
$NonControl  $Prepend;


## -------------------------------------------------
#  No safe-reverse rules are supplied; the section header is kept as is.

!!safe_reverse;


## -------------------------------------------------
#  No safe-forward rules are supplied; the section header is kept as is.

!!safe_forward;
