1#
2#   Copyright (C) 2002-2009, International Business Machines Corporation and others.
3#       All Rights Reserved.
4#
5#   file:  char.txt
6#
7#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
8#      See Unicode Standard Annex #29.
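9#      These rules are based on TR29 Revision 13, for Unicode Version 5.1,
#      plus per-script letter/virama rules for several Indic scripts
#      (see the $...Letter / $...SignVirama definitions and rules below).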
10#
11
12#
13#  Character Class Definitions.
14#
#  Classes selected by Unicode property: each set contains the code points
#  whose Grapheme_Cluster_Break property has the named value.
15$CR          = [\p{Grapheme_Cluster_Break = CR}];
16$LF          = [\p{Grapheme_Cluster_Break = LF}];
17$Control     = [\p{Grapheme_Cluster_Break = Control}];
18$Prepend     = [\p{Grapheme_Cluster_Break = Prepend}];
19$Extend      = [\p{Grapheme_Cluster_Break = Extend}];
20$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];
#  Per-script classes, one pair per script: a $<Script>Letter set given as
#  explicit code point ranges, and a $<Script>SignVirama holding a single
#  code point.  These are hard-coded ranges, not property lookups.  Each pair
#  is used by the letter/virama rules in the !!forward and !!reverse sections.
#  NOTE(review): the ranges are contiguous spans, so they may cover code
#  points that are not letters (or are unassigned) -- confirm against the
#  Unicode charts if exact membership matters.
21$BengaliLetter = [\u0985-\u09B9 \u09CE \u09DC-\u09E1 \u09F0-\u09F1];
22$BengaliSignVirama = \u09CD;
23$GujaratiLetter = [\u0A85-\u0A8C \u0A8F-\u0A90 \u0A93-\u0AB9 \u0AE0-\u0AE1];
24$GujaratiSignVirama = \u0ACD;
25$DevanagariLetter = [\u0904-\u0939 \u0958-\u0961 \u0972-\u097F];
26$DevanagariSignVirama = \u094D;
27$KannadaLetter = [\u0C85-\u0CB9 \u0CDE-\u0CE1];
28$KannadaSignVirama = \u0CCD;
29$MalayalamLetter = [\u0D05-\u0D39 \u0D60-\u0D61 \u0D7A-\u0D7F];
30$MalayalamSignVirama = \u0D4D;
31$OriyaLetter = [\u0B05-\u0B39 \u0B5C-\u0B61 \u0B71];
32$OriyaSignVirama = \u0B4D;
33$GurmukhiLetter = [\u0A05-\u0A39 \u0A59-\u0A5E];
34$GurmukhiSignVirama = \u0A4D;
35$TamilLetter = [\u0B85-\u0BB9];
36$TamilSignVirama = \u0BCD;
37$TeluguLetter = [\u0C05-\u0C39 \u0C58-\u0C61];
38$TeluguSignVirama = \u0C4D;
39
40#
41# Korean Syllable Definitions
42#
#  Classes for Korean syllable composition, each selected by its
#  Grapheme_Cluster_Break property value.  In UAX #29 terms: L, V and T are
#  the leading-consonant, vowel and trailing-consonant jamo.
43$L       = [\p{Grapheme_Cluster_Break = L}];
44$V       = [\p{Grapheme_Cluster_Break = V}];
45$T       = [\p{Grapheme_Cluster_Break = T}];
46
#  LV and LVT are the precomposed syllable classes (per UAX #29).
47$LV      = [\p{Grapheme_Cluster_Break = LV}];
48$LVT     = [\p{Grapheme_Cluster_Break = LVT}];
49
50
51## -------------------------------------------------
#  !!chain turns on rule chaining for this rule set (see the ICU break-rule
#  documentation for the exact chaining semantics).
52!!chain;
53
#  Forward rules: each rule names a sequence of character classes that is
#  matched as one unit when iterating forward through the text.
54!!forward;
55
#  CR immediately followed by LF.
56$CR $LF;
57
#  Per-script letter/virama sequences: a letter, then one or more groups of
#  (virama, optional letter).  The letter inside the group is optional, so a
#  sequence may end on a virama.
58$BengaliLetter ($BengaliSignVirama $BengaliLetter?)+;
59$GujaratiLetter ($GujaratiSignVirama $GujaratiLetter?)+;
60$DevanagariLetter ($DevanagariSignVirama $DevanagariLetter?)+;
61$KannadaLetter ($KannadaSignVirama $KannadaLetter?)+;
62$MalayalamLetter ($MalayalamSignVirama $MalayalamLetter?)+;
63$OriyaLetter ($OriyaSignVirama $OriyaLetter?)+;
64$GurmukhiLetter ($GurmukhiSignVirama $GurmukhiLetter?)+;
65$TamilLetter ($TamilSignVirama $TamilLetter?)+;
66$TeluguLetter ($TeluguSignVirama $TeluguLetter?)+;
67
#  Korean syllable sequences:
#    L   followed by L, V, LV or LVT
#    LV or V   followed by V or T
#    LVT or T  followed by T
68$L ($L | $V | $LV | $LVT);
69($LV | $V) ($V | $T);
70($LVT | $T) $T;
71
#  Any character other than Control, CR or LF, followed by an Extend character.
72[^$Control $CR $LF] $Extend;
73
#  Any character other than Control, CR or LF, followed by a SpacingMark;
#  and a Prepend character followed by any character other than Control, CR or LF.
74[^$Control $CR $LF] $SpacingMark;
75$Prepend [^$Control $CR $LF];
76
77
78## -------------------------------------------------
79
#  Reverse rules: each rule below is the mirror image of a rule in the
#  !!forward section, with its class sequence written in the opposite order
#  so it can be matched while moving backward through the text.
80!!reverse;
#  LF then CR (mirror of CR LF).
81$LF $CR;
#  Per-script letter/virama sequences, mirrored: one or more groups of
#  (optional letter, virama), then a letter.
82($BengaliLetter? $BengaliSignVirama)+ $BengaliLetter;
83($GujaratiLetter? $GujaratiSignVirama)+ $GujaratiLetter;
84($DevanagariLetter? $DevanagariSignVirama)+ $DevanagariLetter;
85($KannadaLetter? $KannadaSignVirama)+ $KannadaLetter;
86($MalayalamLetter? $MalayalamSignVirama)+ $MalayalamLetter;
87($OriyaLetter? $OriyaSignVirama)+ $OriyaLetter;
88($GurmukhiLetter? $GurmukhiSignVirama)+ $GurmukhiLetter;
89($TamilLetter? $TamilSignVirama)+ $TamilLetter;
90($TeluguLetter? $TeluguSignVirama)+ $TeluguLetter;
#  Korean syllable sequences, mirrored from the three forward rules.
91($L | $V | $LV | $LVT) $L;
92($V | $T) ($LV | $V);
93$T ($LVT | $T);
94
#  Extend or SpacingMark, then any character other than Control, CR or LF;
#  and any character other than Control, CR or LF, then Prepend.
95$Extend      [^$Control $CR $LF];
96$SpacingMark [^$Control $CR $LF];
97[^$Control $CR $LF] $Prepend;
98
99
100## -------------------------------------------------
101
#  Safe-reverse section: no rules are defined in this file.
#  NOTE(review): presumably the rule compiler falls back to a default when
#  this section is empty -- confirm against the ICU version in use.
102!!safe_reverse;
103
104
105## -------------------------------------------------
106
#  Safe-forward section: no rules are defined in this file.
107!!safe_forward;
108
109