1*cdf0e10cSrcweir#
2*cdf0e10cSrcweir#   Copyright (C) 2002-2006, International Business Machines Corporation and others.
3*cdf0e10cSrcweir#       All Rights Reserved.
4*cdf0e10cSrcweir#
5*cdf0e10cSrcweir#   file:  sent.txt
6*cdf0e10cSrcweir#
7*cdf0e10cSrcweir#   ICU Sentence Break Rules
8*cdf0e10cSrcweir#      See Unicode Standard Annex #29.
9*cdf0e10cSrcweir#      These rules are based on UAX #29 (Unicode version 5.0.0)
10*cdf0e10cSrcweir#      Includes post 5.0 changes to treat Japanese half width voicing marks
11*cdf0e10cSrcweir#        as Grapheme Extend.
12*cdf0e10cSrcweir#
13*cdf0e10cSrcweir
14*cdf0e10cSrcweir
# Helper sets used by the definitions and rules below.
#   $VoiceMarks  the two code points U+FF9E and U+FF9F (the half width
#                voicing marks mentioned in the file header).
#   $Thai        every character whose Script property is Thai.
15*cdf0e10cSrcweir$VoiceMarks   = [\uff9e\uff9f];
16*cdf0e10cSrcweir$Thai         = [:Script = Thai:];
17*cdf0e10cSrcweir
18*cdf0e10cSrcweir#
19*cdf0e10cSrcweir# Character categories as defined in TR 29
#   Each set selects the characters that have one particular value of the
#   Sentence_Break property.
#   $OLetter additionally removes $VoiceMarks; those two characters are
#   added to $Extend instead (see the $Extend definition).
20*cdf0e10cSrcweir#
21*cdf0e10cSrcweir$Sep       = [\p{Sentence_Break = Sep}];
22*cdf0e10cSrcweir$Format    = [\p{Sentence_Break = Format}];
23*cdf0e10cSrcweir$Sp        = [\p{Sentence_Break = Sp}];
24*cdf0e10cSrcweir$Lower     = [\p{Sentence_Break = Lower}];
25*cdf0e10cSrcweir$Upper     = [\p{Sentence_Break = Upper}];
26*cdf0e10cSrcweir$OLetter   = [\p{Sentence_Break = OLetter}-$VoiceMarks];
27*cdf0e10cSrcweir$Numeric   = [\p{Sentence_Break = Numeric}];
28*cdf0e10cSrcweir$ATerm     = [\p{Sentence_Break = ATerm}];
29*cdf0e10cSrcweir$STerm     = [\p{Sentence_Break = STerm}];
30*cdf0e10cSrcweir$Close     = [\p{Sentence_Break = Close}];
31*cdf0e10cSrcweir
32*cdf0e10cSrcweir#
33*cdf0e10cSrcweir# Define extended forms of the character classes,
34*cdf0e10cSrcweir#   incorporate grapheme cluster + format chars.
35*cdf0e10cSrcweir#   Rules 4 and 5.
#
#   $CR / $LF   the single code points U+000D / U+000A.
#   $Extend     every character with Grapheme_Extend = TRUE, plus $VoiceMarks.
#   $xxxEx      one character of class $xxx followed by zero or more
#               $Extend or $Format characters, so that the rules below can
#               treat a base character and its trailing marks as one item.
36*cdf0e10cSrcweir
37*cdf0e10cSrcweir
38*cdf0e10cSrcweir$CR         = \u000d;
39*cdf0e10cSrcweir$LF         = \u000a;
40*cdf0e10cSrcweir$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];
41*cdf0e10cSrcweir
42*cdf0e10cSrcweir$SpEx       = $Sp      ($Extend | $Format)*;
43*cdf0e10cSrcweir$LowerEx    = $Lower   ($Extend | $Format)*;
44*cdf0e10cSrcweir$UpperEx    = $Upper   ($Extend | $Format)*;
45*cdf0e10cSrcweir$OLetterEx  = $OLetter ($Extend | $Format)*;
46*cdf0e10cSrcweir$NumericEx  = $Numeric ($Extend | $Format)*;
47*cdf0e10cSrcweir$ATermEx    = $ATerm   ($Extend | $Format)*;
48*cdf0e10cSrcweir$STermEx    = $STerm   ($Extend | $Format)*;
49*cdf0e10cSrcweir$CloseEx    = $Close   ($Extend | $Format)*;
50*cdf0e10cSrcweir
51*cdf0e10cSrcweir
52*cdf0e10cSrcweir## -------------------------------------------------
53*cdf0e10cSrcweir
# Rule-set options: turn on rule chaining, then open the forward rule
# section.  Everything up to the !!reverse marker is a forward rule.
54*cdf0e10cSrcweir!!chain;
55*cdf0e10cSrcweir!!forward;
56*cdf0e10cSrcweir
57*cdf0e10cSrcweir# Rule 3 - break after separators.  Keep CR/LF together.
#   The rule itself only matches a CR immediately followed by an LF.
58*cdf0e10cSrcweir#
59*cdf0e10cSrcweir$CR $LF;
60*cdf0e10cSrcweir
# Thai handling (this rule carries no rule number in this file).
#   $LettersEx is one $OLetter, $Upper, $Lower, $Numeric, $Close or $STerm
#   character followed by any number of $Extend / $Format characters.
#   The rule matches a $Thai character with any number of $LettersEx on
#   either side, followed by any run of $ATermEx / $SpEx.
#   NOTE(review): presumably this keeps text around Thai characters in one
#   piece across an ATerm + space sequence -- confirm the intended behavior.
61*cdf0e10cSrcweir$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
62*cdf0e10cSrcweir$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;
63*cdf0e10cSrcweir
64*cdf0e10cSrcweir# Rule 4 - Break after $Sep.
65*cdf0e10cSrcweir# Rule 5 - Ignore $Format and $Extend
#   Matches at most one character that is not a $Sep, followed by any
#   number of $Extend / $Format characters.  A $Sep is never consumed here.
66*cdf0e10cSrcweir#
67*cdf0e10cSrcweir[^$Sep]? ($Extend | $Format)*;
68*cdf0e10cSrcweir
69*cdf0e10cSrcweir
70*cdf0e10cSrcweir# Rule 6
#   Keep an ATerm together with a Numeric that directly follows it.
71*cdf0e10cSrcweir$ATermEx $NumericEx;
72*cdf0e10cSrcweir
73*cdf0e10cSrcweir# Rule 7
#   Keep the sequence Upper, ATerm, Upper together.
74*cdf0e10cSrcweir$UpperEx $ATermEx $UpperEx;
75*cdf0e10cSrcweir
76*cdf0e10cSrcweir#Rule 8
77*cdf0e10cSrcweir#  Note:  follows errata for Unicode 5.0 boundary rules.
#  $NotLettersEx is one character that is none of $OLetter, $Upper, $Lower,
#  $Sep, $ATerm, $STerm, followed by any number of $Extend / $Format.
#  The rule keeps an ATerm, an optional run of Close, an optional run of Sp
#  and any number of $NotLettersEx together with a following $Lower.
78*cdf0e10cSrcweir$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
79*cdf0e10cSrcweir$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
80*cdf0e10cSrcweir
81*cdf0e10cSrcweir# Rule 8a
#   Keep a terminator (STerm or ATerm), an optional run of Close and an
#   optional run of Sp together with a following terminator.
82*cdf0e10cSrcweir($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);
83*cdf0e10cSrcweir
84*cdf0e10cSrcweir#Rule 9, 10, 11
#   A terminator takes its trailing run of Close, its trailing run of Sp and
#   at most one following $Sep with it.
85*cdf0e10cSrcweir($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;
86*cdf0e10cSrcweir
87*cdf0e10cSrcweir#Rule 12
#   Both rules start either at the beginning of the text ({bof}) or on a
#   character that is not STerm, ATerm, Close, Sp, Sep, Format or Extend,
#   and then skip any run of $Extend / $Format / $Close / $Sp.
#   First rule:  the start character must also not be $Thai, and the match
#                continues onto one following character that is not $Thai.
#   Second rule: the match continues onto a $Sep, the end of the text
#                ({eof}) or a CR LF pair.
#   NOTE(review): {100} is presumably the rule status value reported when
#   this second rule produces the boundary -- confirm against the callers.
88*cdf0e10cSrcweir[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai];
89*cdf0e10cSrcweir[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};
90*cdf0e10cSrcweir
91*cdf0e10cSrcweir## -------------------------------------------------
92*cdf0e10cSrcweir
# Start of the reverse rule section.
93*cdf0e10cSrcweir!!reverse;
94*cdf0e10cSrcweir
# Reversed forms of $SpEx, $ATermEx, $STermEx and $CloseEx: here the
# $Extend / $Format characters come first and the base character last,
# the mirror image of the forward definitions.
95*cdf0e10cSrcweir$SpEx_R       = ($Extend | $Format)* $Sp;
96*cdf0e10cSrcweir$ATermEx_R    = ($Extend | $Format)* $ATerm;
97*cdf0e10cSrcweir$STermEx_R    = ($Extend | $Format)* $STerm;
98*cdf0e10cSrcweir$CloseEx_R    = ($Extend | $Format)* $Close;
99*cdf0e10cSrcweir
100*cdf0e10cSrcweir#
101*cdf0e10cSrcweir#  Reverse rules.
102*cdf0e10cSrcweir#     For now, use the old style inexact reverse rules, which are easier
103*cdf0e10cSrcweir#     to write, but less efficient.
104*cdf0e10cSrcweir#     TODO:  exact reverse rules.  It appears that exact reverse rules
105*cdf0e10cSrcweir#            may require improving support for look-ahead breaks in the
106*cdf0e10cSrcweir#            builder.  Needs more investigation.
107*cdf0e10cSrcweir#
108*cdf0e10cSrcweir
# The single reverse rule.  $LF $CR is the CR LF pair written in reverse
# order.  The pieces of the rule are explained in the comment that follows it.
109*cdf0e10cSrcweir[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
110*cdf0e10cSrcweir#.*;
111*cdf0e10cSrcweir
112*cdf0e10cSrcweir# Explanation for the reverse rule above:
113*cdf0e10cSrcweir#
114*cdf0e10cSrcweir#    It needs to back over
115*cdf0e10cSrcweir#        The $Sep at which we probably begin
116*cdf0e10cSrcweir#        All of the non $Sep chars leading to the preceding $Sep
117*cdf0e10cSrcweir#        The preceding $Sep, which will be the second one that the rule matches.
118*cdf0e10cSrcweir#        Any immediately preceding STerm or ATerm sequences.  We need to see these
119*cdf0e10cSrcweir#              to get the correct rule status when moving forwards again.
120*cdf0e10cSrcweir#
121*cdf0e10cSrcweir# [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
122*cdf0e10cSrcweir#                   the entire string.
123*cdf0e10cSrcweir#
124*cdf0e10cSrcweir# (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
125*cdf0e10cSrcweir#                   at the beginning of the string at this point, and we don't want to fail.
126*cdf0e10cSrcweir#                   Can only use {eof} once, and it is used later.
127*cdf0e10cSrcweir#
128*cdf0e10cSrcweir
129