#
#   Copyright (C) 2002-2006, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 version 5.0.0
#      Includes post 5.0 changes to treat Japanese half width voicing marks
#      as Grapheme Extend.
#


# Japanese half width voicing marks (U+FF9E, U+FF9F).  They are removed from
# $OLetter below and are meant to be treated as Grapheme Extend instead.
$VoiceMarks = [\uff9e\uff9f];

# All characters of the Thai script; used by the Thai-specific rules in the
# forward rule section.
$Thai       = [:Script = Thai:];

#
# Character categories as defined in TR 29
#
$Sep        = [\p{Sentence_Break = Sep}];
$Format     = [\p{Sentence_Break = Format}];
$Sp         = [\p{Sentence_Break = Sp}];
$Lower      = [\p{Sentence_Break = Lower}];
$Upper      = [\p{Sentence_Break = Upper}];
$OLetter    = [\p{Sentence_Break = OLetter}-$VoiceMarks];
$Numeric    = [\p{Sentence_Break = Numeric}];
$ATerm      = [\p{Sentence_Break = ATerm}];
$STerm      = [\p{Sentence_Break = STerm}];
$Close      = [\p{Sentence_Break = Close}];

#
# Define extended forms of the character classes,
#   incorporate grapheme cluster + format chars.
#   Rules 4 and 5.
# CR and LF are named individually so that Rule 3 can keep a CR LF pair together.
$CR         = \u000d;
$LF         = \u000a;

# Grapheme Extend characters plus the Japanese half width voicing marks.
$Extend     = [[:Grapheme_Extend = TRUE:]$VoiceMarks];

# Each *Ex class is the base class followed by any number of Extend or Format
# characters, so those trailing characters are consumed with their base.
$SpEx       = $Sp ($Extend | $Format)*;
$LowerEx    = $Lower ($Extend | $Format)*;
$UpperEx    = $Upper ($Extend | $Format)*;
$OLetterEx  = $OLetter ($Extend | $Format)*;
$NumericEx  = $Numeric ($Extend | $Format)*;
$ATermEx    = $ATerm ($Extend | $Format)*;
$STermEx    = $STerm ($Extend | $Format)*;
$CloseEx    = $Close ($Extend | $Format)*;


## -------------------------------------------------

!!chain;
!!forward;

# Rule 3 - break after separators.  Keep CR/LF together.
#
$CR $LF;

# Thai handling: a Thai character together with any surrounding run of
# letters, digits, Close and STerm characters, followed by any run of
# ATerm or Sp.
# NOTE(review): this looks like a local addition to the stock ICU rules that
#               stops ATerm/Sp from ending a sentence next to Thai text -
#               confirm the intent with the rule's author.
$LettersEx = [$OLetter $Upper $Lower $Numeric $Close $STerm] ($Extend | $Format)*;
$LettersEx* $Thai $LettersEx* ($ATermEx | $SpEx)*;

# Rule 4 - Break after $Sep.
# Rule 5 - Ignore $Format and $Extend
#
[^$Sep]? ($Extend | $Format)*;


# Rule 6
$ATermEx $NumericEx;

# Rule 7
$UpperEx $ATermEx $UpperEx;

#Rule 8
#  Note:  follows errata for Unicode 5.0 boundary rules.
# $NotLettersEx: any character that is not OLetter, Upper, Lower, Sep, ATerm
# or STerm, followed by any number of Extend or Format characters.
$NotLettersEx = [^$OLetter $Upper $Lower $Sep $ATerm $STerm] ($Extend | $Format)*;
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;

# Rule 8a
($STermEx | $ATermEx) $CloseEx* $SpEx* ($STermEx | $ATermEx);

#Rule 9, 10, 11
($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;

#Rule 12
#  The first rule excludes Thai both as the starting character and as the
#  character that is chained onto.
#  The second rule ends at a $Sep, at end of input, or at CR LF; {100} is the
#  rule status value attached to boundaries found by that rule.
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend $Thai]{bof}] ($Extend | $Format | $Close | $Sp)* [^$Thai];
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};

## -------------------------------------------------

!!reverse;

# Reversed forms of the extended classes: when scanning backwards the
# Extend / Format characters are encountered before their base character.
$SpEx_R     = ($Extend | $Format)* $Sp;
$ATermEx_R  = ($Extend | $Format)* $ATerm;
$STermEx_R  = ($Extend | $Format)* $STerm;
$CloseEx_R  = ($Extend | $Format)* $Close;

#
#  Reverse rules.
#     For now, use the old style inexact reverse rules, which are easier
#     to write, but less efficient.
#     TODO:  exact reverse rules.  It appears that exact reverse rules
#            may require improving support for look-ahead breaks in the
#            builder.  Needs more investigation.
#

[{bof}] (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))*;
#.*;

# Explanation for this rule:
#
#    It needs to back over
#        The $Sep at which we probably begin
#        All of the non $Sep chars leading to the preceding $Sep
#        The preceding $Sep, which will be the second one that the rule matches.
#        Any immediately preceding STerm or ATerm sequences.  We need to see these
#              to get the correct rule status when moving forwards again.
#
#    [{bof}]           inhibit rule chaining.  Without this, rule would loop on itself and match
#                      the entire string.
#
#    (.? | $LF $CR)    Match one $Sep instance.  Use .? rather than $Sep because position might be
#                      at the beginning of the string at this point, and we don't want to fail.
#                      Can only use {eof} once, and it is used later.
#