# Copyright (c) 2002-2006 International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line.txt
#
#         Line Breaking Rules
#         Implement default line breaking as defined by Unicode Standard Annex #14 version 5.0.0
#         http://www.unicode.org/reports/tr14/



#
# Character Classes defined by TR 14.
#

!!chain;
!!LBCMNoChain;


!!lookAheadHardBreak;
#
#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
#                          and only used for the line break rules.
#
#           It is used in the implementation of the incredibly annoying rule LB 10
#           which says to treat any combining mark that is not attached to a base
#           character as if it were of class AL  (alphabetic).
#
#           The problem occurs in the reverse rules.
#
#           Consider a sequence like, with correct breaks as shown
#               LF  ID  CM  AL  AL
#                  ^       ^       ^
#           Then consider the sequence without the initial ID (ideographic)
#               LF  CM  AL  AL
#                  ^           ^
#           Our CM, which in the first example was attached to the ideograph,
#           is now unattached, becomes an alpha, and joins in with the other
#           alphas.
#
#           When iterating forwards, these sequences do not present any problems
#           When iterating backwards, we need to look ahead when encountering
#           a CM to see whether it attaches to something further on or not.
#           (Look-ahead in a reverse rule is looking towards the start)
#
#           If the CM is unattached, we need to force a break.
#
#           !!lookAheadHardBreak forces the run time state machine to
#           stop immediately when a look ahead rule ( '/' operator) matches,
#           and set the match position to that of the look-ahead operator,
#           no matter what other rules may be in play at the time.
#
#           See rule LB 19 for an example.
#           NOTE(review): the rules labelled LB 19 in this file do not use the '/'
#           operator; the look-ahead rules are in the !!reverse section (for example
#           the "$AL_FOLLOW $CM+ / ( ... )" rule). The reference above may be stale.
#

#
# One set variable per line break class, taken from the LineBreak property.
# Local adjustments visible in these definitions:
#   - $DG (U+00B0) is added to $AL.
#   - U+FE30 is removed from $ID and added to $IS.
# NOTE(review): $DG is only added to $AL; if U+00B0 also belongs to another
#   LineBreak class in the UCD it stays in that class's set too - confirm intent.
# NOTE(review): "Inseperable" is spelled as in the original; confirm that the
#   property-value alias is accepted before changing the spelling.
#
$AI = [:LineBreak =  Ambiguous:];
$DG = \u00B0;
$AL = [[:LineBreak =  Alphabetic:] $DG];
$BA = [:LineBreak =  Break_After:];
$BB = [:LineBreak =  Break_Before:];
$BK = [:LineBreak =  Mandatory_Break:];
$B2 = [:LineBreak =  Break_Both:];
$CB = [:LineBreak =  Contingent_Break:];
$CL = [:LineBreak =  Close_Punctuation:] ;
$CM = [:LineBreak =  Combining_Mark:];
$CR = [:LineBreak =  Carriage_Return:];
$EX = [:LineBreak =  Exclamation:];
$GL = [:LineBreak =  Glue:];
$HY = [:LineBreak =  Hyphen:];
$H2 = [:LineBreak =  H2:];
$H3 = [:LineBreak =  H3:];
$ID = [[:LineBreak =  Ideographic:] - [\ufe30]];
$IN = [:LineBreak =  Inseperable:];
$IS = [[:LineBreak =  Infix_Numeric:] [\ufe30]];
$JL = [:LineBreak =  JL:];
$JV = [:LineBreak =  JV:];
$JT = [:LineBreak =  JT:];
$LF = [:LineBreak =  Line_Feed:];
$NL = [:LineBreak =  Next_Line:];
$NS = [:LineBreak =  Nonstarter:];
$NU = [:LineBreak =  Numeric:];
# NOTE(review): $DG is the single character defined with the classes above.
#   If that character is not Open_Punctuation in the UCD, the subtraction
#   below removes nothing - confirm which class it was meant to be taken from.
$OP = [[:LineBreak =  Open_Punctuation:] - $DG];
$PO = [:LineBreak =  Postfix_Numeric:];
# $BS (U+005C) is removed from $PR here and added to $SY below.
$BS = \u005C;
$PR = [[:LineBreak =  Prefix_Numeric:] - $BS];
$QU = [:LineBreak =  Quotation:];
$SA = [:LineBreak =  Complex_Context:];
$SG = [:LineBreak =  Surrogate:];
$SP = [:LineBreak =  Space:];
$SY = [[:LineBreak =  Break_Symbols:] $BS];
$WJ = [:LineBreak =  Word_Joiner:];
$XX = [:LineBreak =  Unknown:];
$ZW = [:LineBreak =  ZWSpace:];

#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$dictionary = [:LineBreak = Complex_Context:];

#
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
#                               SA  (South East Asian: Thai, Lao, Khmer)
#                               SG  (Unpaired Surrogates)
#                               XX  (Unknown, unassigned)
#                         as $AL  (Alphabetic)
#
$ALPlus = [$AL $AI $SA $SG $XX];

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB 9 (this note originally said LB6).
#
$ALcm = $ALPlus $CM*;
$BAcm = $BA $CM*;
$BBcm = $BB $CM*;
$B2cm = $B2 $CM*;
$CLcm = $CL $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$JVcm = $JV $CM*;
$JTcm = $JT $CM*;
$NScm = $NS $CM*;
$NUcm = $NU $CM*;
$OPcm = $OP $CM*;
$POcm = $PO $CM*;
$PRcm = $PR $CM*;
$QUcm = $QU $CM*;
$SYcm = $SY $CM*;
$WJcm = $WJ $CM*;

## -------------------------------------------------

!!forward;

#
#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
#
$ALPlus $CM+;
$BA $CM+;
$BB $CM+;
$B2 $CM+;
$CL $CM+;
$EX $CM+;
$GL $CM+;
$HY $CM+;
$H2 $CM+;
$H3 $CM+;
$ID $CM+;
$IN $CM+;
$IS $CM+;
$JL $CM+;
$JV $CM+;
$JT $CM+;
$NS $CM+;
$NU $CM+;
$OP $CM+;
$PO $CM+;
$PR $CM+;
$QU $CM+;
$SY $CM+;
$WJ $CM+;

#
# CAN_CM  is the set of characters that may combine with CM combining chars.
#         Note that Linebreak UAX 14's concept of a combining char and the rules
#         for what they can combine with are _very_ different from the rest of Unicode.
#
#         Note that $CM itself is left out of this set.  If CM is needed as a base
#         it must be listed separately in the rule.
#
$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs

#
# AL_FOLLOW  set of chars that can unconditionally follow an AL
#            Needed in rules where stand-alone $CM s are treated as AL.
#            Chaining is disabled with CM because it causes other failures,
#            so for this one case we need to manually list out longer sequences.
#
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
$AL_FOLLOW_CM   = [$CL $EX $IS $SY $WJ $GL $QU $BA $HY $NS $IN $NU $ALPlus $OP];
$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];


#
#  Rule LB 4, 5    Mandatory (Hard) breaks.
#
#  The {100} suffix on the rules below is a rule status tag.
#  NOTE(review): presumably the value reported for hard breaks - confirm
#  against the break iterator documentation.
#
$LB4Breaks    = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$CR $LF {100};

#
#  LB 6    Do not break before hard line breaks.
#
$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
$CAN_CM $CM*    $LB4Breaks {100};
$CM+            $LB4Breaks {100};

# LB 7         x SP
#              x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM*  [$SP $ZW];
$CM+          [$SP $ZW];

#
# LB 8         Break after zero width space
#
$LB8Breaks    = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];


# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
#                                $CM not covered by the above needs to behave like $AL
#                                See definition of $CAN_CM.

$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
$CM+;

#
# LB 11  Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM*    $WJcm;
$LB8NonBreaks   $WJcm;
$CM+            $WJcm;

$WJcm [^$CAN_CM];
$WJcm $CAN_CM $CM*;

#
# LB 12  Do not break before or after NBSP and related characters.
#
#         (!SP) x GL
[$LB8NonBreaks-$SP] $CM* $GLcm;
$CM+                     $GLcm;

#         GL  x
$GLcm ($LB8Breaks | $SP);
$GLcm [$LB8NonBreaks-$SP] $CM*;     # Don't let a combining mark go onto $CR, $BK, etc.
                                    #  TODO:  I don't think we need this rule.
                                    #         All but $CM will chain off of preceding rule.
                                    #         $GLcm will pick up the CM case by itself.



#
# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
#
$LB8NonBreaks $CL;
$CAN_CM $CM*  $CL;
$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $EX;
$CAN_CM $CM*  $EX;
$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $IS;
$CAN_CM $CM*  $IS;
$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $SY;
$CAN_CM $CM*  $SY;
$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL


#
# LB 14  Do not break after OP, even after spaces
#
$OPcm $SP* $CAN_CM $CM*;
$OPcm $SP* $CANT_CM;

$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL

# LB 15
# $QUcm $SP* $OPcm;

# LB 16
$CLcm $SP* $NScm;

# LB 17
$B2cm $SP* $B2cm;

#
# LB 18  Break after spaces.
#
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks    = [$LB8Breaks $SP];


# LB 19
#         x QU
$LB18NonBreaks $CM* $QUcm;
$CM+                $QUcm;

#         QU  x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
                              #  TODO:  I don't think this rule is needed.


# LB 20
#        <break>  $CB
#        $CB   <break>

$LB20NonBreaks = [$LB18NonBreaks - $CB];

# LB 21        x   (BA | HY | NS)
#           BB x
#
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);

$BBcm [^$CB];                                  #  $BB  x
$BBcm $LB20NonBreaks $CM*;

# LB 22
$ALcm    $INcm;
$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
$IDcm    $INcm;
$INcm    $INcm;
$NUcm    $INcm;


# LB 23
$IDcm  $POcm;
$ALcm  $NUcm;       # includes $LB19
$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
$NUcm  $ALcm;

#
# LB 24
#
$PRcm $IDcm;
$ALcm $PRcm;
$PRcm $ALcm;
$POcm $ALcm;

#
# LB 25   Numbers.
#
($PRcm | $POcm)? ($OPcm)? $NUcm ($NUcm | $SYcm | $IScm)* $CLcm? ($PRcm | $POcm)?;

# LB 26  Do not break a Korean syllable
#
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
($JVcm | $H2cm) ($JVcm | $JTcm);
($JTcm | $H3cm) $JTcm;

# LB 27  Treat Korean Syllable Block the same as ID  (don't break it)
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);


# LB 28   Do not break between alphabetics
#
$ALcm $ALcm;
$CM+ $ALcm;      # The $CM+ is from rule 10, and unattached CM is treated as AL

# LB 29
$IScm ($ALcm | $NUcm);

#
# Rule 30   Do not break between letters, numbers or ordinary symbols
#           and opening or closing punctuation
#
($ALcm | $NUcm) $OPcm;
$CM+ $OPcm;
$CLcm ($ALcm | $NUcm);



#
#  Reverse Rules.
#
## -------------------------------------------------

!!reverse;

# Reverse counterparts of the forward "X $CM+" rules: when matching backwards
# the combining marks are seen first, then the character they follow.
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $EX;
$CM+ $GL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $SY;
$CM+ $WJ;
$CM+;


#
#  Sequences of the form  (shown forwards)
#      [CANT_CM]  <break>  [CM]  [whatever]
#  The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
          [$BK $CR $LF $NL $ZW {eof}] |
          $SP+ $CM+ $SP |
          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
                                               #  LB14 says    OP SP* x .
                                               #    becomes    OP SP* x AL
                                               #    becomes    OP SP* x CM+ AL_FOLLOW
                                               #
                                               # Further note:  the $AL in [$AL {eof}] is only to work around
                                               #                a rule compiler bug which complains about
                                               #                empty sets otherwise.

#
#  Sequences of the form  (shown forwards)
#      [CANT_CM]  <break> [CM]  <break>  [PR]
#  The CM needs to behave as an AL
#  This rule is concerned about getting the second of the two <breaks> in place.
#

[$PR   ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];



# LB 4, 5, 6

$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;


# LB 7         x SP
#              x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;

# LB 8 Break after zero width space


# LB 9,10  Combining marks.
#    X   $CM needs to behave like X, where X is not $SP or controls.
#    $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;


# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ [$LB8NonBreaks-$CM];

     $CANT_CM $CM* $WJ;
$CM* $CAN_CM  $CM* $WJ;

# LB 12
#      x GL
#
$CM* $GL $CM* [$LB8NonBreaks-$CM-$SP];

#
#     GL  x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;


# LB 13
$CL $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;

$CL [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];

# Rule 13 & 14 taken together for an edge case.
#   Match this, shown forward
#     OP SP+  ($CM+ behaving as $AL) (CL | EX | IS | SY)
#   This really wants to chain at the $CM+ (which is acting as an $AL)
#   except for $CM chaining being disabled.
[$CL $EX $IS $SY] $CM+ $SP+ $CM* $OP;

# LB 14    OP SP* x
#
$CM* $CAN_CM    $SP* $CM* $OP;
     $CANT_CM   $SP* $CM* $OP;
$AL_FOLLOW? $CM+  $SP $SP* $CM* $OP;     # by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP

     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.



# LB 15
# $CM* $OP $SP* $CM* $QU;

# LB 16
$CM* $NS $SP* $CM* $CL;

# LB 17
$CM* $B2 $SP* $CM* $B2;

# LB 18  break after spaces
#        Nothing explicit needed here.


#
# LB 19
#
$CM* $QU $CM* $CAN_CM;                # . x QU
$CM* $QU $LB18NonBreaks;


$CM* $CAN_CM  $CM* $QU;               #   QU x .
     $CANT_CM $CM* $QU;

#
#  LB 20  Break before and after CB.
#         nothing needed here.
#

# LB 21
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)

$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
[^$CB] $CM* $BB;                                      #



# LB 22
$CM* $IN $CM* $ALPlus;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;

# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* $ALPlus;
$CM* $ALPlus $CM* $NU;

# LB 24
$CM* $ID $CM* $PR;
$CM* $PR $CM* $ALPlus;
$CM* $ALPlus $CM* $PR;
$CM* $ALPlus $CM* $PO;

# Look-ahead ('/') rules: the context after the '/' is a single $SP.
$CM* $ALPlus $CM* ($IS | $SY | $HY)+ / $SP;
$CM* $NU+ $CM* $HY+ / $SP;

# LB 25
($CM* ($PR | $PO))? ($CM* $CL)? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP))? ($CM* ($PR | $PO))?;

# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);

# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;

# LB 28
$CM* $ALPlus $CM* $ALPlus;


# LB 29
$CM* ($NU | $ALPlus) $CM* $IS+ [^$SP];

# LB 30
$CM* $OP $CM* ($NU | $ALPlus);
$CM* ($NU | $ALPlus) $CM* ($CL | $SY)+ [^$SP];


## -------------------------------------------------

!!safe_reverse;

# LB 7   (NOTE(review): this label appears to use an older rule numbering; the forward
#         rules in this file label the combining-mark rule LB 9)
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;

# NOTE(review): the "LB n" labels in this section appear to use an older rule
# numbering. The number used by the forward rules of this file for the same
# class pair is given in parentheses.

# LB 9   (OP SP* x   is LB 14 in the forward rules)
$SP+ $CM* $OP;

# LB 10  (QU SP* x OP   is LB 15 in the forward rules)
$SP+ $CM* $QU;

# LB 11  (CL SP* x NS and B2 SP* x B2   are LB 16 and LB 17 in the forward rules)
$SP+ $CM* $CL;
$SP+ $CM* $B2;

# LB 18  (numbers are LB 25 in the forward rules)
($CM* ($IS | $SY))+ $CM* $NU;
$CL $CM* ($NU | $IS | $SY);

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# Skip forward over all character classes that are involved in
#   rules containing patterns with possibly more than one char
#   of context.
#
#  It might be slightly more efficient to have specific rules
#  instead of one generic one, but only if we could
#  turn off rule chaining.  We don't want to move more
#  than necessary.
#
[$CM $OP $QU $CL $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $B2 $PR $HY $dictionary];
$dictionary $dictionary;