/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/

package org.apache.openoffice.ooxml.viewer.xml;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Vector;

/**
 * A small hand-written scanner that splits an XML character stream into
 * Token objects.
 *
 * The scanner works in two modes.  Outside of a tag, a run of characters that
 * starts with a non-whitespace character and extends up to the next '<' (or
 * the end of the input) becomes one TEXT token.  Inside of a tag, i.e. after
 * one of the symbols that start with '<', the input is split into identifiers,
 * double-quoted attribute values and punctuation symbols.  Whitespace is
 * consumed in both modes but is never handed out as a token (see AddToken()).
 */
public class XMLScanner
{
    /** Create a scanner that decodes the given byte stream as UTF-8.
     *  @param aIn
     *      The stream to read the XML text from.
     *  @throws IllegalStateException
     *      When the runtime does not know the UTF-8 encoding.
     */
    XMLScanner (final InputStream aIn)
    {
        final Reader aReader;
        try
        {
            aReader = new InputStreamReader(aIn, "UTF8");
        }
        catch (UnsupportedEncodingException e)
        {
            // Without a reader every later read would fail with a
            // NullPointerException, so fail right here and keep the cause.
            throw new IllegalStateException("UTF8 encoding is not supported", e);
        }
        maIn = aReader;
        mnNextCharacter = NO_CHARACTER;
        maTokens = new Vector<Token>();
        mnTokensReadIndex = 0;
        mbIsInsideTag = false;
        maTextBuffer = new int[1024];
    }




    /** Return the next token and remove it from the queue of pending tokens.
     *  At the end of the input every call returns an EOF token.
     *  @throws RuntimeException
     *      When the input can not be split into tokens.
     */
    public Token Next ()
    {
        while (maTokens.isEmpty())
            ProvideToken();

        final Token aToken = maTokens.get(mnTokensReadIndex);
        ++mnTokensReadIndex;
        // Recycle the queue as soon as all of its tokens have been handed out.
        if (mnTokensReadIndex >= maTokens.size())
        {
            maTokens.clear();
            mnTokensReadIndex = 0;
        }
        return aToken;
    }




    /** Return the next token without removing it from the queue of pending
     *  tokens.  The following call to Next() returns the same token.
     *  @throws RuntimeException
     *      When the input can not be split into tokens.
     */
    public Token Peek()
    {
        while (maTokens.isEmpty())
            ProvideToken();

        return maTokens.get(mnTokensReadIndex);
    }




    /** Scan the next lexical unit of the input.  This may add zero tokens
     *  (whitespace), one token, or (in ScanSymbol at the end of the input)
     *  an EOF token to the queue, which is why callers loop until the queue
     *  is not empty.
     */
    private void ProvideToken ()
    {
        final int nC = PeekCharacter();
        if (nC == -1)
        {
            AddToken(TokenType.EOF, "", mnOffset);
        }
        else if (IsWhitespace(nC))
        {
            // Whitespace is skipped the same way inside and outside of tags.
            ScanWhitespace();
        }
        else if (mbIsInsideTag)
        {
            switch(nC)
            {
                case '?':
                case '/':
                case '>':
                case '=':
                case ':':
                case '-':
                    switch(ScanSymbol())
                    {
                        case TAG_END:
                        case INTRO_END:
                        case ELEMENT_END:
                        case COMMENT_END:
                            // A comment ends the tag mode just like the other
                            // closing symbols.  Otherwise the '<' of the next
                            // tag would be scanned as (invalid) identifier.
                            mbIsInsideTag = false;
                            break;
                        default:
                            break;
                    }
                    break;

                case '"':
                    ScanQuotedValue();
                    break;

                default:
                    ScanIdentifier();
                    break;
            }
        }
        else if (nC == '<')
        {
            mbIsInsideTag = true;
            ScanSymbol();
        }
        else
        {
            ScanText();
        }
    }




    /** Return the next token whose type is not WHITESPACE.
     */
    Token NextNonWhitespaceToken ()
    {
        while(true)
        {
            final Token aToken = Next();
            if (aToken.Type != TokenType.WHITESPACE)
                return aToken;
        }
    }




    /** Scan one of the punctuation symbols
     *  "<", "</", "<?", "<!--", ">", "/>", "?>", "-->", "=" or ":"
     *  and add the corresponding token.
     *  @return
     *      The type of the token that has been added.
     *  @throws RuntimeException
     *      When the input does not continue with one of the known symbols.
     */
    private TokenType ScanSymbol ()
    {
        final int nStartOffset = mnOffset;

        switch (PeekCharacter())
        {
            case -1:
                AddToken(TokenType.EOF, "", nStartOffset);
                return TokenType.EOF;

            case '<':
                GetNextCharacter();
                // The character after the '<' decides which kind of tag starts.
                switch(PeekCharacter())
                {
                    case '/':
                        GetNextCharacter();
                        AddToken(TokenType.END_TAG_START, "</", nStartOffset);
                        break;

                    case '?':
                        GetNextCharacter();
                        AddToken(TokenType.INTRO_START, "<?", nStartOffset);
                        break;

                    case '!':
                        GetNextCharacter();
                        if (GetNextCharacter() != '-')
                            throw new RuntimeException("expected '-' after '<!'");
                        if (GetNextCharacter() != '-')
                            throw new RuntimeException("expected '-' after '<!-'");
                        AddToken(TokenType.COMMENT_START, "<!--", nStartOffset);
                        break;

                    default:
                        AddToken(TokenType.TAG_START, "<", nStartOffset);
                        break;
                }
                return maTokens.lastElement().Type;

            case '>':
                GetNextCharacter();
                AddToken(TokenType.TAG_END, ">", nStartOffset);
                return TokenType.TAG_END;

            case '/':
                GetNextCharacter();
                if (GetNextCharacter() != '>')
                    throw new RuntimeException("expected '>' after '/'");
                AddToken(TokenType.ELEMENT_END, "/>", nStartOffset);
                return TokenType.ELEMENT_END;

            case '?':
                GetNextCharacter();
                if (GetNextCharacter() != '>')
                    throw new RuntimeException("expected '>' after '?'");
                AddToken(TokenType.INTRO_END, "?>", nStartOffset);
                return TokenType.INTRO_END;

            case '-':
                GetNextCharacter();
                if (GetNextCharacter() != '-')
                    throw new RuntimeException("expected '-' after '-'");
                if (GetNextCharacter() != '>')
                    throw new RuntimeException("expected '>' after '--'");
                AddToken(TokenType.COMMENT_END, "-->", nStartOffset);
                return TokenType.COMMENT_END;

            case '=':
                GetNextCharacter();
                AddToken(TokenType.ATTRIBUTE_DEFINE, "=", nStartOffset);
                return TokenType.ATTRIBUTE_DEFINE;

            case ':':
                GetNextCharacter();
                AddToken(TokenType.COLON, ":", nStartOffset);
                return TokenType.COLON;

            default:
                throw new RuntimeException(String.format(
                    "unexpected character '%c' of type %d",
                    PeekCharacter(),
                    Character.getType(PeekCharacter())));
        }
    }




    /** Scan a run of letters and decimal digits and add it as IDENTIFIER token.
     *  @return
     *      Always true.
     *  @throws RuntimeException
     *      When the input does not start with a letter or digit.
     */
    private boolean ScanIdentifier ()
    {
        final int nStartOffset = mnOffset;
        int nBufferWriteIndex = 0;

        while (IsIdentifierCharacter(PeekCharacter()))
            nBufferWriteIndex = AppendToTextBuffer(nBufferWriteIndex, GetNextCharacter());

        if (nBufferWriteIndex == 0)
            throw new RuntimeException(
                String.format(
                    "missing identifier, got '%c' of type %d",
                    PeekCharacter(),
                    Character.getType(PeekCharacter())));

        AddToken(
            TokenType.IDENTIFIER,
            new String(maTextBuffer, 0, nBufferWriteIndex),
            nStartOffset);
        return true;
    }




    /** Consume a run of whitespace characters.  The WHITESPACE token that is
     *  passed to AddToken() is dropped there, so that this method effectively
     *  just skips the whitespace.  The end of the input is not handled here but
     *  by the next call to ProvideToken().
     */
    private void ScanWhitespace ()
    {
        final StringBuilder aBuffer = new StringBuilder();
        final int nStartOffset = mnOffset;

        while (IsWhitespace(PeekCharacter()))
            aBuffer.append((char)GetNextCharacter());

        if (aBuffer.length() > 0)
            AddToken(TokenType.WHITESPACE, aBuffer.toString(), nStartOffset);
    }




    /** Scan an attribute value that is enclosed in double quotes and add it,
     *  including both quotes, as ATTRIBUTE_VALUE token.  Does nothing when the
     *  next character is not a double quote.
     *  @throws RuntimeException
     *      When the input ends before the closing quote.
     */
    private void ScanQuotedValue ()
    {
        if (PeekCharacter() != '"')
            return;

        final int nStartOffset = mnOffset;
        // Start with the opening quote.
        int nBufferWriteIndex = AppendToTextBuffer(0, GetNextCharacter());

        while (true)
        {
            final int nC = GetNextCharacter();
            if (nC == -1)
            {
                // Without this check the loop would never terminate for
                // truncated input.
                throw new RuntimeException(String.format(
                    "unterminated quoted value that starts at offset %d",
                    nStartOffset));
            }
            nBufferWriteIndex = AppendToTextBuffer(nBufferWriteIndex, nC);
            if (nC == '"')
                break;
        }

        AddToken(
            TokenType.ATTRIBUTE_VALUE,
            new String(maTextBuffer, 0, nBufferWriteIndex),
            nStartOffset);
    }




    /** Scan the text between two tags and add it as TEXT token.  The text
     *  starts with the next character and ends in front of the next '<' or at
     *  the end of the input, whichever comes first.
     */
    private void ScanText ()
    {
        final int nStartOffset = mnOffset;
        int nBufferWriteIndex = AppendToTextBuffer(0, GetNextCharacter());

        while (true)
        {
            final int nC = PeekCharacter();
            // Stop at the end of the input as well, otherwise text that is not
            // followed by a tag would be scanned forever.
            if (nC == '<' || nC == -1)
                break;
            nBufferWriteIndex = AppendToTextBuffer(nBufferWriteIndex, GetNextCharacter());
        }

        AddToken(
            TokenType.TEXT,
            new String(maTextBuffer, 0, nBufferWriteIndex),
            nStartOffset);
    }




    /** Return whether the given character is treated as whitespace.  That is
     *  the case for the general categories that Character.getType() reports
     *  for blanks (SPACE_SEPARATOR) and for control characters like tab,
     *  carriage return and line feed (CONTROL).  The end-of-input marker -1 is
     *  not whitespace.
     */
    private static boolean IsWhitespace (final int nC)
    {
        final int nType = Character.getType(nC);
        return nType == Character.SPACE_SEPARATOR
            || nType == Character.CONTROL;
    }




    /** Return whether the given character may be part of an identifier, i.e.
     *  whether it is a lower case letter, an upper case letter or a decimal
     *  digit.
     */
    private static boolean IsIdentifierCharacter (final int nC)
    {
        switch(Character.getType(nC))
        {
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
            case Character.DECIMAL_DIGIT_NUMBER:
                return true;

            default:
                return false;
        }
    }




    /** Store a character in the shared text buffer and double the size of the
     *  buffer when it is full.
     *  @param nWriteIndex
     *      Index at which to store the character.
     *  @param nC
     *      The character to store.
     *  @return
     *      The index for the next character, i.e. nWriteIndex+1.
     */
    private int AppendToTextBuffer (final int nWriteIndex, final int nC)
    {
        if (nWriteIndex >= maTextBuffer.length)
            maTextBuffer = Arrays.copyOf(maTextBuffer, maTextBuffer.length*2);
        maTextBuffer[nWriteIndex] = nC;
        return nWriteIndex + 1;
    }




    /** Consume and return the next character and advance the offset by one.
     *  @return
     *      The next character or -1 at the end of the input.
     */
    private int GetNextCharacter ()
    {
        final int nC = PeekCharacter();
        mnNextCharacter = NO_CHARACTER;
        ++mnOffset;
        return nC;
    }




    /** Return the next character without consuming it.  A read error is
     *  reported on the error stream and then treated like the end of the input.
     *  @return
     *      The next character or -1 at the end of the input.
     */
    private int PeekCharacter ()
    {
        if (mnNextCharacter == NO_CHARACTER)
        {
            try
            {
                mnNextCharacter = maIn.read();
            }
            catch (IOException e)
            {
                e.printStackTrace();
                mnNextCharacter = -1;
            }
        }
        return mnNextCharacter;
    }




    /** Append a new token to the queue of pending tokens.  WHITESPACE tokens
     *  are silently dropped.
     *  @param eType
     *      Type of the new token.
     *  @param sText
     *      The text that the token covers.
     *  @param nOffset
     *      Offset of the first character of the token, counted in characters
     *      that have been consumed from the reader.
     */
    private void AddToken (
        final TokenType eType,
        final String sText,
        final int nOffset)
    {
        if (eType != TokenType.WHITESPACE)
            maTokens.add(new Token(eType, sText, nOffset));
    }




    /** Marks mnNextCharacter as empty.  The value is returned neither for a
     *  character nor for the end of the input by Reader.read().
     */
    private static final int NO_CHARACTER = -2;

    private final Reader maIn;
    /** The character that has been peeked but not yet consumed or NO_CHARACTER. */
    private int mnNextCharacter;
    /** Tokens that have been scanned but not yet handed out. */
    private final Vector<Token> maTokens;
    /** Index into maTokens of the token that is returned next. */
    private int mnTokensReadIndex;
    private boolean mbIsInsideTag;
    /** Scratch buffer that is shared by the Scan...() methods and grows on demand. */
    private int[] maTextBuffer;
    /** Number of characters that have been consumed so far. */
    private int mnOffset;
}