1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements.  See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership.  The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License.  You may obtain a copy of the License at
10 *
11 *   http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied.  See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21 
22 package org.apache.openoffice.ooxml.viewer.xml;
23 
24 import java.io.IOException;
25 import java.io.InputStream;
26 import java.io.InputStreamReader;
27 import java.io.Reader;
28 import java.io.UnsupportedEncodingException;
29 import java.util.Arrays;
30 import java.util.Vector;
31 
32 public class XMLScanner
33 {
XMLScanner(final InputStream aIn)34     XMLScanner (final InputStream aIn)
35     {
36         Reader aReader = null;
37         try
38         {
39             aReader = new InputStreamReader(aIn, "UTF8");
40         }
41         catch (UnsupportedEncodingException e)
42         {
43             e.printStackTrace();
44         }
45         maIn = aReader;
46         mnNextCharacter = 0;
47         maTokens = new Vector<Token>();
48         mnTokensReadIndex = 0;
49         mbIsInsideTag = false;
50         maTextBuffer = new int[1024];
51     }
52 
53 
54 
55 
Next()56     public Token Next ()
57     {
58         while (maTokens.isEmpty())
59             ProvideToken();
60 
61         final Token aToken = maTokens.get(mnTokensReadIndex);
62         ++mnTokensReadIndex;
63         if (mnTokensReadIndex >= maTokens.size())
64         {
65             maTokens.clear();
66             mnTokensReadIndex = 0;
67         }
68         return aToken;
69     }
70 
71 
72 
73 
Peek()74     public Token Peek()
75     {
76         while (maTokens.isEmpty())
77             ProvideToken();
78 
79         return maTokens.get(mnTokensReadIndex);
80     }
81 
82 
83 
84 
ProvideToken()85     private void ProvideToken ()
86     {
87         final int nC = PeekCharacter();
88         if (nC == -1)
89         {
90             AddToken(TokenType.EOF, "", mnOffset);
91         }
92         else if (mbIsInsideTag)
93         {
94             switch (Character.getType(nC))
95             {
96                 case Character.DIRECTIONALITY_WHITESPACE:
97                 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
98                     ScanWhitespace();
99                     break;
100 
101                 default:
102                     switch(nC)
103                     {
104                         case '?':
105                         case '/':
106                         case '>':
107                         case '=':
108                         case ':':
109                         case '-':
110                             switch(ScanSymbol())
111                             {
112                                 case TAG_END:
113                                 case INTRO_END:
114                                 case ELEMENT_END:
115                                     mbIsInsideTag = false;
116                                     break;
117                                 default:
118                                     break;
119                             }
120                             break;
121 
122                         case '"':
123                             ScanQuotedValue();
124                             break;
125 
126                         default:
127                             ScanIdentifier();
128                             break;
129                     }
130             }
131         }
132         else
133         {
134             switch (Character.getType(PeekCharacter()))
135             {
136                 case Character.DIRECTIONALITY_WHITESPACE:
137                 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
138                     ScanWhitespace();
139                     break;
140 
141                 default:
142                     if (nC == '<')
143                     {
144                         mbIsInsideTag = true;
145                         ScanSymbol();
146                     }
147                     else
148                     {
149                         ScanText();
150                     }
151                     break;
152             }
153         }
154     }
155 
156 
157 
158 
NextNonWhitespaceToken()159     Token NextNonWhitespaceToken ()
160     {
161         while(true)
162         {
163             final Token aToken = Next();
164             if (aToken.Type != TokenType.WHITESPACE)
165                 return aToken;
166         }
167     }
168 
169 
170 
171 
ScanSymbol()172     private TokenType ScanSymbol ()
173     {
174         final int nStartOffset = mnOffset;
175 
176         switch (PeekCharacter())
177         {
178             case -1:
179                 AddToken(TokenType.EOF, "", nStartOffset);
180                 return TokenType.EOF;
181 
182             case '<':
183                 GetNextCharacter();
184                 switch(PeekCharacter())
185                 {
186                     case '/':
187                         GetNextCharacter();
188                         AddToken(TokenType.END_TAG_START, "</", nStartOffset);
189                         break;
190 
191                     case '?':
192                         GetNextCharacter();
193                         AddToken(TokenType.INTRO_START, "<?", nStartOffset);
194                         break;
195 
196                     case '!':
197                         GetNextCharacter();
198                         if (GetNextCharacter() != '-')
199                             throw new RuntimeException("expected '-' after '<!'");
200                         if (GetNextCharacter() != '-')
201                             throw new RuntimeException("expected '-' after '<!-'");
202                         AddToken(TokenType.COMMENT_START, "<!--", nStartOffset);
203                         break;
204 
205                     default:
206                         AddToken(TokenType.TAG_START, "<", nStartOffset);
207                         break;
208                 }
209                 return maTokens.lastElement().Type;
210 
211             case '>':
212                 GetNextCharacter();
213                 AddToken(TokenType.TAG_END, ">", nStartOffset);
214                 return TokenType.TAG_END;
215 
216             case '/':
217                 GetNextCharacter();
218                 if (GetNextCharacter() != '>')
219                     throw new RuntimeException("expected '>' after '/'");
220                 AddToken(TokenType.ELEMENT_END, "/>", nStartOffset);
221                 return TokenType.ELEMENT_END;
222 
223             case '?':
224                 GetNextCharacter();
225                 if (GetNextCharacter() != '>')
226                     throw new RuntimeException("expected '>' after '?'");
227                 AddToken(TokenType.INTRO_END, "?>", nStartOffset);
228                 return TokenType.INTRO_END;
229 
230             case '-':
231                 GetNextCharacter();
232                 if (GetNextCharacter() != '-')
233                     throw new RuntimeException("expected '-' after '-'");
234                 if (GetNextCharacter() != '>')
235                     throw new RuntimeException("expected '>' after '--'");
236                 AddToken(TokenType.COMMENT_END, "-->", nStartOffset);
237                 return TokenType.COMMENT_END;
238 
239             case '=':
240                 GetNextCharacter();
241                 AddToken(TokenType.ATTRIBUTE_DEFINE, "=", nStartOffset);
242                 return TokenType.ATTRIBUTE_DEFINE;
243 
244             case ':':
245                 GetNextCharacter();
246                 AddToken(TokenType.COLON, ":", nStartOffset);
247                 return TokenType.COLON;
248 
249             default:
250                     throw new RuntimeException(String.format(
251                             "unexpected character '%c' of type %d",
252                             PeekCharacter(),
253                             Character.getType(PeekCharacter())));
254         }
255     }
256 
257 
258 
259 
ScanIdentifier()260     private boolean ScanIdentifier ()
261     {
262         final int nStartOffset = mnOffset;
263         int nBufferWriteIndex = 0;
264 
265         while (true)
266         {
267             switch(Character.getType(PeekCharacter()))
268             {
269                 default:
270                 case -1:
271                     if (nBufferWriteIndex == 0)
272                         throw new RuntimeException(
273                                 String.format(
274                                         "missing identifier, got '%c' of type %d",
275                                         PeekCharacter(),
276                                         Character.getType(PeekCharacter())));
277                     AddToken(
278                         TokenType.IDENTIFIER,
279                         new String(maTextBuffer, 0, nBufferWriteIndex),
280                         nStartOffset);
281                     return true;
282 
283                 case Character.LOWERCASE_LETTER:
284                 case Character.UPPERCASE_LETTER:
285                 case Character.DECIMAL_DIGIT_NUMBER:
286                     if (nBufferWriteIndex >= maTextBuffer.length)
287                         maTextBuffer = Arrays.copyOf(maTextBuffer, maTextBuffer.length*2);
288                     maTextBuffer[nBufferWriteIndex] = GetNextCharacter();
289                     ++nBufferWriteIndex;
290                     break;
291             }
292         }
293     }
294 
295 
296 
297 
ScanWhitespace()298     private void ScanWhitespace ()
299     {
300         final StringBuffer aBuffer = new StringBuffer();
301         final int nStartOffset = mnOffset;
302 
303         while (true)
304         {
305             switch(Character.getType(PeekCharacter()))
306             {
307                 default:
308                     if (aBuffer.length() > 0)
309                         AddToken(TokenType.WHITESPACE, aBuffer.toString(), nStartOffset);
310                     return;
311 
312                 case -1:
313                     AddToken(TokenType.WHITESPACE, aBuffer.toString(), nStartOffset);
314                     AddToken(TokenType.EOF, "", nStartOffset);
315                     return;
316 
317                 case Character.DIRECTIONALITY_WHITESPACE:
318                 case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
319                     aBuffer.append((char)GetNextCharacter());
320                     break;
321             }
322         }
323     }
324 
325 
326 
327 
ScanQuotedValue()328     private void ScanQuotedValue ()
329     {
330         if (PeekCharacter() == '"')
331         {
332             final int nStartOffset = mnOffset;
333             int nBufferWriteIndex = 0;
334             maTextBuffer[nBufferWriteIndex++] = GetNextCharacter();
335 
336             while (PeekCharacter() != '"')
337             {
338                 // Make sure that there is enough space for this character and the end quote.
339                 if (nBufferWriteIndex >= maTextBuffer.length-1)
340                     maTextBuffer = Arrays.copyOf(maTextBuffer, maTextBuffer.length*2);
341                 maTextBuffer[nBufferWriteIndex++] = GetNextCharacter();
342             }
343 
344             maTextBuffer[nBufferWriteIndex++] = GetNextCharacter();
345 
346             AddToken(TokenType.ATTRIBUTE_VALUE, new String(maTextBuffer, 0, nBufferWriteIndex), nStartOffset);
347         }
348     }
349 
350 
351 
352 
ScanText()353     private void ScanText ()
354     {
355         final int nStartOffset = mnOffset;
356         int nBufferWriteIndex = 0;
357         maTextBuffer[nBufferWriteIndex++] = GetNextCharacter();
358 
359         while (PeekCharacter() != '<')
360         {
361             if (nBufferWriteIndex >= maTextBuffer.length)
362                 maTextBuffer = Arrays.copyOf(maTextBuffer, maTextBuffer.length*2);
363             maTextBuffer[nBufferWriteIndex++] = GetNextCharacter();
364         }
365 
366         AddToken(TokenType.TEXT, new String(maTextBuffer, 0, nBufferWriteIndex), nStartOffset);
367     }
368 
369 
370 
371 
GetNextCharacter()372     private int GetNextCharacter ()
373     {
374         final int nC;
375         if (mnNextCharacter != 0)
376         {
377             nC = mnNextCharacter;
378             mnNextCharacter = 0;
379         }
380         else
381         {
382             try
383             {
384                 nC = maIn.read();
385             }
386             catch (Exception e)
387             {
388                 e.printStackTrace();
389                 return -1;
390             }
391         }
392         ++mnOffset;
393         return nC;
394     }
395 
396 
397 
398 
PeekCharacter()399     private int PeekCharacter ()
400     {
401         if (mnNextCharacter == 0)
402         {
403             try
404             {
405                 mnNextCharacter = maIn.read();
406             }
407             catch (IOException e)
408             {
409                 e.printStackTrace();
410                 mnNextCharacter = -1;
411             }
412         }
413         return mnNextCharacter;
414     }
415 
416 
417 
418 
AddToken( final TokenType eType, final String sText, final int nOffset)419     private void AddToken (
420             final TokenType eType,
421             final String sText,
422             final int nOffset)
423     {
424         if (eType != TokenType.WHITESPACE)
425             maTokens.add(new Token(eType, sText, nOffset));
426     }
427 
428 
429 
430 
431     private final Reader maIn;
432     private int mnNextCharacter;
433     private Vector<Token> maTokens;
434     private int mnTokensReadIndex;
435     private boolean mbIsInsideTag;
436     private int[] maTextBuffer;
437     private int mnOffset;
438 }
439