1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements.  See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership.  The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License.  You may obtain a copy of the License at
10 *
11 *   http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied.  See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21 
22 package org.apache.openoffice.ooxml.parser;
23 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.stream.Location;

import org.apache.openoffice.ooxml.parser.action.ActionManager;
import org.apache.openoffice.ooxml.parser.action.ActionTrigger;
import org.apache.openoffice.ooxml.parser.action.IAction;
38 
39 /** This OOXML parser is based on the output of the schema parser.
40  *  It exists to debug the schema parser and as illustration and preparation of
41  *  the C++ parse (yet to come.)
42  *  Because of this, the parser data (set of states and transitions) are
43  *  read at runtime while a real parser would do that at compile time.
44  */
45 public class OOXMLParser
46 {
47     class ActionContext
48     {
49         public Map<String,Integer> TypeCounts = new TreeMap<>();
50     }
51     /** The parser is called with two arguments:
52      *  - A path to where the parser tables with the states and transitions can
53      *    be found.
54      *  - The XML input file or Zip stream to parse.
55      *    The syntax for a Zip stream contains a '#' that separates the filename
56      *    to its left from the entry name to its right.
57      */
main(final String ... aArgumentList)58     public static void main (final String ... aArgumentList)
59     {
60         if (aArgumentList.length<2 ||aArgumentList.length>3)
61             throw new RuntimeException("usage: OOXMLParser <parser-tables-path> <XML-input-file> <log-file>?");
62 
63         if (aArgumentList.length == 3)
64         {
65             final File aLogFile = new File(aArgumentList[2]);
66             Log.Dbg = new Log(aLogFile);
67             System.out.printf("writing log data to %s\n", aLogFile.toString());
68         }
69         else
70         {
71             Log.Dbg = null;
72             System.out.printf("writing no log data\n");
73         }
74 
75         new OOXMLParser(aArgumentList[0], aArgumentList[1]);
76     }
77 
78 
79 
OOXMLParser( final String sParseTableFilename, final String sInputFilename)80     private OOXMLParser (
81         final String sParseTableFilename,
82         final String sInputFilename)
83     {
84         long nStartTime = System.currentTimeMillis();
85         final StateMachine aMachine = new StateMachine(new File(sParseTableFilename), null);
86         final InputStream aIn = GetInputStream(sInputFilename);
87         long nEndTime = System.currentTimeMillis();
88 
89         final ActionContext aActionContext = new ActionContext();
90         AddSomeActions(aMachine.GetActionManager(), aActionContext);
91 
92         System.out.printf("initialzed parser in %fs\n", (nEndTime-nStartTime)/1000.0);
93 
94         try
95         {
96             nStartTime = System.currentTimeMillis();
97             final Parser aParser = new Parser(aMachine, aIn);
98             aParser.Parse();
99             final int  nElementCount = aParser.GetElementCount();
100             nEndTime = System.currentTimeMillis();
101             System.out.printf("parsed %d elements in %fs\n",
102                 nElementCount,
103                 (nEndTime-nStartTime)/1000.0);
104 
105             System.out.printf("%d different elements found:\n", aActionContext.TypeCounts.size());
106             for (final Entry<String, Integer> aEntry : aActionContext.TypeCounts.entrySet())
107             {
108                 System.out.printf("%-32s : %6d\n", aEntry.getKey(), aEntry.getValue());
109             }
110         }
111         catch (final Exception aException)
112         {
113             aException.printStackTrace();
114         }
115     }
116 
117 
118 
119 
AddSomeActions( final ActionManager aActionManager, final ActionContext aActionContext)120     private static void AddSomeActions (
121         final ActionManager aActionManager,
122         final ActionContext aActionContext)
123     {
124         aActionManager.AddElementStartAction(
125             "*",
126             new IAction()
127             {
128                 @Override public void Run(
129                     final ActionTrigger eTrigger,
130                     final ElementContext aContext,
131                     final String sText,
132                     final Location aStartLocation,
133                     final Location aEndLocation)
134                 {
135                     Integer nValue = aActionContext.TypeCounts.get(aContext.GetTypeName());
136                     if (nValue == null)
137                         nValue = 1;
138                     else
139                         ++nValue;
140                     aActionContext.TypeCounts.put(aContext.GetTypeName(), nValue);
141                 }
142             }
143         );
144         aActionManager.AddElementStartAction(
145             ".*CT_Shd",
146             new IAction()
147             {
148                 @Override public void Run(
149                     final ActionTrigger eTrigger,
150                     final ElementContext aContext,
151                     final String sText,
152                     final Location aStartLocation,
153                     final Location aEndLocation)
154                 {
155                     System.out.printf("processing %s of element %s at position %d\n",
156                         eTrigger,
157                         aContext.GetElementName(),
158                         aStartLocation.getCharacterOffset());
159 
160                     if (aContext.GetAttributes().GetAttributeCount() == 0)
161                         System.out.printf("    no attributes\n");
162                     else
163                         for (final Entry<String,String> aAttribute : aContext.GetAttributes().GetAttributes())
164                             System.out.printf("    %s -> %s\n", aAttribute.getKey(), aAttribute.getValue());
165                 }
166             }
167         );
168         aActionManager.AddTextAction(
169             ".*CT_Text",
170             new IAction()
171             {
172                 @Override public void Run(
173                     final ActionTrigger eTrigger,
174                     final ElementContext aContext,
175                     final String sText,
176                     final Location aStartLocation,
177                     final Location aEndLocation)
178                 {
179 //                    System.out.printf("%s text \"%s\"\n", aContext.GetTypeName(), sText.replace("\n", "\\n"));
180                 }
181             }
182         );
183     }
184 
185 
186 
187 
GetInputStream(final String sInputName)188     private static InputStream GetInputStream (final String sInputName)
189     {
190         final InputStream aIn;
191         try
192         {
193             final int nSeparator = sInputName.indexOf('#');
194             if (nSeparator >= 0)
195             {
196                 // Split the input name into the file name of the archive and the
197                 // name of a zip entry.
198                 final String sArchiveName = sInputName.substring(0, nSeparator);
199                 String sEntryName = sInputName.substring(nSeparator+1);
200 
201                 // Normalize and cleanup the entry name.
202                 sEntryName = sEntryName.replace('\\',  '/');
203                 if (sEntryName.startsWith("/"))
204                     sEntryName = sEntryName.substring(1);
205 
206                 final ZipFile aZipFile = new ZipFile(new File(sArchiveName));
207                 final ZipEntry aZipEntry = aZipFile.getEntry(sEntryName);
208                 aIn = aZipFile.getInputStream(aZipEntry);
209             }
210             else
211             {
212                 // The input name points to a plain XML file.
213                 aIn = new FileInputStream(sInputName);
214             }
215         }
216         catch (final Exception aException)
217         {
218             aException.printStackTrace();
219             return null;
220         }
221         return aIn;
222     }
223 }
224