1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 package org.openoffice.xmerge.converter.xml.sxw.wordsmith;
25 
26 import org.w3c.dom.*;
27 
28 import java.io.IOException;
29 import java.util.Enumeration;
30 
31 import org.openoffice.xmerge.Document;
32 import org.openoffice.xmerge.ConvertData;
33 import org.openoffice.xmerge.ConvertException;
34 import org.openoffice.xmerge.DocumentDeserializer;
35 import org.openoffice.xmerge.converter.xml.OfficeConstants;
36 import org.openoffice.xmerge.converter.palm.PalmDB;
37 import org.openoffice.xmerge.converter.palm.Record;
38 import org.openoffice.xmerge.converter.palm.PdbDecoder;
39 import org.openoffice.xmerge.converter.palm.PalmDocument;
40 import org.openoffice.xmerge.converter.xml.sxw.SxwDocument;
41 
42 import java.util.Vector;
43 import java.io.ByteArrayInputStream;
44 
45 import org.openoffice.xmerge.converter.xml.*;
46 import org.openoffice.xmerge.util.Debug;
47 import org.openoffice.xmerge.util.XmlUtil;
48 
49 /**
50  *  <p>WordSmith implementation of
51  *  org.openoffice.xmerge.DocumentDeserializer
52  *  for the {@link
53  *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.PluginFactoryImpl
54  *  PluginFactoryImpl}.</p>
55  *
 *  The <code>deserialize</code> method uses a
 *  <code>WSDecoder</code> to parse the WordSmith records into an
 *  array of <code>Wse</code> elements, then it calls
 *  <code>buildDocument</code> to create a <code>SxwDocument</code>
 *  object from them.
60  *
61  *  @author      Herbie Ong, David Proulx
62  */
63 public final class DocumentDeserializerImpl
64 implements DOCConstants, OfficeConstants, DocumentDeserializer {
65 
    /** A Decoder object for decoding WordSmith format. */
    private WSDecoder decoder = null;

    /** Font table element most recently encountered by <code>buildDocument</code>. */
    WseFontTable fontTable = null;
    /** Color table element most recently encountered by <code>buildDocument</code>. */
    WseColorTable colorTable = null;
    /** Catalog of the styles newly created while building the current document. */
    StyleCatalog styleCat = null;
    /** Catalog filled from the original document's styles, when one is supplied. */
    StyleCatalog oldStyleCat = null;

    /** A <code>ConvertData</code> object assigned to this object. */
    private ConvertData cd = null;
76 
77 
78     /**
79      *  Constructor that assigns the given <code>ConvertData</code>
80      *  to the object.
81      *
82      *  @param  cd  A <code>ConvertData</code> object to read data for
83      *              the conversion process by the deserialize method.
84      */
DocumentDeserializerImpl(ConvertData cd)85     public DocumentDeserializerImpl(ConvertData cd) {
86         this.cd = cd;
87     }
88 
89 
90     /**
91      *  Convert the given <code>ConvertData</code> into a
92      *  <code>SxwDocument</code> object.
93      *
94      *  @return  Resulting <code>Document</code> object.
95      *
96      *  @throws  ConvertException  If any conversion error occurs.
97      *  @throws  IOException       If any I/O error occurs.
98      */
deserialize()99     public Document deserialize() throws ConvertException,
100         IOException {
101         return deserialize(null, cd);
102     }
103 
104 
deserialize(Document origDoc, ConvertData cd)105     public Document deserialize(Document origDoc, ConvertData cd)
106     throws IOException {
107 
108         Document doc         = null;
109         PalmDocument palmDoc = null;
110         Enumeration e        = cd.getDocumentEnumeration();
111 
112         while(e.hasMoreElements()) {
113             palmDoc        = (PalmDocument) e.nextElement();
114             PalmDB pdb     = palmDoc.getPdb();
115             Record[] recs  = pdb.getRecords();
116             decoder        = new WSDecoder();
117             Wse[] b        = decoder.parseDocument(recs);
118             String docName = palmDoc.getName();
119             doc            = buildDocument(docName, b, origDoc);
120         }
121         return doc;
122     }
123 
124 
125     /**
126      *  Temporary method to read existing <code>StyleCatalog</code>
127      *  as a starting point.
128      *
129      *  @param  parentDoc  The parent <code>Document</code>.
130      */
readStyleCatalog(Document parentDoc)131     private void readStyleCatalog(Document parentDoc) {
132         Element rootNode = null;
133         try {
134             java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
135             parentDoc.write(bos);
136             SxwDocument sxwDoc = new SxwDocument("old");
137             sxwDoc.read(new ByteArrayInputStream(bos.toByteArray()));
138             org.w3c.dom.Document domDoc = sxwDoc.getContentDOM();
139 
140             String families[] = new String[3];
141             families[0] = "text";
142             families[1] = "paragraph";
143             families[2] = "paragraph";
144             Class classes[] = new Class[3];
145             classes[0] = TextStyle.class;
146             classes[1] = ParaStyle.class;
147             classes[2] = TextStyle.class;
148 
149             NodeList nl = domDoc.getElementsByTagName(TAG_OFFICE_STYLES);
150             oldStyleCat.add(nl.item(0), families, classes, null, false);
151             nl = domDoc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES);
152             oldStyleCat.add(nl.item(0), families, classes, null, false);
153             nl = domDoc.getElementsByTagName(TAG_OFFICE_MASTER_STYLES);
154             oldStyleCat.add(nl.item(0), families, classes, null, false);
155 
156         } catch (Exception e) {
157             Debug.log(Debug.ERROR, "", e);
158         }
159 
160     }
161 
162 
163     /**
164      *  Given an array of paragraph <code>Style</code> objects, see if
165      *  there is exactly one which matches the text formatting
166      *  <code>Style</code> of <code>tStyle</code>.
167      *
168      *  @param  paraStyles  An array of paragraph <code>Style</code>
169      *                      objects.
170      *  @param  tStyle      Text <code>Style</code> to match.
171      *
172      *  @return  The paragraph <code>Style</code> that matches.
173      */
matchParaByText(Style paraStyles[], TextStyle tStyle)174     private ParaStyle matchParaByText(Style paraStyles[], TextStyle tStyle) {
175         int matchIndex = -1;
176     int matchCount = 0;
177     Style txtMatches[] = (Style[]) oldStyleCat.getMatching(tStyle);
178     if (txtMatches.length >= 1) {
179         for (int j = 0; j < txtMatches.length; j++) {
180             TextStyle t = (TextStyle)txtMatches[j];
181 
182             if (!t.getFamily().equals("paragraph"))
183                 continue;
184 
185             for (int k = 0; k < paraStyles.length; k++) {
186                 if (t.getName().equals(paraStyles[k].getName())) {
187                     matchCount++;
188                 matchIndex = k;
189                 }
190             }
191         }
192     }
193     if (matchCount == 1)
194             return (ParaStyle)paraStyles[matchIndex];
195         else return null;
196     }
197 
198 
199     /**
200      *  Take a <code>String</code> of text and turn it into a sequence
201      *  of <code>Node</code> objects.
202      *
203      *  @param  text       <code>String</code> of text.
204      *  @param  parentDoc  Parent <code>Document</code>.
205      *
206      *  @return  Array of <code>Node</code> objects.
207      */
parseText(String text, org.w3c.dom.Document parentDoc)208     private Node[] parseText(String text, org.w3c.dom.Document parentDoc) {
209     Vector nodeVec = new Vector();
210 
211         // Break up the text from the WordSmith text run into Open
212         // Office text runs.  There may be more runs in OO because
213         // runs of 2 or more spaces map to nodes.
214         while ((text.indexOf("  ") != -1) || (text.indexOf("\t") != 1)) {
215 
216             // Find the indices of tabs and multiple spaces, and
217             // figure out which of them occurs first in the string.
218             int spaceIndex = text.indexOf("  ");
219             int tabIndex = text.indexOf("\t");
220             if ((spaceIndex == -1) && (tabIndex == -1))
221                 break;  // DJP This should not be necessary.  What is wrong
222             // with the while() stmt up above?
223             int closerIndex;  // Index of the first of these
224             if (spaceIndex == -1)
225                 closerIndex = tabIndex;
226             else if (tabIndex == -1)
227                 closerIndex = spaceIndex;
228             else
229                 closerIndex = (spaceIndex > tabIndex) ? tabIndex : spaceIndex;
230 
231             // If there is any text prior to the first occurrence of a
232             // tab or spaces, create a text node from it, then chop it
233             // off the string we're working with.
234             if (closerIndex > 0) {
235                 String beginningText = text.substring(0, closerIndex);
236                 Text textNode = parentDoc.createTextNode(beginningText);
237                 nodeVec.addElement(textNode);
238                 log("<TEXT>");
239                 log(beginningText);
240                 log("</TEXT>");
241             }
242             text = text.substring(closerIndex);
243 
244             // Handle either tab character or space sequence by creating
245             // an element for it, and then chopping out the text that
246             // represented it in "text".
247             if (closerIndex == tabIndex) {
248                 Element tabNode = parentDoc.createElement(TAG_TAB_STOP);
249                 nodeVec.add(tabNode);
250                 text = text.substring(1);  // tab is always a single character
251                 log("<TAB/>");
252             } else {
253                 // Compute length of space sequence.
254                 int nrSpaces = 2;
255                 while ((nrSpaces < text.length())
256                 && text.substring(nrSpaces, nrSpaces + 1).equals(" "))
257                     nrSpaces++;
258 
259                 Element spaceNode = parentDoc.createElement(TAG_SPACE);
260                 spaceNode.setAttribute(ATTRIBUTE_SPACE_COUNT, new Integer(nrSpaces).toString());
261                 nodeVec.add(spaceNode);
262                 text = text.substring(nrSpaces);
263                 log("<SPACE count=\"" + nrSpaces + "\" />");
264             }
265         }
266 
267         // No more tabs or space sequences.  If there's any remaining
268         // text create a text node for it.
269         if (text.length() > 0) {
270             Text textNode = parentDoc.createTextNode(text);
271             nodeVec.add(textNode);
272             log("<TEXT>");
273             log(text);
274             log("</TEXT>");
275         }
276 
277         // Now create and populate an array to return the nodes in.
278         Node nodes[] = new Node[nodeVec.size()];
279         for (int i = 0; i < nodeVec.size(); i++)
280             nodes[i] = (Node)nodeVec.elementAt(i);
281         return nodes;
282     }
283 
284 
285     /**
286      *  Parses the text content of a WordSmith format and builds a
287      *  <code>SXWDocument</code>.
288      *
289      *  @param  docName  <code>Document</code> name
290      *  @param  data      Text content of WordSmith format
291      *
292      *  @return  Resulting <code>SXWDocument</code> object.
293      *
294      *  @throws  IOException  If any I/O error occurs.
295      */
buildDocument(String docName, Wse[] data, Document origDoc)296     private SxwDocument buildDocument(String docName, Wse[] data, Document origDoc)
297     throws IOException {
298 
299         // create minimum office xml document.
300         SxwDocument sxwDoc = new SxwDocument(docName);
301         sxwDoc.initContentDOM();
302 
303         org.w3c.dom.Document doc = sxwDoc.getContentDOM();
304 
305         // Grab hold of the office:body tag,
306         // Assume there should be one.
307         // This is where top level paragraphs will append to.
308         NodeList list = doc.getElementsByTagName(TAG_OFFICE_BODY);
309         Node bodyNode = list.item(0);
310 
311         styleCat = new StyleCatalog(50);
312         oldStyleCat = new StyleCatalog(50);
313            if (origDoc != null)
314              readStyleCatalog(origDoc);
315 
316         Element currPara = null;
317         ParaStyle currParaStyle = null;
318         int newTextStyleNr = 0;
319         int newParaStyleNr = 0;
320 
321         // Now write out the document body by running through
322         // the list of WordSmith elements and processing each one
323         // in turn.
324         for (int i = 0; i < data.length; i++) {
325 
326             if (data[i].getClass() == WsePara.class) {
327 
328                 currPara = doc.createElement(TAG_PARAGRAPH);
329                 log("</PARA>");
330                 log("<PARA>");
331 
332                 WsePara p = (WsePara)data[i];
333 
334                 // Save info about the first text run, if there is one.
335                 WseTextRun firstTextRun = null;
336 
337                 if ((data.length >= i + 2)
338                 && (data[i+1].getClass() == WseTextRun.class))
339                     firstTextRun = (WseTextRun)data[i+1];
340 
341                 Style matches[] = oldStyleCat.getMatching(p.makeStyle());
342 
343                 // See if we can find a unique match in the catalog
344                 // of existing styles from the original document.
345                 ParaStyle pStyle = null;
346                 if (matches.length == 1) {
347                     pStyle = (ParaStyle)matches[0];
348                     log("using an existing style");
349                 } else if ((matches.length > 1) && (firstTextRun != null)) {
350                     pStyle = matchParaByText(matches, firstTextRun.makeStyle());
351                     log("resolved a para by looking @ text");
352                 }
353 
354                 // If nothing found so far, try looking in the catalog
355                 // of newly-created styles.
356                 // DJP FIXME: if we need to add two para styles with the
357                 // same para formatting info but different default text
358                 // styles, this won't work!
359                 if (pStyle == null) {
360                     log("had " + matches.length + " matches in old catalog");
361                     matches = styleCat.getMatching(p.makeStyle());
362                     if (matches.length == 0) {
363                         pStyle = p.makeStyle();
364                         String newName = new String("PPP" + ++newParaStyleNr);
365                         pStyle.setName(newName);
366                         styleCat.add(pStyle);
367                         // DJP: write in the text format info here
368                         log("created a new style");
369                     } else if (matches.length == 1) {
370                         pStyle = (ParaStyle)matches[0];
371                         log("re-using a new style");
372                     } else if (firstTextRun != null) {
373                         pStyle = matchParaByText(matches, firstTextRun.makeStyle());
374                         if (pStyle != null) {
375                             log("resolved a (new) para by looking @ text");
376                     } else
377                             log("Hey this shouldn't happen! - nr of matches is "
378                             + matches.length);
379                     }
380                 }
381 
382                 if (pStyle == null)
383                     log("Unable to figure out a para style");
384 
385                 // Figured out a style to use.  Specify the style in this
386                 // paragraph's attributes.
387                 currPara.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, pStyle.getName());
388 
389                 bodyNode.appendChild(currPara);
390                 currParaStyle = pStyle;
391             } else if (data[i].getClass() == WseTextRun.class) {
392                 WseTextRun tr = (WseTextRun)data[i];
393                 TextStyle trStyle = null;
394                 Node trNodes[] = parseText(tr.getText(), doc);
395 
396                 // First see if the formatting of this text run matches
397                 // the default text formatting for this paragraph.  If
398                 // it does, then just make the text node(s) children of
399                 // the current paragraph.
400                 Style[] cps = new Style[1];
401                 cps[0] = currParaStyle;
402                 if (matchParaByText(cps, tr.makeStyle()) != null) {
403                     for (int ii  = 0; ii < trNodes.length; ii++) {
404                         currPara.appendChild(trNodes[ii]);
405                     }
406                     continue;
407              }
408 
409                 // Check for existing, matching styles in the old style
410                 // catalog.  If exactly one is found, use it.  Otherwise,
411                 // check the new style catalog, and either use the style
412                 // found or add this new one to it.
413                 Style matches[] = oldStyleCat.getMatching(tr.makeStyle());
414                 if (matches.length == 1)
415                     trStyle = (TextStyle)matches[0];
416                 else {
417                     matches = styleCat.getMatching(tr.makeStyle());
418                     if (matches.length == 0) {
419                         trStyle = tr.makeStyle();
420                         String newName = new String("TTT" + ++newTextStyleNr);
421                         trStyle.setName(newName);
422                         styleCat.add(trStyle);
423                     } else if (matches.length == 1)
424                         trStyle = (TextStyle)matches[0];
425                     else
426                         log("multiple text style matches from new catalog");
427                 }
428 
429                 // Create a text span node, set the style attribute, make the
430                 // text node(s) its children, and append it to current paragraph's
431                 // list of children.
432                 Element textSpanNode = doc.createElement(TAG_SPAN);
433                 textSpanNode.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, trStyle.getName());
434                 for (int ii  = 0; ii < trNodes.length; ii++) {
435                     textSpanNode.appendChild(trNodes[ii]);
436                 }
437                 currPara.appendChild(textSpanNode);
438                 log("</SPAN>");
439             }
440 
441             else if (data[i].getClass() == WseFontTable.class) {
442                 fontTable = (WseFontTable)data[i];
443             }
444 
445             else if (data[i].getClass() == WseColorTable.class) {
446                 colorTable = (WseColorTable)data[i];
447             }
448         }
449 
450 
451         //NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT);
452         NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT_CONTENT);
453         Node rootNode = r.item(0);
454 
455         // read the original document
456         org.w3c.dom.NodeList nl;
457         if (origDoc != null) {
458             java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
459             origDoc.write(bos);
460             SxwDocument origSxwDoc = new SxwDocument("old");
461             origSxwDoc.read(new ByteArrayInputStream(bos.toByteArray()));
462             org.w3c.dom.Document origDomDoc = origSxwDoc.getContentDOM();
463 
464             XmlUtil xu = new XmlUtil();
465             org.w3c.dom.DocumentFragment df;
466             org.w3c.dom.Node newNode;
467 
468             // copy font declarations from original document to the new document
469             nl = origDomDoc.getElementsByTagName(TAG_OFFICE_FONT_DECLS);
470             df = doc.createDocumentFragment();
471             newNode = xu.deepClone(df, nl.item(0));
472             rootNode.insertBefore(newNode, bodyNode);
473 
474             // copy style catalog from original document to the new document
475             nl = origDomDoc.getElementsByTagName(TAG_OFFICE_STYLES);
476             df = doc.createDocumentFragment();
477             newNode = xu.deepClone(df, nl.item(0));
478             rootNode.insertBefore(newNode, bodyNode);
479 
480             nl = origDomDoc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES);
481             df = doc.createDocumentFragment();
482             newNode = xu.deepClone(df, nl.item(0));
483             rootNode.insertBefore(newNode, bodyNode);
484 
485             nl = origDomDoc.getElementsByTagName(TAG_OFFICE_MASTER_STYLES);
486             df = doc.createDocumentFragment();
487             newNode = xu.deepClone(df, nl.item(0));
488             rootNode.insertBefore(newNode, bodyNode);
489         }
490 
491         // Original document not specified.  We need to add font declarations.
492         // DJP: this might just be for debugging.  Merger will probably put
493         // the "real" ones in.
494         // DJP: if really doing it this way, do it right: gather font names
495         // from style catalog(s).
496         else {
497             org.w3c.dom.Node declNode;
498 
499             log("<FONT-DECLS/>");
500 
501             declNode = doc.createElement(TAG_OFFICE_FONT_DECLS);
502             rootNode.insertBefore(declNode, bodyNode);
503             org.w3c.dom.Element fontNode;
504 
505             fontNode = doc.createElement(TAG_STYLE_FONT_DECL);
506             fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arial");
507             fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arial");
508             fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable");
509             declNode.appendChild(fontNode);
510 
511             fontNode = doc.createElement(TAG_STYLE_FONT_DECL);
512             fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arioso");
513             fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arioso");
514             fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable");
515             declNode.appendChild(fontNode);
516         }
517 
518 
519         // Now add any new styles we have created in this document.
520         nl = doc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES);
521         Node autoStylesNode = nl.item(0);
522         if (autoStylesNode == null) {
523             autoStylesNode = doc.createElement(TAG_OFFICE_AUTOMATIC_STYLES);
524             log("<OFFICE-AUTOMATIC-STYLES/>");
525             rootNode.insertBefore(autoStylesNode, bodyNode);
526         }
527 
528         Node newStyleCatNode = styleCat.writeNode(doc, "dummy");
529         nl = newStyleCatNode.getChildNodes();
530         int nNodes = nl.getLength();
531         for (int i = 0; i < nNodes; i++) {
532             autoStylesNode.appendChild(nl.item(0));
533         }
534 
535         oldStyleCat.dumpCSV(true);
536         styleCat.dumpCSV(true);
537         return sxwDoc;
538     }
539 
540 
541     /**
542      *  Sends message to the log object.
543      *
544      *  @param  str  Debug message.
545      */
log(String str)546     private void log(String str) {
547 
548          Debug.log(Debug.TRACE, str);
549     }
550 
551 
552     /*
553     public static void main(String args[]) {
554 
555      //   DocumentDeserializerImpl d = new DocumentDeserializerImpl(new InputStream());
556 
557         Node nodes[] = parseText("Tab here:\tThen some more text");
558     }
559 */
560 }
561 
562