sxw/wordsmith/DocumentDeserializerImpl.java

/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/


package org.openoffice.xmerge.converter.xml.sxw.wordsmith;

import org.w3c.dom.*;

import java.io.IOException;
import java.util.Enumeration;

import org.openoffice.xmerge.Document;
import org.openoffice.xmerge.ConvertData;
import org.openoffice.xmerge.ConvertException;
import org.openoffice.xmerge.DocumentDeserializer;
import org.openoffice.xmerge.converter.xml.OfficeConstants;
import org.openoffice.xmerge.converter.palm.PalmDB;
import org.openoffice.xmerge.converter.palm.Record;
import org.openoffice.xmerge.converter.palm.PdbDecoder;
import org.openoffice.xmerge.converter.palm.PalmDocument;
import org.openoffice.xmerge.converter.xml.sxw.SxwDocument;

import java.util.Vector;
import java.io.ByteArrayInputStream;

import org.openoffice.xmerge.converter.xml.*;
import org.openoffice.xmerge.util.Debug;
import org.openoffice.xmerge.util.XmlUtil;

/**
 *  <p>WordSmith implementation of
 *  org.openoffice.xmerge.DocumentDeserializer
 *  for the {@link
 *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.PluginFactoryImpl
 *  PluginFactoryImpl}.</p>
 *
 *  The <code>deserialize</code> method uses a
 *  <code>WSDecoder</code> to parse the WordSmith records into an
 *  array of <code>Wse</code> elements, then it calls
 *  <code>buildDocument</code> to create a <code>SxwDocument</code>
 *  object from them.
 *
 *  @author      Herbie Ong, David Proulx
 */
public final class DocumentDeserializerImpl
implements DOCConstants, OfficeConstants, DocumentDeserializer {

    /**
     *  A Decoder object for decoding WordSmith format.  A new instance
     *  is created for each document processed by
     *  <code>deserialize</code>.
     */
    private WSDecoder decoder = null;

    /** The font table element most recently seen by <code>buildDocument</code>. */
    WseFontTable fontTable = null;
    /** The color table element most recently seen by <code>buildDocument</code>. */
    WseColorTable colorTable = null;
    /** Catalog of the styles newly created while building the current document. */
    StyleCatalog styleCat = null;
    /** Catalog of the styles read from the original document, if one was supplied. */
    StyleCatalog oldStyleCat = null;

    /** A <code>ConvertData</code> object assigned to this object. */
    private ConvertData cd = null;


    /**
     *  Constructor that assigns the given <code>ConvertData</code>
     *  to the object.  The data is only stored here; it is read later,
     *  when the no-argument <code>deserialize</code> method is called.
     *
     *  @param  cd  A <code>ConvertData</code> object to read data for
     *              the conversion process by the deserialize method.
     */
    public DocumentDeserializerImpl(ConvertData cd) {
        this.cd = cd;
    }


    /**
     *  Converts the <code>ConvertData</code> that was supplied at
     *  construction time into a <code>SxwDocument</code> object,
     *  without reference to any original document.
     *
     *  @return  Resulting <code>Document</code> object.
     *
     *  @throws  ConvertException  If any conversion error occurs.
     *  @throws  IOException       If any I/O error occurs.
     */
    public Document deserialize() throws ConvertException, IOException {
        // There is no original document on this path, so none is
        // offered to the two-argument variant.
        final Document origDoc = null;
        return deserialize(origDoc, this.cd);
    }


    /**
     *  Converts every <code>PalmDocument</code> held by the given
     *  <code>ConvertData</code> into a <code>SxwDocument</code>.
     *
     *  <p>Each document's records are decoded with a fresh
     *  <code>WSDecoder</code> and the result is passed to
     *  <code>buildDocument</code>.  If the <code>ConvertData</code>
     *  holds several documents, only the one converted last is
     *  returned.</p>
     *
     *  @param  origDoc  Original <code>Document</code> passed on to
     *                   <code>buildDocument</code>; may be
     *                   <code>null</code>.
     *  @param  cd       <code>ConvertData</code> holding the documents
     *                   to convert.
     *
     *  @return  The last converted <code>Document</code>, or
     *           <code>null</code> if there was nothing to convert.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    public Document deserialize(Document origDoc, ConvertData cd)
    throws IOException {

        Document converted = null;

        for (Enumeration docs = cd.getDocumentEnumeration();
             docs.hasMoreElements(); ) {

            PalmDocument source = (PalmDocument) docs.nextElement();
            Record[] records = source.getPdb().getRecords();

            decoder = new WSDecoder();
            Wse[] elements = decoder.parseDocument(records);

            converted = buildDocument(source.getName(), elements, origDoc);
        }

        return converted;
    }


    /**
     *  Temporary method that loads the styles of an existing document
     *  into <code>oldStyleCat</code> so they can serve as a starting
     *  point for style matching.
     *
     *  <p>The document is written to memory and read back as a
     *  <code>SxwDocument</code>; the first styles, automatic-styles
     *  and master-styles elements of its content DOM are then added
     *  to the catalog.  Any exception raised on the way is logged and
     *  otherwise ignored.</p>
     *
     *  @param  parentDoc  The parent <code>Document</code>.
     */
    private void readStyleCatalog(Document parentDoc) {
        try {
            // Round-trip the document through a byte buffer to obtain
            // its content DOM.
            java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
            parentDoc.write(buffer);
            SxwDocument oldDoc = new SxwDocument("old");
            oldDoc.read(new ByteArrayInputStream(buffer.toByteArray()));
            org.w3c.dom.Document oldDom = oldDoc.getContentDOM();

            // Style families to pick up and the class used for each;
            // the "paragraph" family is listed twice, once per class.
            String families[] = { "text", "paragraph", "paragraph" };
            Class classes[] = { TextStyle.class, ParaStyle.class, TextStyle.class };

            String sections[] = {
                TAG_OFFICE_STYLES,
                TAG_OFFICE_AUTOMATIC_STYLES,
                TAG_OFFICE_MASTER_STYLES
            };
            for (int s = 0; s < sections.length; s++) {
                NodeList found = oldDom.getElementsByTagName(sections[s]);
                oldStyleCat.add(found.item(0), families, classes, null, false);
            }
        } catch (Exception e) {
            Debug.log(Debug.ERROR, "", e);
        }
    }


    /**
     *  Looks for the single paragraph <code>Style</code> in
     *  <code>paraStyles</code> whose name equals the name of a
     *  "paragraph"-family style in the old style catalog that matches
     *  the text formatting of <code>tStyle</code>.
     *
     *  @param  paraStyles  An array of paragraph <code>Style</code>
     *                      objects.
     *  @param  tStyle      Text <code>Style</code> to match.
     *
     *  @return  The paragraph <code>Style</code> that matches, or
     *           <code>null</code> unless there is exactly one.
     */
    private ParaStyle matchParaByText(Style paraStyles[], TextStyle tStyle) {
        Style[] candidates = oldStyleCat.getMatching(tStyle);

        int hits = 0;
        int lastHit = -1;

        for (int c = 0; c < candidates.length; c++) {
            TextStyle candidate = (TextStyle) candidates[c];

            // Only text formatting that belongs to a paragraph style
            // is of interest here.
            if (candidate.getFamily().equals("paragraph")) {
                for (int p = 0; p < paraStyles.length; p++) {
                    if (candidate.getName().equals(paraStyles[p].getName())) {
                        hits++;
                        lastHit = p;
                    }
                }
            }
        }

        // Anything other than one single hit is treated as no match.
        return (hits == 1) ? (ParaStyle) paraStyles[lastHit] : null;
    }


    /**
     *  Take a <code>String</code> of text and turn it into a sequence
     *  of <code>Node</code> objects.
     *
     *  <p>Stretches of ordinary text become <code>Text</code> nodes,
     *  every tab character becomes a <code>TAG_TAB_STOP</code>
     *  element, and every run of two or more consecutive spaces
     *  becomes one <code>TAG_SPACE</code> element whose
     *  <code>ATTRIBUTE_SPACE_COUNT</code> attribute holds the length
     *  of the run.  A single space stays part of the surrounding
     *  text.  The nodes are created but not attached to the
     *  document.</p>
     *
     *  @param  text       <code>String</code> of text.
     *  @param  parentDoc  DOM <code>Document</code> used to create the
     *                     nodes.
     *
     *  @return  Array of <code>Node</code> objects, in text order;
     *           empty if <code>text</code> is empty.
     */
    private Node[] parseText(String text, org.w3c.dom.Document parentDoc) {
        Vector nodeVec = new Vector();

        // Break up the text from the WordSmith text run into Open
        // Office text runs.  There may be more runs in OO because
        // runs of 2 or more spaces map to nodes.
        for (;;) {

            // Find the indices of tabs and multiple spaces.  indexOf
            // reports "not found" as -1; the loop is done once neither
            // a tab nor a space run remains, wherever in the string
            // the last one was.
            int spaceIndex = text.indexOf("  ");
            int tabIndex = text.indexOf('\t');
            if ((spaceIndex == -1) && (tabIndex == -1))
                break;

            // Figure out which of the two occurs first in the string.
            int closerIndex;
            if (spaceIndex == -1)
                closerIndex = tabIndex;
            else if (tabIndex == -1)
                closerIndex = spaceIndex;
            else
                closerIndex = Math.min(spaceIndex, tabIndex);

            // If there is any text prior to the first occurrence of a
            // tab or spaces, create a text node from it, then chop it
            // off the string we're working with.
            if (closerIndex > 0) {
                String beginningText = text.substring(0, closerIndex);
                nodeVec.addElement(parentDoc.createTextNode(beginningText));
                log("<TEXT>");
                log(beginningText);
                log("</TEXT>");
                text = text.substring(closerIndex);
            }

            // Handle either tab character or space sequence by creating
            // an element for it, and then chopping out the text that
            // represented it in "text".
            if (closerIndex == tabIndex) {
                nodeVec.addElement(parentDoc.createElement(TAG_TAB_STOP));
                text = text.substring(1);  // tab is always a single character
                log("<TAB/>");
            } else {
                // Compute length of space sequence (at least two).
                int nrSpaces = 2;
                while ((nrSpaces < text.length())
                && (text.charAt(nrSpaces) == ' '))
                    nrSpaces++;

                Element spaceNode = parentDoc.createElement(TAG_SPACE);
                spaceNode.setAttribute(ATTRIBUTE_SPACE_COUNT, Integer.toString(nrSpaces));
                nodeVec.addElement(spaceNode);
                text = text.substring(nrSpaces);
                log("<SPACE count=\"" + nrSpaces + "\" />");
            }
        }

        // No more tabs or space sequences.  If there's any remaining
        // text create a text node for it.
        if (text.length() > 0) {
            nodeVec.addElement(parentDoc.createTextNode(text));
            log("<TEXT>");
            log(text);
            log("</TEXT>");
        }

        // Now create and populate an array to return the nodes in.
        Node nodes[] = new Node[nodeVec.size()];
        nodeVec.copyInto(nodes);
        return nodes;
    }


    /**
     *  Builds a <code>SxwDocument</code> from the decoded elements of
     *  a WordSmith document.
     *
     *  <p>Each <code>WsePara</code> element starts a new paragraph in
     *  the office body and each <code>WseTextRun</code> element adds
     *  text to the current paragraph.  Styles are looked up first in
     *  the catalog read from <code>origDoc</code> (if given) and then
     *  in the catalog of styles created during this conversion; a new
     *  style is created when the latter holds no match.  When
     *  <code>origDoc</code> is given, its font declarations and style
     *  sections are copied into the new document; otherwise a fixed
     *  set of font declarations is written.  Finally the newly
     *  created styles are added to the automatic styles of the new
     *  document.</p>
     *
     *  <p>If no style can be settled on for a paragraph or a text run
     *  (the catalogs hold several candidates), the element is written
     *  without a style name.</p>
     *
     *  @param  docName  <code>Document</code> name.
     *  @param  data     Elements decoded from the WordSmith document.
     *  @param  origDoc  Original <code>Document</code> whose styles
     *                   are reused, or <code>null</code> if there is
     *                   none.
     *
     *  @return  Resulting <code>SxwDocument</code> object.
     *
     *  @throws  IOException  If any I/O error occurs.
     */
    private SxwDocument buildDocument(String docName, Wse[] data, Document origDoc)
    throws IOException {

        // create minimum office xml document.
        SxwDocument sxwDoc = new SxwDocument(docName);
        sxwDoc.initContentDOM();

        org.w3c.dom.Document doc = sxwDoc.getContentDOM();

        // Grab hold of the office:body tag,
        // Assume there should be one.
        // This is where top level paragraphs will append to.
        NodeList list = doc.getElementsByTagName(TAG_OFFICE_BODY);
        Node bodyNode = list.item(0);

        styleCat = new StyleCatalog(50);
        oldStyleCat = new StyleCatalog(50);
        if (origDoc != null)
            readStyleCatalog(origDoc);

        Element currPara = null;
        ParaStyle currParaStyle = null;
        int newTextStyleNr = 0;
        int newParaStyleNr = 0;

        // Now write out the document body by running through
        // the list of WordSmith elements and processing each one
        // in turn.
        for (int i = 0; i < data.length; i++) {

            if (data[i].getClass() == WsePara.class) {

                currPara = doc.createElement(TAG_PARAGRAPH);
                log("</PARA>");
                log("<PARA>");

                WsePara p = (WsePara)data[i];

                // Save info about the first text run, if there is one.
                WseTextRun firstTextRun = null;

                if ((data.length >= i + 2)
                && (data[i+1].getClass() == WseTextRun.class))
                    firstTextRun = (WseTextRun)data[i+1];

                Style matches[] = oldStyleCat.getMatching(p.makeStyle());

                // See if we can find a unique match in the catalog
                // of existing styles from the original document.
                ParaStyle pStyle = null;
                if (matches.length == 1) {
                    pStyle = (ParaStyle)matches[0];
                    log("using an existing style");
                } else if ((matches.length > 1) && (firstTextRun != null)) {
                    pStyle = matchParaByText(matches, firstTextRun.makeStyle());
                    log("resolved a para by looking @ text");
                }

                // If nothing found so far, try looking in the catalog
                // of newly-created styles.
                // DJP FIXME: if we need to add two para styles with the
                // same para formatting info but different default text
                // styles, this won't work!
                if (pStyle == null) {
                    log("had " + matches.length + " matches in old catalog");
                    matches = styleCat.getMatching(p.makeStyle());
                    if (matches.length == 0) {
                        pStyle = p.makeStyle();
                        pStyle.setName("PPP" + ++newParaStyleNr);
                        styleCat.add(pStyle);
                        // DJP: write in the text format info here
                        log("created a new style");
                    } else if (matches.length == 1) {
                        pStyle = (ParaStyle)matches[0];
                        log("re-using a new style");
                    } else if (firstTextRun != null) {
                        pStyle = matchParaByText(matches, firstTextRun.makeStyle());
                        if (pStyle != null) {
                            log("resolved a (new) para by looking @ text");
                        } else {
                            log("Hey this shouldn't happen! - nr of matches is "
                            + matches.length);
                        }
                    }
                }

                // Specify the style in this paragraph's attributes.  If
                // the candidates could not be narrowed down to one, the
                // paragraph is written without a style name rather than
                // failing on the missing style.
                if (pStyle == null)
                    log("Unable to figure out a para style");
                else
                    currPara.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, pStyle.getName());

                bodyNode.appendChild(currPara);
                currParaStyle = pStyle;

            } else if (data[i].getClass() == WseTextRun.class) {
                WseTextRun tr = (WseTextRun)data[i];
                TextStyle trStyle = null;
                Node trNodes[] = parseText(tr.getText(), doc);

                // A text run that shows up before any paragraph element
                // has nowhere to go; open an unstyled paragraph for it.
                if (currPara == null) {
                    currPara = doc.createElement(TAG_PARAGRAPH);
                    bodyNode.appendChild(currPara);
                }

                // First see if the formatting of this text run matches
                // the default text formatting for this paragraph.  If
                // it does, then just make the text node(s) children of
                // the current paragraph.  Without a known paragraph
                // style there is nothing to compare against.
                if (currParaStyle != null) {
                    Style[] cps = new Style[1];
                    cps[0] = currParaStyle;
                    if (matchParaByText(cps, tr.makeStyle()) != null) {
                        for (int ii = 0; ii < trNodes.length; ii++) {
                            currPara.appendChild(trNodes[ii]);
                        }
                        continue;
                    }
                }

                // Check for existing, matching styles in the old style
                // catalog.  If exactly one is found, use it.  Otherwise,
                // check the new style catalog, and either use the style
                // found or add this new one to it.
                Style matches[] = oldStyleCat.getMatching(tr.makeStyle());
                if (matches.length == 1)
                    trStyle = (TextStyle)matches[0];
                else {
                    matches = styleCat.getMatching(tr.makeStyle());
                    if (matches.length == 0) {
                        trStyle = tr.makeStyle();
                        trStyle.setName("TTT" + ++newTextStyleNr);
                        styleCat.add(trStyle);
                    } else if (matches.length == 1)
                        trStyle = (TextStyle)matches[0];
                    else
                        log("multiple text style matches from new catalog");
                }

                // Create a text span node, set the style attribute (when
                // a style could be settled on), make the text node(s)
                // its children, and append it to current paragraph's
                // list of children.
                Element textSpanNode = doc.createElement(TAG_SPAN);
                if (trStyle != null)
                    textSpanNode.setAttribute(ATTRIBUTE_TEXT_STYLE_NAME, trStyle.getName());
                for (int ii = 0; ii < trNodes.length; ii++) {
                    textSpanNode.appendChild(trNodes[ii]);
                }
                currPara.appendChild(textSpanNode);
                log("</SPAN>");
            }

            else if (data[i].getClass() == WseFontTable.class) {
                fontTable = (WseFontTable)data[i];
            }

            else if (data[i].getClass() == WseColorTable.class) {
                colorTable = (WseColorTable)data[i];
            }
        }


        //NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT);
        NodeList r = doc.getElementsByTagName(TAG_OFFICE_DOCUMENT_CONTENT);
        Node rootNode = r.item(0);

        // read the original document
        if (origDoc != null) {
            java.io.ByteArrayOutputStream bos = new java.io.ByteArrayOutputStream();
            origDoc.write(bos);
            SxwDocument origSxwDoc = new SxwDocument("old");
            origSxwDoc.read(new ByteArrayInputStream(bos.toByteArray()));
            org.w3c.dom.Document origDomDoc = origSxwDoc.getContentDOM();

            XmlUtil xu = new XmlUtil();

            // Copy the font declarations and the style catalog sections
            // from the original document to the new document, in this
            // order, ahead of the body.
            String copiedSections[] = {
                TAG_OFFICE_FONT_DECLS,
                TAG_OFFICE_STYLES,
                TAG_OFFICE_AUTOMATIC_STYLES,
                TAG_OFFICE_MASTER_STYLES
            };
            for (int s = 0; s < copiedSections.length; s++) {
                Node origSection =
                    origDomDoc.getElementsByTagName(copiedSections[s]).item(0);

                // A section the original does not contain cannot be
                // cloned; leave it out of the new document as well.
                if (origSection == null)
                    continue;

                org.w3c.dom.DocumentFragment df = doc.createDocumentFragment();
                Node newNode = xu.deepClone(df, origSection);
                rootNode.insertBefore(newNode, bodyNode);
            }
        }

        // Original document not specified.  We need to add font declarations.
        // DJP: this might just be for debugging.  Merger will probably put
        // the "real" ones in.
        // DJP: if really doing it this way, do it right: gather font names
        // from style catalog(s).
        else {
            log("<FONT-DECLS/>");

            Node declNode = doc.createElement(TAG_OFFICE_FONT_DECLS);
            rootNode.insertBefore(declNode, bodyNode);

            Element fontNode = doc.createElement(TAG_STYLE_FONT_DECL);
            fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arial");
            fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arial");
            fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable");
            declNode.appendChild(fontNode);

            fontNode = doc.createElement(TAG_STYLE_FONT_DECL);
            fontNode.setAttribute(ATTRIBUTE_STYLE_NAME, "Arioso");
            fontNode.setAttribute(ATTRIBUTE_FO_FONT_FAMILY, "Arioso");
            fontNode.setAttribute(ATTRIBUTE_STYLE_FONT_PITCH, "variable");
            declNode.appendChild(fontNode);
        }


        // Now add any new styles we have created in this document.
        Node autoStylesNode =
            doc.getElementsByTagName(TAG_OFFICE_AUTOMATIC_STYLES).item(0);
        if (autoStylesNode == null) {
            autoStylesNode = doc.createElement(TAG_OFFICE_AUTOMATIC_STYLES);
            log("<OFFICE-AUTOMATIC-STYLES/>");
            rootNode.insertBefore(autoStylesNode, bodyNode);
        }

        // Appending a child to its new parent removes it from the old
        // one, so keep taking the first child until none are left.
        Node newStyleCatNode = styleCat.writeNode(doc, "dummy");
        while (newStyleCatNode.hasChildNodes()) {
            autoStylesNode.appendChild(newStyleCatNode.getFirstChild());
        }

        oldStyleCat.dumpCSV(true);
        styleCat.dumpCSV(true);
        return sxwDoc;
    }


    /**
     *  Sends message to the log object.  The message is passed to
     *  <code>Debug.log</code> with the <code>Debug.TRACE</code> flag.
     *
     *  @param  str  Debug message.
     */
    private void log(String str) {

         Debug.log(Debug.TRACE, str);
    }


    /*
    public static void main(String args[]) {

     //   DocumentDeserializerImpl d = new DocumentDeserializerImpl(new InputStream());

        Node nodes[] = parseText("Tab here:\tThen some more text");
    }
*/
}