xref: /trunk/main/vcl/aqua/source/dtrans/HtmlFmtFlt.cxx (revision cf6516809c57e1bb0a940545cca99cdad54d4ce2)
19f62ea84SAndrew Rist /**************************************************************
29f62ea84SAndrew Rist  *
39f62ea84SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
49f62ea84SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
59f62ea84SAndrew Rist  * distributed with this work for additional information
69f62ea84SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
79f62ea84SAndrew Rist  * to you under the Apache License, Version 2.0 (the
89f62ea84SAndrew Rist  * "License"); you may not use this file except in compliance
99f62ea84SAndrew Rist  * with the License.  You may obtain a copy of the License at
109f62ea84SAndrew Rist  *
119f62ea84SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
129f62ea84SAndrew Rist  *
139f62ea84SAndrew Rist  * Unless required by applicable law or agreed to in writing,
149f62ea84SAndrew Rist  * software distributed under the License is distributed on an
159f62ea84SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
169f62ea84SAndrew Rist  * KIND, either express or implied.  See the License for the
179f62ea84SAndrew Rist  * specific language governing permissions and limitations
189f62ea84SAndrew Rist  * under the License.
199f62ea84SAndrew Rist  *
209f62ea84SAndrew Rist  *************************************************************/
219f62ea84SAndrew Rist 
22cdf0e10cSrcweir #include "HtmlFmtFlt.hxx"
23cdf0e10cSrcweir 
24cdf0e10cSrcweir #include <rtl/string.h>
25cdf0e10cSrcweir 
26cdf0e10cSrcweir #include <string>
27cdf0e10cSrcweir #include <sstream>
28cdf0e10cSrcweir #include <vector>
29cdf0e10cSrcweir #include <iomanip>
30cdf0e10cSrcweir 
31cdf0e10cSrcweir #include <boost/assert.hpp>
32cdf0e10cSrcweir 
33cdf0e10cSrcweir using namespace com::sun::star::uno;
34cdf0e10cSrcweir 
35cdf0e10cSrcweir //------------------------------------------------------------------------------
36cdf0e10cSrcweir // converts the openoffice text/html clipboard format to the HTML Format
37cdf0e10cSrcweir // well known under MS Windows
38*499844abSJohn Bampton // the MS HTML Format has a header before the real HTML data
39cdf0e10cSrcweir //
// Version:1.0      Version number of the clipboard. Starting version is 0.9
41cdf0e10cSrcweir // StartHTML:       Byte count from the beginning of the clipboard to the start
42cdf0e10cSrcweir //                  of the context, or -1 if no context
43cdf0e10cSrcweir // EndHTML:         Byte count from the beginning of the clipboard to the end
44cdf0e10cSrcweir //                  of the context, or -1 if no context
45cdf0e10cSrcweir // StartFragment:   Byte count from the beginning of the clipboard to the
46cdf0e10cSrcweir //                  start of the fragment
47cdf0e10cSrcweir // EndFragment:     Byte count from the beginning of the clipboard to the
48cdf0e10cSrcweir //                  end of the fragment
49cdf0e10cSrcweir // StartSelection:  Byte count from the beginning of the clipboard to the
50cdf0e10cSrcweir //                  start of the selection
51cdf0e10cSrcweir // EndSelection:    Byte count from the beginning of the clipboard to the
52cdf0e10cSrcweir //                  end of the selection
53cdf0e10cSrcweir //
54cdf0e10cSrcweir // StartSelection and EndSelection are optional
55cdf0e10cSrcweir // The fragment should be preceded and followed by the HTML comments
// <!--StartFragment--> and <!--EndFragment--> (no space between !-- and the
// text)
58cdf0e10cSrcweir //------------------------------------------------------------------------------
59cdf0e10cSrcweir 
namespace // private
{

// Writes one header line of the form "<key><offset>\r\n" where the offset is
// printed as a decimal number, zero-padded on the left to a width of 10.
void appendOffsetLine(std::ostringstream& out, const char* key, size_t offset)
{
    out << key;
    out << std::setw(10) << std::setfill('0') << std::dec << offset;
    out << "\r\n";
}

// Builds the textual header of the MS "HTML Format" clipboard flavor.
//
// startHtml / endHtml          offsets written to StartHTML / EndHTML
// startFragment / endFragment  offsets written to StartFragment / EndFragment
//
// Returns the header: the "Version:1.0" line followed by the four offset
// lines, each terminated by CR LF.
std::string GetHtmlFormatHeader(size_t startHtml, size_t endHtml, size_t startFragment, size_t endFragment)
{
    std::ostringstream header;
    header << "Version:1.0" << "\r\n";
    appendOffsetLine(header, "StartHTML:", startHtml);
    appendOffsetLine(header, "EndHTML:", endHtml);
    appendOffsetLine(header, "StartFragment:", startFragment);
    appendOffsetLine(header, "EndFragment:", endFragment);
    return header.str();
}

} // namespace private
74cdf0e10cSrcweir 
75cdf0e10cSrcweir 
// The office always writes the opening and closing HTML tags in upper case
// and without spaces; neither tag carries parameters, so the complete tag
// text can be searched for.
const std::string TAG_HTML("<HTML>");
const std::string TAG_END_HTML("</HTML>");

// The body tag may carry parameters (e.g. <BODY param>, #92840#), so only the
// tag prefix is stored here and the closing '>' has to be located separately.
const std::string TAG_BODY("<BODY");
const std::string TAG_END_BODY("</BODY");
85cdf0e10cSrcweir 
TextHtmlToHTMLFormat(Sequence<sal_Int8> & aTextHtml)86cdf0e10cSrcweir Sequence<sal_Int8> SAL_CALL TextHtmlToHTMLFormat(Sequence<sal_Int8>& aTextHtml)
87cdf0e10cSrcweir {
88cdf0e10cSrcweir     OSL_ASSERT(aTextHtml.getLength() > 0);
89cdf0e10cSrcweir 
90cdf0e10cSrcweir     if (!(aTextHtml.getLength() > 0))
91cdf0e10cSrcweir         return Sequence<sal_Int8>();
92cdf0e10cSrcweir 
93cdf0e10cSrcweir     // fill the buffer with dummy values to calc the exact length
94cdf0e10cSrcweir     std::string dummyHtmlHeader = GetHtmlFormatHeader(0, 0, 0, 0);
95cdf0e10cSrcweir     size_t lHtmlFormatHeader = dummyHtmlHeader.length();
96cdf0e10cSrcweir 
97cdf0e10cSrcweir     std::string textHtml(
98cdf0e10cSrcweir         reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()),
99cdf0e10cSrcweir         reinterpret_cast<const sal_Char*>(aTextHtml.getConstArray()) + aTextHtml.getLength());
100cdf0e10cSrcweir 
101cdf0e10cSrcweir     std::string::size_type nStartHtml = textHtml.find(TAG_HTML) + lHtmlFormatHeader - 1; // we start one before '<HTML>' Word 2000 does also so
102cdf0e10cSrcweir     std::string::size_type nEndHtml = textHtml.find(TAG_END_HTML) + lHtmlFormatHeader + TAG_END_HTML.length() + 1; // our SOffice 5.2 wants 2 behind </HTML>?
103cdf0e10cSrcweir 
104cdf0e10cSrcweir     // The body tag may have parameters so we need to search for the
105cdf0e10cSrcweir     // closing '>' manually e.g. <BODY param> #92840#
106cdf0e10cSrcweir     std::string::size_type nStartFragment = textHtml.find(">", textHtml.find(TAG_BODY)) + lHtmlFormatHeader + 1;
107cdf0e10cSrcweir     std::string::size_type nEndFragment = textHtml.find(TAG_END_BODY) + lHtmlFormatHeader;
108cdf0e10cSrcweir 
109cdf0e10cSrcweir     std::string htmlFormat = GetHtmlFormatHeader(nStartHtml, nEndHtml, nStartFragment, nEndFragment);
110cdf0e10cSrcweir     htmlFormat += textHtml;
111cdf0e10cSrcweir 
112cdf0e10cSrcweir     Sequence<sal_Int8> byteSequence(htmlFormat.length() + 1); // space the trailing '\0'
113cdf0e10cSrcweir     rtl_zeroMemory(byteSequence.getArray(), byteSequence.getLength());
114cdf0e10cSrcweir 
115cdf0e10cSrcweir     rtl_copyMemory(
116cdf0e10cSrcweir         static_cast<void*>(byteSequence.getArray()),
117cdf0e10cSrcweir         static_cast<const void*>(htmlFormat.c_str()),
118cdf0e10cSrcweir         htmlFormat.length());
119cdf0e10cSrcweir 
120cdf0e10cSrcweir     return byteSequence;
121cdf0e10cSrcweir }
122cdf0e10cSrcweir 
123cdf0e10cSrcweir const char* HtmlStartTag = "<html";
124cdf0e10cSrcweir 
HTMLFormatToTextHtml(const Sequence<sal_Int8> & aHTMLFormat)125cdf0e10cSrcweir Sequence<sal_Int8> HTMLFormatToTextHtml(const Sequence<sal_Int8>& aHTMLFormat)
126cdf0e10cSrcweir {
127cdf0e10cSrcweir   BOOST_ASSERT(isHTMLFormat(aHTMLFormat) && "No HTML Format provided");
128cdf0e10cSrcweir 
129cdf0e10cSrcweir   Sequence<sal_Int8>& nonconstHTMLFormatRef = const_cast< Sequence<sal_Int8>& >(aHTMLFormat);
130cdf0e10cSrcweir   sal_Char* dataStart = reinterpret_cast<sal_Char*>(nonconstHTMLFormatRef.getArray());
131cdf0e10cSrcweir   sal_Char* dataEnd = dataStart + nonconstHTMLFormatRef.getLength() - 1;
132cdf0e10cSrcweir   const sal_Char* htmlStartTag = strcasestr(dataStart, HtmlStartTag);
133cdf0e10cSrcweir 
134cdf0e10cSrcweir   BOOST_ASSERT(htmlStartTag && "Seems to be no HTML at all");
135cdf0e10cSrcweir 
136cdf0e10cSrcweir   // It doesn't seem to be HTML? Well then simply return what has been
137cdf0e10cSrcweir   // provided in non-debug builds
138cdf0e10cSrcweir   if (htmlStartTag == NULL)
139cdf0e10cSrcweir     {
140cdf0e10cSrcweir     return aHTMLFormat;
141cdf0e10cSrcweir     }
142cdf0e10cSrcweir 
143cdf0e10cSrcweir   sal_Int32 len = dataEnd - htmlStartTag;
144cdf0e10cSrcweir   Sequence<sal_Int8> plainHtmlData(len);
145cdf0e10cSrcweir 
146cdf0e10cSrcweir   rtl_copyMemory(static_cast<void*>(plainHtmlData.getArray()), htmlStartTag, len);
147cdf0e10cSrcweir 
148cdf0e10cSrcweir   return plainHtmlData;
149cdf0e10cSrcweir }
150cdf0e10cSrcweir 
151cdf0e10cSrcweir /* A simple format detection. We are just comparing the first few bytes
152cdf0e10cSrcweir    of the provided byte sequence to see whether or not it is the MS
153cdf0e10cSrcweir    Office Html format. If it shows that this is not reliable enough we
154cdf0e10cSrcweir    can improve this
155cdf0e10cSrcweir */
156cdf0e10cSrcweir const char HtmlFormatStart[] = "Version:";
157cdf0e10cSrcweir int HtmlFormatStartLen = (sizeof(HtmlFormatStart) - 1);
158cdf0e10cSrcweir 
isHTMLFormat(const Sequence<sal_Int8> & aHtmlSequence)159cdf0e10cSrcweir bool isHTMLFormat(const Sequence<sal_Int8>& aHtmlSequence)
160cdf0e10cSrcweir {
161cdf0e10cSrcweir   if (aHtmlSequence.getLength() < HtmlFormatStartLen)
162cdf0e10cSrcweir     return false;
163cdf0e10cSrcweir 
164cdf0e10cSrcweir   return rtl_str_compareIgnoreAsciiCase_WithLength(HtmlFormatStart,
165cdf0e10cSrcweir                                                    HtmlFormatStartLen,
166cdf0e10cSrcweir                                                    reinterpret_cast<const sal_Char*>(aHtmlSequence.getConstArray()),
167cdf0e10cSrcweir                                                    HtmlFormatStartLen) == 0;
168cdf0e10cSrcweir }
169