xref: /trunk/main/svtools/source/svhtml/parhtml.cxx (revision 8d6213619fba521a89bc75695f8047b6b7239ec8)
1cdf0e10cSrcweir /*************************************************************************
2cdf0e10cSrcweir  *
3cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4cdf0e10cSrcweir  *
5cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6cdf0e10cSrcweir  *
7cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8cdf0e10cSrcweir  *
9cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10cdf0e10cSrcweir  *
11cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14cdf0e10cSrcweir  *
15cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20cdf0e10cSrcweir  *
21cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25cdf0e10cSrcweir  *
26cdf0e10cSrcweir  ************************************************************************/
27cdf0e10cSrcweir 
28cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
29cdf0e10cSrcweir #include "precompiled_svtools.hxx"
30cdf0e10cSrcweir 
31cdf0e10cSrcweir #include <ctype.h>
32cdf0e10cSrcweir #include <stdio.h>
33cdf0e10cSrcweir #include <tools/stream.hxx>
34cdf0e10cSrcweir #include <tools/debug.hxx>
35cdf0e10cSrcweir #include <tools/color.hxx>
36cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
37cdf0e10cSrcweir #include <rtl/strbuf.hxx>
38cdf0e10cSrcweir #ifndef _SVSTDARR_HXX
39cdf0e10cSrcweir #define _SVSTDARR_ULONGS
40cdf0e10cSrcweir #include <svl/svstdarr.hxx>
41cdf0e10cSrcweir #endif
42cdf0e10cSrcweir 
43cdf0e10cSrcweir #include <tools/tenccvt.hxx>
44cdf0e10cSrcweir #include <tools/datetime.hxx>
45cdf0e10cSrcweir #include <svl/inettype.hxx>
46cdf0e10cSrcweir #include <comphelper/string.hxx>
47cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp>
48cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp>
49cdf0e10cSrcweir 
50cdf0e10cSrcweir #include <svtools/parhtml.hxx>
51cdf0e10cSrcweir #include <svtools/htmltokn.h>
52cdf0e10cSrcweir #include <svtools/htmlkywd.hxx>
53cdf0e10cSrcweir 
54cdf0e10cSrcweir 
55cdf0e10cSrcweir using namespace ::com::sun::star;
56cdf0e10cSrcweir 
57cdf0e10cSrcweir 
58cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L );
59cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ];
60cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 );
61cdf0e10cSrcweir 
62cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L );
63cdf0e10cSrcweir 
64cdf0e10cSrcweir /*  */
65cdf0e10cSrcweir 
// Tables for converting option value strings into their enum values
67cdf0e10cSrcweir 
68cdf0e10cSrcweir // <INPUT TYPE=xxx>
69cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] =
70cdf0e10cSrcweir {
71cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_text,      HTML_IT_TEXT        },
72cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_password,  HTML_IT_PASSWORD    },
73cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_checkbox,  HTML_IT_CHECKBOX    },
74cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_radio,     HTML_IT_RADIO       },
75cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_range,     HTML_IT_RANGE       },
76cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_scribble,  HTML_IT_SCRIBBLE    },
77cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_file,      HTML_IT_FILE        },
78cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_hidden,    HTML_IT_HIDDEN      },
79cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_submit,    HTML_IT_SUBMIT      },
80cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_image,     HTML_IT_IMAGE       },
81cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_reset,     HTML_IT_RESET       },
82cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_IT_button,    HTML_IT_BUTTON      },
83cdf0e10cSrcweir     { 0,                    0                   }
84cdf0e10cSrcweir };
85cdf0e10cSrcweir 
86cdf0e10cSrcweir // <TABLE FRAME=xxx>
87cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] =
88cdf0e10cSrcweir {
89cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_void,  HTML_TF_VOID    },
90cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE   },
91cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW   },
92cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_hsides,    HTML_TF_HSIDES  },
93cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_lhs,       HTML_TF_LHS     },
94cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_rhs,       HTML_TF_RHS     },
95cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_vsides,    HTML_TF_VSIDES  },
96cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_box,       HTML_TF_BOX     },
97cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TF_border,    HTML_TF_BOX     },
98cdf0e10cSrcweir     { 0,                0               }
99cdf0e10cSrcweir };
100cdf0e10cSrcweir 
101cdf0e10cSrcweir // <TABLE RULES=xxx>
102cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] =
103cdf0e10cSrcweir {
104cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_none,  HTML_TR_NONE    },
105cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_groups,    HTML_TR_GROUPS  },
106cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_rows,  HTML_TR_ROWS    },
107cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_cols,  HTML_TR_COLS    },
108cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_TR_all,       HTML_TR_ALL     },
109cdf0e10cSrcweir     { 0,                0               }
110cdf0e10cSrcweir };
111cdf0e10cSrcweir 
112cdf0e10cSrcweir 
113cdf0e10cSrcweir SV_IMPL_PTRARR(HTMLOptions,HTMLOptionPtr)
114cdf0e10cSrcweir 
115cdf0e10cSrcweir /*  */
116cdf0e10cSrcweir 
117cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
118cdf0e10cSrcweir {
119cdf0e10cSrcweir     sal_uInt16 nValue = nDflt;
120cdf0e10cSrcweir 
121cdf0e10cSrcweir     while( pOptEnums->pName )
122cdf0e10cSrcweir         if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
123cdf0e10cSrcweir             break;
124cdf0e10cSrcweir         else
125cdf0e10cSrcweir             pOptEnums++;
126cdf0e10cSrcweir 
127cdf0e10cSrcweir     if( pOptEnums->pName )
128cdf0e10cSrcweir         nValue = pOptEnums->nValue;
129cdf0e10cSrcweir 
130cdf0e10cSrcweir     return nValue;
131cdf0e10cSrcweir }
132cdf0e10cSrcweir 
133cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
134cdf0e10cSrcweir {
135cdf0e10cSrcweir     while( pOptEnums->pName )
136cdf0e10cSrcweir     {
137cdf0e10cSrcweir         if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
138cdf0e10cSrcweir             break;
139cdf0e10cSrcweir         else
140cdf0e10cSrcweir             pOptEnums++;
141cdf0e10cSrcweir     }
142cdf0e10cSrcweir 
143cdf0e10cSrcweir     const sal_Char *pName = pOptEnums->pName;
144cdf0e10cSrcweir     if( pName )
145cdf0e10cSrcweir         rEnum = pOptEnums->nValue;
146cdf0e10cSrcweir 
147cdf0e10cSrcweir     return (pName != 0);
148cdf0e10cSrcweir }
149cdf0e10cSrcweir 
150cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken,
151cdf0e10cSrcweir                         const String& rValue )
152cdf0e10cSrcweir     : aValue(rValue)
153cdf0e10cSrcweir     , aToken(rToken)
154cdf0e10cSrcweir     , nToken( nTok )
155cdf0e10cSrcweir {
156cdf0e10cSrcweir     DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
157cdf0e10cSrcweir         "HTMLOption: unbekanntes Token" );
158cdf0e10cSrcweir }
159cdf0e10cSrcweir 
160cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const
161cdf0e10cSrcweir {
162cdf0e10cSrcweir     DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
163cdf0e10cSrcweir                  nToken<HTML_OPTION_NUMBER_END) ||
164cdf0e10cSrcweir                 (nToken>=HTML_OPTION_CONTEXT_START &&
165cdf0e10cSrcweir                  nToken<HTML_OPTION_CONTEXT_END) ||
166cdf0e10cSrcweir                 nToken==HTML_O_VALUE,
167cdf0e10cSrcweir         "GetNumber: Option ist nicht numerisch" );
168cdf0e10cSrcweir     String aTmp( aValue );
169cdf0e10cSrcweir     aTmp.EraseLeadingChars();
170cdf0e10cSrcweir     sal_Int32 nTmp = aTmp.ToInt32();
171cdf0e10cSrcweir     return nTmp >= 0 ? (sal_uInt32)nTmp : 0;
172cdf0e10cSrcweir }
173cdf0e10cSrcweir 
174cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const
175cdf0e10cSrcweir {
176cdf0e10cSrcweir     DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
177cdf0e10cSrcweir                 (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
178cdf0e10cSrcweir         "GetSNumber: Option ist nicht numerisch" );
179cdf0e10cSrcweir     String aTmp( aValue );
180cdf0e10cSrcweir     aTmp.EraseLeadingChars();
181cdf0e10cSrcweir     return aTmp.ToInt32();
182cdf0e10cSrcweir }
183cdf0e10cSrcweir 
184cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const
185cdf0e10cSrcweir {
186cdf0e10cSrcweir     if( rLongs.Count() )
187cdf0e10cSrcweir         rLongs.Remove( 0, rLongs.Count() );
188cdf0e10cSrcweir 
189cdf0e10cSrcweir     if( bSpaceDelim )
190cdf0e10cSrcweir     {
191cdf0e10cSrcweir         // das ist ein sehr stark vereinfachter Scanner. Er sucht einfach
192cdf0e10cSrcweir         // alle Tiffern aus dem String
193cdf0e10cSrcweir         sal_Bool bInNum = sal_False;
194cdf0e10cSrcweir         sal_uLong nNum = 0;
195cdf0e10cSrcweir         for( xub_StrLen i=0; i<aValue.Len(); i++ )
196cdf0e10cSrcweir         {
197cdf0e10cSrcweir             register sal_Unicode c = aValue.GetChar( i );
198cdf0e10cSrcweir             if( c>='0' && c<='9' )
199cdf0e10cSrcweir             {
200cdf0e10cSrcweir                 nNum *= 10;
201cdf0e10cSrcweir                 nNum += (c - '0');
202cdf0e10cSrcweir                 bInNum = sal_True;
203cdf0e10cSrcweir             }
204cdf0e10cSrcweir             else if( bInNum )
205cdf0e10cSrcweir             {
206cdf0e10cSrcweir                 rLongs.Insert( nNum, rLongs.Count() );
207cdf0e10cSrcweir                 bInNum = sal_False;
208cdf0e10cSrcweir                 nNum = 0;
209cdf0e10cSrcweir             }
210cdf0e10cSrcweir         }
211cdf0e10cSrcweir         if( bInNum )
212cdf0e10cSrcweir         {
213cdf0e10cSrcweir             rLongs.Insert( nNum, rLongs.Count() );
214cdf0e10cSrcweir         }
215cdf0e10cSrcweir     }
216cdf0e10cSrcweir     else
217cdf0e10cSrcweir     {
218cdf0e10cSrcweir         // hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet
219cdf0e10cSrcweir         // und auch mal eine 0 eingefuegt
220cdf0e10cSrcweir         xub_StrLen nPos = 0;
221cdf0e10cSrcweir         while( nPos < aValue.Len() )
222cdf0e10cSrcweir         {
223cdf0e10cSrcweir             register sal_Unicode c;
224cdf0e10cSrcweir             while( nPos < aValue.Len() &&
225cdf0e10cSrcweir                    ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' ||
226cdf0e10cSrcweir                    c == '\n' || c== '\r' ) )
227cdf0e10cSrcweir                 nPos++;
228cdf0e10cSrcweir 
229cdf0e10cSrcweir             if( nPos==aValue.Len() )
230cdf0e10cSrcweir                 rLongs.Insert( sal_uLong(0), rLongs.Count() );
231cdf0e10cSrcweir             else
232cdf0e10cSrcweir             {
233cdf0e10cSrcweir                 xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos );
234cdf0e10cSrcweir                 if( STRING_NOTFOUND==nEnd )
235cdf0e10cSrcweir                 {
236cdf0e10cSrcweir                     sal_Int32 nTmp = aValue.Copy(nPos).ToInt32();
237cdf0e10cSrcweir                     rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
238cdf0e10cSrcweir                                    rLongs.Count() );
239cdf0e10cSrcweir                     nPos = aValue.Len();
240cdf0e10cSrcweir                 }
241cdf0e10cSrcweir                 else
242cdf0e10cSrcweir                 {
243cdf0e10cSrcweir                     sal_Int32 nTmp =
244cdf0e10cSrcweir                         aValue.Copy(nPos,nEnd-nPos).ToInt32();
245cdf0e10cSrcweir                     rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
246cdf0e10cSrcweir                                    rLongs.Count() );
247cdf0e10cSrcweir                     nPos = nEnd+1;
248cdf0e10cSrcweir                 }
249cdf0e10cSrcweir             }
250cdf0e10cSrcweir         }
251cdf0e10cSrcweir     }
252cdf0e10cSrcweir }
253cdf0e10cSrcweir 
254cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const
255cdf0e10cSrcweir {
256cdf0e10cSrcweir     DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
257cdf0e10cSrcweir         "GetColor: Option spezifiziert keine Farbe" );
258cdf0e10cSrcweir 
259cdf0e10cSrcweir     String aTmp( aValue );
260cdf0e10cSrcweir     aTmp.ToUpperAscii();
261cdf0e10cSrcweir     sal_uLong nColor = ULONG_MAX;
262cdf0e10cSrcweir     if( '#'!=aTmp.GetChar( 0 ) )
263cdf0e10cSrcweir         nColor = GetHTMLColor( aTmp );
264cdf0e10cSrcweir 
265cdf0e10cSrcweir     if( ULONG_MAX == nColor )
266cdf0e10cSrcweir     {
267cdf0e10cSrcweir         nColor = 0;
268cdf0e10cSrcweir         xub_StrLen nPos = 0;
269cdf0e10cSrcweir         for( sal_uInt32 i=0; i<6; i++ )
270cdf0e10cSrcweir         {
271cdf0e10cSrcweir             // MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt,
272cdf0e10cSrcweir             // maximal drei Zeichen, die kleiner als '0' sind werden
273cdf0e10cSrcweir             // ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich
274cdf0e10cSrcweir             // irgendwelche HTML-Autoren noch so einfallen lassen...
275cdf0e10cSrcweir             register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ )
276cdf0e10cSrcweir                                                      : '0';
277cdf0e10cSrcweir             if( c < '0' )
278cdf0e10cSrcweir             {
279cdf0e10cSrcweir                 c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
280cdf0e10cSrcweir                 if( c < '0' )
281cdf0e10cSrcweir                     c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
282cdf0e10cSrcweir             }
283cdf0e10cSrcweir             nColor *= 16;
284cdf0e10cSrcweir             if( c >= '0' && c <= '9' )
285cdf0e10cSrcweir                 nColor += (c - 48);
286cdf0e10cSrcweir             else if( c >= 'A' && c <= 'F' )
287cdf0e10cSrcweir                 nColor += (c - 55);
288cdf0e10cSrcweir         }
289cdf0e10cSrcweir     }
290cdf0e10cSrcweir 
291cdf0e10cSrcweir     rColor.SetRed(   (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
292cdf0e10cSrcweir     rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
293cdf0e10cSrcweir     rColor.SetBlue(  (sal_uInt8)(nColor & 0x000000ff) );
294cdf0e10cSrcweir }
295cdf0e10cSrcweir 
296cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const
297cdf0e10cSrcweir {
298cdf0e10cSrcweir     DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" );
299cdf0e10cSrcweir     return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
300cdf0e10cSrcweir }
301cdf0e10cSrcweir 
302cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const
303cdf0e10cSrcweir {
304cdf0e10cSrcweir     DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" );
305cdf0e10cSrcweir     return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
306cdf0e10cSrcweir }
307cdf0e10cSrcweir 
308cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const
309cdf0e10cSrcweir {
310cdf0e10cSrcweir     DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" );
311cdf0e10cSrcweir     return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
312cdf0e10cSrcweir }
313cdf0e10cSrcweir 
314cdf0e10cSrcweir /*  */
315cdf0e10cSrcweir 
316cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc )
317cdf0e10cSrcweir     : SvParser( rIn )
318cdf0e10cSrcweir {
319cdf0e10cSrcweir     bNewDoc = bReadNewDoc;
320cdf0e10cSrcweir     bReadListing = bReadXMP = bReadPRE = bReadTextArea =
321cdf0e10cSrcweir         bReadScript = bReadStyle =
322cdf0e10cSrcweir         bEndTokenFound = bIsInBody = bReadNextChar =
323cdf0e10cSrcweir         bReadComment = sal_False;
324cdf0e10cSrcweir     bIsInHeader = sal_True;
325cdf0e10cSrcweir     pOptions = new HTMLOptions;
326*8d621361SPedro Giffuni 
327*8d621361SPedro Giffuni     //#i76649, default to UTF-8 for HTML unless we know differently
328*8d621361SPedro Giffuni     SetSrcEncoding(RTL_TEXTENCODING_UTF8);
329cdf0e10cSrcweir }
330cdf0e10cSrcweir 
331cdf0e10cSrcweir HTMLParser::~HTMLParser()
332cdf0e10cSrcweir {
333cdf0e10cSrcweir     if( pOptions && pOptions->Count() )
334cdf0e10cSrcweir         pOptions->DeleteAndDestroy( 0, pOptions->Count() );
335cdf0e10cSrcweir     delete pOptions;
336cdf0e10cSrcweir }
337cdf0e10cSrcweir 
338cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser()
339cdf0e10cSrcweir {
340cdf0e10cSrcweir     eState = SVPAR_WORKING;
341cdf0e10cSrcweir     nNextCh = GetNextChar();
342cdf0e10cSrcweir     SaveState( 0 );
343cdf0e10cSrcweir 
344cdf0e10cSrcweir     nPre_LinePos = 0;
345cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
346cdf0e10cSrcweir 
347cdf0e10cSrcweir     AddRef();
348cdf0e10cSrcweir     Continue( 0 );
349cdf0e10cSrcweir     if( SVPAR_PENDING != eState )
350cdf0e10cSrcweir         ReleaseRef();       // dann brauchen wir den Parser nicht mehr!
351cdf0e10cSrcweir 
352cdf0e10cSrcweir     return eState;
353cdf0e10cSrcweir }
354cdf0e10cSrcweir 
355cdf0e10cSrcweir void HTMLParser::Continue( int nToken )
356cdf0e10cSrcweir {
357cdf0e10cSrcweir     if( !nToken )
358cdf0e10cSrcweir         nToken = GetNextToken();
359cdf0e10cSrcweir 
360cdf0e10cSrcweir     while( IsParserWorking() )
361cdf0e10cSrcweir     {
362cdf0e10cSrcweir         SaveState( nToken );
363cdf0e10cSrcweir         nToken = FilterToken( nToken );
364cdf0e10cSrcweir 
365cdf0e10cSrcweir         if( nToken )
366cdf0e10cSrcweir             NextToken( nToken );
367cdf0e10cSrcweir 
368cdf0e10cSrcweir         if( IsParserWorking() )
369cdf0e10cSrcweir             SaveState( 0 );         // bis hierhin abgearbeitet,
370cdf0e10cSrcweir                                     // weiter mit neuem Token!
371cdf0e10cSrcweir         nToken = GetNextToken();
372cdf0e10cSrcweir     }
373cdf0e10cSrcweir }
374cdf0e10cSrcweir 
375cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken )
376cdf0e10cSrcweir {
377cdf0e10cSrcweir     switch( nToken )
378cdf0e10cSrcweir     {
379cdf0e10cSrcweir     case sal_Unicode(EOF):
380cdf0e10cSrcweir         nToken = 0;
381cdf0e10cSrcweir         break;          // nicht verschicken
382cdf0e10cSrcweir 
383cdf0e10cSrcweir     case HTML_HEAD_OFF:
384cdf0e10cSrcweir         bIsInBody = sal_True;
385cdf0e10cSrcweir     case HTML_HEAD_ON:
386cdf0e10cSrcweir         bIsInHeader = HTML_HEAD_ON == nToken;
387cdf0e10cSrcweir         break;
388cdf0e10cSrcweir 
389cdf0e10cSrcweir     case HTML_BODY_ON:
390cdf0e10cSrcweir     case HTML_FRAMESET_ON:
391cdf0e10cSrcweir         bIsInHeader = sal_False;
392cdf0e10cSrcweir         bIsInBody = HTML_BODY_ON == nToken;
393cdf0e10cSrcweir         break;
394cdf0e10cSrcweir 
395cdf0e10cSrcweir     case HTML_BODY_OFF:
396cdf0e10cSrcweir         bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False;
397cdf0e10cSrcweir         break;
398cdf0e10cSrcweir 
399cdf0e10cSrcweir     case HTML_HTML_OFF:
400cdf0e10cSrcweir         nToken = 0;
401cdf0e10cSrcweir         bReadPRE = bReadListing = bReadXMP = sal_False;
402cdf0e10cSrcweir         break;      // HTML_ON wurde auch nicht verschickt !
403cdf0e10cSrcweir 
404cdf0e10cSrcweir     case HTML_PREFORMTXT_ON:
405cdf0e10cSrcweir         StartPRE();
406cdf0e10cSrcweir         break;
407cdf0e10cSrcweir 
408cdf0e10cSrcweir     case HTML_PREFORMTXT_OFF:
409cdf0e10cSrcweir         FinishPRE();
410cdf0e10cSrcweir         break;
411cdf0e10cSrcweir 
412cdf0e10cSrcweir     case HTML_LISTING_ON:
413cdf0e10cSrcweir         StartListing();
414cdf0e10cSrcweir         break;
415cdf0e10cSrcweir 
416cdf0e10cSrcweir     case HTML_LISTING_OFF:
417cdf0e10cSrcweir         FinishListing();
418cdf0e10cSrcweir         break;
419cdf0e10cSrcweir 
420cdf0e10cSrcweir     case HTML_XMP_ON:
421cdf0e10cSrcweir         StartXMP();
422cdf0e10cSrcweir         break;
423cdf0e10cSrcweir 
424cdf0e10cSrcweir     case HTML_XMP_OFF:
425cdf0e10cSrcweir         FinishXMP();
426cdf0e10cSrcweir         break;
427cdf0e10cSrcweir 
428cdf0e10cSrcweir     default:
429cdf0e10cSrcweir         if( bReadPRE )
430cdf0e10cSrcweir             nToken = FilterPRE( nToken );
431cdf0e10cSrcweir         else if( bReadListing )
432cdf0e10cSrcweir             nToken = FilterListing( nToken );
433cdf0e10cSrcweir         else if( bReadXMP )
434cdf0e10cSrcweir             nToken = FilterXMP( nToken );
435cdf0e10cSrcweir 
436cdf0e10cSrcweir         break;
437cdf0e10cSrcweir     }
438cdf0e10cSrcweir 
439cdf0e10cSrcweir     return nToken;
440cdf0e10cSrcweir }
441cdf0e10cSrcweir 
// ASCII-only character classification helpers used by the scanner.
// Every use of the argument is parenthesized so that expressions such as
// "ch & 0xff" or a conditional expression are classified as a whole.
// The argument is evaluated more than once - do not pass expressions with
// side effects.
#define HTML_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define HTML_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
#define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) )
// blank plus the control characters 0x09..0x0d
#define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) )
// everything from 32 upwards except 127
#define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 )
// --> OD 2006-07-26 #138464#
#define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || ((c) >= 'A' && (c) <= 'F') || ((c) >= 'a' && (c) <= 'f') )
// <--
450cdf0e10cSrcweir 
451cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak )
452cdf0e10cSrcweir {
453cdf0e10cSrcweir     ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
454cdf0e10cSrcweir     int bWeiter = sal_True;
455cdf0e10cSrcweir     int bEqSignFound = sal_False;
456cdf0e10cSrcweir     sal_Unicode cQuote = 0U;
457cdf0e10cSrcweir 
458cdf0e10cSrcweir     while( bWeiter && IsParserWorking() )
459cdf0e10cSrcweir     {
460cdf0e10cSrcweir         int bNextCh = sal_True;
461cdf0e10cSrcweir         switch( nNextCh )
462cdf0e10cSrcweir         {
463cdf0e10cSrcweir         case '&':
464cdf0e10cSrcweir             bEqSignFound = sal_False;
465cdf0e10cSrcweir             if( bReadXMP )
466cdf0e10cSrcweir                 sTmpBuffer.append( (sal_Unicode)'&' );
467cdf0e10cSrcweir             else
468cdf0e10cSrcweir             {
469cdf0e10cSrcweir                 sal_uLong nStreamPos = rInput.Tell();
470cdf0e10cSrcweir                 sal_uLong nLinePos = GetLinePos();
471cdf0e10cSrcweir 
472cdf0e10cSrcweir                 sal_Unicode cChar = 0U;
473cdf0e10cSrcweir                 if( '#' == (nNextCh = GetNextChar()) )
474cdf0e10cSrcweir                 {
475cdf0e10cSrcweir                     nNextCh = GetNextChar();
476cdf0e10cSrcweir                     // --> OD 2006-07-26 #138464#
477cdf0e10cSrcweir                     // consider hexadecimal digits
478cdf0e10cSrcweir                     const sal_Bool bIsHex( 'x' == nNextCh );
479cdf0e10cSrcweir                     const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
480cdf0e10cSrcweir                     if ( bIsDecOrHex )
481cdf0e10cSrcweir                     {
482cdf0e10cSrcweir                         if ( bIsHex )
483cdf0e10cSrcweir                         {
484cdf0e10cSrcweir                             nNextCh = GetNextChar();
485cdf0e10cSrcweir                             while ( HTML_ISHEXDIGIT(nNextCh) )
486cdf0e10cSrcweir                             {
487cdf0e10cSrcweir                                 cChar = cChar * 16U +
488cdf0e10cSrcweir                                         ( nNextCh <= '9'
489cdf0e10cSrcweir                                           ? sal_Unicode( nNextCh - '0' )
490cdf0e10cSrcweir                                           : ( nNextCh <= 'F'
491cdf0e10cSrcweir                                               ? sal_Unicode( nNextCh - 'A' + 10 )
492cdf0e10cSrcweir                                               : sal_Unicode( nNextCh - 'a' + 10 ) ) );
493cdf0e10cSrcweir                                 nNextCh = GetNextChar();
494cdf0e10cSrcweir                             }
495cdf0e10cSrcweir                         }
496cdf0e10cSrcweir                         else
497cdf0e10cSrcweir                         {
498cdf0e10cSrcweir                             do
499cdf0e10cSrcweir                             {
500cdf0e10cSrcweir                                 cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
501cdf0e10cSrcweir                                 nNextCh = GetNextChar();
502cdf0e10cSrcweir                             }
503cdf0e10cSrcweir                             while( HTML_ISDIGIT(nNextCh) );
504cdf0e10cSrcweir                         }
505cdf0e10cSrcweir 
506cdf0e10cSrcweir                         if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
507cdf0e10cSrcweir                             RTL_TEXTENCODING_UCS2 != eSrcEnc &&
508cdf0e10cSrcweir                             RTL_TEXTENCODING_UTF8 != eSrcEnc &&
509cdf0e10cSrcweir                             cChar < 256 )
510cdf0e10cSrcweir                         {
511cdf0e10cSrcweir                             sal_Unicode cOrig = cChar;
512cdf0e10cSrcweir                             cChar = ByteString::ConvertToUnicode(
513cdf0e10cSrcweir                                             (sal_Char)cChar, eSrcEnc );
514cdf0e10cSrcweir                             if( 0U == cChar )
515cdf0e10cSrcweir                             {
516cdf0e10cSrcweir                                 // #73398#: If the character could not be
517cdf0e10cSrcweir                                 // converted, because a conversion is not
518cdf0e10cSrcweir                                 // available, do no conversion at all.
519cdf0e10cSrcweir                                 cChar = cOrig;
520cdf0e10cSrcweir                             }
521cdf0e10cSrcweir                         }
522cdf0e10cSrcweir                     }
523cdf0e10cSrcweir                     // <--
524cdf0e10cSrcweir                     else
525cdf0e10cSrcweir                         nNextCh = 0U;
526cdf0e10cSrcweir                 }
527cdf0e10cSrcweir                 else if( HTML_ISALPHA( nNextCh ) )
528cdf0e10cSrcweir                 {
529cdf0e10cSrcweir                     ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
530cdf0e10cSrcweir                     xub_StrLen nPos = 0L;
531cdf0e10cSrcweir                     do
532cdf0e10cSrcweir                     {
533cdf0e10cSrcweir                         sEntityBuffer.append( nNextCh );
534cdf0e10cSrcweir                         nPos++;
535cdf0e10cSrcweir                         nNextCh = GetNextChar();
536cdf0e10cSrcweir                     }
537cdf0e10cSrcweir                     while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
538cdf0e10cSrcweir                            !rInput.IsEof() );
539cdf0e10cSrcweir 
540cdf0e10cSrcweir                     if( IsParserWorking() && !rInput.IsEof() )
541cdf0e10cSrcweir                     {
542cdf0e10cSrcweir                         String sEntity( sEntityBuffer.getStr(), nPos );
543cdf0e10cSrcweir                         cChar = GetHTMLCharName( sEntity );
544cdf0e10cSrcweir 
545cdf0e10cSrcweir                         // nicht gefunden ( == 0 ), dann Klartext
546cdf0e10cSrcweir                         // oder ein Zeichen das als Attribut eingefuegt
547cdf0e10cSrcweir                         // wird
548cdf0e10cSrcweir                         if( 0U == cChar && ';' != nNextCh )
549cdf0e10cSrcweir                         {
550cdf0e10cSrcweir                             DBG_ASSERT( rInput.Tell() - nStreamPos ==
551cdf0e10cSrcweir                                         (sal_uLong)(nPos+1L)*GetCharSize(),
552cdf0e10cSrcweir                                         "UTF-8 geht hier schief" );
553cdf0e10cSrcweir                             for( xub_StrLen i=nPos-1L; i>1L; i-- )
554cdf0e10cSrcweir                             {
555cdf0e10cSrcweir                                 nNextCh = sEntityBuffer[i];
556cdf0e10cSrcweir                                 sEntityBuffer.setLength( i );
557cdf0e10cSrcweir                                 sEntity.Assign( sEntityBuffer.getStr(), i );
558cdf0e10cSrcweir                                 cChar = GetHTMLCharName( sEntity );
559cdf0e10cSrcweir                                 if( cChar )
560cdf0e10cSrcweir                                 {
561cdf0e10cSrcweir                                     rInput.SeekRel( -(long)
562cdf0e10cSrcweir                                             ((nPos-i)*GetCharSize()) );
563cdf0e10cSrcweir                                     nlLinePos -= sal_uInt32(nPos-i);
564cdf0e10cSrcweir                                     nPos = i;
565cdf0e10cSrcweir                                     ClearTxtConvContext();
566cdf0e10cSrcweir                                     break;
567cdf0e10cSrcweir                                 }
568cdf0e10cSrcweir                             }
569cdf0e10cSrcweir                         }
570cdf0e10cSrcweir 
571cdf0e10cSrcweir                         if( !cChar )        // unbekanntes Zeichen?
572cdf0e10cSrcweir                         {
573cdf0e10cSrcweir                             // dann im Stream zurueck, das '&' als Zeichen
574cdf0e10cSrcweir                             // einfuegen und mit dem nachfolgenden Zeichen
575cdf0e10cSrcweir                             // wieder aufsetzen
576cdf0e10cSrcweir                             sTmpBuffer.append( (sal_Unicode)'&' );
577cdf0e10cSrcweir 
578cdf0e10cSrcweir //                          rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
579cdf0e10cSrcweir //                          nlLinePos -= nPos;
580cdf0e10cSrcweir                             DBG_ASSERT( rInput.Tell()-nStreamPos ==
581cdf0e10cSrcweir                                         (sal_uLong)(nPos+1)*GetCharSize(),
582cdf0e10cSrcweir                                         "Falsche Stream-Position" );
583cdf0e10cSrcweir                             DBG_ASSERT( nlLinePos-nLinePos ==
584cdf0e10cSrcweir                                         (sal_uLong)(nPos+1),
585cdf0e10cSrcweir                                         "Falsche Zeilen-Position" );
586cdf0e10cSrcweir                             rInput.Seek( nStreamPos );
587cdf0e10cSrcweir                             nlLinePos = nLinePos;
588cdf0e10cSrcweir                             ClearTxtConvContext();
589cdf0e10cSrcweir                             break;
590cdf0e10cSrcweir                         }
591cdf0e10cSrcweir 
592cdf0e10cSrcweir                         // 1 == Non Breaking Space
593cdf0e10cSrcweir                         // 2 == SoftHyphen
594cdf0e10cSrcweir 
595cdf0e10cSrcweir                         if( cChar < 3U )
596cdf0e10cSrcweir                         {
597cdf0e10cSrcweir                             if( '>' == cBreak )
598cdf0e10cSrcweir                             {
599cdf0e10cSrcweir                                 // Wenn der Inhalt eines Tags gelesen wird,
600cdf0e10cSrcweir                                 // muessen wir ein Space bzw. - daraus machen
601cdf0e10cSrcweir                                 switch( cChar )
602cdf0e10cSrcweir                                 {
603cdf0e10cSrcweir                                 case 1U: cChar = ' '; break;
604cdf0e10cSrcweir                                 case 2U: cChar = '-'; break;
605cdf0e10cSrcweir                                 default:
606cdf0e10cSrcweir                                     DBG_ASSERT( cChar==1U,
607cdf0e10cSrcweir                             "\0x00 sollte doch schon laengt abgefangen sein!" );
608cdf0e10cSrcweir                                     break;
609cdf0e10cSrcweir                                 }
610cdf0e10cSrcweir                             }
611cdf0e10cSrcweir                             else
612cdf0e10cSrcweir                             {
613cdf0e10cSrcweir                                 // Wenn kein Tag gescannt wird, enstprechendes
614cdf0e10cSrcweir                                 // Token zurueckgeben
615cdf0e10cSrcweir                                 aToken +=
616cdf0e10cSrcweir                                     String( sTmpBuffer.makeStringAndClear() );
617cdf0e10cSrcweir                                 if( cChar )
618cdf0e10cSrcweir                                 {
619cdf0e10cSrcweir                                     if( aToken.Len() )
620cdf0e10cSrcweir                                     {
621cdf0e10cSrcweir                                         // mit dem Zeichen wieder aufsetzen
622cdf0e10cSrcweir                                         nNextCh = '&';
623cdf0e10cSrcweir //                                      rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
624cdf0e10cSrcweir //                                      nlLinePos -= nPos;
625cdf0e10cSrcweir                                         DBG_ASSERT( rInput.Tell()-nStreamPos ==
626cdf0e10cSrcweir                                                     (sal_uLong)(nPos+1)*GetCharSize(),
627cdf0e10cSrcweir                                                     "Falsche Stream-Position" );
628cdf0e10cSrcweir                                         DBG_ASSERT( nlLinePos-nLinePos ==
629cdf0e10cSrcweir                                                     (sal_uLong)(nPos+1),
630cdf0e10cSrcweir                                                     "Falsche Zeilen-Position" );
631cdf0e10cSrcweir                                         rInput.Seek( nStreamPos );
632cdf0e10cSrcweir                                         nlLinePos = nLinePos;
633cdf0e10cSrcweir                                         ClearTxtConvContext();
634cdf0e10cSrcweir                                         return HTML_TEXTTOKEN;
635cdf0e10cSrcweir                                     }
636cdf0e10cSrcweir 
637cdf0e10cSrcweir                                     // Hack: _GetNextChar soll nicht das
638cdf0e10cSrcweir                                     // naechste Zeichen lesen
639cdf0e10cSrcweir                                     if( ';' != nNextCh )
640cdf0e10cSrcweir                                         aToken += ' ';
641cdf0e10cSrcweir                                     if( 1U == cChar )
642cdf0e10cSrcweir                                         return HTML_NONBREAKSPACE;
643cdf0e10cSrcweir                                     if( 2U == cChar )
644cdf0e10cSrcweir                                         return HTML_SOFTHYPH;
645cdf0e10cSrcweir                                 }
646cdf0e10cSrcweir                                 aToken += (sal_Unicode)'&';
647cdf0e10cSrcweir                                 aToken +=
648cdf0e10cSrcweir                                     String(sEntityBuffer.makeStringAndClear());
649cdf0e10cSrcweir                                 break;
650cdf0e10cSrcweir                             }
651cdf0e10cSrcweir                         }
652cdf0e10cSrcweir                     }
653cdf0e10cSrcweir                     else
654cdf0e10cSrcweir                         nNextCh = 0U;
655cdf0e10cSrcweir                 }
656cdf0e10cSrcweir                 // MIB 03/02/2000: &{...};-JavaScript-Macros are not
657cdf0e10cSrcweir                 // supported any longer.
658cdf0e10cSrcweir                 else if( IsParserWorking() )
659cdf0e10cSrcweir                 {
660cdf0e10cSrcweir                     sTmpBuffer.append( (sal_Unicode)'&' );
661cdf0e10cSrcweir                     bNextCh = sal_False;
662cdf0e10cSrcweir                     break;
663cdf0e10cSrcweir                 }
664cdf0e10cSrcweir 
665cdf0e10cSrcweir                 bNextCh = (';' == nNextCh);
666cdf0e10cSrcweir                 if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
667cdf0e10cSrcweir                                     cChar=='\"' || cChar==' ') )
668cdf0e10cSrcweir                 {
669cdf0e10cSrcweir                     // ' und " mussen innerhalb von Tags mit einem
670cdf0e10cSrcweir                     // gekennzeichnet werden, um sie von ' und " als Klammern
671cdf0e10cSrcweir                     // um Optionen zu unterscheiden. Logischerweise muss
672cdf0e10cSrcweir                     // deshalb auch ein \ gekeenzeichnet werden. Ausserdem
673cdf0e10cSrcweir                     // schuetzen wir ein Space, weil es kein Trennzeichen
674cdf0e10cSrcweir                     // zwischen Optionen ist.
675cdf0e10cSrcweir                     sTmpBuffer.append( (sal_Unicode)'\\' );
676cdf0e10cSrcweir                     if( MAX_LEN == sTmpBuffer.getLength() )
677cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
678cdf0e10cSrcweir                 }
679cdf0e10cSrcweir                 if( IsParserWorking() )
680cdf0e10cSrcweir                 {
681cdf0e10cSrcweir                     if( cChar )
682cdf0e10cSrcweir                         sTmpBuffer.append( cChar );
683cdf0e10cSrcweir                 }
684cdf0e10cSrcweir                 else if( SVPAR_PENDING==eState && '>'!=cBreak )
685cdf0e10cSrcweir                 {
686cdf0e10cSrcweir                     // Mit dem '&' Zeichen wieder aufsetzen, der Rest
687cdf0e10cSrcweir                     // wird als Texttoken zurueckgegeben.
688cdf0e10cSrcweir                     if( aToken.Len() || sTmpBuffer.getLength() )
689cdf0e10cSrcweir                     {
690cdf0e10cSrcweir                         // Der bisherige Text wird von _GetNextChar()
691cdf0e10cSrcweir                         // zurueckgegeben und beim naechsten Aufruf wird
692cdf0e10cSrcweir                         // ein neues Zeichen gelesen. Also muessen wir uns
693cdf0e10cSrcweir                         // noch vor das & stellen.
694cdf0e10cSrcweir                         nNextCh = 0U;
695cdf0e10cSrcweir                         rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
696cdf0e10cSrcweir                         nlLinePos = nLinePos-1;
697cdf0e10cSrcweir                         ClearTxtConvContext();
698cdf0e10cSrcweir                         bReadNextChar = sal_True;
699cdf0e10cSrcweir                     }
700cdf0e10cSrcweir                     bNextCh = sal_False;
701cdf0e10cSrcweir                 }
702cdf0e10cSrcweir             }
703cdf0e10cSrcweir             break;
704cdf0e10cSrcweir         case '=':
705cdf0e10cSrcweir             if( '>'==cBreak && !cQuote )
706cdf0e10cSrcweir                 bEqSignFound = sal_True;
707cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
708cdf0e10cSrcweir             break;
709cdf0e10cSrcweir 
710cdf0e10cSrcweir         case '\\':
711cdf0e10cSrcweir             if( '>'==cBreak )
712cdf0e10cSrcweir             {
713cdf0e10cSrcweir                 // Innerhalb von Tags kennzeichnen
714cdf0e10cSrcweir                 sTmpBuffer.append( (sal_Unicode)'\\' );
715cdf0e10cSrcweir                 if( MAX_LEN == sTmpBuffer.getLength() )
716cdf0e10cSrcweir                     aToken += String(sTmpBuffer.makeStringAndClear());
717cdf0e10cSrcweir             }
718cdf0e10cSrcweir             sTmpBuffer.append( (sal_Unicode)'\\' );
719cdf0e10cSrcweir             break;
720cdf0e10cSrcweir 
721cdf0e10cSrcweir         case '\"':
722cdf0e10cSrcweir         case '\'':
723cdf0e10cSrcweir             if( '>'==cBreak )
724cdf0e10cSrcweir             {
725cdf0e10cSrcweir                 if( bEqSignFound )
726cdf0e10cSrcweir                     cQuote = nNextCh;
727cdf0e10cSrcweir                 else if( cQuote && (cQuote==nNextCh ) )
728cdf0e10cSrcweir                     cQuote = 0U;
729cdf0e10cSrcweir             }
730cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
731cdf0e10cSrcweir             bEqSignFound = sal_False;
732cdf0e10cSrcweir             break;
733cdf0e10cSrcweir 
734cdf0e10cSrcweir         case sal_Unicode(EOF):
735cdf0e10cSrcweir             if( rInput.IsEof() )
736cdf0e10cSrcweir             {
737cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird
738cdf0e10cSrcweir // abc&auml;<EOF> nicht angezeigt, also lassen wir das in Zukunft.
739cdf0e10cSrcweir //              if( '>' != cBreak )
740cdf0e10cSrcweir //                  eState = SVPAR_ACCEPTED;
741cdf0e10cSrcweir                 bWeiter = sal_False;
742cdf0e10cSrcweir             }
743cdf0e10cSrcweir             else
744cdf0e10cSrcweir             {
745cdf0e10cSrcweir                 sTmpBuffer.append( nNextCh );
746cdf0e10cSrcweir             }
747cdf0e10cSrcweir             break;
748cdf0e10cSrcweir 
749cdf0e10cSrcweir         case '<':
750cdf0e10cSrcweir             bEqSignFound = sal_False;
751cdf0e10cSrcweir             if( '>'==cBreak )
752cdf0e10cSrcweir                 sTmpBuffer.append( nNextCh );
753cdf0e10cSrcweir             else
754cdf0e10cSrcweir                 bWeiter = sal_False;        // Abbrechen, String zusammen
755cdf0e10cSrcweir             break;
756cdf0e10cSrcweir 
757cdf0e10cSrcweir         case '\f':
758cdf0e10cSrcweir             if( '>' == cBreak )
759cdf0e10cSrcweir             {
760cdf0e10cSrcweir                 // Beim Scannen von Optionen wie ein Space behandeln
761cdf0e10cSrcweir                 sTmpBuffer.append( (sal_Unicode)' ' );
762cdf0e10cSrcweir             }
763cdf0e10cSrcweir             else
764cdf0e10cSrcweir             {
765cdf0e10cSrcweir                 // sonst wird es ein eigenes Token
766cdf0e10cSrcweir                 bWeiter = sal_False;
767cdf0e10cSrcweir             }
768cdf0e10cSrcweir             break;
769cdf0e10cSrcweir 
770cdf0e10cSrcweir         case '\r':
771cdf0e10cSrcweir         case '\n':
772cdf0e10cSrcweir             if( '>'==cBreak )
773cdf0e10cSrcweir             {
774cdf0e10cSrcweir                 // #26979# cr/lf in Tag wird in _GetNextToken() behandeln
775cdf0e10cSrcweir                 sTmpBuffer.append( nNextCh );
776cdf0e10cSrcweir                 break;
777cdf0e10cSrcweir             }
778cdf0e10cSrcweir             else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
779cdf0e10cSrcweir             {
780cdf0e10cSrcweir                 bWeiter = sal_False;
781cdf0e10cSrcweir                 break;
782cdf0e10cSrcweir             }
783cdf0e10cSrcweir             // Bug 18984: CR-LF -> Blank
784cdf0e10cSrcweir             //      Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln
785cdf0e10cSrcweir             // kein break!!
786cdf0e10cSrcweir         case '\t':
787cdf0e10cSrcweir             if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
788cdf0e10cSrcweir             {
789cdf0e10cSrcweir                 // In <PRE>: Tabs nach oben durchreichen
790cdf0e10cSrcweir                 bWeiter = sal_False;
791cdf0e10cSrcweir                 break;
792cdf0e10cSrcweir             }
793cdf0e10cSrcweir             // kein break
794cdf0e10cSrcweir         case '\x0b':
795cdf0e10cSrcweir             if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
796cdf0e10cSrcweir                 '>'!=cBreak )
797cdf0e10cSrcweir             {
798cdf0e10cSrcweir                 break;
799cdf0e10cSrcweir             }
800cdf0e10cSrcweir             nNextCh = ' ';
801cdf0e10cSrcweir             // kein break;
802cdf0e10cSrcweir         case ' ':
803cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
804cdf0e10cSrcweir             if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
805cdf0e10cSrcweir                                 !bReadPRE && !bReadTextArea) )
806cdf0e10cSrcweir             {
807cdf0e10cSrcweir                 // alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln
808cdf0e10cSrcweir                 do {
809cdf0e10cSrcweir                     if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
810cdf0e10cSrcweir                         rInput.IsEof() )
811cdf0e10cSrcweir                     {
812cdf0e10cSrcweir                         if( aToken.Len() || sTmpBuffer.getLength() > 1L )
813cdf0e10cSrcweir                         {
814cdf0e10cSrcweir                             // ausser den Blanks wurde noch etwas geselen
815cdf0e10cSrcweir                             aToken += String(sTmpBuffer.makeStringAndClear());
816cdf0e10cSrcweir                             return HTML_TEXTTOKEN;
817cdf0e10cSrcweir                         }
818cdf0e10cSrcweir                         else
819cdf0e10cSrcweir                             // nur Blanks gelesen: dann darf kein Text
820cdf0e10cSrcweir                             // mehr zurueckgegeben werden und _GetNextToken
821cdf0e10cSrcweir                             // muss auf EOF laufen
822cdf0e10cSrcweir                             return 0;
823cdf0e10cSrcweir                     }
824cdf0e10cSrcweir                 } while ( ' ' == nNextCh || '\t' == nNextCh ||
825cdf0e10cSrcweir                           '\r' == nNextCh || '\n' == nNextCh ||
826cdf0e10cSrcweir                           '\x0b' == nNextCh );
827cdf0e10cSrcweir                 bNextCh = sal_False;
828cdf0e10cSrcweir             }
829cdf0e10cSrcweir             break;
830cdf0e10cSrcweir 
831cdf0e10cSrcweir         default:
832cdf0e10cSrcweir             bEqSignFound = sal_False;
833cdf0e10cSrcweir             if( (nNextCh==cBreak && !cQuote) ||
834cdf0e10cSrcweir                 (sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 ))
835cdf0e10cSrcweir                 bWeiter = sal_False;
836cdf0e10cSrcweir             else
837cdf0e10cSrcweir             {
838cdf0e10cSrcweir                 do {
839cdf0e10cSrcweir                     // alle anderen Zeichen kommen in den Text
840cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
841cdf0e10cSrcweir                     if( MAX_LEN == sTmpBuffer.getLength() )
842cdf0e10cSrcweir                     {
843cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
844cdf0e10cSrcweir                         if( (sal_uLong(aToken.Len()) + MAX_LEN) >
845cdf0e10cSrcweir                                 sal_uLong(STRING_MAXLEN & ~1 ) )
846cdf0e10cSrcweir                         {
847cdf0e10cSrcweir                             nNextCh = GetNextChar();
848cdf0e10cSrcweir                             return HTML_TEXTTOKEN;
849cdf0e10cSrcweir                         }
850cdf0e10cSrcweir                     }
851cdf0e10cSrcweir                     if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
852cdf0e10cSrcweir                           rInput.IsEof() ) ||
853cdf0e10cSrcweir                         !IsParserWorking() )
854cdf0e10cSrcweir                     {
855cdf0e10cSrcweir                         if( sTmpBuffer.getLength() )
856cdf0e10cSrcweir                             aToken += String(sTmpBuffer.makeStringAndClear());
857cdf0e10cSrcweir                         return HTML_TEXTTOKEN;
858cdf0e10cSrcweir                     }
859cdf0e10cSrcweir                 } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
860cdf0e10cSrcweir                 bNextCh = sal_False;
861cdf0e10cSrcweir             }
862cdf0e10cSrcweir         }
863cdf0e10cSrcweir 
864cdf0e10cSrcweir         if( MAX_LEN == sTmpBuffer.getLength() )
865cdf0e10cSrcweir             aToken += String(sTmpBuffer.makeStringAndClear());
866cdf0e10cSrcweir 
867cdf0e10cSrcweir         if( bWeiter && bNextCh )
868cdf0e10cSrcweir             nNextCh = GetNextChar();
869cdf0e10cSrcweir     }
870cdf0e10cSrcweir 
871cdf0e10cSrcweir     if( sTmpBuffer.getLength() )
872cdf0e10cSrcweir         aToken += String(sTmpBuffer.makeStringAndClear());
873cdf0e10cSrcweir 
874cdf0e10cSrcweir     return HTML_TEXTTOKEN;
875cdf0e10cSrcweir }
876cdf0e10cSrcweir 
877cdf0e10cSrcweir int HTMLParser::_GetNextRawToken()
878cdf0e10cSrcweir {
879cdf0e10cSrcweir     ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
880cdf0e10cSrcweir 
881cdf0e10cSrcweir     if( bEndTokenFound )
882cdf0e10cSrcweir     {
883cdf0e10cSrcweir         // beim letzten Aufruf haben wir das End-Token bereits gefunden,
884cdf0e10cSrcweir         // deshalb muessen wir es nicht noch einmal suchen
885cdf0e10cSrcweir         bReadScript = sal_False;
886cdf0e10cSrcweir         bReadStyle = sal_False;
887cdf0e10cSrcweir         aEndToken.Erase();
888cdf0e10cSrcweir         bEndTokenFound = sal_False;
889cdf0e10cSrcweir 
890cdf0e10cSrcweir         return 0;
891cdf0e10cSrcweir     }
892cdf0e10cSrcweir 
893cdf0e10cSrcweir     // per default geben wir HTML_RAWDATA zurueck
894cdf0e10cSrcweir     int bWeiter = sal_True;
895cdf0e10cSrcweir     int nToken = HTML_RAWDATA;
896cdf0e10cSrcweir     SaveState( 0 );
897cdf0e10cSrcweir     while( bWeiter && IsParserWorking() )
898cdf0e10cSrcweir     {
899cdf0e10cSrcweir         int bNextCh = sal_True;
900cdf0e10cSrcweir         switch( nNextCh )
901cdf0e10cSrcweir         {
902cdf0e10cSrcweir         case '<':
903cdf0e10cSrcweir             {
904cdf0e10cSrcweir                 // Vielleicht haben wir das Ende erreicht
905cdf0e10cSrcweir 
906cdf0e10cSrcweir                 // das bisher gelesene erstmal retten
907cdf0e10cSrcweir                 aToken += String(sTmpBuffer.makeStringAndClear());
908cdf0e10cSrcweir 
909cdf0e10cSrcweir                 // und die Position im Stream merken
910cdf0e10cSrcweir                 sal_uLong nStreamPos = rInput.Tell();
911cdf0e10cSrcweir                 sal_uLong nLineNr = GetLineNr();
912cdf0e10cSrcweir                 sal_uLong nLinePos = GetLinePos();
913cdf0e10cSrcweir 
914cdf0e10cSrcweir                 // Start eines End-Token?
915cdf0e10cSrcweir                 int bOffState = sal_False;
916cdf0e10cSrcweir                 if( '/' == (nNextCh = GetNextChar()) )
917cdf0e10cSrcweir                 {
918cdf0e10cSrcweir                     bOffState = sal_True;
919cdf0e10cSrcweir                     nNextCh = GetNextChar();
920cdf0e10cSrcweir                 }
921cdf0e10cSrcweir                 else if( '!' == nNextCh )
922cdf0e10cSrcweir                 {
923cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
924cdf0e10cSrcweir                     nNextCh = GetNextChar();
925cdf0e10cSrcweir                 }
926cdf0e10cSrcweir 
927cdf0e10cSrcweir                 // jetzt die Buchstaben danach lesen
928cdf0e10cSrcweir                 while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
929cdf0e10cSrcweir                        IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
930cdf0e10cSrcweir                 {
931cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
932cdf0e10cSrcweir                     nNextCh = GetNextChar();
933cdf0e10cSrcweir                 }
934cdf0e10cSrcweir 
935cdf0e10cSrcweir                 String aTok( sTmpBuffer.getStr(),
936cdf0e10cSrcweir                              sal::static_int_cast< xub_StrLen >(
937cdf0e10cSrcweir                                  sTmpBuffer.getLength()) );
938cdf0e10cSrcweir                 aTok.ToUpperAscii();
939cdf0e10cSrcweir                 sal_Bool bDone = sal_False;
940cdf0e10cSrcweir                 if( bReadScript || aEndToken.Len() )
941cdf0e10cSrcweir                 {
942cdf0e10cSrcweir                     if( !bReadComment )
943cdf0e10cSrcweir                     {
944cdf0e10cSrcweir                         if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 )
945cdf0e10cSrcweir                                 == COMPARE_EQUAL )
946cdf0e10cSrcweir                         {
947cdf0e10cSrcweir                             bReadComment = sal_True;
948cdf0e10cSrcweir                         }
949cdf0e10cSrcweir                         else
950cdf0e10cSrcweir                         {
951cdf0e10cSrcweir                             // ein Script muss mit "</SCRIPT>" aufhoehren, wobei
952cdf0e10cSrcweir                             // wir es mit dem ">" aus sicherheitsgruenden
953cdf0e10cSrcweir                             // erstmal nicht so genau nehmen
954cdf0e10cSrcweir                             bDone = bOffState && // '>'==nNextCh &&
955cdf0e10cSrcweir                             COMPARE_EQUAL == ( bReadScript
956cdf0e10cSrcweir                                 ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
957cdf0e10cSrcweir                                 : aTok.CompareTo(aEndToken) );
958cdf0e10cSrcweir                         }
959cdf0e10cSrcweir                     }
960cdf0e10cSrcweir                     if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
961cdf0e10cSrcweir                         aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
962cdf0e10cSrcweir                     {
963cdf0e10cSrcweir                         // hier ist ein Kommentar der Art <!-----> zuende
964cdf0e10cSrcweir                         bReadComment = sal_False;
965cdf0e10cSrcweir                     }
966cdf0e10cSrcweir                 }
967cdf0e10cSrcweir                 else
968cdf0e10cSrcweir                 {
969cdf0e10cSrcweir                     // ein Style-Sheet kann mit </STYLE>, </HEAD> oder
970cdf0e10cSrcweir                     // <BODY> aughoehren
971cdf0e10cSrcweir                     if( bOffState )
972cdf0e10cSrcweir                         bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
973cdf0e10cSrcweir                                     == COMPARE_EQUAL ||
974cdf0e10cSrcweir                                 aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
975cdf0e10cSrcweir                                     == COMPARE_EQUAL;
976cdf0e10cSrcweir                     else
977cdf0e10cSrcweir                         bDone =
978cdf0e10cSrcweir                             aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
979cdf0e10cSrcweir                 }
980cdf0e10cSrcweir 
981cdf0e10cSrcweir                 if( bDone )
982cdf0e10cSrcweir                 {
983cdf0e10cSrcweir                     // das war's, jetzt muessen wir gegebenenfalls den
984cdf0e10cSrcweir                     // bisher gelesenen String zurueckgeben und dnach normal
985cdf0e10cSrcweir                     // weitermachen
986cdf0e10cSrcweir 
987cdf0e10cSrcweir                     bWeiter = sal_False;
988cdf0e10cSrcweir 
989cdf0e10cSrcweir                     // nToken==0 heisst, dass _GetNextToken gleich weiterliest
990cdf0e10cSrcweir                     if( !aToken.Len() && (bReadStyle || bReadScript) )
991cdf0e10cSrcweir                     {
992cdf0e10cSrcweir                         // wir koennen sofort die Umgebung beeden und
993cdf0e10cSrcweir                         // das End-Token parsen
994cdf0e10cSrcweir                         bReadScript = sal_False;
995cdf0e10cSrcweir                         bReadStyle = sal_False;
996cdf0e10cSrcweir                         aEndToken.Erase();
997cdf0e10cSrcweir                         nToken = 0;
998cdf0e10cSrcweir                     }
999cdf0e10cSrcweir                     else
1000cdf0e10cSrcweir                     {
1001cdf0e10cSrcweir                         // wir muessen bReadScript/bReadStyle noch am
1002cdf0e10cSrcweir                         // Leben lassen und koennen erst beim naechsten
1003cdf0e10cSrcweir                         // mal das End-Token Parsen
1004cdf0e10cSrcweir                         bEndTokenFound = sal_True;
1005cdf0e10cSrcweir                     }
1006cdf0e10cSrcweir 
1007cdf0e10cSrcweir                     // jetzt fahren wir im Stream auf das '<' zurueck
1008cdf0e10cSrcweir                     rInput.Seek( nStreamPos );
1009cdf0e10cSrcweir                     SetLineNr( nLineNr );
1010cdf0e10cSrcweir                     SetLinePos( nLinePos );
1011cdf0e10cSrcweir                     ClearTxtConvContext();
1012cdf0e10cSrcweir                     nNextCh = '<';
1013cdf0e10cSrcweir 
1014cdf0e10cSrcweir                     // den String wollen wir nicht an das Token haengen
1015cdf0e10cSrcweir                     sTmpBuffer.setLength( 0L );
1016cdf0e10cSrcweir                 }
1017cdf0e10cSrcweir                 else
1018cdf0e10cSrcweir                 {
1019cdf0e10cSrcweir                     // "</" merken, alles andere steht noch im buffer
1020cdf0e10cSrcweir                     aToken += (sal_Unicode)'<';
1021cdf0e10cSrcweir                     if( bOffState )
1022cdf0e10cSrcweir                         aToken += (sal_Unicode)'/';
1023cdf0e10cSrcweir 
1024cdf0e10cSrcweir                     bNextCh = sal_False;
1025cdf0e10cSrcweir                 }
1026cdf0e10cSrcweir             }
1027cdf0e10cSrcweir             break;
1028cdf0e10cSrcweir         case '-':
1029cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
1030cdf0e10cSrcweir             if( bReadComment )
1031cdf0e10cSrcweir             {
1032cdf0e10cSrcweir                 sal_Bool bTwoMinus = sal_False;
1033cdf0e10cSrcweir                 nNextCh = GetNextChar();
1034cdf0e10cSrcweir                 while( '-' == nNextCh && IsParserWorking() )
1035cdf0e10cSrcweir                 {
1036cdf0e10cSrcweir                     bTwoMinus = sal_True;
1037cdf0e10cSrcweir 
1038cdf0e10cSrcweir                     if( MAX_LEN == sTmpBuffer.getLength() )
1039cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
1040cdf0e10cSrcweir                     sTmpBuffer.append( nNextCh );
1041cdf0e10cSrcweir                     nNextCh = GetNextChar();
1042cdf0e10cSrcweir                 }
1043cdf0e10cSrcweir 
1044cdf0e10cSrcweir                 if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
1045cdf0e10cSrcweir                     bReadComment = sal_False;
1046cdf0e10cSrcweir 
1047cdf0e10cSrcweir                 bNextCh = sal_False;
1048cdf0e10cSrcweir             }
1049cdf0e10cSrcweir             break;
1050cdf0e10cSrcweir 
1051cdf0e10cSrcweir         case '\r':
1052cdf0e10cSrcweir             // \r\n? beendet das aktuelle Text-Token (auch wenn es leer ist)
1053cdf0e10cSrcweir             nNextCh = GetNextChar();
1054cdf0e10cSrcweir             if( nNextCh=='\n' )
1055cdf0e10cSrcweir                 nNextCh = GetNextChar();
1056cdf0e10cSrcweir             bWeiter = sal_False;
1057cdf0e10cSrcweir             break;
1058cdf0e10cSrcweir         case '\n':
1059cdf0e10cSrcweir             // \n beendet das aktuelle Text-Token (auch wenn es leer ist)
1060cdf0e10cSrcweir             nNextCh = GetNextChar();
1061cdf0e10cSrcweir             bWeiter = sal_False;
1062cdf0e10cSrcweir             break;
1063cdf0e10cSrcweir         case sal_Unicode(EOF):
1064cdf0e10cSrcweir             // eof beendet das aktuelle Text-Token und tut so, als ob
1065cdf0e10cSrcweir             // ein End-Token gelesen wurde
1066cdf0e10cSrcweir             if( rInput.IsEof() )
1067cdf0e10cSrcweir             {
1068cdf0e10cSrcweir                 bWeiter = sal_False;
1069cdf0e10cSrcweir                 if( aToken.Len() || sTmpBuffer.getLength() )
1070cdf0e10cSrcweir                 {
1071cdf0e10cSrcweir                     bEndTokenFound = sal_True;
1072cdf0e10cSrcweir                 }
1073cdf0e10cSrcweir                 else
1074cdf0e10cSrcweir                 {
1075cdf0e10cSrcweir                     bReadScript = sal_False;
1076cdf0e10cSrcweir                     bReadStyle = sal_False;
1077cdf0e10cSrcweir                     aEndToken.Erase();
1078cdf0e10cSrcweir                     nToken = 0;
1079cdf0e10cSrcweir                 }
1080cdf0e10cSrcweir                 break;
1081cdf0e10cSrcweir             }
1082cdf0e10cSrcweir             // kein break
1083cdf0e10cSrcweir         default:
1084cdf0e10cSrcweir             // alle anderen Zeichen landen im Buffer
1085cdf0e10cSrcweir             sTmpBuffer.append( nNextCh );
1086cdf0e10cSrcweir             break;
1087cdf0e10cSrcweir         }
1088cdf0e10cSrcweir 
1089cdf0e10cSrcweir         if( (!bWeiter && sTmpBuffer.getLength() > 0L) ||
1090cdf0e10cSrcweir             MAX_LEN == sTmpBuffer.getLength() )
1091cdf0e10cSrcweir             aToken += String(sTmpBuffer.makeStringAndClear());
1092cdf0e10cSrcweir 
1093cdf0e10cSrcweir         if( bWeiter && bNextCh )
1094cdf0e10cSrcweir             nNextCh = GetNextChar();
1095cdf0e10cSrcweir     }
1096cdf0e10cSrcweir 
1097cdf0e10cSrcweir     if( IsParserWorking() )
1098cdf0e10cSrcweir         SaveState( 0 );
1099cdf0e10cSrcweir     else
1100cdf0e10cSrcweir         nToken = 0;
1101cdf0e10cSrcweir 
1102cdf0e10cSrcweir     return nToken;
1103cdf0e10cSrcweir }
1104cdf0e10cSrcweir 
1105cdf0e10cSrcweir // scanne das naechste Token,
1106cdf0e10cSrcweir int __EXPORT HTMLParser::_GetNextToken()
1107cdf0e10cSrcweir {
1108cdf0e10cSrcweir     int nRet = 0;
1109cdf0e10cSrcweir     sSaveToken.Erase();
1110cdf0e10cSrcweir 
1111cdf0e10cSrcweir     // die Optionen loeschen
1112cdf0e10cSrcweir     if( pOptions->Count() )
1113cdf0e10cSrcweir         pOptions->DeleteAndDestroy( 0, pOptions->Count() );
1114cdf0e10cSrcweir 
1115cdf0e10cSrcweir     if( !IsParserWorking() )        // wenn schon Fehler, dann nicht weiter!
1116cdf0e10cSrcweir         return 0;
1117cdf0e10cSrcweir 
1118cdf0e10cSrcweir     sal_Bool bReadNextCharSave = bReadNextChar;
1119cdf0e10cSrcweir     if( bReadNextChar )
1120cdf0e10cSrcweir     {
1121cdf0e10cSrcweir         DBG_ASSERT( !bEndTokenFound,
1122cdf0e10cSrcweir                     "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
1123cdf0e10cSrcweir         nNextCh = GetNextChar();
1124cdf0e10cSrcweir         if( !IsParserWorking() )        // wenn schon Fehler, dann nicht weiter!
1125cdf0e10cSrcweir             return 0;
1126cdf0e10cSrcweir         bReadNextChar = sal_False;
1127cdf0e10cSrcweir     }
1128cdf0e10cSrcweir 
1129cdf0e10cSrcweir     if( bReadScript || bReadStyle || aEndToken.Len() )
1130cdf0e10cSrcweir     {
1131cdf0e10cSrcweir         nRet = _GetNextRawToken();
1132cdf0e10cSrcweir         if( nRet || !IsParserWorking() )
1133cdf0e10cSrcweir             return nRet;
1134cdf0e10cSrcweir     }
1135cdf0e10cSrcweir 
1136cdf0e10cSrcweir     do {
1137cdf0e10cSrcweir         int bNextCh = sal_True;
1138cdf0e10cSrcweir         switch( nNextCh )
1139cdf0e10cSrcweir         {
1140cdf0e10cSrcweir         case '<':
1141cdf0e10cSrcweir             {
1142cdf0e10cSrcweir                 sal_uLong nStreamPos = rInput.Tell();
1143cdf0e10cSrcweir                 sal_uLong nLineNr = GetLineNr();
1144cdf0e10cSrcweir                 sal_uLong nLinePos = GetLinePos();
1145cdf0e10cSrcweir 
1146cdf0e10cSrcweir                 int bOffState = sal_False;
1147cdf0e10cSrcweir                 if( '/' == (nNextCh = GetNextChar()) )
1148cdf0e10cSrcweir                 {
1149cdf0e10cSrcweir                     bOffState = sal_True;
1150cdf0e10cSrcweir                     nNextCh = GetNextChar();
1151cdf0e10cSrcweir                 }
1152cdf0e10cSrcweir                 if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
1153cdf0e10cSrcweir                 {
1154cdf0e10cSrcweir                     ::rtl::OUStringBuffer sTmpBuffer;
1155cdf0e10cSrcweir                     do {
1156cdf0e10cSrcweir                         sTmpBuffer.append( nNextCh );
1157cdf0e10cSrcweir                         if( MAX_LEN == sTmpBuffer.getLength() )
1158cdf0e10cSrcweir                             aToken += String(sTmpBuffer.makeStringAndClear());
1159cdf0e10cSrcweir                         nNextCh = GetNextChar();
1160cdf0e10cSrcweir                     } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
1161cdf0e10cSrcweir                              IsParserWorking() && !rInput.IsEof() );
1162cdf0e10cSrcweir 
1163cdf0e10cSrcweir                     if( sTmpBuffer.getLength() )
1164cdf0e10cSrcweir                         aToken += String(sTmpBuffer.makeStringAndClear());
1165cdf0e10cSrcweir 
1166cdf0e10cSrcweir                     // Blanks ueberlesen
1167cdf0e10cSrcweir                     while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
1168cdf0e10cSrcweir                         nNextCh = GetNextChar();
1169cdf0e10cSrcweir 
1170cdf0e10cSrcweir                     if( !IsParserWorking() )
1171cdf0e10cSrcweir                     {
1172cdf0e10cSrcweir                         if( SVPAR_PENDING == eState )
1173cdf0e10cSrcweir                             bReadNextChar = bReadNextCharSave;
1174cdf0e10cSrcweir                         break;
1175cdf0e10cSrcweir                     }
1176cdf0e10cSrcweir 
1177cdf0e10cSrcweir                     // suche das Token in der Tabelle:
1178cdf0e10cSrcweir                     sSaveToken = aToken;
1179cdf0e10cSrcweir                     aToken.ToUpperAscii();
1180cdf0e10cSrcweir                     if( 0 == (nRet = GetHTMLToken( aToken )) )
1181cdf0e10cSrcweir                         // Unknown Control
1182cdf0e10cSrcweir                         nRet = HTML_UNKNOWNCONTROL_ON;
1183cdf0e10cSrcweir 
1184cdf0e10cSrcweir                     // Wenn es ein Token zum ausschalten ist ...
1185cdf0e10cSrcweir                     if( bOffState )
1186cdf0e10cSrcweir                     {
1187cdf0e10cSrcweir                          if( HTML_TOKEN_ONOFF & nRet )
1188cdf0e10cSrcweir                          {
1189cdf0e10cSrcweir                             // und es ein Off-Token gibt, das daraus machen
1190cdf0e10cSrcweir                             ++nRet;
1191cdf0e10cSrcweir                          }
1192cdf0e10cSrcweir                          else if( HTML_LINEBREAK!=nRet )
1193cdf0e10cSrcweir                          {
1194cdf0e10cSrcweir                             // und es kein Off-Token gibt, ein unbekanntes
1195cdf0e10cSrcweir                             // Token daraus machen (ausser </BR>, das wird
1196cdf0e10cSrcweir                             // wie <BR> behandelt
1197cdf0e10cSrcweir                             nRet = HTML_UNKNOWNCONTROL_OFF;
1198cdf0e10cSrcweir                          }
1199cdf0e10cSrcweir                     }
1200cdf0e10cSrcweir 
1201cdf0e10cSrcweir                     if( nRet == HTML_COMMENT )
1202cdf0e10cSrcweir                     {
1203cdf0e10cSrcweir                         // fix: sSaveToken wegen Gross-/Kleinschreibung
1204cdf0e10cSrcweir                         // als Anfang des Kommentars benutzen und ein
1205cdf0e10cSrcweir                         // Space anhaengen.
1206cdf0e10cSrcweir                         aToken = sSaveToken;
1207cdf0e10cSrcweir                         if( '>'!=nNextCh )
1208cdf0e10cSrcweir                             aToken += (sal_Unicode)' ';
1209cdf0e10cSrcweir                         sal_uLong nCStreamPos = 0;
1210cdf0e10cSrcweir                         sal_uLong nCLineNr = 0;
1211cdf0e10cSrcweir                         sal_uLong nCLinePos = 0;
1212cdf0e10cSrcweir                         xub_StrLen nCStrLen = 0;
1213cdf0e10cSrcweir 
1214cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1215cdf0e10cSrcweir                         // bis zum schliessenden --> lesen. wenn keins gefunden
1216cdf0e10cSrcweir                         // wurde beim der ersten > wieder aufsetzen
1217cdf0e10cSrcweir                         while( !bDone && !rInput.IsEof() && IsParserWorking() )
1218cdf0e10cSrcweir                         {
1219cdf0e10cSrcweir                             if( '>'==nNextCh )
1220cdf0e10cSrcweir                             {
1221cdf0e10cSrcweir                                 if( !nCStreamPos )
1222cdf0e10cSrcweir                                 {
1223cdf0e10cSrcweir                                     nCStreamPos = rInput.Tell();
1224cdf0e10cSrcweir                                     nCStrLen = aToken.Len();
1225cdf0e10cSrcweir                                     nCLineNr = GetLineNr();
1226cdf0e10cSrcweir                                     nCLinePos = GetLinePos();
1227cdf0e10cSrcweir                                 }
1228cdf0e10cSrcweir                                 bDone = aToken.Len() >= 2 &&
1229cdf0e10cSrcweir                                         aToken.Copy(aToken.Len()-2,2).
1230cdf0e10cSrcweir                                                         EqualsAscii( "--" );
1231cdf0e10cSrcweir                                 if( !bDone )
1232cdf0e10cSrcweir                                 aToken += nNextCh;
1233cdf0e10cSrcweir                             }
1234cdf0e10cSrcweir                             else
1235cdf0e10cSrcweir                                 aToken += nNextCh;
1236cdf0e10cSrcweir                             if( !bDone )
1237cdf0e10cSrcweir                                 nNextCh = GetNextChar();
1238cdf0e10cSrcweir                         }
1239cdf0e10cSrcweir                         if( !bDone && IsParserWorking() && nCStreamPos )
1240cdf0e10cSrcweir                         {
1241cdf0e10cSrcweir                             rInput.Seek( nCStreamPos );
1242cdf0e10cSrcweir                             SetLineNr( nCLineNr );
1243cdf0e10cSrcweir                             SetLinePos( nCLinePos );
1244cdf0e10cSrcweir                             ClearTxtConvContext();
1245cdf0e10cSrcweir                             aToken.Erase( nCStrLen );
1246cdf0e10cSrcweir                             nNextCh = '>';
1247cdf0e10cSrcweir                         }
1248cdf0e10cSrcweir                     }
1249cdf0e10cSrcweir                     else
1250cdf0e10cSrcweir                     {
1251cdf0e10cSrcweir                         // den TokenString koennen wir jetzt verwerfen
1252cdf0e10cSrcweir                         aToken.Erase();
1253cdf0e10cSrcweir                     }
1254cdf0e10cSrcweir 
1255cdf0e10cSrcweir                     // dann lesen wir mal alles bis zur schliessenden '>'
1256cdf0e10cSrcweir                     if( '>' != nNextCh && IsParserWorking() )
1257cdf0e10cSrcweir                     {
1258cdf0e10cSrcweir                         ScanText( '>' );
1259cdf0e10cSrcweir                         if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1260cdf0e10cSrcweir                         {
1261cdf0e10cSrcweir                             // zurueck hinter die < gehen  und dort neu
1262cdf0e10cSrcweir                             // aufsetzen, das < als Text zurueckgeben
1263cdf0e10cSrcweir                             rInput.Seek( nStreamPos );
1264cdf0e10cSrcweir                             SetLineNr( nLineNr );
1265cdf0e10cSrcweir                             SetLinePos( nLinePos );
1266cdf0e10cSrcweir                             ClearTxtConvContext();
1267cdf0e10cSrcweir 
1268cdf0e10cSrcweir                             aToken = '<';
1269cdf0e10cSrcweir                             nRet = HTML_TEXTTOKEN;
1270cdf0e10cSrcweir                             nNextCh = GetNextChar();
1271cdf0e10cSrcweir                             bNextCh = sal_False;
1272cdf0e10cSrcweir                             break;
1273cdf0e10cSrcweir                         }
1274cdf0e10cSrcweir                     }
1275cdf0e10cSrcweir                     if( SVPAR_PENDING == eState )
1276cdf0e10cSrcweir                         bReadNextChar = bReadNextCharSave;
1277cdf0e10cSrcweir                 }
1278cdf0e10cSrcweir                 else
1279cdf0e10cSrcweir                 {
1280cdf0e10cSrcweir                     if( bOffState )
1281cdf0e10cSrcweir                     {
1282cdf0e10cSrcweir                         // einfach alles wegschmeissen
1283cdf0e10cSrcweir                         ScanText( '>' );
1284cdf0e10cSrcweir                         if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1285cdf0e10cSrcweir                         {
1286cdf0e10cSrcweir                             // zurueck hinter die < gehen  und dort neu
1287cdf0e10cSrcweir                             // aufsetzen, das < als Text zurueckgeben
1288cdf0e10cSrcweir                             rInput.Seek( nStreamPos );
1289cdf0e10cSrcweir                             SetLineNr( nLineNr );
1290cdf0e10cSrcweir                             SetLinePos( nLinePos );
1291cdf0e10cSrcweir                             ClearTxtConvContext();
1292cdf0e10cSrcweir 
1293cdf0e10cSrcweir                             aToken = '<';
1294cdf0e10cSrcweir                             nRet = HTML_TEXTTOKEN;
1295cdf0e10cSrcweir                             nNextCh = GetNextChar();
1296cdf0e10cSrcweir                             bNextCh = sal_False;
1297cdf0e10cSrcweir                             break;
1298cdf0e10cSrcweir                         }
1299cdf0e10cSrcweir                         if( SVPAR_PENDING == eState )
1300cdf0e10cSrcweir                             bReadNextChar = bReadNextCharSave;
1301cdf0e10cSrcweir                         aToken.Erase();
1302cdf0e10cSrcweir                     }
1303cdf0e10cSrcweir                     else if( '%' == nNextCh )
1304cdf0e10cSrcweir                     {
1305cdf0e10cSrcweir                         nRet = HTML_UNKNOWNCONTROL_ON;
1306cdf0e10cSrcweir 
1307cdf0e10cSrcweir                         sal_uLong nCStreamPos = rInput.Tell();
1308cdf0e10cSrcweir                         sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1309cdf0e10cSrcweir 
1310cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1311cdf0e10cSrcweir                         // bis zum schliessenden %> lesen. wenn keins gefunden
1312cdf0e10cSrcweir                         // wurde beim der ersten > wieder aufsetzen
1313cdf0e10cSrcweir                         while( !bDone && !rInput.IsEof() && IsParserWorking() )
1314cdf0e10cSrcweir                         {
1315cdf0e10cSrcweir                             bDone = '>'==nNextCh && aToken.Len() >= 1 &&
1316cdf0e10cSrcweir                                     '%' == aToken.GetChar( aToken.Len()-1 );
1317cdf0e10cSrcweir                             if( !bDone )
1318cdf0e10cSrcweir                             {
1319cdf0e10cSrcweir                                 aToken += nNextCh;
1320cdf0e10cSrcweir                                 nNextCh = GetNextChar();
1321cdf0e10cSrcweir                             }
1322cdf0e10cSrcweir                         }
1323cdf0e10cSrcweir                         if( !bDone && IsParserWorking() )
1324cdf0e10cSrcweir                         {
1325cdf0e10cSrcweir                             rInput.Seek( nCStreamPos );
1326cdf0e10cSrcweir                             SetLineNr( nCLineNr );
1327cdf0e10cSrcweir                             SetLinePos( nCLinePos );
1328cdf0e10cSrcweir                             ClearTxtConvContext();
1329cdf0e10cSrcweir                             aToken.AssignAscii( "<%", 2 );
1330cdf0e10cSrcweir                             nRet = HTML_TEXTTOKEN;
1331cdf0e10cSrcweir                             break;
1332cdf0e10cSrcweir                         }
1333cdf0e10cSrcweir                         if( IsParserWorking() )
1334cdf0e10cSrcweir                         {
1335cdf0e10cSrcweir                             sSaveToken = aToken;
1336cdf0e10cSrcweir                             aToken.Erase();
1337cdf0e10cSrcweir                         }
1338cdf0e10cSrcweir                     }
1339cdf0e10cSrcweir                     else
1340cdf0e10cSrcweir                     {
1341cdf0e10cSrcweir                         aToken = '<';
1342cdf0e10cSrcweir                         nRet = HTML_TEXTTOKEN;
1343cdf0e10cSrcweir                         bNextCh = sal_False;
1344cdf0e10cSrcweir                         break;
1345cdf0e10cSrcweir                     }
1346cdf0e10cSrcweir                 }
1347cdf0e10cSrcweir 
1348cdf0e10cSrcweir                 if( IsParserWorking() )
1349cdf0e10cSrcweir                 {
1350cdf0e10cSrcweir                     bNextCh = '>' == nNextCh;
1351cdf0e10cSrcweir                     switch( nRet )
1352cdf0e10cSrcweir                     {
1353cdf0e10cSrcweir                     case HTML_TEXTAREA_ON:
1354cdf0e10cSrcweir                         bReadTextArea = sal_True;
1355cdf0e10cSrcweir                         break;
1356cdf0e10cSrcweir                     case HTML_TEXTAREA_OFF:
1357cdf0e10cSrcweir                         bReadTextArea = sal_False;
1358cdf0e10cSrcweir                         break;
1359cdf0e10cSrcweir                     case HTML_SCRIPT_ON:
1360cdf0e10cSrcweir                         if( !bReadTextArea )
1361cdf0e10cSrcweir                             bReadScript = sal_True;
1362cdf0e10cSrcweir                         break;
1363cdf0e10cSrcweir                     case HTML_SCRIPT_OFF:
1364cdf0e10cSrcweir                         if( !bReadTextArea )
1365cdf0e10cSrcweir                         {
1366cdf0e10cSrcweir                             bReadScript = sal_False;
1367cdf0e10cSrcweir                             // JavaScript kann den Stream veraendern
1368cdf0e10cSrcweir                             // also muss das letzte Zeichen nochmals
1369cdf0e10cSrcweir                             // gelesen werden
1370cdf0e10cSrcweir                             bReadNextChar = sal_True;
1371cdf0e10cSrcweir                             bNextCh = sal_False;
1372cdf0e10cSrcweir                         }
1373cdf0e10cSrcweir                         break;
1374cdf0e10cSrcweir 
1375cdf0e10cSrcweir                     case HTML_STYLE_ON:
1376cdf0e10cSrcweir                         bReadStyle = sal_True;
1377cdf0e10cSrcweir                         break;
1378cdf0e10cSrcweir                     case HTML_STYLE_OFF:
1379cdf0e10cSrcweir                         bReadStyle = sal_False;
1380cdf0e10cSrcweir                         break;
1381cdf0e10cSrcweir                     }
1382cdf0e10cSrcweir 
1383cdf0e10cSrcweir                 }
1384cdf0e10cSrcweir             }
1385cdf0e10cSrcweir             break;
1386cdf0e10cSrcweir 
1387cdf0e10cSrcweir         case sal_Unicode(EOF):
1388cdf0e10cSrcweir             if( rInput.IsEof() )
1389cdf0e10cSrcweir             {
1390cdf0e10cSrcweir                 eState = SVPAR_ACCEPTED;
1391cdf0e10cSrcweir                 nRet = nNextCh;
1392cdf0e10cSrcweir             }
1393cdf0e10cSrcweir             else
1394cdf0e10cSrcweir             {
1395cdf0e10cSrcweir                 // normalen Text lesen
1396cdf0e10cSrcweir                 goto scan_text;
1397cdf0e10cSrcweir             }
1398cdf0e10cSrcweir             break;
1399cdf0e10cSrcweir 
1400cdf0e10cSrcweir         case '\f':
1401cdf0e10cSrcweir             // Form-Feeds werden jetzt extra nach oben gereicht
1402cdf0e10cSrcweir             nRet = HTML_LINEFEEDCHAR; // !!! eigentlich FORMFEEDCHAR
1403cdf0e10cSrcweir             break;
1404cdf0e10cSrcweir 
1405cdf0e10cSrcweir         case '\n':
1406cdf0e10cSrcweir         case '\r':
1407cdf0e10cSrcweir             if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1408cdf0e10cSrcweir             {
1409cdf0e10cSrcweir                 sal_Unicode c = GetNextChar();
1410cdf0e10cSrcweir                 if( ( '\n' != nNextCh || '\r' != c ) &&
1411cdf0e10cSrcweir                     ( '\r' != nNextCh || '\n' != c ) )
1412cdf0e10cSrcweir                 {
1413cdf0e10cSrcweir                     bNextCh = sal_False;
1414cdf0e10cSrcweir                     nNextCh = c;
1415cdf0e10cSrcweir                 }
1416cdf0e10cSrcweir                 nRet = HTML_NEWPARA;
1417cdf0e10cSrcweir                 break;
1418cdf0e10cSrcweir             }
1419cdf0e10cSrcweir             // kein break !
1420cdf0e10cSrcweir         case '\t':
1421cdf0e10cSrcweir             if( bReadPRE )
1422cdf0e10cSrcweir             {
1423cdf0e10cSrcweir                 nRet = HTML_TABCHAR;
1424cdf0e10cSrcweir                 break;
1425cdf0e10cSrcweir             }
1426cdf0e10cSrcweir             // kein break !
1427cdf0e10cSrcweir         case ' ':
1428cdf0e10cSrcweir             // kein break !
1429cdf0e10cSrcweir         default:
1430cdf0e10cSrcweir 
1431cdf0e10cSrcweir scan_text:
1432cdf0e10cSrcweir             // es folgt "normaler" Text
1433cdf0e10cSrcweir             nRet = ScanText();
1434cdf0e10cSrcweir             bNextCh = 0 == aToken.Len();
1435cdf0e10cSrcweir 
1436cdf0e10cSrcweir             // der Text sollte noch verarbeitet werden
1437cdf0e10cSrcweir             if( !bNextCh && eState == SVPAR_PENDING )
1438cdf0e10cSrcweir             {
1439cdf0e10cSrcweir                 eState = SVPAR_WORKING;
1440cdf0e10cSrcweir                 bReadNextChar = sal_True;
1441cdf0e10cSrcweir             }
1442cdf0e10cSrcweir 
1443cdf0e10cSrcweir             break;
1444cdf0e10cSrcweir         }
1445cdf0e10cSrcweir 
1446cdf0e10cSrcweir         if( bNextCh && SVPAR_WORKING == eState )
1447cdf0e10cSrcweir         {
1448cdf0e10cSrcweir             nNextCh = GetNextChar();
1449cdf0e10cSrcweir             if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
1450cdf0e10cSrcweir             {
1451cdf0e10cSrcweir                 bReadNextChar = sal_True;
1452cdf0e10cSrcweir                 eState = SVPAR_WORKING;
1453cdf0e10cSrcweir             }
1454cdf0e10cSrcweir         }
1455cdf0e10cSrcweir 
1456cdf0e10cSrcweir     } while( !nRet && SVPAR_WORKING == eState );
1457cdf0e10cSrcweir 
1458cdf0e10cSrcweir     if( SVPAR_PENDING == eState )
1459cdf0e10cSrcweir         nRet = -1;      // irgendwas ungueltiges
1460cdf0e10cSrcweir 
1461cdf0e10cSrcweir     return nRet;
1462cdf0e10cSrcweir }
1463cdf0e10cSrcweir 
1464cdf0e10cSrcweir void HTMLParser::UnescapeToken()
1465cdf0e10cSrcweir {
1466cdf0e10cSrcweir     xub_StrLen nPos=0;
1467cdf0e10cSrcweir 
1468cdf0e10cSrcweir     sal_Bool bEscape = sal_False;
1469cdf0e10cSrcweir     while( nPos < aToken.Len() )
1470cdf0e10cSrcweir     {
1471cdf0e10cSrcweir         sal_Bool bOldEscape = bEscape;
1472cdf0e10cSrcweir         bEscape = sal_False;
1473cdf0e10cSrcweir         if( '\\'==aToken.GetChar(nPos) && !bOldEscape )
1474cdf0e10cSrcweir         {
1475cdf0e10cSrcweir             aToken.Erase( nPos, 1 );
1476cdf0e10cSrcweir             bEscape = sal_True;
1477cdf0e10cSrcweir         }
1478cdf0e10cSrcweir         else
1479cdf0e10cSrcweir         {
1480cdf0e10cSrcweir             nPos++;
1481cdf0e10cSrcweir         }
1482cdf0e10cSrcweir     }
1483cdf0e10cSrcweir }
1484cdf0e10cSrcweir 
1485cdf0e10cSrcweir // hole die Optionen
1486cdf0e10cSrcweir const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
1487cdf0e10cSrcweir {
1488cdf0e10cSrcweir     // wenn die Option fuer das aktuelle Token schon einmal
1489cdf0e10cSrcweir     // geholt wurden, geben wir sie noch einmal zurueck
1490cdf0e10cSrcweir     if( pOptions->Count() )
1491cdf0e10cSrcweir         return pOptions;
1492cdf0e10cSrcweir 
1493cdf0e10cSrcweir     xub_StrLen nPos = 0;
1494cdf0e10cSrcweir     while( nPos < aToken.Len() )
1495cdf0e10cSrcweir     {
1496cdf0e10cSrcweir         // ein Zeichen ? Dann faengt hier eine Option an
1497cdf0e10cSrcweir         if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
1498cdf0e10cSrcweir         {
1499cdf0e10cSrcweir             int nToken;
1500cdf0e10cSrcweir             String aValue;
1501cdf0e10cSrcweir             xub_StrLen nStt = nPos;
1502cdf0e10cSrcweir             sal_Unicode cChar = 0;
1503cdf0e10cSrcweir 
1504cdf0e10cSrcweir             // Eigentlich sind hier nur ganz bestimmte Zeichen erlaubt.
1505cdf0e10cSrcweir             // Netscape achtet aber nur auf "=" und Leerzeichen (siehe
1506cdf0e10cSrcweir             // Mozilla: PA_FetchRequestedNameValues in
1507cdf0e10cSrcweir             // lipparse/pa_mdl.c
1508cdf0e10cSrcweir //          while( nPos < aToken.Len() &&
1509cdf0e10cSrcweir //                  ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) )
1510cdf0e10cSrcweir             while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
1511cdf0e10cSrcweir                    HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
1512cdf0e10cSrcweir                 nPos++;
1513cdf0e10cSrcweir 
1514cdf0e10cSrcweir             String sName( aToken.Copy( nStt, nPos-nStt ) );
1515cdf0e10cSrcweir 
1516cdf0e10cSrcweir //JP 23.03.97: die PlugIns wollen die TokenName im "Original" haben
1517cdf0e10cSrcweir //              also nur fuers Suchen in UpperCase wandeln
1518cdf0e10cSrcweir             String sNameUpperCase( sName );
1519cdf0e10cSrcweir             sNameUpperCase.ToUpperAscii();
1520cdf0e10cSrcweir 
1521cdf0e10cSrcweir             nToken = GetHTMLOption( sNameUpperCase ); // der Name ist fertig
1522cdf0e10cSrcweir             DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
1523cdf0e10cSrcweir                         "GetOption: unbekannte HTML-Option" );
1524cdf0e10cSrcweir             sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
1525cdf0e10cSrcweir                                nToken >= HTML_OPTION_SCRIPT_END) &&
1526cdf0e10cSrcweir                               (!pNoConvertToken || nToken != *pNoConvertToken);
1527cdf0e10cSrcweir 
1528cdf0e10cSrcweir             while( nPos < aToken.Len() &&
1529cdf0e10cSrcweir                    ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1530cdf0e10cSrcweir                      HTML_ISSPACE(cChar) ) )
1531cdf0e10cSrcweir                 nPos++;
1532cdf0e10cSrcweir 
1533cdf0e10cSrcweir             // hat die Option auch einen Wert?
1534cdf0e10cSrcweir             if( nPos!=aToken.Len() && '='==cChar )
1535cdf0e10cSrcweir             {
1536cdf0e10cSrcweir                 nPos++;
1537cdf0e10cSrcweir 
1538cdf0e10cSrcweir                 while( nPos < aToken.Len() &&
1539cdf0e10cSrcweir                         ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1540cdf0e10cSrcweir                           ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
1541cdf0e10cSrcweir                     nPos++;
1542cdf0e10cSrcweir 
1543cdf0e10cSrcweir                 if( nPos != aToken.Len() )
1544cdf0e10cSrcweir                 {
1545cdf0e10cSrcweir                     xub_StrLen nLen = 0;
1546cdf0e10cSrcweir                     nStt = nPos;
1547cdf0e10cSrcweir                     if( ('"'==cChar) || ('\'')==cChar )
1548cdf0e10cSrcweir                     {
1549cdf0e10cSrcweir                         sal_Unicode cEnd = cChar;
1550cdf0e10cSrcweir                         nPos++; nStt++;
1551cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1552cdf0e10cSrcweir                         sal_Bool bEscape = sal_False;
1553cdf0e10cSrcweir                         while( nPos < aToken.Len() && !bDone )
1554cdf0e10cSrcweir                         {
1555cdf0e10cSrcweir                             sal_Bool bOldEscape = bEscape;
1556cdf0e10cSrcweir                             bEscape = sal_False;
1557cdf0e10cSrcweir                             cChar = aToken.GetChar(nPos);
1558cdf0e10cSrcweir                             switch( cChar )
1559cdf0e10cSrcweir                             {
1560cdf0e10cSrcweir                             case '\r':
1561cdf0e10cSrcweir                             case '\n':
1562cdf0e10cSrcweir                                 if( bStripCRLF )
1563cdf0e10cSrcweir                                     ((String &)aToken).Erase( nPos, 1 );
1564cdf0e10cSrcweir                                 else
1565cdf0e10cSrcweir                                     nPos++, nLen++;
1566cdf0e10cSrcweir                                 break;
1567cdf0e10cSrcweir                             case '\\':
1568cdf0e10cSrcweir                                 if( bOldEscape )
1569cdf0e10cSrcweir                                 {
1570cdf0e10cSrcweir                                     nPos++, nLen++;
1571cdf0e10cSrcweir                                 }
1572cdf0e10cSrcweir                                 else
1573cdf0e10cSrcweir                                 {
1574cdf0e10cSrcweir                                     ((String &)aToken).Erase( nPos, 1 );
1575cdf0e10cSrcweir                                     bEscape = sal_True;
1576cdf0e10cSrcweir                                 }
1577cdf0e10cSrcweir                                 break;
1578cdf0e10cSrcweir                             case '"':
1579cdf0e10cSrcweir                             case '\'':
1580cdf0e10cSrcweir                                 bDone = !bOldEscape && cChar==cEnd;
1581cdf0e10cSrcweir                                 if( !bDone )
1582cdf0e10cSrcweir                                     nPos++, nLen++;
1583cdf0e10cSrcweir                                 break;
1584cdf0e10cSrcweir                             default:
1585cdf0e10cSrcweir                                 nPos++, nLen++;
1586cdf0e10cSrcweir                                 break;
1587cdf0e10cSrcweir                             }
1588cdf0e10cSrcweir                         }
1589cdf0e10cSrcweir                         if( nPos!=aToken.Len() )
1590cdf0e10cSrcweir                             nPos++;
1591cdf0e10cSrcweir                     }
1592cdf0e10cSrcweir                     else
1593cdf0e10cSrcweir                     {
1594cdf0e10cSrcweir                         // hier sind wir etwas laxer als der
1595cdf0e10cSrcweir                         // Standard und erlauben alles druckbare
1596cdf0e10cSrcweir                         sal_Bool bEscape = sal_False;
1597cdf0e10cSrcweir                         sal_Bool bDone = sal_False;
1598cdf0e10cSrcweir                         while( nPos < aToken.Len() && !bDone )
1599cdf0e10cSrcweir                         {
1600cdf0e10cSrcweir                             sal_Bool bOldEscape = bEscape;
1601cdf0e10cSrcweir                             bEscape = sal_False;
1602cdf0e10cSrcweir                             sal_Unicode c = aToken.GetChar(nPos);
1603cdf0e10cSrcweir                             switch( c )
1604cdf0e10cSrcweir                             {
1605cdf0e10cSrcweir                             case ' ':
1606cdf0e10cSrcweir                                 bDone = !bOldEscape;
1607cdf0e10cSrcweir                                 if( !bDone )
1608cdf0e10cSrcweir                                     nPos++, nLen++;
1609cdf0e10cSrcweir                                 break;
1610cdf0e10cSrcweir 
1611cdf0e10cSrcweir                             case '\t':
1612cdf0e10cSrcweir                             case '\r':
1613cdf0e10cSrcweir                             case '\n':
1614cdf0e10cSrcweir                                 bDone = sal_True;
1615cdf0e10cSrcweir                                 break;
1616cdf0e10cSrcweir 
1617cdf0e10cSrcweir                             case '\\':
1618cdf0e10cSrcweir                                 if( bOldEscape )
1619cdf0e10cSrcweir                                 {
1620cdf0e10cSrcweir                                     nPos++, nLen++;
1621cdf0e10cSrcweir                                 }
1622cdf0e10cSrcweir                                 else
1623cdf0e10cSrcweir                                 {
1624cdf0e10cSrcweir                                     ((String &)aToken).Erase( nPos, 1 );
1625cdf0e10cSrcweir                                     bEscape = sal_True;
1626cdf0e10cSrcweir                                 }
1627cdf0e10cSrcweir                                 break;
1628cdf0e10cSrcweir 
1629cdf0e10cSrcweir                             default:
1630cdf0e10cSrcweir                                 if( HTML_ISPRINTABLE( c ) )
1631cdf0e10cSrcweir                                     nPos++, nLen++;
1632cdf0e10cSrcweir                                 else
1633cdf0e10cSrcweir                                     bDone = sal_True;
1634cdf0e10cSrcweir                                 break;
1635cdf0e10cSrcweir                             }
1636cdf0e10cSrcweir                         }
1637cdf0e10cSrcweir                     }
1638cdf0e10cSrcweir 
1639cdf0e10cSrcweir                     if( nLen )
1640cdf0e10cSrcweir                         aValue = aToken.Copy( nStt, nLen );
1641cdf0e10cSrcweir                 }
1642cdf0e10cSrcweir             }
1643cdf0e10cSrcweir 
1644cdf0e10cSrcweir             // Wir kennen das Token und koennen es Speichern
1645cdf0e10cSrcweir             HTMLOption *pOption =
1646cdf0e10cSrcweir                 new HTMLOption(
1647cdf0e10cSrcweir                     sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue );
1648cdf0e10cSrcweir 
1649cdf0e10cSrcweir             pOptions->Insert( pOption, pOptions->Count() );
1650cdf0e10cSrcweir 
1651cdf0e10cSrcweir         }
1652cdf0e10cSrcweir         else
1653cdf0e10cSrcweir             // white space un unerwartete Zeichen ignorieren wie
1654cdf0e10cSrcweir             nPos++;
1655cdf0e10cSrcweir     }
1656cdf0e10cSrcweir 
1657cdf0e10cSrcweir     return pOptions;
1658cdf0e10cSrcweir }
1659cdf0e10cSrcweir 
1660cdf0e10cSrcweir int HTMLParser::FilterPRE( int nToken )
1661cdf0e10cSrcweir {
1662cdf0e10cSrcweir     switch( nToken )
1663cdf0e10cSrcweir     {
1664cdf0e10cSrcweir #ifdef HTML_BEHAVIOUR
1665cdf0e10cSrcweir     // diese werden laut Definition zu LFs
1666cdf0e10cSrcweir     case HTML_PARABREAK_ON:
1667cdf0e10cSrcweir     case HTML_LINEBREAK:
1668cdf0e10cSrcweir         nToken = HTML_NEWPARA;
1669cdf0e10cSrcweir #else
1670cdf0e10cSrcweir     // in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung
1671cdf0e10cSrcweir     case HTML_PARABREAK_ON:
1672cdf0e10cSrcweir         nToken = HTML_LINEBREAK;
1673cdf0e10cSrcweir     case HTML_LINEBREAK:
1674cdf0e10cSrcweir #endif
1675cdf0e10cSrcweir     case HTML_NEWPARA:
1676cdf0e10cSrcweir         nPre_LinePos = 0;
1677cdf0e10cSrcweir         if( bPre_IgnoreNewPara )
1678cdf0e10cSrcweir             nToken = 0;
1679cdf0e10cSrcweir         break;
1680cdf0e10cSrcweir 
1681cdf0e10cSrcweir     case HTML_TABCHAR:
1682cdf0e10cSrcweir         {
1683cdf0e10cSrcweir             xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >(
1684cdf0e10cSrcweir                 8 - (nPre_LinePos % 8));
1685cdf0e10cSrcweir             DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?" );
1686cdf0e10cSrcweir             aToken.Expand( nSpaces, ' ' );
1687cdf0e10cSrcweir             nPre_LinePos += nSpaces;
1688cdf0e10cSrcweir             nToken = HTML_TEXTTOKEN;
1689cdf0e10cSrcweir         }
1690cdf0e10cSrcweir         break;
1691cdf0e10cSrcweir     // diese bleiben erhalten
1692cdf0e10cSrcweir     case HTML_TEXTTOKEN:
1693cdf0e10cSrcweir         nPre_LinePos += aToken.Len();
1694cdf0e10cSrcweir         break;
1695cdf0e10cSrcweir 
1696cdf0e10cSrcweir     case HTML_SELECT_ON:
1697cdf0e10cSrcweir     case HTML_SELECT_OFF:
1698cdf0e10cSrcweir     case HTML_BODY_ON:
1699cdf0e10cSrcweir     case HTML_FORM_ON:
1700cdf0e10cSrcweir     case HTML_FORM_OFF:
1701cdf0e10cSrcweir     case HTML_INPUT:
1702cdf0e10cSrcweir     case HTML_OPTION:
1703cdf0e10cSrcweir     case HTML_TEXTAREA_ON:
1704cdf0e10cSrcweir     case HTML_TEXTAREA_OFF:
1705cdf0e10cSrcweir 
1706cdf0e10cSrcweir     case HTML_IMAGE:
1707cdf0e10cSrcweir     case HTML_APPLET_ON:
1708cdf0e10cSrcweir     case HTML_APPLET_OFF:
1709cdf0e10cSrcweir     case HTML_PARAM:
1710cdf0e10cSrcweir     case HTML_EMBED:
1711cdf0e10cSrcweir 
1712cdf0e10cSrcweir     case HTML_HEAD1_ON:
1713cdf0e10cSrcweir     case HTML_HEAD1_OFF:
1714cdf0e10cSrcweir     case HTML_HEAD2_ON:
1715cdf0e10cSrcweir     case HTML_HEAD2_OFF:
1716cdf0e10cSrcweir     case HTML_HEAD3_ON:
1717cdf0e10cSrcweir     case HTML_HEAD3_OFF:
1718cdf0e10cSrcweir     case HTML_HEAD4_ON:
1719cdf0e10cSrcweir     case HTML_HEAD4_OFF:
1720cdf0e10cSrcweir     case HTML_HEAD5_ON:
1721cdf0e10cSrcweir     case HTML_HEAD5_OFF:
1722cdf0e10cSrcweir     case HTML_HEAD6_ON:
1723cdf0e10cSrcweir     case HTML_HEAD6_OFF:
1724cdf0e10cSrcweir     case HTML_BLOCKQUOTE_ON:
1725cdf0e10cSrcweir     case HTML_BLOCKQUOTE_OFF:
1726cdf0e10cSrcweir     case HTML_ADDRESS_ON:
1727cdf0e10cSrcweir     case HTML_ADDRESS_OFF:
1728cdf0e10cSrcweir     case HTML_HORZRULE:
1729cdf0e10cSrcweir 
1730cdf0e10cSrcweir     case HTML_CENTER_ON:
1731cdf0e10cSrcweir     case HTML_CENTER_OFF:
1732cdf0e10cSrcweir     case HTML_DIVISION_ON:
1733cdf0e10cSrcweir     case HTML_DIVISION_OFF:
1734cdf0e10cSrcweir 
1735cdf0e10cSrcweir     case HTML_SCRIPT_ON:
1736cdf0e10cSrcweir     case HTML_SCRIPT_OFF:
1737cdf0e10cSrcweir     case HTML_RAWDATA:
1738cdf0e10cSrcweir 
1739cdf0e10cSrcweir     case HTML_TABLE_ON:
1740cdf0e10cSrcweir     case HTML_TABLE_OFF:
1741cdf0e10cSrcweir     case HTML_CAPTION_ON:
1742cdf0e10cSrcweir     case HTML_CAPTION_OFF:
1743cdf0e10cSrcweir     case HTML_COLGROUP_ON:
1744cdf0e10cSrcweir     case HTML_COLGROUP_OFF:
1745cdf0e10cSrcweir     case HTML_COL_ON:
1746cdf0e10cSrcweir     case HTML_COL_OFF:
1747cdf0e10cSrcweir     case HTML_THEAD_ON:
1748cdf0e10cSrcweir     case HTML_THEAD_OFF:
1749cdf0e10cSrcweir     case HTML_TFOOT_ON:
1750cdf0e10cSrcweir     case HTML_TFOOT_OFF:
1751cdf0e10cSrcweir     case HTML_TBODY_ON:
1752cdf0e10cSrcweir     case HTML_TBODY_OFF:
1753cdf0e10cSrcweir     case HTML_TABLEROW_ON:
1754cdf0e10cSrcweir     case HTML_TABLEROW_OFF:
1755cdf0e10cSrcweir     case HTML_TABLEDATA_ON:
1756cdf0e10cSrcweir     case HTML_TABLEDATA_OFF:
1757cdf0e10cSrcweir     case HTML_TABLEHEADER_ON:
1758cdf0e10cSrcweir     case HTML_TABLEHEADER_OFF:
1759cdf0e10cSrcweir 
1760cdf0e10cSrcweir     case HTML_ANCHOR_ON:
1761cdf0e10cSrcweir     case HTML_ANCHOR_OFF:
1762cdf0e10cSrcweir     case HTML_BOLD_ON:
1763cdf0e10cSrcweir     case HTML_BOLD_OFF:
1764cdf0e10cSrcweir     case HTML_ITALIC_ON:
1765cdf0e10cSrcweir     case HTML_ITALIC_OFF:
1766cdf0e10cSrcweir     case HTML_STRIKE_ON:
1767cdf0e10cSrcweir     case HTML_STRIKE_OFF:
1768cdf0e10cSrcweir     case HTML_STRIKETHROUGH_ON:
1769cdf0e10cSrcweir     case HTML_STRIKETHROUGH_OFF:
1770cdf0e10cSrcweir     case HTML_UNDERLINE_ON:
1771cdf0e10cSrcweir     case HTML_UNDERLINE_OFF:
1772cdf0e10cSrcweir     case HTML_BASEFONT_ON:
1773cdf0e10cSrcweir     case HTML_BASEFONT_OFF:
1774cdf0e10cSrcweir     case HTML_FONT_ON:
1775cdf0e10cSrcweir     case HTML_FONT_OFF:
1776cdf0e10cSrcweir     case HTML_BLINK_ON:
1777cdf0e10cSrcweir     case HTML_BLINK_OFF:
1778cdf0e10cSrcweir     case HTML_SPAN_ON:
1779cdf0e10cSrcweir     case HTML_SPAN_OFF:
1780cdf0e10cSrcweir     case HTML_SUBSCRIPT_ON:
1781cdf0e10cSrcweir     case HTML_SUBSCRIPT_OFF:
1782cdf0e10cSrcweir     case HTML_SUPERSCRIPT_ON:
1783cdf0e10cSrcweir     case HTML_SUPERSCRIPT_OFF:
1784cdf0e10cSrcweir     case HTML_BIGPRINT_ON:
1785cdf0e10cSrcweir     case HTML_BIGPRINT_OFF:
1786cdf0e10cSrcweir     case HTML_SMALLPRINT_OFF:
1787cdf0e10cSrcweir     case HTML_SMALLPRINT_ON:
1788cdf0e10cSrcweir 
1789cdf0e10cSrcweir     case HTML_EMPHASIS_ON:
1790cdf0e10cSrcweir     case HTML_EMPHASIS_OFF:
1791cdf0e10cSrcweir     case HTML_CITIATION_ON:
1792cdf0e10cSrcweir     case HTML_CITIATION_OFF:
1793cdf0e10cSrcweir     case HTML_STRONG_ON:
1794cdf0e10cSrcweir     case HTML_STRONG_OFF:
1795cdf0e10cSrcweir     case HTML_CODE_ON:
1796cdf0e10cSrcweir     case HTML_CODE_OFF:
1797cdf0e10cSrcweir     case HTML_SAMPLE_ON:
1798cdf0e10cSrcweir     case HTML_SAMPLE_OFF:
1799cdf0e10cSrcweir     case HTML_KEYBOARD_ON:
1800cdf0e10cSrcweir     case HTML_KEYBOARD_OFF:
1801cdf0e10cSrcweir     case HTML_VARIABLE_ON:
1802cdf0e10cSrcweir     case HTML_VARIABLE_OFF:
1803cdf0e10cSrcweir     case HTML_DEFINSTANCE_ON:
1804cdf0e10cSrcweir     case HTML_DEFINSTANCE_OFF:
1805cdf0e10cSrcweir     case HTML_SHORTQUOTE_ON:
1806cdf0e10cSrcweir     case HTML_SHORTQUOTE_OFF:
1807cdf0e10cSrcweir     case HTML_LANGUAGE_ON:
1808cdf0e10cSrcweir     case HTML_LANGUAGE_OFF:
1809cdf0e10cSrcweir     case HTML_AUTHOR_ON:
1810cdf0e10cSrcweir     case HTML_AUTHOR_OFF:
1811cdf0e10cSrcweir     case HTML_PERSON_ON:
1812cdf0e10cSrcweir     case HTML_PERSON_OFF:
1813cdf0e10cSrcweir     case HTML_ACRONYM_ON:
1814cdf0e10cSrcweir     case HTML_ACRONYM_OFF:
1815cdf0e10cSrcweir     case HTML_ABBREVIATION_ON:
1816cdf0e10cSrcweir     case HTML_ABBREVIATION_OFF:
1817cdf0e10cSrcweir     case HTML_INSERTEDTEXT_ON:
1818cdf0e10cSrcweir     case HTML_INSERTEDTEXT_OFF:
1819cdf0e10cSrcweir     case HTML_DELETEDTEXT_ON:
1820cdf0e10cSrcweir     case HTML_DELETEDTEXT_OFF:
1821cdf0e10cSrcweir     case HTML_TELETYPE_ON:
1822cdf0e10cSrcweir     case HTML_TELETYPE_OFF:
1823cdf0e10cSrcweir 
1824cdf0e10cSrcweir         break;
1825cdf0e10cSrcweir 
1826cdf0e10cSrcweir     // der Rest wird als unbekanntes Token behandelt
1827cdf0e10cSrcweir     default:
1828cdf0e10cSrcweir         if( nToken )
1829cdf0e10cSrcweir         {
1830cdf0e10cSrcweir             nToken =
1831cdf0e10cSrcweir                 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1832cdf0e10cSrcweir                     ? HTML_UNKNOWNCONTROL_OFF
1833cdf0e10cSrcweir                     : HTML_UNKNOWNCONTROL_ON );
1834cdf0e10cSrcweir         }
1835cdf0e10cSrcweir         break;
1836cdf0e10cSrcweir     }
1837cdf0e10cSrcweir 
1838cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
1839cdf0e10cSrcweir 
1840cdf0e10cSrcweir     return nToken;
1841cdf0e10cSrcweir }
1842cdf0e10cSrcweir 
1843cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken )
1844cdf0e10cSrcweir {
1845cdf0e10cSrcweir     switch( nToken )
1846cdf0e10cSrcweir     {
1847cdf0e10cSrcweir     case HTML_NEWPARA:
1848cdf0e10cSrcweir         if( bPre_IgnoreNewPara )
1849cdf0e10cSrcweir             nToken = 0;
1850cdf0e10cSrcweir     case HTML_TEXTTOKEN:
1851cdf0e10cSrcweir     case HTML_NONBREAKSPACE:
1852cdf0e10cSrcweir     case HTML_SOFTHYPH:
1853cdf0e10cSrcweir         break;              // bleiben erhalten
1854cdf0e10cSrcweir 
1855cdf0e10cSrcweir     default:
1856cdf0e10cSrcweir         if( nToken )
1857cdf0e10cSrcweir         {
1858cdf0e10cSrcweir             if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
1859cdf0e10cSrcweir             {
1860cdf0e10cSrcweir                 sSaveToken.Insert( '<', 0 );
1861cdf0e10cSrcweir                 sSaveToken.Insert( '/', 1 );
1862cdf0e10cSrcweir             }
1863cdf0e10cSrcweir             else
1864cdf0e10cSrcweir                 sSaveToken.Insert( '<', 0 );
1865cdf0e10cSrcweir             if( aToken.Len() )
1866cdf0e10cSrcweir             {
1867cdf0e10cSrcweir                 UnescapeToken();
1868cdf0e10cSrcweir                 sSaveToken += (sal_Unicode)' ';
1869cdf0e10cSrcweir                 aToken.Insert( sSaveToken, 0 );
1870cdf0e10cSrcweir             }
1871cdf0e10cSrcweir             else
1872cdf0e10cSrcweir                 aToken = sSaveToken;
1873cdf0e10cSrcweir             aToken += (sal_Unicode)'>';
1874cdf0e10cSrcweir             nToken = HTML_TEXTTOKEN;
1875cdf0e10cSrcweir         }
1876cdf0e10cSrcweir         break;
1877cdf0e10cSrcweir     }
1878cdf0e10cSrcweir 
1879cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
1880cdf0e10cSrcweir 
1881cdf0e10cSrcweir     return nToken;
1882cdf0e10cSrcweir }
1883cdf0e10cSrcweir 
1884cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken )
1885cdf0e10cSrcweir {
1886cdf0e10cSrcweir     switch( nToken )
1887cdf0e10cSrcweir     {
1888cdf0e10cSrcweir     case HTML_NEWPARA:
1889cdf0e10cSrcweir         if( bPre_IgnoreNewPara )
1890cdf0e10cSrcweir             nToken = 0;
1891cdf0e10cSrcweir     case HTML_TEXTTOKEN:
1892cdf0e10cSrcweir     case HTML_NONBREAKSPACE:
1893cdf0e10cSrcweir     case HTML_SOFTHYPH:
1894cdf0e10cSrcweir         break;      // bleiben erhalten
1895cdf0e10cSrcweir 
1896cdf0e10cSrcweir     default:
1897cdf0e10cSrcweir         if( nToken )
1898cdf0e10cSrcweir         {
1899cdf0e10cSrcweir             nToken =
1900cdf0e10cSrcweir                 ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1901cdf0e10cSrcweir                     ? HTML_UNKNOWNCONTROL_OFF
1902cdf0e10cSrcweir                     : HTML_UNKNOWNCONTROL_ON );
1903cdf0e10cSrcweir         }
1904cdf0e10cSrcweir         break;
1905cdf0e10cSrcweir     }
1906cdf0e10cSrcweir 
1907cdf0e10cSrcweir     bPre_IgnoreNewPara = sal_False;
1908cdf0e10cSrcweir 
1909cdf0e10cSrcweir     return nToken;
1910cdf0e10cSrcweir }
1911cdf0e10cSrcweir 
1912cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader,
1913cdf0e10cSrcweir                                    sal_Bool bSwitchToUCS2,
1914cdf0e10cSrcweir                                    rtl_TextEncoding eEnc )
1915cdf0e10cSrcweir {
1916cdf0e10cSrcweir     // Einer der folgenden regulaeren Ausdrucke muss sich auf den String
1917cdf0e10cSrcweir     // anwenden lassen, damit das Dok ein HTML-Dokument ist.
1918cdf0e10cSrcweir     //
1919cdf0e10cSrcweir     // ^[^<]*<[^ \t]*[> \t]
1920cdf0e10cSrcweir     //        -------
1921cdf0e10cSrcweir     // ^<!
1922cdf0e10cSrcweir     //
1923cdf0e10cSrcweir     // wobei der unterstrichene Teilausdruck einem HTML-Token
1924cdf0e10cSrcweir     // ensprechen muss
1925cdf0e10cSrcweir 
1926cdf0e10cSrcweir     ByteString sCmp;
1927cdf0e10cSrcweir     sal_Bool bUCS2B = sal_False;
1928cdf0e10cSrcweir     if( bSwitchToUCS2 )
1929cdf0e10cSrcweir     {
1930cdf0e10cSrcweir         if( 0xfeU == (sal_uChar)pHeader[0] &&
1931cdf0e10cSrcweir             0xffU == (sal_uChar)pHeader[1] )
1932cdf0e10cSrcweir         {
1933cdf0e10cSrcweir             eEnc = RTL_TEXTENCODING_UCS2;
1934cdf0e10cSrcweir             bUCS2B = sal_True;
1935cdf0e10cSrcweir         }
1936cdf0e10cSrcweir         else if( 0xffU == (sal_uChar)pHeader[0] &&
1937cdf0e10cSrcweir                  0xfeU == (sal_uChar)pHeader[1] )
1938cdf0e10cSrcweir         {
1939cdf0e10cSrcweir             eEnc = RTL_TEXTENCODING_UCS2;
1940cdf0e10cSrcweir         }
1941cdf0e10cSrcweir     }
1942cdf0e10cSrcweir     if
1943cdf0e10cSrcweir        (
1944cdf0e10cSrcweir         RTL_TEXTENCODING_UCS2 == eEnc &&
1945cdf0e10cSrcweir         (
1946cdf0e10cSrcweir          (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) ||
1947cdf0e10cSrcweir          (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1])
1948cdf0e10cSrcweir         )
1949cdf0e10cSrcweir        )
1950cdf0e10cSrcweir     {
1951cdf0e10cSrcweir         if( 0xfe == (sal_uChar)pHeader[0] )
1952cdf0e10cSrcweir             bUCS2B = sal_True;
1953cdf0e10cSrcweir 
1954cdf0e10cSrcweir         xub_StrLen nLen;
1955cdf0e10cSrcweir         for( nLen = 2;
1956cdf0e10cSrcweir              pHeader[nLen] != 0 || pHeader[nLen+1] != 0;
1957cdf0e10cSrcweir              nLen+=2 )
1958cdf0e10cSrcweir             ;
1959cdf0e10cSrcweir 
1960cdf0e10cSrcweir         ::rtl::OStringBuffer sTmp( (nLen - 2)/2 );
1961cdf0e10cSrcweir         for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 )
1962cdf0e10cSrcweir         {
1963cdf0e10cSrcweir             sal_Unicode cUC;
1964cdf0e10cSrcweir             if( bUCS2B )
1965cdf0e10cSrcweir                 cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1];
1966cdf0e10cSrcweir             else
1967cdf0e10cSrcweir                 cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos];
1968cdf0e10cSrcweir             if( 0U == cUC )
1969cdf0e10cSrcweir                 break;
1970cdf0e10cSrcweir 
1971cdf0e10cSrcweir             sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' );
1972cdf0e10cSrcweir         }
1973cdf0e10cSrcweir         sCmp = ByteString( sTmp.makeStringAndClear() );
1974cdf0e10cSrcweir     }
1975cdf0e10cSrcweir     else
1976cdf0e10cSrcweir     {
1977cdf0e10cSrcweir         sCmp = (sal_Char *)pHeader;
1978cdf0e10cSrcweir     }
1979cdf0e10cSrcweir 
1980cdf0e10cSrcweir     sCmp.ToUpperAscii();
1981cdf0e10cSrcweir 
1982cdf0e10cSrcweir     // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen
1983cdf0e10cSrcweir     xub_StrLen nStart = sCmp.Search( '<' );
1984cdf0e10cSrcweir     if( STRING_NOTFOUND  == nStart )
1985cdf0e10cSrcweir         return sal_False;
1986cdf0e10cSrcweir     nStart++;
1987cdf0e10cSrcweir 
1988cdf0e10cSrcweir     // danach duerfen beliebige andere Zeichen bis zu einem blank oder
1989cdf0e10cSrcweir     // '>' kommen
1990cdf0e10cSrcweir     sal_Char c;
1991cdf0e10cSrcweir     xub_StrLen nPos;
1992cdf0e10cSrcweir     for( nPos = nStart; nPos<sCmp.Len(); nPos++ )
1993cdf0e10cSrcweir     {
1994cdf0e10cSrcweir         if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) )
1995cdf0e10cSrcweir             break;
1996cdf0e10cSrcweir     }
1997cdf0e10cSrcweir 
1998cdf0e10cSrcweir     // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML
1999cdf0e10cSrcweir     if( nPos==nStart )
2000cdf0e10cSrcweir         return sal_False;
2001cdf0e10cSrcweir 
2002cdf0e10cSrcweir     // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes
2003cdf0e10cSrcweir     // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht
2004cdf0e10cSrcweir     // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML
2005cdf0e10cSrcweir     // interpretiert.
2006cdf0e10cSrcweir     String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US );
2007cdf0e10cSrcweir     int nTok = GetHTMLToken( sTest );
2008cdf0e10cSrcweir     if( 0 != nTok && HTML_DIRLIST_ON != nTok )
2009cdf0e10cSrcweir         return sal_True;
2010cdf0e10cSrcweir 
2011cdf0e10cSrcweir     // oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#)
2012cdf0e10cSrcweir     if( nStart == 1 && '!' == sCmp.GetChar( 1 ) )
2013cdf0e10cSrcweir         return sal_True;
2014cdf0e10cSrcweir 
2015cdf0e10cSrcweir     // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen
2016cdf0e10cSrcweir     nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html );
2017cdf0e10cSrcweir     if( nStart!=STRING_NOTFOUND &&
2018cdf0e10cSrcweir         nStart>0 && '<'==sCmp.GetChar(nStart-1) &&
2019cdf0e10cSrcweir         nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) )
2020cdf0e10cSrcweir         return sal_True;
2021cdf0e10cSrcweir 
2022cdf0e10cSrcweir     // sonst ist es wohl doch eher kein HTML-Dokument
2023cdf0e10cSrcweir     return sal_False;
2024cdf0e10cSrcweir }
2025cdf0e10cSrcweir 
2026cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL )
2027cdf0e10cSrcweir {
2028cdf0e10cSrcweir     if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) ||
2029cdf0e10cSrcweir         rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL )
2030cdf0e10cSrcweir         return sal_False;
2031cdf0e10cSrcweir 
2032cdf0e10cSrcweir     sal_Bool bFound = sal_False;
2033cdf0e10cSrcweir 
2034cdf0e10cSrcweir     if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL )
2035cdf0e10cSrcweir     {
2036cdf0e10cSrcweir         String aName( rURL.Copy(16) );
2037cdf0e10cSrcweir         switch( aName.GetChar(0) )
2038cdf0e10cSrcweir         {
2039cdf0e10cSrcweir         case 'b':
2040cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary );
2041cdf0e10cSrcweir             break;
2042cdf0e10cSrcweir         case 'i':
2043cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) ||
2044cdf0e10cSrcweir                      aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index );
2045cdf0e10cSrcweir             break;
2046cdf0e10cSrcweir         case 'm':
2047cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) ||
2048cdf0e10cSrcweir                      aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie );
2049cdf0e10cSrcweir             break;
2050cdf0e10cSrcweir         case 's':
2051cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound );
2052cdf0e10cSrcweir             break;
2053cdf0e10cSrcweir         case 't':
2054cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) ||
2055cdf0e10cSrcweir                      aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text );
2056cdf0e10cSrcweir             break;
2057cdf0e10cSrcweir         case 'u':
2058cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown );
2059cdf0e10cSrcweir             break;
2060cdf0e10cSrcweir         }
2061cdf0e10cSrcweir     }
2062cdf0e10cSrcweir     else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL )
2063cdf0e10cSrcweir     {
2064cdf0e10cSrcweir         String aName( rURL.Copy(14) );
2065cdf0e10cSrcweir         switch( aName.GetChar(0) )
2066cdf0e10cSrcweir         {
2067cdf0e10cSrcweir         case 'b':
2068cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata );
2069cdf0e10cSrcweir             break;
2070cdf0e10cSrcweir         case 'd':
2071cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed );
2072cdf0e10cSrcweir             break;
2073cdf0e10cSrcweir         case 'e':
2074cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed );
2075cdf0e10cSrcweir             break;
2076cdf0e10cSrcweir         case 'i':
2077cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure );
2078cdf0e10cSrcweir             break;
2079cdf0e10cSrcweir         case 'n':
2080cdf0e10cSrcweir             bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound );
2081cdf0e10cSrcweir             break;
2082cdf0e10cSrcweir         }
2083cdf0e10cSrcweir     }
2084cdf0e10cSrcweir     if( bFound )
2085cdf0e10cSrcweir     {
2086cdf0e10cSrcweir         String sTmp ( rURL );
2087cdf0e10cSrcweir         rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image );
2088cdf0e10cSrcweir         rURL.Append( sTmp );
2089cdf0e10cSrcweir     }
2090cdf0e10cSrcweir 
2091cdf0e10cSrcweir     return bFound;
2092cdf0e10cSrcweir }
2093cdf0e10cSrcweir 
#ifdef USED
// Only compiled when USED is defined.
// Both functions simply forward to the SvParser base class.

void HTMLParser::SaveState( int nToken )
{
    SvParser::SaveState( nToken );
}

void HTMLParser::RestoreState()
{
    SvParser::RestoreState();
}
#endif
2105cdf0e10cSrcweir 
2106cdf0e10cSrcweir 
// Actions selected by the NAME / HTTP-EQUIV option of a <META> tag.
// The values are stored in aHTMLMetaNameTable and evaluated in
// HTMLParser::ParseMetaOptionsImpl.
enum eHtmlMetas
{
    HTML_META_NONE = 0,         // initial value; content may become a user-defined property
    HTML_META_AUTHOR,           // content -> document author
    HTML_META_DESCRIPTION,      // content -> document description
    HTML_META_KEYWORDS,         // content -> document keywords
    HTML_META_REFRESH,          // no document property; only checked by an assertion
    HTML_META_CLASSIFICATION,   // content -> document subject
    HTML_META_CREATED,          // content -> creation date
    HTML_META_CHANGEDBY,        // content -> "modified by"
    HTML_META_CHANGED,          // content -> modification date
    HTML_META_GENERATOR,        // not evaluated by ParseMetaOptionsImpl
    HTML_META_SDFOOTNOTE,       // not evaluated by ParseMetaOptionsImpl
    HTML_META_SDENDNOTE,        // not evaluated by ParseMetaOptionsImpl
    HTML_META_CONTENT_TYPE      // content -> text encoding (charset parameter)
};
2122cdf0e10cSrcweir 
2123cdf0e10cSrcweir // <META NAME=xxx>
2124cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] =
2125cdf0e10cSrcweir {
2126cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_author,        HTML_META_AUTHOR        },
2127cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_changed,       HTML_META_CHANGED       },
2128cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_changedby,     HTML_META_CHANGEDBY     },
2129cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
2130cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_content_type,  HTML_META_CONTENT_TYPE  },
2131cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_created,       HTML_META_CREATED       },
2132cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_description,   HTML_META_DESCRIPTION   },
2133cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_keywords,      HTML_META_KEYWORDS      },
2134cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_generator,     HTML_META_GENERATOR     },
2135cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_refresh,       HTML_META_REFRESH       },
2136cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HTML_META_SDENDNOTE     },
2137cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HTML_META_SDFOOTNOTE    },
2138cdf0e10cSrcweir     { 0,                                          0                       }
2139cdf0e10cSrcweir };
2140cdf0e10cSrcweir 
2141cdf0e10cSrcweir 
2142cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & )
2143cdf0e10cSrcweir {
2144cdf0e10cSrcweir }
2145cdf0e10cSrcweir 
2146cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl(
2147cdf0e10cSrcweir         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2148cdf0e10cSrcweir         SvKeyValueIterator *i_pHTTPHeader,
2149cdf0e10cSrcweir         const HTMLOptions *i_pOptions,
2150cdf0e10cSrcweir         rtl_TextEncoding& o_rEnc )
2151cdf0e10cSrcweir {
2152cdf0e10cSrcweir     String aName, aContent;
2153cdf0e10cSrcweir     sal_uInt16 nAction = HTML_META_NONE;
2154cdf0e10cSrcweir     bool bHTTPEquiv = false, bChanged = false;
2155cdf0e10cSrcweir 
2156cdf0e10cSrcweir     for ( sal_uInt16 i = i_pOptions->Count(); i; )
2157cdf0e10cSrcweir     {
2158cdf0e10cSrcweir         const HTMLOption *pOption = (*i_pOptions)[ --i ];
2159cdf0e10cSrcweir         switch ( pOption->GetToken() )
2160cdf0e10cSrcweir         {
2161cdf0e10cSrcweir             case HTML_O_NAME:
2162cdf0e10cSrcweir                 aName = pOption->GetString();
2163cdf0e10cSrcweir                 if ( HTML_META_NONE==nAction )
2164cdf0e10cSrcweir                 {
2165cdf0e10cSrcweir                     pOption->GetEnum( nAction, aHTMLMetaNameTable );
2166cdf0e10cSrcweir                 }
2167cdf0e10cSrcweir                 break;
2168cdf0e10cSrcweir             case HTML_O_HTTPEQUIV:
2169cdf0e10cSrcweir                 aName = pOption->GetString();
2170cdf0e10cSrcweir                 pOption->GetEnum( nAction, aHTMLMetaNameTable );
2171cdf0e10cSrcweir                 bHTTPEquiv = true;
2172cdf0e10cSrcweir                 break;
2173cdf0e10cSrcweir             case HTML_O_CONTENT:
2174cdf0e10cSrcweir                 aContent = pOption->GetString();
2175cdf0e10cSrcweir                 break;
2176cdf0e10cSrcweir         }
2177cdf0e10cSrcweir     }
2178cdf0e10cSrcweir 
2179cdf0e10cSrcweir     if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
2180cdf0e10cSrcweir     {
2181cdf0e10cSrcweir         // if it is not a Description, remove CRs and LFs from CONTENT
2182cdf0e10cSrcweir         aContent.EraseAllChars( _CR );
2183cdf0e10cSrcweir         aContent.EraseAllChars( _LF );
2184cdf0e10cSrcweir     }
2185cdf0e10cSrcweir     else
2186cdf0e10cSrcweir     {
2187cdf0e10cSrcweir         // convert line endings for Description
2188cdf0e10cSrcweir         aContent.ConvertLineEnd();
2189cdf0e10cSrcweir     }
2190cdf0e10cSrcweir 
2191cdf0e10cSrcweir 
2192cdf0e10cSrcweir     if ( bHTTPEquiv && i_pHTTPHeader )
2193cdf0e10cSrcweir     {
2194cdf0e10cSrcweir         // #57232#: Netscape seems to just ignore a closing ", so we do too
2195cdf0e10cSrcweir         if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) )
2196cdf0e10cSrcweir         {
2197cdf0e10cSrcweir             aContent.Erase( aContent.Len() - 1 );
2198cdf0e10cSrcweir         }
2199cdf0e10cSrcweir         SvKeyValue aKeyValue( aName, aContent );
2200cdf0e10cSrcweir         i_pHTTPHeader->Append( aKeyValue );
2201cdf0e10cSrcweir     }
2202cdf0e10cSrcweir 
2203cdf0e10cSrcweir     switch ( nAction )
2204cdf0e10cSrcweir     {
2205cdf0e10cSrcweir         case HTML_META_AUTHOR:
2206cdf0e10cSrcweir             if (i_xDocProps.is()) {
2207cdf0e10cSrcweir                 i_xDocProps->setAuthor( aContent );
2208cdf0e10cSrcweir                 bChanged = true;
2209cdf0e10cSrcweir             }
2210cdf0e10cSrcweir             break;
2211cdf0e10cSrcweir         case HTML_META_DESCRIPTION:
2212cdf0e10cSrcweir             if (i_xDocProps.is()) {
2213cdf0e10cSrcweir                 i_xDocProps->setDescription( aContent );
2214cdf0e10cSrcweir                 bChanged = true;
2215cdf0e10cSrcweir             }
2216cdf0e10cSrcweir             break;
2217cdf0e10cSrcweir         case HTML_META_KEYWORDS:
2218cdf0e10cSrcweir             if (i_xDocProps.is()) {
2219cdf0e10cSrcweir                 i_xDocProps->setKeywords(
2220cdf0e10cSrcweir                     ::comphelper::string::convertCommaSeparated(aContent));
2221cdf0e10cSrcweir                 bChanged = true;
2222cdf0e10cSrcweir             }
2223cdf0e10cSrcweir             break;
2224cdf0e10cSrcweir         case HTML_META_CLASSIFICATION:
2225cdf0e10cSrcweir             if (i_xDocProps.is()) {
2226cdf0e10cSrcweir                 i_xDocProps->setSubject( aContent );
2227cdf0e10cSrcweir                 bChanged = true;
2228cdf0e10cSrcweir             }
2229cdf0e10cSrcweir             break;
2230cdf0e10cSrcweir 
2231cdf0e10cSrcweir         case HTML_META_CHANGEDBY:
2232cdf0e10cSrcweir             if (i_xDocProps.is()) {
2233cdf0e10cSrcweir                 i_xDocProps->setModifiedBy( aContent );
2234cdf0e10cSrcweir             }
2235cdf0e10cSrcweir             break;
2236cdf0e10cSrcweir 
2237cdf0e10cSrcweir         case HTML_META_CREATED:
2238cdf0e10cSrcweir         case HTML_META_CHANGED:
2239cdf0e10cSrcweir             if ( i_xDocProps.is() && aContent.Len() &&
2240cdf0e10cSrcweir                  aContent.GetTokenCount() == 2 )
2241cdf0e10cSrcweir             {
2242cdf0e10cSrcweir                 Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() );
2243cdf0e10cSrcweir                 Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() );
2244cdf0e10cSrcweir                 DateTime aDateTime( aDate, aTime );
2245cdf0e10cSrcweir                 ::util::DateTime uDT(aDateTime.Get100Sec(),
2246cdf0e10cSrcweir                     aDateTime.GetSec(), aDateTime.GetMin(),
2247cdf0e10cSrcweir                     aDateTime.GetHour(), aDateTime.GetDay(),
2248cdf0e10cSrcweir                     aDateTime.GetMonth(), aDateTime.GetYear());
2249cdf0e10cSrcweir                 if ( HTML_META_CREATED==nAction )
2250cdf0e10cSrcweir                     i_xDocProps->setCreationDate( uDT );
2251cdf0e10cSrcweir                 else
2252cdf0e10cSrcweir                     i_xDocProps->setModificationDate( uDT );
2253cdf0e10cSrcweir                 bChanged = true;
2254cdf0e10cSrcweir             }
2255cdf0e10cSrcweir             break;
2256cdf0e10cSrcweir 
2257cdf0e10cSrcweir         case HTML_META_REFRESH:
2258cdf0e10cSrcweir             DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
2259cdf0e10cSrcweir         "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2260cdf0e10cSrcweir             break;
2261cdf0e10cSrcweir 
2262cdf0e10cSrcweir         case HTML_META_CONTENT_TYPE:
2263cdf0e10cSrcweir             if ( aContent.Len() )
2264cdf0e10cSrcweir             {
2265cdf0e10cSrcweir                 o_rEnc = GetEncodingByMIME( aContent );
2266cdf0e10cSrcweir             }
2267cdf0e10cSrcweir             break;
2268cdf0e10cSrcweir 
2269cdf0e10cSrcweir         case HTML_META_NONE:
2270cdf0e10cSrcweir             if ( !bHTTPEquiv )
2271cdf0e10cSrcweir             {
2272cdf0e10cSrcweir                 if (i_xDocProps.is())
2273cdf0e10cSrcweir                 {
2274cdf0e10cSrcweir                     uno::Reference<beans::XPropertyContainer> xUDProps
2275cdf0e10cSrcweir                         = i_xDocProps->getUserDefinedProperties();
2276cdf0e10cSrcweir                     try {
2277cdf0e10cSrcweir                         xUDProps->addProperty(aName,
2278cdf0e10cSrcweir                             beans::PropertyAttribute::REMOVEABLE,
2279cdf0e10cSrcweir                             uno::makeAny(::rtl::OUString(aContent)));
2280cdf0e10cSrcweir                         AddMetaUserDefined(aName);
2281cdf0e10cSrcweir                         bChanged = true;
2282cdf0e10cSrcweir                     } catch (uno::Exception &) {
2283cdf0e10cSrcweir                         // ignore
2284cdf0e10cSrcweir                     }
2285cdf0e10cSrcweir                 }
2286cdf0e10cSrcweir             }
2287cdf0e10cSrcweir             break;
2288cdf0e10cSrcweir         default:
2289cdf0e10cSrcweir             break;
2290cdf0e10cSrcweir     }
2291cdf0e10cSrcweir 
2292cdf0e10cSrcweir     return bChanged;
2293cdf0e10cSrcweir }
2294cdf0e10cSrcweir 
2295cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions(
2296cdf0e10cSrcweir         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2297cdf0e10cSrcweir         SvKeyValueIterator *i_pHeader )
2298cdf0e10cSrcweir {
2299cdf0e10cSrcweir     sal_uInt16 nContentOption = HTML_O_CONTENT;
2300cdf0e10cSrcweir     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2301cdf0e10cSrcweir 
2302cdf0e10cSrcweir     bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2303cdf0e10cSrcweir                       GetOptions(&nContentOption),
2304cdf0e10cSrcweir                       eEnc );
2305cdf0e10cSrcweir 
2306cdf0e10cSrcweir     // If the encoding is set by a META tag, it may only overwrite the
2307cdf0e10cSrcweir     // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2308cdf0e10cSrcweir     // encodings. Everything else cannot lead to reasonable results.
2309cdf0e10cSrcweir     if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2310cdf0e10cSrcweir         rtl_isOctetTextEncoding( eEnc ) &&
2311cdf0e10cSrcweir         rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2312cdf0e10cSrcweir     {
2313cdf0e10cSrcweir         eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973#
2314cdf0e10cSrcweir         SetSrcEncoding( eEnc );
2315cdf0e10cSrcweir     }
2316cdf0e10cSrcweir 
2317cdf0e10cSrcweir     return bRet;
2318cdf0e10cSrcweir }
2319cdf0e10cSrcweir 
2320cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime )
2321cdf0e10cSrcweir {
2322cdf0e10cSrcweir     ByteString sType;
2323cdf0e10cSrcweir     ByteString sSubType;
2324cdf0e10cSrcweir     INetContentTypeParameterList aParameters;
2325cdf0e10cSrcweir     ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US );
2326cdf0e10cSrcweir     if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters))
2327cdf0e10cSrcweir     {
2328cdf0e10cSrcweir         const INetContentTypeParameter * pCharset
2329cdf0e10cSrcweir             = aParameters.find("charset");
2330cdf0e10cSrcweir         if (pCharset != 0)
2331cdf0e10cSrcweir         {
2332cdf0e10cSrcweir             ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US );
2333cdf0e10cSrcweir             return GetExtendedCompatibilityTextEncoding(
2334cdf0e10cSrcweir                     rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) );
2335cdf0e10cSrcweir         }
2336cdf0e10cSrcweir     }
2337cdf0e10cSrcweir     return RTL_TEXTENCODING_DONTKNOW;
2338cdf0e10cSrcweir }
2339cdf0e10cSrcweir 
2340cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2341cdf0e10cSrcweir {
2342cdf0e10cSrcweir     rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2343cdf0e10cSrcweir     if( pHTTPHeader )
2344cdf0e10cSrcweir     {
2345cdf0e10cSrcweir         SvKeyValue aKV;
2346cdf0e10cSrcweir         for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2347cdf0e10cSrcweir              bCont = pHTTPHeader->GetNext( aKV ) )
2348cdf0e10cSrcweir         {
2349cdf0e10cSrcweir             if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2350cdf0e10cSrcweir             {
2351cdf0e10cSrcweir                 if( aKV.GetValue().Len() )
2352cdf0e10cSrcweir                 {
2353cdf0e10cSrcweir                     eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2354cdf0e10cSrcweir                 }
2355cdf0e10cSrcweir             }
2356cdf0e10cSrcweir         }
2357cdf0e10cSrcweir     }
2358cdf0e10cSrcweir     return eRet;
2359cdf0e10cSrcweir }
2360cdf0e10cSrcweir 
2361cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader(
2362cdf0e10cSrcweir                                 SvKeyValueIterator *pHTTPHeader )
2363cdf0e10cSrcweir {
2364cdf0e10cSrcweir     sal_Bool bRet = sal_False;
2365cdf0e10cSrcweir     rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2366cdf0e10cSrcweir     if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2367cdf0e10cSrcweir     {
2368cdf0e10cSrcweir         SetSrcEncoding( eEnc );
2369cdf0e10cSrcweir         bRet = sal_True;
2370cdf0e10cSrcweir     }
2371cdf0e10cSrcweir     return bRet;
2372cdf0e10cSrcweir }
2373cdf0e10cSrcweir 
2374cdf0e10cSrcweir 
2375