15900e8ecSAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 35900e8ecSAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 45900e8ecSAndrew Rist * or more contributor license agreements. See the NOTICE file 55900e8ecSAndrew Rist * distributed with this work for additional information 65900e8ecSAndrew Rist * regarding copyright ownership. The ASF licenses this file 75900e8ecSAndrew Rist * to you under the Apache License, Version 2.0 (the 85900e8ecSAndrew Rist * "License"); you may not use this file except in compliance 95900e8ecSAndrew Rist * with the License. You may obtain a copy of the License at 10cdf0e10cSrcweir * 115900e8ecSAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12cdf0e10cSrcweir * 135900e8ecSAndrew Rist * Unless required by applicable law or agreed to in writing, 145900e8ecSAndrew Rist * software distributed under the License is distributed on an 155900e8ecSAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 165900e8ecSAndrew Rist * KIND, either express or implied. See the License for the 175900e8ecSAndrew Rist * specific language governing permissions and limitations 185900e8ecSAndrew Rist * under the License. 
19cdf0e10cSrcweir * 205900e8ecSAndrew Rist *************************************************************/ 215900e8ecSAndrew Rist 225900e8ecSAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove 25cdf0e10cSrcweir #include "precompiled_svtools.hxx" 26cdf0e10cSrcweir 27cdf0e10cSrcweir #include <ctype.h> 28cdf0e10cSrcweir #include <stdio.h> 29cdf0e10cSrcweir #include <tools/stream.hxx> 30cdf0e10cSrcweir #include <tools/debug.hxx> 31cdf0e10cSrcweir #include <tools/color.hxx> 32cdf0e10cSrcweir #include <rtl/ustrbuf.hxx> 33cdf0e10cSrcweir #include <rtl/strbuf.hxx> 34cdf0e10cSrcweir #ifndef _SVSTDARR_HXX 35cdf0e10cSrcweir #define _SVSTDARR_ULONGS 36cdf0e10cSrcweir #include <svl/svstdarr.hxx> 37cdf0e10cSrcweir #endif 38cdf0e10cSrcweir 39cdf0e10cSrcweir #include <tools/tenccvt.hxx> 40cdf0e10cSrcweir #include <tools/datetime.hxx> 41cdf0e10cSrcweir #include <svl/inettype.hxx> 42cdf0e10cSrcweir #include <comphelper/string.hxx> 43cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp> 44cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp> 45cdf0e10cSrcweir 46cdf0e10cSrcweir #include <svtools/parhtml.hxx> 47cdf0e10cSrcweir #include <svtools/htmltokn.h> 48cdf0e10cSrcweir #include <svtools/htmlkywd.hxx> 49cdf0e10cSrcweir 50cdf0e10cSrcweir 51cdf0e10cSrcweir using namespace ::com::sun::star; 52cdf0e10cSrcweir 53cdf0e10cSrcweir 54cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L ); 55cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ]; 56cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 ); 57cdf0e10cSrcweir 58cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L ); 59cdf0e10cSrcweir 60*1dda6fa0Smseidel /* */ 61cdf0e10cSrcweir 62cdf0e10cSrcweir // Tabellen zum Umwandeln von Options-Werten in Strings 63cdf0e10cSrcweir 64cdf0e10cSrcweir // <INPUT TYPE=xxx> 65cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] = 66cdf0e10cSrcweir { 67cdf0e10cSrcweir { 
OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT }, 68cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD }, 69cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX }, 70cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO }, 71cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE }, 72cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE }, 73cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE }, 74cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN }, 75cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT }, 76cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE }, 77cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET }, 78cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON }, 79cdf0e10cSrcweir { 0, 0 } 80cdf0e10cSrcweir }; 81cdf0e10cSrcweir 82cdf0e10cSrcweir // <TABLE FRAME=xxx> 83cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] = 84cdf0e10cSrcweir { 85cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID }, 86cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE }, 87cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW }, 88cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES }, 89cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS }, 90cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS }, 91cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES }, 92cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX }, 93cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX }, 94cdf0e10cSrcweir { 0, 0 } 95cdf0e10cSrcweir }; 96cdf0e10cSrcweir 97cdf0e10cSrcweir // <TABLE RULES=xxx> 98cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] = 99cdf0e10cSrcweir { 100cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE }, 101cdf0e10cSrcweir { 
OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS }, 102cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS }, 103cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS }, 104cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL }, 105cdf0e10cSrcweir { 0, 0 } 106cdf0e10cSrcweir }; 107cdf0e10cSrcweir 108cdf0e10cSrcweir 109cdf0e10cSrcweir SV_IMPL_PTRARR(HTMLOptions,HTMLOptionPtr) 110cdf0e10cSrcweir 111*1dda6fa0Smseidel /* */ 112cdf0e10cSrcweir 113cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const 114cdf0e10cSrcweir { 115cdf0e10cSrcweir sal_uInt16 nValue = nDflt; 116cdf0e10cSrcweir 117cdf0e10cSrcweir while( pOptEnums->pName ) 118cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) ) 119cdf0e10cSrcweir break; 120cdf0e10cSrcweir else 121cdf0e10cSrcweir pOptEnums++; 122cdf0e10cSrcweir 123cdf0e10cSrcweir if( pOptEnums->pName ) 124cdf0e10cSrcweir nValue = pOptEnums->nValue; 125cdf0e10cSrcweir 126cdf0e10cSrcweir return nValue; 127cdf0e10cSrcweir } 128cdf0e10cSrcweir 129cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const 130cdf0e10cSrcweir { 131cdf0e10cSrcweir while( pOptEnums->pName ) 132cdf0e10cSrcweir { 133cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) ) 134cdf0e10cSrcweir break; 135cdf0e10cSrcweir else 136cdf0e10cSrcweir pOptEnums++; 137cdf0e10cSrcweir } 138cdf0e10cSrcweir 139cdf0e10cSrcweir const sal_Char *pName = pOptEnums->pName; 140cdf0e10cSrcweir if( pName ) 141cdf0e10cSrcweir rEnum = pOptEnums->nValue; 142cdf0e10cSrcweir 143cdf0e10cSrcweir return (pName != 0); 144cdf0e10cSrcweir } 145cdf0e10cSrcweir 146cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken, 147cdf0e10cSrcweir const String& rValue ) 148cdf0e10cSrcweir : aValue(rValue) 149cdf0e10cSrcweir , aToken(rToken) 150cdf0e10cSrcweir , nToken( nTok ) 151cdf0e10cSrcweir { 152cdf0e10cSrcweir DBG_ASSERT( 
nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END, 153cdf0e10cSrcweir "HTMLOption: unbekanntes Token" ); 154cdf0e10cSrcweir } 155cdf0e10cSrcweir 156cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const 157cdf0e10cSrcweir { 158cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && 159cdf0e10cSrcweir nToken<HTML_OPTION_NUMBER_END) || 160cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && 161cdf0e10cSrcweir nToken<HTML_OPTION_CONTEXT_END) || 162cdf0e10cSrcweir nToken==HTML_O_VALUE, 163cdf0e10cSrcweir "GetNumber: Option ist nicht numerisch" ); 164cdf0e10cSrcweir String aTmp( aValue ); 165cdf0e10cSrcweir aTmp.EraseLeadingChars(); 166cdf0e10cSrcweir sal_Int32 nTmp = aTmp.ToInt32(); 167cdf0e10cSrcweir return nTmp >= 0 ? (sal_uInt32)nTmp : 0; 168cdf0e10cSrcweir } 169cdf0e10cSrcweir 170cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const 171cdf0e10cSrcweir { 172cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) || 173cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END), 174cdf0e10cSrcweir "GetSNumber: Option ist nicht numerisch" ); 175cdf0e10cSrcweir String aTmp( aValue ); 176cdf0e10cSrcweir aTmp.EraseLeadingChars(); 177cdf0e10cSrcweir return aTmp.ToInt32(); 178cdf0e10cSrcweir } 179cdf0e10cSrcweir 180cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const 181cdf0e10cSrcweir { 182cdf0e10cSrcweir if( rLongs.Count() ) 183cdf0e10cSrcweir rLongs.Remove( 0, rLongs.Count() ); 184cdf0e10cSrcweir 185cdf0e10cSrcweir if( bSpaceDelim ) 186cdf0e10cSrcweir { 187cdf0e10cSrcweir // das ist ein sehr stark vereinfachter Scanner. 
Er sucht einfach 188cdf0e10cSrcweir // alle Tiffern aus dem String 189cdf0e10cSrcweir sal_Bool bInNum = sal_False; 190cdf0e10cSrcweir sal_uLong nNum = 0; 191cdf0e10cSrcweir for( xub_StrLen i=0; i<aValue.Len(); i++ ) 192cdf0e10cSrcweir { 193cdf0e10cSrcweir register sal_Unicode c = aValue.GetChar( i ); 194cdf0e10cSrcweir if( c>='0' && c<='9' ) 195cdf0e10cSrcweir { 196cdf0e10cSrcweir nNum *= 10; 197cdf0e10cSrcweir nNum += (c - '0'); 198cdf0e10cSrcweir bInNum = sal_True; 199cdf0e10cSrcweir } 200cdf0e10cSrcweir else if( bInNum ) 201cdf0e10cSrcweir { 202cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() ); 203cdf0e10cSrcweir bInNum = sal_False; 204cdf0e10cSrcweir nNum = 0; 205cdf0e10cSrcweir } 206cdf0e10cSrcweir } 207cdf0e10cSrcweir if( bInNum ) 208cdf0e10cSrcweir { 209cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() ); 210cdf0e10cSrcweir } 211cdf0e10cSrcweir } 212cdf0e10cSrcweir else 213cdf0e10cSrcweir { 214cdf0e10cSrcweir // hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet 215cdf0e10cSrcweir // und auch mal eine 0 eingefuegt 216cdf0e10cSrcweir xub_StrLen nPos = 0; 217cdf0e10cSrcweir while( nPos < aValue.Len() ) 218cdf0e10cSrcweir { 219cdf0e10cSrcweir register sal_Unicode c; 220cdf0e10cSrcweir while( nPos < aValue.Len() && 221cdf0e10cSrcweir ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' || 222cdf0e10cSrcweir c == '\n' || c== '\r' ) ) 223cdf0e10cSrcweir nPos++; 224cdf0e10cSrcweir 225cdf0e10cSrcweir if( nPos==aValue.Len() ) 226cdf0e10cSrcweir rLongs.Insert( sal_uLong(0), rLongs.Count() ); 227cdf0e10cSrcweir else 228cdf0e10cSrcweir { 229cdf0e10cSrcweir xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos ); 230cdf0e10cSrcweir if( STRING_NOTFOUND==nEnd ) 231cdf0e10cSrcweir { 232cdf0e10cSrcweir sal_Int32 nTmp = aValue.Copy(nPos).ToInt32(); 233cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? 
(sal_uInt32)nTmp : 0, 234cdf0e10cSrcweir rLongs.Count() ); 235cdf0e10cSrcweir nPos = aValue.Len(); 236cdf0e10cSrcweir } 237cdf0e10cSrcweir else 238cdf0e10cSrcweir { 239cdf0e10cSrcweir sal_Int32 nTmp = 240cdf0e10cSrcweir aValue.Copy(nPos,nEnd-nPos).ToInt32(); 241cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0, 242cdf0e10cSrcweir rLongs.Count() ); 243cdf0e10cSrcweir nPos = nEnd+1; 244cdf0e10cSrcweir } 245cdf0e10cSrcweir } 246cdf0e10cSrcweir } 247cdf0e10cSrcweir } 248cdf0e10cSrcweir } 249cdf0e10cSrcweir 250cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const 251cdf0e10cSrcweir { 252cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE, 253cdf0e10cSrcweir "GetColor: Option spezifiziert keine Farbe" ); 254cdf0e10cSrcweir 255cdf0e10cSrcweir String aTmp( aValue ); 256cdf0e10cSrcweir aTmp.ToUpperAscii(); 257cdf0e10cSrcweir sal_uLong nColor = ULONG_MAX; 258cdf0e10cSrcweir if( '#'!=aTmp.GetChar( 0 ) ) 259cdf0e10cSrcweir nColor = GetHTMLColor( aTmp ); 260cdf0e10cSrcweir 261cdf0e10cSrcweir if( ULONG_MAX == nColor ) 262cdf0e10cSrcweir { 263cdf0e10cSrcweir nColor = 0; 264cdf0e10cSrcweir xub_StrLen nPos = 0; 265cdf0e10cSrcweir for( sal_uInt32 i=0; i<6; i++ ) 266cdf0e10cSrcweir { 267cdf0e10cSrcweir // MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt, 268cdf0e10cSrcweir // maximal drei Zeichen, die kleiner als '0' sind werden 269cdf0e10cSrcweir // ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich 270cdf0e10cSrcweir // irgendwelche HTML-Autoren noch so einfallen lassen... 271cdf0e10cSrcweir register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ ) 272cdf0e10cSrcweir : '0'; 273cdf0e10cSrcweir if( c < '0' ) 274cdf0e10cSrcweir { 275cdf0e10cSrcweir c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0'; 276cdf0e10cSrcweir if( c < '0' ) 277cdf0e10cSrcweir c = nPos<aTmp.Len() ? 
aTmp.GetChar(nPos++) : '0'; 278cdf0e10cSrcweir } 279cdf0e10cSrcweir nColor *= 16; 280cdf0e10cSrcweir if( c >= '0' && c <= '9' ) 281cdf0e10cSrcweir nColor += (c - 48); 282cdf0e10cSrcweir else if( c >= 'A' && c <= 'F' ) 283cdf0e10cSrcweir nColor += (c - 55); 284cdf0e10cSrcweir } 285cdf0e10cSrcweir } 286cdf0e10cSrcweir 287cdf0e10cSrcweir rColor.SetRed( (sal_uInt8)((nColor & 0x00ff0000) >> 16) ); 288cdf0e10cSrcweir rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8)); 289cdf0e10cSrcweir rColor.SetBlue( (sal_uInt8)(nColor & 0x000000ff) ); 290cdf0e10cSrcweir } 291cdf0e10cSrcweir 292cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const 293cdf0e10cSrcweir { 294cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" ); 295cdf0e10cSrcweir return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT ); 296cdf0e10cSrcweir } 297cdf0e10cSrcweir 298cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const 299cdf0e10cSrcweir { 300cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" ); 301cdf0e10cSrcweir return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID ); 302cdf0e10cSrcweir } 303cdf0e10cSrcweir 304cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const 305cdf0e10cSrcweir { 306cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" ); 307cdf0e10cSrcweir return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE ); 308cdf0e10cSrcweir } 309cdf0e10cSrcweir 310*1dda6fa0Smseidel /* */ 311cdf0e10cSrcweir 312cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc ) 313cdf0e10cSrcweir : SvParser( rIn ) 314cdf0e10cSrcweir { 315cdf0e10cSrcweir bNewDoc = bReadNewDoc; 316cdf0e10cSrcweir bReadListing = bReadXMP = bReadPRE = bReadTextArea = 317cdf0e10cSrcweir bReadScript = bReadStyle = 318cdf0e10cSrcweir bEndTokenFound = bIsInBody = bReadNextChar = 319cdf0e10cSrcweir bReadComment = sal_False; 320cdf0e10cSrcweir bIsInHeader = sal_True; 
321cdf0e10cSrcweir pOptions = new HTMLOptions; 3228d621361SPedro Giffuni 3238d621361SPedro Giffuni //#i76649, default to UTF-8 for HTML unless we know differently 3248d621361SPedro Giffuni SetSrcEncoding(RTL_TEXTENCODING_UTF8); 325cdf0e10cSrcweir } 326cdf0e10cSrcweir 327cdf0e10cSrcweir HTMLParser::~HTMLParser() 328cdf0e10cSrcweir { 329cdf0e10cSrcweir if( pOptions && pOptions->Count() ) 330cdf0e10cSrcweir pOptions->DeleteAndDestroy( 0, pOptions->Count() ); 331cdf0e10cSrcweir delete pOptions; 332cdf0e10cSrcweir } 333cdf0e10cSrcweir 334cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser() 335cdf0e10cSrcweir { 336cdf0e10cSrcweir eState = SVPAR_WORKING; 337cdf0e10cSrcweir nNextCh = GetNextChar(); 338cdf0e10cSrcweir SaveState( 0 ); 339cdf0e10cSrcweir 340cdf0e10cSrcweir nPre_LinePos = 0; 341cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 342cdf0e10cSrcweir 343cdf0e10cSrcweir AddRef(); 344cdf0e10cSrcweir Continue( 0 ); 345cdf0e10cSrcweir if( SVPAR_PENDING != eState ) 346cdf0e10cSrcweir ReleaseRef(); // dann brauchen wir den Parser nicht mehr! 347cdf0e10cSrcweir 348cdf0e10cSrcweir return eState; 349cdf0e10cSrcweir } 350cdf0e10cSrcweir 351cdf0e10cSrcweir void HTMLParser::Continue( int nToken ) 352cdf0e10cSrcweir { 353cdf0e10cSrcweir if( !nToken ) 354cdf0e10cSrcweir nToken = GetNextToken(); 355cdf0e10cSrcweir 356cdf0e10cSrcweir while( IsParserWorking() ) 357cdf0e10cSrcweir { 358cdf0e10cSrcweir SaveState( nToken ); 359cdf0e10cSrcweir nToken = FilterToken( nToken ); 360cdf0e10cSrcweir 361cdf0e10cSrcweir if( nToken ) 362cdf0e10cSrcweir NextToken( nToken ); 363cdf0e10cSrcweir 364cdf0e10cSrcweir if( IsParserWorking() ) 365cdf0e10cSrcweir SaveState( 0 ); // bis hierhin abgearbeitet, 366cdf0e10cSrcweir // weiter mit neuem Token! 
367cdf0e10cSrcweir nToken = GetNextToken(); 368cdf0e10cSrcweir } 369cdf0e10cSrcweir } 370cdf0e10cSrcweir 371cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken ) 372cdf0e10cSrcweir { 373cdf0e10cSrcweir switch( nToken ) 374cdf0e10cSrcweir { 375cdf0e10cSrcweir case sal_Unicode(EOF): 376cdf0e10cSrcweir nToken = 0; 377cdf0e10cSrcweir break; // nicht verschicken 378cdf0e10cSrcweir 379cdf0e10cSrcweir case HTML_HEAD_OFF: 380cdf0e10cSrcweir bIsInBody = sal_True; 381cdf0e10cSrcweir case HTML_HEAD_ON: 382cdf0e10cSrcweir bIsInHeader = HTML_HEAD_ON == nToken; 383cdf0e10cSrcweir break; 384cdf0e10cSrcweir 385cdf0e10cSrcweir case HTML_BODY_ON: 386cdf0e10cSrcweir case HTML_FRAMESET_ON: 387cdf0e10cSrcweir bIsInHeader = sal_False; 388cdf0e10cSrcweir bIsInBody = HTML_BODY_ON == nToken; 389cdf0e10cSrcweir break; 390cdf0e10cSrcweir 391cdf0e10cSrcweir case HTML_BODY_OFF: 392cdf0e10cSrcweir bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False; 393cdf0e10cSrcweir break; 394cdf0e10cSrcweir 395cdf0e10cSrcweir case HTML_HTML_OFF: 396cdf0e10cSrcweir nToken = 0; 397cdf0e10cSrcweir bReadPRE = bReadListing = bReadXMP = sal_False; 398cdf0e10cSrcweir break; // HTML_ON wurde auch nicht verschickt ! 
399cdf0e10cSrcweir 400cdf0e10cSrcweir case HTML_PREFORMTXT_ON: 401cdf0e10cSrcweir StartPRE(); 402cdf0e10cSrcweir break; 403cdf0e10cSrcweir 404cdf0e10cSrcweir case HTML_PREFORMTXT_OFF: 405cdf0e10cSrcweir FinishPRE(); 406cdf0e10cSrcweir break; 407cdf0e10cSrcweir 408cdf0e10cSrcweir case HTML_LISTING_ON: 409cdf0e10cSrcweir StartListing(); 410cdf0e10cSrcweir break; 411cdf0e10cSrcweir 412cdf0e10cSrcweir case HTML_LISTING_OFF: 413cdf0e10cSrcweir FinishListing(); 414cdf0e10cSrcweir break; 415cdf0e10cSrcweir 416cdf0e10cSrcweir case HTML_XMP_ON: 417cdf0e10cSrcweir StartXMP(); 418cdf0e10cSrcweir break; 419cdf0e10cSrcweir 420cdf0e10cSrcweir case HTML_XMP_OFF: 421cdf0e10cSrcweir FinishXMP(); 422cdf0e10cSrcweir break; 423cdf0e10cSrcweir 424cdf0e10cSrcweir default: 425cdf0e10cSrcweir if( bReadPRE ) 426cdf0e10cSrcweir nToken = FilterPRE( nToken ); 427cdf0e10cSrcweir else if( bReadListing ) 428cdf0e10cSrcweir nToken = FilterListing( nToken ); 429cdf0e10cSrcweir else if( bReadXMP ) 430cdf0e10cSrcweir nToken = FilterXMP( nToken ); 431cdf0e10cSrcweir 432cdf0e10cSrcweir break; 433cdf0e10cSrcweir } 434cdf0e10cSrcweir 435cdf0e10cSrcweir return nToken; 436cdf0e10cSrcweir } 437cdf0e10cSrcweir 438cdf0e10cSrcweir #define HTML_ISDIGIT( c ) (c >= '0' && c <= '9') 439cdf0e10cSrcweir #define HTML_ISALPHA( c ) ( (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ) 440cdf0e10cSrcweir #define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) ) 441cdf0e10cSrcweir #define HTML_ISSPACE( c ) ( ' ' == c || (c >= 0x09 && c <= 0x0d) ) 442cdf0e10cSrcweir #define HTML_ISPRINTABLE( c ) ( c >= 32 && c != 127) 443cdf0e10cSrcweir // --> OD 2006-07-26 #138464# 444cdf0e10cSrcweir #define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f') ) 445cdf0e10cSrcweir // <-- 446cdf0e10cSrcweir 447cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak ) 448cdf0e10cSrcweir { 449cdf0e10cSrcweir ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN ); 450cdf0e10cSrcweir int bWeiter = 
sal_True; 451cdf0e10cSrcweir int bEqSignFound = sal_False; 452cdf0e10cSrcweir sal_Unicode cQuote = 0U; 453cdf0e10cSrcweir 454cdf0e10cSrcweir while( bWeiter && IsParserWorking() ) 455cdf0e10cSrcweir { 456cdf0e10cSrcweir int bNextCh = sal_True; 457cdf0e10cSrcweir switch( nNextCh ) 458cdf0e10cSrcweir { 459cdf0e10cSrcweir case '&': 460cdf0e10cSrcweir bEqSignFound = sal_False; 461cdf0e10cSrcweir if( bReadXMP ) 462cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 463cdf0e10cSrcweir else 464cdf0e10cSrcweir { 465cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell(); 466cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos(); 467cdf0e10cSrcweir 468cdf0e10cSrcweir sal_Unicode cChar = 0U; 469cdf0e10cSrcweir if( '#' == (nNextCh = GetNextChar()) ) 470cdf0e10cSrcweir { 471cdf0e10cSrcweir nNextCh = GetNextChar(); 472cdf0e10cSrcweir // --> OD 2006-07-26 #138464# 473cdf0e10cSrcweir // consider hexadecimal digits 474cdf0e10cSrcweir const sal_Bool bIsHex( 'x' == nNextCh ); 475cdf0e10cSrcweir const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) ); 476cdf0e10cSrcweir if ( bIsDecOrHex ) 477cdf0e10cSrcweir { 478cdf0e10cSrcweir if ( bIsHex ) 479cdf0e10cSrcweir { 480cdf0e10cSrcweir nNextCh = GetNextChar(); 481cdf0e10cSrcweir while ( HTML_ISHEXDIGIT(nNextCh) ) 482cdf0e10cSrcweir { 483cdf0e10cSrcweir cChar = cChar * 16U + 484cdf0e10cSrcweir ( nNextCh <= '9' 485cdf0e10cSrcweir ? sal_Unicode( nNextCh - '0' ) 486cdf0e10cSrcweir : ( nNextCh <= 'F' 487cdf0e10cSrcweir ? 
sal_Unicode( nNextCh - 'A' + 10 ) 488cdf0e10cSrcweir : sal_Unicode( nNextCh - 'a' + 10 ) ) ); 489cdf0e10cSrcweir nNextCh = GetNextChar(); 490cdf0e10cSrcweir } 491cdf0e10cSrcweir } 492cdf0e10cSrcweir else 493cdf0e10cSrcweir { 494cdf0e10cSrcweir do 495cdf0e10cSrcweir { 496cdf0e10cSrcweir cChar = cChar * 10U + sal_Unicode( nNextCh - '0'); 497cdf0e10cSrcweir nNextCh = GetNextChar(); 498cdf0e10cSrcweir } 499cdf0e10cSrcweir while( HTML_ISDIGIT(nNextCh) ); 500cdf0e10cSrcweir } 501cdf0e10cSrcweir 502cdf0e10cSrcweir if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc && 503cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 != eSrcEnc && 504cdf0e10cSrcweir RTL_TEXTENCODING_UTF8 != eSrcEnc && 505cdf0e10cSrcweir cChar < 256 ) 506cdf0e10cSrcweir { 507cdf0e10cSrcweir sal_Unicode cOrig = cChar; 508cdf0e10cSrcweir cChar = ByteString::ConvertToUnicode( 509cdf0e10cSrcweir (sal_Char)cChar, eSrcEnc ); 510cdf0e10cSrcweir if( 0U == cChar ) 511cdf0e10cSrcweir { 512cdf0e10cSrcweir // #73398#: If the character could not be 513cdf0e10cSrcweir // converted, because a conversion is not 514cdf0e10cSrcweir // available, do no conversion at all. 
515cdf0e10cSrcweir cChar = cOrig; 516cdf0e10cSrcweir } 517cdf0e10cSrcweir } 518cdf0e10cSrcweir } 519cdf0e10cSrcweir // <-- 520cdf0e10cSrcweir else 521cdf0e10cSrcweir nNextCh = 0U; 522cdf0e10cSrcweir } 523cdf0e10cSrcweir else if( HTML_ISALPHA( nNextCh ) ) 524cdf0e10cSrcweir { 525cdf0e10cSrcweir ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN ); 526cdf0e10cSrcweir xub_StrLen nPos = 0L; 527cdf0e10cSrcweir do 528cdf0e10cSrcweir { 529cdf0e10cSrcweir sEntityBuffer.append( nNextCh ); 530cdf0e10cSrcweir nPos++; 531cdf0e10cSrcweir nNextCh = GetNextChar(); 532cdf0e10cSrcweir } 533cdf0e10cSrcweir while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) && 534cdf0e10cSrcweir !rInput.IsEof() ); 535cdf0e10cSrcweir 536cdf0e10cSrcweir if( IsParserWorking() && !rInput.IsEof() ) 537cdf0e10cSrcweir { 538cdf0e10cSrcweir String sEntity( sEntityBuffer.getStr(), nPos ); 539cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity ); 540cdf0e10cSrcweir 541cdf0e10cSrcweir // nicht gefunden ( == 0 ), dann Klartext 542cdf0e10cSrcweir // oder ein Zeichen das als Attribut eingefuegt 543cdf0e10cSrcweir // wird 544cdf0e10cSrcweir if( 0U == cChar && ';' != nNextCh ) 545cdf0e10cSrcweir { 546cdf0e10cSrcweir DBG_ASSERT( rInput.Tell() - nStreamPos == 547cdf0e10cSrcweir (sal_uLong)(nPos+1L)*GetCharSize(), 548cdf0e10cSrcweir "UTF-8 geht hier schief" ); 549cdf0e10cSrcweir for( xub_StrLen i=nPos-1L; i>1L; i-- ) 550cdf0e10cSrcweir { 551cdf0e10cSrcweir nNextCh = sEntityBuffer[i]; 552cdf0e10cSrcweir sEntityBuffer.setLength( i ); 553cdf0e10cSrcweir sEntity.Assign( sEntityBuffer.getStr(), i ); 554cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity ); 555cdf0e10cSrcweir if( cChar ) 556cdf0e10cSrcweir { 557cdf0e10cSrcweir rInput.SeekRel( -(long) 558cdf0e10cSrcweir ((nPos-i)*GetCharSize()) ); 559cdf0e10cSrcweir nlLinePos -= sal_uInt32(nPos-i); 560cdf0e10cSrcweir nPos = i; 561cdf0e10cSrcweir ClearTxtConvContext(); 562cdf0e10cSrcweir break; 563cdf0e10cSrcweir } 564cdf0e10cSrcweir } 565cdf0e10cSrcweir } 566cdf0e10cSrcweir 
567cdf0e10cSrcweir if( !cChar ) // unbekanntes Zeichen? 568cdf0e10cSrcweir { 569cdf0e10cSrcweir // dann im Stream zurueck, das '&' als Zeichen 570cdf0e10cSrcweir // einfuegen und mit dem nachfolgenden Zeichen 571cdf0e10cSrcweir // wieder aufsetzen 572cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 573cdf0e10cSrcweir 574cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); 575cdf0e10cSrcweir // nlLinePos -= nPos; 576cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos == 577cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(), 578cdf0e10cSrcweir "Falsche Stream-Position" ); 579cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos == 580cdf0e10cSrcweir (sal_uLong)(nPos+1), 581cdf0e10cSrcweir "Falsche Zeilen-Position" ); 582cdf0e10cSrcweir rInput.Seek( nStreamPos ); 583cdf0e10cSrcweir nlLinePos = nLinePos; 584cdf0e10cSrcweir ClearTxtConvContext(); 585cdf0e10cSrcweir break; 586cdf0e10cSrcweir } 587cdf0e10cSrcweir 588cdf0e10cSrcweir // 1 == Non Breaking Space 589cdf0e10cSrcweir // 2 == SoftHyphen 590cdf0e10cSrcweir 591cdf0e10cSrcweir if( cChar < 3U ) 592cdf0e10cSrcweir { 593cdf0e10cSrcweir if( '>' == cBreak ) 594cdf0e10cSrcweir { 595cdf0e10cSrcweir // Wenn der Inhalt eines Tags gelesen wird, 596cdf0e10cSrcweir // muessen wir ein Space bzw. - daraus machen 597cdf0e10cSrcweir switch( cChar ) 598cdf0e10cSrcweir { 599cdf0e10cSrcweir case 1U: cChar = ' '; break; 600cdf0e10cSrcweir case 2U: cChar = '-'; break; 601cdf0e10cSrcweir default: 602cdf0e10cSrcweir DBG_ASSERT( cChar==1U, 603cdf0e10cSrcweir "\0x00 sollte doch schon laengt abgefangen sein!" 
); 604cdf0e10cSrcweir break; 605cdf0e10cSrcweir } 606cdf0e10cSrcweir } 607cdf0e10cSrcweir else 608cdf0e10cSrcweir { 609cdf0e10cSrcweir // Wenn kein Tag gescannt wird, enstprechendes 610cdf0e10cSrcweir // Token zurueckgeben 611cdf0e10cSrcweir aToken += 612cdf0e10cSrcweir String( sTmpBuffer.makeStringAndClear() ); 613cdf0e10cSrcweir if( cChar ) 614cdf0e10cSrcweir { 615cdf0e10cSrcweir if( aToken.Len() ) 616cdf0e10cSrcweir { 617cdf0e10cSrcweir // mit dem Zeichen wieder aufsetzen 618cdf0e10cSrcweir nNextCh = '&'; 619cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) ); 620cdf0e10cSrcweir // nlLinePos -= nPos; 621cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos == 622cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(), 623cdf0e10cSrcweir "Falsche Stream-Position" ); 624cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos == 625cdf0e10cSrcweir (sal_uLong)(nPos+1), 626cdf0e10cSrcweir "Falsche Zeilen-Position" ); 627cdf0e10cSrcweir rInput.Seek( nStreamPos ); 628cdf0e10cSrcweir nlLinePos = nLinePos; 629cdf0e10cSrcweir ClearTxtConvContext(); 630cdf0e10cSrcweir return HTML_TEXTTOKEN; 631cdf0e10cSrcweir } 632cdf0e10cSrcweir 633cdf0e10cSrcweir // Hack: _GetNextChar soll nicht das 634cdf0e10cSrcweir // naechste Zeichen lesen 635cdf0e10cSrcweir if( ';' != nNextCh ) 636cdf0e10cSrcweir aToken += ' '; 637cdf0e10cSrcweir if( 1U == cChar ) 638cdf0e10cSrcweir return HTML_NONBREAKSPACE; 639cdf0e10cSrcweir if( 2U == cChar ) 640cdf0e10cSrcweir return HTML_SOFTHYPH; 641cdf0e10cSrcweir } 642cdf0e10cSrcweir aToken += (sal_Unicode)'&'; 643cdf0e10cSrcweir aToken += 644cdf0e10cSrcweir String(sEntityBuffer.makeStringAndClear()); 645cdf0e10cSrcweir break; 646cdf0e10cSrcweir } 647cdf0e10cSrcweir } 648cdf0e10cSrcweir } 649cdf0e10cSrcweir else 650cdf0e10cSrcweir nNextCh = 0U; 651cdf0e10cSrcweir } 652cdf0e10cSrcweir // MIB 03/02/2000: &{...};-JavaScript-Macros are not 653cdf0e10cSrcweir // supported any longer. 
654cdf0e10cSrcweir else if( IsParserWorking() ) 655cdf0e10cSrcweir { 656cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' ); 657cdf0e10cSrcweir bNextCh = sal_False; 658cdf0e10cSrcweir break; 659cdf0e10cSrcweir } 660cdf0e10cSrcweir 661cdf0e10cSrcweir bNextCh = (';' == nNextCh); 662cdf0e10cSrcweir if( cBreak=='>' && (cChar=='\\' || cChar=='\'' || 663cdf0e10cSrcweir cChar=='\"' || cChar==' ') ) 664cdf0e10cSrcweir { 665cdf0e10cSrcweir // ' und " mussen innerhalb von Tags mit einem 666cdf0e10cSrcweir // gekennzeichnet werden, um sie von ' und " als Klammern 667cdf0e10cSrcweir // um Optionen zu unterscheiden. Logischerweise muss 668cdf0e10cSrcweir // deshalb auch ein \ gekeenzeichnet werden. Ausserdem 669cdf0e10cSrcweir // schuetzen wir ein Space, weil es kein Trennzeichen 670cdf0e10cSrcweir // zwischen Optionen ist. 671cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 672cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 673cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 674cdf0e10cSrcweir } 675cdf0e10cSrcweir if( IsParserWorking() ) 676cdf0e10cSrcweir { 677cdf0e10cSrcweir if( cChar ) 678cdf0e10cSrcweir sTmpBuffer.append( cChar ); 679cdf0e10cSrcweir } 680cdf0e10cSrcweir else if( SVPAR_PENDING==eState && '>'!=cBreak ) 681cdf0e10cSrcweir { 682cdf0e10cSrcweir // Mit dem '&' Zeichen wieder aufsetzen, der Rest 683cdf0e10cSrcweir // wird als Texttoken zurueckgegeben. 684cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() ) 685cdf0e10cSrcweir { 686cdf0e10cSrcweir // Der bisherige Text wird von _GetNextChar() 687cdf0e10cSrcweir // zurueckgegeben und beim naechsten Aufruf wird 688cdf0e10cSrcweir // ein neues Zeichen gelesen. Also muessen wir uns 689cdf0e10cSrcweir // noch vor das & stellen. 
690cdf0e10cSrcweir nNextCh = 0U; 691cdf0e10cSrcweir rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() ); 692cdf0e10cSrcweir nlLinePos = nLinePos-1; 693cdf0e10cSrcweir ClearTxtConvContext(); 694cdf0e10cSrcweir bReadNextChar = sal_True; 695cdf0e10cSrcweir } 696cdf0e10cSrcweir bNextCh = sal_False; 697cdf0e10cSrcweir } 698cdf0e10cSrcweir } 699cdf0e10cSrcweir break; 700cdf0e10cSrcweir case '=': 701cdf0e10cSrcweir if( '>'==cBreak && !cQuote ) 702cdf0e10cSrcweir bEqSignFound = sal_True; 703cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 704cdf0e10cSrcweir break; 705cdf0e10cSrcweir 706cdf0e10cSrcweir case '\\': 707cdf0e10cSrcweir if( '>'==cBreak ) 708cdf0e10cSrcweir { 709cdf0e10cSrcweir // Innerhalb von Tags kennzeichnen 710cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 711cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 712cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 713cdf0e10cSrcweir } 714cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' ); 715cdf0e10cSrcweir break; 716cdf0e10cSrcweir 717cdf0e10cSrcweir case '\"': 718cdf0e10cSrcweir case '\'': 719cdf0e10cSrcweir if( '>'==cBreak ) 720cdf0e10cSrcweir { 721cdf0e10cSrcweir if( bEqSignFound ) 722cdf0e10cSrcweir cQuote = nNextCh; 723cdf0e10cSrcweir else if( cQuote && (cQuote==nNextCh ) ) 724cdf0e10cSrcweir cQuote = 0U; 725cdf0e10cSrcweir } 726cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 727cdf0e10cSrcweir bEqSignFound = sal_False; 728cdf0e10cSrcweir break; 729cdf0e10cSrcweir 730cdf0e10cSrcweir case sal_Unicode(EOF): 731cdf0e10cSrcweir if( rInput.IsEof() ) 732cdf0e10cSrcweir { 733cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird 734cdf0e10cSrcweir // abcä<EOF> nicht angezeigt, also lassen wir das in Zukunft. 
735cdf0e10cSrcweir // if( '>' != cBreak ) 736cdf0e10cSrcweir // eState = SVPAR_ACCEPTED; 737cdf0e10cSrcweir bWeiter = sal_False; 738cdf0e10cSrcweir } 739cdf0e10cSrcweir else 740cdf0e10cSrcweir { 741cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 742cdf0e10cSrcweir } 743cdf0e10cSrcweir break; 744cdf0e10cSrcweir 745cdf0e10cSrcweir case '<': 746cdf0e10cSrcweir bEqSignFound = sal_False; 747cdf0e10cSrcweir if( '>'==cBreak ) 748cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 749cdf0e10cSrcweir else 750cdf0e10cSrcweir bWeiter = sal_False; // Abbrechen, String zusammen 751cdf0e10cSrcweir break; 752cdf0e10cSrcweir 753cdf0e10cSrcweir case '\f': 754cdf0e10cSrcweir if( '>' == cBreak ) 755cdf0e10cSrcweir { 756cdf0e10cSrcweir // Beim Scannen von Optionen wie ein Space behandeln 757cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)' ' ); 758cdf0e10cSrcweir } 759cdf0e10cSrcweir else 760cdf0e10cSrcweir { 761cdf0e10cSrcweir // sonst wird es ein eigenes Token 762cdf0e10cSrcweir bWeiter = sal_False; 763cdf0e10cSrcweir } 764cdf0e10cSrcweir break; 765cdf0e10cSrcweir 766cdf0e10cSrcweir case '\r': 767cdf0e10cSrcweir case '\n': 768cdf0e10cSrcweir if( '>'==cBreak ) 769cdf0e10cSrcweir { 770cdf0e10cSrcweir // #26979# cr/lf in Tag wird in _GetNextToken() behandeln 771cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 772cdf0e10cSrcweir break; 773cdf0e10cSrcweir } 774cdf0e10cSrcweir else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea ) 775cdf0e10cSrcweir { 776cdf0e10cSrcweir bWeiter = sal_False; 777cdf0e10cSrcweir break; 778cdf0e10cSrcweir } 779cdf0e10cSrcweir // Bug 18984: CR-LF -> Blank 780cdf0e10cSrcweir // Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln 781cdf0e10cSrcweir // kein break!! 
782cdf0e10cSrcweir case '\t': 783cdf0e10cSrcweir if( '\t'==nNextCh && bReadPRE && '>'!=cBreak ) 784cdf0e10cSrcweir { 785cdf0e10cSrcweir // In <PRE>: Tabs nach oben durchreichen 786cdf0e10cSrcweir bWeiter = sal_False; 787cdf0e10cSrcweir break; 788cdf0e10cSrcweir } 789cdf0e10cSrcweir // kein break 790cdf0e10cSrcweir case '\x0b': 791cdf0e10cSrcweir if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) && 792cdf0e10cSrcweir '>'!=cBreak ) 793cdf0e10cSrcweir { 794cdf0e10cSrcweir break; 795cdf0e10cSrcweir } 796cdf0e10cSrcweir nNextCh = ' '; 797cdf0e10cSrcweir // kein break; 798cdf0e10cSrcweir case ' ': 799cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 800cdf0e10cSrcweir if( '>'!=cBreak && (!bReadListing && !bReadXMP && 801cdf0e10cSrcweir !bReadPRE && !bReadTextArea) ) 802cdf0e10cSrcweir { 803cdf0e10cSrcweir // alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln 804cdf0e10cSrcweir do { 805cdf0e10cSrcweir if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && 806cdf0e10cSrcweir rInput.IsEof() ) 807cdf0e10cSrcweir { 808cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() > 1L ) 809cdf0e10cSrcweir { 810cdf0e10cSrcweir // ausser den Blanks wurde noch etwas geselen 811cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 812cdf0e10cSrcweir return HTML_TEXTTOKEN; 813cdf0e10cSrcweir } 814cdf0e10cSrcweir else 815cdf0e10cSrcweir // nur Blanks gelesen: dann darf kein Text 816cdf0e10cSrcweir // mehr zurueckgegeben werden und _GetNextToken 817cdf0e10cSrcweir // muss auf EOF laufen 818cdf0e10cSrcweir return 0; 819cdf0e10cSrcweir } 820cdf0e10cSrcweir } while ( ' ' == nNextCh || '\t' == nNextCh || 821cdf0e10cSrcweir '\r' == nNextCh || '\n' == nNextCh || 822cdf0e10cSrcweir '\x0b' == nNextCh ); 823cdf0e10cSrcweir bNextCh = sal_False; 824cdf0e10cSrcweir } 825cdf0e10cSrcweir break; 826cdf0e10cSrcweir 827cdf0e10cSrcweir default: 828cdf0e10cSrcweir bEqSignFound = sal_False; 829cdf0e10cSrcweir if( (nNextCh==cBreak && !cQuote) || 830cdf0e10cSrcweir 
(sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 )) 831cdf0e10cSrcweir bWeiter = sal_False; 832cdf0e10cSrcweir else 833cdf0e10cSrcweir { 834cdf0e10cSrcweir do { 835cdf0e10cSrcweir // alle anderen Zeichen kommen in den Text 836cdf0e10cSrcweir sTmpBuffer.append( nNextCh ); 837cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 838cdf0e10cSrcweir { 839cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 840cdf0e10cSrcweir if( (sal_uLong(aToken.Len()) + MAX_LEN) > 841cdf0e10cSrcweir sal_uLong(STRING_MAXLEN & ~1 ) ) 842cdf0e10cSrcweir { 843cdf0e10cSrcweir nNextCh = GetNextChar(); 844cdf0e10cSrcweir return HTML_TEXTTOKEN; 845cdf0e10cSrcweir } 846cdf0e10cSrcweir } 847cdf0e10cSrcweir if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) && 848cdf0e10cSrcweir rInput.IsEof() ) || 849cdf0e10cSrcweir !IsParserWorking() ) 850cdf0e10cSrcweir { 851cdf0e10cSrcweir if( sTmpBuffer.getLength() ) 852cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 853cdf0e10cSrcweir return HTML_TEXTTOKEN; 854cdf0e10cSrcweir } 855cdf0e10cSrcweir } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) ); 856cdf0e10cSrcweir bNextCh = sal_False; 857cdf0e10cSrcweir } 858cdf0e10cSrcweir } 859cdf0e10cSrcweir 860cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() ) 861cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 862cdf0e10cSrcweir 863cdf0e10cSrcweir if( bWeiter && bNextCh ) 864cdf0e10cSrcweir nNextCh = GetNextChar(); 865cdf0e10cSrcweir } 866cdf0e10cSrcweir 867cdf0e10cSrcweir if( sTmpBuffer.getLength() ) 868cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear()); 869cdf0e10cSrcweir 870cdf0e10cSrcweir return HTML_TEXTTOKEN; 871cdf0e10cSrcweir } 872cdf0e10cSrcweir
// Reads the next chunk of raw, unparsed data. _GetNextToken() calls this
// while bReadScript, bReadStyle or a non-empty aEndToken is in effect.
//
// Characters are appended to aToken through a local buffer that is flushed
// whenever it holds MAX_LEN characters. Reading stops at a line end, at the
// end of the input, or at a tag that terminates the raw section. In the
// last case the stream is rewound to the '<' of that tag so that it can be
// parsed as a normal token afterwards.
//
// Returns HTML_RAWDATA by default. Returns 0 when
//  - the previous call already found the terminating tag (bEndTokenFound),
//  - the terminating tag is found while aToken is still empty and
//    bReadStyle or bReadScript is set,
//  - the end of the input is reached and nothing at all was collected, or
//  - the parser is no longer working when the loop is left.
int HTMLParser::_GetNextRawToken()
{
    ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );

    if( bEndTokenFound )
    {
        // The end token was already found during the previous call, so
        // there is no need to search for it again; just leave raw mode.
        bReadScript = sal_False;
        bReadStyle = sal_False;
        aEndToken.Erase();
        bEndTokenFound = sal_False;

        return 0;
    }

    // By default HTML_RAWDATA is returned.
    int bWeiter = sal_True;     // "continue" flag of the scan loop
    int nToken = HTML_RAWDATA;
    SaveState( 0 );
    while( bWeiter && IsParserWorking() )
    {
        // Cleared by branches that have already fetched the character the
        // next iteration has to look at.
        int bNextCh = sal_True;
        switch( nNextCh )
        {
        case '<':
            {
                // Maybe the end of the raw section has been reached.

                // First save what has been read so far ...
                aToken += String(sTmpBuffer.makeStringAndClear());

                // ... and remember the position in the stream.
                sal_uLong nStreamPos = rInput.Tell();
                sal_uLong nLineNr = GetLineNr();
                sal_uLong nLinePos = GetLinePos();

                // Start of an end token ("</")?
                int bOffState = sal_False;
                if( '/' == (nNextCh = GetNextChar()) )
                {
                    bOffState = sal_True;
                    nNextCh = GetNextChar();
                }
                else if( '!' == nNextCh )
                {
                    // Keep the '!' so that a comment start can be detected
                    // in the collected name below.
                    sTmpBuffer.append( nNextCh );
                    nNextCh = GetNextChar();
                }

                // Now read the letters (and '-') that follow.
                while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
                       IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
                {
                    sTmpBuffer.append( nNextCh );
                    nNextCh = GetNextChar();
                }

                // Upper-cased copy of the collected name; sTmpBuffer itself
                // is left untouched.
                String aTok( sTmpBuffer.getStr(),
                             sal::static_int_cast< xub_StrLen >(
                                 sTmpBuffer.getLength()) );
                aTok.ToUpperAscii();
                sal_Bool bDone = sal_False;
                if( bReadScript || aEndToken.Len() )
                {
                    if( !bReadComment )
                    {
                        // Only the first 3 characters are compared with the
                        // comment keyword (presumably "!--" - confirm in
                        // htmlkywd.hxx).
                        if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 )
                                == COMPARE_EQUAL )
                        {
                            bReadComment = sal_True;
                        }
                        else
                        {
                            // A script has to end with "</SCRIPT>", but for
                            // safety reasons the closing ">" is not checked
                            // strictly for now.
                            bDone = bOffState && // '>'==nNextCh &&
                            COMPARE_EQUAL == ( bReadScript
                                ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
                                : aTok.CompareTo(aEndToken) );
                        }
                    }
                    if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
                        aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
                    {
                        // A comment of the form <!-----> ends right here.
                        bReadComment = sal_False;
                    }
                }
                else
                {
                    // A style sheet may end with </STYLE>, </HEAD> or
                    // <BODY>.
                    if( bOffState )
                        bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
                                    == COMPARE_EQUAL ||
                                aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
                                    == COMPARE_EQUAL;
                    else
                        bDone =
                            aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
                }

                if( bDone )
                {
                    // That's it: return the string read so far (if any)
                    // and continue with normal parsing afterwards.

                    bWeiter = sal_False;

                    // nToken==0 means that _GetNextToken continues reading
                    // immediately.
                    if( !aToken.Len() && (bReadStyle || bReadScript) )
                    {
                        // Nothing to return: leave raw mode right away and
                        // let the end token be parsed.
                        bReadScript = sal_False;
                        bReadStyle = sal_False;
                        aEndToken.Erase();
                        nToken = 0;
                    }
                    else
                    {
                        // bReadScript/bReadStyle have to stay alive for
                        // now; the end token can only be parsed on the
                        // next call.
                        bEndTokenFound = sal_True;
                    }

                    // Rewind the stream to the '<'.
                    rInput.Seek( nStreamPos );
                    SetLineNr( nLineNr );
                    SetLinePos( nLinePos );
                    ClearTxtConvContext();
                    nNextCh = '<';

                    // The tag name must not be appended to the token.
                    sTmpBuffer.setLength( 0L );
                }
                else
                {
                    // Not a terminating tag: keep "<" or "</" as data; the
                    // rest of the name is still in the buffer.
                    aToken += (sal_Unicode)'<';
                    if( bOffState )
                        aToken += (sal_Unicode)'/';

                    // nNextCh already holds the next unprocessed character.
                    bNextCh = sal_False;
                }
            }
            break;
        case '-':
            sTmpBuffer.append( nNextCh );
            if( bReadComment )
            {
                // Inside a comment: consume a run of '-' so that a '>'
                // following at least two of them ends the comment.
                sal_Bool bTwoMinus = sal_False;
                nNextCh = GetNextChar();
                while( '-' == nNextCh && IsParserWorking() )
                {
                    bTwoMinus = sal_True;

                    if( MAX_LEN == sTmpBuffer.getLength() )
                        aToken += String(sTmpBuffer.makeStringAndClear());
                    sTmpBuffer.append( nNextCh );
                    nNextCh = GetNextChar();
                }

                if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
                    bReadComment = sal_False;

                // The character after the run has already been read.
                bNextCh = sal_False;
            }
            break;

        case '\r':
            // \r\n? ends the current text token (even if it is empty).
            nNextCh = GetNextChar();
            if( nNextCh=='\n' )
                nNextCh = GetNextChar();
            bWeiter = sal_False;
            break;
        case '\n':
            // \n ends the current text token (even if it is empty).
            nNextCh = GetNextChar();
            bWeiter = sal_False;
            break;
        case sal_Unicode(EOF):
            // EOF ends the current text token and behaves as if an end
            // token had been read.
            if( rInput.IsEof() )
            {
                bWeiter = sal_False;
                if( aToken.Len() || sTmpBuffer.getLength() )
                {
                    // There is still data to hand out; raw mode is left on
                    // the next call.
                    bEndTokenFound = sal_True;
                }
                else
                {
                    bReadScript = sal_False;
                    bReadStyle = sal_False;
                    aEndToken.Erase();
                    nToken = 0;
                }
                break;
            }
            // No break: a character that merely has the EOF value is
            // ordinary data.
        default:
            // All other characters end up in the buffer.
            sTmpBuffer.append( nNextCh );
            break;
        }

        // Flush the buffer when the loop is about to end or when it is full.
        if( (!bWeiter && sTmpBuffer.getLength() > 0L) ||
            MAX_LEN == sTmpBuffer.getLength() )
            aToken += String(sTmpBuffer.makeStringAndClear());

        if( bWeiter && bNextCh )
            nNextCh = GetNextChar();
    }

    if( IsParserWorking() )
        SaveState( 0 );
    else
        nToken = 0;

    return nToken;
}

// Scans the next token from the input stream.
//
// Clears sSaveToken and any options still stored in pOptions. While
// bReadScript, bReadStyle or a non-empty aEndToken is in effect, it first
// delegates to _GetNextRawToken(). Otherwise it dispatches on nNextCh:
//  - '<'  : a tag, comment, "<% ... %>" section, or a literal '<' that is
//           handed out as text,
//  - EOF  : end of input (eState becomes SVPAR_ACCEPTED),
//  - '\f', line ends and tabs: dedicated tokens, depending on the
//           bReadListing / bReadXMP / bReadPRE / bReadTextArea flags,
//  - anything else: ordinary text via ScanText().
//
// Returns the token id. Returns 0 if the parser is not working on entry or
// after the initial character read, and -1 if eState is SVPAR_PENDING when
// the function is left.
int __EXPORT HTMLParser::_GetNextToken()
{
    int nRet = 0;
    sSaveToken.Erase();

    // Delete the options of the previous token.
    if( pOptions->Count() )
        pOptions->DeleteAndDestroy( 0, pOptions->Count() );

    if( !IsParserWorking() )        // already in an error state: stop here
        return 0;

    // Remembered so that bReadNextChar can be restored when the parser
    // turns out to be pending.
    sal_Bool bReadNextCharSave = bReadNextChar;
    if( bReadNextChar )
    {
        // The (German) assertion text says: "</SCRIPT> was read and there
        // is still a character to be read?"
        DBG_ASSERT( !bEndTokenFound,
                    "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
        nNextCh = GetNextChar();
        if( !IsParserWorking() )    // already in an error state: stop here
            return 0;
        bReadNextChar = sal_False;
    }

    if( bReadScript || bReadStyle || aEndToken.Len() )
    {
        // Raw mode; a result of 0 means "continue with normal scanning".
        nRet = _GetNextRawToken();
        if( nRet || !IsParserWorking() )
            return nRet;
    }

    do {
        // Whether the next character has to be fetched at the end of this
        // iteration.
        int bNextCh = sal_True;
        switch( nNextCh )
        {
        case '<':
            {
                // Remember the position right behind the '<' for rewinding.
                sal_uLong nStreamPos = rInput.Tell();
                sal_uLong nLineNr = GetLineNr();
                sal_uLong nLinePos = GetLinePos();

                int bOffState = sal_False;  // set for "</..."
                if( '/' == (nNextCh = GetNextChar()) )
                {
                    bOffState = sal_True;
                    nNextCh = GetNextChar();
                }
                if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
                {
                    // Collect the tag name up to '>' or white space.
                    ::rtl::OUStringBuffer sTmpBuffer;
                    do {
                        sTmpBuffer.append( nNextCh );
                        if( MAX_LEN == sTmpBuffer.getLength() )
                            aToken += String(sTmpBuffer.makeStringAndClear());
                        nNextCh = GetNextChar();
                    } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
                             IsParserWorking() && !rInput.IsEof() );

                    if( sTmpBuffer.getLength() )
                        aToken += String(sTmpBuffer.makeStringAndClear());

                    // Skip blanks.
                    while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
                        nNextCh = GetNextChar();

                    if( !IsParserWorking() )
                    {
                        if( SVPAR_PENDING == eState )
                            bReadNextChar = bReadNextCharSave;
                        break;  // leaves the switch
                    }

                    // Look the token up in the table; sSaveToken keeps the
                    // original spelling.
                    sSaveToken = aToken;
                    aToken.ToUpperAscii();
                    if( 0 == (nRet = GetHTMLToken( aToken )) )
                        // Unknown control
                        nRet = HTML_UNKNOWNCONTROL_ON;

                    // If it is a token that switches something off ...
                    if( bOffState )
                    {
                        if( HTML_TOKEN_ONOFF & nRet )
                        {
                            // ... and an off token exists, turn it into
                            // that one.
                            ++nRet;
                        }
                        else if( HTML_LINEBREAK!=nRet )
                        {
                            // ... and no off token exists, turn it into an
                            // unknown token (except </BR>, which is
                            // treated like <BR>).
                            nRet = HTML_UNKNOWNCONTROL_OFF;
                        }
                    }

                    if( nRet == HTML_COMMENT )
                    {
                        // fix: because of upper/lower case, use sSaveToken
                        // as the start of the comment and append a space.
                        aToken = sSaveToken;
                        if( '>'!=nNextCh )
                            aToken += (sal_Unicode)' ';
                        // State at the first '>' seen, used for rewinding.
                        sal_uLong nCStreamPos = 0;
                        sal_uLong nCLineNr = 0;
                        sal_uLong nCLinePos = 0;
                        xub_StrLen nCStrLen = 0;

                        sal_Bool bDone = sal_False;
                        // Read up to the closing "-->". If none is found,
                        // resume at the first '>'.
                        while( !bDone && !rInput.IsEof() && IsParserWorking() )
                        {
                            if( '>'==nNextCh )
                            {
                                if( !nCStreamPos )
                                {
                                    nCStreamPos = rInput.Tell();
                                    nCStrLen = aToken.Len();
                                    nCLineNr = GetLineNr();
                                    nCLinePos = GetLinePos();
                                }
                                bDone = aToken.Len() >= 2 &&
                                        aToken.Copy(aToken.Len()-2,2).
                                                        EqualsAscii( "--" );
                                if( !bDone )
                                    aToken += nNextCh;
                            }
                            else
                                aToken += nNextCh;
                            if( !bDone )
                                nNextCh = GetNextChar();
                        }
                        if( !bDone && IsParserWorking() && nCStreamPos )
                        {
                            // No "-->" found: cut the comment at the first
                            // '>' and continue from there.
                            rInput.Seek( nCStreamPos );
                            SetLineNr( nCLineNr );
                            SetLinePos( nCLinePos );
                            ClearTxtConvContext();
                            aToken.Erase( nCStrLen );
                            nNextCh = '>';
                        }
                    }
                    else
                    {
                        // The token string can be discarded now.
                        aToken.Erase();
                    }

                    // Now read everything up to the closing '>'.
                    if( '>' != nNextCh && IsParserWorking() )
                    {
                        ScanText( '>' );
                        if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
                        {
                            // Go back behind the '<', restart there and
                            // return the '<' as text.
                            rInput.Seek( nStreamPos );
                            SetLineNr( nLineNr );
                            SetLinePos( nLinePos );
                            ClearTxtConvContext();

                            aToken = '<';
                            nRet = HTML_TEXTTOKEN;
                            nNextCh = GetNextChar();
                            bNextCh = sal_False;
                            break;  // leaves the switch
                        }
                    }
                    if( SVPAR_PENDING == eState )
                        bReadNextChar = bReadNextCharSave;
                }
                else
                {
                    if( bOffState )
                    {
                        // "</" not followed by a letter or '!': simply
                        // throw everything away.
                        ScanText( '>' );
                        if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
                        {
                            // Go back behind the '<', restart there and
                            // return the '<' as text.
                            rInput.Seek( nStreamPos );
                            SetLineNr( nLineNr );
                            SetLinePos( nLinePos );
                            ClearTxtConvContext();

                            aToken = '<';
                            nRet = HTML_TEXTTOKEN;
                            nNextCh = GetNextChar();
                            bNextCh = sal_False;
                            break;  // leaves the switch
                        }
                        if( SVPAR_PENDING == eState )
                            bReadNextChar = bReadNextCharSave;
                        aToken.Erase();
                    }
                    else if( '%' == nNextCh )
                    {
                        nRet = HTML_UNKNOWNCONTROL_ON;

                        sal_uLong nCStreamPos = rInput.Tell();
                        sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();

                        sal_Bool bDone = sal_False;
                        // Read up to the closing "%>". If none is found,
                        // rewind to the position behind the "<%".
                        while( !bDone && !rInput.IsEof() && IsParserWorking() )
                        {
                            bDone = '>'==nNextCh && aToken.Len() >= 1 &&
                                    '%' == aToken.GetChar( aToken.Len()-1 );
                            if( !bDone )
                            {
                                aToken += nNextCh;
                                nNextCh = GetNextChar();
                            }
                        }
                        if( !bDone && IsParserWorking() )
                        {
                            // No "%>" found: hand out "<%" as plain text.
                            rInput.Seek( nCStreamPos );
                            SetLineNr( nCLineNr );
                            SetLinePos( nCLinePos );
                            ClearTxtConvContext();
                            aToken.AssignAscii( "<%", 2 );
                            nRet = HTML_TEXTTOKEN;
                            break;  // leaves the switch
                        }
                        if( IsParserWorking() )
                        {
                            // The section's content is kept in sSaveToken.
                            sSaveToken = aToken;
                            aToken.Erase();
                        }
                    }
                    else
                    {
                        // A lone '<': return it as text; nNextCh already
                        // holds the character behind it.
                        aToken = '<';
                        nRet = HTML_TEXTTOKEN;
                        bNextCh = sal_False;
                        break;  // leaves the switch
                    }
                }

                if( IsParserWorking() )
                {
                    // Only step over the current character if it is the
                    // closing '>'.
                    bNextCh = '>' == nNextCh;
                    // Update the mode flags that steer the following calls.
                    switch( nRet )
                    {
                    case HTML_TEXTAREA_ON:
                        bReadTextArea = sal_True;
                        break;
                    case HTML_TEXTAREA_OFF:
                        bReadTextArea = sal_False;
                        break;
                    case HTML_SCRIPT_ON:
                        if( !bReadTextArea )
                            bReadScript = sal_True;
                        break;
                    case HTML_SCRIPT_OFF:
                        if( !bReadTextArea )
                        {
                            bReadScript = sal_False;
                            // JavaScript may modify the stream, so the
                            // last character has to be read again.
                            bReadNextChar = sal_True;
                            bNextCh = sal_False;
                        }
                        break;

                    case HTML_STYLE_ON:
                        bReadStyle = sal_True;
                        break;
                    case HTML_STYLE_OFF:
                        bReadStyle = sal_False;
                        break;
                    }

                }
            }
            break;

        case sal_Unicode(EOF):
            if( rInput.IsEof() )
            {
                // Real end of input: the EOF character value itself is
                // returned as the token.
                eState = SVPAR_ACCEPTED;
                nRet = nNextCh;
            }
            else
            {
                // Read as normal text.
                goto scan_text;
            }
            break;

        case '\f':
            // Form feeds are passed up separately now.
            nRet = HTML_LINEFEEDCHAR; // !!! should really be FORMFEEDCHAR
            break;

        case '\n':
        case '\r':
            if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
            {
                // A "\n\r" or "\r\n" pair counts as a single line end; any
                // other following character is kept for the next round.
                sal_Unicode c = GetNextChar();
                if( ( '\n' != nNextCh || '\r' != c ) &&
                    ( '\r' != nNextCh || '\n' != c ) )
                {
                    bNextCh = sal_False;
                    nNextCh = c;
                }
                nRet = HTML_NEWPARA;
                break;
            }
            // no break: fall through
        case '\t':
            if( bReadPRE )
            {
                nRet = HTML_TABCHAR;
                break;
            }
            // no break: fall through
        case ' ':
            // no break: fall through
        default:

scan_text:
            // "Normal" text follows.
            nRet = ScanText();
            bNextCh = 0 == aToken.Len();

            // The text still has to be processed.
            if( !bNextCh && eState == SVPAR_PENDING )
            {
                eState = SVPAR_WORKING;
                bReadNextChar = sal_True;
            }

            break;
        }

        if( bNextCh && SVPAR_WORKING == eState )
        {
            nNextCh = GetNextChar();
            // If reading made the parser pending but a non-text token was
            // already recognised, deliver that token first and read the
            // character again on the next call.
            if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
            {
                bReadNextChar = sal_True;
                eState = SVPAR_WORKING;
            }
        }

    } while( !nRet && SVPAR_WORKING == eState );

    if( SVPAR_PENDING == eState )
        nRet = -1;      // something invalid

    return nRet;
}
1459cdf0e10cSrcweir 1460cdf0e10cSrcweir void HTMLParser::UnescapeToken() 1461cdf0e10cSrcweir { 1462cdf0e10cSrcweir xub_StrLen nPos=0; 1463cdf0e10cSrcweir 1464cdf0e10cSrcweir sal_Bool bEscape = sal_False; 1465cdf0e10cSrcweir while( nPos < aToken.Len() ) 1466cdf0e10cSrcweir { 1467cdf0e10cSrcweir sal_Bool bOldEscape = bEscape; 1468cdf0e10cSrcweir bEscape = sal_False; 1469cdf0e10cSrcweir if( '\\'==aToken.GetChar(nPos) && !bOldEscape ) 1470cdf0e10cSrcweir { 1471cdf0e10cSrcweir
aToken.Erase( nPos, 1 ); 1472cdf0e10cSrcweir bEscape = sal_True; 1473cdf0e10cSrcweir } 1474cdf0e10cSrcweir else 1475cdf0e10cSrcweir { 1476cdf0e10cSrcweir nPos++; 1477cdf0e10cSrcweir } 1478cdf0e10cSrcweir } 1479cdf0e10cSrcweir } 1480cdf0e10cSrcweir
// Fetches the options (attributes) of the current token.
//
// The options are parsed out of aToken as NAME or NAME=VALUE pairs, where
// VALUE may be enclosed in double or single quotes, and are appended to
// pOptions. If pOptions already holds entries, they are returned as they
// are without parsing again.
//
// pNoConvertToken: may be null. If given, CR/LF characters inside a quoted
//                  value are kept for the option whose id equals
//                  *pNoConvertToken. They are also kept for option ids in
//                  the range [HTML_OPTION_SCRIPT_START,
//                  HTML_OPTION_SCRIPT_END) and stripped everywhere else.
//
// Returns pOptions.
//
// NOTE: although this method is declared const, it erases characters
// (escaping backslashes, stripped CR/LF) from aToken through a cast.
const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
{
    // If the options of the current token have been fetched before, just
    // return them again.
    if( pOptions->Count() )
        return pOptions;

    xub_StrLen nPos = 0;
    while( nPos < aToken.Len() )
    {
        // A letter? Then an option starts here.
        if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
        {
            int nToken;
            String aValue;
            xub_StrLen nStt = nPos;
            sal_Unicode cChar = 0;

            // Strictly speaking only a few specific characters are allowed
            // here. Netscape, however, only looks for "=" and white space
            // (see Mozilla: PA_FetchRequestedNameValues in
            // lipparse/pa_mdl.c).
            // Former, stricter loop condition:
//          while( nPos < aToken.Len() &&
//                  ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) )
            while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
                   HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
                nPos++;

            String sName( aToken.Copy( nStt, nPos-nStt ) );

            //JP 23.03.97: the plug-ins want the token name in its
            //             "original" spelling, so convert to upper case
            //             for the lookup only.
            String sNameUpperCase( sName );
            sNameUpperCase.ToUpperAscii();

            nToken = GetHTMLOption( sNameUpperCase ); // the name is complete
            // The (German) warning text says: "unknown HTML option".
            DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
                        "GetOption: unbekannte HTML-Option" );
            sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
                               nToken >= HTML_OPTION_SCRIPT_END) &&
                              (!pNoConvertToken || nToken != *pNoConvertToken);

            // Skip white space and non-printable characters after the name.
            while( nPos < aToken.Len() &&
                   ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
                     HTML_ISSPACE(cChar) ) )
                nPos++;

            // Does the option have a value as well?
            if( nPos!=aToken.Len() && '='==cChar )
            {
                nPos++;

                // Skip white space and non-printable characters after '='.
                while( nPos < aToken.Len() &&
                        ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
                          ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
                    nPos++;

                if( nPos != aToken.Len() )
                {
                    // nStt/nLen delimit the value inside aToken.
                    xub_StrLen nLen = 0;
                    nStt = nPos;
                    if( ('"'==cChar) || ('\'')==cChar )
                    {
                        // Quoted value: read up to the matching, unescaped
                        // quote character.
                        sal_Unicode cEnd = cChar;
                        nPos++; nStt++;
                        sal_Bool bDone = sal_False;
                        sal_Bool bEscape = sal_False;
                        while( nPos < aToken.Len() && !bDone )
                        {
                            // bOldEscape: the previous character was an
                            // escaping backslash (already removed).
                            sal_Bool bOldEscape = bEscape;
                            bEscape = sal_False;
                            cChar = aToken.GetChar(nPos);
                            switch( cChar )
                            {
                            case '\r':
                            case '\n':
                                if( bStripCRLF )
                                    ((String &)aToken).Erase( nPos, 1 );
                                else
                                    nPos++, nLen++;
                                break;
                            case '\\':
                                if( bOldEscape )
                                {
                                    // Escaped backslash: keep it.
                                    nPos++, nLen++;
                                }
                                else
                                {
                                    // Escaping backslash: remove it and
                                    // remember it for the next character.
                                    ((String &)aToken).Erase( nPos, 1 );
                                    bEscape = sal_True;
                                }
                                break;
                            case '"':
                            case '\'':
                                // Only the unescaped opening quote
                                // character ends the value.
                                bDone = !bOldEscape && cChar==cEnd;
                                if( !bDone )
                                    nPos++, nLen++;
                                break;
                            default:
                                nPos++, nLen++;
                                break;
                            }
                        }
                        // Step over the closing quote, if there is one.
                        if( nPos!=aToken.Len() )
                            nPos++;
                    }
                    else
                    {
                        // Unquoted value: this is a bit more relaxed than
                        // the standard and accepts anything printable.
                        sal_Bool bEscape = sal_False;
                        sal_Bool bDone = sal_False;
                        while( nPos < aToken.Len() && !bDone )
                        {
                            sal_Bool bOldEscape = bEscape;
                            bEscape = sal_False;
                            sal_Unicode c = aToken.GetChar(nPos);
                            switch( c )
                            {
                            case ' ':
                                // An unescaped blank ends the value.
                                bDone = !bOldEscape;
                                if( !bDone )
                                    nPos++, nLen++;
                                break;

                            case '\t':
                            case '\r':
                            case '\n':
                                bDone = sal_True;
                                break;

                            case '\\':
                                if( bOldEscape )
                                {
                                    // Escaped backslash: keep it.
                                    nPos++, nLen++;
                                }
                                else
                                {
                                    // Escaping backslash: remove it and
                                    // remember it for the next character.
                                    ((String &)aToken).Erase( nPos, 1 );
                                    bEscape = sal_True;
                                }
                                break;

                            default:
                                if( HTML_ISPRINTABLE( c ) )
                                    nPos++, nLen++;
                                else
                                    bDone = sal_True;
                                break;
                            }
                        }
                    }

                    if( nLen )
                        aValue = aToken.Copy( nStt, nLen );
                }
            }

            // The option is complete and can be stored at the end of
            // pOptions.
            HTMLOption *pOption =
                new HTMLOption(
                    sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue );

            pOptions->Insert( pOption, pOptions->Count() );

        }
        else
            // Ignore white space and unexpected characters.
            nPos++;
    }

    return pOptions;
}
1655cdf0e10cSrcweir 1656cdf0e10cSrcweir int HTMLParser::FilterPRE( int nToken ) 1657cdf0e10cSrcweir { 1658cdf0e10cSrcweir switch( nToken ) 1659cdf0e10cSrcweir { 1660cdf0e10cSrcweir #ifdef HTML_BEHAVIOUR 1661cdf0e10cSrcweir // diese werden laut Definition zu LFs 1662cdf0e10cSrcweir case HTML_PARABREAK_ON: 1663cdf0e10cSrcweir case HTML_LINEBREAK: 1664cdf0e10cSrcweir nToken = HTML_NEWPARA; 1665cdf0e10cSrcweir #else 1666cdf0e10cSrcweir // in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung 1667cdf0e10cSrcweir case HTML_PARABREAK_ON: 1668cdf0e10cSrcweir nToken = HTML_LINEBREAK; 1669cdf0e10cSrcweir case HTML_LINEBREAK: 1670cdf0e10cSrcweir #endif 1671cdf0e10cSrcweir case HTML_NEWPARA: 1672cdf0e10cSrcweir nPre_LinePos = 0; 1673cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 1674cdf0e10cSrcweir nToken = 0; 1675cdf0e10cSrcweir break; 1676cdf0e10cSrcweir 1677cdf0e10cSrcweir case HTML_TABCHAR: 1678cdf0e10cSrcweir { 1679cdf0e10cSrcweir xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >( 1680cdf0e10cSrcweir 8 - (nPre_LinePos % 8)); 1681cdf0e10cSrcweir DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?"
); 1682cdf0e10cSrcweir aToken.Expand( nSpaces, ' ' ); 1683cdf0e10cSrcweir nPre_LinePos += nSpaces; 1684cdf0e10cSrcweir nToken = HTML_TEXTTOKEN; 1685cdf0e10cSrcweir } 1686cdf0e10cSrcweir break; 1687cdf0e10cSrcweir // diese bleiben erhalten 1688cdf0e10cSrcweir case HTML_TEXTTOKEN: 1689cdf0e10cSrcweir nPre_LinePos += aToken.Len(); 1690cdf0e10cSrcweir break; 1691cdf0e10cSrcweir 1692cdf0e10cSrcweir case HTML_SELECT_ON: 1693cdf0e10cSrcweir case HTML_SELECT_OFF: 1694cdf0e10cSrcweir case HTML_BODY_ON: 1695cdf0e10cSrcweir case HTML_FORM_ON: 1696cdf0e10cSrcweir case HTML_FORM_OFF: 1697cdf0e10cSrcweir case HTML_INPUT: 1698cdf0e10cSrcweir case HTML_OPTION: 1699cdf0e10cSrcweir case HTML_TEXTAREA_ON: 1700cdf0e10cSrcweir case HTML_TEXTAREA_OFF: 1701cdf0e10cSrcweir 1702cdf0e10cSrcweir case HTML_IMAGE: 1703cdf0e10cSrcweir case HTML_APPLET_ON: 1704cdf0e10cSrcweir case HTML_APPLET_OFF: 1705cdf0e10cSrcweir case HTML_PARAM: 1706cdf0e10cSrcweir case HTML_EMBED: 1707cdf0e10cSrcweir 1708cdf0e10cSrcweir case HTML_HEAD1_ON: 1709cdf0e10cSrcweir case HTML_HEAD1_OFF: 1710cdf0e10cSrcweir case HTML_HEAD2_ON: 1711cdf0e10cSrcweir case HTML_HEAD2_OFF: 1712cdf0e10cSrcweir case HTML_HEAD3_ON: 1713cdf0e10cSrcweir case HTML_HEAD3_OFF: 1714cdf0e10cSrcweir case HTML_HEAD4_ON: 1715cdf0e10cSrcweir case HTML_HEAD4_OFF: 1716cdf0e10cSrcweir case HTML_HEAD5_ON: 1717cdf0e10cSrcweir case HTML_HEAD5_OFF: 1718cdf0e10cSrcweir case HTML_HEAD6_ON: 1719cdf0e10cSrcweir case HTML_HEAD6_OFF: 1720cdf0e10cSrcweir case HTML_BLOCKQUOTE_ON: 1721cdf0e10cSrcweir case HTML_BLOCKQUOTE_OFF: 1722cdf0e10cSrcweir case HTML_ADDRESS_ON: 1723cdf0e10cSrcweir case HTML_ADDRESS_OFF: 1724cdf0e10cSrcweir case HTML_HORZRULE: 1725cdf0e10cSrcweir 1726cdf0e10cSrcweir case HTML_CENTER_ON: 1727cdf0e10cSrcweir case HTML_CENTER_OFF: 1728cdf0e10cSrcweir case HTML_DIVISION_ON: 1729cdf0e10cSrcweir case HTML_DIVISION_OFF: 1730cdf0e10cSrcweir 1731cdf0e10cSrcweir case HTML_SCRIPT_ON: 1732cdf0e10cSrcweir case HTML_SCRIPT_OFF: 1733cdf0e10cSrcweir case 
HTML_RAWDATA: 1734cdf0e10cSrcweir 1735cdf0e10cSrcweir case HTML_TABLE_ON: 1736cdf0e10cSrcweir case HTML_TABLE_OFF: 1737cdf0e10cSrcweir case HTML_CAPTION_ON: 1738cdf0e10cSrcweir case HTML_CAPTION_OFF: 1739cdf0e10cSrcweir case HTML_COLGROUP_ON: 1740cdf0e10cSrcweir case HTML_COLGROUP_OFF: 1741cdf0e10cSrcweir case HTML_COL_ON: 1742cdf0e10cSrcweir case HTML_COL_OFF: 1743cdf0e10cSrcweir case HTML_THEAD_ON: 1744cdf0e10cSrcweir case HTML_THEAD_OFF: 1745cdf0e10cSrcweir case HTML_TFOOT_ON: 1746cdf0e10cSrcweir case HTML_TFOOT_OFF: 1747cdf0e10cSrcweir case HTML_TBODY_ON: 1748cdf0e10cSrcweir case HTML_TBODY_OFF: 1749cdf0e10cSrcweir case HTML_TABLEROW_ON: 1750cdf0e10cSrcweir case HTML_TABLEROW_OFF: 1751cdf0e10cSrcweir case HTML_TABLEDATA_ON: 1752cdf0e10cSrcweir case HTML_TABLEDATA_OFF: 1753cdf0e10cSrcweir case HTML_TABLEHEADER_ON: 1754cdf0e10cSrcweir case HTML_TABLEHEADER_OFF: 1755cdf0e10cSrcweir 1756cdf0e10cSrcweir case HTML_ANCHOR_ON: 1757cdf0e10cSrcweir case HTML_ANCHOR_OFF: 1758cdf0e10cSrcweir case HTML_BOLD_ON: 1759cdf0e10cSrcweir case HTML_BOLD_OFF: 1760cdf0e10cSrcweir case HTML_ITALIC_ON: 1761cdf0e10cSrcweir case HTML_ITALIC_OFF: 1762cdf0e10cSrcweir case HTML_STRIKE_ON: 1763cdf0e10cSrcweir case HTML_STRIKE_OFF: 1764cdf0e10cSrcweir case HTML_STRIKETHROUGH_ON: 1765cdf0e10cSrcweir case HTML_STRIKETHROUGH_OFF: 1766cdf0e10cSrcweir case HTML_UNDERLINE_ON: 1767cdf0e10cSrcweir case HTML_UNDERLINE_OFF: 1768cdf0e10cSrcweir case HTML_BASEFONT_ON: 1769cdf0e10cSrcweir case HTML_BASEFONT_OFF: 1770cdf0e10cSrcweir case HTML_FONT_ON: 1771cdf0e10cSrcweir case HTML_FONT_OFF: 1772cdf0e10cSrcweir case HTML_BLINK_ON: 1773cdf0e10cSrcweir case HTML_BLINK_OFF: 1774cdf0e10cSrcweir case HTML_SPAN_ON: 1775cdf0e10cSrcweir case HTML_SPAN_OFF: 1776cdf0e10cSrcweir case HTML_SUBSCRIPT_ON: 1777cdf0e10cSrcweir case HTML_SUBSCRIPT_OFF: 1778cdf0e10cSrcweir case HTML_SUPERSCRIPT_ON: 1779cdf0e10cSrcweir case HTML_SUPERSCRIPT_OFF: 1780cdf0e10cSrcweir case HTML_BIGPRINT_ON: 1781cdf0e10cSrcweir case 
HTML_BIGPRINT_OFF: 1782cdf0e10cSrcweir case HTML_SMALLPRINT_OFF: 1783cdf0e10cSrcweir case HTML_SMALLPRINT_ON: 1784cdf0e10cSrcweir 1785cdf0e10cSrcweir case HTML_EMPHASIS_ON: 1786cdf0e10cSrcweir case HTML_EMPHASIS_OFF: 1787cdf0e10cSrcweir case HTML_CITIATION_ON: 1788cdf0e10cSrcweir case HTML_CITIATION_OFF: 1789cdf0e10cSrcweir case HTML_STRONG_ON: 1790cdf0e10cSrcweir case HTML_STRONG_OFF: 1791cdf0e10cSrcweir case HTML_CODE_ON: 1792cdf0e10cSrcweir case HTML_CODE_OFF: 1793cdf0e10cSrcweir case HTML_SAMPLE_ON: 1794cdf0e10cSrcweir case HTML_SAMPLE_OFF: 1795cdf0e10cSrcweir case HTML_KEYBOARD_ON: 1796cdf0e10cSrcweir case HTML_KEYBOARD_OFF: 1797cdf0e10cSrcweir case HTML_VARIABLE_ON: 1798cdf0e10cSrcweir case HTML_VARIABLE_OFF: 1799cdf0e10cSrcweir case HTML_DEFINSTANCE_ON: 1800cdf0e10cSrcweir case HTML_DEFINSTANCE_OFF: 1801cdf0e10cSrcweir case HTML_SHORTQUOTE_ON: 1802cdf0e10cSrcweir case HTML_SHORTQUOTE_OFF: 1803cdf0e10cSrcweir case HTML_LANGUAGE_ON: 1804cdf0e10cSrcweir case HTML_LANGUAGE_OFF: 1805cdf0e10cSrcweir case HTML_AUTHOR_ON: 1806cdf0e10cSrcweir case HTML_AUTHOR_OFF: 1807cdf0e10cSrcweir case HTML_PERSON_ON: 1808cdf0e10cSrcweir case HTML_PERSON_OFF: 1809cdf0e10cSrcweir case HTML_ACRONYM_ON: 1810cdf0e10cSrcweir case HTML_ACRONYM_OFF: 1811cdf0e10cSrcweir case HTML_ABBREVIATION_ON: 1812cdf0e10cSrcweir case HTML_ABBREVIATION_OFF: 1813cdf0e10cSrcweir case HTML_INSERTEDTEXT_ON: 1814cdf0e10cSrcweir case HTML_INSERTEDTEXT_OFF: 1815cdf0e10cSrcweir case HTML_DELETEDTEXT_ON: 1816cdf0e10cSrcweir case HTML_DELETEDTEXT_OFF: 1817cdf0e10cSrcweir case HTML_TELETYPE_ON: 1818cdf0e10cSrcweir case HTML_TELETYPE_OFF: 1819cdf0e10cSrcweir 1820cdf0e10cSrcweir break; 1821cdf0e10cSrcweir 1822cdf0e10cSrcweir // der Rest wird als unbekanntes Token behandelt 1823cdf0e10cSrcweir default: 1824cdf0e10cSrcweir if( nToken ) 1825cdf0e10cSrcweir { 1826cdf0e10cSrcweir nToken = 1827cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) 1828cdf0e10cSrcweir ? 
HTML_UNKNOWNCONTROL_OFF 1829cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON ); 1830cdf0e10cSrcweir } 1831cdf0e10cSrcweir break; 1832cdf0e10cSrcweir } 1833cdf0e10cSrcweir 1834cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1835cdf0e10cSrcweir 1836cdf0e10cSrcweir return nToken; 1837cdf0e10cSrcweir } 1838cdf0e10cSrcweir 1839cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken ) 1840cdf0e10cSrcweir { 1841cdf0e10cSrcweir switch( nToken ) 1842cdf0e10cSrcweir { 1843cdf0e10cSrcweir case HTML_NEWPARA: 1844cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 1845cdf0e10cSrcweir nToken = 0; 1846cdf0e10cSrcweir case HTML_TEXTTOKEN: 1847cdf0e10cSrcweir case HTML_NONBREAKSPACE: 1848cdf0e10cSrcweir case HTML_SOFTHYPH: 1849cdf0e10cSrcweir break; // bleiben erhalten 1850cdf0e10cSrcweir 1851cdf0e10cSrcweir default: 1852cdf0e10cSrcweir if( nToken ) 1853cdf0e10cSrcweir { 1854cdf0e10cSrcweir if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) ) 1855cdf0e10cSrcweir { 1856cdf0e10cSrcweir sSaveToken.Insert( '<', 0 ); 1857cdf0e10cSrcweir sSaveToken.Insert( '/', 1 ); 1858cdf0e10cSrcweir } 1859cdf0e10cSrcweir else 1860cdf0e10cSrcweir sSaveToken.Insert( '<', 0 ); 1861cdf0e10cSrcweir if( aToken.Len() ) 1862cdf0e10cSrcweir { 1863cdf0e10cSrcweir UnescapeToken(); 1864cdf0e10cSrcweir sSaveToken += (sal_Unicode)' '; 1865cdf0e10cSrcweir aToken.Insert( sSaveToken, 0 ); 1866cdf0e10cSrcweir } 1867cdf0e10cSrcweir else 1868cdf0e10cSrcweir aToken = sSaveToken; 1869cdf0e10cSrcweir aToken += (sal_Unicode)'>'; 1870cdf0e10cSrcweir nToken = HTML_TEXTTOKEN; 1871cdf0e10cSrcweir } 1872cdf0e10cSrcweir break; 1873cdf0e10cSrcweir } 1874cdf0e10cSrcweir 1875cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1876cdf0e10cSrcweir 1877cdf0e10cSrcweir return nToken; 1878cdf0e10cSrcweir } 1879cdf0e10cSrcweir 1880cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken ) 1881cdf0e10cSrcweir { 1882cdf0e10cSrcweir switch( nToken ) 1883cdf0e10cSrcweir { 1884cdf0e10cSrcweir case HTML_NEWPARA: 1885cdf0e10cSrcweir if( bPre_IgnoreNewPara ) 
1886cdf0e10cSrcweir nToken = 0; 1887cdf0e10cSrcweir case HTML_TEXTTOKEN: 1888cdf0e10cSrcweir case HTML_NONBREAKSPACE: 1889cdf0e10cSrcweir case HTML_SOFTHYPH: 1890cdf0e10cSrcweir break; // bleiben erhalten 1891cdf0e10cSrcweir 1892cdf0e10cSrcweir default: 1893cdf0e10cSrcweir if( nToken ) 1894cdf0e10cSrcweir { 1895cdf0e10cSrcweir nToken = 1896cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken)) 1897cdf0e10cSrcweir ? HTML_UNKNOWNCONTROL_OFF 1898cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON ); 1899cdf0e10cSrcweir } 1900cdf0e10cSrcweir break; 1901cdf0e10cSrcweir } 1902cdf0e10cSrcweir 1903cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False; 1904cdf0e10cSrcweir 1905cdf0e10cSrcweir return nToken; 1906cdf0e10cSrcweir } 1907cdf0e10cSrcweir 1908cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader, 1909cdf0e10cSrcweir sal_Bool bSwitchToUCS2, 1910cdf0e10cSrcweir rtl_TextEncoding eEnc ) 1911cdf0e10cSrcweir { 1912cdf0e10cSrcweir // Einer der folgenden regulaeren Ausdrucke muss sich auf den String 1913cdf0e10cSrcweir // anwenden lassen, damit das Dok ein HTML-Dokument ist. 1914cdf0e10cSrcweir // 1915cdf0e10cSrcweir // ^[^<]*<[^ \t]*[> \t] 1916cdf0e10cSrcweir // ------- 1917cdf0e10cSrcweir // ^<! 
1918cdf0e10cSrcweir // 1919cdf0e10cSrcweir // wobei der unterstrichene Teilausdruck einem HTML-Token 1920cdf0e10cSrcweir // ensprechen muss 1921cdf0e10cSrcweir 1922cdf0e10cSrcweir ByteString sCmp; 1923cdf0e10cSrcweir sal_Bool bUCS2B = sal_False; 1924cdf0e10cSrcweir if( bSwitchToUCS2 ) 1925cdf0e10cSrcweir { 1926cdf0e10cSrcweir if( 0xfeU == (sal_uChar)pHeader[0] && 1927cdf0e10cSrcweir 0xffU == (sal_uChar)pHeader[1] ) 1928cdf0e10cSrcweir { 1929cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2; 1930cdf0e10cSrcweir bUCS2B = sal_True; 1931cdf0e10cSrcweir } 1932cdf0e10cSrcweir else if( 0xffU == (sal_uChar)pHeader[0] && 1933cdf0e10cSrcweir 0xfeU == (sal_uChar)pHeader[1] ) 1934cdf0e10cSrcweir { 1935cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2; 1936cdf0e10cSrcweir } 1937cdf0e10cSrcweir } 1938cdf0e10cSrcweir if 1939cdf0e10cSrcweir ( 1940cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 == eEnc && 1941cdf0e10cSrcweir ( 1942cdf0e10cSrcweir (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) || 1943cdf0e10cSrcweir (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1]) 1944cdf0e10cSrcweir ) 1945cdf0e10cSrcweir ) 1946cdf0e10cSrcweir { 1947cdf0e10cSrcweir if( 0xfe == (sal_uChar)pHeader[0] ) 1948cdf0e10cSrcweir bUCS2B = sal_True; 1949cdf0e10cSrcweir 1950cdf0e10cSrcweir xub_StrLen nLen; 1951cdf0e10cSrcweir for( nLen = 2; 1952cdf0e10cSrcweir pHeader[nLen] != 0 || pHeader[nLen+1] != 0; 1953cdf0e10cSrcweir nLen+=2 ) 1954cdf0e10cSrcweir ; 1955cdf0e10cSrcweir 1956cdf0e10cSrcweir ::rtl::OStringBuffer sTmp( (nLen - 2)/2 ); 1957cdf0e10cSrcweir for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 ) 1958cdf0e10cSrcweir { 1959cdf0e10cSrcweir sal_Unicode cUC; 1960cdf0e10cSrcweir if( bUCS2B ) 1961cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1]; 1962cdf0e10cSrcweir else 1963cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos]; 1964cdf0e10cSrcweir if( 0U == cUC ) 1965cdf0e10cSrcweir break; 1966cdf0e10cSrcweir 1967cdf0e10cSrcweir sTmp.append( cUC < 
256U ? (sal_Char)cUC : '.' ); 1968cdf0e10cSrcweir } 1969cdf0e10cSrcweir sCmp = ByteString( sTmp.makeStringAndClear() ); 1970cdf0e10cSrcweir } 1971cdf0e10cSrcweir else 1972cdf0e10cSrcweir { 1973cdf0e10cSrcweir sCmp = (sal_Char *)pHeader; 1974cdf0e10cSrcweir } 1975cdf0e10cSrcweir 1976cdf0e10cSrcweir sCmp.ToUpperAscii(); 1977cdf0e10cSrcweir 1978cdf0e10cSrcweir // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen 1979cdf0e10cSrcweir xub_StrLen nStart = sCmp.Search( '<' ); 1980cdf0e10cSrcweir if( STRING_NOTFOUND == nStart ) 1981cdf0e10cSrcweir return sal_False; 1982cdf0e10cSrcweir nStart++; 1983cdf0e10cSrcweir 1984cdf0e10cSrcweir // danach duerfen beliebige andere Zeichen bis zu einem blank oder 1985cdf0e10cSrcweir // '>' kommen 1986cdf0e10cSrcweir sal_Char c; 1987cdf0e10cSrcweir xub_StrLen nPos; 1988cdf0e10cSrcweir for( nPos = nStart; nPos<sCmp.Len(); nPos++ ) 1989cdf0e10cSrcweir { 1990cdf0e10cSrcweir if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) ) 1991cdf0e10cSrcweir break; 1992cdf0e10cSrcweir } 1993cdf0e10cSrcweir 1994cdf0e10cSrcweir // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML 1995cdf0e10cSrcweir if( nPos==nStart ) 1996cdf0e10cSrcweir return sal_False; 1997cdf0e10cSrcweir 1998cdf0e10cSrcweir // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes 1999cdf0e10cSrcweir // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht 2000cdf0e10cSrcweir // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML 2001cdf0e10cSrcweir // interpretiert. 2002cdf0e10cSrcweir String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US ); 2003cdf0e10cSrcweir int nTok = GetHTMLToken( sTest ); 2004cdf0e10cSrcweir if( 0 != nTok && HTML_DIRLIST_ON != nTok ) 2005cdf0e10cSrcweir return sal_True; 2006cdf0e10cSrcweir 2007cdf0e10cSrcweir // oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#) 2008cdf0e10cSrcweir if( nStart == 1 && '!' 
== sCmp.GetChar( 1 ) ) 2009cdf0e10cSrcweir return sal_True; 2010cdf0e10cSrcweir 2011cdf0e10cSrcweir // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen 2012cdf0e10cSrcweir nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html ); 2013cdf0e10cSrcweir if( nStart!=STRING_NOTFOUND && 2014cdf0e10cSrcweir nStart>0 && '<'==sCmp.GetChar(nStart-1) && 2015cdf0e10cSrcweir nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) ) 2016cdf0e10cSrcweir return sal_True; 2017cdf0e10cSrcweir 2018cdf0e10cSrcweir // sonst ist es wohl doch eher kein HTML-Dokument 2019cdf0e10cSrcweir return sal_False; 2020cdf0e10cSrcweir } 2021cdf0e10cSrcweir 2022cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL ) 2023cdf0e10cSrcweir { 2024cdf0e10cSrcweir if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) || 2025cdf0e10cSrcweir rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL ) 2026cdf0e10cSrcweir return sal_False; 2027cdf0e10cSrcweir 2028cdf0e10cSrcweir sal_Bool bFound = sal_False; 2029cdf0e10cSrcweir 2030cdf0e10cSrcweir if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL ) 2031cdf0e10cSrcweir { 2032cdf0e10cSrcweir String aName( rURL.Copy(16) ); 2033cdf0e10cSrcweir switch( aName.GetChar(0) ) 2034cdf0e10cSrcweir { 2035cdf0e10cSrcweir case 'b': 2036cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary ); 2037cdf0e10cSrcweir break; 2038cdf0e10cSrcweir case 'i': 2039cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) || 2040cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index ); 2041cdf0e10cSrcweir break; 2042cdf0e10cSrcweir case 'm': 2043cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) || 2044cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie ); 2045cdf0e10cSrcweir break; 2046cdf0e10cSrcweir case 's': 2047cdf0e10cSrcweir bFound = aName.EqualsAscii( 
OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound ); 2048cdf0e10cSrcweir break; 2049cdf0e10cSrcweir case 't': 2050cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) || 2051cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text ); 2052cdf0e10cSrcweir break; 2053cdf0e10cSrcweir case 'u': 2054cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown ); 2055cdf0e10cSrcweir break; 2056cdf0e10cSrcweir } 2057cdf0e10cSrcweir } 2058cdf0e10cSrcweir else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL ) 2059cdf0e10cSrcweir { 2060cdf0e10cSrcweir String aName( rURL.Copy(14) ); 2061cdf0e10cSrcweir switch( aName.GetChar(0) ) 2062cdf0e10cSrcweir { 2063cdf0e10cSrcweir case 'b': 2064cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata ); 2065cdf0e10cSrcweir break; 2066cdf0e10cSrcweir case 'd': 2067cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed ); 2068cdf0e10cSrcweir break; 2069cdf0e10cSrcweir case 'e': 2070cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed ); 2071cdf0e10cSrcweir break; 2072cdf0e10cSrcweir case 'i': 2073cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure ); 2074cdf0e10cSrcweir break; 2075cdf0e10cSrcweir case 'n': 2076cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound ); 2077cdf0e10cSrcweir break; 2078cdf0e10cSrcweir } 2079cdf0e10cSrcweir } 2080cdf0e10cSrcweir if( bFound ) 2081cdf0e10cSrcweir { 2082cdf0e10cSrcweir String sTmp ( rURL ); 2083cdf0e10cSrcweir rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image ); 2084cdf0e10cSrcweir rURL.Append( sTmp ); 2085cdf0e10cSrcweir } 2086cdf0e10cSrcweir 2087cdf0e10cSrcweir return bFound; 2088cdf0e10cSrcweir } 2089cdf0e10cSrcweir 2090cdf0e10cSrcweir #ifdef USED 2091cdf0e10cSrcweir void HTMLParser::SaveState( int nToken ) 2092cdf0e10cSrcweir { 
2093cdf0e10cSrcweir SvParser::SaveState( nToken ); 2094cdf0e10cSrcweir } 2095cdf0e10cSrcweir 2096cdf0e10cSrcweir void HTMLParser::RestoreState() 2097cdf0e10cSrcweir { 2098cdf0e10cSrcweir SvParser::RestoreState(); 2099cdf0e10cSrcweir } 2100cdf0e10cSrcweir #endif 2101cdf0e10cSrcweir 2102cdf0e10cSrcweir 2103cdf0e10cSrcweir enum eHtmlMetas { 2104cdf0e10cSrcweir HTML_META_NONE = 0, 2105cdf0e10cSrcweir HTML_META_AUTHOR, 2106cdf0e10cSrcweir HTML_META_DESCRIPTION, 2107cdf0e10cSrcweir HTML_META_KEYWORDS, 2108cdf0e10cSrcweir HTML_META_REFRESH, 2109cdf0e10cSrcweir HTML_META_CLASSIFICATION, 2110cdf0e10cSrcweir HTML_META_CREATED, 2111cdf0e10cSrcweir HTML_META_CHANGEDBY, 2112cdf0e10cSrcweir HTML_META_CHANGED, 2113cdf0e10cSrcweir HTML_META_GENERATOR, 2114cdf0e10cSrcweir HTML_META_SDFOOTNOTE, 2115cdf0e10cSrcweir HTML_META_SDENDNOTE, 2116cdf0e10cSrcweir HTML_META_CONTENT_TYPE 2117cdf0e10cSrcweir }; 2118cdf0e10cSrcweir 2119cdf0e10cSrcweir // <META NAME=xxx> 2120cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] = 2121cdf0e10cSrcweir { 2122cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR }, 2123cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED }, 2124cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY }, 2125cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION}, 2126cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE }, 2127cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED }, 2128cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION }, 2129cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS }, 2130cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR }, 2131cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH }, 2132cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE }, 
2133cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE }, 2134cdf0e10cSrcweir { 0, 0 } 2135cdf0e10cSrcweir }; 2136cdf0e10cSrcweir 2137cdf0e10cSrcweir 2138cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & ) 2139cdf0e10cSrcweir { 2140cdf0e10cSrcweir } 2141cdf0e10cSrcweir 2142cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl( 2143cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps, 2144cdf0e10cSrcweir SvKeyValueIterator *i_pHTTPHeader, 2145cdf0e10cSrcweir const HTMLOptions *i_pOptions, 2146cdf0e10cSrcweir rtl_TextEncoding& o_rEnc ) 2147cdf0e10cSrcweir { 2148cdf0e10cSrcweir String aName, aContent; 2149cdf0e10cSrcweir sal_uInt16 nAction = HTML_META_NONE; 2150cdf0e10cSrcweir bool bHTTPEquiv = false, bChanged = false; 2151cdf0e10cSrcweir 2152cdf0e10cSrcweir for ( sal_uInt16 i = i_pOptions->Count(); i; ) 2153cdf0e10cSrcweir { 2154cdf0e10cSrcweir const HTMLOption *pOption = (*i_pOptions)[ --i ]; 2155cdf0e10cSrcweir switch ( pOption->GetToken() ) 2156cdf0e10cSrcweir { 2157cdf0e10cSrcweir case HTML_O_NAME: 2158cdf0e10cSrcweir aName = pOption->GetString(); 2159cdf0e10cSrcweir if ( HTML_META_NONE==nAction ) 2160cdf0e10cSrcweir { 2161cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable ); 2162cdf0e10cSrcweir } 2163cdf0e10cSrcweir break; 2164cdf0e10cSrcweir case HTML_O_HTTPEQUIV: 2165cdf0e10cSrcweir aName = pOption->GetString(); 2166cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable ); 2167cdf0e10cSrcweir bHTTPEquiv = true; 2168cdf0e10cSrcweir break; 2169cdf0e10cSrcweir case HTML_O_CONTENT: 2170cdf0e10cSrcweir aContent = pOption->GetString(); 2171cdf0e10cSrcweir break; 2172cdf0e10cSrcweir } 2173cdf0e10cSrcweir } 2174cdf0e10cSrcweir 2175cdf0e10cSrcweir if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction ) 2176cdf0e10cSrcweir { 2177cdf0e10cSrcweir // if it is not a Description, remove CRs and LFs from CONTENT 2178cdf0e10cSrcweir aContent.EraseAllChars( _CR ); 
2179cdf0e10cSrcweir aContent.EraseAllChars( _LF ); 2180cdf0e10cSrcweir } 2181cdf0e10cSrcweir else 2182cdf0e10cSrcweir { 2183cdf0e10cSrcweir // convert line endings for Description 2184cdf0e10cSrcweir aContent.ConvertLineEnd(); 2185cdf0e10cSrcweir } 2186cdf0e10cSrcweir 2187cdf0e10cSrcweir 2188cdf0e10cSrcweir if ( bHTTPEquiv && i_pHTTPHeader ) 2189cdf0e10cSrcweir { 2190cdf0e10cSrcweir // #57232#: Netscape seems to just ignore a closing ", so we do too 2191cdf0e10cSrcweir if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) ) 2192cdf0e10cSrcweir { 2193cdf0e10cSrcweir aContent.Erase( aContent.Len() - 1 ); 2194cdf0e10cSrcweir } 2195cdf0e10cSrcweir SvKeyValue aKeyValue( aName, aContent ); 2196cdf0e10cSrcweir i_pHTTPHeader->Append( aKeyValue ); 2197cdf0e10cSrcweir } 2198cdf0e10cSrcweir 2199cdf0e10cSrcweir switch ( nAction ) 2200cdf0e10cSrcweir { 2201cdf0e10cSrcweir case HTML_META_AUTHOR: 2202cdf0e10cSrcweir if (i_xDocProps.is()) { 2203cdf0e10cSrcweir i_xDocProps->setAuthor( aContent ); 2204cdf0e10cSrcweir bChanged = true; 2205cdf0e10cSrcweir } 2206cdf0e10cSrcweir break; 2207cdf0e10cSrcweir case HTML_META_DESCRIPTION: 2208cdf0e10cSrcweir if (i_xDocProps.is()) { 2209cdf0e10cSrcweir i_xDocProps->setDescription( aContent ); 2210cdf0e10cSrcweir bChanged = true; 2211cdf0e10cSrcweir } 2212cdf0e10cSrcweir break; 2213cdf0e10cSrcweir case HTML_META_KEYWORDS: 2214cdf0e10cSrcweir if (i_xDocProps.is()) { 2215cdf0e10cSrcweir i_xDocProps->setKeywords( 2216cdf0e10cSrcweir ::comphelper::string::convertCommaSeparated(aContent)); 2217cdf0e10cSrcweir bChanged = true; 2218cdf0e10cSrcweir } 2219cdf0e10cSrcweir break; 2220cdf0e10cSrcweir case HTML_META_CLASSIFICATION: 2221cdf0e10cSrcweir if (i_xDocProps.is()) { 2222cdf0e10cSrcweir i_xDocProps->setSubject( aContent ); 2223cdf0e10cSrcweir bChanged = true; 2224cdf0e10cSrcweir } 2225cdf0e10cSrcweir break; 2226cdf0e10cSrcweir 2227cdf0e10cSrcweir case HTML_META_CHANGEDBY: 2228cdf0e10cSrcweir if (i_xDocProps.is()) { 2229cdf0e10cSrcweir 
i_xDocProps->setModifiedBy( aContent ); 2230cdf0e10cSrcweir } 2231cdf0e10cSrcweir break; 2232cdf0e10cSrcweir 2233cdf0e10cSrcweir case HTML_META_CREATED: 2234cdf0e10cSrcweir case HTML_META_CHANGED: 2235cdf0e10cSrcweir if ( i_xDocProps.is() && aContent.Len() && 2236cdf0e10cSrcweir aContent.GetTokenCount() == 2 ) 2237cdf0e10cSrcweir { 2238cdf0e10cSrcweir Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() ); 2239cdf0e10cSrcweir Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() ); 2240cdf0e10cSrcweir DateTime aDateTime( aDate, aTime ); 2241cdf0e10cSrcweir ::util::DateTime uDT(aDateTime.Get100Sec(), 2242cdf0e10cSrcweir aDateTime.GetSec(), aDateTime.GetMin(), 2243cdf0e10cSrcweir aDateTime.GetHour(), aDateTime.GetDay(), 2244cdf0e10cSrcweir aDateTime.GetMonth(), aDateTime.GetYear()); 2245cdf0e10cSrcweir if ( HTML_META_CREATED==nAction ) 2246cdf0e10cSrcweir i_xDocProps->setCreationDate( uDT ); 2247cdf0e10cSrcweir else 2248cdf0e10cSrcweir i_xDocProps->setModificationDate( uDT ); 2249cdf0e10cSrcweir bChanged = true; 2250cdf0e10cSrcweir } 2251cdf0e10cSrcweir break; 2252cdf0e10cSrcweir 2253cdf0e10cSrcweir case HTML_META_REFRESH: 2254cdf0e10cSrcweir DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader, 2255cdf0e10cSrcweir "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" ); 2256cdf0e10cSrcweir break; 2257cdf0e10cSrcweir 2258cdf0e10cSrcweir case HTML_META_CONTENT_TYPE: 2259cdf0e10cSrcweir if ( aContent.Len() ) 2260cdf0e10cSrcweir { 2261cdf0e10cSrcweir o_rEnc = GetEncodingByMIME( aContent ); 2262cdf0e10cSrcweir } 2263cdf0e10cSrcweir break; 2264cdf0e10cSrcweir 2265cdf0e10cSrcweir case HTML_META_NONE: 2266cdf0e10cSrcweir if ( !bHTTPEquiv ) 2267cdf0e10cSrcweir { 2268cdf0e10cSrcweir if (i_xDocProps.is()) 2269cdf0e10cSrcweir { 2270cdf0e10cSrcweir uno::Reference<beans::XPropertyContainer> xUDProps 2271cdf0e10cSrcweir = i_xDocProps->getUserDefinedProperties(); 2272cdf0e10cSrcweir try { 2273cdf0e10cSrcweir xUDProps->addProperty(aName, 2274cdf0e10cSrcweir 
beans::PropertyAttribute::REMOVEABLE, 2275cdf0e10cSrcweir uno::makeAny(::rtl::OUString(aContent))); 2276cdf0e10cSrcweir AddMetaUserDefined(aName); 2277cdf0e10cSrcweir bChanged = true; 2278cdf0e10cSrcweir } catch (uno::Exception &) { 2279cdf0e10cSrcweir // ignore 2280cdf0e10cSrcweir } 2281cdf0e10cSrcweir } 2282cdf0e10cSrcweir } 2283cdf0e10cSrcweir break; 2284cdf0e10cSrcweir default: 2285cdf0e10cSrcweir break; 2286cdf0e10cSrcweir } 2287cdf0e10cSrcweir 2288cdf0e10cSrcweir return bChanged; 2289cdf0e10cSrcweir } 2290cdf0e10cSrcweir 2291cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions( 2292cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps, 2293cdf0e10cSrcweir SvKeyValueIterator *i_pHeader ) 2294cdf0e10cSrcweir { 2295cdf0e10cSrcweir sal_uInt16 nContentOption = HTML_O_CONTENT; 2296cdf0e10cSrcweir rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW; 2297cdf0e10cSrcweir 2298cdf0e10cSrcweir bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader, 2299cdf0e10cSrcweir GetOptions(&nContentOption), 2300cdf0e10cSrcweir eEnc ); 2301cdf0e10cSrcweir 2302cdf0e10cSrcweir // If the encoding is set by a META tag, it may only overwrite the 2303cdf0e10cSrcweir // current encoding if both, the current and the new encoding, are 1-sal_uInt8 2304cdf0e10cSrcweir // encodings. Everything else cannot lead to reasonable results. 
2305cdf0e10cSrcweir if (RTL_TEXTENCODING_DONTKNOW != eEnc && 2306cdf0e10cSrcweir rtl_isOctetTextEncoding( eEnc ) && 2307cdf0e10cSrcweir rtl_isOctetTextEncoding( GetSrcEncoding() ) ) 2308cdf0e10cSrcweir { 2309cdf0e10cSrcweir eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973# 2310cdf0e10cSrcweir SetSrcEncoding( eEnc ); 2311cdf0e10cSrcweir } 2312cdf0e10cSrcweir 2313cdf0e10cSrcweir return bRet; 2314cdf0e10cSrcweir } 2315cdf0e10cSrcweir 2316cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime ) 2317cdf0e10cSrcweir { 2318cdf0e10cSrcweir ByteString sType; 2319cdf0e10cSrcweir ByteString sSubType; 2320cdf0e10cSrcweir INetContentTypeParameterList aParameters; 2321cdf0e10cSrcweir ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US ); 2322cdf0e10cSrcweir if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters)) 2323cdf0e10cSrcweir { 2324cdf0e10cSrcweir const INetContentTypeParameter * pCharset 2325cdf0e10cSrcweir = aParameters.find("charset"); 2326cdf0e10cSrcweir if (pCharset != 0) 2327cdf0e10cSrcweir { 2328cdf0e10cSrcweir ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US ); 2329cdf0e10cSrcweir return GetExtendedCompatibilityTextEncoding( 2330cdf0e10cSrcweir rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) ); 2331cdf0e10cSrcweir } 2332cdf0e10cSrcweir } 2333cdf0e10cSrcweir return RTL_TEXTENCODING_DONTKNOW; 2334cdf0e10cSrcweir } 2335cdf0e10cSrcweir 2336cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader ) 2337cdf0e10cSrcweir { 2338cdf0e10cSrcweir rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW; 2339cdf0e10cSrcweir if( pHTTPHeader ) 2340cdf0e10cSrcweir { 2341cdf0e10cSrcweir SvKeyValue aKV; 2342cdf0e10cSrcweir for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont; 2343cdf0e10cSrcweir bCont = pHTTPHeader->GetNext( aKV ) ) 2344cdf0e10cSrcweir { 2345cdf0e10cSrcweir if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) ) 
2346cdf0e10cSrcweir { 2347cdf0e10cSrcweir if( aKV.GetValue().Len() ) 2348cdf0e10cSrcweir { 2349cdf0e10cSrcweir eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() ); 2350cdf0e10cSrcweir } 2351cdf0e10cSrcweir } 2352cdf0e10cSrcweir } 2353cdf0e10cSrcweir } 2354cdf0e10cSrcweir return eRet; 2355cdf0e10cSrcweir } 2356cdf0e10cSrcweir 2357cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader( 2358cdf0e10cSrcweir SvKeyValueIterator *pHTTPHeader ) 2359cdf0e10cSrcweir { 2360cdf0e10cSrcweir sal_Bool bRet = sal_False; 2361cdf0e10cSrcweir rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader ); 2362cdf0e10cSrcweir if(RTL_TEXTENCODING_DONTKNOW != eEnc) 2363cdf0e10cSrcweir { 2364cdf0e10cSrcweir SetSrcEncoding( eEnc ); 2365cdf0e10cSrcweir bRet = sal_True; 2366cdf0e10cSrcweir } 2367cdf0e10cSrcweir return bRet; 2368cdf0e10cSrcweir } 2369