xref: /AOO42X/main/sw/source/filter/ascii/parasc.cxx (revision facfa769b24085cdd3483ec17d21b4f42b335e99)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sw.hxx"
26 
27 #include <tools/stream.hxx>
28 #include <hintids.hxx>
29 #include <rtl/tencinfo.h>
30 #include <sfx2/printer.hxx>
31 #include <editeng/fontitem.hxx>
32 #include <editeng/langitem.hxx>
33 #include <editeng/brkitem.hxx>
34 #include <editeng/scripttypeitem.hxx>
35 #include <shellio.hxx>
36 #include <doc.hxx>
37 #include <swtypes.hxx>
38 #include <ndtxt.hxx>
39 #include <pam.hxx>
40 #include <frmatr.hxx>
41 #include <fltini.hxx>
42 #include <pagedesc.hxx>
43 #include <breakit.hxx>
44 #include <swerror.h>
45 #ifndef _STATSTR_HRC
46 #include <statstr.hrc> // ResId fuer Statusleiste
47 #endif
48 #include <mdiexp.hxx> // ...Percent()
49 #include <poolfmt.hxx>
50 
51 #include "vcl/metric.hxx"
52 
53 #define ASC_BUFFLEN 4096
54 
55 class SwASCIIParser
56 {
57     SwDoc* pDoc;
58     SwPaM* pPam;
59     SvStream& rInput;
60     sal_Char* pArr;
61     const SwAsciiOptions& rOpt;
62     SfxItemSet* pItemSet;
63     long nFileSize;
64     sal_uInt16 nScript;
65     bool bNewDoc;
66 
67     sal_uLong ReadChars();
68     void InsertText( const String& rStr );
69 
70 public:
71     SwASCIIParser( SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
72                             int bReadNewDoc, const SwAsciiOptions& rOpts );
73     ~SwASCIIParser();
74 
75     sal_uLong CallParser();
76 };
77 
78 
79 // Aufruf fuer die allg. Reader-Schnittstelle
Read(SwDoc & rDoc,const String &,SwPaM & rPam,const String &)80 sal_uLong AsciiReader::Read( SwDoc &rDoc, const String&, SwPaM &rPam, const String & )
81 {
82     if( !pStrm )
83     {
84         ASSERT( sal_False, "ASCII-Read without stream" );
85         return ERR_SWG_READ_ERROR;
86     }
87 
88     //JP 18.01.96: Alle Ueberschriften sind normalerweise ohne
89     //              Kapitelnummer. Darum hier explizit abschalten
90     //              weil das Default jetzt wieder auf AN ist.
91     if( !bInsertMode )
92         Reader::SetNoOutlineNum( rDoc );
93 
94     SwASCIIParser* pParser = new SwASCIIParser( &rDoc, rPam, *pStrm,
95                                         !bInsertMode, aOpt.GetASCIIOpts() );
96     sal_uLong nRet = pParser->CallParser();
97 
98     delete pParser;
99     // after Read reset the options
100     aOpt.ResetASCIIOpts();
101     return nRet;
102 }
103 
SwASCIIParser(SwDoc * pD,const SwPaM & rCrsr,SvStream & rIn,int bReadNewDoc,const SwAsciiOptions & rOpts)104 SwASCIIParser::SwASCIIParser(SwDoc* pD, const SwPaM& rCrsr, SvStream& rIn,
105     int bReadNewDoc, const SwAsciiOptions& rOpts)
106     : pDoc(pD), rInput(rIn), rOpt(rOpts), nScript(0), bNewDoc(bReadNewDoc)
107 {
108     pPam = new SwPaM( *rCrsr.GetPoint() );
109     pArr = new sal_Char [ ASC_BUFFLEN + 2 ];
110 
111     pItemSet = new SfxItemSet( pDoc->GetAttrPool(),
112                 RES_CHRATR_FONT,        RES_CHRATR_LANGUAGE,
113                 RES_CHRATR_CJK_FONT,    RES_CHRATR_CJK_LANGUAGE,
114                 RES_CHRATR_CTL_FONT,    RES_CHRATR_CTL_LANGUAGE,
115                 0 );
116 
117     // set defaults from the options
118     if( rOpt.GetLanguage() )
119     {
120         SvxLanguageItem aLang( (LanguageType)rOpt.GetLanguage(),
121                                 RES_CHRATR_LANGUAGE );
122         pItemSet->Put( aLang );
123         pItemSet->Put( aLang, RES_CHRATR_CJK_LANGUAGE );
124         pItemSet->Put( aLang, RES_CHRATR_CTL_LANGUAGE );
125     }
126     if( rOpt.GetFontName().Len() )
127     {
128         Font aTextFont( rOpt.GetFontName(), Size( 0, 10 ) );
129         if( pDoc->getPrinter( false ) )
130             aTextFont = pDoc->getPrinter( false )->GetFontMetric( aTextFont );
131         SvxFontItem aFont( aTextFont.GetFamily(), aTextFont.GetName(),
132                            aEmptyStr, aTextFont.GetPitch(), aTextFont.GetCharSet(), RES_CHRATR_FONT );
133         pItemSet->Put( aFont );
134         pItemSet->Put( aFont, RES_CHRATR_CJK_FONT );
135         pItemSet->Put( aFont, RES_CHRATR_CTL_FONT );
136     }
137 }
138 
~SwASCIIParser()139 SwASCIIParser::~SwASCIIParser()
140 {
141     delete pPam;
142     delete [] pArr;
143     delete pItemSet;
144 }
145 
146 
147 // Aufruf des Parsers
CallParser()148 sal_uLong SwASCIIParser::CallParser()
149 {
150     rInput.Seek(STREAM_SEEK_TO_END);
151     rInput.ResetError();
152 
153     nFileSize = rInput.Tell();
154     rInput.Seek(STREAM_SEEK_TO_BEGIN);
155     rInput.ResetError();
156 
157     ::StartProgress( STR_STATSTR_W4WREAD, 0, nFileSize, pDoc->GetDocShell() );
158 
159     SwPaM* pInsPam = 0;
160     xub_StrLen nSttCntnt = 0;
161     if (!bNewDoc)
162     {
163         const SwNodeIndex& rTmp = pPam->GetPoint()->nNode;
164         pInsPam = new SwPaM( rTmp, rTmp, 0, -1 );
165         nSttCntnt = pPam->GetPoint()->nContent.GetIndex();
166     }
167 
168     SwTxtFmtColl *pColl = 0;
169 
170     if (bNewDoc)
171     {
172         pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_HTML_PRE, false);
173         if (!pColl)
174             pColl = pDoc->GetTxtCollFromPool(RES_POOLCOLL_STANDARD,false);
175         if (pColl)
176             pDoc->SetTxtFmtColl(*pPam, pColl);
177     }
178 
179     sal_uLong nError = ReadChars();
180 
181     if( pItemSet )
182     {
183         // set only the attribute, for scanned scripts.
184         if( !( SCRIPTTYPE_LATIN & nScript ))
185         {
186             pItemSet->ClearItem( RES_CHRATR_FONT );
187             pItemSet->ClearItem( RES_CHRATR_LANGUAGE );
188         }
189         if( !( SCRIPTTYPE_ASIAN & nScript ))
190         {
191             pItemSet->ClearItem( RES_CHRATR_CJK_FONT );
192             pItemSet->ClearItem( RES_CHRATR_CJK_LANGUAGE );
193         }
194         if( !( SCRIPTTYPE_COMPLEX & nScript ))
195         {
196             pItemSet->ClearItem( RES_CHRATR_CTL_FONT );
197             pItemSet->ClearItem( RES_CHRATR_CTL_LANGUAGE );
198         }
199         if( pItemSet->Count() )
200         {
201             if( bNewDoc )
202             {
203                 if (pColl)
204                 {
205                     // Using the pool defaults for the font causes significant
206                     // trouble for the HTML filter, because it is not able
207                     // to export the pool defaults (or to be more precise:
208                     // the HTML filter is not able to detect whether a pool
209                     // default has changed or not. Even a comparison with the
210                     // HTMLi template does not work, because the defaults are
211                     // not copied when a new doc is created. The result of
212                     // comparing pool defaults therefor would be that the
213                     // defaults are exported always if the have changed for
214                     // text documents in general. That's not sensible, as well
215                     // as it is not sensible to export them always.
216                     sal_uInt16 aWhichIds[4] =
217                     {
218                         RES_CHRATR_FONT, RES_CHRATR_CJK_FONT,
219                         RES_CHRATR_CTL_FONT, 0
220                     };
221                     sal_uInt16 *pWhichIds = aWhichIds;
222                     while (*pWhichIds)
223                     {
224                         const SfxPoolItem *pItem;
225                         if (SFX_ITEM_SET == pItemSet->GetItemState(*pWhichIds,
226                             false, &pItem))
227                         {
228                             pColl->SetFmtAttr( *pItem );
229                             pItemSet->ClearItem( *pWhichIds );
230                         }
231                         ++pWhichIds;
232                     }
233                 }
234                 if (pItemSet->Count())
235                     pDoc->SetDefault(*pItemSet);
236             }
237             else if( pInsPam )
238             {
239                 // then set over the insert range the defined attributes
240                 *pInsPam->GetMark() = *pPam->GetPoint();
241                 pInsPam->GetPoint()->nNode++;
242                 pInsPam->GetPoint()->nContent.Assign(
243                                     pInsPam->GetCntntNode(), nSttCntnt );
244 
245                 // !!!!!
246                 ASSERT( sal_False, "Have to change - hard attr. to para. style" );
247                 pDoc->InsertItemSet( *pInsPam, *pItemSet, 0 );
248             }
249         }
250         delete pItemSet, pItemSet = 0;
251     }
252 
253     if( pInsPam )
254         delete pInsPam;
255 
256     ::EndProgress( pDoc->GetDocShell() );
257     return nError;
258 }
259 
ReadChars()260 sal_uLong SwASCIIParser::ReadChars()
261 {
262     sal_Unicode *pStt = 0, *pEnd = 0, *pLastStt = 0;
263     long nReadCnt = 0, nLineLen = 0;
264     sal_Unicode cLastCR = 0;
265     bool bSwapUnicode = false;
266 
267     const SwAsciiOptions *pUseMe=&rOpt;
268     SwAsciiOptions aEmpty;
269     if (nFileSize >= 2 &&
270         aEmpty.GetFontName() == rOpt.GetFontName() &&
271         aEmpty.GetCharSet() == rOpt.GetCharSet() &&
272         aEmpty.GetLanguage() == rOpt.GetLanguage() &&
273         aEmpty.GetParaFlags() == rOpt.GetParaFlags())
274     {
275         sal_uLong nLen, nOrig;
276         nOrig = nLen = rInput.Read(pArr, ASC_BUFFLEN);
277         CharSet eCharSet;
278         bool bRet = SwIoSystem::IsDetectableText(pArr, nLen, &eCharSet, &bSwapUnicode);
279         ASSERT(bRet, "Autodetect of text import without nag dialog must "
280             "have failed");
281         if (bRet && eCharSet != RTL_TEXTENCODING_DONTKNOW)
282         {
283             aEmpty.SetCharSet(eCharSet);
284             rInput.SeekRel(-(long(nLen)));
285         }
286         else
287             rInput.SeekRel(-(long(nOrig)));
288         pUseMe=&aEmpty;
289     }
290 
291     rtl_TextToUnicodeConverter hConverter=0;
292     rtl_TextToUnicodeContext hContext=0;
293     CharSet currentCharSet = pUseMe->GetCharSet();
294     if (RTL_TEXTENCODING_UCS2 != currentCharSet)
295     {
296         if( currentCharSet == RTL_TEXTENCODING_DONTKNOW )
297                 currentCharSet = RTL_TEXTENCODING_ASCII_US;
298         hConverter = rtl_createTextToUnicodeConverter( currentCharSet );
299         ASSERT( hConverter, "no string convert available" );
300         if (!hConverter)
301             return ERROR_SW_READ_BASE;
302         bSwapUnicode = false;
303         hContext = rtl_createTextToUnicodeContext( hConverter );
304     }
305     else if (pUseMe != &aEmpty) // Already successfully figured out type
306     {
307         rInput.StartReadingUnicodeText( currentCharSet );
308         bSwapUnicode = rInput.IsEndianSwap();
309     }
310 
311     String sWork;
312     sal_uLong nArrOffset = 0;
313 
314     do {
315         if( pStt >= pEnd )
316         {
317             if( pLastStt != pStt )
318                 InsertText( String( pLastStt ));
319 
320             // lese einen neuen Block ein
321             sal_uLong lGCount;
322             if( SVSTREAM_OK != rInput.GetError() || 0 == (lGCount =
323                         rInput.Read( pArr + nArrOffset,
324                                      ASC_BUFFLEN - nArrOffset )))
325                 break;      // aus der WHILE-Schleife heraus
326 
327             /*
328             #98380#
329             If there was some unconverted bytes on the last cycle then they
330             were put at the beginning of the array, so total bytes available
331             to convert this cycle includes them. If we found 0 following bytes
332             then we ignore the previous partial character.
333             */
334             lGCount+=nArrOffset;
335 
336             if( hConverter )
337             {
338                 sal_uInt32 nInfo;
339                 sal_Size nNewLen = lGCount, nCntBytes;
340                 sal_Unicode* pBuf = sWork.AllocBuffer( static_cast< xub_StrLen >(nNewLen) );
341 
342                 nNewLen = rtl_convertTextToUnicode( hConverter, hContext,
343                                 pArr, lGCount, pBuf, nNewLen,
344                                 (
345                                 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT |
346                                 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
347                                 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT |
348                                 RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE
349                                 ),
350                                 &nInfo,
351                                 &nCntBytes );
352                 if( 0 != ( nArrOffset = lGCount - nCntBytes ) )
353                     memmove( pArr, pArr + nCntBytes, nArrOffset );
354                 sWork.ReleaseBufferAccess( static_cast< xub_StrLen >(nNewLen) );
355 
356                 pStt = pLastStt = sWork.GetBufferAccess();
357                 pEnd = pStt + nNewLen;
358             }
359             else
360             {
361                 pStt = pLastStt = (sal_Unicode*)pArr;
362                 pEnd = (sal_Unicode*)(pArr + lGCount);
363 
364                 if( bSwapUnicode )
365                 {
366                     sal_Char* pF = pArr, *pN = pArr + 1;
367                     for( sal_uLong n = 0; n < lGCount; n += 2, pF += 2, pN += 2 )
368                     {
369                         sal_Char c = *pF;
370                         *pF = *pN;
371                         *pN = c;
372                     }
373                 }
374             }
375 
376             *pEnd = 0;
377             nReadCnt += lGCount;
378 
379             ::SetProgressState( nReadCnt, pDoc->GetDocShell() );
380 
381             if( cLastCR )
382             {
383                 if( 0x0a == *pStt && 0x0d == cLastCR )
384                     pLastStt = ++pStt;
385                 cLastCR = 0;
386                 nLineLen = 0;
387                 // JP 03.04.96: das letzte am Ende nehmen wir nicht
388                 if( !rInput.IsEof() || !(pEnd == pStt ||
389                     ( !*pEnd && pEnd == pStt+1 ) ) )
390                     pDoc->SplitNode( *pPam->GetPoint(), false );
391             }
392         }
393 
394         bool bIns = true, bSplitNode = false;
395         switch( *pStt )
396         {
397 //JP 12.11.2001: task 94636 - don't ignore all behind the zero character,
398 //                            change it to the default "control character"
399 //      case 0:
400 //                  pEnd = pStt;
401 //                  bIns = false ;
402 //                  break;
403 
404         case 0x0a:  if( LINEEND_LF == pUseMe->GetParaFlags() )
405                     {
406                         bIns = false;
407                         *pStt = 0;
408                         ++pStt;
409 
410                         // JP 03.04.96: das letzte am Ende nehmen wir nicht
411                         if( !rInput.IsEof() || pEnd != pStt )
412                             bSplitNode = true;
413                     }
414                     break;
415 
416         case 0x0d:  if( LINEEND_LF != pUseMe->GetParaFlags() )
417                     {
418                         bIns = false;
419                         *pStt = 0;
420                         ++pStt;
421 
422                         bool bChkSplit = false;
423                         if( LINEEND_CRLF == pUseMe->GetParaFlags() )
424                         {
425                             if( pStt == pEnd )
426                                 cLastCR = 0x0d;
427                             else if( 0x0a == *pStt )
428                             {
429                                 ++pStt;
430                                 bChkSplit = true;
431                             }
432                         }
433                         else
434                             bChkSplit = true;
435 
436                             // JP 03.04.96: das letzte am Ende nehmen wir nicht
437                         if( bChkSplit && ( !rInput.IsEof() || pEnd != pStt ))
438                             bSplitNode = true;
439                     }
440                     break;
441 
442         case 0x0c:
443                     {
444                         // dann mal einen harten Seitenumbruch einfuegen
445                         *pStt++ = 0;
446                         if( nLineLen )
447                         {
448                             // Change to charset system!!!!
449                             //rOpt.GetCharSet();
450                             InsertText( String( pLastStt ));
451                         }
452                         pDoc->SplitNode( *pPam->GetPoint(), false );
453                         pDoc->InsertPoolItem(
454                             *pPam, SvxFmtBreakItem( SVX_BREAK_PAGE_BEFORE, RES_BREAK ), 0);
455                         pLastStt = pStt;
456                         nLineLen = 0;
457                         bIns = false;
458                     }
459                     break;
460 
461         case 0x1a:
462                     if( nReadCnt == nFileSize && pStt+1 == pEnd )
463                         *pStt = 0;
464                     else
465                         *pStt = '#'; // Ersatzdarstellung
466                     break;
467 
468         case '\t':  break;
469 
470         default:
471             if( ' ' > *pStt )
472                     // Ctrl-Zchn gefunden ersetze durch '#'
473                 *pStt = '#';
474             break;
475         }
476 
477         if( bIns )
478         {
479             if( ( nLineLen >= MAX_ASCII_PARA - 100 ) &&
480                 ( ( *pStt == ' ' ) || ( nLineLen >= MAX_ASCII_PARA - 1 ) ) )
481             {
482                 sal_Unicode c = *pStt;
483                 *pStt = 0;
484                 InsertText( String( pLastStt ));
485                 pDoc->SplitNode( *pPam->GetPoint(), false );
486                 pLastStt = pStt;
487                 nLineLen = 0;
488                 *pStt = c;
489             }
490             ++pStt;
491             ++nLineLen;
492         }
493         else if( bSplitNode )
494         {
495             // es wurde ein CR/LF erkannt, also speichere den Text
496 
497             InsertText( String( pLastStt ));
498             pDoc->SplitNode( *pPam->GetPoint(), false );
499             pLastStt = pStt;
500             nLineLen = 0;
501         }
502     } while(true);
503 
504     if( hConverter )
505     {
506         rtl_destroyTextToUnicodeContext( hConverter, hContext );
507         rtl_destroyTextToUnicodeConverter( hConverter );
508     }
509     return 0;
510 }
511 
InsertText(const String & rStr)512 void SwASCIIParser::InsertText( const String& rStr )
513 {
514     pDoc->InsertString( *pPam, rStr );
515     if( pItemSet && pBreakIt && nScript != ( SCRIPTTYPE_LATIN |
516                                              SCRIPTTYPE_ASIAN |
517                                              SCRIPTTYPE_COMPLEX ) )
518         nScript |= pBreakIt->GetAllScriptsOfText( rStr );
519 }
520 
521 /* vim: set noet sw=4 ts=4: */
522