xref: /aoo41x/main/sal/rtl/source/uri.cxx (revision 87d2adbc)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_sal.hxx"
26 
27 #include "rtl/uri.h"
28 
29 #include "surrogates.h"
30 
31 #include "osl/diagnose.h"
32 #include "rtl/strbuf.hxx"
33 #include "rtl/textenc.h"
34 #include "rtl/textcvt.h"
35 #include "rtl/uri.h"
36 #include "rtl/ustrbuf.h"
37 #include "rtl/ustrbuf.hxx"
38 #include "rtl/ustring.h"
39 #include "rtl/ustring.hxx"
40 #include "sal/types.h"
41 
42 #include <cstddef>
43 
44 namespace {
45 
// Number of entries in one character class table; isValid() treats every
// code point at or above this bound as not being a member of the class.
std::size_t const nCharClassSize = 128;

// The character that introduces a %XX escape sequence.
sal_Unicode const cEscapePrefix = 0x25; // '%'
49 
isDigit(sal_uInt32 nUtf32)50 inline bool isDigit(sal_uInt32 nUtf32)
51 {
52     return nUtf32 >= 0x30 && nUtf32 <= 0x39; // '0'--'9'
53 }
54 
isAlpha(sal_uInt32 nUtf32)55 inline bool isAlpha(sal_uInt32 nUtf32)
56 {
57     // 'A'--'Z', 'a'--'z'
58     return (
59             (nUtf32 >= 0x41 && nUtf32 <= 0x5A) ||
60             (nUtf32 >= 0x61 && nUtf32 <= 0x7A)
61            );
62 }
63 
isHighSurrogate(sal_uInt32 nUtf16)64 inline bool isHighSurrogate(sal_uInt32 nUtf16)
65 {
66     return SAL_RTL_IS_HIGH_SURROGATE(nUtf16);
67 }
68 
isLowSurrogate(sal_uInt32 nUtf16)69 inline bool isLowSurrogate(sal_uInt32 nUtf16)
70 {
71     return SAL_RTL_IS_LOW_SURROGATE(nUtf16);
72 }
73 
combineSurrogates(sal_uInt32 high,sal_uInt32 low)74 inline sal_uInt32 combineSurrogates(sal_uInt32 high, sal_uInt32 low)
75 {
76     return SAL_RTL_COMBINE_SURROGATES(high, low);
77 }
78 
getHexWeight(sal_uInt32 nUtf32)79 inline int getHexWeight(sal_uInt32 nUtf32)
80 {
81     return nUtf32 >= 0x30 && nUtf32 <= 0x39 ? // '0'--'9'
82                static_cast< int >(nUtf32 - 0x30) :
83            nUtf32 >= 0x41 && nUtf32 <= 0x46 ? // 'A'--'F'
84                static_cast< int >(nUtf32 - 0x41 + 10) :
85            nUtf32 >= 0x61 && nUtf32 <= 0x66 ? // 'a'--'f'
86                static_cast< int >(nUtf32 - 0x61 + 10) :
87                -1; // not a hex digit
88 }
89 
isValid(sal_Bool const * pCharClass,sal_uInt32 nUtf32)90 inline bool isValid(sal_Bool const * pCharClass, sal_uInt32 nUtf32)
91 {
92     return nUtf32 < nCharClassSize && pCharClass[nUtf32];
93 }
94 
writeUnicode(rtl_uString ** pBuffer,sal_Int32 * pCapacity,sal_Unicode cChar)95 inline void writeUnicode(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
96                          sal_Unicode cChar)
97 {
98     rtl_uStringbuffer_insert(pBuffer, pCapacity, (*pBuffer)->length, &cChar, 1);
99 }
100 
// Classification of what readUcs4 has read.
enum EscapeType
{
    // The value was read literally (a UTF-16 unit or a surrogate pair), not
    // from an escape sequence.
    EscapeNo,
    // The value is a character decoded from one or more escape sequences.
    EscapeChar,
    // The value is the octet of a single escape sequence that was not decoded
    // into a character.
    EscapeOctet
};
107 
/* Read any of the following:

   - sequence of escape sequences representing character from eCharset,
     translated to single UCS4 character; or

   - pair of UTF-16 surrogates, translated to single UCS4 character; or

   - single UTF-16 character, extended to UCS4 character.

   pBegin    in/out; points to the current read position, which must be
             before pEnd on entry (the first unit is read unconditionally),
             and is advanced past everything that has been consumed
   pEnd      end of the input
   bEncoded  whether %XX escape sequences are interpreted at all
   eCharset  the encoding used to interpret escaped octets above 0x7F
   pType     out; tells how the returned value was obtained

   If *pType is set to EscapeOctet, the returned value is the octet of the
   first escape sequence, and only that one escape sequence has been consumed.
 */
sal_uInt32 readUcs4(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
                    bool bEncoded, rtl_TextEncoding eCharset,
                    EscapeType * pType)
{
    sal_uInt32 nChar = *(*pBegin)++;
    int nWeight1;
    int nWeight2;
    // An escape sequence is only recognized if the '%' is followed by two hex
    // digits; otherwise the '%' is handled as a literal character below:
    if (nChar == cEscapePrefix && bEncoded && pEnd - *pBegin >= 2
        && (nWeight1 = getHexWeight((*pBegin)[0])) >= 0
        && (nWeight2 = getHexWeight((*pBegin)[1])) >= 0)
    {
        *pBegin += 2;
        nChar = static_cast< sal_uInt32 >(nWeight1 << 4 | nWeight2);
        if (nChar <= 0x7F)
            // Octets up to 0x7F are taken as characters as they are,
            // regardless of eCharset:
            *pType = EscapeChar;
        else if (eCharset == RTL_TEXTENCODING_UTF8)
        {
            // Only octets 0xC0--0xF4 are accepted as the lead byte of a
            // multi-byte UTF-8 sequence:
            if (nChar >= 0xC0 && nChar <= 0xF4)
            {
                sal_uInt32 nEncoded; // payload bits collected so far
                int nShift; // bit position for the first continuation byte
                sal_uInt32 nMin; // smallest value that is not overlong
                if (nChar <= 0xDF)
                {
                    // two-byte sequence
                    nEncoded = (nChar & 0x1F) << 6;
                    nShift = 0;
                    nMin = 0x80;
                }
                else if (nChar <= 0xEF)
                {
                    // three-byte sequence
                    nEncoded = (nChar & 0x0F) << 12;
                    nShift = 6;
                    nMin = 0x800;
                }
                else
                {
                    // four-byte sequence
                    nEncoded = (nChar & 0x07) << 18;
                    nShift = 12;
                    nMin = 0x10000;
                }
                // Look ahead with a local pointer; *pBegin is only advanced
                // if the complete sequence turns out to be acceptable:
                sal_Unicode const * p = *pBegin;
                bool bUTF8 = true;
                for (; nShift >= 0; nShift -= 6)
                {
                    // Each continuation byte must be an escape sequence whose
                    // first hex digit is 8--B (i.e., an octet 0x80--0xBF):
                    if (pEnd - p < 3 || p[0] != cEscapePrefix
                        || (nWeight1 = getHexWeight(p[1])) < 8
                        || nWeight1 > 11
                        || (nWeight2 = getHexWeight(p[2])) < 0)
                    {
                        bUTF8 = sal_False;
                        break;
                    }
                    p += 3;
                    // Add the low six bits of the continuation byte:
                    nEncoded |= ((nWeight1 & 3) << 4 | nWeight2) << nShift;
                }
                // Overlong forms, surrogate code points, and values beyond
                // 0x10FFFF are not accepted:
                if (bUTF8 && nEncoded >= nMin && !isHighSurrogate(nEncoded)
                    && !isLowSurrogate(nEncoded) && nEncoded <= 0x10FFFF)
                {
                    *pBegin = p;
                    *pType = EscapeChar;
                    return nEncoded;
                }
            }
            // Not an acceptable UTF-8 sequence; report just the first octet:
            *pType = EscapeOctet;
        }
        else
        {
            // Collect octets in aBuf until they convert to a character in
            // eCharset, or until that turns out to be impossible:
            rtl::OStringBuffer aBuf;
            aBuf.append(static_cast< char >(nChar));
            rtl_TextToUnicodeConverter aConverter
                = rtl_createTextToUnicodeConverter(eCharset);
            // As above, *pBegin is only advanced on success:
            sal_Unicode const * p = *pBegin;
            for (;;)
            {
                sal_Unicode aDst[2];
                sal_uInt32 nInfo;
                sal_Size nConverted;
                // No conversion context is passed, and each iteration
                // converts the complete contents of aBuf again:
                sal_Size nDstSize = rtl_convertTextToUnicode(
                    aConverter, 0, aBuf.getStr(), aBuf.getLength(), aDst,
                    sizeof aDst / sizeof aDst[0],
                    (RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_ERROR
                     | RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR
                     | RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR),
                    &nInfo, &nConverted);
                if (nInfo == 0)
                {
                    // Success: the octets form one character, which is either
                    // a single UTF-16 unit or a surrogate pair:
                    OSL_ASSERT(
                        nConverted
                        == sal::static_int_cast< sal_uInt32 >(
                            aBuf.getLength()));
                    rtl_destroyTextToUnicodeConverter(aConverter);
                    *pBegin = p;
                    *pType = EscapeChar;
                    OSL_ASSERT(
                        nDstSize == 1
                        || (nDstSize == 2 && isHighSurrogate(aDst[0])
                            && isLowSurrogate(aDst[1])));
                    return nDstSize == 1
                        ? aDst[0] : combineSurrogates(aDst[0], aDst[1]);
                }
                // Presumably the converter reports SRCBUFFERTOSMALL when aBuf
                // ends in an incomplete multi-byte sequence; in that case the
                // next octet is taken from a following escape sequence...
                else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
                         && pEnd - p >= 3 && p[0] == cEscapePrefix
                         && (nWeight1 = getHexWeight(p[1])) >= 0
                         && (nWeight2 = getHexWeight(p[2])) >= 0)
                {
                    p += 3;
                    aBuf.append(static_cast< char >(nWeight1 << 4 | nWeight2));
                }
                // ...or from a following literal character up to 0x7F:
                else if (nInfo == RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL
                         && p != pEnd && *p <= 0x7F)
                {
                    aBuf.append(static_cast< char >(*p++));
                }
                else
                {
                    // Conversion error, or no further octet available:
                    OSL_ASSERT(
                        (nInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)
                        == 0);
                    break;
                }
            }
            rtl_destroyTextToUnicodeConverter(aConverter);
            *pType = EscapeOctet;
        }
        return nChar;
    }
    else
    {
        // Literal input; a high surrogate directly followed by a low
        // surrogate is combined, anything else is returned unchanged:
        *pType = EscapeNo;
        return isHighSurrogate(nChar) && *pBegin < pEnd
               && isLowSurrogate(**pBegin) ?
                   combineSurrogates(nChar, *(*pBegin)++) : nChar;
    }
}
251 
writeUcs4(rtl_uString ** pBuffer,sal_Int32 * pCapacity,sal_uInt32 nUtf32)252 void writeUcs4(rtl_uString ** pBuffer, sal_Int32 * pCapacity, sal_uInt32 nUtf32)
253 {
254     OSL_ENSURE(nUtf32 <= 0x10FFFF, "bad UTF-32 char");
255     if (nUtf32 <= 0xFFFF) {
256         writeUnicode(
257             pBuffer, pCapacity, static_cast< sal_Unicode >(nUtf32));
258     } else {
259         nUtf32 -= 0x10000;
260         writeUnicode(
261             pBuffer, pCapacity,
262             static_cast< sal_Unicode >(nUtf32 >> 10 | 0xD800));
263         writeUnicode(
264             pBuffer, pCapacity,
265             static_cast< sal_Unicode >((nUtf32 & 0x3FF) | 0xDC00));
266     }
267 }
268 
writeEscapeOctet(rtl_uString ** pBuffer,sal_Int32 * pCapacity,sal_uInt32 nOctet)269 void writeEscapeOctet(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
270                       sal_uInt32 nOctet)
271 {
272     OSL_ENSURE(nOctet <= 0xFF, "bad octet");
273 
274     static sal_Unicode const aHex[16]
275         = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
276             0x41, 0x42, 0x43, 0x44, 0x45, 0x46 }; /* '0'--'9', 'A'--'F' */
277 
278     writeUnicode(pBuffer, pCapacity, cEscapePrefix);
279     writeUnicode(pBuffer, pCapacity, aHex[nOctet >> 4]);
280     writeUnicode(pBuffer, pCapacity, aHex[nOctet & 15]);
281 }
282 
writeEscapeChar(rtl_uString ** pBuffer,sal_Int32 * pCapacity,sal_uInt32 nUtf32,rtl_TextEncoding eCharset,bool bStrict)283 bool writeEscapeChar(rtl_uString ** pBuffer, sal_Int32 * pCapacity,
284                      sal_uInt32 nUtf32, rtl_TextEncoding eCharset, bool bStrict)
285 {
286     OSL_ENSURE(nUtf32 <= 0x10FFFF, "bad UTF-32 char");
287     if (eCharset == RTL_TEXTENCODING_UTF8) {
288         if (nUtf32 < 0x80)
289             writeEscapeOctet(pBuffer, pCapacity, nUtf32);
290         else if (nUtf32 < 0x800)
291         {
292             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 6 | 0xC0);
293             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
294         }
295         else if (nUtf32 < 0x10000)
296         {
297             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 12 | 0xE0);
298             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
299             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
300         }
301         else
302         {
303             writeEscapeOctet(pBuffer, pCapacity, nUtf32 >> 18 | 0xF0);
304             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 12 & 0x3F) | 0x80);
305             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 >> 6 & 0x3F) | 0x80);
306             writeEscapeOctet(pBuffer, pCapacity, (nUtf32 & 0x3F) | 0x80);
307         }
308     } else {
309         rtl_UnicodeToTextConverter aConverter
310             = rtl_createUnicodeToTextConverter(eCharset);
311         sal_Unicode aSrc[2];
312         sal_Size nSrcSize;
313         if (nUtf32 <= 0xFFFF)
314         {
315             aSrc[0] = static_cast< sal_Unicode >(nUtf32);
316             nSrcSize = 1;
317         }
318         else
319         {
320             aSrc[0] = static_cast< sal_Unicode >(
321                 ((nUtf32 - 0x10000) >> 10) | 0xD800);
322             aSrc[1] = static_cast< sal_Unicode >(
323                 ((nUtf32 - 0x10000) & 0x3FF) | 0xDC00);
324             nSrcSize = 2;
325         }
326         sal_Char aDst[32]; // FIXME  random value
327         sal_uInt32 nInfo;
328         sal_Size nConverted;
329         sal_Size nDstSize = rtl_convertUnicodeToText(
330             aConverter, 0, aSrc, nSrcSize, aDst, sizeof aDst,
331             RTL_UNICODETOTEXT_FLAGS_UNDEFINED_ERROR
332             | RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR
333             | RTL_UNICODETOTEXT_FLAGS_FLUSH,
334             &nInfo, &nConverted);
335         OSL_ASSERT((nInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL) == 0);
336         rtl_destroyUnicodeToTextConverter(aConverter);
337         if (nInfo == 0) {
338             OSL_ENSURE(nConverted == nSrcSize, "bad rtl_convertUnicodeToText");
339             for (sal_Size i = 0; i < nDstSize; ++i)
340                 writeEscapeOctet(pBuffer, pCapacity,
341                                  static_cast< unsigned char >(aDst[i]));
342                     // FIXME  all octets are escaped, even if there is no need
343         } else {
344             if (bStrict) {
345                 return false;
346             } else {
347                 writeUcs4(pBuffer, pCapacity, nUtf32);
348             }
349         }
350     }
351     return true;
352 }
353 
354 struct Component
355 {
356     sal_Unicode const * pBegin;
357     sal_Unicode const * pEnd;
358 
Component__anon4ac86d970111::Component359     inline Component(): pBegin(0) {}
360 
isPresent__anon4ac86d970111::Component361     inline bool isPresent() const { return pBegin != 0; }
362 
363     inline sal_Int32 getLength() const;
364 };
365 
getLength() const366 inline sal_Int32 Component::getLength() const
367 {
368     OSL_ENSURE(isPresent(), "taking length of non-present component");
369     return static_cast< sal_Int32 >(pEnd - pBegin);
370 }
371 
// The components of a URI reference, as filled in by parseUriRef.  Each
// member starts out as not present.
struct Components
{
    Component aScheme;
    Component aAuthority;
    Component aPath;
    Component aQuery;
    Component aFragment;
};
380 
parseUriRef(rtl_uString const * pUriRef,Components * pComponents)381 void parseUriRef(rtl_uString const * pUriRef, Components * pComponents)
382 {
383     // This algorithm is liberal and accepts various forms of illegal input.
384 
385     sal_Unicode const * pBegin = pUriRef->buffer;
386     sal_Unicode const * pEnd = pBegin + pUriRef->length;
387     sal_Unicode const * pPos = pBegin;
388 
389     if (pPos != pEnd && isAlpha(*pPos))
390         for (sal_Unicode const * p = pPos + 1; p != pEnd; ++p)
391             if (*p == ':')
392             {
393                 pComponents->aScheme.pBegin = pBegin;
394                 pComponents->aScheme.pEnd = ++p;
395                 pPos = p;
396                 break;
397             }
398             else if (!isAlpha(*p) && !isDigit(*p) && *p != '+' && *p != '-'
399                      && *p != '.')
400                 break;
401 
402     if (pEnd - pPos >= 2 && pPos[0] == '/' && pPos[1] == '/')
403     {
404         pComponents->aAuthority.pBegin = pPos;
405         pPos += 2;
406         while (pPos != pEnd && *pPos != '/' && *pPos != '?' && *pPos != '#')
407             ++pPos;
408         pComponents->aAuthority.pEnd = pPos;
409     }
410 
411     pComponents->aPath.pBegin = pPos;
412     while (pPos != pEnd && *pPos != '?' && * pPos != '#')
413         ++pPos;
414     pComponents->aPath.pEnd = pPos;
415 
416     if (pPos != pEnd && *pPos == '?')
417     {
418         pComponents->aQuery.pBegin = pPos++;
419         while (pPos != pEnd && * pPos != '#')
420             ++pPos;
421         pComponents->aQuery.pEnd = pPos;
422     }
423 
424     if (pPos != pEnd)
425     {
426         OSL_ASSERT(*pPos == '#');
427         pComponents->aFragment.pBegin = pPos;
428         pComponents->aFragment.pEnd = pEnd;
429     }
430 }
431 
joinPaths(Component const & rBasePath,Component const & rRelPath)432 rtl::OUString joinPaths(Component const & rBasePath, Component const & rRelPath)
433 {
434     OSL_ASSERT(rBasePath.isPresent() && *rBasePath.pBegin == '/');
435     OSL_ASSERT(rRelPath.isPresent());
436 
437     // The invariant of aBuffer is that it always starts and ends with a slash
438     // (until probably right at the end of the algorithm, when the last segment
439     // of rRelPath is added, which does not necessarily end in a slash):
440     rtl::OUStringBuffer aBuffer(rBasePath.getLength() + rRelPath.getLength());
441         // XXX  numeric overflow
442 
443     // Segments "." and ".." within rBasePath are not conisdered special (but
444     // are also not removed by ".." segments within rRelPath), RFC 2396 seems a
445     // bit unclear about this point:
446     sal_Int32 nFixed = 1;
447     sal_Unicode const * p = rBasePath.pBegin + 1;
448     for (sal_Unicode const * q = p; q != rBasePath.pEnd; ++q)
449         if (*q == '/')
450         {
451             if (
452                 (q - p == 1 && p[0] == '.') ||
453                 (q - p == 2 && p[0] == '.' && p[1] == '.')
454                )
455             {
456                 nFixed = q + 1 - rBasePath.pBegin;
457             }
458             p = q + 1;
459         }
460     aBuffer.append(rBasePath.pBegin, p - rBasePath.pBegin);
461 
462     p = rRelPath.pBegin;
463     if (p != rRelPath.pEnd)
464         for (;;)
465         {
466             sal_Unicode const * q = p;
467             sal_Unicode const * r;
468             for (;;)
469             {
470                 if (q == rRelPath.pEnd)
471                 {
472                     r = q;
473                     break;
474                 }
475                 if (*q == '/')
476                 {
477                     r = q + 1;
478                     break;
479                 }
480                 ++q;
481             }
482             if (q - p == 2 && p[0] == '.' && p[1] == '.')
483             {
484                 // Erroneous excess segments ".." within rRelPath are left
485                 // intact, as the examples in RFC 2396, section C.2, suggest:
486                 sal_Int32 i = aBuffer.getLength() - 1;
487                 if (i < nFixed)
488                 {
489                     aBuffer.append(p, r - p);
490                     nFixed += 3;
491                 }
492                 else
493                 {
494                     while (aBuffer.charAt(i - 1) != '/')
495                         --i;
496                     aBuffer.setLength(i);
497                 }
498             }
499             else if (q - p != 1 || *p != '.')
500                 aBuffer.append(p, r - p);
501             if (q == rRelPath.pEnd)
502                 break;
503             p = q + 1;
504         }
505 
506     return aBuffer.makeStringAndClear();
507 }
508 
509 }
510 
// Returns the character class table for eCharClass: an array of
// nCharClassSize flags, indexed by code point, where a nonzero entry marks a
// character that is a member of the class.
//
// The rows of the table are indexed by the numeric value of eCharClass, in
// the order None, Uric, UricNoSlash, RelSegment, RegName, Userinfo, Pchar,
// UnoParamValue.  Apart from the diagnostic below, eCharClass is used as the
// row index without any further check.
sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass)
    SAL_THROW_EXTERN_C()
{
    // Each row consists of eight lines of 16 entries; the first two lines
    // cover the code points 0x00--0x1F, and the trailing comments of the
    // other lines list the characters they cover:
    static sal_Bool const aCharClass[][nCharClassSize]
    = {{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* None */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* !"#$%&'()*+,-./*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*0123456789:;<=>?*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*@ABCDEFGHIJKLMNO*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*PQRSTUVWXYZ[\]^_*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /*`abcdefghijklmno*/
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Uric */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UricNoSlash */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RelSegment */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* RegName */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Userinfo */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, /*0123456789:;<=>?*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* Pchar */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       },
       { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* UnoParamValue */
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, /* !"#$%&'()*+,-./*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*0123456789:;<=>?*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*@ABCDEFGHIJKLMNO*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /*PQRSTUVWXYZ[\]^_*/
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /*`abcdefghijklmno*/
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0  /*pqrstuvwxyz{|}~ */
       }};
    // Diagnose a value of eCharClass that does not denote a row of the table:
    OSL_ENSURE(
        (eCharClass >= 0
         && (sal::static_int_cast< std::size_t >(eCharClass)
             < sizeof aCharClass / sizeof aCharClass[0])),
        "bad eCharClass");
    return aCharClass[eCharClass];
}
594 
rtl_uriEncode(rtl_uString * pText,sal_Bool const * pCharClass,rtl_UriEncodeMechanism eMechanism,rtl_TextEncoding eCharset,rtl_uString ** pResult)595 void SAL_CALL rtl_uriEncode(rtl_uString * pText, sal_Bool const * pCharClass,
596                             rtl_UriEncodeMechanism eMechanism,
597                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
598     SAL_THROW_EXTERN_C()
599 {
600     OSL_ENSURE(!pCharClass[0x25], "bad pCharClass");
601         // make sure the percent sign is encoded...
602 
603     sal_Unicode const * p = pText->buffer;
604     sal_Unicode const * pEnd = p + pText->length;
605     sal_Int32 nCapacity = 0;
606     rtl_uString_new(pResult);
607     while (p < pEnd)
608     {
609         EscapeType eType;
610         sal_uInt32 nUtf32 = readUcs4(
611             &p, pEnd,
612             (eMechanism == rtl_UriEncodeKeepEscapes
613              || eMechanism == rtl_UriEncodeCheckEscapes
614              || eMechanism == rtl_UriEncodeStrictKeepEscapes),
615             eCharset, &eType);
616         switch (eType)
617         {
618         case EscapeNo:
619             if (isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
620                 writeUnicode(pResult, &nCapacity,
621                              static_cast< sal_Unicode >(nUtf32));
622             else if (!writeEscapeChar(
623                          pResult, &nCapacity, nUtf32, eCharset,
624                          (eMechanism == rtl_UriEncodeStrict
625                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
626             {
627                 rtl_uString_new(pResult);
628                 return;
629             }
630             break;
631 
632         case EscapeChar:
633             if (eMechanism == rtl_UriEncodeCheckEscapes
634                 && isValid(pCharClass, nUtf32)) // implies nUtf32 <= 0x7F
635                 writeUnicode(pResult, &nCapacity,
636                              static_cast< sal_Unicode >(nUtf32));
637             else if (!writeEscapeChar(
638                          pResult, &nCapacity, nUtf32, eCharset,
639                          (eMechanism == rtl_UriEncodeStrict
640                           || eMechanism == rtl_UriEncodeStrictKeepEscapes)))
641             {
642                 rtl_uString_new(pResult);
643                 return;
644             }
645             break;
646 
647         case EscapeOctet:
648             writeEscapeOctet(pResult, &nCapacity, nUtf32);
649             break;
650         }
651     }
652 }
653 
rtl_uriDecode(rtl_uString * pText,rtl_UriDecodeMechanism eMechanism,rtl_TextEncoding eCharset,rtl_uString ** pResult)654 void SAL_CALL rtl_uriDecode(rtl_uString * pText,
655                             rtl_UriDecodeMechanism eMechanism,
656                             rtl_TextEncoding eCharset, rtl_uString ** pResult)
657     SAL_THROW_EXTERN_C()
658 {
659     switch (eMechanism)
660     {
661     case rtl_UriDecodeNone:
662         rtl_uString_assign(pResult, pText);
663         break;
664 
665     case rtl_UriDecodeToIuri:
666         eCharset = RTL_TEXTENCODING_UTF8;
667     default: // rtl_UriDecodeWithCharset, rtl_UriDecodeStrict
668         {
669             sal_Unicode const * p = pText->buffer;
670             sal_Unicode const * pEnd = p + pText->length;
671             sal_Int32 nCapacity = 0;
672             rtl_uString_new(pResult);
673             while (p < pEnd)
674             {
675                 EscapeType eType;
676                 sal_uInt32 nUtf32 = readUcs4(&p, pEnd, true, eCharset, &eType);
677                 switch (eType)
678                 {
679                 case EscapeChar:
680                     if (nUtf32 <= 0x7F && eMechanism == rtl_UriDecodeToIuri)
681                     {
682                         writeEscapeOctet(pResult, &nCapacity, nUtf32);
683                         break;
684                     }
685                 case EscapeNo:
686                     writeUcs4(pResult, &nCapacity, nUtf32);
687                     break;
688 
689                 case EscapeOctet:
690                     if (eMechanism == rtl_UriDecodeStrict) {
691                         rtl_uString_new(pResult);
692                         return;
693                     }
694                     writeEscapeOctet(pResult, &nCapacity, nUtf32);
695                     break;
696                 }
697             }
698         }
699         break;
700     }
701 }
702 
rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,rtl_uString * pRelUriRef,rtl_uString ** pResult,rtl_uString ** pException)703 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef,
704                                          rtl_uString * pRelUriRef,
705                                          rtl_uString ** pResult,
706                                          rtl_uString ** pException)
707     SAL_THROW_EXTERN_C()
708 {
709     // If pRelUriRef starts with a scheme component it is an absolute URI
710     // reference, and we are done (i.e., this algorithm does not support
711     // backwards-compatible relative URIs starting with a scheme component, see
712     // RFC 2396, section 5.2, step 3):
713     Components aRelComponents;
714     parseUriRef(pRelUriRef, &aRelComponents);
715     if (aRelComponents.aScheme.isPresent())
716     {
717         rtl_uString_assign(pResult, pRelUriRef);
718         return true;
719     }
720 
721     // Parse pBaseUriRef; if the scheme component is not present or not valid,
722     // or the path component is not empty and starts with anything but a slash,
723     // an exception is raised:
724     Components aBaseComponents;
725     parseUriRef(pBaseUriRef, &aBaseComponents);
726     if (!aBaseComponents.aScheme.isPresent())
727     {
728         rtl::OUString aMessage(pBaseUriRef);
729         aMessage += rtl::OUString(
730                         RTL_CONSTASCII_USTRINGPARAM(
731                             " does not start with a scheme component"));
732         rtl_uString_assign(pException,
733                            const_cast< rtl::OUString & >(aMessage).pData);
734         return false;
735     }
736     if (aBaseComponents.aPath.pBegin != aBaseComponents.aPath.pEnd
737         && *aBaseComponents.aPath.pBegin != '/')
738     {
739         rtl::OUString aMessage(pBaseUriRef);
740         aMessage += rtl::OUString(
741                         RTL_CONSTASCII_USTRINGPARAM(
742                             "path component does not start with slash"));
743         rtl_uString_assign(pException, aMessage.pData);
744         return false;
745     }
746 
747     // Use the algorithm from RFC 2396, section 5.2, to turn the relative URI
748     // into an absolute one (if the relative URI is a reference to the "current
749     // document," the "current document" is here taken to be the base URI):
750     rtl::OUStringBuffer aBuffer;
751     aBuffer.append(aBaseComponents.aScheme.pBegin,
752                    aBaseComponents.aScheme.getLength());
753     if (aRelComponents.aAuthority.isPresent())
754     {
755         aBuffer.append(aRelComponents.aAuthority.pBegin,
756                        aRelComponents.aAuthority.getLength());
757         aBuffer.append(aRelComponents.aPath.pBegin,
758                        aRelComponents.aPath.getLength());
759         if (aRelComponents.aQuery.isPresent())
760             aBuffer.append(aRelComponents.aQuery.pBegin,
761                            aRelComponents.aQuery.getLength());
762     }
763     else
764     {
765         if (aBaseComponents.aAuthority.isPresent())
766             aBuffer.append(aBaseComponents.aAuthority.pBegin,
767                            aBaseComponents.aAuthority.getLength());
768         if (aRelComponents.aPath.pBegin == aRelComponents.aPath.pEnd
769             && !aRelComponents.aQuery.isPresent())
770         {
771             aBuffer.append(aBaseComponents.aPath.pBegin,
772                            aBaseComponents.aPath.getLength());
773             if (aBaseComponents.aQuery.isPresent())
774                 aBuffer.append(aBaseComponents.aQuery.pBegin,
775                                aBaseComponents.aQuery.getLength());
776         }
777         else
778         {
779             if (*aRelComponents.aPath.pBegin == '/')
780                 aBuffer.append(aRelComponents.aPath.pBegin,
781                                aRelComponents.aPath.getLength());
782             else
783                 aBuffer.append(joinPaths(aBaseComponents.aPath,
784                                          aRelComponents.aPath));
785             if (aRelComponents.aQuery.isPresent())
786                 aBuffer.append(aRelComponents.aQuery.pBegin,
787                                aRelComponents.aQuery.getLength());
788         }
789     }
790     if (aRelComponents.aFragment.isPresent())
791         aBuffer.append(aRelComponents.aFragment.pBegin,
792                        aRelComponents.aFragment.getLength());
793     rtl_uString_assign(pResult, aBuffer.makeStringAndClear().pData);
794     return true;
795 }
796