xref: /trunk/main/ucb/source/regexp/regexp.cxx (revision cdf0e10c4e3984b49a9502b011690b615761d4a3)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_ucb.hxx"
30 #include <regexp.hxx>
31 
32 #include <cstddef>
33 
34 #include "osl/diagnose.h"
35 #include <com/sun/star/lang/IllegalArgumentException.hpp>
36 #include <rtl/ustrbuf.hxx>
37 #include <rtl/ustring.hxx>
38 
39 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
40     // unnamed namespaces don't work well yet...
41 
42 using namespace com::sun::star;
43 using namespace ucb_impl;
44 
45 //============================================================================
46 //
47 //  Regexp
48 //
49 //============================================================================
50 
51 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
52                       bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
53                       bool bTheTranslation,
54                       rtl::OUString const & rTheReversePrefix):
55     m_eKind(eTheKind),
56     m_aPrefix(rThePrefix),
57     m_aInfix(rTheInfix),
58     m_aReversePrefix(rTheReversePrefix),
59     m_bEmptyDomain(bTheEmptyDomain),
60     m_bTranslation(bTheTranslation)
61 {
62     OSL_ASSERT(m_eKind == KIND_DOMAIN
63                || !m_bEmptyDomain && m_aInfix.getLength() == 0);
64     OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0);
65 }
66 
67 //============================================================================
68 namespace unnamed_ucb_regexp {
69 
70 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
71                            sal_Unicode const * pEnd,
72                            rtl::OUString const & rString)
73 {
74     sal_Unicode const * p = *pBegin;
75 
76     sal_Unicode const * q = rString.getStr();
77     sal_Unicode const * qEnd = q + rString.getLength();
78 
79     if (pEnd - p < qEnd - q)
80         return false;
81 
82     while (q != qEnd)
83     {
84         sal_Unicode c1 = *p++;
85         sal_Unicode c2 = *q++;
86         if (c1 >= 'a' && c1 <= 'z')
87             c1 -= 'a' - 'A';
88         if (c2 >= 'a' && c2 <= 'z')
89             c2 -= 'a' - 'A';
90         if (c1 != c2)
91             return false;
92     }
93 
94     *pBegin = p;
95     return true;
96 }
97 
98 }
99 
100 bool Regexp::matches(rtl::OUString const & rString,
101                      rtl::OUString * pTranslation, bool * pTranslated) const
102 {
103     sal_Unicode const * pBegin = rString.getStr();
104     sal_Unicode const * pEnd = pBegin + rString.getLength();
105 
106     bool bMatches = false;
107 
108     sal_Unicode const * p = pBegin;
109     if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
110     {
111         sal_Unicode const * pBlock1Begin = p;
112         sal_Unicode const * pBlock1End = pEnd;
113 
114         sal_Unicode const * pBlock2Begin = 0;
115         sal_Unicode const * pBlock2End = 0;
116 
117         switch (m_eKind)
118         {
119             case KIND_PREFIX:
120                 bMatches = true;
121                 break;
122 
123             case KIND_AUTHORITY:
124                 bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
125                 break;
126 
127             case KIND_DOMAIN:
128                 if (!m_bEmptyDomain)
129                 {
130                     if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
131                         break;
132                     ++p;
133                 }
134                 for (;;)
135                 {
136                     sal_Unicode const * q = p;
137                     if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
138                         && (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
139                     {
140                         bMatches = true;
141                         pBlock1End = p;
142                         pBlock2Begin = q;
143                         pBlock2End = pEnd;
144                         break;
145                     }
146 
147                     if (p == pEnd)
148                         break;
149 
150                     sal_Unicode c = *p++;
151                     if (c == '/' || c == '?' || c == '#')
152                         break;
153                 }
154                 break;
155         }
156 
157         if (bMatches)
158         {
159             if (m_bTranslation)
160             {
161                 if (pTranslation)
162                 {
163                     rtl::OUStringBuffer aBuffer(m_aReversePrefix);
164                     aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
165                     aBuffer.append(m_aInfix);
166                     aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
167                     *pTranslation = aBuffer.makeStringAndClear();
168                 }
169                 if (pTranslated)
170                     *pTranslated = true;
171             }
172             else
173             {
174                 if (pTranslation)
175                     *pTranslation = rString;
176                 if (pTranslated)
177                     *pTranslated = false;
178             }
179         }
180     }
181 
182     return bMatches;
183 }
184 
185 //============================================================================
186 namespace unnamed_ucb_regexp {
187 
188 inline bool isAlpha(sal_Unicode c)
189 {
190     return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
191 }
192 
193 inline bool isDigit(sal_Unicode c)
194 {
195     return c >= '0' && c <= '9';
196 }
197 
198 bool isScheme(rtl::OUString const & rString, bool bColon)
199 {
200     // Return true if rString matches <scheme> (plus a trailing ":" if bColon
201     // is true) from RFC 2396:
202     sal_Unicode const * p = rString.getStr();
203     sal_Unicode const * pEnd = p + rString.getLength();
204     if (p != pEnd && isAlpha(*p))
205         for (++p;;)
206         {
207             if (p == pEnd)
208                 return !bColon;
209             sal_Unicode c = *p++;
210             if (!(isAlpha(c) || isDigit(c)
211                   || c == '+' || c == '-' || c == '.'))
212                 return bColon && c == ':' && p == pEnd;
213         }
214     return false;
215 }
216 
217 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
218                          rtl::OUString const & rString)
219 {
220     OSL_ASSERT(pBuffer);
221 
222     pBuffer->append(sal_Unicode('"'));
223     sal_Unicode const * p = rString.getStr();
224     sal_Unicode const * pEnd = p + rString.getLength();
225     while (p != pEnd)
226     {
227         sal_Unicode c = *p++;
228         if (c == '"' || c == '\\')
229             pBuffer->append(sal_Unicode('\\'));
230         pBuffer->append(c);
231     }
232     pBuffer->append(sal_Unicode('"'));
233 }
234 
235 }
236 
237 rtl::OUString Regexp::getRegexp(bool bReverse) const
238 {
239     if (m_bTranslation)
240     {
241         rtl::OUStringBuffer aBuffer;
242         if (bReverse)
243         {
244             if (m_aReversePrefix.getLength() != 0)
245                 appendStringLiteral(&aBuffer, m_aReversePrefix);
246         }
247         else
248         {
249             if (m_aPrefix.getLength() != 0)
250                 appendStringLiteral(&aBuffer, m_aPrefix);
251         }
252         switch (m_eKind)
253         {
254             case KIND_PREFIX:
255                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
256                 break;
257 
258             case KIND_AUTHORITY:
259                 aBuffer.
260                     appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
261                 break;
262 
263             case KIND_DOMAIN:
264                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
265                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
266                 if (m_aInfix.getLength() != 0)
267                     appendStringLiteral(&aBuffer, m_aInfix);
268                 aBuffer.
269                     appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
270                 break;
271         }
272         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
273         if (bReverse)
274         {
275             if (m_aPrefix.getLength() != 0)
276                 appendStringLiteral(&aBuffer, m_aPrefix);
277         }
278         else
279         {
280             if (m_aReversePrefix.getLength() != 0)
281                 appendStringLiteral(&aBuffer, m_aReversePrefix);
282         }
283         aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
284         return aBuffer.makeStringAndClear();
285     }
286     else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
287         return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
288     else
289     {
290         rtl::OUStringBuffer aBuffer;
291         if (m_aPrefix.getLength() != 0)
292             appendStringLiteral(&aBuffer, m_aPrefix);
293         switch (m_eKind)
294         {
295             case KIND_PREFIX:
296                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
297                 break;
298 
299             case KIND_AUTHORITY:
300                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
301                 break;
302 
303             case KIND_DOMAIN:
304                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
305                 aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
306                 if (m_aInfix.getLength() != 0)
307                     appendStringLiteral(&aBuffer, m_aInfix);
308                 aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
309                 break;
310         }
311         return aBuffer.makeStringAndClear();
312     }
313 }
314 
315 //============================================================================
316 namespace unnamed_ucb_regexp {
317 
318 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
319                  sal_Char const * pString, size_t nStringLength)
320 {
321     sal_Unicode const * p = *pBegin;
322 
323     sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
324     sal_uChar const * qEnd = q + nStringLength;
325 
326     if (pEnd - p < qEnd - q)
327         return false;
328 
329     while (q != qEnd)
330     {
331         sal_Unicode c1 = *p++;
332         sal_Unicode c2 = *q++;
333         if (c1 != c2)
334             return false;
335     }
336 
337     *pBegin = p;
338     return true;
339 }
340 
341 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
342                        rtl::OUString * pString)
343 {
344     sal_Unicode const * p = *pBegin;
345 
346     if (p == pEnd || *p++ != '"')
347         return false;
348 
349     rtl::OUStringBuffer aBuffer;
350     for (;;)
351     {
352         if (p == pEnd)
353             return false;
354         sal_Unicode c = *p++;
355         if (c == '"')
356             break;
357         if (c == '\\')
358         {
359             if (p == pEnd)
360                 return false;
361             c = *p++;
362             if (c != '"' && c != '\\')
363                 return false;
364         }
365         aBuffer.append(c);
366     }
367 
368     *pBegin = p;
369     *pString = aBuffer.makeStringAndClear();
370     return true;
371 }
372 
373 }
374 
375 Regexp Regexp::parse(rtl::OUString const & rRegexp)
376 {
377     // Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
378     // where <scheme> is as defined in RFC 2396:
379     if (isScheme(rRegexp, false))
380         return Regexp(Regexp::KIND_PREFIX,
381                       rRegexp
382                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
383                       false,
384                       rtl::OUString(),
385                       false,
386                       rtl::OUString());
387 
388     sal_Unicode const * p = rRegexp.getStr();
389     sal_Unicode const * pEnd = p + rRegexp.getLength();
390 
391     rtl::OUString aPrefix;
392     scanStringLiteral(&p, pEnd, &aPrefix);
393 
394     if (p == pEnd)
395         throw lang::IllegalArgumentException();
396 
397     if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
398     {
399         if (p != pEnd)
400             throw lang::IllegalArgumentException();
401 
402         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
403                       false, rtl::OUString());
404     }
405     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
406     {
407         rtl::OUString aReversePrefix;
408         scanStringLiteral(&p, pEnd, &aReversePrefix);
409 
410         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
411             || p != pEnd)
412             throw lang::IllegalArgumentException();
413 
414         return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
415                       true, aReversePrefix);
416     }
417     else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
418     {
419         if (p != pEnd)
420             throw lang::IllegalArgumentException();
421 
422         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
423                       false, rtl::OUString());
424     }
425     else if (matchString(&p, pEnd,
426                          RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
427     {
428         rtl::OUString aReversePrefix;
429         if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
430               && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
431               && p == pEnd))
432             throw lang::IllegalArgumentException();
433 
434         return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
435                       true, aReversePrefix);
436     }
437     else
438     {
439         bool bOpen = false;
440         if (p != pEnd && *p == '(')
441         {
442             ++p;
443             bOpen = true;
444         }
445 
446         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
447             throw lang::IllegalArgumentException();
448 
449         if (p == pEnd || (*p != '*' && *p != '+'))
450             throw lang::IllegalArgumentException();
451         bool bEmptyDomain = *p++ == '*';
452 
453         rtl::OUString aInfix;
454         scanStringLiteral(&p, pEnd, &aInfix);
455 
456         if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
457             throw lang::IllegalArgumentException();
458 
459         rtl::OUString aReversePrefix;
460         if (bOpen
461             && !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
462                  && scanStringLiteral(&p, pEnd, &aReversePrefix)
463                  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
464             throw lang::IllegalArgumentException();
465 
466         if (p != pEnd)
467             throw lang::IllegalArgumentException();
468 
469         return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
470                       bOpen, aReversePrefix);
471     }
472 }
473 
474