xref: /aoo42x/main/ucb/source/regexp/regexp.cxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_ucb.hxx"
30 #include <regexp.hxx>
31 
32 #include <cstddef>
33 
34 #include "osl/diagnose.h"
35 #include <com/sun/star/lang/IllegalArgumentException.hpp>
36 #include <rtl/ustrbuf.hxx>
37 #include <rtl/ustring.hxx>
38 
39 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
40 	// unnamed namespaces don't work well yet...
41 
42 using namespace com::sun::star;
43 using namespace ucb_impl;
44 
45 //============================================================================
46 //
47 //  Regexp
48 //
49 //============================================================================
50 
51 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
52 					  bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
53 					  bool bTheTranslation,
54 					  rtl::OUString const & rTheReversePrefix):
55 	m_eKind(eTheKind),
56 	m_aPrefix(rThePrefix),
57 	m_aInfix(rTheInfix),
58 	m_aReversePrefix(rTheReversePrefix),
59 	m_bEmptyDomain(bTheEmptyDomain),
60 	m_bTranslation(bTheTranslation)
61 {
62 	OSL_ASSERT(m_eKind == KIND_DOMAIN
63 			   || !m_bEmptyDomain && m_aInfix.getLength() == 0);
64 	OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0);
65 }
66 
67 //============================================================================
68 namespace unnamed_ucb_regexp {
69 
70 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
71 						   sal_Unicode const * pEnd,
72 						   rtl::OUString const & rString)
73 {
74 	sal_Unicode const * p = *pBegin;
75 
76 	sal_Unicode const * q = rString.getStr();
77 	sal_Unicode const * qEnd = q + rString.getLength();
78 
79 	if (pEnd - p < qEnd - q)
80 		return false;
81 
82 	while (q != qEnd)
83 	{
84 		sal_Unicode c1 = *p++;
85 		sal_Unicode c2 = *q++;
86 		if (c1 >= 'a' && c1 <= 'z')
87 			c1 -= 'a' - 'A';
88 		if (c2 >= 'a' && c2 <= 'z')
89 			c2 -= 'a' - 'A';
90 		if (c1 != c2)
91 			return false;
92 	}
93 
94 	*pBegin = p;
95 	return true;
96 }
97 
98 }
99 
100 bool Regexp::matches(rtl::OUString const & rString,
101 					 rtl::OUString * pTranslation, bool * pTranslated) const
102 {
103 	sal_Unicode const * pBegin = rString.getStr();
104 	sal_Unicode const * pEnd = pBegin + rString.getLength();
105 
106 	bool bMatches = false;
107 
108 	sal_Unicode const * p = pBegin;
109 	if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
110 	{
111 		sal_Unicode const * pBlock1Begin = p;
112 		sal_Unicode const * pBlock1End = pEnd;
113 
114 		sal_Unicode const * pBlock2Begin = 0;
115 		sal_Unicode const * pBlock2End = 0;
116 
117 		switch (m_eKind)
118 		{
119 			case KIND_PREFIX:
120 				bMatches = true;
121 				break;
122 
123 			case KIND_AUTHORITY:
124 				bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
125 				break;
126 
127 			case KIND_DOMAIN:
128 				if (!m_bEmptyDomain)
129 				{
130 					if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
131 						break;
132 					++p;
133 				}
134 				for (;;)
135 				{
136 					sal_Unicode const * q = p;
137 					if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
138 						&& (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
139 					{
140 						bMatches = true;
141 						pBlock1End = p;
142 						pBlock2Begin = q;
143 						pBlock2End = pEnd;
144 						break;
145 					}
146 
147 					if (p == pEnd)
148 						break;
149 
150 					sal_Unicode c = *p++;
151 					if (c == '/' || c == '?' || c == '#')
152 						break;
153 				}
154 				break;
155 		}
156 
157 		if (bMatches)
158 		{
159 			if (m_bTranslation)
160 			{
161 				if (pTranslation)
162 				{
163 					rtl::OUStringBuffer aBuffer(m_aReversePrefix);
164 					aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
165 					aBuffer.append(m_aInfix);
166 					aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
167 					*pTranslation = aBuffer.makeStringAndClear();
168 				}
169 				if (pTranslated)
170 					*pTranslated = true;
171 			}
172 			else
173 			{
174 				if (pTranslation)
175 					*pTranslation = rString;
176 				if (pTranslated)
177 					*pTranslated = false;
178 			}
179 		}
180 	}
181 
182 	return bMatches;
183 }
184 
185 //============================================================================
186 namespace unnamed_ucb_regexp {
187 
188 inline bool isAlpha(sal_Unicode c)
189 {
190 	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
191 }
192 
193 inline bool isDigit(sal_Unicode c)
194 {
195 	return c >= '0' && c <= '9';
196 }
197 
198 bool isScheme(rtl::OUString const & rString, bool bColon)
199 {
200 	// Return true if rString matches <scheme> (plus a trailing ":" if bColon
201     // is true) from RFC 2396:
202 	sal_Unicode const * p = rString.getStr();
203 	sal_Unicode const * pEnd = p + rString.getLength();
204 	if (p != pEnd && isAlpha(*p))
205 		for (++p;;)
206 		{
207 			if (p == pEnd)
208 				return !bColon;
209 			sal_Unicode c = *p++;
210 			if (!(isAlpha(c) || isDigit(c)
211                   || c == '+' || c == '-' || c == '.'))
212                 return bColon && c == ':' && p == pEnd;
213 		}
214 	return false;
215 }
216 
217 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
218 						 rtl::OUString const & rString)
219 {
220 	OSL_ASSERT(pBuffer);
221 
222 	pBuffer->append(sal_Unicode('"'));
223 	sal_Unicode const * p = rString.getStr();
224 	sal_Unicode const * pEnd = p + rString.getLength();
225 	while (p != pEnd)
226 	{
227 		sal_Unicode c = *p++;
228 		if (c == '"' || c == '\\')
229 			pBuffer->append(sal_Unicode('\\'));
230 		pBuffer->append(c);
231 	}
232 	pBuffer->append(sal_Unicode('"'));
233 }
234 
235 }
236 
237 rtl::OUString Regexp::getRegexp(bool bReverse) const
238 {
239 	if (m_bTranslation)
240 	{
241 		rtl::OUStringBuffer aBuffer;
242 		if (bReverse)
243 		{
244 			if (m_aReversePrefix.getLength() != 0)
245 				appendStringLiteral(&aBuffer, m_aReversePrefix);
246 		}
247 		else
248 		{
249 			if (m_aPrefix.getLength() != 0)
250 				appendStringLiteral(&aBuffer, m_aPrefix);
251 		}
252 		switch (m_eKind)
253 		{
254 			case KIND_PREFIX:
255 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
256 				break;
257 
258 			case KIND_AUTHORITY:
259 				aBuffer.
260 					appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
261 				break;
262 
263 			case KIND_DOMAIN:
264 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
265 				aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
266 				if (m_aInfix.getLength() != 0)
267 					appendStringLiteral(&aBuffer, m_aInfix);
268 				aBuffer.
269 					appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
270 				break;
271 		}
272 		aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
273 		if (bReverse)
274 		{
275 			if (m_aPrefix.getLength() != 0)
276 				appendStringLiteral(&aBuffer, m_aPrefix);
277 		}
278 		else
279 		{
280 			if (m_aReversePrefix.getLength() != 0)
281 				appendStringLiteral(&aBuffer, m_aReversePrefix);
282 		}
283 		aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
284 		return aBuffer.makeStringAndClear();
285 	}
286 	else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
287 		return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
288 	else
289 	{
290 		rtl::OUStringBuffer aBuffer;
291 		if (m_aPrefix.getLength() != 0)
292 			appendStringLiteral(&aBuffer, m_aPrefix);
293 		switch (m_eKind)
294 		{
295 			case KIND_PREFIX:
296 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
297 				break;
298 
299 			case KIND_AUTHORITY:
300 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
301 				break;
302 
303 			case KIND_DOMAIN:
304 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
305 				aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
306 				if (m_aInfix.getLength() != 0)
307 					appendStringLiteral(&aBuffer, m_aInfix);
308 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
309 				break;
310 		}
311 		return aBuffer.makeStringAndClear();
312 	}
313 }
314 
315 //============================================================================
316 namespace unnamed_ucb_regexp {
317 
318 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
319 				 sal_Char const * pString, size_t nStringLength)
320 {
321 	sal_Unicode const * p = *pBegin;
322 
323 	sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
324 	sal_uChar const * qEnd = q + nStringLength;
325 
326 	if (pEnd - p < qEnd - q)
327 		return false;
328 
329 	while (q != qEnd)
330 	{
331 		sal_Unicode c1 = *p++;
332 		sal_Unicode c2 = *q++;
333 		if (c1 != c2)
334 			return false;
335 	}
336 
337 	*pBegin = p;
338 	return true;
339 }
340 
341 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
342 					   rtl::OUString * pString)
343 {
344 	sal_Unicode const * p = *pBegin;
345 
346 	if (p == pEnd || *p++ != '"')
347 		return false;
348 
349 	rtl::OUStringBuffer aBuffer;
350 	for (;;)
351 	{
352 		if (p == pEnd)
353 			return false;
354 		sal_Unicode c = *p++;
355 		if (c == '"')
356 			break;
357 		if (c == '\\')
358 		{
359 			if (p == pEnd)
360 				return false;
361 			c = *p++;
362 			if (c != '"' && c != '\\')
363 				return false;
364 		}
365 		aBuffer.append(c);
366 	}
367 
368 	*pBegin = p;
369 	*pString = aBuffer.makeStringAndClear();
370 	return true;
371 }
372 
373 }
374 
375 Regexp Regexp::parse(rtl::OUString const & rRegexp)
376 {
377 	// Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
378 	// where <scheme> is as defined in RFC 2396:
379 	if (isScheme(rRegexp, false))
380 		return Regexp(Regexp::KIND_PREFIX,
381                       rRegexp
382                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
383                       false,
384                       rtl::OUString(),
385 					  false,
386                       rtl::OUString());
387 
388 	sal_Unicode const * p = rRegexp.getStr();
389 	sal_Unicode const * pEnd = p + rRegexp.getLength();
390 
391 	rtl::OUString aPrefix;
392 	scanStringLiteral(&p, pEnd, &aPrefix);
393 
394 	if (p == pEnd)
395 		throw lang::IllegalArgumentException();
396 
397 	if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
398 	{
399 		if (p != pEnd)
400 			throw lang::IllegalArgumentException();
401 
402 		return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
403 					  false, rtl::OUString());
404 	}
405 	else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
406 	{
407 		rtl::OUString aReversePrefix;
408 		scanStringLiteral(&p, pEnd, &aReversePrefix);
409 
410 		if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
411 			|| p != pEnd)
412 			throw lang::IllegalArgumentException();
413 
414 		return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
415 					  true, aReversePrefix);
416 	}
417 	else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
418 	{
419 		if (p != pEnd)
420 			throw lang::IllegalArgumentException();
421 
422 		return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
423 					  false, rtl::OUString());
424 	}
425 	else if (matchString(&p, pEnd,
426 						 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
427 	{
428 		rtl::OUString aReversePrefix;
429 		if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
430 			  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
431 			  && p == pEnd))
432 			throw lang::IllegalArgumentException();
433 
434 		return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
435 					  true, aReversePrefix);
436 	}
437 	else
438 	{
439 		bool bOpen = false;
440 		if (p != pEnd && *p == '(')
441 		{
442 			++p;
443 			bOpen = true;
444 		}
445 
446 		if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
447 			throw lang::IllegalArgumentException();
448 
449 		if (p == pEnd || (*p != '*' && *p != '+'))
450 			throw lang::IllegalArgumentException();
451 		bool bEmptyDomain = *p++ == '*';
452 
453 		rtl::OUString aInfix;
454 		scanStringLiteral(&p, pEnd, &aInfix);
455 
456 		if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
457 			throw lang::IllegalArgumentException();
458 
459 		rtl::OUString aReversePrefix;
460 		if (bOpen
461 			&& !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
462 				 && scanStringLiteral(&p, pEnd, &aReversePrefix)
463 				 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
464 			throw lang::IllegalArgumentException();
465 
466 		if (p != pEnd)
467 			throw lang::IllegalArgumentException();
468 
469 		return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
470 					  bOpen, aReversePrefix);
471 	}
472 }
473 
474