xref: /trunk/main/ucb/source/regexp/regexp.cxx (revision 2f86921c)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_ucb.hxx"
26 #include <regexp.hxx>
27 
28 #include <cstddef>
29 
30 #include "osl/diagnose.h"
31 #include <com/sun/star/lang/IllegalArgumentException.hpp>
32 #include <rtl/ustrbuf.hxx>
33 #include <rtl/ustring.hxx>
34 
35 namespace unnamed_ucb_regexp {} using namespace unnamed_ucb_regexp;
36 	// unnamed namespaces don't work well yet...
37 
38 using namespace com::sun::star;
39 using namespace ucb_impl;
40 
41 //============================================================================
42 //
43 //  Regexp
44 //
45 //============================================================================
46 
Regexp(Kind eTheKind,rtl::OUString const & rThePrefix,bool bTheEmptyDomain,rtl::OUString const & rTheInfix,bool bTheTranslation,rtl::OUString const & rTheReversePrefix)47 inline Regexp::Regexp(Kind eTheKind, rtl::OUString const & rThePrefix,
48 					  bool bTheEmptyDomain, rtl::OUString const & rTheInfix,
49 					  bool bTheTranslation,
50 					  rtl::OUString const & rTheReversePrefix):
51 	m_eKind(eTheKind),
52 	m_aPrefix(rThePrefix),
53 	m_aInfix(rTheInfix),
54 	m_aReversePrefix(rTheReversePrefix),
55 	m_bEmptyDomain(bTheEmptyDomain),
56 	m_bTranslation(bTheTranslation)
57 {
58 	OSL_ASSERT(m_eKind == KIND_DOMAIN
59 			   || !m_bEmptyDomain && m_aInfix.getLength() == 0);
60 	OSL_ASSERT(m_bTranslation || m_aReversePrefix.getLength() == 0);
61 }
62 
63 //============================================================================
64 namespace unnamed_ucb_regexp {
65 
matchStringIgnoreCase(sal_Unicode const ** pBegin,sal_Unicode const * pEnd,rtl::OUString const & rString)66 bool matchStringIgnoreCase(sal_Unicode const ** pBegin,
67 						   sal_Unicode const * pEnd,
68 						   rtl::OUString const & rString)
69 {
70 	sal_Unicode const * p = *pBegin;
71 
72 	sal_Unicode const * q = rString.getStr();
73 	sal_Unicode const * qEnd = q + rString.getLength();
74 
75 	if (pEnd - p < qEnd - q)
76 		return false;
77 
78 	while (q != qEnd)
79 	{
80 		sal_Unicode c1 = *p++;
81 		sal_Unicode c2 = *q++;
82 		if (c1 >= 'a' && c1 <= 'z')
83 			c1 -= 'a' - 'A';
84 		if (c2 >= 'a' && c2 <= 'z')
85 			c2 -= 'a' - 'A';
86 		if (c1 != c2)
87 			return false;
88 	}
89 
90 	*pBegin = p;
91 	return true;
92 }
93 
94 }
95 
matches(rtl::OUString const & rString,rtl::OUString * pTranslation,bool * pTranslated) const96 bool Regexp::matches(rtl::OUString const & rString,
97 					 rtl::OUString * pTranslation, bool * pTranslated) const
98 {
99 	sal_Unicode const * pBegin = rString.getStr();
100 	sal_Unicode const * pEnd = pBegin + rString.getLength();
101 
102 	bool bMatches = false;
103 
104 	sal_Unicode const * p = pBegin;
105 	if (matchStringIgnoreCase(&p, pEnd, m_aPrefix))
106 	{
107 		sal_Unicode const * pBlock1Begin = p;
108 		sal_Unicode const * pBlock1End = pEnd;
109 
110 		sal_Unicode const * pBlock2Begin = 0;
111 		sal_Unicode const * pBlock2End = 0;
112 
113 		switch (m_eKind)
114 		{
115 			case KIND_PREFIX:
116 				bMatches = true;
117 				break;
118 
119 			case KIND_AUTHORITY:
120 				bMatches = p == pEnd || *p == '/' || *p == '?' || *p == '#';
121 				break;
122 
123 			case KIND_DOMAIN:
124 				if (!m_bEmptyDomain)
125 				{
126 					if (p == pEnd || *p == '/' || *p == '?' || *p == '#')
127 						break;
128 					++p;
129 				}
130 				for (;;)
131 				{
132 					sal_Unicode const * q = p;
133 					if (matchStringIgnoreCase(&q, pEnd, m_aInfix)
134 						&& (q == pEnd || *q == '/' || *q == '?' || *q == '#'))
135 					{
136 						bMatches = true;
137 						pBlock1End = p;
138 						pBlock2Begin = q;
139 						pBlock2End = pEnd;
140 						break;
141 					}
142 
143 					if (p == pEnd)
144 						break;
145 
146 					sal_Unicode c = *p++;
147 					if (c == '/' || c == '?' || c == '#')
148 						break;
149 				}
150 				break;
151 		}
152 
153 		if (bMatches)
154 		{
155 			if (m_bTranslation)
156 			{
157 				if (pTranslation)
158 				{
159 					rtl::OUStringBuffer aBuffer(m_aReversePrefix);
160 					aBuffer.append(pBlock1Begin, pBlock1End - pBlock1Begin);
161 					aBuffer.append(m_aInfix);
162 					aBuffer.append(pBlock2Begin, pBlock2End - pBlock2Begin);
163 					*pTranslation = aBuffer.makeStringAndClear();
164 				}
165 				if (pTranslated)
166 					*pTranslated = true;
167 			}
168 			else
169 			{
170 				if (pTranslation)
171 					*pTranslation = rString;
172 				if (pTranslated)
173 					*pTranslated = false;
174 			}
175 		}
176 	}
177 
178 	return bMatches;
179 }
180 
181 //============================================================================
182 namespace unnamed_ucb_regexp {
183 
isAlpha(sal_Unicode c)184 inline bool isAlpha(sal_Unicode c)
185 {
186 	return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
187 }
188 
isDigit(sal_Unicode c)189 inline bool isDigit(sal_Unicode c)
190 {
191 	return c >= '0' && c <= '9';
192 }
193 
isScheme(rtl::OUString const & rString,bool bColon)194 bool isScheme(rtl::OUString const & rString, bool bColon)
195 {
196 	// Return true if rString matches <scheme> (plus a trailing ":" if bColon
197     // is true) from RFC 2396:
198 	sal_Unicode const * p = rString.getStr();
199 	sal_Unicode const * pEnd = p + rString.getLength();
200 	if (p != pEnd && isAlpha(*p))
201 		for (++p;;)
202 		{
203 			if (p == pEnd)
204 				return !bColon;
205 			sal_Unicode c = *p++;
206 			if (!(isAlpha(c) || isDigit(c)
207                   || c == '+' || c == '-' || c == '.'))
208                 return bColon && c == ':' && p == pEnd;
209 		}
210 	return false;
211 }
212 
appendStringLiteral(rtl::OUStringBuffer * pBuffer,rtl::OUString const & rString)213 void appendStringLiteral(rtl::OUStringBuffer * pBuffer,
214 						 rtl::OUString const & rString)
215 {
216 	OSL_ASSERT(pBuffer);
217 
218 	pBuffer->append(sal_Unicode('"'));
219 	sal_Unicode const * p = rString.getStr();
220 	sal_Unicode const * pEnd = p + rString.getLength();
221 	while (p != pEnd)
222 	{
223 		sal_Unicode c = *p++;
224 		if (c == '"' || c == '\\')
225 			pBuffer->append(sal_Unicode('\\'));
226 		pBuffer->append(c);
227 	}
228 	pBuffer->append(sal_Unicode('"'));
229 }
230 
231 }
232 
getRegexp(bool bReverse) const233 rtl::OUString Regexp::getRegexp(bool bReverse) const
234 {
235 	if (m_bTranslation)
236 	{
237 		rtl::OUStringBuffer aBuffer;
238 		if (bReverse)
239 		{
240 			if (m_aReversePrefix.getLength() != 0)
241 				appendStringLiteral(&aBuffer, m_aReversePrefix);
242 		}
243 		else
244 		{
245 			if (m_aPrefix.getLength() != 0)
246 				appendStringLiteral(&aBuffer, m_aPrefix);
247 		}
248 		switch (m_eKind)
249 		{
250 			case KIND_PREFIX:
251 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("(.*)"));
252 				break;
253 
254 			case KIND_AUTHORITY:
255 				aBuffer.
256 					appendAscii(RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)"));
257 				break;
258 
259 			case KIND_DOMAIN:
260 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([^/?#]"));
261 				aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
262 				if (m_aInfix.getLength() != 0)
263 					appendStringLiteral(&aBuffer, m_aInfix);
264 				aBuffer.
265 					appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?)"));
266 				break;
267 		}
268 		aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("->"));
269 		if (bReverse)
270 		{
271 			if (m_aPrefix.getLength() != 0)
272 				appendStringLiteral(&aBuffer, m_aPrefix);
273 		}
274 		else
275 		{
276 			if (m_aReversePrefix.getLength() != 0)
277 				appendStringLiteral(&aBuffer, m_aReversePrefix);
278 		}
279 		aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("\\1"));
280 		return aBuffer.makeStringAndClear();
281 	}
282 	else if (m_eKind == KIND_PREFIX && isScheme(m_aPrefix, true))
283 		return m_aPrefix.copy(0, m_aPrefix.getLength() - 1);
284 	else
285 	{
286 		rtl::OUStringBuffer aBuffer;
287 		if (m_aPrefix.getLength() != 0)
288 			appendStringLiteral(&aBuffer, m_aPrefix);
289 		switch (m_eKind)
290 		{
291 			case KIND_PREFIX:
292 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM(".*"));
293 				break;
294 
295 			case KIND_AUTHORITY:
296 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
297 				break;
298 
299 			case KIND_DOMAIN:
300 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("[^/?#]"));
301 				aBuffer.append(sal_Unicode(m_bEmptyDomain ? '*' : '+'));
302 				if (m_aInfix.getLength() != 0)
303 					appendStringLiteral(&aBuffer, m_aInfix);
304 				aBuffer.appendAscii(RTL_CONSTASCII_STRINGPARAM("([/?#].*)?"));
305 				break;
306 		}
307 		return aBuffer.makeStringAndClear();
308 	}
309 }
310 
311 //============================================================================
312 namespace unnamed_ucb_regexp {
313 
matchString(sal_Unicode const ** pBegin,sal_Unicode const * pEnd,sal_Char const * pString,size_t nStringLength)314 bool matchString(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
315 				 sal_Char const * pString, size_t nStringLength)
316 {
317 	sal_Unicode const * p = *pBegin;
318 
319 	sal_uChar const * q = reinterpret_cast< sal_uChar const * >(pString);
320 	sal_uChar const * qEnd = q + nStringLength;
321 
322 	if (pEnd - p < qEnd - q)
323 		return false;
324 
325 	while (q != qEnd)
326 	{
327 		sal_Unicode c1 = *p++;
328 		sal_Unicode c2 = *q++;
329 		if (c1 != c2)
330 			return false;
331 	}
332 
333 	*pBegin = p;
334 	return true;
335 }
336 
scanStringLiteral(sal_Unicode const ** pBegin,sal_Unicode const * pEnd,rtl::OUString * pString)337 bool scanStringLiteral(sal_Unicode const ** pBegin, sal_Unicode const * pEnd,
338 					   rtl::OUString * pString)
339 {
340 	sal_Unicode const * p = *pBegin;
341 
342 	if (p == pEnd || *p++ != '"')
343 		return false;
344 
345 	rtl::OUStringBuffer aBuffer;
346 	for (;;)
347 	{
348 		if (p == pEnd)
349 			return false;
350 		sal_Unicode c = *p++;
351 		if (c == '"')
352 			break;
353 		if (c == '\\')
354 		{
355 			if (p == pEnd)
356 				return false;
357 			c = *p++;
358 			if (c != '"' && c != '\\')
359 				return false;
360 		}
361 		aBuffer.append(c);
362 	}
363 
364 	*pBegin = p;
365 	*pString = aBuffer.makeStringAndClear();
366 	return true;
367 }
368 
369 }
370 
parse(rtl::OUString const & rRegexp)371 Regexp Regexp::parse(rtl::OUString const & rRegexp)
372 {
373 	// Detect an input of '<scheme>' as an abbreviation of '"<scheme>:".*'
374 	// where <scheme> is as defined in RFC 2396:
375 	if (isScheme(rRegexp, false))
376 		return Regexp(Regexp::KIND_PREFIX,
377                       rRegexp
378                           + rtl::OUString(RTL_CONSTASCII_USTRINGPARAM(":")),
379                       false,
380                       rtl::OUString(),
381 					  false,
382                       rtl::OUString());
383 
384 	sal_Unicode const * p = rRegexp.getStr();
385 	sal_Unicode const * pEnd = p + rRegexp.getLength();
386 
387 	rtl::OUString aPrefix;
388 	scanStringLiteral(&p, pEnd, &aPrefix);
389 
390 	if (p == pEnd)
391 		throw lang::IllegalArgumentException();
392 
393 	if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(".*")))
394 	{
395 		if (p != pEnd)
396 			throw lang::IllegalArgumentException();
397 
398 		return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
399 					  false, rtl::OUString());
400 	}
401 	else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("(.*)->")))
402 	{
403 		rtl::OUString aReversePrefix;
404 		scanStringLiteral(&p, pEnd, &aReversePrefix);
405 
406 		if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
407 			|| p != pEnd)
408 			throw lang::IllegalArgumentException();
409 
410 		return Regexp(Regexp::KIND_PREFIX, aPrefix, false, rtl::OUString(),
411 					  true, aReversePrefix);
412 	}
413 	else if (matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
414 	{
415 		if (p != pEnd)
416 			throw lang::IllegalArgumentException();
417 
418 		return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
419 					  false, rtl::OUString());
420 	}
421 	else if (matchString(&p, pEnd,
422 						 RTL_CONSTASCII_STRINGPARAM("(([/?#].*)?)->")))
423 	{
424 		rtl::OUString aReversePrefix;
425 		if (!(scanStringLiteral(&p, pEnd, &aReversePrefix)
426 			  && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))
427 			  && p == pEnd))
428 			throw lang::IllegalArgumentException();
429 
430 		return Regexp(Regexp::KIND_AUTHORITY, aPrefix, false, rtl::OUString(),
431 					  true, aReversePrefix);
432 	}
433 	else
434 	{
435 		bool bOpen = false;
436 		if (p != pEnd && *p == '(')
437 		{
438 			++p;
439 			bOpen = true;
440 		}
441 
442 		if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("[^/?#]")))
443 			throw lang::IllegalArgumentException();
444 
445 		if (p == pEnd || (*p != '*' && *p != '+'))
446 			throw lang::IllegalArgumentException();
447 		bool bEmptyDomain = *p++ == '*';
448 
449 		rtl::OUString aInfix;
450 		scanStringLiteral(&p, pEnd, &aInfix);
451 
452 		if (!matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("([/?#].*)?")))
453 			throw lang::IllegalArgumentException();
454 
455 		rtl::OUString aReversePrefix;
456 		if (bOpen
457 			&& !(matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM(")->"))
458 				 && scanStringLiteral(&p, pEnd, &aReversePrefix)
459 				 && matchString(&p, pEnd, RTL_CONSTASCII_STRINGPARAM("\\1"))))
460 			throw lang::IllegalArgumentException();
461 
462 		if (p != pEnd)
463 			throw lang::IllegalArgumentException();
464 
465 		return Regexp(Regexp::KIND_DOMAIN, aPrefix, bEmptyDomain, aInfix,
466 					  bOpen, aReversePrefix);
467 	}
468 }
469 
470