1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #ifndef _RTL_URI_H_ 29 #define _RTL_URI_H_ 30 31 #include "rtl/textenc.h" 32 #include "rtl/ustring.h" 33 #include "sal/types.h" 34 35 #if defined __cplusplus 36 extern "C" { 37 #endif /* __cplusplus */ 38 39 /** Various predefined URI 'char classes.' 40 41 @descr 42 A 'char class' defines which (ASCII) characters can be written 'as they 43 are' in a part of a Uri, and which characters have to be written using 44 escape sequences ('%' followed by two hex digits). Characters outside 45 the ASCII range are always written using escape sequences. 46 47 @descr 48 If there are other frequently used char classes, they can be added to 49 this enumeration; the function rtl_getUriCharClass() has to be adapted 50 then, too. 51 */ 52 typedef enum 53 { 54 /** The empty char class. 55 56 @descr 57 All characters are written using escape sequences. 58 */ 59 rtl_UriCharClassNone, 60 61 /** The RFC 2732 <uric> char class. 62 63 @descr 64 The 'valid' characters are !$&'()*+,-./:;=?@[]_~ plus digits and 65 letters. 66 */ 67 rtl_UriCharClassUric, 68 69 /** The RFC 2396 <uric_no_slash> char class. 70 71 @descr 72 The 'valid' characters are !$&'()*+,-.:;=?@_~ plus digits and letters. 73 */ 74 rtl_UriCharClassUricNoSlash, 75 76 /** The RFC 2396 <rel_segment> char class. 77 78 @descr 79 The 'valid' characters are !$&'()*+,-.;=@_~ plus digits and letters. 80 */ 81 rtl_UriCharClassRelSegment, 82 83 /** The RFC 2396 <reg_name> char class. 84 85 @descr 86 The 'valid' characters are !$&'()*+,-.:;=@_~ plus digits and letters. 87 */ 88 rtl_UriCharClassRegName, 89 90 /** The RFC 2396 <userinfo> char class. 91 92 @descr 93 The 'valid' characters are !$&'()*+,-.:;=_~ plus digits and letters. 94 */ 95 rtl_UriCharClassUserinfo, 96 97 /** The RFC 2396 <pchar> char class. 98 99 @descr 100 The 'valid' characters are !$&'()*+,-.:=@_~ plus digits and letters. 101 */ 102 rtl_UriCharClassPchar, 103 104 /** The char class for the values of uno URL parameters. 105 106 @descr 107 The 'valid' characters are !$&'()*+-./:?@_~ plus digits and letters. 108 */ 109 rtl_UriCharClassUnoParamValue, 110 111 rtl_UriCharClass_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 112 } 113 rtl_UriCharClass; 114 115 /** The mechanism describing how escape sequences in the input of 116 rtl_uriEncode() are handled. 117 */ 118 typedef enum 119 { 120 /** The special meaning of '%' is ignored (i.e., there are by definition 121 no escape sequences in the input). 122 123 @descr 124 This mechanism is useful to encode user input as part of a URI (e.g., 125 the user-supplied password in an ftp URL---'%20abcde' is a valid 126 password, so do not assume that the '%20' is an escaped space). 127 */ 128 rtl_UriEncodeIgnoreEscapes, 129 130 /** All escape sequences ('%' followed by two hex digits) are kept intact, 131 even if they represent characters that need not be escaped or if they 132 do not even map to characters in the given charset. 133 134 @descr 135 This mechanism is useful when passing on complete URIs more or less 136 unmodified (e.g., within an HTTP proxy): missing escape sequences are 137 added, but existing escape sequences are not touched (except that any 138 lower case hex digits are replaced by upper case hex digits). 139 */ 140 rtl_UriEncodeKeepEscapes, 141 142 /** All escape sequences ('%' followed by two hex digits) are resolved in 143 a first step; only those that represent characters that need to be 144 escaped are kept intact. 145 146 @descr 147 This mechanism is useful to properly encode complete URIs entered by 148 the user: the URI is brought into a 'canonic form,' but care is taken 149 not to damage (valid) escape sequences the (careful) user already 150 entered as such. 151 */ 152 rtl_UriEncodeCheckEscapes, 153 154 /** Like rtl_UriEncodeIgnoreEscapes, but indicating failure when converting 155 unmappable characters. 156 157 @since UDK 3.2.0 158 */ 159 rtl_UriEncodeStrict, 160 161 /** Like rtl_UriEncodeKeepEscapes, but indicating failure when converting 162 unmappable characters. 163 164 @since UDK 3.2.7 165 */ 166 rtl_UriEncodeStrictKeepEscapes, 167 168 rtl_UriEncode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 169 } 170 rtl_UriEncodeMechanism; 171 172 /** The mechanism describing how rtl_uriDecode() translates (part of) a URI 173 into a Unicode string. 174 */ 175 typedef enum 176 { 177 /** The text is returned completely unmodified. 178 */ 179 rtl_UriDecodeNone, 180 181 /** The text is returned in the form of an IURI (cf. 182 draft-masinter-url-i18n-05.txt). 183 184 @descr 185 All escape sequences representing ASCII characters (%00--%7F) are 186 kept, all other escape sequences are interpreted as UTF-8 characters 187 and translated to Unicode, if possible. 188 */ 189 rtl_UriDecodeToIuri, 190 191 /** The text is decoded. 192 193 @descr 194 All escape sequences representing characters from the given charset 195 are decoded and translated to Unicode, if possible. 196 */ 197 rtl_UriDecodeWithCharset, 198 199 /** Like rtl_UriDecodeWithCharset, but indicating failure when converting 200 unmappable characters. 201 202 @since UDK 3.2.0 203 */ 204 rtl_UriDecodeStrict, 205 206 rtl_UriDecode_FORCE_EQUAL_SIZE = SAL_MAX_ENUM 207 } 208 rtl_UriDecodeMechanism; 209 210 /** Map a predefined rtl_UriCharClass to a form usable by rtl_uriEncode(). 211 212 @descr 213 The function rtl_uriEncode() expects an array of 128 booleans, and this 214 function maps rtl_UriCharClass enumeration members to such arrays. 215 216 @param eCharClass 217 Any valid member of rtl_UriCharClass. 218 219 @return 220 An array of 128 booleans, to be used in calls to rtl_uriEncode(). 221 */ 222 sal_Bool const * SAL_CALL rtl_getUriCharClass(rtl_UriCharClass eCharClass) 223 SAL_THROW_EXTERN_C(); 224 225 /** Encode a text as (part of) a URI. 226 227 @param pText 228 Any Unicode string. Must not be null. 229 230 @param pCharClass 231 A char class, represented as an array of 128 booleans (true means keep the 232 corresponding ASCII character unencoded, false means encode it). Must not 233 be null, and the boolean corresponding to the percent sign (0x25) must be 234 false. (See rtl_getUriCharClass() for a function mapping from 235 rtl_UriCharClass to such arrays.) 236 237 @param eMechanism 238 The mechanism describing how escape sequences in the input text are 239 handled. 240 241 @param eCharset 242 When Unicode characters from the input text have to be written using 243 escape sequences (because they are either outside the ASCII range or do 244 not belong to the given char class), they are first translated into this 245 charset before being encoded using escape sequences. 246 247 Also, if the encode mechanism is rtl_UriEncodeCheckEscapes, all escape 248 sequences already present in the input text are interpreted as characters 249 from this charset. 250 251 @param pResult 252 Returns an encoded representation of the input text. Must itself not be 253 null, and must point to either null or a valid string. 254 255 If the encode mechanism is rtl_UriEncodeStrict, and pText cannot be 256 converted to eCharset because it contains unmappable characters (which 257 implies that pText is not empty), then an empty string is returned. 258 */ 259 void SAL_CALL rtl_uriEncode(rtl_uString * pText, 260 sal_Bool const * pCharClass, 261 rtl_UriEncodeMechanism eMechanism, 262 rtl_TextEncoding eCharset, 263 rtl_uString ** pResult) 264 SAL_THROW_EXTERN_C(); 265 266 /** Decode (a part of) a URI. 267 268 @param pText 269 Any Unicode string. Must not be null. (If the input is indeed part of a 270 valid URI, this string will only contain a subset of the ASCII characters, 271 but this function also handles other Unicode characters properly.) 272 273 @param eMechanism 274 The mechanism describing how the input text is translated into a Unicode 275 string. 276 277 @param eCharset 278 When the decode mechanism is rtl_UriDecodeWithCharset, all escape 279 sequences in the input text are interpreted as characters from this 280 charset. Those characters are translated to Unicode characters in the 281 resulting output, if possible. 282 283 When the decode mechanism is rtl_UriDecodeNone or rtl_UriDecodeToIuri, 284 this parameter is ignored (and is best specified as 285 RTL_TEXTENCODING_UTF8). 286 287 @param pResult 288 Returns a decoded representation of the input text. Must itself not be 289 null, and must point to either null or a valid string. 290 291 If the decode mechanism is rtl_UriDecodeStrict, and pText cannot be 292 converted to eCharset because it contains (encodings of) unmappable 293 characters (which implies that pText is not empty), then an empty string is 294 returned. 295 */ 296 void SAL_CALL rtl_uriDecode(rtl_uString * pText, 297 rtl_UriDecodeMechanism eMechanism, 298 rtl_TextEncoding eCharset, 299 rtl_uString ** pResult) 300 SAL_THROW_EXTERN_C(); 301 302 /** Convert a relative URI reference into an absolute one. 303 304 A URI reference is a URI plus an optional <"#" fragment> part. 305 306 This function uses the algorithm described in RFC 2396, section 5.2, with 307 the following clarifications: (1) Backwards-compatible relative URIs 308 starting with a scheme component (see RFC 2396, section 5.2, step 3) are not 309 supported. (2) Segments "." and ".." within the path of the base URI are 310 not considered special, RFC 2396 seems a bit unlcear about that point. 311 (3) Erroneous excess segments ".." within the path of the relative URI (if 312 it is indeed relative) are left intact, as the examples in RFC 2396, 313 section C.2, suggest. (4) If the relative URI is a reference to the 314 "current document," the "current document" is taken to be the base URI. 315 316 This function signals exceptions by returning false and letting pException 317 point to a message explaining the exception. 318 319 @param pBaseUriRef 320 An absolute, hierarchical URI reference that serves as the base URI. If it 321 has to be inspected (i.e., pRelUriRef is not an absolute URI already), and 322 if it either is not an absolute URI (i.e., does not begin with a 323 <scheme ":"> part) or has a path that is non-empty but does not start 324 with "/", an exception will be signaled. 325 326 @param pRelUriRef 327 An URI reference that may be either absolute or relative. If it is 328 absolute, it will be returned unmodified (and it need not be hierarchical 329 then). 330 331 @param pResult 332 Returns an absolute URI reference. Must itself not be null, and must point 333 to either null or a valid string. If an exception is signalled, it is left 334 unchanged. 335 336 @param pException 337 Returns an explanatory message in case an exception is signalled. Must 338 itself not be null, and must point to either null or a valid string. If no 339 exception is signalled, it is left unchanged. 340 341 @return 342 True if no exception is signalled, otherwise false. 343 */ 344 sal_Bool SAL_CALL rtl_uriConvertRelToAbs(rtl_uString * pBaseUriRef, 345 rtl_uString * pRelUriRef, 346 rtl_uString ** pResult, 347 rtl_uString ** pException) 348 SAL_THROW_EXTERN_C(); 349 350 #if defined __cplusplus 351 } 352 #endif /* __cplusplus */ 353 354 #endif /* _RTL_URI_H_ */ 355