Apache HTTPD
framework
httpd-2.4.62
srclib
apr
misc
win32
utf8.c
Go to the documentation of this file.
1
/* Licensed to the Apache Software Foundation (ASF) under one or more
2
* contributor license agreements. See the NOTICE file distributed with
3
* this work for additional information regarding copyright ownership.
4
* The ASF licenses this file to You under the Apache License, Version 2.0
5
* (the "License"); you may not use this file except in compliance with
6
* the License. You may obtain a copy of the License at
7
*
8
* http://www.apache.org/licenses/LICENSE-2.0
9
*
10
* Unless required by applicable law or agreed to in writing, software
11
* distributed under the License is distributed on an "AS IS" BASIS,
12
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
* See the License for the specific language governing permissions and
14
* limitations under the License.
15
*/
16
17
#include "apr.h"
18
#include "apr_private.h"
19
#include "
apr_errno.h
"
20
#include "
apr_arch_utf8.h
"
21
22
/* Implementation of RFC 3629, "UTF-8, a transformation format of ISO 10646"
23
* with particular attention to canonical translation forms (see section 10
24
* "Security Considerations" of the RFC for more info).
25
*
26
* Since several architectures including Windows support unicode, with UCS2
27
* used as the actual storage conventions by that architecture, these functions
28
* exist to transform or validate UCS2 strings into APR's 'char' type
29
* convention. It is left up to the operating system to determine the
30
* validity of the string, e.g. normative forms, in the context of
31
* its native language support. Other file systems which support filename
32
* characters of 0x80-0xff but have no explicit requirement for Unicode
33
* will find this function useful only for validating the character sequences
34
* and rejecting poorly encoded UTF8 sequences.
35
*
36
* Len UCS-4 range (hex) UTF-8 octet sequence (binary)
37
* 1:2 00000000-0000007F 0xxxxxxx
38
* 2:2 00000080-000007FF 110XXXXx 10xxxxxx
39
* 3:2 00000800-0000FFFF 1110XXXX 10Xxxxxx 10xxxxxx
40
* 4:4 00010000-001FFFFF 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx
41
* 00200000-03FFFFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
42
* 04000000-7FFFFFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
43
*
44
* One of the X bits must be 1 to avoid overlong representation of ucs2 values.
45
*
46
* For conversion into ucs2, the 4th form is limited in range to 0010 FFFF,
47
* and the final two forms are used only by full ucs4, per RFC 3629;
48
*
49
* "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
50
* Unicode parlance), being actually UCS-4 characters transformed
51
* through UTF-16, need special treatment: the UTF-16 transformation
52
* must be undone, yielding a UCS-4 character that is then transformed
53
* as above."
54
*
55
* From RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
56
*
57
* U' = U - 0x10000
58
* U' = 00000000 0000yyyy yyyyyyxx xxxxxxxx
59
* W1 = 110110yy yyyyyyyy
60
* W2 = 110111xx xxxxxxxx
61
* Max U' = 0000 00001111 11111111 11111111
62
* Max U = 0000 00010000 11111111 11111111
63
*
64
* Len in the table above is a mapping of bytes used for utf8:ucs2 values,
65
* which results in these conclusions of maximum allocations;
66
*
67
* apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
68
* apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
69
*/
70
71
/* Convert UTF-8 bytes into ucs-2 (UTF-16) words.
 *
 *   in        UTF-8 input bytes
 *   inbytes   in: number of input bytes available;
 *             out: number of input bytes not yet consumed
 *   out       destination for the ucs-2 words
 *   outwords  in: number of output words available;
 *             out: number of output words still unused
 *
 * Returns APR_EINVAL for a malformed sequence (stray continuation byte,
 * over-long lead, overlong encoding, an encoded D800-DFFF value, or a
 * value above 10FFFF), APR_INCOMPLETE when the input stops in the middle
 * of a multibyte sequence, and APR_SUCCESS otherwise.  Running out of
 * output space is reported as APR_SUCCESS; the caller has to compare the
 * remaining *inbytes and *outwords to detect it.
 */
APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
                                                apr_size_t *inbytes,
                                                apr_wchar_t *out,
                                                apr_size_t *outwords)
{
    apr_int64_t codepoint, lead_mask;
    apr_size_t trailing, consumed, words_needed;
    int octet;

    while (*inbytes && *outwords) {
        octet = (unsigned char)*in++;

        /* 7-bit US-ASCII: one byte in, one word out */
        if ((octet & 0x80) == 0) {
            --*inbytes;
            --*outwords;
            *out++ = octet;
            continue;
        }

        /* A continuation byte (10xxxxxx) may not begin a sequence */
        if ((octet & 0xC0) != 0xC0) {
            return APR_EINVAL;
        }

        /* Lead byte: count the continuation bytes it announces while
         * widening the mask that covers our lead byte's length bits.
         */
        lead_mask = 0xE0;
        trailing = 1;
        while ((octet & lead_mask) == lead_mask) {
            lead_mask |= lead_mask >> 1;
            ++trailing;
            if (trailing > 3) {   /* (truly 5 for ucs-4) */
                return APR_EINVAL;
            }
        }
        codepoint = octet & ~lead_mask;
        consumed = trailing + 1;

        /* The lead byte has not been deducted from *inbytes yet, so we
         * need strictly more than 'trailing' bytes to be available.
         */
        if (*inbytes <= trailing) {
            return APR_INCOMPLETE;
        }

        /* utf-8 demands the shortest possible encoding, so sequences
         * whose significant leading bits are all zero are rejected.
         */
        if (trailing == 1) {
            if ((codepoint & 0x1E) == 0) {
                return APR_EINVAL;
            }
        }
        else {
            if (codepoint == 0
                && ((unsigned char)*in & 0x3F & (lead_mask << 1)) == 0) {
                return APR_EINVAL;
            }
            if (trailing == 2) {
                /* D800-DFFF are only legal as utf-16 surrogates
                 * (may not be an appropriate restriction for ucs-4)
                 */
                if (codepoint == 0x0D && ((unsigned char)*in & 0x20)) {
                    return APR_EINVAL;
                }
            }
            else if (trailing == 3) {
                /* Short circuit values > 110000 */
                if (codepoint > 4) {
                    return APR_EINVAL;
                }
                if (codepoint == 4 && ((unsigned char)*in & 0x30)) {
                    return APR_EINVAL;
                }
            }
        }

        /* A four byte sequence folds into a surrogate pair and therefore
         * needs two output words; everything else needs one.
         */
        words_needed = (trailing > 2) ? 2 : 1;
        if (*outwords < words_needed) {
            break;  /* buffer full */
        }

        /* Accumulate six payload bits from each continuation byte */
        for (; trailing; --trailing) {
            octet = (unsigned char)*in++;
            if ((octet & 0xC0) != 0x80) {
                /* Multibyte Continuation must be legal */
                return APR_EINVAL;
            }
            codepoint = (codepoint << 6) | (octet & 0x3F);
        }
        *inbytes -= consumed;

        /* codepoint is now a true ucs-4 character; fold it to ucs-2 */
        if (codepoint < 0x10000) {
            --*outwords;
            *out++ = (apr_wchar_t)codepoint;
        }
        else {
            *outwords -= 2;
            codepoint -= 0x10000;
            *out++ = (apr_wchar_t)(0xD800 | (codepoint >> 10));
            *out++ = (apr_wchar_t)(0xDC00 | (codepoint & 0x03FF));
        }
    }

    /* Buffer full 'errors' aren't errors, the client must inspect both
     * the inbytes and outwords values
     */
    return APR_SUCCESS;
}
181
182
/* Convert ucs-2 (UTF-16) words into UTF-8 bytes.
 *
 *   in        ucs-2 input words
 *   inwords   in: number of input words available;
 *             out: number of input words not yet consumed
 *   out       destination for the UTF-8 bytes
 *   outbytes  in: number of output bytes available;
 *             out: number of output bytes still unused
 *
 * Returns APR_EINVAL for a DC00-DFFF word that does not follow a
 * D800-DBFF word, or a D800-DBFF word that is not followed by a
 * DC00-DFFF word; APR_INCOMPLETE when the input ends after a D800-DBFF
 * word; and APR_SUCCESS otherwise.  Running out of output space is
 * reported as APR_SUCCESS; the caller has to compare the remaining
 * *inwords and *outbytes to detect it.
 */
APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
                                                apr_size_t *inwords,
                                                char *out,
                                                apr_size_t *outbytes)
{
    apr_int64_t codepoint, probe;
    apr_size_t trailing;
    char *cursor;
    int unit, lead_bits;

    while (*inwords && *outbytes) {
        unit = (unsigned short)*in++;

        /* 7-bit values: one word in, one byte out */
        if (unit < 0x80) {
            --*inwords;
            --*outbytes;
            *out++ = (unsigned char)unit;
            continue;
        }

        switch (unit & 0xFC00) {
        case 0xDC00:
            /* Invalid Leading ucs-2 Multiword Continuation Character */
            return APR_EINVAL;

        case 0xD800:
            /* Leading ucs-2 Multiword Character; the leading word has not
             * been deducted from *inwords yet, so two must be available.
             */
            if (*inwords < 2) {
                /* Missing ucs-2 Multiword Continuation Character */
                return APR_INCOMPLETE;
            }
            if (((unsigned short)*in & 0xFC00) != 0xDC00) {
                /* Invalid ucs-2 Multiword Continuation Character */
                return APR_EINVAL;
            }
            codepoint = (unit & 0x03FF) << 10
                      | ((unsigned short)*in++ & 0x03FF);
            codepoint += 0x10000;
            break;

        default:
            /* ucs-2 Single Word Character */
            codepoint = unit;
            break;
        }

        /* Count the continuation bytes of the shortest utf-8 form: one
         * for anything that fits in 11 bits, plus one more for each
         * further 5 bits of magnitude.
         */
        trailing = 1;
        for (probe = codepoint >> 11; probe; probe >>= 5) {
            ++trailing;
        }

        /* The encoding takes trailing + 1 bytes in total */
        if (trailing >= *outbytes) {
            break;  /* Insufficient buffer */
        }

        /* More than two continuation bytes means a surrogate pair was read */
        *inwords -= (trailing > 2) ? 2 : 1;
        *outbytes -= trailing + 1;

        /* Emit the bytes last to first, growing the lead byte's length
         * marker by one bit per continuation byte written.
         */
        out += trailing + 1;
        cursor = out;
        lead_bits = 0x80;
        while (trailing--) {
            lead_bits |= lead_bits >> 1;
            *--cursor = (unsigned char)(0x80 | (codepoint & 0x3F));
            codepoint >>= 6;
        }

        /* What remains of codepoint is the payload of the lead byte */
        *--cursor = (unsigned char)(lead_bits | codepoint);
    }

    /* Buffer full 'errors' aren't errors, the client must inspect both
     * the inwords and outbytes values
     */
    return APR_SUCCESS;
}
apr_arch_utf8.h
outbytes
apr_size_t char apr_size_t * outbytes
Definition
apr_arch_utf8.h:54
outwords
apr_size_t apr_wchar_t apr_size_t * outwords
Definition
apr_arch_utf8.h:40
apr_wchar_t
apr_uint16_t apr_wchar_t
Definition
apr_arch_utf8.h:26
inwords
apr_size_t * inwords
Definition
apr_arch_utf8.h:52
inbytes
apr_size_t * inbytes
Definition
apr_arch_utf8.h:38
apr_errno.h
APR Error Codes.
APR_INCOMPLETE
#define APR_INCOMPLETE
Definition
apr_errno.h:452
APR_EINVAL
#define APR_EINVAL
Definition
apr_errno.h:711
mask
const char * mask
Definition
apr_date.h:60
APR_DECLARE
const void apr_status_t(*) apr_status_t(* APR_DECLARE)(void) apr_pool_pre_cleanup_register(apr_pool_t *p
Definition
apr_pools.h:646
size
apr_size_t size
Definition
apr_allocator.h:115
APR_SUCCESS
#define APR_SUCCESS
Definition
apr_errno.h:225
apr_status_t
int apr_status_t
Definition
apr_errno.h:44
in
apr_int32_t in
Definition
apr_thread_proc.h:417
out
static apr_file_t * out
Definition
mod_info.c:85
Generated by
1.9.8