00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "config.h"
00022 #include "system.h"
00023 #include "cpplib.h"
00024 #include "internal.h"
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
/* Without a host iconv, supply stand-ins so the rest of this file
   compiles unchanged: opening or running a conversion always fails
   with errno set to EINVAL, closing a descriptor does nothing, and
   ICONV_CONST expands to nothing.  */
#if !HAVE_ICONV

#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x) (void)0
#define ICONV_CONST
#endif
00080
/* Pick the name of the source character set and the largest code that
   cpp_host_to_exec_charset will accept, according to the host's basic
   character set.  Any other host character set is a build error.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif
00090
/* Hosts that do not define EILSEQ report malformed sequences as
   EINVAL instead.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
00094
00095
00096
00097
/* Growable byte buffer that the converters in this file append their
   output to.  */
struct _cpp_strbuf
{
  uchar *text;   /* Start of the storage; enlarged with XRESIZEVEC.  */
  size_t asize;  /* Number of bytes currently allocated at TEXT.  */
  size_t len;    /* Number of bytes of TEXT already filled.  */
};
00104
00105
00106
00107
/* Number of bytes by which an output buffer is enlarged each time a
   converter runs out of room.  */
#define OUTBUF_BLOCK_SIZE 256
00109
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168 static inline int
00169 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
00170 cppchar_t *cp)
00171 {
00172 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
00173 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00174
00175 cppchar_t c;
00176 const uchar *inbuf = *inbufp;
00177 size_t nbytes, i;
00178
00179 if (*inbytesleftp < 1)
00180 return EINVAL;
00181
00182 c = *inbuf;
00183 if (c < 0x80)
00184 {
00185 *cp = c;
00186 *inbytesleftp -= 1;
00187 *inbufp += 1;
00188 return 0;
00189 }
00190
00191
00192
00193 for (nbytes = 2; nbytes < 7; nbytes++)
00194 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
00195 goto found;
00196 return EILSEQ;
00197 found:
00198
00199 if (*inbytesleftp < nbytes)
00200 return EINVAL;
00201
00202 c = (c & masks[nbytes-1]);
00203 inbuf++;
00204 for (i = 1; i < nbytes; i++)
00205 {
00206 cppchar_t n = *inbuf++;
00207 if ((n & 0xC0) != 0x80)
00208 return EILSEQ;
00209 c = ((c << 6) + (n & 0x3F));
00210 }
00211
00212
00213 if (c <= 0x7F && nbytes > 1) return EILSEQ;
00214 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
00215 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
00216 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
00217 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
00218
00219
00220 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
00221
00222 *cp = c;
00223 *inbufp = inbuf;
00224 *inbytesleftp -= nbytes;
00225 return 0;
00226 }
00227
00228 static inline int
00229 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
00230 {
00231 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00232 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
00233 size_t nbytes;
00234 uchar buf[6], *p = &buf[6];
00235 uchar *outbuf = *outbufp;
00236
00237 nbytes = 1;
00238 if (c < 0x80)
00239 *--p = c;
00240 else
00241 {
00242 do
00243 {
00244 *--p = ((c & 0x3F) | 0x80);
00245 c >>= 6;
00246 nbytes++;
00247 }
00248 while (c >= 0x3F || (c & limits[nbytes-1]));
00249 *--p = (c | masks[nbytes-1]);
00250 }
00251
00252 if (*outbytesleftp < nbytes)
00253 return E2BIG;
00254
00255 while (p < &buf[6])
00256 *outbuf++ = *p++;
00257 *outbytesleftp -= nbytes;
00258 *outbufp = outbuf;
00259 return 0;
00260 }
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279 static inline int
00280 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00281 uchar **outbufp, size_t *outbytesleftp)
00282 {
00283 uchar *outbuf;
00284 cppchar_t s = 0;
00285 int rval;
00286
00287
00288 if (*outbytesleftp < 4)
00289 return E2BIG;
00290
00291 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
00292 if (rval)
00293 return rval;
00294
00295 outbuf = *outbufp;
00296 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
00297 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
00298 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
00299 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
00300
00301 *outbufp += 4;
00302 *outbytesleftp -= 4;
00303 return 0;
00304 }
00305
00306 static inline int
00307 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00308 uchar **outbufp, size_t *outbytesleftp)
00309 {
00310 cppchar_t s;
00311 int rval;
00312 const uchar *inbuf;
00313
00314 if (*inbytesleftp < 4)
00315 return EINVAL;
00316
00317 inbuf = *inbufp;
00318
00319 s = inbuf[bigend ? 0 : 3] << 24;
00320 s += inbuf[bigend ? 1 : 2] << 16;
00321 s += inbuf[bigend ? 2 : 1] << 8;
00322 s += inbuf[bigend ? 3 : 0];
00323
00324 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
00325 return EILSEQ;
00326
00327 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
00328 if (rval)
00329 return rval;
00330
00331 *inbufp += 4;
00332 *inbytesleftp -= 4;
00333 return 0;
00334 }
00335
00336 static inline int
00337 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00338 uchar **outbufp, size_t *outbytesleftp)
00339 {
00340 int rval;
00341 cppchar_t s = 0;
00342 const uchar *save_inbuf = *inbufp;
00343 size_t save_inbytesleft = *inbytesleftp;
00344 uchar *outbuf = *outbufp;
00345
00346 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
00347 if (rval)
00348 return rval;
00349
00350 if (s > 0x0010FFFF)
00351 {
00352 *inbufp = save_inbuf;
00353 *inbytesleftp = save_inbytesleft;
00354 return EILSEQ;
00355 }
00356
00357 if (s < 0xFFFF)
00358 {
00359 if (*outbytesleftp < 2)
00360 {
00361 *inbufp = save_inbuf;
00362 *inbytesleftp = save_inbytesleft;
00363 return E2BIG;
00364 }
00365 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
00366 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
00367
00368 *outbufp += 2;
00369 *outbytesleftp -= 2;
00370 return 0;
00371 }
00372 else
00373 {
00374 cppchar_t hi, lo;
00375
00376 if (*outbytesleftp < 4)
00377 {
00378 *inbufp = save_inbuf;
00379 *inbytesleftp = save_inbytesleft;
00380 return E2BIG;
00381 }
00382
00383 hi = (s - 0x10000) / 0x400 + 0xD800;
00384 lo = (s - 0x10000) % 0x400 + 0xDC00;
00385
00386
00387
00388 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
00389 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
00390 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
00391 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
00392
00393 *outbufp += 4;
00394 *outbytesleftp -= 4;
00395 return 0;
00396 }
00397 }
00398
00399 static inline int
00400 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00401 uchar **outbufp, size_t *outbytesleftp)
00402 {
00403 cppchar_t s;
00404 const uchar *inbuf = *inbufp;
00405 int rval;
00406
00407 if (*inbytesleftp < 2)
00408 return EINVAL;
00409 s = inbuf[bigend ? 0 : 1] << 8;
00410 s += inbuf[bigend ? 1 : 0];
00411
00412
00413 if (s >= 0xDC00 && s <= 0xDFFF)
00414 return EILSEQ;
00415
00416 else if (s >= 0xD800 && s <= 0xDBFF)
00417 {
00418 cppchar_t hi = s, lo;
00419 if (*inbytesleftp < 4)
00420 return EINVAL;
00421
00422 lo = inbuf[bigend ? 2 : 3] << 8;
00423 lo += inbuf[bigend ? 3 : 2];
00424
00425 if (lo < 0xDC00 || lo > 0xDFFF)
00426 return EILSEQ;
00427
00428 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
00429 }
00430
00431 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
00432 if (rval)
00433 return rval;
00434
00435
00436
00437 if (s <= 0xFFFF)
00438 {
00439 *inbufp += 2;
00440 *inbytesleftp -= 2;
00441 }
00442 else
00443 {
00444 *inbufp += 4;
00445 *inbytesleftp -= 4;
00446 }
00447 return 0;
00448 }
00449
00450
00451
00452
00453
00454 static inline bool
00455 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
00456 uchar **, size_t *),
00457 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
00458 {
00459 const uchar *inbuf;
00460 uchar *outbuf;
00461 size_t inbytesleft, outbytesleft;
00462 int rval;
00463
00464 inbuf = from;
00465 inbytesleft = flen;
00466 outbuf = to->text + to->len;
00467 outbytesleft = to->asize - to->len;
00468
00469 for (;;)
00470 {
00471 do
00472 rval = one_conversion (cd, &inbuf, &inbytesleft,
00473 &outbuf, &outbytesleft);
00474 while (inbytesleft && !rval);
00475
00476 if (__builtin_expect (inbytesleft == 0, 1))
00477 {
00478 to->len = to->asize - outbytesleft;
00479 return true;
00480 }
00481 if (rval != E2BIG)
00482 {
00483 errno = rval;
00484 return false;
00485 }
00486
00487 outbytesleft += OUTBUF_BLOCK_SIZE;
00488 to->asize += OUTBUF_BLOCK_SIZE;
00489 to->text = XRESIZEVEC (uchar, to->text, to->asize);
00490 outbuf = to->text + to->asize - outbytesleft;
00491 }
00492 }
00493
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505 static bool
00506 convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
00507 struct _cpp_strbuf *to)
00508 {
00509 return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
00510 }
00511
00512 static bool
00513 convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
00514 struct _cpp_strbuf *to)
00515 {
00516 return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
00517 }
00518
00519 static bool
00520 convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
00521 struct _cpp_strbuf *to)
00522 {
00523 return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
00524 }
00525
00526 static bool
00527 convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
00528 struct _cpp_strbuf *to)
00529 {
00530 return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
00531 }
00532
00533
00534 static bool
00535 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
00536 const uchar *from, size_t flen, struct _cpp_strbuf *to)
00537 {
00538 if (to->len + flen > to->asize)
00539 {
00540 to->asize = to->len + flen;
00541 to->text = XRESIZEVEC (uchar, to->text, to->asize);
00542 }
00543 memcpy (to->text + to->len, from, flen);
00544 to->len += flen;
00545 return true;
00546 }
00547
00548
00549
/* Converter backed by the host iconv.  Appends the conversion of the
   FLEN bytes at FROM to TO using descriptor CD, enlarging TO by
   OUTBUF_BLOCK_SIZE each time iconv fails with E2BIG.  Returns true
   once all input is consumed; returns false, leaving errno as iconv
   set it, on any other failure.  Without a host iconv the name is
   defined as 0 so that references to it still compile.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src;
  char *dst;
  size_t src_left, dst_left;

  /* Calling iconv with null buffers resets CD to its initial state.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  src = (ICONV_CONST char *)from;
  src_left = flen;
  dst = (char *)to->text + to->len;
  dst_left = to->asize - to->len;

  while (1)
    {
      iconv (cd, &src, &src_left, &dst, &dst_left);

      if (__builtin_expect (src_left == 0, 1))
        {
          to->len = to->asize - dst_left;
          return true;
        }

      /* Anything other than "output buffer full" is fatal.  */
      if (errno != E2BIG)
        return false;

      /* Grow the buffer by one block and re-derive the write pointer,
         since the storage may have moved.  */
      dst_left += OUTBUF_BLOCK_SIZE;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
      dst = (char *)to->text + to->asize - dst_left;
    }
}
#else
#define convert_using_iconv 0
#endif
00588
00589
00590
00591
/* Run the converter described by CONVERTER (an object with `func' and
   `cd' members) over the FLEN bytes at FROM, appending to TO; yields
   the converter's result.  CONVERTER is evaluated twice, so it must
   not have side effects.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
00594
/* One entry of the table of conversions implemented in this file
   without iconv.  */
struct conversion
{
  const char *pair;  /* "FROM/TO" charset names, matched ignoring case.  */
  convert_f func;    /* Converter to install for this pair.  */
  iconv_t fake_cd;   /* Value stored as the descriptor; the built-in
                        converters read it as a big-endian flag.  */
};

/* The built-in conversions.  init_iconv_desc searches this table
   before falling back to iconv.  The fake descriptor is 1 for the
   big-endian variant of a pair and 0 for the little-endian one.  */
static const struct conversion conversion_tab[] = {
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};
00611
00612
00613
00614
00615
00616
00617 static struct cset_converter
00618 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
00619 {
00620 struct cset_converter ret;
00621 char *pair;
00622 size_t i;
00623
00624 if (!strcasecmp (to, from))
00625 {
00626 ret.func = convert_no_conversion;
00627 ret.cd = (iconv_t) -1;
00628 return ret;
00629 }
00630
00631 pair = (char *) alloca(strlen(to) + strlen(from) + 2);
00632
00633 strcpy(pair, from);
00634 strcat(pair, "/");
00635 strcat(pair, to);
00636 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
00637 if (!strcasecmp (pair, conversion_tab[i].pair))
00638 {
00639 ret.func = conversion_tab[i].func;
00640 ret.cd = conversion_tab[i].fake_cd;
00641 return ret;
00642 }
00643
00644
00645 if (HAVE_ICONV)
00646 {
00647 ret.func = convert_using_iconv;
00648 ret.cd = iconv_open (to, from);
00649
00650 if (ret.cd == (iconv_t) -1)
00651 {
00652 if (errno == EINVAL)
00653 cpp_error (pfile, CPP_DL_ERROR,
00654 "conversion from %s to %s not supported by iconv",
00655 from, to);
00656 else
00657 cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
00658
00659 ret.func = convert_no_conversion;
00660 }
00661 }
00662 else
00663 {
00664 cpp_error (pfile, CPP_DL_ERROR,
00665 "no iconv implementation, cannot convert from %s to %s",
00666 from, to);
00667 ret.func = convert_no_conversion;
00668 ret.cd = (iconv_t) -1;
00669 }
00670 return ret;
00671 }
00672
00673
00674
00675
00676
00677
00678 void
00679 cpp_init_iconv (cpp_reader *pfile)
00680 {
00681 const char *ncset = CPP_OPTION (pfile, narrow_charset);
00682 const char *wcset = CPP_OPTION (pfile, wide_charset);
00683 const char *default_wcset;
00684
00685 bool be = CPP_OPTION (pfile, bytes_big_endian);
00686
00687 if (CPP_OPTION (pfile, wchar_precision) >= 32)
00688 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
00689 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
00690 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
00691 else
00692
00693
00694 default_wcset = SOURCE_CHARSET;
00695
00696 if (!ncset)
00697 ncset = SOURCE_CHARSET;
00698 if (!wcset)
00699 wcset = default_wcset;
00700
00701 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
00702 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
00703 }
00704
00705
00706 void
00707 _cpp_destroy_iconv (cpp_reader *pfile)
00708 {
00709 if (HAVE_ICONV)
00710 {
00711 if (pfile->narrow_cset_desc.func == convert_using_iconv)
00712 iconv_close (pfile->narrow_cset_desc.cd);
00713 if (pfile->wide_cset_desc.func == convert_using_iconv)
00714 iconv_close (pfile->wide_cset_desc.cd);
00715 }
00716 }
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730 cppchar_t
00731 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
00732 {
00733 uchar sbuf[1];
00734 struct _cpp_strbuf tbuf;
00735
00736
00737
00738
00739 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
00740 {
00741 cpp_error (pfile, CPP_DL_ICE,
00742 "character 0x%lx is not in the basic source character set\n",
00743 (unsigned long)c);
00744 return 0;
00745 }
00746
00747
00748
00749
00750 sbuf[0] = c;
00751
00752
00753 tbuf.asize = 1;
00754 tbuf.text = XNEWVEC (uchar, tbuf.asize);
00755 tbuf.len = 0;
00756
00757 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
00758 {
00759 cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
00760 return 0;
00761 }
00762 if (tbuf.len != 1)
00763 {
00764 cpp_error (pfile, CPP_DL_ICE,
00765 "character 0x%lx is not unibyte in execution character set",
00766 (unsigned long)c);
00767 return 0;
00768 }
00769 c = tbuf.text[0];
00770 free(tbuf.text);
00771 return c;
00772 }
00773
00774
00775
00776
00777
00778 static inline size_t
00779 width_to_mask (size_t width)
00780 {
00781 width = MIN (width, BITS_PER_CPPCHAR_T);
00782 if (width >= CHAR_BIT * sizeof (size_t))
00783 return ~(size_t) 0;
00784 else
00785 return ((size_t) 1 << width) - 1;
00786 }
00787
00788
/* Bit flags stored in the `flags' member of each ucnranges entry.  The
   notes describe how ucn_valid_in_identifier uses each bit.  */
enum {
  /* Needed (or CXX) for the character to be accepted in an identifier
     at all; required outright in pedantic C99 mode.  */
  C99 = 1,
  /* In C99 mode, makes ucn_valid_in_identifier return 2, which
     _cpp_valid_ucn rejects at the start of an identifier.  */
  DIG = 2,
  /* Needed (or C99) for the character to be accepted in an identifier
     at all; required outright in pedantic C++ mode.  */
  CXX = 4,
  /* Raises the normalization level to at least
     normalized_identifier_C.  */
  CID = 8,
  /* Raises the normalization level to at least normalized_C.  */
  NFC = 16,
  /* Leaves the normalization level unchanged.  */
  NKC = 32,
  /* The character's effect on normalization depends on the previous
     character, so it gets a context check.  */
  CTX = 64
};
00805
/* Table of character ranges, filled in from ucnid.h.
   ucn_valid_in_identifier binary-searches it on `end', so entries are
   presumably sorted by increasing `end' -- confirm against the
   generator of ucnid.h.  */
static const struct {
  /* Bitmap of the C99/DIG/CXX/CID/NFC/NKC/CTX flags above.  */
  unsigned char flags;
  /* Compared against the previous character's value to detect
     out-of-order sequences; presumably the combining class.  */
  unsigned char combine;
  /* Last code point covered by this entry.  */
  unsigned short end;
} ucnranges[] = {
#include "ucnid.h"
};
00816
00817
00818
00819
00820
00821
00822
00823
00824 static int
00825 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
00826 struct normalize_state *nst)
00827 {
00828 int mn, mx, md;
00829
00830 if (c > 0xFFFF)
00831 return 0;
00832
00833 mn = 0;
00834 mx = ARRAY_SIZE (ucnranges) - 1;
00835 while (mx != mn)
00836 {
00837 md = (mn + mx) / 2;
00838 if (c <= ucnranges[md].end)
00839 mx = md;
00840 else
00841 mn = md + 1;
00842 }
00843
00844
00845
00846
00847 if (! (ucnranges[mn].flags & (C99 | CXX)))
00848 return 0;
00849
00850 if (CPP_PEDANTIC (pfile)
00851 && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
00852 || (CPP_OPTION (pfile, cplusplus)
00853 && !(ucnranges[mn].flags & CXX))))
00854 return 0;
00855
00856
00857 if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
00858 nst->level = normalized_none;
00859 else if (ucnranges[mn].flags & CTX)
00860 {
00861 bool safe;
00862 cppchar_t p = nst->previous;
00863
00864
00865 if (c == 0x09BE)
00866 safe = p != 0x09C7;
00867 else if (c == 0x0B3E)
00868 safe = p != 0x0B47;
00869 else if (c == 0x0BBE)
00870 safe = p != 0x0BC6 && p != 0x0BC7;
00871 else if (c == 0x0CC2)
00872 safe = p != 0x0CC6;
00873 else if (c == 0x0D3E)
00874 safe = p != 0x0D46 && p != 0x0D47;
00875
00876
00877
00878
00879
00880
00881
00882 else if (c >= 0x1161 && c <= 0x1175)
00883 safe = p < 0x1100 || p > 0x1112;
00884 else if (c >= 0x11A8 && c <= 0x11C2)
00885 safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
00886 else
00887 {
00888
00889 cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
00890 safe = true;
00891 }
00892 if (!safe && c < 0x1161)
00893 nst->level = normalized_none;
00894 else if (!safe)
00895 nst->level = MAX (nst->level, normalized_identifier_C);
00896 }
00897 else if (ucnranges[mn].flags & NKC)
00898 ;
00899 else if (ucnranges[mn].flags & NFC)
00900 nst->level = MAX (nst->level, normalized_C);
00901 else if (ucnranges[mn].flags & CID)
00902 nst->level = MAX (nst->level, normalized_identifier_C);
00903 else
00904 nst->level = normalized_none;
00905 nst->previous = c;
00906 nst->prev_class = ucnranges[mn].combine;
00907
00908
00909 if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
00910 return 2;
00911
00912 return 1;
00913 }
00914
00915
00916
00917
00918
00919
00920
00921
00922
00923
00924
00925
00926
00927
00928
00929
00930
00931
00932
00933
00934
00935
00936
00937 cppchar_t
00938 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
00939 const uchar *limit, int identifier_pos,
00940 struct normalize_state *nst)
00941 {
00942 cppchar_t result, c;
00943 unsigned int length;
00944 const uchar *str = *pstr;
00945 const uchar *base = str - 2;
00946
00947 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
00948 cpp_error (pfile, CPP_DL_WARNING,
00949 "universal character names are only valid in C++ and C99");
00950 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
00951 cpp_error (pfile, CPP_DL_WARNING,
00952 "the meaning of '\\%c' is different in traditional C",
00953 (int) str[-1]);
00954
00955 if (str[-1] == 'u')
00956 length = 4;
00957 else if (str[-1] == 'U')
00958 length = 8;
00959 else
00960 {
00961 cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
00962 length = 4;
00963 }
00964
00965 result = 0;
00966 do
00967 {
00968 c = *str;
00969 if (!ISXDIGIT (c))
00970 break;
00971 str++;
00972 result = (result << 4) + hex_value (c);
00973 }
00974 while (--length && str < limit);
00975
00976
00977
00978
00979 if (length && identifier_pos)
00980 return 0;
00981
00982 *pstr = str;
00983 if (length)
00984 {
00985 cpp_error (pfile, CPP_DL_ERROR,
00986 "incomplete universal character name %.*s",
00987 (int) (str - base), base);
00988 result = 1;
00989 }
00990
00991
00992 else if ((result < 0xa0
00993 && (result != 0x24 && result != 0x40 && result != 0x60))
00994 || (result & 0x80000000)
00995 || (result >= 0xD800 && result <= 0xDFFF))
00996 {
00997 cpp_error (pfile, CPP_DL_ERROR,
00998 "%.*s is not a valid universal character",
00999 (int) (str - base), base);
01000 result = 1;
01001 }
01002 else if (identifier_pos && result == 0x24
01003 && CPP_OPTION (pfile, dollars_in_ident))
01004 {
01005 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
01006 {
01007 CPP_OPTION (pfile, warn_dollars) = 0;
01008 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
01009 }
01010 NORMALIZE_STATE_UPDATE_IDNUM (nst);
01011 }
01012 else if (identifier_pos)
01013 {
01014 int validity = ucn_valid_in_identifier (pfile, result, nst);
01015
01016 if (validity == 0)
01017 cpp_error (pfile, CPP_DL_ERROR,
01018 "universal character %.*s is not valid in an identifier",
01019 (int) (str - base), base);
01020 else if (validity == 2 && identifier_pos == 1)
01021 cpp_error (pfile, CPP_DL_ERROR,
01022 "universal character %.*s is not valid at the start of an identifier",
01023 (int) (str - base), base);
01024 }
01025
01026 if (result == 0)
01027 result = 1;
01028
01029 return result;
01030 }
01031
01032
01033
01034
01035 static const uchar *
01036 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
01037 struct _cpp_strbuf *tbuf, bool wide)
01038 {
01039 cppchar_t ucn;
01040 uchar buf[6];
01041 uchar *bufp = buf;
01042 size_t bytesleft = 6;
01043 int rval;
01044 struct cset_converter cvt
01045 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
01046 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
01047
01048 from++;
01049 ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
01050
01051 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
01052 if (rval)
01053 {
01054 errno = rval;
01055 cpp_errno (pfile, CPP_DL_ERROR,
01056 "converting UCN to source character set");
01057 }
01058 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
01059 cpp_errno (pfile, CPP_DL_ERROR,
01060 "converting UCN to execution character set");
01061
01062 return from;
01063 }
01064
01065
01066
01067
01068
01069
01070 static void
01071 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
01072 struct _cpp_strbuf *tbuf, bool wide)
01073 {
01074 if (wide)
01075 {
01076
01077
01078 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
01079 size_t width = CPP_OPTION (pfile, wchar_precision);
01080 size_t cwidth = CPP_OPTION (pfile, char_precision);
01081 size_t cmask = width_to_mask (cwidth);
01082 size_t nbwc = width / cwidth;
01083 size_t i;
01084 size_t off = tbuf->len;
01085 cppchar_t c;
01086
01087 if (tbuf->len + nbwc > tbuf->asize)
01088 {
01089 tbuf->asize += OUTBUF_BLOCK_SIZE;
01090 tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
01091 }
01092
01093 for (i = 0; i < nbwc; i++)
01094 {
01095 c = n & cmask;
01096 n >>= cwidth;
01097 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
01098 }
01099 tbuf->len += nbwc;
01100 }
01101 else
01102 {
01103
01104
01105 if (tbuf->len + 1 > tbuf->asize)
01106 {
01107 tbuf->asize += OUTBUF_BLOCK_SIZE;
01108 tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
01109 }
01110 tbuf->text[tbuf->len++] = n;
01111 }
01112 }
01113
01114
01115
01116
01117
01118
01119
01120 static const uchar *
01121 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
01122 struct _cpp_strbuf *tbuf, bool wide)
01123 {
01124 cppchar_t c, n = 0, overflow = 0;
01125 int digits_found = 0;
01126 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
01127 : CPP_OPTION (pfile, char_precision));
01128 size_t mask = width_to_mask (width);
01129
01130 if (CPP_WTRADITIONAL (pfile))
01131 cpp_error (pfile, CPP_DL_WARNING,
01132 "the meaning of '\\x' is different in traditional C");
01133
01134 from++;
01135 while (from < limit)
01136 {
01137 c = *from;
01138 if (! hex_p (c))
01139 break;
01140 from++;
01141 overflow |= n ^ (n << 4 >> 4);
01142 n = (n << 4) + hex_value (c);
01143 digits_found = 1;
01144 }
01145
01146 if (!digits_found)
01147 {
01148 cpp_error (pfile, CPP_DL_ERROR,
01149 "\\x used with no following hex digits");
01150 return from;
01151 }
01152
01153 if (overflow | (n != (n & mask)))
01154 {
01155 cpp_error (pfile, CPP_DL_PEDWARN,
01156 "hex escape sequence out of range");
01157 n &= mask;
01158 }
01159
01160 emit_numeric_escape (pfile, n, tbuf, wide);
01161
01162 return from;
01163 }
01164
01165
01166
01167
01168
01169
01170
01171 static const uchar *
01172 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
01173 struct _cpp_strbuf *tbuf, bool wide)
01174 {
01175 size_t count = 0;
01176 cppchar_t c, n = 0;
01177 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
01178 : CPP_OPTION (pfile, char_precision));
01179 size_t mask = width_to_mask (width);
01180 bool overflow = false;
01181
01182 while (from < limit && count++ < 3)
01183 {
01184 c = *from;
01185 if (c < '0' || c > '7')
01186 break;
01187 from++;
01188 overflow |= n ^ (n << 3 >> 3);
01189 n = (n << 3) + c - '0';
01190 }
01191
01192 if (n != (n & mask))
01193 {
01194 cpp_error (pfile, CPP_DL_PEDWARN,
01195 "octal escape sequence out of range");
01196 n &= mask;
01197 }
01198
01199 emit_numeric_escape (pfile, n, tbuf, wide);
01200
01201 return from;
01202 }
01203
01204
01205
01206
01207
01208 static const uchar *
01209 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
01210 struct _cpp_strbuf *tbuf, bool wide)
01211 {
01212
01213 #if HOST_CHARSET == HOST_CHARSET_ASCII
01214 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
01215 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
01216 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
01217 #else
01218 #error "unknown host character set"
01219 #endif
01220
01221 uchar c;
01222 struct cset_converter cvt
01223 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
01224
01225 c = *from;
01226 switch (c)
01227 {
01228
01229 case 'u': case 'U':
01230 return convert_ucn (pfile, from, limit, tbuf, wide);
01231
01232 case 'x':
01233 return convert_hex (pfile, from, limit, tbuf, wide);
01234 break;
01235
01236 case '0': case '1': case '2': case '3':
01237 case '4': case '5': case '6': case '7':
01238 return convert_oct (pfile, from, limit, tbuf, wide);
01239
01240
01241
01242 case '\\': case '\'': case '"': case '?': break;
01243
01244 case '(': case '{': case '[': case '%':
01245
01246
01247
01248
01249 if (CPP_PEDANTIC (pfile))
01250 goto unknown;
01251 break;
01252
01253 case 'b': c = charconsts[1]; break;
01254 case 'f': c = charconsts[3]; break;
01255 case 'n': c = charconsts[4]; break;
01256 case 'r': c = charconsts[5]; break;
01257 case 't': c = charconsts[6]; break;
01258 case 'v': c = charconsts[7]; break;
01259
01260 case 'a':
01261 if (CPP_WTRADITIONAL (pfile))
01262 cpp_error (pfile, CPP_DL_WARNING,
01263 "the meaning of '\\a' is different in traditional C");
01264 c = charconsts[0];
01265 break;
01266
01267 case 'e': case 'E':
01268 if (CPP_PEDANTIC (pfile))
01269 cpp_error (pfile, CPP_DL_PEDWARN,
01270 "non-ISO-standard escape sequence, '\\%c'", (int) c);
01271 c = charconsts[2];
01272 break;
01273
01274 default:
01275 unknown:
01276 if (ISGRAPH (c))
01277 cpp_error (pfile, CPP_DL_PEDWARN,
01278 "unknown escape sequence '\\%c'", (int) c);
01279 else
01280 {
01281
01282
01283 char buf[32];
01284 sprintf(buf, "%03o", (int) c);
01285 cpp_error (pfile, CPP_DL_PEDWARN,
01286 "unknown escape sequence: '\\%s'", buf);
01287 }
01288 }
01289
01290
01291 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
01292 cpp_errno (pfile, CPP_DL_ERROR,
01293 "converting escape sequence to execution character set");
01294
01295 return from + 1;
01296 }
01297
01298
01299
01300
01301
01302
01303
01304 bool
01305 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
01306 cpp_string *to, bool wide)
01307 {
01308 struct _cpp_strbuf tbuf;
01309 const uchar *p, *base, *limit;
01310 size_t i;
01311 struct cset_converter cvt
01312 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
01313
01314 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
01315 tbuf.text = XNEWVEC (uchar, tbuf.asize);
01316 tbuf.len = 0;
01317
01318 for (i = 0; i < count; i++)
01319 {
01320 p = from[i].text;
01321 if (*p == 'L') p++;
01322 p++;
01323 limit = from[i].text + from[i].len - 1;
01324
01325 for (;;)
01326 {
01327 base = p;
01328 while (p < limit && *p != '\\')
01329 p++;
01330 if (p > base)
01331 {
01332
01333
01334 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
01335 goto fail;
01336 }
01337 if (p == limit)
01338 break;
01339
01340 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
01341 }
01342 }
01343
01344
01345 emit_numeric_escape (pfile, 0, &tbuf, wide);
01346 tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
01347 to->text = tbuf.text;
01348 to->len = tbuf.len;
01349 return true;
01350
01351 fail:
01352 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
01353 free (tbuf.text);
01354 return false;
01355 }
01356
01357
01358
01359 bool
01360 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
01361 size_t count, cpp_string *to, bool wide)
01362 {
01363 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
01364 bool retval;
01365
01366 pfile->narrow_cset_desc.func = convert_no_conversion;
01367 pfile->narrow_cset_desc.cd = (iconv_t) -1;
01368
01369 retval = cpp_interpret_string (pfile, from, count, to, wide);
01370
01371 pfile->narrow_cset_desc = save_narrow_cset_desc;
01372 return retval;
01373 }
01374
01375
01376
01377
01378
01379
01380 static cppchar_t
01381 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
01382 unsigned int *pchars_seen, int *unsignedp)
01383 {
01384 size_t width = CPP_OPTION (pfile, char_precision);
01385 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
01386 size_t mask = width_to_mask (width);
01387 size_t i;
01388 cppchar_t result, c;
01389 bool unsigned_p;
01390
01391
01392
01393
01394
01395
01396
01397
01398
01399
01400
01401 result = 0;
01402 for (i = 0; i < str.len - 1; i++)
01403 {
01404 c = str.text[i] & mask;
01405 if (width < BITS_PER_CPPCHAR_T)
01406 result = (result << width) | c;
01407 else
01408 result = c;
01409 }
01410
01411 if (i > max_chars)
01412 {
01413 i = max_chars;
01414 cpp_error (pfile, CPP_DL_WARNING,
01415 "character constant too long for its type");
01416 }
01417 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
01418 cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
01419
01420
01421 if (i > 1)
01422 unsigned_p = 0;
01423 else
01424 unsigned_p = CPP_OPTION (pfile, unsigned_char);
01425
01426
01427
01428
01429
01430 if (i > 1)
01431 width = CPP_OPTION (pfile, int_precision);
01432 if (width < BITS_PER_CPPCHAR_T)
01433 {
01434 mask = ((cppchar_t) 1 << width) - 1;
01435 if (unsigned_p || !(result & (1 << (width - 1))))
01436 result &= mask;
01437 else
01438 result |= ~mask;
01439 }
01440 *pchars_seen = i;
01441 *unsignedp = unsigned_p;
01442 return result;
01443 }
01444
01445
01446
01447
01448
01449 static cppchar_t
01450 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
01451 unsigned int *pchars_seen, int *unsignedp)
01452 {
01453 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
01454 size_t width = CPP_OPTION (pfile, wchar_precision);
01455 size_t cwidth = CPP_OPTION (pfile, char_precision);
01456 size_t mask = width_to_mask (width);
01457 size_t cmask = width_to_mask (cwidth);
01458 size_t nbwc = width / cwidth;
01459 size_t off, i;
01460 cppchar_t result = 0, c;
01461
01462
01463
01464
01465 off = str.len - (nbwc * 2);
01466 result = 0;
01467 for (i = 0; i < nbwc; i++)
01468 {
01469 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
01470 result = (result << cwidth) | (c & cmask);
01471 }
01472
01473
01474
01475
01476 if (off > 0)
01477 cpp_error (pfile, CPP_DL_WARNING,
01478 "character constant too long for its type");
01479
01480
01481
01482 if (width < BITS_PER_CPPCHAR_T)
01483 {
01484 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
01485 result &= mask;
01486 else
01487 result |= ~mask;
01488 }
01489
01490 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
01491 *pchars_seen = 1;
01492 return result;
01493 }
01494
01495
01496
01497
01498
01499 cppchar_t
01500 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
01501 unsigned int *pchars_seen, int *unsignedp)
01502 {
01503 cpp_string str = { 0, 0 };
01504 bool wide = (token->type == CPP_WCHAR);
01505 cppchar_t result;
01506
01507
01508 if (token->val.str.len == (size_t) (2 + wide))
01509 {
01510 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
01511 return 0;
01512 }
01513 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
01514 return 0;
01515
01516 if (wide)
01517 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
01518 else
01519 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
01520
01521 if (str.text != token->val.str.text)
01522 free ((void *)str.text);
01523
01524 return result;
01525 }
01526
01527
01528
01529
01530 cpp_hashnode *
01531 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
01532 {
01533
01534
01535 uchar * buf = (uchar *) alloca (len + 1);
01536 uchar * bufp = buf;
01537 size_t idp;
01538
01539 for (idp = 0; idp < len; idp++)
01540 if (id[idp] != '\\')
01541 *bufp++ = id[idp];
01542 else
01543 {
01544 unsigned length = id[idp+1] == 'u' ? 4 : 8;
01545 cppchar_t value = 0;
01546 size_t bufleft = len - (bufp - buf);
01547 int rval;
01548
01549 idp += 2;
01550 while (length && idp < len && ISXDIGIT (id[idp]))
01551 {
01552 value = (value << 4) + hex_value (id[idp]);
01553 idp++;
01554 length--;
01555 }
01556 idp--;
01557
01558
01559
01560 if (value == 0x24)
01561 {
01562 *bufp++ = '$';
01563 continue;
01564 }
01565
01566 rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
01567 if (rval)
01568 {
01569 errno = rval;
01570 cpp_errno (pfile, CPP_DL_ERROR,
01571 "converting UCN to source character set");
01572 break;
01573 }
01574 }
01575
01576 return CPP_HASHNODE (ht_lookup (pfile->hash_table,
01577 buf, bufp - buf, HT_ALLOC));
01578 }
01579
01580
01581
01582
01583
01584
01585
01586
01587
01588
01589
01590 uchar *
01591 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
01592 uchar *input, size_t size, size_t len, off_t *st_size)
01593 {
01594 struct cset_converter input_cset;
01595 struct _cpp_strbuf to;
01596
01597 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
01598 if (input_cset.func == convert_no_conversion)
01599 {
01600 to.text = input;
01601 to.asize = size;
01602 to.len = len;
01603 }
01604 else
01605 {
01606 to.asize = MAX (65536, len);
01607 to.text = XNEWVEC (uchar, to.asize);
01608 to.len = 0;
01609
01610 if (!APPLY_CONVERSION (input_cset, input, len, &to))
01611 cpp_error (pfile, CPP_DL_ERROR,
01612 "failure to convert %s to %s",
01613 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
01614
01615 free (input);
01616 }
01617
01618
01619 if (input_cset.func == convert_using_iconv)
01620 iconv_close (input_cset.cd);
01621
01622
01623
01624 if (to.len + 4096 < to.asize || to.len >= to.asize)
01625 to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
01626
01627
01628
01629
01630
01631 if (to.text[to.len - 1] == '\r')
01632 to.text[to.len] = '\r';
01633 else
01634 to.text[to.len] = '\n';
01635
01636 *st_size = to.len;
01637 return to.text;
01638 }
01639
01640
01641 const char *
01642 _cpp_default_encoding (void)
01643 {
01644 const char *current_encoding = NULL;
01645
01646
01647
01648
01649
01650
01651
01652
01653
01654
01655
01656
01657
01658
01659
01660
01661
01662
01663
01664 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
01665 setlocale (LC_CTYPE, "");
01666 current_encoding = nl_langinfo (CODESET);
01667 #endif
01668 if (current_encoding == NULL || *current_encoding == '\0')
01669 current_encoding = SOURCE_CHARSET;
01670
01671 return current_encoding;
01672 }