00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "config.h"
00022 #include "system.h"
00023 #include "cpplib.h"
00024 #include "internal.h"
00025 #include "ucnid.h"
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070
00071
/* Stand-ins used when HAVE_ICONV is false: opening or running a
   conversion always fails with errno set to EINVAL, and closing a
   descriptor does nothing.  */
#if !HAVE_ICONV
# define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
# define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
# define iconv_close(x) (void)0
# define ICONV_CONST
#endif

/* SOURCE_CHARSET names the charset used internally for source text, and
   LAST_POSSIBLY_BASIC_SOURCE_CHAR is the upper bound tested by
   cpp_host_to_exec_charset; both depend on the host character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
# define SOURCE_CHARSET "UTF-8"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
# define SOURCE_CHARSET "UTF-EBCDIC"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
# error "Unrecognized basic host character set"
#endif

/* Hosts whose <errno.h> lacks EILSEQ report malformed input as EINVAL.  */
#ifndef EILSEQ
# define EILSEQ EINVAL
#endif
00095
00096
00097
00098
00099 struct _cpp_strbuf
00100 {
00101 uchar *text;
00102 size_t asize;
00103 size_t len;
00104 };
00105
00106
00107
00108
00109 #define OUTBUF_BLOCK_SIZE 256
00110
00111
00112
00113
00114
00115
00116
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169 static inline int
00170 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
00171 cppchar_t *cp)
00172 {
00173 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
00174 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00175
00176 cppchar_t c;
00177 const uchar *inbuf = *inbufp;
00178 size_t nbytes, i;
00179
00180 if (*inbytesleftp < 1)
00181 return EINVAL;
00182
00183 c = *inbuf;
00184 if (c < 0x80)
00185 {
00186 *cp = c;
00187 *inbytesleftp -= 1;
00188 *inbufp += 1;
00189 return 0;
00190 }
00191
00192
00193
00194 for (nbytes = 2; nbytes < 7; nbytes++)
00195 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
00196 goto found;
00197 return EILSEQ;
00198 found:
00199
00200 if (*inbytesleftp < nbytes)
00201 return EINVAL;
00202
00203 c = (c & masks[nbytes-1]);
00204 inbuf++;
00205 for (i = 1; i < nbytes; i++)
00206 {
00207 cppchar_t n = *inbuf++;
00208 if ((n & 0xC0) != 0x80)
00209 return EILSEQ;
00210 c = ((c << 6) + (n & 0x3F));
00211 }
00212
00213
00214 if (c <= 0x7F && nbytes > 1) return EILSEQ;
00215 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
00216 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
00217 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
00218 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
00219
00220
00221 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
00222
00223 *cp = c;
00224 *inbufp = inbuf;
00225 *inbytesleftp -= nbytes;
00226 return 0;
00227 }
00228
00229 static inline int
00230 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
00231 {
00232 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
00233 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
00234 size_t nbytes;
00235 uchar buf[6], *p = &buf[6];
00236 uchar *outbuf = *outbufp;
00237
00238 nbytes = 1;
00239 if (c < 0x80)
00240 *--p = c;
00241 else
00242 {
00243 do
00244 {
00245 *--p = ((c & 0x3F) | 0x80);
00246 c >>= 6;
00247 nbytes++;
00248 }
00249 while (c >= 0x3F || (c & limits[nbytes-1]));
00250 *--p = (c | masks[nbytes-1]);
00251 }
00252
00253 if (*outbytesleftp < nbytes)
00254 return E2BIG;
00255
00256 while (p < &buf[6])
00257 *outbuf++ = *p++;
00258 *outbytesleftp -= nbytes;
00259 *outbufp = outbuf;
00260 return 0;
00261 }
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276
00277
00278
00279
00280 static inline int
00281 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00282 uchar **outbufp, size_t *outbytesleftp)
00283 {
00284 uchar *outbuf;
00285 cppchar_t s = 0;
00286 int rval;
00287
00288
00289 if (*outbytesleftp < 4)
00290 return E2BIG;
00291
00292 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
00293 if (rval)
00294 return rval;
00295
00296 outbuf = *outbufp;
00297 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
00298 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
00299 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
00300 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
00301
00302 *outbufp += 4;
00303 *outbytesleftp -= 4;
00304 return 0;
00305 }
00306
00307 static inline int
00308 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00309 uchar **outbufp, size_t *outbytesleftp)
00310 {
00311 cppchar_t s;
00312 int rval;
00313 const uchar *inbuf;
00314
00315 if (*inbytesleftp < 4)
00316 return EINVAL;
00317
00318 inbuf = *inbufp;
00319
00320 s = inbuf[bigend ? 0 : 3] << 24;
00321 s += inbuf[bigend ? 1 : 2] << 16;
00322 s += inbuf[bigend ? 2 : 1] << 8;
00323 s += inbuf[bigend ? 3 : 0];
00324
00325 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
00326 return EILSEQ;
00327
00328 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
00329 if (rval)
00330 return rval;
00331
00332 *inbufp += 4;
00333 *inbytesleftp -= 4;
00334 return 0;
00335 }
00336
00337 static inline int
00338 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00339 uchar **outbufp, size_t *outbytesleftp)
00340 {
00341 int rval;
00342 cppchar_t s = 0;
00343 const uchar *save_inbuf = *inbufp;
00344 size_t save_inbytesleft = *inbytesleftp;
00345 uchar *outbuf = *outbufp;
00346
00347 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
00348 if (rval)
00349 return rval;
00350
00351 if (s > 0x0010FFFF)
00352 {
00353 *inbufp = save_inbuf;
00354 *inbytesleftp = save_inbytesleft;
00355 return EILSEQ;
00356 }
00357
00358 if (s < 0xFFFF)
00359 {
00360 if (*outbytesleftp < 2)
00361 {
00362 *inbufp = save_inbuf;
00363 *inbytesleftp = save_inbytesleft;
00364 return E2BIG;
00365 }
00366 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
00367 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
00368
00369 *outbufp += 2;
00370 *outbytesleftp -= 2;
00371 return 0;
00372 }
00373 else
00374 {
00375 cppchar_t hi, lo;
00376
00377 if (*outbytesleftp < 4)
00378 {
00379 *inbufp = save_inbuf;
00380 *inbytesleftp = save_inbytesleft;
00381 return E2BIG;
00382 }
00383
00384 hi = (s - 0x10000) / 0x400 + 0xD800;
00385 lo = (s - 0x10000) % 0x400 + 0xDC00;
00386
00387
00388
00389 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
00390 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
00391 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
00392 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
00393
00394 *outbufp += 4;
00395 *outbytesleftp -= 4;
00396 return 0;
00397 }
00398 }
00399
00400 static inline int
00401 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
00402 uchar **outbufp, size_t *outbytesleftp)
00403 {
00404 cppchar_t s;
00405 const uchar *inbuf = *inbufp;
00406 int rval;
00407
00408 if (*inbytesleftp < 2)
00409 return EINVAL;
00410 s = inbuf[bigend ? 0 : 1] << 8;
00411 s += inbuf[bigend ? 1 : 0];
00412
00413
00414 if (s >= 0xDC00 && s <= 0xDFFF)
00415 return EILSEQ;
00416
00417 else if (s >= 0xD800 && s <= 0xDBFF)
00418 {
00419 cppchar_t hi = s, lo;
00420 if (*inbytesleftp < 4)
00421 return EINVAL;
00422
00423 lo = inbuf[bigend ? 2 : 3] << 8;
00424 lo += inbuf[bigend ? 3 : 2];
00425
00426 if (lo < 0xDC00 || lo > 0xDFFF)
00427 return EILSEQ;
00428
00429 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
00430 }
00431
00432 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
00433 if (rval)
00434 return rval;
00435
00436
00437
00438 if (s <= 0xFFFF)
00439 {
00440 *inbufp += 2;
00441 *inbytesleftp -= 2;
00442 }
00443 else
00444 {
00445 *inbufp += 4;
00446 *inbytesleftp -= 4;
00447 }
00448 return 0;
00449 }
00450
00451
00452
00453
00454
00455 static inline bool
00456 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
00457 uchar **, size_t *),
00458 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
00459 {
00460 const uchar *inbuf;
00461 uchar *outbuf;
00462 size_t inbytesleft, outbytesleft;
00463 int rval;
00464
00465 inbuf = from;
00466 inbytesleft = flen;
00467 outbuf = to->text + to->len;
00468 outbytesleft = to->asize - to->len;
00469
00470 for (;;)
00471 {
00472 do
00473 rval = one_conversion (cd, &inbuf, &inbytesleft,
00474 &outbuf, &outbytesleft);
00475 while (inbytesleft && !rval);
00476
00477 if (__builtin_expect (inbytesleft == 0, 1))
00478 {
00479 to->len = to->asize - outbytesleft;
00480 return true;
00481 }
00482 if (rval != E2BIG)
00483 {
00484 errno = rval;
00485 return false;
00486 }
00487
00488 outbytesleft += OUTBUF_BLOCK_SIZE;
00489 to->asize += OUTBUF_BLOCK_SIZE;
00490 to->text = xrealloc (to->text, to->asize);
00491 outbuf = to->text + to->asize - outbytesleft;
00492 }
00493 }
00494
00495
00496
00497
00498
00499
00500
00501
00502
00503
00504
00505
00506 static bool
00507 convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
00508 struct _cpp_strbuf *to)
00509 {
00510 return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
00511 }
00512
00513 static bool
00514 convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
00515 struct _cpp_strbuf *to)
00516 {
00517 return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
00518 }
00519
00520 static bool
00521 convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
00522 struct _cpp_strbuf *to)
00523 {
00524 return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
00525 }
00526
00527 static bool
00528 convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
00529 struct _cpp_strbuf *to)
00530 {
00531 return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
00532 }
00533
00534
00535 static bool
00536 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
00537 const uchar *from, size_t flen, struct _cpp_strbuf *to)
00538 {
00539 if (to->len + flen > to->asize)
00540 {
00541 to->asize = to->len + flen;
00542 to->text = xrealloc (to->text, to->asize);
00543 }
00544 memcpy (to->text + to->len, from, flen);
00545 to->len += flen;
00546 return true;
00547 }
00548
00549
00550
#if HAVE_ICONV
/* Converter that delegates to iconv with descriptor CD, appending the
   converted form of the FLEN bytes at FROM to TO.  TO is enlarged by
   OUTBUF_BLOCK_SIZE each time iconv reports E2BIG.  Returns true once
   all input has been consumed (updating TO->len), and false if
   resetting the descriptor fails or iconv stops with any errno other
   than E2BIG.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
		     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src;
  char *dst;
  size_t src_left, dst_left;

  /* Put the descriptor back into its initial state before use.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  src = (ICONV_CONST char *)from;
  src_left = flen;
  dst = (char *)to->text + to->len;
  dst_left = to->asize - to->len;

  while (1)
    {
      size_t used;

      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
	break;

      if (errno != E2BIG)
	return false;

      /* Output buffer exhausted: grow it and recompute the write
	 cursor, since xrealloc may have moved the storage.  */
      used = to->asize - dst_left;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = xrealloc (to->text, to->asize);
      dst = (char *)to->text + used;
      dst_left += OUTBUF_BLOCK_SIZE;
    }

  to->len = to->asize - dst_left;
  return true;
}
#else
/* Without iconv there is no such converter; the name expands to 0 so
   that comparisons against it still compile.  */
#define convert_using_iconv 0
#endif
00589
00590
00591
00592
/* Run the converter described by CONVERTER (an expression with `func'
   and `cd' members) over the FLEN bytes at FROM, appending to the
   buffer TO.  Evaluates to the converter's boolean result.  CONVERTER
   is evaluated twice, so it must have no side effects.  Every
   parameter is parenthesized so that any expression can be passed
   safely.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   (CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO))
00595
00596 struct conversion
00597 {
00598 const char *pair;
00599 convert_f func;
00600 iconv_t fake_cd;
00601 };
00602 static const struct conversion conversion_tab[] = {
00603 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
00604 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
00605 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
00606 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
00607 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
00608 { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
00609 { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
00610 { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
00611 };
00612
00613
00614
00615
00616
00617
00618 static struct cset_converter
00619 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
00620 {
00621 struct cset_converter ret;
00622 char *pair;
00623 size_t i;
00624
00625 if (!strcasecmp (to, from))
00626 {
00627 ret.func = convert_no_conversion;
00628 ret.cd = (iconv_t) -1;
00629 return ret;
00630 }
00631
00632 pair = alloca(strlen(to) + strlen(from) + 2);
00633
00634 strcpy(pair, from);
00635 strcat(pair, "/");
00636 strcat(pair, to);
00637 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
00638 if (!strcasecmp (pair, conversion_tab[i].pair))
00639 {
00640 ret.func = conversion_tab[i].func;
00641 ret.cd = conversion_tab[i].fake_cd;
00642 return ret;
00643 }
00644
00645
00646 if (HAVE_ICONV)
00647 {
00648 ret.func = convert_using_iconv;
00649 ret.cd = iconv_open (to, from);
00650
00651 if (ret.cd == (iconv_t) -1)
00652 {
00653 if (errno == EINVAL)
00654 cpp_error (pfile, CPP_DL_ERROR,
00655 "conversion from %s to %s not supported by iconv",
00656 from, to);
00657 else
00658 cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
00659
00660 ret.func = convert_no_conversion;
00661 }
00662 }
00663 else
00664 {
00665 cpp_error (pfile, CPP_DL_ERROR,
00666 "no iconv implementation, cannot convert from %s to %s",
00667 from, to);
00668 ret.func = convert_no_conversion;
00669 ret.cd = (iconv_t) -1;
00670 }
00671 return ret;
00672 }
00673
00674
00675
00676
00677
00678
00679 void
00680 cpp_init_iconv (cpp_reader *pfile)
00681 {
00682 const char *ncset = CPP_OPTION (pfile, narrow_charset);
00683 const char *wcset = CPP_OPTION (pfile, wide_charset);
00684 const char *default_wcset;
00685
00686 bool be = CPP_OPTION (pfile, bytes_big_endian);
00687
00688 if (CPP_OPTION (pfile, wchar_precision) >= 32)
00689 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
00690 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
00691 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
00692 else
00693
00694
00695 default_wcset = SOURCE_CHARSET;
00696
00697 if (!ncset)
00698 ncset = SOURCE_CHARSET;
00699 if (!wcset)
00700 wcset = default_wcset;
00701
00702 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
00703 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
00704 }
00705
00706
00707 void
00708 _cpp_destroy_iconv (cpp_reader *pfile)
00709 {
00710 if (HAVE_ICONV)
00711 {
00712 if (pfile->narrow_cset_desc.func == convert_using_iconv)
00713 iconv_close (pfile->narrow_cset_desc.cd);
00714 if (pfile->wide_cset_desc.func == convert_using_iconv)
00715 iconv_close (pfile->wide_cset_desc.cd);
00716 }
00717 }
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731 cppchar_t
00732 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
00733 {
00734 uchar sbuf[1];
00735 struct _cpp_strbuf tbuf;
00736
00737
00738
00739
00740 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
00741 {
00742 cpp_error (pfile, CPP_DL_ICE,
00743 "character 0x%lx is not in the basic source character set\n",
00744 (unsigned long)c);
00745 return 0;
00746 }
00747
00748
00749
00750
00751 sbuf[0] = c;
00752
00753
00754 tbuf.asize = 1;
00755 tbuf.text = xmalloc (tbuf.asize);
00756 tbuf.len = 0;
00757
00758 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
00759 {
00760 cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
00761 return 0;
00762 }
00763 if (tbuf.len != 1)
00764 {
00765 cpp_error (pfile, CPP_DL_ICE,
00766 "character 0x%lx is not unibyte in execution character set",
00767 (unsigned long)c);
00768 return 0;
00769 }
00770 c = tbuf.text[0];
00771 free(tbuf.text);
00772 return c;
00773 }
00774
00775
00776
00777
00778
00779 static inline size_t
00780 width_to_mask (size_t width)
00781 {
00782 width = MIN (width, BITS_PER_CPPCHAR_T);
00783 if (width >= CHAR_BIT * sizeof (size_t))
00784 return ~(size_t) 0;
00785 else
00786 return ((size_t) 1 << width) - 1;
00787 }
00788
00789
00790
00791
00792
00793
00794
00795 static int
00796 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c)
00797 {
00798 int mn, mx, md;
00799
00800 mn = -1;
00801 mx = ARRAY_SIZE (ucnranges);
00802 while (mx - mn > 1)
00803 {
00804 md = (mn + mx) / 2;
00805 if (c < ucnranges[md].lo)
00806 mx = md;
00807 else if (c > ucnranges[md].hi)
00808 mn = md;
00809 else
00810 goto found;
00811 }
00812 return 0;
00813
00814 found:
00815
00816
00817
00818 if (CPP_PEDANTIC (pfile)
00819 && ((CPP_OPTION (pfile, c99) && !(ucnranges[md].flags & C99))
00820 || (CPP_OPTION (pfile, cplusplus)
00821 && !(ucnranges[md].flags & CXX))))
00822 return 0;
00823
00824
00825 if (CPP_OPTION (pfile, c99) && (ucnranges[md].flags & DIG))
00826 return 2;
00827
00828 return 1;
00829 }
00830
00831
00832
00833
00834
00835
00836
00837
00838
00839
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849
00850
00851
00852
00853
00854 cppchar_t
00855 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
00856 const uchar *limit, int identifier_pos)
00857 {
00858 cppchar_t result, c;
00859 unsigned int length;
00860 const uchar *str = *pstr;
00861 const uchar *base = str - 2;
00862
00863 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
00864 cpp_error (pfile, CPP_DL_WARNING,
00865 "universal character names are only valid in C++ and C99");
00866 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
00867 cpp_error (pfile, CPP_DL_WARNING,
00868 "the meaning of '\\%c' is different in traditional C",
00869 (int) str[-1]);
00870
00871 if (str[-1] == 'u')
00872 length = 4;
00873 else if (str[-1] == 'U')
00874 length = 8;
00875 else
00876 abort();
00877
00878 result = 0;
00879 do
00880 {
00881 c = *str;
00882 if (!ISXDIGIT (c))
00883 break;
00884 str++;
00885 result = (result << 4) + hex_value (c);
00886 }
00887 while (--length && str < limit);
00888
00889 *pstr = str;
00890 if (length)
00891 {
00892
00893 cpp_error (pfile, CPP_DL_ERROR,
00894 "incomplete universal character name %.*s",
00895 (int) (str - base), base);
00896 result = 1;
00897 }
00898
00899
00900 else if ((result < 0xa0
00901 && (result != 0x24 && result != 0x40 && result != 0x60))
00902 || (result & 0x80000000)
00903 || (result >= 0xD800 && result <= 0xDFFF))
00904 {
00905 cpp_error (pfile, CPP_DL_ERROR,
00906 "%.*s is not a valid universal character",
00907 (int) (str - base), base);
00908 result = 1;
00909 }
00910 else if (identifier_pos)
00911 {
00912 int validity = ucn_valid_in_identifier (pfile, result);
00913
00914 if (validity == 0)
00915 cpp_error (pfile, CPP_DL_ERROR,
00916 "universal character %.*s is not valid in an identifier",
00917 (int) (str - base), base);
00918 else if (validity == 2 && identifier_pos == 1)
00919 cpp_error (pfile, CPP_DL_ERROR,
00920 "universal character %.*s is not valid at the start of an identifier",
00921 (int) (str - base), base);
00922 }
00923
00924 if (result == 0)
00925 result = 1;
00926
00927 return result;
00928 }
00929
00930
00931
00932
00933 static const uchar *
00934 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
00935 struct _cpp_strbuf *tbuf, bool wide)
00936 {
00937 cppchar_t ucn;
00938 uchar buf[6];
00939 uchar *bufp = buf;
00940 size_t bytesleft = 6;
00941 int rval;
00942 struct cset_converter cvt
00943 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
00944
00945 from++;
00946 ucn = _cpp_valid_ucn (pfile, &from, limit, 0);
00947
00948 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
00949 if (rval)
00950 {
00951 errno = rval;
00952 cpp_errno (pfile, CPP_DL_ERROR,
00953 "converting UCN to source character set");
00954 }
00955 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
00956 cpp_errno (pfile, CPP_DL_ERROR,
00957 "converting UCN to execution character set");
00958
00959 return from;
00960 }
00961
00962
00963
00964
00965
00966
00967 static void
00968 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
00969 struct _cpp_strbuf *tbuf, bool wide)
00970 {
00971 if (wide)
00972 {
00973
00974
00975 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
00976 size_t width = CPP_OPTION (pfile, wchar_precision);
00977 size_t cwidth = CPP_OPTION (pfile, char_precision);
00978 size_t cmask = width_to_mask (cwidth);
00979 size_t nbwc = width / cwidth;
00980 size_t i;
00981 size_t off = tbuf->len;
00982 cppchar_t c;
00983
00984 if (tbuf->len + nbwc > tbuf->asize)
00985 {
00986 tbuf->asize += OUTBUF_BLOCK_SIZE;
00987 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
00988 }
00989
00990 for (i = 0; i < nbwc; i++)
00991 {
00992 c = n & cmask;
00993 n >>= cwidth;
00994 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
00995 }
00996 tbuf->len += nbwc;
00997 }
00998 else
00999 {
01000
01001
01002 if (tbuf->len + 1 > tbuf->asize)
01003 {
01004 tbuf->asize += OUTBUF_BLOCK_SIZE;
01005 tbuf->text = xrealloc (tbuf->text, tbuf->asize);
01006 }
01007 tbuf->text[tbuf->len++] = n;
01008 }
01009 }
01010
01011
01012
01013
01014
01015
01016
01017 static const uchar *
01018 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
01019 struct _cpp_strbuf *tbuf, bool wide)
01020 {
01021 cppchar_t c, n = 0, overflow = 0;
01022 int digits_found = 0;
01023 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
01024 : CPP_OPTION (pfile, char_precision));
01025 size_t mask = width_to_mask (width);
01026
01027 if (CPP_WTRADITIONAL (pfile))
01028 cpp_error (pfile, CPP_DL_WARNING,
01029 "the meaning of '\\x' is different in traditional C");
01030
01031 from++;
01032 while (from < limit)
01033 {
01034 c = *from;
01035 if (! hex_p (c))
01036 break;
01037 from++;
01038 overflow |= n ^ (n << 4 >> 4);
01039 n = (n << 4) + hex_value (c);
01040 digits_found = 1;
01041 }
01042
01043 if (!digits_found)
01044 {
01045 cpp_error (pfile, CPP_DL_ERROR,
01046 "\\x used with no following hex digits");
01047 return from;
01048 }
01049
01050 if (overflow | (n != (n & mask)))
01051 {
01052 cpp_error (pfile, CPP_DL_PEDWARN,
01053 "hex escape sequence out of range");
01054 n &= mask;
01055 }
01056
01057 emit_numeric_escape (pfile, n, tbuf, wide);
01058
01059 return from;
01060 }
01061
01062
01063
01064
01065
01066
01067
01068 static const uchar *
01069 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
01070 struct _cpp_strbuf *tbuf, bool wide)
01071 {
01072 size_t count = 0;
01073 cppchar_t c, n = 0;
01074 size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
01075 : CPP_OPTION (pfile, char_precision));
01076 size_t mask = width_to_mask (width);
01077 bool overflow = false;
01078
01079 while (from < limit && count++ < 3)
01080 {
01081 c = *from;
01082 if (c < '0' || c > '7')
01083 break;
01084 from++;
01085 overflow |= n ^ (n << 3 >> 3);
01086 n = (n << 3) + c - '0';
01087 }
01088
01089 if (n != (n & mask))
01090 {
01091 cpp_error (pfile, CPP_DL_PEDWARN,
01092 "octal escape sequence out of range");
01093 n &= mask;
01094 }
01095
01096 emit_numeric_escape (pfile, n, tbuf, wide);
01097
01098 return from;
01099 }
01100
01101
01102
01103
01104
01105 static const uchar *
01106 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
01107 struct _cpp_strbuf *tbuf, bool wide)
01108 {
01109
01110 #if HOST_CHARSET == HOST_CHARSET_ASCII
01111 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
01112 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
01113 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
01114 #else
01115 #error "unknown host character set"
01116 #endif
01117
01118 uchar c;
01119 struct cset_converter cvt
01120 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
01121
01122 c = *from;
01123 switch (c)
01124 {
01125
01126 case 'u': case 'U':
01127 return convert_ucn (pfile, from, limit, tbuf, wide);
01128
01129 case 'x':
01130 return convert_hex (pfile, from, limit, tbuf, wide);
01131 break;
01132
01133 case '0': case '1': case '2': case '3':
01134 case '4': case '5': case '6': case '7':
01135 return convert_oct (pfile, from, limit, tbuf, wide);
01136
01137
01138
01139 case '\\': case '\'': case '"': case '?': break;
01140
01141 case '(': case '{': case '[': case '%':
01142
01143
01144
01145
01146 if (CPP_PEDANTIC (pfile))
01147 goto unknown;
01148 break;
01149
01150 case 'b': c = charconsts[1]; break;
01151 case 'f': c = charconsts[3]; break;
01152 case 'n': c = charconsts[4]; break;
01153 case 'r': c = charconsts[5]; break;
01154 case 't': c = charconsts[6]; break;
01155 case 'v': c = charconsts[7]; break;
01156
01157 case 'a':
01158 if (CPP_WTRADITIONAL (pfile))
01159 cpp_error (pfile, CPP_DL_WARNING,
01160 "the meaning of '\\a' is different in traditional C");
01161 c = charconsts[0];
01162 break;
01163
01164 case 'e': case 'E':
01165 if (CPP_PEDANTIC (pfile))
01166 cpp_error (pfile, CPP_DL_PEDWARN,
01167 "non-ISO-standard escape sequence, '\\%c'", (int) c);
01168 c = charconsts[2];
01169 break;
01170
01171 default:
01172 unknown:
01173 if (ISGRAPH (c))
01174 cpp_error (pfile, CPP_DL_PEDWARN,
01175 "unknown escape sequence '\\%c'", (int) c);
01176 else
01177 cpp_error (pfile, CPP_DL_PEDWARN,
01178 "unknown escape sequence: '\\%03o'", (int) c);
01179 }
01180
01181
01182 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
01183 cpp_errno (pfile, CPP_DL_ERROR,
01184 "converting escape sequence to execution character set");
01185
01186 return from + 1;
01187 }
01188
01189
01190
01191
01192
01193
01194
01195 bool
01196 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
01197 cpp_string *to, bool wide)
01198 {
01199 struct _cpp_strbuf tbuf;
01200 const uchar *p, *base, *limit;
01201 size_t i;
01202 struct cset_converter cvt
01203 = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
01204
01205 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
01206 tbuf.text = xmalloc (tbuf.asize);
01207 tbuf.len = 0;
01208
01209 for (i = 0; i < count; i++)
01210 {
01211 p = from[i].text;
01212 if (*p == 'L') p++;
01213 p++;
01214 limit = from[i].text + from[i].len - 1;
01215
01216 for (;;)
01217 {
01218 base = p;
01219 while (p < limit && *p != '\\')
01220 p++;
01221 if (p > base)
01222 {
01223
01224
01225 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
01226 goto fail;
01227 }
01228 if (p == limit)
01229 break;
01230
01231 p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
01232 }
01233 }
01234
01235
01236 emit_numeric_escape (pfile, 0, &tbuf, wide);
01237 tbuf.text = xrealloc (tbuf.text, tbuf.len);
01238 to->text = tbuf.text;
01239 to->len = tbuf.len;
01240 return true;
01241
01242 fail:
01243 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
01244 free (tbuf.text);
01245 return false;
01246 }
01247
01248
01249
01250 bool
01251 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
01252 size_t count, cpp_string *to, bool wide)
01253 {
01254 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
01255 bool retval;
01256
01257 pfile->narrow_cset_desc.func = convert_no_conversion;
01258 pfile->narrow_cset_desc.cd = (iconv_t) -1;
01259
01260 retval = cpp_interpret_string (pfile, from, count, to, wide);
01261
01262 pfile->narrow_cset_desc = save_narrow_cset_desc;
01263 return retval;
01264 }
01265
01266
01267
01268
01269
01270
01271 static cppchar_t
01272 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
01273 unsigned int *pchars_seen, int *unsignedp)
01274 {
01275 size_t width = CPP_OPTION (pfile, char_precision);
01276 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
01277 size_t mask = width_to_mask (width);
01278 size_t i;
01279 cppchar_t result, c;
01280 bool unsigned_p;
01281
01282
01283
01284
01285
01286
01287
01288
01289
01290
01291
01292 result = 0;
01293 for (i = 0; i < str.len - 1; i++)
01294 {
01295 c = str.text[i] & mask;
01296 if (width < BITS_PER_CPPCHAR_T)
01297 result = (result << width) | c;
01298 else
01299 result = c;
01300 }
01301
01302 if (i > max_chars)
01303 {
01304 i = max_chars;
01305 cpp_error (pfile, CPP_DL_WARNING,
01306 "character constant too long for its type");
01307 }
01308 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
01309 cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
01310
01311
01312 if (i > 1)
01313 unsigned_p = 0;
01314 else
01315 unsigned_p = CPP_OPTION (pfile, unsigned_char);
01316
01317
01318
01319
01320
01321 if (i > 1)
01322 width = CPP_OPTION (pfile, int_precision);
01323 if (width < BITS_PER_CPPCHAR_T)
01324 {
01325 mask = ((cppchar_t) 1 << width) - 1;
01326 if (unsigned_p || !(result & (1 << (width - 1))))
01327 result &= mask;
01328 else
01329 result |= ~mask;
01330 }
01331 *pchars_seen = i;
01332 *unsignedp = unsigned_p;
01333 return result;
01334 }
01335
01336
01337
01338
01339
01340 static cppchar_t
01341 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
01342 unsigned int *pchars_seen, int *unsignedp)
01343 {
01344 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
01345 size_t width = CPP_OPTION (pfile, wchar_precision);
01346 size_t cwidth = CPP_OPTION (pfile, char_precision);
01347 size_t mask = width_to_mask (width);
01348 size_t cmask = width_to_mask (cwidth);
01349 size_t nbwc = width / cwidth;
01350 size_t off, i;
01351 cppchar_t result = 0, c;
01352
01353
01354
01355
01356 off = str.len - (nbwc * 2);
01357 result = 0;
01358 for (i = 0; i < nbwc; i++)
01359 {
01360 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
01361 result = (result << cwidth) | (c & cmask);
01362 }
01363
01364
01365
01366
01367 if (off > 0)
01368 cpp_error (pfile, CPP_DL_WARNING,
01369 "character constant too long for its type");
01370
01371
01372
01373 if (width < BITS_PER_CPPCHAR_T)
01374 {
01375 if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
01376 result &= mask;
01377 else
01378 result |= ~mask;
01379 }
01380
01381 *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
01382 *pchars_seen = 1;
01383 return result;
01384 }
01385
01386
01387
01388
01389
01390 cppchar_t
01391 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
01392 unsigned int *pchars_seen, int *unsignedp)
01393 {
01394 cpp_string str = { 0, 0 };
01395 bool wide = (token->type == CPP_WCHAR);
01396 cppchar_t result;
01397
01398
01399 if (token->val.str.len == (size_t) (2 + wide))
01400 {
01401 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
01402 return 0;
01403 }
01404 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
01405 return 0;
01406
01407 if (wide)
01408 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
01409 else
01410 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
01411
01412 if (str.text != token->val.str.text)
01413 free ((void *)str.text);
01414
01415 return result;
01416 }
01417
01418
01419
01420
01421
01422
01423
01424
01425
01426
01427
01428 uchar *
01429 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
01430 uchar *input, size_t size, size_t len, off_t *st_size)
01431 {
01432 struct cset_converter input_cset;
01433 struct _cpp_strbuf to;
01434
01435 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
01436 if (input_cset.func == convert_no_conversion)
01437 {
01438 to.text = input;
01439 to.asize = size;
01440 to.len = len;
01441 }
01442 else
01443 {
01444 to.asize = MAX (65536, len);
01445 to.text = xmalloc (to.asize);
01446 to.len = 0;
01447
01448 if (!APPLY_CONVERSION (input_cset, input, len, &to))
01449 cpp_error (pfile, CPP_DL_ERROR,
01450 "failure to convert %s to %s",
01451 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
01452
01453 free (input);
01454 }
01455
01456
01457 if (input_cset.func == convert_using_iconv)
01458 iconv_close (input_cset.cd);
01459
01460
01461
01462 if (to.len + 4096 < to.asize || to.len >= to.asize)
01463 to.text = xrealloc (to.text, to.len + 1);
01464
01465
01466
01467
01468
01469 if (to.text[to.len - 1] == '\r')
01470 to.text[to.len] = '\r';
01471 else
01472 to.text[to.len] = '\n';
01473
01474 *st_size = to.len;
01475 return to.text;
01476 }
01477
01478
01479 const char *
01480 _cpp_default_encoding (void)
01481 {
01482 const char *current_encoding = NULL;
01483
01484
01485
01486
01487
01488
01489
01490
01491
01492
01493
01494
01495
01496
01497
01498
01499
01500
01501
01502 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
01503 setlocale (LC_CTYPE, "");
01504 current_encoding = nl_langinfo (CODESET);
01505 #endif
01506 if (current_encoding == NULL || *current_encoding == '\0')
01507 current_encoding = SOURCE_CHARSET;
01508
01509 return current_encoding;
01510 }