00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include <stdio.h>
00024 #include <string.h>
00025 #include <ctype.h>
00026 #include <stdbool.h>
00027 #include <stdlib.h>
00028
/* Property bits kept for every code point in flags[].  Each constant
   occupies its own bit so that several can be OR-ed together.  */
enum {
  C99           = 1 << 0,  /* listed under "[C99]" in ucnid.tab */
  CXX           = 1 << 1,  /* listed under "[CXX]" in ucnid.tab */
  digit         = 1 << 2,  /* third UnicodeData.txt field starts with 'N' */
  not_NFC       = 1 << 3,  /* line matched "; NFC_QC; N" */
  not_NFKC      = 1 << 4,  /* line matched "; NFKC_QC; N" */
  maybe_not_NFC = 1 << 5   /* line matched "; NFC_QC; M" */
};

/* Tables indexed by code point, covering 0 through 0xFFFF.  */

/* OR of the property bits above.  */
static unsigned flags[65536];

/* Up to two code points recorded by read_table as the decomposition
   of the entry; decomp[c][0] == 0 means none was recorded.  */
static unsigned short decomp[65536][2];

/* Combining class read from UnicodeData.txt.  */
static unsigned char combining_value[65536];
00041
00042
00043
/* Print the message S and a newline on stderr, then terminate the
   program with exit status 1.  Never returns.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
00050
00051
00052
00053 static void
00054 read_ucnid (const char *fname)
00055 {
00056 FILE *f = fopen (fname, "r");
00057 unsigned fl = 0;
00058
00059 if (!f)
00060 fail ("opening ucnid.tab");
00061 for (;;)
00062 {
00063 char line[256];
00064
00065 if (!fgets (line, sizeof (line), f))
00066 break;
00067 if (strcmp (line, "[C99]\n") == 0)
00068 fl = C99;
00069 else if (strcmp (line, "[CXX]\n") == 0)
00070 fl = CXX;
00071 else if (isxdigit (line[0]))
00072 {
00073 char *l = line;
00074 while (*l)
00075 {
00076 unsigned long start, end;
00077 char *endptr;
00078 start = strtoul (l, &endptr, 16);
00079 if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
00080 fail ("parsing ucnid.tab [1]");
00081 l = endptr;
00082 if (*l != '-')
00083 end = start;
00084 else
00085 {
00086 end = strtoul (l + 1, &endptr, 16);
00087 if (end < start)
00088 fail ("parsing ucnid.tab, end before start");
00089 l = endptr;
00090 if (! isspace (*l))
00091 fail ("parsing ucnid.tab, junk after range");
00092 }
00093 while (isspace (*l))
00094 l++;
00095 if (end > 0xFFFF)
00096 fail ("parsing ucnid.tab, end too large");
00097 while (start <= end)
00098 flags[start++] |= fl;
00099 }
00100 }
00101 }
00102 if (ferror (f))
00103 fail ("reading ucnid.tab");
00104 fclose (f);
00105 }
00106
00107
00108
00109
00110
00111
00112 static void
00113 read_table (char *fname)
00114 {
00115 FILE * f = fopen (fname, "r");
00116
00117 if (!f)
00118 fail ("opening UnicodeData.txt");
00119 for (;;)
00120 {
00121 char line[256];
00122 unsigned long codepoint, this_decomp[4];
00123 char *l;
00124 int i;
00125 int decomp_useful;
00126
00127 if (!fgets (line, sizeof (line), f))
00128 break;
00129 codepoint = strtoul (line, &l, 16);
00130 if (l == line || *l != ';')
00131 fail ("parsing UnicodeData.txt, reading code point");
00132 if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX)))
00133 continue;
00134
00135 do {
00136 l++;
00137 } while (*l != ';');
00138
00139
00140 if (*++l == 'N')
00141 flags[codepoint] |= digit;
00142
00143 do {
00144 l++;
00145 } while (*l != ';');
00146
00147
00148 if (! isdigit (*++l))
00149 fail ("parsing UnicodeData.txt, combining class not number");
00150 combining_value[codepoint] = strtoul (l, &l, 10);
00151 if (*l++ != ';')
00152 fail ("parsing UnicodeData.txt, junk after combining class");
00153
00154
00155 do {
00156 l++;
00157 } while (*l != ';');
00158
00159
00160 decomp_useful = flags[codepoint];
00161 if (*++l == '<')
00162 continue;
00163 for (i = 0; i < 4; i++)
00164 {
00165 if (*l == ';')
00166 break;
00167 if (!isxdigit (*l))
00168 fail ("parsing UnicodeData.txt, decomposition format");
00169 this_decomp[i] = strtoul (l, &l, 16);
00170 decomp_useful &= flags[this_decomp[i]];
00171 while (isspace (*l))
00172 l++;
00173 }
00174 if (i > 2)
00175 fail ("parsing UnicodeData.txt, decomposition too long");
00176 if (decomp_useful)
00177 while (--i >= 0)
00178 decomp[codepoint][i] = this_decomp[i];
00179 }
00180 if (ferror (f))
00181 fail ("reading UnicodeData.txt");
00182 fclose (f);
00183 }
00184
00185
00186
00187
00188 static void
00189 read_derived (const char *fname)
00190 {
00191 FILE * f = fopen (fname, "r");
00192
00193 if (!f)
00194 fail ("opening DerivedNormalizationProps.txt");
00195 for (;;)
00196 {
00197 char line[256];
00198 unsigned long start, end;
00199 char *l;
00200 bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
00201
00202 if (!fgets (line, sizeof (line), f))
00203 break;
00204 not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
00205 not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
00206 maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
00207 if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
00208 continue;
00209
00210 start = strtoul (line, &l, 16);
00211 if (l == line)
00212 fail ("parsing DerivedNormalizationProps.txt, reading start");
00213 if (start > 0xffff)
00214 continue;
00215 if (*l == '.' && l[1] == '.')
00216 end = strtoul (l + 2, &l, 16);
00217 else
00218 end = start;
00219
00220 while (start <= end)
00221 flags[start++] |= ((not_NFC_p ? not_NFC : 0)
00222 | (not_NFKC_p ? not_NFKC : 0)
00223 | (maybe_not_NFC_p ? maybe_not_NFC : 0)
00224 );
00225 }
00226 if (ferror (f))
00227 fail ("reading DerivedNormalizationProps.txt");
00228 fclose (f);
00229 }
00230
00231
00232
00233
00234
00235 static void
00236 write_table (void)
00237 {
00238 unsigned i;
00239 unsigned last_flag = flags[0];
00240 bool really_safe = decomp[0][0] == 0;
00241 unsigned char last_combine = combining_value[0];
00242
00243 for (i = 1; i <= 65536; i++)
00244 if (i == 65536
00245 || (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX)))
00246 || really_safe != (decomp[i][0] == 0)
00247 || combining_value[i] != last_combine)
00248 {
00249 printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
00250 last_flag & C99 ? "C99" : " 0",
00251 last_flag & digit ? "DIG" : " 0",
00252 last_flag & CXX ? "CXX" : " 0",
00253 really_safe ? "CID" : " 0",
00254 last_flag & not_NFC ? " 0" : "NFC",
00255 last_flag & not_NFKC ? " 0" : "NKC",
00256 last_flag & maybe_not_NFC ? "CTX" : " 0",
00257 combining_value[i - 1],
00258 i - 1);
00259 last_flag = flags[i];
00260 last_combine = combining_value[0];
00261 really_safe = decomp[i][0] == 0;
00262 }
00263 }
00264
00265
00266
/* Write the header comment of the generated file to stdout: the FSF
   notice for the table followed by the Unicode, Inc. data-file
   notice.  puts appends one more newline after the text.  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    "Copyright (C) 2003, 2005 Free Software Foundation, Inc.\n"
    "\n"
    "This program is free software; you can redistribute it and/or modify it\n"
    "under the terms of the GNU General Public License as published by the\n"
    "Free Software Foundation; either version 2, or (at your option) any\n"
    "later version.\n"
    "\n"
    "This program is distributed in the hope that it will be useful,\n"
    "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    "GNU General Public License for more details.\n"
    "\n"
    "You should have received a copy of the GNU General Public License\n"
    "along with this program; if not, write to the Free Software\n"
    "Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n"
    "\n"
    "\n"
    "Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n"
    "Distributed under the Terms of Use in\n"
    "http://www.unicode.org/copyright.html.\n"
    "\n"
    "Permission is hereby granted, free of charge, to any person\n"
    "obtaining a copy of the Unicode data files and any associated\n"
    "documentation (the \"Data Files\") or Unicode software and any\n"
    "associated documentation (the \"Software\") to deal in the Data Files\n"
    "or Software without restriction, including without limitation the\n"
    "rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "sell copies of the Data Files or Software, and to permit persons to\n"
    "whom the Data Files or Software are furnished to do so, provided\n"
    "that (a) the above copyright notice(s) and this permission notice\n"
    "appear with all copies of the Data Files or Software, (b) both the\n"
    "above copyright notice(s) and this permission notice appear in\n"
    "associated documentation, and (c) there is clear notice in each\n"
    "modified Data File or in the Software as well as in the\n"
    "documentation associated with the Data File(s) or Software that the\n"
    "data or software has been modified.\n"
    "\n"
    "THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "Except as contained in this notice, the name of a copyright holder\n"
    "shall not be used in advertising or otherwise to promote the sale,\n"
    "use or other dealings in these Data Files or Software without prior\n"
    "written authorization of the copyright holder. */\n";

  puts (copyright);
}
00327
00328
00329
/* Entry point.  Expects exactly three arguments, in this order: the
   ucnid.tab file, UnicodeData.txt and DerivedNormalizationProps.txt.
   The order matters: read_table only looks at code points already
   flagged by read_ucnid.  The generated table, preceded by the
   copyright header, is written to stdout.

   Returns 0 on success; every error exits with status 1 via fail.  */

int
main (int argc, char **argv)
{
  if (argc < 4)
    fail ("too few arguments to makeucn");
  if (argc > 4)
    fail ("too many arguments to makeucn");

  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();

  /* The output is normally redirected into a file; make sure a
     failed write does not leave a silently truncated table.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing output");
  return 0;
}