github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/lib9/utf/mkrunetype.c (about)

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build ignore
     6  
     7  /*
     8   * make is(upper|lower|title|space|alpha)rune and
     9   * to(upper|lower|title)rune from a UnicodeData.txt file.
    10   * these can be found at unicode.org
    11   *
    12   * with -c, runs a check of the existing runetype functions vs.
    13   * those extracted from UnicodeData.
    14   *
    15   * with -p, generates tables for pairs of chars, as well as for ranges
    16   * and singletons.
    17   *
    18   * UnicodeData defines 4 fields of interest:
    19   * 1) a category
    20   * 2) an upper case mapping
    21   * 3) a lower case mapping
    22   * 4) a title case mapping
    23   *
    24   * toupper, tolower, and totitle are defined directly from the mapping.
    25   *
    26   * isalpharune(c) is true iff c is a "letter" category
    27   * isupperrune(c) is true iff c is the target of toupperrune,
    28   *	or is in the uppercase letter category
    29   * similarly for islowerrune and istitlerune.
    30   * isspacerune is true for space category chars, "C" locale white space chars,
    31   *	and two additions:
    32   *	0085	"next line" control char
    33   *	feff	"zero-width non-break space"
    34   * isdigitrune is true iff c is a numeric-digit category.
    35   */
    36  
    37  #include <u.h>
    38  #include <libc.h>
    39  #include <stdio.h>
    40  #include "utf.h"
    41  #include "utfdef.h"
    42  
    43  enum {
    44  	/*
    45  	 * fields in the unicode data file
    46  	 */
    47  	FIELD_CODE,
    48  	FIELD_NAME,
    49  	FIELD_CATEGORY,
    50  	FIELD_COMBINING,
    51  	FIELD_BIDIR,
    52  	FIELD_DECOMP,
    53  	FIELD_DECIMAL_DIG,
    54  	FIELD_DIG,
    55  	FIELD_NUMERIC_VAL,
    56  	FIELD_MIRRORED,
    57  	FIELD_UNICODE_1_NAME,
    58  	FIELD_COMMENT,
    59  	FIELD_UPPER,
    60  	FIELD_LOWER,
    61  	FIELD_TITLE,
    62  	NFIELDS,
    63  
    64  	MAX_LINE	= 1024,
    65  
    66  	TO_OFFSET	= 1 << 20,
    67  
    68  	NRUNES		= 1 << 21,
    69  };
    70  
    71  #define TO_DELTA(xmapped,x)	(TO_OFFSET + (xmapped) - (x))
    72  
/*
 * per-rune property flags, indexed by code point: nonzero means the
 * rune has the property.  main fills them in from the unicode data
 * file; the mkis* table generators clear entries as they emit them.
 */
static char	myisspace[NRUNES];
static char	myisalpha[NRUNES];
static char	myisdigit[NRUNES];
static char	myisupper[NRUNES];
static char	myislower[NRUNES];
static char	myistitle[NRUNES];

/*
 * per-rune case mappings, indexed by code point.  main initializes
 * each entry to the identity (map[i] == i means "no mapping"); the
 * mkto* table generators reset entries to the identity as they emit them.
 */
static int	mytoupper[NRUNES];
static int	mytolower[NRUNES];
static int	mytotitle[NRUNES];
    83  
/*
 * forward declarations for helpers that are defined after their first use.
 */
static void	check(void);
static void	mktables(char *src, int usepairs);
static void	fatal(const char *fmt, ...);
static int	mygetfields(char **fields, int nfields, char *str, const char *delim);
static int	getunicodeline(FILE *in, char **fields, char *buf);
static int	getcode(char *s);
    90  
    91  static void
    92  usage(void)
    93  {
    94  	fprintf(stderr, "usage: mktables [-cp] <UnicodeData.txt>\n");
    95  	exit(1);
    96  }
    97  
    98  void
    99  main(int argc, char *argv[])
   100  {
   101  	FILE *in;
   102  	char buf[MAX_LINE], buf2[MAX_LINE];
   103  	char *fields[NFIELDS + 1], *fields2[NFIELDS + 1];
   104  	char *p;
   105  	int i, code, last, docheck, usepairs;
   106  
   107  	docheck = 0;
   108  	usepairs = 0;
   109  	ARGBEGIN{
   110  	case 'c':
   111  		docheck = 1;
   112  		break;
   113  	case 'p':
   114  		usepairs = 1;
   115  		break;
   116  	default:
   117  		usage();
   118  	}ARGEND
   119  
   120  	if(argc != 1){
   121  		usage();
   122  	}
   123  
   124  	in = fopen(argv[0], "r");
   125  	if(in == NULL){
   126  		fatal("can't open %s", argv[0]);
   127  	}
   128  
   129  	for(i = 0; i < NRUNES; i++){
   130  		mytoupper[i] = i;
   131  		mytolower[i] = i;
   132  		mytotitle[i] = i;
   133  	}
   134  
   135  	/*
   136  	 * make sure isspace has all of the "C" locale whitespace chars
   137  	 */
   138  	myisspace['\t'] = 1;
   139  	myisspace['\n'] = 1;
   140  	myisspace['\r'] = 1;
   141  	myisspace['\f'] = 1;
   142  	myisspace['\v'] = 1;
   143  
   144  	/*
   145  	 * a couple of other exceptions
   146  	 */
   147  	myisspace[0x85] = 1;	/* control char, "next line" */
   148  	myisspace[0xfeff] = 1;	/* zero-width non-break space */
   149  
   150  	last = -1;
   151  	while(getunicodeline(in, fields, buf)){
   152  		code = getcode(fields[FIELD_CODE]);
   153  		if (code >= NRUNES)
   154  			fatal("code-point value too big: %x", code);
   155  		if(code <= last)
   156  			fatal("bad code sequence: %x then %x", last, code);
   157  		last = code;
   158  
   159  		/*
   160  		 * check for ranges
   161  		 */
   162  		p = fields[FIELD_CATEGORY];
   163  		if(strstr(fields[FIELD_NAME], ", First>") != NULL){
   164  			if(!getunicodeline(in, fields2, buf2))
   165  				fatal("range start at eof");
   166  			if (strstr(fields2[FIELD_NAME], ", Last>") == NULL)
   167  				fatal("range start not followed by range end");
   168  			last = getcode(fields2[FIELD_CODE]);
   169  			if(last <= code)
   170  				fatal("range out of sequence: %x then %x", code, last);
   171  			if(strcmp(p, fields2[FIELD_CATEGORY]) != 0)
   172  				fatal("range with mismatched category");
   173  		}
   174  
   175  		/*
   176  		 * set properties and conversions
   177  		 */
   178  		for (; code <= last; code++){
   179  			if(p[0] == 'L')
   180  				myisalpha[code] = 1;
   181  			if(p[0] == 'Z')
   182  				myisspace[code] = 1;
   183  
   184  			if(strcmp(p, "Lu") == 0)
   185  				myisupper[code] = 1;
   186  			if(strcmp(p, "Ll") == 0)
   187  				myislower[code] = 1;
   188  
   189  			if(strcmp(p, "Lt") == 0)
   190  				myistitle[code] = 1;
   191  
   192  			if(strcmp(p, "Nd") == 0)
   193  				myisdigit[code] = 1;
   194  
   195  			/*
   196  			 * when finding conversions, also need to mark
   197  			 * upper/lower case, since some chars, like
   198  			 * "III" (0x2162), aren't defined as letters but have a
   199  			 * lower case mapping ("iii" (0x2172)).
   200  			 */
   201  			if(fields[FIELD_UPPER][0] != '\0'){
   202  				mytoupper[code] = getcode(fields[FIELD_UPPER]);
   203  			}
   204  			if(fields[FIELD_LOWER][0] != '\0'){
   205  				mytolower[code] = getcode(fields[FIELD_LOWER]);
   206  			}
   207  			if(fields[FIELD_TITLE][0] != '\0'){
   208  				mytotitle[code] = getcode(fields[FIELD_TITLE]);
   209  			}
   210  		}
   211  	}
   212  
   213  	fclose(in);
   214  
   215  	/*
   216  	 * check for codes with no totitle mapping but a toupper mapping.
   217  	 * these appear in UnicodeData-2.0.14.txt, but are almost certainly
   218  	 * erroneous.
   219  	 */
   220  	for(i = 0; i < NRUNES; i++){
   221  		if(mytotitle[i] == i
   222  		&& mytoupper[i] != i
   223  		&& !myistitle[i])
   224  			fprintf(stderr, "warning: code=%.4x not istitle, totitle is same, toupper=%.4x\n", i, mytoupper[i]);
   225  	}
   226  
   227  	/*
   228  	 * make sure isupper[c] is true if for some x toupper[x]  == c
   229  	 * ditto for islower and istitle
   230  	 */
   231  	for(i = 0; i < NRUNES; i++) {
   232  		if(mytoupper[i] != i)
   233  			myisupper[mytoupper[i]] = 1;
   234  		if(mytolower[i] != i)
   235  			myislower[mytolower[i]] = 1;
   236  		if(mytotitle[i] != i)
   237  			myistitle[mytotitle[i]] = 1;
   238  	}
   239  
   240  	if(docheck){
   241  		check();
   242  	}else{
   243  		mktables(argv[0], usepairs);
   244  	}
   245  	exit(0);
   246  }
   247  
   248  /*
   249   * generate a properties array for ranges, clearing those cases covered.
   250   * if force, generate one-entry ranges for singletons.
   251   */
   252  static int
   253  mkisrange(const char* label, char* prop, int force)
   254  {
   255  	int start, stop, some;
   256  
   257  	/*
   258  	 * first, the ranges
   259  	 */
   260  	some = 0;
   261  	for(start = 0; start < NRUNES; ) {
   262  		if(!prop[start]){
   263  			start++;
   264  			continue;
   265  		}
   266  
   267  		for(stop = start + 1; stop < NRUNES; stop++){
   268  			if(!prop[stop]){
   269  				break;
   270  			}
   271  			prop[stop] = 0;
   272  		}
   273  		if(force || stop != start + 1){
   274  			if(!some){
   275  				printf("static Rune __is%sr[] = {\n", label);
   276  				some = 1;
   277  			}
   278  			prop[start] = 0;
   279  			printf("\t0x%.4x, 0x%.4x,\n", start, stop - 1);
   280  		}
   281  
   282  		start = stop;
   283  	}
   284  	if(some)
   285  		printf("};\n\n");
   286  	return some;
   287  }
   288  
   289  /*
   290   * generate a mapping array for pairs with a skip between,
   291   * clearing those entries covered.
   292   */
   293  static int
   294  mkispair(const char *label, char *prop)
   295  {
   296  	int start, stop, some;
   297  
   298  	some = 0;
   299  	for(start = 0; start + 2 < NRUNES; ) {
   300  		if(!prop[start]){
   301  			start++;
   302  			continue;
   303  		}
   304  
   305  		for(stop = start + 2; stop < NRUNES; stop += 2){
   306  			if(!prop[stop]){
   307  				break;
   308  			}
   309  			prop[stop] = 0;
   310  		}
   311  		if(stop != start + 2){
   312  			if(!some){
   313  				printf("static Rune __is%sp[] = {\n", label);
   314  				some = 1;
   315  			}
   316  			prop[start] = 0;
   317  			printf("\t0x%.4x, 0x%.4x,\n", start, stop - 2);
   318  		}
   319  
   320  		start = stop;
   321  	}
   322  	if(some)
   323  		printf("};\n\n");
   324  	return some;
   325  }
   326  
   327  /*
   328   * generate a properties array for singletons, clearing those cases covered.
   329   */
   330  static int
   331  mkissingle(const char *label, char *prop)
   332  {
   333  	int start, some;
   334  
   335  	some = 0;
   336  	for(start = 0; start < NRUNES; start++) {
   337  		if(!prop[start]){
   338  			continue;
   339  		}
   340  
   341  		if(!some){
   342  			printf("static Rune __is%ss[] = {\n", label);
   343  			some = 1;
   344  		}
   345  		prop[start] = 0;
   346  		printf("\t0x%.4x,\n", start);
   347  	}
   348  	if(some)
   349  		printf("};\n\n");
   350  	return some;
   351  }
   352  
   353  /*
   354   * generate tables and a function for is<label>rune
   355   */
   356  static void
   357  mkis(const char* label, char* prop, int usepairs)
   358  {
   359  	int isr, isp, iss;
   360  
   361  	isr = mkisrange(label, prop, 0);
   362  	isp = 0;
   363  	if(usepairs)
   364  		isp = mkispair(label, prop);
   365  	iss = mkissingle(label, prop);
   366  
   367  	printf(
   368  		"int\n"
   369  		"is%srune(Rune c)\n"
   370  		"{\n"
   371  		"	Rune *p;\n"
   372  		"\n",
   373  		label);
   374  
   375  	if(isr)
   376  		printf(
   377  			"	p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
   378  			"	if(p && c >= p[0] && c <= p[1])\n"
   379  			"		return 1;\n",
   380  			label, label);
   381  
   382  	if(isp)
   383  		printf(
   384  			"	p = rbsearch(c, __is%sp, nelem(__is%sp)/2, 2);\n"
   385  			"	if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
   386  			"		return 1;\n",
   387  			label, label);
   388  
   389  	if(iss)
   390  		printf(
   391  			"	p = rbsearch(c, __is%ss, nelem(__is%ss), 1);\n"
   392  			"	if(p && c == p[0])\n"
   393  			"		return 1;\n",
   394  			label, label);
   395  
   396  
   397  	printf(
   398  		"	return 0;\n"
   399  		"}\n"
   400  		"\n"
   401  	);
   402  }
   403  
   404  /*
   405   * generate a mapping array for ranges, clearing those entries covered.
   406   * if force, generate one-entry ranges for singletons.
   407   */
   408  static int
   409  mktorange(const char* label, int* map, int force)
   410  {
   411  	int start, stop, delta, some;
   412  
   413  	some = 0;
   414  	for(start = 0; start < NRUNES; ) {
   415  		if(map[start] == start){
   416  			start++;
   417  			continue;
   418  		}
   419  
   420  		delta = TO_DELTA(map[start], start);
   421  		if(delta != (Rune)delta)
   422  			fatal("bad map delta %d", delta);
   423  		for(stop = start + 1; stop < NRUNES; stop++){
   424  			if(TO_DELTA(map[stop], stop) != delta){
   425  				break;
   426  			}
   427  			map[stop] = stop;
   428  		}
   429  		if(stop != start + 1){
   430  			if(!some){
   431  				printf("static Rune __to%sr[] = {\n", label);
   432  				some = 1;
   433  			}
   434  			map[start] = start;
   435  			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 1, delta);
   436  		}
   437  
   438  		start = stop;
   439  	}
   440  	if(some)
   441  		printf("};\n\n");
   442  	return some;
   443  }
   444  
   445  /*
   446   * generate a mapping array for pairs with a skip between,
   447   * clearing those entries covered.
   448   */
   449  static int
   450  mktopair(const char* label, int* map)
   451  {
   452  	int start, stop, delta, some;
   453  
   454  	some = 0;
   455  	for(start = 0; start + 2 < NRUNES; ) {
   456  		if(map[start] == start){
   457  			start++;
   458  			continue;
   459  		}
   460  
   461  		delta = TO_DELTA(map[start], start);
   462  		if(delta != (Rune)delta)
   463  			fatal("bad map delta %d", delta);
   464  		for(stop = start + 2; stop < NRUNES; stop += 2){
   465  			if(TO_DELTA(map[stop], stop) != delta){
   466  				break;
   467  			}
   468  			map[stop] = stop;
   469  		}
   470  		if(stop != start + 2){
   471  			if(!some){
   472  				printf("static Rune __to%sp[] = {\n", label);
   473  				some = 1;
   474  			}
   475  			map[start] = start;
   476  			printf("\t0x%.4x, 0x%.4x, %d,\n", start, stop - 2, delta);
   477  		}
   478  
   479  		start = stop;
   480  	}
   481  	if(some)
   482  		printf("};\n\n");
   483  	return some;
   484  }
   485  
   486  /*
   487   * generate a mapping array for singletons, clearing those entries covered.
   488   */
   489  static int
   490  mktosingle(const char* label, int* map)
   491  {
   492  	int start, delta, some;
   493  
   494  	some = 0;
   495  	for(start = 0; start < NRUNES; start++) {
   496  		if(map[start] == start){
   497  			continue;
   498  		}
   499  
   500  		delta = TO_DELTA(map[start], start);
   501  		if(delta != (Rune)delta)
   502  			fatal("bad map delta %d", delta);
   503  		if(!some){
   504  			printf("static Rune __to%ss[] = {\n", label);
   505  			some = 1;
   506  		}
   507  		map[start] = start;
   508  		printf("\t0x%.4x, %d,\n", start, delta);
   509  	}
   510  	if(some)
   511  		printf("};\n\n");
   512  	return some;
   513  }
   514  
   515  /*
   516   * generate tables and a function for to<label>rune
   517   */
   518  static void
   519  mkto(const char* label, int* map, int usepairs)
   520  {
   521  	int tor, top, tos;
   522  
   523  	tor = mktorange(label, map, 0);
   524  	top = 0;
   525  	if(usepairs)
   526  		top = mktopair(label, map);
   527  	tos = mktosingle(label, map);
   528  
   529  	printf(
   530  		"Rune\n"
   531  		"to%srune(Rune c)\n"
   532  		"{\n"
   533  		"	Rune *p;\n"
   534  		"\n",
   535  		label);
   536  
   537  	if(tor)
   538  		printf(
   539  			"	p = rbsearch(c, __to%sr, nelem(__to%sr)/3, 3);\n"
   540  			"	if(p && c >= p[0] && c <= p[1])\n"
   541  			"		return c + p[2] - %d;\n",
   542  			label, label, TO_OFFSET);
   543  
   544  	if(top)
   545  		printf(
   546  			"	p = rbsearch(c, __to%sp, nelem(__to%sp)/3, 3);\n"
   547  			"	if(p && c >= p[0] && c <= p[1] && !((c - p[0]) & 1))\n"
   548  			"		return c + p[2] - %d;\n",
   549  			label, label, TO_OFFSET);
   550  
   551  	if(tos)
   552  		printf(
   553  			"	p = rbsearch(c, __to%ss, nelem(__to%ss)/2, 2);\n"
   554  			"	if(p && c == p[0])\n"
   555  			"		return c + p[1] - %d;\n",
   556  			label, label, TO_OFFSET);
   557  
   558  
   559  	printf(
   560  		"	return c;\n"
   561  		"}\n"
   562  		"\n"
   563  	);
   564  }
   565  
   566  // Make only range tables and a function for is<label>rune.
   567  static void
   568  mkisronly(const char* label, char* prop)
   569  {
   570  	mkisrange(label, prop, 1);
   571  	printf(
   572  		"int\n"
   573  		"is%srune(Rune c)\n"
   574  		"{\n"
   575  		"	Rune *p;\n"
   576  		"\n"
   577  		"	p = rbsearch(c, __is%sr, nelem(__is%sr)/2, 2);\n"
   578  		"	if(p && c >= p[0] && c <= p[1])\n"
   579  		"		return 1;\n"
   580  		"	return 0;\n"
   581  		"}\n"
   582  		"\n",
   583  	        label, label, label);
   584  }
   585  
   586  /*
   587   * generate the body of runetype.
   588   * assumes there is a function Rune* rbsearch(Rune c, Rune *t, int n, int ne);
   589   */
   590  static void
   591  mktables(char *src, int usepairs)
   592  {
   593  	printf("/* generated automatically by mkrunetype.c from %s */\n\n", src);
   594  
   595  	/*
   596  	 * we special case the space and digit tables, since they are assumed
   597  	 * to be small with several ranges.
   598  	 */
   599  	mkisronly("space", myisspace);
   600  	mkisronly("digit", myisdigit);
   601  
   602  	mkis("alpha", myisalpha, 0);
   603  	mkis("upper", myisupper, usepairs);
   604  	mkis("lower", myislower, usepairs);
   605  	mkis("title", myistitle, usepairs);
   606  
   607  	mkto("upper", mytoupper, usepairs);
   608  	mkto("lower", mytolower, usepairs);
   609  	mkto("title", mytotitle, usepairs);
   610  }
   611  
   612  /*
   613   * find differences between the newly generated tables and current runetypes.
   614   */
   615  static void
   616  check(void)
   617  {
   618  	int i;
   619  
   620  	for(i = 0; i < NRUNES; i++){
   621  		if(isdigitrune(i) != myisdigit[i])
   622  			fprintf(stderr, "isdigit diff at %x: runetype=%x, unicode=%x\n",
   623  				i, isdigitrune(i), myisdigit[i]);
   624  
   625  		if(isspacerune(i) != myisspace[i])
   626  			fprintf(stderr, "isspace diff at %x: runetype=%x, unicode=%x\n",
   627  				i, isspacerune(i), myisspace[i]);
   628  
   629  		if(isupperrune(i) != myisupper[i])
   630  			fprintf(stderr, "isupper diff at %x: runetype=%x, unicode=%x\n",
   631  				i, isupperrune(i), myisupper[i]);
   632  
   633  		if(islowerrune(i) != myislower[i])
   634  			fprintf(stderr, "islower diff at %x: runetype=%x, unicode=%x\n",
   635  				i, islowerrune(i), myislower[i]);
   636  
   637  		if(isalpharune(i) != myisalpha[i])
   638  			fprintf(stderr, "isalpha diff at %x: runetype=%x, unicode=%x\n",
   639  				i, isalpharune(i), myisalpha[i]);
   640  
   641  		if(toupperrune(i) != mytoupper[i])
   642  			fprintf(stderr, "toupper diff at %x: runetype=%x, unicode=%x\n",
   643  				i, toupperrune(i), mytoupper[i]);
   644  
   645  		if(tolowerrune(i) != mytolower[i])
   646  			fprintf(stderr, "tolower diff at %x: runetype=%x, unicode=%x\n",
   647  				i, tolowerrune(i), mytolower[i]);
   648  
   649  		if(istitlerune(i) != myistitle[i])
   650  			fprintf(stderr, "istitle diff at %x: runetype=%x, unicode=%x\n",
   651  				i, istitlerune(i), myistitle[i]);
   652  
   653  		if(totitlerune(i) != mytotitle[i])
   654  			fprintf(stderr, "totitle diff at %x: runetype=%x, unicode=%x\n",
   655  				i, totitlerune(i), mytotitle[i]);
   656  
   657  
   658  	}
   659  }
   660  
   661  static int
   662  mygetfields(char **fields, int nfields, char *str, const char *delim)
   663  {
   664  	int nf;
   665  
   666  	fields[0] = str;
   667  	nf = 1;
   668  	if(nf >= nfields)
   669  		return nf;
   670  
   671  	for(; *str; str++){
   672  		if(strchr(delim, *str) != NULL){
   673  			*str = '\0';
   674  			fields[nf++] = str + 1;
   675  			if(nf >= nfields)
   676  				break;
   677  		}
   678  	}
   679  	return nf;
   680  }
   681  
   682  static int
   683  getunicodeline(FILE *in, char **fields, char *buf)
   684  {
   685  	char *p;
   686  
   687  	if(fgets(buf, MAX_LINE, in) == NULL)
   688  		return 0;
   689  
   690  	p = strchr(buf, '\n');
   691  	if (p == NULL)
   692  		fatal("line too long");
   693  	*p = '\0';
   694  
   695  	if (mygetfields(fields, NFIELDS + 1, buf, ";") != NFIELDS)
   696  		fatal("bad number of fields");
   697  
   698  	return 1;
   699  }
   700  
   701  static int
   702  getcode(char *s)
   703  {
   704  	int i, code;
   705  
   706  	code = 0;
   707  	i = 0;
   708  	/* Parse a hex number */
   709  	while(s[i]) {
   710  		code <<= 4;
   711  		if(s[i] >= '0' && s[i] <= '9')
   712  			code += s[i] - '0';
   713  		else if(s[i] >= 'A' && s[i] <= 'F')
   714  			code += s[i] - 'A' + 10;
   715  		else
   716  			fatal("bad code char '%c'", s[i]);
   717  		i++;
   718  	}
   719  	return code;
   720  }
   721  
   722  static void
   723  fatal(const char *fmt, ...)
   724  {
   725  	va_list arg;
   726  
   727  	fprintf(stderr, "%s: fatal error: ", argv0);
   728  	va_start(arg, fmt);
   729  	vfprintf(stderr, fmt, arg);
   730  	va_end(arg);
   731  	fprintf(stderr, "\n");
   732  
   733  	exit(1);
   734  }