github.com/xushiwei/go@v0.0.0-20130601165731-2b9d83f45bc9/src/cmd/8g/reg.c (about)

     1  // Derived from Inferno utils/6c/reg.c
     2  // http://code.google.com/p/inferno-os/source/browse/utils/6c/reg.c
     3  //
     4  //	Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
     5  //	Portions Copyright © 1995-1997 C H Forsyth (forsyth@terzarima.net)
     6  //	Portions Copyright © 1997-1999 Vita Nuova Limited
     7  //	Portions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com)
     8  //	Portions Copyright © 2004,2006 Bruce Ellis
     9  //	Portions Copyright © 2005-2007 C H Forsyth (forsyth@terzarima.net)
    10  //	Revisions Copyright © 2000-2007 Lucent Technologies Inc. and others
    11  //	Portions Copyright © 2009 The Go Authors.  All rights reserved.
    12  //
    13  // Permission is hereby granted, free of charge, to any person obtaining a copy
    14  // of this software and associated documentation files (the "Software"), to deal
    15  // in the Software without restriction, including without limitation the rights
    16  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    17  // copies of the Software, and to permit persons to whom the Software is
    18  // furnished to do so, subject to the following conditions:
    19  //
    20  // The above copyright notice and this permission notice shall be included in
    21  // all copies or substantial portions of the Software.
    22  //
    23  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    24  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    25  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
    26  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    27  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    28  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    29  // THE SOFTWARE.
    30  
    31  #include <u.h>
    32  #include <libc.h>
    33  #include "gg.h"
    34  #include "opt.h"
    35  
    36  #define	NREGVAR	16	/* 8 integer + 8 floating */
    37  #define	REGBITS	((uint32)0xffff)
    38  #define	P2R(p)	(Reg*)(p->reg)
    39  
    40  static	int	first	= 1;
    41  
    42  static	void	fixjmp(Prog*);
    43  static	void	fixtemp(Prog*);
    44  
    45  Reg*
    46  rega(void)
    47  {
    48  	Reg *r;
    49  
    50  	r = freer;
    51  	if(r == R) {
    52  		r = mal(sizeof(*r));
    53  	} else
    54  		freer = r->link;
    55  
    56  	*r = zreg;
    57  	return r;
    58  }
    59  
    60  int
    61  rcmp(const void *a1, const void *a2)
    62  {
    63  	Rgn *p1, *p2;
    64  	int c1, c2;
    65  
    66  	p1 = (Rgn*)a1;
    67  	p2 = (Rgn*)a2;
    68  	c1 = p2->cost;
    69  	c2 = p1->cost;
    70  	if(c1 -= c2)
    71  		return c1;
    72  	return p2->varno - p1->varno;
    73  }
    74  
    75  static void
    76  setoutvar(void)
    77  {
    78  	Type *t;
    79  	Node *n;
    80  	Addr a;
    81  	Iter save;
    82  	Bits bit;
    83  	int z;
    84  
    85  	t = structfirst(&save, getoutarg(curfn->type));
    86  	while(t != T) {
    87  		n = nodarg(t, 1);
    88  		a = zprog.from;
    89  		naddr(n, &a, 0);
    90  		bit = mkvar(R, &a);
    91  		for(z=0; z<BITS; z++)
    92  			ovar.b[z] |= bit.b[z];
    93  		t = structnext(&save);
    94  	}
    95  //if(bany(ovar))
    96  //print("ovars = %Q\n", ovar);
    97  }
    98  
    99  static void
   100  setaddrs(Bits bit)
   101  {
   102  	int i, n;
   103  	Var *v;
   104  	Node *node;
   105  
   106  	while(bany(&bit)) {
   107  		// convert each bit to a variable
   108  		i = bnum(bit);
   109  		node = var[i].node;
   110  		n = var[i].name;
   111  		bit.b[i/32] &= ~(1L<<(i%32));
   112  
   113  		// disable all pieces of that variable
   114  		for(i=0; i<nvar; i++) {
   115  			v = var+i;
   116  			if(v->node == node && v->name == n)
   117  				v->addr = 2;
   118  		}
   119  	}
   120  }
   121  
   122  static char* regname[] = {
   123  	".ax", ".cx", ".dx", ".bx", ".sp", ".bp", ".si", ".di",
   124  	".x0", ".x1", ".x2", ".x3", ".x4", ".x5", ".x6", ".x7",
   125  };
   126  
   127  static Node* regnodes[NREGVAR];
   128  
   129  void
   130  regopt(Prog *firstp)
   131  {
   132  	Reg *r, *r1;
   133  	Prog *p;
   134  	int i, z, nr;
   135  	uint32 vreg;
   136  	Bits bit;
   137  
   138  	if(first) {
   139  		fmtinstall('Q', Qconv);
   140  		exregoffset = D_DI;	// no externals
   141  		first = 0;
   142  	}
   143  	
   144  	fixtemp(firstp);
   145  	fixjmp(firstp);
   146  
   147  	// count instructions
   148  	nr = 0;
   149  	for(p=firstp; p!=P; p=p->link)
   150  		nr++;
   151  	// if too big dont bother
   152  	if(nr >= 10000) {
   153  //		print("********** %S is too big (%d)\n", curfn->nname->sym, nr);
   154  		return;
   155  	}
   156  
   157  	firstr = R;
   158  	lastr = R;
   159  	
   160  	/*
   161  	 * control flow is more complicated in generated go code
   162  	 * than in generated c code.  define pseudo-variables for
   163  	 * registers, so we have complete register usage information.
   164  	 */
   165  	nvar = NREGVAR;
   166  	memset(var, 0, NREGVAR*sizeof var[0]);
   167  	for(i=0; i<NREGVAR; i++) {
   168  		if(regnodes[i] == N)
   169  			regnodes[i] = newname(lookup(regname[i]));
   170  		var[i].node = regnodes[i];
   171  	}
   172  
   173  	regbits = RtoB(D_SP);
   174  	for(z=0; z<BITS; z++) {
   175  		externs.b[z] = 0;
   176  		params.b[z] = 0;
   177  		consts.b[z] = 0;
   178  		addrs.b[z] = 0;
   179  		ovar.b[z] = 0;
   180  	}
   181  
   182  	// build list of return variables
   183  	setoutvar();
   184  
   185  	/*
   186  	 * pass 1
   187  	 * build aux data structure
   188  	 * allocate pcs
   189  	 * find use and set of variables
   190  	 */
   191  	nr = 0;
   192  	for(p=firstp; p!=P; p=p->link) {
   193  		switch(p->as) {
   194  		case ADATA:
   195  		case AGLOBL:
   196  		case ANAME:
   197  		case ASIGNAME:
   198  		case ALOCALS:
   199  		case ATYPE:
   200  			continue;
   201  		}
   202  		r = rega();
   203  		nr++;
   204  		if(firstr == R) {
   205  			firstr = r;
   206  			lastr = r;
   207  		} else {
   208  			lastr->link = r;
   209  			r->p1 = lastr;
   210  			lastr->s1 = r;
   211  			lastr = r;
   212  		}
   213  		r->prog = p;
   214  		p->reg = r;
   215  
   216  		r1 = r->p1;
   217  		if(r1 != R) {
   218  			switch(r1->prog->as) {
   219  			case ARET:
   220  			case AJMP:
   221  			case AIRETL:
   222  				r->p1 = R;
   223  				r1->s1 = R;
   224  			}
   225  		}
   226  
   227  		// Avoid making variables for direct-called functions.
   228  		if(p->as == ACALL && p->to.type == D_EXTERN)
   229  			continue;
   230  
   231  		// Addressing makes some registers used.
   232  		if(p->from.type >= D_INDIR)
   233  			r->use1.b[0] |= RtoB(p->from.type-D_INDIR);
   234  		if(p->from.index != D_NONE)
   235  			r->use1.b[0] |= RtoB(p->from.index);
   236  		if(p->to.type >= D_INDIR)
   237  			r->use2.b[0] |= RtoB(p->to.type-D_INDIR);
   238  		if(p->to.index != D_NONE)
   239  			r->use2.b[0] |= RtoB(p->to.index);
   240  
   241  		bit = mkvar(r, &p->from);
   242  		if(bany(&bit))
   243  		switch(p->as) {
   244  		/*
   245  		 * funny
   246  		 */
   247  		case ALEAL:
   248  		case AFMOVD:
   249  		case AFMOVF:
   250  		case AFMOVL: 
   251  		case AFMOVW:
   252  		case AFMOVV:
   253  			setaddrs(bit);
   254  			break;
   255  
   256  		/*
   257  		 * left side read
   258  		 */
   259  		default:
   260  			for(z=0; z<BITS; z++)
   261  				r->use1.b[z] |= bit.b[z];
   262  			break;
   263  
   264  		/*
   265  		 * left side read+write
   266  		 */
   267  		case AXCHGB:
   268  		case AXCHGW:
   269  		case AXCHGL:
   270  			for(z=0; z<BITS; z++) {
   271  				r->use1.b[z] |= bit.b[z];
   272  				r->set.b[z] |= bit.b[z];
   273  			}
   274  			break;
   275  		}
   276  
   277  		bit = mkvar(r, &p->to);
   278  		if(bany(&bit))
   279  		switch(p->as) {
   280  		default:
   281  			yyerror("reg: unknown op: %A", p->as);
   282  			break;
   283  
   284  		/*
   285  		 * right side read
   286  		 */
   287  		case ACMPB:
   288  		case ACMPL:
   289  		case ACMPW:
   290  		case ACOMISS:
   291  		case ACOMISD:
   292  		case AUCOMISS:
   293  		case AUCOMISD:
   294  		case ATESTB:
   295  		case ATESTL:
   296  		case ATESTW:
   297  			for(z=0; z<BITS; z++)
   298  				r->use2.b[z] |= bit.b[z];
   299  			break;
   300  
   301  		/*
   302  		 * right side write
   303  		 */
   304  		case AFSTSW:
   305  		case ALEAL:
   306  		case ANOP:
   307  		case AMOVL:
   308  		case AMOVB:
   309  		case AMOVW:
   310  		case AMOVBLSX:
   311  		case AMOVBLZX:
   312  		case AMOVBWSX:
   313  		case AMOVBWZX:
   314  		case AMOVWLSX:
   315  		case AMOVWLZX:
   316  		case APOPL:
   317  
   318  		case AMOVSS:
   319  		case AMOVSD:
   320  		case ACVTSD2SL:
   321  		case ACVTSD2SS:
   322  		case ACVTSL2SD:
   323  		case ACVTSL2SS:
   324  		case ACVTSS2SD:
   325  		case ACVTSS2SL:
   326  		case ACVTTSD2SL:
   327  		case ACVTTSS2SL:
   328  			for(z=0; z<BITS; z++)
   329  				r->set.b[z] |= bit.b[z];
   330  			break;
   331  
   332  		/*
   333  		 * right side read+write
   334  		 */
   335  		case AINCB:
   336  		case AINCL:
   337  		case AINCW:
   338  		case ADECB:
   339  		case ADECL:
   340  		case ADECW:
   341  
   342  		case AADDB:
   343  		case AADDL:
   344  		case AADDW:
   345  		case AANDB:
   346  		case AANDL:
   347  		case AANDW:
   348  		case ASUBB:
   349  		case ASUBL:
   350  		case ASUBW:
   351  		case AORB:
   352  		case AORL:
   353  		case AORW:
   354  		case AXORB:
   355  		case AXORL:
   356  		case AXORW:
   357  		case ASALB:
   358  		case ASALL:
   359  		case ASALW:
   360  		case ASARB:
   361  		case ASARL:
   362  		case ASARW:
   363  		case ARCLB:
   364  		case ARCLL:
   365  		case ARCLW:
   366  		case ARCRB:
   367  		case ARCRL:
   368  		case ARCRW:
   369  		case AROLB:
   370  		case AROLL:
   371  		case AROLW:
   372  		case ARORB:
   373  		case ARORL:
   374  		case ARORW:
   375  		case ASHLB:
   376  		case ASHLL:
   377  		case ASHLW:
   378  		case ASHRB:
   379  		case ASHRL:
   380  		case ASHRW:
   381  		case AIMULL:
   382  		case AIMULW:
   383  		case ANEGB:
   384  		case ANEGL:
   385  		case ANEGW:
   386  		case ANOTB:
   387  		case ANOTL:
   388  		case ANOTW:
   389  		case AADCL:
   390  		case ASBBL:
   391  
   392  		case ASETCC:
   393  		case ASETCS:
   394  		case ASETEQ:
   395  		case ASETGE:
   396  		case ASETGT:
   397  		case ASETHI:
   398  		case ASETLE:
   399  		case ASETLS:
   400  		case ASETLT:
   401  		case ASETMI:
   402  		case ASETNE:
   403  		case ASETOC:
   404  		case ASETOS:
   405  		case ASETPC:
   406  		case ASETPL:
   407  		case ASETPS:
   408  
   409  		case AXCHGB:
   410  		case AXCHGW:
   411  		case AXCHGL:
   412  
   413  		case AADDSD:
   414  		case AADDSS:
   415  		case ACMPSD:
   416  		case ACMPSS:
   417  		case ADIVSD:
   418  		case ADIVSS:
   419  		case AMAXSD:
   420  		case AMAXSS:
   421  		case AMINSD:
   422  		case AMINSS:
   423  		case AMULSD:
   424  		case AMULSS:
   425  		case ARCPSS:
   426  		case ARSQRTSS:
   427  		case ASQRTSD:
   428  		case ASQRTSS:
   429  		case ASUBSD:
   430  		case ASUBSS:
   431  		case AXORPD:
   432  			for(z=0; z<BITS; z++) {
   433  				r->set.b[z] |= bit.b[z];
   434  				r->use2.b[z] |= bit.b[z];
   435  			}
   436  			break;
   437  
   438  		/*
   439  		 * funny
   440  		 */
   441  		case AFMOVDP:
   442  		case AFMOVFP:
   443  		case AFMOVLP:
   444  		case AFMOVVP:
   445  		case AFMOVWP:
   446  		case ACALL:
   447  			setaddrs(bit);
   448  			break;
   449  		}
   450  
   451  		switch(p->as) {
   452  		case AIMULL:
   453  		case AIMULW:
   454  			if(p->to.type != D_NONE)
   455  				break;
   456  
   457  		case AIDIVL:
   458  		case AIDIVW:
   459  		case ADIVL:
   460  		case ADIVW:
   461  		case AMULL:
   462  		case AMULW:
   463  			r->set.b[0] |= RtoB(D_AX) | RtoB(D_DX);
   464  			r->use1.b[0] |= RtoB(D_AX) | RtoB(D_DX);
   465  			break;
   466  
   467  		case AIDIVB:
   468  		case AIMULB:
   469  		case ADIVB:
   470  		case AMULB:
   471  			r->set.b[0] |= RtoB(D_AX);
   472  			r->use1.b[0] |= RtoB(D_AX);
   473  			break;
   474  
   475  		case ACWD:
   476  			r->set.b[0] |= RtoB(D_AX) | RtoB(D_DX);
   477  			r->use1.b[0] |= RtoB(D_AX);
   478  			break;
   479  
   480  		case ACDQ:
   481  			r->set.b[0] |= RtoB(D_DX);
   482  			r->use1.b[0] |= RtoB(D_AX);
   483  			break;
   484  
   485  		case AREP:
   486  		case AREPN:
   487  		case ALOOP:
   488  		case ALOOPEQ:
   489  		case ALOOPNE:
   490  			r->set.b[0] |= RtoB(D_CX);
   491  			r->use1.b[0] |= RtoB(D_CX);
   492  			break;
   493  
   494  		case AMOVSB:
   495  		case AMOVSL:
   496  		case AMOVSW:
   497  		case ACMPSB:
   498  		case ACMPSL:
   499  		case ACMPSW:
   500  			r->set.b[0] |= RtoB(D_SI) | RtoB(D_DI);
   501  			r->use1.b[0] |= RtoB(D_SI) | RtoB(D_DI);
   502  			break;
   503  
   504  		case ASTOSB:
   505  		case ASTOSL:
   506  		case ASTOSW:
   507  		case ASCASB:
   508  		case ASCASL:
   509  		case ASCASW:
   510  			r->set.b[0] |= RtoB(D_DI);
   511  			r->use1.b[0] |= RtoB(D_AX) | RtoB(D_DI);
   512  			break;
   513  
   514  		case AINSB:
   515  		case AINSL:
   516  		case AINSW:
   517  			r->set.b[0] |= RtoB(D_DX) | RtoB(D_DI);
   518  			r->use1.b[0] |= RtoB(D_DI);
   519  			break;
   520  
   521  		case AOUTSB:
   522  		case AOUTSL:
   523  		case AOUTSW:
   524  			r->set.b[0] |= RtoB(D_DI);
   525  			r->use1.b[0] |= RtoB(D_DX) | RtoB(D_DI);
   526  			break;
   527  		}
   528  	}
   529  	if(firstr == R)
   530  		return;
   531  
   532  	for(i=0; i<nvar; i++) {
   533  		Var *v = var+i;
   534  		if(v->addr) {
   535  			bit = blsh(i);
   536  			for(z=0; z<BITS; z++)
   537  				addrs.b[z] |= bit.b[z];
   538  		}
   539  
   540  		if(debug['R'] && debug['v'])
   541  			print("bit=%2d addr=%d et=%-6E w=%-2d s=%N + %lld\n",
   542  				i, v->addr, v->etype, v->width, v->node, v->offset);
   543  	}
   544  
   545  	if(debug['R'] && debug['v'])
   546  		dumpit("pass1", firstr);
   547  
   548  	/*
   549  	 * pass 2
   550  	 * turn branch references to pointers
   551  	 * build back pointers
   552  	 */
   553  	for(r=firstr; r!=R; r=r->link) {
   554  		p = r->prog;
   555  		if(p->to.type == D_BRANCH) {
   556  			if(p->to.u.branch == P)
   557  				fatal("pnil %P", p);
   558  			r1 = p->to.u.branch->reg;
   559  			if(r1 == R)
   560  				fatal("rnil %P", p);
   561  			if(r1 == r) {
   562  				//fatal("ref to self %P", p);
   563  				continue;
   564  			}
   565  			r->s2 = r1;
   566  			r->p2link = r1->p2;
   567  			r1->p2 = r;
   568  		}
   569  	}
   570  
   571  	if(debug['R'] && debug['v'])
   572  		dumpit("pass2", firstr);
   573  
   574  	/*
   575  	 * pass 2.5
   576  	 * find looping structure
   577  	 */
   578  	for(r = firstr; r != R; r = r->link)
   579  		r->active = 0;
   580  	change = 0;
   581  	loopit(firstr, nr);
   582  
   583  	if(debug['R'] && debug['v'])
   584  		dumpit("pass2.5", firstr);
   585  
   586  	/*
   587  	 * pass 3
   588  	 * iterate propagating usage
   589  	 * 	back until flow graph is complete
   590  	 */
   591  loop1:
   592  	change = 0;
   593  	for(r = firstr; r != R; r = r->link)
   594  		r->active = 0;
   595  	for(r = firstr; r != R; r = r->link)
   596  		if(r->prog->as == ARET)
   597  			prop(r, zbits, zbits);
   598  loop11:
   599  	/* pick up unreachable code */
   600  	i = 0;
   601  	for(r = firstr; r != R; r = r1) {
   602  		r1 = r->link;
   603  		if(r1 && r1->active && !r->active) {
   604  			prop(r, zbits, zbits);
   605  			i = 1;
   606  		}
   607  	}
   608  	if(i)
   609  		goto loop11;
   610  	if(change)
   611  		goto loop1;
   612  
   613  	if(debug['R'] && debug['v'])
   614  		dumpit("pass3", firstr);
   615  
   616  	/*
   617  	 * pass 4
   618  	 * iterate propagating register/variable synchrony
   619  	 * 	forward until graph is complete
   620  	 */
   621  loop2:
   622  	change = 0;
   623  	for(r = firstr; r != R; r = r->link)
   624  		r->active = 0;
   625  	synch(firstr, zbits);
   626  	if(change)
   627  		goto loop2;
   628  
   629  	if(debug['R'] && debug['v'])
   630  		dumpit("pass4", firstr);
   631  
   632  	/*
   633  	 * pass 4.5
   634  	 * move register pseudo-variables into regu.
   635  	 */
   636  	for(r = firstr; r != R; r = r->link) {
   637  		r->regu = (r->refbehind.b[0] | r->set.b[0]) & REGBITS;
   638  
   639  		r->set.b[0] &= ~REGBITS;
   640  		r->use1.b[0] &= ~REGBITS;
   641  		r->use2.b[0] &= ~REGBITS;
   642  		r->refbehind.b[0] &= ~REGBITS;
   643  		r->refahead.b[0] &= ~REGBITS;
   644  		r->calbehind.b[0] &= ~REGBITS;
   645  		r->calahead.b[0] &= ~REGBITS;
   646  		r->regdiff.b[0] &= ~REGBITS;
   647  		r->act.b[0] &= ~REGBITS;
   648  	}
   649  
   650  	/*
   651  	 * pass 5
   652  	 * isolate regions
   653  	 * calculate costs (paint1)
   654  	 */
   655  	r = firstr;
   656  	if(r) {
   657  		for(z=0; z<BITS; z++)
   658  			bit.b[z] = (r->refahead.b[z] | r->calahead.b[z]) &
   659  			  ~(externs.b[z] | params.b[z] | addrs.b[z] | consts.b[z]);
   660  		if(bany(&bit) && !r->refset) {
   661  			// should never happen - all variables are preset
   662  			if(debug['w'])
   663  				print("%L: used and not set: %Q\n", r->prog->lineno, bit);
   664  			r->refset = 1;
   665  		}
   666  	}
   667  	for(r = firstr; r != R; r = r->link)
   668  		r->act = zbits;
   669  	rgp = region;
   670  	nregion = 0;
   671  	for(r = firstr; r != R; r = r->link) {
   672  		for(z=0; z<BITS; z++)
   673  			bit.b[z] = r->set.b[z] &
   674  			  ~(r->refahead.b[z] | r->calahead.b[z] | addrs.b[z]);
   675  		if(bany(&bit) && !r->refset) {
   676  			if(debug['w'])
   677  				print("%L: set and not used: %Q\n", r->prog->lineno, bit);
   678  			r->refset = 1;
   679  			excise(r);
   680  		}
   681  		for(z=0; z<BITS; z++)
   682  			bit.b[z] = LOAD(r) & ~(r->act.b[z] | addrs.b[z]);
   683  		while(bany(&bit)) {
   684  			i = bnum(bit);
   685  			rgp->enter = r;
   686  			rgp->varno = i;
   687  			change = 0;
   688  			paint1(r, i);
   689  			bit.b[i/32] &= ~(1L<<(i%32));
   690  			if(change <= 0)
   691  				continue;
   692  			rgp->cost = change;
   693  			nregion++;
   694  			if(nregion >= NRGN) {
   695  				if(debug['R'] && debug['v'])
   696  					print("too many regions\n");
   697  				goto brk;
   698  			}
   699  			rgp++;
   700  		}
   701  	}
   702  brk:
   703  	qsort(region, nregion, sizeof(region[0]), rcmp);
   704  
   705  	/*
   706  	 * pass 6
   707  	 * determine used registers (paint2)
   708  	 * replace code (paint3)
   709  	 */
   710  	rgp = region;
   711  	for(i=0; i<nregion; i++) {
   712  		bit = blsh(rgp->varno);
   713  		vreg = paint2(rgp->enter, rgp->varno);
   714  		vreg = allreg(vreg, rgp);
   715  		if(rgp->regno != 0)
   716  			paint3(rgp->enter, rgp->varno, vreg, rgp->regno);
   717  		rgp++;
   718  	}
   719  
   720  	if(debug['R'] && debug['v'])
   721  		dumpit("pass6", firstr);
   722  
   723  	/*
   724  	 * pass 7
   725  	 * peep-hole on basic block
   726  	 */
   727  	if(!debug['R'] || debug['P']) {
   728  		peep();
   729  	}
   730  
   731  	/*
   732  	 * eliminate nops
   733  	 * free aux structures
   734  	 */
   735  	for(p=firstp; p!=P; p=p->link) {
   736  		while(p->link != P && p->link->as == ANOP)
   737  			p->link = p->link->link;
   738  		if(p->to.type == D_BRANCH)
   739  			while(p->to.u.branch != P && p->to.u.branch->as == ANOP)
   740  				p->to.u.branch = p->to.u.branch->link;
   741  	}
   742  
   743  	if(!use_sse)
   744  	for(p=firstp; p!=P; p=p->link) {
   745  		if(p->from.type >= D_X0 && p->from.type <= D_X7)
   746  			fatal("invalid use of %R with GO386=387: %P", p->from.type, p);
   747  		if(p->to.type >= D_X0 && p->to.type <= D_X7)
   748  			fatal("invalid use of %R with GO386=387: %P", p->to.type, p);
   749  	}
   750  
   751  	if(lastr != R) {
   752  		lastr->link = freer;
   753  		freer = firstr;
   754  	}
   755  
   756  	if(debug['R']) {
   757  		if(ostats.ncvtreg ||
   758  		   ostats.nspill ||
   759  		   ostats.nreload ||
   760  		   ostats.ndelmov ||
   761  		   ostats.nvar ||
   762  		   ostats.naddr ||
   763  		   0)
   764  			print("\nstats\n");
   765  
   766  		if(ostats.ncvtreg)
   767  			print("	%4d cvtreg\n", ostats.ncvtreg);
   768  		if(ostats.nspill)
   769  			print("	%4d spill\n", ostats.nspill);
   770  		if(ostats.nreload)
   771  			print("	%4d reload\n", ostats.nreload);
   772  		if(ostats.ndelmov)
   773  			print("	%4d delmov\n", ostats.ndelmov);
   774  		if(ostats.nvar)
   775  			print("	%4d var\n", ostats.nvar);
   776  		if(ostats.naddr)
   777  			print("	%4d addr\n", ostats.naddr);
   778  
   779  		memset(&ostats, 0, sizeof(ostats));
   780  	}
   781  }
   782  
   783  /*
   784   * add mov b,rn
   785   * just after r
   786   */
   787  void
   788  addmove(Reg *r, int bn, int rn, int f)
   789  {
   790  	Prog *p, *p1;
   791  	Adr *a;
   792  	Var *v;
   793  
   794  	p1 = mal(sizeof(*p1));
   795  	clearp(p1);
   796  	p1->loc = 9999;
   797  
   798  	p = r->prog;
   799  	p1->link = p->link;
   800  	p->link = p1;
   801  	p1->lineno = p->lineno;
   802  
   803  	v = var + bn;
   804  
   805  	a = &p1->to;
   806  	a->offset = v->offset;
   807  	a->etype = v->etype;
   808  	a->type = v->name;
   809  	a->node = v->node;
   810  	a->sym = v->node->sym;
   811  
   812  	// need to clean this up with wptr and
   813  	// some of the defaults
   814  	p1->as = AMOVL;
   815  	switch(v->etype) {
   816  	default:
   817  		fatal("unknown type %E", v->etype);
   818  	case TINT8:
   819  	case TUINT8:
   820  	case TBOOL:
   821  		p1->as = AMOVB;
   822  		break;
   823  	case TINT16:
   824  	case TUINT16:
   825  		p1->as = AMOVW;
   826  		break;
   827  	case TFLOAT32:
   828  		p1->as = AMOVSS;
   829  		break;
   830  	case TFLOAT64:
   831  		p1->as = AMOVSD;
   832  		break;
   833  	case TINT:
   834  	case TUINT:
   835  	case TINT32:
   836  	case TUINT32:
   837  	case TPTR32:
   838  		break;
   839  	}
   840  
   841  	p1->from.type = rn;
   842  	if(!f) {
   843  		p1->from = *a;
   844  		*a = zprog.from;
   845  		a->type = rn;
   846  		if(v->etype == TUINT8)
   847  			p1->as = AMOVB;
   848  		if(v->etype == TUINT16)
   849  			p1->as = AMOVW;
   850  	}
   851  	if(debug['R'] && debug['v'])
   852  		print("%P ===add=== %P\n", p, p1);
   853  	ostats.nspill++;
   854  }
   855  
   856  uint32
   857  doregbits(int r)
   858  {
   859  	uint32 b;
   860  
   861  	b = 0;
   862  	if(r >= D_INDIR)
   863  		r -= D_INDIR;
   864  	if(r >= D_AX && r <= D_DI)
   865  		b |= RtoB(r);
   866  	else
   867  	if(r >= D_AL && r <= D_BL)
   868  		b |= RtoB(r-D_AL+D_AX);
   869  	else
   870  	if(r >= D_AH && r <= D_BH)
   871  		b |= RtoB(r-D_AH+D_AX);
   872  	else
   873  	if(r >= D_X0 && r <= D_X0+7)
   874  		b |= FtoB(r);
   875  	return b;
   876  }
   877  
   878  static int
   879  overlap(int32 o1, int w1, int32 o2, int w2)
   880  {
   881  	int32 t1, t2;
   882  
   883  	t1 = o1+w1;
   884  	t2 = o2+w2;
   885  
   886  	if(!(t1 > o2 && t2 > o1))
   887  		return 0;
   888  
   889  	return 1;
   890  }
   891  
   892  Bits
   893  mkvar(Reg *r, Adr *a)
   894  {
   895  	Var *v;
   896  	int i, t, n, et, z, w, flag, regu;
   897  	int32 o;
   898  	Bits bit;
   899  	Node *node;
   900  
   901  	/*
   902  	 * mark registers used
   903  	 */
   904  	t = a->type;
   905  	if(t == D_NONE)
   906  		goto none;
   907  
   908  	if(r != R)
   909  		r->use1.b[0] |= doregbits(a->index);
   910  
   911  	switch(t) {
   912  	default:
   913  		regu = doregbits(t);
   914  		if(regu == 0)
   915  			goto none;
   916  		bit = zbits;
   917  		bit.b[0] = regu;
   918  		return bit;
   919  
   920  	case D_ADDR:
   921  		a->type = a->index;
   922  		bit = mkvar(r, a);
   923  		setaddrs(bit);
   924  		a->type = t;
   925  		ostats.naddr++;
   926  		goto none;
   927  
   928  	case D_EXTERN:
   929  	case D_STATIC:
   930  	case D_PARAM:
   931  	case D_AUTO:
   932  		n = t;
   933  		break;
   934  	}
   935  
   936  	node = a->node;
   937  	if(node == N || node->op != ONAME || node->orig == N)
   938  		goto none;
   939  	node = node->orig;
   940  	if(node->orig != node)
   941  		fatal("%D: bad node", a);
   942  	if(node->sym == S || node->sym->name[0] == '.')
   943  		goto none;
   944  	et = a->etype;
   945  	o = a->offset;
   946  	w = a->width;
   947  	if(w < 0)
   948  		fatal("bad width %d for %D", w, a);
   949  
   950  	flag = 0;
   951  	for(i=0; i<nvar; i++) {
   952  		v = var+i;
   953  		if(v->node == node && v->name == n) {
   954  			if(v->offset == o)
   955  			if(v->etype == et)
   956  			if(v->width == w)
   957  				return blsh(i);
   958  
   959  			// if they overlap, disable both
   960  			if(overlap(v->offset, v->width, o, w)) {
   961  				if(debug['R'])
   962  					print("disable %s\n", node->sym->name);
   963  				v->addr = 1;
   964  				flag = 1;
   965  			}
   966  		}
   967  	}
   968  
   969  	switch(et) {
   970  	case 0:
   971  	case TFUNC:
   972  		goto none;
   973  	}
   974  
   975  	if(nvar >= NVAR) {
   976  		if(debug['w'] > 1 && node != N)
   977  			fatal("variable not optimized: %D", a);
   978  		goto none;
   979  	}
   980  
   981  	i = nvar;
   982  	nvar++;
   983  	v = var+i;
   984  	v->offset = o;
   985  	v->name = n;
   986  	v->etype = et;
   987  	v->width = w;
   988  	v->addr = flag;		// funny punning
   989  	v->node = node;
   990  
   991  	if(debug['R'])
   992  		print("bit=%2d et=%2E w=%d+%d %#N %D flag=%d\n", i, et, o, w, node, a, v->addr);
   993  	ostats.nvar++;
   994  
   995  	bit = blsh(i);
   996  	if(n == D_EXTERN || n == D_STATIC)
   997  		for(z=0; z<BITS; z++)
   998  			externs.b[z] |= bit.b[z];
   999  	if(n == D_PARAM)
  1000  		for(z=0; z<BITS; z++)
  1001  			params.b[z] |= bit.b[z];
  1002  
  1003  	return bit;
  1004  
  1005  none:
  1006  	return zbits;
  1007  }
  1008  
  1009  void
  1010  prop(Reg *r, Bits ref, Bits cal)
  1011  {
  1012  	Reg *r1, *r2;
  1013  	int z;
  1014  
  1015  	for(r1 = r; r1 != R; r1 = r1->p1) {
  1016  		for(z=0; z<BITS; z++) {
  1017  			ref.b[z] |= r1->refahead.b[z];
  1018  			if(ref.b[z] != r1->refahead.b[z]) {
  1019  				r1->refahead.b[z] = ref.b[z];
  1020  				change++;
  1021  			}
  1022  			cal.b[z] |= r1->calahead.b[z];
  1023  			if(cal.b[z] != r1->calahead.b[z]) {
  1024  				r1->calahead.b[z] = cal.b[z];
  1025  				change++;
  1026  			}
  1027  		}
  1028  		switch(r1->prog->as) {
  1029  		case ACALL:
  1030  			if(noreturn(r1->prog))
  1031  				break;
  1032  			for(z=0; z<BITS; z++) {
  1033  				cal.b[z] |= ref.b[z] | externs.b[z];
  1034  				ref.b[z] = 0;
  1035  			}
  1036  			break;
  1037  
  1038  		case ATEXT:
  1039  			for(z=0; z<BITS; z++) {
  1040  				cal.b[z] = 0;
  1041  				ref.b[z] = 0;
  1042  			}
  1043  			break;
  1044  
  1045  		case ARET:
  1046  			for(z=0; z<BITS; z++) {
  1047  				cal.b[z] = externs.b[z] | ovar.b[z];
  1048  				ref.b[z] = 0;
  1049  			}
  1050  			break;
  1051  
  1052  		default:
  1053  			// Work around for issue 1304:
  1054  			// flush modified globals before each instruction.
  1055  			for(z=0; z<BITS; z++) {
  1056  				cal.b[z] |= externs.b[z];
  1057  				// issue 4066: flush modified return variables in case of panic
  1058  				if(hasdefer)
  1059  					cal.b[z] |= ovar.b[z];
  1060  			}
  1061  			break;
  1062  		}
  1063  		for(z=0; z<BITS; z++) {
  1064  			ref.b[z] = (ref.b[z] & ~r1->set.b[z]) |
  1065  				r1->use1.b[z] | r1->use2.b[z];
  1066  			cal.b[z] &= ~(r1->set.b[z] | r1->use1.b[z] | r1->use2.b[z]);
  1067  			r1->refbehind.b[z] = ref.b[z];
  1068  			r1->calbehind.b[z] = cal.b[z];
  1069  		}
  1070  		if(r1->active)
  1071  			break;
  1072  		r1->active = 1;
  1073  	}
  1074  	for(; r != r1; r = r->p1)
  1075  		for(r2 = r->p2; r2 != R; r2 = r2->p2link)
  1076  			prop(r2, r->refbehind, r->calbehind);
  1077  }
  1078  
  1079  /*
  1080   * find looping structure
  1081   *
  1082   * 1) find reverse postordering
  1083   * 2) find approximate dominators,
  1084   *	the actual dominators if the flow graph is reducible
  1085   *	otherwise, dominators plus some other non-dominators.
  1086   *	See Matthew S. Hecht and Jeffrey D. Ullman,
  1087   *	"Analysis of a Simple Algorithm for Global Data Flow Problems",
  1088   *	Conf.  Record of ACM Symp. on Principles of Prog. Langs, Boston, Massachusetts,
  1089   *	Oct. 1-3, 1973, pp.  207-217.
  1090   * 3) find all nodes with a predecessor dominated by the current node.
  1091   *	such a node is a loop head.
  1092   *	recursively, all preds with a greater rpo number are in the loop
  1093   */
  1094  int32
  1095  postorder(Reg *r, Reg **rpo2r, int32 n)
  1096  {
  1097  	Reg *r1;
  1098  
  1099  	r->rpo = 1;
  1100  	r1 = r->s1;
  1101  	if(r1 && !r1->rpo)
  1102  		n = postorder(r1, rpo2r, n);
  1103  	r1 = r->s2;
  1104  	if(r1 && !r1->rpo)
  1105  		n = postorder(r1, rpo2r, n);
  1106  	rpo2r[n] = r;
  1107  	n++;
  1108  	return n;
  1109  }
  1110  
  1111  int32
  1112  rpolca(int32 *idom, int32 rpo1, int32 rpo2)
  1113  {
  1114  	int32 t;
  1115  
  1116  	if(rpo1 == -1)
  1117  		return rpo2;
  1118  	while(rpo1 != rpo2){
  1119  		if(rpo1 > rpo2){
  1120  			t = rpo2;
  1121  			rpo2 = rpo1;
  1122  			rpo1 = t;
  1123  		}
  1124  		while(rpo1 < rpo2){
  1125  			t = idom[rpo2];
  1126  			if(t >= rpo2)
  1127  				fatal("bad idom");
  1128  			rpo2 = t;
  1129  		}
  1130  	}
  1131  	return rpo1;
  1132  }
  1133  
  1134  int
  1135  doms(int32 *idom, int32 r, int32 s)
  1136  {
  1137  	while(s > r)
  1138  		s = idom[s];
  1139  	return s == r;
  1140  }
  1141  
  1142  int
  1143  loophead(int32 *idom, Reg *r)
  1144  {
  1145  	int32 src;
  1146  
  1147  	src = r->rpo;
  1148  	if(r->p1 != R && doms(idom, src, r->p1->rpo))
  1149  		return 1;
  1150  	for(r = r->p2; r != R; r = r->p2link)
  1151  		if(doms(idom, src, r->rpo))
  1152  			return 1;
  1153  	return 0;
  1154  }
  1155  
  1156  void
  1157  loopmark(Reg **rpo2r, int32 head, Reg *r)
  1158  {
  1159  	if(r->rpo < head || r->active == head)
  1160  		return;
  1161  	r->active = head;
  1162  	r->loop += LOOP;
  1163  	if(r->p1 != R)
  1164  		loopmark(rpo2r, head, r->p1);
  1165  	for(r = r->p2; r != R; r = r->p2link)
  1166  		loopmark(rpo2r, head, r);
  1167  }
  1168  
  1169  void
  1170  loopit(Reg *r, int32 nr)
  1171  {
  1172  	Reg *r1;
  1173  	int32 i, d, me;
  1174  
  1175  	if(nr > maxnr) {
  1176  		rpo2r = mal(nr * sizeof(Reg*));
  1177  		idom = mal(nr * sizeof(int32));
  1178  		maxnr = nr;
  1179  	}
  1180  
  1181  	d = postorder(r, rpo2r, 0);
  1182  	if(d > nr)
  1183  		fatal("too many reg nodes %d %d", d, nr);
  1184  	nr = d;
  1185  	for(i = 0; i < nr / 2; i++) {
  1186  		r1 = rpo2r[i];
  1187  		rpo2r[i] = rpo2r[nr - 1 - i];
  1188  		rpo2r[nr - 1 - i] = r1;
  1189  	}
  1190  	for(i = 0; i < nr; i++)
  1191  		rpo2r[i]->rpo = i;
  1192  
  1193  	idom[0] = 0;
  1194  	for(i = 0; i < nr; i++) {
  1195  		r1 = rpo2r[i];
  1196  		me = r1->rpo;
  1197  		d = -1;
  1198  		// rpo2r[r->rpo] == r protects against considering dead code,
  1199  		// which has r->rpo == 0.
  1200  		if(r1->p1 != R && rpo2r[r1->p1->rpo] == r1->p1 && r1->p1->rpo < me)
  1201  			d = r1->p1->rpo;
  1202  		for(r1 = r1->p2; r1 != nil; r1 = r1->p2link)
  1203  			if(rpo2r[r1->rpo] == r1 && r1->rpo < me)
  1204  				d = rpolca(idom, d, r1->rpo);
  1205  		idom[i] = d;
  1206  	}
  1207  
  1208  	for(i = 0; i < nr; i++) {
  1209  		r1 = rpo2r[i];
  1210  		r1->loop++;
  1211  		if(r1->p2 != R && loophead(idom, r1))
  1212  			loopmark(rpo2r, i, r1);
  1213  	}
  1214  }
  1215  
/*
 * synch walks the s1 chain starting at r, carrying the bit set dif.
 * dif is passed by value, so the recursive call on an s2 successor
 * works on its own copy.
 *
 * At each node, for every word z:
 *   - bits that are in refahead but not in refbehind are removed from dif,
 *   - the node's set and regdiff bits are or'ed in,
 *   - the result is stored back into regdiff, and the global counter
 *     `change` is incremented whenever a word was actually modified.
 *
 * The walk along s1 stops at a node whose active flag is already set.
 * Otherwise the node is flagged active, the bits in calahead but not
 * in calbehind are removed from dif, and s2 (if any) is processed
 * recursively before moving on to s1.
 */
void
synch(Reg *r, Bits dif)
{
	Reg *r1;
	int z;

	for(r1 = r; r1 != R; r1 = r1->s1) {
		for(z=0; z<BITS; z++) {
			dif.b[z] = (dif.b[z] &
				~(~r1->refbehind.b[z] & r1->refahead.b[z])) |
					r1->set.b[z] | r1->regdiff.b[z];
			// record that regdiff grew/changed so the caller can iterate
			if(dif.b[z] != r1->regdiff.b[z]) {
				r1->regdiff.b[z] = dif.b[z];
				change++;
			}
		}
		// already visited: regdiff has been merged above, nothing more to follow
		if(r1->active)
			break;
		r1->active = 1;
		for(z=0; z<BITS; z++)
			dif.b[z] &= ~(~r1->calbehind.b[z] & r1->calahead.b[z]);
		if(r1->s2 != R)
			synch(r1->s2, dif);
	}
}
  1241  
  1242  uint32
  1243  allreg(uint32 b, Rgn *r)
  1244  {
  1245  	Var *v;
  1246  	int i;
  1247  
  1248  	v = var + r->varno;
  1249  	r->regno = 0;
  1250  	switch(v->etype) {
  1251  
  1252  	default:
  1253  		fatal("unknown etype %d/%E", bitno(b), v->etype);
  1254  		break;
  1255  
  1256  	case TINT8:
  1257  	case TUINT8:
  1258  	case TINT16:
  1259  	case TUINT16:
  1260  	case TINT32:
  1261  	case TUINT32:
  1262  	case TINT64:
  1263  	case TINT:
  1264  	case TUINT:
  1265  	case TUINTPTR:
  1266  	case TBOOL:
  1267  	case TPTR32:
  1268  		i = BtoR(~b);
  1269  		if(i && r->cost > 0) {
  1270  			r->regno = i;
  1271  			return RtoB(i);
  1272  		}
  1273  		break;
  1274  
  1275  	case TFLOAT32:
  1276  	case TFLOAT64:
  1277  		if(!use_sse)
  1278  			break;
  1279  		i = BtoF(~b);
  1280  		if(i && r->cost > 0) {
  1281  			r->regno = i;
  1282  			return FtoB(i);
  1283  		}
  1284  		break;
  1285  	}
  1286  	return 0;
  1287  }
  1288  
/*
 * paint1 visits the part of the flow graph around r in which variable
 * bit bn is referenced, sets that bit in act on every node it visits,
 * and accumulates a score for the region in the global `change`.
 *
 * z and bb select the word and the bit of bn inside a Bits value.
 * Nothing is done if r is already marked in act.
 *
 * Scoring (each term is weighted by the node's loop field):
 *   - CLOAD is subtracted where the LOAD(r) test at the region entry hits,
 *   - CREF is added for a use1 reference and for a use2/set reference,
 *   - CLOAD is subtracted where STORE(r) & regdiff hits,
 *   - change is forced to -CINF for AFMOVL/AFMOVW instructions when
 *     BtoR(bb) != D_F0.
 * LOAD and STORE are macros defined outside this block.
 */
void
paint1(Reg *r, int bn)
{
	Reg *r1;
	Prog *p;
	int z;
	uint32 bb;

	z = bn/32;
	bb = 1L<<(bn%32);
	if(r->act.b[z] & bb)
		return;
	// Back up along p1 while the bit is referenced behind r, ahead of
	// the predecessor, and the predecessor has not been painted yet.
	for(;;) {
		if(!(r->refbehind.b[z] & bb))
			break;
		r1 = r->p1;
		if(r1 == R)
			break;
		if(!(r1->refahead.b[z] & bb))
			break;
		if(r1->act.b[z] & bb)
			break;
		r = r1;
	}

	// The mask excludes the case where the node sets the bit without
	// using it (set & ~(use1|use2)).
	if(LOAD(r) & ~(r->set.b[z]&~(r->use1.b[z]|r->use2.b[z])) & bb) {
		change -= CLOAD * r->loop;
	}
	// Walk forward along s1, recursing into p2 predecessors and the
	// s2 successor that also reference the bit.
	for(;;) {
		r->act.b[z] |= bb;
		p = r->prog;

		if(r->use1.b[z] & bb) {
			change += CREF * r->loop;
			// NOTE(review): bb is a variable bit here, while BtoR elsewhere
			// in this file takes a register mask -- confirm intent.
			if(p->as == AFMOVL || p->as == AFMOVW)
				if(BtoR(bb) != D_F0)
					change = -CINF;
		}

		if((r->use2.b[z]|r->set.b[z]) & bb) {
			change += CREF * r->loop;
			if(p->as == AFMOVL || p->as == AFMOVW)
				if(BtoR(bb) != D_F0)
					change = -CINF;
		}

		if(STORE(r) & r->regdiff.b[z] & bb) {
			change -= CLOAD * r->loop;
			if(p->as == AFMOVL || p->as == AFMOVW)
				if(BtoR(bb) != D_F0)
					change = -CINF;
		}

		if(r->refbehind.b[z] & bb)
			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
				if(r1->refahead.b[z] & bb)
					paint1(r1, bn);

		// the bit is not referenced past this node: region ends here
		if(!(r->refahead.b[z] & bb))
			break;
		r1 = r->s2;
		if(r1 != R)
			if(r1->refbehind.b[z] & bb)
				paint1(r1, bn);
		r = r->s1;
		if(r == R)
			break;
		if(r->act.b[z] & bb)
			break;
		if(!(r->refbehind.b[z] & bb))
			break;
	}
}
  1362  
  1363  uint32
  1364  regset(Reg *r, uint32 bb)
  1365  {
  1366  	uint32 b, set;
  1367  	Adr v;
  1368  	int c;
  1369  
  1370  	set = 0;
  1371  	v = zprog.from;
  1372  	while(b = bb & ~(bb-1)) {
  1373  		v.type = b & 0xFF ? BtoR(b): BtoF(b);
  1374  		c = copyu(r->prog, &v, A);
  1375  		if(c == 3)
  1376  			set |= b;
  1377  		bb &= ~b;
  1378  	}
  1379  	return set;
  1380  }
  1381  
  1382  uint32
  1383  reguse(Reg *r, uint32 bb)
  1384  {
  1385  	uint32 b, set;
  1386  	Adr v;
  1387  	int c;
  1388  
  1389  	set = 0;
  1390  	v = zprog.from;
  1391  	while(b = bb & ~(bb-1)) {
  1392  		v.type = b & 0xFF ? BtoR(b): BtoF(b);
  1393  		c = copyu(r->prog, &v, A);
  1394  		if(c == 1 || c == 2 || c == 4)
  1395  			set |= b;
  1396  		bb &= ~b;
  1397  	}
  1398  	return set;
  1399  }
  1400  
/*
 * paint2 walks the same kind of region as paint1 for variable bit bn,
 * but in the opposite sense: it only visits nodes whose act bit is
 * set, clears that bit as it goes, and collects the registers in use.
 *
 * The result starts as regbits and has the regu mask of every visited
 * node or'ed in (including the results of the recursive calls).
 * If r is not marked in act, regbits is returned unchanged.
 */
uint32
paint2(Reg *r, int bn)
{
	Reg *r1;
	int z;
	uint32 bb, vreg, x;

	z = bn/32;
	bb = 1L << (bn%32);
	vreg = regbits;
	if(!(r->act.b[z] & bb))
		return vreg;
	// Back up along p1 while the predecessor is still painted.
	for(;;) {
		if(!(r->refbehind.b[z] & bb))
			break;
		r1 = r->p1;
		if(r1 == R)
			break;
		if(!(r1->refahead.b[z] & bb))
			break;
		if(!(r1->act.b[z] & bb))
			break;
		r = r1;
	}
	// Walk forward, un-painting nodes and gathering their regu bits.
	for(;;) {
		r->act.b[z] &= ~bb;

		vreg |= r->regu;

		if(r->refbehind.b[z] & bb)
			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
				if(r1->refahead.b[z] & bb)
					vreg |= paint2(r1, bn);

		if(!(r->refahead.b[z] & bb))
			break;
		r1 = r->s2;
		if(r1 != R)
			if(r1->refbehind.b[z] & bb)
				vreg |= paint2(r1, bn);
		r = r->s1;
		if(r == R)
			break;
		if(!(r->act.b[z] & bb))
			break;
		if(!(r->refbehind.b[z] & bb))
			break;
	}

	// Continue along s1 from where the walk stopped (r may be nil).
	// bb is reused as a register mask from here on: for regu bits not
	// yet in bb, reguse() results are added to vreg and regset()
	// results are added to bb so they are not examined again.
	bb = vreg;
	for(; r; r=r->s1) {
		x = r->regu & ~bb;
		if(x) {
			vreg |= reguse(r, x);
			bb |= regset(r, x);
		}
	}
	return vreg;
}
  1460  
/*
 * paint3 walks the same region as paint1 for variable bit bn and
 * rewrites the instructions in it to use register rn.
 *
 *   bn  variable bit number
 *   rb  register bit mask or'ed into regu of every visited node
 *   rn  register number handed to addreg/addmove
 *
 * At the region entry addmove(r, bn, rn, 0) is called when the LOAD(r)
 * test hits; at each node the from operand (use1) and/or the to
 * operand (use2 or set) is replaced through addreg, and
 * addmove(r, bn, rn, 1) is called when the STORE(r) & regdiff test hits.
 * Visited nodes are marked in act; nothing is done if r is already
 * marked.  With both debug['R'] and debug['v'] set, each rewritten
 * instruction is printed before and after the change.
 */
void
paint3(Reg *r, int bn, int32 rb, int rn)
{
	Reg *r1;
	Prog *p;
	int z;
	uint32 bb;

	z = bn/32;
	bb = 1L << (bn%32);
	if(r->act.b[z] & bb)
		return;
	// Back up along p1 to the start of the region.
	for(;;) {
		if(!(r->refbehind.b[z] & bb))
			break;
		r1 = r->p1;
		if(r1 == R)
			break;
		if(!(r1->refahead.b[z] & bb))
			break;
		if(r1->act.b[z] & bb)
			break;
		r = r1;
	}

	if(LOAD(r) & ~(r->set.b[z] & ~(r->use1.b[z]|r->use2.b[z])) & bb)
		addmove(r, bn, rn, 0);
	// Walk forward along s1, recursing into p2 predecessors and the
	// s2 successor that also reference the bit.
	for(;;) {
		r->act.b[z] |= bb;
		p = r->prog;

		if(r->use1.b[z] & bb) {
			if(debug['R'] && debug['v'])
				print("%P", p);
			addreg(&p->from, rn);
			if(debug['R'] && debug['v'])
				print(" ===change== %P\n", p);
		}
		if((r->use2.b[z]|r->set.b[z]) & bb) {
			if(debug['R'] && debug['v'])
				print("%P", p);
			addreg(&p->to, rn);
			if(debug['R'] && debug['v'])
				print(" ===change== %P\n", p);
		}

		if(STORE(r) & r->regdiff.b[z] & bb)
			addmove(r, bn, rn, 1);
		// remember that this node now occupies the register
		r->regu |= rb;

		if(r->refbehind.b[z] & bb)
			for(r1 = r->p2; r1 != R; r1 = r1->p2link)
				if(r1->refahead.b[z] & bb)
					paint3(r1, bn, rb, rn);

		if(!(r->refahead.b[z] & bb))
			break;
		r1 = r->s2;
		if(r1 != R)
			if(r1->refbehind.b[z] & bb)
				paint3(r1, bn, rb, rn);
		r = r->s1;
		if(r == R)
			break;
		if(r->act.b[z] & bb)
			break;
		if(!(r->refbehind.b[z] & bb))
			break;
	}
}
  1531  
  1532  void
  1533  addreg(Adr *a, int rn)
  1534  {
  1535  
  1536  	a->sym = 0;
  1537  	a->offset = 0;
  1538  	a->type = rn;
  1539  
  1540  	ostats.ncvtreg++;
  1541  }
  1542  
  1543  int32
  1544  RtoB(int r)
  1545  {
  1546  
  1547  	if(r < D_AX || r > D_DI)
  1548  		return 0;
  1549  	return 1L << (r-D_AX);
  1550  }
  1551  
  1552  int
  1553  BtoR(int32 b)
  1554  {
  1555  
  1556  	b &= 0xffL;
  1557  	if(b == 0)
  1558  		return 0;
  1559  	return bitno(b) + D_AX;
  1560  }
  1561  
  1562  int32
  1563  FtoB(int f)
  1564  {
  1565  	if(f < D_X0 || f > D_X7)
  1566  		return 0;
  1567  	return 1L << (f - D_X0 + 8);
  1568  }
  1569  
  1570  int
  1571  BtoF(int32 b)
  1572  {
  1573  	b &= 0xFF00L;
  1574  	if(b == 0)
  1575  		return 0;
  1576  	return bitno(b) - 8 + D_X0;
  1577  }
  1578  
  1579  void
  1580  dumpone(Reg *r)
  1581  {
  1582  	int z;
  1583  	Bits bit;
  1584  
  1585  	print("%d:%P", r->loop, r->prog);
  1586  	for(z=0; z<BITS; z++)
  1587  		bit.b[z] =
  1588  			r->set.b[z] |
  1589  			r->use1.b[z] |
  1590  			r->use2.b[z] |
  1591  			r->refbehind.b[z] |
  1592  			r->refahead.b[z] |
  1593  			r->calbehind.b[z] |
  1594  			r->calahead.b[z] |
  1595  			r->regdiff.b[z] |
  1596  			r->act.b[z] |
  1597  				0;
  1598  	if(bany(&bit)) {
  1599  		print("\t");
  1600  		if(bany(&r->set))
  1601  			print(" s:%Q", r->set);
  1602  		if(bany(&r->use1))
  1603  			print(" u1:%Q", r->use1);
  1604  		if(bany(&r->use2))
  1605  			print(" u2:%Q", r->use2);
  1606  		if(bany(&r->refbehind))
  1607  			print(" rb:%Q ", r->refbehind);
  1608  		if(bany(&r->refahead))
  1609  			print(" ra:%Q ", r->refahead);
  1610  		if(bany(&r->calbehind))
  1611  			print(" cb:%Q ", r->calbehind);
  1612  		if(bany(&r->calahead))
  1613  			print(" ca:%Q ", r->calahead);
  1614  		if(bany(&r->regdiff))
  1615  			print(" d:%Q ", r->regdiff);
  1616  		if(bany(&r->act))
  1617  			print(" a:%Q ", r->act);
  1618  	}
  1619  	print("\n");
  1620  }
  1621  
  1622  void
  1623  dumpit(char *str, Reg *r0)
  1624  {
  1625  	Reg *r, *r1;
  1626  
  1627  	print("\n%s\n", str);
  1628  	for(r = r0; r != R; r = r->link) {
  1629  		dumpone(r);
  1630  		r1 = r->p2;
  1631  		if(r1 != R) {
  1632  			print("	pred:");
  1633  			for(; r1 != R; r1 = r1->p2link)
  1634  				print(" %.4ud", r1->prog->loc);
  1635  			print("\n");
  1636  		}
  1637  //		r1 = r->s1;
  1638  //		if(r1 != R) {
  1639  //			print("	succ:");
  1640  //			for(; r1 != R; r1 = r1->s1)
  1641  //				print(" %.4ud", r1->prog->loc);
  1642  //			print("\n");
  1643  //		}
  1644  	}
  1645  }
  1646  
  1647  static Sym*	symlist[10];
  1648  
  1649  int
  1650  noreturn(Prog *p)
  1651  {
  1652  	Sym *s;
  1653  	int i;
  1654  
  1655  	if(symlist[0] == S) {
  1656  		symlist[0] = pkglookup("panicindex", runtimepkg);
  1657  		symlist[1] = pkglookup("panicslice", runtimepkg);
  1658  		symlist[2] = pkglookup("throwinit", runtimepkg);
  1659  		symlist[3] = pkglookup("panic", runtimepkg);
  1660  		symlist[4] = pkglookup("panicwrap", runtimepkg);
  1661  	}
  1662  
  1663  	s = p->to.sym;
  1664  	if(s == S)
  1665  		return 0;
  1666  	for(i=0; symlist[i]!=S; i++)
  1667  		if(s == symlist[i])
  1668  			return 1;
  1669  	return 0;
  1670  }
  1671  
  1672  /*
  1673   * the code generator depends on being able to write out JMP
  1674   * instructions that it can jump to now but fill in later.
  1675   * the linker will resolve them nicely, but they make the code
  1676   * longer and more difficult to follow during debugging.
  1677   * remove them.
  1678   */
  1679  
  1680  /* what instruction does a JMP to p eventually land on? */
  1681  static Prog*
  1682  chasejmp(Prog *p, int *jmploop)
  1683  {
  1684  	int n;
  1685  
  1686  	n = 0;
  1687  	while(p != P && p->as == AJMP && p->to.type == D_BRANCH) {
  1688  		if(++n > 10) {
  1689  			*jmploop = 1;
  1690  			break;
  1691  		}
  1692  		p = p->to.u.branch;
  1693  	}
  1694  	return p;
  1695  }
  1696  
  1697  /*
  1698   * reuse reg pointer for mark/sweep state.
  1699   * leave reg==nil at end because alive==nil.
  1700   */
  1701  #define alive ((void*)0)
  1702  #define dead ((void*)1)
  1703  
  1704  /* mark all code reachable from firstp as alive */
  1705  static void
  1706  mark(Prog *firstp)
  1707  {
  1708  	Prog *p;
  1709  	
  1710  	for(p=firstp; p; p=p->link) {
  1711  		if(p->reg != dead)
  1712  			break;
  1713  		p->reg = alive;
  1714  		if(p->as != ACALL && p->to.type == D_BRANCH && p->to.u.branch)
  1715  			mark(p->to.u.branch);
  1716  		if(p->as == AJMP || p->as == ARET || p->as == AUNDEF)
  1717  			break;
  1718  	}
  1719  }
  1720  
  1721  static void
  1722  fixjmp(Prog *firstp)
  1723  {
  1724  	int jmploop;
  1725  	Prog *p, *last;
  1726  	
  1727  	if(debug['R'] && debug['v'])
  1728  		print("\nfixjmp\n");
  1729  
  1730  	// pass 1: resolve jump to AJMP, mark all code as dead.
  1731  	jmploop = 0;
  1732  	for(p=firstp; p; p=p->link) {
  1733  		if(debug['R'] && debug['v'])
  1734  			print("%P\n", p);
  1735  		if(p->as != ACALL && p->to.type == D_BRANCH && p->to.u.branch && p->to.u.branch->as == AJMP) {
  1736  			p->to.u.branch = chasejmp(p->to.u.branch, &jmploop);
  1737  			if(debug['R'] && debug['v'])
  1738  				print("->%P\n", p);
  1739  		}
  1740  		p->reg = dead;
  1741  	}
  1742  	if(debug['R'] && debug['v'])
  1743  		print("\n");
  1744  
  1745  	// pass 2: mark all reachable code alive
  1746  	mark(firstp);
  1747  	
  1748  	// pass 3: delete dead code (mostly JMPs).
  1749  	last = nil;
  1750  	for(p=firstp; p; p=p->link) {
  1751  		if(p->reg == dead) {
  1752  			if(p->link == P && p->as == ARET && last && last->as != ARET) {
  1753  				// This is the final ARET, and the code so far doesn't have one.
  1754  				// Let it stay.
  1755  			} else {
  1756  				if(debug['R'] && debug['v'])
  1757  					print("del %P\n", p);
  1758  				continue;
  1759  			}
  1760  		}
  1761  		if(last)
  1762  			last->link = p;
  1763  		last = p;
  1764  	}
  1765  	last->link = P;
  1766  	
  1767  	// pass 4: elide JMP to next instruction.
  1768  	// only safe if there are no jumps to JMPs anymore.
  1769  	if(!jmploop) {
  1770  		last = nil;
  1771  		for(p=firstp; p; p=p->link) {
  1772  			if(p->as == AJMP && p->to.type == D_BRANCH && p->to.u.branch == p->link) {
  1773  				if(debug['R'] && debug['v'])
  1774  					print("del %P\n", p);
  1775  				continue;
  1776  			}
  1777  			if(last)
  1778  				last->link = p;
  1779  			last = p;
  1780  		}
  1781  		last->link = P;
  1782  	}
  1783  	
  1784  	if(debug['R'] && debug['v']) {
  1785  		print("\n");
  1786  		for(p=firstp; p; p=p->link)
  1787  			print("%P\n", p);
  1788  		print("\n");
  1789  	}
  1790  }
  1791  
  1792  static uint32
  1793  fnv1(Sym *sym)
  1794  {
  1795  	uint32 h;
  1796  	char *s;
  1797  
  1798  	h = 2166136261U;
  1799  	for(s=sym->name;*s;s++) {
  1800  		h = (16777619 * h) ^ (uint32)(uint8)(*s);
  1801  	}
  1802  	return h;
  1803  }
  1804  
  1805  static uint16
  1806  hash32to16(uint32 h)
  1807  {
  1808  	return (h & 0xffff) ^ (h >> 16);
  1809  }
  1810  
  1811  /*
  1812   * fixtemp eliminates sequences like:
  1813   *   MOV reg1, mem
  1814   *   OP mem, reg2
  1815   * when mem is a stack variable which is not mentioned
  1816   * anywhere else. The instructions are replaced by
  1817   *   OP reg1, reg2
  1818   * this reduces the number of variables that the register optimizer
  1819   * sees, which lets it do a better job and makes it less likely to turn
  1820   * itself off.
  1821   */
  1822  static void
  1823  fixtemp(Prog *firstp)
  1824  {
  1825  	static uint8 counts[1<<16]; // A hash table to count variable occurences.
  1826  	int i;
  1827  	Prog *p, *p2;
  1828  	uint32 h;
  1829  
  1830  	if(debug['R'] && debug['v'])
  1831  		print("\nfixtemp\n");
  1832  
  1833  	// Count variable references. We actually use a hashtable so this
  1834  	// is only approximate.
  1835  	for(i=0; i<nelem(counts); i++)
  1836  		counts[i] = 0;
  1837  	for(p=firstp; p!=P; p=p->link) {
  1838  		if(p->from.type == D_AUTO) {
  1839  			h = hash32to16(fnv1(p->from.sym));
  1840  			//print("seen %S hash %d\n", p->from.sym, hash32to16(h));
  1841  			if(counts[h] < 10)
  1842  				counts[h]++;
  1843  		}
  1844  		if(p->to.type == D_AUTO) {
  1845  			h = hash32to16(fnv1(p->to.sym));
  1846  			//print("seen %S hash %d\n", p->to.sym, hash32to16(h));
  1847  			if(counts[h] < 10)
  1848  				counts[h]++;
  1849  		}
  1850  	}
  1851  
  1852  	// Eliminate single-write, single-read stack variables.
  1853  	for(p=firstp; p!=P; p=p->link) {
  1854  		if(debug['R'] && debug['v'])
  1855  			print("%P\n", p);
  1856  		if(p->link == P || p->to.type != D_AUTO)
  1857  			continue;
  1858  		if(isfloat[p->to.etype] && FtoB(p->from.type)) {
  1859  			switch(p->as) {
  1860  			case AMOVSS:
  1861  			case AMOVSD:
  1862  				break;
  1863  			default:
  1864  				continue;
  1865  			}
  1866  		} else if(!isfloat[p->to.etype] && RtoB(p->from.type)) {
  1867  			switch(p->as) {
  1868  			case AMOVB:
  1869  				if(p->to.width == 1)
  1870  					break;
  1871  			case AMOVW:
  1872  				if(p->to.width == 2)
  1873  					break;
  1874  			case AMOVL:
  1875  				if(p->to.width == 4)
  1876  					break;
  1877  			default:
  1878  				continue;
  1879  			}
  1880  		} else
  1881  			continue;
  1882  		// p is a MOV reg, mem.
  1883  		p2 = p->link;
  1884  		h = hash32to16(fnv1(p->to.sym));
  1885  		if(counts[h] != 2) {
  1886  			continue;
  1887  		}
  1888  		switch(p2->as) {
  1889  		case ALEAL:
  1890  		case AFMOVD:
  1891  		case AFMOVF:
  1892  		case AFMOVL:
  1893  		case AFMOVW:
  1894  		case AFMOVV:
  1895  			// funny
  1896  			continue;
  1897  		}
  1898  		// p2 is OP mem, reg2
  1899  		// and OP is not a funny instruction.
  1900  		if(p2->from.sym == p->to.sym
  1901  			&& p2->from.offset == p->to.offset
  1902  			&& p2->from.type == p->to.type) {
  1903  			if(debug['R'] && debug['v']) {
  1904  				print(" ===elide== %D\n", &p->to);
  1905  				print("%P", p2);
  1906  			}
  1907  			// p2 is OP mem, reg2.
  1908  			// change to OP reg, reg2 and
  1909  			// eliminate the mov.
  1910  			p2->from = p->from;
  1911  			*p = *p2;
  1912  			p->link = p2->link;
  1913  			if(debug['R'] && debug['v']) {
  1914  				print(" ===change== %P\n", p);
  1915  			}
  1916  		}
  1917  	}
  1918  }