github.com/flyinox/gosm@v0.0.0-20171117061539-16768cb62077/src/math/big/arith_s390x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go,s390x
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 // func hasVectorFacility(): writes a 1-byte result, 1 if vector instructions are installed and usable, else 0
    13          MOVD    $x-24(SP), R1     // R1 = address of the 24-byte frame buffer
    14          XC      $24, 0(R1), 0(R1) // clear the storage
    15          MOVD    $2, R0            // R0 is the number of double words stored -1
    16          WORD    $0xB2B01000       // STFLE 0(R1)
    17          XOR     R0, R0            // reset the value of R0
    18          MOVBZ   z-8(SP), R1       // byte 16 of the stored list (the buffer starts at -24(SP))
    19          AND     $0x40, R1         // keep mask 0x40 of that byte: list bit 129, counting from the most significant bit
    20          BEQ     novector          // bit clear: report no vector support
    21  vectorinstalled:
    22          // check if the vector instruction has been enabled
    23          VLEIB   $0, $0xF, V16     // write 0xF into byte element 0 of V16
    24          VLGVB   $0, V16, R1       // read byte element 0 of V16 back into R1
    25          CMPBNE  R1, $0xF, novector // value did not survive the round trip: report no vector support
    26          MOVB    $1, ret+0(FP) // have vx
    27          RET
    28  novector:
    29          MOVB    $0, ret+0(FP) // no vx
    30          RET
    31  
    32  TEXT ·mulWW(SB),NOSPLIT,$0	// func mulWW(x, y Word) (z1, z0 Word): stores the two-word product of x and y
    33  	MOVD	x+0(FP), R3	// R3 = x
    34  	MOVD	y+8(FP), R4	// R4 = y
    35  	MULHDU	R3, R4		// NOTE(review): the stores below read R10/R11 rather than R4 - presumably MULHDU leaves the product words in those registers; confirm against the assembler
    36  	MOVD	R10, z1+16(FP)	// z1 is taken from R10
    37  	MOVD	R11, z0+24(FP)	// z0 is taken from R11
    38  	RET
    39  
    40  // func divWW(x1, x0, y Word) (q, r Word): divides the register pair R10:R11 (x1:x0) by y with a single DLGR
    41  TEXT ·divWW(SB),NOSPLIT,$0
    42  	MOVD	x1+0(FP), R10	// R10 = x1
    43  	MOVD	x0+8(FP), R11	// R11 = x0
    44  	MOVD	y+16(FP), R5	// R5 = y (divisor)
    45  	WORD	$0xb98700a5 // dlgr r10,r5
    46  	MOVD	R11, q+24(FP)	// quotient is read from R11
    47  	MOVD	R10, r+32(FP)	// remainder is read from R10
    48  	RET
    49  
    50  // Register map (apparently other-port name = register used here): DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
    51  // func addVV(z, x, y []Word) (c Word): jumps to whichever implementation addvectorfacility currently points at
    52  
    53  
    54  TEXT ·addVV(SB),NOSPLIT,$0
    55  	MOVD	addvectorfacility+0x00(SB),R1	// R1 = code address stored in the dispatch slot
    56  	BR	(R1)				// jump there; the arguments at FP are left untouched
    57  	
    58  TEXT ·addVV_check(SB),NOSPLIT, $0	// resolver: picks ·addVV_vec or ·addVV_novec from ·hasVX, records the choice in addvectorfacility, then jumps to it
    59  	MOVB	·hasVX(SB), R1		// R1 = hasVX flag
    60  	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
    61  	MOVD	$addvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
    62  	MOVD	$·addVV_novec(SB), R2
    63  	MOVD	R2, 0(R1)			// slot = ·addVV_novec, so later ·addVV calls skip this check
    64  	//MOVD	$·addVV_novec(SB), 0(R1)
    65  	BR	·addVV_novec(SB)
    66  vectorimpl:
    67  	MOVD	$addvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
    68  	MOVD	$·addVV_vec(SB), R2
    69  	MOVD	R2, 0(R1)			// slot = ·addVV_vec, so later ·addVV calls skip this check
    70  	//MOVD	$·addVV_vec(SB), 0(R1)
    71  	BR	·addVV_vec(SB)
    72  
    73  GLOBL addvectorfacility+0x00(SB), NOPTR, $8	// 8-byte slot holding the code address ·addVV jumps to
    74  DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)	// initial target: the resolver ·addVV_check
    75  
    76  TEXT ·addVV_vec(SB),NOSPLIT,$0	// z = x + y over z_len words, returns the carry in c; 16 words at a time with vector ops, then 4 at a time, then 1 at a time
    77  	MOVD	z_len+8(FP), R3	// R3 = n = len(z)
    78  	MOVD	x+24(FP), R8	// R8 = &x[0]
    79  	MOVD	y+48(FP), R9	// R9 = &y[0]
    80  	MOVD	z+0(FP), R2	// R2 = &z[0]
    81  
    82  	MOVD	$0, R4		// c = 0
    83  	MOVD	$0, R0		// make sure it's zero
    84  	MOVD	$0, R10		// i = 0 (kept as a byte offset)
    85  
    86  
    87  	// change the BLT below to BR to disable the unrolled loops
    88  	SUB	$4, R3		// n -= 4
    89  	BLT	v1		// if n < 0 goto v1
    90  	SUB     $12, R3                 // n -= 12 (16 in total)
    91          BLT     A1                      // if n < 0 goto A1
    92         
    93  	MOVD	R8, R5		// R5 = running pointer into x
    94  	MOVD	R9, R6		// R6 = running pointer into y
    95  	MOVD	R2, R7		// R7 = running pointer into z
    96  	// n >= 0
    97  	// regular loop body unrolled 16x
    98  	VZERO	V0			// c = 0
    99  UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
   100  	ADD	$64, R5
   101  	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   102  	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   103  
   104  
   105  	VLM	0(R6), V9, V12  	// 64-bytes into V9..V12
   106  	ADD	$64, R6
   107  	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
   108  	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
   109  
   110  	VACCCQ	V1, V9, V0, V25		// V25 = carry out of V1 + V9 + carry in V0
   111  	VACQ	V1, V9, V0, V17		// V17 = V1 + V9 + carry in V0
   112  	VACCCQ	V2, V10, V25, V26	// same pair for the next 128 bits; the carry chains V25 -> V26 -> ... -> V31 -> V0
   113  	VACQ	V2, V10, V25, V18
   114  
   115  
   116  	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
   117  	VLM	0(R6), V13, V14  	// 32-bytes into V13..V14
   118  	ADD	$32, R5
   119  	ADD	$32, R6
   120  
   121  	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   122  	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   123  	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
   124  	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
   125  
   126  	VACCCQ	V3, V11, V26, V27
   127  	VACQ	V3, V11, V26, V19
   128  	VACCCQ	V4, V12, V27, V28
   129  	VACQ	V4, V12, V27, V20
   130  
   131  	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
   132  	VLM	0(R6), V15, V16  	// 32-bytes into V15..V16
   133  	ADD	$32, R5
   134  	ADD	$32, R6
   135  
   136  	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   137  	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   138  	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
   139  	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
   140  
   141  	VACCCQ	V5, V13, V28, V29
   142  	VACQ	V5, V13, V28, V21
   143  	VACCCQ	V6, V14, V29, V30
   144  	VACQ	V6, V14, V29, V22
   145  
   146  	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   147  	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   148  	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
   149  	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
   150  
   151  	VACCCQ	V7, V15, V30, V31
   152  	VACQ	V7, V15, V30, V23
   153  	VACCCQ	V8, V16, V31, V0	//V0 has carry-over
   154  	VACQ	V8, V16, V31, V24
   155  
   156  	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
   157  	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
   158  	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
   159  	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
   160  	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
   161  	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
   162  	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
   163  	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
   164  	VSTM	V17, V24, 0(R7)  	// 128-bytes into z
   165  	ADD	$128, R7
   166  	ADD	$128, R10	// i += 16
   167  	SUB	$16,  R3	// n -= 16
   168  	BGE	UU1		// if n >= 0 goto UU1
   169  	VLGVG	$1, V0, R4	// put cf into R4
   170  	NEG	R4, R4		// save cf as 0 or -1, the form "ADDC R4, R4" below expects
   171  
   172  A1:	ADD	$12, R3		// n += 12 (undo the extra SUB $12; n is again "remaining - 4")
   173  
   174  
   175  	// change the BLT below to BR to disable the 4x unrolled loop
   176  	BLT	v1		// if n < 0 goto v1
   177  
   178  U1:	// n >= 0
   179  	// regular loop body unrolled 4x
   180  	MOVD	0(R8)(R10*1), R5
   181  	MOVD	8(R8)(R10*1), R6
   182  	MOVD	16(R8)(R10*1), R7
   183  	MOVD	24(R8)(R10*1), R1
   184  	ADDC	R4, R4		// restore CF (-1 + -1 carries out, 0 + 0 does not)
   185  	MOVD	0(R9)(R10*1), R11
   186  	ADDE	R11, R5
   187  	MOVD	8(R9)(R10*1), R11
   188  	ADDE	R11, R6
   189  	MOVD	16(R9)(R10*1), R11
   190  	ADDE	R11, R7
   191  	MOVD	24(R9)(R10*1), R11
   192  	ADDE	R11, R1
   193  	MOVD	R0, R4		// R4 = 0
   194  	ADDE	R4, R4		// save CF
   195  	NEG	R4, R4		// keep it as 0 or -1
   196  	MOVD	R5, 0(R2)(R10*1)
   197  	MOVD	R6, 8(R2)(R10*1)
   198  	MOVD	R7, 16(R2)(R10*1)
   199  	MOVD	R1, 24(R2)(R10*1)
   200  
   201  
   202  	ADD	$32, R10	// i += 4
   203  	SUB	$4,  R3		// n -= 4
   204  	BGE	U1		// if n >= 0 goto U1
   205  
   206  v1:	ADD	$4, R3		// n += 4
   207  	BLE	E1		// if n <= 0 goto E1
   208  
   209  L1:	// n > 0
   210  	ADDC	R4, R4		// restore CF
   211  	MOVD	0(R8)(R10*1), R5
   212  	MOVD	0(R9)(R10*1), R11
   213  	ADDE	R11, R5
   214  	MOVD	R5, 0(R2)(R10*1)
   215  	MOVD	R0, R4		// R4 = 0
   216  	ADDE	R4, R4		// save CF
   217  	NEG 	R4, R4		// keep it as 0 or -1
   218  
   219  	ADD	$8, R10		// i++
   220  	SUB	$1, R3		// n--
   221  	BGT	L1		// if n > 0 goto L1
   222  
   223  E1:	NEG	R4, R4		// turn the saved 0 / -1 back into c = 0 / 1
   224  	MOVD	R4, c+72(FP)	// return c
   225  	RET
   226  
   227  TEXT ·addVV_novec(SB),NOSPLIT,$0	// z = x + y over z_len words, returns the carry in c; scalar only: 4 words at a time, then 1 at a time
   228  novec:
   229  	MOVD	z_len+8(FP), R3	// R3 = n = len(z)
   230  	MOVD	x+24(FP), R8	// R8 = &x[0]
   231  	MOVD	y+48(FP), R9	// R9 = &y[0]
   232  	MOVD	z+0(FP), R2	// R2 = &z[0]
   233  
   234  	MOVD	$0, R4		// c = 0
   235  	MOVD	$0, R0		// make sure it's zero
   236  	MOVD	$0, R10		// i = 0 (kept as a byte offset)
   237  
   238  	// change the BLT below to BR to disable the unrolled loop
   239  	SUB	$4, R3		// n -= 4
   240  	BLT	v1n		// if n < 0 goto v1n
   241  U1n:	// n >= 0
   242  	// regular loop body unrolled 4x
   243  	MOVD	0(R8)(R10*1), R5
   244  	MOVD	8(R8)(R10*1), R6
   245  	MOVD	16(R8)(R10*1), R7
   246  	MOVD	24(R8)(R10*1), R1
   247  	ADDC	R4, R4		// restore CF (R4 is 0 or -1; -1 + -1 carries out)
   248  	MOVD	0(R9)(R10*1), R11
   249  	ADDE	R11, R5
   250  	MOVD	8(R9)(R10*1), R11
   251  	ADDE	R11, R6
   252  	MOVD	16(R9)(R10*1), R11
   253  	ADDE	R11, R7
   254  	MOVD	24(R9)(R10*1), R11
   255  	ADDE	R11, R1
   256  	MOVD	R0, R4		// R4 = 0
   257  	ADDE	R4, R4		// save CF
   258  	NEG	R4, R4		// keep it as 0 or -1
   259  	MOVD	R5, 0(R2)(R10*1)
   260  	MOVD	R6, 8(R2)(R10*1)
   261  	MOVD	R7, 16(R2)(R10*1)
   262  	MOVD	R1, 24(R2)(R10*1)
   263  
   264  
   265  	ADD	$32, R10	// i += 4
   266  	SUB	$4,  R3		// n -= 4
   267  	BGE	U1n		// if n >= 0 goto U1n
   268  
   269  v1n:	ADD	$4, R3		// n += 4
   270  	BLE	E1n		// if n <= 0 goto E1n
   271  
   272  L1n:	// n > 0
   273  	ADDC	R4, R4		// restore CF
   274  	MOVD	0(R8)(R10*1), R5
   275  	MOVD	0(R9)(R10*1), R11
   276  	ADDE	R11, R5
   277  	MOVD	R5, 0(R2)(R10*1)
   278  	MOVD	R0, R4		// R4 = 0
   279  	ADDE	R4, R4		// save CF
   280  	NEG 	R4, R4		// keep it as 0 or -1
   281  
   282  	ADD	$8, R10		// i++
   283  	SUB	$1, R3		// n--
   284  	BGT L1n			// if n > 0 goto L1n
   285  
   286  E1n:	NEG	R4, R4		// turn the saved 0 / -1 back into c = 0 / 1
   287  	MOVD	R4, c+72(FP)	// return c
   288  	RET
   289  
   290  
   291  TEXT ·subVV(SB),NOSPLIT,$0	// jumps to whichever implementation subvectorfacility currently points at
   292  	MOVD	subvectorfacility+0x00(SB),R1	// R1 = code address stored in the dispatch slot
   293  	BR	(R1)				// jump there; the arguments at FP are left untouched
   294  	
   295  TEXT ·subVV_check(SB),NOSPLIT,$0	// resolver: picks ·subVV_vec or ·subVV_novec from ·hasVX, records the choice in subvectorfacility, then jumps to it
   296  	MOVB	·hasVX(SB), R1		// R1 = hasVX flag
   297  	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
   298  	MOVD	$subvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
   299  	MOVD	$·subVV_novec(SB), R2
   300  	MOVD	R2, 0(R1)			// slot = ·subVV_novec, so later ·subVV calls skip this check
   301  	//MOVD	$·subVV_novec(SB), 0(R1)
   302  	BR	·subVV_novec(SB)
   303  vectorimpl:
   304  	MOVD	$subvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
   305  	MOVD    $·subVV_vec(SB), R2
   306          MOVD    R2, 0(R1)		// slot = ·subVV_vec, so later ·subVV calls skip this check
   307  	//MOVD	$·subVV_vec(SB), 0(R1)
   308  	BR	·subVV_vec(SB)
   309  
   310  GLOBL subvectorfacility+0x00(SB), NOPTR, $8	// 8-byte slot holding the code address ·subVV jumps to
   311  DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)	// initial target: the resolver ·subVV_check
   312  
   313  // Register map (apparently other-port name = register used here): DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   314  // func subVV_vec(z, x, y []Word) (c Word): z = x - y over z_len words, returns the borrow in c
   315  // (same shape as addVV_vec, using VSBCBIQ/VSBIQ and SUBC/SUBE; the vector carry is 1 for "no borrow")
   316  TEXT ·subVV_vec(SB),NOSPLIT,$0
   317  	MOVD	z_len+8(FP), R3	// R3 = n = len(z)
   318  	MOVD	x+24(FP), R8	// R8 = &x[0]
   319  	MOVD	y+48(FP), R9	// R9 = &y[0]
   320  	MOVD	z+0(FP), R2	// R2 = &z[0]
   321  	MOVD	$0, R4		// c = 0
   322  	MOVD	$0, R0		// make sure it's zero
   323  	MOVD	$0, R10		// i = 0 (kept as a byte offset)
   324  	
   325  	// change the BLT below to BR to disable the unrolled loops
   326  	SUB	$4, R3		// n -= 4
   327  	BLT	v1		// if n < 0 goto v1
   328  	SUB     $12, R3         // n -= 12 (16 in total)
   329          BLT     A1              // if n < 0 goto A1
   330  
   331  	MOVD	R8, R5		// R5 = running pointer into x
   332  	MOVD	R9, R6		// R6 = running pointer into y
   333  	MOVD	R2, R7		// R7 = running pointer into z
   334  
   335  	// n >= 0
   336  	// regular loop body unrolled 16x
   337  	VZERO	V0		// cf = 0
   338  	MOVD	$1, R4		// for 390 subtraction cf starts as 1 (no borrow)
   339  	VLVGG	$1, R4, V0	//put carry into V0
   340  
   341  UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
   342  	ADD	$64, R5
   343  	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   344  	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   345  
   346  
   347  	VLM	0(R6), V9, V12  	// 64-bytes into V9..V12
   348  	ADD	$64, R6
   349  	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
   350  	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
   351  
   352  	VSBCBIQ	V1, V9, V0, V25		// V25 = borrow indication out of V1 - V9 with borrow-in V0
   353  	VSBIQ	V1, V9, V0, V17		// V17 = V1 - V9 with borrow-in V0
   354  	VSBCBIQ	V2, V10, V25, V26	// same pair for the next 128 bits; the indication chains V25 -> V26 -> ... -> V31 -> V0
   355  	VSBIQ	V2, V10, V25, V18
   356  
   357  
   358  	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
   359  	VLM	0(R6), V13, V14  	// 32-bytes into V13..V14
   360  	ADD	$32, R5
   361  	ADD	$32, R6
   362  
   363  	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   364  	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   365  	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
   366  	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
   367  
   368  	VSBCBIQ	V3, V11, V26, V27
   369  	VSBIQ	V3, V11, V26, V19
   370  	VSBCBIQ	V4, V12, V27, V28
   371  	VSBIQ	V4, V12, V27, V20
   372  
   373  	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
   374  	VLM	0(R6), V15, V16  	// 32-bytes into V15..V16
   375  	ADD	$32, R5
   376  	ADD	$32, R6
   377  
   378  	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   379  	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   380  	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
   381  	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
   382  
   383  	VSBCBIQ	V5, V13, V28, V29
   384  	VSBIQ	V5, V13, V28, V21
   385  	VSBCBIQ	V6, V14, V29, V30
   386  	VSBIQ	V6, V14, V29, V22
   387  
   388  	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   389  	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   390  	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
   391  	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
   392  
   393  	VSBCBIQ	V7, V15, V30, V31
   394  	VSBIQ	V7, V15, V30, V23
   395  	VSBCBIQ	V8, V16, V31, V0	//V0 has carry-over
   396  	VSBIQ	V8, V16, V31, V24
   397  
   398  	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
   399  	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
   400  	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
   401  	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
   402  	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
   403  	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
   404  	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
   405  	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
   406  	VSTM	V17, V24, 0(R7)   // 128-bytes into z
   407  	ADD	$128, R7
   408  	ADD	$128, R10	// i += 16
   409  	SUB	$16,  R3	// n -= 16
   410  	BGE	UU1		// if n >= 0 goto UU1
   411  	VLGVG	$1, V0, R4	// put cf into R4
   412  	SUB	$1, R4		// save cf as 0 (no borrow) or -1 (borrow), the form the scalar loops use
   413  
   414  A1:	ADD	$12, R3		// n += 12 (undo the extra SUB $12; n is again "remaining - 4")
   415  	BLT	v1		// if n < 0 goto v1
   416  	
   417  U1:	// n >= 0
   418  	// regular loop body unrolled 4x
   419  	MOVD	0(R8)(R10*1), R5
   420  	MOVD	8(R8)(R10*1), R6
   421  	MOVD	16(R8)(R10*1), R7
   422  	MOVD	24(R8)(R10*1), R1
   423  	MOVD	R0, R11		// R11 = 0
   424  	SUBC	R4, R11		// restore CF (0 - R4 borrows exactly when R4 is -1)
   425  	MOVD	0(R9)(R10*1), R11
   426  	SUBE	R11, R5
   427  	MOVD	8(R9)(R10*1), R11
   428  	SUBE	R11, R6
   429  	MOVD	16(R9)(R10*1), R11
   430  	SUBE	R11, R7
   431  	MOVD	24(R9)(R10*1), R11
   432  	SUBE	R11, R1
   433  	MOVD	R0, R4		// R4 = 0
   434  	SUBE	R4, R4		// save CF
   435  	MOVD	R5, 0(R2)(R10*1)
   436  	MOVD	R6, 8(R2)(R10*1)
   437  	MOVD	R7, 16(R2)(R10*1)
   438  	MOVD	R1, 24(R2)(R10*1)
   439  
   440  	ADD	$32, R10	// i += 4
   441  	SUB	$4,  R3		// n -= 4
   442  	BGE	U1		// if n >= 0 goto U1
   443  
   444  v1:	ADD	$4, R3		// n += 4
   445  	BLE	E1		// if n <= 0 goto E1
   446  
   447  L1:	// n > 0
   448  	MOVD	R0, R11		// R11 = 0
   449  	SUBC	R4, R11		// restore CF
   450  	MOVD	0(R8)(R10*1), R5
   451  	MOVD	0(R9)(R10*1), R11
   452  	SUBE	R11, R5
   453  	MOVD	R5, 0(R2)(R10*1)
   454  	MOVD	R0, R4		// R4 = 0
   455  	SUBE	R4, R4		// save CF
   456  
   457  	ADD	$8, R10		// i++
   458  	SUB	$1, R3		// n--
   459  	BGT	L1		// if n > 0 goto L1
   460  
   461  E1:	NEG	R4, R4		// turn the saved 0 / -1 into c = 0 / 1
   462  	MOVD	R4, c+72(FP)	// return c
   463  	RET
   464  
   465  
   466  // Register map (apparently other-port name = register used here): DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   467  // func subVV_novec(z, x, y []Word) (c Word): z = x - y over z_len words, returns the borrow in c; scalar only
   468  // (same as addVV_novec except for SUBC/SUBE instead of ADDC/ADDE and label names)
   469  TEXT ·subVV_novec(SB),NOSPLIT,$0
   470  	MOVD z_len+8(FP), R3	// R3 = n = len(z)
   471  	MOVD x+24(FP), R8	// R8 = &x[0]
   472  	MOVD y+48(FP), R9	// R9 = &y[0]
   473  	MOVD z+0(FP), R2	// R2 = &z[0]
   474  
   475  	MOVD $0, R4		// c = 0
   476  	MOVD $0, R0		// make sure it's zero
   477  	MOVD $0, R10		// i = 0 (kept as a byte offset)
   478  
   479  	// change the BLT below to BR to disable the unrolled loop
   480  	SUB  $4, R3		// n -= 4
   481  	BLT v1			// if n < 0 goto v1
   482  
   483  U1:	// n >= 0
   484  	// regular loop body unrolled 4x
   485  	MOVD 0(R8)(R10*1), R5
   486  	MOVD 8(R8)(R10*1), R6
   487  	MOVD 16(R8)(R10*1), R7
   488  	MOVD 24(R8)(R10*1), R1
   489  	MOVD R0, R11		// R11 = 0
   490  	SUBC R4, R11		// restore CF (0 - R4 borrows exactly when R4 is -1)
   491  	MOVD 0(R9)(R10*1), R11
   492  	SUBE R11, R5
   493  	MOVD 8(R9)(R10*1), R11
   494  	SUBE R11, R6
   495  	MOVD 16(R9)(R10*1), R11
   496  	SUBE R11, R7
   497  	MOVD 24(R9)(R10*1), R11
   498  	SUBE R11, R1
   499  	MOVD R0, R4		// R4 = 0
   500  	SUBE R4, R4		// save CF
   501  	MOVD R5, 0(R2)(R10*1)
   502  	MOVD R6, 8(R2)(R10*1)
   503  	MOVD R7, 16(R2)(R10*1)
   504  	MOVD R1, 24(R2)(R10*1)
   505  
   506  
   507  	ADD  $32, R10		// i += 4
   508  	SUB  $4,  R3		// n -= 4
   509  	BGE  U1			// if n >= 0 goto U1
   510  
   511  v1:	ADD  $4, R3		// n += 4
   512  	BLE E1			// if n <= 0 goto E1
   513  
   514  L1:	// n > 0
   515  	MOVD R0, R11		// R11 = 0
   516  	SUBC R4, R11		// restore CF
   517  	MOVD 0(R8)(R10*1), R5
   518  	MOVD 0(R9)(R10*1), R11
   519  	SUBE R11, R5
   520  	MOVD R5, 0(R2)(R10*1)
   521  	MOVD R0, R4		// R4 = 0
   522  	SUBE R4, R4		// save CF
   523  
   524  	ADD  $8, R10		// i++
   525  	SUB  $1, R3		// n--
   526  	BGT L1			// if n > 0 goto L1
   527  
   528  E1:	NEG  R4, R4		// turn the saved 0 / -1 into c = 0 / 1
   529  	MOVD R4, c+72(FP)	// return c
   530  	RET
   531  
   532  TEXT ·addVW(SB),NOSPLIT,$0	// jumps to whichever implementation addwvectorfacility currently points at
   533  	MOVD	addwvectorfacility+0x00(SB),R1	// R1 = code address stored in the dispatch slot
   534  	BR	(R1)				// jump there; the arguments at FP are left untouched
   535  	
   536  TEXT ·addVW_check(SB),NOSPLIT,$0	// resolver: picks ·addVW_vec or ·addVW_novec from ·hasVX, records the choice in addwvectorfacility, then jumps to it
   537  	MOVB	·hasVX(SB), R1		// R1 = hasVX flag
   538  	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
   539  	MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
   540  	MOVD    $·addVW_novec(SB), R2
   541          MOVD    R2, 0(R1)		// slot = ·addVW_novec, so later ·addVW calls skip this check
   542  	//MOVD	$·addVW_novec(SB), 0(R1)
   543  	BR	·addVW_novec(SB)
   544  vectorimpl:
   545  	MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
   546  	MOVD    $·addVW_vec(SB), R2
   547          MOVD    R2, 0(R1)		// slot = ·addVW_vec, so later ·addVW calls skip this check
   548  	//MOVD	$·addVW_vec(SB), 0(R1)
   549  	BR	·addVW_vec(SB)
   550  
   551  GLOBL addwvectorfacility+0x00(SB), NOPTR, $8	// 8-byte slot holding the code address ·addVW jumps to
   552  DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)	// initial target: the resolver ·addVW_check
   553  
   554  
   555  // func addVW_vec(z, x []Word, y Word) (c Word): z = x + y over z_len words (y is added to the first word and the carry rippled), returns the carry in c
   556  TEXT ·addVW_vec(SB),NOSPLIT,$0
   557  	MOVD	z_len+8(FP), R3	// R3 = n = len(z)
   558  	MOVD	x+24(FP), R8	// R8 = &x[0]
   559  	MOVD	y+48(FP), R4	// c = y
   560  	MOVD	z+0(FP), R2	// R2 = &z[0]
   561  
   562  	MOVD	$0, R0		// make sure it's zero
   563  	MOVD	$0, R10		// i = 0 (kept as a byte offset)
   564  	MOVD	R8, R5		// R5 = running pointer into x for the vector loop
   565  	MOVD	R2, R7		// R7 = running pointer into z for the vector loop
   566  
   567  	// change the BLT below to BR to disable the unrolled loops
   568  	SUB	$4, R3			// n -= 4
   569  	BLT	v10			// if n < 0 goto v10
   570  	SUB	$12, R3			// n -= 12 (16 in total)
   571  	BLT	A10			// if n < 0 goto A10
   572  
   573  	// n >= 0
   574  	// regular loop body unrolled 16x
   575  
   576  	VZERO	V0			// prepare V0 to be final carry register
   577  	VZERO	V9			// to ensure upper half is zero
   578  	VLVGG	$1, R4, V9		// doubleword element 1 of V9 = y
   579  UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
   580  	ADD	$64, R5
   581  	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   582  	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   583  
   584  
   585  	VACCCQ	V1, V9, V0, V25		// V25 = carry out of V1 + V9 + carry in V0
   586  	VACQ	V1, V9, V0, V17		// V17 = V1 + V9 + carry in V0
   587  	VZERO	V9			// y has been consumed; from here on (and on later iterations) only the carry is added
   588  	VACCCQ	V2, V9, V25, V26
   589  	VACQ	V2, V9, V25, V18
   590  
   591  
   592  	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
   593  	ADD	$32, R5
   594  
   595  	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   596  	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   597  
   598  	VACCCQ	V3, V9, V26, V27
   599  	VACQ	V3, V9, V26, V19
   600  	VACCCQ	V4, V9, V27, V28
   601  	VACQ	V4, V9, V27, V20
   602  
   603  	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
   604  	ADD	$32, R5
   605  
   606  	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   607  	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   608  
   609  	VACCCQ	V5, V9, V28, V29
   610  	VACQ	V5, V9, V28, V21
   611  	VACCCQ	V6, V9, V29, V30
   612  	VACQ	V6, V9, V29, V22
   613  
   614  	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   615  	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   616  
   617  	VACCCQ	V7, V9, V30, V31
   618  	VACQ	V7, V9, V30, V23
   619  	VACCCQ	V8, V9, V31, V0	//V0 has carry-over
   620  	VACQ	V8, V9, V31, V24
   621  
   622  	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
   623  	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
   624  	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
   625  	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
   626  	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
   627  	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
   628  	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
   629  	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
   630  	VSTM	V17, V24, 0(R7)   	// 128-bytes into z
   631  	ADD	$128, R7
   632  	ADD	$128, R10		// i += 16
   633  	SUB	$16,  R3		// n -= 16
   634  	BGE	UU1		// if n >= 0 goto UU1
   635  	VLGVG	$1, V0, R4	// put cf into R4 in case we branch to v10
   636  
   637  A10:	ADD	$12, R3		// n += 12 (undo the extra SUB $12; n is again "remaining - 4")
   638  
   639  
   640  	// change the BLT below to BR to disable the 4x unrolled loop
   641  
   642  	BLT	v10		// if n < 0 goto v10
   643  
   644  
   645  U4:	// n >= 0
   646  	// regular loop body unrolled 4x
   647  	MOVD 0(R8)(R10*1), R5
   648  	MOVD 8(R8)(R10*1), R6
   649  	MOVD 16(R8)(R10*1), R7
   650  	MOVD 24(R8)(R10*1), R1
   651  	ADDC R4, R5		// x[i] + c, sets CF
   652  	ADDE R0, R6		// ripple CF through the next three words (R0 is 0)
   653  	ADDE R0, R7
   654  	ADDE R0, R1
   655  	ADDE R0, R0		// R0 = CF
   656  	MOVD R0, R4		// save CF
   657  	SUB  R0, R0		// R0 back to 0
   658  	MOVD R5, 0(R2)(R10*1)
   659  	MOVD R6, 8(R2)(R10*1)
   660  	MOVD R7, 16(R2)(R10*1)
   661  	MOVD R1, 24(R2)(R10*1)
   662  
   663  	ADD $32, R10		// i += 4 -> i +=32
   664  	SUB $4, R3		// n -= 4
   665  	BGE U4			// if n >= 0 goto U4
   666  
   667  v10:	ADD $4, R3		// n += 4
   668  	BLE E10			// if n <= 0 goto E10
   669  
   670  
   671  L4:	// n > 0
   672  	MOVD	0(R8)(R10*1), R5
   673  	ADDC	R4, R5		// x[i] + c, sets CF
   674  	ADDE	R0, R0		// R0 = CF
   675  	MOVD	R0, R4		// save CF
   676  	SUB 	R0, R0		// R0 back to 0
   677  	MOVD	R5, 0(R2)(R10*1)
   678  
   679  	ADD	$8, R10		// i++
   680  	SUB	$1, R3		// n--
   681  	BGT	L4		// if n > 0 goto L4
   682  
   683  E10:	MOVD	R4, c+56(FP)	// return c
   684  
   685  	RET
   686  
   687  
   688  TEXT ·addVW_novec(SB),NOSPLIT,$0	// z = x + y over z_len words (y added to the first word, carry rippled), returns the carry in c; scalar only
   689  // Register map (apparently other-port name = register used here): DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
   690  	MOVD z_len+8(FP), R3	// R3 = n = len(z)
   691  	MOVD x+24(FP), R8	// R8 = &x[0]
   692  	MOVD y+48(FP), R4	// c = y
   693  	MOVD z+0(FP), R2	// R2 = &z[0]
   694  	MOVD $0, R0		// make sure it's 0
   695  	MOVD $0, R10		// i = 0 (kept as a byte offset)
   696  
   697  	// change the BLT below to BR to disable the unrolled loop
   698  	SUB $4, R3		// n -= 4
   699  	BLT v4			// if n < 0 goto v4
   700  
   701  U4:	// n >= 0
   702  	// regular loop body unrolled 4x
   703  	MOVD 0(R8)(R10*1), R5
   704  	MOVD 8(R8)(R10*1), R6
   705  	MOVD 16(R8)(R10*1), R7
   706  	MOVD 24(R8)(R10*1), R1
   707  	ADDC R4, R5		// x[i] + c, sets CF
   708  	ADDE R0, R6		// ripple CF through the next three words (R0 is 0)
   709  	ADDE R0, R7
   710  	ADDE R0, R1
   711  	ADDE R0, R0		// R0 = CF
   712  	MOVD R0, R4		// save CF
   713  	SUB  R0, R0		// R0 back to 0
   714  	MOVD R5, 0(R2)(R10*1)
   715  	MOVD R6, 8(R2)(R10*1)
   716  	MOVD R7, 16(R2)(R10*1)
   717  	MOVD R1, 24(R2)(R10*1)
   718  
   719  	ADD $32, R10		// i += 4 -> i +=32
   720  	SUB $4, R3		// n -= 4
   721  	BGE U4			// if n >= 0 goto U4
   722  
   723  v4:	ADD $4, R3		// n += 4
   724  	BLE E4			// if n <= 0 goto E4
   725  
   726  L4:	// n > 0
   727  	MOVD 0(R8)(R10*1), R5
   728  	ADDC R4, R5		// x[i] + c, sets CF
   729  	ADDE R0, R0		// R0 = CF
   730  	MOVD R0, R4		// save CF
   731  	SUB  R0, R0		// R0 back to 0
   732  	MOVD R5, 0(R2)(R10*1)
   733  
   734  	ADD  $8, R10		// i++
   735  	SUB  $1, R3		// n--
   736  	BGT L4			// if n > 0 goto L4
   737  
   738  E4:	MOVD R4, c+56(FP)	// return c
   739  
   740  	RET
   741  
   742  TEXT ·subVW(SB),NOSPLIT,$0	// jumps to whichever implementation subwvectorfacility currently points at
   743  	MOVD	subwvectorfacility+0x00(SB),R1	// R1 = code address stored in the dispatch slot
   744  	BR	(R1)				// jump there; the arguments at FP are left untouched
   745  	
   746  TEXT ·subVW_check(SB),NOSPLIT,$0	// resolver: picks ·subVW_vec or ·subVW_novec from ·hasVX, records the choice in subwvectorfacility, then jumps to it
   747  	MOVB	·hasVX(SB), R1		// R1 = hasVX flag
   748  	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
   749  	MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
   750  	MOVD    $·subVW_novec(SB), R2
   751          MOVD    R2, 0(R1)		// slot = ·subVW_novec, so later ·subVW calls skip this check
   752  	//MOVD	$·subVW_novec(SB), 0(R1)
   753  	BR	·subVW_novec(SB)
   754  vectorimpl:
   755  	MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = address of the dispatch slot
   756  	MOVD    $·subVW_vec(SB), R2
   757          MOVD    R2, 0(R1)		// slot = ·subVW_vec, so later ·subVW calls skip this check
   758  	//MOVD	$·subVW_vec(SB), 0(R1)
   759  	BR	·subVW_vec(SB)
   760  
   761  GLOBL subwvectorfacility+0x00(SB), NOPTR, $8	// 8-byte slot holding the code address ·subVW jumps to
   762  DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)	// initial target: the resolver ·subVW_check
   763  
   764  // func subVW_vec(z, x []Word, y Word) (c Word): z = x - y over z_len words (y is subtracted from the first word and the borrow rippled), returns the borrow in c
   765  TEXT ·subVW_vec(SB),NOSPLIT,$0
   766  	MOVD	z_len+8(FP), R3	// R3 = n = len(z)
   767  	MOVD	x+24(FP), R8	// R8 = &x[0]
   768  	MOVD	y+48(FP), R4	// c = y
   769  	MOVD	z+0(FP), R2	// R2 = &z[0]
   770  
   771  	MOVD	$0, R0		// make sure it's zero
   772  	MOVD	$0, R10		// i = 0 (kept as a byte offset)
   773  	MOVD	R8, R5		// R5 = running pointer into x for the vector loop
   774  	MOVD	R2, R7		// R7 = running pointer into z for the vector loop
   775  
   776  	// change the BLT below to BR to disable the unrolled loops
   777  	SUB	$4, R3			// n -= 4
   778  	BLT	v11			// if n < 0 goto v11
   779  	SUB	$12, R3			// n -= 12 (16 in total)
   780  	BLT	A11			// if n < 0 goto A11
   781  
   782  	VZERO	V0
   783  	MOVD	$1, R6			// prepare V0 to be final carry register
   784  	VLVGG	$1, R6, V0		// borrow is initially "no borrow"
   785  	VZERO	V9			// to ensure upper half is zero
   786  	VLVGG	$1, R4, V9		// doubleword element 1 of V9 = y
   787  
   788  	// n >= 0
   789  	// regular loop body unrolled 16x
   790  
   791  
   792  UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
   793  	ADD	$64, R5
   794  	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
   795  	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
   796  
   797  
   798  	VSBCBIQ	V1, V9, V0, V25		// V25 = borrow indication out of V1 - V9 with borrow-in V0
   799  	VSBIQ	V1, V9, V0, V17		// V17 = V1 - V9 with borrow-in V0
   800  	VZERO	V9			// y has been consumed; from here on (and on later iterations) only the borrow is applied
   801  	VSBCBIQ	V2, V9, V25, V26
   802  	VSBIQ	V2, V9, V25, V18
   803  
   804  	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
   805  	ADD	$32, R5
   806  
   807  	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
   808  	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
   809  
   810  
   811  	VSBCBIQ	V3, V9, V26, V27
   812  	VSBIQ	V3, V9, V26, V19
   813  	VSBCBIQ	V4, V9, V27, V28
   814  	VSBIQ	V4, V9, V27, V20
   815  
   816  	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
   817  	ADD	$32, R5
   818  
   819  	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
   820  	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
   821  
   822  	VSBCBIQ	V5, V9, V28, V29
   823  	VSBIQ	V5, V9, V28, V21
   824  	VSBCBIQ	V6, V9, V29, V30
   825  	VSBIQ	V6, V9, V29, V22
   826  
   827  	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
   828  	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
   829  
   830  	VSBCBIQ	V7, V9, V30, V31
   831  	VSBIQ	V7, V9, V30, V23
   832  	VSBCBIQ	V8, V9, V31, V0	// V0 has carry-over
   833  	VSBIQ	V8, V9, V31, V24
   834  
   835  	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
   836  	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
   837  	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
   838  	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
   839  	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
   840  	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
   841  	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
   842  	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
   843  	VSTM	V17, V24, 0(R7)   	// 128-bytes into z
   844  	ADD	$128, R7
   845  	ADD	$128, R10		// i += 16
   846  	SUB	$16,  R3		// n -= 16
   847  	BGE	UU1			// if n >= 0 goto UU1
   848  	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11
   849  	SUB	$1, R4			// save cf: 1 (no borrow) -> 0, 0 (borrow) -> -1
   850  	NEG	R4, R4			// c = 0 or 1, the word the scalar loops subtract next
   851  A11:	ADD	$12, R3			// n += 12 (undo the extra SUB $12; n is again "remaining - 4")
   852  
   853  	BLT	v11			// if n < 0 goto v11
   854  
   855  	// n >= 0
   856  	// regular loop body unrolled 4x
   857  
   858  U4:	// n >= 0
   859  	// regular loop body unrolled 4x
   860  	MOVD 0(R8)(R10*1), R5
   861  	MOVD 8(R8)(R10*1), R6
   862  	MOVD 16(R8)(R10*1), R7
   863  	MOVD 24(R8)(R10*1), R1
   864  	SUBC R4, R5 //SLGR  -> SUBC
   865  	SUBE R0, R6 //SLBGR -> SUBE
   866  	SUBE R0, R7
   867  	SUBE R0, R1
   868  	SUBE R4, R4		// save CF
   869  	NEG  R4, R4		// c = 0 or 1
   870  	MOVD R5, 0(R2)(R10*1)
   871  	MOVD R6, 8(R2)(R10*1)
   872  	MOVD R7, 16(R2)(R10*1)
   873  	MOVD R1, 24(R2)(R10*1)
   874  
   875  	ADD $32, R10		// i += 4 -> i +=32
   876  	SUB $4, R3		// n -= 4
   877  	BGE U4			// if n >= 0 goto U4
   878  
   879  v11:	ADD $4, R3		// n += 4
   880  	BLE E11			// if n <= 0 goto E11
   881  
   882  L4:	// n > 0
   883  
   884  	MOVD	0(R8)(R10*1), R5
   885  	SUBC	R4, R5		// x[i] - c, sets CF
   886  	SUBE	R4, R4		// save CF
   887  	NEG	R4, R4		// c = 0 or 1
   888  	MOVD	R5, 0(R2)(R10*1)
   889  
   890  	ADD	$8, R10		// i++
   891  	SUB	$1, R3		// n--
   892  	BGT	L4		// if n > 0 goto L4
   893  
   894  E11:	MOVD	R4, c+56(FP)	// return c
   895  
   896  	RET
   897  
   898  // Register map (apparently other-port name = register used here): DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
   899  // func subVW_novec(z, x []Word, y Word) (c Word): z = x - y over z_len words (y subtracted from the first word, borrow rippled), returns the borrow in c; scalar only
   900  // (same as addVW_novec except for SUBC/SUBE instead of ADDC/ADDE and label names)
   901  TEXT ·subVW_novec(SB),NOSPLIT,$0
   902  	MOVD z_len+8(FP), R3	// R3 = n = len(z)
   903  	MOVD x+24(FP), R8	// R8 = &x[0]
   904  	MOVD y+48(FP), R4	// c = y
   905  	MOVD z+0(FP), R2	// R2 = &z[0]
   906  	MOVD $0, R0		// make sure it's 0
   907  	MOVD $0, R10		// i = 0 (kept as a byte offset)
   908  
   909  	// change the BLT below to BR to disable the unrolled loop
   910  	SUB $4, R3		// n -= 4
   911  	BLT v4			// if n < 0 goto v4
   912  
   913  U4:	// n >= 0
   914  	// regular loop body unrolled 4x
   915  	MOVD 0(R8)(R10*1), R5
   916  	MOVD 8(R8)(R10*1), R6
   917  	MOVD 16(R8)(R10*1), R7
   918  	MOVD 24(R8)(R10*1), R1
   919  	SUBC R4, R5 //SLGR  -> SUBC
   920  	SUBE R0, R6 //SLBGR -> SUBE
   921  	SUBE R0, R7
   922  	SUBE R0, R1
   923  	SUBE R4, R4		// save CF
   924  	NEG  R4, R4		// c = 0 or 1
   925  	MOVD R5, 0(R2)(R10*1)
   926  	MOVD R6, 8(R2)(R10*1)
   927  	MOVD R7, 16(R2)(R10*1)
   928  	MOVD R1, 24(R2)(R10*1)
   929  
   930  	ADD $32, R10		// i += 4 -> i +=32
   931  	SUB $4, R3		// n -= 4
   932  	BGE U4			// if n >= 0 goto U4
   933  
   934  v4:	ADD $4, R3		// n += 4
   935  	BLE E4			// if n <= 0 goto E4
   936  
   937  L4:	// n > 0
   938  	MOVD 0(R8)(R10*1), R5
   939  	SUBC R4, R5		// x[i] - c, sets CF
   940  	SUBE R4, R4		// save CF
   941  	NEG  R4, R4		// c = 0 or 1
   942  	MOVD R5, 0(R2)(R10*1)
   943  
   944  	ADD  $8, R10		// i++
   945  	SUB  $1, R3		// n--
   946  	BGT L4			// if n > 0 goto L4
   947  
   948  E4:	MOVD R4, c+56(FP)	// return c
   949  
   950  	RET
   951  
   952  // func shlVU(z, x []Word, s uint) (c Word)
        // shlVU stores x shifted left by s bits into z (len(z) words are processed)
        // and returns in c the bits shifted out of the top word x[n-1].
        // Registers: R2 = &z[0], R8 = &x[0], R5 = byte offset of the current word,
        // R4 = s, R7 = ŝ = 64-s, R10 = w1, R3 = w / result word, R6 = temp, R0 = 0.
        // s == 0 and s == 64 are handled by dedicated word-copy paths (Z80, Z864).
   953  TEXT ·shlVU(SB),NOSPLIT,$0
   954  	MOVD	z_len+8(FP), R5
   955  	MOVD	$0, R0
   956  	SUB	$1, R5             // R5 = n-1
   957  	BLT	X8b                // n <= 0: nothing to shift, return c = 0
   958  
   959  	// n > 0
   960  	MOVD	s+48(FP), R4
   961  	CMPBEQ	R0, R4, Z80	   // s == 0: plain copy
   962  	MOVD	$64, R6
   963  	CMPBEQ	R6, R4, Z864	   // s == 64: whole-word move
   964  	MOVD	z+0(FP), R2
   965  	MOVD	x+24(FP), R8
   966  	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the top word
   967  	SUB	R4, R6, R7         // ŝ = 64 - s
   968  	MOVD	(R8)(R5*1), R10    // w1 = x[n-1]
   969  	SRD	R7, R10, R3        // c = w1 >> ŝ
   970  	MOVD	R3, c+56(FP)
   971  
   972  	// no separate index register on this path: R5 itself counts down to 0
   973  	BR	E8
   974  
   975  	// i > 0 (i = R5/8 runs from n-1 down to 1)
   976  L8:	MOVD	R10, R3             // w = w1
   977  	MOVD	-8(R8)(R5*1), R10   // w1 = x[i-1]
   978  
   979  	SLD	R4,  R3             // w<<s
   980  	SRD	R7, R10, R6         // w1>>ŝ
   981  	OR 	R6, R3
   982  	MOVD	R3, (R2)(R5*1)      // z[i] = w<<s | w1>>ŝ
   983  	SUB	$8, R5              // i--
   984  
   985  E8:	CMPBGT	R5, R0, L8	    // loop while i > 0
   986  
   987  	// i == 0
   988  X8a:	SLD	R4, R10             // w1<<s
   989  	MOVD	R10, (R2)           // z[0] = w1<<s
   990  	RET
   991  
   992  X8b:	MOVD	R0, c+56(FP)        // c = 0
   993  	RET
   994  
        // s == 0: c = 0 and z[i] = x[i]. Words are copied from index 0 upwards,
        // the opposite direction to the general path above.
        // NOTE(review): confirm no caller relies on the copy direction when z and x overlap.
   995  Z80:	MOVD	z+0(FP), R2
   996  	MOVD	x+24(FP), R8
   997  	SLD	$3, R5             // R5 = (n-1)*8
   998  
   999  	MOVD	(R8), R10          // R10 = x[0]
  1000  	MOVD	$0, R3
  1001  	MOVD	R3, c+56(FP)       // c = 0
  1002  
  1003  	MOVD	$0, R1             // i = 0 (byte offset)
  1004  	BR	E8Z
  1005  
  1006  	// i < n-1
  1007  L8Z:	MOVD	R10, R3            // R3 = x[i]
  1008  	MOVD	8(R8)(R1*1), R10   // R10 = x[i+1], loaded before z[i] is written
  1009  
  1010  	MOVD	R3, (R2)(R1*1)     // z[i] = x[i]
  1011  	ADD 	$8, R1             // i++
  1012  
  1013  E8Z:	CMPBLT	R1, R5, L8Z        // loop while i < n-1
  1014  
  1015  	// i == n-1
  1016  	MOVD	R10, (R2)(R5*1)    // z[n-1] = x[n-1]
  1017  	RET
  1018  
        // s == 64: every word moves up one whole position.
  1019  Z864:	MOVD	z+0(FP), R2
  1020  	MOVD	x+24(FP), R8
  1021  	SLD	$3, R5             // R5 = (n-1)*8
  1022  	MOVD	(R8)(R5*1), R3     // R3 = x[n-1]
  1023  	MOVD	R3, c+56(FP)       // c = x[n-1]
  1024  
  1025  	BR	E864
  1026  
  1027  	// i > 0
  1028  L864:	MOVD	-8(R8)(R5*1), R3   // R3 = x[i-1]
  1029  
  1030  	MOVD	R3, (R2)(R5*1)     // z[i] = x[i-1]
  1031  	SUB	$8, R5             // i--
  1032  
  1033  E864:	CMPBGT	R5, R0, L864       // loop while i > 0
  1034  
  1035  	MOVD	R0, (R2)           // z[0] = 0
  1036  	RET
  1037  
  1038  
  1039  // Register roles: R2 = &z[0], R8 = &x[0], R5 = (n-1)*8, R1 = byte offset i*8, R4 = s, R7 = ŝ = 64-s, R10 = w1, R3 = w / result word, R6 = temp, R0 = 0. (Legacy mapping note: CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6)
  1040  // func shrVU(z, x []Word, s uint) (c Word)
        // shrVU stores x shifted right by s bits into z (len(z) words are processed)
        // and returns in c the bits shifted out of the bottom word x[0].
        // s == 0 and s == 64 are handled by dedicated word-copy paths (ZB0, ZB64).
  1041  TEXT ·shrVU(SB),NOSPLIT,$0
  1042  	MOVD	z_len+8(FP), R5
  1043  	MOVD	$0, R0
  1044  	SUB	$1, R5             // R5 = n-1
  1045  	BLT	X9b                // n <= 0: nothing to shift, return c = 0
  1046  
  1047  	// n > 0
  1048  	MOVD	s+48(FP), R4
  1049  	CMPBEQ	R0, R4, ZB0	// s == 0: plain copy
  1050  	MOVD	$64, R6
  1051  	CMPBEQ 	R6, R4, ZB64	// s == 64: whole-word move
  1052  	MOVD	z+0(FP), R2
  1053  	MOVD	x+24(FP), R8
  1054  	SLD	$3, R5		// R5 = (n-1)*8, byte offset of the top word
  1055  	SUB	R4, R6, R7	// ŝ = 64 - s
  1056  	MOVD	(R8), R10	// w1 = x[0]
  1057  	SLD	R7, R10, R3	// c = w1 << ŝ
  1058  	MOVD	R3, c+56(FP)
  1059  
  1060  	MOVD	$0, R1		// i = 0 (byte offset)
  1061  	BR 	E9
  1062  
  1063  	// i < n-1
  1064  L9:	MOVD	R10, R3		// w = w1
  1065  	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
  1066  
  1067  	SRD	R4,  R3		// w>>s
  1068  	SLD	R7, R10, R6	// w1<<ŝ
  1069  	OR	R6, R3
  1070  	MOVD	R3, (R2)(R1*1)	// z[i] = w>>s | w1<<ŝ
  1071  	ADD	$8, R1		// i++
  1072  
  1073  E9:	CMPBLT	R1, R5, L9	// loop while i < n-1
  1074  
  1075  	// i == n-1
  1076  X9a:	SRD	R4, R10		// w1>>s
  1077  	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1>>s
  1078  	RET
  1079  
  1080  X9b:	MOVD	R0, c+56(FP)	// c = 0
  1081  	RET
  1082  
        // s == 0: c = 0 and z[i] = x[i], copied from index 0 upwards.
  1083  ZB0:	MOVD	z+0(FP), R2
  1084  	MOVD	x+24(FP), R8
  1085  	SLD	$3, R5		// R5 = (n-1)*8
  1086  
  1087  	MOVD	(R8), R10	// w1 = x[0]
  1088  	MOVD	$0, R3		// nothing is shifted out when s == 0
  1089  	MOVD	R3, c+56(FP)	// c = 0
  1090  
  1091  	MOVD	$0, R1		// i = 0 (byte offset)
  1092  	BR	E9Z
  1093  
  1094  	// i < n-1
  1095  L9Z:	MOVD	R10, R3		// w = w1
  1096  	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1], loaded before z[i] is written
  1097  
  1098  	MOVD	R3, (R2)(R1*1)	// z[i] = x[i]
  1099  	ADD	$8, R1		// i++
  1100  
  1101  E9Z:	CMPBLT	R1, R5, L9Z	// loop while i < n-1
  1102  
  1103  	// i == n-1
  1104  	MOVD	R10, (R2)(R5*1)	// z[n-1] = x[n-1]
  1105  	RET
  1106  
        // s == 64: every word moves down one whole position.
  1107  ZB64:	MOVD	z+0(FP), R2
  1108  	MOVD	x+24(FP), R8
  1109  	SLD	$3, R5		// R5 = (n-1)*8
  1110  	MOVD	(R8), R3	// R3 = x[0]
  1111  	MOVD	R3, c+56(FP)	// c = x[0]
  1112  
  1113  	MOVD	$0, R1		// i = 0 (byte offset)
  1114  	BR	E964
  1115  
  1116  	// i < n-1
  1117  L964:	MOVD	8(R8)(R1*1), R3	// R3 = x[i+1]
  1118  
  1119  	MOVD	R3, (R2)(R1*1)	// z[i] = x[i+1]
  1120  	ADD	$8, R1		// i++
  1121  
  1122  E964:	CMPBLT	R1, R5, L964	// loop while i < n-1
  1123  
  1124  	// i == n-1
  1125  	MOVD	$0, R10            // top word becomes zero
  1126  	MOVD	R10, (R2)(R5*1)    // z[n-1] = 0
  1127  	RET
  1128  
  1129  // Register roles: R2 = &z[0], R8 = &x[0], R9 = y, R4 = c, R5 = n, R1 = byte offset i*8, R7 = i, R6 = x[i] then high product word, R11 = low product word, R0 = 0. (Legacy mapping note: CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i)
  1130  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
        // mulAddVWW computes z[i] = low word of (x[i]*y + c) for i = 0..len(z)-1,
        // starting with c = r and carrying the high word into the next step;
        // the final carry word is returned in c.
  1131  TEXT ·mulAddVWW(SB),NOSPLIT,$0
  1132  	MOVD	z+0(FP), R2
  1133  	MOVD	x+24(FP), R8
  1134  	MOVD	y+48(FP), R9
  1135  	MOVD	r+56(FP), R4	// c = r
  1136  	MOVD	z_len+8(FP), R5	// n = len(z)
  1137  	MOVD	$0, R1		// i*8 = 0 (byte offset used for addressing)
  1138  	MOVD	$0, R7		// i = 0 (word counter compared against n)
  1139  	MOVD	$0, R0		// make sure it's zero
  1140  	BR	E5
  1141  
  1142  L5:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
  1143  	MULHDU	R9, R6		// R6 = high word of x[i]*y; the low word is read from R11 below
  1144  	ADDC	R4, R11 	//add to low order bits: low += c, sets carry
  1145  	ADDE	R0, R6		// high += carry
  1146  	MOVD	R11, (R2)(R1*1)	// z[i] = low
  1147  	MOVD	R6, R4		// c = high
  1148  	ADD	$8, R1		// i*8 + 8
  1149  	ADD	$1, R7		// i++
  1150  
  1151  E5:	CMPBLT	R7, R5, L5	// i < n
  1152  
  1153  	MOVD	R4, c+64(FP)	// return c
  1154  	RET
  1155  
  1156  // func addMulVVW(z, x []Word, y Word) (c Word)
  1157  // Register roles: R2 = &z[0], R8 = &x[0], R9 = y, R4 = c, R5 = n, R12 = n rounded down to even, R1 = byte offset i*8, R7 = i, R6 = x[i] then high product word, R11 = low product word, R10 = z[i], R0 = 0. (Legacy mapping note: CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i)
        // addMulVVW computes z[i] = low word of (z[i] + x[i]*y + c) for i = 0..len(z)-1,
        // starting with c = 0 and carrying the high word into the next step;
        // the final carry word is returned in c.
  1158  TEXT ·addMulVVW(SB),NOSPLIT,$0
  1159  	MOVD	z+0(FP), R2
  1160  	MOVD	x+24(FP), R8
  1161  	MOVD	y+48(FP), R9
  1162  	MOVD	z_len+8(FP), R5	// n = len(z)
  1163  
  1164  	MOVD	$0, R1		// i*8 = 0
  1165  	MOVD	$0, R7		// i = 0
  1166  	MOVD	$0, R0		// make sure it's zero
  1167  	MOVD	$0, R4		// c = 0
  1168  
  1169  	MOVD	R5, R12
  1170  	AND	$-2, R12	// R12 = n with the low bit cleared: words covered by the 2x loop
  1171  	CMPBGE	R5, $2, A6	// at least two words: enter the 2x unrolled loop
  1172  	BR	E6
  1173  
        // A6: two words per iteration
  1174  A6:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
  1175  	MULHDU	R9, R6		// R6 = high word of x[i]*y; the low word is read from R11 below
  1176  	MOVD	(R2)(R1*1), R10	// R10 = z[i] (loaded after the multiply)
  1177  	ADDC	R10, R11	//add to low order bits: low += z[i]
  1178  	ADDE	R0, R6		// high += carry
  1179  	ADDC	R4, R11		// low += c
  1180  	ADDE	R0, R6		// high += carry
  1181  	MOVD	R6, R4		// c = high
  1182  	MOVD	R11, (R2)(R1*1)	// z[i] = low
  1183  
  1184  	MOVD	(8)(R8)(R1*1), R6	// R6 = x[i+1]
  1185  	MULHDU	R9, R6		// same step for word i+1
  1186  	MOVD	(8)(R2)(R1*1), R10	// R10 = z[i+1]
  1187  	ADDC	R10, R11	//add to low order bits: low += z[i+1]
  1188  	ADDE	R0, R6		// high += carry
  1189  	ADDC	R4, R11		// low += c
  1190  	ADDE	R0, R6		// high += carry
  1191  	MOVD	R6, R4		// c = high
  1192  	MOVD	R11, (8)(R2)(R1*1)	// z[i+1] = low
  1193  
  1194  	ADD	$16, R1		// byte offset advances by two words
  1195  	ADD	$2, R7		// i += 2
  1196  
  1197  	CMPBLT	R7, R12, A6	// loop while i < R12
  1198  	BR	E6
  1199  
        // L6: one word per iteration (n < 2, or the odd word left after A6)
  1200  L6:	MOVD	(R8)(R1*1), R6	// R6 = x[i]
  1201  	MULHDU	R9, R6		// R6 = high word of x[i]*y; low word read from R11
  1202  	MOVD	(R2)(R1*1), R10	// R10 = z[i]
  1203  	ADDC	R10, R11	//add to low order bits: low += z[i]
  1204  	ADDE	R0, R6		// high += carry
  1205  	ADDC	R4, R11		// low += c
  1206  	ADDE	R0, R6		// high += carry
  1207  	MOVD	R6, R4		// c = high
  1208  	MOVD	R11, (R2)(R1*1)	// z[i] = low
  1209  
  1210  	ADD	$8, R1		// i*8 + 8
  1211  	ADD	$1, R7		// i++
  1212  
  1213  E6:	CMPBLT	R7, R5, L6	// i < n
  1214  
  1215  	MOVD	R4, c+56(FP)	// return c
  1216  	RET
  1217  
  1218  // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
  1219  // Register roles: R2 = &z[0], R8 = &x[0], R9 = y, R10 = r (running remainder, high dividend word), R11 = x[i] then quotient word, R7 = i, R1 = byte offset i*8, R0 = 0. (Legacy mapping note: CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i)
        // divWVW walks x from index len(z)-1 down to 0; each step divides the
        // two-word value (r, x[i]) by y, stores the quotient word in z[i] and
        // keeps the remainder in r. r starts as xn and its final value is returned.
  1220  TEXT ·divWVW(SB),NOSPLIT,$0
  1221  	MOVD	z+0(FP), R2
  1222  	MOVD	xn+24(FP), R10	// r = xn
  1223  	MOVD	x+32(FP), R8
  1224  	MOVD	y+56(FP), R9
  1225  	MOVD	z_len+8(FP), R7	// i = len(z)
  1226  	SLD	$3, R7, R1		// i*8
  1227  	MOVD	$0, R0		// make sure it's zero
  1228  	BR	E7
  1229  
  1230  L7:	MOVD	(R8)(R1*1), R11	// low dividend word = x[i]
  1231  	WORD	$0xB98700A9	//DLGR R10,R9: the code below takes the quotient from R11 and the remainder from R10
  1232  	MOVD	R11, (R2)(R1*1)	// z[i] = quotient
  1233  
  1234  E7:	SUB	$1, R7		// i--
  1235  	SUB	$8, R1		// byte offset follows i; BGE below tests the result of this SUB
  1236  	BGE	L7		// i >= 0
  1237  
  1238  	MOVD	R10, r+64(FP)	// return r
  1239  	RET