github.com/gidoBOSSftw5731/go/src@v0.0.0-20210226122457-d24b0edbf019/math/big/arith_s390x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // +build !math_big_pure_go,s390x
     6  
     7  #include "textflag.h"
     8  
     9  // This file provides fast assembly versions for the elementary
    10  // arithmetic operations on vectors implemented in arith.go.
    11  
    12  TEXT ·mulWW(SB), NOSPLIT, $0 // z1:z0 = x * y, the double-word unsigned product of two words
    13  	MOVD   y+8(FP), R4       // R4 = y
    14  	MOVD   x+0(FP), R3       // R3 = x
    15  	MULHDU R3, R4            // the product is read back from R10 and R11 below
    16  	MOVD   R11, z0+24(FP)    // z0 = low-order word of the product
    17  	MOVD   R10, z1+16(FP)    // z1 = high-order word of the product
    18  	RET
    19  
    20  
    21  // Register mapping, amd64 name = s390x register used here: DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
    22  // func addVV(z, x, y []Word) (c Word)
    23  
    24  TEXT ·addVV(SB), NOSPLIT, $0 // dispatcher: jumps through the function pointer held in addvectorfacility
    25  	MOVD addvectorfacility+0x00(SB), R1 // R1 = current target; the DATA directive initializes it to ·addVV_check
    26  	BR   (R1)                           // a jump, not a call: the target reads addVV's own argument frame
    27  
    28  TEXT ·addVV_check(SB), NOSPLIT, $0 // resolver: stores the chosen addVV implementation in addvectorfacility, then jumps to it
    29  	MOVB   ·hasVX(SB), R1                  // R1 = hasVX flag (not defined in this listing)
    30  	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
    31  	MOVD   $addvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
    32  	MOVD   $·addVV_novec(SB), R2           // R2 = address of the scalar implementation
    33  	MOVD   R2, 0(R1)                       // slot = ·addVV_novec, so later addVV calls bypass this check
    34  
    35  	// MOVD	$·addVV_novec(SB), 0(R1)
    36  	BR ·addVV_novec(SB) // finish this call in the scalar implementation
    37  
    38  vectorimpl:
    39  	MOVD $addvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
    40  	MOVD $·addVV_vec(SB), R2             // R2 = address of the vector implementation
    41  	MOVD R2, 0(R1)                       // slot = ·addVV_vec
    42  
    43  	// MOVD	$·addVV_vec(SB), 0(R1)
    44  	BR ·addVV_vec(SB) // finish this call in the vector implementation
    45  
    46  GLOBL addvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch slot read by ·addVV
    47  DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB) // initial target: the resolver, which overwrites this slot
    48  
    49  TEXT ·addVV_vec(SB), NOSPLIT, $0 // vector-facility addVV: z[i] = x[i] + y[i] with carry propagation over z_len words; carry out returned in c
    50  	MOVD z_len+8(FP), R3 // R3 = n, words still to process
    51  	MOVD x+24(FP), R8    // R8 = &x[0]
    52  	MOVD y+48(FP), R9    // R9 = &y[0]
    53  	MOVD z+0(FP), R2     // R2 = &z[0]
    54  
    55  	MOVD $0, R4  // c = 0
    56  	MOVD $0, R0  // make sure it's zero
    57  	MOVD $0, R10 // i = 0 (R10 is a byte offset: it advances by 8 per word)
    58  
    59  	// change the BLT branches below to BR to disable the unrolled loops
    60  	SUB $4, R3  // n -= 4
    61  	BLT v1      // fewer than 4 words: go straight to the word-at-a-time loop
    62  	SUB $12, R3 // n -= 12 (R3 is now len(z) - 16)
    63  	BLT A1      // if n < 0 goto A1
    64  
    65  	MOVD R8, R5 // R5 = running x pointer for the vector loop
    66  	MOVD R9, R6 // R6 = running y pointer for the vector loop
    67  	MOVD R2, R7 // R7 = running z pointer for the vector loop
    68  
    69  	// n >= 0
    70  	// regular loop body unrolled 16x
    71  	VZERO V0 // c = 0
    72  
    73  UU1:
    74  	VLM  0(R5), V1, V4    // 64 bytes of x into V1..V4
    75  	ADD  $64, R5
    76  	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
    77  	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
    78  
    79  	VLM  0(R6), V9, V12      // 64 bytes of y into V9..V12
    80  	ADD  $64, R6
    81  	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
    82  	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
    83  
    84  	VACCCQ V1, V9, V0, V25   // V25 = carry out of V1 + V9 + carry in V0
    85  	VACQ   V1, V9, V0, V17   // V17 = V1 + V9 + carry in V0
    86  	VACCCQ V2, V10, V25, V26 // carry chain continues: V25 in, V26 out
    87  	VACQ   V2, V10, V25, V18
    88  
    89  	VLM 0(R5), V5, V6   // 32 bytes of x into V5..V6
    90  	VLM 0(R6), V13, V14 // 32 bytes of y into V13..V14
    91  	ADD $32, R5
    92  	ADD $32, R6
    93  
    94  	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
    95  	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
    96  	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
    97  	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
    98  
    99  	VACCCQ V3, V11, V26, V27
   100  	VACQ   V3, V11, V26, V19
   101  	VACCCQ V4, V12, V27, V28
   102  	VACQ   V4, V12, V27, V20
   103  
   104  	VLM 0(R5), V7, V8   // 32 bytes of x into V7..V8
   105  	VLM 0(R6), V15, V16 // 32 bytes of y into V15..V16
   106  	ADD $32, R5
   107  	ADD $32, R6
   108  
   109  	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
   110  	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
   111  	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
   112  	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
   113  
   114  	VACCCQ V5, V13, V28, V29
   115  	VACQ   V5, V13, V28, V21
   116  	VACCCQ V6, V14, V29, V30
   117  	VACQ   V6, V14, V29, V22
   118  
   119  	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
   120  	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
   121  	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
   122  	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
   123  
   124  	VACCCQ V7, V15, V30, V31
   125  	VACQ   V7, V15, V30, V23
   126  	VACCCQ V8, V16, V31, V0  // V0 has carry-over
   127  	VACQ   V8, V16, V31, V24
   128  
   129  	VPDI  $0x4, V17, V17, V17 // flip the doublewords back to memory order
   130  	VPDI  $0x4, V18, V18, V18 // flip the doublewords back to memory order
   131  	VPDI  $0x4, V19, V19, V19 // flip the doublewords back to memory order
   132  	VPDI  $0x4, V20, V20, V20 // flip the doublewords back to memory order
   133  	VPDI  $0x4, V21, V21, V21 // flip the doublewords back to memory order
   134  	VPDI  $0x4, V22, V22, V22 // flip the doublewords back to memory order
   135  	VPDI  $0x4, V23, V23, V23 // flip the doublewords back to memory order
   136  	VPDI  $0x4, V24, V24, V24 // flip the doublewords back to memory order
   137  	VSTM  V17, V24, 0(R7)     // 128-bytes into z
   138  	ADD   $128, R7
   139  	ADD   $128, R10           // i += 16
   140  	SUB   $16, R3             // n -= 16
   141  	BGE   UU1                 // if n >= 0 goto UU1
   142  	VLGVG $1, V0, R4          // put cf into R4
   143  	NEG   R4, R4              // save cf as 0 or -1, the form ADDC R4, R4 below restores it from
   144  
   145  A1:
   146  	ADD $12, R3 // n += 12 (R3 is now remaining words - 4)
   147  
   148  	// change the BLT below to BR to disable the 4x unrolled loop
   149  	BLT v1 // if n < 0 goto v1
   150  
   151  U1:  // n >= 0
   152  	// regular loop body unrolled 4x
   153  	MOVD 0(R8)(R10*1), R5
   154  	MOVD 8(R8)(R10*1), R6
   155  	MOVD 16(R8)(R10*1), R7
   156  	MOVD 24(R8)(R10*1), R1
   157  	ADDC R4, R4             // restore CF
   158  	MOVD 0(R9)(R10*1), R11
   159  	ADDE R11, R5
   160  	MOVD 8(R9)(R10*1), R11
   161  	ADDE R11, R6
   162  	MOVD 16(R9)(R10*1), R11
   163  	ADDE R11, R7
   164  	MOVD 24(R9)(R10*1), R11
   165  	ADDE R11, R1
   166  	MOVD R0, R4
   167  	ADDE R4, R4             // save CF
   168  	NEG  R4, R4             // keep it as 0 or -1 for the next restore
   169  	MOVD R5, 0(R2)(R10*1)
   170  	MOVD R6, 8(R2)(R10*1)
   171  	MOVD R7, 16(R2)(R10*1)
   172  	MOVD R1, 24(R2)(R10*1)
   173  
   174  	ADD $32, R10 // i += 4
   175  	SUB $4, R3   // n -= 4
   176  	BGE U1       // if n >= 0 goto U1
   177  
   178  v1:
   179  	ADD $4, R3 // n += 4
   180  	BLE E1     // if n <= 0 goto E1
   181  
   182  L1:  // n > 0
   183  	ADDC R4, R4            // restore CF
   184  	MOVD 0(R8)(R10*1), R5
   185  	MOVD 0(R9)(R10*1), R11
   186  	ADDE R11, R5
   187  	MOVD R5, 0(R2)(R10*1)
   188  	MOVD R0, R4
   189  	ADDE R4, R4            // save CF
   190  	NEG  R4, R4            // keep it as 0 or -1 for the next restore
   191  
   192  	ADD $8, R10 // i++
   193  	SUB $1, R3  // n--
   194  	BGT L1      // if n > 0 goto L1
   195  
   196  E1:
   197  	NEG  R4, R4       // turn the saved 0 / -1 back into 0 / 1
   198  	MOVD R4, c+72(FP) // return c
   199  	RET
   200  
   201  TEXT ·addVV_novec(SB), NOSPLIT, $0 // scalar addVV: z[i] = x[i] + y[i] with carry propagation over z_len words; carry out returned in c
   202  novec:
   203  	MOVD z_len+8(FP), R3 // R3 = n, words still to process
   204  	MOVD x+24(FP), R8    // R8 = &x[0]
   205  	MOVD y+48(FP), R9    // R9 = &y[0]
   206  	MOVD z+0(FP), R2     // R2 = &z[0]
   207  
   208  	MOVD $0, R4  // c = 0
   209  	MOVD $0, R0  // make sure it's zero
   210  	MOVD $0, R10 // i = 0 (R10 is a byte offset: it advances by 8 per word)
   211  
   212  	// change the BLT below to BR to disable the unrolled loop
   213  	SUB $4, R3 // n -= 4
   214  	BLT v1n    // if n < 0 goto v1n
   215  
   216  U1n:  // n >= 0
   217  	// regular loop body unrolled 4x
   218  	MOVD 0(R8)(R10*1), R5
   219  	MOVD 8(R8)(R10*1), R6
   220  	MOVD 16(R8)(R10*1), R7
   221  	MOVD 24(R8)(R10*1), R1
   222  	ADDC R4, R4             // restore CF
   223  	MOVD 0(R9)(R10*1), R11
   224  	ADDE R11, R5
   225  	MOVD 8(R9)(R10*1), R11
   226  	ADDE R11, R6
   227  	MOVD 16(R9)(R10*1), R11
   228  	ADDE R11, R7
   229  	MOVD 24(R9)(R10*1), R11
   230  	ADDE R11, R1
   231  	MOVD R0, R4
   232  	ADDE R4, R4             // save CF
   233  	NEG  R4, R4             // keep it as 0 or -1 for the next restore
   234  	MOVD R5, 0(R2)(R10*1)
   235  	MOVD R6, 8(R2)(R10*1)
   236  	MOVD R7, 16(R2)(R10*1)
   237  	MOVD R1, 24(R2)(R10*1)
   238  
   239  	ADD $32, R10 // i += 4
   240  	SUB $4, R3   // n -= 4
   241  	BGE U1n      // if n >= 0 goto U1n
   242  
   243  v1n:
   244  	ADD $4, R3 // n += 4
   245  	BLE E1n    // if n <= 0 goto E1n
   246  
   247  L1n:  // n > 0
   248  	ADDC R4, R4            // restore CF
   249  	MOVD 0(R8)(R10*1), R5
   250  	MOVD 0(R9)(R10*1), R11
   251  	ADDE R11, R5
   252  	MOVD R5, 0(R2)(R10*1)
   253  	MOVD R0, R4
   254  	ADDE R4, R4            // save CF
   255  	NEG  R4, R4            // keep it as 0 or -1 for the next restore
   256  
   257  	ADD $8, R10 // i++
   258  	SUB $1, R3  // n--
   259  	BGT L1n     // if n > 0 goto L1n
   260  
   261  E1n:
   262  	NEG  R4, R4       // turn the saved 0 / -1 back into 0 / 1
   263  	MOVD R4, c+72(FP) // return c
   264  	RET
   265  
   266  TEXT ·subVV(SB), NOSPLIT, $0 // dispatcher: jumps through the function pointer held in subvectorfacility
   267  	MOVD subvectorfacility+0x00(SB), R1 // R1 = current target; the DATA directive initializes it to ·subVV_check
   268  	BR   (R1)                           // a jump, not a call: the target reads subVV's own argument frame
   269  
   270  TEXT ·subVV_check(SB), NOSPLIT, $0 // resolver: stores the chosen subVV implementation in subvectorfacility, then jumps to it
   271  	MOVB   ·hasVX(SB), R1                  // R1 = hasVX flag (not defined in this listing)
   272  	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
   273  	MOVD   $subvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
   274  	MOVD   $·subVV_novec(SB), R2           // R2 = address of the scalar implementation
   275  	MOVD   R2, 0(R1)                       // slot = ·subVV_novec, so later subVV calls bypass this check
   276  
   277  	// MOVD	$·subVV_novec(SB), 0(R1)
   278  	BR ·subVV_novec(SB) // finish this call in the scalar implementation
   279  
   280  vectorimpl:
   281  	MOVD $subvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
   282  	MOVD $·subVV_vec(SB), R2             // R2 = address of the vector implementation
   283  	MOVD R2, 0(R1)                       // slot = ·subVV_vec
   284  
   285  	// MOVD	$·subVV_vec(SB), 0(R1)
   286  	BR ·subVV_vec(SB) // finish this call in the vector implementation
   287  
   288  GLOBL subvectorfacility+0x00(SB), NOPTR, $8 // 8-byte dispatch slot read by ·subVV
   289  DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) // initial target: the resolver, which overwrites this slot
   290  
   291  // Register mapping, amd64 name = s390x register used here: DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   292  // func subVV(z, x, y []Word) (c Word)
   293  // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
   294  TEXT ·subVV_vec(SB), NOSPLIT, $0 // vector-facility subVV: z[i] = x[i] - y[i] with borrow propagation over z_len words; borrow out returned in c
   295  	MOVD z_len+8(FP), R3 // R3 = n, words still to process
   296  	MOVD x+24(FP), R8    // R8 = &x[0]
   297  	MOVD y+48(FP), R9    // R9 = &y[0]
   298  	MOVD z+0(FP), R2     // R2 = &z[0]
   299  	MOVD $0, R4          // c = 0
   300  	MOVD $0, R0          // make sure it's zero
   301  	MOVD $0, R10         // i = 0 (R10 is a byte offset: it advances by 8 per word)
   302  
   303  	// change the BLT branches below to BR to disable the unrolled loops
   304  	SUB $4, R3  // n -= 4
   305  	BLT v1      // if n < 0 goto v1
   306  	SUB $12, R3 // n -= 12 (R3 is now len(z) - 16)
   307  	BLT A1      // if n < 0 goto A1
   308  
   309  	MOVD R8, R5 // R5 = running x pointer for the vector loop
   310  	MOVD R9, R6 // R6 = running y pointer for the vector loop
   311  	MOVD R2, R7 // R7 = running z pointer for the vector loop
   312  
   313  	// n >= 0
   314  	// regular loop body unrolled 16x
   315  	VZERO V0         // cf = 0
   316  	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
   317  	VLVGG $1, R4, V0 // put carry into V0
   318  
   319  UU1:
   320  	VLM  0(R5), V1, V4    // 64 bytes of x into V1..V4
   321  	ADD  $64, R5
   322  	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
   323  	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
   324  
   325  	VLM  0(R6), V9, V12      // 64 bytes of y into V9..V12
   326  	ADD  $64, R6
   327  	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
   328  	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
   329  
   330  	VSBCBIQ V1, V9, V0, V25   // V25 = borrow indication out of V1 - V9 with indication V0 in
   331  	VSBIQ   V1, V9, V0, V17   // V17 = V1 - V9 with indication V0 in
   332  	VSBCBIQ V2, V10, V25, V26 // borrow chain continues: V25 in, V26 out
   333  	VSBIQ   V2, V10, V25, V18
   334  
   335  	VLM 0(R5), V5, V6   // 32 bytes of x into V5..V6
   336  	VLM 0(R6), V13, V14 // 32 bytes of y into V13..V14
   337  	ADD $32, R5
   338  	ADD $32, R6
   339  
   340  	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
   341  	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
   342  	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
   343  	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
   344  
   345  	VSBCBIQ V3, V11, V26, V27
   346  	VSBIQ   V3, V11, V26, V19
   347  	VSBCBIQ V4, V12, V27, V28
   348  	VSBIQ   V4, V12, V27, V20
   349  
   350  	VLM 0(R5), V7, V8   // 32 bytes of x into V7..V8
   351  	VLM 0(R6), V15, V16 // 32 bytes of y into V15..V16
   352  	ADD $32, R5
   353  	ADD $32, R6
   354  
   355  	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
   356  	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
   357  	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
   358  	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
   359  
   360  	VSBCBIQ V5, V13, V28, V29
   361  	VSBIQ   V5, V13, V28, V21
   362  	VSBCBIQ V6, V14, V29, V30
   363  	VSBIQ   V6, V14, V29, V22
   364  
   365  	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
   366  	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
   367  	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
   368  	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
   369  
   370  	VSBCBIQ V7, V15, V30, V31
   371  	VSBIQ   V7, V15, V30, V23
   372  	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
   373  	VSBIQ   V8, V16, V31, V24
   374  
   375  	VPDI  $0x4, V17, V17, V17 // flip the doublewords back to memory order
   376  	VPDI  $0x4, V18, V18, V18 // flip the doublewords back to memory order
   377  	VPDI  $0x4, V19, V19, V19 // flip the doublewords back to memory order
   378  	VPDI  $0x4, V20, V20, V20 // flip the doublewords back to memory order
   379  	VPDI  $0x4, V21, V21, V21 // flip the doublewords back to memory order
   380  	VPDI  $0x4, V22, V22, V22 // flip the doublewords back to memory order
   381  	VPDI  $0x4, V23, V23, V23 // flip the doublewords back to memory order
   382  	VPDI  $0x4, V24, V24, V24 // flip the doublewords back to memory order
   383  	VSTM  V17, V24, 0(R7)     // 128-bytes into z
   384  	ADD   $128, R7
   385  	ADD   $128, R10           // i += 16
   386  	SUB   $16, R3             // n -= 16
   387  	BGE   UU1                 // if n >= 0 goto UU1
   388  	VLGVG $1, V0, R4          // put cf into R4
   389  	SUB   $1, R4              // save cf as 0 (no borrow) or -1 (borrow), the form SUBC R4, R11 below restores it from
   390  
   391  A1:
   392  	ADD $12, R3 // n += 12 (R3 is now remaining words - 4)
   393  	BLT v1      // if n < 0 goto v1
   394  
   395  U1:  // n >= 0
   396  	// regular loop body unrolled 4x
   397  	MOVD 0(R8)(R10*1), R5
   398  	MOVD 8(R8)(R10*1), R6
   399  	MOVD 16(R8)(R10*1), R7
   400  	MOVD 24(R8)(R10*1), R1
   401  	MOVD R0, R11
   402  	SUBC R4, R11            // restore CF
   403  	MOVD 0(R9)(R10*1), R11
   404  	SUBE R11, R5
   405  	MOVD 8(R9)(R10*1), R11
   406  	SUBE R11, R6
   407  	MOVD 16(R9)(R10*1), R11
   408  	SUBE R11, R7
   409  	MOVD 24(R9)(R10*1), R11
   410  	SUBE R11, R1
   411  	MOVD R0, R4
   412  	SUBE R4, R4             // save CF
   413  	MOVD R5, 0(R2)(R10*1)
   414  	MOVD R6, 8(R2)(R10*1)
   415  	MOVD R7, 16(R2)(R10*1)
   416  	MOVD R1, 24(R2)(R10*1)
   417  
   418  	ADD $32, R10 // i += 4
   419  	SUB $4, R3   // n -= 4
   420  	BGE U1       // if n >= 0 goto U1
   421  
   422  v1:
   423  	ADD $4, R3 // n += 4
   424  	BLE E1     // if n <= 0 goto E1
   425  
   426  L1:  // n > 0
   427  	MOVD R0, R11
   428  	SUBC R4, R11           // restore CF
   429  	MOVD 0(R8)(R10*1), R5
   430  	MOVD 0(R9)(R10*1), R11
   431  	SUBE R11, R5
   432  	MOVD R5, 0(R2)(R10*1)
   433  	MOVD R0, R4
   434  	SUBE R4, R4            // save CF
   435  
   436  	ADD $8, R10 // i++
   437  	SUB $1, R3  // n--
   438  	BGT L1      // if n > 0 goto L1
   439  
   440  E1:
   441  	NEG  R4, R4       // turn the saved 0 / -1 back into 0 / 1
   442  	MOVD R4, c+72(FP) // return c
   443  	RET
   444  
   445  // Register mapping, amd64 name = s390x register used here: DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   446  // func subVV(z, x, y []Word) (c Word)
   447  // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
   448  TEXT ·subVV_novec(SB), NOSPLIT, $0 // scalar subVV: z[i] = x[i] - y[i] with borrow propagation over z_len words; borrow out returned in c
   449  	MOVD z_len+8(FP), R3 // R3 = n, words still to process
   450  	MOVD x+24(FP), R8    // R8 = &x[0]
   451  	MOVD y+48(FP), R9    // R9 = &y[0]
   452  	MOVD z+0(FP), R2     // R2 = &z[0]
   453  
   454  	MOVD $0, R4  // c = 0
   455  	MOVD $0, R0  // make sure it's zero
   456  	MOVD $0, R10 // i = 0 (R10 is a byte offset: it advances by 8 per word)
   457  
   458  	// change the BLT below to BR to disable the unrolled loop
   459  	SUB $4, R3 // n -= 4
   460  	BLT v1     // if n < 0 goto v1
   461  
   462  U1:  // n >= 0
   463  	// regular loop body unrolled 4x
   464  	MOVD 0(R8)(R10*1), R5
   465  	MOVD 8(R8)(R10*1), R6
   466  	MOVD 16(R8)(R10*1), R7
   467  	MOVD 24(R8)(R10*1), R1
   468  	MOVD R0, R11
   469  	SUBC R4, R11            // restore CF
   470  	MOVD 0(R9)(R10*1), R11
   471  	SUBE R11, R5
   472  	MOVD 8(R9)(R10*1), R11
   473  	SUBE R11, R6
   474  	MOVD 16(R9)(R10*1), R11
   475  	SUBE R11, R7
   476  	MOVD 24(R9)(R10*1), R11
   477  	SUBE R11, R1
   478  	MOVD R0, R4
   479  	SUBE R4, R4             // save CF
   480  	MOVD R5, 0(R2)(R10*1)
   481  	MOVD R6, 8(R2)(R10*1)
   482  	MOVD R7, 16(R2)(R10*1)
   483  	MOVD R1, 24(R2)(R10*1)
   484  
   485  	ADD $32, R10 // i += 4
   486  	SUB $4, R3   // n -= 4
   487  	BGE U1       // if n >= 0 goto U1
   488  
   489  v1:
   490  	ADD $4, R3 // n += 4
   491  	BLE E1     // if n <= 0 goto E1
   492  
   493  L1:  // n > 0
   494  	MOVD R0, R11
   495  	SUBC R4, R11           // restore CF
   496  	MOVD 0(R8)(R10*1), R5
   497  	MOVD 0(R9)(R10*1), R11
   498  	SUBE R11, R5
   499  	MOVD R5, 0(R2)(R10*1)
   500  	MOVD R0, R4
   501  	SUBE R4, R4            // save CF
   502  
   503  	ADD $8, R10 // i++
   504  	SUB $1, R3  // n--
   505  	BGT L1      // if n > 0 goto L1
   506  
   507  E1:
   508  	NEG  R4, R4       // turn the saved 0 / -1 back into 0 / 1
   509  	MOVD R4, c+72(FP) // return c
   510  	RET
   511  
   512  TEXT ·addVW(SB), NOSPLIT, $0 // z = x + y for a single word y over z_len words; carry out returned in c
   513  	MOVD z_len+8(FP), R5 // length of z
   514  	MOVD x+24(FP), R6    // R6 = &x[0]
   515  	MOVD y+48(FP), R7    // c = y
   516  	MOVD z+0(FP), R8     // R8 = &z[0]
   517  
   518  	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
   519  
   520  	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
   521  	ADDC   0(R6), R7            // R7 = y + x[0], sets the carry
   522  	MOVD   R7, 0(R8)            // z[0] = R7
   523  	CMPBEQ R5, $1, returnResult // len(z) == 1
   524  	MOVD   $0, R9
   525  	ADDE   8(R6), R9            // R9 = x[1] + carry
   526  	MOVD   R9, 8(R8)            // z[1] = R9
   527  	CMPBEQ R5, $2, returnResult // len(z) == 2
   528  
   529  	// Update the counters
   530  	MOVD $16, R12    // i = 2
   531  	MOVD $-2(R5), R5 // n = n - 2
   532  
   533  loopOverEachWord:
   534  	BRC  $12, copySetup // carry = 0, copy the rest
   535  	MOVD $1, R9         // carry is known to be 1 here
   536  
   537  	// Originally we used the carry flag generated in the previous iteration
   538  	// (i.e: ADDE could be used here to do the addition).  However, since we
   539  	// already know carry is 1 (otherwise we will go to copy section), we can use
   540  	// ADDC here so the current iteration does not depend on the carry flag
   541  	// generated in the previous iteration. This could be useful when branch prediction happens.
   542  	ADDC 0(R6)(R12*1), R9
   543  	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
   544  
   545  	MOVD  $8(R12), R12         // i++
   546  	BRCTG R5, loopOverEachWord // n--
   547  
   548  // Return the current carry value
   549  returnResult:
   550  	MOVD $0, R0
   551  	ADDE R0, R0       // R0 = 0 + 0 + carry
   552  	MOVD R0, c+56(FP)
   553  	RET
   554  
   555  // Update position of x(R6) and z(R8) based on the current counter value and perform copying.
   556  // With the assumption that x and z will not overlap with each other or x and z will
   557  // point to same memory region, we can use a faster version of copy using only MVC here.
   558  // In the following implementation, we have three copy loops, each copying a word, 4 words, and
   559  // 32 words at a time.  Via benchmarking, this implementation is faster than calling runtime·memmove.
   560  copySetup:
   561  	ADD R12, R6 // x += i words (R12 is a byte offset)
   562  	ADD R12, R8 // z += i words
   563  
   564  	CMPBGE R5, $4, mediumLoop
   565  
   566  smallLoop:  // does a loop unrolling to copy word when n < 4
   567  	CMPBEQ R5, $0, returnZero
   568  	MVC    $8, 0(R6), 0(R8)
   569  	CMPBEQ R5, $1, returnZero
   570  	MVC    $8, 8(R6), 8(R8)
   571  	CMPBEQ R5, $2, returnZero
   572  	MVC    $8, 16(R6), 16(R8)
   573  
   574  returnZero:
   575  	MOVD $0, c+56(FP) // return 0 as carry
   576  	RET
   577  
   578  mediumLoop:
   579  	CMPBLT R5, $4, smallLoop       // fewer than 4 words left
   580  	CMPBLT R5, $32, mediumLoopBody // fewer than 32 words left
   581  
   582  largeLoop:  // Copying 256 bytes at a time.
   583  	MVC    $256, 0(R6), 0(R8)
   584  	MOVD   $256(R6), R6
   585  	MOVD   $256(R8), R8
   586  	MOVD   $-32(R5), R5      // n -= 32 words
   587  	CMPBGE R5, $32, largeLoop
   588  	BR     mediumLoop
   589  
   590  mediumLoopBody:  // Copying 32 bytes at a time
   591  	MVC    $32, 0(R6), 0(R8)
   592  	MOVD   $32(R6), R6
   593  	MOVD   $32(R8), R8
   594  	MOVD   $-4(R5), R5           // n -= 4 words
   595  	CMPBGE R5, $4, mediumLoopBody
   596  	BR     smallLoop
   597  
   598  returnC:
   599  	MOVD R7, c+56(FP) // len(z) == 0: the carry out is y itself
   600  	RET
   601  
   602  TEXT ·subVW(SB), NOSPLIT, $0 // z = x - y for a single word y over z_len words; borrow out returned in c
   603  	MOVD z_len+8(FP), R5 // length of z
   604  	MOVD x+24(FP), R6    // R6 = &x[0]
   605  	MOVD y+48(FP), R7    // The borrow bit passed in
   606  	MOVD z+0(FP), R8     // R8 = &z[0]
   607  	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.
   608  
   609  	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
   610  
   611  	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
   612  	MOVD   0(R6), R9
   613  	SUBC   R7, R9               // R9 = x[0] - y, sets the borrow
   614  	MOVD   R9, 0(R8)            // z[0] = R9
   615  	CMPBEQ R5, $1, returnResult // len(z) == 1
   616  	MOVD   8(R6), R9
   617  	SUBE   R0, R9               // R9 = x[1] - 0 - borrow
   618  	MOVD   R9, 8(R8)            // z[1] = R9
   619  	CMPBEQ R5, $2, returnResult // len(z) == 2
   620  
   621  	// Update the counters
   622  	MOVD $16, R12    // i = 2
   623  	MOVD $-2(R5), R5 // n = n - 2
   624  
   625  loopOverEachWord:
   626  	BRC  $3, copySetup    // no borrow, copy the rest
   627  	MOVD 0(R6)(R12*1), R9 // R9 = x[i]
   628  
   629  	// Originally we used the borrow flag generated in the previous iteration
   630  	// (i.e: SUBE could be used here to do the subtraction). However, since we
   631  	// already know borrow is 1 (otherwise we will go to copy section), we can
   632  	// use SUBC here so the current iteration does not depend on the borrow flag
   633  	// generated in the previous iteration. This could be useful when branch prediction happens.
   634  	SUBC $1, R9
   635  	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
   636  
   637  	MOVD  $8(R12), R12         // i++
   638  	BRCTG R5, loopOverEachWord // n--
   639  
   640  // return the current borrow value
   641  returnResult:
   642  	SUBE R0, R0       // R0 = 0 - 0 - borrow, i.e. 0 or -1
   643  	NEG  R0, R0       // R0 = 0 or 1
   644  	MOVD R0, c+56(FP)
   645  	RET
   646  
   647  // Update position of x(R6) and z(R8) based on the current counter value and perform copying.
   648  // With the assumption that x and z will not overlap with each other or x and z will
   649  // point to same memory region, we can use a faster version of copy using only MVC here.
   650  // In the following implementation, we have three copy loops, each copying a word, 4 words, and
   651  // 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
   652  copySetup:
   653  	ADD R12, R6 // x += i words (R12 is a byte offset)
   654  	ADD R12, R8 // z += i words
   655  
   656  	CMPBGE R5, $4, mediumLoop
   657  
   658  smallLoop:  // does a loop unrolling to copy word when n < 4
   659  	CMPBEQ R5, $0, returnZero
   660  	MVC    $8, 0(R6), 0(R8)
   661  	CMPBEQ R5, $1, returnZero
   662  	MVC    $8, 8(R6), 8(R8)
   663  	CMPBEQ R5, $2, returnZero
   664  	MVC    $8, 16(R6), 16(R8)
   665  
   666  returnZero:
   667  	MOVD $0, c+56(FP) // return 0 as borrow
   668  	RET
   669  
   670  mediumLoop:
   671  	CMPBLT R5, $4, smallLoop       // fewer than 4 words left
   672  	CMPBLT R5, $32, mediumLoopBody // fewer than 32 words left
   673  
   674  largeLoop:  // Copying 256 bytes at a time
   675  	MVC    $256, 0(R6), 0(R8)
   676  	MOVD   $256(R6), R6
   677  	MOVD   $256(R8), R8
   678  	MOVD   $-32(R5), R5      // n -= 32 words
   679  	CMPBGE R5, $32, largeLoop
   680  	BR     mediumLoop
   681  
   682  mediumLoopBody:  // Copying 32 bytes at a time
   683  	MVC    $32, 0(R6), 0(R8)
   684  	MOVD   $32(R6), R6
   685  	MOVD   $32(R8), R8
   686  	MOVD   $-4(R5), R5           // n -= 4 words
   687  	CMPBGE R5, $4, mediumLoopBody
   688  	BR     smallLoop
   689  
   690  returnC:
   691  	MOVD R7, c+56(FP) // len(z) == 0: the borrow out is y itself
   692  	RET
   693  
   694  // func shlVU(z, x []Word, s uint) (c Word)
   695  TEXT ·shlVU(SB), NOSPLIT, $0 // no s390x-specific code: forwards to ·shlVU_g
   696  	BR ·shlVU_g(SB) // a jump, not a call: ·shlVU_g (not defined in this listing) reads shlVU's own argument frame
   697  
   698  // func shrVU(z, x []Word, s uint) (c Word)
   699  TEXT ·shrVU(SB), NOSPLIT, $0 // no s390x-specific code: forwards to ·shrVU_g
   700  	BR ·shrVU_g(SB) // a jump, not a call: ·shrVU_g (not defined in this listing) reads shrVU's own argument frame
   701  
   702  // Register mapping, amd64 name = s390x register used here: CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i
   703  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
   704  TEXT ·mulAddVWW(SB), NOSPLIT, $0 // z[i] = low word of x[i]*y + c, with c starting at r and carrying the high word forward; final c returned
   705  	MOVD z+0(FP), R2     // R2 = &z[0]
   706  	MOVD x+24(FP), R8    // R8 = &x[0]
   707  	MOVD y+48(FP), R9    // R9 = y
   708  	MOVD r+56(FP), R4    // c = r
   709  	MOVD z_len+8(FP), R5 // R5 = n
   710  	MOVD $0, R1          // i*8 = 0 (byte offset used for addressing)
   711  	MOVD $0, R7          // i = 0 (word counter compared against n)
   712  	MOVD $0, R0          // make sure it's zero
   713  	BR   E5              // test i < n before the first iteration
   714  
   715  L5:
   716  	MOVD   (R8)(R1*1), R6  // R6 = x[i]
   717  	MULHDU R9, R6          // R6 = high word of x[i]*y; the low word is read from R11 below
   718  	ADDC   R4, R11         // add to low order bits
   719  	ADDE   R0, R6          // propagate the carry into the high word
   720  	MOVD   R11, (R2)(R1*1) // z[i] = low word
   721  	MOVD   R6, R4          // c = high word
   722  	ADD    $8, R1          // i*8 + 8
   723  	ADD    $1, R7          // i++
   724  
   725  E5:
   726  	CMPBLT R7, R5, L5 // i < n
   727  
   728  	MOVD R4, c+64(FP) // return c
   729  	RET
   730  
   731  // func addMulVVW(z, x []Word, y Word) (c Word)
   732  // Register mapping, amd64 name = s390x register used here: CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i
   733  TEXT ·addMulVVW(SB), NOSPLIT, $0 // z[i] = low word of z[i] + x[i]*y + c, carrying the high word forward in c; final c returned
   734  	MOVD z+0(FP), R2     // R2 = &z[0]
   735  	MOVD x+24(FP), R8    // R8 = &x[0]
   736  	MOVD y+48(FP), R9    // R9 = y
   737  	MOVD z_len+8(FP), R5 // R5 = n
   738  
   739  	MOVD $0, R1 // i*8 = 0
   740  	MOVD $0, R7 // i = 0
   741  	MOVD $0, R0 // make sure it's zero
   742  	MOVD $0, R4 // c = 0
   743  
   744  	MOVD   R5, R12
   745  	AND    $-2, R12    // R12 = n rounded down to an even count, the bound for the 2x unrolled loop
   746  	CMPBGE R5, $2, A6  // at least two words: run the unrolled loop first
   747  	BR     E6
   748  
   749  A6:
   750  	MOVD   (R8)(R1*1), R6  // R6 = x[i]
   751  	MULHDU R9, R6          // R6 = high word of x[i]*y; the low word is read from R11 below
   752  	MOVD   (R2)(R1*1), R10 // R10 = z[i]
   753  	ADDC   R10, R11        // add to low order bits
   754  	ADDE   R0, R6          // propagate the carry into the high word
   755  	ADDC   R4, R11         // add the incoming c to the low word
   756  	ADDE   R0, R6          // propagate the carry into the high word
   757  	MOVD   R6, R4          // c = high word
   758  	MOVD   R11, (R2)(R1*1) // z[i] = low word
   759  
   760  	MOVD   (8)(R8)(R1*1), R6  // R6 = x[i+1]
   761  	MULHDU R9, R6
   762  	MOVD   (8)(R2)(R1*1), R10 // R10 = z[i+1]
   763  	ADDC   R10, R11           // add to low order bits
   764  	ADDE   R0, R6
   765  	ADDC   R4, R11
   766  	ADDE   R0, R6
   767  	MOVD   R6, R4
   768  	MOVD   R11, (8)(R2)(R1*1) // z[i+1] = low word
   769  
   770  	ADD $16, R1 // i*8 += 16
   771  	ADD $2, R7  // i += 2
   772  
   773  	CMPBLT R7, R12, A6 // more word pairs left
   774  	BR     E6
   775  
   776  L6:
   777  	MOVD   (R8)(R1*1), R6  // R6 = x[i]
   778  	MULHDU R9, R6
   779  	MOVD   (R2)(R1*1), R10 // R10 = z[i]
   780  	ADDC   R10, R11        // add to low order bits
   781  	ADDE   R0, R6
   782  	ADDC   R4, R11
   783  	ADDE   R0, R6
   784  	MOVD   R6, R4
   785  	MOVD   R11, (R2)(R1*1) // z[i] = low word
   786  
   787  	ADD $8, R1 // i*8 + 8
   788  	ADD $1, R7 // i++
   789  
   790  E6:
   791  	CMPBLT R7, R5, L6 // i < n
   792  
   793  	MOVD R4, c+56(FP) // return c
   794  	RET
   795