github.com/mtsmfm/go/src@v0.0.0-20221020090648-44bdcb9f8fde/math/big/arith_s390x.s (about)

     1  // Copyright 2016 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !math_big_pure_go
     6  // +build !math_big_pure_go
     7  
     8  #include "textflag.h"
     9  
    10  // This file provides fast assembly versions for the elementary
    11  // arithmetic operations on vectors implemented in arith.go.
    12  
    13  // Register mapping (presumably "amd64 name = s390x register", kept from a port of the amd64 version — confirm): DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
    14  // func addVV(z, x, y []Word) (c Word)
    15  
    16  TEXT ·addVV(SB), NOSPLIT, $0
        	// Entry point: jump through the 8-byte addvectorfacility slot.
        	// The slot's DATA directive initialises it to addVV_check, and addVV_check
        	// overwrites it with the address of addVV_vec or addVV_novec.
    17  	MOVD addvectorfacility+0x00(SB), R1 // R1 = address currently stored in the slot
    18  	BR   (R1)                            // branch (not a call) to that address
    19  
    20  TEXT ·addVV_check(SB), NOSPLIT, $0
        	// Chooses the addVV implementation: reads ·hasVX (defined outside this file),
        	// stores the chosen implementation's address into addvectorfacility and then
        	// branches to that implementation.
    21  	MOVB   ·hasVX(SB), R1
    22  	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
    23  	MOVD   $addvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
    24  	MOVD   $·addVV_novec(SB), R2           // R2 = address of the scalar implementation
    25  	MOVD   R2, 0(R1)                       // addvectorfacility = addVV_novec
    26  
    27  	// MOVD	$·addVV_novec(SB), 0(R1)
    28  	BR ·addVV_novec(SB)
    29  
    30  vectorimpl:
    31  	MOVD $addvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
    32  	MOVD $·addVV_vec(SB), R2             // R2 = address of the vector implementation
    33  	MOVD R2, 0(R1)                       // addvectorfacility = addVV_vec
    34  
    35  	// MOVD	$·addVV_vec(SB), 0(R1)
    36  	BR ·addVV_vec(SB)
    37  
    38  GLOBL addvectorfacility+0x00(SB), NOPTR, $8 // 8-byte slot holding the address that addVV branches to
    39  DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB) // initial value: addVV_check, which then rewrites the slot
    40  
    41  TEXT ·addVV_vec(SB), NOSPLIT, $0
        	// Vector variant of addVV (installed by addVV_check when ·hasVX is 1).
        	// Adds x and y word by word into z for z_len words and stores the final carry in c.
        	// Only z_len is read; the lengths of x and y are not checked here.
        	// Stages: UU1 handles 16 words per iteration with vector instructions,
        	// U1 handles 4 words per iteration and L1 one word per iteration with ADDC/ADDE.
        	// Between scalar iterations R4 holds the negated carry (0 or -1), so that
        	// "ADDC R4, R4" re-creates the carry flag; E1 negates it back before storing c.
    42  	MOVD z_len+8(FP), R3 // R3 = n = len(z)
    43  	MOVD x+24(FP), R8    // R8 = base address of x
    44  	MOVD y+48(FP), R9    // R9 = base address of y
    45  	MOVD z+0(FP), R2     // R2 = base address of z
    46  
    47  	MOVD $0, R4  // c = 0
    48  	MOVD $0, R0  // make sure it's zero
    49  	MOVD $0, R10 // i = 0 (R10 is a byte offset: it advances by 8 per word)
    50  
    51  	// s/JL/JMP/ below to disable the unrolled loop
    52  	SUB $4, R3  // n -= 4
    53  	BLT v1      // if n < 0 goto v1
    54  	SUB $12, R3 // n -= 12 (16 subtracted in total)
    55  	BLT A1      // if n < 0 goto A1
    56  
    57  	MOVD R8, R5 // R5 = read pointer into x for the vector loop
    58  	MOVD R9, R6 // R6 = read pointer into y for the vector loop
    59  	MOVD R2, R7 // R7 = write pointer into z for the vector loop
    60  
    61  	// n >= 0
    62  	// regular loop body unrolled 16x
    63  	VZERO V0 // c = 0
    64  
    65  UU1:
    66  	VLM  0(R5), V1, V4    // 64 bytes of x into V1..V4
    67  	ADD  $64, R5
    68  	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
    69  	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
    70  
    71  	VLM  0(R6), V9, V12      // 64 bytes of y into V9..V12
    72  	ADD  $64, R6
    73  	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
    74  	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
    75  
        	// Each VACCCQ/VACQ pair uses the same three inputs: VACCCQ yields the carry
        	// that feeds the next pair (V0 -> V25 -> ... -> V31 -> V0), VACQ yields the
        	// sum that is later stored to z (V17..V24).
    76  	VACCCQ V1, V9, V0, V25
    77  	VACQ   V1, V9, V0, V17
    78  	VACCCQ V2, V10, V25, V26
    79  	VACQ   V2, V10, V25, V18
    80  
    81  	VLM 0(R5), V5, V6   // 32 bytes of x into V5..V6
    82  	VLM 0(R6), V13, V14 // 32 bytes of y into V13..V14
    83  	ADD $32, R5
    84  	ADD $32, R6
    85  
    86  	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
    87  	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
    88  	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
    89  	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
    90  
    91  	VACCCQ V3, V11, V26, V27
    92  	VACQ   V3, V11, V26, V19
    93  	VACCCQ V4, V12, V27, V28
    94  	VACQ   V4, V12, V27, V20
    95  
    96  	VLM 0(R5), V7, V8   // 32 bytes of x into V7..V8
    97  	VLM 0(R6), V15, V16 // 32 bytes of y into V15..V16
    98  	ADD $32, R5
    99  	ADD $32, R6
   100  
   101  	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
   102  	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
   103  	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
   104  	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
   105  
   106  	VACCCQ V5, V13, V28, V29
   107  	VACQ   V5, V13, V28, V21
   108  	VACCCQ V6, V14, V29, V30
   109  	VACQ   V6, V14, V29, V22
   110  
   111  	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
   112  	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
   113  	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
   114  	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
   115  
   116  	VACCCQ V7, V15, V30, V31
   117  	VACQ   V7, V15, V30, V23
   118  	VACCCQ V8, V16, V31, V0  // V0 has carry-over
   119  	VACQ   V8, V16, V31, V24
   120  
   121  	VPDI  $0x4, V17, V17, V17 // flip the doublewords back (same swap as applied to the inputs)
   122  	VPDI  $0x4, V18, V18, V18 // flip the doublewords back
   123  	VPDI  $0x4, V19, V19, V19 // flip the doublewords back
   124  	VPDI  $0x4, V20, V20, V20 // flip the doublewords back
   125  	VPDI  $0x4, V21, V21, V21 // flip the doublewords back
   126  	VPDI  $0x4, V22, V22, V22 // flip the doublewords back
   127  	VPDI  $0x4, V23, V23, V23 // flip the doublewords back
   128  	VPDI  $0x4, V24, V24, V24 // flip the doublewords back
   129  	VSTM  V17, V24, 0(R7)     // 128-bytes into z
   130  	ADD   $128, R7
   131  	ADD   $128, R10           // i += 16
   132  	SUB   $16, R3             // n -= 16
   133  	BGE   UU1                 // if n >= 0 goto UU1
   134  	VLGVG $1, V0, R4          // put cf into R4
   135  	NEG   R4, R4              // save cf in the negated form the scalar loops expect
   136  
   137  A1:
   138  	ADD $12, R3 // n += 12 (n is now the remaining word count minus 4)
   139  
   140  	// s/JL/JMP/ below to disable the unrolled loop
   141  	BLT v1 // if n < 0 goto v1
   142  
   143  U1:  // n >= 0
   144  	// regular loop body unrolled 4x
   145  	MOVD 0(R8)(R10*1), R5   // R5, R6, R7, R1 = four consecutive words of x
   146  	MOVD 8(R8)(R10*1), R6
   147  	MOVD 16(R8)(R10*1), R7
   148  	MOVD 24(R8)(R10*1), R1
   149  	ADDC R4, R4             // restore CF
        	// The loads interleaved with the ADDE chain below rely on MOVD leaving the carry untouched.
   150  	MOVD 0(R9)(R10*1), R11
   151  	ADDE R11, R5
   152  	MOVD 8(R9)(R10*1), R11
   153  	ADDE R11, R6
   154  	MOVD 16(R9)(R10*1), R11
   155  	ADDE R11, R7
   156  	MOVD 24(R9)(R10*1), R11
   157  	ADDE R11, R1
   158  	MOVD R0, R4             // R4 = 0
   159  	ADDE R4, R4             // save CF
   160  	NEG  R4, R4             // keep it negated (0 or -1) for the next "restore CF"
   161  	MOVD R5, 0(R2)(R10*1)
   162  	MOVD R6, 8(R2)(R10*1)
   163  	MOVD R7, 16(R2)(R10*1)
   164  	MOVD R1, 24(R2)(R10*1)
   165  
   166  	ADD $32, R10 // i += 4
   167  	SUB $4, R3   // n -= 4
   168  	BGE U1       // if n >= 0 goto U1
   169  
   170  v1:
   171  	ADD $4, R3 // n += 4
   172  	BLE E1     // if n <= 0 goto E1
   173  
   174  L1:  // n > 0
   175  	ADDC R4, R4            // restore CF
   176  	MOVD 0(R8)(R10*1), R5
   177  	MOVD 0(R9)(R10*1), R11
   178  	ADDE R11, R5
   179  	MOVD R5, 0(R2)(R10*1)
   180  	MOVD R0, R4
   181  	ADDE R4, R4            // save CF
   182  	NEG  R4, R4
   183  
   184  	ADD $8, R10 // i++
   185  	SUB $1, R3  // n--
   186  	BGT L1      // if n > 0 goto L1
   187  
   188  E1:
   189  	NEG  R4, R4       // undo the negation: R4 = carry as 0 or 1
   190  	MOVD R4, c+72(FP) // return c
   191  	RET
   192  
   193  TEXT ·addVV_novec(SB), NOSPLIT, $0
        	// Scalar variant of addVV (installed by addVV_check when ·hasVX is not 1).
        	// Adds x and y word by word into z for z_len words and stores the final carry in c.
        	// Only z_len is read; the lengths of x and y are not checked here.
        	// U1n handles 4 words per iteration, L1n one word per iteration.
        	// Between iterations R4 holds the negated carry (0 or -1), so that
        	// "ADDC R4, R4" re-creates the carry flag; E1n negates it back before storing c.
   194  novec:
   195  	MOVD z_len+8(FP), R3 // R3 = n = len(z)
   196  	MOVD x+24(FP), R8    // R8 = base address of x
   197  	MOVD y+48(FP), R9    // R9 = base address of y
   198  	MOVD z+0(FP), R2     // R2 = base address of z
   199  
   200  	MOVD $0, R4  // c = 0
   201  	MOVD $0, R0  // make sure it's zero
   202  	MOVD $0, R10 // i = 0 (R10 is a byte offset: it advances by 8 per word)
   203  
   204  	// s/JL/JMP/ below to disable the unrolled loop
   205  	SUB $4, R3 // n -= 4
   206  	BLT v1n    // if n < 0 goto v1n
   207  
   208  U1n:  // n >= 0
   209  	// regular loop body unrolled 4x
   210  	MOVD 0(R8)(R10*1), R5   // R5, R6, R7, R1 = four consecutive words of x
   211  	MOVD 8(R8)(R10*1), R6
   212  	MOVD 16(R8)(R10*1), R7
   213  	MOVD 24(R8)(R10*1), R1
   214  	ADDC R4, R4             // restore CF
        	// The loads interleaved with the ADDE chain below rely on MOVD leaving the carry untouched.
   215  	MOVD 0(R9)(R10*1), R11
   216  	ADDE R11, R5
   217  	MOVD 8(R9)(R10*1), R11
   218  	ADDE R11, R6
   219  	MOVD 16(R9)(R10*1), R11
   220  	ADDE R11, R7
   221  	MOVD 24(R9)(R10*1), R11
   222  	ADDE R11, R1
   223  	MOVD R0, R4             // R4 = 0
   224  	ADDE R4, R4             // save CF
   225  	NEG  R4, R4             // keep it negated (0 or -1) for the next "restore CF"
   226  	MOVD R5, 0(R2)(R10*1)
   227  	MOVD R6, 8(R2)(R10*1)
   228  	MOVD R7, 16(R2)(R10*1)
   229  	MOVD R1, 24(R2)(R10*1)
   230  
   231  	ADD $32, R10 // i += 4
   232  	SUB $4, R3   // n -= 4
   233  	BGE U1n      // if n >= 0 goto U1n
   234  
   235  v1n:
   236  	ADD $4, R3 // n += 4
   237  	BLE E1n    // if n <= 0 goto E1n
   238  
   239  L1n:  // n > 0
   240  	ADDC R4, R4            // restore CF
   241  	MOVD 0(R8)(R10*1), R5
   242  	MOVD 0(R9)(R10*1), R11
   243  	ADDE R11, R5
   244  	MOVD R5, 0(R2)(R10*1)
   245  	MOVD R0, R4
   246  	ADDE R4, R4            // save CF
   247  	NEG  R4, R4
   248  
   249  	ADD $8, R10 // i++
   250  	SUB $1, R3  // n--
   251  	BGT L1n     // if n > 0 goto L1n
   252  
   253  E1n:
   254  	NEG  R4, R4       // undo the negation: R4 = carry as 0 or 1
   255  	MOVD R4, c+72(FP) // return c
   256  	RET
   257  
   258  TEXT ·subVV(SB), NOSPLIT, $0
        	// Entry point: jump through the 8-byte subvectorfacility slot.
        	// The slot's DATA directive initialises it to subVV_check, and subVV_check
        	// overwrites it with the address of subVV_vec or subVV_novec.
   259  	MOVD subvectorfacility+0x00(SB), R1 // R1 = address currently stored in the slot
   260  	BR   (R1)                            // branch (not a call) to that address
   261  
   262  TEXT ·subVV_check(SB), NOSPLIT, $0
        	// Chooses the subVV implementation: reads ·hasVX (defined outside this file),
        	// stores the chosen implementation's address into subvectorfacility and then
        	// branches to that implementation.
   263  	MOVB   ·hasVX(SB), R1
   264  	CMPBEQ R1, $1, vectorimpl              // vectorfacility = 1, vector supported
   265  	MOVD   $subvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
   266  	MOVD   $·subVV_novec(SB), R2           // R2 = address of the scalar implementation
   267  	MOVD   R2, 0(R1)                       // subvectorfacility = subVV_novec
   268  
   269  	// MOVD	$·subVV_novec(SB), 0(R1)
   270  	BR ·subVV_novec(SB)
   271  
   272  vectorimpl:
   273  	MOVD $subvectorfacility+0x00(SB), R1 // R1 = address of the dispatch slot
   274  	MOVD $·subVV_vec(SB), R2             // R2 = address of the vector implementation
   275  	MOVD R2, 0(R1)                       // subvectorfacility = subVV_vec
   276  
   277  	// MOVD	$·subVV_vec(SB), 0(R1)
   278  	BR ·subVV_vec(SB)
   279  
   280  GLOBL subvectorfacility+0x00(SB), NOPTR, $8 // 8-byte slot holding the address that subVV branches to
   281  DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB) // initial value: subVV_check, which then rewrites the slot
   282  
   283  // Register mapping (presumably "amd64 name = s390x register", kept from a port of the amd64 version — confirm): DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   284  // func subVV(z, x, y []Word) (c Word)
   285  // (same structure as addVV_vec, with SUBC/SUBE and VSBCBIQ/VSBIQ in place of ADDC/ADDE and VACCCQ/VACQ, and different carry bookkeeping)
   286  TEXT ·subVV_vec(SB), NOSPLIT, $0
        	// Vector variant of subVV (installed by subVV_check when ·hasVX is 1).
        	// Subtracts y from x word by word into z for z_len words and stores the final borrow in c.
        	// Only z_len is read; the lengths of x and y are not checked here.
        	// Stages: UU1 handles 16 words per iteration with vector instructions,
        	// U1 handles 4 words per iteration and L1 one word per iteration with SUBC/SUBE.
        	// Between scalar iterations R4 holds 0 (no borrow) or -1 (borrow); computing
        	// 0 - R4 with SUBC re-creates the flag, and E1 negates R4 to 0 or 1 before storing c.
   287  	MOVD z_len+8(FP), R3 // R3 = n = len(z)
   288  	MOVD x+24(FP), R8    // R8 = base address of x
   289  	MOVD y+48(FP), R9    // R9 = base address of y
   290  	MOVD z+0(FP), R2     // R2 = base address of z
   291  	MOVD $0, R4          // c = 0
   292  	MOVD $0, R0          // make sure it's zero
   293  	MOVD $0, R10         // i = 0 (R10 is a byte offset: it advances by 8 per word)
   294  
   295  	// s/JL/JMP/ below to disable the unrolled loop
   296  	SUB $4, R3  // n -= 4
   297  	BLT v1      // if n < 0 goto v1
   298  	SUB $12, R3 // n -= 12 (16 subtracted in total)
   299  	BLT A1      // if n < 0 goto A1
   300  
   301  	MOVD R8, R5 // R5 = read pointer into x for the vector loop
   302  	MOVD R9, R6 // R6 = read pointer into y for the vector loop
   303  	MOVD R2, R7 // R7 = write pointer into z for the vector loop
   304  
   305  	// n >= 0
   306  	// regular loop body unrolled 16x
   307  	VZERO V0         // cf = 0
   308  	MOVD  $1, R4     // for 390 subtraction cf starts as 1 (no borrow)
   309  	VLVGG $1, R4, V0 // put carry into V0
   310  
   311  UU1:
   312  	VLM  0(R5), V1, V4    // 64 bytes of x into V1..V4
   313  	ADD  $64, R5
   314  	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
   315  	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
   316  
   317  	VLM  0(R6), V9, V12      // 64 bytes of y into V9..V12
   318  	ADD  $64, R6
   319  	VPDI $0x4, V9, V9, V9    // flip the doublewords to big-endian order
   320  	VPDI $0x4, V10, V10, V10 // flip the doublewords to big-endian order
   321  
        	// Each VSBCBIQ/VSBIQ pair uses the same three inputs: VSBCBIQ yields the
        	// borrow indication that feeds the next pair (V0 -> V25 -> ... -> V31 -> V0),
        	// VSBIQ yields the difference that is later stored to z (V17..V24).
   322  	VSBCBIQ V1, V9, V0, V25
   323  	VSBIQ   V1, V9, V0, V17
   324  	VSBCBIQ V2, V10, V25, V26
   325  	VSBIQ   V2, V10, V25, V18
   326  
   327  	VLM 0(R5), V5, V6   // 32 bytes of x into V5..V6
   328  	VLM 0(R6), V13, V14 // 32 bytes of y into V13..V14
   329  	ADD $32, R5
   330  	ADD $32, R6
   331  
   332  	VPDI $0x4, V3, V3, V3    // flip the doublewords to big-endian order
   333  	VPDI $0x4, V4, V4, V4    // flip the doublewords to big-endian order
   334  	VPDI $0x4, V11, V11, V11 // flip the doublewords to big-endian order
   335  	VPDI $0x4, V12, V12, V12 // flip the doublewords to big-endian order
   336  
   337  	VSBCBIQ V3, V11, V26, V27
   338  	VSBIQ   V3, V11, V26, V19
   339  	VSBCBIQ V4, V12, V27, V28
   340  	VSBIQ   V4, V12, V27, V20
   341  
   342  	VLM 0(R5), V7, V8   // 32 bytes of x into V7..V8
   343  	VLM 0(R6), V15, V16 // 32 bytes of y into V15..V16
   344  	ADD $32, R5
   345  	ADD $32, R6
   346  
   347  	VPDI $0x4, V5, V5, V5    // flip the doublewords to big-endian order
   348  	VPDI $0x4, V6, V6, V6    // flip the doublewords to big-endian order
   349  	VPDI $0x4, V13, V13, V13 // flip the doublewords to big-endian order
   350  	VPDI $0x4, V14, V14, V14 // flip the doublewords to big-endian order
   351  
   352  	VSBCBIQ V5, V13, V28, V29
   353  	VSBIQ   V5, V13, V28, V21
   354  	VSBCBIQ V6, V14, V29, V30
   355  	VSBIQ   V6, V14, V29, V22
   356  
   357  	VPDI $0x4, V7, V7, V7    // flip the doublewords to big-endian order
   358  	VPDI $0x4, V8, V8, V8    // flip the doublewords to big-endian order
   359  	VPDI $0x4, V15, V15, V15 // flip the doublewords to big-endian order
   360  	VPDI $0x4, V16, V16, V16 // flip the doublewords to big-endian order
   361  
   362  	VSBCBIQ V7, V15, V30, V31
   363  	VSBIQ   V7, V15, V30, V23
   364  	VSBCBIQ V8, V16, V31, V0  // V0 has carry-over
   365  	VSBIQ   V8, V16, V31, V24
   366  
   367  	VPDI  $0x4, V17, V17, V17 // flip the doublewords back (same swap as applied to the inputs)
   368  	VPDI  $0x4, V18, V18, V18 // flip the doublewords back
   369  	VPDI  $0x4, V19, V19, V19 // flip the doublewords back
   370  	VPDI  $0x4, V20, V20, V20 // flip the doublewords back
   371  	VPDI  $0x4, V21, V21, V21 // flip the doublewords back
   372  	VPDI  $0x4, V22, V22, V22 // flip the doublewords back
   373  	VPDI  $0x4, V23, V23, V23 // flip the doublewords back
   374  	VPDI  $0x4, V24, V24, V24 // flip the doublewords back
   375  	VSTM  V17, V24, 0(R7)     // 128-bytes into z
   376  	ADD   $128, R7
   377  	ADD   $128, R10           // i += 16
   378  	SUB   $16, R3             // n -= 16
   379  	BGE   UU1                 // if n >= 0 goto UU1
   380  	VLGVG $1, V0, R4          // put cf into R4
   381  	SUB   $1, R4              // save cf: 1 (no borrow) becomes 0, 0 (borrow) becomes -1
   382  
   383  A1:
   384  	ADD $12, R3 // n += 12 (n is now the remaining word count minus 4)
   385  	BLT v1      // if n < 0 goto v1
   386  
   387  U1:  // n >= 0
   388  	// regular loop body unrolled 4x
   389  	MOVD 0(R8)(R10*1), R5   // R5, R6, R7, R1 = four consecutive words of x
   390  	MOVD 8(R8)(R10*1), R6
   391  	MOVD 16(R8)(R10*1), R7
   392  	MOVD 24(R8)(R10*1), R1
   393  	MOVD R0, R11            // R11 = 0
   394  	SUBC R4, R11            // restore CF (0 - R4 borrows exactly when R4 is -1)
        	// The loads interleaved with the SUBE chain below rely on MOVD leaving the flag untouched.
   395  	MOVD 0(R9)(R10*1), R11
   396  	SUBE R11, R5
   397  	MOVD 8(R9)(R10*1), R11
   398  	SUBE R11, R6
   399  	MOVD 16(R9)(R10*1), R11
   400  	SUBE R11, R7
   401  	MOVD 24(R9)(R10*1), R11
   402  	SUBE R11, R1
   403  	MOVD R0, R4             // R4 = 0
   404  	SUBE R4, R4             // save CF
   405  	MOVD R5, 0(R2)(R10*1)
   406  	MOVD R6, 8(R2)(R10*1)
   407  	MOVD R7, 16(R2)(R10*1)
   408  	MOVD R1, 24(R2)(R10*1)
   409  
   410  	ADD $32, R10 // i += 4
   411  	SUB $4, R3   // n -= 4
   412  	BGE U1       // if n >= 0 goto U1
   413  
   414  v1:
   415  	ADD $4, R3 // n += 4
   416  	BLE E1     // if n <= 0 goto E1
   417  
   418  L1:  // n > 0
   419  	MOVD R0, R11
   420  	SUBC R4, R11           // restore CF
   421  	MOVD 0(R8)(R10*1), R5
   422  	MOVD 0(R9)(R10*1), R11
   423  	SUBE R11, R5
   424  	MOVD R5, 0(R2)(R10*1)
   425  	MOVD R0, R4
   426  	SUBE R4, R4            // save CF
   427  
   428  	ADD $8, R10 // i++
   429  	SUB $1, R3  // n--
   430  	BGT L1      // if n > 0 goto L1
   431  
   432  E1:
   433  	NEG  R4, R4       // R4 = borrow as 0 or 1
   434  	MOVD R4, c+72(FP) // return c
   435  	RET
   436  
   437  // Register mapping (presumably "amd64 name = s390x register", kept from a port of the amd64 version — confirm): DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2, r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
   438  // func subVV(z, x, y []Word) (c Word)
   439  // (same structure as addVV_novec, with SUBC/SUBE in place of ADDC/ADDE and different carry bookkeeping)
   440  TEXT ·subVV_novec(SB), NOSPLIT, $0
        	// Scalar variant of subVV (installed by subVV_check when ·hasVX is not 1).
        	// Subtracts y from x word by word into z for z_len words and stores the final borrow in c.
        	// Only z_len is read; the lengths of x and y are not checked here.
        	// U1 handles 4 words per iteration, L1 one word per iteration.
        	// Between iterations R4 holds 0 (no borrow) or -1 (borrow); computing
        	// 0 - R4 with SUBC re-creates the flag, and E1 negates R4 to 0 or 1 before storing c.
   441  	MOVD z_len+8(FP), R3 // R3 = n = len(z)
   442  	MOVD x+24(FP), R8    // R8 = base address of x
   443  	MOVD y+48(FP), R9    // R9 = base address of y
   444  	MOVD z+0(FP), R2     // R2 = base address of z
   445  
   446  	MOVD $0, R4  // c = 0
   447  	MOVD $0, R0  // make sure it's zero
   448  	MOVD $0, R10 // i = 0 (R10 is a byte offset: it advances by 8 per word)
   449  
   450  	// s/JL/JMP/ below to disable the unrolled loop
   451  	SUB $4, R3 // n -= 4
   452  	BLT v1     // if n < 0 goto v1
   453  
   454  U1:  // n >= 0
   455  	// regular loop body unrolled 4x
   456  	MOVD 0(R8)(R10*1), R5   // R5, R6, R7, R1 = four consecutive words of x
   457  	MOVD 8(R8)(R10*1), R6
   458  	MOVD 16(R8)(R10*1), R7
   459  	MOVD 24(R8)(R10*1), R1
   460  	MOVD R0, R11            // R11 = 0
   461  	SUBC R4, R11            // restore CF (0 - R4 borrows exactly when R4 is -1)
        	// The loads interleaved with the SUBE chain below rely on MOVD leaving the flag untouched.
   462  	MOVD 0(R9)(R10*1), R11
   463  	SUBE R11, R5
   464  	MOVD 8(R9)(R10*1), R11
   465  	SUBE R11, R6
   466  	MOVD 16(R9)(R10*1), R11
   467  	SUBE R11, R7
   468  	MOVD 24(R9)(R10*1), R11
   469  	SUBE R11, R1
   470  	MOVD R0, R4             // R4 = 0
   471  	SUBE R4, R4             // save CF
   472  	MOVD R5, 0(R2)(R10*1)
   473  	MOVD R6, 8(R2)(R10*1)
   474  	MOVD R7, 16(R2)(R10*1)
   475  	MOVD R1, 24(R2)(R10*1)
   476  
   477  	ADD $32, R10 // i += 4
   478  	SUB $4, R3   // n -= 4
   479  	BGE U1       // if n >= 0 goto U1
   480  
   481  v1:
   482  	ADD $4, R3 // n += 4
   483  	BLE E1     // if n <= 0 goto E1
   484  
   485  L1:  // n > 0
   486  	MOVD R0, R11
   487  	SUBC R4, R11           // restore CF
   488  	MOVD 0(R8)(R10*1), R5
   489  	MOVD 0(R9)(R10*1), R11
   490  	SUBE R11, R5
   491  	MOVD R5, 0(R2)(R10*1)
   492  	MOVD R0, R4
   493  	SUBE R4, R4            // save CF
   494  
   495  	ADD $8, R10 // i++
   496  	SUB $1, R3  // n--
   497  	BGT L1      // if n > 0 goto L1
   498  
   499  E1:
   500  	NEG  R4, R4       // R4 = borrow as 0 or 1
   501  	MOVD R4, c+72(FP) // return c
   502  	RET
   503  
   504  TEXT ·addVW(SB), NOSPLIT, $0
        	// Adds the single word y to the vector x, writing z_len words to z and the
        	// final carry to c. The frame offsets used below (z+0, z_len+8, x+24, y+48,
        	// c+56) correspond to a signature of the form
        	//   func addVW(z, x []Word, y Word) (c Word)
        	// Strategy: add y into word 0 and propagate the carry word by word; as soon as
        	// the carry is 0 the remaining words of x are copied to z unchanged with MVC.
        	// The carry lives in the condition code across the compare-and-branch, store
        	// and address-update instructions, so this relies on none of them altering it.
   505  	MOVD z_len+8(FP), R5 // length of z
   506  	MOVD x+24(FP), R6
   507  	MOVD y+48(FP), R7    // c = y
   508  	MOVD z+0(FP), R8
   509  
   510  	CMPBEQ R5, $0, returnC // if len(z) == 0, we can have an early return
   511  
   512  	// Add the first two words, and determine which path (copy path or loop path) to take based on the carry flag.
   513  	ADDC   0(R6), R7            // R7 = x[0] + y, sets the carry
   514  	MOVD   R7, 0(R8)            // z[0] = R7
   515  	CMPBEQ R5, $1, returnResult // len(z) == 1
   516  	MOVD   $0, R9
   517  	ADDE   8(R6), R9            // R9 = x[1] + carry
   518  	MOVD   R9, 8(R8)            // z[1] = R9
   519  	CMPBEQ R5, $2, returnResult // len(z) == 2
   520  
   521  	// Update the counters
   522  	MOVD $16, R12    // i = 2 (R12 is a byte offset)
   523  	MOVD $-2(R5), R5 // n = n - 2
   524  
   525  loopOverEachWord:
   526  	BRC  $12, copySetup // carry = 0, copy the rest
   527  	MOVD $1, R9
   528  
   529  	// Originally we used the carry flag generated in the previous iteration
   530  	// (i.e: ADDE could be used here to do the addition).  However, since we
   531  	// already know carry is 1 (otherwise we will go to copy section), we can use
   532  	// ADDC here so the current iteration does not depend on the carry flag
   533  	// generated in the previous iteration. This could be useful when branch prediction happens.
   534  	ADDC 0(R6)(R12*1), R9
   535  	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] + c
   536  
        	// The carry from the ADDC above is tested by the BRC at the top of the next
        	// iteration (or read at returnResult), so the two updates below must leave it intact.
   537  	MOVD  $8(R12), R12         // i++
   538  	BRCTG R5, loopOverEachWord // n--
   539  
   540  // Return the current carry value
   541  returnResult:
   542  	MOVD $0, R0
   543  	ADDE R0, R0       // R0 = 0 + 0 + carry
   544  	MOVD R0, c+56(FP)
        	// NOTE(review): R0 still holds the carry value here; other routines in this file
        	// re-zero R0 on entry — confirm nothing relies on R0 == 0 after this returns.
   545  	RET
   546  
   547  // Update position of x(R6) and z(R8) based on the current counter value and perform copying.
   548  // With the assumption that x and z will not overlap with each other or x and z will
   549  // point to same memory region, we can use a faster version of copy using only MVC here.
   550  // In the following implementation, we have three copy loops, each copying a word, 4 words, and
   551  // 32 words at a time.  Via benchmarking, this implementation is faster than calling runtime·memmove.
   552  copySetup:
   553  	ADD R12, R6 // R6 = address of the first word of x still to copy
   554  	ADD R12, R8 // R8 = address of the matching word of z
   555  
   556  	CMPBGE R5, $4, mediumLoop
   557  
   558  smallLoop:  // does a loop unrolling to copy word when n < 4
   559  	CMPBEQ R5, $0, returnZero
   560  	MVC    $8, 0(R6), 0(R8)
   561  	CMPBEQ R5, $1, returnZero
   562  	MVC    $8, 8(R6), 8(R8)
   563  	CMPBEQ R5, $2, returnZero
   564  	MVC    $8, 16(R6), 16(R8)
   565  
   566  returnZero:
   567  	MOVD $0, c+56(FP) // return 0 as carry
   568  	RET
   569  
   570  mediumLoop:
   571  	CMPBLT R5, $4, smallLoop       // fewer than 4 words left: finish word by word
   572  	CMPBLT R5, $32, mediumLoopBody // fewer than 32 words left: copy 4 words at a time
   573  
   574  largeLoop:  // Copying 256 bytes at a time.
   575  	MVC    $256, 0(R6), 0(R8)
   576  	MOVD   $256(R6), R6
   577  	MOVD   $256(R8), R8
   578  	MOVD   $-32(R5), R5       // n -= 32
   579  	CMPBGE R5, $32, largeLoop
   580  	BR     mediumLoop
   581  
   582  mediumLoopBody:  // Copying 32 bytes at a time
   583  	MVC    $32, 0(R6), 0(R8)
   584  	MOVD   $32(R6), R6
   585  	MOVD   $32(R8), R8
   586  	MOVD   $-4(R5), R5           // n -= 4
   587  	CMPBGE R5, $4, mediumLoopBody
   588  	BR     smallLoop
   589  
   590  returnC:
   591  	MOVD R7, c+56(FP) // len(z) == 0: return y unchanged as the carry
   592  	RET
   593  
   594  TEXT ·subVW(SB), NOSPLIT, $0
        	// Subtracts the single word y from the vector x, writing z_len words to z and
        	// the final borrow to c. The frame offsets used below (z+0, z_len+8, x+24,
        	// y+48, c+56) correspond to a signature of the form
        	//   func subVW(z, x []Word, y Word) (c Word)
        	// Strategy: subtract y from word 0 and propagate the borrow word by word; as
        	// soon as there is no borrow the remaining words of x are copied to z with MVC.
        	// The borrow lives in the condition code across the compare-and-branch, load,
        	// store and address-update instructions, so this relies on none of them altering it.
   595  	MOVD z_len+8(FP), R5
   596  	MOVD x+24(FP), R6
   597  	MOVD y+48(FP), R7    // The borrow bit passed in
   598  	MOVD z+0(FP), R8
   599  	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.
   600  
   601  	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
   602  
   603  	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
   604  	MOVD   0(R6), R9
   605  	SUBC   R7, R9               // R9 = x[0] - y, sets the borrow
   606  	MOVD   R9, 0(R8)            // z[0] = R9
   607  	CMPBEQ R5, $1, returnResult // len(z) == 1
   608  	MOVD   8(R6), R9
   609  	SUBE   R0, R9               // R9 = x[1] - 0 - borrow
   610  	MOVD   R9, 8(R8)            // z[1] = R9
   611  	CMPBEQ R5, $2, returnResult // len(z) == 2
   612  
   613  	// Update the counters
   614  	MOVD $16, R12    // i = 2 (R12 is a byte offset)
   615  	MOVD $-2(R5), R5 // n = n - 2
   616  
   617  loopOverEachWord:
   618  	BRC  $3, copySetup    // no borrow, copy the rest
   619  	MOVD 0(R6)(R12*1), R9
   620  
   621  	// Originally we used the borrow flag generated in the previous iteration
   622  	// (i.e: SUBE could be used here to do the subtraction). However, since we
   623  	// already know borrow is 1 (otherwise we will go to copy section), we can
   624  	// use SUBC here so the current iteration does not depend on the borrow flag
   625  	// generated in the previous iteration. This could be useful when branch prediction happens.
   626  	SUBC $1, R9
   627  	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
   628  
        	// The borrow from the SUBC above is tested by the BRC at the top of the next
        	// iteration (or read at returnResult), so the two updates below must leave it intact.
   629  	MOVD  $8(R12), R12         // i++
   630  	BRCTG R5, loopOverEachWord // n--
   631  
   632  // return the current borrow value
   633  returnResult:
   634  	SUBE R0, R0       // R0 = 0 - 0 - borrow (R0 was zeroed on entry)
   635  	NEG  R0, R0       // turn that into a non-negative borrow value
   636  	MOVD R0, c+56(FP)
        	// NOTE(review): R0 still holds the borrow value here; other routines in this file
        	// re-zero R0 on entry — confirm nothing relies on R0 == 0 after this returns.
   637  	RET
   638  
   639  // Update position of x(R6) and z(R8) based on the current counter value and perform copying.
   640  // With the assumption that x and z will not overlap with each other or x and z will
   641  // point to same memory region, we can use a faster version of copy using only MVC here.
   642  // In the following implementation, we have three copy loops, each copying a word, 4 words, and
   643  // 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
   644  copySetup:
   645  	ADD R12, R6 // R6 = address of the first word of x still to copy
   646  	ADD R12, R8 // R8 = address of the matching word of z
   647  
   648  	CMPBGE R5, $4, mediumLoop
   649  
   650  smallLoop:  // does a loop unrolling to copy word when n < 4
   651  	CMPBEQ R5, $0, returnZero
   652  	MVC    $8, 0(R6), 0(R8)
   653  	CMPBEQ R5, $1, returnZero
   654  	MVC    $8, 8(R6), 8(R8)
   655  	CMPBEQ R5, $2, returnZero
   656  	MVC    $8, 16(R6), 16(R8)
   657  
   658  returnZero:
   659  	MOVD $0, c+56(FP) // return 0 as borrow
   660  	RET
   661  
   662  mediumLoop:
   663  	CMPBLT R5, $4, smallLoop       // fewer than 4 words left: finish word by word
   664  	CMPBLT R5, $32, mediumLoopBody // fewer than 32 words left: copy 4 words at a time
   665  
   666  largeLoop:  // Copying 256 bytes at a time
   667  	MVC    $256, 0(R6), 0(R8)
   668  	MOVD   $256(R6), R6
   669  	MOVD   $256(R8), R8
   670  	MOVD   $-32(R5), R5       // n -= 32
   671  	CMPBGE R5, $32, largeLoop
   672  	BR     mediumLoop
   673  
   674  mediumLoopBody:  // Copying 32 bytes at a time
   675  	MVC    $32, 0(R6), 0(R8)
   676  	MOVD   $32(R6), R6
   677  	MOVD   $32(R8), R8
   678  	MOVD   $-4(R5), R5           // n -= 4
   679  	CMPBGE R5, $4, mediumLoopBody
   680  	BR     smallLoop
   681  
   682  returnC:
   683  	MOVD R7, c+56(FP) // len(z) == 0: return y unchanged as the borrow
   684  	RET
   685  
   686  // func shlVU(z, x []Word, s uint) (c Word)
   687  TEXT ·shlVU(SB), NOSPLIT, $0
        	// No s390x-specific code here: branch straight to ·shlVU_g, which is defined
        	// outside this file (presumably the generic Go implementation — confirm).
   688  	BR ·shlVU_g(SB)
   689  
   690  // func shrVU(z, x []Word, s uint) (c Word)
   691  TEXT ·shrVU(SB), NOSPLIT, $0
        	// No s390x-specific code here: branch straight to ·shrVU_g, which is defined
        	// outside this file (presumably the generic Go implementation — confirm).
   692  	BR ·shrVU_g(SB)
   693  
   694  // Register mapping (presumably "amd64 name = s390x register", kept from a port of the amd64 version — confirm): CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, DX = r3, AX = r6, BX = R1, (R0 set to 0) + use R11 + use R7 for i
   695  // func mulAddVWW(z, x []Word, y, r Word) (c Word)
   696  TEXT ·mulAddVWW(SB), NOSPLIT, $0
        	// For each of the z_len words: multiplies x[i] by y, adds the running carry c
        	// (initially r), stores the low word to z[i] and keeps the high word as the
        	// next c. The final c is returned.
   697  	MOVD z+0(FP), R2
   698  	MOVD x+24(FP), R8
   699  	MOVD y+48(FP), R9
   700  	MOVD r+56(FP), R4    // c = r
   701  	MOVD z_len+8(FP), R5
   702  	MOVD $0, R1          // i*8 = 0 (byte offset used to address x and z)
   703  	MOVD $0, R7          // i = 0 (word index compared against z_len)
   704  	MOVD $0, R0          // make sure it's zero
   705  	BR   E5
   706  
   707  L5:
   708  	MOVD   (R8)(R1*1), R6  // R6 = x[i]
        	// NOTE(review): R11 is read below without being written by any visible
        	// instruction; presumably MULHDU leaves the low 64 bits of the product in R11
        	// and the high 64 bits in R6 — confirm against the Go s390x assembler.
   709  	MULHDU R9, R6
   710  	ADDC   R4, R11         // add to low order bits
   711  	ADDE   R0, R6          // fold the carry from the low word into the high word
   712  	MOVD   R11, (R2)(R1*1) // z[i] = low word
   713  	MOVD   R6, R4          // c = high word
   714  	ADD    $8, R1          // i*8 + 8
   715  	ADD    $1, R7          // i++
   716  
   717  E5:
   718  	CMPBLT R7, R5, L5 // i < n
   719  
   720  	MOVD R4, c+64(FP)
   721  	RET
   722  
   723  // func addMulVVW(z, x []Word, y Word) (c Word)
   724  // Register mapping (presumably "amd64 name = s390x register", kept from a port of the amd64 version — confirm): CX = R4, r8 = r8, r9=r9, r10 = r2, r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1, (R0 set to 0) + use R11 + use R7 for i
   725  TEXT ·addMulVVW(SB), NOSPLIT, $0
        	// For each of the z_len words: multiplies x[i] by y, adds the old z[i] and the
        	// running carry c (initially 0), stores the low word back to z[i] and keeps the
        	// high word as the next c. The final c is returned.
        	// A6 handles two words per iteration up to R12 (z_len with its lowest bit
        	// cleared); L6 handles whatever is left one word at a time.
   726  	MOVD z+0(FP), R2
   727  	MOVD x+24(FP), R8
   728  	MOVD y+48(FP), R9
   729  	MOVD z_len+8(FP), R5
   730  
   731  	MOVD $0, R1 // i*8 = 0
   732  	MOVD $0, R7 // i = 0
   733  	MOVD $0, R0 // make sure it's zero
   734  	MOVD $0, R4 // c = 0
   735  
   736  	MOVD   R5, R12
   737  	AND    $-2, R12     // R12 = n with the lowest bit cleared (even part of n)
   738  	CMPBGE R5, $2, A6   // at least two words: use the two-word loop
   739  	BR     E6
   740  
   741  A6:
   742  	MOVD   (R8)(R1*1), R6  // R6 = x[i]
        	// NOTE(review): R11 is read below without being written by any visible
        	// instruction; presumably MULHDU leaves the low 64 bits of the product in R11
        	// and the high 64 bits in R6 — confirm against the Go s390x assembler.
   743  	MULHDU R9, R6
   744  	MOVD   (R2)(R1*1), R10 // R10 = z[i]
   745  	ADDC   R10, R11        // add to low order bits
   746  	ADDE   R0, R6          // fold the carry into the high word
   747  	ADDC   R4, R11         // add the running carry c to the low word
   748  	ADDE   R0, R6          // fold that carry into the high word as well
   749  	MOVD   R6, R4          // c = high word
   750  	MOVD   R11, (R2)(R1*1) // z[i] = low word
   751  
   752  	MOVD   (8)(R8)(R1*1), R6  // R6 = x[i+1]
   753  	MULHDU R9, R6
   754  	MOVD   (8)(R2)(R1*1), R10 // R10 = z[i+1]
   755  	ADDC   R10, R11           // add to low order bits
   756  	ADDE   R0, R6
   757  	ADDC   R4, R11
   758  	ADDE   R0, R6
   759  	MOVD   R6, R4
   760  	MOVD   R11, (8)(R2)(R1*1)
   761  
   762  	ADD $16, R1 // i*8 += 16 (two words)
   763  	ADD $2, R7  // i += 2
   764  
   765  	CMPBLT R7, R12, A6
   766  	BR     E6
   767  
   768  L6:
   769  	MOVD   (R8)(R1*1), R6  // R6 = x[i]
   770  	MULHDU R9, R6
   771  	MOVD   (R2)(R1*1), R10 // R10 = z[i]
   772  	ADDC   R10, R11        // add to low order bits
   773  	ADDE   R0, R6
   774  	ADDC   R4, R11
   775  	ADDE   R0, R6
   776  	MOVD   R6, R4
   777  	MOVD   R11, (R2)(R1*1)
   778  
   779  	ADD $8, R1 // i*8 + 8
   780  	ADD $1, R7 // i++
   781  
   782  E6:
   783  	CMPBLT R7, R5, L6 // i < n
   784  
   785  	MOVD R4, c+56(FP)
   786  	RET
   787