all: Use github.com/minio/sha256-simd

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/3581
Jakob Borg · commit 5e99d38412
31 changed files with 3547 additions and 20 deletions
  1. + 2 - 1     cmd/syncthing/usage_report.go
  2. + 2 - 1     lib/protocol/bep_extensions.go
  3. + 2 - 1     lib/protocol/deviceid.go
  4. + 2 - 1     lib/scanner/blocks.go
  5. + 2 - 1     lib/signature/signature.go
  6. + 202 - 0   vendor/github.com/minio/sha256-simd/LICENSE
  7. + 55 - 0    vendor/github.com/minio/sha256-simd/cpuid.go
  8. + 24 - 0    vendor/github.com/minio/sha256-simd/cpuid_386.go
  9. + 33 - 0    vendor/github.com/minio/sha256-simd/cpuid_386.s
 10. + 24 - 0    vendor/github.com/minio/sha256-simd/cpuid_amd64.go
 11. + 34 - 0    vendor/github.com/minio/sha256-simd/cpuid_amd64.s
 12. + 33 - 0    vendor/github.com/minio/sha256-simd/cpuid_arm.go
 13. + 33 - 0    vendor/github.com/minio/sha256-simd/cpuid_arm64.go
 14. + 32 - 0    vendor/github.com/minio/sha256-simd/cpuid_ppc64.go
 15. + 32 - 0    vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go
 16. + 176 - 0   vendor/github.com/minio/sha256-simd/sha256.go
 17. + 22 - 0    vendor/github.com/minio/sha256-simd/sha256blockAvx2_amd64.go
 18. + 1442 - 0  vendor/github.com/minio/sha256-simd/sha256blockAvx2_amd64.s
 19. + 22 - 0    vendor/github.com/minio/sha256-simd/sha256blockAvx_amd64.go
 20. + 409 - 0   vendor/github.com/minio/sha256-simd/sha256blockAvx_amd64.s
 21. + 22 - 0    vendor/github.com/minio/sha256-simd/sha256blockSsse_amd64.go
 22. + 430 - 0   vendor/github.com/minio/sha256-simd/sha256blockSsse_amd64.s
 23. + 24 - 0    vendor/github.com/minio/sha256-simd/sha256block_386.go
 24. + 48 - 0    vendor/github.com/minio/sha256-simd/sha256block_amd64.go
 25. + 24 - 0    vendor/github.com/minio/sha256-simd/sha256block_arm.go
 26. + 36 - 0    vendor/github.com/minio/sha256-simd/sha256block_arm64.go
 27. + 193 - 0   vendor/github.com/minio/sha256-simd/sha256block_arm64.s
 28. + 136 - 0   vendor/github.com/minio/sha256-simd/sha256block_noasm.go
 29. + 22 - 0    vendor/github.com/minio/sha256-simd/sha256block_ppc64.go
 30. + 22 - 0    vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go
 31. + 7 - 15    vendor/manifest

+ 2 - 1
cmd/syncthing/usage_report.go

@@ -9,7 +9,6 @@ package main
 import (
 	"bytes"
 	"crypto/rand"
-	"crypto/sha256"
 	"crypto/tls"
 	"encoding/json"
 	"fmt"
@@ -19,6 +18,8 @@ import (
 	"strings"
 	"time"
 
+	"github.com/minio/sha256-simd"
+
 	"github.com/syncthing/syncthing/lib/config"
 	"github.com/syncthing/syncthing/lib/dialer"
 	"github.com/syncthing/syncthing/lib/model"

+ 2 - 1
lib/protocol/bep_extensions.go

@@ -7,12 +7,13 @@ package protocol
 
 import (
 	"bytes"
-	"crypto/sha256"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"time"
 
+	"github.com/minio/sha256-simd"
+
 	"github.com/syncthing/syncthing/lib/rand"
 )
 

+ 2 - 1
lib/protocol/deviceid.go

@@ -4,7 +4,6 @@ package protocol
 
 import (
 	"bytes"
-	"crypto/sha256"
 	"encoding/base32"
 	"encoding/binary"
 	"errors"
@@ -12,6 +11,8 @@ import (
 	"regexp"
 	"strings"
 
+	"github.com/minio/sha256-simd"
+
 	"github.com/calmh/luhn"
 )
 

+ 2 - 1
lib/scanner/blocks.go

@@ -8,10 +8,11 @@ package scanner
 
 import (
 	"bytes"
-	"crypto/sha256"
 	"fmt"
 	"io"
 
+	"github.com/minio/sha256-simd"
+
 	"github.com/syncthing/syncthing/lib/protocol"
 )
 

+ 2 - 1
lib/signature/signature.go

@@ -12,7 +12,6 @@ import (
 	"crypto/ecdsa"
 	"crypto/elliptic"
 	"crypto/rand"
-	"crypto/sha256"
 	"crypto/x509"
 	"encoding/asn1"
 	"encoding/pem"
@@ -20,6 +19,8 @@ import (
 	"fmt"
 	"io"
 	"math/big"
+
+	"github.com/minio/sha256-simd"
 )
 
 // GenerateKeys returns a new key pair, with the private and public key

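All five call sites above change only their import block: the vendored package is still named sha256 and mirrors the crypto/sha256 API (New, Sum256, Size, BlockSize), so no other code needs touching. A minimal sketch of the drop-in pattern (not part of the commit):

package main

import (
	"fmt"

	// Same package name as crypto/sha256, so existing identifiers
	// like sha256.Sum256 keep compiling unchanged.
	"github.com/minio/sha256-simd"
)

func main() {
	sum := sha256.Sum256([]byte("syncthing"))
	fmt.Printf("%x\n", sum) // 32-byte SHA-256 digest, hex-encoded
}
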
+ 202 - 0
vendor/github.com/minio/sha256-simd/LICENSE

@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

+ 55 - 0
vendor/github.com/minio/sha256-simd/cpuid.go

@@ -0,0 +1,55 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+// True when SIMD instructions are available.
+var avx2 = haveAVX2()
+var avx = haveAVX()
+var ssse3 = haveSSSE3()
+var armSha = haveArmSha()
+
+// haveAVX returns true when there is AVX support
+func haveAVX() bool {
+	_, _, c, _ := cpuid(1)
+
+	// Check the XSAVE, OSXSAVE and AVX bits in CPUID.1:ECX
+	if c&(1<<26) != 0 && c&(1<<27) != 0 && c&(1<<28) != 0 {
+		// Check for OS support
+		eax, _ := xgetbv(0)
+		return (eax & 0x6) == 0x6
+	}
+	return false
+}
+
+// haveAVX2 returns true when there is AVX2 support
+func haveAVX2() bool {
+	mfi, _, _, _ := cpuid(0)
+
+	// Check AVX2, AVX2 requires OS support, but BMI1/2 don't.
+	if mfi >= 7 && haveAVX() {
+		_, ebx, _, _ := cpuidex(7, 0)
+		return (ebx & 0x00000020) != 0
+	}
+	return false
+}
+
+// haveSSSE3 returns true when there is SSSE3 support
+func haveSSSE3() bool {
+
+	_, _, c, _ := cpuid(1)
+
+	return (c & 0x00000200) != 0
+}

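The hex masks in haveAVX and haveSSSE3 test well-known CPUID feature bits. A self-contained restatement with named constants (the constant and function names here are illustrative, not the package's):

package main

import "fmt"

// CPUID.1:ECX feature bits checked by haveAVX above.
const (
	xsaveBit   = 1 << 26 // XSAVE supported by the CPU
	osxsaveBit = 1 << 27 // OS has enabled XGETBV/XSETBV
	avxBit     = 1 << 28 // AVX supported by the CPU
)

// SSSE3 is CPUID.1:ECX bit 9, i.e. the 0x00000200 mask in haveSSSE3.
const ssse3Bit = 1 << 9

// osEnabledAVX mirrors the xgetbv(0) check: XCR0 bit 1 (XMM state)
// and bit 2 (YMM state) must both be set before AVX is safe to use.
func osEnabledAVX(xcr0 uint32) bool {
	return xcr0&0x6 == 0x6
}

func main() {
	fmt.Println(osEnabledAVX(0x7)) // true: x87+XMM+YMM state enabled
	fmt.Println(osEnabledAVX(0x1)) // false: only x87 state enabled
}
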
+ 24 - 0
vendor/github.com/minio/sha256-simd/cpuid_386.go

@@ -0,0 +1,24 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
+func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+func xgetbv(index uint32) (eax, edx uint32)
+
+func haveArmSha() bool {
+	return false
+}

+ 33 - 0
vendor/github.com/minio/sha256-simd/cpuid_386.s

@@ -0,0 +1,33 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build 386,!gccgo
+
+// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), 7, $0
+        XORL CX, CX
+        MOVL op+0(FP), AX
+        CPUID
+        MOVL AX, eax+4(FP)
+        MOVL BX, ebx+8(FP)
+        MOVL CX, ecx+12(FP)
+        MOVL DX, edx+16(FP)
+        RET
+
+// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuidex(SB), 7, $0
+        MOVL op+0(FP), AX
+        MOVL op2+4(FP), CX
+        CPUID
+        MOVL AX, eax+8(FP)
+        MOVL BX, ebx+12(FP)
+        MOVL CX, ecx+16(FP)
+        MOVL DX, edx+20(FP)
+        RET
+
+// func xgetbv(index uint32) (eax, edx uint32)
+TEXT ·xgetbv(SB), 7, $0
+        MOVL index+0(FP), CX
+        BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+        MOVL AX, eax+4(FP)
+        MOVL DX, edx+8(FP)
+        RET

+ 24 - 0
vendor/github.com/minio/sha256-simd/cpuid_amd64.go

@@ -0,0 +1,24 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
+func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+func xgetbv(index uint32) (eax, edx uint32)
+
+func haveArmSha() bool {
+	return false
+}

+ 34 - 0
vendor/github.com/minio/sha256-simd/cpuid_amd64.s

@@ -0,0 +1,34 @@
+// Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file.
+
+// +build amd64,!gccgo
+
+// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuid(SB), 7, $0
+        XORQ CX, CX
+        MOVL op+0(FP), AX
+        CPUID
+        MOVL AX, eax+8(FP)
+        MOVL BX, ebx+12(FP)
+        MOVL CX, ecx+16(FP)
+        MOVL DX, edx+20(FP)
+        RET
+
+
+// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
+TEXT ·cpuidex(SB), 7, $0
+        MOVL op+0(FP), AX
+        MOVL op2+4(FP), CX
+        CPUID
+        MOVL AX, eax+8(FP)
+        MOVL BX, ebx+12(FP)
+        MOVL CX, ecx+16(FP)
+        MOVL DX, edx+20(FP)
+        RET
+
+// func xgetbv(index uint32) (eax, edx uint32)
+TEXT ·xgetbv(SB), 7, $0
+        MOVL index+0(FP), CX
+        BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
+        MOVL AX, eax+8(FP)
+        MOVL DX, edx+12(FP)
+        RET

+ 33 - 0
vendor/github.com/minio/sha256-simd/cpuid_arm.go

@@ -0,0 +1,33 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func xgetbv(index uint32) (eax, edx uint32) {
+	return 0, 0
+}
+
+func haveArmSha() bool {
+	// TODO: Implement feature detection for ARM
+	return true
+}

+ 33 - 0
vendor/github.com/minio/sha256-simd/cpuid_arm64.go

@@ -0,0 +1,33 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func xgetbv(index uint32) (eax, edx uint32) {
+	return 0, 0
+}
+
+func haveArmSha() bool {
+	// TODO: Implement feature detection for ARM
+	return true
+}

+ 32 - 0
vendor/github.com/minio/sha256-simd/cpuid_ppc64.go

@@ -0,0 +1,32 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func xgetbv(index uint32) (eax, edx uint32) {
+	return 0, 0
+}
+
+func haveArmSha() bool {
+	return false
+}

+ 32 - 0
vendor/github.com/minio/sha256-simd/cpuid_ppc64le.go

@@ -0,0 +1,32 @@
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package sha256
+
+func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
+	return 0, 0, 0, 0
+}
+
+func xgetbv(index uint32) (eax, edx uint32) {
+	return 0, 0
+}
+
+func haveArmSha() bool {
+	return false
+}

+ 176 - 0
vendor/github.com/minio/sha256-simd/sha256.go

@@ -0,0 +1,176 @@
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+import (
+	"crypto/sha256"
+	"hash"
+	"runtime"
+)
+
+// Size - The size of a SHA256 checksum in bytes.
+const Size = 32
+
+// BlockSize - The blocksize of SHA256 in bytes.
+const BlockSize = 64
+
+const (
+	chunk = 64
+	init0 = 0x6A09E667
+	init1 = 0xBB67AE85
+	init2 = 0x3C6EF372
+	init3 = 0xA54FF53A
+	init4 = 0x510E527F
+	init5 = 0x9B05688C
+	init6 = 0x1F83D9AB
+	init7 = 0x5BE0CD19
+)
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	h   [8]uint32
+	x   [chunk]byte
+	nx  int
+	len uint64
+}
+
+// Reset digest back to default
+func (d *digest) Reset() {
+	d.h[0] = init0
+	d.h[1] = init1
+	d.h[2] = init2
+	d.h[3] = init3
+	d.h[4] = init4
+	d.h[5] = init5
+	d.h[6] = init6
+	d.h[7] = init7
+	d.nx = 0
+	d.len = 0
+}
+
+func block(dig *digest, p []byte) {
+	is386bit := runtime.GOARCH == "386"
+	isARM := runtime.GOARCH == "arm"
+	if is386bit || isARM {
+		blockGeneric(dig, p)
+		return // avoid hashing the same blocks again below
+	}
+	switch {
+	case avx2:
+		blockAvx2Go(dig, p)
+	case avx:
+		blockAvxGo(dig, p)
+	case ssse3:
+		blockSsseGo(dig, p)
+	case armSha:
+		blockArmGo(dig, p)
+	default:
+		blockGeneric(dig, p)
+	}
+}
+
+// New returns a new hash.Hash computing the SHA256 checksum.
+func New() hash.Hash {
+	if avx2 || avx || ssse3 || armSha {
+		d := new(digest)
+		d.Reset()
+		return d
+	}
+	// default back to the standard golang implementation
+	return sha256.New()
+}
+
+// Sum256 - single caller sha256 helper
+func Sum256(data []byte) [Size]byte {
+	var d digest
+	d.Reset()
+	d.Write(data)
+	return d.checkSum()
+}
+
+// Return size of checksum
+func (d *digest) Size() int { return Size }
+
+// Return blocksize of checksum
+func (d *digest) BlockSize() int { return BlockSize }
+
+// Write to digest
+func (d *digest) Write(p []byte) (nn int, err error) {
+	nn = len(p)
+	d.len += uint64(nn)
+	if d.nx > 0 {
+		n := copy(d.x[d.nx:], p)
+		d.nx += n
+		if d.nx == chunk {
+			block(d, d.x[:])
+			d.nx = 0
+		}
+		p = p[n:]
+	}
+	if len(p) >= chunk {
+		n := len(p) &^ (chunk - 1)
+		block(d, p[:n])
+		p = p[n:]
+	}
+	if len(p) > 0 {
+		d.nx = copy(d.x[:], p)
+	}
+	return
+}
+
+// Return sha256 sum in bytes
+func (d *digest) Sum(in []byte) []byte {
+	// Make a copy of d0 so that caller can keep writing and summing.
+	d0 := *d
+	hash := d0.checkSum()
+	return append(in, hash[:]...)
+}
+
+// Intermediate checksum function
+func (d *digest) checkSum() [Size]byte {
+	len := d.len
+	// Padding.  Add a 1 bit and 0 bits until 56 bytes mod 64.
+	var tmp [64]byte
+	tmp[0] = 0x80
+	if len%64 < 56 {
+		d.Write(tmp[0 : 56-len%64])
+	} else {
+		d.Write(tmp[0 : 64+56-len%64])
+	}
+
+	// Length in bits.
+	len <<= 3
+	for i := uint(0); i < 8; i++ {
+		tmp[i] = byte(len >> (56 - 8*i))
+	}
+	d.Write(tmp[0:8])
+
+	if d.nx != 0 {
+		panic("d.nx != 0")
+	}
+
+	h := d.h[:]
+
+	var digest [Size]byte
+	for i, s := range h {
+		digest[i*4] = byte(s >> 24)
+		digest[i*4+1] = byte(s >> 16)
+		digest[i*4+2] = byte(s >> 8)
+		digest[i*4+3] = byte(s)
+	}
+
+	return digest
+}

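sha256.go implements the full hash.Hash contract (Write buffers input into 64-byte chunks, checkSum applies the standard SHA-256 length padding), so incremental hashing and the one-shot Sum256 produce the same digest. A small usage sketch, not part of the commit:

package main

import (
	"bytes"
	"fmt"

	"github.com/minio/sha256-simd"
)

func main() {
	data := []byte("The quick brown fox jumps over the lazy dog")

	// One-shot helper.
	oneShot := sha256.Sum256(data)

	// Incremental hashing; New returns the SIMD-backed digest when a
	// usable implementation was detected, else falls back to crypto/sha256.
	h := sha256.New()
	h.Write(data[:20])
	h.Write(data[20:])
	streamed := h.Sum(nil)

	fmt.Println(bytes.Equal(oneShot[:], streamed)) // true
}
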
+ 22 - 0
vendor/github.com/minio/sha256-simd/sha256blockAvx2_amd64.go

@@ -0,0 +1,22 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+//go:noescape
+func blockAvx2(h []uint32, message []uint8)

+ 1442 - 0
vendor/github.com/minio/sha256-simd/sha256blockAvx2_amd64.s

@@ -0,0 +1,1442 @@
+//+build !noasm !appengine
+
+// SHA256 implementation for AVX2
+
+//
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// This code is based on an Intel White-Paper:
+// "Fast SHA-256 Implementations on Intel Architecture Processors"
+//
+// together with the reference implementation from the following authors:
+//    James Guilford <james.guilford@intel.com>
+//    Kirk Yap <kirk.s.yap@intel.com>
+//    Tim Chen <tim.c.chen@linux.intel.com>
+//
+// For Golang it has been converted to Plan 9 assembly with the help of
+// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
+// equivalents
+//
+
+#include "textflag.h"
+
+DATA K256<>+0x000(SB)/8, $0x71374491428a2f98
+DATA K256<>+0x008(SB)/8, $0xe9b5dba5b5c0fbcf
+DATA K256<>+0x010(SB)/8, $0x71374491428a2f98
+DATA K256<>+0x018(SB)/8, $0xe9b5dba5b5c0fbcf
+DATA K256<>+0x020(SB)/8, $0x59f111f13956c25b
+DATA K256<>+0x028(SB)/8, $0xab1c5ed5923f82a4
+DATA K256<>+0x030(SB)/8, $0x59f111f13956c25b
+DATA K256<>+0x038(SB)/8, $0xab1c5ed5923f82a4
+DATA K256<>+0x040(SB)/8, $0x12835b01d807aa98
+DATA K256<>+0x048(SB)/8, $0x550c7dc3243185be
+DATA K256<>+0x050(SB)/8, $0x12835b01d807aa98
+DATA K256<>+0x058(SB)/8, $0x550c7dc3243185be
+DATA K256<>+0x060(SB)/8, $0x80deb1fe72be5d74
+DATA K256<>+0x068(SB)/8, $0xc19bf1749bdc06a7
+DATA K256<>+0x070(SB)/8, $0x80deb1fe72be5d74
+DATA K256<>+0x078(SB)/8, $0xc19bf1749bdc06a7
+DATA K256<>+0x080(SB)/8, $0xefbe4786e49b69c1
+DATA K256<>+0x088(SB)/8, $0x240ca1cc0fc19dc6
+DATA K256<>+0x090(SB)/8, $0xefbe4786e49b69c1
+DATA K256<>+0x098(SB)/8, $0x240ca1cc0fc19dc6
+DATA K256<>+0x0a0(SB)/8, $0x4a7484aa2de92c6f
+DATA K256<>+0x0a8(SB)/8, $0x76f988da5cb0a9dc
+DATA K256<>+0x0b0(SB)/8, $0x4a7484aa2de92c6f
+DATA K256<>+0x0b8(SB)/8, $0x76f988da5cb0a9dc
+DATA K256<>+0x0c0(SB)/8, $0xa831c66d983e5152
+DATA K256<>+0x0c8(SB)/8, $0xbf597fc7b00327c8
+DATA K256<>+0x0d0(SB)/8, $0xa831c66d983e5152
+DATA K256<>+0x0d8(SB)/8, $0xbf597fc7b00327c8
+DATA K256<>+0x0e0(SB)/8, $0xd5a79147c6e00bf3
+DATA K256<>+0x0e8(SB)/8, $0x1429296706ca6351
+DATA K256<>+0x0f0(SB)/8, $0xd5a79147c6e00bf3
+DATA K256<>+0x0f8(SB)/8, $0x1429296706ca6351
+DATA K256<>+0x100(SB)/8, $0x2e1b213827b70a85
+DATA K256<>+0x108(SB)/8, $0x53380d134d2c6dfc
+DATA K256<>+0x110(SB)/8, $0x2e1b213827b70a85
+DATA K256<>+0x118(SB)/8, $0x53380d134d2c6dfc
+DATA K256<>+0x120(SB)/8, $0x766a0abb650a7354
+DATA K256<>+0x128(SB)/8, $0x92722c8581c2c92e
+DATA K256<>+0x130(SB)/8, $0x766a0abb650a7354
+DATA K256<>+0x138(SB)/8, $0x92722c8581c2c92e
+DATA K256<>+0x140(SB)/8, $0xa81a664ba2bfe8a1
+DATA K256<>+0x148(SB)/8, $0xc76c51a3c24b8b70
+DATA K256<>+0x150(SB)/8, $0xa81a664ba2bfe8a1
+DATA K256<>+0x158(SB)/8, $0xc76c51a3c24b8b70
+DATA K256<>+0x160(SB)/8, $0xd6990624d192e819
+DATA K256<>+0x168(SB)/8, $0x106aa070f40e3585
+DATA K256<>+0x170(SB)/8, $0xd6990624d192e819
+DATA K256<>+0x178(SB)/8, $0x106aa070f40e3585
+DATA K256<>+0x180(SB)/8, $0x1e376c0819a4c116
+DATA K256<>+0x188(SB)/8, $0x34b0bcb52748774c
+DATA K256<>+0x190(SB)/8, $0x1e376c0819a4c116
+DATA K256<>+0x198(SB)/8, $0x34b0bcb52748774c
+DATA K256<>+0x1a0(SB)/8, $0x4ed8aa4a391c0cb3
+DATA K256<>+0x1a8(SB)/8, $0x682e6ff35b9cca4f
+DATA K256<>+0x1b0(SB)/8, $0x4ed8aa4a391c0cb3
+DATA K256<>+0x1b8(SB)/8, $0x682e6ff35b9cca4f
+DATA K256<>+0x1c0(SB)/8, $0x78a5636f748f82ee
+DATA K256<>+0x1c8(SB)/8, $0x8cc7020884c87814
+DATA K256<>+0x1d0(SB)/8, $0x78a5636f748f82ee
+DATA K256<>+0x1d8(SB)/8, $0x8cc7020884c87814
+DATA K256<>+0x1e0(SB)/8, $0xa4506ceb90befffa
+DATA K256<>+0x1e8(SB)/8, $0xc67178f2bef9a3f7
+DATA K256<>+0x1f0(SB)/8, $0xa4506ceb90befffa
+DATA K256<>+0x1f8(SB)/8, $0xc67178f2bef9a3f7
+
+DATA K256<>+0x200(SB)/8, $0x0405060700010203
+DATA K256<>+0x208(SB)/8, $0x0c0d0e0f08090a0b
+DATA K256<>+0x210(SB)/8, $0x0405060700010203
+DATA K256<>+0x218(SB)/8, $0x0c0d0e0f08090a0b
+DATA K256<>+0x220(SB)/8, $0x0b0a090803020100
+DATA K256<>+0x228(SB)/8, $0xffffffffffffffff
+DATA K256<>+0x230(SB)/8, $0x0b0a090803020100
+DATA K256<>+0x238(SB)/8, $0xffffffffffffffff
+DATA K256<>+0x240(SB)/8, $0xffffffffffffffff
+DATA K256<>+0x248(SB)/8, $0x0b0a090803020100
+DATA K256<>+0x250(SB)/8, $0xffffffffffffffff
+DATA K256<>+0x258(SB)/8, $0x0b0a090803020100
+
+GLOBL K256<>(SB), 8, $608
+
+// func blockAvx2(h []uint32, message []uint8)
+TEXT ·blockAvx2(SB), 7, $0
+
+    MOVQ  ctx+0(FP), DI                        // DI: &h
+    MOVQ  inp+24(FP), SI                       // SI: &message
+    MOVQ  inplength+32(FP), DX                 // len(message)
+    ADDQ  SI, DX                               // end pointer of input
+    MOVQ  SP, R11                              // copy stack pointer
+    SUBQ  $0x220, SP                           // sp -= 0x220
+    ANDQ  $0xfffffffffffffc00, SP              // align stack frame
+    ADDQ  $0x1c0, SP
+    MOVQ DI, 0x40(SP)                          // save ctx
+    MOVQ SI, 0x48(SP)                          // save input
+    MOVQ DX, 0x50(SP)                          // save end pointer
+    MOVQ R11, 0x58(SP)                         // save copy of stack pointer
+
+    WORD $0xf8c5; BYTE $0x77                   // vzeroupper
+    ADDQ $0x40, SI			                   // input++
+    MOVL (DI), AX
+    MOVQ SI, R12                               // borrow $T1
+    MOVL 4(DI), BX
+    CMPQ SI, DX                                // $_end
+    MOVL 8(DI), CX
+    LONG $0xe4440f4c                           // cmove  r12,rsp            /* next block or random data */
+    MOVL 12(DI), DX
+    MOVL 16(DI), R8
+    MOVL 20(DI), R9
+    MOVL 24(DI), R10
+    MOVL 28(DI), R11
+
+    LEAQ K256<>(SB), BP
+    LONG $0x856f7dc5; LONG $0x00000220         // VMOVDQA YMM8, 0x220[rbp]  /* vmovdqa ymm8,YMMWORD PTR [rip+0x220] */
+    LONG $0x8d6f7dc5; LONG $0x00000240         // VMOVDQA YMM9, 0x240[rbp]  /* vmovdqa ymm9,YMMWORD PTR [rip+0x240] */
+    LONG $0x956f7dc5; LONG $0x00000200         // VMOVDQA YMM10, 0x200[rbp] /* vmovdqa ymm7,YMMWORD PTR [rip+0x200] */
+
+loop0:
+    LONG $0x6f7dc1c4; BYTE $0xfa               // VMOVDQA YMM7, YMM10
+
+    // Load first 16 dwords from two blocks
+    MOVOU -64(SI), X0                          // vmovdqu xmm0,XMMWORD PTR [rsi-0x40]
+    MOVOU -48(SI), X1                          // vmovdqu xmm1,XMMWORD PTR [rsi-0x30]
+    MOVOU -32(SI), X2                          // vmovdqu xmm2,XMMWORD PTR [rsi-0x20]
+    MOVOU -16(SI), X3                          // vmovdqu xmm3,XMMWORD PTR [rsi-0x10]
+
+    // Byte swap data and transpose data into high/low
+    LONG $0x387dc3c4; WORD $0x2404; BYTE $0x01 // vinserti128 ymm0,ymm0,[r12],0x1
+    LONG $0x3875c3c4; LONG $0x0110244c         // vinserti128 ymm1,ymm1,0x10[r12],0x1
+    LONG $0x007de2c4; BYTE $0xc7               // vpshufb     ymm0,ymm0,ymm7
+    LONG $0x386dc3c4; LONG $0x01202454         // vinserti128 ymm2,ymm2,0x20[r12],0x1
+    LONG $0x0075e2c4; BYTE $0xcf               // vpshufb     ymm1,ymm1,ymm7
+    LONG $0x3865c3c4; LONG $0x0130245c         // vinserti128 ymm3,ymm3,0x30[r12],0x1
+
+    LEAQ K256<>(SB), BP
+    LONG $0x006de2c4; BYTE $0xd7               // vpshufb ymm2,ymm2,ymm7
+    LONG $0x65fefdc5; BYTE $0x00               // vpaddd  ymm4,ymm0,[rbp]
+    LONG $0x0065e2c4; BYTE $0xdf               // vpshufb ymm3,ymm3,ymm7
+    LONG $0x6dfef5c5; BYTE $0x20               // vpaddd  ymm5,ymm1,0x20[rbp]
+    LONG $0x75feedc5; BYTE $0x40               // vpaddd  ymm6,ymm2,0x40[rbp]
+    LONG $0x7dfee5c5; BYTE $0x60               // vpaddd  ymm7,ymm3,0x60[rbp]
+
+    LONG $0x247ffdc5; BYTE $0x24               // vmovdqa [rsp],ymm4
+    XORQ R14, R14
+    LONG $0x6c7ffdc5; WORD $0x2024             // vmovdqa [rsp+0x20],ymm5
+
+    ADDQ $-0x40, SP
+    MOVQ BX, DI
+    LONG $0x347ffdc5; BYTE $0x24               // vmovdqa [rsp],ymm6
+    XORQ CX, DI                                // magic
+    LONG $0x7c7ffdc5; WORD $0x2024             // vmovdqa [rsp+0x20],ymm7
+    MOVQ R9, R12
+    ADDQ $0x80,BP
+
+loop1:
+    // Schedule 48 input dwords, by doing 3 rounds of 12 each
+    // Note: SIMD instructions are interleaved with the SHA calculations
+    ADDQ $-0x40, SP
+    LONG $0x0f75e3c4; WORD $0x04e0 // vpalignr ymm4,ymm1,ymm0,0x4
+
+    // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
+    LONG $0x249c0344; LONG $0x00000080 // add    r11d,[rsp+0x80]
+    WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
+    LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
+    LONG $0x0f65e3c4; WORD $0x04fa // vpalignr ymm7,ymm3,ymm2,0x4
+    LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
+    LONG $0x30048d42               // lea    eax,[rax+r14*1]            
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    LONG $0xd472cdc5; BYTE $0x07   // vpsrld ymm6,ymm4,0x7
+    LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
+    LONG $0xc7fefdc5               // vpaddd ymm0,ymm0,ymm7
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
+    LONG $0xd472c5c5; BYTE $0x03   // vpsrld ymm7,ymm4,0x3
+    LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
+    LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
+    WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
+    LONG $0xf472d5c5; BYTE $0x0e   // vpslld ymm5,ymm4,0xe
+    LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
+    LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
+    LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
+    LONG $0xe6efc5c5               // vpxor  ymm4,ymm7,ymm6
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xdf31                   // xor    edi,ebx
+    LONG $0xfb70fdc5; BYTE $0xfa   // vpshufd ymm7,ymm3,0xfa
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
+    WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
+    LONG $0xd672cdc5; BYTE $0x0b   // vpsrld ymm6,ymm6,0xb
+
+    // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
+    LONG $0x24940344; LONG $0x00000084 // add    r10d,[rsp+0x84]
+    WORD $0x2141; BYTE $0xd4       // and    r12d,edx
+    LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
+    LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    LONG $0xf572d5c5; BYTE $0x0b   // vpslld ymm5,ymm5,0xb
+    LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
+    LONG $0xe6efddc5               // vpxor  ymm4,ymm4,ymm6
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
+    LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
+    WORD $0xc731                   // xor    edi,eax
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
+    LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
+    LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
+    LONG $0xc4fefdc5               // vpaddd ymm0,ymm0,ymm4
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
+    WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
+    LONG $0x248c0344; LONG $0x00000088 // add    r9d,[rsp+0x88]
+    WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
+    LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
+    LONG $0x32148d47               // lea    r10d,[r10+r14*1]
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
+    LONG $0x004dc2c4; BYTE $0xf0   // vpshufb ymm6,ymm6,ymm8
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
+    LONG $0xc6fefdc5               // vpaddd ymm0,ymm0,ymm6
+    LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
+    LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
+    WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
+    LONG $0xf870fdc5; BYTE $0x50   // vpshufd ymm7,ymm0,0x50
+    LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
+    LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
+    LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
+    WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
+    LONG $0x24840344; LONG $0x0000008c // add    r8d,[rsp+0x8c]
+    WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
+    LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
+    LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
+    LONG $0x004dc2c4; BYTE $0xf1   // vpshufb ymm6,ymm6,ymm9
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
+    LONG $0xc6fefdc5               // vpaddd ymm0,ymm0,ymm6
+    LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
+    LONG $0x28048d47               // lea    r8d,[r8+r13*1]
+    WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
+    LONG $0x75fefdc5; BYTE $0x00   // vpaddd ymm6,ymm0,[rbp+0x0]
+    LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
+    LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
+    LONG $0x00048d42               // lea    eax,[rax+r8*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d47               // lea    r8d,[r8+r15*1]
+    WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
+
+    LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
+    LONG $0x0f6de3c4; WORD $0x04e1 // vpalignr ymm4,ymm2,ymm1,0x4
+
+    // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
+    LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add    edx,[rsp+0xa0]
+    WORD $0x2141; BYTE $0xc4       // and    r12d,eax
+    LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
+    LONG $0x0f7de3c4; WORD $0x04fb // vpalignr ymm7,ymm0,ymm3,0x4
+    LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
+    LONG $0x30048d47               // lea    r8d,[r8+r14*1]
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    LONG $0xd472cdc5; BYTE $0x07   // vpsrld ymm6,ymm4,0x7
+    LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
+    LONG $0xcffef5c5               // vpaddd ymm1,ymm1,ymm7
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
+    LONG $0xd472c5c5; BYTE $0x03   // vpsrld ymm7,ymm4,0x3
+    LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
+    LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
+    WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
+    LONG $0xf472d5c5; BYTE $0x0e   // vpslld ymm5,ymm4,0xe
+    LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
+    LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
+    LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
+    LONG $0xe6efc5c5               // vpxor  ymm4,ymm7,ymm6
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
+    LONG $0xf870fdc5; BYTE $0xfa   // vpshufd ymm7,ymm0,0xfa
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
+    WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
+    LONG $0xd672cdc5; BYTE $0x0b   // vpsrld ymm6,ymm6,0xb
+
+    // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
+    LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add    ecx,[rsp+0xa4]
+    WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
+    LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
+    LONG $0x32148d42               // lea    edx,[rdx+r14*1]
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    LONG $0xf572d5c5; BYTE $0x0b   // vpslld ymm5,ymm5,0xb
+    LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
+    LONG $0xe6efddc5               // vpxor  ymm4,ymm4,ymm6
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xd789                   // mov    edi,edx
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
+    LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
+    WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
+    LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
+    LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
+    LONG $0xccfef5c5               // vpaddd ymm1,ymm1,ymm4
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
+    WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
+    LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add    ebx,[rsp+0xa8]
+    WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
+    LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
+    LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
+    LONG $0x004dc2c4; BYTE $0xf0   // vpshufb ymm6,ymm6,ymm8
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
+    LONG $0xcefef5c5               // vpaddd ymm1,ymm1,ymm6
+    LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
+    LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
+    WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
+    LONG $0xf970fdc5; BYTE $0x50   // vpshufd ymm7,ymm1,0x50
+    LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
+    LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
+    LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xd731                   // xor    edi,edx
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
+    WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
+    LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add    eax,[rsp+0xac]
+    WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
+    LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
+    LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
+    LONG $0x004dc2c4; BYTE $0xf1   // vpshufb ymm6,ymm6,ymm9
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xdf89                   // mov    edi,ebx
+    LONG $0xcefef5c5               // vpaddd ymm1,ymm1,ymm6
+    LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
+    LONG $0x28048d42               // lea    eax,[rax+r13*1]
+    WORD $0xcf31                   // xor    edi,ecx
+    LONG $0x75fef5c5; BYTE $0x20   // vpaddd ymm6,ymm1,[rbp+0x20]
+    LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
+    LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
+    LONG $0x00048d45               // lea    r8d,[r8+rax*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d42               // lea    eax,[rax+r15*1]
+    WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
+
+    LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
+
+    LONG $0x24648d48; BYTE $0xc0   // lea    rsp,[rsp-0x40]
+    LONG $0x0f65e3c4; WORD $0x04e2 // vpalignr ymm4,ymm3,ymm2,0x4
+
+    // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x80)
+    LONG $0x249c0344; LONG $0x00000080 // add    r11d,[rsp+0x80]
+    WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
+    LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
+    LONG $0x0f75e3c4; WORD $0x04f8 // vpalignr ymm7,ymm1,ymm0,0x4
+    LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
+    LONG $0x30048d42               // lea    eax,[rax+r14*1]
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    LONG $0xd472cdc5; BYTE $0x07   // vpsrld ymm6,ymm4,0x7
+    LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
+    LONG $0xd7feedc5               // vpaddd ymm2,ymm2,ymm7
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
+    LONG $0xd472c5c5; BYTE $0x03   // vpsrld ymm7,ymm4,0x3
+    LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
+    LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
+    WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
+    LONG $0xf472d5c5; BYTE $0x0e   // vpslld ymm5,ymm4,0xe
+    LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
+    LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
+    LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
+    LONG $0xe6efc5c5               // vpxor  ymm4,ymm7,ymm6
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xdf31                   // xor    edi,ebx
+    LONG $0xf970fdc5; BYTE $0xfa   // vpshufd ymm7,ymm1,0xfa
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
+    WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
+    LONG $0xd672cdc5; BYTE $0x0b   // vpsrld ymm6,ymm6,0xb
+
+    // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x84)
+    LONG $0x24940344; LONG $0x00000084 // add    r10d,[rsp+0x84]
+    WORD $0x2141; BYTE $0xd4       // and    r12d,edx
+    LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
+    LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    LONG $0xf572d5c5; BYTE $0x0b   // vpslld ymm5,ymm5,0xb
+    LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
+    LONG $0xe6efddc5               // vpxor  ymm4,ymm4,ymm6
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
+    LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
+    WORD $0xc731                   // xor    edi,eax
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
+    LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
+    LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
+    LONG $0xd4feedc5               // vpaddd ymm2,ymm2,ymm4
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
+    WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x88)
+    LONG $0x248c0344; LONG $0x00000088 // add    r9d,[rsp+0x88]
+    WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
+    LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
+    LONG $0x32148d47               // lea    r10d,[r10+r14*1]
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
+    LONG $0x004dc2c4; BYTE $0xf0   // vpshufb ymm6,ymm6,ymm8
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
+    LONG $0xd6feedc5               // vpaddd ymm2,ymm2,ymm6
+    LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
+    LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
+    WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
+    LONG $0xfa70fdc5; BYTE $0x50   // vpshufd ymm7,ymm2,0x50
+    LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
+    LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
+    LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
+    WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x8c)
+    LONG $0x24840344; LONG $0x0000008c // add    r8d,[rsp+0x8c]
+    WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
+    LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
+    LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
+    LONG $0x004dc2c4; BYTE $0xf1   // vpshufb ymm6,ymm6,ymm9
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
+    LONG $0xd6feedc5               // vpaddd ymm2,ymm2,ymm6
+    LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
+    LONG $0x28048d47               // lea    r8d,[r8+r13*1]
+    WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
+    LONG $0x75feedc5; BYTE $0x40   // vpaddd ymm6,ymm2,[rbp+0x40]
+    LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
+    LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
+    LONG $0x00048d42               // lea    eax,[rax+r8*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d47               // lea    r8d,[r8+r15*1]
+    WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
+
+    LONG $0x347ffdc5; BYTE $0x24 // vmovdqa [rsp],ymm6
+    LONG $0x0f7de3c4; WORD $0x04e3 // vpalignr ymm4,ymm0,ymm3,0x4
+
+    // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0xa0)
+    LONG $0xa0249403; WORD $0x0000; BYTE $0x00 // add    edx,[rsp+0xa0]
+    WORD $0x2141; BYTE $0xc4       // and    r12d,eax
+    LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
+    LONG $0x0f6de3c4; WORD $0x04f9 // vpalignr ymm7,ymm2,ymm1,0x4
+    LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
+    LONG $0x30048d47               // lea    r8d,[r8+r14*1]
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    LONG $0xd472cdc5; BYTE $0x07   // vpsrld ymm6,ymm4,0x7
+    LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
+    LONG $0xdffee5c5               // vpaddd ymm3,ymm3,ymm7
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
+    LONG $0xd472c5c5; BYTE $0x03   // vpsrld ymm7,ymm4,0x3
+    LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
+    LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
+    WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
+    LONG $0xf472d5c5; BYTE $0x0e   // vpslld ymm5,ymm4,0xe
+    LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
+    LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
+    LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
+    LONG $0xe6efc5c5               // vpxor  ymm4,ymm7,ymm6
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
+    LONG $0xfa70fdc5; BYTE $0xfa   // vpshufd ymm7,ymm2,0xfa
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
+    WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
+    LONG $0xd672cdc5; BYTE $0x0b   // vpsrld ymm6,ymm6,0xb
+
+    // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0xa4)
+    LONG $0xa4248c03; WORD $0x0000; BYTE $0x00 // add    ecx,[rsp+0xa4]
+    WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
+    LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
+    LONG $0x32148d42               // lea    edx,[rdx+r14*1]
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    LONG $0xf572d5c5; BYTE $0x0b   // vpslld ymm5,ymm5,0xb
+    LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
+    LONG $0xe6efddc5               // vpxor  ymm4,ymm4,ymm6
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xd789                   // mov    edi,edx
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
+    LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
+    WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
+    LONG $0xe5efddc5               // vpxor  ymm4,ymm4,ymm5
+    LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
+    LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
+    LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
+    LONG $0xdcfee5c5               // vpaddd ymm3,ymm3,ymm4
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
+    WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0xa8)
+    LONG $0xa8249c03; WORD $0x0000; BYTE $0x00 // add    ebx,[rsp+0xa8]
+    WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
+    LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
+    LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
+    LONG $0x004dc2c4; BYTE $0xf0   // vpshufb ymm6,ymm6,ymm8
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
+    LONG $0xdefee5c5               // vpaddd ymm3,ymm3,ymm6
+    LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
+    LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
+    WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
+    LONG $0xfb70fdc5; BYTE $0x50   // vpshufd ymm7,ymm3,0x50
+    LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
+    LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
+    LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
+    LONG $0xd772cdc5; BYTE $0x0a   // vpsrld ymm6,ymm7,0xa
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xd731                   // xor    edi,edx
+    LONG $0xd773c5c5; BYTE $0x11   // vpsrlq ymm7,ymm7,0x11
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
+    WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+
+    // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0xac)
+    LONG $0xac248403; WORD $0x0000; BYTE $0x00 // add    eax,[rsp+0xac]
+    WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
+    LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
+    LONG $0xd773c5c5; BYTE $0x02   // vpsrlq ymm7,ymm7,0x2
+    LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
+    LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    LONG $0xf7efcdc5               // vpxor  ymm6,ymm6,ymm7
+    LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
+    LONG $0x004dc2c4; BYTE $0xf1   // vpshufb ymm6,ymm6,ymm9
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xdf89                   // mov    edi,ebx
+    LONG $0xdefee5c5               // vpaddd ymm3,ymm3,ymm6
+    LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
+    LONG $0x28048d42               // lea    eax,[rax+r13*1]
+    WORD $0xcf31                   // xor    edi,ecx
+    LONG $0x75fee5c5; BYTE $0x60   // vpaddd ymm6,ymm3,[rbp+0x60]
+    LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
+    LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
+    LONG $0x00048d45               // lea    r8d,[r8+rax*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d42               // lea    eax,[rax+r15*1]
+    WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
+
+    LONG $0x747ffdc5; WORD $0x2024 // vmovdqa [rsp+0x20],ymm6
+    ADDQ $0x80, BP
+
+    CMPB 0x3(BP), $0x0
+    JNE  loop1
+
+    // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x40)
+    LONG $0x245c0344; BYTE $0x40   // add    r11d,[rsp+0x40]
+    WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
+    LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
+    LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
+    LONG $0x30048d42               // lea    eax,[rax+r14*1]
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
+    LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
+    LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
+    WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
+    LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
+    LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
+    LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xdf31                   // xor    edi,ebx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
+    WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
+
+    // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x44)
+    LONG $0x24540344; BYTE $0x44   // add    r10d,[rsp+0x44]
+    WORD $0x2141; BYTE $0xd4       // and    r12d,edx
+    LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
+    LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
+    LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
+    LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
+    LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
+    WORD $0xc731                   // xor    edi,eax
+    LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
+    LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
+    LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
+    WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
+
+    // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x48)
+    LONG $0x244c0344; BYTE $0x48   // add    r9d,[rsp+0x48]
+    WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
+    LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
+    LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
+    LONG $0x32148d47               // lea    r10d,[r10+r14*1]
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
+    LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
+    LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
+    WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
+    LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
+    LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
+    LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
+    WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
+
+    // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x4c)
+    LONG $0x24440344; BYTE $0x4c   // add    r8d,[rsp+0x4c]
+    WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
+    LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
+    LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
+    LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
+    LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
+    LONG $0x28048d47               // lea    r8d,[r8+r13*1]
+    WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
+    LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
+    LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
+    LONG $0x00048d42               // lea    eax,[rax+r8*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d47               // lea    r8d,[r8+r15*1]
+    WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
+
+    // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x60)
+    LONG $0x60245403               // add    edx,[rsp+0x60]
+    WORD $0x2141; BYTE $0xc4       // and    r12d,eax
+    LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
+    LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
+    LONG $0x30048d47               // lea    r8d,[r8+r14*1]
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
+    LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
+    LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
+    WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
+    LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
+    LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
+    LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
+    WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
+
+    // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x64)
+    LONG $0x64244c03               // add    ecx,[rsp+0x64]
+    WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
+    LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
+    LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
+    LONG $0x32148d42               // lea    edx,[rdx+r14*1]
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xd789                   // mov    edi,edx
+    LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
+    LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
+    WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
+    LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
+    LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
+    LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
+    WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
+
+    // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x68)
+    LONG $0x68245c03               // add    ebx,[rsp+0x68]
+    WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
+    LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
+    LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
+    LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
+    LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
+    LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
+    WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
+    LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
+    LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
+    LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xd731                   // xor    edi,edx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
+    WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
+
+    // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x6c)
+    LONG $0x6c244403               // add    eax,[rsp+0x6c]
+    WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
+    LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
+    LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
+    LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xdf89                   // mov    edi,ebx
+    LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
+    LONG $0x28048d42               // lea    eax,[rax+r13*1]
+    WORD $0xcf31                   // xor    edi,ecx
+    LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
+    LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
+    LONG $0x00048d45               // lea    r8d,[r8+rax*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d42               // lea    eax,[rax+r15*1]
+    WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
+
+    // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, SP, 0x00)
+    LONG $0x241c0344               // add    r11d,[rsp]
+    WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
+    LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
+    LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
+    LONG $0x30048d42               // lea    eax,[rax+r14*1]
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
+    LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
+    LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
+    WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
+    LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
+    LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
+    LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xdf31                   // xor    edi,ebx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
+    WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
+
+    // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, SP, 0x04)
+    LONG $0x24540344; BYTE $0x04   // add    r10d,[rsp+0x4]
+    WORD $0x2141; BYTE $0xd4       // and    r12d,edx
+    LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
+    LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
+    LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
+    LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
+    LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
+    WORD $0xc731                   // xor    edi,eax
+    LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
+    LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
+    LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
+    WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
+
+    // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, SP, 0x08)
+    LONG $0x244c0344; BYTE $0x08   // add    r9d,[rsp+0x8]
+    WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
+    LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
+    LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
+    LONG $0x32148d47               // lea    r10d,[r10+r14*1]
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
+    LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
+    LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
+    WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
+    LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
+    LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
+    LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
+    WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
+
+    // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, SP, 0x0c)
+    LONG $0x24440344; BYTE $0x0c   // add    r8d,[rsp+0xc]
+    WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
+    LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
+    LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
+    LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
+    LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
+    LONG $0x28048d47               // lea    r8d,[r8+r13*1]
+    WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
+    LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
+    LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
+    LONG $0x00048d42               // lea    eax,[rax+r8*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d47               // lea    r8d,[r8+r15*1]
+    WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
+
+    // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, SP, 0x20)
+    LONG $0x20245403               // add    edx,[rsp+0x20]
+    WORD $0x2141; BYTE $0xc4       // and    r12d,eax
+    LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
+    LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
+    LONG $0x30048d47               // lea    r8d,[r8+r14*1]
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
+    LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
+    LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
+    WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
+    LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
+    LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
+    LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
+    WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
+
+    // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, SP, 0x24)
+    LONG $0x24244c03               // add    ecx,[rsp+0x24]
+    WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
+    LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
+    LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
+    LONG $0x32148d42               // lea    edx,[rdx+r14*1]
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xd789                   // mov    edi,edx
+    LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
+    LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
+    WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
+    LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
+    LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
+    LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
+    WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
+
+    // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, SP, 0x28)
+    LONG $0x28245c03               // add    ebx,[rsp+0x28]
+    WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
+    LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
+    LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
+    LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
+    LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
+    LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
+    WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
+    LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
+    LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
+    LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xd731                   // xor    edi,edx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
+    WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
+
+    // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, SP, 0x2c)
+    LONG $0x2c244403               // add    eax,[rsp+0x2c]
+    WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
+    LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
+    LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
+    LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xdf89                   // mov    edi,ebx
+    LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
+    LONG $0x28048d42               // lea    eax,[rax+r13*1]
+    WORD $0xcf31                   // xor    edi,ecx
+    LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
+    LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
+    LONG $0x00048d45               // lea    r8d,[r8+rax*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d42               // lea    eax,[rax+r15*1]
+    WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
+
+    MOVQ 0x200(SP), DI             // $_ctx
+    ADDQ R14, AX
+
+    LEAQ 0x1c0(SP), BP
+
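+    // Feed-forward: add the 64-round working registers back into the
+    // chaining value held in the context, then store the updated digest.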
+    ADDL (DI), AX
+    ADDL 4(DI), BX
+    ADDL 8(DI), CX
+    ADDL 12(DI), DX
+    ADDL 16(DI), R8
+    ADDL 20(DI), R9
+    ADDL 24(DI), R10
+    ADDL 28(DI), R11
+
+    MOVL AX, (DI)
+    MOVL BX, 4(DI)
+    MOVL CX, 8(DI)
+    MOVL DX, 12(DI)
+    MOVL R8, 16(DI)
+    MOVL R9, 20(DI)
+    MOVL R10, 24(DI)
+    MOVL R11, 28(DI)
+
+    CMPQ SI, 0x50(BP)              // $_end
+    JE   done
+
+    XORQ R14, R14
+    MOVQ BX, DI
+    XORQ CX, DI                    // magic
+    MOVQ R9, R12
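+    // "magic": DI now carries b^c between rounds, so each ROUND below
+    // can compute Maj(a,b,c) as ((b^c) & (a^b)) ^ b with a single AND
+    // and XOR, keeping a^b in R15 (or DI) ready for the next round.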
+
+loop2:
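+    // Second pass: the schedule rounds above interleaved two 64-byte
+    // blocks per ymm register, so the W+K words for the second block
+    // are already spilled on the stack; these rounds consume them
+    // directly, with no further message-schedule work.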
+    // ROUND(AX, BX, CX, DX, R8, R9, R10, R11, R12, R13, R14, R15, DI, BP, 0x10)
+    LONG $0x105d0344               // add    r11d,[rbp+0x10]
+    WORD $0x2145; BYTE $0xc4       // and    r12d,r8d
+    LONG $0xf07b43c4; WORD $0x19e8 // rorx   r13d,r8d,0x19
+    LONG $0xf07b43c4; WORD $0x0bf8 // rorx   r15d,r8d,0xb
+    LONG $0x30048d42               // lea    eax,[rax+r14*1]
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    LONG $0xf23842c4; BYTE $0xe2   // andn   r12d,r8d,r10d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f0 // rorx   r14d,r8d,0x6
+    LONG $0x231c8d47               // lea    r11d,[r11+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xc7       // mov    r15d,eax
+    LONG $0xf07b63c4; WORD $0x16e0 // rorx   r12d,eax,0x16
+    LONG $0x2b1c8d47               // lea    r11d,[r11+r13*1]
+    WORD $0x3141; BYTE $0xdf       // xor    r15d,ebx
+    LONG $0xf07b63c4; WORD $0x0df0 // rorx   r14d,eax,0xd
+    LONG $0xf07b63c4; WORD $0x02e8 // rorx   r13d,eax,0x2
+    LONG $0x1a148d42               // lea    edx,[rdx+r11*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xdf31                   // xor    edi,ebx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3b1c8d45               // lea    r11d,[r11+rdi*1]
+    WORD $0x8945; BYTE $0xc4       // mov    r12d,r8d
+
+    // ROUND(R11, AX, BX, CX, DX, R8, R9, R10, R12, R13, R14, DI, R15, BP, 0x14)
+    LONG $0x14550344               // add    r10d,[rbp+0x14]
+    WORD $0x2141; BYTE $0xd4       // and    r12d,edx
+    LONG $0xf07b63c4; WORD $0x19ea // rorx   r13d,edx,0x19
+    LONG $0xf07be3c4; WORD $0x0bfa // rorx   edi,edx,0xb
+    LONG $0x331c8d47               // lea    r11d,[r11+r14*1]
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    LONG $0xf26842c4; BYTE $0xe1   // andn   r12d,edx,r9d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f2 // rorx   r14d,edx,0x6
+    LONG $0x22148d47               // lea    r10d,[r10+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xdf       // mov    edi,r11d
+    LONG $0xf07b43c4; WORD $0x16e3 // rorx   r12d,r11d,0x16
+    LONG $0x2a148d47               // lea    r10d,[r10+r13*1]
+    WORD $0xc731                   // xor    edi,eax
+    LONG $0xf07b43c4; WORD $0x0df3 // rorx   r14d,r11d,0xd
+    LONG $0xf07b43c4; WORD $0x02eb // rorx   r13d,r11d,0x2
+    LONG $0x110c8d42               // lea    ecx,[rcx+r10*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xc7       // xor    r15d,eax
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x3a148d47               // lea    r10d,[r10+r15*1]
+    WORD $0x8941; BYTE $0xd4       // mov    r12d,edx
+
+    // ROUND(R10, R11, AX, BX, CX, DX, R8, R9, R12, R13, R14, R15, DI, BP, 0x18)
+    LONG $0x184d0344               // add    r9d,[rbp+0x18]
+    WORD $0x2141; BYTE $0xcc       // and    r12d,ecx
+    LONG $0xf07b63c4; WORD $0x19e9 // rorx   r13d,ecx,0x19
+    LONG $0xf07b63c4; WORD $0x0bf9 // rorx   r15d,ecx,0xb
+    LONG $0x32148d47               // lea    r10d,[r10+r14*1]
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    LONG $0xf27042c4; BYTE $0xe0   // andn   r12d,ecx,r8d
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f1 // rorx   r14d,ecx,0x6
+    LONG $0x210c8d47               // lea    r9d,[r9+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xd7       // mov    r15d,r10d
+    LONG $0xf07b43c4; WORD $0x16e2 // rorx   r12d,r10d,0x16
+    LONG $0x290c8d47               // lea    r9d,[r9+r13*1]
+    WORD $0x3145; BYTE $0xdf       // xor    r15d,r11d
+    LONG $0xf07b43c4; WORD $0x0df2 // rorx   r14d,r10d,0xd
+    LONG $0xf07b43c4; WORD $0x02ea // rorx   r13d,r10d,0x2
+    LONG $0x0b1c8d42               // lea    ebx,[rbx+r9*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xdf       // xor    edi,r11d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d45               // lea    r9d,[r9+rdi*1]
+    WORD $0x8941; BYTE $0xcc       // mov    r12d,ecx
+
+    // ROUND(R9, R10, R11, AX, BX, CX, DX, R8, R12, R13, R14, DI, R15, BP, 0x1c)
+    LONG $0x1c450344               // add    r8d,[rbp+0x1c]
+    WORD $0x2141; BYTE $0xdc       // and    r12d,ebx
+    LONG $0xf07b63c4; WORD $0x19eb // rorx   r13d,ebx,0x19
+    LONG $0xf07be3c4; WORD $0x0bfb // rorx   edi,ebx,0xb
+    LONG $0x310c8d47               // lea    r9d,[r9+r14*1]
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    LONG $0xf26062c4; BYTE $0xe2   // andn   r12d,ebx,edx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b63c4; WORD $0x06f3 // rorx   r14d,ebx,0x6
+    LONG $0x20048d47               // lea    r8d,[r8+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8944; BYTE $0xcf       // mov    edi,r9d
+    LONG $0xf07b43c4; WORD $0x16e1 // rorx   r12d,r9d,0x16
+    LONG $0x28048d47               // lea    r8d,[r8+r13*1]
+    WORD $0x3144; BYTE $0xd7       // xor    edi,r10d
+    LONG $0xf07b43c4; WORD $0x0df1 // rorx   r14d,r9d,0xd
+    LONG $0xf07b43c4; WORD $0x02e9 // rorx   r13d,r9d,0x2
+    LONG $0x00048d42               // lea    eax,[rax+r8*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xd7       // xor    r15d,r10d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d47               // lea    r8d,[r8+r15*1]
+    WORD $0x8941; BYTE $0xdc       // mov    r12d,ebx
+
+    // ROUND(R8, R9, R10, R11, AX, BX, CX, DX, R12, R13, R14, R15, DI, BP, 0x30)
+    WORD $0x5503; BYTE $0x30       // add    edx,[rbp+0x30]
+    WORD $0x2141; BYTE $0xc4       // and    r12d,eax
+    LONG $0xf07b63c4; WORD $0x19e8 // rorx   r13d,eax,0x19
+    LONG $0xf07b63c4; WORD $0x0bf8 // rorx   r15d,eax,0xb
+    LONG $0x30048d47               // lea    r8d,[r8+r14*1]
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    LONG $0xf27862c4; BYTE $0xe1   // andn   r12d,eax,ecx
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b63c4; WORD $0x06f0 // rorx   r14d,eax,0x6
+    LONG $0x22148d42               // lea    edx,[rdx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8945; BYTE $0xc7       // mov    r15d,r8d
+    LONG $0xf07b43c4; WORD $0x16e0 // rorx   r12d,r8d,0x16
+    LONG $0x2a148d42               // lea    edx,[rdx+r13*1]
+    WORD $0x3145; BYTE $0xcf       // xor    r15d,r9d
+    LONG $0xf07b43c4; WORD $0x0df0 // rorx   r14d,r8d,0xd
+    LONG $0xf07b43c4; WORD $0x02e8 // rorx   r13d,r8d,0x2
+    LONG $0x131c8d45               // lea    r11d,[r11+rdx*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3144; BYTE $0xcf       // xor    edi,r9d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x148d; BYTE $0x3a       // lea    edx,[rdx+rdi*1]
+    WORD $0x8941; BYTE $0xc4       // mov    r12d,eax
+
+    // ROUND(DX, R8, R9, R10, R11, AX, BX, CX, R12, R13, R14, DI, R15, BP, 0x34)
+    WORD $0x4d03; BYTE $0x34       // add    ecx,[rbp+0x34]
+    WORD $0x2145; BYTE $0xdc       // and    r12d,r11d
+    LONG $0xf07b43c4; WORD $0x19eb // rorx   r13d,r11d,0x19
+    LONG $0xf07bc3c4; WORD $0x0bfb // rorx   edi,r11d,0xb
+    LONG $0x32148d42               // lea    edx,[rdx+r14*1]
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    LONG $0xf22062c4; BYTE $0xe3   // andn   r12d,r11d,ebx
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f3 // rorx   r14d,r11d,0x6
+    LONG $0x210c8d42               // lea    ecx,[rcx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xd789                   // mov    edi,edx
+    LONG $0xf07b63c4; WORD $0x16e2 // rorx   r12d,edx,0x16
+    LONG $0x290c8d42               // lea    ecx,[rcx+r13*1]
+    WORD $0x3144; BYTE $0xc7       // xor    edi,r8d
+    LONG $0xf07b63c4; WORD $0x0df2 // rorx   r14d,edx,0xd
+    LONG $0xf07b63c4; WORD $0x02ea // rorx   r13d,edx,0x2
+    LONG $0x0a148d45               // lea    r10d,[r10+rcx*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3145; BYTE $0xc7       // xor    r15d,r8d
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x390c8d42               // lea    ecx,[rcx+r15*1]
+    WORD $0x8945; BYTE $0xdc       // mov    r12d,r11d
+
+    // ROUND(CX, DX, R8, R9, R10, R11, AX, BX, R12, R13, R14, R15, DI, BP, 0x38)
+    WORD $0x5d03; BYTE $0x38       // add    ebx,[rbp+0x38]
+    WORD $0x2145; BYTE $0xd4       // and    r12d,r10d
+    LONG $0xf07b43c4; WORD $0x19ea // rorx   r13d,r10d,0x19
+    LONG $0xf07b43c4; WORD $0x0bfa // rorx   r15d,r10d,0xb
+    LONG $0x310c8d42               // lea    ecx,[rcx+r14*1]
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    LONG $0xf22862c4; BYTE $0xe0   // andn   r12d,r10d,eax
+    WORD $0x3145; BYTE $0xfd       // xor    r13d,r15d
+    LONG $0xf07b43c4; WORD $0x06f2 // rorx   r14d,r10d,0x6
+    LONG $0x231c8d42               // lea    ebx,[rbx+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0x8941; BYTE $0xcf       // mov    r15d,ecx
+    LONG $0xf07b63c4; WORD $0x16e1 // rorx   r12d,ecx,0x16
+    LONG $0x2b1c8d42               // lea    ebx,[rbx+r13*1]
+    WORD $0x3141; BYTE $0xd7       // xor    r15d,edx
+    LONG $0xf07b63c4; WORD $0x0df1 // rorx   r14d,ecx,0xd
+    LONG $0xf07b63c4; WORD $0x02e9 // rorx   r13d,ecx,0x2
+    LONG $0x190c8d45               // lea    r9d,[r9+rbx*1]
+    WORD $0x2144; BYTE $0xff       // and    edi,r15d
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0xd731                   // xor    edi,edx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    WORD $0x1c8d; BYTE $0x3b       // lea    ebx,[rbx+rdi*1]
+    WORD $0x8945; BYTE $0xd4       // mov    r12d,r10d
+
+    // ROUND(BX, CX, DX, R8, R9, R10, R11, AX, R12, R13, R14, DI, R15, BP, 0x3c)
+    WORD $0x4503; BYTE $0x3c       // add    eax,[rbp+0x3c]
+    WORD $0x2145; BYTE $0xcc       // and    r12d,r9d
+    LONG $0xf07b43c4; WORD $0x19e9 // rorx   r13d,r9d,0x19
+    LONG $0xf07bc3c4; WORD $0x0bf9 // rorx   edi,r9d,0xb
+    LONG $0x331c8d42               // lea    ebx,[rbx+r14*1]
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    LONG $0xf23042c4; BYTE $0xe3   // andn   r12d,r9d,r11d
+    WORD $0x3141; BYTE $0xfd       // xor    r13d,edi
+    LONG $0xf07b43c4; WORD $0x06f1 // rorx   r14d,r9d,0x6
+    LONG $0x20048d42               // lea    eax,[rax+r12*1]
+    WORD $0x3145; BYTE $0xf5       // xor    r13d,r14d
+    WORD $0xdf89                   // mov    edi,ebx
+    LONG $0xf07b63c4; WORD $0x16e3 // rorx   r12d,ebx,0x16
+    LONG $0x28048d42               // lea    eax,[rax+r13*1]
+    WORD $0xcf31                   // xor    edi,ecx
+    LONG $0xf07b63c4; WORD $0x0df3 // rorx   r14d,ebx,0xd
+    LONG $0xf07b63c4; WORD $0x02eb // rorx   r13d,ebx,0x2
+    LONG $0x00048d45               // lea    r8d,[r8+rax*1]
+    WORD $0x2141; BYTE $0xff       // and    r15d,edi
+    WORD $0x3145; BYTE $0xe6       // xor    r14d,r12d
+    WORD $0x3141; BYTE $0xcf       // xor    r15d,ecx
+    WORD $0x3145; BYTE $0xee       // xor    r14d,r13d
+    LONG $0x38048d42               // lea    eax,[rax+r15*1]
+    WORD $0x8945; BYTE $0xcc       // mov    r12d,r9d
+
+    ADDQ $-0x40, BP
+    CMPQ BP, SP
+    JAE  loop2
+
+    MOVQ 0x200(SP), DI             // $_ctx
+    ADDQ R14, AX
+
+    ADDQ $0x1c0, SP
+
+    ADDL (DI), AX
+    ADDL 4(DI), BX
+    ADDL 8(DI), CX
+    ADDL 12(DI), DX
+    ADDL 16(DI), R8
+    ADDL 20(DI), R9
+
+    ADDQ $0x80, SI                 // input += 2 blocks (0x80 = 2 * 64 bytes)
+    ADDL 24(DI), R10
+    MOVQ SI, R12
+    ADDL 28(DI), R11
+    CMPQ SI, 0x50(SP)              // input == _end
+
+    MOVL AX, (DI)
+    LONG $0xe4440f4c               // cmove  r12,rsp                /* next block or stale data */
+    MOVL AX, (DI)
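+    // (AX is stored twice above, mirroring the Intel reference
+    // implementation; the second store is redundant but harmless)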
+    MOVL BX, 4(DI)
+    MOVL CX, 8(DI)
+    MOVL DX, 12(DI)
+    MOVL R8, 16(DI)
+    MOVL R9, 20(DI)
+    MOVL R10, 24(DI)
+    MOVL R11, 28(DI)
+
+    JBE loop0
+    LEAQ (SP), BP
+
+done:
+    MOVQ BP, SP
+    MOVQ 0x58(SP), SP
+    WORD $0xf8c5; BYTE $0x77     // vzeroupper
+
+    RET
+

+ 22 - 0
vendor/github.com/minio/sha256-simd/sha256blockAvx_amd64.go

@@ -0,0 +1,22 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
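+// blockAvx runs the SHA256 compression function over the 64-byte
+// blocks in message using AVX instructions, updating the digest state
+// h in place. The reserved arguments are never read as values: judging
+// from the _xfer+48(FP) references in the accompanying assembly, their
+// argument slots are reused as scratch space, so callers presumably
+// pass zeros.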
+//go:noescape
+func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)

+ 409 - 0
vendor/github.com/minio/sha256-simd/sha256blockAvx_amd64.s

@@ -0,0 +1,409 @@
+//+build !noasm !appengine
+
+// SHA256 implementation for AVX
+
+//
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// This code is based on an Intel White-Paper:
+// "Fast SHA-256 Implementations on Intel Architecture Processors"
+//
+// together with the reference implementation from the following authors:
+//    James Guilford <james.guilford@intel.com>
+//    Kirk Yap <kirk.s.yap@intel.com>
+//    Tim Chen <tim.c.chen@linux.intel.com>
+//
+// For Go it has been converted to Plan 9 assembly with the help of
+// github.com/minio/asm2plan9s, which assembles the Intel instructions
+// into their Plan 9 equivalents
+//
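+//
+// As an example of that encoding, the instruction
+//     VPALIGNR XMM0,XMM7,XMM6,0x4
+// appears below as its raw byte encoding
+//     LONG $0x0f41e3c4; WORD $0x04c6
+// with the Intel mnemonic preserved in a trailing comment, since the
+// Go assembler of the time lacked mnemonics for most AVX instructions.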
+
+#include "textflag.h"
+
+#define ROTATE_XS \
+    MOVOU  X4, X15 \
+    MOVOU  X5, X4 \
+    MOVOU  X6, X5 \
+    MOVOU  X7, X6 \
+    MOVOU X15, X7
+
+// compute s0 four at a time and s1 two at a time
+// compute W[-16] + W[-7] 4 at a time
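+//
+// The schedule computed here is the FIPS 180-4 recurrence
+//     W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
+// with
+//     sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3)
+//     sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)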
+#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
+    MOVL e, R13 \                                                        /* y0 = e                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    MOVL a, R14 \                                                        /* y1 = a                                  */
+    LONG $0x0f41e3c4; WORD $0x04c6 \ // VPALIGNR XMM0,XMM7,XMM6,0x4      /* XTMP0 = W[-7]                           */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    MOVL f, R15 \                                                        /* y2 = f                                  */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    XORL g, R15 \                                                        /* y2 = f^g                                */
+    LONG $0xc4fef9c5               \ // VPADDD XMM0,XMM0,XMM4            /* XTMP0 = W[-7] + W[-16]                  */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL e, R15 \                                                        /* y2 = (f^g)&e                            */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    \                                                                    /*                                         */
+    \                                                                    /* compute s0                              */
+    \                                                                    /*                                         */
+    LONG $0x0f51e3c4; WORD $0x04cc \ // VPALIGNR XMM1,XMM5,XMM4,0x4      /* XTMP1 = W[-15]                          */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    XORL g, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ADDL _xfer+48(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    MOVL a, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, h \                                                        /*  h = h + S1 + CH + k + w                */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL a, R15 \                                                        /* y2 = a                                  */
+    LONG $0xd172e9c5; BYTE $0x07   \ // VPSRLD XMM2,XMM1,0x7             /*                                         */
+    ORL  c, R13 \                                                        /* y0 = a|c                                */
+    ADDL h, d \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL c, R15 \                                                        /* y2 = a&c                                */
+    LONG $0xf172e1c5; BYTE $0x19   \ // VPSLLD XMM3,XMM1,0x19            /*                                         */
+    ANDL b, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, h \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    LONG $0xdaebe1c5               \ // VPOR   XMM3,XMM3,XMM2            /* XTMP1 = W[-15] MY_ROR 7                 */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, h \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL d, R13 \                                                        /* y0 = e                                  */
+    MOVL h, R14 \                                                        /* y1 = a                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    XORL d, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    MOVL e, R15 \                                                        /* y2 = f                                  */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    LONG $0xd172e9c5; BYTE $0x12   \ // VPSRLD XMM2,XMM1,0x12            /*                                         */
+    XORL h, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    XORL f, R15 \                                                        /* y2 = f^g                                */
+    LONG $0xd172b9c5; BYTE $0x03   \ // VPSRLD XMM8,XMM1,0x3             /* XTMP4 = W[-15] >> 3                     */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    XORL d, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL d, R15 \                                                        /* y2 = (f^g)&e                            */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    LONG $0xf172f1c5; BYTE $0x0e   \ // VPSLLD XMM1,XMM1,0xe             /*                                         */
+    XORL h, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    XORL f, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    LONG $0xd9efe1c5               \ // VPXOR  XMM3,XMM3,XMM1            /*                                         */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ADDL _xfer+52(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    LONG $0xdaefe1c5               \ // VPXOR  XMM3,XMM3,XMM2            /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */
+    MOVL h, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, g \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL h, R15 \                                                        /* y2 = a                                  */
+    LONG $0xef61c1c4; BYTE $0xc8   \ // VPXOR  XMM1,XMM3,XMM8            /* XTMP1 = s0                              */
+    ORL  b, R13 \                                                        /* y0 = a|c                                */
+    ADDL g, c \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL b, R15 \                                                        /* y2 = a&c                                */
+    \                                                                    /*                                         */
+    \                                                                    /* compute low s1                          */
+    \                                                                    /*                                         */
+    LONG $0xd770f9c5; BYTE $0xfa   \ // VPSHUFD XMM2,XMM7,0xfa           /* XTMP2 = W[-2] {BBAA}                    */
+    ANDL a, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, g \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    LONG $0xc1fef9c5               \ // VPADDD XMM0,XMM0,XMM1            /* XTMP0 = W[-16] + W[-7] + s0             */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, g \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL c, R13 \                                                        /* y0 = e                                  */
+    MOVL g, R14 \                                                        /* y1 = a                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    XORL c, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    MOVL d, R15 \                                                        /* y2 = f                                  */
+    XORL g, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    LONG $0xd272b9c5; BYTE $0x0a   \ // VPSRLD XMM8,XMM2,0xa             /* XTMP4 = W[-2] >> 10 {BBAA}              */
+    XORL e, R15 \                                                        /* y2 = f^g                                */
+    LONG $0xd273e1c5; BYTE $0x13   \ // VPSRLQ XMM3,XMM2,0x13            /* XTMP3 = W[-2] MY_ROR 19 {xBxA}          */
+    XORL c, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL c, R15 \                                                        /* y2 = (f^g)&e                            */
+    LONG $0xd273e9c5; BYTE $0x11   \ // VPSRLQ XMM2,XMM2,0x11            /* XTMP2 = W[-2] MY_ROR 17 {xBxA}          */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    XORL g, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    XORL e, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    LONG $0xd3efe9c5               \ // VPXOR  XMM2,XMM2,XMM3            /*                                         */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL _xfer+56(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    LONG $0xc2ef39c5               \ // VPXOR  XMM8,XMM8,XMM2            /* XTMP4 = s1 {xBxA}                       */
+    MOVL g, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, f \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL g, R15 \                                                        /* y2 = a                                  */
+    LONG $0x003942c4; BYTE $0xc2   \ // VPSHUFB XMM8,XMM8,XMM10          /* XTMP4 = s1 {00BA}                       */
+    ORL  a, R13 \                                                        /* y0 = a|c                                */
+    ADDL f, b \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL a, R15 \                                                        /* y2 = a&c                                */
+    LONG $0xfe79c1c4; BYTE $0xc0   \ // VPADDD XMM0,XMM0,XMM8            /* XTMP0 = {..., ..., W[1], W[0]}          */
+    ANDL h, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, f \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    \                                                                    /*                                         */
+    \                                                                    /* compute high s1                         */
+    \                                                                    /*                                         */
+    LONG $0xd070f9c5; BYTE $0x50   \ // VPSHUFD XMM2,XMM0,0x50           /* XTMP2 = W[-2] {DDCC}                    */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, f \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL b, R13 \                                                        /* y0 = e                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    MOVL f, R14 \                                                        /* y1 = a                                  */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    XORL b, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    MOVL c, R15 \                                                        /* y2 = f                                  */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    LONG $0xd272a1c5; BYTE $0x0a   \ // VPSRLD XMM11,XMM2,0xa            /* XTMP5 = W[-2] >> 10 {DDCC}              */
+    XORL f, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    XORL d, R15 \                                                        /* y2 = f^g                                */
+    LONG $0xd273e1c5; BYTE $0x13   \ // VPSRLQ XMM3,XMM2,0x13            /* XTMP3 = W[-2] MY_ROR 19 {xDxC}          */
+    XORL b, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL b, R15 \                                                        /* y2 = (f^g)&e                            */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    LONG $0xd273e9c5; BYTE $0x11   \ // VPSRLQ XMM2,XMM2,0x11            /* XTMP2 = W[-2] MY_ROR 17 {xDxC}          */
+    XORL f, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    XORL d, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    LONG $0xd3efe9c5               \ // VPXOR  XMM2,XMM2,XMM3            /*                                         */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ADDL _xfer+60(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    LONG $0xdaef21c5               \ // VPXOR  XMM11,XMM11,XMM2          /* XTMP5 = s1 {xDxC}                       */
+    MOVL f, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, e \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL f, R15 \                                                        /* y2 = a                                  */
+    LONG $0x002142c4; BYTE $0xdc   \ // VPSHUFB XMM11,XMM11,XMM12        /* XTMP5 = s1 {DC00}                       */
+    ORL  h, R13 \                                                        /* y0 = a|c                                */
+    ADDL e, a \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL h, R15 \                                                        /* y2 = a&c                                */
+    LONG $0xe0fea1c5               \ // VPADDD XMM4,XMM11,XMM0           /* X0 = {W[3], W[2], W[1], W[0]}           */
+    ANDL g, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, e \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, e \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    ROTATE_XS
+
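The FOUR_ROUNDS_AND_SCHED macro above interleaves four compression rounds with the standard SHA-256 message expansion, computing s0 for four words and s1 for two words at a time in XMM registers. As a reference point, here is a scalar Go sketch of the expansion the vector code implements (an editorial illustration, not part of the vendored package; math/bits is used for brevity):

	import "math/bits"

	// sigma0 and sigma1 are the small sigma functions of FIPS 180-4.
	// The PSRLD/PSLLD/POR sequences above implement the same rotates.
	func sigma0(x uint32) uint32 {
		return bits.RotateLeft32(x, -7) ^ bits.RotateLeft32(x, -18) ^ (x >> 3)
	}

	func sigma1(x uint32) uint32 {
		return bits.RotateLeft32(x, -17) ^ bits.RotateLeft32(x, -19) ^ (x >> 10)
	}

	// schedule expands the 16 loaded message words to all 64:
	// W[t] = W[t-16] + sigma0(W[t-15]) + W[t-7] + sigma1(W[t-2])
	func schedule(w *[64]uint32) {
		for t := 16; t < 64; t++ {
			w[t] = w[t-16] + sigma0(w[t-15]) + w[t-7] + sigma1(w[t-2])
		}
	}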
+
+#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
+    MOVL e, R13 \                                                        /* y0 = e                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    MOVL a, R14 \                                                        /* y1 = a                                  */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    MOVL f, R15 \                                                        /* y2 = f                                  */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    XORL g, R15 \                                                        /* y2 = f^g                                */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    ANDL e, R15 \                                                        /* y2 = (f^g)&e                            */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    XORL g, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL _xfer+offset(FP), R15 \                                         /* y2 = k + w + S1 + CH                    */
+    MOVL a, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, h \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL a, R15 \                                                        /* y2 = a                                  */
+    ORL  c, R13 \                                                        /* y0 = a|c                                */
+    ADDL h, d \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL c, R15 \                                                        /* y2 = a&c                                */
+    ANDL b, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, h \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, h                                                          /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+
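DO_ROUND is a single SHA-256 compression round. Two identities explain the register comments: CH(e,f,g) is computed as ((f^g)&e)^g, which equals (e AND f) XOR (NOT e AND g), and MAJ(a,b,c) as ((a|c)&b)|(a&c), which equals the majority function; both forms save an instruction over the textbook expressions. A scalar Go sketch of the same round, continuing the illustration above:

	// round returns the new e and new a; the other working variables
	// simply shift down one slot (the ROTATE_ARGS step in the comments).
	func round(a, b, c, d, e, f, g, h, k, w uint32) (newE, newA uint32) {
		s1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
		ch := ((f ^ g) & e) ^ g // == (e & f) ^ (^e & g)
		t1 := h + s1 + ch + k + w
		s0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
		maj := ((a | c) & b) | (a & c) // == majority of a, b, c
		return d + t1, t1 + s0 + maj
	}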
+
+// func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
+TEXT ·blockAvx(SB), 7, $0
+
+	MOVQ  h+0(FP), SI // SI: &h
+    MOVQ  message+24(FP), R8    // &message
+    MOVQ  lenmessage+32(FP), R9 // length of message
+    CMPQ  R9, $0
+	JEQ   done_hash
+    ADDQ  R8, R9
+    MOVQ  R9, _inp_end+64(FP)   // store end of message
+
+    // Register definition
+    //  a -->  eax
+    //  b -->  ebx
+    //  c -->  ecx
+    //  d -->  r8d
+    //  e -->  edx
+    //  f -->  r9d
+    //  g --> r10d
+    //  h --> r11d
+    //
+    // y0 --> r13d
+    // y1 --> r14d
+    // y2 --> r15d
+
+    MOVL    (0*4)(SI),  AX        // a = H0
+    MOVL    (1*4)(SI),  BX        // b = H1
+    MOVL    (2*4)(SI),  CX        // c = H2
+    MOVL    (3*4)(SI),  R8        // d = H3
+    MOVL    (4*4)(SI),  DX        // e = H4
+    MOVL    (5*4)(SI),  R9        // f = H5
+    MOVL    (6*4)(SI), R10        // g = H6
+    MOVL    (7*4)(SI), R11        // h = H7
+
+	MOVOU bflipMask<>(SB), X13
+	MOVOU  shuf00BA<>(SB), X10    // shuffle xBxA -> 00BA
+	MOVOU  shufDC00<>(SB), X12    // shuffle xDxC -> DC00
+
+	MOVQ  message+24(FP), SI // SI: &message
+
+loop0:
+	LEAQ constants<>(SB), BP
+
+	// byte swap first 16 dwords
+    MOVOU 0*16(SI), X4
+    LONG $0x0059c2c4; BYTE $0xe5 // VPSHUFB XMM4, XMM4, XMM13
+    MOVOU 1*16(SI), X5
+    LONG $0x0051c2c4; BYTE $0xed // VPSHUFB XMM5, XMM5, XMM13
+    MOVOU 2*16(SI), X6
+    LONG $0x0049c2c4; BYTE $0xf5 // VPSHUFB XMM6, XMM6, XMM13
+    MOVOU 3*16(SI), X7
+    LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
+
+    MOVQ SI, _inp+72(FP)
+    MOVD $0x3, DI
+
+	// schedule 48 input dwords, by doing 3 rounds of 16 each
+loop1:
+    LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP]   /* Add 1st constant to first part of message */
+    MOVOU X9, _xfer+48(FP)
+    FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
+
+    LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP]   /* Add 2nd constant to message */
+    MOVOU X9, _xfer+48(FP)
+    FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
+
+    LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP]   /* Add 3rd constant to message */
+    MOVOU X9, _xfer+48(FP)
+    FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
+
+    LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP]   /* Add 4th constant to message */
+    MOVOU X9, _xfer+48(FP)
+    ADDQ $64, BP
+    FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
+
+	SUBQ    $1, DI
+	JNE     loop1
+
+    MOVD $0x2, DI
+loop2:
+    LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP]   /* Add 1st constant to first part of message */
+    MOVOU X9, _xfer+48(FP)
+    DO_ROUND( AX,  BX,  CX,  R8,  DX,  R9, R10, R11, 48)
+    DO_ROUND(R11,  AX,  BX,  CX,  R8,  DX,  R9, R10, 52)
+    DO_ROUND(R10, R11,  AX,  BX,  CX,  R8,  DX,  R9, 56)
+    DO_ROUND( R9, R10, R11,  AX,  BX,  CX,  R8,  DX, 60)
+
+    LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP]   /* Add 2nd constant to message */
+    MOVOU X9, _xfer+48(FP)
+    ADDQ $32, BP
+    DO_ROUND( DX,  R9, R10, R11,  AX,  BX,  CX,  R8, 48)
+    DO_ROUND( R8,  DX,  R9, R10, R11,  AX,  BX,  CX, 52)
+    DO_ROUND( CX,  R8,  DX,  R9, R10, R11,  AX,  BX, 56)
+    DO_ROUND( BX,  CX,  R8,  DX,  R9, R10, R11,  AX, 60)
+
+    MOVOU  X6, X4
+    MOVOU  X7, X5
+
+	SUBQ    $1, DI
+	JNE     loop2
+
+    MOVQ    h+0(FP), SI // SI: &h
+    ADDL    (0*4)(SI), AX     // H0 = a + H0
+    MOVL    AX, (0*4)(SI)
+    ADDL    (1*4)(SI), BX     // H1 = b + H1
+    MOVL    BX, (1*4)(SI)
+    ADDL    (2*4)(SI), CX     // H2 = c + H2
+    MOVL    CX, (2*4)(SI)
+    ADDL    (3*4)(SI), R8     // H3 = d + H3
+    MOVL    R8, (3*4)(SI)
+    ADDL    (4*4)(SI), DX     // H4 = e + H4
+    MOVL    DX, (4*4)(SI)
+    ADDL    (5*4)(SI), R9     // H5 = f + H5
+    MOVL    R9, (5*4)(SI)
+    ADDL    (6*4)(SI), R10    // H6 = g + H6
+    MOVL    R10, (6*4)(SI)
+    ADDL    (7*4)(SI), R11    // H7 = h + H7
+    MOVL    R11, (7*4)(SI)
+
+    MOVQ _inp+72(FP), SI
+	ADDQ $64, SI
+	CMPQ _inp_end+64(FP), SI
+	JNE  loop0
+
+done_hash:
+    RET
+
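Taken together, one pass of loop0 above is equivalent to the following scalar Go (an editorial sketch reusing the schedule and round helpers illustrated earlier; it additionally needs encoding/binary, and K stands for the 64 round constants laid out in the table below). The assembly interleaves scheduling with the first 48 rounds and streams constants via BP, but the arithmetic is the same:

	func blockScalar(h *[8]uint32, p []byte, K *[64]uint32) {
		for len(p) >= 64 {
			var w [64]uint32
			for i := 0; i < 16; i++ {
				// the PSHUFB with bflipMask: load each dword big-endian
				w[i] = binary.BigEndian.Uint32(p[4*i:])
			}
			schedule(&w)
			a, b, c, d, e, f, g, hh := h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
			for t := 0; t < 64; t++ {
				e2, a2 := round(a, b, c, d, e, f, g, hh, K[t], w[t])
				a, b, c, d, e, f, g, hh = a2, a, b, c, e2, e, f, g
			}
			h[0] += a; h[1] += b; h[2] += c; h[3] += d
			h[4] += e; h[5] += f; h[6] += g; h[7] += hh
			p = p[64:]
		}
	}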
+// Constants table
+DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
+DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
+DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
+DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
+DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
+DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
+DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
+DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
+DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
+DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
+DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
+DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
+DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
+DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
+DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
+DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
+DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
+DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
+DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
+DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
+DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
+DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
+DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
+DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
+DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
+DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
+DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
+DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
+DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
+DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
+DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
+DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
+
+DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
+DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
+
+DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
+DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
+
+DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
+
+GLOBL constants<>(SB), 8, $256
+GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
+GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
+GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16
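A note on the data section: each quadword in constants<> packs two consecutive 32-bit round constants little-endian, so 0x71374491428a2f98 holds K[0] = 0x428a2f98 in the low dword and K[1] = 0x71374491 in the high dword, while bflipMask is the per-dword byte-reversal shuffle used by the PSHUFB loads. A small illustrative Go helper (not part of the package):

	// packK shows how one constants<> quadword is formed.
	func packK(k0, k1 uint32) uint64 {
		return uint64(k1)<<32 | uint64(k0)
	}

	// packK(0x428a2f98, 0x71374491) == 0x71374491428a2f98, matching constants<>+0x0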

+ 22 - 0
vendor/github.com/minio/sha256-simd/sha256blockSsse_amd64.go

@@ -0,0 +1,22 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+//go:noescape
+func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)

+ 430 - 0
vendor/github.com/minio/sha256-simd/sha256blockSsse_amd64.s

@@ -0,0 +1,430 @@
+//+build !noasm !appengine
+
+// SHA256 implementation for SSSE3
+
+//
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// This code is based on an Intel White-Paper:
+// "Fast SHA-256 Implementations on Intel Architecture Processors"
+//
+// together with the reference implementation from the following authors:
+//    James Guilford <[email protected]>
+//    Kirk Yap <[email protected]>
+//    Tim Chen <[email protected]>
+//
+// For Go it has been converted to Plan 9 assembly, with the help of
+// github.com/minio/asm2plan9s to assemble the Intel instructions into their
+// Plan 9 equivalents
+//
+
+#include "textflag.h"
+
+#define ROTATE_XS \
+    MOVOU  X4, X15 \
+    MOVOU  X5, X4 \
+    MOVOU  X6, X5 \
+    MOVOU  X7, X6 \
+    MOVOU X15, X7
+
+// compute s0 four at a time and s1 two at a time
+// compute W[-16] + W[-7] 4 at a time
+#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
+    MOVL e, R13 \                                                        /* y0 = e                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    MOVL a, R14 \                                                        /* y1 = a                                  */
+    MOVOU X7, X0 \
+    LONG $0x0f3a0f66; WORD $0x04c6 \ // PALIGNR XMM0,XMM6,0x4            /* XTMP0 = W[-7]                           */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    MOVL f, R15 \                                                        /* y2 = f                                  */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    XORL g, R15 \                                                        /* y2 = f^g                                */
+    LONG $0xc4fe0f66               \ // PADDD XMM0,XMM4                  /* XTMP0 = W[-7] + W[-16]                  */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL e, R15 \                                                        /* y2 = (f^g)&e                            */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    \                                                                    /*                                         */
+    \                                                                    /* compute s0                              */
+    \                                                                    /*                                         */
+    MOVOU X5, X1 \
+    LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1,XMM4,0x4            /* XTMP1 = W[-15]                          */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    XORL g, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ADDL _xfer+48(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    MOVL a, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, h \                                                        /*  h = h + S1 + CH + k + w                */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL a, R15 \                                                        /* y2 = a                                  */
+    MOVOU X1, X2 \
+    LONG $0xd2720f66; BYTE $0x07   \ // PSRLD XMM2,0x7                   /*                                         */
+    ORL  c, R13 \                                                        /* y0 = a|c                                */
+    ADDL h, d \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL c, R15 \                                                        /* y2 = a&c                                */
+    MOVOU X1, X3 \
+    LONG $0xf3720f66; BYTE $0x19   \ // PSLLD XMM3,0x19                  /*                                         */
+    ANDL b, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, h \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    LONG $0xdaeb0f66               \ // POR   XMM3,XMM2                  /* XTMP1 = W[-15] MY_ROR 7                 */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, h \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL d, R13 \                                                        /* y0 = e                                  */
+    MOVL h, R14 \                                                        /* y1 = a                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    XORL d, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    MOVL e, R15 \                                                        /* y2 = f                                  */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    MOVOU X1, X2 \
+    LONG $0xd2720f66; BYTE $0x12   \ // PSRLD XMM2,0x12                  /*                                         */
+    XORL h, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    XORL f, R15 \                                                        /* y2 = f^g                                */
+    MOVOU X1, X8 \
+    LONG $0x720f4166; WORD $0x03d0 \ // PSRLD XMM8,0x3                   /* XTMP4 = W[-15] >> 3                     */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    XORL d, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL d, R15 \                                                        /* y2 = (f^g)&e                            */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    LONG $0xf1720f66; BYTE $0x0e   \ // PSLLD XMM1,0xe                   /*                                         */
+    XORL h, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    XORL f, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    LONG $0xd9ef0f66               \ // PXOR  XMM3,XMM1                  /*                                         */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ADDL _xfer+52(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    LONG $0xdaef0f66               \ // PXOR  XMM3,XMM2                  /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */
+    MOVL h, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, g \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL h, R15 \                                                        /* y2 = a                                  */
+    MOVOU X3, X1 \
+    LONG $0xef0f4166; BYTE $0xc8   \ // PXOR  XMM1,XMM8                  /* XTMP1 = s0                              */
+    ORL  b, R13 \                                                        /* y0 = a|c                                */
+    ADDL g, c \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL b, R15 \                                                        /* y2 = a&c                                */
+    \                                                                    /*                                         */
+    \                                                                    /* compute low s1                          */
+    \                                                                    /*                                         */
+    LONG $0xd7700f66; BYTE $0xfa   \ // PSHUFD XMM2,XMM7,0xfa            /* XTMP2 = W[-2] {BBAA}                    */
+    ANDL a, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, g \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    LONG $0xc1fe0f66               \ // PADDD XMM0,XMM1                  /* XTMP0 = W[-16] + W[-7] + s0             */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, g \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL c, R13 \                                                        /* y0 = e                                  */
+    MOVL g, R14 \                                                        /* y1 = a                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    XORL c, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    MOVL d, R15 \                                                        /* y2 = f                                  */
+    XORL g, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    MOVOU X2, X8 \
+    LONG $0x720f4166; WORD $0x0ad0 \ // PSRLD XMM8,0xa                   /* XTMP4 = W[-2] >> 10 {BBAA}              */
+    XORL e, R15 \                                                        /* y2 = f^g                                */
+    MOVOU X2, X3 \
+    LONG $0xd3730f66; BYTE $0x13   \ // PSRLQ XMM3,0x13                  /* XTMP3 = W[-2] MY_ROR 19 {xBxA}          */
+    XORL c, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL c, R15 \                                                        /* y2 = (f^g)&e                            */
+    LONG $0xd2730f66; BYTE $0x11   \ // PSRLQ XMM2,0x11                  /* XTMP2 = W[-2] MY_ROR 17 {xBxA}          */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    XORL g, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    XORL e, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    LONG $0xd3ef0f66               \ // PXOR  XMM2,XMM3                  /*                                         */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL _xfer+56(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    LONG $0xef0f4466; BYTE $0xc2   \ // PXOR  XMM8,XMM2                  /* XTMP4 = s1 {xBxA}                       */
+    MOVL g, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, f \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL g, R15 \                                                        /* y2 = a                                  */
+    LONG $0x380f4566; WORD $0xc200 \ // PSHUFB XMM8,XMM10                /* XTMP4 = s1 {00BA}                       */
+    ORL  a, R13 \                                                        /* y0 = a|c                                */
+    ADDL f, b \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL a, R15 \                                                        /* y2 = a&c                                */
+    LONG $0xfe0f4166; BYTE $0xc0   \ // PADDD XMM0,XMM8                  /* XTMP0 = {..., ..., W[1], W[0]}          */
+    ANDL h, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, f \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    \                                                                    /*                                         */
+    \                                                                    /* compute high s1                         */
+    \                                                                    /*                                         */
+    LONG $0xd0700f66; BYTE $0x50   \ // PSHUFD XMM2,XMM0,0x50            /* XTMP2 = W[-2] {DDCC}                    */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, f \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    MOVL b, R13 \                                                        /* y0 = e                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    MOVL f, R14 \                                                        /* y1 = a                                  */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    XORL b, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    MOVL c, R15 \                                                        /* y2 = f                                  */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    MOVOU X2, X11 \
+    LONG $0x720f4166; WORD $0x0ad3 \ // PSRLD XMM11,0xa                  /* XTMP5 = W[-2] >> 10 {DDCC}              */
+    XORL f, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    XORL d, R15 \                                                        /* y2 = f^g                                */
+    MOVOU X2, X3 \
+    LONG $0xd3730f66; BYTE $0x13   \ // PSRLQ XMM3,0x13                  /* XTMP3 = W[-2] MY_ROR 19 {xDxC}          */
+    XORL b, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ANDL b, R15 \                                                        /* y2 = (f^g)&e                            */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    LONG $0xd2730f66; BYTE $0x11   \ // PSRLQ XMM2,0x11                  /* XTMP2 = W[-2] MY_ROR 17 {xDxC}          */
+    XORL f, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    XORL d, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    LONG $0xd3ef0f66               \ // PXOR  XMM2,XMM3                  /*                                         */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ADDL _xfer+60(FP), R15 \                                             /* y2 = k + w + S1 + CH                    */
+    LONG $0xef0f4466; BYTE $0xda   \ // PXOR  XMM11,XMM2                 /* XTMP5 = s1 {xDxC}                       */
+    MOVL f, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, e \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL f, R15 \                                                        /* y2 = a                                  */
+    LONG $0x380f4566; WORD $0xdc00 \ // PSHUFB XMM11,XMM12               /* XTMP5 = s1 {DC00}                       */
+    ORL  h, R13 \                                                        /* y0 = a|c                                */
+    ADDL e, a \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL h, R15 \                                                        /* y2 = a&c                                */
+    MOVOU X11, X4 \
+    LONG $0xe0fe0f66               \ // PADDD XMM4,XMM0                  /* X0 = {W[3], W[2], W[1], W[0]}           */
+    ANDL g, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, e \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, e \                                                        /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+    \                                                                    /* ROTATE_ARGS                             */
+    ROTATE_XS
+
+
+#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
+    MOVL e, R13 \                                                        /* y0 = e                                  */
+    ROLL $18, R13 \                                                      /* y0 = e >> (25-11)                       */
+    MOVL a, R14 \                                                        /* y1 = a                                  */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (25-11))                 */
+    ROLL $23, R14 \                                                      /* y1 = a >> (22-13)                       */
+    MOVL f, R15 \                                                        /* y2 = f                                  */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (22-13))                 */
+    ROLL $27, R13 \                                                      /* y0 = (e >> (11-6)) ^ (e >> (25-6))      */
+    XORL g, R15 \                                                        /* y2 = f^g                                */
+    XORL e, R13 \                                                        /* y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))  */
+    ROLL $21, R14 \                                                      /* y1 = (a >> (13-2)) ^ (a >> (22-2))      */
+    ANDL e, R15 \                                                        /* y2 = (f^g)&e                            */
+    XORL a, R14 \                                                        /* y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))  */
+    ROLL $26, R13 \                                                      /* y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)    */
+    XORL g, R15 \                                                        /* y2 = CH = ((f^g)&e)^g                   */
+    ADDL R13, R15 \                                                      /* y2 = S1 + CH                            */
+    ROLL $30, R14 \                                                      /* y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)    */
+    ADDL _xfer+offset(FP), R15 \                                         /* y2 = k + w + S1 + CH                    */
+    MOVL a, R13 \                                                        /* y0 = a                                  */
+    ADDL R15, h \                                                        /*  h = h + S1 + CH + k + w                */
+    MOVL a, R15 \                                                        /* y2 = a                                  */
+    ORL  c, R13 \                                                        /* y0 = a|c                                */
+    ADDL h, d \                                                          /*  d = d + h + S1 + CH + k + w            */
+    ANDL c, R15 \                                                        /* y2 = a&c                                */
+    ANDL b, R13 \                                                        /* y0 = (a|c)&b                            */
+    ADDL R14, h \                                                        /*  h = h + S1 + CH + k + w + S0           */
+    ORL  R15, R13 \                                                      /* y0 = MAJ = ((a|c)&b)|(a&c)              */
+    ADDL R13, h                                                          /*  h = h + S1 + CH + k + w + S0 + MAJ     */
+
+
+// func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
+TEXT ·blockSsse(SB), 7, $0
+
+	MOVQ  h+0(FP), SI // SI: &h
+    MOVQ  message+24(FP), R8    // &message
+    MOVQ  lenmessage+32(FP), R9 // length of message
+    CMPQ  R9, $0
+	JEQ   done_hash
+    ADDQ  R8, R9
+    MOVQ  R9, _inp_end+64(FP)   // store end of message
+
+    // Register definition
+    //  a -->  eax
+    //  b -->  ebx
+    //  c -->  ecx
+    //  d -->  r8d
+    //  e -->  edx
+    //  f -->  r9d
+    //  g --> r10d
+    //  h --> r11d
+    //
+    // y0 --> r13d
+    // y1 --> r14d
+    // y2 --> r15d
+
+    MOVL    (0*4)(SI),  AX        // a = H0
+    MOVL    (1*4)(SI),  BX        // b = H1
+    MOVL    (2*4)(SI),  CX        // c = H2
+    MOVL    (3*4)(SI),  R8        // d = H3
+    MOVL    (4*4)(SI),  DX        // e = H4
+    MOVL    (5*4)(SI),  R9        // f = H5
+    MOVL    (6*4)(SI), R10        // g = H6
+    MOVL    (7*4)(SI), R11        // h = H7
+
+	MOVOU bflipMask<>(SB), X13
+	MOVOU  shuf00BA<>(SB), X10    // shuffle xBxA -> 00BA
+	MOVOU  shufDC00<>(SB), X12    // shuffle xDxC -> DC00
+
+	MOVQ  message+24(FP), SI // SI: &message
+
+loop0:
+	LEAQ constants<>(SB), BP
+
+	// byte swap first 16 dwords
+    MOVOU 0*16(SI), X4
+    LONG $0x380f4166; WORD $0xe500 // PSHUFB XMM4, XMM13
+    MOVOU 1*16(SI), X5
+    LONG $0x380f4166; WORD $0xed00 // PSHUFB XMM5, XMM13
+    MOVOU 2*16(SI), X6
+    LONG $0x380f4166; WORD $0xf500 // PSHUFB XMM6, XMM13
+    MOVOU 3*16(SI), X7
+    LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
+
+    MOVQ SI, _inp+72(FP)
+    MOVD $0x3, DI
+
+    // Align
+    //  nop    WORD PTR [rax+rax*1+0x0]
+
+	// schedule 48 input dwords, by doing 3 rounds of 16 each
+loop1:
+    MOVOU X4, X9
+    LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP]   /* Add 1st constant to first part of message */
+    MOVOU X9, _xfer+48(FP)
+    FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
+
+    MOVOU X4, X9
+    LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP]   /* Add 2nd constant to message */
+    MOVOU X9, _xfer+48(FP)
+    FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
+
+    MOVOU X4, X9
+    LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP]   /* Add 3rd constant to message */
+    MOVOU X9, _xfer+48(FP)
+    FOUR_ROUNDS_AND_SCHED(AX, BX,  CX,  R8, DX, R9, R10, R11)
+
+    MOVOU X4, X9
+    LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP]   /* Add 4th constant to message */
+    MOVOU X9, _xfer+48(FP)
+    ADDQ $64, BP
+    FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX,  CX,  R8)
+
+	SUBQ    $1, DI
+	JNE     loop1
+
+    MOVD $0x2, DI
+loop2:
+    MOVOU X4, X9
+    LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP]   /* Add 1st constant to first part of message */
+    MOVOU X9, _xfer+48(FP)
+    DO_ROUND( AX,  BX,  CX,  R8,  DX,  R9, R10, R11, 48)
+    DO_ROUND(R11,  AX,  BX,  CX,  R8,  DX,  R9, R10, 52)
+    DO_ROUND(R10, R11,  AX,  BX,  CX,  R8,  DX,  R9, 56)
+    DO_ROUND( R9, R10, R11,  AX,  BX,  CX,  R8,  DX, 60)
+
+    MOVOU X5, X9
+    LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP]   /* Add 2nd constant to message */
+    MOVOU X9, _xfer+48(FP)
+    ADDQ $32, BP
+    DO_ROUND( DX,  R9, R10, R11,  AX,  BX,  CX,  R8, 48)
+    DO_ROUND( R8,  DX,  R9, R10, R11,  AX,  BX,  CX, 52)
+    DO_ROUND( CX,  R8,  DX,  R9, R10, R11,  AX,  BX, 56)
+    DO_ROUND( BX,  CX,  R8,  DX,  R9, R10, R11,  AX, 60)
+
+    MOVOU  X6, X4
+    MOVOU  X7, X5
+
+	SUBQ    $1, DI
+	JNE     loop2
+
+    MOVQ    h+0(FP), SI // SI: &h
+    ADDL    (0*4)(SI), AX     // H0 = a + H0
+    MOVL    AX, (0*4)(SI)
+    ADDL    (1*4)(SI), BX     // H1 = b + H1
+    MOVL    BX, (1*4)(SI)
+    ADDL    (2*4)(SI), CX     // H2 = c + H2
+    MOVL    CX, (2*4)(SI)
+    ADDL    (3*4)(SI), R8     // H3 = d + H3
+    MOVL    R8, (3*4)(SI)
+    ADDL    (4*4)(SI), DX     // H4 = e + H4
+    MOVL    DX, (4*4)(SI)
+    ADDL    (5*4)(SI), R9     // H5 = f + H5
+    MOVL    R9, (5*4)(SI)
+    ADDL    (6*4)(SI), R10    // H6 = g + H6
+    MOVL    R10, (6*4)(SI)
+    ADDL    (7*4)(SI), R11    // H7 = h + H7
+    MOVL    R11, (7*4)(SI)
+
+    MOVQ _inp+72(FP), SI
+	ADDQ $64, SI
+	CMPQ _inp_end+64(FP), SI
+	JNE  loop0
+
+done_hash:
+    RET
+
+// Constants table
+DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
+DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
+DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
+DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
+DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
+DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
+DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
+DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
+DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
+DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
+DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
+DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
+DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
+DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
+DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
+DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
+DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
+DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
+DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
+DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
+DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
+DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
+DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
+DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
+DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
+DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
+DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
+DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
+DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
+DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
+DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
+DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
+
+DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
+DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
+
+DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
+DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
+
+DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
+DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
+
+GLOBL constants<>(SB), 8, $256
+GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
+GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
+GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16

+ 24 - 0
vendor/github.com/minio/sha256-simd/sha256block_386.go

@@ -0,0 +1,24 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockArmGo(dig *digest, p []byte)  {}
+func blockAvx2Go(dig *digest, p []byte) {}
+func blockAvxGo(dig *digest, p []byte)  {}
+func blockSsseGo(dig *digest, p []byte) {}

+ 48 - 0
vendor/github.com/minio/sha256-simd/sha256block_amd64.go

@@ -0,0 +1,48 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockArmGo(dig *digest, p []byte) {}
+
+func blockAvxGo(dig *digest, p []byte) {
+
+	h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
+
+	blockAvx(h[:], p[:], 0, 0, 0, 0)
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
+}
+
+func blockAvx2Go(dig *digest, p []byte) {
+
+	h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
+
+	blockAvx2(h[:], p[:])
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
+}
+
+func blockSsseGo(dig *digest, p []byte) {
+
+	h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
+
+	blockSsse(h[:], p[:], 0, 0, 0, 0)
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
+}
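
A note on the wrapper shape repeated three times above: the digest state lives in a fixed-size array (dig.h[0] through dig.h[7]) while the assembly routines take a []uint32, so each call copies the state into a fresh slice and back out afterwards; the four trailing zeros passed to blockAvx and blockSsse fill parameters the assembly entry points declare but appear not to use. A self-contained toy showing that the copy-in/copy-out round trip preserves chaining across blocks (fakeBlock is a stand-in, not the real compression function):

	package main

	import "fmt"

	type digest struct{ h [8]uint32 }

	// fakeBlock mutates h in place, the way the assembly routines do.
	func fakeBlock(h []uint32, p []byte) {
		for i := range h {
			h[i] += uint32(len(p))
		}
	}

	func main() {
		d := digest{h: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}}
		for i := 0; i < 2; i++ { // two 64-byte "blocks"
			h := []uint32{d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7]}
			fakeBlock(h, make([]byte, 64))
			d.h[0], d.h[1], d.h[2], d.h[3], d.h[4], d.h[5], d.h[6], d.h[7] =
				h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
		}
		fmt.Println(d.h) // [129 130 131 132 133 134 135 136]
	}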

+ 24 - 0
vendor/github.com/minio/sha256-simd/sha256block_arm.go

@@ -0,0 +1,24 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockAvx2Go(dig *digest, p []byte) {}
+func blockAvxGo(dig *digest, p []byte)  {}
+func blockSsseGo(dig *digest, p []byte) {}
+func blockArmGo(dig *digest, p []byte)  {}

+ 36 - 0
vendor/github.com/minio/sha256-simd/sha256block_arm64.go

@@ -0,0 +1,36 @@
+//+build !noasm
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockAvx2Go(dig *digest, p []byte) {}
+func blockAvxGo(dig *digest, p []byte)  {}
+func blockSsseGo(dig *digest, p []byte) {}
+
+//go:noescape
+func blockArm(h []uint32, message []uint8)
+
+func blockArmGo(dig *digest, p []byte) {
+
+	h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
+
+	blockArm(h[:], p[:])
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4],
+		h[5], h[6], h[7]
+}
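
The //go:noescape declaration and the frame offsets in the .s file that follows line up: a Go slice argument occupies three machine words (data pointer, length, capacity), so for blockArm(h []uint32, message []uint8) the h data pointer sits at +0(FP) and the message data pointer at +24(FP) with its length at +32(FP), which is exactly what the MOVD instructions below read. A quick sanity check of the header size (a sketch, not part of the package):

	package main

	import (
		"fmt"
		"unsafe"
	)

	func main() {
		var h []uint32
		// On a 64-bit platform a slice header is three machine words:
		// pointer at +0, len at +8, cap at +16 -- 24 bytes, which is why
		// the second argument's data pointer lands at message+24(FP).
		fmt.Println(unsafe.Sizeof(h)) // 24
	}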

+ 193 - 0
vendor/github.com/minio/sha256-simd/sha256block_arm64.s

@@ -0,0 +1,193 @@
+//+build !noasm !appengine
+
+// ARM64 version of SHA256
+
+//
+// Minio Cloud Storage, (C) 2016 Minio, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// Based on the implementation found in https://github.com/jocover/sha256-armv8
+//
+// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
+// their Plan9 equivalents
+//
+
+TEXT ·blockArm(SB), 7, $0
+	MOVD h+0(FP), R0
+	MOVD message+24(FP), R1
+	MOVD lenmessage+32(FP), R2 // length of message
+	SUBS $64, R2
+	BMI  complete
+
+	// Load constants table pointer
+	MOVD $·constants(SB), R3
+
+	// Cache constants table in registers v16 - v31
+	WORD $0x4cdf2870 // ld1	{v16.4s-v19.4s}, [x3], #64
+	WORD $0x4cdf7800 // ld1	{v0.4s}, [x0], #16
+	WORD $0x4cdf2874 // ld1	{v20.4s-v23.4s}, [x3], #64
+
+	WORD $0x4c407801 // ld1	{v1.4s}, [x0]
+	WORD $0x4cdf2878 // ld1	{v24.4s-v27.4s}, [x3], #64
+	WORD $0xd1004000 // sub	x0, x0, #0x10
+	WORD $0x4cdf287c // ld1	{v28.4s-v31.4s}, [x3], #64
+
+loop:
+	// Main loop
+	WORD $0x4cdf2025 // ld1	{v5.16b-v8.16b}, [x1], #64
+	WORD $0x4ea01c02 // mov	v2.16b, v0.16b
+	WORD $0x4ea11c23 // mov	v3.16b, v1.16b
+	WORD $0x6e2008a5 // rev32	v5.16b, v5.16b
+	WORD $0x6e2008c6 // rev32	v6.16b, v6.16b
+	WORD $0x4eb084a9 // add	v9.4s, v5.4s, v16.4s
+	WORD $0x6e2008e7 // rev32	v7.16b, v7.16b
+	WORD $0x4eb184ca // add	v10.4s, v6.4s, v17.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e2828c5 // sha256su0	v5.4s, v6.4s
+	WORD $0x6e200908 // rev32	v8.16b, v8.16b
+	WORD $0x4eb284e9 // add	v9.4s, v7.4s, v18.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x5e2828e6 // sha256su0	v6.4s, v7.4s
+	WORD $0x5e0860e5 // sha256su1	v5.4s, v7.4s, v8.4s
+	WORD $0x4eb3850a // add	v10.4s, v8.4s, v19.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e282907 // sha256su0	v7.4s, v8.4s
+	WORD $0x5e056106 // sha256su1	v6.4s, v8.4s, v5.4s
+	WORD $0x4eb484a9 // add	v9.4s, v5.4s, v20.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x5e2828a8 // sha256su0	v8.4s, v5.4s
+	WORD $0x5e0660a7 // sha256su1	v7.4s, v5.4s, v6.4s
+	WORD $0x4eb584ca // add	v10.4s, v6.4s, v21.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e2828c5 // sha256su0	v5.4s, v6.4s
+	WORD $0x5e0760c8 // sha256su1	v8.4s, v6.4s, v7.4s
+	WORD $0x4eb684e9 // add	v9.4s, v7.4s, v22.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x5e2828e6 // sha256su0	v6.4s, v7.4s
+	WORD $0x5e0860e5 // sha256su1	v5.4s, v7.4s, v8.4s
+	WORD $0x4eb7850a // add	v10.4s, v8.4s, v23.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e282907 // sha256su0	v7.4s, v8.4s
+	WORD $0x5e056106 // sha256su1	v6.4s, v8.4s, v5.4s
+	WORD $0x4eb884a9 // add	v9.4s, v5.4s, v24.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x5e2828a8 // sha256su0	v8.4s, v5.4s
+	WORD $0x5e0660a7 // sha256su1	v7.4s, v5.4s, v6.4s
+	WORD $0x4eb984ca // add	v10.4s, v6.4s, v25.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e2828c5 // sha256su0	v5.4s, v6.4s
+	WORD $0x5e0760c8 // sha256su1	v8.4s, v6.4s, v7.4s
+	WORD $0x4eba84e9 // add	v9.4s, v7.4s, v26.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x5e2828e6 // sha256su0	v6.4s, v7.4s
+	WORD $0x5e0860e5 // sha256su1	v5.4s, v7.4s, v8.4s
+	WORD $0x4ebb850a // add	v10.4s, v8.4s, v27.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e282907 // sha256su0	v7.4s, v8.4s
+	WORD $0x5e056106 // sha256su1	v6.4s, v8.4s, v5.4s
+	WORD $0x4ebc84a9 // add	v9.4s, v5.4s, v28.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x5e2828a8 // sha256su0	v8.4s, v5.4s
+	WORD $0x5e0660a7 // sha256su1	v7.4s, v5.4s, v6.4s
+	WORD $0x4ebd84ca // add	v10.4s, v6.4s, v29.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x5e0760c8 // sha256su1	v8.4s, v6.4s, v7.4s
+	WORD $0x4ebe84e9 // add	v9.4s, v7.4s, v30.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x4ebf850a // add	v10.4s, v8.4s, v31.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e094062 // sha256h	q2, q3, v9.4s
+	WORD $0x5e095083 // sha256h2	q3, q4, v9.4s
+	WORD $0x4ea21c44 // mov	v4.16b, v2.16b
+	WORD $0x5e0a4062 // sha256h	q2, q3, v10.4s
+	WORD $0x5e0a5083 // sha256h2	q3, q4, v10.4s
+	WORD $0x4ea38421 // add	v1.4s, v1.4s, v3.4s
+	WORD $0x4ea28400 // add	v0.4s, v0.4s, v2.4s
+
+	SUBS $64, R2
+	BPL  loop
+
+	// Store result
+	WORD $0x4c00a800 // st1	{v0.4s, v1.4s}, [x0]
+
+complete:
+	RET
+
+// Constants table
+DATA ·constants+0x0(SB)/8, $0x71374491428a2f98
+DATA ·constants+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
+DATA ·constants+0x10(SB)/8, $0x59f111f13956c25b
+DATA ·constants+0x18(SB)/8, $0xab1c5ed5923f82a4
+DATA ·constants+0x20(SB)/8, $0x12835b01d807aa98
+DATA ·constants+0x28(SB)/8, $0x550c7dc3243185be
+DATA ·constants+0x30(SB)/8, $0x80deb1fe72be5d74
+DATA ·constants+0x38(SB)/8, $0xc19bf1749bdc06a7
+DATA ·constants+0x40(SB)/8, $0xefbe4786e49b69c1
+DATA ·constants+0x48(SB)/8, $0x240ca1cc0fc19dc6
+DATA ·constants+0x50(SB)/8, $0x4a7484aa2de92c6f
+DATA ·constants+0x58(SB)/8, $0x76f988da5cb0a9dc
+DATA ·constants+0x60(SB)/8, $0xa831c66d983e5152
+DATA ·constants+0x68(SB)/8, $0xbf597fc7b00327c8
+DATA ·constants+0x70(SB)/8, $0xd5a79147c6e00bf3
+DATA ·constants+0x78(SB)/8, $0x1429296706ca6351
+DATA ·constants+0x80(SB)/8, $0x2e1b213827b70a85
+DATA ·constants+0x88(SB)/8, $0x53380d134d2c6dfc
+DATA ·constants+0x90(SB)/8, $0x766a0abb650a7354
+DATA ·constants+0x98(SB)/8, $0x92722c8581c2c92e
+DATA ·constants+0xa0(SB)/8, $0xa81a664ba2bfe8a1
+DATA ·constants+0xa8(SB)/8, $0xc76c51a3c24b8b70
+DATA ·constants+0xb0(SB)/8, $0xd6990624d192e819
+DATA ·constants+0xb8(SB)/8, $0x106aa070f40e3585
+DATA ·constants+0xc0(SB)/8, $0x1e376c0819a4c116
+DATA ·constants+0xc8(SB)/8, $0x34b0bcb52748774c
+DATA ·constants+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
+DATA ·constants+0xd8(SB)/8, $0x682e6ff35b9cca4f
+DATA ·constants+0xe0(SB)/8, $0x78a5636f748f82ee
+DATA ·constants+0xe8(SB)/8, $0x8cc7020884c87814
+DATA ·constants+0xf0(SB)/8, $0xa4506ceb90befffa
+DATA ·constants+0xf8(SB)/8, $0xc67178f2bef9a3f7
+
+GLOBL ·constants(SB), 8, $256
+
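
The 256-byte table above packs the same 64 round constants as the _K slice in sha256block_noasm.go below, two per quadword in little-endian order (so $0x71374491428a2f98 is K[1] followed by K[0]). The values are not arbitrary: per the SHA-256 specification, each is the first 32 bits of the fractional part of the cube root of one of the first 64 primes. A short Go sketch reproducing the first four:

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		for _, p := range []float64{2, 3, 5, 7} {
			frac := math.Cbrt(p) - math.Floor(math.Cbrt(p))
			fmt.Printf("%08x\n", uint32(frac*(1<<32)))
		}
		// Prints 428a2f98, 71374491, b5c0fbcf, e9b5dba5 -- the two
		// quadwords at ·constants+0x0(SB) and +0x8(SB) above.
	}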

+ 136 - 0
vendor/github.com/minio/sha256-simd/sha256block_noasm.go

@@ -0,0 +1,136 @@
+//+build !arm64 !amd64 noasm appengine
+
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockGeneric(dig *digest, p []byte) {
+	var w [64]uint32
+	h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
+	for len(p) >= chunk {
+		// Can interlace the computation of w with the
+		// rounds below if needed for speed.
+		for i := 0; i < 16; i++ {
+			j := i * 4
+			w[i] = uint32(p[j])<<24 | uint32(p[j+1])<<16 | uint32(p[j+2])<<8 | uint32(p[j+3])
+		}
+		for i := 16; i < 64; i++ {
+			v1 := w[i-2]
+			t1 := (v1>>17 | v1<<(32-17)) ^ (v1>>19 | v1<<(32-19)) ^ (v1 >> 10)
+			v2 := w[i-15]
+			t2 := (v2>>7 | v2<<(32-7)) ^ (v2>>18 | v2<<(32-18)) ^ (v2 >> 3)
+			w[i] = t1 + w[i-7] + t2 + w[i-16]
+		}
+
+		a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7
+
+		for i := 0; i < 64; i++ {
+			t1 := h + ((e>>6 | e<<(32-6)) ^ (e>>11 | e<<(32-11)) ^ (e>>25 | e<<(32-25))) + ((e & f) ^ (^e & g)) + _K[i] + w[i]
+
+			t2 := ((a>>2 | a<<(32-2)) ^ (a>>13 | a<<(32-13)) ^ (a>>22 | a<<(32-22))) + ((a & b) ^ (a & c) ^ (b & c))
+
+			h = g
+			g = f
+			f = e
+			e = d + t1
+			d = c
+			c = b
+			b = a
+			a = t1 + t2
+		}
+
+		h0 += a
+		h1 += b
+		h2 += c
+		h3 += d
+		h4 += e
+		h5 += f
+		h6 += g
+		h7 += h
+
+		p = p[chunk:]
+	}
+
+	dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h0, h1, h2, h3, h4, h5, h6, h7
+}
+
+var _K = []uint32{
+	0x428a2f98,
+	0x71374491,
+	0xb5c0fbcf,
+	0xe9b5dba5,
+	0x3956c25b,
+	0x59f111f1,
+	0x923f82a4,
+	0xab1c5ed5,
+	0xd807aa98,
+	0x12835b01,
+	0x243185be,
+	0x550c7dc3,
+	0x72be5d74,
+	0x80deb1fe,
+	0x9bdc06a7,
+	0xc19bf174,
+	0xe49b69c1,
+	0xefbe4786,
+	0x0fc19dc6,
+	0x240ca1cc,
+	0x2de92c6f,
+	0x4a7484aa,
+	0x5cb0a9dc,
+	0x76f988da,
+	0x983e5152,
+	0xa831c66d,
+	0xb00327c8,
+	0xbf597fc7,
+	0xc6e00bf3,
+	0xd5a79147,
+	0x06ca6351,
+	0x14292967,
+	0x27b70a85,
+	0x2e1b2138,
+	0x4d2c6dfc,
+	0x53380d13,
+	0x650a7354,
+	0x766a0abb,
+	0x81c2c92e,
+	0x92722c85,
+	0xa2bfe8a1,
+	0xa81a664b,
+	0xc24b8b70,
+	0xc76c51a3,
+	0xd192e819,
+	0xd6990624,
+	0xf40e3585,
+	0x106aa070,
+	0x19a4c116,
+	0x1e376c08,
+	0x2748774c,
+	0x34b0bcb5,
+	0x391c0cb3,
+	0x4ed8aa4a,
+	0x5b9cca4f,
+	0x682e6ff3,
+	0x748f82ee,
+	0x78a5636f,
+	0x84c87814,
+	0x8cc70208,
+	0x90befffa,
+	0xa4506ceb,
+	0xbef9a3f7,
+	0xc67178f2,
+}
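
Taken together these files shape the package as a drop-in for the standard library's crypto/sha256 (same package name, same exported surface), so call sites only need to change their import path. A usage sketch, assuming the familiar Sum256/New API that the swap implies:

	package main

	import (
		"fmt"

		"github.com/minio/sha256-simd" // package name is still sha256
	)

	func main() {
		// One-shot...
		sum := sha256.Sum256([]byte("hello world"))
		fmt.Printf("%x\n", sum)

		// ...or streaming via hash.Hash.
		h := sha256.New()
		h.Write([]byte("hello world"))
		fmt.Printf("%x\n", h.Sum(nil))
		// Both print
		// b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9
	}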

+ 22 - 0
vendor/github.com/minio/sha256-simd/sha256block_ppc64.go

@@ -0,0 +1,22 @@
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockAvx2Go(dig *digest, p []byte) {}
+func blockAvxGo(dig *digest, p []byte)  {}
+func blockSsseGo(dig *digest, p []byte) {}
+func blockArmGo(dig *digest, p []byte)  {}

+ 22 - 0
vendor/github.com/minio/sha256-simd/sha256block_ppc64le.go

@@ -0,0 +1,22 @@
+/*
+ * Minio Cloud Storage, (C) 2016 Minio, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package sha256
+
+func blockAvx2Go(dig *digest, p []byte) {}
+func blockAvxGo(dig *digest, p []byte)  {}
+func blockSsseGo(dig *digest, p []byte) {}
+func blockArmGo(dig *digest, p []byte)  {}

+ 7 - 15
vendor/manifest

@@ -107,14 +107,6 @@
 			"branch": "master",
 			"notests": true
 		},
-		{
-			"importpath": "github.com/gogo/protobuf/proto",
-			"repository": "https://github.com/gogo/protobuf",
-			"revision": "7883e1468d48d969e1c3ce4bcde89b6a7dd4adc4",
-			"branch": "master",
-			"path": "/proto",
-			"notests": true
-		},
 		{
 			"importpath": "github.com/golang/groupcache/lru",
 			"repository": "https://github.com/golang/groupcache",
@@ -155,6 +147,13 @@
 			"branch": "master",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/minio/sha256-simd",
+			"repository": "https://github.com/minio/sha256-simd",
+			"revision": "672e7bc9f3482375df73741cf57a157fe187ec26",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/onsi/ginkgo",
 			"repository": "https://github.com/onsi/ginkgo",
@@ -199,13 +198,6 @@
 			"revision": "ab8b5dcf1042e818ab68e770d465112a899b668e",
 			"branch": "master"
 		},
-		{
-			"importpath": "github.com/syndtr/goleveldb/leveldb",
-			"repository": "https://github.com/syndtr/goleveldb",
-			"revision": "ad0d8b2ab58a55ed5c58073aa46451d5e1ca1280",
-			"branch": "master",
-			"path": "/leveldb"
-		},
 		{
 			"importpath": "github.com/thejerf/suture",
 			"repository": "https://github.com/thejerf/suture",