
lib/connections: Use our own fork of kcp (fixes #4063)

This updates kcp and uses our own fork which:

1. Keys sessions not just by remote address, but by remote address +
   conversation ID.
2. Allows not closing connections that were passed directly to the library.
3. Resets the cache key if the session gets terminated.

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4339
LGTM: calmh
Audrius Butkevicius 8 years ago
parent
commit
cbcc3ea132
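
The three fork behaviours listed above can be summed up in a short sketch. This is not code from the PR itself; it distils the sess.go hunks further down (the `%s/%d` key format, and the `closeConn` flag threaded through `newUDPSession`/`NewConn`) into a self-contained illustration:

```go
package main

import (
	"fmt"
	"net"
)

// sessionKey mirrors the keying scheme used by the forked kcp-go's
// Listener (see the sess.go hunk below): sessions are indexed by
// remote address plus KCP conversation ID, so two conversations from
// the same UDP endpoint no longer collide.
func sessionKey(remote net.Addr, conv uint32) string {
	return fmt.Sprintf("%s/%d", remote.String(), conv)
}

// session is a stand-in for kcp.UDPSession, reduced to the fields that
// matter for this commit: the underlying packet connection and the new
// closeConn flag that records whether Close() owns it.
type session struct {
	conn      net.PacketConn
	closeConn bool // false when the conn was passed in by the caller
}

// Close tears down the underlying conn only when the session created it
// itself (closeConn == true), matching the change to UDPSession.Close.
func (s *session) Close() error {
	if s.closeConn {
		return s.conn.Close()
	}
	return nil
}

func main() {
	addr := &net.UDPAddr{IP: net.IPv4(192, 0, 2, 1), Port: 22020}
	// Two conversations from the same remote now get distinct keys.
	fmt.Println(sessionKey(addr, 1), sessionKey(addr, 2))
}
```

In the diff below, callers such as kcp_dial.go pass `false` as the new final argument to `kcp.NewConn` when they hand the library a connection they still own, so closing the KCP session no longer closes the shared socket.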

+ 2 - 2
lib/connections/kcp_dial.go

@@ -11,9 +11,9 @@ import (
 	"net/url"
 	"time"
 
+	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/syncthing/syncthing/lib/config"
 	"github.com/syncthing/syncthing/lib/protocol"
-	"github.com/xtaci/kcp-go"
 	"github.com/xtaci/smux"
 )
 
@@ -38,7 +38,7 @@ func (d *kcpDialer) Dial(id protocol.DeviceID, uri *url.URL) (internalConn, erro
 	// Try to dial via an existing listening connection
 	// giving better chances punching through NAT.
 	if f := getDialingFilter(); f != nil {
-		conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{}))
+		conn, err = kcp.NewConn(uri.Host, nil, 0, 0, f.NewConn(kcpConversationFilterPriority, &kcpConversationFilter{}), false)
 		l.Debugf("dial %s using existing conn on %s", uri.String(), conn.LocalAddr())
 	} else {
 		conn, err = kcp.DialWithOptions(uri.Host, nil, 0, 0)

+ 1 - 1
lib/connections/kcp_listen.go

@@ -14,11 +14,11 @@ import (
 	"sync"
 	"time"
 
+	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/AudriusButkevicius/pfilter"
 	"github.com/ccding/go-stun/stun"
 	"github.com/syncthing/syncthing/lib/config"
 	"github.com/syncthing/syncthing/lib/nat"
-	"github.com/xtaci/kcp-go"
 	"github.com/xtaci/smux"
 )
 

+ 1 - 1
lib/protocol/benchmark_test.go

@@ -8,8 +8,8 @@ import (
 	"net"
 	"testing"
 
+	"github.com/AudriusButkevicius/kcp-go"
 	"github.com/syncthing/syncthing/lib/dialer"
-	"github.com/xtaci/kcp-go"
 )
 
 func BenchmarkRequestsRawTCP(b *testing.B) {

+ 0 - 0
vendor/github.com/xtaci/kcp-go/LICENSE → vendor/github.com/AudriusButkevicius/kcp-go/LICENSE


+ 8 - 6
vendor/github.com/xtaci/kcp-go/crypt.go → vendor/github.com/AudriusButkevicius/kcp-go/crypt.go

@@ -6,6 +6,8 @@ import (
 	"crypto/des"
 	"crypto/sha1"
 
+	"github.com/templexxx/xor"
+
 	"golang.org/x/crypto/blowfish"
 	"golang.org/x/crypto/cast5"
 	"golang.org/x/crypto/pbkdf2"
@@ -218,8 +220,8 @@ func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }
 
-func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
-func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
+func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) }
+func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xor.Bytes(dst, src, c.xortbl) }
 
 type noneBlockCrypt struct{}
 
@@ -239,11 +241,11 @@ func encrypt(block cipher.Block, dst, src, buf []byte) {
 	n := len(src) / blocksize
 	base := 0
 	for i := 0; i < n; i++ {
-		xorWords(dst[base:], src[base:], tbl)
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
 		block.Encrypt(tbl, dst[base:])
 		base += blocksize
 	}
-	xorBytes(dst[base:], src[base:], tbl)
+	xor.BytesSrc0(dst[base:], src[base:], tbl)
 }
 
 func decrypt(block cipher.Block, dst, src, buf []byte) {
@@ -255,9 +257,9 @@ func decrypt(block cipher.Block, dst, src, buf []byte) {
 	base := 0
 	for i := 0; i < n; i++ {
 		block.Encrypt(next, src[base:])
-		xorWords(dst[base:], src[base:], tbl)
+		xor.BytesSrc1(dst[base:], src[base:], tbl)
 		tbl, next = next, tbl
 		base += blocksize
 	}
-	xorBytes(dst[base:], src[base:], tbl)
+	xor.BytesSrc0(dst[base:], src[base:], tbl)
 }

+ 17 - 17
vendor/github.com/xtaci/kcp-go/fec.go → vendor/github.com/AudriusButkevicius/kcp-go/fec.go

@@ -22,8 +22,8 @@ type (
 		data  []byte
 	}
 
-	// FECDecoder for decoding incoming packets
-	FECDecoder struct {
+	// fecDecoder for decoding incoming packets
+	fecDecoder struct {
 		rxlimit      int // queue size limit
 		dataShards   int
 		parityShards int
@@ -39,7 +39,7 @@ type (
 	}
 )
 
-func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
+func newFECDecoder(rxlimit, dataShards, parityShards int) *fecDecoder {
 	if dataShards <= 0 || parityShards <= 0 {
 		return nil
 	}
@@ -47,7 +47,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
 		return nil
 	}
 
-	fec := new(FECDecoder)
+	fec := new(fecDecoder)
 	fec.rxlimit = rxlimit
 	fec.dataShards = dataShards
 	fec.parityShards = parityShards
@@ -63,7 +63,7 @@ func newFECDecoder(rxlimit, dataShards, parityShards int) *FECDecoder {
 }
 
 // decodeBytes a fec packet
-func (dec *FECDecoder) decodeBytes(data []byte) fecPacket {
+func (dec *fecDecoder) decodeBytes(data []byte) fecPacket {
 	var pkt fecPacket
 	pkt.seqid = binary.LittleEndian.Uint32(data)
 	pkt.flag = binary.LittleEndian.Uint16(data[4:])
@@ -74,8 +74,8 @@ func (dec *FECDecoder) decodeBytes(data []byte) fecPacket {
 	return pkt
 }
 
-// Decode a fec packet
-func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) {
+// decode a fec packet
+func (dec *fecDecoder) decode(pkt fecPacket) (recovered [][]byte) {
 	// insertion
 	n := len(dec.rx) - 1
 	insertIdx := 0
@@ -179,7 +179,7 @@ func (dec *FECDecoder) Decode(pkt fecPacket) (recovered [][]byte) {
 }
 
 // free a range of fecPacket, and zero for GC recycling
-func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
+func (dec *fecDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
 	for i := first; i < first+n; i++ { // free
 		xmitBuf.Put(q[i].data)
 	}
@@ -191,8 +191,8 @@ func (dec *FECDecoder) freeRange(first, n int, q []fecPacket) []fecPacket {
 }
 
 type (
-	// FECEncoder for encoding outgoing packets
-	FECEncoder struct {
+	// fecEncoder for encoding outgoing packets
+	fecEncoder struct {
 		dataShards   int
 		parityShards int
 		shardSize    int
@@ -214,11 +214,11 @@ type (
 	}
 )
 
-func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder {
+func newFECEncoder(dataShards, parityShards, offset int) *fecEncoder {
 	if dataShards <= 0 || parityShards <= 0 {
 		return nil
 	}
-	fec := new(FECEncoder)
+	fec := new(fecEncoder)
 	fec.dataShards = dataShards
 	fec.parityShards = parityShards
 	fec.shardSize = dataShards + parityShards
@@ -241,9 +241,9 @@ func newFECEncoder(dataShards, parityShards, offset int) *FECEncoder {
 	return fec
 }
 
-// Encode the packet, output parity shards if we have enough datashards
-// the content of returned parityshards will change in next Encode
-func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) {
+// encode the packet, output parity shards if we have enough datashards
+// the content of returned parityshards will change in next encode
+func (enc *fecEncoder) encode(b []byte) (ps [][]byte) {
 	enc.markData(b[enc.headerOffset:])
 	binary.LittleEndian.PutUint16(b[enc.payloadOffset:], uint16(len(b[enc.payloadOffset:])))
 
@@ -290,13 +290,13 @@ func (enc *FECEncoder) Encode(b []byte) (ps [][]byte) {
 	return
 }
 
-func (enc *FECEncoder) markData(data []byte) {
+func (enc *fecEncoder) markData(data []byte) {
 	binary.LittleEndian.PutUint32(data, enc.next)
 	binary.LittleEndian.PutUint16(data[4:], typeData)
 	enc.next++
 }
 
-func (enc *FECEncoder) markFEC(data []byte) {
+func (enc *fecEncoder) markFEC(data []byte) {
 	binary.LittleEndian.PutUint32(data, enc.next)
 	binary.LittleEndian.PutUint16(data[4:], typeFEC)
 	enc.next = (enc.next + 1) % enc.paws

+ 22 - 22
vendor/github.com/xtaci/kcp-go/kcp.go → vendor/github.com/AudriusButkevicius/kcp-go/kcp.go

@@ -30,8 +30,8 @@ const (
 	IKCP_PROBE_LIMIT = 120000 // up to 120 secs to probe window
 )
 
-// Output is a closure which captures conn and calls conn.Write
-type Output func(buf []byte, size int)
+// output_callback is a prototype which ought capture conn and call conn.Write
+type output_callback func(buf []byte, size int)
 
 /* encode 8 bits unsigned int */
 func ikcp_encode8u(p []byte, c byte) []byte {
@@ -91,8 +91,8 @@ func _itimediff(later, earlier uint32) int32 {
 	return (int32)(later - earlier)
 }
 
-// Segment defines a KCP segment
-type Segment struct {
+// segment defines a KCP segment
+type segment struct {
 	conv     uint32
 	cmd      uint8
 	frg      uint8
@@ -108,11 +108,11 @@ type Segment struct {
 }
 
 // encode a segment into buffer
-func (seg *Segment) encode(ptr []byte) []byte {
+func (seg *segment) encode(ptr []byte) []byte {
 	ptr = ikcp_encode32u(ptr, seg.conv)
-	ptr = ikcp_encode8u(ptr, uint8(seg.cmd))
-	ptr = ikcp_encode8u(ptr, uint8(seg.frg))
-	ptr = ikcp_encode16u(ptr, uint16(seg.wnd))
+	ptr = ikcp_encode8u(ptr, seg.cmd)
+	ptr = ikcp_encode8u(ptr, seg.frg)
+	ptr = ikcp_encode16u(ptr, seg.wnd)
 	ptr = ikcp_encode32u(ptr, seg.ts)
 	ptr = ikcp_encode32u(ptr, seg.sn)
 	ptr = ikcp_encode32u(ptr, seg.una)
@@ -137,15 +137,15 @@ type KCP struct {
 	fastresend     int32
 	nocwnd, stream int32
 
-	snd_queue []Segment
-	rcv_queue []Segment
-	snd_buf   []Segment
-	rcv_buf   []Segment
+	snd_queue []segment
+	rcv_queue []segment
+	snd_buf   []segment
+	rcv_buf   []segment
 
 	acklist []ackItem
 
 	buffer []byte
-	output Output
+	output output_callback
 }
 
 type ackItem struct {
@@ -155,7 +155,7 @@ type ackItem struct {
 
 // NewKCP create a new kcp control object, 'conv' must equal in two endpoint
 // from the same connection.
-func NewKCP(conv uint32, output Output) *KCP {
+func NewKCP(conv uint32, output output_callback) *KCP {
 	kcp := new(KCP)
 	kcp.conv = conv
 	kcp.snd_wnd = IKCP_WND_SND
@@ -175,13 +175,13 @@ func NewKCP(conv uint32, output Output) *KCP {
 }
 
 // newSegment creates a KCP segment
-func (kcp *KCP) newSegment(size int) (seg Segment) {
+func (kcp *KCP) newSegment(size int) (seg segment) {
 	seg.data = xmitBuf.Get().([]byte)[:size]
 	return
 }
 
 // delSegment recycles a KCP segment
-func (kcp *KCP) delSegment(seg Segment) {
+func (kcp *KCP) delSegment(seg segment) {
 	xmitBuf.Put(seg.data)
 }
 
@@ -384,7 +384,7 @@ func (kcp *KCP) parse_ack(sn uint32) {
 		if sn == seg.sn {
 			kcp.delSegment(*seg)
 			copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
-			kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{}
+			kcp.snd_buf[len(kcp.snd_buf)-1] = segment{}
 			kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1]
 			break
 		}
@@ -430,7 +430,7 @@ func (kcp *KCP) ack_push(sn, ts uint32) {
 	kcp.acklist = append(kcp.acklist, ackItem{sn, ts})
 }
 
-func (kcp *KCP) parse_data(newseg Segment) {
+func (kcp *KCP) parse_data(newseg segment) {
 	sn := newseg.sn
 	if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
 		_itimediff(sn, kcp.rcv_nxt) < 0 {
@@ -458,7 +458,7 @@ func (kcp *KCP) parse_data(newseg Segment) {
 		if insert_idx == n+1 {
 			kcp.rcv_buf = append(kcp.rcv_buf, newseg)
 		} else {
-			kcp.rcv_buf = append(kcp.rcv_buf, Segment{})
+			kcp.rcv_buf = append(kcp.rcv_buf, segment{})
 			copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
 			kcp.rcv_buf[insert_idx] = newseg
 		}
@@ -625,7 +625,7 @@ func (kcp *KCP) wnd_unused() uint16 {
 
 // flush pending data
 func (kcp *KCP) flush(ackOnly bool) {
-	var seg Segment
+	var seg segment
 	seg.conv = kcp.conv
 	seg.cmd = IKCP_CMD_ACK
 	seg.wnd = kcp.wnd_unused()
@@ -989,10 +989,10 @@ func (kcp *KCP) WaitSnd() int {
 }
 
 // remove front n elements from queue
-func (kcp *KCP) remove_front(q []Segment, n int) []Segment {
+func (kcp *KCP) remove_front(q []segment, n int) []segment {
 	newn := copy(q, q[n:])
 	for i := newn; i < len(q); i++ {
-		q[i] = Segment{} // manual set nil for GC
+		q[i] = segment{} // manual set nil for GC
 	}
 	return q[:newn]
 }

+ 131 - 115
vendor/github.com/xtaci/kcp-go/sess.go → vendor/github.com/AudriusButkevicius/kcp-go/sess.go

@@ -3,6 +3,7 @@ package kcp
 import (
 	"crypto/rand"
 	"encoding/binary"
+	"fmt"
 	"hash/crc32"
 	"io"
 	"net"
@@ -54,9 +55,6 @@ var (
 	// global packet buffer
 	// shared among sending/receiving/FEC
 	xmitBuf sync.Pool
-
-	// monotonic session id
-	sid uint32
 )
 
 func init() {
@@ -68,11 +66,12 @@ func init() {
 type (
 	// UDPSession defines a KCP session implemented by UDP
 	UDPSession struct {
-		sid   uint32         // session id(monotonic)
-		conn  net.PacketConn // the underlying packet connection
-		kcp   *KCP           // KCP ARQ protocol
-		l     *Listener      // point to the Listener if it's accepted by Listener
-		block BlockCrypt     // block encryption
+		updaterIdx int            // record slice index in updater
+		conn       net.PacketConn // the underlying packet connection
+		closeConn  bool           // Should we close the underlying conn once UDPSession is closed.
+		kcp        *KCP           // KCP ARQ protocol
+		l          *Listener      // point to the Listener if it's accepted by Listener
+		block      BlockCrypt     // block encryption
 
 		// kcp receiving is based on packets
 		// recvbuf turns packets into stream
@@ -82,22 +81,23 @@ type (
 		ext []byte
 
 		// FEC
-		fecDecoder *FECDecoder
-		fecEncoder *FECEncoder
+		fecDecoder *fecDecoder
+		fecEncoder *fecEncoder
 
 		// settings
-		remote         net.Addr      // remote peer address
-		rd             time.Time     // read deadline
-		wd             time.Time     // write deadline
-		headerSize     int           // the overall header size added before KCP frame
-		updateInterval time.Duration // interval in seconds to call kcp.flush()
-		ackNoDelay     bool          // send ack immediately for each incoming packet
-		writeDelay     bool          // delay kcp.flush() for Write() for bulk transfer
+		remote     net.Addr  // remote peer address
+		rd         time.Time // read deadline
+		wd         time.Time // write deadline
+		headerSize int       // the overall header size added before KCP frame
+		ackNoDelay bool      // send ack immediately for each incoming packet
+		writeDelay bool      // delay kcp.flush() for Write() for bulk transfer
+		dup        int       // duplicate udp packets
 
 		// notifications
 		die          chan struct{} // notify session has Closed
 		chReadEvent  chan struct{} // notify Read() can be called without blocking
 		chWriteEvent chan struct{} // notify Write() can be called without blocking
+		chErrorEvent chan error    // notify Read() have an error
 
 		isClosed bool // flag the session has Closed
 		mu       sync.Mutex
@@ -113,14 +113,15 @@ type (
 )
 
 // newUDPSession create a new udp session for client or server
-func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession {
+func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt, closeConn bool) *UDPSession {
 	sess := new(UDPSession)
-	sess.sid = atomic.AddUint32(&sid, 1)
 	sess.die = make(chan struct{})
 	sess.chReadEvent = make(chan struct{}, 1)
 	sess.chWriteEvent = make(chan struct{}, 1)
+	sess.chErrorEvent = make(chan error, 1)
 	sess.remote = remote
 	sess.conn = conn
+	sess.closeConn = closeConn
 	sess.l = l
 	sess.block = block
 	sess.recvbuf = make([]byte, mtuLimit)
@@ -232,6 +233,11 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
 		case <-s.chReadEvent:
 		case <-c:
 		case <-s.die:
+		case err = <-s.chErrorEvent:
+			if timeout != nil {
+				timeout.Stop()
+			}
+			return n, err
 		}
 
 		if timeout != nil {
@@ -299,9 +305,11 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 
 // Close closes the connection.
 func (s *UDPSession) Close() error {
+	// remove this session from updater & listener(if necessary)
 	updater.removeSession(s)
 	if s.l != nil { // notify listener
-		s.l.closeSession(s.remote)
+		key := fmt.Sprintf("%s/%d", s.remote.String(), s.kcp.conv)
+		s.l.closeSession(key)
 	}
 
 	s.mu.Lock()
@@ -312,7 +320,7 @@ func (s *UDPSession) Close() error {
 	close(s.die)
 	s.isClosed = true
 	atomic.AddUint64(&DefaultSnmp.CurrEstab, ^uint64(0))
-	if s.l == nil { // client socket close
+	if s.l == nil && s.closeConn { // client socket close
 		return s.conn.Close()
 	}
 	return nil
@@ -393,12 +401,19 @@ func (s *UDPSession) SetACKNoDelay(nodelay bool) {
 	s.ackNoDelay = nodelay
 }
 
+// SetDUP duplicates udp packets for kcp output, for testing purpose only
+func (s *UDPSession) SetDUP(dup int) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.dup = dup
+}
+
 // SetNoDelay calls nodelay() of kcp
+// https://github.com/skywind3000/kcp/blob/master/README.en.md#protocol-configuration
 func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.kcp.NoDelay(nodelay, interval, resend, nc)
-	s.updateInterval = time.Duration(interval) * time.Millisecond
 }
 
 // SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener
@@ -406,8 +421,8 @@ func (s *UDPSession) SetDSCP(dscp int) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	if s.l == nil {
-		if nc, ok := s.conn.(*ConnectedUDPConn); ok {
-			return ipv4.NewConn(nc.Conn).SetTOS(dscp << 2)
+		if nc, ok := s.conn.(*connectedUDPConn); ok {
+			return ipv4.NewConn(nc.UDPConn).SetTOS(dscp << 2)
 		} else if nc, ok := s.conn.(net.Conn); ok {
 			return ipv4.NewConn(nc).SetTOS(dscp << 2)
 		}
@@ -449,51 +464,47 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error {
 func (s *UDPSession) output(buf []byte) {
 	var ecc [][]byte
 
-	// extend buf's header space
+	// 0. extend buf's header space(if necessary)
 	ext := buf
 	if s.headerSize > 0 {
 		ext = s.ext[:s.headerSize+len(buf)]
 		copy(ext[s.headerSize:], buf)
 	}
 
-	// FEC stage
+	// 1. FEC encoding
 	if s.fecEncoder != nil {
-		ecc = s.fecEncoder.Encode(ext)
+		ecc = s.fecEncoder.encode(ext)
 	}
 
-	// encryption stage
+	// 2&3. crc32 & encryption
 	if s.block != nil {
 		io.ReadFull(rand.Reader, ext[:nonceSize])
 		checksum := crc32.ChecksumIEEE(ext[cryptHeaderSize:])
 		binary.LittleEndian.PutUint32(ext[nonceSize:], checksum)
 		s.block.Encrypt(ext, ext)
 
-		if ecc != nil {
-			for k := range ecc {
-				io.ReadFull(rand.Reader, ecc[k][:nonceSize])
-				checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:])
-				binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum)
-				s.block.Encrypt(ecc[k], ecc[k])
-			}
+		for k := range ecc {
+			io.ReadFull(rand.Reader, ecc[k][:nonceSize])
+			checksum := crc32.ChecksumIEEE(ecc[k][cryptHeaderSize:])
+			binary.LittleEndian.PutUint32(ecc[k][nonceSize:], checksum)
+			s.block.Encrypt(ecc[k], ecc[k])
 		}
 	}
 
-	// WriteTo kernel
+	// 4. WriteTo kernel
 	nbytes := 0
 	npkts := 0
-	// if mrand.Intn(100) < 50 {
-	if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
-		nbytes += n
-		npkts++
+	for i := 0; i < s.dup+1; i++ {
+		if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
+			nbytes += n
+			npkts++
+		}
 	}
-	// }
 
-	if ecc != nil {
-		for k := range ecc {
-			if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
-				nbytes += n
-				npkts++
-			}
+	for k := range ecc {
+		if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
+			nbytes += n
+			npkts++
 		}
 	}
 	atomic.AddUint64(&DefaultSnmp.OutPkts, uint64(npkts))
@@ -507,15 +518,13 @@ func (s *UDPSession) update() (interval time.Duration) {
 	if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
 		s.notifyWriteEvent()
 	}
-	interval = s.updateInterval
+	interval = time.Duration(s.kcp.interval) * time.Millisecond
 	s.mu.Unlock()
 	return
 }
 
 // GetConv gets conversation id of a session
-func (s *UDPSession) GetConv() uint32 {
-	return s.kcp.conv
-}
+func (s *UDPSession) GetConv() uint32 { return s.kcp.conv }
 
 func (s *UDPSession) notifyReadEvent() {
 	select {
@@ -548,22 +557,21 @@ func (s *UDPSession) kcpInput(data []byte) {
 				fecParityShards++
 			}
 
-			if recovers := s.fecDecoder.Decode(f); recovers != nil {
-				for _, r := range recovers {
-					if len(r) >= 2 { // must be larger than 2bytes
-						sz := binary.LittleEndian.Uint16(r)
-						if int(sz) <= len(r) && sz >= 2 {
-							if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 {
-								fecRecovered++
-							} else {
-								kcpInErrors++
-							}
+			recovers := s.fecDecoder.decode(f)
+			for _, r := range recovers {
+				if len(r) >= 2 { // must be larger than 2bytes
+					sz := binary.LittleEndian.Uint16(r)
+					if int(sz) <= len(r) && sz >= 2 {
+						if ret := s.kcp.Input(r[2:sz], false, s.ackNoDelay); ret == 0 {
+							fecRecovered++
 						} else {
-							fecErrs++
+							kcpInErrors++
 						}
 					} else {
 						fecErrs++
 					}
+				} else {
+					fecErrs++
 				}
 			}
 		}
@@ -601,7 +609,7 @@ func (s *UDPSession) kcpInput(data []byte) {
 	}
 }
 
-func (s *UDPSession) receiver(ch chan []byte) {
+func (s *UDPSession) receiver(ch chan<- []byte) {
 	for {
 		data := xmitBuf.Get().([]byte)[:mtuLimit]
 		if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD {
@@ -611,6 +619,7 @@ func (s *UDPSession) receiver(ch chan []byte) {
 				return
 			}
 		} else if err != nil {
+			s.chErrorEvent <- err
 			return
 		} else {
 			atomic.AddUint64(&DefaultSnmp.InErrs, 1)
@@ -658,12 +667,12 @@ type (
 		block        BlockCrypt     // block encryption
 		dataShards   int            // FEC data shard
 		parityShards int            // FEC parity shard
-		fecDecoder   *FECDecoder    // FEC mock initialization
+		fecDecoder   *fecDecoder    // FEC mock initialization
 		conn         net.PacketConn // the underlying packet connection
 
 		sessions        map[string]*UDPSession // all sessions accepted by this Listener
 		chAccepts       chan *UDPSession       // Listen() backlog
-		chSessionClosed chan net.Addr          // session close queue
+		chSessionClosed chan string            // session close queue
 		headerSize      int                    // the overall header size added before KCP frame
 		die             chan struct{}          // notify the listener has closed
 		rd              atomic.Value           // read deadline for Accept()
@@ -679,6 +688,10 @@ type (
 
 // monitor incoming data for all connections of server
 func (l *Listener) monitor() {
+	// cache last session
+	var lastKey string
+	var lastSession *UDPSession
+
 	chPacket := make(chan inPacket, qlen)
 	go l.receiver(chPacket)
 	for {
@@ -703,45 +716,60 @@ func (l *Listener) monitor() {
 			}
 
 			if dataValid {
-				addr := from.String()
-				s, ok := l.sessions[addr]
-				if !ok { // new session
-					if len(l.chAccepts) < cap(l.chAccepts) { // do not let new session overwhelm accept queue
-						var conv uint32
-						convValid := false
-						if l.fecDecoder != nil {
-							isfec := binary.LittleEndian.Uint16(data[4:])
-							if isfec == typeData {
-								conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
-								convValid = true
-							}
-						} else {
-							conv = binary.LittleEndian.Uint32(data)
-							convValid = true
-						}
+				var conv uint32
+				convValid := false
+				if l.fecDecoder != nil {
+					isfec := binary.LittleEndian.Uint16(data[4:])
+					if isfec == typeData {
+						conv = binary.LittleEndian.Uint32(data[fecHeaderSizePlus2:])
+						convValid = true
+					}
+				} else {
+					conv = binary.LittleEndian.Uint32(data)
+					convValid = true
+				}
 
-						if convValid {
-							s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block)
+				if convValid {
+					addr := from.String()
+					key := fmt.Sprintf("%s/%d", addr, conv)
+					var s *UDPSession
+					var ok bool
+
+					// packets received from an address always come in batch.
+					// cache the session for next packet, without querying map.
+					if key == lastKey {
+						s, ok = lastSession, true
+					} else if s, ok = l.sessions[key]; ok {
+						lastSession = s
+						lastKey = addr
+					}
+
+					if !ok { // new session
+						if len(l.chAccepts) < cap(l.chAccepts) && len(l.sessions) < 4096 { // do not let new session overwhelm accept queue and connection count
+							s := newUDPSession(conv, l.dataShards, l.parityShards, l, l.conn, from, l.block, false)
 							s.kcpInput(data)
-							l.sessions[addr] = s
+							l.sessions[key] = s
 							l.chAccepts <- s
 						}
+					} else {
+						s.kcpInput(data)
 					}
-				} else {
-					s.kcpInput(data)
 				}
 			}
 
 			xmitBuf.Put(raw)
-		case deadlink := <-l.chSessionClosed:
-			delete(l.sessions, deadlink.String())
+		case key := <-l.chSessionClosed:
+			if key == lastKey {
+				lastKey = ""
+			}
+			delete(l.sessions, key)
 		case <-l.die:
 			return
 		}
 	}
 }
 
-func (l *Listener) receiver(ch chan inPacket) {
+func (l *Listener) receiver(ch chan<- inPacket) {
 	for {
 		data := xmitBuf.Get().([]byte)[:mtuLimit]
 		if n, from, err := l.conn.ReadFrom(data); err == nil && n >= l.headerSize+IKCP_OVERHEAD {
@@ -830,9 +858,9 @@ func (l *Listener) Close() error {
 }
 
 // closeSession notify the listener that a session has closed
-func (l *Listener) closeSession(remote net.Addr) bool {
+func (l *Listener) closeSession(key string) bool {
 	select {
-	case l.chSessionClosed <- remote:
+	case l.chSessionClosed <- key:
 		return true
 	case <-l.die:
 		return false
@@ -840,14 +868,10 @@ func (l *Listener) closeSession(remote net.Addr) bool {
 }
 
 // Addr returns the listener's network address, The Addr returned is shared by all invocations of Addr, so do not modify it.
-func (l *Listener) Addr() net.Addr {
-	return l.conn.LocalAddr()
-}
+func (l *Listener) Addr() net.Addr { return l.conn.LocalAddr() }
 
 // Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp",
-func Listen(laddr string) (net.Listener, error) {
-	return ListenWithOptions(laddr, nil, 0, 0)
-}
+func Listen(laddr string) (net.Listener, error) { return ListenWithOptions(laddr, nil, 0, 0) }
 
 // ListenWithOptions listens for incoming KCP packets addressed to the local address laddr on the network "udp" with packet encryption,
 // dataShards, parityShards defines Reed-Solomon Erasure Coding parameters
@@ -870,7 +894,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo
 	l.conn = conn
 	l.sessions = make(map[string]*UDPSession)
 	l.chAccepts = make(chan *UDPSession, acceptBacklog)
-	l.chSessionClosed = make(chan net.Addr)
+	l.chSessionClosed = make(chan string)
 	l.die = make(chan struct{})
 	l.dataShards = dataShards
 	l.parityShards = parityShards
@@ -890,9 +914,7 @@ func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketCo
 }
 
 // Dial connects to the remote address "raddr" on the network "udp"
-func Dial(raddr string) (net.Conn, error) {
-	return DialWithOptions(raddr, nil, 0, 0)
-}
+func Dial(raddr string) (net.Conn, error) { return DialWithOptions(raddr, nil, 0, 0) }
 
 // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption
 func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
@@ -906,11 +928,11 @@ func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards in
 		return nil, errors.Wrap(err, "net.DialUDP")
 	}
 
-	return NewConn(raddr, block, dataShards, parityShards, &ConnectedUDPConn{udpconn, udpconn})
+	return NewConn(raddr, block, dataShards, parityShards, &connectedUDPConn{udpconn}, true)
 }
 
 // NewConn establishes a session and talks KCP protocol over a packet connection.
-func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) {
+func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn, closeConn bool) (*UDPSession, error) {
 	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
 	if err != nil {
 		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
@@ -918,22 +940,16 @@ func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn
 
 	var convid uint32
 	binary.Read(rand.Reader, binary.LittleEndian, &convid)
-	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil
+	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block, closeConn), nil
 }
 
-func currentMs() uint32 {
-	return uint32(time.Now().UnixNano() / int64(time.Millisecond))
-}
+// returns current time in milliseconds
+func currentMs() uint32 { return uint32(time.Now().UnixNano() / int64(time.Millisecond)) }
 
-// ConnectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
+// connectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
 // to Write syscalls that are 4 times faster on some OS'es. This should only be
 // used for connections that were produced by a net.Dial* call.
-type ConnectedUDPConn struct {
-	*net.UDPConn
-	Conn net.Conn // underlying connection if any
-}
+type connectedUDPConn struct{ *net.UDPConn }
 
 // WriteTo redirects all writes to the Write syscall, which is 4 times faster.
-func (c *ConnectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
-	return c.Write(b)
-}
+func (c *connectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) { return c.Write(b) }

+ 0 - 0
vendor/github.com/xtaci/kcp-go/snmp.go → vendor/github.com/AudriusButkevicius/kcp-go/snmp.go


+ 11 - 13
vendor/github.com/xtaci/kcp-go/updater.go → vendor/github.com/AudriusButkevicius/kcp-go/updater.go

@@ -15,15 +15,13 @@ func init() {
 
 // entry contains a session update info
 type entry struct {
-	sid uint32
-	ts  time.Time
-	s   *UDPSession
+	ts time.Time
+	s  *UDPSession
 }
 
 // a global heap managed kcp.flush() caller
 type updateHeap struct {
 	entries  []entry
-	indices  map[uint32]int
 	mu       sync.Mutex
 	chWakeUp chan struct{}
 }
@@ -32,41 +30,40 @@ func (h *updateHeap) Len() int           { return len(h.entries) }
 func (h *updateHeap) Less(i, j int) bool { return h.entries[i].ts.Before(h.entries[j].ts) }
 func (h *updateHeap) Swap(i, j int) {
 	h.entries[i], h.entries[j] = h.entries[j], h.entries[i]
-	h.indices[h.entries[i].sid] = i
-	h.indices[h.entries[j].sid] = j
+	h.entries[i].s.updaterIdx = i
+	h.entries[j].s.updaterIdx = j
 }
 
 func (h *updateHeap) Push(x interface{}) {
 	h.entries = append(h.entries, x.(entry))
 	n := len(h.entries)
-	h.indices[h.entries[n-1].sid] = n - 1
+	h.entries[n-1].s.updaterIdx = n - 1
 }
 
 func (h *updateHeap) Pop() interface{} {
 	n := len(h.entries)
 	x := h.entries[n-1]
+	h.entries[n-1].s.updaterIdx = -1
 	h.entries[n-1] = entry{} // manual set nil for GC
 	h.entries = h.entries[0 : n-1]
-	delete(h.indices, x.sid)
 	return x
 }
 
 func (h *updateHeap) init() {
-	h.indices = make(map[uint32]int)
 	h.chWakeUp = make(chan struct{}, 1)
 }
 
 func (h *updateHeap) addSession(s *UDPSession) {
 	h.mu.Lock()
-	heap.Push(h, entry{s.sid, time.Now(), s})
+	heap.Push(h, entry{time.Now(), s})
 	h.mu.Unlock()
 	h.wakeup()
 }
 
 func (h *updateHeap) removeSession(s *UDPSession) {
 	h.mu.Lock()
-	if idx, ok := h.indices[s.sid]; ok {
-		heap.Remove(h, idx)
+	if s.updaterIdx != -1 {
+		heap.Remove(h, s.updaterIdx)
 	}
 	h.mu.Unlock()
 }
@@ -99,7 +96,8 @@ func (h *updateHeap) updateTask() {
 				break
 			}
 		}
-		if h.Len() > 0 {
+
+		if hlen > 0 {
 			timer = time.After(h.entries[0].ts.Sub(now))
 		}
 		h.mu.Unlock()

+ 0 - 0
vendor/github.com/xtaci/kcp-go/xor.go → vendor/github.com/AudriusButkevicius/kcp-go/xor.go


+ 21 - 0
vendor/github.com/templexxx/xor/LICENSE

@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017 Temple3x
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

+ 442 - 0
vendor/github.com/templexxx/xor/avx2_amd64.s

@@ -0,0 +1,442 @@
+#include "textflag.h"
+
+// addr of mem
+#define DST BX
+#define SRC SI
+#define SRC0 TMP4
+#define SRC1 TMP5
+
+// loop args
+// num of vect
+#define VECT CX
+#define LEN DX
+// pos of matrix
+#define POS R8
+
+// tmp store
+// num of vect or ...
+#define TMP1 R9
+// pos of matrix or ...
+#define TMP2 R10
+// store addr of data/parity or ...
+#define TMP3 R11
+#define TMP4 R12
+#define TMP5 R13
+#define TMP6 R14
+
+// func bytesAVX2mini(dst, src0, src1 []byte, size int)
+TEXT ·bytesAVX2mini(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $31, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop32b:
+	VMOVDQU (SRC0)(POS*1), Y0
+	VPXOR   (SRC1)(POS*1), Y0, Y0
+	VMOVDQU Y0, (DST)(POS*1)
+	ADDQ    $32, POS
+	CMPQ    LEN, POS
+	JNE     loop32b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $31, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $31, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $32
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesAVX2small(dst, src0, src1 []byte, size int)
+TEXT ·bytesAVX2small(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	VMOVDQU (SRC0)(POS*1), Y0
+	VMOVDQU 32(SRC0)(POS*1), Y1
+	VMOVDQU 64(SRC0)(POS*1), Y2
+	VMOVDQU 96(SRC0)(POS*1), Y3
+	VPXOR   (SRC1)(POS*1), Y0, Y0
+	VPXOR   32(SRC1)(POS*1), Y1, Y1
+	VPXOR   64(SRC1)(POS*1), Y2, Y2
+	VPXOR   96(SRC1)(POS*1), Y3, Y3
+	VMOVDQU Y0, (DST)(POS*1)
+	VMOVDQU Y1, 32(DST)(POS*1)
+	VMOVDQU Y2, 64(DST)(POS*1)
+	VMOVDQU Y3, 96(DST)(POS*1)
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $127, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesAVX2big(dst, src0, src1 []byte, size int)
+TEXT ·bytesAVX2big(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	VMOVDQU (SRC0)(POS*1), Y0
+	VMOVDQU 32(SRC0)(POS*1), Y1
+	VMOVDQU 64(SRC0)(POS*1), Y2
+	VMOVDQU 96(SRC0)(POS*1), Y3
+	VPXOR   (SRC1)(POS*1), Y0, Y0
+	VPXOR   32(SRC1)(POS*1), Y1, Y1
+	VPXOR   64(SRC1)(POS*1), Y2, Y2
+	VPXOR   96(SRC1)(POS*1), Y3, Y3
+	LONG    $0xe77da1c4; WORD $0x0304
+	LONG    $0xe77da1c4; WORD $0x034c; BYTE $0x20
+	LONG    $0xe77da1c4; WORD $0x0354; BYTE $0x40
+	LONG    $0xe77da1c4; WORD $0x035c; BYTE $0x60
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	SFENCE
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $127, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixAVX2small(dst []byte, src [][]byte)
+TEXT ·matrixAVX2small(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	MOVQ    VECT, TMP1
+	SUBQ    $2, TMP1
+	MOVQ    $0, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y0
+	VMOVDQU 32(TMP4)(POS*1), Y1
+	VMOVDQU 64(TMP3)(POS*1), Y2
+	VMOVDQU 96(TMP4)(POS*1), Y3
+
+next_vect:
+	ADDQ    $24, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y4
+	VMOVDQU 32(TMP4)(POS*1), Y5
+	VMOVDQU 64(TMP3)(POS*1), Y6
+	VMOVDQU 96(TMP4)(POS*1), Y7
+	VPXOR   Y4, Y0, Y0
+	VPXOR   Y5, Y1, Y1
+	VPXOR   Y6, Y2, Y2
+	VPXOR   Y7, Y3, Y3
+	SUBQ    $1, TMP1
+	JGE     next_vect
+
+	VMOVDQU Y0, (DST)(POS*1)
+	VMOVDQU Y1, 32(DST)(POS*1)
+	VMOVDQU Y2, 64(DST)(POS*1)
+	VMOVDQU Y3, 96(DST)(POS*1)
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $127, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixAVX2big(dst []byte, src [][]byte)
+TEXT ·matrixAVX2big(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $127, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop128b:
+	MOVQ    VECT, TMP1
+	SUBQ    $2, TMP1
+	MOVQ    $0, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y0
+	VMOVDQU 32(TMP4)(POS*1), Y1
+	VMOVDQU 64(TMP3)(POS*1), Y2
+	VMOVDQU 96(TMP4)(POS*1), Y3
+
+next_vect:
+	ADDQ    $24, TMP2
+	MOVQ    (SRC)(TMP2*1), TMP3
+	MOVQ    TMP3, TMP4
+	VMOVDQU (TMP3)(POS*1), Y4
+	VMOVDQU 32(TMP4)(POS*1), Y5
+	VMOVDQU 64(TMP3)(POS*1), Y6
+	VMOVDQU 96(TMP4)(POS*1), Y7
+	VPXOR   Y4, Y0, Y0
+	VPXOR   Y5, Y1, Y1
+	VPXOR   Y6, Y2, Y2
+	VPXOR   Y7, Y3, Y3
+	SUBQ    $1, TMP1
+	JGE     next_vect
+
+	LONG $0xe77da1c4; WORD $0x0304             // VMOVNTDQ  go1.8 has
+	LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20
+	LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40
+	LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60
+
+	ADDQ $128, POS
+	CMPQ LEN, POS
+	JNE  loop128b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $127, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $127, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $128
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+TEXT ·hasAVX2(SB), NOSPLIT, $0
+	XORQ AX, AX
+	XORQ CX, CX
+	ADDL $7, AX
+	CPUID
+	SHRQ $5, BX
+	ANDQ $1, BX
+	MOVB BX, ret+0(FP)
+	RET

+ 116 - 0
vendor/github.com/templexxx/xor/nosimd.go

@@ -0,0 +1,116 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package xor
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+const wordSize = int(unsafe.Sizeof(uintptr(0)))
+const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x"
+
+// xor the bytes in a and b. The destination is assumed to have enough space.
+func bytesNoSIMD(dst, a, b []byte, size int) {
+	if supportsUnaligned {
+		fastXORBytes(dst, a, b, size)
+	} else {
+		// TODO(hanwen): if (dst, a, b) have common alignment
+		// we could still try fastXORBytes. It is not clear
+		// how often this happens, and it's only worth it if
+		// the block encryption itself is hardware
+		// accelerated.
+		safeXORBytes(dst, a, b, size)
+	}
+}
+
+// split slice for cache-friendly
+const unitSize = 16 * 1024
+
+func matrixNoSIMD(dst []byte, src [][]byte) {
+	size := len(src[0])
+	start := 0
+	do := unitSize
+	for start < size {
+		end := start + do
+		if end <= size {
+			partNoSIMD(start, end, dst, src)
+			start = start + do
+		} else {
+			partNoSIMD(start, size, dst, src)
+			start = size
+		}
+	}
+}
+
+// split vect will improve performance with big data by reducing cache pollution
+func partNoSIMD(start, end int, dst []byte, src [][]byte) {
+	bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start)
+	for i := 2; i < len(src); i++ {
+		bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start)
+	}
+}
+
+// fastXORBytes xor in bulk. It only works on architectures that
+// support unaligned read/writes.
+func fastXORBytes(dst, a, b []byte, n int) {
+	w := n / wordSize
+	if w > 0 {
+		wordBytes := w * wordSize
+		fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes])
+	}
+	for i := n - n%wordSize; i < n; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+}
+
+func safeXORBytes(dst, a, b []byte, n int) {
+	ex := n % 8
+	for i := 0; i < ex; i++ {
+		dst[i] = a[i] ^ b[i]
+	}
+
+	for i := ex; i < n; i += 8 {
+		_dst := dst[i : i+8]
+		_a := a[i : i+8]
+		_b := b[i : i+8]
+		_dst[0] = _a[0] ^ _b[0]
+		_dst[1] = _a[1] ^ _b[1]
+		_dst[2] = _a[2] ^ _b[2]
+		_dst[3] = _a[3] ^ _b[3]
+
+		_dst[4] = _a[4] ^ _b[4]
+		_dst[5] = _a[5] ^ _b[5]
+		_dst[6] = _a[6] ^ _b[6]
+		_dst[7] = _a[7] ^ _b[7]
+	}
+}
+
+// fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.)
+// The arguments are assumed to be of equal length.
+func fastXORWords(dst, a, b []byte) {
+	dw := *(*[]uintptr)(unsafe.Pointer(&dst))
+	aw := *(*[]uintptr)(unsafe.Pointer(&a))
+	bw := *(*[]uintptr)(unsafe.Pointer(&b))
+	n := len(b) / wordSize
+	ex := n % 8
+	for i := 0; i < ex; i++ {
+		dw[i] = aw[i] ^ bw[i]
+	}
+
+	for i := ex; i < n; i += 8 {
+		_dw := dw[i : i+8]
+		_aw := aw[i : i+8]
+		_bw := bw[i : i+8]
+		_dw[0] = _aw[0] ^ _bw[0]
+		_dw[1] = _aw[1] ^ _bw[1]
+		_dw[2] = _aw[2] ^ _bw[2]
+		_dw[3] = _aw[3] ^ _bw[3]
+		_dw[4] = _aw[4] ^ _bw[4]
+		_dw[5] = _aw[5] ^ _bw[5]
+		_dw[6] = _aw[6] ^ _bw[6]
+		_dw[7] = _aw[7] ^ _bw[7]
+	}
+}

+ 574 - 0
vendor/github.com/templexxx/xor/sse2_amd64.s

@@ -0,0 +1,574 @@
+#include "textflag.h"
+
+// addr of mem
+#define DST BX
+#define SRC SI
+#define SRC0 TMP4
+#define SRC1 TMP5
+
+// loop args
+// num of vect
+#define VECT CX
+#define LEN DX
+// pos of matrix
+#define POS R8
+
+// tmp store
+// num of vect or ...
+#define TMP1 R9
+// pos of matrix or ...
+#define TMP2 R10
+// store addr of data/parity or ...
+#define TMP3 R11
+#define TMP4 R12
+#define TMP5 R13
+#define TMP6 R14
+
+// func bytesSrc0(dst, src0, src1 []byte)
+TEXT ·xorSrc0(SB), NOSPLIT, $0
+	MOVQ  len+32(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $15, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop16b:
+	MOVOU (SRC0)(POS*1), X0
+	XORPD (SRC1)(POS*1), X0
+	MOVOU X0, (DST)(POS*1)
+	ADDQ  $16, POS
+	CMPQ  LEN, POS
+	JNE   loop16b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $15, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $15, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $16
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSrc1(dst, src0, src1 []byte)
+TEXT ·xorSrc1(SB), NOSPLIT, $0
+	MOVQ  len+56(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $15, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop16b:
+	MOVOU (SRC0)(POS*1), X0
+	XORPD (SRC1)(POS*1), X0
+	MOVOU X0, (DST)(POS*1)
+	ADDQ  $16, POS
+	CMPQ  LEN, POS
+	JNE   loop16b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $15, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $15, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $16
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSSE2mini(dst, src0, src1 []byte, size int)
+TEXT ·bytesSSE2mini(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $15, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop16b:
+	MOVOU (SRC0)(POS*1), X0
+	XORPD (SRC1)(POS*1), X0
+
+	// MOVOU (SRC1)(POS*1), X4
+	// PXOR X4, X0
+	MOVOU X0, (DST)(POS*1)
+	ADDQ  $16, POS
+	CMPQ  LEN, POS
+	JNE   loop16b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $15, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $15, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $16
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSSE2small(dst, src0, src1 []byte, size int)
+TEXT ·bytesSSE2small(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVOU (SRC0)(POS*1), X0
+	MOVOU 16(SRC0)(POS*1), X1
+	MOVOU 32(SRC0)(POS*1), X2
+	MOVOU 48(SRC0)(POS*1), X3
+
+	MOVOU (SRC1)(POS*1), X4
+	MOVOU 16(SRC1)(POS*1), X5
+	MOVOU 32(SRC1)(POS*1), X6
+	MOVOU 48(SRC1)(POS*1), X7
+
+	PXOR X4, X0
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+
+	MOVOU X0, (DST)(POS*1)
+	MOVOU X1, 16(DST)(POS*1)
+	MOVOU X2, 32(DST)(POS*1)
+	MOVOU X3, 48(DST)(POS*1)
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $63, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func bytesSSE2big(dst, src0, src1 []byte, size int)
+TEXT ·bytesSSE2big(SB), NOSPLIT, $0
+	MOVQ  len+72(FP), LEN
+	CMPQ  LEN, $0
+	JE    ret
+	MOVQ  dst+0(FP), DST
+	MOVQ  src0+24(FP), SRC0
+	MOVQ  src1+48(FP), SRC1
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVOU (SRC0)(POS*1), X0
+	MOVOU 16(SRC0)(POS*1), X1
+	MOVOU 32(SRC0)(POS*1), X2
+	MOVOU 48(SRC0)(POS*1), X3
+
+	MOVOU (SRC1)(POS*1), X4
+	MOVOU 16(SRC1)(POS*1), X5
+	MOVOU 32(SRC1)(POS*1), X6
+	MOVOU 48(SRC1)(POS*1), X7
+
+	PXOR X4, X0
+	PXOR X5, X1
+	PXOR X6, X2
+	PXOR X7, X3
+
+	LONG $0xe70f4266; WORD $0x0304             // MOVNTDQ
+	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
+	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
+	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVB  -1(SRC0)(LEN*1), TMP1
+	MOVB  -1(SRC1)(LEN*1), TMP2
+	XORB  TMP1, TMP2
+	MOVB  TMP2, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP1
+	ANDQ  $63, TMP1
+
+loop_8b:
+	MOVQ -8(SRC0)(LEN*1), TMP2
+	MOVQ -8(SRC1)(LEN*1), TMP3
+	XORQ TMP2, TMP3
+	MOVQ TMP3, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP1
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixSSE2small(dst []byte, src [][]byte)
+TEXT ·matrixSSE2small(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVQ  VECT, TMP1
+	SUBQ  $2, TMP1
+	MOVQ  $0, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X0
+	MOVOU 16(TMP4)(POS*1), X1
+	MOVOU 32(TMP3)(POS*1), X2
+	MOVOU 48(TMP4)(POS*1), X3
+
+next_vect:
+	ADDQ  $24, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X4
+	MOVOU 16(TMP4)(POS*1), X5
+	MOVOU 32(TMP3)(POS*1), X6
+	MOVOU 48(TMP4)(POS*1), X7
+	PXOR  X4, X0
+	PXOR  X5, X1
+	PXOR  X6, X2
+	PXOR  X7, X3
+	SUBQ  $1, TMP1
+	JGE   next_vect
+
+	MOVOU X0, (DST)(POS*1)
+	MOVOU X1, 16(DST)(POS*1)
+	MOVOU X2, 32(DST)(POS*1)
+	MOVOU X3, 48(DST)(POS*1)
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $63, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+// func matrixSSE2big(dst []byte, src [][]byte)
+TEXT ·matrixSSE2big(SB), NOSPLIT, $0
+	MOVQ  dst+0(FP), DST
+	MOVQ  src+24(FP), SRC
+	MOVQ  vec+32(FP), VECT
+	MOVQ  len+8(FP), LEN
+	TESTQ $63, LEN
+	JNZ   not_aligned
+
+aligned:
+	MOVQ $0, POS
+
+loop64b:
+	MOVQ  VECT, TMP1
+	SUBQ  $2, TMP1
+	MOVQ  $0, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X0
+	MOVOU 16(TMP4)(POS*1), X1
+	MOVOU 32(TMP3)(POS*1), X2
+	MOVOU 48(TMP4)(POS*1), X3
+
+next_vect:
+	ADDQ  $24, TMP2
+	MOVQ  (SRC)(TMP2*1), TMP3
+	MOVQ  TMP3, TMP4
+	MOVOU (TMP3)(POS*1), X4
+	MOVOU 16(TMP4)(POS*1), X5
+	MOVOU 32(TMP3)(POS*1), X6
+	MOVOU 48(TMP4)(POS*1), X7
+	PXOR  X4, X0
+	PXOR  X5, X1
+	PXOR  X6, X2
+	PXOR  X7, X3
+	SUBQ  $1, TMP1
+	JGE   next_vect
+
+	LONG $0xe70f4266; WORD $0x0304
+	LONG $0xe70f4266; WORD $0x034c; BYTE $0x10
+	LONG $0xe70f4266; WORD $0x0354; BYTE $0x20
+	LONG $0xe70f4266; WORD $0x035c; BYTE $0x30
+
+	ADDQ $64, POS
+	CMPQ LEN, POS
+	JNE  loop64b
+	RET
+
+loop_1b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVB -1(TMP3)(LEN*1), TMP5
+
+next_vect_1b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVB -1(TMP3)(LEN*1), TMP6
+	XORB TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_1b
+
+	MOVB  TMP5, -1(DST)(LEN*1)
+	SUBQ  $1, LEN
+	TESTQ $7, LEN
+	JNZ   loop_1b
+
+	CMPQ  LEN, $0
+	JE    ret
+	TESTQ $63, LEN
+	JZ    aligned
+
+not_aligned:
+	TESTQ $7, LEN
+	JNE   loop_1b
+	MOVQ  LEN, TMP4
+	ANDQ  $63, TMP4
+
+loop_8b:
+	MOVQ VECT, TMP1
+	MOVQ $0, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	SUBQ $2, TMP1
+	MOVQ -8(TMP3)(LEN*1), TMP5
+
+next_vect_8b:
+	ADDQ $24, TMP2
+	MOVQ (SRC)(TMP2*1), TMP3
+	MOVQ -8(TMP3)(LEN*1), TMP6
+	XORQ TMP6, TMP5
+	SUBQ $1, TMP1
+	JGE  next_vect_8b
+
+	MOVQ TMP5, -8(DST)(LEN*1)
+	SUBQ $8, LEN
+	SUBQ $8, TMP4
+	JG   loop_8b
+
+	CMPQ LEN, $64
+	JGE  aligned
+	RET
+
+ret:
+	RET
+
+TEXT ·hasSSE2(SB), NOSPLIT, $0
+	XORQ AX, AX
+	INCL AX
+	CPUID
+	SHRQ $26, DX
+	ANDQ $1, DX
+	MOVB DX, ret+0(FP)
+	RET
+

+ 49 - 0
vendor/github.com/templexxx/xor/xor.go

@@ -0,0 +1,49 @@
+package xor
+
+// SIMD Extensions
+const (
+	none = iota
+	avx2
+	// first introduced by Intel with the initial version of the Pentium 4 in 2001
+	// so I think we can assume all amd64 has sse2
+	sse2
+)
+
+var extension = none
+
+// Bytes : chose the shortest one as xor size
+// it's better to use it for big data ( > 64bytes )
+func Bytes(dst, src0, src1 []byte) {
+	size := len(dst)
+	if size > len(src0) {
+		size = len(src0)
+	}
+	if size > len(src1) {
+		size = len(src1)
+	}
+	xorBytes(dst, src0, src1, size)
+}
+
+// BytesSameLen : all slice's length must be equal
+// cut size branch, save time for small data
+func BytesSameLen(dst, src0, src1 []byte) {
+	xorSrc1(dst, src0, src1)
+}
+
+// BytesSrc0 : src1 >= src0, dst >= src0
+// xor src0's len bytes
+func BytesSrc0(dst, src0, src1 []byte) {
+	xorSrc0(dst, src0, src1)
+}
+
+// BytesSrc1 : src0 >= src1, dst >= src1
+// xor src1's len bytes
+func BytesSrc1(dst, src0, src1 []byte) {
+	xorSrc1(dst, src0, src1)
+}
+
+// Matrix : all slice's length must be equal && != 0
+// len(src) must >= 2
+func Matrix(dst []byte, src [][]byte) {
+	xorMatrix(dst, src)
+}

+ 118 - 0
vendor/github.com/templexxx/xor/xor_amd64.go

@@ -0,0 +1,118 @@
+package xor
+
+func init() {
+	getEXT()
+}
+
+func getEXT() {
+	if hasAVX2() {
+		extension = avx2
+	} else {
+		extension = sse2
+	}
+	return
+}
+
+func xorBytes(dst, src0, src1 []byte, size int) {
+	switch extension {
+	case avx2:
+		bytesAVX2(dst, src0, src1, size)
+	default:
+		bytesSSE2(dst, src0, src1, size)
+	}
+}
+
+// non-temporal hint store
+const nontmp = 8 * 1024
+const avx2loopsize = 128
+
+func bytesAVX2(dst, src0, src1 []byte, size int) {
+	if size < avx2loopsize {
+		bytesAVX2mini(dst, src0, src1, size)
+	} else if size >= avx2loopsize && size <= nontmp {
+		bytesAVX2small(dst, src0, src1, size)
+	} else {
+		bytesAVX2big(dst, src0, src1, size)
+	}
+}
+
+const sse2loopsize = 64
+
+func bytesSSE2(dst, src0, src1 []byte, size int) {
+	if size < sse2loopsize {
+		bytesSSE2mini(dst, src0, src1, size)
+	} else if size >= sse2loopsize && size <= nontmp {
+		bytesSSE2small(dst, src0, src1, size)
+	} else {
+		bytesSSE2big(dst, src0, src1, size)
+	}
+}
+
+func xorMatrix(dst []byte, src [][]byte) {
+	switch extension {
+	case avx2:
+		matrixAVX2(dst, src)
+	default:
+		matrixSSE2(dst, src)
+	}
+}
+
+func matrixAVX2(dst []byte, src [][]byte) {
+	size := len(dst)
+	if size > nontmp {
+		matrixAVX2big(dst, src)
+	} else {
+		matrixAVX2small(dst, src)
+	}
+}
+
+func matrixSSE2(dst []byte, src [][]byte) {
+	size := len(dst)
+	if size > nontmp {
+		matrixSSE2big(dst, src)
+	} else {
+		matrixSSE2small(dst, src)
+	}
+}
+
+//go:noescape
+func xorSrc0(dst, src0, src1 []byte)
+
+//go:noescape
+func xorSrc1(dst, src0, src1 []byte)
+
+//go:noescape
+func bytesAVX2mini(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesAVX2big(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesAVX2small(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesSSE2mini(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesSSE2small(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func bytesSSE2big(dst, src0, src1 []byte, size int)
+
+//go:noescape
+func matrixAVX2small(dst []byte, src [][]byte)
+
+//go:noescape
+func matrixAVX2big(dst []byte, src [][]byte)
+
+//go:noescape
+func matrixSSE2small(dst []byte, src [][]byte)
+
+//go:noescape
+func matrixSSE2big(dst []byte, src [][]byte)
+
+//go:noescape
+func hasAVX2() bool
+
+//go:noescape
+func hasSSE2() bool

+ 19 - 0
vendor/github.com/templexxx/xor/xor_other.go

@@ -0,0 +1,19 @@
+// +build !amd64 noasm
+
+package xor
+
+func xorBytes(dst, src0, src1 []byte, size int) {
+	bytesNoSIMD(dst, src0, src1, size)
+}
+
+func xorMatrix(dst []byte, src [][]byte) {
+	matrixNoSIMD(dst, src)
+}
+
+func xorSrc0(dst, src0, src1 []byte) {
+	bytesNoSIMD(dst, src0, src1, len(src0))
+}
+
+func xorSrc1(dst, src0, src1 []byte) {
+	bytesNoSIMD(dst, src0, src1, len(src1))
+}

+ 16 - 8
vendor/manifest

@@ -17,6 +17,14 @@
 			"branch": "master",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/AudriusButkevicius/kcp-go",
+			"repository": "https://github.com/AudriusButkevicius/kcp-go",
+			"vcs": "git",
+			"revision": "0ccc04f3b8a7bdf53e2d4d6d0769adbc7cb3851a",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/AudriusButkevicius/pfilter",
 			"repository": "https://github.com/AudriusButkevicius/pfilter",
@@ -378,6 +386,14 @@
 			"path": "/leveldb",
 			"notests": true
 		},
+		{
+			"importpath": "github.com/templexxx/xor",
+			"repository": "https://github.com/templexxx/xor",
+			"vcs": "git",
+			"revision": "42f9c041c330b560afb991153bf183c25444bcdc",
+			"branch": "master",
+			"notests": true
+		},
 		{
 			"importpath": "github.com/thejerf/suture",
 			"repository": "https://github.com/thejerf/suture",
@@ -413,14 +429,6 @@
 			"path": "/qr",
 			"notests": true
 		},
-		{
-			"importpath": "github.com/xtaci/kcp-go",
-			"repository": "https://github.com/xtaci/kcp-go",
-			"vcs": "git",
-			"revision": "0b0731ef3f184a8985edcb4ca26a4b0598c6dc1a",
-			"branch": "master",
-			"notests": true
-		},
 		{
 			"importpath": "github.com/xtaci/smux",
 			"repository": "https://github.com/xtaci/smux",