Browse Source

lib/scanner, vendor: Update github.com/chmduquesne/rollinghash (fixes #5334) (#5335)

Updates the package and fixes a test that depended on the old behavior
of Write() being equivalent to Reset()+Write() which is no longer the
case. The scanner already did resets after each block write, so this is
fine.
Jakob Borg, 7 năm trước
mục cha
commit
c0a26c918a

+ 1 - 2
lib/scanner/blocks_test.go

@@ -144,14 +144,13 @@ func TestAdler32Variants(t *testing.T) {
 
 	windowSize := 128
 
-	hf2.Reset()
-
 	hf3 := rollingAdler32.New()
 	hf3.Write(data[:windowSize])
 
 	for i := windowSize; i < len(data); i++ {
 		if i%windowSize == 0 {
 			// let the reference function catch up
+			hf2.Reset()
 			hf2.Write(data[i-windowSize : i])
 
 			// verify that they are in sync with the rolling function

+ 29 - 21
vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go

@@ -18,23 +18,24 @@ const (
 // It implements the adler32 algorithm https://en.wikipedia.org/wiki/Adler-32
 type Adler32 struct {
 	a, b uint32
+	n    uint32
 
 	// window is treated like a circular buffer, where the oldest element
 	// is indicated by d.oldest
 	window []byte
 	oldest int
-	n      uint32
 
 	vanilla hash.Hash32
 }
 
 // Reset resets the digest to its initial state.
 func (d *Adler32) Reset() {
-	d.window = d.window[:1] // Reset the size but don't reallocate
-	d.window[0] = 0
+	d.window = d.window[:0] // Reset the size but don't reallocate
+	d.oldest = 0
 	d.a = 1
 	d.b = 0
-	d.oldest = 0
+	d.n = 0
+	d.vanilla.Reset()
 }
 
 // New returns a new Adler32 digest
@@ -42,7 +43,8 @@ func New() *Adler32 {
 	return &Adler32{
 		a:       1,
 		b:       0,
-		window:  make([]byte, 1, rollinghash.DefaultWindowCap),
+		n:       0,
+		window:  make([]byte, 0, rollinghash.DefaultWindowCap),
 		oldest:  0,
 		vanilla: vanilla.New(),
 	}
@@ -54,30 +56,30 @@ func (d *Adler32) Size() int { return Size }
 // BlockSize is 1 byte
 func (d *Adler32) BlockSize() int { return 1 }
 
-// Write (re)initializes the rolling window with the input byte slice and
-// adds its data to the digest.
-func (d *Adler32) Write(p []byte) (int, error) {
-	// Copy the window, avoiding allocations where possible
-	l := len(p)
+// Write appends data to the rolling window and updates the digest.
+func (d *Adler32) Write(data []byte) (int, error) {
+	l := len(data)
 	if l == 0 {
-		l = 1
+		return 0, nil
 	}
-	if len(d.window) != l {
-		if cap(d.window) >= l {
-			d.window = d.window[:l]
-		} else {
-			d.window = make([]byte, len(p))
-		}
+	// Re-arrange the window so that the leftmost element is at index 0
+	n := len(d.window)
+	if d.oldest != 0 {
+		tmp := make([]byte, d.oldest)
+		copy(tmp, d.window[:d.oldest])
+		copy(d.window, d.window[d.oldest:])
+		copy(d.window[n-d.oldest:], tmp)
+		d.oldest = 0
 	}
-	copy(d.window, p)
+	d.window = append(d.window, data...)
 
 	// Piggy-back on the core implementation
 	d.vanilla.Reset()
-	d.vanilla.Write(p)
+	d.vanilla.Write(d.window)
 	s := d.vanilla.Sum32()
 	d.a, d.b = s&0xffff, s>>16
-	d.n = uint32(len(p)) % Mod
-	return len(d.window), nil
+	d.n = uint32(len(d.window)) % Mod
+	return len(data), nil
 }
 
 // Sum32 returns the hash as a uint32
@@ -94,6 +96,12 @@ func (d *Adler32) Sum(b []byte) []byte {
 // Roll updates the checksum of the window from the entering byte. You
 // MUST initialize a window with Write() before calling this method.
 func (d *Adler32) Roll(b byte) {
+	// This check costs 10-15% performance. If we disable it, we crash
+	// when the window is empty. If we enable it, we are always correct
+	// (an empty window never changes no matter how much you roll it).
+	//if len(d.window) == 0 {
+	//	return
+	//}
 	// extract the entering/leaving bytes and update the circular buffer.
 	enter := uint32(b)
 	leave := uint32(d.window[d.oldest])

+ 37 - 26
vendor/github.com/chmduquesne/rollinghash/bozo32/bozo32.go

@@ -14,9 +14,9 @@ const Size = 4
 
 // Bozo32 is a digest which satisfies the rollinghash.Hash32 interface.
 type Bozo32 struct {
-	a       uint32
-	h       uint32
-	aPowerN uint32
+	a     uint32
+	aⁿ    uint32
+	value uint32
 
 	// window is treated like a circular buffer, where the oldest element
 	// is indicated by d.oldest
@@ -26,19 +26,19 @@ type Bozo32 struct {
 
 // Reset resets the Hash to its initial state.
 func (d *Bozo32) Reset() {
-	d.h = 0
-	d.aPowerN = 1
-	d.window = nil
+	d.value = 0
+	d.aⁿ = 1
 	d.oldest = 0
+	d.window = d.window[:0]
 }
 
 func NewFromInt(a uint32) *Bozo32 {
 	return &Bozo32{
-		a:       a,
-		h:       0,
-		aPowerN: 1,
-		window:  make([]byte, 1, rollinghash.DefaultWindowCap),
-		oldest:  0,
+		a:      a,
+		value:  0,
+		aⁿ:     1,
+		window: make([]byte, 0, rollinghash.DefaultWindowCap),
+		oldest: 0,
 	}
 }
 
@@ -52,32 +52,37 @@ func (d *Bozo32) Size() int { return Size }
 // BlockSize is 1 byte
 func (d *Bozo32) BlockSize() int { return 1 }
 
-// Write (re)initializes the rolling window with the input byte slice and
-// adds its data to the digest. It never returns an error.
+// Write appends data to the rolling window and updates the digest. It
+// never returns an error.
 func (d *Bozo32) Write(data []byte) (int, error) {
-	// Copy the window
 	l := len(data)
 	if l == 0 {
-		l = 1
+		return 0, nil
 	}
-	if len(d.window) >= l {
-		d.window = d.window[:l]
-	} else {
-		d.window = make([]byte, l)
+	// Re-arrange the window so that the leftmost element is at index 0
+	n := len(d.window)
+	if d.oldest != 0 {
+		tmp := make([]byte, d.oldest)
+		copy(tmp, d.window[:d.oldest])
+		copy(d.window, d.window[d.oldest:])
+		copy(d.window[n-d.oldest:], tmp)
+		d.oldest = 0
 	}
-	copy(d.window, data)
+	d.window = append(d.window, data...)
 
+	d.value = 0
+	d.aⁿ = 1
 	for _, c := range d.window {
-		d.h *= d.a
-		d.h += uint32(c)
-		d.aPowerN *= d.a
+		d.value *= d.a
+		d.value += uint32(c)
+		d.aⁿ *= d.a // accumulate a^len(window) for Roll; the multiplier d.a itself must stay constant
 	}
-	return len(d.window), nil
+	return len(data), nil
 }
 
 // Sum32 returns the hash as a uint32
 func (d *Bozo32) Sum32() uint32 {
-	return d.h
+	return d.value
 }
 
 // Sum returns the hash as byte slice
@@ -89,6 +94,12 @@ func (d *Bozo32) Sum(b []byte) []byte {
 // Roll updates the checksum of the window from the entering byte. You
 // MUST initialize a window with Write() before calling this method.
 func (d *Bozo32) Roll(c byte) {
+	// This check costs 10-15% performance. If we disable it, we crash
+	// when the window is empty. If we enable it, we are always correct
+	// (an empty window never changes no matter how much you roll it).
+	//if len(d.window) == 0 {
+	//	return
+	//}
 	// extract the entering/leaving bytes and update the circular buffer.
 	enter := uint32(c)
 	leave := uint32(d.window[d.oldest])
@@ -99,5 +110,5 @@ func (d *Bozo32) Roll(c byte) {
 		d.oldest = 0
 	}
 
-	d.h = d.h*d.a + enter - leave*d.aPowerN
+	d.value = d.value*d.a + enter - leave*d.aⁿ
 }

+ 21 - 13
vendor/github.com/chmduquesne/rollinghash/buzhash32/buzhash32.go

@@ -65,7 +65,7 @@ func New() *Buzhash32 {
 func NewFromUint32Array(b [256]uint32) *Buzhash32 {
 	return &Buzhash32{
 		sum:      0,
-		window:   make([]byte, 1, rollinghash.DefaultWindowCap),
+		window:   make([]byte, 0, rollinghash.DefaultWindowCap),
 		oldest:   0,
 		bytehash: b,
 	}
@@ -77,30 +77,31 @@ func (d *Buzhash32) Size() int { return Size }
 // BlockSize is 1 byte
 func (d *Buzhash32) BlockSize() int { return 1 }
 
-// Write (re)initializes the rolling window with the input byte slice and
-// adds its data to the digest.
+// Write appends data to the rolling window and updates the digest.
 func (d *Buzhash32) Write(data []byte) (int, error) {
-	// Copy the window, avoiding allocations where possible
 	l := len(data)
 	if l == 0 {
-		l = 1
+		return 0, nil
 	}
-	if len(d.window) != l {
-		if cap(d.window) >= l {
-			d.window = d.window[:l]
-		} else {
-			d.window = make([]byte, l)
-		}
+	// Re-arrange the window so that the leftmost element is at index 0
+	n := len(d.window)
+	if d.oldest != 0 {
+		tmp := make([]byte, d.oldest)
+		copy(tmp, d.window[:d.oldest])
+		copy(d.window, d.window[d.oldest:])
+		copy(d.window[n-d.oldest:], tmp)
+		d.oldest = 0
 	}
-	copy(d.window, data)
+	d.window = append(d.window, data...)
 
+	d.sum = 0
 	for _, c := range d.window {
 		d.sum = d.sum<<1 | d.sum>>31
 		d.sum ^= d.bytehash[int(c)]
 	}
 	d.nRotate = uint(len(d.window)) % 32
 	d.nRotateComplement = 32 - d.nRotate
-	return len(d.window), nil
+	return len(data), nil
 }
 
 // Sum32 returns the hash as a uint32
@@ -117,6 +118,13 @@ func (d *Buzhash32) Sum(b []byte) []byte {
 // Roll updates the checksum of the window from the entering byte. You
 // MUST initialize a window with Write() before calling this method.
 func (d *Buzhash32) Roll(c byte) {
+	// This check costs 10-15% performance. If we disable it, we crash
+	// when the window is empty. If we enable it, we are always correct
+	// (an empty window never changes no matter how much you roll it).
+	//if len(d.window) == 0 {
+	//	return
+	//}
+
 	// extract the entering/leaving bytes and update the circular buffer.
 	hn := d.bytehash[int(c)]
 	h0 := d.bytehash[int(d.window[d.oldest])]

+ 22 - 13
vendor/github.com/chmduquesne/rollinghash/buzhash64/buzhash64.go

@@ -65,7 +65,7 @@ func New() *Buzhash64 {
 func NewFromUint64Array(b [256]uint64) *Buzhash64 {
 	return &Buzhash64{
 		sum:      0,
-		window:   make([]byte, 1, rollinghash.DefaultWindowCap),
+		window:   make([]byte, 0, rollinghash.DefaultWindowCap),
 		oldest:   0,
 		bytehash: b,
 	}
@@ -77,30 +77,32 @@ func (d *Buzhash64) Size() int { return Size }
 // BlockSize is 1 byte
 func (d *Buzhash64) BlockSize() int { return 1 }
 
-// Write (re)initializes the rolling window with the input byte slice and
-// adds its data to the digest.
+// Write appends data to the rolling window and updates the digest. It
+// never returns an error.
 func (d *Buzhash64) Write(data []byte) (int, error) {
-	// Copy the window, avoiding allocations where possible
 	l := len(data)
 	if l == 0 {
-		l = 1
+		return 0, nil
 	}
-	if len(d.window) != l {
-		if cap(d.window) >= l {
-			d.window = d.window[:l]
-		} else {
-			d.window = make([]byte, l)
-		}
+	// Re-arrange the window so that the leftmost element is at index 0
+	n := len(d.window)
+	if d.oldest != 0 {
+		tmp := make([]byte, d.oldest)
+		copy(tmp, d.window[:d.oldest])
+		copy(d.window, d.window[d.oldest:])
+		copy(d.window[n-d.oldest:], tmp)
+		d.oldest = 0
 	}
-	copy(d.window, data)
+	d.window = append(d.window, data...)
 
+	d.sum = 0
 	for _, c := range d.window {
 		d.sum = d.sum<<1 | d.sum>>63
 		d.sum ^= d.bytehash[int(c)]
 	}
 	d.nRotate = uint(len(d.window)) % 64
 	d.nRotateComplement = 64 - d.nRotate
-	return len(d.window), nil
+	return len(data), nil
 }
 
 // Sum64 returns the hash as a uint64
@@ -117,6 +119,13 @@ func (d *Buzhash64) Sum(b []byte) []byte {
 // Roll updates the checksum of the window from the entering byte. You
 // MUST initialize a window with Write() before calling this method.
 func (d *Buzhash64) Roll(c byte) {
+	// This check costs 10-15% performance. If we disable it, we crash
+	// when the window is empty. If we enable it, we are always correct
+	// (an empty window never changes no matter how much you roll it).
+	//if len(d.window) == 0 {
+	//	return
+	//}
+
 	// extract the entering/leaving bytes and update the circular buffer.
 	hn := d.bytehash[int(c)]
 	h0 := d.bytehash[int(d.window[d.oldest])]

+ 39 - 23
vendor/github.com/chmduquesne/rollinghash/rabinkarp64/rabinkarp64.go

@@ -70,8 +70,10 @@ func init() {
 	cache.entries = make(map[index]*tables)
 }
 
-func (d *RabinKarp64) buildTables() {
+func (d *RabinKarp64) updateTables() {
 	windowsize := len(d.window)
+	pol := d.pol
+
 	idx := index{d.pol, windowsize}
 
 	cache.Lock()
@@ -82,8 +84,15 @@ func (d *RabinKarp64) buildTables() {
 		return
 	}
 
-	t = &tables{}
+	d.tables = buildTables(pol, windowsize)
+	cache.Lock()
+	cache.entries[idx] = d.tables
+	cache.Unlock()
+	return
+}
 
+func buildTables(pol Pol, windowsize int) (t *tables) {
+	t = &tables{}
 	// calculate table for sliding out bytes. The byte to slide out is used as
 	// the index for the table, the value contains the following:
 	// out_table[b] = Hash(b || 0 ||        ...        || 0)
@@ -99,17 +108,17 @@ func (d *RabinKarp64) buildTables() {
 		var h Pol
 		h <<= 8
 		h |= Pol(b)
-		h = h.Mod(d.pol)
+		h = h.Mod(pol)
 		for i := 0; i < windowsize-1; i++ {
 			h <<= 8
 			h |= Pol(0)
-			h = h.Mod(d.pol)
+			h = h.Mod(pol)
 		}
 		t.out[b] = h
 	}
 
 	// calculate table for reduction mod Polynomial
-	k := d.pol.Deg()
+	k := pol.Deg()
 	for b := 0; b < 256; b++ {
 		// mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and  B = b(x) * x^k
 		//
@@ -118,13 +127,10 @@ func (d *RabinKarp64) buildTables() {
 		// two parts: Part A contains the result of the modulus operation, part
 		// B is used to cancel out the 8 top bits so that one XOR operation is
 		// enough to reduce modulo Polynomial
-		t.mod[b] = Pol(uint64(b)<<uint(k)).Mod(d.pol) | (Pol(b) << uint(k))
+		t.mod[b] = Pol(uint64(b)<<uint(k)).Mod(pol) | (Pol(b) << uint(k))
 	}
 
-	d.tables = t
-	cache.Lock()
-	cache.entries[idx] = d.tables
-	cache.Unlock()
+	return t
 }
 
 // NewFromPol returns a RabinKarp64 digest from a polynomial over GF(2).
@@ -139,6 +145,7 @@ func NewFromPol(p Pol) *RabinKarp64 {
 		window:   make([]byte, 0, rollinghash.DefaultWindowCap),
 		oldest:   0,
 	}
+	res.updateTables()
 	return res
 }
 
@@ -156,9 +163,9 @@ func New() *RabinKarp64 {
 func (d *RabinKarp64) Reset() {
 	d.tables = nil
 	d.value = 0
-	d.window = d.window[:1]
-	d.window[0] = 0
+	d.window = d.window[:0]
 	d.oldest = 0
+	d.updateTables()
 }
 
 // Size is 8 bytes
@@ -167,30 +174,33 @@ func (d *RabinKarp64) Size() int { return Size }
 // BlockSize is 1 byte
 func (d *RabinKarp64) BlockSize() int { return 1 }
 
-// Write (re)initializes the rolling window with the input byte slice and
-// adds its data to the digest. It never returns an error.
+// Write appends data to the rolling window and updates the digest.
 func (d *RabinKarp64) Write(data []byte) (int, error) {
-	// Copy the window
 	l := len(data)
 	if l == 0 {
-		l = 1
+		return 0, nil
 	}
-	if len(d.window) >= l {
-		d.window = d.window[:l]
-	} else {
-		d.window = make([]byte, l)
+	// Re-arrange the window so that the leftmost element is at index 0
+	n := len(d.window)
+	if d.oldest != 0 {
+		tmp := make([]byte, d.oldest)
+		copy(tmp, d.window[:d.oldest])
+		copy(d.window, d.window[d.oldest:])
+		copy(d.window[n-d.oldest:], tmp)
+		d.oldest = 0
 	}
-	copy(d.window, data)
+	d.window = append(d.window, data...)
 
+	d.value = 0
 	for _, b := range d.window {
 		d.value <<= 8
 		d.value |= Pol(b)
 		d.value = d.value.Mod(d.pol)
 	}
 
-	d.buildTables()
+	d.updateTables()
 
-	return len(d.window), nil
+	return len(data), nil
 }
 
 // Sum64 returns the hash as a uint64
@@ -207,6 +217,12 @@ func (d *RabinKarp64) Sum(b []byte) []byte {
 // Roll updates the checksum of the window from the entering byte. You
 // MUST initialize a window with Write() before calling this method.
 func (d *RabinKarp64) Roll(c byte) {
+	// This check costs 10-15% performance. If we disable it, we crash
+	// when the window is empty. If we enable it, we are always correct
+	// (an empty window never changes no matter how much you roll it).
+	//if len(d.window) == 0 {
+	//	return
+	//}
 	// extract the entering/leaving bytes and update the circular buffer.
 	enter := c
 	leave := uint64(d.window[d.oldest])

+ 22 - 8
vendor/github.com/chmduquesne/rollinghash/roll/main.go

@@ -3,9 +3,11 @@ package main
 import (
 	"flag"
 	"fmt"
+	"hash"
 	"io"
 	"log"
 	"os"
+	"runtime/pprof"
 	"time"
 
 	"code.cloudfoundry.org/bytefmt"
@@ -33,7 +35,10 @@ func genMasks() (res []uint64) {
 	return
 }
 
-func hash2uint64(s []byte) (res uint64) {
+// Gets the hash sum as a uint64
+func sum64(h hash.Hash) (res uint64) {
+	buf := make([]byte, 0, 8)
+	s := h.Sum(buf)
 	for _, b := range s {
 		res <<= 8
 		res |= uint64(b)
@@ -42,18 +47,27 @@ func hash2uint64(s []byte) (res uint64) {
 }
 
 func main() {
+	cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file")
 	dostats := flag.Bool("stats", false, "Do some stats about the rolling sum")
 	size := flag.String("size", "256M", "How much data to read")
 	flag.Parse()
 
+	if *cpuprofile != "" {
+		f, err := os.Create(*cpuprofile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		pprof.StartCPUProfile(f)
+		defer pprof.StopCPUProfile()
+	}
+
 	fileSize, err := bytefmt.ToBytes(*size)
 	if err != nil {
 		log.Fatal(err)
 	}
 
 	bufsize := 16 * MiB
-	rbuf := make([]byte, bufsize)
-	hbuf := make([]byte, 0, 8)
+	buf := make([]byte, bufsize)
 	t := time.Now()
 
 	f, err := os.Open("/dev/urandom")
@@ -66,10 +80,10 @@ func main() {
 		}
 	}()
 
-	io.ReadFull(f, rbuf)
+	io.ReadFull(f, buf)
 
 	roll := rollsum.New()
-	roll.Write(rbuf[:64])
+	roll.Write(buf[:64])
 
 	masks := genMasks()
 	hits := make(map[uint64]uint64)
@@ -97,15 +111,15 @@ func main() {
 				fmt.Printf(status)
 				fmt.Printf("\r")
 			}
-			_, err := io.ReadFull(f, rbuf)
+			_, err := io.ReadFull(f, buf)
 			if err != nil {
 				panic(err)
 			}
 			k = 0
 		}
-		roll.Roll(rbuf[k])
+		roll.Roll(buf[k])
 		if *dostats {
-			s := hash2uint64(roll.Sum(hbuf))
+			s := sum64(roll)
 			for _, m := range masks {
 				if s&m == m {
 					hits[m] += 1

+ 1 - 1
vendor/manifest

@@ -94,7 +94,7 @@
 			"importpath": "github.com/chmduquesne/rollinghash",
 			"repository": "https://github.com/chmduquesne/rollinghash",
 			"vcs": "git",
-			"revision": "abb8cbaf9915e48ee20cae94bcd94221b61707a2",
+			"revision": "a60f8e7142b536ea61bb5d84014171189eeaaa81",
 			"branch": "master",
 			"notests": true
 		},