Browse Source

Update goleveldb

Jakob Borg 11 years ago
parent
commit
68399601ce
43 changed files with 2354 additions and 1171 deletions
  1. 5 6
      Godeps/Godeps.json
  2. 132 96
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go
  3. 13 13
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go
  4. 2 2
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go
  5. 0 40
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go
  6. 97 58
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go
  7. 363 228
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go
  8. 16 18
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go
  9. 431 31
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go
  10. 7 4
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go
  11. 31 14
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go
  12. 2 22
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go
  13. 76 0
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go
  14. 14 23
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go
  15. 1 1
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go
  16. 7 8
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go
  17. 4 4
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go
  18. 68 65
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go
  19. 52 42
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go
  20. 2 2
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go
  21. 266 19
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go
  22. 65 9
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go
  23. 135 73
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go
  24. 98 93
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go
  25. 10 8
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go
  26. 34 33
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go
  27. 30 0
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go
  28. 87 33
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go
  29. 47 21
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table.go
  30. 150 104
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go
  31. 3 3
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go
  32. 3 2
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go
  33. 1 1
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go
  34. 2 1
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go
  35. 3 2
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go
  36. 2 2
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go
  37. 33 19
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go
  38. 0 1
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go
  39. 62 70
      Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go
  40. 0 0
      Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go
  41. 0 0
      Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go
  42. 0 0
      Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go
  43. 0 0
      Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go

+ 5 - 6
Godeps/Godeps.json

@@ -25,11 +25,6 @@
 			"Comment": "null-90",
 			"Rev": "d65bffbc88a153d23a6d2a864531e6e7c2cde59b"
 		},
-		{
-			"ImportPath": "code.google.com/p/snappy-go/snappy",
-			"Comment": "null-15",
-			"Rev": "12e4b4183793ac4b061921e7980845e750679fd0"
-		},
 		{
 			"ImportPath": "github.com/AudriusButkevicius/lfu-go",
 			"Rev": "164bcecceb92fd6037f4d18a8d97b495ec6ef669"
@@ -56,7 +51,11 @@
 		},
 		{
 			"ImportPath": "github.com/syndtr/goleveldb/leveldb",
-			"Rev": "cd2b8f743192883ab9fbc5f070ebda1dc90f3732"
+			"Rev": "d8d1d2a5cc2d34c950dffa2f554525415d59f737"
+		},
+		{
+			"ImportPath": "github.com/syndtr/gosnappy/snappy",
+			"Rev": "ce8acff4829e0c2458a67ead32390ac0a381c862"
 		},
 		{
 			"ImportPath": "github.com/vitrun/qart/coding",

+ 132 - 96
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go

@@ -8,65 +8,84 @@ package leveldb
 
 import (
 	"encoding/binary"
-	"errors"
+	"fmt"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/memdb"
 )
 
-var (
-	errBatchTooShort  = errors.New("leveldb: batch is too short")
-	errBatchBadRecord = errors.New("leveldb: bad record in batch")
-)
+type ErrBatchCorrupted struct {
+	Reason string
+}
+
+func (e *ErrBatchCorrupted) Error() string {
+	return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason)
+}
+
+func newErrBatchCorrupted(reason string) error {
+	return errors.NewErrCorrupted(nil, &ErrBatchCorrupted{reason})
+}
 
-const kBatchHdrLen = 8 + 4
+const (
+	batchHdrLen  = 8 + 4
+	batchGrowRec = 3000
+)
 
-type batchReplay interface {
-	put(key, value []byte, seq uint64)
-	delete(key []byte, seq uint64)
+type BatchReplay interface {
+	Put(key, value []byte)
+	Delete(key []byte)
 }
 
 // Batch is a write batch.
 type Batch struct {
-	buf        []byte
+	data       []byte
 	rLen, bLen int
 	seq        uint64
 	sync       bool
 }
 
 func (b *Batch) grow(n int) {
-	off := len(b.buf)
+	off := len(b.data)
 	if off == 0 {
-		// include headers
-		off = kBatchHdrLen
-		n += off
+		off = batchHdrLen
+		if b.data != nil {
+			b.data = b.data[:off]
+		}
 	}
-	if cap(b.buf)-off >= n {
-		return
+	if cap(b.data)-off < n {
+		if b.data == nil {
+			b.data = make([]byte, off, off+n)
+		} else {
+			odata := b.data
+			div := 1
+			if b.rLen > batchGrowRec {
+				div = b.rLen / batchGrowRec
+			}
+			b.data = make([]byte, off, off+n+(off-batchHdrLen)/div)
+			copy(b.data, odata)
+		}
 	}
-	buf := make([]byte, 2*cap(b.buf)+n)
-	copy(buf, b.buf)
-	b.buf = buf[:off]
 }
 
-func (b *Batch) appendRec(t vType, key, value []byte) {
+func (b *Batch) appendRec(kt kType, key, value []byte) {
 	n := 1 + binary.MaxVarintLen32 + len(key)
-	if t == tVal {
+	if kt == ktVal {
 		n += binary.MaxVarintLen32 + len(value)
 	}
 	b.grow(n)
-	off := len(b.buf)
-	buf := b.buf[:off+n]
-	buf[off] = byte(t)
+	off := len(b.data)
+	data := b.data[:off+n]
+	data[off] = byte(kt)
 	off += 1
-	off += binary.PutUvarint(buf[off:], uint64(len(key)))
-	copy(buf[off:], key)
+	off += binary.PutUvarint(data[off:], uint64(len(key)))
+	copy(data[off:], key)
 	off += len(key)
-	if t == tVal {
-		off += binary.PutUvarint(buf[off:], uint64(len(value)))
-		copy(buf[off:], value)
+	if kt == ktVal {
+		off += binary.PutUvarint(data[off:], uint64(len(value)))
+		copy(data[off:], value)
 		off += len(value)
 	}
-	b.buf = buf[:off]
+	b.data = data[:off]
 	b.rLen++
 	//  Include 8-byte ikey header
 	b.bLen += len(key) + len(value) + 8
@@ -75,18 +94,51 @@ func (b *Batch) appendRec(t vType, key, value []byte) {
 // Put appends 'put operation' of the given key/value pair to the batch.
 // It is safe to modify the contents of the argument after Put returns.
 func (b *Batch) Put(key, value []byte) {
-	b.appendRec(tVal, key, value)
+	b.appendRec(ktVal, key, value)
 }
 
 // Delete appends 'delete operation' of the given key to the batch.
 // It is safe to modify the contents of the argument after Delete returns.
 func (b *Batch) Delete(key []byte) {
-	b.appendRec(tDel, key, nil)
+	b.appendRec(ktDel, key, nil)
+}
+
+// Dump dumps batch contents. The returned slice can be loaded into the
+// batch using Load method.
+// The returned slice is not its own copy, so the contents should not be
+// modified.
+func (b *Batch) Dump() []byte {
+	return b.encode()
+}
+
+// Load loads given slice into the batch. Previous contents of the batch
+// will be discarded.
+// The given slice will not be copied and will be used as batch buffer, so
+// it is not safe to modify the contents of the slice.
+func (b *Batch) Load(data []byte) error {
+	return b.decode(0, data)
+}
+
+// Replay replays batch contents.
+func (b *Batch) Replay(r BatchReplay) error {
+	return b.decodeRec(func(i int, kt kType, key, value []byte) {
+		switch kt {
+		case ktVal:
+			r.Put(key, value)
+		case ktDel:
+			r.Delete(key)
+		}
+	})
+}
+
+// Len returns number of records in the batch.
+func (b *Batch) Len() int {
+	return b.rLen
 }
 
 // Reset resets the batch.
 func (b *Batch) Reset() {
-	b.buf = nil
+	b.data = b.data[:0]
 	b.seq = 0
 	b.rLen = 0
 	b.bLen = 0
@@ -97,24 +149,10 @@ func (b *Batch) init(sync bool) {
 	b.sync = sync
 }
 
-func (b *Batch) put(key, value []byte, seq uint64) {
-	if b.rLen == 0 {
-		b.seq = seq
-	}
-	b.Put(key, value)
-}
-
-func (b *Batch) delete(key []byte, seq uint64) {
-	if b.rLen == 0 {
-		b.seq = seq
-	}
-	b.Delete(key)
-}
-
 func (b *Batch) append(p *Batch) {
 	if p.rLen > 0 {
-		b.grow(len(p.buf) - kBatchHdrLen)
-		b.buf = append(b.buf, p.buf[kBatchHdrLen:]...)
+		b.grow(len(p.data) - batchHdrLen)
+		b.data = append(b.data, p.data[batchHdrLen:]...)
 		b.rLen += p.rLen
 	}
 	if p.sync {
@@ -122,95 +160,93 @@ func (b *Batch) append(p *Batch) {
 	}
 }
 
-func (b *Batch) len() int {
-	return b.rLen
-}
-
+// size returns sums of key/value pair length plus 8-bytes ikey.
 func (b *Batch) size() int {
 	return b.bLen
 }
 
 func (b *Batch) encode() []byte {
 	b.grow(0)
-	binary.LittleEndian.PutUint64(b.buf, b.seq)
-	binary.LittleEndian.PutUint32(b.buf[8:], uint32(b.rLen))
+	binary.LittleEndian.PutUint64(b.data, b.seq)
+	binary.LittleEndian.PutUint32(b.data[8:], uint32(b.rLen))
 
-	return b.buf
+	return b.data
 }
 
-func (b *Batch) decode(buf []byte) error {
-	if len(buf) < kBatchHdrLen {
-		return errBatchTooShort
+func (b *Batch) decode(prevSeq uint64, data []byte) error {
+	if len(data) < batchHdrLen {
+		return newErrBatchCorrupted("too short")
 	}
 
-	b.seq = binary.LittleEndian.Uint64(buf)
-	b.rLen = int(binary.LittleEndian.Uint32(buf[8:]))
+	b.seq = binary.LittleEndian.Uint64(data)
+	if b.seq < prevSeq {
+		return newErrBatchCorrupted("invalid sequence number")
+	}
+	b.rLen = int(binary.LittleEndian.Uint32(data[8:]))
+	if b.rLen < 0 {
+		return newErrBatchCorrupted("invalid records length")
+	}
 	// No need to be precise at this point, it won't be used anyway
-	b.bLen = len(buf) - kBatchHdrLen
-	b.buf = buf
+	b.bLen = len(data) - batchHdrLen
+	b.data = data
 
 	return nil
 }
 
-func (b *Batch) decodeRec(f func(i int, t vType, key, value []byte)) error {
-	off := kBatchHdrLen
+func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error) {
+	off := batchHdrLen
 	for i := 0; i < b.rLen; i++ {
-		if off >= len(b.buf) {
-			return errors.New("leveldb: invalid batch record length")
+		if off >= len(b.data) {
+			return newErrBatchCorrupted("invalid records length")
 		}
 
-		t := vType(b.buf[off])
-		if t > tVal {
-			return errors.New("leveldb: invalid batch record type in batch")
+		kt := kType(b.data[off])
+		if kt > ktVal {
+			return newErrBatchCorrupted("bad record: invalid type")
 		}
 		off += 1
 
-		x, n := binary.Uvarint(b.buf[off:])
+		x, n := binary.Uvarint(b.data[off:])
 		off += n
-		if n <= 0 || off+int(x) > len(b.buf) {
-			return errBatchBadRecord
+		if n <= 0 || off+int(x) > len(b.data) {
+			return newErrBatchCorrupted("bad record: invalid key length")
 		}
-		key := b.buf[off : off+int(x)]
+		key := b.data[off : off+int(x)]
 		off += int(x)
-
 		var value []byte
-		if t == tVal {
-			x, n := binary.Uvarint(b.buf[off:])
+		if kt == ktVal {
+			x, n := binary.Uvarint(b.data[off:])
 			off += n
-			if n <= 0 || off+int(x) > len(b.buf) {
-				return errBatchBadRecord
+			if n <= 0 || off+int(x) > len(b.data) {
+				return newErrBatchCorrupted("bad record: invalid value length")
 			}
-			value = b.buf[off : off+int(x)]
+			value = b.data[off : off+int(x)]
 			off += int(x)
 		}
 
-		f(i, t, key, value)
+		f(i, kt, key, value)
 	}
 
 	return nil
 }
 
-func (b *Batch) replay(to batchReplay) error {
-	return b.decodeRec(func(i int, t vType, key, value []byte) {
-		switch t {
-		case tVal:
-			to.put(key, value, b.seq+uint64(i))
-		case tDel:
-			to.delete(key, b.seq+uint64(i))
-		}
-	})
-}
-
 func (b *Batch) memReplay(to *memdb.DB) error {
-	return b.decodeRec(func(i int, t vType, key, value []byte) {
-		ikey := newIKey(key, b.seq+uint64(i), t)
+	return b.decodeRec(func(i int, kt kType, key, value []byte) {
+		ikey := newIkey(key, b.seq+uint64(i), kt)
 		to.Put(ikey, value)
 	})
 }
 
+func (b *Batch) memDecodeAndReplay(prevSeq uint64, data []byte, to *memdb.DB) error {
+	if err := b.decode(prevSeq, data); err != nil {
+		return err
+	}
+	return b.memReplay(to)
+}
+
 func (b *Batch) revertMemReplay(to *memdb.DB) error {
-	return b.decodeRec(func(i int, t vType, key, value []byte) {
-		ikey := newIKey(key, b.seq+uint64(i), t)
+	return b.decodeRec(func(i int, kt kType, key, value []byte) {
+		ikey := newIkey(key, b.seq+uint64(i), kt)
 		to.Delete(ikey)
 	})
 }

+ 13 - 13
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go

@@ -15,7 +15,7 @@ import (
 )
 
 type tbRec struct {
-	t          vType
+	kt         kType
 	key, value []byte
 }
 
@@ -23,39 +23,39 @@ type testBatch struct {
 	rec []*tbRec
 }
 
-func (p *testBatch) put(key, value []byte, seq uint64) {
-	p.rec = append(p.rec, &tbRec{tVal, key, value})
+func (p *testBatch) Put(key, value []byte) {
+	p.rec = append(p.rec, &tbRec{ktVal, key, value})
 }
 
-func (p *testBatch) delete(key []byte, seq uint64) {
-	p.rec = append(p.rec, &tbRec{tDel, key, nil})
+func (p *testBatch) Delete(key []byte) {
+	p.rec = append(p.rec, &tbRec{ktDel, key, nil})
 }
 
 func compareBatch(t *testing.T, b1, b2 *Batch) {
 	if b1.seq != b2.seq {
 		t.Errorf("invalid seq number want %d, got %d", b1.seq, b2.seq)
 	}
-	if b1.len() != b2.len() {
-		t.Fatalf("invalid record length want %d, got %d", b1.len(), b2.len())
+	if b1.Len() != b2.Len() {
+		t.Fatalf("invalid record length want %d, got %d", b1.Len(), b2.Len())
 	}
 	p1, p2 := new(testBatch), new(testBatch)
-	err := b1.replay(p1)
+	err := b1.Replay(p1)
 	if err != nil {
 		t.Fatal("error when replaying batch 1: ", err)
 	}
-	err = b2.replay(p2)
+	err = b2.Replay(p2)
 	if err != nil {
 		t.Fatal("error when replaying batch 2: ", err)
 	}
 	for i := range p1.rec {
 		r1, r2 := p1.rec[i], p2.rec[i]
-		if r1.t != r2.t {
-			t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.t, r2.t)
+		if r1.kt != r2.kt {
+			t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.kt, r2.kt)
 		}
 		if !bytes.Equal(r1.key, r2.key) {
 			t.Errorf("invalid key on record '%d' want %s, got %s", i, string(r1.key), string(r2.key))
 		}
-		if r1.t == tVal {
+		if r1.kt == ktVal {
 			if !bytes.Equal(r1.value, r2.value) {
 				t.Errorf("invalid value on record '%d' want %s, got %s", i, string(r1.value), string(r2.value))
 			}
@@ -75,7 +75,7 @@ func TestBatch_EncodeDecode(t *testing.T) {
 	b1.Delete([]byte("k"))
 	buf := b1.encode()
 	b2 := new(Batch)
-	err := b2.decode(buf)
+	err := b2.decode(0, buf)
 	if err != nil {
 		t.Error("error when decoding batch: ", err)
 	}

+ 2 - 2
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go

@@ -249,7 +249,7 @@ func (x *testingCacheObject) Release() {
 		x.releaseCalled = true
 		x.cnt.releaseOne()
 	} else {
-		x.t.Errorf("duplicate setfin NS#%d KEY#%s", x.ns, x.key)
+		x.t.Errorf("duplicate setfin NS#%d KEY#%d", x.ns, x.key)
 	}
 }
 
@@ -489,7 +489,7 @@ func TestLRUCache_Finalizer(t *testing.T) {
 			return true
 		} else {
 			if p.delfinCalled != keymax {
-				t.Errorf("(2) #%d not all delete fin called, diff=%d", p.ns, keymax-p.delfinCalled)
+				t.Errorf("(2) NS#%d not all delete fin called, diff=%d", p.nsid, keymax-p.delfinCalled)
 			}
 			return false
 		}

+ 0 - 40
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go

@@ -1,40 +0,0 @@
-// Copyright (c) 2012, Suryandaru Triandana <[email protected]>
-// All rights reserved.
-//
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-package leveldb
-
-const (
-	kNumLevels = 7
-
-	// Level-0 compaction is started when we hit this many files.
-	kL0_CompactionTrigger float64 = 4
-
-	// Soft limit on number of level-0 files.  We slow down writes at this point.
-	kL0_SlowdownWritesTrigger = 8
-
-	// Maximum number of level-0 files.  We stop writes at this point.
-	kL0_StopWritesTrigger = 12
-
-	// Maximum level to which a new compacted memdb is pushed if it
-	// does not create overlap.  We try to push to level 2 to avoid the
-	// relatively expensive level 0=>1 compactions and to avoid some
-	// expensive manifest file operations.  We do not push all the way to
-	// the largest level since that can generate a lot of wasted disk
-	// space if the same key space is being repeatedly overwritten.
-	kMaxMemCompactLevel = 2
-
-	// Maximum size of a table.
-	kMaxTableSize = 2 * 1048576
-
-	// Maximum bytes of overlaps in grandparent (i.e., level+2) before we
-	// stop building a single file in a level->level+1 compaction.
-	kMaxGrandParentOverlapBytes = 10 * kMaxTableSize
-
-	// Maximum number of bytes in all compacted files.  We avoid expanding
-	// the lower level file set of a compaction if it would make the
-	// total compaction cover more than this many bytes.
-	kExpCompactionMaxBytes = 25 * kMaxTableSize
-)

+ 97 - 58
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go

@@ -8,7 +8,6 @@ package leveldb
 
 import (
 	"container/list"
-	"errors"
 	"fmt"
 	"io"
 	"os"
@@ -18,6 +17,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/journal"
 	"github.com/syndtr/goleveldb/leveldb/memdb"
@@ -57,18 +57,19 @@ type DB struct {
 	writeMergedC chan bool
 	writeLockC   chan struct{}
 	writeAckC    chan error
+	writeDelay   time.Duration
+	writeDelayN  int
 	journalC     chan *Batch
 	journalAckC  chan error
 
 	// Compaction.
-	tcompCmdC     chan cCmd
-	tcompPauseC   chan chan<- struct{}
-	tcompTriggerC chan struct{}
-	mcompCmdC     chan cCmd
-	mcompTriggerC chan struct{}
-	compErrC      chan error
-	compErrSetC   chan error
-	compStats     [kNumLevels]cStats
+	tcompCmdC   chan cCmd
+	tcompPauseC chan chan<- struct{}
+	mcompCmdC   chan cCmd
+	compErrC    chan error
+	compPerErrC chan error
+	compErrSetC chan error
+	compStats   []cStats
 
 	// Close.
 	closeW sync.WaitGroup
@@ -83,7 +84,7 @@ func openDB(s *session) (*DB, error) {
 	db := &DB{
 		s: s,
 		// Initial sequence
-		seq: s.stSeq,
+		seq: s.stSeqNum,
 		// MemDB
 		memPool: make(chan *memdb.DB, 1),
 		// Snapshot
@@ -96,13 +97,13 @@ func openDB(s *session) (*DB, error) {
 		journalC:     make(chan *Batch),
 		journalAckC:  make(chan error),
 		// Compaction
-		tcompCmdC:     make(chan cCmd),
-		tcompPauseC:   make(chan chan<- struct{}),
-		tcompTriggerC: make(chan struct{}, 1),
-		mcompCmdC:     make(chan cCmd),
-		mcompTriggerC: make(chan struct{}, 1),
-		compErrC:      make(chan error),
-		compErrSetC:   make(chan error),
+		tcompCmdC:   make(chan cCmd),
+		tcompPauseC: make(chan chan<- struct{}),
+		mcompCmdC:   make(chan cCmd),
+		compErrC:    make(chan error),
+		compPerErrC: make(chan error),
+		compErrSetC: make(chan error),
+		compStats:   make([]cStats, s.o.GetNumLevel()),
 		// Close
 		closeC: make(chan struct{}),
 	}
@@ -121,14 +122,14 @@ func openDB(s *session) (*DB, error) {
 		return nil, err
 	}
 
-	// Don't include compaction error goroutine into wait group.
+	// Doesn't need to be included in the wait group.
 	go db.compactionError()
+	go db.mpoolDrain()
 
 	db.closeW.Add(3)
 	go db.tCompaction()
 	go db.mCompaction()
 	go db.jWriter()
-	go db.mpoolDrain()
 
 	s.logf("db@open done T·%v", time.Since(start))
 
@@ -255,6 +256,10 @@ func RecoverFile(path string, o *opt.Options) (db *DB, err error) {
 }
 
 func recoverTable(s *session, o *opt.Options) error {
+	o = dupOptions(o)
+	// Mask StrictReader, lets StrictRecovery doing its job.
+	o.Strict &= ^opt.StrictReader
+
 	// Get all tables and sort it by file number.
 	tableFiles_, err := s.getFiles(storage.TypeTable)
 	if err != nil {
@@ -263,10 +268,16 @@ func recoverTable(s *session, o *opt.Options) error {
 	tableFiles := files(tableFiles_)
 	tableFiles.sort()
 
-	var mSeq uint64
-	var good, corrupted int
-	rec := new(sessionRecord)
-	bpool := util.NewBufferPool(o.GetBlockSize() + 5)
+	var (
+		mSeq                                                              uint64
+		recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int
+
+		// We will drop corrupted table.
+		strict = o.GetStrict(opt.StrictRecovery)
+
+		rec   = &sessionRecord{numLevel: o.GetNumLevel()}
+		bpool = util.NewBufferPool(o.GetBlockSize() + 5)
+	)
 	buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) {
 		tmp = s.newTemp()
 		writer, err := tmp.Create()
@@ -321,25 +332,32 @@ func recoverTable(s *session, o *opt.Options) error {
 			return err
 		}
 
-		var tSeq uint64
-		var tgood, tcorrupted, blockerr int
-		var imin, imax []byte
-		tr := table.NewReader(reader, size, nil, bpool, o)
+		var (
+			tSeq                                     uint64
+			tgoodKey, tcorruptedKey, tcorruptedBlock int
+			imin, imax                               []byte
+		)
+		tr, err := table.NewReader(reader, size, storage.NewFileInfo(file), nil, bpool, o)
+		if err != nil {
+			return err
+		}
 		iter := tr.NewIterator(nil, nil)
 		iter.(iterator.ErrorCallbackSetter).SetErrorCallback(func(err error) {
-			s.logf("table@recovery found error @%d %q", file.Num(), err)
-			blockerr++
+			if errors.IsCorrupted(err) {
+				s.logf("table@recovery block corruption @%d %q", file.Num(), err)
+				tcorruptedBlock++
+			}
 		})
 
 		// Scan the table.
 		for iter.Next() {
 			key := iter.Key()
-			_, seq, _, ok := parseIkey(key)
-			if !ok {
-				tcorrupted++
+			_, seq, _, kerr := parseIkey(key)
+			if kerr != nil {
+				tcorruptedKey++
 				continue
 			}
-			tgood++
+			tgoodKey++
 			if seq > tSeq {
 				tSeq = seq
 			}
@@ -354,8 +372,18 @@ func recoverTable(s *session, o *opt.Options) error {
 		}
 		iter.Release()
 
-		if tgood > 0 {
-			if tcorrupted > 0 || blockerr > 0 {
+		goodKey += tgoodKey
+		corruptedKey += tcorruptedKey
+		corruptedBlock += tcorruptedBlock
+
+		if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) {
+			droppedTable++
+			s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
+			return nil
+		}
+
+		if tgoodKey > 0 {
+			if tcorruptedKey > 0 || tcorruptedBlock > 0 {
 				// Rebuild the table.
 				s.logf("table@recovery rebuilding @%d", file.Num())
 				iter := tr.NewIterator(nil, nil)
@@ -373,16 +401,15 @@ func recoverTable(s *session, o *opt.Options) error {
 			if tSeq > mSeq {
 				mSeq = tSeq
 			}
+			recoveredKey += tgoodKey
 			// Add table to level 0.
 			rec.addTable(0, file.Num(), uint64(size), imin, imax)
-			s.logf("table@recovery recovered @%d N·%d C·%d B·%d S·%d Q·%d", file.Num(), tgood, tcorrupted, blockerr, size, tSeq)
+			s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq)
 		} else {
-			s.logf("table@recovery unrecoverable @%d C·%d B·%d S·%d", file.Num(), tcorrupted, blockerr, size)
+			droppedTable++
+			s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", file.Num(), tcorruptedKey, tcorruptedBlock, size)
 		}
 
-		good += tgood
-		corrupted += tcorrupted
-
 		return nil
 	}
 
@@ -399,11 +426,11 @@ func recoverTable(s *session, o *opt.Options) error {
 			}
 		}
 
-		s.logf("table@recovery recovered F·%d N·%d C·%d Q·%d", len(tableFiles), good, corrupted, mSeq)
+		s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(tableFiles), recoveredKey, goodKey, corruptedKey, mSeq)
 	}
 
 	// Set sequence number.
-	rec.setSeq(mSeq + 1)
+	rec.setSeqNum(mSeq + 1)
 
 	// Create new manifest.
 	if err := s.create(); err != nil {
@@ -486,26 +513,30 @@ func (db *DB) recoverJournal() error {
 				if err == io.EOF {
 					break
 				}
-				return err
+				return errors.SetFile(err, file)
 			}
 
 			buf.Reset()
 			if _, err := buf.ReadFrom(r); err != nil {
 				if err == io.ErrUnexpectedEOF {
+					// This is error returned due to corruption, with strict == false.
 					continue
 				} else {
-					return err
+					return errors.SetFile(err, file)
 				}
 			}
-			if err := batch.decode(buf.Bytes()); err != nil {
-				return err
-			}
-			if err := batch.memReplay(mem); err != nil {
-				return err
+			if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mem); err != nil {
+				if strict || !errors.IsCorrupted(err) {
+					return errors.SetFile(err, file)
+				} else {
+					db.s.logf("journal error: %v (skipped)", err)
+					// We won't apply sequence number as it might be corrupted.
+					continue
+				}
 			}
 
 			// Save sequence number.
-			db.seq = batch.seq + uint64(batch.len())
+			db.seq = batch.seq + uint64(batch.Len())
 
 			// Flush it if large enough.
 			if mem.Size() >= writeBuffer {
@@ -566,7 +597,7 @@ func (db *DB) recoverJournal() error {
 }
 
 func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) {
-	ikey := newIKey(key, seq, tSeek)
+	ikey := newIkey(key, seq, ktSeek)
 
 	em, fm := db.getMems()
 	for _, m := range [...]*memDB{em, fm} {
@@ -577,9 +608,13 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er
 
 		mk, mv, me := m.mdb.Find(ikey)
 		if me == nil {
-			ukey, _, t, ok := parseIkey(mk)
-			if ok && db.s.icmp.uCompare(ukey, key) == 0 {
-				if t == tDel {
+			ukey, _, kt, kerr := parseIkey(mk)
+			if kerr != nil {
+				// Shouldn't have had happen.
+				panic(kerr)
+			}
+			if db.s.icmp.uCompare(ukey, key) == 0 {
+				if kt == ktDel {
 					return nil, ErrNotFound
 				}
 				return append([]byte{}, mv...), nil
@@ -594,7 +629,7 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er
 	v.release()
 	if cSched {
 		// Trigger table compaction.
-		db.compTrigger(db.tcompTriggerC)
+		db.compSendTrigger(db.tcompCmdC)
 	}
 	return
 }
@@ -697,7 +732,7 @@ func (db *DB) GetProperty(name string) (value string, err error) {
 		var level uint
 		var rest string
 		n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest)
-		if n != 1 || level >= kNumLevels {
+		if n != 1 || int(level) >= db.s.o.GetNumLevel() {
 			err = errors.New("leveldb: GetProperty: invalid property: " + name)
 		} else {
 			value = fmt.Sprint(v.tLen(int(level)))
@@ -759,8 +794,8 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) {
 
 	sizes := make(Sizes, 0, len(ranges))
 	for _, r := range ranges {
-		imin := newIKey(r.Start, kMaxSeq, tSeek)
-		imax := newIKey(r.Limit, kMaxSeq, tSeek)
+		imin := newIkey(r.Start, kMaxSeq, ktSeek)
+		imax := newIkey(r.Limit, kMaxSeq, ktSeek)
 		start, err := v.offsetOf(imin)
 		if err != nil {
 			return nil, err
@@ -816,6 +851,10 @@ func (db *DB) Close() error {
 		db.journalWriter.Close()
 	}
 
+	if db.writeDelayN > 0 {
+		db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
+	}
+
 	// Close session.
 	db.s.close()
 	db.logf("db@close done T·%v", time.Since(start))

+ 363 - 228
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go

@@ -7,11 +7,12 @@
 package leveldb
 
 import (
-	"errors"
 	"sync"
 	"time"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/memdb"
+	"github.com/syndtr/goleveldb/leveldb/opt"
 )
 
 var (
@@ -68,7 +69,7 @@ type cMem struct {
 }
 
 func newCMem(s *session) *cMem {
-	return &cMem{s: s, rec: new(sessionRecord)}
+	return &cMem{s: s, rec: &sessionRecord{numLevel: s.o.GetNumLevel()}}
 }
 
 func (c *cMem) flush(mem *memdb.DB, level int) error {
@@ -84,7 +85,9 @@ func (c *cMem) flush(mem *memdb.DB, level int) error {
 
 	// Pick level.
 	if level < 0 {
-		level = s.version_NB().pickLevel(t.imin.ukey(), t.imax.ukey())
+		v := s.version()
+		level = v.pickLevel(t.imin.ukey(), t.imax.ukey())
+		v.release()
 	}
 	c.rec.addTableFile(level, t)
 
@@ -95,24 +98,32 @@ func (c *cMem) flush(mem *memdb.DB, level int) error {
 }
 
 func (c *cMem) reset() {
-	c.rec = new(sessionRecord)
+	c.rec = &sessionRecord{numLevel: c.s.o.GetNumLevel()}
 }
 
 func (c *cMem) commit(journal, seq uint64) error {
 	c.rec.setJournalNum(journal)
-	c.rec.setSeq(seq)
+	c.rec.setSeqNum(seq)
 
 	// Commit changes.
 	return c.s.commit(c.rec)
 }
 
 func (db *DB) compactionError() {
-	var err error
+	var (
+		err     error
+		wlocked bool
+	)
 noerr:
+	// No error.
 	for {
 		select {
 		case err = <-db.compErrSetC:
-			if err != nil {
+			switch {
+			case err == nil:
+			case errors.IsCorrupted(err):
+				goto hasperr
+			default:
 				goto haserr
 			}
 		case _, _ = <-db.closeC:
@@ -120,17 +131,39 @@ noerr:
 		}
 	}
 haserr:
+	// Transient error.
 	for {
 		select {
 		case db.compErrC <- err:
 		case err = <-db.compErrSetC:
-			if err == nil {
+			switch {
+			case err == nil:
 				goto noerr
+			case errors.IsCorrupted(err):
+				goto hasperr
+			default:
 			}
 		case _, _ = <-db.closeC:
 			return
 		}
 	}
+hasperr:
+	// Persistent error.
+	for {
+		select {
+		case db.compErrC <- err:
+		case db.compPerErrC <- err:
+		case db.writeLockC <- struct{}{}:
+			// Hold write lock, so that write won't pass-through.
+			wlocked = true
+		case _, _ = <-db.closeC:
+			if wlocked {
+				// We should release the lock or Close will hang.
+				<-db.writeLockC
+			}
+			return
+		}
+	}
 }
 
 type compactionTransactCounter int
@@ -139,12 +172,17 @@ func (cnt *compactionTransactCounter) incr() {
 	*cnt++
 }
 
-func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactCounter) error, rollback func() error) {
+type compactionTransactInterface interface {
+	run(cnt *compactionTransactCounter) error
+	revert() error
+}
+
+func (db *DB) compactionTransact(name string, t compactionTransactInterface) {
 	defer func() {
 		if x := recover(); x != nil {
-			if x == errCompactionTransactExiting && rollback != nil {
-				if err := rollback(); err != nil {
-					db.logf("%s rollback error %q", name, err)
+			if x == errCompactionTransactExiting {
+				if err := t.revert(); err != nil {
+					db.logf("%s revert error %q", name, err)
 				}
 			}
 			panic(x)
@@ -156,9 +194,13 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
 		backoffMax = 8 * time.Second
 		backoffMul = 2 * time.Second
 	)
-	backoff := backoffMin
-	backoffT := time.NewTimer(backoff)
-	lastCnt := compactionTransactCounter(0)
+	var (
+		backoff  = backoffMin
+		backoffT = time.NewTimer(backoff)
+		lastCnt  = compactionTransactCounter(0)
+
+		disableBackoff = db.s.o.GetDisableCompactionBackoff()
+	)
 	for n := 0; ; n++ {
 		// Check wether the DB is closed.
 		if db.isClosed() {
@@ -170,11 +212,19 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
 
 		// Execute.
 		cnt := compactionTransactCounter(0)
-		err := exec(&cnt)
+		err := t.run(&cnt)
+		if err != nil {
+			db.logf("%s error I·%d %q", name, cnt, err)
+		}
 
 		// Set compaction error status.
 		select {
 		case db.compErrSetC <- err:
+		case perr := <-db.compPerErrC:
+			if err != nil {
+				db.logf("%s exiting (persistent error %q)", name, perr)
+				db.compactionExitTransact()
+			}
 		case _, _ = <-db.closeC:
 			db.logf("%s exiting", name)
 			db.compactionExitTransact()
@@ -182,31 +232,56 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC
 		if err == nil {
 			return
 		}
-		db.logf("%s error I·%d %q", name, cnt, err)
-
-		// Reset backoff duration if counter is advancing.
-		if cnt > lastCnt {
-			backoff = backoffMin
-			lastCnt = cnt
+		if errors.IsCorrupted(err) {
+			db.logf("%s exiting (corruption detected)", name)
+			db.compactionExitTransact()
 		}
 
-		// Backoff.
-		backoffT.Reset(backoff)
-		if backoff < backoffMax {
-			backoff *= backoffMul
-			if backoff > backoffMax {
-				backoff = backoffMax
+		if !disableBackoff {
+			// Reset backoff duration if counter is advancing.
+			if cnt > lastCnt {
+				backoff = backoffMin
+				lastCnt = cnt
+			}
+
+			// Backoff.
+			backoffT.Reset(backoff)
+			if backoff < backoffMax {
+				backoff *= backoffMul
+				if backoff > backoffMax {
+					backoff = backoffMax
+				}
+			}
+			select {
+			case <-backoffT.C:
+			case _, _ = <-db.closeC:
+				db.logf("%s exiting", name)
+				db.compactionExitTransact()
 			}
-		}
-		select {
-		case <-backoffT.C:
-		case _, _ = <-db.closeC:
-			db.logf("%s exiting", name)
-			db.compactionExitTransact()
 		}
 	}
 }
 
+type compactionTransactFunc struct {
+	runFunc    func(cnt *compactionTransactCounter) error
+	revertFunc func() error
+}
+
+func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error {
+	return t.runFunc(cnt)
+}
+
+func (t *compactionTransactFunc) revert() error {
+	if t.revertFunc != nil {
+		return t.revertFunc()
+	}
+	return nil
+}
+
+func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) {
+	db.compactionTransact(name, &compactionTransactFunc{run, revert})
+}
+
 func (db *DB) compactionExitTransact() {
 	panic(errCompactionTransactExiting)
 }
@@ -232,20 +307,23 @@ func (db *DB) memCompaction() {
 	}
 
 	// Pause table compaction.
-	ch := make(chan struct{})
+	resumeC := make(chan struct{})
 	select {
-	case db.tcompPauseC <- (chan<- struct{})(ch):
+	case db.tcompPauseC <- (chan<- struct{})(resumeC):
+	case <-db.compPerErrC:
+		close(resumeC)
+		resumeC = nil
 	case _, _ = <-db.closeC:
 		return
 	}
 
-	db.compactionTransact("mem@flush", func(cnt *compactionTransactCounter) (err error) {
+	db.compactionTransactFunc("mem@flush", func(cnt *compactionTransactCounter) (err error) {
 		stats.startTimer()
 		defer stats.stopTimer()
 		return c.flush(mem.mdb, -1)
 	}, func() error {
 		for _, r := range c.rec.addedTables {
-			db.logf("mem@flush rollback @%d", r.num)
+			db.logf("mem@flush revert @%d", r.num)
 			f := db.s.getTableFile(r.num)
 			if err := f.Remove(); err != nil {
 				return err
@@ -254,7 +332,7 @@ func (db *DB) memCompaction() {
 		return nil
 	})
 
-	db.compactionTransact("mem@commit", func(cnt *compactionTransactCounter) (err error) {
+	db.compactionTransactFunc("mem@commit", func(cnt *compactionTransactCounter) (err error) {
 		stats.startTimer()
 		defer stats.stopTimer()
 		return c.commit(db.journalFile.Num(), db.frozenSeq)
@@ -271,214 +349,261 @@ func (db *DB) memCompaction() {
 	db.dropFrozenMem()
 
 	// Resume table compaction.
-	select {
-	case <-ch:
-	case _, _ = <-db.closeC:
-		return
+	if resumeC != nil {
+		select {
+		case <-resumeC:
+			close(resumeC)
+		case _, _ = <-db.closeC:
+			return
+		}
 	}
 
 	// Trigger table compaction.
-	db.compTrigger(db.mcompTriggerC)
+	db.compSendTrigger(db.tcompCmdC)
 }
 
-func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
-	rec := new(sessionRecord)
-	rec.addCompactionPointer(c.level, c.imax)
+type tableCompactionBuilder struct {
+	db           *DB
+	s            *session
+	c            *compaction
+	rec          *sessionRecord
+	stat0, stat1 *cStatsStaging
 
-	if !noTrivial && c.trivial() {
-		t := c.tables[0][0]
-		db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1)
-		rec.deleteTable(c.level, t.file.Num())
-		rec.addTableFile(c.level+1, t)
-		db.compactionTransact("table@move", func(cnt *compactionTransactCounter) (err error) {
-			return db.s.commit(rec)
-		}, nil)
-		return
-	}
+	snapHasLastUkey bool
+	snapLastUkey    []byte
+	snapLastSeq     uint64
+	snapIter        int
+	snapKerrCnt     int
+	snapDropCnt     int
 
-	var stats [2]cStatsStaging
-	for i, tables := range c.tables {
-		for _, t := range tables {
-			stats[i].read += t.size
-			// Insert deleted tables into record
-			rec.deleteTable(c.level+i, t.file.Num())
-		}
-	}
-	sourceSize := int(stats[0].read + stats[1].read)
-	minSeq := db.minSeq()
-	db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq)
+	kerrCnt int
+	dropCnt int
 
-	var snapUkey []byte
-	var snapHasUkey bool
-	var snapSeq uint64
-	var snapIter int
-	var snapDropCnt int
-	var dropCnt int
-	db.compactionTransact("table@build", func(cnt *compactionTransactCounter) (err error) {
-		ukey := append([]byte{}, snapUkey...)
-		hasUkey := snapHasUkey
-		lseq := snapSeq
-		dropCnt = snapDropCnt
-		snapSched := snapIter == 0
-
-		var tw *tWriter
-		finish := func() error {
-			t, err := tw.finish()
-			if err != nil {
-				return err
+	minSeq    uint64
+	strict    bool
+	tableSize int
+
+	tw *tWriter
+}
+
+func (b *tableCompactionBuilder) appendKV(key, value []byte) error {
+	// Create new table if not already.
+	if b.tw == nil {
+		// Check for pause event.
+		if b.db != nil {
+			select {
+			case ch := <-b.db.tcompPauseC:
+				b.db.pauseCompaction(ch)
+			case _, _ = <-b.db.closeC:
+				b.db.compactionExitTransact()
+			default:
 			}
-			rec.addTableFile(c.level+1, t)
-			stats[1].write += t.size
-			db.logf("table@build created L%d@%d N·%d S·%s %q:%q", c.level+1, t.file.Num(), tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
-			return nil
 		}
 
-		defer func() {
-			stats[1].stopTimer()
-			if tw != nil {
-				tw.drop()
-				tw = nil
-			}
-		}()
+		// Create new table.
+		var err error
+		b.tw, err = b.s.tops.create()
+		if err != nil {
+			return err
+		}
+	}
 
-		stats[1].startTimer()
-		iter := c.newIterator()
-		defer iter.Release()
-		for i := 0; iter.Next(); i++ {
-			// Incr transact counter.
-			cnt.incr()
-
-			// Skip until last state.
-			if i < snapIter {
-				continue
-			}
+	// Write key/value into table.
+	return b.tw.append(key, value)
+}
 
-			ikey := iKey(iter.Key())
+func (b *tableCompactionBuilder) needFlush() bool {
+	return b.tw.tw.BytesLen() >= b.tableSize
+}
 
-			if c.shouldStopBefore(ikey) && tw != nil {
-				err = finish()
-				if err != nil {
-					return
-				}
-				snapSched = true
-				tw = nil
-			}
+func (b *tableCompactionBuilder) flush() error {
+	t, err := b.tw.finish()
+	if err != nil {
+		return err
+	}
+	b.rec.addTableFile(b.c.level+1, t)
+	b.stat1.write += t.size
+	b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.level+1, t.file.Num(), b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
+	b.tw = nil
+	return nil
+}
 
-			// Scheduled for snapshot, snapshot will used to retry compaction
-			// if error occured.
-			if snapSched {
-				snapUkey = append(snapUkey[:0], ukey...)
-				snapHasUkey = hasUkey
-				snapSeq = lseq
-				snapIter = i
-				snapDropCnt = dropCnt
-				snapSched = false
-			}
+func (b *tableCompactionBuilder) cleanup() {
+	if b.tw != nil {
+		b.tw.drop()
+		b.tw = nil
+	}
+}
 
-			if seq, vt, ok := ikey.parseNum(); !ok {
-				// Don't drop error keys
-				ukey = ukey[:0]
-				hasUkey = false
-				lseq = kMaxSeq
-			} else {
-				if !hasUkey || db.s.icmp.uCompare(ikey.ukey(), ukey) != 0 {
-					// First occurrence of this user key
-					ukey = append(ukey[:0], ikey.ukey()...)
-					hasUkey = true
-					lseq = kMaxSeq
-				}
+func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
+	snapResumed := b.snapIter > 0
+	hasLastUkey := b.snapHasLastUkey // The key might have zero length, so this is necessary.
+	lastUkey := append([]byte{}, b.snapLastUkey...)
+	lastSeq := b.snapLastSeq
+	b.kerrCnt = b.snapKerrCnt
+	b.dropCnt = b.snapDropCnt
+	// Restore compaction state.
+	b.c.restore()
 
-				drop := false
-				if lseq <= minSeq {
-					// Dropped because newer entry for same user key exist
-					drop = true // (A)
-				} else if vt == tDel && seq <= minSeq && c.baseLevelForKey(ukey) {
-					// For this user key:
-					// (1) there is no data in higher levels
-					// (2) data in lower levels will have larger seq numbers
-					// (3) data in layers that are being compacted here and have
-					//     smaller seq numbers will be dropped in the next
-					//     few iterations of this loop (by rule (A) above).
-					// Therefore this deletion marker is obsolete and can be dropped.
-					drop = true
-				}
+	defer b.cleanup()
 
-				lseq = seq
-				if drop {
-					dropCnt++
-					continue
-				}
-			}
+	b.stat1.startTimer()
+	defer b.stat1.stopTimer()
 
-			// Create new table if not already
-			if tw == nil {
-				// Check for pause event.
-				select {
-				case ch := <-db.tcompPauseC:
-					db.pauseCompaction(ch)
-				case _, _ = <-db.closeC:
-					db.compactionExitTransact()
-				default:
-				}
+	iter := b.c.newIterator()
+	defer iter.Release()
+	for i := 0; iter.Next(); i++ {
+		// Incr transact counter.
+		cnt.incr()
+
+		// Skip until last state.
+		if i < b.snapIter {
+			continue
+		}
+
+		resumed := false
+		if snapResumed {
+			resumed = true
+			snapResumed = false
+		}
 
-				// Create new table.
-				tw, err = db.s.tops.create()
-				if err != nil {
-					return
+		ikey := iter.Key()
+		ukey, seq, kt, kerr := parseIkey(ikey)
+
+		if kerr == nil {
+			shouldStop := !resumed && b.c.shouldStopBefore(ikey)
+
+			if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 {
+				// First occurrence of this user key.
+
+				// Only rotate tables if ukey doesn't hop across.
+				if b.tw != nil && (shouldStop || b.needFlush()) {
+					if err := b.flush(); err != nil {
+						return err
+					}
+
+					// Creates snapshot of the state.
+					b.c.save()
+					b.snapHasLastUkey = hasLastUkey
+					b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...)
+					b.snapLastSeq = lastSeq
+					b.snapIter = i
+					b.snapKerrCnt = b.kerrCnt
+					b.snapDropCnt = b.dropCnt
 				}
-			}
 
-			// Write key/value into table
-			err = tw.append(ikey, iter.Value())
-			if err != nil {
-				return
+				hasLastUkey = true
+				lastUkey = append(lastUkey[:0], ukey...)
+				lastSeq = kMaxSeq
 			}
 
-			// Finish table if it is big enough
-			if tw.tw.BytesLen() >= kMaxTableSize {
-				err = finish()
-				if err != nil {
-					return
-				}
-				snapSched = true
-				tw = nil
+			switch {
+			case lastSeq <= b.minSeq:
+				// Dropped because newer entry for same user key exists
+				fallthrough // (A)
+			case kt == ktDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey):
+				// For this user key:
+				// (1) there is no data in higher levels
+				// (2) data in lower levels will have larger seq numbers
+				// (3) data in layers that are being compacted here and have
+				//     smaller seq numbers will be dropped in the next
+				//     few iterations of this loop (by rule (A) above).
+				// Therefore this deletion marker is obsolete and can be dropped.
+				lastSeq = seq
+				b.dropCnt++
+				continue
+			default:
+				lastSeq = seq
+			}
+		} else {
+			if b.strict {
+				return kerr
 			}
+
+			// Don't drop corrupted keys.
+			hasLastUkey = false
+			lastUkey = lastUkey[:0]
+			lastSeq = kMaxSeq
+			b.kerrCnt++
 		}
 
-		err = iter.Error()
-		if err != nil {
-			return
+		if err := b.appendKV(ikey, iter.Value()); err != nil {
+			return err
 		}
+	}
 
-		// Finish last table
-		if tw != nil && !tw.empty() {
-			err = finish()
-			if err != nil {
-				return
-			}
-			tw = nil
+	if err := iter.Error(); err != nil {
+		return err
+	}
+
+	// Finish last table.
+	if b.tw != nil && !b.tw.empty() {
+		return b.flush()
+	}
+	return nil
+}
+
+func (b *tableCompactionBuilder) revert() error {
+	for _, at := range b.rec.addedTables {
+		b.s.logf("table@build revert @%d", at.num)
+		f := b.s.getTableFile(at.num)
+		if err := f.Remove(); err != nil {
+			return err
 		}
+	}
+	return nil
+}
+
+func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
+	defer c.release()
+
+	rec := &sessionRecord{numLevel: db.s.o.GetNumLevel()}
+	rec.addCompPtr(c.level, c.imax)
+
+	if !noTrivial && c.trivial() {
+		t := c.tables[0][0]
+		db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1)
+		rec.delTable(c.level, t.file.Num())
+		rec.addTableFile(c.level+1, t)
+		db.compactionTransactFunc("table@move", func(cnt *compactionTransactCounter) (err error) {
+			return db.s.commit(rec)
+		}, nil)
 		return
-	}, func() error {
-		for _, r := range rec.addedTables {
-			db.logf("table@build rollback @%d", r.num)
-			f := db.s.getTableFile(r.num)
-			if err := f.Remove(); err != nil {
-				return err
-			}
+	}
+
+	var stats [2]cStatsStaging
+	for i, tables := range c.tables {
+		for _, t := range tables {
+			stats[i].read += t.size
+			// Insert deleted tables into record
+			rec.delTable(c.level+i, t.file.Num())
 		}
-		return nil
-	})
+	}
+	sourceSize := int(stats[0].read + stats[1].read)
+	minSeq := db.minSeq()
+	db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq)
+
+	b := &tableCompactionBuilder{
+		db:        db,
+		s:         db.s,
+		c:         c,
+		rec:       rec,
+		stat1:     &stats[1],
+		minSeq:    minSeq,
+		strict:    db.s.o.GetStrict(opt.StrictCompaction),
+		tableSize: db.s.o.GetCompactionTableSize(c.level + 1),
+	}
+	db.compactionTransact("table@build", b)
 
 	// Commit changes
-	db.compactionTransact("table@commit", func(cnt *compactionTransactCounter) (err error) {
+	db.compactionTransactFunc("table@commit", func(cnt *compactionTransactCounter) (err error) {
 		stats[1].startTimer()
 		defer stats[1].stopTimer()
 		return db.s.commit(rec)
 	}, nil)
 
 	resultSize := int(stats[1].write)
-	db.logf("table@compaction committed F%s S%s D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), dropCnt, stats[1].duration)
+	db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration)
 
 	// Save compaction stats
 	for i := range stats {
@@ -494,14 +619,14 @@ func (db *DB) tableRangeCompaction(level int, umin, umax []byte) {
 			db.tableCompaction(c, true)
 		}
 	} else {
-		v := db.s.version_NB()
-
+		v := db.s.version()
 		m := 1
 		for i, t := range v.tables[1:] {
 			if t.overlaps(db.s.icmp, umin, umax, false) {
 				m = i + 1
 			}
 		}
+		v.release()
 
 		for level := 0; level < m; level++ {
 			if c := db.s.getCompactionRange(level, umin, umax); c != nil {
@@ -518,7 +643,9 @@ func (db *DB) tableAutoCompaction() {
 }
 
 func (db *DB) tableNeedCompaction() bool {
-	return db.s.version_NB().needCompaction()
+	v := db.s.version()
+	defer v.release()
+	return v.needCompaction()
 }
 
 func (db *DB) pauseCompaction(ch chan<- struct{}) {
@@ -538,10 +665,12 @@ type cIdle struct {
 }
 
 func (r cIdle) ack(err error) {
-	defer func() {
-		recover()
-	}()
-	r.ackC <- err
+	if r.ackC != nil {
+		defer func() {
+			recover()
+		}()
+		r.ackC <- err
+	}
 }
 
 type cRange struct {
@@ -559,6 +688,7 @@ func (r cRange) ack(err error) {
 	}
 }
 
+// This will trigger auto compaction and/or wait for all compaction to be done.
 func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
 	ch := make(chan error)
 	defer close(ch)
@@ -580,6 +710,15 @@ func (db *DB) compSendIdle(compC chan<- cCmd) (err error) {
 	return err
 }
 
+// This will trigger auto compaction but will not wait for it.
+func (db *DB) compSendTrigger(compC chan<- cCmd) {
+	select {
+	case compC <- cIdle{}:
+	default:
+	}
+}
+
+// Send range compaction request.
 func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) {
 	ch := make(chan error)
 	defer close(ch)
@@ -601,13 +740,6 @@ func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err
 	return err
 }
 
-func (db *DB) compTrigger(compTriggerC chan struct{}) {
-	select {
-	case compTriggerC <- struct{}{}:
-	default:
-	}
-}
-
 func (db *DB) mCompaction() {
 	var x cCmd
 
@@ -626,11 +758,14 @@ func (db *DB) mCompaction() {
 	for {
 		select {
 		case x = <-db.mcompCmdC:
-			db.memCompaction()
-			x.ack(nil)
-			x = nil
-		case <-db.mcompTriggerC:
-			db.memCompaction()
+			switch x.(type) {
+			case cIdle:
+				db.memCompaction()
+				x.ack(nil)
+				x = nil
+			default:
+				panic("leveldb: unknown command")
+			}
 		case _, _ = <-db.closeC:
 			return
 		}
@@ -661,7 +796,6 @@ func (db *DB) tCompaction() {
 		if db.tableNeedCompaction() {
 			select {
 			case x = <-db.tcompCmdC:
-			case <-db.tcompTriggerC:
 			case ch := <-db.tcompPauseC:
 				db.pauseCompaction(ch)
 				continue
@@ -677,7 +811,6 @@ func (db *DB) tCompaction() {
 			ackQ = ackQ[:0]
 			select {
 			case x = <-db.tcompCmdC:
-			case <-db.tcompTriggerC:
 			case ch := <-db.tcompPauseC:
 				db.pauseCompaction(ch)
 				continue
@@ -692,6 +825,8 @@ func (db *DB) tCompaction() {
 			case cRange:
 				db.tableRangeCompaction(cmd.level, cmd.min, cmd.max)
 				x.ack(nil)
+			default:
+				panic("leveldb: unknown command")
 			}
 			x = nil
 		}

+ 16 - 18
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go

@@ -48,7 +48,8 @@ func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It
 		i = append(i, fmi)
 	}
 	i = append(i, ti...)
-	mi := iterator.NewMergedIterator(i, db.s.icmp, true)
+	strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader)
+	mi := iterator.NewMergedIterator(i, db.s.icmp, strict)
 	mi.SetReleaser(&versionReleaser{v: v})
 	return mi
 }
@@ -58,10 +59,10 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d
 	if slice != nil {
 		islice = &util.Range{}
 		if slice.Start != nil {
-			islice.Start = newIKey(slice.Start, kMaxSeq, tSeek)
+			islice.Start = newIkey(slice.Start, kMaxSeq, ktSeek)
 		}
 		if slice.Limit != nil {
-			islice.Limit = newIKey(slice.Limit, kMaxSeq, tSeek)
+			islice.Limit = newIkey(slice.Limit, kMaxSeq, ktSeek)
 		}
 	}
 	rawIter := db.newRawIterator(islice, ro)
@@ -70,7 +71,7 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d
 		icmp:   db.s.icmp,
 		iter:   rawIter,
 		seq:    seq,
-		strict: db.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator),
+		strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader),
 		key:    make([]byte, 0),
 		value:  make([]byte, 0),
 	}
@@ -161,7 +162,7 @@ func (i *dbIter) Seek(key []byte) bool {
 		return false
 	}
 
-	ikey := newIKey(key, i.seq, tSeek)
+	ikey := newIkey(key, i.seq, ktSeek)
 	if i.iter.Seek(ikey) {
 		i.dir = dirSOI
 		return i.next()
@@ -173,15 +174,14 @@ func (i *dbIter) Seek(key []byte) bool {
 
 func (i *dbIter) next() bool {
 	for {
-		ukey, seq, t, ok := parseIkey(i.iter.Key())
-		if ok {
+		if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil {
 			if seq <= i.seq {
-				switch t {
-				case tDel:
+				switch kt {
+				case ktDel:
 					// Skip deleted key.
 					i.key = append(i.key[:0], ukey...)
 					i.dir = dirForward
-				case tVal:
+				case ktVal:
 					if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 {
 						i.key = append(i.key[:0], ukey...)
 						i.value = append(i.value[:0], i.iter.Value()...)
@@ -191,7 +191,7 @@ func (i *dbIter) next() bool {
 				}
 			}
 		} else if i.strict {
-			i.setErr(errInvalidIkey)
+			i.setErr(kerr)
 			break
 		}
 		if !i.iter.Next() {
@@ -224,20 +224,19 @@ func (i *dbIter) prev() bool {
 	del := true
 	if i.iter.Valid() {
 		for {
-			ukey, seq, t, ok := parseIkey(i.iter.Key())
-			if ok {
+			if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil {
 				if seq <= i.seq {
 					if !del && i.icmp.uCompare(ukey, i.key) < 0 {
 						return true
 					}
-					del = (t == tDel)
+					del = (kt == ktDel)
 					if !del {
 						i.key = append(i.key[:0], ukey...)
 						i.value = append(i.value[:0], i.iter.Value()...)
 					}
 				}
 			} else if i.strict {
-				i.setErr(errInvalidIkey)
+				i.setErr(kerr)
 				return false
 			}
 			if !i.iter.Prev() {
@@ -266,13 +265,12 @@ func (i *dbIter) Prev() bool {
 		return i.Last()
 	case dirForward:
 		for i.iter.Prev() {
-			ukey, _, _, ok := parseIkey(i.iter.Key())
-			if ok {
+			if ukey, _, _, kerr := parseIkey(i.iter.Key()); kerr == nil {
 				if i.icmp.uCompare(ukey, i.key) < 0 {
 					goto cont
 				}
 			} else if i.strict {
-				i.setErr(errInvalidIkey)
+				i.setErr(kerr)
 				return false
 			}
 		}

+ 431 - 31
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go

@@ -7,6 +7,7 @@
 package leveldb
 
 import (
+	"bytes"
 	"container/list"
 	crand "crypto/rand"
 	"encoding/binary"
@@ -23,6 +24,7 @@ import (
 	"unsafe"
 
 	"github.com/syndtr/goleveldb/leveldb/comparer"
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/filter"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/opt"
@@ -151,7 +153,10 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) {
 	t := h.t
 	db := h.db
 
-	var res uint64
+	var (
+		maxOverlaps uint64
+		maxLevel    int
+	)
 	v := db.s.version()
 	for i, tt := range v.tables[1 : len(v.tables)-1] {
 		level := i + 1
@@ -159,15 +164,18 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) {
 		for _, t := range tt {
 			r := next.getOverlaps(nil, db.s.icmp, t.imin.ukey(), t.imax.ukey(), false)
 			sum := r.size()
-			if sum > res {
-				res = sum
+			if sum > maxOverlaps {
+				maxOverlaps = sum
+				maxLevel = level
 			}
 		}
 	}
 	v.release()
 
-	if res > want {
-		t.Errorf("next level overlapping bytes is more than %d, got=%d", want, res)
+	if maxOverlaps > want {
+		t.Errorf("next level most overlapping bytes is more than %d, got=%d level=%d", want, maxOverlaps, maxLevel)
+	} else {
+		t.Logf("next level most overlapping bytes is %d, level=%d want=%d", maxOverlaps, maxLevel, want)
 	}
 }
 
@@ -240,7 +248,7 @@ func (h *dbHarness) allEntriesFor(key, want string) {
 	db := h.db
 	s := db.s
 
-	ikey := newIKey([]byte(key), kMaxSeq, tVal)
+	ikey := newIkey([]byte(key), kMaxSeq, ktVal)
 	iter := db.newRawIterator(nil, nil)
 	if !iter.Seek(ikey) && iter.Error() != nil {
 		t.Error("AllEntries: error during seek, err: ", iter.Error())
@@ -249,19 +257,18 @@ func (h *dbHarness) allEntriesFor(key, want string) {
 	res := "[ "
 	first := true
 	for iter.Valid() {
-		rkey := iKey(iter.Key())
-		if _, t, ok := rkey.parseNum(); ok {
-			if s.icmp.uCompare(ikey.ukey(), rkey.ukey()) != 0 {
+		if ukey, _, kt, kerr := parseIkey(iter.Key()); kerr == nil {
+			if s.icmp.uCompare(ikey.ukey(), ukey) != 0 {
 				break
 			}
 			if !first {
 				res += ", "
 			}
 			first = false
-			switch t {
-			case tVal:
+			switch kt {
+			case ktVal:
 				res += string(iter.Value())
-			case tDel:
+			case ktDel:
 				res += "DEL"
 			}
 		} else {
@@ -326,6 +333,8 @@ func (h *dbHarness) compactMem() {
 	t := h.t
 	db := h.db
 
+	t.Log("starting memdb compaction")
+
 	db.writeLockC <- struct{}{}
 	defer func() {
 		<-db.writeLockC
@@ -341,6 +350,8 @@ func (h *dbHarness) compactMem() {
 	if h.totalTables() == 0 {
 		t.Error("zero tables after mem compaction")
 	}
+
+	t.Log("memdb compaction done")
 }
 
 func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) {
@@ -355,6 +366,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool)
 		_max = []byte(max)
 	}
 
+	t.Logf("starting table range compaction: level=%d, min=%q, max=%q", level, min, max)
+
 	if err := db.compSendRange(db.tcompCmdC, level, _min, _max); err != nil {
 		if wanterr {
 			t.Log("CompactRangeAt: got error (expected): ", err)
@@ -364,6 +377,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool)
 	} else if wanterr {
 		t.Error("CompactRangeAt: expect error")
 	}
+
+	t.Log("table range compaction done")
 }
 
 func (h *dbHarness) compactRangeAt(level int, min, max string) {
@@ -374,6 +389,8 @@ func (h *dbHarness) compactRange(min, max string) {
 	t := h.t
 	db := h.db
 
+	t.Logf("starting DB range compaction: min=%q, max=%q", min, max)
+
 	var r util.Range
 	if min != "" {
 		r.Start = []byte(min)
@@ -384,6 +401,8 @@ func (h *dbHarness) compactRange(min, max string) {
 	if err := db.CompactRange(r); err != nil {
 		t.Error("CompactRange: got error: ", err)
 	}
+
+	t.Log("DB range compaction done")
 }
 
 func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) {
@@ -505,10 +524,10 @@ func Test_FieldsAligned(t *testing.T) {
 	p1 := new(DB)
 	testAligned(t, "DB.seq", unsafe.Offsetof(p1.seq))
 	p2 := new(session)
-	testAligned(t, "session.stFileNum", unsafe.Offsetof(p2.stFileNum))
+	testAligned(t, "session.stNextFileNum", unsafe.Offsetof(p2.stNextFileNum))
 	testAligned(t, "session.stJournalNum", unsafe.Offsetof(p2.stJournalNum))
 	testAligned(t, "session.stPrevJournalNum", unsafe.Offsetof(p2.stPrevJournalNum))
-	testAligned(t, "session.stSeq", unsafe.Offsetof(p2.stSeq))
+	testAligned(t, "session.stSeqNum", unsafe.Offsetof(p2.stSeqNum))
 }
 
 func TestDb_Locking(t *testing.T) {
@@ -944,7 +963,7 @@ func TestDb_RepeatedWritesToSameKey(t *testing.T) {
 	h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000})
 	defer h.close()
 
-	maxTables := kNumLevels + kL0_StopWritesTrigger
+	maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger()
 
 	value := strings.Repeat("v", 2*h.o.GetWriteBuffer())
 	for i := 0; i < 5*maxTables; i++ {
@@ -962,7 +981,7 @@ func TestDb_RepeatedWritesToSameKeyAfterReopen(t *testing.T) {
 
 	h.reopenDB()
 
-	maxTables := kNumLevels + kL0_StopWritesTrigger
+	maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger()
 
 	value := strings.Repeat("v", 2*h.o.GetWriteBuffer())
 	for i := 0; i < 5*maxTables; i++ {
@@ -978,7 +997,7 @@ func TestDb_SparseMerge(t *testing.T) {
 	h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression})
 	defer h.close()
 
-	h.putMulti(kNumLevels, "A", "Z")
+	h.putMulti(h.o.GetNumLevel(), "A", "Z")
 
 	// Suppose there is:
 	//    small amount of data with prefix A
@@ -1002,6 +1021,7 @@ func TestDb_SparseMerge(t *testing.T) {
 	h.put("C", "vc2")
 	h.compactMem()
 
+	h.waitCompaction()
 	h.maxNextLevelOverlappingBytes(20 * 1048576)
 	h.compactRangeAt(0, "", "")
 	h.waitCompaction()
@@ -1172,7 +1192,7 @@ func TestDb_HiddenValuesAreRemoved(t *testing.T) {
 
 		h.put("foo", "v1")
 		h.compactMem()
-		m := kMaxMemCompactLevel
+		m := h.o.GetMaxMemCompationLevel()
 		v := s.version()
 		num := v.tLen(m)
 		v.release()
@@ -1216,7 +1236,7 @@ func TestDb_DeletionMarkers2(t *testing.T) {
 
 	h.put("foo", "v1")
 	h.compactMem()
-	m := kMaxMemCompactLevel
+	m := h.o.GetMaxMemCompationLevel()
 	v := s.version()
 	num := v.tLen(m)
 	v.release()
@@ -1269,14 +1289,14 @@ func TestDb_CompactionTableOpenError(t *testing.T) {
 		t.Errorf("total tables is %d, want %d", n, im)
 	}
 
-	h.stor.SetOpenErr(storage.TypeTable)
+	h.stor.SetEmuErr(storage.TypeTable, tsOpOpen)
 	go h.db.CompactRange(util.Range{})
 	if err := h.db.compSendIdle(h.db.tcompCmdC); err != nil {
 		t.Log("compaction error: ", err)
 	}
 	h.closeDB0()
 	h.openDB()
-	h.stor.SetOpenErr(0)
+	h.stor.SetEmuErr(0, tsOpOpen)
 
 	for i := 0; i < im; i++ {
 		for j := 0; j < jm; j++ {
@@ -1287,7 +1307,7 @@ func TestDb_CompactionTableOpenError(t *testing.T) {
 
 func TestDb_OverlapInLevel0(t *testing.T) {
 	trun(t, func(h *dbHarness) {
-		if kMaxMemCompactLevel != 2 {
+		if h.o.GetMaxMemCompationLevel() != 2 {
 			t.Fatal("fix test to reflect the config")
 		}
 
@@ -1407,23 +1427,23 @@ func TestDb_ManifestWriteError(t *testing.T) {
 			h.compactMem()
 			h.getVal("foo", "bar")
 			v := h.db.s.version()
-			if n := v.tLen(kMaxMemCompactLevel); n != 1 {
+			if n := v.tLen(h.o.GetMaxMemCompationLevel()); n != 1 {
 				t.Errorf("invalid total tables, want=1 got=%d", n)
 			}
 			v.release()
 
 			if i == 0 {
-				h.stor.SetWriteErr(storage.TypeManifest)
+				h.stor.SetEmuErr(storage.TypeManifest, tsOpWrite)
 			} else {
-				h.stor.SetSyncErr(storage.TypeManifest)
+				h.stor.SetEmuErr(storage.TypeManifest, tsOpSync)
 			}
 
 			// Merging compaction (will fail)
-			h.compactRangeAtErr(kMaxMemCompactLevel, "", "", true)
+			h.compactRangeAtErr(h.o.GetMaxMemCompationLevel(), "", "", true)
 
 			h.db.Close()
-			h.stor.SetWriteErr(0)
-			h.stor.SetSyncErr(0)
+			h.stor.SetEmuErr(0, tsOpWrite)
+			h.stor.SetEmuErr(0, tsOpSync)
 
 			// Should not lose data
 			h.openDB()
@@ -1573,7 +1593,7 @@ func TestDb_ManualCompaction(t *testing.T) {
 	h := newDbHarness(t)
 	defer h.close()
 
-	if kMaxMemCompactLevel != 2 {
+	if h.o.GetMaxMemCompationLevel() != 2 {
 		t.Fatal("fix test to reflect the config")
 	}
 
@@ -1857,7 +1877,7 @@ func TestDb_DeletionMarkersOnMemdb(t *testing.T) {
 }
 
 func TestDb_LeveldbIssue178(t *testing.T) {
-	nKeys := (kMaxTableSize / 30) * 5
+	nKeys := (opt.DefaultCompactionTableSize / 30) * 5
 	key1 := func(i int) string {
 		return fmt.Sprintf("my_key_%d", i)
 	}
@@ -2125,7 +2145,7 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) {
 				}
 			}
 			if err := iter.Error(); err != nil {
-				t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, err)
+				t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, writei, err)
 			}
 			iter.Release()
 			snap.Release()
@@ -2164,5 +2184,385 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) {
 	}()
 
 	wg.Wait()
+}
+
+func TestDb_TransientError(t *testing.T) {
+	h := newDbHarnessWopt(t, &opt.Options{
+		WriteBuffer:              128 * opt.KiB,
+		CachedOpenFiles:          3,
+		DisableCompactionBackoff: true,
+	})
+	defer h.close()
+
+	const (
+		nSnap = 20
+		nKey  = 10000
+	)
+
+	var (
+		snaps [nSnap]*Snapshot
+		b     = &Batch{}
+	)
+	for i := range snaps {
+		vtail := fmt.Sprintf("VAL%030d", i)
+		b.Reset()
+		for k := 0; k < nKey; k++ {
+			key := fmt.Sprintf("KEY%8d", k)
+			b.Put([]byte(key), []byte(key+vtail))
+		}
+		h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt)
+		if err := h.db.Write(b, nil); err != nil {
+			t.Logf("WRITE #%d error: %v", i, err)
+			h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt, tsOpWrite)
+			for {
+				if err := h.db.Write(b, nil); err == nil {
+					break
+				} else if errors.IsCorrupted(err) {
+					t.Fatalf("WRITE #%d corrupted: %v", i, err)
+				}
+			}
+		}
+
+		snaps[i] = h.db.newSnapshot()
+		b.Reset()
+		for k := 0; k < nKey; k++ {
+			key := fmt.Sprintf("KEY%8d", k)
+			b.Delete([]byte(key))
+		}
+		h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt)
+		if err := h.db.Write(b, nil); err != nil {
+			t.Logf("WRITE #%d  error: %v", i, err)
+			h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt)
+			for {
+				if err := h.db.Write(b, nil); err == nil {
+					break
+				} else if errors.IsCorrupted(err) {
+					t.Fatalf("WRITE #%d corrupted: %v", i, err)
+				}
+			}
+		}
+	}
+	h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt)
+
+	runtime.GOMAXPROCS(runtime.NumCPU())
+
+	rnd := rand.New(rand.NewSource(0xecafdaed))
+	wg := &sync.WaitGroup{}
+	for i, snap := range snaps {
+		wg.Add(2)
+
+		go func(i int, snap *Snapshot, sk []int) {
+			defer wg.Done()
+
+			vtail := fmt.Sprintf("VAL%030d", i)
+			for _, k := range sk {
+				key := fmt.Sprintf("KEY%8d", k)
+				xvalue, err := snap.Get([]byte(key), nil)
+				if err != nil {
+					t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err)
+				}
+				value := key + vtail
+				if !bytes.Equal([]byte(value), xvalue) {
+					t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue)
+				}
+			}
+		}(i, snap, rnd.Perm(nKey))
+
+		go func(i int, snap *Snapshot) {
+			defer wg.Done()
+
+			vtail := fmt.Sprintf("VAL%030d", i)
+			iter := snap.NewIterator(nil, nil)
+			defer iter.Release()
+			for k := 0; k < nKey; k++ {
+				if !iter.Next() {
+					if err := iter.Error(); err != nil {
+						t.Fatalf("READER_ITER #%d K%d error: %v", i, k, err)
+					} else {
+						t.Fatalf("READER_ITER #%d K%d eoi", i, k)
+					}
+				}
+				key := fmt.Sprintf("KEY%8d", k)
+				xkey := iter.Key()
+				if !bytes.Equal([]byte(key), xkey) {
+					t.Fatalf("READER_ITER #%d K%d invalid key: want %q, got %q", i, k, key, xkey)
+				}
+				value := key + vtail
+				xvalue := iter.Value()
+				if !bytes.Equal([]byte(value), xvalue) {
+					t.Fatalf("READER_ITER #%d K%d invalid value: want %q, got %q", i, k, value, xvalue)
+				}
+			}
+		}(i, snap)
+	}
+
+	wg.Wait()
+}
 
+func TestDb_UkeyShouldntHopAcrossTable(t *testing.T) {
+	h := newDbHarnessWopt(t, &opt.Options{
+		WriteBuffer:                 112 * opt.KiB,
+		CompactionTableSize:         90 * opt.KiB,
+		CompactionExpandLimitFactor: 1,
+	})
+	defer h.close()
+
+	const (
+		nSnap = 190
+		nKey  = 140
+	)
+
+	var (
+		snaps [nSnap]*Snapshot
+		b     = &Batch{}
+	)
+	for i := range snaps {
+		vtail := fmt.Sprintf("VAL%030d", i)
+		b.Reset()
+		for k := 0; k < nKey; k++ {
+			key := fmt.Sprintf("KEY%08d", k)
+			b.Put([]byte(key), []byte(key+vtail))
+		}
+		if err := h.db.Write(b, nil); err != nil {
+			t.Fatalf("WRITE #%d error: %v", i, err)
+		}
+
+		snaps[i] = h.db.newSnapshot()
+		b.Reset()
+		for k := 0; k < nKey; k++ {
+			key := fmt.Sprintf("KEY%08d", k)
+			b.Delete([]byte(key))
+		}
+		if err := h.db.Write(b, nil); err != nil {
+			t.Fatalf("WRITE #%d  error: %v", i, err)
+		}
+	}
+
+	h.compactMem()
+
+	h.waitCompaction()
+	for level, tables := range h.db.s.stVersion.tables {
+		for _, table := range tables {
+			t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
+		}
+	}
+
+	h.compactRangeAt(0, "", "")
+	h.waitCompaction()
+	for level, tables := range h.db.s.stVersion.tables {
+		for _, table := range tables {
+			t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
+		}
+	}
+	h.compactRangeAt(1, "", "")
+	h.waitCompaction()
+	for level, tables := range h.db.s.stVersion.tables {
+		for _, table := range tables {
+			t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax)
+		}
+	}
+	runtime.GOMAXPROCS(runtime.NumCPU())
+
+	wg := &sync.WaitGroup{}
+	for i, snap := range snaps {
+		wg.Add(1)
+
+		go func(i int, snap *Snapshot) {
+			defer wg.Done()
+
+			vtail := fmt.Sprintf("VAL%030d", i)
+			for k := 0; k < nKey; k++ {
+				key := fmt.Sprintf("KEY%08d", k)
+				xvalue, err := snap.Get([]byte(key), nil)
+				if err != nil {
+					t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err)
+				}
+				value := key + vtail
+				if !bytes.Equal([]byte(value), xvalue) {
+					t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue)
+				}
+			}
+		}(i, snap)
+	}
+
+	wg.Wait()
+}
+
+func TestDb_TableCompactionBuilder(t *testing.T) {
+	stor := newTestStorage(t)
+	defer stor.Close()
+
+	const nSeq = 99
+
+	o := &opt.Options{
+		WriteBuffer:                 112 * opt.KiB,
+		CompactionTableSize:         43 * opt.KiB,
+		CompactionExpandLimitFactor: 1,
+		CompactionGPOverlapsFactor:  1,
+		BlockCache:                  opt.NoCache,
+	}
+	s, err := newSession(stor, o)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := s.create(); err != nil {
+		t.Fatal(err)
+	}
+	defer s.close()
+	var (
+		seq        uint64
+		targetSize = 5 * o.CompactionTableSize
+		value      = bytes.Repeat([]byte{'0'}, 100)
+	)
+	for i := 0; i < 2; i++ {
+		tw, err := s.tops.create()
+		if err != nil {
+			t.Fatal(err)
+		}
+		for k := 0; tw.tw.BytesLen() < targetSize; k++ {
+			key := []byte(fmt.Sprintf("%09d", k))
+			seq += nSeq - 1
+			for x := uint64(0); x < nSeq; x++ {
+				if err := tw.append(newIkey(key, seq-x, ktVal), value); err != nil {
+					t.Fatal(err)
+				}
+			}
+		}
+		tf, err := tw.finish()
+		if err != nil {
+			t.Fatal(err)
+		}
+		rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
+		rec.addTableFile(i, tf)
+		if err := s.commit(rec); err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	// Build grandparent.
+	v := s.version()
+	c := newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...))
+	rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
+	b := &tableCompactionBuilder{
+		s:         s,
+		c:         c,
+		rec:       rec,
+		stat1:     new(cStatsStaging),
+		minSeq:    0,
+		strict:    true,
+		tableSize: o.CompactionTableSize/3 + 961,
+	}
+	if err := b.run(new(compactionTransactCounter)); err != nil {
+		t.Fatal(err)
+	}
+	for _, t := range c.tables[0] {
+		rec.delTable(c.level, t.file.Num())
+	}
+	if err := s.commit(rec); err != nil {
+		t.Fatal(err)
+	}
+	c.release()
+
+	// Build level-1.
+	v = s.version()
+	c = newCompaction(s, v, 0, append(tFiles{}, v.tables[0]...))
+	rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
+	b = &tableCompactionBuilder{
+		s:         s,
+		c:         c,
+		rec:       rec,
+		stat1:     new(cStatsStaging),
+		minSeq:    0,
+		strict:    true,
+		tableSize: o.CompactionTableSize,
+	}
+	if err := b.run(new(compactionTransactCounter)); err != nil {
+		t.Fatal(err)
+	}
+	for _, t := range c.tables[0] {
+		rec.delTable(c.level, t.file.Num())
+	}
+	// Move grandparent to level-3
+	for _, t := range v.tables[2] {
+		rec.delTable(2, t.file.Num())
+		rec.addTableFile(3, t)
+	}
+	if err := s.commit(rec); err != nil {
+		t.Fatal(err)
+	}
+	c.release()
+
+	v = s.version()
+	for level, want := range []bool{false, true, false, true, false} {
+		got := len(v.tables[level]) > 0
+		if want != got {
+			t.Fatalf("invalid level-%d tables len: want %v, got %v", level, want, got)
+		}
+	}
+	for i, f := range v.tables[1][:len(v.tables[1])-1] {
+		nf := v.tables[1][i+1]
+		if bytes.Equal(f.imax.ukey(), nf.imin.ukey()) {
+			t.Fatalf("KEY %q hop across table %d .. %d", f.imax.ukey(), f.file.Num(), nf.file.Num())
+		}
+	}
+	v.release()
+
+	// Compaction with transient error.
+	v = s.version()
+	c = newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...))
+	rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
+	b = &tableCompactionBuilder{
+		s:         s,
+		c:         c,
+		rec:       rec,
+		stat1:     new(cStatsStaging),
+		minSeq:    0,
+		strict:    true,
+		tableSize: o.CompactionTableSize,
+	}
+	stor.SetEmuErrOnce(storage.TypeTable, tsOpSync)
+	stor.SetEmuRandErr(storage.TypeTable, tsOpRead, tsOpReadAt, tsOpWrite)
+	stor.SetEmuRandErrProb(0xf0)
+	for {
+		if err := b.run(new(compactionTransactCounter)); err != nil {
+			t.Logf("(expected) b.run: %v", err)
+		} else {
+			break
+		}
+	}
+	if err := s.commit(rec); err != nil {
+		t.Fatal(err)
+	}
+	c.release()
+
+	stor.SetEmuErrOnce(0, tsOpSync)
+	stor.SetEmuRandErr(0, tsOpRead, tsOpReadAt, tsOpWrite)
+
+	v = s.version()
+	if len(v.tables[1]) != len(v.tables[2]) {
+		t.Fatalf("invalid tables length, want %d, got %d", len(v.tables[1]), len(v.tables[2]))
+	}
+	for i, f0 := range v.tables[1] {
+		f1 := v.tables[2][i]
+		iter0 := s.tops.newIterator(f0, nil, nil)
+		iter1 := s.tops.newIterator(f1, nil, nil)
+		for j := 0; true; j++ {
+			next0 := iter0.Next()
+			next1 := iter1.Next()
+			if next0 != next1 {
+				t.Fatalf("#%d.%d invalid eoi: want %v, got %v", i, j, next0, next1)
+			}
+			key0 := iter0.Key()
+			key1 := iter1.Key()
+			if !bytes.Equal(key0, key1) {
+				t.Fatalf("#%d.%d invalid key: want %q, got %q", i, j, key0, key1)
+			}
+			if next0 == false {
+				break
+			}
+		}
+		iter0.Release()
+		iter1.Release()
+	}
+	v.release()
 }

+ 7 - 4
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go

@@ -7,8 +7,7 @@
 package leveldb
 
 import (
-	"errors"
-
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/opt"
 	"github.com/syndtr/goleveldb/leveldb/storage"
@@ -38,7 +37,9 @@ func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) }
 
 // Check and clean files.
 func (db *DB) checkAndCleanFiles() error {
-	v := db.s.version_NB()
+	v := db.s.version()
+	defer v.release()
+
 	tablesMap := make(map[uint64]bool)
 	for _, tables := range v.tables {
 		for _, t := range tables {
@@ -78,12 +79,14 @@ func (db *DB) checkAndCleanFiles() error {
 	}
 
 	if nTables != len(tablesMap) {
+		var missing []*storage.FileInfo
 		for num, present := range tablesMap {
 			if !present {
+				missing = append(missing, &storage.FileInfo{Type: storage.TypeTable, Num: num})
 				db.logf("db@janitor table missing @%d", num)
 			}
 		}
-		return ErrCorrupted{Type: MissingFiles, Err: errors.New("leveldb: table files missing")}
+		return errors.NewErrCorrupted(nil, &errors.ErrMissingFiles{Files: missing})
 	}
 
 	db.logf("db@janitor F·%d G·%d", len(files), len(rem))

+ 31 - 14
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go

@@ -59,7 +59,7 @@ func (db *DB) rotateMem(n int) (mem *memDB, err error) {
 	}
 
 	// Schedule memdb compaction.
-	db.compTrigger(db.mcompTriggerC)
+	db.compSendTrigger(db.mcompCmdC)
 	return
 }
 
@@ -77,12 +77,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
 		}()
 		nn = mem.mdb.Free()
 		switch {
-		case v.tLen(0) >= kL0_SlowdownWritesTrigger && !delayed:
+		case v.tLen(0) >= db.s.o.GetWriteL0SlowdownTrigger() && !delayed:
 			delayed = true
 			time.Sleep(time.Millisecond)
 		case nn >= n:
 			return false
-		case v.tLen(0) >= kL0_StopWritesTrigger:
+		case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger():
 			delayed = true
 			err = db.compSendIdle(db.tcompCmdC)
 			if err != nil {
@@ -109,7 +109,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
 	for flush() {
 	}
 	if delayed {
-		db.logf("db@write delayed T·%v", time.Since(start))
+		db.writeDelay += time.Since(start)
+		db.writeDelayN++
+	} else if db.writeDelayN > 0 {
+		db.writeDelay = 0
+		db.writeDelayN = 0
+		db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay)
 	}
 	return
 }
@@ -120,7 +125,7 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) {
 // It is safe to modify the contents of the arguments after Write returns.
 func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
 	err = db.ok()
-	if err != nil || b == nil || b.len() == 0 {
+	if err != nil || b == nil || b.Len() == 0 {
 		return
 	}
 
@@ -133,6 +138,8 @@ func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) {
 			return <-db.writeAckC
 		}
 	case db.writeLockC <- struct{}{}:
+	case err = <-db.compPerErrC:
+		return
 	case _, _ = <-db.closeC:
 		return ErrClosed
 	}
@@ -188,35 +195,43 @@ drain:
 	if b.size() >= (128 << 10) {
 		// Push the write batch to the journal writer
 		select {
+		case db.journalC <- b:
+			// Write into memdb
+			if berr := b.memReplay(mem.mdb); berr != nil {
+				panic(berr)
+			}
+		case err = <-db.compPerErrC:
+			return
 		case _, _ = <-db.closeC:
 			err = ErrClosed
 			return
-		case db.journalC <- b:
-			// Write into memdb
-			b.memReplay(mem.mdb)
 		}
 		// Wait for journal writer
 		select {
-		case _, _ = <-db.closeC:
-			err = ErrClosed
-			return
 		case err = <-db.journalAckC:
 			if err != nil {
 				// Revert memdb if error detected
-				b.revertMemReplay(mem.mdb)
+				if berr := b.revertMemReplay(mem.mdb); berr != nil {
+					panic(berr)
+				}
 				return
 			}
+		case _, _ = <-db.closeC:
+			err = ErrClosed
+			return
 		}
 	} else {
 		err = db.writeJournal(b)
 		if err != nil {
 			return
 		}
-		b.memReplay(mem.mdb)
+		if berr := b.memReplay(mem.mdb); berr != nil {
+			panic(berr)
+		}
 	}
 
 	// Set last seq number.
-	db.addSeq(uint64(b.len()))
+	db.addSeq(uint64(b.Len()))
 
 	if b.size() >= memFree {
 		db.rotateMem(0)
@@ -268,6 +283,8 @@ func (db *DB) CompactRange(r util.Range) error {
 	// Lock writer.
 	select {
 	case db.writeLockC <- struct{}{}:
+	case err := <-db.compPerErrC:
+		return err
 	case _, _ = <-db.closeC:
 		return ErrClosed
 	}

+ 2 - 22
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go → Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go

@@ -7,32 +7,12 @@
 package leveldb
 
 import (
-	"errors"
-
-	"github.com/syndtr/goleveldb/leveldb/util"
+	"github.com/syndtr/goleveldb/leveldb/errors"
 )
 
 var (
-	ErrNotFound         = util.ErrNotFound
+	ErrNotFound         = errors.ErrNotFound
 	ErrSnapshotReleased = errors.New("leveldb: snapshot released")
 	ErrIterReleased     = errors.New("leveldb: iterator released")
 	ErrClosed           = errors.New("leveldb: closed")
 )
-
-type CorruptionType int
-
-const (
-	CorruptedManifest CorruptionType = iota
-	MissingFiles
-)
-
-// ErrCorrupted is the type that wraps errors that indicate corruption in
-// the database.
-type ErrCorrupted struct {
-	Type CorruptionType
-	Err  error
-}
-
-func (e ErrCorrupted) Error() string {
-	return e.Err.Error()
-}

+ 76 - 0
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go

@@ -0,0 +1,76 @@
+// Copyright (c) 2014, Suryandaru Triandana <[email protected]>
+// All rights reserved.
+//
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// Package errors provides common error types used throughout leveldb.
+package errors
+
+import (
+	"errors"
+	"fmt"
+
+	"github.com/syndtr/goleveldb/leveldb/storage"
+	"github.com/syndtr/goleveldb/leveldb/util"
+)
+
+var (
+	ErrNotFound    = New("leveldb: not found")
+	ErrReleased    = util.ErrReleased
+	ErrHasReleaser = util.ErrHasReleaser
+)
+
+// New returns an error that formats as the given text.
+func New(text string) error {
+	return errors.New(text)
+}
+
+// ErrCorrupted is the type that wraps errors that indicate corruption in
+// the database.
+type ErrCorrupted struct {
+	File *storage.FileInfo
+	Err  error
+}
+
+func (e *ErrCorrupted) Error() string {
+	if e.File != nil {
+		return fmt.Sprintf("%v [file=%v]", e.Err, e.File)
+	} else {
+		return e.Err.Error()
+	}
+}
+
+// NewErrCorrupted creates new ErrCorrupted error.
+func NewErrCorrupted(f storage.File, err error) error {
+	return &ErrCorrupted{storage.NewFileInfo(f), err}
+}
+
+// IsCorrupted returns a boolean indicating whether the error indicates
+// a corruption.
+func IsCorrupted(err error) bool {
+	switch err.(type) {
+	case *ErrCorrupted:
+		return true
+	}
+	return false
+}
+
+// ErrMissingFiles is the type that indicates a corruption due to missing
+// files.
+type ErrMissingFiles struct {
+	Files []*storage.FileInfo
+}
+
+func (e *ErrMissingFiles) Error() string { return "file missing" }
+
+// SetFile sets 'file info' of the given error with the given file.
+// Currently only ErrCorrupted is supported, otherwise will do nothing.
+func SetFile(err error, f storage.File) error {
+	switch x := err.(type) {
+	case *ErrCorrupted:
+		x.File = storage.NewFileInfo(f)
+		return x
+	}
+	return err
+}

+ 14 - 23
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go

@@ -7,6 +7,7 @@
 package iterator
 
 import (
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
@@ -22,9 +23,8 @@ type IteratorIndexer interface {
 
 type indexedIterator struct {
 	util.BasicReleaser
-	index     IteratorIndexer
-	strict    bool
-	strictGet bool
+	index  IteratorIndexer
+	strict bool
 
 	data   Iterator
 	err    error
@@ -37,11 +37,6 @@ func (i *indexedIterator) setData() {
 		i.data.Release()
 	}
 	i.data = i.index.Get()
-	if i.strictGet {
-		if err := i.data.Error(); err != nil {
-			i.err = err
-		}
-	}
 }
 
 func (i *indexedIterator) clearData() {
@@ -61,13 +56,11 @@ func (i *indexedIterator) indexErr() {
 }
 
 func (i *indexedIterator) dataErr() bool {
-	if i.errf != nil {
-		if err := i.data.Error(); err != nil {
+	if err := i.data.Error(); err != nil {
+		if i.errf != nil {
 			i.errf(err)
 		}
-	}
-	if i.strict {
-		if err := i.data.Error(); err != nil {
+		if i.strict || !errors.IsCorrupted(err) {
 			i.err = err
 			return true
 		}
@@ -236,16 +229,14 @@ func (i *indexedIterator) SetErrorCallback(f func(err error)) {
 	i.errf = f
 }
 
-// NewIndexedIterator returns an indexed iterator. An index is iterator
-// that returns another iterator, a data iterator. A data iterator is the
+// NewIndexedIterator returns an 'indexed iterator'. An index is an iterator
+// that returns another iterator, a 'data iterator'. A 'data iterator' is the
 // iterator that contains actual key/value pairs.
 //
-// If strict is true then error yield by data iterator will halt the indexed
-// iterator, on contrary if strict is false then the indexed iterator will
-// ignore those error and move on to the next index. If strictGet is true and
-// index.Get() yield an 'error iterator' then the indexed iterator will be halted.
-// An 'error iterator' is iterator which its Error() method always return non-nil
-// even before any 'seeks method' is called.
-func NewIndexedIterator(index IteratorIndexer, strict, strictGet bool) Iterator {
-	return &indexedIterator{index: index, strict: strict, strictGet: strictGet}
+// If strict is true then any 'corruption errors' (i.e. errors.IsCorrupted(err) == true)
+// won't be ignored and will halt the 'indexed iterator', otherwise the iterator will
+// continue to the next 'data iterator'. Corruption on the 'index iterator' will not be
+// ignored and will halt the iterator.
+func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator {
+	return &indexedIterator{index: index, strict: strict}
 }

+ 1 - 1
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go

@@ -65,7 +65,7 @@ var _ = testutil.Defer(func() {
 					// Test the iterator.
 					t := testutil.IteratorTesting{
 						KeyValue: kv.Clone(),
-						Iter:     NewIndexedIterator(NewArrayIndexer(index), true, true),
+						Iter:     NewIndexedIterator(NewArrayIndexer(index), true),
 					}
 					testutil.DoIteratorTesting(&t)
 					done <- true

+ 7 - 8
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go

@@ -8,6 +8,7 @@ package iterator
 
 import (
 	"github.com/syndtr/goleveldb/leveldb/comparer"
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
@@ -42,13 +43,11 @@ func assertKey(key []byte) []byte {
 }
 
 func (i *mergedIterator) iterErr(iter Iterator) bool {
-	if i.errf != nil {
-		if err := iter.Error(); err != nil {
+	if err := iter.Error(); err != nil {
+		if i.errf != nil {
 			i.errf(err)
 		}
-	}
-	if i.strict {
-		if err := iter.Error(); err != nil {
+		if i.strict || !errors.IsCorrupted(err) {
 			i.err = err
 			return true
 		}
@@ -292,9 +291,9 @@ func (i *mergedIterator) SetErrorCallback(f func(err error)) {
 // keys: if iters[i] contains a key k then iters[j] will not contain that key k.
 // None of the iters may be nil.
 //
-// If strict is true then error yield by any iterators will halt the merged
-// iterator, on contrary if strict is false then the merged iterator will
-// ignore those error and move on to the next iterator.
+// If strict is true then any 'corruption errors' (i.e. errors.IsCorrupted(err) == true)
+// won't be ignored and will halt the 'merged iterator', otherwise the iterator will
+// continue to the next 'input iterator'.
 func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator {
 	return &mergedIterator{
 		iters:  iters,

+ 4 - 4
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go

@@ -79,10 +79,10 @@ package journal
 
 import (
 	"encoding/binary"
-	"errors"
 	"fmt"
 	"io"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
@@ -109,7 +109,7 @@ type ErrCorrupted struct {
 	Reason string
 }
 
-func (e ErrCorrupted) Error() string {
+func (e *ErrCorrupted) Error() string {
 	return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size)
 }
 
@@ -162,10 +162,10 @@ var errSkip = errors.New("leveldb/journal: skipped")
 
 func (r *Reader) corrupt(n int, reason string, skip bool) error {
 	if r.dropper != nil {
-		r.dropper.Drop(ErrCorrupted{n, reason})
+		r.dropper.Drop(&ErrCorrupted{n, reason})
 	}
 	if r.strict && !skip {
-		r.err = ErrCorrupted{n, reason}
+		r.err = errors.NewErrCorrupted(nil, &ErrCorrupted{n, reason})
 		return r.err
 	}
 	return errSkip

+ 68 - 65
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go

@@ -9,15 +9,30 @@ package leveldb
 import (
 	"encoding/binary"
 	"fmt"
+
+	"github.com/syndtr/goleveldb/leveldb/errors"
 )
 
-type vType int
+type ErrIkeyCorrupted struct {
+	Ikey   []byte
+	Reason string
+}
+
+func (e *ErrIkeyCorrupted) Error() string {
+	return fmt.Sprintf("leveldb: iKey %q corrupted: %s", e.Ikey, e.Reason)
+}
+
+func newErrIkeyCorrupted(ikey []byte, reason string) error {
+	return errors.NewErrCorrupted(nil, &ErrIkeyCorrupted{append([]byte{}, ikey...), reason})
+}
+
+type kType int
 
-func (t vType) String() string {
-	switch t {
-	case tDel:
+func (kt kType) String() string {
+	switch kt {
+	case ktDel:
 		return "d"
-	case tVal:
+	case ktVal:
 		return "v"
 	}
 	return "x"
@@ -26,16 +41,16 @@ func (t vType) String() string {
 // Value types encoded as the last component of internal keys.
 // Don't modify; this value are saved to disk.
 const (
-	tDel vType = iota
-	tVal
+	ktDel kType = iota
+	ktVal
 )
 
-// tSeek defines the vType that should be passed when constructing an
+// ktSeek defines the kType that should be passed when constructing an
 // internal key for seeking to a particular sequence number (since we
 // sort sequence numbers in decreasing order and the value type is
 // embedded as the low 8 bits in the sequence number in internal keys,
 // we need to use the highest-numbered ValueType, not the lowest).
-const tSeek = tVal
+const ktSeek = ktVal
 
 const (
 	// Maximum value possible for sequence number; the 8-bits are
@@ -43,7 +58,7 @@ const (
 	// 64-bit integer.
 	kMaxSeq uint64 = (uint64(1) << 56) - 1
 	// Maximum value possible for packed sequence number and type.
-	kMaxNum uint64 = (kMaxSeq << 8) | uint64(tSeek)
+	kMaxNum uint64 = (kMaxSeq << 8) | uint64(ktSeek)
 )
 
 // Maximum number encoded in bytes.
@@ -55,85 +70,73 @@ func init() {
 
 type iKey []byte
 
-func newIKey(ukey []byte, seq uint64, t vType) iKey {
-	if seq > kMaxSeq || t > tVal {
-		panic("invalid seq number or value type")
+func newIkey(ukey []byte, seq uint64, kt kType) iKey {
+	if seq > kMaxSeq {
+		panic("leveldb: invalid sequence number")
+	} else if kt > ktVal {
+		panic("leveldb: invalid type")
 	}
 
-	b := make(iKey, len(ukey)+8)
-	copy(b, ukey)
-	binary.LittleEndian.PutUint64(b[len(ukey):], (seq<<8)|uint64(t))
-	return b
+	ik := make(iKey, len(ukey)+8)
+	copy(ik, ukey)
+	binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt))
+	return ik
 }
 
-func parseIkey(p []byte) (ukey []byte, seq uint64, t vType, ok bool) {
-	if len(p) < 8 {
-		return
+func parseIkey(ik []byte) (ukey []byte, seq uint64, kt kType, err error) {
+	if len(ik) < 8 {
+		return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid length")
 	}
-	num := binary.LittleEndian.Uint64(p[len(p)-8:])
-	seq, t = uint64(num>>8), vType(num&0xff)
-	if t > tVal {
-		return
+	num := binary.LittleEndian.Uint64(ik[len(ik)-8:])
+	seq, kt = uint64(num>>8), kType(num&0xff)
+	if kt > ktVal {
+		return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid type")
 	}
-	ukey = p[:len(p)-8]
-	ok = true
+	ukey = ik[:len(ik)-8]
 	return
 }
 
-func validIkey(p []byte) bool {
-	_, _, _, ok := parseIkey(p)
-	return ok
+func validIkey(ik []byte) bool {
+	_, _, _, err := parseIkey(ik)
+	return err == nil
 }
 
-func (p iKey) assert() {
-	if p == nil {
-		panic("nil iKey")
+func (ik iKey) assert() {
+	if ik == nil {
+		panic("leveldb: nil iKey")
 	}
-	if len(p) < 8 {
-		panic(fmt.Sprintf("invalid iKey %q, len=%d", []byte(p), len(p)))
+	if len(ik) < 8 {
+		panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid length", ik, len(ik)))
 	}
 }
 
-func (p iKey) ok() bool {
-	if len(p) < 8 {
-		return false
-	}
-	_, _, ok := p.parseNum()
-	return ok
-}
-
-func (p iKey) ukey() []byte {
-	p.assert()
-	return p[:len(p)-8]
+func (ik iKey) ukey() []byte {
+	ik.assert()
+	return ik[:len(ik)-8]
 }
 
-func (p iKey) num() uint64 {
-	p.assert()
-	return binary.LittleEndian.Uint64(p[len(p)-8:])
+func (ik iKey) num() uint64 {
+	ik.assert()
+	return binary.LittleEndian.Uint64(ik[len(ik)-8:])
 }
 
-func (p iKey) parseNum() (seq uint64, t vType, ok bool) {
-	if p == nil {
-		panic("nil iKey")
+func (ik iKey) parseNum() (seq uint64, kt kType) {
+	num := ik.num()
+	seq, kt = uint64(num>>8), kType(num&0xff)
+	if kt > ktVal {
+		panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid type %#x", ik, len(ik), kt))
 	}
-	if len(p) < 8 {
-		return
-	}
-	num := p.num()
-	seq, t = uint64(num>>8), vType(num&0xff)
-	if t > tVal {
-		return 0, 0, false
-	}
-	ok = true
 	return
 }
 
-func (p iKey) String() string {
-	if len(p) == 0 {
+func (ik iKey) String() string {
+	if ik == nil {
 		return "<nil>"
 	}
-	if seq, t, ok := p.parseNum(); ok {
-		return fmt.Sprintf("%s,%s%d", shorten(string(p.ukey())), t, seq)
+
+	if ukey, seq, kt, err := parseIkey(ik); err == nil {
+		return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq)
+	} else {
+		return "<invalid>"
 	}
-	return "<invalid>"
 }

+ 52 - 42
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go

@@ -15,8 +15,8 @@ import (
 
 var defaultIComparer = &iComparer{comparer.DefaultComparer}
 
-func ikey(key string, seq uint64, t vType) iKey {
-	return newIKey([]byte(key), uint64(seq), t)
+func ikey(key string, seq uint64, kt kType) iKey {
+	return newIkey([]byte(key), uint64(seq), kt)
 }
 
 func shortSep(a, b []byte) []byte {
@@ -37,27 +37,37 @@ func shortSuccessor(b []byte) []byte {
 	return dst
 }
 
-func testSingleKey(t *testing.T, key string, seq uint64, vt vType) {
-	ik := ikey(key, seq, vt)
+func testSingleKey(t *testing.T, key string, seq uint64, kt kType) {
+	ik := ikey(key, seq, kt)
 
 	if !bytes.Equal(ik.ukey(), []byte(key)) {
 		t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key)
 	}
 
-	if rseq, rt, ok := ik.parseNum(); ok {
+	rseq, rt := ik.parseNum()
+	if rseq != seq {
+		t.Errorf("seq number does not equal, got %v, want %v", rseq, seq)
+	}
+	if rt != kt {
+		t.Errorf("type does not equal, got %v, want %v", rt, kt)
+	}
+
+	if rukey, rseq, rt, kerr := parseIkey(ik); kerr == nil {
+		if !bytes.Equal(rukey, []byte(key)) {
+			t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key)
+		}
 		if rseq != seq {
 			t.Errorf("seq number does not equal, got %v, want %v", rseq, seq)
 		}
-
-		if rt != vt {
-			t.Errorf("type does not equal, got %v, want %v", rt, vt)
+		if rt != kt {
+			t.Errorf("type does not equal, got %v, want %v", rt, kt)
 		}
 	} else {
-		t.Error("cannot parse seq and type")
+		t.Errorf("key error: %v", kerr)
 	}
 }
 
-func TestIKey_EncodeDecode(t *testing.T) {
+func TestIkey_EncodeDecode(t *testing.T) {
 	keys := []string{"", "k", "hello", "longggggggggggggggggggggg"}
 	seqs := []uint64{
 		1, 2, 3,
@@ -67,8 +77,8 @@ func TestIKey_EncodeDecode(t *testing.T) {
 	}
 	for _, key := range keys {
 		for _, seq := range seqs {
-			testSingleKey(t, key, seq, tVal)
-			testSingleKey(t, "hello", 1, tDel)
+			testSingleKey(t, key, seq, ktVal)
+			testSingleKey(t, "hello", 1, ktDel)
 		}
 	}
 }
@@ -79,45 +89,45 @@ func assertBytes(t *testing.T, want, got []byte) {
 	}
 }
 
-func TestIKeyShortSeparator(t *testing.T) {
+func TestIkeyShortSeparator(t *testing.T) {
 	// When user keys are same
-	assertBytes(t, ikey("foo", 100, tVal),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("foo", 99, tVal)))
-	assertBytes(t, ikey("foo", 100, tVal),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("foo", 101, tVal)))
-	assertBytes(t, ikey("foo", 100, tVal),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("foo", 100, tVal)))
-	assertBytes(t, ikey("foo", 100, tVal),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("foo", 100, tDel)))
+	assertBytes(t, ikey("foo", 100, ktVal),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("foo", 99, ktVal)))
+	assertBytes(t, ikey("foo", 100, ktVal),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("foo", 101, ktVal)))
+	assertBytes(t, ikey("foo", 100, ktVal),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("foo", 100, ktVal)))
+	assertBytes(t, ikey("foo", 100, ktVal),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("foo", 100, ktDel)))
 
 	// When user keys are misordered
-	assertBytes(t, ikey("foo", 100, tVal),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("bar", 99, tVal)))
+	assertBytes(t, ikey("foo", 100, ktVal),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("bar", 99, ktVal)))
 
 	// When user keys are different, but correctly ordered
-	assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("hello", 200, tVal)))
+	assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("hello", 200, ktVal)))
 
 	// When start user key is prefix of limit user key
-	assertBytes(t, ikey("foo", 100, tVal),
-		shortSep(ikey("foo", 100, tVal),
-			ikey("foobar", 200, tVal)))
+	assertBytes(t, ikey("foo", 100, ktVal),
+		shortSep(ikey("foo", 100, ktVal),
+			ikey("foobar", 200, ktVal)))
 
 	// When limit user key is prefix of start user key
-	assertBytes(t, ikey("foobar", 100, tVal),
-		shortSep(ikey("foobar", 100, tVal),
-			ikey("foo", 200, tVal)))
+	assertBytes(t, ikey("foobar", 100, ktVal),
+		shortSep(ikey("foobar", 100, ktVal),
+			ikey("foo", 200, ktVal)))
 }
 
-func TestIKeyShortestSuccessor(t *testing.T) {
-	assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek),
-		shortSuccessor(ikey("foo", 100, tVal)))
-	assertBytes(t, ikey("\xff\xff", 100, tVal),
-		shortSuccessor(ikey("\xff\xff", 100, tVal)))
+func TestIkeyShortestSuccessor(t *testing.T) {
+	assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek),
+		shortSuccessor(ikey("foo", 100, ktVal)))
+	assertBytes(t, ikey("\xff\xff", 100, ktVal),
+		shortSuccessor(ikey("\xff\xff", 100, ktVal)))
 }

+ 2 - 2
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go

@@ -8,17 +8,17 @@
 package memdb
 
 import (
-	"errors"
 	"math/rand"
 	"sync"
 
 	"github.com/syndtr/goleveldb/leveldb/comparer"
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
 var (
-	ErrNotFound     = util.ErrNotFound
+	ErrNotFound     = errors.ErrNotFound
 	ErrIterReleased = errors.New("leveldb/memdb: iterator released")
 )
 

+ 266 - 19
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go

@@ -11,6 +11,7 @@ import (
 	"github.com/syndtr/goleveldb/leveldb/cache"
 	"github.com/syndtr/goleveldb/leveldb/comparer"
 	"github.com/syndtr/goleveldb/leveldb/filter"
+	"math"
 )
 
 const (
@@ -20,12 +21,24 @@ const (
 )
 
 const (
-	DefaultBlockCacheSize       = 8 * MiB
-	DefaultBlockRestartInterval = 16
-	DefaultBlockSize            = 4 * KiB
-	DefaultCompressionType      = SnappyCompression
-	DefaultCachedOpenFiles      = 500
-	DefaultWriteBuffer          = 4 * MiB
+	DefaultBlockCacheSize                = 8 * MiB
+	DefaultBlockRestartInterval          = 16
+	DefaultBlockSize                     = 4 * KiB
+	DefaultCompactionExpandLimitFactor   = 25
+	DefaultCompactionGPOverlapsFactor    = 10
+	DefaultCompactionL0Trigger           = 4
+	DefaultCompactionSourceLimitFactor   = 1
+	DefaultCompactionTableSize           = 2 * MiB
+	DefaultCompactionTableSizeMultiplier = 1.0
+	DefaultCompactionTotalSize           = 10 * MiB
+	DefaultCompactionTotalSizeMultiplier = 10.0
+	DefaultCompressionType               = SnappyCompression
+	DefaultCachedOpenFiles               = 500
+	DefaultMaxMemCompationLevel          = 2
+	DefaultNumLevel                      = 7
+	DefaultWriteBuffer                   = 4 * MiB
+	DefaultWriteL0PauseTrigger           = 12
+	DefaultWriteL0SlowdownTrigger        = 8
 )
 
 type noCache struct{}
@@ -65,34 +78,47 @@ const (
 	nCompression
 )
 
-// Strict is the DB strict level.
+// Strict is the DB 'strict level'.
 type Strict uint
 
 const (
 	// If present then a corrupted or invalid chunk or block in manifest
-	// journal will cause an error istead of being dropped.
+	// journal will cause an error instead of being dropped.
+	// This will prevent database with corrupted manifest to be opened.
 	StrictManifest Strict = 1 << iota
 
-	// If present then a corrupted or invalid chunk or block in journal
-	// will cause an error istead of being dropped.
-	StrictJournal
-
 	// If present then journal chunk checksum will be verified.
 	StrictJournalChecksum
 
-	// If present then an invalid key/value pair will cause an error
-	// instead of being skipped.
-	StrictIterator
+	// If present then a corrupted or invalid chunk or block in journal
+	// will cause an error instead of being dropped.
+	// This will prevent database with corrupted journal to be opened.
+	StrictJournal
 
 	// If present then 'sorted table' block checksum will be verified.
+	// This has effect on both 'read operation' and compaction.
 	StrictBlockChecksum
 
+	// If present then a corrupted 'sorted table' will fails compaction.
+	// The database will enter read-only mode.
+	StrictCompaction
+
+	// If present then a corrupted 'sorted table' will halts 'read operation'.
+	StrictReader
+
+	// If present then leveldb.Recover will drop corrupted 'sorted table'.
+	StrictRecovery
+
+	// This only applicable for ReadOptions, if present then this ReadOptions
+	// 'strict level' will override global ones.
+	StrictOverride
+
 	// StrictAll enables all strict flags.
-	StrictAll = StrictManifest | StrictJournal | StrictJournalChecksum | StrictIterator | StrictBlockChecksum
+	StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader
 
 	// DefaultStrict is the default strict flags. Specify any strict flags
 	// will override default strict flags as whole (i.e. not OR'ed).
-	DefaultStrict = StrictJournalChecksum | StrictIterator | StrictBlockChecksum
+	DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader
 
 	// NoStrict disables all strict flags. Override default strict flags.
 	NoStrict = ^StrictAll
@@ -132,6 +158,73 @@ type Options struct {
 	// The default value is 500.
 	CachedOpenFiles int
 
+	// CompactionExpandLimitFactor limits compaction size after expanded.
+	// This will be multiplied by table size limit at compaction target level.
+	//
+	// The default value is 25.
+	CompactionExpandLimitFactor int
+
+	// CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a
+	// single 'sorted table' generates.
+	// This will be multiplied by table size limit at grandparent level.
+	//
+	// The default value is 10.
+	CompactionGPOverlapsFactor int
+
+	// CompactionL0Trigger defines number of 'sorted table' at level-0 that will
+	// trigger compaction.
+	//
+	// The default value is 4.
+	CompactionL0Trigger int
+
+	// CompactionSourceLimitFactor limits compaction source size. This doesn't apply to
+	// level-0.
+	// This will be multiplied by table size limit at compaction target level.
+	//
+	// The default value is 1.
+	CompactionSourceLimitFactor int
+
+	// CompactionTableSize limits size of 'sorted table' that compaction generates.
+	// The limits for each level will be calculated as:
+	//   CompactionTableSize * (CompactionTableSizeMultiplier ^ Level)
+	// The multiplier for each level can also fine-tuned using CompactionTableSizeMultiplierPerLevel.
+	//
+	// The default value is 2MiB.
+	CompactionTableSize int
+
+	// CompactionTableSizeMultiplier defines multiplier for CompactionTableSize.
+	//
+	// The default value is 1.
+	CompactionTableSizeMultiplier float64
+
+	// CompactionTableSizeMultiplierPerLevel defines per-level multiplier for
+	// CompactionTableSize.
+	// Use zero to skip a level.
+	//
+	// The default value is nil.
+	CompactionTableSizeMultiplierPerLevel []float64
+
+	// CompactionTotalSize limits total size of 'sorted table' for each level.
+	// The limits for each level will be calculated as:
+	//   CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level)
+	// The multiplier for each level can also fine-tuned using
+	// CompactionTotalSizeMultiplierPerLevel.
+	//
+	// The default value is 10MiB.
+	CompactionTotalSize int
+
+	// CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize.
+	//
+	// The default value is 10.
+	CompactionTotalSizeMultiplier float64
+
+	// CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for
+	// CompactionTotalSize.
+	// Use zero to skip a level.
+	//
+	// The default value is nil.
+	CompactionTotalSizeMultiplierPerLevel []float64
+
 	// Comparer defines a total ordering over the space of []byte keys: a 'less
 	// than' relationship. The same comparison algorithm must be used for reads
 	// and writes over the lifetime of the DB.
@@ -144,6 +237,11 @@ type Options struct {
 	// The default value (DefaultCompression) uses snappy compression.
 	Compression Compression
 
+	// DisableCompactionBackoff allows disable compaction retry backoff.
+	//
+	// The default value is false.
+	DisableCompactionBackoff bool
+
 	// ErrorIfExist defines whether an error should returned if the DB already
 	// exist.
 	//
@@ -172,6 +270,19 @@ type Options struct {
 	// The default value is nil.
 	Filter filter.Filter
 
+	// MaxMemCompationLevel defines maximum level a newly compacted 'memdb'
+	// will be pushed into if doesn't creates overlap. This should less than
+	// NumLevel. Use -1 for level-0.
+	//
+	// The default is 2.
+	MaxMemCompationLevel int
+
+	// NumLevel defines number of database level. The level shouldn't changed
+	// between opens, or the database will panic.
+	//
+	// The default is 7.
+	NumLevel int
+
 	// Strict defines the DB strict level.
 	Strict Strict
 
@@ -183,6 +294,18 @@ type Options struct {
 	//
 	// The default value is 4MiB.
 	WriteBuffer int
+
+	// WriteL0StopTrigger defines number of 'sorted table' at level-0 that will
+	// pause write.
+	//
+	// The default value is 12.
+	WriteL0PauseTrigger int
+
+	// WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that
+	// will trigger write slowdown.
+	//
+	// The default value is 8.
+	WriteL0SlowdownTrigger int
 }
 
 func (o *Options) GetAltFilters() []filter.Filter {
@@ -222,6 +345,79 @@ func (o *Options) GetCachedOpenFiles() int {
 	return o.CachedOpenFiles
 }
 
+func (o *Options) GetCompactionExpandLimit(level int) int {
+	factor := DefaultCompactionExpandLimitFactor
+	if o != nil && o.CompactionExpandLimitFactor > 0 {
+		factor = o.CompactionExpandLimitFactor
+	}
+	return o.GetCompactionTableSize(level+1) * factor
+}
+
+func (o *Options) GetCompactionGPOverlaps(level int) int {
+	factor := DefaultCompactionGPOverlapsFactor
+	if o != nil && o.CompactionGPOverlapsFactor > 0 {
+		factor = o.CompactionGPOverlapsFactor
+	}
+	return o.GetCompactionTableSize(level+2) * factor
+}
+
+func (o *Options) GetCompactionL0Trigger() int {
+	if o == nil || o.CompactionL0Trigger == 0 {
+		return DefaultCompactionL0Trigger
+	}
+	return o.CompactionL0Trigger
+}
+
+func (o *Options) GetCompactionSourceLimit(level int) int {
+	factor := DefaultCompactionSourceLimitFactor
+	if o != nil && o.CompactionSourceLimitFactor > 0 {
+		factor = o.CompactionSourceLimitFactor
+	}
+	return o.GetCompactionTableSize(level+1) * factor
+}
+
+func (o *Options) GetCompactionTableSize(level int) int {
+	var (
+		base = DefaultCompactionTableSize
+		mult float64
+	)
+	if o != nil {
+		if o.CompactionTableSize > 0 {
+			base = o.CompactionTableSize
+		}
+		if len(o.CompactionTableSizeMultiplierPerLevel) > level && o.CompactionTableSizeMultiplierPerLevel[level] > 0 {
+			mult = o.CompactionTableSizeMultiplierPerLevel[level]
+		} else if o.CompactionTableSizeMultiplier > 0 {
+			mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level))
+		}
+	}
+	if mult == 0 {
+		mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level))
+	}
+	return int(float64(base) * mult)
+}
+
+func (o *Options) GetCompactionTotalSize(level int) int64 {
+	var (
+		base = DefaultCompactionTotalSize
+		mult float64
+	)
+	if o != nil {
+		if o.CompactionTotalSize > 0 {
+			base = o.CompactionTotalSize
+		}
+		if len(o.CompactionTotalSizeMultiplierPerLevel) > level && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 {
+			mult = o.CompactionTotalSizeMultiplierPerLevel[level]
+		} else if o.CompactionTotalSizeMultiplier > 0 {
+			mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level))
+		}
+	}
+	if mult == 0 {
+		mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level))
+	}
+	return int64(float64(base) * mult)
+}
+
 func (o *Options) GetComparer() comparer.Comparer {
 	if o == nil || o.Comparer == nil {
 		return comparer.DefaultComparer
@@ -236,6 +432,13 @@ func (o *Options) GetCompression() Compression {
 	return o.Compression
 }
 
+func (o *Options) GetDisableCompactionBackoff() bool {
+	if o == nil {
+		return false
+	}
+	return o.DisableCompactionBackoff
+}
+
 func (o *Options) GetErrorIfExist() bool {
 	if o == nil {
 		return false
@@ -257,6 +460,28 @@ func (o *Options) GetFilter() filter.Filter {
 	return o.Filter
 }
 
+func (o *Options) GetMaxMemCompationLevel() int {
+	level := DefaultMaxMemCompationLevel
+	if o != nil {
+		if o.MaxMemCompationLevel > 0 {
+			level = o.MaxMemCompationLevel
+		} else if o.MaxMemCompationLevel == -1 {
+			level = 0
+		}
+	}
+	if level >= o.GetNumLevel() {
+		return o.GetNumLevel() - 1
+	}
+	return level
+}
+
+func (o *Options) GetNumLevel() int {
+	if o == nil || o.NumLevel <= 0 {
+		return DefaultNumLevel
+	}
+	return o.NumLevel
+}
+
 func (o *Options) GetStrict(strict Strict) bool {
 	if o == nil || o.Strict == 0 {
 		return DefaultStrict&strict != 0
@@ -271,6 +496,20 @@ func (o *Options) GetWriteBuffer() int {
 	return o.WriteBuffer
 }
 
+func (o *Options) GetWriteL0PauseTrigger() int {
+	if o == nil || o.WriteL0PauseTrigger == 0 {
+		return DefaultWriteL0PauseTrigger
+	}
+	return o.WriteL0PauseTrigger
+}
+
+func (o *Options) GetWriteL0SlowdownTrigger() int {
+	if o == nil || o.WriteL0SlowdownTrigger == 0 {
+		return DefaultWriteL0SlowdownTrigger
+	}
+	return o.WriteL0SlowdownTrigger
+}
+
 // ReadOptions holds the optional parameters for 'read operation'. The
 // 'read operation' includes Get, Find and NewIterator.
 type ReadOptions struct {
@@ -281,8 +520,8 @@ type ReadOptions struct {
 	// The default value is false.
 	DontFillCache bool
 
-	// Strict overrides global DB strict level. Only StrictIterator and
-	// StrictBlockChecksum that does have effects here.
+	// Strict will be OR'ed with global DB 'strict level' unless StrictOverride
+	// is present. Currently only StrictReader that has effect here.
 	Strict Strict
 }
 
@@ -324,3 +563,11 @@ func (wo *WriteOptions) GetSync() bool {
 	}
 	return wo.Sync
 }
+
+func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool {
+	if ro.GetStrict(StrictOverride) {
+		return ro.GetStrict(strict)
+	} else {
+		return o.GetStrict(strict) || ro.GetStrict(strict)
+	}
+}

+ 65 - 9
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go

@@ -12,30 +12,86 @@ import (
 	"github.com/syndtr/goleveldb/leveldb/opt"
 )
 
-func (s *session) setOptions(o *opt.Options) {
-	s.o = &opt.Options{}
+func dupOptions(o *opt.Options) *opt.Options {
+	newo := &opt.Options{}
 	if o != nil {
-		*s.o = *o
+		*newo = *o
 	}
+	return newo
+}
+
+func (s *session) setOptions(o *opt.Options) {
+	no := dupOptions(o)
 	// Alternative filters.
 	if filters := o.GetAltFilters(); len(filters) > 0 {
-		s.o.AltFilters = make([]filter.Filter, len(filters))
+		no.AltFilters = make([]filter.Filter, len(filters))
 		for i, filter := range filters {
-			s.o.AltFilters[i] = &iFilter{filter}
+			no.AltFilters[i] = &iFilter{filter}
 		}
 	}
 	// Block cache.
 	switch o.GetBlockCache() {
 	case nil:
-		s.o.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize)
+		no.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize)
 	case opt.NoCache:
-		s.o.BlockCache = nil
+		no.BlockCache = nil
 	}
 	// Comparer.
 	s.icmp = &iComparer{o.GetComparer()}
-	s.o.Comparer = s.icmp
+	no.Comparer = s.icmp
 	// Filter.
 	if filter := o.GetFilter(); filter != nil {
-		s.o.Filter = &iFilter{filter}
+		no.Filter = &iFilter{filter}
 	}
+
+	s.o = &cachedOptions{Options: no}
+	s.o.cache()
+}
+
+type cachedOptions struct {
+	*opt.Options
+
+	compactionExpandLimit []int
+	compactionGPOverlaps  []int
+	compactionSourceLimit []int
+	compactionTableSize   []int
+	compactionTotalSize   []int64
+}
+
+func (co *cachedOptions) cache() {
+	numLevel := co.Options.GetNumLevel()
+
+	co.compactionExpandLimit = make([]int, numLevel)
+	co.compactionGPOverlaps = make([]int, numLevel)
+	co.compactionSourceLimit = make([]int, numLevel)
+	co.compactionTableSize = make([]int, numLevel)
+	co.compactionTotalSize = make([]int64, numLevel)
+
+	for level := 0; level < numLevel; level++ {
+		co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level)
+		co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level)
+		co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level)
+		co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level)
+		co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level)
+	}
+}
+
+func (co *cachedOptions) GetCompactionExpandLimit(level int) int {
+	return co.compactionExpandLimit[level]
+}
+
+func (co *cachedOptions) GetCompactionGPOverlaps(level int) int {
+	return co.compactionGPOverlaps[level]
+}
+
+func (co *cachedOptions) GetCompactionSourceLimit(level int) int {
+	return co.compactionSourceLimit[level]
+}
+
+func (co *cachedOptions) GetCompactionTableSize(level int) int {
+	return co.compactionTableSize[level]
+}
+
+func (co *cachedOptions) GetCompactionTotalSize(level int) int64 {
+	return co.compactionTotalSize[level]
 }

+ 135 - 73
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go

@@ -7,12 +7,13 @@
 package leveldb
 
 import (
-	"errors"
+	"fmt"
 	"io"
 	"os"
 	"sync"
 	"sync/atomic"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/journal"
 	"github.com/syndtr/goleveldb/leveldb/opt"
@@ -20,18 +21,31 @@ import (
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
+type ErrManifestCorrupted struct {
+	Field  string
+	Reason string
+}
+
+func (e *ErrManifestCorrupted) Error() string {
+	return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason)
+}
+
+func newErrManifestCorrupted(f storage.File, field, reason string) error {
+	return errors.NewErrCorrupted(f, &ErrManifestCorrupted{field, reason})
+}
+
 // session represent a persistent database session.
 type session struct {
 	// Need 64-bit alignment.
-	stFileNum        uint64 // current unused file number
+	stNextFileNum    uint64 // current unused file number
 	stJournalNum     uint64 // current journal file number; need external synchronization
 	stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb
-	stSeq            uint64 // last mem compacted seq; need external synchronization
+	stSeqNum         uint64 // last mem compacted seq; need external synchronization
 	stTempFileNum    uint64
 
 	stor     storage.Storage
 	storLock util.Releaser
-	o        *opt.Options
+	o        *cachedOptions
 	icmp     *iComparer
 	tops     *tOps
 
@@ -39,9 +53,9 @@ type session struct {
 	manifestWriter storage.Writer
 	manifestFile   storage.File
 
-	stCptrs   [kNumLevels]iKey // compact pointers; need external synchronization
-	stVersion *version         // current version
-	vmu       sync.Mutex
+	stCompPtrs []iKey   // compaction pointers; need external synchronization
+	stVersion  *version // current version
+	vmu        sync.Mutex
 }
 
 // Creates new initialized session instance.
@@ -54,13 +68,14 @@ func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) {
 		return
 	}
 	s = &session{
-		stor:     stor,
-		storLock: storLock,
+		stor:       stor,
+		storLock:   storLock,
+		stCompPtrs: make([]iKey, o.GetNumLevel()),
 	}
 	s.setOptions(o)
 	s.tops = newTableOps(s, s.o.GetCachedOpenFiles())
-	s.setVersion(&version{s: s})
-	s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock D·DeletedEntry L·Level Q·SeqNum T·TimeElapsed")
+	s.setVersion(newVersion(s))
+	s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed")
 	return
 }
 
@@ -100,26 +115,26 @@ func (s *session) recover() (err error) {
 			// Don't return os.ErrNotExist if the underlying storage contains
 			// other files that belong to LevelDB. So the DB won't get trashed.
 			if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 {
-				err = ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest file missing")}
+				err = &errors.ErrCorrupted{File: &storage.FileInfo{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}}
 			}
 		}
 	}()
 
-	file, err := s.stor.GetManifest()
+	m, err := s.stor.GetManifest()
 	if err != nil {
 		return
 	}
 
-	reader, err := file.Open()
+	reader, err := m.Open()
 	if err != nil {
 		return
 	}
 	defer reader.Close()
 	strict := s.o.GetStrict(opt.StrictManifest)
-	jr := journal.NewReader(reader, dropper{s, file}, strict, true)
+	jr := journal.NewReader(reader, dropper{s, m}, strict, true)
 
-	staging := s.version_NB().newStaging()
-	rec := &sessionRecord{}
+	staging := s.stVersion.newStaging()
+	rec := &sessionRecord{numLevel: s.o.GetNumLevel()}
 	for {
 		var r io.Reader
 		r, err = jr.Next()
@@ -128,51 +143,57 @@ func (s *session) recover() (err error) {
 				err = nil
 				break
 			}
-			return
+			return errors.SetFile(err, m)
 		}
 
 		err = rec.decode(r)
 		if err == nil {
 			// save compact pointers
-			for _, r := range rec.compactionPointers {
-				s.stCptrs[r.level] = iKey(r.ikey)
+			for _, r := range rec.compPtrs {
+				s.stCompPtrs[r.level] = iKey(r.ikey)
 			}
 			// commit record to version staging
 			staging.commit(rec)
-		} else if strict {
-			return ErrCorrupted{Type: CorruptedManifest, Err: err}
 		} else {
-			s.logf("manifest error: %v (skipped)", err)
+			err = errors.SetFile(err, m)
+			if strict || !errors.IsCorrupted(err) {
+				return
+			} else {
+				s.logf("manifest error: %v (skipped)", errors.SetFile(err, m))
+			}
 		}
-		rec.resetCompactionPointers()
+		rec.resetCompPtrs()
 		rec.resetAddedTables()
 		rec.resetDeletedTables()
 	}
 
 	switch {
 	case !rec.has(recComparer):
-		return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing comparer name")}
+		return newErrManifestCorrupted(m, "comparer", "missing")
 	case rec.comparer != s.icmp.uName():
-		return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: comparer mismatch, " + "want '" + s.icmp.uName() + "', " + "got '" + rec.comparer + "'")}
-	case !rec.has(recNextNum):
-		return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing next file number")}
+		return newErrManifestCorrupted(m, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer))
+	case !rec.has(recNextFileNum):
+		return newErrManifestCorrupted(m, "next-file-num", "missing")
 	case !rec.has(recJournalNum):
-		return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing journal file number")}
-	case !rec.has(recSeq):
-		return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing seq number")}
+		return newErrManifestCorrupted(m, "journal-file-num", "missing")
+	case !rec.has(recSeqNum):
+		return newErrManifestCorrupted(m, "seq-num", "missing")
 	}
 
-	s.manifestFile = file
+	s.manifestFile = m
 	s.setVersion(staging.finish())
-	s.setFileNum(rec.nextNum)
+	s.setNextFileNum(rec.nextFileNum)
 	s.recordCommited(rec)
 	return nil
 }
 
 // Commit session; need external synchronization.
 func (s *session) commit(r *sessionRecord) (err error) {
+	v := s.version()
+	defer v.release()
+
 	// spawn new version based on current version
-	nv := s.version_NB().spawn(r)
+	nv := v.spawn(r)
 
 	if s.manifest == nil {
 		// manifest journal writer not yet created, create one
@@ -191,13 +212,13 @@ func (s *session) commit(r *sessionRecord) (err error) {
 
 // Pick a compaction based on current state; need external synchronization.
 func (s *session) pickCompaction() *compaction {
-	v := s.version_NB()
+	v := s.version()
 
 	var level int
 	var t0 tFiles
 	if v.cScore >= 1 {
 		level = v.cLevel
-		cptr := s.stCptrs[level]
+		cptr := s.stCompPtrs[level]
 		tables := v.tables[level]
 		for _, t := range tables {
 			if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 {
@@ -214,27 +235,21 @@ func (s *session) pickCompaction() *compaction {
 			level = ts.level
 			t0 = append(t0, ts.table)
 		} else {
+			v.release()
 			return nil
 		}
 	}
 
-	c := &compaction{s: s, v: v, level: level}
-	if level == 0 {
-		imin, imax := t0.getRange(s.icmp)
-		t0 = v.tables[0].getOverlaps(t0[:0], s.icmp, imin.ukey(), imax.ukey(), true)
-	}
-
-	c.tables[0] = t0
-	c.expand()
-	return c
+	return newCompaction(s, v, level, t0)
 }
 
 // Create compaction from given level and range; need external synchronization.
 func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
-	v := s.version_NB()
+	v := s.version()
 
 	t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0)
 	if len(t0) == 0 {
+		v.release()
 		return nil
 	}
 
@@ -243,7 +258,7 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
 	// and we must not pick one file and drop another older file if the
 	// two files overlap.
 	if level > 0 {
-		limit := uint64(kMaxTableSize)
+		limit := uint64(v.s.o.GetCompactionSourceLimit(level))
 		total := uint64(0)
 		for i, t := range t0 {
 			total += t.size
@@ -255,9 +270,20 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction {
 		}
 	}
 
-	c := &compaction{s: s, v: v, level: level}
-	c.tables[0] = t0
+	return newCompaction(s, v, level, t0)
+}
+
+func newCompaction(s *session, v *version, level int, t0 tFiles) *compaction {
+	c := &compaction{
+		s:             s,
+		v:             v,
+		level:         level,
+		tables:        [2]tFiles{t0, nil},
+		maxGPOverlaps: uint64(s.o.GetCompactionGPOverlaps(level)),
+		tPtrs:         make([]int, s.o.GetNumLevel()),
+	}
 	c.expand()
+	c.save()
 	return c
 }
 
@@ -266,25 +292,57 @@ type compaction struct {
 	s *session
 	v *version
 
-	level  int
-	tables [2]tFiles
+	level         int
+	tables        [2]tFiles
+	maxGPOverlaps uint64
+
+	gp                tFiles
+	gpi               int
+	seenKey           bool
+	gpOverlappedBytes uint64
+	imin, imax        iKey
+	tPtrs             []int
+	released          bool
+
+	snapGPI               int
+	snapSeenKey           bool
+	snapGPOverlappedBytes uint64
+	snapTPtrs             []int
+}
 
-	gp              tFiles
-	gpidx           int
-	seenKey         bool
-	overlappedBytes uint64
-	imin, imax      iKey
+func (c *compaction) save() {
+	c.snapGPI = c.gpi
+	c.snapSeenKey = c.seenKey
+	c.snapGPOverlappedBytes = c.gpOverlappedBytes
+	c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...)
+}
 
-	tPtrs [kNumLevels]int
+func (c *compaction) restore() {
+	c.gpi = c.snapGPI
+	c.seenKey = c.snapSeenKey
+	c.gpOverlappedBytes = c.snapGPOverlappedBytes
+	c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...)
+}
+
+func (c *compaction) release() {
+	if !c.released {
+		c.released = true
+		c.v.release()
+	}
 }
 
 // Expand compacted tables; need external synchronization.
 func (c *compaction) expand() {
-	level := c.level
-	vt0, vt1 := c.v.tables[level], c.v.tables[level+1]
+	limit := uint64(c.s.o.GetCompactionExpandLimit(c.level))
+	vt0, vt1 := c.v.tables[c.level], c.v.tables[c.level+1]
 
 	t0, t1 := c.tables[0], c.tables[1]
 	imin, imax := t0.getRange(c.s.icmp)
+	// We expand t0 here just incase ukey hop across tables.
+	t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.level == 0)
+	if len(t0) != len(c.tables[0]) {
+		imin, imax = t0.getRange(c.s.icmp)
+	}
 	t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false)
 	// Get entire range covered by compaction.
 	amin, amax := append(t0, t1...).getRange(c.s.icmp)
@@ -292,13 +350,13 @@ func (c *compaction) expand() {
 	// See if we can grow the number of inputs in "level" without
 	// changing the number of "level+1" files we pick up.
 	if len(t1) > 0 {
-		exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), level == 0)
-		if len(exp0) > len(t0) && t1.size()+exp0.size() < kExpCompactionMaxBytes {
+		exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.level == 0)
+		if len(exp0) > len(t0) && t1.size()+exp0.size() < limit {
 			xmin, xmax := exp0.getRange(c.s.icmp)
 			exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false)
 			if len(exp1) == len(t1) {
 				c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)",
-					level, level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())),
+					c.level, c.level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())),
 					len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size())))
 				imin, imax = xmin, xmax
 				t0, t1 = exp0, exp1
@@ -309,8 +367,8 @@ func (c *compaction) expand() {
 
 	// Compute the set of grandparent files that overlap this compaction
 	// (parent == level+1; grandparent == level+2)
-	if level+2 < kNumLevels {
-		c.gp = c.v.tables[level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false)
+	if c.level+2 < c.s.o.GetNumLevel() {
+		c.gp = c.v.tables[c.level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false)
 	}
 
 	c.tables[0], c.tables[1] = t0, t1
@@ -319,7 +377,7 @@ func (c *compaction) expand() {
 
 // Check whether compaction is trivial.
 func (c *compaction) trivial() bool {
-	return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= kMaxGrandParentOverlapBytes
+	return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= c.maxGPOverlaps
 }
 
 func (c *compaction) baseLevelForKey(ukey []byte) bool {
@@ -341,20 +399,20 @@ func (c *compaction) baseLevelForKey(ukey []byte) bool {
 }
 
 func (c *compaction) shouldStopBefore(ikey iKey) bool {
-	for ; c.gpidx < len(c.gp); c.gpidx++ {
-		gp := c.gp[c.gpidx]
+	for ; c.gpi < len(c.gp); c.gpi++ {
+		gp := c.gp[c.gpi]
 		if c.s.icmp.Compare(ikey, gp.imax) <= 0 {
 			break
 		}
 		if c.seenKey {
-			c.overlappedBytes += gp.size
+			c.gpOverlappedBytes += gp.size
 		}
 	}
 	c.seenKey = true
 
-	if c.overlappedBytes > kMaxGrandParentOverlapBytes {
+	if c.gpOverlappedBytes > c.maxGPOverlaps {
 		// Too much overlap for current output; start new output.
-		c.overlappedBytes = 0
+		c.gpOverlappedBytes = 0
 		return true
 	}
 	return false
@@ -373,8 +431,12 @@ func (c *compaction) newIterator() iterator.Iterator {
 	// Options.
 	ro := &opt.ReadOptions{
 		DontFillCache: true,
+		Strict:        opt.StrictOverride,
+	}
+	strict := c.s.o.GetStrict(opt.StrictCompaction)
+	if strict {
+		ro.Strict |= opt.StrictReader
 	}
-	strict := c.s.o.GetStrict(opt.StrictIterator)
 
 	for i, tables := range c.tables {
 		if len(tables) == 0 {
@@ -387,10 +449,10 @@ func (c *compaction) newIterator() iterator.Iterator {
 				its = append(its, c.s.tops.newIterator(t, nil, ro))
 			}
 		} else {
-			it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict, true)
+			it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict)
 			its = append(its, it)
 		}
 	}
 
-	return iterator.NewMergedIterator(its, c.s.icmp, true)
+	return iterator.NewMergedIterator(its, c.s.icmp, strict)
 }

+ 98 - 93
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go

@@ -9,11 +9,11 @@ package leveldb
 import (
 	"bufio"
 	"encoding/binary"
-	"errors"
 	"io"
-)
+	"strings"
 
-var errCorruptManifest = errors.New("leveldb: corrupt manifest")
+	"github.com/syndtr/goleveldb/leveldb/errors"
+)
 
 type byteReader interface {
 	io.Reader
@@ -22,13 +22,13 @@ type byteReader interface {
 
 // These numbers are written to disk and should not be changed.
 const (
-	recComparer          = 1
-	recJournalNum        = 2
-	recNextNum           = 3
-	recSeq               = 4
-	recCompactionPointer = 5
-	recDeletedTable      = 6
-	recNewTable          = 7
+	recComparer    = 1
+	recJournalNum  = 2
+	recNextFileNum = 3
+	recSeqNum      = 4
+	recCompPtr     = 5
+	recDelTable    = 6
+	recAddTable    = 7
 	// 8 was used for large value refs
 	recPrevJournalNum = 9
 )
@@ -38,7 +38,7 @@ type cpRecord struct {
 	ikey  iKey
 }
 
-type ntRecord struct {
+type atRecord struct {
 	level int
 	num   uint64
 	size  uint64
@@ -46,27 +46,26 @@ type ntRecord struct {
 	imax  iKey
 }
 
-func (r ntRecord) makeFile(s *session) *tFile {
-	return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
-}
-
 type dtRecord struct {
 	level int
 	num   uint64
 }
 
 type sessionRecord struct {
-	hasRec             int
-	comparer           string
-	journalNum         uint64
-	prevJournalNum     uint64
-	nextNum            uint64
-	seq                uint64
-	compactionPointers []cpRecord
-	addedTables        []ntRecord
-	deletedTables      []dtRecord
-	scratch            [binary.MaxVarintLen64]byte
-	err                error
+	numLevel int
+
+	hasRec         int
+	comparer       string
+	journalNum     uint64
+	prevJournalNum uint64
+	nextFileNum    uint64
+	seqNum         uint64
+	compPtrs       []cpRecord
+	addedTables    []atRecord
+	deletedTables  []dtRecord
+
+	scratch [binary.MaxVarintLen64]byte
+	err     error
 }
 
 func (p *sessionRecord) has(rec int) bool {
@@ -88,29 +87,29 @@ func (p *sessionRecord) setPrevJournalNum(num uint64) {
 	p.prevJournalNum = num
 }
 
-func (p *sessionRecord) setNextNum(num uint64) {
-	p.hasRec |= 1 << recNextNum
-	p.nextNum = num
+func (p *sessionRecord) setNextFileNum(num uint64) {
+	p.hasRec |= 1 << recNextFileNum
+	p.nextFileNum = num
 }
 
-func (p *sessionRecord) setSeq(seq uint64) {
-	p.hasRec |= 1 << recSeq
-	p.seq = seq
+func (p *sessionRecord) setSeqNum(num uint64) {
+	p.hasRec |= 1 << recSeqNum
+	p.seqNum = num
 }
 
-func (p *sessionRecord) addCompactionPointer(level int, ikey iKey) {
-	p.hasRec |= 1 << recCompactionPointer
-	p.compactionPointers = append(p.compactionPointers, cpRecord{level, ikey})
+func (p *sessionRecord) addCompPtr(level int, ikey iKey) {
+	p.hasRec |= 1 << recCompPtr
+	p.compPtrs = append(p.compPtrs, cpRecord{level, ikey})
 }
 
-func (p *sessionRecord) resetCompactionPointers() {
-	p.hasRec &= ^(1 << recCompactionPointer)
-	p.compactionPointers = p.compactionPointers[:0]
+func (p *sessionRecord) resetCompPtrs() {
+	p.hasRec &= ^(1 << recCompPtr)
+	p.compPtrs = p.compPtrs[:0]
 }
 
 func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) {
-	p.hasRec |= 1 << recNewTable
-	p.addedTables = append(p.addedTables, ntRecord{level, num, size, imin, imax})
+	p.hasRec |= 1 << recAddTable
+	p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax})
 }
 
 func (p *sessionRecord) addTableFile(level int, t *tFile) {
@@ -118,17 +117,17 @@ func (p *sessionRecord) addTableFile(level int, t *tFile) {
 }
 
 func (p *sessionRecord) resetAddedTables() {
-	p.hasRec &= ^(1 << recNewTable)
+	p.hasRec &= ^(1 << recAddTable)
 	p.addedTables = p.addedTables[:0]
 }
 
-func (p *sessionRecord) deleteTable(level int, num uint64) {
-	p.hasRec |= 1 << recDeletedTable
+func (p *sessionRecord) delTable(level int, num uint64) {
+	p.hasRec |= 1 << recDelTable
 	p.deletedTables = append(p.deletedTables, dtRecord{level, num})
 }
 
 func (p *sessionRecord) resetDeletedTables() {
-	p.hasRec &= ^(1 << recDeletedTable)
+	p.hasRec &= ^(1 << recDelTable)
 	p.deletedTables = p.deletedTables[:0]
 }
 
@@ -161,26 +160,26 @@ func (p *sessionRecord) encode(w io.Writer) error {
 		p.putUvarint(w, recJournalNum)
 		p.putUvarint(w, p.journalNum)
 	}
-	if p.has(recNextNum) {
-		p.putUvarint(w, recNextNum)
-		p.putUvarint(w, p.nextNum)
+	if p.has(recNextFileNum) {
+		p.putUvarint(w, recNextFileNum)
+		p.putUvarint(w, p.nextFileNum)
 	}
-	if p.has(recSeq) {
-		p.putUvarint(w, recSeq)
-		p.putUvarint(w, p.seq)
+	if p.has(recSeqNum) {
+		p.putUvarint(w, recSeqNum)
+		p.putUvarint(w, p.seqNum)
 	}
-	for _, r := range p.compactionPointers {
-		p.putUvarint(w, recCompactionPointer)
+	for _, r := range p.compPtrs {
+		p.putUvarint(w, recCompPtr)
 		p.putUvarint(w, uint64(r.level))
 		p.putBytes(w, r.ikey)
 	}
 	for _, r := range p.deletedTables {
-		p.putUvarint(w, recDeletedTable)
+		p.putUvarint(w, recDelTable)
 		p.putUvarint(w, uint64(r.level))
 		p.putUvarint(w, r.num)
 	}
 	for _, r := range p.addedTables {
-		p.putUvarint(w, recNewTable)
+		p.putUvarint(w, recAddTable)
 		p.putUvarint(w, uint64(r.level))
 		p.putUvarint(w, r.num)
 		p.putUvarint(w, r.size)
@@ -190,14 +189,16 @@ func (p *sessionRecord) encode(w io.Writer) error {
 	return p.err
 }
 
-func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 {
+func (p *sessionRecord) readUvarintMayEOF(field string, r io.ByteReader, mayEOF bool) uint64 {
 	if p.err != nil {
 		return 0
 	}
 	x, err := binary.ReadUvarint(r)
 	if err != nil {
-		if err == io.EOF {
-			p.err = errCorruptManifest
+		if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) {
+			p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"})
+		} else if strings.HasPrefix(err.Error(), "binary:") {
+			p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, err.Error()})
 		} else {
 			p.err = err
 		}
@@ -206,35 +207,39 @@ func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 {
 	return x
 }
 
-func (p *sessionRecord) readBytes(r byteReader) []byte {
+func (p *sessionRecord) readUvarint(field string, r io.ByteReader) uint64 {
+	return p.readUvarintMayEOF(field, r, false)
+}
+
+func (p *sessionRecord) readBytes(field string, r byteReader) []byte {
 	if p.err != nil {
 		return nil
 	}
-	n := p.readUvarint(r)
+	n := p.readUvarint(field, r)
 	if p.err != nil {
 		return nil
 	}
 	x := make([]byte, n)
 	_, p.err = io.ReadFull(r, x)
 	if p.err != nil {
-		if p.err == io.EOF {
-			p.err = errCorruptManifest
+		if p.err == io.ErrUnexpectedEOF {
+			p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"})
 		}
 		return nil
 	}
 	return x
 }
 
-func (p *sessionRecord) readLevel(r io.ByteReader) int {
+func (p *sessionRecord) readLevel(field string, r io.ByteReader) int {
 	if p.err != nil {
 		return 0
 	}
-	x := p.readUvarint(r)
+	x := p.readUvarint(field, r)
 	if p.err != nil {
 		return 0
 	}
-	if x >= kNumLevels {
-		p.err = errCorruptManifest
+	if x >= uint64(p.numLevel) {
+		p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "invalid level number"})
 		return 0
 	}
 	return int(x)
@@ -247,59 +252,59 @@ func (p *sessionRecord) decode(r io.Reader) error {
 	}
 	p.err = nil
 	for p.err == nil {
-		rec, err := binary.ReadUvarint(br)
-		if err != nil {
-			if err == io.EOF {
-				err = nil
+		rec := p.readUvarintMayEOF("field-header", br, true)
+		if p.err != nil {
+			if p.err == io.EOF {
+				return nil
 			}
-			return err
+			return p.err
 		}
 		switch rec {
 		case recComparer:
-			x := p.readBytes(br)
+			x := p.readBytes("comparer", br)
 			if p.err == nil {
 				p.setComparer(string(x))
 			}
 		case recJournalNum:
-			x := p.readUvarint(br)
+			x := p.readUvarint("journal-num", br)
 			if p.err == nil {
 				p.setJournalNum(x)
 			}
 		case recPrevJournalNum:
-			x := p.readUvarint(br)
+			x := p.readUvarint("prev-journal-num", br)
 			if p.err == nil {
 				p.setPrevJournalNum(x)
 			}
-		case recNextNum:
-			x := p.readUvarint(br)
+		case recNextFileNum:
+			x := p.readUvarint("next-file-num", br)
 			if p.err == nil {
-				p.setNextNum(x)
+				p.setNextFileNum(x)
 			}
-		case recSeq:
-			x := p.readUvarint(br)
+		case recSeqNum:
+			x := p.readUvarint("seq-num", br)
 			if p.err == nil {
-				p.setSeq(x)
+				p.setSeqNum(x)
 			}
-		case recCompactionPointer:
-			level := p.readLevel(br)
-			ikey := p.readBytes(br)
+		case recCompPtr:
+			level := p.readLevel("comp-ptr.level", br)
+			ikey := p.readBytes("comp-ptr.ikey", br)
 			if p.err == nil {
-				p.addCompactionPointer(level, iKey(ikey))
+				p.addCompPtr(level, iKey(ikey))
 			}
-		case recNewTable:
-			level := p.readLevel(br)
-			num := p.readUvarint(br)
-			size := p.readUvarint(br)
-			imin := p.readBytes(br)
-			imax := p.readBytes(br)
+		case recAddTable:
+			level := p.readLevel("add-table.level", br)
+			num := p.readUvarint("add-table.num", br)
+			size := p.readUvarint("add-table.size", br)
+			imin := p.readBytes("add-table.imin", br)
+			imax := p.readBytes("add-table.imax", br)
 			if p.err == nil {
 				p.addTable(level, num, size, imin, imax)
 			}
-		case recDeletedTable:
-			level := p.readLevel(br)
-			num := p.readUvarint(br)
+		case recDelTable:
+			level := p.readLevel("del-table.level", br)
+			num := p.readUvarint("del-table.num", br)
 			if p.err == nil {
-				p.deleteTable(level, num)
+				p.delTable(level, num)
 			}
 		}
 	}

+ 10 - 8
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go

@@ -9,6 +9,8 @@ package leveldb
 import (
 	"bytes"
 	"testing"
+
+	"github.com/syndtr/goleveldb/leveldb/opt"
 )
 
 func decodeEncode(v *sessionRecord) (res bool, err error) {
@@ -17,7 +19,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) {
 	if err != nil {
 		return
 	}
-	v2 := new(sessionRecord)
+	v2 := &sessionRecord{numLevel: opt.DefaultNumLevel}
 	err = v.decode(b)
 	if err != nil {
 		return
@@ -32,7 +34,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) {
 
 func TestSessionRecord_EncodeDecode(t *testing.T) {
 	big := uint64(1) << 50
-	v := new(sessionRecord)
+	v := &sessionRecord{numLevel: opt.DefaultNumLevel}
 	i := uint64(0)
 	test := func() {
 		res, err := decodeEncode(v)
@@ -47,16 +49,16 @@ func TestSessionRecord_EncodeDecode(t *testing.T) {
 	for ; i < 4; i++ {
 		test()
 		v.addTable(3, big+300+i, big+400+i,
-			newIKey([]byte("foo"), big+500+1, tVal),
-			newIKey([]byte("zoo"), big+600+1, tDel))
-		v.deleteTable(4, big+700+i)
-		v.addCompactionPointer(int(i), newIKey([]byte("x"), big+900+1, tVal))
+			newIkey([]byte("foo"), big+500+1, ktVal),
+			newIkey([]byte("zoo"), big+600+1, ktDel))
+		v.delTable(4, big+700+i)
+		v.addCompPtr(int(i), newIkey([]byte("x"), big+900+1, ktVal))
 	}
 
 	v.setComparer("foo")
 	v.setJournalNum(big + 100)
 	v.setPrevJournalNum(big + 99)
-	v.setNextNum(big + 200)
-	v.setSeq(big + 1000)
+	v.setNextFileNum(big + 200)
+	v.setSeqNum(big + 1000)
 	test()
 }

+ 34 - 33
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go

@@ -22,7 +22,7 @@ type dropper struct {
 }
 
 func (d dropper) Drop(err error) {
-	if e, ok := err.(journal.ErrCorrupted); ok {
+	if e, ok := err.(*journal.ErrCorrupted); ok {
 		d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason)
 	} else {
 		d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err)
@@ -51,9 +51,14 @@ func (s *session) newTemp() storage.File {
 	return s.stor.GetFile(num, storage.TypeTemp)
 }
 
+func (s *session) tableFileFromRecord(r atRecord) *tFile {
+	return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax)
+}
+
 // Session state.
 
-// Get current version.
+// Get current version. This will incr version ref, must call
+// version.release (exactly once) after use.
 func (s *session) version() *version {
 	s.vmu.Lock()
 	defer s.vmu.Unlock()
@@ -61,61 +66,56 @@ func (s *session) version() *version {
 	return s.stVersion
 }
 
-// Get current version; no barrier.
-func (s *session) version_NB() *version {
-	return s.stVersion
-}
-
 // Set current version to v.
 func (s *session) setVersion(v *version) {
 	s.vmu.Lock()
-	v.ref = 1
+	v.ref = 1 // Holds by session.
 	if old := s.stVersion; old != nil {
-		v.ref++
+		v.ref++ // Holds by old version.
 		old.next = v
-		old.release_NB()
+		old.releaseNB()
 	}
 	s.stVersion = v
 	s.vmu.Unlock()
 }
 
 // Get current unused file number.
-func (s *session) fileNum() uint64 {
-	return atomic.LoadUint64(&s.stFileNum)
+func (s *session) nextFileNum() uint64 {
+	return atomic.LoadUint64(&s.stNextFileNum)
 }
 
-// Get current unused file number to num.
-func (s *session) setFileNum(num uint64) {
-	atomic.StoreUint64(&s.stFileNum, num)
+// Set current unused file number to num.
+func (s *session) setNextFileNum(num uint64) {
+	atomic.StoreUint64(&s.stNextFileNum, num)
 }
 
 // Mark file number as used.
 func (s *session) markFileNum(num uint64) {
-	num += 1
+	nextFileNum := num + 1
 	for {
-		old, x := s.stFileNum, num
+		old, x := s.stNextFileNum, nextFileNum
 		if old > x {
 			x = old
 		}
-		if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) {
+		if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) {
 			break
 		}
 	}
 }
 
 // Allocate a file number.
-func (s *session) allocFileNum() (num uint64) {
-	return atomic.AddUint64(&s.stFileNum, 1) - 1
+func (s *session) allocFileNum() uint64 {
+	return atomic.AddUint64(&s.stNextFileNum, 1) - 1
 }
 
 // Reuse given file number.
 func (s *session) reuseFileNum(num uint64) {
 	for {
-		old, x := s.stFileNum, num
+		old, x := s.stNextFileNum, num
 		if old != x+1 {
 			x = old
 		}
-		if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) {
+		if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) {
 			break
 		}
 	}
@@ -126,20 +126,20 @@ func (s *session) reuseFileNum(num uint64) {
 // Fill given session record obj with current states; need external
 // synchronization.
 func (s *session) fillRecord(r *sessionRecord, snapshot bool) {
-	r.setNextNum(s.fileNum())
+	r.setNextFileNum(s.nextFileNum())
 
 	if snapshot {
 		if !r.has(recJournalNum) {
 			r.setJournalNum(s.stJournalNum)
 		}
 
-		if !r.has(recSeq) {
-			r.setSeq(s.stSeq)
+		if !r.has(recSeqNum) {
+			r.setSeqNum(s.stSeqNum)
 		}
 
-		for level, ik := range s.stCptrs {
+		for level, ik := range s.stCompPtrs {
 			if ik != nil {
-				r.addCompactionPointer(level, ik)
+				r.addCompPtr(level, ik)
 			}
 		}
 
@@ -158,12 +158,12 @@ func (s *session) recordCommited(r *sessionRecord) {
 		s.stPrevJournalNum = r.prevJournalNum
 	}
 
-	if r.has(recSeq) {
-		s.stSeq = r.seq
+	if r.has(recSeqNum) {
+		s.stSeqNum = r.seqNum
 	}
 
-	for _, p := range r.compactionPointers {
-		s.stCptrs[p.level] = iKey(p.ikey)
+	for _, p := range r.compPtrs {
+		s.stCompPtrs[p.level] = iKey(p.ikey)
 	}
 }
 
@@ -178,10 +178,11 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) {
 	jw := journal.NewWriter(writer)
 
 	if v == nil {
-		v = s.version_NB()
+		v = s.version()
+		defer v.release()
 	}
 	if rec == nil {
-		rec = new(sessionRecord)
+		rec = &sessionRecord{numLevel: s.o.GetNumLevel()}
 	}
 	s.fillRecord(rec, true)
 	v.fillRecord(rec)

+ 30 - 0
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go

@@ -125,3 +125,33 @@ type Storage interface {
 	// Other methods should not be called after the storage has been closed.
 	Close() error
 }
+
+// FileInfo wraps basic file info.
+type FileInfo struct {
+	Type FileType
+	Num  uint64
+}
+
+func (fi FileInfo) String() string {
+	switch fi.Type {
+	case TypeManifest:
+		return fmt.Sprintf("MANIFEST-%06d", fi.Num)
+	case TypeJournal:
+		return fmt.Sprintf("%06d.log", fi.Num)
+	case TypeTable:
+		return fmt.Sprintf("%06d.ldb", fi.Num)
+	case TypeTemp:
+		return fmt.Sprintf("%06d.tmp", fi.Num)
+	default:
+		return fmt.Sprintf("%#x-%d", fi.Type, fi.Num)
+	}
+}
+
+// NewFileInfo creates new FileInfo from the given File. It will returns nil
+// if File is nil.
+func NewFileInfo(f File) *FileInfo {
+	if f == nil {
+		return nil
+	}
+	return &FileInfo{f.Type(), f.Num()}
+}

+ 87 - 33
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go

@@ -11,6 +11,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"sync"
@@ -36,6 +37,19 @@ var (
 	tsNum     = 0
 )
 
+type tsOp uint
+
+const (
+	tsOpOpen tsOp = iota
+	tsOpCreate
+	tsOpRead
+	tsOpReadAt
+	tsOpWrite
+	tsOpSync
+
+	tsOpNum
+)
+
 type tsLock struct {
 	ts *testStorage
 	r  util.Releaser
@@ -54,6 +68,9 @@ type tsReader struct {
 func (tr tsReader) Read(b []byte) (n int, err error) {
 	ts := tr.tf.ts
 	ts.countRead(tr.tf.Type())
+	if tr.tf.shouldErrLocked(tsOpRead) {
+		return 0, errors.New("leveldb.testStorage: emulated read error")
+	}
 	n, err = tr.Reader.Read(b)
 	if err != nil && err != io.EOF {
 		ts.t.Errorf("E: read error, num=%d type=%v n=%d: %v", tr.tf.Num(), tr.tf.Type(), n, err)
@@ -64,6 +81,9 @@ func (tr tsReader) Read(b []byte) (n int, err error) {
 func (tr tsReader) ReadAt(b []byte, off int64) (n int, err error) {
 	ts := tr.tf.ts
 	ts.countRead(tr.tf.Type())
+	if tr.tf.shouldErrLocked(tsOpReadAt) {
+		return 0, errors.New("leveldb.testStorage: emulated readAt error")
+	}
 	n, err = tr.Reader.ReadAt(b, off)
 	if err != nil && err != io.EOF {
 		ts.t.Errorf("E: readAt error, num=%d type=%v off=%d n=%d: %v", tr.tf.Num(), tr.tf.Type(), off, n, err)
@@ -83,15 +103,12 @@ type tsWriter struct {
 }
 
 func (tw tsWriter) Write(b []byte) (n int, err error) {
-	ts := tw.tf.ts
-	ts.mu.Lock()
-	defer ts.mu.Unlock()
-	if ts.emuWriteErr&tw.tf.Type() != 0 {
+	if tw.tf.shouldErrLocked(tsOpWrite) {
 		return 0, errors.New("leveldb.testStorage: emulated write error")
 	}
 	n, err = tw.Writer.Write(b)
 	if err != nil {
-		ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err)
+		tw.tf.ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err)
 	}
 	return
 }
@@ -99,23 +116,23 @@ func (tw tsWriter) Write(b []byte) (n int, err error) {
 func (tw tsWriter) Sync() (err error) {
 	ts := tw.tf.ts
 	ts.mu.Lock()
-	defer ts.mu.Unlock()
 	for ts.emuDelaySync&tw.tf.Type() != 0 {
 		ts.cond.Wait()
 	}
-	if ts.emuSyncErr&tw.tf.Type() != 0 {
+	ts.mu.Unlock()
+	if tw.tf.shouldErrLocked(tsOpSync) {
 		return errors.New("leveldb.testStorage: emulated sync error")
 	}
 	err = tw.Writer.Sync()
 	if err != nil {
-		ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err)
+		tw.tf.ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err)
 	}
 	return
 }
 
 func (tw tsWriter) Close() (err error) {
 	err = tw.Writer.Close()
-	tw.tf.close("reader", err)
+	tw.tf.close("writer", err)
 	return
 }
 
@@ -128,6 +145,16 @@ func (tf tsFile) x() uint64 {
 	return tf.Num()<<typeShift | uint64(tf.Type())
 }
 
+func (tf tsFile) shouldErr(op tsOp) bool {
+	return tf.ts.shouldErr(tf, op)
+}
+
+func (tf tsFile) shouldErrLocked(op tsOp) bool {
+	tf.ts.mu.Lock()
+	defer tf.ts.mu.Unlock()
+	return tf.shouldErr(op)
+}
+
 func (tf tsFile) checkOpen(m string) error {
 	ts := tf.ts
 	if writer, ok := ts.opens[tf.x()]; ok {
@@ -164,7 +191,7 @@ func (tf tsFile) Open() (r storage.Reader, err error) {
 	if err != nil {
 		return
 	}
-	if ts.emuOpenErr&tf.Type() != 0 {
+	if tf.shouldErr(tsOpOpen) {
 		err = errors.New("leveldb.testStorage: emulated open error")
 		return
 	}
@@ -191,7 +218,7 @@ func (tf tsFile) Create() (w storage.Writer, err error) {
 	if err != nil {
 		return
 	}
-	if ts.emuCreateErr&tf.Type() != 0 {
+	if tf.shouldErr(tsOpCreate) {
 		err = errors.New("leveldb.testStorage: emulated create error")
 		return
 	}
@@ -232,51 +259,75 @@ type testStorage struct {
 	cond sync.Cond
 	// Open files, true=writer, false=reader
 	opens         map[uint64]bool
-	emuOpenErr    storage.FileType
-	emuCreateErr  storage.FileType
 	emuDelaySync  storage.FileType
-	emuWriteErr   storage.FileType
-	emuSyncErr    storage.FileType
 	ignoreOpenErr storage.FileType
 	readCnt       uint64
 	readCntEn     storage.FileType
+
+	emuErr         [tsOpNum]storage.FileType
+	emuErrOnce     [tsOpNum]storage.FileType
+	emuRandErr     [tsOpNum]storage.FileType
+	emuRandErrProb int
+	emuErrOnceMap  map[uint64]uint
+	emuRandRand    *rand.Rand
+}
+
+func (ts *testStorage) shouldErr(tf tsFile, op tsOp) bool {
+	if ts.emuErr[op]&tf.Type() != 0 {
+		return true
+	} else if ts.emuRandErr[op]&tf.Type() != 0 || ts.emuErrOnce[op]&tf.Type() != 0 {
+		sop := uint(1) << op
+		eop := ts.emuErrOnceMap[tf.x()]
+		if eop&sop == 0 && (ts.emuRandRand.Int()%ts.emuRandErrProb == 0 || ts.emuErrOnce[op]&tf.Type() != 0) {
+			ts.emuErrOnceMap[tf.x()] = eop | sop
+			ts.t.Logf("I: emulated error: file=%d type=%v op=%v", tf.Num(), tf.Type(), op)
+			return true
+		}
+	}
+	return false
 }
 
-func (ts *testStorage) SetOpenErr(t storage.FileType) {
+func (ts *testStorage) SetEmuErr(t storage.FileType, ops ...tsOp) {
 	ts.mu.Lock()
-	ts.emuOpenErr = t
+	for _, op := range ops {
+		ts.emuErr[op] = t
+	}
 	ts.mu.Unlock()
 }
 
-func (ts *testStorage) SetCreateErr(t storage.FileType) {
+func (ts *testStorage) SetEmuErrOnce(t storage.FileType, ops ...tsOp) {
 	ts.mu.Lock()
-	ts.emuCreateErr = t
+	for _, op := range ops {
+		ts.emuErrOnce[op] = t
+	}
 	ts.mu.Unlock()
 }
 
-func (ts *testStorage) DelaySync(t storage.FileType) {
+func (ts *testStorage) SetEmuRandErr(t storage.FileType, ops ...tsOp) {
 	ts.mu.Lock()
-	ts.emuDelaySync |= t
-	ts.cond.Broadcast()
+	for _, op := range ops {
+		ts.emuRandErr[op] = t
+	}
 	ts.mu.Unlock()
 }
 
-func (ts *testStorage) ReleaseSync(t storage.FileType) {
+func (ts *testStorage) SetEmuRandErrProb(prob int) {
 	ts.mu.Lock()
-	ts.emuDelaySync &= ^t
-	ts.cond.Broadcast()
+	ts.emuRandErrProb = prob
 	ts.mu.Unlock()
 }
 
-func (ts *testStorage) SetWriteErr(t storage.FileType) {
+func (ts *testStorage) DelaySync(t storage.FileType) {
 	ts.mu.Lock()
-	ts.emuWriteErr = t
+	ts.emuDelaySync |= t
+	ts.cond.Broadcast()
 	ts.mu.Unlock()
 }
 
-func (ts *testStorage) SetSyncErr(t storage.FileType) {
+func (ts *testStorage) ReleaseSync(t storage.FileType) {
 	ts.mu.Lock()
-	ts.emuSyncErr = t
+	ts.emuDelaySync &= ^t
+	ts.cond.Broadcast()
 	ts.mu.Unlock()
 }
 
@@ -454,10 +505,13 @@ func newTestStorage(t *testing.T) *testStorage {
 		stor = storage.NewMemStorage()
 	}
 	ts := &testStorage{
-		t:       t,
-		Storage: stor,
-		closeFn: closeFn,
-		opens:   make(map[uint64]bool),
+		t:              t,
+		Storage:        stor,
+		closeFn:        closeFn,
+		opens:          make(map[uint64]bool),
+		emuErrOnceMap:  make(map[uint64]uint),
+		emuRandErrProb: 0x999,
+		emuRandRand:    rand.New(rand.NewSource(0xfacedead)),
 	}
 	ts.cond.L = &ts.mu
 	return ts

+ 47 - 21
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table.go

@@ -7,6 +7,7 @@
 package leveldb
 
 import (
+	"fmt"
 	"sort"
 	"sync/atomic"
 
@@ -82,6 +83,18 @@ type tFiles []*tFile
 func (tf tFiles) Len() int      { return len(tf) }
 func (tf tFiles) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] }
 
+func (tf tFiles) nums() string {
+	x := "[ "
+	for i, f := range tf {
+		if i != 0 {
+			x += ", "
+		}
+		x += fmt.Sprint(f.file.Num())
+	}
+	x += " ]"
+	return x
+}
+
 // Returns true if i smallest key is less than j.
 // This used for sort by key in ascending order.
 func (tf tFiles) lessByKey(icmp *iComparer, i, j int) bool {
@@ -149,7 +162,7 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
 	i := 0
 	if len(umin) > 0 {
 		// Find the earliest possible internal key for min.
-		i = tf.searchMax(icmp, newIKey(umin, kMaxSeq, tSeek))
+		i = tf.searchMax(icmp, newIkey(umin, kMaxSeq, ktSeek))
 	}
 	if i >= len(tf) {
 		// Beginning of range is after all files, so no overlap.
@@ -159,24 +172,25 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo
 }
 
 // Returns tables whose its key range overlaps with given key range.
-// If overlapped is true then the search will be expanded to tables that
-// overlaps with each other.
+// Range will be expanded if ukey found hop across tables.
+// If overlapped is true then the search will be restarted if umax
+// expanded.
+// The dst content will be overwritten.
 func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles {
-	x := len(dst)
+	dst = dst[:0]
 	for i := 0; i < len(tf); {
 		t := tf[i]
 		if t.overlaps(icmp, umin, umax) {
-			if overlapped {
-				// For overlapped files, check if the newly added file has
-				// expanded the range. If so, restart search.
-				if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 {
-					umin = t.imin.ukey()
-					dst = dst[:x]
-					i = 0
-					continue
-				} else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 {
-					umax = t.imax.ukey()
-					dst = dst[:x]
+			if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 {
+				umin = t.imin.ukey()
+				dst = dst[:0]
+				i = 0
+				continue
+			} else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 {
+				umax = t.imax.ukey()
+				// Restart search if it is overlapped.
+				if overlapped {
+					dst = dst[:0]
 					i = 0
 					continue
 				}
@@ -289,7 +303,7 @@ func (t *tOps) create() (*tWriter, error) {
 		t:    t,
 		file: file,
 		w:    fw,
-		tw:   table.NewWriter(fw, t.s.o),
+		tw:   table.NewWriter(fw, t.s.o.Options),
 	}, nil
 }
 
@@ -337,7 +351,13 @@ func (t *tOps) open(f *tFile) (ch cache.Handle, err error) {
 		if bc := t.s.o.GetBlockCache(); bc != nil {
 			bcacheNS = bc.GetNamespace(num)
 		}
-		return 1, table.NewReader(r, int64(f.size), bcacheNS, t.bpool, t.s.o)
+		var tr *table.Reader
+		tr, err = table.NewReader(r, int64(f.size), storage.NewFileInfo(f.file), bcacheNS, t.bpool, t.s.o.Options)
+		if err != nil {
+			r.Close()
+			return 0, nil
+		}
+		return 1, tr
 	})
 	if ch == nil && err == nil {
 		err = ErrClosed
@@ -440,28 +460,34 @@ func (w *tWriter) empty() bool {
 	return w.first == nil
 }
 
+// Closes the storage.Writer.
+func (w *tWriter) close() {
+	if w.w != nil {
+		w.w.Close()
+		w.w = nil
+	}
+}
+
 // Finalizes the table and returns table file.
 func (w *tWriter) finish() (f *tFile, err error) {
+	defer w.close()
 	err = w.tw.Close()
 	if err != nil {
 		return
 	}
 	err = w.w.Sync()
 	if err != nil {
-		w.w.Close()
 		return
 	}
-	w.w.Close()
 	f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last))
 	return
 }
 
 // Drops the table.
 func (w *tWriter) drop() {
-	w.w.Close()
+	w.close()
 	w.file.Remove()
 	w.t.s.reuseFileNum(w.file.Num())
-	w.w = nil
 	w.file = nil
 	w.tw = nil
 	w.first = nil

+ 150 - 104
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go

@@ -8,29 +8,41 @@ package table
 
 import (
 	"encoding/binary"
-	"errors"
 	"fmt"
 	"io"
 	"sort"
 	"strings"
 	"sync"
 
-	"code.google.com/p/snappy-go/snappy"
+	"github.com/syndtr/gosnappy/snappy"
 
 	"github.com/syndtr/goleveldb/leveldb/cache"
 	"github.com/syndtr/goleveldb/leveldb/comparer"
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/filter"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/opt"
+	"github.com/syndtr/goleveldb/leveldb/storage"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
 var (
-	ErrNotFound       = util.ErrNotFound
+	ErrNotFound       = errors.ErrNotFound
 	ErrReaderReleased = errors.New("leveldb/table: reader released")
 	ErrIterReleased   = errors.New("leveldb/table: iterator released")
 )
 
+type ErrCorrupted struct {
+	Pos    int64
+	Size   int64
+	Kind   string
+	Reason string
+}
+
+func (e *ErrCorrupted) Error() string {
+	return fmt.Sprintf("leveldb/table: corruption on %s (pos=%d): %s", e.Kind, e.Pos, e.Reason)
+}
+
 func max(x, y int) int {
 	if x > y {
 		return x
@@ -38,13 +50,19 @@ func max(x, y int) int {
 	return y
 }
 
+func verifyBlockChecksum(data []byte) bool {
+	n := len(data) - 4
+	checksum0 := binary.LittleEndian.Uint32(data[n:])
+	checksum1 := util.NewCRC(data[:n]).Value()
+	return checksum0 == checksum1
+}
+
 type block struct {
 	bpool          *util.BufferPool
+	bh             blockHandle
 	data           []byte
 	restartsLen    int
 	restartsOffset int
-	// Whether checksum is verified and valid.
-	checksum bool
 }
 
 func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) {
@@ -77,7 +95,7 @@ func (b *block) restartOffset(index int) int {
 func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) {
 	if offset >= b.restartsOffset {
 		if offset != b.restartsOffset {
-			err = errors.New("leveldb/table: Reader: BlockEntry: invalid block (block entries offset not aligned)")
+			err = &ErrCorrupted{Reason: "entries offset not aligned"}
 		}
 		return
 	}
@@ -87,7 +105,7 @@ func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error)
 	m := n0 + n1 + n2
 	n = m + int(v1) + int(v2)
 	if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset {
-		err = errors.New("leveldb/table: Reader: invalid block (block entries corrupted)")
+		err = &ErrCorrupted{Reason: "entries corrupted"}
 		return
 	}
 	key = b.data[offset+m : offset+m+int(v1)]
@@ -251,7 +269,7 @@ func (i *blockIter) Next() bool {
 	for i.offset < i.offsetRealStart {
 		key, value, nShared, n, err := i.block.entry(i.offset)
 		if err != nil {
-			i.sErr(err)
+			i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
 			return false
 		}
 		if n == 0 {
@@ -265,13 +283,13 @@ func (i *blockIter) Next() bool {
 	if i.offset >= i.offsetLimit {
 		i.dir = dirEOI
 		if i.offset != i.offsetLimit {
-			i.sErr(errors.New("leveldb/table: Reader: Next: invalid block (block entries offset not aligned)"))
+			i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned"))
 		}
 		return false
 	}
 	key, value, nShared, n, err := i.block.entry(i.offset)
 	if err != nil {
-		i.sErr(err)
+		i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
 		return false
 	}
 	if n == 0 {
@@ -356,7 +374,7 @@ func (i *blockIter) Prev() bool {
 	for {
 		key, value, nShared, n, err := i.block.entry(offset)
 		if err != nil {
-			i.sErr(err)
+			i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err))
 			return false
 		}
 		if offset >= i.offsetRealStart {
@@ -375,7 +393,7 @@ func (i *blockIter) Prev() bool {
 		// Stop if target offset reached.
 		if offset >= i.offset {
 			if offset != i.offset {
-				i.sErr(errors.New("leveldb/table: Reader: Prev: invalid block (block entries offset not aligned)"))
+				i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned"))
 				return false
 			}
 
@@ -473,7 +491,6 @@ type indexIter struct {
 	tr    *Reader
 	slice *util.Range
 	// Options
-	checksum  bool
 	fillCache bool
 }
 
@@ -484,28 +501,29 @@ func (i *indexIter) Get() iterator.Iterator {
 	}
 	dataBH, n := decodeBlockHandle(value)
 	if n == 0 {
-		return iterator.NewEmptyIterator(errors.New("leveldb/table: Reader: invalid table (bad data block handle)"))
+		return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle"))
 	}
 
 	var slice *util.Range
 	if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) {
 		slice = i.slice
 	}
-	return i.tr.getDataIterErr(dataBH, slice, i.checksum, i.fillCache)
+	return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache)
 }
 
 // Reader is a table reader.
 type Reader struct {
 	mu     sync.RWMutex
+	fi     *storage.FileInfo
 	reader io.ReaderAt
 	cache  cache.Namespace
 	err    error
 	bpool  *util.BufferPool
 	// Options
-	cmp        comparer.Comparer
-	filter     filter.Filter
-	checksum   bool
-	strictIter bool
+	o              *opt.Options
+	cmp            comparer.Comparer
+	filter         filter.Filter
+	verifyChecksum bool
 
 	dataEnd           int64
 	indexBH, filterBH blockHandle
@@ -513,23 +531,43 @@ type Reader struct {
 	filterBlock       *filterBlock
 }
 
-func verifyChecksum(data []byte) bool {
-	n := len(data) - 4
-	checksum0 := binary.LittleEndian.Uint32(data[n:])
-	checksum1 := util.NewCRC(data[:n]).Value()
-	return checksum0 == checksum1
+func (r *Reader) blockKind(bh blockHandle) string {
+	switch bh.offset {
+	case r.indexBH.offset:
+		return "index-block"
+	case r.filterBH.offset:
+		return "filter-block"
+	default:
+		return "data-block"
+	}
 }
 
-func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) {
+func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error {
+	return &errors.ErrCorrupted{File: r.fi, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}}
+}
+
+func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error {
+	return r.newErrCorrupted(int64(bh.offset), int64(bh.length), r.blockKind(bh), reason)
+}
+
+func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error {
+	if cerr, ok := err.(*ErrCorrupted); ok {
+		cerr.Pos = int64(bh.offset)
+		cerr.Size = int64(bh.length)
+		cerr.Kind = r.blockKind(bh)
+		return &errors.ErrCorrupted{File: r.fi, Err: cerr}
+	}
+	return err
+}
+
+func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) {
 	data := r.bpool.Get(int(bh.length + blockTrailerLen))
 	if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF {
 		return nil, err
 	}
-	if checksum || r.checksum {
-		if !verifyChecksum(data) {
-			r.bpool.Put(data)
-			return nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)")
-		}
+	if verifyChecksum && !verifyBlockChecksum(data) {
+		r.bpool.Put(data)
+		return nil, r.newErrCorruptedBH(bh, "checksum mismatch")
 	}
 	switch data[bh.length] {
 	case blockTypeNoCompression:
@@ -537,38 +575,40 @@ func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) {
 	case blockTypeSnappyCompression:
 		decLen, err := snappy.DecodedLen(data[:bh.length])
 		if err != nil {
-			return nil, err
+			return nil, r.newErrCorruptedBH(bh, err.Error())
 		}
-		tmp := data
-		data, err = snappy.Decode(r.bpool.Get(decLen), tmp[:bh.length])
-		r.bpool.Put(tmp)
+		decData := r.bpool.Get(decLen)
+		decData, err = snappy.Decode(decData, data[:bh.length])
+		r.bpool.Put(data)
 		if err != nil {
-			return nil, err
+			r.bpool.Put(decData)
+			return nil, r.newErrCorruptedBH(bh, err.Error())
 		}
+		data = decData
 	default:
 		r.bpool.Put(data)
-		return nil, fmt.Errorf("leveldb/table: Reader: unknown block compression type: %d", data[bh.length])
+		return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length]))
 	}
 	return data, nil
 }
 
-func (r *Reader) readBlock(bh blockHandle, checksum bool) (*block, error) {
-	data, err := r.readRawBlock(bh, checksum)
+func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) {
+	data, err := r.readRawBlock(bh, verifyChecksum)
 	if err != nil {
 		return nil, err
 	}
 	restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:]))
 	b := &block{
 		bpool:          r.bpool,
+		bh:             bh,
 		data:           data,
 		restartsLen:    restartsLen,
 		restartsOffset: len(data) - (restartsLen+1)*4,
-		checksum:       checksum || r.checksum,
 	}
 	return b, nil
 }
 
-func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*block, util.Releaser, error) {
+func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) {
 	if r.cache != nil {
 		var err error
 		ch := r.cache.Get(bh.offset, func() (charge int, value interface{}) {
@@ -576,7 +616,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
 				return 0, nil
 			}
 			var b *block
-			b, err = r.readBlock(bh, checksum)
+			b, err = r.readBlock(bh, verifyChecksum)
 			if err != nil {
 				return 0, nil
 			}
@@ -586,14 +626,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
 			b, ok := ch.Value().(*block)
 			if !ok {
 				ch.Release()
-				return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type")
-			}
-			if !b.checksum && (r.checksum || checksum) {
-				if !verifyChecksum(b.data) {
-					ch.Release()
-					return nil, nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)")
-				}
-				b.checksum = true
+				return nil, nil, errors.New("leveldb/table: inconsistent block type")
 			}
 			return b, ch, err
 		} else if err != nil {
@@ -601,7 +634,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo
 		}
 	}
 
-	b, err := r.readBlock(bh, checksum)
+	b, err := r.readBlock(bh, verifyChecksum)
 	return b, b, err
 }
 
@@ -612,12 +645,12 @@ func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) {
 	}
 	n := len(data)
 	if n < 5 {
-		return nil, errors.New("leveldb/table: Reader: invalid filter block (too short)")
+		return nil, r.newErrCorruptedBH(bh, "too short")
 	}
 	m := n - 5
 	oOffset := int(binary.LittleEndian.Uint32(data[m:]))
 	if oOffset > m {
-		return nil, errors.New("leveldb/table: Reader: invalid filter block (invalid offset)")
+		return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset")
 	}
 	b := &filterBlock{
 		bpool:      r.bpool,
@@ -647,7 +680,7 @@ func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterB
 			b, ok := ch.Value().(*filterBlock)
 			if !ok {
 				ch.Release()
-				return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type")
+				return nil, nil, errors.New("leveldb/table: inconsistent block type")
 			}
 			return b, ch, err
 		} else if err != nil {
@@ -673,25 +706,6 @@ func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, er
 	return r.filterBlock, util.NoopReleaser{}, nil
 }
 
-func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator {
-	b, rel, err := r.readBlockCached(dataBH, checksum, fillCache)
-	if err != nil {
-		return iterator.NewEmptyIterator(err)
-	}
-	return r.newBlockIter(b, rel, slice, false)
-}
-
-func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator {
-	r.mu.RLock()
-	defer r.mu.RUnlock()
-
-	if r.err != nil {
-		return iterator.NewEmptyIterator(r.err)
-	}
-
-	return r.getDataIter(dataBH, slice, checksum, fillCache)
-}
-
 func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter {
 	bi := &blockIter{
 		tr:            r,
@@ -726,12 +740,31 @@ func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Ran
 		}
 		bi.reset()
 		if bi.offsetStart > bi.offsetLimit {
-			bi.sErr(errors.New("leveldb/table: Reader: invalid slice range"))
+			bi.sErr(errors.New("leveldb/table: invalid slice range"))
 		}
 	}
 	return bi
 }
 
+func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
+	b, rel, err := r.readBlockCached(dataBH, verifyChecksum, fillCache)
+	if err != nil {
+		return iterator.NewEmptyIterator(err)
+	}
+	return r.newBlockIter(b, rel, slice, false)
+}
+
+func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator {
+	r.mu.RLock()
+	defer r.mu.RUnlock()
+
+	if r.err != nil {
+		return iterator.NewEmptyIterator(r.err)
+	}
+
+	return r.getDataIter(dataBH, slice, verifyChecksum, fillCache)
+}
+
 // NewIterator creates an iterator from the table.
 //
 // Slice allows slicing the iterator to only contains keys in the given
@@ -760,10 +793,9 @@ func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It
 		blockIter: r.newBlockIter(indexBlock, rel, slice, true),
 		tr:        r,
 		slice:     slice,
-		checksum:  ro.GetStrict(opt.StrictBlockChecksum),
 		fillCache: !ro.GetDontFillCache(),
 	}
-	return iterator.NewIndexedIterator(index, r.strictIter || ro.GetStrict(opt.StrictIterator), true)
+	return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader))
 }
 
 // Find finds key/value pair whose key is greater than or equal to the
@@ -798,7 +830,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err
 	}
 	dataBH, n := decodeBlockHandle(index.Value())
 	if n == 0 {
-		err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)")
+		r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
 		return
 	}
 	if r.filter != nil {
@@ -811,7 +843,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err
 			rel.Release()
 		}
 	}
-	data := r.getDataIter(dataBH, nil, ro.GetStrict(opt.StrictBlockChecksum), !ro.GetDontFillCache())
+	data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache())
 	defer data.Release()
 	if !data.Seek(key) {
 		err = data.Error()
@@ -877,7 +909,7 @@ func (r *Reader) OffsetOf(key []byte) (offset int64, err error) {
 	if index.Seek(key) {
 		dataBH, n := decodeBlockHandle(index.Value())
 		if n == 0 {
-			err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)")
+			r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle")
 			return
 		}
 		offset = int64(dataBH.offset)
@@ -914,51 +946,56 @@ func (r *Reader) Release() {
 }
 
 // NewReader creates a new initialized table reader for the file.
-// The cache and bpool is optional and can be nil.
+// The fi, cache and bpool is optional and can be nil.
 //
 // The returned table reader instance is goroutine-safe.
-func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) *Reader {
+func NewReader(f io.ReaderAt, size int64, fi *storage.FileInfo, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) (*Reader, error) {
 	r := &Reader{
-		reader:     f,
-		cache:      cache,
-		bpool:      bpool,
-		cmp:        o.GetComparer(),
-		checksum:   o.GetStrict(opt.StrictBlockChecksum),
-		strictIter: o.GetStrict(opt.StrictIterator),
+		fi:             fi,
+		reader:         f,
+		cache:          cache,
+		bpool:          bpool,
+		o:              o,
+		cmp:            o.GetComparer(),
+		verifyChecksum: o.GetStrict(opt.StrictBlockChecksum),
 	}
 	if f == nil {
-		r.err = errors.New("leveldb/table: Reader: nil file")
-		return r
+		return nil, errors.New("leveldb/table: nil file")
 	}
 	if size < footerLen {
-		r.err = errors.New("leveldb/table: Reader: invalid table (file size is too small)")
-		return r
+		r.err = r.newErrCorrupted(0, size, "table", "too small")
+		return r, nil
 	}
+	footerPos := size - footerLen
 	var footer [footerLen]byte
-	if _, err := r.reader.ReadAt(footer[:], size-footerLen); err != nil && err != io.EOF {
-		r.err = fmt.Errorf("leveldb/table: Reader: invalid table (could not read footer): %v", err)
+	if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF {
+		return nil, err
 	}
 	if string(footer[footerLen-len(magic):footerLen]) != magic {
-		r.err = errors.New("leveldb/table: Reader: invalid table (bad magic number)")
-		return r
+		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number")
+		return r, nil
 	}
 	// Decode the metaindex block handle.
 	metaBH, n := decodeBlockHandle(footer[:])
 	if n == 0 {
-		r.err = errors.New("leveldb/table: Reader: invalid table (bad metaindex block handle)")
-		return r
+		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle")
+		return r, nil
 	}
 	// Decode the index block handle.
 	r.indexBH, n = decodeBlockHandle(footer[n:])
 	if n == 0 {
-		r.err = errors.New("leveldb/table: Reader: invalid table (bad index block handle)")
-		return r
+		r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle")
+		return r, nil
 	}
 	// Read metaindex block.
 	metaBlock, err := r.readBlock(metaBH, true)
 	if err != nil {
-		r.err = err
-		return r
+		if errors.IsCorrupted(err) {
+			r.err = err
+			return r, nil
+		} else {
+			return nil, err
+		}
 	}
 	// Set data end.
 	r.dataEnd = int64(metaBH.offset)
@@ -995,13 +1032,22 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf
 
 	// Cache index and filter block locally, since we don't have global cache.
 	if cache == nil {
-		r.indexBlock, r.err = r.readBlock(r.indexBH, true)
-		if r.err != nil {
-			return r
+		r.indexBlock, err = r.readBlock(r.indexBH, true)
+		if err != nil {
+			if errors.IsCorrupted(err) {
+				r.err = err
+				return r, nil
+			} else {
+				return nil, err
+			}
 		}
 		if r.filter != nil {
 			r.filterBlock, err = r.readFilterBlock(r.filterBH)
 			if err != nil {
+				if !errors.IsCorrupted(r.err) {
+					return nil, err
+				}
+
 				// Don't use filter then.
 				r.filter = nil
 				r.filterBH = blockHandle{}
@@ -1009,5 +1055,5 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf
 		}
 	}
 
-	return r
+	return r, nil
 }

+ 3 - 3
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go

@@ -133,9 +133,9 @@ Filter block trailer:
 
       +- 4-bytes -+
      /             \
-    +---------------+---------------+---------------+-------------------------+------------------+
-    |    offset 1   |      ....     |    offset n   | filter offset (4-bytes) | base Lg (1-byte) |
-    +-------------- +---------------+---------------+-------------------------+------------------+
+    +---------------+---------------+---------------+-------------------------------+------------------+
+    | data 1 offset |      ....     | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) |
+    +-------------- +---------------+---------------+-------------------------------+------------------+
 
 
 NOTE: All fixed-length integer are little-endian.

+ 3 - 2
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go

@@ -59,7 +59,8 @@ var _ = testutil.Defer(func() {
 			It("Should be able to approximate offset of a key correctly", func() {
 				Expect(err).ShouldNot(HaveOccurred())
 
-				tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o)
+				tr, err := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o)
+				Expect(err).ShouldNot(HaveOccurred())
 				CheckOffset := func(key string, expect, threshold int) {
 					offset, err := tr.OffsetOf([]byte(key))
 					Expect(err).ShouldNot(HaveOccurred())
@@ -95,7 +96,7 @@ var _ = testutil.Defer(func() {
 				tw.Close()
 
 				// Opening the table.
-				tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o)
+				tr, _ := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o)
 				return tableWrapper{tr}
 			}
 			Test := func(kv *testutil.KeyValue, body func(r *Reader)) func() {

+ 1 - 1
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go

@@ -12,7 +12,7 @@ import (
 	"fmt"
 	"io"
 
-	"code.google.com/p/snappy-go/snappy"
+	"github.com/syndtr/gosnappy/snappy"
 
 	"github.com/syndtr/goleveldb/leveldb/comparer"
 	"github.com/syndtr/goleveldb/leveldb/filter"

+ 2 - 1
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go

@@ -12,6 +12,7 @@ import (
 
 	. "github.com/onsi/gomega"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/iterator"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
@@ -110,7 +111,7 @@ func (t *DBTesting) TestAllPresent() {
 
 func (t *DBTesting) TestDeletedKey(key []byte) {
 	_, err := t.DB.TestGet(key)
-	Expect(err).Should(Equal(util.ErrNotFound), "Get on deleted key %q, %s", key, t.text())
+	Expect(err).Should(Equal(errors.ErrNotFound), "Get on deleted key %q, %s", key, t.text())
 }
 
 func (t *DBTesting) TestAllDeleted() {

+ 3 - 2
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go

@@ -13,6 +13,7 @@ import (
 	. "github.com/onsi/ginkgo"
 	. "github.com/onsi/gomega"
 
+	"github.com/syndtr/goleveldb/leveldb/errors"
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
@@ -59,7 +60,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB,
 			}
 			rkey, _, err := db.TestFind(key)
 			Expect(err).Should(HaveOccurred(), "Find for key %q yield key %q", key, rkey)
-			Expect(err).Should(Equal(util.ErrNotFound))
+			Expect(err).Should(Equal(errors.ErrNotFound))
 		}
 	})
 
@@ -77,7 +78,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB,
 				if len(key_) > 0 {
 					_, err = db.TestGet(key_)
 					Expect(err).Should(HaveOccurred(), "Error for key %q", key_)
-					Expect(err).Should(Equal(util.ErrNotFound))
+					Expect(err).Should(Equal(errors.ErrNotFound))
 				}
 			})
 		}

+ 2 - 2
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go

@@ -14,10 +14,10 @@ import (
 )
 
 func shorten(str string) string {
-	if len(str) <= 4 {
+	if len(str) <= 8 {
 		return str
 	}
-	return str[:1] + ".." + str[len(str)-1:]
+	return str[:3] + ".." + str[len(str)-3:]
 }
 
 var bunits = [...]string{"", "Ki", "Mi", "Gi"}

+ 33 - 19
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go

@@ -8,6 +8,7 @@ package util
 
 import (
 	"fmt"
+	"sync"
 	"sync/atomic"
 	"time"
 )
@@ -19,17 +20,16 @@ type buffer struct {
 
 // BufferPool is a 'buffer pool'.
 type BufferPool struct {
-	pool       [6]chan []byte
-	size       [5]uint32
-	sizeMiss   [5]uint32
-	sizeHalf   [5]uint32
-	baseline   [4]int
-	baselinex0 int
-	baselinex1 int
-	baseline0  int
-	baseline1  int
-	baseline2  int
-	close      chan struct{}
+	pool      [6]chan []byte
+	size      [5]uint32
+	sizeMiss  [5]uint32
+	sizeHalf  [5]uint32
+	baseline  [4]int
+	baseline0 int
+
+	mu     sync.RWMutex
+	closed bool
+	closeC chan struct{}
 
 	get     uint32
 	put     uint32
@@ -58,6 +58,13 @@ func (p *BufferPool) Get(n int) []byte {
 		return make([]byte, n)
 	}
 
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+
+	if p.closed {
+		return make([]byte, n)
+	}
+
 	atomic.AddUint32(&p.get, 1)
 
 	poolNum := p.poolNum(n)
@@ -153,12 +160,16 @@ func (p *BufferPool) Put(b []byte) {
 		return
 	}
 
+	p.mu.RLock()
+	defer p.mu.RUnlock()
+
+	if p.closed {
+		return
+	}
+
 	atomic.AddUint32(&p.put, 1)
 
 	pool := p.pool[p.poolNum(cap(b))]
-	defer func() {
-		recover()
-	}()
 	select {
 	case pool <- b:
 	default:
@@ -171,10 +182,12 @@ func (p *BufferPool) Close() {
 		return
 	}
 
-	select {
-	case p.close <- struct{}{}:
-	default:
+	p.mu.Lock()
+	if !p.closed {
+		p.closed = true
+		p.closeC <- struct{}{}
 	}
+	p.mu.Unlock()
 }
 
 func (p *BufferPool) String() string {
@@ -197,7 +210,8 @@ func (p *BufferPool) drain() {
 				default:
 				}
 			}
-		case <-p.close:
+		case <-p.closeC:
+			close(p.closeC)
 			for _, ch := range p.pool {
 				close(ch)
 			}
@@ -214,7 +228,7 @@ func NewBufferPool(baseline int) *BufferPool {
 	p := &BufferPool{
 		baseline0: baseline,
 		baseline:  [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4},
-		close:     make(chan struct{}, 1),
+		closeC:    make(chan struct{}, 1),
 	}
 	for i, cap := range []int{2, 2, 4, 4, 2, 1} {
 		p.pool[i] = make(chan []byte, cap)

+ 0 - 1
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go

@@ -12,7 +12,6 @@ import (
 )
 
 var (
-	ErrNotFound    = errors.New("leveldb: not found")
 	ErrReleased    = errors.New("leveldb: resource already relesed")
 	ErrHasReleaser = errors.New("leveldb: releaser already defined")
 )

+ 62 - 70
Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go

@@ -7,7 +7,6 @@
 package leveldb
 
 import (
-	"errors"
 	"sync/atomic"
 	"unsafe"
 
@@ -16,19 +15,6 @@ import (
 	"github.com/syndtr/goleveldb/leveldb/util"
 )
 
-var levelMaxSize [kNumLevels]float64
-
-func init() {
-	// Precompute max size of each level
-	for level := range levelMaxSize {
-		res := float64(10 * 1048576)
-		for n := level; n > 1; n-- {
-			res *= 10
-		}
-		levelMaxSize[level] = res
-	}
-}
-
 type tSet struct {
 	level int
 	table *tFile
@@ -37,7 +23,7 @@ type tSet struct {
 type version struct {
 	s *session
 
-	tables [kNumLevels]tFiles
+	tables []tFiles
 
 	// Level that should be compacted next and its compaction score.
 	// Score < 1 means compaction is not strictly needed. These fields
@@ -47,11 +33,16 @@ type version struct {
 
 	cSeek unsafe.Pointer
 
-	ref  int
+	ref int
+	// Succeeding version.
 	next *version
 }
 
-func (v *version) release_NB() {
+func newVersion(s *session) *version {
+	return &version{s: s, tables: make([]tFiles, s.o.GetNumLevel())}
+}
+
+func (v *version) releaseNB() {
 	v.ref--
 	if v.ref > 0 {
 		return
@@ -77,13 +68,13 @@ func (v *version) release_NB() {
 		}
 	}
 
-	v.next.release_NB()
+	v.next.releaseNB()
 	v.next = nil
 }
 
 func (v *version) release() {
 	v.s.vmu.Lock()
-	v.release_NB()
+	v.releaseNB()
 	v.s.vmu.Unlock()
 }
 
@@ -130,10 +121,11 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool,
 		tset  *tSet
 		tseek bool
 
-		l0found bool
-		l0seq   uint64
-		l0vt    vType
-		l0val   []byte
+		// Level-0.
+		zfound bool
+		zseq   uint64
+		zkt    kType
+		zval   []byte
 	)
 
 	err = ErrNotFound
@@ -150,55 +142,52 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool,
 			}
 		}
 
-		ikey__, val_, err_ := v.s.tops.find(t, ikey, ro)
-		switch err_ {
+		fikey, fval, ferr := v.s.tops.find(t, ikey, ro)
+		switch ferr {
 		case nil:
 		case ErrNotFound:
 			return true
 		default:
-			err = err_
+			err = ferr
 			return false
 		}
 
-		ikey_ := iKey(ikey__)
-		if seq, vt, ok := ikey_.parseNum(); ok {
-			if v.s.icmp.uCompare(ukey, ikey_.ukey()) != 0 {
-				return true
-			}
-
-			if level == 0 {
-				if seq >= l0seq {
-					l0found = true
-					l0seq = seq
-					l0vt = vt
-					l0val = val_
-				}
-			} else {
-				switch vt {
-				case tVal:
-					value = val_
-					err = nil
-				case tDel:
-				default:
-					panic("leveldb: invalid internal key type")
+		if fukey, fseq, fkt, fkerr := parseIkey(fikey); fkerr == nil {
+			if v.s.icmp.uCompare(ukey, fukey) == 0 {
+				if level == 0 {
+					if fseq >= zseq {
+						zfound = true
+						zseq = fseq
+						zkt = fkt
+						zval = fval
+					}
+				} else {
+					switch fkt {
+					case ktVal:
+						value = fval
+						err = nil
+					case ktDel:
+					default:
+						panic("leveldb: invalid iKey type")
+					}
+					return false
 				}
-				return false
 			}
 		} else {
-			err = errors.New("leveldb: internal key corrupted")
+			err = fkerr
 			return false
 		}
 
 		return true
 	}, func(level int) bool {
-		if l0found {
-			switch l0vt {
-			case tVal:
-				value = l0val
+		if zfound {
+			switch zkt {
+			case ktVal:
+				value = zval
 				err = nil
-			case tDel:
+			case ktDel:
 			default:
-				panic("leveldb: invalid internal key type")
+				panic("leveldb: invalid iKey type")
 			}
 			return false
 		}
@@ -216,13 +205,13 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it
 		its = append(its, it)
 	}
 
-	strict := v.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator)
+	strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader)
 	for _, tables := range v.tables[1:] {
 		if len(tables) == 0 {
 			continue
 		}
 
-		it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict, true)
+		it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict)
 		its = append(its, it)
 	}
 
@@ -230,7 +219,7 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it
 }
 
 func (v *version) newStaging() *versionStaging {
-	return &versionStaging{base: v}
+	return &versionStaging{base: v, tables: make([]tablesScratch, v.s.o.GetNumLevel())}
 }
 
 // Spawn a new version based on this version.
@@ -285,12 +274,13 @@ func (v *version) offsetOf(ikey iKey) (n uint64, err error) {
 func (v *version) pickLevel(umin, umax []byte) (level int) {
 	if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) {
 		var overlaps tFiles
-		for ; level < kMaxMemCompactLevel; level++ {
+		maxLevel := v.s.o.GetMaxMemCompationLevel()
+		for ; level < maxLevel; level++ {
 			if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) {
 				break
 			}
 			overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false)
-			if overlaps.size() > kMaxGrandParentOverlapBytes {
+			if overlaps.size() > uint64(v.s.o.GetCompactionGPOverlaps(level)) {
 				break
 			}
 		}
@@ -318,9 +308,9 @@ func (v *version) computeCompaction() {
 			// file size is small (perhaps because of a small write-buffer
 			// setting, or very high compression ratios, or lots of
 			// overwrites/deletions).
-			score = float64(len(tables)) / kL0_CompactionTrigger
+			score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger())
 		} else {
-			score = float64(tables.size()) / levelMaxSize[level]
+			score = float64(tables.size()) / float64(v.s.o.GetCompactionTotalSize(level))
 		}
 
 		if score > bestScore {
@@ -337,12 +327,14 @@ func (v *version) needCompaction() bool {
 	return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil
 }
 
+type tablesScratch struct {
+	added   map[uint64]atRecord
+	deleted map[uint64]struct{}
+}
+
 type versionStaging struct {
 	base   *version
-	tables [kNumLevels]struct {
-		added   map[uint64]ntRecord
-		deleted map[uint64]struct{}
-	}
+	tables []tablesScratch
 }
 
 func (p *versionStaging) commit(r *sessionRecord) {
@@ -367,7 +359,7 @@ func (p *versionStaging) commit(r *sessionRecord) {
 		tm := &(p.tables[r.level])
 
 		if tm.added == nil {
-			tm.added = make(map[uint64]ntRecord)
+			tm.added = make(map[uint64]atRecord)
 		}
 		tm.added[r.num] = r
 
@@ -379,7 +371,7 @@ func (p *versionStaging) commit(r *sessionRecord) {
 
 func (p *versionStaging) finish() *version {
 	// Build new version.
-	nv := &version{s: p.base.s}
+	nv := newVersion(p.base.s)
 	for level, tm := range p.tables {
 		btables := p.base.tables[level]
 
@@ -402,7 +394,7 @@ func (p *versionStaging) finish() *version {
 
 		// New tables.
 		for _, r := range tm.added {
-			nt = append(nt, r.makeFile(p.base.s))
+			nt = append(nt, p.base.s.tableFileFromRecord(r))
 		}
 
 		// Sort tables.
@@ -429,7 +421,7 @@ func (vr *versionReleaser) Release() {
 	v := vr.v
 	v.s.vmu.Lock()
 	if !vr.once {
-		v.release_NB()
+		v.releaseNB()
 		vr.once = true
 	}
 	v.s.vmu.Unlock()

+ 0 - 0
Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/decode.go → Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go


+ 0 - 0
Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/encode.go → Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go


+ 0 - 0
Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/snappy.go → Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go


+ 0 - 0
Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/snappy_test.go → Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go