123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304 |
- package strmatcher
- import (
- "math/bits"
- "regexp"
- "sort"
- "strings"
- "unsafe"
- )
// PrimeRK is the multiplier (base) of the Rabin-Karp style polynomial hash
// computed by RollingHash.
const PrimeRK = 16777619

// RollingHash returns the polynomial hash of s, accumulated from the last
// byte towards the first with PrimeRK as the base and with natural uint32
// wrap-around.
//
// Because the first byte is folded in last, the hash of c+s can be derived
// from the hash of s as RollingHash(s)*PrimeRK + uint32(c); this lets callers
// extend a suffix hash one byte at a time while scanning a string backwards.
func RollingHash(s string) uint32 {
	var h uint32
	for i := len(s); i > 0; i-- {
		h = h*PrimeRK + uint32(s[i-1])
	}
	return h
}
- // A MphMatcherGroup is divided into three parts:
- // 1. `full` and `domain` patterns are matched by Rabin-Karp algorithm and minimal perfect hash table;
- // 2. `substr` patterns are matched by ac automaton;
- // 3. `regex` patterns are matched with the regex library.
- type MphMatcherGroup struct {
- ac *ACAutomaton
- otherMatchers []matcherEntry
- rules []string
- level0 []uint32
- level0Mask int
- level1 []uint32
- level1Mask int
- count uint32
- ruleMap *map[string]uint32
- }
- func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
- h := RollingHash(pattern)
- switch t {
- case Domain:
- (*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
- fallthrough
- case Full:
- (*g.ruleMap)[pattern] = h
- default:
- }
- }
- func NewMphMatcherGroup() *MphMatcherGroup {
- return &MphMatcherGroup{
- ac: nil,
- otherMatchers: nil,
- rules: nil,
- level0: nil,
- level0Mask: 0,
- level1: nil,
- level1Mask: 0,
- count: 1,
- ruleMap: &map[string]uint32{},
- }
- }
- // AddPattern adds a pattern to MphMatcherGroup
- func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
- switch t {
- case Substr:
- if g.ac == nil {
- g.ac = NewACAutomaton()
- }
- g.ac.Add(pattern, t)
- case Full, Domain:
- pattern = strings.ToLower(pattern)
- g.AddFullOrDomainPattern(pattern, t)
- case Regex:
- r, err := regexp.Compile(pattern)
- if err != nil {
- return 0, err
- }
- g.otherMatchers = append(g.otherMatchers, matcherEntry{
- m: ®exMatcher{pattern: r},
- id: g.count,
- })
- default:
- panic("Unknown type")
- }
- return g.count, nil
- }
- // Build builds a minimal perfect hash table and ac automaton from insert rules
- func (g *MphMatcherGroup) Build() {
- if g.ac != nil {
- g.ac.Build()
- }
- keyLen := len(*g.ruleMap)
- if keyLen == 0 {
- keyLen = 1
- (*g.ruleMap)["empty___"] = RollingHash("empty___")
- }
- g.level0 = make([]uint32, nextPow2(keyLen/4))
- g.level0Mask = len(g.level0) - 1
- g.level1 = make([]uint32, nextPow2(keyLen))
- g.level1Mask = len(g.level1) - 1
- sparseBuckets := make([][]int, len(g.level0))
- var ruleIdx int
- for rule, hash := range *g.ruleMap {
- n := int(hash) & g.level0Mask
- g.rules = append(g.rules, rule)
- sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
- ruleIdx++
- }
- g.ruleMap = nil
- var buckets []indexBucket
- for n, vals := range sparseBuckets {
- if len(vals) > 0 {
- buckets = append(buckets, indexBucket{n, vals})
- }
- }
- sort.Sort(bySize(buckets))
- occ := make([]bool, len(g.level1))
- var tmpOcc []int
- for _, bucket := range buckets {
- seed := uint32(0)
- for {
- findSeed := true
- tmpOcc = tmpOcc[:0]
- for _, i := range bucket.vals {
- n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
- if occ[n] {
- for _, n := range tmpOcc {
- occ[n] = false
- }
- seed++
- findSeed = false
- break
- }
- occ[n] = true
- tmpOcc = append(tmpOcc, n)
- g.level1[n] = uint32(i)
- }
- if findSeed {
- g.level0[bucket.n] = seed
- break
- }
- }
- }
- }
// nextPow2 returns 1 for v <= 1 and otherwise the smallest power of two that
// is strictly greater than v (so an exact power of two is doubled:
// nextPow2(4) == 8).
func nextPow2(v int) int {
	if v <= 1 {
		return 1
	}
	// bits.Len(v) is the position just above v's highest set bit.
	return int(uint(1) << uint(bits.Len(uint(v))))
}
- // Lookup searches for s in t and returns its index and whether it was found.
- func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
- i0 := int(h) & g.level0Mask
- seed := g.level0[i0]
- i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
- n := g.level1[i1]
- return s == g.rules[int(n)]
- }
- // Match implements IndexMatcher.Match.
- func (g *MphMatcherGroup) Match(pattern string) []uint32 {
- result := []uint32{}
- hash := uint32(0)
- for i := len(pattern) - 1; i >= 0; i-- {
- hash = hash*PrimeRK + uint32(pattern[i])
- if pattern[i] == '.' {
- if g.Lookup(hash, pattern[i:]) {
- result = append(result, 1)
- return result
- }
- }
- }
- if g.Lookup(hash, pattern) {
- result = append(result, 1)
- return result
- }
- if g.ac != nil && g.ac.Match(pattern) {
- result = append(result, 1)
- return result
- }
- for _, e := range g.otherMatchers {
- if e.m.Match(pattern) {
- result = append(result, e.id)
- return result
- }
- }
- return nil
- }
// indexBucket is one non-empty first-level bucket: n is the bucket number
// and vals are the indices of the rules that fall into it.
type indexBucket struct {
	n    int
	vals []int
}

// bySize implements sort.Interface and orders buckets by descending number
// of entries, so the most populated buckets come first.
type bySize []indexBucket

// Len returns the number of buckets.
func (s bySize) Len() int {
	return len(s)
}

// Less reports whether bucket i holds more entries than bucket j.
func (s bySize) Less(i, j int) bool {
	return len(s[j].vals) < len(s[i].vals)
}

// Swap exchanges buckets i and j.
func (s bySize) Swap(i, j int) {
	s[j], s[i] = s[i], s[j]
}
- type stringStruct struct {
- str unsafe.Pointer
- len int
- }
- func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
- x := (*stringStruct)(a)
- return memhashFallback(x.str, h, uintptr(x.len))
- }
- const (
- // Constants for multiplication: four random odd 64-bit numbers.
- m1 = 16877499708836156737
- m2 = 2820277070424839065
- m3 = 9497967016996688599
- m4 = 15839092249703872147
- )
- var hashkey = [4]uintptr{1, 1, 1, 1}
- func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
- h := uint64(seed + s*hashkey[0])
- tail:
- switch {
- case s == 0:
- case s < 4:
- h ^= uint64(*(*byte)(p))
- h ^= uint64(*(*byte)(add(p, s>>1))) << 8
- h ^= uint64(*(*byte)(add(p, s-1))) << 16
- h = rotl31(h*m1) * m2
- case s <= 8:
- h ^= uint64(readUnaligned32(p))
- h ^= uint64(readUnaligned32(add(p, s-4))) << 32
- h = rotl31(h*m1) * m2
- case s <= 16:
- h ^= readUnaligned64(p)
- h = rotl31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl31(h*m1) * m2
- case s <= 32:
- h ^= readUnaligned64(p)
- h = rotl31(h*m1) * m2
- h ^= readUnaligned64(add(p, 8))
- h = rotl31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-16))
- h = rotl31(h*m1) * m2
- h ^= readUnaligned64(add(p, s-8))
- h = rotl31(h*m1) * m2
- default:
- v1 := h
- v2 := uint64(seed * hashkey[1])
- v3 := uint64(seed * hashkey[2])
- v4 := uint64(seed * hashkey[3])
- for s >= 32 {
- v1 ^= readUnaligned64(p)
- v1 = rotl31(v1*m1) * m2
- p = add(p, 8)
- v2 ^= readUnaligned64(p)
- v2 = rotl31(v2*m2) * m3
- p = add(p, 8)
- v3 ^= readUnaligned64(p)
- v3 = rotl31(v3*m3) * m4
- p = add(p, 8)
- v4 ^= readUnaligned64(p)
- v4 = rotl31(v4*m4) * m1
- p = add(p, 8)
- s -= 32
- }
- h = v1 ^ v2 ^ v3 ^ v4
- goto tail
- }
- h ^= h >> 29
- h *= m3
- h ^= h >> 32
- return uintptr(h)
- }
- func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
- return unsafe.Pointer(uintptr(p) + x)
- }
// readUnaligned32 assembles the 4 bytes at p into a uint32 in little-endian
// order (the byte at p is the least significant). p needs no particular
// alignment because the bytes are read individually.
func readUnaligned32(p unsafe.Pointer) uint32 {
	b := (*[4]byte)(p)
	v := uint32(b[0])
	v |= uint32(b[1]) << 8
	v |= uint32(b[2]) << 16
	v |= uint32(b[3]) << 24
	return v
}
// rotl31 rotates x left by 31 bits.
func rotl31(x uint64) uint64 {
	return bits.RotateLeft64(x, 31)
}
// readUnaligned64 assembles the 8 bytes at p into a uint64 in little-endian
// order (the byte at p is the least significant). p needs no particular
// alignment because the bytes are read individually.
func readUnaligned64(p unsafe.Pointer) uint64 {
	b := (*[8]byte)(p)
	v := uint64(b[0])
	v |= uint64(b[1]) << 8
	v |= uint64(b[2]) << 16
	v |= uint64(b[3]) << 24
	v |= uint64(b[4]) << 32
	v |= uint64(b[5]) << 40
	v |= uint64(b[6]) << 48
	v |= uint64(b[7]) << 56
	return v
}
|