
Merge pull request #37 from merryChris/master

Implement batch insertion and removal of index entries, and add unit tests
Hui Chen 9 years ago
parent commit dd08355e61

+ 7 - 0
.gitignore

@@ -0,0 +1,7 @@
+# Ignore vim swap and backup files
+*.swo
+*.swp
+*~
+
+# Ignore weibo data file
+testdata/weibo_data.txt

+ 227 - 85
core/indexer.go

@@ -5,6 +5,7 @@ import (
 	"github.com/huichen/wukong/utils"
 	"log"
 	"math"
+	"sort"
 	"sync"
 )
 
@@ -14,8 +15,18 @@ type Indexer struct {
	// Guarded by a read-write lock for safe concurrent access
 	tableLock struct {
 		sync.RWMutex
-		table map[string]*KeywordIndices
-		docs  map[uint64]bool
+		table     map[string]*KeywordIndices
+		docsState map[uint64]int // 0: in the index, 1: pending removal, 2: pending insertion
+	}
+	addCacheLock struct {
+		sync.RWMutex
+		addCachePointer int
+		addCache        types.DocumentsIndex
+	}
+	removeCacheLock struct {
+		sync.RWMutex
+		removeCachePointer int
+		removeCache        types.DocumentsId
 	}
 
 	initOptions types.IndexerInitOptions
@@ -33,6 +44,9 @@ type Indexer struct {
 
// One row of the inverted index table: collects all documents in which a keyword appears, sorted by ascending DocId.
 type KeywordIndices struct {
+	// Marks the lower bound for the binary search in docIds[] during batch insertion
+	lowerBound int
+
	// Whether the slices below are empty depends on the IndexType set at initialization
	docIds      []uint64  // present for all index types
 	frequencies []float32 // IndexType == FrequenciesIndex
@@ -44,87 +58,238 @@ func (indexer *Indexer) Init(options types.IndexerInitOptions) {
 	if indexer.initialized == true {
 		log.Fatal("索引器不能初始化两次")
 	}
+	options.Init()
+	indexer.initOptions = options
 	indexer.initialized = true
 
 	indexer.tableLock.table = make(map[string]*KeywordIndices)
-	indexer.tableLock.docs = make(map[uint64]bool)
-	indexer.initOptions = options
+	indexer.tableLock.docsState = make(map[uint64]int)
+	indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
+	indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
 	indexer.docTokenLengths = make(map[uint64]float32)
 }
 
-// Add a document to the inverted index table
-func (indexer *Indexer) AddDocument(document *types.DocumentIndex) {
+// Get the DocId of the i-th document in a KeywordIndices
+func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
+	return ti.docIds[i]
+}
+
+// Get the total number of documents in a KeywordIndices
+func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
+	return len(ti.docIds)
+}
+
+// Add a document to ADDCACHE
+func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceUpdate bool) {
 	if indexer.initialized == false {
 		log.Fatal("索引器尚未初始化")
 	}
 
-	indexer.tableLock.Lock()
-	defer indexer.tableLock.Unlock()
+	indexer.addCacheLock.Lock()
+	if document != nil {
+		indexer.addCacheLock.addCache[indexer.addCacheLock.addCachePointer] = document
+		indexer.addCacheLock.addCachePointer++
+	}
+	if indexer.addCacheLock.addCachePointer >= indexer.initOptions.DocCacheSize || forceUpdate {
+		indexer.tableLock.Lock()
+		position := 0
+		for i := 0; i < indexer.addCacheLock.addCachePointer; i++ {
+			docIndex := indexer.addCacheLock.addCache[i]
+			if docState, ok := indexer.tableLock.docsState[docIndex.DocId]; ok && docState == 0 {
+				if position != i {
+					indexer.addCacheLock.addCache[position], indexer.addCacheLock.addCache[i] =
+						indexer.addCacheLock.addCache[i], indexer.addCacheLock.addCache[position]
+				}
+				indexer.removeCacheLock.Lock()
+				indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] =
+					docIndex.DocId
+				indexer.removeCacheLock.removeCachePointer++
+				indexer.removeCacheLock.Unlock()
+				indexer.tableLock.docsState[docIndex.DocId] = 1
+				indexer.numDocuments--
+				position++
+			} else if !(ok && docState == 1) {
+				// ok && docState == 1 表示等待删除或者删除当前 doc
+				indexer.tableLock.docsState[docIndex.DocId] = 2
+			}
+		}
 
-	// Update the total token length of the document
-	if document.TokenLength != 0 {
-		originalLength, found := indexer.docTokenLengths[document.DocId]
-		indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
-		if found {
-			indexer.totalTokenLength += document.TokenLength - originalLength
-		} else {
-			indexer.totalTokenLength += document.TokenLength
+		indexer.tableLock.Unlock()
+		if indexer.RemoveDocumentToCache(0, forceUpdate) {
+			position = 0
 		}
+
+		addCachedDocuments := indexer.addCacheLock.addCache[position:indexer.addCacheLock.addCachePointer]
+		indexer.addCacheLock.addCachePointer = position
+		indexer.addCacheLock.Unlock()
+	sort.Stable(addCachedDocuments)
+		indexer.AddDocuments(&addCachedDocuments)
+	} else {
+		indexer.addCacheLock.Unlock()
 	}
+}
 
-	docIdIsNew := true
-	for _, keyword := range document.Keywords {
-		indices, foundKeyword := indexer.tableLock.table[keyword.Text]
-		if !foundKeyword {
-			// If the keyword is not found, add it
-			ti := KeywordIndices{}
-			switch indexer.initOptions.IndexType {
-			case types.LocationsIndex:
-				ti.locations = [][]int{keyword.Starts}
-			case types.FrequenciesIndex:
-				ti.frequencies = []float32{keyword.Frequency}
-			}
-			ti.docIds = []uint64{document.DocId}
-			indexer.tableLock.table[keyword.Text] = &ti
+// Add all documents in ADDCACHE to the inverted index table
+func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
+	if indexer.initialized == false {
+		log.Fatal("索引器尚未初始化")
+	}
+
+	indexer.tableLock.Lock()
+	defer indexer.tableLock.Unlock()
+	for _, indices := range indexer.tableLock.table {
+		indices.lowerBound = 0
+	}
+
+	// Insert documents in ascending DocId order to minimize element moves in the index
+	for i, document := range *documents {
+		if i < len(*documents)-1 && (*documents)[i].DocId == (*documents)[i+1].DocId {
+			// If duplicate documents were added, keep only the last one (the sort is stable)
+			continue
+		}
+		if docState, ok := indexer.tableLock.docsState[document.DocId]; ok && docState == 1 {
+			// If docState is still 1 at this point, the document is to be removed
 			continue
 		}
 
-		// Find the position to insert at
-		position, found := indexer.searchIndex(
-			indices, 0, indexer.getIndexLength(indices)-1, document.DocId)
-		if found {
-			docIdIsNew = false
+		// Update the total token length of the document
+		if document.TokenLength != 0 {
+			indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
+			indexer.totalTokenLength += document.TokenLength
+		}
+
+		docIdIsNew := true
+		for _, keyword := range document.Keywords {
+			indices, foundKeyword := indexer.tableLock.table[keyword.Text]
+			if !foundKeyword {
+				// If the keyword is not found, add it
+				ti := KeywordIndices{}
+				switch indexer.initOptions.IndexType {
+				case types.LocationsIndex:
+					ti.locations = [][]int{keyword.Starts}
+				case types.FrequenciesIndex:
+					ti.frequencies = []float32{keyword.Frequency}
+				}
+				ti.docIds = []uint64{document.DocId}
+				indexer.tableLock.table[keyword.Text] = &ti
+				continue
+			}
 
-			// Overwrite the existing index entry
+			// Find the position to insert at; the entry is guaranteed not to exist yet
+			position, _ := indexer.searchIndex(
+				indices, indices.lowerBound, indexer.getIndexLength(indices)-1, document.DocId)
+			indices.lowerBound = position
 			switch indexer.initOptions.IndexType {
 			case types.LocationsIndex:
+				indices.locations = append(indices.locations, []int{})
+				copy(indices.locations[position+1:], indices.locations[position:])
 				indices.locations[position] = keyword.Starts
 			case types.FrequenciesIndex:
+				indices.frequencies = append(indices.frequencies, float32(0))
+				copy(indices.frequencies[position+1:], indices.frequencies[position:])
 				indices.frequencies[position] = keyword.Frequency
 			}
-			continue
+			indices.docIds = append(indices.docIds, 0)
+			copy(indices.docIds[position+1:], indices.docIds[position:])
+			indices.docIds[position] = document.DocId
+		}
+
+		// Update document state and total count
+		if docIdIsNew {
+			indexer.tableLock.docsState[document.DocId] = 0
+			indexer.numDocuments++
 		}
+	}
+}
+
+// Add a document pending removal to REMOVECACHE
+func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
+	if indexer.initialized == false {
+		log.Fatal("索引器尚未初始化")
+	}
 
-		// When the index entry does not exist, insert a new one
-		switch indexer.initOptions.IndexType {
-		case types.LocationsIndex:
-			indices.locations = append(indices.locations, []int{})
-			copy(indices.locations[position+1:], indices.locations[position:])
-			indices.locations[position] = keyword.Starts
-		case types.FrequenciesIndex:
-			indices.frequencies = append(indices.frequencies, float32(0))
-			copy(indices.frequencies[position+1:], indices.frequencies[position:])
-			indices.frequencies[position] = keyword.Frequency
+	indexer.removeCacheLock.Lock()
+	if docId != 0 {
+		indexer.tableLock.Lock()
+		if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
+			indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
+			indexer.removeCacheLock.removeCachePointer++
+			indexer.tableLock.docsState[docId] = 1
+			indexer.numDocuments--
+		} else if !ok {
+			// Remove a document that is still pending insertion
+			indexer.tableLock.docsState[docId] = 1
 		}
-		indices.docIds = append(indices.docIds, 0)
-		copy(indices.docIds[position+1:], indices.docIds[position:])
-		indices.docIds[position] = document.DocId
+		indexer.tableLock.Unlock()
+	}
+
+	if indexer.removeCacheLock.removeCachePointer > 0 &&
+		(indexer.removeCacheLock.removeCachePointer >= indexer.initOptions.DocCacheSize ||
+			forceUpdate) {
+		removeCachedDocuments := indexer.removeCacheLock.removeCache[:indexer.removeCacheLock.removeCachePointer]
+		indexer.removeCacheLock.removeCachePointer = 0
+		indexer.removeCacheLock.Unlock()
+		sort.Sort(removeCachedDocuments)
+		indexer.RemoveDocuments(&removeCachedDocuments)
+		return true
+	}
+	indexer.removeCacheLock.Unlock()
+	return false
+}
+
+// Remove all documents in REMOVECACHE from the inverted index table
+func (indexer *Indexer) RemoveDocuments(documents *types.DocumentsId) {
+	if indexer.initialized == false {
+		log.Fatal("索引器尚未初始化")
+	}
+
+	indexer.tableLock.Lock()
+	defer indexer.tableLock.Unlock()
+
+	// Update the total token length and delete the document state
+	for _, docId := range *documents {
+		indexer.totalTokenLength -= indexer.docTokenLengths[docId]
+		delete(indexer.docTokenLengths, docId)
+		delete(indexer.tableLock.docsState, docId)
 	}
 
-	// Update the total document count
-	if docIdIsNew {
-		indexer.tableLock.docs[document.DocId] = true
-		indexer.numDocuments++
+	for keyword, indices := range indexer.tableLock.table {
+		indicesTop, indicesPointer := 0, 0
+		documentsPointer := sort.Search(
+			len(*documents), func(i int) bool { return (*documents)[i] >= indices.docIds[0] })
+		// Two-pointer scan performing the batch removal
+		for ; documentsPointer < len(*documents) &&
+			indicesPointer < indexer.getIndexLength(indices); indicesPointer++ {
+			if indices.docIds[indicesPointer] < (*documents)[documentsPointer] {
+				if indicesTop != indicesPointer {
+					switch indexer.initOptions.IndexType {
+					case types.LocationsIndex:
+						indices.locations[indicesTop] = indices.locations[indicesPointer]
+					case types.FrequenciesIndex:
+						indices.frequencies[indicesTop] = indices.frequencies[indicesPointer]
+					}
+					indices.docIds[indicesTop] = indices.docIds[indicesPointer]
+				}
+				indicesTop++
+			} else {
+				documentsPointer++
+			}
+		}
+		if indicesTop != indicesPointer {
+			switch indexer.initOptions.IndexType {
+			case types.LocationsIndex:
+				indices.locations = append(
+					indices.locations[:indicesTop], indices.locations[indicesPointer:]...)
+			case types.FrequenciesIndex:
+				indices.frequencies = append(
+					indices.frequencies[:indicesTop], indices.frequencies[indicesPointer:]...)
+			}
+			indices.docIds = append(
+				indices.docIds[:indicesTop], indices.docIds[indicesPointer:]...)
+		}
+		if len(indices.docIds) == 0 {
+			delete(indexer.tableLock.table, keyword)
+		}
 	}
 }
 
@@ -177,8 +342,7 @@ func (indexer *Indexer) Lookup(
		// Use documents containing the first keyword as the baseline, and scan the other keywords for the same document
 		baseDocId := indexer.getDocId(table[0], indexPointers[0])
 		if docIds != nil {
-			_, found := docIds[baseDocId]
-			if !found {
+			if _, found := docIds[baseDocId]; !found {
 				continue
 			}
 		}
@@ -208,7 +372,7 @@ func (indexer *Indexer) Lookup(
 		}
 
 		if found {
-			if _, ok := indexer.tableLock.docs[baseDocId]; !ok {
+			if docState, ok := indexer.tableLock.docsState[baseDocId]; !ok || docState != 0 {
 				continue
 			}
 			indexedDoc := types.IndexedDocument{}
@@ -233,7 +397,7 @@ func (indexer *Indexer) Lookup(
 				}
 
				// Compute the proximity of the keywords within the document
-				tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
+				tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], &indexPointers, tokens)
 				indexedDoc.TokenProximity = int32(tokenProximity)
 				indexedDoc.TokenSnippetLocations = tokenLocations
 
@@ -323,7 +487,7 @@ func (indexer *Indexer) searchIndex(
 //
// Implemented with dynamic programming: compute, in turn, the optimum for the first i tokens at each occurrence position.
// The selected P_i are passed back through the tokenLocations parameter.
-func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
+func computeTokenProximity(table []*KeywordIndices, indexPointers *[]int, tokens []string) (
 	minTokenProximity int, tokenLocations []int) {
 	minTokenProximity = -1
 	tokenLocations = make([]int, len(tokens))
@@ -337,14 +501,14 @@ func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens
	// Initialize the path array
 	path = make([][]int, len(tokens))
 	for i := 1; i < len(path); i++ {
-		path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
+		path[i] = make([]int, len(table[i].locations[(*indexPointers)[i]]))
 	}
 
	// Dynamic programming
-	currentLocations = table[0].locations[indexPointers[0]]
+	currentLocations = table[0].locations[(*indexPointers)[0]]
 	currentMinValues = make([]int, len(currentLocations))
 	for i := 1; i < len(tokens); i++ {
-		nextLocations = table[i].locations[indexPointers[i]]
+		nextLocations = table[i].locations[(*indexPointers)[i]]
 		nextMinValues = make([]int, len(nextLocations))
 		for j, _ := range nextMinValues {
 			nextMinValues[j] = -1
@@ -396,29 +560,7 @@ func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens
 		if i != len(tokens)-1 {
 			cursor = path[i+1][cursor]
 		}
-		tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
+		tokenLocations[i] = table[i].locations[(*indexPointers)[i]][cursor]
 	}
 	return
 }
-
-// Get the DocId of the i-th document in a KeywordIndices
-func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
-	return ti.docIds[i]
-}
-
-// Get the total number of documents in a KeywordIndices
-func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
-	return len(ti.docIds)
-}
-
-// Remove a document
-func (indexer *Indexer) RemoveDoc(docId uint64) {
-	if indexer.initialized == false {
-		log.Fatal("排序器尚未初始化")
-	}
-
-	indexer.tableLock.Lock()
-	delete(indexer.tableLock.docs, docId)
-	indexer.numDocuments--
-	indexer.tableLock.Unlock()
-}
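The new write path is cache-first: AddDocumentToCache and RemoveDocumentToCache only fill ADDCACHE and REMOVECACHE, and the buffers are merged into the inverted index when a cache fills up or forceUpdate is passed. A minimal usage sketch against the API added above (driving the indexer directly, as the unit tests below do; in production the engine drives it through its worker goroutines):

package main

import (
	"github.com/huichen/wukong/core"
	"github.com/huichen/wukong/types"
)

func main() {
	var indexer core.Indexer
	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})

	// Buffered only: both documents sit in ADDCACHE, the inverted
	// index itself is untouched so far.
	indexer.AddDocumentToCache(&types.DocumentIndex{
		DocId:    1,
		Keywords: []types.KeywordIndex{{"token1", 0, []int{0}}},
	}, false)
	indexer.AddDocumentToCache(&types.DocumentIndex{
		DocId:    2,
		Keywords: []types.KeywordIndex{{"token2", 0, []int{0}}},
	}, false)

	// Queue document 1 for removal, then flush: a nil document with
	// forceUpdate == true drains ADDCACHE, which in turn drains
	// REMOVECACHE, so document 1 never reaches the index.
	indexer.RemoveDocumentToCache(1, false)
	indexer.AddDocumentToCache(nil, true)
}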

+ 217 - 147
core/indexer_test.go

@@ -9,200 +9,259 @@ import (
 func TestAddKeywords(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
-	indexer.AddDocument(&types.DocumentIndex{
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId:    1,
 		Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:    7,
-		Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
+	}, false)
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId:    2,
-		Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:    3,
 		Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:    1,
-		Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
+	}, false)
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId:    3,
+		Keywords: []types.KeywordIndex{{"token3", 0, []int{}}},
+	}, false)
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId:    7,
+		Keywords: []types.KeywordIndex{{"token7", 0, []int{}}},
+	}, false)
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId:    1,
 		Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:    2,
-		Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
-	})
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:    0,
-		Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
-	})
+	}, false)
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId:    7,
+		Keywords: []types.KeywordIndex{{"token77", 0, []int{}}},
+	}, false)
+	indexer.AddDocumentToCache(nil, true)
 
-	utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
-	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "3 ", indicesToString(&indexer, "token3"))
+	utils.Expect(t, "7 ", indicesToString(&indexer, "token77"))
 }
 
-func TestLookup(t *testing.T) {
+func TestRemoveDocument(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
-	// doc0 = "token2 token3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
 		},
-	})
-	// doc1 = "token1 token2 token3"
-	indexer.AddDocument(&types.DocumentIndex{
+	}, false)
+	// doc2 = "token1 token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 2,
+		Keywords: []types.KeywordIndex{
+			{"token1", 0, []int{0}},
+			{"token2", 0, []int{7}},
+		},
+	}, true)
+	utils.Expect(t, "2 ", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "1 ", indicesToString(&indexer, "token3"))
+
+	indexer.RemoveDocumentToCache(2, false)
+	// doc1 = "token1 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 1,
+		Keywords: []types.KeywordIndex{
+			{"token1", 0, []int{0}},
+			{"token3", 0, []int{7}},
+		},
+	}, true)
+	utils.Expect(t, "1 ", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "1 ", indicesToString(&indexer, "token3"))
+
+	// doc2 = "token1 token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 2,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
 			{"token3", 0, []int{14}},
 		},
-	})
-	// doc2 = "token1 token2"
-	indexer.AddDocument(&types.DocumentIndex{
+	}, true)
+	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "2 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token3"))
+
+	// doc3 = "token1 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 3,
+		Keywords: []types.KeywordIndex{
+			{"token1", 0, []int{0}},
+			{"token2", 0, []int{7}},
+		},
+	}, false)
+	indexer.RemoveDocumentToCache(3, false)
+	indexer.AddDocumentToCache(nil, true)
+	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "2 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token3"))
+}
+
+func TestLookupLocationsIndex(t *testing.T) {
+	var indexer Indexer
+	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
+	// doc1 = "token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
+		Keywords: []types.KeywordIndex{
+			{"token2", 0, []int{0}},
+			{"token3", 0, []int{7}},
+		},
+	}, false)
+	// doc2 = "token1 token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 2,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
+			{"token3", 0, []int{14}},
 		},
-	})
-	// doc3 = "token2"
-	indexer.AddDocument(&types.DocumentIndex{
+	}, false)
+	// doc3 = "token1 token2"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 3,
+		Keywords: []types.KeywordIndex{
+			{"token1", 0, []int{0}},
+			{"token2", 0, []int{7}},
+		},
+	}, false)
+	// doc4 = "token2"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 4,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 		},
-	})
+	}, false)
 	// doc7 = "token1 token3"
-	indexer.AddDocument(&types.DocumentIndex{
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 7,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token3", 0, []int{7}},
 		},
-	})
+	}, false)
 	// doc9 = "token3"
-	indexer.AddDocument(&types.DocumentIndex{
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 9,
 		Keywords: []types.KeywordIndex{
 			{"token3", 0, []int{0}},
 		},
-	})
+	}, true)
 
-	utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
-	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
-	utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
+	utils.Expect(t, "2 3 7 ", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "1 2 3 4 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "1 2 7 9 ", indicesToString(&indexer, "token3"))
 
 	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
 
-	utils.Expect(t, "[7 0 [0]] [2 0 [0]] [1 0 [0]] ",
+	utils.Expect(t, "[7 0 [0]] [3 0 [0]] [2 0 [0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
 	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
 
-	utils.Expect(t, "[2 1 [0 7]] [1 1 [0 7]] ",
+	utils.Expect(t, "[3 1 [0 7]] [2 1 [0 7]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
-	utils.Expect(t, "[2 13 [7 0]] [1 13 [7 0]] ",
+	utils.Expect(t, "[3 13 [7 0]] [2 13 [7 0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
-	utils.Expect(t, "[7 1 [0 7]] [1 8 [0 14]] ",
+	utils.Expect(t, "[7 1 [0 7]] [2 8 [0 14]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
-	utils.Expect(t, "[7 13 [7 0]] [1 20 [14 0]] ",
+	utils.Expect(t, "[7 13 [7 0]] [2 20 [14 0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
-	utils.Expect(t, "[1 1 [7 14]] [0 1 [0 7]] ",
+	utils.Expect(t, "[2 1 [7 14]] [1 1 [0 7]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
-	utils.Expect(t, "[1 13 [14 7]] [0 13 [7 0]] ",
+	utils.Expect(t, "[2 13 [14 7]] [1 13 [7 0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
 
-	utils.Expect(t, "[1 2 [0 7 14]] ",
+	utils.Expect(t, "[2 2 [0 7 14]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
-	utils.Expect(t, "[1 26 [14 7 0]] ",
+	utils.Expect(t, "[2 26 [14 7 0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
 }
 
-func TestDocIdsIndex(t *testing.T) {
+func TestLookupDocIdsIndex(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
-	// doc0 = "token2 token3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
 		},
-	})
-	// doc1 = "token1 token2 token3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 1,
+	}, false)
+	// doc2 = "token1 token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 2,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
 			{"token3", 0, []int{14}},
 		},
-	})
-	// doc2 = "token1 token2"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 2,
+	}, false)
+	// doc3 = "token1 token2"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 3,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
 		},
-	})
-	// doc3 = "token2"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 3,
+	}, false)
+	// doc4 = "token2"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 4,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 		},
-	})
+	}, false)
 	// doc7 = "token1 token3"
-	indexer.AddDocument(&types.DocumentIndex{
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 7,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token3", 0, []int{7}},
 		},
-	})
+	}, false)
 	// doc9 = "token3"
-	indexer.AddDocument(&types.DocumentIndex{
+	indexer.AddDocumentToCache(&types.DocumentIndex{
 		DocId: 9,
 		Keywords: []types.KeywordIndex{
 			{"token3", 0, []int{0}},
 		},
-	})
+	}, true)
 
-	utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
-	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
-	utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
+	utils.Expect(t, "2 3 7 ", indicesToString(&indexer, "token1"))
+	utils.Expect(t, "1 2 3 4 ", indicesToString(&indexer, "token2"))
+	utils.Expect(t, "1 2 7 9 ", indicesToString(&indexer, "token3"))
 
 	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
 
-	utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ",
+	utils.Expect(t, "[7 0 []] [3 0 []] [2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
 	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
 
-	utils.Expect(t, "[2 0 []] [1 0 []] ",
+	utils.Expect(t, "[3 0 []] [2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
-	utils.Expect(t, "[2 0 []] [1 0 []] ",
+	utils.Expect(t, "[3 0 []] [2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
-	utils.Expect(t, "[7 0 []] [1 0 []] ",
+	utils.Expect(t, "[7 0 []] [2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
-	utils.Expect(t, "[7 0 []] [1 0 []] ",
+	utils.Expect(t, "[7 0 []] [2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
-	utils.Expect(t, "[1 0 []] [0 0 []] ",
+	utils.Expect(t, "[2 0 []] [1 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
-	utils.Expect(t, "[1 0 []] [0 0 []] ",
+	utils.Expect(t, "[2 0 []] [1 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
 
-	utils.Expect(t, "[1 0 []] ",
+	utils.Expect(t, "[2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
-	utils.Expect(t, "[1 0 []] ",
+	utils.Expect(t, "[2 0 []] ",
 		indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
 }
 
@@ -210,69 +269,69 @@ func TestLookupWithProximity(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
 
-	// doc0 = "token2 token4 token4 token2 token3 token4"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "token2 token4 token4 token2 token3 token4"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
 			{"token4", 0, []int{7, 14, 35}},
 		},
-	})
-	utils.Expect(t, "[0 1 [21 28]] ",
+	}, true)
+	utils.Expect(t, "[1 1 [21 28]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
 
-	// doc0 = "t2 t1 . . . t2 t3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "t2 t1 . . . t2 t3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"t1", 0, []int{3}},
 			{"t2", 0, []int{0, 12}},
 			{"t3", 0, []int{15}},
 		},
-	})
-	utils.Expect(t, "[0 8 [3 12 15]] ",
+	}, true)
+	utils.Expect(t, "[1 8 [3 12 15]] ",
 		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
 
-	// doc0 = "t3 t2 t1 . . . . . t2 t3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "t3 t2 t1 . . . . . t2 t3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"t1", 0, []int{6}},
 			{"t2", 0, []int{3, 19}},
 			{"t3", 0, []int{0, 22}},
 		},
-	})
-	utils.Expect(t, "[0 10 [6 3 0]] ",
+	}, true)
+	utils.Expect(t, "[1 10 [6 3 0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
 }
 
 func TestLookupWithPartialLocations(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
-	// doc0 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
 			{"label1", 0, []int{}},
 			{"token4", 0, []int{7, 14, 35}},
 		},
-	})
-	// doc1 = "token2 token4 token4 token2 token3 token4"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 1,
+	}, false)
+	// doc2 = "token2 token4 token4 token2 token3 token4"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 2,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
 			{"token4", 0, []int{7, 14, 35}},
 		},
-	})
+	}, true)
 
-	utils.Expect(t, "0 ", indicesToString(&indexer, "label1"))
+	utils.Expect(t, "1 ", indicesToString(&indexer, "label1"))
 
-	utils.Expect(t, "[0 1 [21 28]] ",
+	utils.Expect(t, "[1 1 [21 28]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil, false)))
 }
 
@@ -285,25 +344,25 @@ func TestLookupWithBM25(t *testing.T) {
 			B:  1,
 		},
 	})
-	// doc0 = "token2 token4 token4 token2 token3 token4"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:       0,
+	// doc1 = "token2 token4 token4 token2 token3 token4"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId:       1,
 		TokenLength: 6,
 		Keywords: []types.KeywordIndex{
 			{"token2", 3, []int{0, 21}},
 			{"token3", 7, []int{28}},
 			{"token4", 15, []int{7, 14, 35}},
 		},
-	})
-	// doc0 = "token6 token7"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId:       1,
+	}, false)
+	// doc2 = "token6 token7"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId:       2,
 		TokenLength: 2,
 		Keywords: []types.KeywordIndex{
 			{"token6", 3, []int{0}},
 			{"token7", 15, []int{7}},
 		},
-	})
+	}, true)
 
 	outputs, _ := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil, false)
 
@@ -314,59 +373,70 @@ func TestLookupWithBM25(t *testing.T) {
 func TestLookupWithinDocIds(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
-	// doc0 = "token2 token3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
 		},
-	})
-	// doc1 = "token1 token2 token3"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 1,
+	}, false)
+	// doc2 = "token1 token2 token3"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 2,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
 			{"token3", 0, []int{14}},
 		},
-	})
-	// doc2 = "token1 token2"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 2,
+	}, false)
+	// doc3 = "token1 token2"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 3,
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
 		},
-	})
-	// doc3 = "token2"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 3,
+	}, false)
+	// doc4 = "token2"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 4,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 		},
-	})
+	}, true)
 
 	docIds := make(map[uint64]bool)
-	docIds[0] = true
-	docIds[2] = true
-	utils.Expect(t, "[2 0 [7]] [0 0 [0]] ",
+	docIds[1] = true
+	docIds[3] = true
+	utils.Expect(t, "[3 0 [7]] [1 0 [0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false)))
 }
 
 func TestLookupWithLocations(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
-	// doc0 = "token2 token4 token4 token2 token3 token4"
-	indexer.AddDocument(&types.DocumentIndex{
-		DocId: 0,
+	// doc1 = "token2 token4 token4 token2 token3 token4"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 1,
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
 			{"token4", 0, []int{7, 14, 35}},
 		},
-	})
+	}, true)
+
+	// doc2 = "token2 token4 token4 token2 token3 token4"
+	indexer.AddDocumentToCache(&types.DocumentIndex{
+		DocId: 2,
+		Keywords: []types.KeywordIndex{
+			{"token3", 0, []int{0, 21}},
+			{"token5", 0, []int{28}},
+			{"token2", 0, []int{7, 14, 35}},
+		},
+	}, true)
 
+	indexer.RemoveDocumentToCache(2, true)
 	docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
 	utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
 }

+ 1 - 1
core/ranker_test.go

@@ -97,7 +97,7 @@ func TestRankWithCriteria(t *testing.T) {
 	utils.Expect(t, "[1 [25300 ]] [3 [17300 ]] ", scoredDocsToString(scoredDocs))
 }
 
-func TestRemoveDocument(t *testing.T) {
+func TestRemoveDoc(t *testing.T) {
 	var ranker Ranker
 	ranker.Init()
 	ranker.AddDoc(1, DummyScoringFields{

+ 5 - 4
core/test_utils.go

@@ -6,10 +6,11 @@ import (
 )
 
 func indicesToString(indexer *Indexer, token string) (output string) {
-	indices := indexer.tableLock.table[token]
-	for i := 0; i < indexer.getIndexLength(indices); i++ {
-		output += fmt.Sprintf("%d ",
-			indexer.getDocId(indices, i))
+	if indices, ok := indexer.tableLock.table[token]; ok {
+		for i := 0; i < indexer.getIndexLength(indices); i++ {
+			output += fmt.Sprintf("%d ",
+				indexer.getDocId(indices, i))
+		}
 	}
 	return
 }

+ 4 - 0
engine/counters.go

@@ -7,3 +7,7 @@ func (engine *Engine) NumTokenIndexAdded() uint64 {
 func (engine *Engine) NumDocumentsIndexed() uint64 {
 	return engine.numDocumentsIndexed
 }
+
+func (engine *Engine) NumDocumentsRemoved() uint64 {
+	return engine.numDocumentsRemoved
+}

+ 66 - 36
engine/engine.go

@@ -24,10 +24,14 @@ const (
 
 type Engine struct {
	// Counters tracking how many documents have been indexed, etc.
-	numDocumentsIndexed uint64
-	numIndexingRequests uint64
-	numTokenIndexAdded  uint64
-	numDocumentsStored  uint64
+	numDocumentsIndexed      uint64
+	numDocumentsRemoved      uint64
+	numDocumentsForceUpdated uint64
+	numIndexingRequests      uint64
+	numRemovingRequests      uint64
+	numForceUpdatingRequests uint64
+	numTokenIndexAdded       uint64
+	numDocumentsStored       uint64
 
	// Recorded initialization options
 	initOptions types.EngineInitOptions
@@ -40,10 +44,10 @@ type Engine struct {
 	dbs        []storage.Storage
 
	// Communication channels used to build the index
-	segmenterChannel           chan segmenterRequest
-	indexerAddDocumentChannels []chan indexerAddDocumentRequest
-	indexerRemoveDocChannels   []chan indexerRemoveDocRequest
-	rankerAddDocChannels       []chan rankerAddDocRequest
+	segmenterChannel         chan segmenterRequest
+	indexerAddDocChannels    []chan indexerAddDocumentRequest
+	indexerRemoveDocChannels []chan indexerRemoveDocRequest
+	rankerAddDocChannels     []chan rankerAddDocRequest
 
	// Communication channels used by the rankers
 	indexerLookupChannels   []chan indexerLookupRequest
@@ -86,20 +90,23 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 
	// Initialize the segmenter channel
 	engine.segmenterChannel = make(
 		chan segmenterRequest, options.NumSegmenterThreads)
 
	// Initialize the indexer channels
-	engine.indexerAddDocumentChannels = make(
+	engine.indexerAddDocChannels = make(
 		[]chan indexerAddDocumentRequest, options.NumShards)
 	engine.indexerRemoveDocChannels = make(
 		[]chan indexerRemoveDocRequest, options.NumShards)
 	engine.indexerLookupChannels = make(
 		[]chan indexerLookupRequest, options.NumShards)
 	for shard := 0; shard < options.NumShards; shard++ {
-		engine.indexerAddDocumentChannels[shard] = make(
+		engine.indexerAddDocChannels[shard] = make(
 			chan indexerAddDocumentRequest,
 			options.IndexerBufferLength)
 		engine.indexerRemoveDocChannels[shard] = make(
 			chan indexerRemoveDocRequest,
 			options.IndexerBufferLength)
 		engine.indexerLookupChannels[shard] = make(
@@ -215,68 +222,74 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
// Add a document to the index
 //
// Input parameters:
-// 	docId	unique identifier of the document
-//	data	see the comments on DocumentIndexData
+//  docId	unique identifier of the document; docId == 0 marks an invalid document (used to force-flush the index), [1, +oo) are valid documents
+//  data	see the comments on DocumentIndexData
 //
// Notes:
//      1. This function is thread-safe; call it concurrently whenever possible to speed up indexing
-// 	2. This call is asynchronous, i.e. the document may not be in the index yet when the function returns,
+//      2. This call is asynchronous, i.e. the document may not be in the index yet when the function returns,
//         so an immediate Search may not find it. Call FlushIndex to force-flush the index.
-func (engine *Engine) IndexDocument(docId uint64, data types.DocumentIndexData) {
-	engine.internalIndexDocument(docId, data)
+func (engine *Engine) IndexDocument(docId uint64, data types.DocumentIndexData, forceUpdate bool) {
+	engine.internalIndexDocument(docId, data, forceUpdate)
 
 	hash := murmur.Murmur3([]byte(fmt.Sprint("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
-	if engine.initOptions.UsePersistentStorage {
+	if engine.initOptions.UsePersistentStorage && docId != 0 {
 		engine.persistentStorageIndexDocumentChannels[hash] <- persistentStorageIndexDocumentRequest{docId: docId, data: data}
 	}
 }
 
-func (engine *Engine) internalIndexDocument(docId uint64, data types.DocumentIndexData) {
+func (engine *Engine) internalIndexDocument(
+	docId uint64, data types.DocumentIndexData, forceUpdate bool) {
 	if !engine.initialized {
 		log.Fatal("必须先初始化引擎")
 	}
 
-	atomic.AddUint64(&engine.numIndexingRequests, 1)
+	if docId != 0 {
+		atomic.AddUint64(&engine.numIndexingRequests, 1)
+	}
+	if forceUpdate {
+		atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
+	}
 	hash := murmur.Murmur3([]byte(fmt.Sprint("%d%s", docId, data.Content)))
 	engine.segmenterChannel <- segmenterRequest{
-		docId: docId, hash: hash, data: data}
+		docId: docId, hash: hash, data: data, forceUpdate: forceUpdate}
 }
 
// Remove a document from the index
 //
// Input parameters:
-// 	docId	unique identifier of the document
+//  docId	unique identifier of the document; docId == 0 marks an invalid document (used to force-flush the index), [1, +oo) are valid documents
 //
-// Note: this function only removes the document from the ranker; the indexer is left unchanged.
-func (engine *Engine) RemoveDocument(docId uint64) {
+// Notes:
+//      1. This function is thread-safe; call it concurrently whenever possible to speed up removal
+//      2. This call is asynchronous, i.e. the document may not be removed from the index yet when the function returns,
+//         so an immediate Search may still find it. Call FlushIndex to force-flush the index.
+func (engine *Engine) RemoveDocument(docId uint64, forceUpdate bool) {
 	if !engine.initialized {
 		log.Fatal("必须先初始化引擎")
 	}
 
+	if docId != 0 {
+		atomic.AddUint64(&engine.numRemovingRequests, 1)
+	}
+	if forceUpdate {
+		atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
+	}
 	for shard := 0; shard < engine.initOptions.NumShards; shard++ {
-		engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId}
+		engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId, forceUpdate: forceUpdate}
+		if docId == 0 {
+			continue
+		}
 		engine.rankerRemoveDocChannels[shard] <- rankerRemoveDocRequest{docId: docId}
 	}
 
-	if engine.initOptions.UsePersistentStorage {
+	if engine.initOptions.UsePersistentStorage && docId != 0 {
		// Remove from the database
 		hash := murmur.Murmur3([]byte(fmt.Sprint("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
 		go engine.persistentStorageRemoveDocumentWorker(docId, hash)
 	}
 }
 
-// Block until all index additions are complete
-func (engine *Engine) FlushIndex() {
-	for {
-		runtime.Gosched()
-		if engine.numIndexingRequests == engine.numDocumentsIndexed &&
-			(!engine.initOptions.UsePersistentStorage ||
-				engine.numIndexingRequests == engine.numDocumentsStored) {
-			return
-		}
-	}
-}
-
// Find documents matching the search criteria; this function is thread-safe
 func (engine *Engine) Search(request types.SearchRequest) (output types.SearchResponse) {
 	if !engine.initialized {
@@ -397,6 +410,23 @@ func (engine *Engine) Search(request types.SearchRequest) (output types.SearchRe
 	return
 }
 
+// Block until all index additions and removals are complete
+func (engine *Engine) FlushIndex() {
+	// Force an update: requests arrive in the channels out of order, which can leave residue in the caches
+	engine.RemoveDocument(0, true)
+	engine.IndexDocument(0, types.DocumentIndexData{}, true)
+	for {
+		runtime.Gosched()
+		if engine.numIndexingRequests == engine.numDocumentsIndexed &&
+			engine.numRemovingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsRemoved &&
+			engine.numForceUpdatingRequests*uint64(engine.initOptions.NumShards) ==
+				engine.numDocumentsForceUpdated && (!engine.initOptions.UsePersistentStorage ||
+			engine.numIndexingRequests == engine.numDocumentsStored) {
+			return
+		}
+	}
+}
+
// Close the engine
 func (engine *Engine) Close() {
 	engine.FlushIndex()
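At the engine level, the docId == 0 sentinel is what ties FlushIndex to the per-shard caches. A sketch of the intended calling pattern; it assumes the NotUsingSegmenter option and the Tokens fields of DocumentIndexData and SearchRequest (which the tests below also rely on), so that no dictionary file is needed:

package main

import (
	"fmt"

	"github.com/huichen/wukong/engine"
	"github.com/huichen/wukong/types"
)

func main() {
	var searcher engine.Engine
	searcher.Init(types.EngineInitOptions{NotUsingSegmenter: true})
	defer searcher.Close()

	// Asynchronous: the document may not be searchable yet on return.
	searcher.IndexDocument(1, types.DocumentIndexData{
		Tokens: []types.TokenData{{"token1", []int{0}}},
	}, false)

	// FlushIndex sends the docId == 0 sentinels (RemoveDocument(0, true)
	// and IndexDocument(0, ..., true)) and spins until every shard
	// reports the force update as applied.
	searcher.FlushIndex()

	response := searcher.Search(types.SearchRequest{Tokens: []string{"token1"}})
	fmt.Println(len(response.Docs)) // expect 1
}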

+ 41 - 39
engine/engine_test.go

@@ -14,31 +14,32 @@ type ScoringFields struct {
 }
 
 func AddDocs(engine *Engine) {
-	docId := uint64(0)
+	docId := uint64(1)
+	// forceUpdate is set to true throughout, to guarantee that every document is added to the index
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "中国有十三亿人口人口",
 		Fields:  ScoringFields{1, 2, 3},
-	})
+	}, true)
 	docId++
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "中国人口",
 		Fields:  nil,
-	})
+	}, true)
 	docId++
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "有人口",
 		Fields:  ScoringFields{2, 3, 1},
-	})
+	}, true)
 	docId++
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "有十三亿人口",
 		Fields:  ScoringFields{2, 3, 3},
-	})
+	}, true)
 	docId++
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "中国十三亿人口",
 		Fields:  ScoringFields{0, 9, 1},
-	})
+	}, true)
 
 	engine.FlushIndex()
 }
@@ -76,15 +77,15 @@ func TestEngineIndexDocument(t *testing.T) {
 	utils.Expect(t, "人口", outputs.Tokens[1])
 	utils.Expect(t, "3", len(outputs.Docs))
 
-	utils.Expect(t, "1", outputs.Docs[0].DocId)
+	utils.Expect(t, "2", outputs.Docs[0].DocId)
 	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
 	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
 
-	utils.Expect(t, "4", outputs.Docs[1].DocId)
+	utils.Expect(t, "5", outputs.Docs[1].DocId)
 	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
 	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
 
-	utils.Expect(t, "0", outputs.Docs[2].DocId)
+	utils.Expect(t, "1", outputs.Docs[2].DocId)
 	utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
 	utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
 }
@@ -109,9 +110,9 @@ func TestReverseOrder(t *testing.T) {
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "3", len(outputs.Docs))
 
-	utils.Expect(t, "0", outputs.Docs[0].DocId)
-	utils.Expect(t, "4", outputs.Docs[1].DocId)
-	utils.Expect(t, "1", outputs.Docs[2].DocId)
+	utils.Expect(t, "1", outputs.Docs[0].DocId)
+	utils.Expect(t, "5", outputs.Docs[1].DocId)
+	utils.Expect(t, "2", outputs.Docs[2].DocId)
 }
 
 func TestOffsetAndMaxOutputs(t *testing.T) {
@@ -134,8 +135,8 @@ func TestOffsetAndMaxOutputs(t *testing.T) {
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "2", len(outputs.Docs))
 
-	utils.Expect(t, "4", outputs.Docs[0].DocId)
-	utils.Expect(t, "1", outputs.Docs[1].DocId)
+	utils.Expect(t, "5", outputs.Docs[0].DocId)
+	utils.Expect(t, "2", outputs.Docs[1].DocId)
 }
 
 type TestScoringCriteria struct {
@@ -167,10 +168,10 @@ func TestSearchWithCriteria(t *testing.T) {
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "2", len(outputs.Docs))
 
-	utils.Expect(t, "0", outputs.Docs[0].DocId)
+	utils.Expect(t, "1", outputs.Docs[0].DocId)
 	utils.Expect(t, "18000", int(outputs.Docs[0].Scores[0]*1000))
 
-	utils.Expect(t, "4", outputs.Docs[1].DocId)
+	utils.Expect(t, "5", outputs.Docs[1].DocId)
 	utils.Expect(t, "9000", int(outputs.Docs[1].Scores[0]*1000))
 }
 
@@ -188,10 +189,10 @@ func TestCompactIndex(t *testing.T) {
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "2", len(outputs.Docs))
 
-	utils.Expect(t, "4", outputs.Docs[0].DocId)
+	utils.Expect(t, "5", outputs.Docs[0].DocId)
 	utils.Expect(t, "9000", int(outputs.Docs[0].Scores[0]*1000))
 
-	utils.Expect(t, "0", outputs.Docs[1].DocId)
+	utils.Expect(t, "1", outputs.Docs[1].DocId)
 	utils.Expect(t, "6000", int(outputs.Docs[1].Scores[0]*1000))
 }
 
@@ -223,11 +224,11 @@ func TestFrequenciesIndex(t *testing.T) {
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "2", len(outputs.Docs))
 
-	utils.Expect(t, "4", outputs.Docs[0].DocId)
-	utils.Expect(t, "2311", int(outputs.Docs[0].Scores[0]*1000))
+	utils.Expect(t, "5", outputs.Docs[0].DocId)
+	utils.Expect(t, "2349", int(outputs.Docs[0].Scores[0]*1000))
 
-	utils.Expect(t, "0", outputs.Docs[1].DocId)
-	utils.Expect(t, "2211", int(outputs.Docs[1].Scores[0]*1000))
+	utils.Expect(t, "1", outputs.Docs[1].DocId)
+	utils.Expect(t, "2320", int(outputs.Docs[1].Scores[0]*1000))
 }
 
 func TestRemoveDocument(t *testing.T) {
@@ -240,12 +241,12 @@ func TestRemoveDocument(t *testing.T) {
 	})
 
 	AddDocs(&engine)
-	engine.RemoveDocument(4)
+	engine.RemoveDocument(5, true)
 
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "1", len(outputs.Docs))
 
-	utils.Expect(t, "0", outputs.Docs[0].DocId)
+	utils.Expect(t, "1", outputs.Docs[0].DocId)
 	utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
 }
 
@@ -263,7 +264,7 @@ func TestEngineIndexDocumentWithTokens(t *testing.T) {
 		},
 	})
 
-	docId := uint64(0)
+	docId := uint64(1)
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "",
 		Tokens: []types.TokenData{
@@ -271,7 +272,7 @@ func TestEngineIndexDocumentWithTokens(t *testing.T) {
 			{"人口", []int{18, 24}},
 		},
 		Fields: ScoringFields{1, 2, 3},
-	})
+	}, true)
 	docId++
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "",
@@ -280,12 +281,12 @@ func TestEngineIndexDocumentWithTokens(t *testing.T) {
 			{"人口", []int{6}},
 		},
 		Fields: ScoringFields{1, 2, 3},
-	})
+	}, true)
 	docId++
 	engine.IndexDocument(docId, types.DocumentIndexData{
 		Content: "中国十三亿人口",
 		Fields:  ScoringFields{0, 9, 1},
-	})
+	}, true)
 
 	engine.FlushIndex()
 
@@ -295,15 +296,15 @@ func TestEngineIndexDocumentWithTokens(t *testing.T) {
 	utils.Expect(t, "人口", outputs.Tokens[1])
 	utils.Expect(t, "3", len(outputs.Docs))
 
-	utils.Expect(t, "1", outputs.Docs[0].DocId)
+	utils.Expect(t, "2", outputs.Docs[0].DocId)
 	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
 	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
 
-	utils.Expect(t, "2", outputs.Docs[1].DocId)
+	utils.Expect(t, "3", outputs.Docs[1].DocId)
 	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
 	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
 
-	utils.Expect(t, "0", outputs.Docs[2].DocId)
+	utils.Expect(t, "1", outputs.Docs[2].DocId)
 	utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
 	utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
 }
@@ -326,7 +327,7 @@ func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) {
 		PersistentStorageShards: 2,
 	})
 	AddDocs(&engine)
-	engine.RemoveDocument(4)
+	engine.RemoveDocument(5, true)
 	engine.Close()
 
 	var engine1 Engine
@@ -344,6 +345,7 @@ func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) {
 		PersistentStorageFolder: "wukong.persistent",
 		PersistentStorageShards: 2,
 	})
+	engine1.FlushIndex()
 
 	outputs := engine1.Search(types.SearchRequest{Text: "中国人口"})
 	utils.Expect(t, "2", len(outputs.Tokens))
@@ -351,11 +353,11 @@ func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) {
 	utils.Expect(t, "人口", outputs.Tokens[1])
 	utils.Expect(t, "2", len(outputs.Docs))
 
-	utils.Expect(t, "1", outputs.Docs[0].DocId)
+	utils.Expect(t, "2", outputs.Docs[0].DocId)
 	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
 	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
 
-	utils.Expect(t, "0", outputs.Docs[1].DocId)
+	utils.Expect(t, "1", outputs.Docs[1].DocId)
 	utils.Expect(t, "76", int(outputs.Docs[1].Scores[0]*1000))
 	utils.Expect(t, "[0 18]", outputs.Docs[1].TokenSnippetLocations)
 
@@ -379,7 +381,7 @@ func TestCountDocsOnly(t *testing.T) {
 	})
 
 	AddDocs(&engine)
-	engine.RemoveDocument(4)
+	engine.RemoveDocument(5, true)
 
 	outputs := engine.Search(types.SearchRequest{Text: "中国人口", CountDocsOnly: true})
 	utils.Expect(t, "0", len(outputs.Docs))
@@ -405,8 +407,8 @@ func TestSearchWithin(t *testing.T) {
 	AddDocs(&engine)
 
 	docIds := make(map[uint64]bool)
-	docIds[4] = true
-	docIds[0] = true
+	docIds[5] = true
+	docIds[1] = true
 	outputs := engine.Search(types.SearchRequest{
 		Text:   "中国人口",
 		DocIds: docIds,
@@ -416,11 +418,11 @@ func TestSearchWithin(t *testing.T) {
 	utils.Expect(t, "人口", outputs.Tokens[1])
 	utils.Expect(t, "2", len(outputs.Docs))
 
-	utils.Expect(t, "0", outputs.Docs[0].DocId)
+	utils.Expect(t, "1", outputs.Docs[0].DocId)
 	utils.Expect(t, "76", int(outputs.Docs[0].Scores[0]*1000))
 	utils.Expect(t, "[0 18]", outputs.Docs[0].TokenSnippetLocations)
 
-	utils.Expect(t, "4", outputs.Docs[1].DocId)
+	utils.Expect(t, "5", outputs.Docs[1].DocId)
 	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
 	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
 }

+ 27 - 14
engine/indexer_worker.go

@@ -6,7 +6,8 @@ import (
 )
 
 type indexerAddDocumentRequest struct {
-	document *types.DocumentIndex
+	document    *types.DocumentIndex
+	forceUpdate bool
 }
 
 type indexerLookupRequest struct {
@@ -20,16 +21,35 @@ type indexerLookupRequest struct {
 }
 
 type indexerRemoveDocRequest struct {
-	docId uint64
+	docId       uint64
+	forceUpdate bool
 }
 
 func (engine *Engine) indexerAddDocumentWorker(shard int) {
 	for {
-		request := <-engine.indexerAddDocumentChannels[shard]
-		engine.indexers[shard].AddDocument(request.document)
-		atomic.AddUint64(&engine.numTokenIndexAdded,
-			uint64(len(request.document.Keywords)))
-		atomic.AddUint64(&engine.numDocumentsIndexed, 1)
+		request := <-engine.indexerAddDocChannels[shard]
+		engine.indexers[shard].AddDocumentToCache(request.document, request.forceUpdate)
+		if request.document != nil {
+			atomic.AddUint64(&engine.numTokenIndexAdded,
+				uint64(len(request.document.Keywords)))
+			atomic.AddUint64(&engine.numDocumentsIndexed, 1)
+		}
+		if request.forceUpdate {
+			atomic.AddUint64(&engine.numDocumentsForceUpdated, 1)
+		}
+	}
+}
+
+func (engine *Engine) indexerRemoveDocWorker(shard int) {
+	for {
+		request := <-engine.indexerRemoveDocChannels[shard]
+		engine.indexers[shard].RemoveDocumentToCache(request.docId, request.forceUpdate)
+		if request.docId != 0 {
+			atomic.AddUint64(&engine.numDocumentsRemoved, 1)
+		}
+		if request.forceUpdate {
+			atomic.AddUint64(&engine.numDocumentsForceUpdated, 1)
+		}
 	}
 }
 
@@ -79,10 +99,3 @@ func (engine *Engine) indexerLookupWorker(shard int) {
 		engine.rankerRankChannels[shard] <- rankerRequest
 	}
 }
-
-func (engine *Engine) indexerRemoveDocWorker(shard int) {
-	for {
-		request := <-engine.indexerRemoveDocChannels[shard]
-		engine.indexers[shard].RemoveDoc(request.docId)
-	}
-}

+ 1 - 1
engine/persistent_storage_worker.go

@@ -58,7 +58,7 @@ func (engine *Engine) persistentStorageInitWorker(shard int) {
 		err := dec.Decode(&data)
 		if err == nil {
			// Add to the index
-			engine.internalIndexDocument(docId, data)
+			engine.internalIndexDocument(docId, data, false)
 		}
 		return nil
 	})

+ 24 - 5
engine/segmenter_worker.go

@@ -5,16 +5,25 @@ import (
 )
 
 type segmenterRequest struct {
-	docId uint64
-	hash  uint32
-	data  types.DocumentIndexData
+	docId       uint64
+	hash        uint32
+	data        types.DocumentIndexData
+	forceUpdate bool
 }
 
 func (engine *Engine) segmenterWorker() {
 	for {
 		request := <-engine.segmenterChannel
-		shard := engine.getShard(request.hash)
+		if request.docId == 0 {
+			if request.forceUpdate {
+				for i := 0; i < engine.initOptions.NumShards; i++ {
+					engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
+				}
+			}
+			continue
+		}
 
+		shard := engine.getShard(request.hash)
 		tokensMap := make(map[string][]int)
 		numTokens := 0
 		if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
@@ -54,6 +63,7 @@ func (engine *Engine) segmenterWorker() {
 				TokenLength: float32(numTokens),
 				Keywords:    make([]types.KeywordIndex, len(tokensMap)),
 			},
+			forceUpdate: request.forceUpdate,
 		}
 		iTokens := 0
 		for k, v := range tokensMap {
@@ -64,7 +74,16 @@ func (engine *Engine) segmenterWorker() {
 				Starts:    v}
 			iTokens++
 		}
-		engine.indexerAddDocumentChannels[shard] <- indexerRequest
+
+		engine.indexerAddDocChannels[shard] <- indexerRequest
+		if request.forceUpdate {
+			for i := 0; i < engine.initOptions.NumShards; i++ {
+				if i == shard {
+					continue
+				}
+				engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
+			}
+		}
 		rankerRequest := rankerAddDocRequest{
 			docId: request.docId, fields: request.data.Fields}
 		engine.rankerAddDocChannels[shard] <- rankerRequest

+ 26 - 11
examples/benchmark.go

@@ -39,6 +39,7 @@ var (
	cpuprofile                = flag.String("cpuprofile", "", "CPU profile file")
	memprofile                = flag.String("memprofile", "", "memory profile file")
	num_repeat_text           = flag.Int("num_repeat_text", 10, "how many times to re-add the text")
+	num_delete_docs           = flag.Int("num_delete_docs", 1000, "number of documents to delete in the test")
	index_type                = flag.Int("index_type", types.DocIdsIndex, "index type")
	use_persistent            = flag.Bool("use_persistent", false, "whether to use persistent storage")
	persistent_storage_folder = flag.String("persistent_storage_folder", "benchmark.persistent", "directory where the persistent storage database is kept")
@@ -123,7 +124,7 @@ func main() {
 	for i := 0; i < *num_repeat_text; i++ {
 		for _, line := range lines {
 			searcher.IndexDocument(docId, types.DocumentIndexData{
-				Content: line})
+				Content: line}, false)
 			docId++
 			if docId-docId/1000000*1000000 == 0 {
 				log.Printf("已索引%d百万文档", docId/1000000)
@@ -150,28 +151,39 @@ func main() {
 		defer f.Close()
 	}
 
-	// Record the time
+	// Record the time and measure index removal speed
 	t2 := time.Now()
+	for i := 1; i <= *num_delete_docs; i++ {
+		searcher.RemoveDocument(uint64(i), false)
+	}
+	searcher.FlushIndex()
+	t3 := time.Now()
+	log.Printf("删除 %d 条索引花费时间 %v", *num_delete_docs, t3.Sub(t2))
 
 	done := make(chan bool)
+	recordResponseLength := make(map[string]int)
 	for iThread := 0; iThread < numQueryThreads; iThread++ {
-		go search(done)
+		go search(done, recordResponseLength)
 	}
 	for iThread := 0; iThread < numQueryThreads; iThread++ {
 		<-done
 	}
+	// Check the search output; docIds differ across cases, so only the totals are verified
+	for keyword, count := range recordResponseLength {
+		log.Printf("关键词 [%s] 共搜索到 %d 个相关文档", keyword, count)
+	}
 
	// Record the time and compute the search speed
-	t3 := time.Now()
+	t4 := time.Now()
 	log.Printf("搜索平均响应时间 %v 毫秒",
-		t3.Sub(t2).Seconds()*1000/float64(numRepeatQuery*len(searchQueries)))
+		t4.Sub(t3).Seconds()*1000/float64(numRepeatQuery*len(searchQueries)))
 	log.Printf("搜索吞吐量每秒 %v 次查询",
 		float64(numRepeatQuery*numQueryThreads*len(searchQueries))/
-			t3.Sub(t2).Seconds())
+			t4.Sub(t3).Seconds())
 
 	if *use_persistent {
 		searcher.Close()
-		t4 := time.Now()
+		t5 := time.Now()
 		searcher1 := engine.Engine{}
 		searcher1.Init(types.EngineInitOptions{
 			SegmenterDictionaries: *dictionaries,
@@ -186,8 +198,8 @@ func main() {
 			PersistentStorageShards: *persistent_storage_shards,
 		})
 		defer searcher1.Close()
-		t5 := time.Now()
-		t := t5.Sub(t4).Seconds() - tEndInit.Sub(tBeginInit).Seconds()
+		t6 := time.Now()
+		t := t6.Sub(t5).Seconds() - tEndInit.Sub(tBeginInit).Seconds()
 		log.Print("从持久存储加入的索引总数", searcher1.NumTokenIndexAdded())
 		log.Printf("从持久存储建立索引花费时间 %v 秒", t)
 		log.Printf("从持久存储建立索引速度每秒添加 %f 百万个索引",
@@ -197,10 +209,13 @@ func main() {
 	//os.RemoveAll(*persistent_storage_folder)
 }
 
-func search(ch chan bool) {
+func search(ch chan bool, record map[string]int) {
 	for i := 0; i < numRepeatQuery; i++ {
 		for _, query := range searchQueries {
-			searcher.Search(types.SearchRequest{Text: query})
+			output := searcher.Search(types.SearchRequest{Text: query})
+			if _, found := record[query]; !found {
+				record[query] = len(output.Docs)
+			}
 		}
 	}
 	ch <- true

+ 26 - 0
types/index.go

@@ -42,3 +42,29 @@ type IndexedDocument struct {
 	// 仅当索引类型为LocationsIndex时返回有效值。
 	TokenLocations [][]int
 }
+
+// Convenience type for adding document indexes in batch
+type DocumentsIndex []*DocumentIndex
+
+func (docs DocumentsIndex) Len() int {
+	return len(docs)
+}
+func (docs DocumentsIndex) Swap(i, j int) {
+	docs[i], docs[j] = docs[j], docs[i]
+}
+func (docs DocumentsIndex) Less(i, j int) bool {
+	return docs[i].DocId < docs[j].DocId
+}
+
+// Convenience type for removing document indexes in batch
+type DocumentsId []uint64
+
+func (docs DocumentsId) Len() int {
+	return len(docs)
+}
+func (docs DocumentsId) Swap(i, j int) {
+	docs[i], docs[j] = docs[j], docs[i]
+}
+func (docs DocumentsId) Less(i, j int) bool {
+	return docs[i] < docs[j]
+}
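Both types exist to satisfy sort.Interface, so the indexer can order its caches before the batch walks over the inverted index. A small illustration:

package main

import (
	"fmt"
	"sort"

	"github.com/huichen/wukong/types"
)

func main() {
	ids := types.DocumentsId{9, 1, 7, 2}
	sort.Sort(ids) // uses the Len/Swap/Less defined above
	fmt.Println(ids) // [1 2 7 9]
}

For DocumentsIndex the ordering must also be stable (sort.Stable), so that among duplicates of the same DocId the last-added entry is the one that survives deduplication in AddDocuments.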

+ 12 - 0
types/indexer_init_options.go

@@ -11,6 +11,9 @@ const (
	// Store the exact byte locations where a keyword appears in the document (possibly several)
	// If you want keyword proximity data, you must use a LocationsIndex-type index
 	LocationsIndex = 2
+
+	// Default cache size, in documents, for insertions into the index table
+	defaultDocCacheSize = 300000
 )
 
// Indexer initialization options
@@ -18,6 +21,9 @@ type IndexerInitOptions struct {
	// Type of the index table; see the constants above
 	IndexType int
 
+	// Cache size, in documents, for pending insertions into the index table
+	DocCacheSize int
+
	// BM25 parameters
 	BM25Parameters *BM25Parameters
 }
@@ -28,3 +34,9 @@ type BM25Parameters struct {
 	K1 float32
 	B  float32
 }
+
+func (options *IndexerInitOptions) Init() {
+	if options.DocCacheSize == 0 {
+		options.DocCacheSize = defaultDocCacheSize
+	}
+}
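Init is what lets callers leave DocCacheSize at zero. A quick sketch of the defaulting behavior (the printed 300000 is defaultDocCacheSize from this diff):

package main

import (
	"fmt"

	"github.com/huichen/wukong/types"
)

func main() {
	opts := types.IndexerInitOptions{IndexType: types.LocationsIndex}
	opts.Init() // DocCacheSize was 0, so the default is filled in
	fmt.Println(opts.DocCacheSize) // 300000

	small := types.IndexerInitOptions{DocCacheSize: 1024}
	small.Init() // an explicit value is left untouched
	fmt.Println(small.DocCacheSize) // 1024
}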