Hui Chen, 10 years ago
Commit 8d5354aea0 (diff against its parent)
5 changed files, with 100 insertions and 49 deletions
  1. core/indexer.go (+28 -6)
  2. core/indexer_test.go (+30 -30)
  3. core/test_utils.go (+1 -1)
  4. engine/engine.go (+22 -10)
  5. engine/indexer_worker.go (+19 -2)

core/indexer.go (+28 -6)

@@ -15,6 +15,7 @@ type Indexer struct {
 	tableLock struct {
 		sync.RWMutex
 		table map[string]*KeywordIndices
+		docs  map[uint64]bool
 	}
 
 	initOptions types.IndexerInitOptions
@@ -46,6 +47,7 @@ func (indexer *Indexer) Init(options types.IndexerInitOptions) {
 	indexer.initialized = true
 
 	indexer.tableLock.table = make(map[string]*KeywordIndices)
+	indexer.tableLock.docs = make(map[uint64]bool)
 	indexer.initOptions = options
 	indexer.docTokenLengths = make(map[uint64]float32)
 }
@@ -121,6 +123,7 @@ func (indexer *Indexer) AddDocument(document *types.DocumentIndex) {
 
 	// Update the total document count
 	if docIdIsNew {
+		indexer.tableLock.docs[document.DocId] = true
 		indexer.numDocuments++
 	}
 }
@@ -128,7 +131,7 @@ func (indexer *Indexer) AddDocument(document *types.DocumentIndex) {
 // Look up the documents that contain all of the search keys (AND semantics)
 // When docIds is not nil, only the documents listed in docIds are searched
 func (indexer *Indexer) Lookup(
-	tokens []string, labels []string, docIds map[uint64]bool) (docs []types.IndexedDocument) {
+	tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
 	if indexer.initialized == false {
 		log.Fatal("索引器尚未初始化")
 	}
@@ -136,6 +139,7 @@ func (indexer *Indexer) Lookup(
 	if indexer.numDocuments == 0 {
 		return
 	}
+	numDocs = 0
 
 	// Merge the keywords and labels into a single list of search keys
 	keywords := make([]string, len(tokens)+len(labels))
@@ -204,7 +208,8 @@ func (indexer *Indexer) Lookup(
 			}
 		}
 
-		if found {
+		_, ok := indexer.tableLock.docs[baseDocId]
+		if found && ok {
 			indexedDoc := types.IndexedDocument{}
 
 			// When the index type is LocationsIndex, compute the keyword proximity
@@ -217,9 +222,12 @@ func (indexer *Indexer) Lookup(
 					}
 				}
 				if numTokensWithLocations != len(tokens) {
-					docs = append(docs, types.IndexedDocument{
-						DocId: baseDocId,
-					})
+					if !countDocsOnly {
+						docs = append(docs, types.IndexedDocument{
+							DocId: baseDocId,
+						})
+					}
+					numDocs++
 					break
 				}
 
@@ -261,7 +269,10 @@ func (indexer *Indexer) Lookup(
 			}
 
 			indexedDoc.DocId = baseDocId
-			docs = append(docs, indexedDoc)
+			if !countDocsOnly {
+				docs = append(docs, indexedDoc)
+			}
+			numDocs++
 		}
 	}
 	return
@@ -398,3 +409,14 @@ func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
 func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
 	return len(ti.docIds)
 }
+
+// Remove a document from the index
+func (indexer *Indexer) RemoveDoc(docId uint64) {
+	if indexer.initialized == false {
+	log.Fatal("索引器尚未初始化")
+	}
+
+	indexer.tableLock.Lock()
+	delete(indexer.tableLock.docs, docId)
+	indexer.tableLock.Unlock()
+}
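
For context, a minimal sketch of how a caller might use the widened Lookup signature and the new RemoveDoc. This is not part of the commit; idx is assumed to be an Indexer that has already been initialized and fed documents, and the snippet is a fragment rather than a complete program:

	// Count-only lookup: no IndexedDocument values are built, only the number of matches is returned.
	_, numMatches := idx.Lookup([]string{"token1", "token2"}, []string{}, nil, true)
	fmt.Println("matches:", numMatches)

	// Regular lookup keeps its previous behaviour; the extra count can be ignored.
	docs, _ := idx.Lookup([]string{"token1", "token2"}, []string{}, nil, false)

	// Removal is lazy: the postings stay in the keyword table, but Lookup
	// stops reporting the document once it is gone from the docs set.
	if len(docs) > 0 {
		idx.RemoveDoc(docs[0].DocId)
	}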

core/indexer_test.go (+30 -30)

@@ -101,29 +101,29 @@ func TestLookup(t *testing.T) {
 	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
 	utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
 
-	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil)))
+	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
 
 	utils.Expect(t, "[7 0 [0]] [2 0 [0]] [1 0 [0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil)))
-	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
+	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
 
 	utils.Expect(t, "[2 1 [0 7]] [1 1 [0 7]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
 	utils.Expect(t, "[2 13 [7 0]] [1 13 [7 0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
 	utils.Expect(t, "[7 1 [0 7]] [1 8 [0 14]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
 	utils.Expect(t, "[7 13 [7 0]] [1 20 [14 0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
 	utils.Expect(t, "[1 1 [7 14]] [0 1 [0 7]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
 	utils.Expect(t, "[1 13 [14 7]] [0 13 [7 0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
 
 	utils.Expect(t, "[1 2 [0 7 14]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
 	utils.Expect(t, "[1 26 [14 7 0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
 }
 
 func TestDocIdsIndex(t *testing.T) {
@@ -181,29 +181,29 @@ func TestDocIdsIndex(t *testing.T) {
 	utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
 	utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
 
-	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil)))
+	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
 
 	utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil)))
-	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
+	utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
 
 	utils.Expect(t, "[2 0 []] [1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
 	utils.Expect(t, "[2 0 []] [1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
 	utils.Expect(t, "[7 0 []] [1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
 	utils.Expect(t, "[7 0 []] [1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
 	utils.Expect(t, "[1 0 []] [0 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
 	utils.Expect(t, "[1 0 []] [0 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
 
 	utils.Expect(t, "[1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
 	utils.Expect(t, "[1 0 []] ",
-		indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
 }
 
 func TestLookupWithProximity(t *testing.T) {
@@ -220,7 +220,7 @@ func TestLookupWithProximity(t *testing.T) {
 		},
 	})
 	utils.Expect(t, "[0 1 [21 28]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
 
 	// doc0 = "t2 t1 . . . t2 t3"
 	indexer.AddDocument(&types.DocumentIndex{
@@ -232,7 +232,7 @@ func TestLookupWithProximity(t *testing.T) {
 		},
 	})
 	utils.Expect(t, "[0 8 [3 12 15]] ",
-		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
 
 	// doc0 = "t3 t2 t1 . . . . . t2 t3"
 	indexer.AddDocument(&types.DocumentIndex{
@@ -244,7 +244,7 @@ func TestLookupWithProximity(t *testing.T) {
 		},
 	})
 	utils.Expect(t, "[0 10 [6 3 0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
 }
 
 func TestLookupWithPartialLocations(t *testing.T) {
@@ -273,7 +273,7 @@ func TestLookupWithPartialLocations(t *testing.T) {
 	utils.Expect(t, "0 ", indicesToString(&indexer, "label1"))
 
 	utils.Expect(t, "[0 1 [21 28]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil)))
+		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil, false)))
 }
 
 func TestLookupWithBM25(t *testing.T) {
@@ -305,7 +305,7 @@ func TestLookupWithBM25(t *testing.T) {
 		},
 	})
 
-	outputs := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil)
+	outputs, _ := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil, false)
 
 	// BM25 = log2(3) * (12/9 + 28/17 + 60/33) = 6.3433
 	utils.Expect(t, "76055", int(outputs[0].BM25*10000))
@@ -351,7 +351,7 @@ func TestLookupWithinDocIds(t *testing.T) {
 	docIds[0] = true
 	docIds[2] = true
 	utils.Expect(t, "[2 0 [7]] [0 0 [0]] ",
-		indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds)))
+		indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false)))
 }
 
 func TestLookupWithLocations(t *testing.T) {
@@ -367,6 +367,6 @@ func TestLookupWithLocations(t *testing.T) {
 		},
 	})
 
-	utils.Expect(t, "[[0 21] [28]]",
-		indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)[0].TokenLocations)
+	docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
+	utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
 }
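
A count-only assertion could be written in the same style; the lines below are illustrative only and not part of the commit. The expected value 3 assumes the indexer built in TestLookup above, where token1 matches documents 7, 2 and 1:

	_, numDocs := indexer.Lookup([]string{"token1"}, []string{}, nil, true)
	utils.Expect(t, "3", numDocs)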

core/test_utils.go (+1 -1)

@@ -14,7 +14,7 @@ func indicesToString(indexer *Indexer, token string) (output string) {
 	return
 }
 
-func indexedDocsToString(docs []types.IndexedDocument) (output string) {
+func indexedDocsToString(docs []types.IndexedDocument, numDocs int) (output string) {
 	for _, doc := range docs {
 		output += fmt.Sprintf("[%d %d %v] ",
 			doc.DocId, doc.TokenProximity, doc.TokenSnippetLocations)
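
The extra numDocs parameter lets the tests forward Lookup's two return values straight into the helper, since Go allows a multi-value call to be used as the whole argument list; the two forms below are equivalent (illustrative only):

	indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false))

	docs, numDocs := indexer.Lookup([]string{"token1"}, []string{}, nil, false)
	indexedDocsToString(docs, numDocs)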

engine/engine.go (+22 -10)

@@ -42,6 +42,7 @@ type Engine struct {
 	// Set up the communication channels used by the indexers
 	segmenterChannel           chan segmenterRequest
 	indexerAddDocumentChannels []chan indexerAddDocumentRequest
+	indexerRemoveDocChannels   []chan indexerRemoveDocRequest
 	rankerAddDocChannels       []chan rankerAddDocRequest
 
 	// Set up the communication channels used by the rankers
@@ -90,12 +91,17 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 	// Initialize the indexer channels
 	engine.indexerAddDocumentChannels = make(
 		[]chan indexerAddDocumentRequest, options.NumShards)
+	engine.indexerRemoveDocChannels = make(
+		[]chan indexerRemoveDocRequest, options.NumShards)
 	engine.indexerLookupChannels = make(
 		[]chan indexerLookupRequest, options.NumShards)
 	for shard := 0; shard < options.NumShards; shard++ {
 		engine.indexerAddDocumentChannels[shard] = make(
 			chan indexerAddDocumentRequest,
 			options.IndexerBufferLength)
+		engine.indexerRemoveDocChannels[shard] = make(
+			chan indexerRemoveDocRequest,
+			options.IndexerBufferLength)
 		engine.indexerLookupChannels[shard] = make(
 			chan indexerLookupRequest,
 			options.IndexerBufferLength)
@@ -141,6 +147,7 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 	// Start the indexers and rankers
 	for shard := 0; shard < options.NumShards; shard++ {
 		go engine.indexerAddDocumentWorker(shard)
+		go engine.indexerRemoveDocWorker(shard)
 		go engine.rankerAddDocWorker(shard)
 		go engine.rankerRemoveDocWorker(shard)
 
@@ -247,6 +254,7 @@ func (engine *Engine) RemoveDocument(docId uint64) {
 	}
 
 	for shard := 0; shard < engine.initOptions.NumShards; shard++ {
+		engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId}
 		engine.rankerRemoveDocChannels[shard] <- rankerRemoveDocRequest{docId: docId}
 	}
 
@@ -367,18 +375,22 @@ func (engine *Engine) Search(request types.SearchRequest) (output types.SearchRe
 
 	// Prepare the output
 	output.Tokens = tokens
-	if !request.Orderless {
-		output.Docs = rankOutput
-	} else if !request.CountDocsOnly {
-		var start, end int
-		if rankOptions.MaxOutputs == 0 {
-			start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
-			end = len(rankOutput)
+	// Only populate output.Docs when CountDocsOnly is false
+	if !request.CountDocsOnly {
+		if request.Orderless {
+			// In orderless mode there is no need to truncate by offset or max outputs
+			output.Docs = rankOutput
 		} else {
-			start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
-			end = utils.MinInt(start+rankOptions.MaxOutputs, len(rankOutput))
+			var start, end int
+			if rankOptions.MaxOutputs == 0 {
+				start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
+				end = len(rankOutput)
+			} else {
+				start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
+				end = utils.MinInt(start+rankOptions.MaxOutputs, len(rankOutput))
+			}
+			output.Docs = rankOutput[start:end]
 		}
-		output.Docs = rankOutput[start:end]
 	}
 	output.NumDocs = numDocs
 	output.Timeout = isTimeout
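
A rough sketch of the engine-level usage this enables (not taken from the repository; searcher, docId and the query-text field of types.SearchRequest are assumed here for illustration):

	// Count-only search: output.Docs stays empty and NumDocs carries the total match count.
	response := searcher.Search(types.SearchRequest{
		Text:          "token2 token3",
		CountDocsOnly: true,
	})
	fmt.Println("matched documents:", response.NumDocs)

	// RemoveDocument now notifies the indexers as well as the rankers,
	// so a removed document also disappears from future lookups.
	searcher.RemoveDocument(docId)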

engine/indexer_worker.go (+19 -2)

@@ -19,6 +19,10 @@ type indexerLookupRequest struct {
 	orderless           bool
 }
 
+type indexerRemoveDocRequest struct {
+	docId uint64
+}
+
 func (engine *Engine) indexerAddDocumentWorker(shard int) {
 	for {
 		request := <-engine.indexerAddDocumentChannels[shard]
@@ -34,10 +38,16 @@ func (engine *Engine) indexerLookupWorker(shard int) {
 		request := <-engine.indexerLookupChannels[shard]
 
 		var docs []types.IndexedDocument
+		var numDocs int
 		if request.docIds == nil {
-			docs = engine.indexers[shard].Lookup(request.tokens, request.labels, nil)
+			docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, nil, request.countDocsOnly)
 		} else {
-			docs = engine.indexers[shard].Lookup(request.tokens, request.labels, request.docIds)
+			docs, numDocs = engine.indexers[shard].Lookup(request.tokens, request.labels, request.docIds, request.countDocsOnly)
+		}
+
+		if request.countDocsOnly {
+			request.rankerReturnChannel <- rankerReturnRequest{numDocs: numDocs}
+			continue
 		}
 
 		if len(docs) == 0 {
@@ -69,3 +79,10 @@ func (engine *Engine) indexerLookupWorker(shard int) {
 		engine.rankerRankChannels[shard] <- rankerRequest
 	}
 }
+
+func (engine *Engine) indexerRemoveDocWorker(shard int) {
+	for {
+		request := <-engine.indexerRemoveDocChannels[shard]
+		engine.indexers[shard].RemoveDoc(request.docId)
+	}
+}
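
On the Search side (not shown in this diff) the per-shard counts sent through rankerReturnChannel still have to be combined before output.NumDocs is set. A hedged sketch of what that collection loop might look like, where the docs field of rankerReturnRequest and the local variable names are assumptions rather than code from the repository:

	numDocs := 0
	for shard := 0; shard < engine.initOptions.NumShards; shard++ {
		rankerOutput := <-rankerReturnChannel
		if !request.CountDocsOnly {
			rankOutput = append(rankOutput, rankerOutput.docs...)
		}
		numDocs += rankerOutput.numDocs
	}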