
Allow documents to be tokenized outside the Wukong engine before they are indexed.

Hui Chen committed 12 years ago
commit d956874197
5 changed files with 97 additions and 15 deletions
  1. core/ranker.go (+2 -2)
  2. engine/engine_test.go (+58 -3)
  3. engine/segmenter_worker.go (+20 -8)
  4. types/document_index_data.go (+15 -0)
  5. types/search_response.go (+2 -2)

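In short: types.DocumentIndexData gains a Tokens field, and the segmenter worker indexes those tokens directly whenever Content is empty, so segmentation and preprocessing can be done outside the engine. A minimal sketch of the new call pattern, modeled on the test added in this commit (engine initialization is omitted; as in the test it should use types.LocationsIndex, and the token strings and byte offsets here are just sample data):

	// Leaving Content empty makes the engine index the supplied Tokens
	// as-is instead of running its built-in segmenter.
	engine.IndexDocument(0, types.DocumentIndexData{
		Content: "",
		Tokens: []types.TokenData{
			{Text: "中国", Locations: []int{0}},
			{Text: "人口", Locations: []int{6}},
		},
	})
	engine.FlushIndex()
	engine.Search(types.SearchRequest{Text: "中国人口"})
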
+ 2 - 2
core/ranker.go

@@ -65,8 +65,8 @@ func (ranker *Ranker) Rank(
 			outputDocs = append(outputDocs, types.ScoredDocument{
 				DocId:                 d.DocId,
 				Scores:                scores,
-				TokenSnippetPositions: d.TokenSnippetLocations,
-				TokenPositions:        d.TokenLocations})
+				TokenSnippetLocations: d.TokenSnippetLocations,
+				TokenLocations:        d.TokenLocations})
 		}
 	}
 

+ 58 - 3
engine/engine_test.go

@@ -76,15 +76,15 @@ func TestEngineIndexDocument(t *testing.T) {
 
 	utils.Expect(t, "1", outputs.Docs[0].DocId)
 	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
-	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetPositions)
+	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
 
 	utils.Expect(t, "4", outputs.Docs[1].DocId)
 	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
-	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetPositions)
+	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
 
 	utils.Expect(t, "0", outputs.Docs[2].DocId)
 	utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
-	utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetPositions)
+	utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
 }
 
 func TestReverseOrder(t *testing.T) {
@@ -246,3 +246,58 @@ func TestRemoveDocument(t *testing.T) {
 	utils.Expect(t, "0", outputs.Docs[0].DocId)
 	utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
 }
+
+func TestEngineIndexDocumentWithTokens(t *testing.T) {
+	var engine Engine
+	engine.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../testdata/test_dict.txt",
+		DefaultRankOptions: &types.RankOptions{
+			OutputOffset:    0,
+			MaxOutputs:      10,
+			ScoringCriteria: &RankByTokenProximity{},
+		},
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.LocationsIndex,
+		},
+	})
+
+	docId := uint64(0)
+	engine.IndexDocument(docId, types.DocumentIndexData{
+		Content: "",
+		Tokens: []types.TokenData{
+			{"中国", []int{0}},
+			{"人口", []int{18, 24}},
+		},
+		Fields:  ScoringFields{1, 2, 3},
+	})
+	docId++
+	engine.IndexDocument(docId, types.DocumentIndexData{
+		Content: "",
+		Tokens: []types.TokenData{
+			{"中国", []int{0}},
+			{"人口", []int{6}},
+		},
+		Fields:  ScoringFields{1, 2, 3},
+	})
+	docId++
+	engine.IndexDocument(docId, types.DocumentIndexData{
+		Content: "中国十三亿人口",
+		Fields:  ScoringFields{0, 9, 1},
+	})
+
+	engine.FlushIndex()
+
+	outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
+	utils.Expect(t, "2", len(outputs.Tokens))
+	utils.Expect(t, "中国", outputs.Tokens[0])
+	utils.Expect(t, "人口", outputs.Tokens[1])
+	utils.Expect(t, "3", len(outputs.Docs))
+
+	utils.Expect(t, "1", outputs.Docs[0].DocId)
+	utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
+	utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
+
+	utils.Expect(t, "2", outputs.Docs[1].DocId)
+	utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
+	utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
+}

+ 20 - 8
engine/segmenter_worker.go

@@ -14,15 +14,27 @@ func (engine *Engine) segmenterWorker() {
 	for {
 		request := <-engine.segmenterChannel
 		shard := engine.getShard(request.hash)
-		segments := engine.segmenter.Segment([]byte(request.data.Content))
-		tokensMap := make(map[string][]int)
 
-		// Add the tokens produced by segmentation
-		for _, segment := range segments {
-			token := segment.Token().Text()
-			if !engine.stopTokens.IsStopToken(token) {
-				tokensMap[token] = append(tokensMap[token], segment.Start())
+		tokensMap := make(map[string][]int)
+		numTokens := 0
+		if request.data.Content != "" {
+			// When the document content is non-empty, derive the tokens by segmenting the content
+			segments := engine.segmenter.Segment([]byte(request.data.Content))
+			for _, segment := range segments {
+				token := segment.Token().Text()
+				if !engine.stopTokens.IsStopToken(token) {
+					tokensMap[token] = append(tokensMap[token], segment.Start())
+				}
+			}
+			numTokens = len(segments)
+		} else {
+			// Otherwise load the tokens supplied by the user
+			for _, t := range request.data.Tokens {
+				if !engine.stopTokens.IsStopToken(t.Text) {
+					tokensMap[t.Text] = t.Locations
+				}
 			}
+			numTokens = len(request.data.Tokens)
 		}
 
 		// Add document labels, which are not produced by segmentation
@@ -35,7 +47,7 @@ func (engine *Engine) segmenterWorker() {
 		indexerRequest := indexerAddDocumentRequest{
 			document: &types.DocumentIndex{
 				DocId:       request.docId,
-				TokenLength: float32(len(segments)),
+				TokenLength: float32(numTokens),
 				Keywords:    make([]types.KeywordIndex, len(tokensMap)),
 			},
 		}

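Either branch fills the same tokensMap that the indexer request below consumes. For the first document in the new test (Tokens 中国 at byte 0 and 人口 at bytes 18 and 24), the result would be, illustratively:

	tokensMap := map[string][]int{
		"中国": {0},
		"人口": {18, 24},
	}

and numTokens would be 2, playing the role that len(segments) played before.
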
+ 15 - 0
types/document_index_data.go

@@ -4,9 +4,24 @@ type DocumentIndexData struct {
 	// Full text of the document (must be UTF-8), used to generate the tokens to be indexed
 	Content string
 
+	// The document's tokens.
+	// When Content is non-empty, the tokens are instead derived from Content by segmentation.
+	// Tokens exists to bypass Wukong's built-in segmenter, so that segmentation
+	// and preprocessing can be done outside the engine.
+	Tokens []TokenData
+
 	// Document labels (must be UTF-8), e.g. the document's category; these labels do not appear in the document text
 	Labels []string
 
 	// The document's scoring fields, which may hold a struct of any type
 	Fields interface{}
 }
+
+// A single token of a document
+type TokenData struct {
+	// The token string
+	Text string
+
+	// Byte positions in the document at which the token's first byte appears
+	Locations []int
+}

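Note that Locations holds byte offsets into the UTF-8 text, not rune indexes: each character in "中国" takes 3 bytes, so in a document like "中国人口" the token "人口" starts at byte 6, which is exactly the offset used in the new test. A quick standalone check (not part of the commit):

	package main

	import (
		"fmt"
		"strings"
	)

	func main() {
		// strings.Index returns a byte offset; "中国" occupies bytes 0-5,
		// so "人口" starts at byte 6.
		fmt.Println(strings.Index("中国人口", "人口")) // 6
	}
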
+ 2 - 2
types/search_response.go

@@ -24,11 +24,11 @@ type ScoredDocument struct {
 
 	// Byte positions in the text of the tokens used to generate the snippet; this slice has the same length as SearchResponse.Tokens
 	// Non-empty only when IndexType == LocationsIndex
-	TokenSnippetPositions []int
+	TokenSnippetLocations []int
 
 	// Positions at which the tokens appear
 	// Non-empty only when IndexType == LocationsIndex
-	TokenPositions [][]int
+	TokenLocations [][]int
 }
 
 // For convenient sorting