
Merge pull request #45 from faberliu/master

Fix a bug where, with the LocationsIndex index type, searches in certain cases returned abnormal results and part of the matching documents were dropped
Hui Chen, 9 years ago
Commit 95597125aa
4 changed files with 141 additions and 6 deletions
  1. core/indexer.go (+5 -3)
  2. core/indexer_test.go (+61 -1)
  3. engine/segmenter_worker.go (+8 -2)
  4. examples/enjoy_wukong.go (+67 -0)

+ 5 - 3
core/indexer.go

@@ -1,12 +1,13 @@
 package core
 
 import (
-	"github.com/huichen/wukong/types"
-	"github.com/huichen/wukong/utils"
 	"log"
 	"math"
 	"sort"
 	"sync"
+
+	"github.com/huichen/wukong/types"
+	"github.com/huichen/wukong/utils"
 )
 
 // Indexer
@@ -393,7 +394,8 @@ func (indexer *Indexer) Lookup(
 						})
 					}
 					numDocs++
-					break
+					// When one keyword matches several documents and label keywords are present, breaking here outright would drop a large share of the search results
+					continue
 				}
 
 // Compute the proximity distance of the search keys within the document

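The one-line change above is the heart of the fix: inside Indexer.Lookup, a document whose keyword match comes only from a label has no location list to score, and the old code hit break at that point, abandoning every remaining candidate document for that keyword. Switching to continue skips only that document's proximity scoring while the scan goes on. A minimal, self-contained sketch of the difference (the Document type and the loop are invented for illustration, not the actual Lookup code):

package main

import "fmt"

// Document stands in for one indexed entry; an empty Locations slice plays the
// role of a label-only match, which has nothing to score for proximity.
type Document struct {
	ID        int
	Locations []int
}

func main() {
	docs := []Document{
		{ID: 0, Locations: nil},         // label-only match
		{ID: 1, Locations: []int{3, 7}}, // match with real locations
		{ID: 2, Locations: nil},         // label-only match
	}

	numDocs := 0
	for _, doc := range docs {
		if len(doc.Locations) == 0 {
			numDocs++
			continue // a break here would silently skip documents 1 and 2
		}
		// proximity scoring for documents with real locations would happen here
		numDocs++
	}
	fmt.Println("documents kept:", numDocs) // 3 with continue, only 1 with break
}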
+ 61 - 1
core/indexer_test.go

@@ -1,9 +1,11 @@
 package core
 
 import (
+	"testing"
+
+	"github.com/huichen/wukong/engine"
 	"github.com/huichen/wukong/types"
 	"github.com/huichen/wukong/utils"
-	"testing"
 )
 
 func TestAddKeywords(t *testing.T) {
@@ -440,3 +442,61 @@ func TestLookupWithLocations(t *testing.T) {
 	docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
 	utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
 }
+
+func TestLookupWithLocations1(t *testing.T) {
+
+	type Data struct {
+		Id      int
+		Content string
+		Labels  []string
+	}
+
+	datas := make([]Data, 0)
+
+	data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
+	datas = append(datas, data0)
+
+	data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
+	datas = append(datas, data1)
+
+	data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
+	datas = append(datas, data2)
+
+	data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
+	datas = append(datas, data3)
+
+	data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
+	datas = append(datas, data4)
+
+	// Initialize the engine
+	searcher_locations := engine.Engine{}
+	searcher_locations.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../data/dictionary.txt",
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.LocationsIndex,
+		},
+	})
+	defer searcher_locations.Close()
+	for _, data := range datas {
+		searcher_locations.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
+	}
+	searcher_locations.FlushIndex()
+	res_locations := searcher_locations.Search(types.SearchRequest{Text: "百度"})
+
+	searcher_docids := engine.Engine{}
+	searcher_docids.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../data/dictionary.txt",
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.DocIdsIndex,
+		},
+	})
+	defer searcher_docids.Close()
+	for _, data := range datas {
+		searcher_docids.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
+	}
+	searcher_docids.FlushIndex()
+	res_docids := searcher_docids.Search(types.SearchRequest{Text: "百度"})
+	if res_docids.NumDocs != res_locations.NumDocs {
+		t.Errorf("expected number of search results = \"%d\", got \"%d\"", res_docids.NumDocs, res_locations.NumDocs)
+	}
+}

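The new test indexes the same five documents into two engines, one built with types.LocationsIndex and one with types.DocIdsIndex, and asserts that a search for 百度 returns the same NumDocs from both; before the fix the LocationsIndex engine reported fewer results. Assuming a standard checkout with the dictionary present at data/dictionary.txt, it can be run on its own with the usual Go tooling:

	go test -run TestLookupWithLocations1 ./core/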
+ 8 - 2
engine/segmenter_worker.go

@@ -50,10 +50,16 @@ func (engine *Engine) segmenterWorker() {
 		for _, label := range request.data.Labels {
 			if !engine.initOptions.NotUsingSegmenter {
 				if !engine.stopTokens.IsStopToken(label) {
-					tokensMap[label] = []int{}
+					// If the label already exists as a keyword in the body text, assigning without this check would lose its location information
+					if _, ok := tokensMap[label]; !ok {
+						tokensMap[label] = []int{}
+					}
 				}
 			} else {
-				tokensMap[label] = []int{}
+				// If the label already exists as a keyword in the body text, assigning without this check would lose its location information
+				if _, ok := tokensMap[label]; !ok {
+					tokensMap[label] = []int{}
+				}
 			}
 		}
 

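The segmenter change addresses the companion half of the bug: when a document's label (for example 百度) also appears in the body text, the segmenter has already recorded that token's byte offsets in tokensMap, and the old unconditional tokensMap[label] = []int{} wiped them out. The added existence check keeps the locations. A small standalone sketch of the pattern, with made-up offsets and labels:

package main

import "fmt"

func main() {
	// tokensMap mirrors the segmenter's token -> byte-offset map; here the token
	// "百度" has already been found in the body at offsets 0 and 12 (made-up values).
	tokensMap := map[string][]int{"百度": {0, 12}}

	labels := []string{"百度", "新闻"}
	for _, label := range labels {
		// Only create an empty entry for labels that are not already tokens;
		// an unconditional tokensMap[label] = []int{} would erase {0, 12}.
		if _, ok := tokensMap[label]; !ok {
			tokensMap[label] = []int{}
		}
	}

	fmt.Println(tokensMap) // "百度" keeps its locations, "新闻" gets an empty list
}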
+ 67 - 0
examples/enjoy_wukong.go

@@ -0,0 +1,67 @@
+package main
+
+import (
+	"log"
+
+	"github.com/huichen/wukong/engine"
+	"github.com/huichen/wukong/types"
+)
+
+var (
+	searcher = engine.Engine{}
+)
+
+type Data struct {
+	Id      int
+	Content string
+	Labels  []string
+}
+
+func (d *Data) Print() {
+	log.Println(d.Id, d.Content, d.Labels)
+}
+
+func main() {
+	datas := make([]Data, 0)
+
+	data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
+	datas = append(datas, data0)
+
+	data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
+	datas = append(datas, data1)
+
+	data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
+	datas = append(datas, data2)
+
+	data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
+	datas = append(datas, data3)
+
+	data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
+	datas = append(datas, data4)
+
+	// Initialize the engine
+	searcher.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../data/dictionary.txt",
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.LocationsIndex,
+			//IndexType: types.FrequenciesIndex,
+			//IndexType: types.DocIdsIndex,
+		},
+	})
+	defer searcher.Close()
+
+	// Add the documents to the index
+	for _, data := range datas {
+		searcher.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
+	}
+
+	// Wait for the index flush to finish
+	searcher.FlushIndex()
+
+	// See the types.SearchResponse struct for the search output format
+	res := searcher.Search(types.SearchRequest{Text: "百度"})
+	log.Println("keywords", res.Tokens, "returned", res.NumDocs, "search results")
+	for i := range res.Docs {
+		datas[res.Docs[i].DocId].Print()
+	}
+}
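The example indexes the five documents with a LocationsIndex and searches for 百度; with this fix every document that contains 百度 in its content or labels is printed, matching what the DocIdsIndex variant returns. Assuming the dictionary sits at data/dictionary.txt as in the repository layout, the program can be run from the examples directory:

	cd examples && go run enjoy_wukong.go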