
Merge pull request #45 from faberliu/master

Fix a bug where, with the LocationsIndex index type, searches in certain cases returned abnormal results and part of the matching documents were dropped
Hui Chen, 9 years ago
Commit 95597125aa
4 changed files with 141 additions and 6 deletions
  1. core/indexer.go (+5 -3)
  2. core/indexer_test.go (+61 -1)
  3. engine/segmenter_worker.go (+8 -2)
  4. examples/enjoy_wukong.go (+67 -0)

+ 5 - 3
core/indexer.go

@@ -1,12 +1,13 @@
 package core
 
 import (
-	"github.com/huichen/wukong/types"
-	"github.com/huichen/wukong/utils"
 	"log"
 	"math"
 	"sort"
 	"sync"
+
+	"github.com/huichen/wukong/types"
+	"github.com/huichen/wukong/utils"
 )
 
 // Indexer
@@ -393,7 +394,8 @@ func (indexer *Indexer) Lookup(
 						})
 					}
 					numDocs++
-					break
+					// When one keyword matches several documents and label keywords are present, breaking here outright would drop a large share of the search results
+					continue
 				}
 
 // Compute the proximity distance of the search keys within the document

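The one-line change above is the heart of the fix: inside Indexer.Lookup, a document whose keyword match comes only from a label has no location list to score, and the old code hit break at that point, abandoning every remaining candidate document for that keyword. Switching to continue skips only that document's proximity scoring while the scan goes on. A minimal, self-contained sketch of the difference (the Document type and the loop are invented for illustration, not the actual Lookup code):

package main

import "fmt"

// Document stands in for one indexed entry; an empty Locations slice plays the
// role of a label-only match, which has nothing to score for proximity.
type Document struct {
	ID        int
	Locations []int
}

func main() {
	docs := []Document{
		{ID: 0, Locations: nil},         // label-only match
		{ID: 1, Locations: []int{3, 7}}, // match with real locations
		{ID: 2, Locations: nil},         // label-only match
	}

	numDocs := 0
	for _, doc := range docs {
		if len(doc.Locations) == 0 {
			numDocs++
			continue // a break here would silently skip documents 1 and 2
		}
		// proximity scoring for documents with real locations would happen here
		numDocs++
	}
	fmt.Println("documents kept:", numDocs) // 3 with continue, only 1 with break
}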
+ 61 - 1
core/indexer_test.go

@@ -1,9 +1,11 @@
 package core
 
 import (
+	"testing"
+
+	"github.com/huichen/wukong/engine"
 	"github.com/huichen/wukong/types"
 	"github.com/huichen/wukong/utils"
-	"testing"
 )
 
 func TestAddKeywords(t *testing.T) {
@@ -440,3 +442,61 @@ func TestLookupWithLocations(t *testing.T) {
 	docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
 	utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
 }
+
+func TestLookupWithLocations1(t *testing.T) {
+
+	type Data struct {
+		Id      int
+		Content string
+		Labels  []string
+	}
+
+	datas := make([]Data, 0)
+
+	data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
+	datas = append(datas, data0)
+
+	data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
+	datas = append(datas, data1)
+
+	data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
+	datas = append(datas, data2)
+
+	data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
+	datas = append(datas, data3)
+
+	data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
+	datas = append(datas, data4)
+
+	// Initialize the engine
+	searcher_locations := engine.Engine{}
+	searcher_locations.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../data/dictionary.txt",
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.LocationsIndex,
+		},
+	})
+	defer searcher_locations.Close()
+	for _, data := range datas {
+		searcher_locations.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
+	}
+	searcher_locations.FlushIndex()
+	res_locations := searcher_locations.Search(types.SearchRequest{Text: "百度"})
+
+	searcher_docids := engine.Engine{}
+	searcher_docids.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../data/dictionary.txt",
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.DocIdsIndex,
+		},
+	})
+	defer searcher_docids.Close()
+	for _, data := range datas {
+		searcher_docids.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
+	}
+	searcher_docids.FlushIndex()
+	res_docids := searcher_docids.Search(types.SearchRequest{Text: "百度"})
+	if res_docids.NumDocs != res_locations.NumDocs {
+		t.Errorf("expected number of search results = \"%d\", got \"%d\"", res_docids.NumDocs, res_locations.NumDocs)
+	}
+}

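The new test indexes the same five documents into two engines, one built with types.LocationsIndex and one with types.DocIdsIndex, and asserts that a search for 百度 returns the same NumDocs from both; before the fix the LocationsIndex engine reported fewer results. Assuming a standard checkout with the dictionary present at data/dictionary.txt, it can be run on its own with the usual Go tooling:

	go test -run TestLookupWithLocations1 ./core/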
+ 8 - 2
engine/segmenter_worker.go

@@ -50,10 +50,16 @@ func (engine *Engine) segmenterWorker() {
 		for _, label := range request.data.Labels {
 			if !engine.initOptions.NotUsingSegmenter {
 				if !engine.stopTokens.IsStopToken(label) {
-					tokensMap[label] = []int{}
+					// If the label already exists as a keyword in the body text, assigning without this check would lose its location information
+					if _, ok := tokensMap[label]; !ok {
+						tokensMap[label] = []int{}
+					}
 				}
 			} else {
-				tokensMap[label] = []int{}
+				// If the label already exists as a keyword in the body text, assigning without this check would lose its location information
+				if _, ok := tokensMap[label]; !ok {
+					tokensMap[label] = []int{}
+				}
 			}
 		}
 

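The segmenter change addresses the companion half of the bug: when a document's label (for example 百度) also appears in the body text, the segmenter has already recorded that token's byte offsets in tokensMap, and the old unconditional tokensMap[label] = []int{} wiped them out. The added existence check keeps the locations. A small standalone sketch of the pattern, with made-up offsets and labels:

package main

import "fmt"

func main() {
	// tokensMap mirrors the segmenter's token -> byte-offset map; here the token
	// "百度" has already been found in the body at offsets 0 and 12 (made-up values).
	tokensMap := map[string][]int{"百度": {0, 12}}

	labels := []string{"百度", "新闻"}
	for _, label := range labels {
		// Only create an empty entry for labels that are not already tokens;
		// an unconditional tokensMap[label] = []int{} would erase {0, 12}.
		if _, ok := tokensMap[label]; !ok {
			tokensMap[label] = []int{}
		}
	}

	fmt.Println(tokensMap) // "百度" keeps its locations, "新闻" gets an empty list
}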
+ 67 - 0
examples/enjoy_wukong.go

@@ -0,0 +1,67 @@
+package main
+
+import (
+	"log"
+
+	"github.com/huichen/wukong/engine"
+	"github.com/huichen/wukong/types"
+)
+
+var (
+	searcher = engine.Engine{}
+)
+
+type Data struct {
+	Id      int
+	Content string
+	Labels  []string
+}
+
+func (d *Data) Print() {
+	log.Println(d.Id, d.Content, d.Labels)
+}
+
+func main() {
+	datas := make([]Data, 0)
+
+	data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
+	datas = append(datas, data0)
+
+	data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
+	datas = append(datas, data1)
+
+	data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
+	datas = append(datas, data2)
+
+	data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
+	datas = append(datas, data3)
+
+	data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
+	datas = append(datas, data4)
+
+	// Initialize the engine
+	searcher.Init(types.EngineInitOptions{
+		SegmenterDictionaries: "../data/dictionary.txt",
+		IndexerInitOptions: &types.IndexerInitOptions{
+			IndexType: types.LocationsIndex,
+			//IndexType: types.FrequenciesIndex,
+			//IndexType: types.DocIdsIndex,
+		},
+	})
+	defer searcher.Close()
+
+	// Add the documents to the index
+	for _, data := range datas {
+		searcher.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
+	}
+
+	// Wait for the index flush to finish
+	searcher.FlushIndex()
+
+	// See the types.SearchResponse struct for the search output format
+	res := searcher.Search(types.SearchRequest{Text: "百度"})
+	log.Println("keywords", res.Tokens, "returned", res.NumDocs, "search results")
+	for i := range res.Docs {
+		datas[res.Docs[i].DocId].Print()
+	}
+}
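The example indexes the five documents with a LocationsIndex and searches for 百度; with this fix every document that contains 百度 in its content or labels is printed, matching what the DocIdsIndex variant returns. Assuming the dictionary sits at data/dictionary.txt as in the repository layout, the program can be run from the examples directory:

	cd examples && go run enjoy_wukong.go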