Browse Source

change docId to string type

xinxin 4 years ago
parent
commit
3d75a88c8c

+ 19 - 0
README.md

@@ -1,3 +1,22 @@
+wukongd
+========
+wukongd 是基于 [wukong](https://github.com/huichen/wukong) 的搜索引擎. wukong 是一个注重研究和部分实用性的项目,
+wukongd 在其基础上主要做了如下更新:
+
+* 文档 Id (docId) 不再是 uint64 类型, 而是 string 类型.
+	这样就简单地使 Id 有更强的表达能力, 轻易地支持多库多表多字段的索引. 只比 uint64 类型有轻微的性能损失.
+* 增加 REST RPC 接口.
+	引擎部分不再作为一个库被使用, 只需要启动 wukongd 调用其接口即可.
+* 增加 wukongd 配置文件.
+
+* 如果需要使用 wukong 的代码, 可以直接导航到其 github repo, 或 checkout orig 分支.
+
+TODO
+=====
+
+* 定义配置文件内容
+* 定义 API 协议
+
 悟空全文搜索引擎
 ======
 

+ 15 - 17
core/indexer.go

@@ -17,7 +17,7 @@ type Indexer struct {
 	tableLock struct {
 		sync.RWMutex
 		table     map[string]*KeywordIndices
-		docsState map[uint64]int // nil: 表示无状态记录,0: 存在于索引中,1: 等待删除,2: 等待加入
+		docsState map[string]int // nil: 表示无状态记录,0: 存在于索引中,1: 等待删除,2: 等待加入
 	}
 	addCacheLock struct {
 		sync.RWMutex
@@ -40,13 +40,13 @@ type Indexer struct {
 	totalTokenLength float32
 
 	// 每个文档的关键词长度
-	docTokenLengths map[uint64]float32
+	docTokenLengths map[string]float32
 }
 
 // 反向索引表的一行,收集了一个搜索键出现的所有文档,按照DocId从小到大排序。
 type KeywordIndices struct {
 	// 下面的切片是否为空,取决于初始化时IndexType的值
-	docIds      []uint64  // 全部类型都有
+	docIds      []string  // 全部类型都有
 	frequencies []float32 // IndexType == FrequenciesIndex
 	locations   [][]int   // IndexType == LocationsIndex
 }
@@ -61,14 +61,14 @@ func (indexer *Indexer) Init(options types.IndexerInitOptions) {
 	indexer.initialized = true
 
 	indexer.tableLock.table = make(map[string]*KeywordIndices)
-	indexer.tableLock.docsState = make(map[uint64]int)
+	indexer.tableLock.docsState = make(map[string]int)
 	indexer.addCacheLock.addCache = make([]*types.DocumentIndex, indexer.initOptions.DocCacheSize)
-	indexer.removeCacheLock.removeCache = make([]uint64, indexer.initOptions.DocCacheSize*2)
-	indexer.docTokenLengths = make(map[uint64]float32)
+	indexer.removeCacheLock.removeCache = make([]string, indexer.initOptions.DocCacheSize*2)
+	indexer.docTokenLengths = make(map[string]float32)
 }
 
 // 从KeywordIndices中得到第i个文档的DocId
-func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
+func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) string {
 	return ti.docIds[i]
 }
 
@@ -116,7 +116,7 @@ func (indexer *Indexer) AddDocumentToCache(document *types.DocumentIndex, forceU
 		}
 
 		indexer.tableLock.Unlock()
-		if indexer.RemoveDocumentToCache(0, forceUpdate) {
+		if indexer.RemoveDocumentToCache("", forceUpdate) {
 			// 只有当存在于索引表中的文档已被删除,其才可以重新加入到索引表中
 			position = 0
 		}
@@ -171,14 +171,13 @@ func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
 				case types.FrequenciesIndex:
 					ti.frequencies = []float32{keyword.Frequency}
 				}
-				ti.docIds = []uint64{document.DocId}
+				ti.docIds = []string{document.DocId}
 				indexer.tableLock.table[keyword.Text] = &ti
 				continue
 			}
 
 			// 查找应该插入的位置,且索引一定不存在
-			position, _ := indexer.searchIndex(
-				indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
+			position, _ := indexer.searchIndex(indices, indexPointers[keyword.Text], indexer.getIndexLength(indices)-1, document.DocId)
 			indexPointers[keyword.Text] = position
 			switch indexer.initOptions.IndexType {
 			case types.LocationsIndex:
@@ -190,7 +189,7 @@ func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
 				copy(indices.frequencies[position+1:], indices.frequencies[position:])
 				indices.frequencies[position] = keyword.Frequency
 			}
-			indices.docIds = append(indices.docIds, 0)
+			indices.docIds = append(indices.docIds, "")
 			copy(indices.docIds[position+1:], indices.docIds[position:])
 			indices.docIds[position] = document.DocId
 		}
@@ -205,13 +204,13 @@ func (indexer *Indexer) AddDocuments(documents *types.DocumentsIndex) {
 
 // 向 REMOVECACHE 中加入一个待删除文档
 // 返回值表示文档是否在索引表中被删除
-func (indexer *Indexer) RemoveDocumentToCache(docId uint64, forceUpdate bool) bool {
+func (indexer *Indexer) RemoveDocumentToCache(docId string, forceUpdate bool) bool {
 	if indexer.initialized == false {
 		log.Fatal("索引器尚未初始化")
 	}
 
 	indexer.removeCacheLock.Lock()
-	if docId != 0 {
+	if docId != "" {
 		indexer.tableLock.Lock()
 		if docState, ok := indexer.tableLock.docsState[docId]; ok && docState == 0 {
 			indexer.removeCacheLock.removeCache[indexer.removeCacheLock.removeCachePointer] = docId
@@ -303,7 +302,7 @@ func (indexer *Indexer) RemoveDocuments(documents *types.DocumentsId) {
 // 查找包含全部搜索键(AND操作)的文档
 // 当docIds不为nil时仅从docIds指定的文档中查找
 func (indexer *Indexer) Lookup(
-	tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
+	tokens []string, labels []string, docIds map[string]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
 	if indexer.initialized == false {
 		log.Fatal("索引器尚未初始化")
 	}
@@ -454,8 +453,7 @@ func (indexer *Indexer) Lookup(
 // 二分法查找indices中某文档的索引项
 // 第一个返回参数为找到的位置或需要插入的位置
 // 第二个返回参数标明是否找到
-func (indexer *Indexer) searchIndex(
-	indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
+func (indexer *Indexer) searchIndex(indices *KeywordIndices, start int, end int, docId string) (int, bool) {
 	// 特殊情况
 	if indexer.getIndexLength(indices) == start {
 		return start, false

+ 44 - 44
core/indexer_test.go

@@ -11,27 +11,27 @@ func TestAddKeywords(t *testing.T) {
 	var indexer Indexer
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:    1,
+		DocId:    "1",
 		Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
 	}, false)
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:    2,
+		DocId:    "2",
 		Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
 	}, false)
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:    3,
+		DocId:    "3",
 		Keywords: []types.KeywordIndex{{"token3", 0, []int{}}},
 	}, false)
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:    7,
+		DocId:    "7",
 		Keywords: []types.KeywordIndex{{"token7", 0, []int{}}},
 	}, false)
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:    1,
+		DocId:    "1",
 		Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
 	}, false)
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:    7,
+		DocId:    "7",
 		Keywords: []types.KeywordIndex{{"token77", 0, []int{}}},
 	}, false)
 	indexer.AddDocumentToCache(nil, true)
@@ -48,7 +48,7 @@ func TestRemoveDocument(t *testing.T) {
 
 	// doc1 = "token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -56,7 +56,7 @@ func TestRemoveDocument(t *testing.T) {
 	}, false)
 	// doc2 = "token1 token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -66,10 +66,10 @@ func TestRemoveDocument(t *testing.T) {
 	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token2"))
 	utils.Expect(t, "1 ", indicesToString(&indexer, "token3"))
 
-	indexer.RemoveDocumentToCache(2, false)
+	indexer.RemoveDocumentToCache("2", false)
 	// doc1 = "token1 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -81,7 +81,7 @@ func TestRemoveDocument(t *testing.T) {
 
 	// doc2 = "token1 token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -94,20 +94,20 @@ func TestRemoveDocument(t *testing.T) {
 
 	// doc3 = "token1 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 3,
+		DocId: "3",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
 		},
 	}, true)
-	indexer.RemoveDocumentToCache(3, true)
+	indexer.RemoveDocumentToCache("3", true)
 	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token1"))
 	utils.Expect(t, "2 ", indicesToString(&indexer, "token2"))
 	utils.Expect(t, "1 2 ", indicesToString(&indexer, "token3"))
 
 	// doc2 = "token1 token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -115,7 +115,7 @@ func TestRemoveDocument(t *testing.T) {
 	}, true)
 	// doc3 = "token1 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 3,
+		DocId: "3",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -131,7 +131,7 @@ func TestLookupLocationsIndex(t *testing.T) {
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
 	// doc1 = "token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -139,7 +139,7 @@ func TestLookupLocationsIndex(t *testing.T) {
 	}, false)
 	// doc2 = "token1 token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -148,7 +148,7 @@ func TestLookupLocationsIndex(t *testing.T) {
 	}, false)
 	// doc3 = "token1 token2"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 3,
+		DocId: "3",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -156,14 +156,14 @@ func TestLookupLocationsIndex(t *testing.T) {
 	}, false)
 	// doc4 = "token2"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 4,
+		DocId: "4",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 		},
 	}, false)
 	// doc7 = "token1 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 7,
+		DocId: "7",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -171,7 +171,7 @@ func TestLookupLocationsIndex(t *testing.T) {
 	}, false)
 	// doc9 = "token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 9,
+		DocId: "9",
 		Keywords: []types.KeywordIndex{
 			{"token3", 0, []int{0}},
 		},
@@ -211,7 +211,7 @@ func TestLookupDocIdsIndex(t *testing.T) {
 	indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
 	// doc1 = "token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -219,7 +219,7 @@ func TestLookupDocIdsIndex(t *testing.T) {
 	}, false)
 	// doc2 = "token1 token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -228,7 +228,7 @@ func TestLookupDocIdsIndex(t *testing.T) {
 	}, false)
 	// doc3 = "token1 token2"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 3,
+		DocId: "3",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -236,14 +236,14 @@ func TestLookupDocIdsIndex(t *testing.T) {
 	}, false)
 	// doc4 = "token2"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 4,
+		DocId: "4",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 		},
 	}, false)
 	// doc7 = "token1 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 7,
+		DocId: "7",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -251,7 +251,7 @@ func TestLookupDocIdsIndex(t *testing.T) {
 	}, false)
 	// doc9 = "token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 9,
+		DocId: "9",
 		Keywords: []types.KeywordIndex{
 			{"token3", 0, []int{0}},
 		},
@@ -292,7 +292,7 @@ func TestLookupWithProximity(t *testing.T) {
 
 	// doc1 = "token2 token4 token4 token2 token3 token4"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
@@ -304,7 +304,7 @@ func TestLookupWithProximity(t *testing.T) {
 
 	// doc1 = "t2 t1 . . . t2 t3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"t1", 0, []int{3}},
 			{"t2", 0, []int{0, 12}},
@@ -316,7 +316,7 @@ func TestLookupWithProximity(t *testing.T) {
 
 	// doc1 = "t3 t2 t1 . . . . . t2 t3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"t1", 0, []int{6}},
 			{"t2", 0, []int{3, 19}},
@@ -332,7 +332,7 @@ func TestLookupWithPartialLocations(t *testing.T) {
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
 	// doc1 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
@@ -342,7 +342,7 @@ func TestLookupWithPartialLocations(t *testing.T) {
 	}, false)
 	// doc2 = "token2 token4 token4 token2 token3 token4"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
@@ -367,7 +367,7 @@ func TestLookupWithBM25(t *testing.T) {
 	})
 	// doc1 = "token2 token4 token4 token2 token3 token4"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:       1,
+		DocId:       "1",
 		TokenLength: 6,
 		Keywords: []types.KeywordIndex{
 			{"token2", 3, []int{0, 21}},
@@ -377,7 +377,7 @@ func TestLookupWithBM25(t *testing.T) {
 	}, false)
 	// doc2 = "token6 token7"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId:       2,
+		DocId:       "2",
 		TokenLength: 2,
 		Keywords: []types.KeywordIndex{
 			{"token6", 3, []int{0}},
@@ -396,7 +396,7 @@ func TestLookupWithinDocIds(t *testing.T) {
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
 	// doc1 = "token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 			{"token3", 0, []int{7}},
@@ -404,7 +404,7 @@ func TestLookupWithinDocIds(t *testing.T) {
 	}, false)
 	// doc2 = "token1 token2 token3"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -413,7 +413,7 @@ func TestLookupWithinDocIds(t *testing.T) {
 	}, false)
 	// doc3 = "token1 token2"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 3,
+		DocId: "3",
 		Keywords: []types.KeywordIndex{
 			{"token1", 0, []int{0}},
 			{"token2", 0, []int{7}},
@@ -421,15 +421,15 @@ func TestLookupWithinDocIds(t *testing.T) {
 	}, false)
 	// doc4 = "token2"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 4,
+		DocId: "4",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0}},
 		},
 	}, true)
 
-	docIds := make(map[uint64]bool)
-	docIds[1] = true
-	docIds[3] = true
+	docIds := make(map[string]bool)
+	docIds["1"] = true
+	docIds["3"] = true
 	utils.Expect(t, "[3 0 [7]] [1 0 [0]] ",
 		indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false)))
 }
@@ -439,7 +439,7 @@ func TestLookupWithLocations(t *testing.T) {
 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
 	// doc1 = "token2 token4 token4 token2 token3 token4"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 1,
+		DocId: "1",
 		Keywords: []types.KeywordIndex{
 			{"token2", 0, []int{0, 21}},
 			{"token3", 0, []int{28}},
@@ -449,7 +449,7 @@ func TestLookupWithLocations(t *testing.T) {
 
 	// doc2 = "token2 token4 token4 token2 token3 token4"
 	indexer.AddDocumentToCache(&types.DocumentIndex{
-		DocId: 2,
+		DocId: "2",
 		Keywords: []types.KeywordIndex{
 			{"token3", 0, []int{0, 21}},
 			{"token5", 0, []int{28}},
@@ -457,7 +457,7 @@ func TestLookupWithLocations(t *testing.T) {
 		},
 	}, true)
 
-	indexer.RemoveDocumentToCache(2, true)
+	indexer.RemoveDocumentToCache("2", true)
 	docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
 	utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
 }

+ 9 - 8
core/ranker.go

@@ -1,18 +1,19 @@
 package core
 
 import (
-	"github.com/huichen/wukong/types"
-	"github.com/huichen/wukong/utils"
 	"log"
 	"sort"
 	"sync"
+
+	"github.com/huichen/wukong/types"
+	"github.com/huichen/wukong/utils"
 )
 
 type Ranker struct {
 	lock struct {
 		sync.RWMutex
-		fields map[uint64]interface{}
-		docs   map[uint64]bool
+		fields map[string]interface{}
+		docs   map[string]bool
 	}
 	initialized bool
 }
@@ -23,12 +24,12 @@ func (ranker *Ranker) Init() {
 	}
 	ranker.initialized = true
 
-	ranker.lock.fields = make(map[uint64]interface{})
-	ranker.lock.docs = make(map[uint64]bool)
+	ranker.lock.fields = make(map[string]interface{})
+	ranker.lock.docs = make(map[string]bool)
 }
 
 // 给某个文档添加评分字段
-func (ranker *Ranker) AddDoc(docId uint64, fields interface{}) {
+func (ranker *Ranker) AddDoc(docId string, fields interface{}) {
 	if ranker.initialized == false {
 		log.Fatal("排序器尚未初始化")
 	}
@@ -40,7 +41,7 @@ func (ranker *Ranker) AddDoc(docId uint64, fields interface{}) {
 }
 
 // 删除某个文档的评分字段
-func (ranker *Ranker) RemoveDoc(docId uint64) {
+func (ranker *Ranker) RemoveDoc(docId string) {
 	if ranker.initialized == false {
 		log.Fatal("排序器尚未初始化")
 	}

+ 33 - 32
core/ranker_test.go

@@ -1,10 +1,11 @@
 package core
 
 import (
-	"github.com/huichen/wukong/types"
-	"github.com/huichen/wukong/utils"
 	"reflect"
 	"testing"
+
+	"github.com/huichen/wukong/types"
+	"github.com/huichen/wukong/utils"
 )
 
 type DummyScoringFields struct {
@@ -33,22 +34,22 @@ func (criteria DummyScoringCriteria) Score(
 func TestRankDocument(t *testing.T) {
 	var ranker Ranker
 	ranker.Init()
-	ranker.AddDoc(1, DummyScoringFields{})
-	ranker.AddDoc(3, DummyScoringFields{})
-	ranker.AddDoc(4, DummyScoringFields{})
+	ranker.AddDoc("1", DummyScoringFields{})
+	ranker.AddDoc("3", DummyScoringFields{})
+	ranker.AddDoc("4", DummyScoringFields{})
 
 	scoredDocs, _ := ranker.Rank([]types.IndexedDocument{
-		types.IndexedDocument{DocId: 1, BM25: 6},
-		types.IndexedDocument{DocId: 3, BM25: 24},
-		types.IndexedDocument{DocId: 4, BM25: 18},
+		types.IndexedDocument{DocId: "1", BM25: 6},
+		types.IndexedDocument{DocId: "3", BM25: 24},
+		types.IndexedDocument{DocId: "4", BM25: 18},
 	}, types.RankOptions{ScoringCriteria: types.RankByBM25{}}, false)
 	utils.Expect(t, "[3 [24000 ]] [4 [18000 ]] [1 [6000 ]] ", scoredDocsToString(scoredDocs))
 
 	scoredDocs, _ = ranker.Rank([]types.IndexedDocument{
-		types.IndexedDocument{DocId: 1, BM25: 6},
-		types.IndexedDocument{DocId: 3, BM25: 24},
-		types.IndexedDocument{DocId: 2, BM25: 0},
-		types.IndexedDocument{DocId: 4, BM25: 18},
+		types.IndexedDocument{DocId: "1", BM25: 6},
+		types.IndexedDocument{DocId: "3", BM25: 24},
+		types.IndexedDocument{DocId: "2", BM25: 0},
+		types.IndexedDocument{DocId: "4", BM25: 18},
 	}, types.RankOptions{ScoringCriteria: types.RankByBM25{}, ReverseOrder: true}, false)
 	// doc0因为没有AddDoc所以没有添加进来
 	utils.Expect(t, "[1 [6000 ]] [4 [18000 ]] [3 [24000 ]] ", scoredDocsToString(scoredDocs))
@@ -57,22 +58,22 @@ func TestRankDocument(t *testing.T) {
 func TestRankWithCriteria(t *testing.T) {
 	var ranker Ranker
 	ranker.Init()
-	ranker.AddDoc(1, DummyScoringFields{
+	ranker.AddDoc("1", DummyScoringFields{
 		label:   "label3",
 		counter: 3,
 		amount:  22.3,
 	})
-	ranker.AddDoc(2, DummyScoringFields{
+	ranker.AddDoc("2", DummyScoringFields{
 		label:   "label4",
 		counter: 1,
 		amount:  2,
 	})
-	ranker.AddDoc(3, DummyScoringFields{
+	ranker.AddDoc("3", DummyScoringFields{
 		label:   "label1",
 		counter: 7,
 		amount:  10.3,
 	})
-	ranker.AddDoc(4, DummyScoringFields{
+	ranker.AddDoc("4", DummyScoringFields{
 		label:   "label1",
 		counter: -1,
 		amount:  2.3,
@@ -80,19 +81,19 @@ func TestRankWithCriteria(t *testing.T) {
 
 	criteria := DummyScoringCriteria{}
 	scoredDocs, _ := ranker.Rank([]types.IndexedDocument{
-		types.IndexedDocument{DocId: 1, TokenProximity: 6},
-		types.IndexedDocument{DocId: 2, TokenProximity: -1},
-		types.IndexedDocument{DocId: 3, TokenProximity: 24},
-		types.IndexedDocument{DocId: 4, TokenProximity: 18},
+		types.IndexedDocument{DocId: "1", TokenProximity: 6},
+		types.IndexedDocument{DocId: "2", TokenProximity: -1},
+		types.IndexedDocument{DocId: "3", TokenProximity: 24},
+		types.IndexedDocument{DocId: "4", TokenProximity: 18},
 	}, types.RankOptions{ScoringCriteria: criteria}, false)
 	utils.Expect(t, "[1 [25300 ]] [3 [17300 ]] [2 [3000 ]] [4 [1300 ]] ", scoredDocsToString(scoredDocs))
 
 	criteria.Threshold = 4
 	scoredDocs, _ = ranker.Rank([]types.IndexedDocument{
-		types.IndexedDocument{DocId: 1, TokenProximity: 6},
-		types.IndexedDocument{DocId: 2, TokenProximity: -1},
-		types.IndexedDocument{DocId: 3, TokenProximity: 24},
-		types.IndexedDocument{DocId: 4, TokenProximity: 18},
+		types.IndexedDocument{DocId: "1", TokenProximity: 6},
+		types.IndexedDocument{DocId: "2", TokenProximity: -1},
+		types.IndexedDocument{DocId: "3", TokenProximity: 24},
+		types.IndexedDocument{DocId: "4", TokenProximity: 18},
 	}, types.RankOptions{ScoringCriteria: criteria}, false)
 	utils.Expect(t, "[1 [25300 ]] [3 [17300 ]] ", scoredDocsToString(scoredDocs))
 }
@@ -100,29 +101,29 @@ func TestRankWithCriteria(t *testing.T) {
 func TestRemoveDoc(t *testing.T) {
 	var ranker Ranker
 	ranker.Init()
-	ranker.AddDoc(1, DummyScoringFields{
+	ranker.AddDoc("1", DummyScoringFields{
 		label:   "label3",
 		counter: 3,
 		amount:  22.3,
 	})
-	ranker.AddDoc(2, DummyScoringFields{
+	ranker.AddDoc("2", DummyScoringFields{
 		label:   "label4",
 		counter: 1,
 		amount:  2,
 	})
-	ranker.AddDoc(3, DummyScoringFields{
+	ranker.AddDoc("3", DummyScoringFields{
 		label:   "label1",
 		counter: 7,
 		amount:  10.3,
 	})
-	ranker.RemoveDoc(3)
+	ranker.RemoveDoc("3")
 
 	criteria := DummyScoringCriteria{}
 	scoredDocs, _ := ranker.Rank([]types.IndexedDocument{
-		types.IndexedDocument{DocId: 1, TokenProximity: 6},
-		types.IndexedDocument{DocId: 2, TokenProximity: -1},
-		types.IndexedDocument{DocId: 3, TokenProximity: 24},
-		types.IndexedDocument{DocId: 4, TokenProximity: 18},
+		types.IndexedDocument{DocId: "1", TokenProximity: 6},
+		types.IndexedDocument{DocId: "2", TokenProximity: -1},
+		types.IndexedDocument{DocId: "3", TokenProximity: 24},
+		types.IndexedDocument{DocId: "4", TokenProximity: 18},
 	}, types.RankOptions{ScoringCriteria: criteria}, false)
 	utils.Expect(t, "[1 [25300 ]] [2 [3000 ]] ", scoredDocsToString(scoredDocs))
 }

+ 4 - 5
core/test_utils.go

@@ -2,14 +2,14 @@ package core
 
 import (
 	"fmt"
+
 	"github.com/huichen/wukong/types"
 )
 
 func indicesToString(indexer *Indexer, token string) (output string) {
 	if indices, ok := indexer.tableLock.table[token]; ok {
 		for i := 0; i < indexer.getIndexLength(indices); i++ {
-			output += fmt.Sprintf("%d ",
-				indexer.getDocId(indices, i))
+			output += fmt.Sprintf("%s ", indexer.getDocId(indices, i))
 		}
 	}
 	return
@@ -17,15 +17,14 @@ func indicesToString(indexer *Indexer, token string) (output string) {
 
 func indexedDocsToString(docs []types.IndexedDocument, numDocs int) (output string) {
 	for _, doc := range docs {
-		output += fmt.Sprintf("[%d %d %v] ",
-			doc.DocId, doc.TokenProximity, doc.TokenSnippetLocations)
+		output += fmt.Sprintf("[%s %d %v] ", doc.DocId, doc.TokenProximity, doc.TokenSnippetLocations)
 	}
 	return
 }
 
 func scoredDocsToString(docs []types.ScoredDocument) (output string) {
 	for _, doc := range docs {
-		output += fmt.Sprintf("[%d [", doc.DocId)
+		output += fmt.Sprintf("[%s [", doc.DocId)
 		for _, score := range doc.Scores {
 			output += fmt.Sprintf("%d ", int(score*1000))
 		}

+ 47 - 37
engine/engine.go

@@ -2,12 +2,6 @@ package engine
 
 import (
 	"fmt"
-	"github.com/huichen/murmur"
-	"github.com/huichen/sego"
-	"github.com/huichen/wukong/core"
-	"github.com/huichen/wukong/storage"
-	"github.com/huichen/wukong/types"
-	"github.com/huichen/wukong/utils"
 	"log"
 	"os"
 	"runtime"
@@ -15,6 +9,13 @@ import (
 	"strconv"
 	"sync/atomic"
 	"time"
+
+	"github.com/huichen/murmur"
+	"github.com/huichen/sego"
+	"github.com/huichen/wukong/core"
+	"github.com/huichen/wukong/storage"
+	"github.com/huichen/wukong/types"
+	"github.com/huichen/wukong/utils"
 )
 
 const (
@@ -89,16 +90,12 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 	}
 
 	// 初始化分词器通道
-	engine.segmenterChannel = make(
-		chan segmenterRequest, options.NumSegmenterThreads)
+	engine.segmenterChannel = make(chan segmenterRequest, options.NumSegmenterThreads)
 
 	// 初始化索引器通道
-	engine.indexerAddDocChannels = make(
-		[]chan indexerAddDocumentRequest, options.NumShards)
-	engine.indexerRemoveDocChannels = make(
-		[]chan indexerRemoveDocRequest, options.NumShards)
-	engine.indexerLookupChannels = make(
-		[]chan indexerLookupRequest, options.NumShards)
+	engine.indexerAddDocChannels = make([]chan indexerAddDocumentRequest, options.NumShards)
+	engine.indexerRemoveDocChannels = make([]chan indexerRemoveDocRequest, options.NumShards)
+	engine.indexerLookupChannels = make([]chan indexerLookupRequest, options.NumShards)
 	for shard := 0; shard < options.NumShards; shard++ {
 		engine.indexerAddDocChannels[shard] = make(
 			chan indexerAddDocumentRequest,
@@ -112,12 +109,9 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 	}
 
 	// 初始化排序器通道
-	engine.rankerAddDocChannels = make(
-		[]chan rankerAddDocRequest, options.NumShards)
-	engine.rankerRankChannels = make(
-		[]chan rankerRankRequest, options.NumShards)
-	engine.rankerRemoveDocChannels = make(
-		[]chan rankerRemoveDocRequest, options.NumShards)
+	engine.rankerAddDocChannels = make([]chan rankerAddDocRequest, options.NumShards)
+	engine.rankerRankChannels = make([]chan rankerRankRequest, options.NumShards)
+	engine.rankerRemoveDocChannels = make([]chan rankerRemoveDocRequest, options.NumShards)
 	for shard := 0; shard < options.NumShards; shard++ {
 		engine.rankerAddDocChannels[shard] = make(
 			chan rankerAddDocRequest,
@@ -133,14 +127,11 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 	// 初始化持久化存储通道
 	if engine.initOptions.UsePersistentStorage {
 		engine.persistentStorageIndexDocumentChannels =
-			make([]chan persistentStorageIndexDocumentRequest,
-				engine.initOptions.PersistentStorageShards)
+			make([]chan persistentStorageIndexDocumentRequest, engine.initOptions.PersistentStorageShards)
 		for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
-			engine.persistentStorageIndexDocumentChannels[shard] = make(
-				chan persistentStorageIndexDocumentRequest)
+			engine.persistentStorageIndexDocumentChannels[shard] = make(chan persistentStorageIndexDocumentRequest)
 		}
-		engine.persistentStorageInitChannel = make(
-			chan bool, engine.initOptions.PersistentStorageShards)
+		engine.persistentStorageInitChannel = make(chan bool, engine.initOptions.PersistentStorageShards)
 	}
 
 	// 启动分词器
@@ -216,6 +207,15 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 	atomic.AddUint64(&engine.numDocumentsStored, engine.numIndexingRequests)
 }
 
+func uidToSidComp(docId uint64) string {
+	id := strconv.FormatUint(docId, 10)
+	if docId == 0 {
+		id = ""
+	}
+	return id
+}
+
+// Deprecated: Use IndexDocumentS instead.
 // 将文档加入索引
 //
 // 输入参数:
@@ -228,31 +228,36 @@ func (engine *Engine) Init(options types.EngineInitOptions) {
 //      2. 这个函数调用是非同步的,也就是说在函数返回时有可能文档还没有加入索引中,因此
 //         如果立刻调用Search可能无法查询到这个文档。强制刷新索引请调用FlushIndex函数。
 func (engine *Engine) IndexDocument(docId uint64, data types.DocumentIndexData, forceUpdate bool) {
+	id := uidToSidComp(docId)
+	engine.IndexDocumentS(id, data, forceUpdate)
+}
+
+func (engine *Engine) IndexDocumentS(docId string, data types.DocumentIndexData, forceUpdate bool) {
 	engine.internalIndexDocument(docId, data, forceUpdate)
 
-	hash := murmur.Murmur3([]byte(fmt.Sprint("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
-	if engine.initOptions.UsePersistentStorage && docId != 0 {
+	hash := murmur.Murmur3([]byte(docId)) % uint32(engine.initOptions.PersistentStorageShards)
+	if engine.initOptions.UsePersistentStorage && docId != "" {
 		engine.persistentStorageIndexDocumentChannels[hash] <- persistentStorageIndexDocumentRequest{docId: docId, data: data}
 	}
 }
 
-func (engine *Engine) internalIndexDocument(
-	docId uint64, data types.DocumentIndexData, forceUpdate bool) {
+func (engine *Engine) internalIndexDocument(docId string, data types.DocumentIndexData, forceUpdate bool) {
 	if !engine.initialized {
 		log.Fatal("必须先初始化引擎")
 	}
 
-	if docId != 0 {
+	if docId != "" {
 		atomic.AddUint64(&engine.numIndexingRequests, 1)
 	}
 	if forceUpdate {
 		atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
 	}
-	hash := murmur.Murmur3([]byte(fmt.Sprint("%d%s", docId, data.Content)))
+	hash := murmur.Murmur3([]byte(fmt.Sprintf("%s%s", docId, data.Content)))
 	engine.segmenterChannel <- segmenterRequest{
 		docId: docId, hash: hash, data: data, forceUpdate: forceUpdate}
 }
 
+// Deprecated: Use RemoveDocumentS instead.
 // 将文档从索引中删除
 //
 // 输入参数:
@@ -264,11 +269,16 @@ func (engine *Engine) internalIndexDocument(
 //      2. 这个函数调用是非同步的,也就是说在函数返回时有可能文档还没有加入索引中,因此
 //         如果立刻调用Search可能无法查询到这个文档。强制刷新索引请调用FlushIndex函数。
 func (engine *Engine) RemoveDocument(docId uint64, forceUpdate bool) {
+	id := uidToSidComp(docId)
+	engine.RemoveDocumentS(id, forceUpdate)
+}
+
+func (engine *Engine) RemoveDocumentS(docId string, forceUpdate bool) {
 	if !engine.initialized {
 		log.Fatal("必须先初始化引擎")
 	}
 
-	if docId != 0 {
+	if docId != "" {
 		atomic.AddUint64(&engine.numRemovingRequests, 1)
 	}
 	if forceUpdate {
@@ -276,15 +286,15 @@ func (engine *Engine) RemoveDocument(docId uint64, forceUpdate bool) {
 	}
 	for shard := 0; shard < engine.initOptions.NumShards; shard++ {
 		engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId, forceUpdate: forceUpdate}
-		if docId == 0 {
+		if docId == "" {
 			continue
 		}
 		engine.rankerRemoveDocChannels[shard] <- rankerRemoveDocRequest{docId: docId}
 	}
 
-	if engine.initOptions.UsePersistentStorage && docId != 0 {
+	if engine.initOptions.UsePersistentStorage && docId != "" {
 		// 从数据库中删除
-		hash := murmur.Murmur3([]byte(fmt.Sprint("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
+		hash := murmur.Murmur3([]byte(docId)) % uint32(engine.initOptions.PersistentStorageShards)
 		go engine.persistentStorageRemoveDocumentWorker(docId, hash)
 	}
 }
@@ -421,7 +431,7 @@ func (engine *Engine) FlushIndex() {
 		}
 	}
 	// 强制更新,保证其为最后的请求
-	engine.IndexDocument(0, types.DocumentIndexData{}, true)
+	engine.IndexDocumentS("", types.DocumentIndexData{}, true)
 	for {
 		runtime.Gosched()
 		if engine.numForceUpdatingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsForceUpdated {

+ 7 - 7
engine/engine_test.go

@@ -2,11 +2,12 @@ package engine
 
 import (
 	"encoding/gob"
-	"github.com/huichen/wukong/types"
-	"github.com/huichen/wukong/utils"
 	"os"
 	"reflect"
 	"testing"
+
+	"github.com/huichen/wukong/types"
+	"github.com/huichen/wukong/utils"
 )
 
 type ScoringFields struct {
@@ -226,8 +227,7 @@ func TestCompactIndex(t *testing.T) {
 type BM25ScoringCriteria struct {
 }
 
-func (criteria BM25ScoringCriteria) Score(
-	doc types.IndexedDocument, fields interface{}) []float32 {
+func (criteria BM25ScoringCriteria) Score(doc types.IndexedDocument, fields interface{}) []float32 {
 	if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
 		return []float32{}
 	}
@@ -470,9 +470,9 @@ func TestSearchWithin(t *testing.T) {
 
 	AddDocs(&engine)
 
-	docIds := make(map[uint64]bool)
-	docIds[5] = true
-	docIds[1] = true
+	docIds := make(map[string]bool)
+	docIds["5"] = true
+	docIds["1"] = true
 	outputs := engine.Search(types.SearchRequest{
 		Text:   "中国人口",
 		DocIds: docIds,

+ 6 - 5
engine/indexer_worker.go

@@ -1,8 +1,9 @@
 package engine
 
 import (
-	"github.com/huichen/wukong/types"
 	"sync/atomic"
+
+	"github.com/huichen/wukong/types"
 )
 
 type indexerAddDocumentRequest struct {
@@ -14,14 +15,14 @@ type indexerLookupRequest struct {
 	countDocsOnly       bool
 	tokens              []string
 	labels              []string
-	docIds              map[uint64]bool
+	docIds              map[string]bool
 	options             types.RankOptions
 	rankerReturnChannel chan rankerReturnRequest
 	orderless           bool
 }
 
 type indexerRemoveDocRequest struct {
-	docId       uint64
+	docId       string
 	forceUpdate bool
 }
 
@@ -44,7 +45,7 @@ func (engine *Engine) indexerRemoveDocWorker(shard int) {
 	for {
 		request := <-engine.indexerRemoveDocChannels[shard]
 		engine.indexers[shard].RemoveDocumentToCache(request.docId, request.forceUpdate)
-		if request.docId != 0 {
+		if request.docId != "" {
 			atomic.AddUint64(&engine.numDocumentsRemoved, 1)
 		}
 		if request.forceUpdate {
@@ -79,7 +80,7 @@ func (engine *Engine) indexerLookupWorker(shard int) {
 			var outputDocs []types.ScoredDocument
 			for _, d := range docs {
 				outputDocs = append(outputDocs, types.ScoredDocument{
-					DocId: d.DocId,
+					DocId:                 d.DocId,
 					TokenSnippetLocations: d.TokenSnippetLocations,
 					TokenLocations:        d.TokenLocations})
 			}

+ 7 - 13
engine/persistent_storage_worker.go

@@ -2,14 +2,14 @@ package engine
 
 import (
 	"bytes"
-	"encoding/binary"
 	"encoding/gob"
-	"github.com/huichen/wukong/types"
 	"sync/atomic"
+
+	"github.com/huichen/wukong/types"
 )
 
 type persistentStorageIndexDocumentRequest struct {
-	docId uint64
+	docId string
 	data  types.DocumentIndexData
 }
 
@@ -18,8 +18,6 @@ func (engine *Engine) persistentStorageIndexDocumentWorker(shard int) {
 		request := <-engine.persistentStorageIndexDocumentChannels[shard]
 
 		// 得到key
-		b := make([]byte, 10)
-		length := binary.PutUvarint(b, request.docId)
 
 		// 得到value
 		var buf bytes.Buffer
@@ -31,26 +29,22 @@ func (engine *Engine) persistentStorageIndexDocumentWorker(shard int) {
 		}
 
 		// 将key-value写入数据库
-		engine.dbs[shard].Set(b[0:length], buf.Bytes())
+		engine.dbs[shard].Set([]byte(request.docId), buf.Bytes())
 		atomic.AddUint64(&engine.numDocumentsStored, 1)
 	}
 }
 
-func (engine *Engine) persistentStorageRemoveDocumentWorker(docId uint64, shard uint32) {
+func (engine *Engine) persistentStorageRemoveDocumentWorker(docId string, shard uint32) {
 	// 得到key
-	b := make([]byte, 10)
-	length := binary.PutUvarint(b, docId)
 
 	// 从数据库删除该key
-	engine.dbs[shard].Delete(b[0:length])
+	engine.dbs[shard].Delete([]byte(docId))
 }
 
 func (engine *Engine) persistentStorageInitWorker(shard int) {
 	engine.dbs[shard].ForEach(func(k, v []byte) error {
-		key, value := k, v
 		// 得到docID
-		docId, _ := binary.Uvarint(key)
-
+		docId, value := string(k), v
 		// 得到data
 		buf := bytes.NewReader(value)
 		dec := gob.NewDecoder(buf)

+ 2 - 2
engine/ranker_worker.go

@@ -5,7 +5,7 @@ import (
 )
 
 type rankerAddDocRequest struct {
-	docId  uint64
+	docId  string
 	fields interface{}
 }
 
@@ -22,7 +22,7 @@ type rankerReturnRequest struct {
 }
 
 type rankerRemoveDocRequest struct {
-	docId uint64
+	docId string
 }
 
 func (engine *Engine) rankerAddDocWorker(shard int) {

+ 2 - 2
engine/segmenter_worker.go

@@ -5,7 +5,7 @@ import (
 )
 
 type segmenterRequest struct {
-	docId       uint64
+	docId       string
 	hash        uint32
 	data        types.DocumentIndexData
 	forceUpdate bool
@@ -14,7 +14,7 @@ type segmenterRequest struct {
 func (engine *Engine) segmenterWorker() {
 	for {
 		request := <-engine.segmenterChannel
-		if request.docId == 0 {
+		if request.docId == "" {
 			if request.forceUpdate {
 				for i := 0; i < engine.initOptions.NumShards; i++ {
 					engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}

+ 15 - 10
examples/codelab/search_server.go

@@ -6,8 +6,7 @@ import (
 	"encoding/gob"
 	"encoding/json"
 	"flag"
-	"github.com/huichen/wukong/engine"
-	"github.com/huichen/wukong/types"
+	"fmt"
 	"io"
 	"log"
 	"net/http"
@@ -16,6 +15,9 @@ import (
 	"reflect"
 	"strconv"
 	"strings"
+
+	"github.com/huichen/wukong/engine"
+	"github.com/huichen/wukong/types"
 )
 
 const (
@@ -24,12 +26,12 @@ const (
 )
 
 var (
-	searcher = engine.Engine{}
-	wbs      = map[uint64]Weibo{}
-	weiboData = flag.String("weibo_data", "../../testdata/weibo_data.txt", "微博数据文件")
-	dictFile = flag.String("dict_file", "../../data/dictionary.txt", "词典文件")
+	searcher      = engine.Engine{}
+	wbs           = map[uint64]Weibo{}
+	weiboData     = flag.String("weibo_data", "../../testdata/weibo_data.txt", "微博数据文件")
+	dictFile      = flag.String("dict_file", "../../data/dictionary.txt", "词典文件")
 	stopTokenFile = flag.String("stop_token_file", "../../data/stop_tokens.txt", "停用词文件")
-	staticFolder = flag.String("static_folder", "static", "静态文件目录")
+	staticFolder  = flag.String("static_folder", "static", "静态文件目录")
 )
 
 type Weibo struct {
@@ -67,13 +69,14 @@ func indexWeibo() {
 
 	log.Print("添加索引")
 	for docId, weibo := range wbs {
-		searcher.IndexDocument(docId, types.DocumentIndexData{
+		index := fmt.Sprintf("wb:%d", docId) // strconv.FormatUint(docId, 10)
+		searcher.IndexDocumentS(index, types.DocumentIndexData{
 			Content: weibo.Text,
 			Fields: WeiboScoringFields{
 				Timestamp:    weibo.Timestamp,
 				RepostsCount: weibo.RepostsCount,
 			},
-		})
+		}, false)
 	}
 
 	searcher.FlushIndex()
@@ -129,7 +132,9 @@ func JsonRpcServer(w http.ResponseWriter, req *http.Request) {
 	// 整理为输出格式
 	docs := []*Weibo{}
 	for _, doc := range output.Docs {
-		wb := wbs[doc.DocId]
+		parts := strings.Split(doc.DocId, ":")
+		id, _ := strconv.ParseUint(parts[1], 10, 64)
+		wb := wbs[id]
 		for _, t := range output.Tokens {
 			wb.Text = strings.Replace(wb.Text, t, "<font color=red>"+t+"</font>", -1)
 		}

+ 8 - 5
examples/custom_scoring_criteria.go

@@ -17,13 +17,14 @@ import (
 	"encoding/gob"
 	"flag"
 	"fmt"
-	"github.com/huichen/wukong/engine"
-	"github.com/huichen/wukong/types"
 	"log"
 	"os"
 	"reflect"
 	"strconv"
 	"strings"
+
+	"github.com/huichen/wukong/engine"
+	"github.com/huichen/wukong/types"
 )
 
 const (
@@ -141,8 +142,8 @@ func main() {
 	// 建立索引
 	log.Print("建立索引")
 	for i, text := range lines {
-		searcher.IndexDocument(uint64(i),
-			types.DocumentIndexData{Content: text, Fields: fieldsSlice[i]})
+		searcher.IndexDocumentS(fmt.Sprintf("line:%d", i),
+			types.DocumentIndexData{Content: text, Fields: fieldsSlice[i]}, false)
 	}
 	searcher.FlushIndex()
 	log.Print("索引建立完毕")
@@ -154,7 +155,9 @@ func main() {
 	// 显示
 	fmt.Println()
 	for _, doc := range output.Docs {
-		fmt.Printf("%v %s\n\n", doc.Scores, lines[doc.DocId])
+		parts := strings.Split(doc.DocId, ":")
+		index, _ := strconv.ParseInt(parts[1], 10, 64)
+		fmt.Printf("%v score:%v %s\n\n", doc.DocId, doc.Scores, lines[int(index)])
 	}
 	log.Printf("查询完毕")
 }

+ 5 - 4
examples/simplest_example.go

@@ -7,9 +7,10 @@
 package main
 
 import (
+	"log"
+
 	"github.com/huichen/wukong/engine"
 	"github.com/huichen/wukong/types"
-	"log"
 )
 
 var (
@@ -24,9 +25,9 @@ func main() {
 	defer searcher.Close()
 
 	// 将文档加入索引,docId 从1开始
-	searcher.IndexDocument(1, types.DocumentIndexData{Content: "此次百度收购将成中国互联网最大并购"}, false)
-	searcher.IndexDocument(2, types.DocumentIndexData{Content: "百度宣布拟全资收购91无线业务"}, false)
-	searcher.IndexDocument(3, types.DocumentIndexData{Content: "百度是中国最大的搜索引擎"}, false)
+	searcher.IndexDocumentS("db:table:1", types.DocumentIndexData{Content: "此次百度收购将成中国互联网最大并购"}, false)
+	searcher.IndexDocumentS("db:table:2", types.DocumentIndexData{Content: "百度宣布拟全资收购91无线业务"}, false)
+	searcher.IndexDocumentS("db:table:3", types.DocumentIndexData{Content: "百度是中国最大的搜索引擎"}, false)
 
 	// 等待索引刷新完毕
 	searcher.FlushIndex()

+ 13 - 0
go.mod

@@ -0,0 +1,13 @@
+module github.com/huichen/wukong
+
+go 1.15
+
+require (
+	github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d // indirect
+	github.com/boltdb/bolt v1.3.1
+	github.com/huichen/murmur v0.0.0-20130808212358-e0489551cf51
+	github.com/huichen/sego v0.0.0-20180617034105-3f3c8a8cfacc
+	github.com/issue9/assert v1.4.1 // indirect
+	golang.org/x/sys v0.0.0-20210426230700-d19ff857e887 // indirect
+	modernc.org/kv v1.0.3
+)

+ 37 - 0
go.sum

@@ -0,0 +1,37 @@
+github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d h1:ir/IFJU5xbja5UaBEQLjcvn7aAU01nqU/NUyOBEU+ew=
+github.com/adamzy/cedar-go v0.0.0-20170805034717-80a9c64b256d/go.mod h1:PRWNwWq0yifz6XDPZu48aSld8BWwBfr2JKB2bGWiEd4=
+github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4=
+github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
+github.com/edsrzf/mmap-go v1.0.0 h1:CEBF7HpRnUCSJgGUb5h1Gm7e3VkmVDrR8lvWVLtrOFw=
+github.com/edsrzf/mmap-go v1.0.0/go.mod h1:YO35OhQPt3KJa3ryjFM5Bs14WD66h8eGKpfaBNrHW5M=
+github.com/golang/snappy v0.0.2 h1:aeE13tS0IiQgFjYdoL8qN3K1N2bXXtI6Vi51/y7BpMw=
+github.com/golang/snappy v0.0.2/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
+github.com/huichen/murmur v0.0.0-20130808212358-e0489551cf51 h1:86ZSBmeBzG7dWW2rx9imn5pVKgqe7YjWzZ9qhn4Z+9A=
+github.com/huichen/murmur v0.0.0-20130808212358-e0489551cf51/go.mod h1:UKrDR4kaPWAPk8cJGrHoTgyI8OmHPNDjUxx/aOK4ySU=
+github.com/huichen/sego v0.0.0-20180617034105-3f3c8a8cfacc h1:3LXYtoxQGFSjIL5ZJAn4PceSpwRohuTKYL1W4kJ7G8g=
+github.com/huichen/sego v0.0.0-20180617034105-3f3c8a8cfacc/go.mod h1:+/Bm7uk1bnJJMi9l6P88FgHeGtscOQiYbxW1j+BmgBY=
+github.com/issue9/assert v1.4.1 h1:gUtOpMTeaE4JTe9kACma5foOHBvVt1p5XTFrULDwdXI=
+github.com/issue9/assert v1.4.1/go.mod h1:Yktk83hAVl1SPSYtd9kjhBizuiBIqUQyj+D5SE2yjVY=
+github.com/remyoudompheng/bigfft v0.0.0-20190728182440-6a916e37a237/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
+github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 h1:OdAsTTz6OkFY5QxjkYwrChwuRruF69c169dPK26NUlk=
+github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210426230700-d19ff857e887 h1:dXfMednGJh/SUUFjTLsWJz3P+TQt9qnR11GgeI3vWKs=
+golang.org/x/sys v0.0.0-20210426230700-d19ff857e887/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+modernc.org/bufs v1.0.0/go.mod h1:0FCJ1mAhGiiFos8v2PISFhP2fEj3CK4/ACCLidqU/Ok=
+modernc.org/exp v1.0.0/go.mod h1:nfZshN9GU3d+mveZaR6VW2wJfh8gVBDgp8i2NZ/pFM8=
+modernc.org/fileutil v1.0.0 h1:Z1AFLZwl6BO8A5NldQg/xTSjGLetp+1Ubvl4alfGx8w=
+modernc.org/fileutil v1.0.0/go.mod h1:JHsWpkrk/CnVV1H/eGlFf85BEpfkrp56ro8nojIq9Q8=
+modernc.org/internal v1.0.0 h1:XMDsFDcBDsibbBnHB2xzljZ+B1yrOVLEFkKL2u15Glw=
+modernc.org/internal v1.0.0/go.mod h1:VUD/+JAkhCpvkUitlEOnhpVxCgsBI90oTzSCRcqQVSM=
+modernc.org/kv v1.0.3 h1:XfQ8Cs/aTlJ6o11SOt/pRsIAmQuj3oZ1nhqZPDz27T8=
+modernc.org/kv v1.0.3/go.mod h1:P2r1em6l8wFhU985V9wlLu8C4hYULT1pObxAGfTkAIk=
+modernc.org/lldb v1.0.0 h1:6vjDJxQEfhlOLwl4bhpwIz00uyFK4EmSYcbwqwbynsc=
+modernc.org/lldb v1.0.0/go.mod h1:jcRvJGWfCGodDZz8BPwiKMJxGJngQ/5DrRapkQnLob8=
+modernc.org/mathutil v1.0.0/go.mod h1:wU0vUrJsVWBZ4P6e7xtFJEhFSNsfRLJ8H458uRjg03k=
+modernc.org/mathutil v1.1.1 h1:FeylZSVX8S+58VsyJlkEj2bcpdytmp9MmDKZkKx8OIE=
+modernc.org/mathutil v1.1.1/go.mod h1:mZW8CKdRPY1v87qxC/wUdX5O1qDzXMP5TH3wjfpga6E=
+modernc.org/sortutil v1.1.0 h1:oP3U4uM+NT/qBQcbg/K2iqAX0Nx7B1b6YZtq3Gk/PjM=
+modernc.org/sortutil v1.1.0/go.mod h1:ZyL98OQHJgH9IEfN71VsamvJgrtRX9Dj2gX+vH86L1k=
+modernc.org/zappy v1.0.1 h1:gR01yosq33KPfCO9H/N2Mod+AdpPxkMmlJVo9FCd4wU=
+modernc.org/zappy v1.0.1/go.mod h1:O0z5BRBwgfXAYDDhMqz9xVj0omSIEpspvGcwsyBe3FM=

+ 2 - 1
storage/kv_storage.go

@@ -1,8 +1,9 @@
 package storage
 
 import (
-	"github.com/cznic/kv"
 	"io"
+
+	"modernc.org/kv"
 )
 
 type kvStorage struct {

+ 3 - 3
types/index.go

@@ -2,7 +2,7 @@ package types
 
 type DocumentIndex struct {
 	// 文本的DocId
-	DocId uint64
+	DocId string
 
 	// 文本的关键词长
 	TokenLength float32
@@ -25,7 +25,7 @@ type KeywordIndex struct {
 
 // 索引器返回结果
 type IndexedDocument struct {
-	DocId uint64
+	DocId string
 
 	// BM25,仅当索引类型为FrequenciesIndex或者LocationsIndex时返回有效值
 	BM25 float32
@@ -57,7 +57,7 @@ func (docs DocumentsIndex) Less(i, j int) bool {
 }
 
 // 方便批量删除文档索引
-type DocumentsId []uint64
+type DocumentsId []string
 
 func (docs DocumentsId) Len() int {
 	return len(docs)

+ 1 - 1
types/search_request.go

@@ -13,7 +13,7 @@ type SearchRequest struct {
 	Labels []string
 
 	// 当不为nil时,仅从这些DocIds包含的键中搜索(忽略值)
-	DocIds map[uint64]bool
+	DocIds map[string]bool
 
 	// 排序选项
 	RankOptions *RankOptions

+ 1 - 1
types/search_response.go

@@ -19,7 +19,7 @@ type SearchResponse struct {
 }
 
 type ScoredDocument struct {
-	DocId uint64
+	DocId string
 
 	// 文档的打分值
 	// 搜索结果按照Scores的值排序,先按照第一个数排,如果相同则按照第二个数排序,依次类推。

+ 1 - 1
wukong.go

@@ -2,7 +2,7 @@ package wukong
 
 import (
 	_ "github.com/boltdb/bolt"
-	_ "github.com/cznic/kv"
 	_ "github.com/huichen/murmur"
 	_ "github.com/huichen/sego"
+	_ "modernc.org/kv"
 )