12 lat temu · 55c63d9fed
--- a/core/indexer.go
+++ b/core/indexer.go
@@ -2,6 +2,7 @@ package core
 
				 
			
 
				 import (
			
 
				 	"github.com/huichen/wukong/types"
			
 
				+	"github.com/huichen/wukong/utils"
			
 
				 	"log"
			
 
				 	"math"
			
 
				 	"sync"
			
@@ -223,9 +224,7 @@ func (indexer *Indexer) Lookup(
 
				 				}
			
 
				 
			
 
				 				// 计算搜索键在文档中的紧邻距离
			
 
				-				tokenLocations := make([]int, len(tokens))
			
 
				-				tokenProximity := computeTokenProximity(
			
 
				-					table[:len(tokens)], indexPointers, tokens, &tokenLocations)
			
 
				+				tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
			
 
				 				indexedDoc.TokenProximity = int32(tokenProximity)
			
 
				 				indexedDoc.TokenSnippetLocations = tokenLocations
			
 
				 
			
@@ -305,70 +304,89 @@ func (indexer *Indexer) searchIndex(
 
				 
			
 
				 // 计算搜索键在文本中的紧邻距离
			
 
				 //
			
 
				-// 假定第i个搜索键首字节出现在文本中的位置为P_i，长度L_i
			
 
				+// 假定第 i 个搜索键首字节出现在文本中的位置为 P_i，长度 L_i
			
 
				 // 紧邻距离计算公式为
			
 
				 //
			
 
				 // 	Min(Sum(Abs(P_(i+1) - P_i - L_i)))
			
 
				 //
			
 
				-// 具体计算过程为先取定一个P_1，计算所有P_2的可能值中令Abs(P_2 - P_1 - L1)最小,
			
 
				-// 然后固定P2后依照同样的方法选择P3，P4，等等。遍历所有可能的P_1得到最小的紧邻距离。
			
 
				-//
			
 
				-// 选定的P_i通过tokenLocations参数传回。
			
 
				-func computeTokenProximity(
			
 
				-	table []*KeywordIndices,
			
 
				-	indexPointers []int,
			
 
				-	tokens []string,
			
 
				-	tokenLocations *[]int) int {
			
 
				-	minTokenProximity := -1
			
 
				-	currentLocations := make([]int, len(tokens))
			
 
				-	for _, primaryLocation := range table[0].locations[indexPointers[0]] {
			
 
				-		tokenProximity := 0
			
 
				-		previousLocation := primaryLocation + len(tokens[0]) // P_1 + L_1
			
 
				-		for iToken := 1; iToken < len(tokens); iToken++ {
			
 
				-			locations := table[iToken].locations[indexPointers[iToken]]
			
 
				-
			
 
				-			// 寻找 P_i + L_i 后面最近的那个 P_(i+1)
			
 
				-			for currentLocations[iToken] = 0; currentLocations[iToken] < len(locations) &&
			
 
				-				locations[currentLocations[iToken]] < previousLocation; currentLocations[iToken]++ {
			
 
				+// 具体由动态规划实现，依次计算前 i 个 token 在每个出现位置的最优值。
			
 
				+// 选定的 P_i 通过返回值 tokenLocations 传回。
			
 
				+func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
			
 
				+	minTokenProximity int, tokenLocations []int) {
			
 
				+	minTokenProximity = -1
			
 
				+	tokenLocations = make([]int, len(tokens))
			
 
				+
			
 
				+	var (
			
 
				+		currentLocations, nextLocations []int
			
 
				+		currentMinValues, nextMinValues []int
			
 
				+		path                            [][]int
			
 
				+	)
			
 
				+
			
 
				+	// 初始化路径数组
			
 
				+	path = make([][]int, len(tokens))
			
 
				+	for i := 1; i < len(path); i++ {
			
 
				+		path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
			
 
				+	}
			
 
				+
			
 
				+	// 动态规划
			
 
				+	currentLocations = table[0].locations[indexPointers[0]]
			
 
				+	currentMinValues = make([]int, len(currentLocations))
			
 
				+	for i := 0; i+1 < len(tokens); i++ {
			
 
				+		nextLocations = table[i+1].locations[indexPointers[i+1]]
			
 
				+		nextMinValues = make([]int, len(nextLocations))
			
 
				+		for j, _ := range nextMinValues {
			
 
				+			nextMinValues[j] = -1
			
 
				+		}
			
 
				+
			
 
				+		var iNext int
			
 
				+		for iCurrent, currentLocation := range currentLocations {
			
 
				+			if currentMinValues[iCurrent] == -1 {
			
 
				+				continue
			
 
				+			}
			
 
				+			for iNext+1 < len(nextLocations) && nextLocations[iNext] < currentLocation {
			
 
				+				iNext++
			
 
				 			}
			
 
				 
			
 
				-			if currentLocations[iToken] == 0 {
			
 
				-				// 找到的P_(i+1)是搜索键i+1出现的第一个位置
			
 
				-				tokenProximity += locations[currentLocations[iToken]] -
			
 
				-					previousLocation
			
 
				-			} else if currentLocations[iToken] == len(locations) {
			
 
				-				// 否则当搜索键i+1出现的最后一个位置仍然小于P_i + L_i
			
 
				-				tokenProximity += previousLocation -
			
 
				-					locations[currentLocations[iToken]-1]
			
 
				-				currentLocations[iToken]--
			
 
				-			} else {
			
 
				-				rightProximity := locations[currentLocations[iToken]] - previousLocation
			
 
				-				leftProximity := previousLocation - locations[currentLocations[iToken]-1]
			
 
				-				if rightProximity > leftProximity {
			
 
				-					// 左侧更接近
			
 
				-					tokenProximity += leftProximity
			
 
				-					currentLocations[iToken]--
			
 
				-				} else {
			
 
				-					// 右侧更接近
			
 
				-					tokenProximity += rightProximity
			
 
				+			update := func(from int, to int) {
			
 
				+				if to >= len(nextLocations) {
			
 
				+					return
			
 
				+				}
			
 
				+				value := currentMinValues[from] + utils.AbsInt(nextLocations[to]-currentLocations[from]-len(tokens[i]))
			
 
				+				if nextMinValues[to] == -1 || value < nextMinValues[to] {
			
 
				+					nextMinValues[to] = value
			
 
				+					path[i+1][to] = from
			
 
				 				}
			
 
				 			}
			
 
				 
			
 
				-			// 更新 P_(i+1) + L_(i+1)
			
 
				-			previousLocation = locations[currentLocations[iToken]] + len(tokens[iToken])
			
 
				+			// 最优解的状态转移只发生在左右最接近的位置
			
 
				+			update(iCurrent, iNext)
			
 
				+			update(iCurrent, iNext+1)
			
 
				 		}
			
 
				 
			
 
				-		// 更新搜索键紧邻距离
			
 
				-		if minTokenProximity < 0 || minTokenProximity > tokenProximity {
			
 
				-			minTokenProximity = tokenProximity
			
 
				-			(*tokenLocations)[0] = primaryLocation
			
 
				-			for iToken := 1; iToken < len(tokens); iToken++ {
			
 
				-				(*tokenLocations)[iToken] = table[iToken].locations[indexPointers[iToken]][currentLocations[iToken]]
			
 
				-			}
			
 
				+		currentLocations = nextLocations
			
 
				+		currentMinValues = nextMinValues
			
 
				+	}
			
 
				+
			
 
				+	// 找出最优解
			
 
				+	var cursor int
			
 
				+	for i, value := range currentMinValues {
			
 
				+		if value == -1 {
			
 
				+			continue
			
 
				+		}
			
 
				+		if minTokenProximity == -1 || value < minTokenProximity {
			
 
				+			minTokenProximity = value
			
 
				+			cursor = i
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	return minTokenProximity
			
 
				+	// 从路径倒推出最优解的位置
			
 
				+	for i := len(tokens) - 1; i >= 0; i-- {
			
 
				+		if i != len(tokens)-1 {
			
 
				+			cursor = path[i+1][cursor]
			
 
				+		}
			
 
				+		tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
			
 
				+	}
			
 
				+	return
			
 
				 }
			
 
				 
			
 
				 // 从KeywordIndices中得到第i个文档的DocId
			
--- a/core/indexer_test.go
+++ b/core/indexer_test.go
@@ -209,6 +209,7 @@ func TestDocIdsIndex(t *testing.T) {
 
				 func TestLookupWithProximity(t *testing.T) {
			
 
				 	var indexer Indexer
			
 
				 	indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
			
 
				+
			
 
				 	// doc0 = "token2 token4 token4 token2 token3 token4"
			
 
				 	indexer.AddDocument(&types.DocumentIndex{
			
 
				 		DocId: 0,
			
@@ -218,9 +219,20 @@ func TestLookupWithProximity(t *testing.T) {
 
				 			{"token4", 0, []int{7, 14, 35}},
			
 
				 		},
			
 
				 	})
			
 
				-
			
 
				 	utils.Expect(t, "[0 1 [21 28]] ",
			
 
				 		indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
			
 
				+
			
 
				+	// doc0 = "t2 t1 . . . t2 t3"
			
 
				+	indexer.AddDocument(&types.DocumentIndex{
			
 
				+		DocId: 0,
			
 
				+		Keywords: []types.KeywordIndex{
			
 
				+			{"t1", 0, []int{3}},
			
 
				+			{"t2", 0, []int{0, 12}},
			
 
				+			{"t3", 0, []int{15}},
			
 
				+		},
			
 
				+	})
			
 
				+	utils.Expect(t, "[0 8 [3 12 15]] ",
			
 
				+		indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil)))
			
 
				 }
			
 
				 
			
 
				 func TestLookupWithPartialLocations(t *testing.T) {
			
--- a/docs/token_proximity.md
+++ b/docs/token_proximity.md
@@ -1,7 +1,7 @@
 
				 关键词紧邻距离（Token Proximity）
			
 
				 ===
			
 
				 
			
 
				-关键词紧邻距离用来衡量多个关键词在同一文档中是否相邻。比如用户搜索“中国足球”这一短语，包含“中国”和“足球”两个关键词，当这两个关键词按照同样顺序前后紧挨着出现在一个文档中时，紧邻距离为零，如果两词中间夹入很多词则紧邻距离较大。紧邻距离是一种衡量文档和多个关键词相关度的方法。紧邻距离虽然不应该作为给文档排序的唯一指标，但在一些情况下通过设定阀值可以过滤掉相当一部分无关的结果。
			
 
				+关键词紧邻距离用来衡量多个关键词在同一文档中是否相邻。比如用户搜索“中国足球”这一短语，包含“中国”和“足球”两个关键词，当这两个关键词按照同样顺序前后紧挨着出现在一个文档中时，紧邻距离为零，如果两词中间夹入很多词则紧邻距离较大。紧邻距离是一种衡量文档和多个关键词相关度的方法。紧邻距离虽然不应该作为给文档排序的唯一指标，但在一些情况下通过设定阈值可以过滤掉相当一部分无关的结果。
			
 
				 
			
 
				 N关键词的紧邻距离计算公式如下：