indexer.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. package core
  2. import (
  3. "github.com/huichen/wukong/types"
  4. "log"
  5. "math"
  6. "sync"
  7. )
// Indexer is an inverted index mapping search keys (tokens and labels)
// to the documents that contain them.
type Indexer struct {
	// Inverted index table from search key to its document list.
	// Guarded by a read-write lock for concurrent read/write safety.
	tableLock struct {
		sync.RWMutex
		table map[string]*KeywordIndices
	}

	initOptions types.IndexerInitOptions
	initialized bool

	// This is really only an approximation of the total document count
	// (see AddDocument: a document is counted as new whenever none of
	// its keywords' rows already contained its DocId).
	numDocuments uint64

	// Total keyword count over all indexed texts (feeds the BM25
	// average-length term in Lookup).
	totalTokenLength float32

	// Keyword length (token count) of each document, keyed by DocId.
	docTokenLengths map[uint64]float32
}
// KeywordIndices is one row of the inverted index table: all documents in
// which a given search key occurs, sorted by DocId in ascending order.
type KeywordIndices struct {
	// Which of the slices below are populated depends on the IndexType
	// chosen at initialization.
	docIds      []uint64  // populated for every index type
	frequencies []float32 // populated when IndexType == FrequenciesIndex
	locations   [][]int   // populated when IndexType == LocationsIndex
}
  32. // 初始化索引器
  33. func (indexer *Indexer) Init(options types.IndexerInitOptions) {
  34. if indexer.initialized == true {
  35. log.Fatal("索引器不能初始化两次")
  36. }
  37. indexer.initialized = true
  38. indexer.tableLock.table = make(map[string]*KeywordIndices)
  39. indexer.initOptions = options
  40. indexer.docTokenLengths = make(map[uint64]float32)
  41. }
// AddDocument adds one document to the inverted index. If the document
// (same DocId) was indexed before under a keyword, that keyword's entry
// is overwritten in place; otherwise a new entry is inserted keeping each
// row sorted by DocId.
func (indexer *Indexer) AddDocument(document *types.DocumentIndex) {
	if indexer.initialized == false {
		log.Fatal("索引器尚未初始化")
	}

	indexer.tableLock.Lock()
	defer indexer.tableLock.Unlock()

	// Update the total keyword length, replacing any length previously
	// recorded for this document.
	if document.TokenLength != 0 {
		originalLength, found := indexer.docTokenLengths[document.DocId]
		indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
		if found {
			indexer.totalTokenLength += document.TokenLength - originalLength
		} else {
			indexer.totalTokenLength += document.TokenLength
		}
	}

	docIdIsNew := true
	for _, keyword := range document.Keywords {
		indices, foundKeyword := indexer.tableLock.table[keyword.Text]
		if !foundKeyword {
			// Search key not seen before: create a new row for it.
			ti := KeywordIndices{}
			switch indexer.initOptions.IndexType {
			case types.LocationsIndex:
				ti.locations = [][]int{keyword.Starts}
			case types.FrequenciesIndex:
				ti.frequencies = []float32{keyword.Frequency}
			}
			ti.docIds = []uint64{document.DocId}
			indexer.tableLock.table[keyword.Text] = &ti
			continue
		}

		// Find where this DocId belongs (rows are kept sorted by DocId).
		position, found := indexer.searchIndex(
			indices, 0, indexer.getIndexLength(indices)-1, document.DocId)
		if found {
			docIdIsNew = false

			// DocId already present: overwrite the existing entry.
			switch indexer.initOptions.IndexType {
			case types.LocationsIndex:
				indices.locations[position] = keyword.Starts
			case types.FrequenciesIndex:
				indices.frequencies[position] = keyword.Frequency
			}
			continue
		}

		// DocId not present: insert a new entry at position by growing
		// the slice by one and shifting the tail right.
		switch indexer.initOptions.IndexType {
		case types.LocationsIndex:
			indices.locations = append(indices.locations, []int{})
			copy(indices.locations[position+1:], indices.locations[position:])
			indices.locations[position] = keyword.Starts
		case types.FrequenciesIndex:
			indices.frequencies = append(indices.frequencies, float32(0))
			copy(indices.frequencies[position+1:], indices.frequencies[position:])
			indices.frequencies[position] = keyword.Frequency
		}
		indices.docIds = append(indices.docIds, 0)
		copy(indices.docIds[position+1:], indices.docIds[position:])
		indices.docIds[position] = document.DocId
	}

	// Update the document count. NOTE(review): this is an approximation —
	// the document is treated as new unless at least one keyword row
	// already contained its DocId, so re-adding a document under entirely
	// new keywords double-counts it.
	if docIdIsNew {
		indexer.numDocuments++
	}
}
// Lookup finds the documents containing all search keys (AND semantics).
// When docIds is non-nil, only documents whose DocId is a key of *docIds
// are considered. Documents are emitted with larger DocIds first.
func (indexer *Indexer) Lookup(
	tokens []string, labels []string, docIds *map[uint64]bool) (docs []types.IndexedDocument) {
	if indexer.initialized == false {
		log.Fatal("索引器尚未初始化")
	}

	if indexer.numDocuments == 0 {
		return
	}

	// Merge tokens and labels into a single list of search keys.
	keywords := make([]string, len(tokens)+len(labels))
	copy(keywords, tokens)
	copy(keywords[len(tokens):], labels)

	indexer.tableLock.RLock()
	defer indexer.tableLock.RUnlock()
	table := make([]*KeywordIndices, len(keywords))
	for i, keyword := range keywords {
		indices, found := indexer.tableLock.table[keyword]
		if !found {
			// A search key absent from the table means the AND query
			// has no results: return immediately.
			return
		} else {
			// Otherwise collect its row for the merge below.
			table[i] = indices
		}
	}

	// Nothing found (no search keys at all): return immediately.
	if len(table) == 0 {
		return
	}

	// Merge-join the document lists of all search keys to compute their
	// intersection. Scanning from the back yields larger DocIds first.
	indexPointers := make([]int, len(table))
	for iTable := 0; iTable < len(table); iTable++ {
		indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
	}

	// Average document keyword length, used by the BM25 formula.
	avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
	for ; indexPointers[0] >= 0; indexPointers[0]-- {
		// Use the first search key's documents as the baseline and look
		// for the same document under every other search key.
		baseDocId := indexer.getDocId(table[0], indexPointers[0])
		if docIds != nil {
			_, found := (*docIds)[baseDocId]
			if !found {
				continue
			}
		}
		iTable := 1
		found := true
		for ; iTable < len(table); iTable++ {
			// Binary search beats a plain sequential merge here, and
			// faster intersection algorithms exist, but a sequential
			// merge may become the better choice: the plan is to
			// re-implement the table with linked lists to avoid the
			// write lock when adding new documents.
			// TODO: study speed and scalability of different
			// intersection algorithms.
			position, foundBaseDocId := indexer.searchIndex(table[iTable],
				0, indexPointers[iTable], baseDocId)
			if foundBaseDocId {
				indexPointers[iTable] = position
			} else {
				if position == 0 {
					// Every DocId under this search key is larger than
					// baseDocId, so no further lookup can succeed.
					return
				} else {
					// Move on to the next indexPointers[0].
					indexPointers[iTable] = position - 1
					found = false
					break
				}
			}
		}

		if found {
			indexedDoc := types.IndexedDocument{}

			// For LocationsIndex, compute token proximity.
			if indexer.initOptions.IndexType == types.LocationsIndex {
				// Count how many tokens carry location information.
				numTokensWithLocations := 0
				for i, t := range table[:len(tokens)] {
					if len(t.locations[indexPointers[i]]) > 0 {
						numTokensWithLocations++
					}
				}
				if numTokensWithLocations != len(tokens) {
					docs = append(docs, types.IndexedDocument{
						DocId: baseDocId,
					})
					// NOTE(review): this break exits the entire DocId
					// scan, not just the current document — confirm
					// that aborting the whole lookup is intended here.
					break
				}

				// Compute the minimal proximity of the search keys
				// inside this document.
				tokenLocations := make([]int, len(tokens))
				tokenProximity := computeTokenProximity(
					table[:len(tokens)], indexPointers, tokens, &tokenLocations)
				indexedDoc.TokenProximity = int32(tokenProximity)
				indexedDoc.TokenSnippetLocations = tokenLocations

				// Record the per-token location lists.
				indexedDoc.TokenLocations = make([][]int, len(tokens))
				for i, t := range table[:len(tokens)] {
					indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
				}
			}

			// For LocationsIndex or FrequenciesIndex, compute BM25.
			if indexer.initOptions.IndexType == types.LocationsIndex ||
				indexer.initOptions.IndexType == types.FrequenciesIndex {
				bm25 := float32(0)
				d := indexer.docTokenLengths[baseDocId]
				for i, t := range table[:len(tokens)] {
					var frequency float32
					if indexer.initOptions.IndexType == types.LocationsIndex {
						frequency = float32(len(t.locations[indexPointers[i]]))
					} else {
						frequency = t.frequencies[indexPointers[i]]
					}

					// Accumulate this token's BM25 contribution.
					if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
						// Smoothed idf.
						idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
						k1 := indexer.initOptions.BM25Parameters.K1
						b := indexer.initOptions.BM25Parameters.B
						bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
					}
				}
				indexedDoc.BM25 = float32(bm25)
			}

			indexedDoc.DocId = baseDocId
			docs = append(docs, indexedDoc)
		}
	}
	return
}
  239. // 二分法查找indices中某文档的索引项
  240. // 第一个返回参数为找到的位置或需要插入的位置
  241. // 第二个返回参数标明是否找到
  242. func (indexer *Indexer) searchIndex(
  243. indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
  244. // 特殊情况
  245. if indexer.getIndexLength(indices) == start {
  246. return start, false
  247. }
  248. if docId < indexer.getDocId(indices, start) {
  249. return start, false
  250. } else if docId == indexer.getDocId(indices, start) {
  251. return start, true
  252. }
  253. if docId > indexer.getDocId(indices, end) {
  254. return end + 1, false
  255. } else if docId == indexer.getDocId(indices, end) {
  256. return end, true
  257. }
  258. // 二分
  259. var middle int
  260. for end-start > 1 {
  261. middle = (start + end) / 2
  262. if docId == indexer.getDocId(indices, middle) {
  263. return middle, true
  264. } else if docId > indexer.getDocId(indices, middle) {
  265. start = middle
  266. } else {
  267. end = middle
  268. }
  269. }
  270. return end, false
  271. }
// computeTokenProximity computes the minimal proximity of the search keys
// within a document's text.
//
// Assume the i-th search key's first byte appears at position P_i in the
// text and the key has length L_i. The proximity is defined as
//
//	ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
//
// Concretely: fix some P_1, pick among all candidate P_2 the one that
// minimizes Abs(P_2 - P_1 - L_1), then with P_2 fixed choose P_3 the same
// way, and so on. Every possible P_1 is tried and the smallest total
// proximity wins. (Returns -1 if the loop body never runs, i.e. the first
// key has no locations for this document.)
//
// The chosen P_i values are passed back through tokenLocations.
func computeTokenProximity(
	table []*KeywordIndices,
	indexPointers []int,
	tokens []string,
	tokenLocations *[]int) int {
	minTokenProximity := -1
	currentLocations := make([]int, len(tokens))
	for _, primaryLocation := range table[0].locations[indexPointers[0]] {
		tokenProximity := 0
		previousLocation := primaryLocation + len(tokens[0]) // P_1 + L_1
		for iToken := 1; iToken < len(tokens); iToken++ {
			locations := table[iToken].locations[indexPointers[iToken]]
			// Find the nearest P_(i+1) at or after P_i + L_i.
			for currentLocations[iToken] = 0; currentLocations[iToken] < len(locations) &&
				locations[currentLocations[iToken]] < previousLocation; currentLocations[iToken]++ {
			}
			if currentLocations[iToken] == 0 {
				// The found P_(i+1) is the first occurrence of key i+1.
				tokenProximity += locations[currentLocations[iToken]] -
					previousLocation
			} else if currentLocations[iToken] == len(locations) {
				// Otherwise, even the last occurrence of key i+1 lies
				// before P_i + L_i.
				tokenProximity += previousLocation -
					locations[currentLocations[iToken]-1]
				currentLocations[iToken]--
			} else {
				rightProximity := locations[currentLocations[iToken]] - previousLocation
				leftProximity := previousLocation - locations[currentLocations[iToken]-1]
				if rightProximity > leftProximity {
					// The left neighbor is closer.
					tokenProximity += leftProximity
					currentLocations[iToken]--
				} else {
					// The right neighbor is closer.
					tokenProximity += rightProximity
				}
			}
			// Advance to P_(i+1) + L_(i+1).
			previousLocation = locations[currentLocations[iToken]] + len(tokens[iToken])
		}
		// Record a new minimal proximity and the locations achieving it.
		if minTokenProximity < 0 || minTokenProximity > tokenProximity {
			minTokenProximity = tokenProximity
			(*tokenLocations)[0] = primaryLocation
			for iToken := 1; iToken < len(tokens); iToken++ {
				(*tokenLocations)[iToken] = table[iToken].locations[indexPointers[iToken]][currentLocations[iToken]]
			}
		}
	}
	return minTokenProximity
}
  334. // 从KeywordIndices中得到第i个文档的DocId
  335. func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
  336. return ti.docIds[i]
  337. }
  338. // 得到KeywordIndices中文档总数
  339. func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
  340. return len(ti.docIds)
  341. }