// indexer.go — inverted-index (keyword → documents) implementation.
package core

import (
	"log"
	"math"
	"sync"

	"github.com/huichen/wukong/types"
	"github.com/huichen/wukong/utils"
)
// Indexer maintains the inverted (search key → documents) index.
type Indexer struct {
	// Inverted index from search keys to the documents containing them.
	// Guarded by the embedded read-write lock for concurrent access.
	tableLock struct {
		sync.RWMutex
		table map[string]*KeywordIndices // search key → posting row
		docs  map[uint64]bool            // set of docIds currently live in the index
	}
	initOptions types.IndexerInitOptions
	initialized bool

	// Approximation of the total number of indexed documents.
	numDocuments uint64

	// Total keyword count over all indexed texts.
	totalTokenLength float32

	// Keyword count of each document, keyed by docId.
	docTokenLengths map[uint64]float32
}
// KeywordIndices is one row of the inverted index: all documents in which
// a given search key appears, sorted by ascending DocId.
type KeywordIndices struct {
	// Which of the slices below are populated depends on the IndexType
	// chosen at initialization.
	docIds      []uint64  // present for every index type
	frequencies []float32 // populated when IndexType == FrequenciesIndex
	locations   [][]int   // populated when IndexType == LocationsIndex
}
  34. // 初始化索引器
  35. func (indexer *Indexer) Init(options types.IndexerInitOptions) {
  36. if indexer.initialized == true {
  37. log.Fatal("索引器不能初始化两次")
  38. }
  39. indexer.initialized = true
  40. indexer.tableLock.table = make(map[string]*KeywordIndices)
  41. indexer.tableLock.docs = make(map[uint64]bool)
  42. indexer.initOptions = options
  43. indexer.docTokenLengths = make(map[uint64]float32)
  44. }
  45. // 向反向索引表中加入一个文档
  46. func (indexer *Indexer) AddDocument(document *types.DocumentIndex) {
  47. if indexer.initialized == false {
  48. log.Fatal("索引器尚未初始化")
  49. }
  50. indexer.tableLock.Lock()
  51. defer indexer.tableLock.Unlock()
  52. // 更新文档关键词总长度
  53. if document.TokenLength != 0 {
  54. originalLength, found := indexer.docTokenLengths[document.DocId]
  55. indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
  56. if found {
  57. indexer.totalTokenLength += document.TokenLength - originalLength
  58. } else {
  59. indexer.totalTokenLength += document.TokenLength
  60. }
  61. }
  62. docIdIsNew := true
  63. for _, keyword := range document.Keywords {
  64. indices, foundKeyword := indexer.tableLock.table[keyword.Text]
  65. if !foundKeyword {
  66. // 如果没找到该搜索键则加入
  67. ti := KeywordIndices{}
  68. switch indexer.initOptions.IndexType {
  69. case types.LocationsIndex:
  70. ti.locations = [][]int{keyword.Starts}
  71. case types.FrequenciesIndex:
  72. ti.frequencies = []float32{keyword.Frequency}
  73. }
  74. ti.docIds = []uint64{document.DocId}
  75. indexer.tableLock.table[keyword.Text] = &ti
  76. continue
  77. }
  78. // 查找应该插入的位置
  79. position, found := indexer.searchIndex(
  80. indices, 0, indexer.getIndexLength(indices)-1, document.DocId)
  81. if found {
  82. docIdIsNew = false
  83. // 覆盖已有的索引项
  84. switch indexer.initOptions.IndexType {
  85. case types.LocationsIndex:
  86. indices.locations[position] = keyword.Starts
  87. case types.FrequenciesIndex:
  88. indices.frequencies[position] = keyword.Frequency
  89. }
  90. continue
  91. }
  92. // 当索引不存在时,插入新索引项
  93. switch indexer.initOptions.IndexType {
  94. case types.LocationsIndex:
  95. indices.locations = append(indices.locations, []int{})
  96. copy(indices.locations[position+1:], indices.locations[position:])
  97. indices.locations[position] = keyword.Starts
  98. case types.FrequenciesIndex:
  99. indices.frequencies = append(indices.frequencies, float32(0))
  100. copy(indices.frequencies[position+1:], indices.frequencies[position:])
  101. indices.frequencies[position] = keyword.Frequency
  102. }
  103. indices.docIds = append(indices.docIds, 0)
  104. copy(indices.docIds[position+1:], indices.docIds[position:])
  105. indices.docIds[position] = document.DocId
  106. }
  107. // 更新文章总数
  108. if docIdIsNew {
  109. indexer.tableLock.docs[document.DocId] = true
  110. indexer.numDocuments++
  111. }
  112. }
// Lookup finds documents containing ALL of the given search keys (an AND
// query over tokens and labels).
// When docIds is non-nil, only documents present in docIds are considered.
// When countDocsOnly is true, only numDocs is computed and docs stays empty.
func (indexer *Indexer) Lookup(
	tokens []string, labels []string, docIds map[uint64]bool, countDocsOnly bool) (docs []types.IndexedDocument, numDocs int) {
	if indexer.initialized == false {
		log.Fatal("索引器尚未初始化")
	}

	if indexer.numDocuments == 0 {
		return
	}
	numDocs = 0

	// Merge tokens and labels into a single list of search keys.
	keywords := make([]string, len(tokens)+len(labels))
	copy(keywords, tokens)
	copy(keywords[len(tokens):], labels)

	indexer.tableLock.RLock()
	defer indexer.tableLock.RUnlock()
	table := make([]*KeywordIndices, len(keywords))
	for i, keyword := range keywords {
		indices, found := indexer.tableLock.table[keyword]
		if !found {
			// A key missing from the inverted index means an empty AND result.
			return
		} else {
			// Otherwise collect its posting row.
			table[i] = indices
		}
	}

	// Nothing to intersect.
	if len(table) == 0 {
		return
	}

	// Merge-intersect the posting lists of all search keys.
	// Scanning backwards emits documents with larger DocIds first.
	indexPointers := make([]int, len(table))
	for iTable := 0; iTable < len(table); iTable++ {
		indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
	}
	// Average document keyword length, used by BM25.
	avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
	for ; indexPointers[0] >= 0; indexPointers[0]-- {
		// Use the first key's current document as the candidate and search
		// the other keys' posting lists for the same DocId.
		baseDocId := indexer.getDocId(table[0], indexPointers[0])
		if _, ok := indexer.tableLock.docs[baseDocId]; !ok {
			continue
		}
		if docIds != nil {
			_, found := docIds[baseDocId]
			if !found {
				continue
			}
		}
		iTable := 1
		found := true
		for ; iTable < len(table); iTable++ {
			// Binary search beats a plain sequential merge here. Faster
			// intersection algorithms exist, but a sequential merge may be
			// the better choice once the table is re-implemented with
			// linked lists to avoid write-locking on insertion.
			// TODO: study speed and scalability of other intersection algorithms.
			position, foundBaseDocId := indexer.searchIndex(table[iTable],
				0, indexPointers[iTable], baseDocId)
			if foundBaseDocId {
				indexPointers[iTable] = position
			} else {
				if position == 0 {
					// Every DocId in this key's list is larger than
					// baseDocId, so no further candidate can match.
					return
				} else {
					// Move on to the next indexPointers[0] candidate.
					indexPointers[iTable] = position - 1
					found = false
					break
				}
			}
		}

		if found {
			indexedDoc := types.IndexedDocument{}

			// For LocationsIndex, compute keyword proximity.
			if indexer.initOptions.IndexType == types.LocationsIndex {
				// Count how many tokens carry location information.
				numTokensWithLocations := 0
				for i, t := range table[:len(tokens)] {
					if len(t.locations[indexPointers[i]]) > 0 {
						numTokensWithLocations++
					}
				}
				if numTokensWithLocations != len(tokens) {
					if !countDocsOnly {
						docs = append(docs, types.IndexedDocument{
							DocId: baseDocId,
						})
					}
					numDocs++
					// NOTE(review): this break aborts the whole candidate
					// scan at the first document lacking location data; a
					// continue to the next candidate looks like the
					// intent — confirm before changing.
					break
				}

				// Compute the minimal proximity of the search keys in this document.
				tokenProximity, tokenLocations := computeTokenProximity(table[:len(tokens)], indexPointers, tokens)
				indexedDoc.TokenProximity = int32(tokenProximity)
				indexedDoc.TokenSnippetLocations = tokenLocations

				// Record per-token locations.
				indexedDoc.TokenLocations = make([][]int, len(tokens))
				for i, t := range table[:len(tokens)] {
					indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
				}
			}

			// For LocationsIndex or FrequenciesIndex, compute BM25.
			if indexer.initOptions.IndexType == types.LocationsIndex ||
				indexer.initOptions.IndexType == types.FrequenciesIndex {
				bm25 := float32(0)
				d := indexer.docTokenLengths[baseDocId]
				for i, t := range table[:len(tokens)] {
					var frequency float32
					if indexer.initOptions.IndexType == types.LocationsIndex {
						frequency = float32(len(t.locations[indexPointers[i]]))
					} else {
						frequency = t.frequencies[indexPointers[i]]
					}

					// Accumulate this token's BM25 contribution.
					if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
						// Smoothed idf.
						idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
						k1 := indexer.initOptions.BM25Parameters.K1
						b := indexer.initOptions.BM25Parameters.B
						bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
					}
				}
				indexedDoc.BM25 = float32(bm25)
			}

			indexedDoc.DocId = baseDocId
			if !countDocsOnly {
				docs = append(docs, indexedDoc)
			}
			numDocs++
		}
	}
	return
}
  251. // 二分法查找indices中某文档的索引项
  252. // 第一个返回参数为找到的位置或需要插入的位置
  253. // 第二个返回参数标明是否找到
  254. func (indexer *Indexer) searchIndex(
  255. indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
  256. // 特殊情况
  257. if indexer.getIndexLength(indices) == start {
  258. return start, false
  259. }
  260. if docId < indexer.getDocId(indices, start) {
  261. return start, false
  262. } else if docId == indexer.getDocId(indices, start) {
  263. return start, true
  264. }
  265. if docId > indexer.getDocId(indices, end) {
  266. return end + 1, false
  267. } else if docId == indexer.getDocId(indices, end) {
  268. return end, true
  269. }
  270. // 二分
  271. var middle int
  272. for end-start > 1 {
  273. middle = (start + end) / 2
  274. if docId == indexer.getDocId(indices, middle) {
  275. return middle, true
  276. } else if docId > indexer.getDocId(indices, middle) {
  277. start = middle
  278. } else {
  279. end = middle
  280. }
  281. }
  282. return end, false
  283. }
// computeTokenProximity computes the minimal proximity of the search keys
// within a document's text.
//
// If the i-th search key's first byte appears at position P_i with length
// L_i, the proximity is
//
//	ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
//
// It is computed by dynamic programming, evaluating the optimum for the
// first i tokens at each occurrence position in turn. The chosen P_i
// values are returned via tokenLocations.
func computeTokenProximity(table []*KeywordIndices, indexPointers []int, tokens []string) (
	minTokenProximity int, tokenLocations []int) {
	minTokenProximity = -1
	tokenLocations = make([]int, len(tokens))

	var (
		currentLocations, nextLocations []int
		currentMinValues, nextMinValues []int
		path                            [][]int
	)

	// Initialize the backtracking table: path[i][j] records which
	// occurrence of token i-1 led to the optimum when token i sits at
	// its j-th occurrence.
	path = make([][]int, len(tokens))
	for i := 1; i < len(path); i++ {
		path[i] = make([]int, len(table[i].locations[indexPointers[i]]))
	}

	// Dynamic programming over the tokens, left to right.
	currentLocations = table[0].locations[indexPointers[0]]
	currentMinValues = make([]int, len(currentLocations))
	for i := 1; i < len(tokens); i++ {
		nextLocations = table[i].locations[indexPointers[i]]
		nextMinValues = make([]int, len(nextLocations))
		// -1 marks an unreachable state.
		for j, _ := range nextMinValues {
			nextMinValues[j] = -1
		}

		var iNext int
		for iCurrent, currentLocation := range currentLocations {
			if currentMinValues[iCurrent] == -1 {
				continue
			}
			// Advance iNext to the last occurrence before currentLocation.
			for iNext+1 < len(nextLocations) && nextLocations[iNext+1] < currentLocation {
				iNext++
			}

			// update relaxes the transition from occurrence `from` of the
			// previous token to occurrence `to` of the current token.
			// NOTE(review): the gap uses len(tokens[i-1]), the byte length
			// of the previous token — assumes locations are byte offsets.
			update := func(from int, to int) {
				if to >= len(nextLocations) {
					return
				}
				value := currentMinValues[from] + utils.AbsInt(nextLocations[to]-currentLocations[from]-len(tokens[i-1]))
				if nextMinValues[to] == -1 || value < nextMinValues[to] {
					nextMinValues[to] = value
					path[i][to] = from
				}
			}

			// Optimal transitions only occur at the nearest positions on
			// either side of the current location.
			update(iCurrent, iNext)
			update(iCurrent, iNext+1)
		}

		currentLocations = nextLocations
		currentMinValues = nextMinValues
	}

	// Pick the best final state.
	var cursor int
	for i, value := range currentMinValues {
		if value == -1 {
			continue
		}
		if minTokenProximity == -1 || value < minTokenProximity {
			minTokenProximity = value
			cursor = i
		}
	}

	// Walk the path backwards to recover the chosen positions.
	for i := len(tokens) - 1; i >= 0; i-- {
		if i != len(tokens)-1 {
			cursor = path[i+1][cursor]
		}
		tokenLocations[i] = table[i].locations[indexPointers[i]][cursor]
	}
	return
}
// getDocId returns the DocId of the i-th posting in ti.
func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
	return ti.docIds[i]
}
// getIndexLength returns the number of documents in ti's posting list.
func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
	return len(ti.docIds)
}
  369. // 删除某个文档
  370. func (indexer *Indexer) RemoveDoc(docId uint64) {
  371. if indexer.initialized == false {
  372. log.Fatal("排序器尚未初始化")
  373. }
  374. indexer.tableLock.Lock()
  375. delete(indexer.tableLock.docs, docId)
  376. indexer.numDocuments--
  377. indexer.tableLock.Unlock()
  378. }