| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382 |
- package core
- import (
- "github.com/huichen/wukong/types"
- "log"
- "math"
- "sync"
- )
- // 索引器
- type Indexer struct {
- // 从搜索键到文档列表的反向索引
- // 加了读写锁以保证读写安全
- tableLock struct {
- sync.RWMutex
- table map[string]*KeywordIndices
- }
- initOptions types.IndexerInitOptions
- initialized bool
- // 这实际上是总文档数的一个近似
- numDocuments uint64
- // 所有被索引文本的总关键词数
- totalTokenLength float32
- // 每个文档的关键词长度
- docTokenLengths map[uint64]float32
- }
- // 反向索引表的一行,收集了一个搜索键出现的所有文档,按照DocId从小到大排序。
- type KeywordIndices struct {
- // 下面的切片是否为空,取决于初始化时IndexType的值
- docIds []uint64 // 全部类型都有
- frequencies []float32 // IndexType == FrequenciesIndex
- locations [][]int // IndexType == LocationsIndex
- }
- // 初始化索引器
- func (indexer *Indexer) Init(options types.IndexerInitOptions) {
- if indexer.initialized == true {
- log.Fatal("索引器不能初始化两次")
- }
- indexer.initialized = true
- indexer.tableLock.table = make(map[string]*KeywordIndices)
- indexer.initOptions = options
- indexer.docTokenLengths = make(map[uint64]float32)
- }
- // 向反向索引表中加入一个文档
- func (indexer *Indexer) AddDocument(document *types.DocumentIndex) {
- if indexer.initialized == false {
- log.Fatal("索引器尚未初始化")
- }
- indexer.tableLock.Lock()
- defer indexer.tableLock.Unlock()
- // 更新文档关键词总长度
- if document.TokenLength != 0 {
- originalLength, found := indexer.docTokenLengths[document.DocId]
- indexer.docTokenLengths[document.DocId] = float32(document.TokenLength)
- if found {
- indexer.totalTokenLength += document.TokenLength - originalLength
- } else {
- indexer.totalTokenLength += document.TokenLength
- }
- }
- docIdIsNew := true
- for _, keyword := range document.Keywords {
- indices, foundKeyword := indexer.tableLock.table[keyword.Text]
- if !foundKeyword {
- // 如果没找到该搜索键则加入
- ti := KeywordIndices{}
- switch indexer.initOptions.IndexType {
- case types.LocationsIndex:
- ti.locations = [][]int{keyword.Starts}
- case types.FrequenciesIndex:
- ti.frequencies = []float32{keyword.Frequency}
- }
- ti.docIds = []uint64{document.DocId}
- indexer.tableLock.table[keyword.Text] = &ti
- continue
- }
- // 查找应该插入的位置
- position, found := indexer.searchIndex(
- indices, 0, indexer.getIndexLength(indices)-1, document.DocId)
- if found {
- docIdIsNew = false
- // 覆盖已有的索引项
- switch indexer.initOptions.IndexType {
- case types.LocationsIndex:
- indices.locations[position] = keyword.Starts
- case types.FrequenciesIndex:
- indices.frequencies[position] = keyword.Frequency
- }
- continue
- }
- // 当索引不存在时,插入新索引项
- switch indexer.initOptions.IndexType {
- case types.LocationsIndex:
- indices.locations = append(indices.locations, []int{})
- copy(indices.locations[position+1:], indices.locations[position:])
- indices.locations[position] = keyword.Starts
- case types.FrequenciesIndex:
- indices.frequencies = append(indices.frequencies, float32(0))
- copy(indices.frequencies[position+1:], indices.frequencies[position:])
- indices.frequencies[position] = keyword.Frequency
- }
- indices.docIds = append(indices.docIds, 0)
- copy(indices.docIds[position+1:], indices.docIds[position:])
- indices.docIds[position] = document.DocId
- }
- // 更新文章总数
- if docIdIsNew {
- indexer.numDocuments++
- }
- }
- // 查找包含全部搜索键(AND操作)的文档
- // 当docIds不为nil时仅从docIds指定的文档中查找
- func (indexer *Indexer) Lookup(
- tokens []string, labels []string, docIds *map[uint64]bool) (docs []types.IndexedDocument) {
- if indexer.initialized == false {
- log.Fatal("索引器尚未初始化")
- }
- if indexer.numDocuments == 0 {
- return
- }
- // 合并关键词和标签为搜索键
- keywords := make([]string, len(tokens)+len(labels))
- copy(keywords, tokens)
- copy(keywords[len(tokens):], labels)
- indexer.tableLock.RLock()
- defer indexer.tableLock.RUnlock()
- table := make([]*KeywordIndices, len(keywords))
- for i, keyword := range keywords {
- indices, found := indexer.tableLock.table[keyword]
- if !found {
- // 当反向索引表中无此搜索键时直接返回
- return
- } else {
- // 否则加入反向表中
- table[i] = indices
- }
- }
- // 当没有找到时直接返回
- if len(table) == 0 {
- return
- }
- // 归并查找各个搜索键出现文档的交集
- // 从后向前查保证先输出DocId较大文档
- indexPointers := make([]int, len(table))
- for iTable := 0; iTable < len(table); iTable++ {
- indexPointers[iTable] = indexer.getIndexLength(table[iTable]) - 1
- }
- // 平均文本关键词长度,用于计算BM25
- avgDocLength := indexer.totalTokenLength / float32(indexer.numDocuments)
- for ; indexPointers[0] >= 0; indexPointers[0]-- {
- // 以第一个搜索键出现的文档作为基准,并遍历其他搜索键搜索同一文档
- baseDocId := indexer.getDocId(table[0], indexPointers[0])
- if docIds != nil {
- _, found := (*docIds)[baseDocId]
- if !found {
- continue
- }
- }
- iTable := 1
- found := true
- for ; iTable < len(table); iTable++ {
- // 二分法比简单的顺序归并效率高,也有更高效率的算法,
- // 但顺序归并也许是更好的选择,考虑到将来需要用链表重新实现
- // 以避免反向表添加新文档时的写锁。
- // TODO: 进一步研究不同求交集算法的速度和可扩展性。
- position, foundBaseDocId := indexer.searchIndex(table[iTable],
- 0, indexPointers[iTable], baseDocId)
- if foundBaseDocId {
- indexPointers[iTable] = position
- } else {
- if position == 0 {
- // 该搜索键中所有的文档ID都比baseDocId大,因此已经没有
- // 继续查找的必要。
- return
- } else {
- // 继续下一indexPointers[0]的查找
- indexPointers[iTable] = position - 1
- found = false
- break
- }
- }
- }
- if found {
- indexedDoc := types.IndexedDocument{}
- // 当为LocationsIndex时计算关键词紧邻距离
- if indexer.initOptions.IndexType == types.LocationsIndex {
- // 计算有多少关键词是带有距离信息的
- numTokensWithLocations := 0
- for i, t := range table[:len(tokens)] {
- if len(t.locations[indexPointers[i]]) > 0 {
- numTokensWithLocations++
- }
- }
- if numTokensWithLocations != len(tokens) {
- docs = append(docs, types.IndexedDocument{
- DocId: baseDocId,
- })
- break
- }
- // 计算搜索键在文档中的紧邻距离
- tokenLocations := make([]int, len(tokens))
- tokenProximity := computeTokenProximity(
- table[:len(tokens)], indexPointers, tokens, &tokenLocations)
- indexedDoc.TokenProximity = int32(tokenProximity)
- indexedDoc.TokenSnippetLocations = tokenLocations
- // 添加TokenLocations
- indexedDoc.TokenLocations = make([][]int, len(tokens))
- for i, t := range table[:len(tokens)] {
- indexedDoc.TokenLocations[i] = t.locations[indexPointers[i]]
- }
- }
- // 当为LocationsIndex或者FrequenciesIndex时计算BM25
- if indexer.initOptions.IndexType == types.LocationsIndex ||
- indexer.initOptions.IndexType == types.FrequenciesIndex {
- bm25 := float32(0)
- d := indexer.docTokenLengths[baseDocId]
- for i, t := range table[:len(tokens)] {
- var frequency float32
- if indexer.initOptions.IndexType == types.LocationsIndex {
- frequency = float32(len(t.locations[indexPointers[i]]))
- } else {
- frequency = t.frequencies[indexPointers[i]]
- }
- // 计算BM25
- if len(t.docIds) > 0 && frequency > 0 && indexer.initOptions.BM25Parameters != nil && avgDocLength != 0 {
- // 带平滑的idf
- idf := float32(math.Log2(float64(indexer.numDocuments)/float64(len(t.docIds)) + 1))
- k1 := indexer.initOptions.BM25Parameters.K1
- b := indexer.initOptions.BM25Parameters.B
- bm25 += idf * frequency * (k1 + 1) / (frequency + k1*(1-b+b*d/avgDocLength))
- }
- }
- indexedDoc.BM25 = float32(bm25)
- }
- indexedDoc.DocId = baseDocId
- docs = append(docs, indexedDoc)
- }
- }
- return
- }
- // 二分法查找indices中某文档的索引项
- // 第一个返回参数为找到的位置或需要插入的位置
- // 第二个返回参数标明是否找到
- func (indexer *Indexer) searchIndex(
- indices *KeywordIndices, start int, end int, docId uint64) (int, bool) {
- // 特殊情况
- if indexer.getIndexLength(indices) == start {
- return start, false
- }
- if docId < indexer.getDocId(indices, start) {
- return start, false
- } else if docId == indexer.getDocId(indices, start) {
- return start, true
- }
- if docId > indexer.getDocId(indices, end) {
- return end + 1, false
- } else if docId == indexer.getDocId(indices, end) {
- return end, true
- }
- // 二分
- var middle int
- for end-start > 1 {
- middle = (start + end) / 2
- if docId == indexer.getDocId(indices, middle) {
- return middle, true
- } else if docId > indexer.getDocId(indices, middle) {
- start = middle
- } else {
- end = middle
- }
- }
- return end, false
- }
- // 计算搜索键在文本中的紧邻距离
- //
- // 假定第i个搜索键首字节出现在文本中的位置为P_i,长度L_i
- // 紧邻距离计算公式为
- //
- // ArgMin(Sum(Abs(P_(i+1) - P_i - L_i)))
- //
- // 具体计算过程为先取定一个P_1,计算所有P_2的可能值中令Abs(P_2 - P_1 - L1)最小,
- // 然后固定P2后依照同样的方法选择P3,P4,等等。遍历所有可能的P_1得到最小的紧邻距离。
- //
- // 选定的P_i通过tokenLocations参数传回。
- func computeTokenProximity(
- table []*KeywordIndices,
- indexPointers []int,
- tokens []string,
- tokenLocations *[]int) int {
- minTokenProximity := -1
- currentLocations := make([]int, len(tokens))
- for _, primaryLocation := range table[0].locations[indexPointers[0]] {
- tokenProximity := 0
- previousLocation := primaryLocation + len(tokens[0]) // P_1 + L_1
- for iToken := 1; iToken < len(tokens); iToken++ {
- locations := table[iToken].locations[indexPointers[iToken]]
- // 寻找 P_i + L_i 后面最近的那个 P_(i+1)
- for currentLocations[iToken] = 0; currentLocations[iToken] < len(locations) &&
- locations[currentLocations[iToken]] < previousLocation; currentLocations[iToken]++ {
- }
- if currentLocations[iToken] == 0 {
- // 找到的P_(i+1)是搜索键i+1出现的第一个位置
- tokenProximity += locations[currentLocations[iToken]] -
- previousLocation
- } else if currentLocations[iToken] == len(locations) {
- // 否则当搜索键i+1出现的最后一个位置仍然小于P_i + L_i
- tokenProximity += previousLocation -
- locations[currentLocations[iToken]-1]
- currentLocations[iToken]--
- } else {
- rightProximity := locations[currentLocations[iToken]] - previousLocation
- leftProximity := previousLocation - locations[currentLocations[iToken]-1]
- if rightProximity > leftProximity {
- // 左侧更接近
- tokenProximity += leftProximity
- currentLocations[iToken]--
- } else {
- // 右侧更接近
- tokenProximity += rightProximity
- }
- }
- // 更新 P_(i+1) + L_(i+1)
- previousLocation = locations[currentLocations[iToken]] + len(tokens[iToken])
- }
- // 更新搜索键紧邻距离
- if minTokenProximity < 0 || minTokenProximity > tokenProximity {
- minTokenProximity = tokenProximity
- (*tokenLocations)[0] = primaryLocation
- for iToken := 1; iToken < len(tokens); iToken++ {
- (*tokenLocations)[iToken] = table[iToken].locations[indexPointers[iToken]][currentLocations[iToken]]
- }
- }
- }
- return minTokenProximity
- }
- // 从KeywordIndices中得到第i个文档的DocId
- func (indexer *Indexer) getDocId(ti *KeywordIndices, i int) uint64 {
- return ti.docIds[i]
- }
- // 得到KeywordIndices中文档总数
- func (indexer *Indexer) getIndexLength(ti *KeywordIndices) int {
- return len(ti.docIds)
- }
|