package engine import ( "github.com/huichen/wukong/types" ) type segmenterRequest struct { docId uint64 hash uint32 data types.DocumentIndexData forceUpdate bool } func (engine *Engine) segmenterWorker() { for { request := <-engine.segmenterChannel if request.docId == 0 { if request.forceUpdate { for i := 0; i < engine.initOptions.NumShards; i++ { engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true} } } continue } shard := engine.getShard(request.hash) tokensMap := make(map[string][]int) numTokens := 0 if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" { // 当文档正文不为空时,优先从内容分词中得到关键词 segments := engine.segmenter.Segment([]byte(request.data.Content)) for _, segment := range segments { token := segment.Token().Text() if !engine.stopTokens.IsStopToken(token) { tokensMap[token] = append(tokensMap[token], segment.Start()) } } numTokens = len(segments) } else { // 否则载入用户输入的关键词 for _, t := range request.data.Tokens { if !engine.stopTokens.IsStopToken(t.Text) { tokensMap[t.Text] = t.Locations } } numTokens = len(request.data.Tokens) } // 加入非分词的文档标签 for _, label := range request.data.Labels { if !engine.initOptions.NotUsingSegmenter { if !engine.stopTokens.IsStopToken(label) { //当正文中已存在关键字时,若不判断,位置信息将会丢失 if _, ok := tokensMap[label]; !ok { tokensMap[label] = []int{} } } } else { //当正文中已存在关键字时,若不判断,位置信息将会丢失 if _, ok := tokensMap[label]; !ok { tokensMap[label] = []int{} } } } indexerRequest := indexerAddDocumentRequest{ document: &types.DocumentIndex{ DocId: request.docId, TokenLength: float32(numTokens), Keywords: make([]types.KeywordIndex, len(tokensMap)), }, forceUpdate: request.forceUpdate, } iTokens := 0 for k, v := range tokensMap { indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{ Text: k, // 非分词标注的词频设置为0,不参与tf-idf计算 Frequency: float32(len(v)), Starts: v} iTokens++ } engine.indexerAddDocChannels[shard] <- indexerRequest if request.forceUpdate { for i := 0; i < engine.initOptions.NumShards; i++ { if i == shard { continue } engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true} } } rankerRequest := rankerAddDocRequest{ docId: request.docId, fields: request.data.Fields} engine.rankerAddDocChannels[shard] <- rankerRequest } }