segmenter_worker.go

package engine

import (
	"github.com/huichen/wukong/types"
)

type segmenterRequest struct {
	docId uint64
	hash  uint32
	data  types.DocumentIndexData
}

func (engine *Engine) segmenterWorker() {
	for {
		request := <-engine.segmenterChannel

		// The hash decides which indexer/ranker shard this document goes to.
		shard := engine.getShard(request.hash)
		segments := engine.segmenter.Segment([]byte(request.data.Content))

		// Map from token text to the offsets where the token starts in the content.
		tokensMap := make(map[string][]int)

		// Add the keywords produced by segmentation, skipping stop tokens.
		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		// Add document labels, which bypass segmentation and carry no positions.
		for _, label := range request.data.Labels {
			if !engine.stopTokens.IsStopToken(label) {
				tokensMap[label] = []int{}
			}
		}

		indexerRequest := indexerAddDocumentRequest{
			document: &types.DocumentIndex{
				DocId:       request.docId,
				TokenLength: float32(len(segments)),
				Keywords:    make([]types.KeywordIndex, len(tokensMap)),
			},
		}
		iTokens := 0
		for k, v := range tokensMap {
			indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
				Text: k,
				// Labels have no positions, so their frequency is 0 and they
				// do not participate in the TF-IDF computation.
				Frequency: float32(len(v)),
				Starts:    v}
			iTokens++
		}
		engine.indexerAddDocumentChannels[shard] <- indexerRequest

		rankerRequest := rankerAddScoringFieldsRequest{
			docId: request.docId, fields: request.data.Fields}
		engine.rankerAddScoringFieldsChannels[shard] <- rankerRequest
	}
}
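
// For context, a minimal sketch of how this worker is typically wired up:
// the engine starts a pool of segmenterWorker goroutines and feeds them
// through segmenterChannel. startSegmenters, enqueueDocument, and fnvHash
// below are hypothetical helpers written for illustration only; they are
// not part of the wukong API, and the real engine may derive the shard
// hash from the docId differently.

// startSegmenters spawns numWorkers goroutines, each draining
// segmenterChannel in the loop above.
func (engine *Engine) startSegmenters(numWorkers int) {
	for i := 0; i < numWorkers; i++ {
		go engine.segmenterWorker()
	}
}

// enqueueDocument shows the shape of the request this worker consumes:
// a document id, a stable hash used for sharding, and the index data.
func (engine *Engine) enqueueDocument(docId uint64, data types.DocumentIndexData) {
	engine.segmenterChannel <- segmenterRequest{
		docId: docId,
		hash:  fnvHash(docId), // any stable hash works for sharding
		data:  data,
	}
}

// fnvHash is a stand-in for whatever hash the engine actually uses:
// FNV-1a over the docId bytes, written out by hand to avoid extra imports.
func fnvHash(docId uint64) uint32 {
	const offset32, prime32 = 2166136261, 16777619
	h := uint32(offset32)
	for i := 0; i < 8; i++ {
		h ^= uint32((docId >> (8 * i)) & 0xff)
		h *= prime32
	}
	return h
}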