segmenter_worker.go 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. package engine
  2. import (
  3. "github.com/huichen/wukong/types"
  4. )
  5. type segmenterRequest struct {
  6. docId uint64
  7. hash uint32
  8. data types.DocumentIndexData
  9. }
  10. func (engine *Engine) segmenterWorker() {
  11. for {
  12. request := <-engine.segmenterChannel
  13. shard := engine.getShard(request.hash)
  14. tokensMap := make(map[string][]int)
  15. numTokens := 0
  16. if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
  17. // 当文档正文不为空时,优先从内容分词中得到关键词
  18. segments := engine.segmenter.Segment([]byte(request.data.Content))
  19. for _, segment := range segments {
  20. token := segment.Token().Text()
  21. if !engine.stopTokens.IsStopToken(token) {
  22. tokensMap[token] = append(tokensMap[token], segment.Start())
  23. }
  24. }
  25. numTokens = len(segments)
  26. } else {
  27. // 否则载入用户输入的关键词
  28. for _, t := range request.data.Tokens {
  29. if !engine.stopTokens.IsStopToken(t.Text) {
  30. tokensMap[t.Text] = t.Locations
  31. }
  32. }
  33. numTokens = len(request.data.Tokens)
  34. }
  35. // 加入非分词的文档标签
  36. for _, label := range request.data.Labels {
  37. if !engine.initOptions.NotUsingSegmenter {
  38. if !engine.stopTokens.IsStopToken(label) {
  39. tokensMap[label] = []int{}
  40. }
  41. } else {
  42. tokensMap[label] = []int{}
  43. }
  44. }
  45. indexerRequest := indexerAddDocumentRequest{
  46. document: &types.DocumentIndex{
  47. DocId: request.docId,
  48. TokenLength: float32(numTokens),
  49. Keywords: make([]types.KeywordIndex, len(tokensMap)),
  50. },
  51. }
  52. iTokens := 0
  53. for k, v := range tokensMap {
  54. indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
  55. Text: k,
  56. // 非分词标注的词频设置为0,不参与tf-idf计算
  57. Frequency: float32(len(v)),
  58. Starts: v}
  59. iTokens++
  60. }
  61. engine.indexerAddDocumentChannels[shard] <- indexerRequest
  62. rankerRequest := rankerAddDocRequest{
  63. docId: request.docId, fields: request.data.Fields}
  64. engine.rankerAddDocChannels[shard] <- rankerRequest
  65. }
  66. }