// segmenter_worker.go
  1. package engine
  2. import (
  3. "github.com/huichen/wukong/types"
  4. )
// segmenterRequest is the unit of work delivered to segmenterWorker over
// engine.segmenterChannel.
type segmenterRequest struct {
	// docId identifies the document; 0 is reserved as a control sentinel
	// (see segmenterWorker).
	docId uint64
	// hash selects the target indexer/ranker shard via engine.getShard.
	hash uint32
	// data carries the document content, caller-supplied tokens, labels
	// and ranking fields.
	data types.DocumentIndexData
	// forceUpdate is propagated to every indexer shard — presumably to
	// force an index flush; confirm against the indexer implementation.
	forceUpdate bool
}
  11. func (engine *Engine) segmenterWorker() {
  12. for {
  13. request := <-engine.segmenterChannel
  14. if request.docId == 0 {
  15. if request.forceUpdate {
  16. for i := 0; i < engine.initOptions.NumShards; i++ {
  17. engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
  18. }
  19. }
  20. continue
  21. }
  22. shard := engine.getShard(request.hash)
  23. tokensMap := make(map[string][]int)
  24. numTokens := 0
  25. if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
  26. // 当文档正文不为空时,优先从内容分词中得到关键词
  27. segments := engine.segmenter.Segment([]byte(request.data.Content))
  28. for _, segment := range segments {
  29. token := segment.Token().Text()
  30. if !engine.stopTokens.IsStopToken(token) {
  31. tokensMap[token] = append(tokensMap[token], segment.Start())
  32. }
  33. }
  34. numTokens = len(segments)
  35. } else {
  36. // 否则载入用户输入的关键词
  37. for _, t := range request.data.Tokens {
  38. if !engine.stopTokens.IsStopToken(t.Text) {
  39. tokensMap[t.Text] = t.Locations
  40. }
  41. }
  42. numTokens = len(request.data.Tokens)
  43. }
  44. // 加入非分词的文档标签
  45. for _, label := range request.data.Labels {
  46. if !engine.initOptions.NotUsingSegmenter {
  47. if !engine.stopTokens.IsStopToken(label) {
  48. tokensMap[label] = []int{}
  49. }
  50. } else {
  51. tokensMap[label] = []int{}
  52. }
  53. }
  54. indexerRequest := indexerAddDocumentRequest{
  55. document: &types.DocumentIndex{
  56. DocId: request.docId,
  57. TokenLength: float32(numTokens),
  58. Keywords: make([]types.KeywordIndex, len(tokensMap)),
  59. },
  60. forceUpdate: request.forceUpdate,
  61. }
  62. iTokens := 0
  63. for k, v := range tokensMap {
  64. indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
  65. Text: k,
  66. // 非分词标注的词频设置为0,不参与tf-idf计算
  67. Frequency: float32(len(v)),
  68. Starts: v}
  69. iTokens++
  70. }
  71. engine.indexerAddDocChannels[shard] <- indexerRequest
  72. if request.forceUpdate {
  73. for i := 0; i < engine.initOptions.NumShards; i++ {
  74. if i == shard {
  75. continue
  76. }
  77. engine.indexerAddDocChannels[i] <- indexerAddDocumentRequest{forceUpdate: true}
  78. }
  79. }
  80. rankerRequest := rankerAddDocRequest{
  81. docId: request.docId, fields: request.data.Fields}
  82. engine.rankerAddDocChannels[shard] <- rankerRequest
  83. }
  84. }