segmenter_worker.go 2.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778
  1. package engine
  2. import (
  3. "github.com/huichen/wukong/types"
  4. )
// segmenterRequest is the unit of work delivered to segmenterWorker: one
// document to be tokenized and forwarded to the indexer and ranker.
type segmenterRequest struct {
	// docId uniquely identifies the document within the engine.
	docId uint64
	// hash is used to pick the shard this document is routed to (see getShard).
	hash uint32
	// data carries the document content, caller-supplied tokens, labels
	// and ranking fields.
	data types.DocumentIndexData
}
  10. func (engine *Engine) segmenterWorker() {
  11. for {
  12. request := <-engine.segmenterChannel
  13. shard := engine.getShard(request.hash)
  14. tokensMap := make(map[string][]int)
  15. numTokens := 0
  16. if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
  17. // 当文档正文不为空时,优先从内容分词中得到关键词
  18. segments := engine.segmenter.Segment([]byte(request.data.Content))
  19. for _, segment := range segments {
  20. token := segment.Token().Text()
  21. if !engine.stopTokens.IsStopToken(token) {
  22. tokensMap[token] = append(tokensMap[token], segment.Start())
  23. }
  24. }
  25. numTokens = len(segments)
  26. } else {
  27. // 否则载入用户输入的关键词
  28. for _, t := range request.data.Tokens {
  29. if !engine.stopTokens.IsStopToken(t.Text) {
  30. tokensMap[t.Text] = t.Locations
  31. }
  32. }
  33. numTokens = len(request.data.Tokens)
  34. }
  35. // 加入非分词的文档标签
  36. for _, label := range request.data.Labels {
  37. if !engine.initOptions.NotUsingSegmenter {
  38. if !engine.stopTokens.IsStopToken(label) {
  39. //当正文中已存在关键字时,若不判断,位置信息将会丢失
  40. if _, ok := tokensMap[label]; !ok {
  41. tokensMap[label] = []int{}
  42. }
  43. }
  44. } else {
  45. //当正文中已存在关键字时,若不判断,位置信息将会丢失
  46. if _, ok := tokensMap[label]; !ok {
  47. tokensMap[label] = []int{}
  48. }
  49. }
  50. }
  51. indexerRequest := indexerAddDocumentRequest{
  52. document: &types.DocumentIndex{
  53. DocId: request.docId,
  54. TokenLength: float32(numTokens),
  55. Keywords: make([]types.KeywordIndex, len(tokensMap)),
  56. },
  57. }
  58. iTokens := 0
  59. for k, v := range tokensMap {
  60. indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
  61. Text: k,
  62. // 非分词标注的词频设置为0,不参与tf-idf计算
  63. Frequency: float32(len(v)),
  64. Starts: v}
  65. iTokens++
  66. }
  67. engine.indexerAddDocumentChannels[shard] <- indexerRequest
  68. rankerRequest := rankerAddDocRequest{
  69. docId: request.docId, fields: request.data.Fields}
  70. engine.rankerAddDocChannels[shard] <- rankerRequest
  71. }
  72. }