segmenter_worker.go 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768
  1. package engine
  2. import (
  3. "github.com/huichen/wukong/types"
  4. )
  5. type segmenterRequest struct {
  6. docId uint64
  7. hash uint32
  8. data types.DocumentIndexData
  9. }
  10. func (engine *Engine) segmenterWorker() {
  11. for {
  12. request := <-engine.segmenterChannel
  13. shard := engine.getShard(request.hash)
  14. tokensMap := make(map[string][]int)
  15. numTokens := 0
  16. if request.data.Content != "" {
  17. // 当文档正文不为空时,优先从内容分词中得到关键词
  18. segments := engine.segmenter.Segment([]byte(request.data.Content))
  19. for _, segment := range segments {
  20. token := segment.Token().Text()
  21. if !engine.stopTokens.IsStopToken(token) {
  22. tokensMap[token] = append(tokensMap[token], segment.Start())
  23. }
  24. }
  25. numTokens = len(segments)
  26. } else {
  27. // 否则载入用户输入的关键词
  28. for _, t := range request.data.Tokens {
  29. if !engine.stopTokens.IsStopToken(t.Text) {
  30. tokensMap[t.Text] = t.Locations
  31. }
  32. }
  33. numTokens = len(request.data.Tokens)
  34. }
  35. // 加入非分词的文档标签
  36. for _, label := range request.data.Labels {
  37. if !engine.stopTokens.IsStopToken(label) {
  38. tokensMap[label] = []int{}
  39. }
  40. }
  41. indexerRequest := indexerAddDocumentRequest{
  42. document: &types.DocumentIndex{
  43. DocId: request.docId,
  44. TokenLength: float32(numTokens),
  45. Keywords: make([]types.KeywordIndex, len(tokensMap)),
  46. },
  47. }
  48. iTokens := 0
  49. for k, v := range tokensMap {
  50. indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
  51. Text: k,
  52. // 非分词标注的词频设置为0,不参与tf-idf计算
  53. Frequency: float32(len(v)),
  54. Starts: v}
  55. iTokens++
  56. }
  57. engine.indexerAddDocumentChannels[shard] <- indexerRequest
  58. rankerRequest := rankerAddScoringFieldsRequest{
  59. docId: request.docId, fields: request.data.Fields}
  60. engine.rankerAddScoringFieldsChannels[shard] <- rankerRequest
  61. }
  62. }