segmenter_worker.go 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. package engine
  2. import (
  3. "github.com/huichen/wukong/types"
  4. )
  5. type segmenterRequest struct {
  6. docId uint64
  7. hash uint32
  8. data types.DocumentIndexData
  9. }
  10. func (engine *Engine) segmenterWorker() {
  11. for {
  12. request := <-engine.segmenterChannel
  13. shard := engine.getShard(request.hash)
  14. tokensMap := make(map[string][]int)
  15. numTokens := 0
  16. if !engine.initOptions.NotUsingSegmenter && request.data.Content != "" {
  17. // 当文档正文不为空时,优先从内容分词中得到关键词
  18. segments := engine.segmenter.Segment([]byte(request.data.Content))
  19. for _, segment := range segments {
  20. token := segment.Token().Text()
  21. if !engine.stopTokens.IsStopToken(token) {
  22. tokensMap[token] = append(tokensMap[token], segment.Start())
  23. }
  24. }
  25. numTokens = len(segments)
  26. } else {
  27. // 否则载入用户输入的关键词
  28. for _, t := range request.data.Tokens {
  29. if !engine.stopTokens.IsStopToken(t.Text) {
  30. tokensMap[t.Text] = t.Locations
  31. }
  32. }
  33. numTokens = len(request.data.Tokens)
  34. }
  35. if !engine.initOptions.NotUsingSegmenter {
  36. // 加入非分词的文档标签
  37. for _, label := range request.data.Labels {
  38. if !engine.stopTokens.IsStopToken(label) {
  39. tokensMap[label] = []int{}
  40. }
  41. }
  42. }
  43. indexerRequest := indexerAddDocumentRequest{
  44. document: &types.DocumentIndex{
  45. DocId: request.docId,
  46. TokenLength: float32(numTokens),
  47. Keywords: make([]types.KeywordIndex, len(tokensMap)),
  48. },
  49. }
  50. iTokens := 0
  51. for k, v := range tokensMap {
  52. indexerRequest.document.Keywords[iTokens] = types.KeywordIndex{
  53. Text: k,
  54. // 非分词标注的词频设置为0,不参与tf-idf计算
  55. Frequency: float32(len(v)),
  56. Starts: v}
  57. iTokens++
  58. }
  59. engine.indexerAddDocumentChannels[shard] <- indexerRequest
  60. rankerRequest := rankerAddScoringFieldsRequest{
  61. docId: request.docId, fields: request.data.Fields}
  62. engine.rankerAddScoringFieldsChannels[shard] <- rankerRequest
  63. }
  64. }