search_server.go 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162
  1. // 一个微博搜索的例子。
  2. package main
  3. import (
  4. "bufio"
  5. "encoding/json"
  6. "flag"
  7. "github.com/huichen/wukong/engine"
  8. "github.com/huichen/wukong/types"
  9. "io"
  10. "log"
  11. "net/http"
  12. "os"
  13. "reflect"
  14. "strconv"
  15. "strings"
  16. )
// Tuning constants used by the scoring code.
const (
	// SecondsInADay is the number of seconds in one day. Score divides a
	// post's timestamp by three times this value.
	SecondsInADay = 86400
	// MaxTokenProximity is the largest token proximity for which Score still
	// assigns the full proximity score of 1.0.
	MaxTokenProximity = 2
)
var (
	// searcher is the shared search engine: main initializes it, indexWeibo
	// adds documents to it and JsonRpcServer queries it.
	searcher = engine.Engine{}
	// wbs holds every loaded Weibo keyed by its Id; the same key is used as
	// the document id when the post is indexed.
	wbs = map[uint64]Weibo{}
)
// Weibo is one microblog post, as loaded from the data file and as returned
// to clients in the JSON search response.
type Weibo struct {
	// Id identifies the post; it is the key in wbs and the index document id.
	Id uint64 `json:"id"`
	// Timestamp is the post time.
	// NOTE(review): presumably seconds since the Unix epoch, since Score
	// divides it by a multiple of SecondsInADay — confirm against the data.
	Timestamp uint64 `json:"timestamp"`
	// UserName is the author's name.
	UserName string `json:"user_name"`
	// RepostsCount is the number of reposts; Score uses it to boost BM25.
	RepostsCount uint64 `json:"reposts_count"`
	// Text is the post body. It is the content that gets indexed, and the
	// search handler returns it with matched tokens wrapped in <font> tags.
	Text string `json:"text"`
}
  32. /*******************************************************************************
  33. 索引
  34. *******************************************************************************/
  35. func indexWeibo() {
  36. // 读入微博数据
  37. file, err := os.Open("../../testdata/weibo_data.txt")
  38. if err != nil {
  39. log.Fatal(err)
  40. }
  41. defer file.Close()
  42. scanner := bufio.NewScanner(file)
  43. for scanner.Scan() {
  44. data := strings.Split(scanner.Text(), "||||")
  45. if len(data) != 10 {
  46. continue
  47. }
  48. wb := Weibo{}
  49. wb.Id, _ = strconv.ParseUint(data[0], 10, 64)
  50. wb.Timestamp, _ = strconv.ParseUint(data[1], 10, 64)
  51. wb.UserName = data[3]
  52. wb.RepostsCount, _ = strconv.ParseUint(data[4], 10, 64)
  53. wb.Text = data[9]
  54. wbs[wb.Id] = wb
  55. }
  56. log.Print("添加索引")
  57. for docId, weibo := range wbs {
  58. searcher.IndexDocument(docId, types.DocumentIndexData{
  59. Content: weibo.Text,
  60. Fields: WeiboScoringFields{
  61. Timestamp: weibo.Timestamp,
  62. RepostsCount: weibo.RepostsCount,
  63. },
  64. })
  65. }
  66. searcher.FlushIndex()
  67. log.Printf("索引了%d条微博\n", len(wbs))
  68. }
  69. /*******************************************************************************
  70. 评分
  71. *******************************************************************************/
// WeiboScoringFields is the per-document data stored in the index alongside
// each post and handed back to WeiboScoringCriteria.Score at query time.
type WeiboScoringFields struct {
	// Timestamp is copied from Weibo.Timestamp when the post is indexed.
	Timestamp uint64
	// RepostsCount is copied from Weibo.RepostsCount when the post is indexed.
	RepostsCount uint64
}
// WeiboScoringCriteria is the field-less scoring rule that the search handler
// passes to the engine with every request; its Score method computes the
// ranking scores from a document's WeiboScoringFields.
type WeiboScoringCriteria struct {
}
  78. func (criteria WeiboScoringCriteria) Score(
  79. doc types.IndexedDocument, fields interface{}) []float32 {
  80. if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) {
  81. return []float32{}
  82. }
  83. wsf := fields.(WeiboScoringFields)
  84. output := make([]float32, 3)
  85. if doc.TokenProximity > MaxTokenProximity {
  86. output[0] = 1.0 / float32(doc.TokenProximity)
  87. } else {
  88. output[0] = 1.0
  89. }
  90. output[1] = float32(wsf.Timestamp / (SecondsInADay * 3))
  91. output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000))
  92. return output
  93. }
  94. /*******************************************************************************
  95. JSON-RPC
  96. *******************************************************************************/
// JsonResponse is the body returned by the /json search endpoint.
type JsonResponse struct {
	// Docs lists the matched posts in the order the engine returned them,
	// with matched tokens in Text wrapped in highlighting markup.
	Docs []*Weibo `json:"docs"`
}
  100. func JsonRpcServer(w http.ResponseWriter, req *http.Request) {
  101. query := req.URL.Query().Get("query")
  102. output := searcher.Search(types.SearchRequest{
  103. Text: query,
  104. RankOptions: &types.RankOptions{
  105. ScoringCriteria: &WeiboScoringCriteria{},
  106. OutputOffset: 0,
  107. MaxOutputs: 100,
  108. },
  109. })
  110. // 整理为输出格式
  111. docs := []*Weibo{}
  112. for _, doc := range output.Docs {
  113. wb := wbs[doc.DocId]
  114. for _, t := range output.Tokens {
  115. wb.Text = strings.Replace(wb.Text, t, "<font color=red>"+t+"</font>", -1)
  116. }
  117. docs = append(docs, &wb)
  118. }
  119. response, _ := json.Marshal(&JsonResponse{Docs: docs})
  120. w.Header().Set("Content-Type", "application/json")
  121. io.WriteString(w, string(response))
  122. }
  123. /*******************************************************************************
  124. 主函数
  125. *******************************************************************************/
  126. func main() {
  127. // 解析命令行参数
  128. flag.Parse()
  129. // 初始化
  130. searcher.Init(types.EngineInitOptions{
  131. SegmenterDictionaries: "../../data/dictionary.txt",
  132. StopTokenFile: "../../data/stop_tokens.txt",
  133. IndexerInitOptions: &types.IndexerInitOptions{
  134. IndexType: types.LocationsIndex,
  135. },
  136. })
  137. wbs = make(map[uint64]Weibo)
  138. // 索引
  139. go indexWeibo()
  140. http.HandleFunc("/json", JsonRpcServer)
  141. http.Handle("/", http.FileServer(http.Dir("static")))
  142. log.Print("服务器启动")
  143. http.ListenAndServe("localhost:8080", nil)
  144. }