search_server.go 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. // 一个微博搜索的例子。
  2. package main
  3. import (
  4. "bufio"
  5. "encoding/gob"
  6. "encoding/json"
  7. "flag"
  8. "github.com/huichen/wukong/engine"
  9. "github.com/huichen/wukong/types"
  10. "io"
  11. "log"
  12. "net/http"
  13. "os"
  14. "os/signal"
  15. "reflect"
  16. "strconv"
  17. "strings"
  18. )
  19. const (
  20. SecondsInADay = 86400
  21. MaxTokenProximity = 2
  22. )
  23. var (
  24. searcher = engine.Engine{}
  25. wbs = map[uint64]Weibo{}
  26. weiboData = flag.String("weibo_data", "../../testdata/weibo_data.txt", "微博数据文件")
  27. dictFile = flag.String("dict_file", "../../data/dictionary.txt", "词典文件")
  28. stopTokenFile = flag.String("stop_token_file", "../../data/stop_tokens.txt", "停用词文件")
  29. staticFolder = flag.String("static_folder", "static", "静态文件目录")
  30. )
  31. type Weibo struct {
  32. Id uint64 `json:"id"`
  33. Timestamp uint64 `json:"timestamp"`
  34. UserName string `json:"user_name"`
  35. RepostsCount uint64 `json:"reposts_count"`
  36. Text string `json:"text"`
  37. }
  38. /*******************************************************************************
  39. 索引
  40. *******************************************************************************/
  41. func indexWeibo() {
  42. // 读入微博数据
  43. file, err := os.Open(*weiboData)
  44. if err != nil {
  45. log.Fatal(err)
  46. }
  47. defer file.Close()
  48. scanner := bufio.NewScanner(file)
  49. for scanner.Scan() {
  50. data := strings.Split(scanner.Text(), "||||")
  51. if len(data) != 10 {
  52. continue
  53. }
  54. wb := Weibo{}
  55. wb.Id, _ = strconv.ParseUint(data[0], 10, 64)
  56. wb.Timestamp, _ = strconv.ParseUint(data[1], 10, 64)
  57. wb.UserName = data[3]
  58. wb.RepostsCount, _ = strconv.ParseUint(data[4], 10, 64)
  59. wb.Text = data[9]
  60. wbs[wb.Id] = wb
  61. }
  62. log.Print("添加索引")
  63. for docId, weibo := range wbs {
  64. searcher.IndexDocument(docId, types.DocumentIndexData{
  65. Content: weibo.Text,
  66. Fields: WeiboScoringFields{
  67. Timestamp: weibo.Timestamp,
  68. RepostsCount: weibo.RepostsCount,
  69. },
  70. })
  71. }
  72. searcher.FlushIndex()
  73. log.Printf("索引了%d条微博\n", len(wbs))
  74. }
  75. /*******************************************************************************
  76. 评分
  77. *******************************************************************************/
  78. type WeiboScoringFields struct {
  79. Timestamp uint64
  80. RepostsCount uint64
  81. }
  82. type WeiboScoringCriteria struct {
  83. }
  84. func (criteria WeiboScoringCriteria) Score(
  85. doc types.IndexedDocument, fields interface{}) []float32 {
  86. if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) {
  87. return []float32{}
  88. }
  89. wsf := fields.(WeiboScoringFields)
  90. output := make([]float32, 3)
  91. if doc.TokenProximity > MaxTokenProximity {
  92. output[0] = 1.0 / float32(doc.TokenProximity)
  93. } else {
  94. output[0] = 1.0
  95. }
  96. output[1] = float32(wsf.Timestamp / (SecondsInADay * 3))
  97. output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000))
  98. return output
  99. }
  100. /*******************************************************************************
  101. JSON-RPC
  102. *******************************************************************************/
  103. type JsonResponse struct {
  104. Docs []*Weibo `json:"docs"`
  105. }
  106. func JsonRpcServer(w http.ResponseWriter, req *http.Request) {
  107. query := req.URL.Query().Get("query")
  108. output := searcher.Search(types.SearchRequest{
  109. Text: query,
  110. RankOptions: &types.RankOptions{
  111. ScoringCriteria: &WeiboScoringCriteria{},
  112. OutputOffset: 0,
  113. MaxOutputs: 100,
  114. },
  115. })
  116. // 整理为输出格式
  117. docs := []*Weibo{}
  118. for _, doc := range output.Docs {
  119. wb := wbs[doc.DocId]
  120. for _, t := range output.Tokens {
  121. wb.Text = strings.Replace(wb.Text, t, "<font color=red>"+t+"</font>", -1)
  122. }
  123. docs = append(docs, &wb)
  124. }
  125. response, _ := json.Marshal(&JsonResponse{Docs: docs})
  126. w.Header().Set("Content-Type", "application/json")
  127. io.WriteString(w, string(response))
  128. }
  129. /*******************************************************************************
  130. 主函数
  131. *******************************************************************************/
  132. func main() {
  133. // 解析命令行参数
  134. flag.Parse()
  135. // 初始化
  136. gob.Register(WeiboScoringFields{})
  137. log.Print("引擎开始初始化")
  138. searcher.Init(types.EngineInitOptions{
  139. SegmenterDictionaries: *dictFile,
  140. StopTokenFile: *stopTokenFile,
  141. IndexerInitOptions: &types.IndexerInitOptions{
  142. IndexType: types.LocationsIndex,
  143. },
  144. // 如果你希望使用持久存储,启用下面的选项
  145. // 默认使用boltdb持久化,如果你希望修改数据库类型
  146. // 请修改 WUKONG_STORAGE_ENGINE 环境变量
  147. // UsePersistentStorage: true,
  148. // PersistentStorageFolder: "weibo_search",
  149. })
  150. log.Print("引擎初始化完毕")
  151. wbs = make(map[uint64]Weibo)
  152. // 索引
  153. log.Print("建索引开始")
  154. go indexWeibo()
  155. log.Print("建索引完毕")
  156. // 捕获ctrl-c
  157. c := make(chan os.Signal, 1)
  158. signal.Notify(c, os.Interrupt)
  159. go func() {
  160. for _ = range c {
  161. log.Print("捕获Ctrl-c,退出服务器")
  162. searcher.Close()
  163. os.Exit(0)
  164. }
  165. }()
  166. http.HandleFunc("/json", JsonRpcServer)
  167. http.Handle("/", http.FileServer(http.Dir(*staticFolder)))
  168. log.Print("服务器启动")
  169. log.Fatal(http.ListenAndServe(":8080", nil))
  170. }