search_server.go 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. // 一个微博搜索的例子。
  2. package main
  3. import (
  4. "bufio"
  5. "encoding/gob"
  6. "encoding/json"
  7. "flag"
  8. "fmt"
  9. "io"
  10. "log"
  11. "net/http"
  12. "os"
  13. "os/signal"
  14. "reflect"
  15. "strconv"
  16. "strings"
  17. "github.com/huichen/wukong/engine"
  18. "github.com/huichen/wukong/types"
  19. )
  20. const (
  21. SecondsInADay = 86400
  22. MaxTokenProximity = 2
  23. )
  24. var (
  25. searcher = engine.Engine{}
  26. wbs = map[uint64]Weibo{}
  27. weiboData = flag.String("weibo_data", "../../testdata/weibo_data.txt", "微博数据文件")
  28. dictFile = flag.String("dict_file", "../../data/dictionary.txt", "词典文件")
  29. stopTokenFile = flag.String("stop_token_file", "../../data/stop_tokens.txt", "停用词文件")
  30. staticFolder = flag.String("static_folder", "static", "静态文件目录")
  31. )
  32. type Weibo struct {
  33. Id uint64 `json:"id"`
  34. Timestamp uint64 `json:"timestamp"`
  35. UserName string `json:"user_name"`
  36. RepostsCount uint64 `json:"reposts_count"`
  37. Text string `json:"text"`
  38. }
  39. /*******************************************************************************
  40. 索引
  41. *******************************************************************************/
  42. func indexWeibo() {
  43. // 读入微博数据
  44. file, err := os.Open(*weiboData)
  45. if err != nil {
  46. log.Fatal(err)
  47. }
  48. defer file.Close()
  49. scanner := bufio.NewScanner(file)
  50. for scanner.Scan() {
  51. data := strings.Split(scanner.Text(), "||||")
  52. if len(data) != 10 {
  53. continue
  54. }
  55. wb := Weibo{}
  56. wb.Id, _ = strconv.ParseUint(data[0], 10, 64)
  57. wb.Timestamp, _ = strconv.ParseUint(data[1], 10, 64)
  58. wb.UserName = data[3]
  59. wb.RepostsCount, _ = strconv.ParseUint(data[4], 10, 64)
  60. wb.Text = data[9]
  61. wbs[wb.Id] = wb
  62. }
  63. log.Print("添加索引")
  64. for docId, weibo := range wbs {
  65. index := fmt.Sprintf("wb:%d", docId) // strconv.FormatUint(docId, 10)
  66. searcher.IndexDocumentS(index, types.DocumentIndexData{
  67. Content: weibo.Text,
  68. Fields: WeiboScoringFields{
  69. Timestamp: weibo.Timestamp,
  70. RepostsCount: weibo.RepostsCount,
  71. },
  72. }, false)
  73. }
  74. searcher.FlushIndex()
  75. log.Printf("索引了%d条微博\n", len(wbs))
  76. }
  77. /*******************************************************************************
  78. 评分
  79. *******************************************************************************/
  80. type WeiboScoringFields struct {
  81. Timestamp uint64
  82. RepostsCount uint64
  83. }
  84. type WeiboScoringCriteria struct {
  85. }
  86. func (criteria WeiboScoringCriteria) Score(
  87. doc types.IndexedDocument, fields interface{}) []float32 {
  88. if reflect.TypeOf(fields) != reflect.TypeOf(WeiboScoringFields{}) {
  89. return []float32{}
  90. }
  91. wsf := fields.(WeiboScoringFields)
  92. output := make([]float32, 3)
  93. if doc.TokenProximity > MaxTokenProximity {
  94. output[0] = 1.0 / float32(doc.TokenProximity)
  95. } else {
  96. output[0] = 1.0
  97. }
  98. output[1] = float32(wsf.Timestamp / (SecondsInADay * 3))
  99. output[2] = float32(doc.BM25 * (1 + float32(wsf.RepostsCount)/10000))
  100. return output
  101. }
  102. /*******************************************************************************
  103. JSON-RPC
  104. *******************************************************************************/
  105. type JsonResponse struct {
  106. Docs []*Weibo `json:"docs"`
  107. }
  108. func JsonRpcServer(w http.ResponseWriter, req *http.Request) {
  109. query := req.URL.Query().Get("query")
  110. output := searcher.Search(types.SearchRequest{
  111. Text: query,
  112. RankOptions: &types.RankOptions{
  113. ScoringCriteria: &WeiboScoringCriteria{},
  114. OutputOffset: 0,
  115. MaxOutputs: 100,
  116. },
  117. })
  118. // 整理为输出格式
  119. docs := []*Weibo{}
  120. for _, doc := range output.Docs {
  121. parts := strings.Split(doc.DocId, ":")
  122. id, _ := strconv.ParseUint(parts[1], 10, 64)
  123. wb := wbs[id]
  124. for _, t := range output.Tokens {
  125. wb.Text = strings.Replace(wb.Text, t, "<font color=red>"+t+"</font>", -1)
  126. }
  127. docs = append(docs, &wb)
  128. }
  129. response, _ := json.Marshal(&JsonResponse{Docs: docs})
  130. w.Header().Set("Content-Type", "application/json")
  131. io.WriteString(w, string(response))
  132. }
  133. /*******************************************************************************
  134. 主函数
  135. *******************************************************************************/
  136. func main() {
  137. // 解析命令行参数
  138. flag.Parse()
  139. // 初始化
  140. gob.Register(WeiboScoringFields{})
  141. log.Print("引擎开始初始化")
  142. searcher.Init(types.EngineInitOptions{
  143. SegmenterDictionaries: *dictFile,
  144. StopTokenFile: *stopTokenFile,
  145. IndexerInitOptions: &types.IndexerInitOptions{
  146. IndexType: types.LocationsIndex,
  147. },
  148. // 如果你希望使用持久存储,启用下面的选项
  149. // 默认使用boltdb持久化,如果你希望修改数据库类型
  150. // 请修改 WUKONG_STORAGE_ENGINE 环境变量
  151. // UsePersistentStorage: true,
  152. // PersistentStorageFolder: "weibo_search",
  153. })
  154. log.Print("引擎初始化完毕")
  155. wbs = make(map[uint64]Weibo)
  156. // 索引
  157. log.Print("建索引开始")
  158. go indexWeibo()
  159. log.Print("建索引完毕")
  160. // 捕获ctrl-c
  161. c := make(chan os.Signal, 1)
  162. signal.Notify(c, os.Interrupt)
  163. go func() {
  164. for _ = range c {
  165. log.Print("捕获Ctrl-c,退出服务器")
  166. searcher.Close()
  167. os.Exit(0)
  168. }
  169. }()
  170. http.HandleFunc("/json", JsonRpcServer)
  171. http.Handle("/", http.FileServer(http.Dir(*staticFolder)))
  172. log.Print("服务器启动")
  173. log.Fatal(http.ListenAndServe(":8080", nil))
  174. }