benchmark.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. // 悟空性能测试
  2. package main
import (
	"bufio"
	"flag"
	"log"
	"os"
	"runtime"
	"runtime/pprof"
	"strings"
	"sync"
	"time"

	"github.com/huichen/wukong/engine"
	"github.com/huichen/wukong/types"
)
  15. const (
  16. numRepeatQuery = 1000
  17. )
  18. var (
  19. weibo_data = flag.String(
  20. "weibo_data",
  21. "../testdata/weibo_data.txt",
  22. "微博数据")
  23. queries = flag.String(
  24. "queries",
  25. "女人母亲,你好中国,网络草根,热门微博,红十字会,"+
  26. "鳄鱼表演,星座歧视,chinajoy,高帅富,假期计划",
  27. "待搜索的关键词")
  28. dictionaries = flag.String(
  29. "dictionaries",
  30. "../data/dictionary.txt",
  31. "分词字典文件")
  32. stop_token_file = flag.String(
  33. "stop_token_file",
  34. "../data/stop_tokens.txt",
  35. "停用词文件")
  36. cpuprofile = flag.String("cpuprofile", "", "处理器profile文件")
  37. memprofile = flag.String("memprofile", "", "内存profile文件")
  38. num_repeat_text = flag.Int("num_repeat_text", 10, "文本重复加入多少次")
  39. num_delete_docs = flag.Int("num_delete_docs", 1000, "测试删除文档的个数")
  40. index_type = flag.Int("index_type", types.DocIdsIndex, "索引类型")
  41. use_persistent = flag.Bool("use_persistent", false, "是否使用持久存储")
  42. persistent_storage_folder = flag.String("persistent_storage_folder", "benchmark.persistent", "持久存储数据库保存的目录")
  43. persistent_storage_shards = flag.Int("persistent_storage_shards", 0, "持久数据库存储裂分数目")
  44. searcher = engine.Engine{}
  45. options = types.RankOptions{
  46. OutputOffset: 0,
  47. MaxOutputs: 100,
  48. }
  49. searchQueries = []string{}
  50. NumShards = 2
  51. numQueryThreads = runtime.NumCPU() / NumShards
  52. )
  53. func main() {
  54. // 解析命令行参数
  55. flag.Parse()
  56. searchQueries = strings.Split(*queries, ",")
  57. log.Printf("待搜索的关键词为\"%s\"", searchQueries)
  58. // 初始化
  59. tBeginInit := time.Now()
  60. searcher.Init(types.EngineInitOptions{
  61. SegmenterDictionaries: *dictionaries,
  62. StopTokenFile: *stop_token_file,
  63. IndexerInitOptions: &types.IndexerInitOptions{
  64. IndexType: *index_type,
  65. },
  66. NumShards: NumShards,
  67. DefaultRankOptions: &options,
  68. UsePersistentStorage: *use_persistent,
  69. PersistentStorageFolder: *persistent_storage_folder,
  70. PersistentStorageShards: *persistent_storage_shards,
  71. })
  72. tEndInit := time.Now()
  73. defer searcher.Close()
  74. // 打开将要搜索的文件
  75. file, err := os.Open(*weibo_data)
  76. if err != nil {
  77. log.Fatal(err)
  78. }
  79. defer file.Close()
  80. // 逐行读入
  81. log.Printf("读入文本 %s", *weibo_data)
  82. scanner := bufio.NewScanner(file)
  83. lines := []string{}
  84. size := 0
  85. for scanner.Scan() {
  86. var text string
  87. data := strings.Split(scanner.Text(), "||||")
  88. if len(data) != 10 {
  89. continue
  90. }
  91. text = data[9]
  92. if text != "" {
  93. size += len(text) * (*num_repeat_text)
  94. lines = append(lines, text)
  95. }
  96. }
  97. log.Print("文件行数", len(lines))
  98. // 记录时间
  99. t0 := time.Now()
  100. // 打开处理器profile文件
  101. if *cpuprofile != "" {
  102. f, err := os.Create(*cpuprofile)
  103. if err != nil {
  104. log.Fatal(err)
  105. }
  106. pprof.StartCPUProfile(f)
  107. defer pprof.StopCPUProfile()
  108. }
  109. // 建索引
  110. log.Print("建索引 ... ")
  111. docId := uint64(1)
  112. for i := 0; i < *num_repeat_text; i++ {
  113. for _, line := range lines {
  114. searcher.IndexDocument(docId, types.DocumentIndexData{
  115. Content: line}, false)
  116. docId++
  117. if docId-docId/1000000*1000000 == 0 {
  118. log.Printf("已索引%d百万文档", docId/1000000)
  119. runtime.GC()
  120. }
  121. }
  122. }
  123. searcher.FlushIndex()
  124. log.Print("加入的索引总数", searcher.NumTokenIndexAdded())
  125. // 记录时间
  126. t1 := time.Now()
  127. log.Printf("建立索引花费时间 %v", t1.Sub(t0))
  128. log.Printf("建立索引速度每秒添加 %f 百万个索引",
  129. float64(searcher.NumTokenIndexAdded())/t1.Sub(t0).Seconds()/(1000000))
  130. // 写入内存profile文件
  131. if *memprofile != "" {
  132. f, err := os.Create(*memprofile)
  133. if err != nil {
  134. log.Fatal(err)
  135. }
  136. pprof.WriteHeapProfile(f)
  137. defer f.Close()
  138. }
  139. // 记录时间并计算删除索引时间
  140. t2 := time.Now()
  141. for i := 1; i <= *num_delete_docs; i++ {
  142. searcher.RemoveDocument(uint64(i), false)
  143. }
  144. searcher.FlushIndex()
  145. t3 := time.Now()
  146. log.Printf("删除 %d 条索引花费时间 %v", *num_delete_docs, t3.Sub(t2))
  147. done := make(chan bool)
  148. recordResponseLength := make(map[string]int)
  149. for iThread := 0; iThread < numQueryThreads; iThread++ {
  150. go search(done, recordResponseLength)
  151. }
  152. for iThread := 0; iThread < numQueryThreads; iThread++ {
  153. <-done
  154. }
  155. // 测试搜索结果输出,因为不同 case 的 docId 对应不上,所以只测试总数
  156. for keyword, count := range recordResponseLength {
  157. log.Printf("关键词 [%s] 共搜索到 %d 个相关文档", keyword, count)
  158. }
  159. // 记录时间并计算分词速度
  160. t4 := time.Now()
  161. log.Printf("搜索平均响应时间 %v 毫秒",
  162. t4.Sub(t3).Seconds()*1000/float64(numRepeatQuery*len(searchQueries)))
  163. log.Printf("搜索吞吐量每秒 %v 次查询",
  164. float64(numRepeatQuery*numQueryThreads*len(searchQueries))/
  165. t4.Sub(t3).Seconds())
  166. if *use_persistent {
  167. searcher.Close()
  168. t5 := time.Now()
  169. searcher1 := engine.Engine{}
  170. searcher1.Init(types.EngineInitOptions{
  171. SegmenterDictionaries: *dictionaries,
  172. StopTokenFile: *stop_token_file,
  173. IndexerInitOptions: &types.IndexerInitOptions{
  174. IndexType: *index_type,
  175. },
  176. NumShards: NumShards,
  177. DefaultRankOptions: &options,
  178. UsePersistentStorage: *use_persistent,
  179. PersistentStorageFolder: *persistent_storage_folder,
  180. PersistentStorageShards: *persistent_storage_shards,
  181. })
  182. defer searcher1.Close()
  183. t6 := time.Now()
  184. t := t6.Sub(t5).Seconds() - tEndInit.Sub(tBeginInit).Seconds()
  185. log.Print("从持久存储加入的索引总数", searcher1.NumTokenIndexAdded())
  186. log.Printf("从持久存储建立索引花费时间 %v 秒", t)
  187. log.Printf("从持久存储建立索引速度每秒添加 %f 百万个索引",
  188. float64(searcher1.NumTokenIndexAdded())/t/(1000000))
  189. }
  190. //os.RemoveAll(*persistent_storage_folder)
  191. }
  192. func search(ch chan bool, record map[string]int) {
  193. for i := 0; i < numRepeatQuery; i++ {
  194. for _, query := range searchQueries {
  195. output := searcher.Search(types.SearchRequest{Text: query})
  196. if _, found := record[query]; !found {
  197. record[query] = len(output.Docs)
  198. }
  199. }
  200. }
  201. ch <- true
  202. }