benchmark.go

// Wukong search engine benchmark
package main

import (
	"bufio"
	"flag"
	"log"
	"math/rand"
	"os"
	"runtime"
	"runtime/pprof"
	"strings"
	"sync"
	"time"

	"github.com/huichen/wukong/engine"
	"github.com/huichen/wukong/types"
)
const (
	numRepeatQuery = 1000
)

var (
	weibo_data = flag.String(
		"weibo_data",
		"../testdata/weibo_data.txt",
		"Weibo data file")
	queries = flag.String(
		"queries",
		"女人母亲,你好中国,网络草根,热门微博,红十字会,"+
			"鳄鱼表演,星座歧视,chinajoy,高帅富,假期计划",
		"comma-separated search queries")
	dictionaries = flag.String(
		"dictionaries",
		"../data/dictionary.txt",
		"segmenter dictionary file")
	stop_token_file = flag.String(
		"stop_token_file",
		"../data/stop_tokens.txt",
		"stop token file")
	cpuprofile                = flag.String("cpuprofile", "", "CPU profile output file")
	memprofile                = flag.String("memprofile", "", "memory profile output file")
	num_repeat_text           = flag.Int("num_repeat_text", 10, "number of times each text is added to the index")
	num_delete_docs           = flag.Int("num_delete_docs", 1000, "number of documents to delete in the test")
	index_type                = flag.Int("index_type", types.DocIdsIndex, "index type")
	use_persistent            = flag.Bool("use_persistent", false, "whether to use persistent storage")
	persistent_storage_folder = flag.String("persistent_storage_folder", "benchmark.persistent", "directory where the persistent storage database is kept")
	persistent_storage_shards = flag.Int("persistent_storage_shards", 0, "number of persistent storage shards")

	searcher = engine.Engine{}
	options  = types.RankOptions{
		OutputOffset: 0,
		MaxOutputs:   100,
	}
	searchQueries = []string{}

	NumShards       = 2
	numQueryThreads = runtime.NumCPU() / NumShards
)
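// Example invocation, using the flags defined above (flag values are
// illustrative and assume the default data files exist relative to this
// directory):
//
//	go run benchmark.go -num_repeat_text=10 -num_delete_docs=1000 -cpuprofile=cpu.prof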
func main() {
	// Parse command-line flags
	flag.Parse()
	searchQueries = strings.Split(*queries, ",")
	log.Printf("Search queries: \"%s\"", searchQueries)

	// Initialize the engine
	tBeginInit := time.Now()
	searcher.Init(types.EngineInitOptions{
		SegmenterDictionaries: *dictionaries,
		StopTokenFile:         *stop_token_file,
		IndexerInitOptions: &types.IndexerInitOptions{
			IndexType: *index_type,
		},
		NumShards:               NumShards,
		DefaultRankOptions:      &options,
		UsePersistentStorage:    *use_persistent,
		PersistentStorageFolder: *persistent_storage_folder,
		PersistentStorageShards: *persistent_storage_shards,
	})
	tEndInit := time.Now()
	defer searcher.Close()
	// Open the file to be indexed and searched
	file, err := os.Open(*weibo_data)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Read the file line by line; each line has 10 fields separated by "||||",
	// with the document text in the last field
	log.Printf("Reading text from %s", *weibo_data)
	scanner := bufio.NewScanner(file)
	lines := []string{}
	size := 0
	for scanner.Scan() {
		var text string
		data := strings.Split(scanner.Text(), "||||")
		if len(data) != 10 {
			continue
		}
		text = data[9]
		if text != "" {
			size += len(text) * (*num_repeat_text)
			lines = append(lines, text)
		}
	}
	log.Print("number of lines read ", len(lines))
	// Record the start time
	t0 := time.Now()

	// Start the CPU profile if requested
	if *cpuprofile != "" {
		f, err := os.Create(*cpuprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.StartCPUProfile(f)
		defer pprof.StopCPUProfile()
	}
	// Build the index
	log.Print("Building index ... ")
	// Shuffle the docIds for the test; rand.Perm cannot be used if the largest
	// docId exceeds the int range
	docIds := rand.Perm(*num_repeat_text * len(lines))
	docIdx := 0
	for i := 0; i < *num_repeat_text; i++ {
		for _, line := range lines {
			// +1 keeps docIds starting from 1
			searcher.IndexDocument(uint64(docIds[docIdx]+1), types.DocumentIndexData{
				Content: line}, false)
			docIdx++
			if docIdx%1000000 == 0 {
				log.Printf("Indexed %d million documents", docIdx/1000000)
				runtime.GC()
			}
		}
	}
	searcher.FlushIndex()
	log.Print("total token indexes added ", searcher.NumTokenIndexAdded())

	// Record the time and report indexing speed
	t1 := time.Now()
	log.Printf("Indexing took %v", t1.Sub(t0))
	log.Printf("Indexing speed: %f million token indexes added per second",
		float64(searcher.NumTokenIndexAdded())/t1.Sub(t0).Seconds()/(1000000))
	runtime.GC()

	// Write the memory profile if requested
	if *memprofile != "" {
		f, err := os.Create(*memprofile)
		if err != nil {
			log.Fatal(err)
		}
		pprof.WriteHeapProfile(f)
		defer f.Close()
	}
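	// The resulting profiles can be inspected with the standard Go tooling,
	// e.g. (file names are illustrative):
	//
	//	go tool pprof cpu.prof
	//	go tool pprof mem.prof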
	// Record the time and measure document deletion
	t2 := time.Now()
	for i := 1; i <= *num_delete_docs; i++ {
		searcher.RemoveDocument(uint64(i), false)
	}
	searcher.FlushIndex()
	t3 := time.Now()
	log.Printf("Deleting %d documents took %v", *num_delete_docs, t3.Sub(t2))
	// Run the search queries concurrently
	done := make(chan bool)
	recordResponse := recordResponseLock{}
	recordResponse.count = make(map[string]int)
	for iThread := 0; iThread < numQueryThreads; iThread++ {
		go search(done, &recordResponse)
	}
	for iThread := 0; iThread < numQueryThreads; iThread++ {
		<-done
	}

	// Record the time and compute search latency and throughput
	t4 := time.Now()
	log.Printf("Average search latency %v ms",
		t4.Sub(t3).Seconds()*1000/float64(numRepeatQuery*len(searchQueries)))
	log.Printf("Search throughput: %v queries per second",
		float64(numRepeatQuery*numQueryThreads*len(searchQueries))/
			t4.Sub(t3).Seconds())

	// Report search results; docIds differ between runs, so only the number of
	// matched documents per query is reported
	recordResponse.RLock()
	for keyword, count := range recordResponse.count {
		log.Printf("Query [%s] matched %d documents", keyword, count)
	}
	recordResponse.RUnlock()
	// When persistent storage is enabled, close the engine and rebuild the
	// index from the on-disk data to measure reload speed
	if *use_persistent {
		searcher.Close()
		t5 := time.Now()
		searcher1 := engine.Engine{}
		searcher1.Init(types.EngineInitOptions{
			SegmenterDictionaries: *dictionaries,
			StopTokenFile:         *stop_token_file,
			IndexerInitOptions: &types.IndexerInitOptions{
				IndexType: *index_type,
			},
			NumShards:               NumShards,
			DefaultRankOptions:      &options,
			UsePersistentStorage:    *use_persistent,
			PersistentStorageFolder: *persistent_storage_folder,
			PersistentStorageShards: *persistent_storage_shards,
		})
		defer searcher1.Close()
		t6 := time.Now()

		// Exclude the initial engine setup time measured at startup
		t := t6.Sub(t5).Seconds() - tEndInit.Sub(tBeginInit).Seconds()
		log.Print("total token indexes loaded from persistent storage ", searcher1.NumTokenIndexAdded())
		log.Printf("Rebuilding the index from persistent storage took %v seconds", t)
		log.Printf("Rebuild speed from persistent storage: %f million token indexes per second",
			float64(searcher1.NumTokenIndexAdded())/t/(1000000))
	}
	//os.RemoveAll(*persistent_storage_folder)
}
// recordResponseLock guards the per-query result counts collected by the
// search goroutines
type recordResponseLock struct {
	sync.RWMutex
	count map[string]int
}
// search repeatedly issues every query and records, once per query, how many
// documents were matched
func search(ch chan bool, record *recordResponseLock) {
	for i := 0; i < numRepeatQuery; i++ {
		for _, query := range searchQueries {
			output := searcher.Search(types.SearchRequest{Text: query})
			// Check under the read lock first and only take the write lock the
			// first time a query's result count is stored
			record.RLock()
			if _, found := record.count[query]; !found {
				record.RUnlock()
				record.Lock()
				record.count[query] = len(output.Docs)
				record.Unlock()
			} else {
				record.RUnlock()
			}
		}
	}
	ch <- true
}