engine_test.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. package engine
  2. import (
  3. "encoding/gob"
  4. "github.com/huichen/wukong/types"
  5. "github.com/huichen/wukong/utils"
  6. "os"
  7. "reflect"
  8. "testing"
  9. )
  10. type ScoringFields struct {
  11. A, B, C float32
  12. }
  13. func AddDocs(engine *Engine) {
  14. docId := uint64(1)
  15. // 因为需要保证文档全部被加入到索引中,所以 forceUpdate 全部设置成 true
  16. engine.IndexDocument(docId, types.DocumentIndexData{
  17. Content: "中国有十三亿人口人口",
  18. Fields: ScoringFields{1, 2, 3},
  19. }, true)
  20. docId++
  21. engine.IndexDocument(docId, types.DocumentIndexData{
  22. Content: "中国人口",
  23. Fields: nil,
  24. }, true)
  25. docId++
  26. engine.IndexDocument(docId, types.DocumentIndexData{
  27. Content: "有人口",
  28. Fields: ScoringFields{2, 3, 1},
  29. }, true)
  30. docId++
  31. engine.IndexDocument(docId, types.DocumentIndexData{
  32. Content: "有十三亿人口",
  33. Fields: ScoringFields{2, 3, 3},
  34. }, true)
  35. docId++
  36. engine.IndexDocument(docId, types.DocumentIndexData{
  37. Content: "中国十三亿人口",
  38. Fields: ScoringFields{0, 9, 1},
  39. }, true)
  40. engine.FlushIndex()
  41. }
  42. type RankByTokenProximity struct {
  43. }
  44. func (rule RankByTokenProximity) Score(
  45. doc types.IndexedDocument, fields interface{}) []float32 {
  46. if doc.TokenProximity < 0 {
  47. return []float32{}
  48. }
  49. return []float32{1.0 / (float32(doc.TokenProximity) + 1)}
  50. }
  51. func TestEngineIndexDocument(t *testing.T) {
  52. var engine Engine
  53. engine.Init(types.EngineInitOptions{
  54. SegmenterDictionaries: "../testdata/test_dict.txt",
  55. DefaultRankOptions: &types.RankOptions{
  56. OutputOffset: 0,
  57. MaxOutputs: 10,
  58. ScoringCriteria: &RankByTokenProximity{},
  59. },
  60. IndexerInitOptions: &types.IndexerInitOptions{
  61. IndexType: types.LocationsIndex,
  62. },
  63. })
  64. AddDocs(&engine)
  65. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  66. utils.Expect(t, "2", len(outputs.Tokens))
  67. utils.Expect(t, "中国", outputs.Tokens[0])
  68. utils.Expect(t, "人口", outputs.Tokens[1])
  69. utils.Expect(t, "3", len(outputs.Docs))
  70. utils.Expect(t, "2", outputs.Docs[0].DocId)
  71. utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
  72. utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
  73. utils.Expect(t, "5", outputs.Docs[1].DocId)
  74. utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
  75. utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
  76. utils.Expect(t, "1", outputs.Docs[2].DocId)
  77. utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
  78. utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
  79. }
  80. func TestReverseOrder(t *testing.T) {
  81. var engine Engine
  82. engine.Init(types.EngineInitOptions{
  83. SegmenterDictionaries: "../testdata/test_dict.txt",
  84. DefaultRankOptions: &types.RankOptions{
  85. ReverseOrder: true,
  86. OutputOffset: 0,
  87. MaxOutputs: 10,
  88. ScoringCriteria: &RankByTokenProximity{},
  89. },
  90. IndexerInitOptions: &types.IndexerInitOptions{
  91. IndexType: types.LocationsIndex,
  92. },
  93. })
  94. AddDocs(&engine)
  95. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  96. utils.Expect(t, "3", len(outputs.Docs))
  97. utils.Expect(t, "1", outputs.Docs[0].DocId)
  98. utils.Expect(t, "5", outputs.Docs[1].DocId)
  99. utils.Expect(t, "2", outputs.Docs[2].DocId)
  100. }
  101. func TestOffsetAndMaxOutputs(t *testing.T) {
  102. var engine Engine
  103. engine.Init(types.EngineInitOptions{
  104. SegmenterDictionaries: "../testdata/test_dict.txt",
  105. DefaultRankOptions: &types.RankOptions{
  106. ReverseOrder: true,
  107. OutputOffset: 1,
  108. MaxOutputs: 3,
  109. ScoringCriteria: &RankByTokenProximity{},
  110. },
  111. IndexerInitOptions: &types.IndexerInitOptions{
  112. IndexType: types.LocationsIndex,
  113. },
  114. })
  115. AddDocs(&engine)
  116. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  117. utils.Expect(t, "2", len(outputs.Docs))
  118. utils.Expect(t, "5", outputs.Docs[0].DocId)
  119. utils.Expect(t, "2", outputs.Docs[1].DocId)
  120. }
  121. type TestScoringCriteria struct {
  122. }
  123. func (criteria TestScoringCriteria) Score(
  124. doc types.IndexedDocument, fields interface{}) []float32 {
  125. if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
  126. return []float32{}
  127. }
  128. fs := fields.(ScoringFields)
  129. return []float32{float32(doc.TokenProximity)*fs.A + fs.B*fs.C}
  130. }
  131. func TestSearchWithCriteria(t *testing.T) {
  132. var engine Engine
  133. engine.Init(types.EngineInitOptions{
  134. SegmenterDictionaries: "../testdata/test_dict.txt",
  135. DefaultRankOptions: &types.RankOptions{
  136. ScoringCriteria: TestScoringCriteria{},
  137. },
  138. IndexerInitOptions: &types.IndexerInitOptions{
  139. IndexType: types.LocationsIndex,
  140. },
  141. })
  142. AddDocs(&engine)
  143. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  144. utils.Expect(t, "2", len(outputs.Docs))
  145. utils.Expect(t, "1", outputs.Docs[0].DocId)
  146. utils.Expect(t, "18000", int(outputs.Docs[0].Scores[0]*1000))
  147. utils.Expect(t, "5", outputs.Docs[1].DocId)
  148. utils.Expect(t, "9000", int(outputs.Docs[1].Scores[0]*1000))
  149. }
  150. func TestCompactIndex(t *testing.T) {
  151. var engine Engine
  152. engine.Init(types.EngineInitOptions{
  153. SegmenterDictionaries: "../testdata/test_dict.txt",
  154. DefaultRankOptions: &types.RankOptions{
  155. ScoringCriteria: TestScoringCriteria{},
  156. },
  157. })
  158. AddDocs(&engine)
  159. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  160. utils.Expect(t, "2", len(outputs.Docs))
  161. utils.Expect(t, "5", outputs.Docs[0].DocId)
  162. utils.Expect(t, "9000", int(outputs.Docs[0].Scores[0]*1000))
  163. utils.Expect(t, "1", outputs.Docs[1].DocId)
  164. utils.Expect(t, "6000", int(outputs.Docs[1].Scores[0]*1000))
  165. }
  166. type BM25ScoringCriteria struct {
  167. }
  168. func (criteria BM25ScoringCriteria) Score(
  169. doc types.IndexedDocument, fields interface{}) []float32 {
  170. if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
  171. return []float32{}
  172. }
  173. return []float32{doc.BM25}
  174. }
  175. func TestFrequenciesIndex(t *testing.T) {
  176. var engine Engine
  177. engine.Init(types.EngineInitOptions{
  178. SegmenterDictionaries: "../testdata/test_dict.txt",
  179. DefaultRankOptions: &types.RankOptions{
  180. ScoringCriteria: BM25ScoringCriteria{},
  181. },
  182. IndexerInitOptions: &types.IndexerInitOptions{
  183. IndexType: types.FrequenciesIndex,
  184. },
  185. })
  186. AddDocs(&engine)
  187. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  188. utils.Expect(t, "2", len(outputs.Docs))
  189. utils.Expect(t, "5", outputs.Docs[0].DocId)
  190. utils.Expect(t, "2349", int(outputs.Docs[0].Scores[0]*1000))
  191. utils.Expect(t, "1", outputs.Docs[1].DocId)
  192. utils.Expect(t, "2320", int(outputs.Docs[1].Scores[0]*1000))
  193. }
  194. func TestRemoveDocument(t *testing.T) {
  195. var engine Engine
  196. engine.Init(types.EngineInitOptions{
  197. SegmenterDictionaries: "../testdata/test_dict.txt",
  198. DefaultRankOptions: &types.RankOptions{
  199. ScoringCriteria: TestScoringCriteria{},
  200. },
  201. })
  202. AddDocs(&engine)
  203. engine.RemoveDocument(5, true)
  204. engine.FlushIndex()
  205. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  206. utils.Expect(t, "1", len(outputs.Docs))
  207. utils.Expect(t, "1", outputs.Docs[0].DocId)
  208. utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
  209. }
  210. func TestEngineIndexDocumentWithTokens(t *testing.T) {
  211. var engine Engine
  212. engine.Init(types.EngineInitOptions{
  213. SegmenterDictionaries: "../testdata/test_dict.txt",
  214. DefaultRankOptions: &types.RankOptions{
  215. OutputOffset: 0,
  216. MaxOutputs: 10,
  217. ScoringCriteria: &RankByTokenProximity{},
  218. },
  219. IndexerInitOptions: &types.IndexerInitOptions{
  220. IndexType: types.LocationsIndex,
  221. },
  222. })
  223. docId := uint64(1)
  224. engine.IndexDocument(docId, types.DocumentIndexData{
  225. Content: "",
  226. Tokens: []types.TokenData{
  227. {"中国", []int{0}},
  228. {"人口", []int{18, 24}},
  229. },
  230. Fields: ScoringFields{1, 2, 3},
  231. }, true)
  232. docId++
  233. engine.IndexDocument(docId, types.DocumentIndexData{
  234. Content: "",
  235. Tokens: []types.TokenData{
  236. {"中国", []int{0}},
  237. {"人口", []int{6}},
  238. },
  239. Fields: ScoringFields{1, 2, 3},
  240. }, true)
  241. docId++
  242. engine.IndexDocument(docId, types.DocumentIndexData{
  243. Content: "中国十三亿人口",
  244. Fields: ScoringFields{0, 9, 1},
  245. }, true)
  246. engine.FlushIndex()
  247. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  248. utils.Expect(t, "2", len(outputs.Tokens))
  249. utils.Expect(t, "中国", outputs.Tokens[0])
  250. utils.Expect(t, "人口", outputs.Tokens[1])
  251. utils.Expect(t, "3", len(outputs.Docs))
  252. utils.Expect(t, "2", outputs.Docs[0].DocId)
  253. utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
  254. utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
  255. utils.Expect(t, "3", outputs.Docs[1].DocId)
  256. utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
  257. utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
  258. utils.Expect(t, "1", outputs.Docs[2].DocId)
  259. utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
  260. utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
  261. }
  262. func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) {
  263. gob.Register(ScoringFields{})
  264. var engine Engine
  265. engine.Init(types.EngineInitOptions{
  266. SegmenterDictionaries: "../testdata/test_dict.txt",
  267. DefaultRankOptions: &types.RankOptions{
  268. OutputOffset: 0,
  269. MaxOutputs: 10,
  270. ScoringCriteria: &RankByTokenProximity{},
  271. },
  272. IndexerInitOptions: &types.IndexerInitOptions{
  273. IndexType: types.LocationsIndex,
  274. },
  275. UsePersistentStorage: true,
  276. PersistentStorageFolder: "wukong.persistent",
  277. PersistentStorageShards: 2,
  278. })
  279. AddDocs(&engine)
  280. engine.RemoveDocument(5, true)
  281. engine.Close()
  282. var engine1 Engine
  283. engine1.Init(types.EngineInitOptions{
  284. SegmenterDictionaries: "../testdata/test_dict.txt",
  285. DefaultRankOptions: &types.RankOptions{
  286. OutputOffset: 0,
  287. MaxOutputs: 10,
  288. ScoringCriteria: &RankByTokenProximity{},
  289. },
  290. IndexerInitOptions: &types.IndexerInitOptions{
  291. IndexType: types.LocationsIndex,
  292. },
  293. UsePersistentStorage: true,
  294. PersistentStorageFolder: "wukong.persistent",
  295. PersistentStorageShards: 2,
  296. })
  297. engine1.FlushIndex()
  298. outputs := engine1.Search(types.SearchRequest{Text: "中国人口"})
  299. utils.Expect(t, "2", len(outputs.Tokens))
  300. utils.Expect(t, "中国", outputs.Tokens[0])
  301. utils.Expect(t, "人口", outputs.Tokens[1])
  302. utils.Expect(t, "2", len(outputs.Docs))
  303. utils.Expect(t, "2", outputs.Docs[0].DocId)
  304. utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
  305. utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
  306. utils.Expect(t, "1", outputs.Docs[1].DocId)
  307. utils.Expect(t, "76", int(outputs.Docs[1].Scores[0]*1000))
  308. utils.Expect(t, "[0 18]", outputs.Docs[1].TokenSnippetLocations)
  309. engine1.Close()
  310. os.RemoveAll("wukong.persistent")
  311. }
  312. func TestCountDocsOnly(t *testing.T) {
  313. var engine Engine
  314. engine.Init(types.EngineInitOptions{
  315. SegmenterDictionaries: "../testdata/test_dict.txt",
  316. DefaultRankOptions: &types.RankOptions{
  317. ReverseOrder: true,
  318. OutputOffset: 0,
  319. MaxOutputs: 1,
  320. ScoringCriteria: &RankByTokenProximity{},
  321. },
  322. IndexerInitOptions: &types.IndexerInitOptions{
  323. IndexType: types.LocationsIndex,
  324. },
  325. })
  326. AddDocs(&engine)
  327. engine.RemoveDocument(5, true)
  328. engine.FlushIndex()
  329. outputs := engine.Search(types.SearchRequest{Text: "中国人口", CountDocsOnly: true})
  330. utils.Expect(t, "0", len(outputs.Docs))
  331. utils.Expect(t, "2", len(outputs.Tokens))
  332. utils.Expect(t, "2", outputs.NumDocs)
  333. }
  334. func TestSearchWithin(t *testing.T) {
  335. var engine Engine
  336. engine.Init(types.EngineInitOptions{
  337. SegmenterDictionaries: "../testdata/test_dict.txt",
  338. DefaultRankOptions: &types.RankOptions{
  339. ReverseOrder: true,
  340. OutputOffset: 0,
  341. MaxOutputs: 10,
  342. ScoringCriteria: &RankByTokenProximity{},
  343. },
  344. IndexerInitOptions: &types.IndexerInitOptions{
  345. IndexType: types.LocationsIndex,
  346. },
  347. })
  348. AddDocs(&engine)
  349. docIds := make(map[uint64]bool)
  350. docIds[5] = true
  351. docIds[1] = true
  352. outputs := engine.Search(types.SearchRequest{
  353. Text: "中国人口",
  354. DocIds: docIds,
  355. })
  356. utils.Expect(t, "2", len(outputs.Tokens))
  357. utils.Expect(t, "中国", outputs.Tokens[0])
  358. utils.Expect(t, "人口", outputs.Tokens[1])
  359. utils.Expect(t, "2", len(outputs.Docs))
  360. utils.Expect(t, "1", outputs.Docs[0].DocId)
  361. utils.Expect(t, "76", int(outputs.Docs[0].Scores[0]*1000))
  362. utils.Expect(t, "[0 18]", outputs.Docs[0].TokenSnippetLocations)
  363. utils.Expect(t, "5", outputs.Docs[1].DocId)
  364. utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
  365. utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
  366. }
  367. func TestLookupWithLocations1(t *testing.T) {
  368. type Data struct {
  369. Id int
  370. Content string
  371. Labels []string
  372. }
  373. datas := make([]Data, 0)
  374. data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
  375. datas = append(datas, data0)
  376. data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
  377. datas = append(datas, data1)
  378. data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
  379. datas = append(datas, data2)
  380. data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
  381. datas = append(datas, data3)
  382. data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
  383. datas = append(datas, data4)
  384. // 初始化
  385. searcher_locations := Engine{}
  386. searcher_locations.Init(types.EngineInitOptions{
  387. SegmenterDictionaries: "../data/dictionary.txt",
  388. IndexerInitOptions: &types.IndexerInitOptions{
  389. IndexType: types.LocationsIndex,
  390. },
  391. })
  392. defer searcher_locations.Close()
  393. for _, data := range datas {
  394. searcher_locations.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels}, true)
  395. }
  396. searcher_locations.FlushIndex()
  397. res_locations := searcher_locations.Search(types.SearchRequest{Text: "百度"})
  398. searcher_docids := Engine{}
  399. searcher_docids.Init(types.EngineInitOptions{
  400. SegmenterDictionaries: "../data/dictionary.txt",
  401. IndexerInitOptions: &types.IndexerInitOptions{
  402. IndexType: types.DocIdsIndex,
  403. },
  404. })
  405. defer searcher_docids.Close()
  406. for _, data := range datas {
  407. searcher_docids.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels}, true)
  408. }
  409. searcher_docids.FlushIndex()
  410. res_docids := searcher_docids.Search(types.SearchRequest{Text: "百度"})
  411. if res_docids.NumDocs != res_locations.NumDocs {
  412. t.Errorf("期待的搜索结果个数=\"%d\", 实际=\"%d\"", res_docids.NumDocs, res_locations.NumDocs)
  413. }
  414. }