indexer_test.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. package core
  2. import (
  3. "testing"
  4. "github.com/huichen/wukong/engine"
  5. "github.com/huichen/wukong/types"
  6. "github.com/huichen/wukong/utils"
  7. )
  8. func TestAddKeywords(t *testing.T) {
  9. var indexer Indexer
  10. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  11. indexer.AddDocument(&types.DocumentIndex{
  12. DocId: 1,
  13. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  14. })
  15. indexer.AddDocument(&types.DocumentIndex{
  16. DocId: 7,
  17. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  18. })
  19. indexer.AddDocument(&types.DocumentIndex{
  20. DocId: 2,
  21. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  22. })
  23. indexer.AddDocument(&types.DocumentIndex{
  24. DocId: 3,
  25. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  26. })
  27. indexer.AddDocument(&types.DocumentIndex{
  28. DocId: 1,
  29. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  30. })
  31. indexer.AddDocument(&types.DocumentIndex{
  32. DocId: 1,
  33. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  34. })
  35. indexer.AddDocument(&types.DocumentIndex{
  36. DocId: 2,
  37. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  38. })
  39. indexer.AddDocument(&types.DocumentIndex{
  40. DocId: 0,
  41. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  42. })
  43. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  44. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  45. }
  46. func TestLookup(t *testing.T) {
  47. var indexer Indexer
  48. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  49. // doc0 = "token2 token3"
  50. indexer.AddDocument(&types.DocumentIndex{
  51. DocId: 0,
  52. Keywords: []types.KeywordIndex{
  53. {"token2", 0, []int{0}},
  54. {"token3", 0, []int{7}},
  55. },
  56. })
  57. // doc1 = "token1 token2 token3"
  58. indexer.AddDocument(&types.DocumentIndex{
  59. DocId: 1,
  60. Keywords: []types.KeywordIndex{
  61. {"token1", 0, []int{0}},
  62. {"token2", 0, []int{7}},
  63. {"token3", 0, []int{14}},
  64. },
  65. })
  66. // doc2 = "token1 token2"
  67. indexer.AddDocument(&types.DocumentIndex{
  68. DocId: 2,
  69. Keywords: []types.KeywordIndex{
  70. {"token1", 0, []int{0}},
  71. {"token2", 0, []int{7}},
  72. },
  73. })
  74. // doc3 = "token2"
  75. indexer.AddDocument(&types.DocumentIndex{
  76. DocId: 3,
  77. Keywords: []types.KeywordIndex{
  78. {"token2", 0, []int{0}},
  79. },
  80. })
  81. // doc7 = "token1 token3"
  82. indexer.AddDocument(&types.DocumentIndex{
  83. DocId: 7,
  84. Keywords: []types.KeywordIndex{
  85. {"token1", 0, []int{0}},
  86. {"token3", 0, []int{7}},
  87. },
  88. })
  89. // doc9 = "token3"
  90. indexer.AddDocument(&types.DocumentIndex{
  91. DocId: 9,
  92. Keywords: []types.KeywordIndex{
  93. {"token3", 0, []int{0}},
  94. },
  95. })
  96. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  97. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  98. utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
  99. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
  100. utils.Expect(t, "[7 0 [0]] [2 0 [0]] [1 0 [0]] ",
  101. indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
  102. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
  103. utils.Expect(t, "[2 1 [0 7]] [1 1 [0 7]] ",
  104. indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
  105. utils.Expect(t, "[2 13 [7 0]] [1 13 [7 0]] ",
  106. indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
  107. utils.Expect(t, "[7 1 [0 7]] [1 8 [0 14]] ",
  108. indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
  109. utils.Expect(t, "[7 13 [7 0]] [1 20 [14 0]] ",
  110. indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
  111. utils.Expect(t, "[1 1 [7 14]] [0 1 [0 7]] ",
  112. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
  113. utils.Expect(t, "[1 13 [14 7]] [0 13 [7 0]] ",
  114. indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
  115. utils.Expect(t, "[1 2 [0 7 14]] ",
  116. indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
  117. utils.Expect(t, "[1 26 [14 7 0]] ",
  118. indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
  119. }
  120. func TestDocIdsIndex(t *testing.T) {
  121. var indexer Indexer
  122. indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
  123. // doc0 = "token2 token3"
  124. indexer.AddDocument(&types.DocumentIndex{
  125. DocId: 0,
  126. Keywords: []types.KeywordIndex{
  127. {"token2", 0, []int{0}},
  128. {"token3", 0, []int{7}},
  129. },
  130. })
  131. // doc1 = "token1 token2 token3"
  132. indexer.AddDocument(&types.DocumentIndex{
  133. DocId: 1,
  134. Keywords: []types.KeywordIndex{
  135. {"token1", 0, []int{0}},
  136. {"token2", 0, []int{7}},
  137. {"token3", 0, []int{14}},
  138. },
  139. })
  140. // doc2 = "token1 token2"
  141. indexer.AddDocument(&types.DocumentIndex{
  142. DocId: 2,
  143. Keywords: []types.KeywordIndex{
  144. {"token1", 0, []int{0}},
  145. {"token2", 0, []int{7}},
  146. },
  147. })
  148. // doc3 = "token2"
  149. indexer.AddDocument(&types.DocumentIndex{
  150. DocId: 3,
  151. Keywords: []types.KeywordIndex{
  152. {"token2", 0, []int{0}},
  153. },
  154. })
  155. // doc7 = "token1 token3"
  156. indexer.AddDocument(&types.DocumentIndex{
  157. DocId: 7,
  158. Keywords: []types.KeywordIndex{
  159. {"token1", 0, []int{0}},
  160. {"token3", 0, []int{7}},
  161. },
  162. })
  163. // doc9 = "token3"
  164. indexer.AddDocument(&types.DocumentIndex{
  165. DocId: 9,
  166. Keywords: []types.KeywordIndex{
  167. {"token3", 0, []int{0}},
  168. },
  169. })
  170. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  171. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  172. utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
  173. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
  174. utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ",
  175. indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
  176. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
  177. utils.Expect(t, "[2 0 []] [1 0 []] ",
  178. indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
  179. utils.Expect(t, "[2 0 []] [1 0 []] ",
  180. indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
  181. utils.Expect(t, "[7 0 []] [1 0 []] ",
  182. indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
  183. utils.Expect(t, "[7 0 []] [1 0 []] ",
  184. indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
  185. utils.Expect(t, "[1 0 []] [0 0 []] ",
  186. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
  187. utils.Expect(t, "[1 0 []] [0 0 []] ",
  188. indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
  189. utils.Expect(t, "[1 0 []] ",
  190. indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
  191. utils.Expect(t, "[1 0 []] ",
  192. indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
  193. }
  194. func TestLookupWithProximity(t *testing.T) {
  195. var indexer Indexer
  196. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  197. // doc0 = "token2 token4 token4 token2 token3 token4"
  198. indexer.AddDocument(&types.DocumentIndex{
  199. DocId: 0,
  200. Keywords: []types.KeywordIndex{
  201. {"token2", 0, []int{0, 21}},
  202. {"token3", 0, []int{28}},
  203. {"token4", 0, []int{7, 14, 35}},
  204. },
  205. })
  206. utils.Expect(t, "[0 1 [21 28]] ",
  207. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
  208. // doc0 = "t2 t1 . . . t2 t3"
  209. indexer.AddDocument(&types.DocumentIndex{
  210. DocId: 0,
  211. Keywords: []types.KeywordIndex{
  212. {"t1", 0, []int{3}},
  213. {"t2", 0, []int{0, 12}},
  214. {"t3", 0, []int{15}},
  215. },
  216. })
  217. utils.Expect(t, "[0 8 [3 12 15]] ",
  218. indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
  219. // doc0 = "t3 t2 t1 . . . . . t2 t3"
  220. indexer.AddDocument(&types.DocumentIndex{
  221. DocId: 0,
  222. Keywords: []types.KeywordIndex{
  223. {"t1", 0, []int{6}},
  224. {"t2", 0, []int{3, 19}},
  225. {"t3", 0, []int{0, 22}},
  226. },
  227. })
  228. utils.Expect(t, "[0 10 [6 3 0]] ",
  229. indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
  230. }
  231. func TestLookupWithPartialLocations(t *testing.T) {
  232. var indexer Indexer
  233. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  234. // doc0 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
  235. indexer.AddDocument(&types.DocumentIndex{
  236. DocId: 0,
  237. Keywords: []types.KeywordIndex{
  238. {"token2", 0, []int{0, 21}},
  239. {"token3", 0, []int{28}},
  240. {"label1", 0, []int{}},
  241. {"token4", 0, []int{7, 14, 35}},
  242. },
  243. })
  244. // doc1 = "token2 token4 token4 token2 token3 token4"
  245. indexer.AddDocument(&types.DocumentIndex{
  246. DocId: 1,
  247. Keywords: []types.KeywordIndex{
  248. {"token2", 0, []int{0, 21}},
  249. {"token3", 0, []int{28}},
  250. {"token4", 0, []int{7, 14, 35}},
  251. },
  252. })
  253. utils.Expect(t, "0 ", indicesToString(&indexer, "label1"))
  254. utils.Expect(t, "[0 1 [21 28]] ",
  255. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil, false)))
  256. }
  257. func TestLookupWithBM25(t *testing.T) {
  258. var indexer Indexer
  259. indexer.Init(types.IndexerInitOptions{
  260. IndexType: types.FrequenciesIndex,
  261. BM25Parameters: &types.BM25Parameters{
  262. K1: 1,
  263. B: 1,
  264. },
  265. })
  266. // doc0 = "token2 token4 token4 token2 token3 token4"
  267. indexer.AddDocument(&types.DocumentIndex{
  268. DocId: 0,
  269. TokenLength: 6,
  270. Keywords: []types.KeywordIndex{
  271. {"token2", 3, []int{0, 21}},
  272. {"token3", 7, []int{28}},
  273. {"token4", 15, []int{7, 14, 35}},
  274. },
  275. })
  276. // doc0 = "token6 token7"
  277. indexer.AddDocument(&types.DocumentIndex{
  278. DocId: 1,
  279. TokenLength: 2,
  280. Keywords: []types.KeywordIndex{
  281. {"token6", 3, []int{0}},
  282. {"token7", 15, []int{7}},
  283. },
  284. })
  285. outputs, _ := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil, false)
  286. // BM25 = log2(3) * (12/9 + 28/17 + 60/33) = 6.3433
  287. utils.Expect(t, "76055", int(outputs[0].BM25*10000))
  288. }
  289. func TestLookupWithinDocIds(t *testing.T) {
  290. var indexer Indexer
  291. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  292. // doc0 = "token2 token3"
  293. indexer.AddDocument(&types.DocumentIndex{
  294. DocId: 0,
  295. Keywords: []types.KeywordIndex{
  296. {"token2", 0, []int{0}},
  297. {"token3", 0, []int{7}},
  298. },
  299. })
  300. // doc1 = "token1 token2 token3"
  301. indexer.AddDocument(&types.DocumentIndex{
  302. DocId: 1,
  303. Keywords: []types.KeywordIndex{
  304. {"token1", 0, []int{0}},
  305. {"token2", 0, []int{7}},
  306. {"token3", 0, []int{14}},
  307. },
  308. })
  309. // doc2 = "token1 token2"
  310. indexer.AddDocument(&types.DocumentIndex{
  311. DocId: 2,
  312. Keywords: []types.KeywordIndex{
  313. {"token1", 0, []int{0}},
  314. {"token2", 0, []int{7}},
  315. },
  316. })
  317. // doc3 = "token2"
  318. indexer.AddDocument(&types.DocumentIndex{
  319. DocId: 3,
  320. Keywords: []types.KeywordIndex{
  321. {"token2", 0, []int{0}},
  322. },
  323. })
  324. docIds := make(map[uint64]bool)
  325. docIds[0] = true
  326. docIds[2] = true
  327. utils.Expect(t, "[2 0 [7]] [0 0 [0]] ",
  328. indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false)))
  329. }
  330. func TestLookupWithLocations(t *testing.T) {
  331. var indexer Indexer
  332. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  333. // doc0 = "token2 token4 token4 token2 token3 token4"
  334. indexer.AddDocument(&types.DocumentIndex{
  335. DocId: 0,
  336. Keywords: []types.KeywordIndex{
  337. {"token2", 0, []int{0, 21}},
  338. {"token3", 0, []int{28}},
  339. {"token4", 0, []int{7, 14, 35}},
  340. },
  341. })
  342. docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
  343. utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
  344. }
  345. func TestLookupWithLocations1(t *testing.T) {
  346. type Data struct {
  347. Id int
  348. Content string
  349. Labels []string
  350. }
  351. datas := make([]Data, 0)
  352. data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
  353. datas = append(datas, data0)
  354. data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
  355. datas = append(datas, data1)
  356. data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
  357. datas = append(datas, data2)
  358. data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
  359. datas = append(datas, data3)
  360. data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
  361. datas = append(datas, data4)
  362. // 初始化
  363. searcher_locations := engine.Engine{}
  364. searcher_locations.Init(types.EngineInitOptions{
  365. SegmenterDictionaries: "../data/dictionary.txt",
  366. IndexerInitOptions: &types.IndexerInitOptions{
  367. IndexType: types.LocationsIndex,
  368. },
  369. })
  370. defer searcher_locations.Close()
  371. for _, data := range datas {
  372. searcher_locations.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
  373. }
  374. searcher_locations.FlushIndex()
  375. res_locations := searcher_locations.Search(types.SearchRequest{Text: "百度"})
  376. searcher_docids := engine.Engine{}
  377. searcher_docids.Init(types.EngineInitOptions{
  378. SegmenterDictionaries: "../data/dictionary.txt",
  379. IndexerInitOptions: &types.IndexerInitOptions{
  380. IndexType: types.DocIdsIndex,
  381. },
  382. })
  383. defer searcher_docids.Close()
  384. for _, data := range datas {
  385. searcher_docids.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels})
  386. }
  387. searcher_docids.FlushIndex()
  388. res_docids := searcher_docids.Search(types.SearchRequest{Text: "百度"})
  389. if res_docids.NumDocs != res_locations.NumDocs {
  390. t.Errorf("期待的搜索结果个数=\"%d\", 实际=\"%d\"", res_docids.NumDocs, res_locations.NumDocs)
  391. }
  392. }