indexer_test.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. package core
  2. import (
  3. "github.com/huichen/wukong/types"
  4. "github.com/huichen/wukong/utils"
  5. "testing"
  6. )
  7. func TestAddKeywords(t *testing.T) {
  8. var indexer Indexer
  9. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  10. indexer.AddDocument(&types.DocumentIndex{
  11. DocId: 1,
  12. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  13. })
  14. indexer.AddDocument(&types.DocumentIndex{
  15. DocId: 7,
  16. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  17. })
  18. indexer.AddDocument(&types.DocumentIndex{
  19. DocId: 2,
  20. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  21. })
  22. indexer.AddDocument(&types.DocumentIndex{
  23. DocId: 3,
  24. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  25. })
  26. indexer.AddDocument(&types.DocumentIndex{
  27. DocId: 1,
  28. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  29. })
  30. indexer.AddDocument(&types.DocumentIndex{
  31. DocId: 1,
  32. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  33. })
  34. indexer.AddDocument(&types.DocumentIndex{
  35. DocId: 2,
  36. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  37. })
  38. indexer.AddDocument(&types.DocumentIndex{
  39. DocId: 0,
  40. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  41. })
  42. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  43. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  44. }
  45. func TestLookup(t *testing.T) {
  46. var indexer Indexer
  47. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  48. // doc0 = "token2 token3"
  49. indexer.AddDocument(&types.DocumentIndex{
  50. DocId: 0,
  51. Keywords: []types.KeywordIndex{
  52. {"token2", 0, []int{0}},
  53. {"token3", 0, []int{7}},
  54. },
  55. })
  56. // doc1 = "token1 token2 token3"
  57. indexer.AddDocument(&types.DocumentIndex{
  58. DocId: 1,
  59. Keywords: []types.KeywordIndex{
  60. {"token1", 0, []int{0}},
  61. {"token2", 0, []int{7}},
  62. {"token3", 0, []int{14}},
  63. },
  64. })
  65. // doc2 = "token1 token2"
  66. indexer.AddDocument(&types.DocumentIndex{
  67. DocId: 2,
  68. Keywords: []types.KeywordIndex{
  69. {"token1", 0, []int{0}},
  70. {"token2", 0, []int{7}},
  71. },
  72. })
  73. // doc3 = "token2"
  74. indexer.AddDocument(&types.DocumentIndex{
  75. DocId: 3,
  76. Keywords: []types.KeywordIndex{
  77. {"token2", 0, []int{0}},
  78. },
  79. })
  80. // doc7 = "token1 token3"
  81. indexer.AddDocument(&types.DocumentIndex{
  82. DocId: 7,
  83. Keywords: []types.KeywordIndex{
  84. {"token1", 0, []int{0}},
  85. {"token3", 0, []int{7}},
  86. },
  87. })
  88. // doc9 = "token3"
  89. indexer.AddDocument(&types.DocumentIndex{
  90. DocId: 9,
  91. Keywords: []types.KeywordIndex{
  92. {"token3", 0, []int{0}},
  93. },
  94. })
  95. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  96. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  97. utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
  98. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
  99. utils.Expect(t, "[7 0 [0]] [2 0 [0]] [1 0 [0]] ",
  100. indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
  101. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
  102. utils.Expect(t, "[2 1 [0 7]] [1 1 [0 7]] ",
  103. indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
  104. utils.Expect(t, "[2 13 [7 0]] [1 13 [7 0]] ",
  105. indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
  106. utils.Expect(t, "[7 1 [0 7]] [1 8 [0 14]] ",
  107. indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
  108. utils.Expect(t, "[7 13 [7 0]] [1 20 [14 0]] ",
  109. indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
  110. utils.Expect(t, "[1 1 [7 14]] [0 1 [0 7]] ",
  111. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
  112. utils.Expect(t, "[1 13 [14 7]] [0 13 [7 0]] ",
  113. indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
  114. utils.Expect(t, "[1 2 [0 7 14]] ",
  115. indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
  116. utils.Expect(t, "[1 26 [14 7 0]] ",
  117. indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
  118. }
  119. func TestDocIdsIndex(t *testing.T) {
  120. var indexer Indexer
  121. indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
  122. // doc0 = "token2 token3"
  123. indexer.AddDocument(&types.DocumentIndex{
  124. DocId: 0,
  125. Keywords: []types.KeywordIndex{
  126. {"token2", 0, []int{0}},
  127. {"token3", 0, []int{7}},
  128. },
  129. })
  130. // doc1 = "token1 token2 token3"
  131. indexer.AddDocument(&types.DocumentIndex{
  132. DocId: 1,
  133. Keywords: []types.KeywordIndex{
  134. {"token1", 0, []int{0}},
  135. {"token2", 0, []int{7}},
  136. {"token3", 0, []int{14}},
  137. },
  138. })
  139. // doc2 = "token1 token2"
  140. indexer.AddDocument(&types.DocumentIndex{
  141. DocId: 2,
  142. Keywords: []types.KeywordIndex{
  143. {"token1", 0, []int{0}},
  144. {"token2", 0, []int{7}},
  145. },
  146. })
  147. // doc3 = "token2"
  148. indexer.AddDocument(&types.DocumentIndex{
  149. DocId: 3,
  150. Keywords: []types.KeywordIndex{
  151. {"token2", 0, []int{0}},
  152. },
  153. })
  154. // doc7 = "token1 token3"
  155. indexer.AddDocument(&types.DocumentIndex{
  156. DocId: 7,
  157. Keywords: []types.KeywordIndex{
  158. {"token1", 0, []int{0}},
  159. {"token3", 0, []int{7}},
  160. },
  161. })
  162. // doc9 = "token3"
  163. indexer.AddDocument(&types.DocumentIndex{
  164. DocId: 9,
  165. Keywords: []types.KeywordIndex{
  166. {"token3", 0, []int{0}},
  167. },
  168. })
  169. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  170. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  171. utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
  172. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
  173. utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ",
  174. indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
  175. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
  176. utils.Expect(t, "[2 0 []] [1 0 []] ",
  177. indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
  178. utils.Expect(t, "[2 0 []] [1 0 []] ",
  179. indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
  180. utils.Expect(t, "[7 0 []] [1 0 []] ",
  181. indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
  182. utils.Expect(t, "[7 0 []] [1 0 []] ",
  183. indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
  184. utils.Expect(t, "[1 0 []] [0 0 []] ",
  185. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
  186. utils.Expect(t, "[1 0 []] [0 0 []] ",
  187. indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
  188. utils.Expect(t, "[1 0 []] ",
  189. indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
  190. utils.Expect(t, "[1 0 []] ",
  191. indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
  192. }
  193. func TestLookupWithProximity(t *testing.T) {
  194. var indexer Indexer
  195. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  196. // doc0 = "token2 token4 token4 token2 token3 token4"
  197. indexer.AddDocument(&types.DocumentIndex{
  198. DocId: 0,
  199. Keywords: []types.KeywordIndex{
  200. {"token2", 0, []int{0, 21}},
  201. {"token3", 0, []int{28}},
  202. {"token4", 0, []int{7, 14, 35}},
  203. },
  204. })
  205. utils.Expect(t, "[0 1 [21 28]] ",
  206. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
  207. // doc0 = "t2 t1 . . . t2 t3"
  208. indexer.AddDocument(&types.DocumentIndex{
  209. DocId: 0,
  210. Keywords: []types.KeywordIndex{
  211. {"t1", 0, []int{3}},
  212. {"t2", 0, []int{0, 12}},
  213. {"t3", 0, []int{15}},
  214. },
  215. })
  216. utils.Expect(t, "[0 8 [3 12 15]] ",
  217. indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
  218. // doc0 = "t3 t2 t1 . . . . . t2 t3"
  219. indexer.AddDocument(&types.DocumentIndex{
  220. DocId: 0,
  221. Keywords: []types.KeywordIndex{
  222. {"t1", 0, []int{6}},
  223. {"t2", 0, []int{3, 19}},
  224. {"t3", 0, []int{0, 22}},
  225. },
  226. })
  227. utils.Expect(t, "[0 10 [6 3 0]] ",
  228. indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
  229. }
  230. func TestLookupWithPartialLocations(t *testing.T) {
  231. var indexer Indexer
  232. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  233. // doc0 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
  234. indexer.AddDocument(&types.DocumentIndex{
  235. DocId: 0,
  236. Keywords: []types.KeywordIndex{
  237. {"token2", 0, []int{0, 21}},
  238. {"token3", 0, []int{28}},
  239. {"label1", 0, []int{}},
  240. {"token4", 0, []int{7, 14, 35}},
  241. },
  242. })
  243. // doc1 = "token2 token4 token4 token2 token3 token4"
  244. indexer.AddDocument(&types.DocumentIndex{
  245. DocId: 1,
  246. Keywords: []types.KeywordIndex{
  247. {"token2", 0, []int{0, 21}},
  248. {"token3", 0, []int{28}},
  249. {"token4", 0, []int{7, 14, 35}},
  250. },
  251. })
  252. utils.Expect(t, "0 ", indicesToString(&indexer, "label1"))
  253. utils.Expect(t, "[0 1 [21 28]] ",
  254. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil, false)))
  255. }
  256. func TestLookupWithBM25(t *testing.T) {
  257. var indexer Indexer
  258. indexer.Init(types.IndexerInitOptions{
  259. IndexType: types.FrequenciesIndex,
  260. BM25Parameters: &types.BM25Parameters{
  261. K1: 1,
  262. B: 1,
  263. },
  264. })
  265. // doc0 = "token2 token4 token4 token2 token3 token4"
  266. indexer.AddDocument(&types.DocumentIndex{
  267. DocId: 0,
  268. TokenLength: 6,
  269. Keywords: []types.KeywordIndex{
  270. {"token2", 3, []int{0, 21}},
  271. {"token3", 7, []int{28}},
  272. {"token4", 15, []int{7, 14, 35}},
  273. },
  274. })
  275. // doc0 = "token6 token7"
  276. indexer.AddDocument(&types.DocumentIndex{
  277. DocId: 1,
  278. TokenLength: 2,
  279. Keywords: []types.KeywordIndex{
  280. {"token6", 3, []int{0}},
  281. {"token7", 15, []int{7}},
  282. },
  283. })
  284. outputs, _ := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil, false)
  285. // BM25 = log2(3) * (12/9 + 28/17 + 60/33) = 6.3433
  286. utils.Expect(t, "76055", int(outputs[0].BM25*10000))
  287. }
  288. func TestLookupWithinDocIds(t *testing.T) {
  289. var indexer Indexer
  290. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  291. // doc0 = "token2 token3"
  292. indexer.AddDocument(&types.DocumentIndex{
  293. DocId: 0,
  294. Keywords: []types.KeywordIndex{
  295. {"token2", 0, []int{0}},
  296. {"token3", 0, []int{7}},
  297. },
  298. })
  299. // doc1 = "token1 token2 token3"
  300. indexer.AddDocument(&types.DocumentIndex{
  301. DocId: 1,
  302. Keywords: []types.KeywordIndex{
  303. {"token1", 0, []int{0}},
  304. {"token2", 0, []int{7}},
  305. {"token3", 0, []int{14}},
  306. },
  307. })
  308. // doc2 = "token1 token2"
  309. indexer.AddDocument(&types.DocumentIndex{
  310. DocId: 2,
  311. Keywords: []types.KeywordIndex{
  312. {"token1", 0, []int{0}},
  313. {"token2", 0, []int{7}},
  314. },
  315. })
  316. // doc3 = "token2"
  317. indexer.AddDocument(&types.DocumentIndex{
  318. DocId: 3,
  319. Keywords: []types.KeywordIndex{
  320. {"token2", 0, []int{0}},
  321. },
  322. })
  323. docIds := make(map[uint64]bool)
  324. docIds[0] = true
  325. docIds[2] = true
  326. utils.Expect(t, "[2 0 [7]] [0 0 [0]] ",
  327. indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false)))
  328. }
  329. func TestLookupWithLocations(t *testing.T) {
  330. var indexer Indexer
  331. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  332. // doc0 = "token2 token4 token4 token2 token3 token4"
  333. indexer.AddDocument(&types.DocumentIndex{
  334. DocId: 0,
  335. Keywords: []types.KeywordIndex{
  336. {"token2", 0, []int{0, 21}},
  337. {"token3", 0, []int{28}},
  338. {"token4", 0, []int{7, 14, 35}},
  339. },
  340. })
  341. docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
  342. utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
  343. }