indexer_test.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. package core
  2. import (
  3. "github.com/huichen/wukong/types"
  4. "github.com/huichen/wukong/utils"
  5. "testing"
  6. )
  7. func TestAddKeywords(t *testing.T) {
  8. var indexer Indexer
  9. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  10. indexer.AddDocument(&types.DocumentIndex{
  11. DocId: 1,
  12. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  13. })
  14. indexer.AddDocument(&types.DocumentIndex{
  15. DocId: 7,
  16. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  17. })
  18. indexer.AddDocument(&types.DocumentIndex{
  19. DocId: 2,
  20. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  21. })
  22. indexer.AddDocument(&types.DocumentIndex{
  23. DocId: 3,
  24. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  25. })
  26. indexer.AddDocument(&types.DocumentIndex{
  27. DocId: 1,
  28. Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
  29. })
  30. indexer.AddDocument(&types.DocumentIndex{
  31. DocId: 1,
  32. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  33. })
  34. indexer.AddDocument(&types.DocumentIndex{
  35. DocId: 2,
  36. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  37. })
  38. indexer.AddDocument(&types.DocumentIndex{
  39. DocId: 0,
  40. Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
  41. })
  42. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  43. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  44. }
  45. func TestLookup(t *testing.T) {
  46. var indexer Indexer
  47. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  48. // doc0 = "token2 token3"
  49. indexer.AddDocument(&types.DocumentIndex{
  50. DocId: 0,
  51. Keywords: []types.KeywordIndex{
  52. {"token2", 0, []int{0}},
  53. {"token3", 0, []int{7}},
  54. },
  55. })
  56. // doc1 = "token1 token2 token3"
  57. indexer.AddDocument(&types.DocumentIndex{
  58. DocId: 1,
  59. Keywords: []types.KeywordIndex{
  60. {"token1", 0, []int{0}},
  61. {"token2", 0, []int{7}},
  62. {"token3", 0, []int{14}},
  63. },
  64. })
  65. // doc2 = "token1 token2"
  66. indexer.AddDocument(&types.DocumentIndex{
  67. DocId: 2,
  68. Keywords: []types.KeywordIndex{
  69. {"token1", 0, []int{0}},
  70. {"token2", 0, []int{7}},
  71. },
  72. })
  73. // doc3 = "token2"
  74. indexer.AddDocument(&types.DocumentIndex{
  75. DocId: 3,
  76. Keywords: []types.KeywordIndex{
  77. {"token2", 0, []int{0}},
  78. },
  79. })
  80. // doc7 = "token1 token3"
  81. indexer.AddDocument(&types.DocumentIndex{
  82. DocId: 7,
  83. Keywords: []types.KeywordIndex{
  84. {"token1", 0, []int{0}},
  85. {"token3", 0, []int{7}},
  86. },
  87. })
  88. // doc9 = "token3"
  89. indexer.AddDocument(&types.DocumentIndex{
  90. DocId: 9,
  91. Keywords: []types.KeywordIndex{
  92. {"token3", 0, []int{0}},
  93. },
  94. })
  95. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  96. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  97. utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
  98. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil)))
  99. utils.Expect(t, "[7 0 [0]] [2 0 [0]] [1 0 [0]] ",
  100. indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil)))
  101. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil)))
  102. utils.Expect(t, "[2 1 [0 7]] [1 1 [0 7]] ",
  103. indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil)))
  104. utils.Expect(t, "[2 13 [7 0]] [1 13 [7 0]] ",
  105. indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil)))
  106. utils.Expect(t, "[7 1 [0 7]] [1 8 [0 14]] ",
  107. indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil)))
  108. utils.Expect(t, "[7 13 [7 0]] [1 20 [14 0]] ",
  109. indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil)))
  110. utils.Expect(t, "[1 1 [7 14]] [0 1 [0 7]] ",
  111. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
  112. utils.Expect(t, "[1 13 [14 7]] [0 13 [7 0]] ",
  113. indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil)))
  114. utils.Expect(t, "[1 2 [0 7 14]] ",
  115. indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil)))
  116. utils.Expect(t, "[1 26 [14 7 0]] ",
  117. indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil)))
  118. }
  119. func TestDocIdsIndex(t *testing.T) {
  120. var indexer Indexer
  121. indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
  122. // doc0 = "token2 token3"
  123. indexer.AddDocument(&types.DocumentIndex{
  124. DocId: 0,
  125. Keywords: []types.KeywordIndex{
  126. {"token2", 0, []int{0}},
  127. {"token3", 0, []int{7}},
  128. },
  129. })
  130. // doc1 = "token1 token2 token3"
  131. indexer.AddDocument(&types.DocumentIndex{
  132. DocId: 1,
  133. Keywords: []types.KeywordIndex{
  134. {"token1", 0, []int{0}},
  135. {"token2", 0, []int{7}},
  136. {"token3", 0, []int{14}},
  137. },
  138. })
  139. // doc2 = "token1 token2"
  140. indexer.AddDocument(&types.DocumentIndex{
  141. DocId: 2,
  142. Keywords: []types.KeywordIndex{
  143. {"token1", 0, []int{0}},
  144. {"token2", 0, []int{7}},
  145. },
  146. })
  147. // doc3 = "token2"
  148. indexer.AddDocument(&types.DocumentIndex{
  149. DocId: 3,
  150. Keywords: []types.KeywordIndex{
  151. {"token2", 0, []int{0}},
  152. },
  153. })
  154. // doc7 = "token1 token3"
  155. indexer.AddDocument(&types.DocumentIndex{
  156. DocId: 7,
  157. Keywords: []types.KeywordIndex{
  158. {"token1", 0, []int{0}},
  159. {"token3", 0, []int{7}},
  160. },
  161. })
  162. // doc9 = "token3"
  163. indexer.AddDocument(&types.DocumentIndex{
  164. DocId: 9,
  165. Keywords: []types.KeywordIndex{
  166. {"token3", 0, []int{0}},
  167. },
  168. })
  169. utils.Expect(t, "1 2 7 ", indicesToString(&indexer, "token1"))
  170. utils.Expect(t, "0 1 2 3 ", indicesToString(&indexer, "token2"))
  171. utils.Expect(t, "0 1 7 9 ", indicesToString(&indexer, "token3"))
  172. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil)))
  173. utils.Expect(t, "[7 0 []] [2 0 []] [1 0 []] ",
  174. indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil)))
  175. utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil)))
  176. utils.Expect(t, "[2 0 []] [1 0 []] ",
  177. indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil)))
  178. utils.Expect(t, "[2 0 []] [1 0 []] ",
  179. indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil)))
  180. utils.Expect(t, "[7 0 []] [1 0 []] ",
  181. indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil)))
  182. utils.Expect(t, "[7 0 []] [1 0 []] ",
  183. indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil)))
  184. utils.Expect(t, "[1 0 []] [0 0 []] ",
  185. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
  186. utils.Expect(t, "[1 0 []] [0 0 []] ",
  187. indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil)))
  188. utils.Expect(t, "[1 0 []] ",
  189. indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil)))
  190. utils.Expect(t, "[1 0 []] ",
  191. indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil)))
  192. }
  193. func TestLookupWithProximity(t *testing.T) {
  194. var indexer Indexer
  195. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  196. // doc0 = "token2 token4 token4 token2 token3 token4"
  197. indexer.AddDocument(&types.DocumentIndex{
  198. DocId: 0,
  199. Keywords: []types.KeywordIndex{
  200. {"token2", 0, []int{0, 21}},
  201. {"token3", 0, []int{28}},
  202. {"token4", 0, []int{7, 14, 35}},
  203. },
  204. })
  205. utils.Expect(t, "[0 1 [21 28]] ",
  206. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)))
  207. }
  208. func TestLookupWithPartialLocations(t *testing.T) {
  209. var indexer Indexer
  210. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  211. // doc0 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
  212. indexer.AddDocument(&types.DocumentIndex{
  213. DocId: 0,
  214. Keywords: []types.KeywordIndex{
  215. {"token2", 0, []int{0, 21}},
  216. {"token3", 0, []int{28}},
  217. {"label1", 0, []int{}},
  218. {"token4", 0, []int{7, 14, 35}},
  219. },
  220. })
  221. // doc1 = "token2 token4 token4 token2 token3 token4"
  222. indexer.AddDocument(&types.DocumentIndex{
  223. DocId: 1,
  224. Keywords: []types.KeywordIndex{
  225. {"token2", 0, []int{0, 21}},
  226. {"token3", 0, []int{28}},
  227. {"token4", 0, []int{7, 14, 35}},
  228. },
  229. })
  230. utils.Expect(t, "0 ", indicesToString(&indexer, "label1"))
  231. utils.Expect(t, "[0 1 [21 28]] ",
  232. indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil)))
  233. }
  234. func TestLookupWithBM25(t *testing.T) {
  235. var indexer Indexer
  236. indexer.Init(types.IndexerInitOptions{
  237. IndexType: types.FrequenciesIndex,
  238. BM25Parameters: &types.BM25Parameters{
  239. K1: 1,
  240. B: 1,
  241. },
  242. })
  243. // doc0 = "token2 token4 token4 token2 token3 token4"
  244. indexer.AddDocument(&types.DocumentIndex{
  245. DocId: 0,
  246. TokenLength: 6,
  247. Keywords: []types.KeywordIndex{
  248. {"token2", 3, []int{0, 21}},
  249. {"token3", 7, []int{28}},
  250. {"token4", 15, []int{7, 14, 35}},
  251. },
  252. })
  253. // doc0 = "token6 token7"
  254. indexer.AddDocument(&types.DocumentIndex{
  255. DocId: 1,
  256. TokenLength: 2,
  257. Keywords: []types.KeywordIndex{
  258. {"token6", 3, []int{0}},
  259. {"token7", 15, []int{7}},
  260. },
  261. })
  262. outputs := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil)
  263. // BM25 = log2(3) * (12/9 + 28/17 + 60/33) = 6.3433
  264. utils.Expect(t, "76055", int(outputs[0].BM25*10000))
  265. }
  266. func TestLookupWithinDocIds(t *testing.T) {
  267. var indexer Indexer
  268. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  269. // doc0 = "token2 token3"
  270. indexer.AddDocument(&types.DocumentIndex{
  271. DocId: 0,
  272. Keywords: []types.KeywordIndex{
  273. {"token2", 0, []int{0}},
  274. {"token3", 0, []int{7}},
  275. },
  276. })
  277. // doc1 = "token1 token2 token3"
  278. indexer.AddDocument(&types.DocumentIndex{
  279. DocId: 1,
  280. Keywords: []types.KeywordIndex{
  281. {"token1", 0, []int{0}},
  282. {"token2", 0, []int{7}},
  283. {"token3", 0, []int{14}},
  284. },
  285. })
  286. // doc2 = "token1 token2"
  287. indexer.AddDocument(&types.DocumentIndex{
  288. DocId: 2,
  289. Keywords: []types.KeywordIndex{
  290. {"token1", 0, []int{0}},
  291. {"token2", 0, []int{7}},
  292. },
  293. })
  294. // doc3 = "token2"
  295. indexer.AddDocument(&types.DocumentIndex{
  296. DocId: 3,
  297. Keywords: []types.KeywordIndex{
  298. {"token2", 0, []int{0}},
  299. },
  300. })
  301. docIds := make(map[uint64]bool)
  302. docIds[0] = true
  303. docIds[2] = true
  304. utils.Expect(t, "[2 0 [7]] [0 0 [0]] ",
  305. indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, &docIds)))
  306. }
  307. func TestLookupWithLocations(t *testing.T) {
  308. var indexer Indexer
  309. indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
  310. // doc0 = "token2 token4 token4 token2 token3 token4"
  311. indexer.AddDocument(&types.DocumentIndex{
  312. DocId: 0,
  313. Keywords: []types.KeywordIndex{
  314. {"token2", 0, []int{0, 21}},
  315. {"token3", 0, []int{28}},
  316. {"token4", 0, []int{7, 14, 35}},
  317. },
  318. })
  319. utils.Expect(t, "[[0 21] [28]]",
  320. indexer.Lookup([]string{"token2", "token3"}, []string{}, nil)[0].TokenLocations)
  321. }