engine_test.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. package engine
  2. import (
  3. "encoding/gob"
  4. "github.com/huichen/wukong/types"
  5. "github.com/huichen/wukong/utils"
  6. "os"
  7. "reflect"
  8. "testing"
  9. )
  // ScoringFields is the custom per-document payload the tests attach to
  // indexed documents through DocumentIndexData.Fields.
  //
  // The field order is significant: the tests construct values positionally,
  // e.g. ScoringFields{1, 2, 3}.
  type ScoringFields struct {
  	A float32
  	B float32
  	C float32
  }
  13. func AddDocs(engine *Engine) {
  14. docId := uint64(0)
  15. engine.IndexDocument(docId, types.DocumentIndexData{
  16. Content: "中国有十三亿人口人口",
  17. Fields: ScoringFields{1, 2, 3},
  18. })
  19. docId++
  20. engine.IndexDocument(docId, types.DocumentIndexData{
  21. Content: "中国人口",
  22. Fields: nil,
  23. })
  24. docId++
  25. engine.IndexDocument(docId, types.DocumentIndexData{
  26. Content: "有人口",
  27. Fields: ScoringFields{2, 3, 1},
  28. })
  29. docId++
  30. engine.IndexDocument(docId, types.DocumentIndexData{
  31. Content: "有十三亿人口",
  32. Fields: ScoringFields{2, 3, 3},
  33. })
  34. docId++
  35. engine.IndexDocument(docId, types.DocumentIndexData{
  36. Content: "中国十三亿人口",
  37. Fields: ScoringFields{0, 9, 1},
  38. })
  39. engine.FlushIndex()
  40. }
  41. type RankByTokenProximity struct {
  42. }
  43. func (rule RankByTokenProximity) Score(
  44. doc types.IndexedDocument, fields interface{}) []float32 {
  45. if doc.TokenProximity < 0 {
  46. return []float32{}
  47. }
  48. return []float32{1.0 / (float32(doc.TokenProximity) + 1)}
  49. }
  50. func TestEngineIndexDocument(t *testing.T) {
  51. var engine Engine
  52. engine.Init(types.EngineInitOptions{
  53. SegmenterDictionaries: "../testdata/test_dict.txt",
  54. DefaultRankOptions: &types.RankOptions{
  55. OutputOffset: 0,
  56. MaxOutputs: 10,
  57. ScoringCriteria: &RankByTokenProximity{},
  58. },
  59. IndexerInitOptions: &types.IndexerInitOptions{
  60. IndexType: types.LocationsIndex,
  61. },
  62. })
  63. AddDocs(&engine)
  64. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  65. utils.Expect(t, "2", len(outputs.Tokens))
  66. utils.Expect(t, "中国", outputs.Tokens[0])
  67. utils.Expect(t, "人口", outputs.Tokens[1])
  68. utils.Expect(t, "3", len(outputs.Docs))
  69. utils.Expect(t, "1", outputs.Docs[0].DocId)
  70. utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
  71. utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
  72. utils.Expect(t, "4", outputs.Docs[1].DocId)
  73. utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
  74. utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
  75. utils.Expect(t, "0", outputs.Docs[2].DocId)
  76. utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
  77. utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
  78. }
  79. func TestReverseOrder(t *testing.T) {
  80. var engine Engine
  81. engine.Init(types.EngineInitOptions{
  82. SegmenterDictionaries: "../testdata/test_dict.txt",
  83. DefaultRankOptions: &types.RankOptions{
  84. ReverseOrder: true,
  85. OutputOffset: 0,
  86. MaxOutputs: 10,
  87. ScoringCriteria: &RankByTokenProximity{},
  88. },
  89. IndexerInitOptions: &types.IndexerInitOptions{
  90. IndexType: types.LocationsIndex,
  91. },
  92. })
  93. AddDocs(&engine)
  94. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  95. utils.Expect(t, "3", len(outputs.Docs))
  96. utils.Expect(t, "0", outputs.Docs[0].DocId)
  97. utils.Expect(t, "4", outputs.Docs[1].DocId)
  98. utils.Expect(t, "1", outputs.Docs[2].DocId)
  99. }
  100. func TestOffsetAndMaxOutputs(t *testing.T) {
  101. var engine Engine
  102. engine.Init(types.EngineInitOptions{
  103. SegmenterDictionaries: "../testdata/test_dict.txt",
  104. DefaultRankOptions: &types.RankOptions{
  105. ReverseOrder: true,
  106. OutputOffset: 1,
  107. MaxOutputs: 3,
  108. ScoringCriteria: &RankByTokenProximity{},
  109. },
  110. IndexerInitOptions: &types.IndexerInitOptions{
  111. IndexType: types.LocationsIndex,
  112. },
  113. })
  114. AddDocs(&engine)
  115. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  116. utils.Expect(t, "2", len(outputs.Docs))
  117. utils.Expect(t, "4", outputs.Docs[0].DocId)
  118. utils.Expect(t, "1", outputs.Docs[1].DocId)
  119. }
  120. type TestScoringCriteria struct {
  121. }
  122. func (criteria TestScoringCriteria) Score(
  123. doc types.IndexedDocument, fields interface{}) []float32 {
  124. if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
  125. return []float32{}
  126. }
  127. fs := fields.(ScoringFields)
  128. return []float32{float32(doc.TokenProximity)*fs.A + fs.B*fs.C}
  129. }
  130. func TestSearchWithCriteria(t *testing.T) {
  131. var engine Engine
  132. engine.Init(types.EngineInitOptions{
  133. SegmenterDictionaries: "../testdata/test_dict.txt",
  134. DefaultRankOptions: &types.RankOptions{
  135. ScoringCriteria: TestScoringCriteria{},
  136. },
  137. IndexerInitOptions: &types.IndexerInitOptions{
  138. IndexType: types.LocationsIndex,
  139. },
  140. })
  141. AddDocs(&engine)
  142. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  143. utils.Expect(t, "2", len(outputs.Docs))
  144. utils.Expect(t, "0", outputs.Docs[0].DocId)
  145. utils.Expect(t, "18000", int(outputs.Docs[0].Scores[0]*1000))
  146. utils.Expect(t, "4", outputs.Docs[1].DocId)
  147. utils.Expect(t, "9000", int(outputs.Docs[1].Scores[0]*1000))
  148. }
  149. func TestCompactIndex(t *testing.T) {
  150. var engine Engine
  151. engine.Init(types.EngineInitOptions{
  152. SegmenterDictionaries: "../testdata/test_dict.txt",
  153. DefaultRankOptions: &types.RankOptions{
  154. ScoringCriteria: TestScoringCriteria{},
  155. },
  156. })
  157. AddDocs(&engine)
  158. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  159. utils.Expect(t, "2", len(outputs.Docs))
  160. utils.Expect(t, "4", outputs.Docs[0].DocId)
  161. utils.Expect(t, "9000", int(outputs.Docs[0].Scores[0]*1000))
  162. utils.Expect(t, "0", outputs.Docs[1].DocId)
  163. utils.Expect(t, "6000", int(outputs.Docs[1].Scores[0]*1000))
  164. }
  165. type BM25ScoringCriteria struct {
  166. }
  167. func (criteria BM25ScoringCriteria) Score(
  168. doc types.IndexedDocument, fields interface{}) []float32 {
  169. if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
  170. return []float32{}
  171. }
  172. return []float32{doc.BM25}
  173. }
  174. func TestFrequenciesIndex(t *testing.T) {
  175. var engine Engine
  176. engine.Init(types.EngineInitOptions{
  177. SegmenterDictionaries: "../testdata/test_dict.txt",
  178. DefaultRankOptions: &types.RankOptions{
  179. ScoringCriteria: BM25ScoringCriteria{},
  180. },
  181. IndexerInitOptions: &types.IndexerInitOptions{
  182. IndexType: types.FrequenciesIndex,
  183. },
  184. })
  185. AddDocs(&engine)
  186. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  187. utils.Expect(t, "2", len(outputs.Docs))
  188. utils.Expect(t, "4", outputs.Docs[0].DocId)
  189. utils.Expect(t, "2311", int(outputs.Docs[0].Scores[0]*1000))
  190. utils.Expect(t, "0", outputs.Docs[1].DocId)
  191. utils.Expect(t, "2211", int(outputs.Docs[1].Scores[0]*1000))
  192. }
  193. func TestRemoveDocument(t *testing.T) {
  194. var engine Engine
  195. engine.Init(types.EngineInitOptions{
  196. SegmenterDictionaries: "../testdata/test_dict.txt",
  197. DefaultRankOptions: &types.RankOptions{
  198. ScoringCriteria: TestScoringCriteria{},
  199. },
  200. })
  201. AddDocs(&engine)
  202. engine.RemoveDocument(4)
  203. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  204. utils.Expect(t, "1", len(outputs.Docs))
  205. utils.Expect(t, "0", outputs.Docs[0].DocId)
  206. utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
  207. }
  208. func TestEngineIndexDocumentWithTokens(t *testing.T) {
  209. var engine Engine
  210. engine.Init(types.EngineInitOptions{
  211. SegmenterDictionaries: "../testdata/test_dict.txt",
  212. DefaultRankOptions: &types.RankOptions{
  213. OutputOffset: 0,
  214. MaxOutputs: 10,
  215. ScoringCriteria: &RankByTokenProximity{},
  216. },
  217. IndexerInitOptions: &types.IndexerInitOptions{
  218. IndexType: types.LocationsIndex,
  219. },
  220. })
  221. docId := uint64(0)
  222. engine.IndexDocument(docId, types.DocumentIndexData{
  223. Content: "",
  224. Tokens: []types.TokenData{
  225. {"中国", []int{0}},
  226. {"人口", []int{18, 24}},
  227. },
  228. Fields: ScoringFields{1, 2, 3},
  229. })
  230. docId++
  231. engine.IndexDocument(docId, types.DocumentIndexData{
  232. Content: "",
  233. Tokens: []types.TokenData{
  234. {"中国", []int{0}},
  235. {"人口", []int{6}},
  236. },
  237. Fields: ScoringFields{1, 2, 3},
  238. })
  239. docId++
  240. engine.IndexDocument(docId, types.DocumentIndexData{
  241. Content: "中国十三亿人口",
  242. Fields: ScoringFields{0, 9, 1},
  243. })
  244. engine.FlushIndex()
  245. outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
  246. utils.Expect(t, "2", len(outputs.Tokens))
  247. utils.Expect(t, "中国", outputs.Tokens[0])
  248. utils.Expect(t, "人口", outputs.Tokens[1])
  249. utils.Expect(t, "3", len(outputs.Docs))
  250. utils.Expect(t, "1", outputs.Docs[0].DocId)
  251. utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
  252. utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
  253. utils.Expect(t, "2", outputs.Docs[1].DocId)
  254. utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
  255. utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
  256. utils.Expect(t, "0", outputs.Docs[2].DocId)
  257. utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
  258. utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
  259. }
  260. func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) {
  261. gob.Register(ScoringFields{})
  262. var engine Engine
  263. engine.Init(types.EngineInitOptions{
  264. SegmenterDictionaries: "../testdata/test_dict.txt",
  265. DefaultRankOptions: &types.RankOptions{
  266. OutputOffset: 0,
  267. MaxOutputs: 10,
  268. ScoringCriteria: &RankByTokenProximity{},
  269. },
  270. IndexerInitOptions: &types.IndexerInitOptions{
  271. IndexType: types.LocationsIndex,
  272. },
  273. UsePersistentStorage: true,
  274. PersistentStorageFolder: "wukong.persistent",
  275. PersistentStorageShards: 2,
  276. })
  277. AddDocs(&engine)
  278. engine.RemoveDocument(4)
  279. engine.Close()
  280. var engine1 Engine
  281. engine1.Init(types.EngineInitOptions{
  282. SegmenterDictionaries: "../testdata/test_dict.txt",
  283. DefaultRankOptions: &types.RankOptions{
  284. OutputOffset: 0,
  285. MaxOutputs: 10,
  286. ScoringCriteria: &RankByTokenProximity{},
  287. },
  288. IndexerInitOptions: &types.IndexerInitOptions{
  289. IndexType: types.LocationsIndex,
  290. },
  291. UsePersistentStorage: true,
  292. PersistentStorageFolder: "wukong.persistent",
  293. PersistentStorageShards: 2,
  294. })
  295. outputs := engine1.Search(types.SearchRequest{Text: "中国人口"})
  296. utils.Expect(t, "2", len(outputs.Tokens))
  297. utils.Expect(t, "中国", outputs.Tokens[0])
  298. utils.Expect(t, "人口", outputs.Tokens[1])
  299. utils.Expect(t, "2", len(outputs.Docs))
  300. utils.Expect(t, "1", outputs.Docs[0].DocId)
  301. utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
  302. utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
  303. utils.Expect(t, "0", outputs.Docs[1].DocId)
  304. utils.Expect(t, "76", int(outputs.Docs[1].Scores[0]*1000))
  305. utils.Expect(t, "[0 18]", outputs.Docs[1].TokenSnippetLocations)
  306. engine1.Close()
  307. os.RemoveAll("wukong.persistent")
  308. }
  309. func TestCountDocsOnly(t *testing.T) {
  310. var engine Engine
  311. engine.Init(types.EngineInitOptions{
  312. SegmenterDictionaries: "../testdata/test_dict.txt",
  313. DefaultRankOptions: &types.RankOptions{
  314. ReverseOrder: true,
  315. OutputOffset: 0,
  316. MaxOutputs: 1,
  317. ScoringCriteria: &RankByTokenProximity{},
  318. },
  319. IndexerInitOptions: &types.IndexerInitOptions{
  320. IndexType: types.LocationsIndex,
  321. },
  322. })
  323. AddDocs(&engine)
  324. engine.RemoveDocument(4)
  325. outputs := engine.Search(types.SearchRequest{Text: "中国人口", CountDocsOnly: true})
  326. utils.Expect(t, "0", len(outputs.Docs))
  327. utils.Expect(t, "2", len(outputs.Tokens))
  328. utils.Expect(t, "2", outputs.NumDocs)
  329. }
  330. func TestSearchWithin(t *testing.T) {
  331. var engine Engine
  332. engine.Init(types.EngineInitOptions{
  333. SegmenterDictionaries: "../testdata/test_dict.txt",
  334. DefaultRankOptions: &types.RankOptions{
  335. ReverseOrder: true,
  336. OutputOffset: 0,
  337. MaxOutputs: 10,
  338. ScoringCriteria: &RankByTokenProximity{},
  339. },
  340. IndexerInitOptions: &types.IndexerInitOptions{
  341. IndexType: types.LocationsIndex,
  342. },
  343. })
  344. AddDocs(&engine)
  345. docIds := make(map[uint64]bool)
  346. docIds[4] = true
  347. docIds[0] = true
  348. outputs := engine.Search(types.SearchRequest{
  349. Text: "中国人口",
  350. DocIds: docIds,
  351. })
  352. utils.Expect(t, "2", len(outputs.Tokens))
  353. utils.Expect(t, "中国", outputs.Tokens[0])
  354. utils.Expect(t, "人口", outputs.Tokens[1])
  355. utils.Expect(t, "2", len(outputs.Docs))
  356. utils.Expect(t, "0", outputs.Docs[0].DocId)
  357. utils.Expect(t, "76", int(outputs.Docs[0].Scores[0]*1000))
  358. utils.Expect(t, "[0 18]", outputs.Docs[0].TokenSnippetLocations)
  359. utils.Expect(t, "4", outputs.Docs[1].DocId)
  360. utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
  361. utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
  362. }