| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442 |
- package core
- import (
- "github.com/huichen/wukong/types"
- "github.com/huichen/wukong/utils"
- "testing"
- )
- func TestAddKeywords(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{{"token1", 0, []int{}}},
- }, false)
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
- }, false)
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 3,
- Keywords: []types.KeywordIndex{{"token3", 0, []int{}}},
- }, false)
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 7,
- Keywords: []types.KeywordIndex{{"token7", 0, []int{}}},
- }, false)
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{{"token2", 0, []int{}}},
- }, false)
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 7,
- Keywords: []types.KeywordIndex{{"token77", 0, []int{}}},
- }, false)
- indexer.AddDocumentToCache(nil, true)
- utils.Expect(t, "", indicesToString(&indexer, "token1"))
- utils.Expect(t, "1 2 ", indicesToString(&indexer, "token2"))
- utils.Expect(t, "3 ", indicesToString(&indexer, "token3"))
- utils.Expect(t, "7 ", indicesToString(&indexer, "token77"))
- }
- func TestRemoveDocument(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- // doc1 = "token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, false)
- // doc2 = "token1 token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- },
- }, true)
- utils.Expect(t, "2 ", indicesToString(&indexer, "token1"))
- utils.Expect(t, "1 2 ", indicesToString(&indexer, "token2"))
- utils.Expect(t, "1 ", indicesToString(&indexer, "token3"))
- indexer.RemoveDocumentToCache(2, false)
- // doc1 = "token1 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, true)
- utils.Expect(t, "1 ", indicesToString(&indexer, "token1"))
- utils.Expect(t, "", indicesToString(&indexer, "token2"))
- utils.Expect(t, "1 ", indicesToString(&indexer, "token3"))
- // doc2 = "token1 token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- {"token3", 0, []int{14}},
- },
- }, true)
- utils.Expect(t, "1 2 ", indicesToString(&indexer, "token1"))
- utils.Expect(t, "2 ", indicesToString(&indexer, "token2"))
- utils.Expect(t, "1 2 ", indicesToString(&indexer, "token3"))
- // doc3 = "token1 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 3,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- },
- }, false)
- indexer.RemoveDocumentToCache(3, false)
- indexer.AddDocumentToCache(nil, true)
- utils.Expect(t, "1 2 ", indicesToString(&indexer, "token1"))
- utils.Expect(t, "2 ", indicesToString(&indexer, "token2"))
- utils.Expect(t, "1 2 ", indicesToString(&indexer, "token3"))
- }
- func TestLookupLocationsIndex(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- // doc1 = "token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, false)
- // doc2 = "token1 token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- {"token3", 0, []int{14}},
- },
- }, false)
- // doc3 = "token1 token2"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 3,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- },
- }, false)
- // doc4 = "token2"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 4,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- },
- }, false)
- // doc7 = "token1 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 7,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, false)
- // doc9 = "token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 9,
- Keywords: []types.KeywordIndex{
- {"token3", 0, []int{0}},
- },
- }, true)
- utils.Expect(t, "2 3 7 ", indicesToString(&indexer, "token1"))
- utils.Expect(t, "1 2 3 4 ", indicesToString(&indexer, "token2"))
- utils.Expect(t, "1 2 7 9 ", indicesToString(&indexer, "token3"))
- utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
- utils.Expect(t, "[7 0 [0]] [3 0 [0]] [2 0 [0]] ",
- indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
- utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
- utils.Expect(t, "[3 1 [0 7]] [2 1 [0 7]] ",
- indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
- utils.Expect(t, "[3 13 [7 0]] [2 13 [7 0]] ",
- indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
- utils.Expect(t, "[7 1 [0 7]] [2 8 [0 14]] ",
- indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
- utils.Expect(t, "[7 13 [7 0]] [2 20 [14 0]] ",
- indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
- utils.Expect(t, "[2 1 [7 14]] [1 1 [0 7]] ",
- indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
- utils.Expect(t, "[2 13 [14 7]] [1 13 [7 0]] ",
- indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
- utils.Expect(t, "[2 2 [0 7 14]] ",
- indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
- utils.Expect(t, "[2 26 [14 7 0]] ",
- indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
- }
- func TestLookupDocIdsIndex(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.DocIdsIndex})
- // doc1 = "token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, false)
- // doc2 = "token1 token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- {"token3", 0, []int{14}},
- },
- }, false)
- // doc3 = "token1 token2"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 3,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- },
- }, false)
- // doc4 = "token2"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 4,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- },
- }, false)
- // doc7 = "token1 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 7,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, false)
- // doc9 = "token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 9,
- Keywords: []types.KeywordIndex{
- {"token3", 0, []int{0}},
- },
- }, true)
- utils.Expect(t, "2 3 7 ", indicesToString(&indexer, "token1"))
- utils.Expect(t, "1 2 3 4 ", indicesToString(&indexer, "token2"))
- utils.Expect(t, "1 2 7 9 ", indicesToString(&indexer, "token3"))
- utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token4"}, []string{}, nil, false)))
- utils.Expect(t, "[7 0 []] [3 0 []] [2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token1"}, []string{}, nil, false)))
- utils.Expect(t, "", indexedDocsToString(indexer.Lookup([]string{"token1", "token4"}, []string{}, nil, false)))
- utils.Expect(t, "[3 0 []] [2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token1", "token2"}, []string{}, nil, false)))
- utils.Expect(t, "[3 0 []] [2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token2", "token1"}, []string{}, nil, false)))
- utils.Expect(t, "[7 0 []] [2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token1", "token3"}, []string{}, nil, false)))
- utils.Expect(t, "[7 0 []] [2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token3", "token1"}, []string{}, nil, false)))
- utils.Expect(t, "[2 0 []] [1 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
- utils.Expect(t, "[2 0 []] [1 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token3", "token2"}, []string{}, nil, false)))
- utils.Expect(t, "[2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token1", "token2", "token3"}, []string{}, nil, false)))
- utils.Expect(t, "[2 0 []] ",
- indexedDocsToString(indexer.Lookup([]string{"token3", "token2", "token1"}, []string{}, nil, false)))
- }
- func TestLookupWithProximity(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- // doc1 = "token2 token4 token4 token2 token3 token4"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0, 21}},
- {"token3", 0, []int{28}},
- {"token4", 0, []int{7, 14, 35}},
- },
- }, true)
- utils.Expect(t, "[1 1 [21 28]] ",
- indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)))
- // doc1 = "t2 t1 . . . t2 t3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"t1", 0, []int{3}},
- {"t2", 0, []int{0, 12}},
- {"t3", 0, []int{15}},
- },
- }, true)
- utils.Expect(t, "[1 8 [3 12 15]] ",
- indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
- // doc1 = "t3 t2 t1 . . . . . t2 t3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"t1", 0, []int{6}},
- {"t2", 0, []int{3, 19}},
- {"t3", 0, []int{0, 22}},
- },
- }, true)
- utils.Expect(t, "[1 10 [6 3 0]] ",
- indexedDocsToString(indexer.Lookup([]string{"t1", "t2", "t3"}, []string{}, nil, false)))
- }
- func TestLookupWithPartialLocations(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- // doc1 = "token2 token4 token4 token2 token3 token4" + "label1"(不在文本中)
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0, 21}},
- {"token3", 0, []int{28}},
- {"label1", 0, []int{}},
- {"token4", 0, []int{7, 14, 35}},
- },
- }, false)
- // doc2 = "token2 token4 token4 token2 token3 token4"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0, 21}},
- {"token3", 0, []int{28}},
- {"token4", 0, []int{7, 14, 35}},
- },
- }, true)
- utils.Expect(t, "1 ", indicesToString(&indexer, "label1"))
- utils.Expect(t, "[1 1 [21 28]] ",
- indexedDocsToString(indexer.Lookup([]string{"token2", "token3"}, []string{"label1"}, nil, false)))
- }
- func TestLookupWithBM25(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{
- IndexType: types.FrequenciesIndex,
- BM25Parameters: &types.BM25Parameters{
- K1: 1,
- B: 1,
- },
- })
- // doc1 = "token2 token4 token4 token2 token3 token4"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- TokenLength: 6,
- Keywords: []types.KeywordIndex{
- {"token2", 3, []int{0, 21}},
- {"token3", 7, []int{28}},
- {"token4", 15, []int{7, 14, 35}},
- },
- }, false)
- // doc2 = "token6 token7"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- TokenLength: 2,
- Keywords: []types.KeywordIndex{
- {"token6", 3, []int{0}},
- {"token7", 15, []int{7}},
- },
- }, true)
- outputs, _ := indexer.Lookup([]string{"token2", "token3", "token4"}, []string{}, nil, false)
- // BM25 = log2(3) * (12/9 + 28/17 + 60/33) = 6.3433
- utils.Expect(t, "76055", int(outputs[0].BM25*10000))
- }
- func TestLookupWithinDocIds(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- // doc1 = "token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- {"token3", 0, []int{7}},
- },
- }, false)
- // doc2 = "token1 token2 token3"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- {"token3", 0, []int{14}},
- },
- }, false)
- // doc3 = "token1 token2"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 3,
- Keywords: []types.KeywordIndex{
- {"token1", 0, []int{0}},
- {"token2", 0, []int{7}},
- },
- }, false)
- // doc4 = "token2"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 4,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0}},
- },
- }, true)
- docIds := make(map[uint64]bool)
- docIds[1] = true
- docIds[3] = true
- utils.Expect(t, "[3 0 [7]] [1 0 [0]] ",
- indexedDocsToString(indexer.Lookup([]string{"token2"}, []string{}, docIds, false)))
- }
- func TestLookupWithLocations(t *testing.T) {
- var indexer Indexer
- indexer.Init(types.IndexerInitOptions{IndexType: types.LocationsIndex})
- // doc1 = "token2 token4 token4 token2 token3 token4"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 1,
- Keywords: []types.KeywordIndex{
- {"token2", 0, []int{0, 21}},
- {"token3", 0, []int{28}},
- {"token4", 0, []int{7, 14, 35}},
- },
- }, true)
- // doc2 = "token2 token4 token4 token2 token3 token4"
- indexer.AddDocumentToCache(&types.DocumentIndex{
- DocId: 2,
- Keywords: []types.KeywordIndex{
- {"token3", 0, []int{0, 21}},
- {"token5", 0, []int{28}},
- {"token2", 0, []int{7, 14, 35}},
- },
- }, true)
- indexer.RemoveDocumentToCache(2, true)
- docs, _ := indexer.Lookup([]string{"token2", "token3"}, []string{}, nil, false)
- utils.Expect(t, "[[0 21] [28]]", docs[0].TokenLocations)
- }
|