| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488 |
- package engine
- import (
- "encoding/gob"
- "github.com/huichen/wukong/types"
- "github.com/huichen/wukong/utils"
- "os"
- "reflect"
- "testing"
- )
// ScoringFields is the custom payload the tests attach to indexed documents
// through DocumentIndexData.Fields.
type ScoringFields struct {
	A float32
	B float32
	C float32
}
- func AddDocs(engine *Engine) {
- docId := uint64(1)
- // 因为需要保证文档全部被加入到索引中,所以 forceUpdate 全部设置成 true
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "中国有十三亿人口人口",
- Fields: ScoringFields{1, 2, 3},
- }, true)
- docId++
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "中国人口",
- Fields: nil,
- }, true)
- docId++
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "有人口",
- Fields: ScoringFields{2, 3, 1},
- }, true)
- docId++
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "有十三亿人口",
- Fields: ScoringFields{2, 3, 3},
- }, true)
- docId++
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "中国十三亿人口",
- Fields: ScoringFields{0, 9, 1},
- }, true)
- engine.FlushIndex()
- }
- type RankByTokenProximity struct {
- }
- func (rule RankByTokenProximity) Score(
- doc types.IndexedDocument, fields interface{}) []float32 {
- if doc.TokenProximity < 0 {
- return []float32{}
- }
- return []float32{1.0 / (float32(doc.TokenProximity) + 1)}
- }
- func TestEngineIndexDocument(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- OutputOffset: 0,
- MaxOutputs: 10,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- AddDocs(&engine)
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Tokens))
- utils.Expect(t, "中国", outputs.Tokens[0])
- utils.Expect(t, "人口", outputs.Tokens[1])
- utils.Expect(t, "3", len(outputs.Docs))
- utils.Expect(t, "2", outputs.Docs[0].DocId)
- utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
- utils.Expect(t, "5", outputs.Docs[1].DocId)
- utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
- utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
- utils.Expect(t, "1", outputs.Docs[2].DocId)
- utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
- utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
- }
- func TestReverseOrder(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ReverseOrder: true,
- OutputOffset: 0,
- MaxOutputs: 10,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- AddDocs(&engine)
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "3", len(outputs.Docs))
- utils.Expect(t, "1", outputs.Docs[0].DocId)
- utils.Expect(t, "5", outputs.Docs[1].DocId)
- utils.Expect(t, "2", outputs.Docs[2].DocId)
- }
- func TestOffsetAndMaxOutputs(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ReverseOrder: true,
- OutputOffset: 1,
- MaxOutputs: 3,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- AddDocs(&engine)
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Docs))
- utils.Expect(t, "5", outputs.Docs[0].DocId)
- utils.Expect(t, "2", outputs.Docs[1].DocId)
- }
- type TestScoringCriteria struct {
- }
- func (criteria TestScoringCriteria) Score(
- doc types.IndexedDocument, fields interface{}) []float32 {
- if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
- return []float32{}
- }
- fs := fields.(ScoringFields)
- return []float32{float32(doc.TokenProximity)*fs.A + fs.B*fs.C}
- }
- func TestSearchWithCriteria(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ScoringCriteria: TestScoringCriteria{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- AddDocs(&engine)
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Docs))
- utils.Expect(t, "1", outputs.Docs[0].DocId)
- utils.Expect(t, "18000", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "5", outputs.Docs[1].DocId)
- utils.Expect(t, "9000", int(outputs.Docs[1].Scores[0]*1000))
- }
- func TestCompactIndex(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ScoringCriteria: TestScoringCriteria{},
- },
- })
- AddDocs(&engine)
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Docs))
- utils.Expect(t, "5", outputs.Docs[0].DocId)
- utils.Expect(t, "9000", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "1", outputs.Docs[1].DocId)
- utils.Expect(t, "6000", int(outputs.Docs[1].Scores[0]*1000))
- }
- type BM25ScoringCriteria struct {
- }
- func (criteria BM25ScoringCriteria) Score(
- doc types.IndexedDocument, fields interface{}) []float32 {
- if reflect.TypeOf(fields) != reflect.TypeOf(ScoringFields{}) {
- return []float32{}
- }
- return []float32{doc.BM25}
- }
- func TestFrequenciesIndex(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ScoringCriteria: BM25ScoringCriteria{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.FrequenciesIndex,
- },
- })
- AddDocs(&engine)
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Docs))
- utils.Expect(t, "5", outputs.Docs[0].DocId)
- utils.Expect(t, "2349", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "1", outputs.Docs[1].DocId)
- utils.Expect(t, "2320", int(outputs.Docs[1].Scores[0]*1000))
- }
- func TestRemoveDocument(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ScoringCriteria: TestScoringCriteria{},
- },
- })
- AddDocs(&engine)
- engine.RemoveDocument(5, true)
- engine.FlushIndex()
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "1", len(outputs.Docs))
- utils.Expect(t, "1", outputs.Docs[0].DocId)
- utils.Expect(t, "6000", int(outputs.Docs[0].Scores[0]*1000))
- }
- func TestEngineIndexDocumentWithTokens(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- OutputOffset: 0,
- MaxOutputs: 10,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- docId := uint64(1)
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "",
- Tokens: []types.TokenData{
- {"中国", []int{0}},
- {"人口", []int{18, 24}},
- },
- Fields: ScoringFields{1, 2, 3},
- }, true)
- docId++
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "",
- Tokens: []types.TokenData{
- {"中国", []int{0}},
- {"人口", []int{6}},
- },
- Fields: ScoringFields{1, 2, 3},
- }, true)
- docId++
- engine.IndexDocument(docId, types.DocumentIndexData{
- Content: "中国十三亿人口",
- Fields: ScoringFields{0, 9, 1},
- }, true)
- engine.FlushIndex()
- outputs := engine.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Tokens))
- utils.Expect(t, "中国", outputs.Tokens[0])
- utils.Expect(t, "人口", outputs.Tokens[1])
- utils.Expect(t, "3", len(outputs.Docs))
- utils.Expect(t, "2", outputs.Docs[0].DocId)
- utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
- utils.Expect(t, "3", outputs.Docs[1].DocId)
- utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
- utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
- utils.Expect(t, "1", outputs.Docs[2].DocId)
- utils.Expect(t, "76", int(outputs.Docs[2].Scores[0]*1000))
- utils.Expect(t, "[0 18]", outputs.Docs[2].TokenSnippetLocations)
- }
- func TestEngineIndexDocumentWithPersistentStorage(t *testing.T) {
- gob.Register(ScoringFields{})
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- OutputOffset: 0,
- MaxOutputs: 10,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- UsePersistentStorage: true,
- PersistentStorageFolder: "wukong.persistent",
- PersistentStorageShards: 2,
- })
- AddDocs(&engine)
- engine.RemoveDocument(5, true)
- engine.Close()
- var engine1 Engine
- engine1.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- OutputOffset: 0,
- MaxOutputs: 10,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- UsePersistentStorage: true,
- PersistentStorageFolder: "wukong.persistent",
- PersistentStorageShards: 2,
- })
- engine1.FlushIndex()
- outputs := engine1.Search(types.SearchRequest{Text: "中国人口"})
- utils.Expect(t, "2", len(outputs.Tokens))
- utils.Expect(t, "中国", outputs.Tokens[0])
- utils.Expect(t, "人口", outputs.Tokens[1])
- utils.Expect(t, "2", len(outputs.Docs))
- utils.Expect(t, "2", outputs.Docs[0].DocId)
- utils.Expect(t, "1000", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "[0 6]", outputs.Docs[0].TokenSnippetLocations)
- utils.Expect(t, "1", outputs.Docs[1].DocId)
- utils.Expect(t, "76", int(outputs.Docs[1].Scores[0]*1000))
- utils.Expect(t, "[0 18]", outputs.Docs[1].TokenSnippetLocations)
- engine1.Close()
- os.RemoveAll("wukong.persistent")
- }
- func TestCountDocsOnly(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ReverseOrder: true,
- OutputOffset: 0,
- MaxOutputs: 1,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- AddDocs(&engine)
- engine.RemoveDocument(5, true)
- engine.FlushIndex()
- outputs := engine.Search(types.SearchRequest{Text: "中国人口", CountDocsOnly: true})
- utils.Expect(t, "0", len(outputs.Docs))
- utils.Expect(t, "2", len(outputs.Tokens))
- utils.Expect(t, "2", outputs.NumDocs)
- }
- func TestSearchWithin(t *testing.T) {
- var engine Engine
- engine.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../testdata/test_dict.txt",
- DefaultRankOptions: &types.RankOptions{
- ReverseOrder: true,
- OutputOffset: 0,
- MaxOutputs: 10,
- ScoringCriteria: &RankByTokenProximity{},
- },
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- AddDocs(&engine)
- docIds := make(map[uint64]bool)
- docIds[5] = true
- docIds[1] = true
- outputs := engine.Search(types.SearchRequest{
- Text: "中国人口",
- DocIds: docIds,
- })
- utils.Expect(t, "2", len(outputs.Tokens))
- utils.Expect(t, "中国", outputs.Tokens[0])
- utils.Expect(t, "人口", outputs.Tokens[1])
- utils.Expect(t, "2", len(outputs.Docs))
- utils.Expect(t, "1", outputs.Docs[0].DocId)
- utils.Expect(t, "76", int(outputs.Docs[0].Scores[0]*1000))
- utils.Expect(t, "[0 18]", outputs.Docs[0].TokenSnippetLocations)
- utils.Expect(t, "5", outputs.Docs[1].DocId)
- utils.Expect(t, "100", int(outputs.Docs[1].Scores[0]*1000))
- utils.Expect(t, "[0 15]", outputs.Docs[1].TokenSnippetLocations)
- }
- func TestLookupWithLocations1(t *testing.T) {
- type Data struct {
- Id int
- Content string
- Labels []string
- }
- datas := make([]Data, 0)
- data0 := Data{Id: 0, Content: "此次百度收购将成中国互联网最大并购", Labels: []string{"百度", "中国"}}
- datas = append(datas, data0)
- data1 := Data{Id: 1, Content: "百度宣布拟全资收购91无线业务", Labels: []string{"百度"}}
- datas = append(datas, data1)
- data2 := Data{Id: 2, Content: "百度是中国最大的搜索引擎", Labels: []string{"百度"}}
- datas = append(datas, data2)
- data3 := Data{Id: 3, Content: "百度在研制无人汽车", Labels: []string{"百度"}}
- datas = append(datas, data3)
- data4 := Data{Id: 4, Content: "BAT是中国互联网三巨头", Labels: []string{"百度"}}
- datas = append(datas, data4)
- // 初始化
- searcher_locations := Engine{}
- searcher_locations.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../data/dictionary.txt",
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.LocationsIndex,
- },
- })
- defer searcher_locations.Close()
- for _, data := range datas {
- searcher_locations.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels}, true)
- }
- searcher_locations.FlushIndex()
- res_locations := searcher_locations.Search(types.SearchRequest{Text: "百度"})
- searcher_docids := Engine{}
- searcher_docids.Init(types.EngineInitOptions{
- SegmenterDictionaries: "../data/dictionary.txt",
- IndexerInitOptions: &types.IndexerInitOptions{
- IndexType: types.DocIdsIndex,
- },
- })
- defer searcher_docids.Close()
- for _, data := range datas {
- searcher_docids.IndexDocument(uint64(data.Id), types.DocumentIndexData{Content: data.Content, Labels: data.Labels}, true)
- }
- searcher_docids.FlushIndex()
- res_docids := searcher_docids.Search(types.SearchRequest{Text: "百度"})
- if res_docids.NumDocs != res_locations.NumDocs {
- t.Errorf("期待的搜索结果个数=\"%d\", 实际=\"%d\"", res_docids.NumDocs, res_locations.NumDocs)
- }
- }
|