
Commit 19b08c0

Allow code search by filename

Signed-off-by: Bruno Sofiato <[email protected]>

1 parent: d6d3c96

38 files changed: +688 -49 lines

models/fixtures/repo_unit.yml (+21)

@@ -712,3 +712,24 @@
   type: 3
   config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
   created_unix: 946684810
+
+-
+  id: 108
+  repo_id: 62
+  type: 1
+  config: "{}"
+  created_unix: 946684810
+
+-
+  id: 109
+  repo_id: 62
+  type: 2
+  config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
+  created_unix: 946684810
+
+-
+  id: 110
+  repo_id: 62
+  type: 3
+  config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
+  created_unix: 946684810

models/fixtures/repository.yml (+31)

@@ -1768,3 +1768,34 @@
   size: 0
   is_fsck_enabled: true
   close_issues_via_commit_in_any_branch: false
+
+-
+  id: 62
+  owner_id: 42
+  owner_name: org42
+  lower_name: search-by-path
+  name: search-by-path
+  default_branch: master
+  num_watches: 0
+  num_stars: 0
+  num_forks: 0
+  num_issues: 0
+  num_closed_issues: 0
+  num_pulls: 0
+  num_closed_pulls: 0
+  num_milestones: 0
+  num_closed_milestones: 0
+  num_projects: 0
+  num_closed_projects: 0
+  is_private: false
+  is_empty: false
+  is_archived: false
+  is_mirror: false
+  status: 0
+  is_fork: false
+  fork_id: 0
+  is_template: false
+  template_id: 0
+  size: 0
+  is_fsck_enabled: true
+  close_issues_via_commit_in_any_branch: false

models/fixtures/user.yml (+37)

@@ -1517,3 +1517,40 @@
   repo_admin_change_team_access: false
   theme: ""
   keep_activity_private: false
+
+-
+  id: 42
+  lower_name: org42
+  name: org42
+  full_name: Org42
+  email: [email protected]
+  keep_email_private: false
+  email_notifications_preference: onmention
+  passwd: ZogKvWdyEx:password
+  passwd_hash_algo: dummy
+  must_change_password: false
+  login_source: 0
+  login_name: org42
+  type: 1
+  salt: ZogKvWdyEx
+  max_repo_creation: -1
+  is_active: false
+  is_admin: false
+  is_restricted: false
+  allow_git_hook: false
+  allow_import_local: false
+  allow_create_organization: true
+  prohibit_login: false
+  avatar: avatar42
+  avatar_email: [email protected]
+  use_custom_avatar: false
+  num_followers: 0
+  num_following: 0
+  num_stars: 0
+  num_repos: 1
+  num_teams: 0
+  num_members: 0
+  visibility: 0
+  repo_admin_change_team_access: false
+  theme: ""
+  keep_activity_private: false

models/repo/repo_list_test.go (+3, -3)

@@ -138,12 +138,12 @@ func getTestCases() []struct {
 		{
 			name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
 			opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
-			count: 33,
+			count: 34,
 		},
 		{
 			name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
 			opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
-			count: 38,
+			count: 39,
 		},
 		{
 			name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
@@ -158,7 +158,7 @@ func getTestCases() []struct {
 		{
 			name: "AllPublic/PublicRepositoriesOfOrganization",
 			opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
-			count: 33,
+			count: 34,
 		},
 		{
 			name: "AllTemplates",

models/user/user_test.go (+4, -1)

@@ -92,7 +92,10 @@ func TestSearchUsers(t *testing.T) {
 	testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
 		[]int64{26, 41})
 
-	testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
+	testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
+		[]int64{42})
+
+	testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
 		[]int64{})
 
 	// test users

modules/indexer/code/bleve/bleve.go (+37, -7)

@@ -17,6 +17,7 @@ import (
 	"code.gitea.io/gitea/modules/charset"
 	"code.gitea.io/gitea/modules/git"
 	"code.gitea.io/gitea/modules/gitrepo"
+	path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
 	"code.gitea.io/gitea/modules/indexer/code/internal"
 	indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
 	inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@@ -53,6 +54,7 @@ type RepoIndexerData struct {
 	RepoID int64
 	CommitID string
 	Content string
+	Filename string
 	Language string
 	UpdatedAt time.Time
 }
@@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {
 
 const (
 	repoIndexerAnalyzer = "repoIndexerAnalyzer"
+	filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
+	filenameIndexerTokenizer = "filenameIndexerTokenizer"
 	repoIndexerDocType = "repoIndexerDocType"
-	repoIndexerLatestVersion = 6
+	repoIndexerLatestVersion = 7
 )
 
 // generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	textFieldMapping.IncludeInAll = false
 	docMapping.AddFieldMappingsAt("Content", textFieldMapping)
 
+	fileNamedMapping := bleve.NewTextFieldMapping()
+	fileNamedMapping.IncludeInAll = false
+	fileNamedMapping.Analyzer = filenameIndexerAnalyzer
+	docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
+
 	termFieldMapping := bleve.NewTextFieldMapping()
 	termFieldMapping.IncludeInAll = false
 	termFieldMapping.Analyzer = analyzer_keyword.Name
@@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
 
 	mapping := bleve.NewIndexMapping()
+
 	if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
 		return nil, err
 	} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
@@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
 	}); err != nil {
 		return nil, err
 	}
+
+	if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
+		"type": analyzer_custom.Name,
+		"char_filters": []string{},
+		"tokenizer": unicode.Name,
+		"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
+	}); err != nil {
+		return nil, err
+	}
+
 	mapping.DefaultAnalyzer = repoIndexerAnalyzer
 	mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
 	mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
@@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
 	return batch.Index(id, &RepoIndexerData{
 		RepoID: repo.ID,
 		CommitID: commitSha,
+		Filename: update.Filename,
 		Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
 		Language: analyze.GetCodeLanguage(update.Filename, fileContents),
 		UpdatedAt: time.Now().UTC(),
@@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 		keywordQuery query.Query
 	)
 
-	phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
-	phraseQuery.FieldVal = "Content"
-	phraseQuery.Analyzer = repoIndexerAnalyzer
-	keywordQuery = phraseQuery
+	pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
+	pathQuery.FieldVal = "Filename"
+	pathQuery.SetBoost(10)
+
+	contentQuery := bleve.NewMatchQuery(opts.Keyword)
+	contentQuery.FieldVal = "Content"
+
 	if opts.IsKeywordFuzzy {
-		phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
+		contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
 	}
 
+	keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
+
 	if len(opts.RepoIDs) > 0 {
 		repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
 		for _, repoID := range opts.RepoIDs {
@@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 
 	from, pageSize := opts.GetSkipTake()
 	searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
-	searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
+	searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
 	searchRequest.IncludeLocations = true
 
 	if len(opts.Language) == 0 {
@@ -307,6 +333,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 				endIndex = locationEnd
 			}
 		}
+		if len(hit.Locations["Filename"]) > 0 {
+			startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
+		}
+
 		language := hit.Fields["Language"].(string)
 		var updatedUnix timeutil.TimeStamp
 		if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
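
The Search change above replaces the old match-phrase query on Content with a disjunction: a boosted prefix query against the new Filename field OR'd with a match query against Content, so filename hits rank above plain content hits. The standalone sketch below shows that query shape using only the bleve API; the buildKeywordQuery helper, the literal field names, and the upstream bleve module path (github.com/blevesearch/bleve/v2) are assumptions for illustration, not part of the commit.

// Sketch only: mirrors the disjunction built in (*Indexer).Search above.
package main

import (
	"fmt"
	"strings"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/search/query"
)

// buildKeywordQuery is a hypothetical helper: filename prefix matches are
// boosted so they outrank plain content matches in the combined result.
func buildKeywordQuery(keyword string, fuzziness int) query.Query {
	pathQuery := bleve.NewPrefixQuery(strings.ToLower(keyword))
	pathQuery.FieldVal = "Filename"
	pathQuery.SetBoost(10)

	contentQuery := bleve.NewMatchQuery(keyword)
	contentQuery.FieldVal = "Content"
	if fuzziness > 0 {
		contentQuery.Fuzziness = fuzziness
	}

	// Either side may match; the boost decides the ranking.
	return bleve.NewDisjunctionQuery(contentQuery, pathQuery)
}

func main() {
	q := buildKeywordQuery("indexer/code/bleve", 0)
	fmt.Printf("built a %T\n", q)
}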
modules/indexer/code/bleve/token/path (new file, +101)

@@ -0,0 +1,101 @@
+// Copyright 2024 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package path
+
+import (
+	"slices"
+	"strings"
+
+	"github.com/blevesearch/bleve/v2/analysis"
+	"github.com/blevesearch/bleve/v2/registry"
+)
+
+const (
+	Name = "gitea/path"
+)
+
+type TokenFilter struct{}
+
+func NewTokenFilter() *TokenFilter {
+	return &TokenFilter{}
+}
+
+func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewTokenFilter(), nil
+}
+
+func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	if len(input) == 1 {
+		// if there is only one token, we dont need to generate the reversed chain
+		return generatePathTokens(input, false)
+	}
+
+	normal := generatePathTokens(input, false)
+	reversed := generatePathTokens(input, true)
+
+	return append(normal, reversed...)
+}
+
+// Generates path tokens from the input tokens.
+// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
+// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
+//
+// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
+// to efficiently search for filenames without supplying the fullpath.
+func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
+	terms := make([]string, 0, len(input))
+	longestTerm := 0
+
+	if reversed {
+		slices.Reverse(input)
+	}
+
+	for i := 0; i < len(input); i++ {
+		var sb strings.Builder
+		sb.WriteString(string(input[0].Term))
+
+		for j := 1; j < i; j++ {
+			sb.WriteString("/")
+			sb.WriteString(string(input[j].Term))
+		}
+
+		term := sb.String()
+
+		if longestTerm < len(term) {
+			longestTerm = len(term)
+		}
+
+		terms = append(terms, term)
+	}
+
+	output := make(analysis.TokenStream, 0, len(terms))
+
+	for _, term := range terms {
+		var start, end int
+
+		if reversed {
+			start = 0
+			end = len(term)
+		} else {
+			start = longestTerm - len(term)
+			end = longestTerm
+		}
+
+		token := analysis.Token{
+			Position: 1,
+			Start: start,
+			End: end,
+			Type: analysis.AlphaNumeric,
+			Term: []byte(term),
+		}
+
+		output = append(output, &token)
+	}
+
+	return output
+}
+
+func init() {
+	registry.RegisterTokenFilter(Name, TokenFilterConstructor)
+}
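
The doc comment above describes the filter in terms of path components; the small sketch below shows how it could be exercised by hand. The three input tokens stand in for what a tokenizer would roughly produce for a path such as foo/bar/baz.md, and the upstream bleve module path (github.com/blevesearch/bleve/v2) is assumed; nothing here is part of the commit itself.

// Illustrative only: feed a hand-built token stream through the new filter
// and print the emitted path terms (normal and reversed component chains).
package main

import (
	"fmt"

	"github.com/blevesearch/bleve/v2/analysis"

	path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
)

func main() {
	// Tokens as a tokenizer might emit them for "foo/bar/baz.md";
	// offsets and positions are omitted for brevity.
	input := analysis.TokenStream{
		{Term: []byte("foo")},
		{Term: []byte("bar")},
		{Term: []byte("baz.md")},
	}

	// With more than one token, Filter appends the reversed chain to the
	// normal one, so directory prefixes and bare filenames both become terms.
	for _, tok := range path_filter.NewTokenFilter().Filter(input) {
		fmt.Println(string(tok.Term))
	}
}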
