Skip to content

Commit 8ade039

Browse files
committed
Add detected file language to code search
Move language detection to a separate module to be more reusable. Add an option to disable vendored file exclusion from file search. Always show all language stats for search.
1 parent 17445bb commit 8ade039

File tree

20 files changed

+346
-63
lines changed

20 files changed

+346
-63
lines changed

docs/content/doc/advanced/config-cheat-sheet.en-us.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ relation to port exhaustion.
242242
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
243243
- `REPO_INDEXER_INCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **include** in the index. Use `**.txt` to match any files with .txt extension. An empty list means include all files.
244244
- `REPO_INDEXER_EXCLUDE`: **empty**: A comma separated list of glob patterns (see https://github.com/gobwas/glob) to **exclude** from the index. Files that match this list will not be indexed, even if they match in `REPO_INDEXER_INCLUDE`.
245+
- `REPO_INDEXER_EXCLUDE_VENDORED`: **true**: Exclude vendored files from the index.
245246
- `UPDATE_BUFFER_LEN`: **20**: Buffer length of index request.
246247
- `MAX_FILE_SIZE`: **1048576**: Maximum size in bytes of files to be indexed.
247248
- `STARTUP_TIMEOUT`: **30s**: If the indexer takes longer than this timeout to start - fail. (This timeout will be added to the hammer time above for child processes - as bleve will not start until the previous parent is shutdown.) Set to zero to never timeout.

docs/content/doc/advanced/repo-indexer.en-us.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ Gitea applies glob pattern matching from the [`gobwas/glob` library](https://git
4242

4343
Limiting the list of files prevents the indexes from becoming polluted with derived or irrelevant files (e.g. lss, sym, map, etc.), so the search results are more relevant. It can also help reduce the index size.
4444

45+
`REPO_INDEXER_EXCLUDE_VENDORED` (default: true) excludes vendored files from the index.
46+
4547
`REPO_INDEXER_INCLUDE` (default: empty) is a comma separated list of glob patterns to **include** in the index. An empty list means "_include all files_".
4648
`REPO_INDEXER_EXCLUDE` (default: empty) is a comma separated list of glob patterns to **exclude** from the index. Files that match this list will not be indexed. `REPO_INDEXER_EXCLUDE` takes precedence over `REPO_INDEXER_INCLUDE`.
4749

modules/analyze/code_langauge.go

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Copyright 2020 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
package analyze
6+
7+
import (
8+
"path/filepath"
9+
10+
"github.com/src-d/enry/v2"
11+
)
12+
13+
// GetCodeLanguageWithCallback detects code language based on file name and content using callback
14+
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string {
15+
if language, ok := enry.GetLanguageByExtension(filename); ok {
16+
return language
17+
}
18+
19+
if language, ok := enry.GetLanguageByFilename(filename); ok {
20+
return language
21+
}
22+
23+
content, err := contentFunc()
24+
if err != nil {
25+
return enry.OtherLanguage
26+
}
27+
28+
return enry.GetLanguage(filepath.Base(filename), content)
29+
}
30+
31+
// GetCodeLanguage detects code language based on file name and content
32+
func GetCodeLanguage(filename string, content []byte) string {
33+
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) {
34+
return content, nil
35+
})
36+
}

modules/git/repo_language_stats.go

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ import (
99
"io"
1010
"io/ioutil"
1111
"math"
12-
"path/filepath"
12+
13+
"code.gitea.io/gitea/modules/analyze"
1314

1415
"github.com/src-d/enry/v2"
1516
"gopkg.in/src-d/go-git.v4"
@@ -51,25 +52,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
5152

5253
// TODO: Use .gitattributes file for linguist overrides
5354

54-
language, ok := enry.GetLanguageByExtension(f.Name)
55-
if !ok {
56-
if language, ok = enry.GetLanguageByFilename(f.Name); !ok {
57-
content, err := readFile(f, fileSizeLimit)
58-
if err != nil {
59-
return nil
60-
}
61-
62-
language = enry.GetLanguage(filepath.Base(f.Name), content)
63-
if language == enry.OtherLanguage {
64-
return nil
65-
}
66-
}
55+
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) {
56+
return readFile(f, fileSizeLimit)
57+
})
58+
if language == enry.OtherLanguage || language == "" {
59+
return nil
6760
}
6861

69-
if language != "" {
70-
sizes[language] += f.Size
71-
total += f.Size
72-
}
62+
sizes[language] += f.Size
63+
total += f.Size
7364

7465
return nil
7566
})

modules/indexer/code/bleve.go

Lines changed: 96 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,23 +9,28 @@ import (
99
"os"
1010
"strconv"
1111
"strings"
12+
"time"
1213

1314
"code.gitea.io/gitea/models"
15+
"code.gitea.io/gitea/modules/analyze"
1416
"code.gitea.io/gitea/modules/base"
1517
"code.gitea.io/gitea/modules/charset"
1618
"code.gitea.io/gitea/modules/git"
1719
"code.gitea.io/gitea/modules/log"
1820
"code.gitea.io/gitea/modules/setting"
21+
"code.gitea.io/gitea/modules/timeutil"
1922

2023
"github.com/blevesearch/bleve"
21-
"github.com/blevesearch/bleve/analysis/analyzer/custom"
24+
analyzer_custom "github.com/blevesearch/bleve/analysis/analyzer/custom"
25+
analyzer_keyword "github.com/blevesearch/bleve/analysis/analyzer/keyword"
2226
"github.com/blevesearch/bleve/analysis/token/lowercase"
2327
"github.com/blevesearch/bleve/analysis/token/unicodenorm"
2428
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
2529
"github.com/blevesearch/bleve/index/upsidedown"
2630
"github.com/blevesearch/bleve/mapping"
2731
"github.com/blevesearch/bleve/search/query"
2832
"github.com/ethantkoenig/rupture"
33+
"github.com/src-d/enry/v2"
2934
)
3035

3136
const unicodeNormalizeName = "unicodeNormalize"
@@ -86,16 +91,23 @@ func openIndexer(path string, latestVersion int) (bleve.Index, error) {
8691

8792
// RepoIndexerData data stored in the repo indexer
8893
type RepoIndexerData struct {
89-
RepoID int64
90-
Content string
94+
RepoID int64
95+
CommitID string
96+
Content string
97+
Language string
98+
UpdatedAt time.Time
9199
}
92100

93101
// Type returns repoIndexerDocType, the document type name for
// RepoIndexerData, satisfying bleve's mapping.Classifier interface.
94102
func (d *RepoIndexerData) Type() string {
95103
return repoIndexerDocType
96104
}
97105

98-
func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
106+
func addUpdate(commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error {
107+
// Ignore vendored files in code search
108+
if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) {
109+
return nil
110+
}
99111
stdout, err := git.NewCommand("cat-file", "-s", update.BlobSha).
100112
RunInDir(repo.RepoPath())
101113
if err != nil {
@@ -118,8 +130,11 @@ func addUpdate(update fileUpdate, repo *models.Repository, batch rupture.Flushin
118130

119131
id := filenameIndexerID(repo.ID, update.Filename)
120132
return batch.Index(id, &RepoIndexerData{
121-
RepoID: repo.ID,
122-
Content: string(charset.ToUTF8DropErrors(fileContents)),
133+
RepoID: repo.ID,
134+
CommitID: commitSha,
135+
Content: string(charset.ToUTF8DropErrors(fileContents)),
136+
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
137+
UpdatedAt: time.Now().UTC(),
123138
})
124139
}
125140

@@ -131,7 +146,7 @@ func addDelete(filename string, repo *models.Repository, batch rupture.FlushingB
131146
const (
132147
repoIndexerAnalyzer = "repoIndexerAnalyzer"
133148
repoIndexerDocType = "repoIndexerDocType"
134-
repoIndexerLatestVersion = 4
149+
repoIndexerLatestVersion = 5
135150
)
136151

137152
// createRepoIndexer create a repo indexer if one does not already exist
@@ -145,11 +160,21 @@ func createRepoIndexer(path string, latestVersion int) (bleve.Index, error) {
145160
textFieldMapping.IncludeInAll = false
146161
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
147162

163+
termFieldMapping := bleve.NewTextFieldMapping()
164+
termFieldMapping.IncludeInAll = false
165+
termFieldMapping.Analyzer = analyzer_keyword.Name
166+
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
167+
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
168+
169+
timeFieldMapping := bleve.NewDateTimeFieldMapping()
170+
timeFieldMapping.IncludeInAll = false
171+
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
172+
148173
mapping := bleve.NewIndexMapping()
149174
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
150175
return nil, err
151176
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]interface{}{
152-
"type": custom.Name,
177+
"type": analyzer_custom.Name,
153178
"char_filters": []string{},
154179
"tokenizer": unicode.Name,
155180
"token_filters": []string{unicodeNormalizeName, lowercase.Name},
@@ -255,7 +280,7 @@ func (b *BleveIndexer) Index(repoID int64) error {
255280

256281
batch := rupture.NewFlushingBatch(b.indexer, maxBatchSize)
257282
for _, update := range changes.Updates {
258-
if err := addUpdate(update, repo, batch); err != nil {
283+
if err := addUpdate(sha, update, repo, batch); err != nil {
259284
return err
260285
}
261286
}
@@ -289,7 +314,7 @@ func (b *BleveIndexer) Delete(repoID int64) error {
289314

290315
// Search searches for files in the specified repo.
291316
// Returns the matching file-paths
292-
func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error) {
317+
func (b *BleveIndexer) Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error) {
293318
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
294319
phraseQuery.FieldVal = "Content"
295320
phraseQuery.Analyzer = repoIndexerAnalyzer
@@ -309,16 +334,35 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
309334
indexerQuery = phraseQuery
310335
}
311336

337+
// Save for reuse without language filter
338+
facetQuery := indexerQuery
339+
if len(language) > 0 {
340+
languageQuery := bleve.NewMatchQuery(language)
341+
languageQuery.FieldVal = "Language"
342+
languageQuery.Analyzer = analyzer_keyword.Name
343+
344+
indexerQuery = bleve.NewConjunctionQuery(
345+
indexerQuery,
346+
languageQuery,
347+
)
348+
}
349+
312350
from := (page - 1) * pageSize
313351
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
314-
searchRequest.Fields = []string{"Content", "RepoID"}
352+
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
315353
searchRequest.IncludeLocations = true
316354

355+
if len(language) == 0 {
356+
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
357+
}
358+
317359
result, err := b.indexer.Search(searchRequest)
318360
if err != nil {
319-
return 0, nil, err
361+
return 0, nil, nil, err
320362
}
321363

364+
total := int64(result.Total)
365+
322366
searchResults := make([]*SearchResult, len(result.Hits))
323367
for i, hit := range result.Hits {
324368
var startIndex, endIndex int = -1, -1
@@ -333,13 +377,47 @@ func (b *BleveIndexer) Search(repoIDs []int64, keyword string, page, pageSize in
333377
endIndex = locationEnd
334378
}
335379
}
380+
language := hit.Fields["Language"].(string)
381+
var updatedUnix timeutil.TimeStamp
382+
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
383+
updatedUnix = timeutil.TimeStamp(t.Unix())
384+
}
336385
searchResults[i] = &SearchResult{
337-
RepoID: int64(hit.Fields["RepoID"].(float64)),
338-
StartIndex: startIndex,
339-
EndIndex: endIndex,
340-
Filename: filenameOfIndexerID(hit.ID),
341-
Content: hit.Fields["Content"].(string),
386+
RepoID: int64(hit.Fields["RepoID"].(float64)),
387+
StartIndex: startIndex,
388+
EndIndex: endIndex,
389+
Filename: filenameOfIndexerID(hit.ID),
390+
Content: hit.Fields["Content"].(string),
391+
CommitID: hit.Fields["CommitID"].(string),
392+
UpdatedUnix: updatedUnix,
393+
Language: language,
394+
Color: enry.GetColor(language),
395+
}
396+
}
397+
398+
searchResultLanguages := make([]*SearchResultLanguages, 0, 10)
399+
if len(language) > 0 {
400+
// Use separate query to go get all language counts
401+
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
402+
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
403+
facetRequest.IncludeLocations = true
404+
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
405+
406+
if result, err = b.indexer.Search(facetRequest); err != nil {
407+
return 0, nil, nil, err
408+
}
409+
410+
}
411+
languagesFacet := result.Facets["languages"]
412+
for _, term := range languagesFacet.Terms {
413+
if len(term.Term) == 0 {
414+
continue
342415
}
416+
searchResultLanguages = append(searchResultLanguages, &SearchResultLanguages{
417+
Language: term.Term,
418+
Color: enry.GetColor(term.Term),
419+
Count: term.Count,
420+
})
343421
}
344-
return int64(result.Total), searchResults, nil
422+
return total, searchResults, searchResultLanguages, nil
345423
}

modules/indexer/code/bleve_test.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,27 +49,34 @@ func TestIndexAndSearch(t *testing.T) {
4949
keywords = []struct {
5050
Keyword string
5151
IDs []int64
52+
Langs int
5253
}{
5354
{
5455
Keyword: "Description",
5556
IDs: []int64{1},
57+
Langs: 1,
5658
},
5759
{
5860
Keyword: "repo1",
5961
IDs: []int64{1},
62+
Langs: 1,
6063
},
6164
{
6265
Keyword: "non-exist",
6366
IDs: []int64{},
67+
Langs: 0,
6468
},
6569
}
6670
)
6771

6872
for _, kw := range keywords {
69-
total, res, err := idx.Search(nil, kw.Keyword, 1, 10)
73+
total, res, langs, err := idx.Search(nil, "", kw.Keyword, 1, 10)
7074
assert.NoError(t, err)
7175
assert.EqualValues(t, len(kw.IDs), total)
7276

77+
assert.NotNil(t, langs)
78+
assert.Len(t, langs, kw.Langs)
79+
7380
var ids = make([]int64, 0, len(res))
7481
for _, hit := range res {
7582
ids = append(ids, hit.RepoID)

modules/indexer/code/indexer.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,34 @@ import (
1212
"code.gitea.io/gitea/modules/graceful"
1313
"code.gitea.io/gitea/modules/log"
1414
"code.gitea.io/gitea/modules/setting"
15+
"code.gitea.io/gitea/modules/timeutil"
1516
)
1617

1718
// SearchResult result of performing a search in a repo
1819
type SearchResult struct {
19-
RepoID int64
20-
StartIndex int
21-
EndIndex int
22-
Filename string
23-
Content string
20+
RepoID int64
21+
StartIndex int
22+
EndIndex int
23+
Filename string
24+
Content string
25+
CommitID string
26+
UpdatedUnix timeutil.TimeStamp
27+
Language string
28+
Color string
29+
}
30+
31+
// SearchResultLanguages is one entry of the top-languages breakdown that is
// returned alongside code search results.
32+
type SearchResultLanguages struct {
33+
// Language is the language name taken from the search facet term.
Language string
34+
// Color is the color enry reports for Language.
Color string
35+
// Count is the facet count for Language (presumably the number of
// matching indexed files in that language).
Count int
2436
}
2537

2638
// Indexer defines an interface to index and search repository code contents
2739
type Indexer interface {
2840
Index(repoID int64) error
2941
Delete(repoID int64) error
30-
Search(repoIDs []int64, keyword string, page, pageSize int) (int64, []*SearchResult, error)
42+
Search(repoIDs []int64, language, keyword string, page, pageSize int) (int64, []*SearchResult, []*SearchResultLanguages, error)
3143
Close()
3244
}
3345

0 commit comments

Comments
 (0)