Skip to content

Commit a83d597

Browse files
authored
Merge different languages for language stats (#24900) (#24921)
Backport #24900. Fix #24896. If users set different languages via `linguist-language`, the `stats` map could contain names that differ only by case, e.g. `java: 100, Java: 200`. Language stats are stored case-insensitively in the database, where there is a unique key, so names that differ only by case must be merged into one unique name: `Java: 300`.
1 parent c5dee88 commit a83d597

4 files changed

+58
-5
lines changed

modules/git/repo_language_stats.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,46 @@
33

44
package git
55

6+
import (
7+
"strings"
8+
"unicode"
9+
)
10+
611
// Size constants, both typed int64.
const (
	fileSizeLimit int64 = 16 << 10 // 16 KiB
	bigFileSize   int64 = 1 << 20  // 1 MiB
)
15+
16+
// mergeLanguageStats merges language names that differ only by letter case
// into a single entry whose value is the sum of the merged values.
//
// For each group of names that are equal after strings.ToLower, the name with
// the most upper-case letters is kept. If several names tie on that count, the
// lexicographically smallest one is kept, so the result does not depend on the
// (randomized) map iteration order.
//
// The input map is not modified; a new, non-nil map is always returned.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
	type candidate struct {
		uniqueName string
		upperCount int
	}

	countUpper := func(s string) (count int) {
		for _, r := range s {
			if unicode.IsUpper(r) {
				count++
			}
		}
		return count
	}

	// First pass: choose the canonical spelling for every lower-cased name.
	names := make(map[string]candidate, len(stats))
	for name := range stats {
		cnt := countUpper(name)
		lower := strings.ToLower(name)
		best, seen := names[lower]
		// The explicit "seen" check records names without any upper-case letter;
		// the name comparison breaks ties deterministically.
		if !seen || cnt > best.upperCount || (cnt == best.upperCount && name < best.uniqueName) {
			names[lower] = candidate{uniqueName: name, upperCount: cnt}
		}
	}

	// Second pass: accumulate every value under its canonical spelling.
	res := make(map[string]int64, len(names))
	for name, num := range stats {
		res[names[strings.ToLower(name)].uniqueName] += num
	}
	return res
}

modules/git/repo_language_stats_gogit.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
156156
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
157157
}
158158

159-
return sizes, nil
159+
return mergeLanguageStats(sizes), nil
160160
}
161161

162162
func readFile(f *object.File, limit int64) ([]byte, error) {

modules/git/repo_language_stats_nogogit.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
180180
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
181181
// - e.g. do all the detection tests using the filename first, before reading the content.
182182
language := analyze.GetCodeLanguage(f.Name(), content)
183-
if language == enry.OtherLanguage || language == "" {
183+
if language == "" {
184184
continue
185185
}
186186

@@ -192,8 +192,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
192192

193193
included, checked := includedLanguage[language]
194194
if !checked {
195-
langtype := enry.GetLanguageType(language)
196-
included = langtype == enry.Programming || langtype == enry.Markup
195+
langType := enry.GetLanguageType(language)
196+
included = langType == enry.Programming || langType == enry.Markup
197197
includedLanguage[language] = included
198198
}
199199
if included {
@@ -210,7 +210,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
210210
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
211211
}
212212

213-
return sizes, nil
213+
return mergeLanguageStats(sizes), nil
214214
}
215215

216216
func discardFull(rd *bufio.Reader, discard int64) error {

modules/git/repo_language_stats_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,17 @@ func TestRepository_GetLanguageStats(t *testing.T) {
3030
"Java": 112,
3131
}, stats)
3232
}
33+
34+
func TestMergeLanguageStats(t *testing.T) {
35+
assert.EqualValues(t, map[string]int64{
36+
"PHP": 1,
37+
"python": 10,
38+
"JAVA": 700,
39+
}, mergeLanguageStats(map[string]int64{
40+
"PHP": 1,
41+
"python": 10,
42+
"Java": 100,
43+
"java": 200,
44+
"JAVA": 400,
45+
}))
46+
}

0 commit comments

Comments
 (0)