Skip to content

Commit 81702e6

Browse files
lafriks authored and lunny committed
Detect charset and convert non UTF-8 files for display (#4950)
* Detect charset and convert non UTF-8 files for display
* Refactor and move function to correct module
* Revert unrelated changes
* More unrelated changes
* Duplicate content for small text to have better encoding detection
* Check if original content is valid before duplicating it
1 parent 6780661 commit 81702e6

File tree

3 files changed

+44
-4
lines changed

3 files changed

+44
-4
lines changed

modules/base/tool.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) {
5959
return "UTF-8", nil
6060
}
6161

62-
result, err := chardet.NewTextDetector().DetectBest(content)
62+
textDetector := chardet.NewTextDetector()
63+
var detectContent []byte
64+
if len(content) < 1024 {
65+
// Check if original content is valid
66+
if _, err := textDetector.DetectBest(content); err != nil {
67+
return "", err
68+
}
69+
times := 1024 / len(content)
70+
detectContent = make([]byte, 0, times*len(content))
71+
for i := 0; i < times; i++ {
72+
detectContent = append(detectContent, content...)
73+
}
74+
} else {
75+
detectContent = content
76+
}
77+
result, err := textDetector.DetectBest(detectContent)
6378
if err != nil {
6479
return "", err
6580
}

modules/templates/helper.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
// Copyright 2018 The Gitea Authors. All rights reserved.
12
// Copyright 2014 The Gogs Authors. All rights reserved.
23
// Use of this source code is governed by a MIT-style
34
// license that can be found in the LICENSE file.
@@ -275,7 +276,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
275276
}
276277

277278
// If there is an error, we concatenate the nicely decoded part and the
278-
// original left over. This way we won't loose data.
279+
// original left over. This way we won't lose data.
279280
result, n, err := transform.String(encoding.NewDecoder(), string(content))
280281
if err != nil {
281282
result = result + string(content[n:])
@@ -284,6 +285,28 @@ func ToUTF8WithErr(content []byte) (string, error) {
284285
return result, err
285286
}
286287

288+
// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8
// if possible. The input is returned unchanged when detection fails, when the
// detected charset is already "UTF-8", or when charset.Lookup yields no
// encoding for the detected label. If decoding stops with an error, the
// decoded prefix is returned followed by the undecoded remainder of content.
289+
func ToUTF8WithFallback(content []byte) []byte {
290+
charsetLabel, err := base.DetectEncoding(content)
291+
if err != nil || charsetLabel == "UTF-8" {
292+
return content
293+
}
294+
295+
encoding, _ := charset.Lookup(charsetLabel)
296+
// Lookup returned no encoding for this label, so leave the bytes untouched.
if encoding == nil {
297+
return content
298+
}
299+
300+
// If there is an error, we concatenate the nicely decoded part and the
301+
// original left over. This way we won't lose data.
302+
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
303+
if err != nil {
304+
return append(result, content[n:]...)
305+
}
306+
307+
return result
308+
}
309+
287310
// ToUTF8 converts content to UTF8 encoding and ignore error
288311
func ToUTF8(content string) string {
289312
res, _ := ToUTF8WithErr([]byte(content))

routers/repo/view.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"code.gitea.io/gitea/modules/markup"
2626
"code.gitea.io/gitea/modules/setting"
2727
"code.gitea.io/gitea/modules/templates"
28+
2829
"github.com/Unknwon/paginater"
2930
)
3031

@@ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) {
99100
ctx.Data["FileSize"] = readmeFile.Size()
100101
} else {
101102
d, _ := ioutil.ReadAll(dataRc)
102-
buf = append(buf, d...)
103+
buf = templates.ToUTF8WithFallback(append(buf, d...))
104+
103105
if markup.Type(readmeFile.Name()) != "" {
104106
ctx.Data["IsMarkup"] = true
105107
ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas()))
@@ -203,7 +205,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
203205
}
204206

205207
d, _ := ioutil.ReadAll(dataRc)
206-
buf = append(buf, d...)
208+
buf = templates.ToUTF8WithFallback(append(buf, d...))
207209

208210
readmeExist := markup.IsReadmeFile(blob.Name())
209211
ctx.Data["ReadmeExist"] = readmeExist

0 commit comments

Comments
 (0)