Skip to content

Convert files to utf-8 for indexing #7814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Aug 15, 2019
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion models/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,14 +207,15 @@ func addUpdate(update fileUpdate, repo *Repository, batch rupture.FlushingBatch)
if err != nil {
return err
} else if !base.IsTextFile(fileContents) {
// FIXME: UTF-16 files will probably fail here
return nil
}
indexerUpdate := indexer.RepoIndexerUpdate{
Filepath: update.Filename,
Op: indexer.RepoIndexerOpUpdate,
Data: &indexer.RepoIndexerData{
RepoID: repo.ID,
Content: string(fileContents),
Content: string(base.ToUTF8DropErrors(fileContents)),
},
}
return indexerUpdate.AddToFlushingBatch(batch)
Expand Down
99 changes: 99 additions & 0 deletions modules/base/encoding.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// Copyright 2014 The Gogs Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package base

import (
"fmt"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
)

// ToUTF8WithErr converts content to a UTF-8 string and reports any problem
// encountered along the way.
//
// The charset label comes from DetectEncoding. If detection fails, an empty
// string and the detection error are returned. Content labelled "UTF-8" is
// only passed through RemoveBOMIfPresent. If no decoder is known for the
// label, the raw content is returned together with an error. Otherwise the
// content is decoded; on a decoding error the undecoded remainder of the
// input is appended to the decoded part and the error is returned alongside.
func ToUTF8WithErr(content []byte) (string, error) {
	label, err := DetectEncoding(content)
	if err != nil {
		return "", err
	}
	if label == "UTF-8" {
		return string(RemoveBOMIfPresent(content)), nil
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return string(content), fmt.Errorf("Unknown encoding: %s", label)
	}

	// Keep whatever was decoded and tack on the untouched tail of the input
	// when decoding stops early, so that no data is lost.
	decoded, consumed, err := transform.Bytes(enc.NewDecoder(), content)
	if err != nil {
		decoded = append(decoded, content[consumed:]...)
	}

	return string(RemoveBOMIfPresent(decoded)), err
}

// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8
// if possible.
//
// If detection fails, or the content is already labelled "UTF-8", the input
// is returned after RemoveBOMIfPresent. If no decoder is known for the
// detected label, the input is returned unchanged. Otherwise the content is
// decoded; when decoding stops with an error, the undecoded remainder of the
// input is appended to the decoded part so no data is lost.
//
// The decoded result is passed through RemoveBOMIfPresent on both the success
// and the error path, matching ToUTF8WithErr.
func ToUTF8WithFallback(content []byte) []byte {
	charsetLabel, err := DetectEncoding(content)
	if err != nil || charsetLabel == "UTF-8" {
		return RemoveBOMIfPresent(content)
	}

	encoding, _ := charset.Lookup(charsetLabel)
	if encoding == nil {
		return content
	}

	// If there is an error, we concatenate the nicely decoded part and the
	// original left over. This way we won't lose data.
	result, n, err := transform.Bytes(encoding.NewDecoder(), content)
	if err != nil {
		result = append(result, content[n:]...)
	}

	return RemoveBOMIfPresent(result)
}

// ToUTF8 converts content to UTF-8 via ToUTF8WithErr and discards the error,
// returning whatever string that call produced.
func ToUTF8(content string) string {
	converted, _ := ToUTF8WithErr([]byte(content))
	return converted
}

// ToUTF8DropErrors converts content to UTF-8 where possible, dropping input
// that cannot be decoded instead of keeping it verbatim.
//
// If detection fails, or the content is labelled "UTF-8", the input is
// returned after RemoveBOMIfPresent without further validation. If no decoder
// is known for the detected label, the input is returned unchanged. Otherwise
// the content is decoded; each time the decoder reports an error, a single
// space is emitted, one input byte past the failure point is skipped, and
// decoding resumes from there.
func ToUTF8DropErrors(content []byte) []byte {
	label, detectErr := DetectEncoding(content)
	if detectErr != nil || label == "UTF-8" {
		return RemoveBOMIfPresent(content)
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return content
	}

	// Non-decodable parts of the input are ignored, so some data may be lost.
	var out []byte
	dec := enc.NewDecoder()
	offset := 0
	for {
		chunk, consumed, decodeErr := transform.Bytes(dec, content[offset:])
		out = append(out, chunk...)
		if decodeErr == nil {
			break
		}

		// Stand in a space for the offending byte and step over it.
		out = append(out, ' ')
		offset += consumed + 1
		if offset >= len(content) {
			break
		}
	}

	return RemoveBOMIfPresent(out)
}
145 changes: 145 additions & 0 deletions modules/base/encoding_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package base

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestToUTF8WithErr runs ToUTF8WithErr over ASCII, UTF-8 (with and without a
// BOM), single-byte accented text, Shift-JIS and NUL-only input, and expects
// no error in every case.
func TestToUTF8WithErr(t *testing.T) {
	cases := []struct {
		input   []byte
		want    string
		pattern bool // when true, want is a regexp checked with assert.Regexp
	}{
		{
			input: []byte{0x41, 0x42, 0x43},
			want:  "ABC",
		},
		{
			input: []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba},
			want:  "áéíóú",
		},
		{
			// Same text preceded by a UTF-8 BOM.
			input: []byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba},
			want:  "áéíóú",
		},
		{
			// This test FAILS
			input: []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73},
			want:  "Hola, así cómo ños",
		},
		{
			// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
			input:   []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73},
			want:    "^Hola, así cómo",
			pattern: true,
		},
		{
			// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
			input:   []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73},
			want:    "^Hola, así cómo",
			pattern: true,
		},
		{
			// Japanese (Shift-JIS)
			input: []byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42},
			want:  "日属秘ぞしちゅ。",
		},
		{
			input: []byte{0x00, 0x00, 0x00, 0x00},
			want:  "\x00\x00\x00\x00",
		},
	}

	for _, tc := range cases {
		got, err := ToUTF8WithErr(tc.input)
		if tc.pattern {
			assert.Regexp(t, tc.want, got)
		} else {
			assert.Equal(t, tc.want, got)
		}
		assert.NoError(t, err)
	}
}

// TestToUTF8WithFallback runs ToUTF8WithFallback over ASCII, UTF-8 (with and
// without a BOM), single-byte accented text, Shift-JIS and NUL-only input.
func TestToUTF8WithFallback(t *testing.T) {
	minmatch := []byte("Hola, así cómo ")

	cases := []struct {
		input      []byte
		want       []byte
		prefixOnly bool // when true, only the first len(want) bytes are compared
	}{
		{
			input: []byte{0x41, 0x42, 0x43},
			want:  []byte("ABC"),
		},
		{
			input: []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba},
			want:  []byte("áéíóú"),
		},
		{
			// Same text preceded by a UTF-8 BOM.
			input: []byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba},
			want:  []byte("áéíóú"),
		},
		{
			input: []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73},
			want:  []byte("Hola, así cómo ños"),
		},
		{
			// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
			input:      []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73},
			want:       minmatch,
			prefixOnly: true,
		},
		{
			// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
			input:      []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73},
			want:       minmatch,
			prefixOnly: true,
		},
		{
			// Japanese (Shift-JIS)
			input: []byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42},
			want:  []byte("日属秘ぞしちゅ。"),
		},
		{
			input: []byte{0x00, 0x00, 0x00, 0x00},
			want:  []byte{0x00, 0x00, 0x00, 0x00},
		},
	}

	for _, tc := range cases {
		got := ToUTF8WithFallback(tc.input)
		if tc.prefixOnly {
			assert.Equal(t, tc.want, got[0:len(tc.want)])
		} else {
			assert.Equal(t, tc.want, got)
		}
	}
}

// TestToUTF8 runs the string-based ToUTF8 wrapper over ASCII, UTF-8 (with and
// without a BOM), text containing a control byte, Shift-JIS and NUL-only
// input.
func TestToUTF8(t *testing.T) {
	cases := []struct {
		input   string
		want    string
		pattern bool // when true, want is a regexp checked with assert.Regexp
	}{
		{input: "ABC", want: "ABC"},
		{input: "áéíóú", want: "áéíóú"},
		// With utf-8 BOM
		{input: "\ufeffáéíóú", want: "áéíóú"},
		{input: "Hola, así cómo ños", want: "Hola, así cómo ños"},
		// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
		{input: "Hola, así cómo \x07ños", want: "^Hola, así cómo", pattern: true},

		// This test FAILS
		// {input: "Hola, así cómo \x81ños", want: "^Hola, así cómo", pattern: true},
		// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those

		// Japanese (Shift-JIS)
		{input: "\x93\xFA\x91\xAE\x94\xE9\x82\xBC\x82\xB5\x82\xBF\x82\xE3\x81\x42", want: "日属秘ぞしちゅ。"},
		{input: "\x00\x00\x00\x00", want: "\x00\x00\x00\x00"},
	}

	for _, tc := range cases {
		got := ToUTF8(tc.input)
		if tc.pattern {
			assert.Regexp(t, tc.want, got)
		} else {
			assert.Equal(t, tc.want, got)
		}
	}
}

// TestToUTF8DropErrors runs ToUTF8DropErrors over ASCII, UTF-8 (with and
// without a BOM), single-byte accented text, Shift-JIS and NUL-only input.
func TestToUTF8DropErrors(t *testing.T) {
	minmatch := []byte("Hola, así cómo ")

	cases := []struct {
		input      []byte
		want       []byte
		prefixOnly bool // when true, only the first len(want) bytes are compared
	}{
		{
			input: []byte{0x41, 0x42, 0x43},
			want:  []byte("ABC"),
		},
		{
			input: []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba},
			want:  []byte("áéíóú"),
		},
		{
			// Same text preceded by a UTF-8 BOM.
			input: []byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba},
			want:  []byte("áéíóú"),
		},
		{
			input: []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73},
			want:  []byte("Hola, así cómo ños"),
		},
		{
			// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
			input:      []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73},
			want:       minmatch,
			prefixOnly: true,
		},
		{
			// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
			input:      []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73},
			want:       minmatch,
			prefixOnly: true,
		},
		{
			// Japanese (Shift-JIS)
			input: []byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42},
			want:  []byte("日属秘ぞしちゅ。"),
		},
		{
			input: []byte{0x00, 0x00, 0x00, 0x00},
			want:  []byte{0x00, 0x00, 0x00, 0x00},
		},
	}

	for _, tc := range cases {
		got := ToUTF8DropErrors(tc.input)
		if tc.prefixOnly {
			assert.Equal(t, tc.want, got[0:len(tc.want)])
		} else {
			assert.Equal(t, tc.want, got)
		}
	}
}
2 changes: 1 addition & 1 deletion modules/indexer/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"

repoIndexerLatestVersion = 2
repoIndexerLatestVersion = 3
)

// repoIndexer (thread-safe) index for repository contents
Expand Down
56 changes: 0 additions & 56 deletions modules/templates/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,6 @@ import (
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/setting"

"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"gopkg.in/editorconfig/editorconfig-core-go.v1"
)

Expand Down Expand Up @@ -274,60 +272,6 @@ func Sha1(str string) string {
return base.EncodeSha1(str)
}

// ToUTF8WithErr converts content to UTF8 encoding.
//
// It returns an empty string and the detection error when
// base.DetectEncoding fails, the raw content plus an error when no decoder is
// known for the detected label, and otherwise the decoded text together with
// any decoding error.
func ToUTF8WithErr(content []byte) (string, error) {
charsetLabel, err := base.DetectEncoding(content)
if err != nil {
return "", err
} else if charsetLabel == "UTF-8" {
// Already UTF-8: no decoding, only the BOM helper is applied.
return string(base.RemoveBOMIfPresent(content)), nil
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return string(content), fmt.Errorf("Unknown encoding: %s", charsetLabel)
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
result = append(result, content[n:]...)
}

result = base.RemoveBOMIfPresent(result)

return string(result), err
}

// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible.
//
// When detection fails or the label is "UTF-8", the input is returned after
// base.RemoveBOMIfPresent; when no decoder is known for the label, the input
// is returned unchanged.
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return base.RemoveBOMIfPresent(content)
}

encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}

// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
// Note: this path returns without calling base.RemoveBOMIfPresent.
return append(result, content[n:]...)
}

return base.RemoveBOMIfPresent(result)
}

// ToUTF8 converts content to UTF8 encoding and ignores the error returned by
// ToUTF8WithErr, passing through whatever string that call produced.
func ToUTF8(content string) string {
res, _ := ToUTF8WithErr([]byte(content))
return res
}

// ReplaceLeft replaces all prefixes 'oldS' in 's' with 'newS'.
func ReplaceLeft(s, oldS, newS string) string {
oldLen, newLen, i, n := len(oldS), len(newS), 0, 0
Expand Down
3 changes: 1 addition & 2 deletions routers/repo/commit.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ import (
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/templates"
)

const (
Expand Down Expand Up @@ -251,7 +250,7 @@ func Diff(ctx *context.Context) {
note := &git.Note{}
err = git.GetNote(ctx.Repo.GitRepo, commitID, note)
if err == nil {
ctx.Data["Note"] = string(templates.ToUTF8WithFallback(note.Message))
ctx.Data["Note"] = string(base.ToUTF8WithFallback(note.Message))
ctx.Data["NoteCommit"] = note.Commit
ctx.Data["NoteAuthor"] = models.ValidateCommitWithEmail(note.Commit)
}
Expand Down
3 changes: 1 addition & 2 deletions routers/repo/editor.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/repofiles"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/templates"
"code.gitea.io/gitea/modules/upload"
"code.gitea.io/gitea/modules/util"
)
Expand Down Expand Up @@ -118,7 +117,7 @@ func editFile(ctx *context.Context, isNewFile bool) {

d, _ := ioutil.ReadAll(dataRc)
buf = append(buf, d...)
if content, err := templates.ToUTF8WithErr(buf); err != nil {
if content, err := base.ToUTF8WithErr(buf); err != nil {
log.Error("ToUTF8WithErr: %v", err)
ctx.Data["FileContent"] = string(buf)
} else {
Expand Down
Loading