Skip to content

Commit a3f403f

Browse files
authored
Add option to disable ambiguous unicode characters detection (#28454) (#28499)
Backport #28454 (the only conflict is caused by some comments) * Close #24483 * Close #28123 * Close #23682 * Close #23149
1 parent 8ee1ed8 commit a3f403f

File tree

17 files changed

+110
-146
lines changed

17 files changed

+110
-146
lines changed

custom/conf/app.example.ini

+3
Original file line numberDiff line numberDiff line change
@@ -1210,6 +1210,9 @@ LEVEL = Info
12101210
;; Max size of files to be displayed (default is 8MiB)
12111211
;MAX_DISPLAY_FILE_SIZE = 8388608
12121212
;;
1213+
;; Detect ambiguous unicode characters in file contents and show warnings on the UI
1214+
;AMBIGUOUS_UNICODE_DETECTION = true
1215+
;;
12131216
;; Whether the email of the user should be shown in the Explore Users page
12141217
;SHOW_USER_EMAIL = true
12151218
;;

docs/content/administration/config-cheat-sheet.en-us.md

+1
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ The following configuration set `Content-Type: application/vnd.android.package-a
220220
- `THEMES`: **auto,gitea,arc-green**: All available themes. Allow users to select personalized themes
221221
regardless of the value of `DEFAULT_THEME`.
222222
- `MAX_DISPLAY_FILE_SIZE`: **8388608**: Max size of files to be displayed (default is 8MiB)
223+
- `AMBIGUOUS_UNICODE_DETECTION`: **true**: Detect ambiguous unicode characters in file contents and show warnings on the UI
223224
- `REACTIONS`: All available reactions users can choose on issues/prs and comments
224225
Values can be emoji alias (:smile:) or a unicode emoji.
225226
For custom reactions, add a tightly cropped square image to public/assets/img/emoji/reaction_name.png

modules/charset/escape.go

+10-49
Original file line numberDiff line numberDiff line change
@@ -8,32 +8,31 @@
88
package charset
99

1010
import (
11-
"bufio"
11+
"html/template"
1212
"io"
1313
"strings"
1414

1515
"code.gitea.io/gitea/modules/log"
16+
"code.gitea.io/gitea/modules/setting"
1617
"code.gitea.io/gitea/modules/translation"
1718
)
1819

1920
// RuneNBSP is the codepoint for NBSP
2021
const RuneNBSP = 0xa0
2122

2223
// EscapeControlHTML escapes the unicode control sequences in a provided html document
23-
func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
24+
func EscapeControlHTML(html template.HTML, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output template.HTML) {
2425
sb := &strings.Builder{}
25-
outputStream := &HTMLStreamerWriter{Writer: sb}
26-
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
27-
28-
if err := StreamHTML(strings.NewReader(text), streamer); err != nil {
29-
streamer.escaped.HasError = true
30-
log.Error("Error whilst escaping: %v", err)
31-
}
32-
return streamer.escaped, sb.String()
26+
escaped, _ = EscapeControlReader(strings.NewReader(string(html)), sb, locale, allowed...) // err has been handled in EscapeControlReader
27+
return escaped, template.HTML(sb.String())
3328
}
3429

35-
// EscapeControlReaders escapes the unicode control sequences in a provided reader of HTML content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte
30+
// EscapeControlReader escapes the unicode control sequences in HTML content read from the provided reader, writes the result to the provided writer using the given locale, and returns the findings as an EscapeStatus
3631
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
32+
if !setting.UI.AmbiguousUnicodeDetection {
33+
_, err = io.Copy(writer, reader)
34+
return &EscapeStatus{}, err
35+
}
3736
outputStream := &HTMLStreamerWriter{Writer: writer}
3837
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
3938

@@ -43,41 +42,3 @@ func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.
4342
}
4443
return streamer.escaped, err
4544
}
46-
47-
// EscapeControlStringReader escapes the unicode control sequences in a provided reader of string content and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte. HTML line breaks are not inserted after every newline by this method.
48-
func EscapeControlStringReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
49-
bufRd := bufio.NewReader(reader)
50-
outputStream := &HTMLStreamerWriter{Writer: writer}
51-
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
52-
53-
for {
54-
line, rdErr := bufRd.ReadString('\n')
55-
if len(line) > 0 {
56-
if err := streamer.Text(line); err != nil {
57-
streamer.escaped.HasError = true
58-
log.Error("Error whilst escaping: %v", err)
59-
return streamer.escaped, err
60-
}
61-
}
62-
if rdErr != nil {
63-
if rdErr != io.EOF {
64-
err = rdErr
65-
}
66-
break
67-
}
68-
}
69-
return streamer.escaped, err
70-
}
71-
72-
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
73-
func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
74-
sb := &strings.Builder{}
75-
outputStream := &HTMLStreamerWriter{Writer: sb}
76-
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
77-
78-
if err := streamer.Text(text); err != nil {
79-
streamer.escaped.HasError = true
80-
log.Error("Error whilst escaping: %v", err)
81-
}
82-
return streamer.escaped, sb.String()
83-
}

modules/charset/escape_stream.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ func (e *escapeStreamer) Text(data string) error {
6464
until, next = nextIdxs[0]+pos, nextIdxs[1]+pos
6565
}
6666

67-
// from pos until until we know that the runes are not \r\t\n or even ' '
67+
// from pos until we know that the runes are not \r\t\n or even ' '
6868
runes := make([]rune, 0, next-until)
6969
positions := make([]int, 0, next-until+1)
7070

modules/charset/escape_test.go

+16-36
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,14 @@
44
package charset
55

66
import (
7-
"reflect"
87
"strings"
98
"testing"
109

10+
"code.gitea.io/gitea/modules/setting"
11+
"code.gitea.io/gitea/modules/test"
1112
"code.gitea.io/gitea/modules/translation"
13+
14+
"github.com/stretchr/testify/assert"
1215
)
1316

1417
type escapeControlTest struct {
@@ -132,22 +135,8 @@ then resh (ר), and finally heh (ה) (which should appear leftmost).`,
132135
},
133136
}
134137

135-
func TestEscapeControlString(t *testing.T) {
136-
for _, tt := range escapeControlTests {
137-
t.Run(tt.name, func(t *testing.T) {
138-
status, result := EscapeControlString(tt.text, &translation.MockLocale{})
139-
if !reflect.DeepEqual(*status, tt.status) {
140-
t.Errorf("EscapeControlString() status = %v, wanted= %v", status, tt.status)
141-
}
142-
if result != tt.result {
143-
t.Errorf("EscapeControlString()\nresult= %v,\nwanted= %v", result, tt.result)
144-
}
145-
})
146-
}
147-
}
148-
149138
func TestEscapeControlReader(t *testing.T) {
150-
// lets add some control characters to the tests
139+
// add some control characters to the tests
151140
tests := make([]escapeControlTest, 0, len(escapeControlTests)*3)
152141
copy(tests, escapeControlTests)
153142

@@ -169,29 +158,20 @@ func TestEscapeControlReader(t *testing.T) {
169158

170159
for _, tt := range tests {
171160
t.Run(tt.name, func(t *testing.T) {
172-
input := strings.NewReader(tt.text)
173161
output := &strings.Builder{}
174-
status, err := EscapeControlReader(input, output, &translation.MockLocale{})
175-
result := output.String()
176-
if err != nil {
177-
t.Errorf("EscapeControlReader(): err = %v", err)
178-
}
179-
180-
if !reflect.DeepEqual(*status, tt.status) {
181-
t.Errorf("EscapeControlReader() status = %v, wanted= %v", status, tt.status)
182-
}
183-
if result != tt.result {
184-
t.Errorf("EscapeControlReader()\nresult= %v,\nwanted= %v", result, tt.result)
185-
}
162+
status, err := EscapeControlReader(strings.NewReader(tt.text), output, &translation.MockLocale{})
163+
assert.NoError(t, err)
164+
assert.Equal(t, tt.status, *status)
165+
assert.Equal(t, tt.result, output.String())
186166
})
187167
}
188168
}
189169

190-
func TestEscapeControlReader_panic(t *testing.T) {
191-
bs := make([]byte, 0, 20479)
192-
bs = append(bs, 'A')
193-
for i := 0; i < 6826; i++ {
194-
bs = append(bs, []byte("—")...)
195-
}
196-
_, _ = EscapeControlString(string(bs), &translation.MockLocale{})
170+
func TestSettingAmbiguousUnicodeDetection(t *testing.T) {
171+
defer test.MockVariableValue(&setting.UI.AmbiguousUnicodeDetection, true)()
172+
_, out := EscapeControlHTML("a test", &translation.MockLocale{})
173+
assert.EqualValues(t, `a<span class="escaped-code-point" data-escaped="[U+00A0]"><span class="char"> </span></span>test`, out)
174+
setting.UI.AmbiguousUnicodeDetection = false
175+
_, out = EscapeControlHTML("a test", &translation.MockLocale{})
176+
assert.EqualValues(t, `a test`, out)
197177
}

modules/git/command.go

+3-8
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ import (
1414
"os/exec"
1515
"strings"
1616
"time"
17-
"unsafe"
1817

1918
"code.gitea.io/gitea/modules/git/internal" //nolint:depguard // only this file can use the internal type CmdArg, other files and packages should use AddXxx functions
2019
"code.gitea.io/gitea/modules/log"
@@ -389,15 +388,11 @@ func (r *runStdError) IsExitCode(code int) bool {
389388
return false
390389
}
391390

392-
func bytesToString(b []byte) string {
393-
return *(*string)(unsafe.Pointer(&b)) // that's what Golang's strings.Builder.String() does (go/src/strings/builder.go)
394-
}
395-
396391
// RunStdString runs the command with options and returns stdout/stderr as strings, and stores stderr in the returned error (err combined with stderr).
397392
func (c *Command) RunStdString(opts *RunOpts) (stdout, stderr string, runErr RunStdError) {
398393
stdoutBytes, stderrBytes, err := c.RunStdBytes(opts)
399-
stdout = bytesToString(stdoutBytes)
400-
stderr = bytesToString(stderrBytes)
394+
stdout = util.UnsafeBytesToString(stdoutBytes)
395+
stderr = util.UnsafeBytesToString(stderrBytes)
401396
if err != nil {
402397
return stdout, stderr, &runStdError{err: err, stderr: stderr}
403398
}
@@ -432,7 +427,7 @@ func (c *Command) RunStdBytes(opts *RunOpts) (stdout, stderr []byte, runErr RunS
432427
err := c.Run(newOpts)
433428
stderr = stderrBuf.Bytes()
434429
if err != nil {
435-
return nil, stderr, &runStdError{err: err, stderr: bytesToString(stderr)}
430+
return nil, stderr, &runStdError{err: err, stderr: util.UnsafeBytesToString(stderr)}
436431
}
437432
// even if there is no err, there could still be some stderr output
438433
return stdoutBuf.Bytes(), stderr, nil

modules/highlight/highlight.go

+14-15
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"bytes"
1010
"fmt"
1111
gohtml "html"
12+
"html/template"
1213
"io"
1314
"path/filepath"
1415
"strings"
@@ -55,7 +56,7 @@ func NewContext() {
5556
}
5657

5758
// Code returns an HTML version of code string with chroma syntax highlighting classes and the matched lexer name
58-
func Code(fileName, language, code string) (string, string) {
59+
func Code(fileName, language, code string) (output template.HTML, lexerName string) {
5960
NewContext()
6061

6162
// diff view newline will be passed as empty, change to literal '\n' so it can be copied
@@ -65,7 +66,7 @@ func Code(fileName, language, code string) (string, string) {
6566
}
6667

6768
if len(code) > sizeLimit {
68-
return code, ""
69+
return template.HTML(template.HTMLEscapeString(code)), ""
6970
}
7071

7172
var lexer chroma.Lexer
@@ -102,13 +103,11 @@ func Code(fileName, language, code string) (string, string) {
102103
cache.Add(fileName, lexer)
103104
}
104105

105-
lexerName := formatLexerName(lexer.Config().Name)
106-
107-
return CodeFromLexer(lexer, code), lexerName
106+
return CodeFromLexer(lexer, code), formatLexerName(lexer.Config().Name)
108107
}
109108

110109
// CodeFromLexer returns an HTML version of code string with chroma syntax highlighting classes
111-
func CodeFromLexer(lexer chroma.Lexer, code string) string {
110+
func CodeFromLexer(lexer chroma.Lexer, code string) template.HTML {
112111
formatter := html.New(html.WithClasses(true),
113112
html.WithLineNumbers(false),
114113
html.PreventSurroundingPre(true),
@@ -120,23 +119,23 @@ func CodeFromLexer(lexer chroma.Lexer, code string) string {
120119
iterator, err := lexer.Tokenise(nil, code)
121120
if err != nil {
122121
log.Error("Can't tokenize code: %v", err)
123-
return code
122+
return template.HTML(template.HTMLEscapeString(code))
124123
}
125124
// style not used for live site but need to pass something
126125
err = formatter.Format(htmlw, githubStyles, iterator)
127126
if err != nil {
128127
log.Error("Can't format code: %v", err)
129-
return code
128+
return template.HTML(template.HTMLEscapeString(code))
130129
}
131130

132131
_ = htmlw.Flush()
133132
// Chroma will add newlines for certain lexers in order to highlight them properly
134133
// Once highlighted, strip them here, so they don't cause copy/paste trouble in HTML output
135-
return strings.TrimSuffix(htmlbuf.String(), "\n")
134+
return template.HTML(strings.TrimSuffix(htmlbuf.String(), "\n"))
136135
}
137136

138137
// File returns a slice of chroma syntax highlighted HTML lines of code and the matched lexer name
139-
func File(fileName, language string, code []byte) ([]string, string, error) {
138+
func File(fileName, language string, code []byte) ([]template.HTML, string, error) {
140139
NewContext()
141140

142141
if len(code) > sizeLimit {
@@ -183,24 +182,24 @@ func File(fileName, language string, code []byte) ([]string, string, error) {
183182
tokensLines := chroma.SplitTokensIntoLines(iterator.Tokens())
184183
htmlBuf := &bytes.Buffer{}
185184

186-
lines := make([]string, 0, len(tokensLines))
185+
lines := make([]template.HTML, 0, len(tokensLines))
187186
for _, tokens := range tokensLines {
188187
iterator = chroma.Literator(tokens...)
189188
err = formatter.Format(htmlBuf, githubStyles, iterator)
190189
if err != nil {
191190
return nil, "", fmt.Errorf("can't format code: %w", err)
192191
}
193-
lines = append(lines, htmlBuf.String())
192+
lines = append(lines, template.HTML(htmlBuf.String()))
194193
htmlBuf.Reset()
195194
}
196195

197196
return lines, lexerName, nil
198197
}
199198

200199
// PlainText returns non-highlighted HTML for code
201-
func PlainText(code []byte) []string {
200+
func PlainText(code []byte) []template.HTML {
202201
r := bufio.NewReader(bytes.NewReader(code))
203-
m := make([]string, 0, bytes.Count(code, []byte{'\n'})+1)
202+
m := make([]template.HTML, 0, bytes.Count(code, []byte{'\n'})+1)
204203
for {
205204
content, err := r.ReadString('\n')
206205
if err != nil && err != io.EOF {
@@ -210,7 +209,7 @@ func PlainText(code []byte) []string {
210209
if content == "" && err == io.EOF {
211210
break
212211
}
213-
s := gohtml.EscapeString(content)
212+
s := template.HTML(gohtml.EscapeString(content))
214213
m = append(m, s)
215214
}
216215
return m

0 commit comments

Comments
 (0)