Skip to content

Commit 2628b15

Browse files
guillep2k authored and techknowlogick committed
Fix utf8 tests (#8192)
* Prevent compiler environment from making the tests fail
* Remove unused function
* Pass lint
1 parent 73f7e82 commit 2628b15

File tree

2 files changed

+81
-35
lines changed

2 files changed

+81
-35
lines changed

modules/charset/charset.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
3535
}
3636

3737
// If there is an error, we concatenate the nicely decoded part and the
38-
// original left over. This way we won't lose data.
38+
// original left over. This way we won't lose much data.
3939
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
4040
if err != nil {
4141
result = append(result, content[n:]...)

modules/charset/charset_test.go

Lines changed: 80 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -24,41 +24,55 @@ func TestToUTF8WithErr(t *testing.T) {
2424
var res string
2525
var err error
2626

27+
// Note: golang compiler seems to behave differently depending on the current
28+
// locale, so some conversions might behave differently. For that reason, we don't
29+
// depend on particular conversions but on expected behaviors.
30+
2731
res, err = ToUTF8WithErr([]byte{0x41, 0x42, 0x43})
28-
assert.Equal(t, "ABC", res)
2932
assert.NoError(t, err)
33+
assert.Equal(t, "ABC", res)
3034

35+
// "áéíóú"
3136
res, err = ToUTF8WithErr([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
32-
assert.Equal(t, "áéíóú", res)
3337
assert.NoError(t, err)
38+
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
3439

35-
res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba})
36-
assert.Equal(t, "áéíóú", res)
40+
// "áéíóú"
41+
res, err = ToUTF8WithErr([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
42+
0xc3, 0xba})
3743
assert.NoError(t, err)
44+
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
3845

39-
// This test FAILS
40-
res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
41-
assert.Equal(t, "Hola, así cómo ños", res)
46+
res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
47+
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
4248
assert.NoError(t, err)
49+
stringMustStartWith(t, "Hola,", res)
50+
stringMustEndWith(t, "AAA.", res)
4351

44-
res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73})
45-
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
46-
assert.Regexp(t, "^Hola, así cómo", res)
52+
res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
53+
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
4754
assert.NoError(t, err)
55+
stringMustStartWith(t, "Hola,", res)
56+
stringMustEndWith(t, "AAA.", res)
4857

49-
res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73})
50-
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
51-
assert.Regexp(t, "^Hola, así cómo", res)
58+
res, err = ToUTF8WithErr([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
59+
0xF3, 0x6D, 0x6F, 0x20, 0x81, 0xA4, 0x6F, 0x73, 0x41, 0x41, 0x41, 0x2e})
5260
assert.NoError(t, err)
61+
stringMustStartWith(t, "Hola,", res)
62+
stringMustEndWith(t, "AAA.", res)
5363

5464
// Japanese (Shift-JIS)
55-
res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82, 0xBF, 0x82, 0xE3, 0x81, 0x42})
56-
assert.Equal(t, "日属秘ぞしちゅ。", res)
65+
// 日属秘ぞしちゅ。
66+
res, err = ToUTF8WithErr([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
67+
0xBF, 0x82, 0xE3, 0x81, 0x42})
5768
assert.NoError(t, err)
69+
assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
70+
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
71+
[]byte(res))
5872

5973
res, err = ToUTF8WithErr([]byte{0x00, 0x00, 0x00, 0x00})
60-
assert.Equal(t, "\x00\x00\x00\x00", res)
6174
assert.NoError(t, err)
75+
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
6276
}
6377

6478
func TestToUTF8WithFallback(t *testing.T) {
@@ -75,8 +89,10 @@ func TestToUTF8WithFallback(t *testing.T) {
7589
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, res)
7690

7791
// "Hola, así cómo ños"
78-
res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63, 0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
79-
assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
92+
res = ToUTF8WithFallback([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
93+
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73})
94+
assert.Equal(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63,
95+
0xC3, 0xB3, 0x6D, 0x6F, 0x20, 0xC3, 0xB1, 0x6F, 0x73}, res)
8096

8197
// "Hola, así cómo "
8298
minmatch := []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xC3, 0xAD, 0x20, 0x63, 0xC3, 0xB3, 0x6D, 0x6F, 0x20}
@@ -100,34 +116,52 @@ func TestToUTF8WithFallback(t *testing.T) {
100116
}
101117

102118
func TestToUTF8(t *testing.T) {
103-
res := ToUTF8("ABC")
104-
assert.Equal(t, "ABC", res)
105119

106-
res = ToUTF8("áéíóú")
107-
assert.Equal(t, "áéíóú", res)
120+
// Note: golang compiler seems to behave differently depending on the current
121+
// locale, so some conversions might behave differently. For that reason, we don't
122+
// depend on particular conversions but on expected behaviors.
108123

109-
// With utf-8 BOM
110-
res = ToUTF8("\ufeffáéíóú")
111-
assert.Equal(t, "áéíóú", res)
112-
113-
res = ToUTF8("Hola, así cómo ños")
114-
assert.Equal(t, "Hola, así cómo ños", res)
124+
res := ToUTF8(string([]byte{0x41, 0x42, 0x43}))
125+
assert.Equal(t, "ABC", res)
115126

116-
res = ToUTF8("Hola, así cómo \x07ños")
117-
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
118-
assert.Regexp(t, "^Hola, así cómo", res)
127+
// "áéíóú"
128+
res = ToUTF8(string([]byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}))
129+
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
130+
131+
// BOM + "áéíóú"
132+
res = ToUTF8(string([]byte{0xef, 0xbb, 0xbf, 0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3,
133+
0xc3, 0xba}))
134+
assert.Equal(t, []byte{0xc3, 0xa1, 0xc3, 0xa9, 0xc3, 0xad, 0xc3, 0xb3, 0xc3, 0xba}, []byte(res))
135+
136+
// Latin1
137+
// Hola, así cómo ños
138+
res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
139+
0xF3, 0x6D, 0x6F, 0x20, 0xF1, 0x6F, 0x73}))
140+
assert.Equal(t, []byte{0x48, 0x6f, 0x6c, 0x61, 0x2c, 0x20, 0x61, 0x73, 0xc3, 0xad, 0x20, 0x63,
141+
0xc3, 0xb3, 0x6d, 0x6f, 0x20, 0xc3, 0xb1, 0x6f, 0x73}, []byte(res))
142+
143+
// Latin1
144+
// Hola, así cómo \x07ños
145+
res = ToUTF8(string([]byte{0x48, 0x6F, 0x6C, 0x61, 0x2C, 0x20, 0x61, 0x73, 0xED, 0x20, 0x63,
146+
0xF3, 0x6D, 0x6F, 0x20, 0x07, 0xA4, 0x6F, 0x73}))
147+
// Hola,
148+
bytesMustStartWith(t, []byte{0x48, 0x6F, 0x6C, 0x61, 0x2C}, []byte(res))
119149

120150
// This test FAILS
121151
// res = ToUTF8("Hola, así cómo \x81ños")
122152
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
123153
// assert.Regexp(t, "^Hola, así cómo", res)
124154

125155
// Japanese (Shift-JIS)
126-
res = ToUTF8("\x93\xFA\x91\xAE\x94\xE9\x82\xBC\x82\xB5\x82\xBF\x82\xE3\x81\x42")
127-
assert.Equal(t, "日属秘ぞしちゅ。", res)
156+
// 日属秘ぞしちゅ。
157+
res = ToUTF8(string([]byte{0x93, 0xFA, 0x91, 0xAE, 0x94, 0xE9, 0x82, 0xBC, 0x82, 0xB5, 0x82,
158+
0xBF, 0x82, 0xE3, 0x81, 0x42}))
159+
assert.Equal(t, []byte{0xE6, 0x97, 0xA5, 0xE5, 0xB1, 0x9E, 0xE7, 0xA7, 0x98, 0xE3,
160+
0x81, 0x9E, 0xE3, 0x81, 0x97, 0xE3, 0x81, 0xA1, 0xE3, 0x82, 0x85, 0xE3, 0x80, 0x82},
161+
[]byte(res))
128162

129163
res = ToUTF8("\x00\x00\x00\x00")
130-
assert.Equal(t, "\x00\x00\x00\x00", res)
164+
assert.Equal(t, []byte{0x00, 0x00, 0x00, 0x00}, []byte(res))
131165
}
132166

133167
func TestToUTF8DropErrors(t *testing.T) {
@@ -203,3 +237,15 @@ func TestDetectEncoding(t *testing.T) {
203237
_, err = DetectEncoding(b)
204238
assert.Error(t, err)
205239
}
240+
241+
func stringMustStartWith(t *testing.T, expected string, value string) {
242+
assert.Equal(t, expected, string(value[:len(expected)]))
243+
}
244+
245+
func stringMustEndWith(t *testing.T, expected string, value string) {
246+
assert.Equal(t, expected, string(value[len(value)-len(expected):]))
247+
}
248+
249+
func bytesMustStartWith(t *testing.T, expected []byte, value []byte) {
250+
assert.Equal(t, expected, value[:len(expected)])
251+
}

0 commit comments

Comments
 (0)