@@ -24,41 +24,55 @@ func TestToUTF8WithErr(t *testing.T) {
24
24
var res string
25
25
var err error
26
26
27
+ // Note: the Go compiler seems to behave differently depending on the current
28
+ // locale, so some conversions might behave differently. For that reason, we don't
29
+ // depend on particular conversions but on expected behaviors.
30
+
27
31
res , err = ToUTF8WithErr ([]byte {0x41 , 0x42 , 0x43 })
28
- assert .Equal (t , "ABC" , res )
29
32
assert .NoError (t , err )
33
+ assert .Equal (t , "ABC" , res )
30
34
35
+ // "áéíóú"
31
36
res , err = ToUTF8WithErr ([]byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba })
32
- assert .Equal (t , "áéíóú" , res )
33
37
assert .NoError (t , err )
38
+ assert .Equal (t , []byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba }, []byte (res ))
34
39
35
- res , err = ToUTF8WithErr ([]byte {0xef , 0xbb , 0xbf , 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba })
36
- assert .Equal (t , "áéíóú" , res )
40
+ // UTF-8 BOM + "áéíóú"
41
+ res , err = ToUTF8WithErr ([]byte {0xef , 0xbb , 0xbf , 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 ,
42
+ 0xc3 , 0xba })
37
43
assert .NoError (t , err )
44
+ assert .Equal (t , []byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba }, []byte (res ))
38
45
39
- // This test FAILS
40
- res , err = ToUTF8WithErr ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 , 0xF3 , 0x6D , 0x6F , 0x20 , 0xF1 , 0x6F , 0x73 })
41
- assert .Equal (t , "Hola, así cómo ños" , res )
46
+ res , err = ToUTF8WithErr ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 ,
47
+ 0xF3 , 0x6D , 0x6F , 0x20 , 0xF1 , 0x6F , 0x73 , 0x41 , 0x41 , 0x41 , 0x2e })
42
48
assert .NoError (t , err )
49
+ stringMustStartWith (t , "Hola," , res )
50
+ stringMustEndWith (t , "AAA." , res )
43
51
44
- res , err = ToUTF8WithErr ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 , 0xF3 , 0x6D , 0x6F , 0x20 , 0x07 , 0xA4 , 0x6F , 0x73 })
45
- // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
46
- assert .Regexp (t , "^Hola, así cómo" , res )
52
+ res , err = ToUTF8WithErr ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 ,
53
+ 0xF3 , 0x6D , 0x6F , 0x20 , 0x07 , 0xA4 , 0x6F , 0x73 , 0x41 , 0x41 , 0x41 , 0x2e })
47
54
assert .NoError (t , err )
55
+ stringMustStartWith (t , "Hola," , res )
56
+ stringMustEndWith (t , "AAA." , res )
48
57
49
- res , err = ToUTF8WithErr ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 , 0xF3 , 0x6D , 0x6F , 0x20 , 0x81 , 0xA4 , 0x6F , 0x73 })
50
- // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
51
- assert .Regexp (t , "^Hola, así cómo" , res )
58
+ res , err = ToUTF8WithErr ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 ,
59
+ 0xF3 , 0x6D , 0x6F , 0x20 , 0x81 , 0xA4 , 0x6F , 0x73 , 0x41 , 0x41 , 0x41 , 0x2e })
52
60
assert .NoError (t , err )
61
+ stringMustStartWith (t , "Hola," , res )
62
+ stringMustEndWith (t , "AAA." , res )
53
63
54
64
// Japanese (Shift-JIS)
55
- res , err = ToUTF8WithErr ([]byte {0x93 , 0xFA , 0x91 , 0xAE , 0x94 , 0xE9 , 0x82 , 0xBC , 0x82 , 0xB5 , 0x82 , 0xBF , 0x82 , 0xE3 , 0x81 , 0x42 })
56
- assert .Equal (t , "日属秘ぞしちゅ。" , res )
65
+ // 日属秘ぞしちゅ。
66
+ res , err = ToUTF8WithErr ([]byte {0x93 , 0xFA , 0x91 , 0xAE , 0x94 , 0xE9 , 0x82 , 0xBC , 0x82 , 0xB5 , 0x82 ,
67
+ 0xBF , 0x82 , 0xE3 , 0x81 , 0x42 })
57
68
assert .NoError (t , err )
69
+ assert .Equal (t , []byte {0xE6 , 0x97 , 0xA5 , 0xE5 , 0xB1 , 0x9E , 0xE7 , 0xA7 , 0x98 , 0xE3 ,
70
+ 0x81 , 0x9E , 0xE3 , 0x81 , 0x97 , 0xE3 , 0x81 , 0xA1 , 0xE3 , 0x82 , 0x85 , 0xE3 , 0x80 , 0x82 },
71
+ []byte (res ))
58
72
59
73
res , err = ToUTF8WithErr ([]byte {0x00 , 0x00 , 0x00 , 0x00 })
60
- assert .Equal (t , "\x00 \x00 \x00 \x00 " , res )
61
74
assert .NoError (t , err )
75
+ assert .Equal (t , []byte {0x00 , 0x00 , 0x00 , 0x00 }, []byte (res ))
62
76
}
63
77
64
78
func TestToUTF8WithFallback (t * testing.T ) {
@@ -75,8 +89,10 @@ func TestToUTF8WithFallback(t *testing.T) {
75
89
assert .Equal (t , []byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba }, res )
76
90
77
91
// "Hola, así cómo ños"
78
- res = ToUTF8WithFallback ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 , 0xF3 , 0x6D , 0x6F , 0x20 , 0xF1 , 0x6F , 0x73 })
79
- assert .Equal (t , []byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xC3 , 0xAD , 0x20 , 0x63 , 0xC3 , 0xB3 , 0x6D , 0x6F , 0x20 , 0xC3 , 0xB1 , 0x6F , 0x73 }, res )
92
+ res = ToUTF8WithFallback ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 ,
93
+ 0xF3 , 0x6D , 0x6F , 0x20 , 0xF1 , 0x6F , 0x73 })
94
+ assert .Equal (t , []byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xC3 , 0xAD , 0x20 , 0x63 ,
95
+ 0xC3 , 0xB3 , 0x6D , 0x6F , 0x20 , 0xC3 , 0xB1 , 0x6F , 0x73 }, res )
80
96
81
97
// "Hola, así cómo "
82
98
minmatch := []byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xC3 , 0xAD , 0x20 , 0x63 , 0xC3 , 0xB3 , 0x6D , 0x6F , 0x20 }
@@ -100,34 +116,52 @@ func TestToUTF8WithFallback(t *testing.T) {
100
116
}
101
117
102
118
func TestToUTF8 (t * testing.T ) {
103
- res := ToUTF8 ("ABC" )
104
- assert .Equal (t , "ABC" , res )
105
119
106
- res = ToUTF8 ("áéíóú" )
107
- assert .Equal (t , "áéíóú" , res )
120
+ // Note: the Go compiler seems to behave differently depending on the current
121
+ // locale, so some conversions might behave differently. For that reason, we don't
122
+ // depend on particular conversions but on expected behaviors.
108
123
109
- // With utf-8 BOM
110
- res = ToUTF8 ("\ufeff áéíóú" )
111
- assert .Equal (t , "áéíóú" , res )
112
-
113
- res = ToUTF8 ("Hola, así cómo ños" )
114
- assert .Equal (t , "Hola, así cómo ños" , res )
124
+ res := ToUTF8 (string ([]byte {0x41 , 0x42 , 0x43 }))
125
+ assert .Equal (t , "ABC" , res )
115
126
116
- res = ToUTF8 ("Hola, así cómo \x07 ños" )
117
- // Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
118
- assert .Regexp (t , "^Hola, así cómo" , res )
127
+ // "áéíóú"
128
+ res = ToUTF8 (string ([]byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba }))
129
+ assert .Equal (t , []byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba }, []byte (res ))
130
+
131
+ // BOM + "áéíóú"
132
+ res = ToUTF8 (string ([]byte {0xef , 0xbb , 0xbf , 0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 ,
133
+ 0xc3 , 0xba }))
134
+ assert .Equal (t , []byte {0xc3 , 0xa1 , 0xc3 , 0xa9 , 0xc3 , 0xad , 0xc3 , 0xb3 , 0xc3 , 0xba }, []byte (res ))
135
+
136
+ // Latin1
137
+ // Hola, así cómo ños
138
+ res = ToUTF8 (string ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 ,
139
+ 0xF3 , 0x6D , 0x6F , 0x20 , 0xF1 , 0x6F , 0x73 }))
140
+ assert .Equal (t , []byte {0x48 , 0x6f , 0x6c , 0x61 , 0x2c , 0x20 , 0x61 , 0x73 , 0xc3 , 0xad , 0x20 , 0x63 ,
141
+ 0xc3 , 0xb3 , 0x6d , 0x6f , 0x20 , 0xc3 , 0xb1 , 0x6f , 0x73 }, []byte (res ))
142
+
143
+ // Latin1
144
+ // Hola, así cómo \x07\xA4os (0xA4 is '¤' in Latin1, not 'ñ')
145
+ res = ToUTF8 (string ([]byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C , 0x20 , 0x61 , 0x73 , 0xED , 0x20 , 0x63 ,
146
+ 0xF3 , 0x6D , 0x6F , 0x20 , 0x07 , 0xA4 , 0x6F , 0x73 }))
147
+ // Hola,
148
+ bytesMustStartWith (t , []byte {0x48 , 0x6F , 0x6C , 0x61 , 0x2C }, []byte (res ))
119
149
120
150
// This test FAILS
121
151
// res = ToUTF8("Hola, así cómo \x81ños")
122
152
// Do not fail for differences in invalid cases, as the library might change the conversion criteria for those
123
153
// assert.Regexp(t, "^Hola, así cómo", res)
124
154
125
155
// Japanese (Shift-JIS)
126
- res = ToUTF8 ("\x93 \xFA \x91 \xAE \x94 \xE9 \x82 \xBC \x82 \xB5 \x82 \xBF \x82 \xE3 \x81 \x42 " )
127
- assert .Equal (t , "日属秘ぞしちゅ。" , res )
156
+ // 日属秘ぞしちゅ。
157
+ res = ToUTF8 (string ([]byte {0x93 , 0xFA , 0x91 , 0xAE , 0x94 , 0xE9 , 0x82 , 0xBC , 0x82 , 0xB5 , 0x82 ,
158
+ 0xBF , 0x82 , 0xE3 , 0x81 , 0x42 }))
159
+ assert .Equal (t , []byte {0xE6 , 0x97 , 0xA5 , 0xE5 , 0xB1 , 0x9E , 0xE7 , 0xA7 , 0x98 , 0xE3 ,
160
+ 0x81 , 0x9E , 0xE3 , 0x81 , 0x97 , 0xE3 , 0x81 , 0xA1 , 0xE3 , 0x82 , 0x85 , 0xE3 , 0x80 , 0x82 },
161
+ []byte (res ))
128
162
129
163
res = ToUTF8 ("\x00 \x00 \x00 \x00 " )
130
- assert .Equal (t , " \x00 \x00 \x00 \x00 " , res )
164
+ assert .Equal (t , [] byte { 0x00 , 0x00 , 0x00 , 0x00 }, [] byte ( res ) )
131
165
}
132
166
133
167
func TestToUTF8DropErrors (t * testing.T ) {
@@ -203,3 +237,15 @@ func TestDetectEncoding(t *testing.T) {
203
237
_ , err = DetectEncoding (b )
204
238
assert .Error (t , err )
205
239
}
240
+
241
// stringMustStartWith reports a test failure unless value begins with
// expected.
//
// A value shorter than expected is reported as an ordinary failure instead
// of panicking on an out-of-range slice expression, so one unexpected
// conversion result does not abort the remaining checks in the test.
func stringMustStartWith(t *testing.T, expected string, value string) {
	t.Helper() // attribute the failure to the calling test line
	if len(value) < len(expected) || value[:len(expected)] != expected {
		t.Errorf("expected %q to start with %q", value, expected)
	}
}
244
+
245
// stringMustEndWith reports a test failure unless value ends with expected.
//
// A value shorter than expected is reported as an ordinary failure instead
// of panicking on a negative slice index, so one unexpected conversion
// result does not abort the remaining checks in the test.
func stringMustEndWith(t *testing.T, expected string, value string) {
	t.Helper() // attribute the failure to the calling test line
	if len(value) < len(expected) || value[len(value)-len(expected):] != expected {
		t.Errorf("expected %q to end with %q", value, expected)
	}
}
248
+
249
// bytesMustStartWith reports a test failure unless value begins with the
// bytes in expected.
//
// The length is checked before slicing: value[:len(expected)] would panic
// when len(expected) exceeds cap(value), and would silently read into spare
// capacity when it lies between len(value) and cap(value).
func bytesMustStartWith(t *testing.T, expected []byte, value []byte) {
	t.Helper() // attribute the failure to the calling test line
	// Comparing via string conversion avoids needing the bytes package; the
	// compiler does not allocate for conversions used only in a comparison.
	if len(value) < len(expected) || string(value[:len(expected)]) != string(expected) {
		t.Errorf("expected [% x] to start with [% x]", value, expected)
	}
}
0 commit comments