@@ -53,49 +53,9 @@ internal static byte[] AutomaticTransformBytes(byte[] bytes, Encoding targetEnco
5353 // For the below, false positives should be exceedingly rare (and would
5454 // be either slightly malformed UTF-8 (which would suit our purposes
5555 // anyway) or 8-bit extended ASCII/UTF-16/32 at a vanishingly long shot).
56- var i = 0 ;
57- var utf8 = false ;
58- while ( i < taster - 4 )
59- {
60- if ( bytes [ i ] <= 0x7F )
61- {
62- i += 1 ;
63- continue ;
64- }
56+
6557
66- // If all characters are below 0x80, then it is valid UTF8,
67- // but UTF8 is not 'required' (and therefore the text is more desirable to be treated as
68- // the default codepage of the computer). Hence, there's no "utf8 = true;"
69- // code unlike the next three checks.
70-
71- if ( bytes [ i ] >= 0xC2 && bytes [ i ] <= 0xDF && bytes [ i + 1 ] >= 0x80 && bytes [ i + 1 ] < 0xC0 )
72- {
73- i += 2 ;
74- utf8 = true ;
75- continue ;
76- }
77-
78- if ( bytes [ i ] >= 0xE0 && bytes [ i ] <= 0xF0 && bytes [ i + 1 ] >= 0x80 && bytes [ i + 1 ] < 0xC0 && bytes [ i + 2 ] >= 0x80 &&
79- bytes [ i + 2 ] < 0xC0 )
80- {
81- i += 3 ;
82- utf8 = true ;
83- continue ;
84- }
85-
86- if ( bytes [ i ] >= 0xF0 && bytes [ i ] <= 0xF4 && bytes [ i + 1 ] >= 0x80 && bytes [ i + 1 ] < 0xC0 &&
87- bytes [ i + 2 ] >= 0x80 && bytes [ i + 2 ] < 0xC0 && bytes [ i + 3 ] >= 0x80 && bytes [ i + 3 ] < 0xC0 )
88- {
89- i += 4 ;
90- utf8 = true ;
91- continue ;
92- }
93-
94- utf8 = false ;
95- break ;
96- }
97-
98- if ( utf8 )
58+ if ( CheckForUtf8 ( bytes , taster ) )
9959 {
10060 text = provideText ? Encoding . UTF8 . GetString ( bytes ) : null ;
10161 return Encoding . UTF8 ;
@@ -138,6 +98,53 @@ internal static byte[] AutomaticTransformBytes(byte[] bytes, Encoding targetEnco
13898 return fallbackEncoding ?? FileEncoding . DefaultFallback ;
13999 }
140100
/// <summary>
/// Heuristically decides whether the first <paramref name="taster"/> bytes of
/// <paramref name="bytes"/> look like UTF-8 text that actually *needs* UTF-8,
/// i.e. contains at least one well-formed multi-byte sequence and no
/// malformed ones.
/// </summary>
/// <param name="bytes">The raw data to inspect.</param>
/// <param name="taster">
/// Number of leading bytes to sample. The last four bytes of this window are
/// not inspected so that the look-ahead reads below never go past the window.
/// NOTE(review): assumes taster &lt;= bytes.Length - confirm against the caller.
/// </param>
/// <returns>
/// <c>true</c> when at least one multi-byte UTF-8 sequence was found and every
/// non-ASCII byte examined belonged to a well-formed sequence; <c>false</c> for
/// pure ASCII input or as soon as an unexpected byte is met.
/// </returns>
private static bool CheckForUtf8(byte[] bytes, int taster)
{
    // A continuation byte has the bit pattern 10xxxxxx (0x80..0xBF).
    bool IsContinuation(byte b) => b >= 0x80 && b < 0xC0;

    var utf8 = false;
    var i = 0;
    while (i < taster - 4)
    {
        var lead = bytes[i];

        // If all characters are below 0x80, then it is valid UTF8,
        // but UTF8 is not 'required' (and therefore the text is more desirable to be treated as
        // the default codepage of the computer). Hence, there's no "utf8 = true;"
        // code unlike the next three checks.
        if (lead <= 0x7F)
        {
            i += 1;
            continue;
        }

        // Two-byte sequence: lead 0xC2..0xDF + one continuation byte.
        if (lead >= 0xC2 && lead <= 0xDF && IsContinuation(bytes[i + 1]))
        {
            i += 2;
            utf8 = true;
            continue;
        }

        // Three-byte sequence: lead 0xE0..0xEF + two continuation bytes.
        // The upper bound must be 0xEF: 0xF0 is a four-byte lead, and treating it
        // as a three-byte lead would swallow the first three bytes of a valid
        // four-byte sequence and then fail on the remaining continuation byte.
        if (lead >= 0xE0 && lead <= 0xEF && IsContinuation(bytes[i + 1]) && IsContinuation(bytes[i + 2]))
        {
            i += 3;
            utf8 = true;
            continue;
        }

        // Four-byte sequence: lead 0xF0..0xF4 + three continuation bytes.
        if (lead >= 0xF0 && lead <= 0xF4 && IsContinuation(bytes[i + 1]) &&
            IsContinuation(bytes[i + 2]) && IsContinuation(bytes[i + 3]))
        {
            i += 4;
            utf8 = true;
            continue;
        }

        // Anything else is not well-formed UTF-8: give up immediately.
        return false;
    }

    return utf8;
}

147+
141148 /// <summary>
142149 /// A long shot - let's see if we can find "charset=xyz" or
143150 /// "encoding=xyz" to identify the encoding:
@@ -153,28 +160,40 @@ private static bool LongShot(ref string? text, bool provideText, int taster, byt
153160 for ( var n = 0 ; n < taster - 9 ; n ++ )
154161 {
155162 if ( ! IsCharsetMarker ( bytes , n ) && ! IsEncodingMarker ( bytes , n ) )
163+ {
156164 continue ;
165+ }
157166
158- if ( bytes [ n + 0 ] == 'c' || bytes [ n + 0 ] == 'C' ) n += 8 ;
159- else n += 9 ;
167+ if ( bytes [ n + 0 ] == 'c' || bytes [ n + 0 ] == 'C' )
168+ {
169+ n += 8 ;
170+ }
171+ else
172+ {
173+ n += 9 ;
174+ }
160175
161- if ( bytes [ n ] == '"' || bytes [ n ] == '\' ' ) n ++ ;
176+ if ( bytes [ n ] == '"' || bytes [ n ] == '\' ' )
177+ {
178+ n ++ ;
179+ }
162180
163181 var oldN = n ;
164182
165183 while ( IsCharsetNameRange ( taster , bytes , n ) )
184+ {
166185 n ++ ;
186+ }
167187
168188 var nb = new byte [ n - oldN ] ;
169189 Array . Copy ( bytes , oldN , nb , 0 , n - oldN ) ;
170190 try
171191 {
172192 var internalEnc = Encoding . ASCII . GetString ( nb ) ;
173193 text = provideText ? Encoding . GetEncoding ( internalEnc ) . GetString ( bytes ) : null ;
174- {
175- encoding = Encoding . GetEncoding ( internalEnc ) ;
176- return true ;
177- }
194+
195+ encoding = Encoding . GetEncoding ( internalEnc ) ;
196+ return true ;
178197 }
179198 catch
180199 {
@@ -252,9 +271,11 @@ private static bool IsCharsetNameRange(int taster, byte[] bytes, int n)
252271
253272 var bom = new byte [ 4 ] ;
254273 fileStream . Position = 0 ;
255-
256- // ReSharper disable once MustUseReturnValue
274+
275+ // read the BOM with dynamical length
276+ #pragma warning disable CA2022 , S2674
257277 fileStream . Read ( bom , 0 , 4 ) ;
278+ #pragma warning restore CA2022
258279
259280 return GetEncodingByBom ( bom , fallbackEncoding , out _ , false ) ;
260281 }
0 commit comments