Detect charset and convert non UTF-8 files for display (#4950)

* Detect charset and convert non UTF-8 files for display

* Refactor and move function to correct module

* Revert unrelated changes

* More unrelated changes

* Duplicate content for small text to have better encoding detection

* Check if original content is valid before duplicating it
This commit is contained in:
Lauris BH
2018-09-29 11:33:54 +03:00
committed by Lunny Xiao
parent 6780661192
commit 81702e6ec9
3 changed files with 44 additions and 4 deletions

View File

@ -1,3 +1,4 @@
// Copyright 2018 The Gitea Authors. All rights reserved.
// Copyright 2014 The Gogs Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
@ -275,7 +276,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
}
// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't loose data.
// original left over. This way we won't lose data.
result, n, err := transform.String(encoding.NewDecoder(), string(content))
if err != nil {
result = result + string(content[n:])
@ -284,6 +285,28 @@ func ToUTF8WithErr(content []byte) (string, error) {
return result, err
}
// ToUTF8WithFallback detects the encoding of content and coverts to UTF-8 if possible
func ToUTF8WithFallback(content []byte) []byte {
charsetLabel, err := base.DetectEncoding(content)
if err != nil || charsetLabel == "UTF-8" {
return content
}
encoding, _ := charset.Lookup(charsetLabel)
if encoding == nil {
return content
}
// If there is an error, we concatenate the nicely decoded part and the
// original left over. This way we won't lose data.
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
if err != nil {
return append(result, content[n:]...)
}
return result
}
// ToUTF8 converts content to UTF8 encoding and ignore error
func ToUTF8(content string) string {
res, _ := ToUTF8WithErr([]byte(content))