Convert files to utf-8 for indexing (#7814)

* Convert files to utf-8 for indexing

* Move utf8 functions to modules/base

* Bump repoIndexerLatestVersion to 3

* Add tests for base/encoding.go

* Changes to pass gosimple

* Move UTF8 funcs into new modules/charset package
This commit is contained in:
guillep2k
2019-08-15 09:07:28 -03:00
committed by Lunny Xiao
parent c2c35d169c
commit 5a44be627c
13 changed files with 371 additions and 166 deletions

View File

@ -28,8 +28,6 @@ import (
"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/setting"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"gopkg.in/editorconfig/editorconfig-core-go.v1"
)
@ -274,60 +272,6 @@ func Sha1(str string) string {
return base.EncodeSha1(str)
}
// ToUTF8WithErr decodes content from its detected character set and returns
// the result as a string, after passing it through base.RemoveBOMIfPresent.
//
// Outcomes:
//   - detection fails: an empty string and the detection error;
//   - the detected label is "UTF-8": the content itself, with a nil error;
//   - charset.Lookup yields no encoding for the label: the unmodified content
//     and an "Unknown encoding" error;
//   - decoding fails part-way: the decoded prefix followed by the remaining
//     raw bytes, together with the decoder error.
func ToUTF8WithErr(content []byte) (string, error) {
	label, err := base.DetectEncoding(content)
	if err != nil {
		return "", err
	}
	if label == "UTF-8" {
		return string(base.RemoveBOMIfPresent(content)), nil
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return string(content), fmt.Errorf("Unknown encoding: %s", label)
	}

	// Keep whatever decoded cleanly and tack the untouched tail of the input
	// onto it, so a decoding failure never drops bytes.
	decoded, consumed, err := transform.Bytes(enc.NewDecoder(), content)
	if err != nil {
		decoded = append(decoded, content[consumed:]...)
	}
	return string(base.RemoveBOMIfPresent(decoded)), err
}
// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8
// when possible, never reporting an error to the caller.
//
// If detection fails or the label is already "UTF-8", the content is returned
// after base.RemoveBOMIfPresent. If charset.Lookup yields no encoding for the
// label, the content is returned unchanged. If decoding fails part-way, the
// decoded prefix followed by the remaining raw bytes is returned; note that
// this path does not go through base.RemoveBOMIfPresent.
func ToUTF8WithFallback(content []byte) []byte {
	label, detectErr := base.DetectEncoding(content)
	if detectErr != nil || label == "UTF-8" {
		return base.RemoveBOMIfPresent(content)
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return content
	}

	decoded, consumed, decodeErr := transform.Bytes(enc.NewDecoder(), content)
	if decodeErr == nil {
		return base.RemoveBOMIfPresent(decoded)
	}

	// Decoding stopped early: keep the successfully decoded part and append
	// the untouched remainder of the input so that no data is lost.
	return append(decoded, content[consumed:]...)
}
// ToUTF8 is the string-in, string-out convenience form of ToUTF8WithErr.
// Any error from the conversion is discarded and whatever string
// ToUTF8WithErr produced is returned as-is.
func ToUTF8(content string) string {
	// Best-effort conversion: the error result is intentionally ignored.
	converted, _ := ToUTF8WithErr([]byte(content))
	return converted
}
// ReplaceLeft replaces all prefixes 'oldS' in 's' with 'newS'.
func ReplaceLeft(s, oldS, newS string) string {
oldLen, newLen, i, n := len(oldS), len(newS), 0, 0