From 81702e6ec9d5bd9d2185a8cb5a021047314baee9 Mon Sep 17 00:00:00 2001
From: Lauris BH <lauris@nix.lv>
Date: Sat, 29 Sep 2018 11:33:54 +0300
Subject: [PATCH] Detect charset and convert non UTF-8 files for display
 (#4950)

* Detect charset and convert non UTF-8 files for display

* Refactor and move function to correct module

* Revert unrelated changes

* More unrelated changes

* Duplicate content for small text to have better encoding detection

* Check if original content is valid before duplicating it
---
 modules/base/tool.go        | 17 ++++++++++++++++-
 modules/templates/helper.go | 25 ++++++++++++++++++++++++-
 routers/repo/view.go        |  6 ++++--
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/modules/base/tool.go b/modules/base/tool.go
index 2dfd8ffec0..d5ec9e83fc 100644
--- a/modules/base/tool.go
+++ b/modules/base/tool.go
@@ -59,7 +59,22 @@ func DetectEncoding(content []byte) (string, error) {
 		return "UTF-8", nil
 	}
 
-	result, err := chardet.NewTextDetector().DetectBest(content)
+	textDetector := chardet.NewTextDetector()
+	var detectContent []byte
+	if len(content) < 1024 {
+		// Check if original content is valid
+		if _, err := textDetector.DetectBest(content); err != nil {
+			return "", err
+		}
+		times := 1024 / len(content)
+		detectContent = make([]byte, 0, times*len(content))
+		for i := 0; i < times; i++ {
+			detectContent = append(detectContent, content...)
+		}
+	} else {
+		detectContent = content
+	}
+	result, err := textDetector.DetectBest(detectContent)
 	if err != nil {
 		return "", err
 	}
diff --git a/modules/templates/helper.go b/modules/templates/helper.go
index d55c122df0..ce077d1a92 100644
--- a/modules/templates/helper.go
+++ b/modules/templates/helper.go
@@ -1,3 +1,4 @@
+// Copyright 2018 The Gitea Authors. All rights reserved.
 // Copyright 2014 The Gogs Authors. All rights reserved.
 // Use of this source code is governed by a MIT-style
 // license that can be found in the LICENSE file.
@@ -275,7 +276,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
 	}
 
 	// If there is an error, we concatenate the nicely decoded part and the
-	// original left over. This way we won't loose data.
+	// original left over. This way we won't lose data.
 	result, n, err := transform.String(encoding.NewDecoder(), string(content))
 	if err != nil {
 		result = result + string(content[n:])
@@ -284,6 +285,28 @@ func ToUTF8WithErr(content []byte) (string, error) {
 	return result, err
 }
 
+// ToUTF8WithFallback detects the encoding of content and converts it to UTF-8 if possible; content is returned unchanged when detection fails, the charset is already UTF-8, or no encoding is found for the detected label.
+func ToUTF8WithFallback(content []byte) []byte {
+	charsetLabel, err := base.DetectEncoding(content)
+	if err != nil || charsetLabel == "UTF-8" {
+		return content
+	}
+
+	encoding, _ := charset.Lookup(charsetLabel)
+	if encoding == nil {
+		return content
+	}
+
+	// If decoding fails, we concatenate the nicely decoded part (result) and
+	// the original left over (content[n:]). This way we won't lose data.
+	result, n, err := transform.Bytes(encoding.NewDecoder(), content)
+	if err != nil {
+		return append(result, content[n:]...)
+	}
+
+	return result
+}
+
 // ToUTF8 converts content to UTF8 encoding and ignore error
 func ToUTF8(content string) string {
 	res, _ := ToUTF8WithErr([]byte(content))
diff --git a/routers/repo/view.go b/routers/repo/view.go
index ff5c1afb49..210eb9fe5f 100644
--- a/routers/repo/view.go
+++ b/routers/repo/view.go
@@ -25,6 +25,7 @@ import (
 	"code.gitea.io/gitea/modules/markup"
 	"code.gitea.io/gitea/modules/setting"
 	"code.gitea.io/gitea/modules/templates"
+
 	"github.com/Unknwon/paginater"
 )
 
@@ -99,7 +100,8 @@ func renderDirectory(ctx *context.Context, treeLink string) {
 				ctx.Data["FileSize"] = readmeFile.Size()
 			} else {
 				d, _ := ioutil.ReadAll(dataRc)
-				buf = append(buf, d...)
+				buf = templates.ToUTF8WithFallback(append(buf, d...))
+
 				if markup.Type(readmeFile.Name()) != "" {
 					ctx.Data["IsMarkup"] = true
 					ctx.Data["FileContent"] = string(markup.Render(readmeFile.Name(), buf, treeLink, ctx.Repo.Repository.ComposeMetas()))
@@ -203,7 +205,7 @@ func renderFile(ctx *context.Context, entry *git.TreeEntry, treeLink, rawLink st
 		}
 
 		d, _ := ioutil.ReadAll(dataRc)
-		buf = append(buf, d...)
+		buf = templates.ToUTF8WithFallback(append(buf, d...))
 
 		readmeExist := markup.IsReadmeFile(blob.Name())
 		ctx.Data["ReadmeExist"] = readmeExist