github.com/keysonZZZ/kmg@v0.0.0-20151121023212-05317bfd7d39/encoding/kmgTextEncoding/Encoding.go (about)

     1  package kmgTextEncoding
     2  
     3  import (
     4  	"bytes"
     5  	"io/ioutil"
     6  	"net/http"
     7  	"strings"
     8  
     9  	"github.com/PuerkitoBio/goquery"
    10  	"github.com/bronze1man/kmg/kmgNet/kmgHttp"
    11  	"golang.org/x/text/encoding/japanese"
    12  	"golang.org/x/text/transform"
    13  )
    14  
    15  type encodingType string
    16  
    17  const (
    18  	Utf8     encodingType = "utf-8"
    19  	ShiftJis encodingType = "shift_jis"
    20  )
    21  
    22  var encodingGuessList []encodingType = []encodingType{
    23  	ShiftJis,
    24  	Utf8,
    25  }
    26  
    27  //目前只处理了编码是 shift_jis 时的情况
    28  func HttpResponseToUtf8(res *http.Response) (out []byte) {
    29  	body := kmgHttp.MustResponseReadAllBody(res)
    30  	for _, encoding := range encodingGuessList {
    31  		if !isResponseEncodingBy(encoding, res, body) {
    32  			continue
    33  		}
    34  		if encoding == ShiftJis {
    35  			tReader := transform.NewReader(bytes.NewReader(body), japanese.ShiftJIS.NewDecoder())
    36  			var err error
    37  			out, err = ioutil.ReadAll(tReader)
    38  			if err != nil {
    39  				panic(err)
    40  			}
    41  			return out
    42  		}
    43  		if encoding == Utf8 {
    44  			return body
    45  		}
    46  	}
    47  	//没猜到,原样返回
    48  	return body
    49  }
    50  
    51  func isResponseEncodingBy(encoding encodingType, res *http.Response, responseBody []byte) bool {
    52  	contentType := res.Header.Get("Content-Type")
    53  	charset := getCharsetFromHttpContentType(contentType)
    54  	if charset == string(encoding) {
    55  		return true
    56  	}
    57  	if charset != "" {
    58  		return false
    59  	}
    60  	dom, err := goquery.NewDocumentFromReader(bytes.NewReader(responseBody))
    61  	if err != nil {
    62  		panic(err)
    63  	}
    64  	contentType, ok := dom.Find(`meta[http-equiv="content-type"]`).Eq(0).Attr("content")
    65  	if !ok {
    66  		return false
    67  	}
    68  	charset = getCharsetFromHttpContentType(contentType)
    69  	return charset == string(encoding)
    70  }
    71  
    72  func getCharsetFromHttpContentType(contentType string) string {
    73  	list := strings.Split(contentType, "charset=")
    74  	if len(list) == 1 {
    75  		return ""
    76  	}
    77  	return strings.ToLower(list[len(list)-1])
    78  }