github.com/neohugo/neohugo@v0.123.8/resources/resource_factories/create/remote.go (about)

     1  // Copyright 2021 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package create
    15  
    16  import (
    17  	"bufio"
    18  	"bytes"
    19  	"fmt"
    20  	"io"
    21  	"math/rand"
    22  	"mime"
    23  	"net/http"
    24  	"net/http/httputil"
    25  	"net/url"
    26  	"path"
    27  	"strings"
    28  	"time"
    29  
    30  	"github.com/neohugo/neohugo/common/hugio"
    31  	"github.com/neohugo/neohugo/common/maps"
    32  	"github.com/neohugo/neohugo/common/types"
    33  	"github.com/neohugo/neohugo/identity"
    34  	"github.com/neohugo/neohugo/media"
    35  	"github.com/neohugo/neohugo/resources"
    36  	"github.com/neohugo/neohugo/resources/resource"
    37  
    38  	"github.com/mitchellh/mapstructure"
    39  )
    40  
    41  type HTTPError struct {
    42  	error
    43  	Data map[string]any
    44  
    45  	StatusCode int
    46  	Body       string
    47  }
    48  
    49  func responseToData(res *http.Response, readBody bool) map[string]any {
    50  	var body []byte
    51  	if readBody {
    52  		body, _ = io.ReadAll(res.Body)
    53  	}
    54  
    55  	m := map[string]any{
    56  		"StatusCode":       res.StatusCode,
    57  		"Status":           res.Status,
    58  		"TransferEncoding": res.TransferEncoding,
    59  		"ContentLength":    res.ContentLength,
    60  		"ContentType":      res.Header.Get("Content-Type"),
    61  	}
    62  
    63  	if readBody {
    64  		m["Body"] = string(body)
    65  	}
    66  
    67  	return m
    68  }
    69  
    70  func toHTTPError(err error, res *http.Response, readBody bool) *HTTPError {
    71  	if err == nil {
    72  		panic("err is nil")
    73  	}
    74  	if res == nil {
    75  		return &HTTPError{
    76  			error: err,
    77  			Data:  map[string]any{},
    78  		}
    79  	}
    80  
    81  	return &HTTPError{
    82  		error: err,
    83  		Data:  responseToData(res, readBody),
    84  	}
    85  }
    86  
    87  var temporaryHTTPStatusCodes = map[int]bool{
    88  	408: true,
    89  	429: true,
    90  	500: true,
    91  	502: true,
    92  	503: true,
    93  	504: true,
    94  }
    95  
    96  // FromRemote expects one or n-parts of a URL to a resource
    97  // If you provide multiple parts they will be joined together to the final URL.
    98  func (c *Client) FromRemote(uri string, optionsm map[string]any) (resource.Resource, error) {
    99  	rURL, err := url.Parse(uri)
   100  	if err != nil {
   101  		return nil, fmt.Errorf("failed to parse URL for resource %s: %w", uri, err)
   102  	}
   103  
   104  	method := "GET"
   105  	if s, ok := maps.LookupEqualFold(optionsm, "method"); ok {
   106  		method = strings.ToUpper(s.(string))
   107  	}
   108  	isHeadMethod := method == "HEAD"
   109  
   110  	resourceID := calculateResourceID(uri, optionsm)
   111  
   112  	_, httpResponse, err := c.cacheGetResource.GetOrCreate(resourceID, func() (io.ReadCloser, error) {
   113  		options, err := decodeRemoteOptions(optionsm)
   114  		if err != nil {
   115  			return nil, fmt.Errorf("failed to decode options for resource %s: %w", uri, err)
   116  		}
   117  		if err := c.validateFromRemoteArgs(uri, options); err != nil {
   118  			return nil, err
   119  		}
   120  
   121  		var (
   122  			start          time.Time
   123  			nextSleep      = time.Duration((rand.Intn(1000) + 100)) * time.Millisecond
   124  			nextSleepLimit = time.Duration(5) * time.Second
   125  		)
   126  
   127  		for {
   128  			b, retry, err := func() ([]byte, bool, error) {
   129  				req, err := options.NewRequest(uri)
   130  				if err != nil {
   131  					return nil, false, fmt.Errorf("failed to create request for resource %s: %w", uri, err)
   132  				}
   133  
   134  				res, err := c.httpClient.Do(req)
   135  				if err != nil {
   136  					return nil, false, err
   137  				}
   138  				defer res.Body.Close()
   139  
   140  				if res.StatusCode != http.StatusNotFound {
   141  					if res.StatusCode < 200 || res.StatusCode > 299 {
   142  						return nil, temporaryHTTPStatusCodes[res.StatusCode], toHTTPError(fmt.Errorf("failed to fetch remote resource: %s", http.StatusText(res.StatusCode)), res, !isHeadMethod)
   143  					}
   144  				}
   145  
   146  				b, err := httputil.DumpResponse(res, true)
   147  				if err != nil {
   148  					return nil, false, toHTTPError(err, res, !isHeadMethod)
   149  				}
   150  
   151  				return b, false, nil
   152  			}()
   153  			if err != nil {
   154  				if retry {
   155  					if start.IsZero() {
   156  						start = time.Now()
   157  					} else if d := time.Since(start) + nextSleep; d >= c.rs.Cfg.Timeout() {
   158  						c.rs.Logger.Errorf("Retry timeout (configured to %s) fetching remote resource.", c.rs.Cfg.Timeout())
   159  						return nil, err
   160  					}
   161  					time.Sleep(nextSleep)
   162  					if nextSleep < nextSleepLimit {
   163  						nextSleep *= 2
   164  					}
   165  					continue
   166  				}
   167  				return nil, err
   168  			}
   169  
   170  			return hugio.ToReadCloser(bytes.NewReader(b)), nil
   171  
   172  		}
   173  	})
   174  	if err != nil {
   175  		return nil, err
   176  	}
   177  	defer httpResponse.Close()
   178  
   179  	res, err := http.ReadResponse(bufio.NewReader(httpResponse), nil)
   180  	if err != nil {
   181  		return nil, err
   182  	}
   183  	defer res.Body.Close()
   184  
   185  	if res.StatusCode == http.StatusNotFound {
   186  		// Not found. This matches how looksup for local resources work.
   187  		return nil, nil
   188  	}
   189  
   190  	var (
   191  		body      []byte
   192  		mediaType media.Type
   193  	)
   194  	// A response to a HEAD method should not have a body. If it has one anyway, that body must be ignored.
   195  	// See https://developer.mozilla.org/en-US/docs/Web/HTTP/Methods/HEAD
   196  	if !isHeadMethod && res.Body != nil {
   197  		body, err = io.ReadAll(res.Body)
   198  		if err != nil {
   199  			return nil, fmt.Errorf("failed to read remote resource %q: %w", uri, err)
   200  		}
   201  	}
   202  
   203  	filename := path.Base(rURL.Path)
   204  	if _, params, _ := mime.ParseMediaType(res.Header.Get("Content-Disposition")); params != nil {
   205  		if _, ok := params["filename"]; ok {
   206  			filename = params["filename"]
   207  		}
   208  	}
   209  
   210  	contentType := res.Header.Get("Content-Type")
   211  
   212  	// For HEAD requests we have no body to work with, so we need to use the Content-Type header.
   213  	if isHeadMethod || c.rs.ExecHelper.Sec().HTTP.MediaTypes.Accept(contentType) {
   214  		var found bool
   215  		mediaType, found = c.rs.MediaTypes().GetByType(contentType)
   216  		if !found {
   217  			// A media type not configured in Hugo, just create one from the content type string.
   218  			mediaType, _ = media.FromString(contentType)
   219  		}
   220  	}
   221  
   222  	if mediaType.IsZero() {
   223  
   224  		var extensionHints []string
   225  
   226  		// mime.ExtensionsByType gives a long list of extensions for text/plain,
   227  		// just use ".txt".
   228  		if strings.HasPrefix(contentType, "text/plain") {
   229  			extensionHints = []string{".txt"}
   230  		} else {
   231  			exts, _ := mime.ExtensionsByType(contentType)
   232  			if exts != nil {
   233  				extensionHints = exts
   234  			}
   235  		}
   236  
   237  		// Look for a file extension. If it's .txt, look for a more specific.
   238  		if extensionHints == nil || extensionHints[0] == ".txt" {
   239  			if ext := path.Ext(filename); ext != "" {
   240  				extensionHints = []string{ext}
   241  			}
   242  		}
   243  
   244  		// Now resolve the media type primarily using the content.
   245  		mediaType = media.FromContent(c.rs.MediaTypes(), extensionHints, body)
   246  
   247  	}
   248  
   249  	if mediaType.IsZero() {
   250  		return nil, fmt.Errorf("failed to resolve media type for remote resource %q", uri)
   251  	}
   252  
   253  	resourceID = filename[:len(filename)-len(path.Ext(filename))] + "_" + resourceID + mediaType.FirstSuffix.FullSuffix
   254  	data := responseToData(res, false)
   255  
   256  	return c.rs.NewResource(
   257  		resources.ResourceSourceDescriptor{
   258  			MediaType:     mediaType,
   259  			Data:          data,
   260  			GroupIdentity: identity.StringIdentity(resourceID),
   261  			LazyPublish:   true,
   262  			OpenReadSeekCloser: func() (hugio.ReadSeekCloser, error) {
   263  				return hugio.NewReadSeekerNoOpCloser(bytes.NewReader(body)), nil
   264  			},
   265  			TargetPath: resourceID,
   266  		})
   267  }
   268  
   269  func (c *Client) validateFromRemoteArgs(uri string, options fromRemoteOptions) error {
   270  	if err := c.rs.ExecHelper.Sec().CheckAllowedHTTPURL(uri); err != nil {
   271  		return err
   272  	}
   273  
   274  	if err := c.rs.ExecHelper.Sec().CheckAllowedHTTPMethod(options.Method); err != nil {
   275  		return err
   276  	}
   277  
   278  	return nil
   279  }
   280  
   281  func calculateResourceID(uri string, optionsm map[string]any) string {
   282  	if key, found := maps.LookupEqualFold(optionsm, "key"); found {
   283  		return identity.HashString(key)
   284  	}
   285  	return identity.HashString(uri, optionsm)
   286  }
   287  
   288  func addDefaultHeaders(req *http.Request) {
   289  	if !hasHeaderKey(req.Header, "User-Agent") {
   290  		req.Header.Add("User-Agent", "Hugo Static Site Generator")
   291  	}
   292  }
   293  
   294  func addUserProvidedHeaders(headers map[string]any, req *http.Request) {
   295  	if headers == nil {
   296  		return
   297  	}
   298  	for key, val := range headers {
   299  		vals := types.ToStringSlicePreserveString(val)
   300  		for _, s := range vals {
   301  			req.Header.Add(key, s)
   302  		}
   303  	}
   304  }
   305  
   306  func hasHeaderKey(m http.Header, key string) bool {
   307  	_, ok := m[key]
   308  	return ok
   309  }
   310  
   311  type fromRemoteOptions struct {
   312  	Method  string
   313  	Headers map[string]any
   314  	Body    []byte
   315  }
   316  
   317  func (o fromRemoteOptions) BodyReader() io.Reader {
   318  	if o.Body == nil {
   319  		return nil
   320  	}
   321  	return bytes.NewBuffer(o.Body)
   322  }
   323  
   324  func (o fromRemoteOptions) NewRequest(url string) (*http.Request, error) {
   325  	req, err := http.NewRequest(o.Method, url, o.BodyReader())
   326  	if err != nil {
   327  		return nil, err
   328  	}
   329  
   330  	// First add any user provided headers.
   331  	if o.Headers != nil {
   332  		addUserProvidedHeaders(o.Headers, req)
   333  	}
   334  
   335  	// Then add default headers not provided by the user.
   336  	addDefaultHeaders(req)
   337  
   338  	return req, nil
   339  }
   340  
   341  func decodeRemoteOptions(optionsm map[string]any) (fromRemoteOptions, error) {
   342  	options := fromRemoteOptions{
   343  		Method: "GET",
   344  	}
   345  
   346  	err := mapstructure.WeakDecode(optionsm, &options)
   347  	if err != nil {
   348  		return options, err
   349  	}
   350  	options.Method = strings.ToUpper(options.Method)
   351  
   352  	return options, nil
   353  }