github.com/everdrone/grab@v0.1.7-0.20230416223925-40674b995521/internal/instance/cache.go (about)

     1  package instance
     2  
     3  import (
     4  	"fmt"
     5  	"net/url"
     6  	"path/filepath"
     7  	"time"
     8  
     9  	"github.com/rs/zerolog/log"
    10  
    11  	"github.com/everdrone/grab/internal/config"
    12  	"github.com/everdrone/grab/internal/net"
    13  	"github.com/everdrone/grab/internal/utils"
    14  	"github.com/hashicorp/hcl/v2"
    15  )
    16  
    17  func (s *Grab) BuildSiteCache() {
    18  	for _, url := range s.URLs {
    19  		for i, site := range s.Config.Sites {
    20  			if s.RegexCache[site.Test].MatchString(url) {
    21  				if s.Config.Sites[i].URLs == nil {
    22  					s.Config.Sites[i].URLs = make([]string, 0)
    23  				}
    24  
    25  				s.Config.Sites[i].URLs = append(s.Config.Sites[i].URLs, url)
    26  				break
    27  			}
    28  		}
    29  	}
    30  }
    31  
    32  func removePathFromURL(str string) (*url.URL, error) {
    33  	base, err := url.Parse(str)
    34  	if err != nil {
    35  		return nil, err
    36  	}
    37  	// get only the base (scheme://host)
    38  	base.Path = ""
    39  	base.RawPath = ""
    40  	base.RawQuery = ""
    41  	base.Fragment = ""
    42  	base.RawFragment = ""
    43  
    44  	return base, nil
    45  }
    46  
// BuildAssetCache fetches every page URL previously attached to a site by
// BuildSiteCache, matches each site's asset and info blocks against the page
// body, and writes the results back into s.Config.Sites:
//   - Assets[*].Downloads maps resolved source URL -> destination file path
//   - InfoMap maps the page's subdirectory -> captured key/value metadata
// It also accumulates s.TotalAssets. In strict mode the first fetch failure
// returns immediately with an error diagnostic; otherwise the page is
// skipped. On success it returns a non-nil empty *hcl.Diagnostics.
func (s *Grab) BuildAssetCache() *hcl.Diagnostics {
	var diags *hcl.Diagnostics

	for siteIndex, site := range s.Config.Sites {
		log.Trace().Str("site", site.Name).Msg("visiting site block")

		for _, pageUrl := range site.URLs {
			log.Trace().Str("url", pageUrl).Msg("processing url")

			// we already checked this url before, so we can skip the error
			// NOTE(review): if pageUrl ever fails to parse here, base is nil
			// and base.Parse below would panic — confirm every cached URL has
			// already been parsed successfully upstream.
			base, _ := removePathFromURL(pageUrl)

			// site-level network options override the global ones
			options := net.MergeFetchOptionsChain(s.Config.Global.Network, site.Network)

			log.Info().Str("url", pageUrl).Msg("fetching")

			// MARK: - get the page body

			body, err := net.Fetch(pageUrl, options)
			if err != nil {
				diags = &hcl.Diagnostics{{
					Severity: hcl.DiagError,
					Summary:  "Failed to fetch page",
					Detail:   fmt.Sprintf("%s: %s", pageUrl, err.Error()),
				}}

				// if we are in strict mode we need to return immediately
				if s.Flags.Strict {
					return diags
				} else {
					// FIXME: warn the user that we are skipping this page
					// NOTE(review): in non-strict mode this diagnostic is
					// effectively discarded — the function ends by returning
					// a fresh empty &hcl.Diagnostics{}.
					continue
				}
			}

			// MARK: - get the destination path (subdirectory)

			// subdirectory is the directory this page's assets are written
			// under; it also serves as the InfoMap key for this page.
			var subdirectory string
			if site.Subdirectory != nil {
				// we have a subdirectory block

				log.Trace().Str("site", site.Name).Msg("visiting subdirectory block")

				// the subdirectory pattern matches either the page url or the
				// fetched body, depending on the block's `from` attribute
				var source string
				if site.Subdirectory.From == "url" {
					source = pageUrl
				} else {
					source = body
				}

				subDirs, err := utils.GetCaptures(s.RegexCache[site.Subdirectory.Pattern], false, site.Subdirectory.Capture, source)
				if err != nil {
					return &hcl.Diagnostics{{
						Severity: hcl.DiagError,
						Summary:  "Failed to get subdirectory",
						Detail:   err.Error(),
					}}
				}

				// only the first capture is used
				if len(subDirs) > 0 {
					// do not append if the path is absolute
					if filepath.IsAbs(subDirs[0]) {
						subdirectory = subDirs[0]
					} else {
						subdirectory = filepath.Join(s.Config.Global.Location, site.Name, subDirs[0])
					}

					log.Trace().Str("site", site.Name).Str("subdirectory", subdirectory).Msg("subdirectory path")
				}
				// NOTE(review): if the pattern yields no captures,
				// subdirectory stays "" and destinations are joined against
				// the empty string — confirm this is intended.
			} else {
				// we have no subdirectory block, just use the site name
				subdirectory = filepath.Join(s.Config.Global.Location, site.Name)

				log.Trace().Str("site", site.Name).Str("subdirectory", subdirectory).Msg("no subdirectory block")
			}

			// MARK: - loop through the asset blocks

			for assetIndex, asset := range site.Assets {
				log.Debug().Str("site", site.Name).Str("asset", asset.Name).Msg("visiting asset block")

				// match against body
				if s.RegexCache[asset.Pattern].MatchString(body) {
					// find_all defaults to false (first match only)
					findAll := false
					if asset.FindAll != nil {
						findAll = *asset.FindAll
					}

					// get capture groups
					captures, err := utils.GetCaptures(s.RegexCache[asset.Pattern], findAll, asset.Capture, body)
					if err != nil {
						return &hcl.Diagnostics{{
							Severity: hcl.DiagError,
							Summary:  "Failed to get captures",
							Detail:   fmt.Sprintf("%s: %s", pageUrl, err.Error()),
						}}
					}

					// remove duplicates
					captures = utils.Unique(captures)

					log.Trace().Str("site", site.Name).Str("asset", asset.Name).Strs("matches", captures).Msgf("%d %s found", len(captures), utils.Plural(len(captures), "match", "matches"))

					// MARK: - transform url

					// TODO: we should change the config schema to store transforms as a map
					// where the key is the transform label, so we don't end up looping through an array
					transformUrl := utils.Filter(asset.Transforms, func(t config.TransformConfig) bool {
						return t.Name == "url"
					})

					if len(transformUrl) > 0 {
						log.Trace().Str("site", site.Name).Str("asset", asset.Name).Msg("visiting transforming url block")

						// we have a transform url block
						// only the first "url" transform is applied, in place
						t := transformUrl[0]
						for i, src := range captures {
							captures[i] = s.RegexCache[t.Pattern].ReplaceAllString(src, t.Replace)
						}

						log.Trace().Str("site", site.Name).Str("asset", asset.Name).Strs("matches", captures).Msgf("%d matched %s replaced", len(captures), utils.Plural(len(captures), "url", "urls"))
					}

					// MARK: - transform filename

					transformFilename := utils.Filter(asset.Transforms, func(t config.TransformConfig) bool {
						return t.Name == "filename"
					})

					// destinations maps captured source URL -> on-disk path
					destinations := make(map[string]string, 0)

					if len(transformFilename) > 0 {
						log.Trace().Str("site", site.Name).Str("asset", asset.Name).Msg("visiting transforming filename block")

						// we have a transform filename block
						// only the first "filename" transform is applied
						t := transformFilename[0]
						for _, src := range captures {
							fileName := s.RegexCache[t.Pattern].ReplaceAllString(src, t.Replace)

							// NOTE: the result of "transform filename" could be an absolute path!
							//       so we should not append if absolute
							if filepath.IsAbs(fileName) {
								// FIXME: we should disallow absolute paths
								// it's dangerous and they should be avoided
								destinations[src] = fileName
							} else {
								destinations[src] = filepath.Join(subdirectory, fileName)
							}

							// unescape the filename to write on disk
							unescaped, err := url.QueryUnescape(destinations[src])
							if err != nil {
								return &hcl.Diagnostics{{
									Severity: hcl.DiagError,
									Summary:  "Failed to unescape filename",
									Detail:   fmt.Sprintf("%s: %s", fileName, err.Error()),
								}}
							}

							destinations[src] = unescaped

							log.Trace().Str("site", site.Name).Str("asset", asset.Name).Str("source", src).Str("destination", destinations[src]).Msg("transformed filename")
						}
					} else {
						// we don't have any transform filename blocks
						for _, src := range captures {
							// simply get the filename from the url path
							fileName := filepath.Base(src)
							destinations[src] = filepath.Join(subdirectory, fileName)

							// unescape the filename to write on disk
							unescaped, err := url.QueryUnescape(destinations[src])
							if err != nil {
								return &hcl.Diagnostics{{
									Severity: hcl.DiagError,
									Summary:  "Failed to unescape filename",
									Detail:   fmt.Sprintf("%s: %s", fileName, err.Error()),
								}}
							}

							destinations[src] = unescaped

							log.Trace().Str("site", site.Name).Str("asset", asset.Name).Str("source", src).Str("destination", destinations[src]).Msg("transformed filename")
						}
					}

					// MARK: - loop through the map to check for relative urls

					resolvedDestinations := make(map[string]string, 0)
					for src, dst := range destinations {
						parsed, err := url.Parse(src)
						if err != nil {
							return &hcl.Diagnostics{{
								Severity: hcl.DiagError,
								Summary:  "Failed to parse url",
								Detail:   fmt.Sprintf("%s: %s", src, err.Error()),
							}}
						}

						// if path is still relative, append it to the scheme://domain.name of the page
						if !parsed.IsAbs() {
							resolved, err := base.Parse(src)
							if err != nil {
								return &hcl.Diagnostics{{
									Severity: hcl.DiagError,
									Summary:  "Failed to resolve relative url",
									Detail:   fmt.Sprintf("%s: %s", src, err.Error()),
								}}
							}

							resolvedDestinations[resolved.String()] = dst

							log.Trace().Str("site", site.Name).Str("asset", asset.Name).Str("source", src).Str("destination", resolved.String()).Msg("resolved relative url")
						} else {
							// nothing to do, the url is already absolute
							resolvedDestinations[src] = dst
						}
					}

					// initialize the map if nil
					if s.Config.Sites[siteIndex].Assets[assetIndex].Downloads == nil {
						s.Config.Sites[siteIndex].Assets[assetIndex].Downloads = make(map[string]string, 0)
					}

					// add the destinations to the asset
					for src, dst := range resolvedDestinations {
						s.Config.Sites[siteIndex].Assets[assetIndex].Downloads[src] = dst
					}

					// is this site going to perform downloads?
					// if len(resolvedDestinations) > 0 {
					// 	s.Config.Sites[siteIndex].HasMatches = true
					// }

					s.TotalAssets += int64(len(resolvedDestinations))
				}
			}

			// MARK: - Indexing

			// store the url and the timestamp by default
			infoMap := make(map[string]string, 0)
			infoMap["url"] = pageUrl
			infoMap["timestamp"] = time.Now().UTC().Format(time.RFC3339Nano)

			// loop through index blocks
			for _, info := range site.Infos {
				log.Trace().Str("site", site.Name).Str("info", info.Name).Msg("visiting info block")

				key := info.Name

				if s.RegexCache[info.Pattern].MatchString(body) {
					captures, err := utils.GetCaptures(s.RegexCache[info.Pattern], false, info.Capture, body)
					if err != nil {
						return &hcl.Diagnostics{{
							Severity: hcl.DiagError,
							Summary:  "Failed to get capture",
							Detail:   fmt.Sprintf("%s: %s", pageUrl, err.Error()),
						}}
					}

					// only the first capture is stored under the info's name
					if len(captures) > 0 {
						infoMap[key] = captures[0]
						log.Trace().Str("site", site.Name).Str("info", info.Name).Strs("matches", captures).Msgf("%d %s found", len(captures), utils.Plural(len(captures), "match", "matches"))
					}
				}
			}

			// initialize the map if nil
			if s.Config.Sites[siteIndex].InfoMap == nil {
				s.Config.Sites[siteIndex].InfoMap = make(config.InfoCacheMap, 0)
			}

			// NOTE(review): pages that share a subdirectory overwrite each
			// other's infoMap entry — last processed page wins.
			s.Config.Sites[siteIndex].InfoMap[subdirectory] = infoMap
		}
	}

	return &hcl.Diagnostics{}
}