github.com/everdrone/grab@v0.1.7-0.20230416223925-40674b995521/internal/instance/cache.go (about) 1 package instance 2 3 import ( 4 "fmt" 5 "net/url" 6 "path/filepath" 7 "time" 8 9 "github.com/rs/zerolog/log" 10 11 "github.com/everdrone/grab/internal/config" 12 "github.com/everdrone/grab/internal/net" 13 "github.com/everdrone/grab/internal/utils" 14 "github.com/hashicorp/hcl/v2" 15 ) 16 17 func (s *Grab) BuildSiteCache() { 18 for _, url := range s.URLs { 19 for i, site := range s.Config.Sites { 20 if s.RegexCache[site.Test].MatchString(url) { 21 if s.Config.Sites[i].URLs == nil { 22 s.Config.Sites[i].URLs = make([]string, 0) 23 } 24 25 s.Config.Sites[i].URLs = append(s.Config.Sites[i].URLs, url) 26 break 27 } 28 } 29 } 30 } 31 32 func removePathFromURL(str string) (*url.URL, error) { 33 base, err := url.Parse(str) 34 if err != nil { 35 return nil, err 36 } 37 // get only the base (scheme://host) 38 base.Path = "" 39 base.RawPath = "" 40 base.RawQuery = "" 41 base.Fragment = "" 42 base.RawFragment = "" 43 44 return base, nil 45 } 46 47 func (s *Grab) BuildAssetCache() *hcl.Diagnostics { 48 var diags *hcl.Diagnostics 49 50 for siteIndex, site := range s.Config.Sites { 51 log.Trace().Str("site", site.Name).Msg("visiting site block") 52 53 for _, pageUrl := range site.URLs { 54 log.Trace().Str("url", pageUrl).Msg("processing url") 55 56 // we already checked this url before, so we can skip the error 57 base, _ := removePathFromURL(pageUrl) 58 59 options := net.MergeFetchOptionsChain(s.Config.Global.Network, site.Network) 60 61 log.Info().Str("url", pageUrl).Msg("fetching") 62 63 // MARK: - get the page body 64 65 body, err := net.Fetch(pageUrl, options) 66 if err != nil { 67 diags = &hcl.Diagnostics{{ 68 Severity: hcl.DiagError, 69 Summary: "Failed to fetch page", 70 Detail: fmt.Sprintf("%s: %s", pageUrl, err.Error()), 71 }} 72 73 // if we are in strict mode we need to return immediately 74 if s.Flags.Strict { 75 return diags 76 } else { 77 // FIXME: warn the user that we are skipping this page 78 continue 79 } 80 } 81 82 // MARK: - get the destination path (subdirectory) 83 84 var subdirectory string 85 if site.Subdirectory != nil { 86 // we have a subdirectory block 87 88 log.Trace().Str("site", site.Name).Msg("visiting subdirectory block") 89 90 var source string 91 if site.Subdirectory.From == "url" { 92 source = pageUrl 93 } else { 94 source = body 95 } 96 97 subDirs, err := utils.GetCaptures(s.RegexCache[site.Subdirectory.Pattern], false, site.Subdirectory.Capture, source) 98 if err != nil { 99 return &hcl.Diagnostics{{ 100 Severity: hcl.DiagError, 101 Summary: "Failed to get subdirectory", 102 Detail: err.Error(), 103 }} 104 } 105 106 if len(subDirs) > 0 { 107 // do not append if the path is absolute 108 if filepath.IsAbs(subDirs[0]) { 109 subdirectory = subDirs[0] 110 } else { 111 subdirectory = filepath.Join(s.Config.Global.Location, site.Name, subDirs[0]) 112 } 113 114 log.Trace().Str("site", site.Name).Str("subdirectory", subdirectory).Msg("subdirectory path") 115 } 116 } else { 117 // we have no subdirectory block, just use the site name 118 subdirectory = filepath.Join(s.Config.Global.Location, site.Name) 119 120 log.Trace().Str("site", site.Name).Str("subdirectory", subdirectory).Msg("no subdirectory block") 121 } 122 123 // MARK: - loop through the asset blocks 124 125 for assetIndex, asset := range site.Assets { 126 log.Debug().Str("site", site.Name).Str("asset", asset.Name).Msg("visiting asset block") 127 128 // match against body 129 if s.RegexCache[asset.Pattern].MatchString(body) { 130 findAll := false 131 if asset.FindAll != nil { 132 findAll = *asset.FindAll 133 } 134 135 // get capture groups 136 captures, err := utils.GetCaptures(s.RegexCache[asset.Pattern], findAll, asset.Capture, body) 137 if err != nil { 138 return &hcl.Diagnostics{{ 139 Severity: hcl.DiagError, 140 Summary: "Failed to get captures", 141 Detail: fmt.Sprintf("%s: %s", pageUrl, err.Error()), 142 }} 143 } 144 145 // remove duplicates 146 captures = utils.Unique(captures) 147 148 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Strs("matches", captures).Msgf("%d %s found", len(captures), utils.Plural(len(captures), "match", "matches")) 149 150 // MARK: - transform url 151 152 // TODO: we should change the config schema to store transforms as a map 153 // where the key is the transform label, so we don't end up looping through an array 154 transformUrl := utils.Filter(asset.Transforms, func(t config.TransformConfig) bool { 155 return t.Name == "url" 156 }) 157 158 if len(transformUrl) > 0 { 159 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Msg("visiting transforming url block") 160 161 // we have a transform url block 162 t := transformUrl[0] 163 for i, src := range captures { 164 captures[i] = s.RegexCache[t.Pattern].ReplaceAllString(src, t.Replace) 165 } 166 167 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Strs("matches", captures).Msgf("%d matched %s replaced", len(captures), utils.Plural(len(captures), "url", "urls")) 168 } 169 170 // MARK: - transform filename 171 172 transformFilename := utils.Filter(asset.Transforms, func(t config.TransformConfig) bool { 173 return t.Name == "filename" 174 }) 175 176 destinations := make(map[string]string, 0) 177 178 if len(transformFilename) > 0 { 179 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Msg("visiting transforming filename block") 180 181 // we have a transform filename block 182 t := transformFilename[0] 183 for _, src := range captures { 184 fileName := s.RegexCache[t.Pattern].ReplaceAllString(src, t.Replace) 185 186 // NOTE: the result of "transform filename" could be an absolute path! 187 // so we should not append if absolute 188 if filepath.IsAbs(fileName) { 189 // FIXME: we should disallow absolute paths 190 // it's dangerous and they should be avoided 191 destinations[src] = fileName 192 } else { 193 destinations[src] = filepath.Join(subdirectory, fileName) 194 } 195 196 // unescape the filename to write on disk 197 unescaped, err := url.QueryUnescape(destinations[src]) 198 if err != nil { 199 return &hcl.Diagnostics{{ 200 Severity: hcl.DiagError, 201 Summary: "Failed to unescape filename", 202 Detail: fmt.Sprintf("%s: %s", fileName, err.Error()), 203 }} 204 } 205 206 destinations[src] = unescaped 207 208 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Str("source", src).Str("destination", destinations[src]).Msg("transformed filename") 209 } 210 } else { 211 // we don't have any transform filename blocks 212 for _, src := range captures { 213 // simply get the filename from the url path 214 fileName := filepath.Base(src) 215 destinations[src] = filepath.Join(subdirectory, fileName) 216 217 // unescape the filename to write on disk 218 unescaped, err := url.QueryUnescape(destinations[src]) 219 if err != nil { 220 return &hcl.Diagnostics{{ 221 Severity: hcl.DiagError, 222 Summary: "Failed to unescape filename", 223 Detail: fmt.Sprintf("%s: %s", fileName, err.Error()), 224 }} 225 } 226 227 destinations[src] = unescaped 228 229 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Str("source", src).Str("destination", destinations[src]).Msg("transformed filename") 230 } 231 } 232 233 // MARK: - loop through the map to check for relative urls 234 235 resolvedDestinations := make(map[string]string, 0) 236 for src, dst := range destinations { 237 parsed, err := url.Parse(src) 238 if err != nil { 239 return &hcl.Diagnostics{{ 240 Severity: hcl.DiagError, 241 Summary: "Failed to parse url", 242 Detail: fmt.Sprintf("%s: %s", src, err.Error()), 243 }} 244 } 245 246 // if path is still relative, append it to the scheme://domain.name of the page 247 if !parsed.IsAbs() { 248 resolved, err := base.Parse(src) 249 if err != nil { 250 return &hcl.Diagnostics{{ 251 Severity: hcl.DiagError, 252 Summary: "Failed to resolve relative url", 253 Detail: fmt.Sprintf("%s: %s", src, err.Error()), 254 }} 255 } 256 257 resolvedDestinations[resolved.String()] = dst 258 259 log.Trace().Str("site", site.Name).Str("asset", asset.Name).Str("source", src).Str("destination", resolved.String()).Msg("resolved relative url") 260 } else { 261 // nothing to do, the url is already absolute 262 resolvedDestinations[src] = dst 263 } 264 } 265 266 // initialize the map if nil 267 if s.Config.Sites[siteIndex].Assets[assetIndex].Downloads == nil { 268 s.Config.Sites[siteIndex].Assets[assetIndex].Downloads = make(map[string]string, 0) 269 } 270 271 // add the destinations to the asset 272 for src, dst := range resolvedDestinations { 273 s.Config.Sites[siteIndex].Assets[assetIndex].Downloads[src] = dst 274 } 275 276 // is this site going to perform downloads? 277 // if len(resolvedDestinations) > 0 { 278 // s.Config.Sites[siteIndex].HasMatches = true 279 // } 280 281 s.TotalAssets += int64(len(resolvedDestinations)) 282 } 283 } 284 285 // MARK: - Indexing 286 287 // store the url and the timestamp by default 288 infoMap := make(map[string]string, 0) 289 infoMap["url"] = pageUrl 290 infoMap["timestamp"] = time.Now().UTC().Format(time.RFC3339Nano) 291 292 // loop through index blocks 293 for _, info := range site.Infos { 294 log.Trace().Str("site", site.Name).Str("info", info.Name).Msg("visiting info block") 295 296 key := info.Name 297 298 if s.RegexCache[info.Pattern].MatchString(body) { 299 captures, err := utils.GetCaptures(s.RegexCache[info.Pattern], false, info.Capture, body) 300 if err != nil { 301 return &hcl.Diagnostics{{ 302 Severity: hcl.DiagError, 303 Summary: "Failed to get capture", 304 Detail: fmt.Sprintf("%s: %s", pageUrl, err.Error()), 305 }} 306 } 307 308 if len(captures) > 0 { 309 infoMap[key] = captures[0] 310 log.Trace().Str("site", site.Name).Str("info", info.Name).Strs("matches", captures).Msgf("%d %s found", len(captures), utils.Plural(len(captures), "match", "matches")) 311 } 312 } 313 } 314 315 if s.Config.Sites[siteIndex].InfoMap == nil { 316 s.Config.Sites[siteIndex].InfoMap = make(config.InfoCacheMap, 0) 317 } 318 319 s.Config.Sites[siteIndex].InfoMap[subdirectory] = infoMap 320 } 321 } 322 323 return &hcl.Diagnostics{} 324 }