github.com/graemephi/kahugo@v0.62.3-0.20211121071557-d78c0423784d/transform/urlreplacers/absurlreplacer.go (about)

     1  // Copyright 2018 The Hugo Authors. All rights reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  // http://www.apache.org/licenses/LICENSE-2.0
     7  //
     8  // Unless required by applicable law or agreed to in writing, software
     9  // distributed under the License is distributed on an "AS IS" BASIS,
    10  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    11  // See the License for the specific language governing permissions and
    12  // limitations under the License.
    13  
    14  package urlreplacers
    15  
    16  import (
    17  	"bytes"
    18  	"io"
    19  	"unicode"
    20  	"unicode/utf8"
    21  
    22  	"github.com/gohugoio/hugo/transform"
    23  )
    24  
    25  type absurllexer struct {
    26  	// the source to absurlify
    27  	content []byte
    28  	// the target for the new absurlified content
    29  	w io.Writer
    30  
    31  	// path may be set to a "." relative path
    32  	path []byte
    33  
    34  	pos   int // input position
    35  	start int // item start position
    36  
    37  	quotes [][]byte
    38  }
    39  
    40  type prefix struct {
    41  	disabled bool
    42  	b        []byte
    43  	f        func(l *absurllexer)
    44  
    45  	nextPos int
    46  }
    47  
    48  func (p *prefix) find(bs []byte, start int) bool {
    49  	if p.disabled {
    50  		return false
    51  	}
    52  
    53  	if p.nextPos == -1 {
    54  		idx := bytes.Index(bs[start:], p.b)
    55  
    56  		if idx == -1 {
    57  			p.disabled = true
    58  			// Find the closest match
    59  			return false
    60  		}
    61  
    62  		p.nextPos = start + idx + len(p.b)
    63  	}
    64  
    65  	return true
    66  }
    67  
    68  func newPrefixState() []*prefix {
    69  	return []*prefix{
    70  		{b: []byte("src="), f: checkCandidateBase},
    71  		{b: []byte("href="), f: checkCandidateBase},
    72  		{b: []byte("url="), f: checkCandidateBase},
    73  		{b: []byte("action="), f: checkCandidateBase},
    74  		{b: []byte("srcset="), f: checkCandidateSrcset},
    75  	}
    76  }
    77  
    78  func (l *absurllexer) emit() {
    79  	l.w.Write(l.content[l.start:l.pos])
    80  	l.start = l.pos
    81  }
    82  
    83  var (
    84  	relURLPrefix    = []byte("/")
    85  	relURLPrefixLen = len(relURLPrefix)
    86  )
    87  
    88  func (l *absurllexer) consumeQuote() []byte {
    89  	for _, q := range l.quotes {
    90  		if bytes.HasPrefix(l.content[l.pos:], q) {
    91  			l.pos += len(q)
    92  			l.emit()
    93  			return q
    94  		}
    95  	}
    96  	return nil
    97  }
    98  
    99  // handle URLs in src and href.
   100  func checkCandidateBase(l *absurllexer) {
   101  	l.consumeQuote()
   102  
   103  	if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
   104  		return
   105  	}
   106  
   107  	// check for schemaless URLs
   108  	posAfter := l.pos + relURLPrefixLen
   109  	if posAfter >= len(l.content) {
   110  		return
   111  	}
   112  	r, _ := utf8.DecodeRune(l.content[posAfter:])
   113  	if r == '/' {
   114  		// schemaless: skip
   115  		return
   116  	}
   117  	if l.pos > l.start {
   118  		l.emit()
   119  	}
   120  	l.pos += relURLPrefixLen
   121  	l.w.Write(l.path)
   122  	l.start = l.pos
   123  }
   124  
   125  func (l *absurllexer) posAfterURL(q []byte) int {
   126  	if len(q) > 0 {
   127  		// look for end quote
   128  		return bytes.Index(l.content[l.pos:], q)
   129  	}
   130  
   131  	return bytes.IndexFunc(l.content[l.pos:], func(r rune) bool {
   132  		return r == '>' || unicode.IsSpace(r)
   133  	})
   134  }
   135  
   136  // handle URLs in srcset.
   137  func checkCandidateSrcset(l *absurllexer) {
   138  	q := l.consumeQuote()
   139  	if q == nil {
   140  		// srcset needs to be quoted.
   141  		return
   142  	}
   143  
   144  	// special case, not frequent (me think)
   145  	if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
   146  		return
   147  	}
   148  
   149  	// check for schemaless URLs
   150  	posAfter := l.pos + relURLPrefixLen
   151  	if posAfter >= len(l.content) {
   152  		return
   153  	}
   154  	r, _ := utf8.DecodeRune(l.content[posAfter:])
   155  	if r == '/' {
   156  		// schemaless: skip
   157  		return
   158  	}
   159  
   160  	posEnd := l.posAfterURL(q)
   161  
   162  	// safe guard
   163  	if posEnd < 0 || posEnd > 2000 {
   164  		return
   165  	}
   166  
   167  	if l.pos > l.start {
   168  		l.emit()
   169  	}
   170  
   171  	section := l.content[l.pos : l.pos+posEnd+1]
   172  
   173  	fields := bytes.Fields(section)
   174  	for i, f := range fields {
   175  		if f[0] == '/' {
   176  			l.w.Write(l.path)
   177  			l.w.Write(f[1:])
   178  
   179  		} else {
   180  			l.w.Write(f)
   181  		}
   182  
   183  		if i < len(fields)-1 {
   184  			l.w.Write([]byte(" "))
   185  		}
   186  	}
   187  
   188  	l.pos += len(section)
   189  	l.start = l.pos
   190  }
   191  
   192  // main loop
   193  func (l *absurllexer) replace() {
   194  	contentLength := len(l.content)
   195  
   196  	prefixes := newPrefixState()
   197  
   198  	for {
   199  		if l.pos >= contentLength {
   200  			break
   201  		}
   202  
   203  		var match *prefix
   204  
   205  		for _, p := range prefixes {
   206  			if !p.find(l.content, l.pos) {
   207  				continue
   208  			}
   209  
   210  			if match == nil || p.nextPos < match.nextPos {
   211  				match = p
   212  			}
   213  		}
   214  
   215  		if match == nil {
   216  			// Done!
   217  			l.pos = contentLength
   218  			break
   219  		} else {
   220  			l.pos = match.nextPos
   221  			match.nextPos = -1
   222  			match.f(l)
   223  		}
   224  	}
   225  	// Done!
   226  	if l.pos > l.start {
   227  		l.emit()
   228  	}
   229  }
   230  
   231  func doReplace(path string, ct transform.FromTo, quotes [][]byte) {
   232  	lexer := &absurllexer{
   233  		content: ct.From().Bytes(),
   234  		w:       ct.To(),
   235  		path:    []byte(path),
   236  		quotes:  quotes,
   237  	}
   238  
   239  	lexer.replace()
   240  }
   241  
   242  type absURLReplacer struct {
   243  	htmlQuotes [][]byte
   244  	xmlQuotes  [][]byte
   245  }
   246  
   247  func newAbsURLReplacer() *absURLReplacer {
   248  	return &absURLReplacer{
   249  		htmlQuotes: [][]byte{[]byte("\""), []byte("'")},
   250  		xmlQuotes:  [][]byte{[]byte("&#34;"), []byte("&#39;")},
   251  	}
   252  }
   253  
   254  func (au *absURLReplacer) replaceInHTML(path string, ct transform.FromTo) {
   255  	doReplace(path, ct, au.htmlQuotes)
   256  }
   257  
   258  func (au *absURLReplacer) replaceInXML(path string, ct transform.FromTo) {
   259  	doReplace(path, ct, au.xmlQuotes)
   260  }