github.com/serversong/goreporter@v0.0.0-20200325104552-3cfaf44fd178/linters/copycheck/syntax/syntax.go (about)

     1  package syntax
     2  
     3  import (
     4  	"crypto/sha1"
     5  
     6  	"github.com/360EntSecGroup-Skylar/goreporter/linters/copycheck/suffixtree"
     7  )
     8  
     9  type Node struct {
    10  	Type     int
    11  	Filename string
    12  	Pos, End int
    13  	Children []*Node
    14  	Owns     int
    15  }
    16  
    17  func NewNode() *Node {
    18  	return &Node{}
    19  }
    20  
    21  func (n *Node) AddChildren(children ...*Node) {
    22  	n.Children = append(n.Children, children...)
    23  }
    24  
    25  func (n *Node) Val() int {
    26  	return n.Type
    27  }
    28  
    29  type Match struct {
    30  	Hash  string
    31  	Frags [][]*Node
    32  }
    33  
    34  func Serialize(n *Node) []*Node {
    35  	stream := make([]*Node, 0, 10)
    36  	serial(n, &stream)
    37  	return stream
    38  }
    39  
    40  func serial(n *Node, stream *[]*Node) int {
    41  	*stream = append(*stream, n)
    42  	var count int
    43  	for _, child := range n.Children {
    44  		count += serial(child, stream)
    45  	}
    46  	n.Owns = count
    47  	return count + 1
    48  }
    49  
    50  // FindSyntaxUnits finds all complete syntax units in the match group and returns them
    51  // with the corresponding hash.
    52  func FindSyntaxUnits(data []*Node, m suffixtree.Match, threshold int) Match {
    53  	if len(m.Ps) == 0 {
    54  		return Match{}
    55  	}
    56  	firstSeq := data[m.Ps[0] : m.Ps[0]+m.Len]
    57  	indexes := getUnitsIndexes(firstSeq, threshold)
    58  
    59  	// TODO: is this really working?
    60  	indexCnt := len(indexes)
    61  	if indexCnt > 0 {
    62  		lasti := indexes[indexCnt-1]
    63  		firstn := firstSeq[lasti]
    64  		for i := 1; i < len(m.Ps); i++ {
    65  			n := data[int(m.Ps[i])+lasti]
    66  			if firstn.Owns != n.Owns {
    67  				indexes = indexes[:indexCnt-1]
    68  				break
    69  			}
    70  		}
    71  	}
    72  	if len(indexes) == 0 || isCyclic(indexes, firstSeq) || spansMultipleFiles(indexes, firstSeq) {
    73  		return Match{}
    74  	}
    75  
    76  	match := Match{Frags: make([][]*Node, len(m.Ps))}
    77  	for i, pos := range m.Ps {
    78  		match.Frags[i] = make([]*Node, len(indexes))
    79  		for j, index := range indexes {
    80  			match.Frags[i][j] = data[int(pos)+index]
    81  		}
    82  	}
    83  
    84  	lastIndex := indexes[len(indexes)-1]
    85  	match.Hash = hashSeq(firstSeq[indexes[0] : lastIndex+firstSeq[lastIndex].Owns])
    86  	return match
    87  }
    88  
    89  func getUnitsIndexes(nodeSeq []*Node, threshold int) []int {
    90  	var indexes []int
    91  	var split bool
    92  	for i := 0; i < len(nodeSeq); {
    93  		n := nodeSeq[i]
    94  		switch {
    95  		case n.Owns >= len(nodeSeq)-i:
    96  			// not complete syntax unit
    97  			i++
    98  			split = true
    99  			continue
   100  		case n.Owns+1 < threshold:
   101  			split = true
   102  		default:
   103  			if split {
   104  				indexes = indexes[:0]
   105  				split = false
   106  			}
   107  			indexes = append(indexes, i)
   108  		}
   109  		i += n.Owns + 1
   110  	}
   111  	return indexes
   112  }
   113  
   114  // isCyclic finds out whether there is a repetive pattern in the found clone. If positive,
   115  // it return false to point out that the clone would be redundant.
   116  func isCyclic(indexes []int, nodes []*Node) bool {
   117  	cnt := len(indexes)
   118  	if cnt <= 1 {
   119  		return false
   120  	}
   121  
   122  	alts := make(map[int]bool)
   123  	for i := 1; i <= cnt/2; i++ {
   124  		if cnt%i == 0 {
   125  			alts[i] = true
   126  		}
   127  	}
   128  
   129  	for i := 0; i < indexes[cnt/2]; i++ {
   130  		nstart := nodes[i+indexes[0]]
   131  	AltLoop:
   132  		for alt := range alts {
   133  			for j := alt; j < cnt; j += alt {
   134  				index := i + indexes[j]
   135  				if index < len(nodes) {
   136  					nalt := nodes[index]
   137  					if nstart.Owns == nalt.Owns && nstart.Type == nalt.Type {
   138  						continue
   139  					}
   140  				} else if i >= indexes[alt] {
   141  					return true
   142  				}
   143  				delete(alts, alt)
   144  				continue AltLoop
   145  			}
   146  		}
   147  		if len(alts) == 0 {
   148  			return false
   149  		}
   150  	}
   151  	return true
   152  }
   153  
   154  func spansMultipleFiles(indexes []int, nodes []*Node) bool {
   155  	if len(indexes) < 2 {
   156  		return false
   157  	}
   158  	f := nodes[indexes[0]].Filename
   159  	for i := 1; i < len(indexes); i++ {
   160  		if nodes[indexes[i]].Filename != f {
   161  			return true
   162  		}
   163  	}
   164  	return false
   165  }
   166  
   167  func hashSeq(nodes []*Node) string {
   168  	h := sha1.New()
   169  	bytes := make([]byte, len(nodes))
   170  	for i, node := range nodes {
   171  		bytes[i] = byte(node.Type)
   172  	}
   173  	h.Write(bytes)
   174  	return string(h.Sum(nil))
   175  }