github.com/serversong/goreporter@v0.0.0-20200325104552-3cfaf44fd178/linters/copycheck/syntax/syntax.go (about) 1 package syntax 2 3 import ( 4 "crypto/sha1" 5 6 "github.com/360EntSecGroup-Skylar/goreporter/linters/copycheck/suffixtree" 7 ) 8 9 type Node struct { 10 Type int 11 Filename string 12 Pos, End int 13 Children []*Node 14 Owns int 15 } 16 17 func NewNode() *Node { 18 return &Node{} 19 } 20 21 func (n *Node) AddChildren(children ...*Node) { 22 n.Children = append(n.Children, children...) 23 } 24 25 func (n *Node) Val() int { 26 return n.Type 27 } 28 29 type Match struct { 30 Hash string 31 Frags [][]*Node 32 } 33 34 func Serialize(n *Node) []*Node { 35 stream := make([]*Node, 0, 10) 36 serial(n, &stream) 37 return stream 38 } 39 40 func serial(n *Node, stream *[]*Node) int { 41 *stream = append(*stream, n) 42 var count int 43 for _, child := range n.Children { 44 count += serial(child, stream) 45 } 46 n.Owns = count 47 return count + 1 48 } 49 50 // FindSyntaxUnits finds all complete syntax units in the match group and returns them 51 // with the corresponding hash. 52 func FindSyntaxUnits(data []*Node, m suffixtree.Match, threshold int) Match { 53 if len(m.Ps) == 0 { 54 return Match{} 55 } 56 firstSeq := data[m.Ps[0] : m.Ps[0]+m.Len] 57 indexes := getUnitsIndexes(firstSeq, threshold) 58 59 // TODO: is this really working? 60 indexCnt := len(indexes) 61 if indexCnt > 0 { 62 lasti := indexes[indexCnt-1] 63 firstn := firstSeq[lasti] 64 for i := 1; i < len(m.Ps); i++ { 65 n := data[int(m.Ps[i])+lasti] 66 if firstn.Owns != n.Owns { 67 indexes = indexes[:indexCnt-1] 68 break 69 } 70 } 71 } 72 if len(indexes) == 0 || isCyclic(indexes, firstSeq) || spansMultipleFiles(indexes, firstSeq) { 73 return Match{} 74 } 75 76 match := Match{Frags: make([][]*Node, len(m.Ps))} 77 for i, pos := range m.Ps { 78 match.Frags[i] = make([]*Node, len(indexes)) 79 for j, index := range indexes { 80 match.Frags[i][j] = data[int(pos)+index] 81 } 82 } 83 84 lastIndex := indexes[len(indexes)-1] 85 match.Hash = hashSeq(firstSeq[indexes[0] : lastIndex+firstSeq[lastIndex].Owns]) 86 return match 87 } 88 89 func getUnitsIndexes(nodeSeq []*Node, threshold int) []int { 90 var indexes []int 91 var split bool 92 for i := 0; i < len(nodeSeq); { 93 n := nodeSeq[i] 94 switch { 95 case n.Owns >= len(nodeSeq)-i: 96 // not complete syntax unit 97 i++ 98 split = true 99 continue 100 case n.Owns+1 < threshold: 101 split = true 102 default: 103 if split { 104 indexes = indexes[:0] 105 split = false 106 } 107 indexes = append(indexes, i) 108 } 109 i += n.Owns + 1 110 } 111 return indexes 112 } 113 114 // isCyclic finds out whether there is a repetive pattern in the found clone. If positive, 115 // it return false to point out that the clone would be redundant. 116 func isCyclic(indexes []int, nodes []*Node) bool { 117 cnt := len(indexes) 118 if cnt <= 1 { 119 return false 120 } 121 122 alts := make(map[int]bool) 123 for i := 1; i <= cnt/2; i++ { 124 if cnt%i == 0 { 125 alts[i] = true 126 } 127 } 128 129 for i := 0; i < indexes[cnt/2]; i++ { 130 nstart := nodes[i+indexes[0]] 131 AltLoop: 132 for alt := range alts { 133 for j := alt; j < cnt; j += alt { 134 index := i + indexes[j] 135 if index < len(nodes) { 136 nalt := nodes[index] 137 if nstart.Owns == nalt.Owns && nstart.Type == nalt.Type { 138 continue 139 } 140 } else if i >= indexes[alt] { 141 return true 142 } 143 delete(alts, alt) 144 continue AltLoop 145 } 146 } 147 if len(alts) == 0 { 148 return false 149 } 150 } 151 return true 152 } 153 154 func spansMultipleFiles(indexes []int, nodes []*Node) bool { 155 if len(indexes) < 2 { 156 return false 157 } 158 f := nodes[indexes[0]].Filename 159 for i := 1; i < len(indexes); i++ { 160 if nodes[indexes[i]].Filename != f { 161 return true 162 } 163 } 164 return false 165 } 166 167 func hashSeq(nodes []*Node) string { 168 h := sha1.New() 169 bytes := make([]byte, len(nodes)) 170 for i, node := range nodes { 171 bytes[i] = byte(node.Type) 172 } 173 h.Write(bytes) 174 return string(h.Sum(nil)) 175 }