github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/email/parser.go (about) 1 // Copyright 2017 syzkaller project authors. All rights reserved. 2 // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 3 4 package email 5 6 import ( 7 "encoding/base64" 8 "fmt" 9 "io" 10 "mime" 11 "mime/multipart" 12 "mime/quotedprintable" 13 "net/mail" 14 "net/url" 15 "regexp" 16 "sort" 17 "strings" 18 "time" 19 "unicode" 20 ) 21 22 type Email struct { 23 BugIDs []string 24 MessageID string 25 InReplyTo string 26 Date time.Time 27 Link string 28 Subject string 29 MailingList string 30 Author string 31 OwnEmail bool 32 Cc []string 33 RawCc []string // unstripped emails 34 Body string // text/plain part 35 Patch string // attached patch, if any 36 Commands []*SingleCommand 37 } 38 39 type SingleCommand struct { 40 Command Command 41 Str string // string representation 42 Args string // arguments for the command 43 } 44 45 type Command int 46 47 const ( 48 CmdUnknown Command = iota 49 CmdUpstream 50 CmdFix 51 CmdUnFix 52 CmdDup 53 CmdUnDup 54 CmdTest 55 CmdInvalid 56 CmdUnCC 57 CmdSet 58 CmdUnset 59 CmdRegenerate 60 61 cmdTest5 62 ) 63 64 const ForwardedPrefix = "Forwarded: " 65 66 var groupsLinkRe = regexp.MustCompile(`(?m)\nTo view this discussion (?:on the web )?visit` + 67 ` (https://groups\.google\.com/.*?)\.(:?$|\n|\r)`) 68 69 func prepareEmails(list []string) map[string]bool { 70 ret := make(map[string]bool) 71 for _, email := range list { 72 ret[email] = true 73 if addr, err := mail.ParseAddress(email); err == nil { 74 ret[addr.Address] = true 75 } 76 } 77 return ret 78 } 79 80 func Parse(r io.Reader, ownEmails, goodLists, domains []string) (*Email, error) { 81 msg, err := mail.ReadMessage(r) 82 if err != nil { 83 return nil, fmt.Errorf("failed to read email: %w", err) 84 } 85 from, err := msg.Header.AddressList("From") 86 if err != nil { 87 return nil, fmt.Errorf("failed to parse email header 'From': %w", err) 88 } 89 if len(from) == 0 { 90 return nil, fmt.Errorf("failed to parse email header 'To': no senders") 91 } 92 // Ignore errors since To: header may not be present (we've seen such case). 93 to, _ := msg.Header.AddressList("To") 94 // AddressList fails if the header is not present. 95 cc, _ := msg.Header.AddressList("Cc") 96 var ccList []string 97 ownAddrs := prepareEmails(ownEmails) 98 fromMe := false 99 for _, addr := range from { 100 cleaned, _, _ := RemoveAddrContext(addr.Address) 101 if addr, err := mail.ParseAddress(cleaned); err == nil && ownAddrs[addr.Address] { 102 fromMe = true 103 } 104 } 105 106 originalFrom := "" 107 // Ignore error since the header might not be present. 108 originalFroms, _ := msg.Header.AddressList("X-Original-From") 109 if len(originalFroms) > 0 { 110 originalFrom = originalFroms[0].String() 111 } 112 113 bugIDs := []string{} 114 rawCcList := append(append(append(cc, to...), from...), originalFroms...) 115 for _, addr := range rawCcList { 116 cleaned, context, _ := RemoveAddrContext(addr.Address) 117 if addr, err := mail.ParseAddress(cleaned); err == nil { 118 cleaned = addr.Address 119 } 120 if ownAddrs[cleaned] { 121 bugIDs = append(bugIDs, context) 122 } else { 123 ccList = append(ccList, CanonicalEmail(cleaned)) 124 } 125 } 126 ccList = MergeEmailLists(ccList) 127 128 sender := "" 129 // Ignore error since the header might not be present. 130 senders, _ := msg.Header.AddressList("Sender") 131 if len(senders) > 0 { 132 sender = senders[0].Address 133 } 134 135 body, attachments, err := parseBody(msg.Body, msg.Header) 136 if err != nil { 137 return nil, err 138 } 139 bodyStr := string(body) 140 subject := decodeSubject(msg.Header.Get("Subject")) 141 var cmds []*SingleCommand 142 var patch string 143 if !fromMe { 144 for _, a := range attachments { 145 patch = ParsePatch(a) 146 if patch != "" { 147 break 148 } 149 } 150 if patch == "" { 151 patch = ParsePatch(body) 152 } 153 cmds = extractCommands(subject + "\n" + bodyStr) 154 } 155 bugIDs = append(bugIDs, extractBodyBugIDs(bodyStr, ownAddrs, domains)...) 156 157 link := "" 158 if match := groupsLinkRe.FindStringSubmatchIndex(bodyStr); match != nil { 159 link = bodyStr[match[2]:match[3]] 160 if unescaped, err := url.QueryUnescape(link); err == nil { 161 link = unescaped 162 } 163 } 164 165 author := CanonicalEmail(from[0].Address) 166 mailingList := "" 167 168 goodListsMap := prepareEmails(goodLists) 169 if goodListsMap[author] { 170 // In some cases, the mailing list would change From and introduce X-Original-From. 171 mailingList = author 172 if originalFrom != "" { 173 author = CanonicalEmail(originalFrom) 174 } 175 // Not sure if `else` can happen here, but let it be mailingList == author in this case. 176 } else if goodListsMap[CanonicalEmail(sender)] { 177 // In other cases, the mailing list would preserve From and just change Sender. 178 mailingList = CanonicalEmail(sender) 179 } 180 date, _ := mail.ParseDate(msg.Header.Get("Date")) 181 email := &Email{ 182 BugIDs: unique(bugIDs), 183 MessageID: msg.Header.Get("Message-ID"), 184 InReplyTo: extractInReplyTo(msg.Header), 185 Date: date, 186 Link: link, 187 Author: author, 188 OwnEmail: fromMe, 189 MailingList: mailingList, 190 Subject: subject, 191 Cc: ccList, 192 RawCc: mergeRawAddresses(from, originalFroms, to, cc), 193 Body: bodyStr, 194 Patch: patch, 195 Commands: cmds, 196 } 197 return email, nil 198 } 199 200 // AddAddrContext embeds context into local part of the provided email address using '+'. 201 // Returns the resulting email address. 202 func AddAddrContext(email, context string) (string, error) { 203 addr, err := mail.ParseAddress(email) 204 if err != nil { 205 return "", fmt.Errorf("failed to parse %q as email: %w", email, err) 206 } 207 at := strings.IndexByte(addr.Address, '@') 208 if at == -1 { 209 return "", fmt.Errorf("failed to parse %q as email: no @", email) 210 } 211 result := addr.Address[:at] 212 if context != "" { 213 result += "+" + context 214 } 215 result += addr.Address[at:] 216 if addr.Name != "" { 217 addr.Address = result 218 result = addr.String() 219 } 220 return result, nil 221 } 222 223 // RemoveAddrContext extracts context after '+' from the local part of the provided email address. 224 // Returns address without the context and the context. 225 func RemoveAddrContext(email string) (string, string, error) { 226 addr, err := mail.ParseAddress(email) 227 if err != nil { 228 return "", "", fmt.Errorf("failed to parse %q as email: %w", email, err) 229 } 230 at := strings.IndexByte(addr.Address, '@') 231 if at == -1 { 232 return "", "", fmt.Errorf("failed to parse %q as email: no @", email) 233 } 234 plus := strings.LastIndexByte(addr.Address[:at], '+') 235 if plus == -1 { 236 return email, "", nil 237 } 238 context := addr.Address[plus+1 : at] 239 addr.Address = addr.Address[:plus] + addr.Address[at:] 240 return addr.String(), context, nil 241 } 242 243 func CanonicalEmail(email string) string { 244 addr, err := mail.ParseAddress(email) 245 if err != nil { 246 return email 247 } 248 at := strings.IndexByte(addr.Address, '@') 249 if at == -1 { 250 return email 251 } 252 if plus := strings.IndexByte(addr.Address[:at], '+'); plus != -1 { 253 addr.Address = addr.Address[:plus] + addr.Address[at:] 254 } 255 return strings.ToLower(addr.Address) 256 } 257 258 func extractCommands(body string) []*SingleCommand { 259 var ret []*SingleCommand 260 for body != "" { 261 cmd, end := extractCommand(body) 262 if cmd == nil { 263 break 264 } 265 ret = append(ret, cmd) 266 body = body[end:] 267 } 268 return ret 269 } 270 271 const commandPrefix = "#syz" 272 273 var commandStartRe = regexp.MustCompile(`(?:^|\n)(` + regexp.QuoteMeta(commandPrefix) + `[ \t-:])`) 274 275 // extractCommand extracts command to syzbot from email body. 276 // Commands are of the following form: 277 // ^#syz cmd args... 278 func extractCommand(body string) (*SingleCommand, int) { 279 var cmd Command 280 var str, args string 281 282 match := commandStartRe.FindStringSubmatchIndex(body) 283 if match == nil { 284 return nil, 0 285 } 286 cmdPos := match[2] + len(commandPrefix) + 1 287 for cmdPos < len(body) && unicode.IsSpace(rune(body[cmdPos])) { 288 cmdPos++ 289 } 290 cmdEnd := strings.IndexByte(body[cmdPos:], '\n') 291 if cmdEnd == -1 { 292 cmdEnd = len(body) - cmdPos 293 } 294 if cmdEnd1 := strings.IndexByte(body[cmdPos:], '\r'); cmdEnd1 != -1 && cmdEnd1 < cmdEnd { 295 cmdEnd = cmdEnd1 296 } 297 if cmdEnd1 := strings.IndexByte(body[cmdPos:], ' '); cmdEnd1 != -1 && cmdEnd1 < cmdEnd { 298 cmdEnd = cmdEnd1 299 } 300 if cmdEnd1 := strings.IndexByte(body[cmdPos:], '\t'); cmdEnd1 != -1 && cmdEnd1 < cmdEnd { 301 cmdEnd = cmdEnd1 302 } 303 str = body[cmdPos : cmdPos+cmdEnd] 304 cmd = strToCmd(str) 305 // Some email clients split text emails at 80 columns are the transformation is irrevesible. 306 // We try hard to restore what was there before. 307 // For "test:" command we know that there must be 2 tokens without spaces. 308 // For "fix:"/"dup:" we need a whole non-empty line of text. 309 switch cmd { 310 case CmdTest: 311 if strings.HasSuffix(str, ":") { 312 // For "#syz test:", we do want to query 2 arguments. 313 args = extractArgsTokens(body[cmdPos+cmdEnd:], 2) 314 } else { 315 // For "#syz test", it's likely there won't be anything else, so let's only parse 316 // the first line. 317 args = extractArgsLine(body[cmdPos+cmdEnd:], false) 318 } 319 case CmdSet, CmdUnset: 320 args = extractArgsLine(body[cmdPos+cmdEnd:], true) 321 case cmdTest5: 322 args = extractArgsTokens(body[cmdPos+cmdEnd:], 5) 323 case CmdFix, CmdDup: 324 args = extractArgsLine(body[cmdPos+cmdEnd:], true) 325 } 326 return &SingleCommand{ 327 Command: cmd, 328 Str: str, 329 Args: args, 330 }, cmdPos + cmdEnd 331 } 332 333 func strToCmd(str string) Command { 334 switch str { 335 default: 336 return CmdUnknown 337 case "upstream": 338 return CmdUpstream 339 case "fix", "fix:": 340 return CmdFix 341 case "unfix": 342 return CmdUnFix 343 case "dup", "dup:": 344 return CmdDup 345 case "undup": 346 return CmdUnDup 347 case "test", "test:": 348 return CmdTest 349 case "invalid": 350 return CmdInvalid 351 case "uncc", "uncc:": 352 return CmdUnCC 353 case "set", "set:": 354 return CmdSet 355 case "unset", "unset:": 356 return CmdUnset 357 case "regenerate": 358 return CmdRegenerate 359 case "test_5_arg_cmd": 360 return cmdTest5 361 } 362 } 363 364 func extractArgsTokens(body string, num int) string { 365 var args []string 366 for pos := 0; len(args) < num && pos < len(body); { 367 lineEnd := strings.IndexByte(body[pos:], '\n') 368 if lineEnd == -1 { 369 lineEnd = len(body) - pos 370 } 371 line := strings.TrimSpace(strings.ReplaceAll(body[pos:pos+lineEnd], "\t", " ")) 372 for { 373 line1 := strings.ReplaceAll(line, " ", " ") 374 if line == line1 { 375 break 376 } 377 line = line1 378 } 379 if line != "" { 380 args = append(args, strings.Split(line, " ")...) 381 } 382 pos += lineEnd + 1 383 } 384 return strings.TrimSpace(strings.Join(args, " ")) 385 } 386 387 func extractArgsLine(body string, skipWs bool) string { 388 pos := 0 389 if skipWs { 390 for pos < len(body) && unicode.IsSpace(rune(body[pos])) { 391 pos++ 392 } 393 } 394 lineEnd := strings.IndexByte(body[pos:], '\n') 395 if lineEnd == -1 { 396 lineEnd = len(body) - pos 397 } 398 return strings.TrimSpace(body[pos : pos+lineEnd]) 399 } 400 401 func parseBody(r io.Reader, headers mail.Header) ([]byte, [][]byte, error) { 402 // git-send-email sends emails without Content-Type, let's assume it's text. 403 mediaType := "text/plain" 404 var params map[string]string 405 if contentType := headers.Get("Content-Type"); contentType != "" { 406 var err error 407 mediaType, params, err = mime.ParseMediaType(headers.Get("Content-Type")) 408 if err != nil { 409 return nil, nil, fmt.Errorf("failed to parse email header 'Content-Type': %w", err) 410 } 411 } 412 switch strings.ToLower(headers.Get("Content-Transfer-Encoding")) { 413 case "quoted-printable": 414 r = quotedprintable.NewReader(r) 415 case "base64": 416 r = base64.NewDecoder(base64.StdEncoding, r) 417 } 418 disp, _, _ := mime.ParseMediaType(headers.Get("Content-Disposition")) 419 if disp == "attachment" { 420 attachment, err := io.ReadAll(r) 421 if err != nil { 422 return nil, nil, fmt.Errorf("failed to read email body: %w", err) 423 } 424 return nil, [][]byte{attachment}, nil 425 } 426 if mediaType == "text/plain" { 427 body, err := io.ReadAll(r) 428 if err != nil { 429 return nil, nil, fmt.Errorf("failed to read email body: %w", err) 430 } 431 return body, nil, nil 432 } 433 if !strings.HasPrefix(mediaType, "multipart/") { 434 return nil, nil, nil 435 } 436 var body []byte 437 var attachments [][]byte 438 mr := multipart.NewReader(r, params["boundary"]) 439 for { 440 p, err := mr.NextPart() 441 if err == io.EOF { 442 return body, attachments, nil 443 } 444 if err != nil { 445 return nil, nil, fmt.Errorf("failed to parse MIME parts: %w", err) 446 } 447 body1, attachments1, err1 := parseBody(p, mail.Header(p.Header)) 448 if err1 != nil { 449 return nil, nil, err1 450 } 451 if body == nil { 452 body = body1 453 } 454 attachments = append(attachments, attachments1...) 455 } 456 } 457 458 var extractMessageIDs = regexp.MustCompile(`<.+?>`) 459 460 func extractInReplyTo(header mail.Header) string { 461 value := header.Get("In-Reply-To") 462 // Normally there should be just one message, to which we reply. 463 // However, there have been some cases when multiple addresses were mentioned. 464 // For now let's just take the first one. 465 ret := extractMessageIDs.FindStringSubmatch(value) 466 if ret != nil { 467 return ret[0] 468 } 469 return "" 470 } 471 472 func extractBodyBugIDs(body string, ownEmailMap map[string]bool, domains []string) []string { 473 // Let's build a regular expression. 474 var rb strings.Builder 475 for email := range ownEmailMap { 476 escaped := regexp.QuoteMeta(email) 477 part := strings.ReplaceAll(escaped, `@`, `\+(\w+?)@`) 478 if rb.Len() > 0 { 479 rb.WriteString(`|`) 480 } 481 rb.WriteString(part) 482 } 483 for _, domain := range domains { 484 escaped := regexp.QuoteMeta(domain + "/bug?extid=") 485 if rb.Len() > 0 { 486 rb.WriteString(`|`) 487 } 488 rb.WriteString(escaped) 489 rb.WriteString(`([\w]+)`) 490 } 491 rg := regexp.MustCompile(rb.String()) 492 ids := []string{} 493 for _, match := range rg.FindAllStringSubmatch(body, -1) { 494 // Take all non-empty group matches. 495 for i := 1; i < len(match); i++ { 496 if match[i] == "" { 497 continue 498 } 499 ids = append(ids, match[i]) 500 } 501 } 502 return ids 503 } 504 505 func unique(list []string) []string { 506 // We preserve the original order since it's necessary for bug IDs. 507 var ret []string 508 dup := map[string]struct{}{} 509 for _, v := range list { 510 if _, ok := dup[v]; ok { 511 continue 512 } 513 dup[v] = struct{}{} 514 ret = append(ret, v) 515 } 516 return ret 517 } 518 519 // MergeEmailLists merges several email lists removing duplicates and invalid entries. 520 func MergeEmailLists(lists ...[]string) []string { 521 const ( 522 maxEmailLen = 1000 523 maxEmails = 50 524 ) 525 merged := make(map[string]bool) 526 for _, list := range lists { 527 for _, email := range list { 528 addr, err := mail.ParseAddress(email) 529 if err != nil || len(addr.Address) > maxEmailLen { 530 continue 531 } 532 merged[addr.Address] = true 533 } 534 } 535 var result []string 536 for e := range merged { 537 result = append(result, e) 538 } 539 sort.Strings(result) 540 if len(result) > maxEmails { 541 result = result[:maxEmails] 542 } 543 return result 544 } 545 546 func mergeRawAddresses(lists ...[]*mail.Address) []string { 547 var emails []string 548 for _, list := range lists { 549 for _, item := range list { 550 emails = append(emails, item.Address) 551 } 552 } 553 emails = unique(emails) 554 sort.Strings(emails) 555 return emails 556 } 557 558 func RemoveFromEmailList(list []string, toRemove string) []string { 559 var result []string 560 toRemove = CanonicalEmail(toRemove) 561 for _, email := range list { 562 if CanonicalEmail(email) != toRemove { 563 result = append(result, email) 564 } 565 } 566 return result 567 } 568 569 // Decode RFC 2047-encoded subjects. 570 func decodeSubject(rawSubject string) string { 571 decoder := new(mime.WordDecoder) 572 decodedSubject, err := decoder.DecodeHeader(rawSubject) 573 if err != nil { 574 return rawSubject 575 } 576 return decodedSubject 577 }