github.com/google/syzkaller@v0.0.0-20251211124644-a066d2bc4b02/pkg/email/parser.go (about)

     1  // Copyright 2017 syzkaller project authors. All rights reserved.
     2  // Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file.
     3  
     4  package email
     5  
     6  import (
     7  	"encoding/base64"
     8  	"fmt"
     9  	"io"
    10  	"mime"
    11  	"mime/multipart"
    12  	"mime/quotedprintable"
    13  	"net/mail"
    14  	"net/url"
    15  	"regexp"
    16  	"sort"
    17  	"strings"
    18  	"time"
    19  	"unicode"
    20  )
    21  
    22  type Email struct {
    23  	BugIDs      []string
    24  	MessageID   string
    25  	InReplyTo   string
    26  	Date        time.Time
    27  	Link        string
    28  	Subject     string
    29  	MailingList string
    30  	Author      string
    31  	OwnEmail    bool
    32  	Cc          []string
    33  	RawCc       []string // unstripped emails
    34  	Body        string   // text/plain part
    35  	Patch       string   // attached patch, if any
    36  	Commands    []*SingleCommand
    37  }
    38  
    39  type SingleCommand struct {
    40  	Command Command
    41  	Str     string // string representation
    42  	Args    string // arguments for the command
    43  }
    44  
    45  type Command int
    46  
    47  const (
    48  	CmdUnknown Command = iota
    49  	CmdUpstream
    50  	CmdFix
    51  	CmdUnFix
    52  	CmdDup
    53  	CmdUnDup
    54  	CmdTest
    55  	CmdInvalid
    56  	CmdUnCC
    57  	CmdSet
    58  	CmdUnset
    59  	CmdRegenerate
    60  
    61  	cmdTest5
    62  )
    63  
    64  const ForwardedPrefix = "Forwarded: "
    65  
    66  var groupsLinkRe = regexp.MustCompile(`(?m)\nTo view this discussion (?:on the web )?visit` +
    67  	` (https://groups\.google\.com/.*?)\.(:?$|\n|\r)`)
    68  
    69  func prepareEmails(list []string) map[string]bool {
    70  	ret := make(map[string]bool)
    71  	for _, email := range list {
    72  		ret[email] = true
    73  		if addr, err := mail.ParseAddress(email); err == nil {
    74  			ret[addr.Address] = true
    75  		}
    76  	}
    77  	return ret
    78  }
    79  
    80  func Parse(r io.Reader, ownEmails, goodLists, domains []string) (*Email, error) {
    81  	msg, err := mail.ReadMessage(r)
    82  	if err != nil {
    83  		return nil, fmt.Errorf("failed to read email: %w", err)
    84  	}
    85  	from, err := msg.Header.AddressList("From")
    86  	if err != nil {
    87  		return nil, fmt.Errorf("failed to parse email header 'From': %w", err)
    88  	}
    89  	if len(from) == 0 {
    90  		return nil, fmt.Errorf("failed to parse email header 'To': no senders")
    91  	}
    92  	// Ignore errors since To: header may not be present (we've seen such case).
    93  	to, _ := msg.Header.AddressList("To")
    94  	// AddressList fails if the header is not present.
    95  	cc, _ := msg.Header.AddressList("Cc")
    96  	var ccList []string
    97  	ownAddrs := prepareEmails(ownEmails)
    98  	fromMe := false
    99  	for _, addr := range from {
   100  		cleaned, _, _ := RemoveAddrContext(addr.Address)
   101  		if addr, err := mail.ParseAddress(cleaned); err == nil && ownAddrs[addr.Address] {
   102  			fromMe = true
   103  		}
   104  	}
   105  
   106  	originalFrom := ""
   107  	// Ignore error since the header might not be present.
   108  	originalFroms, _ := msg.Header.AddressList("X-Original-From")
   109  	if len(originalFroms) > 0 {
   110  		originalFrom = originalFroms[0].String()
   111  	}
   112  
   113  	bugIDs := []string{}
   114  	rawCcList := append(append(append(cc, to...), from...), originalFroms...)
   115  	for _, addr := range rawCcList {
   116  		cleaned, context, _ := RemoveAddrContext(addr.Address)
   117  		if addr, err := mail.ParseAddress(cleaned); err == nil {
   118  			cleaned = addr.Address
   119  		}
   120  		if ownAddrs[cleaned] {
   121  			bugIDs = append(bugIDs, context)
   122  		} else {
   123  			ccList = append(ccList, CanonicalEmail(cleaned))
   124  		}
   125  	}
   126  	ccList = MergeEmailLists(ccList)
   127  
   128  	sender := ""
   129  	// Ignore error since the header might not be present.
   130  	senders, _ := msg.Header.AddressList("Sender")
   131  	if len(senders) > 0 {
   132  		sender = senders[0].Address
   133  	}
   134  
   135  	body, attachments, err := parseBody(msg.Body, msg.Header)
   136  	if err != nil {
   137  		return nil, err
   138  	}
   139  	bodyStr := string(body)
   140  	subject := decodeSubject(msg.Header.Get("Subject"))
   141  	var cmds []*SingleCommand
   142  	var patch string
   143  	if !fromMe {
   144  		for _, a := range attachments {
   145  			patch = ParsePatch(a)
   146  			if patch != "" {
   147  				break
   148  			}
   149  		}
   150  		if patch == "" {
   151  			patch = ParsePatch(body)
   152  		}
   153  		cmds = extractCommands(subject + "\n" + bodyStr)
   154  	}
   155  	bugIDs = append(bugIDs, extractBodyBugIDs(bodyStr, ownAddrs, domains)...)
   156  
   157  	link := ""
   158  	if match := groupsLinkRe.FindStringSubmatchIndex(bodyStr); match != nil {
   159  		link = bodyStr[match[2]:match[3]]
   160  		if unescaped, err := url.QueryUnescape(link); err == nil {
   161  			link = unescaped
   162  		}
   163  	}
   164  
   165  	author := CanonicalEmail(from[0].Address)
   166  	mailingList := ""
   167  
   168  	goodListsMap := prepareEmails(goodLists)
   169  	if goodListsMap[author] {
   170  		// In some cases, the mailing list would change From and introduce X-Original-From.
   171  		mailingList = author
   172  		if originalFrom != "" {
   173  			author = CanonicalEmail(originalFrom)
   174  		}
   175  		// Not sure if `else` can happen here, but let it be mailingList == author in this case.
   176  	} else if goodListsMap[CanonicalEmail(sender)] {
   177  		// In other cases, the mailing list would preserve From and just change Sender.
   178  		mailingList = CanonicalEmail(sender)
   179  	}
   180  	date, _ := mail.ParseDate(msg.Header.Get("Date"))
   181  	email := &Email{
   182  		BugIDs:      unique(bugIDs),
   183  		MessageID:   msg.Header.Get("Message-ID"),
   184  		InReplyTo:   extractInReplyTo(msg.Header),
   185  		Date:        date,
   186  		Link:        link,
   187  		Author:      author,
   188  		OwnEmail:    fromMe,
   189  		MailingList: mailingList,
   190  		Subject:     subject,
   191  		Cc:          ccList,
   192  		RawCc:       mergeRawAddresses(from, originalFroms, to, cc),
   193  		Body:        bodyStr,
   194  		Patch:       patch,
   195  		Commands:    cmds,
   196  	}
   197  	return email, nil
   198  }
   199  
   200  // AddAddrContext embeds context into local part of the provided email address using '+'.
   201  // Returns the resulting email address.
   202  func AddAddrContext(email, context string) (string, error) {
   203  	addr, err := mail.ParseAddress(email)
   204  	if err != nil {
   205  		return "", fmt.Errorf("failed to parse %q as email: %w", email, err)
   206  	}
   207  	at := strings.IndexByte(addr.Address, '@')
   208  	if at == -1 {
   209  		return "", fmt.Errorf("failed to parse %q as email: no @", email)
   210  	}
   211  	result := addr.Address[:at]
   212  	if context != "" {
   213  		result += "+" + context
   214  	}
   215  	result += addr.Address[at:]
   216  	if addr.Name != "" {
   217  		addr.Address = result
   218  		result = addr.String()
   219  	}
   220  	return result, nil
   221  }
   222  
   223  // RemoveAddrContext extracts context after '+' from the local part of the provided email address.
   224  // Returns address without the context and the context.
   225  func RemoveAddrContext(email string) (string, string, error) {
   226  	addr, err := mail.ParseAddress(email)
   227  	if err != nil {
   228  		return "", "", fmt.Errorf("failed to parse %q as email: %w", email, err)
   229  	}
   230  	at := strings.IndexByte(addr.Address, '@')
   231  	if at == -1 {
   232  		return "", "", fmt.Errorf("failed to parse %q as email: no @", email)
   233  	}
   234  	plus := strings.LastIndexByte(addr.Address[:at], '+')
   235  	if plus == -1 {
   236  		return email, "", nil
   237  	}
   238  	context := addr.Address[plus+1 : at]
   239  	addr.Address = addr.Address[:plus] + addr.Address[at:]
   240  	return addr.String(), context, nil
   241  }
   242  
   243  func CanonicalEmail(email string) string {
   244  	addr, err := mail.ParseAddress(email)
   245  	if err != nil {
   246  		return email
   247  	}
   248  	at := strings.IndexByte(addr.Address, '@')
   249  	if at == -1 {
   250  		return email
   251  	}
   252  	if plus := strings.IndexByte(addr.Address[:at], '+'); plus != -1 {
   253  		addr.Address = addr.Address[:plus] + addr.Address[at:]
   254  	}
   255  	return strings.ToLower(addr.Address)
   256  }
   257  
   258  func extractCommands(body string) []*SingleCommand {
   259  	var ret []*SingleCommand
   260  	for body != "" {
   261  		cmd, end := extractCommand(body)
   262  		if cmd == nil {
   263  			break
   264  		}
   265  		ret = append(ret, cmd)
   266  		body = body[end:]
   267  	}
   268  	return ret
   269  }
   270  
   271  const commandPrefix = "#syz"
   272  
   273  var commandStartRe = regexp.MustCompile(`(?:^|\n)(` + regexp.QuoteMeta(commandPrefix) + `[ \t-:])`)
   274  
   275  // extractCommand extracts command to syzbot from email body.
   276  // Commands are of the following form:
   277  // ^#syz cmd args...
   278  func extractCommand(body string) (*SingleCommand, int) {
   279  	var cmd Command
   280  	var str, args string
   281  
   282  	match := commandStartRe.FindStringSubmatchIndex(body)
   283  	if match == nil {
   284  		return nil, 0
   285  	}
   286  	cmdPos := match[2] + len(commandPrefix) + 1
   287  	for cmdPos < len(body) && unicode.IsSpace(rune(body[cmdPos])) {
   288  		cmdPos++
   289  	}
   290  	cmdEnd := strings.IndexByte(body[cmdPos:], '\n')
   291  	if cmdEnd == -1 {
   292  		cmdEnd = len(body) - cmdPos
   293  	}
   294  	if cmdEnd1 := strings.IndexByte(body[cmdPos:], '\r'); cmdEnd1 != -1 && cmdEnd1 < cmdEnd {
   295  		cmdEnd = cmdEnd1
   296  	}
   297  	if cmdEnd1 := strings.IndexByte(body[cmdPos:], ' '); cmdEnd1 != -1 && cmdEnd1 < cmdEnd {
   298  		cmdEnd = cmdEnd1
   299  	}
   300  	if cmdEnd1 := strings.IndexByte(body[cmdPos:], '\t'); cmdEnd1 != -1 && cmdEnd1 < cmdEnd {
   301  		cmdEnd = cmdEnd1
   302  	}
   303  	str = body[cmdPos : cmdPos+cmdEnd]
   304  	cmd = strToCmd(str)
   305  	// Some email clients split text emails at 80 columns are the transformation is irrevesible.
   306  	// We try hard to restore what was there before.
   307  	// For "test:" command we know that there must be 2 tokens without spaces.
   308  	// For "fix:"/"dup:" we need a whole non-empty line of text.
   309  	switch cmd {
   310  	case CmdTest:
   311  		if strings.HasSuffix(str, ":") {
   312  			// For "#syz test:", we do want to query 2 arguments.
   313  			args = extractArgsTokens(body[cmdPos+cmdEnd:], 2)
   314  		} else {
   315  			// For "#syz test", it's likely there won't be anything else, so let's only parse
   316  			// the first line.
   317  			args = extractArgsLine(body[cmdPos+cmdEnd:], false)
   318  		}
   319  	case CmdSet, CmdUnset:
   320  		args = extractArgsLine(body[cmdPos+cmdEnd:], true)
   321  	case cmdTest5:
   322  		args = extractArgsTokens(body[cmdPos+cmdEnd:], 5)
   323  	case CmdFix, CmdDup:
   324  		args = extractArgsLine(body[cmdPos+cmdEnd:], true)
   325  	}
   326  	return &SingleCommand{
   327  		Command: cmd,
   328  		Str:     str,
   329  		Args:    args,
   330  	}, cmdPos + cmdEnd
   331  }
   332  
   333  func strToCmd(str string) Command {
   334  	switch str {
   335  	default:
   336  		return CmdUnknown
   337  	case "upstream":
   338  		return CmdUpstream
   339  	case "fix", "fix:":
   340  		return CmdFix
   341  	case "unfix":
   342  		return CmdUnFix
   343  	case "dup", "dup:":
   344  		return CmdDup
   345  	case "undup":
   346  		return CmdUnDup
   347  	case "test", "test:":
   348  		return CmdTest
   349  	case "invalid":
   350  		return CmdInvalid
   351  	case "uncc", "uncc:":
   352  		return CmdUnCC
   353  	case "set", "set:":
   354  		return CmdSet
   355  	case "unset", "unset:":
   356  		return CmdUnset
   357  	case "regenerate":
   358  		return CmdRegenerate
   359  	case "test_5_arg_cmd":
   360  		return cmdTest5
   361  	}
   362  }
   363  
   364  func extractArgsTokens(body string, num int) string {
   365  	var args []string
   366  	for pos := 0; len(args) < num && pos < len(body); {
   367  		lineEnd := strings.IndexByte(body[pos:], '\n')
   368  		if lineEnd == -1 {
   369  			lineEnd = len(body) - pos
   370  		}
   371  		line := strings.TrimSpace(strings.ReplaceAll(body[pos:pos+lineEnd], "\t", " "))
   372  		for {
   373  			line1 := strings.ReplaceAll(line, "  ", " ")
   374  			if line == line1 {
   375  				break
   376  			}
   377  			line = line1
   378  		}
   379  		if line != "" {
   380  			args = append(args, strings.Split(line, " ")...)
   381  		}
   382  		pos += lineEnd + 1
   383  	}
   384  	return strings.TrimSpace(strings.Join(args, " "))
   385  }
   386  
   387  func extractArgsLine(body string, skipWs bool) string {
   388  	pos := 0
   389  	if skipWs {
   390  		for pos < len(body) && unicode.IsSpace(rune(body[pos])) {
   391  			pos++
   392  		}
   393  	}
   394  	lineEnd := strings.IndexByte(body[pos:], '\n')
   395  	if lineEnd == -1 {
   396  		lineEnd = len(body) - pos
   397  	}
   398  	return strings.TrimSpace(body[pos : pos+lineEnd])
   399  }
   400  
   401  func parseBody(r io.Reader, headers mail.Header) ([]byte, [][]byte, error) {
   402  	// git-send-email sends emails without Content-Type, let's assume it's text.
   403  	mediaType := "text/plain"
   404  	var params map[string]string
   405  	if contentType := headers.Get("Content-Type"); contentType != "" {
   406  		var err error
   407  		mediaType, params, err = mime.ParseMediaType(headers.Get("Content-Type"))
   408  		if err != nil {
   409  			return nil, nil, fmt.Errorf("failed to parse email header 'Content-Type': %w", err)
   410  		}
   411  	}
   412  	switch strings.ToLower(headers.Get("Content-Transfer-Encoding")) {
   413  	case "quoted-printable":
   414  		r = quotedprintable.NewReader(r)
   415  	case "base64":
   416  		r = base64.NewDecoder(base64.StdEncoding, r)
   417  	}
   418  	disp, _, _ := mime.ParseMediaType(headers.Get("Content-Disposition"))
   419  	if disp == "attachment" {
   420  		attachment, err := io.ReadAll(r)
   421  		if err != nil {
   422  			return nil, nil, fmt.Errorf("failed to read email body: %w", err)
   423  		}
   424  		return nil, [][]byte{attachment}, nil
   425  	}
   426  	if mediaType == "text/plain" {
   427  		body, err := io.ReadAll(r)
   428  		if err != nil {
   429  			return nil, nil, fmt.Errorf("failed to read email body: %w", err)
   430  		}
   431  		return body, nil, nil
   432  	}
   433  	if !strings.HasPrefix(mediaType, "multipart/") {
   434  		return nil, nil, nil
   435  	}
   436  	var body []byte
   437  	var attachments [][]byte
   438  	mr := multipart.NewReader(r, params["boundary"])
   439  	for {
   440  		p, err := mr.NextPart()
   441  		if err == io.EOF {
   442  			return body, attachments, nil
   443  		}
   444  		if err != nil {
   445  			return nil, nil, fmt.Errorf("failed to parse MIME parts: %w", err)
   446  		}
   447  		body1, attachments1, err1 := parseBody(p, mail.Header(p.Header))
   448  		if err1 != nil {
   449  			return nil, nil, err1
   450  		}
   451  		if body == nil {
   452  			body = body1
   453  		}
   454  		attachments = append(attachments, attachments1...)
   455  	}
   456  }
   457  
   458  var extractMessageIDs = regexp.MustCompile(`<.+?>`)
   459  
   460  func extractInReplyTo(header mail.Header) string {
   461  	value := header.Get("In-Reply-To")
   462  	// Normally there should be just one message, to which we reply.
   463  	// However, there have been some cases when multiple addresses were mentioned.
   464  	// For now let's just take the first one.
   465  	ret := extractMessageIDs.FindStringSubmatch(value)
   466  	if ret != nil {
   467  		return ret[0]
   468  	}
   469  	return ""
   470  }
   471  
   472  func extractBodyBugIDs(body string, ownEmailMap map[string]bool, domains []string) []string {
   473  	// Let's build a regular expression.
   474  	var rb strings.Builder
   475  	for email := range ownEmailMap {
   476  		escaped := regexp.QuoteMeta(email)
   477  		part := strings.ReplaceAll(escaped, `@`, `\+(\w+?)@`)
   478  		if rb.Len() > 0 {
   479  			rb.WriteString(`|`)
   480  		}
   481  		rb.WriteString(part)
   482  	}
   483  	for _, domain := range domains {
   484  		escaped := regexp.QuoteMeta(domain + "/bug?extid=")
   485  		if rb.Len() > 0 {
   486  			rb.WriteString(`|`)
   487  		}
   488  		rb.WriteString(escaped)
   489  		rb.WriteString(`([\w]+)`)
   490  	}
   491  	rg := regexp.MustCompile(rb.String())
   492  	ids := []string{}
   493  	for _, match := range rg.FindAllStringSubmatch(body, -1) {
   494  		// Take all non-empty group matches.
   495  		for i := 1; i < len(match); i++ {
   496  			if match[i] == "" {
   497  				continue
   498  			}
   499  			ids = append(ids, match[i])
   500  		}
   501  	}
   502  	return ids
   503  }
   504  
   505  func unique(list []string) []string {
   506  	// We preserve the original order since it's necessary for bug IDs.
   507  	var ret []string
   508  	dup := map[string]struct{}{}
   509  	for _, v := range list {
   510  		if _, ok := dup[v]; ok {
   511  			continue
   512  		}
   513  		dup[v] = struct{}{}
   514  		ret = append(ret, v)
   515  	}
   516  	return ret
   517  }
   518  
   519  // MergeEmailLists merges several email lists removing duplicates and invalid entries.
   520  func MergeEmailLists(lists ...[]string) []string {
   521  	const (
   522  		maxEmailLen = 1000
   523  		maxEmails   = 50
   524  	)
   525  	merged := make(map[string]bool)
   526  	for _, list := range lists {
   527  		for _, email := range list {
   528  			addr, err := mail.ParseAddress(email)
   529  			if err != nil || len(addr.Address) > maxEmailLen {
   530  				continue
   531  			}
   532  			merged[addr.Address] = true
   533  		}
   534  	}
   535  	var result []string
   536  	for e := range merged {
   537  		result = append(result, e)
   538  	}
   539  	sort.Strings(result)
   540  	if len(result) > maxEmails {
   541  		result = result[:maxEmails]
   542  	}
   543  	return result
   544  }
   545  
   546  func mergeRawAddresses(lists ...[]*mail.Address) []string {
   547  	var emails []string
   548  	for _, list := range lists {
   549  		for _, item := range list {
   550  			emails = append(emails, item.Address)
   551  		}
   552  	}
   553  	emails = unique(emails)
   554  	sort.Strings(emails)
   555  	return emails
   556  }
   557  
   558  func RemoveFromEmailList(list []string, toRemove string) []string {
   559  	var result []string
   560  	toRemove = CanonicalEmail(toRemove)
   561  	for _, email := range list {
   562  		if CanonicalEmail(email) != toRemove {
   563  			result = append(result, email)
   564  		}
   565  	}
   566  	return result
   567  }
   568  
   569  // Decode RFC 2047-encoded subjects.
   570  func decodeSubject(rawSubject string) string {
   571  	decoder := new(mime.WordDecoder)
   572  	decodedSubject, err := decoder.DecodeHeader(rawSubject)
   573  	if err != nil {
   574  		return rawSubject
   575  	}
   576  	return decodedSubject
   577  }