diff --git a/gitdiff/patch_header.go b/gitdiff/patch_header.go index af3293b..f047059 100644 --- a/gitdiff/patch_header.go +++ b/gitdiff/patch_header.go @@ -68,62 +68,6 @@ func (h *PatchHeader) Message() string { return msg.String() } -// PatchIdentity identifies a person who authored or committed a patch. -type PatchIdentity struct { - Name string - Email string -} - -func (i PatchIdentity) String() string { - name := i.Name - if name == "" { - name = `""` - } - return fmt.Sprintf("%s <%s>", name, i.Email) -} - -// ParsePatchIdentity parses a patch identity string. A valid string contains -// an optional name followed by an email address in angle brackets. The angle -// brackets must always exist, but may enclose an empty address. At least one -// of the name or the email address must be non-empty. If the string only -// contains an email address, that value is also used as the name. -// -// The name must not contain a left angle bracket, '<', and the email address -// must not contain a right angle bracket, '>'. Otherwise, there are no -// restrictions on the format of either field. -func ParsePatchIdentity(s string) (PatchIdentity, error) { - var emailStart, emailEnd int - for i, c := range s { - if c == '<' && emailStart == 0 { - emailStart = i + 1 - } - if c == '>' && emailStart > 0 { - emailEnd = i - break - } - } - if emailStart > 0 && emailEnd == 0 { - return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s) - } - - var name, email string - if emailStart > 0 { - name = strings.TrimSpace(s[:emailStart-1]) - } - if emailStart > 0 && emailEnd > 0 { - email = strings.TrimSpace(s[emailStart:emailEnd]) - } - if name == "" && email != "" { - name = email - } - - if name == "" && email == "" { - return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s) - } - - return PatchIdentity{Name: name, Email: email}, nil -} - // ParsePatchDate parses a patch date string. It returns the parsed time or an // error if s has an unknown format. ParsePatchDate supports the iso, rfc, // short, raw, unix, and default formats (with local variants) used by the @@ -425,16 +369,13 @@ func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*Pa } } - addrs, err := msg.Header.AddressList("From") - if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) { - return nil, err - } - if len(addrs) > 0 { - addr := addrs[0] - if addr.Name == "" { - addr.Name = addr.Address + from := msg.Header.Get("From") + if from != "" { + u, err := ParsePatchIdentity(from) + if err != nil { + return nil, err } - h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address} + h.Author = &u } date := msg.Header.Get("Date") diff --git a/gitdiff/patch_header_test.go b/gitdiff/patch_header_test.go index 33ec209..c8559b0 100644 --- a/gitdiff/patch_header_test.go +++ b/gitdiff/patch_header_test.go @@ -5,83 +5,6 @@ import ( "time" ) -func TestParsePatchIdentity(t *testing.T) { - tests := map[string]struct { - Input string - Output PatchIdentity - Err interface{} - }{ - "simple": { - Input: "Morton Haypenny ", - Output: PatchIdentity{ - Name: "Morton Haypenny", - Email: "mhaypenny@example.com", - }, - }, - "extraWhitespace": { - Input: " Morton Haypenny ", - Output: PatchIdentity{ - Name: "Morton Haypenny", - Email: "mhaypenny@example.com", - }, - }, - "trailingCharacters": { - Input: "Morton Haypenny unrelated garbage", - Output: PatchIdentity{ - Name: "Morton Haypenny", - Email: "mhaypenny@example.com", - }, - }, - "onlyEmail": { - Input: "", - Output: PatchIdentity{ - Name: "mhaypenny@example.com", - Email: "mhaypenny@example.com", - }, - }, - "emptyEmail": { - Input: "Morton Haypenny <>", - Output: PatchIdentity{ - Name: "Morton Haypenny", - Email: "", - }, - }, - "missingEmail": { - Input: "Morton Haypenny", - Err: "invalid identity", - }, - "missingNameAndEmptyEmail": { - Input: "<>", - Err: "invalid identity", - }, - "empty": { - Input: "", - Err: "invalid identity", - }, - "unclosedEmail": { - Input: "Morton Haypenny +Date: Sat, 11 Apr 2020 15:21:23 -0700 +Subject: [PATCH] A sample commit to test header parsing + +The medium format shows the body, which +may wrap on to multiple lines. + +Another body line. +`, + Header: PatchHeader{ + SHA: expectedSHA, + Author: &PatchIdentity{ + Name: "dependabot[bot]", + Email: "12345+dependabot[bot]@users.noreply.github.com", + }, + AuthorDate: expectedDate, + Title: expectedTitle, + Body: expectedBody, + }, + }, "mailboxAppendix": { Input: `From 61f5cd90bed4d204ee3feb3aa41ee91d4734855b Mon Sep 17 00:00:00 2001 From: Morton Haypenny diff --git a/gitdiff/patch_identity.go b/gitdiff/patch_identity.go new file mode 100644 index 0000000..018f80c --- /dev/null +++ b/gitdiff/patch_identity.go @@ -0,0 +1,166 @@ +package gitdiff + +import ( + "fmt" + "strings" +) + +// PatchIdentity identifies a person who authored or committed a patch. +type PatchIdentity struct { + Name string + Email string +} + +func (i PatchIdentity) String() string { + name := i.Name + if name == "" { + name = `""` + } + return fmt.Sprintf("%s <%s>", name, i.Email) +} + +// ParsePatchIdentity parses a patch identity string. A patch identity contains +// an email address and an optional name in [RFC 5322] format. This is either a +// plain email adddress or a name followed by an address in angle brackets: +// +// author@example.com +// Author Name +// +// If the input is not one of these formats, ParsePatchIdentity applies a +// heuristic to separate the name and email portions. If both the name and +// email are missing or empty, ParsePatchIdentity returns an error. It +// otherwise does not validate the result. +// +// [RFC 5322]: https://datatracker.ietf.org/doc/html/rfc5322 +func ParsePatchIdentity(s string) (PatchIdentity, error) { + s = normalizeSpace(s) + s = unquotePairs(s) + + var name, email string + if at := strings.IndexByte(s, '@'); at >= 0 { + start, end := at, at + for start >= 0 && !isRFC5332Space(s[start]) && s[start] != '<' { + start-- + } + for end < len(s) && !isRFC5332Space(s[end]) && s[end] != '>' { + end++ + } + email = s[start+1 : end] + + // Adjust the boundaries so that we drop angle brackets, but keep + // spaces when removing the email to form the name. + if start < 0 || s[start] != '<' { + start++ + } + if end >= len(s) || s[end] != '>' { + end-- + } + name = s[:start] + s[end+1:] + } else { + start, end := 0, 0 + for i := 0; i < len(s); i++ { + if s[i] == '<' && start == 0 { + start = i + 1 + } + if s[i] == '>' && start > 0 { + end = i + break + } + } + if start > 0 && end >= start { + email = strings.TrimSpace(s[start:end]) + name = s[:start-1] + } + } + + // After extracting the email, the name might contain extra whitespace + // again and may be surrounded by comment characters. The git source gives + // these examples of when this can happen: + // + // "Name " + // "email@domain (Name)" + // "Name (Comment)" + // + name = normalizeSpace(name) + if strings.HasPrefix(name, "(") && strings.HasSuffix(name, ")") { + name = name[1 : len(name)-1] + } + name = strings.TrimSpace(name) + + // If the name is empty or contains email-like characters, use the email + // instead (assuming one exists) + if name == "" || strings.ContainsAny(name, "@<>") { + name = email + } + + if name == "" && email == "" { + return PatchIdentity{}, fmt.Errorf("invalid identity string %q", s) + } + return PatchIdentity{Name: name, Email: email}, nil +} + +// unquotePairs process the RFC5322 tokens "quoted-string" and "comment" to +// remove any "quoted-pairs" (backslash-espaced characters). It also removes +// the quotes from any quoted strings, but leaves the comment delimiters. +func unquotePairs(s string) string { + quote := false + comments := 0 + escaped := false + + var out strings.Builder + for i := 0; i < len(s); i++ { + if escaped { + escaped = false + } else { + switch s[i] { + case '\\': + // quoted-pair is only allowed in quoted-string/comment + if quote || comments > 0 { + escaped = true + continue // drop '\' character + } + + case '"': + if comments == 0 { + quote = !quote + continue // drop '"' character + } + + case '(': + if !quote { + comments++ + } + case ')': + if comments > 0 { + comments-- + } + } + } + out.WriteByte(s[i]) + } + return out.String() +} + +// normalizeSpace trims leading and trailing whitespace from s and converts +// inner sequences of one or more whitespace characters to single spaces. +func normalizeSpace(s string) string { + var sb strings.Builder + for i := 0; i < len(s); i++ { + c := s[i] + if !isRFC5332Space(c) { + if sb.Len() > 0 && isRFC5332Space(s[i-1]) { + sb.WriteByte(' ') + } + sb.WriteByte(c) + } + } + return sb.String() +} + +func isRFC5332Space(c byte) bool { + switch c { + case '\t', '\n', '\r', ' ': + return true + } + return false +} diff --git a/gitdiff/patch_identity_test.go b/gitdiff/patch_identity_test.go new file mode 100644 index 0000000..f15fe38 --- /dev/null +++ b/gitdiff/patch_identity_test.go @@ -0,0 +1,127 @@ +package gitdiff + +import ( + "testing" +) + +func TestParsePatchIdentity(t *testing.T) { + tests := map[string]struct { + Input string + Output PatchIdentity + Err interface{} + }{ + "simple": { + Input: "Morton Haypenny ", + Output: PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + }, + "extraWhitespace": { + Input: "\t Morton Haypenny \r\n ", + Output: PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny@example.com", + }, + }, + "trailingCharacters": { + Input: "Morton Haypenny II", + Output: PatchIdentity{ + Name: "Morton Haypenny II", + Email: "mhaypenny@example.com", + }, + }, + "onlyEmail": { + Input: "mhaypenny@example.com", + Output: PatchIdentity{ + Name: "mhaypenny@example.com", + Email: "mhaypenny@example.com", + }, + }, + "onlyEmailInBrackets": { + Input: "", + Output: PatchIdentity{ + Name: "mhaypenny@example.com", + Email: "mhaypenny@example.com", + }, + }, + "rfc5322SpecialCharacters": { + Input: `"dependabot[bot]" <12345+dependabot[bot]@users.noreply.github.com>`, + Output: PatchIdentity{ + Name: "dependabot[bot]", + Email: "12345+dependabot[bot]@users.noreply.github.com", + }, + }, + "rfc5322QuotedPairs": { + Input: `"Morton \"Old-Timer\" Haypenny" <"mhaypenny\+[1900]"@example.com> (III \(PhD\))`, + Output: PatchIdentity{ + Name: `Morton "Old-Timer" Haypenny (III (PhD))`, + Email: "mhaypenny+[1900]@example.com", + }, + }, + "rfc5322QuotedPairsOutOfContext": { + Input: `Morton \\Backslash Haypenny `, + Output: PatchIdentity{ + Name: `Morton \\Backslash Haypenny`, + Email: "mhaypenny@example.com", + }, + }, + "emptyEmail": { + Input: "Morton Haypenny <>", + Output: PatchIdentity{ + Name: "Morton Haypenny", + Email: "", + }, + }, + "unclosedEmail": { + Input: "Morton Haypenny ", + Output: PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny", + }, + }, + "bogusEmailWithWhitespace": { + Input: "Morton Haypenny < mhaypenny >", + Output: PatchIdentity{ + Name: "Morton Haypenny", + Email: "mhaypenny", + }, + }, + "missingEmail": { + Input: "Morton Haypenny", + Err: "invalid identity", + }, + "missingNameAndEmptyEmail": { + Input: "<>", + Err: "invalid identity", + }, + "empty": { + Input: "", + Err: "invalid identity", + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + id, err := ParsePatchIdentity(test.Input) + if test.Err != nil { + assertError(t, test.Err, err, "parsing identity") + return + } + if err != nil { + t.Fatalf("unexpected error parsing identity: %v", err) + } + + if test.Output != id { + t.Errorf("incorrect identity: expected %#v, actual %#v", test.Output, id) + } + }) + } +}