From bd74996cf63f60e41df951e281410a1de32cb0b7 Mon Sep 17 00:00:00 2001
From: Thibaut Horel <thibaut.horel@gmail.com>
Date: Sun, 21 Feb 2016 21:27:08 -0500
Subject: Add name parsing/formatting logic

---
 names.go | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 utils.go | 149 ++++++++++++++++++++++++++++
 2 files changed, 480 insertions(+)
 create mode 100644 names.go
 create mode 100644 utils.go

diff --git a/names.go b/names.go
new file mode 100644
index 0000000..6491d35
--- /dev/null
+++ b/names.go
@@ -0,0 +1,331 @@
+package bibtex
+
+import (
+	"bytes"
+	"log"
+	"strings"
+	"unicode"
+)
+
+// sepChars holds the runes that split tokens at brace level 0.
+const sepChars = "-~"
+
+// lcommands and ucommands are the LaTeX commands that isLower classifies
+// as lower-case and upper-case respectively when it meets them at the start
+// of a special character.
+var (
+	lcommands = map[string]bool{"aa": true, "oe": true, "l": true, "ae": true, "o": true, "ss": true, "i": true, "j": true}
+	ucommands = map[string]bool{"AA": true, "L": true, "OE": true, "AE": true, "O": true}
+)
+
+type Token struct { // Token is one token of a name part
+	Sep  rune // the separator preceding the token
+	Text string
+}
+
+type NamePart []Token // NamePart is a list of tokens
+
+// name is a proto name, before its parts have been separated in f,v,l,j
+type name []NamePart
+
+// _abbrv abbreviates s: it returns the first letter of s, or the special
+// character ({\...}) met before any letter; "" if s holds neither.
+func _abbrv(s string) string {
+	r := newReader(s)
+	var out bytes.Buffer
+	for ch := r.readRune(); ch != eof; ch = r.readRune() {
+		if ch == '{' {
+			if r.readRune() == '\\' {
+				out.WriteString("{\\")
+				out.WriteString(r.readBraces())
+				break
+			}
+			// plain brace: push the rune back and scan inside the group
+			r.UnreadRune()
+		}
+		if unicode.IsLetter(ch) {
+			out.WriteRune(ch)
+			break
+		}
+	}
+	return out.String()
+}
+
+// printSep writes sep when it is '-' or '~', else '~' (last or short) or ' '.
+func printSep(buf *bytes.Buffer, sep rune, last bool) {
+	out := ' '
+	if sep == '-' || sep == '~' {
+		out = sep
+	} else if last || length(buf.String()) <= 2 {
+		out = '~'
+	}
+	buf.WriteRune(out)
+}
+
+// Format returns a string representation of NamePart p. With abbrv, tokens are
+// abbreviated and '.'-terminated (except the last); with def, separators come
+// from printSep, otherwise sep is written between tokens.
+func (p NamePart) Format(abbrv bool, sep string, def bool) string {
+	var out bytes.Buffer
+	for idx, tok := range p {
+		text := tok.Text
+		if abbrv {
+			text = _abbrv(text)
+		}
+		if idx == 0 {
+			out.WriteString(text)
+			continue
+		}
+		if abbrv {
+			out.WriteRune('.')
+		}
+		if def {
+			printSep(&out, tok.Sep, idx == len(p)-1)
+		} else {
+			out.WriteString(sep)
+		}
+		out.WriteString(text)
+	}
+	return out.String()
+}
+
+// addNonEmptyToken moves the text of b, if any, into part as a token; t restarts at c.
+func addNonEmptyToken(b *bytes.Buffer, t *Token, part *NamePart, c rune) {
+	if b.Len() > 0 {
+		*part = append(*part, Token{Sep: t.Sep, Text: b.String()})
+		*t = Token{Sep: c}
+		b.Reset()
+	}
+}
+
+// addNonEmptyPart appends part to name unless it is an empty non-first part.
+func addNonEmptyPart(part *NamePart, name *name, t *Token, c rune) {
+	if skip := len(*part) == 0 && len(*name) > 0; !skip {
+		*name = append(*name, *part)
+	}
+	*part, *t = nil, Token{Sep: c}
+}
+
+// splitNames tokenizes s and returns the proto names it contains. Names are
+// split on the word "and" standing between whitespace, parts on the first
+// two commas, tokens on whitespace, '-' and '~'; braces protect their content.
+func splitNames(s string) (names []name) {
+	if s == "" {
+		return nil
+	}
+	var cur name                // proto name being built
+	var part NamePart           // name part being built
+	tok := &Token{Sep: rune(0)} // token being built
+	depth := 0                  // brace nesting level
+	afterSpace := false         // was the last separator a whitespace?
+	var text bytes.Buffer       // text of the token being built
+	for _, c := range s {
+		switch {
+		case c == '{':
+			depth++
+			text.WriteRune(c)
+		case c == '}':
+			depth--
+			text.WriteRune(c)
+		case depth != 0:
+			// inside braces nothing is a separator
+			text.WriteRune(c)
+		case unicode.IsSpace(c):
+			if afterSpace && strings.ToLower(text.String()) == "and" {
+				// the pending word is the name separator: drop it and
+				// close the current name
+				text.Reset()
+				addNonEmptyPart(&part, &cur, tok, c)
+				names = append(names, cur)
+				cur = nil
+			}
+			addNonEmptyToken(&text, tok, &part, c)
+			afterSpace = true
+		case c == ',', strings.ContainsRune(sepChars, c):
+			afterSpace = false
+			addNonEmptyToken(&text, tok, &part, c)
+			// a name has at most 3 proto parts, so only the first two
+			// commas (len(cur) is 0 or 1) close a part
+			if c == ',' && len(cur) <= 1 {
+				addNonEmptyPart(&part, &cur, tok, c)
+			}
+		default:
+			text.WriteRune(c)
+		}
+	}
+	addNonEmptyToken(&text, tok, &part, ',')
+	addNonEmptyPart(&part, &cur, tok, ',')
+	return append(names, cur)
+}
+
+type Name struct { // Name holds the four parts of a name, as built by SplitNames
+	First NamePart // first part
+	Von   NamePart // von part
+	Last  NamePart // last part
+	Jr    NamePart // jr part
+}
+
+func (n Name) getPart(c rune) (p NamePart) {
+	switch c { // c selects the part: first, von, last or jr
+	case 'f':
+		return n.First
+	case 'v':
+		return n.Von
+	case 'l':
+		return n.Last
+	case 'j':
+		return n.Jr
+	}
+	return nil // any other rune selects nothing
+}
+
+// formatPart formats the part of n selected by f, one brace group of a format
+// string: "ff" keeps full tokens, "f" abbreviates them, a brace group after
+// the letter(s) sets the token separator. An empty part yields "".
+func (n Name) formatPart(f string) string {
+	// strip the delimiters without slicing blindly: f is a lone "{" or lacks
+	// its closing brace when the format string is unbalanced
+	f = strings.TrimSuffix(strings.TrimPrefix(f, "{"), "}")
+	r := newReader(f)
+	var buf bytes.Buffer
+	found := false
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		switch {
+		case strings.ContainsRune("fvljFVLJ", c) && !found:
+			found = true
+			abbrv, def, sep := true, true, ""
+			typ := unicode.ToLower(c)
+			if d := r.readRune(); unicode.ToLower(d) == typ {
+				abbrv = false
+			} else {
+				r.UnreadRune()
+			}
+			if d := r.readRune(); d == '{' {
+				def = false
+				sep = strings.TrimSuffix(r.readBraces(), "}")
+			} else {
+				r.UnreadRune()
+			}
+			part := n.getPart(typ)
+			if len(part) == 0 {
+				return ""
+			}
+			buf.WriteString(part.Format(abbrv, sep, def))
+		case unicode.IsLetter(c):
+			log.Printf("Non valid char")
+		default:
+			buf.WriteRune(c)
+		}
+	}
+	return dtie(buf.String())
+}
+
+// dtie turns a lone trailing '~' into ' ' unless the text before it is short.
+func dtie(s string) string {
+	n := len(s)
+	if n < 2 || s[n-1] != '~' || s[n-2] == '~' {
+		return s
+	}
+	head := s[:n-1]
+	if length(head) <= 2 {
+		return head + "~"
+	}
+	return head + " "
+}
+
+// Format formats the name n according to the format string s. Each top-level
+// brace group is rendered by formatPart; other runes are copied verbatim. A
+// stray '}' is logged and dropped.
+func (n Name) Format(s string) string {
+	r := newReader(s)
+	var out bytes.Buffer
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		if c == '{' {
+			out.WriteString(n.formatPart("{" + r.readBraces()))
+		} else if c == '}' {
+			log.Printf("Unbalanced braced in format string %s", s)
+		} else {
+			out.WriteRune(c)
+		}
+	}
+	return out.String()
+}
+
+// findVon returns the index of the first lower-case token, last one excluded, or -1.
+func findVon(part NamePart) int {
+	for i := 0; i < len(part)-1; i++ {
+		if part[i].isLower() {
+			return i
+		}
+	}
+	return -1
+}
+
+// reading tokens from the penultimate token, the boundary between von and last
+// is after the first lower token
+func splitVonLast(part NamePart) (NamePart, NamePart) {
+	var i int
+	if len(part) < 2 {
+		return part[:0], part[:]
+	}
+	for i = len(part) - 2; i >= 0; i-- {
+		if part[i].isLower() {
+			i += 1
+			break
+		}
+	}
+	if i == -1 {
+		return part[:0], part[:]
+	} else {
+		return part[:i], part[i:]
+	}
+}
+
+// only when there is no von part and no comma
+// reading tokens from the last token, tokens are added to the last name if
+// they are connected to the following token by a hyphen
+func splitFirstLast(part NamePart) (NamePart, NamePart) {
+	var i int
+	if len(part) == 0 {
+		return part[:0], part[:]
+	}
+	for i = len(part) - 1; i >= 0; i-- {
+		if part[i].Sep != '-' {
+			break
+		}
+	}
+	if i == -1 {
+		return part[:0], part[:]
+	} else {
+		return part[:i], part[i:]
+	}
+}
+
+// SplitNames parses the string s and returns a list of names.
+// A proto name made of one part is read as "First von Last", of two parts
+// as "von Last, First" and of three parts as "von Last, Jr, First". The von
+// and last parts are always separated by splitVonLast.
+func SplitNames(s string) (res []Name) {
+	for _, proto := range splitNames(s) {
+		var n Name
+		switch len(proto) {
+		case 1:
+			// the von part, if any, starts at the first lower-case token
+			if i := findVon(proto[0]); i >= 0 {
+				n.First = proto[0][:i]
+				n.Von, n.Last = splitVonLast(proto[0][i:])
+			} else {
+				n.First, n.Last = splitFirstLast(proto[0])
+			}
+		case 2:
+			n.Von, n.Last = splitVonLast(proto[0])
+			n.First = proto[1]
+		case 3:
+			// the middle part is the jr part
+			n.Von, n.Last = splitVonLast(proto[0])
+			n.Jr, n.First = proto[1], proto[2]
+		}
+		// any other number of parts yields an empty Name
+		res = append(res, n)
+	}
+	return res
+}
diff --git a/utils.go b/utils.go
new file mode 100644
index 0000000..1ca8298
--- /dev/null
+++ b/utils.go
@@ -0,0 +1,149 @@
+package bibtex
+
+import (
+	"bytes"
+	"strings"
+	"unicode"
+)
+
+// reader scans LaTeX strings; its methods help handling special chars, etc.
+type reader struct {
+	*strings.Reader
+}
+
+func newReader(s string) *reader { // newReader returns a reader positioned at the start of s
+	return &reader{strings.NewReader(s)}
+}
+
+func (r *reader) readRune() rune {
+	ch, _, _ := r.ReadRune() // error dropped; NOTE(review): end of input gives rune 0, confirm eof is 0
+	return ch
+}
+
+// readBraces reads up to and including the '}' matching an already consumed
+// '{' and returns the text read; at eof it returns what was read so far.
+func (r *reader) readBraces() string {
+	var out bytes.Buffer
+	for depth := 1; depth > 0; {
+		c := r.readRune()
+		switch {
+		case c == eof:
+			return out.String()
+		case c == '{':
+			depth++
+		case c == '}':
+			depth--
+		}
+		out.WriteRune(c)
+	}
+	return out.String()
+}
+
+func (r *reader) skipBraces() {
+	bracelevel := 1
+	var c rune
+	for bracelevel > 0 {
+		c = r.readRune()
+		switch {
+		case c == '{':
+			bracelevel += 1
+		case c == '}':
+			bracelevel -= 1
+		case c == eof:
+			return
+		}
+	}
+}
+
+// peek returns the next rune without consuming it.
+func (r *reader) peek() rune {
+	defer r.UnreadRune()
+	return r.readRune()
+}
+
+// length returns the length of the LaTeX string s. A special char (a brace
+// group opened at level 0 whose first rune is a backslash) counts as one.
+// Every other rune counts as one, braces included: this is meant to be
+// consistent with the original BibTeX implementation, even though it looks
+// odd.
+func length(s string) (res int) {
+	r := newReader(s)
+	depth := 0
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		res++
+		switch c {
+		case '{':
+			depth++
+			if depth == 1 && r.peek() == '\\' {
+				// special char: swallow the whole group, it has been
+				// counted once already
+				r.skipBraces()
+				depth = 0
+			}
+		case '}':
+			depth--
+		}
+	}
+	return res
+}
+
+func (r *reader) searchBraces() (bool, bool) {
+	bracelevel := 1
+	var c rune
+	for bracelevel > 0 {
+		c = r.readRune()
+		switch {
+		case c == '{':
+			bracelevel += 1
+		case c == '}':
+			bracelevel -= 1
+		case unicode.IsLower(c):
+			return true, true
+		case unicode.IsUpper(c):
+			return false, true
+		case c == eof:
+			return false, false
+		}
+	}
+	return false, false
+}
+
+// readCommand returns the run of letters at the current position; the rune after it stays unread.
+func (r *reader) readCommand() string {
+	var cmd bytes.Buffer
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		if !unicode.IsLetter(c) {
+			r.UnreadRune()
+			break
+		}
+		cmd.WriteRune(c)
+	}
+	return cmd.String()
+}
+
+// isLower reports whether the first cased letter of t is lower-case; special
+// characters use lcommands/ucommands, other brace groups are skipped.
+func (t *Token) isLower() bool {
+	r := newReader(t.Text)
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		if unicode.IsLower(c) || unicode.IsUpper(c) {
+			return unicode.IsLower(c)
+		}
+		if c != '{' {
+			continue
+		}
+		if r.readRune() != '\\' {
+			r.UnreadRune()
+			r.skipBraces()
+			continue
+		}
+		cmd := r.readCommand()
+		if lcommands[cmd] || ucommands[cmd] {
+			return lcommands[cmd]
+		}
+		if lower, found := r.searchBraces(); found {
+			return lower
+		}
+	}
+	return false
+}
-- 
cgit v1.2.3-70-g09d2