summaryrefslogtreecommitdiffstats
path: root/names.go
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2016-02-21 21:27:08 -0500
committerThibaut Horel <thibaut.horel@gmail.com>2016-02-21 21:27:08 -0500
commitbd74996cf63f60e41df951e281410a1de32cb0b7 (patch)
tree9be46abf1fd4aea8ea728585fc42c53b912ea871 /names.go
parent3870898761174089f13596f147b8412300a17207 (diff)
downloadbibtex-bd74996cf63f60e41df951e281410a1de32cb0b7.tar.gz
Add name parsing/formatting logic
Diffstat (limited to 'names.go')
-rw-r--r--names.go331
1 files changed, 331 insertions, 0 deletions
diff --git a/names.go b/names.go
new file mode 100644
index 0000000..6491d35
--- /dev/null
+++ b/names.go
@@ -0,0 +1,331 @@
+package bibtex
+
+import (
+ "bytes"
+ "log"
+ "strings"
+ "unicode"
+)
+
+const sepChars = "-~" // runes that, outside braces, end the pending token in splitNames and become the Sep of the token that follows
+
+var lcommands map[string]bool = map[string]bool{ // NOTE(review): presumably names of TeX commands that yield a lower-case letter; not referenced in the visible code - confirm where it is consulted
+ "aa": true, "oe": true, "l": true, "ae": true,
+ "o": true, "ss": true, "i": true, "j": true,
+}
+
+var ucommands map[string]bool = map[string]bool{ // NOTE(review): presumably the upper-case counterpart of lcommands; not referenced in the visible code - confirm
+ "AA": true, "L": true, "OE": true, "AE": true, "O": true,
+}
+
+type Token struct { // Token is one piece of a name part together with the separator read before it
+ Sep rune // the separator preceding the token
+ Text string // the token's text; splitNames copies braces into it verbatim
+}
+
+type NamePart []Token // NamePart is a sequence of tokens; Format renders it as a string
+
+// name is a proto name: its parts as split on commas, before they are assigned to first, von, last and jr
+type name []NamePart
+
+// abbreviate a string. return the first letter or special character
+func _abbrv(s string) string {
+ reader := newReader(s)
+ var buf bytes.Buffer
+ for c := reader.readRune(); c != eof; c = reader.readRune() {
+ switch {
+ case c == '{':
+ c = reader.readRune()
+ if c == '\\' {
+ buf.WriteString("{\\")
+ buf.WriteString(reader.readBraces())
+ return buf.String()
+ } else {
+ reader.UnreadRune()
+ }
+ case unicode.IsLetter(c):
+ buf.WriteRune(c)
+ return buf.String()
+ }
+ }
+ return buf.String()
+}
+
+// printSep appends to buf the separator that goes in front of a token.
+// An explicit '-' or '~' is written as is. Otherwise a tie ('~') is used when
+// the token is the last one or when what has been written so far measures at
+// most 2 according to length (a helper defined outside this file); in every
+// other case a plain space is written.
+func printSep(buf *bytes.Buffer, sep rune, last bool) {
+	out := ' '
+	if sep == '-' || sep == '~' {
+		out = sep
+	} else if last || length(buf.String()) <= 2 {
+		out = '~'
+	}
+	buf.WriteRune(out)
+}
+
+// Format renders the tokens of p as a single string.
+//
+// When abbrv is true every token is replaced by its abbreviation (see _abbrv)
+// and a '.' is inserted between consecutive tokens; no '.' is added after the
+// last one. Between tokens, sep is written when def is false; when def is
+// true the separator is chosen by printSep from the token's own Sep, its
+// position and the length of the text produced so far.
+func (p NamePart) Format(abbrv bool, sep string, def bool) string {
+	var out bytes.Buffer
+	final := len(p) - 1
+	for idx, tok := range p {
+		if idx != 0 {
+			if abbrv {
+				out.WriteRune('.')
+			}
+			if !def {
+				out.WriteString(sep)
+			} else {
+				printSep(&out, tok.Sep, idx == final)
+			}
+		}
+		text := tok.Text
+		if abbrv {
+			text = _abbrv(text)
+		}
+		out.WriteString(text)
+	}
+	return out.String()
+}
+
+// addNonEmptyToken flushes the text pending in b, if any, as the next token
+// of part: t receives the text and is appended to part, b is emptied, and t
+// is replaced by a fresh token whose separator is c. When b is empty nothing
+// happens, so t keeps the separator it already carried.
+func addNonEmptyToken(b *bytes.Buffer, t *Token, part *NamePart, c rune) {
+	if b.Len() == 0 {
+		return
+	}
+	t.Text = b.String()
+	*part = append(*part, *t)
+	b.Reset()
+	*t = Token{Sep: c}
+}
+
+// addNonEmptyPart closes the current part. The part is appended to name when
+// it holds at least one token, or unconditionally when name has no part yet
+// (so the first part of a name may be empty). In all cases part is cleared
+// and t is replaced by a fresh token whose separator is c.
+func addNonEmptyPart(part *NamePart, name *name, t *Token, c rune) {
+	keep := len(*part) != 0 || len(*name) == 0
+	if keep {
+		*name = append(*name, *part)
+	}
+	*t = Token{Sep: c}
+	*part = nil
+}
+
+// splitNames tokenizes s and returns one proto name per author. Names are
+// separated by the word "and" (any case) standing between white space at
+// brace level 0. Inside a name, tokens end at white space, at one of the
+// sepChars or at a comma, all at brace level 0 only; braces themselves are
+// kept in the token text. The parts of a proto name are the result of
+// splitting on commas; further splitting into first/von/last is done later.
+// An empty s yields nil.
+func splitNames(s string) (names []name) {
+	if len(s) == 0 {
+		return nil
+	}
+	var (
+		cur  name         // parts of the name being read
+		part NamePart     // tokens of the part being read
+		buf  bytes.Buffer // text of the token being read
+	)
+	tok := &Token{Sep: rune(0)} // token being read; Sep is what preceded it
+	bracelevel := 0
+	// space records that white space has been seen since the last sepChars
+	// rune or comma, i.e. that the pending word can be a free-standing "and".
+	space := false
+	for _, c := range s {
+		switch {
+		case c == '{':
+			buf.WriteRune(c)
+			bracelevel++
+		case c == '}':
+			buf.WriteRune(c)
+			// Never go below zero: a stray '}' must not switch off every
+			// later split, all of which test bracelevel == 0.
+			if bracelevel > 0 {
+				bracelevel--
+			}
+		case unicode.IsSpace(c) && bracelevel == 0:
+			if space && strings.EqualFold(buf.String(), "and") {
+				// "and" is a name separator, not a token: drop it and
+				// close the current name.
+				buf.Reset()
+				addNonEmptyPart(&part, &cur, tok, c)
+				names = append(names, cur)
+				cur = nil
+			}
+			addNonEmptyToken(&buf, tok, &part, c)
+			space = true
+		case bracelevel == 0 && strings.ContainsRune(sepChars, c):
+			space = false
+			addNonEmptyToken(&buf, tok, &part, c)
+		case bracelevel == 0 && c == ',':
+			space = false
+			addNonEmptyToken(&buf, tok, &part, c)
+			// a name cannot have more than 3 proto parts, hence a comma
+			// counts as a separation between parts only the first two times,
+			// i.e. when len(cur) == 0 or len(cur) == 1
+			if len(cur) <= 1 {
+				addNonEmptyPart(&part, &cur, tok, c)
+			}
+		default:
+			buf.WriteRune(c)
+		}
+	}
+	// Flush whatever is pending at the end of the input.
+	addNonEmptyToken(&buf, tok, &part, ',')
+	addNonEmptyPart(&part, &cur, tok, ',')
+	names = append(names, cur)
+	return names
+}
+
+type Name struct { // Name is one parsed name, as built by SplitNames
+ First NamePart // first-name tokens
+ Von NamePart // tokens of the "von" part, as separated by splitVonLast
+ Last NamePart // last-name tokens
+ Jr NamePart // middle part of a name written with two commas
+}
+
+// getPart returns the part of n designated by the letter c: 'f' for First,
+// 'v' for Von, 'l' for Last and 'j' for Jr. Any other value yields nil.
+func (n Name) getPart(c rune) (p NamePart) {
+	if c == 'f' {
+		return n.First
+	}
+	if c == 'v' {
+		return n.Von
+	}
+	if c == 'l' {
+		return n.Last
+	}
+	if c == 'j' {
+		return n.Jr
+	}
+	return nil
+}
+
+// formatPart formats one part of n according to f, a single brace group of
+// a format string including its outer braces.
+//
+// Inside the group, the first of the letters f, v, l, j (either case) selects
+// the part to print. If the same letter follows immediately, the tokens are
+// printed in full, otherwise abbreviated. A brace group directly after the
+// letter(s) gives the separator to put between tokens instead of the default
+// one. Any other letter is reported through log and dropped; every other
+// character is copied as is. If the selected part is empty the whole group
+// produces "". The result is passed through dtie. A group shorter than two
+// bytes has no content and produces "".
+func (n Name) formatPart(f string) string {
+	if len(f) < 2 {
+		// Nothing between the braces (or no braces at all); slicing below
+		// would panic.
+		return ""
+	}
+	f = f[1 : len(f)-1]
+	r := newReader(f)
+	var buf bytes.Buffer
+	found := false
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		switch {
+		case strings.ContainsRune("fvljFVLJ", c) && !found:
+			found = true
+			typ := unicode.ToLower(c)
+			// A doubled letter asks for the full tokens.
+			abbrv := true
+			if d := r.readRune(); unicode.ToLower(d) == typ {
+				abbrv = false
+			} else {
+				r.UnreadRune()
+			}
+			// An immediately following brace group is the token separator.
+			def, sep := true, ""
+			if d := r.readRune(); d == '{' {
+				def = false
+				sep = r.readBraces()
+				// Drop the final byte (the group's closing brace), guarding
+				// against an empty result.
+				if len(sep) > 0 {
+					sep = sep[:len(sep)-1]
+				}
+			} else {
+				r.UnreadRune()
+			}
+			part := n.getPart(typ)
+			if len(part) == 0 {
+				return ""
+			}
+			buf.WriteString(part.Format(abbrv, sep, def))
+		case unicode.IsLetter(c):
+			log.Printf("Non valid char %q in format string %q", c, f)
+		default:
+			buf.WriteRune(c)
+		}
+	}
+	return dtie(buf.String())
+}
+
+// dtie handles a discretionary tie at the end of s. When s has at least two
+// bytes and ends in exactly one '~' (the byte before it is not '~'), the tie
+// is kept if the text in front of it measures at most 2 according to length
+// (a helper defined outside this file) and is replaced by a space otherwise.
+// Any other s is returned unchanged.
+func dtie(s string) string {
+	if len(s) < 2 || !strings.HasSuffix(s, "~") || strings.HasSuffix(s, "~~") {
+		return s
+	}
+	head := s[:len(s)-1]
+	if length(head) <= 2 {
+		return head + "~"
+	}
+	return head + " "
+}
+
+// Format formats the name n according to the format string s.
+// Each top-level brace group of s is replaced by the output of formatPart for
+// that group; a '}' met outside a group is reported through log and dropped;
+// every other character is copied unchanged.
+func (n Name) Format(s string) string {
+	var out bytes.Buffer
+	in := newReader(s)
+	for {
+		c := in.readRune()
+		if c == eof {
+			break
+		}
+		if c == '{' {
+			group := "{" + in.readBraces()
+			out.WriteString(n.formatPart(group))
+			continue
+		}
+		if c == '}' {
+			log.Printf("Unbalanced braced in format string %s", s)
+			continue
+		}
+		out.WriteRune(c)
+	}
+	return out.String()
+}
+
+// findVon returns the index at which the von part starts, i.e. the index of
+// the first token that is lower-case (per isLower) and is not the last token
+// of part. It returns -1 when there is no such token.
+func findVon(part NamePart) int {
+	lastIdx := len(part) - 1
+	for i := 0; i < len(part); i++ {
+		if part[i].isLower() && i != lastIdx {
+			return i
+		}
+	}
+	return -1
+}
+
+// reading tokens from the penultimate token, the boundary between von and last
+// is after the first lower token
+func splitVonLast(part NamePart) (NamePart, NamePart) {
+ var i int
+ if len(part) < 2 {
+ return part[:0], part[:]
+ }
+ for i = len(part) - 2; i >= 0; i-- {
+ if part[i].isLower() {
+ i += 1
+ break
+ }
+ }
+ if i == -1 {
+ return part[:0], part[:]
+ } else {
+ return part[:i], part[i:]
+ }
+}
+
+// only when there is no von part and no comma
+// reading tokens from the last token, tokens are added to the last name if
+// they are connected to the following token by a hyphen
+func splitFirstLast(part NamePart) (NamePart, NamePart) {
+ var i int
+ if len(part) == 0 {
+ return part[:0], part[:]
+ }
+ for i = len(part) - 1; i >= 0; i-- {
+ if part[i].Sep != '-' {
+ break
+ }
+ }
+ if i == -1 {
+ return part[:0], part[:]
+ } else {
+ return part[:i], part[i:]
+ }
+}
+
+// SplitNames parses the string s and returns a list of names.
+//
+// s is first cut into proto names by splitNames. Depending on the number of
+// comma-separated parts of a proto name, its fields are filled as follows:
+//
+//	1 part  ("First von Last"):      the von part starts at the token found
+//	                                 by findVon; without one, splitFirstLast
+//	                                 decides where the last name begins
+//	2 parts ("von Last, First"):     part 0 is von+last, part 1 is first
+//	3 parts ("von Last, Jr, First"): part 0 is von+last, part 1 is jr,
+//	                                 part 2 is first
+//
+// The fields of a returned Name are cut out of a common token slice; their
+// capacity is clipped to their length so that appending to one field can
+// never overwrite the tokens of another.
+func SplitNames(s string) (res []Name) {
+	// clip removes spare capacity; a nil slice stays nil.
+	clip := func(p NamePart) NamePart { return p[:len(p):len(p)] }
+
+	for _, proto := range splitNames(s) {
+		var first, von, last, jr NamePart
+		switch len(proto) {
+		case 1:
+			if i := findVon(proto[0]); i >= 0 {
+				first = proto[0][:i]
+				von, last = splitVonLast(proto[0][i:])
+			} else {
+				first, last = splitFirstLast(proto[0])
+			}
+		case 2:
+			von, last = splitVonLast(proto[0])
+			first = proto[1]
+		case 3:
+			von, last = splitVonLast(proto[0])
+			jr = proto[1]
+			first = proto[2]
+		}
+		res = append(res, Name{
+			First: clip(first),
+			Von:   clip(von),
+			Last:  clip(last),
+			Jr:    clip(jr),
+		})
+	}
+	return res
+}