From bd74996cf63f60e41df951e281410a1de32cb0b7 Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Sun, 21 Feb 2016 21:27:08 -0500 Subject: Add name parsing/formatting logic --- names.go | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ utils.go | 149 ++++++++++++++++++++++++++++ 2 files changed, 480 insertions(+) create mode 100644 names.go create mode 100644 utils.go diff --git a/names.go b/names.go new file mode 100644 index 0000000..6491d35 --- /dev/null +++ b/names.go @@ -0,0 +1,331 @@ +package bibtex + +import ( + "bytes" + "log" + "strings" + "unicode" +) + +const sepChars = "-~" + +var lcommands map[string]bool = map[string]bool{ + "aa": true, "oe": true, "l": true, "ae": true, + "o": true, "ss": true, "i": true, "j": true, +} + +var ucommands map[string]bool = map[string]bool{ + "AA": true, "L": true, "OE": true, "AE": true, "O": true, +} + +type Token struct { + Sep rune // the separator preceding the token + Text string +} + +type NamePart []Token + +// a proto name before the parts have been separated in f,v,l,f +type name []NamePart + +// abbreviate a string. return the first letter or special character +func _abbrv(s string) string { + reader := newReader(s) + var buf bytes.Buffer + for c := reader.readRune(); c != eof; c = reader.readRune() { + switch { + case c == '{': + c = reader.readRune() + if c == '\\' { + buf.WriteString("{\\") + buf.WriteString(reader.readBraces()) + return buf.String() + } else { + reader.UnreadRune() + } + case unicode.IsLetter(c): + buf.WriteRune(c) + return buf.String() + } + } + return buf.String() +} + +func printSep(buf *bytes.Buffer, sep rune, last bool) { + switch { + case sep == '-', sep == '~': + buf.WriteRune(sep) + case last || length(buf.String()) <= 2: + buf.WriteRune('~') + default: + buf.WriteRune(' ') + } +} + +// Format returns a string representation of NamePart p. +// If abbrv the token will be abbreviated. If def is true, the default +// inter-token will be used: this is either space or ~ depending on the length +// of the token. If def is false sep is used as the inter-token separator. +func (p NamePart) Format(abbrv bool, sep string, def bool) string { + var buf bytes.Buffer + for i, t := range p { + if i > 0 { + if abbrv { + buf.WriteRune('.') + } + if def { + printSep(&buf, t.Sep, i == len(p)-1) + } else { + buf.WriteString(sep) + } + } + if abbrv { + buf.WriteString(_abbrv(t.Text)) + } else { + buf.WriteString(t.Text) + } + } + return buf.String() +} + +func addNonEmptyToken(b *bytes.Buffer, t *Token, part *NamePart, c rune) { + if b.Len() > 0 { + t.Text = b.String() + b.Reset() + *part = append(*part, *t) + *t = Token{Sep: c} + } +} + +func addNonEmptyPart(part *NamePart, name *name, t *Token, c rune) { + if len(*part) > 0 || len(*name) == 0 { + *name = append(*name, *part) + } + *part = nil + *t = Token{Sep: c} +} + +// tokenize a string and return a list of proto names, the parts of the proto +// name are the result of splitting on commas, but further splitting is done +// later +func splitNames(s string) (names []name) { + if len(s) == 0 { + return nil + } + var name name // current name + var part NamePart // current name Part + Token := &Token{Sep: rune(0)} // current token + bracelevel := 0 + space := false // was the last non-special character a WS? + var buf bytes.Buffer + for _, c := range s { + switch { + case c == '{': + buf.WriteRune(c) + bracelevel += 1 + case c == '}': + buf.WriteRune(c) + bracelevel -= 1 + case unicode.IsSpace(c) && bracelevel == 0: + if space && strings.ToLower(buf.String()) == "and" { + buf.Reset() + addNonEmptyPart(&part, &name, Token, c) + names = append(names, name) + name = nil + } + addNonEmptyToken(&buf, Token, &part, c) + space = true + case bracelevel == 0 && strings.ContainsRune(sepChars, c): + space = false + addNonEmptyToken(&buf, Token, &part, c) + case bracelevel == 0 && c == ',': + space = false + addNonEmptyToken(&buf, Token, &part, c) + // a name cannot have more than 3 proto parts, hence a comma + // counts as a separation between parts only the first two times, + // i.e. when len(name) == 0 or len(name) == 1 + if len(name) <= 1 { + addNonEmptyPart(&part, &name, Token, c) + } + default: + buf.WriteRune(c) + } + } + addNonEmptyToken(&buf, Token, &part, ',') + addNonEmptyPart(&part, &name, Token, ',') + names = append(names, name) + return +} + +type Name struct { + First NamePart + Von NamePart + Last NamePart + Jr NamePart +} + +func (n Name) getPart(c rune) (p NamePart) { + switch c { + case 'f': + p = n.First + case 'v': + p = n.Von + case 'l': + p = n.Last + case 'j': + p = n.Jr + } + return +} + +// here the format string f is assumed to only contain one part +func (n Name) formatPart(f string) string { + f = f[1 : len(f)-1] + r := newReader(f) + var buf bytes.Buffer + abbrv := true + found := false + def := true + sep := "" + for c := r.readRune(); c != eof; c = r.readRune() { + switch { + case strings.ContainsRune("fvljFVLJ", c) && !found: + found = true + typ := unicode.ToLower(c) + if d := r.readRune(); unicode.ToLower(d) == typ { + abbrv = false + } else { + r.UnreadRune() + } + if d := r.readRune(); d == '{' { + def = false + sep = "{" + r.readBraces() + sep = sep[1 : len(sep)-1] + } else { + r.UnreadRune() + } + part := n.getPart(typ) + if len(part) == 0 { + return "" + } + s := part.Format(abbrv, sep, def) + buf.WriteString(s) + case unicode.IsLetter(c): + log.Printf("Non valid char") + default: + buf.WriteRune(c) + } + } + return dtie(buf.String()) +} + +// handles discretionary ties +func dtie(s string) string { + if len(s) >= 2 && s[len(s)-1] == '~' && s[len(s)-2] != '~' { + s = s[:len(s)-1] + if length(s) <= 2 { + s += "~" + } else { + s += " " + } + } + return s +} + +// Format formats the name n according to the format string s. +func (n Name) Format(s string) string { + r := newReader(s) + var buf bytes.Buffer + for c := r.readRune(); c != eof; c = r.readRune() { + switch { + case c == '{': + fmt := "{" + r.readBraces() + buf.WriteString(n.formatPart(fmt)) + case c == '}': + log.Printf("Unbalanced braced in format string %s", s) + default: + buf.WriteRune(c) + } + } + return buf.String() +} + +// the von part starts at the first non last lower-case taken +func findVon(part NamePart) int { + for i, t := range part { + if t.isLower() && i < len(part)-1 { + return i + } + } + return -1 +} + +// reading tokens from the penultimate token, the boundary between von and last +// is after the first lower token +func splitVonLast(part NamePart) (NamePart, NamePart) { + var i int + if len(part) < 2 { + return part[:0], part[:] + } + for i = len(part) - 2; i >= 0; i-- { + if part[i].isLower() { + i += 1 + break + } + } + if i == -1 { + return part[:0], part[:] + } else { + return part[:i], part[i:] + } +} + +// only when there is no von part and no comma +// reading tokens from the last token, tokens are added to the last name if +// they are connected to the following token by a hyphen +func splitFirstLast(part NamePart) (NamePart, NamePart) { + var i int + if len(part) == 0 { + return part[:0], part[:] + } + for i = len(part) - 1; i >= 0; i-- { + if part[i].Sep != '-' { + break + } + } + if i == -1 { + return part[:0], part[:] + } else { + return part[:i], part[i:] + } +} + +// SplitNames parses the string s and returns a list of names. +func SplitNames(s string) (res []Name) { + names := splitNames(s) + var first, von, last, vonlast, jr NamePart + for _, name := range names { + first, von, last, vonlast, jr = nil, nil, nil, nil, nil + if len(name) == 1 { + i := findVon(name[0]) + if i >= 0 { + first = name[0][:i] + vonlast = name[0][i:] + von, last = splitVonLast(vonlast) + } else { + first, last = splitFirstLast(name[0]) + } + } else if len(name) == 2 { + vonlast = name[0] + von, last = splitVonLast(vonlast) + first = name[1] + } else if len(name) == 3 { + vonlast = name[0] + von, last = splitVonLast(vonlast) + first = name[2] + jr = name[1] + } + res = append(res, Name{first, von, last, jr}) + } + return +} diff --git a/utils.go b/utils.go new file mode 100644 index 0000000..1ca8298 --- /dev/null +++ b/utils.go @@ -0,0 +1,149 @@ +package bibtex + +import ( + "bytes" + "strings" + "unicode" +) + +// reader for LaTeX strings with methods to easily handle special chars, etc +type reader struct { + *strings.Reader +} + +func newReader(s string) *reader { + return &reader{Reader: strings.NewReader(s)} +} + +func (r *reader) readRune() rune { + c, _, _ := r.Reader.ReadRune() + return c +} + +func (r *reader) readBraces() string { + bracelevel := 1 + var c rune + var buf bytes.Buffer + for bracelevel > 0 { + c = r.readRune() + switch { + case c == '{': + bracelevel += 1 + case c == '}': + bracelevel -= 1 + case c == eof: + return buf.String() + } + buf.WriteRune(c) + } + return buf.String() +} + +func (r *reader) skipBraces() { + bracelevel := 1 + var c rune + for bracelevel > 0 { + c = r.readRune() + switch { + case c == '{': + bracelevel += 1 + case c == '}': + bracelevel -= 1 + case c == eof: + return + } + } +} + +func (r *reader) peek() rune { + ch := r.readRune() + r.UnreadRune() + return ch +} + +// length of a LaTeX string. A special char counts as one, braces count as one +// to be consitent with the original BibTeX implementation, but this seems +// stupid +func length(s string) (res int) { + reader := newReader(s) + res = 0 + blevel := 0 + for c := reader.readRune(); c != eof; c = reader.readRune() { + switch { + case c == '{' && blevel == 0: + blevel += 1 + if c := reader.peek(); c == '\\' { + // special char, count as one + reader.skipBraces() + blevel = 0 + } + case c == '{': + blevel += 1 + case c == '}': + blevel -= 1 + } + res += 1 + } + return +} + +func (r *reader) searchBraces() (bool, bool) { + bracelevel := 1 + var c rune + for bracelevel > 0 { + c = r.readRune() + switch { + case c == '{': + bracelevel += 1 + case c == '}': + bracelevel -= 1 + case unicode.IsLower(c): + return true, true + case unicode.IsUpper(c): + return false, true + case c == eof: + return false, false + } + } + return false, false +} + +func (r *reader) readCommand() string { + var buf bytes.Buffer + for c := r.readRune(); c != eof; c = r.readRune() { + if !unicode.IsLetter(c) { + r.UnreadRune() + break + } else { + buf.WriteRune(c) + } + } + return buf.String() +} + +func (t *Token) isLower() bool { + reader := newReader(t.Text) + for c := reader.readRune(); c != eof; c = reader.readRune() { + switch { + case c == '{': + if c := reader.readRune(); c == '\\' { + command := reader.readCommand() + if lcommands[command] { + return true + } else if ucommands[command] { + return false + } else if a, b := reader.searchBraces(); b { + return a + } + } else { + reader.UnreadRune() + reader.skipBraces() + } + case unicode.IsLower(c): + return true + case unicode.IsUpper(c): + return false + } + } + return false +} -- cgit v1.2.3-70-g09d2