From 3f442a5cfa42488efd2d070fc1e0e2cd92289f81 Mon Sep 17 00:00:00 2001
From: Thibaut Horel
Date: Sun, 28 Feb 2016 11:25:23 -0500
Subject: Latex sanitizing

---
 names.go      |   4 +-
 names_test.go |  19 ++++++++++
 utils.go      | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/names.go b/names.go
index 6491d35..ddd7b17 100644
--- a/names.go
+++ b/names.go
@@ -253,7 +253,7 @@ func (n Name) Format(s string) string {
 // the von part starts at the first non last lower-case taken
 func findVon(part NamePart) int {
 	for i, t := range part {
-		if t.isLower() && i < len(part)-1 {
+		if isLower(t.Text) && i < len(part)-1 {
 			return i
 		}
 	}
@@ -268,7 +268,7 @@ func splitVonLast(part NamePart) (NamePart, NamePart) {
 		return part[:0], part[:]
 	}
 	for i = len(part) - 2; i >= 0; i-- {
-		if part[i].isLower() {
+		if isLower(part[i].Text) {
 			i += 1
 			break
 		}
diff --git a/names_test.go b/names_test.go
index ad8ba05..2d74f1c 100644
--- a/names_test.go
+++ b/names_test.go
@@ -124,3 +124,22 @@ func TestIsLower(t *testing.T) {
 		}
 	}
 }
+
+func TestRemoveLatex(t *testing.T) {
+	tests := []struct {
+		s        string
+		expected string
+	}{
+		{"\\' a\\OE{\\c {cd}}\\&{\\emph{test}}", "áŒç&test"},
+		// an accent with nothing to apply to lands on a space
+		{"caf\\'", "caf \u0301"},
+		{"\\^{", " \u0302"},
+	}
+	for _, test := range tests {
+		actual := removeLatexCommands(test.s)
+		if test.expected != actual {
+			t.Errorf("%q, expected: %q, actual: %q", test.s,
+				test.expected, actual)
+		}
+	}
+}
diff --git a/utils.go b/utils.go
index e91ae44..3a1dd6a 100644
--- a/utils.go
+++ b/utils.go
@@ -4,8 +4,50 @@ import (
 	"bytes"
 	"strings"
 	"unicode"
+
+	"golang.org/x/text/unicode/norm"
 )
 
+// accents maps LaTeX accent command names to Unicode combining marks
+var accents = map[string]string{
+	"'":  "\u0301",
+	"`":  "\u0300",
+	"^":  "\u0302",
+	"\"": "\u0308",
+	"~":  "\u0303",
+	"=":  "\u0304",
+	".":  "\u0307",
+	"u":  "\u0306",
+	"v":  "\u030C",
+	"H":  "\u030B",
+	"t":  "\u0361",
+	"c":  "\u0327",
+	"d":  "\u0323",
+	"b":  "\u0331",
+}
+
+// commands maps LaTeX symbol command names to the text they stand for
+var commands = map[string]string{
+	"aa": "å",
+	"AA": "Å",
+	"oe": "œ",
+	"ae": "æ",
+	"OE": "Œ",
+	"AE": "Æ",
+	"ss": "ß",
+	"o":  "ø",
+	"O":  "Ø",
+	"L":  "Ł",
+	"l":  "ł",
+	"$":  "$",
+	"{":  "{",
+	"}":  "}",
+	"_":  "_",
+	"#":  "#",
+	"%":  "%",
+	"&":  "&",
+}
+
 // reader for LaTeX strings with methods to easily handle special chars, etc
 type reader struct {
 	*strings.Reader
@@ -61,6 +103,82 @@ func (r *reader) peek() rune {
 	return ch
 }
 
+// eatSpace skips whitespace, leaving the reader on the first non-space rune
+// (or at the end of the input).
+func (r *reader) eatSpace() {
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		if !unicode.IsSpace(c) {
+			r.UnreadRune()
+			return
+		}
+	}
+}
+
+// readLetter reads the argument of an accent command and returns the rune the
+// accent applies to: the next rune or, when a {...} group follows, the first
+// rune of the group that is not a brace (the whole group is consumed). A space
+// is returned when there is no such rune (empty group or end of input), so
+// that the combining mark written after it always has a base character.
+func (r *reader) readLetter() rune {
+	c := r.readRune()
+	if c == eof {
+		return ' '
+	}
+	if c != '{' {
+		return c
+	}
+	letter, found := ' ', false
+	for depth := 1; depth > 0; {
+		switch c = r.readRune(); c {
+		case eof:
+			// unterminated group: never hand eof back as a letter
+			return letter
+		case '{':
+			depth++
+		case '}':
+			depth--
+		default:
+			if !found {
+				letter, found = c, true
+			}
+		}
+	}
+	return letter
+}
+
+// removeLatexCommands converts a LaTeX string to plain text. Commands found in
+// the commands table are replaced by their character, accent commands by
+// their argument followed by the matching combining mark; any other command
+// and all braces are dropped. The result is NFC-normalized so that a letter
+// and its combining mark are composed whenever a precomposed form exists.
+func removeLatexCommands(s string) string {
+	r := newReader(s)
+	var buf bytes.Buffer
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		switch c {
+		case '\\':
+			command := r.readCommand()
+			if command == "" {
+				// no command name was read: the command is the next rune,
+				// as in \' or \&
+				command = string(r.readRune())
+			}
+			if v, in := commands[command]; in {
+				buf.WriteString(v)
+			} else if v, in := accents[command]; in {
+				r.eatSpace()
+				buf.WriteRune(r.readLetter())
+				buf.WriteString(v)
+			}
+		case '{', '}':
+			// grouping braces produce no output
+		default:
+			buf.WriteRune(c)
+		}
+	}
+	return string(norm.NFC.Bytes(buf.Bytes()))
+}
+
 // length of a LaTeX string. A special char counts as one, braces count as one
 // to be consitent with the original BibTeX implementation, but this seems
 // stupid
-- 
cgit v1.2.3-70-g09d2