From 3f442a5cfa42488efd2d070fc1e0e2cd92289f81 Mon Sep 17 00:00:00 2001
From: Thibaut Horel <thibaut.horel@gmail.com>
Date: Sun, 28 Feb 2016 11:25:23 -0500
Subject: Latex sanitizing

---
 names.go      |   4 +--
 names_test.go |  17 ++++++++++
 utils.go      | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 2 deletions(-)

diff --git a/names.go b/names.go
index 6491d35..ddd7b17 100644
--- a/names.go
+++ b/names.go
@@ -253,7 +253,7 @@ func (n Name) Format(s string) string {
 // the von part starts at the first non last lower-case taken
 func findVon(part NamePart) int {
 	for i, t := range part {
-		if t.isLower() && i < len(part)-1 {
+		if isLower(t.Text) && i < len(part)-1 {
 			return i
 		}
 	}
@@ -268,7 +268,7 @@ func splitVonLast(part NamePart) (NamePart, NamePart) {
 		return part[:0], part[:]
 	}
 	for i = len(part) - 2; i >= 0; i-- {
-		if part[i].isLower() {
+		if isLower(part[i].Text) {
 			i += 1
 			break
 		}
diff --git a/names_test.go b/names_test.go
index ad8ba05..2d74f1c 100644
--- a/names_test.go
+++ b/names_test.go
@@ -124,3 +124,20 @@ func TestIsLower(t *testing.T) {
 		}
 	}
 }
+
+// TestRemoveLatex checks removeLatexCommands on an input that mixes an accent
+// followed by a space (\' a), a ligature command (\OE), an accent applied to
+// a brace group (\c {cd}), an escaped character (\&) and an unknown command
+// (\emph), which is dropped while its argument is kept.
+func TestRemoveLatex(t *testing.T) {
+	cases := []struct{ in, want string }{
+		{"\\' a\\OE{\\c {cd}}\\&{\\emph{test}}", "áŒç&test"},
+	}
+	for _, c := range cases {
+		got := removeLatexCommands(c.in)
+		if got == c.want {
+			continue
+		}
+		t.Errorf("%q, expected: %q, actual: %q", c.in, c.want, got)
+	}
+}
diff --git a/utils.go b/utils.go
index e91ae44..3a1dd6a 100644
--- a/utils.go
+++ b/utils.go
@@ -4,8 +4,48 @@ import (
 	"bytes"
 	"strings"
 	"unicode"
+
+	"golang.org/x/text/unicode/norm"
 )
 
+var accents = map[string]string{ // LaTeX accent command name -> Unicode combining mark
+	"'":  "\u0301", // acute
+	"`":  "\u0300", // grave
+	"^":  "\u0302", // circumflex
+	"\"": "\u0308", // diaeresis
+	"~":  "\u0303", // tilde
+	"=":  "\u0304", // macron
+	".":  "\u0307", // dot above
+	"u":  "\u0306", // breve
+	"v":  "\u030C", // caron
+	"H":  "\u030B", // double acute
+	"t":  "\u0361", // double inverted breve (tie)
+	"c":  "\u0327", // cedilla
+	"d":  "\u0323", // dot below
+	"b":  "\u0331", // macron below
+}
+
+var commands = map[string]string{ // LaTeX command name -> replacement text
+	"aa": "å", // special letters and ligatures
+	"AA": "Å",
+	"oe": "œ",
+	"ae": "æ",
+	"OE": "Œ",
+	"AE": "Æ",
+	"ss": "ß",
+	"o":  "ø",
+	"O":  "Ø",
+	"L":  "Ł",
+	"l":  "ł",
+	"$":  "$", // backslash-escaped characters stand for themselves
+	"{":  "{",
+	"}":  "}",
+	"_":  "_",
+	"#":  "#",
+	"%":  "%",
+	"&":  "&",
+}
+
 // reader for LaTeX strings with methods to easily handle special chars, etc
 type reader struct {
 	*strings.Reader
@@ -61,6 +101,71 @@ func (r *reader) peek() rune {
 	return ch
 }
 
+func (r *reader) eatSpace() { // skips whitespace, stopping before the first non-space rune
+	for c := r.readRune(); c != eof; c = r.readRune() {
+		if !unicode.IsSpace(c) {
+			r.UnreadRune() // un-read the non-space rune so the next read sees it
+			break
+		}
+	}
+}
+
+// readLetter returns the rune an accent applies to: the next rune, or the first
+// non-brace rune of a brace group. It returns ' ' when there is no such rune.
+func (r *reader) readLetter() (d rune) {
+	c := r.readRune()
+	if c != '{' {
+		if c == eof {
+			return ' ' // end of input: never hand the eof sentinel to the caller
+		}
+		return c
+	}
+	for depth := 1; depth > 0; { // consume the group up to its closing brace
+		switch c = r.readRune(); c {
+		case eof:
+			depth = 0 // unterminated group: stop instead of recording eof
+		case '{':
+			depth++
+		case '}':
+			depth--
+		default:
+			if d == 0 {
+				d = c // keep only the first rune found in the group
+			}
+		}
+	}
+	if d == 0 {
+		return ' ' // empty group (or nothing before end of input)
+	}
+	return d
+}
+
+// removeLatexCommands turns LaTeX markup in s into NFC-normalized plain text.
+func removeLatexCommands(s string) string {
+	rd := newReader(s)
+	var out bytes.Buffer
+	for c := rd.readRune(); c != eof; c = rd.readRune() {
+		if c != '\\' {
+			if c != '{' && c != '}' { // braces are dropped from the output
+				out.WriteRune(c)
+			}
+			continue
+		}
+		name := rd.readCommand()
+		if name == "" { // no command name was read: use the next rune instead
+			name = string(rd.readRune())
+		}
+		if repl, ok := commands[name]; ok {
+			out.WriteString(repl)
+		} else if mark, ok := accents[name]; ok {
+			rd.eatSpace()
+			out.WriteRune(rd.readLetter())
+			out.WriteString(mark) // combining mark goes after its base letter
+		} // commands found in neither table produce no output
+	}
+	return string(norm.NFC.Bytes(out.Bytes()))
+}
+
 // length of a LaTeX string. A special char counts as one, braces count as one
 // to be consitent with the original BibTeX implementation, but this seems
 // stupid
-- 
cgit v1.2.3-70-g09d2