diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2016-02-28 11:25:23 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2016-02-28 11:25:23 -0500 |
| commit | 3f442a5cfa42488efd2d070fc1e0e2cd92289f81 (patch) | |
| tree | 0e79e497ba11f764ffe3e1048cbf73fd3f15349d /utils.go | |
| parent | c4060f7d80fc3d24b9f5f31552a969d41b6b7b00 (diff) | |
| download | bibtex-3f442a5cfa42488efd2d070fc1e0e2cd92289f81.tar.gz | |
Latex sanitizing
Diffstat (limited to 'utils.go')
| -rw-r--r-- | utils.go | 105 |
1 files changed, 105 insertions, 0 deletions
@@ -4,8 +4,48 @@ import ( "bytes" "strings" "unicode" + + "golang.org/x/text/unicode/norm" ) +var accents = map[string]string{ + "'": "\u0301", + "`": "\u0300", + "^": "\u0302", + "\"": "\u0308", + "~": "\u0303", + "=": "\u0304", + ".": "\u0307", + "u": "\u0306", + "v": "\u030C", + "H": "\u030B", + "t": "\u0361", + "c": "\u0327", + "d": "\u0323", + "b": "\u0331", +} + +var commands = map[string]string{ + "aa": "å", + "AA": "Å", + "oe": "œ", + "ae": "æ", + "OE": "Œ", + "AE": "Æ", + "ss": "ß", + "o": "ø", + "O": "Ø", + "L": "Ł", + "l": "ł", + "$": "$", + "{": "{", + "}": "}", + "_": "_", + "#": "#", + "%": "%", + "&": "&", +} + // reader for LaTeX strings with methods to easily handle special chars, etc type reader struct { *strings.Reader @@ -61,6 +101,71 @@ func (r *reader) peek() rune { return ch } +func (r *reader) eatSpace() { + for c := r.readRune(); c != eof; c = r.readRune() { + if !unicode.IsSpace(c) { + r.UnreadRune() + break + } + } +} + +func (r *reader) readLetter() (d rune) { + c := r.readRune() + switch c { + case '{': + blevel := 1 + for c != eof { + c = r.readRune() + if c == '{' { + blevel += 1 + } else if c == '}' { + blevel -= 1 + if blevel == 0 { + break + } + } else { + if d == rune(0) { + d = c + } + } + } + default: + d = c + return + } + if d == rune(0) { + d = ' ' + } + return +} + +func removeLatexCommands(s string) string { + r := newReader(s) + var buf bytes.Buffer + for c := r.readRune(); c != eof; c = r.readRune() { + switch c { + case '\\': + command := r.readCommand() + if command == "" { + command = string(r.readRune()) + } + if v, in := commands[command]; in { + buf.WriteString(v) + } else if v, in := accents[command]; in { + r.eatSpace() + l := r.readLetter() + buf.WriteRune(l) + buf.WriteString(v) + } + case '{', '}': + default: + buf.WriteRune(c) + } + } + return string(norm.NFC.Bytes(buf.Bytes())) +} + // length of a LaTeX string. 
A special char counts as one, braces count as one // to be consistent with the original BibTeX implementation, but this seems // stupid |
