package bibtex

import (
	"bytes"
	"strings"
	"unicode"

	"golang.org/x/text/unicode/norm"
)

// accents maps a LaTeX accent command (the text following the backslash) to
// the Unicode combining character that removeLatexCommands appends after the
// accented letter.
var accents = map[string]string{
	"'":  "\u0301", // combining acute accent
	"`":  "\u0300", // combining grave accent
	"^":  "\u0302", // combining circumflex accent
	"\"": "\u0308", // combining diaeresis
	"~":  "\u0303", // combining tilde
	"=":  "\u0304", // combining macron
	".":  "\u0307", // combining dot above
	"u":  "\u0306", // combining breve
	"v":  "\u030C", // combining caron
	"H":  "\u030B", // combining double acute accent
	"t":  "\u0361", // combining double inverted breve
	"c":  "\u0327", // combining cedilla
	"d":  "\u0323", // combining dot below
	"b":  "\u0331", // combining macron below
}

// commands maps a LaTeX command (the text following the backslash) to the
// literal text that replaces it: special letters and escaped symbols.
var commands = map[string]string{
	"aa": "å",
	"AA": "Å",
	"oe": "œ",
	"ae": "æ",
	"OE": "Œ",
	"AE": "Æ",
	"ss": "ß",
	"o":  "ø",
	"O":  "Ø",
	"L":  "Ł",
	"l":  "ł",
	"$":  "$",
	"{":  "{",
	"}":  "}",
	"_":  "_",
	"#":  "#",
	"%":  "%",
	"&":  "&",
}

// reader wraps a strings.Reader with helpers for scanning LaTeX strings:
// brace groups, commands and special characters.
type reader struct {
	*strings.Reader
}

// newReader returns a reader positioned at the start of s.
func newReader(s string) *reader {
	return &reader{Reader: strings.NewReader(s)}
}

// readRune consumes and returns the next rune. The error of the underlying
// ReadRune is deliberately dropped: when the read fails (end of input) the
// zero rune comes back, and callers detect the end of input by comparing the
// result against eof, which is declared elsewhere in the package.
func (r *reader) readRune() rune {
	c, _, _ := r.Reader.ReadRune()
	return c
}

// readBraces reads the remainder of the brace group the reader is currently
// inside (the opening '{' is assumed to be consumed already) and returns
// everything it read. The '}' that closes the group is consumed and is part of
// the returned string. At end of input the text read so far is returned.
func (r *reader) readBraces() string {
	var buf bytes.Buffer
	for depth := 1; depth > 0; {
		c := r.readRune()
		switch c {
		case '{':
			depth++
		case '}':
			depth--
		case eof:
			return buf.String()
		}
		buf.WriteRune(c)
	}
	return buf.String()
}

// skipBraces discards the remainder of the brace group the reader is currently
// inside, up to and including its closing '}', or up to end of input.
func (r *reader) skipBraces() {
	for depth := 1; depth > 0; {
		switch r.readRune() {
		case '{':
			depth++
		case '}':
			depth--
		case eof:
			return
		}
	}
}

// peek returns the next rune without consuming it.
func (r *reader) peek() rune {
	c := r.readRune()
	// At end of input nothing was read, so UnreadRune only reports an error
	// and leaves the position untouched; that error is safe to ignore.
	_ = r.UnreadRune()
	return c
}

// eatSpace consumes runes for as long as they are Unicode white space and
// leaves the reader on the first non-space rune (or at end of input).
func (r *reader) eatSpace() {
	for c := r.readRune(); c != eof; c = r.readRune() {
		if !unicode.IsSpace(c) {
			// Put the non-space rune back; it was just read, so this cannot fail.
			_ = r.UnreadRune()
			return
		}
	}
}

// readLetter reads the argument of an accent command and returns its base
// letter. If the next rune is not '{' it is returned as is (at end of input
// that is whatever readRune yields). If it is '{', the whole brace group is
// consumed and the first rune in it that is not a brace is returned; an empty
// or unterminated group yields a space.
func (r *reader) readLetter() rune {
	c := r.readRune()
	if c != '{' {
		return c
	}

	var letter rune
	depth := 1
	for c != eof {
		c = r.readRune()
		if c == '{' {
			depth++
		} else if c == '}' {
			depth--
			if depth == 0 {
				break
			}
		} else if letter == rune(0) {
			letter = c
		}
	}
	if letter == rune(0) {
		return ' '
	}
	return letter
}

// removeLatexCommands converts LaTeX markup in s to plain Unicode text and
// returns the result in NFC form.
//
// Commands listed in the commands table are replaced by their text. Accent
// commands listed in the accents table consume optional white space and one
// argument (a single rune or a brace group) and emit the base letter followed
// by the combining character. Any other command name is dropped. Braces
// outside of commands are removed; every other rune is copied unchanged.
//
// NOTE(review): white space following a non-accent control word (for example
// the space in `\ss e`) is kept in the output, unlike in TeX.
func removeLatexCommands(s string) string {
	r := newReader(s)
	var buf bytes.Buffer
	for c := r.readRune(); c != eof; c = r.readRune() {
		switch c {
		case '\\':
			command := r.readCommand()
			if command == "" {
				// Not a run of letters: the command is the single rune
				// following the backslash (e.g. \' or \$).
				command = string(r.readRune())
			}
			if v, ok := commands[command]; ok {
				buf.WriteString(v)
			} else if v, ok := accents[command]; ok {
				r.eatSpace()
				letter := r.readLetter()
				if letter == eof {
					// The accent was the last thing in the input. Use a
					// space as base, exactly like an empty group, instead
					// of writing the end-of-input marker to the output.
					letter = ' '
				}
				buf.WriteRune(letter)
				buf.WriteString(v)
			}
		case '{', '}':
			// Grouping braces do not appear in the output.
		default:
			buf.WriteRune(c)
		}
	}
	return string(norm.NFC.Bytes(buf.Bytes()))
}

// length returns the length of a LaTeX string. A special character (a
// top-level brace group whose first rune is a backslash) counts as one, and
// every other rune, braces included, counts as one. Counting braces is meant
// to be consistent with the original BibTeX implementation, but this seems
// stupid.
func length(s string) int {
	r := newReader(s)
	n := 0
	depth := 0
	for c := r.readRune(); c != eof; c = r.readRune() {
		switch {
		case c == '{' && depth == 0:
			depth++
			if r.peek() == '\\' {
				// Special character: swallow the whole group so that it is
				// counted once, by the increment below.
				r.skipBraces()
				depth = 0
			}
		case c == '{':
			depth++
		case c == '}':
			depth--
		}
		n++
	}
	return n
}

// searchBraces scans the remainder of the current brace group for the first
// cased letter. It returns (true, true) if that letter is lower case and
// (false, true) if it is upper case. If the group closes or the input ends
// before a cased letter is seen, it returns (false, false). Scanning stops at
// the letter that decided the result.
func (r *reader) searchBraces() (bool, bool) {
	for depth := 1; depth > 0; {
		c := r.readRune()
		switch {
		case c == '{':
			depth++
		case c == '}':
			depth--
		case unicode.IsLower(c):
			return true, true
		case unicode.IsUpper(c):
			return false, true
		case c == eof:
			return false, false
		}
	}
	return false, false
}

// readCommand reads the longest run of letters at the current position and
// returns it; the first non-letter rune is left unread. The result is empty
// when the next rune is not a letter or the input is exhausted.
func (r *reader) readCommand() string {
	var buf bytes.Buffer
	for c := r.readRune(); c != eof; c = r.readRune() {
		if !unicode.IsLetter(c) {
			// Put the terminating rune back; it was just read, so this cannot fail.
			_ = r.UnreadRune()
			break
		}
		buf.WriteRune(c)
	}
	return buf.String()
}

// isLower reports whether the first cased character of the LaTeX string s is
// lower case. Brace groups that do not start with a backslash are skipped
// entirely. For a group that starts with a backslash, the command name is
// looked up in lcommands (result true) and ucommands (result false), both
// declared elsewhere in the package; if it is in neither, the first cased
// letter in the rest of the group decides. If no cased character is found the
// result is false.
func isLower(s string) bool {
	r := newReader(s)
	for c := r.readRune(); c != eof; c = r.readRune() {
		switch {
		case c == '{':
			if r.readRune() != '\\' {
				// Ordinary group: give the rune back and ignore the group.
				_ = r.UnreadRune()
				r.skipBraces()
				continue
			}
			command := r.readCommand()
			if lcommands[command] {
				return true
			}
			if ucommands[command] {
				return false
			}
			if lower, found := r.searchBraces(); found {
				return lower
			}
		case unicode.IsLower(c):
			return true
		case unicode.IsUpper(c):
			return false
		}
	}
	return false
}