utils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118

import re
import os

# The 256 characters with code points 0..255, in order (an identity
# translation table).
ALLCHARS = "".join(chr(code) for code in range(256))

# --- Regular expressions used to recognise TeX markup ---------------------

# "&" followed by one character that is not a lowercase letter or digit.
RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
# "\i" followed by one character that is not a lowercase letter or digit.
RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
# A backslash, one accent symbol, then either a single non-"{" character
# or a "{x}" group.
RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
# A backslash-prefixed ligature name followed by one character that is not
# a lowercase letter or digit.
RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
# Any TeX control sequence: backslash + letters/"@", or backslash + one char.
RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
# Two digits separated by a double hyphen, as in "10--20".
RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")

# --- Lookup tables --------------------------------------------------------

# Accent symbol -> suffix of the corresponding HTML entity name.
ACCENT_MAP = {
    "'": 'acute',
    "`": 'grave',
    "~": 'tilde',
    "^": 'circ',
    '"': 'uml',
    "c": 'cedil',
}

# Replacements for composed entity names, consulted by _unaccent().
UNICODE_MAP = {'&nacute;': '&#x0144;'}

# TeX ligature name -> HTML entity.
HTML_LIGATURE_MAP = {
    'AE': '&AElig;',
    'ae': '&aelig;',
    'OE': '&OElig;',
    'oe': '&oelig;',
    'AA': '&Aring;',
    'aa': '&aring;',
    'O': '&Oslash;',
    'o': '&oslash;',
    'ss': '&szlig;',
}

def url_untranslate(s):
    """Rewrite the BibTeX key `s` into a form usable inside a URL.

    Each of the characters ``% < > ` # , & _ ' ;`` and the space is
    replaced by an underscore followed by its two-digit lowercase hex
    code; afterwards every "/" is replaced by ":".
    """
    def _hex_escape(match):
        # "_" plus the character's code as two hex digits, e.g. " " -> "_20".
        return "_%02x" % ord(match.group(1))

    escaped = re.sub(r'([%<>`#, &_\';])', _hex_escape, s)
    return escaped.replace("/", ":")

def txtize(s):
    """Turn a TeX string into decent plaintext.

    Steps, in order:
      1. a lone "\\i" (RE_LONE_I) becomes a plain "i";
      2. an accent command (RE_ACCENT) is replaced by the character it
         was applied to, dropping the accent;
      3. a ligature command (RE_LIGATURE) is replaced by its own name
         (for example "\\ss" becomes "ss");
      4. every remaining TeX control sequence (RE_TEX_CMD) is removed;
      5. all "{" and "}" characters are removed.

    Returns the resulting string.
    """
    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
    s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
    s = RE_LIGATURE.sub(lambda m: "%s%s" % m.groups(), s)
    s = RE_TEX_CMD.sub("", s)
    # Strip braces with replace(): the two-argument translate(table,
    # deletechars) form exists only on Python 2 byte strings and raises
    # TypeError for unicode input and on Python 3.
    s = s.replace("{", "").replace("}", "")
    return s

def unTeXescapeURL(s):
    """Turn a URL as formatted in TeX into a real URL.

    Replaces "\\_" with "_", and removes every "\\-", every "\\{}" and
    every remaining "{}".  Returns the cleaned string.
    """
    s = s.replace("\\_", "_")
    s = s.replace("\\-", "")
    # Backslash written as "\\": "\{" is not a valid escape sequence and
    # triggers a warning on current Python versions.
    s = s.replace("\\{}", "")
    s = s.replace("{}", "")
    return s

def TeXescapeURL(s):
    """Escape a URL for use in TeX.

    Every "_" becomes "\\_" and every "~" becomes "\\{}~".  Returns the
    escaped string.
    """
    s = s.replace("_", "\\_")
    # Backslash written as "\\": "\{" is not a valid escape sequence and
    # triggers a warning on current Python versions.
    s = s.replace("~", "\\{}~")
    return s

def _unaccent(m):
    """Substitution callback: convert one TeX accent match to an HTML entity.

    `m` is a match with two groups: the accent symbol (a key of
    ACCENT_MAP) and the accented character, either bare or wrapped in
    braces as "{x}".  The entity "&<char><accent-name>;" is built and
    returned, unless UNICODE_MAP holds a replacement for it, in which
    case that replacement is returned.
    """
    accent, letter = m.groups()
    # For the braced form "{x}" only the character inside the braces is used.
    bare = letter[1] if letter[0] == '{' else letter
    entity = "&%s%s;" % (bare, ACCENT_MAP[accent])
    return UNICODE_MAP.get(entity, entity)

def _unlig_html(m):
    """Substitution callback: replace a TeX ligature match by its HTML entity.

    Group 1 of `m` is the ligature name (a key of HTML_LIGATURE_MAP);
    group 2 is the character that followed it, which is kept after the
    entity.
    """
    ligature, trailing = m.group(1), m.group(2)
    entity = HTML_LIGATURE_MAP[ligature]
    return "%s%s" % (entity, trailing)

def htmlize(s):
    """Turn a TeX string into good-looking HTML.

    Steps, in order:
      1. "&" followed by a character matched by RE_LONE_AMP becomes
         "&amp;";
      2. a lone "\\i" (RE_LONE_I) becomes a plain "i";
      3. accent commands (RE_ACCENT) become HTML entities via _unaccent;
      4. TeX URL escapes are undone via unTeXescapeURL;
      5. ligature commands (RE_LIGATURE) become HTML entities via
         _unlig_html;
      6. every remaining TeX control sequence (RE_TEX_CMD) is removed;
      7. all "{" and "}" characters are removed;
      8. "--" between two digits becomes a single "-";
      9. "---" becomes "&mdash;" and any remaining "--" becomes "&ndash;".

    Returns the resulting HTML string.
    """
    s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
    s = RE_ACCENT.sub(_unaccent, s)
    s = unTeXescapeURL(s)
    s = RE_LIGATURE.sub(_unlig_html, s)
    s = RE_TEX_CMD.sub("", s)
    # Strip braces with replace(): the two-argument translate(table,
    # deletechars) form exists only on Python 2 byte strings and raises
    # TypeError for unicode input and on Python 3.
    s = s.replace("{", "").replace("}", "")
    s = RE_PAGE_SPAN.sub(lambda m: "%s-%s" % (m.groups()), s)
    # "---" must be handled before "--" so an em dash is not half-consumed.
    s = s.replace("---", "&mdash;")
    s = s.replace("--", "&ndash;")
    return s

def smartJoin(*lst):
    """Equivalent to os.path.join, but handle "." and ".." entries a bit better.

    Every entry equal to "." is discarded, and every entry equal to ".."
    is discarded unless it is the first remaining entry.  The surviving
    entries are passed to os.path.join.

    If no entry survives (no arguments, or only "." entries), "." is
    returned, because os.path.join cannot be called without arguments.
    """
    parts = [item for item in lst if item != "."]
    # NOTE(review): a non-leading ".." is dropped without removing the
    # entry before it, so ("a", "..", "b") joins to "a/b".  Kept as-is
    # because callers may rely on it -- confirm the intended semantics.
    parts = parts[:1] + [item for item in parts[1:] if item != ".."]
    if not parts:
        return "."
    return os.path.join(*parts)

def _split(s,w=79,indent=8):
    """Re-wrap `s` into lines and return them joined by newlines.

    Every run of whitespace in `s` is first collapsed to a single space.
    While the remaining text is longer than `w`, a line is broken at the
    last space found between index w-1 and index 21 (searching
    backwards); if there is no space in that range, all the remaining
    text is emitted as one line.  After the first line has been emitted,
    `w` is reduced by `indent` and every later line is prefixed with
    `indent` spaces.

    The result ends with a newline whenever any text was emitted; an
    empty (or whitespace-free empty) input yields "".
    """
    lines = []
    s = re.sub(r"\s+", " ", s)
    first = True
    indentation = ""
    while len(s) > w:
        # range() instead of xrange(): xrange does not exist on Python 3,
        # and the span searched here is small enough that it does not matter
        # on Python 2.
        for i in range(w-1, 20, -1):
            if s[i] == ' ':
                lines.append(indentation+s[:i])
                s = s[i+1:]
                break
        else:
            # No usable break point: give up wrapping and emit the rest.
            lines.append(indentation+s.strip())
            s = ""
        if first:
            # Continuation lines are indented, so they get less room.
            first = False
            w -= indent
            indentation = " "*indent
    if s:
        lines.append(indentation+s)
    # Trailing empty element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)