"""Regexp-based split and replace using the obsolete regex module.

This module is only for backward compatibility.  These operations
are now provided by the new regular expression module, "re".

sub(pat, repl, str):        replace first occurrence of pattern in string
gsub(pat, repl, str):       replace all occurrences of pattern in string
split(str, pat, maxsplit):  split string using pattern as delimiter
splitx(str, pat, maxsplit): split string using pattern as delimiter plus
                            return delimiters
capwords(str, pat):         capitalize words split using pattern
"""

import warnings

# Tell importers once, at import time, that this module is deprecated.
warnings.warn("the regsub module is deprecated; please use re.sub()",
              DeprecationWarning)

# Ignore further deprecation warnings about this module
# NOTE(review): presumably this also silences the warning raised when
# the obsolete regex module is imported just below -- confirm.
warnings.filterwarnings("ignore", "", DeprecationWarning, __name__)

import regex

# Public names exported by "from regsub import *".
__all__ = ["sub","gsub","split","splitx","capwords"]

# Replace first occurrence of pattern pat in string str by replacement
# repl.  If the pattern isn't found, the string is returned unchanged.
# The replacement may contain references \digit to subpatterns and
# escaped backslashes.  The pattern may be a string or an already
# compiled pattern.

def sub(pat, repl, str):
    """Return str with the first match of pat replaced by repl.

    pat  -- pattern string, or an already compiled pattern object
    repl -- replacement text; backslash-digit references are expanded
            by expand()
    str  -- the string to search

    The input string is returned unchanged when the pattern is not found.
    """
    # compile() is this module's caching helper, not the builtin.
    prog = compile(pat)
    # search() reports a negative value when there is no match.
    if prog.search(str) < 0:
        return str
    regs = prog.regs
    # regs[0] holds the (start, end) span of the whole match.
    a, b = regs[0]
    return str[:a] + expand(repl, regs, str) + str[b:]


# Replace all (non-overlapping) occurrences of pattern pat in string
# str by replacement repl.  The same rules as for sub() apply.
# Empty matches for the pattern are replaced only when not adjacent to
# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.

def gsub(pat, repl, str):
    """Return str with every non-overlapping match of pat replaced by repl.

    pat  -- pattern string, or an already compiled pattern object
    repl -- replacement text; backslash-digit references are expanded
            by expand()
    str  -- the string to search
    """
    # compile() is this module's caching helper, not the builtin.
    prog = compile(pat)
    # Collect the output fragments and join once at the end instead of
    # growing a string on every iteration.
    pieces = []
    start = 0
    first = 1
    while prog.search(str, start) >= 0:
        regs = prog.regs
        a, b = regs[0]
        if a == b == start and not first:
            # Empty match right where the previous match ended: skip it
            # and look for the next match one character further on.
            if start >= len(str) or prog.search(str, start+1) < 0:
                break
            regs = prog.regs
            a, b = regs[0]
        # Copy the unmatched text before the match, then the replacement.
        pieces.append(str[start:a])
        pieces.append(expand(repl, regs, str))
        start = b
        first = 0
    # Whatever follows the last match is copied verbatim.
    pieces.append(str[start:])
    return ''.join(pieces)


# Split string str in fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# split('abc', '') returns ['abc'].
# The optional 3rd argument sets the number of splits that are performed;
# the default of 0 means no limit.

def split(str, pat, maxsplit = 0):
    """Return the list of fields of str separated by matches of pat.

    The delimiters themselves are not included in the result.
    """
    # Final argument 0: do not retain the delimiters.
    return intsplit(str, pat, maxsplit, 0)

# Split string str in fields separated by delimiters matching pattern
# pat.  Only non-empty matches for the pattern are considered, so e.g.
# splitx('abc', '') returns ['abc']. The delimiters are also included
# in the list.
# The optional 3rd argument sets the number of splits that are performed;
# the default of 0 means no limit.


def splitx(str, pat, maxsplit = 0):
    """Return the fields of str interleaved with the delimiters found.

    The result alternates field, delimiter, field, ... and always ends
    with a field.
    """
    # Final argument 1: retain the delimiters in the result.
    return intsplit(str, pat, maxsplit, 1)

# Internal function used to implement split() and splitx().

def intsplit(str, pat, maxsplit, retain):
    """Split str on non-empty matches of pat and return the list of pieces.

    str      -- the string to split
    pat      -- pattern string, or an already compiled pattern object
    maxsplit -- maximum number of splits to perform; 0 means no limit
    retain   -- when true, each matched delimiter is appended to the
                result right after the field that precedes it
    """
    # compile() is this module's caching helper, not the builtin.
    prog = compile(pat)
    res = []
    # start: beginning of the field currently being collected.
    # pos:   position from which the next search is made.
    start = pos = 0
    splitcount = 0
    while prog.search(str, pos) >= 0:
        regs = prog.regs
        a, b = regs[0]
        if a == b:
            # Empty matches never split; step forward one character
            # and search again.
            pos = pos + 1
            if pos >= len(str):
                break
        else:
            res.append(str[start:a])
            if retain:
                res.append(str[a:b])
            start = pos = b
            splitcount = splitcount + 1
            if (maxsplit and (splitcount >= maxsplit)):
                break
    # The text after the last delimiter is always the final field.
    res.append(str[start:])
    return res


# Capitalize words split using a pattern

def capwords(str, pat='[^a-zA-Z0-9_]+'):
    """Return str with each word capitalized.

    Words are the fields found between matches of pat; the delimiters
    are copied to the result unchanged.
    """
    words = splitx(str, pat)
    # splitx() alternates field, delimiter, field, ... so the words are
    # at the even indices and the delimiters at the odd ones.
    for i in range(0, len(words), 2):
        words[i] = words[i].capitalize()
    return "".join(words)


# Internal subroutines:
# compile(pat): compile a pattern, caching already compiled patterns
# expand(repl, regs, str): expand \digit escapes in replacement string


# Manage a cache of compiled regular expressions.
#
# If the pattern is a string a compiled version of it is returned.  If
# the pattern has been used before we return an already compiled
# version from the cache; otherwise we compile it now and save the
# compiled version in the cache, along with the syntax it was compiled
# with.  Instead of a string, a compiled regular expression can also
# be passed.

# Maps (pattern string, regex syntax) -> compiled pattern object.
cache = {}

def compile(pat):
    """Return a compiled pattern for pat, using the module-level cache.

    A non-string argument is assumed to be compiled already and is
    returned as is.  Note that this function shadows the builtin
    compile() inside this module.
    """
    if not isinstance(pat, type('')):
        return pat              # Assume it is a compiled regex
    # The syntax is part of the key, so one pattern string compiled
    # under different syntaxes gets separate cache entries.
    key = (pat, regex.get_syntax())
    try:
        return cache[key]       # Get it from the cache
    except KeyError:
        prog = cache[key] = regex.compile(pat)
        return prog


def clear_cache():
    """Discard all cached compiled patterns."""
    global cache
    # Rebind the module-level name to a fresh, empty dictionary.
    cache = {}


# Expand \digit in the replacement.
# Each occurrence of \digit is replaced by the substring of str
# indicated by regs[digit].  To include a literal \ in the
# replacement, double it; other \ escapes are left unchanged (i.e.
# the \ and the following character are both copied).

def expand(repl, regs, str):
    """Return repl with its backslash-digit references filled in.

    repl -- replacement template
    regs -- sequence of (start, end) index pairs, indexed by group
            number
    str  -- the string the index pairs in regs refer to

    A reference to a group number that regs does not contain raises
    IndexError.  A single backslash at the very end of repl is copied
    as is.
    """
    # Fast path: nothing to expand.
    if '\\' not in repl:
        return repl
    # Collect fragments and join once rather than growing a string.
    pieces = []
    i = 0
    n = len(repl)
    while i < n:
        c = repl[i]
        i = i + 1
        if c != '\\' or i >= n:
            # Ordinary character, or a backslash with nothing after it.
            pieces.append(c)
            continue
        # c was a backslash; look at the character it escapes.
        c = repl[i]
        i = i + 1
        if '0' <= c <= '9':
            a, b = regs[int(c)]
            pieces.append(str[a:b])
        elif c == '\\':
            # A doubled backslash stands for one literal backslash.
            pieces.append(c)
        else:
            # Unknown escape: keep the backslash and the character.
            pieces.append('\\' + c)
    return ''.join(pieces)


# Test program, reads sequences "pat repl str" from stdin.
# Optional argument specifies pattern used to split lines.

def test():
    """Interactive driver: read "pat repl str" lines and show the results.

    Each input line is split into three fields using the pattern given
    as the first command line argument (default: runs of blanks, tabs
    and newlines).  The results of sub() and gsub() for those fields
    are printed; a line that does not yield exactly three fields is
    reported and skipped.  The loop ends at end of file.
    """
    import sys
    if sys.argv[1:]:
        delpat = sys.argv[1]
    else:
        delpat = '[ \t\n]+'
    while 1:
        # Only prompt when input comes from a terminal.
        if sys.stdin.isatty():
            sys.stderr.write('--> ')
        line = sys.stdin.readline()
        if not line:
            break
        if line[-1] == '\n':
            line = line[:-1]
        fields = split(line, delpat)
        if len(fields) != 3:
            print('Sorry, not three fields')
            print('split: %r' % (fields,))
            continue
        # Reuse the fields computed above instead of splitting again.
        pat, repl, text = fields
        print('sub : %r' % (sub(pat, repl, text),))
        print('gsub: %r' % (gsub(pat, repl, text),))