"""A collection of string operations (most are no longer used).

Warning: most of the code you see here isn't normally used nowadays.
Beginning with Python 1.6, many of these functions are implemented as
methods on the standard string object. They used to be implemented by
a built-in module called strop, but strop is now obsolete itself.

Public module variables:

whitespace -- a string containing all characters considered whitespace
lowercase -- a string containing all characters considered lowercase letters
uppercase -- a string containing all characters considered uppercase letters
letters -- a string containing all characters considered letters
digits -- a string containing all characters considered decimal digits
hexdigits -- a string containing all characters considered hexadecimal digits
octdigits -- a string containing all characters considered octal digits
punctuation -- a string containing all characters considered punctuation
printable -- a string containing all characters considered printable

"""

Guido van Rossum's avatar
Guido van Rossum committed
22
# Some strings for ctype-style character classification
whitespace = ' \t\n\r\v\f'
lowercase = 'abcdefghijklmnopqrstuvwxyz'
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
letters = lowercase + uppercase
# The ascii_* names are bound to the values assigned just above.
ascii_lowercase = lowercase
ascii_uppercase = uppercase
ascii_letters = ascii_lowercase + ascii_uppercase
digits = '0123456789'
hexdigits = digits + 'abcdef' + 'ABCDEF'
octdigits = '01234567'
# The backslash is spelled as an explicit escape; the resulting string is
# the same 32 characters as before.
punctuation = """!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
printable = digits + letters + punctuation + whitespace
Guido van Rossum's avatar
Guido van Rossum committed
35 36

# Case conversion helpers
37 38 39 40
# Use str to convert Unicode literal in case of -U
l = map(chr, xrange(256))
_idmap = str('').join(l)
del l
Guido van Rossum's avatar
Guido van Rossum committed
41

42 43 44 45
# Functions which aren't available as string methods.

# Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
def capwords(s, sep=None):
    """capwords(s [,sep]) -> string

    Split the argument into words using split, capitalize each
    word using capitalize, and join the capitalized words using
    join.  If the optional second argument sep is absent or None,
    runs of whitespace characters are replaced by a single space
    and leading and trailing whitespace are removed, otherwise
    sep is used to split and join the words.

    """
    # A false sep (None) makes split() break on whitespace runs; the
    # pieces are then re-joined with a single space.
    return (sep or ' ').join(x.capitalize() for x in s.split(sep))
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72


# Construct a translation string
_idmapL = None
def maketrans(fromstr, tostr):
    """maketrans(frm, to) -> string

    Return a translation table (a string of 256 bytes long)
    suitable for use in string.translate.  The strings frm and to
    must be of the same length.

    Raises ValueError if the two arguments differ in length.

    """
    if len(fromstr) != len(tostr):
        raise ValueError("maketrans arguments must have same length")
    global _idmapL
    if not _idmapL:
        # Build the list form of the identity table once and cache it;
        # every call then works on its own copy.
        _idmapL = list(_idmap)
    L = _idmapL[:]
    # Overwrite the slot of each source character with its replacement.
    for src, dst in zip(fromstr, tostr):
        L[ord(src)] = dst
    return ''.join(L)


81

82
####################################################################
83 84
import re as _re

85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
class _multimap:
    """Helper class for combining multiple mappings.

    Used by .{safe_,}substitute() to combine the mapping and keyword
    arguments.
    """
    def __init__(self, primary, secondary):
        self._primary = primary
        self._secondary = secondary

    def __getitem__(self, key):
        try:
            return self._primary[key]
        except KeyError:
            return self._secondary[key]


102 103
class _TemplateMetaclass(type):
    pattern = r"""
104 105 106 107 108 109
    %(delim)s(?:
      (?P<escaped>%(delim)s) |   # Escape sequence of two delimiters
      (?P<named>%(id)s)      |   # delimiter and a Python identifier
      {(?P<braced>%(id)s)}   |   # delimiter and a braced identifier
      (?P<invalid>)              # Other ill-formed delimiter exprs
    )
110 111 112
    """

    def __init__(cls, name, bases, dct):
113
        super(_TemplateMetaclass, cls).__init__(name, bases, dct)
114 115 116 117
        if 'pattern' in dct:
            pattern = cls.pattern
        else:
            pattern = _TemplateMetaclass.pattern % {
118
                'delim' : _re.escape(cls.delimiter),
119 120 121 122 123 124
                'id'    : cls.idpattern,
                }
        cls.pattern = _re.compile(pattern, _re.IGNORECASE | _re.VERBOSE)


class Template:
    """A string class for supporting $-substitutions."""
    __metaclass__ = _TemplateMetaclass

    delimiter = '$'
    idpattern = r'[_a-z][_a-z0-9]*'

    def __init__(self, template):
        """Store the template text that later substitutions operate on."""
        self.template = template

    # Search for $$, $identifier, ${identifier}, and any bare $'s

    def _invalid(self, mo):
        """Raise ValueError giving the line and column of a bad placeholder.

        mo is a match object whose 'invalid' group marks the offending
        position in self.template.
        """
        i = mo.start('invalid')
        # Split everything before the bad position into lines, keeping the
        # line endings so that their lengths add up to real offsets.
        lines = self.template[:i].splitlines(True)
        if not lines:
            colno = 1
            lineno = 1
        else:
            colno = i - len(''.join(lines[:-1]))
            lineno = len(lines)
        raise ValueError('Invalid placeholder in string: line %d, col %d' %
                         (lineno, colno))

    def substitute(self, *args, **kws):
        """Return the template text with all placeholders substituted.

        At most one positional argument (a mapping) may be given; keyword
        arguments are consulted before that mapping.  Raises TypeError for
        more than one positional argument and ValueError for an ill-formed
        placeholder.  A lookup failure in the mapping is not caught here.
        """
        if len(args) > 1:
            raise TypeError('Too many positional arguments')
        if not args:
            mapping = kws
        elif kws:
            mapping = _multimap(kws, args[0])
        else:
            mapping = args[0]
        # Helper function for .sub()
        def convert(mo):
            # Check the most common path first.
            named = mo.group('named') or mo.group('braced')
            if named is not None:
                val = mapping[named]
                # We use this idiom instead of str() because the latter will
                # fail if val is a Unicode containing non-ASCII characters.
                return '%s' % (val,)
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                self._invalid(mo)
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)

    def safe_substitute(self, *args, **kws):
        """Like substitute(), but leave unresolved placeholders in place.

        A placeholder whose lookup raises KeyError is emitted unchanged,
        and an ill-formed placeholder yields the bare delimiter instead of
        raising.  Raises TypeError for more than one positional argument.
        """
        if len(args) > 1:
            raise TypeError('Too many positional arguments')
        if not args:
            mapping = kws
        elif kws:
            mapping = _multimap(kws, args[0])
        else:
            mapping = args[0]
        # Helper function for .sub()
        def convert(mo):
            named = mo.group('named')
            if named is not None:
                try:
                    # We use this idiom instead of str() because the latter
                    # will fail if val is a Unicode containing non-ASCII
                    return '%s' % (mapping[named],)
                except KeyError:
                    return self.delimiter + named
            braced = mo.group('braced')
            if braced is not None:
                try:
                    return '%s' % (mapping[braced],)
                except KeyError:
                    return self.delimiter + '{' + braced + '}'
            if mo.group('escaped') is not None:
                return self.delimiter
            if mo.group('invalid') is not None:
                return self.delimiter
            raise ValueError('Unrecognized named group in pattern',
                             self.pattern)
        return self.pattern.sub(convert, self.template)
206 207


208

209
####################################################################
210 211 212
# NOTE: Everything below here is deprecated.  Use string methods instead.
# This stuff will go away in Python 3.0.

213 214 215 216 217 218
# Backward compatible names for exceptions: every legacy name is simply
# another binding of ValueError.
index_error = atoi_error = atof_error = atol_error = ValueError

Guido van Rossum's avatar
Guido van Rossum committed
219 220
# Convert UPPER CASE letters to lower case
def lower(s):
    """lower(s) -> string

    Return a copy of the string s converted to lowercase.

    """
    return s.lower()
Guido van Rossum's avatar
Guido van Rossum committed
227 228 229

# Convert lower case letters to UPPER CASE
def upper(s):
    """upper(s) -> string

    Return a copy of the string s converted to uppercase.

    """
    return s.upper()
Guido van Rossum's avatar
Guido van Rossum committed
236 237 238

# Swap lower case letters and UPPER CASE
def swapcase(s):
    """swapcase(s) -> string

    Return a copy of the string s with upper case characters
    converted to lowercase and vice versa.

    """
    return s.swapcase()
Guido van Rossum's avatar
Guido van Rossum committed
246 247

# Strip leading and trailing whitespace (or the given characters)
def strip(s, chars=None):
    """strip(s [,chars]) -> string

    Return a copy of the string s with leading and trailing
    whitespace removed.
    If chars is given and not None, remove characters in chars instead.
    If chars is unicode, s will be converted to unicode before stripping.

    """
    return s.strip(chars)
Guido van Rossum's avatar
Guido van Rossum committed
258

259
# Strip leading whitespace (or the given characters)
def lstrip(s, chars=None):
    """lstrip(s [,chars]) -> string

    Return a copy of the string s with leading whitespace removed.
    If chars is given and not None, remove characters in chars instead.

    """
    return s.lstrip(chars)
268 269

# Strip trailing whitespace (or the given characters)
def rstrip(s, chars=None):
    """rstrip(s [,chars]) -> string

    Return a copy of the string s with trailing whitespace removed.
    If chars is given and not None, remove characters in chars instead.

    """
    return s.rstrip(chars)
278 279


Guido van Rossum's avatar
Guido van Rossum committed
280
# Split a string into a list of words (whitespace-separated by default)
def split(s, sep=None, maxsplit=-1):
    """split(s [,sep [,maxsplit]]) -> list of strings

    Return a list of the words in the string s, using sep as the
    delimiter string.  If maxsplit is given, splits at no more than
    maxsplit places (resulting in at most maxsplit+1 words).  If sep
    is not specified or is None, any whitespace string is a separator.

    (split and splitfields are synonymous)

    """
    return s.split(sep, maxsplit)
splitfields = split
294

295 296 297 298 299 300 301 302 303 304 305 306
# Split a string into a list of words, scanning from the right
def rsplit(s, sep=None, maxsplit=-1):
    """rsplit(s [,sep [,maxsplit]]) -> list of strings

    Split s on the delimiter string sep, working from the end of the
    string toward the front, and return the resulting list of words.
    When maxsplit is given, at most maxsplit splits are performed.
    When sep is omitted or None, any whitespace string is a separator.
    """
    words = s.rsplit(sep, maxsplit)
    return words

307
# Join fields with optional separator
def join(words, sep = ' '):
    """join(list [,sep]) -> string

    Return a string composed of the words in list, with
    intervening occurrences of sep.  The default separator is a
    single space.

    (joinfields and join are synonymous)

    """
    return sep.join(words)
joinfields = join
320

321 322 323
# Find substring, raise exception if not found
def index(s, *args):
    """index(s, sub [,start [,end]]) -> int

    Like find but raises ValueError when the substring is not found.

    """
    return s.index(*args)
329

330
# Find last substring, raise exception if not found
def rindex(s, *args):
    """rindex(s, sub [,start [,end]]) -> int

    Like rfind but raises ValueError when the substring is not found.

    """
    return s.rindex(*args)
338 339

# Count non-overlapping occurrences of substring
def count(s, *args):
    """count(s, sub[, start[,end]]) -> int

    Return the number of occurrences of substring sub in string
    s[start:end].  Optional arguments start and end are
    interpreted as in slice notation.

    """
    return s.count(*args)
349

350
# Find substring, return -1 if not found
def find(s, *args):
    """find(s, sub [,start [,end]]) -> int

    Return the lowest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Return -1 on failure.

    """
    return s.find(*args)
Guido van Rossum's avatar
Guido van Rossum committed
362

363
# Find last substring, return -1 if not found
def rfind(s, *args):
    """rfind(s, sub [,start [,end]]) -> int

    Return the highest index in s where substring sub is found,
    such that sub is contained within s[start:end].  Optional
    arguments start and end are interpreted as in slice notation.

    Return -1 on failure.

    """
    return s.rfind(*args)
375 376 377 378 379

# for a bit of speed
_float = float
_int = int
_long = long
380

381
# Convert string to float
def atof(s):
    """atof(s) -> float

    Return the floating point number represented by the string s.

    """
    return _float(s)

390

Guido van Rossum's avatar
Guido van Rossum committed
391
# Convert string to integer
def atoi(s , base=10):
    """atoi(s [,base]) -> int

    Return the integer represented by the string s in the given
    base, which defaults to 10.  The string s must consist of one
    or more digits, possibly preceded by a sign.  If base is 0, it
    is chosen from the leading characters of s, 0 for octal, 0x or
    0X for hexadecimal.  If base is 16, a preceding 0x or 0X is
    accepted.

    """
    return _int(s, base)
404

Guido van Rossum's avatar
Guido van Rossum committed
405

406
# Convert string to long integer
def atol(s, base=10):
    """atol(s [,base]) -> long

    Return the long integer represented by the string s in the
    given base, which defaults to 10.  The string s must consist
    of one or more digits, possibly preceded by a sign.  If base
    is 0, it is chosen from the leading characters of s, 0 for
    octal, 0x or 0X for hexadecimal.  If base is 16, a preceding
    0x or 0X is accepted.  A trailing L or l is not accepted,
    unless base is 0.

    """
    return _long(s, base)
420

421

Guido van Rossum's avatar
Guido van Rossum committed
422
# Left-justify a string
def ljust(s, width, *args):
    """ljust(s, width[, fillchar]) -> string

    Return a left-justified version of s, in a field of the
    specified width, padded with spaces as needed.  The string is
    never truncated.  If specified the fillchar is used instead of spaces.

    """
    return s.ljust(width, *args)
Guido van Rossum's avatar
Guido van Rossum committed
432 433

# Right-justify a string
def rjust(s, width, *args):
    """rjust(s, width[, fillchar]) -> string

    Return a right-justified version of s, in a field of the
    specified width, padded with spaces as needed.  The string is
    never truncated.  If specified the fillchar is used instead of spaces.

    """
    return s.rjust(width, *args)
Guido van Rossum's avatar
Guido van Rossum committed
443 444

# Center a string
def center(s, width, *args):
    """center(s, width[, fillchar]) -> string

    Return a centered version of s, in a field of the specified
    width, padded with spaces as needed.  The string is never
    truncated.  If specified the fillchar is used instead of spaces.

    """
    return s.center(width, *args)
Guido van Rossum's avatar
Guido van Rossum committed
454 455 456 457 458

# Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
# Decadent feature: the argument may be a string or a number
# (Use of this is deprecated; it should be a string as with ljust c.s.)
def zfill(x, width):
459
    """zfill(x, width) -> string
460

461 462
    Pad a numeric string x with zeros on the left, to fill a field
    of the specified width.  The string x is never truncated.
463

464
    """
465
    if not isinstance(x, basestring):
466 467
        x = repr(x)
    return x.zfill(width)
468 469 470

# Expand tabs in a string.
# Doesn't take non-printing chars into account, but does understand \n.
def expandtabs(s, tabsize=8):
    """expandtabs(s [,tabsize]) -> string

    Return a copy of the string s with all tab characters replaced
    by the appropriate number of spaces, depending on the current
    column, and the tabsize (default 8).

    """
    return s.expandtabs(tabsize)
480

481
# Character translation through look-up table.
def translate(s, table, deletions=""):
    """translate(s,table [,deletions]) -> string

    Return a copy of the string s, where all characters occurring
    in the optional argument deletions are removed, and the
    remaining characters have been mapped through the given
    translation table, which must be a string of length 256.  The
    deletions argument is not allowed for Unicode strings.

    """
    if deletions or table is None:
        return s.translate(table, deletions)
    else:
        # Add s[:0] so that if s is Unicode and table is an 8-bit string,
        # table is converted to Unicode.  This means that table *cannot*
        # be a dictionary -- for that feature, use u.translate() directly.
        return s.translate(table + s[:0])
499

500 501
# Capitalize a string, e.g. "aBc  dEf" -> "Abc  def".
def capitalize(s):
    """capitalize(s) -> string

    Return a copy of the string s with only its first character
    capitalized.

    """
    return s.capitalize()
509

510
# Substring replacement (global)
def replace(s, old, new, maxreplace=-1):
    """replace(s, old, new[, maxreplace]) -> string

    Return a copy of string s with all occurrences of substring
    old replaced by new. If the optional argument maxreplace is
    given, only the first maxreplace occurrences are replaced.

    """
    return s.replace(old, new, maxreplace)
520 521


522 523
# Try importing optional built-in module "strop" -- if it exists,
# it redefines some string operations that are 100-1000 times faster.
# It also defines values for whitespace, lowercase and uppercase
# that match <ctype.h>'s definitions.

try:
    from strop import maketrans, lowercase, uppercase, whitespace
    # Keep letters in step with the (possibly different) imported values.
    letters = lowercase + uppercase
except ImportError:
    pass                                          # Use the original versions
532 533 534 535 536

########################################################################
# the Formatter class
# see PEP 3101 for details and purpose of this class

Benjamin Peterson's avatar
Benjamin Peterson committed
537 538
# The hard parts are reused from the C implementation.  They're exposed as "_"
# prefixed methods of str and unicode.
539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609

# The overall parser is implemented in str._formatter_parser.
# The field name parser is implemented in str._formatter_field_name_split

class Formatter(object):
    """Python-level driver for format-string processing (see PEP 3101).

    Parsing is delegated to private methods of the format string itself
    (_formatter_parser and _formatter_field_name_split); the remaining
    steps are separate methods so that subclasses can override them.
    """

    def format(self, format_string, *args, **kwargs):
        """Format format_string with the given positional/keyword args."""
        return self.vformat(format_string, args, kwargs)

    def vformat(self, format_string, args, kwargs):
        """Like format(), but takes args and kwargs as explicit containers.

        After formatting, check_unused_args() is called with the set of
        argument keys that were actually referenced.
        """
        used_args = set()
        # 2 is the starting recursion budget handed to _vformat.
        result = self._vformat(format_string, args, kwargs, used_args, 2)
        self.check_unused_args(used_args, args, kwargs)
        return result

    def _vformat(self, format_string, args, kwargs, used_args, recursion_depth):
        """Expand format_string, recording referenced keys in used_args.

        Raises ValueError when recursion_depth drops below zero.
        """
        if recursion_depth < 0:
            raise ValueError('Max string recursion exceeded')
        result = []
        for literal_text, field_name, format_spec, conversion in \
                self.parse(format_string):

            # output the literal text
            if literal_text:
                result.append(literal_text)

            # if there's a field, output it
            if field_name is not None:
                # this is some markup, find the object and do
                #  the formatting

                # given the field_name, find the object it references
                #  and the argument it came from
                obj, arg_used = self.get_field(field_name, args, kwargs)
                used_args.add(arg_used)

                # do any conversion on the resulting object
                obj = self.convert_field(obj, conversion)

                # expand the format spec, if needed
                format_spec = self._vformat(format_spec, args, kwargs,
                                            used_args, recursion_depth-1)

                # format the object and append to the result
                result.append(self.format_field(obj, format_spec))

        return ''.join(result)


    def get_value(self, key, args, kwargs):
        """Return args[key] for an integer key, kwargs[key] otherwise."""
        if isinstance(key, (int, long)):
            return args[key]
        else:
            return kwargs[key]


    def check_unused_args(self, used_args, args, kwargs):
        """Hook called by vformat() after formatting; does nothing here."""
        pass


    def format_field(self, value, format_spec):
        """Format one value by calling the builtin format()."""
        return format(value, format_spec)


    def convert_field(self, value, conversion):
        """Apply a conversion specifier to value.

        'r' gives repr(value), 's' gives str(value) and None returns the
        value unchanged; anything else raises ValueError.
        """
        # do any conversion on the resulting object
        if conversion == 'r':
            return repr(value)
        elif conversion == 's':
            return str(value)
        elif conversion is None:
            return value
        raise ValueError("Unknown conversion specifier {0!s}".format(conversion))


    # returns an iterable that contains tuples of the form:
    # (literal_text, field_name, format_spec, conversion)
    # literal_text can be zero length
    # field_name can be None, in which case there's no
    #  object to format and output
    # if field_name is not None, it is looked up, formatted
    #  with format_spec and conversion and then used
    def parse(self, format_string):
        """Delegate parsing to format_string._formatter_parser()."""
        return format_string._formatter_parser()


    # given a field_name, find the object it references.
    #  field_name:   the field being looked up, e.g. "0.name"
    #                 or "lookup[3]"
    #  args, kwargs: as passed in to vformat
    def get_field(self, field_name, args, kwargs):
        """Resolve field_name; return (object, first component of the name)."""
        first, rest = field_name._formatter_field_name_split()

        obj = self.get_value(first, args, kwargs)

        # loop through the rest of the field_name, doing
        #  getattr or getitem as needed
        for is_attr, i in rest:
            if is_attr:
                obj = getattr(obj, i)
            else:
                obj = obj[i]

        return obj, first