tabnanny.py 11.1 KB
Newer Older
1
#! /usr/bin/env python3

"""The Tab Nanny despises ambiguous indentation.  She knows no mercy.

tabnanny -- Detection of ambiguous indentation

For the time being this module is intended to be called as a script.
However it is possible to import it into an IDE and use the function
check() described below.

Warning: The API provided by this module is likely to change in future
releases; such changes may not be backward compatible.
"""
14

15
# Released to the public domain, by Tim Peters, 15 April 1998.
16

17 18 19 20
# XXX Note: this is now a standard library module.
# XXX The API needs to undergo changes however; the current code is too
# XXX script-like.  This will be addressed later.

Guido van Rossum's avatar
Guido van Rossum committed
21
__version__ = "6"
22 23 24 25 26

import os
import sys
import getopt
import tokenize
27 28
# process_tokens() in this module treats tokenize.NL as an ignorable token
# type, so refuse to load against a tokenize module that does not define it.
if not hasattr(tokenize, 'NL'):
    raise ValueError("tokenize.NL doesn't exist -- tokenize module too old")
29

30
__all__ = ["check", "NannyNag", "process_tokens"]
31

32
# Verbosity level: main() increments it once per -v option, and check()
# prints more detail when it is nonzero (and per-file progress when > 1).
verbose = 0
33
# Nonzero when check() should report only the names of offending files;
# main() increments it once per -q option.
filename_only = 0
34

35 36 37 38 39 40 41
def errprint(*args):
    """Write the arguments to standard error, separated by single spaces.

    Each argument is converted with str(); a newline is written after the
    last one (or on its own when no arguments are given).
    """
    emit = sys.stderr.write
    for position, item in enumerate(args):
        text = str(item)
        # every item except the first is preceded by one space
        emit(" " + text if position else text)
    emit("\n")

42
def main():
    """Command-line entry point.

    Parses sys.argv[1:] with getopt, accepting the flags -q and -v.  Each
    -q increments the module-level filename_only counter and each -v
    increments the module-level verbose counter.  Every remaining argument
    is passed to check().

    On an option-parsing error the getopt message is written to standard
    error; when no file or directory argument is given a usage line is
    written to standard error.  In both cases the function returns without
    checking anything.
    """
    global verbose, filename_only
    try:
        opts, args = getopt.getopt(sys.argv[1:], "qv")
    except getopt.error as msg:
        errprint(msg)
        return
    # neither flag takes a value, so only the option name matters
    for opt, _value in opts:
        if opt == '-q':
            filename_only += 1
        if opt == '-v':
            verbose += 1
    if not args:
        errprint("Usage:", sys.argv[0], "[-v] file_or_directory ...")
        return
    for arg in args:
        check(arg)

60
class NannyNag(Exception):
    """
    Raised by process_tokens() if detecting an ambiguous indent.
    Captured and handled in check().

    Attributes:
        lineno: line number of the offending line.
        msg: description of the problem.
        line: text of the offending line.
    """

    def __init__(self, lineno, msg, line):
        self.lineno, self.msg, self.line = lineno, msg, line

    def get_lineno(self):
        """Return the line number stored at construction."""
        return self.lineno

    def get_msg(self):
        """Return the diagnostic message stored at construction."""
        return self.msg

    def get_line(self):
        """Return the offending line text stored at construction."""
        return self.line

def check(file):
    """check(file_or_dir)

    If file_or_dir is a directory and not a symbolic link, then recursively
    descend the directory tree named by file_or_dir, checking all .py files
    along the way. If file_or_dir is an ordinary Python source file, it is
    checked for whitespace related problems. The diagnostic messages are
    written to standard output using print(); I/O, tokenizing and
    indentation errors are reported on standard error via errprint().

    The amount of output is controlled by the module-level variables
    verbose and filename_only.
    """

    if os.path.isdir(file) and not os.path.islink(file):
        if verbose:
            print("%r: listing directory" % (file,))
        names = os.listdir(file)
        for name in names:
            fullname = os.path.join(file, name)
            # recurse into real (non-symlink) subdirectories, and check
            # every entry whose name ends in ".py"
            if (os.path.isdir(fullname) and
                not os.path.islink(fullname) or
                os.path.normcase(name[-3:]) == ".py"):
                check(fullname)
        return

    try:
        f = tokenize.open(file)
    except IOError as msg:
        errprint("%r: I/O Error: %s" % (file, msg))
        return

    if verbose > 1:
        print("checking %r ..." % file)

    try:
        process_tokens(tokenize.generate_tokens(f.readline))

    except tokenize.TokenError as msg:
        errprint("%r: Token Error: %s" % (file, msg))
        return

    except IndentationError as msg:
        errprint("%r: Indentation Error: %s" % (file, msg))
        return

    except NannyNag as nag:
        badline = nag.get_lineno()
        line = nag.get_line()
        if verbose:
            print("%r: *** Line %d: trouble in tab city! ***" % (file, badline))
            print("offending line: %r" % (line,))
            print(nag.get_msg())
        else:
            # quote names containing spaces so the terse output stays
            # unambiguous
            if ' ' in file:
                file = '"' + file + '"'
            if filename_only:
                print(file)
            else:
                print(file, badline, repr(line))
        return

    finally:
        # runs on every path out of the try, including the early returns
        f.close()

    if verbose:
        print("%r: Clean bill of health." % (file,))
134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227

class Whitespace:
    """Tab-size-independent description of a string's leading whitespace.

    Attributes:
        raw: the original string.
        n: the number of leading whitespace characters (spaces and tabs)
            in raw.
        nt: the number of tabs in raw[:n].
        norm: the normal form as a pair (count, trailing), where count is
            a tuple such that raw[:n] contains count[i] instances of
            S * i + T, and trailing is the number of trailing spaces in
            raw[:n].  It's A Theorem that m.indent_level(t) ==
            n.indent_level(t) for all t >= 1 iff m.norm == n.norm.
        is_simple: true iff raw[:n] is of the form (T*)(S*).
    """

    # the characters used for space and tab
    S, T = ' \t'

    def __init__(self, ws):
        """Analyze the leading run of spaces and tabs in the string ws."""
        self.raw = ws
        S, T = Whitespace.S, Whitespace.T
        count = []
        # b: length of the current run of spaces; n: whitespace characters
        # seen so far; nt: tabs seen so far
        b = n = nt = 0
        for ch in self.raw:
            if ch == S:
                n += 1
                b += 1
            elif ch == T:
                n += 1
                nt += 1
                # grow count so that index b exists
                if b >= len(count):
                    count.extend([0] * (b - len(count) + 1))
                count[b] += 1
                b = 0
            else:
                break
        self.n = n
        self.nt = nt
        self.norm = tuple(count), b
        self.is_simple = len(count) <= 1

    def longest_run_of_spaces(self):
        """Return the length of the longest contiguous run of spaces
        (whether or not preceding a tab)."""
        count, trailing = self.norm
        return max(len(count)-1, trailing)

    def indent_level(self, tabsize):
        """Return the column reached by the leading whitespace when tab
        stops are tabsize columns apart.  The result is always an int."""
        # count, il = self.norm
        # for i in range(len(count)):
        #    if count[i]:
        #        il = il + (i//tabsize + 1)*tabsize * count[i]
        # return il

        # quicker:
        # il = trailing + sum (i//ts + 1)*ts*count[i] =
        # trailing + ts * sum (i//ts + 1)*count[i] =
        # trailing + ts * sum i//ts*count[i] + count[i] =
        # trailing + ts * [(sum i//ts*count[i]) + (sum count[i])] =
        # trailing + ts * [(sum i//ts*count[i]) + num_tabs]
        # and note that i//ts*count[i] is 0 when i < ts

        count, trailing = self.norm
        il = 0
        for i in range(tabsize, len(count)):
            # floor division is required: true division would yield a
            # float and over-count runs that do not fill a whole tab stop
            il += i // tabsize * count[i]
        return trailing + tabsize * (il + self.nt)

    def equal(self, other):
        """Return true iff self.indent_level(t) == other.indent_level(t)
        for all t >= 1."""
        return self.norm == other.norm

    def not_equal_witness(self, other):
        """Return a list of tuples (ts, i1, i2) such that
        i1 == self.indent_level(ts) != other.indent_level(ts) == i2.

        Intended to be used after not self.equal(other) is known, in which
        case it will return at least one witnessing tab size.
        """
        n = max(self.longest_run_of_spaces(),
                other.longest_run_of_spaces()) + 1
        a = []
        for ts in range(1, n+1):
            i1 = self.indent_level(ts)
            i2 = other.indent_level(ts)
            if i1 != i2:
                a.append((ts, i1, i2))
        return a

    # The algorithm is due to Vincent Broman.
    # Easy to prove it's correct.
    # XXXpost that.
    # Trivial to prove n is sharp (consider T vs ST).
    # Unknown whether there's a faster general way.  I suspected so at
    # first, but no longer.
    # For the special (but common!) case where M and N are both of the
    # form (T*)(S*), M.less(N) iff M.len() < N.len() and
    # M.num_tabs() <= N.num_tabs(). Proof is easy but kinda long-winded.
    # XXXwrite that up.
    # Note that M is of the form (T*)(S*) iff len(M.norm[0]) <= 1.
    def less(self, other):
        """Return True iff self.indent_level(t) < other.indent_level(t)
        for all t >= 1."""
        if self.n >= other.n:
            return False
        if self.is_simple and other.is_simple:
            return self.nt <= other.nt
        n = max(self.longest_run_of_spaces(),
                other.longest_run_of_spaces()) + 1
        # the self.n >= other.n test already did it for ts=1
        for ts in range(2, n+1):
            if self.indent_level(ts) >= other.indent_level(ts):
                return False
        return True

    def not_less_witness(self, other):
        """Return a list of tuples (ts, i1, i2) such that
        i1 == self.indent_level(ts) >= other.indent_level(ts) == i2.

        Intended to be used after not self.less(other) is known, in which
        case it will return at least one witnessing tab size.
        """
        n = max(self.longest_run_of_spaces(),
                other.longest_run_of_spaces()) + 1
        a = []
        for ts in range(1, n+1):
            i1 = self.indent_level(ts)
            i2 = other.indent_level(ts)
            if i1 >= i2:
                a.append((ts, i1, i2))
        return a

def format_witnesses(w):
    """Format the tab sizes of a witness list for a diagnostic message.

    w is a sequence of tuples whose first element is a tab size, as
    returned by Whitespace.not_equal_witness() and
    Whitespace.not_less_witness().  The result is "at tab size" (or
    "at tab sizes" when there is more than one witness), followed by a
    space and the comma-separated tab sizes.
    """
    firsts = (str(tup[0]) for tup in w)
    prefix = "at tab size"
    if len(w) > 1:
        prefix = prefix + "s"
    return prefix + " " + ', '.join(firsts)
275

276 277 278 279 280
def process_tokens(tokens):
    """Scan a token stream for ambiguous indentation.

    tokens is an iterable of 5-tuples (type, string, start, end, line) as
    produced by tokenize.generate_tokens().  A stack of Whitespace objects
    tracks the indentation of the enclosing blocks.

    Raises:
        NannyNag: if an INDENT is not strictly greater than the enclosing
            indentation for every tab size, or if the first token of a
            statement does not have the same indentation as the top of
            the stack for every tab size.
    """
    INDENT = tokenize.INDENT
    DEDENT = tokenize.DEDENT
    NEWLINE = tokenize.NEWLINE
    JUNK = tokenize.COMMENT, tokenize.NL
    indents = [Whitespace("")]
    check_equal = False

    # tok_type rather than "type", to avoid shadowing the builtin
    for (tok_type, token, start, end, line) in tokens:
        if tok_type == NEWLINE:
            # a program statement, or ENDMARKER, will eventually follow,
            # after some (possibly empty) run of tokens of the form
            #     (NL | COMMENT)* (INDENT | DEDENT+)?
            # If an INDENT appears, setting check_equal is wrong, and will
            # be undone when we see the INDENT.
            check_equal = True

        elif tok_type == INDENT:
            check_equal = False
            thisguy = Whitespace(token)
            if not indents[-1].less(thisguy):
                witness = indents[-1].not_less_witness(thisguy)
                msg = "indent not greater e.g. " + format_witnesses(witness)
                raise NannyNag(start[0], msg, line)
            indents.append(thisguy)

        elif tok_type == DEDENT:
            # there's nothing we need to check here!  what's important is
            # that when the run of DEDENTs ends, the indentation of the
            # program statement (or ENDMARKER) that triggered the run is
            # equal to what's left at the top of the indents stack

            # Note: check_equal cannot be asserted to be set already here.
            # If the last line of the source is indented *and* lacks a
            # newline, DEDENTs pop out of thin air with no earlier NEWLINE.
            check_equal = True

            del indents[-1]

        elif check_equal and tok_type not in JUNK:
            # this is the first "real token" following a NEWLINE, so it
            # must be the first token of the next program statement, or an
            # ENDMARKER; the "line" argument exposes the leading whitespace
            # for this statement; in the case of ENDMARKER, line is an empty
            # string, so will properly match the empty string with which the
            # "indents" stack was seeded
            check_equal = False
            thisguy = Whitespace(line)
            if not indents[-1].equal(thisguy):
                witness = indents[-1].not_equal_witness(thisguy)
                msg = "indent not equal e.g. " + format_witnesses(witness)
                raise NannyNag(start[0], msg, line)
329

330 331 332

# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == '__main__':
    main()