""" TeXcheck.py -- rough syntax checking on Python style LaTeX documents.

   Written by Raymond D. Hettinger <python at rcn.com>
   Copyright (c) 2003 Python Software Foundation.  All rights reserved.

Designed to catch common markup errors including:
* Unbalanced or mismatched parentheses, brackets, and braces.
* Unbalanced or mismatched \\begin and \\end blocks.
* Misspelled or invalid LaTeX commands.
* Use of forward slashes instead of backslashes for commands.
* Table line size mismatches.

Sample command line usage:
    python texcheck.py -k chapterheading -m lib/librandomtex *.tex

Options:
    -m:         Munge parentheses and brackets. [0,n) would normally mismatch.
    -k keyword: Keyword is a valid LaTeX command. Do not include the backslash.
    -d:         Delimiter check only (useful for non-LaTeX files).
    -h:         Help
    -s lineno:  Start at lineno (useful for skipping complex sections).
    -v:         Verbose.  Trace the matching of \\begin and \\end blocks.
"""

import getopt
import glob
import re
import sys
from itertools import izip, count, islice

# Whitespace-separated list of the LaTeX commands (with their leading
# backslash) that the checker accepts as valid.  It is split into a set
# by checkit(); extra commands can be supplied at run time with -k.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \menuselection \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e \menuselection
    \exindex \linev \newsgroup \verbatim \setshortversion
    \author \authoraddress \paragraph \subparagraph \cmemberline
    \textbar \C \seelink
"""

def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that closing delimiter matches most recent opening delimiter.

    c_lineno and c_symbol describe the closer that was just encountered.
    openers is the stack (a list) of pending (lineno, symbol) openers; its
    top entry is popped by this call.  pairmap maps a closing punctuation
    character to the opening character(s) it may legally close; a closer
    that is not a key of pairmap (an environment name from an \\end block)
    must match an opener with the same name.

    A diagnostic is printed on a mismatch or when there is no pending
    opener at all.  Always returns None.
    """
    try:
        o_lineno, o_symbol = openers.pop()
    except IndexError:
        print("\nDelimiter mismatch.  On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol))
        return
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return
    print("\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))
    return

def checkit(source, opts, morecmds=None):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines (for example an open file).

    Opts is a mapping of options to option values if any:
        -m          munge parentheses and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno:  linenumber to start scan (default is 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    Every finding is printed to standard output.  Always returns 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'\/([A-Za-z]+)') # Mismarked with forward slash

    validcmds = set(cmdstr.split())
    for cmd in morecmds or ():
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']':'[(', ')':'(['}      # Munged openers
    else:
        pairmap = {']':'[', ')':'('}        # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')
    # Letters only: the range A-z would also admit [ \ ] ^ _ and backtick.
    doubledwords = re.compile(r'(\b[A-Za-z]+\b) \b\1\b')
    spacingmarkup = re.compile(r'\\(ABC|ASCII|C|Cpp|EOF|infinity|NULL|plusminus|POSIX|UNIX)\s')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    verbose = '-v' in opts
    delimiters_only = '-d' in opts
    startline = int(opts.get('-s', '1'))
    lineno = 0                              # Stays 0 if nothing is scanned

    for lineno, line in enumerate(islice(source, startline-1, None), startline):
        line = line.rstrip()

        # Check balancing of open/close parenthesis, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # First half of the trace line; the '-->' print below ends it.
                sys.stdout.write('%s | %s %s %s ' % (lineno, begend, name, punct))
            if begend == 'begin' and not delimiters_only:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delimiters_only:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print('   -->  %s' % (openers,))

        # Balance opening and closing braces
        for opener, closer in braces.findall(line):
            if opener == '{':
                bracestack.append(lineno)
            if closer == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print(r'Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delimiters_only:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command
        for cmd in falsetexcmd.findall(line):
            if '822' in line or '.html' in line:
                continue    # Ignore false positives for urls and for /rfc822
            if '\\' + cmd in validcmds:
                print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Check for markup requiring {} for correct spacing
        for cmd in spacingmarkup.findall(line):
            print(r'Warning, \%s should be written as \%s{} on line %d' % (cmd, cmd, lineno))

        # Validate commands
        nc = line.find(r'\newcommand')
        if nc != -1:
            # Register the newly defined command, but only when both braces
            # are on this line; otherwise the slice would be garbage.
            start = line.find('{', nc)
            end = line.find('}', start) if start != -1 else -1
            if end != -1:
                validcmds.add(line[start+1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash.
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print(r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,))

        for dw in doubledwords.findall(line):
            print(r'Doubled word warning.  "%s" on line %d' % (dw, lineno))

    lastline = lineno
    for open_lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, open_lineno))
    for brace_lineno in bracestack:
        print("Unmatched { on line %d" % (brace_lineno,))
    print('Done checking %d lines.' % (lastline,))
    return 0

def main(args=None):
    """Command line entry point.

    args is the list of command line arguments (defaults to sys.argv[1:]).
    File arguments containing '*' or '?' are expanded with glob.

    Returns 0 after printing the help text or after checking every file,
    1 when there is no file to check, and 2 as soon as a file cannot be
    opened.  getopt.GetoptError propagates for unrecognized options.
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or not args:
        print(__doc__)
        return 0

    if not arglist:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a fresh list; editing arglist while iterating
    # over it would skip the entry following a pattern with no matches.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)
    if not filenames:
        print('No files match the given file specifications')
        return 1

    morecmds = [v for k, v in optitems if k == '-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print("Checking %s" % (filename,))
        try:
            f = open(filename)
        except IOError:
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)

if __name__ == '__main__':
    # Run as a script: main()'s return value becomes the process exit status.
    exit_status = main()
    sys.exit(exit_status)