"""text_file

Provides the TextFile class, which gives an interface to text files
that (optionally) takes care of stripping comments, ignoring blank
lines, and joining lines with backslashes."""

# created 1999/01/12, Greg Ward

__revision__ = "$Id$"

from types import *
import os
import string
import sys


class TextFile:
    """Provides a file-like object that takes care of all the things you
       commonly want to do when processing a text file that has some
       line-by-line syntax: strip comments (as long as "#" is your
       comment character), skip blank lines, join adjacent lines by
       escaping the newline (ie. backslash at end of line), strip
       leading and/or trailing whitespace.  All of these are optional
       and independently controllable.

       Provides a 'warn()' method so you can generate warning messages that
       report physical line number, even if the logical line in question
       spans multiple physical lines.  Also provides 'unreadline()' for
       implementing line-at-a-time lookahead.

       Constructor is called as:

           TextFile (filename=None, file=None, **options)

       It bombs (RuntimeError) if both 'filename' and 'file' are None;
       'filename' should be a string, and 'file' a file object (or
       something that provides 'readline()' and 'close()' methods).  It is
       recommended that you supply at least 'filename', so that TextFile
       can include it in warning messages.  If 'file' is not supplied,
       TextFile creates its own using the 'open()' builtin.

       The options are all boolean, and affect the value returned by
       'readline()':
         strip_comments [default: true]
           strip from the first unescaped "#" to end-of-line; a "#"
           preceded by a backslash is kept (and the backslash dropped).
           Whitespace leading up to the comment is left for 'rstrip_ws'
           to remove.  Lines that hold nothing but a comment are skipped.
         lstrip_ws [default: false]
           strip leading whitespace from each line before returning it
         rstrip_ws [default: true]
           strip trailing whitespace (including line terminator!) from
           each line before returning it
         skip_blanks [default: true]
           skip lines that are empty *after* stripping comments and
           whitespace.  (If both lstrip_ws and rstrip_ws are false,
           then some lines may consist of solely whitespace: these will
           *not* be skipped, even if 'skip_blanks' is true.)
         join_lines [default: false]
           if a backslash is the last non-newline character on a line
           after stripping comments and whitespace, join the following line
           to it to form one "logical line"; if N consecutive lines end
           with a backslash, then N+1 physical lines will be joined to
           form one logical line.
         collapse_join [default: false]
           strip leading whitespace from lines that are joined to their
           predecessor; only matters if (join_lines and not lstrip_ws)

       Note that since 'rstrip_ws' can strip the trailing newline, the
       semantics of 'readline()' must differ from those of the builtin file
       object's 'readline()' method!  In particular, 'readline()' returns
       None for end-of-file: an empty string might just be a blank line (or
       an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
       not."""

    default_options = {
        'strip_comments': 1,
        'skip_blanks':    1,
        'lstrip_ws':      0,
        'rstrip_ws':      1,
        'join_lines':     0,
        'collapse_join':  0,
    }

    def __init__(self, filename=None, file=None, **options):
        """Construct a new TextFile object.  At least one of 'filename'
           (a string) and 'file' (a file-like object) must be supplied.
           The keyword argument options are described above and affect
           the values returned by 'readline()'.

           Raises RuntimeError if neither 'filename' nor 'file' is given,
           and KeyError if an unknown option name is passed."""

        if filename is None and file is None:
            raise RuntimeError(
                "you must supply either or both of 'filename' and 'file'")

        # set values for all options -- either from the client's keyword
        # arguments or falling back to default_options
        for opt in self.default_options.keys():
            if opt in options:
                setattr(self, opt, options[opt])
            else:
                setattr(self, opt, self.default_options[opt])

        # sanity check the client's keyword arguments
        for opt in options.keys():
            if opt not in self.default_options:
                raise KeyError("invalid TextFile option '%s'" % opt)

        if file is None:
            self.open(filename)
        else:
            self.filename = filename
            self.file = file
            self.current_line = 0       # assuming that file is at BOF!

        # 'linebuf' is a stack of lines that will be emptied before we
        # actually read from the file; it's only populated by an
        # 'unreadline()' operation
        self.linebuf = []

    def open(self, filename):
        """Open a new file named 'filename'.  This overrides both the
           'filename' and 'file' arguments to the constructor, and resets
           the current line number to 0."""

        self.filename = filename
        self.file = open(self.filename, 'r')
        self.current_line = 0

    def close(self):
        """Close the current file and forget everything we know about it
           (filename, current line number).  Calling 'close()' on an
           already-closed TextFile does nothing."""

        if self.file is not None:
            self.file.close()
        self.file = None
        self.filename = None
        self.current_line = None

    def gen_error(self, msg, line=None):
        """Return 'msg' prefixed with the filename (if one is known) and
           a line number.  'line' defaults to the current line; it may be
           an integer, or a list or tuple of two integers giving a range
           of physical lines."""

        outmsg = []
        if line is None:
            line = self.current_line
        # 'filename' is None when the client only supplied a file object
        if self.filename is not None:
            outmsg.append(self.filename + ", ")
        if isinstance(line, (list, tuple)):
            outmsg.append("lines %d-%d: " % tuple(line))
        else:
            outmsg.append("line %d: " % line)
        outmsg.append(str(msg))
        return "".join(outmsg)

    def error(self, msg, line=None):
        """Raise ValueError with 'msg' tied to 'line' (or to the current
           logical line if 'line' is not supplied)."""

        raise ValueError("error: " + self.gen_error(msg, line))

    def warn(self, msg, line=None):
        """Print (to stderr) a warning message tied to the current logical
           line in the current file.  If the current logical line in the
           file spans multiple physical lines, the warning refers to the
           whole range, eg. "lines 3-5".  If 'line' supplied, it overrides
           the current line number; it may be a list or tuple to indicate a
           range of physical lines, or an integer for a single physical
           line."""

        sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n")

    def _strip_comment(self, line):
        """Remove a trailing comment from the physical line 'line'.

           Returns a tuple (new_line, had_comment).  The comment starts at
           the first "#" that is not preceded by a backslash; the line
           terminator, if any, is preserved.  Escaped "#" characters in
           what remains are unescaped."""

        pos = line.find("#")
        while pos > 0 and line[pos-1] == "\\":
            # escaped "#": keep looking for one that starts a comment
            pos = line.find("#", pos + 1)

        had_comment = (pos != -1)
        if had_comment:
            # Have to preserve the trailing newline, because it's the
            # job of a later step (rstrip_ws) to remove it -- and if
            # rstrip_ws is false, we'd better preserve it!
            if line.endswith('\n'):
                eol = '\n'
            else:
                eol = ''
            line = line[:pos] + eol

        return line.replace("\\#", "#"), had_comment

    def _advance_line(self, continuation):
        """Account for one more physical line having been consumed.

           'current_line' is an integer for a single-line logical line and
           a two-element list [first, last] while a logical line spans
           several physical lines.  'continuation' is true if the physical
           line belongs to the logical line already being built up."""

        spanning = isinstance(self.current_line, list)
        if continuation:
            if spanning:
                self.current_line[1] = self.current_line[1] + 1
            else:
                self.current_line = [self.current_line,
                                     self.current_line + 1]
        else:
            if spanning:
                self.current_line = self.current_line[1] + 1
            else:
                self.current_line = self.current_line + 1

    def readline(self):
        """Read and return a single logical line from the current file (or
           from an internal buffer if lines have previously been "unread"
           with 'unreadline()').  If the 'join_lines' option is true, this
           may involve reading multiple physical lines concatenated into a
           single string.  Updates the current line number, so calling
           'warn()' after 'readline()' emits a warning about the physical
           line(s) just read.  Returns None on end-of-file, since the empty
           string can occur if 'rstrip_ws' is true but 'skip_blanks' is
           not."""

        # If any "unread" lines waiting in 'linebuf', return the top
        # one.  (We don't actually buffer read-ahead data -- lines only
        # get put in 'linebuf' if the client explicitly does an
        # 'unreadline()'.)
        if self.linebuf:
            return self.linebuf.pop()

        buildup_line = ''

        while True:
            # read the line, make it None if EOF
            line = self.file.readline()
            if line == '':
                line = None

            # did previous line end with a backslash? then this physical
            # line continues the logical line in 'buildup_line'
            joining = self.join_lines and buildup_line

            if self.strip_comments and line:
                line, had_comment = self._strip_comment(line)

                # If all that's left is whitespace, then skip line
                # *now*, before we try to join it to 'buildup_line' --
                # that way constructs like
                #   hello \\
                #   # comment that should be ignored
                #   there
                # result in "hello there".  The skipped line is still a
                # physical line, so it must be counted or every later
                # line number reported by 'warn()' would be too small.
                if had_comment and line.strip() == "":
                    self._advance_line(joining)
                    continue

            if joining:
                # oops: end of file
                if line is None:
                    self.warn("continuation line immediately precedes "
                              "end-of-file")
                    return buildup_line

                if self.collapse_join:
                    line = line.lstrip()
                line = buildup_line + line
                self._advance_line(1)

            # just an ordinary line, read it as usual
            else:
                if line is None:        # eof
                    return None
                self._advance_line(0)

            # strip whitespace however the client wants (leading and
            # trailing, or one or the other, or neither)
            if self.lstrip_ws and self.rstrip_ws:
                line = line.strip()
            elif self.lstrip_ws:
                line = line.lstrip()
            elif self.rstrip_ws:
                line = line.rstrip()

            # blank line (whether we rstrip'ed or not)? skip to next line
            # if appropriate
            if (line == '' or line == '\n') and self.skip_blanks:
                continue

            if self.join_lines:
                # slices rather than indexing: 'line' may be the empty
                # string here when 'skip_blanks' is false
                if line[-1:] == '\\':
                    buildup_line = line[:-1]
                    continue

                if line[-2:] == '\\\n':
                    buildup_line = line[0:-2] + '\n'
                    continue

            # well, I guess there's some actual content there: return it
            return line

    def readlines(self):
        """Read and return the list of all logical lines remaining in the
           current file."""

        lines = []
        while True:
            line = self.readline()
            if line is None:
                return lines
            lines.append(line)

    def unreadline(self, line):
        """Push 'line' (a string) onto an internal buffer that will be
           checked by future 'readline()' calls.  Handy for implementing
           a parser with line-at-a-time lookahead."""

        self.linebuf.append(line)


310 311 312 313
if __name__ == "__main__":
    # Self-test: write a small sample file, read it back through TextFile
    # with several option combinations, and report "ok"/"not ok" for each.
    test_data = """# test file

line 3 \\
# intervening comment
  continues on next line
"""
    # result 1: no fancy options
    result1 = [x + "\n" for x in test_data.split("\n")[0:-1]]

    # result 2: just strip comments
    result2 = ["\n",
               "line 3 \\\n",
               "  continues on next line\n"]

    # result 3: just strip blank lines
    result3 = ["# test file\n",
               "line 3 \\\n",
               "# intervening comment\n",
               "  continues on next line\n"]

    # result 4: default, strip comments, blank lines, and trailing whitespace
    result4 = ["line 3 \\",
               "  continues on next line"]

    # result 5: strip comments and blanks, plus join lines (but don't
    # "collapse" joined lines)
    result5 = ["line 3   continues on next line"]

    # result 6: strip comments and blanks, plus join lines (and
    # "collapse" joined lines)
    result6 = ["line 3 continues on next line"]

    def test_input(count, description, file, expected_result):
        """Read every logical line from 'file' (a TextFile), close it, and
           print whether the lines match 'expected_result'."""
        try:
            result = file.readlines()
        finally:
            # each test opens its own TextFile; don't leak the handle
            file.close()

        # single parenthesized argument: valid both as a Python 2 print
        # statement and as a Python 3 print() call
        if result == expected_result:
            print("ok %d (%s)" % (count, description))
        else:
            print("not ok %d (%s):" % (count, description))
            print("** expected:")
            print(expected_result)
            print("** received:")
            print(result)

    filename = "test.txt"
    out_file = open(filename, "w")
    try:
        out_file.write(test_data)
    finally:
        out_file.close()

    try:
        in_file = TextFile(filename, strip_comments=0, skip_blanks=0,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(1, "no processing", in_file, result1)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=0,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(2, "strip comments", in_file, result2)

        in_file = TextFile(filename, strip_comments=0, skip_blanks=1,
                           lstrip_ws=0, rstrip_ws=0)
        test_input(3, "strip blanks", in_file, result3)

        in_file = TextFile(filename)
        test_input(4, "default processing", in_file, result4)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=1,
                           join_lines=1, rstrip_ws=1)
        test_input(5, "join lines without collapsing", in_file, result5)

        in_file = TextFile(filename, strip_comments=1, skip_blanks=1,
                           join_lines=1, rstrip_ws=1, collapse_join=1)
        test_input(6, "join lines with collapsing", in_file, result6)
    finally:
        # remove the scratch file even if one of the tests blew up
        os.remove(filename)