csv.py 15.4 KB
Newer Older
1 2 3 4 5 6

"""
csv.py - read/write/investigate CSV files
"""

import re
7
from functools import reduce
8 9
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
10
                 field_size_limit, \
11 12
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
13
from _csv import Dialect as _Dialect
14

15 16 17 18 19
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

20
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 22
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
23 24 25 26
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
27 28 29 30 31 32 33
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
34 35 36 37 38 39 40 41 42 43 44 45 46 47
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
48
        self._validate()
49 50

    def _validate(self):
51 52 53 54 55
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
56 57

class excel(Dialect):
58
    """Describe the usual properties of Excel-generated CSV files."""
59 60 61 62 63 64 65 66 67
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
68
    """Describe the usual properties of Excel-generated TAB-delimited files."""
69 70 71 72 73
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)


class DictReader:
74
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
75
                 dialect="excel", *args, **kwds):
Skip Montanaro's avatar
Skip Montanaro committed
76
        self._fieldnames = fieldnames   # list of keys for the dict
77 78
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
79
        self.reader = reader(f, dialect, *args, **kwds)
80 81
        self.dialect = dialect
        self.line_num = 0
82 83 84 85

    def __iter__(self):
        return self

Skip Montanaro's avatar
Skip Montanaro committed
86 87 88 89 90 91 92 93 94 95 96 97 98 99
    @property
    def fieldnames(self):
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

100
    def next(self):
Skip Montanaro's avatar
Skip Montanaro committed
101 102 103
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
104
        row = self.reader.next()
105
        self.line_num = self.reader.line_num
106

107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = self.reader.next()
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
125
                 dialect="excel", *args, **kwds):
126 127 128 129 130 131 132
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError, \
                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
                   extrasaction)
        self.extrasaction = extrasaction
133
        self.writer = writer(f, dialect, *args, **kwds)
134 135 136

    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
137 138 139 140
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
141 142 143 144 145 146 147 148 149 150 151
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

152 153 154 155 156
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float
157 158 159 160

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
161
    Returns a Dialect object.
162
    '''
163
    def __init__(self):
164 165 166 167
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


168
    def sniff(self, sample, delimiters=None):
169
        """
170
        Returns a dialect (or None) corresponding to the sample
171 172 173
        """

        quotechar, delimiter, skipinitialspace = \
174
                   self._guess_quote_and_delimiter(sample, delimiters)
175
        if not delimiter:
176 177
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)
178

179 180 181
        if not delimiter:
            raise Error, "Could not determine delimiter"

182
        class dialect(Dialect):
183 184
            _name = "sniffed"
            lineterminator = '\r\n'
185
            quoting = QUOTE_MINIMAL
186 187 188
            # escapechar = ''
            doublequote = False

189 190 191 192
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace
193

194
        return dialect
195 196


197
    def _guess_quote_and_delimiter(self, data, delimiters):
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
214
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
235
            if key and (delimiters is None or key in delimiters):
236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


261
    def _guess_delimiter(self, data, delimiters):
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
295
                    metaFrequency = charFrequency.get(char, {})
296
                    # must count even if frequency is 0
297
                    freq = line.count(char)
298
                    # value is the mode
299 300
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency
301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
329 330
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

355 356 357 358 359 360
        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

361 362 363 364 365
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


366
    def has_header(self, sample):
367 368 369 370 371 372 373 374 375
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

376
        rdr = reader(StringIO(sample), self.sniff(sample))
377

378
        header = rdr.next() # assume first row is header
379 380 381 382 383 384

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
385
        for row in rdr:
386 387 388 389 390 391 392 393 394
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():
395 396

                for thisType in [int, long, float, complex]:
397
                    try:
398 399
                        thisType(row[col])
                        break
400
                    except (ValueError, OverflowError):
401 402
                        pass
                else:
403 404 405
                    # fallback to length of string
                    thisType = len(row[col])

406 407 408 409
                # treat longs as ints
                if thisType == long:
                    thisType = int

410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
429
                    colType(header[col])
430
                except (ValueError, TypeError):
431 432 433 434 435
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0