csv.py 15.8 KB
Newer Older
1 2 3 4 5 6 7 8

"""
csv.py - read/write/investigate CSV files
"""

import re
from _csv import Error, __version__, writer, reader, register_dialect, \
                 unregister_dialect, get_dialect, list_dialects, \
9
                 field_size_limit, \
10 11
                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
                 __doc__
12
from _csv import Dialect as _Dialect
13

14
from io import StringIO
15

16
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
17 18
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
19 20 21 22
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]

class Dialect:
23
    """Describe a CSV dialect.
24 25 26 27 28 29

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
30 31 32 33 34 35 36 37 38 39 40 41 42 43
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
44
        self._validate()
45 46

    def _validate(self):
47 48
        try:
            _Dialect(self)
49
        except TypeError as e:
50 51
            # We do this for compatibility with py2.3
            raise Error(str(e))
52 53

class excel(Dialect):
54
    """Describe the usual properties of Excel-generated CSV files."""
55 56 57 58 59 60 61 62 63
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)

class excel_tab(excel):
64
    """Describe the usual properties of Excel-generated TAB-delimited files."""
65 66 67
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)

68 69 70 71 72 73 74 75 76 77
class unix_dialect(Dialect):
    """Describe the usual properties of Unix-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\n'
    quoting = QUOTE_ALL
register_dialect("unix", unix_dialect)

78 79

class DictReader:
80
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
81
                 dialect="excel", *args, **kwds):
82
        self._fieldnames = fieldnames   # list of keys for the dict
83 84
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
85
        self.reader = reader(f, dialect, *args, **kwds)
Christian Heimes's avatar
Christian Heimes committed
86 87
        self.dialect = dialect
        self.line_num = 0
88 89 90 91

    def __iter__(self):
        return self

92 93 94 95 96 97 98 99 100 101 102 103 104 105
    @property
    def fieldnames(self):
        if self._fieldnames is None:
            try:
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

106
    def __next__(self):
107 108 109
        if self.line_num == 0:
            # Used only for its side effect.
            self.fieldnames
110
        row = next(self.reader)
Christian Heimes's avatar
Christian Heimes committed
111
        self.line_num = self.reader.line_num
112

113 114 115 116
        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
117
            row = next(self.reader)
118 119 120 121 122 123 124 125 126 127 128 129 130
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            d[self.restkey] = row[lf:]
        elif lf > lr:
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d


class DictWriter:
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
131
                 dialect="excel", *args, **kwds):
132 133 134
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
135 136
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
137
        self.extrasaction = extrasaction
138
        self.writer = writer(f, dialect, *args, **kwds)
139

140 141 142 143
    def writeheader(self):
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

144 145
    def _dict_to_list(self, rowdict):
        if self.extrasaction == "raise":
146 147
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
148
                raise ValueError("dict contains fields not in fieldnames: "
149
                                 + ", ".join([repr(x) for x in wrong_fields]))
150 151 152 153 154 155 156 157 158 159 160
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)

161 162 163 164 165
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    complex = float
166 167 168 169

class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
170
    Returns a Dialect object.
171
    '''
172
    def __init__(self):
173 174 175 176
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


177
    def sniff(self, sample, delimiters=None):
178
        """
179
        Returns a dialect (or None) corresponding to the sample
180 181
        """

Benjamin Peterson's avatar
Benjamin Peterson committed
182
        quotechar, doublequote, delimiter, skipinitialspace = \
183
                   self._guess_quote_and_delimiter(sample, delimiters)
184
        if not delimiter:
185 186
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)
187

188
        if not delimiter:
189
            raise Error("Could not determine delimiter")
190

191
        class dialect(Dialect):
192 193
            _name = "sniffed"
            lineterminator = '\r\n'
194
            quoting = QUOTE_MINIMAL
195 196
            # escapechar = ''

Benjamin Peterson's avatar
Benjamin Peterson committed
197
        dialect.doublequote = doublequote
198 199 200 201
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace
202

203
        return dialect
204 205


206
    def _guess_quote_and_delimiter(self, data, delimiters):
207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
223
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
224 225 226 227 228
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
Benjamin Peterson's avatar
Benjamin Peterson committed
229 230
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
231 232 233 234 235 236 237 238 239 240 241 242 243
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
244
            if key and (delimiters is None or key in delimiters):
245 246 247 248 249 250 251 252
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

253
        quotechar = max(quotes, key=quotes.get)
254 255

        if delims:
256
            delim = max(delims, key=delims.get)
257 258 259 260 261 262 263 264
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

Benjamin Peterson's avatar
Benjamin Peterson committed
265 266
        # if we see an extra quote between delimiters, we've got a
        # double quoted format
267 268 269
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
Benjamin Peterson's avatar
Benjamin Peterson committed
270 271 272 273 274 275 276 277 278



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)
279 280


281
    def _guess_delimiter(self, data, delimiters):
282 283 284 285 286 287
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
288
          2) build a table of frequencies of this frequency (meta-frequency?),
289 290 291 292 293 294 295 296 297 298 299
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

300
        data = list(filter(None, data.split('\n')))
301 302 303 304 305 306 307 308 309 310 311 312 313 314

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
315
                    metaFrequency = charFrequency.get(char, {})
316
                    # must count even if frequency is 0
317
                    freq = line.count(char)
318
                    # value is the mode
319 320
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency
321 322

            for char in charFrequency.keys():
323
                items = list(charFrequency[char].items())
324 325 326 327
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
328
                    modes[char] = max(items, key=lambda x: x[1])
329 330 331 332
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
333
                                   - sum(item[1] for item in items))
334 335 336 337 338 339 340 341 342 343 344 345 346
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
347 348
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
349 350 351 352
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
353
                delim = list(delims.keys())[0]
354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

373 374 375 376 377 378
        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

379 380 381 382 383
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


384
    def has_header(self, sample):
385 386 387 388 389 390 391 392 393
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

394
        rdr = reader(StringIO(sample), self.sniff(sample))
395

396
        header = next(rdr) # assume first row is header
397 398 399 400 401 402

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
403
        for row in rdr:
404 405 406 407 408 409 410 411
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

412
            for col in list(columnTypes.keys()):
413

414
                for thisType in [int, float, complex]:
415
                    try:
416 417
                        thisType(row[col])
                        break
418
                    except (ValueError, OverflowError):
419 420
                        pass
                else:
421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
443
                    colType(header[col])
444
                except (ValueError, TypeError):
445 446 447 448 449
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0