Commit 77892373, authored by Skip Montanaro

* Correct Sniffer doc to correspond to the implementation.

* Add optional delimiters arg to Sniffer.sniff() which restricts the set of
  candidate field delimiters.
Parent commit: c626658a
...@@ -152,17 +152,17 @@ attributes, which are used to define the parameters for a specific ...@@ -152,17 +152,17 @@ attributes, which are used to define the parameters for a specific
\class{reader} or \class{writer} instance. \class{reader} or \class{writer} instance.
\end{classdesc*} \end{classdesc*}
\begin{classdesc}{Sniffer}{\optional{sample=16384}} \begin{classdesc}{Sniffer}{}
The \class{Sniffer} class is used to deduce the format of a CSV file. The The \class{Sniffer} class is used to deduce the format of a CSV file.
optional \var{sample} argument to the constructor specifies the number of
bytes to use when determining Dialect parameters.
\end{classdesc} \end{classdesc}
The \class{Sniffer} class provides a single method: The \class{Sniffer} class provides a single method:
\begin{methoddesc}{sniff}{fileobj} \begin{methoddesc}{sniff}{sample\optional{,delimiters=None}}
Analyze the next chunk of \var{fileobj} and return a \class{Dialect} subclass Analyze the given \var{sample} and return a \class{Dialect} subclass
reflecting the parameters found. reflecting the parameters found. If the optional \var{delimiters} parameter
is given, it is interpreted as a string containing possible valid delimiter
characters.
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}{has_header}{sample} \begin{methoddesc}{has_header}{sample}
......
...@@ -159,15 +159,16 @@ class Sniffer: ...@@ -159,15 +159,16 @@ class Sniffer:
self.preferred = [',', '\t', ';', ' ', ':'] self.preferred = [',', '\t', ';', ' ', ':']
def sniff(self, sample): def sniff(self, sample, delimiters=None):
""" """
Returns a dialect (or None) corresponding to the sample Returns a dialect (or None) corresponding to the sample
""" """
quotechar, delimiter, skipinitialspace = \ quotechar, delimiter, skipinitialspace = \
self._guess_quote_and_delimiter(sample) self._guess_quote_and_delimiter(sample, delimiters)
if delimiter is None: if delimiter is None:
delimiter, skipinitialspace = self._guess_delimiter(sample) delimiter, skipinitialspace = self._guess_delimiter(sample,
delimiters)
class dialect(Dialect): class dialect(Dialect):
_name = "sniffed" _name = "sniffed"
...@@ -184,7 +185,7 @@ class Sniffer: ...@@ -184,7 +185,7 @@ class Sniffer:
return dialect return dialect
def _guess_quote_and_delimiter(self, data): def _guess_quote_and_delimiter(self, data, delimiters):
""" """
Looks for text enclosed between two identical quotes Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed (the probable quotechar) which are preceded and followed
...@@ -222,7 +223,7 @@ class Sniffer: ...@@ -222,7 +223,7 @@ class Sniffer:
key = m[n] key = m[n]
except KeyError: except KeyError:
continue continue
if key: if key and (delimiters is None or key in delimiters):
delims[key] = delims.get(key, 0) + 1 delims[key] = delims.get(key, 0) + 1
try: try:
n = regexp.groupindex['space'] - 1 n = regexp.groupindex['space'] - 1
...@@ -248,7 +249,7 @@ class Sniffer: ...@@ -248,7 +249,7 @@ class Sniffer:
return (quotechar, delim, skipinitialspace) return (quotechar, delim, skipinitialspace)
def _guess_delimiter(self, data): def _guess_delimiter(self, data, delimiters):
""" """
The delimiter /should/ occur the same number of times on The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want each row. However, due to malformed data, it may not. We don't want
...@@ -316,7 +317,8 @@ class Sniffer: ...@@ -316,7 +317,8 @@ class Sniffer:
while len(delims) == 0 and consistency >= threshold: while len(delims) == 0 and consistency >= threshold:
for k, v in modeList: for k, v in modeList:
if v[0] > 0 and v[1] > 0: if v[0] > 0 and v[1] > 0:
if (v[1]/total) >= consistency: if ((v[1]/total) >= consistency and
(delimiters is None or k in delimiters)):
delims[k] = v delims[k] = v
consistency -= 0.01 consistency -= 0.01
......
...@@ -551,6 +551,12 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back ...@@ -551,6 +551,12 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
header = '''\ header = '''\
"venue","city","state","date","performers" "venue","city","state","date","performers"
''' '''
sample3 = '''\
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
05/05/03?05/05/03?05/05/03?05/05/03?05/05/03?05/05/03
'''
def test_has_header(self): def test_has_header(self):
sniffer = csv.Sniffer() sniffer = csv.Sniffer()
self.assertEqual(sniffer.has_header(self.sample1), False) self.assertEqual(sniffer.has_header(self.sample1), False)
...@@ -568,6 +574,15 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back ...@@ -568,6 +574,15 @@ Stonecutters Seafood and Chop House, Lemont, IL, 12/19/02, Week Back
self.assertEqual(dialect.quotechar, "'") self.assertEqual(dialect.quotechar, "'")
self.assertEqual(dialect.skipinitialspace, False) self.assertEqual(dialect.skipinitialspace, False)
def test_delimiters(self):
sniffer = csv.Sniffer()
dialect = sniffer.sniff(self.sample3)
self.assertEqual(dialect.delimiter, "0")
dialect = sniffer.sniff(self.sample3, delimiters="?,")
self.assertEqual(dialect.delimiter, "?")
dialect = sniffer.sniff(self.sample3, delimiters="/,")
self.assertEqual(dialect.delimiter, "/")
if not hasattr(sys, "gettotalrefcount"): if not hasattr(sys, "gettotalrefcount"):
if test_support.verbose: print "*** skipping leakage tests ***" if test_support.verbose: print "*** skipping leakage tests ***"
else: else:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment