Kaydet (Commit) f24eb35d authored tarafından Raymond Hettinger's avatar Raymond Hettinger

SF patch 629637: Add sample(population, k) method to the random module.

Used for random sampling without replacement.
üst 3a7ad5c5
...@@ -179,6 +179,25 @@ Functions for sequences: ...@@ -179,6 +179,25 @@ Functions for sequences:
long sequence can never be generated. long sequence can never be generated.
\end{funcdesc} \end{funcdesc}
\begin{funcdesc}{sample}{population, k}
Return a \var{k} length list of unique elements chosen from the
population sequence. Used for random sampling without replacement.
Returns a new list containing elements from the population. The
list itself is in random order so that all sub-slices are also
random samples. The original sequence is left undisturbed.
If the population has repeated elements, then each occurrence is a
possible selection in the sample.
If indices are needed for a large population, use \function{xrange}
as an argument: \code{sample(xrange(10000000), 60)}.
Optional argument \var{random} is a 0-argument function returning a random
float in [0.0, 1.0); by default, the standard \function{random.random}.
\versionadded{2.3}
\end{funcdesc}
The following functions generate specific real-valued distributions. The following functions generate specific real-valued distributions.
Function parameters are named after the corresponding variables in the Function parameters are named after the corresponding variables in the
......
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
sequences sequences
--------- ---------
pick random element pick random element
pick random sample
generate random permutation generate random permutation
distributions on the real line: distributions on the real line:
...@@ -77,7 +78,7 @@ from math import log as _log, exp as _exp, pi as _pi, e as _e ...@@ -77,7 +78,7 @@ from math import log as _log, exp as _exp, pi as _pi, e as _e
from math import sqrt as _sqrt, acos as _acos, cos as _cos, sin as _sin from math import sqrt as _sqrt, acos as _acos, cos as _cos, sin as _sin
from math import floor as _floor from math import floor as _floor
__all__ = ["Random","seed","random","uniform","randint","choice", __all__ = ["Random","seed","random","uniform","randint","choice","sample",
"randrange","shuffle","normalvariate","lognormvariate", "randrange","shuffle","normalvariate","lognormvariate",
"cunifvariate","expovariate","vonmisesvariate","gammavariate", "cunifvariate","expovariate","vonmisesvariate","gammavariate",
"stdgamma","gauss","betavariate","paretovariate","weibullvariate", "stdgamma","gauss","betavariate","paretovariate","weibullvariate",
...@@ -373,6 +374,43 @@ class Random: ...@@ -373,6 +374,43 @@ class Random:
j = int(random() * (i+1)) j = int(random() * (i+1))
x[i], x[j] = x[j], x[i] x[i], x[j] = x[j], x[i]
def sample(self, population, k, random=None, int=int):
    """Choose k unique random elements from a population sequence.

    Returns a new list containing elements from the population.  The
    list itself is in random order so that all sub-slices are also
    random samples.  The original sequence is left undisturbed.

    If the population has repeated elements, then each occurrence is
    a possible selection in the sample.

    If indices are needed for a large population, use xrange as an
    argument:  sample(xrange(10000000), 60)

    Optional arg random is a 0-argument function returning a random
    float in [0.0, 1.0); by default, the standard random.random.

    Raises ValueError if k is negative or exceeds len(population).
    """
    # NOTE(review): the int=int default presumably exists only to make the
    # builtin a fast local name inside the loops -- confirm before removing.
    n = len(population)
    if not 0 <= k <= n:
        # Report the two failure modes separately; a negative k used to be
        # described as "larger than population".
        if k < 0:
            raise ValueError("sample size must not be negative")
        raise ValueError("sample larger than population")
    if random is None:
        random = self.random

    if n < 6 * k:     # if n len list takes less space than a k len dict
        # Copy the population, then run only the last k steps of a
        # shuffle: each pass fixes slot i with an element drawn from
        # positions 0..i.
        pool = list(population)
        i = n
        while i > n - k:
            i -= 1
            j = int(random() * (i + 1))
            pool[i], pool[j] = pool[j], pool[i]
        # n - k (rather than -k) stays correct even for an empty tail.
        return pool[n - k:]

    # Sparse case: draw indices directly and remember which ones were
    # already used, so the population itself is never copied.
    picked = []       # selections in the order they were drawn
    used = {}         # index -> True for every index already taken
    while len(picked) < k:
        j = int(random() * n)
        if j in used:
            continue  # collision: draw again
        used[j] = True
        picked.append(population[j])
    return picked
## -------------------- real-valued distributions ------------------- ## -------------------- real-valued distributions -------------------
## -------------------- uniform distribution ------------------- ## -------------------- uniform distribution -------------------
...@@ -711,7 +749,19 @@ def _test_generator(n, funccall): ...@@ -711,7 +749,19 @@ def _test_generator(n, funccall):
print 'avg %g, stddev %g, min %g, max %g' % \ print 'avg %g, stddev %g, min %g, max %g' % \
(avg, stddev, smallest, largest) (avg, stddev, smallest, largest)
def _test_sample(n):
    """Exercise sample() for every legal size 0 <= k <= n over xrange(n).

    Each sample must contain exactly k items with no item repeated.
    """
    source = xrange(n)
    for size in xrange(n + 1):
        picked = sample(source, size)
        # Collapse the sample into dict keys to count distinct items.
        distinct = {}
        for item in picked:
            distinct[item] = True
        assert len(distinct) == len(picked) == size
def _sample_generator(n, k):
    """Draw a k-item sample from xrange(n) and return its middle element.

    Reading one fixed position of each sample is what validates that
    samples come back in random order.
    """
    picked = sample(xrange(n), k)
    middle = k // 2
    return picked[middle]
def _test(N=2000):
print 'TWOPI =', TWOPI print 'TWOPI =', TWOPI
print 'LOG4 =', LOG4 print 'LOG4 =', LOG4
print 'NV_MAGICCONST =', NV_MAGICCONST print 'NV_MAGICCONST =', NV_MAGICCONST
...@@ -735,6 +785,9 @@ def _test(N=20000): ...@@ -735,6 +785,9 @@ def _test(N=20000):
_test_generator(N, 'betavariate(3.0, 3.0)') _test_generator(N, 'betavariate(3.0, 3.0)')
_test_generator(N, 'paretovariate(1.0)') _test_generator(N, 'paretovariate(1.0)')
_test_generator(N, 'weibullvariate(1.0, 1.0)') _test_generator(N, 'weibullvariate(1.0, 1.0)')
_test_generator(N, '_sample_generator(50, 5)') # expected s.d.: 14.4
_test_generator(N, '_sample_generator(50, 45)') # expected s.d.: 14.4
_test_sample(1000)
# Test jumpahead. # Test jumpahead.
s = getstate() s = getstate()
...@@ -760,6 +813,7 @@ uniform = _inst.uniform ...@@ -760,6 +813,7 @@ uniform = _inst.uniform
randint = _inst.randint randint = _inst.randint
choice = _inst.choice choice = _inst.choice
randrange = _inst.randrange randrange = _inst.randrange
sample = _inst.sample
shuffle = _inst.shuffle shuffle = _inst.shuffle
normalvariate = _inst.normalvariate normalvariate = _inst.normalvariate
lognormvariate = _inst.lognormvariate lognormvariate = _inst.lognormvariate
......
...@@ -427,6 +427,9 @@ Library ...@@ -427,6 +427,9 @@ Library
- Added operator.pow(a,b) which is equivalent to a**b. - Added operator.pow(a,b) which is equivalent to a**b.
- Added random.sample(population,k) for random sampling without replacement.
Returns a k length list of unique elements chosen from the population.
- random.randrange(-sys.maxint-1, sys.maxint) no longer raises - random.randrange(-sys.maxint-1, sys.maxint) no longer raises
OverflowError. That is, it now accepts any combination of 'start' OverflowError. That is, it now accepts any combination of 'start'
and 'stop' arguments so long as each is in the range of Python's and 'stop' arguments so long as each is in the range of Python's
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment