Kaydet (Commit) 277842ef authored tarafından Raymond Hettinger's avatar Raymond Hettinger

Issue #21424: Optimize heaqp.nlargest() to make fewer tuple comparisons.

Consolidates the logic for nlargest() into a single function so that
decoration tuples (elem,order) or (key, order, elem) only need to
be formed when a new element is added to the heap.  Formerly, a tuple
was created for every element regardless of whether it was added to
the heap.

The change reduces the number of tuples created, the number of ordering
integers created, and total number of tuple comparisons.
üst d6a46ae7
...@@ -192,81 +192,6 @@ def _heapify_max(x): ...@@ -192,81 +192,6 @@ def _heapify_max(x):
for i in reversed(range(n//2)): for i in reversed(range(n//2)):
_siftup_max(x, i) _siftup_max(x, i)
# Algorithm notes for nlargest() and nsmallest()
# ==============================================
#
# Makes just one pass over the data while keeping the n most extreme values
# in a heap. Memory consumption is limited to keeping n values in a list.
#
# Number of comparisons for n random inputs, keeping the k smallest values:
# -----------------------------------------------------------
# Step Comparisons Action
# 1 1.66*k heapify the first k-inputs
# 2 n - k compare new input elements to top of heap
# 3 k*lg2(k)*(ln(n)-ln(k)) add new extreme values to the heap
# 4 k*lg2(k) final sort of the k most extreme values
#
# number of comparisons
# n-random inputs k-extreme values average of 5 trials % more than min()
# --------------- ---------------- ------------------- -----------------
# 10,000 100 14,046 40.5%
# 100,000 100 105,749 5.7%
# 1,000,000 100 1,007,751 0.8%
#
# Computing the number of comparisons for step 3:
# -----------------------------------------------
# * For the i-th new value from the iterable, the probability of being in the
# k most extreme values is k/i. For example, the probability of the 101st
# value seen being in the 100 most extreme values is 100/101.
# * If the value is a new extreme value, the cost of inserting it into the
# heap is log(k, 2).
# * The probabilty times the cost gives:
# (k/i) * log(k, 2)
# * Summing across the remaining n-k elements gives:
# sum((k/i) * log(k, 2) for xrange(k+1, n+1))
# * This reduces to:
# (H(n) - H(k)) * k * log(k, 2)
# * Where H(n) is the n-th harmonic number estimated by:
# H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
# gamma = 0.5772156649
# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
# * Substituting the H(n) formula and ignoring the (1/2*n) fraction gives:
# comparisons = k * log(k, 2) * (log(n,e) - log(k, e))
#
# Worst-case for step 3:
# ----------------------
# In the worst case, the input data is reversed sorted so that every new element
# must be inserted in the heap:
# comparisons = log(k, 2) * (n - k)
#
# Alternative Algorithms
# ----------------------
# Other algorithms were not used because they:
# 1) Took much more auxiliary memory,
# 2) Made multiple passes over the data.
# 3) Made more comparisons in common cases (small k, large n, semi-random input).
# See detailed comparisons at:
# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
def nlargest(n, iterable):
"""Find the n largest elements in a dataset.
Equivalent to: sorted(iterable, reverse=True)[:n]
"""
if n <= 0:
return []
it = iter(iterable)
result = list(islice(it, n))
if not result:
return result
heapify(result)
_heappushpop = heappushpop
for elem in it:
_heappushpop(result, elem)
result.sort(reverse=True)
return result
def nsmallest(n, iterable): def nsmallest(n, iterable):
"""Find the n smallest elements in a dataset. """Find the n smallest elements in a dataset.
...@@ -480,7 +405,6 @@ def nsmallest(n, iterable, key=None): ...@@ -480,7 +405,6 @@ def nsmallest(n, iterable, key=None):
result = _nsmallest(n, it) result = _nsmallest(n, it)
return [r[2] for r in result] # undecorate return [r[2] for r in result] # undecorate
_nlargest = nlargest
def nlargest(n, iterable, key=None): def nlargest(n, iterable, key=None):
"""Find the n largest elements in a dataset. """Find the n largest elements in a dataset.
...@@ -490,12 +414,12 @@ def nlargest(n, iterable, key=None): ...@@ -490,12 +414,12 @@ def nlargest(n, iterable, key=None):
# Short-cut for n==1 is to use max() when len(iterable)>0 # Short-cut for n==1 is to use max() when len(iterable)>0
if n == 1: if n == 1:
it = iter(iterable) it = iter(iterable)
head = list(islice(it, 1)) sentinel = object()
if not head:
return []
if key is None: if key is None:
return [max(chain(head, it))] result = max(it, default=sentinel)
return [max(chain(head, it), key=key)] else:
result = max(it, default=sentinel, key=key)
return [] if result is sentinel else [result]
# When n>=size, it's faster to use sorted() # When n>=size, it's faster to use sorted()
try: try:
...@@ -508,15 +432,40 @@ def nlargest(n, iterable, key=None): ...@@ -508,15 +432,40 @@ def nlargest(n, iterable, key=None):
# When key is none, use simpler decoration # When key is none, use simpler decoration
if key is None: if key is None:
it = zip(iterable, count(0,-1)) # decorate it = iter(iterable)
result = _nlargest(n, it) result = list(islice(zip(it, count(0, -1)), n))
return [r[0] for r in result] # undecorate if not result:
return result
heapify(result)
order = -n
top = result[0][0]
_heapreplace = heapreplace
for elem in it:
if top < elem:
order -= 1
_heapreplace(result, (elem, order))
top = result[0][0]
result.sort(reverse=True)
return [r[0] for r in result]
# General case, slowest method # General case, slowest method
in1, in2 = tee(iterable) it = iter(iterable)
it = zip(map(key, in1), count(0,-1), in2) # decorate result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
result = _nlargest(n, it) if not result:
return [r[2] for r in result] # undecorate return result
heapify(result)
order = -n
top = result[0][0]
_heapreplace = heapreplace
for elem in it:
k = key(elem)
if top < k:
order -= 1
_heapreplace(result, (k, order, elem))
top = result[0][0]
result.sort(reverse=True)
return [r[2] for r in result]
if __name__ == "__main__": if __name__ == "__main__":
# Simple sanity test # Simple sanity test
......
...@@ -13,7 +13,7 @@ c_heapq = support.import_fresh_module('heapq', fresh=['_heapq']) ...@@ -13,7 +13,7 @@ c_heapq = support.import_fresh_module('heapq', fresh=['_heapq'])
# _heapq.nlargest/nsmallest are saved in heapq._nlargest/_smallest when # _heapq.nlargest/nsmallest are saved in heapq._nlargest/_smallest when
# _heapq is imported, so check them there # _heapq is imported, so check them there
func_names = ['heapify', 'heappop', 'heappush', 'heappushpop', func_names = ['heapify', 'heappop', 'heappush', 'heappushpop',
'heapreplace', '_nlargest', '_nsmallest'] 'heapreplace', '_nsmallest']
class TestModules(TestCase): class TestModules(TestCase):
def test_py_functions(self): def test_py_functions(self):
......
...@@ -81,6 +81,9 @@ Library ...@@ -81,6 +81,9 @@ Library
- Issue #21156: importlib.abc.InspectLoader.source_to_code() is now a - Issue #21156: importlib.abc.InspectLoader.source_to_code() is now a
staticmethod. staticmethod.
- Issue #21424: Simplified and optimized heaqp.nlargest() to make fewer
tuple comparisons.
- Issue #21396: Fix TextIOWrapper(..., write_through=True) to not force a - Issue #21396: Fix TextIOWrapper(..., write_through=True) to not force a
flush() on the underlying binary stream. Patch by akira. flush() on the underlying binary stream. Patch by akira.
......
...@@ -267,89 +267,6 @@ heapify(PyObject *self, PyObject *heap) ...@@ -267,89 +267,6 @@ heapify(PyObject *self, PyObject *heap)
PyDoc_STRVAR(heapify_doc, PyDoc_STRVAR(heapify_doc,
"Transform list into a heap, in-place, in O(len(heap)) time."); "Transform list into a heap, in-place, in O(len(heap)) time.");
static PyObject *
nlargest(PyObject *self, PyObject *args)
{
PyObject *heap=NULL, *elem, *iterable, *sol, *it, *oldelem;
Py_ssize_t i, n;
int cmp;
if (!PyArg_ParseTuple(args, "nO:nlargest", &n, &iterable))
return NULL;
it = PyObject_GetIter(iterable);
if (it == NULL)
return NULL;
heap = PyList_New(0);
if (heap == NULL)
goto fail;
for (i=0 ; i<n ; i++ ){
elem = PyIter_Next(it);
if (elem == NULL) {
if (PyErr_Occurred())
goto fail;
else
goto sortit;
}
if (PyList_Append(heap, elem) == -1) {
Py_DECREF(elem);
goto fail;
}
Py_DECREF(elem);
}
if (PyList_GET_SIZE(heap) == 0)
goto sortit;
for (i=n/2-1 ; i>=0 ; i--)
if(_siftup((PyListObject *)heap, i) == -1)
goto fail;
sol = PyList_GET_ITEM(heap, 0);
while (1) {
elem = PyIter_Next(it);
if (elem == NULL) {
if (PyErr_Occurred())
goto fail;
else
goto sortit;
}
cmp = PyObject_RichCompareBool(sol, elem, Py_LT);
if (cmp == -1) {
Py_DECREF(elem);
goto fail;
}
if (cmp == 0) {
Py_DECREF(elem);
continue;
}
oldelem = PyList_GET_ITEM(heap, 0);
PyList_SET_ITEM(heap, 0, elem);
Py_DECREF(oldelem);
if (_siftup((PyListObject *)heap, 0) == -1)
goto fail;
sol = PyList_GET_ITEM(heap, 0);
}
sortit:
if (PyList_Sort(heap) == -1)
goto fail;
if (PyList_Reverse(heap) == -1)
goto fail;
Py_DECREF(it);
return heap;
fail:
Py_DECREF(it);
Py_XDECREF(heap);
return NULL;
}
PyDoc_STRVAR(nlargest_doc,
"Find the n largest elements in a dataset.\n\
\n\
Equivalent to: sorted(iterable, reverse=True)[:n]\n");
static int static int
_siftdownmax(PyListObject *heap, Py_ssize_t startpos, Py_ssize_t pos) _siftdownmax(PyListObject *heap, Py_ssize_t startpos, Py_ssize_t pos)
{ {
...@@ -531,8 +448,6 @@ static PyMethodDef heapq_methods[] = { ...@@ -531,8 +448,6 @@ static PyMethodDef heapq_methods[] = {
METH_VARARGS, heapreplace_doc}, METH_VARARGS, heapreplace_doc},
{"heapify", (PyCFunction)heapify, {"heapify", (PyCFunction)heapify,
METH_O, heapify_doc}, METH_O, heapify_doc},
{"nlargest", (PyCFunction)nlargest,
METH_VARARGS, nlargest_doc},
{"nsmallest", (PyCFunction)nsmallest, {"nsmallest", (PyCFunction)nsmallest,
METH_VARARGS, nsmallest_doc}, METH_VARARGS, nsmallest_doc},
{NULL, NULL} /* sentinel */ {NULL, NULL} /* sentinel */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment