Source code for arsenal.iterextras.sorted_intersection
from bisect import bisect_left
[docs]def sorted_intersection(A, B):
"""Baeza-Yates set intersection, aka double-binary search.
Let `N = min(|A|, |B|)` and `M=max(|A|, |B|)`
- hash-based set intersection: O(N+M)
- Baeza-Yates intersection: O(N log M) worst-case, better for most problems
on "average"
References:
- https://mail.python.org/pipermail/python-list/2008-April/508321.html
"""
if len(A) > len(B): A,B = B,A # The smaller set drives the outer loop.
def rec(lo1, hi1, lo2, hi2):
if hi1 <= lo1: return
if hi2 <= lo2: return
mid1 = (lo1 + hi1) // 2
x1 = A[mid1] # median of this part of set A
mid2 = bisect_left(B, x1, lo=lo2, hi=hi2) # find position of mid1 in B
yield from rec(lo1, mid1, lo2, mid2) # recurse on the first halves
# Is the median of `A` in the intersection? (uses binary search above)
if mid2 < hi2 and x1 == B[mid2]:
#assert max(A[lo1], B[lo2]) <= x1 <= min(A[hi1-1], B[hi2-1])
yield x1 # recursing in this order ensures that `out` is sorted.
yield from rec(mid1+1, hi1, mid2, hi2) # recurse on the second halves
yield from rec(0, len(A), 0, len(B))
[docs]def test():
import numpy as np
import matplotlib.pyplot as pl
from arsenal.timer import timers
from arsenal import iterview
T = timers()
params = [
(i, rep)
for i in range(5, 20)
for rep in range(5)
]
np.random.shuffle(params)
for (i, _) in iterview(params):
# BY's runtime scales logarithmically with the bigger set and linearly
# with the smaller set. Hash-based intersetion scales with their sum.
n = 2**i
m = 2*10
U = range(max(n,m)*5)
A = list(np.random.choice(U, n, replace=0))
B = list(np.random.choice(U, m, replace=0))
A.sort()
B.sort()
sA = set(A)
sB = set(B)
with T['set'](i=2**i):
E = (sA & sB)
with T['make-set'](i=2**i):
E = (set(A) & set(B))
with T['B-Y'](i=2**i):
C = list(sorted_intersection(A, B))
assert sorted(E) == C
T.compare()
T.plot_feature('i')
pl.show()
if __name__ == '__main__':
test()