import matplotlib.pyplot as pl
import pandas as pd
import numpy as np
from sys import stderr
from time import time
from contextlib import contextmanager
from arsenal.humanreadable import htime
from arsenal.terminal import colors
from arsenal.misc import ddict
from scipy.stats import mannwhitneyu
def timers(title=None):
    """Create an empty :class:`Benchmark`, optionally labelled with `title`.

    Convenience factory; equivalent to calling ``Benchmark(title)`` directly.
    """
    benchmark = Benchmark(title)
    return benchmark
class Benchmark(object):
    """A named collection of timers that can be compared and plotted together.

    Timers are looked up by name with ``bench[name]``; the returned object is
    used as a context manager around the code being measured (see ``run``).

    Attributes:
        title: Optional title; used as the plot title in ``plot_feature``.
        timers: Mapping from name to timer, an ``arsenal.misc.ddict`` built
            around ``Timer``.
    """

    def __init__(self, title):
        self.title = title
        self.timers = ddict(Timer)

    def __getitem__(self, name):
        """Return the timer registered under `name`."""
        return self.timers[name]

    def compare(self, statistic=np.median):
        """Print how much faster the best timer is than each of the others.

        Args:
            statistic: Callable applied to each timer's list of times; the
                timer with the smallest value is used as the baseline.  The
                pairwise report itself is produced by the baseline timer's
                ``compare`` method with its default arguments.

        With no timers nothing is printed; with exactly one timer that timer
        is printed on its own.
        """
        everyone = list(self.timers.values())
        if not everyone:
            return
        if len(everyone) == 1:
            print(everyone[0])
            return
        # A timer that was created but never run has no times; its statistic
        # would be NaN and make the `min` below order-dependent.  Pick the
        # baseline among measured timers, falling back to all of them only
        # when nothing has been measured at all.
        measured = [t for t in everyone if len(t.times) > 0] or everyone
        best = min(measured, key=lambda t: statistic(t.times))
        for name in sorted(self.timers):
            if name != best.name:
                best.compare(self.timers[name])

    def values(self):
        """Return the timers as a list."""
        return list(self.timers.values())

    def keys(self):
        """Return the timer names as a list (insertion order, not sorted)."""
        return list(self.timers.keys())

    def items(self):
        """Return ``(name, timer)`` pairs as a list."""
        return list(self.timers.items())

    def __len__(self):
        return len(self.timers)

    def __iter__(self):
        """Iterate over timer names in sorted order."""
        return iter(sorted(self.keys()))

    def plot_feature(self, feature, timecol='timer', ax=None, **kw):
        """Plot every timer's time-vs-`feature` curve on one shared axes.

        Args:
            feature: Name of the recorded feature to put on the x-axis.
            timecol: Name used for the time column in each timer's dataframe.
            ax: Axes to draw on; a new figure is created when omitted.
            **kw: Forwarded unchanged to each timer's ``plot_feature``.

        Returns:
            The axes that were drawn on.
        """
        if ax is None:
            ax = pl.figure().add_subplot(111)
        for t in self.timers.values():
            t.plot_feature(feature=feature, timecol=timecol, ax=ax, **kw)
        if self.title is not None:
            ax.set_title(self.title)
        ax.legend(loc=2)
        return ax

    def plot_survival(self, *args, **kwargs):
        """Show the probability each algorithm is still running.

        The axes may be given as the first positional argument or as ``ax=``.
        When it is omitted, a single new axes is created and shared by all
        timers so their curves end up on the same plot (previously each timer
        opened a figure of its own).  Remaining arguments are forwarded to
        each timer's ``plot_survival``.

        Returns:
            The axes that were drawn on.
        """
        if args:
            ax, args = args[0], args[1:]
        else:
            ax = kwargs.pop('ax', None)
        if ax is None:
            ax = pl.figure().add_subplot(111)
        for _, t in sorted(self.items()):
            t.plot_survival(ax, *args, **kwargs)
        return ax

    def run(self, methods, reps):
        """Time each method `reps` times, in shuffled order.

        Args:
            methods: Either a dict mapping names to zero-argument callables,
                or a list/tuple of callables (keyed by their ``__name__``).
            reps: Number of repetitions per method.  The repetition index is
                passed as the seed to ``arsenal.restore_random_state``, so
                every method is run under the same set of seeds.
        """
        from arsenal import iterview, restore_random_state
        if isinstance(methods, (tuple, list)):
            methods = {m.__name__: m for m in methods}
        jobs = [
            (name, seed)
            for seed in range(reps)   # TODO: use a better strategy for picking random seeds.
            for name in methods
        ]
        np.random.shuffle(jobs)   # shuffle jobs to avoid weird ordering correlations
        for name, seed in iterview(jobs):
            with restore_random_state(seed):
                with self[name]:
                    methods[name]()
class Timer:
    """Accumulate wall-clock timings of a repeatedly executed code block.

    Use the instance as a context manager: each ``with`` block appends its
    elapsed time (seconds, measured with ``time.time``) to ``times``.  Calling
    the instance with keyword arguments first (``with t(n=10): ...``) records
    those keywords in ``features`` so timings can be grouped by feature later.

    Attributes:
        name: Label used in printed comparisons and plot legends.
        times: List of elapsed times, one per completed ``with`` block.
        features: List of feature dicts, one per call of the instance.
        b4: Start timestamp of the most recently entered ``with`` block.

    >>> from time import sleep
    >>> a = Timer('A')
    >>> b = Timer('B')
    >>> with a:
    ...     sleep(0.5)
    >>> with b:
    ...     sleep(1)
    >>> a.compare(b)   #doctest:+SKIP
    A is 2.0018x faster

    """

    def __init__(self, name=None):
        self.name = name
        self.times = []
        self.features = []
        self.b4 = None

    def __enter__(self):
        self.b4 = time()
        # Returning self lets callers write ``with Timer('x') as t:``.
        return self

    def __exit__(self, *_):
        # Recorded even when the block raised; the exception still propagates.
        self.times.append(time() - self.b4)

    def __str__(self):
        return 'Timer(name=%s, avg=%g, std=%g)' % (self.name, self.mean, self.std)

    def __call__(self, **features):
        """Record `features` for the upcoming measurement and return self."""
        self.features.append(features)
        return self

    @property
    def mean(self):
        """Arithmetic mean of the recorded times."""
        return np.mean(self.times)

    @property
    def median(self):
        """Median of the recorded times."""
        return np.median(self.times)

    @property
    def std(self):
        """Sample standard deviation (``ddof=1``); 0.0 with fewer than two times."""
        if len(self.times) <= 1:
            return 0.0
        return np.std(self.times, ddof=1)

    @property
    def total(self):
        """Sum of the recorded times."""
        return sum(self.times)

    def compare(self, other, attr='mean', verbose=True):
        """Print which of the two timers is faster and by what factor.

        Args:
            other: The timer to compare against.
            attr: Name of the attribute/property used as the score
                (e.g. ``'mean'``, ``'median'``, ``'total'``); lower is faster.
            verbose: When true, append the two-sided Mann-Whitney U p-value
                (colored green below 0.05, yellow otherwise) and both scores.

        If either timer has no recorded times, ``"<name> ??? <name>"`` is
        printed instead.
        """
        if len(self.times) == 0 or len(other.times) == 0:
            print('%s %s %s' % (self.name, '???', other.name))
            return
        self_attr = getattr(self, attr)
        other_attr = getattr(other, attr)
        if self_attr > other_attr:
            # `other` is the faster one: report from its point of view.  Using
            # `>` (rather than "not <=") means a NaN score falls through to the
            # report below instead of bouncing between the two timers forever.
            other.compare(self, attr=attr, verbose=verbose)
            return
        # XXX: support Wilcoxon signed rank test for paired examples.
        extra = ''
        if verbose:
            # The test result is only needed for the verbose suffix.
            U = mannwhitneyu(self.times, other.times, alternative='two-sided')
            pval = f'p={U.pvalue:.5f}'
            if U.pvalue < 0.05:
                pval = colors.green % pval
            else:
                pval = colors.yellow % pval
            extra = f'({pval}, {attr}: {other.name}: {other_attr:g}, {self.name}: {self_attr:g})'
        # Format in a single pass: interpolating the names first and then
        # applying `%` breaks as soon as a name contains a percent sign.
        print(f'{self.name} is {other_attr / self_attr:6.4f}x faster than {other.name} {extra}')

    def plot_feature(self, feature, timecol='timer',
                     ax=None, label=None, scatter=False,
                     show_curve=False, **kw):
        """Plot median time against `feature`, with a 20th-80th percentile band.

        Args:
            feature: Name of the recorded feature to group by (x-axis).
            timecol: Name given to the time column of the dataframe.
            ax: Axes to draw on; a new figure is created when omitted.
            label: Legend label; defaults to the timer's name.
            scatter: Also draw every individual measurement.
            show_curve: Also fit and draw a power law ``c * feature**k`` by
                least squares in log-log space; requires strictly positive
                feature values and median times.
            **kw: Forwarded to the line plot and (minus ``c``) to the scatter
                of medians.

        Returns:
            The axes that were drawn on.
        """
        if ax is None:
            ax = pl.figure().add_subplot(111)
        df = self.dataframe(timecol)
        # Select the time column before aggregating so that non-numeric
        # feature columns cannot make the median fail.
        Y = df.groupby(feature)[timecol].median()
        X = Y.index
        ax.set_xlabel(feature)
        ax.set_ylabel('time (seconds)')
        if label is None:
            label = self.name
        [line] = ax.plot(X, Y, lw=2, alpha=0.5, label=label, **kw)
        # The scatter below gets the line's color explicitly, so a caller
        # supplied `c` must not be passed a second time.
        kw.pop('c', None)
        c = line.get_color()
        ax.scatter(X, Y, c=c, lw=0, label=None, alpha=0.25, **kw)
        if show_curve:
            xs = np.array(X)
            ys = np.array(Y)
            assert np.all(xs > 0) and np.all(ys > 0)
            # Straight-line fit in log-log space: log y = slope*log x + intercept.
            slope, intercept = np.polyfit(np.log(xs), np.log(ys), deg=1)
            l = r'%s $\approx {%.2f} \cdot $%s$^{%.2f}$' % (label, np.exp(intercept), feature, slope)
            ax.plot(xs, np.exp(intercept)*xs**slope, alpha=0.5, label=l, c=c, linestyle=':')
        data = []
        for f, dd in df.groupby(feature):
            data.append([
                f,
                np.percentile(dd[timecol], 20),
                np.percentile(dd[timecol], 80),
            ])
            if scatter:
                ax.scatter([f]*len(dd), dd[timecol], c=c, alpha=0.25)
        data = list(sorted(data))
        fs, ls, us = zip(*data)
        ax.fill_between(fs, ls, us, alpha=0.2, color=c)
        return ax

    def dataframe(self, timecol='timer'):
        """Return one row per measurement: the feature columns plus `timecol`."""
        df = pd.DataFrame(list(self.features))
        df[timecol] = self.times
        return df

    def filter(self, f, name=None):
        """Return a new timer holding only the measurements accepted by `f`.

        Args:
            f: Predicate called as ``f(time, features)`` for each measurement.
            name: Name of the returned timer.

        The result stores plain lists (even when nothing matches), so it can
        keep recording new measurements.
        """
        t = Timer(name)
        for x, y in zip(self.times, self.features):
            if f(x, y):
                t.times.append(x)
                t.features.append(y)
        return t

    def bucket_filter(self, feature_to_bucket, bucket_filter):
        """Filter measurements bucket by bucket and return the surviving rows.

        Args:
            feature_to_bucket: Feature (column) name to group the rows by.
            bucket_filter: Called as ``bucket_filter(key, bucket_frame)``; must
                return a boolean mask selecting the rows of that bucket to keep.

        Returns:
            A DataFrame with the kept rows of every bucket (original index
            preserved); empty if there are no rows at all.
        """
        df = self.dataframe()
        kept = [d[bucket_filter(k, d)] for k, d in df.groupby(feature_to_bucket)]
        if not kept:
            # pd.concat rejects an empty list; keep the columns, drop the rows.
            return df.iloc[:0]
        return pd.concat(kept)

    def trim_slow(self, feature_to_bucket, threshold):
        """Within each bucket keep only rows at or below the `threshold` time quantile.

        Relies on the default ``'timer'`` column name of ``dataframe``.
        """
        return self.bucket_filter(
            feature_to_bucket,
            lambda k, d: d.timer <= d.timer.quantile(threshold)
        )

    def plot_survival(self, ax=None):
        """Plot ``1 - cdf(times)`` on log-log axes and return the axes.

        `cdf` comes from ``arsenal.maths``; presumably it builds the empirical
        CDF of the recorded times (confirm against arsenal).
        """
        if ax is None:
            ax = pl.figure().add_subplot(111)
        from arsenal.maths import cdf
        ts = np.array(self.times)
        xs = np.linspace(0, ts.max(), 1000)
        ax.plot(xs, 1-cdf(ts)(xs), label=self.name)
        ax.legend(loc='best')
        ax.set_xscale('log')
        ax.set_yscale('log')
        return ax
@contextmanager
def timeit(name, fmt='{name} ({htime})', header=None):
    """Context Manager which prints the time it took to run code block.

    Args:
        name: Label substituted for ``{name}`` in `fmt`.
        fmt: Format string for the report; may use the fields ``{name}``,
            ``{htime}`` (formatted duration) and ``{sec}`` (raw seconds).
        header: If not None, printed to stdout before the block starts.

    The report is written to stderr.  It is emitted even when the timed block
    raises; the exception then propagates as usual.
    """
    if header is not None:
        print(header)
    b4 = time()
    try:
        yield
    finally:
        # Report in `finally` so a failing or interrupted block is still timed.
        sec = time() - b4
        if sec < 60:
            ht = '%.4f sec' % sec
        else:
            ht = htime(sec)
        print(fmt.format(name=name, htime=ht, sec=sec), file=stderr)
def main():
    """Demo: compare two randomly sleeping timers, then plot time against a feature."""
    from time import sleep
    from numpy.random import uniform
    from arsenal.iterview import iterview

    # Part 1: two timers with identically distributed sleep times.
    bench = Benchmark('A vs B')
    for _ in iterview(range(1000), bench.title):
        for label in ('A', 'B'):
            with bench[label]:
                sleep(np.random.exponential(.001))
    bench.compare()

    # Part 2: a single timer whose sleep time grows quadratically with
    # feature `i`, plus uniform noise (clamped at zero).
    timer = Timer('test')
    for i in iterview(range(1, 20)):
        for _ in range(10):
            with timer(i=i):
                jitter = 0.01
                delay = max(i**2 * 0.0001 + uniform(-jitter, jitter), 0.0)
                sleep(delay)
    timer.plot_feature('i')
    pl.show()


if __name__ == '__main__':
    main()