#!/usr/bin/env python # Copyright 2015-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. import math import sys import numpy as np import numpy.random as npr import scipy.stats def _bootstrap_mean_sem(samples): """Return the estimated standard error for a distribution's mean.""" samples = np.array(samples) n = len(samples) indices = npr.randint(0, n, (10000, n)) samples = samples[indices] means = np.sort(np.mean(samples, axis=1)) return np.std(means, ddof=1) def _read_measurements(f): """Read measurements from a file. Returns {'a': [1.0, 2.0, 3.0], 'b': [5.0, 5.0, 5.0]} for a file containing the six lines: ['a 1', 'a 2', 'a 3', 'b 5', 'b 5', 'b 5']. """ measurements = {} for line in f: label, value = line.split(None, 1) measurements.setdefault(label, []).append(float(value)) return measurements def _compute_mean_and_sd_of_ratio_from_delta_method( mean_test, sem_test, mean_control, sem_control ): mean = ( ((mean_test - mean_control) / mean_control) - (pow(sem_control, 2) * mean_test / pow(mean_control, 3)) ) var = ( pow(sem_test / mean_control, 2) + (pow(sem_control * mean_test, 2) / pow(mean_control, 4)) ) return (mean, math.sqrt(var)) def _main(): if len(sys.argv) != 3: sys.stderr.write("usage: analyze.py control.txt test.txt\n") return 1 ci_size = 0.99 p_value = scipy.stats.norm.ppf(0.5 * (1 + ci_size)) control, test = sys.argv[1:] with open(control) as f: control_measurements = _read_measurements(f) with open(test) as f: test_measurements = _read_measurements(f) keys = set() keys.update(control_measurements.iterkeys()) keys.update(test_measurements.iterkeys()) print "Comparing %s (control) vs %s (test)" % (control, test) print "Significant differences marked by ***" print "%% change from control to test, with %g%% CIs:" % (ci_size * 100,) print any_sig = False for key in sorted(keys): print "* %s" % (key,) control_nums = control_measurements.get(key, []) test_nums = test_measurements.get(key, []) if not control_nums or not test_nums: print " skipping..." continue mean_control = np.mean(control_nums) mean_test = np.mean(test_nums) sem_control = _bootstrap_mean_sem(control_nums) sem_test = _bootstrap_mean_sem(test_nums) rat_mean, rat_sem = _compute_mean_and_sd_of_ratio_from_delta_method( mean_test, sem_test, mean_control, sem_control ) rat_low = rat_mean - p_value * rat_sem rat_high = rat_mean + p_value * rat_sem sig = rat_high < 0 or rat_low > 0 any_sig = any_sig or sig print " %% change: %+6.2f%% [%+6.2f%%, %+6.2f%%]%s" % ( 100 * rat_mean, 100 * rat_low, 100 * rat_high, ' ***' if sig else '' ) print " means: %g (control), %g (test)" % (mean_control, mean_test) if __name__ == '__main__': sys.exit(_main())