import copy import numpy as np import pytest from numpy.testing import assert_allclose from scipy import stats from scipy.stats._multicomp import _pvalue_dunnett, DunnettResult class TestDunnett: # For the following tests, p-values were computed using Matlab, e.g. # sample = [18. 15. 18. 16. 17. 15. 14. 14. 14. 15. 15.... # 14. 15. 14. 22. 18. 21. 21. 10. 10. 11. 9.... # 25. 26. 17.5 16. 15.5 14.5 22. 22. 24. 22.5 29.... # 24.5 20. 18. 18.5 17.5 26.5 13. 16.5 13. 13. 13.... # 28. 27. 34. 31. 29. 27. 24. 23. 38. 36. 25.... # 38. 26. 22. 36. 27. 27. 32. 28. 31.... # 24. 27. 33. 32. 28. 19. 37. 31. 36. 36.... # 34. 38. 32. 38. 32.... # 26. 24. 26. 25. 29. 29.5 16.5 36. 44.... # 25. 27. 19.... # 25. 20.... # 28.]; # j = [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... # 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... # 0 0 0 0... # 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1... # 2 2 2 2 2 2 2 2 2... # 3 3 3... # 4 4... # 5]; # [~, ~, stats] = anova1(sample, j, "off"); # [results, ~, ~, gnames] = multcompare(stats, ... # "CriticalValueType", "dunnett", ... # "Approximate", false); # tbl = array2table(results, "VariableNames", ... # ["Group", "Control Group", "Lower Limit", ... # "Difference", "Upper Limit", "P-value"]); # tbl.("Group") = gnames(tbl.("Group")); # tbl.("Control Group") = gnames(tbl.("Control Group")) # Matlab doesn't report the statistic, so the statistics were # computed using R multcomp `glht`, e.g.: # library(multcomp) # options(digits=16) # control < - c(18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0, # 15.0, 15.0, 14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0, # 10.0, 10.0, 11.0, 9.0, 25.0, 26.0, 17.5, 16.0, 15.5, # 14.5, 22.0, 22.0, 24.0, 22.5, 29.0, 24.5, 20.0, 18.0, # 18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0, 28.0, # 27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0, # 25.0, 38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0, # 31.0) # t < - c(24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0, # 34.0, 38.0, 32.0, 38.0, 32.0) # w < - c(26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0) # x < - c(25.0, 27.0, 19.0) # y < - c(25.0, 20.0) # z < - c(28.0) # # groups = factor(rep(c("control", "t", "w", "x", "y", "z"), # times=c(length(control), length(t), length(w), # length(x), length(y), length(z)))) # df < - data.frame(response=c(control, t, w, x, y, z), # group=groups) # model < - aov(response # ~group, data = df) # test < - glht(model=model, # linfct=mcp(group="Dunnett"), # alternative="g") # summary(test) # confint(test) # p-values agreed with those produced by Matlab to at least atol=1e-3 # From Matlab's documentation on multcompare samples_1 = [ [ 24.0, 27.0, 33.0, 32.0, 28.0, 19.0, 37.0, 31.0, 36.0, 36.0, 34.0, 38.0, 32.0, 38.0, 32.0 ], [26.0, 24.0, 26.0, 25.0, 29.0, 29.5, 16.5, 36.0, 44.0], [25.0, 27.0, 19.0], [25.0, 20.0], [28.0] ] control_1 = [ 18.0, 15.0, 18.0, 16.0, 17.0, 15.0, 14.0, 14.0, 14.0, 15.0, 15.0, 14.0, 15.0, 14.0, 22.0, 18.0, 21.0, 21.0, 10.0, 10.0, 11.0, 9.0, 25.0, 26.0, 17.5, 16.0, 15.5, 14.5, 22.0, 22.0, 24.0, 22.5, 29.0, 24.5, 20.0, 18.0, 18.5, 17.5, 26.5, 13.0, 16.5, 13.0, 13.0, 13.0, 28.0, 27.0, 34.0, 31.0, 29.0, 27.0, 24.0, 23.0, 38.0, 36.0, 25.0, 38.0, 26.0, 22.0, 36.0, 27.0, 27.0, 32.0, 28.0, 31.0 ] pvalue_1 = [4.727e-06, 0.022346, 0.97912, 0.99953, 0.86579] # Matlab # Statistic, alternative p-values, and CIs computed with R multcomp `glht` p_1_twosided = [1e-4, 0.02237, 0.97913, 0.99953, 0.86583] p_1_greater = [1e-4, 0.011217, 0.768500, 0.896991, 0.577211] p_1_less = [1, 1, 0.99660, 0.98398, .99953] statistic_1 = [5.27356, 2.91270, 0.60831, 0.27002, 0.96637] ci_1_twosided = [[5.3633917835622, 0.7296142201217, -8.3879817106607, -11.9090753452911, -11.7655021543469], [15.9709832164378, 13.8936496687672, 13.4556900439941, 14.6434503452911, 25.4998771543469]] ci_1_greater = [5.9036402398526, 1.4000632918725, -7.2754756323636, -10.5567456382391, -9.8675629499576] ci_1_less = [15.4306165948619, 13.2230539537359, 12.3429406339544, 13.2908248513211, 23.6015228251660] pvalues_1 = dict(twosided=p_1_twosided, less=p_1_less, greater=p_1_greater) cis_1 = dict(twosided=ci_1_twosided, less=ci_1_less, greater=ci_1_greater) case_1 = dict(samples=samples_1, control=control_1, statistic=statistic_1, pvalues=pvalues_1, cis=cis_1) # From Dunnett1955 comparing with R's DescTools: DunnettTest samples_2 = [[9.76, 8.80, 7.68, 9.36], [12.80, 9.68, 12.16, 9.20, 10.55]] control_2 = [7.40, 8.50, 7.20, 8.24, 9.84, 8.32] pvalue_2 = [0.6201, 0.0058] # Statistic, alternative p-values, and CIs computed with R multcomp `glht` p_2_twosided = [0.6201020, 0.0058254] p_2_greater = [0.3249776, 0.0029139] p_2_less = [0.91676, 0.99984] statistic_2 = [0.85703, 3.69375] ci_2_twosided = [[-1.2564116462124, 0.8396273539789], [2.5564116462124, 4.4163726460211]] ci_2_greater = [-0.9588591188156, 1.1187563667543] ci_2_less = [2.2588591188156, 4.1372436332457] pvalues_2 = dict(twosided=p_2_twosided, less=p_2_less, greater=p_2_greater) cis_2 = dict(twosided=ci_2_twosided, less=ci_2_less, greater=ci_2_greater) case_2 = dict(samples=samples_2, control=control_2, statistic=statistic_2, pvalues=pvalues_2, cis=cis_2) samples_3 = [[55, 64, 64], [55, 49, 52], [50, 44, 41]] control_3 = [55, 47, 48] pvalue_3 = [0.0364, 0.8966, 0.4091] # Statistic, alternative p-values, and CIs computed with R multcomp `glht` p_3_twosided = [0.036407, 0.896539, 0.409295] p_3_greater = [0.018277, 0.521109, 0.981892] p_3_less = [0.99944, 0.90054, 0.20974] statistic_3 = [3.09073, 0.56195, -1.40488] ci_3_twosided = [[0.7529028025053, -8.2470971974947, -15.2470971974947], [21.2470971974947, 12.2470971974947, 5.2470971974947]] ci_3_greater = [2.4023682323149, -6.5976317676851, -13.5976317676851] ci_3_less = [19.5984402363662, 10.5984402363662, 3.5984402363662] pvalues_3 = dict(twosided=p_3_twosided, less=p_3_less, greater=p_3_greater) cis_3 = dict(twosided=ci_3_twosided, less=ci_3_less, greater=ci_3_greater) case_3 = dict(samples=samples_3, control=control_3, statistic=statistic_3, pvalues=pvalues_3, cis=cis_3) # From Thomson and Short, # Mucociliary function in health, chronic obstructive airway disease, # and asbestosis, Journal of Applied Physiology, 1969. Table 1 # Comparing with R's DescTools: DunnettTest samples_4 = [[3.8, 2.7, 4.0, 2.4], [2.8, 3.4, 3.7, 2.2, 2.0]] control_4 = [2.9, 3.0, 2.5, 2.6, 3.2] pvalue_4 = [0.5832, 0.9982] # Statistic, alternative p-values, and CIs computed with R multcomp `glht` p_4_twosided = [0.58317, 0.99819] p_4_greater = [0.30225, 0.69115] p_4_less = [0.91929, 0.65212] statistic_4 = [0.90875, -0.05007] ci_4_twosided = [[-0.6898153448579, -1.0333456251632], [1.4598153448579, 0.9933456251632]] ci_4_greater = [-0.5186459268412, -0.8719655502147 ] ci_4_less = [1.2886459268412, 0.8319655502147] pvalues_4 = dict(twosided=p_4_twosided, less=p_4_less, greater=p_4_greater) cis_4 = dict(twosided=ci_4_twosided, less=ci_4_less, greater=ci_4_greater) case_4 = dict(samples=samples_4, control=control_4, statistic=statistic_4, pvalues=pvalues_4, cis=cis_4) @pytest.mark.parametrize( 'rho, n_groups, df, statistic, pvalue, alternative', [ # From Dunnett1955 # Tables 1a and 1b pages 1117-1118 (0.5, 1, 10, 1.81, 0.05, "greater"), # different than two-sided (0.5, 3, 10, 2.34, 0.05, "greater"), (0.5, 2, 30, 1.99, 0.05, "greater"), (0.5, 5, 30, 2.33, 0.05, "greater"), (0.5, 4, 12, 3.32, 0.01, "greater"), (0.5, 7, 12, 3.56, 0.01, "greater"), (0.5, 2, 60, 2.64, 0.01, "greater"), (0.5, 4, 60, 2.87, 0.01, "greater"), (0.5, 4, 60, [2.87, 2.21], [0.01, 0.05], "greater"), # Tables 2a and 2b pages 1119-1120 (0.5, 1, 10, 2.23, 0.05, "two-sided"), # two-sided (0.5, 3, 10, 2.81, 0.05, "two-sided"), (0.5, 2, 30, 2.32, 0.05, "two-sided"), (0.5, 3, 20, 2.57, 0.05, "two-sided"), (0.5, 4, 12, 3.76, 0.01, "two-sided"), (0.5, 7, 12, 4.08, 0.01, "two-sided"), (0.5, 2, 60, 2.90, 0.01, "two-sided"), (0.5, 4, 60, 3.14, 0.01, "two-sided"), (0.5, 4, 60, [3.14, 2.55], [0.01, 0.05], "two-sided"), ], ) def test_critical_values( self, rho, n_groups, df, statistic, pvalue, alternative ): rng = np.random.default_rng(165250594791731684851746311027739134893) rho = np.full((n_groups, n_groups), rho) np.fill_diagonal(rho, 1) statistic = np.array(statistic) res = _pvalue_dunnett( rho=rho, df=df, statistic=statistic, alternative=alternative, rng=rng ) assert_allclose(res, pvalue, atol=5e-3) @pytest.mark.parametrize( 'samples, control, pvalue, statistic', [ (samples_1, control_1, pvalue_1, statistic_1), (samples_2, control_2, pvalue_2, statistic_2), (samples_3, control_3, pvalue_3, statistic_3), (samples_4, control_4, pvalue_4, statistic_4), ] ) def test_basic(self, samples, control, pvalue, statistic): rng = np.random.default_rng(11681140010308601919115036826969764808) res = stats.dunnett(*samples, control=control, random_state=rng) assert isinstance(res, DunnettResult) assert_allclose(res.statistic, statistic, rtol=5e-5) assert_allclose(res.pvalue, pvalue, rtol=1e-2, atol=1e-4) @pytest.mark.parametrize( 'alternative', ['two-sided', 'less', 'greater'] ) def test_ttest_ind(self, alternative): # check that `dunnett` agrees with `ttest_ind` # when there are only two groups rng = np.random.default_rng(114184017807316971636137493526995620351) for _ in range(10): sample = rng.integers(-100, 100, size=(10,)) control = rng.integers(-100, 100, size=(10,)) res = stats.dunnett( sample, control=control, alternative=alternative, random_state=rng ) ref = stats.ttest_ind( sample, control, alternative=alternative, random_state=rng ) assert_allclose(res.statistic, ref.statistic, rtol=1e-3, atol=1e-5) assert_allclose(res.pvalue, ref.pvalue, rtol=1e-3, atol=1e-5) @pytest.mark.parametrize( 'alternative, pvalue', [ ('less', [0, 1]), ('greater', [1, 0]), ('two-sided', [0, 0]), ] ) def test_alternatives(self, alternative, pvalue): rng = np.random.default_rng(114184017807316971636137493526995620351) # width of 20 and min diff between samples/control is 60 # and maximal diff would be 100 sample_less = rng.integers(0, 20, size=(10,)) control = rng.integers(80, 100, size=(10,)) sample_greater = rng.integers(160, 180, size=(10,)) res = stats.dunnett( sample_less, sample_greater, control=control, alternative=alternative, random_state=rng ) assert_allclose(res.pvalue, pvalue, atol=1e-7) ci = res.confidence_interval() # two-sided is comparable for high/low if alternative == 'less': assert np.isneginf(ci.low).all() assert -100 < ci.high[0] < -60 assert 60 < ci.high[1] < 100 elif alternative == 'greater': assert -100 < ci.low[0] < -60 assert 60 < ci.low[1] < 100 assert np.isposinf(ci.high).all() elif alternative == 'two-sided': assert -100 < ci.low[0] < -60 assert 60 < ci.low[1] < 100 assert -100 < ci.high[0] < -60 assert 60 < ci.high[1] < 100 @pytest.mark.parametrize("case", [case_1, case_2, case_3, case_4]) @pytest.mark.parametrize("alternative", ['less', 'greater', 'two-sided']) def test_against_R_multicomp_glht(self, case, alternative): rng = np.random.default_rng(189117774084579816190295271136455278291) samples = case['samples'] control = case['control'] alternatives = {'less': 'less', 'greater': 'greater', 'two-sided': 'twosided'} p_ref = case['pvalues'][alternative.replace('-', '')] res = stats.dunnett(*samples, control=control, alternative=alternative, random_state=rng) # atol can't be tighter because R reports some pvalues as "< 1e-4" assert_allclose(res.pvalue, p_ref, rtol=5e-3, atol=1e-4) ci_ref = case['cis'][alternatives[alternative]] if alternative == "greater": ci_ref = [ci_ref, np.inf] elif alternative == "less": ci_ref = [-np.inf, ci_ref] assert res._ci is None assert res._ci_cl is None ci = res.confidence_interval(confidence_level=0.95) assert_allclose(ci.low, ci_ref[0], rtol=5e-3, atol=1e-5) assert_allclose(ci.high, ci_ref[1], rtol=5e-3, atol=1e-5) # re-run to use the cached value "is" to check id as same object assert res._ci is ci assert res._ci_cl == 0.95 ci_ = res.confidence_interval(confidence_level=0.95) assert ci_ is ci @pytest.mark.parametrize('alternative', ["two-sided", "less", "greater"]) def test_str(self, alternative): rng = np.random.default_rng(189117774084579816190295271136455278291) res = stats.dunnett( *self.samples_3, control=self.control_3, alternative=alternative, random_state=rng ) # check some str output res_str = str(res) assert '(Sample 2 - Control)' in res_str assert '95.0%' in res_str if alternative == 'less': assert '-inf' in res_str assert '19.' in res_str elif alternative == 'greater': assert 'inf' in res_str assert '-13.' in res_str else: assert 'inf' not in res_str assert '21.' in res_str def test_warnings(self): rng = np.random.default_rng(189117774084579816190295271136455278291) res = stats.dunnett( *self.samples_3, control=self.control_3, random_state=rng ) msg = r"Computation of the confidence interval did not converge" with pytest.warns(UserWarning, match=msg): res._allowance(tol=1e-5) def test_raises(self): samples, control = self.samples_3, self.control_3 # alternative with pytest.raises(ValueError, match="alternative must be"): stats.dunnett(*samples, control=control, alternative='bob') # 2D for a sample samples_ = copy.deepcopy(samples) samples_[0] = [samples_[0]] with pytest.raises(ValueError, match="must be 1D arrays"): stats.dunnett(*samples_, control=control) # 2D for control control_ = copy.deepcopy(control) control_ = [control_] with pytest.raises(ValueError, match="must be 1D arrays"): stats.dunnett(*samples, control=control_) # No obs in a sample samples_ = copy.deepcopy(samples) samples_[1] = [] with pytest.raises(ValueError, match="at least 1 observation"): stats.dunnett(*samples_, control=control) # No obs in control control_ = [] with pytest.raises(ValueError, match="at least 1 observation"): stats.dunnett(*samples, control=control_) res = stats.dunnett(*samples, control=control) with pytest.raises(ValueError, match="Confidence level must"): res.confidence_interval(confidence_level=3) @pytest.mark.filterwarnings("ignore:Computation of the confidence") @pytest.mark.parametrize('n_samples', [1, 2, 3]) def test_shapes(self, n_samples): rng = np.random.default_rng(689448934110805334) samples = rng.normal(size=(n_samples, 10)) control = rng.normal(size=10) res = stats.dunnett(*samples, control=control, random_state=rng) assert res.statistic.shape == (n_samples,) assert res.pvalue.shape == (n_samples,) ci = res.confidence_interval() assert ci.low.shape == (n_samples,) assert ci.high.shape == (n_samples,)