# Listing metadata: 265 lines, 9.3 KiB, Python
|
|
import operator
|
|
from dataclasses import dataclass
|
|
import numpy as np
|
|
from scipy.special import ndtri
|
|
from ._common import ConfidenceInterval
|
|
|
|
|
|
def _validate_int(n, bound, name):
|
|
msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
|
|
try:
|
|
n = operator.index(n)
|
|
except TypeError:
|
|
raise TypeError(msg) from None
|
|
if n < bound:
|
|
raise ValueError(msg)
|
|
return n
|
|
|
|
|
|
@dataclass
class RelativeRiskResult:
    """
    Result of `scipy.stats.contingency.relative_risk`.

    Attributes
    ----------
    relative_risk : float
        The relative risk estimate, defined as::

            (exposed_cases/exposed_total) / (control_cases/control_total)

    exposed_cases : int
        How many "cases" (i.e. occurrences of the disease or other event
        of interest) were observed among the "exposed" individuals.
    exposed_total : int
        The size of the sample of "exposed" individuals.
    control_cases : int
        How many "cases" were observed among the "control" (non-exposed)
        individuals.
    control_total : int
        The size of the sample of "control" individuals.

    Methods
    -------
    confidence_interval :
        Compute the confidence interval for the relative risk estimate.
    """

    relative_risk: float
    exposed_cases: int
    exposed_total: int
    control_cases: int
    control_total: int

    def confidence_interval(self, confidence_level=0.95):
        """
        Compute the confidence interval for the relative risk.

        The interval is obtained with the Katz method
        (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).

        Parameters
        ----------
        confidence_level : float, optional
            The confidence level of the interval; it must lie in the
            interval [0, 1].  Default is 0.95.

        Returns
        -------
        ci : ConfidenceInterval instance
            An object whose attributes ``low`` and ``high`` hold the
            limits of the confidence interval.

        Raises
        ------
        ValueError
            If `confidence_level` is not in the interval [0, 1].

        Notes
        -----
        When a case count is zero, the convention of the function
        `riskratio` from the R library epitools is followed:

        * both `exposed_cases` and `control_cases` are zero:
          ``low`` and ``high`` are both nan;
        * only `exposed_cases` is zero: ``low`` is 0.0, ``high`` is nan;
        * only `control_cases` is zero: ``low`` is nan, ``high`` is inf.

        References
        ----------
        .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
               confidence intervals for the risk ratio in cohort studies",
               Biometrics, 34, 469-474 (1978).
        .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
               CRC Press LLC, Boca Raton, FL, USA (1996).

        Examples
        --------
        >>> from scipy.stats.contingency import relative_risk
        >>> result = relative_risk(exposed_cases=10, exposed_total=75,
        ...                        control_cases=12, control_total=225)
        >>> result.relative_risk
        2.5
        >>> result.confidence_interval()
        ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
        """
        # Written with `not` so that a nan confidence level is rejected.
        if not (0 <= confidence_level <= 1):
            raise ValueError('confidence_level must be in the interval '
                             '[0, 1].')

        # Zero case counts make the Katz formula unusable (division by
        # zero), so they are answered up front, following the convention
        # of the R function riskratio from the epitools library.
        if self.exposed_cases == 0:
            if self.control_cases == 0:
                # The relative risk itself is nan.
                return ConfidenceInterval(low=np.nan, high=np.nan)
            # The relative risk is 0.
            return ConfidenceInterval(low=0.0, high=np.nan)
        if self.control_cases == 0:
            # The relative risk is inf.
            return ConfidenceInterval(low=np.nan, high=np.inf)

        # Two-sided critical value of the standard normal distribution.
        tail_prob = 1 - confidence_level
        z_crit = ndtri(1 - tail_prob/2)

        # Estimated variance of log(relative risk):
        #     1/exposed_cases - 1/exposed_total +
        #     1/control_cases - 1/control_total
        # Its square root is the standard error of log(relative risk).
        log_rr_var = (1/self.exposed_cases - 1/self.exposed_total +
                      1/self.control_cases - 1/self.control_total)
        std_err = np.sqrt(log_rr_var)

        # The interval is symmetric about log(rr), hence multiplicative
        # about the point estimate itself.
        half_width = z_crit*std_err
        estimate = self.relative_risk
        return ConfidenceInterval(low=estimate*np.exp(-half_width),
                                  high=estimate*np.exp(half_width))
|
|
|
|
|
|
def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
    """
    Compute the relative risk (also known as the risk ratio).

    The relative risk is associated with a 2x2 contingency table
    ([1]_, section 2.2.3; [2]_, section 3.1.2).  This function does not
    take the table itself as an argument; the individual counts needed
    for the calculation are passed as separate parameters instead.  That
    removes any ambiguity about which row or column of the table holds
    the "exposed" cases and which holds the "control" cases.  Unlike,
    say, the odds ratio, the relative risk is not invariant under an
    interchange of the rows or columns.

    Parameters
    ----------
    exposed_cases : nonnegative int
        The number of "cases" (i.e. occurrence of disease or other event
        of interest) among the sample of "exposed" individuals.
    exposed_total : positive int
        The total number of "exposed" individuals in the sample.
    control_cases : nonnegative int
        The number of "cases" among the sample of "control" or non-exposed
        individuals.
    control_total : positive int
        The total number of "control" individuals in the sample.

    Returns
    -------
    result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
        The object has the float attribute ``relative_risk``, which is::

            rr = (exposed_cases/exposed_total) / (control_cases/control_total)

        The object also has the method ``confidence_interval`` to compute
        the confidence interval of the relative risk for a given confidence
        level.

    Raises
    ------
    TypeError
        If any argument is not an integer.
    ValueError
        If a case count is negative, a total is less than 1, or a case
        count exceeds the corresponding total.

    See Also
    --------
    odds_ratio

    Notes
    -----
    The R package epitools has the function `riskratio`, which accepts
    a table with the following layout::

                            disease=0   disease=1
        exposed=0 (ref)        n00         n01
        exposed=1              n10         n11

    With a 2x2 table in the above format, the estimate of the CI is
    computed by `riskratio` when the argument method="wald" is given,
    or with the function `riskratio.wald`.

    For example, in a test of the incidence of lung cancer among a
    sample of smokers and nonsmokers, the "exposed" category would
    correspond to "is a smoker" and the "disease" category would
    correspond to "has or had lung cancer".

    To pass the same data to ``relative_risk``, use::

        relative_risk(n11, n10 + n11, n01, n00 + n01)

    When both case counts are zero the relative risk is nan; when only
    `exposed_cases` is zero it is 0.0; when only `control_cases` is zero
    it is inf.

    .. versionadded:: 1.7.0

    References
    ----------
    .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
           (second edition), Wiley, Hoboken, NJ, USA (2007).
    .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
           CRC Press LLC, Boca Raton, FL, USA (1996).

    Examples
    --------
    >>> from scipy.stats.contingency import relative_risk

    This example is from Example 3.1 of [2]_.  The results of a heart
    disease study are summarized in the following table::

                 High CAT   Low CAT    Total
                 --------   -------    -----
        CHD         27         44        71
        No CHD      95        443       538

        Total      122        487       609

    CHD is coronary heart disease, and CAT refers to the level of
    circulating catecholamine.  CAT is the "exposure" variable, and
    high CAT is the "exposed" category.  So the data from the table
    to be passed to ``relative_risk`` is::

        exposed_cases = 27
        exposed_total = 122
        control_cases = 44
        control_total = 487

    >>> result = relative_risk(27, 122, 44, 487)
    >>> result.relative_risk
    2.4495156482861398

    Find the confidence interval for the relative risk.

    >>> result.confidence_interval(confidence_level=0.95)
    ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)

    The interval does not contain 1, so the data supports the statement
    that high CAT is associated with greater risk of CHD.
    """
    # The ratio itself is simple; the substantial work lives in the
    # `confidence_interval` method of the RelativeRiskResult class.

    # Case counts may be zero, but each group needs at least one member.
    exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
    exposed_total = _validate_int(exposed_total, 1, "exposed_total")
    control_cases = _validate_int(control_cases, 0, "control_cases")
    control_total = _validate_int(control_total, 1, "control_total")

    if exposed_cases > exposed_total:
        raise ValueError('exposed_cases must not exceed exposed_total.')
    if control_cases > control_total:
        raise ValueError('control_cases must not exceed control_total.')

    if exposed_cases == 0:
        # 0/0 is undefined (nan); 0/nonzero is simply zero.
        rr = np.nan if control_cases == 0 else 0.0
    elif control_cases == 0:
        # nonzero/0.
        rr = np.inf
    else:
        exposed_risk = exposed_cases / exposed_total
        control_risk = control_cases / control_total
        rr = exposed_risk / control_risk

    return RelativeRiskResult(relative_risk=rr,
                              exposed_cases=exposed_cases,
                              exposed_total=exposed_total,
                              control_cases=control_cases,
                              control_total=control_total)
|