"""
Utility classes and functions related to computing fairness evaluations
:author: Anastassia Loukina (aloukina@ets.org)
:author: Jeremy Biggs (jbiggs@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)
:organization: ETS
"""
import pickle
import warnings
from os.path import join

import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

from rsmtool.container import DataContainer
from rsmtool.writer import DataWriter
def convert_to_ordered_category(group_values, base_group=None):
"""
Convert supplied series to an ordered category
with levels ordered by category size.
If multiple categories have the same size,
the order is determined alphabetically.
Parameters
----------
group_values: pandas Series
A series indicating group membership
base_group: str, optional
The group to use as the first category.
This overrides the default ordering.
Defaults to `None`.
    Returns
    -------
    group_ordered_category: pandas Series
        The input series converted to an ordered category, with levels
        ordered by size (largest first).
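    Examples
    --------
    A minimal illustration with made-up values; note that the tie in
    size between `a` and `c` is broken alphabetically:

    >>> import pandas as pd
    >>> values = pd.Series(['b', 'a', 'a', 'c', 'c'])
    >>> convert_to_ordered_category(values).cat.categories.tolist()
    ['a', 'c', 'b']
    >>> convert_to_ordered_category(values, base_group='b').cat.categories.tolist()
    ['b', 'a', 'c']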
"""
    # get an ordered list of group names, sorted by size.
    # We convert the value counts to a data frame to allow for multi-level
    # sorting; this keeps the order consistent and reproducible across runs
    # when more than one group has the maximum number of occurrences.
df_groups_by_size = pd.DataFrame(group_values.value_counts()).reset_index()
df_groups_by_size.columns = ['group_name', 'count']
df_groups_by_size_sorted = df_groups_by_size.sort_values(['count', 'group_name'],
ascending=[False, True])
groups_by_size = df_groups_by_size_sorted['group_name'].tolist()
if base_group is not None:
        # if we have a user-supplied base group, check that it's actually in the data
if base_group not in group_values.values:
raise ValueError("The reference group {} must be one of the existing "
"values for this group".format(base_group))
else:
# move the supplied reference group to the beginning of the list
base_index = groups_by_size.index(base_group)
groups_by_size.insert(0, groups_by_size.pop(base_index))
# convert to category and reorder
group_category = group_values.astype("category")
group_ordered_category = group_category.cat.reorder_categories(groups_by_size,
ordered=True)
return group_ordered_category
def get_coefficients(fit, base_category):
"""
Extract estimates, significance, and confidence intervals
for members of the group of interest. The names of the
    predictors are processed to remove the prefix added by `statsmodels`.
    The name of the base category is added in parentheses to the Intercept.
    Parameters
    ----------
    fit: statsmodels.regression.linear_model.RegressionResults
        Linear regression results object fitted using `statsmodels`
    base_category: str
        Name of the group used as the reference category when fitting the model
    Returns
    -------
df_results: pandas DataFrame
A dataframe with rows for each category and the following columns:
- estimate
- P>[t]
- Lower and upper boundaries for 95% confidence intervals
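    Examples
    --------
    A minimal sketch with synthetic data; the column names and the
    group levels below are made up for illustration:

    >>> import pandas as pd
    >>> import statsmodels.formula.api as smf
    >>> df = pd.DataFrame({'error': [0.1, -0.2, 0.3, 0.0, 0.2, -0.1],
    ...                    'group': ['a', 'a', 'b', 'b', 'a', 'b']})
    >>> fit = smf.ols('error ~ group', data=df).fit()
    >>> get_coefficients(fit, 'a').index.tolist()
    ['Intercept (a)', 'b']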
"""
# extract the data we need
df_results = pd.concat([fit.params, fit.pvalues, fit.conf_int()], axis=1)
df_results.columns = ['estimate', 'P>[t]', '[0.025', '0.975]']
# identify the rows we are interested in
groups = ['Intercept'] + [v for v in df_results.index if 'group' in v]
df_results = df_results.loc[groups]
# rename the rows
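    # (e.g. `statsmodels` names the level 'b' of the predictor 'group'
    # as 'group[T.b]'; we reduce this to just 'b')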
df_results.index = [v.split('.')[1].strip(']')
if not v == 'Intercept'
else 'Intercept ({})'.format(base_category)
for v in df_results.index]
return df_results
def write_fairness_results(fit_dictionary,
fairness_container,
group,
output_dir,
experiment_id,
file_format):
"""
Save the results of fairness analysis to disk.
Parameters
----------
fit_dictionary: dict
A dictionary of fitted models generated by `get_fairness_analyses()`
fairness_container: container.DataContainer
A data container with the results of fairness analysis generated by `get_fairness_analyses()`
group: str
The subgroup considered in this analysis
output_dir: str
The directory where the results will be saved
    experiment_id: str
        The experiment ID
    file_format: str
        The format in which to save the output data files
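    Examples
    --------
    A hypothetical call, assuming `model_dict` and `container` were
    returned by `get_fairness_analyses()` and that the output directory
    already exists (all names below are made up):

    >>> write_fairness_results(model_dict, container,
    ...                        'L1', 'output', 'expt1',
    ...                        'csv')  # doctest: +SKIP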
"""
    # let's first save the model files and their summaries
    for model, fit in fit_dictionary.items():
ols_file = join(output_dir, '{}_{}_by_{}.ols'.format(experiment_id,
model,
group))
summary_file = join(output_dir, '{}_{}_by_{}_ols_summary.txt'.format(experiment_id,
model,
group))
with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
pickle.dump(fit, olsf)
summf.write(str(fit.summary()))
# Now let's write out the content of the data container
writer = DataWriter(experiment_id)
writer.write_experiment_output(output_dir,
fairness_container,
file_format=file_format,
index=True)
def get_fairness_analyses(df,
                          group,
                          system_score_column,
                          human_score_column='sc1',
                          base_group=None):
"""Compute fairness analyses described
in `Loukina et al. 2019 <https://www.aclweb.org/anthology/W19-4401/>`_.
The functions computes how much variance group membership explains in
overall score accuracy (osa), overall score difference (osd),
and conditional score difference (csd).
See paper for further detail.
Parameters
----------
df: pandas DataFrame
A dataframe containing columns with numeric human scores, numeric system scores
and a column showing group membership
group: str
Name of the column containing group membership
system_score_column: str
Name of the column containing system scores
    human_score_column: str, optional
        Name of the column containing human scores.
        Defaults to `sc1`.
    base_group: str, optional
        Name of the group to use as the reference category.
        Defaults to `None`, in which case the group with the largest number
        of cases is used as the reference category. In case of a tie, the
        groups are sorted alphabetically.
Returns
-------
    model_dict: dictionary
        A dictionary with the proposed metrics as keys
        and the corresponding fitted models as values
    fairness_container: container.DataContainer
        A data container with the following datasets:

        - `estimates_METRIC_by_GROUP`, where `GROUP` is the name of the
          group column and `METRIC` is one of `osa`, `osd`, or `csd`:
          the estimates for each group computed by the respective model
        - `fairness_metrics_by_GROUP`: a summary of the model fits
          (R2 and p values)
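    Examples
    --------
    A minimal sketch with synthetic data; all column names and values
    below are made up for illustration:

    >>> import pandas as pd
    >>> df = pd.DataFrame({'sc1': [1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3],
    ...                    'system': [1.2, 2.1, 2.9, 0.8, 2.4, 3.1,
    ...                               1.1, 1.8, 3.2, 0.9, 2.2, 2.8],
    ...                    'L1': ['ara', 'ara', 'ara', 'ara', 'ara', 'ara',
    ...                           'deu', 'deu', 'deu', 'deu', 'deu', 'deu']})
    >>> model_dict, container = get_fairness_analyses(df, 'L1', 'system')
    >>> sorted(model_dict)
    ['csd', 'osa', 'osd']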
"""
    # work on a copy so that we do not modify the caller's data frame
    df = df.copy()
    # compute error and squared error
    df['error'] = df[system_score_column] - df[human_score_column]
    df['SE'] = df['error']**2
# convert group values to category and reorder them using
# the largest category as reference
df['group'] = convert_to_ordered_category(df[group], base_group=base_group)
base_group = df['group'].cat.categories[0]
df['sc1_cat'] = convert_to_ordered_category(df[human_score_column])
    # Overall score accuracy (OSA):
    # variance in squared error explained by group membership
# fit the model
osa_model = smf.ols(formula='SE ~ group', data=df)
osa_fit = osa_model.fit()
# collect the results
osa_dict = {'R2': osa_fit.rsquared_adj,
'sig': osa_fit.f_pvalue}
osa_results = pd.Series(osa_dict, name='Overall score accuracy')
df_coefficients_osa = get_coefficients(osa_fit, base_group)
    # Overall score difference (OSD):
    # variance in signed error (raw residuals) explained by group membership
# fit the model
osd_model = smf.ols(formula='error ~ group', data=df)
osd_fit = osd_model.fit()
# collect the results
osd_dict = {'R2': osd_fit.rsquared_adj,
'sig': osd_fit.f_pvalue}
osd_results = pd.Series(osd_dict, name='Overall score difference')
df_coefficients_osd = get_coefficients(osd_fit, base_group)
    # Conditional score difference (CSD):
    # variance in signed error explained by group membership
    # after conditioning on the human score
# fit "null" model with human score only
csd_null_mod = smf.ols(formula='error ~ sc1_cat', data=df)
csd_null_fit = csd_null_mod.fit()
# fit model with both human score and group
csd_mod = smf.ols(formula='error ~ group + sc1_cat', data=df)
csd_fit = csd_mod.fit()
    # compare the two models using anova_lm.
    # We suppress warnings around this call because anova_lm emits
    # runtime warnings due to NaNs in its output. These seem to be
    # by design: https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k
    # Using a context manager keeps the suppression local to this call
    # and preserves any warning filters set by the caller.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        anova_results = anova_lm(csd_null_fit, csd_fit)
    # collect the results. Note that R2 in this case is the difference
    # in adjusted R2 between the two models, and significance is the
    # anova p-value (the last column of the second row of its output)
csd_dict = {'R2': csd_fit.rsquared_adj - csd_null_fit.rsquared_adj,
'sig': anova_results.values[1][-1]}
csd_results = pd.Series(csd_dict, name='Conditional score difference')
df_coefficients_csd = get_coefficients(csd_fit, base_group)
# create a summary table
df_r2_all = pd.concat([osa_results, osd_results, csd_results], axis=1, sort=True)
df_r2_all['base_category'] = base_group
# assemble all datasets into a DataContainer
datasets = [{'name': 'estimates_osa_by_{}'.format(group),
'frame': df_coefficients_osa},
{'name': 'estimates_osd_by_{}'.format(group),
'frame': df_coefficients_osd},
{'name': 'estimates_csd_by_{}'.format(group),
'frame': df_coefficients_csd},
{'name': 'fairness_metrics_by_{}'.format(group),
'frame': df_r2_all}]
# assemble all models into a dictionary
model_dict = {'osa': osa_fit,
'osd': osd_fit,
'csd': csd_fit}
return model_dict, DataContainer(datasets=datasets)