Source code for rsmtool.utils.prmse

"""
Utility classes and functions related to computing test
theory based evaluations.

The derivations and formulas were provided by Matt Johnson.

:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import pandas as pd
import numpy as np


def get_n_human_scores(human_scores):
    """
    Get the number of available human scores for each response.

    Parameters
    ----------
    human_scores : array-like of shape (n_samples, n_ratings)
        Human ratings for each response. Missing ratings may be
        encoded either as ``NaN`` or as ``None``.

    Returns
    -------
    n_scores : array-like of shape (n_samples, )
        Number of non-missing human scores for each response.
    """
    # ``pd.notnull`` treats both NaN and None as missing, whereas
    # ``np.isnan`` raises a TypeError on object arrays that contain None
    n_scores = pd.notnull(human_scores).sum(axis=1)
    return n_scores


def variance_of_errors(human_scores):
    """
    Estimate the variance of errors in human scores.

    Parameters
    ----------
    human_scores : array-like of shape (n_samples, n_ratings)
        Human ratings for each response.

    Returns
    -------
    variance_of_errors : float
        Estimated variance of errors in human scores.

    Raises
    ------
    ValueError
        If no response has two or more human scores.
    """
    # we first compute the total number of scores
    # available for each response
    n_scores = get_n_human_scores(human_scores)

    # we will only be using responses with more
    # than one score
    multiple_mask = n_scores > 1

    # raise an error if we don't have any such responses
    if not multiple_mask.any():
        raise ValueError("Variance of human errors "
                         "necessary for true score "
                         "evaluations requires "
                         "at least a subset of responses "
                         "to be scored by 2 or more "
                         "raters.")

    # only select the responses with multiple scores
    multiple_scores = human_scores[multiple_mask]
    n_scores = n_scores[multiple_mask]

    # now let's compute the rater error variance for each
    # response (NaN entries, i.e. missing ratings, are ignored)
    response_variances = np.nanvar(multiple_scores, ddof=1, axis=1)

    # finally, let's compute the variance of errors as a weighted average
    # of response variances, each weighted by (number of scores - 1)
    variance_errors = np.average(response_variances, weights=n_scores - 1)

    return variance_errors
def true_score_variance(human_scores, variance_errors_human=None):
    """
    Compute variance of true scores for multiple raters.

    Parameters
    ----------
    human_scores : array-like of shape (n_samples, n_ratings)
        Human ratings for each response.
    variance_errors_human : float
        Estimated variance of errors in human scores
        When no value is supplied, the variance will
        be estimated from the data. In this case
        at least some responses must have more than
        one human score.

    Returns
    -------
    variance_true_scores : float
        Variance of true scores.
    """
    # if we don't have variance of errors, compute it
    # from the data
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)

    # compute mean human score and total number of scores
    # for each response
    mean_scores = np.nanmean(human_scores, axis=1)
    n_scores = get_n_human_scores(human_scores)

    # compute overall mean
    mean_human_score = np.nanmean(human_scores)

    # let N be total number of responses
    N = len(human_scores)

    # let M be total number of human ratings
    M = n_scores.sum()

    # compute squared deviations
    squared_devs = (mean_scores - mean_human_score) ** 2

    # adjust them by the number of human scores available
    # for each responses: deviations with higher number of
    # human scores are assigned a greater weight
    adjusted_squared_devs = n_scores * squared_devs

    # compute sum of squares
    sum_of_squares = adjusted_squared_devs.sum()

    # now compute the numerator as sum of squares
    # adjusted for the variance of human errors
    numerator = sum_of_squares - (N - 1) * variance_errors_human

    # compute the denominator as the adjusted total number of scores
    denominator = M - ((n_scores ** 2).sum() / M)

    # finally compute variance of true scores
    variance_true_scores = numerator / denominator

    return variance_true_scores


def mse_true(system, human_scores, variance_errors_human=None):
    """
    Compute mean squared error (MSE) when predicting true score
    from system score.

    Parameters
    ----------
    system : array-like of shape (n_samples,)
        System scores for each response.
    human_scores : array-like of shape (n_samples, n_ratings)
        Human ratings for each response.
    variance_errors_human : float
        Estimated variance of errors in human scores
        When no value is supplied, the variance will
        be estimated from the data. In this case
        at least some responses must have more than
        one human score.

    Returns
    -------
    mse : float
        Mean squared error when predicting true score
        from system score.
    """
    # if we don't have variance of errors, compute it
    # from the data
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)

    # get total number of scores for each response
    n_scores = get_n_human_scores(human_scores)

    # mean human score for each response, ignoring missing ratings
    mean_scores = np.nanmean(human_scores, axis=1)

    N = len(system)

    # squared difference between mean human score and system score,
    # weighted by the number of human scores for the response
    se = ((mean_scores - system) ** 2) * n_scores

    # Compute mean squared error when predicting true score
    mse = (se.sum() - N * variance_errors_human) / n_scores.sum()

    return mse
def prmse_true(system, human_scores, variance_errors_human=None):
    """
    Compute Proportional Reduction in Mean Squared Error (PRMSE)
    when predicting true score from system scores.

    Parameters
    ----------
    system : array-like of shape (n_samples,)
        System scores for each response.
    human_scores : array-like of shape (n_samples, n_ratings)
        Human ratings for each response. A one-dimensional input
        is treated as a single rating per response.
    variance_errors_human : float
        Estimated variance of errors in human scores
        When no value is supplied, the variance will
        be estimated from the data. In this case
        at least some responses must have more than
        one human score.

    Returns
    -------
    prmse : float
        Proportional reduction in mean squared error
    """
    # check that human_scores is a two dimensional array
    # and reshape into a single column if necessary; going through
    # ``np.asarray`` covers pandas Series, numpy arrays and plain
    # sequences alike
    if np.ndim(human_scores) == 1:
        human_scores = np.asarray(human_scores).reshape(-1, 1)

    # estimate the variance of errors once and share it between
    # the two computations below
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(human_scores)

    variance_true = true_score_variance(human_scores, variance_errors_human)

    mse = mse_true(system, human_scores, variance_errors_human)

    prmse = 1 - (mse / variance_true)

    return prmse
def get_true_score_evaluations(df,
                               system_score_columns,
                               human_score_columns,
                               variance_errors_human=None):
    """
    Get true score evaluations for reporting

    Parameters
    ----------
    df: pandas DataFrame
        Input data frame. Must contain columns listed in
        ``system_score_columns`` and ``human_score_columns``.
    system_score_columns: str or list
        System score column name or list of columns containing
        system scores.
    human_score_columns: str or list
        Human score column or list of columns containing human scores.
        True score evaluations require estimating variance of human
        errors, which can only be computed when a subset of responses
        has two or more human ratings. If ``human_score_columns`` is
        a single column name, ``variance_errors_human`` must also be
        specified.
    variance_errors_human : float
        Estimated variance of errors in human scores.
        If no value is supplied, the variance will be estimated
        from the data in which case some responses must have
        more than one human rating.

    Returns
    -------
    prmse_metrics: pandas DataFrame
        DataFrame with one row per system score column and the
        following columns:

        - ``N``: total number of responses
        - ``N raters``: maximum number of ratings available for
          a single response
        - ``N single``: total number of responses with a single
          human score
        - ``N multiple``: total number of responses with more than
          one human score
        - ``Variance of errors``: estimated variance of human errors
        - ``True score var``: estimated true score variance
        - ``MSE true``: mean squared error when predicting true score
          from machine score
        - ``PRMSE true``: proportional reduction in mean squared error
          when predicting true score

    Raises
    ------
    ValueError
        If ``human_score_columns`` is a single column name and
        ``variance_errors_human`` is not specified.
    """
    # check that if we only have one human column, we were also given
    # variance of errors
    if isinstance(human_score_columns, str):
        if variance_errors_human is None:
            raise ValueError("True score evaluations require estimating "
                             "variance of human errors, "
                             "which can only be computed when a subset "
                             "of responses has two or more human ratings. "
                             "If a single human_score_column "
                             "is supplied, one must also specify "
                             "variance_errors_human")
        # wrap the name in a list so that ``df[...]`` below yields a
        # two-dimensional frame; the per-response computations sum over
        # axis 1 and fail on a one-dimensional Series
        human_score_columns = [human_score_columns]

    if isinstance(system_score_columns, str):
        system_score_columns = [system_score_columns]

    df_human = df[human_score_columns]

    # compute variance of errors if it wasn't specified
    if variance_errors_human is None:
        variance_errors_human = variance_of_errors(df_human)

    # compute prmse
    prmse_all = []
    for system in system_score_columns:
        mse = mse_true(df[system],
                       df_human,
                       variance_errors_human)
        prmse = prmse_true(df[system],
                           df_human,
                           variance_errors_human)
        prmse_metrics = pd.Series({'MSE true': mse,
                                   'PRMSE true': prmse},
                                  name=system)
        prmse_all.append(prmse_metrics)

    # one row per system score column
    df_prmse = pd.concat(prmse_all, axis=1, sort=True).transpose()

    score_counts = get_n_human_scores(df_human)

    # compute values that are the same for all scores
    df_prmse.insert(0, 'N', len(df))
    df_prmse.insert(1, 'N raters', score_counts.max())
    df_prmse.insert(2, 'N single', (score_counts == 1).sum())
    df_prmse.insert(3, 'N multiple', (score_counts > 1).sum())
    df_prmse.insert(4, 'Variance of errors', variance_errors_human)
    df_prmse.insert(5, 'True score var',
                    true_score_variance(df_human,
                                        variance_errors_human))

    return df_prmse