Source code for rsmtool.container

"""
Classes for storing any kind of data contained
in a pd.DataFrame object.

:author: Jeremy Biggs (jbiggs@ets.org)
:author: Anastassia Loukina (aloukina@ets.org)
:author: Nitin Madnani (nmadnani@ets.org)

:organization: ETS
"""

import warnings
from copy import copy, deepcopy


class DataContainer:
    """
    A class to encapsulate datasets.

    Each dataset has a unique name, a frame (normally a ``pd.DataFrame``)
    and an optional path to the file it was read from. Datasets are kept
    in insertion order and every frame is also exposed as an instance
    attribute named after the dataset.
    """

    def __init__(self, datasets=None):
        """
        Initialize ``DataContainer`` object.

        Parameters
        ----------
        datasets : list of dicts, optional
            A list of dataset dicts. Each dict should have the
            following keys: ``name`` containing the name of the
            dataset, ``frame`` containing the dataframe object that
            contains the dataset, and ``path`` containing the file
            from which the dataset was read.

        Raises
        ------
        KeyError
            If two of the given datasets share the same name.
        """
        self._names = []
        self._dataframes = {}
        self._data_paths = {}

        if datasets is not None:
            for dataset_dict in datasets:
                self.add_dataset(dataset_dict, update=False)

    def __contains__(self, key):
        """
        Check if DataContainer object contains a given key.

        Parameters
        ----------
        key : str
            A key to check in the DataContainer object

        Returns
        -------
        key_check : bool
            True if key in DataContainer object, else False
        """
        return key in self._names

    def __getitem__(self, key):
        """
        Get frame, given key.

        Parameters
        ----------
        key : str
            Name for the data.

        Returns
        -------
        frame : pd.DataFrame or None
            The frame stored under ``key``, or ``None`` if there is no
            such dataset. This delegates to ``get_frame``, so a missing
            key does not raise ``KeyError``.
        """
        return self.get_frame(key)

    def __len__(self):
        """
        Return the length of the DataContainer names.

        Returns
        -------
        length : int
            The length of the container (i.e. number of frames)
        """
        return len(self._names)

    def __str__(self):
        """
        String representation of the object.

        Returns
        -------
        container_names : str
            A comma-separated list of names from the container.
        """
        return ', '.join(self._names)

    def __add__(self, other):
        """
        Add two DataContainer objects together and return a new
        DataContainer object holding the datasets of both operands
        (those of ``self`` first, then those of ``other``).

        Parameters
        ----------
        other : DataContainer
            The container whose datasets should be appended.

        Returns
        -------
        combined : DataContainer
            A new container; neither operand is modified.

        Raises
        ------
        ValueError
            If the object being added is not a DataContainer
        KeyError
            If there are duplicate keys in the two DataContainers.
        """
        if not isinstance(other, DataContainer):
            raise ValueError('Object must be `DataContainer`, '
                             'not {}.'.format(type(other)))

        # Make sure there are no duplicate keys; sort them so that the
        # error message is deterministic
        common_keys = set(other._names).intersection(self._names)
        if common_keys:
            raise KeyError('The key(s) `{}` already exist in the '
                           'DataContainer.'.format(', '.join(sorted(common_keys))))

        dicts = DataContainer.to_datasets(self)
        dicts.extend(DataContainer.to_datasets(other))
        return DataContainer(dicts)

    def __iter__(self):
        """
        Iterate through the DataContainer keys.

        The iteration runs over a snapshot of the names, so datasets
        may be added or dropped while iterating.

        Yields
        ------
        key
            A key in the container dictionary
        """
        for key in self.keys():
            yield key

    @staticmethod
    def to_datasets(data_container):
        """
        Convert a DataContainer object to a list of dataset dictionaries
        with keys {`name`, `path`, `frame`}.

        Parameters
        ----------
        data_container : DataContainer
            A DataContainer object.

        Returns
        -------
        datasets_dict : list of dicts
            A list of dataset dictionaries.
        """
        return [{'name': name,
                 'path': data_container.get_path(name),
                 'frame': data_container.get_frame(name)}
                for name in data_container.keys()]

    def add_dataset(self, dataset_dict, update=False):
        """
        Update or add a new DataFrame to the instance.

        Parameters
        ----------
        dataset_dict : dict
            The dataset dictionary to add. It must have the keys
            ``name`` and ``frame``; the key ``path`` is optional and
            is stored as ``None`` when missing.
        update : bool, optional
            Update an existing DataFrame, if True.
            Defaults to False.

        Raises
        ------
        KeyError
            If ``update`` is False and a dataset with the same name
            already exists in the container.
        """
        name = dataset_dict['name']
        data_frame = dataset_dict['frame']
        path = dataset_dict.get('path')

        if name in self._names:
            if not update:
                raise KeyError('The name {} already exists in the '
                               'DataContainer dictionary.'.format(name))
        else:
            self._names.append(name)

        self._dataframes[name] = data_frame
        self._data_paths[name] = path

        # Also expose the frame as an attribute, e.g. ``container.train``
        setattr(self, name, data_frame)

    def get_path(self, key, default=None):
        """
        Get path, given key.

        Parameters
        ----------
        key : str
            Name for the data.
        default
            The value to return if there is no dataset named ``key``.
            Defaults to None.

        Returns
        -------
        path : str
            Path to the data, or ``default`` if the key does not exist.
        """
        if key not in self._names:
            return default
        return self._data_paths[key]

    def get_frame(self, key, default=None):
        """
        Get frame, given key.

        Parameters
        ----------
        key : str
            Name for the data.
        default
            The default argument, if the frame does not exist

        Returns
        -------
        frame : pd.DataFrame
            The DataFrame, or ``default`` if the key does not exist.
        """
        if key not in self._names:
            return default
        return self._dataframes[key]

    def get_frames(self, prefix=None, suffix=None):
        """
        Get all data frames in the container that have a specified
        prefix and/or suffix. Note that the selection by prefix or
        suffix will be case-insensitive.

        Parameters
        ----------
        prefix : str or None, optional
            Only return frames with the given prefix. If None, then
            do not exclude any frames based on their prefix.
            Defaults to None.
        suffix : str or None, optional
            Only return frames with the given suffix. If None, then
            do not exclude any frames based on their suffix.
            Defaults to None.

        Returns
        -------
        frames : dict
            A dictionary with all of the data frames that contain the
            specified prefix and suffix. The keys are the names of the
            data frames.
        """
        # Lower-case both sides of the comparison; lower-casing only the
        # names would make any upper-case prefix/suffix match nothing
        prefix = '' if prefix is None else prefix.lower()
        suffix = '' if suffix is None else suffix.lower()

        frames = {}
        for name in self._names:
            lowered = name.lower()
            if lowered.startswith(prefix) and lowered.endswith(suffix):
                frames[name] = self._dataframes[name]
        return frames

    def keys(self):
        """
        Return keys as a list.

        Returns
        -------
        keys : list
            A new list with the names of the datasets in the
            DataContainer object, in insertion order. Modifying the
            returned list does not affect the container.
        """
        return list(self._names)

    def values(self):
        """
        Return values as a list.

        Returns
        -------
        values : list
            A list of the frames in the DataContainer object,
            in insertion order.
        """
        return [self._dataframes[name] for name in self._names]

    def items(self):
        """
        Return items as a list of tuples.

        Returns
        -------
        items : list of tuples
            A list of (name, frame) tuples in the DataContainer object,
            in insertion order.
        """
        return [(name, self._dataframes[name]) for name in self._names]

    def drop(self, name):
        """
        Drop a given data frame from the container.

        If the name is not in the container, a warning is issued and
        nothing is dropped.

        Parameters
        ----------
        name : str
            The name of the data frame to drop from the container object.

        Returns
        -------
        self
        """
        if name not in self:
            warnings.warn('The name `{}` is not in the '
                          'container. No data frames will '
                          'be dropped.'.format(name))
            return self

        self._names.remove(name)
        frame = self._dataframes.pop(name)
        self._data_paths.pop(name)

        # Remove the attribute created by ``add_dataset`` so the dropped
        # frame is not kept alive; the identity check makes sure that an
        # unrelated attribute of the same name is never deleted
        if vars(self).get(name) is frame:
            delattr(self, name)
        return self

    def rename(self, name, new_name):
        """
        Rename a given data frame in the container.

        If the name is not in the container, a warning is issued and
        nothing is renamed. The renamed dataset moves to the end of the
        container; an existing dataset called ``new_name`` is replaced.

        Parameters
        ----------
        name : str
            The name of the current data frame in the container object.
        new_name : str
            The new name for the data frame in the container object.

        Returns
        -------
        self
        """
        if name not in self:
            warnings.warn('The name `{}` is not in the '
                          'container and cannot '
                          'be renamed.'.format(name))
            return self

        # Renaming to the same name is a no-op; without this guard the
        # dataset would be re-added and then immediately dropped
        if new_name == name:
            return self

        frame = self._dataframes[name]
        path = self._data_paths[name]
        self.add_dataset({'name': new_name,
                          'frame': frame,
                          'path': path},
                         update=True)
        self.drop(name)
        return self

    def copy(self, deep=True):
        """
        Create a copy of the DataContainer object.

        Parameters
        ----------
        deep : bool, optional
            If True, create a deep copy of the underlying data frames.
            If False, the copy shares the data frames with this
            container but has its own bookkeeping, so adding, dropping
            or renaming datasets in one does not affect the other.
            Defaults to True.

        Returns
        -------
        container : DataContainer
            The copied container.
        """
        if deep:
            return deepcopy(self)

        duplicate = copy(self)
        # ``copy`` only duplicates the attribute dictionary, so give the
        # new object its own name list and lookup tables
        duplicate._names = list(self._names)
        duplicate._dataframes = dict(self._dataframes)
        duplicate._data_paths = dict(self._data_paths)
        return duplicate