# Source code for bias_amplification.utils.datacreator

# Importing Libraries
import numpy as np
import pandas as pd


# validator
def validate_error_percent(error_percent, name):
    """
    Check that an error percentage lies in the closed range [0, 0.25].

    Parameters:
    ----------
    error_percent: float
        The error percentage to validate
    name: str
        The name of the error percentage, used as the prefix of the error message

    Raises:
    -------
    ValueError: If the error percentage is not in the range [0, 0.25]
        (this includes NaN and values that cannot be compared with numbers)
    """
    try:
        in_range = 0 <= error_percent <= 0.25
    except TypeError:
        # Non-numeric input (e.g. None or a string) cannot be range-checked;
        # report it through the same documented ValueError.
        in_range = False

    if not in_range:
        raise ValueError(f"{name}_percent needs to be a float value in [0,0.25].")


# dataCreator function
def dataCreator(N=512, error_percent=0.1, shuffle=False, data_error_percent=None):
    """
    Generate a synthetic dataset for the bias amplification analysis.

    It creates a dataset with protected attributes, ground truth task label,
    biased task data label, model prediction without bias, and model prediction with bias.

    The samples are laid out in four consecutive parts:

    - Part A: P=0, D=0
    - Part B: P=0, D=1
    - Part C: P=1, D=0
    - Part D: P=1, D=1

    Errors are injected as paired swaps: for every selected index a 0 is flipped
    to 1 and a partner 1 is flipped to 0, so each selected index changes two
    labels. The error percentages are halved internally before being turned
    into a number of selected indices.

    Parameters
    ----------
    N: int, default=512
        The size of the dataset, must be an integer >= 3.
    error_percent: float, default=0.1
        The error percentage of the model. It is halved before validation and
        the halved value must lie in [0, 0.25].
    shuffle: boolean, default=False
        Whether to apply one common random permutation to all returned arrays.
    data_error_percent: float, default=None
        The error percentage of the data, halved and validated the same way as
        ``error_percent``. If None, the (halved) model error percentage is used.

    Returns
    -------
    P: numpy.ndarray, shape (N,)
        The protected attributes.
    D: numpy.ndarray, shape (N,)
        The ground truth task labels.
    D_bias: numpy.ndarray, shape (N,)
        The biased task data labels.
    M_unbias: numpy.ndarray, shape (N,)
        The model predictions without bias.
    M2: numpy.ndarray, shape (N,)
        The model predictions with bias.

    Raises
    ------
    ValueError
        If N is not an integer >= 3, or if a halved error percentage falls
        outside [0, 0.25].

    Examples
    --------
    >>> from bias_amplification.utils.datacreator import dataCreator
    >>> P, D, D_bias, M_unbias, M2 = dataCreator(
    ...     N=1000, error_percent=0.1, shuffle=False, data_error_percent=0.05)
    """
    # Check the type first so that non-numeric input raises the documented
    # ValueError instead of a TypeError from the `<` comparison.
    if not isinstance(N, int) or isinstance(N, bool) or N < 3:
        raise ValueError(f"N must be an integer >= 3. Got {N}")

    error_percent = error_percent / 2
    validate_error_percent(error_percent, "error")

    if data_error_percent is None:
        data_error_percent = error_percent
    else:
        data_error_percent = data_error_percent / 2
        validate_error_percent(data_error_percent, "data_error")

    # Partition boundaries of the four parts.
    q1 = N // 4
    q2 = N // 2
    q3 = 3 * N // 4

    # Protected attribute P: first half is group 0, second half is group 1.
    P = np.zeros(N)
    P[q2:] = 1

    # Ground truth task label D: 0 in parts A and C, 1 in parts B and D.
    D = np.zeros(N)
    D[q1:q2] = 1
    D[q3:] = 1

    M_unbias = D.copy()  # Model without bias
    M2 = D.copy()  # Model with bias
    D_bias = D.copy()  # Bias in the data

    num_errors = int(N * error_percent)
    num_data_errors = int(N * data_error_percent)

    # np.arange keeps an integer dtype even when a part is empty (e.g. N=3),
    # so the sampled positions are always usable as indices.
    A_pos = np.arange(0, q1)  # Part A positions
    C_pos = np.arange(q2, q3)  # Part C positions

    # Offsets that map a position to its partner in the part holding label 1.
    # Using the length of part C (rather than q1) keeps the partner inside
    # part D even when N is not a multiple of 4.
    a_to_b = q1
    c_to_d = q3 - q2
    a_to_d = q3

    # M_unbias: errors balanced across parts A and C.
    swaps_m_unbias_in_A = np.random.choice(A_pos, num_errors // 2, replace=False)
    swaps_m_unbias_in_C = np.random.choice(C_pos, num_errors - num_errors // 2, replace=False)
    M_unbias[swaps_m_unbias_in_A] = 1  # 0 -> 1 in part A
    M_unbias[swaps_m_unbias_in_A + a_to_b] = 0  # 1 -> 0 at partners in part B
    M_unbias[swaps_m_unbias_in_C] = 1  # 0 -> 1 in part C
    M_unbias[swaps_m_unbias_in_C + c_to_d] = 0  # 1 -> 0 at partners in part D

    # M2: all errors start in part A, with partners in part D.
    swaps_m_bias_in_A = np.random.choice(A_pos, num_errors, replace=False)
    M2[swaps_m_bias_in_A] = 1
    M2[swaps_m_bias_in_A + a_to_d] = 0

    # D_bias: same pattern as M2, driven by the data error percentage.
    swaps_d_bias_in_A = np.random.choice(A_pos, num_data_errors, replace=False)
    D_bias[swaps_d_bias_in_A] = 1
    D_bias[swaps_d_bias_in_A + a_to_d] = 0

    if shuffle:
        # Apply the same permutation to every returned array, D_bias included,
        # so that index i refers to the same sample everywhere.
        permut = np.random.permutation(N)
        P = P[permut]
        D = D[permut]
        D_bias = D_bias[permut]
        M_unbias = M_unbias[permut]
        M2 = M2[permut]
    return P, D, D_bias, M_unbias, M2


# Stability Experiment
def StabilityExp(N, data_error_w=0.5, model_error_w=0.2, poly_pow=4, data_range=(0, 1)):
    """
    Generate random inputs and two noisy evaluations of a random polynomial.

    Parameters
    ----------
    N: int
        Number of samples to draw.
    data_error_w: float, default=0.5
        Weight of the shared noise term added to the inputs when computing D.
    model_error_w: float, default=0.2
        Weight of the shared noise term added to the inputs when computing M.
    poly_pow: int, default=4
        Degree bound of the polynomial; ``1 + poly_pow`` integer coefficients
        are drawn with ``np.random.randint(10, ...)``.
    data_range: tuple, default=(0, 1)
        ``(min, max)`` pair used to scale the inputs.

    Returns
    -------
    A: numpy.ndarray
        The sampled inputs.
    D: numpy.ndarray
        The polynomial evaluated at ``A + data_error_w * noise``.
    M: numpy.ndarray
        The polynomial evaluated at ``A + model_error_w * noise``.
    """
    low, high = data_range
    # Draw order (inputs, coefficients, noise) is kept fixed so results are
    # reproducible under a given NumPy seed.
    A = low + (np.random.random(N) * (high - low))
    polynomial = np.poly1d(np.random.randint(10, size=1 + poly_pow))
    noise = np.random.random(N)
    D, M = (polynomial(A + weight * noise) for weight in (data_error_w, model_error_w))
    return A, D, M


# COMPAS Dataset
# Attribute names accepted by COMPASData.
COMPAS_SENSITIVE_ATTRS = ["sex", "race", "age"]


def COMPASData(attributes="race"):
    """
    Download the ProPublica COMPAS two-year recidivism table and filter it.

    Records where the person was charged for alternative reasons are removed.
    Reference: Jeff Larson, Surya Mattu, Lauren Kirchner, and Julia Angwin.
    How we analyzed the compas recidivism algorithm. 2016.
    URL: https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm.

    Parameters
    ----------
    attributes: str or iterable of str, default="race"
        Sensitive attribute name(s). Each one must appear in
        ``COMPAS_SENSITIVE_ATTRS``. They are only validated here; the returned
        table is not restricted to these columns.

    Returns
    -------
    df: pandas.DataFrame
        The filtered table with a fresh 0-based index.

    Raises
    ------
    ValueError
        If any requested attribute is not a known sensitive attribute.
    """
    if isinstance(attributes, str):
        attributes = [attributes]

    # Validate before downloading so that bad input fails fast and without
    # needing network access.
    for item in attributes:
        if item not in COMPAS_SENSITIVE_ATTRS:
            raise ValueError(
                f"{item} not in known sensitive attribute list: {COMPAS_SENSITIVE_ATTRS}"
            )

    df = pd.read_csv(
        "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    )

    keep = (
        (df["days_b_screening_arrest"] <= 30)
        & (df["days_b_screening_arrest"] >= -30)
        & (df["is_recid"] != -1)
        & (df["c_charge_degree"] != "O")
        & (df["score_text"] != "N/A")
        # read_csv parses the text "N/A" as NaN by default, and NaN != "N/A"
        # evaluates to True, so missing scores must be dropped explicitly.
        & df["score_text"].notna()
    )
    return df[keep].reset_index(drop=True)


if __name__ == "__main__":
    # Demo run: build a 32-sample synthetic dataset (model error 0.1, no
    # shuffling, data error 0.2) and print each returned array. The `=`
    # specifier prefixes every printed value with its variable name.
    P1, D1, D_bias1, M_unbias1, M2_1 = dataCreator(32, 0.1, False, 0.2)
    print(f"{P1=}")
    print(f"{D1=}")
    print(f"{D_bias1=}")
    print(f"{M_unbias1=}")
    print(f"{M2_1=}")

    # Load the COMPAS table with the default attribute ("race") and print it.
    # Note: COMPASData reads its CSV from a remote URL.
    df = COMPASData()
    print(f"{df=}")