# Source code for bias_amplification.utils.datacreator

# Importing Libraries
import numpy as np
import pandas as pd


# validator
def validate_error_percent(error_percent, name):
    """
    Ensure an error percentage lies in the closed interval [0, 0.25].

    Parameters
    ----------
    error_percent: float
        The value to check.
    name: str
        Label of the value being checked; it is used as the prefix of the
        error message (``"<name>_percent ..."``).

    Raises
    ------
    ValueError:
        If ``error_percent`` is not within [0, 0.25].
    """
    within_bounds = 0 <= error_percent <= 0.25
    if not within_bounds:
        raise ValueError(f"{name}_percent needs to be a float value in [0,0.25].")
# dataCreator function
def dataCreator(N=512, error_percent=0.1, shuffle=False, data_error_percent=None):
    """
    Generate a synthetic dataset for the bias amplification analysis.

    The dataset is laid out in four consecutive parts (quarters):

    * Part A: P=0, D=0
    * Part B: P=0, D=1
    * Part C: P=1, D=0
    * Part D: P=1, D=1

    Errors are injected as paired label swaps (one 0 -> 1 flip and one
    1 -> 0 flip), so the number of positive labels in each output stays
    equal to that of ``D``.

    Parameters
    ----------
    N: int, default=512
        The size of the dataset, must be an integer >= 3.
    error_percent: float, default=0.1
        The error percentage of the model. The value is halved internally
        and the halved value must lie in [0, 0.25], i.e. the argument itself
        must lie in [0, 0.5].
    shuffle: boolean, default=False
        Whether to shuffle the data. The same permutation is applied to
        every returned array so that rows stay aligned.
    data_error_percent: float, default=None
        The error percentage of the data. Halved and validated like
        ``error_percent``. When None, the (already halved) model error
        percentage is used.

    Returns
    -------
    P: numpy.ndarray, shape (N,)
        The protected attributes.
    D: numpy.ndarray, shape (N,)
        The ground truth task labels.
    D_bias: numpy.ndarray, shape (N,)
        The biased task data labels.
    M_unbias: numpy.ndarray, shape (N,)
        The model predictions without bias.
    M2: numpy.ndarray, shape (N,)
        The model predictions with bias.

    Raises
    ------
    ValueError:
        If ``N`` is not an integer >= 3, or if either halved error
        percentage falls outside [0, 0.25].

    Examples
    --------
    >>> from bias_amplification.utils.datacreator import dataCreator
    >>> P, D, D_bias, M_unbias, M2 = dataCreator(
    ...     N=1000, error_percent=0.1, shuffle=False, data_error_percent=0.05)
    """
    # Check the type before comparing so that non-numeric input produces the
    # documented ValueError instead of a TypeError from ``N < 3``.
    if isinstance(N, bool) or not isinstance(N, (int, np.integer)) or N < 3:
        raise ValueError(f"N must be an integer >= 3. Got {N}")
    N = int(N)

    error_percent = error_percent / 2
    validate_error_percent(error_percent, "error")
    if data_error_percent is None:
        data_error_percent = error_percent
    else:
        data_error_percent = data_error_percent / 2
    validate_error_percent(data_error_percent, "data_error")

    # Calculating partitions
    q1 = N // 4
    q2 = N // 2
    q3 = 3 * N // 4

    # Initialize protected attribute P
    # First half: group 0, Second half: group 1
    P = np.zeros(N)
    P[q2:] = 1

    # Initialize ground truth task label D
    # First quarter: 0, Second quarter: 1, Third quarter: 0, Fourth quarter: 1
    D = np.zeros(N)
    D[q1:q2] = 1
    D[q3:] = 1

    M_unbias = D.copy()  # Model without bias
    M2 = D.copy()  # Model with bias
    D_bias = D.copy()  # Bias in the data

    num_errors = int(N * error_percent)
    num_data_errors = int(N * data_error_percent)

    # np.arange keeps an integer dtype even when a quarter is empty (small N),
    # so the sampled positions are always usable as index arrays.
    A_pos = np.arange(q1)  # First quarter positions
    C_pos = np.arange(q2, q3)  # Third quarter positions

    # For M_unbias: introducing balanced error across quarters 1 and 3
    # Randomly choosing num_errors // 2 indices from quarter 1
    swaps_m_unbias_in_A = np.random.choice(A_pos, num_errors // 2, replace=False)
    # Randomly choosing the remaining indices from quarter 3
    swaps_m_unbias_in_C = np.random.choice(
        C_pos, num_errors - num_errors // 2, replace=False
    )
    # Flip 0 -> 1 in quarter 1 and 1 -> 0 at the partner positions in quarter 2
    M_unbias[swaps_m_unbias_in_A] = 1
    M_unbias[swaps_m_unbias_in_A + q1] = 0
    # Flip 0 -> 1 in quarter 3 and 1 -> 0 at the partner positions in quarter 4.
    # The offset is the distance between the starts of quarters 3 and 4; it
    # equals q1 when N is a multiple of 4 and stays inside quarter 4 otherwise.
    M_unbias[swaps_m_unbias_in_C] = 1
    M_unbias[swaps_m_unbias_in_C + (q3 - q2)] = 0

    # For M2: introducing all errors in quarter 1 only
    swaps_m_bias_in_A = np.random.choice(A_pos, num_errors, replace=False)
    # Flip 0 -> 1 in quarter 1 and 1 -> 0 at the partner positions in quarter 4
    M2[swaps_m_bias_in_A] = 1
    M2[swaps_m_bias_in_A + q3] = 0

    # For D_bias: introducing bias in quarter 1 only in the data
    swaps_d_bias_in_A = np.random.choice(A_pos, num_data_errors, replace=False)
    # Flip 0 -> 1 in quarter 1 and 1 -> 0 at the partner positions in quarter 4
    D_bias[swaps_d_bias_in_A] = 1
    D_bias[swaps_d_bias_in_A + q3] = 0

    if shuffle:
        # Every output must receive the same permutation, D_bias included,
        # otherwise the rows of the returned arrays no longer correspond.
        permut = np.random.permutation(N)
        P = P[permut]
        D = D[permut]
        D_bias = D_bias[permut]
        M_unbias = M_unbias[permut]
        M2 = M2[permut]

    return P, D, D_bias, M_unbias, M2
# Stability Experiment
def StabilityExp(N, data_error_w=0.5, model_error_w=0.2, poly_pow=4, data_range=(0, 1)):
    """
    Generate synthetic data for the stability experiment.

    Draws ``N`` uniform samples ``A`` from ``data_range``, builds a random
    polynomial with ``poly_pow + 1`` integer coefficients drawn from [0, 10),
    and evaluates it at ``A`` perturbed by one shared random noise vector,
    scaled by ``data_error_w`` for the data and ``model_error_w`` for the model.

    Parameters
    ----------
    N: int
        Number of samples to generate.
    data_error_w: float, default=0.5
        Weight of the noise added to ``A`` before computing ``D``.
    model_error_w: float, default=0.2
        Weight of the noise added to ``A`` before computing ``M``.
    poly_pow: int, default=4
        Highest power of the random polynomial.
    data_range: tuple, default=(0, 1)
        ``(min, max)`` range from which ``A`` is sampled.

    Returns
    -------
    A: numpy.ndarray, shape (N,)
        The sampled inputs.
    D: numpy.ndarray, shape (N,)
        Polynomial evaluated at ``A + data_error_w * error``.
    M: numpy.ndarray, shape (N,)
        Polynomial evaluated at ``A + model_error_w * error``.
    """
    data_min, data_max = data_range
    A = data_min + (np.random.random(N) * (data_max - data_min))
    coeffs = np.random.randint(10, size=1 + poly_pow)
    polynom = np.poly1d(coeffs)
    # The same noise vector feeds both outputs; only its weight differs.
    error = np.random.random(N)
    D = polynom(A + data_error_w * error)
    M = polynom(A + model_error_w * error)
    return A, D, M


# COMPAS Dataset
COMPAS_SENSITIVE_ATTRS = ["sex", "race", "age"]


def COMPASData(attributes="race"):
    """
    Download the ProPublica COMPAS two-year recidivism data and filter it.

    Parameters
    ----------
    attributes: str or iterable of str, default="race"
        Sensitive attribute name(s); each must be one of
        ``COMPAS_SENSITIVE_ATTRS``. They are only validated here; the
        returned frame always contains every column of the source file.

    Returns
    -------
    df: pandas.DataFrame
        The filtered records with a fresh 0..n-1 index.

    Raises
    ------
    ValueError:
        If any requested attribute is not in ``COMPAS_SENSITIVE_ATTRS``.
    """
    # Validate before touching the network so bad arguments fail fast.
    if isinstance(attributes, str):
        attributes = [attributes]
    for item in attributes:
        if item not in COMPAS_SENSITIVE_ATTRS:
            raise ValueError(
                f"{item} not in known sensitive attribute list: {COMPAS_SENSITIVE_ATTRS}"
            )

    df = pd.read_csv(
        "https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"
    )

    # Removing records where charged for alternative reasons.
    # Reference below:
    # Jeff Larson, Surya Mattu, Lauren Kirchner, and Julia Angwin.
    # How we analyzed the compas recidivism algorithm. 2016.
    # URL: https://www.propublica.org/article/how-we-analyzed-the-compas-recidivism-algorithm.
    #
    # NOTE(review): pandas' default NA parsing may already turn "N/A" cells
    # into NaN, in which case the score_text comparison never filters
    # anything - confirm against the downloaded file.
    df = df[
        (df["days_b_screening_arrest"] <= 30)
        & (df["days_b_screening_arrest"] >= -30)
        & (df["is_recid"] != -1)
        & (df["c_charge_degree"] != "O")
        & (df["score_text"] != "N/A")
    ].reset_index(drop=True)

    return df


if __name__ == "__main__":
    # Small demo: print one synthetic dataset and the filtered COMPAS frame.
    P1, D1, D_bias1, M_unbias1, M2_1 = dataCreator(32, 0.1, False, 0.2)
    print(f"{P1=}")
    print(f"{D1=}")
    print(f"{D_bias1=}")
    print(f"{M_unbias1=}")
    print(f"{M2_1=}")

    df = COMPASData()
    print(f"{df=}")