In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt
import numpy as np
from statsmodels.stats.proportion import proportions_ztest
from scipy.stats import chi2_contingency
import warnings
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import TwoSlopeNorm
import matplotlib
In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Specify the folder path
folder_path = 'csvs/IncarcerationSupervision'

# List all files in the folder
files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialize a dictionary to store dataframes and their corresponding file names
dataframes = {}

# Loop through each file and read it into a dataframe
for i, file in enumerate(files, start=1):
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dataframes[file] = df

# Iterate over each dataframe and print info() and head()
for file_name, df in dataframes.items():
    print(f"DataFrame {file_name}:")
    print("Info:")
    print(df.info())
    print("\nHead:")
    print(df.head())
    print("\n---\n")
DataFrame IncarcerationSupervisionDisciplinary_00001.csv:
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54524 entries, 0 to 54523
Data columns (total 8 columns):
 #   Column                                          Non-Null Count  Dtype 
---  ------                                          --------------  ----- 
 0   INCARCERATION_SUPERVISION_ID                    54524 non-null  int64 
 1   INCAR_DISCIPLINARY_VIOL_CODE                    54524 non-null  int64 
 2   INCAR_DISCIPLINARY_ACTION_CODE                  54524 non-null  object
 3   INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC     54524 non-null  object
 4   INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  54524 non-null  object
 5   INCAR_SUPERV_CREATED_DATE                       54524 non-null  object
 6   UNIQUE_CORRELATION_ID                           54524 non-null  object
 7   CORRELATION_ID                                  54524 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB
None

Head:
   INCARCERATION_SUPERVISION_ID  INCAR_DISCIPLINARY_VIOL_CODE  \
0                       3853839                           314   
1                       3853839                            61   
2                       3853839                           314   
3                       3853839                           917   
4                       3853839                           927   

  INCAR_DISCIPLINARY_ACTION_CODE INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC  \
0                             DC                    Disciplinary Confinement   
1                             GT                           Loss of Gain Time   
2                             GT                           Loss of Gain Time   
3                             CC                     Cashless Canteen Denial   
4                             GT                           Loss of Gain Time   

      INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  \
0  Unauthorized possession/use of communication d...   
1                 Disobeying verbal or written order   
2  Unauthorized possession/use of communication d...   
3                                 Disorderly conduct   
4                          Use of unauthorized drugs   

     INCAR_SUPERV_CREATED_DATE                 UNIQUE_CORRELATION_ID  \
0  2024-10-16 10:19:19.3990000  72a3a208-c80d-4e43-95ac-90c25ab873a9   
1  2024-10-16 10:19:19.3990000  72a3a208-c80d-4e43-95ac-90c25ab873a9   
2  2024-10-16 10:19:19.3990000  72a3a208-c80d-4e43-95ac-90c25ab873a9   
3  2024-10-16 10:19:19.3990000  72a3a208-c80d-4e43-95ac-90c25ab873a9   
4  2024-10-16 10:19:19.3990000  72a3a208-c80d-4e43-95ac-90c25ab873a9   

                         CORRELATION_ID  
0  0f4f4bcc-84f6-484c-9af8-f0657d877b95  
1  0f4f4bcc-84f6-484c-9af8-f0657d877b95  
2  0f4f4bcc-84f6-484c-9af8-f0657d877b95  
3  0f4f4bcc-84f6-484c-9af8-f0657d877b95  
4  0f4f4bcc-84f6-484c-9af8-f0657d877b95  

---

DataFrame IncarcerationSupervisionDisciplinary_00000.csv:
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574092 entries, 0 to 574091
Data columns (total 8 columns):
 #   Column                                          Non-Null Count   Dtype 
---  ------                                          --------------   ----- 
 0   INCARCERATION_SUPERVISION_ID                    574092 non-null  int64 
 1   INCAR_DISCIPLINARY_VIOL_CODE                    574092 non-null  int64 
 2   INCAR_DISCIPLINARY_ACTION_CODE                  574092 non-null  object
 3   INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC     574092 non-null  object
 4   INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  574092 non-null  object
 5   INCAR_SUPERV_CREATED_DATE                       574092 non-null  object
 6   UNIQUE_CORRELATION_ID                           574092 non-null  object
 7   CORRELATION_ID                                  574092 non-null  object
dtypes: int64(2), object(6)
memory usage: 35.0+ MB
None

Head:
   INCARCERATION_SUPERVISION_ID  INCAR_DISCIPLINARY_VIOL_CODE  \
0                            26                            52   
1                            26                            61   
2                            26                           934   
3                            26                           916   
4                            26                            23   

  INCAR_DISCIPLINARY_ACTION_CODE INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC  \
0                             DC                    Disciplinary Confinement   
1                             VR                            Verbal Reprimand   
2                             DC                    Disciplinary Confinement   
3                             GT                           Loss of Gain Time   
4                             DC                    Disciplinary Confinement   

      INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  \
0            Failure to comply with count procedures   
1                 Disobeying verbal or written order   
2  Tampering/defeating any fire or other safety d...   
3  Refusing to work or participate in mandatory p...   
4  Creating, participating in or inciting a minor...   

     INCAR_SUPERV_CREATED_DATE                 UNIQUE_CORRELATION_ID  \
0  2022-02-25 17:43:59.9930000  20042f87-0e82-422e-8844-3171a46edd08   
1  2022-02-25 17:43:59.9930000  20042f87-0e82-422e-8844-3171a46edd08   
2  2022-02-25 17:43:59.9930000  20042f87-0e82-422e-8844-3171a46edd08   
3  2022-02-25 17:43:59.9930000  20042f87-0e82-422e-8844-3171a46edd08   
4  2022-02-25 17:43:59.9930000  20042f87-0e82-422e-8844-3171a46edd08   

                         CORRELATION_ID  
0  20042f87-0e82-422e-8844-3171a46edd08  
1  20042f87-0e82-422e-8844-3171a46edd08  
2  20042f87-0e82-422e-8844-3171a46edd08  
3  20042f87-0e82-422e-8844-3171a46edd08  
4  20042f87-0e82-422e-8844-3171a46edd08  

---

DataFrame Incarcerationsupervision_00000.csv:
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162196 entries, 0 to 162195
Data columns (total 29 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   INCARCERATION_SUPERVISION_ID           162196 non-null  int64  
 1   UNIQUE_CORRELATION_ID                  162196 non-null  object 
 2   OWNER_ORI                              162196 non-null  object 
 3   PERSON_ID                              162196 non-null  int64  
 4   MDM_PERSON_ID                          161521 non-null  float64
 5   RACE_CODE                              162196 non-null  object 
 6   SEX_CODE                               162196 non-null  object 
 7   ETHNICITY_CODE                         162196 non-null  object 
 8   CUSTODY_ADMISSION_DATE                 162196 non-null  object 
 9   SUPERVISION_BEGIN_DATE                 3592 non-null    object 
 10  CURRENT_INSTITUTION_ORI_TYP_CODE       162196 non-null  object 
 11  INCAR_CUSTODY_LEVEL_TYPE_CODE          134267 non-null  float64
 12  CORRECTION_ADMISSION_REASON_CODE       162196 non-null  object 
 13  PRIOR_INCARCERATION_IND                162196 non-null  bool   
 14  CUSTODY_RELEASE_REASON_CODE            23950 non-null   float64
 15  INCARC_PROG_PARTICIPATION_IND          162196 non-null  bool   
 16  SUPERVISION_CATEGORY_CODE              3592 non-null    object 
 17  HIGHEST_EDUCATION_LEVEL                154821 non-null  object 
 18  SEXUAL_OFFENDER_IND                    162196 non-null  bool   
 19  GANG_AFFILIATION_IND                   162196 non-null  bool   
 20  INMATE_AGE                             162196 non-null  int64  
 21  CORRECTION_ADMISSION_REASON_DESC       162196 non-null  object 
 22  CURRENT_INSTITUTION_ORI_TYPE_DESC      162196 non-null  object 
 23  CUSTODY_RELEASE_REASON_DESC            23950 non-null   object 
 24  INCARCERATION_CUSTODY_LEVEL_TYPE_DESC  134267 non-null  object 
 25  SUPERVISION_CATEGORY_DESC              3592 non-null    object 
 26  AGENCY_NAME                            162196 non-null  object 
 27  COUNTY_DESCRIPTION                     162196 non-null  object 
 28  STATE                                  162196 non-null  object 
dtypes: bool(4), float64(3), int64(3), object(19)
memory usage: 31.6+ MB
None

Head:
   INCARCERATION_SUPERVISION_ID                 UNIQUE_CORRELATION_ID  \
0                       2642920  5f3f8275-6d04-4a7b-8a81-8c3e4534560d   
1                       2642921  5716fe66-fba9-45f8-893c-0f3d41e97fd5   
2                       2642928  adfe5180-01b4-46a9-b70f-4ca17627dbf3   
3                       2642948  d92fca90-b85f-4485-9c1c-f4beed7298f3   
4                       2642958  032a39ff-5072-4ce2-9d60-90fb3cc97a74   

   OWNER_ORI  PERSON_ID  MDM_PERSON_ID RACE_CODE SEX_CODE  \
0  FL037275C   57675189       708281.0     White   Female   
1  FL037275C   57675190       647720.0     White     Male   
2  FL037275C   57675199      1309746.0     White     Male   
3  FL037275C   57675291      1612413.0     White     Male   
4  FL037275C   57675329       276821.0     Black     Male   

           ETHNICITY_CODE       CUSTODY_ADMISSION_DATE SUPERVISION_BEGIN_DATE  \
0  Not Hispanic or Latino  2022-12-19 00:00:00.0000000                    NaN   
1  Not Hispanic or Latino  2005-05-04 00:00:00.0000000                    NaN   
2  Not Hispanic or Latino  2023-02-01 00:00:00.0000000                    NaN   
3  Not Hispanic or Latino  2017-01-26 00:00:00.0000000                    NaN   
4  Not Hispanic or Latino  2022-08-24 00:00:00.0000000                    NaN   

  CURRENT_INSTITUTION_ORI_TYP_CODE  INCAR_CUSTODY_LEVEL_TYPE_CODE  \
0                        FL042375C                            NaN   
1                        FL048055C                            4.0   
2                        FL017045C                            1.0   
3                        FL057025C                            4.0   
4                        FL032125C                            4.0   

             CORRECTION_ADMISSION_REASON_CODE  PRIOR_INCARCERATION_IND  \
0                              New Conviction                    False   
1                              New Conviction                    False   
2          Violation of Probation - Technical                    False   
3                              New Conviction                     True   
4  Violation of Probation - New Law Violation                     True   

   CUSTODY_RELEASE_REASON_CODE  INCARC_PROG_PARTICIPATION_IND  \
0                         40.0                          False   
1                          NaN                           True   
2                          NaN                          False   
3                          NaN                           True   
4                          NaN                           True   

  SUPERVISION_CATEGORY_CODE HIGHEST_EDUCATION_LEVEL  SEXUAL_OFFENDER_IND  \
0                       NaN             Ninth Grade                False   
1                       NaN           Twelfth Grade                 True   
2                       NaN           Twelfth Grade                False   
3                       NaN             Ninth Grade                False   
4                       NaN             Tenth Grade                 True   

   GANG_AFFILIATION_IND  INMATE_AGE  \
0                 False          37   
1                 False          43   
2                 False          40   
3                  True          33   
4                 False          50   

             CORRECTION_ADMISSION_REASON_DESC  \
0                              New Conviction   
1                              New Conviction   
2          Violation of Probation - Technical   
3                              New Conviction   
4  Violation of Probation - New Law Violation   

           CURRENT_INSTITUTION_ORI_TYPE_DESC CUSTODY_RELEASE_REASON_DESC  \
0            FLORIDA WOMENS RECEPTION CENTER                    Released   
1      CENTRAL FLORIDA RECEPTION CENTER MAIN                         NaN   
2          PENSACOLA COMUNITY RELEASE CENTER                         NaN   
3  SANTA ROSA CORRECTIONAL INSTITUTION ANNEX                         NaN   
4           GRACEVILLE CORRECTIONAL FACILITY                         NaN   

  INCARCERATION_CUSTODY_LEVEL_TYPE_DESC SUPERVISION_CATEGORY_DESC  \
0                                   NaN                       NaN   
1                                 Close                       NaN   
2                             Community                       NaN   
3                                 Close                       NaN   
4                                 Close                       NaN   

                                 AGENCY_NAME COUNTY_DESCRIPTION    STATE  
0            FLORIDA WOMENS RECEPTION CENTER             Marion  Florida  
1      CENTRAL FLORIDA RECEPTION CENTER MAIN             Orange  Florida  
2          PENSACOLA COMUNITY RELEASE CENTER           Escambia  Florida  
3  SANTA ROSA CORRECTIONAL INSTITUTION ANNEX         Santa Rosa  Florida  
4           GRACEVILLE CORRECTIONAL FACILITY            Jackson  Florida  

---

DataFrame IncarcerationSupervisionCharge_00000.csv:
Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979 entries, 0 to 978
Data columns (total 29 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   INCARCERATION_SUPERVISION_ID       979 non-null    int64  
 1   CONVICTION_COUNTY                  979 non-null    int64  
 2   CONCURRENT_SENTENCES_SERVED_DAYS   979 non-null    int64  
 3   CONSECUTIVE_SENTENCES_SERVED_DAYS  979 non-null    int64  
 4   SENTENCE_SERVED_DAYS               979 non-null    int64  
 5   SUP_ASSIGNED_TERM_DURATION_DAYS    979 non-null    int64  
 6   SUP_ASSIGNED_MAXIMUM_TERM_CODE     0 non-null      float64
 7   GAIN_TIME_EARNED                   87 non-null     float64
 8   STATUTE                            979 non-null    object 
 9   STATUTE_CHAPTER_CODE               979 non-null    int64  
 10  STATUTE_SECTION                    979 non-null    int64  
 11  STATUTE_SUBSECTION                 340 non-null    object 
 12  FCIC_CODE                          979 non-null    int64  
 13  CHARGE_LEVEL                       979 non-null    object 
 14  CHARGE_DEGREE                      979 non-null    object 
 15  SENTENCE_STATUS_CODE               979 non-null    int64  
 16  MINIMUM_TERM_DURATION_DAYS         979 non-null    int64  
 17  MAXIMUM_TERM_DURATION_DAYS         979 non-null    int64  
 18  MAXIMUM_TERM_CODE                  0 non-null      float64
 19  HABITUAL_OFNDR_IND                 979 non-null    bool   
 20  HABITUAL_VIOL_FELONY_OFNDR_IND     979 non-null    bool   
 21  PRISON_RELEASEE_REOFNDR_IND        979 non-null    bool   
 22  VIOLENT_CAREER_CRIM_IND            979 non-null    bool   
 23  SENTENCE_PROBATION_DURATION_DAYS   979 non-null    int64  
 24  SUPERVISION_SERVED_DAYS            979 non-null    int64  
 25  SUPERVISION_REVOCATION_REASON      0 non-null      float64
 26  DRUG_TYPE_DESC                     4 non-null      object 
 27  OFFENSE_FCIC_TYPE_DESC             979 non-null    object 
 28  SENTENCE_STATUS                    979 non-null    object 
dtypes: bool(4), float64(4), int64(14), object(7)
memory usage: 195.2+ KB
None

Head:
   INCARCERATION_SUPERVISION_ID  CONVICTION_COUNTY  \
0                        155880                 13   
1                        157651                 52   
2                        158929                  6   
3                         52577                 52   
4                         54032                  5   

   CONCURRENT_SENTENCES_SERVED_DAYS  CONSECUTIVE_SENTENCES_SERVED_DAYS  \
0                                 0                                  0   
1                                 0                                  0   
2                                 0                                  0   
3                                 0                                  0   
4                                 0                                  0   

   SENTENCE_SERVED_DAYS  SUP_ASSIGNED_TERM_DURATION_DAYS  \
0                     0                              731   
1                     0                             1096   
2                     0                             1096   
3                     0                             1461   
4                     0                              731   

   SUP_ASSIGNED_MAXIMUM_TERM_CODE  GAIN_TIME_EARNED      STATUTE  \
0                             NaN               NaN       893.13   
1                             NaN               NaN       893.13   
2                             NaN               NaN       893.13   
3                             NaN               NaN   893.13.1a2   
4                             NaN               NaN  812.014.2c1   

   STATUTE_CHAPTER_CODE  STATUTE_SECTION STATUTE_SUBSECTION  FCIC_CODE  \
0                   893               13                NaN       3581   
1                   893               13                NaN       3599   
2                   893               13                NaN       3599   
3                   893               13                1a2       3562   
4                   812               14                2c1       2399   

  CHARGE_LEVEL CHARGE_DEGREE  SENTENCE_STATUS_CODE  \
0       Felony  Third Degree                     8   
1       Felony  Third Degree                     8   
2       Felony  Third Degree                     8   
3       Felony  Third Degree                     8   
4       Felony  Third Degree                     8   

   MINIMUM_TERM_DURATION_DAYS  MAXIMUM_TERM_DURATION_DAYS  MAXIMUM_TERM_CODE  \
0                           0                           0                NaN   
1                           0                           0                NaN   
2                           0                           0                NaN   
3                           0                           0                NaN   
4                           0                           0                NaN   

   HABITUAL_OFNDR_IND  HABITUAL_VIOL_FELONY_OFNDR_IND  \
0               False                           False   
1               False                           False   
2               False                           False   
3               False                           False   
4               False                           False   

   PRISON_RELEASEE_REOFNDR_IND  VIOLENT_CAREER_CRIM_IND  \
0                        False                    False   
1                        False                    False   
2                        False                    False   
3                        False                    False   
4                        False                    False   

   SENTENCE_PROBATION_DURATION_DAYS  SUPERVISION_SERVED_DAYS  \
0                                 0                      731   
1                                 0                     1096   
2                                 0                     1096   
3                                 0                     1461   
4                                 0                      731   

   SUPERVISION_REVOCATION_REASON DRUG_TYPE_DESC OFFENSE_FCIC_TYPE_DESC  \
0                            NaN            NaN       Barbiturate-Sell   
1                            NaN            NaN        Dangerous Drugs   
2                            NaN            NaN        Dangerous Drugs   
3                            NaN            NaN      Marijuana-Possess   
4                            NaN            NaN                Larceny   

  SENTENCE_STATUS  
0  Not Applicable  
1  Not Applicable  
2  Not Applicable  
3  Not Applicable  
4  Not Applicable  

---

In [3]:
df = dataframes['IncarcerationSupervisionDisciplinary_00000.csv']

df['INCAR_SUPERV_CREATED_DATE'] = pd.to_datetime(df['INCAR_SUPERV_CREATED_DATE'])

# Print the range of dates
print(f"Range of Dates: from {df['INCAR_SUPERV_CREATED_DATE'].min()} to {df['INCAR_SUPERV_CREATED_DATE'].max()}")

# Plot the distribution of dates
plt.figure(figsize=(12, 6))
plt.hist(df['INCAR_SUPERV_CREATED_DATE'], bins=30, edgecolor='black')
plt.title('Distribution of INCAR_SUPERV_CREATED_DATE')
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
Range of Dates: from 2022-02-25 17:43:59.993000 to 2025-08-14 08:00:02.897000
No description has been provided for this image
In [4]:
df = dataframes['IncarcerationSupervisionDisciplinary_00001.csv']

df['INCAR_SUPERV_CREATED_DATE'] = pd.to_datetime(df['INCAR_SUPERV_CREATED_DATE'])

# Print the range of dates
print(f"Range of Dates: from {df['INCAR_SUPERV_CREATED_DATE'].min()} to {df['INCAR_SUPERV_CREATED_DATE'].max()}")

# Plot the distribution of dates
plt.figure(figsize=(12, 6))
plt.hist(df['INCAR_SUPERV_CREATED_DATE'], bins=30, edgecolor='black')
plt.title('Distribution of INCAR_SUPERV_CREATED_DATE')
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()
Range of Dates: from 2024-10-16 10:19:19.399000 to 2024-10-16 12:07:30.478000
No description has been provided for this image
In [5]:
df0 = dataframes['IncarcerationSupervisionDisciplinary_00000.csv']
df1 = dataframes['IncarcerationSupervisionDisciplinary_00001.csv']

df0['INCAR_SUPERV_CREATED_DATE'] = pd.to_datetime(df0['INCAR_SUPERV_CREATED_DATE'])
df1['INCAR_SUPERV_CREATED_DATE'] = pd.to_datetime(df1['INCAR_SUPERV_CREATED_DATE'])

min0, max0 = df0['INCAR_SUPERV_CREATED_DATE'].min(), df0['INCAR_SUPERV_CREATED_DATE'].max()
min1, max1 = df1['INCAR_SUPERV_CREATED_DATE'].min(), df1['INCAR_SUPERV_CREATED_DATE'].max()

# --- theme colors ---
bg = "#181f22"
text = "#fff"
bar = "#22eff5"
accent = "#8af6e6"
grid = "#25333b"

fig, axs = plt.subplots(2, 1, figsize=(11, 7), sharex=True, facecolor=bg)

for ax in axs:
    ax.set_facecolor(bg)
    ax.tick_params(axis='x', colors=text)
    ax.tick_params(axis='y', colors=text)
    for side in ['bottom', 'top', 'left', 'right']:
        ax.spines[side].set_color(text)
    ax.title.set_color(text)
    ax.yaxis.label.set_color(text)
    ax.xaxis.label.set_color(text)
    ax.grid(axis='y', color=grid, linestyle=':', alpha=0.4)

# First file
axs[0].hist(df0['INCAR_SUPERV_CREATED_DATE'], bins=30, edgecolor=bg, color=bar)
axs[0].set_title('00000.csv: INCAR_SUPERV_CREATED_DATE')
axs[0].set_ylabel('Frequency')
axs[0].text(0.01, 0.92, f'Range: {min0.date()} to {max0.date()}', transform=axs[0].transAxes, fontsize=10, color=accent, fontweight='bold')

# Second file
axs[1].hist(df1['INCAR_SUPERV_CREATED_DATE'], bins=30, edgecolor=bar, color=bar)
axs[1].set_title('00001.csv: INCAR_SUPERV_CREATED_DATE')
axs[1].set_xlabel('Date')
axs[1].set_ylabel('Frequency')
axs[1].text(0.01, 0.92, f'Range: {min1.date()} to {max1.date()}', transform=axs[1].transAxes, fontsize=10, color=accent, fontweight='bold')

fig.autofmt_xdate(rotation=45)
plt.tight_layout()
plt.savefig("assets/date_distributions_composite.png", bbox_inches='tight', dpi=160, facecolor=bg)
plt.show()
No description has been provided for this image
In [6]:
df_main = dataframes['IncarcerationSupervisionDisciplinary_00000.csv']
df_patch = dataframes['IncarcerationSupervisionDisciplinary_00001.csv']

cols_for_identity = [
    'INCARCERATION_SUPERVISION_ID',
    'INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC',
    'INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'
]


patch_id_set = set(tuple(x) for x in df_patch[cols_for_identity].to_records(index=False))
main_id_set = set(tuple(x) for x in df_main[cols_for_identity].to_records(index=False))

in_patch_not_in_main = patch_id_set - main_id_set
in_main_not_in_patch = main_id_set - patch_id_set
overlap = patch_id_set & main_id_set  # intersection

print(f"Logical records in both main and patch: {len(overlap)}")
print(f"Logical records only in patch: {len(in_patch_not_in_main)}")
print(f"Logical records only in main: {len(in_main_not_in_patch)}")
print(f"Total logical records in patch: {len(patch_id_set)}")
print(f"Total logical records in main: {len(main_id_set)}")
print(f"Total union (all unique logical records): {len(patch_id_set | main_id_set)}")
Logical records in both main and patch: 1740
Logical records only in patch: 52784
Logical records only in main: 572352
Total logical records in patch: 54524
Total logical records in main: 574092
Total union (all unique logical records): 626876
In [7]:
# Check uniqueness in main
main_dupes = df_main['UNIQUE_CORRELATION_ID'].duplicated().sum()
print(f"Duplicated UNIQUE_CORRELATION_ID in main: {main_dupes}")
main_dupes = df_main['INCARCERATION_SUPERVISION_ID'].duplicated().sum()
print(f"Duplicated INCARCERATION_SUPERVISION_ID in main: {main_dupes}")
main_dupes = df_main['CORRELATION_ID'].duplicated().sum()
print(f"Duplicated CORRELATION_ID in main: {main_dupes}")

# Same for patch
patch_dupes = df_patch['UNIQUE_CORRELATION_ID'].duplicated().sum()
print(f"Duplicated UNIQUE_CORRELATION_ID in patch: {patch_dupes}")
patch_dupes = df_patch['INCARCERATION_SUPERVISION_ID'].duplicated().sum()
print(f"Duplicated INCARCERATION_SUPERVISION_ID in patch: {patch_dupes}")
patch_dupes = df_patch['CORRELATION_ID'].duplicated().sum()
print(f"Duplicated CORRELATION_ID in patch: {patch_dupes}")
Duplicated UNIQUE_CORRELATION_ID in main: 495824
Duplicated INCARCERATION_SUPERVISION_ID in main: 495824
Duplicated CORRELATION_ID in main: 495824
Duplicated UNIQUE_CORRELATION_ID in patch: 47204
Duplicated INCARCERATION_SUPERVISION_ID in patch: 47204
Duplicated CORRELATION_ID in patch: 47204
In [8]:
# For violation types:
violation_map = df_main.groupby('INCAR_DISCIPLINARY_VIOL_CODE')['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'].nunique()
print("Number of violation codes with >1 unique description:", (violation_map > 1).sum())

desc_map = df_main.groupby('INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC')['INCAR_DISCIPLINARY_VIOL_CODE'].nunique()
print("Number of descriptions with >1 unique violation code:", (desc_map > 1).sum())

# For action types:
action_map = df_main.groupby('INCAR_DISCIPLINARY_ACTION_CODE')['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'].nunique()
print("Number of action codes with >1 unique description:", (action_map > 1).sum())

action_desc_map = df_main.groupby('INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC')['INCAR_DISCIPLINARY_ACTION_CODE'].nunique()
print("Number of action descriptions with >1 unique action code:", (action_desc_map > 1).sum())
Number of violation codes with >1 unique description: 0
Number of descriptions with >1 unique violation code: 0
Number of action codes with >1 unique description: 0
Number of action descriptions with >1 unique action code: 0
In [9]:
# Count the number of records per logical event
event_counts = df_patch.groupby([
    'INCARCERATION_SUPERVISION_ID',
    'INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC',
    'INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'
]).size().reset_index(name='count')

print(event_counts[event_counts['count'] > 1].head())
print(f"Number of (supervision/action/violation) combos with >1 record: {(event_counts['count'] > 1).sum()}")

# Count the number of records per logical event
event_counts = df_main.groupby([
    'INCARCERATION_SUPERVISION_ID',
    'INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC',
    'INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'
]).size().reset_index(name='count')

print(event_counts[event_counts['count'] > 1].head())
print(f"Number of (supervision/action/violation) combos with >1 record: {(event_counts['count'] > 1).sum()}")
Empty DataFrame
Columns: [INCARCERATION_SUPERVISION_ID, INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC, INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC, count]
Index: []
Number of (supervision/action/violation) combos with >1 record: 0
Empty DataFrame
Columns: [INCARCERATION_SUPERVISION_ID, INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC, INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC, count]
Index: []
Number of (supervision/action/violation) combos with >1 record: 0
In [10]:
informative_cols = [
    'INCARCERATION_SUPERVISION_ID',
    'INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC',
    'INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'
]

combo = pd.concat([df_main, df_patch], ignore_index=True)
combo = combo.drop_duplicates(informative_cols)
combo = combo[informative_cols]
print(combo.info(), combo.head())
<class 'pandas.core.frame.DataFrame'>
Index: 626876 entries, 0 to 628615
Data columns (total 3 columns):
 #   Column                                          Non-Null Count   Dtype 
---  ------                                          --------------   ----- 
 0   INCARCERATION_SUPERVISION_ID                    626876 non-null  int64 
 1   INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC     626876 non-null  object
 2   INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  626876 non-null  object
dtypes: int64(1), object(2)
memory usage: 19.1+ MB
None    INCARCERATION_SUPERVISION_ID INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC  \
0                            26                    Disciplinary Confinement   
1                            26                            Verbal Reprimand   
2                            26                    Disciplinary Confinement   
3                            26                           Loss of Gain Time   
4                            26                    Disciplinary Confinement   

      INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  
0            Failure to comply with count procedures  
1                 Disobeying verbal or written order  
2  Tampering/defeating any fire or other safety d...  
3  Refusing to work or participate in mandatory p...  
4  Creating, participating in or inciting a minor...  
In [11]:
# Get action outcome types, ordered by frequency
action_order = combo['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'].value_counts(ascending=False).index.tolist()

# Make the crosstab
top_violations = combo['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'].value_counts().index

crosstab = pd.crosstab(
    combo['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'],
    combo['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC']
).loc[top_violations, action_order]

crosstab_html = crosstab.to_html(classes="discipline-table display nowrap", border=0)

with open("discipline_sanctions_by_violation_top.html", "w") as f:
    f.write(crosstab_html)
crosstab.to_csv('assets/discipline_sanctions_by_violation.csv')
crosstab
Out[11]:
INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC Disciplinary Confinement Loss of Gain Time Cashless Canteen Denial Verbal Reprimand Privilege Suspension - Other Loss of Future Gain Time Disciplinary Squad Extra Duty Restitution Payment Alternative Housing Privilege Suspension - Mail, Visitation Privilege Suspension - Visitation Privilege Suspension - Mail Restricted Labor Squad Disciplinary Confinement Part Time
INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC
Disobeying verbal or written order 38343 16026 9898 5258 3002 2105 2433 1928 53 340 78 115 6 4 11
Disrespect persons of authority 21168 8103 5252 2384 1372 874 1390 597 3 368 18 37 4 22 7
Disorderly conduct 19639 7260 3803 1789 1011 898 944 509 5 235 12 28 1 3 2
Use of unauthorized drugs 19547 10021 3222 194 921 946 197 108 20 84 45 33 3 3 1
Fighting 18350 5975 3635 1701 859 734 271 198 6 134 5 9 1 5 0
Unauthorized possession/use of communication device/technology 18134 7422 1853 100 978 760 77 28 470 41 23 35 2 0 0
Possession/manufacture of weapons, ammunition or explosives 17369 6211 1485 35 615 1236 22 8 15 39 12 17 0 0 0
Spoken, written or gestured threats 13305 4855 1528 187 800 788 84 37 1 34 14 17 3 3 0
Possession of narcotics/unauthorized drugs and paraphernalia 12707 5910 944 26 199 509 31 26 1 4 14 7 0 0 1
Being in unauthorized area 5716 3794 3625 2566 781 571 1300 1097 1 126 21 12 0 7 0
Destruction of State or another property 7612 1832 1163 533 351 284 185 150 6049 28 4 0 1 2 2
Lewd or lascivious exhibition 9290 3995 1454 221 563 1083 27 11 0 16 18 75 0 1 1
Possession or transfer of other contraband 6225 4051 1665 1280 344 289 891 1512 157 18 8 7 4 3 0
Disobeying institutional regulations 4537 2832 1559 1523 466 400 671 1536 54 37 6 12 2 6 5
Battery or attempted battery on an inmate 8964 2948 746 27 232 463 5 6 1 1 3 7 0 0 0
Lying to staff or falsifying records 6633 2306 1231 669 252 134 496 331 8 43 8 8 2 2 2
Battery or attempted battery on correctional officer 6518 2740 713 10 316 611 0 0 2 1 1 11 1 0 0
Unauthorized absence from assigned area 3876 1715 1664 1270 242 143 670 413 1 18 6 3 0 0 0
Possession of aromatic stimulants or depressants 6617 2087 463 15 214 203 46 33 0 26 7 8 1 1 0
Personal or business relationship with staff member or volunteer 5955 2107 655 114 168 279 44 14 0 8 4 5 3 0 0
Possession of tobacco-related product by non-death row or non-community release inmate 4420 1917 1141 693 239 96 325 376 0 19 4 21 0 1 2
Aggravated battery or attempted on inmate 5612 2070 409 5 155 537 0 0 3 0 6 3 0 0 0
Telephone regulation violations 2745 1071 859 592 2300 83 276 351 1 6 179 23 4 2 0
Refusing to work or participate in mandatory programs 3534 1833 1216 941 221 95 302 144 0 35 1 9 0 2 0
Obscene or profane act, gesture or statement 4338 1978 490 123 161 206 54 14 0 14 19 8 1 0 0
Creating, participating in or inciting a minor disturbance 5246 1106 402 26 210 265 13 2 2 2 0 1 0 0 0
Failure to comply with count procedures 2718 1014 868 739 208 58 344 543 0 16 1 3 0 0 0
Misuse of State or another property 1432 1175 731 1746 343 438 25 59 87 3 1 0 0 0 0
Refusing to submit to substance abuse testing 3713 1681 380 49 33 89 6 1 0 6 9 2 0 0 0
Tampering/defeating/depriving staff of security device 3549 838 731 123 422 137 21 22 58 4 2 3 0 0 0
Attempt or conspire to commit crime or violate rules 2850 1294 527 152 348 110 68 36 13 14 36 23 32 1 1
Theft of property under $50 in value 1977 863 496 274 88 48 223 180 1010 17 0 2 0 1 0
Introduction of any contraband 2283 1169 210 81 43 78 25 33 1 3 23 43 7 0 0
Tattooing, branding or body art including piercing 1517 793 553 170 125 38 301 313 3 1 0 8 0 2 0
Possession of gang-related paraphernalia or material 2306 552 274 40 62 77 121 56 0 7 1 7 0 2 0
Possession of unauthorized beverages 1521 676 404 104 167 44 110 34 1 15 2 7 0 0 0
Failure to maintain acceptable hygiene/appearance of housing 687 510 231 1098 82 147 39 83 1 1 0 0 0 0 0
Sex acts or unauthorized physical contact involving inmates 1485 734 262 112 73 35 43 53 0 8 4 16 0 1 0
Tampering/defeating any fire or other safety device 1584 331 99 8 48 87 1 0 612 0 0 0 0 0 0
Failure to maintain personal hygiene/appearance 699 396 257 942 113 92 30 21 0 0 2 0 0 0 0
Inciting riots, strikes, mutinous acts or disturbances 1713 536 85 5 37 131 3 0 0 0 1 0 0 0 0
Possession of negotiables in unauthorized amounts, forms or location 1082 655 220 144 59 54 45 61 0 3 6 9 2 0 1
Other assault/battery or attempted 1412 755 2 1 54 64 3 4 2 0 0 0 0 0 0
Unauthorized/illegal financial transaction 1230 509 288 30 116 54 11 6 16 4 4 5 0 0 0
Participating in riots, strikes, mutinous acts or disturbances 1438 511 43 2 9 203 0 1 1 0 0 24 0 0 0
Altering/defacing State or another property 608 329 177 164 32 22 52 84 498 1 0 0 0 0 0
Gang-related activities 1309 299 158 8 53 44 30 17 1 3 2 0 1 0 0
Assault or attempted assault on a correctional officer 1031 319 27 1 12 76 0 0 0 0 0 2 0 0 0
Possession of stolen property 573 358 83 65 18 20 58 101 119 4 0 0 0 0 0
Mail regulation violations 633 238 83 28 78 8 27 57 0 0 111 0 121 0 0
Canteen Shortage over $50 598 192 64 17 9 6 12 4 454 5 0 0 0 0 0
Possession of escape paraphernalia 834 283 61 6 34 34 2 1 2 0 0 1 0 0 0
Possession of items facilitating unauthorized/illegal financial transaction 743 275 109 26 48 14 8 1 2 0 5 6 1 0 0
Bartering with others 522 345 134 64 33 26 40 66 3 2 0 3 0 0 0
Manufacture of drugs or unauthorized beverages 738 274 99 9 33 23 26 5 0 7 2 3 0 2 2
Gambling or possession of gambling paraphernalia 483 300 39 41 16 9 42 139 0 0 1 1 0 2 0
Unauthorized physical contact involving non-inmates 659 255 56 29 12 12 10 5 0 1 11 18 0 0 0
Aggravated battery or attempted on correctional officer 602 224 51 0 22 75 0 0 0 0 0 1 1 0 0
Theft of property exceeding $50 in value 457 163 43 15 8 8 12 3 220 1 0 0 0 0 0
Assault or attempted assault on an inmate 685 148 9 2 7 22 0 1 0 0 0 0 0 0 0
Battery or attempted battery on staff other than correctional officer 542 185 43 2 22 39 0 0 1 0 0 1 0 0 0
Possession of unauthorized clothing or linen 227 123 92 123 20 37 50 84 5 1 0 0 0 0 0
Trafficking in drugs or unauthorized beverages 459 210 47 0 13 19 1 0 0 0 2 0 0 0 0
Arson or attempted arson 445 109 38 3 24 31 1 1 55 2 0 0 0 0 0
Frivolous, malicious, false information presented to court 329 132 61 60 16 6 22 25 1 6 1 0 2 0 0
Assault/battery or attempted with deadly weapon 398 231 0 0 12 13 0 0 1 0 0 0 0 0 0
Destruction of State or another property by gross negligence 134 69 25 40 7 22 14 20 224 0 0 0 0 0 0
Aggravated assault or attempted on inmate 405 104 16 1 3 23 0 0 0 0 0 0 0 0 0
Use of Alcohol 281 131 32 16 7 11 5 6 0 3 0 0 0 0 0
Escape or escape attempt 261 173 13 2 5 13 0 0 1 0 6 4 0 0 0
Visiting regulation violations 161 100 19 29 38 7 6 14 0 0 22 41 0 0 0
Insufficient work 97 101 60 69 8 10 27 31 4 0 0 1 0 0 0
Missing count 217 77 37 29 1 2 14 2 0 1 0 0 0 0 0
Fraud or attempted fraud 231 94 18 4 9 5 6 2 8 0 1 0 0 0 0
Extortion or attempted extortion 245 85 18 2 11 9 4 0 1 0 0 0 0 0 0
Feigning illness or malingering 124 89 58 54 14 15 3 12 0 1 0 0 0 0 0
Loaning or borrowing money or other valuables 181 101 26 20 9 6 9 13 0 1 0 0 0 0 0
Wasting State or another property 103 46 35 32 10 2 27 16 72 0 0 0 0 0 0
Presenting false testimony or information 186 68 31 11 3 1 2 3 0 0 0 0 0 0 0
Possession of unauthorized or altered identification 117 56 43 29 10 12 11 21 5 1 0 0 0 0 0
Unauthorized use or tampering with computer or office equipment 174 53 23 7 4 1 0 0 0 0 0 0 8 0 0
Bribery or attempted bribery 154 53 13 8 1 1 3 0 0 0 1 0 0 0 0
Aggravated assault or attempted on correctional officer 150 52 4 0 0 10 0 0 0 0 0 0 0 0 0
Robbery or attempted robbery 100 39 16 0 2 1 2 0 2 1 0 0 0 0 0
Assault or attempted assault on staff other than correctional officer 97 35 7 0 0 14 0 0 0 0 0 0 0 0 0
Sexual battery or attempted 76 32 0 0 1 7 0 0 0 0 1 0 0 0 0
Deviating from approved plan 0 56 10 24 0 1 0 6 0 0 0 0 0 0 0
Breaking and entering or attempt 51 20 14 2 3 0 6 0 0 0 0 0 0 0 0
Aggravated battery or attempted on staff other than correctional officer 52 24 4 0 0 6 0 0 1 0 0 2 0 0 0
Self-Mutilation 55 27 0 0 0 1 0 2 0 0 0 0 0 0 0
Canteen Shortage under $50 22 14 4 7 1 0 0 0 29 0 0 0 0 0 1
Battery or attempted on other than staff or inmates 26 12 3 0 0 0 0 0 0 0 0 0 0 0 0
Tampering/damaging electronic monitoring equipment 5 4 0 3 0 0 0 2 22 0 0 0 0 0 0
Failure to deposit earnings 2 18 5 9 0 0 1 0 0 0 0 0 0 0 0
Failure to proceed to/from designated area 2 9 4 5 0 0 0 2 0 0 0 0 0 0 0
Possession of tobacco-related product by death row inmate 16 1 0 1 0 0 0 1 0 0 0 0 0 0 0
Assault or attempted assault on other than staff/inmates 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0
Unauthorized contact in behalf of another inmate 3 5 2 0 1 0 0 0 0 0 0 0 0 0 0
False property lien 5 1 2 1 0 0 0 0 0 0 0 0 0 0 0
Aggravated assault or attempted on staff other than correctional officer 5 2 0 0 0 1 0 0 0 0 0 0 0 0 0
Failure to remain within designated area of release plan 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0
Making purchase or contract without approval 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0
Aggravated battery or attempted not on staff/inmates 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0
Consume Intoxicants 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
In [12]:
# Get top 25 action outcome types, ordered by frequency
action_order = combo['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'].value_counts(ascending=False).index.tolist()

# Make the crosstab as before, limited to top 25 violations
top_violations = combo['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'].value_counts().head(25).index

crosstab2 = pd.crosstab(
    combo['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'],
    combo['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC']
).loc[top_violations, action_order]

crosstab2
Out[12]:
INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC Disciplinary Confinement Loss of Gain Time Cashless Canteen Denial Verbal Reprimand Privilege Suspension - Other Loss of Future Gain Time Disciplinary Squad Extra Duty Restitution Payment Alternative Housing Privilege Suspension - Mail, Visitation Privilege Suspension - Visitation Privilege Suspension - Mail Restricted Labor Squad Disciplinary Confinement Part Time
INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC
Disobeying verbal or written order 38343 16026 9898 5258 3002 2105 2433 1928 53 340 78 115 6 4 11
Disrespect persons of authority 21168 8103 5252 2384 1372 874 1390 597 3 368 18 37 4 22 7
Disorderly conduct 19639 7260 3803 1789 1011 898 944 509 5 235 12 28 1 3 2
Use of unauthorized drugs 19547 10021 3222 194 921 946 197 108 20 84 45 33 3 3 1
Fighting 18350 5975 3635 1701 859 734 271 198 6 134 5 9 1 5 0
Unauthorized possession/use of communication device/technology 18134 7422 1853 100 978 760 77 28 470 41 23 35 2 0 0
Possession/manufacture of weapons, ammunition or explosives 17369 6211 1485 35 615 1236 22 8 15 39 12 17 0 0 0
Spoken, written or gestured threats 13305 4855 1528 187 800 788 84 37 1 34 14 17 3 3 0
Possession of narcotics/unauthorized drugs and paraphernalia 12707 5910 944 26 199 509 31 26 1 4 14 7 0 0 1
Being in unauthorized area 5716 3794 3625 2566 781 571 1300 1097 1 126 21 12 0 7 0
Destruction of State or another property 7612 1832 1163 533 351 284 185 150 6049 28 4 0 1 2 2
Lewd or lascivious exhibition 9290 3995 1454 221 563 1083 27 11 0 16 18 75 0 1 1
Possession or transfer of other contraband 6225 4051 1665 1280 344 289 891 1512 157 18 8 7 4 3 0
Disobeying institutional regulations 4537 2832 1559 1523 466 400 671 1536 54 37 6 12 2 6 5
Battery or attempted battery on an inmate 8964 2948 746 27 232 463 5 6 1 1 3 7 0 0 0
Lying to staff or falsifying records 6633 2306 1231 669 252 134 496 331 8 43 8 8 2 2 2
Battery or attempted battery on correctional officer 6518 2740 713 10 316 611 0 0 2 1 1 11 1 0 0
Unauthorized absence from assigned area 3876 1715 1664 1270 242 143 670 413 1 18 6 3 0 0 0
Possession of aromatic stimulants or depressants 6617 2087 463 15 214 203 46 33 0 26 7 8 1 1 0
Personal or business relationship with staff member or volunteer 5955 2107 655 114 168 279 44 14 0 8 4 5 3 0 0
Possession of tobacco-related product by non-death row or non-community release inmate 4420 1917 1141 693 239 96 325 376 0 19 4 21 0 1 2
Aggravated battery or attempted on inmate 5612 2070 409 5 155 537 0 0 3 0 6 3 0 0 0
Telephone regulation violations 2745 1071 859 592 2300 83 276 351 1 6 179 23 4 2 0
Refusing to work or participate in mandatory programs 3534 1833 1216 941 221 95 302 144 0 35 1 9 0 2 0
Obscene or profane act, gesture or statement 4338 1978 490 123 161 206 54 14 0 14 19 8 1 0 0
In [13]:
# Count occurrences
violation_counts = combo['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'].value_counts().reset_index()
violation_counts.columns = ['Violation', 'Count']

# Add percent of total
total = violation_counts['Count'].sum()
violation_counts['Percent'] = violation_counts['Count'] / total * 100

# Format for HTML display
violation_counts['Percent'] = violation_counts['Percent'].round(2)  # or 1 if you want

# Save full table with percent
violation_counts.to_html('assets/top_violations.html', index=False, float_format='%.2f')

# Save CSV if desired
violation_counts.to_csv('csvs/violations.csv', index=False)
# Get top 25
top_violations = violation_counts.head(25).copy()

# For Altair, add label for % display in tooltip (optional, pretty)
top_violations['Percent_Label'] = top_violations['Percent'].map('{:.2f}%'.format)

bar = alt.Chart(top_violations).mark_bar(size=18).encode(
    y=alt.Y('Violation:N', sort='-x', title=None, axis=alt.Axis(labelLimit=370)),
    x=alt.X('Percent:Q', title='Share of All Disciplinary Events (%)', axis=alt.Axis(format='.1f')),
    color=alt.value('#24d7d7'),
    tooltip=[
        alt.Tooltip('Violation:N', title='Violation'),
        alt.Tooltip('Count:Q', title='Raw Count'),
        alt.Tooltip('Percent_Label:N', title='% of Total')
    ]
).properties(
    width=420,
    height=520,
    background='#181f22',
    title=alt.TitleParams(
        text="Most Common Rule Violations (Share of All Events)",
        color='white'
    )
).configure_axis(
    labelColor='white', titleColor='white'
).configure_title(
    color='white'
).configure_view(
    stroke=None
)

bar.save('assets/violations_barchart.json')
bar
Out[13]:
In [14]:
# Count occurrences of each disciplinary outcome
outcome_counts = combo['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'].value_counts().reset_index()
outcome_counts.columns = ['Outcome', 'Count']

# Add percent of total
total = outcome_counts['Count'].sum()
outcome_counts['Percent'] = (outcome_counts['Count'] / total * 100).round(2)

# Save full table with percent
outcome_counts.to_html('assets/outcomes.html', index=False, float_format='%.2f')
outcome_counts.to_csv('csvs/outcomes.csv', index=False)

# Prepare top N (if you want to limit for viz; otherwise just use all)
top_outcomes = outcome_counts.head(25)  # Or whatever N you want

# Altair bar chart: show count, but you could use 'Percent:Q' for %
bar2 = alt.Chart(top_outcomes).mark_bar(size=18).encode(
    y=alt.Y('Outcome:N', sort='-x', title=None, axis=alt.Axis(labelLimit=370)),
    x=alt.X('Percent:Q', title='Percent'),  # Or change to 'Percent:Q' for %
    color=alt.value('#24d7d7'),
    tooltip=['Outcome', 'Count', 'Percent']
).properties(
    width=420,
    height=330,
    background='#181f22',
    title=alt.TitleParams(
        text="Most Common Disciplinary Outcomes",
        color='white'
    )
).configure_axis(
    labelColor='white', titleColor='white'
).configure_title(
    color='white'
).configure_view(
    stroke=None
)

bar2.save('assets/outcomes_barchart.json')
bar2
Out[14]:
In [15]:
# Suppose your crosstab2 is as before
df_heat = crosstab2.reset_index().melt(
    id_vars='INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC',
    var_name='action',
    value_name='count'
)

grand_total = df_heat['count'].sum()
df_heat['percent'] = 100 * df_heat['count'] / grand_total

max_pct = df_heat['percent'].max()
min_pct = df_heat['percent'][df_heat['percent'] > 0].min()  # exclude zero for a clean legend if desired

# If your top_violations is a DataFrame, get the list:
top_violations_list = top_violations['Violation'].tolist()

chart = alt.Chart(df_heat).mark_rect().encode(
    x=alt.X(
        'action:N',
        sort=list(action_order),
        title='Disciplinary Action Outcome',
        axis=alt.Axis(labelColor='white', titleColor='white', labelAngle=45)
    ),
    y=alt.Y(
        'INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC:N',
        sort=top_violations_list,
        title=None,
        axis=alt.Axis(labelLimit=400, labelFontSize=13, labelOverlap=False)
    ),
    color=alt.Color(
        'percent:Q',
        scale=alt.Scale(scheme='tealblues', domain=[0, max_pct]),
        title='% of All Events'
    ),
    tooltip=[
        alt.Tooltip('INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC', title='Violation'),
        alt.Tooltip('action', title='Action'),
        alt.Tooltip('count', title='Count'),
        alt.Tooltip('percent', title='% of All Events', format='.2f')
    ]
).properties(
    width=480,
    height=440,
    background='#181f22',
    title=alt.TitleParams(
        text="Top 25 Rule Violations × Disciplinary Outcomes (Share of All Events)",
        color='white'
    )
).configure_axis(
    labelColor='white', titleColor='white'
).configure_title(
    color='white'
).configure_legend(
    labelColor='white', titleColor='white'
).configure_view(
    stroke=None
)

chart.save('assets/violations_heatmap1.json')
chart
Out[15]:
In [16]:
print(dataframes['Incarcerationsupervision_00000.csv']['RACE_CODE'].value_counts(), 
      dataframes['Incarcerationsupervision_00000.csv']['ETHNICITY_CODE'].value_counts())
RACE_CODE
White                               79107
Black                               74159
Unknown                              8714
American Indian or Alaska Native      173
Asian                                  43
Name: count, dtype: int64 ETHNICITY_CODE
Not Hispanic or Latino    139473
Hispanic or Latino         20041
Unknown                     2682
Name: count, dtype: int64
In [17]:
dataframes['Incarcerationsupervision_00000.csv']['race_ethnicity_group'] = np.where(
    (dataframes['Incarcerationsupervision_00000.csv']['RACE_CODE'] == 'White') & (dataframes['Incarcerationsupervision_00000.csv']['ETHNICITY_CODE'] == 'Not Hispanic or Latino'),
    'majority',
    'minority'
)
dataframes['Incarcerationsupervision_00000.csv']['race_ethnicity_group'].value_counts(normalize=True)
Out[17]:
race_ethnicity_group
minority    0.589756
majority    0.410244
Name: proportion, dtype: float64
In [18]:
# 1. Get the demographic info, including sex and group
demo_df = dataframes['Incarcerationsupervision_00000.csv'][[
    'INCARCERATION_SUPERVISION_ID', 'race_ethnicity_group', 'SEX_CODE'
]].copy()

# 2. For each person in demo_df, are they in the infractions table? (Right join on all people)
# First, get set of IDs with at least one infraction
has_infraction = pd.Series(combo['INCARCERATION_SUPERVISION_ID'].unique())
has_infraction = pd.DataFrame({'INCARCERATION_SUPERVISION_ID': has_infraction, 'has_infraction': 1})

# 3. Merge: demo_df (all people) LEFT joined to infraction marker (so all people in demo_df stay)
merged = demo_df.merge(has_infraction, on='INCARCERATION_SUPERVISION_ID', how='left')
merged['has_infraction'] = merged['has_infraction'].fillna(0).astype(int)

# 4. Now for each (race_ethnicity_group, SEX_CODE), compute:
#    - count
#    - how many have infraction
#    - percent with infraction

summary = (
    merged
    .groupby(['race_ethnicity_group', 'SEX_CODE'])
    .agg(
        total=('INCARCERATION_SUPERVISION_ID', 'nunique'),
        num_with_infraction=('has_infraction', 'sum')
    )
    .reset_index()
)
summary['percent_with_infraction'] = 100 * summary['num_with_infraction'] / summary['total']

# 5. Show as a confusion matrix (pivot)
conf_matrix = summary.pivot(index='race_ethnicity_group', columns='SEX_CODE', values='percent_with_infraction')
conf_matrix = conf_matrix.round(2)

# Optional: Print nicely
print("Proportion (%) of people who ever received at least one disciplinary infraction:")
print(conf_matrix)

# To save as CSV or HTML:
#conf_matrix.to_csv('assets/infraction_by_group_sex.csv')
conf_matrix.to_html('assets/infraction_by_group_sex.html')


# Build the contingency table with counts (not percentages!)
contingency = summary.pivot(index='race_ethnicity_group', columns='SEX_CODE', values='num_with_infraction').fillna(0).astype(int)
# Also get counts of those with NO infraction
no_infraction = summary.pivot(index='race_ethnicity_group', columns='SEX_CODE', values='total') - contingency

# Build full table:
# rows = [majority, minority], columns = [female_infraction, female_no_infraction, male_infraction, male_no_infraction]
cont_table = pd.DataFrame({
    'Female_infraction': contingency['Female'],
    'Female_no_infraction': no_infraction['Female'],
    'Male_infraction': contingency['Male'],
    'Male_no_infraction': no_infraction['Male'],
}, index=contingency.index)

# For chi2, use a 2x2 table, e.g., for Male vs Female, or minority vs majority
# Here's a 2x2 for 'any infraction' by 'minority/majority' (collapse across sex)
infraction_by_group = merged.groupby('race_ethnicity_group')['has_infraction'].agg(['sum', 'count'])
chi2, p, dof, ex = chi2_contingency([
    [infraction_by_group.loc['majority', 'sum'], infraction_by_group.loc['majority', 'count'] - infraction_by_group.loc['majority', 'sum']],
    [infraction_by_group.loc['minority', 'sum'], infraction_by_group.loc['minority', 'count'] - infraction_by_group.loc['minority', 'sum']]
])
print(f"Chi-squared p-value (minority vs majority): {p:.4g}")

# Or full chi2 on the 2x2x2 table
# For just race x sex x infraction/no infraction
contingency2 = merged.groupby(['race_ethnicity_group', 'SEX_CODE'])['has_infraction'].value_counts().unstack(fill_value=0)
chi2, p, dof, ex = chi2_contingency(contingency2)
print(f"Chi-squared p-value (race x sex): {p:.4g}")
Proportion (%) of people who ever received at least one disciplinary infraction:
SEX_CODE              Female   Male
race_ethnicity_group               
majority               26.28  43.35
minority               40.89  53.09
Chi-squared p-value (minority vs majority): 0
Chi-squared p-value (race x sex): 0
In [19]:
demo_df = dataframes['Incarcerationsupervision_00000.csv'][['INCARCERATION_SUPERVISION_ID', 'race_ethnicity_group']].copy()
merged = combo.merge(demo_df, on='INCARCERATION_SUPERVISION_ID', how='inner')
# 1. Set baseline (already calculated)
baseline_minority = dataframes['Incarcerationsupervision_00000.csv']['race_ethnicity_group'].value_counts(normalize=True)['minority']

# 2. Groupby and calculate the cell-level minority proportion
cell_group = (
    merged
    .groupby(['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC', 'INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'])
)

# 3. Calculate: number of minorities, total count, and proportion for each cell
cell_stats = cell_group['race_ethnicity_group'].value_counts(normalize=False).unstack(fill_value=0)
cell_stats['total'] = cell_stats.sum(axis=1)
cell_stats['minority_prop'] = cell_stats.get('minority', 0) / cell_stats['total']

# 4. Compute deviation from baseline
cell_stats['minority_pct_diff'] = 100 * (cell_stats['minority_prop'] / baseline_minority - 1)


# 5. Reset index for visualization
cell_stats = cell_stats.reset_index()
viz_df = cell_stats[['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC', 
                     'INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC', 
                     'total', 'minority_prop', 'minority_pct_diff']].copy()

# 6. Run a z-test for each cell and add significance to viz_df
viz_df['zstat'] = np.nan
viz_df['pval'] = np.nan
viz_df['statistically_significant'] = False

for idx, row in viz_df.iterrows():
    count = int(round(row['minority_prop'] * row['total']))
    nobs = int(row['total'])
    if nobs > 0:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            stat, pval = proportions_ztest(count, nobs, baseline_minority)
        viz_df.at[idx, 'zstat'] = stat
        viz_df.at[idx, 'pval'] = pval
        viz_df.at[idx, 'statistically_significant'] = (pval < 0.05) and (nobs >= 10)
    else:
        viz_df.at[idx, 'statistically_significant'] = False

# (Optional) Save for d3 or further use
viz_df.to_csv('discipline_disparity_long.csv', index=False)
In [20]:
print(viz_df.info(), viz_df.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 986 entries, 0 to 985
Data columns (total 8 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  986 non-null    object 
 1   INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC     986 non-null    object 
 2   total                                           986 non-null    int64  
 3   minority_prop                                   986 non-null    float64
 4   minority_pct_diff                               986 non-null    float64
 5   zstat                                           986 non-null    float64
 6   pval                                            986 non-null    float64
 7   statistically_significant                       986 non-null    bool   
dtypes: bool(1), float64(4), int64(1), object(2)
memory usage: 55.0+ KB
None race_ethnicity_group     INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  \
0                     Aggravated assault or attempted on correctiona...   
1                     Aggravated assault or attempted on correctiona...   
2                     Aggravated assault or attempted on correctiona...   
3                     Aggravated assault or attempted on correctiona...   
4                             Aggravated assault or attempted on inmate   

race_ethnicity_group INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC  total  \
0                                        Cashless Canteen Denial      4   
1                                       Disciplinary Confinement    131   
2                                       Loss of Future Gain Time      9   
3                                              Loss of Gain Time     47   
4                                        Cashless Canteen Denial     16   

race_ethnicity_group  minority_prop  minority_pct_diff     zstat  \
0                          1.000000          69.561763       inf   
1                          0.809160          37.202648  6.390428   
2                          0.777778          31.881371  1.356778   
3                          0.808511          37.092489  3.811464   
4                          0.875000          48.366543  3.449992   

race_ethnicity_group          pval  statistically_significant  
0                     0.000000e+00                      False  
1                     1.654223e-10                       True  
2                     1.748517e-01                      False  
3                     1.381463e-04                       True  
4                     5.606033e-04                       True  
In [21]:
# 1. Get Top 25 Violation Types (by total event count)
top_violations = crosstab.sum(axis=1).sort_values(ascending=False).index[:25].tolist()

# 2. Filter crosstab and viz_df to only those violations
crosstab_top = crosstab.loc[top_violations]
viz_df_top = viz_df[viz_df['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'].isin(top_violations)]

# 3. Get actions sorted by total count (descending, so most frequent on left/front)
actions_sorted = crosstab_top.sum(axis=0).sort_values(ascending=True).index.tolist()
actions_sorted = actions_sorted[::-1]
violations_sorted = top_violations[::-1]  # most frequent at the front

# 4. Prepare matrix for the plot
z_sorted = crosstab_top.loc[violations_sorted, actions_sorted].values

# 5. Set up axes for bar3d
_x = np.arange(len(actions_sorted))
_y = np.arange(len(violations_sorted))
_xx, _yy = np.meshgrid(_x, _y)
x, y = _xx.ravel(), _yy.ravel()
top_vals = z_sorted.ravel()
bottom = np.zeros_like(top_vals)
width = depth = 0.8

# 6. Build color matrix for significant racial disparities (PuOr colormap for divergence)
color_matrix = np.full_like(z_sorted, fill_value=0.0, dtype=np.float64)
sig_matrix = np.zeros_like(z_sorted, dtype=bool)

for i, v in enumerate(violations_sorted):
    for j, a in enumerate(actions_sorted):
        match = viz_df_top[
            (viz_df_top['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'] == v) &
            (viz_df_top['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'] == a)
        ]
        if not match.empty:
            color_matrix[i, j] = match['minority_pct_diff'].iloc[0]
            sig_matrix[i, j] = match['statistically_significant'].iloc[0]

# 7. Set up color mapping for the significant cells (gray if not significant)
actual_min = np.nanmin(color_matrix[sig_matrix]) if np.any(sig_matrix) else 0
actual_max = np.nanmax(color_matrix[sig_matrix]) if np.any(sig_matrix) else 0
norm = TwoSlopeNorm(vmin=actual_min, vcenter=0, vmax=actual_max)
cmap = matplotlib.colormaps['PuOr']
colors = []
gray = (0.75, 0.75, 0.75, 1.0)  # RGBA for gray

for val, sig in zip(color_matrix.ravel(), sig_matrix.ravel()):
    if sig:
        colors.append(cmap(norm(val)))
    else:
        colors.append(gray)

# 8. Plot 3D bars
fig = plt.figure(figsize=(17, 14))
ax = fig.add_subplot(111, projection='3d')
fig.patch.set_facecolor('black')
ax.set_facecolor('black')

# 3D bars
ax.bar3d(x, y, bottom, width, depth, top_vals, shade=True, color=colors)

# Axis and tick label formatting for white text
ax.set_xticks(_x + width/2)
ax.set_xticklabels(actions_sorted, rotation=45, ha='right', fontsize=9, color='white')
ax.set_yticks(_y + depth/2)
ax.set_yticklabels(violations_sorted, rotation=0, ha='left', fontsize=9, color='white')
ax.set_zlabel('Event Count', fontsize=12, color='white')
ax.set_xlabel('', fontsize=13, color='white')
ax.set_ylabel('', fontsize=13, color='white')
ax.set_title('FDLE Disciplinary Event Counts (Top 25 Violations × Outcomes)', fontsize=16, pad=40, color='white')

# Set all 3D axes lines and panes to white
ax.xaxis._axinfo['grid'].update({'color' : (1,1,1,0.13)})
ax.yaxis._axinfo['grid'].update({'color' : (1,1,1,0.13)})
ax.zaxis._axinfo['grid'].update({'color' : (1,1,1,0.13)})
for axis in [ax.xaxis, ax.yaxis, ax.zaxis]:
    axis.line.set_color('white')
    axis.label.set_color('white')
    axis.pane.set_facecolor((0,0,0,1))
    axis.pane.set_edgecolor('white')

# 3D tick lines white
ax.tick_params(axis='x', colors='white')
ax.tick_params(axis='y', colors='white')
ax.tick_params(axis='z', colors='white')

ax.view_init(elev=30, azim=-65)

# COLORBAR — move higher on the plot using 'anchor' and 'fraction'
if np.any(sig_matrix):
    mappable = matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap)
    mappable.set_array(color_matrix[sig_matrix])
    cbar = fig.colorbar(
        mappable, ax=ax,
        shrink=0.5, aspect=12, pad=0.08,
        anchor=(0.0, 0.85),  # (x, y) between (0, 0) and (1, 1)
        label="% More/Less Minorities Than Baseline (statistically significant only)"
    )
    cbar.ax.yaxis.label.set_color('white')
    cbar.ax.tick_params(color='white', labelcolor='white')
    cbar.outline.set_edgecolor('white')
bg = "#181f22"
fig.patch.set_facecolor(bg)
ax.set_facecolor(bg)
plt.savefig("assets/prelim_viz.png", bbox_inches='tight', dpi=220, facecolor=bg)
plt.show()
No description has been provided for this image
In [22]:
violation_to_category = {
    "Disobeying verbal or written order": "Officer Got Upset",
    "Disrespect persons of authority": "Officer Got Upset",
    "Disorderly conduct": "Officer Got Upset",
    "Use of unauthorized drugs": "Self-Harm/Addiction",
    "Fighting": "Violence/Threat",
    "Unauthorized possession/use of communication device/technology": "Banned Everyday Item",
    "Possession/manufacture of weapons, ammunition or explosives": "Weapons",
    "Spoken, written or gestured threats": "Violence/Threat",
    "Possession of narcotics/unauthorized drugs and paraphernalia": "Self-Harm/Addiction",
    "Being in unauthorized area": "Administrative/Compliance",
    "Destruction of State or another property": "Property",
    "Lewd or lascivious exhibition": "Sexual/Relational",
    "Possession or transfer of other contraband": "Banned Everyday Item",
    "Disobeying institutional regulations": "Administrative/Compliance",
    "Battery or attempted battery on an inmate": "Violence/Threat",
    "Lying to staff or falsifying records": "Administrative/Compliance",
    "Battery or attempted battery on correctional officer": "Violence/Threat",
    "Unauthorized absence from assigned area": "Administrative/Compliance",
    "Possession of aromatic stimulants or depressants": "Self-Harm/Addiction",
    "Personal or business relationship with staff member or volunteer": "Sexual/Relational",
    "Possession of tobacco-related product by non-death row or non-community release inmate": "Banned Everyday Item",
    "Aggravated battery or attempted on inmate": "Violence/Threat",
    "Telephone regulation violations": "Administrative/Compliance",
    "Refusing to work or participate in mandatory programs": "Administrative/Compliance",
    "Obscene or profane act, gesture or statement": "Officer Got Upset",
    "Creating, participating in or inciting a minor disturbance": "Officer Got Upset",
    "Failure to comply with count procedures": "Officer Got Upset",
    "Misuse of State or another property": "Property",
    "Refusing to submit to substance abuse testing": "Administrative/Compliance",
    "Tampering/defeating/depriving staff of security device": "Property",
    "Attempt or conspire to commit crime or violate rules": "Administrative/Compliance",
    "Theft of property under $50 in value": "Property",
    "Introduction of any contraband": "Banned Everyday Item",
    "Tattooing, branding or body art including piercing": "Surviving Inside",
    "Possession of gang-related paraphernalia or material": "Gang/Security State",
    "Possession of unauthorized beverages": "Banned Everyday Item",
    "Failure to maintain acceptable hygiene/appearance of housing": "Administrative/Compliance",
    "Sex acts or unauthorized physical contact involving inmates": "Sexual/Relational",
    "Tampering/defeating any fire or other safety device": "Property",
    "Failure to maintain personal hygiene/appearance": "Administrative/Compliance",
    "Inciting riots, strikes, mutinous acts or disturbances": "Officer Got Upset",
    "Possession of negotiables in unauthorized amounts, forms or location": "Banned Everyday Item",
    "Other assault/battery or attempted": "Violence/Threat",
    "Unauthorized/illegal financial transaction": "Surviving Inside",
    "Participating in riots, strikes, mutinous acts or disturbances": "Officer Got Upset",
    "Altering/defacing State or another property": "Property",
    "Gang-related activities": "Gang/Security State",
    "Assault or attempted assault on a correctional officer": "Violence/Threat",
    "Possession of stolen property": "Property",
    "Mail regulation violations": "Administrative/Compliance",
    "Canteen Shortage over $50": "Property",
    "Possession of escape paraphernalia": "Escape/Evasion",
    "Possession of items facilitating unauthorized/illegal financial transaction": "Surviving Inside",
    "Bartering with others": "Surviving Inside",
    "Manufacture of drugs or unauthorized beverages": "Drug Trade",
    "Gambling or possession of gambling paraphernalia": "Surviving Inside",
    "Unauthorized physical contact involving non-inmates": "Sexual/Relational",
    "Aggravated battery or attempted on correctional officer": "Violence/Threat",
    "Theft of property exceeding $50 in value": "Property",
    "Assault or attempted assault on an inmate": "Violence/Threat",
    "Battery or attempted battery on staff other than correctional officer": "Violence/Threat",
    "Possession of unauthorized clothing or linen": "Banned Everyday Item",
    "Trafficking in drugs or unauthorized beverages": "Drug Trade",
    "Arson or attempted arson": "Violence/Threat",
    "Frivolous, malicious, false information presented to court": "Officer Got Upset",
    "Assault/battery or attempted with deadly weapon": "Violence/Threat",
    "Destruction of State or another property by gross negligence": "Property",
    "Aggravated assault or attempted on inmate": "Violence/Threat",
    "Use of Alcohol": "Self-Harm/Addiction",
    "Escape or escape attempt": "Escape/Evasion",
    "Visiting regulation violations": "Administrative/Compliance",
    "Insufficient work": "Administrative/Compliance",
    "Missing count": "Administrative/Compliance",
    "Fraud or attempted fraud": "Surviving Inside",
    "Extortion or attempted extortion": "Surviving Inside",
    "Feigning illness or malingering": "Surviving Inside",
    "Loaning or borrowing money or other valuables": "Surviving Inside",
    "Wasting State or another property": "Property",
    "Presenting false testimony or information": "Administrative/Compliance",
    "Possession of unauthorized or altered identification": "Administrative/Compliance",
    "Unauthorized use or tampering with computer or office equipment": "Property",
    "Bribery or attempted bribery": "Surviving Inside",
    "Aggravated assault or attempted on correctional officer": "Violence/Threat",
    "Robbery or attempted robbery": "Violence/Threat",
    "Assault or attempted assault on staff other than correctional officer": "Violence/Threat",
    "Sexual battery or attempted": "Sexual/Relational",
    "Deviating from approved plan": "Officer Got Upset",
    "Breaking and entering or attempt": "Property",
    "Aggravated battery or attempted on staff other than correctional officer": "Violence/Threat",
    "Self-Mutilation": "Self-Harm/Addiction",
    "Canteen Shortage under $50": "Property",
    "Battery or attempted on other than staff or inmates": "Violence/Threat",
    "Tampering/damaging electronic monitoring equipment": "Administrative/Compliance",
    "Failure to deposit earnings": "Administrative/Compliance",
    "Failure to proceed to/from designated area": "Administrative/Compliance",
    "Possession of tobacco-related product by death row inmate": "Banned Everyday Item",
    "Assault or attempted assault on other than staff/inmates": "Violence/Threat",
    "Unauthorized contact in behalf of another inmate": "Administrative/Compliance",
    "False property lien": "Administrative/Compliance",
    "Aggravated assault or attempted on staff other than correctional officer": "Violence/Threat",
    "Failure to remain within designated area of release plan": "Administrative/Compliance",
    "Making purchase or contract without approval": "Surviving Inside",
    "Aggravated battery or attempted not on staff/inmates": "Violence/Threat",
    "Consume Intoxicants": "Self-Harm/Addiction",
}
combo['Violation_Category'] = combo['INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC'].map(violation_to_category)
In [23]:
action_to_bin = {
    "Disciplinary Confinement": "Cell Confinement",
    "Disciplinary Confinement Part Time": "Cell Confinement",
    "Loss of Gain Time": "Extended Incarceration",
    "Loss of Future Gain Time": "Extended Incarceration",
    "Cashless Canteen Denial": "Canteen Suspension",
    "Privilege Suspension - Other": "Privilege Suspension",
    "Privilege Suspension - Mail, Visitation": "Mail & Visitation Suspension",
    "Privilege Suspension - Visitation": "Visitation Suspension",
    "Privilege Suspension - Mail": "Mail Suspension",
    "Verbal Reprimand": "Verbal Warning",
    "Disciplinary Squad": "Forced Labor Detail",
    "Restricted Labor Squad": "Forced Labor Detail",
    "Extra Duty": "Forced Labor Detail",
    "Restitution Payment": "Financial Penalty",
    "Alternative Housing": "Alternative Housing",
}
combo['Action_Category'] = combo['INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC'].map(action_to_bin)
In [24]:
demo_df2 = dataframes['Incarcerationsupervision_00000.csv'][['INCARCERATION_SUPERVISION_ID', 'race_ethnicity_group']].copy()
merged2 = combo.merge(demo_df2, on='INCARCERATION_SUPERVISION_ID', how='inner')
merged2.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 562357 entries, 0 to 562356
Data columns (total 6 columns):
 #   Column                                          Non-Null Count   Dtype 
---  ------                                          --------------   ----- 
 0   INCARCERATION_SUPERVISION_ID                    562357 non-null  int64 
 1   INCARCERATION_DISCIPLINARY_ACTION_TYPE_DESC     562357 non-null  object
 2   INCARCERATION_DISCIPLINARY_VIOLATION_TYPE_DESC  562357 non-null  object
 3   Violation_Category                              562357 non-null  object
 4   Action_Category                                 562357 non-null  object
 5   race_ethnicity_group                            562357 non-null  object
dtypes: int64(1), object(5)
memory usage: 25.7+ MB
In [25]:
# 2. Groupby and calculate the cell-level minority proportion
cell_group = (
    merged2
    .groupby(['Violation_Category', 'Action_Category'])
)

# 3. Calculate: number of minorities, total count, and proportion for each cell
cell_stats = cell_group['race_ethnicity_group'].value_counts(normalize=False).unstack(fill_value=0)
cell_stats['total'] = cell_stats.sum(axis=1)
cell_stats['minority_prop'] = cell_stats.get('minority', 0) / cell_stats['total']

# 4. Compute deviation from baseline
cell_stats['minority_pct_diff'] = 100 * (cell_stats['minority_prop'] / baseline_minority - 1)


# 5. Reset index for visualization
cell_stats = cell_stats.reset_index()
viz_df2 = cell_stats[['Violation_Category', 
                     'Action_Category', 
                     'total', 'minority_prop', 'minority_pct_diff']].copy()

# 6. Run a z-test for each cell and add significance to viz_df
viz_df2['zstat'] = np.nan
viz_df2['pval'] = np.nan
viz_df2['statistically_significant'] = False

for idx, row in viz_df2.iterrows():
    count = int(round(row['minority_prop'] * row['total']))
    nobs = int(row['total'])
    if nobs > 0:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            stat, pval = proportions_ztest(count, nobs, baseline_minority)
        viz_df2.at[idx, 'zstat'] = stat
        viz_df2.at[idx, 'pval'] = pval
        viz_df2.at[idx, 'statistically_significant'] = (pval < 0.05) and (nobs >= 10)
    else:
        viz_df2.at[idx, 'statistically_significant'] = False

print(viz_df2.info(), viz_df2.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Violation_Category         125 non-null    object 
 1   Action_Category            125 non-null    object 
 2   total                      125 non-null    int64  
 3   minority_prop              125 non-null    float64
 4   minority_pct_diff          125 non-null    float64
 5   zstat                      125 non-null    float64
 6   pval                       125 non-null    float64
 7   statistically_significant  125 non-null    bool   
dtypes: bool(1), float64(4), int64(1), object(2)
memory usage: 7.1+ KB
None race_ethnicity_group         Violation_Category         Action_Category  \
0                     Administrative/Compliance     Alternative Housing   
1                     Administrative/Compliance      Canteen Suspension   
2                     Administrative/Compliance        Cell Confinement   
3                     Administrative/Compliance  Extended Incarceration   
4                     Administrative/Compliance       Financial Penalty   

race_ethnicity_group  total  minority_prop  minority_pct_diff      zstat  \
0                       251       0.752988          27.677981   5.996396   
1                     10589       0.706866          19.857380  26.474007   
2                     32458       0.707406          19.949092  46.589627   
3                     18135       0.666998          13.097275  22.071201   
4                        87       0.632184           7.194218   0.820688   

race_ethnicity_group           pval  statistically_significant  
0                      2.017445e-09                       True  
1                     1.931303e-154                       True  
2                      0.000000e+00                       True  
3                     5.978248e-108                       True  
4                      4.118239e-01                      False  
In [26]:
# Ensure sorted axes, so 3D placement is consistent
actions_sorted = viz_df2.groupby('Action_Category')['total'].sum().sort_values(ascending=False).index.tolist()
violations_sorted = viz_df2.groupby('Violation_Category')['total'].sum().sort_values(ascending=False).index.tolist()

# Add i (row) and j (col) indices for easy 3D plotting later
viz_df2['action_idx'] = viz_df2['Action_Category'].apply(lambda x: actions_sorted.index(x))
viz_df2['violation_idx'] = viz_df2['Violation_Category'].apply(lambda x: violations_sorted.index(x))

# Filter only top violations/actions if needed, or leave as is for all
# (Optional) e.g. top 20 violations:
top_violations = violations_sorted[:25]
viz_df2 = viz_df2[viz_df2['Violation_Category'].isin(top_violations)]

# Select columns for three.js
export_df = viz_df2[[
    'Violation_Category', 'Action_Category',
    'total', 'minority_pct_diff', 'statistically_significant',
    'violation_idx', 'action_idx'
]].rename(columns={
    'Violation_Category': 'violation',
    'Action_Category': 'action',
    'total': 'count'
})

# Export as JSON (best for JavaScript)
export_df.to_json('fdle_discipline_cats_3d.json', orient='records', indent=2)