• Source: Florida Department of Corrections — “Inmate Mortality — Statistics and Publications” https://www.fdc.myflorida.com/statistics-and-publications/inmate-mortality
  • Retrieved: 2025-08-10 14:07:00 (UTC)
  • Why snapshot: Preserves exactly what FDOC published on the retrieval date for reproducibility.
  • Method: Read the first HTML table from the saved snapshot and export to CSV. No manual edits here; any cleaning happens later.
In [1]:
from io import StringIO
from pathlib import Path
import pandas as pd
from bs4 import BeautifulSoup

SNAPSHOT_PATH = Path("fdoc_deaths/data/Inmate Mortality _ Statistics and Publications - Florida Department of Corrections.html")
OUT_CSV = Path("fdoc_deaths/data/FDOC's Mortality Table.csv")

with SNAPSHOT_PATH.open(encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")

# Try the first <table>; fall back to scanning the whole document
table = soup.find("table")
if table is not None:
    df = pd.read_html(StringIO(str(table)))[0]
else:
    tables = pd.read_html(StringIO(str(soup)))
    if not len(tables):
        raise ValueError(f"No <table> elements found in snapshot: {SNAPSHOT_PATH}")
    df = tables[0]

OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_CSV, index=False)
print(f"Saved CSV to {OUT_CSV} with shape {df.shape}")
Saved CSV to fdoc_deaths/data/FDOC's Mortality Table.csv with shape (6, 8)
In [53]:
from IPython.display import Image, display, Markdown


IMG_PATH = "assets/FDOH Screenshot 2025-08-05 at 9.53.12 AM.png"

# Optional metadata
FDOH_URL = "https://www.flhealthcharts.gov/FLQUERY_New/Death/Rate"
QUERY_TEXT = "Crude Death Rate by Year, Homicide by Firearms Discharge, Homicide by Other and Unspecified Means"
RETRIEVED = "2025-08-05 at 13.53.12 UTC"

# Display the image (embedded in exported HTML)
display(Image(filename=IMG_PATH, embed=True, width=1000))

# Caption below the image
display(Markdown(
    f"Source: [FDOH data portal]({FDOH_URL})  \n"
    f"Query: {QUERY_TEXT}  \n"
    f"Retrieved: {RETRIEVED}"
))
No description has been provided for this image

Source: FDOH data portal
Query: Crude Death Rate by Year, Homicide by Firearms Discharge, Homicide by Other and Unspecified Means
Retrieved: 2025-08-05 at 13.53.12 UTC

In [52]:
df_m = pd.read_csv("fdoc_deaths/data/FDOC's Mortality Table.csv")
#print(df_m)

df_m = df_m[df_m['Inmate Population on June 30th'] != 0]

# Florida Yearly Homicde rates from screenshot above
fl_rates = [6.3, 7.0, 6.6, 6.6, 5.9]

df_m['Homicide Rate'] = (df_m['Homicide'] / df_m['Inmate Population on June 30th'] * 100000).round(1)
df_m['FL Statewide Homicide Rate'] = fl_rates
df_m.drop(['Accident', 'Suicide', 'Pending', "Natural*", "Total"], axis=1, inplace=True)

mean_fd_homicide = df_m['Homicide Rate'].mean().round(1)
mean_fl_homicide = df_m['FL Statewide Homicide Rate'].mean().round(1)

# Build a summary row
summary_row = {
    'Fiscal Year': 'MEAN',
    'Homicide': '',
    'Inmate Population on June 30th': '',
    'Homicide Rate': mean_fd_homicide,
    'FL Statewide Homicide Rate': mean_fl_homicide
}

# Append the row
df_m = pd.concat([df_m, pd.DataFrame([summary_row])], ignore_index=True)

table_html = df_m.to_html(index=False, classes='striped-table')

# Wrap it in a full HTML5 doc
full_html = f"""<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>FDOC Mortality Table (2019–2024)</title>
  <link rel="stylesheet" href="../style.css">
</head>
<body>
  <h2>FDOC Annual Mortality Table (2019–2024)</h2>
  {table_html}
</body>
</html>
"""

with open("fdoc_deaths/FDOC_mortality_table.html", "w") as f:
    f.write(full_html)

“According to available data, globally there were 34.2 deaths by suicide of persons deprived of liberty per 100,000, which is higher than the suicide rate among the general population, which is 9.2 deaths per 100,000 inhabitants. On the other hand, the intentional homicide rate inside prisons is 12.2 per 100,000 persons deprived of liberty, while among the general population the rate is 5.8 intentional homicide deaths per 100,000 inhabitants. In other words, in most of the countries with available data, the rate of intentional homicide is higher inside prisons than among the general population, and the Americas is the region with the highest rate of intentional homicide deaths inside prisons, with 18.3 victims per 100,000 persons deprived of liberty.”

— UNODC (2024). Global prison population and trends: a focus on rehabilitation.
Source: https://www.cdeunodc.inegi.org.mx/unodc/index.php/2024/08/15/global-prison-population-and-trends-a-focus-on-rehabilitation/index.htm Accessed: 2025-08-10 17:34:00 UTC

Note: 12.2 per 100,000 = 0.122 per 1,000.

In [4]:
import altair as alt

data = pd.DataFrame({
    'GroupShort': [
        'FDOC Prison Homicide',
        'Global Prison Homicide',
        'FL State Homicide'
    ],
    'GroupFull': [
        'FDOC In-Prison Homicide (2019–24)',
        'Global Prison Homicide (UNODC, 2024)',
        'Florida Statewide Homicide (2019–24)'
    ],
    'Rate': [20.35, 12.2, 6.48]
})

highlight = ['FDOC Prison Homicide']

chart = alt.Chart(data).mark_bar(
    cornerRadiusEnd=8,
    size=35
).encode(
    x=alt.X(
        'Rate:Q',
        title='Deaths per 1,000,000',
        axis=alt.Axis(
            labelFontSize=13,
            titleFontSize=13,
            labelColor='white',
            titleColor='white'
        ),
        scale=alt.Scale(domain=[0, 22])
    ),
    y=alt.Y(
        'GroupShort:N',
        sort='-x',
        axis=alt.Axis(
            title=None,
            labelFontSize=13,
            labelAlign='right',
            labelPadding=12,
            labelColor='white'
        )
    ),
    color=alt.condition(
        alt.FieldOneOfPredicate(field="GroupShort", oneOf=highlight),
        alt.value('#e8596a'),
        alt.value('#166a6a')
    ),
    tooltip=[
        alt.Tooltip('GroupFull:N', title='Group'),
        alt.Tooltip('Rate:Q', title='Deaths per 1,000,000', format='.1f')
    ]
).properties(
    width=500, height=180
).configure_view(
    stroke=None,
    fill=None
).configure_axis(
    grid=True
).configure(
    background='rgba(0,0,0,0)'
)
chart.save("assets/deaths_per_1000000.json")
chart.display()
In [12]:
#bringing in the raw OBIS data, pulled 2025-08-11 from https://www.fdc.myflorida.com/statistics-and-publications/public-records-requests-for-the-obis-database
import os

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

def convert_xlsx_sheets_to_csvs(
    xlsx_path,
    output_dir=None,
    prefix=None
):
    """Convert all sheets in an Excel file to CSV files. Prints the mapping."""
    # Prepare output directory
    if output_dir is None:
        output_dir = os.path.dirname(os.path.abspath(xlsx_path))
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if prefix is None:
        prefix = os.path.splitext(os.path.basename(xlsx_path))[0]
    # Load Excel file (all sheets)
    xl = pd.ExcelFile(xlsx_path)
    mapping = {}
    for sheet in xl.sheet_names:
        # Clean sheet name for filesystems
        safe_sheet = ''.join(c if c.isalnum() or c in (' ','-','_') else '_' for c in sheet).rstrip()
        outpath = os.path.join(output_dir, f"{prefix}__{safe_sheet}.csv")
        df = xl.parse(sheet)
        df.to_csv(outpath, index=False)
        mapping[sheet] = outpath
        print(f"Sheet '{sheet}' -> '{outpath}'")
    return mapping

# --- Usage ---
mapping = convert_xlsx_sheets_to_csvs('fdoc_deaths/data/Inmate_Active.XLSX', output_dir='./fdoc_deaths/csvs')
mapping = convert_xlsx_sheets_to_csvs('fdoc_deaths/data/Inmate_Release.XLSX', output_dir='./fdoc_deaths/csvs')
Sheet 'Inmate Active Root' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Root.csv'
Sheet 'Inmate Active Aliases' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Aliases.csv'
Sheet 'Inmate Active Offenses CPS' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Offenses CPS.csv'
Sheet 'Inmate Active Offenses PRPR' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Offenses PRPR.csv'
Sheet 'Inmate Active Detainers' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Detainers.csv'
Sheet 'Inmate Active Incarhist' -> './fdoc_deaths/csvs/Inmate_Active__Inmate Active Incarhist.csv'
Sheet 'Inmate Release Root' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Root.csv'
Sheet 'Inmate Release Alias_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_1.csv'
Sheet 'Inmate Release Alias_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_2.csv'
Sheet 'Inmate Release Alias_3' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Alias_3.csv'
Sheet 'Inmate Release Offenses CPS_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses CPS_1.csv'
Sheet 'Inmate Release Offenses CPS_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses CPS_2.csv'
Sheet 'Inmate Release Offenses PRPR_1' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses PRPR_1.csv'
Sheet 'Inmate Release Offenses PRPR_2' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Offenses PRPR_2.csv'
Sheet 'Inmate Release Detainer' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Detainer.csv'
Sheet 'Inmate Release Incarhist' -> './fdoc_deaths/csvs/Inmate_Release__Inmate Release Incarhist.csv'
In [14]:
# To build a population pyramid of the FODC, we must first determine who was incarcerated when
df_incarcerated_incarhist = pd.read_csv("fdoc_deaths/csvs/Inmate_Active__Inmate Active Incarhist.csv", dtype={'DCNumber': 'object'})
print(df_incarcerated_incarhist.info(), df_incarcerated_incarhist.head())

df_released_incarhist = pd.read_csv("fdoc_deaths/csvs/Inmate_Release__Inmate Release Incarhist.csv", dtype={'DCNumber': 'object'})
print(df_released_incarhist.info(), df_released_incarhist.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196525 entries, 0 to 196524
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DCNumber     196525 non-null  object
 1   ReceiptDate  196525 non-null  object
 2   ReleaseDate  106400 non-null  object
dtypes: object(3)
memory usage: 4.5+ MB
None   DCNumber ReceiptDate ReleaseDate
0   000111  2022-08-09  2023-09-18
1   000111  2024-05-16         NaN
2   000132  2018-10-08         NaN
3   000155  2024-05-23         NaN
4   000155  2018-10-24  2019-03-27
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 915242 entries, 0 to 915241
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   DCNumber     915242 non-null  object
 1   ReceiptDate  915242 non-null  object
 2   ReleaseDate  915192 non-null  object
dtypes: object(3)
memory usage: 20.9+ MB
None   DCNumber ReceiptDate ReleaseDate
0    47880  1954-04-02  1955-02-19
1    47880  1951-02-12  1953-09-28
2    53946  1954-11-27  1955-08-17
3    53946  1965-06-03  1965-06-13
4    56050  1955-11-09  1957-08-13
In [15]:
from datetime import datetime

df_all_hist = pd.concat([df_incarcerated_incarhist, df_released_incarhist], ignore_index=True)


df_all_hist['ReceiptDate'] = pd.to_datetime(df_all_hist['ReceiptDate'])
df_all_hist['ReleaseDate'] = pd.to_datetime(df_all_hist['ReleaseDate'])


# Fill ongoing incarcerations with extraction date - 'presently incarcerated'
DATA_PULL_DATE = pd.Timestamp('2025-08-11')
df_all_hist['ReleaseDate'] = df_all_hist['ReleaseDate'].fillna(DATA_PULL_DATE)
print(df_all_hist.info(), df_all_hist.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111767 entries, 0 to 1111766
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   DCNumber     1111767 non-null  object        
 1   ReceiptDate  1111767 non-null  datetime64[ns]
 2   ReleaseDate  1111767 non-null  datetime64[ns]
dtypes: datetime64[ns](2), object(1)
memory usage: 25.4+ MB
None   DCNumber ReceiptDate ReleaseDate
0   000111  2022-08-09  2023-09-18
1   000111  2024-05-16  2025-08-11
2   000132  2018-10-08  2025-08-11
3   000155  2024-05-23  2025-08-11
4   000155  2018-10-24  2019-03-27
In [16]:
df_all_hist = df_all_hist.sort_values(['DCNumber', 'ReceiptDate'])

# Group and produce lists of pd.Intervals
def make_intervals(subdf):
    intervals = []
    for rec, rel in zip(subdf['ReceiptDate'], subdf['ReleaseDate']):
        if pd.notnull(rec) and pd.notnull(rel) and rec <= rel:
            intervals.append(pd.Interval(left=rec, right=rel, closed='both'))
    return intervals

grouped = (
    df_all_hist
    .sort_values(['DCNumber', 'ReceiptDate'])
    .groupby('DCNumber', group_keys=False)[['ReceiptDate', 'ReleaseDate']]
    .apply(make_intervals)
    .reset_index(name='Incar_intervals')
)

print(grouped.head())
  DCNumber  \
0    47880   
1    53946   
2    56050   
3   000001   
4   000004   

                                                                                                                                                                                                                Incar_intervals  
0                                                                                                                                      [[1951-02-12 00:00:00, 1953-09-28 00:00:00], [1954-04-02 00:00:00, 1955-02-19 00:00:00]]  
1                                                                                                                                      [[1954-11-27 00:00:00, 1955-08-17 00:00:00], [1965-06-03 00:00:00, 1965-06-13 00:00:00]]  
2  [[1955-11-09 00:00:00, 1957-08-13 00:00:00], [1957-09-03 00:00:00, 1965-11-09 00:00:00], [1969-02-26 00:00:00, 1971-12-28 00:00:00], [1979-02-08 00:00:00, 1980-11-07 00:00:00], [1980-11-10 00:00:00, 1981-07-14 00:00:00]]  
3                                                                                                                                                                                  [[1960-07-01 00:00:00, 2005-04-24 00:00:00]]  
4                                                                                                                                                                                  [[1995-10-25 00:00:00, 1997-12-11 00:00:00]]  
In [17]:
# Now that we have intervals of incarceration for everyone ever or currently incarcerated, we need to assoicate that with age/sex data
df_incarcerated_root = pd.read_csv("fdoc_deaths/csvs/Inmate_Active__Inmate Active Root.csv", dtype={'DCNumber': 'object', 'Sex':'category'})
print(df_incarcerated_root.info(), df_incarcerated_root.head())

df_released_root = pd.read_csv("fdoc_deaths/csvs/Inmate_Release__Inmate Release Root.csv", dtype={'DCNumber': 'object', 'Sex':'category'})
print(df_released_root.info(), df_released_root.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90169 entries, 0 to 90168
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   DCNumber               90169 non-null  object  
 1   LastName               90168 non-null  object  
 2   FirstName              90169 non-null  object  
 3   MiddleName             68553 non-null  object  
 4   NameSuffix             7372 non-null   object  
 5   Race                   90169 non-null  object  
 6   Sex                    90169 non-null  category
 7   BirthDate              90169 non-null  object  
 8   PrisonReleaseDate      74057 non-null  object  
 9   ReceiptDate            90169 non-null  object  
 10  releasedateflag_descr  90169 non-null  object  
 11  race_descr             90169 non-null  object  
 12  custody_description    87249 non-null  object  
 13  FACILITY_description   90169 non-null  object  
dtypes: category(1), object(13)
memory usage: 9.0+ MB
None   DCNumber    LastName FirstName MiddleName NameSuffix Race Sex   BirthDate  \
0   000111      NELSON     ALVON          L        NaN    B   M  1998-03-11   
1   000132     TELFAIR   MICHAEL        NaN        NaN    B   M  1967-12-04   
2   000155     LOCKETT     JERRY          I        NaN    B   M  1985-09-26   
3   000175      ELMORE      CHAD          R        NaN    W   M  1982-10-20   
4   000203  RHINESMITH    AUSTIN          A        NaN    W   M  2000-09-12   

     PrisonReleaseDate ReceiptDate releasedateflag_descr race_descr  \
0  2025-09-10 00:00:00  2024-05-16    valid release date      BLACK   
1  2033-04-07 00:00:00  2018-10-08    valid release date      BLACK   
2  2026-07-23 00:00:00  2024-05-23    valid release date      BLACK   
3  2028-08-29 00:00:00  2025-06-11    valid release date      WHITE   
4  2028-01-08 00:00:00  2025-07-24    valid release date      WHITE   

  custody_description FACILITY_description  
0               CLOSE         SUWANNEE C.I  
1             MINIMUM         MADISON C.I.  
2              MEDIUM            GULF C.I.  
3             MINIMUM         JACKSON C.I.  
4                 NaN            CFRC-MAIN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470403 entries, 0 to 470402
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   DCNumber               470403 non-null  object  
 1   LastName               470389 non-null  object  
 2   FirstName              470399 non-null  object  
 3   MiddleName             342540 non-null  object  
 4   NameSuffix             25548 non-null   object  
 5   Race                   470403 non-null  object  
 6   Sex                    470403 non-null  category
 7   BirthDate              470400 non-null  object  
 8   PrisonReleaseDate      470403 non-null  object  
 9   releasedateflag_descr  470403 non-null  object  
 10  race_descr             470403 non-null  object  
 11  custody_description    454906 non-null  object  
dtypes: category(1), object(11)
memory usage: 39.9+ MB
None   DCNumber LastName FirstName MiddleName NameSuffix Race Sex   BirthDate  \
0    47880   MARTIN   WILLARD        NaN        NaN    W   M  1915-06-17   
1    53946   MEARNS   WILLIAM          W        NaN    W   M  1922-06-01   
2    56050   CARVER    JESSIE        NaN        NaN    B   M  1932-10-16   
3   000001  WHITNEY    DENNIS        NaN        NaN    W   M  1942-08-15   
4   000004  ROBBINS     WAYNE        NaN        NaN    W   M  1969-01-13   

  PrisonReleaseDate releasedateflag_descr race_descr custody_description  
0        1955-02-19    valid release date      WHITE                 NaN  
1        1965-06-13    valid release date      WHITE                 NaN  
2        2007-03-20    valid release date      BLACK                 NaN  
3        2005-04-24              deceased      WHITE               CLOSE  
4        1997-12-11    valid release date      WHITE             MINIMUM  
In [18]:
df_birth_incar = df_incarcerated_root[['DCNumber', 'BirthDate', 'Sex']].copy()
df_birth_released = df_released_root[['DCNumber', 'BirthDate', 'Sex']].copy()

df_birth = pd.concat([df_birth_incar, df_birth_released], ignore_index=True)

df_birth = df_birth.drop_duplicates(subset='DCNumber')



df_birth['BirthDate'] = pd.to_datetime(df_birth['BirthDate'])

print("Min birthdate:", df_birth['BirthDate'].min())
print("Max birthdate:", df_birth['BirthDate'].max())
print("Null birthdays:", df_birth['BirthDate'].isna().sum())
print(df_birth[df_birth['BirthDate'].isna()].head())
#Null birthdays verified null in https://pubapps.fdc.myflorida.com/OffenderSearch/InmateInfoMenu.aspx
Min birthdate: 1906-01-14 00:00:00
Max birthdate: 2010-01-19 00:00:00
Null birthdays: 3
       DCNumber BirthDate Sex
252785   969628       NaT   M
360654   K02167       NaT   M
549686   Y13143       NaT   M
In [19]:
df_birth.head()
Out[19]:
DCNumber BirthDate Sex
0 000111 1998-03-11 M
1 000132 1967-12-04 M
2 000155 1985-09-26 M
3 000175 1982-10-20 M
4 000203 2000-09-12 M
In [20]:
# Ensure DCNumber is of the same type in both DataFrames
df_birth['DCNumber'] = df_birth['DCNumber'].astype(str).str.strip()
grouped['DCNumber'] = grouped['DCNumber'].astype(str).str.strip()

# Perform a left join:
df_merged = grouped.merge(df_birth, on='DCNumber', how='left')
In [21]:
# We now get counts per-year per-population group
df = df_merged

# Reference dates: June 30 each year
ref_dates = {year: pd.Timestamp(year=year, month=6, day=30) for year in range(2019, 2025)}

# Helpers
def was_incarcerated_on(intervals, on_date):
    if not isinstance(intervals, (list, tuple)) or len(intervals) == 0:
        return False
    for iv in intervals:
        try:
            left, right = iv.left, iv.right
        except AttributeError:
            left, right = iv
        if left <= on_date <= right:
            return True
    return False

def age_on(birth, on_date):
    if pd.isna(birth):
        return pd.NA
    return (on_date.year - birth.year) - ( (on_date.month, on_date.day) < (birth.month, birth.day) )

# Ensure BirthDate is datetime
if not pd.api.types.is_datetime64_any_dtype(df['BirthDate']):
    df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')

# Compute age columns
for year, d in ref_dates.items():
    col = f'age_in_{year}'
    df[col] = df.apply(
        lambda r: age_on(r['BirthDate'], d) if was_incarcerated_on(r['Incar_intervals'], d) else pd.NA,
        axis=1
    ).astype('Int64')  # nullable integer dtype
In [23]:
# sanity check on total population count
df[['age_in_2024']].count()
Out[23]:
age_in_2024    88376
dtype: int64

Note: FDOC’s published “active population” tallies count only those physically present at a state facility on snapshot day. This reconstruction, using incarceration intervals, includes all individuals with an interval covering the census date, regardless of temporary status (e.g., out to court, hospitalized, etc.). This slightly overstates the population compared to FDOC snapshot counts, which is a well-known, standard limitation for historical administrative interval analysis.

In [24]:
from pandas.api.types import CategoricalDtype

age_bins = [0, 1, 5, 10, 15, 20, 25, 35, 45, 55, 65, 75, 85, 200]
age_labels = [
    "<1", "1-4", "5-9", "10-14", "15-19",
    "20-24", "25-34", "35-44", "45-54", "55-64",
    "65-74", "75-84", "85+"
]
# Remove unused categories up front
if isinstance(df['Sex'].dtype, CategoricalDtype):
    df['Sex'] = df['Sex'].cat.remove_unused_categories()

dfs = {}
for year in range(2019, 2025):
    age_col = f'age_in_{year}'
    bins = pd.cut(df[age_col], bins=age_bins, labels=age_labels, right=False, include_lowest=True)
    temp = pd.DataFrame({'age_bin': bins, 'Sex': df['Sex']})
    
    # Group and pivot; observed=True to avoid unused category columns
    group = temp.groupby(['age_bin', 'Sex'], observed=True).size().unstack(fill_value=0)
    # Reset index so 'age_bin' is a column again, and manually clear any index/column names
    group = group.reset_index()
    group.columns.name = None
    group = group.rename(columns={'M': 'Male', 'F': 'Female'})
    dfs[year] = group
    print(f"\nAge and sex distribution for {year}:")
    print(group)
Age and sex distribution for 2019:
  age_bin   Male  Female
0   10-14      2       0
1   15-19    794      32
2   20-24   6733     420
3   25-34  26604    2284
4   35-44  23963    2049
5   45-54  17252    1224
6   55-64  10972     564
7   65-74   3254     123
8   75-84    618      18
9     85+     57       0

Age and sex distribution for 2020:
  age_bin  Female   Male
0   15-19      27    561
1   20-24     341   5428
2   25-34    1872  23421
3   35-44    1820  22493
4   45-54    1115  16071
5   55-64     555  10885
6   65-74     127   3393
7   75-84      20    657
8     85+       1     59

Age and sex distribution for 2021:
  age_bin  Female   Male
0   15-19      18    427
1   20-24     280   4552
2   25-34    1566  20518
3   35-44    1622  21212
4   45-54     934  14828
5   55-64     518  10561
6   65-74     122   3489
7   75-84      17    682
8     85+       2     53

Age and sex distribution for 2022:
  age_bin   Male  Female
0   10-14      1       0
1   15-19    424      23
2   20-24   4334     282
3   25-34  20520    1587
4   35-44  21973    1741
5   45-54  15132     973
6   55-64  10866     531
7   65-74   3791     127
8   75-84    769      21
9     85+     58       1

Age and sex distribution for 2023:
  age_bin  Female   Male
0   15-19      18    480
1   20-24     298   4393
2   25-34    1647  20467
3   35-44    2031  23119
4   45-54    1110  15734
5   55-64     564  11140
6   65-74     138   4135
7   75-84      27    878
8     85+       2     70

Age and sex distribution for 2024:
  age_bin  Female   Male
0   15-19      20    504
1   20-24     309   4359
2   25-34    1706  20028
3   35-44    2149  23926
4   45-54    1210  16350
5   55-64     618  11442
6   65-74     148   4519
7   75-84      26    976
8     85+       4     82
In [25]:
# Pull in FDOH data on state population from https://www.flhealthcharts.gov/FLQUERY_New/Population/Count
df_state = pd.read_csv("fdoc_deaths/csvs/Population_081220251605.csv", header=1)
df_state = df_state.iloc[2:].copy()
df_state = df_state.rename(columns={'Unnamed: 0': 'age_bin'})
df_state = df_state.reset_index(drop=True)

cols = {
    'female_2019': 1, 'male_2019': 2,
    'female_2020': 4, 'male_2020': 5,
    'female_2021': 7, 'male_2021': 8,
    'female_2022':10, 'male_2022':11,
    'female_2023':13, 'male_2023':14,
}

df_age_sex = pd.DataFrame()
df_age_sex['age_bin'] = df_state['age_bin']

# Assign and clean each desired column
for newcol, idx in cols.items():
    df_age_sex[newcol] = (
        df_state.iloc[:, idx]
    )

# Remove any "Total" or extra summary rows
df_age_sex = df_age_sex[~df_age_sex['age_bin'].str.contains("Total", na=False)].copy()

for col in df_age_sex.columns:
    if col != 'age_bin':  # don't convert bin labels!
        df_age_sex[col] = pd.to_numeric(df_age_sex[col])


print(df_age_sex.info(),df_age_sex)
<class 'pandas.core.frame.DataFrame'>
Index: 13 entries, 0 to 12
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age_bin      13 non-null     object
 1   female_2019  13 non-null     int64 
 2   male_2019    13 non-null     int64 
 3   female_2020  13 non-null     int64 
 4   male_2020    13 non-null     int64 
 5   female_2021  13 non-null     int64 
 6   male_2021    13 non-null     int64 
 7   female_2022  13 non-null     int64 
 8   male_2022    13 non-null     int64 
 9   female_2023  13 non-null     int64 
 10  male_2023    13 non-null     int64 
dtypes: int64(10), object(1)
memory usage: 1.2+ KB
None    age_bin  female_2019  male_2019  female_2020  male_2020  female_2021  \
0       <1       105837     110740       100144     105001       103515   
1      1-4       452146     473774       461835     483003       459749   
2      5-9       565917     590432       574982     600993       583720   
3    10-14       590585     614014       595752     619774       601845   
4    15-19       588355     616780       592774     619696       594807   
5    20-24       620093     649481       621957     648074       629862   
6    25-34      1370805    1417463      1388821    1437525      1403076   
7    35-44      1294285    1273377      1321951    1302602      1349385   
8    45-54      1398322    1345694      1393406    1339768      1390803   
9    55-64      1500802    1348036      1533563    1384124      1561551   
10   65-74      1286140    1103480      1334870    1141115      1386162   
11   75-84       755539     627404       795562     659897       830232   
12     85+       342951     226101       348827     234750       357797   

    male_2021  female_2022  male_2022  female_2023  male_2023  
0      108035       108250     112813       106527     111631  
1      478846       446345     465464       446903     465106  
2      610256       592247     619869       591461     619054  
3      624646       624767     654269       613277     643749  
4      623517       617216     648019       632772     663571  
5      651645       633981     651718       678944     699869  
6     1454968      1401101    1438657      1405448    1460442  
7     1335878      1388739    1400465      1407017    1437365  
8     1337748      1398209    1374181      1396753    1385056  
9     1415552      1578259    1473402      1569641    1477506  
10    1180546      1418697    1231516      1418872    1227549  
11     687257       818514     684502       884067     731385  
12     244189       324276     223702       371580     240038  
In [26]:
years = range(2019, 2024)
merged_dfs = {}

for year in years:
    # Get FDOC population by sex/age for the year
    df_fdoc = dfs[year].copy()
    # Get state population columns for the year
    mcol = f'male_{year}'
    fcol = f'female_{year}'
    # Merge on 'age_bin'
    merged = df_fdoc.merge(
        df_age_sex[['age_bin', fcol, mcol]], on='age_bin', how='left'
    )
    # Rename columns
    merged = merged.rename(columns={
        'Male': f'fdoc_male_{year}',
        'Female': f'fdoc_female_{year}',
        fcol: f'flpop_female_{year}',
        mcol: f'flpop_male_{year}'
    })
    # Put columns in standard order
    merged = merged[['age_bin', f'fdoc_male_{year}', f'fdoc_female_{year}', f'flpop_male_{year}', f'flpop_female_{year}']]
    merged_dfs[year] = merged

    # Preview result
    print(f"\nMerged distribution for {year}:")
    print(merged)
Merged distribution for 2019:
  age_bin  fdoc_male_2019  fdoc_female_2019  flpop_male_2019  \
0   10-14               2                 0           614014   
1   15-19             794                32           616780   
2   20-24            6733               420           649481   
3   25-34           26604              2284          1417463   
4   35-44           23963              2049          1273377   
5   45-54           17252              1224          1345694   
6   55-64           10972               564          1348036   
7   65-74            3254               123          1103480   
8   75-84             618                18           627404   
9     85+              57                 0           226101   

   flpop_female_2019  
0             590585  
1             588355  
2             620093  
3            1370805  
4            1294285  
5            1398322  
6            1500802  
7            1286140  
8             755539  
9             342951  

Merged distribution for 2020:
  age_bin  fdoc_male_2020  fdoc_female_2020  flpop_male_2020  \
0   15-19             561                27           619696   
1   20-24            5428               341           648074   
2   25-34           23421              1872          1437525   
3   35-44           22493              1820          1302602   
4   45-54           16071              1115          1339768   
5   55-64           10885               555          1384124   
6   65-74            3393               127          1141115   
7   75-84             657                20           659897   
8     85+              59                 1           234750   

   flpop_female_2020  
0             592774  
1             621957  
2            1388821  
3            1321951  
4            1393406  
5            1533563  
6            1334870  
7             795562  
8             348827  

Merged distribution for 2021:
  age_bin  fdoc_male_2021  fdoc_female_2021  flpop_male_2021  \
0   15-19             427                18           623517   
1   20-24            4552               280           651645   
2   25-34           20518              1566          1454968   
3   35-44           21212              1622          1335878   
4   45-54           14828               934          1337748   
5   55-64           10561               518          1415552   
6   65-74            3489               122          1180546   
7   75-84             682                17           687257   
8     85+              53                 2           244189   

   flpop_female_2021  
0             594807  
1             629862  
2            1403076  
3            1349385  
4            1390803  
5            1561551  
6            1386162  
7             830232  
8             357797  

Merged distribution for 2022:
  age_bin  fdoc_male_2022  fdoc_female_2022  flpop_male_2022  \
0   10-14               1                 0           654269   
1   15-19             424                23           648019   
2   20-24            4334               282           651718   
3   25-34           20520              1587          1438657   
4   35-44           21973              1741          1400465   
5   45-54           15132               973          1374181   
6   55-64           10866               531          1473402   
7   65-74            3791               127          1231516   
8   75-84             769                21           684502   
9     85+              58                 1           223702   

   flpop_female_2022  
0             624767  
1             617216  
2             633981  
3            1401101  
4            1388739  
5            1398209  
6            1578259  
7            1418697  
8             818514  
9             324276  

Merged distribution for 2023:
  age_bin  fdoc_male_2023  fdoc_female_2023  flpop_male_2023  \
0   15-19             480                18           663571   
1   20-24            4393               298           699869   
2   25-34           20467              1647          1460442   
3   35-44           23119              2031          1437365   
4   45-54           15734              1110          1385056   
5   55-64           11140               564          1477506   
6   65-74            4135               138          1227549   
7   75-84             878                27           731385   
8     85+              70                 2           240038   

   flpop_female_2023  
0             632772  
1             678944  
2            1405448  
3            1407017  
4            1396753  
5            1569641  
6            1418872  
7             884067  
8             371580  
In [28]:
df = merged_dfs[2023].copy()
for col in ['fdoc_male_2023', 'fdoc_female_2023', 'flpop_male_2023', 'flpop_female_2023']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df['fdoc_female_2023_neg'] = -df['fdoc_female_2023']
df['flpop_female_2023_neg'] = -df['flpop_female_2023']
df['age_bin'] = pd.Categorical(df['age_bin'], categories=df['age_bin'].tolist(), ordered=True)

# Global config for dark mode and bar gaps
alt.themes.enable('none')
custom_config = {
    "config": {
        "background": 'rgba(0,0,0,0)',
        "axis": {
            "domainColor": "white",
            "gridColor": "white",
            "labelColor": "white",
            "tickColor": "white",
            "titleColor": "white"
        },
        "title": {"color": "white"},
        "legend": {"labelColor": "white", "titleColor": "white"},
        "view": {"stroke": "white"},
        "bar": {"binSpacing": 0}
    }
}
alt.themes.register('johnwise_black', lambda: custom_config)
alt.themes.enable('johnwise_black')

#### --- FDOC Chart ---
base_fdoc = alt.Chart(df).encode(
    y=alt.Y('age_bin', sort=None, title='Age Group', axis=alt.Axis(labelAlign='right', labelPadding=10, labelColor='white', titleColor='white')),
)

male_fdoc = base_fdoc.mark_bar(color="#348ABD", size = 35).encode(
    x=alt.X('fdoc_male_2023:Q', title='FDOC Male'),
    tooltip=['age_bin', alt.Tooltip('fdoc_male_2023')]
)
female_fdoc = base_fdoc.mark_bar(color="#E24A33", size = 35).encode(
    x=alt.X('fdoc_female_2023_neg:Q', title='FDOC Female',
            axis=alt.Axis(labelExpr="abs(datum.value)", titleColor='white', labelColor='white')),
    tooltip=['age_bin', alt.Tooltip('fdoc_female_2023')]
)

pyramid_fdoc = (female_fdoc + male_fdoc).properties(
    title=alt.TitleParams('FDOC 2023 Prison Population Pyramid', color='white'),
    width=260, height=320
)

#### --- Statewide Chart ---
base_state = alt.Chart(df).encode(
    y=alt.Y('age_bin', sort=None, title='Age Group', axis=alt.Axis(labelAlign='right', labelPadding=10, labelColor='white', titleColor='white')),
)
male_state = base_state.mark_bar(color="#348ABD", size = 35).encode(
    x=alt.X('flpop_male_2023:Q', title='FL Male'),
    tooltip=['age_bin', alt.Tooltip('flpop_male_2023')]
)
female_state = base_state.mark_bar(color="#E24A33", size = 35).encode(
    x=alt.X('flpop_female_2023_neg:Q', title='FL Female',
            axis=alt.Axis(labelExpr="abs(datum.value)", titleColor='white', labelColor='white')),
    tooltip=['age_bin', alt.Tooltip('flpop_female_2023')]
)
pyramid_state = (female_state + male_state).properties(
    title=alt.TitleParams('Florida 2023 Population Pyramid', color='white'),
    width=260, height=320
)


chart = alt.vconcat(
    pyramid_fdoc,
    pyramid_state
)
chart.save("assets/double_pop_pyramid.json")
chart
Out[28]:
In [30]:
# information on individual deaths pulled from yearly queries https://www.fdc.myflorida.com/statistics-and-publications/inmate-mortality
df19 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2019-2020.csv")
df20 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2020-2021.csv")
df21 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2021-2022.csv")
df22 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2022-2023.csv")
df23 = pd.read_csv("/Users/johnwise/johnwise.me_site/fdoc_deaths/data/FY 2023-2024.csv")
df_all_FY = pd.concat([df19, df20, df21, df22, df23], ignore_index = True)
print(df_all_FY.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   Name                              2148 non-null   object
 1   DC Number                         2148 non-null   object
 2   Date of Death                     2260 non-null   object
 3   Institution Name                  2260 non-null   object
 4   Manner of Death Determined by ME  2260 non-null   object
 5   Investigative Status              2260 non-null   object
dtypes: object(6)
memory usage: 106.1+ KB
None
In [31]:
# tidy
df_deaths_19_23 = df_all_FY[['Name', 'DC Number', 'Date of Death']].copy()
# Rename 'DC Number' to 'DCNumber'
df_deaths_19_23.rename(columns={'DC Number': 'DCNumber'}, inplace=True)
df_deaths_19_23['Date of Death'] = pd.to_datetime(df_deaths_19_23['Date of Death'], format = '%m/%d/%Y')
print(df_deaths_19_23.head(), df_deaths_19_23.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           2148 non-null   object        
 1   DCNumber       2148 non-null   object        
 2   Date of Death  2260 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 53.1+ KB
              Name DCNumber Date of Death
0              NaN      NaN    2019-07-01
1  HORTON, MICHAEL   073661    2019-07-02
2              NaN      NaN    2019-07-05
3  FERGUSON, LEROY   716169    2019-07-06
4  BUSSER, WILLIAM   N26218    2019-07-06 None
In [34]:
# Join to previous data
df_deaths_births_19_23 = pd.merge(df_deaths_19_23, df_birth, on='DCNumber', how='left')
print(df_deaths_births_19_23.info(), df_deaths_births_19_23.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2260 entries, 0 to 2259
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           2148 non-null   object        
 1   DCNumber       2148 non-null   object        
 2   Date of Death  2260 non-null   datetime64[ns]
 3   BirthDate      2145 non-null   datetime64[ns]
 4   Sex            2145 non-null   object        
dtypes: datetime64[ns](2), object(3)
memory usage: 88.4+ KB
None               Name DCNumber Date of Death  BirthDate  Sex
0              NaN      NaN    2019-07-01        NaT  NaN
1  HORTON, MICHAEL   073661    2019-07-02 1952-07-19    M
2              NaN      NaN    2019-07-05        NaT  NaN
3  FERGUSON, LEROY   716169    2019-07-06 1960-11-29    M
4  BUSSER, WILLIAM   N26218    2019-07-06 1937-07-17    M
In [35]:
# investigate completeness
missing_birthdate_rows = df_deaths_births_19_23[
    (df_deaths_births_19_23['DCNumber'].notna()) &
    (df_deaths_births_19_23['BirthDate'].isna())
]
print(missing_birthdate_rows)
missing_birthdate_rows2 = df_deaths_births_19_23[
    (df_deaths_births_19_23['BirthDate'].notna()) &
    (df_deaths_births_19_23['DCNumber'].isna())
]
print(missing_birthdate_rows2)
                  Name DCNumber Date of Death BirthDate  Sex
799    BAILEY, ROMULUS   168749    2021-01-05       NaT  NaN
1551     JORDAN, JAMES   555250    2022-11-02       NaT  NaN
2032  CHEATHAM, GERALD   083980    2023-11-28       NaT  NaN
Empty DataFrame
Columns: [Name, DCNumber, Date of Death, BirthDate, Sex]
Index: []

The above names and DC Numbers were searched for in the FDOC Offender Database https://pubapps.fdc.myflorida.com/OffenderSearch/search.aspx and no conclusive matches were found

In [36]:
'''To account for missing birthdate data (~5% of deaths, presumed to be evenly distributed across ages and sex), the age/sex-specific 
population denominators are multiplied by 95% to match the coverage rate of the numerator.'''
adj = 1- df_deaths_births_19_23[df_deaths_births_19_23['BirthDate'].isna()].shape[0]/df_all_FY.shape[0]
print(adj)
0.9491150442477876
In [37]:
df_deaths_births_19_23.dropna(subset =['BirthDate'], inplace = True)
print(df_deaths_births_19_23.info(), df_deaths_births_19_23.head())
<class 'pandas.core.frame.DataFrame'>
Index: 2145 entries, 1 to 2258
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Name           2145 non-null   object        
 1   DCNumber       2145 non-null   object        
 2   Date of Death  2145 non-null   datetime64[ns]
 3   BirthDate      2145 non-null   datetime64[ns]
 4   Sex            2145 non-null   object        
dtypes: datetime64[ns](2), object(3)
memory usage: 100.5+ KB
None                Name DCNumber Date of Death  BirthDate Sex
1   HORTON, MICHAEL   073661    2019-07-02 1952-07-19   M
3   FERGUSON, LEROY   716169    2019-07-06 1960-11-29   M
4   BUSSER, WILLIAM   N26218    2019-07-06 1937-07-17   M
5      GIGLIO, DUKE   053611    2019-07-06 1954-12-16   M
6  COPPERTINO, JOHN   341241    2019-07-07 1952-05-21   M
In [38]:
# calculate age at death
df_deaths_births_19_23['age_at_death'] = df_deaths_births_19_23.apply(
    lambda r: age_on(r['BirthDate'], r['Date of Death']),
    axis=1
).astype('Int64')
print(df_deaths_births_19_23.head())
               Name DCNumber Date of Death  BirthDate Sex  age_at_death
1   HORTON, MICHAEL   073661    2019-07-02 1952-07-19   M            66
3   FERGUSON, LEROY   716169    2019-07-06 1960-11-29   M            58
4   BUSSER, WILLIAM   N26218    2019-07-06 1937-07-17   M            81
5      GIGLIO, DUKE   053611    2019-07-06 1954-12-16   M            64
6  COPPERTINO, JOHN   341241    2019-07-07 1952-05-21   M            67
In [39]:
deaths_by_year = {}

for year in range(2019, 2024):
    mask = df_deaths_births_19_23['Date of Death'].dt.year == year
    df_year = df_deaths_births_19_23[mask].copy()
    df_year['age_bin'] = pd.cut(
        df_year['age_at_death'], 
        bins=age_bins, 
        labels=age_labels, 
        right=False, 
        include_lowest=True
    )
    grouped = (
        df_year.groupby(['age_bin', 'Sex'], observed=True)
        .size()
        .unstack(fill_value=0)
        .reset_index()
    )
    grouped.columns.name = None
    grouped = grouped.rename(columns={'M': f'male_deaths_{year}', 'F': f'female_deaths_{year}'})

    deaths_by_year[year] = grouped

    print(f"\nDeaths by age and sex, {year}:")
    print(grouped)
Deaths by age and sex, 2019:
  age_bin  female_deaths_2019  male_deaths_2019
0   25-34                   1                11
1   35-44                   1                14
2   45-54                   1                30
3   55-64                   1                46
4   65-74                   0                52
5   75-84                   1                27
6     85+                   0                 3

Deaths by age and sex, 2020:
  age_bin  female_deaths_2020  male_deaths_2020
0   20-24                   0                 2
1   25-34                   3                18
2   35-44                   4                47
3   45-54                   1                63
4   55-64                   6               146
5   65-74                   5               171
6   75-84                   3                86
7     85+                   0                17

Deaths by age and sex, 2021:
  age_bin  female_deaths_2021  male_deaths_2021
0   25-34                   1                18
1   35-44                   2                34
2   45-54                   4                48
3   55-64                   1               131
4   65-74                   2               108
5   75-84                   1                59
6     85+                   0                11

Deaths by age and sex, 2022:
  age_bin  female_deaths_2022  male_deaths_2022
0   25-34                   1                20
1   35-44                   3                31
2   45-54                   1                53
3   55-64                   0               103
4   65-74                   2                95
5   75-84                   0                52
6     85+                   0                 8

Deaths by age and sex, 2023:
  age_bin  female_deaths_2023  male_deaths_2023
0   20-24                   1                 4
1   25-34                   1                26
2   35-44                   1                35
3   45-54                   1                54
4   55-64                   4               108
5   65-74                   4               101
6   75-84                   3                65
7     85+                   0                19
In [40]:
final_merged = {}

for year in range(2019, 2024):
    popdf = merged_dfs[year]
    deathsdf = deaths_by_year[year]
    merged = popdf.merge(deathsdf, on='age_bin', how='left')
    for sexcol in [f'male_deaths_{year}', f'female_deaths_{year}']:
        if sexcol in merged.columns:
            merged[sexcol] = merged[sexcol].fillna(0).astype(int)
    final_merged[year] = merged
    print(f"\nMerged table for {year}:")
    print(merged)
Merged table for 2019:
  age_bin  fdoc_male_2019  fdoc_female_2019  flpop_male_2019  \
0   10-14               2                 0           614014   
1   15-19             794                32           616780   
2   20-24            6733               420           649481   
3   25-34           26604              2284          1417463   
4   35-44           23963              2049          1273377   
5   45-54           17252              1224          1345694   
6   55-64           10972               564          1348036   
7   65-74            3254               123          1103480   
8   75-84             618                18           627404   
9     85+              57                 0           226101   

   flpop_female_2019  female_deaths_2019  male_deaths_2019  
0             590585                   0                 0  
1             588355                   0                 0  
2             620093                   0                 0  
3            1370805                   1                11  
4            1294285                   1                14  
5            1398322                   1                30  
6            1500802                   1                46  
7            1286140                   0                52  
8             755539                   1                27  
9             342951                   0                 3  

Merged table for 2020:
  age_bin  fdoc_male_2020  fdoc_female_2020  flpop_male_2020  \
0   15-19             561                27           619696   
1   20-24            5428               341           648074   
2   25-34           23421              1872          1437525   
3   35-44           22493              1820          1302602   
4   45-54           16071              1115          1339768   
5   55-64           10885               555          1384124   
6   65-74            3393               127          1141115   
7   75-84             657                20           659897   
8     85+              59                 1           234750   

   flpop_female_2020  female_deaths_2020  male_deaths_2020  
0             592774                   0                 0  
1             621957                   0                 2  
2            1388821                   3                18  
3            1321951                   4                47  
4            1393406                   1                63  
5            1533563                   6               146  
6            1334870                   5               171  
7             795562                   3                86  
8             348827                   0                17  

Merged table for 2021:
  age_bin  fdoc_male_2021  fdoc_female_2021  flpop_male_2021  \
0   15-19             427                18           623517   
1   20-24            4552               280           651645   
2   25-34           20518              1566          1454968   
3   35-44           21212              1622          1335878   
4   45-54           14828               934          1337748   
5   55-64           10561               518          1415552   
6   65-74            3489               122          1180546   
7   75-84             682                17           687257   
8     85+              53                 2           244189   

   flpop_female_2021  female_deaths_2021  male_deaths_2021  
0             594807                   0                 0  
1             629862                   0                 0  
2            1403076                   1                18  
3            1349385                   2                34  
4            1390803                   4                48  
5            1561551                   1               131  
6            1386162                   2               108  
7             830232                   1                59  
8             357797                   0                11  

Merged table for 2022:
  age_bin  fdoc_male_2022  fdoc_female_2022  flpop_male_2022  \
0   10-14               1                 0           654269   
1   15-19             424                23           648019   
2   20-24            4334               282           651718   
3   25-34           20520              1587          1438657   
4   35-44           21973              1741          1400465   
5   45-54           15132               973          1374181   
6   55-64           10866               531          1473402   
7   65-74            3791               127          1231516   
8   75-84             769                21           684502   
9     85+              58                 1           223702   

   flpop_female_2022  female_deaths_2022  male_deaths_2022  
0             624767                   0                 0  
1             617216                   0                 0  
2             633981                   0                 0  
3            1401101                   1                20  
4            1388739                   3                31  
5            1398209                   1                53  
6            1578259                   0               103  
7            1418697                   2                95  
8             818514                   0                52  
9             324276                   0                 8  

Merged table for 2023:
  age_bin  fdoc_male_2023  fdoc_female_2023  flpop_male_2023  \
0   15-19             480                18           663571   
1   20-24            4393               298           699869   
2   25-34           20467              1647          1460442   
3   35-44           23119              2031          1437365   
4   45-54           15734              1110          1385056   
5   55-64           11140               564          1477506   
6   65-74            4135               138          1227549   
7   75-84             878                27           731385   
8     85+              70                 2           240038   

   flpop_female_2023  female_deaths_2023  male_deaths_2023  
0             632772                   0                 0  
1             678944                   1                 4  
2            1405448                   1                26  
3            1407017                   1                35  
4            1396753                   1                54  
5            1569641                   4               108  
6            1418872                   4               101  
7             884067                   3                65  
8             371580                   0                19  
In [42]:
# compute the projected death rate on the general FL populations from FDOC population rates
for year in range(2019, 2024):
    for sex in ['male', 'female']:
        deaths_col = f"{sex}_deaths_{year}"
        pop_col = f"fdoc_{sex}_{year}"
        flpop_col = f"flpop_{sex}_{year}"
        proj_deaths_col = f"{sex}_deaths_proj"
        final_merged[year][proj_deaths_col] = (
            final_merged[year][deaths_col] / final_merged[year][pop_col] * final_merged[year][flpop_col] * adj
        )
In [43]:
# sanity check
final_merged[2020]
Out[43]:
age_bin fdoc_male_2020 fdoc_female_2020 flpop_male_2020 flpop_female_2020 female_deaths_2020 male_deaths_2020 male_deaths_proj female_deaths_proj
0 15-19 561 27 619696 592774 0 0 0.000000 0.000000
1 20-24 5428 341 648074 621957 0 2 226.638461 0.000000
2 25-34 23421 1872 1437525 1388821 3 18 1048.579432 2112.421322
3 35-44 22493 1820 1302602 1321951 4 47 2583.337051 2757.546334
4 45-54 16071 1115 1339768 1393406 1 63 4984.781269 1186.100984
5 55-64 10885 555 1384124 1533563 6 146 17620.502074 15735.434752
6 65-74 3393 127 1141115 1334870 5 171 54583.392203 49879.732249
7 75-84 657 20 659897 795562 3 86 81983.809209 113261.979425
8 85+ 59 1 234750 348827 0 17 64197.980726 0.000000
In [44]:
years = range(2019, 2024)
results = []

for year in years:
    df = final_merged[year]
    total_deaths_proj = df['male_deaths_proj'].sum() + df['female_deaths_proj'].sum()
    total_population = df[f'flpop_male_{year}'].sum() + df[f'flpop_female_{year}'].sum()
    yearly_rate = round(total_deaths_proj / total_population * 100000, 2)
    results.append({'year': year, 'AADR': yearly_rate})

# Output DataFrame with rates per year
rates_df = pd.DataFrame(results)
print(rates_df)

# Mean across the 5 years
mean_aadr = round(rates_df['AADR'].mean(), 2)
print(f"\nMean AADR (2019–2023): {mean_aadr}")
   year     AADR
0  2019   566.76
1  2020  2277.23
2  2021  1313.25
3  2022   746.67
4  2023  1633.91

Mean AADR (2019–2023): 1307.56
In [54]:
IMG_PATH = "assets/Screenshot 2025-08-17 at 4.23.46 PM.png"

# Optional metadata
FDOH_URL = "https://www.flhealthcharts.gov/FLQUERY_New/Population/Count"
QUERY_TEXT = "Crude Death Rate by Year, All causes"
RETRIEVED = "2025-08-17 at 22.23.46 UTC"

# Display the image (embedded in exported HTML)
display(Image(filename=IMG_PATH, embed=True, width=1000))

# Caption below the image
display(Markdown(
    f"Source: [FDOH data portal]({FDOH_URL})  \n"
    f"Query: {QUERY_TEXT}  \n"
    f"Retrieved: {RETRIEVED}"
))
No description has been provided for this image

Source: FDOH data portal
Query: Crude Death Rate by Year, All causes
Retrieved: 2025-08-17 at 22.23.46 UTC

In [48]:
df_fl_crude = pd.read_csv('fdoc_deaths/csvs/FDOH_Crude_death_rate.csv', header =2)
#print(df_fl_crude.info(), df_fl_crude.head())

# Reshape df_fl_crude from wide to long (pivot)
df_fl_crude_long = df_fl_crude.drop(columns=['Unnamed: 10'], errors='ignore').melt(
    var_name='year', value_name='FL_Crude_Death_Rate'
)

# Filter to years of interest and ensure type
years = [2019, 2020, 2021, 2022, 2023]
df_fl_crude_long = df_fl_crude_long[df_fl_crude_long['year'].astype(int).isin(years)].copy()
df_fl_crude_long['year'] = df_fl_crude_long['year'].astype(int)

# Merge with rates dataframe
combined = rates_df.merge(df_fl_crude_long, on='year', how='left')

print(combined)
   year     AADR  FL_Crude_Death_Rate
0  2019   566.76                973.2
1  2020  2277.23               1106.2
2  2021  1313.25               1187.2
3  2022   746.67               1070.1
4  2023  1633.91               1009.7
In [50]:
df_bar = combined.copy()
df_bar['Series'] = 'FDOC AADR'
df_bar = df_bar.rename(columns={'AADR': 'value'})

df_line = combined.copy()
df_line['Series'] = 'FL Crude Death Rate'
df_line = df_line.rename(columns={'FL_Crude_Death_Rate': 'value'})

long_df = pd.concat([df_bar[['year','Series','value']], df_line[['year','Series','value']]], ignore_index=True)

color_scale = alt.Scale(
    domain=['FDOC AADR', 'FL Crude Death Rate'],
    range=['#4bb4e6', '#ffa600']
)

base = alt.Chart(long_df).encode(
    x=alt.X('year:O', title='Year')
)

bars = base.transform_filter(
    alt.datum.Series == 'FDOC AADR'
).mark_bar(
    size=40,
    opacity=0.85,
    cornerRadiusTopLeft=8,
    cornerRadiusTopRight=8
).encode(
    y=alt.Y('value:Q', title='Death Rate (per 100,000)'),
    color=alt.Color('Series:N', scale=color_scale, legend=alt.Legend(title=None)),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FDOC AADR')
    ]
)

line = base.transform_filter(
    alt.datum.Series == 'FL Crude Death Rate'
).mark_line(
    interpolate='monotone',
    strokeWidth=3
).encode(
    y='value:Q',
    color=alt.Color('Series:N', scale=color_scale, legend=alt.Legend(title=None)),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FL Crude Death Rate')
    ]
)

points = base.transform_filter(
    alt.datum.Series == 'FL Crude Death Rate'
).mark_circle(size=120).encode(
    y='value:Q',
    color=alt.Color('Series:N', scale=color_scale, legend=None),
    tooltip=[
        alt.Tooltip('year:O', title='Year'),
        alt.Tooltip('value:Q', format='.2f', title='FL Crude Death Rate')
    ]
)

mean_aadr = round(combined['AADR'].mean(), 2)
mean_rule = alt.Chart(pd.DataFrame({'mean_aadr': [mean_aadr]})).mark_rule(
    color='red', strokeDash=[8,4], size=3
).encode(
    y='mean_aadr:Q',
    tooltip=[alt.Tooltip('mean_aadr:Q', format='.2f', title='Mean FDOC AADR')]
)

final_plot = (bars + line + points).properties(
    width=500,
    height=320,
    title=alt.TitleParams(
        'Florida Statewide Death Rate vs. FDOC Age Adjusted Death Rate (2019–2023)', 
        color='white'
    )
)

final_plot.save("assets/FDOC_AADR_vs_FL_Crude.json")
final_plot
Out[50]: